├── frames
│   └── frame.jpg
├── .idea
│   ├── .gitignore
│   ├── misc.xml
│   ├── inspectionProfiles
│   │   ├── profiles_settings.xml
│   │   └── Project_Default.xml
│   ├── modules.xml
│   └── GPT4V.iml
├── requirements.txt
├── LICENSE
├── capture.py
├── README.md
├── main.py
└── singlestore.py
/frames/frame.jpg:
--------------------------------------------------------------------------------
(binary JPEG image; contents omitted)
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
(IntelliJ IDEA project configuration; XML content omitted)
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
(IntelliJ IDEA inspection profile settings; XML content omitted)
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
(IntelliJ IDEA module registry; XML content omitted)
--------------------------------------------------------------------------------
/.idea/GPT4V.iml:
--------------------------------------------------------------------------------
(IntelliJ IDEA module file; XML content omitted)
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
(IntelliJ IDEA default inspection profile; XML content omitted)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | annotated-types==0.6.0
2 | anyio==3.7.1
3 | certifi==2023.11.17
4 | cffi==1.16.0
5 | charset-normalizer==3.3.2
6 | distro==1.8.0
7 | exceptiongroup==1.1.3
8 | h11==0.14.0
9 | httpcore==1.0.2
10 | httpx==0.25.1
11 | idna==3.4
12 | numpy==1.26.2
13 | openai==1.3.3
14 | opencv-python==4.8.1.78
15 | Pillow==10.1.0
16 | playsound==1.3.0
17 | PyAudio==0.2.14
18 | pycparser==2.21
19 | pydantic==2.5.1
20 | pydantic_core==2.14.3
21 | pyobjc==10.0
22 | requests==2.31.0
23 | sniffio==1.3.0
24 | sounddevice==0.4.6
25 | soundfile==0.12.1
26 | SpeechRecognition==3.10.0
27 | tqdm==4.66.1
28 | typing_extensions==4.8.0
29 | urllib3==2.1.0
30 |
--------------------------------------------------------------------------------
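
The pins above fall into three groups: OpenAI API access (openai, httpx, pydantic and their helpers), audio capture and playback (PyAudio, sounddevice, soundfile, SpeechRecognition, playsound, plus pyobjc, which suggests a macOS setup), and frame handling (opencv-python, numpy, Pillow). A quick smoke test, purely illustrative and not part of the repo, can confirm the stack imports cleanly after `pip install -r requirements.txt`:

```
# Illustrative smoke test (not part of the repo): confirm the audio/vision stack
# is importable in the environment where requirements.txt was installed.
import importlib

for name in ("cv2", "numpy", "PIL", "sounddevice", "soundfile",
             "speech_recognition", "playsound", "openai"):
    importlib.import_module(name)
    print(f"{name}: OK")
```
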
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Ayush Pai
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/capture.py:
--------------------------------------------------------------------------------
import cv2
import time
from PIL import Image
import numpy as np
import os

# Folder for the saved frame
folder = "frames"

# Create the frames folder if it doesn't exist
frames_dir = os.path.join(os.getcwd(), folder)
os.makedirs(frames_dir, exist_ok=True)

# Initialize the video capture (any local video file works; see the README)
cap = cv2.VideoCapture("videos/clippers.mp4")

# Give the capture a moment to initialize
time.sleep(2)

while True:
    ret, frame = cap.read()
    if ret:
        # Convert the frame to a PIL image
        pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        # Convert the PIL image back to an OpenCV image
        frame = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)

        # Display the frame
        cv2.imshow("Video Preview", frame)

        # Save the frame as an image file
        path = f"{folder}/frame.jpg"
        cv2.imwrite(path, frame)
        print("Saved current frame")
    else:
        print("Failed to capture image")
        break

    # Check if the user pressed the 'q' key
    if cv2.waitKey(25) & 0xFF == ord('q'):
        break

# Release the capture and close all windows
cap.release()
cv2.destroyAllWindows()
--------------------------------------------------------------------------------
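
The README notes that the video source in `capture.py` can be customized. A hypothetical variation (not part of the repo) that reads from the default webcam instead of `videos/clippers.mp4`, and rewrites `frames/frame.jpg` at most twice a second to cut disk writes, might look like this; the device index 0 and the 0.5-second interval are assumptions:

```
# Hypothetical webcam variant of capture.py.
import os
import time
import cv2

os.makedirs("frames", exist_ok=True)
cap = cv2.VideoCapture(0)  # 0 = default camera; a file path also works here
last_save = 0.0

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        print("Failed to capture image")
        break
    cv2.imshow("Video Preview", frame)
    if time.time() - last_save >= 0.5:
        cv2.imwrite("frames/frame.jpg", frame)
        last_save = time.time()
    if cv2.waitKey(25) & 0xFF == ord("q"):
        break

cap.release()
cv2.destroyAllWindows()
```

Since `main.py` only ever reads the latest `frame.jpg`, any save rate that keeps the file fresh is enough.
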
/README.md:
--------------------------------------------------------------------------------
1 | # Conversational AI with GPT-4 Vision, OpenAI Whisper, and TTS
2 | ## Overview
3 | This project integrates GPT-4 Vision, OpenAI Whisper, and OpenAI Text-to-Speech (TTS) to create an interactive AI system for conversations. It combines visual and audio inputs for a seamless user experience.
4 |
5 | ## Demo Video:
6 | https://twitter.com/ayushspai/status/1726222559480557647
7 |
8 | ## Components
9 | - **GPT-4 Vision**: Analyzes visual input and generates contextual responses.
10 | - **OpenAI Whisper**: Converts spoken language into text.
11 | - **OpenAI TTS**: Transforms text responses into spoken language.
12 |
13 | ## Main Files
14 | - `main.py`: Manages audio processing, image encoding, AI interactions, and text-to-speech output.
15 | - `capture.py`: Captures and processes video frames for visual analysis.
16 |
17 | ## Installation
18 |
19 | ### Prerequisites
20 | - Python 3.x
21 | - An OpenAI API key (set as an environment variable `OPENAI_API_KEY`)
22 |
23 | ### Libraries
24 | Install the necessary libraries with the requirements.txt file.
25 | ```
26 | pip install -r requirements.txt
27 | ```
28 |
29 | ## Usage
30 |
31 | ### Running the Scripts
32 | - **Start `capture.py`**: Captures video frames and saves them for AI analysis.
33 |   - Reads a video file, displays the video, and saves the current frame as `frame.jpg`.
34 |   - Execute with `python capture.py`.
35 |
36 | - **Run `main.py` concurrently**: Orchestrates the conversational AI.
37 |   - Continuously listens for user audio input.
38 |   - Transcribes speech to text, reads the most recent video frame, and sends both to GPT-4 Vision for analysis.
39 |   - Converts the AI's response to speech and plays it back.
40 |   - Execute with `python main.py`.
41 |
42 | ### Workflow
43 | 1. `main.py` listens for audio input and transcribes it using OpenAI Whisper.
44 | 2. Meanwhile, `capture.py` captures a video frame.
45 | 3. Both the audio transcription and the encoded image are sent to GPT-4 Vision.
46 | 4. GPT-4 Vision responds, considering the visual and textual context.
47 | 5. The response is vocalized using OpenAI TTS and played to the user.
48 |
49 | ### Notes
50 | - Ensure both `main.py` and `capture.py` are active for the system to function.
51 | - The video file in `capture.py` can be customized.
52 | - Adequate hardware is recommended for smooth audio and video processing.
53 |
54 | ## Conclusion
55 | This project demonstrates a novel approach to combining various AI technologies, creating a dynamic and interactive conversational AI experience. It harnesses the capabilities of GPT-4 Vision, Whisper, and TTS for a comprehensive audio-visual interaction.
56 |
57 |
--------------------------------------------------------------------------------
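
The workflow above ultimately reduces to a single GPT-4 Vision request that carries the transcribed question and the base64-encoded frame together. A stripped-down sketch of that call, assuming `OPENAI_API_KEY` is set and `frames/frame.jpg` already exists (the question text is only a placeholder), looks like this:

```
# Minimal sketch of the vision request described in the README workflow.
import base64
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

with open("frames/frame.jpg", "rb") as f:
    b64 = base64.b64encode(f.read()).decode("utf-8")

response = client.chat.completions.create(
    model="gpt-4-vision-preview",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is happening in this frame?"},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
            ],
        }
    ],
    max_tokens=100,
)
print(response.choices[0].message.content)
```
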
/main.py:
--------------------------------------------------------------------------------
import sounddevice as sd
import soundfile as sf
import numpy as np
import speech_recognition as sr
from openai import OpenAI
import os
import base64
from playsound import playsound
from io import BytesIO

# Read the API key up front so a missing key fails immediately
api_key = os.environ["OPENAI_API_KEY"]

# Recorded input and TTS output are written under audio/
os.makedirs("audio", exist_ok=True)


# Function to encode the image
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


client = OpenAI(api_key=api_key)


def frame_description(base64_image, user_prompt):
    return [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": user_prompt},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                },
            ],
        },
    ]


def analyze_image(full_analysis, base64_image, user_prompt):
    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[
            {
                "role": "system",
                "content": """
                Response should be a sentence max, maybe 2. You are a friend of someone who is watching a basketball game.
                They are asking you questions about what is happening in the basketball game. Talk to them naturally like a friendly conversation. Be very passionate and excited about the game and use exclamation marks.
                """,
            },
        ]
        + full_analysis
        + frame_description(base64_image, user_prompt),
        max_tokens=500,
    )
    response_text = response.choices[0].message.content
    return response_text


def play_audio(text):
    response = client.audio.speech.create(
        model="tts-1",
        voice="alloy",
        input=text,
    )

    response.stream_to_file("audio/output.mp3")
    playsound("audio/output.mp3")


def get_prompt():
    with open("audio/input.wav", "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            response_format="text"
        )
    return transcript


def get_input_file(threshold=0.03, silence_duration=3, base64_image=None):
    recognizer = sr.Recognizer()
    # Record at the input device's native rate so the WAV header matches the data
    samplerate = int(sd.query_devices(kind='input')['default_samplerate'])
    with sr.Microphone() as mic:
        print("Listening for speech...")
        # Adjust the recognizer sensitivity to ambient noise
        recognizer.adjust_for_ambient_noise(mic)
        started = False
        start_time = None
        audio_frames = []

        recording = True

        def callback(indata, frames, time, status):
            nonlocal started, start_time, audio_frames, recording, base64_image
            if np.any(indata > threshold):
                if not started:
                    print("Starting recording...")
                    # Grab the most recent frame saved by capture.py
                    image_path = "frames/frame.jpg"
                    # Getting the base64 string
                    base64_image = encode_image(image_path)
                    started = True
                start_time = time.inputBufferAdcTime
                audio_frames.append(indata.copy())
            elif started:
                if time.inputBufferAdcTime - start_time > silence_duration:
                    recording = False
                    raise sd.CallbackAbort

        with sd.InputStream(callback=callback, channels=1, samplerate=samplerate):
            while recording:
                sd.sleep(100)

        if audio_frames:
            audio_data = np.concatenate(audio_frames, axis=0)
            with BytesIO() as f:
                sf.write(f, audio_data, samplerate=samplerate, format='WAV')
                f.seek(0)
                with sr.AudioFile(f) as source:
                    audio = recognizer.record(source)
                # get_wav_data() returns WAV bytes, so store them under a .wav name
                with open("audio/input.wav", "wb") as wav_file:
                    wav_file.write(audio.get_wav_data(convert_rate=16000, convert_width=2))
        else:
            print("No speech detected")
    return base64_image


def main():
    full_analysis = []
    while True:
        final_image = get_input_file()
        user_prompt = get_prompt()
        print(user_prompt)
        analysis = analyze_image(full_analysis, final_image, user_prompt)
        print(analysis)
        play_audio(analysis)
        full_analysis = full_analysis + [{"role": "assistant", "content": analysis}]


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
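
The README stresses that `capture.py` and `main.py` must be running at the same time. A hypothetical convenience launcher (not part of the repo) that starts both from a single terminal could look like this:

```
# Hypothetical launcher: run capture.py and main.py side by side.
import subprocess
import sys

procs = [subprocess.Popen([sys.executable, script]) for script in ("capture.py", "main.py")]
try:
    for proc in procs:
        proc.wait()
except KeyboardInterrupt:
    for proc in procs:
        proc.terminate()
```
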
/singlestore.py:
--------------------------------------------------------------------------------
import singlestoredb as s2
import sounddevice as sd
import soundfile as sf
import numpy as np
import speech_recognition as sr
from openai import OpenAI
import os
import base64
from playsound import playsound
from io import BytesIO

# You can store user queries with SingleStore. If you do not want to use SingleStore, just use main.py.

# Read the API key up front so a missing key fails immediately
api_key = os.environ["OPENAI_API_KEY"]

# Recorded input and TTS output are written under audio/
os.makedirs("audio", exist_ok=True)


# Function to encode the image
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


client = OpenAI(api_key=api_key)


def frame_description(base64_image, user_prompt):
    return [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": user_prompt},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                },
            ],
        },
    ]


def analyze_image(full_analysis, base64_image, user_prompt):
    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[
            {
                "role": "system",
                "content": """
                Response should be a sentence max, maybe 2. You are a friend of someone who is watching a basketball game. The Clippers are the black jersey team (with white logo on bottom). The other team is the Rockets and they are in a white jersey (red logo on bottom).
                They are asking you questions about what is happening in the basketball game. Talk to them naturally like a friendly conversation. Be very passionate and excited about the game and use exclamation marks.
                """,
            },
        ]
        + full_analysis
        + frame_description(base64_image, user_prompt),
        max_tokens=500,
    )
    response_text = response.choices[0].message.content
    return response_text


def play_audio(text):
    response = client.audio.speech.create(
        model="tts-1",
        voice="alloy",
        input=text,
    )

    response.stream_to_file("audio/output.mp3")
    playsound("audio/output.mp3")


def get_prompt():
    with open("audio/input.wav", "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            response_format="text"
        )
    return transcript


def get_input_file(threshold=0.03, silence_duration=3, base64_image=None):
    recognizer = sr.Recognizer()
    # Record at the input device's native rate so the WAV header matches the data
    samplerate = int(sd.query_devices(kind='input')['default_samplerate'])
    with sr.Microphone() as mic:
        print("Listening for speech...")
        # Adjust the recognizer sensitivity to ambient noise
        recognizer.adjust_for_ambient_noise(mic)
        started = False
        start_time = None
        audio_frames = []

        recording = True

        def callback(indata, frames, time, status):
            nonlocal started, start_time, audio_frames, recording, base64_image
            if np.any(indata > threshold):
                if not started:
                    print("Starting recording...")
                    # Grab the most recent frame saved by capture.py
                    image_path = "frames/frame.jpg"
                    # Getting the base64 string
                    base64_image = encode_image(image_path)
                    started = True
                start_time = time.inputBufferAdcTime
                audio_frames.append(indata.copy())
            elif started:
                if time.inputBufferAdcTime - start_time > silence_duration:
                    recording = False
                    raise sd.CallbackAbort

        with sd.InputStream(callback=callback, channels=1, samplerate=samplerate):
            while recording:
                sd.sleep(100)

        if audio_frames:
            audio_data = np.concatenate(audio_frames, axis=0)
            with BytesIO() as f:
                sf.write(f, audio_data, samplerate=samplerate, format='WAV')
                f.seek(0)
                with sr.AudioFile(f) as source:
                    audio = recognizer.record(source)
                # get_wav_data() returns WAV bytes, so store them under a .wav name
                with open("audio/input.wav", "wb") as wav_file:
                    wav_file.write(audio.get_wav_data(convert_rate=16000, convert_width=2))
        else:
            print("No speech detected")
    return base64_image


def main():
    full_analysis = []
    # NOTE: prefer reading the connection URL (and its credentials) from an environment variable
    conn = s2.connect(
        'admin:Testing123@svc-ca8fa339-0d39-4942-ad73-4463f4110a1c-dml.aws-virginia-5.svc.singlestore.com:3306/testing')

    conn.autocommit(True)
    try:
        while True:
            final_image = get_input_file()
            user_prompt = get_prompt()
            print(user_prompt)
            analysis = analyze_image(full_analysis, final_image, user_prompt)
            print(analysis)
            play_audio(analysis)
            full_analysis.append({"role": "assistant", "content": analysis})

            # SQL statement for a regular INSERT
            insert_stmt = 'INSERT INTO OpenAISingleStore (TextValue) VALUES (%s)'

            with conn.cursor() as cur:
                # Insert the data without specifying TextKey; it will auto-increment
                cur.execute(insert_stmt, (user_prompt,))

                # Retrieve the last inserted ID
                cur.execute('SELECT LAST_INSERT_ID()')
                last_id_result = cur.fetchone()
                if last_id_result:
                    last_id = last_id_result[0]
                    print("Last inserted ID:", last_id)

    finally:
        # Ensure the connection is closed when the loop exits
        conn.close()


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
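
`singlestore.py` inserts each transcribed question into a table named `OpenAISingleStore` and relies on an auto-incrementing `TextKey` column that the INSERT never supplies, but the table definition itself is not in the repo. A plausible one-time setup, with assumed column types and a placeholder connection URL, could look like the sketch below; reading the real URL from an environment variable is also safer than hard-coding credentials in the script.

```
# Hypothetical one-time setup for the table singlestore.py writes to.
# Column names come from the INSERT statement above; the types are assumptions.
import singlestoredb as s2

conn = s2.connect("user:password@host:3306/testing")  # replace with your own URL

with conn.cursor() as cur:
    cur.execute(
        """
        CREATE TABLE IF NOT EXISTS OpenAISingleStore (
            TextKey BIGINT AUTO_INCREMENT PRIMARY KEY,
            TextValue TEXT
        )
        """
    )
conn.close()
```
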