├── frames
│   └── frame.jpg
├── .idea
│   ├── .gitignore
│   ├── misc.xml
│   ├── inspectionProfiles
│   │   ├── profiles_settings.xml
│   │   └── Project_Default.xml
│   ├── modules.xml
│   └── GPT4V.iml
├── requirements.txt
├── LICENSE
├── capture.py
├── README.md
├── main.py
└── singlestore.py

/frames/frame.jpg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
# Default ignored files
/shelf/
/workspace.xml
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/GPT4V.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
annotated-types==0.6.0
anyio==3.7.1
certifi==2023.11.17
cffi==1.16.0
charset-normalizer==3.3.2
distro==1.8.0
exceptiongroup==1.1.3
h11==0.14.0
httpcore==1.0.2
httpx==0.25.1
idna==3.4
numpy==1.26.2
openai==1.3.3
opencv-python==4.8.1.78
Pillow==10.1.0
playsound==1.3.0
PyAudio==0.2.14
pycparser==2.21
pydantic==2.5.1
pydantic_core==2.14.3
pyobjc==10.0
requests==2.31.0
sniffio==1.3.0
sounddevice==0.4.6
soundfile==0.12.1
SpeechRecognition==3.10.0
tqdm==4.66.1
typing_extensions==4.8.0
urllib3==2.1.0
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Ayush Pai

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/capture.py:
--------------------------------------------------------------------------------
import cv2
import time
from PIL import Image
import numpy as np
import os

# Folder for saved frames
folder = "frames"

# Create the frames folder if it doesn't exist
frames_dir = os.path.join(os.getcwd(), folder)
os.makedirs(frames_dir, exist_ok=True)

# Initialize the video capture from a prerecorded clip
cap = cv2.VideoCapture("videos/clippers.mp4")

# Wait for the source to initialize and adjust light levels
time.sleep(2)

while True:
    ret, frame = cap.read()
    if ret:
        # Round-trip the frame through PIL (BGR -> RGB -> BGR); a convenient
        # hook for any extra PIL-based processing
        pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        frame = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)

        # Display the frame
        cv2.imshow("Video Preview", frame)

        # Save the frame so main.py can read the latest image
        print("Saved current frame")
        path = f"{folder}/frame.jpg"
        cv2.imwrite(path, frame)
    else:
        print("Failed to capture image")
        break

    # Quit when the user presses the 'q' key
    if cv2.waitKey(25) & 0xFF == ord('q'):
        break

# Release the capture and close all windows
cap.release()
cv2.destroyAllWindows()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Conversational AI with GPT-4 Vision, OpenAI Whisper, and TTS

## Overview
This project integrates GPT-4 Vision, OpenAI Whisper, and OpenAI Text-to-Speech (TTS) to create an interactive AI system for conversations. It combines visual and audio inputs for a seamless user experience.

## Demo Video
https://twitter.com/ayushspai/status/1726222559480557647

## Components
- **GPT-4 Vision**: Analyzes visual input and generates contextual responses.
- **OpenAI Whisper**: Converts spoken language into text.
- **OpenAI TTS**: Transforms text responses into spoken language.

## Main Files
- `main.py`: Manages audio processing, image encoding, AI interactions, and text-to-speech output.
- `capture.py`: Captures and processes video frames for visual analysis.

## Installation

### Prerequisites
- Python 3.x
- An OpenAI API key, set as the environment variable `OPENAI_API_KEY` (see the check below)

### Libraries
Install the necessary libraries with the requirements.txt file:
```
pip install -r requirements.txt
```
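
Both scripts expect the API key to be available before they start. A minimal sketch for confirming this (not part of the project code; the v1 `openai` client reads `OPENAI_API_KEY` from the environment automatically):

```python
import os
from openai import OpenAI

# Fail fast with a clear message if the key is missing.
if "OPENAI_API_KEY" not in os.environ:
    raise SystemExit("Set the OPENAI_API_KEY environment variable first")

client = OpenAI()  # equivalent to OpenAI(api_key=os.environ["OPENAI_API_KEY"])
```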

## Usage

### Running the Scripts
- **Start `capture.py`**: Captures video frames and saves them for AI analysis.
  - Reads a video file, displays it, and continuously saves the current frame as `frame.jpg`.
  - Execute with `python capture.py`.
- **Run `main.py` concurrently**: Orchestrates the conversational AI.
  - Continuously listens for user audio input.
  - Transcribes speech to text, grabs the current video frame, and sends both to GPT-4 Vision for analysis.
  - Converts the AI's response to speech and plays it back.
  - Execute with `python main.py`.

### Workflow
1. `main.py` listens for audio input and transcribes it using OpenAI Whisper.
2. Meanwhile, `capture.py` keeps overwriting `frame.jpg` with the latest video frame.
3. Both the audio transcription and the base64-encoded frame are sent to GPT-4 Vision (the request shape is sketched below).
4. GPT-4 Vision responds, considering the visual and textual context.
5. The response is vocalized using OpenAI TTS and played to the user.
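
For reference, this is roughly the per-turn request that `main.py` assembles (mirroring its `frame_description()` and `analyze_image()` functions, which target the `gpt-4-vision-preview` API; the system text and user question here are placeholders):

```python
import base64
from openai import OpenAI

client = OpenAI()

# Encode the frame that capture.py last saved
with open("frames/frame.jpg", "rb") as f:
    base64_image = base64.b64encode(f.read()).decode("utf-8")

messages = [
    {"role": "system", "content": "You are a friend watching the basketball game with the user."},
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What just happened on that play?"},
            {"type": "image_url", "image_url": f"data:image/jpeg;base64,{base64_image}"},
        ],
    },
]

response = client.chat.completions.create(
    model="gpt-4-vision-preview",  # the model this project targets
    messages=messages,
    max_tokens=500,
)
print(response.choices[0].message.content)
```

On each turn the assistant's earlier replies are appended to the message list, which is how the conversation keeps its context.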

### Notes
- Ensure both `main.py` and `capture.py` are running for the system to function.
- The video source in `capture.py` can be customized (see the sketch below).
- Adequate hardware is recommended for smooth audio and video processing.
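
For example, to read from a live webcam instead of the bundled clip, change the capture source in `capture.py` (a minimal sketch; device index 0 is an assumption and may differ on your machine):

```python
import cv2

# Default webcam instead of a prerecorded file; any other device index
# or a different video path works the same way.
cap = cv2.VideoCapture(0)  # was: cv2.VideoCapture("videos/clippers.mp4")
```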

## Conclusion
This project demonstrates a novel approach to combining various AI technologies, creating a dynamic and interactive conversational AI experience. It harnesses the capabilities of GPT-4 Vision, Whisper, and TTS for a comprehensive audio-visual interaction.
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import sounddevice as sd
import soundfile as sf
import numpy as np
import speech_recognition as sr
from openai import OpenAI
import os
import base64
from playsound import playsound
from io import BytesIO

# The v1 OpenAI client takes the key directly from the environment.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Working directory for input.mp3 / output.mp3
os.makedirs("audio", exist_ok=True)


# Encode an image file as a base64 string for the vision API
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


# Build the user message containing the prompt text and the current frame
def frame_description(base64_image, user_prompt):
    return [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": user_prompt},
                {
                    "type": "image_url",
                    "image_url": f"data:image/jpeg;base64,{base64_image}",
                },
            ],
        },
    ]


# Send the conversation history, the new frame, and the user prompt to GPT-4 Vision
def analyze_image(full_analysis, base64_image, user_prompt):
    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[
            {
                "role": "system",
                "content": """
                Response should be a sentence max, maybe 2. You are a friend of someone who is watching a basketball game.
                They are asking you questions about what is happening in the basketball game. Talk to them naturally like a friendly conversation. Be very passionate and excited about the game and use exclamation marks.
                """,
            },
        ]
        + full_analysis
        + frame_description(base64_image, user_prompt),
        max_tokens=500,
    )
    response_text = response.choices[0].message.content
    return response_text


# Convert the response text to speech and play it back
def play_audio(text):
    response = client.audio.speech.create(
        model="tts-1",
        voice="alloy",
        input=text,
    )

    response.stream_to_file("audio/output.mp3")
    playsound("audio/output.mp3")


# Transcribe the recorded audio with Whisper
def get_prompt():
    with open("audio/input.mp3", "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            response_format="text"
        )
    return transcript


# Record microphone input until silence, snapshotting the current frame
# as soon as speech is first detected
def get_input_file(threshold=0.03, silence_duration=3, base64_image=None):
    recognizer = sr.Recognizer()
    with sr.Microphone() as mic:
        print("Listening for speech...")
        # Adjust the recognizer sensitivity to ambient noise
        recognizer.adjust_for_ambient_noise(mic)
        started = False
        start_time = None
        audio_frames = []

        recording = True

        def callback(indata, frames, time, status):
            nonlocal started, start_time, audio_frames, recording, base64_image
            if np.any(indata > threshold):
                if not started:
                    print("Starting recording...")
                    # Grab the frame capture.py most recently saved
                    image_path = "frames/frame.jpg"
                    # Getting the base64 string
                    base64_image = encode_image(image_path)
                    started = True
                start_time = time.inputBufferAdcTime
                audio_frames.append(indata.copy())
            elif started:
                # Stop once the input has been quiet for silence_duration seconds
                if time.inputBufferAdcTime - start_time > silence_duration:
                    recording = False
                    raise sd.CallbackAbort

        with sd.InputStream(callback=callback, channels=1):
            while recording:
                sd.sleep(100)

        if audio_frames:
            # Bundle the captured chunks into an in-memory WAV for speech_recognition
            audio_data = np.concatenate(audio_frames, axis=0)
            with BytesIO() as f:
                sf.write(f, audio_data, samplerate=70000, format='WAV')
                f.seek(0)
                with sr.AudioFile(f) as source:
                    audio = recognizer.record(source)
                # Note: this writes WAV data to a file named input.mp3; Whisper
                # accepts it even though the extension does not match the format.
                with open("audio/input.mp3", "wb") as mp3_file:
                    mp3_file.write(audio.get_wav_data(convert_rate=16000, convert_width=2))
        else:
            print("No speech detected")
    return base64_image


def main():
    full_analysis = []
    while True:
        final_image = get_input_file()
        user_prompt = get_prompt()
        print(user_prompt)
        analysis = analyze_image(full_analysis, final_image, user_prompt)
        print(analysis)
        play_audio(analysis)
        # Keep the assistant's replies as running context for the next turn
        full_analysis = full_analysis + [{"role": "assistant", "content": analysis}]


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/singlestore.py:
--------------------------------------------------------------------------------
import singlestoredb as s2
import sounddevice as sd
import soundfile as sf
import numpy as np
import speech_recognition as sr
from openai import OpenAI
import os
import base64
from playsound import playsound
from io import BytesIO

# You can store user queries with SingleStore. If you do not want to use
# SingleStore, just use main.py.

# The v1 OpenAI client takes the key directly from the environment.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Working directory for input.mp3 / output.mp3
os.makedirs("audio", exist_ok=True)


# Encode an image file as a base64 string for the vision API
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


# Build the user message containing the prompt text and the current frame
def frame_description(base64_image, user_prompt):
    return [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": user_prompt},
                {
                    "type": "image_url",
                    "image_url": f"data:image/jpeg;base64,{base64_image}",
                },
            ],
        },
    ]


# Send the conversation history, the new frame, and the user prompt to GPT-4 Vision
def analyze_image(full_analysis, base64_image, user_prompt):
    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[
            {
                "role": "system",
                "content": """
                Response should be a sentence max, maybe 2. You are a friend of someone who is watching a basketball game. The Clippers are the black jersey team (with white logo on bottom). The other team is the Rockets and they are in a white jersey (red logo on bottom).
                They are asking you questions about what is happening in the basketball game. Talk to them naturally like a friendly conversation. Be very passionate and excited about the game and use exclamation marks.
                """,
            },
        ]
        + full_analysis
        + frame_description(base64_image, user_prompt),
        max_tokens=500,
    )
    response_text = response.choices[0].message.content
    return response_text


# Convert the response text to speech and play it back
def play_audio(text):
    response = client.audio.speech.create(
        model="tts-1",
        voice="alloy",
        input=text,
    )

    response.stream_to_file("audio/output.mp3")
    playsound("audio/output.mp3")


# Transcribe the recorded audio with Whisper
def get_prompt():
    with open("audio/input.mp3", "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            response_format="text"
        )
    return transcript


# Record microphone input until silence, snapshotting the current frame
# as soon as speech is first detected
def get_input_file(threshold=0.03, silence_duration=3, base64_image=None):
    recognizer = sr.Recognizer()
    with sr.Microphone() as mic:
        print("Listening for speech...")
        # Adjust the recognizer sensitivity to ambient noise
        recognizer.adjust_for_ambient_noise(mic)
        started = False
        start_time = None
        audio_frames = []

        recording = True

        def callback(indata, frames, time, status):
            nonlocal started, start_time, audio_frames, recording, base64_image
            if np.any(indata > threshold):
                if not started:
                    print("Starting recording...")
                    # Grab the frame capture.py most recently saved
                    image_path = "frames/frame.jpg"
                    # Getting the base64 string
                    base64_image = encode_image(image_path)
                    started = True
                start_time = time.inputBufferAdcTime
                audio_frames.append(indata.copy())
            elif started:
                # Stop once the input has been quiet for silence_duration seconds
                if time.inputBufferAdcTime - start_time > silence_duration:
                    recording = False
                    raise sd.CallbackAbort

        with sd.InputStream(callback=callback, channels=1):
            while recording:
                sd.sleep(100)

        if audio_frames:
            # Bundle the captured chunks into an in-memory WAV for speech_recognition
            audio_data = np.concatenate(audio_frames, axis=0)
            with BytesIO() as f:
                sf.write(f, audio_data, samplerate=70000, format='WAV')
                f.seek(0)
                with sr.AudioFile(f) as source:
                    audio = recognizer.record(source)
                # Note: this writes WAV data to a file named input.mp3; Whisper
                # accepts it even though the extension does not match the format.
                with open("audio/input.mp3", "wb") as mp3_file:
                    mp3_file.write(audio.get_wav_data(convert_rate=16000, convert_width=2))
        else:
            print("No speech detected")
    return base64_image
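
# The INSERT statement in main() below assumes a table along these lines already
# exists in the target database (assumed schema; the TextKey/TextValue names come
# from the code, but the exact column types are a guess -- adjust as needed):
#
#   CREATE TABLE OpenAISingleStore (
#       TextKey BIGINT AUTO_INCREMENT PRIMARY KEY,
#       TextValue TEXT
#   );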
detected") 128 | return base64_image 129 | 130 | 131 | def main(): 132 | full_analysis = [] 133 | conn = s2.connect( 134 | 'admin:Testing123@svc-ca8fa339-0d39-4942-ad73-4463f4110a1c-dml.aws-virginia-5.svc.singlestore.com:3306/testing') 135 | 136 | conn.autocommit(True) 137 | try: 138 | while True: 139 | final_image = get_input_file() 140 | user_prompt = get_prompt() 141 | print(user_prompt) 142 | analysis = analyze_image(full_analysis, final_image, user_prompt) 143 | print(analysis) 144 | play_audio(analysis) 145 | full_analysis.append({"role": "assistant", "content": analysis}) 146 | 147 | # SQL statement for a regular INSERT 148 | insert_stmt = 'INSERT INTO OpenAISingleStore (TextValue) VALUES (%s)' 149 | 150 | with conn.cursor() as cur: 151 | # Insert the data without specifying TextKey; it will auto-increment 152 | cur.execute(insert_stmt, (user_prompt,)) 153 | 154 | # Retrieve the last inserted ID 155 | cur.execute('SELECT LAST_INSERT_ID()') 156 | last_id_result = cur.fetchone() 157 | if last_id_result: 158 | last_id = last_id_result[0] 159 | print("Last inserted ID:", last_id) 160 | 161 | finally: 162 | # Ensure the connection is closed after the loop 163 | conn.close() 164 | 165 | 166 | if __name__ == "__main__": 167 | main() 168 | --------------------------------------------------------------------------------