├── .gitignore ├── config.py ├── modules ├── helpers.py ├── recorder.py └── cv2_stream.py ├── motion.py ├── README.md ├── voice.py ├── auto.py └── auto_with_ui.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | collage.jpg 3 | detect.jpg 4 | frame.jpg 5 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | config = { 2 | # how sensitive motion detection is 3 | "motion_threshold": 4, 4 | 5 | # what is considered "big" motion 6 | "big_motion_threshold": 30, 7 | 8 | # how many still frames before motion ends 9 | "still_frame_threshold": 20, 10 | 11 | # when to stop recording a long motion 12 | "automatic_motion_cutoff": 200, 13 | 14 | # what level of audio is detected as speech 15 | "speech_threshold": 3, 16 | } 17 | -------------------------------------------------------------------------------- /modules/helpers.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | import base64 3 | import sys 4 | import cv2 5 | import re 6 | 7 | def filter_garbage(message): 8 | if re.sub(r"[^a-z0-9]", "", message) == "": 9 | return True 10 | 11 | if message.count(",") / len(message) > 0.1: 12 | return True 13 | 14 | if message.strip().strip(",!?") in ["mm-hmm", "cough", "tshh", "pfft", "swoosh"]: 15 | return True 16 | 17 | for word in ["mm-hmm,", "cough,", "tshh,", "pfft,", "swoosh,"]: 18 | if word in message: 19 | return True 20 | 21 | return False 22 | 23 | def image_b64(image): 24 | if isinstance(image, str): 25 | with open(image, "rb") as f: 26 | return base64.b64encode(f.read()).decode() 27 | elif isinstance(image, bytes): 28 | return base64.b64encode(image).decode() 29 | else: 30 | buffer = BytesIO() 31 | image.save(buffer, format="JPEG") 32 | return base64.b64encode(buffer.getvalue()).decode() 33 | 34 | def sharpness(image): 35 | if isinstance(image, str): 36 | image = cv2.imread(image) 37 | 38 | gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) 39 | 40 | laplacian = cv2.Laplacian(gray, cv2.CV_64F) 41 | 42 | return laplacian.var() 43 | 44 | def get_stream(): 45 | if len(sys.argv) < 2: 46 | print("ERROR: No video stream provided") 47 | print(f"Usage: {sys.argv[0]} VIDEO_STREAM_URL") 48 | sys.exit(1) 49 | 50 | return sys.argv[1] -------------------------------------------------------------------------------- /motion.py: -------------------------------------------------------------------------------- 1 | import modules.cv2_stream as cv2_stream 2 | from playsound import playsound 3 | from openai import OpenAI 4 | import os 5 | 6 | import modules.helpers as helpers 7 | 8 | client = OpenAI() 9 | 10 | stream_url = helpers.get_stream() 11 | 12 | messages = [ 13 | { 14 | "role": "system", 15 | "content": """You are an AI assistant that reports what it sees in short sentences. You are connected to a video feed and will be shown a collage of frames from a video and you will tell what what is happening in the video as if you are watching. Don't let the user know that you are seeing a sequence of images. Pretend as if you are a human seeing what is happening live. You are talking directly to the user. Assume any activity is performed by the user, who you shall refer to as "You". Example responses: "I see you drew an elephant" or "I see a dog" or "I see you put a TV remote on the table". Keep your responses very concise. 
If nothing particularly interesting happened since the previous image sequence, answer with the JSON {"status": "NO_CHANGE"}""".strip(), 16 | } 17 | ] 18 | 19 | for collage in cv2_stream.detect_changes(stream_url): 20 | print("Motion detected!") 21 | collage.save("collage.jpg", format="JPEG") 22 | 23 | messages.append({ 24 | "role": "user", 25 | "content": [ 26 | { 27 | "type": "image_url", 28 | "image_url": f"data:image/jpeg;base64,{helpers.image_b64(collage)}" 29 | } 30 | ] 31 | }) 32 | 33 | response = client.chat.completions.create( 34 | messages=messages, 35 | model="gpt-4-vision-preview", 36 | max_tokens=1024 37 | ) 38 | 39 | response_message = response.choices[0].message 40 | 41 | if '{"status": "NO_CHANGE"}' in response_message.content: 42 | messages.pop() 43 | continue 44 | 45 | messages.append(response_message) 46 | 47 | audio = client.audio.speech.create( 48 | input=response_message.content, 49 | model="tts-1", 50 | voice="onyx", 51 | ) 52 | 53 | audio.stream_to_file("audio.mp3") 54 | print("GPT: " + response_message.content) 55 | playsound("audio.mp3") 56 | os.remove("audio.mp3") 57 | 58 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GPT-4V Gemini 2 | 3 | This is a crude demo project made to mimic the supposed [live video ingestion capabilities](https://www.youtube.com/watch?v=UIZAiXYceBI) of Google's multimodal Gemini LLM, but made with the GPT-4 Vision API. 4 | 5 | Demo: https://youtu.be/UxQb88gENeg 6 | 7 | ## Setup 8 | 9 | ```shell 10 | $ pip install -r requirements.txt 11 | $ export OPENAI_API_KEY=YOUR_OPENAI_API_KEY 12 | ``` 13 | 14 | ## Voice version (terminal) 15 | 16 | To run the voice commanded terminal version, run the `voice.py` script. 17 | 18 | ```shell 19 | $ python3 voice.py VIDEO_STREAM_URL 20 | ``` 21 | 22 | The assistant only reacts to voice commands. 23 | 24 | ## Motion version (terminal) 25 | 26 | To run the motion detecting version, run the `motion.py` script. 27 | 28 | ```shell 29 | $ python3 motion.py VIDEO_STREAM_URL 30 | ``` 31 | 32 | The assistants reacts every time motion is detected in the video. A tripod is recommended. 33 | 34 | ## Automatic version (terminal) 35 | 36 | To run the automatic version that detects both voice commands and motion in the video, run the `auto.py` script. 37 | 38 | ```shell 39 | $ python3 auto.py VIDEO_STREAM_URL 40 | ``` 41 | 42 | The assistants reacts every time motion is detected in the video or a voice command is given. A tripod is recommended. 43 | 44 | ## Automatic version with UI 45 | 46 | There is also a version with a "UI" made with CV2 (it sucks but kinda works). It both listens to voice commands and detects motion in the video and automatically sends both to the GPT4V API. 47 | 48 | ```shell 49 | $ python3 auto_with_ui.py VIDEO_STREAM_URL 50 | ``` 51 | 52 | ## How to get a video stream URL 53 | 54 | In my testing, I have used my phone camera as the video stream. For this, I used the [IP Webcam](https://play.google.com/store/apps/details?id=com.pas.webcam&pcampaignid=web_share) app on Play Store. I set the camera to 10 fps at 640x480 resolution. 55 | 56 | The VIDEO_STREAM_URL is passed directly into `cv2.VideoCapture()`, so I guess you should be able to pass in a video file too, or any kind of video stream. 57 | 58 | ## Configuration 59 | 60 | There is a `config.py` file where you can tweak some settings if you are having trouble with the motion detection or speech detection. 
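For instance, if motion detection fires too often or background noise keeps triggering the voice recorder, raising the thresholds makes both less sensitive. A sketch of such a setup (the numbers are only an illustration, not recommended values):

```python
# config.py -- a less sensitive setup (illustrative values)
config = {
    "motion_threshold": 8,           # higher = more changed pixels required before motion counts
    "big_motion_threshold": 40,      # higher = only larger movements count as "big" motion
    "still_frame_threshold": 20,     # still frames to wait before a motion event is considered over
    "automatic_motion_cutoff": 200,  # hard cap on how many frames one motion event may collect
    "speech_threshold": 5,           # higher = speech must be louder relative to ambient noise
}
```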
61 | 62 | # Known issues 63 | 64 | * GPT-4V API is often slow 65 | * Sometimes the assistant response is detected as a user message 66 | * The CV2 UI sucks and should be made with another way 67 | * The CV2 UI can only be closed by hittin Ctrl+C in the terminal 68 | -------------------------------------------------------------------------------- /voice.py: -------------------------------------------------------------------------------- 1 | import modules.cv2_stream as cv2_stream 2 | from multiprocessing import Process 3 | from playsound import playsound 4 | from openai import OpenAI 5 | import shutil 6 | import os 7 | 8 | import modules.recorder as recorder 9 | import modules.helpers as helpers 10 | 11 | client = OpenAI() 12 | 13 | stream_url = helpers.get_stream() 14 | 15 | messages = [ 16 | { 17 | "role": "system", 18 | "content": """You are an AI assistant that can see. The photos provided to you are the view from your eyes. Answer the user based on what you see. The user is holding the camera. If you see them pointing to something and asking what it is, tell them what it is. Don't say what you're looking at is an image, unless the image sent to you is of a physical image. Answer in short, concise answers.""", 19 | } 20 | ] 21 | 22 | def write_changes(): 23 | for _ in cv2_stream.stream_frames(stream_url, "frame.jpg"): 24 | pass 25 | 26 | video_process = Process(target=write_changes) 27 | video_process.start() 28 | 29 | while True: 30 | for message in recorder.live_speech(60): 31 | if helpers.filter_garbage(message): 32 | break 33 | 34 | print("You: " + message) 35 | 36 | shutil.copy("frame.jpg", "detect.jpg") 37 | 38 | try: 39 | messages.append({ 40 | "role": "user", 41 | "content": [ 42 | { 43 | "type": "image_url", 44 | "image_url": f"data:image/jpeg;base64,{helpers.image_b64('detect.jpg')}", 45 | }, 46 | { 47 | "type": "text", 48 | "text": "Message transcribed from recording (might contain mistakes): " + message, 49 | } 50 | ] 51 | }) 52 | 53 | response = client.chat.completions.create( 54 | messages=messages, 55 | model="gpt-4-vision-preview", 56 | max_tokens=1024 57 | ) 58 | 59 | response_message = response.choices[0].message 60 | response_text = response_message.content 61 | 62 | messages.append(response_message) 63 | except Exception as e: 64 | print(str(e)) 65 | response_text = "Sorry, I missed that" 66 | messages.append({ 67 | "role": "system", 68 | "content": "The user sent an invalid message" 69 | }) 70 | messages.append({ 71 | "role": "assistant", 72 | "content": response_text 73 | }) 74 | 75 | audio = client.audio.speech.create( 76 | input=response_text, 77 | model="tts-1", 78 | voice="onyx", 79 | ) 80 | 81 | audio.stream_to_file("audio.mp3") 82 | print("GPT: " + response_text) 83 | playsound("audio.mp3") 84 | os.remove("audio.mp3") 85 | 86 | break 87 | 88 | video_process.join() # i really wanna join, but I can't 89 | -------------------------------------------------------------------------------- /modules/recorder.py: -------------------------------------------------------------------------------- 1 | import audioop 2 | import whisper 3 | import pyaudio 4 | import wave 5 | import os 6 | 7 | from config import config 8 | 9 | whisper_model = whisper.load_model("base") 10 | ambient_detected = False 11 | speech_volume = 100 12 | 13 | def transcribe(audio_file): 14 | result = whisper_model.transcribe( 15 | audio_file, 16 | fp16=False, 17 | no_speech_threshold=0.1, 18 | initial_prompt="mm-hmm, cough, tshh, pfft, swoosh" 19 | ) 20 | 21 | return result["text"].strip() 22 | 23 | 
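# Note: the initial_prompt above nudges Whisper toward transcribing non-speech
# sounds as "mm-hmm", "cough", "tshh", "pfft" or "swoosh", which is exactly what
# helpers.filter_garbage() looks for when discarding junk transcripts.
#
# Standalone usage (a sketch, assuming an existing audio.wav recording):
#
#   from modules.recorder import transcribe
#   print(transcribe("audio.wav"))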
def live_speech(wait_time=10, transcribe_audio=True, processing=None, ui_queue=None, winwidth=0, winheight=0): 24 | global ambient_detected 25 | global speech_volume 26 | 27 | FORMAT = pyaudio.paInt16 28 | CHANNELS = 1 29 | RATE = 44100 30 | CHUNK = 1024 31 | 32 | audio = pyaudio.PyAudio() 33 | 34 | stream = audio.open( 35 | format=FORMAT, 36 | channels=CHANNELS, 37 | rate=RATE, 38 | input=True, 39 | frames_per_buffer=CHUNK 40 | ) 41 | 42 | frames = [] 43 | recording = False 44 | frames_recorded = 0 45 | 46 | while True: 47 | frames_recorded += 1 48 | data = stream.read(CHUNK) 49 | rms = audioop.rms(data, 2) 50 | 51 | if not ambient_detected: 52 | if frames_recorded < 40: 53 | if frames_recorded == 1: 54 | print("Detecting ambient noise...") 55 | if frames_recorded > 5: 56 | if speech_volume < rms: 57 | speech_volume = rms 58 | continue 59 | elif frames_recorded == 40: 60 | print("Listening...") 61 | speech_volume = speech_volume * config["speech_threshold"] 62 | ambient_detected = True 63 | 64 | if rms > speech_volume: 65 | if not recording: 66 | if processing: 67 | if processing.value == True: 68 | continue 69 | with processing.get_lock(): 70 | processing.value = True 71 | if ui_queue: 72 | # show "listening..." 73 | ui_queue.put({ 74 | "type": "draw_text", 75 | "args": { 76 | "text": "listening...", 77 | "position": (winwidth//2, int(winheight*0.9)), 78 | "color": (150, 150, 150) 79 | } 80 | }) 81 | print("Voice detected!") 82 | recording = True 83 | frames_recorded = 0 84 | elif recording and frames_recorded > wait_time: 85 | recording = False 86 | 87 | wf = wave.open("audio.wav", 'wb') 88 | wf.setnchannels(CHANNELS) 89 | wf.setsampwidth(audio.get_sample_size(FORMAT)) 90 | wf.setframerate(RATE) 91 | wf.writeframes(b''.join(frames)) 92 | wf.close() 93 | 94 | if transcribe_audio: 95 | result = transcribe("audio.wav") 96 | 97 | os.remove("audio.wav") 98 | 99 | yield result 100 | else: 101 | yield "audio.wav" 102 | 103 | if ui_queue: 104 | # hide "listening..." 
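# (CV2 offers no way to erase drawn text, so the label is "hidden" by redrawing
#  the same string in black over the black window background)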
105 | ui_queue.put({ 106 | "type": "draw_text", 107 | "args": { 108 | "text": "listening...", 109 | "position": (winwidth//2, int(winheight*0.9)), 110 | "color": (0, 0, 0) 111 | } 112 | }) 113 | 114 | frames = [] 115 | 116 | if recording: 117 | frames.append(data) 118 | 119 | # TODO: do these when breaking from generator 120 | stream.stop_stream() 121 | stream.close() 122 | audio.terminate() -------------------------------------------------------------------------------- /auto.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Process, Queue, Value 2 | import modules.cv2_stream as cv2_stream 3 | from playsound import playsound 4 | from openai import OpenAI 5 | from queue import Empty 6 | from PIL import Image 7 | import base64 8 | import shutil 9 | import time 10 | import os 11 | 12 | import modules.recorder as recorder 13 | import modules.helpers as helpers 14 | 15 | client = OpenAI() 16 | 17 | def motion_fn(queue: Queue, stream_url, processing: Value): 18 | for collage in cv2_stream.detect_changes(stream_url, processing=processing): 19 | print("Motion detected!") 20 | collage.save("collage.jpg", format="JPEG") 21 | 22 | queue.put({ 23 | "image": collage 24 | }) 25 | 26 | with processing.get_lock(): 27 | processing.value = False 28 | 29 | def voice_fn(queue: Queue, processing: Value): 30 | for audio_file in recorder.live_speech(60, transcribe_audio=False, processing=processing): 31 | if os.path.exists("collage.jpg"): 32 | image = Image.open("collage.jpg") 33 | os.remove("collage.jpg") 34 | else: 35 | shutil.copy("frame.jpg", "detect.jpg") 36 | try: 37 | image = Image.open("detect.jpg") 38 | except OSError: 39 | time.sleep(0.1) 40 | shutil.copy("frame.jpg", "detect.jpg") 41 | image = Image.open("detect.jpg") 42 | 43 | os.remove("detect.jpg") 44 | 45 | queue.put({ 46 | "audio": audio_file, 47 | "image": image 48 | }) 49 | 50 | with processing.get_lock(): 51 | processing.value = False 52 | 53 | def video_fn(): 54 | for _ in cv2_stream.stream_frames(stream_url, "frame.jpg"): 55 | pass 56 | 57 | def parse_message(message): 58 | image = helpers.image_b64(message["image"]) 59 | 60 | with open("capture.jpg", "wb") as f: 61 | captured_image = base64.b64decode(image) 62 | f.write(captured_image) 63 | 64 | content = [ 65 | { 66 | "type": "image_url", 67 | "image_url": f"data:image/jpeg;base64,{image}" 68 | } 69 | ] 70 | 71 | if "audio" in message: 72 | text = recorder.transcribe(message["audio"]) 73 | 74 | if helpers.filter_garbage(text): 75 | return [] 76 | 77 | print("You: " + text) 78 | 79 | content.append({ 80 | "type": "text", 81 | "text": "Message transcribed from recording (might contain mistakes): " + text, 82 | }) 83 | else: 84 | if os.path.exists("collage.jpg"): 85 | os.remove("collage.jpg") 86 | 87 | return content 88 | 89 | def empty_queue(queue): 90 | try: 91 | while True: 92 | queue.get_nowait() 93 | except Empty: 94 | pass 95 | 96 | stream_url = helpers.get_stream() 97 | 98 | if os.path.exists("collage.jpg"): 99 | os.remove("collage.jpg") 100 | 101 | if os.path.exists("detect.jpg"): 102 | os.remove("detect.jpg") 103 | 104 | if os.path.exists("frame.jpg"): 105 | os.remove("frame.jpg") 106 | 107 | queue = Queue() 108 | 109 | processing_audio = Value('b', False) 110 | processing_video = Value('b', False) 111 | 112 | print("Starting video process...") 113 | video_process = Process(target=video_fn) 114 | video_process.start() 115 | 116 | print("Starting motion process...") 117 | motion_process = Process(target=motion_fn, 
args=(queue, stream_url, processing_video)) 118 | motion_process.start() 119 | 120 | print("Starting voice process...") 121 | voice_process = Process(target=voice_fn, args=(queue, processing_audio), daemon=True) 122 | voice_process.start() 123 | 124 | messages = [ 125 | { 126 | "role": "system", 127 | "content": """You are an AI assistant that reports what it sees in short sentences. You are connected to a video feed and will be shown a collage of frames from a video and you will tell what what is happening in the video as if you are watching. Don't let the user know that you are seeing a sequence of images. Pretend as if you are a human seeing what is happening live. You are talking directly to the user. Assume any activity is performed by the user, who you shall refer to as "You". "I see you drew an elephant" or "I see a dog" or "I see you put a TV remote on the table". Keep your answers very concise. If nothing particularly interesting happened since the previous image sequence, answer with the JSON {"status": "NO_CHANGE"}""".strip(), 128 | } 129 | ] 130 | 131 | while True: 132 | message = queue.get() 133 | 134 | content = parse_message(message) 135 | 136 | time.sleep(0.5) 137 | 138 | if processing_audio.value == True: 139 | while processing_audio.value == True: 140 | time.sleep(0.1) 141 | 142 | if processing_video.value == True: 143 | while processing_video.value == True: 144 | time.sleep(0.1) 145 | 146 | try: 147 | message2 = queue.get(timeout=0.5) 148 | if message2: 149 | content += parse_message(message2) 150 | except Empty: 151 | pass 152 | 153 | messages.append({ 154 | "role": "user", 155 | "content": content 156 | }) 157 | 158 | response = client.chat.completions.create( 159 | messages=messages, 160 | model="gpt-4-vision-preview", 161 | max_tokens=1024 162 | ) 163 | 164 | response_message = response.choices[0].message 165 | response_text = response_message.content 166 | 167 | if '{"status": "NO_CHANGE"}' in response_text: 168 | messages.pop() 169 | continue 170 | 171 | messages.append(response_message) 172 | 173 | audio = client.audio.speech.create( 174 | input=response_text, 175 | model="tts-1", 176 | voice="onyx", 177 | ) 178 | 179 | audio.stream_to_file("audio.mp3") 180 | print("GPT: " + response_text) 181 | 182 | with processing_audio.get_lock(): 183 | processing_audio.value = True 184 | 185 | playsound("audio.mp3") 186 | os.remove("audio.mp3") 187 | 188 | time.sleep(0.2) 189 | 190 | empty_queue(queue) 191 | 192 | with processing_audio.get_lock(): 193 | processing_audio.value = False 194 | 195 | with processing_video.get_lock(): 196 | processing_video.value = False 197 | 198 | video_process.join() 199 | motion_process.join() 200 | voice_process.join() 201 | -------------------------------------------------------------------------------- /modules/cv2_stream.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import numpy as np 3 | import math 4 | import cv2 5 | 6 | import modules.helpers as helpers 7 | from config import config 8 | 9 | def make_collage(frames, border=35): 10 | frame_count = len(frames) 11 | 12 | rows = 1 13 | while frame_count / rows > 4: 14 | rows += 1 15 | per_row = math.ceil(frame_count / rows) 16 | 17 | try: 18 | frame = cv2.cvtColor(frames[0], cv2.COLOR_BGR2RGB) 19 | image1 = Image.fromarray(frame) 20 | collage = Image.new('RGB', (image1.width*per_row+border*(per_row-1), image1.height*rows+border*(per_row-1))) 21 | collage.paste(image1, (0, 0)) 22 | except OSError: 23 | print("Error saving 
collage...") 24 | return 25 | 26 | pos_x = image1.width + border 27 | pos_y = 0 28 | 29 | for i, frame in enumerate(frames[1:]): 30 | try: 31 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 32 | image = Image.fromarray(frame) 33 | collage.paste(image, (pos_x, pos_y)) 34 | except OSError: 35 | print("Error adding changed frame...") 36 | continue 37 | 38 | pos_x += image.width + border 39 | 40 | if (i+2) % per_row == 0: 41 | pos_y += image.height + border 42 | pos_x = 0 43 | 44 | return collage 45 | 46 | def detect_changes(stream_url, count=9, min_frames=5, max_frames=None, processing=None, frame_queue=None): 47 | if max_frames is None: 48 | max_frames = config["automatic_motion_cutoff"] 49 | 50 | # Create a VideoCapture object 51 | cap = cv2.VideoCapture(stream_url) 52 | 53 | # Check if the stream is opened successfully 54 | if not cap.isOpened(): 55 | print("Error: Unable to open video stream") 56 | exit() 57 | 58 | # Read the first frame 59 | ret, previous_frame = cap.read() 60 | if not ret: 61 | print("Error: Unable to read video stream") 62 | cap.release() 63 | exit() 64 | 65 | # Convert the first frame to grayscale 66 | previous_frame_gray = cv2.cvtColor(previous_frame, cv2.COLOR_BGR2GRAY) 67 | 68 | # Still frame counter 69 | still_frame_counter = 0 70 | 71 | # Frames 72 | frames = [] 73 | 74 | frame_counter = 0 75 | big_movement = 0 76 | while cap.isOpened(): 77 | frame_counter += 1 78 | # Capture frame-by-frame 79 | ret, current_frame = cap.read() 80 | if not ret: 81 | break 82 | 83 | if frame_queue: 84 | frame_queue.put(current_frame) 85 | 86 | # Convert current frame to grayscale 87 | gray_frame = cv2.cvtColor(current_frame, cv2.COLOR_BGR2GRAY) 88 | 89 | # Calculate the absolute difference 90 | frame_diff = cv2.absdiff(previous_frame_gray, gray_frame) 91 | 92 | # Threshold for significant change 93 | _, thresh = cv2.threshold(frame_diff, 30, 255, cv2.THRESH_BINARY) 94 | 95 | # Count the number of changed pixels 96 | change_count = np.sum(thresh != 0) 97 | 98 | # If significant change is detected, save the frame 99 | if change_count > current_frame.shape[1]*config["motion_threshold"]: # Threshold for change, adjust as needed 100 | if change_count > current_frame.shape[1]*config["big_motion_threshold"]: 101 | big_movement += 1 102 | 103 | if processing: 104 | with processing.get_lock(): 105 | processing.value = True 106 | 107 | frames.append(previous_frame) 108 | frames.append(current_frame) 109 | still_frame_counter = 0 110 | else: 111 | if still_frame_counter < 2: 112 | frames.append(current_frame) 113 | still_frame_counter += 1 114 | 115 | frame_count = len(frames) 116 | if still_frame_counter == config["still_frame_threshold"] or frame_count > max_frames: 117 | if frame_count > min_frames and big_movement >= int(frame_count/30): 118 | if frame_count > count: 119 | sharp_frames = {} 120 | frame_num = 0 121 | 122 | while frame_count > 50: 123 | frames = frames[0::2] 124 | frame_count = len(frames) 125 | 126 | for i, frame in enumerate(frames): 127 | if str(frame_num) not in sharp_frames: 128 | sharp_frames[str(frame_num)] = (0, None) 129 | 130 | sharpness = helpers.sharpness(frame) 131 | if sharp_frames[str(frame_num)][0] < sharpness: 132 | sharp_frames[str(frame_num)] = (sharpness, frame) 133 | 134 | if i % int(frame_count / count) == 0: 135 | frame_num += 1 136 | frames = [] 137 | for sharpness, frame in sharp_frames.values(): 138 | frames.append(frame) 139 | frame_count = len(frames) 140 | step = int(frame_count / count) 141 | step = 1 if step < 1 else step 142 | 
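# Walk backwards from the newest frame in steps of `step`, flip back to
# chronological order, and keep the last `count` samples for the collage: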
spread_out_frames = list(reversed(frames[-1::-step]))[-count:] # i no gud at math 143 | 144 | # Yield new motion 145 | yield make_collage(spread_out_frames) 146 | 147 | if processing: 148 | with processing.get_lock(): 149 | processing.value = False 150 | 151 | frames = [] 152 | big_movement = 0 153 | 154 | # Update the previous frame 155 | previous_frame_gray = gray_frame.copy() 156 | previous_frame = current_frame.copy() 157 | 158 | # Display the frame (optional) 159 | #cv2.imshow('Frame', current_frame) 160 | 161 | # Press Q on keyboard to exit the loop 162 | #if cv2.waitKey(1) & 0xFF == ord('q'): 163 | # break 164 | 165 | # Release the video capture object 166 | cap.release() 167 | 168 | # Close all frames 169 | cv2.destroyAllWindows() 170 | 171 | def stream_frames(stream_url, output_file=None): 172 | # Create a VideoCapture object 173 | cap = cv2.VideoCapture(stream_url) 174 | 175 | # Check if the stream is opened successfully 176 | if not cap.isOpened(): 177 | print("Error: Unable to open video stream") 178 | exit() 179 | 180 | frame_number = 0 181 | while cap.isOpened(): 182 | # Capture frame-by-frame 183 | ret, frame = cap.read() 184 | if not ret: 185 | break 186 | 187 | frame_number += 1 188 | 189 | if output_file: 190 | if frame_number % 10 == 0: 191 | # Save frame 192 | cv2.imwrite(output_file, frame) 193 | else: 194 | yield frame 195 | 196 | # Release the video capture object 197 | cap.release() 198 | 199 | # Close all frames 200 | cv2.destroyAllWindows() 201 | -------------------------------------------------------------------------------- /auto_with_ui.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Process, Queue, Value 2 | import modules.cv2_stream as cv2_stream 3 | from playsound import playsound 4 | from openai import OpenAI 5 | from queue import Empty 6 | from PIL import Image 7 | import numpy as np 8 | import textwrap 9 | import base64 10 | import time 11 | import sys 12 | import cv2 13 | import os 14 | 15 | import modules.recorder as recorder 16 | import modules.helpers as helpers 17 | 18 | client = OpenAI() 19 | 20 | def motion_fn(queue: Queue, stream_url, processing: Value, frame_queue: Queue): 21 | for collage in cv2_stream.detect_changes(stream_url, processing=processing, frame_queue=frame_queue): 22 | print("Motion detected!") 23 | collage.save("collage.jpg", format="JPEG") 24 | 25 | queue.put({ 26 | "image": collage 27 | }) 28 | 29 | with processing.get_lock(): 30 | processing.value = False 31 | 32 | def voice_fn(queue: Queue, processing: Value, ui_queue: Queue, winwidth, winheight): 33 | for audio_file in recorder.live_speech(60, transcribe_audio=False, processing=processing, ui_queue=ui_queue, winwidth=winwidth, winheight=winheight): 34 | if os.path.exists("collage.jpg"): 35 | image = Image.open("collage.jpg") 36 | os.remove("collage.jpg") 37 | else: 38 | frame_rgb = cv2.cvtColor(frame_queue.get(), cv2.COLOR_BGR2RGB) 39 | image = Image.fromarray(frame_rgb) 40 | #image.save("still.jpg", format="JPEG") 41 | 42 | queue.put({ 43 | "audio": audio_file, 44 | "image": image 45 | }) 46 | 47 | with processing.get_lock(): 48 | processing.value = False 49 | 50 | def parse_message(message): 51 | image = helpers.image_b64(message["image"]) 52 | 53 | with open("capture.jpg", "wb") as f: 54 | captured_image = base64.b64decode(image) 55 | f.write(captured_image) 56 | 57 | content = [ 58 | { 59 | "type": "image_url", 60 | "image_url": f"data:image/jpeg;base64,{image}" 61 | } 62 | ] 63 | 64 | if "audio" in 
message: 65 | text = recorder.transcribe(message["audio"]) 66 | 67 | if helpers.filter_garbage(text): 68 | return [] 69 | 70 | content.append({ 71 | "type": "text", 72 | "text": text, 73 | }) 74 | else: 75 | if os.path.exists("collage.jpg"): 76 | os.remove("collage.jpg") 77 | 78 | return content 79 | 80 | def empty_queue(queue): 81 | try: 82 | while True: 83 | queue.get_nowait() 84 | except Empty: 85 | pass 86 | 87 | def draw_text(window, text, position, color=(255, 255, 255), centered=True): 88 | font = cv2.FONT_HERSHEY_SIMPLEX 89 | font_scale = 0.8 90 | thickness = 2 91 | 92 | lines = textwrap.wrap(text, width=50) 93 | 94 | line_y = 0 95 | for line in lines: 96 | pos = position 97 | 98 | if centered: 99 | text_size = cv2.getTextSize(line, font, font_scale, thickness)[0] 100 | text_x = position[0] - text_size[0] // 2 101 | text_y = position[1] - text_size[1] // 2 102 | pos = (text_x, text_y) 103 | 104 | pos = (pos[0], pos[1]+line_y) 105 | line_y += int(text_size[1]*1.3) 106 | 107 | cv2.putText(window, line, pos, font, font_scale, color, thickness) 108 | 109 | cv2.imshow('GPT4GEMINI', window) 110 | if cv2.getWindowProperty('GPT4GEMINI', cv2.WND_PROP_VISIBLE) < 1: 111 | sys.exit() 112 | cv2.waitKey(1) 113 | 114 | def draw_window(winwidth, winheight, queue: Queue, frame_queue: Queue): 115 | cv2.namedWindow('GPT4GEMINI', cv2.WINDOW_NORMAL) 116 | cv2.resizeWindow('GPT4GEMINI', winwidth, winheight) 117 | 118 | window = np.zeros((winheight, winwidth, 3), dtype=np.uint8) 119 | cv2.imshow('GPT4GEMINI', window) 120 | if cv2.getWindowProperty('GPT4GEMINI', cv2.WND_PROP_VISIBLE) < 1: 121 | sys.exit() 122 | 123 | while True: 124 | frame = frame_queue.get() 125 | try: 126 | event = queue.get_nowait() 127 | except Empty: 128 | frame = cv2.resize(frame, (640, 400)) 129 | 130 | x = 50 131 | y = int(winheight / 2 - frame.shape[0] / 2) 132 | 133 | window[y:frame.shape[0]+y, x:frame.shape[1]+x] = frame 134 | cv2.imshow('GPT4GEMINI', window) 135 | cv2.waitKey(1) 136 | continue 137 | 138 | if event["type"] == "draw_text": 139 | draw_text(window, **event["args"]) 140 | 141 | if event["type"] == "clear": 142 | window = np.zeros((winheight, winwidth, 3), dtype=np.uint8) 143 | cv2.imshow('GPT4GEMINI', window) 144 | 145 | cv2.waitKey(1) 146 | 147 | cv2.destroyAllWindows() 148 | 149 | stream_url = helpers.get_stream() 150 | 151 | if os.path.exists("collage.jpg"): 152 | os.remove("collage.jpg") 153 | 154 | if os.path.exists("detect.jpg"): 155 | os.remove("detect.jpg") 156 | 157 | #if os.path.exists("frame.jpg"): 158 | #os.remove("frame.jpg") 159 | 160 | winwidth = int(1920*0.8) 161 | winheight = int(1080*0.8) 162 | 163 | user_text_color = (255, 255, 0) 164 | 165 | queue = Queue() 166 | ui_queue = Queue() 167 | frame_queue = Queue() 168 | 169 | processing_audio = Value('b', False) 170 | processing_video = Value('b', False) 171 | 172 | print("Starting motion process...") 173 | motion_process = Process(target=motion_fn, args=(queue, stream_url, processing_video, frame_queue)) 174 | motion_process.start() 175 | 176 | print("Starting voice process...") 177 | voice_process = Process(target=voice_fn, args=(queue, processing_audio, ui_queue, winwidth, winheight)) 178 | voice_process.start() 179 | 180 | print("Starting UI process...") 181 | ui_process = Process(target=draw_window, args=(winwidth, winheight, ui_queue, frame_queue)) 182 | ui_process.start() 183 | 184 | messages = [ 185 | { 186 | "role": "system", 187 | "content": """You are an AI assistant that reports what it sees in short sentences. 
You are connected to a video feed and will be shown a collage of frames from a video and you will tell what what is happening in the video as if you are seeing it live. Don't let the user know that you are seeing a sequence of images or frames. Don't say anything about a series of images or frames. Answer as if you are seeing the sequence of images in real life. Don't say they are a sequence of images. Just say what is happening in them, as if it is happening in front of your eyes. If the user asks you a direct question, answer it based on what you see. You are talking directly to the user. Assume any activity is performed by the user, who you shall refer to as "You". Example responses: "I see you drew an elephant" or "I see a cat" or "I see you put a coin on the table". If you notice something out of the ordinary, point it out. Keep your answers very concise. When playing games, tell the user if they won / are correct. If nothing particularly interesting happened since the previous image sequence, answer with the JSON {"status": "NO_CHANGE"}""".strip(), 188 | } 189 | ] 190 | 191 | while True: 192 | message = queue.get() 193 | 194 | content = parse_message(message) 195 | 196 | have_text = False 197 | 198 | for msg in content: 199 | if "text" in msg: 200 | have_text = True 201 | ui_queue.put({ 202 | "type": "clear", 203 | }) 204 | ui_queue.put({ 205 | "type": "draw_text", 206 | "args": { 207 | "text": msg["text"], 208 | "position": (winwidth//2, 80), 209 | "color": user_text_color 210 | } 211 | }) 212 | print("You: " + msg["text"]) 213 | 214 | time.sleep(0.5) 215 | 216 | if processing_audio.value == True: 217 | while processing_audio.value == True: 218 | time.sleep(0.1) 219 | 220 | if processing_video.value == True: 221 | while processing_video.value == True: 222 | time.sleep(0.1) 223 | 224 | with processing_audio.get_lock(): 225 | processing_audio.value = True 226 | 227 | try: 228 | message2 = queue.get(timeout=0.5) 229 | if message2: 230 | msg2 = parse_message(message2) 231 | for msg in msg2: 232 | if "text" in msg: 233 | have_text = True 234 | ui_queue.put({ 235 | "type": "clear", 236 | }) 237 | ui_queue.put({ 238 | "type": "draw_text", 239 | "args": { 240 | "text": msg["text"], 241 | "position": (winwidth//2, 80), 242 | "color": user_text_color 243 | } 244 | }) 245 | print("You: " + msg["text"]) 246 | content += msg2 247 | except Empty: 248 | pass 249 | 250 | if len(content) == 0: 251 | continue 252 | 253 | messages.append({ 254 | "role": "user", 255 | "content": content 256 | }) 257 | 258 | if True: 259 | print("Sending GPT4V request...") 260 | 261 | # show "calling gpt4v..." 262 | ui_queue.put({ 263 | "type": "draw_text", 264 | "args": { 265 | "text": "calling gpt4v...", 266 | "position": (winwidth//2, int(winheight*0.9)), 267 | "color": (255, 0, 255) 268 | } 269 | }) 270 | 271 | response = client.chat.completions.create( 272 | messages=messages, 273 | model="gpt-4-vision-preview", 274 | max_tokens=1024 275 | ) 276 | 277 | response_message = response.choices[0].message 278 | response_text = response_message.content 279 | 280 | if '{"status": "NO_CHANGE"}' in response_text: 281 | messages.pop() 282 | continue 283 | else: 284 | response_text = "This is a test" 285 | response_message = { 286 | "role": "assistant", 287 | "content": response_text, 288 | } 289 | 290 | messages.append(response_message) 291 | 292 | print("Generating audio response...") 293 | 294 | # hide "calling gpt4v..." 
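# (same trick as in recorder.live_speech: redrawing the label in black hides it)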
295 | ui_queue.put({ 296 | "type": "draw_text", 297 | "args": { 298 | "text": "calling gpt4v...", 299 | "position": (winwidth//2, int(winheight*0.9)), 300 | "color": (0, 0, 0) 301 | } 302 | }) 303 | 304 | # show "generating audio..." 305 | ui_queue.put({ 306 | "type": "draw_text", 307 | "args": { 308 | "text": "generating audio...", 309 | "position": (winwidth//2, int(winheight*0.9)), 310 | "color": (0, 255, 255) 311 | } 312 | }) 313 | 314 | audio = client.audio.speech.create( 315 | input=response_text, 316 | model="tts-1", 317 | voice="onyx", 318 | ) 319 | 320 | audio.stream_to_file("audio.mp3") 321 | 322 | # hide "generating audio..." 323 | ui_queue.put({ 324 | "type": "draw_text", 325 | "args": { 326 | "text": "generating audio...", 327 | "position": (winwidth//2, int(winheight*0.9)), 328 | "color": (0, 0, 0) 329 | } 330 | }) 331 | 332 | if not have_text: 333 | ui_queue.put({ 334 | "type": "clear", 335 | }) 336 | 337 | ui_queue.put({ 338 | "type": "draw_text", 339 | "args": { 340 | "text": response_text, 341 | "position": (int(winwidth*0.71), int(winheight*0.5)) 342 | } 343 | }) 344 | print("GPT: " + response_text) 345 | 346 | playsound("audio.mp3") 347 | os.remove("audio.mp3") 348 | 349 | time.sleep(1) 350 | 351 | with processing_audio.get_lock(): 352 | processing_audio.value = False 353 | 354 | with processing_video.get_lock(): 355 | processing_video.value = False 356 | 357 | empty_queue(queue) 358 | 359 | video_process.join() 360 | motion_process.join() 361 | voice_process.join() 362 | ui_process.join() 363 | --------------------------------------------------------------------------------
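A convenient way to tune the motion settings in config.py without calling the OpenAI API is to run the change detector on its own and inspect the collages it would send. The script below is not part of the repo, just a minimal sketch built on modules/cv2_stream.py and modules/helpers.py (run it from the repo root, e.g. `python3 tune_motion.py VIDEO_STREAM_URL`):

# tune_motion.py (hypothetical helper, not included in the repo)
import modules.cv2_stream as cv2_stream
import modules.helpers as helpers

stream_url = helpers.get_stream()  # reads the stream URL from argv, prints usage and exits otherwise

for i, collage in enumerate(cv2_stream.detect_changes(stream_url)):
    if collage is None:  # make_collage() returns None when building the image fails
        continue
    path = f"collage_{i}.jpg"
    collage.save(path, format="JPEG")
    print(f"Motion event {i}: saved {path}")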