├── .gitignore ├── config.py ├── modules ├── helpers.py ├── recorder.py └── cv2_stream.py ├── motion.py ├── README.md ├── voice.py ├── auto.py └── auto_with_ui.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | collage.jpg 3 | detect.jpg 4 | frame.jpg 5 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | config = { 2 | # how sensitive motion detection is 3 | "motion_threshold": 4, 4 | 5 | # what is considered "big" motion 6 | "big_motion_threshold": 30, 7 | 8 | # how many still frames before motion ends 9 | "still_frame_threshold": 20, 10 | 11 | # when to stop recording a long motion 12 | "automatic_motion_cutoff": 200, 13 | 14 | # what level of audio is detected as speech 15 | "speech_threshold": 3, 16 | } 17 | -------------------------------------------------------------------------------- /modules/helpers.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | import base64 3 | import sys 4 | import cv2 5 | import re 6 | 7 | def filter_garbage(message): 8 | if re.sub(r"[^a-z0-9]", "", message) == "": 9 | return True 10 | 11 | if message.count(",") / len(message) > 0.1: 12 | return True 13 | 14 | if message.strip().strip(",!?") in ["mm-hmm", "cough", "tshh", "pfft", "swoosh"]: 15 | return True 16 | 17 | for word in ["mm-hmm,", "cough,", "tshh,", "pfft,", "swoosh,"]: 18 | if word in message: 19 | return True 20 | 21 | return False 22 | 23 | def image_b64(image): 24 | if isinstance(image, str): 25 | with open(image, "rb") as f: 26 | return base64.b64encode(f.read()).decode() 27 | elif isinstance(image, bytes): 28 | return base64.b64encode(image).decode() 29 | else: 30 | buffer = BytesIO() 31 | image.save(buffer, format="JPEG") 32 | return base64.b64encode(buffer.getvalue()).decode() 33 | 34 | def sharpness(image): 35 | if isinstance(image, str): 36 | image = cv2.imread(image) 37 | 38 | gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) 39 | 40 | laplacian = cv2.Laplacian(gray, cv2.CV_64F) 41 | 42 | return laplacian.var() 43 | 44 | def get_stream(): 45 | if len(sys.argv) < 2: 46 | print("ERROR: No video stream provided") 47 | print(f"Usage: {sys.argv[0]} VIDEO_STREAM_URL") 48 | sys.exit(1) 49 | 50 | return sys.argv[1] -------------------------------------------------------------------------------- /motion.py: -------------------------------------------------------------------------------- 1 | import modules.cv2_stream as cv2_stream 2 | from playsound import playsound 3 | from openai import OpenAI 4 | import os 5 | 6 | import modules.helpers as helpers 7 | 8 | client = OpenAI() 9 | 10 | stream_url = helpers.get_stream() 11 | 12 | messages = [ 13 | { 14 | "role": "system", 15 | "content": """You are an AI assistant that reports what it sees in short sentences. You are connected to a video feed and will be shown a collage of frames from a video and you will tell what what is happening in the video as if you are watching. Don't let the user know that you are seeing a sequence of images. Pretend as if you are a human seeing what is happening live. You are talking directly to the user. Assume any activity is performed by the user, who you shall refer to as "You". Example responses: "I see you drew an elephant" or "I see a dog" or "I see you put a TV remote on the table". Keep your responses very concise. 
If nothing particularly interesting happened since the previous image sequence, answer with the JSON {"status": "NO_CHANGE"}""".strip(), 16 | } 17 | ] 18 | 19 | for collage in cv2_stream.detect_changes(stream_url): 20 | print("Motion detected!") 21 | collage.save("collage.jpg", format="JPEG") 22 | 23 | messages.append({ 24 | "role": "user", 25 | "content": [ 26 | { 27 | "type": "image_url", 28 | "image_url": f"data:image/jpeg;base64,{helpers.image_b64(collage)}" 29 | } 30 | ] 31 | }) 32 | 33 | response = client.chat.completions.create( 34 | messages=messages, 35 | model="gpt-4-vision-preview", 36 | max_tokens=1024 37 | ) 38 | 39 | response_message = response.choices[0].message 40 | 41 | if '{"status": "NO_CHANGE"}' in response_message.content: 42 | messages.pop() 43 | continue 44 | 45 | messages.append(response_message) 46 | 47 | audio = client.audio.speech.create( 48 | input=response_message.content, 49 | model="tts-1", 50 | voice="onyx", 51 | ) 52 | 53 | audio.stream_to_file("audio.mp3") 54 | print("GPT: " + response_message.content) 55 | playsound("audio.mp3") 56 | os.remove("audio.mp3") 57 | 58 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GPT-4V Gemini 2 | 3 | This is a crude demo project made to mimic the supposed [live video ingestion capabilities](https://www.youtube.com/watch?v=UIZAiXYceBI) of Google's multimodal Gemini LLM, but made with the GPT-4 Vision API. 4 | 5 | Demo: https://youtu.be/UxQb88gENeg 6 | 7 | ## Setup 8 | 9 | ```shell 10 | $ pip install -r requirements.txt 11 | $ export OPENAI_API_KEY=YOUR_OPENAI_API_KEY 12 | ``` 13 | 14 | ## Voice version (terminal) 15 | 16 | To run the voice commanded terminal version, run the `voice.py` script. 17 | 18 | ```shell 19 | $ python3 voice.py VIDEO_STREAM_URL 20 | ``` 21 | 22 | The assistant only reacts to voice commands. 23 | 24 | ## Motion version (terminal) 25 | 26 | To run the motion detecting version, run the `motion.py` script. 27 | 28 | ```shell 29 | $ python3 motion.py VIDEO_STREAM_URL 30 | ``` 31 | 32 | The assistants reacts every time motion is detected in the video. A tripod is recommended. 33 | 34 | ## Automatic version (terminal) 35 | 36 | To run the automatic version that detects both voice commands and motion in the video, run the `auto.py` script. 37 | 38 | ```shell 39 | $ python3 auto.py VIDEO_STREAM_URL 40 | ``` 41 | 42 | The assistants reacts every time motion is detected in the video or a voice command is given. A tripod is recommended. 43 | 44 | ## Automatic version with UI 45 | 46 | There is also a version with a "UI" made with CV2 (it sucks but kinda works). It both listens to voice commands and detects motion in the video and automatically sends both to the GPT4V API. 47 | 48 | ```shell 49 | $ python3 auto_with_ui.py VIDEO_STREAM_URL 50 | ``` 51 | 52 | ## How to get a video stream URL 53 | 54 | In my testing, I have used my phone camera as the video stream. For this, I used the [IP Webcam](https://play.google.com/store/apps/details?id=com.pas.webcam&pcampaignid=web_share) app on Play Store. I set the camera to 10 fps at 640x480 resolution. 55 | 56 | The VIDEO_STREAM_URL is passed directly into `cv2.VideoCapture()`, so I guess you should be able to pass in a video file too, or any kind of video stream. 57 | 58 | ## Configuration 59 | 60 | There is a `config.py` file where you can tweak some settings if you are having trouble with the motion detection or speech detection. 
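For instance, if motion detection fires too often or background noise keeps triggering the voice recorder, raising the thresholds makes both less sensitive. A sketch of such a setup (the numbers are only an illustration, not recommended values):

```python
# config.py -- a less sensitive setup (illustrative values)
config = {
    "motion_threshold": 8,           # higher = more changed pixels required before motion counts
    "big_motion_threshold": 40,      # higher = only larger movements count as "big" motion
    "still_frame_threshold": 20,     # still frames to wait before a motion event is considered over
    "automatic_motion_cutoff": 200,  # hard cap on how many frames one motion event may collect
    "speech_threshold": 5,           # higher = speech must be louder relative to ambient noise
}
```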
61 | 62 | # Known issues 63 | 64 | * GPT-4V API is often slow 65 | * Sometimes the assistant response is detected as a user message 66 | * The CV2 UI sucks and should be made with another way 67 | * The CV2 UI can only be closed by hittin Ctrl+C in the terminal 68 | -------------------------------------------------------------------------------- /voice.py: -------------------------------------------------------------------------------- 1 | import modules.cv2_stream as cv2_stream 2 | from multiprocessing import Process 3 | from playsound import playsound 4 | from openai import OpenAI 5 | import shutil 6 | import os 7 | 8 | import modules.recorder as recorder 9 | import modules.helpers as helpers 10 | 11 | client = OpenAI() 12 | 13 | stream_url = helpers.get_stream() 14 | 15 | messages = [ 16 | { 17 | "role": "system", 18 | "content": """You are an AI assistant that can see. The photos provided to you are the view from your eyes. Answer the user based on what you see. The user is holding the camera. If you see them pointing to something and asking what it is, tell them what it is. Don't say what you're looking at is an image, unless the image sent to you is of a physical image. Answer in short, concise answers.""", 19 | } 20 | ] 21 | 22 | def write_changes(): 23 | for _ in cv2_stream.stream_frames(stream_url, "frame.jpg"): 24 | pass 25 | 26 | video_process = Process(target=write_changes) 27 | video_process.start() 28 | 29 | while True: 30 | for message in recorder.live_speech(60): 31 | if helpers.filter_garbage(message): 32 | break 33 | 34 | print("You: " + message) 35 | 36 | shutil.copy("frame.jpg", "detect.jpg") 37 | 38 | try: 39 | messages.append({ 40 | "role": "user", 41 | "content": [ 42 | { 43 | "type": "image_url", 44 | "image_url": f"data:image/jpeg;base64,{helpers.image_b64('detect.jpg')}", 45 | }, 46 | { 47 | "type": "text", 48 | "text": "Message transcribed from recording (might contain mistakes): " + message, 49 | } 50 | ] 51 | }) 52 | 53 | response = client.chat.completions.create( 54 | messages=messages, 55 | model="gpt-4-vision-preview", 56 | max_tokens=1024 57 | ) 58 | 59 | response_message = response.choices[0].message 60 | response_text = response_message.content 61 | 62 | messages.append(response_message) 63 | except Exception as e: 64 | print(str(e)) 65 | response_text = "Sorry, I missed that" 66 | messages.append({ 67 | "role": "system", 68 | "content": "The user sent an invalid message" 69 | }) 70 | messages.append({ 71 | "role": "assistant", 72 | "content": response_text 73 | }) 74 | 75 | audio = client.audio.speech.create( 76 | input=response_text, 77 | model="tts-1", 78 | voice="onyx", 79 | ) 80 | 81 | audio.stream_to_file("audio.mp3") 82 | print("GPT: " + response_text) 83 | playsound("audio.mp3") 84 | os.remove("audio.mp3") 85 | 86 | break 87 | 88 | video_process.join() # i really wanna join, but I can't 89 | -------------------------------------------------------------------------------- /modules/recorder.py: -------------------------------------------------------------------------------- 1 | import audioop 2 | import whisper 3 | import pyaudio 4 | import wave 5 | import os 6 | 7 | from config import config 8 | 9 | whisper_model = whisper.load_model("base") 10 | ambient_detected = False 11 | speech_volume = 100 12 | 13 | def transcribe(audio_file): 14 | result = whisper_model.transcribe( 15 | audio_file, 16 | fp16=False, 17 | no_speech_threshold=0.1, 18 | initial_prompt="mm-hmm, cough, tshh, pfft, swoosh" 19 | ) 20 | 21 | return result["text"].strip() 22 | 23 | 
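# Note: the initial_prompt above nudges Whisper toward transcribing non-speech
# sounds as "mm-hmm", "cough", "tshh", "pfft" or "swoosh", which is exactly what
# helpers.filter_garbage() looks for when discarding junk transcripts.
#
# Standalone usage (a sketch, assuming an existing audio.wav recording):
#
#   from modules.recorder import transcribe
#   print(transcribe("audio.wav"))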
def live_speech(wait_time=10, transcribe_audio=True, processing=None, ui_queue=None, winwidth=0, winheight=0): 24 | global ambient_detected 25 | global speech_volume 26 | 27 | FORMAT = pyaudio.paInt16 28 | CHANNELS = 1 29 | RATE = 44100 30 | CHUNK = 1024 31 | 32 | audio = pyaudio.PyAudio() 33 | 34 | stream = audio.open( 35 | format=FORMAT, 36 | channels=CHANNELS, 37 | rate=RATE, 38 | input=True, 39 | frames_per_buffer=CHUNK 40 | ) 41 | 42 | frames = [] 43 | recording = False 44 | frames_recorded = 0 45 | 46 | while True: 47 | frames_recorded += 1 48 | data = stream.read(CHUNK) 49 | rms = audioop.rms(data, 2) 50 | 51 | if not ambient_detected: 52 | if frames_recorded < 40: 53 | if frames_recorded == 1: 54 | print("Detecting ambient noise...") 55 | if frames_recorded > 5: 56 | if speech_volume < rms: 57 | speech_volume = rms 58 | continue 59 | elif frames_recorded == 40: 60 | print("Listening...") 61 | speech_volume = speech_volume * config["speech_threshold"] 62 | ambient_detected = True 63 | 64 | if rms > speech_volume: 65 | if not recording: 66 | if processing: 67 | if processing.value == True: 68 | continue 69 | with processing.get_lock(): 70 | processing.value = True 71 | if ui_queue: 72 | # show "listening..." 73 | ui_queue.put({ 74 | "type": "draw_text", 75 | "args": { 76 | "text": "listening...", 77 | "position": (winwidth//2, int(winheight*0.9)), 78 | "color": (150, 150, 150) 79 | } 80 | }) 81 | print("Voice detected!") 82 | recording = True 83 | frames_recorded = 0 84 | elif recording and frames_recorded > wait_time: 85 | recording = False 86 | 87 | wf = wave.open("audio.wav", 'wb') 88 | wf.setnchannels(CHANNELS) 89 | wf.setsampwidth(audio.get_sample_size(FORMAT)) 90 | wf.setframerate(RATE) 91 | wf.writeframes(b''.join(frames)) 92 | wf.close() 93 | 94 | if transcribe_audio: 95 | result = transcribe("audio.wav") 96 | 97 | os.remove("audio.wav") 98 | 99 | yield result 100 | else: 101 | yield "audio.wav" 102 | 103 | if ui_queue: 104 | # hide "listening..." 
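# (CV2 offers no way to erase drawn text, so the label is "hidden" by redrawing
#  the same string in black over the black window background)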
105 | ui_queue.put({ 106 | "type": "draw_text", 107 | "args": { 108 | "text": "listening...", 109 | "position": (winwidth//2, int(winheight*0.9)), 110 | "color": (0, 0, 0) 111 | } 112 | }) 113 | 114 | frames = [] 115 | 116 | if recording: 117 | frames.append(data) 118 | 119 | # TODO: do these when breaking from generator 120 | stream.stop_stream() 121 | stream.close() 122 | audio.terminate() -------------------------------------------------------------------------------- /auto.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Process, Queue, Value 2 | import modules.cv2_stream as cv2_stream 3 | from playsound import playsound 4 | from openai import OpenAI 5 | from queue import Empty 6 | from PIL import Image 7 | import base64 8 | import shutil 9 | import time 10 | import os 11 | 12 | import modules.recorder as recorder 13 | import modules.helpers as helpers 14 | 15 | client = OpenAI() 16 | 17 | def motion_fn(queue: Queue, stream_url, processing: Value): 18 | for collage in cv2_stream.detect_changes(stream_url, processing=processing): 19 | print("Motion detected!") 20 | collage.save("collage.jpg", format="JPEG") 21 | 22 | queue.put({ 23 | "image": collage 24 | }) 25 | 26 | with processing.get_lock(): 27 | processing.value = False 28 | 29 | def voice_fn(queue: Queue, processing: Value): 30 | for audio_file in recorder.live_speech(60, transcribe_audio=False, processing=processing): 31 | if os.path.exists("collage.jpg"): 32 | image = Image.open("collage.jpg") 33 | os.remove("collage.jpg") 34 | else: 35 | shutil.copy("frame.jpg", "detect.jpg") 36 | try: 37 | image = Image.open("detect.jpg") 38 | except OSError: 39 | time.sleep(0.1) 40 | shutil.copy("frame.jpg", "detect.jpg") 41 | image = Image.open("detect.jpg") 42 | 43 | os.remove("detect.jpg") 44 | 45 | queue.put({ 46 | "audio": audio_file, 47 | "image": image 48 | }) 49 | 50 | with processing.get_lock(): 51 | processing.value = False 52 | 53 | def video_fn(): 54 | for _ in cv2_stream.stream_frames(stream_url, "frame.jpg"): 55 | pass 56 | 57 | def parse_message(message): 58 | image = helpers.image_b64(message["image"]) 59 | 60 | with open("capture.jpg", "wb") as f: 61 | captured_image = base64.b64decode(image) 62 | f.write(captured_image) 63 | 64 | content = [ 65 | { 66 | "type": "image_url", 67 | "image_url": f"data:image/jpeg;base64,{image}" 68 | } 69 | ] 70 | 71 | if "audio" in message: 72 | text = recorder.transcribe(message["audio"]) 73 | 74 | if helpers.filter_garbage(text): 75 | return [] 76 | 77 | print("You: " + text) 78 | 79 | content.append({ 80 | "type": "text", 81 | "text": "Message transcribed from recording (might contain mistakes): " + text, 82 | }) 83 | else: 84 | if os.path.exists("collage.jpg"): 85 | os.remove("collage.jpg") 86 | 87 | return content 88 | 89 | def empty_queue(queue): 90 | try: 91 | while True: 92 | queue.get_nowait() 93 | except Empty: 94 | pass 95 | 96 | stream_url = helpers.get_stream() 97 | 98 | if os.path.exists("collage.jpg"): 99 | os.remove("collage.jpg") 100 | 101 | if os.path.exists("detect.jpg"): 102 | os.remove("detect.jpg") 103 | 104 | if os.path.exists("frame.jpg"): 105 | os.remove("frame.jpg") 106 | 107 | queue = Queue() 108 | 109 | processing_audio = Value('b', False) 110 | processing_video = Value('b', False) 111 | 112 | print("Starting video process...") 113 | video_process = Process(target=video_fn) 114 | video_process.start() 115 | 116 | print("Starting motion process...") 117 | motion_process = Process(target=motion_fn, 
args=(queue, stream_url, processing_video)) 118 | motion_process.start() 119 | 120 | print("Starting voice process...") 121 | voice_process = Process(target=voice_fn, args=(queue, processing_audio), daemon=True) 122 | voice_process.start() 123 | 124 | messages = [ 125 | { 126 | "role": "system", 127 | "content": """You are an AI assistant that reports what it sees in short sentences. You are connected to a video feed and will be shown a collage of frames from a video and you will tell what what is happening in the video as if you are watching. Don't let the user know that you are seeing a sequence of images. Pretend as if you are a human seeing what is happening live. You are talking directly to the user. Assume any activity is performed by the user, who you shall refer to as "You". "I see you drew an elephant" or "I see a dog" or "I see you put a TV remote on the table". Keep your answers very concise. If nothing particularly interesting happened since the previous image sequence, answer with the JSON {"status": "NO_CHANGE"}""".strip(), 128 | } 129 | ] 130 | 131 | while True: 132 | message = queue.get() 133 | 134 | content = parse_message(message) 135 | 136 | time.sleep(0.5) 137 | 138 | if processing_audio.value == True: 139 | while processing_audio.value == True: 140 | time.sleep(0.1) 141 | 142 | if processing_video.value == True: 143 | while processing_video.value == True: 144 | time.sleep(0.1) 145 | 146 | try: 147 | message2 = queue.get(timeout=0.5) 148 | if message2: 149 | content += parse_message(message2) 150 | except Empty: 151 | pass 152 | 153 | messages.append({ 154 | "role": "user", 155 | "content": content 156 | }) 157 | 158 | response = client.chat.completions.create( 159 | messages=messages, 160 | model="gpt-4-vision-preview", 161 | max_tokens=1024 162 | ) 163 | 164 | response_message = response.choices[0].message 165 | response_text = response_message.content 166 | 167 | if '{"status": "NO_CHANGE"}' in response_text: 168 | messages.pop() 169 | continue 170 | 171 | messages.append(response_message) 172 | 173 | audio = client.audio.speech.create( 174 | input=response_text, 175 | model="tts-1", 176 | voice="onyx", 177 | ) 178 | 179 | audio.stream_to_file("audio.mp3") 180 | print("GPT: " + response_text) 181 | 182 | with processing_audio.get_lock(): 183 | processing_audio.value = True 184 | 185 | playsound("audio.mp3") 186 | os.remove("audio.mp3") 187 | 188 | time.sleep(0.2) 189 | 190 | empty_queue(queue) 191 | 192 | with processing_audio.get_lock(): 193 | processing_audio.value = False 194 | 195 | with processing_video.get_lock(): 196 | processing_video.value = False 197 | 198 | video_process.join() 199 | motion_process.join() 200 | voice_process.join() 201 | -------------------------------------------------------------------------------- /modules/cv2_stream.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import numpy as np 3 | import math 4 | import cv2 5 | 6 | import modules.helpers as helpers 7 | from config import config 8 | 9 | def make_collage(frames, border=35): 10 | frame_count = len(frames) 11 | 12 | rows = 1 13 | while frame_count / rows > 4: 14 | rows += 1 15 | per_row = math.ceil(frame_count / rows) 16 | 17 | try: 18 | frame = cv2.cvtColor(frames[0], cv2.COLOR_BGR2RGB) 19 | image1 = Image.fromarray(frame) 20 | collage = Image.new('RGB', (image1.width*per_row+border*(per_row-1), image1.height*rows+border*(per_row-1))) 21 | collage.paste(image1, (0, 0)) 22 | except OSError: 23 | print("Error saving 
collage...") 24 | return 25 | 26 | pos_x = image1.width + border 27 | pos_y = 0 28 | 29 | for i, frame in enumerate(frames[1:]): 30 | try: 31 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 32 | image = Image.fromarray(frame) 33 | collage.paste(image, (pos_x, pos_y)) 34 | except OSError: 35 | print("Error adding changed frame...") 36 | continue 37 | 38 | pos_x += image.width + border 39 | 40 | if (i+2) % per_row == 0: 41 | pos_y += image.height + border 42 | pos_x = 0 43 | 44 | return collage 45 | 46 | def detect_changes(stream_url, count=9, min_frames=5, max_frames=None, processing=None, frame_queue=None): 47 | if max_frames is None: 48 | max_frames = config["automatic_motion_cutoff"] 49 | 50 | # Create a VideoCapture object 51 | cap = cv2.VideoCapture(stream_url) 52 | 53 | # Check if the stream is opened successfully 54 | if not cap.isOpened(): 55 | print("Error: Unable to open video stream") 56 | exit() 57 | 58 | # Read the first frame 59 | ret, previous_frame = cap.read() 60 | if not ret: 61 | print("Error: Unable to read video stream") 62 | cap.release() 63 | exit() 64 | 65 | # Convert the first frame to grayscale 66 | previous_frame_gray = cv2.cvtColor(previous_frame, cv2.COLOR_BGR2GRAY) 67 | 68 | # Still frame counter 69 | still_frame_counter = 0 70 | 71 | # Frames 72 | frames = [] 73 | 74 | frame_counter = 0 75 | big_movement = 0 76 | while cap.isOpened(): 77 | frame_counter += 1 78 | # Capture frame-by-frame 79 | ret, current_frame = cap.read() 80 | if not ret: 81 | break 82 | 83 | if frame_queue: 84 | frame_queue.put(current_frame) 85 | 86 | # Convert current frame to grayscale 87 | gray_frame = cv2.cvtColor(current_frame, cv2.COLOR_BGR2GRAY) 88 | 89 | # Calculate the absolute difference 90 | frame_diff = cv2.absdiff(previous_frame_gray, gray_frame) 91 | 92 | # Threshold for significant change 93 | _, thresh = cv2.threshold(frame_diff, 30, 255, cv2.THRESH_BINARY) 94 | 95 | # Count the number of changed pixels 96 | change_count = np.sum(thresh != 0) 97 | 98 | # If significant change is detected, save the frame 99 | if change_count > current_frame.shape[1]*config["motion_threshold"]: # Threshold for change, adjust as needed 100 | if change_count > current_frame.shape[1]*config["big_motion_threshold"]: 101 | big_movement += 1 102 | 103 | if processing: 104 | with processing.get_lock(): 105 | processing.value = True 106 | 107 | frames.append(previous_frame) 108 | frames.append(current_frame) 109 | still_frame_counter = 0 110 | else: 111 | if still_frame_counter < 2: 112 | frames.append(current_frame) 113 | still_frame_counter += 1 114 | 115 | frame_count = len(frames) 116 | if still_frame_counter == config["still_frame_threshold"] or frame_count > max_frames: 117 | if frame_count > min_frames and big_movement >= int(frame_count/30): 118 | if frame_count > count: 119 | sharp_frames = {} 120 | frame_num = 0 121 | 122 | while frame_count > 50: 123 | frames = frames[0::2] 124 | frame_count = len(frames) 125 | 126 | for i, frame in enumerate(frames): 127 | if str(frame_num) not in sharp_frames: 128 | sharp_frames[str(frame_num)] = (0, None) 129 | 130 | sharpness = helpers.sharpness(frame) 131 | if sharp_frames[str(frame_num)][0] < sharpness: 132 | sharp_frames[str(frame_num)] = (sharpness, frame) 133 | 134 | if i % int(frame_count / count) == 0: 135 | frame_num += 1 136 | frames = [] 137 | for sharpness, frame in sharp_frames.values(): 138 | frames.append(frame) 139 | frame_count = len(frames) 140 | step = int(frame_count / count) 141 | step = 1 if step < 1 else step 142 | 
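# Walk backwards from the newest frame in steps of `step`, flip back to
# chronological order, and keep the last `count` samples for the collage: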
spread_out_frames = list(reversed(frames[-1::-step]))[-count:] # i no gud at math 143 | 144 | # Yield new motion 145 | yield make_collage(spread_out_frames) 146 | 147 | if processing: 148 | with processing.get_lock(): 149 | processing.value = False 150 | 151 | frames = [] 152 | big_movement = 0 153 | 154 | # Update the previous frame 155 | previous_frame_gray = gray_frame.copy() 156 | previous_frame = current_frame.copy() 157 | 158 | # Display the frame (optional) 159 | #cv2.imshow('Frame', current_frame) 160 | 161 | # Press Q on keyboard to exit the loop 162 | #if cv2.waitKey(1) & 0xFF == ord('q'): 163 | # break 164 | 165 | # Release the video capture object 166 | cap.release() 167 | 168 | # Close all frames 169 | cv2.destroyAllWindows() 170 | 171 | def stream_frames(stream_url, output_file=None): 172 | # Create a VideoCapture object 173 | cap = cv2.VideoCapture(stream_url) 174 | 175 | # Check if the stream is opened successfully 176 | if not cap.isOpened(): 177 | print("Error: Unable to open video stream") 178 | exit() 179 | 180 | frame_number = 0 181 | while cap.isOpened(): 182 | # Capture frame-by-frame 183 | ret, frame = cap.read() 184 | if not ret: 185 | break 186 | 187 | frame_number += 1 188 | 189 | if output_file: 190 | if frame_number % 10 == 0: 191 | # Save frame 192 | cv2.imwrite(output_file, frame) 193 | else: 194 | yield frame 195 | 196 | # Release the video capture object 197 | cap.release() 198 | 199 | # Close all frames 200 | cv2.destroyAllWindows() 201 | -------------------------------------------------------------------------------- /auto_with_ui.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Process, Queue, Value 2 | import modules.cv2_stream as cv2_stream 3 | from playsound import playsound 4 | from openai import OpenAI 5 | from queue import Empty 6 | from PIL import Image 7 | import numpy as np 8 | import textwrap 9 | import base64 10 | import time 11 | import sys 12 | import cv2 13 | import os 14 | 15 | import modules.recorder as recorder 16 | import modules.helpers as helpers 17 | 18 | client = OpenAI() 19 | 20 | def motion_fn(queue: Queue, stream_url, processing: Value, frame_queue: Queue): 21 | for collage in cv2_stream.detect_changes(stream_url, processing=processing, frame_queue=frame_queue): 22 | print("Motion detected!") 23 | collage.save("collage.jpg", format="JPEG") 24 | 25 | queue.put({ 26 | "image": collage 27 | }) 28 | 29 | with processing.get_lock(): 30 | processing.value = False 31 | 32 | def voice_fn(queue: Queue, processing: Value, ui_queue: Queue, winwidth, winheight): 33 | for audio_file in recorder.live_speech(60, transcribe_audio=False, processing=processing, ui_queue=ui_queue, winwidth=winwidth, winheight=winheight): 34 | if os.path.exists("collage.jpg"): 35 | image = Image.open("collage.jpg") 36 | os.remove("collage.jpg") 37 | else: 38 | frame_rgb = cv2.cvtColor(frame_queue.get(), cv2.COLOR_BGR2RGB) 39 | image = Image.fromarray(frame_rgb) 40 | #image.save("still.jpg", format="JPEG") 41 | 42 | queue.put({ 43 | "audio": audio_file, 44 | "image": image 45 | }) 46 | 47 | with processing.get_lock(): 48 | processing.value = False 49 | 50 | def parse_message(message): 51 | image = helpers.image_b64(message["image"]) 52 | 53 | with open("capture.jpg", "wb") as f: 54 | captured_image = base64.b64decode(image) 55 | f.write(captured_image) 56 | 57 | content = [ 58 | { 59 | "type": "image_url", 60 | "image_url": f"data:image/jpeg;base64,{image}" 61 | } 62 | ] 63 | 64 | if "audio" in 
message: 65 | text = recorder.transcribe(message["audio"]) 66 | 67 | if helpers.filter_garbage(text): 68 | return [] 69 | 70 | content.append({ 71 | "type": "text", 72 | "text": text, 73 | }) 74 | else: 75 | if os.path.exists("collage.jpg"): 76 | os.remove("collage.jpg") 77 | 78 | return content 79 | 80 | def empty_queue(queue): 81 | try: 82 | while True: 83 | queue.get_nowait() 84 | except Empty: 85 | pass 86 | 87 | def draw_text(window, text, position, color=(255, 255, 255), centered=True): 88 | font = cv2.FONT_HERSHEY_SIMPLEX 89 | font_scale = 0.8 90 | thickness = 2 91 | 92 | lines = textwrap.wrap(text, width=50) 93 | 94 | line_y = 0 95 | for line in lines: 96 | pos = position 97 | 98 | if centered: 99 | text_size = cv2.getTextSize(line, font, font_scale, thickness)[0] 100 | text_x = position[0] - text_size[0] // 2 101 | text_y = position[1] - text_size[1] // 2 102 | pos = (text_x, text_y) 103 | 104 | pos = (pos[0], pos[1]+line_y) 105 | line_y += int(text_size[1]*1.3) 106 | 107 | cv2.putText(window, line, pos, font, font_scale, color, thickness) 108 | 109 | cv2.imshow('GPT4GEMINI', window) 110 | if cv2.getWindowProperty('GPT4GEMINI', cv2.WND_PROP_VISIBLE) < 1: 111 | sys.exit() 112 | cv2.waitKey(1) 113 | 114 | def draw_window(winwidth, winheight, queue: Queue, frame_queue: Queue): 115 | cv2.namedWindow('GPT4GEMINI', cv2.WINDOW_NORMAL) 116 | cv2.resizeWindow('GPT4GEMINI', winwidth, winheight) 117 | 118 | window = np.zeros((winheight, winwidth, 3), dtype=np.uint8) 119 | cv2.imshow('GPT4GEMINI', window) 120 | if cv2.getWindowProperty('GPT4GEMINI', cv2.WND_PROP_VISIBLE) < 1: 121 | sys.exit() 122 | 123 | while True: 124 | frame = frame_queue.get() 125 | try: 126 | event = queue.get_nowait() 127 | except Empty: 128 | frame = cv2.resize(frame, (640, 400)) 129 | 130 | x = 50 131 | y = int(winheight / 2 - frame.shape[0] / 2) 132 | 133 | window[y:frame.shape[0]+y, x:frame.shape[1]+x] = frame 134 | cv2.imshow('GPT4GEMINI', window) 135 | cv2.waitKey(1) 136 | continue 137 | 138 | if event["type"] == "draw_text": 139 | draw_text(window, **event["args"]) 140 | 141 | if event["type"] == "clear": 142 | window = np.zeros((winheight, winwidth, 3), dtype=np.uint8) 143 | cv2.imshow('GPT4GEMINI', window) 144 | 145 | cv2.waitKey(1) 146 | 147 | cv2.destroyAllWindows() 148 | 149 | stream_url = helpers.get_stream() 150 | 151 | if os.path.exists("collage.jpg"): 152 | os.remove("collage.jpg") 153 | 154 | if os.path.exists("detect.jpg"): 155 | os.remove("detect.jpg") 156 | 157 | #if os.path.exists("frame.jpg"): 158 | #os.remove("frame.jpg") 159 | 160 | winwidth = int(1920*0.8) 161 | winheight = int(1080*0.8) 162 | 163 | user_text_color = (255, 255, 0) 164 | 165 | queue = Queue() 166 | ui_queue = Queue() 167 | frame_queue = Queue() 168 | 169 | processing_audio = Value('b', False) 170 | processing_video = Value('b', False) 171 | 172 | print("Starting motion process...") 173 | motion_process = Process(target=motion_fn, args=(queue, stream_url, processing_video, frame_queue)) 174 | motion_process.start() 175 | 176 | print("Starting voice process...") 177 | voice_process = Process(target=voice_fn, args=(queue, processing_audio, ui_queue, winwidth, winheight)) 178 | voice_process.start() 179 | 180 | print("Starting UI process...") 181 | ui_process = Process(target=draw_window, args=(winwidth, winheight, ui_queue, frame_queue)) 182 | ui_process.start() 183 | 184 | messages = [ 185 | { 186 | "role": "system", 187 | "content": """You are an AI assistant that reports what it sees in short sentences. 
You are connected to a video feed and will be shown a collage of frames from a video and you will tell what what is happening in the video as if you are seeing it live. Don't let the user know that you are seeing a sequence of images or frames. Don't say anything about a series of images or frames. Answer as if you are seeing the sequence of images in real life. Don't say they are a sequence of images. Just say what is happening in them, as if it is happening in front of your eyes. If the user asks you a direct question, answer it based on what you see. You are talking directly to the user. Assume any activity is performed by the user, who you shall refer to as "You". Example responses: "I see you drew an elephant" or "I see a cat" or "I see you put a coin on the table". If you notice something out of the ordinary, point it out. Keep your answers very concise. When playing games, tell the user if they won / are correct. If nothing particularly interesting happened since the previous image sequence, answer with the JSON {"status": "NO_CHANGE"}""".strip(), 188 | } 189 | ] 190 | 191 | while True: 192 | message = queue.get() 193 | 194 | content = parse_message(message) 195 | 196 | have_text = False 197 | 198 | for msg in content: 199 | if "text" in msg: 200 | have_text = True 201 | ui_queue.put({ 202 | "type": "clear", 203 | }) 204 | ui_queue.put({ 205 | "type": "draw_text", 206 | "args": { 207 | "text": msg["text"], 208 | "position": (winwidth//2, 80), 209 | "color": user_text_color 210 | } 211 | }) 212 | print("You: " + msg["text"]) 213 | 214 | time.sleep(0.5) 215 | 216 | if processing_audio.value == True: 217 | while processing_audio.value == True: 218 | time.sleep(0.1) 219 | 220 | if processing_video.value == True: 221 | while processing_video.value == True: 222 | time.sleep(0.1) 223 | 224 | with processing_audio.get_lock(): 225 | processing_audio.value = True 226 | 227 | try: 228 | message2 = queue.get(timeout=0.5) 229 | if message2: 230 | msg2 = parse_message(message2) 231 | for msg in msg2: 232 | if "text" in msg: 233 | have_text = True 234 | ui_queue.put({ 235 | "type": "clear", 236 | }) 237 | ui_queue.put({ 238 | "type": "draw_text", 239 | "args": { 240 | "text": msg["text"], 241 | "position": (winwidth//2, 80), 242 | "color": user_text_color 243 | } 244 | }) 245 | print("You: " + msg["text"]) 246 | content += msg2 247 | except Empty: 248 | pass 249 | 250 | if len(content) == 0: 251 | continue 252 | 253 | messages.append({ 254 | "role": "user", 255 | "content": content 256 | }) 257 | 258 | if True: 259 | print("Sending GPT4V request...") 260 | 261 | # show "calling gpt4v..." 262 | ui_queue.put({ 263 | "type": "draw_text", 264 | "args": { 265 | "text": "calling gpt4v...", 266 | "position": (winwidth//2, int(winheight*0.9)), 267 | "color": (255, 0, 255) 268 | } 269 | }) 270 | 271 | response = client.chat.completions.create( 272 | messages=messages, 273 | model="gpt-4-vision-preview", 274 | max_tokens=1024 275 | ) 276 | 277 | response_message = response.choices[0].message 278 | response_text = response_message.content 279 | 280 | if '{"status": "NO_CHANGE"}' in response_text: 281 | messages.pop() 282 | continue 283 | else: 284 | response_text = "This is a test" 285 | response_message = { 286 | "role": "assistant", 287 | "content": response_text, 288 | } 289 | 290 | messages.append(response_message) 291 | 292 | print("Generating audio response...") 293 | 294 | # hide "calling gpt4v..." 
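# (same trick as in recorder.live_speech: redrawing the label in black hides it)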
295 | ui_queue.put({ 296 | "type": "draw_text", 297 | "args": { 298 | "text": "calling gpt4v...", 299 | "position": (winwidth//2, int(winheight*0.9)), 300 | "color": (0, 0, 0) 301 | } 302 | }) 303 | 304 | # show "generating audio..." 305 | ui_queue.put({ 306 | "type": "draw_text", 307 | "args": { 308 | "text": "generating audio...", 309 | "position": (winwidth//2, int(winheight*0.9)), 310 | "color": (0, 255, 255) 311 | } 312 | }) 313 | 314 | audio = client.audio.speech.create( 315 | input=response_text, 316 | model="tts-1", 317 | voice="onyx", 318 | ) 319 | 320 | audio.stream_to_file("audio.mp3") 321 | 322 | # hide "generating audio..." 323 | ui_queue.put({ 324 | "type": "draw_text", 325 | "args": { 326 | "text": "generating audio...", 327 | "position": (winwidth//2, int(winheight*0.9)), 328 | "color": (0, 0, 0) 329 | } 330 | }) 331 | 332 | if not have_text: 333 | ui_queue.put({ 334 | "type": "clear", 335 | }) 336 | 337 | ui_queue.put({ 338 | "type": "draw_text", 339 | "args": { 340 | "text": response_text, 341 | "position": (int(winwidth*0.71), int(winheight*0.5)) 342 | } 343 | }) 344 | print("GPT: " + response_text) 345 | 346 | playsound("audio.mp3") 347 | os.remove("audio.mp3") 348 | 349 | time.sleep(1) 350 | 351 | with processing_audio.get_lock(): 352 | processing_audio.value = False 353 | 354 | with processing_video.get_lock(): 355 | processing_video.value = False 356 | 357 | empty_queue(queue) 358 | 359 | video_process.join() 360 | motion_process.join() 361 | voice_process.join() 362 | ui_process.join() 363 | --------------------------------------------------------------------------------
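A convenient way to tune the motion settings in config.py without calling the OpenAI API is to run the change detector on its own and inspect the collages it would send. The script below is not part of the repo, just a minimal sketch built on modules/cv2_stream.py and modules/helpers.py (run it from the repo root, e.g. `python3 tune_motion.py VIDEO_STREAM_URL`):

# tune_motion.py (hypothetical helper, not included in the repo)
import modules.cv2_stream as cv2_stream
import modules.helpers as helpers

stream_url = helpers.get_stream()  # reads the stream URL from argv, prints usage and exits otherwise

for i, collage in enumerate(cv2_stream.detect_changes(stream_url)):
    if collage is None:  # make_collage() returns None when building the image fails
        continue
    path = f"collage_{i}.jpg"
    collage.save(path, format="JPEG")
    print(f"Motion event {i}: saved {path}")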