├── .gitignore
├── README.md
├── init.py
├── recognize.py
├── recorder.py
├── requirements.txt
├── sounds
│   ├── detected.mp3
│   └── processing.mp3
└── voices.py

/.gitignore:
--------------------------------------------------------------------------------
wakeup_words.json
__pycache__/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Ok, GPT!

This is an ongoing project where I'm building my own Google Home / Alexa style device that can be interacted with via voice commands.

Currently it can detect a wakeup keyphrase, such as "Ok, GPT!", and then listen for a voice command. Speech recognition is done locally with OpenAI Whisper. The command is then sent to the ChatGPT API and the response is spoken aloud via the OpenAI Text-to-Speech API.

You need to initialize it with the `init.py` script by saying the keyphrase you want to wake the device with 10 times. After that you can run `recognize.py`, which will recognize when you say the keyphrase and then listen for the command.

## Quick Start

```shell
$ export OPENAI_API_KEY=YOUR_API_KEY
$ pip install -r requirements.txt
$ python3 init.py
$ python3 recognize.py
```

## Videos

The building of this project is documented on my YouTube channel.

- Video 1: https://www.youtube.com/watch?v=_vLKWNv4d5E
- Video 2: https://www.youtube.com/watch?v=xQdLiyCxyWQ
--------------------------------------------------------------------------------
/init.py:
--------------------------------------------------------------------------------
import json

from recorder import live_speech

wakeup_words = []

# Record ten samples of the keyphrase; duplicates are removed below
for i in range(10):
    print("Please say the wakeup keyphrase")
    for phrase in live_speech():
        print(f"Heard '{phrase}'\n")
        wakeup_words.append(phrase)
        break

with open("wakeup_words.json", "w") as f:
    json.dump(list(set(wakeup_words)), f)

print("Recognition finished")
--------------------------------------------------------------------------------
/recognize.py:
--------------------------------------------------------------------------------
from openai import OpenAI
from pathlib import Path
import playsound
import json
import sys
import os
import re

from recorder import live_speech

chatgpt = OpenAI()
messages = [
    {
        "role": "system",
        "content": "You are a voice controlled assistant. Answer the user's prompts as best you can. Answer in 20 words or less. If the question requires a longer answer, ask the user first if they would like to know more. After confirmation, you can provide a full answer."
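        # The 20-word cap keeps spoken replies short and text-to-speech
        # fast; the model asks before giving a longer answer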
    },
]

def detect_wakeup(command: str, wakeup_words: list[str]) -> bool:
    # Compare case-insensitively, ignoring punctuation Whisper may add
    command = re.sub(r"[,\.!?]", "", command.lower())

    for word in wakeup_words:
        word = re.sub(r"[,\.!?]", "", word.lower())
        if word in command:
            return True

    return False

if not os.path.exists("wakeup_words.json"):
    print("You must run init.py first!")
    sys.exit(1)

with open("wakeup_words.json", "r") as f:
    wakeup_words = json.load(f)

while True:
    # Block until the wakeup keyphrase is heard
    for message in live_speech():
        if detect_wakeup(message, wakeup_words):
            print(f"Detected: {message}")
            playsound.playsound(str(Path(__file__).parent / "sounds" / "detected.mp3"))
            break

    # Take one command, answer it with ChatGPT and speak the reply
    for message in live_speech(50):
        playsound.playsound(str(Path(__file__).parent / "sounds" / "processing.mp3"))
        messages.append(
            {
                "role": "user",
                "content": message
            }
        )

        response = chatgpt.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=messages
        )

        response_text = response.choices[0].message.content
        print(f"ChatGPT: {response_text}")

        messages.append(
            {
                "role": "assistant",
                "content": response_text
            }
        )

        voice = chatgpt.audio.speech.create(
            input=response_text,
            model="tts-1",
            voice="alloy",
        )

        voice.stream_to_file("audio.mp3")
        playsound.playsound("audio.mp3")
        os.remove("audio.mp3")
        break
--------------------------------------------------------------------------------
/recorder.py:
--------------------------------------------------------------------------------
import audioop
import whisper
import pyaudio
import wave
import os

whisper_model = whisper.load_model("base")
ambient_detected = False
speech_volume = 100

def live_speech(wait_time=10):
    """Yield Whisper transcripts of utterances picked up from the microphone.

    wait_time is the number of silent chunks to allow before an
    utterance is considered finished.
    """
    global ambient_detected
    global speech_volume

    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 44100
    CHUNK = 1024

    audio = pyaudio.PyAudio()

    stream = audio.open(
        format=FORMAT,
        channels=CHANNELS,
        rate=RATE,
        input=True,
        frames_per_buffer=CHUNK
    )

    frames = []
    recording = False
    frames_recorded = 0

    try:
        while True:
            frames_recorded += 1
            data = stream.read(CHUNK)
            rms = audioop.rms(data, 2)

            if not ambient_detected:
                # Calibrate on the first 40 chunks: remember the loudest
                # ambient level, then treat 3x that as the speech threshold
                if frames_recorded < 40:
                    if frames_recorded == 1:
                        print("Detecting ambient noise...")
                    if frames_recorded > 5:
                        if speech_volume < rms:
                            speech_volume = rms
                    continue
                elif frames_recorded == 40:
                    print("Listening...")
                    speech_volume = speech_volume * 3
                    ambient_detected = True

            if rms > speech_volume:
                recording = True
                frames_recorded = 0
            elif recording and frames_recorded > wait_time:
                # Silence lasted wait_time chunks: transcribe the utterance
                recording = False

                wf = wave.open("audio.wav", 'wb')
                wf.setnchannels(CHANNELS)
                wf.setsampwidth(audio.get_sample_size(FORMAT))
                wf.setframerate(RATE)
                wf.writeframes(b''.join(frames))
                wf.close()

                result = whisper_model.transcribe(
                    "audio.wav",
                    fp16=False
                )

                os.remove("audio.wav")

                yield result["text"].strip()

                frames = []

            if recording:
                frames.append(data)
    finally:
        # Runs when the generator is closed (e.g. the caller breaks out
        # of its for loop and the generator is garbage-collected)
        stream.stop_stream()
        stream.close()
        audio.terminate()
--------------------------------------------------------------------------------
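For clarity: `live_speech` is an infinite generator. It calibrates against ambient noise once per process, then blocks until it hears something above the threshold and yields one transcript per utterance. A minimal consumer sketch (illustrative only, not a file in the repository):

```python
# Illustrative sketch, not part of the repository.
from recorder import live_speech

# Print transcripts until the speaker says "stop".
for text in live_speech():
    print(f"You said: {text}")
    if "stop" in text.lower():
        break  # leaving the loop triggers the generator's cleanup
```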
/requirements.txt:
--------------------------------------------------------------------------------
git+https://github.com/openai/whisper.git
openai
playsound
pyaudio
--------------------------------------------------------------------------------
/sounds/detected.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unconv/ok-gpt/a8f4fd01cb8abcd8342946988d1ba568f3d15291/sounds/detected.mp3
--------------------------------------------------------------------------------
/sounds/processing.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unconv/ok-gpt/a8f4fd01cb8abcd8342946988d1ba568f3d15291/sounds/processing.mp3
--------------------------------------------------------------------------------
/voices.py:
--------------------------------------------------------------------------------
from openai import OpenAI

model = OpenAI()

# Generate the two prompt sounds played by recognize.py
voice = model.audio.speech.create(
    input="Yes?",
    model="tts-1",
    voice="alloy",
)

voice.stream_to_file("sounds/detected.mp3")

voice = model.audio.speech.create(
    input="Just a moment",
    model="tts-1",
    voice="alloy",
)

voice.stream_to_file("sounds/processing.mp3")
--------------------------------------------------------------------------------
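`voices.py` only needs to be run when you want to regenerate the prompt sounds with a different voice or wording; the repository already ships both MP3s. Note that on recent releases of the `openai` Python package, calling `stream_to_file` on the plain response is deprecated in favor of the streaming variant. A minimal sketch, assuming a current `openai` release:

```python
# Sketch assuming a recent openai package; the repo code uses the
# older (still working, but deprecated) response.stream_to_file().
from openai import OpenAI

client = OpenAI()

with client.audio.speech.with_streaming_response.create(
    input="Yes?",
    model="tts-1",
    voice="alloy",
) as response:
    response.stream_to_file("sounds/detected.mp3")
```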