├── .gitignore
├── README.md
├── init.py
├── recognize.py
├── recorder.py
├── requirements.txt
├── sounds
│   ├── detected.mp3
│   └── processing.mp3
└── voices.py

/.gitignore:
--------------------------------------------------------------------------------
wakeup_words.json
__pycache__/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Ok, GPT!

This is an ongoing project where I'm building my own Google Home / Alexa style device that can be interacted with via voice commands.

Currently it can detect a wakeup keyphrase, such as "Ok, GPT!", and then listen for a voice command. Speech recognition is done locally with OpenAI Whisper. The command is then sent to the ChatGPT API and the response is spoken aloud via the OpenAI Text-to-Speech API.

You need to initialize it with the `init.py` script by saying the keyphrase you want to wake the device with 10 times. After that you can run `recognize.py`, which will recognize when you say the keyphrase and then listen for the command.

## Quick Start

```shell
$ export OPENAI_API_KEY=YOUR_API_KEY
$ pip install -r requirements.txt
$ python3 init.py
$ python3 recognize.py
```

## Videos

The building of this project is documented on my YouTube channel.

- Video 1: https://www.youtube.com/watch?v=_vLKWNv4d5E
- Video 2: https://www.youtube.com/watch?v=xQdLiyCxyWQ
--------------------------------------------------------------------------------
/init.py:
--------------------------------------------------------------------------------
import json

from recorder import live_speech

wakeup_words = []

# Record ten samples of the keyphrase; duplicates are removed below
for i in range(10):
    print("Please say the wakeup keyphrase")
    for phrase in live_speech():
        print(f"Heard '{phrase}'\n")
        wakeup_words.append(phrase)
        break

with open("wakeup_words.json", "w") as f:
    json.dump(list(set(wakeup_words)), f)

print("Recognition finished")
--------------------------------------------------------------------------------
/recognize.py:
--------------------------------------------------------------------------------
from openai import OpenAI
from pathlib import Path
import playsound
import json
import sys
import os
import re

from recorder import live_speech

chatgpt = OpenAI()
messages = [
    {
        "role": "system",
        "content": "You are a voice controlled assistant. Answer the user's prompts as best you can. Answer in 20 words or less. If the question requires a longer answer, ask the user first if they would like to know more. After confirmation, you can provide a full answer."
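        # The 20-word cap keeps spoken replies short and text-to-speech
        # fast; the model asks before giving a longer answer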
    },
]

def detect_wakeup(command: str, wakeup_words: list[str]) -> bool:
    # Compare case-insensitively, ignoring punctuation Whisper may add
    command = re.sub(r"[,\.!?]", "", command.lower())

    for word in wakeup_words:
        word = re.sub(r"[,\.!?]", "", word.lower())
        if word in command:
            return True

    return False

if not os.path.exists("wakeup_words.json"):
    print("You must run init.py first!")
    sys.exit(1)

with open("wakeup_words.json", "r") as f:
    wakeup_words = json.load(f)

while True:
    # Block until the wakeup keyphrase is heard
    for message in live_speech():
        if detect_wakeup(message, wakeup_words):
            print(f"Detected: {message}")
            playsound.playsound(str(Path(__file__).parent / "sounds" / "detected.mp3"))
            break

    # Take one command, answer it with ChatGPT and speak the reply
    for message in live_speech(50):
        playsound.playsound(str(Path(__file__).parent / "sounds" / "processing.mp3"))
        messages.append(
            {
                "role": "user",
                "content": message
            }
        )

        response = chatgpt.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=messages
        )

        response_text = response.choices[0].message.content
        print(f"ChatGPT: {response_text}")

        messages.append(
            {
                "role": "assistant",
                "content": response_text
            }
        )

        voice = chatgpt.audio.speech.create(
            input=response_text,
            model="tts-1",
            voice="alloy",
        )

        voice.stream_to_file("audio.mp3")
        playsound.playsound("audio.mp3")
        os.remove("audio.mp3")
        break
--------------------------------------------------------------------------------
/recorder.py:
--------------------------------------------------------------------------------
import audioop
import whisper
import pyaudio
import wave
import os

whisper_model = whisper.load_model("base")
ambient_detected = False
speech_volume = 100

def live_speech(wait_time=10):
    """Yield Whisper transcripts of utterances picked up from the microphone.

    wait_time is the number of silent chunks to allow before an
    utterance is considered finished.
    """
    global ambient_detected
    global speech_volume

    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 44100
    CHUNK = 1024

    audio = pyaudio.PyAudio()

    stream = audio.open(
        format=FORMAT,
        channels=CHANNELS,
        rate=RATE,
        input=True,
        frames_per_buffer=CHUNK
    )

    frames = []
    recording = False
    frames_recorded = 0

    try:
        while True:
            frames_recorded += 1
            data = stream.read(CHUNK)
            rms = audioop.rms(data, 2)

            if not ambient_detected:
                # Calibrate on the first 40 chunks: remember the loudest
                # ambient level, then treat 3x that as the speech threshold
                if frames_recorded < 40:
                    if frames_recorded == 1:
                        print("Detecting ambient noise...")
                    if frames_recorded > 5:
                        if speech_volume < rms:
                            speech_volume = rms
                    continue
                elif frames_recorded == 40:
                    print("Listening...")
                    speech_volume = speech_volume * 3
                    ambient_detected = True

            if rms > speech_volume:
                recording = True
                frames_recorded = 0
            elif recording and frames_recorded > wait_time:
                # Silence lasted wait_time chunks: transcribe the utterance
                recording = False

                wf = wave.open("audio.wav", 'wb')
                wf.setnchannels(CHANNELS)
                wf.setsampwidth(audio.get_sample_size(FORMAT))
                wf.setframerate(RATE)
                wf.writeframes(b''.join(frames))
                wf.close()

                result = whisper_model.transcribe(
                    "audio.wav",
                    fp16=False
                )

                os.remove("audio.wav")

                yield result["text"].strip()

                frames = []

            if recording:
                frames.append(data)
    finally:
        # Runs when the generator is closed (e.g. the caller breaks out
        # of its for loop and the generator is garbage-collected)
        stream.stop_stream()
        stream.close()
        audio.terminate()
--------------------------------------------------------------------------------
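For clarity: `live_speech` is an infinite generator. It calibrates against ambient noise once per process, then blocks until it hears something above the threshold and yields one transcript per utterance. A minimal consumer sketch (illustrative only, not a file in the repository):

```python
# Illustrative sketch, not part of the repository.
from recorder import live_speech

# Print transcripts until the speaker says "stop".
for text in live_speech():
    print(f"You said: {text}")
    if "stop" in text.lower():
        break  # leaving the loop triggers the generator's cleanup
```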
/requirements.txt:
--------------------------------------------------------------------------------
git+https://github.com/openai/whisper.git
openai
playsound
pyaudio
--------------------------------------------------------------------------------
/sounds/detected.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unconv/ok-gpt/a8f4fd01cb8abcd8342946988d1ba568f3d15291/sounds/detected.mp3
--------------------------------------------------------------------------------
/sounds/processing.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unconv/ok-gpt/a8f4fd01cb8abcd8342946988d1ba568f3d15291/sounds/processing.mp3
--------------------------------------------------------------------------------
/voices.py:
--------------------------------------------------------------------------------
from openai import OpenAI

model = OpenAI()

# Generate the two prompt sounds played by recognize.py
voice = model.audio.speech.create(
    input="Yes?",
    model="tts-1",
    voice="alloy",
)

voice.stream_to_file("sounds/detected.mp3")

voice = model.audio.speech.create(
    input="Just a moment",
    model="tts-1",
    voice="alloy",
)

voice.stream_to_file("sounds/processing.mp3")
--------------------------------------------------------------------------------
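`voices.py` only needs to be run when you want to regenerate the prompt sounds with a different voice or wording; the repository already ships both MP3s. Note that on recent releases of the `openai` Python package, calling `stream_to_file` on the plain response is deprecated in favor of the streaming variant. A minimal sketch, assuming a current `openai` release:

```python
# Sketch assuming a recent openai package; the repo code uses the
# older (still working, but deprecated) response.stream_to_file().
from openai import OpenAI

client = OpenAI()

with client.audio.speech.with_streaming_response.create(
    input="Yes?",
    model="tts-1",
    voice="alloy",
) as response:
    response.stream_to_file("sounds/detected.mp3")
```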