├── demo.png
├── README.md
├── recording_helper.py
├── main.py
├── tf_helper.py
└── turtle_helper.py

/demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rubentak/realtime-voice-command-recognition/main/demo.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Realtime Voice Command Recognition
2 | 
3 | Build your own real-time voice command recognition model with TensorFlow.
4 | 
5 | **Watch the video tutorial:**
6 | 
7 | [![Alt text](https://img.youtube.com/vi/m-JzldXm9bQ/hqdefault.jpg)](https://youtu.be/m-JzldXm9bQ)
8 | 
9 | ## Instructions
10 | 
11 | - Run the Colab from [this TensorFlow tutorial](https://www.tensorflow.org/tutorials/audio/simple_audio)
12 | - Follow the video to zip and download the trained model
13 | - Make sure to use the correct `commands` order in `main.py` (same as when running the Colab)
14 | - Install pyaudio and tensorflow
15 | - Extract the zip to a folder named `saved_model`
16 | - Run `python main.py`
17 | 
18 | ## Screenshot
19 | 
20 | ![Demo](demo.png)
21 | 
22 | 
--------------------------------------------------------------------------------
/recording_helper.py:
--------------------------------------------------------------------------------
1 | import pyaudio
2 | import numpy as np
3 | 
4 | FRAMES_PER_BUFFER = 3200
5 | FORMAT = pyaudio.paInt16
6 | CHANNELS = 1
7 | RATE = 16000
8 | p = pyaudio.PyAudio()
9 | 
10 | def record_audio():
11 |     stream = p.open(
12 |         format=FORMAT,
13 |         channels=CHANNELS,
14 |         rate=RATE,
15 |         input=True,
16 |         frames_per_buffer=FRAMES_PER_BUFFER
17 |     )
18 | 
19 |     #print("start recording...")
20 | 
21 |     frames = []
22 |     seconds = 1
23 |     for i in range(0, int(RATE / FRAMES_PER_BUFFER * seconds)):
24 |         data = stream.read(FRAMES_PER_BUFFER)
25 |         frames.append(data)
26 | 
27 |     # print("recording stopped")
28 | 
29 | 
    stream.stop_stream()
30 |     stream.close()
31 | 
32 |     return np.frombuffer(b''.join(frames), dtype=np.int16)
33 | 
34 | 
35 | def terminate():
36 |     p.terminate()
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | from tensorflow.keras import models
4 | 
5 | from recording_helper import record_audio, terminate
6 | from tf_helper import preprocess_audiobuffer
7 | 
8 | # !! Modify this in the correct order
9 | commands = ['left', 'down', 'stop', 'up', 'right', 'no', 'go', 'yes']
10 | 
11 | loaded_model = models.load_model("saved_model")
12 | 
13 | def predict_mic():
14 |     audio = record_audio()
15 |     spec = preprocess_audiobuffer(audio)
16 |     prediction = loaded_model(spec)
17 |     label_pred = np.argmax(prediction, axis=1)
18 |     command = commands[label_pred[0]]
19 |     print("Predicted label:", command)
20 |     return command
21 | 
22 | if __name__ == "__main__":
23 |     from turtle_helper import move_turtle
24 |     while True:
25 |         command = predict_mic()
26 |         move_turtle(command)
27 |         if command == "stop":
28 |             terminate()
29 |             break
--------------------------------------------------------------------------------
/tf_helper.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | 
4 | 
5 | # Set the seed value for experiment reproducibility.
6 | seed = 42
7 | tf.random.set_seed(seed)
8 | np.random.seed(seed)
9 | 
10 | def get_spectrogram(waveform):
11 |     # Zero-padding for an audio waveform with less than 16,000 samples.
12 |     input_len = 16000
13 |     waveform = waveform[:input_len]
14 |     zero_padding = tf.zeros(
15 |         [16000] - tf.shape(waveform),
16 |         dtype=tf.float32)
17 |     # Cast the waveform tensors' dtype to float32.
18 |     waveform = tf.cast(waveform, dtype=tf.float32)
19 |     # Concatenate the waveform with `zero_padding`, which ensures all audio
20 |     # clips are of the same length.
21 |     equal_length = tf.concat([waveform, zero_padding], 0)
22 |     # Convert the waveform to a spectrogram via a STFT.
23 |     spectrogram = tf.signal.stft(
24 |         equal_length, frame_length=255, frame_step=128)
25 |     # Obtain the magnitude of the STFT.
26 |     spectrogram = tf.abs(spectrogram)
27 |     # Add a `channels` dimension, so that the spectrogram can be used
28 |     # as image-like input data with convolution layers (which expect
29 |     # shape (`batch_size`, `height`, `width`, `channels`).
30 |     spectrogram = spectrogram[..., tf.newaxis]
31 |     return spectrogram
32 | 
33 | 
34 | def preprocess_audiobuffer(waveform):
35 |     """
36 |     waveform: ndarray of size (16000, )
37 | 
38 |     output: Spectrogram Tensor of size: (1, `height`, `width`, `channels`)
39 |     """
40 |     # normalize from [-32768, 32767] to [-1, 1]
41 |     waveform = waveform / 32768
42 | 
43 |     waveform = tf.convert_to_tensor(waveform, dtype=tf.float32)
44 | 
45 |     spectogram = get_spectrogram(waveform)
46 | 
47 |     # add one dimension
48 |     spectogram = tf.expand_dims(spectogram, 0)
49 | 
50 |     return spectogram
--------------------------------------------------------------------------------
/turtle_helper.py:
--------------------------------------------------------------------------------
1 | import turtle
2 | 
3 | s = turtle.getscreen()
4 | 
5 | t = turtle.Turtle() # starts at right:
6 | 
7 | size = t.turtlesize()
8 | increase = (2 * num for num in size)
9 | t.turtlesize(*increase)
10 | 
11 | t.pensize(5)
12 | t.shapesize()
13 | t.pencolor("blue")
14 | 
15 | def go_right():
16 |     # target = 0
17 |     current = t.heading()
18 |     if current == 0:
19 |         pass
20 |     elif current == 90:
21 |         t.right(90)
22 |     elif current == 180:
23 |         t.right(180)
24 |     elif current == 270:
25 |         t.left(90)
26 |     else:
27 |         raise ValueError('not a right angle!')
28 | 
29 | def go_up():
30 |     # target = 90
31 | 
    current = t.heading()
32 |     if current == 0:
33 |         t.left(90)
34 |     elif current == 90:
35 |         pass
36 |     elif current == 180:
37 |         t.right(90)
38 |     elif current == 270:
39 |         t.left(180)
40 |     else:
41 |         raise ValueError('not a right angle!')
42 | 
43 | def go_left():
44 |     # target = 180
45 |     current = t.heading()
46 |     if current == 0:
47 |         t.left(180)
48 |     elif current == 90:
49 |         t.left(90)
50 |     elif current == 180:
51 |         pass
52 |     elif current == 270:
53 |         t.right(90)
54 |     else:
55 |         raise ValueError('not a right angle!')
56 | 
57 | def go_down():
58 |     # target = 270
59 |     current = t.heading()
60 |     if current == 0:
61 |         t.right(90)
62 |     elif current == 90:
63 |         t.right(180)
64 |     elif current == 180:
65 |         t.left(90)
66 |     elif current == 270:
67 |         pass
68 |     else:
69 |         raise ValueError('not a right angle!')
70 | 
71 | 
72 | def move_turtle(command):
73 |     if command == 'up':
74 |         go_up()
75 |     elif command == 'down':
76 |         go_down()
77 |     elif command == 'left':
78 |         go_left()
79 |     elif command == 'right':
80 |         go_right()
81 |     elif command == 'go':
82 |         t.forward(100)
83 |     elif command == 'stop':
84 |         print('Stopping the turtle')
85 | 
--------------------------------------------------------------------------------