├── demo.png
├── README.md
├── recording_helper.py
├── main.py
├── tf_helper.py
└── turtle_helper.py

/demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rubentak/realtime-voice-command-recognition/main/demo.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Realtime Voice Command Recognition
2 | 
3 | Build your own real-time voice command recognition model with TensorFlow.
4 | 
5 | **Watch the video tutorial:**
6 | 
7 | [![Alt text](https://img.youtube.com/vi/m-JzldXm9bQ/hqdefault.jpg)](https://youtu.be/m-JzldXm9bQ)
8 | 
9 | ## Instructions
10 | 
11 | - Run the Colab from [this TensorFlow tutorial](https://www.tensorflow.org/tutorials/audio/simple_audio)
12 | - Follow the video to zip and download the trained model
13 | - Make sure to use the correct `commands` order in `main.py` (same as when running the Colab)
14 | - Install pyaudio and tensorflow
15 | - Extract the zip to a folder named `saved_model`
16 | - Run `python main.py`
17 | 
18 | ## Screenshot
19 | 
20 | ![Demo](demo.png)
21 | 
22 | 
--------------------------------------------------------------------------------
/recording_helper.py:
--------------------------------------------------------------------------------
1 | import pyaudio
2 | import numpy as np
3 | 
4 | FRAMES_PER_BUFFER = 3200
5 | FORMAT = pyaudio.paInt16
6 | CHANNELS = 1
7 | RATE = 16000
8 | p = pyaudio.PyAudio()
9 | 
10 | def record_audio():
11 |     stream = p.open(
12 |         format=FORMAT,
13 |         channels=CHANNELS,
14 |         rate=RATE,
15 |         input=True,
16 |         frames_per_buffer=FRAMES_PER_BUFFER
17 |     )
18 | 
19 |     #print("start recording...")
20 | 
21 |     frames = []
22 |     seconds = 1
23 |     for i in range(0, int(RATE / FRAMES_PER_BUFFER * seconds)):
24 |         data = stream.read(FRAMES_PER_BUFFER)
25 |         frames.append(data)
26 | 
27 |     # print("recording stopped")
28 | 
29 | 
    stream.stop_stream()
30 |     stream.close()
31 | 
32 |     return np.frombuffer(b''.join(frames), dtype=np.int16)
33 | 
34 | 
35 | def terminate():
36 |     p.terminate()
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | from tensorflow.keras import models
4 | 
5 | from recording_helper import record_audio, terminate
6 | from tf_helper import preprocess_audiobuffer
7 | 
8 | # !! Modify this in the correct order
9 | commands = ['left', 'down', 'stop', 'up', 'right', 'no', 'go', 'yes']
10 | 
11 | loaded_model = models.load_model("saved_model")
12 | 
13 | def predict_mic():
14 |     audio = record_audio()
15 |     spec = preprocess_audiobuffer(audio)
16 |     prediction = loaded_model(spec)
17 |     label_pred = np.argmax(prediction, axis=1)
18 |     command = commands[label_pred[0]]
19 |     print("Predicted label:", command)
20 |     return command
21 | 
22 | if __name__ == "__main__":
23 |     from turtle_helper import move_turtle
24 |     while True:
25 |         command = predict_mic()
26 |         move_turtle(command)
27 |         if command == "stop":
28 |             terminate()
29 |             break
--------------------------------------------------------------------------------
/tf_helper.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | 
4 | 
5 | # Set the seed value for experiment reproducibility.
6 | seed = 42
7 | tf.random.set_seed(seed)
8 | np.random.seed(seed)
9 | 
10 | def get_spectrogram(waveform):
11 |     # Zero-padding for an audio waveform with less than 16,000 samples.
12 |     input_len = 16000
13 |     waveform = waveform[:input_len]
14 |     zero_padding = tf.zeros(
15 |         [16000] - tf.shape(waveform),
16 |         dtype=tf.float32)
17 |     # Cast the waveform tensors' dtype to float32.
18 |     waveform = tf.cast(waveform, dtype=tf.float32)
19 |     # Concatenate the waveform with `zero_padding`, which ensures all audio
20 |     # clips are of the same length.
21 |     equal_length = tf.concat([waveform, zero_padding], 0)
22 |     # Convert the waveform to a spectrogram via a STFT.
23 |     spectrogram = tf.signal.stft(
24 |         equal_length, frame_length=255, frame_step=128)
25 |     # Obtain the magnitude of the STFT.
26 |     spectrogram = tf.abs(spectrogram)
27 |     # Add a `channels` dimension, so that the spectrogram can be used
28 |     # as image-like input data with convolution layers (which expect
29 |     # shape (`batch_size`, `height`, `width`, `channels`).
30 |     spectrogram = spectrogram[..., tf.newaxis]
31 |     return spectrogram
32 | 
33 | 
34 | def preprocess_audiobuffer(waveform):
35 |     """
36 |     waveform: ndarray of size (16000, )
37 | 
38 |     output: Spectrogram Tensor of size: (1, `height`, `width`, `channels`)
39 |     """
40 |     # normalize from [-32768, 32767] to [-1, 1]
41 |     waveform = waveform / 32768
42 | 
43 |     waveform = tf.convert_to_tensor(waveform, dtype=tf.float32)
44 | 
45 |     spectogram = get_spectrogram(waveform)
46 | 
47 |     # add one dimension
48 |     spectogram = tf.expand_dims(spectogram, 0)
49 | 
50 |     return spectogram
--------------------------------------------------------------------------------
/turtle_helper.py:
--------------------------------------------------------------------------------
1 | import turtle
2 | 
3 | s = turtle.getscreen()
4 | 
5 | t = turtle.Turtle() # starts at right:
6 | 
7 | size = t.turtlesize()
8 | increase = (2 * num for num in size)
9 | t.turtlesize(*increase)
10 | 
11 | t.pensize(5)
12 | t.shapesize()
13 | t.pencolor("blue")
14 | 
15 | def go_right():
16 |     # target = 0
17 |     current = t.heading()
18 |     if current == 0:
19 |         pass
20 |     elif current == 90:
21 |         t.right(90)
22 |     elif current == 180:
23 |         t.right(180)
24 |     elif current == 270:
25 |         t.left(90)
26 |     else:
27 |         raise ValueError('not a right angle!')
28 | 
29 | def go_up():
30 |     # target = 90
31 | 
    current = t.heading()
32 |     if current == 0:
33 |         t.left(90)
34 |     elif current == 90:
35 |         pass
36 |     elif current == 180:
37 |         t.right(90)
38 |     elif current == 270:
39 |         t.left(180)
40 |     else:
41 |         raise ValueError('not a right angle!')
42 | 
43 | def go_left():
44 |     # target = 180
45 |     current = t.heading()
46 |     if current == 0:
47 |         t.left(180)
48 |     elif current == 90:
49 |         t.left(90)
50 |     elif current == 180:
51 |         pass
52 |     elif current == 270:
53 |         t.right(90)
54 |     else:
55 |         raise ValueError('not a right angle!')
56 | 
57 | def go_down():
58 |     # target = 270
59 |     current = t.heading()
60 |     if current == 0:
61 |         t.right(90)
62 |     elif current == 90:
63 |         t.right(180)
64 |     elif current == 180:
65 |         t.left(90)
66 |     elif current == 270:
67 |         pass
68 |     else:
69 |         raise ValueError('not a right angle!')
70 | 
71 | 
72 | def move_turtle(command):
73 |     if command == 'up':
74 |         go_up()
75 |     elif command == 'down':
76 |         go_down()
77 |     elif command == 'left':
78 |         go_left()
79 |     elif command == 'right':
80 |         go_right()
81 |     elif command == 'go':
82 |         t.forward(100)
83 |     elif command == 'stop':
84 |         print('Stopping the turtle')
85 | 
--------------------------------------------------------------------------------