├── .gitignore
├── LICENSE
├── README.md
├── collect_data.py
├── data
│   ├── keyword
│   │   └── .gitignore
│   ├── not-keyword
│   │   └── .gitignore
│   └── test
│       ├── keyword
│       │   └── .gitignore
│       └── not-keyword
│           └── .gitignore
├── model
│   └── .gitignore
├── mycroft_keyword.py
├── requirements.txt
├── test_keyword.py
└── train_keyword.py

/.gitignore:
--------------------------------------------------------------------------------
__pycache__/
src/
env/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright 2017 Mycroft AI, Inc.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Mycroft Precise #

Mycroft Precise is a wake word listener built on modern machine learning and audio processing techniques.

## How it works ##

First, using [librosa][librosa], it extracts `20` [MFCC features][mfcc] from the input audio. Next, it creates an [LSTM network][lstm] using [tflearn][tflearn] and trains it to distinguish recordings of the wake word from everything else. The trained model is saved to the `model/` folder and can be reloaded to test new audio for the wake word.
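
In code, that pipeline is roughly the following sketch (condensed from `mycroft_keyword.py`; the real module also pads the MFCC matrices to a common length, persists the model, and loads the training data):

```python
import librosa
import tflearn

def extract_mfccs(wav_path, n_mfcc=20):
    # Load the audio as mono and compute an (n_mfcc, time) matrix of MFCC features
    wave, sample_rate = librosa.load(wav_path, mono=True)
    return librosa.feature.mfcc(wave, sample_rate, n_mfcc=n_mfcc)

def build_keyword_model(n_mfcc, max_time_steps):
    # LSTM over the MFCC sequence, then a 2-way softmax: [keyword, not-keyword]
    net = tflearn.input_data([None, n_mfcc, max_time_steps])
    net = tflearn.lstm(net, 16, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.0001,
                             loss='categorical_crossentropy')
    return tflearn.DNN(net, tensorboard_verbose=0)
```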

## Current State ##

Currently, we are in the process of collecting and tagging data to train the network and assess how well it generalizes.

## File Descriptions ##

- `train_keyword.py`: Reads the wav files in the `data` folder and creates/trains the model. Every `20` epochs the script saves the model to the `model` folder. Hit `Ctrl+C` to stop training; the next time the script is run, it continues training where it left off.
- `test_keyword.py`: Uses the audio in `data/test` and the model saved in `model/` to report accuracy statistics, including false positive and false negative rates.
- `collect_data.py`: A small utility for quickly recording wav files into the data folders.
- `mycroft_keyword.py`: Used internally to define the network architecture and to perform common tasks like loading training data.

## Setup ##

- Install `python3`. (Ubuntu: `sudo apt-get install python3-pip`)
- Install dependencies, ideally in a virtualenv (add `sudo` if installing system-wide): `pip3 install -r requirements.txt`
- `collect_data.py` additionally needs `pyaudio` (`pip3 install pyaudio`), which is not listed in `requirements.txt`.

That's it! Now put some data in the `data` folder, run the training script, and you should be good to go!

[tflearn]:http://tflearn.org/
[librosa]:https://github.com/librosa/librosa
[mfcc]:https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
[lstm]:https://en.wikipedia.org/wiki/Long_short-term_memory

--------------------------------------------------------------------------------
/collect_data.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

from os.path import isfile

import pyaudio
import wave

from sys import stdin
from select import select
import tty
from termios import tcsetattr, tcgetattr, TCSADRAIN

def key_pressed():
    return select([stdin], [], [], 0) == ([stdin], [], [])

orig_settings = None
def termios_wrapper(main):
    # Put the terminal into cbreak mode for the duration of main(), then restore it
    global orig_settings
    orig_settings = tcgetattr(stdin)
    try:
        hide_input()
        main()
    finally:
        tcsetattr(stdin, TCSADRAIN, orig_settings)

def show_input():
    tcsetattr(stdin, TCSADRAIN, orig_settings)

def hide_input():
    tty.setcbreak(stdin.fileno())

CHANNELS = 1
CHUNK_SIZE = 1024
FORMAT = pyaudio.paInt16
RATE = 44100

RECORD_KEY = ' '
EXIT_KEY_CODE = 27

def record_until(p, should_return):
    stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE,
                    input=True, frames_per_buffer=CHUNK_SIZE)

    frames = []
    while not should_return():
        frames.append(stream.read(CHUNK_SIZE))

    stream.stop_stream()
    stream.close()

    return b''.join(frames), p.get_sample_size(FORMAT)

def save_audio(name, data, width):
    wf = wave.open(name, 'wb')
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(width)
    wf.setframerate(RATE)
    wf.writeframes(data)
    wf.close()

def next_name(name):
    # Replace the '#' placeholders with the first zero-padded number
    # whose file does not exist yet (e.g. recording-##.wav -> recording-00.wav)
    name += '.wav'
    pos, num_digits = None, None
    try:
        pos = name.index('#')
        num_digits = name.count('#')
    except ValueError:
        print("Name must contain at least one # to indicate where to put the number.")
        raise

    def get_name(i):
        nonlocal name, pos
        return name[:pos] + str(i).zfill(num_digits) + name[pos + num_digits:]

    i = 0
    while True:
        if not isfile(get_name(i)):
            break
        i += 1

    return get_name(i)

def wait_to_continue():
    while True:
        c = stdin.read(1)
        if c == RECORD_KEY:
            return True
        elif ord(c) == EXIT_KEY_CODE:
            return False

def main():
    def should_return():
        return key_pressed() and stdin.read(1) == RECORD_KEY

    show_input()
    audio_name = input("Audio name (Ex. recording-##):")
    hide_input()

    p = pyaudio.PyAudio()

    while True:
        print('Press space to record (esc to exit)...')

        if not wait_to_continue():
            break

        print('Recording...')
        d, w = record_until(p, should_return)
        name = next_name(audio_name)
        save_audio(name, d, w)
        print('Saved as ' + name)

    p.terminate()

if __name__ == '__main__':
    termios_wrapper(main)
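
A note on the naming scheme used by `collect_data.py`: the `#` characters in the name you type mark where a zero-padded counter goes, and `next_name` picks the first number whose file does not already exist. A small, hypothetical session (importing the module requires `pyaudio` to be installed):

```python
# Hypothetical: what next_name() from collect_data.py resolves to,
# assuming recording-00.wav and recording-01.wav already exist on disk.
from collect_data import next_name

print(next_name('recording-##'))  # -> 'recording-02.wav' (two '#'s, so two zero-padded digits)
```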

--------------------------------------------------------------------------------
/data/keyword/.gitignore:
--------------------------------------------------------------------------------
*
!.gitignore

--------------------------------------------------------------------------------
/data/not-keyword/.gitignore:
--------------------------------------------------------------------------------
*
!.gitignore

--------------------------------------------------------------------------------
/data/test/keyword/.gitignore:
--------------------------------------------------------------------------------
*
!.gitignore

--------------------------------------------------------------------------------
/data/test/not-keyword/.gitignore:
--------------------------------------------------------------------------------
*
!.gitignore

--------------------------------------------------------------------------------
/model/.gitignore:
--------------------------------------------------------------------------------
*
!.gitignore

--------------------------------------------------------------------------------
/mycroft_keyword.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

from __future__ import division, print_function, absolute_import
from os import listdir
from os.path import isfile, join
import numpy as np
import librosa
import tflearn
import tensorflow as tf

model_dir = 'model'
model_name = 'keyword.model'
data_dir = 'data'
test_dir = 'test'
keyword_dir = 'keyword'
not_keyword_dir = 'not-keyword'
length_ext = '.len'

learning_rate = 0.0001
num_features = 20
lstm_size = 16
lstm_dropout = 0.8

def load_length():
    """Return the MFCC time length the saved model was trained with, or None"""
    len_file = join(model_dir, model_name + length_ext)
    if isfile(len_file):
        print("Found length file.")
        with open(len_file) as f:
            for line in f:
                if len(line) > 0:
                    return int(line)
    return None

def save_length(length):
    len_file = join(model_dir, model_name + length_ext)
    with open(len_file, 'w') as f:
        f.write(str(length))

def save_model(model, length):
    model.save(join(model_dir, model_name))
    save_length(length)

def _load_mfcc(filename):
    wave, sr = librosa.load(filename, mono=True)
    return librosa.feature.mfcc(wave, sr, n_mfcc=num_features)

def load_mfccs(path):
    return [_load_mfcc(join(path, f)) for f in listdir(path)]

def max_length_mfccs(mfccs):
    max_length = 0
    for mfcc in mfccs:
        width, length = mfcc.shape
        if length > max_length:
            max_length = length
    return max_length

def normalize_mfccs(mfccs, max_length):
    """Pad or chop each MFCC matrix, in place, to exactly max_length time steps"""
    for i in range(len(mfccs)):
        width, length = mfccs[i].shape
        if length <= max_length:
            padding_dim = ((0, 0), (0, max_length - length))
            mfccs[i] = np.pad(mfccs[i], padding_dim, mode='constant',
                              constant_values=0)
        else:
            mfccs[i] = np.split(mfccs[i], [max_length], axis=1)[0]

def create_net(in_sx, in_sy, out_sx):
    """
    Creates a tflearn neural network with the correct
    architecture for learning to hear the keyword
    """
    net = tflearn.input_data([None, in_sx, in_sy])
    net = tflearn.lstm(net, lstm_size, dropout=lstm_dropout)
    net = tflearn.fully_connected(net, out_sx, activation='softmax')
    net = tflearn.regression(net, learning_rate=learning_rate, optimizer='adam',
                             loss='categorical_crossentropy')
    return net

def create_model(net):
    return tflearn.DNN(net, tensorboard_verbose=0)

def train_model(model, inputs, outputs, n_epoch):
    # Trains on the whole data set as a single batch, holding out 10% for validation
    model.fit(inputs, outputs, n_epoch=n_epoch, validation_set=0.1, show_metric=True,
              batch_size=len(inputs))

def fix_version_errors():
    # Work around TensorFlow/tflearn version differences by also registering the
    # trainable variables under the legacy VARIABLES collection key
    col = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    for x in col:
        tf.add_to_collection(tf.GraphKeys.VARIABLES, x)

def try_load_into_model(model):
    """
    Loads a previously saved model from model/ if one exists.
    Returns True if a model was found and loaded, False otherwise.
    """
    model_path = join(model_dir, model_name)
    if isfile(model_path + '.index'):
        print('Loading saved model...')
        model.load(model_path)
        return True
    return False

def _load_data(parent_dir, max_length=None):
    """
    Provide max_length to force the MFCCs to be chopped
    or padded to the length of an already trained network
    """
    mfccs = []
    outputs = []

    def load_subdir(subdir, output):
        nonlocal mfccs, outputs
        path = join(parent_dir, subdir)
        subdir_mfccs = load_mfccs(path)

        mfccs += subdir_mfccs
        outputs += [np.array(output)] * len(subdir_mfccs)

    load_subdir(keyword_dir, [1, 0])
    load_subdir(not_keyword_dir, [0, 1])

    if max_length is None:
        max_length = max_length_mfccs(mfccs)
    normalize_mfccs(mfccs, max_length)

    return mfccs, outputs

def load_training_data(max_length=None):
    return _load_data(data_dir, max_length)

def load_test_data(max_length=None):
    return _load_data(join(data_dir, test_dir), max_length)
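
The repository does not yet include a standalone inference script. A minimal, hypothetical sketch of classifying a single wav file with the helpers above (reusing the module's private `_load_mfcc` and assuming a model has already been trained into `model/`) could look like this:

```python
#!/usr/bin/env python3
# Hypothetical sketch, not part of the repository: classify one wav file.
import mycroft_keyword as kw

length = kw.load_length()  # time dimension the saved model was trained with
if length is None:
    raise RuntimeError('No trained model found; run train_keyword.py first')

net = kw.create_net(kw.num_features, length, 2)
kw.fix_version_errors()
model = kw.create_model(net)
if not kw.try_load_into_model(model):
    raise RuntimeError('No trained model found; run train_keyword.py first')

mfccs = [kw._load_mfcc('my-recording.wav')]  # 'my-recording.wav' is a placeholder path
kw.normalize_mfccs(mfccs, length)            # pad/chop in place to the trained length
keyword_prob, other_prob = model.predict(mfccs)[0]
print('keyword' if keyword_prob > other_prob else 'not keyword')
```

The label order follows `_load_data` above, which assigns `[1, 0]` to keyword clips and `[0, 1]` to everything else.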

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
audioread==2.1.4
cycler==0.10.0
Cython==0.25.2
dask==0.12.0
decorator==4.0.10
flake8==3.2.1
h5py==2.6.0
joblib==0.10.3
-e git+https://github.com/librosa/librosa.git@01951000f20105ef31a4f80e897cd41c386b9314#egg=librosa
matplotlib==1.5.3
mccabe==0.5.2
networkx==1.11
numpy==1.11.2
Pillow==3.4.2
protobuf==3.1.0
pycodestyle==2.2.0
pyflakes==1.3.0
pyparsing==2.1.10
python-dateutil==2.6.0
pytz==2016.10
resampy==0.1.4
scikit-image==0.12.3
scikit-learn==0.18.1
scipy==0.18.1
six==1.10.0
tensorflow==1.2.0rc0
tflearn==0.3.1
toolz==0.8.1

--------------------------------------------------------------------------------
/test_keyword.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

from __future__ import division, print_function, absolute_import
import mycroft_keyword as kw

ins, outs = kw.load_test_data(kw.load_length())
net = kw.create_net(*ins[0].shape, *outs[0].shape)
kw.fix_version_errors()
model = kw.create_model(net)
if not kw.try_load_into_model(model):
    raise RuntimeError('No trained model found in model/; run train_keyword.py first')

_outs = model.predict(ins)

print(outs)
print(_outs)

num_correct, num_incorrect = 0, 0
num_false_pos, num_false_neg = 0, 0
for i in range(len(outs)):
    # Index 0 is the keyword class, index 1 is not-keyword
    if (_outs[i][0] > _outs[i][1]) == (outs[i][0] > outs[i][1]):
        num_correct += 1
    else:
        if _outs[i][0] > _outs[i][1]:
            num_false_pos += 1  # predicted the keyword when it was not there
        else:
            num_false_neg += 1  # missed the keyword
        num_incorrect += 1
total = num_incorrect + num_correct

print(str(num_correct) + " out of " + str(total))
print(str(100.0 * num_correct / total) + " %")
print()
print(str(100.0 * num_false_pos / total) + " % false positives")
print(str(100.0 * num_false_neg / total) + " % false negatives")
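
The false positive and false negative figures above are reported as a share of the whole test set. If you would rather see them relative to the size of each class, a small, hypothetical addition to the end of `test_keyword.py` could compute per-class rates:

```python
# Hypothetical extension: rates relative to each class instead of the whole test set.
num_keyword = sum(1 for o in outs if o[0] > o[1])  # actual keyword samples
num_not_keyword = total - num_keyword              # actual not-keyword samples

if num_not_keyword:
    print(str(100.0 * num_false_pos / num_not_keyword) + " % false positive rate")
if num_keyword:
    print(str(100.0 * num_false_neg / num_keyword) + " % false negative rate")
```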

--------------------------------------------------------------------------------
/train_keyword.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

from __future__ import division, print_function, absolute_import
import mycroft_keyword as kw

epochs_per_step = 20

inputs, outputs = kw.load_training_data(kw.load_length())
net = kw.create_net(*inputs[0].shape, *outputs[0].shape)
kw.fix_version_errors()
model = kw.create_model(net)
kw.try_load_into_model(model)

# Train indefinitely, saving a checkpoint every epochs_per_step epochs.
# Hit Ctrl+C to stop; rerunning the script resumes from the last save.
while True:
    kw.train_model(model, inputs, outputs, epochs_per_step)
    print("Saving...")
    kw.save_model(model, len(inputs[0][0]))  # also record the padded input length

--------------------------------------------------------------------------------