├── data_processing ├── labels.txt ├── reformat_filename.py └── get_dataset.py ├── datasets └── sources.txt ├── um_detector.PNG ├── tensorflow_speech_commands.PNG ├── .gitignore ├── demo ├── save_audio_stream_to_folder.py ├── label_wav_folder.py └── label_wav.py └── README.md /data_processing/labels.txt: -------------------------------------------------------------------------------- 1 | _silence_ 2 | _unknown_ 3 | um -------------------------------------------------------------------------------- /datasets/sources.txt: -------------------------------------------------------------------------------- 1 | http://openslr.org/45/ 2 | http://www.dcs.gla.ac.uk/vincia/?p=307 -------------------------------------------------------------------------------- /um_detector.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ezxzeng/um_detector/HEAD/um_detector.PNG -------------------------------------------------------------------------------- /tensorflow_speech_commands.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ezxzeng/um_detector/HEAD/tensorflow_speech_commands.PNG -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /datasets/ST-AEDS-20180100_1-OS 2 | /datasets/ST-AEDS-20180100_1-OS/* 3 | datasets/up 4 | datasets/up/* 5 | datasets/vocalizationcorpus 6 | datasets/vocalizationcorpus/* 7 | .idea 8 | .idea/* 9 | demo/demo.wav -------------------------------------------------------------------------------- /demo/save_audio_stream_to_folder.py: -------------------------------------------------------------------------------- 1 | import sounddevice as sd 2 | from scipy.io.wavfile import write 3 | 4 | from pydub import AudioSegment 5 | 6 | fs = 16000 # Sample rate 7 | 8 | 9 | if __name__ == "__main__": 10 | seconds = int(input("how many seconds should I record? (must be greater than 2): ")) 11 | while seconds < 2: 12 | seconds = int(input("how many seconds should I record? 
(must be greater than 2): ")) 13 | 14 | myrecording = sd.rec(int(seconds * fs), samplerate=fs, channels=1, dtype='int16') 15 | sd.wait() # Wait until recording is finished 16 | write('demo.wav', fs, myrecording, ) # Save as WAV file 17 | 18 | print("done recording, saving recording") 19 | audio_sample = AudioSegment.from_wav('demo.wav') 20 | 21 | start = 0 22 | count = 0 23 | while start + 2 < seconds: 24 | sample_segment = audio_sample[start * 1000: (start + 2) * 1000] 25 | sample_segment.export(f"demo/{count}.wav", format="wav", ) 26 | start += 0.25 27 | count += 1 28 | -------------------------------------------------------------------------------- /data_processing/reformat_filename.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import pandas as pd 4 | 5 | import tqdm 6 | tqdm.tqdm.pandas() 7 | 8 | 9 | def copy_with_new_name(row): 10 | if row["is_filler"]: 11 | is_um = "um" 12 | else: 13 | is_um = "other" 14 | 15 | segment_path = f"datasets/vocalizationcorpus/data_2s/{row['set']}/{is_um}/{row['name']}.wav" 16 | 17 | new_path = f"datasets/vocalizationcorpus/data_2s/reformatted/{is_um}/{row['original_spk']}-{row['sample']}-nohash-{row['name'][-1]}-.wav" 18 | 19 | shutil.copy(segment_path, new_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | if not os.path.exists("datasets/vocalizationcorpus/data_2s/reformatted"): 24 | os.mkdir("datasets/vocalizationcorpus/data_2s/reformatted") 25 | 26 | if not os.path.exists("datasets/vocalizationcorpus/data_2s/reformatted/other"): 27 | os.mkdir("datasets/vocalizationcorpus/data_2s/reformatted/other") 28 | 29 | if not os.path.exists("datasets/vocalizationcorpus/data_2s/reformatted/um"): 30 | os.mkdir("datasets/vocalizationcorpus/data_2s/reformatted/um") 31 | 32 | labels_sections_splitted = pd.read_pickle("datasets/vocalizationcorpus/labeled_sections_splitted.pickle") 33 | labels_sections_splitted.progress_apply(copy_with_new_name, axis=1) -------------------------------------------------------------------------------- /demo/label_wav_folder.py: -------------------------------------------------------------------------------- 1 | from demo.label_wav import load_labels, load_graph, run_graph 2 | import tensorflow as tf 3 | import argparse 4 | import glob 5 | import os 6 | import sys 7 | 8 | FLAGS = None 9 | 10 | def load_graph_and_label(labels, graph): 11 | if not labels or not tf.io.gfile.exists(labels): 12 | tf.compat.v1.logging.fatal('Labels file does not exist %s', labels) 13 | 14 | if not graph or not tf.io.gfile.exists(graph): 15 | tf.compat.v1.logging.fatal('Graph file does not exist %s', graph) 16 | 17 | labels_list = load_labels(labels) 18 | 19 | # load graph, which is stored in the default session 20 | load_graph(graph) 21 | 22 | return labels_list 23 | 24 | 25 | def label_wav(wav, labels_list, input_name, output_name, how_many_labels): 26 | """Loads the model and labels, and runs the inference to print predictions.""" 27 | if not wav or not tf.io.gfile.exists(wav): 28 | tf.compat.v1.logging.fatal('Audio file does not exist %s', wav) 29 | 30 | with open(wav, 'rb') as wav_file: 31 | wav_data = wav_file.read() 32 | 33 | label, score = run_graph(wav_data, labels_list, input_name, output_name, how_many_labels) 34 | return label, score 35 | 36 | 37 | def main(_): 38 | """Entry point for script, converts flags to arguments.""" 39 | labels_list = load_graph_and_label(FLAGS.labels, FLAGS.graph) 40 | 41 | wav_paths = glob.glob(os.path.join(FLAGS.wav_folder, "*.wav")) 42 | 43 | for 
wav in wav_paths: 44 | label, score = label_wav(wav, labels_list, FLAGS.input_name, 45 | FLAGS.output_name, FLAGS.how_many_labels) 46 | 47 | print(f"{wav} \t {label}: {score}") 48 | 49 | 50 | if __name__ == '__main__': 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument( 53 | '--wav_folder', type=str, default='', help='folder of audio files to be identified.') 54 | parser.add_argument( 55 | '--graph', type=str, default='', help='Model to use for identification.') 56 | parser.add_argument( 57 | '--labels', type=str, default='', help='Path to file containing labels.') 58 | parser.add_argument( 59 | '--input_name', 60 | type=str, 61 | default='wav_data:0', 62 | help='Name of WAVE data input node in model.') 63 | parser.add_argument( 64 | '--output_name', 65 | type=str, 66 | default='labels_softmax:0', 67 | help='Name of node outputting a prediction in the model.') 68 | parser.add_argument( 69 | '--how_many_labels', 70 | type=int, 71 | default=3, 72 | help='Number of results to show.') 73 | 74 | FLAGS, unparsed = parser.parse_known_args() 75 | tf.compat.v1.app.run(main=main, argv=[sys.argv[0]] + unparsed) 76 | 77 | -------------------------------------------------------------------------------- /demo/label_wav.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | r"""Runs a trained audio graph against a WAVE file and reports the results. 16 | The model, labels and .wav file specified in the arguments will be loaded, and 17 | then the predictions from running the model against the audio data will be 18 | printed to the console. This is a useful script for sanity checking trained 19 | models, and as an example of how to use an audio model from Python. 
20 | Here's an example of running it: 21 | python tensorflow/examples/speech_commands/label_wav.py \ 22 | --graph=/tmp/my_frozen_graph.pb \ 23 | --labels=/tmp/speech_commands_train/conv_labels.txt \ 24 | --wav=/tmp/speech_dataset/left/a5d485dc_nohash_0.wav 25 | """ 26 | from __future__ import absolute_import 27 | from __future__ import division 28 | from __future__ import print_function 29 | 30 | import argparse 31 | import sys 32 | 33 | import tensorflow as tf 34 | 35 | 36 | FLAGS = None 37 | 38 | 39 | def load_graph(filename): 40 | """Unpersists graph from file as default graph.""" 41 | with tf.io.gfile.GFile(filename, 'rb') as f: 42 | graph_def = tf.compat.v1.GraphDef() 43 | graph_def.ParseFromString(f.read()) 44 | tf.import_graph_def(graph_def, name='') 45 | 46 | 47 | def load_labels(filename): 48 | """Read in labels, one label per line.""" 49 | return [line.rstrip() for line in tf.io.gfile.GFile(filename)] 50 | 51 | 52 | def run_graph(wav_data, labels, input_layer_name, output_layer_name, 53 | num_top_predictions): 54 | """Runs the audio data through the graph and prints predictions.""" 55 | with tf.compat.v1.Session() as sess: 56 | # Feed the audio data as input to the graph. 57 | # predictions will contain a two-dimensional array, where one 58 | # dimension represents the input image count, and the other has 59 | # predictions per class 60 | softmax_tensor = sess.graph.get_tensor_by_name(output_layer_name) 61 | predictions, = sess.run(softmax_tensor, {input_layer_name: wav_data}) 62 | 63 | # Sort to show labels in order of confidence 64 | top_k = predictions.argsort()[-num_top_predictions:][::-1] 65 | # for node_id in top_k: 66 | # human_string = labels[node_id] 67 | # score = predictions[node_id] 68 | # print('%s (score = %.5f)' % (human_string, score)) 69 | 70 | return labels[top_k[0]], predictions[top_k[0]] 71 | 72 | 73 | def label_wav(wav, labels, graph, input_name, output_name, how_many_labels): 74 | """Loads the model and labels, and runs the inference to print predictions.""" 75 | if not wav or not tf.io.gfile.exists(wav): 76 | tf.compat.v1.logging.fatal('Audio file does not exist %s', wav) 77 | 78 | if not labels or not tf.io.gfile.exists(labels): 79 | tf.compat.v1.logging.fatal('Labels file does not exist %s', labels) 80 | 81 | if not graph or not tf.io.gfile.exists(graph): 82 | tf.compat.v1.logging.fatal('Graph file does not exist %s', graph) 83 | 84 | labels_list = load_labels(labels) 85 | 86 | # load graph, which is stored in the default session 87 | load_graph(graph) 88 | 89 | with open(wav, 'rb') as wav_file: 90 | wav_data = wav_file.read() 91 | 92 | run_graph(wav_data, labels_list, input_name, output_name, how_many_labels) 93 | 94 | 95 | def main(_): 96 | """Entry point for script, converts flags to arguments.""" 97 | label_wav(FLAGS.wav, FLAGS.labels, FLAGS.graph, FLAGS.input_name, 98 | FLAGS.output_name, FLAGS.how_many_labels) 99 | 100 | 101 | if __name__ == '__main__': 102 | parser = argparse.ArgumentParser() 103 | parser.add_argument( 104 | '--wav', type=str, default='', help='Audio file to be identified.') 105 | parser.add_argument( 106 | '--graph', type=str, default='', help='Model to use for identification.') 107 | parser.add_argument( 108 | '--labels', type=str, default='', help='Path to file containing labels.') 109 | parser.add_argument( 110 | '--input_name', 111 | type=str, 112 | default='wav_data:0', 113 | help='Name of WAVE data input node in model.') 114 | parser.add_argument( 115 | '--output_name', 116 | type=str, 117 | default='labels_softmax:0', 118 | 
help='Name of node outputting a prediction in the model.') 119 | parser.add_argument( 120 | '--how_many_labels', 121 | type=int, 122 | default=3, 123 | help='Number of results to show.') 124 | 125 | FLAGS, unparsed = parser.parse_known_args() 126 | tf.compat.v1.app.run(main=main, argv=[sys.argv[0]] + unparsed) -------------------------------------------------------------------------------- /data_processing/get_dataset.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.model_selection import train_test_split as sklearn_train_test_split 4 | 5 | from pydub import AudioSegment 6 | 7 | import os 8 | import tqdm 9 | 10 | 11 | def join_action(row, index): 12 | if np.isnan(row[f"start_voc_{index}"]): 13 | return None 14 | else: 15 | return ",".join(row[f"type_voc_{index},start_voc_{index},end_voc_{index}".split(",")].astype(str)) 16 | 17 | 18 | def expand_actions(row): 19 | return row["value"].split(",") 20 | 21 | 22 | def get_labels(labels_path): 23 | labels = pd.read_csv(labels_path) 24 | for i in range(1, 7): 25 | labels[f"action_{i}"] = labels.apply(join_action, axis=1, args=(i,)) 26 | 27 | labels = labels[['Sample', 'original_spk', 'gender', 'original_time', 'action_1', 'action_2', 'action_3', 28 | 'action_4', 'action_5', 'action_6']] 29 | 30 | labels = labels.melt(['Sample', 'original_spk', 'gender', 'original_time']) 31 | 32 | labels = labels.dropna() 33 | 34 | labels = pd.concat([labels, labels.apply(lambda x: pd.Series(x["value"].split(",")), axis=1)], axis=1) 35 | labels = labels[['Sample', 'original_spk', 'gender', 'original_time', 0, 1, 2]] 36 | labels.columns = ['Sample', 'original_spk', 'gender', 'original_time', "type", "start", "end"] 37 | 38 | labels = labels[labels.type == "filler"] 39 | labels["start"] = labels.start.astype(float) 40 | labels["end"] = labels.end.astype(float) 41 | return labels 42 | 43 | 44 | def get_2s_label(row, filtered_sample_df): 45 | for sample_row in filtered_sample_df.iterrows(): 46 | if sample_row[1]['end'] < row['start'] or sample_row[1]['start'] > row['end']: 47 | continue 48 | elif sample_row[1]['start'] > row['start'] and sample_row[1]['end'] < row['end']: 49 | return 1 50 | else: 51 | if sample_row[1]['start'] > row['start']: 52 | overlap = row['end'] - sample_row[1]['start'] 53 | else: 54 | overlap = sample_row[1]['end'] - row['start'] 55 | 56 | if overlap > 1 or (overlap / (sample_row[1]['end'] - sample_row[1]['start'])) > 0.9: 57 | return 1 58 | 59 | # TODO: what to do when somewhere in between 60 | 61 | return 0 62 | 63 | 64 | def gen_2s_sections(labels_df): 65 | samples = labels_df["Sample"].unique() 66 | print("labling 2 second clips") 67 | for sample in tqdm.tqdm(samples): 68 | section_df = pd.DataFrame(np.linspace(0, 11, 23)) 69 | section_df.columns = ['start'] 70 | section_df['end'] = section_df.shift(-4) 71 | section_df = section_df.dropna() 72 | 73 | section_df["name"] = [sample + "-" + str(i) for i in range(len(section_df))] 74 | section_df["sample"] = sample 75 | 76 | section_df["original_spk"] = labels_df[labels_df.Sample == sample]["original_spk"].values[0] 77 | 78 | section_df["is_filler"] = section_df.apply(get_2s_label, axis=1, args=(labels_df[labels_df.Sample == sample], )) 79 | 80 | yield section_df 81 | 82 | 83 | def train_test_split(labeled_sections): 84 | # split people 85 | speakers = labeled_sections.original_spk.unique() 86 | train_speakers, val_speakers = sklearn_train_test_split(speakers, test_size=0.1) 87 | 88 | # split 
samples
 89 |     train_samples, val_samples = sklearn_train_test_split(
 90 |         labeled_sections[labeled_sections.original_spk.isin(train_speakers)]["sample"].unique(), test_size=0.15)  # split sample ids (not rows) so the "in val_samples" check below works
 91 | 
 92 |     def label_train_val(row):
 93 |         nonlocal val_samples
 94 |         nonlocal val_speakers
 95 | 
 96 |         if row['original_spk'] in val_speakers or row['sample'] in val_samples:
 97 |             return "val"
 98 |         else:
 99 |             return "train"
100 | 
101 |     labeled_sections['set'] = labeled_sections.apply(label_train_val, axis=1)
102 | 
103 | 
104 | def split_save_audio(labeled_sections):
105 |     if not os.path.exists("datasets/vocalizationcorpus/data_2s"):
106 |         os.mkdir("datasets/vocalizationcorpus/data_2s")
107 | 
108 |     if not os.path.exists("datasets/vocalizationcorpus/data_2s/val"):
109 |         os.mkdir("datasets/vocalizationcorpus/data_2s/val")
110 |     if not os.path.exists("datasets/vocalizationcorpus/data_2s/val/um"):
111 |         os.mkdir("datasets/vocalizationcorpus/data_2s/val/um")
112 |     if not os.path.exists("datasets/vocalizationcorpus/data_2s/val/other"):
113 |         os.mkdir("datasets/vocalizationcorpus/data_2s/val/other")
114 | 
115 |     if not os.path.exists("datasets/vocalizationcorpus/data_2s/train"):
116 |         os.mkdir("datasets/vocalizationcorpus/data_2s/train")
117 |     if not os.path.exists("datasets/vocalizationcorpus/data_2s/train/um"):
118 |         os.mkdir("datasets/vocalizationcorpus/data_2s/train/um")
119 |     if not os.path.exists("datasets/vocalizationcorpus/data_2s/train/other"):
120 |         os.mkdir("datasets/vocalizationcorpus/data_2s/train/other")
121 | 
122 |     print("saving_segments:")
123 |     for sample, data in tqdm.tqdm(labeled_sections.groupby("sample")):
124 |         orig_file_path = f"datasets/vocalizationcorpus/data/{sample}.wav"
125 | 
126 |         audio_sample = AudioSegment.from_wav(orig_file_path)
127 | 
128 |         for row in data.iterrows():
129 |             start = row[1]['start']
130 |             end = row[1]['end']
131 | 
132 |             if row[1]["is_filler"]:
133 |                 is_um = "um"
134 | 
135 |             else:
136 |                 is_um = "other"
137 | 
138 |             segment_path = f"datasets/vocalizationcorpus/data_2s/{row[1]['set']}/{is_um}/{row[1]['name']}.wav"
139 | 
140 |             sample_segment = audio_sample[start * 1000: end * 1000]
141 |             sample_segment.export(segment_path, format="wav", )
142 | 
143 | 
144 | if __name__ == "__main__":
145 | 
146 |     if os.path.exists("labels.pickle"):
147 |         labels_df = pd.read_pickle("labels.pickle")
148 |     else:
149 |         print("getting labels")
150 |         labels_df = get_labels("datasets/vocalizationcorpus/labels.txt")
151 |         labels_df.to_pickle("labels.pickle")
152 | 
153 |     if os.path.exists("labeled_sections.pickle"):
154 |         labeled_sections = pd.read_pickle("labeled_sections.pickle")
155 |     else:
156 |         labeled_sections = pd.concat(gen_2s_sections(labels_df))
157 |         labeled_sections.to_pickle("labeled_sections.pickle")
158 | 
159 |     train_test_split(labeled_sections)
160 |     labeled_sections.to_pickle("labeled_sections_splitted.pickle")
161 | 
162 |     split_save_audio(labeled_sections)
163 | 
164 | 
165 | 
166 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Um Detector
2 | 
3 | Goal: train a model to detect when an audio segment contains "um" or other filler words.
4 | 
5 | This project was started at Hack the North 2019.
6 | 
7 | 1. [work done during hack the north](#work-done-during-hack-the-north)
8 |     1. [model](#model)
9 |     2. [data](#data)
10 |     3. [training](#training)
11 |     4. [demo](#demo)
12 |     5. [improvements](#improvements)
13 | 2. [post hackathon](#post-hackathon)
14 | 
15 | ## Work done during hack the north:
16 | 
17 | The easiest method is probably to treat this as an audio classification task over every n seconds of audio. Since there's no real long-term time dependency when detecting a single word, a CNN should be good enough. A spectrogram, generated by taking consecutive Fourier transforms of the audio segment, will serve as input to the model.
18 | 
19 | _note: spectrograms show the intensity of frequencies as they change over time_
20 | 
21 | ### Model:
22 | #### Use a simple CNN described in http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf
23 | 
24 | Google already has a training system set up (https://www.tensorflow.org/tutorials/sequences/audio_recognition), so that might be easy enough to build on. The performance here might not be state of the art, but this is a hackathon, and I can build on top of this later, after a proof of concept.
25 | 
26 | ### Data
27 | 
28 | The requirement here is an unscripted, labeled dataset containing filler words such as "um".
29 | I've thought about using Google's or Microsoft's API to label some audio, but those models sometimes actively ignore filler words. That makes some sense as a design decision, but it's entirely useless here.
30 | 
31 | #### found a dataset!
32 | 
33 | SSPNet Vocalization Corpus (http://www.dcs.gla.ac.uk/vincia/?p=378) contains:
34 | 
35 | > 2763 audio clips (11 seconds each) containing at least one laughter or filler instance. Overall, the corpus involves 120 subjects (63 females and 57 males). The clips are extracted from phone calls where two fully unacquainted speakers try to solve the Winter Survival Task.
36 | 
37 | In this case, we're interested in the filler instances. After filtering for those, there are a total of 2988 filler instances in this dataset. Not a whole lot to train on, but this is the only dataset I was able to find that I could actually access.
38 | 
39 | ~~not including the laugh parts at all so that the final dataset would be balanced-ish instead of there being significantly more non-filler moments. It's still not totally balanced, but that's probably okay. Reflective of the actual test distribution and whatever.~~
40 | It's actually not at all balanced... note to self: try undersampling the non-um clips if the initial model doesn't work.
41 | 
42 | Other issue: the speakers have British accents. That's likely to mess with the results.
43 | 
44 | ###### alternative datasets: https://catalog.ldc.upenn.edu/LDC2005S16
45 | 
46 | #### sectioning:
47 | 
48 | Google's predefined model operates on 1 second segments. In this dataset, however, there are 81 instances where the filler segment exceeds 1 second, while only 10 instances contain more than 2 seconds of filler, so 2 seconds can be used as the segment length instead.
49 | 
50 | A 2 second sliding window can be used to create many 2 second audio clips, each labeled according to whether or not it contains a filler word.
51 | 
52 | ##### partials
53 | 
54 | Question: what should I do with 2 second segments that contain only part of a filler word?
55 | 
56 | Decision: if the overlapping part lasts longer than a second or covers more than 90% of the original filler section, the clip is labeled as a filler.
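A minimal sketch of that rule (the full version, which loops over every annotated filler in the clip's source recording, is `get_2s_label` in `data_processing/get_dataset.py`; the function and variable names here are just for illustration):

```
def is_filler_clip(clip_start, clip_end, filler_start, filler_end):
    """Return 1 if a 2 second clip should be labeled as containing a filler."""
    # overlap between the clip window and the annotated filler, in seconds
    overlap = min(clip_end, filler_end) - max(clip_start, filler_start)
    if overlap <= 0:
        return 0  # the clip does not touch this filler at all
    filler_length = filler_end - filler_start
    # count the clip as a filler if it holds more than 1s of the filler,
    # or more than 90% of the original filler section
    return 1 if overlap > 1 or overlap / filler_length > 0.9 else 0
```

The version in the repo additionally returns 1 right away when the filler sits entirely inside the window, and leaves a TODO for the remaining borderline cases.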
57 | 
58 | #### train val split
59 | 
60 | the validation/test set should contain 2 types of instances:
61 | 
62 | - clips from speakers who do not appear in the training set
63 | - clips from filler instances that do not appear in the training set
64 | 
65 | a quick hack was used to make TensorFlow's training pipeline use my custom train/val split
66 | 
67 | ### training
68 | 
69 | #### base model:
70 | 
71 | after running TensorFlow's train script, the validation curve is obviously very wrong.
72 | 
73 | ![tensorflow_speech_commands.PNG](tensorflow_speech_commands.PNG)
74 | 
75 | potential reason: I didn't configure TensorFlow properly (something something bazel, something something ./configure)
76 | 
77 | However, the overall shape of the graph seems right. I've also tested with the provided testing options:
78 | 
79 | with label_wav.py:
80 | 
81 | ```
82 | left (score = 0.80921)
83 | right (score = 0.12201)
84 | _unknown_ (score = 0.04661)
85 | ```
86 | 
87 | with generate_streaming_test_wav
88 | 
89 | ###### warning: this is where I screwed up
90 | I spent much too long trying to get this working: many hours on something that doesn't actually contribute to how this would work, leaving very little time to train my actual model.
91 | 
92 | #### training the actual um detector:
93 | 
94 | so at this point it's late at night and I realize that the checkpoint file cannot automatically accommodate the change in input and output shapes (i.e. the input is now 2000 ms and the output is now just a couple of classes)
95 | 
96 | last minute retraining of the base model. I adjusted unknown_percentage to 50 since there's only a single other class. It starts right away with 50% accuracy, which makes sense as there are only 2 options. It had converged somewhat at only 1.6k steps when I stopped it in order to train my actual um model. The lack of reliable validation data really hurts here: there isn't a way to check for overfitting, and there wasn't enough time to write something up, as I still hadn't figured out how to get the model to predict on a real time audio stream.
97 | 
98 | in conclusion: bad planning on my part
99 | 
100 | ![um_detector_training](um_detector.PNG)
101 | 
102 | this incredibly messed up graph is the training result. The bit at the start is from a training run that I started and then restarted, but couldn't be bothered to remove. The base model trained for 1.5k steps before switching over to the um dataset. Later on, I forgot to turn off the training run, and I really hope it didn't overfit too badly, but again, validation isn't working. The fact that the model converges so quickly might be worrying, or it might just be because there are so few classes. And so little data. There isn't enough data.
103 | 
104 | However, it does seem to be converging to something, which is good, even if the loss isn't going down any further.
105 | 
106 | ### Demo
107 | 
108 | This section is where I spent both not enough time and, at the same time, too much time. The planned demo involved a real time beep as the speaker says a filler word (i.e. um). The only inference that's really working is reading 2 second wav files and returning a result, so that's what the demo will be: the ability to speak into the microphone, save a bunch of wav files into a folder, and send that folder for inference.
109 | 
110 | ### Improvements
111 | 
112 | - I have not implemented any data augmentation methods. I should try some (a rough sketch of what this could look like is shown after this list).
113 | - dataset: find a bigger, more varied dataset with many different accents
114 | - a CNN is fine for this use case since I'm only detecting specific sounds. If I want to incorporate other filler words that are actually used in normal English, such as "like", I'll probably have to use recurrent networks or something similar so that the model can take that time dependency into account
115 | - actually have a working demonstration with real time detection. The model is theoretically fast enough, given the architecture, even on my CPU; I just didn't have the time to implement it
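As a starting point for the data augmentation item above — a sketch only, nothing in this repo yet: random time shifts plus background-noise mixing, similar in spirit to the augmentation options in TensorFlow's speech_commands training script. It assumes 1-D numpy waveforms at 16 kHz and a `background` clip longer than the segment being augmented.

```
import numpy as np

def augment_clip(waveform, background, sample_rate=16000,
                 max_shift_ms=100, max_noise_volume=0.1):
    """Randomly time-shift a clip and mix in a slice of background noise."""
    # shift the clip by up to +/- max_shift_ms, padding the gap with silence
    shift = np.random.randint(-max_shift_ms, max_shift_ms + 1) * sample_rate // 1000
    shifted = np.roll(waveform, shift)
    if shift > 0:
        shifted[:shift] = 0
    elif shift < 0:
        shifted[shift:] = 0
    # overlay a random slice of background noise at a random, low volume
    start = np.random.randint(0, len(background) - len(shifted))
    noise = background[start:start + len(shifted)]
    return shifted + np.random.uniform(0, max_noise_volume) * noise
```

Both transforms leave the label unchanged, so they could be applied on the fly while training batches are generated.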
116 | 
117 | ## Post Hackathon
118 | 
119 | The two most important things to fix:
120 | 1. The training pipeline was messed up somewhere and was unable to show validation loss properly
121 | 2. the dataset was small and only contained British English
122 | 
123 | ### Training pipeline
124 | Since I'm no longer constrained by the time limit of a hackathon, I may as well just rewrite the whole thing without building on top of TensorFlow's speech_commands example scripts.
125 | 
126 | I'll still train on TensorFlow's [speech command dataset](https://ai.googleblog.com/2017/08/launching-speech-commands-dataset.html) first, before training on "um"s. This time, I'll try to train the base model on the full dataset instead of just a single word, which means replacing the shape of the output layer when transferring (a rough sketch of that step is at the end of this README).
127 | 
128 | ### Dataset
129 | In addition to the [filler words dataset](#found-a-dataset) from before, new potential datasets:
130 | - CallHome English corpus of telephone speech (https://ca.talkbank.org/access/CallHome/eng.html)
131 | - Santa Barbara Corpus of Spoken American English (https://www.linguistics.ucsb.edu/research/santa-barbara-corpus)
132 | - The Buckeye Speech Corpus (https://buckeyecorpus.osu.edu/), if the above isn't enough
133 | - LDC Spoken Language Sampler (https://catalog.ldc.upenn.edu/LDC2017S16)
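A rough sketch of the transfer step mentioned in the Training pipeline section above. This assumes the rewritten pipeline ends up using tf.keras; the model file name, the choice of `layers[-2]` as the feature output, and the dataset variables are all hypothetical:

```
import tensorflow as tf

# Hypothetical file name: a Keras model already trained on the full Speech Commands dataset.
base = tf.keras.models.load_model("base_speech_commands.h5")

# Keep everything up to (but not including) the old classification layer...
backbone = tf.keras.Model(inputs=base.input, outputs=base.layers[-2].output)
backbone.trainable = False  # optionally freeze the convolutional layers at first

# ...and attach a new output layer sized for the um-detector labels
# (_silence_, _unknown_, um — see data_processing/labels.txt).
model = tf.keras.Sequential([
    backbone,
    tf.keras.layers.Dense(3, activation="softmax", name="um_softmax"),
])
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])
# model.fit(um_train_ds, validation_data=um_val_ds, epochs=10)
```

Whether (and for how long) to keep the backbone frozen is something to experiment with once validation is actually working.
--------------------------------------------------------------------------------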