├── README.md ├── demo.py ├── requirements.txt └── speech_data.py /README.md: -------------------------------------------------------------------------------- 1 | # tensorflow_speech_recognition_demo 2 | This is the code for 'How to Make a Simple Tensorflow Speech Recognizer' by @Sirajology on YouTube 3 | 4 | Overview 5 | ============ 6 | This is the full code for 'How to Make a Simple Tensorflow Speech Recognizer' by @Sirajology on [YouTube](https://youtu.be/u9FPqkuoEJ8). 7 | In this demo code we build an LSTM recurrent neural network using TFLearn, a high-level TensorFlow-based library, to train 8 | on a labeled dataset of spoken digits. Then we test it on spoken digits. 9 | 10 | Dependencies 11 | ============ 12 | * tflearn (http://tflearn.org/) 13 | * tensorflow (https://www.tensorflow.org/versions/r0.12/get_started/os_setup.html) 14 | * future 15 | 16 | Use [pip](https://pypi.python.org/pypi/pip) to install any missing dependencies (the full pinned list is in `requirements.txt`). 17 | 18 | Usage 19 | =========== 20 | 21 | Run the following command in a terminal. This will take a couple of hours to train fully. 22 | 23 | `python demo.py` 24 | 25 | 26 | Challenge 27 | =========== 28 | 29 | The weekly challenge is from the last video; it's still running! Check it out [here](https://www.youtube.com/watch?v=mGYU5t8MO7s) 30 | 31 | Credits 32 | =========== 33 | Credit for the vast majority of code here goes to [pannous](https://github.com/pannous). I've merely created a wrapper to get people started! 
"""Train a TFLearn LSTM on MFCC features of spoken digits (demo.py).

One batch is drawn from ``speech_data.mfcc_batch_generator`` and used as both
the training and the validation set (deliberate overfitting for the demo).
Training runs until interrupted with Ctrl-C; the model is then saved and the
predictions are printed next to the true labels.
"""
from __future__ import division, print_function, absolute_import
import tflearn
import speech_data
import tensorflow as tf

learning_rate = 0.0001
training_iters = 300000  # steps (not used below: training runs until interrupted)
batch_size = 64

width = 20  # mfcc features
height = 80  # (max) length of utterance
classes = 10  # digits

batch = word_batch = speech_data.mfcc_batch_generator(batch_size)
X, Y = next(batch)
trainX, trainY = X, Y
testX, testY = X, Y  # overfit for now

# Network building
net = tflearn.input_data([None, width, height])
net = tflearn.lstm(net, 128, dropout=0.8)
net = tflearn.fully_connected(net, classes, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=learning_rate,
                         loss='categorical_crossentropy')

# Training

# "Fix" for tensorflow version errors: register every trainable variable in
# the legacy VARIABLES collection as well.
col = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
for x in col:
    tf.add_to_collection(tf.GraphKeys.VARIABLES, x)

model = tflearn.DNN(net, tensorboard_verbose=0)
try:
    # The original loop was `while 1` with nothing after it reachable; keep
    # the "train until stopped" behaviour but let Ctrl-C fall through to the
    # save/print code below instead of losing the trained weights.
    while True:
        model.fit(trainX, trainY, n_epoch=10, validation_set=(testX, testY),
                  show_metric=True, batch_size=batch_size)
except KeyboardInterrupt:
    print("Training interrupted; saving the current model.")

_y = model.predict(X)
model.save("tflearn.lstm.model")
print(_y)
print(Y)  # was `print (y)`: `y` is never defined, the labels are bound to `Y`
networkx==1.11 13 | numpy==1.11.2 14 | Pillow==3.4.2 15 | protobuf==3.1.0 16 | pycodestyle==2.2.0 17 | pyflakes==1.3.0 18 | pyparsing==2.1.10 19 | python-dateutil==2.6.0 20 | pytz==2016.10 21 | resampy==0.1.4 22 | scikit-image==0.12.3 23 | scikit-learn==0.18.1 24 | scipy==0.18.1 25 | six==1.10.0 26 | tensorflow==0.12.0rc1 27 | tflearn==0.2.2 28 | toolz==0.8.1 29 | -------------------------------------------------------------------------------- /speech_data.py: -------------------------------------------------------------------------------- 1 | """Utilities for downloading and providing data from openslr.org, libriSpeech, Pannous, Gutenberg, WMT, tokenizing, vocabularies.""" 2 | # TODO! see https://github.com/pannous/caffe-speech-recognition for some data sources 3 | 4 | import os 5 | import re 6 | import sys 7 | import wave 8 | 9 | import numpy 10 | import numpy as np 11 | import skimage.io # scikit-image 12 | import librosa 13 | import matplotlib 14 | # try: 15 | # 16 | # except: 17 | # print("pip install librosa ; if you want mfcc_batch_generator") 18 | 19 | # import extensions as xx 20 | from random import shuffle 21 | from six.moves import urllib 22 | from six.moves import xrange # pylint: disable=redefined-builtin 23 | 24 | # TRAIN_INDEX='train_words_index.txt' 25 | # TEST_INDEX='test_words_index.txt' 26 | SOURCE_URL = 'http://pannous.net/files/' #spoken_numbers.tar' 27 | DATA_DIR = 'data/' 28 | pcm_path = "data/spoken_numbers_pcm/" # 8 bit 29 | wav_path = "data/spoken_numbers_wav/" # 16 bit s16le 30 | path = pcm_path 31 | CHUNK = 4096 32 | test_fraction=0.1 # 10% of data for test / verification 33 | 34 | # http://pannous.net/files/spoken_numbers_pcm.tar 35 | class Source: # labels 36 | DIGIT_WAVES = 'spoken_numbers_pcm.tar' 37 | DIGIT_SPECTROS = 'spoken_numbers_spectros_64x64.tar' # 64x64 baby data set, works astonishingly well 38 | NUMBER_WAVES = 'spoken_numbers_wav.tar' 39 | NUMBER_IMAGES = 'spoken_numbers.tar' # width=256 height=256 40 | WORD_SPECTROS = 
from enum import Enum


class Target(Enum):  # labels
    """Kinds of label a batch generator can be asked to produce."""
    digits = 1
    speaker = 2
    words_per_minute = 3
    word_phonemes = 4
    word = 5  # characters=5
    sentence = 6
    sentiment = 7
    first_letter = 8


def progresshook(blocknum, blocksize, totalsize):
    """Write download progress to stderr.

    Has the ``reporthook`` signature expected by ``urlretrieve``: number of
    blocks transferred so far, block size in bytes and total size in bytes
    (non-positive when the size is unknown).
    """
    readsofar = blocknum * blocksize
    if totalsize > 0:
        percent = readsofar * 1e2 / totalsize
        sys.stderr.write("\r%5.1f%% %*d / %d" % (
            percent, len(str(totalsize)), readsofar, totalsize))
        if readsofar >= totalsize:  # near the end
            sys.stderr.write("\n")
    else:  # total size is unknown
        sys.stderr.write("read %d\n" % (readsofar,))


def _extract_tar(archive_path, dest_dir):
    """Extract *archive_path* into *dest_dir*, refusing members that escape it."""
    import tarfile  # local import: not among the module-level imports

    root = os.path.realpath(dest_dir)
    with tarfile.open(archive_path) as archive:
        for member in archive.getmembers():
            target = os.path.realpath(os.path.join(root, member.name))
            if target != root and not target.startswith(root + os.sep):
                raise ValueError("Refusing to extract %r outside of %s"
                                 % (member.name, dest_dir))
        if hasattr(tarfile, "data_filter"):
            # Newer Pythons: also reject unsafe links/devices inside the archive.
            archive.extractall(dest_dir, filter="data")
        else:
            archive.extractall(dest_dir)


def maybe_download(file, work_directory):
    """Download the data from Pannous's website, unless it's already here.

    Args:
        file: archive name relative to ``SOURCE_URL``, or a full ``http(s)`` URL.
        work_directory: directory the archive is stored in and extracted to;
            created (including parents) when missing.

    Returns:
        The local archive path with a trailing ``.tar`` removed.

    Raises:
        ValueError: if the archive contains a member that would be written
            outside ``work_directory``.
        tarfile.TarError: if the local file is not a readable tar archive.
    """
    print("Looking for data %s in %s" % (file, work_directory))
    if not os.path.exists(work_directory):
        os.makedirs(work_directory)
    # Keep only the last path/URL component as the local file name.
    filepath = os.path.join(work_directory, file.rsplit("/", 1)[-1])
    if not os.path.exists(filepath):
        url_filename = file if file.startswith("http") else SOURCE_URL + file
        print('Downloading from %s to %s' % (url_filename, filepath))
        filepath, _ = urllib.request.urlretrieve(url_filename, filepath, progresshook)
        statinfo = os.stat(filepath)
        print('Successfully downloaded', file, statinfo.st_size, 'bytes.')
    if os.path.exists(filepath):
        print('Extracting %s to %s' % (filepath, work_directory))
        # tarfile instead of a shell-built `tar xf ... -C ...`: no quoting
        # problems with spaces, portable, and failures raise instead of
        # being silently ignored.
        _extract_tar(filepath, work_directory)
        print('Data ready!')
    # Strip only the suffix; a blanket replace() would also mangle ".tar"
    # occurring elsewhere in the path.
    if filepath.endswith(".tar"):
        return filepath[:-len(".tar")]
    return filepath


def spectro_batch(batch_size=10):
    """Shorthand for ``spectro_batch_generator(batch_size)``."""
    return spectro_batch_generator(batch_size)


def speaker(file):  # from the file name
    """Return the second ``_``-separated field of *file* (the speaker name).

    Raises:
        IndexError: if *file* contains no underscore.
    """
    return file.split("_")[1]


def get_speakers(path=None):
    """Return the sorted list of distinct speakers found in directory *path*.

    Args:
        path: directory to scan; ``None`` (default) means the module-level
            ``pcm_path``, resolved at call time.

    File names without an underscore, or whose second field still contains a
    dot (i.e. there is no further ``_`` before the extension), are ignored.
    The result is sorted so that one-hot indices derived from it are the same
    on every run; iteration order of a plain ``set`` is not.
    """
    if path is None:
        path = pcm_path

    def nobad(file):
        return "_" in file and "." not in file.split("_")[1]

    speakers = sorted(set(speaker(name) for name in os.listdir(path) if nobad(name)))
    print(len(speakers), " speakers: ", speakers)
    return speakers


def load_wav_file(name, frames_per_read=None):
    """Load the start of a wave file as a fixed-length list of floats in 0..1.

    Args:
        name: path of the wave file.
        frames_per_read: frames requested per read; the result always has
            ``2 * frames_per_read`` entries. ``None`` (default) uses the
            module-level ``CHUNK``.

    Returns:
        A list of length ``2 * frames_per_read``: every byte of the audio data
        is offset by 128 (wrapping modulo 256) and divided by 255; shorter
        files are padded with zeros, longer ones are cut.
    """
    import contextlib  # local import: not among the module-level imports

    if frames_per_read is None:
        frames_per_read = CHUNK
    target_len = frames_per_read * 2
    samples = []
    # closing() guarantees the handle is released (the original never closed it).
    with contextlib.closing(wave.open(name, "rb")) as wav_file:
        # Stop as soon as enough samples are collected; anything beyond
        # target_len would be thrown away below anyway.
        while len(samples) < target_len:
            raw = wav_file.readframes(frames_per_read)
            if not raw:
                break
            data = numpy.frombuffer(raw, dtype=numpy.uint8)
            # uint8 arithmetic wraps modulo 256, so the result stays in 0..255.
            data = (data + numpy.uint8(128)) / 255.  # 0-1 for better convergence
            samples.extend(data)
    samples = samples[:target_len]  # should be enough for now -> cut
    samples.extend(numpy.zeros(target_len - len(samples)))  # pad with 0's
    return samples
def spectro_batch_generator(batch_size=10, width=64, source_data=Source.DIGIT_SPECTROS, target=Target.digits):
    """Yield ``(batch, labels)`` pairs of flattened spectrogram images forever.

    Args:
        batch_size: number of images per yielded batch.
        width: image width; the height is taken to be equal, and every image
            is reshaped to a flat vector of ``width * width`` values.
        source_data: archive to fetch through ``maybe_download``.
        target: ``Target.digits`` (10 classes) or ``Target.first_letter``
            (32 classes).

    Yields:
        ``(batch, labels)``: ``batch`` is a list of flat pixel lists scaled by
        1/255, ``labels`` the matching one-hot vectors, derived from the first
        character of each file name.

    Raises:
        ValueError: for any other ``target`` (the class count would be
            undefined), or when the data directory holds no usable file.
    """
    if target == Target.digits:
        num_classes = 10
    elif target == Target.first_letter:
        num_classes = 32
    else:
        # Previously num_classes stayed unbound and the first label failed
        # with an UnboundLocalError.
        raise ValueError("spectro_batch_generator supports Target.digits and "
                         "Target.first_letter only, got %r" % (target,))
    path = maybe_download(source_data, DATA_DIR)
    path = path.replace("_spectros", "")  # HACK! remove!
    height = width
    files = os.listdir(path)
    # shuffle(files) # todo : split test_fraction batch here!
    print("Got %d source data files from %s" % (len(files), path))
    if not any("_" in image_name for image_name in files):
        # Without this guard the loop below would spin forever without yielding.
        raise ValueError("no usable image files found in %s" % path)
    batch = []
    labels = []
    while True:
        shuffle(files)
        for image_name in files:
            if "_" not in image_name:
                continue  # bad !?!
            image = skimage.io.imread(os.path.join(path, image_name)).astype(numpy.float32)
            data = image / 255.  # 0-1 for better convergence
            data = data.reshape([width * height])  # flattened for matmul
            batch.append(list(data))
            classe = (ord(image_name[0]) - 48) % 32  # -> 0=0 17 for A, 10 for z ;)
            labels.append(dense_to_one_hot(classe, num_classes))
            if len(batch) >= batch_size:
                yield batch, labels
                batch = []  # Reset for next batch
                labels = []


def mfcc_batch_generator(batch_size=10, source=Source.DIGIT_WAVES, target=Target.digits, max_frames=80):
    """Yield ``(features, labels)`` batches of MFCC matrices forever.

    Args:
        batch_size: number of utterances per yielded batch.
        source: archive to fetch through ``maybe_download``.
        target: ``Target.digits``, ``Target.speaker`` or ``Target.first_letter``.
        max_frames: number of time frames every MFCC matrix is padded or cut
            to (default 80, the previously hard-coded value).

    Yields:
        ``(batch_features, labels)``: a list of MFCC arrays whose second
        dimension is ``max_frames`` and the list of matching one-hot labels.

    Raises:
        ValueError: when the module-level ``path`` holds no ``.wav`` file.
        Exception: for a ``target`` without a label rule.
    """
    maybe_download(source, DATA_DIR)
    if target == Target.speaker:
        speakers = get_speakers()
    batch_features = []
    labels = []
    files = os.listdir(path)
    if not any(wav.endswith(".wav") for wav in files):
        # Without this guard the loop below would spin forever without yielding.
        raise ValueError("no .wav files found in %s" % path)
    while True:
        print("loaded batch of %d files" % len(files))
        shuffle(files)
        for wav in files:
            if not wav.endswith(".wav"):
                continue
            # Named `signal`, not `wave`, so the stdlib module is not shadowed.
            signal, sample_rate = librosa.load(path + wav, mono=True)
            if target == Target.speaker:
                label = one_hot_from_item(speaker(wav), speakers)
            elif target == Target.digits:
                label = dense_to_one_hot(int(wav[0]), 10)
            elif target == Target.first_letter:
                label = dense_to_one_hot((ord(wav[0]) - 48) % 32, 32)
            else:
                raise Exception("todo : labels for Target!")
            labels.append(label)
            mfcc = librosa.feature.mfcc(y=signal, sr=sample_rate)
            # Cut long utterances first: a negative pad width makes np.pad raise.
            mfcc = mfcc[:, :max_frames]
            mfcc = np.pad(mfcc, ((0, 0), (0, max_frames - mfcc.shape[1])),
                          mode='constant', constant_values=0)
            batch_features.append(np.array(mfcc))
            if len(batch_features) >= batch_size:
                yield batch_features, labels  # basic_rnn_seq2seq inputs must be a sequence
                batch_features = []  # Reset for next batch
                labels = []
# If you set dynamic_pad=True when calling tf.train.batch the returned batch
# will be automatically padded with 0s. Handy! A lower-level option is to use
# tf.PaddingFIFOQueue.
def wave_batch_generator(batch_size=10, source=Source.DIGIT_WAVES, target=Target.digits):  # speaker
    """Yield ``(waves, labels)`` batches of raw wave chunks forever.

    Args:
        batch_size: number of utterances per yielded batch.
        source: archive to fetch through ``maybe_download``.
        target: ``Target.digits``, ``Target.speaker`` or ``Target.first_letter``.

    Yields:
        ``(batch_waves, labels)``: the ``load_wav_file`` result for each file
        and the matching one-hot labels, always of equal length.

    Raises:
        ValueError: when the module-level ``path`` holds no ``.wav`` file.
        Exception: for a ``target`` without a label rule.
    """
    maybe_download(source, DATA_DIR)
    if target == Target.speaker:
        speakers = get_speakers()
    batch_waves = []
    labels = []
    files = os.listdir(path)
    if not any(wav.endswith(".wav") for wav in files):
        # Without this guard the loop below would spin forever without yielding.
        raise ValueError("no .wav files found in %s" % path)
    while True:
        shuffle(files)
        print("loaded batch of %d files" % len(files))
        for wav in files:
            if not wav.endswith(".wav"):
                continue
            if target == Target.digits:
                label = dense_to_one_hot(int(wav[0]))
            elif target == Target.speaker:
                label = one_hot_from_item(speaker(wav), speakers)
            elif target == Target.first_letter:
                label = dense_to_one_hot((ord(wav[0]) - 48) % 32, 32)
            else:
                raise Exception("todo : Target.word label!")
            # Append in one place: the first_letter branch used to compute
            # its label without appending it, leaving labels and waves out
            # of step.
            labels.append(label)
            batch_waves.append(load_wav_file(path + wav))
            if len(batch_waves) >= batch_size:
                yield batch_waves, labels
                batch_waves = []  # Reset for next batch
                labels = []


class DataSet(object):
    """Image names plus labels, served in shuffled mini-batches.

    Images are read lazily through ``load_image`` and cached per name.
    """

    def __init__(self, images, labels, fake_data=False, one_hot=False, load=False):
        """Construct a DataSet.

        Args:
            images: sequence of image file names (relative to ``DATA_DIR``).
            labels: sequence of labels, one per image.
            fake_data: when true the set reports 10000 examples and is meant
                to be used with ``next_batch(..., fake_data=True)``.
            one_hot: only consulted when producing fake labels.
            load: load all images immediately instead of on demand.
        """
        # Always set, so next_batch(fake_data=True) cannot hit a missing attribute.
        self.one_hot = one_hot
        if fake_data:
            self._num_examples = 10000
        else:
            num = len(images)
            # len() instead of .shape: the inputs may be plain lists.
            assert num == len(labels), (
                'len(images): %d len(labels): %d' % (num, len(labels)))
            print("len(images) %d" % num)
            self._num_examples = num
        self.cache = {}
        self._image_names = numpy.array(images)
        # Stored as an array so the permutation indexing in next_batch also
        # works when the caller passed a list.
        self._labels = numpy.array(labels)
        self._epochs_completed = 0
        self._index_in_epoch = 0
        self._images = []
        if load:  # Otherwise loaded on demand
            self._images = self.load(self._image_names)

    @property
    def images(self):
        return self._images

    @property
    def image_names(self):
        return self._image_names

    @property
    def labels(self):
        return self._labels

    @property
    def num_examples(self):
        return self._num_examples

    @property
    def epochs_completed(self):
        return self._epochs_completed

    # only apply to a subset of all images at one time
    def load(self, image_names):
        """Return the list of images for *image_names*, loading as needed."""
        print("loading %d images" % len(image_names))
        return [self.load_image(image_name) for image_name in image_names]

    def load_image(self, image_name):
        """Return the float32 image ``DATA_DIR + image_name``, cached by name."""
        if image_name in self.cache:
            return self.cache[image_name]
        image = skimage.io.imread(DATA_DIR + image_name).astype(numpy.float32)
        # images = numpy.multiply(images, 1.0 / 255.0)
        self.cache[image_name] = image
        return image

    def next_batch(self, batch_size, fake_data=False):
        """Return the next `batch_size` examples from this data set.

        Returns:
            ``(images, labels)`` for the next slice; when the current epoch
            is exhausted the data is reshuffled and the first slice of the
            new epoch is returned.
        """
        if fake_data:
            # NOTE(review): `width` and `height` are not defined anywhere in
            # the visible part of this module, so this path raises NameError
            # unless they are provided elsewhere - confirm the intended
            # image dimensions.
            fake_image = [1] * width * height
            if self.one_hot:
                fake_label = [1] + [0] * 9
            else:
                fake_label = 0
            return [fake_image] * batch_size, [fake_label] * batch_size
        start = self._index_in_epoch
        self._index_in_epoch += batch_size
        if self._index_in_epoch > self._num_examples:
            # Finished epoch
            self._epochs_completed += 1
            # Shuffle the data
            perm = numpy.arange(self._num_examples)
            numpy.random.shuffle(perm)
            self._image_names = self._image_names[perm]
            self._labels = self._labels[perm]
            # Start next epoch
            start = 0
            self._index_in_epoch = batch_size
            assert batch_size <= self._num_examples
        end = self._index_in_epoch
        return self.load(self._image_names[start:end]), self._labels[start:end]
# multi-label
def dense_to_some_hot(labels_dense, num_classes=140):
    """Convert class labels from int vectors to many-hot vectors!

    Raises:
        NotImplementedError: always; the conversion is not written yet.
    """
    # A bare string cannot be raised (TypeError on Python 3).
    raise NotImplementedError("TODO dense_to_some_hot")


def one_hot_to_item(hot, items):
    """Return the element of *items* at the position of the largest value in *hot*."""
    return items[np.argmax(hot)]


def one_hot_from_item(item, items):
    """Return a list of ``len(items)`` zeros with a 1 at the index of *item*.

    Raises:
        ValueError: if *item* is not in *items*.
    """
    # items=set(items) # assure uniqueness
    x = [0] * len(items)
    x[items.index(item)] = 1
    return x


# A TensorFlow-based dense_to_one_hot(batch, batch_size, num_labels) used to be
# defined here. It was shadowed by the definition below (same name, defined
# later) and relied on `tf`, which this module does not import, so it could
# never be called and has been dropped.
def dense_to_one_hot(labels_dense, num_classes=10):
    """Convert class labels from scalars to one-hot vectors.

    Args:
        labels_dense: an integer class index, or a sequence/array of them.
        num_classes: length of each one-hot vector.

    Returns:
        A float array of shape ``(num_classes,)`` for a scalar input, or
        ``(len(labels_dense), num_classes)`` for a sequence.
    """
    return numpy.eye(num_classes)[labels_dense]
def extract_labels(names_file, train, one_hot):
    """Read the label column of a tab-separated ``file<TAB>label`` index file.

    Args:
        names_file: path of the index file.
        train: unused; kept for interface compatibility.
        one_hot: when true, return one-hot vectors instead of the raw labels.

    Returns:
        A list of label strings (surrounding whitespace removed), or the
        result of ``dense_to_one_hot`` over their integer values.

    Raises:
        ValueError: if a non-blank line does not split into exactly two
            tab-separated fields, or - with ``one_hot`` - if a label is not
            an integer.
    """
    labels = []
    with open(names_file) as index:  # closed again even on errors
        for line in index:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            image_file, image_label = line.split("\t")
            # Drop the trailing newline that used to stay glued to the label.
            labels.append(image_label.strip())
    if one_hot:
        # dense_to_one_hot indexes an identity matrix, so it needs integers.
        # NOTE(review): assumes the index stores integer class ids - confirm.
        return dense_to_one_hot([int(label) for label in labels])
    return labels


def extract_images(names_file, train):
    """Read the file-name column of a tab-separated ``file<TAB>label`` index file.

    Args:
        names_file: path of the index file.
        train: unused; kept for interface compatibility.

    Returns:
        The list of image file names in file order.
    """
    image_files = []
    with open(names_file) as index:  # closed again even on errors
        for line in index:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            image_file, image_label = line.split("\t")
            image_files.append(image_file)
    return image_files


def read_data_sets(train_dir, source_data=Source.NUMBER_IMAGES, fake_data=False, one_hot=True):
    """Build the train and test ``DataSet`` objects for an image source.

    Args:
        train_dir: directory the archive is downloaded and extracted to.
        source_data: archive to fetch through ``maybe_download``.
        fake_data: return placeholder train/validation/test sets without
            touching the disk or the network.
        one_hot: whether labels are returned one-hot encoded.

    Returns:
        An object with ``train`` and ``test`` attributes (plus ``validation``
        in fake-data mode).
    """
    class DataSets(object):
        pass

    data_sets = DataSets()
    if fake_data:
        data_sets.train = DataSet([], [], fake_data=True, one_hot=one_hot)
        data_sets.validation = DataSet([], [], fake_data=True, one_hot=one_hot)
        data_sets.test = DataSet([], [], fake_data=True, one_hot=one_hot)
        return data_sets
    maybe_download(source_data, train_dir)
    # The index names live on Source; the bare TRAIN_INDEX / TEST_INDEX
    # globals are commented out at the top of the module, so using them
    # raised NameError.
    train_images = extract_images(Source.TRAIN_INDEX, train=True)
    train_labels = extract_labels(Source.TRAIN_INDEX, train=True, one_hot=one_hot)
    test_images = extract_images(Source.TEST_INDEX, train=False)
    test_labels = extract_labels(Source.TEST_INDEX, train=False, one_hot=one_hot)
    data_sets.train = DataSet(train_images, train_labels, load=False)
    data_sets.test = DataSet(test_images, test_labels, load=True)
    # data_sets.validation = DataSet(validation_images, validation_labels, load=True)
    return data_sets


if __name__ == "__main__":
    print("downloading speech datasets")
    # maybe_download requires the target directory as its second argument.
    maybe_download(Source.DIGIT_SPECTROS, DATA_DIR)
    maybe_download(Source.DIGIT_WAVES, DATA_DIR)
    maybe_download(Source.NUMBER_IMAGES, DATA_DIR)
    maybe_download(Source.NUMBER_WAVES, DATA_DIR)