├── README.md
├── demo.py
├── requirements.txt
└── speech_data.py


/README.md:
--------------------------------------------------------------------------------
 1 | # tensorflow_speech_recognition_demo
 2 | This is the code for 'How to Make a Simple Tensorflow Speech Recognizer' by @Sirajology on Youtube
 3 | 
 4 | Overview
 5 | ============
 6 | This is the full code for 'How to Make a Simple Tensorflow Speech Recognizer' by @Sirajology on [Youtube](https://youtu.be/u9FPqkuoEJ8).
 7 | In this demo code we build an LSTM recurrent neural network using TFLearn, a high-level TensorFlow-based library, and train it
 8 | on a labeled dataset of spoken digits. Then we test it on spoken digits.
 9 | 
10 | Dependencies
11 | ============
12 | * tflearn (http://tflearn.org/)
13 | * tensorflow  (https://www.tensorflow.org/versions/r0.12/get_started/os_setup.html)
14 | * future
15 | 
16 | Use [pip](https://pypi.python.org/pypi/pip) to install any missing dependencies
17 | 
18 | Usage
19 | ===========
20 | 
21 | Run the following command in a terminal. Training fully will take a couple of hours.
22 | 
23 | `python demo.py`
24 | 
25 | 
26 | Challenge
27 | ===========
28 | 
29 | The weekly challenge is from the last video, it's still running! Check it out [here](https://www.youtube.com/watch?v=mGYU5t8MO7s)
30 | 
31 | Credits
32 | ===========
33 | Credit for the vast majority of the code here goes to [pannous](https://github.com/pannous). I've merely created a wrapper to get people started!
34 | 


--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
 1 | from __future__ import division, print_function, absolute_import
 2 | import tflearn
 3 | import speech_data
 4 | import tensorflow as tf
 5 | 
 6 | learning_rate = 0.0001
 7 | training_iters = 300000  # steps
 8 | batch_size = 64
 9 | 
10 | width = 20  # mfcc features
11 | height = 80  # (max) length of utterance
12 | classes = 10  # digits
13 | 
14 | batch = word_batch = speech_data.mfcc_batch_generator(batch_size)
15 | X, Y = next(batch)
16 | trainX, trainY = X, Y
17 | testX, testY = X, Y #overfit for now
18 | 
19 | # Network building
20 | net = tflearn.input_data([None, width, height])
21 | net = tflearn.lstm(net, 128, dropout=0.8)
22 | net = tflearn.fully_connected(net, classes, activation='softmax')
23 | net = tflearn.regression(net, optimizer='adam', learning_rate=learning_rate, loss='categorical_crossentropy')
24 | # Training
25 | 
26 | ### add this "fix" for tensorflow version errors
27 | col = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
28 | for x in col:
29 |     tf.add_to_collection(tf.GraphKeys.VARIABLES, x ) 
30 | 
31 | 
32 | model = tflearn.DNN(net, tensorboard_verbose=0)
33 | while 1: #training_iters
34 |   model.fit(trainX, trainY, n_epoch=10, validation_set=(testX, testY), show_metric=True,
35 |           batch_size=batch_size)
36 |   _y=model.predict(X)
37 | model.save("tflearn.lstm.model")
38 | print (_y)
39 | print (y)
40 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | audioread==2.1.4
 2 | cycler==0.10.0
 3 | Cython==0.25.2
 4 | dask==0.12.0
 5 | decorator==4.0.10
 6 | flake8==3.2.1
 7 | h5py==2.6.0
 8 | joblib==0.10.3
 9 | -e git+https://github.com/librosa/librosa.git@01951000f20105ef31a4f80e897cd41c386b9314#egg=librosa
10 | matplotlib==1.5.3
11 | mccabe==0.5.2
12 | networkx==1.11
13 | numpy==1.11.2
14 | Pillow==3.4.2
15 | protobuf==3.1.0
16 | pycodestyle==2.2.0
17 | pyflakes==1.3.0
18 | pyparsing==2.1.10
19 | python-dateutil==2.6.0
20 | pytz==2016.10
21 | resampy==0.1.4
22 | scikit-image==0.12.3
23 | scikit-learn==0.18.1
24 | scipy==0.18.1
25 | six==1.10.0
26 | tensorflow==0.12.0rc1
27 | tflearn==0.2.2
28 | toolz==0.8.1
29 | 


--------------------------------------------------------------------------------
/speech_data.py:
--------------------------------------------------------------------------------
  1 | """Utilities for downloading and providing data from openslr.org, libriSpeech, Pannous, Gutenberg, WMT, tokenizing, vocabularies."""
  2 | # TODO! see https://github.com/pannous/caffe-speech-recognition for some data sources
  3 | 
  4 | import os
  5 | import re
  6 | import sys
  7 | import wave
  8 | 
  9 | import numpy
 10 | import numpy as np
 11 | import skimage.io  # scikit-image
 12 | import librosa
 13 | import matplotlib
 14 | # try:
 15 | #
 16 | # except:
 17 | #   print("pip install librosa ; if you want mfcc_batch_generator")
 18 | 
 19 | # import extensions as xx
 20 | from random import shuffle
 21 | from six.moves import urllib
 22 | from six.moves import xrange  # pylint: disable=redefined-builtin
 23 | 
 24 | # TRAIN_INDEX='train_words_index.txt'
 25 | # TEST_INDEX='test_words_index.txt'
# Base URL that maybe_download() prepends to archive names not starting with "http".
SOURCE_URL = 'http://pannous.net/files/' #spoken_numbers.tar'
# Directory the generators in this module pass to maybe_download().
DATA_DIR = 'data/'
pcm_path = "data/spoken_numbers_pcm/" # 8 bit
wav_path = "data/spoken_numbers_wav/" # 16 bit s16le
# Directory that mfcc_batch_generator() and wave_batch_generator() list and read from.
path = pcm_path
# Number of frames requested per readframes() call in load_wav_file(); that
# function also trims/pads its result to CHUNK * 2 values.
CHUNK = 4096
test_fraction=0.1 # 10% of data for test / verification
 34 | # http://pannous.net/files/spoken_numbers_pcm.tar
class Source:  # names of downloadable archives / index files
  """Archive names (or full URLs) accepted by maybe_download(), plus index file names.

  Values that do not start with "http" are resolved against SOURCE_URL by
  maybe_download().
  """
  DIGIT_WAVES = 'spoken_numbers_pcm.tar'
  DIGIT_SPECTROS = 'spoken_numbers_spectros_64x64.tar'  # 64x64  baby data set, works astonishingly well
  NUMBER_WAVES = 'spoken_numbers_wav.tar'
  NUMBER_IMAGES = 'spoken_numbers.tar'  # width=256 height=256
  WORD_SPECTROS = 'https://dl.dropboxusercontent.com/u/23615316/spoken_words.tar'  # width,height=512# todo: sliding window!
  TEST_INDEX = 'test_index.txt'
  TRAIN_INDEX = 'train_index.txt'
 43 | 
 44 | from enum import Enum
class Target(Enum):  # labels
  """Kinds of label a batch generator can be asked to produce.

  In the visible generators only `digits`, `speaker` and `first_letter` are
  handled; the remaining members hit the generators' "todo" error branch.
  """
  digits=1
  speaker=2
  words_per_minute=3
  word_phonemes=4
  word=5#characters=5
  sentence=6
  sentiment=7
  first_letter=8
 54 | 
 55 | 
 56 | 
 57 | def progresshook(blocknum, blocksize, totalsize):
 58 |     readsofar = blocknum * blocksize
 59 |     if totalsize > 0:
 60 |         percent = readsofar * 1e2 / totalsize
 61 |         s = "\r%5.1f%% %*d / %d" % (
 62 |             percent, len(str(totalsize)), readsofar, totalsize)
 63 |         sys.stderr.write(s)
 64 |         if readsofar >= totalsize: # near the end
 65 |             sys.stderr.write("\n")
 66 |     else: # total size is unknown
 67 |         sys.stderr.write("read %d\n" % (readsofar,))
 68 | 
 69 | def maybe_download(file, work_directory):
 70 |   """Download the data from Pannous's website, unless it's already here."""
 71 |   print("Looking for data %s in %s"%(file,work_directory))
 72 |   if not os.path.exists(work_directory):
 73 |     os.mkdir(work_directory)
 74 |   filepath = os.path.join(work_directory, re.sub('.*\/','',file))
 75 |   if not os.path.exists(filepath):
 76 |     if not file.startswith("http"): url_filename = SOURCE_URL + file
 77 |     else: url_filename=file
 78 |     print('Downloading from %s to %s' % (url_filename, filepath))
 79 |     filepath, _ = urllib.request.urlretrieve(url_filename, filepath,progresshook)
 80 |     statinfo = os.stat(filepath)
 81 |     print('Successfully downloaded', file, statinfo.st_size, 'bytes.')
 82 |     # os.system('ln -s '+work_directory)
 83 |   if os.path.exists(filepath):
 84 |     print('Extracting %s to %s' % ( filepath, work_directory))
 85 |     os.system('tar xf %s -C %s' % ( filepath, work_directory))
 86 |     print('Data ready!')
 87 |   return filepath.replace(".tar","")
 88 | 
 89 | def spectro_batch(batch_size=10):
 90 |   return spectro_batch_generator(batch_size)
 91 | 
 92 | def speaker(file):  # vom Dateinamen
 93 |   # if not "_" in file:
 94 |   #   return "Unknown"
 95 |   return file.split("_")[1]
 96 | 
 97 | def get_speakers(path=pcm_path):
 98 |   files = os.listdir(path)
 99 |   def nobad(file):
100 |     return "_" in file and not "." in file.split("_")[1]
101 |   speakers=list(set(map(speaker,filter(nobad,files))))
102 |   print(len(speakers)," speakers: ",speakers)
103 |   return speakers
104 | 
def load_wav_file(name):
  """Read a wave file into a fixed-length list of CHUNK * 2 scaled values.

  The frame bytes are interpreted as uint8, offset by 128 and divided by 255.
  The result is cut to CHUNK * 2 entries, or zero-padded up to that length.

  Args:
    name: path of the wave file to open.

  Returns:
    A list with exactly CHUNK * 2 numeric entries.
  """
  f = wave.open(name, "rb")
  try:  # try/finally (not `with`) so the handle is closed on Python 2 as well
    chunk = []
    data0 = f.readframes(CHUNK)
    while data0:
      # frombuffer replaces the deprecated binary mode of numpy.fromstring.
      data = numpy.frombuffer(data0, dtype='uint8')
      # NOTE(review): the addition happens on a uint8 array, so it presumably
      # wraps modulo 256 before the division - confirm that is intended.
      data = (data + 128) / 255.  # 0-1 for Better convergence
      chunk.extend(data)
      data0 = f.readframes(CHUNK)
  finally:
    f.close()
  # finally trim:
  chunk = chunk[0:CHUNK * 2]  # should be enough for now -> cut
  chunk.extend(numpy.zeros(CHUNK * 2 - len(chunk)))  # fill with padding 0's
  return chunk
123 | 
124 | 
def spectro_batch_generator(batch_size=10,width=64,source_data=Source.DIGIT_SPECTROS,target=Target.digits):
  """Yield endless (batch, labels) pairs built from spectrogram image files.

  Args:
    batch_size: number of images per yielded batch.
    width: image side length; every image is flattened to width * width values.
    source_data: archive handed to maybe_download().
    target: Target.digits (10 classes) or Target.first_letter (32 classes).

  Yields:
    (batch, labels): `batch` is a list of flattened images scaled by 1/255,
    `labels` the matching one-hot vectors from dense_to_one_hot().

  Raises:
    Exception: for any other `target`. Previously this surfaced later as a
    NameError on `num_classes`.
  """
  path=maybe_download(source_data, DATA_DIR)
  path=path.replace("_spectros","")# HACK! remove!
  height = width
  if target==Target.digits:
    num_classes=10
  elif target==Target.first_letter:
    num_classes=32
  else:
    raise Exception("todo : labels for Target!")
  get_speakers(path)  # result unused; kept for the speaker summary it prints
  files = os.listdir(path)
  print("Got %d source data files from %s"%(len(files),path))
  batch = []
  labels = []
  while True:
    shuffle(files)
    for image_name in files:
      if "_" not in image_name: continue # bad !?!
      image = skimage.io.imread(path + "/" + image_name).astype(numpy.float32)
      data = image / 255.  # 0-1 for Better convergence
      data = data.reshape([width * height])  # flatten the image
      batch.append(list(data))
      # The first character of the file name encodes the class.
      classe = (ord(image_name[0]) - 48) % 32# -> 0=0  17 for A, 10 for z ;)
      labels.append(dense_to_one_hot(classe,num_classes))
      if len(batch) >= batch_size:
        yield batch, labels
        batch = []  # Reset for next batch
        labels = []
157 | 
def mfcc_batch_generator(batch_size=10, source=Source.DIGIT_WAVES, target=Target.digits):
  """Yield endless (batch_features, labels) pairs of MFCC matrices.

  Every ".wav" file in the module-level `path` directory is loaded with
  librosa and turned into an MFCC matrix whose second axis is forced to 80
  columns, zero-padded or truncated.

  Args:
    batch_size: number of utterances per yielded batch.
    source: archive handed to maybe_download().
    target: Target.speaker, Target.digits or Target.first_letter.

  Yields:
    (batch_features, labels): a list of MFCC arrays and the matching list of
    one-hot labels.

  Raises:
    Exception: for a `target` without a labelling rule.
  """
  max_frames = 80  # fixed number of MFCC columns per utterance
  maybe_download(source, DATA_DIR)
  if target == Target.speaker: speakers = get_speakers()
  batch_features = []
  labels = []
  files = os.listdir(path)
  while True:
    print("loaded batch of %d files" % len(files))
    shuffle(files)
    for wav in files:
      if not wav.endswith(".wav"): continue
      # `samples` instead of `wave`, which shadowed the imported wave module.
      samples, sr = librosa.load(path+wav, mono=True)
      if target==Target.speaker: label=one_hot_from_item(speaker(wav), speakers)
      elif target==Target.digits:  label=dense_to_one_hot(int(wav[0]),10)
      elif target==Target.first_letter:  label=dense_to_one_hot((ord(wav[0]) - 48) % 32,32)
      else: raise Exception("todo : labels for Target!")
      labels.append(label)
      # Keyword arguments work on both old and new librosa signatures.
      mfcc = librosa.feature.mfcc(y=samples, sr=sr)
      # Truncate first: np.pad rejects the negative width that a long
      # utterance would otherwise produce.
      mfcc = mfcc[:, :max_frames]
      mfcc=np.pad(mfcc,((0,0),(0,max_frames-mfcc.shape[1])), mode='constant', constant_values=0)
      batch_features.append(np.array(mfcc))
      if len(batch_features) >= batch_size:
        yield batch_features, labels  # basic_rnn_seq2seq inputs must be a sequence
        batch_features = []  # Reset for next batch
        labels = []
185 | 
186 | 
187 | # If you set dynamic_pad=True when calling tf.train.batch the returned batch will be automatically padded with 0s. Handy! A lower-level option is to use tf.PaddingFIFOQueue.
188 | # only apply to a subset of all images at one time
def wave_batch_generator(batch_size=10,source=Source.DIGIT_WAVES,target=Target.digits): #speaker
  """Yield endless (batch_waves, labels) pairs of raw wave chunks.

  Every ".wav" file in the module-level `path` directory is read with
  load_wav_file().

  Args:
    batch_size: number of files per yielded batch.
    source: archive handed to maybe_download().
    target: Target.digits, Target.speaker or Target.first_letter.

  Yields:
    (batch_waves, labels): lists of equal length.

  Raises:
    Exception: for a `target` without a labelling rule.
  """
  maybe_download(source, DATA_DIR)
  if target == Target.speaker: speakers=get_speakers()
  batch_waves = []
  labels = []
  files = os.listdir(path)
  while True:
    shuffle(files)
    print("loaded batch of %d files" % len(files))
    for wav in files:
      if not wav.endswith(".wav"):continue
      if target==Target.digits: label=dense_to_one_hot(int(wav[0]))
      elif target==Target.speaker: label=one_hot_from_item(speaker(wav), speakers)
      elif target==Target.first_letter: label=dense_to_one_hot((ord(wav[0]) - 48) % 32,32)
      else: raise Exception("todo : Target.word label!")
      # Append in one place: the first_letter branch used to build its label
      # without appending it, leaving labels shorter than batch_waves.
      labels.append(label)
      chunk = load_wav_file(path+wav)
      batch_waves.append(chunk)
      if len(batch_waves) >= batch_size:
        yield batch_waves, labels
        batch_waves = []  # Reset for next batch
        labels = []
212 | 
class DataSet(object):
  """Image-name/label pairs served in shuffled batches, with lazy image loading.

  Images are read from DATA_DIR on demand and memoised per name in `cache`.
  """

  def __init__(self, images, labels, fake_data=False, one_hot=False, load=False,
               width=256, height=256):
    """Construct a DataSet.

    Args:
      images: sequence of image file names (relative to DATA_DIR).
      labels: sequence of labels, same length as `images`. It is stored as a
        numpy array so it can be permuted at the end of an epoch.
      fake_data: if true, pretend to hold 10000 examples.
      one_hot: selects the fake label format; only used by fake batches.
      load: if true, load all images eagerly instead of on demand.
      width, height: size of the fake images from next_batch(fake_data=True).
        The 256 default follows the "width=256 height=256" note on
        Source.NUMBER_IMAGES.
    """
    # Always set: next_batch(fake_data=True) reads it on any instance.
    self.one_hot = one_hot
    self.width = width
    self.height = height
    if fake_data:
      self._num_examples = 10000
    else:
      num = len(images)
      # The message uses len(): plain lists have no .shape attribute.
      assert num == len(labels), ('len(images): %s len(labels): %s' % (num, len(labels)))
      print("len(images) %d" % num)
      self._num_examples = num
    self.cache={}
    self._image_names = numpy.array(images)
    # As an array so `labels[perm]` works in next_batch for list input too.
    self._labels = numpy.asarray(labels)
    self._epochs_completed = 0
    self._index_in_epoch = 0
    self._images=[]
    if load: # Otherwise loaded on demand
      self._images=self.load(self._image_names)

  @property
  def images(self):
    """Eagerly loaded images; empty unless constructed with load=True."""
    return self._images

  @property
  def image_names(self):
    """Array of image file names in their current (possibly shuffled) order."""
    return self._image_names

  @property
  def labels(self):
    """Array of labels, aligned with image_names."""
    return self._labels

  @property
  def num_examples(self):
    """Number of examples (10000 in fake-data mode)."""
    return self._num_examples

  @property
  def epochs_completed(self):
    """How many times next_batch has wrapped around the data."""
    return self._epochs_completed

  # only apply to a subset of all images at one time
  def load(self,image_names):
    """Return the images for `image_names` as a list."""
    print("loading %d images"%len(image_names))
    return [self.load_image(image_name) for image_name in image_names]

  def load_image(self,image_name):
    """Return the float32 image for `image_name`, reading it at most once."""
    if image_name in self.cache:
      return self.cache[image_name]
    image = skimage.io.imread(DATA_DIR+ image_name).astype(numpy.float32)
    self.cache[image_name]=image
    return image

  def next_batch(self, batch_size, fake_data=False):
    """Return the next `batch_size` examples from this data set.

    With fake_data, returns `batch_size` all-ones images of
    width * height values and matching fake labels. Otherwise returns
    (images, labels) for the next slice, reshuffling names and labels
    together whenever the end of the data is passed.
    """
    if fake_data:
      # `width`/`height` used to be undefined module-level names here.
      fake_image = [1] * self.width * self.height
      if self.one_hot:
        fake_label = [1] + [0] * 9
      else:
        fake_label = 0
      return ([fake_image for _ in range(batch_size)],
              [fake_label for _ in range(batch_size)])
    start = self._index_in_epoch
    self._index_in_epoch += batch_size
    if self._index_in_epoch > self._num_examples:
      # Finished epoch
      self._epochs_completed += 1
      # Shuffle names and labels with the same permutation
      perm = numpy.arange(self._num_examples)
      numpy.random.shuffle(perm)
      self._image_names = self._image_names[perm]
      self._labels = self._labels[perm]
      # Start next epoch
      start = 0
      self._index_in_epoch = batch_size
      assert batch_size <= self._num_examples
    end = self._index_in_epoch
    return self.load(self._image_names[start:end]), self._labels[start:end]
296 | 
297 | 
298 | # multi-label
def dense_to_some_hot(labels_dense, num_classes=140):
  """Convert class labels from int vectors to many-hot vectors!

  Not implemented yet.

  Raises:
    NotImplementedError: always. The previous `raise "..."` of a plain string
    is not a valid exception and produced a confusing TypeError instead.
  """
  raise NotImplementedError("TODO dense_to_some_hot")
302 | 
303 | 
def one_hot_to_item(hot, items):
  """Return the entry of `items` at the position of the largest value in `hot`."""
  return items[np.argmax(hot)]
308 | 
def one_hot_from_item(item, items):
  """Return a list of 0/1 ints with a single 1 at the first position of `item` in `items`.

  Raises ValueError (from `items.index`) when `item` is not present.
  """
  size = len(items)
  position = items.index(item)
  return [1 if slot == position else 0 for slot in range(size)]
315 | 
def dense_to_one_hot(batch, batch_size, num_labels):
  """Build a [batch_size, num_labels] one-hot tensor from `batch` using TensorFlow ops.

  NOTE(review): a second `dense_to_one_hot` with a different signature is
  defined later in this module, so this definition is presumably replaced at
  import time and unreachable under this name - confirm before relying on it.
  NOTE(review): `tf` is not imported in the visible part of this module.
  """
  # One label index per row: reshape `batch` to [batch_size, 1].
  sparse_labels = tf.reshape(batch, [batch_size, 1])
  # Row numbers 0..batch_size-1, also as a [batch_size, 1] column.
  indices = tf.reshape(tf.range(0, batch_size, 1), [batch_size, 1])
  # Pair every row number with its label index.
  # NOTE(review): tf.concat is called with the axis first - presumably the
  # argument order of old TensorFlow releases; verify against the pinned version.
  concatenated = tf.concat(1, [indices, sparse_labels])
  concat = tf.concat(0, [[batch_size], [num_labels]])
  output_shape = tf.reshape(concat, [2])
  # 1.0 at every listed (row, label) position, 0.0 everywhere else.
  sparse_to_dense = tf.sparse_to_dense(concatenated, output_shape, 1.0, 0.0)
  return tf.reshape(sparse_to_dense, [batch_size, num_labels])
324 | 
def dense_to_one_hot(labels_dense, num_classes=10):
  """Map a class index (or a sequence of indices) to the matching one-hot row(s).

  Rows are taken from a num_classes x num_classes identity matrix.
  """
  identity = numpy.eye(num_classes)
  return identity[labels_dense]
328 | 
def extract_labels(names_file,train, one_hot):
  """Read the label column of a tab-separated "<image file>\\t<label>" index file.

  Args:
    names_file: path of the index file.
    train: unused; kept for interface compatibility.
    one_hot: if true, return one-hot rows via dense_to_one_hot().

  Returns:
    A list of label strings without the trailing newline, or the one-hot
    encoding of those labels when `one_hot` is true.
  """
  labels=[]
  with open(names_file) as index_file:  # closed even if a line is malformed
    for line in index_file:
      line = line.rstrip("\r\n")  # the label used to keep its trailing newline
      if not line:
        continue  # tolerate blank lines, e.g. at the end of the file
      image_file,image_label = line.split("\t")
      labels.append(image_label)
  if one_hot:
    # One-hot indexing needs integers, not the raw strings.
    # NOTE(review): assumes labels are integer class ids below 10 - confirm.
    return dense_to_one_hot([int(label) for label in labels])
  return labels
337 | 
def extract_images(names_file,train):
  """Read the file-name column of a tab-separated "<image file>\\t<label>" index file.

  Args:
    names_file: path of the index file.
    train: unused; kept for interface compatibility.

  Returns:
    The image file names as a list, in file order.
  """
  image_files=[]
  with open(names_file) as index_file:  # closed even if a line is malformed
    for line in index_file:
      line = line.rstrip("\r\n")
      if not line:
        continue  # tolerate blank lines, e.g. at the end of the file
      image_file,image_label = line.split("\t")
      image_files.append(image_file)
  return image_files
344 | 
345 | 
def read_data_sets(train_dir,source_data=Source.NUMBER_IMAGES, fake_data=False, one_hot=True):
  """Build train/test DataSet objects, downloading the archive if needed.

  Args:
    train_dir: directory handed to maybe_download().
    source_data: archive to download.
    fake_data: if true, return train/validation/test DataSets in fake mode
      without touching the disk or network.
    one_hot: label format, forwarded to extract_labels() / DataSet.

  Returns:
    An object with `train` and `test` attributes (plus `validation` in
    fake-data mode only). Test images are loaded eagerly, training images
    on demand.
  """
  class DataSets(object):
    pass
  data_sets = DataSets()
  if fake_data:
    data_sets.train = DataSet([], [], fake_data=True, one_hot=one_hot)
    data_sets.validation = DataSet([], [], fake_data=True, one_hot=one_hot)
    data_sets.test = DataSet([], [], fake_data=True, one_hot=one_hot)
    return data_sets
  maybe_download(source_data, train_dir)
  # TRAIN_INDEX / TEST_INDEX exist only as Source attributes; the bare names
  # used before raised NameError.
  # NOTE(review): as before, the index files are opened relative to the
  # current working directory - confirm where they actually live.
  train_images = extract_images(Source.TRAIN_INDEX,train=True)
  train_labels = extract_labels(Source.TRAIN_INDEX,train=True, one_hot=one_hot)
  test_images = extract_images(Source.TEST_INDEX,train=False)
  test_labels = extract_labels(Source.TEST_INDEX,train=False, one_hot=one_hot)
  data_sets.train = DataSet(train_images, train_labels , load=False)
  data_sets.test = DataSet(test_images, test_labels, load=True)
  # No validation split is built on this path.
  return data_sets
369 | 
if __name__ == "__main__":
  # Fetch and unpack every archive when the module is run as a script.
  print("downloading speech datasets")
  # maybe_download() requires a target directory; the one-argument calls used
  # here before raised TypeError.
  maybe_download( Source.DIGIT_SPECTROS, DATA_DIR)
  maybe_download( Source.DIGIT_WAVES, DATA_DIR)
  maybe_download( Source.NUMBER_IMAGES, DATA_DIR)
  maybe_download( Source.NUMBER_WAVES, DATA_DIR)
376 | 


--------------------------------------------------------------------------------