├── .gitignore
├── README.md
├── mnist
    ├── mnist_advanced.py
    ├── mnist_softmax.py
    └── mnist_visualized.py
├── requirements.txt
├── rnn
    ├── lstm
    │   └── words_prediction
    │   │   ├── data.tgz
    │   │   ├── ptb_word_lm.py
    │   │   └── reader.py
    ├── text8.zip
    └── word2vec_basic.py
└── speech
    ├── generate_speech_data.py
    ├── mll_data.py
    ├── mll_irrelevant_words.txt
    ├── mll_lstm.py
    ├── mll_relevant_words.txt
    ├── record_to_wav.py
    ├── speech_data.py
    └── tflearn_simple_number_classifier.py


/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | speech/data
3 | /rnn/lstm/words_prediction/data/
4 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # tensorflow-playground
2 | A simple playground project for the TensorFlow Python library.
3 | 


--------------------------------------------------------------------------------
/mnist/mnist_advanced.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | 
 16 | """Advanced MNIST classifier.
 17 | See extensive documentation at
 18 | https://www.tensorflow.org/tutorials/mnist/pros/
 19 | """
 20 | from __future__ import absolute_import
 21 | from __future__ import division
 22 | from __future__ import print_function
 23 | 
 24 | import argparse
 25 | import sys
 26 | from datetime import datetime
 27 | 
 28 | # Import data
 29 | from tensorflow.examples.tutorials.mnist import input_data
 30 | 
 31 | import tensorflow as tf
 32 | 
 33 | FLAGS = None
 34 | 
 35 | 
 36 | def weight_variable(shape):
 37 |     initial = tf.truncated_normal(shape, stddev=0.1)
 38 |     return tf.Variable(initial)
 39 | 
 40 | 
 41 | def bias_variable(shape):
 42 |     initial = tf.constant(0.1, shape=shape)
 43 |     return tf.Variable(initial)
 44 | 
 45 | 
 46 | def conv2d(x, W):
 47 |     return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
 48 | 
 49 | 
 50 | def max_pool_2x2(x):
 51 |     return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
 52 | 
 53 | 
 54 | def main(_):
 55 |     mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
 56 | 
 57 |     # Create the model
 58 | 
 59 |     # Data placeholders
 60 |     x = tf.placeholder(tf.float32, shape=[None, 784])
 61 |     y_ = tf.placeholder(tf.float32, shape=[None, 10])
 62 | 
 63 |     # First Convolutional Layer
 64 |     W_conv1 = weight_variable([5, 5, 1, 32])
 65 |     b_conv1 = bias_variable([32])
 66 |     x_image = tf.reshape(x, [-1, 28, 28, 1])
 67 |     h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
 68 |     h_pool1 = max_pool_2x2(h_conv1)
 69 | 
 70 |     # Second Convolutional Layer
 71 |     W_conv2 = weight_variable([5, 5, 32, 64])
 72 |     b_conv2 = bias_variable([64])
 73 |     h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
 74 |     h_pool2 = max_pool_2x2(h_conv2)
 75 | 
 76 |     # Densely Connected Layer
 77 |     W_fc1 = weight_variable([7 * 7 * 64, 1024])
 78 |     b_fc1 = bias_variable([1024])
 79 |     h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
 80 |     h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
 81 | 
 82 |     # Dropout
 83 |     keep_prob = tf.placeholder(tf.float32)
 84 |     h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
 85 | 
 86 |     # Readout Layer
 87 |     W_fc2 = weight_variable([1024, 10])
 88 |     b_fc2 = bias_variable([10])
 89 |     y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
 90 | 
 91 |     # Define loss and optimizer
 92 |     cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y_conv, y_))
 93 |     train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
 94 |     correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
 95 |     accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
 96 |     sess = tf.InteractiveSession()
 97 |     sess.run(tf.global_variables_initializer())
 98 | 
 99 |     # Train
100 |     for i in range(5000):
101 |         batch_xs, batch_ys = mnist.train.next_batch(200)
102 |         if i % 100 == 0:
103 |             train_accuracy = accuracy.eval(feed_dict={
104 |                 x: batch_xs, y_: batch_ys, keep_prob: 1.0})
105 |             print("[%s] step %d, trained by %d examples, estimated prediction accuracy: %g%%" %
106 |                   (datetime.now().strftime("%H:%M"), i, i * 200, train_accuracy * 100))
107 |         train_step.run(feed_dict={x: batch_xs, y_: batch_ys, keep_prob: 0.5})
108 | 
109 |     # Test trained model
110 |     final_accuracy = accuracy.eval(feed_dict={
111 |         x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0})
112 |     print("Trained model final accuracy on test data is %g%%" % (final_accuracy * 100))
113 | 
114 | 
if __name__ == '__main__':
    # Parse the flags this script knows about and hand everything else
    # over to tf.app.run untouched.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '--data_dir',
        type=str,
        default='/tmp/tensorflow/mnist/input_data',
        help='Directory for storing input data')
    FLAGS, unparsed = arg_parser.parse_known_args()
    remaining_argv = [sys.argv[0]] + unparsed
    tf.app.run(main=main, argv=remaining_argv)
121 | 


--------------------------------------------------------------------------------
/mnist/mnist_softmax.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | 
16 | """A very simple MNIST classifier.
17 | See extensive documentation at
18 | http://tensorflow.org/tutorials/mnist/beginners/index.md
19 | """
20 | from __future__ import absolute_import
21 | from __future__ import division
22 | from __future__ import print_function
23 | 
24 | import argparse
25 | import sys
26 | 
27 | # Import data
28 | from tensorflow.examples.tutorials.mnist import input_data
29 | 
30 | import tensorflow as tf
31 | 
32 | FLAGS = None
33 | 
34 | 
def main(_):
    """Train a single-layer softmax MNIST classifier and print test accuracy.

    The model is a linear map from the 784 input pixels to 10 class logits,
    trained with gradient descent (learning rate 0.5) for 1000 steps on
    batches of 100 images read from ``FLAGS.data_dir``.

    Args:
        _: Unused; receives the argv list forwarded by ``tf.app.run``.
    """
    mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)

    # Create the model
    x = tf.placeholder(tf.float32, [None, 784])
    W = tf.Variable(tf.zeros([784, 10]))
    b = tf.Variable(tf.zeros([10]))
    y = tf.matmul(x, W) + b

    # Define loss and optimizer
    y_ = tf.placeholder(tf.float32, [None, 10])

    # The raw formulation of cross-entropy,
    #
    #   tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.nn.softmax(y)),
    #                                 reduction_indices=[1]))
    #
    # can be numerically unstable.
    #
    # So here we use tf.nn.softmax_cross_entropy_with_logits on the raw
    # outputs of 'y', and then average across the batch. Keyword arguments
    # make the logits/labels roles explicit and are required by newer
    # TensorFlow releases, which reject positional use.
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_))
    train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)

    sess = tf.InteractiveSession()
    try:
        # Train
        tf.global_variables_initializer().run()
        for _step in range(1000):
            batch_xs, batch_ys = mnist.train.next_batch(100)
            sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})

        # Test trained model
        correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        print(sess.run(accuracy, feed_dict={x: mnist.test.images,
                                            y_: mnist.test.labels}))
    finally:
        # Release the session's resources even if training fails part-way.
        sess.close()
71 | 
72 | 
if __name__ == '__main__':
    # Parse the flags this script knows about and hand everything else
    # over to tf.app.run untouched.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '--data_dir',
        type=str,
        default='/tmp/tensorflow/mnist/input_data',
        help='Directory for storing input data')
    FLAGS, unparsed = arg_parser.parse_known_args()
    remaining_argv = [sys.argv[0]] + unparsed
    tf.app.run(main=main, argv=remaining_argv)
79 | 


--------------------------------------------------------------------------------
/mnist/mnist_visualized.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the 'License');
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an 'AS IS' BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | """A simple MNIST classifier which displays summaries in TensorBoard.
 16 | 
 17 |  This is an unimpressive MNIST model, but it is a good example of using
 18 | tf.name_scope to make a graph legible in the TensorBoard graph explorer, and of
 19 | naming summary tags so that they are grouped meaningfully in TensorBoard.
 20 | 
 21 | It demonstrates the functionality of every TensorBoard dashboard.
 22 | """
 23 | from __future__ import absolute_import
 24 | from __future__ import division
 25 | from __future__ import print_function
 26 | 
 27 | import argparse
 28 | import sys
 29 | 
 30 | import tensorflow as tf
 31 | 
 32 | from tensorflow.examples.tutorials.mnist import input_data
 33 | 
 34 | FLAGS = None
 35 | 
 36 | 
 37 | def train():
 38 |     # Import data
 39 |     mnist = input_data.read_data_sets(FLAGS.data_dir,
 40 |                                       one_hot=True,
 41 |                                       fake_data=FLAGS.fake_data)
 42 | 
 43 |     sess = tf.InteractiveSession()
 44 |     # Create a multilayer model.
 45 | 
 46 |     # Input placeholders
 47 |     with tf.name_scope('input'):
 48 |         x = tf.placeholder(tf.float32, [None, 784], name='x-input')
 49 |         y_ = tf.placeholder(tf.float32, [None, 10], name='y-input')
 50 | 
 51 |     with tf.name_scope('input_reshape'):
 52 |         image_shaped_input = tf.reshape(x, [-1, 28, 28, 1])
 53 |         tf.summary.image('input', image_shaped_input, 10)
 54 | 
 55 |     # We can't initialize these variables to 0 - the network will get stuck.
 56 |     def weight_variable(shape):
 57 |         """Create a weight variable with appropriate initialization."""
 58 |         initial = tf.truncated_normal(shape, stddev=0.1)
 59 |         return tf.Variable(initial)
 60 | 
 61 |     def bias_variable(shape):
 62 |         """Create a bias variable with appropriate initialization."""
 63 |         initial = tf.constant(0.1, shape=shape)
 64 |         return tf.Variable(initial)
 65 | 
 66 |     def variable_summaries(var):
 67 |         """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
 68 |         with tf.name_scope('summaries'):
 69 |             mean = tf.reduce_mean(var)
 70 |             tf.summary.scalar('mean', mean)
 71 |             with tf.name_scope('stddev'):
 72 |                 stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
 73 |             tf.summary.scalar('stddev', stddev)
 74 |             tf.summary.scalar('max', tf.reduce_max(var))
 75 |             tf.summary.scalar('min', tf.reduce_min(var))
 76 |             tf.summary.histogram('histogram', var)
 77 | 
 78 |     def nn_layer(input_tensor, input_dim, output_dim, layer_name, act=tf.nn.relu):
 79 |         """Reusable code for making a simple neural net layer.
 80 | 
 81 |         It does a matrix multiply, bias add, and then uses relu to nonlinearize.
 82 |         It also sets up name scoping so that the resultant graph is easy to read,
 83 |         and adds a number of summary ops.
 84 |         """
 85 |         # Adding a name scope ensures logical grouping of the layers in the graph.
 86 |         with tf.name_scope(layer_name):
 87 |             # This Variable will hold the state of the weights for the layer
 88 |             with tf.name_scope('weights'):
 89 |                 weights = weight_variable([input_dim, output_dim])
 90 |                 variable_summaries(weights)
 91 |             with tf.name_scope('biases'):
 92 |                 biases = bias_variable([output_dim])
 93 |                 variable_summaries(biases)
 94 |             with tf.name_scope('Wx_plus_b'):
 95 |                 preactivate = tf.matmul(input_tensor, weights) + biases
 96 |                 tf.summary.histogram('pre_activations', preactivate)
 97 |             activations = act(preactivate, name='activation')
 98 |             tf.summary.histogram('activations', activations)
 99 |             return activations
100 | 
101 |     hidden1 = nn_layer(x, 784, 500, 'layer1')
102 | 
103 |     with tf.name_scope('dropout'):
104 |         keep_prob = tf.placeholder(tf.float32)
105 |         tf.summary.scalar('dropout_keep_probability', keep_prob)
106 |         dropped = tf.nn.dropout(hidden1, keep_prob)
107 | 
108 |     # Do not apply softmax activation yet, see below.
109 |     y = nn_layer(dropped, 500, 10, 'layer2', act=tf.identity)
110 | 
111 |     with tf.name_scope('cross_entropy'):
112 |         # The raw formulation of cross-entropy,
113 |         #
114 |         # tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.softmax(y)),
115 |         #                               reduction_indices=[1]))
116 |         #
117 |         # can be numerically unstable.
118 |         #
119 |         # So here we use tf.nn.softmax_cross_entropy_with_logits on the
120 |         # raw outputs of the nn_layer above, and then average across
121 |         # the batch.
122 |         diff = tf.nn.softmax_cross_entropy_with_logits(y, y_)
123 |         with tf.name_scope('total'):
124 |             cross_entropy = tf.reduce_mean(diff)
125 |     tf.summary.scalar('cross_entropy', cross_entropy)
126 | 
127 |     with tf.name_scope('train'):
128 |         train_step = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize(
129 |             cross_entropy)
130 | 
131 |     with tf.name_scope('accuracy'):
132 |         with tf.name_scope('correct_prediction'):
133 |             correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
134 |         with tf.name_scope('accuracy'):
135 |             accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
136 |     tf.summary.scalar('accuracy', accuracy)
137 | 
138 |     # Merge all the summaries and write them out to /tmp/mnist_logs (by default)
139 |     merged = tf.summary.merge_all()
140 |     train_writer = tf.summary.FileWriter(FLAGS.log_dir + '/train', sess.graph)
141 |     test_writer = tf.summary.FileWriter(FLAGS.log_dir + '/test')
142 |     tf.global_variables_initializer().run()
143 | 
144 |     # Train the model, and also write summaries.
145 |     # Every 10th step, measure test-set accuracy, and write test summaries
146 |     # All other steps, run train_step on training data, & add training summaries
147 | 
148 |     def feed_dict(train):
149 |         """Make a TensorFlow feed_dict: maps data onto Tensor placeholders."""
150 |         if train or FLAGS.fake_data:
151 |             xs, ys = mnist.train.next_batch(100, fake_data=FLAGS.fake_data)
152 |             k = FLAGS.dropout
153 |         else:
154 |             xs, ys = mnist.test.images, mnist.test.labels
155 |             k = 1.0
156 |         return {x: xs, y_: ys, keep_prob: k}
157 | 
158 |     for i in range(FLAGS.max_steps):
159 |         if i % 10 == 0:  # Record summaries and test-set accuracy
160 |             summary, acc = sess.run([merged, accuracy], feed_dict=feed_dict(False))
161 |             test_writer.add_summary(summary, i)
162 |             print('Accuracy at step %s: %s' % (i, acc))
163 |         else:  # Record train set summaries, and train
164 |             if i % 100 == 99:  # Record execution stats
165 |                 run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
166 |                 run_metadata = tf.RunMetadata()
167 |                 summary, _ = sess.run([merged, train_step],
168 |                                       feed_dict=feed_dict(True),
169 |                                       options=run_options,
170 |                                       run_metadata=run_metadata)
171 |                 train_writer.add_run_metadata(run_metadata, 'step%03d' % i)
172 |                 train_writer.add_summary(summary, i)
173 |                 print('Adding run metadata for', i)
174 |             else:  # Record a summary
175 |                 summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True))
176 |                 train_writer.add_summary(summary, i)
177 |     train_writer.close()
178 |     test_writer.close()
179 | 
180 | 
def main(_):
    """Recreate ``FLAGS.log_dir`` from scratch, then run ``train()``.

    Any existing log directory is deleted recursively first.
    """
    log_dir = FLAGS.log_dir
    already_there = tf.gfile.Exists(log_dir)
    if already_there:
        tf.gfile.DeleteRecursively(log_dir)
    tf.gfile.MakeDirs(log_dir)
    train()
186 | 
187 | 
def parse_bool_flag(value):
    """Convert a command-line token to a bool.

    ``type=bool`` treats every non-empty string as True, so
    ``--fake_data False`` would enable fake data. This parser recognises the
    usual spellings instead. The empty string maps to False, as ``bool('')``
    did.

    Args:
        value: The raw argument string (or an actual bool).

    Returns:
        The parsed boolean.

    Raises:
        argparse.ArgumentTypeError: If the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('Expected a boolean value, got %r' % (value,))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # A bare `--fake_data` still means True via const=True.
    parser.add_argument('--fake_data', nargs='?', const=True, type=parse_bool_flag,
                        default=False,
                        help='If true, uses fake data for unit testing.')
    parser.add_argument('--max_steps', type=int, default=1000,
                        help='Number of steps to run trainer.')
    parser.add_argument('--learning_rate', type=float, default=0.001,
                        help='Initial learning rate')
    parser.add_argument('--dropout', type=float, default=0.9,
                        help='Keep probability for training dropout.')
    parser.add_argument('--data_dir', type=str, default='/tmp/tensorflow/mnist/input_data',
                        help='Directory for storing input data')
    parser.add_argument('--log_dir', type=str, default='/tmp/tensorflow/mnist/logs/mnist_with_summaries',
                        help='Summaries log directory')
    # Flags not recognised here are forwarded to tf.app.run.
    FLAGS, unparsed = parser.parse_known_args()
    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
205 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | #required
 2 | tensorflow
 3 | tflearn
 4 | numpy
 5 | 
 6 | #optional
 7 | scikit-image
 8 | pyaudio
 9 | wave
10 | python_speech_features
11 | matplotlib
12 | librosa
13 | scikits.talkbox
14 | 


--------------------------------------------------------------------------------
/rnn/lstm/words_prediction/data.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dangartman/tensorflow-playground/64ec42a7c5c3cc9da61e80eccac94e4efc3aac01/rnn/lstm/words_prediction/data.tgz


--------------------------------------------------------------------------------
/rnn/lstm/words_prediction/ptb_word_lm.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | 
 16 | """Example / benchmark for building a PTB LSTM model.
 17 | 
 18 | Trains the model described in:
  19 | (Zaremba et al.) Recurrent Neural Network Regularization
 20 | http://arxiv.org/abs/1409.2329
 21 | 
 22 | There are 3 supported model configurations:
 23 | ===========================================
 24 | | config | epochs | train | valid  | test
 25 | ===========================================
 26 | | small  | 13     | 37.99 | 121.39 | 115.91
 27 | | medium | 39     | 48.45 |  86.16 |  82.07
 28 | | large  | 55     | 37.87 |  82.62 |  78.29
 29 | The exact results may vary depending on the random initialization.
 30 | 
 31 | The hyperparameters used in the model:
 32 | - init_scale - the initial scale of the weights
 33 | - learning_rate - the initial value of the learning rate
 34 | - max_grad_norm - the maximum permissible norm of the gradient
 35 | - num_layers - the number of LSTM layers
 36 | - num_steps - the number of unrolled steps of LSTM
 37 | - hidden_size - the number of LSTM units
 38 | - max_epoch - the number of epochs trained with the initial learning rate
 39 | - max_max_epoch - the total number of epochs for training
 40 | - keep_prob - the probability of keeping weights in the dropout layer
 41 | - lr_decay - the decay of the learning rate for each epoch after "max_epoch"
 42 | - batch_size - the batch size
 43 | 
 44 | The data required for this example is in the data/ dir of the
 45 | PTB dataset from Tomas Mikolov's webpage:
 46 | 
 47 | $ wget http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
 48 | $ tar xvf simple-examples.tgz
 49 | 
 50 | To run:
 51 | 
 52 | $ python ptb_word_lm.py --data_path=simple-examples/data/
 53 | 
 54 | """
 55 | from __future__ import absolute_import
 56 | from __future__ import division
 57 | from __future__ import print_function
 58 | 
 59 | import time
 60 | 
 61 | import numpy as np
 62 | import tensorflow as tf
 63 | 
 64 | import reader
 65 | 
 66 | flags = tf.flags
 67 | logging = tf.logging
 68 | 
 69 | flags.DEFINE_string(
 70 |     "model", "small",
 71 |     "A type of model. Possible options are: small, medium, large.")
 72 | flags.DEFINE_string("data_path", None,
 73 |                     "Where the training/test data is stored.")
 74 | flags.DEFINE_string("save_path", None,
 75 |                     "Model output directory.")
 76 | flags.DEFINE_bool("use_fp16", False,
 77 |                   "Train using 16-bit floats instead of 32bit floats")
 78 | 
 79 | FLAGS = flags.FLAGS
 80 | 
 81 | 
 82 | def data_type():
 83 |     return tf.float16 if FLAGS.use_fp16 else tf.float32
 84 | 
 85 | 
 86 | class SmallConfig(object):
 87 |     """Small config."""
 88 |     init_scale = 0.1
 89 |     learning_rate = 1.0
 90 |     max_grad_norm = 5
 91 |     num_layers = 2
 92 |     num_steps = 20
 93 |     hidden_size = 200
 94 |     max_epoch = 4
 95 |     max_max_epoch = 13
 96 |     keep_prob = 1.0
 97 |     lr_decay = 0.5
 98 |     batch_size = 20
 99 |     vocab_size = 10000
100 | 
101 | 
class MediumConfig(object):
    """Hyperparameters for the "medium" model."""
    # Network shape
    num_layers = 2
    hidden_size = 650
    vocab_size = 10000
    num_steps = 35
    batch_size = 20

    # Initialization and regularization
    init_scale = 0.05
    keep_prob = 0.5
    max_grad_norm = 5

    # Learning-rate schedule
    learning_rate = 1.0
    lr_decay = 0.8
    max_epoch = 6
    max_max_epoch = 39
116 | 
117 | 
class LargeConfig(object):
    """Hyperparameters for the "large" model."""
    # Network shape
    num_layers = 2
    hidden_size = 1500
    vocab_size = 10000
    num_steps = 35
    batch_size = 20

    # Initialization and regularization
    init_scale = 0.04
    keep_prob = 0.35
    max_grad_norm = 10

    # Learning-rate schedule
    learning_rate = 1.0
    lr_decay = 1 / 1.15
    max_epoch = 14
    max_max_epoch = 55
132 | 
133 | 
class TestConfig(object):
    """Hyperparameters for a tiny model, intended for testing."""
    # Network shape
    num_layers = 1
    hidden_size = 2
    vocab_size = 10000
    num_steps = 2
    batch_size = 20

    # Initialization and regularization
    init_scale = 0.1
    keep_prob = 1.0
    max_grad_norm = 1

    # Learning-rate schedule
    learning_rate = 1.0
    lr_decay = 0.5
    max_epoch = 1
    max_max_epoch = 1
148 | 
149 | 
def get_config():
    """Return a fresh config instance for the model named by ``FLAGS.model``.

    Returns:
        A new ``SmallConfig``, ``MediumConfig``, ``LargeConfig`` or
        ``TestConfig`` instance for "small", "medium", "large" or "test".

    Raises:
        ValueError: If ``FLAGS.model`` is not one of the names above.
    """
    configs = {
        "small": SmallConfig,
        "medium": MediumConfig,
        "large": LargeConfig,
        "test": TestConfig,
    }
    config_cls = configs.get(FLAGS.model)
    if config_cls is None:
        # Format the message here; passing FLAGS.model as a second argument
        # to ValueError would leave the "%s" placeholder unfilled.
        raise ValueError("Invalid model: %s" % FLAGS.model)
    return config_cls()
161 | 
162 | 
class PTBInput(object):
    """Input pipeline for one data split.

    Stores the batch geometry taken from ``config``, the number of batches
    per epoch, and the ``(input_data, targets)`` pair returned by
    ``reader.ptb_producer``.
    """

    def __init__(self, config, data, name=None):
        self.batch_size = config.batch_size
        self.num_steps = config.num_steps

        # Items per batch row, less one, divided into num_steps-long windows.
        items_per_row = len(data) // self.batch_size
        self.epoch_size = (items_per_row - 1) // self.num_steps

        produced = reader.ptb_producer(
            data, self.batch_size, self.num_steps, name=name)
        self.input_data, self.targets = produced
172 | 
173 | 
class PTBModel(object):
    """The PTB model.

    Builds a multi-layer LSTM language model graph over the batches exposed
    by ``input_``. When ``is_training`` is true, the constructor also builds
    a gradient-clipped SGD training op and a learning-rate update op.

    Note: ``_lr``, ``_train_op``, ``_new_lr`` and ``_lr_update`` are only
    assigned when ``is_training`` is true, so ``lr``, ``train_op`` and
    ``assign_lr`` raise ``AttributeError`` on an evaluation-only model.
    """

    def assign_lr(self, session, lr_value):
        """Set the learning-rate variable to ``lr_value`` by running the update op."""
        session.run(self._lr_update, feed_dict={self._new_lr: lr_value})

    @property
    def input(self):
        """The input object passed to the constructor as ``input_``."""
        return self._input

    @property
    def initial_state(self):
        """Zero state of the multi-layer cell for this model's batch size."""
        return self._initial_state

    @property
    def cost(self):
        """Sum of the per-example losses divided by the batch size."""
        return self._cost

    @property
    def loss(self):
        """Per-example loss tensor from ``sequence_loss_by_example``."""
        return self._loss

    @property
    def final_state(self):
        """Cell state after the last unrolled time step."""
        return self._final_state

    @property
    def lr(self):
        """Learning-rate variable (training models only)."""
        return self._lr

    @property
    def train_op(self):
        """Op that applies the clipped gradients (training models only)."""
        return self._train_op

    def __init__(self, is_training, config, input_):
        """Build the model graph.

        Args:
            is_training: If true, apply dropout (when ``config.keep_prob < 1``)
                and build the optimizer and learning-rate ops.
            config: Object providing ``hidden_size``, ``vocab_size``,
                ``keep_prob``, ``num_layers`` and ``max_grad_norm``.
            input_: Object providing ``batch_size``, ``num_steps``,
                ``input_data`` and ``targets`` (e.g. a ``PTBInput``).
        """
        self._input = input_

        batch_size = input_.batch_size
        num_steps = input_.num_steps
        size = config.hidden_size
        vocab_size = config.vocab_size

        # Slightly better results can be obtained with forget gate biases
        # initialized to 1 but the hyperparameters of the model would need to be
        # different than reported in the paper.
        lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(size, forget_bias=0.0, state_is_tuple=True)
        if is_training and config.keep_prob < 1:
            lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
                lstm_cell, output_keep_prob=config.keep_prob)
        # NOTE(review): the same cell object is repeated for every layer;
        # confirm this is accepted by the TensorFlow version in use.
        cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * config.num_layers, state_is_tuple=True)

        self._initial_state = cell.zero_state(batch_size, data_type())

        # The embedding table and lookup are pinned to the CPU.
        with tf.device("/cpu:0"):
            embedding = tf.get_variable(
                "embedding", [vocab_size, size], dtype=data_type())
            inputs = tf.nn.embedding_lookup(embedding, input_.input_data)

        # Dropout on the embedded inputs, training only.
        if is_training and config.keep_prob < 1:
            inputs = tf.nn.dropout(inputs, config.keep_prob)

        # Simplified version of tensorflow.models.rnn.rnn.py's rnn().
        # This builds an unrolled LSTM for tutorial purposes only.
        # In general, use the rnn() or state_saving_rnn() from rnn.py.
        #
        # The alternative version of the code below is:
        #
        # inputs = tf.unstack(inputs, num=num_steps, axis=1)
        # outputs, state = tf.nn.rnn(cell, inputs, initial_state=self._initial_state)
        outputs = []
        state = self._initial_state
        with tf.variable_scope("RNN"):
            for time_step in range(num_steps):
                # Share the cell's variables across all time steps after the first.
                if time_step > 0: tf.get_variable_scope().reuse_variables()
                # presumably inputs is [batch_size, num_steps, size] — TODO confirm
                # against reader.ptb_producer
                (cell_output, state) = cell(inputs[:, time_step, :], state)
                outputs.append(cell_output)

        # Concatenate the per-step outputs along axis 1, then flatten to rows
        # of width `size`. NOTE(review): tf.concat(axis, values) is the older
        # argument order — confirm against the installed TensorFlow version.
        output = tf.reshape(tf.concat(1, outputs), [-1, size])
        print("output shape: ", output.get_shape())
        softmax_w = tf.get_variable(
            "softmax_w", [size, vocab_size], dtype=data_type())
        softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type())
        logits = tf.matmul(output, softmax_w) + softmax_b
        print("logits shape: ", logits.get_shape())
        # Every position gets weight 1 in the loss.
        loss = tf.nn.seq2seq.sequence_loss_by_example(
            [logits],
            [tf.reshape(input_.targets, [-1])],
            [tf.ones([batch_size * num_steps], dtype=data_type())])
        self._loss = loss
        self._cost = cost = tf.reduce_sum(loss) / batch_size
        self._final_state = state

        # Evaluation-only models stop here; no optimizer ops are built.
        if not is_training:
            return

        # The learning rate starts at 0.0 and is set later through assign_lr().
        self._lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        # Clip gradients by their global norm before applying them.
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                          config.max_grad_norm)
        optimizer = tf.train.GradientDescentOptimizer(self._lr)
        self._train_op = optimizer.apply_gradients(
            zip(grads, tvars),
            global_step=tf.contrib.framework.get_or_create_global_step())

        # Placeholder/assign pair used by assign_lr() to change the learning rate.
        self._new_lr = tf.placeholder(
            tf.float32, shape=[], name="new_learning_rate")
        self._lr_update = tf.assign(self._lr, self._new_lr)
281 | 
282 | 
def run_epoch(session, model, eval_op=None, verbose=False):
    """Run the model for one epoch and return the resulting perplexity.

    The final state of each step is fed back as the initial state of the
    next step.

    Args:
        session: Object whose ``run(fetches, feed_dict)`` evaluates the model.
        model: Model exposing ``initial_state``, ``cost``, ``final_state`` and
            ``input`` (with ``epoch_size``, ``num_steps``, ``batch_size``).
        eval_op: Optional extra op to run at every step (e.g. a training op).
        verbose: If true, print progress, perplexity and speed about ten
            times per epoch.

    Returns:
        ``exp(total cost / total unrolled steps)``, i.e. the perplexity.
    """
    start_time = time.time()
    costs = 0.0
    iters = 0
    state = session.run(model.initial_state)

    fetches = {
        "cost": model.cost,
        "final_state": model.final_state,
    }
    if eval_op is not None:
        fetches["eval_op"] = eval_op

    epoch_size = model.input.epoch_size
    # Guard against `step % 0` when the epoch has fewer than 10 steps, and
    # keep the report offset reachable for short epochs. For
    # epoch_size >= 110 this is the same schedule as `step % (n // 10) == 10`.
    log_interval = max(epoch_size // 10, 1)
    log_offset = min(10, log_interval - 1)

    for step in range(epoch_size):
        # Carry the previous step's final state into this step.
        feed_dict = {}
        for i, (c, h) in enumerate(model.initial_state):
            feed_dict[c] = state[i].c
            feed_dict[h] = state[i].h

        vals = session.run(fetches, feed_dict)
        cost = vals["cost"]
        state = vals["final_state"]

        costs += cost
        iters += model.input.num_steps

        if verbose and step % log_interval == log_offset:
            # Avoid dividing by a zero interval on coarse clocks.
            elapsed = max(time.time() - start_time, 1e-9)
            print("%.3f perplexity: %.3f speed: %.0f wps" %
                  (step * 1.0 / epoch_size, np.exp(costs / iters),
                   iters * model.input.batch_size / elapsed))

    return np.exp(costs / iters)
316 | 
317 | 
def main(_):
    """Train the PTB language model, then report validation and test perplexity.

    Reads the data directory from FLAGS.data_path and, when FLAGS.save_path is
    set, saves the trained model there at the end.

    Raises:
      ValueError: if --data_path is not set.
    """
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to PTB data directory")

    raw_data = reader.ptb_raw_data(FLAGS.data_path)
    train_data, valid_data, test_data, _ = raw_data

    config = get_config()
    # The test model consumes the data one word at a time.
    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1

    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)

        # The three models share the "Model" variable scope: the training
        # model creates the variables (reuse=None), the others reuse them.
        with tf.name_scope("Train"):
            train_input = PTBInput(config=config, data=train_data, name="TrainInput")
            with tf.variable_scope("Model", reuse=None, initializer=initializer):
                m = PTBModel(is_training=True, config=config, input_=train_input)
            tf.scalar_summary("Training Loss", m.cost)
            tf.scalar_summary("Learning Rate", m.lr)

        with tf.name_scope("Valid"):
            valid_input = PTBInput(config=config, data=valid_data, name="ValidInput")
            with tf.variable_scope("Model", reuse=True, initializer=initializer):
                mvalid = PTBModel(is_training=False, config=config, input_=valid_input)
            tf.scalar_summary("Validation Loss", mvalid.cost)

        with tf.name_scope("Test"):
            test_input = PTBInput(config=eval_config, data=test_data, name="TestInput")
            with tf.variable_scope("Model", reuse=True, initializer=initializer):
                mtest = PTBModel(is_training=False, config=eval_config,
                                 input_=test_input)

        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        with sv.managed_session() as session:
            for i in range(config.max_max_epoch):
                # The learning rate stays constant for the first max_epoch
                # epochs, then is multiplied by lr_decay once per epoch.
                lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
                m.assign_lr(session, config.learning_rate * lr_decay)

                print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
                train_perplexity = run_epoch(session, m, eval_op=m.train_op,
                                             verbose=True)
                print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))
                valid_perplexity = run_epoch(session, mvalid)
                print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity))

            # The test set is evaluated once, after all training epochs.
            test_perplexity = run_epoch(session, mtest)
            print("Test Perplexity: %.3f" % test_perplexity)

            if FLAGS.save_path:
                print("Saving model to %s." % FLAGS.save_path)
                sv.saver.save(session, FLAGS.save_path, global_step=sv.global_step)
372 | 
373 | 
# Script entry point: hand control to tf.app.run() when executed directly.
if __name__ == "__main__":
    tf.app.run()
376 | 


--------------------------------------------------------------------------------
/rnn/lstm/words_prediction/reader.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | 
 16 | 
 17 | """Utilities for parsing PTB text files."""
 18 | from __future__ import absolute_import
 19 | from __future__ import division
 20 | from __future__ import print_function
 21 | 
 22 | import collections
 23 | import os
 24 | 
 25 | import tensorflow as tf
 26 | 
 27 | 
 28 | def _read_words(filename):
 29 |     with tf.gfile.GFile(filename, "r") as f:
 30 |         return f.read().decode("utf-8").replace("\n", "<eos>").split()
 31 | 
 32 | 
 33 | def _build_vocab(filename):
 34 |     data = _read_words(filename)
 35 | 
 36 |     counter = collections.Counter(data)
 37 |     count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
 38 | 
 39 |     words, _ = list(zip(*count_pairs))
 40 |     word_to_id = dict(zip(words, range(len(words))))
 41 | 
 42 |     return word_to_id
 43 | 
 44 | 
 45 | def _file_to_word_ids(filename, word_to_id):
 46 |     data = _read_words(filename)
 47 |     return [word_to_id[word] for word in data if word in word_to_id]
 48 | 
 49 | 
 50 | def ptb_raw_data(data_path=None):
 51 |     """Load PTB raw data from data directory "data_path".
 52 | 
 53 |     Reads PTB text files, converts strings to integer ids,
 54 |     and performs mini-batching of the inputs.
 55 | 
 56 |     The PTB dataset comes from Tomas Mikolov's webpage:
 57 | 
 58 |     http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
 59 | 
 60 |     Args:
 61 |       data_path: string path to the directory where simple-examples.tgz has
 62 |         been extracted.
 63 | 
 64 |     Returns:
 65 |       tuple (train_data, valid_data, test_data, vocabulary)
 66 |       where each of the data objects can be passed to PTBIterator.
 67 |     """
 68 | 
 69 |     train_path = os.path.join(data_path, "ptb.train.txt")
 70 |     valid_path = os.path.join(data_path, "ptb.valid.txt")
 71 |     test_path = os.path.join(data_path, "ptb.test.txt")
 72 | 
 73 |     word_to_id = _build_vocab(train_path)
 74 |     train_data = _file_to_word_ids(train_path, word_to_id)
 75 |     valid_data = _file_to_word_ids(valid_path, word_to_id)
 76 |     test_data = _file_to_word_ids(test_path, word_to_id)
 77 |     vocabulary = len(word_to_id)
 78 |     return train_data, valid_data, test_data, vocabulary
 79 | 
 80 | 
def ptb_producer(raw_data, batch_size, num_steps, name=None):
    """Iterate on the raw PTB data.

    This chunks up raw_data into batches of examples and returns Tensors that
    are drawn from these batches.

    Args:
      raw_data: one of the raw data outputs from ptb_raw_data.
      batch_size: int, the batch size.
      num_steps: int, the number of unrolls.
      name: the name of this operation (optional).

    Returns:
      A pair of Tensors, each shaped [batch_size, num_steps]. The second element
      of the tuple is the same data time-shifted to the right by one.

    Raises:
      tf.errors.InvalidArgumentError: if batch_size or num_steps are too high.
    """
    with tf.name_scope(name, "PTBProducer", [raw_data, batch_size, num_steps]):
        raw_data = tf.convert_to_tensor(raw_data, name="raw_data", dtype=tf.int32)

        # Drop the tail that does not fill a whole row, then lay the data out
        # as batch_size rows of batch_len consecutive ids.
        data_len = tf.size(raw_data)
        batch_len = data_len // batch_size
        data = tf.reshape(raw_data[0: batch_size * batch_len],
                          [batch_size, batch_len])

        # One column is reserved because the targets are shifted by one.
        epoch_size = (batch_len - 1) // num_steps
        assertion = tf.assert_positive(
            epoch_size,
            message="epoch_size == 0, decrease batch_size or num_steps")
        with tf.control_dependencies([assertion]):
            epoch_size = tf.identity(epoch_size, name="epoch_size")

        # i is the index of the window to slice, produced in order (no shuffle).
        i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue()
        x = tf.slice(data, [0, i * num_steps], [batch_size, num_steps])
        y = tf.slice(data, [0, i * num_steps + 1], [batch_size, num_steps])
        return x, y
119 | 


--------------------------------------------------------------------------------
/rnn/text8.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dangartman/tensorflow-playground/64ec42a7c5c3cc9da61e80eccac94e4efc3aac01/rnn/text8.zip


--------------------------------------------------------------------------------
/rnn/word2vec_basic.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | 
 16 | from __future__ import absolute_import
 17 | from __future__ import division
 18 | from __future__ import print_function
 19 | 
 20 | import collections
 21 | import math
 22 | import os
 23 | import random
 24 | import zipfile
 25 | 
 26 | import numpy as np
 27 | from six.moves import urllib
 28 | from six.moves import xrange  # pylint: disable=redefined-builtin
 29 | import tensorflow as tf
 30 | 
# Step 1: Download the data.
# Base URL that maybe_download() prepends to the requested file name.
url = 'http://mattmahoney.net/dc/'
 33 | 
 34 | 
 35 | def maybe_download(filename, expected_bytes):
 36 |     """Download a file if not present, and make sure it's the right size."""
 37 |     if not os.path.exists(filename):
 38 |         filename, _ = urllib.request.urlretrieve(url + filename, filename)
 39 |     statinfo = os.stat(filename)
 40 |     if statinfo.st_size == expected_bytes:
 41 |         print('Found and verified', filename)
 42 |     else:
 43 |         print(statinfo.st_size)
 44 |         raise Exception(
 45 |             'Failed to verify ' + filename + '. Can you get to it with a browser?')
 46 |     return filename
 47 | 
 48 | 
# Download text8.zip if it is missing and check that it is 31344016 bytes long.
filename = maybe_download('text8.zip', 31344016)
 50 | 
 51 | 
 52 | # Read the data into a list of strings.
 53 | def read_data(filename):
 54 |     """Extract the first file enclosed in a zip file as a list of words"""
 55 |     with zipfile.ZipFile(filename) as f:
 56 |         data = tf.compat.as_str(f.read(f.namelist()[0])).split()
 57 |     return data
 58 | 
 59 | 
# The whole corpus as one flat list of word strings.
words = read_data(filename)
print('Data size', len(words))

# Step 2: Build the dictionary and replace rare words with UNK token.
# Vocabulary size, counting the UNK entry itself.
vocabulary_size = 50000
 65 | 
 66 | 
 67 | def build_dataset(words):
 68 |     count = [['UNK', -1]]
 69 |     count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
 70 |     dictionary = dict()
 71 |     for word, _ in count:
 72 |         dictionary[word] = len(dictionary)
 73 |     data = list()
 74 |     unk_count = 0
 75 |     for word in words:
 76 |         if word in dictionary:
 77 |             index = dictionary[word]
 78 |         else:
 79 |             index = 0  # dictionary['UNK']
 80 |             unk_count += 1
 81 |         data.append(index)
 82 |     count[0][1] = unk_count
 83 |     reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
 84 |     return data, count, dictionary, reverse_dictionary
 85 | 
 86 | 
data, count, dictionary, reverse_dictionary = build_dataset(words)
del words  # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

# Read cursor into `data`; generate_batch() advances it on every call.
data_index = 0
 93 | 
 94 | 
 95 | # Step 3: Function to generate a training batch for the skip-gram model.
 96 | def generate_batch(batch_size, num_skips, skip_window):
 97 |     global data_index
 98 |     assert batch_size % num_skips == 0
 99 |     assert num_skips <= 2 * skip_window
100 |     batch = np.ndarray(shape=(batch_size), dtype=np.int32)
101 |     labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
102 |     span = 2 * skip_window + 1  # [ skip_window target skip_window ]
103 |     buffer = collections.deque(maxlen=span)
104 |     for _ in range(span):
105 |         buffer.append(data[data_index])
106 |         data_index = (data_index + 1) % len(data)
107 |     for i in range(batch_size // num_skips):
108 |         target = skip_window  # target label at the center of the buffer
109 |         targets_to_avoid = [skip_window]
110 |         for j in range(num_skips):
111 |             while target in targets_to_avoid:
112 |                 target = random.randint(0, span - 1)
113 |             targets_to_avoid.append(target)
114 |             batch[i * num_skips + j] = buffer[skip_window]
115 |             labels[i * num_skips + j, 0] = buffer[target]
116 |         buffer.append(data[data_index])
117 |         data_index = (data_index + 1) % len(data)
118 |     return batch, labels
119 | 
120 | 
# Print a small sample batch as "center id word -> context id word" lines.
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]],
          '->', labels[i, 0], reverse_dictionary[labels[i, 0]])
125 | 
126 | # Step 4: Build and train a skip-gram model.
127 | 
batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1  # How many words to consider left and right.
num_skips = 2  # How many times to reuse an input to generate a label.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16  # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
# valid_size distinct ids drawn from range(valid_window).
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64  # Number of negative examples to sample.

graph = tf.Graph()

with graph.as_default():
    # Input data.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Ops and variables pinned to the CPU because of missing GPU implementation
    with tf.device('/cpu:0'):
        # Look up embeddings for inputs.
        embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)

        # Construct the variables for the NCE loss
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Compute the average NCE loss for the batch.
    # tf.nce_loss automatically draws a new sample of the negative labels each
    # time we evaluate the loss.
    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                       biases=nce_biases,
                       labels=train_labels,
                       inputs=embed,
                       num_sampled=num_sampled,
                       num_classes=vocabulary_size))

    # Construct the SGD optimizer using a learning rate of 1.0.
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

    # Compute the cosine similarity between the validation words and all
    # embeddings: rows are scaled to unit length, so the matrix product of
    # the normalized rows is the cosine similarity.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(
        normalized_embeddings, valid_dataset)
    similarity = tf.matmul(
        valid_embeddings, normalized_embeddings, transpose_b=True)

    # Add variable initializer.
    init = tf.global_variables_initializer()
186 | 
187 | # Step 5: Begin training.
num_steps = 100001

with tf.Session(graph=graph) as session:
    # We must initialize all variables before we use them.
    init.run()
    print("Initialized")

    average_loss = 0
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch(
            batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        # We perform one update step by evaluating the optimizer op (including
        # it in the list of returned values for session.run()).
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            # At step 0 the accumulator holds a single loss, so no division.
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print("Average loss at step ", step, ": ", average_loss)
            average_loss = 0

        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in xrange(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbors
                # Sort by decreasing similarity and skip position 0, which is
                # expected to be the validation word itself.
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = "Nearest to %s:" % valid_word
                for k in xrange(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = "%s %s," % (log_str, close_word)
                print(log_str)
    # Snapshot of the unit-length embeddings, used for plotting below.
    final_embeddings = normalized_embeddings.eval()
226 | 
227 | 
228 | # Step 6: Visualize the embeddings.
229 | 
230 | 
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    """Scatter-plot 2-D embeddings, annotate each point and save the figure.

    Args:
      low_dim_embs: 2-D array with one (x, y) row per label.
      labels: sequence of annotation texts, at most one per embedding row.
      filename: path of the image file to write.
    """
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))  # in inches
    annotation_style = dict(xytext=(5, 2),
                            textcoords='offset points',
                            ha='right',
                            va='bottom')
    for row, text in enumerate(labels):
        x, y = low_dim_embs[row, :]
        plt.scatter(x, y)
        plt.annotate(text, xy=(x, y), **annotation_style)

    plt.savefig(filename)
245 | 
246 | 
# Project the first `plot_only` embeddings to 2-D with t-SNE and plot them.
# The plotting dependencies are optional: if they are missing, only a hint
# is printed.
try:
    from sklearn.manifold import TSNE
    import matplotlib.pyplot as plt

    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
    plot_only = 500
    low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
    labels = [reverse_dictionary[i] for i in xrange(plot_only)]
    plot_with_labels(low_dim_embs, labels)

except ImportError:
    print("Please install sklearn, matplotlib, and scipy to visualize embeddings.")
259 | 


--------------------------------------------------------------------------------
/speech/generate_speech_data.py:
--------------------------------------------------------------------------------
  1 | from __future__ import absolute_import
  2 | from __future__ import division
  3 | from __future__ import print_function
  4 | 
  5 | import os
  6 | import os.path
  7 | import numpy as np
  8 | import subprocess
  9 | import random
 10 | 
# Root directory of all generated data; the paths below are relative to the
# current working directory.
DATA_DIR = 'data/'

NUMBERS_PATH = DATA_DIR + "spoken_numbers"
WORDS_PATH = DATA_DIR + "spoken_words_wav"
SENTENCES_PATH = DATA_DIR + "spoken_sentences_wav"
SENTENCES_MLL_PATH = DATA_DIR + "spoken_sentences_mll_wav"

# Voices used for generation, keyed by the id passed to `espeak -v`.
# 'name' becomes part of the generated file names; 'rate' is the value
# around which generate() draws a random speaking rate.
good_voices = {
    'english-mb-en1': {'name': 'En1', 'rate': 100},
    'us-mbrola-1': {'name': 'Us1', 'rate': 120},
    'us-mbrola-2': {'name': 'Us2', 'rate': 120},
    'us-mbrola-3': {'name': 'Us3', 'rate': 120},
    'en-german': {'name': 'German', 'rate': 110},
    'en-german-5': {'name': 'German1', 'rate': 100},
    'en-romanian': {'name': 'Romanian', 'rate': 120},
    'en-dutch': {'name': 'Dutch', 'rate': 120},
    'en-french': {'name': 'French', 'rate': 110},
    'en-hungarian': {'name': 'Hungarian', 'rate': 100},
    'en-swedish': {'name': 'Swedish', 'rate': 110},
    'en-swedish-f': {'name': 'Swedish1', 'rate': 110}
}

# Same layout as good_voices; not referenced by the functions in this file.
bad_voices = {
    'english-us': {'name': 'Us', 'rate': 120},
    'en-greek': {'name': 'Greek', 'rate': 150},
    'english': {'name': 'En', 'rate': 120},
    'english-north': {'name': 'En2', 'rate': 130},
    'english_rp': {'name': 'En3', 'rate': 110},
    'english_wmids': {'name': 'En4', 'rate': 120},
    'en-scottish': {'name': 'Scottish', 'rate': 130},
    'en-westindies': {'name': 'Westindies', 'rate': 140},

    'en-afrikaans': {'name': 'Afrikaans', 'rate': 100},
    'en-polish': {'name': 'Polish', 'rate': 110}
}

# Threshold compared against random.randint(1, 100) in generate() to mark a
# line as "validation".
validation_percent = 10
# Voices reserved for validation lines when labels are generated.
validation_voices = ['us-mbrola-2', 'en-german-5']
# Number of cepstral coefficients requested from mfcc() (its nceps argument).
n_features = 26
 50 | 
 51 | 
 52 | def check_voices():
 53 |     voice_infos = str(subprocess.check_output(["espeak", "--voices=en"])).split("\n")[1:-1]
 54 |     voices = map(lambda x: x.split()[3], voice_infos)
 55 |     for voice in good_voices.keys():
 56 |         if voice in voices:
 57 |             print(voice + " FOUND!")
 58 |     for voice in good_voices.keys():
 59 |         if not voice in voices:
 60 |             print(voice + " MISSING!")
 61 |             del good_voices[voice]
 62 | 
 63 | 
 64 | def generate_mfcc(voice_name, voice_id, line, line_num, rate, path):
 65 |     from librosa import load
 66 |     from scikits.talkbox.features import mfcc
 67 | 
 68 |     filename = path + "/wav/{0}_{1}_{2}.wav".format(line_num, voice_name, rate)
 69 |     try:
 70 |         out = str(subprocess.check_output([
 71 |             "espeak",
 72 |             "-v", voice_id,
 73 |             "-w", filename,
 74 |             "-s {0}".format(rate),
 75 |             line
 76 |         ], stderr=subprocess.STDOUT))
 77 |         if "FATAL ERROR" in out:
 78 |             print("CANNOT GENERATE WAV")
 79 |         else:
 80 |             signal, sample_rate = load(filename, mono=True)
 81 |             mel_features, mspec, spec = mfcc(signal, fs=sample_rate, nceps=n_features)
 82 |             # mel_features = np.swapaxes(mel_features, 0, 1)  # timesteps x nFeatures -> nFeatures x timesteps
 83 |             np.save(path + "/mfcc/%s_%s_%d.npy" % (line_num, voice_name, rate), mel_features)
 84 |     except:
 85 |         pass
 86 | 
 87 | 
 88 | def generate_labels(line, path, line_num, relevant_words):
 89 |     num_of_labels = len(relevant_words) + 1  # Add last label if none words are relevant
 90 |     labels = np.full(num_of_labels, -1)
 91 |     at_least_one_present = False
 92 |     for word in line.split(" "):
 93 |         try:
 94 |             relevant_index = relevant_words.index(word)
 95 |             labels[relevant_index] = 1
 96 |             at_least_one_present = True
 97 |         except:
 98 |             pass  # ignore if word is not relevant
 99 |     if not at_least_one_present:
100 |         labels[num_of_labels - 1] = 1
101 | 
102 |     np.save(path + "/labels/%s.npy" % line_num, labels)
103 |     return labels
104 | 
105 | 
def generate_phonemes(line, path):
    """Run ./line_to_phonemes on `line`; the result is not used yet.

    `path` is currently unused. TODO: convert the pronounced text to integer
    phonemes and save them under <path>/phonemes/.
    """
    raw_output = subprocess.check_output(["./line_to_phonemes", line])
    pronounced = raw_output.decode('UTF-8').strip()  # todo
111 | 
112 | 
def generate(lines, path, relevant_words=None):
    """Generate spoken samples and metadata for every line.

    For each line this writes an entry "<num>:<train|validation>:<text>" to
    <path>/lines.list and produces wav/mfcc files through generate_mfcc().
    Without `relevant_words` every voice in `good_voices` speaks the line.
    With `relevant_words` a label vector is also saved and a single voice is
    used: one of `validation_voices` for validation lines, any other good
    voice for training lines.

    Args:
      lines: iterable of text (str or UTF-8 bytes) lines.
      path: output root directory; created together with its labels/, mfcc/
        and wav/ subdirectories when missing.
      relevant_words: optional list of labelled words.
    """
    for directory in (path, path + "/labels/", path + "/mfcc/", path + "/wav/"):
        if not os.path.exists(directory):
            os.makedirs(directory)

    # Lists, because dict views cannot be indexed on Python 3. Voices that
    # check_voices() removed from good_voices are excluded from both pools.
    all_voices = list(good_voices)
    validation_pool = [v for v in validation_voices if v in good_voices]
    training_pool = [v for v in all_voices if v not in validation_voices]

    # The context manager guarantees lines.list is flushed and closed.
    with open(path + "/lines.list", "wt") as out:
        line_num = 1
        for line in lines:
            if isinstance(line, bytes):
                line = line.decode('UTF-8').strip()
            else:
                # Keep a trailing newline out of the one-entry-per-line list.
                line = line.rstrip("\r\n")
            # NOTE(review): randint(1, 100) < validation_percent selects
            # validation_percent - 1 percent of the lines; kept as is.
            kind = "train"
            if random.randint(1, 100) < validation_percent:
                kind = "validation"
            print("generating [%s] %s" % (kind, line))
            out.write("%d:%s:%s\n" % (line_num, kind, line))

            voices = all_voices
            if relevant_words:
                generate_labels(line, path, line_num, relevant_words)
                pool = validation_pool if kind == "validation" else training_pool
                if pool:
                    voices = [random.choice(pool)]
                else:
                    # Previously an endless loop or a KeyError.
                    print("NO VOICE AVAILABLE for [%s] %s" % (kind, line))
                    voices = []
            for voice in voices:
                base_rate = good_voices[voice]['rate']
                rate = random.randint(base_rate - 30, base_rate + 40)
                try:
                    generate_mfcc(good_voices[voice]['name'], voice, line, line_num, rate, path)
                except Exception as error:
                    # Best effort per sample, but report what went wrong.
                    print("CANNOT GENERATE SAMPLE %d (%s): %s" % (line_num, voice, error))
            line_num += 1
152 | 
153 | 
def generate_lines(relevant_words, irrelevant_words, num_of_lines, max_line_length, mean_relevance_percent):
    """Create random lines mixing relevant and irrelevant words.

    Each line holds between 1 and `max_line_length` words, every word followed
    by a single space. A word is taken from `relevant_words` when
    random.randint(1, 100) is below `mean_relevance_percent`, otherwise from
    `irrelevant_words`.

    Returns:
      A list with `num_of_lines` strings.
    """
    def pick_word():
        if random.randint(1, 100) < mean_relevance_percent:
            pool = relevant_words
        else:
            pool = irrelevant_words
        return pool[random.randint(0, len(pool) - 1)]

    generated = []
    while len(generated) < num_of_lines:
        word_count = random.randint(1, max_line_length)
        generated.append("".join(pick_word() + " " for _ in range(word_count)))
    return generated
165 | 
166 | 
def generate_spoken_numbers():
    """Generate spoken samples of the digits "0" to "9" under NUMBERS_PATH."""
    nums = list(map(str, range(0, 10)))
    generate(nums, NUMBERS_PATH)
170 | 
171 | 
def generate_spoken_words():
    """Generate spoken samples of every line of wordslist.txt under WORDS_PATH."""
    wordslist = "wordslist.txt"
    # Close the word list deterministically instead of leaking the handle.
    with open(wordslist) as word_file:
        words = word_file.readlines()
    generate(words, WORDS_PATH)
176 | 
177 | 
def generate_spoken_sentences():
    """Generate spoken samples of every line of sentences.txt under SENTENCES_PATH."""
    linelist = "sentences.txt"
    # Close the sentence list deterministically instead of leaking the handle.
    with open(linelist) as line_file:
        lines = line_file.readlines()
    generate(lines, SENTENCES_PATH)
182 | 
183 | 
def generate_spoken_sentences_mll():
    """Generate labelled random sentences for multi-label learning.

    Reads the relevant and irrelevant word lists, builds 10000 random lines of
    up to 20 words from them and generates samples plus label vectors under
    SENTENCES_MLL_PATH.
    """
    def read_word_list(file_name):
        # One word per line; the handle is closed as soon as it is read.
        with open(file_name) as word_file:
            return [word.replace("\n", '') for word in word_file]

    relevant_words = read_word_list("mll_relevant_words.txt")
    irrelevant_words = read_word_list("mll_irrelevant_words.txt")
    lines = generate_lines(relevant_words, irrelevant_words,
                           num_of_lines=10000, max_line_length=20, mean_relevance_percent=20)
    generate(lines, SENTENCES_MLL_PATH, relevant_words)
198 | 
199 | 
def main():
    """Check the available espeak voices, then generate the MLL sentence data."""
    check_voices()
    generate_spoken_sentences_mll()
203 | 
204 | 
# Script entry point: run the generation and report completion.
if __name__ == '__main__':
    main()
    print("DONE!")
208 | 


--------------------------------------------------------------------------------
/speech/mll_data.py:
--------------------------------------------------------------------------------
  1 | from __future__ import absolute_import
  2 | from __future__ import division
  3 | from __future__ import print_function
  4 | 
  5 | import os
  6 | import os.path
  7 | import numpy as np
  8 | 
  9 | from generate_speech_data import SENTENCES_MLL_PATH
 10 | 
 11 | 
 12 | class MllData(object):
 13 |     """The multi-label learning input data."""
 14 | 
 15 |     @property
 16 |     def num_steps(self):
 17 |         return self.max_steps
 18 | 
 19 |     @property
 20 |     def num_classes(self):
 21 |         return self.n_classes
 22 | 
 23 |     def __init__(self, raw_input, raw_labels, cell_size):
 24 |         '''Object with input data for LSTM-MLL NN
 25 |              raw_input: list of 2D numpy arrays with raw mfcc input frames [timesteps x n_features]
 26 |              raw_labels: list of 1D arrays with labels [n_classes]
 27 |              cell_size: int with the size on lstm cell
 28 | 
 29 |              WARNING: cell_size should be multiple of n_features
 30 |         '''
 31 | 
 32 |         assert len(raw_input) == len(raw_labels), "input len %d != labels len %d" % (len(raw_input), len(labels))
 33 | 
 34 |         self.input_size = len(raw_labels)
 35 |         self.raw_input = raw_input
 36 |         self.raw_labels = raw_labels
 37 |         self.cell_size = cell_size
 38 | 
 39 |         self.n_features = raw_input[0].shape[1]
 40 |         self.n_classes = raw_labels[0].shape[0]
 41 |         assert cell_size % self.n_features == 0, "cell size should be multiple num of features"
 42 |         cell_size_factor = cell_size // self.n_features
 43 | 
 44 |         self.max_timesteps = 0
 45 |         for input_index in range(len(raw_labels)):
 46 |             self.max_timesteps = max(self.max_timesteps, raw_input[input_index].shape[0])
 47 |         print("max timesteps", self.max_timesteps)
 48 |         self.max_timesteps += cell_size_factor - self.max_timesteps % cell_size_factor
 49 |         print("increased max timesteps", self.max_timesteps)
 50 |         self.max_steps = self.max_timesteps // cell_size_factor
 51 |         print("max steps", self.max_steps)
 52 | 
 53 |     def get_batch(self, batch_size):
 54 |         '''Produce random batch from raw input data
 55 |              batch_size: int with number of inputs/labels per batch
 56 |              returns: batch tuple (inputs, labels) consists of
 57 |                     inputs = 3D array w/ shape [batch_size x max_steps x cell_size]
 58 |                     labels = 2D array w/ shape [batch_size x n_classes]
 59 |         '''
 60 |         random_indexes = np.random.permutation(self.input_size)
 61 | 
 62 |         inputs = np.zeros([batch_size, self.max_steps, self.cell_size])
 63 |         labels = np.zeros([batch_size, self.n_classes])
 64 |         for batch_index, raw_index in enumerate(random_indexes[0:batch_size]):
 65 |             mfcc = self.raw_input[raw_index]
 66 |             # pad with zeros to max_timesteps
 67 |             pad_len = self.max_timesteps - mfcc.shape[0]
 68 |             padded = np.pad(mfcc, ((0, pad_len), (0, 0)), 'constant', constant_values=0)
 69 |             # reshape time_steps x n_features -> steps x cell_size
 70 |             inputs[batch_index] = padded.reshape([self.max_steps, self.cell_size])
 71 |             labels[batch_index] = self.raw_labels[raw_index]
 72 |         return inputs, labels
 73 | 
 74 | 
 75 | def load_data(path, swap_axes=True):
 76 |     print("load data from " + path)
 77 |     texts = {}
 78 |     types = {}
 79 |     for line in open(path + "/lines.list").readlines():
 80 |         num, type, text = line.split(":")
 81 |         types[num] = type
 82 |         texts[num] = text.replace("\n", '')
 83 |     train = {'texts': [], 'mfcc': [], 'labels': []}
 84 |     validation = {'texts': [], 'mfcc': [], 'labels': []}
 85 |     for file_name in os.listdir(path + "/mfcc/"):
 86 |         num, voice, rate = file_name.split("_")
 87 |         if types[num] == "train":
 88 |             target = train
 89 |         else:
 90 |             target = validation
 91 |         target['texts'].append(texts[num])
 92 |         mfcc = np.load(os.path.join(path + "/mfcc/", file_name))
 93 |         if swap_axes:
 94 |             mfcc = np.swapaxes(mfcc, 0, 1)
 95 |         target['mfcc'].append(mfcc)
 96 |         target['labels'].append(np.load(os.path.join(path + "/labels/", num + ".npy")))
 97 |     return train, validation
 98 | 
 99 | 
def main():
    '''Smoke run: load the data set and draw one batch of 5 validation samples'''
    _train, validation_set = load_data(SENTENCES_MLL_PATH)
    validation_input = MllData(validation_set['mfcc'], validation_set['labels'], 208)
    _inputs, _labels = validation_input.get_batch(5)
104 | 
105 | 
# Run the smoke test only when this file is executed as a script.
if __name__ == '__main__':
    main()
    print("DONE!")


--------------------------------------------------------------------------------
/speech/mll_irrelevant_words.txt:
--------------------------------------------------------------------------------
  1 | other
  2 | new
  3 | good
  4 | high
  5 | old
  6 | great
  7 | big
  8 | American
  9 | small
 10 | large
 11 | national
 12 | young
 13 | different
 14 | black
 15 | long
 16 | little
 17 | important
 18 | political
 19 | bad
 20 | white
 21 | real
 22 | best
 23 | right
 24 | social
 25 | only
 26 | public
 27 | sure
 28 | low
 29 | early
 30 | able
 31 | human
 32 | local
 33 | late
 34 | hard
 35 | major
 36 | better
 37 | economic
 38 | strong
 39 | possible
 40 | whole
 41 | free
 42 | military
 43 | true
 44 | federal
 45 | international
 46 | full
 47 | special
 48 | easy
 49 | clear
 50 | recent
 51 | certain
 52 | personal
 53 | open
 54 | red
 55 | difficult
 56 | available
 57 | likely
 58 | short
 59 | single
 60 | medical
 61 | current
 62 | wrong
 63 | private
 64 | past
 65 | foreign
 66 | fine
 67 | common
 68 | poor
 69 | natural
 70 | significant
 71 | similar
 72 | hot
 73 | dead
 74 | central
 75 | happy
 76 | serious
 77 | ready
 78 | simple
 79 | left
 80 | physical
 81 | general
 82 | environmental
 83 | financial
 84 | blue
 85 | democratic
 86 | dark
 87 | various
 88 | entire
 89 | close
 90 | legal
 91 | religious
 92 | cold
 93 | final
 94 | main
 95 | green
 96 | nice
 97 | huge
 98 | popular
 99 | traditional
100 | cultural
101 | time
102 | year
103 | people
104 | way
105 | day
106 | man
107 | thing
108 | woman
109 | life
110 | child
111 | world
112 | school
113 | state
114 | family
115 | student
116 | group
117 | country
118 | problem
119 | hand
120 | part
121 | place
122 | case
123 | week
124 | company
125 | system
126 | program
127 | question
128 | work
129 | government
130 | number
131 | night
132 | point
133 | home
134 | water
135 | room
136 | mother
137 | area
138 | money
139 | story
140 | fact
141 | month
142 | lot
143 | right
144 | study
145 | book
146 | eye
147 | job
148 | word
149 | business
150 | issue
151 | side
152 | kind
153 | head
154 | house
155 | service
156 | friend
157 | father
158 | power
159 | hour
160 | game
161 | line
162 | end
163 | member
164 | law
165 | car
166 | city
167 | community
168 | name
169 | president
170 | team
171 | minute
172 | idea
173 | kid
174 | body
175 | information
176 | back
177 | parent
178 | face
179 | others
180 | level
181 | office
182 | door
183 | health
184 | person
185 | art
186 | war
187 | history
188 | party
189 | result
190 | change
191 | morning
192 | reason
193 | research
194 | girl
195 | guy
196 | moment
197 | air
198 | teacher
199 | force
200 | education
201 | be
202 | have
203 | do
204 | say
205 | go
206 | can
207 | get
208 | would
209 | make
210 | know
211 | will
212 | think
213 | take
214 | see
215 | come
216 | could
217 | want
218 | look
219 | use
220 | find
221 | give
222 | tell
223 | work
224 | may
225 | should
226 | call
227 | try
228 | ask
229 | need
230 | feel
231 | become
232 | leave
233 | put
234 | mean
235 | keep
236 | let
237 | begin
238 | seem
239 | help
240 | talk
241 | turn
242 | start
243 | might
244 | show
245 | hear
246 | play
247 | run
248 | move
249 | like
250 | live
251 | believe
252 | hold
253 | bring
254 | happen
255 | must
256 | write
257 | provide
258 | sit
259 | stand
260 | lose
261 | pay
262 | meet
263 | include
264 | continue
265 | set
266 | learn
267 | change
268 | lead
269 | understand
270 | watch


--------------------------------------------------------------------------------
/speech/mll_lstm.py:
--------------------------------------------------------------------------------
  1 | from __future__ import absolute_import
  2 | from __future__ import division
  3 | from __future__ import print_function
  4 | 
  5 | import time
  6 | 
  7 | import numpy as np
  8 | import tensorflow as tf
  9 | 
 10 | from mll_data import MllData, load_data
 11 | 
# Command-line flags: model size, data location, output directory and float precision.
flags = tf.flags
logging = tf.logging
flags.DEFINE_string(
    "model", "small",
    "A type of model. Possible options are: small, medium, large.")
flags.DEFINE_string("data_path", "data/spoken_sentences_mll_wav",
                    "Where the training/test data is stored.")
flags.DEFINE_string("save_path", None,
                    "Model output directory.")
flags.DEFINE_bool("use_fp64", False,
                  "Train using 64-bit floats instead of 32-bit floats")
# Parsed flag values, read by data_type(), get_config() and main().
FLAGS = flags.FLAGS
 24 | 
 25 | 
 26 | def data_type():
 27 |     return tf.float64 if FLAGS.use_fp64 else tf.float32
 28 | 
 29 | 
 30 | class SmallConfig(object):
 31 |     """Small config."""
 32 |     init_scale = 0.1
 33 |     learning_rate = 1.0
 34 |     max_grad_norm = 5
 35 |     num_layers = 2
 36 |     hidden_size = 104
 37 |     epoch_size = 30
 38 |     constant_lr_max_epoch = 4
 39 |     max_epoch = 10
 40 |     keep_prob = 1.0
 41 |     lr_decay = 0.5
 42 |     batch_size = 10
 43 |     validation_batch_size = 3
 44 | 
 45 | 
 46 | class MediumConfig(object):
 47 |     """Medium config."""
 48 |     init_scale = 0.05
 49 |     learning_rate = 1.0
 50 |     max_grad_norm = 5
 51 |     num_layers = 2
 52 |     hidden_size = 416
 53 |     epoch_size = 40
 54 |     constant_lr_max_epoch = 6
 55 |     max_epoch = 16
 56 |     keep_prob = 0.5
 57 |     lr_decay = 0.8
 58 |     batch_size = 20
 59 |     validation_batch_size = 5
 60 | 
 61 | 
 62 | class LargeConfig(object):
 63 |     """Large config."""
 64 |     init_scale = 0.04
 65 |     learning_rate = 1.0
 66 |     max_grad_norm = 10
 67 |     num_layers = 2
 68 |     hidden_size = 1300
 69 |     epoch_size = 60
 70 |     constant_lr_max_epoch = 8
 71 |     max_epoch = 24
 72 |     keep_prob = 0.35
 73 |     lr_decay = 1 / 1.15
 74 |     batch_size = 20
 75 |     validation_batch_size = 5
 76 | 
 77 | 
 78 | def get_config():
 79 |     if FLAGS.model == "small":
 80 |         return SmallConfig()
 81 |     elif FLAGS.model == "medium":
 82 |         return MediumConfig()
 83 |     elif FLAGS.model == "large":
 84 |         return LargeConfig()
 85 |     else:
 86 |         raise ValueError("Invalid model: %s", FLAGS.model)
 87 | 
 88 | 
 89 | class MLLModel(object):
 90 |     """The MLL model."""
 91 | 
 92 |     def assign_lr(self, session, lr_value):
 93 |         session.run(self._lr_update, feed_dict={self._new_lr: lr_value})
 94 | 
 95 |     @property
 96 |     def input(self):
 97 |         return self._input
 98 | 
 99 |     @property
100 |     def inputs_ph(self):
101 |         return self._inputs_ph
102 | 
103 |     @property
104 |     def labels_ph(self):
105 |         return self._labels_ph
106 | 
107 |     @property
108 |     def epoch_size(self):
109 |         return self._epoch_size
110 | 
111 |     @property
112 |     def batch_size(self):
113 |         return self._batch_size
114 | 
115 |     @property
116 |     def initial_state(self):
117 |         return self._initial_state
118 | 
119 |     @property
120 |     def cost(self):
121 |         return self._cost
122 | 
123 |     @property
124 |     def loss(self):
125 |         return self._loss
126 | 
127 |     @property
128 |     def final_state(self):
129 |         return self._final_state
130 | 
131 |     @property
132 |     def lr(self):
133 |         return self._lr
134 | 
135 |     @property
136 |     def train_op(self):
137 |         return self._train_op
138 | 
139 |     def __init__(self, is_training, config, input_):
140 |         self._input = input_
141 |         self._epoch_size = config.epoch_size if is_training else 1
142 | 
143 |         self._batch_size = batch_size = config.batch_size if is_training else config.validation_batch_size
144 |         num_steps = input_.num_steps
145 |         size = config.hidden_size
146 |         num_classes = input_.num_classes
147 | 
148 |         lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(size, forget_bias=0.0, state_is_tuple=True)
149 |         if is_training and config.keep_prob < 1:
150 |             lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
151 |                 lstm_cell, output_keep_prob=config.keep_prob)
152 |         cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * config.num_layers, state_is_tuple=True)
153 | 
154 |         self._initial_state = cell.zero_state(batch_size, data_type())
155 | 
156 |         self._inputs_ph = tf.placeholder(data_type(), shape=[batch_size, num_steps, size])
157 |         if is_training and config.keep_prob < 1:
158 |             self._inputs_ph = tf.nn.dropout(self._inputs_ph, config.keep_prob)
159 |         self._labels_ph = tf.placeholder(data_type(), shape=[batch_size, num_classes])
160 | 
161 |         with tf.variable_scope("RNN"):
162 |             inputs = tf.unstack(self._inputs_ph, num=num_steps, axis=1)
163 |             outputs, state = tf.nn.rnn(cell, inputs, initial_state=self._initial_state)
164 |         self._final_state = state
165 | 
166 |         print("outputs len: ", len(outputs))
167 |         output = outputs.pop()
168 |         print("output shape: ", output.get_shape())
169 | 
170 |         softmax_w = tf.get_variable(
171 |             "softmax_w", [size, num_classes], dtype=data_type())
172 |         softmax_b = tf.get_variable("softmax_b", [num_classes], dtype=data_type())
173 |         classes = tf.matmul(output, softmax_w) + softmax_b
174 |         print("classes shape: ", classes.get_shape())
175 | 
176 |         classes_w = tf.get_variable(
177 |             "classes_w", [num_classes, num_classes], dtype=data_type())
178 |         classes_b = tf.get_variable("classes_b", [num_classes], dtype=data_type())
179 |         logits = tf.matmul(classes, classes_w) + classes_b
180 |         print("logits shape: ", logits.get_shape())
181 | 
182 |         loss = tf.nn.sigmoid_cross_entropy_with_logits(logits, self._labels_ph)
183 |         self._loss = loss
184 |         self._cost = cost = tf.reduce_sum(loss) / batch_size
185 | 
186 |         if not is_training:
187 |             return
188 | 
189 |         self._lr = tf.Variable(0.0, trainable=False)
190 |         tvars = tf.trainable_variables()
191 |         grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
192 |                                           config.max_grad_norm)
193 |         optimizer = tf.train.GradientDescentOptimizer(self._lr)
194 |         self._train_op = optimizer.apply_gradients(
195 |             zip(grads, tvars),
196 |             global_step=tf.contrib.framework.get_or_create_global_step())
197 | 
198 |         self._new_lr = tf.placeholder(
199 |             tf.float32, shape=[], name="new_learning_rate")
200 |         self._lr_update = tf.assign(self._lr, self._new_lr)
201 | 
202 | 
def run_epoch(session, model, eval_op=None, verbose=False):
    """Run model.epoch_size random batches through the model.

    Args:
        session: session used to run the graph.
        model: object providing cost, final_state, inputs_ph, labels_ph,
            epoch_size, batch_size and input.get_batch().
        eval_op: optional extra op run with every batch (e.g. the train op).
        verbose: if True, print progress about ten times per epoch.

    Returns:
        exp(mean batch cost) over the epoch. NOTE: the progress line and the
        callers label this value "Accuracy", but it is derived from the loss.
    """
    start_time = time.time()
    costs = 0.0
    iters = 0

    fetches = {
        "cost": model.cost,
        "final_state": model.final_state,
    }
    if eval_op is not None:
        fetches["eval_op"] = eval_op

    # Report about ten times per epoch, and at least after every batch for
    # tiny epochs. The former test `step % (epoch_size // 10) == 10` never
    # fired for epoch_size < 110 and divided by zero for epoch_size < 10.
    report_every = max(model.epoch_size // 10, 1)

    for step in range(model.epoch_size):
        _inputs, _labels = model.input.get_batch(model.batch_size)

        vals = session.run(fetches, feed_dict={
            model.inputs_ph: _inputs,
            model.labels_ph: _labels
        })
        costs += vals["cost"]
        iters += 1

        if verbose and (step + 1) % report_every == 0:
            # guard against a zero interval on coarse clocks
            elapsed = max(time.time() - start_time, 1e-9)
            print("%.3f Accuracy: %.3f speed: %.0f sentences/sec" %
                  ((step + 1) * 1.0 / model.epoch_size, np.exp(costs / iters),
                   iters * model.batch_size / elapsed))

    return np.exp(costs / iters)
234 | 
235 | 
def main(_):
    """Build the training and validation models and run the training loop.

    Reads the data set from --data_path, trains for config.max_epoch epochs,
    evaluates on the validation data after every epoch and, when --save_path
    is set, saves the model at the end.

    Raises:
        ValueError: if --data_path is empty.
    """
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to MLL data directory")

    config = get_config()
    train_data, valid_data = load_data(FLAGS.data_path)

    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)

        # The training model is built first so that it creates the variables.
        with tf.name_scope("Train"):
            train_input = MllData(train_data['mfcc'], train_data['labels'], config.hidden_size)
            with tf.variable_scope("Model", reuse=None, initializer=initializer):
                m = MLLModel(is_training=True, config=config, input_=train_input)
            tf.scalar_summary("Training Loss", m.cost)
            tf.scalar_summary("Learning Rate", m.lr)

        # The validation model shares those variables (reuse=True).
        with tf.name_scope("Valid"):
            valid_input = MllData(valid_data['mfcc'], valid_data['labels'], config.hidden_size)
            with tf.variable_scope("Model", reuse=True, initializer=initializer):
                mvalid = MLLModel(is_training=False, config=config, input_=valid_input)
            tf.scalar_summary("Validation Loss", mvalid.cost)

        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        with sv.managed_session() as session:
            for i in range(config.max_epoch):
                # constant rate for the first constant_lr_max_epoch epochs,
                # then multiplied by lr_decay once per further epoch
                lr_decay = config.lr_decay ** max(i + 1 - config.constant_lr_max_epoch, 0.0)
                m.assign_lr(session, config.learning_rate * lr_decay)

                print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
                # NOTE(review): run_epoch returns exp(mean cost); the
                # "Accuracy" wording below looks like a misnomer - confirm.
                train_accuracy = run_epoch(session, m, eval_op=m.train_op,
                                             verbose=True)
                print("Epoch: %d Train Accuracy: %.3f" % (i + 1, train_accuracy))
                valid_accuracy = run_epoch(session, mvalid)
                print("Epoch: %d Valid Accuracy: %.3f" % (i + 1, valid_accuracy))

            if FLAGS.save_path:
                print("Saving model to %s." % FLAGS.save_path)
                sv.saver.save(session, FLAGS.save_path, global_step=sv.global_step)
276 | 
277 | 
# tf.app.run() parses the command-line flags and then calls main().
if __name__ == "__main__":
    tf.app.run()
280 | 


--------------------------------------------------------------------------------
/speech/mll_relevant_words.txt:
--------------------------------------------------------------------------------
 1 | follow
 2 | stop
 3 | create
 4 | speak
 5 | read
 6 | allow
 7 | add
 8 | spend
 9 | grow
10 | open
11 | walk
12 | win
13 | offer
14 | remember
15 | love
16 | consider
17 | appear
18 | buy
19 | wait
20 | serve
21 | die
22 | send
23 | expect
24 | build
25 | stay
26 | fall
27 | cut
28 | reach
29 | kill
30 | remain


--------------------------------------------------------------------------------
/speech/record_to_wav.py:
--------------------------------------------------------------------------------
  1 | from sys import byteorder
  2 | from array import array
  3 | from struct import pack
  4 | 
  5 | import pyaudio
  6 | import wave
  7 | 
THRESHOLD = 500  # sample amplitude separating silence from sound
CHUNK_SIZE = 1024  # frames read from the input stream per iteration
FORMAT = pyaudio.paInt16  # sample format used for the stream
RATE = 44100  # sample rate used for the stream and the wav header
 12 | 
 13 | 
 14 | def is_silent(snd_data):
 15 |     "Returns 'True' if below the 'silent' threshold"
 16 |     return max(snd_data) < THRESHOLD
 17 | 
 18 | 
 19 | def normalize(snd_data):
 20 |     "Average the volume out"
 21 |     MAXIMUM = 16384
 22 |     times = float(MAXIMUM) / max(abs(i) for i in snd_data)
 23 | 
 24 |     r = array('h')
 25 |     for i in snd_data:
 26 |         r.append(int(i * times))
 27 |     return r
 28 | 
 29 | 
 30 | def trim(snd_data):
 31 |     "Trim the blank spots at the start and end"
 32 | 
 33 |     def _trim(snd_data):
 34 |         snd_started = False
 35 |         r = array('h')
 36 | 
 37 |         for i in snd_data:
 38 |             if not snd_started and abs(i) > THRESHOLD:
 39 |                 snd_started = True
 40 |                 r.append(i)
 41 | 
 42 |             elif snd_started:
 43 |                 r.append(i)
 44 |         return r
 45 | 
 46 |     # Trim to the left
 47 |     snd_data = _trim(snd_data)
 48 | 
 49 |     # Trim to the right
 50 |     snd_data.reverse()
 51 |     snd_data = _trim(snd_data)
 52 |     snd_data.reverse()
 53 |     return snd_data
 54 | 
 55 | 
 56 | def add_silence(snd_data, seconds):
 57 |     "Add silence to the start and end of 'snd_data' of length 'seconds' (float)"
 58 |     r = array('h', [0 for i in xrange(int(seconds * RATE))])
 59 |     r.extend(snd_data)
 60 |     r.extend([0 for i in xrange(int(seconds * RATE))])
 61 |     return r
 62 | 
 63 | 
 64 | def record():
 65 |     """
 66 |     Record a word or words from the microphone and
 67 |     return the data as an array of signed shorts.
 68 | 
 69 |     Normalizes the audio, trims silence from the
 70 |     start and end, and pads with 0.5 seconds of
 71 |     blank sound to make sure VLC et al can play
 72 |     it without getting chopped off.
 73 |     """
 74 |     p = pyaudio.PyAudio()
 75 |     stream = p.open(format=FORMAT, channels=1, rate=RATE,
 76 |                     input=True, output=True,
 77 |                     frames_per_buffer=CHUNK_SIZE)
 78 | 
 79 |     num_silent = 0
 80 |     snd_started = False
 81 | 
 82 |     r = array('h')
 83 | 
 84 |     while 1:
 85 |         # little endian, signed short
 86 |         snd_data = array('h', stream.read(CHUNK_SIZE))
 87 |         if byteorder == 'big':
 88 |             snd_data.byteswap()
 89 |         r.extend(snd_data)
 90 | 
 91 |         silent = is_silent(snd_data)
 92 | 
 93 |         if silent and snd_started:
 94 |             num_silent += 1
 95 |         elif not silent and not snd_started:
 96 |             snd_started = True
 97 | 
 98 |         if snd_started and num_silent > 30:
 99 |             break
100 | 
101 |     sample_width = p.get_sample_size(FORMAT)
102 |     stream.stop_stream()
103 |     stream.close()
104 |     p.terminate()
105 | 
106 |     r = normalize(r)
107 |     r = trim(r)
108 |     r = add_silence(r, 0.5)
109 |     return sample_width, r
110 | 
111 | 
def record_to_file(path):
    """Record from the microphone and write the result to 'path'.

    The file is written as a mono wav with the sample width reported by
    record() and a frame rate of RATE.
    """
    sample_width, data = record()
    # little-endian signed shorts, one per sample
    data = pack('<' + ('h' * len(data)), *data)

    wf = wave.open(path, 'wb')
    try:
        wf.setnchannels(1)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(data)
    finally:
        # close the file even if writing the header or frames fails
        wf.close()
123 | 
124 | 
# Demo: record a single utterance into demo.wav when run as a script.
if __name__ == '__main__':
    print("please speak a word into the microphone")
    record_to_file('demo.wav')
    print("done - result written to demo.wav")
129 | 


--------------------------------------------------------------------------------
/speech/speech_data.py:
--------------------------------------------------------------------------------
  1 | #!/usr/local/bin/python
  2 | """Utilities for downloading and providing data from openslr.org, libriSpeech, Pannous, Gutenberg, WMT, tokenizing, vocabularies."""
  3 | # TODO! see https://github.com/pannous/caffe-speech-recognition for some data sources
  4 | 
  5 | import os
  6 | import re
  7 | import sys
  8 | import wave
  9 | 
 10 | import numpy
 11 | import numpy as np
 12 | import skimage.io  # scikit-image
 13 | 
 14 | try:
 15 |     import librosa
 16 | except:
 17 |     print("pip install librosa ; if you want mfcc_batch_generator")
 18 | # import extensions as xx
 19 | from random import shuffle
 20 | 
 21 | try:
 22 |     from six.moves import urllib
 23 |     from six.moves import xrange  # pylint: disable=redefined-builtin
 24 | except:
 25 |     pass  # fuck 2to3
 26 | 
 27 | # TRAIN_INDEX='train_words_index.txt'
 28 | # TEST_INDEX='test_words_index.txt'
# Base URL prepended to archive names that are not already full http URLs.
SOURCE_URL = 'http://pannous.net/files/'  # spoken_numbers.tar'
# Default directory for downloaded and extracted archives.
DATA_DIR = 'data/'
pcm_path = "data/spoken_numbers_pcm/"  # 8 bit
wav_path = "data/spoken_numbers_wav/"  # 16 bit s16le
# Directory listed by the mfcc and wave batch generators.
path = pcm_path
# Frames read per readframes() call; load_wav_file returns CHUNK * 2 values.
CHUNK = 4096
test_fraction = 0.1  # 10% of data for test / verification
 36 | 
 37 | 
 38 | # http://pannous.net/files/spoken_numbers_pcm.tar
 39 | class Source:  # labels
 40 |     DIGIT_WAVES = 'spoken_numbers_pcm.tar'
 41 |     DIGIT_SPECTROS = 'spoken_numbers_spectros_64x64.tar'  # 64x64  baby data set, works astonishingly well
 42 |     NUMBER_WAVES = 'spoken_numbers_wav.tar'
 43 |     NUMBER_IMAGES = 'spoken_numbers.tar'  # width=256 height=256
 44 |     WORD_SPECTROS = 'https://dl.dropboxusercontent.com/u/23615316/spoken_words.tar'  # width,height=512# todo: sliding window!
 45 |     WORD_WAVES = 'spoken_words_wav.tar'
 46 |     TEST_INDEX = 'test_index.txt'
 47 |     TRAIN_INDEX = 'train_index.txt'
 48 | 
 49 | 
 50 | from enum import Enum
 51 | 
 52 | 
class Target(Enum):  # labels
    """Kinds of label a batch generator can be asked to produce."""
    digits = 1
    speaker = 2
    words_per_minute = 3
    word_phonemes = 4
    word = 5  # int vector as opposed to binary hotword
    sentence = 6
    sentiment = 7
    first_letter = 8
    hotword = 9
 63 | # test_word=9 # use 5 even for speaker etc
 64 | 
 65 | 
# Number of character classes; char_to_class() maps into [0, num_characters).
num_characters = 32
# num_characters=60 #  only one case, Including numbers
# num_characters=128 #
# num_characters=256 #  including special characters
# offset=0  # 1:1 mapping ++
# offset=32 # starting with ' ' space
# offset=48 # starting with  numbers
# Code point subtracted from every character before taking the class modulo.
offset = 64  # starting with characters
# Default length that pad() extends a vector to.
max_word_length = 20
# Default value appended by pad().
terminal_symbol = 0
 76 | 
 77 | 
 78 | def pad(vec, pad_to=max_word_length, one_hot=False, paddy=terminal_symbol):
 79 |     for i in range(0, pad_to - len(vec)):
 80 |         if one_hot:
 81 |             vec.append([paddy] * num_characters)
 82 |         else:
 83 |             vec.append(paddy)
 84 |     return vec
 85 | 
 86 | 
 87 | def char_to_class(c):
 88 |     return (ord(c) - offset) % num_characters
 89 | 
 90 | 
 91 | def string_to_int_word(word, pad_to):
 92 |     z = map(char_to_class, word)
 93 |     z = list(z)
 94 |     z = pad(z)
 95 |     return z
 96 | 
 97 | 
 98 | class SparseLabels:
 99 |     def __init__(labels):
100 |         labels.indices = {}
101 |         labels.values = []
102 | 
103 |     def shape(self):
104 |         return (len(self.indices), len(self.values))
105 | 
106 | 
107 | # labels: An `int32` `SparseTensor`.
108 | # labels.indices[i, :] == [b, t] means `labels.values[i]` stores the id for (batch b, time t).
# `labels.values[i]` must take on values in `[0, num_labels)`.
def sparse_labels(vec):
    """Build a SparseLabels object from a sequence of label strings.

    For the character at position t of label number b, indices[b, t] holds
    the position in values where that character's class is stored.
    """
    labels = SparseLabels()
    for b, lab in enumerate(vec):
        for t, c in enumerate(lab):
            labels.indices[b, t] = len(labels.values)
            labels.values.append(char_to_class(c))
    return labels
122 | 
123 | 
def progresshook(blocknum, blocksize, totalsize):
    """Write download progress to stderr (passed to urlretrieve as its hook).

    With a known total size, rewrites one line with the percentage and byte
    counts and ends it once everything has been read; otherwise writes one
    "read N" line per call.
    """
    bytes_read = blocknum * blocksize
    if not totalsize > 0:  # total size is unknown
        sys.stderr.write("read %d\n" % (bytes_read,))
        return

    width = len(str(totalsize))
    percent = bytes_read * 1e2 / totalsize
    sys.stderr.write("\r%5.1f%% %*d / %d" % (percent, width, bytes_read, totalsize))
    if bytes_read >= totalsize:  # near the end
        sys.stderr.write("\n")
135 | 
136 | 
def maybe_download(file, work_directory=DATA_DIR):
    """Download the data from Pannous's website, unless it's already here.

    file: archive name (fetched from SOURCE_URL) or a full http(s) URL.
    work_directory: directory the archive is stored in and extracted to.
    Returns the local archive path with every ".tar" removed.

    The archive is extracted with the external `tar` program on every call,
    whether or not it was just downloaded.
    """
    import subprocess  # local import: only needed for the extraction step

    print("Looking for data %s in %s" % (file, work_directory))
    if not os.path.exists(work_directory):
        os.mkdir(work_directory)
    # keep only the part after the last '/' (raw string: no invalid escape)
    filepath = os.path.join(work_directory, re.sub(r'.*/', '', file))
    if not os.path.exists(filepath):
        if not file.startswith("http"):
            url_filename = SOURCE_URL + file
        else:
            url_filename = file
        print('Downloading from %s to %s' % (url_filename, filepath))
        filepath, _ = urllib.request.urlretrieve(url_filename, filepath, progresshook)
        statinfo = os.stat(filepath)
        print('Successfully downloaded', file, statinfo.st_size, 'bytes.')
    if os.path.exists(filepath):
        print('Extracting %s to %s' % (filepath, work_directory))
        # argument list instead of a shell string: paths with spaces or shell
        # metacharacters are passed to tar verbatim
        subprocess.call(['tar', 'xf', filepath, '-C', work_directory])
        print('Data ready!')
    return filepath.replace(".tar", "")
158 | 
159 | 
def spectro_batch(batch_size=10):
    """Return a spectro_batch_generator for batches of 'batch_size' items."""
    generator = spectro_batch_generator(batch_size)
    return generator
162 | 
163 | 
def speaker(filename):  # taken from the file name
    """Return the second '_'-separated field of 'filename' (the speaker)."""
    fields = filename.split("_")
    return fields[1]
168 | 
169 | 
def get_speakers(path=pcm_path):
    """Return the distinct speakers found in the file names under 'path'.

    Downloads the digit spectrogram and digit wave archives first, then
    collects the speaker field of every usable file name.
    """
    maybe_download(Source.DIGIT_SPECTROS)
    maybe_download(Source.DIGIT_WAVES)

    def nobad(name):
        # usable names contain '_' and have no '.' in their second field
        if "_" not in name:
            return False
        return "." not in name.split("_")[1]

    found = set()
    for entry in os.listdir(path):
        if nobad(entry):
            found.add(speaker(entry))
    speakers = list(found)
    print(len(speakers), " speakers: ", speakers)
    return speakers
181 | 
182 | 
def load_wav_file(name):
    """Read the wav file 'name' into a list of exactly CHUNK * 2 floats.

    The raw bytes are interpreted as uint8, shifted by 128 and divided by
    255; longer recordings are cut and shorter ones are zero-padded.
    """
    f = wave.open(name, "rb")
    chunk = []
    try:
        data0 = f.readframes(CHUNK)
        while data0:  # f.getnframes()
            # frombuffer replaces the deprecated binary mode of numpy.fromstring
            data = numpy.frombuffer(data0, dtype='uint8')
            # NOTE(review): the addition is done in uint8, so it presumably
            # wraps around modulo 256 - confirm this is intended
            data = (data + 128) / 255.  # 0-1 for Better convergence
            chunk.extend(data)
            data0 = f.readframes(CHUNK)
    finally:
        # the file handle used to be left open
        f.close()
    # finally trim:
    chunk = chunk[0:CHUNK * 2]  # should be enough for now -> cut
    chunk.extend(numpy.zeros(CHUNK * 2 - len(chunk)))  # fill with padding 0's
    return chunk
201 | 
202 | 
def spectro_batch_generator(batch_size=10, width=64, source_data=Source.DIGIT_SPECTROS, target=Target.digits):
    """Yield endless (batch, labels) pairs of spectrogram images.

    Every batch holds 'batch_size' images scaled to 0-1 and, per image, a
    one-hot label derived from the first character of its file name. The
    number of classes is only defined for Target.digits (10) and
    Target.first_letter (32).
    """
    path = maybe_download(source_data, DATA_DIR).replace("_spectros", "")  # HACK! remove!
    batch, labels = [], []
    # result unused, but the call itself downloads archives and prints
    speakers = get_speakers(path)
    if target == Target.digits:
        num_classes = 10
    if target == Target.first_letter:
        num_classes = 32
    files = os.listdir(path)
    print("Got %d source data files from %s" % (len(files), path))
    while True:
        shuffle(files)
        for image_name in files:
            if "_" not in image_name:
                continue  # bad !?!
            pixels = skimage.io.imread(path + "/" + image_name).astype(numpy.float32)
            batch.append(list(pixels / 255.))  # 0-1 for Better convergence
            classe = (ord(image_name[0]) - 48) % 32  # -> 0=0  17 for A, 10 for z ;)
            labels.append(dense_to_one_hot(classe, num_classes))
            if len(batch) >= batch_size:
                yield batch, labels
                # start collecting the next batch in fresh lists
                batch, labels = [], []
235 | 
236 | 
def mfcc_batch_generator(batch_size=10, source=Source.DIGIT_WAVES, target=Target.digits):
    """Yield endless (batch_features, labels) pairs of MFCC features.

    Every ".wav" file in the module-level 'path' directory is loaded with
    librosa and turned into an MFCC matrix whose second axis is cut or
    zero-padded to 80 frames. The label depends on 'target'.

    Raises:
        Exception: for targets that have no label implementation.
    """
    max_frames = 80
    maybe_download(source, DATA_DIR)
    if target == Target.speaker: speakers = get_speakers()
    batch_features = []
    labels = []
    files = os.listdir(path)
    while True:
        print("loaded batch of %d files" % len(files))
        shuffle(files)
        for file in files:
            if not file.endswith(".wav"): continue
            # named 'samples' so that the imported wave module is not shadowed
            samples, sr = librosa.load(path + file, mono=True)
            # keyword arguments work with old and new librosa releases alike
            mfcc = librosa.feature.mfcc(y=samples, sr=sr)
            if target == Target.speaker:
                label = one_hot_from_item(speaker(file), speakers)
            elif target == Target.digits:
                label = dense_to_one_hot(int(file[0]), 10)
            elif target == Target.first_letter:
                label = dense_to_one_hot((ord(file[0]) - 48) % 32, 32)
            elif target == Target.hotword:
                label = one_hot_word(file, pad_to=max_word_length)  #
            elif target == Target.word:
                label = string_to_int_word(file, pad_to=max_word_length)
            else:
                raise Exception("todo : labels for Target!")
            labels.append(label)
            # cut long clips first: a negative pad width would make np.pad raise
            mfcc = mfcc[:, :max_frames]
            mfcc = np.pad(mfcc, ((0, 0), (0, max_frames - len(mfcc[0]))), mode='constant', constant_values=0)
            batch_features.append(np.array(mfcc))
            if len(batch_features) >= batch_size:
                yield batch_features, labels  # basic_rnn_seq2seq inputs must be a sequence
                batch_features = []  # Reset for next batch
                labels = []
276 | 
277 | 
278 | # If you set dynamic_pad=True when calling tf.train.batch the returned batch will be automatically padded with 0s. Handy! A lower-level option is to use tf.PaddingFIFOQueue.
279 | # only apply to a subset of all images at one time
def wave_batch_generator(batch_size=10, source=Source.DIGIT_WAVES, target=Target.digits):  # speaker
    """Endlessly yield batches of raw wave chunks together with their labels.

    Args:
        batch_size: number of examples collected before a batch is yielded.
        source: data source handed to `maybe_download` before reading files.
        target: member of `Target` selecting how the label is derived from
            the file name (digits, speaker or first_letter).

    Yields:
        A `(batch_waves, labels)` pair of lists with `batch_size` entries
        each; `batch_waves` holds the values returned by `load_wav_file`.

    Raises:
        Exception: if `target` is not one of the supported targets.
    """
    maybe_download(source, DATA_DIR)
    # The speaker list is only needed for speaker classification.
    speakers = get_speakers() if target == Target.speaker else None
    batch_waves = []
    labels = []
    files = os.listdir(path)
    while True:
        shuffle(files)
        print("loaded batch of %d files" % len(files))
        for wav in files:
            if not wav.endswith(".wav"):
                continue
            if target == Target.digits:
                label = dense_to_one_hot(int(wav[0]))
            elif target == Target.speaker:
                label = one_hot_from_item(speaker(wav), speakers)
            elif target == Target.first_letter:
                label = dense_to_one_hot((ord(wav[0]) - 48) % 32, 32)
            else:
                raise Exception("todo : Target.word label!")
            # Every branch appends here, so labels stay aligned with batch_waves
            # (the first_letter label used to be computed but never stored).
            labels.append(label)
            batch_waves.append(load_wav_file(path + wav))
            if len(batch_waves) >= batch_size:
                yield batch_waves, labels
                batch_waves = []  # Reset for next batch
                labels = []
307 | 
308 | 
class DataSet(object):
    """Image data set addressed by file name, with images loaded on demand.

    Image names and labels are kept as parallel numpy arrays so they can be
    shuffled with the same permutation at the end of every epoch. Loaded
    images are memoised in `cache`, keyed by image name.
    """

    def __init__(self, images, labels, fake_data=False, one_hot=False, load=False):
        """Construct a DataSet. one_hot arg is used only if fake_data is true.

        Args:
            images: sequence of image file names (relative to DATA_DIR).
            labels: sequence of labels, parallel to `images`.
            fake_data: if true, pretend to hold 10000 examples; no length
                check is performed on `images`/`labels`.
            one_hot: shape of the fake labels produced by `next_batch`.
            load: if true, load all images immediately instead of on demand.
        """
        # Always defined so next_batch(fake_data=True) never hits a missing attribute.
        self.one_hot = one_hot
        if fake_data:
            self._num_examples = 10000
        else:
            num = len(images)
            # Report lengths: plain lists have no `.shape`, which used to break this message.
            assert num == len(labels), ('len(images): %d len(labels): %d' % (num, len(labels)))
            print("len(images) %d" % num)
            self._num_examples = num
        self.cache = {}
        self._image_names = numpy.array(images)
        # Array form is required so labels can be permuted in next_batch.
        self._labels = numpy.asarray(labels)
        self._epochs_completed = 0
        self._index_in_epoch = 0
        self._images = []
        if load:  # Otherwise loaded on demand
            self._images = self.load(self._image_names)

    @property
    def images(self):
        """Images loaded at construction time (empty unless `load=True`)."""
        return self._images

    @property
    def image_names(self):
        """Array of image file names in their current (shuffled) order."""
        return self._image_names

    @property
    def labels(self):
        """Array of labels in the same order as `image_names`."""
        return self._labels

    @property
    def num_examples(self):
        """Number of examples in one epoch."""
        return self._num_examples

    @property
    def epochs_completed(self):
        """Number of epochs finished by `next_batch` so far."""
        return self._epochs_completed

    # only apply to a subset of all images at one time
    def load(self, image_names):
        """Return the images for `image_names` as a list."""
        print("loading %d images" % len(image_names))
        # list(...) because map is lazy on Python 3.
        return list(map(self.load_image, image_names))

    def load_image(self, image_name):
        """Return one image as float32, reading it from disk only on a cache miss."""
        if image_name in self.cache:
            return self.cache[image_name]
        image = skimage.io.imread(DATA_DIR + image_name).astype(numpy.float32)
        self.cache[image_name] = image
        return image

    def next_batch(self, batch_size, fake_data=False):
        """Return the next `batch_size` examples from this data set.

        When the current epoch cannot supply a full batch, the names and
        labels are reshuffled together and the batch is taken from the start
        of the new epoch; the leftover examples of the old epoch are dropped.

        Returns:
            A `(images, labels)` pair: a list of loaded images and the
            matching slice of the label array.
        """
        if fake_data:
            fake_image = [1] * width * height
            if self.one_hot:
                fake_label = [1] + [0] * 9
            else:
                fake_label = 0
            # `range` works on both Python 2 and 3; `xrange` is Python 2 only.
            return [fake_image for _ in range(batch_size)], [
                fake_label for _ in range(batch_size)]
        start = self._index_in_epoch
        self._index_in_epoch += batch_size
        if self._index_in_epoch > self._num_examples:
            # Finished epoch
            self._epochs_completed += 1
            # Shuffle names and labels with one permutation to keep them aligned.
            perm = numpy.arange(self._num_examples)
            numpy.random.shuffle(perm)
            self._image_names = self._image_names[perm]
            self._labels = self._labels[perm]
            # Start next epoch
            start = 0
            self._index_in_epoch = batch_size
            assert batch_size <= self._num_examples
        end = self._index_in_epoch
        return self.load(self._image_names[start:end]), self._labels[start:end]
390 | 
391 | 
392 | # multi-label
def dense_to_some_hot(labels_dense, num_classes=140):
    """Convert class labels from int vectors to many-hot vectors!

    Not implemented yet.

    Raises:
        NotImplementedError: always.
    """
    # Raising a bare string is itself a TypeError on Python 3; use a real exception.
    raise NotImplementedError("TODO dense_to_some_hot")
396 | 
397 | 
def one_hot_to_item(hot, items):
    """Return the element of `items` at the position of the largest value in `hot`."""
    return items[np.argmax(hot)]
402 | 
403 | 
def one_hot_from_item(item, items):
    """Encode `item` as a one-hot list over `items`.

    The result has `len(items)` integer entries, all 0 except a single 1 at
    the first position where `item` occurs. Raises ValueError (from
    `items.index`) when `item` is not present.
    """
    hot_index = items.index(item)
    return [int(position == hot_index) for position in range(len(items))]
410 | 
411 | 
def one_hot_word(word, pad_to=max_word_length):
    """Encode `word` as a list of one-hot character vectors.

    Each character becomes a list of `num_characters` zeros with a single 1
    at index `(ord(c) - offset) % num_characters`. When `pad_to` is truthy
    the sequence is passed through `pad(..., pad_to, one_hot=True)`.
    """
    def encode(char):
        row = [0] * num_characters
        row[(ord(char) - offset) % num_characters] = 1
        return row

    rows = [encode(char) for char in word]
    if not pad_to:
        return rows
    return pad(rows, pad_to, one_hot=True)
420 | 
421 | 
def many_hot_to_word(word):
    """Decode a sequence of per-character score vectors back into a string.

    For every vector the index of its largest entry is shifted by `offset`
    and converted to a character with `chr`.
    """
    return "".join(chr(np.argmax(row) + offset) for row in word)
429 | 
430 | 
def dense_to_one_hot(labels_dense, num_classes=10):
    """Convert class labels from scalars to one-hot vectors.

    This is the single effective definition: the two identical
    TensorFlow-graph variants that used to precede it under the same name
    were rebound by this one at import time and could never be called.

    Args:
        labels_dense: an integer class id, or a sequence/array of them.
        num_classes: length of each one-hot vector.

    Returns:
        The selected row(s) of a `num_classes` x `num_classes` identity
        matrix: a 1-D float array for a scalar label, a 2-D array for a
        sequence of labels.
    """
    return numpy.eye(num_classes)[labels_dense]
454 | 
455 | 
def extract_labels(names_file, train, one_hot):
    """Read the label column of a tab-separated index file.

    Each non-blank line must have the form `<image file>\\t<label>`.

    Args:
        names_file: path of the index file.
        train: unused; kept for interface compatibility.
        one_hot: if true, labels are converted to integers and returned as
            one-hot rows via `dense_to_one_hot`; otherwise the label strings
            (without line endings) are returned as a list.

    Raises:
        ValueError: if a non-blank line does not contain exactly one tab, or
            (with `one_hot`) a label is not an integer.
    """
    labels = []
    # `with` guarantees the file handle is closed.
    with open(names_file) as index_file:
        for line in index_file:
            line = line.rstrip("\r\n")  # the label must not carry the newline
            if not line:
                continue  # tolerate blank / trailing empty lines
            image_file, image_label = line.split("\t")
            labels.append(image_label)
    if one_hot:
        # One-hot indexing needs integer class ids, not strings.
        return dense_to_one_hot([int(label) for label in labels])
    return labels
464 | 
465 | 
def extract_images(names_file, train):
    """Read the image-file column of a tab-separated index file.

    Each non-blank line must have the form `<image file>\\t<label>`.

    Args:
        names_file: path of the index file.
        train: unused; kept for interface compatibility.

    Returns:
        List of image file names in file order.

    Raises:
        ValueError: if a non-blank line does not contain exactly one tab.
    """
    image_files = []
    # `with` guarantees the file handle is closed.
    with open(names_file) as index_file:
        for line in index_file:
            line = line.rstrip("\r\n")
            if not line:
                continue  # tolerate blank / trailing empty lines
            image_file, image_label = line.split("\t")
            image_files.append(image_file)
    return image_files
472 | 
473 | 
def read_data_sets(train_dir, source_data=Source.NUMBER_IMAGES, fake_data=False, one_hot=True):
    """Build a container object holding the `DataSet` splits.

    With `fake_data` the container gets `train`, `validation` and `test`
    attributes, each a fake `DataSet`. Otherwise `source_data` is fetched
    via `maybe_download`, the TRAIN_INDEX / TEST_INDEX files are parsed and
    only `train` (loaded on demand) and `test` (loaded eagerly) are set.
    """
    class DataSets(object):
        pass

    result = DataSets()
    if fake_data:
        for split in ("train", "validation", "test"):
            setattr(result, split, DataSet([], [], fake_data=True, one_hot=one_hot))
        return result

    maybe_download(source_data, train_dir)
    train_names = extract_images(TRAIN_INDEX, train=True)
    train_targets = extract_labels(TRAIN_INDEX, train=True, one_hot=one_hot)
    test_names = extract_images(TEST_INDEX, train=False)
    test_targets = extract_labels(TEST_INDEX, train=False, one_hot=one_hot)
    result.train = DataSet(train_names, train_targets, load=False)
    result.test = DataSet(test_names, test_targets, load=True)
    return result
498 | 
499 | 
if __name__ == "__main__":
    # When run as a script, fetch every data source in turn.
    print("downloading speech datasets")
    for dataset in (Source.DIGIT_SPECTROS, Source.DIGIT_WAVES,
                    Source.NUMBER_IMAGES, Source.NUMBER_WAVES):
        maybe_download(dataset)
506 | 


--------------------------------------------------------------------------------
/speech/tflearn_simple_number_classifier.py:
--------------------------------------------------------------------------------
 1 | import tflearn
 2 | from speech_data import wave_batch_generator, Target, load_wav_file, path
 3 | import numpy
 4 | 
 5 | # Simple spoken digit recognition demo, with 98% accuracy in under a minute
 6 | 
 7 | # Training Step: 544  | total loss: 0.15866
 8 | # | Adam | epoch: 034 | loss: 0.15866 - acc: 0.9818 -- iter: 0000/1000
 9 | 
if __name__ == '__main__':
    # Take a single (very large) batch from the generator and use it as the
    # entire training set.
    batch = wave_batch_generator(10000, target=Target.digits)
    X, Y = next(batch)

    number_classes = 10  # Digits

    # Classification
    tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

    # Network: input -> 64-unit dense layer -> dropout -> softmax over the digits.
    # NOTE(review): 8192 presumably matches the length of the vectors returned
    # by load_wav_file — confirm against speech_data.
    net = tflearn.input_data(shape=[None, 8192])
    net = tflearn.fully_connected(net, 64)
    net = tflearn.dropout(net, 0.5)
    net = tflearn.fully_connected(net, number_classes, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')

    model = tflearn.DNN(net)
    model.fit(X, Y, n_epoch=3, show_metric=True, snapshot_step=100)
    # Overfitting okay for now

    # Smoke test: classify one recording; the expected digit is the first
    # character of the file name.
    demo_file = "5_Vicki_260.wav"
    demo = load_wav_file(path + demo_file)
    result = model.predict([demo])
    # Index of the highest score is the predicted digit.
    result = numpy.argmax(result)
    print("predicted digit for %s : result = %d " % (demo_file, result))
34 | 


--------------------------------------------------------------------------------