├── .gitignore ├── README.md ├── mnist ├── mnist_advanced.py ├── mnist_softmax.py └── mnist_visualized.py ├── requirements.txt ├── rnn ├── lstm │ └── words_prediction │ │ ├── data.tgz │ │ ├── ptb_word_lm.py │ │ └── reader.py ├── text8.zip └── word2vec_basic.py └── speech ├── generate_speech_data.py ├── mll_data.py ├── mll_irrelevant_words.txt ├── mll_lstm.py ├── mll_relevant_words.txt ├── record_to_wav.py ├── speech_data.py └── tflearn_simple_number_classifier.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | speech/data 3 | /rnn/lstm/words_prediction/data/ 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tensorflow-playground 2 | A simple playground project for TensorFlow Python lib 3 | -------------------------------------------------------------------------------- /mnist/mnist_advanced.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Advanced MNIST classifier. 
17 | See extensive documentation at 18 | https://www.tensorflow.org/tutorials/mnist/pros/ 19 | """ 20 | from __future__ import absolute_import 21 | from __future__ import division 22 | from __future__ import print_function 23 | 24 | import argparse 25 | import sys 26 | from datetime import datetime 27 | 28 | # Import data 29 | from tensorflow.examples.tutorials.mnist import input_data 30 | 31 | import tensorflow as tf 32 | 33 | FLAGS = None 34 | 35 | 36 | def weight_variable(shape): 37 | initial = tf.truncated_normal(shape, stddev=0.1) 38 | return tf.Variable(initial) 39 | 40 | 41 | def bias_variable(shape): 42 | initial = tf.constant(0.1, shape=shape) 43 | return tf.Variable(initial) 44 | 45 | 46 | def conv2d(x, W): 47 | return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME') 48 | 49 | 50 | def max_pool_2x2(x): 51 | return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME') 52 | 53 | 54 | def main(_): 55 | mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True) 56 | 57 | # Create the model 58 | 59 | # Data placeholders 60 | x = tf.placeholder(tf.float32, shape=[None, 784]) 61 | y_ = tf.placeholder(tf.float32, shape=[None, 10]) 62 | 63 | # First Convolutional Layer 64 | W_conv1 = weight_variable([5, 5, 1, 32]) 65 | b_conv1 = bias_variable([32]) 66 | x_image = tf.reshape(x, [-1, 28, 28, 1]) 67 | h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1) 68 | h_pool1 = max_pool_2x2(h_conv1) 69 | 70 | # Second Convolutional Layer 71 | W_conv2 = weight_variable([5, 5, 32, 64]) 72 | b_conv2 = bias_variable([64]) 73 | h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2) 74 | h_pool2 = max_pool_2x2(h_conv2) 75 | 76 | # Densely Connected Layer 77 | W_fc1 = weight_variable([7 * 7 * 64, 1024]) 78 | b_fc1 = bias_variable([1024]) 79 | h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64]) 80 | h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1) 81 | 82 | # Dropout 83 | keep_prob = tf.placeholder(tf.float32) 84 | h_fc1_drop = 
tf.nn.dropout(h_fc1, keep_prob) 85 | 86 | # Readout Layer 87 | W_fc2 = weight_variable([1024, 10]) 88 | b_fc2 = bias_variable([10]) 89 | y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2 90 | 91 | # Define loss and optimizer 92 | cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y_conv, y_)) 93 | train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy) 94 | correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1)) 95 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 96 | sess = tf.InteractiveSession() 97 | sess.run(tf.global_variables_initializer()) 98 | 99 | # Train 100 | for i in range(5000): 101 | batch_xs, batch_ys = mnist.train.next_batch(200) 102 | if i % 100 == 0: 103 | train_accuracy = accuracy.eval(feed_dict={ 104 | x: batch_xs, y_: batch_ys, keep_prob: 1.0}) 105 | print("[%s] step %d, trained by %d examples, estimated prediction accuracy: %g%%" % 106 | (datetime.now().strftime("%H:%M"), i, i * 200, train_accuracy * 100)) 107 | train_step.run(feed_dict={x: batch_xs, y_: batch_ys, keep_prob: 0.5}) 108 | 109 | # Test trained model 110 | final_accuracy = accuracy.eval(feed_dict={ 111 | x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}) 112 | print("Trained model final accuracy on test data is %g%%" % (final_accuracy * 100)) 113 | 114 | 115 | if __name__ == '__main__': 116 | parser = argparse.ArgumentParser() 117 | parser.add_argument('--data_dir', type=str, default='/tmp/tensorflow/mnist/input_data', 118 | help='Directory for storing input data') 119 | FLAGS, unparsed = parser.parse_known_args() 120 | tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) 121 | -------------------------------------------------------------------------------- /mnist/mnist_softmax.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """A very simple MNIST classifier. 17 | See extensive documentation at 18 | http://tensorflow.org/tutorials/mnist/beginners/index.md 19 | """ 20 | from __future__ import absolute_import 21 | from __future__ import division 22 | from __future__ import print_function 23 | 24 | import argparse 25 | import sys 26 | 27 | # Import data 28 | from tensorflow.examples.tutorials.mnist import input_data 29 | 30 | import tensorflow as tf 31 | 32 | FLAGS = None 33 | 34 | 35 | def main(_): 36 | mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True) 37 | 38 | # Create the model 39 | x = tf.placeholder(tf.float32, [None, 784]) 40 | W = tf.Variable(tf.zeros([784, 10])) 41 | b = tf.Variable(tf.zeros([10])) 42 | y = tf.matmul(x, W) + b 43 | 44 | # Define loss and optimizer 45 | y_ = tf.placeholder(tf.float32, [None, 10]) 46 | 47 | # The raw formulation of cross-entropy, 48 | # 49 | # tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.nn.softmax(y)), 50 | # reduction_indices=[1])) 51 | # 52 | # can be numerically unstable. 53 | # 54 | # So here we use tf.nn.softmax_cross_entropy_with_logits on the raw 55 | # outputs of 'y', and then average across the batch. 
56 | cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y, y_)) 57 | train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy) 58 | 59 | sess = tf.InteractiveSession() 60 | # Train 61 | tf.global_variables_initializer().run() 62 | for _ in range(1000): 63 | batch_xs, batch_ys = mnist.train.next_batch(100) 64 | sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys}) 65 | 66 | # Test trained model 67 | correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1)) 68 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 69 | print(sess.run(accuracy, feed_dict={x: mnist.test.images, 70 | y_: mnist.test.labels})) 71 | 72 | 73 | if __name__ == '__main__': 74 | parser = argparse.ArgumentParser() 75 | parser.add_argument('--data_dir', type=str, default='/tmp/tensorflow/mnist/input_data', 76 | help='Directory for storing input data') 77 | FLAGS, unparsed = parser.parse_known_args() 78 | tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) 79 | -------------------------------------------------------------------------------- /mnist/mnist_visualized.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the 'License'); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an 'AS IS' BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """A simple MNIST classifier which displays summaries in TensorBoard. 16 | 17 | This is an unimpressive MNIST model, but it is a good example of using 18 | tf.name_scope to make a graph legible in the TensorBoard graph explorer, and of 19 | naming summary tags so that they are grouped meaningfully in TensorBoard. 20 | 21 | It demonstrates the functionality of every TensorBoard dashboard. 22 | """ 23 | from __future__ import absolute_import 24 | from __future__ import division 25 | from __future__ import print_function 26 | 27 | import argparse 28 | import sys 29 | 30 | import tensorflow as tf 31 | 32 | from tensorflow.examples.tutorials.mnist import input_data 33 | 34 | FLAGS = None 35 | 36 | 37 | def train(): 38 | # Import data 39 | mnist = input_data.read_data_sets(FLAGS.data_dir, 40 | one_hot=True, 41 | fake_data=FLAGS.fake_data) 42 | 43 | sess = tf.InteractiveSession() 44 | # Create a multilayer model. 45 | 46 | # Input placeholders 47 | with tf.name_scope('input'): 48 | x = tf.placeholder(tf.float32, [None, 784], name='x-input') 49 | y_ = tf.placeholder(tf.float32, [None, 10], name='y-input') 50 | 51 | with tf.name_scope('input_reshape'): 52 | image_shaped_input = tf.reshape(x, [-1, 28, 28, 1]) 53 | tf.summary.image('input', image_shaped_input, 10) 54 | 55 | # We can't initialize these variables to 0 - the network will get stuck. 
56 | def weight_variable(shape): 57 | """Create a weight variable with appropriate initialization.""" 58 | initial = tf.truncated_normal(shape, stddev=0.1) 59 | return tf.Variable(initial) 60 | 61 | def bias_variable(shape): 62 | """Create a bias variable with appropriate initialization.""" 63 | initial = tf.constant(0.1, shape=shape) 64 | return tf.Variable(initial) 65 | 66 | def variable_summaries(var): 67 | """Attach a lot of summaries to a Tensor (for TensorBoard visualization).""" 68 | with tf.name_scope('summaries'): 69 | mean = tf.reduce_mean(var) 70 | tf.summary.scalar('mean', mean) 71 | with tf.name_scope('stddev'): 72 | stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean))) 73 | tf.summary.scalar('stddev', stddev) 74 | tf.summary.scalar('max', tf.reduce_max(var)) 75 | tf.summary.scalar('min', tf.reduce_min(var)) 76 | tf.summary.histogram('histogram', var) 77 | 78 | def nn_layer(input_tensor, input_dim, output_dim, layer_name, act=tf.nn.relu): 79 | """Reusable code for making a simple neural net layer. 80 | 81 | It does a matrix multiply, bias add, and then uses relu to nonlinearize. 82 | It also sets up name scoping so that the resultant graph is easy to read, 83 | and adds a number of summary ops. 84 | """ 85 | # Adding a name scope ensures logical grouping of the layers in the graph. 
86 | with tf.name_scope(layer_name): 87 | # This Variable will hold the state of the weights for the layer 88 | with tf.name_scope('weights'): 89 | weights = weight_variable([input_dim, output_dim]) 90 | variable_summaries(weights) 91 | with tf.name_scope('biases'): 92 | biases = bias_variable([output_dim]) 93 | variable_summaries(biases) 94 | with tf.name_scope('Wx_plus_b'): 95 | preactivate = tf.matmul(input_tensor, weights) + biases 96 | tf.summary.histogram('pre_activations', preactivate) 97 | activations = act(preactivate, name='activation') 98 | tf.summary.histogram('activations', activations) 99 | return activations 100 | 101 | hidden1 = nn_layer(x, 784, 500, 'layer1') 102 | 103 | with tf.name_scope('dropout'): 104 | keep_prob = tf.placeholder(tf.float32) 105 | tf.summary.scalar('dropout_keep_probability', keep_prob) 106 | dropped = tf.nn.dropout(hidden1, keep_prob) 107 | 108 | # Do not apply softmax activation yet, see below. 109 | y = nn_layer(dropped, 500, 10, 'layer2', act=tf.identity) 110 | 111 | with tf.name_scope('cross_entropy'): 112 | # The raw formulation of cross-entropy, 113 | # 114 | # tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.softmax(y)), 115 | # reduction_indices=[1])) 116 | # 117 | # can be numerically unstable. 118 | # 119 | # So here we use tf.nn.softmax_cross_entropy_with_logits on the 120 | # raw outputs of the nn_layer above, and then average across 121 | # the batch. 
122 | diff = tf.nn.softmax_cross_entropy_with_logits(y, y_) 123 | with tf.name_scope('total'): 124 | cross_entropy = tf.reduce_mean(diff) 125 | tf.summary.scalar('cross_entropy', cross_entropy) 126 | 127 | with tf.name_scope('train'): 128 | train_step = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize( 129 | cross_entropy) 130 | 131 | with tf.name_scope('accuracy'): 132 | with tf.name_scope('correct_prediction'): 133 | correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1)) 134 | with tf.name_scope('accuracy'): 135 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 136 | tf.summary.scalar('accuracy', accuracy) 137 | 138 | # Merge all the summaries and write them out to /tmp/mnist_logs (by default) 139 | merged = tf.summary.merge_all() 140 | train_writer = tf.summary.FileWriter(FLAGS.log_dir + '/train', sess.graph) 141 | test_writer = tf.summary.FileWriter(FLAGS.log_dir + '/test') 142 | tf.global_variables_initializer().run() 143 | 144 | # Train the model, and also write summaries. 
145 | # Every 10th step, measure test-set accuracy, and write test summaries 146 | # All other steps, run train_step on training data, & add training summaries 147 | 148 | def feed_dict(train): 149 | """Make a TensorFlow feed_dict: maps data onto Tensor placeholders.""" 150 | if train or FLAGS.fake_data: 151 | xs, ys = mnist.train.next_batch(100, fake_data=FLAGS.fake_data) 152 | k = FLAGS.dropout 153 | else: 154 | xs, ys = mnist.test.images, mnist.test.labels 155 | k = 1.0 156 | return {x: xs, y_: ys, keep_prob: k} 157 | 158 | for i in range(FLAGS.max_steps): 159 | if i % 10 == 0: # Record summaries and test-set accuracy 160 | summary, acc = sess.run([merged, accuracy], feed_dict=feed_dict(False)) 161 | test_writer.add_summary(summary, i) 162 | print('Accuracy at step %s: %s' % (i, acc)) 163 | else: # Record train set summaries, and train 164 | if i % 100 == 99: # Record execution stats 165 | run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) 166 | run_metadata = tf.RunMetadata() 167 | summary, _ = sess.run([merged, train_step], 168 | feed_dict=feed_dict(True), 169 | options=run_options, 170 | run_metadata=run_metadata) 171 | train_writer.add_run_metadata(run_metadata, 'step%03d' % i) 172 | train_writer.add_summary(summary, i) 173 | print('Adding run metadata for', i) 174 | else: # Record a summary 175 | summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True)) 176 | train_writer.add_summary(summary, i) 177 | train_writer.close() 178 | test_writer.close() 179 | 180 | 181 | def main(_): 182 | if tf.gfile.Exists(FLAGS.log_dir): 183 | tf.gfile.DeleteRecursively(FLAGS.log_dir) 184 | tf.gfile.MakeDirs(FLAGS.log_dir) 185 | train() 186 | 187 | 188 | if __name__ == '__main__': 189 | parser = argparse.ArgumentParser() 190 | parser.add_argument('--fake_data', nargs='?', const=True, type=bool, 191 | default=False, 192 | help='If true, uses fake data for unit testing.') 193 | parser.add_argument('--max_steps', type=int, default=1000, 194 | 
help='Number of steps to run trainer.') 195 | parser.add_argument('--learning_rate', type=float, default=0.001, 196 | help='Initial learning rate') 197 | parser.add_argument('--dropout', type=float, default=0.9, 198 | help='Keep probability for training dropout.') 199 | parser.add_argument('--data_dir', type=str, default='/tmp/tensorflow/mnist/input_data', 200 | help='Directory for storing input data') 201 | parser.add_argument('--log_dir', type=str, default='/tmp/tensorflow/mnist/logs/mnist_with_summaries', 202 | help='Summaries log directory') 203 | FLAGS, unparsed = parser.parse_known_args() 204 | tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) 205 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | #required 2 | tensorflow 3 | tflearn 4 | numpy 5 | 6 | #optional 7 | scikit-image 8 | pyaudio 9 | wave 10 | python_speech_features 11 | matplotlib 12 | librosa 13 | scikits.talkbox 14 | -------------------------------------------------------------------------------- /rnn/lstm/words_prediction/data.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dangartman/tensorflow-playground/64ec42a7c5c3cc9da61e80eccac94e4efc3aac01/rnn/lstm/words_prediction/data.tgz -------------------------------------------------------------------------------- /rnn/lstm/words_prediction/ptb_word_lm.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Example / benchmark for building a PTB LSTM model. 17 | 18 | Trains the model described in: 19 | (Zaremba, et. al.) Recurrent Neural Network Regularization 20 | http://arxiv.org/abs/1409.2329 21 | 22 | There are 3 supported model configurations: 23 | =========================================== 24 | | config | epochs | train | valid | test 25 | =========================================== 26 | | small | 13 | 37.99 | 121.39 | 115.91 27 | | medium | 39 | 48.45 | 86.16 | 82.07 28 | | large | 55 | 37.87 | 82.62 | 78.29 29 | The exact results may vary depending on the random initialization. 
30 | 31 | The hyperparameters used in the model: 32 | - init_scale - the initial scale of the weights 33 | - learning_rate - the initial value of the learning rate 34 | - max_grad_norm - the maximum permissible norm of the gradient 35 | - num_layers - the number of LSTM layers 36 | - num_steps - the number of unrolled steps of LSTM 37 | - hidden_size - the number of LSTM units 38 | - max_epoch - the number of epochs trained with the initial learning rate 39 | - max_max_epoch - the total number of epochs for training 40 | - keep_prob - the probability of keeping weights in the dropout layer 41 | - lr_decay - the decay of the learning rate for each epoch after "max_epoch" 42 | - batch_size - the batch size 43 | 44 | The data required for this example is in the data/ dir of the 45 | PTB dataset from Tomas Mikolov's webpage: 46 | 47 | $ wget http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz 48 | $ tar xvf simple-examples.tgz 49 | 50 | To run: 51 | 52 | $ python ptb_word_lm.py --data_path=simple-examples/data/ 53 | 54 | """ 55 | from __future__ import absolute_import 56 | from __future__ import division 57 | from __future__ import print_function 58 | 59 | import time 60 | 61 | import numpy as np 62 | import tensorflow as tf 63 | 64 | import reader 65 | 66 | flags = tf.flags 67 | logging = tf.logging 68 | 69 | flags.DEFINE_string( 70 | "model", "small", 71 | "A type of model. 
Possible options are: small, medium, large.") 72 | flags.DEFINE_string("data_path", None, 73 | "Where the training/test data is stored.") 74 | flags.DEFINE_string("save_path", None, 75 | "Model output directory.") 76 | flags.DEFINE_bool("use_fp16", False, 77 | "Train using 16-bit floats instead of 32bit floats") 78 | 79 | FLAGS = flags.FLAGS 80 | 81 | 82 | def data_type(): 83 | return tf.float16 if FLAGS.use_fp16 else tf.float32 84 | 85 | 86 | class SmallConfig(object): 87 | """Small config.""" 88 | init_scale = 0.1 89 | learning_rate = 1.0 90 | max_grad_norm = 5 91 | num_layers = 2 92 | num_steps = 20 93 | hidden_size = 200 94 | max_epoch = 4 95 | max_max_epoch = 13 96 | keep_prob = 1.0 97 | lr_decay = 0.5 98 | batch_size = 20 99 | vocab_size = 10000 100 | 101 | 102 | class MediumConfig(object): 103 | """Medium config.""" 104 | init_scale = 0.05 105 | learning_rate = 1.0 106 | max_grad_norm = 5 107 | num_layers = 2 108 | num_steps = 35 109 | hidden_size = 650 110 | max_epoch = 6 111 | max_max_epoch = 39 112 | keep_prob = 0.5 113 | lr_decay = 0.8 114 | batch_size = 20 115 | vocab_size = 10000 116 | 117 | 118 | class LargeConfig(object): 119 | """Large config.""" 120 | init_scale = 0.04 121 | learning_rate = 1.0 122 | max_grad_norm = 10 123 | num_layers = 2 124 | num_steps = 35 125 | hidden_size = 1500 126 | max_epoch = 14 127 | max_max_epoch = 55 128 | keep_prob = 0.35 129 | lr_decay = 1 / 1.15 130 | batch_size = 20 131 | vocab_size = 10000 132 | 133 | 134 | class TestConfig(object): 135 | """Tiny config, for testing.""" 136 | init_scale = 0.1 137 | learning_rate = 1.0 138 | max_grad_norm = 1 139 | num_layers = 1 140 | num_steps = 2 141 | hidden_size = 2 142 | max_epoch = 1 143 | max_max_epoch = 1 144 | keep_prob = 1.0 145 | lr_decay = 0.5 146 | batch_size = 20 147 | vocab_size = 10000 148 | 149 | 150 | def get_config(): 151 | if FLAGS.model == "small": 152 | return SmallConfig() 153 | elif FLAGS.model == "medium": 154 | return MediumConfig() 155 | elif FLAGS.model 
== "large": 156 | return LargeConfig() 157 | elif FLAGS.model == "test": 158 | return TestConfig() 159 | else: 160 | raise ValueError("Invalid model: %s", FLAGS.model) 161 | 162 | 163 | class PTBInput(object): 164 | """The input data.""" 165 | 166 | def __init__(self, config, data, name=None): 167 | self.batch_size = batch_size = config.batch_size 168 | self.num_steps = num_steps = config.num_steps 169 | self.epoch_size = ((len(data) // batch_size) - 1) // num_steps 170 | self.input_data, self.targets = reader.ptb_producer( 171 | data, batch_size, num_steps, name=name) 172 | 173 | 174 | class PTBModel(object): 175 | """The PTB model.""" 176 | 177 | def assign_lr(self, session, lr_value): 178 | session.run(self._lr_update, feed_dict={self._new_lr: lr_value}) 179 | 180 | @property 181 | def input(self): 182 | return self._input 183 | 184 | @property 185 | def initial_state(self): 186 | return self._initial_state 187 | 188 | @property 189 | def cost(self): 190 | return self._cost 191 | 192 | @property 193 | def loss(self): 194 | return self._loss 195 | 196 | @property 197 | def final_state(self): 198 | return self._final_state 199 | 200 | @property 201 | def lr(self): 202 | return self._lr 203 | 204 | @property 205 | def train_op(self): 206 | return self._train_op 207 | 208 | def __init__(self, is_training, config, input_): 209 | self._input = input_ 210 | 211 | batch_size = input_.batch_size 212 | num_steps = input_.num_steps 213 | size = config.hidden_size 214 | vocab_size = config.vocab_size 215 | 216 | # Slightly better results can be obtained with forget gate biases 217 | # initialized to 1 but the hyperparameters of the model would need to be 218 | # different than reported in the paper. 
219 | lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(size, forget_bias=0.0, state_is_tuple=True) 220 | if is_training and config.keep_prob < 1: 221 | lstm_cell = tf.nn.rnn_cell.DropoutWrapper( 222 | lstm_cell, output_keep_prob=config.keep_prob) 223 | cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * config.num_layers, state_is_tuple=True) 224 | 225 | self._initial_state = cell.zero_state(batch_size, data_type()) 226 | 227 | with tf.device("/cpu:0"): 228 | embedding = tf.get_variable( 229 | "embedding", [vocab_size, size], dtype=data_type()) 230 | inputs = tf.nn.embedding_lookup(embedding, input_.input_data) 231 | 232 | if is_training and config.keep_prob < 1: 233 | inputs = tf.nn.dropout(inputs, config.keep_prob) 234 | 235 | # Simplified version of tensorflow.models.rnn.rnn.py's rnn(). 236 | # This builds an unrolled LSTM for tutorial purposes only. 237 | # In general, use the rnn() or state_saving_rnn() from rnn.py. 238 | # 239 | # The alternative version of the code below is: 240 | # 241 | # inputs = tf.unstack(inputs, num=num_steps, axis=1) 242 | # outputs, state = tf.nn.rnn(cell, inputs, initial_state=self._initial_state) 243 | outputs = [] 244 | state = self._initial_state 245 | with tf.variable_scope("RNN"): 246 | for time_step in range(num_steps): 247 | if time_step > 0: tf.get_variable_scope().reuse_variables() 248 | (cell_output, state) = cell(inputs[:, time_step, :], state) 249 | outputs.append(cell_output) 250 | 251 | output = tf.reshape(tf.concat(1, outputs), [-1, size]) 252 | print("output shape: ", output.get_shape()) 253 | softmax_w = tf.get_variable( 254 | "softmax_w", [size, vocab_size], dtype=data_type()) 255 | softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type()) 256 | logits = tf.matmul(output, softmax_w) + softmax_b 257 | print("logits shape: ", logits.get_shape()) 258 | loss = tf.nn.seq2seq.sequence_loss_by_example( 259 | [logits], 260 | [tf.reshape(input_.targets, [-1])], 261 | [tf.ones([batch_size * num_steps], 
dtype=data_type())]) 262 | self._loss = loss 263 | self._cost = cost = tf.reduce_sum(loss) / batch_size 264 | self._final_state = state 265 | 266 | if not is_training: 267 | return 268 | 269 | self._lr = tf.Variable(0.0, trainable=False) 270 | tvars = tf.trainable_variables() 271 | grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 272 | config.max_grad_norm) 273 | optimizer = tf.train.GradientDescentOptimizer(self._lr) 274 | self._train_op = optimizer.apply_gradients( 275 | zip(grads, tvars), 276 | global_step=tf.contrib.framework.get_or_create_global_step()) 277 | 278 | self._new_lr = tf.placeholder( 279 | tf.float32, shape=[], name="new_learning_rate") 280 | self._lr_update = tf.assign(self._lr, self._new_lr) 281 | 282 | 283 | def run_epoch(session, model, eval_op=None, verbose=False): 284 | """Runs the model on the given data.""" 285 | start_time = time.time() 286 | costs = 0.0 287 | iters = 0 288 | state = session.run(model.initial_state) 289 | 290 | fetches = { 291 | "cost": model.cost, 292 | "final_state": model.final_state, 293 | } 294 | if eval_op is not None: 295 | fetches["eval_op"] = eval_op 296 | 297 | for step in range(model.input.epoch_size): 298 | feed_dict = {} 299 | for i, (c, h) in enumerate(model.initial_state): 300 | feed_dict[c] = state[i].c 301 | feed_dict[h] = state[i].h 302 | 303 | vals = session.run(fetches, feed_dict) 304 | cost = vals["cost"] 305 | state = vals["final_state"] 306 | 307 | costs += cost 308 | iters += model.input.num_steps 309 | 310 | if verbose and step % (model.input.epoch_size // 10) == 10: 311 | print("%.3f perplexity: %.3f speed: %.0f wps" % 312 | (step * 1.0 / model.input.epoch_size, np.exp(costs / iters), 313 | iters * model.input.batch_size / (time.time() - start_time))) 314 | 315 | return np.exp(costs / iters) 316 | 317 | 318 | def main(_): 319 | if not FLAGS.data_path: 320 | raise ValueError("Must set --data_path to PTB data directory") 321 | 322 | raw_data = reader.ptb_raw_data(FLAGS.data_path) 323 | 
train_data, valid_data, test_data, _ = raw_data 324 | 325 | config = get_config() 326 | eval_config = get_config() 327 | eval_config.batch_size = 1 328 | eval_config.num_steps = 1 329 | 330 | with tf.Graph().as_default(): 331 | initializer = tf.random_uniform_initializer(-config.init_scale, 332 | config.init_scale) 333 | 334 | with tf.name_scope("Train"): 335 | train_input = PTBInput(config=config, data=train_data, name="TrainInput") 336 | with tf.variable_scope("Model", reuse=None, initializer=initializer): 337 | m = PTBModel(is_training=True, config=config, input_=train_input) 338 | tf.scalar_summary("Training Loss", m.cost) 339 | tf.scalar_summary("Learning Rate", m.lr) 340 | 341 | with tf.name_scope("Valid"): 342 | valid_input = PTBInput(config=config, data=valid_data, name="ValidInput") 343 | with tf.variable_scope("Model", reuse=True, initializer=initializer): 344 | mvalid = PTBModel(is_training=False, config=config, input_=valid_input) 345 | tf.scalar_summary("Validation Loss", mvalid.cost) 346 | 347 | with tf.name_scope("Test"): 348 | test_input = PTBInput(config=eval_config, data=test_data, name="TestInput") 349 | with tf.variable_scope("Model", reuse=True, initializer=initializer): 350 | mtest = PTBModel(is_training=False, config=eval_config, 351 | input_=test_input) 352 | 353 | sv = tf.train.Supervisor(logdir=FLAGS.save_path) 354 | with sv.managed_session() as session: 355 | for i in range(config.max_max_epoch): 356 | lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0) 357 | m.assign_lr(session, config.learning_rate * lr_decay) 358 | 359 | print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr))) 360 | train_perplexity = run_epoch(session, m, eval_op=m.train_op, 361 | verbose=True) 362 | print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity)) 363 | valid_perplexity = run_epoch(session, mvalid) 364 | print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity)) 365 | 366 | test_perplexity = run_epoch(session, 
mtest) 367 | print("Test Perplexity: %.3f" % test_perplexity) 368 | 369 | if FLAGS.save_path: 370 | print("Saving model to %s." % FLAGS.save_path) 371 | sv.saver.save(session, FLAGS.save_path, global_step=sv.global_step) 372 | 373 | 374 | if __name__ == "__main__": 375 | tf.app.run() 376 | -------------------------------------------------------------------------------- /rnn/lstm/words_prediction/reader.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
def _read_words(filename):
    """Read `filename` and return its whitespace-separated tokens.

    Newline characters are removed (not replaced by a marker) before the text
    is split, so sentence boundaries are not represented in the result.

    Args:
      filename: path handed to tf.gfile.GFile.

    Returns:
      list of token strings.
    """
    with tf.gfile.GFile(filename, "r") as f:
        text = f.read()
    # Only byte strings need decoding; on Python 3 text has no decode(), so an
    # unconditional .decode("utf-8") raises AttributeError there.
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    # NOTE(review): removing "\n" without inserting a separator fuses the last
    # token of a line with the first token of the next unless the lines carry
    # their own leading/trailing spaces -- confirm against the data files.
    return text.replace("\n", "").split()


def _build_vocab(filename):
    """Build a word -> integer id mapping from the tokens of `filename`.

    Ids are assigned in order of decreasing frequency; ties are broken by the
    word itself, so the mapping is deterministic for a given file.

    Args:
      filename: path of the text file to scan.

    Returns:
      dict mapping each distinct word to its id (empty for an empty file).
    """
    data = _read_words(filename)

    counter = collections.Counter(data)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

    # Enumerating the sorted pairs (instead of unpacking zip(*pairs)) also
    # works when the file contains no words at all.
    return dict((word, index) for index, (word, _) in enumerate(count_pairs))


def _file_to_word_ids(filename, word_to_id):
    """Convert the tokens of `filename` to ids using `word_to_id`.

    Tokens that are missing from `word_to_id` are silently dropped.

    Args:
      filename: path of the text file to convert.
      word_to_id: dict as returned by _build_vocab.

    Returns:
      list of integer ids, in file order.
    """
    data = _read_words(filename)
    return [word_to_id[word] for word in data if word in word_to_id]


def ptb_raw_data(data_path=None):
    """Load PTB raw data from data directory "data_path".

    Reads the ptb.train.txt, ptb.valid.txt and ptb.test.txt files found in
    `data_path` and converts their words to integer ids. The vocabulary is
    built from the training file only.

    The PTB dataset comes from Tomas Mikolov's webpage:

    http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz

    Args:
      data_path: string path to the directory where simple-examples.tgz has
        been extracted.

    Returns:
      tuple (train_data, valid_data, test_data, vocabulary)
      where each of the data objects is a list of word ids that can be passed
      to ptb_producer, and vocabulary is the number of distinct training words.

    Raises:
      ValueError: if data_path is None.
    """
    if data_path is None:
        # Fail with a clear message instead of an opaque error from os.path.join.
        raise ValueError("data_path must be the directory holding the ptb.*.txt files")

    train_path = os.path.join(data_path, "ptb.train.txt")
    valid_path = os.path.join(data_path, "ptb.valid.txt")
    test_path = os.path.join(data_path, "ptb.test.txt")

    word_to_id = _build_vocab(train_path)
    train_data = _file_to_word_ids(train_path, word_to_id)
    valid_data = _file_to_word_ids(valid_path, word_to_id)
    test_data = _file_to_word_ids(test_path, word_to_id)
    vocabulary = len(word_to_id)
    return train_data, valid_data, test_data, vocabulary


def ptb_producer(raw_data, batch_size, num_steps, name=None):
    """Iterate on the raw PTB data.

    This chunks up raw_data into batches of examples and returns Tensors that
    are drawn from these batches.

    Args:
      raw_data: one of the raw data outputs from ptb_raw_data.
      batch_size: int, the batch size.
      num_steps: int, the number of unrolls.
      name: the name of this operation (optional).

    Returns:
      A pair of Tensors, each shaped [batch_size, num_steps]. The second element
      of the tuple is the same data time-shifted to the right by one.

    Raises:
      tf.errors.InvalidArgumentError: if batch_size or num_steps are too high.
    """
    with tf.name_scope(name, "PTBProducer", [raw_data, batch_size, num_steps]):
        raw_data = tf.convert_to_tensor(raw_data, name="raw_data", dtype=tf.int32)

        # Drop the tail that does not fill a whole row, then lay the data out
        # as batch_size rows of batch_len consecutive ids.
        data_len = tf.size(raw_data)
        batch_len = data_len // batch_size
        data = tf.reshape(raw_data[0: batch_size * batch_len],
                          [batch_size, batch_len])

        # One id per row is reserved because y is x shifted by one position.
        epoch_size = (batch_len - 1) // num_steps
        assertion = tf.assert_positive(
            epoch_size,
            message="epoch_size == 0, decrease batch_size or num_steps")
        with tf.control_dependencies([assertion]):
            epoch_size = tf.identity(epoch_size, name="epoch_size")

        # i walks 0..epoch_size-1 in order (shuffle=False) and selects the window.
        i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue()
        x = tf.slice(data, [0, i * num_steps], [batch_size, num_steps])
        y = tf.slice(data, [0, i * num_steps + 1], [batch_size, num_steps])
        return x, y
# Step 1: Download the data.
url = 'http://mattmahoney.net/dc/'


def maybe_download(filename, expected_bytes):
    """Make sure `filename` exists locally and has the expected size.

    When the file is absent it is fetched from `url + filename`. The size of
    the resulting file is then compared against `expected_bytes`.

    Args:
      filename: local file name; also the name appended to `url`.
      expected_bytes: size in bytes the file must have.

    Returns:
      the path of the verified file.

    Raises:
      Exception: if the file size differs from expected_bytes.
    """
    local_path = filename
    if not os.path.exists(local_path):
        local_path, _ = urllib.request.urlretrieve(url + filename, filename)
    actual_bytes = os.stat(local_path).st_size
    if actual_bytes != expected_bytes:
        # Show the size that was found before reporting the failure.
        print(actual_bytes)
        raise Exception(
            'Failed to verify ' + local_path + '. Can you get to it with a browser?')
    print('Found and verified', local_path)
    return local_path


# Read the data into a list of strings.
def read_data(filename):
    """Return the words of the first member of the zip archive `filename`.

    The member's content is converted with tf.compat.as_str and split on
    whitespace.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        tokens = tf.compat.as_str(archive.read(first_member)).split()
    return tokens
vocabulary_size = 50000


def build_dataset(words, vocab_size=None):
    """Encode `words` as integer ids, mapping rare words to the UNK id 0.

    The `vocab_size - 1` most frequent words receive ids 1, 2, ... in order of
    decreasing frequency; every other word is encoded as 0 ('UNK').

    Args:
      words: list of word strings.
      vocab_size: total number of ids including UNK. Defaults to the
        module-level `vocabulary_size`.

    Returns:
      tuple (data, count, dictionary, reverse_dictionary) where
      data is the list of ids for `words`,
      count is [['UNK', n_unknown], (word, frequency), ...],
      dictionary maps word -> id and reverse_dictionary maps id -> word.
    """
    if vocab_size is None:
        vocab_size = vocabulary_size

    # The UNK entry is a list so its counter can be filled in afterwards.
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocab_size - 1))

    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)

    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word)
        if index is None:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count

    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary
def generate_batch(batch_size, num_skips, skip_window):
    """Produce one skip-gram training batch from the module-level `data`.

    A window of 2 * skip_window + 1 ids slides over `data`, starting at the
    module-level cursor `data_index` (which is advanced and wraps around).
    For every window position the centre id is emitted `num_skips` times,
    each time paired with a different randomly chosen id from the window.

    Args:
      batch_size: number of (input, label) pairs; must be a multiple of
        num_skips.
      num_skips: pairs generated per window position; at most 2 * skip_window.
      skip_window: number of ids considered on each side of the centre.

    Returns:
      (batch, labels): int32 arrays shaped [batch_size] and [batch_size, 1].
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    centers = np.ndarray(shape=(batch_size), dtype=np.int32)
    contexts = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    window = collections.deque(maxlen=span)
    corpus_len = len(data)

    # Fill the window with the first `span` ids at the cursor.
    for _ in range(span):
        window.append(data[data_index])
        data_index = (data_index + 1) % corpus_len

    row = 0
    for _ in range(batch_size // num_skips):
        target = skip_window  # starts on the centre, which is always excluded
        used = [skip_window]
        for _ in range(num_skips):
            # Re-draw until a window slot is found that was not used yet.
            while target in used:
                target = random.randint(0, span - 1)
            used.append(target)
            centers[row] = window[skip_window]
            contexts[row, 0] = window[target]
            row += 1
        # Appending to the full deque drops the oldest id: the window slides.
        window.append(data[data_index])
        data_index = (data_index + 1) % corpus_len
    return centers, contexts
# valid_size distinct word ids drawn from the first valid_window ids.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64  # Number of negative examples to sample.

graph = tf.Graph()

with graph.as_default():
    # Input data: one centre-word id and one context-word id per batch row.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Ops and variables pinned to the CPU because of missing GPU implementation
    with tf.device('/cpu:0'):
        # Look up embeddings for inputs.
        embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)

        # Construct the variables for the NCE loss
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Compute the average NCE loss for the batch.
    # tf.nn.nce_loss automatically draws a new sample of the negative labels
    # each time we evaluate the loss.
    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                       biases=nce_biases,
                       labels=train_labels,
                       inputs=embed,
                       num_sampled=num_sampled,
                       num_classes=vocabulary_size))

    # Construct the SGD optimizer using a learning rate of 1.0.
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

    # Compute the cosine similarity between the validation words and all
    # embeddings: rows are scaled to unit length, so the dot product below
    # is the cosine.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(
        normalized_embeddings, valid_dataset)
    similarity = tf.matmul(
        valid_embeddings, normalized_embeddings, transpose_b=True)

    # Add variable initializer.
    init = tf.global_variables_initializer()

# Step 5: Begin training.
num_steps = 100001

with tf.Session(graph=graph) as session:
    # We must initialize all variables before we use them.
    init.run()
    print("Initialized")

    average_loss = 0
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch(
            batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        # We perform one update step by evaluating the optimizer op (including
        # it in the list of returned values for session.run()).
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            # At step 0 the accumulator holds a single batch loss, so it is
            # reported without dividing.
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print("Average loss at step ", step, ": ", average_loss)
            average_loss = 0

        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in xrange(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbors
                # Rank by descending similarity; position 0 of the ranking is
                # skipped, which is normally the query word itself.
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = "Nearest to %s:" % valid_word
                for k in xrange(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = "%s %s," % (log_str, close_word)
                print(log_str)
    # Keep the unit-length embeddings as an array for the visualization step.
    final_embeddings = normalized_embeddings.eval()

# Step 6: Visualize the embeddings.
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    """Draw one annotated point per label and save the figure to `filename`.

    Uses the module-level `plt`, which the script below binds to
    matplotlib.pyplot before calling this function.

    Args:
      low_dim_embs: array whose rows are unpacked as (x, y) positions; it must
        have at least as many rows as there are labels.
      labels: text to annotate the points with, in row order.
      filename: path of the image file to write.
    """
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))  # in inches
    # The same annotation placement is used for every point.
    annotation_style = dict(xytext=(5, 2),
                            textcoords='offset points',
                            ha='right',
                            va='bottom')
    for row, label in enumerate(labels):
        x, y = low_dim_embs[row, :]
        plt.scatter(x, y)
        plt.annotate(label, xy=(x, y), **annotation_style)

    plt.savefig(filename)


try:
    from sklearn.manifold import TSNE
    import matplotlib.pyplot as plt

    # Project the first `plot_only` embeddings to two dimensions and plot them.
    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
    plot_only = 500
    low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
    labels = [reverse_dictionary[i] for i in xrange(plot_only)]
    plot_with_labels(low_dim_embs, labels)

except ImportError:
    print("Please install sklearn, matplotlib, and scipy to visualize embeddings.")
DATA_DIR = 'data/'

NUMBERS_PATH = DATA_DIR + "spoken_numbers"
WORDS_PATH = DATA_DIR + "spoken_words_wav"
SENTENCES_PATH = DATA_DIR + "spoken_sentences_wav"
SENTENCES_MLL_PATH = DATA_DIR + "spoken_sentences_mll_wav"

# espeak voice id -> 'name' (used in generated file names) and 'rate' (the
# base value around which the speed passed to `espeak -s` is drawn).
good_voices = {
    'english-mb-en1': {'name': 'En1', 'rate': 100},
    'us-mbrola-1': {'name': 'Us1', 'rate': 120},
    'us-mbrola-2': {'name': 'Us2', 'rate': 120},
    'us-mbrola-3': {'name': 'Us3', 'rate': 120},
    'en-german': {'name': 'German', 'rate': 110},
    'en-german-5': {'name': 'German1', 'rate': 100},
    'en-romanian': {'name': 'Romanian', 'rate': 120},
    'en-dutch': {'name': 'Dutch', 'rate': 120},
    'en-french': {'name': 'French', 'rate': 110},
    'en-hungarian': {'name': 'Hungarian', 'rate': 100},
    'en-swedish': {'name': 'Swedish', 'rate': 110},
    'en-swedish-f': {'name': 'Swedish1', 'rate': 110}
}

# Same layout as good_voices; not referenced by the functions below.
bad_voices = {
    'english-us': {'name': 'Us', 'rate': 120},
    'en-greek': {'name': 'Greek', 'rate': 150},
    'english': {'name': 'En', 'rate': 120},
    'english-north': {'name': 'En2', 'rate': 130},
    'english_rp': {'name': 'En3', 'rate': 110},
    'english_wmids': {'name': 'En4', 'rate': 120},
    'en-scottish': {'name': 'Scottish', 'rate': 130},
    'en-westindies': {'name': 'Westindies', 'rate': 140},

    'en-afrikaans': {'name': 'Afrikaans', 'rate': 100},
    'en-polish': {'name': 'Polish', 'rate': 110}
}

validation_percent = 10
# Voices reserved for lines assigned to the validation subset.
validation_voices = ['us-mbrola-2', 'en-german-5']
n_features = 26


def _ensure_dir(path):
    """Create directory `path`, including missing parents, if it is absent."""
    if not os.path.isdir(path):
        os.makedirs(path)


def _read_lines(filename):
    """Return the lines of text file `filename`, closing the file afterwards."""
    with open(filename) as source:
        return source.readlines()


def check_voices():
    """Report which good_voices espeak offers and drop the missing ones.

    Runs `espeak --voices=en`, takes the fourth whitespace-separated field of
    every listed line as the voice id, prints "<id> FOUND!" for each known
    voice that is listed and "<id> MISSING!" for each that is not. Missing
    voices are removed from the module-level `good_voices`.
    """
    output = subprocess.check_output(["espeak", "--voices=en"])
    # check_output returns bytes on Python 3; str(bytes) would yield the
    # "b'...'" repr with escaped newlines instead of the actual text.
    if isinstance(output, bytes):
        output = output.decode("utf-8", "replace")
    # The first line is skipped as a header, the last as the empty remainder
    # after the final newline.
    voice_infos = output.split("\n")[1:-1]
    # A set supports repeated membership tests; a Python 3 map() iterator
    # would be exhausted after the first one.
    voices = set()
    for info in voice_infos:
        fields = info.split()
        if len(fields) > 3:
            voices.add(fields[3])

    # Iterate over snapshots of the keys because entries are deleted below.
    for voice in list(good_voices):
        if voice in voices:
            print(voice + " FOUND!")
    for voice in list(good_voices):
        if voice not in voices:
            print(voice + " MISSING!")
            del good_voices[voice]


def generate_mfcc(voice_name, voice_id, line, line_num, rate, path):
    """Speak `line` with espeak into a wav file and store its MFCC features.

    Writes <path>/wav/<line_num>_<voice_name>_<rate>.wav and, when espeak does
    not report a fatal error, <path>/mfcc/<line_num>_<voice_name>_<rate>.npy.
    Failures after the imports are reported on stdout and otherwise ignored so
    that one bad line does not stop a long generation run.

    Args:
      voice_name: short name used in the output file names.
      voice_id: espeak voice id passed to `-v`.
      line: text to speak.
      line_num: number of the line, used in the output file names.
      rate: integer speed passed to `-s`.
      path: output root containing the wav/ and mfcc/ directories.

    Raises:
      ImportError: if librosa or scikits.talkbox is not installed.
    """
    from librosa import load
    from scikits.talkbox.features import mfcc

    filename = path + "/wav/{0}_{1}_{2}.wav".format(line_num, voice_name, rate)
    try:
        out = str(subprocess.check_output([
            "espeak",
            "-v", voice_id,
            "-w", filename,
            "-s {0}".format(rate),
            line
        ], stderr=subprocess.STDOUT))
        if "FATAL ERROR" in out:
            print("CANNOT GENERATE WAV")
        else:
            signal, sample_rate = load(filename, mono=True)
            mel_features, mspec, spec = mfcc(signal, fs=sample_rate, nceps=n_features)
            np.save(path + "/mfcc/%s_%s_%d.npy" % (line_num, voice_name, rate), mel_features)
    except Exception as exc:
        # Best effort, but visible; unlike a bare except this lets
        # KeyboardInterrupt through.
        print("CANNOT GENERATE MFCC for line %s: %s" % (line_num, exc))


def generate_labels(line, path, line_num, relevant_words):
    """Build, save and return the label vector of `line`.

    The vector has one entry per relevant word plus a final "none" entry.
    Entries are -1 by default and 1 where the word occurs in `line` (split on
    single spaces); the last entry is 1 only when no relevant word occurs.
    The vector is saved to <path>/labels/<line_num>.npy.

    Args:
      line: text of the line.
      path: output root containing the labels/ directory.
      line_num: number of the line, used as the file name.
      relevant_words: sequence of words, one label position each.

    Returns:
      1-D integer array of length len(relevant_words) + 1.
    """
    num_of_labels = len(relevant_words) + 1  # Add last label if none words are relevant
    labels = np.full(num_of_labels, -1)

    # First position of every relevant word, matching list.index() semantics.
    first_index = {}
    for position, word in enumerate(relevant_words):
        first_index.setdefault(word, position)

    at_least_one_present = False
    for word in line.split(" "):
        relevant_index = first_index.get(word)
        if relevant_index is not None:
            labels[relevant_index] = 1
            at_least_one_present = True
    if not at_least_one_present:
        labels[num_of_labels - 1] = 1

    np.save(path + "/labels/%s.npy" % line_num, labels)
    return labels


def generate_phonemes(line, path):
    """Run ./line_to_phonemes on `line`; the result is not stored yet.

    TODO: convert the pronounced text to an integer line and save it under
    <path>/phonemes/.
    """
    pronounced = subprocess.check_output(["./line_to_phonemes", line]).decode('UTF-8').strip()  # todo


def generate(lines, path, relevant_words=None):
    """Generate the data set files for every line in `lines`.

    For each line this writes an entry "<num>:<train|validation>:<text>" to
    <path>/lines.list, optionally a label vector (when `relevant_words` is
    given), and a spoken wav plus its MFCC features using one randomly chosen
    voice at a randomly varied rate. Validation lines only use
    `validation_voices`; training lines only use the other good voices.

    Args:
      lines: iterable of text lines (str or UTF-8 bytes); surrounding
        whitespace is stripped.
      path: output directory; created together with its labels/, mfcc/ and
        wav/ subdirectories when missing.
      relevant_words: optional list of words to generate labels for.

    Raises:
      ValueError: if no voice is available for the subset a line falls into.
    """
    for directory in (path, path + "/labels/", path + "/mfcc/", path + "/wav/"):
        _ensure_dir(directory)

    # Lists (not dict views) so a random element can be picked on Python 3,
    # and filtered up front so no re-pick loop is needed.
    training_voices = [v for v in good_voices if v not in validation_voices]
    held_out_voices = [v for v in validation_voices if v in good_voices]

    with open(path + "/lines.list", "wt") as out:
        for line_num, line in enumerate(lines, 1):
            if isinstance(line, bytes):
                line = line.decode('UTF-8')
            # Strip text lines too: a trailing newline would otherwise put an
            # empty row after every entry in lines.list.
            line = line.strip()

            subset = "train"
            if random.randint(1, 100) <= validation_percent:
                subset = "validation"
            print("generating [%s] %s" % (subset, line))
            out.write("%d:%s:%s\n" % (line_num, subset, line))

            if relevant_words:
                generate_labels(line, path, line_num, relevant_words)

            pool = held_out_voices if subset == "validation" else training_voices
            if not pool:
                raise ValueError("no %s voice is available in good_voices" % subset)
            voice = random.choice(pool)

            # from_rate = good_voices[voice]['rate'] - 40
            # to_rate = good_voices[voice]['rate'] + 81
            # for rate in range(from_rate, to_rate, 20):
            base_rate = good_voices[voice]['rate']
            rate = random.randint(base_rate - 30, base_rate + 40)
            try:
                generate_mfcc(good_voices[voice]['name'], voice, line, line_num, rate, path)
            except Exception as exc:
                # Audio generation is best effort; keep going with the next line.
                print("SKIPPING AUDIO for line %d: %s" % (line_num, exc))


def generate_lines(relevant_words, irrelevant_words, num_of_lines, max_line_length, mean_relevance_percent):
    """Create random lines mixing relevant and irrelevant words.

    Every line has between 1 and `max_line_length` words. Each word is taken
    from `relevant_words` with probability `mean_relevance_percent` percent
    and from `irrelevant_words` otherwise. Every word, including the last, is
    followed by a single space.

    Returns:
      list of `num_of_lines` strings.
    """
    lines = []
    for _ in range(num_of_lines):
        chosen = []
        for _ in range(random.randint(1, max_line_length)):
            # randint(1, 100) <= p holds for exactly p of the 100 values.
            if random.randint(1, 100) <= mean_relevance_percent:
                pool = relevant_words
            else:
                pool = irrelevant_words
            chosen.append(pool[random.randint(0, len(pool) - 1)])
        lines.append(" ".join(chosen) + " ")
    return lines


def generate_spoken_numbers():
    """Generate the data set for the digits "0" to "9" under NUMBERS_PATH."""
    nums = list(map(str, range(0, 10)))
    generate(nums, NUMBERS_PATH)


def generate_spoken_words():
    """Generate the data set for the lines of wordslist.txt under WORDS_PATH."""
    generate(_read_lines("wordslist.txt"), WORDS_PATH)


def generate_spoken_sentences():
    """Generate the data set for the lines of sentences.txt under SENTENCES_PATH."""
    generate(_read_lines("sentences.txt"), SENTENCES_PATH)


def generate_spoken_sentences_mll():
    """Generate 10000 random labelled sentences under SENTENCES_MLL_PATH.

    Words come from mll_relevant_words.txt and mll_irrelevant_words.txt; the
    relevant words also define the label positions.
    """
    relevant_words = [w.replace("\n", '') for w in _read_lines("mll_relevant_words.txt")]
    irrelevant_words = [w.replace("\n", '') for w in _read_lines("mll_irrelevant_words.txt")]
    lines = generate_lines(relevant_words, irrelevant_words,
                           num_of_lines=10000, max_line_length=20, mean_relevance_percent=20)
    generate(lines, SENTENCES_MLL_PATH, relevant_words)


def main():
    """Check the installed voices, then generate the labelled sentence set."""
    check_voices()
    generate_spoken_sentences_mll()


if __name__ == '__main__':
    main()
    print("DONE!")
class MllData(object):
    """The multi-label learning input data."""

    @property
    def num_steps(self):
        """Number of LSTM steps every padded example is reshaped into."""
        return self.max_steps

    @property
    def num_classes(self):
        """Length of one label vector."""
        return self.n_classes

    def __init__(self, raw_input, raw_labels, cell_size):
        '''Object with input data for LSTM-MLL NN
        raw_input: list of 2D numpy arrays with raw mfcc input frames [timesteps x n_features]
        raw_labels: list of 1D arrays with labels [n_classes]
        cell_size: int with the size on lstm cell

        WARNING: cell_size should be multiple of n_features
        '''

        # The message used to reference an undefined name, which turned a
        # length mismatch into a NameError instead of this assertion.
        assert len(raw_input) == len(raw_labels), \
            "input len %d != labels len %d" % (len(raw_input), len(raw_labels))
        assert len(raw_input) > 0, "at least one input example is required"

        self.input_size = len(raw_labels)
        self.raw_input = raw_input
        self.raw_labels = raw_labels
        self.cell_size = cell_size

        # Feature and class counts are taken from the first example.
        self.n_features = raw_input[0].shape[1]
        self.n_classes = raw_labels[0].shape[0]
        assert cell_size % self.n_features == 0, "cell size should be multiple num of features"
        # Number of consecutive frames packed into one LSTM input vector.
        cell_size_factor = cell_size // self.n_features

        self.max_timesteps = max(frames.shape[0] for frames in raw_input)
        print("max timesteps", self.max_timesteps)
        # Round up to a multiple of cell_size_factor. NOTE(review): a length
        # that is already a multiple still grows by one full factor, i.e. one
        # extra all-zero step; kept as is so tensor shapes do not change.
        self.max_timesteps += cell_size_factor - self.max_timesteps % cell_size_factor
        print("increased max timesteps", self.max_timesteps)
        self.max_steps = self.max_timesteps // cell_size_factor
        print("max steps", self.max_steps)

    def get_batch(self, batch_size):
        '''Produce random batch from raw input data
        batch_size: int with number of inputs/labels per batch
        returns: batch tuple (inputs, labels) consists of
            inputs = 3D array w/ shape [batch_size x max_steps x cell_size]
            labels = 2D array w/ shape [batch_size x n_classes]

        Examples are drawn without replacement; when batch_size exceeds the
        number of examples the remaining rows stay all-zero.
        '''
        random_indexes = np.random.permutation(self.input_size)

        inputs = np.zeros([batch_size, self.max_steps, self.cell_size])
        labels = np.zeros([batch_size, self.n_classes])
        for batch_index, raw_index in enumerate(random_indexes[0:batch_size]):
            mfcc = self.raw_input[raw_index]
            # pad with zeros to max_timesteps
            pad_len = self.max_timesteps - mfcc.shape[0]
            padded = np.pad(mfcc, ((0, pad_len), (0, 0)), 'constant', constant_values=0)
            # reshape time_steps x n_features -> steps x cell_size
            inputs[batch_index] = padded.reshape([self.max_steps, self.cell_size])
            labels[batch_index] = self.raw_labels[raw_index]
        return inputs, labels


def load_data(path, swap_axes=True):
    """Load the generated data set stored under `path`.

    Reads <path>/lines.list ("<num>:<type>:<text>" per line), then every
    .npy file in <path>/mfcc/ (named "<num>_<voice>_<rate>.npy") together
    with the matching <path>/labels/<num>.npy.

    Args:
      path: root directory of the data set.
      swap_axes: when true, axes 0 and 1 of every loaded mfcc array are
        swapped.

    Returns:
      (train, validation): two dicts with the parallel lists 'texts', 'mfcc'
      and 'labels'. Lines whose type is "train" go to the first dict, all
      others to the second.
    """
    print("load data from " + path)
    texts = {}
    types = {}
    with open(path + "/lines.list") as listing:
        for line in listing:
            if not line.strip():
                continue  # tolerate blank rows
            # At most two splits, so a ':' inside the text does not break parsing.
            num, type, text = line.split(":", 2)
            types[num] = type
            texts[num] = text.replace("\n", '')
    train = {'texts': [], 'mfcc': [], 'labels': []}
    validation = {'texts': [], 'mfcc': [], 'labels': []}
    # Sorted for a reproducible example order; stray non-array files are skipped.
    for file_name in sorted(os.listdir(path + "/mfcc/")):
        if not file_name.endswith(".npy"):
            continue
        num, voice, rate = file_name.split("_")
        if types[num] == "train":
            target = train
        else:
            target = validation
        target['texts'].append(texts[num])
        mfcc = np.load(os.path.join(path + "/mfcc/", file_name))
        if swap_axes:
            mfcc = np.swapaxes(mfcc, 0, 1)
        target['mfcc'].append(mfcc)
        target['labels'].append(np.load(os.path.join(path + "/labels/", num + ".npy")))
    return train, validation


def main():
    """Smoke test: load the MLL sentence set and draw one validation batch."""
    # NOTE(review): MllData documents inputs as [timesteps x n_features], while
    # load_data swaps the axes by default -- confirm the orientation stored by
    # the generator before relying on this call.
    train, validation = load_data(SENTENCES_MLL_PATH)
    validation_input = MllData(validation['mfcc'], validation['labels'], 208)
    v_inputs, v_labels = validation_input.get_batch(5)


if __name__ == '__main__':
    main()
    print("DONE!")
-------------------------------------------------------------------------------- /speech/mll_irrelevant_words.txt: -------------------------------------------------------------------------------- 1 | other 2 | new 3 | good 4 | high 5 | old 6 | great 7 | big 8 | American 9 | small 10 | large 11 | national 12 | young 13 | different 14 | black 15 | long 16 | little 17 | important 18 | political 19 | bad 20 | white 21 | real 22 | best 23 | right 24 | social 25 | only 26 | public 27 | sure 28 | low 29 | early 30 | able 31 | human 32 | local 33 | late 34 | hard 35 | major 36 | better 37 | economic 38 | strong 39 | possible 40 | whole 41 | free 42 | military 43 | true 44 | federal 45 | international 46 | full 47 | special 48 | easy 49 | clear 50 | recent 51 | certain 52 | personal 53 | open 54 | red 55 | difficult 56 | available 57 | likely 58 | short 59 | single 60 | medical 61 | current 62 | wrong 63 | private 64 | past 65 | foreign 66 | fine 67 | common 68 | poor 69 | natural 70 | significant 71 | similar 72 | hot 73 | dead 74 | central 75 | happy 76 | serious 77 | ready 78 | simple 79 | left 80 | physical 81 | general 82 | environmental 83 | financial 84 | blue 85 | democratic 86 | dark 87 | various 88 | entire 89 | close 90 | legal 91 | religious 92 | cold 93 | final 94 | main 95 | green 96 | nice 97 | huge 98 | popular 99 | traditional 100 | cultural 101 | time 102 | year 103 | people 104 | way 105 | day 106 | man 107 | thing 108 | woman 109 | life 110 | child 111 | world 112 | school 113 | state 114 | family 115 | student 116 | group 117 | country 118 | problem 119 | hand 120 | part 121 | place 122 | case 123 | week 124 | company 125 | system 126 | program 127 | question 128 | work 129 | government 130 | number 131 | night 132 | point 133 | home 134 | water 135 | room 136 | mother 137 | area 138 | money 139 | story 140 | fact 141 | month 142 | lot 143 | right 144 | study 145 | book 146 | eye 147 | job 148 | word 149 | business 150 | issue 151 | side 152 | kind 153 
| head 154 | house 155 | service 156 | friend 157 | father 158 | power 159 | hour 160 | game 161 | line 162 | end 163 | member 164 | law 165 | car 166 | city 167 | community 168 | name 169 | president 170 | team 171 | minute 172 | idea 173 | kid 174 | body 175 | information 176 | back 177 | parent 178 | face 179 | others 180 | level 181 | office 182 | door 183 | health 184 | person 185 | art 186 | war 187 | history 188 | party 189 | result 190 | change 191 | morning 192 | reason 193 | research 194 | girl 195 | guy 196 | moment 197 | air 198 | teacher 199 | force 200 | education 201 | be 202 | have 203 | do 204 | say 205 | go 206 | can 207 | get 208 | would 209 | make 210 | know 211 | will 212 | think 213 | take 214 | see 215 | come 216 | could 217 | want 218 | look 219 | use 220 | find 221 | give 222 | tell 223 | work 224 | may 225 | should 226 | call 227 | try 228 | ask 229 | need 230 | feel 231 | become 232 | leave 233 | put 234 | mean 235 | keep 236 | let 237 | begin 238 | seem 239 | help 240 | talk 241 | turn 242 | start 243 | might 244 | show 245 | hear 246 | play 247 | run 248 | move 249 | like 250 | live 251 | believe 252 | hold 253 | bring 254 | happen 255 | must 256 | write 257 | provide 258 | sit 259 | stand 260 | lose 261 | pay 262 | meet 263 | include 264 | continue 265 | set 266 | learn 267 | change 268 | lead 269 | understand 270 | watch -------------------------------------------------------------------------------- /speech/mll_lstm.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import time 6 | 7 | import numpy as np 8 | import tensorflow as tf 9 | 10 | from mll_data import MllData, load_data 11 | 12 | flags = tf.flags 13 | logging = tf.logging 14 | flags.DEFINE_string( 15 | "model", "small", 16 | "A type of model. 
Possible options are: small, medium, large.") 17 | flags.DEFINE_string("data_path", "data/spoken_sentences_mll_wav", 18 | "Where the training/test data is stored.") 19 | flags.DEFINE_string("save_path", None, 20 | "Model output directory.") 21 | flags.DEFINE_bool("use_fp64", False, 22 | "Train using 64-bit floats instead of 32-bit floats") 23 | FLAGS = flags.FLAGS 24 | 25 | 26 | def data_type(): 27 | return tf.float64 if FLAGS.use_fp64 else tf.float32 28 | 29 | 30 | class SmallConfig(object): 31 | """Small config.""" 32 | init_scale = 0.1 33 | learning_rate = 1.0 34 | max_grad_norm = 5 35 | num_layers = 2 36 | hidden_size = 104 37 | epoch_size = 30 38 | constant_lr_max_epoch = 4 39 | max_epoch = 10 40 | keep_prob = 1.0 41 | lr_decay = 0.5 42 | batch_size = 10 43 | validation_batch_size = 3 44 | 45 | 46 | class MediumConfig(object): 47 | """Medium config.""" 48 | init_scale = 0.05 49 | learning_rate = 1.0 50 | max_grad_norm = 5 51 | num_layers = 2 52 | hidden_size = 416 53 | epoch_size = 40 54 | constant_lr_max_epoch = 6 55 | max_epoch = 16 56 | keep_prob = 0.5 57 | lr_decay = 0.8 58 | batch_size = 20 59 | validation_batch_size = 5 60 | 61 | 62 | class LargeConfig(object): 63 | """Large config.""" 64 | init_scale = 0.04 65 | learning_rate = 1.0 66 | max_grad_norm = 10 67 | num_layers = 2 68 | hidden_size = 1300 69 | epoch_size = 60 70 | constant_lr_max_epoch = 8 71 | max_epoch = 24 72 | keep_prob = 0.35 73 | lr_decay = 1 / 1.15 74 | batch_size = 20 75 | validation_batch_size = 5 76 | 77 | 78 | def get_config(): 79 | if FLAGS.model == "small": 80 | return SmallConfig() 81 | elif FLAGS.model == "medium": 82 | return MediumConfig() 83 | elif FLAGS.model == "large": 84 | return LargeConfig() 85 | else: 86 | raise ValueError("Invalid model: %s", FLAGS.model) 87 | 88 | 89 | class MLLModel(object): 90 | """The MLL model.""" 91 | 92 | def assign_lr(self, session, lr_value): 93 | session.run(self._lr_update, feed_dict={self._new_lr: lr_value}) 94 | 95 | @property 96 | def 
input(self): 97 | return self._input 98 | 99 | @property 100 | def inputs_ph(self): 101 | return self._inputs_ph 102 | 103 | @property 104 | def labels_ph(self): 105 | return self._labels_ph 106 | 107 | @property 108 | def epoch_size(self): 109 | return self._epoch_size 110 | 111 | @property 112 | def batch_size(self): 113 | return self._batch_size 114 | 115 | @property 116 | def initial_state(self): 117 | return self._initial_state 118 | 119 | @property 120 | def cost(self): 121 | return self._cost 122 | 123 | @property 124 | def loss(self): 125 | return self._loss 126 | 127 | @property 128 | def final_state(self): 129 | return self._final_state 130 | 131 | @property 132 | def lr(self): 133 | return self._lr 134 | 135 | @property 136 | def train_op(self): 137 | return self._train_op 138 | 139 | def __init__(self, is_training, config, input_): 140 | self._input = input_ 141 | self._epoch_size = config.epoch_size if is_training else 1 142 | 143 | self._batch_size = batch_size = config.batch_size if is_training else config.validation_batch_size 144 | num_steps = input_.num_steps 145 | size = config.hidden_size 146 | num_classes = input_.num_classes 147 | 148 | lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(size, forget_bias=0.0, state_is_tuple=True) 149 | if is_training and config.keep_prob < 1: 150 | lstm_cell = tf.nn.rnn_cell.DropoutWrapper( 151 | lstm_cell, output_keep_prob=config.keep_prob) 152 | cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * config.num_layers, state_is_tuple=True) 153 | 154 | self._initial_state = cell.zero_state(batch_size, data_type()) 155 | 156 | self._inputs_ph = tf.placeholder(data_type(), shape=[batch_size, num_steps, size]) 157 | if is_training and config.keep_prob < 1: 158 | self._inputs_ph = tf.nn.dropout(self._inputs_ph, config.keep_prob) 159 | self._labels_ph = tf.placeholder(data_type(), shape=[batch_size, num_classes]) 160 | 161 | with tf.variable_scope("RNN"): 162 | inputs = tf.unstack(self._inputs_ph, num=num_steps, axis=1) 163 | 
def run_epoch(session, model, eval_op=None, verbose=False):
    """Run ``model`` for ``model.epoch_size`` batches and summarise the cost.

    Every step draws one batch from ``model.input.get_batch(model.batch_size)``,
    feeds it through ``model.inputs_ph`` / ``model.labels_ph`` and fetches
    ``model.cost`` and ``model.final_state`` (plus ``eval_op`` when given).
    The fetched final state is not fed back into the next step, so each batch
    is evaluated independently of the previous one.

    Args:
        session: object whose ``run(fetches, feed_dict=...)`` evaluates fetches.
        model: object exposing ``epoch_size``, ``batch_size``, ``input``,
            ``inputs_ph``, ``labels_ph``, ``cost`` and ``final_state``.
        eval_op: optional extra op fetched on every step (e.g. a training op).
        verbose: when true, print progress about ten times per epoch.

    Returns:
        ``np.exp`` of the mean per-batch cost. The progress line labels this
        value "Accuracy", but it is derived from the cost only.
    """
    start_time = time.time()
    costs = 0.0
    iters = 0

    fetches = {
        "cost": model.cost,
        "final_state": model.final_state,
    }
    if eval_op is not None:
        fetches["eval_op"] = eval_op

    # Never let the reporting interval reach zero: epochs shorter than ten
    # steps would otherwise divide by zero in the modulo below.
    log_every = max(model.epoch_size // 10, 1)

    for step in range(model.epoch_size):
        _inputs, _labels = model.input.get_batch(model.batch_size)

        vals = session.run(fetches, feed_dict={
            model.inputs_ph: _inputs,
            model.labels_ph: _labels
        })
        costs += vals["cost"]
        iters += 1

        if verbose and step % log_every == 0:
            # Guard against a zero reading on coarse clocks.
            elapsed = max(time.time() - start_time, 1e-9)
            print("%.3f Accuracy: %.3f speed: %.0f sentences/sec" %
                  (step * 1.0 / model.epoch_size, np.exp(costs / iters),
                   iters * model.batch_size / elapsed))

    return np.exp(costs / iters)
def normalize(snd_data):
    """Scale the samples so the loudest one has magnitude 16384.

    Args:
        snd_data: iterable of integer samples (e.g. ``array('h')``).

    Returns:
        A new ``array('h')`` with every sample multiplied by the same factor.
        Empty or completely silent input is returned as an unscaled copy,
        because there is no peak to scale against.
    """
    MAXIMUM = 16384
    # "or [0]" keeps max() from failing on an empty sequence.
    peak = max([abs(sample) for sample in snd_data] or [0])
    if peak == 0:
        return array('h', snd_data)

    times = float(MAXIMUM) / peak
    return array('h', [int(sample * times) for sample in snd_data])
def add_silence(snd_data, seconds, rate=None):
    """Pad ``snd_data`` with ``seconds`` of silence at the start and the end.

    Args:
        snd_data: iterable of signed 16-bit samples.
        seconds: length of each silent stretch in seconds (float allowed).
        rate: samples per second; defaults to the module-level ``RATE``.

    Returns:
        A new ``array('h')``: zeros, the original samples, zeros.
    """
    if rate is None:
        rate = RATE
    # Array repetition works on both Python 2 and 3 (no xrange needed) and
    # yields an empty array for a zero or negative count.
    silence = array('h', [0]) * int(seconds * rate)
    padded = array('h', silence)
    padded.extend(snd_data)
    padded.extend(silence)
    return padded
= record() 115 | data = pack('<' + ('h' * len(data)), *data) 116 | 117 | wf = wave.open(path, 'wb') 118 | wf.setnchannels(1) 119 | wf.setsampwidth(sample_width) 120 | wf.setframerate(RATE) 121 | wf.writeframes(data) 122 | wf.close() 123 | 124 | 125 | if __name__ == '__main__': 126 | print("please speak a word into the microphone") 127 | record_to_file('demo.wav') 128 | print("done - result written to demo.wav") 129 | -------------------------------------------------------------------------------- /speech/speech_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python 2 | """Utilities for downloading and providing data from openslr.org, libriSpeech, Pannous, Gutenberg, WMT, tokenizing, vocabularies.""" 3 | # TODO! see https://github.com/pannous/caffe-speech-recognition for some data sources 4 | 5 | import os 6 | import re 7 | import sys 8 | import wave 9 | 10 | import numpy 11 | import numpy as np 12 | import skimage.io # scikit-image 13 | 14 | try: 15 | import librosa 16 | except: 17 | print("pip install librosa ; if you want mfcc_batch_generator") 18 | # import extensions as xx 19 | from random import shuffle 20 | 21 | try: 22 | from six.moves import urllib 23 | from six.moves import xrange # pylint: disable=redefined-builtin 24 | except: 25 | pass # fuck 2to3 26 | 27 | # TRAIN_INDEX='train_words_index.txt' 28 | # TEST_INDEX='test_words_index.txt' 29 | SOURCE_URL = 'http://pannous.net/files/' # spoken_numbers.tar' 30 | DATA_DIR = 'data/' 31 | pcm_path = "data/spoken_numbers_pcm/" # 8 bit 32 | wav_path = "data/spoken_numbers_wav/" # 16 bit s16le 33 | path = pcm_path 34 | CHUNK = 4096 35 | test_fraction = 0.1 # 10% of data for test / verification 36 | 37 | 38 | # http://pannous.net/files/spoken_numbers_pcm.tar 39 | class Source: # labels 40 | DIGIT_WAVES = 'spoken_numbers_pcm.tar' 41 | DIGIT_SPECTROS = 'spoken_numbers_spectros_64x64.tar' # 64x64 baby data set, works astonishingly well 42 | NUMBER_WAVES = 
def string_to_int_word(word, pad_to):
    """Encode ``word`` as a list of character classes padded to ``pad_to``.

    Args:
        word: string whose characters are mapped with ``char_to_class``.
        pad_to: minimum length of the result; shorter words are filled up by
            ``pad``. Words longer than ``pad_to`` are returned unpadded.

    Returns:
        A list of ints, one per character, followed by padding symbols.
    """
    classes = [char_to_class(c) for c in word]
    # Forward pad_to explicitly; otherwise pad() silently falls back to its
    # own default length and the caller's argument has no effect.
    return pad(classes, pad_to)
def maybe_download(file, work_directory=DATA_DIR):
    """Download the data from Pannous's website, unless it's already here.

    The archive is stored in ``work_directory`` under the last path component
    of ``file`` and then unpacked there with the system ``tar`` command.

    Args:
        file: archive name relative to ``SOURCE_URL``, or a full http(s) URL.
        work_directory: directory that receives the archive and its contents.

    Returns:
        The local archive path with every ".tar" removed, i.e. the path the
        archive is expected to unpack to.
    """
    import subprocess  # local import: only this function needs it

    print("Looking for data %s in %s" % (file, work_directory))
    if not os.path.exists(work_directory):
        # makedirs also handles nested target directories.
        os.makedirs(work_directory)
    filepath = os.path.join(work_directory, re.sub(r'.*/', '', file))
    if not os.path.exists(filepath):
        if not file.startswith("http"):
            url_filename = SOURCE_URL + file
        else:
            url_filename = file
        print('Downloading from %s to %s' % (url_filename, filepath))
        filepath, _ = urllib.request.urlretrieve(url_filename, filepath, progresshook)
        statinfo = os.stat(filepath)
        print('Successfully downloaded', file, statinfo.st_size, 'bytes.')
    if os.path.exists(filepath):
        print('Extracting %s to %s' % (filepath, work_directory))
        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed to tar verbatim.
        status = subprocess.call(['tar', 'xf', filepath, '-C', work_directory])
        if status != 0:
            print('Extraction of %s failed with exit status %d' % (filepath, status))
        else:
            print('Data ready!')
    return filepath.replace(".tar", "")
def load_wav_file(name, chunk_size=None):
    """Load a wave file as a fixed-length list of floats.

    The raw frame bytes are read as unsigned 8-bit values, offset by 128 and
    divided by 255. The result is cut or zero-padded to exactly
    ``2 * chunk_size`` entries.

    Args:
        name: path of the wave file.
        chunk_size: number of frames read per call; the result holds twice
            this many values. Defaults to the module-level ``CHUNK``.

    Returns:
        A list of ``2 * chunk_size`` floats.
    """
    if chunk_size is None:
        chunk_size = CHUNK
    target_len = chunk_size * 2

    samples = []
    wav = wave.open(name, "rb")
    try:
        frames = wav.readframes(chunk_size)
        # Stop once enough samples are collected; the rest would be cut anyway.
        while frames and len(samples) < target_len:
            data = numpy.frombuffer(frames, dtype='uint8')
            # NOTE(review): the offset is added before the float division, so
            # it is applied to the uint8 array -- confirm wrap-around is intended.
            data = (data + 128) / 255.  # 0-1 for better convergence
            samples.extend(data)
            frames = wav.readframes(chunk_size)
    finally:
        # Close the handle even when reading fails.
        wav.close()

    samples = samples[0:target_len]  # should be enough for now -> cut
    samples.extend(numpy.zeros(target_len - len(samples)))  # pad with zeros
    return samples
216 | # files=files[0:int(len(files)*(1-test_fraction))] 217 | print("Got %d source data files from %s" % (len(files), path)) 218 | while True: 219 | # print("shuffling source data files") 220 | shuffle(files) 221 | for image_name in files: 222 | if not "_" in image_name: continue # bad !?! 223 | image = skimage.io.imread(path + "/" + image_name).astype(numpy.float32) 224 | # image.resize(width,height) # lets see ... 225 | data = image / 255. # 0-1 for Better convergence 226 | # data = data.reshape([width * height]) # tensorflow matmul needs flattened matrices wtf 227 | batch.append(list(data)) 228 | # classe=(ord(image_name[0]) - 48) # -> 0=0 .. A:65-48 ... 74 for 'z' 229 | classe = (ord(image_name[0]) - 48) % 32 # -> 0=0 17 for A, 10 for z ;) 230 | labels.append(dense_to_one_hot(classe, num_classes)) 231 | if len(batch) >= batch_size: 232 | yield batch, labels 233 | batch = [] # Reset for next batch 234 | labels = [] 235 | 236 | 237 | def mfcc_batch_generator(batch_size=10, source=Source.DIGIT_WAVES, target=Target.digits): 238 | maybe_download(source, DATA_DIR) 239 | if target == Target.speaker: speakers = get_speakers() 240 | batch_features = [] 241 | labels = [] 242 | files = os.listdir(path) 243 | while True: 244 | print("loaded batch of %d files" % len(files)) 245 | shuffle(files) 246 | for file in files: 247 | if not file.endswith(".wav"): continue 248 | wave, sr = librosa.load(path + file, mono=True) 249 | mfcc = librosa.feature.mfcc(wave, sr) 250 | if target == Target.speaker: 251 | label = one_hot_from_item(speaker(file), speakers) 252 | elif target == Target.digits: 253 | label = dense_to_one_hot(int(file[0]), 10) 254 | elif target == Target.first_letter: 255 | label = dense_to_one_hot((ord(file[0]) - 48) % 32, 32) 256 | elif target == Target.hotword: 257 | label = one_hot_word(file, pad_to=max_word_length) # 258 | elif target == Target.word: 259 | label = string_to_int_word(file, pad_to=max_word_length) 260 | # label = file # sparse_labels(file, 
def wave_batch_generator(batch_size=10, source=Source.DIGIT_WAVES, target=Target.digits):  # speaker
    """Yield endless ``(waves, labels)`` batches built from the wav files.

    Args:
        batch_size: number of examples per yielded batch.
        source: archive handed to ``maybe_download`` before reading.
        target: which label to derive from each file name: ``Target.digits``,
            ``Target.speaker`` or ``Target.first_letter``.

    Yields:
        ``(batch_waves, labels)``: two lists of equal length, where
        ``batch_waves[i]`` is the ``load_wav_file`` result for a file and
        ``labels[i]`` is its one-hot label.

    Raises:
        ValueError: if the data directory contains no ".wav" file.
        Exception: if ``target`` is not one of the supported targets.
    """
    maybe_download(source, DATA_DIR)
    speakers = get_speakers() if target == Target.speaker else None
    batch_waves = []
    labels = []
    # NOTE(review): files are read from the module-level ``path`` regardless
    # of ``source`` -- confirm the two always refer to the same data.
    files = os.listdir(path)
    if not any(name.endswith(".wav") for name in files):
        # Without this check the loop below would spin forever without yielding.
        raise ValueError("no .wav files found in %s" % path)
    while True:
        shuffle(files)
        print("loaded batch of %d files" % len(files))
        for wav in files:
            if not wav.endswith(".wav"):
                continue
            if target == Target.digits:
                label = dense_to_one_hot(int(wav[0]))
            elif target == Target.speaker:
                label = one_hot_from_item(speaker(wav), speakers)
            elif target == Target.first_letter:
                label = dense_to_one_hot((ord(wav[0]) - 48) % 32, 32)
            else:
                raise Exception("todo : Target.word label!")
            # Append in every branch so labels stay aligned with batch_waves.
            labels.append(label)
            batch_waves.append(load_wav_file(path + wav))
            if len(batch_waves) >= batch_size:
                yield batch_waves, labels
                batch_waves = []  # Reset for next batch
                labels = []
# multi-label
def dense_to_some_hot(labels_dense, num_classes=140):
    """Convert class labels from int vectors to many-hot vectors.

    Not implemented yet.

    Raises:
        NotImplementedError: always.
    """
    # Raising a plain string is itself a TypeError; use a real exception type.
    raise NotImplementedError("TODO dense_to_some_hot")
def dense_to_one_hot(labels_dense, num_classes=10):
    """Convert class labels from scalars to one-hot vectors.

    This is the single definition of the name in the module, so no earlier
    variant is silently shadowed.

    Args:
        labels_dense: an int class index, or a sequence/array of int indices.
        num_classes: length of each one-hot vector.

    Returns:
        A numpy array: one row of the ``num_classes`` identity matrix for a
        scalar label, or one row per label for a sequence.
    """
    return numpy.eye(num_classes)[labels_dense]
def extract_images(names_file, train):
    """Read the image file names from a tab-separated index file.

    Each non-blank line must have the form ``<image file>\\t<label>``.

    Args:
        names_file: path of the index file.
        train: unused; kept so existing callers keep working.

    Returns:
        A list with the first column of every non-blank line, in file order.

    Raises:
        ValueError: if a non-blank line does not have exactly two columns.
    """
    image_files = []
    # The with-block closes the index file even when a line is malformed.
    with open(names_file) as index:
        for line in index:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            image_file, _image_label = line.split("\t")
            image_files.append(image_file)
    return image_files
import tflearn
from speech_data import wave_batch_generator, Target, load_wav_file, path
import numpy

# Simple spoken digit recognition demo, with 98% accuracy in under a minute

# Training Step: 544 | total loss: 0.15866
# | Adam | epoch: 034 | loss: 0.15866 - acc: 0.9818 -- iter: 0000/1000


def main():
    """Train a small dense digit classifier on one wave batch and predict a demo file."""
    batches = wave_batch_generator(10000, target=Target.digits)
    features, targets = next(batches)

    digit_count = 10  # Digits

    # Classification
    tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)

    network = tflearn.input_data(shape=[None, 8192])
    network = tflearn.fully_connected(network, 64)
    network = tflearn.dropout(network, 0.5)
    network = tflearn.fully_connected(network, digit_count, activation='softmax')
    network = tflearn.regression(network, optimizer='adam', loss='categorical_crossentropy')

    classifier = tflearn.DNN(network)
    # Overfitting okay for now
    classifier.fit(features, targets, n_epoch=3, show_metric=True, snapshot_step=100)

    demo_file = "5_Vicki_260.wav"
    demo_wave = load_wav_file(path + demo_file)
    predicted = numpy.argmax(classifier.predict([demo_wave]))
    print("predicted digit for %s : result = %d " % (demo_file, predicted))


if __name__ == '__main__':
    main()