├── LICENSE ├── README.md ├── imdb_birnn.py └── birnn.py /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # bidirectional_RNN 2 | bidirectional lstm 3 | 4 | This repo demonstrates how to use [keras](https://github.com/fchollet/keras) to build a deep bidirectional RNN/LSTM with mlp layers before and after the LSTM layers 5 | 6 | This repo can be used for the deep speech paper from Baidu 7 | 8 | Deep Speech: Scaling up end-to-end speech recognition 9 | arXiv:1412.5567, 2014 10 | A. Hannun etc 11 | 12 | ```python 13 | max_features=20000 14 | maxseqlen = 100 # cut texts after this number of words (among top max_features most common words) 15 | batch_size = 16 16 | word_vec_len = 256 17 | 18 | model = Sequential() 19 | model.add(Embedding(max_features, word_vec_len)) 20 | 21 | # MLP layers 22 | model.add(Transform((word_vec_len,))) # transform from 3d dimensional input to 2d input for mlp 23 | model.add(Dense(word_vec_len, 100, activation='relu')) 24 | model.add(BatchNormalization((100,))) 25 | model.add(Dense(100,100,activation='relu')) 26 | model.add(BatchNormalization((100,))) 27 | model.add(Dense(100, word_vec_len, activation='relu')) 28 | model.add(Transform((maxseqlen, word_vec_len))) # transform back from 2d to 3d for recurrent input 29 | 30 | # Stacked up BiDirectionLSTM layers 31 | model.add(BiDirectionLSTM(word_vec_len, 100, output_mode='concat')) 32 | model.add(BiDirectionLSTM(200, 24, output_mode='sum')) 33 | 34 | # MLP layers 35 | model.add(Reshape(24 * maxseqlen)) 36 | model.add(BatchNormalization((24 * maxseqlen,))) 37 | model.add(Dense(24 * maxseqlen, 100, activation='relu')) 38 | model.add(Dropout(0.2)) 39 | model.add(Dense(100, 1, activation='sigmoid')) 40 | ``` 41 | -------------------------------------------------------------------------------- /imdb_birnn.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | import numpy as np 4 | 5 | from keras.preprocessing import sequence 6 | from keras.optimizers import SGD, RMSprop, Adagrad 7 | from keras.utils import np_utils 8 | from keras.models import Sequential 9 | from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape 10 | from keras.datasets import cifar10 11 | from keras.layers.normalization import BatchNormalization 12 | from keras.layers.embeddings import Embedding 13 | from keras.layers.recurrent import * 14 | from keras.datasets import imdb 15 | import cPickle 16 | import sys 17 | from birnn import BiDirectionLSTM, Transform 18 | ''' 19 | Train a BiDirectionLSTM LSTM on the IMDB sentiment classification task. 20 | 21 | The dataset is actually too small for LSTM to be of any advantage 22 | compared to simpler, much faster methods such as TF-IDF+LogReg. 23 | 24 | Notes: 25 | 26 | - RNNs are tricky. Choice of batch size is important, 27 | choice of loss and optimizer is critical, etc. 28 | Most configurations won't converge. 29 | 30 | - LSTM loss decrease during training can be quite different 31 | from what you see with CNNs/MLPs/etc. It's more or less a sigmoid 32 | instead of an inverse exponential. 33 | 34 | GPU command: 35 | THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python imdb_lstm.py 36 | 37 | 250s/epoch on GPU (GT 650M), vs. 400s/epoch on CPU (2.4Ghz Core i7). 38 | ''' 39 | 40 | max_features=20000 41 | maxseqlen = 100 # cut texts after this number of words (among top max_features most common words) 42 | batch_size = 16 43 | word_vec_len = 256 44 | 45 | print("Loading data...") 46 | (train_X, train_y), (test_X, test_y) = imdb.load_data(nb_words=max_features, test_split=0.2) 47 | print(len(train_X), 'train sequences') 48 | print(len(test_X), 'test sequences') 49 | 50 | print("Pad sequences (samples x time)") 51 | train_X = sequence.pad_sequences(train_X, maxlen=maxseqlen) 52 | test_X = sequence.pad_sequences(test_X, maxlen=maxseqlen) 53 | 54 | print('train_X shape:', train_X.shape) 55 | print('test_X shape:', test_X.shape) 56 | 57 | 58 | print('Build model...') 59 | model = Sequential() 60 | model.add(Embedding(max_features, word_vec_len)) 61 | 62 | # MLP layers 63 | model.add(Transform((word_vec_len,))) # transform from 3d dimensional input to 2d input for mlp 64 | model.add(Dense(word_vec_len, 100, activation='relu')) 65 | model.add(BatchNormalization((100,))) 66 | model.add(Dense(100,100,activation='relu')) 67 | model.add(BatchNormalization((100,))) 68 | model.add(Dense(100, word_vec_len, activation='relu')) 69 | model.add(Transform((maxseqlen, word_vec_len))) # transform back from 2d to 3d for recurrent input 70 | 71 | # Stacked up BiDirectionLSTM layers 72 | model.add(BiDirectionLSTM(word_vec_len, 50, output_mode='concat')) 73 | model.add(BiDirectionLSTM(100, 24, output_mode='sum')) 74 | 75 | # MLP layers 76 | model.add(Reshape(24 * maxseqlen)) 77 | model.add(BatchNormalization((24 * maxseqlen,))) 78 | model.add(Dense(24 * maxseqlen, 50, activation='relu')) 79 | model.add(Dropout(0.2)) 80 | model.add(Dense(50, 1, activation='sigmoid')) 81 | 82 | # try using different optimizers and different optimizer configs 83 | model.compile(loss='mean_squared_error', optimizer='adam', class_mode="binary") 84 | 85 | print("Train...") 86 | model.fit(train_X, train_y, batch_size=batch_size, nb_epoch=5, validation_data=(test_X, test_y), show_accuracy=True) 87 | 88 | score = model.evaluate(test_X, test_y, batch_size=batch_size) 89 | print('Test score:', score) 90 | 91 | classes = model.predict_classes(test_X, batch_size=batch_size) 92 | acc = np_utils.accuracy(classes, test_y) 93 | print('Test accuracy:', acc) 94 | -------------------------------------------------------------------------------- /birnn.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import theano.tensor as T 3 | 4 | from keras import activations, initializations 5 | from keras.utils.theano_utils import shared_zeros, alloc_zeros_matrix 6 | from keras.layers.core import Layer 7 | 8 | 9 | class Transform(Layer): 10 | ''' 11 | This function is needed transform the dataset such that we can 12 | add mlp layers before RNN/LSTM or after the RNN/LSTM. 13 | ''' 14 | def __init__(self, dims, input=None): 15 | ''' 16 | If input is three dimensional tensor3, with dimensions (nb_samples, sequence_length, vector_length) 17 | and dims is tuple (vector_length,), then the output will be (nb_samples * sequence_length, vector_length) 18 | If input is two dimensional matrix, with dimensions (nb_samples * sequence_length, vector_length) 19 | and if we want to revert back to (nb_samples, sequence_length, vector_length) so that we can feed 20 | the LSTM, then we can set dims as (sequence_length, vector_length). 21 | This function is needed for adding mlp layers before LSTM or after the LSTM. 22 | When used as first layer, input has to be set either as tensor3 or matrix 23 | ''' 24 | 25 | super(Transform, self).__init__() 26 | self.dims = dims 27 | if input is not None: 28 | self.input = input 29 | 30 | def get_output(self, train): 31 | X = self.get_input(train) 32 | dim_mul = 1 33 | for i in range(X.ndim): 34 | dim_mul *= X.shape[i] 35 | input_dim = 1 36 | for i in range(len(self.dims)): 37 | input_dim *= self.dims[i] 38 | di = (dim_mul/input_dim,) + self.dims 39 | return theano.tensor.reshape(X, di) 40 | 41 | def get_config(self): 42 | return {"name":self.__class__.__name__, 43 | "dims":self.dims} 44 | 45 | 46 | class BiDirectionLSTM(Layer): 47 | ''' 48 | Acts as a spatiotemporal projection, 49 | 50 | Eats inputs with shape: 51 | (nb_samples, max_sample_length (samples shorter than this are padded with zeros at the end), input_dim) 52 | 53 | and returns outputs with shape: 54 | (nb_samples, max_sample_length, output_dim) 55 | 56 | For a step-by-step description of the algorithm, see: 57 | http://deeplearning.net/tutorial/lstm.html 58 | 59 | References: 60 | Long short-term memory (original 97 paper) 61 | http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf 62 | Learning to forget: Continual prediction with LSTM 63 | http://www.mitpressjournals.org/doi/pdf/10.1162/089976600300015015 64 | Supervised sequence labelling with recurrent neural networks 65 | http://www.cs.toronto.edu/~graves/preprint.pdf 66 | ''' 67 | def __init__(self, input_dim, output_dim=128, 68 | init='glorot_uniform', inner_init='orthogonal', 69 | activation='tanh', inner_activation='hard_sigmoid', 70 | weights=None, truncate_gradient=-1, output_mode='sum'): 71 | 72 | super(BiDirectionLSTM,self).__init__() 73 | self.input_dim = input_dim 74 | self.output_dim = output_dim 75 | self.truncate_gradient = truncate_gradient 76 | self.output_mode = output_mode # output_mode is either sum or concatenate 77 | 78 | self.init = initializations.get(init) 79 | self.inner_init = initializations.get(inner_init) 80 | self.activation = activations.get(activation) 81 | self.inner_activation = activations.get(inner_activation) 82 | self.input = T.tensor3() 83 | 84 | # forward weights 85 | self.W_i = self.init((self.input_dim, self.output_dim)) 86 | self.U_i = self.inner_init((self.output_dim, self.output_dim)) 87 | self.b_i = shared_zeros((self.output_dim)) 88 | 89 | self.W_f = self.init((self.input_dim, self.output_dim)) 90 | self.U_f = self.inner_init((self.output_dim, self.output_dim)) 91 | self.b_f = shared_zeros((self.output_dim)) 92 | 93 | self.W_c = self.init((self.input_dim, self.output_dim)) 94 | self.U_c = self.inner_init((self.output_dim, self.output_dim)) 95 | self.b_c = shared_zeros((self.output_dim)) 96 | 97 | self.W_o = self.init((self.input_dim, self.output_dim)) 98 | self.U_o = self.inner_init((self.output_dim, self.output_dim)) 99 | self.b_o = shared_zeros((self.output_dim)) 100 | 101 | # backward weights 102 | self.Wb_i = self.init((self.input_dim, self.output_dim)) 103 | self.Ub_i = self.inner_init((self.output_dim, self.output_dim)) 104 | self.bb_i = shared_zeros((self.output_dim)) 105 | 106 | self.Wb_f = self.init((self.input_dim, self.output_dim)) 107 | self.Ub_f = self.inner_init((self.output_dim, self.output_dim)) 108 | self.bb_f = shared_zeros((self.output_dim)) 109 | 110 | self.Wb_c = self.init((self.input_dim, self.output_dim)) 111 | self.Ub_c = self.inner_init((self.output_dim, self.output_dim)) 112 | self.bb_c = shared_zeros((self.output_dim)) 113 | 114 | self.Wb_o = self.init((self.input_dim, self.output_dim)) 115 | self.Ub_o = self.inner_init((self.output_dim, self.output_dim)) 116 | self.bb_o = shared_zeros((self.output_dim)) 117 | 118 | self.params = [ 119 | self.W_i, self.U_i, self.b_i, 120 | self.W_c, self.U_c, self.b_c, 121 | self.W_f, self.U_f, self.b_f, 122 | self.W_o, self.U_o, self.b_o, 123 | 124 | self.Wb_i, self.Ub_i, self.bb_i, 125 | self.Wb_c, self.Ub_c, self.bb_c, 126 | self.Wb_f, self.Ub_f, self.bb_f, 127 | self.Wb_o, self.Ub_o, self.bb_o, 128 | ] 129 | 130 | if weights is not None: 131 | self.set_weights(weights) 132 | 133 | def _forward_step(self, 134 | xi_t, xf_t, xo_t, xc_t, 135 | h_tm1, c_tm1, 136 | u_i, u_f, u_o, u_c): 137 | i_t = self.inner_activation(xi_t + T.dot(h_tm1, u_i)) 138 | f_t = self.inner_activation(xf_t + T.dot(h_tm1, u_f)) 139 | o_t = self.inner_activation(xo_t + T.dot(h_tm1, u_o)) 140 | g_t = self.activation(xc_t + T.dot(h_tm1, u_c)) 141 | c_t = f_t * c_tm1 + i_t * g_t 142 | h_t = o_t * self.activation(c_t) 143 | return h_t, c_t 144 | 145 | def get_forward_output(self, train): 146 | X = self.get_input(train) 147 | X = X.dimshuffle((1,0,2)) 148 | 149 | xi = T.dot(X, self.W_i) + self.b_i 150 | xf = T.dot(X, self.W_f) + self.b_f 151 | xc = T.dot(X, self.W_c) + self.b_c 152 | xo = T.dot(X, self.W_o) + self.b_o 153 | 154 | [outputs, memories], updates = theano.scan( 155 | self._forward_step, 156 | sequences=[xi, xf, xo, xc], 157 | outputs_info=[ 158 | T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1), 159 | T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1) 160 | ], 161 | non_sequences=[self.U_i, self.U_f, self.U_o, self.U_c], 162 | truncate_gradient=self.truncate_gradient 163 | ) 164 | return outputs.dimshuffle((1,0,2)) 165 | 166 | 167 | def get_backward_output(self, train): 168 | X = self.get_input(train) 169 | X = X.dimshuffle((1,0,2)) 170 | 171 | xi = T.dot(X, self.Wb_i) + self.bb_i 172 | xf = T.dot(X, self.Wb_f) + self.bb_f 173 | xc = T.dot(X, self.Wb_c) + self.bb_c 174 | xo = T.dot(X, self.Wb_o) + self.bb_o 175 | 176 | [outputs, memories], updates = theano.scan( 177 | self._forward_step, 178 | sequences=[xi, xf, xo, xc], 179 | outputs_info=[ 180 | T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1), 181 | T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1) 182 | ], 183 | non_sequences=[self.Ub_i, self.Ub_f, self.Ub_o, self.Ub_c], 184 | go_backwards = True, 185 | truncate_gradient=self.truncate_gradient 186 | ) 187 | return outputs.dimshuffle((1,0,2)) 188 | 189 | 190 | def get_output(self, train): 191 | forward = self.get_forward_output(train) 192 | backward = self.get_backward_output(train) 193 | if self.output_mode is 'sum': 194 | return forward + backward 195 | elif self.output_mode is 'concat': 196 | return T.concatenate([forward, backward], axis=2) 197 | else: 198 | raise Exception('output mode is not sum or concat') 199 | 200 | 201 | def get_config(self): 202 | return {"name":self.__class__.__name__, 203 | "input_dim":self.input_dim, 204 | "output_dim":self.output_dim, 205 | "init":self.init.__name__, 206 | "inner_init":self.inner_init.__name__, 207 | "activation":self.activation.__name__, 208 | "inner_activation":self.inner_activation.__name__, 209 | "truncate_gradient":self.truncate_gradient} 210 | --------------------------------------------------------------------------------