├── LICENSE
├── README.md
├── imdb_birnn.py
└── birnn.py


/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # bidirectional_RNN
 2 | bidirectional lstm
 3 | 
 4 | This repo demonstrates how to use [Keras](https://github.com/fchollet/keras) to build a deep bidirectional RNN/LSTM with MLP layers before and after the LSTM layers.
 5 | 
 6 | This repo can be used to implement the network from Baidu's Deep Speech paper:
 7 | 
 8 | Deep Speech: Scaling up end-to-end speech recognition
 9 | arXiv:1412.5567, 2014
10 | A. Hannun et al.
11 | 
12 | ```python
13 | max_features=20000
14 | maxseqlen = 100 # cut texts after this number of words (among top max_features most common words)
15 | batch_size = 16
16 | word_vec_len = 256
17 | 
18 | model = Sequential()
19 | model.add(Embedding(max_features, word_vec_len))
20 | 
21 | # MLP layers
22 | model.add(Transform((word_vec_len,))) # transform from 3d dimensional input to 2d input for mlp
23 | model.add(Dense(word_vec_len, 100, activation='relu'))
24 | model.add(BatchNormalization((100,)))
25 | model.add(Dense(100,100,activation='relu'))
26 | model.add(BatchNormalization((100,)))
27 | model.add(Dense(100, word_vec_len, activation='relu'))
28 | model.add(Transform((maxseqlen, word_vec_len))) # transform back from 2d to 3d for recurrent input
29 | 
30 | # Stacked up BiDirectionLSTM layers
31 | model.add(BiDirectionLSTM(word_vec_len, 100, output_mode='concat'))
32 | model.add(BiDirectionLSTM(200, 24, output_mode='sum'))
33 | 
34 | # MLP layers
35 | model.add(Reshape(24 * maxseqlen))
36 | model.add(BatchNormalization((24 * maxseqlen,)))
37 | model.add(Dense(24 * maxseqlen, 100, activation='relu'))
38 | model.add(Dropout(0.2))
39 | model.add(Dense(100, 1, activation='sigmoid'))
40 | ```
41 | 


--------------------------------------------------------------------------------
/imdb_birnn.py:
--------------------------------------------------------------------------------
 1 | from __future__ import absolute_import
 2 | from __future__ import print_function
 3 | import numpy as np
 4 | 
 5 | from keras.preprocessing import sequence
 6 | from keras.optimizers import SGD, RMSprop, Adagrad
 7 | from keras.utils import np_utils
 8 | from keras.models import Sequential
 9 | from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape
10 | from keras.datasets import cifar10
11 | from keras.layers.normalization import BatchNormalization
12 | from keras.layers.embeddings import Embedding
13 | from keras.layers.recurrent import *
14 | from keras.datasets import imdb
15 | import cPickle
16 | import sys
17 | from birnn import BiDirectionLSTM, Transform
18 | '''
19 |     Train a bidirectional LSTM (BiDirectionLSTM) on the IMDB sentiment classification task.
20 | 
21 |     The dataset is actually too small for LSTM to be of any advantage
22 |     compared to simpler, much faster methods such as TF-IDF+LogReg.
23 | 
24 |     Notes:
25 | 
26 |     - RNNs are tricky. Choice of batch size is important,
27 |     choice of loss and optimizer is critical, etc.
28 |     Most configurations won't converge.
29 | 
30 |     - LSTM loss decrease during training can be quite different
31 |     from what you see with CNNs/MLPs/etc. It's more or less a sigmoid
32 |     instead of an inverse exponential.
33 | 
34 |     GPU command:
35 |         THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python imdb_birnn.py
36 | 
37 |     250s/epoch on GPU (GT 650M), vs. 400s/epoch on CPU (2.4Ghz Core i7).
38 | '''
39 | 
# Hyper-parameters
max_features = 20000  # vocabulary size handed to the loader and the embedding
maxseqlen = 100  # cut texts after this number of words (among top max_features most common words)
batch_size = 16
word_vec_len = 256  # width of each embedded word vector

print("Loading data...")
train_set, test_set = imdb.load_data(nb_words=max_features, test_split=0.2)
train_X, train_y = train_set
test_X, test_y = test_set
print(len(train_X), 'train sequences')
print(len(test_X), 'test sequences')

print("Pad sequences (samples x time)")
train_X = sequence.pad_sequences(train_X, maxlen=maxseqlen)
test_X = sequence.pad_sequences(test_X, maxlen=maxseqlen)

print('train_X shape:', train_X.shape)
print('test_X shape:', test_X.shape)


print('Build model...')
model = Sequential()

# Flattened size of the last recurrent layer's output (24 units per timestep).
flat_len = 24 * maxseqlen

# The layers are listed in the exact order they are constructed and stacked.
layer_stack = [
    Embedding(max_features, word_vec_len),

    # MLP layers applied to every timestep
    Transform((word_vec_len,)),  # 3d input -> 2d input for the MLP
    Dense(word_vec_len, 100, activation='relu'),
    BatchNormalization((100,)),
    Dense(100, 100, activation='relu'),
    BatchNormalization((100,)),
    Dense(100, word_vec_len, activation='relu'),
    Transform((maxseqlen, word_vec_len)),  # 2d -> back to 3d for recurrent input

    # Stacked BiDirectionLSTM layers ('concat' doubles the width: 2 * 50 = 100)
    BiDirectionLSTM(word_vec_len, 50, output_mode='concat'),
    BiDirectionLSTM(100, 24, output_mode='sum'),

    # MLP classifier on top of the flattened sequence output
    Reshape(flat_len),
    BatchNormalization((flat_len,)),
    Dense(flat_len, 50, activation='relu'),
    Dropout(0.2),
    Dense(50, 1, activation='sigmoid'),
]
for layer in layer_stack:
    model.add(layer)

# try using different optimizers and different optimizer configs
model.compile(loss='mean_squared_error', optimizer='adam', class_mode="binary")

print("Train...")
model.fit(
    train_X, train_y,
    batch_size=batch_size,
    nb_epoch=5,
    validation_data=(test_X, test_y),
    show_accuracy=True,
)

score = model.evaluate(test_X, test_y, batch_size=batch_size)
print('Test score:', score)

classes = model.predict_classes(test_X, batch_size=batch_size)
acc = np_utils.accuracy(classes, test_y)
print('Test accuracy:', acc)
94 | 


--------------------------------------------------------------------------------
/birnn.py:
--------------------------------------------------------------------------------
  1 | import theano
  2 | import theano.tensor as T
  3 | 
  4 | from keras import activations, initializations
  5 | from keras.utils.theano_utils import shared_zeros, alloc_zeros_matrix
  6 | from keras.layers.core import Layer
  7 | 
  8 | 
  9 | class Transform(Layer):
 10 |      '''
 11 |          This function is needed transform the dataset such that we can
 12 |          add mlp layers before RNN/LSTM or after the RNN/LSTM.
 13 |      '''
 14 |      def __init__(self, dims, input=None):
 15 |          '''
 16 |          If input is three dimensional tensor3, with dimensions (nb_samples, sequence_length, vector_length)
 17 |          and dims is tuple (vector_length,), then the output will be (nb_samples * sequence_length, vector_length)
 18 |          If input is two dimensional matrix, with dimensions (nb_samples * sequence_length, vector_length)
 19 |          and if we want to revert back to (nb_samples, sequence_length, vector_length) so that we can feed
 20 |          the LSTM, then we can set dims as (sequence_length, vector_length).
 21 |          This function is needed for adding mlp layers before LSTM or after the LSTM.
 22 |          When used as first layer, input has to be set either as tensor3 or matrix
 23 |          '''
 24 | 
 25 |          super(Transform, self).__init__()
 26 |          self.dims = dims
 27 |          if input is not None:
 28 |              self.input = input
 29 | 
 30 |      def get_output(self, train):
 31 |          X = self.get_input(train)
 32 |          dim_mul = 1
 33 |          for i in range(X.ndim):
 34 |             dim_mul *= X.shape[i]
 35 |          input_dim = 1
 36 |          for i in range(len(self.dims)):
 37 |             input_dim *= self.dims[i]
 38 |          di = (dim_mul/input_dim,) + self.dims
 39 |          return theano.tensor.reshape(X, di)
 40 | 
 41 |      def get_config(self):
 42 |          return {"name":self.__class__.__name__,
 43 |              "dims":self.dims}
 44 | 
 45 | 
 46 | class BiDirectionLSTM(Layer):
 47 |     '''
 48 |         Acts as a spatiotemporal projection,
 49 | 
 50 |         Eats inputs with shape:
 51 |         (nb_samples, max_sample_length (samples shorter than this are padded with zeros at the end), input_dim)
 52 | 
 53 |         and returns outputs with shape:
 54 |         (nb_samples, max_sample_length, output_dim)
 55 | 
 56 |         For a step-by-step description of the algorithm, see:
 57 |         http://deeplearning.net/tutorial/lstm.html
 58 | 
 59 |         References:
 60 |             Long short-term memory (original 97 paper)
 61 |                 http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf
 62 |             Learning to forget: Continual prediction with LSTM
 63 |                 http://www.mitpressjournals.org/doi/pdf/10.1162/089976600300015015
 64 |             Supervised sequence labelling with recurrent neural networks
 65 |                 http://www.cs.toronto.edu/~graves/preprint.pdf
 66 |     '''
 67 |     def __init__(self, input_dim, output_dim=128,
 68 |         init='glorot_uniform', inner_init='orthogonal',
 69 |         activation='tanh', inner_activation='hard_sigmoid',
 70 |         weights=None, truncate_gradient=-1, output_mode='sum'):
 71 | 
 72 |         super(BiDirectionLSTM,self).__init__()
 73 |         self.input_dim = input_dim
 74 |         self.output_dim = output_dim
 75 |         self.truncate_gradient = truncate_gradient
 76 |         self.output_mode = output_mode # output_mode is either sum or concatenate
 77 | 
 78 |         self.init = initializations.get(init)
 79 |         self.inner_init = initializations.get(inner_init)
 80 |         self.activation = activations.get(activation)
 81 |         self.inner_activation = activations.get(inner_activation)
 82 |         self.input = T.tensor3()
 83 | 
 84 |         # forward weights
 85 |         self.W_i = self.init((self.input_dim, self.output_dim))
 86 |         self.U_i = self.inner_init((self.output_dim, self.output_dim))
 87 |         self.b_i = shared_zeros((self.output_dim))
 88 | 
 89 |         self.W_f = self.init((self.input_dim, self.output_dim))
 90 |         self.U_f = self.inner_init((self.output_dim, self.output_dim))
 91 |         self.b_f = shared_zeros((self.output_dim))
 92 | 
 93 |         self.W_c = self.init((self.input_dim, self.output_dim))
 94 |         self.U_c = self.inner_init((self.output_dim, self.output_dim))
 95 |         self.b_c = shared_zeros((self.output_dim))
 96 | 
 97 |         self.W_o = self.init((self.input_dim, self.output_dim))
 98 |         self.U_o = self.inner_init((self.output_dim, self.output_dim))
 99 |         self.b_o = shared_zeros((self.output_dim))
100 | 
101 |         # backward weights
102 |         self.Wb_i = self.init((self.input_dim, self.output_dim))
103 |         self.Ub_i = self.inner_init((self.output_dim, self.output_dim))
104 |         self.bb_i = shared_zeros((self.output_dim))
105 | 
106 |         self.Wb_f = self.init((self.input_dim, self.output_dim))
107 |         self.Ub_f = self.inner_init((self.output_dim, self.output_dim))
108 |         self.bb_f = shared_zeros((self.output_dim))
109 | 
110 |         self.Wb_c = self.init((self.input_dim, self.output_dim))
111 |         self.Ub_c = self.inner_init((self.output_dim, self.output_dim))
112 |         self.bb_c = shared_zeros((self.output_dim))
113 | 
114 |         self.Wb_o = self.init((self.input_dim, self.output_dim))
115 |         self.Ub_o = self.inner_init((self.output_dim, self.output_dim))
116 |         self.bb_o = shared_zeros((self.output_dim))
117 | 
118 |         self.params = [
119 |             self.W_i, self.U_i, self.b_i,
120 |             self.W_c, self.U_c, self.b_c,
121 |             self.W_f, self.U_f, self.b_f,
122 |             self.W_o, self.U_o, self.b_o,
123 | 
124 |             self.Wb_i, self.Ub_i, self.bb_i,
125 |             self.Wb_c, self.Ub_c, self.bb_c,
126 |             self.Wb_f, self.Ub_f, self.bb_f,
127 |             self.Wb_o, self.Ub_o, self.bb_o,
128 |         ]
129 | 
130 |         if weights is not None:
131 |             self.set_weights(weights)
132 | 
133 |     def _forward_step(self,
134 |         xi_t, xf_t, xo_t, xc_t,
135 |         h_tm1, c_tm1,
136 |         u_i, u_f, u_o, u_c):
137 |         i_t = self.inner_activation(xi_t + T.dot(h_tm1, u_i))
138 |         f_t = self.inner_activation(xf_t + T.dot(h_tm1, u_f))
139 |         o_t = self.inner_activation(xo_t + T.dot(h_tm1, u_o))
140 |         g_t = self.activation(xc_t + T.dot(h_tm1, u_c))
141 |         c_t = f_t * c_tm1 + i_t * g_t
142 |         h_t = o_t * self.activation(c_t)
143 |         return h_t, c_t
144 | 
145 |     def get_forward_output(self, train):
146 |         X = self.get_input(train)
147 |         X = X.dimshuffle((1,0,2))
148 | 
149 |         xi = T.dot(X, self.W_i) + self.b_i
150 |         xf = T.dot(X, self.W_f) + self.b_f
151 |         xc = T.dot(X, self.W_c) + self.b_c
152 |         xo = T.dot(X, self.W_o) + self.b_o
153 | 
154 |         [outputs, memories], updates = theano.scan(
155 |             self._forward_step,
156 |             sequences=[xi, xf, xo, xc],
157 |             outputs_info=[
158 |                 T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
159 |                 T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1)
160 |             ],
161 |             non_sequences=[self.U_i, self.U_f, self.U_o, self.U_c],
162 |             truncate_gradient=self.truncate_gradient
163 |         )
164 |         return outputs.dimshuffle((1,0,2))
165 | 
166 | 
167 |     def get_backward_output(self, train):
168 |         X = self.get_input(train)
169 |         X = X.dimshuffle((1,0,2))
170 | 
171 |         xi = T.dot(X, self.Wb_i) + self.bb_i
172 |         xf = T.dot(X, self.Wb_f) + self.bb_f
173 |         xc = T.dot(X, self.Wb_c) + self.bb_c
174 |         xo = T.dot(X, self.Wb_o) + self.bb_o
175 | 
176 |         [outputs, memories], updates = theano.scan(
177 |             self._forward_step,
178 |             sequences=[xi, xf, xo, xc],
179 |             outputs_info=[
180 |                 T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1),
181 |                 T.unbroadcast(alloc_zeros_matrix(X.shape[1], self.output_dim), 1)
182 |             ],
183 |             non_sequences=[self.Ub_i, self.Ub_f, self.Ub_o, self.Ub_c],
184 |             go_backwards = True,
185 |             truncate_gradient=self.truncate_gradient
186 |         )
187 |         return outputs.dimshuffle((1,0,2))
188 | 
189 | 
190 |     def get_output(self, train):
191 |         forward = self.get_forward_output(train)
192 |         backward = self.get_backward_output(train)
193 |         if self.output_mode is 'sum':
194 |             return forward + backward
195 |         elif self.output_mode is 'concat':
196 |             return T.concatenate([forward, backward], axis=2)
197 |         else:
198 |             raise Exception('output mode is not sum or concat')
199 | 
200 | 
201 |     def get_config(self):
202 |         return {"name":self.__class__.__name__,
203 |             "input_dim":self.input_dim,
204 |             "output_dim":self.output_dim,
205 |             "init":self.init.__name__,
206 |             "inner_init":self.inner_init.__name__,
207 |             "activation":self.activation.__name__,
208 |             "inner_activation":self.inner_activation.__name__,
209 |             "truncate_gradient":self.truncate_gradient}
210 | 


--------------------------------------------------------------------------------