├── .gitignore ├── License.md ├── basic_rnn_example.py ├── README.md ├── hf_example.py ├── rnn.py └── rnn_minibatch.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[co] 2 | 3 | # Packages 4 | *.egg 5 | *.egg-info 6 | dist 7 | build 8 | eggs 9 | parts 10 | bin 11 | var 12 | sdist 13 | develop-eggs 14 | .installed.cfg 15 | 16 | # Installer logs 17 | pip-log.txt 18 | 19 | # Unit test / coverage reports 20 | .coverage 21 | .tox 22 | 23 | #Translations 24 | *.mo 25 | 26 | #Mr Developer 27 | .mr.developer.cfg 28 | -------------------------------------------------------------------------------- /License.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Graham William Taylor 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, this 11 | list of conditions and the following disclaimer in the documentation and/or 12 | other materials provided with the distribution. 13 | 14 | * Neither the name of the copyright holder nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | -------------------------------------------------------------------------------- /basic_rnn_example.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import theano 3 | import theano.tensor as TT 4 | 5 | # number of hidden units 6 | n = 50 7 | # number of input units 8 | nin = 5 9 | # number of output units 10 | nout = 5 11 | 12 | # input (where first dimension is time) 13 | u = TT.matrix() 14 | # target (where first dimension is time) 15 | t = TT.matrix() 16 | # initial hidden state of the RNN 17 | h0 = TT.vector() 18 | # learning rate 19 | lr = TT.scalar() 20 | # recurrent weights as a shared variable 21 | W = theano.shared(numpy.random.uniform(size=(n, n), low=-.01, high=.01)) 22 | # input to hidden layer weights 23 | W_in = theano.shared(numpy.random.uniform(size=(nin, n), low=-.01, high=.01)) 24 | # hidden to output layer weights 25 | W_out = theano.shared(numpy.random.uniform(size=(n, nout), low=-.01, high=.01)) 26 | 27 | 28 | # recurrent function (using tanh activation function) and linear output 29 | # activation function 30 | def step(u_t, h_tm1, W, W_in, W_out): 31 | h_t = TT.tanh(TT.dot(u_t, W_in) + TT.dot(h_tm1, W)) 32 | y_t = TT.dot(h_t, W_out) 33 | return h_t, y_t 34 | 35 | # the hidden state `h` for the entire sequence, and the output for the 36 | # entire sequence `y` (first dimension is always time) 37 | [h, y], _ = theano.scan(step, 38 | sequences=u, 39 | outputs_info=[h0, None], 40 | non_sequences=[W, W_in, W_out]) 41 | # error between output and target 42 | error = ((y - t) ** 2).sum() 43 | # gradients on the weights using BPTT 44 | gW, gW_in, gW_out = TT.grad(error, [W, W_in, W_out]) 45 | # training function that computes the error and updates the weights using 46 | # SGD. 47 | fn = theano.function([h0, u, t, lr], 48 | error, 49 | updates={W: W - lr * gW, 50 | W_in: W_in - lr * gW_in, 51 | W_out: W_out - lr * gW_out}) 52 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # theano-rnn 2 | 3 | Demonstration of a recurrent neural network implemented with Theano 4 | 5 | ## Dependencies 6 | 7 | * [Theano](http://deeplearning.net/software/theano/) 8 | * [Scikit-learn](http://scikit-learn.org/stable/) 9 | This relies on scikit-learn simply because I subclass its BaseEstimator 10 | class, but this dependency could easily be removed. 11 | * A reasonably good Python distribution with numpy and scipy. I 12 | recommend [Enthought](http://enthought.com/) because it is heavily optimized and has a free 13 | academic license. 14 | * If you want to use the Hessian-Free optimizer, then you will also need: 15 | [theano-hf](https://github.com/boulanni/theano-hf) 16 | 17 | ## Description 18 | 19 | * rnn.py : this is the most basic implementation of a "vanilla" RNN. It 20 | is designed for readability. It processes one sequence at a time. 21 | There are three test functions which show how to train an RNN with 22 | real-valued, binary or softmax outputs using stochastic gradient 23 | descent. 24 | * rnn_minibatch.py : this is similar to rnn.py but the code is slightly more 25 | complicated and it processes multiple sequences at once for speed 26 | (i.e. in "mini-batches"). It also hooks into scipy.optimize to use 27 | more sophisticated optimization methods. It again includes three test 28 | functions based on output type. 
29 | * hf_example.py: this uses the class defined by rnn.py but instead of 30 | training it with stochastic gradient descent, it trains it with 31 | [Martens and Sutskever's variant of Hessian-Free optimization](http://www.cs.toronto.edu/~jmartens/docs/RNN_HF.pdf). 32 | 33 | ## Other implementations 34 | 35 | There are other Theano rnn implementations publicly available, for example: 36 | * [Razvan Pascanu's implementation](https://github.com/pascanur/trainingRNNs). 37 | * Razvan also provides a [simple sketch](http://groups.google.com/group/theano-users/browse_thread/thread/39c755b93675f437). This was the starting point for this code on the Theano-users list. 38 | * [Matthias Zoehrer's implementation](https://github.com/mzoehr/Theano/tree/rnn_benchmark/benchmark/rnn). 39 | -------------------------------------------------------------------------------- /hf_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | This code uses the recurrent neural net implementation in rnn.py 3 | but trains it using Hessian-Free optimization. 4 | 5 | It requires the theano-hf package: 6 | https://github.com/boulanni/theano-hf 7 | 8 | @author Graham Taylor 9 | 10 | """ 11 | from rnn import MetaRNN 12 | from hf import SequenceDataset, hf_optimizer 13 | import numpy as np 14 | import matplotlib.pyplot as plt 15 | import logging 16 | 17 | 18 | def test_real(n_updates=100): 19 | """ Test RNN with real-valued outputs. """ 20 | n_hidden = 10 21 | n_in = 5 22 | n_out = 3 23 | n_steps = 10 24 | n_seq = 1000 25 | 26 | np.random.seed(0) 27 | # simple lag test 28 | seq = np.random.randn(n_seq, n_steps, n_in) 29 | 30 | targets = np.zeros((n_seq, n_steps, n_out)) 31 | targets[:, 1:, 0] = seq[:, :-1, 3] # delayed 1 32 | targets[:, 1:, 1] = seq[:, :-1, 2] # delayed 1 33 | targets[:, 2:, 2] = seq[:, :-2, 0] # delayed 2 34 | 35 | targets += 0.01 * np.random.standard_normal(targets.shape) 36 | 37 | # SequenceDataset wants a list of sequences 38 | # this allows them to be different lengths, but here they're not 39 | seq = [i for i in seq] 40 | targets = [i for i in targets] 41 | 42 | gradient_dataset = SequenceDataset([seq, targets], batch_size=None, 43 | number_batches=100) 44 | cg_dataset = SequenceDataset([seq, targets], batch_size=None, 45 | number_batches=20) 46 | 47 | model = MetaRNN(n_in=n_in, n_hidden=n_hidden, n_out=n_out, 48 | activation='tanh') 49 | 50 | opt = hf_optimizer(p=model.rnn.params, inputs=[model.x, model.y], 51 | s=model.rnn.y_pred, 52 | costs=[model.rnn.loss(model.y)], h=model.rnn.h) 53 | 54 | opt.train(gradient_dataset, cg_dataset, num_updates=n_updates) 55 | 56 | plt.close('all') 57 | fig = plt.figure() 58 | ax1 = plt.subplot(211) 59 | plt.plot(seq[0]) 60 | ax1.set_title('input') 61 | ax2 = plt.subplot(212) 62 | true_targets = plt.plot(targets[0]) 63 | 64 | guess = model.predict(seq[0]) 65 | guessed_targets = plt.plot(guess, linestyle='--') 66 | for i, x in enumerate(guessed_targets): 67 | x.set_color(true_targets[i].get_color()) 68 | ax2.set_title('solid: true output, dashed: model output') 69 | 70 | 71 | def test_binary(multiple_out=False, n_updates=250): 72 | """ Test RNN with binary outputs. 
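Targets encode simple comparisons between lagged input dimensions (see the comments on the target assignments below).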
""" 73 | n_hidden = 10 74 | n_in = 5 75 | if multiple_out: 76 | n_out = 2 77 | else: 78 | n_out = 1 79 | n_steps = 10 80 | n_seq = 100 81 | 82 | np.random.seed(0) 83 | # simple lag test 84 | seq = np.random.randn(n_seq, n_steps, n_in) 85 | targets = np.zeros((n_seq, n_steps, n_out), dtype='int32') 86 | 87 | # whether lag 1 (dim 3) is greater than lag 2 (dim 0) 88 | targets[:, 2:, 0] = np.cast[np.int32](seq[:, 1:-1, 3] > seq[:, :-2, 0]) 89 | 90 | if multiple_out: 91 | # whether product of lag 1 (dim 4) and lag 1 (dim 2) 92 | # is less than lag 2 (dim 0) 93 | targets[:, 2:, 1] = np.cast[np.int32]( 94 | (seq[:, 1:-1, 4] * seq[:, 1:-1, 2]) > seq[:, :-2, 0]) 95 | 96 | # SequenceDataset wants a list of sequences 97 | # this allows them to be different lengths, but here they're not 98 | seq = [i for i in seq] 99 | targets = [i for i in targets] 100 | 101 | gradient_dataset = SequenceDataset([seq, targets], batch_size=None, 102 | number_batches=500) 103 | cg_dataset = SequenceDataset([seq, targets], batch_size=None, 104 | number_batches=100) 105 | 106 | model = MetaRNN(n_in=n_in, n_hidden=n_hidden, n_out=n_out, 107 | activation='tanh', output_type='binary') 108 | 109 | # optimizes negative log likelihood 110 | # but also reports zero-one error 111 | opt = hf_optimizer(p=model.rnn.params, inputs=[model.x, model.y], 112 | s=model.rnn.y_pred, 113 | costs=[model.rnn.loss(model.y), 114 | model.rnn.errors(model.y)], h=model.rnn.h) 115 | 116 | # using settings of initial_lambda and mu given in Nicolas' RNN example 117 | # seem to do a little worse than the default 118 | opt.train(gradient_dataset, cg_dataset, num_updates=n_updates) 119 | 120 | seqs = xrange(10) 121 | 122 | plt.close('all') 123 | for seq_num in seqs: 124 | fig = plt.figure() 125 | ax1 = plt.subplot(211) 126 | plt.plot(seq[seq_num]) 127 | ax1.set_title('input') 128 | ax2 = plt.subplot(212) 129 | true_targets = plt.step(xrange(n_steps), targets[seq_num], marker='o') 130 | 131 | guess = model.predict_proba(seq[seq_num]) 132 | guessed_targets = plt.step(xrange(n_steps), guess) 133 | plt.setp(guessed_targets, linestyle='--', marker='d') 134 | for i, x in enumerate(guessed_targets): 135 | x.set_color(true_targets[i].get_color()) 136 | ax2.set_ylim((-0.1, 1.1)) 137 | ax2.set_title('solid: true output, dashed: model output (prob)') 138 | 139 | 140 | def test_softmax(n_updates=250): 141 | """ Test RNN with softmax outputs. 
""" 142 | n_hidden = 10 143 | n_in = 5 144 | n_steps = 10 145 | n_seq = 100 146 | n_classes = 3 147 | n_out = n_classes # restricted to single softmax per time step 148 | 149 | np.random.seed(0) 150 | # simple lag test 151 | seq = np.random.randn(n_seq, n_steps, n_in) 152 | targets = np.zeros((n_seq, n_steps), dtype='int32') 153 | 154 | thresh = 0.5 155 | # if lag 1 (dim 3) is greater than lag 2 (dim 0) + thresh 156 | # class 1 157 | # if lag 1 (dim 3) is less than lag 2 (dim 0) - thresh 158 | # class 2 159 | # if lag 2(dim0) - thresh <= lag 1 (dim 3) <= lag2(dim0) + thresh 160 | # class 0 161 | targets[:, 2:][seq[:, 1:-1, 3] > seq[:, :-2, 0] + thresh] = 1 162 | targets[:, 2:][seq[:, 1:-1, 3] < seq[:, :-2, 0] - thresh] = 2 163 | #targets[:, 2:, 0] = np.cast[np.int](seq[:, 1:-1, 3] > seq[:, :-2, 0]) 164 | 165 | # SequenceDataset wants a list of sequences 166 | # this allows them to be different lengths, but here they're not 167 | seq = [i for i in seq] 168 | targets = [i for i in targets] 169 | 170 | gradient_dataset = SequenceDataset([seq, targets], batch_size=None, 171 | number_batches=500) 172 | cg_dataset = SequenceDataset([seq, targets], batch_size=None, 173 | number_batches=100) 174 | 175 | model = MetaRNN(n_in=n_in, n_hidden=n_hidden, n_out=n_out, 176 | activation='tanh', output_type='softmax', 177 | use_symbolic_softmax=True) 178 | 179 | # optimizes negative log likelihood 180 | # but also reports zero-one error 181 | opt = hf_optimizer(p=model.rnn.params, inputs=[model.x, model.y], 182 | s=model.rnn.y_pred, 183 | costs=[model.rnn.loss(model.y), 184 | model.rnn.errors(model.y)], h=model.rnn.h) 185 | 186 | # using settings of initial_lambda and mu given in Nicolas' RNN example 187 | # seem to do a little worse than the default 188 | opt.train(gradient_dataset, cg_dataset, num_updates=n_updates) 189 | 190 | seqs = xrange(10) 191 | 192 | plt.close('all') 193 | for seq_num in seqs: 194 | fig = plt.figure() 195 | ax1 = plt.subplot(211) 196 | plt.plot(seq[seq_num]) 197 | ax1.set_title('input') 198 | 199 | ax2 = plt.subplot(212) 200 | # blue line will represent true classes 201 | true_targets = plt.step(xrange(n_steps), targets[seq_num], marker='o') 202 | 203 | # show probabilities (in b/w) output by model 204 | guess = model.predict_proba(seq[seq_num]) 205 | guessed_probs = plt.imshow(guess.T, interpolation='nearest', 206 | cmap='gray') 207 | ax2.set_title('blue: true class, grayscale: probs assigned by model') 208 | 209 | 210 | if __name__ == "__main__": 211 | logging.basicConfig(level=logging.INFO) 212 | #test_real(n_updates=20) 213 | #test_binary(multiple_out=True, n_updates=20) 214 | test_softmax(n_updates=20) 215 | -------------------------------------------------------------------------------- /rnn.py: -------------------------------------------------------------------------------- 1 | """ Vanilla RNN 2 | 3 | @author Graham Taylor 4 | """ 5 | import numpy as np 6 | import theano 7 | import theano.tensor as T 8 | from sklearn.base import BaseEstimator 9 | import logging 10 | import time 11 | import os 12 | import datetime 13 | import cPickle as pickle 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | import matplotlib.pyplot as plt 18 | plt.ion() 19 | 20 | mode = theano.Mode(linker='cvm') 21 | #mode = 'DEBUG_MODE' 22 | 23 | 24 | class RNN(object): 25 | """ Recurrent neural network class 26 | 27 | Supported output types: 28 | real : linear output units, use mean-squared error 29 | binary : binary output units, use cross-entropy error 30 | softmax : single softmax out, use 
cross-entropy error 31 | """ 32 | def __init__(self, input, n_in, n_hidden, n_out, activation=T.tanh, 33 | output_type='real', use_symbolic_softmax=False): 34 | 35 | self.input = input 36 | self.activation = activation 37 | self.output_type = output_type 38 | 39 | # when using HF, SoftmaxGrad.grad is not implemented 40 | # use a symbolic softmax which is slightly slower than T.nnet.softmax 41 | # See: http://groups.google.com/group/theano-dev/browse_thread/ 42 | # thread/3930bd5a6a67d27a 43 | if use_symbolic_softmax: 44 | def symbolic_softmax(x): 45 | e = T.exp(x) 46 | return e / T.sum(e, axis=1).dimshuffle(0, 'x') 47 | self.softmax = symbolic_softmax 48 | else: 49 | self.softmax = T.nnet.softmax 50 | 51 | # recurrent weights as a shared variable 52 | W_init = np.asarray(np.random.uniform(size=(n_hidden, n_hidden), 53 | low=-.01, high=.01), 54 | dtype=theano.config.floatX) 55 | self.W = theano.shared(value=W_init, name='W') 56 | # input to hidden layer weights 57 | W_in_init = np.asarray(np.random.uniform(size=(n_in, n_hidden), 58 | low=-.01, high=.01), 59 | dtype=theano.config.floatX) 60 | self.W_in = theano.shared(value=W_in_init, name='W_in') 61 | 62 | # hidden to output layer weights 63 | W_out_init = np.asarray(np.random.uniform(size=(n_hidden, n_out), 64 | low=-.01, high=.01), 65 | dtype=theano.config.floatX) 66 | self.W_out = theano.shared(value=W_out_init, name='W_out') 67 | 68 | h0_init = np.zeros((n_hidden,), dtype=theano.config.floatX) 69 | self.h0 = theano.shared(value=h0_init, name='h0') 70 | 71 | bh_init = np.zeros((n_hidden,), dtype=theano.config.floatX) 72 | self.bh = theano.shared(value=bh_init, name='bh') 73 | 74 | by_init = np.zeros((n_out,), dtype=theano.config.floatX) 75 | self.by = theano.shared(value=by_init, name='by') 76 | 77 | self.params = [self.W, self.W_in, self.W_out, self.h0, 78 | self.bh, self.by] 79 | 80 | # for every parameter, we maintain it's last update 81 | # the idea here is to use "momentum" 82 | # keep moving mostly in the same direction 83 | self.updates = {} 84 | for param in self.params: 85 | init = np.zeros(param.get_value(borrow=True).shape, 86 | dtype=theano.config.floatX) 87 | self.updates[param] = theano.shared(init) 88 | 89 | # recurrent function (using tanh activation function) and linear output 90 | # activation function 91 | def step(x_t, h_tm1): 92 | h_t = self.activation(T.dot(x_t, self.W_in) + \ 93 | T.dot(h_tm1, self.W) + self.bh) 94 | y_t = T.dot(h_t, self.W_out) + self.by 95 | return h_t, y_t 96 | 97 | # the hidden state `h` for the entire sequence, and the output for the 98 | # entire sequence `y` (first dimension is always time) 99 | [self.h, self.y_pred], _ = theano.scan(step, 100 | sequences=self.input, 101 | outputs_info=[self.h0, None]) 102 | 103 | # L1 norm ; one regularization option is to enforce L1 norm to 104 | # be small 105 | self.L1 = 0 106 | self.L1 += abs(self.W.sum()) 107 | self.L1 += abs(self.W_in.sum()) 108 | self.L1 += abs(self.W_out.sum()) 109 | 110 | # square of L2 norm ; one regularization option is to enforce 111 | # square of L2 norm to be small 112 | self.L2_sqr = 0 113 | self.L2_sqr += (self.W ** 2).sum() 114 | self.L2_sqr += (self.W_in ** 2).sum() 115 | self.L2_sqr += (self.W_out ** 2).sum() 116 | 117 | if self.output_type == 'real': 118 | self.loss = lambda y: self.mse(y) 119 | elif self.output_type == 'binary': 120 | # push through sigmoid 121 | self.p_y_given_x = T.nnet.sigmoid(self.y_pred) # apply sigmoid 122 | self.y_out = T.round(self.p_y_given_x) # round to {0,1} 123 | self.loss = lambda y: 
self.nll_binary(y) 124 | elif self.output_type == 'softmax': 125 | # push through softmax, computing vector of class-membership 126 | # probabilities in symbolic form 127 | self.p_y_given_x = self.softmax(self.y_pred) 128 | 129 | # compute prediction as class whose probability is maximal 130 | self.y_out = T.argmax(self.p_y_given_x, axis=-1) 131 | self.loss = lambda y: self.nll_multiclass(y) 132 | else: 133 | raise NotImplementedError 134 | 135 | def mse(self, y): 136 | # error between output and target 137 | return T.mean((self.y_pred - y) ** 2) 138 | 139 | def nll_binary(self, y): 140 | # negative log likelihood based on binary cross entropy error 141 | return T.mean(T.nnet.binary_crossentropy(self.p_y_given_x, y)) 142 | 143 | def nll_multiclass(self, y): 144 | # negative log likelihood based on multiclass cross entropy error 145 | # y.shape[0] is (symbolically) the number of rows in y, i.e., 146 | # number of time steps (call it T) in the sequence 147 | # T.arange(y.shape[0]) is a symbolic vector which will contain 148 | # [0,1,2,... n-1] T.log(self.p_y_given_x) is a matrix of 149 | # Log-Probabilities (call it LP) with one row per example and 150 | # one column per class LP[T.arange(y.shape[0]),y] is a vector 151 | # v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., 152 | # LP[n-1,y[n-1]]] and T.mean(LP[T.arange(y.shape[0]),y]) is 153 | # the mean (across minibatch examples) of the elements in v, 154 | # i.e., the mean log-likelihood across the minibatch. 155 | return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y]) 156 | 157 | def errors(self, y): 158 | """Return a float representing the number of errors in the sequence 159 | over the total number of examples in the sequence ; zero one 160 | loss over the size of the sequence 161 | 162 | :type y: theano.tensor.TensorType 163 | :param y: corresponds to a vector that gives for each example the 164 | correct label 165 | """ 166 | # check if y has same dimension of y_pred 167 | if y.ndim != self.y_out.ndim: 168 | raise TypeError('y should have the same shape as self.y_out', 169 | ('y', y.type, 'y_out', self.y_out.type)) 170 | 171 | if self.output_type in ('binary', 'softmax'): 172 | # check if y is of the correct datatype 173 | if y.dtype.startswith('int'): 174 | # the T.neq operator returns a vector of 0s and 1s, where 1 175 | # represents a mistake in prediction 176 | return T.mean(T.neq(self.y_out, y)) 177 | else: 178 | raise NotImplementedError() 179 | 180 | 181 | class MetaRNN(BaseEstimator): 182 | def __init__(self, n_in=5, n_hidden=50, n_out=5, learning_rate=0.01, 183 | n_epochs=100, L1_reg=0.00, L2_reg=0.00, learning_rate_decay=1, 184 | activation='tanh', output_type='real', 185 | final_momentum=0.9, initial_momentum=0.5, 186 | momentum_switchover=5, 187 | use_symbolic_softmax=False): 188 | self.n_in = int(n_in) 189 | self.n_hidden = int(n_hidden) 190 | self.n_out = int(n_out) 191 | self.learning_rate = float(learning_rate) 192 | self.learning_rate_decay = float(learning_rate_decay) 193 | self.n_epochs = int(n_epochs) 194 | self.L1_reg = float(L1_reg) 195 | self.L2_reg = float(L2_reg) 196 | self.activation = activation 197 | self.output_type = output_type 198 | self.initial_momentum = float(initial_momentum) 199 | self.final_momentum = float(final_momentum) 200 | self.momentum_switchover = int(momentum_switchover) 201 | self.use_symbolic_softmax = use_symbolic_softmax 202 | 203 | self.ready() 204 | 205 | def ready(self): 206 | # input (where first dimension is time) 207 | self.x = T.matrix() 208 | # target (where 
first dimension is time) 209 | if self.output_type == 'real': 210 | self.y = T.matrix(name='y', dtype=theano.config.floatX) 211 | elif self.output_type == 'binary': 212 | self.y = T.matrix(name='y', dtype='int32') 213 | elif self.output_type == 'softmax': # only vector labels supported 214 | self.y = T.vector(name='y', dtype='int32') 215 | else: 216 | raise NotImplementedError 217 | # initial hidden state of the RNN 218 | self.h0 = T.vector() 219 | # learning rate 220 | self.lr = T.scalar() 221 | 222 | if self.activation == 'tanh': 223 | activation = T.tanh 224 | elif self.activation == 'sigmoid': 225 | activation = T.nnet.sigmoid 226 | elif self.activation == 'relu': 227 | activation = lambda x: x * (x > 0) 228 | elif self.activation == 'cappedrelu': 229 | activation = lambda x: T.minimum(x * (x > 0), 6) 230 | else: 231 | raise NotImplementedError 232 | 233 | self.rnn = RNN(input=self.x, n_in=self.n_in, 234 | n_hidden=self.n_hidden, n_out=self.n_out, 235 | activation=activation, output_type=self.output_type, 236 | use_symbolic_softmax=self.use_symbolic_softmax) 237 | 238 | if self.output_type == 'real': 239 | self.predict = theano.function(inputs=[self.x, ], 240 | outputs=self.rnn.y_pred, 241 | mode=mode) 242 | elif self.output_type == 'binary': 243 | self.predict_proba = theano.function(inputs=[self.x, ], 244 | outputs=self.rnn.p_y_given_x, mode=mode) 245 | self.predict = theano.function(inputs=[self.x, ], 246 | outputs=T.round(self.rnn.p_y_given_x), 247 | mode=mode) 248 | elif self.output_type == 'softmax': 249 | self.predict_proba = theano.function(inputs=[self.x, ], 250 | outputs=self.rnn.p_y_given_x, mode=mode) 251 | self.predict = theano.function(inputs=[self.x, ], 252 | outputs=self.rnn.y_out, mode=mode) 253 | else: 254 | raise NotImplementedError 255 | 256 | def shared_dataset(self, data_xy): 257 | """ Load the dataset into shared variables """ 258 | 259 | data_x, data_y = data_xy 260 | shared_x = theano.shared(np.asarray(data_x, 261 | dtype=theano.config.floatX)) 262 | 263 | shared_y = theano.shared(np.asarray(data_y, 264 | dtype=theano.config.floatX)) 265 | 266 | if self.output_type in ('binary', 'softmax'): 267 | return shared_x, T.cast(shared_y, 'int32') 268 | else: 269 | return shared_x, shared_y 270 | 271 | def __getstate__(self): 272 | """ Return state sequence.""" 273 | params = self._get_params() # parameters set in constructor 274 | weights = [p.get_value() for p in self.rnn.params] 275 | state = (params, weights) 276 | return state 277 | 278 | def _set_weights(self, weights): 279 | """ Set fittable parameters from weights sequence. 280 | 281 | Parameters must be in the order defined by self.params: 282 | W, W_in, W_out, h0, bh, by 283 | """ 284 | i = iter(weights) 285 | 286 | for param in self.rnn.params: 287 | param.set_value(i.next()) 288 | 289 | def __setstate__(self, state): 290 | """ Set parameters from state sequence. 291 | 292 | Parameters must be in the order defined by self.params: 293 | W, W_in, W_out, h0, bh, by 294 | """ 295 | params, weights = state 296 | self.set_params(**params) 297 | self.ready() 298 | self._set_weights(weights) 299 | 300 | def save(self, fpath='.', fname=None): 301 | """ Save a pickled representation of Model state. 
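If fpath ends in '.pkl' it is treated as the full path to the pickle file; otherwise a timestamped filename (or the supplied fname) is used inside the directory fpath.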
""" 302 | fpathstart, fpathext = os.path.splitext(fpath) 303 | if fpathext == '.pkl': 304 | # User supplied an absolute path to a pickle file 305 | fpath, fname = os.path.split(fpath) 306 | 307 | elif fname is None: 308 | # Generate filename based on date 309 | date_obj = datetime.datetime.now() 310 | date_str = date_obj.strftime('%Y-%m-%d-%H:%M:%S') 311 | class_name = self.__class__.__name__ 312 | fname = '%s.%s.pkl' % (class_name, date_str) 313 | 314 | fabspath = os.path.join(fpath, fname) 315 | 316 | logger.info("Saving to %s ..." % fabspath) 317 | file = open(fabspath, 'wb') 318 | state = self.__getstate__() 319 | pickle.dump(state, file, protocol=pickle.HIGHEST_PROTOCOL) 320 | file.close() 321 | 322 | def load(self, path): 323 | """ Load model parameters from path. """ 324 | logger.info("Loading from %s ..." % path) 325 | file = open(path, 'rb') 326 | state = pickle.load(file) 327 | self.__setstate__(state) 328 | file.close() 329 | 330 | def fit(self, X_train, Y_train, X_test=None, Y_test=None, 331 | validation_frequency=100): 332 | """ Fit model 333 | 334 | Pass in X_test, Y_test to compute test error and report during 335 | training. 336 | 337 | X_train : ndarray (n_seq x n_steps x n_in) 338 | Y_train : ndarray (n_seq x n_steps x n_out) 339 | 340 | validation_frequency : int 341 | in terms of number of sequences (or number of weight updates) 342 | """ 343 | if X_test is not None: 344 | assert(Y_test is not None) 345 | self.interactive = True 346 | test_set_x, test_set_y = self.shared_dataset((X_test, Y_test)) 347 | else: 348 | self.interactive = False 349 | 350 | train_set_x, train_set_y = self.shared_dataset((X_train, Y_train)) 351 | 352 | n_train = train_set_x.get_value(borrow=True).shape[0] 353 | if self.interactive: 354 | n_test = test_set_x.get_value(borrow=True).shape[0] 355 | 356 | ###################### 357 | # BUILD ACTUAL MODEL # 358 | ###################### 359 | logger.info('... 
building the model') 360 | 361 | index = T.lscalar('index') # index to a case 362 | # learning rate (may change) 363 | l_r = T.scalar('l_r', dtype=theano.config.floatX) 364 | mom = T.scalar('mom', dtype=theano.config.floatX) # momentum 365 | 366 | cost = self.rnn.loss(self.y) \ 367 | + self.L1_reg * self.rnn.L1 \ 368 | + self.L2_reg * self.rnn.L2_sqr 369 | 370 | compute_train_error = theano.function(inputs=[index, ], 371 | outputs=self.rnn.loss(self.y), 372 | givens={ 373 | self.x: train_set_x[index], 374 | self.y: train_set_y[index]}, 375 | mode=mode) 376 | 377 | if self.interactive: 378 | compute_test_error = theano.function(inputs=[index, ], 379 | outputs=self.rnn.loss(self.y), 380 | givens={ 381 | self.x: test_set_x[index], 382 | self.y: test_set_y[index]}, 383 | mode=mode) 384 | 385 | # compute the gradient of cost with respect to theta = (W, W_in, W_out) 386 | # gradients on the weights using BPTT 387 | gparams = [] 388 | for param in self.rnn.params: 389 | gparam = T.grad(cost, param) 390 | gparams.append(gparam) 391 | 392 | updates = {} 393 | for param, gparam in zip(self.rnn.params, gparams): 394 | weight_update = self.rnn.updates[param] 395 | upd = mom * weight_update - l_r * gparam 396 | updates[weight_update] = upd 397 | updates[param] = param + upd 398 | 399 | # compiling a Theano function `train_model` that returns the 400 | # cost, but in the same time updates the parameter of the 401 | # model based on the rules defined in `updates` 402 | train_model = theano.function(inputs=[index, l_r, mom], 403 | outputs=cost, 404 | updates=updates, 405 | givens={ 406 | self.x: train_set_x[index], 407 | self.y: train_set_y[index]}, 408 | mode=mode) 409 | 410 | ############### 411 | # TRAIN MODEL # 412 | ############### 413 | logger.info('... training') 414 | epoch = 0 415 | 416 | while (epoch < self.n_epochs): 417 | epoch = epoch + 1 418 | for idx in xrange(n_train): 419 | effective_momentum = self.final_momentum \ 420 | if epoch > self.momentum_switchover \ 421 | else self.initial_momentum 422 | example_cost = train_model(idx, self.learning_rate, 423 | effective_momentum) 424 | 425 | # iteration number (how many weight updates have we made?) 426 | # epoch is 1-based, index is 0 based 427 | iter = (epoch - 1) * n_train + idx + 1 428 | 429 | if iter % validation_frequency == 0: 430 | # compute loss on training set 431 | train_losses = [compute_train_error(i) 432 | for i in xrange(n_train)] 433 | this_train_loss = np.mean(train_losses) 434 | 435 | if self.interactive: 436 | test_losses = [compute_test_error(i) 437 | for i in xrange(n_test)] 438 | this_test_loss = np.mean(test_losses) 439 | 440 | logger.info('epoch %i, seq %i/%i, tr loss %f ' 441 | 'te loss %f lr: %f' % \ 442 | (epoch, idx + 1, n_train, 443 | this_train_loss, this_test_loss, self.learning_rate)) 444 | else: 445 | logger.info('epoch %i, seq %i/%i, train loss %f ' 446 | 'lr: %f' % \ 447 | (epoch, idx + 1, n_train, this_train_loss, 448 | self.learning_rate)) 449 | 450 | self.learning_rate *= self.learning_rate_decay 451 | 452 | 453 | def test_real(): 454 | """ Test RNN with real-valued outputs. 
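Targets are delayed copies of selected input dimensions plus a small amount of noise (a simple lag task).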
""" 455 | n_hidden = 10 456 | n_in = 5 457 | n_out = 3 458 | n_steps = 10 459 | n_seq = 100 460 | 461 | np.random.seed(0) 462 | # simple lag test 463 | seq = np.random.randn(n_seq, n_steps, n_in) 464 | targets = np.zeros((n_seq, n_steps, n_out)) 465 | 466 | targets[:, 1:, 0] = seq[:, :-1, 3] # delayed 1 467 | targets[:, 1:, 1] = seq[:, :-1, 2] # delayed 1 468 | targets[:, 2:, 2] = seq[:, :-2, 0] # delayed 2 469 | 470 | targets += 0.01 * np.random.standard_normal(targets.shape) 471 | 472 | model = MetaRNN(n_in=n_in, n_hidden=n_hidden, n_out=n_out, 473 | learning_rate=0.001, learning_rate_decay=0.999, 474 | n_epochs=400, activation='tanh') 475 | 476 | model.fit(seq, targets, validation_frequency=1000) 477 | 478 | plt.close('all') 479 | fig = plt.figure() 480 | ax1 = plt.subplot(211) 481 | plt.plot(seq[0]) 482 | ax1.set_title('input') 483 | 484 | ax2 = plt.subplot(212) 485 | true_targets = plt.plot(targets[0]) 486 | 487 | guess = model.predict(seq[0]) 488 | guessed_targets = plt.plot(guess, linestyle='--') 489 | for i, x in enumerate(guessed_targets): 490 | x.set_color(true_targets[i].get_color()) 491 | ax2.set_title('solid: true output, dashed: model output') 492 | 493 | 494 | def test_binary(multiple_out=False, n_epochs=250): 495 | """ Test RNN with binary outputs. """ 496 | n_hidden = 10 497 | n_in = 5 498 | if multiple_out: 499 | n_out = 2 500 | else: 501 | n_out = 1 502 | n_steps = 10 503 | n_seq = 100 504 | 505 | np.random.seed(0) 506 | # simple lag test 507 | seq = np.random.randn(n_seq, n_steps, n_in) 508 | targets = np.zeros((n_seq, n_steps, n_out)) 509 | 510 | # whether lag 1 (dim 3) is greater than lag 2 (dim 0) 511 | targets[:, 2:, 0] = np.cast[np.int](seq[:, 1:-1, 3] > seq[:, :-2, 0]) 512 | 513 | if multiple_out: 514 | # whether product of lag 1 (dim 4) and lag 1 (dim 2) 515 | # is less than lag 2 (dim 0) 516 | targets[:, 2:, 1] = np.cast[np.int]( 517 | (seq[:, 1:-1, 4] * seq[:, 1:-1, 2]) > seq[:, :-2, 0]) 518 | 519 | model = MetaRNN(n_in=n_in, n_hidden=n_hidden, n_out=n_out, 520 | learning_rate=0.001, learning_rate_decay=0.999, 521 | n_epochs=n_epochs, activation='tanh', output_type='binary') 522 | 523 | model.fit(seq, targets, validation_frequency=1000) 524 | 525 | seqs = xrange(10) 526 | 527 | plt.close('all') 528 | for seq_num in seqs: 529 | fig = plt.figure() 530 | ax1 = plt.subplot(211) 531 | plt.plot(seq[seq_num]) 532 | ax1.set_title('input') 533 | ax2 = plt.subplot(212) 534 | true_targets = plt.step(xrange(n_steps), targets[seq_num], marker='o') 535 | 536 | guess = model.predict_proba(seq[seq_num]) 537 | guessed_targets = plt.step(xrange(n_steps), guess) 538 | plt.setp(guessed_targets, linestyle='--', marker='d') 539 | for i, x in enumerate(guessed_targets): 540 | x.set_color(true_targets[i].get_color()) 541 | ax2.set_ylim((-0.1, 1.1)) 542 | ax2.set_title('solid: true output, dashed: model output (prob)') 543 | 544 | 545 | def test_softmax(n_epochs=250): 546 | """ Test RNN with softmax outputs. 
""" 547 | n_hidden = 10 548 | n_in = 5 549 | n_steps = 10 550 | n_seq = 100 551 | n_classes = 3 552 | n_out = n_classes # restricted to single softmax per time step 553 | 554 | np.random.seed(0) 555 | # simple lag test 556 | seq = np.random.randn(n_seq, n_steps, n_in) 557 | targets = np.zeros((n_seq, n_steps), dtype=np.int) 558 | 559 | thresh = 0.5 560 | # if lag 1 (dim 3) is greater than lag 2 (dim 0) + thresh 561 | # class 1 562 | # if lag 1 (dim 3) is less than lag 2 (dim 0) - thresh 563 | # class 2 564 | # if lag 2(dim0) - thresh <= lag 1 (dim 3) <= lag2(dim0) + thresh 565 | # class 0 566 | targets[:, 2:][seq[:, 1:-1, 3] > seq[:, :-2, 0] + thresh] = 1 567 | targets[:, 2:][seq[:, 1:-1, 3] < seq[:, :-2, 0] - thresh] = 2 568 | #targets[:, 2:, 0] = np.cast[np.int](seq[:, 1:-1, 3] > seq[:, :-2, 0]) 569 | 570 | model = MetaRNN(n_in=n_in, n_hidden=n_hidden, n_out=n_out, 571 | learning_rate=0.001, learning_rate_decay=0.999, 572 | n_epochs=n_epochs, activation='tanh', 573 | output_type='softmax', use_symbolic_softmax=False) 574 | 575 | model.fit(seq, targets, validation_frequency=1000) 576 | 577 | seqs = xrange(10) 578 | 579 | plt.close('all') 580 | for seq_num in seqs: 581 | fig = plt.figure() 582 | ax1 = plt.subplot(211) 583 | plt.plot(seq[seq_num]) 584 | ax1.set_title('input') 585 | ax2 = plt.subplot(212) 586 | 587 | # blue line will represent true classes 588 | true_targets = plt.step(xrange(n_steps), targets[seq_num], marker='o') 589 | 590 | # show probabilities (in b/w) output by model 591 | guess = model.predict_proba(seq[seq_num]) 592 | guessed_probs = plt.imshow(guess.T, interpolation='nearest', 593 | cmap='gray') 594 | ax2.set_title('blue: true class, grayscale: probs assigned by model') 595 | 596 | 597 | if __name__ == "__main__": 598 | logging.basicConfig(level=logging.INFO) 599 | t0 = time.time() 600 | test_real() 601 | # problem takes more epochs to solve 602 | #test_binary(multiple_out=True, n_epochs=2400) 603 | #test_softmax(n_epochs=250) 604 | print "Elapsed time: %f" % (time.time() - t0) 605 | -------------------------------------------------------------------------------- /rnn_minibatch.py: -------------------------------------------------------------------------------- 1 | """ Vanilla RNN 2 | Parallelizes scan over sequences by using mini-batches. 
3 | 4 | @author Graham Taylor 5 | """ 6 | import numpy as np 7 | import theano 8 | import theano.tensor as T 9 | from sklearn.base import BaseEstimator 10 | import logging 11 | import time 12 | import os 13 | import datetime 14 | import cPickle as pickle 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | import matplotlib.pyplot as plt 19 | plt.ion() 20 | 21 | mode = theano.Mode(linker='cvm') 22 | #mode = 'DEBUG_MODE' 23 | 24 | 25 | class RNN(object): 26 | """ Recurrent neural network class 27 | 28 | Supported output types: 29 | real : linear output units, use mean-squared error 30 | binary : binary output units, use cross-entropy error 31 | softmax : single softmax out, use cross-entropy error 32 | 33 | """ 34 | def __init__(self, input, n_in, n_hidden, n_out, activation=T.tanh, 35 | output_type='real'): 36 | 37 | self.input = input 38 | self.activation = activation 39 | self.output_type = output_type 40 | 41 | self.batch_size = T.iscalar() 42 | 43 | # theta is a vector of all trainable parameters 44 | # it represents the value of W, W_in, W_out, h0, bh, by 45 | theta_shape = n_hidden ** 2 + n_in * n_hidden + n_hidden * n_out + \ 46 | n_hidden + n_hidden + n_out 47 | self.theta = theano.shared(value=np.zeros(theta_shape, 48 | dtype=theano.config.floatX)) 49 | 50 | # Parameters are reshaped views of theta 51 | param_idx = 0 # pointer to somewhere along parameter vector 52 | 53 | # recurrent weights as a shared variable 54 | self.W = self.theta[param_idx:(param_idx + n_hidden ** 2)].reshape( 55 | (n_hidden, n_hidden)) 56 | self.W.name = 'W' 57 | W_init = np.asarray(np.random.uniform(size=(n_hidden, n_hidden), 58 | low=-0.01, high=0.01), 59 | dtype=theano.config.floatX) 60 | param_idx += n_hidden ** 2 61 | 62 | # input to hidden layer weights 63 | self.W_in = self.theta[param_idx:(param_idx + n_in * \ 64 | n_hidden)].reshape((n_in, n_hidden)) 65 | self.W_in.name = 'W_in' 66 | W_in_init = np.asarray(np.random.uniform(size=(n_in, n_hidden), 67 | low=-0.01, high=0.01), 68 | dtype=theano.config.floatX) 69 | param_idx += n_in * n_hidden 70 | 71 | # hidden to output layer weights 72 | self.W_out = self.theta[param_idx:(param_idx + n_hidden * \ 73 | n_out)].reshape((n_hidden, n_out)) 74 | self.W_out.name = 'W_out' 75 | 76 | W_out_init = np.asarray(np.random.uniform(size=(n_hidden, n_out), 77 | low=-0.01, high=0.01), 78 | dtype=theano.config.floatX) 79 | param_idx += n_hidden * n_out 80 | 81 | self.h0 = self.theta[param_idx:(param_idx + n_hidden)] 82 | self.h0.name = 'h0' 83 | h0_init = np.zeros((n_hidden,), dtype=theano.config.floatX) 84 | param_idx += n_hidden 85 | 86 | self.bh = self.theta[param_idx:(param_idx + n_hidden)] 87 | self.bh.name = 'bh' 88 | bh_init = np.zeros((n_hidden,), dtype=theano.config.floatX) 89 | param_idx += n_hidden 90 | 91 | self.by = self.theta[param_idx:(param_idx + n_out)] 92 | self.by.name = 'by' 93 | by_init = np.zeros((n_out,), dtype=theano.config.floatX) 94 | param_idx += n_out 95 | 96 | assert(param_idx == theta_shape) 97 | 98 | # for convenience 99 | self.params = [self.W, self.W_in, self.W_out, self.h0, self.bh, 100 | self.by] 101 | 102 | # shortcut to norms (for monitoring) 103 | self.l2_norms = {} 104 | for param in self.params: 105 | self.l2_norms[param] = T.sqrt(T.sum(param ** 2)) 106 | 107 | # initialize parameters 108 | # DEBUG_MODE gives division by zero error when we leave parameters 109 | # as zeros 110 | self.theta.set_value(np.concatenate([x.ravel() for x in 111 | (W_init, W_in_init, W_out_init, h0_init, bh_init, by_init)])) 112 | 113 | 
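# theta_update stores the previous parameter update (the velocity term) so that the SGD branch of fit() can apply momentum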
self.theta_update = theano.shared( 114 | value=np.zeros(theta_shape, dtype=theano.config.floatX)) 115 | 116 | # recurrent function (using tanh activation function) and arbitrary output 117 | # activation function 118 | def step(x_t, h_tm1): 119 | h_t = self.activation(T.dot(x_t, self.W_in) + \ 120 | T.dot(h_tm1, self.W) + self.bh) 121 | y_t = T.dot(h_t, self.W_out) + self.by 122 | return h_t, y_t 123 | 124 | # the hidden state `h` for the entire sequence, and the output for the 125 | # entire sequence `y` (first dimension is always time) 126 | # Note the implementation of weight-sharing h0 across variable-size 127 | # batches using T.ones multiplying h0 128 | # Alternatively, T.alloc approach is more robust 129 | [self.h, self.y_pred], _ = theano.scan(step, 130 | sequences=self.input, 131 | outputs_info=[T.alloc(self.h0, self.input.shape[1], 132 | n_hidden), None]) 133 | # outputs_info=[T.ones(shape=(self.input.shape[1], 134 | # self.h0.shape[0])) * self.h0, None]) 135 | 136 | # L1 norm ; one regularization option is to enforce L1 norm to 137 | # be small 138 | self.L1 = 0 139 | self.L1 += abs(self.W.sum()) 140 | self.L1 += abs(self.W_in.sum()) 141 | self.L1 += abs(self.W_out.sum()) 142 | 143 | # square of L2 norm ; one regularization option is to enforce 144 | # square of L2 norm to be small 145 | self.L2_sqr = 0 146 | self.L2_sqr += (self.W ** 2).sum() 147 | self.L2_sqr += (self.W_in ** 2).sum() 148 | self.L2_sqr += (self.W_out ** 2).sum() 149 | 150 | if self.output_type == 'real': 151 | self.loss = lambda y: self.mse(y) 152 | elif self.output_type == 'binary': 153 | # push through sigmoid 154 | self.p_y_given_x = T.nnet.sigmoid(self.y_pred) # apply sigmoid 155 | self.y_out = T.round(self.p_y_given_x) # round to {0,1} 156 | self.loss = lambda y: self.nll_binary(y) 157 | elif self.output_type == 'softmax': 158 | # push through softmax, computing vector of class-membership 159 | # probabilities in symbolic form 160 | # 161 | # T.nnet.softmax will not operate on T.tensor3 types, only matrices 162 | # We take our n_steps x n_seq x n_classes output from the net 163 | # and reshape it into a (n_steps * n_seq) x n_classes matrix 164 | # apply softmax, then reshape back 165 | y_p = self.y_pred 166 | y_p_m = T.reshape(y_p, (y_p.shape[0] * y_p.shape[1], -1)) 167 | y_p_s = T.nnet.softmax(y_p_m) 168 | self.p_y_given_x = T.reshape(y_p_s, y_p.shape) 169 | 170 | # compute prediction as class whose probability is maximal 171 | self.y_out = T.argmax(self.p_y_given_x, axis=-1) 172 | self.loss = lambda y: self.nll_multiclass(y) 173 | 174 | else: 175 | raise NotImplementedError 176 | 177 | def mse(self, y): 178 | # error between output and target 179 | return T.mean((self.y_pred - y) ** 2) 180 | 181 | def nll_binary(self, y): 182 | # negative log likelihood based on binary cross entropy error 183 | return T.mean(T.nnet.binary_crossentropy(self.p_y_given_x, y)) 184 | 185 | def nll_multiclass(self, y): 186 | # negative log likelihood based on multiclass cross entropy error 187 | # 188 | # Theano's advanced indexing is limited 189 | # therefore we reshape our n_steps x n_seq x n_classes tensor3 of probs 190 | # to a (n_steps * n_seq) x n_classes matrix of probs 191 | # so that we can use advanced indexing (i.e. 
get the probs which 192 | # correspond to the true class) 193 | # the labels y also must be flattened when we do this to use the 194 | # advanced indexing 195 | p_y = self.p_y_given_x 196 | p_y_m = T.reshape(p_y, (p_y.shape[0] * p_y.shape[1], -1)) 197 | y_f = y.flatten(ndim=1) 198 | return -T.mean(T.log(p_y_m)[T.arange(p_y_m.shape[0]), y_f]) 199 | 200 | def errors(self, y): 201 | """Return a float representing the number of errors in the minibatch 202 | over the total number of examples of the minibatch ; zero one 203 | loss over the size of the minibatch 204 | 205 | :type y: theano.tensor.TensorType 206 | :param y: corresponds to a vector that gives for each example the 207 | correct label 208 | """ 209 | 210 | # check if y has same dimension of y_pred 211 | if y.ndim != self.y_out.ndim: 212 | raise TypeError('y should have the same shape as self.y_out', 213 | ('y', y.type, 'y_out', self.y_out.type)) 214 | # check if y is of the correct datatype 215 | if y.dtype.startswith('int'): 216 | # the T.neq operator returns a vector of 0s and 1s, where 1 217 | # represents a mistake in prediction 218 | return T.mean(T.neq(self.y_out, y)) 219 | else: 220 | raise NotImplementedError() 221 | 222 | 223 | class MetaRNN(BaseEstimator): 224 | def __init__(self, n_in=5, n_hidden=50, n_out=5, learning_rate=0.01, 225 | n_epochs=100, batch_size=100, L1_reg=0.00, L2_reg=0.00, 226 | learning_rate_decay=1, 227 | activation='tanh', output_type='real', final_momentum=0.9, 228 | initial_momentum=0.5, momentum_switchover=5, 229 | snapshot_every=None, snapshot_path='/tmp'): 230 | self.n_in = int(n_in) 231 | self.n_hidden = int(n_hidden) 232 | self.n_out = int(n_out) 233 | self.learning_rate = float(learning_rate) 234 | self.learning_rate_decay = float(learning_rate_decay) 235 | self.n_epochs = int(n_epochs) 236 | self.batch_size = int(batch_size) 237 | self.L1_reg = float(L1_reg) 238 | self.L2_reg = float(L2_reg) 239 | self.activation = activation 240 | self.output_type = output_type 241 | self.initial_momentum = float(initial_momentum) 242 | self.final_momentum = float(final_momentum) 243 | self.momentum_switchover = int(momentum_switchover) 244 | if snapshot_every is not None: 245 | self.snapshot_every = int(snapshot_every) 246 | else: 247 | self.snapshot_every = None 248 | self.snapshot_path = snapshot_path 249 | 250 | self.ready() 251 | 252 | def ready(self): 253 | # input (where first dimension is time) 254 | self.x = T.tensor3(name='x') 255 | # target (where first dimension is time) 256 | if self.output_type == 'real': 257 | self.y = T.tensor3(name='y', dtype=theano.config.floatX) 258 | elif self.output_type == 'binary': 259 | self.y = T.tensor3(name='y', dtype='int32') 260 | elif self.output_type == 'softmax': # now it is a matrix (T x n_seq) 261 | self.y = T.matrix(name='y', dtype='int32') 262 | else: 263 | raise NotImplementedError 264 | 265 | # learning rate 266 | self.lr = T.scalar() 267 | 268 | if self.activation == 'tanh': 269 | activation = T.tanh 270 | elif self.activation == 'sigmoid': 271 | activation = T.nnet.sigmoid 272 | elif self.activation == 'relu': 273 | activation = lambda x: x * (x > 0) 274 | elif self.activation == 'cappedrelu': 275 | activation = lambda x: T.minimum(x * (x > 0), 6) 276 | else: 277 | raise NotImplementedError 278 | 279 | self.rnn = RNN(input=self.x, n_in=self.n_in, 280 | n_hidden=self.n_hidden, n_out=self.n_out, 281 | activation=activation, output_type=self.output_type) 282 | 283 | if self.output_type == 'real': 284 | self.predict = theano.function(inputs=[self.x, ], 285 | 
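# compiles the symbolic graph into a callable mapping an input sequence tensor to real-valued predictions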
outputs=self.rnn.y_pred, 286 | mode=mode) 287 | elif self.output_type == 'binary': 288 | self.predict_proba = theano.function(inputs=[self.x, ], 289 | outputs=self.rnn.p_y_given_x, mode=mode) 290 | self.predict = theano.function(inputs=[self.x, ], 291 | outputs=T.round(self.rnn.p_y_given_x), 292 | mode=mode) 293 | elif self.output_type == 'softmax': 294 | self.predict_proba = theano.function(inputs=[self.x, ], 295 | outputs=self.rnn.p_y_given_x, mode=mode) 296 | self.predict = theano.function(inputs=[self.x, ], 297 | outputs=self.rnn.y_out, mode=mode) 298 | else: 299 | raise NotImplementedError 300 | 301 | def shared_dataset(self, data_xy, borrow=True): 302 | """ Load the dataset into shared variables """ 303 | 304 | data_x, data_y = data_xy 305 | shared_x = theano.shared(np.asarray(data_x, 306 | dtype=theano.config.floatX), 307 | borrow=True) 308 | 309 | shared_y = theano.shared(np.asarray(data_y, 310 | dtype=theano.config.floatX), 311 | borrow=True) 312 | 313 | if self.output_type in ('binary', 'softmax'): 314 | return shared_x, T.cast(shared_y, 'int32') 315 | else: 316 | return shared_x, shared_y 317 | 318 | def __getstate__(self): 319 | """ Return state sequence.""" 320 | params = self._get_params() # parameters set in constructor 321 | theta = self.rnn.theta.get_value() 322 | state = (params, theta) 323 | return state 324 | 325 | def _set_weights(self, theta): 326 | """ Set fittable parameters from weights sequence. 327 | """ 328 | self.rnn.theta.set_value(theta) 329 | 330 | def __setstate__(self, state): 331 | """ Set parameters from state sequence. 332 | """ 333 | params, theta = state 334 | self.set_params(**params) 335 | self.ready() 336 | self._set_weights(theta) 337 | 338 | def save(self, fpath='.', fname=None): 339 | """ Save a pickled representation of Model state. """ 340 | fpathstart, fpathext = os.path.splitext(fpath) 341 | if fpathext == '.pkl': 342 | # User supplied an absolute path to a pickle file 343 | fpath, fname = os.path.split(fpath) 344 | 345 | elif fname is None: 346 | # Generate filename based on date 347 | date_obj = datetime.datetime.now() 348 | date_str = date_obj.strftime('%Y-%m-%d-%H:%M:%S') 349 | class_name = self.__class__.__name__ 350 | fname = '%s.%s.pkl' % (class_name, date_str) 351 | 352 | fabspath = os.path.join(fpath, fname) 353 | 354 | logger.info("Saving to %s ..." % fabspath) 355 | file = open(fabspath, 'wb') 356 | state = self.__getstate__() 357 | pickle.dump(state, file, protocol=pickle.HIGHEST_PROTOCOL) 358 | file.close() 359 | 360 | def load(self, path): 361 | """ Load model parameters from path. """ 362 | logger.info("Loading from %s ..." % path) 363 | file = open(path, 'rb') 364 | state = pickle.load(file) 365 | self.__setstate__(state) 366 | file.close() 367 | 368 | def optional_output(self, train_set_x, show_norms=True, show_output=True): 369 | """ Produces some debugging output. 
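Logs the L2 norms of the individual parameter groups and/or the model output on the first training case, depending on the show_norms / show_output flags.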
""" 370 | if show_norms: 371 | norm_output = [] 372 | for param in self.rnn.params: 373 | norm_output.append('%s: %6.4f' % (param.name, 374 | self.get_norms[param]())) 375 | logger.info("norms: {" + ', '.join(norm_output) + "}") 376 | 377 | if show_output: 378 | # show output for a single case 379 | if self.output_type == 'binary': 380 | output_fn = self.predict_proba 381 | else: 382 | output_fn = self.predict 383 | logger.info("sample output: " + \ 384 | str(output_fn(train_set_x.get_value( 385 | borrow=True)[:, 0, :][:, np.newaxis, :]).flatten())) 386 | 387 | def fit(self, X_train, Y_train, X_test=None, Y_test=None, 388 | validate_every=100, optimizer='sgd', compute_zero_one=False, 389 | show_norms=True, show_output=True): 390 | """ Fit model 391 | 392 | Pass in X_test, Y_test to compute test error and report during 393 | training. 394 | 395 | X_train : ndarray (T x n_in) 396 | Y_train : ndarray (T x n_out) 397 | 398 | validation_frequency : int 399 | in terms of number of epochs 400 | 401 | optimizer : string 402 | Optimizer type. 403 | Possible values: 404 | 'sgd' : batch stochastic gradient descent 405 | 'cg' : nonlinear conjugate gradient algorithm 406 | (scipy.optimize.fmin_cg) 407 | 'bfgs' : quasi-Newton method of Broyden, Fletcher, Goldfarb, 408 | and Shanno (scipy.optimize.fmin_bfgs) 409 | 'l_bfgs_b' : Limited-memory BFGS (scipy.optimize.fmin_l_bfgs_b) 410 | 411 | compute_zero_one : bool 412 | in the case of binary output, compute zero-one error in addition to 413 | cross-entropy error 414 | show_norms : bool 415 | Show L2 norms of individual parameter groups while training. 416 | show_output : bool 417 | Show the model output on first training case while training. 418 | """ 419 | if X_test is not None: 420 | assert(Y_test is not None) 421 | self.interactive = True 422 | test_set_x, test_set_y = self.shared_dataset((X_test, Y_test)) 423 | else: 424 | self.interactive = False 425 | 426 | train_set_x, train_set_y = self.shared_dataset((X_train, Y_train)) 427 | 428 | if compute_zero_one: 429 | assert(self.output_type == 'binary' \ 430 | or self.output_type == 'softmax') 431 | # compute number of minibatches for training 432 | # note that cases are the second dimension, not the first 433 | n_train = train_set_x.get_value(borrow=True).shape[1] 434 | n_train_batches = int(np.ceil(1.0 * n_train / self.batch_size)) 435 | if self.interactive: 436 | n_test = test_set_x.get_value(borrow=True).shape[1] 437 | n_test_batches = int(np.ceil(1.0 * n_test / self.batch_size)) 438 | 439 | #validate_every is specified in terms of epochs 440 | validation_frequency = validate_every * n_train_batches 441 | 442 | ###################### 443 | # BUILD ACTUAL MODEL # 444 | ###################### 445 | logger.info('... 
building the model') 446 | 447 | index = T.lscalar('index') # index to a [mini]batch 448 | n_ex = T.lscalar('n_ex') # total number of examples 449 | # learning rate (may change) 450 | l_r = T.scalar('l_r', dtype=theano.config.floatX) 451 | mom = T.scalar('mom', dtype=theano.config.floatX) # momentum 452 | 453 | cost = self.rnn.loss(self.y) \ 454 | + self.L1_reg * self.rnn.L1 \ 455 | + self.L2_reg * self.rnn.L2_sqr 456 | 457 | # Proper implementation of variable-batch size evaluation 458 | # Note that classifier.errors() returns the mean error 459 | # But the last batch may be a smaller size 460 | # So we keep around the effective_batch_size (whose last element may 461 | # be smaller than the rest) 462 | # And weight the reported error by the batch_size when we average 463 | # Also, by keeping batch_start and batch_stop as symbolic variables, 464 | # we make the theano function easier to read 465 | batch_start = index * self.batch_size 466 | batch_stop = T.minimum(n_ex, (index + 1) * self.batch_size) 467 | effective_batch_size = batch_stop - batch_start 468 | 469 | get_batch_size = theano.function(inputs=[index, n_ex], 470 | outputs=effective_batch_size) 471 | 472 | compute_train_error = theano.function(inputs=[index, n_ex], 473 | outputs=self.rnn.loss(self.y), 474 | givens={self.x: train_set_x[:, batch_start:batch_stop], 475 | self.y: train_set_y[:, batch_start:batch_stop]}, 476 | mode=mode) 477 | 478 | if compute_zero_one: 479 | compute_train_zo = theano.function(inputs=[index, n_ex], 480 | outputs=self.rnn.errors(self.y), 481 | givens={self.x: train_set_x[:, batch_start:batch_stop], 482 | self.y: train_set_y[:, batch_start:batch_stop]}, 483 | mode=mode) 484 | 485 | if self.interactive: 486 | compute_test_error = theano.function(inputs=[index, n_ex], 487 | outputs=self.rnn.loss(self.y), 488 | givens={self.x: test_set_x[:, batch_start:batch_stop], 489 | self.y: test_set_y[:, batch_start:batch_stop]}, 490 | mode=mode) 491 | 492 | if compute_zero_one: 493 | compute_test_zo = theano.function(inputs=[index, n_ex], 494 | outputs=self.rnn.errors(self.y), 495 | givens={self.x: test_set_x[:, batch_start:batch_stop], 496 | self.y: test_set_y[:, batch_start:batch_stop]}, 497 | mode=mode) 498 | 499 | self.get_norms = {} 500 | for param in self.rnn.params: 501 | self.get_norms[param] = theano.function(inputs=[], 502 | outputs=self.rnn.l2_norms[param], mode=mode) 503 | 504 | # compute the gradient of cost with respect to theta using BPTT 505 | gtheta = T.grad(cost, self.rnn.theta) 506 | 507 | if optimizer == 'sgd': 508 | 509 | updates = {} 510 | theta = self.rnn.theta 511 | theta_update = self.rnn.theta_update 512 | # careful here, update to the shared variable 513 | # cannot depend on an updated other shared variable 514 | # since updates happen in parallel 515 | # so we need to be explicit 516 | upd = mom * theta_update - l_r * gtheta 517 | updates[theta_update] = upd 518 | updates[theta] = theta + upd 519 | 520 | # compiling a Theano function `train_model` that returns the 521 | # cost, but in the same time updates the parameter of the 522 | # model based on the rules defined in `updates` 523 | train_model = theano.function(inputs=[index, n_ex, l_r, mom], 524 | outputs=cost, 525 | updates=updates, 526 | givens={self.x: train_set_x[:, batch_start:batch_stop], 527 | self.y: train_set_y[:, batch_start:batch_stop]}, 528 | mode=mode) 529 | 530 | ############### 531 | # TRAIN MODEL # 532 | ############### 533 | logger.info('... 
training') 534 | epoch = 0 535 | 536 | while (epoch < self.n_epochs): 537 | epoch = epoch + 1 538 | effective_momentum = self.final_momentum \ 539 | if epoch > self.momentum_switchover \ 540 | else self.initial_momentum 541 | 542 | for minibatch_idx in xrange(n_train_batches): 543 | minibatch_avg_cost = train_model(minibatch_idx, n_train, 544 | self.learning_rate, 545 | effective_momentum) 546 | 547 | # iteration number (how many weight updates have we made?) 548 | # epoch is 1-based, index is 0 based 549 | iter = (epoch - 1) * n_train_batches + minibatch_idx + 1 550 | 551 | if iter % validation_frequency == 0: 552 | # compute loss on training set 553 | train_losses = [compute_train_error(i, n_train) 554 | for i in xrange(n_train_batches)] 555 | train_batch_sizes = [get_batch_size(i, n_train) 556 | for i in xrange(n_train_batches)] 557 | 558 | this_train_loss = np.average(train_losses, 559 | weights=train_batch_sizes) 560 | 561 | if compute_zero_one: 562 | train_zero_one = [compute_train_zo(i, n_train) 563 | for i in xrange(n_train_batches)] 564 | 565 | this_train_zero_one = np.average(train_zero_one, 566 | weights=train_batch_sizes) 567 | 568 | if self.interactive: 569 | test_losses = [compute_test_error(i, n_test) 570 | for i in xrange(n_test_batches)] 571 | 572 | test_batch_sizes = [get_batch_size(i, n_test) 573 | for i in xrange(n_test_batches)] 574 | 575 | this_test_loss = np.average(test_losses, 576 | weights=test_batch_sizes) 577 | 578 | if compute_zero_one: 579 | test_zero_one = [compute_test_zo(i, n_test) 580 | for i in xrange(n_test_batches)] 581 | 582 | this_test_zero_one = np.average(test_zero_one, 583 | weights=test_batch_sizes) 584 | 585 | if compute_zero_one: 586 | logger.info('epoch %i, mb %i/%i, tr loss %f, ' 587 | 'tr zo %f, te loss %f ' 588 | 'te zo %f lr: %f' % \ 589 | (epoch, minibatch_idx + 1, 590 | n_train_batches, 591 | this_train_loss, this_train_zero_one, 592 | this_test_loss, this_test_zero_one, 593 | self.learning_rate)) 594 | else: 595 | logger.info('epoch %i, mb %i/%i, tr loss %f ' 596 | 'te loss %f lr: %f' % \ 597 | (epoch, minibatch_idx + 1, n_train_batches, 598 | this_train_loss, this_test_loss, 599 | self.learning_rate)) 600 | 601 | else: 602 | if compute_zero_one: 603 | logger.info('epoch %i, mb %i/%i, train loss %f' 604 | ' train zo %f ' 605 | 'lr: %f' % (epoch, 606 | minibatch_idx + 1, 607 | n_train_batches, 608 | this_train_loss, 609 | this_train_zero_one, 610 | self.learning_rate)) 611 | else: 612 | logger.info('epoch %i, mb %i/%i, train loss %f' 613 | ' lr: %f' % (epoch, 614 | minibatch_idx + 1, 615 | n_train_batches, 616 | this_train_loss, 617 | self.learning_rate)) 618 | 619 | self.optional_output(train_set_x, show_norms, 620 | show_output) 621 | 622 | self.learning_rate *= self.learning_rate_decay 623 | 624 | if self.snapshot_every is not None: 625 | if (epoch + 1) % self.snapshot_every == 0: 626 | date_obj = datetime.datetime.now() 627 | date_str = date_obj.strftime('%Y-%m-%d-%H:%M:%S') 628 | class_name = self.__class__.__name__ 629 | fname = '%s.%s-snapshot-%d.pkl' % (class_name, 630 | date_str, epoch + 1) 631 | fabspath = os.path.join(self.snapshot_path, fname) 632 | self.save(fpath=fabspath) 633 | 634 | elif optimizer == 'cg' or optimizer == 'bfgs' \ 635 | or optimizer == 'l_bfgs_b': 636 | # compile a theano function that returns the cost of a minibatch 637 | batch_cost = theano.function(inputs=[index, n_ex], 638 | outputs=cost, 639 | givens={self.x: train_set_x[:, batch_start:batch_stop], 640 | self.y: train_set_y[:, batch_start:batch_stop]}, 
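# the cost is evaluated at the current value of self.rnn.theta, which train_fn / train_fn_grad below set explicitly before calling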
641 |                                      mode=mode, name="batch_cost")
642 | 
643 |             # compile a theano function that returns the gradient of the
644 |             # minibatch with respect to theta
645 |             batch_grad = theano.function(inputs=[index, n_ex],
646 |                                          outputs=T.grad(cost, self.rnn.theta),
647 |                                          givens={self.x: train_set_x[:, batch_start:batch_stop],
648 |                                                  self.y: train_set_y[:, batch_start:batch_stop]},
649 |                                          mode=mode, name="batch_grad")
650 | 
651 |             # creates a function that computes the average cost on the training
652 |             # set
653 |             def train_fn(theta_value):
654 |                 self.rnn.theta.set_value(theta_value, borrow=True)
655 |                 train_losses = [batch_cost(i, n_train)
656 |                                 for i in xrange(n_train_batches)]
657 |                 train_batch_sizes = [get_batch_size(i, n_train)
658 |                                      for i in xrange(n_train_batches)]
659 |                 return np.average(train_losses, weights=train_batch_sizes)
660 | 
661 |             # creates a function that computes the average gradient of cost
662 |             # with respect to theta
663 |             def train_fn_grad(theta_value):
664 |                 self.rnn.theta.set_value(theta_value, borrow=True)
665 | 
666 |                 train_grads = [batch_grad(i, n_train)
667 |                                for i in xrange(n_train_batches)]
668 |                 train_batch_sizes = [get_batch_size(i, n_train)
669 |                                      for i in xrange(n_train_batches)]
670 | 
671 |                 return np.average(train_grads, weights=train_batch_sizes,
672 |                                   axis=0)
673 | 
674 |             # validation function, prints useful output after each iteration
675 |             def callback(theta_value):
676 |                 self.epoch += 1
677 |                 if (self.epoch) % validate_every == 0:
678 |                     self.rnn.theta.set_value(theta_value, borrow=True)
679 |                     # compute loss on training set
680 |                     train_losses = [compute_train_error(i, n_train)
681 |                                     for i in xrange(n_train_batches)]
682 |                     train_batch_sizes = [get_batch_size(i, n_train)
683 |                                          for i in xrange(n_train_batches)]
684 | 
685 |                     this_train_loss = np.average(train_losses,
686 |                                                  weights=train_batch_sizes)
687 | 
688 |                     if compute_zero_one:
689 |                         train_zero_one = [compute_train_zo(i, n_train)
690 |                                           for i in xrange(n_train_batches)]
691 | 
692 |                         this_train_zero_one = np.average(train_zero_one,
693 |                                                          weights=train_batch_sizes)
694 | 
695 |                     if self.interactive:
696 |                         test_losses = [compute_test_error(i, n_test)
697 |                                        for i in xrange(n_test_batches)]
698 | 
699 |                         test_batch_sizes = [get_batch_size(i, n_test)
700 |                                             for i in xrange(n_test_batches)]
701 | 
702 |                         this_test_loss = np.average(test_losses,
703 |                                                     weights=test_batch_sizes)
704 | 
705 |                         if compute_zero_one:
706 |                             test_zero_one = [compute_test_zo(i, n_test)
707 |                                              for i in xrange(n_test_batches)]
708 | 
709 |                             this_test_zero_one = np.average(test_zero_one,
710 |                                                             weights=test_batch_sizes)
711 | 
712 |                         if compute_zero_one:
713 |                             logger.info('epoch %i, tr loss %f, '
714 |                                         'tr zo %f, te loss %f '
715 |                                         'te zo %f' % \
716 |                                         (self.epoch, this_train_loss,
717 |                                          this_train_zero_one, this_test_loss,
718 |                                          this_test_zero_one))
719 |                         else:
720 |                             logger.info('epoch %i, tr loss %f, te loss %f' % \
721 |                                         (self.epoch, this_train_loss,
722 |                                          this_test_loss))
723 | 
724 |                     else:
725 |                         if compute_zero_one:
726 |                             logger.info('epoch %i, train loss %f'
727 |                                         ', train zo %f ' % \
728 |                                         (self.epoch, this_train_loss,
729 |                                          this_train_zero_one))
730 |                         else:
731 |                             logger.info('epoch %i, train loss %f ' % \
732 |                                         (self.epoch, this_train_loss))
733 | 
734 |                     self.optional_output(train_set_x, show_norms, show_output)
735 | 
736 |             ###############
737 |             # TRAIN MODEL #
738 |             ###############
739 |             logger.info('...
training') 740 | # using scipy conjugate gradient optimizer 741 | import scipy.optimize 742 | if optimizer == 'cg': 743 | of = scipy.optimize.fmin_cg 744 | elif optimizer == 'bfgs': 745 | of = scipy.optimize.fmin_bfgs 746 | elif optimizer == 'l_bfgs_b': 747 | of = scipy.optimize.fmin_l_bfgs_b 748 | logger.info("Optimizing using %s..." % of.__name__) 749 | start_time = time.clock() 750 | 751 | # keep track of epochs externally 752 | # these get updated through callback 753 | self.epoch = 0 754 | 755 | # interface to l_bfgs_b is different than that of cg, bfgs 756 | # however, this will be changed in scipy 0.11 757 | # unified under scipy.optimize.minimize 758 | if optimizer == 'cg' or optimizer == 'bfgs': 759 | best_theta = of( 760 | f=train_fn, 761 | x0=self.rnn.theta.get_value(), 762 | # x0=np.zeros(self.rnn.theta.get_value().shape, 763 | # dtype=theano.config.floatX), 764 | fprime=train_fn_grad, 765 | callback=callback, 766 | disp=1, 767 | retall=1, 768 | maxiter=self.n_epochs) 769 | elif optimizer == 'l_bfgs_b': 770 | best_theta, f_best_theta, info = of( 771 | func=train_fn, 772 | x0=self.rnn.theta.get_value(), 773 | fprime=train_fn_grad, 774 | iprint=validate_every, 775 | maxfun=self.n_epochs) # max number of feval 776 | 777 | end_time = time.clock() 778 | 779 | print "Optimization time: %f" % (end_time - start_time) 780 | 781 | else: 782 | raise NotImplementedError 783 | 784 | 785 | def test_real(n_epochs=1000): 786 | """ Test RNN with real-valued outputs. """ 787 | n_hidden = 10 788 | n_in = 5 789 | n_out = 3 790 | n_steps = 10 791 | n_seq = 10 # per batch 792 | n_batches = 10 793 | 794 | np.random.seed(0) 795 | # simple lag test 796 | seq = np.random.randn(n_steps, n_seq * n_batches, n_in) 797 | targets = np.zeros((n_steps, n_seq * n_batches, n_out)) 798 | 799 | targets[1:, :, 0] = seq[:-1, :, 3] # delayed 1 800 | targets[1:, :, 1] = seq[:-1, :, 2] # delayed 1 801 | targets[2:, :, 2] = seq[:-2, :, 0] # delayed 2 802 | 803 | targets += 0.01 * np.random.standard_normal(targets.shape) 804 | 805 | model = MetaRNN(n_in=n_in, n_hidden=n_hidden, n_out=n_out, 806 | learning_rate=0.01, learning_rate_decay=0.999, 807 | n_epochs=n_epochs, batch_size=n_seq, activation='tanh', 808 | L2_reg=1e-3) 809 | 810 | model.fit(seq, targets, validate_every=100, optimizer='bfgs') 811 | 812 | plt.close('all') 813 | fig = plt.figure() 814 | ax1 = plt.subplot(211) 815 | plt.plot(seq[:, 0, :]) 816 | ax1.set_title('input') 817 | ax2 = plt.subplot(212) 818 | true_targets = plt.plot(targets[:, 0, :]) 819 | 820 | guess = model.predict(seq[:, 0, :][:, np.newaxis, :]) 821 | 822 | guessed_targets = plt.plot(guess.squeeze(), linestyle='--') 823 | for i, x in enumerate(guessed_targets): 824 | x.set_color(true_targets[i].get_color()) 825 | ax2.set_title('solid: true output, dashed: model output') 826 | 827 | 828 | def test_binary(multiple_out=False, n_epochs=1000, optimizer='cg'): 829 | """ Test RNN with binary outputs. 
""" 830 | n_hidden = 10 831 | n_in = 5 832 | if multiple_out: 833 | n_out = 2 834 | else: 835 | n_out = 1 836 | n_steps = 10 837 | n_seq = 10 # per batch 838 | n_batches = 50 839 | 840 | np.random.seed(0) 841 | # simple lag test 842 | seq = np.random.randn(n_steps, n_seq * n_batches, n_in) 843 | targets = np.zeros((n_steps, n_seq * n_batches, n_out)) 844 | 845 | # whether lag 1 (dim 3) is greater than lag 2 (dim 0) 846 | targets[2:, :, 0] = np.cast[np.int](seq[1:-1, :, 3] > seq[:-2, :, 0]) 847 | 848 | if multiple_out: 849 | # whether product of lag 1 (dim 4) and lag 1 (dim 2) 850 | # is less than lag 2 (dim 0) 851 | targets[2:, :, 1] = np.cast[np.int]( 852 | (seq[1:-1, :, 4] * seq[1:-1, :, 2]) > seq[:-2, :, 0]) 853 | 854 | model = MetaRNN(n_in=n_in, n_hidden=n_hidden, n_out=n_out, 855 | learning_rate=0.005, learning_rate_decay=0.999, 856 | n_epochs=n_epochs, batch_size=n_seq, activation='tanh', 857 | output_type='binary') 858 | 859 | model.fit(seq, targets, validate_every=100, compute_zero_one=True, 860 | optimizer=optimizer) 861 | 862 | seqs = xrange(10) 863 | 864 | plt.close('all') 865 | for seq_num in seqs: 866 | fig = plt.figure() 867 | ax1 = plt.subplot(211) 868 | plt.plot(seq[:, seq_num, :]) 869 | ax1.set_title('input') 870 | ax2 = plt.subplot(212) 871 | true_targets = plt.step(xrange(n_steps), targets[:, seq_num, :], 872 | marker='o') 873 | 874 | guess = model.predict_proba(seq[:, seq_num, :][:, np.newaxis, :]) 875 | guessed_targets = plt.step(xrange(n_steps), guess.squeeze()) 876 | plt.setp(guessed_targets, linestyle='--', marker='d') 877 | for i, x in enumerate(guessed_targets): 878 | x.set_color(true_targets[i].get_color()) 879 | ax2.set_ylim((-0.1, 1.1)) 880 | ax2.set_title('solid: true output, dashed: model output (prob)') 881 | 882 | 883 | def test_softmax(n_epochs=250, optimizer='cg'): 884 | """ Test RNN with softmax outputs. 
""" 885 | n_hidden = 10 886 | n_in = 5 887 | n_steps = 10 888 | n_seq = 10 # per batch 889 | n_batches = 50 890 | n_classes = 3 891 | n_out = n_classes # restricted to single softmax per time step 892 | 893 | np.random.seed(0) 894 | # simple lag test 895 | seq = np.random.randn(n_steps, n_seq * n_batches, n_in) 896 | targets = np.zeros((n_steps, n_seq * n_batches), dtype=np.int) 897 | 898 | thresh = 0.5 899 | # if lag 1 (dim 3) is greater than lag 2 (dim 0) + thresh 900 | # class 1 901 | # if lag 1 (dim 3) is less than lag 2 (dim 0) - thresh 902 | # class 2 903 | # if lag 2(dim0) - thresh <= lag 1 (dim 3) <= lag2(dim0) + thresh 904 | # class 0 905 | targets[2:, :][seq[1:-1, :, 3] > seq[:-2, :, 0] + thresh] = 1 906 | targets[2:, :][seq[1:-1, :, 3] < seq[:-2, :, 0] - thresh] = 2 907 | #targets[:, 2:, 0] = np.cast[np.int](seq[:, 1:-1, 3] > seq[:, :-2, 0]) 908 | 909 | model = MetaRNN(n_in=n_in, n_hidden=n_hidden, n_out=n_out, 910 | learning_rate=0.005, learning_rate_decay=0.999, 911 | n_epochs=n_epochs, batch_size=n_seq, activation='tanh', 912 | output_type='softmax') 913 | 914 | model.fit(seq, targets, validate_every=10, compute_zero_one=True, 915 | optimizer=optimizer) 916 | 917 | seqs = xrange(10) 918 | 919 | plt.close('all') 920 | for seq_num in seqs: 921 | fig = plt.figure() 922 | ax1 = plt.subplot(211) 923 | plt.plot(seq[:, seq_num]) 924 | ax1.set_title('input') 925 | ax2 = plt.subplot(212) 926 | 927 | # blue line will represent true classes 928 | true_targets = plt.step(xrange(n_steps), targets[:, seq_num], 929 | marker='o') 930 | 931 | # show probabilities (in b/w) output by model 932 | guess = model.predict_proba(seq[:, seq_num][:, np.newaxis]) 933 | guessed_probs = plt.imshow(guess.squeeze().T, interpolation='nearest', 934 | cmap='gray') 935 | ax2.set_title('blue: true class, grayscale: probs assigned by model') 936 | 937 | 938 | if __name__ == "__main__": 939 | logging.basicConfig(level=logging.INFO) 940 | t0 = time.time() 941 | test_real(n_epochs=1000) 942 | #test_binary(optimizer='sgd', n_epochs=1000) 943 | #test_softmax(n_epochs=250, optimizer='sgd') 944 | print "Elapsed time: %f" % (time.time() - t0) 945 | --------------------------------------------------------------------------------