├── .gitignore ├── License.md ├── basic_rnn_example.py ├── README.md ├── hf_example.py ├── rnn.py └── rnn_minibatch.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[co] 2 | 3 | # Packages 4 | *.egg 5 | *.egg-info 6 | dist 7 | build 8 | eggs 9 | parts 10 | bin 11 | var 12 | sdist 13 | develop-eggs 14 | .installed.cfg 15 | 16 | # Installer logs 17 | pip-log.txt 18 | 19 | # Unit test / coverage reports 20 | .coverage 21 | .tox 22 | 23 | #Translations 24 | *.mo 25 | 26 | #Mr Developer 27 | .mr.developer.cfg 28 | -------------------------------------------------------------------------------- /License.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Graham William Taylor 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, this 11 | list of conditions and the following disclaimer in the documentation and/or 12 | other materials provided with the distribution. 13 | 14 | * Neither the name of the copyright holder nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | -------------------------------------------------------------------------------- /basic_rnn_example.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import theano 3 | import theano.tensor as TT 4 | 5 | # number of hidden units 6 | n = 50 7 | # number of input units 8 | nin = 5 9 | # number of output units 10 | nout = 5 11 | 12 | # input (where first dimension is time) 13 | u = TT.matrix() 14 | # target (where first dimension is time) 15 | t = TT.matrix() 16 | # initial hidden state of the RNN 17 | h0 = TT.vector() 18 | # learning rate 19 | lr = TT.scalar() 20 | # recurrent weights as a shared variable 21 | W = theano.shared(numpy.random.uniform(size=(n, n), low=-.01, high=.01)) 22 | # input to hidden layer weights 23 | W_in = theano.shared(numpy.random.uniform(size=(nin, n), low=-.01, high=.01)) 24 | # hidden to output layer weights 25 | W_out = theano.shared(numpy.random.uniform(size=(n, nout), low=-.01, high=.01)) 26 | 27 | 28 | # recurrent function (using tanh activation function) and linear output 29 | # activation function 30 | def step(u_t, h_tm1, W, W_in, W_out): 31 | h_t = TT.tanh(TT.dot(u_t, W_in) + TT.dot(h_tm1, W)) 32 | y_t = TT.dot(h_t, W_out) 33 | return h_t, y_t 34 | 35 | # the hidden state `h` for the entire sequence, and the output for the 36 | # entire sequence `y` (first dimension is always time) 37 | [h, y], _ = theano.scan(step, 38 | sequences=u, 39 | outputs_info=[h0, None], 40 | non_sequences=[W, W_in, W_out]) 41 | # error between output and target 42 | error = ((y - t) ** 2).sum() 43 | # gradients on the weights using BPTT 44 | gW, gW_in, gW_out = TT.grad(error, [W, W_in, W_out]) 45 | # training function that computes the error and updates the weights using 46 | # SGD. 47 | fn = theano.function([h0, u, t, lr], 48 | error, 49 | updates={W: W - lr * gW, 50 | W_in: W_in - lr * gW_in, 51 | W_out: W_out - lr * gW_out}) 52 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # theano-rnn 2 | 3 | Demonstration of a recurrent neural network implemented with Theano 4 | 5 | ## Dependencies 6 | 7 | * [Theano](http://deeplearning.net/software/theano/) 8 | * [Scikit-learn](http://scikit-learn.org/stable/) 9 | This relies on scikit-learn simply because I subclass its BaseEstimator 10 | class, but this dependency could easily be removed. 11 | * A reasonably good Python distribution with numpy and scipy. I 12 | recommend [Enthought](http://enthought.com/) because it is heavily optimized and has a free 13 | academic license. 14 | * If you want to use the Hessian-Free optimizer, then you will also need: 15 | [theano-hf](https://github.com/boulanni/theano-hf) 16 | 17 | ## Description 18 | 19 | * rnn.py : this is the most basic implementation of a "vanilla" RNN. It 20 | is designed for readability. It processes one sequence at a time. 21 | There are three test functions which show how to train an RNN with 22 | real-valued, binary or softmax outputs using stochastic gradient 23 | descent. 24 | * rnn_minibatch.py : this is similar to rnn.py but the code is slightly more 25 | complicated and it processes multiple sequences at once for speed 26 | (i.e. in "mini-batches"). It also hooks into scipy.optimize to use 27 | more sophisticated optimization methods. It again includes three test 28 | functions based on output type. 
29 | * hf_example.py: this uses the class defined by rnn.py but instead of 30 | training it with stochastic gradient descent, it trains it with 31 | [Martens and Sutskever's variant of Hessian-Free optimization](http://www.cs.toronto.edu/~jmartens/docs/RNN_HF.pdf). 32 | 33 | ## Other implementations 34 | 35 | There are other Theano rnn implementations publicly available, for example: 36 | * [Razvan Pascanu's implementation](https://github.com/pascanur/trainingRNNs). 37 | * Razvan also provides a [simple sketch](http://groups.google.com/group/theano-users/browse_thread/thread/39c755b93675f437). This was the starting point for this code on the Theano-users list. 38 | * [Matthias Zoehrer's implementation](https://github.com/mzoehr/Theano/tree/rnn_benchmark/benchmark/rnn). 39 | -------------------------------------------------------------------------------- /hf_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | This code uses the recurrent neural net implementation in rnn.py 3 | but trains it using Hessian-Free optimization. 4 | 5 | It requires the theano-hf package: 6 | https://github.com/boulanni/theano-hf 7 | 8 | @author Graham Taylor 9 | 10 | """ 11 | from rnn import MetaRNN 12 | from hf import SequenceDataset, hf_optimizer 13 | import numpy as np 14 | import matplotlib.pyplot as plt 15 | import logging 16 | 17 | 18 | def test_real(n_updates=100): 19 | """ Test RNN with real-valued outputs. """ 20 | n_hidden = 10 21 | n_in = 5 22 | n_out = 3 23 | n_steps = 10 24 | n_seq = 1000 25 | 26 | np.random.seed(0) 27 | # simple lag test 28 | seq = np.random.randn(n_seq, n_steps, n_in) 29 | 30 | targets = np.zeros((n_seq, n_steps, n_out)) 31 | targets[:, 1:, 0] = seq[:, :-1, 3] # delayed 1 32 | targets[:, 1:, 1] = seq[:, :-1, 2] # delayed 1 33 | targets[:, 2:, 2] = seq[:, :-2, 0] # delayed 2 34 | 35 | targets += 0.01 * np.random.standard_normal(targets.shape) 36 | 37 | # SequenceDataset wants a list of sequences 38 | # this allows them to be different lengths, but here they're not 39 | seq = [i for i in seq] 40 | targets = [i for i in targets] 41 | 42 | gradient_dataset = SequenceDataset([seq, targets], batch_size=None, 43 | number_batches=100) 44 | cg_dataset = SequenceDataset([seq, targets], batch_size=None, 45 | number_batches=20) 46 | 47 | model = MetaRNN(n_in=n_in, n_hidden=n_hidden, n_out=n_out, 48 | activation='tanh') 49 | 50 | opt = hf_optimizer(p=model.rnn.params, inputs=[model.x, model.y], 51 | s=model.rnn.y_pred, 52 | costs=[model.rnn.loss(model.y)], h=model.rnn.h) 53 | 54 | opt.train(gradient_dataset, cg_dataset, num_updates=n_updates) 55 | 56 | plt.close('all') 57 | fig = plt.figure() 58 | ax1 = plt.subplot(211) 59 | plt.plot(seq[0]) 60 | ax1.set_title('input') 61 | ax2 = plt.subplot(212) 62 | true_targets = plt.plot(targets[0]) 63 | 64 | guess = model.predict(seq[0]) 65 | guessed_targets = plt.plot(guess, linestyle='--') 66 | for i, x in enumerate(guessed_targets): 67 | x.set_color(true_targets[i].get_color()) 68 | ax2.set_title('solid: true output, dashed: model output') 69 | 70 | 71 | def test_binary(multiple_out=False, n_updates=250): 72 | """ Test RNN with binary outputs. 
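Targets encode simple comparisons between lagged input dimensions (see the comments on the target assignments below).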
""" 73 | n_hidden = 10 74 | n_in = 5 75 | if multiple_out: 76 | n_out = 2 77 | else: 78 | n_out = 1 79 | n_steps = 10 80 | n_seq = 100 81 | 82 | np.random.seed(0) 83 | # simple lag test 84 | seq = np.random.randn(n_seq, n_steps, n_in) 85 | targets = np.zeros((n_seq, n_steps, n_out), dtype='int32') 86 | 87 | # whether lag 1 (dim 3) is greater than lag 2 (dim 0) 88 | targets[:, 2:, 0] = np.cast[np.int32](seq[:, 1:-1, 3] > seq[:, :-2, 0]) 89 | 90 | if multiple_out: 91 | # whether product of lag 1 (dim 4) and lag 1 (dim 2) 92 | # is less than lag 2 (dim 0) 93 | targets[:, 2:, 1] = np.cast[np.int32]( 94 | (seq[:, 1:-1, 4] * seq[:, 1:-1, 2]) > seq[:, :-2, 0]) 95 | 96 | # SequenceDataset wants a list of sequences 97 | # this allows them to be different lengths, but here they're not 98 | seq = [i for i in seq] 99 | targets = [i for i in targets] 100 | 101 | gradient_dataset = SequenceDataset([seq, targets], batch_size=None, 102 | number_batches=500) 103 | cg_dataset = SequenceDataset([seq, targets], batch_size=None, 104 | number_batches=100) 105 | 106 | model = MetaRNN(n_in=n_in, n_hidden=n_hidden, n_out=n_out, 107 | activation='tanh', output_type='binary') 108 | 109 | # optimizes negative log likelihood 110 | # but also reports zero-one error 111 | opt = hf_optimizer(p=model.rnn.params, inputs=[model.x, model.y], 112 | s=model.rnn.y_pred, 113 | costs=[model.rnn.loss(model.y), 114 | model.rnn.errors(model.y)], h=model.rnn.h) 115 | 116 | # using settings of initial_lambda and mu given in Nicolas' RNN example 117 | # seem to do a little worse than the default 118 | opt.train(gradient_dataset, cg_dataset, num_updates=n_updates) 119 | 120 | seqs = xrange(10) 121 | 122 | plt.close('all') 123 | for seq_num in seqs: 124 | fig = plt.figure() 125 | ax1 = plt.subplot(211) 126 | plt.plot(seq[seq_num]) 127 | ax1.set_title('input') 128 | ax2 = plt.subplot(212) 129 | true_targets = plt.step(xrange(n_steps), targets[seq_num], marker='o') 130 | 131 | guess = model.predict_proba(seq[seq_num]) 132 | guessed_targets = plt.step(xrange(n_steps), guess) 133 | plt.setp(guessed_targets, linestyle='--', marker='d') 134 | for i, x in enumerate(guessed_targets): 135 | x.set_color(true_targets[i].get_color()) 136 | ax2.set_ylim((-0.1, 1.1)) 137 | ax2.set_title('solid: true output, dashed: model output (prob)') 138 | 139 | 140 | def test_softmax(n_updates=250): 141 | """ Test RNN with softmax outputs. 
""" 142 | n_hidden = 10 143 | n_in = 5 144 | n_steps = 10 145 | n_seq = 100 146 | n_classes = 3 147 | n_out = n_classes # restricted to single softmax per time step 148 | 149 | np.random.seed(0) 150 | # simple lag test 151 | seq = np.random.randn(n_seq, n_steps, n_in) 152 | targets = np.zeros((n_seq, n_steps), dtype='int32') 153 | 154 | thresh = 0.5 155 | # if lag 1 (dim 3) is greater than lag 2 (dim 0) + thresh 156 | # class 1 157 | # if lag 1 (dim 3) is less than lag 2 (dim 0) - thresh 158 | # class 2 159 | # if lag 2(dim0) - thresh <= lag 1 (dim 3) <= lag2(dim0) + thresh 160 | # class 0 161 | targets[:, 2:][seq[:, 1:-1, 3] > seq[:, :-2, 0] + thresh] = 1 162 | targets[:, 2:][seq[:, 1:-1, 3] < seq[:, :-2, 0] - thresh] = 2 163 | #targets[:, 2:, 0] = np.cast[np.int](seq[:, 1:-1, 3] > seq[:, :-2, 0]) 164 | 165 | # SequenceDataset wants a list of sequences 166 | # this allows them to be different lengths, but here they're not 167 | seq = [i for i in seq] 168 | targets = [i for i in targets] 169 | 170 | gradient_dataset = SequenceDataset([seq, targets], batch_size=None, 171 | number_batches=500) 172 | cg_dataset = SequenceDataset([seq, targets], batch_size=None, 173 | number_batches=100) 174 | 175 | model = MetaRNN(n_in=n_in, n_hidden=n_hidden, n_out=n_out, 176 | activation='tanh', output_type='softmax', 177 | use_symbolic_softmax=True) 178 | 179 | # optimizes negative log likelihood 180 | # but also reports zero-one error 181 | opt = hf_optimizer(p=model.rnn.params, inputs=[model.x, model.y], 182 | s=model.rnn.y_pred, 183 | costs=[model.rnn.loss(model.y), 184 | model.rnn.errors(model.y)], h=model.rnn.h) 185 | 186 | # using settings of initial_lambda and mu given in Nicolas' RNN example 187 | # seem to do a little worse than the default 188 | opt.train(gradient_dataset, cg_dataset, num_updates=n_updates) 189 | 190 | seqs = xrange(10) 191 | 192 | plt.close('all') 193 | for seq_num in seqs: 194 | fig = plt.figure() 195 | ax1 = plt.subplot(211) 196 | plt.plot(seq[seq_num]) 197 | ax1.set_title('input') 198 | 199 | ax2 = plt.subplot(212) 200 | # blue line will represent true classes 201 | true_targets = plt.step(xrange(n_steps), targets[seq_num], marker='o') 202 | 203 | # show probabilities (in b/w) output by model 204 | guess = model.predict_proba(seq[seq_num]) 205 | guessed_probs = plt.imshow(guess.T, interpolation='nearest', 206 | cmap='gray') 207 | ax2.set_title('blue: true class, grayscale: probs assigned by model') 208 | 209 | 210 | if __name__ == "__main__": 211 | logging.basicConfig(level=logging.INFO) 212 | #test_real(n_updates=20) 213 | #test_binary(multiple_out=True, n_updates=20) 214 | test_softmax(n_updates=20) 215 | -------------------------------------------------------------------------------- /rnn.py: -------------------------------------------------------------------------------- 1 | """ Vanilla RNN 2 | 3 | @author Graham Taylor 4 | """ 5 | import numpy as np 6 | import theano 7 | import theano.tensor as T 8 | from sklearn.base import BaseEstimator 9 | import logging 10 | import time 11 | import os 12 | import datetime 13 | import cPickle as pickle 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | import matplotlib.pyplot as plt 18 | plt.ion() 19 | 20 | mode = theano.Mode(linker='cvm') 21 | #mode = 'DEBUG_MODE' 22 | 23 | 24 | class RNN(object): 25 | """ Recurrent neural network class 26 | 27 | Supported output types: 28 | real : linear output units, use mean-squared error 29 | binary : binary output units, use cross-entropy error 30 | softmax : single softmax out, use 
cross-entropy error 31 | """ 32 | def __init__(self, input, n_in, n_hidden, n_out, activation=T.tanh, 33 | output_type='real', use_symbolic_softmax=False): 34 | 35 | self.input = input 36 | self.activation = activation 37 | self.output_type = output_type 38 | 39 | # when using HF, SoftmaxGrad.grad is not implemented 40 | # use a symbolic softmax which is slightly slower than T.nnet.softmax 41 | # See: http://groups.google.com/group/theano-dev/browse_thread/ 42 | # thread/3930bd5a6a67d27a 43 | if use_symbolic_softmax: 44 | def symbolic_softmax(x): 45 | e = T.exp(x) 46 | return e / T.sum(e, axis=1).dimshuffle(0, 'x') 47 | self.softmax = symbolic_softmax 48 | else: 49 | self.softmax = T.nnet.softmax 50 | 51 | # recurrent weights as a shared variable 52 | W_init = np.asarray(np.random.uniform(size=(n_hidden, n_hidden), 53 | low=-.01, high=.01), 54 | dtype=theano.config.floatX) 55 | self.W = theano.shared(value=W_init, name='W') 56 | # input to hidden layer weights 57 | W_in_init = np.asarray(np.random.uniform(size=(n_in, n_hidden), 58 | low=-.01, high=.01), 59 | dtype=theano.config.floatX) 60 | self.W_in = theano.shared(value=W_in_init, name='W_in') 61 | 62 | # hidden to output layer weights 63 | W_out_init = np.asarray(np.random.uniform(size=(n_hidden, n_out), 64 | low=-.01, high=.01), 65 | dtype=theano.config.floatX) 66 | self.W_out = theano.shared(value=W_out_init, name='W_out') 67 | 68 | h0_init = np.zeros((n_hidden,), dtype=theano.config.floatX) 69 | self.h0 = theano.shared(value=h0_init, name='h0') 70 | 71 | bh_init = np.zeros((n_hidden,), dtype=theano.config.floatX) 72 | self.bh = theano.shared(value=bh_init, name='bh') 73 | 74 | by_init = np.zeros((n_out,), dtype=theano.config.floatX) 75 | self.by = theano.shared(value=by_init, name='by') 76 | 77 | self.params = [self.W, self.W_in, self.W_out, self.h0, 78 | self.bh, self.by] 79 | 80 | # for every parameter, we maintain it's last update 81 | # the idea here is to use "momentum" 82 | # keep moving mostly in the same direction 83 | self.updates = {} 84 | for param in self.params: 85 | init = np.zeros(param.get_value(borrow=True).shape, 86 | dtype=theano.config.floatX) 87 | self.updates[param] = theano.shared(init) 88 | 89 | # recurrent function (using tanh activation function) and linear output 90 | # activation function 91 | def step(x_t, h_tm1): 92 | h_t = self.activation(T.dot(x_t, self.W_in) + \ 93 | T.dot(h_tm1, self.W) + self.bh) 94 | y_t = T.dot(h_t, self.W_out) + self.by 95 | return h_t, y_t 96 | 97 | # the hidden state `h` for the entire sequence, and the output for the 98 | # entire sequence `y` (first dimension is always time) 99 | [self.h, self.y_pred], _ = theano.scan(step, 100 | sequences=self.input, 101 | outputs_info=[self.h0, None]) 102 | 103 | # L1 norm ; one regularization option is to enforce L1 norm to 104 | # be small 105 | self.L1 = 0 106 | self.L1 += abs(self.W.sum()) 107 | self.L1 += abs(self.W_in.sum()) 108 | self.L1 += abs(self.W_out.sum()) 109 | 110 | # square of L2 norm ; one regularization option is to enforce 111 | # square of L2 norm to be small 112 | self.L2_sqr = 0 113 | self.L2_sqr += (self.W ** 2).sum() 114 | self.L2_sqr += (self.W_in ** 2).sum() 115 | self.L2_sqr += (self.W_out ** 2).sum() 116 | 117 | if self.output_type == 'real': 118 | self.loss = lambda y: self.mse(y) 119 | elif self.output_type == 'binary': 120 | # push through sigmoid 121 | self.p_y_given_x = T.nnet.sigmoid(self.y_pred) # apply sigmoid 122 | self.y_out = T.round(self.p_y_given_x) # round to {0,1} 123 | self.loss = lambda y: 
self.nll_binary(y) 124 | elif self.output_type == 'softmax': 125 | # push through softmax, computing vector of class-membership 126 | # probabilities in symbolic form 127 | self.p_y_given_x = self.softmax(self.y_pred) 128 | 129 | # compute prediction as class whose probability is maximal 130 | self.y_out = T.argmax(self.p_y_given_x, axis=-1) 131 | self.loss = lambda y: self.nll_multiclass(y) 132 | else: 133 | raise NotImplementedError 134 | 135 | def mse(self, y): 136 | # error between output and target 137 | return T.mean((self.y_pred - y) ** 2) 138 | 139 | def nll_binary(self, y): 140 | # negative log likelihood based on binary cross entropy error 141 | return T.mean(T.nnet.binary_crossentropy(self.p_y_given_x, y)) 142 | 143 | def nll_multiclass(self, y): 144 | # negative log likelihood based on multiclass cross entropy error 145 | # y.shape[0] is (symbolically) the number of rows in y, i.e., 146 | # number of time steps (call it T) in the sequence 147 | # T.arange(y.shape[0]) is a symbolic vector which will contain 148 | # [0,1,2,... n-1] T.log(self.p_y_given_x) is a matrix of 149 | # Log-Probabilities (call it LP) with one row per example and 150 | # one column per class LP[T.arange(y.shape[0]),y] is a vector 151 | # v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., 152 | # LP[n-1,y[n-1]]] and T.mean(LP[T.arange(y.shape[0]),y]) is 153 | # the mean (across minibatch examples) of the elements in v, 154 | # i.e., the mean log-likelihood across the minibatch. 155 | return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y]) 156 | 157 | def errors(self, y): 158 | """Return a float representing the number of errors in the sequence 159 | over the total number of examples in the sequence ; zero one 160 | loss over the size of the sequence 161 | 162 | :type y: theano.tensor.TensorType 163 | :param y: corresponds to a vector that gives for each example the 164 | correct label 165 | """ 166 | # check if y has same dimension of y_pred 167 | if y.ndim != self.y_out.ndim: 168 | raise TypeError('y should have the same shape as self.y_out', 169 | ('y', y.type, 'y_out', self.y_out.type)) 170 | 171 | if self.output_type in ('binary', 'softmax'): 172 | # check if y is of the correct datatype 173 | if y.dtype.startswith('int'): 174 | # the T.neq operator returns a vector of 0s and 1s, where 1 175 | # represents a mistake in prediction 176 | return T.mean(T.neq(self.y_out, y)) 177 | else: 178 | raise NotImplementedError() 179 | 180 | 181 | class MetaRNN(BaseEstimator): 182 | def __init__(self, n_in=5, n_hidden=50, n_out=5, learning_rate=0.01, 183 | n_epochs=100, L1_reg=0.00, L2_reg=0.00, learning_rate_decay=1, 184 | activation='tanh', output_type='real', 185 | final_momentum=0.9, initial_momentum=0.5, 186 | momentum_switchover=5, 187 | use_symbolic_softmax=False): 188 | self.n_in = int(n_in) 189 | self.n_hidden = int(n_hidden) 190 | self.n_out = int(n_out) 191 | self.learning_rate = float(learning_rate) 192 | self.learning_rate_decay = float(learning_rate_decay) 193 | self.n_epochs = int(n_epochs) 194 | self.L1_reg = float(L1_reg) 195 | self.L2_reg = float(L2_reg) 196 | self.activation = activation 197 | self.output_type = output_type 198 | self.initial_momentum = float(initial_momentum) 199 | self.final_momentum = float(final_momentum) 200 | self.momentum_switchover = int(momentum_switchover) 201 | self.use_symbolic_softmax = use_symbolic_softmax 202 | 203 | self.ready() 204 | 205 | def ready(self): 206 | # input (where first dimension is time) 207 | self.x = T.matrix() 208 | # target (where 
first dimension is time) 209 | if self.output_type == 'real': 210 | self.y = T.matrix(name='y', dtype=theano.config.floatX) 211 | elif self.output_type == 'binary': 212 | self.y = T.matrix(name='y', dtype='int32') 213 | elif self.output_type == 'softmax': # only vector labels supported 214 | self.y = T.vector(name='y', dtype='int32') 215 | else: 216 | raise NotImplementedError 217 | # initial hidden state of the RNN 218 | self.h0 = T.vector() 219 | # learning rate 220 | self.lr = T.scalar() 221 | 222 | if self.activation == 'tanh': 223 | activation = T.tanh 224 | elif self.activation == 'sigmoid': 225 | activation = T.nnet.sigmoid 226 | elif self.activation == 'relu': 227 | activation = lambda x: x * (x > 0) 228 | elif self.activation == 'cappedrelu': 229 | activation = lambda x: T.minimum(x * (x > 0), 6) 230 | else: 231 | raise NotImplementedError 232 | 233 | self.rnn = RNN(input=self.x, n_in=self.n_in, 234 | n_hidden=self.n_hidden, n_out=self.n_out, 235 | activation=activation, output_type=self.output_type, 236 | use_symbolic_softmax=self.use_symbolic_softmax) 237 | 238 | if self.output_type == 'real': 239 | self.predict = theano.function(inputs=[self.x, ], 240 | outputs=self.rnn.y_pred, 241 | mode=mode) 242 | elif self.output_type == 'binary': 243 | self.predict_proba = theano.function(inputs=[self.x, ], 244 | outputs=self.rnn.p_y_given_x, mode=mode) 245 | self.predict = theano.function(inputs=[self.x, ], 246 | outputs=T.round(self.rnn.p_y_given_x), 247 | mode=mode) 248 | elif self.output_type == 'softmax': 249 | self.predict_proba = theano.function(inputs=[self.x, ], 250 | outputs=self.rnn.p_y_given_x, mode=mode) 251 | self.predict = theano.function(inputs=[self.x, ], 252 | outputs=self.rnn.y_out, mode=mode) 253 | else: 254 | raise NotImplementedError 255 | 256 | def shared_dataset(self, data_xy): 257 | """ Load the dataset into shared variables """ 258 | 259 | data_x, data_y = data_xy 260 | shared_x = theano.shared(np.asarray(data_x, 261 | dtype=theano.config.floatX)) 262 | 263 | shared_y = theano.shared(np.asarray(data_y, 264 | dtype=theano.config.floatX)) 265 | 266 | if self.output_type in ('binary', 'softmax'): 267 | return shared_x, T.cast(shared_y, 'int32') 268 | else: 269 | return shared_x, shared_y 270 | 271 | def __getstate__(self): 272 | """ Return state sequence.""" 273 | params = self._get_params() # parameters set in constructor 274 | weights = [p.get_value() for p in self.rnn.params] 275 | state = (params, weights) 276 | return state 277 | 278 | def _set_weights(self, weights): 279 | """ Set fittable parameters from weights sequence. 280 | 281 | Parameters must be in the order defined by self.params: 282 | W, W_in, W_out, h0, bh, by 283 | """ 284 | i = iter(weights) 285 | 286 | for param in self.rnn.params: 287 | param.set_value(i.next()) 288 | 289 | def __setstate__(self, state): 290 | """ Set parameters from state sequence. 291 | 292 | Parameters must be in the order defined by self.params: 293 | W, W_in, W_out, h0, bh, by 294 | """ 295 | params, weights = state 296 | self.set_params(**params) 297 | self.ready() 298 | self._set_weights(weights) 299 | 300 | def save(self, fpath='.', fname=None): 301 | """ Save a pickled representation of Model state. 
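If fpath ends in '.pkl' it is treated as the full path to the pickle file; otherwise a timestamped filename (or the supplied fname) is used inside the directory fpath.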
""" 302 | fpathstart, fpathext = os.path.splitext(fpath) 303 | if fpathext == '.pkl': 304 | # User supplied an absolute path to a pickle file 305 | fpath, fname = os.path.split(fpath) 306 | 307 | elif fname is None: 308 | # Generate filename based on date 309 | date_obj = datetime.datetime.now() 310 | date_str = date_obj.strftime('%Y-%m-%d-%H:%M:%S') 311 | class_name = self.__class__.__name__ 312 | fname = '%s.%s.pkl' % (class_name, date_str) 313 | 314 | fabspath = os.path.join(fpath, fname) 315 | 316 | logger.info("Saving to %s ..." % fabspath) 317 | file = open(fabspath, 'wb') 318 | state = self.__getstate__() 319 | pickle.dump(state, file, protocol=pickle.HIGHEST_PROTOCOL) 320 | file.close() 321 | 322 | def load(self, path): 323 | """ Load model parameters from path. """ 324 | logger.info("Loading from %s ..." % path) 325 | file = open(path, 'rb') 326 | state = pickle.load(file) 327 | self.__setstate__(state) 328 | file.close() 329 | 330 | def fit(self, X_train, Y_train, X_test=None, Y_test=None, 331 | validation_frequency=100): 332 | """ Fit model 333 | 334 | Pass in X_test, Y_test to compute test error and report during 335 | training. 336 | 337 | X_train : ndarray (n_seq x n_steps x n_in) 338 | Y_train : ndarray (n_seq x n_steps x n_out) 339 | 340 | validation_frequency : int 341 | in terms of number of sequences (or number of weight updates) 342 | """ 343 | if X_test is not None: 344 | assert(Y_test is not None) 345 | self.interactive = True 346 | test_set_x, test_set_y = self.shared_dataset((X_test, Y_test)) 347 | else: 348 | self.interactive = False 349 | 350 | train_set_x, train_set_y = self.shared_dataset((X_train, Y_train)) 351 | 352 | n_train = train_set_x.get_value(borrow=True).shape[0] 353 | if self.interactive: 354 | n_test = test_set_x.get_value(borrow=True).shape[0] 355 | 356 | ###################### 357 | # BUILD ACTUAL MODEL # 358 | ###################### 359 | logger.info('... 
building the model') 360 | 361 | index = T.lscalar('index') # index to a case 362 | # learning rate (may change) 363 | l_r = T.scalar('l_r', dtype=theano.config.floatX) 364 | mom = T.scalar('mom', dtype=theano.config.floatX) # momentum 365 | 366 | cost = self.rnn.loss(self.y) \ 367 | + self.L1_reg * self.rnn.L1 \ 368 | + self.L2_reg * self.rnn.L2_sqr 369 | 370 | compute_train_error = theano.function(inputs=[index, ], 371 | outputs=self.rnn.loss(self.y), 372 | givens={ 373 | self.x: train_set_x[index], 374 | self.y: train_set_y[index]}, 375 | mode=mode) 376 | 377 | if self.interactive: 378 | compute_test_error = theano.function(inputs=[index, ], 379 | outputs=self.rnn.loss(self.y), 380 | givens={ 381 | self.x: test_set_x[index], 382 | self.y: test_set_y[index]}, 383 | mode=mode) 384 | 385 | # compute the gradient of cost with respect to theta = (W, W_in, W_out) 386 | # gradients on the weights using BPTT 387 | gparams = [] 388 | for param in self.rnn.params: 389 | gparam = T.grad(cost, param) 390 | gparams.append(gparam) 391 | 392 | updates = {} 393 | for param, gparam in zip(self.rnn.params, gparams): 394 | weight_update = self.rnn.updates[param] 395 | upd = mom * weight_update - l_r * gparam 396 | updates[weight_update] = upd 397 | updates[param] = param + upd 398 | 399 | # compiling a Theano function `train_model` that returns the 400 | # cost, but in the same time updates the parameter of the 401 | # model based on the rules defined in `updates` 402 | train_model = theano.function(inputs=[index, l_r, mom], 403 | outputs=cost, 404 | updates=updates, 405 | givens={ 406 | self.x: train_set_x[index], 407 | self.y: train_set_y[index]}, 408 | mode=mode) 409 | 410 | ############### 411 | # TRAIN MODEL # 412 | ############### 413 | logger.info('... training') 414 | epoch = 0 415 | 416 | while (epoch < self.n_epochs): 417 | epoch = epoch + 1 418 | for idx in xrange(n_train): 419 | effective_momentum = self.final_momentum \ 420 | if epoch > self.momentum_switchover \ 421 | else self.initial_momentum 422 | example_cost = train_model(idx, self.learning_rate, 423 | effective_momentum) 424 | 425 | # iteration number (how many weight updates have we made?) 426 | # epoch is 1-based, index is 0 based 427 | iter = (epoch - 1) * n_train + idx + 1 428 | 429 | if iter % validation_frequency == 0: 430 | # compute loss on training set 431 | train_losses = [compute_train_error(i) 432 | for i in xrange(n_train)] 433 | this_train_loss = np.mean(train_losses) 434 | 435 | if self.interactive: 436 | test_losses = [compute_test_error(i) 437 | for i in xrange(n_test)] 438 | this_test_loss = np.mean(test_losses) 439 | 440 | logger.info('epoch %i, seq %i/%i, tr loss %f ' 441 | 'te loss %f lr: %f' % \ 442 | (epoch, idx + 1, n_train, 443 | this_train_loss, this_test_loss, self.learning_rate)) 444 | else: 445 | logger.info('epoch %i, seq %i/%i, train loss %f ' 446 | 'lr: %f' % \ 447 | (epoch, idx + 1, n_train, this_train_loss, 448 | self.learning_rate)) 449 | 450 | self.learning_rate *= self.learning_rate_decay 451 | 452 | 453 | def test_real(): 454 | """ Test RNN with real-valued outputs. 
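Targets are delayed copies of selected input dimensions plus a small amount of noise (a simple lag task).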
""" 455 | n_hidden = 10 456 | n_in = 5 457 | n_out = 3 458 | n_steps = 10 459 | n_seq = 100 460 | 461 | np.random.seed(0) 462 | # simple lag test 463 | seq = np.random.randn(n_seq, n_steps, n_in) 464 | targets = np.zeros((n_seq, n_steps, n_out)) 465 | 466 | targets[:, 1:, 0] = seq[:, :-1, 3] # delayed 1 467 | targets[:, 1:, 1] = seq[:, :-1, 2] # delayed 1 468 | targets[:, 2:, 2] = seq[:, :-2, 0] # delayed 2 469 | 470 | targets += 0.01 * np.random.standard_normal(targets.shape) 471 | 472 | model = MetaRNN(n_in=n_in, n_hidden=n_hidden, n_out=n_out, 473 | learning_rate=0.001, learning_rate_decay=0.999, 474 | n_epochs=400, activation='tanh') 475 | 476 | model.fit(seq, targets, validation_frequency=1000) 477 | 478 | plt.close('all') 479 | fig = plt.figure() 480 | ax1 = plt.subplot(211) 481 | plt.plot(seq[0]) 482 | ax1.set_title('input') 483 | 484 | ax2 = plt.subplot(212) 485 | true_targets = plt.plot(targets[0]) 486 | 487 | guess = model.predict(seq[0]) 488 | guessed_targets = plt.plot(guess, linestyle='--') 489 | for i, x in enumerate(guessed_targets): 490 | x.set_color(true_targets[i].get_color()) 491 | ax2.set_title('solid: true output, dashed: model output') 492 | 493 | 494 | def test_binary(multiple_out=False, n_epochs=250): 495 | """ Test RNN with binary outputs. """ 496 | n_hidden = 10 497 | n_in = 5 498 | if multiple_out: 499 | n_out = 2 500 | else: 501 | n_out = 1 502 | n_steps = 10 503 | n_seq = 100 504 | 505 | np.random.seed(0) 506 | # simple lag test 507 | seq = np.random.randn(n_seq, n_steps, n_in) 508 | targets = np.zeros((n_seq, n_steps, n_out)) 509 | 510 | # whether lag 1 (dim 3) is greater than lag 2 (dim 0) 511 | targets[:, 2:, 0] = np.cast[np.int](seq[:, 1:-1, 3] > seq[:, :-2, 0]) 512 | 513 | if multiple_out: 514 | # whether product of lag 1 (dim 4) and lag 1 (dim 2) 515 | # is less than lag 2 (dim 0) 516 | targets[:, 2:, 1] = np.cast[np.int]( 517 | (seq[:, 1:-1, 4] * seq[:, 1:-1, 2]) > seq[:, :-2, 0]) 518 | 519 | model = MetaRNN(n_in=n_in, n_hidden=n_hidden, n_out=n_out, 520 | learning_rate=0.001, learning_rate_decay=0.999, 521 | n_epochs=n_epochs, activation='tanh', output_type='binary') 522 | 523 | model.fit(seq, targets, validation_frequency=1000) 524 | 525 | seqs = xrange(10) 526 | 527 | plt.close('all') 528 | for seq_num in seqs: 529 | fig = plt.figure() 530 | ax1 = plt.subplot(211) 531 | plt.plot(seq[seq_num]) 532 | ax1.set_title('input') 533 | ax2 = plt.subplot(212) 534 | true_targets = plt.step(xrange(n_steps), targets[seq_num], marker='o') 535 | 536 | guess = model.predict_proba(seq[seq_num]) 537 | guessed_targets = plt.step(xrange(n_steps), guess) 538 | plt.setp(guessed_targets, linestyle='--', marker='d') 539 | for i, x in enumerate(guessed_targets): 540 | x.set_color(true_targets[i].get_color()) 541 | ax2.set_ylim((-0.1, 1.1)) 542 | ax2.set_title('solid: true output, dashed: model output (prob)') 543 | 544 | 545 | def test_softmax(n_epochs=250): 546 | """ Test RNN with softmax outputs. 
""" 547 | n_hidden = 10 548 | n_in = 5 549 | n_steps = 10 550 | n_seq = 100 551 | n_classes = 3 552 | n_out = n_classes # restricted to single softmax per time step 553 | 554 | np.random.seed(0) 555 | # simple lag test 556 | seq = np.random.randn(n_seq, n_steps, n_in) 557 | targets = np.zeros((n_seq, n_steps), dtype=np.int) 558 | 559 | thresh = 0.5 560 | # if lag 1 (dim 3) is greater than lag 2 (dim 0) + thresh 561 | # class 1 562 | # if lag 1 (dim 3) is less than lag 2 (dim 0) - thresh 563 | # class 2 564 | # if lag 2(dim0) - thresh <= lag 1 (dim 3) <= lag2(dim0) + thresh 565 | # class 0 566 | targets[:, 2:][seq[:, 1:-1, 3] > seq[:, :-2, 0] + thresh] = 1 567 | targets[:, 2:][seq[:, 1:-1, 3] < seq[:, :-2, 0] - thresh] = 2 568 | #targets[:, 2:, 0] = np.cast[np.int](seq[:, 1:-1, 3] > seq[:, :-2, 0]) 569 | 570 | model = MetaRNN(n_in=n_in, n_hidden=n_hidden, n_out=n_out, 571 | learning_rate=0.001, learning_rate_decay=0.999, 572 | n_epochs=n_epochs, activation='tanh', 573 | output_type='softmax', use_symbolic_softmax=False) 574 | 575 | model.fit(seq, targets, validation_frequency=1000) 576 | 577 | seqs = xrange(10) 578 | 579 | plt.close('all') 580 | for seq_num in seqs: 581 | fig = plt.figure() 582 | ax1 = plt.subplot(211) 583 | plt.plot(seq[seq_num]) 584 | ax1.set_title('input') 585 | ax2 = plt.subplot(212) 586 | 587 | # blue line will represent true classes 588 | true_targets = plt.step(xrange(n_steps), targets[seq_num], marker='o') 589 | 590 | # show probabilities (in b/w) output by model 591 | guess = model.predict_proba(seq[seq_num]) 592 | guessed_probs = plt.imshow(guess.T, interpolation='nearest', 593 | cmap='gray') 594 | ax2.set_title('blue: true class, grayscale: probs assigned by model') 595 | 596 | 597 | if __name__ == "__main__": 598 | logging.basicConfig(level=logging.INFO) 599 | t0 = time.time() 600 | test_real() 601 | # problem takes more epochs to solve 602 | #test_binary(multiple_out=True, n_epochs=2400) 603 | #test_softmax(n_epochs=250) 604 | print "Elapsed time: %f" % (time.time() - t0) 605 | -------------------------------------------------------------------------------- /rnn_minibatch.py: -------------------------------------------------------------------------------- 1 | """ Vanilla RNN 2 | Parallelizes scan over sequences by using mini-batches. 
3 | 4 | @author Graham Taylor 5 | """ 6 | import numpy as np 7 | import theano 8 | import theano.tensor as T 9 | from sklearn.base import BaseEstimator 10 | import logging 11 | import time 12 | import os 13 | import datetime 14 | import cPickle as pickle 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | import matplotlib.pyplot as plt 19 | plt.ion() 20 | 21 | mode = theano.Mode(linker='cvm') 22 | #mode = 'DEBUG_MODE' 23 | 24 | 25 | class RNN(object): 26 | """ Recurrent neural network class 27 | 28 | Supported output types: 29 | real : linear output units, use mean-squared error 30 | binary : binary output units, use cross-entropy error 31 | softmax : single softmax out, use cross-entropy error 32 | 33 | """ 34 | def __init__(self, input, n_in, n_hidden, n_out, activation=T.tanh, 35 | output_type='real'): 36 | 37 | self.input = input 38 | self.activation = activation 39 | self.output_type = output_type 40 | 41 | self.batch_size = T.iscalar() 42 | 43 | # theta is a vector of all trainable parameters 44 | # it represents the value of W, W_in, W_out, h0, bh, by 45 | theta_shape = n_hidden ** 2 + n_in * n_hidden + n_hidden * n_out + \ 46 | n_hidden + n_hidden + n_out 47 | self.theta = theano.shared(value=np.zeros(theta_shape, 48 | dtype=theano.config.floatX)) 49 | 50 | # Parameters are reshaped views of theta 51 | param_idx = 0 # pointer to somewhere along parameter vector 52 | 53 | # recurrent weights as a shared variable 54 | self.W = self.theta[param_idx:(param_idx + n_hidden ** 2)].reshape( 55 | (n_hidden, n_hidden)) 56 | self.W.name = 'W' 57 | W_init = np.asarray(np.random.uniform(size=(n_hidden, n_hidden), 58 | low=-0.01, high=0.01), 59 | dtype=theano.config.floatX) 60 | param_idx += n_hidden ** 2 61 | 62 | # input to hidden layer weights 63 | self.W_in = self.theta[param_idx:(param_idx + n_in * \ 64 | n_hidden)].reshape((n_in, n_hidden)) 65 | self.W_in.name = 'W_in' 66 | W_in_init = np.asarray(np.random.uniform(size=(n_in, n_hidden), 67 | low=-0.01, high=0.01), 68 | dtype=theano.config.floatX) 69 | param_idx += n_in * n_hidden 70 | 71 | # hidden to output layer weights 72 | self.W_out = self.theta[param_idx:(param_idx + n_hidden * \ 73 | n_out)].reshape((n_hidden, n_out)) 74 | self.W_out.name = 'W_out' 75 | 76 | W_out_init = np.asarray(np.random.uniform(size=(n_hidden, n_out), 77 | low=-0.01, high=0.01), 78 | dtype=theano.config.floatX) 79 | param_idx += n_hidden * n_out 80 | 81 | self.h0 = self.theta[param_idx:(param_idx + n_hidden)] 82 | self.h0.name = 'h0' 83 | h0_init = np.zeros((n_hidden,), dtype=theano.config.floatX) 84 | param_idx += n_hidden 85 | 86 | self.bh = self.theta[param_idx:(param_idx + n_hidden)] 87 | self.bh.name = 'bh' 88 | bh_init = np.zeros((n_hidden,), dtype=theano.config.floatX) 89 | param_idx += n_hidden 90 | 91 | self.by = self.theta[param_idx:(param_idx + n_out)] 92 | self.by.name = 'by' 93 | by_init = np.zeros((n_out,), dtype=theano.config.floatX) 94 | param_idx += n_out 95 | 96 | assert(param_idx == theta_shape) 97 | 98 | # for convenience 99 | self.params = [self.W, self.W_in, self.W_out, self.h0, self.bh, 100 | self.by] 101 | 102 | # shortcut to norms (for monitoring) 103 | self.l2_norms = {} 104 | for param in self.params: 105 | self.l2_norms[param] = T.sqrt(T.sum(param ** 2)) 106 | 107 | # initialize parameters 108 | # DEBUG_MODE gives division by zero error when we leave parameters 109 | # as zeros 110 | self.theta.set_value(np.concatenate([x.ravel() for x in 111 | (W_init, W_in_init, W_out_init, h0_init, bh_init, by_init)])) 112 | 113 | 
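# theta_update stores the previous parameter update (the velocity term) so that the SGD branch of fit() can apply momentum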
self.theta_update = theano.shared( 114 | value=np.zeros(theta_shape, dtype=theano.config.floatX)) 115 | 116 | # recurrent function (using tanh activation function) and arbitrary output 117 | # activation function 118 | def step(x_t, h_tm1): 119 | h_t = self.activation(T.dot(x_t, self.W_in) + \ 120 | T.dot(h_tm1, self.W) + self.bh) 121 | y_t = T.dot(h_t, self.W_out) + self.by 122 | return h_t, y_t 123 | 124 | # the hidden state `h` for the entire sequence, and the output for the 125 | # entire sequence `y` (first dimension is always time) 126 | # Note the implementation of weight-sharing h0 across variable-size 127 | # batches using T.ones multiplying h0 128 | # Alternatively, T.alloc approach is more robust 129 | [self.h, self.y_pred], _ = theano.scan(step, 130 | sequences=self.input, 131 | outputs_info=[T.alloc(self.h0, self.input.shape[1], 132 | n_hidden), None]) 133 | # outputs_info=[T.ones(shape=(self.input.shape[1], 134 | # self.h0.shape[0])) * self.h0, None]) 135 | 136 | # L1 norm ; one regularization option is to enforce L1 norm to 137 | # be small 138 | self.L1 = 0 139 | self.L1 += abs(self.W.sum()) 140 | self.L1 += abs(self.W_in.sum()) 141 | self.L1 += abs(self.W_out.sum()) 142 | 143 | # square of L2 norm ; one regularization option is to enforce 144 | # square of L2 norm to be small 145 | self.L2_sqr = 0 146 | self.L2_sqr += (self.W ** 2).sum() 147 | self.L2_sqr += (self.W_in ** 2).sum() 148 | self.L2_sqr += (self.W_out ** 2).sum() 149 | 150 | if self.output_type == 'real': 151 | self.loss = lambda y: self.mse(y) 152 | elif self.output_type == 'binary': 153 | # push through sigmoid 154 | self.p_y_given_x = T.nnet.sigmoid(self.y_pred) # apply sigmoid 155 | self.y_out = T.round(self.p_y_given_x) # round to {0,1} 156 | self.loss = lambda y: self.nll_binary(y) 157 | elif self.output_type == 'softmax': 158 | # push through softmax, computing vector of class-membership 159 | # probabilities in symbolic form 160 | # 161 | # T.nnet.softmax will not operate on T.tensor3 types, only matrices 162 | # We take our n_steps x n_seq x n_classes output from the net 163 | # and reshape it into a (n_steps * n_seq) x n_classes matrix 164 | # apply softmax, then reshape back 165 | y_p = self.y_pred 166 | y_p_m = T.reshape(y_p, (y_p.shape[0] * y_p.shape[1], -1)) 167 | y_p_s = T.nnet.softmax(y_p_m) 168 | self.p_y_given_x = T.reshape(y_p_s, y_p.shape) 169 | 170 | # compute prediction as class whose probability is maximal 171 | self.y_out = T.argmax(self.p_y_given_x, axis=-1) 172 | self.loss = lambda y: self.nll_multiclass(y) 173 | 174 | else: 175 | raise NotImplementedError 176 | 177 | def mse(self, y): 178 | # error between output and target 179 | return T.mean((self.y_pred - y) ** 2) 180 | 181 | def nll_binary(self, y): 182 | # negative log likelihood based on binary cross entropy error 183 | return T.mean(T.nnet.binary_crossentropy(self.p_y_given_x, y)) 184 | 185 | def nll_multiclass(self, y): 186 | # negative log likelihood based on multiclass cross entropy error 187 | # 188 | # Theano's advanced indexing is limited 189 | # therefore we reshape our n_steps x n_seq x n_classes tensor3 of probs 190 | # to a (n_steps * n_seq) x n_classes matrix of probs 191 | # so that we can use advanced indexing (i.e. 
get the probs which 192 | # correspond to the true class) 193 | # the labels y also must be flattened when we do this to use the 194 | # advanced indexing 195 | p_y = self.p_y_given_x 196 | p_y_m = T.reshape(p_y, (p_y.shape[0] * p_y.shape[1], -1)) 197 | y_f = y.flatten(ndim=1) 198 | return -T.mean(T.log(p_y_m)[T.arange(p_y_m.shape[0]), y_f]) 199 | 200 | def errors(self, y): 201 | """Return a float representing the number of errors in the minibatch 202 | over the total number of examples of the minibatch ; zero one 203 | loss over the size of the minibatch 204 | 205 | :type y: theano.tensor.TensorType 206 | :param y: corresponds to a vector that gives for each example the 207 | correct label 208 | """ 209 | 210 | # check if y has same dimension of y_pred 211 | if y.ndim != self.y_out.ndim: 212 | raise TypeError('y should have the same shape as self.y_out', 213 | ('y', y.type, 'y_out', self.y_out.type)) 214 | # check if y is of the correct datatype 215 | if y.dtype.startswith('int'): 216 | # the T.neq operator returns a vector of 0s and 1s, where 1 217 | # represents a mistake in prediction 218 | return T.mean(T.neq(self.y_out, y)) 219 | else: 220 | raise NotImplementedError() 221 | 222 | 223 | class MetaRNN(BaseEstimator): 224 | def __init__(self, n_in=5, n_hidden=50, n_out=5, learning_rate=0.01, 225 | n_epochs=100, batch_size=100, L1_reg=0.00, L2_reg=0.00, 226 | learning_rate_decay=1, 227 | activation='tanh', output_type='real', final_momentum=0.9, 228 | initial_momentum=0.5, momentum_switchover=5, 229 | snapshot_every=None, snapshot_path='/tmp'): 230 | self.n_in = int(n_in) 231 | self.n_hidden = int(n_hidden) 232 | self.n_out = int(n_out) 233 | self.learning_rate = float(learning_rate) 234 | self.learning_rate_decay = float(learning_rate_decay) 235 | self.n_epochs = int(n_epochs) 236 | self.batch_size = int(batch_size) 237 | self.L1_reg = float(L1_reg) 238 | self.L2_reg = float(L2_reg) 239 | self.activation = activation 240 | self.output_type = output_type 241 | self.initial_momentum = float(initial_momentum) 242 | self.final_momentum = float(final_momentum) 243 | self.momentum_switchover = int(momentum_switchover) 244 | if snapshot_every is not None: 245 | self.snapshot_every = int(snapshot_every) 246 | else: 247 | self.snapshot_every = None 248 | self.snapshot_path = snapshot_path 249 | 250 | self.ready() 251 | 252 | def ready(self): 253 | # input (where first dimension is time) 254 | self.x = T.tensor3(name='x') 255 | # target (where first dimension is time) 256 | if self.output_type == 'real': 257 | self.y = T.tensor3(name='y', dtype=theano.config.floatX) 258 | elif self.output_type == 'binary': 259 | self.y = T.tensor3(name='y', dtype='int32') 260 | elif self.output_type == 'softmax': # now it is a matrix (T x n_seq) 261 | self.y = T.matrix(name='y', dtype='int32') 262 | else: 263 | raise NotImplementedError 264 | 265 | # learning rate 266 | self.lr = T.scalar() 267 | 268 | if self.activation == 'tanh': 269 | activation = T.tanh 270 | elif self.activation == 'sigmoid': 271 | activation = T.nnet.sigmoid 272 | elif self.activation == 'relu': 273 | activation = lambda x: x * (x > 0) 274 | elif self.activation == 'cappedrelu': 275 | activation = lambda x: T.minimum(x * (x > 0), 6) 276 | else: 277 | raise NotImplementedError 278 | 279 | self.rnn = RNN(input=self.x, n_in=self.n_in, 280 | n_hidden=self.n_hidden, n_out=self.n_out, 281 | activation=activation, output_type=self.output_type) 282 | 283 | if self.output_type == 'real': 284 | self.predict = theano.function(inputs=[self.x, ], 285 | 
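# compiles the symbolic graph into a callable mapping an input sequence tensor to real-valued predictions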
outputs=self.rnn.y_pred, 286 | mode=mode) 287 | elif self.output_type == 'binary': 288 | self.predict_proba = theano.function(inputs=[self.x, ], 289 | outputs=self.rnn.p_y_given_x, mode=mode) 290 | self.predict = theano.function(inputs=[self.x, ], 291 | outputs=T.round(self.rnn.p_y_given_x), 292 | mode=mode) 293 | elif self.output_type == 'softmax': 294 | self.predict_proba = theano.function(inputs=[self.x, ], 295 | outputs=self.rnn.p_y_given_x, mode=mode) 296 | self.predict = theano.function(inputs=[self.x, ], 297 | outputs=self.rnn.y_out, mode=mode) 298 | else: 299 | raise NotImplementedError 300 | 301 | def shared_dataset(self, data_xy, borrow=True): 302 | """ Load the dataset into shared variables """ 303 | 304 | data_x, data_y = data_xy 305 | shared_x = theano.shared(np.asarray(data_x, 306 | dtype=theano.config.floatX), 307 | borrow=True) 308 | 309 | shared_y = theano.shared(np.asarray(data_y, 310 | dtype=theano.config.floatX), 311 | borrow=True) 312 | 313 | if self.output_type in ('binary', 'softmax'): 314 | return shared_x, T.cast(shared_y, 'int32') 315 | else: 316 | return shared_x, shared_y 317 | 318 | def __getstate__(self): 319 | """ Return state sequence.""" 320 | params = self._get_params() # parameters set in constructor 321 | theta = self.rnn.theta.get_value() 322 | state = (params, theta) 323 | return state 324 | 325 | def _set_weights(self, theta): 326 | """ Set fittable parameters from weights sequence. 327 | """ 328 | self.rnn.theta.set_value(theta) 329 | 330 | def __setstate__(self, state): 331 | """ Set parameters from state sequence. 332 | """ 333 | params, theta = state 334 | self.set_params(**params) 335 | self.ready() 336 | self._set_weights(theta) 337 | 338 | def save(self, fpath='.', fname=None): 339 | """ Save a pickled representation of Model state. """ 340 | fpathstart, fpathext = os.path.splitext(fpath) 341 | if fpathext == '.pkl': 342 | # User supplied an absolute path to a pickle file 343 | fpath, fname = os.path.split(fpath) 344 | 345 | elif fname is None: 346 | # Generate filename based on date 347 | date_obj = datetime.datetime.now() 348 | date_str = date_obj.strftime('%Y-%m-%d-%H:%M:%S') 349 | class_name = self.__class__.__name__ 350 | fname = '%s.%s.pkl' % (class_name, date_str) 351 | 352 | fabspath = os.path.join(fpath, fname) 353 | 354 | logger.info("Saving to %s ..." % fabspath) 355 | file = open(fabspath, 'wb') 356 | state = self.__getstate__() 357 | pickle.dump(state, file, protocol=pickle.HIGHEST_PROTOCOL) 358 | file.close() 359 | 360 | def load(self, path): 361 | """ Load model parameters from path. """ 362 | logger.info("Loading from %s ..." % path) 363 | file = open(path, 'rb') 364 | state = pickle.load(file) 365 | self.__setstate__(state) 366 | file.close() 367 | 368 | def optional_output(self, train_set_x, show_norms=True, show_output=True): 369 | """ Produces some debugging output. 
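Logs the L2 norms of the individual parameter groups and/or the model output on the first training case, depending on the show_norms / show_output flags.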
""" 370 | if show_norms: 371 | norm_output = [] 372 | for param in self.rnn.params: 373 | norm_output.append('%s: %6.4f' % (param.name, 374 | self.get_norms[param]())) 375 | logger.info("norms: {" + ', '.join(norm_output) + "}") 376 | 377 | if show_output: 378 | # show output for a single case 379 | if self.output_type == 'binary': 380 | output_fn = self.predict_proba 381 | else: 382 | output_fn = self.predict 383 | logger.info("sample output: " + \ 384 | str(output_fn(train_set_x.get_value( 385 | borrow=True)[:, 0, :][:, np.newaxis, :]).flatten())) 386 | 387 | def fit(self, X_train, Y_train, X_test=None, Y_test=None, 388 | validate_every=100, optimizer='sgd', compute_zero_one=False, 389 | show_norms=True, show_output=True): 390 | """ Fit model 391 | 392 | Pass in X_test, Y_test to compute test error and report during 393 | training. 394 | 395 | X_train : ndarray (T x n_in) 396 | Y_train : ndarray (T x n_out) 397 | 398 | validation_frequency : int 399 | in terms of number of epochs 400 | 401 | optimizer : string 402 | Optimizer type. 403 | Possible values: 404 | 'sgd' : batch stochastic gradient descent 405 | 'cg' : nonlinear conjugate gradient algorithm 406 | (scipy.optimize.fmin_cg) 407 | 'bfgs' : quasi-Newton method of Broyden, Fletcher, Goldfarb, 408 | and Shanno (scipy.optimize.fmin_bfgs) 409 | 'l_bfgs_b' : Limited-memory BFGS (scipy.optimize.fmin_l_bfgs_b) 410 | 411 | compute_zero_one : bool 412 | in the case of binary output, compute zero-one error in addition to 413 | cross-entropy error 414 | show_norms : bool 415 | Show L2 norms of individual parameter groups while training. 416 | show_output : bool 417 | Show the model output on first training case while training. 418 | """ 419 | if X_test is not None: 420 | assert(Y_test is not None) 421 | self.interactive = True 422 | test_set_x, test_set_y = self.shared_dataset((X_test, Y_test)) 423 | else: 424 | self.interactive = False 425 | 426 | train_set_x, train_set_y = self.shared_dataset((X_train, Y_train)) 427 | 428 | if compute_zero_one: 429 | assert(self.output_type == 'binary' \ 430 | or self.output_type == 'softmax') 431 | # compute number of minibatches for training 432 | # note that cases are the second dimension, not the first 433 | n_train = train_set_x.get_value(borrow=True).shape[1] 434 | n_train_batches = int(np.ceil(1.0 * n_train / self.batch_size)) 435 | if self.interactive: 436 | n_test = test_set_x.get_value(borrow=True).shape[1] 437 | n_test_batches = int(np.ceil(1.0 * n_test / self.batch_size)) 438 | 439 | #validate_every is specified in terms of epochs 440 | validation_frequency = validate_every * n_train_batches 441 | 442 | ###################### 443 | # BUILD ACTUAL MODEL # 444 | ###################### 445 | logger.info('... 
building the model') 446 | 447 | index = T.lscalar('index') # index to a [mini]batch 448 | n_ex = T.lscalar('n_ex') # total number of examples 449 | # learning rate (may change) 450 | l_r = T.scalar('l_r', dtype=theano.config.floatX) 451 | mom = T.scalar('mom', dtype=theano.config.floatX) # momentum 452 | 453 | cost = self.rnn.loss(self.y) \ 454 | + self.L1_reg * self.rnn.L1 \ 455 | + self.L2_reg * self.rnn.L2_sqr 456 | 457 | # Proper implementation of variable-batch size evaluation 458 | # Note that classifier.errors() returns the mean error 459 | # But the last batch may be a smaller size 460 | # So we keep around the effective_batch_size (whose last element may 461 | # be smaller than the rest) 462 | # And weight the reported error by the batch_size when we average 463 | # Also, by keeping batch_start and batch_stop as symbolic variables, 464 | # we make the theano function easier to read 465 | batch_start = index * self.batch_size 466 | batch_stop = T.minimum(n_ex, (index + 1) * self.batch_size) 467 | effective_batch_size = batch_stop - batch_start 468 | 469 | get_batch_size = theano.function(inputs=[index, n_ex], 470 | outputs=effective_batch_size) 471 | 472 | compute_train_error = theano.function(inputs=[index, n_ex], 473 | outputs=self.rnn.loss(self.y), 474 | givens={self.x: train_set_x[:, batch_start:batch_stop], 475 | self.y: train_set_y[:, batch_start:batch_stop]}, 476 | mode=mode) 477 | 478 | if compute_zero_one: 479 | compute_train_zo = theano.function(inputs=[index, n_ex], 480 | outputs=self.rnn.errors(self.y), 481 | givens={self.x: train_set_x[:, batch_start:batch_stop], 482 | self.y: train_set_y[:, batch_start:batch_stop]}, 483 | mode=mode) 484 | 485 | if self.interactive: 486 | compute_test_error = theano.function(inputs=[index, n_ex], 487 | outputs=self.rnn.loss(self.y), 488 | givens={self.x: test_set_x[:, batch_start:batch_stop], 489 | self.y: test_set_y[:, batch_start:batch_stop]}, 490 | mode=mode) 491 | 492 | if compute_zero_one: 493 | compute_test_zo = theano.function(inputs=[index, n_ex], 494 | outputs=self.rnn.errors(self.y), 495 | givens={self.x: test_set_x[:, batch_start:batch_stop], 496 | self.y: test_set_y[:, batch_start:batch_stop]}, 497 | mode=mode) 498 | 499 | self.get_norms = {} 500 | for param in self.rnn.params: 501 | self.get_norms[param] = theano.function(inputs=[], 502 | outputs=self.rnn.l2_norms[param], mode=mode) 503 | 504 | # compute the gradient of cost with respect to theta using BPTT 505 | gtheta = T.grad(cost, self.rnn.theta) 506 | 507 | if optimizer == 'sgd': 508 | 509 | updates = {} 510 | theta = self.rnn.theta 511 | theta_update = self.rnn.theta_update 512 | # careful here, update to the shared variable 513 | # cannot depend on an updated other shared variable 514 | # since updates happen in parallel 515 | # so we need to be explicit 516 | upd = mom * theta_update - l_r * gtheta 517 | updates[theta_update] = upd 518 | updates[theta] = theta + upd 519 | 520 | # compiling a Theano function `train_model` that returns the 521 | # cost, but in the same time updates the parameter of the 522 | # model based on the rules defined in `updates` 523 | train_model = theano.function(inputs=[index, n_ex, l_r, mom], 524 | outputs=cost, 525 | updates=updates, 526 | givens={self.x: train_set_x[:, batch_start:batch_stop], 527 | self.y: train_set_y[:, batch_start:batch_stop]}, 528 | mode=mode) 529 | 530 | ############### 531 | # TRAIN MODEL # 532 | ############### 533 | logger.info('... 
training') 534 | epoch = 0 535 | 536 | while (epoch < self.n_epochs): 537 | epoch = epoch + 1 538 | effective_momentum = self.final_momentum \ 539 | if epoch > self.momentum_switchover \ 540 | else self.initial_momentum 541 | 542 | for minibatch_idx in xrange(n_train_batches): 543 | minibatch_avg_cost = train_model(minibatch_idx, n_train, 544 | self.learning_rate, 545 | effective_momentum) 546 | 547 | # iteration number (how many weight updates have we made?) 548 | # epoch is 1-based, index is 0 based 549 | iter = (epoch - 1) * n_train_batches + minibatch_idx + 1 550 | 551 | if iter % validation_frequency == 0: 552 | # compute loss on training set 553 | train_losses = [compute_train_error(i, n_train) 554 | for i in xrange(n_train_batches)] 555 | train_batch_sizes = [get_batch_size(i, n_train) 556 | for i in xrange(n_train_batches)] 557 | 558 | this_train_loss = np.average(train_losses, 559 | weights=train_batch_sizes) 560 | 561 | if compute_zero_one: 562 | train_zero_one = [compute_train_zo(i, n_train) 563 | for i in xrange(n_train_batches)] 564 | 565 | this_train_zero_one = np.average(train_zero_one, 566 | weights=train_batch_sizes) 567 | 568 | if self.interactive: 569 | test_losses = [compute_test_error(i, n_test) 570 | for i in xrange(n_test_batches)] 571 | 572 | test_batch_sizes = [get_batch_size(i, n_test) 573 | for i in xrange(n_test_batches)] 574 | 575 | this_test_loss = np.average(test_losses, 576 | weights=test_batch_sizes) 577 | 578 | if compute_zero_one: 579 | test_zero_one = [compute_test_zo(i, n_test) 580 | for i in xrange(n_test_batches)] 581 | 582 | this_test_zero_one = np.average(test_zero_one, 583 | weights=test_batch_sizes) 584 | 585 | if compute_zero_one: 586 | logger.info('epoch %i, mb %i/%i, tr loss %f, ' 587 | 'tr zo %f, te loss %f ' 588 | 'te zo %f lr: %f' % \ 589 | (epoch, minibatch_idx + 1, 590 | n_train_batches, 591 | this_train_loss, this_train_zero_one, 592 | this_test_loss, this_test_zero_one, 593 | self.learning_rate)) 594 | else: 595 | logger.info('epoch %i, mb %i/%i, tr loss %f ' 596 | 'te loss %f lr: %f' % \ 597 | (epoch, minibatch_idx + 1, n_train_batches, 598 | this_train_loss, this_test_loss, 599 | self.learning_rate)) 600 | 601 | else: 602 | if compute_zero_one: 603 | logger.info('epoch %i, mb %i/%i, train loss %f' 604 | ' train zo %f ' 605 | 'lr: %f' % (epoch, 606 | minibatch_idx + 1, 607 | n_train_batches, 608 | this_train_loss, 609 | this_train_zero_one, 610 | self.learning_rate)) 611 | else: 612 | logger.info('epoch %i, mb %i/%i, train loss %f' 613 | ' lr: %f' % (epoch, 614 | minibatch_idx + 1, 615 | n_train_batches, 616 | this_train_loss, 617 | self.learning_rate)) 618 | 619 | self.optional_output(train_set_x, show_norms, 620 | show_output) 621 | 622 | self.learning_rate *= self.learning_rate_decay 623 | 624 | if self.snapshot_every is not None: 625 | if (epoch + 1) % self.snapshot_every == 0: 626 | date_obj = datetime.datetime.now() 627 | date_str = date_obj.strftime('%Y-%m-%d-%H:%M:%S') 628 | class_name = self.__class__.__name__ 629 | fname = '%s.%s-snapshot-%d.pkl' % (class_name, 630 | date_str, epoch + 1) 631 | fabspath = os.path.join(self.snapshot_path, fname) 632 | self.save(fpath=fabspath) 633 | 634 | elif optimizer == 'cg' or optimizer == 'bfgs' \ 635 | or optimizer == 'l_bfgs_b': 636 | # compile a theano function that returns the cost of a minibatch 637 | batch_cost = theano.function(inputs=[index, n_ex], 638 | outputs=cost, 639 | givens={self.x: train_set_x[:, batch_start:batch_stop], 640 | self.y: train_set_y[:, batch_start:batch_stop]}, 
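# the cost is evaluated at the current value of self.rnn.theta, which train_fn / train_fn_grad below set explicitly before calling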
641 |                                      mode=mode, name="batch_cost")
642 | 
643 |             # compile a theano function that returns the gradient of the
644 |             # minibatch with respect to theta
645 |             batch_grad = theano.function(inputs=[index, n_ex],
646 |                                          outputs=T.grad(cost, self.rnn.theta),
647 |                                          givens={self.x: train_set_x[:, batch_start:batch_stop],
648 |                                                  self.y: train_set_y[:, batch_start:batch_stop]},
649 |                                          mode=mode, name="batch_grad")
650 | 
651 |             # creates a function that computes the average cost on the training
652 |             # set
653 |             def train_fn(theta_value):
654 |                 self.rnn.theta.set_value(theta_value, borrow=True)
655 |                 train_losses = [batch_cost(i, n_train)
656 |                                 for i in xrange(n_train_batches)]
657 |                 train_batch_sizes = [get_batch_size(i, n_train)
658 |                                      for i in xrange(n_train_batches)]
659 |                 return np.average(train_losses, weights=train_batch_sizes)
660 | 
661 |             # creates a function that computes the average gradient of cost
662 |             # with respect to theta
663 |             def train_fn_grad(theta_value):
664 |                 self.rnn.theta.set_value(theta_value, borrow=True)
665 | 
666 |                 train_grads = [batch_grad(i, n_train)
667 |                                for i in xrange(n_train_batches)]
668 |                 train_batch_sizes = [get_batch_size(i, n_train)
669 |                                      for i in xrange(n_train_batches)]
670 | 
671 |                 return np.average(train_grads, weights=train_batch_sizes,
672 |                                   axis=0)
673 | 
674 |             # validation function, prints useful output after each iteration
675 |             def callback(theta_value):
676 |                 self.epoch += 1
677 |                 if (self.epoch) % validate_every == 0:
678 |                     self.rnn.theta.set_value(theta_value, borrow=True)
679 |                     # compute loss on training set
680 |                     train_losses = [compute_train_error(i, n_train)
681 |                                     for i in xrange(n_train_batches)]
682 |                     train_batch_sizes = [get_batch_size(i, n_train)
683 |                                          for i in xrange(n_train_batches)]
684 | 
685 |                     this_train_loss = np.average(train_losses,
686 |                                                  weights=train_batch_sizes)
687 | 
688 |                     if compute_zero_one:
689 |                         train_zero_one = [compute_train_zo(i, n_train)
690 |                                           for i in xrange(n_train_batches)]
691 | 
692 |                         this_train_zero_one = np.average(train_zero_one,
693 |                                                          weights=train_batch_sizes)
694 | 
695 |                     if self.interactive:
696 |                         test_losses = [compute_test_error(i, n_test)
697 |                                        for i in xrange(n_test_batches)]
698 | 
699 |                         test_batch_sizes = [get_batch_size(i, n_test)
700 |                                             for i in xrange(n_test_batches)]
701 | 
702 |                         this_test_loss = np.average(test_losses,
703 |                                                     weights=test_batch_sizes)
704 | 
705 |                         if compute_zero_one:
706 |                             test_zero_one = [compute_test_zo(i, n_test)
707 |                                              for i in xrange(n_test_batches)]
708 | 
709 |                             this_test_zero_one = np.average(test_zero_one,
710 |                                                             weights=test_batch_sizes)
711 | 
712 |                         if compute_zero_one:
713 |                             logger.info('epoch %i, tr loss %f, '
714 |                                         'tr zo %f, te loss %f '
715 |                                         'te zo %f' % \
716 |                                         (self.epoch, this_train_loss,
717 |                                          this_train_zero_one, this_test_loss,
718 |                                          this_test_zero_one))
719 |                         else:
720 |                             logger.info('epoch %i, tr loss %f, te loss %f' % \
721 |                                         (self.epoch, this_train_loss,
722 |                                          this_test_loss))
723 | 
724 |                     else:
725 |                         if compute_zero_one:
726 |                             logger.info('epoch %i, train loss %f'
727 |                                         ', train zo %f ' % \
728 |                                         (self.epoch, this_train_loss,
729 |                                          this_train_zero_one))
730 |                         else:
731 |                             logger.info('epoch %i, train loss %f ' % \
732 |                                         (self.epoch, this_train_loss))
733 | 
734 |                     self.optional_output(train_set_x, show_norms, show_output)
735 | 
736 |             ###############
737 |             # TRAIN MODEL #
738 |             ###############
739 |             logger.info('...
training') 740 | # using scipy conjugate gradient optimizer 741 | import scipy.optimize 742 | if optimizer == 'cg': 743 | of = scipy.optimize.fmin_cg 744 | elif optimizer == 'bfgs': 745 | of = scipy.optimize.fmin_bfgs 746 | elif optimizer == 'l_bfgs_b': 747 | of = scipy.optimize.fmin_l_bfgs_b 748 | logger.info("Optimizing using %s..." % of.__name__) 749 | start_time = time.clock() 750 | 751 | # keep track of epochs externally 752 | # these get updated through callback 753 | self.epoch = 0 754 | 755 | # interface to l_bfgs_b is different than that of cg, bfgs 756 | # however, this will be changed in scipy 0.11 757 | # unified under scipy.optimize.minimize 758 | if optimizer == 'cg' or optimizer == 'bfgs': 759 | best_theta = of( 760 | f=train_fn, 761 | x0=self.rnn.theta.get_value(), 762 | # x0=np.zeros(self.rnn.theta.get_value().shape, 763 | # dtype=theano.config.floatX), 764 | fprime=train_fn_grad, 765 | callback=callback, 766 | disp=1, 767 | retall=1, 768 | maxiter=self.n_epochs) 769 | elif optimizer == 'l_bfgs_b': 770 | best_theta, f_best_theta, info = of( 771 | func=train_fn, 772 | x0=self.rnn.theta.get_value(), 773 | fprime=train_fn_grad, 774 | iprint=validate_every, 775 | maxfun=self.n_epochs) # max number of feval 776 | 777 | end_time = time.clock() 778 | 779 | print "Optimization time: %f" % (end_time - start_time) 780 | 781 | else: 782 | raise NotImplementedError 783 | 784 | 785 | def test_real(n_epochs=1000): 786 | """ Test RNN with real-valued outputs. """ 787 | n_hidden = 10 788 | n_in = 5 789 | n_out = 3 790 | n_steps = 10 791 | n_seq = 10 # per batch 792 | n_batches = 10 793 | 794 | np.random.seed(0) 795 | # simple lag test 796 | seq = np.random.randn(n_steps, n_seq * n_batches, n_in) 797 | targets = np.zeros((n_steps, n_seq * n_batches, n_out)) 798 | 799 | targets[1:, :, 0] = seq[:-1, :, 3] # delayed 1 800 | targets[1:, :, 1] = seq[:-1, :, 2] # delayed 1 801 | targets[2:, :, 2] = seq[:-2, :, 0] # delayed 2 802 | 803 | targets += 0.01 * np.random.standard_normal(targets.shape) 804 | 805 | model = MetaRNN(n_in=n_in, n_hidden=n_hidden, n_out=n_out, 806 | learning_rate=0.01, learning_rate_decay=0.999, 807 | n_epochs=n_epochs, batch_size=n_seq, activation='tanh', 808 | L2_reg=1e-3) 809 | 810 | model.fit(seq, targets, validate_every=100, optimizer='bfgs') 811 | 812 | plt.close('all') 813 | fig = plt.figure() 814 | ax1 = plt.subplot(211) 815 | plt.plot(seq[:, 0, :]) 816 | ax1.set_title('input') 817 | ax2 = plt.subplot(212) 818 | true_targets = plt.plot(targets[:, 0, :]) 819 | 820 | guess = model.predict(seq[:, 0, :][:, np.newaxis, :]) 821 | 822 | guessed_targets = plt.plot(guess.squeeze(), linestyle='--') 823 | for i, x in enumerate(guessed_targets): 824 | x.set_color(true_targets[i].get_color()) 825 | ax2.set_title('solid: true output, dashed: model output') 826 | 827 | 828 | def test_binary(multiple_out=False, n_epochs=1000, optimizer='cg'): 829 | """ Test RNN with binary outputs. 
""" 830 | n_hidden = 10 831 | n_in = 5 832 | if multiple_out: 833 | n_out = 2 834 | else: 835 | n_out = 1 836 | n_steps = 10 837 | n_seq = 10 # per batch 838 | n_batches = 50 839 | 840 | np.random.seed(0) 841 | # simple lag test 842 | seq = np.random.randn(n_steps, n_seq * n_batches, n_in) 843 | targets = np.zeros((n_steps, n_seq * n_batches, n_out)) 844 | 845 | # whether lag 1 (dim 3) is greater than lag 2 (dim 0) 846 | targets[2:, :, 0] = np.cast[np.int](seq[1:-1, :, 3] > seq[:-2, :, 0]) 847 | 848 | if multiple_out: 849 | # whether product of lag 1 (dim 4) and lag 1 (dim 2) 850 | # is less than lag 2 (dim 0) 851 | targets[2:, :, 1] = np.cast[np.int]( 852 | (seq[1:-1, :, 4] * seq[1:-1, :, 2]) > seq[:-2, :, 0]) 853 | 854 | model = MetaRNN(n_in=n_in, n_hidden=n_hidden, n_out=n_out, 855 | learning_rate=0.005, learning_rate_decay=0.999, 856 | n_epochs=n_epochs, batch_size=n_seq, activation='tanh', 857 | output_type='binary') 858 | 859 | model.fit(seq, targets, validate_every=100, compute_zero_one=True, 860 | optimizer=optimizer) 861 | 862 | seqs = xrange(10) 863 | 864 | plt.close('all') 865 | for seq_num in seqs: 866 | fig = plt.figure() 867 | ax1 = plt.subplot(211) 868 | plt.plot(seq[:, seq_num, :]) 869 | ax1.set_title('input') 870 | ax2 = plt.subplot(212) 871 | true_targets = plt.step(xrange(n_steps), targets[:, seq_num, :], 872 | marker='o') 873 | 874 | guess = model.predict_proba(seq[:, seq_num, :][:, np.newaxis, :]) 875 | guessed_targets = plt.step(xrange(n_steps), guess.squeeze()) 876 | plt.setp(guessed_targets, linestyle='--', marker='d') 877 | for i, x in enumerate(guessed_targets): 878 | x.set_color(true_targets[i].get_color()) 879 | ax2.set_ylim((-0.1, 1.1)) 880 | ax2.set_title('solid: true output, dashed: model output (prob)') 881 | 882 | 883 | def test_softmax(n_epochs=250, optimizer='cg'): 884 | """ Test RNN with softmax outputs. 
""" 885 | n_hidden = 10 886 | n_in = 5 887 | n_steps = 10 888 | n_seq = 10 # per batch 889 | n_batches = 50 890 | n_classes = 3 891 | n_out = n_classes # restricted to single softmax per time step 892 | 893 | np.random.seed(0) 894 | # simple lag test 895 | seq = np.random.randn(n_steps, n_seq * n_batches, n_in) 896 | targets = np.zeros((n_steps, n_seq * n_batches), dtype=np.int) 897 | 898 | thresh = 0.5 899 | # if lag 1 (dim 3) is greater than lag 2 (dim 0) + thresh 900 | # class 1 901 | # if lag 1 (dim 3) is less than lag 2 (dim 0) - thresh 902 | # class 2 903 | # if lag 2(dim0) - thresh <= lag 1 (dim 3) <= lag2(dim0) + thresh 904 | # class 0 905 | targets[2:, :][seq[1:-1, :, 3] > seq[:-2, :, 0] + thresh] = 1 906 | targets[2:, :][seq[1:-1, :, 3] < seq[:-2, :, 0] - thresh] = 2 907 | #targets[:, 2:, 0] = np.cast[np.int](seq[:, 1:-1, 3] > seq[:, :-2, 0]) 908 | 909 | model = MetaRNN(n_in=n_in, n_hidden=n_hidden, n_out=n_out, 910 | learning_rate=0.005, learning_rate_decay=0.999, 911 | n_epochs=n_epochs, batch_size=n_seq, activation='tanh', 912 | output_type='softmax') 913 | 914 | model.fit(seq, targets, validate_every=10, compute_zero_one=True, 915 | optimizer=optimizer) 916 | 917 | seqs = xrange(10) 918 | 919 | plt.close('all') 920 | for seq_num in seqs: 921 | fig = plt.figure() 922 | ax1 = plt.subplot(211) 923 | plt.plot(seq[:, seq_num]) 924 | ax1.set_title('input') 925 | ax2 = plt.subplot(212) 926 | 927 | # blue line will represent true classes 928 | true_targets = plt.step(xrange(n_steps), targets[:, seq_num], 929 | marker='o') 930 | 931 | # show probabilities (in b/w) output by model 932 | guess = model.predict_proba(seq[:, seq_num][:, np.newaxis]) 933 | guessed_probs = plt.imshow(guess.squeeze().T, interpolation='nearest', 934 | cmap='gray') 935 | ax2.set_title('blue: true class, grayscale: probs assigned by model') 936 | 937 | 938 | if __name__ == "__main__": 939 | logging.basicConfig(level=logging.INFO) 940 | t0 = time.time() 941 | test_real(n_epochs=1000) 942 | #test_binary(optimizer='sgd', n_epochs=1000) 943 | #test_softmax(n_epochs=250, optimizer='sgd') 944 | print "Elapsed time: %f" % (time.time() - t0) 945 | --------------------------------------------------------------------------------