├── .gitignore
├── .hgignore
├── .travis.yml
├── LICENSE.txt
├── README.rst
├── code
│   ├── DBN.py
│   ├── SdA.py
│   ├── cA.py
│   ├── convolutional_mlp.py
│   ├── dA.py
│   ├── hmc
│   │   ├── __init__.py
│   │   ├── hmc.py
│   │   └── test_hmc.py
│   ├── imdb.py
│   ├── imdb_preprocess.py
│   ├── logistic_cg.py
│   ├── logistic_sgd.py
│   ├── lstm.py
│   ├── mlp.py
│   ├── rbm.py
│   ├── rnnrbm.py
│   ├── rnnslu.py
│   ├── test.py
│   └── utils.py
├── data
│   ├── download.sh
│   └── training_colorpatches_16x16_demo.mat
├── doc
│   ├── .templates
│   │   └── layout.html
│   ├── DBN.txt
│   ├── LICENSE.txt
│   ├── Makefile
│   ├── SdA.txt
│   ├── conf.py
│   ├── contents.txt
│   ├── dA.txt
│   ├── gettingstarted.txt
│   ├── hmc.txt
│   ├── images
│   │   ├── 3wolfmoon.jpg
│   │   ├── 3wolfmoon_output.png
│   │   ├── DBN3.png
│   │   ├── bm.png
│   │   ├── cnn_explained.png
│   │   ├── conv_1D_nn.png
│   │   ├── filters_at_epoch_14.png
│   │   ├── filters_corruption_0.png
│   │   ├── filters_corruption_30.png
│   │   ├── lstm.png
│   │   ├── lstm_memorycell.png
│   │   ├── markov_chain.png
│   │   ├── mlp.png
│   │   ├── mnist_0.png
│   │   ├── mnist_1.png
│   │   ├── mnist_2.png
│   │   ├── mnist_3.png
│   │   ├── mnist_4.png
│   │   ├── mnist_5.png
│   │   ├── mylenet.png
│   │   ├── rbm.png
│   │   ├── rnnrbm.png
│   │   ├── rnnrbm.svg
│   │   ├── sample1.png
│   │   ├── sample2.png
│   │   ├── samples.png
│   │   └── sparse_1D_nn.png
│   ├── index.txt
│   ├── lenet.txt
│   ├── logreg.txt
│   ├── lstm.txt
│   ├── mlp.txt
│   ├── rbm.txt
│   ├── references.txt
│   ├── rnnrbm.txt
│   ├── rnnslu.txt
│   ├── scripts
│   │   └── docgen.py
│   └── utilities.txt
├── issues_closed
│   └── 2_RBM_cost_fn.txt
├── issues_open
│   ├── 1_SdA_performance.txt
│   ├── 3_RBM_scan_GPU.txt
│   ├── 4_RBM_scan.txt
│   ├── 5_results.txt
│   └── 6_benchmarking_pybrain.txt
└── misc
    └── do_nightly_build
/.gitignore:
--------------------------------------------------------------------------------
1 | code/*.pyc
2 | code/*_plots
3 | code/tmp*
4 | code/midi
5 | code/rnnslu
6 | data/atis.*
7 | data/mnist.pkl.gz
8 | data/mnist_py3k.pkl.gz
9 | data/Nottingham.zip
10 | data/Nottingham
11 | data/midi.zip
12 | html
13 | *.pyc
14 | *~
15 | *.swp
16 |
--------------------------------------------------------------------------------
/.hgignore:
--------------------------------------------------------------------------------
1 | syntax: glob
2 | *.pyc
3 | *.png
4 | *~
5 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | # After changing this file, check it on:
2 | # http://lint.travis-ci.org/
3 |
4 | language: python
5 | #python:
6 | # - "2.7"
7 | # - "3.2"
8 | # command to install dependencies
9 | before_install:
10 | - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
11 | - chmod +x miniconda.sh
12 | - ./miniconda.sh -b
13 | - export PATH=/home/travis/miniconda/bin:$PATH
14 | - conda update --yes conda
15 |
16 | install:
17 | - conda create --yes -q -n pyenv mkl python=2.7 numpy scipy pip nose yaml pyflakes pillow pyparsing=1.5
18 | - source activate pyenv
19 | - pip install git+git://github.com/Theano/Theano.git
20 |
21 | env:
22 | - PART="test.py:test_logistic_sgd test.py:test_logistic_cg test.py:test_mlp test.py:test_convolutional_mlp test.py:test_dA"
23 | - PART="test.py:test_SdA"
24 | - PART="test.py:test_dbn"
25 | - PART="test.py:test_rbm test.py:test_rnnrbm"
26 | - PART="-e test.py"
27 |
28 | #i7-2600K CPU @ 3.40GHz
29 | #166.572s #8 test.test_rbm OK
30 | #155.114s #7 test.test_dbn OK
31 | #152.365s #9 test.test_rnnrbm OK
32 | #127.286s #6 test.test_SdA OK
33 | #39.252s #5 test.test_dA OK
34 | #27.56s #4 test.test_convolutional_mlp OK
35 | #15.454s #3 test.test_mlp OK
36 | #12.732s #1 test.test_logistic_sgd OK
37 | #12.638s #2 test.test_logistic_cg OK
38 |
39 | #i7-920
40 | #296.475s #7 code.test.test_dbn OK
41 | #257.272s #6 code.test.test_SdA OK
42 | #234.776s #9 code.test.test_rnnrbm OK
43 | #233.896s #8 code.test.test_rbm OK
44 | #65.737s #5 code.test.test_dA OK
45 | #37.658s #4 code.test.test_convolutional_mlp OK
46 | #24.172s #3 code.test.test_mlp OK
47 | #20.401s #1 code.test.test_logistic_sgd OK
48 | #17.546s #2 code.test.test_logistic_cg OK
49 |
50 | # On Core2 duo E8500 with MRG
51 | #308.004s #7 code.test.test_dbn OK
52 | #277.268s #6 code.test.test_SdA OK
53 | #126.102s #8 code.test.test_rbm OK
54 | #123.652s #9 code.test.test_rnnrbm OK
55 | #77.101s #5 code.test.test_dA OK
56 | #39.75s #4 code.test.test_convolutional_mlp OK
57 | #30.406s #3 code.test.test_mlp OK
58 | #21.132s #2 code.test.test_logistic_cg OK
59 | #17.945s #1 code.test.test_logistic_sgd OK
60 |
61 | # Unknown computer with older version of Theano
62 | #569.882s #9 code.test.test_rbm OK
63 | #298.992s #8 code.test.test_dbn OK
64 | #268.901s #7 code.test.test_SdA OK
65 | #67.292s #6 code.test.test_dA OK
66 | #27.485s #4 code.test.test_mlp OK
67 | #26.204s #5 code.test.test_convolutional_mlp OK
68 | #14.676s #3 code.test.test_logistic_cg OK
69 | #10.66s #2 code.test.test_logistic_sgd OK
70 | #5.795s #1 code.hmc.test_hmc.test_hmc OK
71 |
72 | script:
73 | - cd data
74 | - ./download.sh
75 | - ls
76 | - cd ../code
77 | - pwd
78 | - ls
79 | - export THEANO_FLAGS=warn.ignore_bug_before=all,on_opt_error=raise,on_shape_error=raise
80 | - python --version
81 | - nosetests -v $PART
82 |
83 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | .. _license:
2 |
3 | LICENSE
4 | =======
5 |
6 | Copyright (c) 2010--2015, Deep Learning Tutorials Development Team
7 | All rights reserved.
8 |
9 | Redistribution and use in source and binary forms, with or without
10 | modification, are permitted provided that the following conditions are met:
11 |
12 | * Redistributions of source code must retain the above copyright
13 | notice, this list of conditions and the following disclaimer.
14 | * Redistributions in binary form must reproduce the above copyright
15 | notice, this list of conditions and the following disclaimer in the
16 | documentation and/or other materials provided with the distribution.
17 | * Neither the name of Theano nor the names of its contributors may be
18 | used to endorse or promote products derived from this software without
19 | specific prior written permission.
20 |
21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
22 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
25 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
28 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | Deep Learning Tutorials
2 | =======================
3 |
4 | Deep Learning is a new area of Machine Learning research, which has been
5 | introduced with the objective of moving Machine Learning closer to one of its
6 | original goals: Artificial Intelligence. Deep Learning is about learning
7 | multiple levels of representation and abstraction that help to make sense of
8 | data such as images, sound, and text. The tutorials presented here will
9 | introduce you to some of the most important deep learning algorithms and will
10 | also show you how to run them using Theano. Theano is a Python library that
11 | makes writing deep learning models easy, and gives the option of training them
12 | on a GPU.
13 |
14 | The easiest way to follow the tutorials is to `browse them online
15 | <http://deeplearning.net/tutorial/>`_.
16 |
17 | `Main development <https://github.com/lisa-lab/DeepLearningTutorials>`_
18 | of this project.
19 |
20 | .. image:: https://secure.travis-ci.org/lisa-lab/DeepLearningTutorials.png
21 | :target: http://travis-ci.org/lisa-lab/DeepLearningTutorials
22 |
23 | Project Layout
24 | --------------
25 |
26 | Subdirectories:
27 |
28 | - code - Python files corresponding to each tutorial
29 | - data - data and scripts to download data that is used by the tutorials
30 | - doc - restructured text used by Sphinx to build the tutorial website
31 | - html - built automatically by doc/Makefile, contains tutorial website
32 | - issues_closed - issue tracking
33 | - issues_open - issue tracking
34 | - misc - administrative scripts
35 |
36 |
37 | Build instructions
38 | ------------------
39 |
40 | To build the HTML version of the tutorials, install Sphinx and run the Makefile in the doc directory.
41 |
--------------------------------------------------------------------------------
/code/cA.py:
--------------------------------------------------------------------------------
1 | """This tutorial introduces Contractive auto-encoders (cA) using Theano.
2 |
3 | They are based on auto-encoders such as the ones used in Bengio et
4 | al. 2007. An autoencoder takes an input x and first maps it to a
5 | hidden representation y = f_{\theta}(x) = s(Wx+b), parameterized by
6 | \theta={W,b}. The resulting latent representation y is then mapped
7 | back to a "reconstructed" vector z \in [0,1]^d in input space z =
8 | g_{\theta'}(y) = s(W'y + b'). The weight matrix W' can optionally be
9 | constrained such that W' = W^T, in which case the autoencoder is said
10 | to have tied weights. The network is trained to minimize
11 | the reconstruction error (the error between x and z). Adding the
12 | squared Frobenius norm of the Jacobian of the hidden mapping h with
13 | respect to the visible units yields the contractive auto-encoder:
14 |
15 | - \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)]
16 | + \| \frac{\partial h(x)}{\partial x} \|^2
17 |
18 | References :
19 | - S. Rifai, P. Vincent, X. Muller, X. Glorot, Y. Bengio: Contractive
20 | Auto-Encoders: Explicit Invariance During Feature Extraction, ICML-11
21 |
22 | - S. Rifai, X. Muller, X. Glorot, G. Mesnil, Y. Bengio, and Pascal
23 | Vincent. Learning invariant features through local space
24 | contraction. Technical Report 1360, Universite de Montreal
25 |
26 | - Y. Bengio, P. Lamblin, D. Popovici, H. Larochelle: Greedy Layer-Wise
27 | Training of Deep Networks, Advances in Neural Information Processing
28 | Systems 19, 2007
29 |
30 | """
31 | import os
32 | import sys
33 | import timeit
34 |
35 | import numpy
36 |
37 | import theano
38 | import theano.tensor as T
39 |
40 |
41 | from logistic_sgd import load_data
42 | from utils import tile_raster_images
43 |
44 | try:
45 | import PIL.Image as Image
46 | except ImportError:
47 | import Image
48 |
49 |
50 | class cA(object):
51 | """ Contractive Auto-Encoder class (cA)
52 |
53 | The contractive autoencoder tries to reconstruct the input with an
54 | additional constraint on the latent space. With the objective of
55 | obtaining a robust representation of the input space, we
56 | regularize the L2 (Frobenius) norm of the Jacobian of the hidden
57 | representation with respect to the input. Please refer to Rifai et
58 | al., 2011 for more details.
59 |
60 | If x is the input then equation (1) computes the projection of the
61 | input into the latent space h. Equation (2) computes the jacobian
62 | of h with respect to x. Equation (3) computes the reconstruction
63 | of the input, while equation (4) computes the reconstruction
64 | error and the added regularization term from Eq.(2).
65 |
66 | .. math::
67 |
68 | h_i = s(W_i x + b_i) (1)
69 |
70 | J_i = h_i (1 - h_i) * W_i (2)
71 |
72 | x' = s(W' h + b') (3)
73 |
74 | L = -sum_{k=1}^d [x_k \log x'_k + (1-x_k) \log( 1-x'_k)]
75 | + lambda * sum_{i=1}^d sum_{j=1}^n J_{ij}^2 (4)
76 |
77 | """
78 |
79 | def __init__(self, numpy_rng, input=None, n_visible=784, n_hidden=100,
80 | n_batchsize=1, W=None, bhid=None, bvis=None):
81 | """Initialize the cA class by specifying the number of visible units
82 | (the dimension d of the input), the number of hidden units (the
83 | dimension d' of the latent or hidden space) and the contraction level.
84 | The constructor also receives symbolic variables for the input, weights
85 | and bias.
86 |
87 | :type numpy_rng: numpy.random.RandomState
88 | :param numpy_rng: numpy random number generator used to generate weights
89 |
90 | :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
91 | :param theano_rng: Theano random generator; if None is given
92 | one is generated based on a seed drawn from `rng`
93 |
94 | :type input: theano.tensor.TensorType
95 | :param input: a symbolic description of the input or None for
96 | standalone cA
97 |
98 | :type n_visible: int
99 | :param n_visible: number of visible units
100 |
101 | :type n_hidden: int
102 | :param n_hidden: number of hidden units
103 |
104 | :type n_batchsize: int
105 | :param n_batchsize: number of examples per batch
106 |
107 | :type W: theano.tensor.TensorType
108 | :param W: Theano variable pointing to a set of weights that should be
109 | shared between this cA and another architecture; if the cA should
110 | be standalone, set this to None
111 |
112 | :type bhid: theano.tensor.TensorType
113 | :param bhid: Theano variable pointing to a set of bias values (for
114 | hidden units) that should be shared between this cA and another
115 | architecture; if the cA should be standalone, set this to None
116 |
117 | :type bvis: theano.tensor.TensorType
118 | :param bvis: Theano variable pointing to a set of bias values (for
119 | visible units) that should be shared between this cA and another
120 | architecture; if the cA should be standalone, set this to None
121 |
122 | """
123 | self.n_visible = n_visible
124 | self.n_hidden = n_hidden
125 | self.n_batchsize = n_batchsize
126 | # note : W' was written as `W_prime` and b' as `b_prime`
127 | if not W:
128 | # W is initialized with `initial_W`, which is uniformly sampled
129 | # from -4*sqrt(6./(n_visible+n_hidden)) to
130 | # 4*sqrt(6./(n_hidden+n_visible)); the output of uniform is
131 | # converted using asarray to dtype
132 | # theano.config.floatX so that the code is runnable on GPU
133 | initial_W = numpy.asarray(
134 | numpy_rng.uniform(
135 | low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)),
136 | high=4 * numpy.sqrt(6. / (n_hidden + n_visible)),
137 | size=(n_visible, n_hidden)
138 | ),
139 | dtype=theano.config.floatX
140 | )
141 | W = theano.shared(value=initial_W, name='W', borrow=True)
142 |
143 | if not bvis:
144 | bvis = theano.shared(value=numpy.zeros(n_visible,
145 | dtype=theano.config.floatX),
146 | borrow=True)
147 |
148 | if not bhid:
149 | bhid = theano.shared(value=numpy.zeros(n_hidden,
150 | dtype=theano.config.floatX),
151 | name='b',
152 | borrow=True)
153 |
154 | self.W = W
155 | # b corresponds to the bias of the hidden
156 | self.b = bhid
157 | # b_prime corresponds to the bias of the visible
158 | self.b_prime = bvis
159 | # tied weights, therefore W_prime is W transpose
160 | self.W_prime = self.W.T
161 |
162 | # if no input is given, generate a variable representing the input
163 | if input is None:
164 | # we use a matrix because we expect a minibatch of several
165 | # examples, each example being a row
166 | self.x = T.dmatrix(name='input')
167 | else:
168 | self.x = input
169 |
170 | self.params = [self.W, self.b, self.b_prime]
171 |
172 | def get_hidden_values(self, input):
173 | """ Computes the values of the hidden layer """
174 | return T.nnet.sigmoid(T.dot(input, self.W) + self.b)
175 |
176 | def get_jacobian(self, hidden, W):
177 | """Computes the jacobian of the hidden layer with respect to
178 | the input, reshapes are necessary for broadcasting the
179 | element-wise product on the right axis
180 |
181 | """
182 | return T.reshape(hidden * (1 - hidden),
183 | (self.n_batchsize, 1, self.n_hidden)) * T.reshape(
184 | W, (1, self.n_visible, self.n_hidden))
185 |
186 | def get_reconstructed_input(self, hidden):
187 | """Computes the reconstructed input given the values of the
188 | hidden layer
189 |
190 | """
191 | return T.nnet.sigmoid(T.dot(hidden, self.W_prime) + self.b_prime)
192 |
193 | def get_cost_updates(self, contraction_level, learning_rate):
194 | """ This function computes the cost and the updates for one trainng
195 | step of the cA """
196 |
197 | y = self.get_hidden_values(self.x)
198 | z = self.get_reconstructed_input(y)
199 | J = self.get_jacobian(y, self.W)
200 | # note : we sum over the size of a datapoint; if we are using
201 | # minibatches, L will be a vector, with one entry per
202 | # example in minibatch
203 | self.L_rec = - T.sum(self.x * T.log(z) +
204 | (1 - self.x) * T.log(1 - z),
205 | axis=1)
206 |
207 | # Compute the jacobian and average over the number of samples/minibatch
208 | self.L_jacob = T.sum(J ** 2) / self.n_batchsize
209 |
210 | # note : L is now a vector, where each element is the
211 | # cross-entropy cost of the reconstruction of the
212 | # corresponding example of the minibatch. We need to
213 | # compute the average of all these to get the cost of
214 | # the minibatch
215 | cost = T.mean(self.L_rec) + contraction_level * T.mean(self.L_jacob)
216 |
217 | # compute the gradients of the cost of the `cA` with respect
218 | # to its parameters
219 | gparams = T.grad(cost, self.params)
220 | # generate the list of updates
221 | updates = []
222 | for param, gparam in zip(self.params, gparams):
223 | updates.append((param, param - learning_rate * gparam))
224 |
225 | return (cost, updates)
226 |
227 |
228 | def test_cA(learning_rate=0.01, training_epochs=20,
229 | dataset='mnist.pkl.gz',
230 | batch_size=10, output_folder='cA_plots', contraction_level=.1):
231 | """
232 | This demo is tested on MNIST
233 |
234 | :type learning_rate: float
235 | :param learning_rate: learning rate used for training the contracting
236 | AutoEncoder
237 |
238 | :type training_epochs: int
239 | :param training_epochs: number of epochs used for training
240 |
241 | :type dataset: string
242 | :param dataset: path to the pickled dataset
243 |
244 | """
245 | datasets = load_data(dataset)
246 | train_set_x, train_set_y = datasets[0]
247 |
248 | # compute number of minibatches for training, validation and testing
249 | n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
250 |
251 | # allocate symbolic variables for the data
252 | index = T.lscalar() # index to a [mini]batch
253 | x = T.matrix('x') # the data is presented as rasterized images
254 |
255 | if not os.path.isdir(output_folder):
256 | os.makedirs(output_folder)
257 | os.chdir(output_folder)
258 | ####################################
259 | # BUILDING THE MODEL #
260 | ####################################
261 |
262 | rng = numpy.random.RandomState(123)
263 |
264 | ca = cA(numpy_rng=rng, input=x,
265 | n_visible=28 * 28, n_hidden=500, n_batchsize=batch_size)
266 |
267 | cost, updates = ca.get_cost_updates(contraction_level=contraction_level,
268 | learning_rate=learning_rate)
269 |
270 | train_ca = theano.function(
271 | [index],
272 | [T.mean(ca.L_rec), ca.L_jacob],
273 | updates=updates,
274 | givens={
275 | x: train_set_x[index * batch_size: (index + 1) * batch_size]
276 | }
277 | )
278 |
279 | start_time = timeit.default_timer()
280 |
281 | ############
282 | # TRAINING #
283 | ############
284 |
285 | # go through training epochs
286 | for epoch in xrange(training_epochs):
287 | # go through trainng set
288 | c = []
289 | for batch_index in xrange(n_train_batches):
290 | c.append(train_ca(batch_index))
291 |
292 | c_array = numpy.vstack(c)
293 | print 'Training epoch %d, reconstruction cost ' % epoch, numpy.mean(
294 | c_array[0]), ' jacobian norm ', numpy.mean(numpy.sqrt(c_array[1]))
295 |
296 | end_time = timeit.default_timer()
297 |
298 | training_time = (end_time - start_time)
299 |
300 | print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
301 | ' ran for %.2fm' % ((training_time) / 60.))
302 | image = Image.fromarray(tile_raster_images(
303 | X=ca.W.get_value(borrow=True).T,
304 | img_shape=(28, 28), tile_shape=(10, 10),
305 | tile_spacing=(1, 1)))
306 |
307 | image.save('cae_filters.png')
308 |
309 | os.chdir('../')
310 |
311 |
312 | if __name__ == '__main__':
313 | test_cA()
314 |
--------------------------------------------------------------------------------
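
The contraction penalty computed in cA.get_cost_updates above is the squared Frobenius norm of the Jacobian of the sigmoid hidden layer, averaged over the minibatch (equations (2) and (4) in the class docstring). Below is a minimal NumPy sketch of that quantity, checked against a finite-difference Jacobian for a single example; the helper names (`sigmoid`, `contractive_penalty`) are illustrative only and not part of the tutorial code.

import numpy as np

def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))

def contractive_penalty(x, W, bhid):
    """Squared Frobenius norm of the Jacobian dh/dx for h = sigmoid(x.W + b),
    averaged over the minibatch (cf. cA.get_jacobian / L_jacob above).

    x: (batch, n_visible), W: (n_visible, n_hidden), bhid: (n_hidden,)
    """
    h = sigmoid(x.dot(W) + bhid)                    # (batch, n_hidden)
    # J[b, i, j] = h_j * (1 - h_j) * W[i, j]
    J = (h * (1 - h))[:, None, :] * W[None, :, :]   # (batch, n_visible, n_hidden)
    return (J ** 2).sum() / x.shape[0]

# finite-difference check on a single toy example
rng = np.random.RandomState(0)
x = rng.rand(1, 4)
W = 0.1 * rng.randn(4, 3)
b = np.zeros(3)

eps = 1e-6
J_fd = np.zeros((4, 3))
for i in range(4):
    dx = np.zeros_like(x)
    dx[0, i] = eps
    J_fd[i] = (sigmoid((x + dx).dot(W) + b) -
               sigmoid((x - dx).dot(W) + b))[0] / (2 * eps)

print(abs(contractive_penalty(x, W, b) - (J_fd ** 2).sum()))  # should be ~0
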
/code/convolutional_mlp.py:
--------------------------------------------------------------------------------
1 | """This tutorial introduces the LeNet5 neural network architecture
2 | using Theano. LeNet5 is a convolutional neural network, good for
3 | classifying images. This tutorial shows how to build the architecture,
4 | and comes with all the hyper-parameters you need to reproduce the
5 | paper's MNIST results.
6 |
7 |
8 | This implementation simplifies the model in the following ways:
9 |
10 | - LeNetConvPool doesn't implement location-specific gain and bias parameters
11 | - LeNetConvPool doesn't implement pooling by average, it implements pooling
12 | by max.
13 | - Digit classification is implemented with a logistic regression rather than
14 | an RBF network
15 | - LeNet5 did not use fully-connected convolutions at its second layer; this implementation does
16 |
17 | References:
18 | - Y. LeCun, L. Bottou, Y. Bengio and P. Haffner:
19 | Gradient-Based Learning Applied to Document
20 | Recognition, Proceedings of the IEEE, 86(11):2278-2324, November 1998.
21 | http://yann.lecun.com/exdb/publis/pdf/lecun-98.pdf
22 |
23 | """
24 | import os
25 | import sys
26 | import timeit
27 |
28 | import numpy
29 |
30 | import theano
31 | import theano.tensor as T
32 | from theano.tensor.signal import downsample
33 | from theano.tensor.nnet import conv
34 |
35 | from logistic_sgd import LogisticRegression, load_data
36 | from mlp import HiddenLayer
37 |
38 |
39 | class LeNetConvPoolLayer(object):
40 | """Pool Layer of a convolutional network """
41 |
42 | def __init__(self, rng, input, filter_shape, image_shape, poolsize=(2, 2)):
43 | """
44 | Allocate a LeNetConvPoolLayer with shared variable internal parameters.
45 |
46 | :type rng: numpy.random.RandomState
47 | :param rng: a random number generator used to initialize weights
48 |
49 | :type input: theano.tensor.dtensor4
50 | :param input: symbolic image tensor, of shape image_shape
51 |
52 | :type filter_shape: tuple or list of length 4
53 | :param filter_shape: (number of filters, num input feature maps,
54 | filter height, filter width)
55 |
56 | :type image_shape: tuple or list of length 4
57 | :param image_shape: (batch size, num input feature maps,
58 | image height, image width)
59 |
60 | :type poolsize: tuple or list of length 2
61 | :param poolsize: the downsampling (pooling) factor (#rows, #cols)
62 | """
63 |
64 | assert image_shape[1] == filter_shape[1]
65 | self.input = input
66 |
67 | # there are "num input feature maps * filter height * filter width"
68 | # inputs to each hidden unit
69 | fan_in = numpy.prod(filter_shape[1:])
70 | # each unit in the lower layer receives a gradient from:
71 | # "num output feature maps * filter height * filter width" /
72 | # pooling size
73 | fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]) /
74 | numpy.prod(poolsize))
75 | # initialize weights with random weights
76 | W_bound = numpy.sqrt(6. / (fan_in + fan_out))
77 | self.W = theano.shared(
78 | numpy.asarray(
79 | rng.uniform(low=-W_bound, high=W_bound, size=filter_shape),
80 | dtype=theano.config.floatX
81 | ),
82 | borrow=True
83 | )
84 |
85 | # the bias is a 1D tensor -- one bias per output feature map
86 | b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX)
87 | self.b = theano.shared(value=b_values, borrow=True)
88 |
89 | # convolve input feature maps with filters
90 | conv_out = conv.conv2d(
91 | input=input,
92 | filters=self.W,
93 | filter_shape=filter_shape,
94 | image_shape=image_shape
95 | )
96 |
97 | # downsample each feature map individually, using maxpooling
98 | pooled_out = downsample.max_pool_2d(
99 | input=conv_out,
100 | ds=poolsize,
101 | ignore_border=True
102 | )
103 |
104 | # add the bias term. Since the bias is a vector (1D array), we first
105 | # reshape it to a tensor of shape (1, n_filters, 1, 1). Each bias will
106 | # thus be broadcasted across mini-batches and feature map
107 | # width & height
108 | self.output = T.tanh(pooled_out + self.b.dimshuffle('x', 0, 'x', 'x'))
109 |
110 | # store parameters of this layer
111 | self.params = [self.W, self.b]
112 |
113 | # keep track of model input
114 | self.input = input
115 |
116 |
117 | def evaluate_lenet5(learning_rate=0.1, n_epochs=200,
118 | dataset='mnist.pkl.gz',
119 | nkerns=[20, 50], batch_size=500):
120 | """ Demonstrates lenet on MNIST dataset
121 |
122 | :type learning_rate: float
123 | :param learning_rate: learning rate used (factor for the stochastic
124 | gradient)
125 |
126 | :type n_epochs: int
127 | :param n_epochs: maximal number of epochs to run the optimizer
128 |
129 | :type dataset: string
130 | :param dataset: path to the dataset used for training /testing (MNIST here)
131 |
132 | :type nkerns: list of ints
133 | :param nkerns: number of kernels on each layer
134 | """
135 |
136 | rng = numpy.random.RandomState(23455)
137 |
138 | datasets = load_data(dataset)
139 |
140 | train_set_x, train_set_y = datasets[0]
141 | valid_set_x, valid_set_y = datasets[1]
142 | test_set_x, test_set_y = datasets[2]
143 |
144 | # compute number of minibatches for training, validation and testing
145 | n_train_batches = train_set_x.get_value(borrow=True).shape[0]
146 | n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
147 | n_test_batches = test_set_x.get_value(borrow=True).shape[0]
148 | n_train_batches /= batch_size
149 | n_valid_batches /= batch_size
150 | n_test_batches /= batch_size
151 |
152 | # allocate symbolic variables for the data
153 | index = T.lscalar() # index to a [mini]batch
154 |
155 | # start-snippet-1
156 | x = T.matrix('x') # the data is presented as rasterized images
157 | y = T.ivector('y') # the labels are presented as 1D vector of
158 | # [int] labels
159 |
160 | ######################
161 | # BUILD ACTUAL MODEL #
162 | ######################
163 | print '... building the model'
164 |
165 | # Reshape matrix of rasterized images of shape (batch_size, 28 * 28)
166 | # to a 4D tensor, compatible with our LeNetConvPoolLayer
167 | # (28, 28) is the size of MNIST images.
168 | layer0_input = x.reshape((batch_size, 1, 28, 28))
169 |
170 | # Construct the first convolutional pooling layer:
171 | # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
172 | # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
173 | # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
174 | layer0 = LeNetConvPoolLayer(
175 | rng,
176 | input=layer0_input,
177 | image_shape=(batch_size, 1, 28, 28),
178 | filter_shape=(nkerns[0], 1, 5, 5),
179 | poolsize=(2, 2)
180 | )
181 |
182 | # Construct the second convolutional pooling layer
183 | # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
184 | # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
185 | # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
186 | layer1 = LeNetConvPoolLayer(
187 | rng,
188 | input=layer0.output,
189 | image_shape=(batch_size, nkerns[0], 12, 12),
190 | filter_shape=(nkerns[1], nkerns[0], 5, 5),
191 | poolsize=(2, 2)
192 | )
193 |
194 | # the HiddenLayer being fully-connected, it operates on 2D matrices of
195 | # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
196 | # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
197 | # or (500, 50 * 4 * 4) = (500, 800) with the default values.
198 | layer2_input = layer1.output.flatten(2)
199 |
200 | # construct a fully-connected sigmoidal layer
201 | layer2 = HiddenLayer(
202 | rng,
203 | input=layer2_input,
204 | n_in=nkerns[1] * 4 * 4,
205 | n_out=500,
206 | activation=T.tanh
207 | )
208 |
209 | # classify the values of the fully-connected sigmoidal layer
210 | layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)
211 |
212 | # the cost we minimize during training is the NLL of the model
213 | cost = layer3.negative_log_likelihood(y)
214 |
215 | # create a function to compute the mistakes that are made by the model
216 | test_model = theano.function(
217 | [index],
218 | layer3.errors(y),
219 | givens={
220 | x: test_set_x[index * batch_size: (index + 1) * batch_size],
221 | y: test_set_y[index * batch_size: (index + 1) * batch_size]
222 | }
223 | )
224 |
225 | validate_model = theano.function(
226 | [index],
227 | layer3.errors(y),
228 | givens={
229 | x: valid_set_x[index * batch_size: (index + 1) * batch_size],
230 | y: valid_set_y[index * batch_size: (index + 1) * batch_size]
231 | }
232 | )
233 |
234 | # create a list of all model parameters to be fit by gradient descent
235 | params = layer3.params + layer2.params + layer1.params + layer0.params
236 |
237 | # create a list of gradients for all model parameters
238 | grads = T.grad(cost, params)
239 |
240 | # train_model is a function that updates the model parameters by
241 | # SGD. Since this model has many parameters, it would be tedious to
242 | # manually create an update rule for each model parameter. We thus
243 | # create the updates list by automatically looping over all
244 | # (params[i], grads[i]) pairs.
245 | updates = [
246 | (param_i, param_i - learning_rate * grad_i)
247 | for param_i, grad_i in zip(params, grads)
248 | ]
249 |
250 | train_model = theano.function(
251 | [index],
252 | cost,
253 | updates=updates,
254 | givens={
255 | x: train_set_x[index * batch_size: (index + 1) * batch_size],
256 | y: train_set_y[index * batch_size: (index + 1) * batch_size]
257 | }
258 | )
259 | # end-snippet-1
260 |
261 | ###############
262 | # TRAIN MODEL #
263 | ###############
264 | print '... training'
265 | # early-stopping parameters
266 | patience = 10000 # look at this many examples regardless
267 | patience_increase = 2 # wait this much longer when a new best is
268 | # found
269 | improvement_threshold = 0.995 # a relative improvement of this much is
270 | # considered significant
271 | validation_frequency = min(n_train_batches, patience / 2)
272 | # go through this many
273 | # minibatches before checking the network
274 | # on the validation set; in this case we
275 | # check every epoch
276 |
277 | best_validation_loss = numpy.inf
278 | best_iter = 0
279 | test_score = 0.
280 | start_time = timeit.default_timer()
281 |
282 | epoch = 0
283 | done_looping = False
284 |
285 | while (epoch < n_epochs) and (not done_looping):
286 | epoch = epoch + 1
287 | for minibatch_index in xrange(n_train_batches):
288 |
289 | iter = (epoch - 1) * n_train_batches + minibatch_index
290 |
291 | if iter % 100 == 0:
292 | print 'training @ iter = ', iter
293 | cost_ij = train_model(minibatch_index)
294 |
295 | if (iter + 1) % validation_frequency == 0:
296 |
297 | # compute zero-one loss on validation set
298 | validation_losses = [validate_model(i) for i
299 | in xrange(n_valid_batches)]
300 | this_validation_loss = numpy.mean(validation_losses)
301 | print('epoch %i, minibatch %i/%i, validation error %f %%' %
302 | (epoch, minibatch_index + 1, n_train_batches,
303 | this_validation_loss * 100.))
304 |
305 | # if we got the best validation score until now
306 | if this_validation_loss < best_validation_loss:
307 |
308 | #improve patience if loss improvement is good enough
309 | if this_validation_loss < best_validation_loss * \
310 | improvement_threshold:
311 | patience = max(patience, iter * patience_increase)
312 |
313 | # save best validation score and iteration number
314 | best_validation_loss = this_validation_loss
315 | best_iter = iter
316 |
317 | # test it on the test set
318 | test_losses = [
319 | test_model(i)
320 | for i in xrange(n_test_batches)
321 | ]
322 | test_score = numpy.mean(test_losses)
323 | print((' epoch %i, minibatch %i/%i, test error of '
324 | 'best model %f %%') %
325 | (epoch, minibatch_index + 1, n_train_batches,
326 | test_score * 100.))
327 |
328 | if patience <= iter:
329 | done_looping = True
330 | break
331 |
332 | end_time = timeit.default_timer()
333 | print('Optimization complete.')
334 | print('Best validation score of %f %% obtained at iteration %i, '
335 | 'with test performance %f %%' %
336 | (best_validation_loss * 100., best_iter + 1, test_score * 100.))
337 | print >> sys.stderr, ('The code for file ' +
338 | os.path.split(__file__)[1] +
339 | ' ran for %.2fm' % ((end_time - start_time) / 60.))
340 |
341 | if __name__ == '__main__':
342 | evaluate_lenet5()
343 |
344 |
345 | def experiment(state, channel):
346 | evaluate_lenet5(state.learning_rate, dataset=state.dataset)
347 |
--------------------------------------------------------------------------------
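
The comments in evaluate_lenet5 above track how a 28x28 MNIST image shrinks through each LeNetConvPoolLayer: a 'valid' 5x5 convolution followed by 2x2 max pooling maps 28 -> 24 -> 12, then 12 -> 8 -> 4, so layer2 receives nkerns[1] * 4 * 4 inputs per example. A small sketch of that shape arithmetic; the helper `conv_pool_output_size` is illustrative only, not part of the tutorial code.

def conv_pool_output_size(img, filt, pool):
    """Side length after one LeNetConvPoolLayer: a 'valid' convolution
    (img - filt + 1) followed by non-overlapping pooling with
    ignore_border=True (integer division by the pool size)."""
    return (img - filt + 1) // pool

size = 28                                   # MNIST images are 28x28
for layer, (filt, pool) in enumerate([(5, 2), (5, 2)]):
    size = conv_pool_output_size(size, filt, pool)
    print("layer%d output: %d x %d per feature map" % (layer, size, size))

nkerns = [20, 50]
print("layer2 input size: %d" % (nkerns[1] * size * size))   # 50 * 4 * 4 = 800
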
/code/hmc/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/code/hmc/__init__.py
--------------------------------------------------------------------------------
/code/hmc/test_hmc.py:
--------------------------------------------------------------------------------
1 | import numpy
2 | from scipy import linalg
3 | import theano
4 |
5 | from hmc import HMC_sampler
6 |
7 |
8 | def sampler_on_nd_gaussian(sampler_cls, burnin, n_samples, dim=10):
9 | batchsize = 3
10 |
11 | rng = numpy.random.RandomState(123)
12 |
13 | # Define a covariance and mu for a gaussian
14 | mu = numpy.array(rng.rand(dim) * 10, dtype=theano.config.floatX)
15 | cov = numpy.array(rng.rand(dim, dim), dtype=theano.config.floatX)
16 | cov = (cov + cov.T) / 2.
17 | cov[numpy.arange(dim), numpy.arange(dim)] = 1.0
18 | cov_inv = linalg.inv(cov)
19 |
20 | # Define energy function for a multi-variate Gaussian
21 | def gaussian_energy(x):
22 | return 0.5 * (theano.tensor.dot((x - mu), cov_inv) *
23 | (x - mu)).sum(axis=1)
24 |
25 | # Declare a shared random variable for positions
26 | position = rng.randn(batchsize, dim).astype(theano.config.floatX)
27 | position = theano.shared(position)
28 |
29 | # Create HMC sampler
30 | sampler = sampler_cls(position, gaussian_energy,
31 | initial_stepsize=1e-3, stepsize_max=0.5)
32 |
33 | # Start with a burn-in process
34 | garbage = [sampler.draw() for r in xrange(burnin)] # burn-in
35 | # Draw `n_samples`: result is a 3D tensor of dim [n_samples, batchsize,
36 | # dim]
37 | _samples = numpy.asarray([sampler.draw() for r in xrange(n_samples)])
38 | # Flatten to [n_samples * batchsize, dim]
39 | samples = _samples.T.reshape(dim, -1).T
40 |
41 | print '****** TARGET VALUES ******'
42 | print 'target mean:', mu
43 | print 'target cov:\n', cov
44 |
45 | print '****** EMPIRICAL MEAN/COV USING HMC ******'
46 | print 'empirical mean: ', samples.mean(axis=0)
47 | print 'empirical_cov:\n', numpy.cov(samples.T)
48 |
49 | print '****** HMC INTERNALS ******'
50 | print 'final stepsize', sampler.stepsize.get_value()
51 | print 'final acceptance_rate', sampler.avg_acceptance_rate.get_value()
52 |
53 | return sampler
54 |
55 |
56 | def test_hmc():
57 | sampler = sampler_on_nd_gaussian(HMC_sampler.new_from_shared_positions,
58 | burnin=1000, n_samples=1000, dim=5)
59 | assert abs(sampler.avg_acceptance_rate.get_value() -
60 | sampler.target_acceptance_rate) < .1
61 | assert sampler.stepsize.get_value() >= sampler.stepsize_min
62 | assert sampler.stepsize.get_value() <= sampler.stepsize_max
63 |
--------------------------------------------------------------------------------
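
test_hmc above exercises HMC_sampler from code/hmc/hmc.py, whose implementation is not reproduced in this listing. As a rough, self-contained reminder of what one HMC update does, here is a NumPy sketch of a single leapfrog trajectory with a Metropolis accept/reject step, run on a 1-D standard Gaussian. It is deliberately simplified (one chain, fixed step size, no acceptance-rate adaptation) and does not mirror the Theano implementation.

import numpy as np

def hmc_step(x, energy, grad_energy, stepsize, n_leapfrog, rng):
    """One HMC proposal: leapfrog integration of Hamiltonian dynamics
    followed by a Metropolis accept/reject on the total energy."""
    p0 = rng.randn(*x.shape)                     # resample momentum
    x_new, p = x.copy(), p0.copy()
    p -= 0.5 * stepsize * grad_energy(x_new)     # initial half step
    for _ in range(n_leapfrog):
        x_new += stepsize * p                    # full position step
        p -= stepsize * grad_energy(x_new)       # full momentum step
    p += 0.5 * stepsize * grad_energy(x_new)     # undo the extra half step
    h0 = energy(x) + 0.5 * (p0 ** 2).sum()
    h1 = energy(x_new) + 0.5 * (p ** 2).sum()
    return x_new if rng.rand() < np.exp(h0 - h1) else x

# target: standard 1-D Gaussian, E(x) = x^2 / 2, dE/dx = x
rng = np.random.RandomState(123)
x = np.zeros(1)
samples = []
for _ in range(5000):
    x = hmc_step(x, lambda v: 0.5 * (v ** 2).sum(), lambda v: v,
                 stepsize=0.2, n_leapfrog=20, rng=rng)
    samples.append(x[0])
print("empirical mean %.2f, std %.2f" % (np.mean(samples), np.std(samples)))
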
/code/imdb.py:
--------------------------------------------------------------------------------
1 | import cPickle
2 | import gzip
3 | import os
4 |
5 | import numpy
6 | import theano
7 |
8 |
9 | def prepare_data(seqs, labels, maxlen=None):
10 | """Create the matrices from the datasets.
11 |
12 | This pads each sequence to the same length: the length of the
13 | longest sequence or maxlen.
14 |
15 | If maxlen is set, sequences of length >= maxlen are
16 | dropped.
17 |
18 | This swaps the axes!
19 | """
20 | # x: a list of sentences
21 | lengths = [len(s) for s in seqs]
22 |
23 | if maxlen is not None:
24 | new_seqs = []
25 | new_labels = []
26 | new_lengths = []
27 | for l, s, y in zip(lengths, seqs, labels):
28 | if l < maxlen:
29 | new_seqs.append(s)
30 | new_labels.append(y)
31 | new_lengths.append(l)
32 | lengths = new_lengths
33 | labels = new_labels
34 | seqs = new_seqs
35 |
36 | if len(lengths) < 1:
37 | return None, None, None
38 |
39 | n_samples = len(seqs)
40 | maxlen = numpy.max(lengths)
41 |
42 | x = numpy.zeros((maxlen, n_samples)).astype('int64')
43 | x_mask = numpy.zeros((maxlen, n_samples)).astype(theano.config.floatX)
44 | for idx, s in enumerate(seqs):
45 | x[:lengths[idx], idx] = s
46 | x_mask[:lengths[idx], idx] = 1.
47 |
48 | return x, x_mask, labels
49 |
50 |
51 | def get_dataset_file(dataset, default_dataset, origin):
52 | '''Look for the dataset as if it were a full path; if not found, try a
53 | local file, and failing that, look in the data directory.
54 |
55 | Download the dataset if it is not present.
56 |
57 | '''
58 | data_dir, data_file = os.path.split(dataset)
59 | if data_dir == "" and not os.path.isfile(dataset):
60 | # Check if dataset is in the data directory.
61 | new_path = os.path.join(
62 | os.path.split(__file__)[0],
63 | "..",
64 | "data",
65 | dataset
66 | )
67 | if os.path.isfile(new_path) or data_file == default_dataset:
68 | dataset = new_path
69 |
70 | if (not os.path.isfile(dataset)) and data_file == default_dataset:
71 | import urllib
72 | print 'Downloading data from %s' % origin
73 | urllib.urlretrieve(origin, dataset)
74 | return dataset
75 |
76 |
77 | def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None,
78 | sort_by_len=True):
79 | '''Loads the dataset
80 |
81 | :type path: String
82 | :param path: The path to the dataset (here IMDB)
83 | :type n_words: int
84 | :param n_words: The number of word to keep in the vocabulary.
85 | All extra words are set to unknow (1).
86 | :type valid_portion: float
87 | :param valid_portion: The proportion of the full train set used for
88 | the validation set.
89 | :type maxlen: None or positive int
90 | :param maxlen: the max sequence length we use in the train/valid set.
91 | :type sort_by_len: bool
92 | :name sort_by_len: Sort by the sequence lenght for the train,
93 | valid and test set. This allow faster execution as it cause
94 | less padding per minibatch. Another mechanism must be used to
95 | shuffle the train set at each epoch.
96 |
97 | '''
98 |
99 | #############
100 | # LOAD DATA #
101 | #############
102 |
103 | # Load the dataset
104 | path = get_dataset_file(
105 | path, "imdb.pkl",
106 | "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl")
107 |
108 | if path.endswith(".gz"):
109 | f = gzip.open(path, 'rb')
110 | else:
111 | f = open(path, 'rb')
112 |
113 | train_set = cPickle.load(f)
114 | test_set = cPickle.load(f)
115 | f.close()
116 | if maxlen:
117 | new_train_set_x = []
118 | new_train_set_y = []
119 | for x, y in zip(train_set[0], train_set[1]):
120 | if len(x) < maxlen:
121 | new_train_set_x.append(x)
122 | new_train_set_y.append(y)
123 | train_set = (new_train_set_x, new_train_set_y)
124 | del new_train_set_x, new_train_set_y
125 |
126 | # split training set into validation set
127 | train_set_x, train_set_y = train_set
128 | n_samples = len(train_set_x)
129 | sidx = numpy.random.permutation(n_samples)
130 | n_train = int(numpy.round(n_samples * (1. - valid_portion)))
131 | valid_set_x = [train_set_x[s] for s in sidx[n_train:]]
132 | valid_set_y = [train_set_y[s] for s in sidx[n_train:]]
133 | train_set_x = [train_set_x[s] for s in sidx[:n_train]]
134 | train_set_y = [train_set_y[s] for s in sidx[:n_train]]
135 |
136 | train_set = (train_set_x, train_set_y)
137 | valid_set = (valid_set_x, valid_set_y)
138 |
139 | def remove_unk(x):
140 | return [[1 if w >= n_words else w for w in sen] for sen in x]
141 |
142 | test_set_x, test_set_y = test_set
143 | valid_set_x, valid_set_y = valid_set
144 | train_set_x, train_set_y = train_set
145 |
146 | train_set_x = remove_unk(train_set_x)
147 | valid_set_x = remove_unk(valid_set_x)
148 | test_set_x = remove_unk(test_set_x)
149 |
150 | def len_argsort(seq):
151 | return sorted(range(len(seq)), key=lambda x: len(seq[x]))
152 |
153 | if sort_by_len:
154 | sorted_index = len_argsort(test_set_x)
155 | test_set_x = [test_set_x[i] for i in sorted_index]
156 | test_set_y = [test_set_y[i] for i in sorted_index]
157 |
158 | sorted_index = len_argsort(valid_set_x)
159 | valid_set_x = [valid_set_x[i] for i in sorted_index]
160 | valid_set_y = [valid_set_y[i] for i in sorted_index]
161 |
162 | sorted_index = len_argsort(train_set_x)
163 | train_set_x = [train_set_x[i] for i in sorted_index]
164 | train_set_y = [train_set_y[i] for i in sorted_index]
165 |
166 | train = (train_set_x, train_set_y)
167 | valid = (valid_set_x, valid_set_y)
168 | test = (test_set_x, test_set_y)
169 |
170 | return train, valid, test
171 |
--------------------------------------------------------------------------------
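
prepare_data above returns a time-major layout: an int64 matrix x of shape (maxlen, n_samples) and a float mask of the same shape, one column per sequence, with time running down the rows. A toy NumPy illustration of that layout, mirroring the padding loop above with hard-coded example sequences:

import numpy as np

seqs = [[3, 7, 2], [5, 1], [9, 4, 6, 8]]
lengths = [len(s) for s in seqs]
maxlen, n_samples = max(lengths), len(seqs)

x = np.zeros((maxlen, n_samples), dtype='int64')
x_mask = np.zeros((maxlen, n_samples), dtype='float32')
for idx, s in enumerate(seqs):
    x[:lengths[idx], idx] = s        # column idx holds sequence idx
    x_mask[:lengths[idx], idx] = 1.  # 1 where a real symbol is present

print(x)
# [[3 5 9]
#  [7 1 4]
#  [2 0 6]
#  [0 0 8]]
print(x_mask[:, 1])  # [1. 1. 0. 0.] -> padding positions are masked out
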
/code/imdb_preprocess.py:
--------------------------------------------------------------------------------
1 | """
2 | This script is what created the pickled dataset.
3 |
4 | 1) You need to download the tokenizer script below and put it in the same directory as this file.
5 | https://github.com/moses-smt/mosesdecoder/raw/master/scripts/tokenizer/tokenizer.perl . Give it execution permission.
6 |
7 | 2) Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/ and extract it in the current directory.
8 |
9 | 3) Then run this script.
10 | """
11 |
12 | dataset_path='/Tmp/bastienf/aclImdb/'
13 |
14 | import numpy
15 | import cPickle as pkl
16 |
17 | from collections import OrderedDict
18 |
19 | import glob
20 | import os
21 |
22 | from subprocess import Popen, PIPE
23 |
24 | # tokenizer.perl is from Moses: https://github.com/moses-smt/mosesdecoder/tree/master/scripts/tokenizer
25 | tokenizer_cmd = ['./tokenizer.perl', '-l', 'en', '-q', '-']
26 |
27 |
28 | def tokenize(sentences):
29 |
30 | print 'Tokenizing..',
31 | text = "\n".join(sentences)
32 | tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE)
33 | tok_text, _ = tokenizer.communicate(text)
34 | toks = tok_text.split('\n')[:-1]
35 | print 'Done'
36 |
37 | return toks
38 |
39 |
40 | def build_dict(path):
41 | sentences = []
42 | currdir = os.getcwd()
43 | os.chdir('%s/pos/' % path)
44 | for ff in glob.glob("*.txt"):
45 | with open(ff, 'r') as f:
46 | sentences.append(f.readline().strip())
47 | os.chdir('%s/neg/' % path)
48 | for ff in glob.glob("*.txt"):
49 | with open(ff, 'r') as f:
50 | sentences.append(f.readline().strip())
51 | os.chdir(currdir)
52 |
53 | sentences = tokenize(sentences)
54 |
55 | print 'Building dictionary..',
56 | wordcount = dict()
57 | for ss in sentences:
58 | words = ss.strip().lower().split()
59 | for w in words:
60 | if w not in wordcount:
61 | wordcount[w] = 1
62 | else:
63 | wordcount[w] += 1
64 |
65 | counts = wordcount.values()
66 | keys = wordcount.keys()
67 |
68 | sorted_idx = numpy.argsort(counts)[::-1]
69 |
70 | worddict = dict()
71 |
72 | for idx, ss in enumerate(sorted_idx):
73 | worddict[keys[ss]] = idx+2 # leave 0 and 1 (UNK)
74 |
75 | print numpy.sum(counts), ' total words ', len(keys), ' unique words'
76 |
77 | return worddict
78 |
79 |
80 | def grab_data(path, dictionary):
81 | sentences = []
82 | currdir = os.getcwd()
83 | os.chdir(path)
84 | for ff in glob.glob("*.txt"):
85 | with open(ff, 'r') as f:
86 | sentences.append(f.readline().strip())
87 | os.chdir(currdir)
88 | sentences = tokenize(sentences)
89 |
90 | seqs = [None] * len(sentences)
91 | for idx, ss in enumerate(sentences):
92 | words = ss.strip().lower().split()
93 | seqs[idx] = [dictionary[w] if w in dictionary else 1 for w in words]
94 |
95 | return seqs
96 |
97 |
98 | def main():
99 | # Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/
100 | path = dataset_path
101 | dictionary = build_dict(os.path.join(path, 'train'))
102 |
103 | train_x_pos = grab_data(path+'train/pos', dictionary)
104 | train_x_neg = grab_data(path+'train/neg', dictionary)
105 | train_x = train_x_pos + train_x_neg
106 | train_y = [1] * len(train_x_pos) + [0] * len(train_x_neg)
107 |
108 | test_x_pos = grab_data(path+'test/pos', dictionary)
109 | test_x_neg = grab_data(path+'test/neg', dictionary)
110 | test_x = test_x_pos + test_x_neg
111 | test_y = [1] * len(test_x_pos) + [0] * len(test_x_neg)
112 |
113 | f = open('imdb.pkl', 'wb')
114 | pkl.dump((train_x, train_y), f, -1)
115 | pkl.dump((test_x, test_y), f, -1)
116 | f.close()
117 |
118 | f = open('imdb.dict.pkl', 'wb')
119 | pkl.dump(dictionary, f, -1)
120 | f.close()
121 |
122 | if __name__ == '__main__':
123 | main()
124 |
--------------------------------------------------------------------------------
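
build_dict and grab_data above rank words by corpus frequency and reserve index 0 (padding) and index 1 (unknown words). A toy sketch of the same indexing scheme, with the Moses tokenizer replaced by a plain lowercase split purely for illustration:

import numpy as np

sentences = ["the cat sat on the mat", "the dog sat"]

# count word frequencies, as in build_dict
wordcount = {}
for ss in sentences:
    for w in ss.lower().split():
        wordcount[w] = wordcount.get(w, 0) + 1

keys = list(wordcount.keys())
counts = list(wordcount.values())
sorted_idx = np.argsort(counts)[::-1]            # most frequent word first
worddict = dict((keys[i], rank + 2)              # indices 0 and 1 are reserved
                for rank, i in enumerate(sorted_idx))

# map a sentence to indices, as in grab_data (unknown words -> 1)
def encode(sentence, dictionary):
    return [dictionary.get(w, 1) for w in sentence.lower().split()]

print(worddict)                           # e.g. {'the': 2, 'sat': 3, ...}
print(encode("the bird sat", worddict))   # e.g. [2, 1, 3]; 'bird' is unknown
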
/code/logistic_cg.py:
--------------------------------------------------------------------------------
1 | """
2 | This tutorial introduces logistic regression using Theano and conjugate
3 | gradient descent.
4 |
5 | Logistic regression is a probabilistic, linear classifier. It is parametrized
6 | by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is
7 | done by projecting data points onto a set of hyperplanes, the distance to
8 | which is used to determine a class membership probability.
9 |
10 | Mathematically, this can be written as:
11 |
12 | .. math::
13 | P(Y=i|x, W,b) &= softmax_i(W x + b) \\
14 | &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}}
15 |
16 |
17 | The model's prediction is then obtained by taking the argmax of
18 | the vector whose i'th element is P(Y=i|x).
19 |
20 | .. math::
21 |
22 | y_{pred} = argmax_i P(Y=i|x,W,b)
23 |
24 |
25 | This tutorial presents a conjugate gradient optimization method that is
26 | suitable for smaller datasets.
27 |
28 |
29 | References:
30 |
31 | - textbooks: "Pattern Recognition and Machine Learning" -
32 | Christopher M. Bishop, section 4.3.2
33 |
34 |
35 | """
36 | __docformat__ = 'restructedtext en'
37 |
38 |
39 | import os
40 | import sys
41 | import timeit
42 |
43 | import numpy
44 |
45 | import theano
46 | import theano.tensor as T
47 |
48 | from logistic_sgd import load_data
49 |
50 |
51 | class LogisticRegression(object):
52 | """Multi-class Logistic Regression Class
53 |
54 | The logistic regression is fully described by a weight matrix :math:`W`
55 | and bias vector :math:`b`. Classification is done by projecting data
56 | points onto a set of hyperplanes, the distance to which is used to
57 | determine a class membership probability.
58 | """
59 |
60 | def __init__(self, input, n_in, n_out):
61 | """ Initialize the parameters of the logistic regression
62 |
63 | :type input: theano.tensor.TensorType
64 | :param input: symbolic variable that describes the input of the
65 | architecture ( one minibatch)
66 |
67 | :type n_in: int
68 | :param n_in: number of input units, the dimension of the space in
69 | which the datapoint lies
70 |
71 | :type n_out: int
72 | :param n_out: number of output units, the dimension of the space in
73 | which the target lies
74 |
75 | """
76 |
77 | # initialize theta = (W,b) with 0s; W gets the shape (n_in, n_out),
78 | # while b is a vector of n_out elements, making theta a vector of
79 | # n_in*n_out + n_out elements
80 | self.theta = theano.shared(
81 | value=numpy.zeros(
82 | n_in * n_out + n_out,
83 | dtype=theano.config.floatX
84 | ),
85 | name='theta',
86 | borrow=True
87 | )
88 | # W is represented by the first n_in*n_out elements of theta
89 | self.W = self.theta[0:n_in * n_out].reshape((n_in, n_out))
90 | # b is the rest (last n_out elements)
91 | self.b = self.theta[n_in * n_out:n_in * n_out + n_out]
92 |
93 | # compute vector of class-membership probabilities in symbolic form
94 | self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)
95 |
96 | # compute prediction as class whose probability is maximal in
97 | # symbolic form
98 | self.y_pred = T.argmax(self.p_y_given_x, axis=1)
99 |
100 | # keep track of model input
101 | self.input = input
102 |
103 | def negative_log_likelihood(self, y):
104 | """Return the negative log-likelihood of the prediction of this model
105 | under a given target distribution.
106 |
107 | .. math::
108 |
109 | \frac{1}{|\mathcal{D}|}\mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
110 | \frac{1}{|\mathcal{D}|}\sum_{i=0}^{|\mathcal{D}|}
111 | \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
112 | \ell (\theta=\{W,b\}, \mathcal{D})
113 |
114 | :type y: theano.tensor.TensorType
115 | :param y: corresponds to a vector that gives for each example the
116 | correct label
117 | """
118 | return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
119 |
120 | def errors(self, y):
121 | """Return a float representing the number of errors in the minibatch
122 | over the total number of examples of the minibatch
123 |
124 | :type y: theano.tensor.TensorType
125 | :param y: corresponds to a vector that gives for each example
126 | the correct label
127 | """
128 |
129 | # check if y has same dimension of y_pred
130 | if y.ndim != self.y_pred.ndim:
131 | raise TypeError(
132 | 'y should have the same shape as self.y_pred',
133 | ('y', y.type, 'y_pred', self.y_pred.type)
134 | )
135 | # check if y is of the correct datatype
136 | if y.dtype.startswith('int'):
137 | # the T.neq operator returns a vector of 0s and 1s, where 1
138 | # represents a mistake in prediction
139 | return T.mean(T.neq(self.y_pred, y))
140 | else:
141 | raise NotImplementedError()
142 |
143 |
144 | def cg_optimization_mnist(n_epochs=50, mnist_pkl_gz='mnist.pkl.gz'):
145 | """Demonstrate conjugate gradient optimization of a log-linear model
146 |
147 | This is demonstrated on MNIST.
148 |
149 | :type n_epochs: int
150 | :param n_epochs: number of epochs to run the optimizer
151 |
152 | :type mnist_pkl_gz: string
153 | :param mnist_pkl_gz: the path of the mnist training file from
154 | http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
155 |
156 | """
157 | #############
158 | # LOAD DATA #
159 | #############
160 | datasets = load_data(mnist_pkl_gz)
161 |
162 | train_set_x, train_set_y = datasets[0]
163 | valid_set_x, valid_set_y = datasets[1]
164 | test_set_x, test_set_y = datasets[2]
165 |
166 | batch_size = 600 # size of the minibatch
167 |
168 | n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
169 | n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
170 | n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size
171 |
172 | n_in = 28 * 28 # number of input units
173 | n_out = 10 # number of output units
174 |
175 | ######################
176 | # BUILD ACTUAL MODEL #
177 | ######################
178 | print '... building the model'
179 |
180 | # allocate symbolic variables for the data
181 | minibatch_offset = T.lscalar() # offset to the start of a [mini]batch
182 | x = T.matrix() # the data is presented as rasterized images
183 | y = T.ivector() # the labels are presented as 1D vector of
184 | # [int] labels
185 |
186 | # construct the logistic regression class
187 | classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)
188 |
189 | # the cost we minimize during training is the negative log likelihood of
190 | # the model in symbolic format
191 | cost = classifier.negative_log_likelihood(y).mean()
192 |
193 | # compile a theano function that computes the mistakes that are made by
194 | # the model on a minibatch
195 | test_model = theano.function(
196 | [minibatch_offset],
197 | classifier.errors(y),
198 | givens={
199 | x: test_set_x[minibatch_offset:minibatch_offset + batch_size],
200 | y: test_set_y[minibatch_offset:minibatch_offset + batch_size]
201 | },
202 | name="test"
203 | )
204 |
205 | validate_model = theano.function(
206 | [minibatch_offset],
207 | classifier.errors(y),
208 | givens={
209 | x: valid_set_x[minibatch_offset: minibatch_offset + batch_size],
210 | y: valid_set_y[minibatch_offset: minibatch_offset + batch_size]
211 | },
212 | name="validate"
213 | )
214 |
215 | # compile a theano function that returns the cost of a minibatch
216 | batch_cost = theano.function(
217 | [minibatch_offset],
218 | cost,
219 | givens={
220 | x: train_set_x[minibatch_offset: minibatch_offset + batch_size],
221 | y: train_set_y[minibatch_offset: minibatch_offset + batch_size]
222 | },
223 | name="batch_cost"
224 | )
225 |
226 | # compile a theano function that returns the gradient of the minibatch
227 | # with respect to theta
228 | batch_grad = theano.function(
229 | [minibatch_offset],
230 | T.grad(cost, classifier.theta),
231 | givens={
232 | x: train_set_x[minibatch_offset: minibatch_offset + batch_size],
233 | y: train_set_y[minibatch_offset: minibatch_offset + batch_size]
234 | },
235 | name="batch_grad"
236 | )
237 |
238 | # creates a function that computes the average cost on the training set
239 | def train_fn(theta_value):
240 | classifier.theta.set_value(theta_value, borrow=True)
241 | train_losses = [batch_cost(i * batch_size)
242 | for i in xrange(n_train_batches)]
243 | return numpy.mean(train_losses)
244 |
245 | # creates a function that computes the average gradient of cost with
246 | # respect to theta
247 | def train_fn_grad(theta_value):
248 | classifier.theta.set_value(theta_value, borrow=True)
249 | grad = batch_grad(0)
250 | for i in xrange(1, n_train_batches):
251 | grad += batch_grad(i * batch_size)
252 | return grad / n_train_batches
253 |
254 | validation_scores = [numpy.inf, 0]
255 |
256 | # creates the validation function
257 | def callback(theta_value):
258 | classifier.theta.set_value(theta_value, borrow=True)
259 | #compute the validation loss
260 | validation_losses = [validate_model(i * batch_size)
261 | for i in xrange(n_valid_batches)]
262 | this_validation_loss = numpy.mean(validation_losses)
263 | print('validation error %f %%' % (this_validation_loss * 100.,))
264 |
265 | # check if it is better than the best validation score obtained until now
266 | if this_validation_loss < validation_scores[0]:
267 | # if so, replace the old one, and compute the score on the
268 | # testing dataset
269 | validation_scores[0] = this_validation_loss
270 | test_losses = [test_model(i * batch_size)
271 | for i in xrange(n_test_batches)]
272 | validation_scores[1] = numpy.mean(test_losses)
273 |
274 | ###############
275 | # TRAIN MODEL #
276 | ###############
277 |
278 | # using scipy conjugate gradient optimizer
279 | import scipy.optimize
280 | print ("Optimizing using scipy.optimize.fmin_cg...")
281 | start_time = timeit.default_timer()
282 | best_w_b = scipy.optimize.fmin_cg(
283 | f=train_fn,
284 | x0=numpy.zeros((n_in + 1) * n_out, dtype=x.dtype),
285 | fprime=train_fn_grad,
286 | callback=callback,
287 | disp=0,
288 | maxiter=n_epochs
289 | )
290 | end_time = timeit.default_timer()
291 | print(
292 | (
293 | 'Optimization complete with best validation score of %f %%, with '
294 | 'test performance %f %%'
295 | )
296 | % (validation_scores[0] * 100., validation_scores[1] * 100.)
297 | )
298 |
299 | print >> sys.stderr, ('The code for file ' +
300 | os.path.split(__file__)[1] +
301 | ' ran for %.1fs' % ((end_time - start_time)))
302 |
303 |
304 | if __name__ == '__main__':
305 | cg_optimization_mnist()
306 |
--------------------------------------------------------------------------------
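
LogisticRegression above stores W and b in a single flat theta vector so that scipy.optimize.fmin_cg can optimize one 1-D array: W occupies the first n_in * n_out entries and b the last n_out. A small standalone NumPy sketch of that packing and of the softmax prediction it parameterizes; the variable names follow the class above, but the snippet is illustrative only.

import numpy as np

n_in, n_out = 4, 3
rng = np.random.RandomState(0)

# flat parameter vector: first n_in*n_out entries are W, last n_out are b
theta = 0.01 * rng.randn(n_in * n_out + n_out)
W = theta[:n_in * n_out].reshape((n_in, n_out))
b = theta[n_in * n_out:]

def softmax(a):
    e = np.exp(a - a.max(axis=1, keepdims=True))   # subtract max for stability
    return e / e.sum(axis=1, keepdims=True)

x = rng.rand(5, n_in)                   # a minibatch of 5 examples
p_y_given_x = softmax(x.dot(W) + b)     # P(Y=i|x, W, b)
y_pred = p_y_given_x.argmax(axis=1)     # argmax_i P(Y=i|x)

print(p_y_given_x.sum(axis=1))          # each row sums to 1
print(y_pred)
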
/code/rnnrbm.py:
--------------------------------------------------------------------------------
1 | # Author: Nicolas Boulanger-Lewandowski
2 | # University of Montreal (2012)
3 | # RNN-RBM deep learning tutorial
4 | # More information at http://deeplearning.net/tutorial/rnnrbm.html
5 |
6 | import glob
7 | import os
8 | import sys
9 |
10 | import numpy
11 | try:
12 | import pylab
13 | except ImportError:
14 | print (
15 | "pylab isn't available. If you use its functionality, it will crash."
16 | )
17 | print "It can be installed with 'pip install -q Pillow'"
18 |
19 | from midi.utils import midiread, midiwrite
20 | import theano
21 | import theano.tensor as T
22 | from theano.tensor.shared_randomstreams import RandomStreams
23 |
24 | # Don't use a Python long, as that doesn't work on 32-bit computers.
25 | numpy.random.seed(0xbeef)
26 | rng = RandomStreams(seed=numpy.random.randint(1 << 30))
27 | theano.config.warn.subtensor_merge_bug = False
28 |
29 |
30 | def build_rbm(v, W, bv, bh, k):
31 | '''Construct a k-step Gibbs chain starting at v for an RBM.
32 |
33 | v : Theano vector or matrix
34 | If a matrix, multiple chains will be run in parallel (batch).
35 | W : Theano matrix
36 | Weight matrix of the RBM.
37 | bv : Theano vector
38 | Visible bias vector of the RBM.
39 | bh : Theano vector
40 | Hidden bias vector of the RBM.
41 | k : scalar or Theano scalar
42 | Length of the Gibbs chain.
43 |
44 | Return a (v_sample, cost, monitor, updates) tuple:
45 |
46 | v_sample : Theano vector or matrix with the same shape as `v`
47 | Corresponds to the generated sample(s).
48 | cost : Theano scalar
49 | Expression whose gradient with respect to W, bv, bh is the CD-k
50 | approximation to the log-likelihood of `v` (training example) under the
51 | RBM. The cost is averaged in the batch case.
52 | monitor: Theano scalar
53 | Pseudo log-likelihood (also averaged in the batch case).
54 | updates: dictionary of Theano variable -> Theano variable
55 | The `updates` object returned by scan.'''
56 |
57 | def gibbs_step(v):
58 | mean_h = T.nnet.sigmoid(T.dot(v, W) + bh)
59 | h = rng.binomial(size=mean_h.shape, n=1, p=mean_h,
60 | dtype=theano.config.floatX)
61 | mean_v = T.nnet.sigmoid(T.dot(h, W.T) + bv)
62 | v = rng.binomial(size=mean_v.shape, n=1, p=mean_v,
63 | dtype=theano.config.floatX)
64 | return mean_v, v
65 |
66 | chain, updates = theano.scan(lambda v: gibbs_step(v)[1], outputs_info=[v],
67 | n_steps=k)
68 | v_sample = chain[-1]
69 |
70 | mean_v = gibbs_step(v_sample)[0]
71 | monitor = T.xlogx.xlogy0(v, mean_v) + T.xlogx.xlogy0(1 - v, 1 - mean_v)
72 | monitor = monitor.sum() / v.shape[0]
73 |
74 | def free_energy(v):
75 | return -(v * bv).sum() - T.log(1 + T.exp(T.dot(v, W) + bh)).sum()
76 | cost = (free_energy(v) - free_energy(v_sample)) / v.shape[0]
77 |
78 | return v_sample, cost, monitor, updates
79 |
80 |
81 | def shared_normal(num_rows, num_cols, scale=1):
82 | '''Initialize a matrix shared variable with normally distributed
83 | elements.'''
84 | return theano.shared(numpy.random.normal(
85 | scale=scale, size=(num_rows, num_cols)).astype(theano.config.floatX))
86 |
87 |
88 | def shared_zeros(*shape):
89 | '''Initialize a vector shared variable with zero elements.'''
90 | return theano.shared(numpy.zeros(shape, dtype=theano.config.floatX))
91 |
92 |
93 | def build_rnnrbm(n_visible, n_hidden, n_hidden_recurrent):
94 | '''Construct a symbolic RNN-RBM and initialize parameters.
95 |
96 | n_visible : integer
97 | Number of visible units.
98 | n_hidden : integer
99 | Number of hidden units of the conditional RBMs.
100 | n_hidden_recurrent : integer
101 | Number of hidden units of the RNN.
102 |
103 | Return a (v, v_sample, cost, monitor, params, updates_train, v_t,
104 | updates_generate) tuple:
105 |
106 | v : Theano matrix
107 | Symbolic variable holding an input sequence (used during training)
108 | v_sample : Theano matrix
109 | Symbolic variable holding the negative particles for CD log-likelihood
110 | gradient estimation (used during training)
111 | cost : Theano scalar
112 | Expression whose gradient (considering v_sample constant) corresponds
113 | to the LL gradient of the RNN-RBM (used during training)
114 | monitor : Theano scalar
115 | Frame-level pseudo-likelihood (useful for monitoring during training)
116 | params : tuple of Theano shared variables
117 | The parameters of the model to be optimized during training.
118 | updates_train : dictionary of Theano variable -> Theano variable
119 | Update object that should be passed to theano.function when compiling
120 | the training function.
121 | v_t : Theano matrix
122 | Symbolic variable holding a generated sequence (used during sampling)
123 | updates_generate : dictionary of Theano variable -> Theano variable
124 | Update object that should be passed to theano.function when compiling
125 | the generation function.'''
126 |
127 | W = shared_normal(n_visible, n_hidden, 0.01)
128 | bv = shared_zeros(n_visible)
129 | bh = shared_zeros(n_hidden)
130 | Wuh = shared_normal(n_hidden_recurrent, n_hidden, 0.0001)
131 | Wuv = shared_normal(n_hidden_recurrent, n_visible, 0.0001)
132 | Wvu = shared_normal(n_visible, n_hidden_recurrent, 0.0001)
133 | Wuu = shared_normal(n_hidden_recurrent, n_hidden_recurrent, 0.0001)
134 | bu = shared_zeros(n_hidden_recurrent)
135 |
136 | params = W, bv, bh, Wuh, Wuv, Wvu, Wuu, bu # learned parameters as shared
137 | # variables
138 |
139 | v = T.matrix() # a training sequence
140 | u0 = T.zeros((n_hidden_recurrent,)) # initial value for the RNN hidden
141 | # units
142 |
143 | # If `v_t` is given, deterministic recurrence to compute the variable
144 | # biases bv_t, bh_t at each time step. If `v_t` is None, same recurrence
145 | # but with a separate Gibbs chain at each time step to sample (generate)
146 | # from the RNN-RBM. The resulting sample v_t is returned in order to be
147 | # passed down to the sequence history.
148 | def recurrence(v_t, u_tm1):
149 | bv_t = bv + T.dot(u_tm1, Wuv)
150 | bh_t = bh + T.dot(u_tm1, Wuh)
151 | generate = v_t is None
152 | if generate:
153 | v_t, _, _, updates = build_rbm(T.zeros((n_visible,)), W, bv_t,
154 | bh_t, k=25)
155 | u_t = T.tanh(bu + T.dot(v_t, Wvu) + T.dot(u_tm1, Wuu))
156 | return ([v_t, u_t], updates) if generate else [u_t, bv_t, bh_t]
157 |
158 | # For training, the deterministic recurrence is used to compute all the
159 | # {bv_t, bh_t, 1 <= t <= T} given v. Conditional RBMs can then be trained
160 | # in batches using those parameters.
161 | (u_t, bv_t, bh_t), updates_train = theano.scan(
162 | lambda v_t, u_tm1, *_: recurrence(v_t, u_tm1),
163 | sequences=v, outputs_info=[u0, None, None], non_sequences=params)
164 | v_sample, cost, monitor, updates_rbm = build_rbm(v, W, bv_t[:], bh_t[:],
165 | k=15)
166 | updates_train.update(updates_rbm)
167 |
168 | # symbolic loop for sequence generation
169 | (v_t, u_t), updates_generate = theano.scan(
170 | lambda u_tm1, *_: recurrence(None, u_tm1),
171 | outputs_info=[None, u0], non_sequences=params, n_steps=200)
172 |
173 | return (v, v_sample, cost, monitor, params, updates_train, v_t,
174 | updates_generate)
175 |
176 |
177 | class RnnRbm:
178 | '''Simple class to train an RNN-RBM from MIDI files and to generate sample
179 | sequences.'''
180 |
181 | def __init__(
182 | self,
183 | n_hidden=150,
184 | n_hidden_recurrent=100,
185 | lr=0.001,
186 | r=(21, 109),
187 | dt=0.3
188 | ):
189 | '''Constructs and compiles Theano functions for training and sequence
190 | generation.
191 |
192 | n_hidden : integer
193 | Number of hidden units of the conditional RBMs.
194 | n_hidden_recurrent : integer
195 | Number of hidden units of the RNN.
196 | lr : float
197 | Learning rate
198 | r : (integer, integer) tuple
199 | Specifies the pitch range of the piano-roll in MIDI note numbers,
200 | including r[0] but not r[1], such that r[1]-r[0] is the number of
201 | visible units of the RBM at a given time step. The default (21,
202 | 109) corresponds to the full range of piano (88 notes).
203 | dt : float
204 | Sampling period when converting the MIDI files into piano-rolls, or
205 | equivalently the time difference between consecutive time steps.'''
206 |
207 | self.r = r
208 | self.dt = dt
209 | (v, v_sample, cost, monitor, params, updates_train, v_t,
210 | updates_generate) = build_rnnrbm(
211 | r[1] - r[0],
212 | n_hidden,
213 | n_hidden_recurrent
214 | )
215 |
216 | gradient = T.grad(cost, params, consider_constant=[v_sample])
217 | updates_train.update(
218 | ((p, p - lr * g) for p, g in zip(params, gradient))
219 | )
220 | self.train_function = theano.function(
221 | [v],
222 | monitor,
223 | updates=updates_train
224 | )
225 | self.generate_function = theano.function(
226 | [],
227 | v_t,
228 | updates=updates_generate
229 | )
230 |
231 | def train(self, files, batch_size=100, num_epochs=200):
232 | '''Train the RNN-RBM via stochastic gradient descent (SGD) using MIDI
233 | files converted to piano-rolls.
234 |
235 | files : list of strings
236 | List of MIDI files that will be loaded as piano-rolls for training.
237 | batch_size : integer
238 | Training sequences will be split into subsequences of at most this
239 | size before applying the SGD updates.
240 | num_epochs : integer
241 |             Number of epochs (passes over the training set) performed. The user
242 | can safely interrupt training with Ctrl+C at any time.'''
243 |
244 | assert len(files) > 0, 'Training set is empty!' \
245 | ' (did you download the data files?)'
246 | dataset = [midiread(f, self.r,
247 | self.dt).piano_roll.astype(theano.config.floatX)
248 | for f in files]
249 |
250 | try:
251 | for epoch in xrange(num_epochs):
252 | numpy.random.shuffle(dataset)
253 | costs = []
254 |
255 | for s, sequence in enumerate(dataset):
256 | for i in xrange(0, len(sequence), batch_size):
257 | cost = self.train_function(sequence[i:i + batch_size])
258 | costs.append(cost)
259 |
260 | print 'Epoch %i/%i' % (epoch + 1, num_epochs),
261 | print numpy.mean(costs)
262 | sys.stdout.flush()
263 |
264 | except KeyboardInterrupt:
265 | print 'Interrupted by user.'
266 |
267 | def generate(self, filename, show=True):
268 | '''Generate a sample sequence, plot the resulting piano-roll and save
269 | it as a MIDI file.
270 |
271 | filename : string
272 | A MIDI file will be created at this location.
273 | show : boolean
274 | If True, a piano-roll of the generated sequence will be shown.'''
275 |
276 | piano_roll = self.generate_function()
277 | midiwrite(filename, piano_roll, self.r, self.dt)
278 | if show:
279 | extent = (0, self.dt * len(piano_roll)) + self.r
280 | pylab.figure()
281 | pylab.imshow(piano_roll.T, origin='lower', aspect='auto',
282 | interpolation='nearest', cmap=pylab.cm.gray_r,
283 | extent=extent)
284 | pylab.xlabel('time (s)')
285 | pylab.ylabel('MIDI note number')
286 | pylab.title('generated piano-roll')
287 |
288 |
289 | def test_rnnrbm(batch_size=100, num_epochs=200):
290 | model = RnnRbm()
291 | re = os.path.join(os.path.split(os.path.dirname(__file__))[0],
292 | 'data', 'Nottingham', 'train', '*.mid')
293 | model.train(glob.glob(re),
294 | batch_size=batch_size, num_epochs=num_epochs)
295 | return model
296 |
297 | if __name__ == '__main__':
298 | model = test_rnnrbm()
299 | model.generate('sample1.mid')
300 | model.generate('sample2.mid')
301 | pylab.show()
302 |
--------------------------------------------------------------------------------
/code/rnnslu.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | import copy
3 | import cPickle
4 | import gzip
5 | import os
6 | import urllib
7 | import random
8 | import stat
9 | import subprocess
10 | import sys
11 | import timeit
12 |
13 | import numpy
14 |
15 | import theano
16 | from theano import tensor as T
17 |
18 | # Otherwise the deepcopy fails
19 | import sys
20 | sys.setrecursionlimit(1500)
21 |
22 | PREFIX = os.getenv(
23 | 'ATISDATA',
24 | os.path.join(os.path.split(os.path.abspath(os.path.dirname(__file__)))[0],
25 | 'data'))
26 |
27 |
28 | # utility functions
29 | def shuffle(lol, seed):
30 | '''
31 | lol :: list of list as input
32 | seed :: seed the shuffling
33 |
34 |     shuffle each list in place, using the same order
35 | '''
36 | for l in lol:
37 | random.seed(seed)
38 | random.shuffle(l)
39 |
40 |
41 | # start-snippet-1
42 | def contextwin(l, win):
43 | '''
44 |     win :: int corresponding to the size of the window
45 |     l :: array containing the word indexes composing a sentence
46 | 
47 |     given a list of indexes composing a sentence, it will return
48 |     a list of lists of indexes, each corresponding to the context
49 |     window surrounding a word in the sentence
50 | 
51 | '''
52 | assert (win % 2) == 1
53 | assert win >= 1
54 | l = list(l)
55 |
56 | lpadded = win // 2 * [-1] + l + win // 2 * [-1]
57 | out = [lpadded[i:(i + win)] for i in range(len(l))]
58 |
59 | assert len(out) == len(l)
60 | return out
61 | # end-snippet-1
62 |
63 |
64 | # data loading functions
65 | def atisfold(fold):
66 | assert fold in range(5)
67 | filename = os.path.join(PREFIX, 'atis.fold'+str(fold)+'.pkl.gz')
68 | f = gzip.open(filename, 'rb')
69 | train_set, valid_set, test_set, dicts = cPickle.load(f)
70 | return train_set, valid_set, test_set, dicts
71 |
72 |
73 | # metrics function using conlleval.pl
74 | def conlleval(p, g, w, filename, script_path):
75 | '''
76 | INPUT:
77 | p :: predictions
78 | g :: groundtruth
79 | w :: corresponding words
80 |
81 | OUTPUT:
82 | filename :: name of the file where the predictions
83 |     are written. it will be the input of the conlleval.pl script
84 |     for computing the performance in terms of precision,
85 |     recall and F1 score
86 |
87 | OTHER:
88 | script_path :: path to the directory containing the
89 | conlleval.pl script
90 | '''
91 | out = ''
92 | for sl, sp, sw in zip(g, p, w):
93 | out += 'BOS O O\n'
94 | for wl, wp, w in zip(sl, sp, sw):
95 | out += w + ' ' + wl + ' ' + wp + '\n'
96 | out += 'EOS O O\n\n'
97 |
98 | f = open(filename, 'w')
99 | f.writelines(out)
100 | f.close()
101 |
102 | return get_perf(filename, script_path)
103 |
104 |
105 | def download(origin, destination):
106 | '''
107 | download the corresponding atis file
108 | from http://www-etud.iro.umontreal.ca/~mesnilgr/atis/
109 | '''
110 | print 'Downloading data from %s' % origin
111 | urllib.urlretrieve(origin, destination)
112 |
113 |
114 | def get_perf(filename, folder):
115 | ''' run conlleval.pl perl script to obtain
116 | precision/recall and F1 score '''
117 | _conlleval = os.path.join(folder, 'conlleval.pl')
118 | if not os.path.isfile(_conlleval):
119 | url = 'http://www-etud.iro.umontreal.ca/~mesnilgr/atis/conlleval.pl'
120 | download(url, _conlleval)
121 | os.chmod(_conlleval, stat.S_IRWXU) # give the execute permissions
122 |
123 | proc = subprocess.Popen(["perl",
124 | _conlleval],
125 | stdin=subprocess.PIPE,
126 | stdout=subprocess.PIPE)
127 |
128 | stdout, _ = proc.communicate(''.join(open(filename).readlines()))
129 | for line in stdout.split('\n'):
130 | if 'accuracy' in line:
131 | out = line.split()
132 | break
133 |
134 | precision = float(out[6][:-2])
135 | recall = float(out[8][:-2])
136 | f1score = float(out[10])
137 |
138 | return {'p': precision, 'r': recall, 'f1': f1score}
139 |
140 |
141 | # start-snippet-2
142 | class RNNSLU(object):
143 |     ''' Elman recurrent neural network model '''
144 | def __init__(self, nh, nc, ne, de, cs):
145 | '''
146 | nh :: dimension of the hidden layer
147 | nc :: number of classes
148 | ne :: number of word embeddings in the vocabulary
149 | de :: dimension of the word embeddings
150 | cs :: word window context size
151 | '''
152 | # parameters of the model
153 | self.emb = theano.shared(name='embeddings',
154 | value=0.2 * numpy.random.uniform(-1.0, 1.0,
155 | (ne+1, de))
156 | # add one for padding at the end
157 | .astype(theano.config.floatX))
158 | self.wx = theano.shared(name='wx',
159 | value=0.2 * numpy.random.uniform(-1.0, 1.0,
160 | (de * cs, nh))
161 | .astype(theano.config.floatX))
162 | self.wh = theano.shared(name='wh',
163 | value=0.2 * numpy.random.uniform(-1.0, 1.0,
164 | (nh, nh))
165 | .astype(theano.config.floatX))
166 | self.w = theano.shared(name='w',
167 | value=0.2 * numpy.random.uniform(-1.0, 1.0,
168 | (nh, nc))
169 | .astype(theano.config.floatX))
170 | self.bh = theano.shared(name='bh',
171 | value=numpy.zeros(nh,
172 | dtype=theano.config.floatX))
173 | self.b = theano.shared(name='b',
174 | value=numpy.zeros(nc,
175 | dtype=theano.config.floatX))
176 | self.h0 = theano.shared(name='h0',
177 | value=numpy.zeros(nh,
178 | dtype=theano.config.floatX))
179 |
180 | # bundle
181 | self.params = [self.emb, self.wx, self.wh, self.w,
182 | self.bh, self.b, self.h0]
183 | # end-snippet-2
184 | # as many columns as context window size
185 | # as many lines as words in the sentence
186 | # start-snippet-3
187 | idxs = T.imatrix()
188 | x = self.emb[idxs].reshape((idxs.shape[0], de*cs))
189 | y_sentence = T.ivector('y_sentence') # labels
190 | # end-snippet-3 start-snippet-4
191 |
192 | def recurrence(x_t, h_tm1):
193 | h_t = T.nnet.sigmoid(T.dot(x_t, self.wx)
194 | + T.dot(h_tm1, self.wh) + self.bh)
195 | s_t = T.nnet.softmax(T.dot(h_t, self.w) + self.b)
196 | return [h_t, s_t]
197 |
198 | [h, s], _ = theano.scan(fn=recurrence,
199 | sequences=x,
200 | outputs_info=[self.h0, None],
201 | n_steps=x.shape[0])
202 |
203 | p_y_given_x_sentence = s[:, 0, :]
204 | y_pred = T.argmax(p_y_given_x_sentence, axis=1)
205 | # end-snippet-4
206 |
207 | # cost and gradients and learning rate
208 | # start-snippet-5
209 | lr = T.scalar('lr')
210 |
211 | sentence_nll = -T.mean(T.log(p_y_given_x_sentence)
212 | [T.arange(x.shape[0]), y_sentence])
213 | sentence_gradients = T.grad(sentence_nll, self.params)
214 | sentence_updates = OrderedDict((p, p - lr*g)
215 | for p, g in
216 | zip(self.params, sentence_gradients))
217 | # end-snippet-5
218 |
219 | # theano functions to compile
220 | # start-snippet-6
221 | self.classify = theano.function(inputs=[idxs], outputs=y_pred)
222 | self.sentence_train = theano.function(inputs=[idxs, y_sentence, lr],
223 | outputs=sentence_nll,
224 | updates=sentence_updates)
225 | # end-snippet-6 start-snippet-7
226 | self.normalize = theano.function(inputs=[],
227 | updates={self.emb:
228 | self.emb /
229 | T.sqrt((self.emb**2)
230 | .sum(axis=1))
231 | .dimshuffle(0, 'x')})
232 | # end-snippet-7
233 |
234 | def train(self, x, y, window_size, learning_rate):
235 |
236 | cwords = contextwin(x, window_size)
237 | words = map(lambda x: numpy.asarray(x).astype('int32'), cwords)
238 | labels = y
239 |
240 | self.sentence_train(words, labels, learning_rate)
241 | self.normalize()
242 |
243 | def save(self, folder):
244 | for param in self.params:
245 | numpy.save(os.path.join(folder,
246 | param.name + '.npy'), param.get_value())
247 |
248 | def load(self, folder):
249 | for param in self.params:
250 | param.set_value(numpy.load(os.path.join(folder,
251 | param.name + '.npy')))
252 |
253 |
254 | def main(param=None):
255 | if not param:
256 | param = {
257 | 'fold': 3,
258 | # 5 folds 0,1,2,3,4
259 | 'data': 'atis',
260 | 'lr': 0.0970806646812754,
261 | 'verbose': 1,
262 | 'decay': True,
263 | # decay on the learning rate if improvement stops
264 | 'win': 7,
265 | # number of words in the context window
266 | 'nhidden': 200,
267 | # number of hidden units
268 | 'seed': 345,
269 | 'emb_dimension': 50,
270 | # dimension of word embedding
271 | 'nepochs': 60,
272 | # 60 is recommended
273 | 'savemodel': False}
274 | print param
275 |
276 | folder_name = os.path.basename(__file__).split('.')[0]
277 | folder = os.path.join(os.path.dirname(__file__), folder_name)
278 | if not os.path.exists(folder):
279 | os.mkdir(folder)
280 |
281 | # load the dataset
282 | train_set, valid_set, test_set, dic = atisfold(param['fold'])
283 |
284 | idx2label = dict((k, v) for v, k in dic['labels2idx'].iteritems())
285 | idx2word = dict((k, v) for v, k in dic['words2idx'].iteritems())
286 |
287 | train_lex, train_ne, train_y = train_set
288 | valid_lex, valid_ne, valid_y = valid_set
289 | test_lex, test_ne, test_y = test_set
290 |
291 | vocsize = len(set(reduce(lambda x, y: list(x) + list(y),
292 | train_lex + valid_lex + test_lex)))
293 | nclasses = len(set(reduce(lambda x, y: list(x)+list(y),
294 | train_y + test_y + valid_y)))
295 | nsentences = len(train_lex)
296 |
297 | groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
298 | words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]
299 | groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
300 | words_test = [map(lambda x: idx2word[x], w) for w in test_lex]
301 |
302 |     # instantiate the model
303 | numpy.random.seed(param['seed'])
304 | random.seed(param['seed'])
305 |
306 | rnn = RNNSLU(nh=param['nhidden'],
307 | nc=nclasses,
308 | ne=vocsize,
309 | de=param['emb_dimension'],
310 | cs=param['win'])
311 |
312 | # train with early stopping on validation set
313 | best_f1 = -numpy.inf
314 | param['clr'] = param['lr']
315 | for e in xrange(param['nepochs']):
316 |
317 | # shuffle
318 | shuffle([train_lex, train_ne, train_y], param['seed'])
319 |
320 | param['ce'] = e
321 | tic = timeit.default_timer()
322 |
323 | for i, (x, y) in enumerate(zip(train_lex, train_y)):
324 | rnn.train(x, y, param['win'], param['clr'])
325 | print '[learning] epoch %i >> %2.2f%%' % (
326 | e, (i + 1) * 100. / nsentences),
327 | print 'completed in %.2f (sec) <<\r' % (timeit.default_timer() - tic),
328 | sys.stdout.flush()
329 |
330 | # evaluation // back into the real world : idx -> words
331 | predictions_test = [map(lambda x: idx2label[x],
332 | rnn.classify(numpy.asarray(
333 | contextwin(x, param['win'])).astype('int32')))
334 | for x in test_lex]
335 | predictions_valid = [map(lambda x: idx2label[x],
336 | rnn.classify(numpy.asarray(
337 | contextwin(x, param['win'])).astype('int32')))
338 | for x in valid_lex]
339 |
340 | # evaluation // compute the accuracy using conlleval.pl
341 | res_test = conlleval(predictions_test,
342 | groundtruth_test,
343 | words_test,
344 | folder + '/current.test.txt',
345 | folder)
346 | res_valid = conlleval(predictions_valid,
347 | groundtruth_valid,
348 | words_valid,
349 | folder + '/current.valid.txt',
350 | folder)
351 |
352 | if res_valid['f1'] > best_f1:
353 |
354 | if param['savemodel']:
355 | rnn.save(folder)
356 |
357 | best_rnn = copy.deepcopy(rnn)
358 | best_f1 = res_valid['f1']
359 |
360 | if param['verbose']:
361 | print('NEW BEST: epoch', e,
362 | 'valid F1', res_valid['f1'],
363 | 'best test F1', res_test['f1'])
364 |
365 | param['vf1'], param['tf1'] = res_valid['f1'], res_test['f1']
366 | param['vp'], param['tp'] = res_valid['p'], res_test['p']
367 | param['vr'], param['tr'] = res_valid['r'], res_test['r']
368 | param['be'] = e
369 |
370 | subprocess.call(['mv', folder + '/current.test.txt',
371 | folder + '/best.test.txt'])
372 | subprocess.call(['mv', folder + '/current.valid.txt',
373 | folder + '/best.valid.txt'])
374 | else:
375 | if param['verbose']:
376 | print ''
377 |
378 | # learning rate decay if no improvement in 10 epochs
379 | if param['decay'] and abs(param['be']-param['ce']) >= 10:
380 | param['clr'] *= 0.5
381 | rnn = best_rnn
382 |
383 | if param['clr'] < 1e-5:
384 | break
385 |
386 | print('BEST RESULT: epoch', param['be'],
387 | 'valid F1', param['vf1'],
388 | 'best test F1', param['tf1'],
389 | 'with the model', folder)
390 |
391 |
392 | if __name__ == '__main__':
393 | main()
394 |
--------------------------------------------------------------------------------
/code/test.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import numpy
4 |
5 | import convolutional_mlp
6 | import dA
7 | import DBN
8 | import logistic_cg
9 | import logistic_sgd
10 | import mlp
11 | import rbm
12 | import rnnrbm
13 | import SdA
14 | import rnnslu
15 | import lstm
16 |
17 |
18 | def test_rnnslu():
19 | rnnslu.main()
20 |
21 |
22 | def test_logistic_sgd():
23 | logistic_sgd.sgd_optimization_mnist(n_epochs=10)
24 |
25 |
26 | def test_logistic_cg():
27 | try:
28 | import scipy
29 | logistic_cg.cg_optimization_mnist(n_epochs=10)
30 | except ImportError:
31 | from nose.plugins.skip import SkipTest
32 | raise SkipTest(
33 | 'SciPy not available. Needed for the logistic_cg example.')
34 |
35 |
36 | def test_mlp():
37 | mlp.test_mlp(n_epochs=1)
38 |
39 |
40 | def test_convolutional_mlp():
41 | convolutional_mlp.evaluate_lenet5(n_epochs=1, nkerns=[5, 5])
42 |
43 |
44 | def test_dA():
45 | dA.test_dA(training_epochs=1, output_folder='tmp_dA_plots')
46 |
47 |
48 | def test_SdA():
49 | SdA.test_SdA(pretraining_epochs=1, training_epochs=1, batch_size=300)
50 |
51 |
52 | def test_dbn():
53 | DBN.test_DBN(pretraining_epochs=1, training_epochs=1, batch_size=300)
54 |
55 |
56 | def test_rbm():
57 | rbm.test_rbm(training_epochs=1, batch_size=300, n_chains=1, n_samples=1,
58 | n_hidden=20, output_folder='tmp_rbm_plots')
59 |
60 |
61 | def test_rnnrbm():
62 | rnnrbm.test_rnnrbm(num_epochs=1)
63 |
64 |
65 | def test_lstm():
66 | lstm.train_lstm(max_epochs=1, test_size=1000, saveto='')
67 |
68 |
69 | def speed():
70 | """
71 |     This function modifies the Theano configuration and does not restore it!
72 | """
73 |
74 | algo = ['logistic_sgd', 'logistic_cg', 'mlp', 'convolutional_mlp',
75 | 'dA', 'SdA', 'DBN', 'rbm', 'rnnrbm', 'rnnslu', 'lstm']
76 | to_exec = [True] * len(algo)
77 | # to_exec = [False] * len(algo)
78 | # to_exec[-1] = True
79 | do_float64 = True
80 | do_float32 = True
81 | do_gpu = True
82 |
83 | algo_executed = [s for idx, s in enumerate(algo) if to_exec[idx]]
84 |     # Expected timings are from the buildbot, which has an i7-920 @
85 |     # 2.67GHz with hyperthreading enabled for the CPU, 12G of RAM, and a
86 |     # GeForce GTX 580 for the GPU. OS=Fedora 14, gcc=4.5.1, python/BLAS from
87 |     # EPD 7.1-2 (python 2.7.2, MKL version unknown). BLAS with only 1 thread.
88 |
89 | expected_times_64 = numpy.asarray([9.8, 22.0, 76.1, 73.7, 116.4,
90 | 346.9, 355.0, 558.1, 130.4, 50.8, 113.6])
91 | expected_times_32 = numpy.asarray([8.1, 17.9, 42.5, 66.5, 71,
92 | 191.2, 199.0, 432.8, 119.5, 36.9, 78.0])
93 |
94 |     # Numbers with just 1 decimal are new values that are faster with
95 |     # Theano version 0.5rc2. Other numbers are older. They are not
96 |     # updated, as we were faster in the past!
97 |     # TODO: find out why and fix this!
98 |
99 |     # Here are the values for the buildbot on February 3rd, 2012 with a GTX 285
100 | # sgd, cg mlp conv da
101 | # sda dbn rbm
102 | # gpu times[3.72957802, 9.94316864, 29.1772666, 9.13857198, 25.91144657,
103 | # 18.30802011, 53.38651466, 285.41386175]
104 | # expected [3.076634879, 7.555234910, 18.99226785, 9.58915591, 24.130070450,
105 | # 24.77524018, 92.66246653, 322.340329170]
106 | # sgd, cg mlp conv da
107 | # sda dbn rbm
108 | #expected/get [0.82492841, 0.75984178, 0.65092691, 1.04930573, 0.93125138
109 | # 1.35324519 1.7356905 1.12937868]
110 |
111 | expected_times_gpu = numpy.asarray([3.0, 7.55523491, 18.99226785,
112 | 5.8, 20.0,
113 | 11.8, 18.2, 280.1, 132.8, 38.8, 10.5])
114 | expected_times_64 = [s for idx, s in enumerate(expected_times_64)
115 | if to_exec[idx]]
116 | expected_times_32 = [s for idx, s in enumerate(expected_times_32)
117 | if to_exec[idx]]
118 | expected_times_gpu = [s for idx, s in enumerate(expected_times_gpu)
119 | if to_exec[idx]]
120 |
121 | def time_test(m, l, idx, f, **kwargs):
122 | if not to_exec[idx]:
123 | return
124 | print algo[idx]
125 | ts = m.call_time
126 | try:
127 | f(**kwargs)
128 | except Exception, e:
129 | print >> sys.stderr, 'test', algo[idx], 'FAILED', e
130 | l.append(numpy.nan)
131 | return
132 | te = m.call_time
133 | l.append(te - ts)
134 |
135 | def do_tests():
136 | m = theano.compile.mode.get_default_mode()
137 | l = []
138 | time_test(m, l, 0, logistic_sgd.sgd_optimization_mnist, n_epochs=30)
139 | time_test(m, l, 1, logistic_cg.cg_optimization_mnist, n_epochs=30)
140 | time_test(m, l, 2, mlp.test_mlp, n_epochs=5)
141 | time_test(m, l, 3, convolutional_mlp.evaluate_lenet5, n_epochs=5,
142 | nkerns=[5, 5])
143 | time_test(m, l, 4, dA.test_dA, training_epochs=2,
144 | output_folder='tmp_dA_plots')
145 | time_test(m, l, 5, SdA.test_SdA, pretraining_epochs=1,
146 | training_epochs=2, batch_size=300)
147 | time_test(m, l, 6, DBN.test_DBN, pretraining_epochs=1,
148 | training_epochs=2, batch_size=300)
149 | time_test(m, l, 7, rbm.test_rbm, training_epochs=1, batch_size=300,
150 | n_chains=1, n_samples=1, output_folder='tmp_rbm_plots')
151 | time_test(m, l, 8, rnnrbm.test_rnnrbm, num_epochs=1)
152 | s = {'fold': 3,
153 | # 5 folds 0,1,2,3,4
154 | 'data': 'atis',
155 | 'lr': 0.0970806646812754,
156 | 'verbose': 1,
157 | 'decay': True,
158 | # decay on the learning rate if improvement stops
159 | 'win': 7,
160 | # number of words in the context window
161 | 'nhidden': 200,
162 | # number of hidden units
163 | 'seed': 345,
164 | 'emb_dimension': 50,
165 | # dimension of word embedding
166 | 'nepochs': 1,
167 | # 60 is recommended
168 | 'savemodel': False}
169 | time_test(m, l, 9, rnnslu.main, param=s)
170 | time_test(m, l, 10, lstm.train_lstm, max_epochs=1, test_size=1000,
171 | saveto='')
172 | return numpy.asarray(l)
173 |
174 | #test in float64 in FAST_RUN mode on the cpu
175 | import theano
176 | if do_float64:
177 | theano.config.floatX = 'float64'
178 | theano.config.mode = 'FAST_RUN'
179 | float64_times = do_tests()
180 | print >> sys.stderr, algo_executed
181 | print >> sys.stderr, 'float64 times', float64_times
182 | print >> sys.stderr, 'float64 expected', expected_times_64
183 | print >> sys.stderr, 'float64 % expected/get', (
184 | expected_times_64 / float64_times)
185 |
186 | #test in float32 in FAST_RUN mode on the cpu
187 | theano.config.floatX = 'float32'
188 | if do_float32:
189 | float32_times = do_tests()
190 | print >> sys.stderr, algo_executed
191 | print >> sys.stderr, 'float32 times', float32_times
192 | print >> sys.stderr, 'float32 expected', expected_times_32
193 | print >> sys.stderr, 'float32 % expected/get', (
194 | expected_times_32 / float32_times)
195 |
196 | if do_float64:
197 | print >> sys.stderr, 'float64/float32', (
198 | float64_times / float32_times)
199 | print >> sys.stderr
200 | print >> sys.stderr, ('Duplicate the timing to have everything '
201 | 'in one place')
202 | print >> sys.stderr, algo_executed
203 | print >> sys.stderr, 'float64 times', float64_times
204 | print >> sys.stderr, 'float64 expected', expected_times_64
205 | print >> sys.stderr, 'float64 % expected/get', (
206 | expected_times_64 / float64_times)
207 | print >> sys.stderr, 'float32 times', float32_times
208 | print >> sys.stderr, 'float32 expected', expected_times_32
209 | print >> sys.stderr, 'float32 % expected/get', (
210 | expected_times_32 / float32_times)
211 |
212 | print >> sys.stderr, 'float64/float32', (
213 | float64_times / float32_times)
214 | print >> sys.stderr, 'expected float64/float32', (
215 | expected_times_64 / float32_times)
216 |
217 | #test in float32 in FAST_RUN mode on the gpu
218 | import theano.sandbox.cuda
219 | if do_gpu:
220 | theano.sandbox.cuda.use('gpu')
221 | gpu_times = do_tests()
222 | print >> sys.stderr, algo_executed
223 | print >> sys.stderr, 'gpu times', gpu_times
224 | print >> sys.stderr, 'gpu expected', expected_times_gpu
225 | print >> sys.stderr, 'gpu % expected/get', (
226 | expected_times_gpu / gpu_times)
227 |
228 | if do_float64:
229 | print >> sys.stderr, 'float64/gpu', float64_times / gpu_times
230 |
231 | if (do_float64 + do_float32 + do_gpu) > 1:
232 | print >> sys.stderr
233 | print >> sys.stderr, ('Duplicate the timing to have everything '
234 | 'in one place')
235 | print >> sys.stderr, algo_executed
236 | if do_float64:
237 | print >> sys.stderr, 'float64 times', float64_times
238 | print >> sys.stderr, 'float64 expected', expected_times_64
239 | print >> sys.stderr, 'float64 % expected/get', (
240 | expected_times_64 / float64_times)
241 | if do_float32:
242 | print >> sys.stderr, 'float32 times', float32_times
243 | print >> sys.stderr, 'float32 expected', expected_times_32
244 | print >> sys.stderr, 'float32 % expected/get', (
245 | expected_times_32 / float32_times)
246 | if do_gpu:
247 | print >> sys.stderr, 'gpu times', gpu_times
248 | print >> sys.stderr, 'gpu expected', expected_times_gpu
249 | print >> sys.stderr, 'gpu % expected/get', (
250 | expected_times_gpu / gpu_times)
251 |
252 | print
253 | if do_float64 and do_float32:
254 | print >> sys.stderr, 'float64/float32', (
255 | float64_times / float32_times)
256 | print >> sys.stderr, 'expected float64/float32', (
257 | expected_times_64 / float32_times)
258 | if do_float64 and do_gpu:
259 | print >> sys.stderr, 'float64/gpu', float64_times / gpu_times
260 | print >> sys.stderr, 'expected float64/gpu', (
261 | expected_times_64 / gpu_times)
262 | if do_float32 and do_gpu:
263 | print >> sys.stderr, 'float32/gpu', float32_times / gpu_times
264 | print >> sys.stderr, 'expected float32/gpu', (
265 | expected_times_32 / gpu_times)
266 |
267 | def compare(x, y):
268 | ratio = x / y
269 |         # If there is more than 5% difference between the expected
270 | # time and the real time, we consider this an error.
271 | return sum((ratio < 0.95) + (ratio > 1.05))
272 |
273 | print
274 | if do_float64:
275 | err = compare(expected_times_64, float64_times)
276 | print >> sys.stderr, 'speed_failure_float64=' + str(err)
277 | if do_float32:
278 | err = compare(expected_times_32, float32_times)
279 | print >> sys.stderr, 'speed_failure_float32=' + str(err)
280 | if do_gpu:
281 | err = compare(expected_times_gpu, gpu_times)
282 | print >> sys.stderr, 'speed_failure_gpu=' + str(err)
283 |
284 | assert not numpy.isnan(gpu_times).any()
285 |
--------------------------------------------------------------------------------
/code/utils.py:
--------------------------------------------------------------------------------
1 | """ This file contains different utility functions that are not connected
2 | in anyway to the networks presented in the tutorials, but rather help in
3 | processing the outputs into a more understandable way.
4 |
5 | For example ``tile_raster_images`` helps in generating an easy-to-grasp
6 | image from a set of samples or weights.
7 | """
8 |
9 |
10 | import numpy
11 |
12 |
13 | def scale_to_unit_interval(ndar, eps=1e-8):
14 | """ Scales all values in the ndarray ndar to be between 0 and 1 """
15 | ndar = ndar.copy()
16 | ndar -= ndar.min()
17 | ndar *= 1.0 / (ndar.max() + eps)
18 | return ndar
19 |
20 |
21 | def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0),
22 | scale_rows_to_unit_interval=True,
23 | output_pixel_vals=True):
24 | """
25 | Transform an array with one flattened image per row, into an array in
26 |     which images are reshaped and laid out like tiles on a floor.
27 |
28 | This function is useful for visualizing datasets whose rows are images,
29 | and also columns of matrices for transforming those rows
30 | (such as the first layer of a neural net).
31 |
32 | :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can
33 | be 2-D ndarrays or None;
34 | :param X: a 2-D array in which every row is a flattened image.
35 |
36 | :type img_shape: tuple; (height, width)
37 | :param img_shape: the original shape of each image
38 |
39 | :type tile_shape: tuple; (rows, cols)
40 | :param tile_shape: the number of images to tile (rows, cols)
41 |
42 |     :param output_pixel_vals: if output should be pixel values (i.e. uint8
43 | values) or floats
44 |
45 |     :param scale_rows_to_unit_interval: if the values need to be scaled to
46 |     [0,1] before being plotted or not
47 |
48 |
49 | :returns: array suitable for viewing as an image.
50 | (See:`Image.fromarray`.)
51 | :rtype: a 2-d array with same dtype as X.
52 |
53 | """
54 |
55 | assert len(img_shape) == 2
56 | assert len(tile_shape) == 2
57 | assert len(tile_spacing) == 2
58 |
59 | # The expression below can be re-written in a more C style as
60 | # follows :
61 | #
62 | # out_shape = [0,0]
63 | # out_shape[0] = (img_shape[0]+tile_spacing[0])*tile_shape[0] -
64 | # tile_spacing[0]
65 | # out_shape[1] = (img_shape[1]+tile_spacing[1])*tile_shape[1] -
66 | # tile_spacing[1]
67 | out_shape = [
68 | (ishp + tsp) * tshp - tsp
69 | for ishp, tshp, tsp in zip(img_shape, tile_shape, tile_spacing)
70 | ]
71 |
72 | if isinstance(X, tuple):
73 | assert len(X) == 4
74 | # Create an output numpy ndarray to store the image
75 | if output_pixel_vals:
76 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4),
77 | dtype='uint8')
78 | else:
79 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4),
80 | dtype=X.dtype)
81 |
82 | #colors default to 0, alpha defaults to 1 (opaque)
83 | if output_pixel_vals:
84 | channel_defaults = [0, 0, 0, 255]
85 | else:
86 | channel_defaults = [0., 0., 0., 1.]
87 |
88 | for i in xrange(4):
89 | if X[i] is None:
90 | # if channel is None, fill it with zeros of the correct
91 | # dtype
92 | dt = out_array.dtype
93 | if output_pixel_vals:
94 | dt = 'uint8'
95 | out_array[:, :, i] = numpy.zeros(
96 | out_shape,
97 | dtype=dt
98 | ) + channel_defaults[i]
99 | else:
100 |             # use a recursive call to compute the channel and store it
101 | # in the output
102 | out_array[:, :, i] = tile_raster_images(
103 | X[i], img_shape, tile_shape, tile_spacing,
104 | scale_rows_to_unit_interval, output_pixel_vals)
105 | return out_array
106 |
107 | else:
108 | # if we are dealing with only one channel
109 | H, W = img_shape
110 | Hs, Ws = tile_spacing
111 |
112 | # generate a matrix to store the output
113 | dt = X.dtype
114 | if output_pixel_vals:
115 | dt = 'uint8'
116 | out_array = numpy.zeros(out_shape, dtype=dt)
117 |
118 | for tile_row in xrange(tile_shape[0]):
119 | for tile_col in xrange(tile_shape[1]):
120 | if tile_row * tile_shape[1] + tile_col < X.shape[0]:
121 | this_x = X[tile_row * tile_shape[1] + tile_col]
122 | if scale_rows_to_unit_interval:
123 | # if we should scale values to be between 0 and 1
124 | # do this by calling the `scale_to_unit_interval`
125 | # function
126 | this_img = scale_to_unit_interval(
127 | this_x.reshape(img_shape))
128 | else:
129 | this_img = this_x.reshape(img_shape)
130 | # add the slice to the corresponding position in the
131 | # output array
132 | c = 1
133 | if output_pixel_vals:
134 | c = 255
135 | out_array[
136 | tile_row * (H + Hs): tile_row * (H + Hs) + H,
137 | tile_col * (W + Ws): tile_col * (W + Ws) + W
138 | ] = this_img * c
139 | return out_array
140 |
--------------------------------------------------------------------------------
/data/download.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | which wget >/dev/null 2>&1
4 | WGET=$?
5 | which curl >/dev/null 2>&1
6 | CURL=$?
7 | if [ "$WGET" -eq 0 ]; then
8 | DL_CMD="wget -c"
9 | elif [ "$CURL" -eq 0 ]; then
10 | DL_CMD="curl -C - -O"
11 | else
12 | echo "You need wget or curl installed to download"
13 | exit 1
14 | fi
15 |
16 | $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
17 | $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist_py3k.pkl.gz
18 | $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl.gz && gunzip imdb.pkl.gz
19 | $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/imdb.dict.pkl.gz && gunzip imdb.dict.pkl.gz
20 | $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/Nottingham.zip && unzip -u Nottingham.zip
21 | $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/midi.zip && unzip -u midi.zip -d ../code && echo "extracted Modified Python MIDI package (GPL)"
22 | $DL_CMD http://www-etud.iro.umontreal.ca/~mesnilgr/atis/atis.fold0.pkl.gz
23 | $DL_CMD http://www-etud.iro.umontreal.ca/~mesnilgr/atis/atis.fold1.pkl.gz
24 | $DL_CMD http://www-etud.iro.umontreal.ca/~mesnilgr/atis/atis.fold2.pkl.gz
25 | $DL_CMD http://www-etud.iro.umontreal.ca/~mesnilgr/atis/atis.fold3.pkl.gz
26 | $DL_CMD http://www-etud.iro.umontreal.ca/~mesnilgr/atis/atis.fold4.pkl.gz
27 |
--------------------------------------------------------------------------------
/data/training_colorpatches_16x16_demo.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/data/training_colorpatches_16x16_demo.mat
--------------------------------------------------------------------------------
/doc/.templates/layout.html:
--------------------------------------------------------------------------------
1 | {% extends "!layout.html" %}
2 |
3 | {%- block extrahead %}
4 | {{ super() }}
5 |
10 | {% endblock %}
11 |
12 | {% block footer %}
13 | {{ super() }}
14 |
23 | {% endblock %}
24 |
25 |
--------------------------------------------------------------------------------
/doc/DBN.txt:
--------------------------------------------------------------------------------
1 | .. _DBN:
2 |
3 | Deep Belief Networks
4 | ====================
5 |
6 | .. note::
7 | This section assumes the reader has already read through :doc:`logreg`
8 | and :doc:`mlp` and :doc:`rbm`. Additionally it uses the following Theano
9 | functions and concepts : `T.tanh`_, `shared variables`_, `basic arithmetic
10 | ops`_, `T.grad`_, `Random numbers`_, `floatX`_. If you intend to run the
11 | code on GPU also read `GPU`_.
12 |
13 | .. _T.tanh: http://deeplearning.net/software/theano/tutorial/examples.html?highlight=tanh
14 |
15 | .. _shared variables: http://deeplearning.net/software/theano/tutorial/examples.html#using-shared-variables
16 |
17 | .. _basic arithmetic ops: http://deeplearning.net/software/theano/tutorial/adding.html#adding-two-scalars
18 |
19 | .. _T.grad: http://deeplearning.net/software/theano/tutorial/examples.html#computing-gradients
20 |
21 | .. _floatX: http://deeplearning.net/software/theano/library/config.html#config.floatX
22 |
23 | .. _GPU: http://deeplearning.net/software/theano/tutorial/using_gpu.html
24 |
25 | .. _Random numbers: http://deeplearning.net/software/theano/tutorial/examples.html#using-random-numbers
26 |
27 |
28 | .. note::
29 | The code for this section is available for download `here`_.
30 |
31 | .. _here: http://deeplearning.net/tutorial/code/DBN.py
32 |
33 |
34 | Deep Belief Networks
35 | ++++++++++++++++++++
36 |
37 | [Hinton06]_ showed that RBMs can be stacked and trained in a greedy manner
38 | to form so-called Deep Belief Networks (DBN). DBNs are graphical models which
39 | learn to extract a deep hierarchical representation of the training data.
40 | They model the joint distribution between observed vector :math:`x` and
41 | the :math:`\ell` hidden layers :math:`h^k` as follows:
42 |
43 | .. math::
44 | :label: dbn
45 |
46 | P(x, h^1, \ldots, h^{\ell}) = \left(\prod_{k=0}^{\ell-2} P(h^k|h^{k+1})\right) P(h^{\ell-1},h^{\ell})
47 |
48 | where :math:`x=h^0`, :math:`P(h^{k-1} | h^k)` is a conditional distribution
49 | for the visible units conditioned on the hidden units of the RBM at level
50 | :math:`k`, and :math:`P(h^{\ell-1}, h^{\ell})` is the visible-hidden joint
51 | distribution in the top-level RBM. This is illustrated in the figure below.
52 |
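For instance, with :math:`\ell=2` hidden layers the factorization reads

.. math::

   P(x, h^1, h^2) = P(x|h^1) \, P(h^1, h^2),

i.e. the bottom layer is generated from the first hidden layer through the
conditional of the first RBM, while the two top layers form the joint
distribution of an RBM.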
53 |
54 | .. figure:: images/DBN3.png
55 | :align: center
56 |
57 | The principle of greedy layer-wise unsupervised training can be applied to
58 | DBNs with RBMs as the building blocks for each layer [Hinton06]_, [Bengio07]_.
59 | The process is as follows (a schematic code sketch follows the list):
60 |
61 | 1. Train the first layer as an RBM that models the raw input :math:`x =
62 | h^{(0)}` as its visible layer.
63 |
64 | 2. Use that first layer to obtain a representation of the input that will
65 | be used as data for the second layer. Two common solutions exist. This
66 | representation can be chosen as being the mean activations
67 | :math:`p(h^{(1)}=1|h^{(0)})` or samples of :math:`p(h^{(1)}|h^{(0)})`.
68 |
69 | 3. Train the second layer as an RBM, taking the transformed data (samples or
70 | mean activations) as training examples (for the visible layer of that RBM).
71 |
72 | 4. Iterate (2 and 3) for the desired number of layers, each time propagating
73 | upward either samples or mean values.
74 |
75 | 5. Fine-tune all the parameters of this deep architecture with respect to a
76 |    proxy for the DBN log-likelihood, or with respect to a supervised training
77 | criterion (after adding extra learning machinery to convert the learned
78 | representation into supervised predictions, e.g. a linear classifier).
79 |
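The loop structure can be sketched as follows (a self-contained NumPy toy,
independent of the Theano classes used in the actual tutorial code; the toy
data, layer sizes and the minimal CD-1 trainer are illustrative assumptions):

.. code-block:: python

    import numpy


    def sigmoid(x):
        return 1.0 / (1.0 + numpy.exp(-x))


    def train_rbm(data, n_hidden, lr=0.1, epochs=5, rng=numpy.random):
        """Train one RBM with CD-1; return (W, bh), enough to map data upward."""
        n_visible = data.shape[1]
        W = 0.01 * rng.randn(n_visible, n_hidden)
        bv = numpy.zeros(n_visible)
        bh = numpy.zeros(n_hidden)
        for _ in xrange(epochs):
            # positive phase
            h_mean = sigmoid(data.dot(W) + bh)
            h_sample = (rng.rand(*h_mean.shape) < h_mean).astype(data.dtype)
            # negative phase (one Gibbs step)
            v_mean = sigmoid(h_sample.dot(W.T) + bv)
            h_mean_neg = sigmoid(v_mean.dot(W) + bh)
            # CD-1 updates, averaged over the batch
            W += lr * (data.T.dot(h_mean) - v_mean.T.dot(h_mean_neg)) / len(data)
            bv += lr * (data - v_mean).mean(axis=0)
            bh += lr * (h_mean - h_mean_neg).mean(axis=0)
        return W, bh


    # greedy layer-wise pre-training: each RBM models the mean activations
    # (step 2 above) produced by the layer below it
    inputs = (numpy.random.rand(500, 784) > 0.5).astype('float64')  # toy data
    stack = []
    for n_hidden in [100, 80, 60]:
        W, bh = train_rbm(inputs, n_hidden)
        stack.append((W, bh))                 # greedily trained parameters
        inputs = sigmoid(inputs.dot(W) + bh)  # propagate upward (step 2)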
80 |
81 | In this tutorial, we focus on fine-tuning via supervised gradient descent.
82 | Specifically, we use a logistic regression classifier to classify the input
83 | :math:`x` based on the output of the last hidden layer :math:`h^{(l)}` of the
84 | DBN. Fine-tuning is then performed via supervised gradient descent of the
85 | negative log-likelihood cost function. Since the supervised gradient is only
86 | non-null for the weights and hidden layer biases of each layer (i.e. null for
87 | the visible biases of each RBM), this procedure is equivalent to initializing
88 | the parameters of a deep MLP with the weights and hidden layer biases obtained
89 | with the unsupervised training strategy.
90 |
91 | Justifying Greedy Layer-Wise Pre-Training
92 | +++++++++++++++++++++++++++++++++++++++++
93 |
94 | Why does such an algorithm work? Taking as an example a 2-layer DBN with hidden
95 | layers :math:`h^{(1)}` and :math:`h^{(2)}` (with respective weight parameters
96 | :math:`W^{(1)}` and :math:`W^{(2)}`), [Hinton06]_ established
97 | (see also [Bengio09]_ for a detailed derivation) that :math:`\log
98 | p(x)` can be rewritten as,
99 |
100 | .. math::
101 | :label: dbn_bound
102 |
103 | \log p(x) = &KL(Q(h^{(1)}|x)||p(h^{(1)}|x)) + H_{Q(h^{(1)}|x)} + \\
104 | &\sum_h Q(h^{(1)}|x)(\log p(h^{(1)}) + \log p(x|h^{(1)})).
105 |
106 | :math:`KL(Q(h^{(1)}|x) || p(h^{(1)}|x))` represents the KL divergence between
107 | the posterior :math:`Q(h^{(1)}|x)` of the first RBM if it were standalone, and the
108 | probability :math:`p(h^{(1)}|x)` for the same layer but defined by the entire DBN
109 | (i.e. taking into account the prior :math:`p(h^{(1)},h^{(2)})` defined by the
110 | top-level RBM). :math:`H_{Q(h^{(1)}|x)}` is the entropy of the distribution
111 | :math:`Q(h^{(1)}|x)`.
112 |
113 | It can be shown that if we initialize both hidden layers such that
114 | :math:`W^{(2)}={W^{(1)}}^T`, then :math:`Q(h^{(1)}|x)=p(h^{(1)}|x)` and the KL
115 | divergence term is null. If we learn the first-level RBM and then keep its
116 | parameters :math:`W^{(1)}` fixed, optimizing Eq. :eq:`dbn_bound` with respect
117 | to :math:`W^{(2)}` can thus only increase the likelihood :math:`p(x)`.
118 |
119 | Also, notice that if we isolate the terms which depend only on :math:`W^{(2)}`, we
120 | get:
121 |
122 | .. math::
123 |         \sum_h Q(h^{(1)}|x) \log p(h^{(1)})
124 |
125 | Optimizing this with respect to :math:`W^{(2)}` amounts to training a second-stage
126 | RBM, using the output of :math:`Q(h^{(1)}|x)` as the training distribution,
127 | when :math:`x` is sampled from the training distribution for the first RBM.
128 |
129 | Implementation
130 | ++++++++++++++
131 |
132 | To implement DBNs in Theano, we will use the class defined in the :doc:`rbm`
133 | tutorial. One can also observe that the code for the DBN is very similar to the one
134 | for the SdA, because both involve the principle of unsupervised layer-wise
135 | pre-training followed by supervised fine-tuning as a deep MLP.
136 | The main difference is that we use the RBM class instead of the dA
137 | class.
138 |
139 | We start off by defining the DBN class which will store the layers of the
140 | MLP, along with their associated RBMs. Since we take the viewpoint of using
141 | the RBMs to initialize an MLP, the code will reflect this by separating as
142 | much as possible the RBMs used to initialize the network and the MLP used for
143 | classification.
144 |
145 | .. literalinclude:: ../code/DBN.py
146 | :start-after: start-snippet-1
147 | :end-before: end-snippet-1
148 |
149 | ``self.sigmoid_layers`` will store the feed-forward graphs which together form
150 | the MLP, while ``self.rbm_layers`` will store the RBMs used to pretrain each
151 | layer of the MLP.
152 |
153 | Next, we construct ``n_layers`` sigmoid layers (we use the
154 | ``HiddenLayer`` class introduced in :ref:`mlp`, with the only modification
155 | that we replace the ``tanh`` non-linearity with the logistic function
156 | :math:`s(x) = \frac{1}{1+e^{-x}}`) and ``n_layers`` RBMs, where ``n_layers``
157 | is the depth of our model. We link the sigmoid layers such that they form an
158 | MLP, and construct each RBM such that it shares the weight matrix and the
159 | hidden bias with its corresponding sigmoid layer.
160 |
161 | .. literalinclude:: ../code/DBN.py
162 | :start-after: # MLP.
163 | :end-before: # We now need to add a logistic layer on top of the MLP
164 |
165 | All that is left is to stack one last logistic regression layer in order to
166 | form an MLP. We will use the ``LogisticRegression`` class introduced in
167 | :ref:`logreg`.
168 |
169 | .. literalinclude:: ../code/DBN.py
170 | :start-after: # We now need to add a logistic layer on top of the MLP
171 | :end-before: def pretraining_functions
172 |
173 | The class also provides a method which generates training functions for each
174 | of the RBMs. They are returned as a list, where element :math:`i` is a
175 | function which implements one step of training for the ``RBM`` at layer
176 | :math:`i`.
177 |
178 | .. literalinclude:: ../code/DBN.py
179 | :start-after: self.errors = self.logLayer.errors(self.y)
180 | :end-before: learning_rate = T.scalar('lr')
181 |
182 | In order to be able to change the learning rate during training, we associate
183 | with it a Theano variable that has a default value.
184 |
185 | .. literalinclude:: ../code/DBN.py
186 | :start-after: index = T.lscalar('index')
187 | :end-before: def build_finetune_functions
188 |
189 | Now any function ``pretrain_fns[i]`` takes as arguments ``index`` and
190 | optionally ``lr`` -- the learning rate. Note that the names of the parameters
191 | are the names given to the Theano variables (e.g. ``lr``) when they are
192 | constructed, and not the names of the Python variables (e.g. ``learning_rate``). Keep
193 | this in mind when working with Theano. Optionally, if you provide ``k`` (the
194 | number of Gibbs steps to perform in CD or PCD) this will also become an
195 | argument of your function.
196 |
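For example, a single pre-training step of the first-layer RBM on mini-batch 0
could look like this (an illustrative sketch; ``pretrain_fns`` stands for the
list returned by ``pretraining_functions``):

.. code-block:: python

    cost = pretrain_fns[0](index=0)            # uses the default learning rate
    cost = pretrain_fns[0](index=0, lr=0.01)   # overrides the Theano variable 'lr' by name
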
197 | In the same fashion, the DBN class includes a method for building the
198 | functions required for fine-tuning (a ``train_model``, a ``validate_model``
199 | and a ``test_model`` function).
200 |
201 | .. literalinclude:: ../code/DBN.py
202 | :pyobject: DBN.build_finetune_functions
203 |
204 | Note that the returned ``valid_score`` and ``test_score`` are not Theano
205 | functions, but rather Python functions. These loop over the entire
206 | validation set and the entire test set to produce a list of the losses
207 | obtained over these sets.
208 |
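Conceptually, each of these wrappers is just a small Python function looping
over the corresponding compiled Theano function, along the lines of (an
illustrative sketch, not the exact code from ``DBN.py``):

.. code-block:: python

    def valid_score():
        # evaluate the compiled Theano function on every validation mini-batch
        return [validate_model(i) for i in xrange(n_valid_batches)]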
209 |
210 | Putting it all together
211 | +++++++++++++++++++++++
212 |
213 | The few lines of code below construct the deep belief network:
214 |
215 | .. literalinclude:: ../code/DBN.py
216 | :start-after: # numpy random generator
217 | :end-before: start-snippet-2
218 |
219 | There are two stages in training this network: (1) a layer-wise pre-training and
220 | (2) a fine-tuning stage.
221 |
222 | For the pre-training stage, we loop over all the layers of the network. For
223 | each layer, we use the compiled Theano function that determines the
224 | input to the ``i``-th level RBM and performs one step of CD-k within this RBM.
225 | This function is applied to the training set for a fixed number of epochs
226 | given by ``pretraining_epochs``.
227 |
228 | .. literalinclude:: ../code/DBN.py
229 | :start-after: start-snippet-2
230 | :end-before: end-snippet-2
231 |
232 | The fine-tuning loop is very similar to the one in the :ref:`mlp` tutorial,
233 | the only difference being that we now use the functions given by
234 | ``build_finetune_functions``.
235 |
236 | Running the Code
237 | ++++++++++++++++
238 |
239 | The user can run the code by calling:
240 |
241 | .. code-block:: bash
242 |
243 | python code/DBN.py
244 |
245 | With the default parameters, the code runs for 100 pre-training epochs with
246 | mini-batches of size 10. This corresponds to performing 500,000 unsupervised
247 | parameter updates. We use an unsupervised learning rate of 0.01, with a
248 | supervised learning rate of 0.1. The DBN itself consists of three
249 | hidden layers with 1000 units per layer. With early-stopping, this configuration
250 | achieved a minimal validation error of 1.27 % with a corresponding test
251 | error of 1.34 % after 46 supervised epochs.
252 |
253 | On an Intel(R) Xeon(R) CPU X5560 running at 2.80GHz, using a multi-threaded MKL
254 | library (running on 4 cores), pretraining took 615 minutes with an average of
255 | 2.05 mins/(layer * epoch). Fine-tuning took only 101 minutes or approximately
256 | 2.20 mins/epoch.
257 |
258 | Hyper-parameters were selected by optimizing on the validation error. We tested
259 | unsupervised learning rates in :math:`\{10^{-1}, ..., 10^{-5}\}` and supervised
260 | learning rates in :math:`\{10^{-1}, ..., 10^{-4}\}`. We did not use any form of
261 | regularization besides early-stopping, nor did we optimize over the number of
262 | pretraining updates.
263 |
264 |
265 | Tips and Tricks
266 | +++++++++++++++
267 |
268 | One way to improve the running time of your code (given that you have
269 | sufficient memory available) is to compute the representation of the entire
270 | dataset at layer ``i`` in a single pass, once the weights of the first
271 | :math:`i-1` layers have been fixed. Namely, start by training your first-layer
272 | RBM. Once it is trained, you can compute the hidden unit values for
273 | every example in the dataset and store this as a new dataset, which is used to
274 | train the 2nd-layer RBM. Once you have trained the RBM for layer 2, you compute,
275 | in a similar fashion, the dataset for layer 3 and so on. This avoids recomputing
276 | the intermediate (hidden layer) representations ``pretraining_epochs`` times,
277 | at the expense of increased memory usage.
278 |
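In NumPy terms, the trick amounts to something like the following (toy shapes
and randomly initialized stand-in parameters, for illustration only):

.. code-block:: python

    import numpy

    def sigmoid(x):
        return 1.0 / (1.0 + numpy.exp(-x))

    # stand-ins for an already-trained first-layer RBM and for the training set
    rng = numpy.random.RandomState(0)
    W = 0.01 * rng.randn(784, 500)
    bh = numpy.zeros(500)
    train_x = rng.rand(1000, 784)

    # compute the layer-1 representation of the whole training set in one pass;
    # this array then serves as the (fixed) training set of the layer-2 RBM for
    # all of its pre-training epochs, instead of being recomputed every epoch
    train_h1 = sigmoid(train_x.dot(W) + bh)
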
--------------------------------------------------------------------------------
/doc/LICENSE.txt:
--------------------------------------------------------------------------------
1 | .. _license:
2 |
3 | LICENSE
4 | =======
5 |
6 | Copyright (c) 2008--2013, Theano Development Team
7 | All rights reserved.
8 |
9 | Redistribution and use in source and binary forms, with or without
10 | modification, are permitted provided that the following conditions are met:
11 |
12 | * Redistributions of source code must retain the above copyright
13 | notice, this list of conditions and the following disclaimer.
14 | * Redistributions in binary form must reproduce the above copyright
15 | notice, this list of conditions and the following disclaimer in the
16 | documentation and/or other materials provided with the distribution.
17 | * Neither the name of Theano nor the names of its contributors may be
18 | used to endorse or promote products derived from this software without
19 | specific prior written permission.
20 |
21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
22 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
25 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
28 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 |
--------------------------------------------------------------------------------
/doc/Makefile:
--------------------------------------------------------------------------------
1 | all:
2 | python scripts/docgen.py
3 |
--------------------------------------------------------------------------------
/doc/SdA.txt:
--------------------------------------------------------------------------------
1 | .. _SdA:
2 |
3 | Stacked Denoising Autoencoders (SdA)
4 | ====================================
5 |
6 | .. note::
7 | This section assumes you have already read through :doc:`logreg`
8 | and :doc:`mlp`. Additionally it uses the following Theano functions
9 | and concepts : `T.tanh`_, `shared variables`_, `basic arithmetic ops`_, `T.grad`_, `Random numbers`_, `floatX`_. If you intend to run the code on GPU also read `GPU`_.
10 |
11 | .. _T.tanh: http://deeplearning.net/software/theano/tutorial/examples.html?highlight=tanh
12 |
13 | .. _shared variables: http://deeplearning.net/software/theano/tutorial/examples.html#using-shared-variables
14 |
15 | .. _basic arithmetic ops: http://deeplearning.net/software/theano/tutorial/adding.html#adding-two-scalars
16 |
17 | .. _T.grad: http://deeplearning.net/software/theano/tutorial/examples.html#computing-gradients
18 |
19 | .. _floatX: http://deeplearning.net/software/theano/library/config.html#config.floatX
20 |
21 | .. _GPU: http://deeplearning.net/software/theano/tutorial/using_gpu.html
22 |
23 | .. _Random numbers: http://deeplearning.net/software/theano/tutorial/examples.html#using-random-numbers
24 |
25 |
26 | .. note::
27 | The code for this section is available for download `here`_.
28 |
29 | .. _here: http://deeplearning.net/tutorial/code/SdA.py
30 |
31 |
32 | The Stacked Denoising Autoencoder (SdA) is an extension of the stacked
33 | autoencoder [Bengio07]_ and it was introduced in [Vincent08]_.
34 |
35 | This tutorial builds on the previous tutorial :ref:`dA`.
36 | Especially if you do not have experience with autoencoders, we recommend reading it
37 | before going any further.
38 |
39 | .. _stacked_autoencoders:
40 |
41 | Stacked Autoencoders
42 | ++++++++++++++++++++
43 |
44 | Denoising autoencoders can be stacked to form a deep network by
45 | feeding the latent representation (output code)
46 | of the denoising autoencoder found on the layer
47 | below as input to the current layer. The **unsupervised pre-training** of such an
48 | architecture is done one layer at a time. Each layer is trained as
49 | a denoising autoencoder by minimizing the error in reconstructing its input
50 | (which is the output code of the previous layer).
51 | Once the first :math:`k` layers
52 | are trained, we can train the :math:`k+1`-th layer because we can now
53 | compute the code or latent representation from the layer below.
54 |
55 | Once all layers are pre-trained, the network goes through a second stage
56 | of training called **fine-tuning**. Here we consider **supervised fine-tuning**
57 | where we want to minimize prediction error on a supervised task.
58 | For this, we first add a logistic regression
59 | layer on top of the network (more precisely on the output code of the
60 | output layer). We then
61 | train the entire network as we would train a multilayer
62 | perceptron. At this point, we only consider the encoding parts of
63 | each auto-encoder.
64 | This stage is supervised, since now we use the target class during
65 | training. (See the :ref:`mlp` for details on the multilayer perceptron.)
66 |
67 | This can be easily implemented in Theano, using the class defined
68 | previously for a denoising autoencoder. We can see the stacked denoising
69 | autoencoder as having two facades: a list of
70 | autoencoders, and an MLP. During pre-training we use the first facade, i.e., we treat our model
71 | as a list of autoencoders, and train each autoencoder separately. In the
72 | second stage of training, we use the second facade. These two facades are linked because:
73 |
74 | * the autoencoders and the sigmoid layers of the MLP share parameters, and
75 |
76 | * the latent representations computed by intermediate layers of the MLP are fed as input to the autoencoders.
77 |
78 | .. literalinclude:: ../code/SdA.py
79 | :start-after: start-snippet-1
80 | :end-before: end-snippet-1
81 |
82 | ``self.sigmoid_layers`` will store the sigmoid layers of the MLP facade, while
83 | ``self.dA_layers`` will store the denoising autoencoder associated with the layers of the MLP.
84 |
85 | Next, we construct ``n_layers`` sigmoid layers and ``n_layers`` denoising
86 | autoencoders, where ``n_layers`` is the depth of our model. We use the
87 | ``HiddenLayer`` class introduced in :ref:`mlp`, with one
88 | modification: we replace the ``tanh`` non-linearity with the
89 | logistic function :math:`s(x) = \frac{1}{1+e^{-x}}`.
90 | We link the sigmoid layers to form an MLP, and construct
91 | the denoising autoencoders such that each shares the weight matrix and the
92 | bias of its encoding part with its corresponding sigmoid layer.
93 |
94 | .. literalinclude:: ../code/SdA.py
95 | :start-after: start-snippet-2
96 | :end-before: end-snippet-2
97 |
98 | All we need now is to add a logistic layer on top of the sigmoid
99 | layers such that we have an MLP. We will
100 | use the ``LogisticRegression`` class introduced in :ref:`logreg`.
101 |
102 | .. literalinclude:: ../code/SdA.py
103 | :start-after: end-snippet-2
104 | :end-before: def pretraining_functions
105 |
106 | The ``SdA`` class also provides a method that generates training functions for
107 | the denoising autoencoders in its layers.
108 | They are returned as a list, where element :math:`i` is a function that
109 | implements one step of training the ``dA`` corresponding to layer
110 | :math:`i`.
111 |
112 | .. literalinclude:: ../code/SdA.py
113 | :start-after: self.errors = self.logLayer.errors(self.y)
114 | :end-before: corruption_level = T.scalar('corruption')
115 |
116 | To be able to change the corruption level or the learning rate
117 | during training, we associate Theano variables with them.
118 |
119 | .. literalinclude:: ../code/SdA.py
120 | :start-after: index = T.lscalar('index')
121 | :end-before: def build_finetune_functions
122 |
123 | Now any function ``pretrain_fns[i]`` takes as arguments ``index`` and,
124 | optionally, ``corruption`` (the corruption level) and ``lr`` (the learning
125 | rate). Note that the names of the parameters are the names given
126 | to the Theano variables when they are constructed, not the names of the
127 | Python variables (``learning_rate`` or ``corruption_level``). Keep this
128 | in mind when working with Theano.
129 |
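 | For example, a minimal sketch of calling one of these functions (illustrative
 | only; it assumes the ``SdA`` instance ``sda`` and the shared ``train_set_x``
 | built in the tutorial script):
 | 
 | .. code-block:: python
 | 
 |     pretrain_fns = sda.pretraining_functions(train_set_x=train_set_x,
 |                                              batch_size=1)
 |     # 'corruption' and 'lr' are the Theano variable names, not the Python ones
 |     layer0_cost = pretrain_fns[0](index=0, corruption=0.2, lr=0.001)
 | 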
130 | In the same fashion we build a method for constructing the functions required
131 | during finetuning (``train_fn``, ``valid_score`` and
132 | ``test_score``).
133 |
134 | .. literalinclude:: ../code/SdA.py
135 | :pyobject: SdA.build_finetune_functions
136 |
137 | Note that ``valid_score`` and ``test_score`` are not Theano
138 | functions, but rather Python functions that loop over the entire
139 | validation set and the entire test set, respectively, producing a list of the losses
140 | over these sets.
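 | 
 | For example (a sketch only, assuming the ``datasets`` list, ``batch_size``,
 | ``finetune_lr`` and the ``numpy`` import from the tutorial script):
 | 
 | .. code-block:: python
 | 
 |     train_fn, valid_score, test_score = sda.build_finetune_functions(
 |         datasets=datasets,
 |         batch_size=batch_size,
 |         learning_rate=finetune_lr
 |     )
 | 
 |     minibatch_cost = train_fn(0)                      # one MSGD step on minibatch 0
 |     this_validation_loss = numpy.mean(valid_score())  # mean loss over the validation set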
141 |
142 | Putting it all together
143 | +++++++++++++++++++++++
144 |
145 | The few lines of code below construct the stacked denoising
146 | autoencoder:
147 |
148 | .. literalinclude:: ../code/SdA.py
149 | :start-after: start-snippet-3
150 | :end-before: end-snippet-3
151 |
152 | There are two stages of training for this network: layer-wise pre-training
153 | followed by fine-tuning.
154 |
155 | For the pre-training stage, we will loop over all the layers of the
156 | network. For each layer we will use the compiled Theano function that
157 | implements one SGD step on the weights of that layer, reducing its
158 | reconstruction cost. This function will be applied
159 | to the training set for a fixed number of epochs given by
160 | ``pretraining_epochs``.
161 |
162 | .. literalinclude:: ../code/SdA.py
163 | :start-after: start-snippet-4
164 | :end-before: end-snippet-4
165 |
166 | The fine-tuning loop is very similar to the one in the :ref:`mlp`. The
167 | only difference is that it uses the functions given by
168 | ``build_finetune_functions``.
169 |
170 | Running the Code
171 | ++++++++++++++++
172 |
173 | The user can run the code by calling:
174 |
175 | .. code-block:: bash
176 |
177 | python code/SdA.py
178 |
179 | By default the code runs 15 pre-training epochs for each layer, with a batch
180 | size of 1. The corruption levels are 0.1 for the first layer, 0.2 for the second,
181 | and 0.3 for the third. The pretraining learning rate is 0.001 and
182 | the finetuning learning rate is 0.1. Pre-training takes 585.01 minutes, with
183 | an average of 13 minutes per epoch. Fine-tuning is completed after 36 epochs
184 | in 444.2 minutes, with an average of 12.34 minutes per epoch. The final
185 | validation score is 1.39% with a testing score of 1.3%.
186 | These results were obtained on a machine with an Intel
187 | Xeon E5430 @ 2.66GHz CPU, with a single-threaded GotoBLAS.
188 |
189 |
190 | Tips and Tricks
191 | +++++++++++++++
192 |
193 | One way to improve the running time of your code (assuming you have
194 | sufficient memory available) is to compute how the network, up to layer
195 | :math:`k-1`, transforms your data. Namely, you start by training your first
196 | layer dA. Once it is trained, you can compute the hidden unit values for
197 | every datapoint in your dataset and store this as a new dataset that you will
198 | use to train the dA corresponding to layer 2. Once you have trained the dA for
199 | layer 2, you compute, in a similar fashion, the dataset for layer 3 and so on.
200 | You can see that, at this point, the dAs are trained individually, and
201 | they just provide (one to the other) a non-linear transformation of the input.
202 | Once all dAs are trained, you can start fine-tuning the model.
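 | 
 | A minimal sketch of this trick (illustrative only, not part of ``SdA.py``;
 | it assumes a trained first-layer ``dA`` instance ``da0``, its symbolic input
 | ``x``, and the shared ``train_set_x`` from the tutorial):
 | 
 | .. code-block:: python
 | 
 |     import theano
 | 
 |     # compile a function mapping raw inputs to first-layer hidden codes
 |     get_code = theano.function([x], da0.get_hidden_values(x))
 | 
 |     # transform the whole training set once and reuse it as the dataset
 |     # on which the layer-2 dA is trained
 |     train_set_h1 = theano.shared(get_code(train_set_x.get_value(borrow=True)),
 |                                  borrow=True)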
203 |
--------------------------------------------------------------------------------
/doc/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # theano documentation build configuration file, created by
4 | # sphinx-quickstart on Tue Oct 7 16:34:06 2008.
5 | #
6 | # This file is execfile()d with the current directory set to its containing dir.
7 | #
8 | # The contents of this file are pickled, so don't put values in the namespace
9 | # that aren't pickleable (module imports are okay, they're removed automatically).
10 | #
11 | # All configuration values have a default value; values that are commented out
12 | # serve to show the default value.
13 | import sys, os
14 |
15 | # If your extensions are in another directory, add it here. If the directory
16 | # is relative to the documentation root, use os.path.abspath to make it
17 | # absolute, like shown here.
18 | #sys.path.append(os.path.abspath('some/directory'))
19 |
20 | # General configuration
21 | # ---------------------
22 |
23 | # Add any Sphinx extension module names here, as strings. They can be extensions
24 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
25 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.todo']
26 |
27 | try:
28 | from sphinx.ext import pngmath
29 | extensions.append('sphinx.ext.pngmath')
30 | except ImportError:
31 | print >>sys.stderr, 'Warning: could not import sphinx.ext.pngmath'
32 | pass
33 |
34 | # Add any paths that contain templates here, relative to this directory.
35 | templates_path = ['.templates']
36 |
37 | # The suffix of source filenames.
38 | source_suffix = '.txt'
39 |
40 | # The master toctree document.
41 | master_doc = 'contents'
42 |
43 | # General substitutions.
44 | project = 'DeepLearning'
45 | copyright = '2008--2010, LISA lab'
46 |
47 | # The default replacements for |version| and |release|, also used in various
48 | # other places throughout the built documents.
49 | #
50 | # The short X.Y version.
51 | version = '0.1'
52 | # The full version, including alpha/beta/rc tags.
53 | release = '0.1'
54 |
55 | # There are two options for replacing |today|: either, you set today to some
56 | # non-false value, then it is used:
57 | #today = ''
58 | # Else, today_fmt is used as the format for a strftime call.
59 | today_fmt = '%B %d, %Y'
60 |
61 | # List of documents that shouldn't be included in the build.
62 | #unused_docs = []
63 |
64 | # List of directories, relative to source directories, that shouldn't be searched
65 | # for source files.
66 | exclude_dirs = ['scripts']
67 |
68 | # The reST default role (used for this markup: `text`) to use for all documents.
69 | #default_role = None
70 |
71 | # If true, '()' will be appended to :func: etc. cross-reference text.
72 | #add_function_parentheses = True
73 |
74 | # If true, the current module name will be prepended to all description
75 | # unit titles (such as .. function::).
76 | #add_module_names = True
77 |
78 | # If true, sectionauthor and moduleauthor directives will be shown in the
79 | # output. They are ignored by default.
80 | #show_authors = False
81 |
82 | # The name of the Pygments (syntax highlighting) style to use.
83 | pygments_style = 'sphinx'
84 |
85 |
86 | # Options for HTML output
87 | # -----------------------
88 |
89 | # The style sheet to use for HTML and HTML Help pages. A file of that name
90 | # must exist either in Sphinx' static/ path, or in one of the custom paths
91 | # given in html_static_path.
92 | #html_style = 'default.css'
93 | html_theme = 'sphinxdoc'
94 |
95 | # The name for this set of Sphinx documents. If None, it defaults to
96 | # " v documentation".
97 | #html_title = None
98 |
99 | # A shorter title for the navigation bar. Default is the same as html_title.
100 | #html_short_title = None
101 |
102 | # The name of an image file (within the static path) to place at the top of
103 | # the sidebar.
104 | #html_logo = None
105 |
106 | # The name of an image file (within the static path) to use as favicon of the
107 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
108 | # pixels large.
109 | #html_favicon = None
110 |
111 | # Add any paths that contain custom static files (such as style sheets) here,
112 | # relative to this directory. They are copied after the builtin static files,
113 | # so a file named "default.css" will overwrite the builtin "default.css".
114 | #html_static_path = ['.static', 'images']
115 | html_static_path = ['images']
116 |
117 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
118 | # using the given strftime format.
119 | html_last_updated_fmt = '%b %d, %Y'
120 |
121 | # If true, SmartyPants will be used to convert quotes and dashes to
122 | # typographically correct entities.
123 | html_use_smartypants = True
124 |
125 | # Custom sidebar templates, maps document names to template names.
126 | #html_sidebars = {}
127 |
128 | # Additional templates that should be rendered to pages, maps page names to
129 | # template names.
130 | #html_additional_pages = {}
131 |
132 | # If false, no module index is generated.
133 | html_use_modindex = True
134 |
135 | # If false, no index is generated.
136 | html_use_index = True
137 |
138 | # If true, the index is split into individual pages for each letter.
139 | #html_split_index = False
140 |
141 | # If true, the reST sources are included in the HTML build as _sources/.
142 | #html_copy_source = True
143 |
144 | # If true, an OpenSearch description file will be output, and all pages will
145 | # contain a tag referring to it. The value of this option must be the
146 | # base URL from which the finished HTML is served.
147 | #html_use_opensearch = ''
148 |
149 | # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml").
150 | #html_file_suffix = ''
151 |
152 | # Output file base name for HTML help builder.
153 | htmlhelp_basename = 'deeplearningdoc'
154 |
155 |
156 | # Options for LaTeX output
157 | # ------------------------
158 |
159 | # The paper size ('letter' or 'a4').
160 | #latex_paper_size = 'letter'
161 |
162 | # The font size ('10pt', '11pt' or '12pt').
163 | latex_font_size = '11pt'
164 |
165 | # Grouping the document tree into LaTeX files. List of tuples
166 | # (source start file, target name, title, author, document class [howto/manual]).
167 | latex_documents = [
168 | ('contents', 'deeplearning.tex', 'Deep Learning Tutorial',
169 | 'LISA lab, University of Montreal', 'manual'),
170 | ]
171 |
172 | # The name of an image file (relative to this directory) to place at the top of
173 | # the title page.
174 | latex_logo = None
175 |
176 | # For "manual" documents, if this is true, then toplevel headings are parts,
177 | # not chapters.
178 | #latex_use_parts = False
179 |
180 | # Additional stuff for the LaTeX preamble.
181 | #latex_preamble = ''
182 |
183 | # Documents to append as an appendix to all manuals.
184 | #latex_appendices = []
185 |
186 | # If false, no module index is generated.
187 | #latex_use_modindex = True
188 |
189 | default_role = 'math'
190 | pngmath_dvipng_args = ['-gamma 1.5','-D 110']
191 | pngmath_latex_preamble = '\\usepackage{amsmath}\n'+\
192 | '\\usepackage{amsfonts}\n'+\
193 | '\\usepackage{amssymb}\n'+\
194 | '\\def\\E{\\mathbf{E}}\n'+\
195 | '\\def\\F{\\mathbf{F}}\n'+\
196 | '\\def\\x{\\mathbf{x}}\n'+\
197 | '\\def\\h{\\mathbf{h}}\n'+\
198 | '\\def\\v{\\mathbf{v}}\n'+\
199 | '\\def\\nv{\\mathbf{v^{{\\bf -}}}}\n'+\
200 | '\\def\\nh{\\mathbf{h^{{\\bf -}}}}\n'+\
201 | '\\def\\s{\\mathbf{s}}\n'+\
202 | '\\def\\b{\\mathbf{b}}\n'+\
203 | '\\def\\c{\\mathbf{c}}\n'+\
204 | '\\def\\W{\\mathbf{W}}\n'+\
205 | '\\def\\C{\\mathbf{C}}\n'+\
206 | '\\def\\P{\\mathbf{P}}\n'+\
207 | '\\def\\T{{\\bf \\mathcal T}}\n'+\
208 | '\\def\\B{{\\bf \\mathcal B}}\n'
209 |
--------------------------------------------------------------------------------
/doc/contents.txt:
--------------------------------------------------------------------------------
1 |
2 | .. _contents:
3 |
4 | ========
5 | Contents
6 | ========
7 |
8 | .. toctree::
9 | :maxdepth: 2
10 |
11 | LICENSE
12 | index
13 | gettingstarted
14 | logreg
15 | mlp
16 | lenet
17 | dA
18 | SdA
19 | rbm
20 | DBN
21 | hmc
22 | rnnslu
23 | lstm
24 | rnnrbm
25 | utilities
26 | references
27 |
--------------------------------------------------------------------------------
/doc/dA.txt:
--------------------------------------------------------------------------------
1 | .. _daa:
2 |
3 | Denoising Autoencoders (dA)
4 | ===========================
5 |
6 | .. note::
7 | This section assumes the reader has already read through :doc:`logreg`
8 | and :doc:`mlp`. Additionally it uses the following Theano functions
9 | and concepts : `T.tanh`_, `shared variables`_, `basic arithmetic ops`_, `T.grad`_, `Random numbers`_, `floatX`_. If you intend to run the code on GPU also read `GPU`_.
10 |
11 | .. _T.tanh: http://deeplearning.net/software/theano/tutorial/examples.html?highlight=tanh
12 |
13 | .. _shared variables: http://deeplearning.net/software/theano/tutorial/examples.html#using-shared-variables
14 |
15 | .. _basic arithmetic ops: http://deeplearning.net/software/theano/tutorial/adding.html#adding-two-scalars
16 |
17 | .. _T.grad: http://deeplearning.net/software/theano/tutorial/examples.html#computing-gradients
18 |
19 | .. _floatX: http://deeplearning.net/software/theano/library/config.html#config.floatX
20 |
21 | .. _GPU: http://deeplearning.net/software/theano/tutorial/using_gpu.html
22 |
23 | .. _Random numbers: http://deeplearning.net/software/theano/tutorial/examples.html#using-random-numbers
24 |
25 |
26 | .. note::
27 | The code for this section is available for download `here`_.
28 |
29 | .. _here: http://deeplearning.net/tutorial/code/dA.py
30 |
31 |
32 | The Denoising Autoencoder (dA) is an extension of a classical
33 | autoencoder and it was introduced as a building block for deep networks
34 | in [Vincent08]_. We will start the tutorial with a short discussion on
35 | :ref:`autoencoders`.
36 |
37 | .. _autoencoders:
38 |
39 | Autoencoders
40 | +++++++++++++
41 |
42 | See section 4.6 of [Bengio09]_ for an overview of auto-encoders.
43 | An autoencoder takes an input :math:`\mathbf{x} \in [0,1]^d` and first
44 | maps it (with an *encoder*) to a hidden representation :math:`\mathbf{y} \in [0,1]^{d'}`
45 | through a deterministic mapping, e.g.:
46 |
47 | .. math::
48 |
49 | \mathbf{y} = s(\mathbf{W}\mathbf{x} + \mathbf{b})
50 |
51 | Where :math:`s` is a non-linearity such as the sigmoid. The latent
52 | representation :math:`\mathbf{y}`, or **code**, is then mapped back (with a
53 | *decoder*) into a **reconstruction** :math:`\mathbf{z}` of the same shape as
54 | :math:`\mathbf{x}`. The mapping happens through a similar transformation, e.g.:
55 |
56 | .. math::
57 |
58 | \mathbf{z} = s(\mathbf{W'}\mathbf{y} + \mathbf{b'})
59 |
60 | (Here, the prime symbol does not indicate matrix transposition.)
61 | :math:`\mathbf{z}` should be seen as a prediction of :math:`\mathbf{x}`, given
62 | the code :math:`\mathbf{y}`. Optionally, the weight matrix :math:`\mathbf{W'}`
63 | of the reverse mapping may be constrained to be the transpose of the forward
64 | mapping: :math:`\mathbf{W'} = \mathbf{W}^T`. This is referred to as *tied
65 | weights*. The parameters of this model (namely :math:`\mathbf{W}`,
66 | :math:`\mathbf{b}`, :math:`\mathbf{b'}` and, if one doesn't use tied weights,
67 | also :math:`\mathbf{W'}`) are optimized such that the average reconstruction
68 | error is minimized.
69 |
70 | The reconstruction error can be measured in many ways, depending on the
71 | appropriate distributional assumptions on the input given the code. The
72 | traditional *squared error* :math:`L(\mathbf{x}, \mathbf{z}) = || \mathbf{x} -
73 | \mathbf{z} ||^2`, can be used. If the input is interpreted as either bit
74 | vectors or vectors of bit probabilities, *cross-entropy* of the reconstruction
75 | can be used:
76 |
77 | .. math::
78 |
79 | L_{H} (\mathbf{x}, \mathbf{z}) = - \sum^d_{k=1}[\mathbf{x}_k \log
80 | \mathbf{z}_k + (1 - \mathbf{x}_k)\log(1 - \mathbf{z}_k)]
81 |
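 | A plain NumPy sketch of these mappings and costs, with tied weights and toy
 | dimensions (for illustration only; the Theano implementation used by the
 | tutorial follows below):
 | 
 | .. code-block:: python
 | 
 |     import numpy
 | 
 |     def sigmoid(a):
 |         return 1.0 / (1.0 + numpy.exp(-a))
 | 
 |     d, d_prime = 784, 500
 |     rng = numpy.random.RandomState(123)
 |     W = rng.uniform(-0.1, 0.1, size=(d, d_prime))   # encoder weights
 |     b = numpy.zeros(d_prime)                        # encoder bias
 |     b_prime = numpy.zeros(d)                        # decoder bias
 | 
 |     x = rng.uniform(0.0, 1.0, size=d)               # a toy input in [0, 1]^d
 |     y = sigmoid(numpy.dot(x, W) + b)                # code
 |     z = sigmoid(numpy.dot(y, W.T) + b_prime)        # reconstruction (W' = W^T)
 | 
 |     L_sq = numpy.sum((x - z) ** 2)                                   # squared error
 |     L_H = -numpy.sum(x * numpy.log(z) + (1 - x) * numpy.log(1 - z))  # cross-entropy
 | 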
82 | The hope is that the code :math:`\mathbf{y}` is a *distributed* representation
83 | that captures the coordinates along the main factors of variation in the data.
84 | This is similar to the way the projection on principal components would capture
85 | the main factors of variation in the data. Indeed, if there is one linear
86 | hidden layer (the *code*) and the mean squared error criterion is used to train
87 | the network, then the :math:`k` hidden units learn to project the input in the
88 | span of the first :math:`k` principal components of the data. If the hidden
89 | layer is non-linear, the auto-encoder behaves differently from PCA, with the
90 | ability to capture multi-modal aspects of the input distribution. The departure
91 | from PCA becomes even more important when we consider *stacking multiple
92 | encoders* (and their corresponding decoders) when building a deep auto-encoder
93 | [Hinton06]_.
94 |
95 | Because :math:`\mathbf{y}` is viewed as a lossy compression of
96 | :math:`\mathbf{x}`, it cannot be a good (small-loss) compression for all
97 | :math:`\mathbf{x}`. Optimization makes it a good compression for training
98 | examples, and hopefully for other inputs as well, but not for arbitrary inputs.
99 | That is the sense in which an auto-encoder generalizes: it gives low
100 | reconstruction error on test examples from the same distribution as the
101 | training examples, but generally high reconstruction error on samples randomly
102 | chosen from the input space.
103 |
104 | We want to implement an auto-encoder using Theano, in the form of a class, that
105 | could be afterwards used in constructing a stacked autoencoder. The first step
106 | is to create shared variables for the parameters of the autoencoder
107 | :math:`\mathbf{W}`, :math:`\mathbf{b}` and :math:`\mathbf{b'}`. (Since we are
108 | using tied weights in this tutorial, :math:`\mathbf{W}^T` will be used for
109 | :math:`\mathbf{W'}`):
110 |
111 | .. literalinclude:: ../code/dA.py
112 | :pyobject: dA.__init__
113 |
114 | Note that we pass the symbolic ``input`` to the autoencoder as a parameter.
115 | This is so that we can concatenate layers of autoencoders to form a deep
116 | network: the symbolic output (the :math:`\mathbf{y}` above) of layer :math:`k` will
117 | be the symbolic input of layer :math:`k+1`.
118 |
119 | Now we can express the computation of the latent representation and of the reconstructed
120 | signal:
121 |
122 | .. literalinclude:: ../code/dA.py
123 | :pyobject: dA.get_hidden_values
124 |
125 | .. literalinclude:: ../code/dA.py
126 | :pyobject: dA.get_reconstructed_input
127 |
128 | And using these functions we can compute the cost and the updates of
129 | one stochastic gradient descent step :
130 |
131 | .. literalinclude:: ../code/dA.py
132 | :pyobject: dA.get_cost_updates
133 |
134 | We can now define a function that, applied iteratively, will update the
135 | parameters ``W``, ``b`` and ``b_prime`` such that the
136 | reconstruction cost is approximately minimized.
137 |
138 | .. literalinclude:: ../code/dA.py
139 | :start-after: theano_rng = RandomStreams(rng.randint(2 ** 30))
140 | :end-before: start_time = time.clock()
141 |
142 | If there is no constraint besides minimizing the reconstruction error, one
143 | might expect an auto-encoder with :math:`n` inputs and an encoding of dimension
144 | :math:`n` (or greater) to learn the identity function, merely mapping an input
145 | to its copy. Such an autoencoder would not differentiate test examples (from
146 | the training distribution) from other input configurations.
147 |
148 | Surprisingly,
149 | experiments reported in [Bengio07]_ suggest that, in practice, when trained
150 | with stochastic gradient descent, non-linear auto-encoders with more hidden
151 | units than inputs (called overcomplete) yield useful representations. (Here,
152 | "useful" means that a network taking the encoding as input has low
153 | classification error.)
154 |
155 | A simple explanation is that stochastic gradient descent with early stopping is
156 | similar to an L2 regularization of the parameters. To achieve perfect
157 | reconstruction of continuous inputs, a one-hidden layer auto-encoder with
158 | non-linear hidden units (exactly like in the above code) needs very small
159 | weights in the first (encoding) layer, to bring the non-linearity of the hidden
160 | units into their linear regime, and very large weights in the second (decoding)
161 | layer. With binary inputs, very large weights are also needed to completely
162 | minimize the reconstruction error. Since the implicit or explicit
163 | regularization makes it difficult to reach large-weight solutions, the
164 | optimization algorithm finds encodings which only work well for examples
165 | similar to those in the training set, which is what we want. It means that the
166 | *representation is exploiting statistical regularities present in the training
167 | set,* rather than merely learning to replicate the input.
168 |
169 | There are other ways by which an auto-encoder with more hidden units than inputs
170 | could be prevented from learning the identity function, capturing something
171 | useful about the input in its hidden representation. One is the addition of
172 | *sparsity* (forcing many of the hidden units to be zero or near-zero). Sparsity
173 | has been exploited very successfully by many [Ranzato07]_ [Lee08]_. Another is
174 | to add randomness in the transformation from input to reconstruction. This
175 | technique is used in Restricted Boltzmann Machines (discussed later in
176 | :ref:`rbm`), as well as in Denoising Auto-Encoders, discussed below.
177 |
178 | .. _DA:
179 |
180 | Denoising Autoencoders
181 | ++++++++++++++++++++++
182 |
183 | The idea behind denoising autoencoders is simple. In order to force
184 | the hidden layer to discover more robust features and prevent it
185 | from simply learning the identity, we train the
186 | autoencoder to *reconstruct the input from a corrupted version of it*.
187 |
188 | The denoising auto-encoder is a stochastic version of the auto-encoder.
189 | Intuitively, a denoising auto-encoder does two things: try to encode the input
190 | (preserve the information about the input), and try to undo the effect of a
191 | corruption process stochastically applied to the input of the auto-encoder. The
192 | latter can only be done by capturing the statistical dependencies between the
193 | inputs. The denoising auto-encoder can be understood from different
194 | perspectives (the manifold learning perspective, stochastic operator
195 | perspective, bottom-up -- information theoretic perspective, top-down --
196 | generative model perspective), all of which are explained in [Vincent08]_. See
197 | also section 7.2 of [Bengio09]_ for an overview of auto-encoders.
198 |
199 | In [Vincent08]_, the stochastic corruption process randomly sets some of the
200 | inputs (as many as half of them) to zero. Hence the denoising auto-encoder is
201 | trying to *predict the corrupted (i.e. missing) values from the uncorrupted
202 | (i.e., non-missing) values*, for randomly selected subsets of missing patterns.
203 | Note how being able to predict any subset of variables from the rest is a
204 | sufficient condition for completely capturing the joint distribution between a
205 | set of variables (this is how Gibbs sampling works).
206 |
207 | To convert the autoencoder class into a denoising autoencoder class, all we
208 | need to do is to add a stochastic corruption step operating on the input. The input can be
209 | corrupted in many ways, but in this tutorial we will stick to the original
210 | corruption mechanism of randomly masking entries of the input by making
211 | them zero. The code below
212 | does just that :
213 |
214 | .. literalinclude:: ../code/dA.py
215 | :pyobject: dA.get_corrupted_input
216 |
217 |
218 | In the stacked autoencoder class (:ref:`stacked_autoencoders`) the weights of
219 | the ``dA`` class have to be shared with those of a corresponding sigmoid layer.
220 | For this reason, the constructor of the ``dA`` also gets Theano variables
221 | pointing to the shared parameters. If those parameters are left to ``None``,
222 | new ones will be constructed.
223 |
224 | The final denoising autoencoder class becomes :
225 |
226 | .. literalinclude:: ../code/dA.py
227 | :pyobject: dA
228 |
229 |
230 |
231 | Putting it All Together
232 | +++++++++++++++++++++++
233 |
234 |
235 | It is easy now to construct an instance of our ``dA`` class and train
236 | it.
237 |
238 | .. literalinclude:: ../code/dA.py
239 | :language: python
240 | :start-after: start-snippet-2
241 | :end-before: end-snippet-2
242 |
243 | .. literalinclude:: ../code/dA.py
244 | :start-after: start-snippet-3
245 | :end-before: end-snippet-3
246 |
247 |
248 | In order to get a feeling for what the network has learned, we are going to
249 | plot the filters (defined by the weight matrix). Bear in mind, however,
250 | that this does not provide the entire story,
251 | since we neglect the biases and plot the weights up to a multiplicative
252 | constant (weights are converted to values between 0 and 1).
253 |
254 | To plot our filters we will need the help of ``tile_raster_images`` (see
255 | :ref:`how-to-plot`), so we urge the reader to study it. Also, with the
256 | help of the Python Imaging Library, the following lines of code will
257 | save the filters as an image :
258 |
259 | .. literalinclude:: ../code/dA.py
260 | :start-after: start-snippet-4
261 | :end-before: end-snippet-4
262 |
263 |
264 | Running the Code
265 | ++++++++++++++++
266 |
267 | To run the code :
268 |
269 | .. code-block:: bash
270 |
271 | python dA.py
272 |
273 | The resulting filters when we do not use any noise are :
274 |
275 | .. figure:: images/filters_corruption_0.png
276 | :align: center
277 |
278 |
279 |
280 | The filters for 30 percent noise :
281 |
282 |
283 | .. figure:: images/filters_corruption_30.png
284 | :align: center
285 |
286 |
287 |
288 |
--------------------------------------------------------------------------------
/doc/images/3wolfmoon.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/3wolfmoon.jpg
--------------------------------------------------------------------------------
/doc/images/3wolfmoon_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/3wolfmoon_output.png
--------------------------------------------------------------------------------
/doc/images/DBN3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/DBN3.png
--------------------------------------------------------------------------------
/doc/images/bm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/bm.png
--------------------------------------------------------------------------------
/doc/images/cnn_explained.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/cnn_explained.png
--------------------------------------------------------------------------------
/doc/images/conv_1D_nn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/conv_1D_nn.png
--------------------------------------------------------------------------------
/doc/images/filters_at_epoch_14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/filters_at_epoch_14.png
--------------------------------------------------------------------------------
/doc/images/filters_corruption_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/filters_corruption_0.png
--------------------------------------------------------------------------------
/doc/images/filters_corruption_30.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/filters_corruption_30.png
--------------------------------------------------------------------------------
/doc/images/lstm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/lstm.png
--------------------------------------------------------------------------------
/doc/images/lstm_memorycell.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/lstm_memorycell.png
--------------------------------------------------------------------------------
/doc/images/markov_chain.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/markov_chain.png
--------------------------------------------------------------------------------
/doc/images/mlp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/mlp.png
--------------------------------------------------------------------------------
/doc/images/mnist_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/mnist_0.png
--------------------------------------------------------------------------------
/doc/images/mnist_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/mnist_1.png
--------------------------------------------------------------------------------
/doc/images/mnist_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/mnist_2.png
--------------------------------------------------------------------------------
/doc/images/mnist_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/mnist_3.png
--------------------------------------------------------------------------------
/doc/images/mnist_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/mnist_4.png
--------------------------------------------------------------------------------
/doc/images/mnist_5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/mnist_5.png
--------------------------------------------------------------------------------
/doc/images/mylenet.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/mylenet.png
--------------------------------------------------------------------------------
/doc/images/rbm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/rbm.png
--------------------------------------------------------------------------------
/doc/images/rnnrbm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/rnnrbm.png
--------------------------------------------------------------------------------
/doc/images/sample1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/sample1.png
--------------------------------------------------------------------------------
/doc/images/sample2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/sample2.png
--------------------------------------------------------------------------------
/doc/images/samples.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/samples.png
--------------------------------------------------------------------------------
/doc/images/sparse_1D_nn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/sparse_1D_nn.png
--------------------------------------------------------------------------------
/doc/index.txt:
--------------------------------------------------------------------------------
1 | =======================
2 | Deep Learning Tutorials
3 | =======================
4 |
5 | Deep Learning is a new area of Machine Learning research, which
6 | has been introduced with the objective of moving Machine Learning
7 | closer to one of its original goals: Artificial Intelligence.
8 | See these course notes for a `brief introduction to Machine Learning for AI `_
9 | and an `introduction to Deep Learning algorithms `_.
10 |
11 | Deep Learning is about learning multiple levels of representation
12 | and abstraction that help to
13 | make sense of data such as images, sound, and text.
14 | For more about deep learning algorithms, see for example:
15 |
16 | - The monograph or review paper `Learning Deep Architectures for AI `_ (Foundations & Trends in Machine Learning, 2009).
17 | - The ICML 2009 Workshop on Learning Feature Hierarchies `webpage `_ has a `list of references `_.
18 | - The LISA `public wiki `_ has a `reading list `_ and a `bibliography `_.
19 | - Geoff Hinton has `readings `_ from 2009's `NIPS tutorial `_.
20 |
21 | The tutorials presented here will introduce you to some of the most important deep learning
22 | algorithms and will also show you how to run them using Theano_. Theano is a Python library that makes writing deep learning models easy, and gives the option of
23 | training them on a GPU.
24 |
25 | The algorithm tutorials have some prerequisites. You should know some python,
26 | and be familiar with numpy. Since this tutorial is about using Theano, you
27 | should read over the `Theano basic tutorial`_ first. Once you've done that,
28 | read through our :ref:`gettingstarted` chapter -- it introduces the notation, the [downloadable] datasets used in the algorithm tutorials, and the way we do optimization by stochastic gradient descent.
29 |
30 | The purely supervised learning algorithms are meant to be read in order:
31 |
32 | #. :ref:`Logistic Regression ` - using Theano for something simple
33 | #. :ref:`Multilayer perceptron ` - introduction to layers
34 | #. :ref:`Deep Convolutional Network ` - a simplified version of LeNet5
35 |
36 | The unsupervised and semi-supervised learning algorithms can be read in any
37 | order (the auto-encoders can be read independently of the RBM/DBN thread):
38 |
39 | * :ref:`Auto Encoders, Denoising Autoencoders ` - description of autoencoders
40 | * :ref:`Stacked Denoising Auto-Encoders ` - easy steps into unsupervised pre-training for deep nets
41 | * :ref:`Restricted Boltzmann Machines ` - single layer generative RBM model
42 | * :ref:`Deep Belief Networks ` - unsupervised generative pre-training of stacked RBMs followed by supervised fine-tuning
43 |
44 | Building towards including the mcRBM model, we have a new tutorial on sampling
45 | from energy models:
46 |
47 | * :ref:`HMC Sampling ` - hybrid (aka Hamiltonian) Monte-Carlo sampling with scan()
48 |
49 | Building towards including the Contractive auto-encoders tutorial, we have the code for now:
50 | * `Contractive auto-encoders`_ code - There is some basic doc in the code.
51 |
52 | Recurrent neural networks with word embeddings and context window:
53 | * :ref:`Semantic Parsing of Speech using Recurrent Net `
54 |
55 | LSTM network for sentiment analysis:
56 | * :ref:`LSTM network `
57 |
58 | Energy-based recurrent neural network (RNN-RBM):
59 | * :ref:`Modeling and generating sequences of polyphonic music `
60 |
61 | .. _Theano: http://deeplearning.net/software/theano
62 |
63 | .. _Theano basic tutorial: http://deeplearning.net/software/theano/tutorial
64 |
65 | .. _Contractive auto-encoders: https://github.com/lisa-lab/DeepLearningTutorials/blob/master/code/cA.py
66 |
--------------------------------------------------------------------------------
/doc/logreg.txt:
--------------------------------------------------------------------------------
1 | .. index:: Logistic Regression
2 |
3 | .. _logreg :
4 |
5 |
6 | Classifying MNIST digits using Logistic Regression
7 | ==================================================
8 |
9 | .. note::
10 | This section assumes familiarity with the following Theano
11 | concepts: `shared variables`_ , `basic arithmetic ops`_ , `T.grad`_ ,
12 | `floatX`_. If you intend to run the code on GPU also read `GPU`_.
13 |
14 | .. note::
15 | The code for this section is available for download `here`_.
16 |
17 | .. _here: http://deeplearning.net/tutorial/code/logistic_sgd.py
18 |
19 | .. _shared variables: http://deeplearning.net/software/theano/tutorial/examples.html#using-shared-variables
20 |
21 | .. _basic arithmetic ops: http://deeplearning.net/software/theano/tutorial/adding.html#adding-two-scalars
22 |
23 | .. _T.grad: http://deeplearning.net/software/theano/tutorial/examples.html#computing-gradients
24 |
25 | .. _floatX: http://deeplearning.net/software/theano/library/config.html#config.floatX
26 |
27 | .. _GPU: http://deeplearning.net/software/theano/tutorial/using_gpu.html
28 |
29 | In this section, we show how Theano can be used to implement the most basic
30 | classifier: the logistic regression. We start off with a quick primer of the
31 | model, which serves both as a refresher and as a way to anchor the notation and
32 | show how mathematical expressions are mapped onto Theano graphs.
33 |
34 | In the deepest of machine learning traditions, this tutorial will tackle the exciting
35 | problem of MNIST digit classification.
36 |
37 | The Model
38 | +++++++++
39 |
40 | Logistic regression is a probabilistic, linear classifier. It is parametrized
41 | by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is
42 | done by projecting an input vector onto a set of hyperplanes, each of which
43 | corresponds to a class. The distance from the input to a hyperplane reflects
44 | the probability that the input is a member of the corresponding class.
45 |
46 | Mathematically, the probability that an input vector :math:`x` is a member of a
47 | class :math:`i`, a value of a stochastic variable :math:`Y`, can be written as:
48 |
49 | .. math::
50 | P(Y=i|x, W,b) &= softmax_i(W x + b) \\
51 | &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}}
52 |
53 | The model's prediction :math:`y_{pred}` is the class whose probability is maximal, specifically:
54 |
55 | .. math::
56 | y_{pred} = {\rm argmax}_i P(Y=i|x,W,b)
57 |
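 | As a plain NumPy illustration of these two formulas (toy shapes, with each row
 | of :math:`W` holding one class hyperplane; this is not the tutorial's Theano
 | code, which follows below):
 | 
 | .. code-block:: python
 | 
 |     import numpy
 | 
 |     rng = numpy.random.RandomState(0)
 |     W = rng.randn(10, 784) * 0.01        # one row (hyperplane) per class
 |     b = numpy.zeros(10)
 |     x = rng.rand(784)                    # a toy input vector
 | 
 |     scores = numpy.dot(W, x) + b
 |     p_y_given_x = numpy.exp(scores) / numpy.sum(numpy.exp(scores))   # softmax
 |     y_pred = numpy.argmax(p_y_given_x)
 | 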
58 | The code to do this in Theano is the following:
59 |
60 | .. literalinclude:: ../code/logistic_sgd.py
61 | :start-after: start-snippet-1
62 | :end-before: end-snippet-1
63 |
64 | Since the parameters of the model must maintain a persistent state throughout
65 | training, we allocate shared variables for :math:`W,b`. This both declares them
66 | as symbolic Theano variables and initializes their contents. The
67 | dot and softmax operators are then used to compute the vector :math:`P(Y|x,
68 | W,b)`. The result ``p_y_given_x`` is a symbolic variable of vector-type.
69 |
70 | To get the actual model prediction, we can use the ``T.argmax`` operator, which
71 | will return the index at which ``p_y_given_x`` is maximal (i.e. the class with
72 | maximum probability).
73 |
74 | Now of course, the model we have defined so far does not do anything useful
75 | yet, since its parameters are still in their initial state. The following
76 | section will thus cover how to learn the optimal parameters.
77 |
78 |
79 | .. note::
80 | For a complete list of Theano ops, see: `list of ops `_
81 |
82 |
83 | Defining a Loss Function
84 | ++++++++++++++++++++++++
85 |
86 | Learning optimal model parameters involves minimizing a loss function. In the
87 | case of multi-class logistic regression, it is very common to use the negative
88 | log-likelihood as the loss. This is equivalent to maximizing the likelihood of the
89 | data set :math:`\cal{D}` under the model parameterized by :math:`\theta`. Let
90 | us first start by defining the likelihood :math:`\cal{L}` and loss
91 | :math:`\ell`:
92 |
93 | .. math::
94 |
95 | \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
96 | \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
97 | \ell (\theta=\{W,b\}, \mathcal{D}) = - \mathcal{L} (\theta=\{W,b\}, \mathcal{D})
98 |
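 | As a quick numerical illustration of these definitions (hypothetical
 | probabilities, plain NumPy):
 | 
 | .. code-block:: python
 | 
 |     import numpy
 | 
 |     # probabilities the model assigns to the correct class of three toy examples
 |     p_correct = numpy.array([0.9, 0.6, 0.2])
 |     log_likelihood = numpy.sum(numpy.log(p_correct))   # the likelihood term
 |     nll = -log_likelihood                              # the loss we minimize
 | 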
99 | While entire books are dedicated to the topic of minimization, gradient
100 | descent is by far the simplest method for minimizing arbitrary non-linear
101 | functions. This tutorial will use the method of stochastic gradient descent with
102 | mini-batches (MSGD). See :ref:`opt_SGD` for more details.
103 |
104 | The following Theano code defines the (symbolic) loss for a given minibatch:
105 |
106 | .. literalinclude:: ../code/logistic_sgd.py
107 | :start-after: start-snippet-2
108 | :end-before: end-snippet-2
109 |
110 | .. note::
111 |
112 | Even though the loss is formally defined as the *sum*, over the data set,
113 | of individual error terms, in practice, we use the *mean* (``T.mean``)
114 | in the code. This makes the choice of learning rate less dependent
115 | on the minibatch size.
116 |
117 |
118 | Creating a LogisticRegression class
119 | +++++++++++++++++++++++++++++++++++
120 |
121 | We now have all the tools we need to define a ``LogisticRegression`` class, which
122 | encapsulates the basic behaviour of logistic regression. The code is very
123 | similar to what we have covered so far, and should be self-explanatory.
124 |
125 | .. literalinclude:: ../code/logistic_sgd.py
126 | :pyobject: LogisticRegression
127 |
128 | We instantiate this class as follows:
129 |
130 | .. literalinclude:: ../code/logistic_sgd.py
131 | :start-after: index = T.lscalar()
132 | :end-before: # the cost we minimize during
133 |
134 | We start by allocating symbolic variables for the training inputs :math:`x` and
135 | their corresponding classes :math:`y`. Note that ``x`` and ``y`` are defined
136 | outside the scope of the ``LogisticRegression`` object. Since the class
137 | requires the input to build its graph, it is passed as a parameter of the
138 | ``__init__`` function. This is useful in case you want to connect instances of
139 | such classes to form a deep network. The output of one layer can be passed as
140 | the input of the layer above. (This tutorial does not build a multi-layer
141 | network, but this code will be reused in future tutorials that do.)
142 |
143 | Finally, we define a (symbolic) ``cost`` variable to minimize, using the instance
144 | method ``classifier.negative_log_likelihood``.
145 |
146 | .. literalinclude:: ../code/logistic_sgd.py
147 | :start-after: classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)
148 | :end-before: # compiling a Theano function that computes the mistakes
149 |
150 | Note that ``x`` is an implicit symbolic input to the definition of ``cost``,
151 | because the symbolic variables of ``classifier`` were defined in terms of ``x``
152 | at initialization.
153 |
154 | Learning the Model
155 | ++++++++++++++++++
156 |
157 | To implement MSGD in most programming languages (C/C++, Matlab, Python), one
158 | would start by manually deriving the expressions for the gradient of the loss
159 | with respect to the parameters: in this case :math:`\partial{\ell}/\partial{W}`,
160 | and :math:`\partial{\ell}/\partial{b}`. This can get pretty tricky for complex
161 | models, as expressions for :math:`\partial{\ell}/\partial{\theta}` can get
162 | fairly complex, especially when taking into account problems of numerical
163 | stability.
164 |
165 | With Theano, this work is greatly simplified. It performs
166 | automatic differentiation and applies certain math transforms to improve
167 | numerical stability.
168 |
169 | To get the gradients :math:`\partial{\ell}/\partial{W}` and
170 | :math:`\partial{\ell}/\partial{b}` in Theano, simply do the following:
171 |
172 | .. literalinclude:: ../code/logistic_sgd.py
173 | :start-after: # compute the gradient of cost
174 | :end-before: # start-snippet-3
175 |
176 | ``g_W`` and ``g_b`` are symbolic variables, which can be used as part
177 | of a computation graph. The function ``train_model``, which performs one step
178 | of gradient descent, can then be defined as follows:
179 |
180 | .. literalinclude:: ../code/logistic_sgd.py
181 | :start-after: start-snippet-3
182 | :end-before: end-snippet-3
183 |
184 | ``updates`` is a list of pairs. In each pair, the first element is the symbolic
185 | variable to be updated in the step, and the second element is the symbolic
186 | expression for calculating its new value. Similarly, ``givens`` is a dictionary
187 | whose keys are symbolic variables and whose values specify
188 | their replacements during the step. The function ``train_model`` is then defined such
189 | that:
190 |
191 | * the input is the mini-batch index ``index`` that, together with the batch
192 | size (which is not an input since it is fixed) defines :math:`x` with
193 | corresponding labels :math:`y`
194 | * the return value is the cost/loss associated with the x, y defined by
195 | the ``index``
196 | * on every function call, it will first replace ``x`` and ``y`` with the slices
197 | from the training set specified by ``index``. Then, it will evaluate the cost
198 | associated with that minibatch and apply the operations defined by the
199 | ``updates`` list.
200 |
201 | Each time ``train_model(index)`` is called, it will thus compute and return the
202 | cost of a minibatch, while also performing a step of MSGD. The entire learning
203 | algorithm thus consists in looping over all examples in the dataset, considering
204 | all the examples in one minibatch at a time,
205 | and repeatedly calling the ``train_model`` function.
206 |
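 | To make the mechanics of ``updates`` and ``givens`` concrete, here is a
 | stripped-down, self-contained sketch with toy variables (the names ``data``
 | and ``w`` are made up for illustration; this is not part of
 | ``logistic_sgd.py``):
 | 
 | .. code-block:: python
 | 
 |     import numpy
 |     import theano
 |     import theano.tensor as T
 | 
 |     # a toy "dataset" and a toy parameter, both stored in shared variables
 |     data = theano.shared(numpy.arange(12, dtype=theano.config.floatX).reshape(6, 2))
 |     w = theano.shared(numpy.ones(2, dtype=theano.config.floatX), name='w')
 | 
 |     index = T.lscalar('index')
 |     x = T.matrix('x')
 |     cost = T.sum(T.dot(x, w) ** 2)
 |     g_w = T.grad(cost, w)
 | 
 |     batch_size = 2
 |     train = theano.function(
 |         inputs=[index],
 |         outputs=cost,
 |         # pair: (shared variable to update, expression for its new value)
 |         updates=[(w, w - 0.1 * g_w)],
 |         # replace the symbolic x by a minibatch slice of the shared dataset
 |         givens={x: data[index * batch_size: (index + 1) * batch_size]}
 |     )
 | 
 |     print(train(0))   # cost on minibatch 0; w is updated as a side effect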
207 |
208 | Testing the model
209 | +++++++++++++++++
210 |
211 | As explained in :ref:`opt_learn_classifier`, when testing the model we are
212 | interested in the number of misclassified examples (and not only in the likelihood).
213 | The ``LogisticRegression`` class therefore has an extra instance method, which
214 | builds the symbolic graph for retrieving the number of misclassified examples in
215 | each minibatch.
216 |
217 | The code is as follows:
218 |
219 | .. literalinclude:: ../code/logistic_sgd.py
220 | :pyobject: LogisticRegression.errors
221 |
222 | We then create a function ``test_model`` and a function ``validate_model``,
223 | which we can call to retrieve this value. As you will see shortly,
224 | ``validate_model`` is key to our early-stopping implementation (see
225 | :ref:`opt_early_stopping`). These functions take a minibatch index and compute,
226 | for the examples in that minibatch, the number that were misclassified by the
227 | model. The only difference between them is that ``test_model`` draws its
228 | minibatches from the testing set, while ``validate_model`` draws its from the
229 | validation set.
230 |
231 | .. literalinclude:: ../code/logistic_sgd.py
232 | :start-after: cost = classifier.negative_log_likelihood(y)
233 | :end-before: # compute the gradient of cost
234 |
235 | Putting it All Together
236 | +++++++++++++++++++++++
237 |
238 | The finished product is as follows.
239 |
240 | .. literalinclude:: ../code/logistic_sgd.py
241 |
242 | The user can learn to classify MNIST digits with SGD logistic regression, by typing, from
243 | within the DeepLearningTutorials folder:
244 |
245 | .. code-block:: bash
246 |
247 | python code/logistic_sgd.py
248 |
249 | The output one should expect is of the form :
250 |
251 | .. code-block:: bash
252 |
253 | ...
254 | epoch 72, minibatch 83/83, validation error 7.510417 %
255 | epoch 72, minibatch 83/83, test error of best model 7.510417 %
256 | epoch 73, minibatch 83/83, validation error 7.500000 %
257 | epoch 73, minibatch 83/83, test error of best model 7.489583 %
258 | Optimization complete with best validation score of 7.500000 %,with test performance 7.489583 %
259 | The code run for 74 epochs, with 1.936983 epochs/sec
260 |
261 |
262 | On an Intel(R) Core(TM)2 Duo CPU E8400 @ 3.00 GHz the code runs with
263 | approximately 1.936 epochs/sec and it took 75 epochs to reach a test
264 | error of 7.489%. On the GPU the code does almost 10.0 epochs/sec. For this
265 | instance we used a batch size of 600.
266 |
267 |
268 | Prediction Using a Trained Model
269 | ++++++++++++++++++++++++++++++++
270 |
271 | ``sgd_optimization_mnist`` serializes and pickles the model each time a new
272 | lowest validation error is reached. We can reload this model and predict
273 | labels of new data. The ``predict`` function shows an example of how
274 | this could be done.
275 |
276 | .. literalinclude:: ../code/logistic_sgd.py
277 | :pyobject: predict
278 |
279 |
280 | .. rubric:: Footnotes
281 |
282 | .. [#f1] For smaller datasets and simpler models, more sophisticated descent
283 | algorithms can be more effective. The sample code
284 | `logistic_cg.py `_
285 | demonstrates how to use SciPy's conjugate gradient solver with Theano
286 | on the logistic regression task.
287 |
--------------------------------------------------------------------------------
/doc/lstm.txt:
--------------------------------------------------------------------------------
1 | .. _lstm:
2 |
3 | LSTM Networks for Sentiment Analysis
4 | **********************************************
5 |
6 | Summary
7 | +++++++
8 |
9 | This tutorial aims to provide an example of how a Recurrent Neural Network
10 | (RNN) using the Long Short-Term Memory (LSTM) architecture can be implemented
11 | using Theano. In this tutorial, this model is used to perform sentiment
12 | analysis on movie reviews from the `Large Movie Review Dataset
13 | `_, sometimes known as the
14 | IMDB dataset.
15 |
16 | In this task, given a movie review, the model attempts to predict whether it
17 | is positive or negative. This is a binary classification task.
18 |
19 | Data
20 | ++++
21 |
22 | As previously mentioned, the provided scripts are used to train an LSTM
23 | recurrent neural network on the Large Movie Review Dataset.
24 |
25 | While the dataset is public, in this tutorial we provide a copy of the dataset
26 | that has previously been preprocessed according to the needs of this LSTM
27 | implementation. Running the code provided in this tutorial will automatically
28 | download the data to the local directory. In order to use your own data, please
29 | use a (`preprocessing script
30 | `_)
31 | provided as a part of this tutorial.
32 |
33 | Once the model is trained, you can test it with your own corpus using the
34 | word-index dictionary
35 | (`imdb.dict.pkl.gz `_)
36 | provided as a part of this tutorial.
37 |
38 | Model
39 | +++++
40 |
41 | LSTM
42 | ====
43 |
44 | In a *traditional* recurrent neural network, during the gradient
45 | back-propagation phase, the gradient signal can end up being multiplied a
46 | large number of times (as many as the number of timesteps) by the weight
47 | matrix associated with the connections between the neurons of the recurrent
48 | hidden layer. This means that the magnitude of the weights in the transition
49 | matrix can have a strong impact on the learning process.
50 |
51 | If the weights in this matrix are small (or, more formally, if the leading
52 | eigenvalue of the weight matrix is smaller than 1.0), it can lead to a
53 | situation called *vanishing gradients* where the gradient signal gets so small
54 | that learning either becomes very slow or stops working altogether. It can
55 | also make it more difficult to learn long-term dependencies in the
56 | data. Conversely, if the weights in this matrix are large (or, again, more
57 | formally, if the leading eigenvalue of the weight matrix is larger than 1.0),
58 | it can lead to a situation where the gradient signal is so large that it can
59 | cause learning to diverge. This is often referred to as *exploding gradients*.
60 |
61 | These issues are the main motivation behind the LSTM model which introduces a
62 | new structure called a *memory cell* (see Figure 1 below). A memory cell is
63 | composed of four main elements: an input gate, a neuron with a self-recurrent
64 | connection (a connection to itself), a forget gate and an output gate. The
65 | self-recurrent connection has a weight of 1.0 and ensures that, barring any
66 | outside interference, the state of a memory cell can remain constant from one
67 | timestep to another. The gates serve to modulate the interactions between the
68 | memory cell itself and its environment. The input gate can allow incoming
69 | signal to alter the state of the memory cell or block it. On the other hand,
70 | the output gate can allow the state of the memory cell to have an effect on
71 | other neurons or prevent it. Finally, the forget gate can modulate the memory
72 | cell’s self-recurrent connection, allowing the cell to remember or forget its
73 | previous state, as needed.
74 |
75 | .. figure:: images/lstm_memorycell.png
76 | :align: center
77 |
78 | **Figure 1** : Illustration of an LSTM memory cell.
79 |
80 | The equations below describe how a layer of memory cells is updated at every
81 | timestep :math:`t`. In these equations :
82 |
83 | * :math:`x_t` is the input to the memory cell layer at time :math:`t`
84 | * :math:`W_i`, :math:`W_f`, :math:`W_c`, :math:`W_o`, :math:`U_i`,
85 | :math:`U_f`, :math:`U_c`, :math:`U_o` and :math:`V_o` are weight
86 | matrices
87 | * :math:`b_i`, :math:`b_f`, :math:`b_c` and :math:`b_o` are bias vectors
88 |
89 |
90 | First, we compute the values for :math:`i_t`, the input gate, and
91 | :math:`\widetilde{C_t}` the candidate value for the states of the memory
92 | cells at time :math:`t` :
93 |
94 | .. math::
95 | :label: 1
96 |
97 | i_t = \sigma(W_i x_t + U_i h_{t-1} + b_i)
98 |
99 | .. math::
100 | :label: 2
101 |
102 | \widetilde{C_t} = tanh(W_c x_t + U_c h_{t-1} + b_c)
103 |
104 | Second, we compute the value for :math:`f_t`, the activation of the memory
105 | cells' forget gates at time :math:`t` :
106 |
107 | .. math::
108 | :label: 3
109 |
110 | f_t = \sigma(W_f x_t + U_f h_{t-1} + b_f)
111 |
112 | Given the value of the input gate activation :math:`i_t`, the forget gate
113 | activation :math:`f_t` and the candidate state value :math:`\widetilde{C_t}`,
114 | we can compute :math:`C_t` the memory cells' new state at time :math:`t` :
115 |
116 | .. math::
117 | :label: 4
118 |
119 | C_t = i_t * \widetilde{C_t} + f_t * C_{t-1}
120 |
121 | With the new state of the memory cells, we can compute the value of their
122 | output gates and, subsequently, their outputs :
123 |
124 | .. math::
125 | :label: 5
126 |
127 | o_t = \sigma(W_o x_t + U_o h_{t-1} + V_o C_t + b_o)
128 |
129 | .. math::
130 | :label: 6
131 |
132 | h_t = o_t * tanh(C_t)
133 |
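To make these update equations concrete, here is a small NumPy sketch of a single timestep (an illustration only, not the tutorial's Theano code); the parameter dictionary ``p`` and its key names are assumptions made for the example, mirroring the symbols in the equations above.

.. code-block:: python

    import numpy

    def sigmoid(x):
        return 1.0 / (1.0 + numpy.exp(-x))

    def lstm_step(x_t, h_prev, C_prev, p):
        """One timestep of the LSTM layer described by equations (1)-(6).

        ``p`` is a dict of the weight matrices W_*, U_*, V_o and bias
        vectors b_* introduced above (a naming convention assumed here).
        """
        i_t = sigmoid(p['W_i'].dot(x_t) + p['U_i'].dot(h_prev) + p['b_i'])         # (1)
        C_tilde = numpy.tanh(p['W_c'].dot(x_t) + p['U_c'].dot(h_prev) + p['b_c'])  # (2)
        f_t = sigmoid(p['W_f'].dot(x_t) + p['U_f'].dot(h_prev) + p['b_f'])         # (3)
        C_t = i_t * C_tilde + f_t * C_prev                                         # (4)
        o_t = sigmoid(p['W_o'].dot(x_t) + p['U_o'].dot(h_prev)
                      + p['V_o'].dot(C_t) + p['b_o'])                              # (5)
        h_t = o_t * numpy.tanh(C_t)                                                # (6)
        return h_t, C_t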
134 | Our model
135 | =========
136 |
137 | The model we used in this tutorial is a variation of the standard LSTM model.
138 | In this variant, the activation of a cell’s output gate does not depend on the
139 | memory cell’s state :math:`C_t`. This allows us to perform part of the
140 | computation more efficiently (see the implementation note, below, for
141 | details). This means that, in the variant we have implemented, there is no
142 | matrix :math:`V_o` and equation :eq:`5` is replaced by equation :eq:`5-alt` :
143 |
144 | .. math::
145 | :label: 5-alt
146 |
147 | o_t = \sigma(W_o x_t + U_o h_{t-1} + b_o)
148 |
149 | Our model is composed of a single LSTM layer followed by an average pooling
150 | and a logistic regression layer as illustrated in Figure 2 below. Thus, from
151 | an input sequence :math:`x_0, x_1, x_2, ..., x_n`, the memory cells in the
152 | LSTM layer will produce a representation sequence :math:`h_0, h_1, h_2, ...,
153 | h_n`. This representation sequence is then averaged over all timesteps,
154 | resulting in a single representation :math:`h`. Finally, this representation is fed to a
155 | logistic regression layer whose target is the class label associated with the
156 | input sequence.
157 |
158 | .. figure:: images/lstm.png
159 | :align: center
160 |
161 | **Figure 2** : Illustration of the model used in this tutorial. It is
162 | composed of a single LSTM layer followed by mean pooling over time and
163 | logistic regression.
164 |
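In NumPy pseudo-code, the pooling and classification stages amount to the sketch below; ``U`` and ``b`` are hypothetical parameters of the logistic regression layer, and the tutorial code additionally weights the average with a mask so that padded timesteps within a minibatch are ignored.

.. code-block:: python

    import numpy

    def softmax(z):
        e = numpy.exp(z - z.max())
        return e / e.sum()

    def classify_sequence(h, U, b):
        # h: (n_timesteps, dim_proj) array of LSTM outputs h_0 ... h_n
        h_mean = h.mean(axis=0)                   # mean pooling over time
        p_y = softmax(numpy.dot(h_mean, U) + b)   # logistic regression layer
        return p_y.argmax(), p_y                  # predicted class and probabilities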
165 | **Implementation note** : In the code included in this tutorial, the equations
166 | :eq:`1`, :eq:`2`, :eq:`3` and :eq:`5-alt` are performed in parallel to make
167 | the computation more efficient. This is possible because none of these
168 | equations rely on a result produced by the other ones. It is achieved by
169 | concatenating the four matrices :math:`W_*` into a single weight matrix
170 | :math:`W` and performing the same concatenation on the weight matrices
171 | :math:`U_*` to produce the matrix :math:`U` and the bias vectors :math:`b_*`
172 | to produce the vector :math:`b`. Then, the pre-nonlinearity activations can
173 | be computed with :
174 |
175 | .. math::
176 |
177 | z = W x_t + U h_{t-1} + b
178 |
179 | The result is then sliced to obtain the pre-nonlinearity activations for
180 | :math:`i`, :math:`f`, :math:`\widetilde{C_t}`, and :math:`o` and the
181 | non-linearities are then applied independently for each.
182 |
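The concatenation and slicing can be sketched as follows in NumPy; this illustrative version implements the variant with equation :eq:`5-alt`, the gate ordering inside the concatenated matrices is arbitrary, and ``dim`` (the number of memory cells) is an assumption of the example.

.. code-block:: python

    import numpy

    def sigmoid(x):
        return 1.0 / (1.0 + numpy.exp(-x))

    def _slice(z, n, dim):
        # n-th block of size ``dim`` from the concatenated pre-activations
        return z[n * dim:(n + 1) * dim]

    def lstm_step_fused(x_t, h_prev, C_prev, W, U, b, dim):
        # W: (4 * dim, n_in), U: (4 * dim, dim), b: (4 * dim,)
        z = W.dot(x_t) + U.dot(h_prev) + b        # all pre-nonlinearity activations at once
        i_t = sigmoid(_slice(z, 0, dim))
        f_t = sigmoid(_slice(z, 1, dim))
        o_t = sigmoid(_slice(z, 2, dim))          # equation (5-alt): no V_o C_t term
        C_tilde = numpy.tanh(_slice(z, 3, dim))
        C_t = i_t * C_tilde + f_t * C_prev
        h_t = o_t * numpy.tanh(C_t)
        return h_t, C_t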
183 |
184 | Code - Citations - Contact
185 | ++++++++++++++++++++++++++
186 |
187 | Code
188 | ====
189 |
190 | The LSTM implementation can be found in the following two files:
191 |
192 | * `lstm.py `_ : Main script. Defines and trains the model.
193 |
194 | * `imdb.py `_ : Secondary script. Handles the loading and preprocessing of the IMDB dataset.
195 |
196 | After downloading both scripts and putting them in the same folder, the user
197 | can run the code by calling:
198 |
199 | .. code-block:: bash
200 |
201 | THEANO_FLAGS="floatX=float32" python lstm.py
202 |
203 | The script will automatically download the data and decompress it.
204 |
205 | **Note** : The provided code supports the Stochastic Gradient Descent (SGD),
206 | AdaDelta and RMSProp optimization methods. You are advised to use AdaDelta or
207 | RMSProp because SGD appears to perform poorly on this task with this
208 | particular model.
209 |
210 | Papers
211 | ======
212 |
213 | If you use this tutorial, please cite the following papers.
214 |
215 | Introduction of the LSTM model:
216 |
217 | * `[pdf] `__ Hochreiter, S., & Schmidhuber, J. (1997). Long short-term memory. Neural computation, 9(8), 1735-1780.
218 |
219 | Addition of the forget gate to the LSTM model:
220 |
221 | * `[pdf] `__ Gers, F. A., Schmidhuber, J., & Cummins, F. (2000). Learning to forget: Continual prediction with LSTM. Neural computation, 12(10), 2451-2471.
222 |
223 | More recent LSTM paper:
224 |
225 | * `[pdf] `__ Graves, Alex. Supervised sequence labelling with recurrent neural networks. Vol. 385. Springer, 2012.
226 |
227 | Papers related to Theano:
228 |
229 | * `[pdf] `__ Bastien, Frédéric, Lamblin, Pascal, Pascanu, Razvan, Bergstra, James, Goodfellow, Ian, Bergeron, Arnaud, Bouchard, Nicolas, and Bengio, Yoshua. Theano: new features and speed improvements. NIPS Workshop on Deep Learning and Unsupervised Feature Learning, 2012.
230 |
231 | * `[pdf] `__ Bergstra, James, Breuleux, Olivier, Bastien, Frédéric, Lamblin, Pascal, Pascanu, Razvan, Desjardins, Guillaume, Turian, Joseph, Warde-Farley, David, and Bengio, Yoshua. Theano: a CPU and GPU math expression compiler. In Proceedings of the Python for Scientific Computing Conference (SciPy), June 2010.
232 |
233 | Thank you!
234 |
235 | Contact
236 | =======
237 |
238 | Please email `Pierre Luc Carrier `_ or
239 | `Kyunghyun Cho `_ to report any problem or to give
240 | feedback. We will be glad to hear from you.
241 |
242 | References
243 | ++++++++++
244 |
245 | * Hochreiter, S., & Schmidhuber, J. (1997). Long short-term memory. Neural computation, 9(8), 1735-1780.
246 |
247 | * Gers, F. A., Schmidhuber, J., & Cummins, F. (2000). Learning to forget: Continual prediction with LSTM. Neural computation, 12(10), 2451-2471.
248 |
249 | * Graves, A. (2012). Supervised sequence labelling with recurrent neural networks (Vol. 385). Springer.
250 |
251 | * Hochreiter, S., Bengio, Y., Frasconi, P., & Schmidhuber, J. (2001). Gradient flow in recurrent nets: the difficulty of learning long-term dependencies.
252 |
253 | * Bengio, Y., Simard, P., & Frasconi, P. (1994). Learning long-term dependencies with gradient descent is difficult. Neural Networks, IEEE Transactions on, 5(2), 157-166.
254 |
255 | * Maas, A. L., Daly, R. E., Pham, P. T., Huang, D., Ng, A. Y., & Potts, C. (2011, June). Learning word vectors for sentiment analysis. In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies-Volume 1 (pp. 142-150). Association for Computational Linguistics.
256 |
--------------------------------------------------------------------------------
/doc/mlp.txt:
--------------------------------------------------------------------------------
1 | .. index:: Multilayer Perceptron
2 |
3 | .. _mlp:
4 |
5 |
6 | Multilayer Perceptron
7 | =====================
8 |
9 | .. note::
10 | This section assumes the reader has already read through :doc:`logreg`.
11 | Additionally, it uses the following new Theano functions and concepts:
12 | `T.tanh`_, `shared variables`_, `basic arithmetic ops`_, `T.grad`_,
13 | :ref:`L1_L2_regularization`, `floatX`_. If you intend to run the
14 | code on the GPU, also read `GPU`_.
15 |
16 | .. note::
17 | The code for this section is available for download `here`_.
18 |
19 | .. _here: http://deeplearning.net/tutorial/code/mlp.py
20 |
21 | .. _T.tanh: http://deeplearning.net/software/theano/tutorial/examples.html?highlight=tanh
22 |
23 | .. _shared variables: http://deeplearning.net/software/theano/tutorial/examples.html#using-shared-variables
24 |
25 | .. _basic arithmetic ops: http://deeplearning.net/software/theano/tutorial/adding.html#adding-two-scalars
26 |
27 | .. _T.grad: http://deeplearning.net/software/theano/tutorial/examples.html#computing-gradients
28 |
29 | .. _floatX: http://deeplearning.net/software/theano/library/config.html#config.floatX
30 |
31 | .. _GPU: http://deeplearning.net/software/theano/tutorial/using_gpu.html
32 |
33 |
34 | The next architecture we are going to present using Theano is the
35 | single-hidden-layer Multi-Layer Perceptron (MLP). An MLP can be viewed as a
36 | logistic regression classifier where the input is first transformed using a
37 | learnt non-linear transformation :math:`\Phi`. This transformation projects the
38 | input data into a space where it becomes linearly separable. This intermediate
39 | layer is referred to as a **hidden layer**. A single hidden layer is sufficient
40 | to make MLPs a **universal approximator**. However we will see later on that
41 | there are substantial benefits to using many such hidden layers, i.e. the very
42 | premise of **deep learning**. See these course notes for an `introduction to
43 | MLPs, the back-propagation algorithm, and how to train MLPs
44 | `_.
45 |
46 | This tutorial will again tackle the problem of MNIST digit classification.
47 |
48 | The Model
49 | +++++++++
50 |
51 | An MLP (or Artificial Neural Network - ANN) with a single hidden layer
52 | can be represented graphically as
53 | follows:
54 |
55 | .. figure:: images/mlp.png
56 | :align: center
57 |
58 | Formally, a one-hidden-layer MLP is a function :math:`f: R^D \rightarrow
59 | R^L`, where :math:`D` is the size of the input vector :math:`x` and :math:`L` is
60 | the size of the output vector :math:`f(x)`, such that, in matrix notation:
61 |
62 | .. math::
63 |
64 | f(x) = G( b^{(2)} + W^{(2)}( s( b^{(1)} + W^{(1)} x))),
65 |
66 | with bias vectors :math:`b^{(1)}`, :math:`b^{(2)}`; weight matrices
67 | :math:`W^{(1)}`, :math:`W^{(2)}` and activation functions :math:`G` and :math:`s`.
68 |
69 | The vector :math:`h(x) = \Phi(x) = s(b^{(1)} + W^{(1)} x)` constitutes the hidden layer.
70 | :math:`W^{(1)} \in R^{D \times D_h}` is the weight matrix connecting the input vector
71 | to the hidden layer. Each column :math:`W^{(1)}_{\cdot i}` represents the weights
72 | from the input units to the i-th hidden unit. Typical choices for :math:`s`
73 | include :math:`tanh`, with :math:`tanh(a)=(e^a-e^{-a})/(e^a+e^{-a})`,
74 | or the logistic :math:`sigmoid` function, with :math:`sigmoid(a)=1/(1+e^{-a})`. We will be using
75 | :math:`tanh` in this tutorial because it typically yields faster training
76 | (and sometimes also better local minima). Both the :math:`tanh`
77 | and :math:`sigmoid` functions are scalar-to-scalar, but their natural
78 | extension to vectors and tensors consists of applying them element-wise
79 | (i.e. separately to each element of the vector, yielding a same-size vector).
80 |
81 | The output vector is then obtained as: :math:`o(x) = G(b^{(2)} + W^{(2)} h(x))`.
82 | The reader should recognize the form we already used for
83 | :doc:`logreg`. As before,
84 | class-membership probabilities can be obtained by choosing :math:`G` as the
85 | :math:`softmax` function (in the case of multi-class classification).
86 |
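As a concrete reading of the formula, the forward pass can be written in a few lines of NumPy (a sketch with made-up shapes and randomly initialized parameters, not the Theano code used later in this tutorial):

.. code-block:: python

    import numpy

    def softmax(z):
        e = numpy.exp(z - z.max())
        return e / e.sum()

    def mlp_forward(x, W1, b1, W2, b2):
        h = numpy.tanh(numpy.dot(x, W1) + b1)     # hidden layer: s(b1 + W1 x) with s = tanh
        return softmax(numpy.dot(h, W2) + b2)     # output layer: G(b2 + W2 h) with G = softmax

    rng = numpy.random.RandomState(1234)
    D, D_h, L = 28 * 28, 500, 10                  # input, hidden and output sizes
    x = rng.rand(D)
    W1, b1 = 0.01 * rng.randn(D, D_h), numpy.zeros(D_h)
    W2, b2 = 0.01 * rng.randn(D_h, L), numpy.zeros(L)
    p_y_given_x = mlp_forward(x, W1, b1, W2, b2)  # vector of class-membership probabilities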
87 | To train an MLP, we learn **all** parameters of the model, and here we use
88 | :ref:`opt_SGD` with minibatches.
89 | The set of parameters to learn is the set :math:`\theta =
90 | \{W^{(2)},b^{(2)},W^{(1)},b^{(1)}\}`. Obtaining the gradients
91 | :math:`\partial{\ell}/\partial{\theta}` can be achieved through the
92 | **backpropagation algorithm** (a special case of the chain-rule of derivation).
93 | Thankfully, since Theano performs automatic differentiation, we will not need to
94 | cover this in the tutorial!
95 |
96 |
97 | Going from logistic regression to MLP
98 | +++++++++++++++++++++++++++++++++++++
99 |
100 | This tutorial will focus on a single-hidden-layer MLP. We start off by
101 | implementing a class that will represent a hidden layer. To
102 | construct the MLP we will then only need to throw a logistic regression
103 | layer on top.
104 |
105 | .. literalinclude:: ../code/mlp.py
106 | :start-after: start-snippet-1
107 | :end-before: end-snippet-1
108 |
109 | The initial values for the weights of a hidden layer :math:`i` should be uniformly
110 | sampled from a symmetric interval that depends on the activation function. For
111 | the :math:`tanh` activation function, results obtained in [Xavier10]_ show that the
112 | interval should be
113 | :math:`[-\sqrt{\frac{6}{fan_{in}+fan_{out}}},\sqrt{\frac{6}{fan_{in}+fan_{out}}}]`, where
114 | :math:`fan_{in}` is the number of units in the :math:`(i-1)`-th layer,
115 | and :math:`fan_{out}` is the number of units in the :math:`i`-th layer. For
116 | the sigmoid function the interval is :math:`[-4\sqrt{\frac{6}{fan_{in}+fan_{out}}},4\sqrt{\frac{6}{fan_{in}+fan_{out}}}]`.
117 | This initialization ensures that, early in training, each neuron operates in a
118 | regime of its activation function where information can easily be propagated
119 | both upward (activations flowing from inputs to outputs) and backward
120 | (gradients flowing from outputs to inputs).
121 |
122 | .. literalinclude:: ../code/mlp.py
123 | :start-after: end-snippet-1
124 | :end-before: lin_output = T.dot(input, self.W) + self.b
125 |
126 | Note that we used a given non-linear function as the activation function of the hidden layer. By default this is ``tanh``, but in many cases we might want
127 | to use something else.
128 |
129 | .. literalinclude:: ../code/mlp.py
130 | :start-after: self.b = b
131 | :end-before: # parameters of the model
132 |
133 | Looking back at the theory, this class implements the graph that computes
134 | the hidden layer value :math:`h(x) = \Phi(x) = s(b^{(1)} + W^{(1)} x)`.
135 | If you give this graph as input to the ``LogisticRegression`` class,
136 | implemented in the previous tutorial :doc:`logreg`, you get the output
137 | of the MLP. You can see this in the following short implementation of
138 | the ``MLP`` class.
139 |
140 | .. literalinclude:: ../code/mlp.py
141 | :start-after: start-snippet-2
142 | :end-before: end-snippet-2
143 |
144 | In this tutorial we will also use L1 and L2 regularization (see
145 | :ref:`L1_L2_regularization`). For this, we need to compute the L1 norm and the squared L2
146 | norm of the weights :math:`W^{(1)}, W^{(2)}`.
147 |
148 | .. literalinclude:: ../code/mlp.py
149 | :start-after: start-snippet-3
150 | :end-before: end-snippet-3
151 |
152 | As before, we train this model using stochastic gradient descent with
153 | mini-batches. The difference is that we modify the cost function to include the
154 | regularization term. ``L1_reg`` and ``L2_reg`` are the hyperparameters
155 | controlling the weight of these regularization terms in the total cost function.
156 | The code that computes the new cost is:
157 |
158 | .. literalinclude:: ../code/mlp.py
159 | :start-after: start-snippet-4
160 | :end-before: end-snippet-4
161 |
162 | We then update the parameters of the model using the gradient. This code is
163 | almost identical to the one for logistic regression; only the number of
164 | parameters differs. To get around this (and write code that could work
165 | for any number of parameters) we will use the list of parameters
166 | ``params`` that we created with the model and iterate over it, computing a gradient
167 | at each step.
168 |
169 | .. literalinclude:: ../code/mlp.py
170 | :start-after: start-snippet-5
171 | :end-before: end-snippet-5
172 |
173 | Putting it All Together
174 | +++++++++++++++++++++++
175 |
176 | Having covered the basic concepts, writing an MLP class becomes quite easy.
177 | The code below shows how this can be done, in a way which is analogous to our previous logistic regression implementation.
178 |
179 | .. literalinclude:: ../code/mlp.py
180 |
181 | The user can then run the code by calling :
182 |
183 | .. code-block:: bash
184 |
185 | python code/mlp.py
186 |
187 | The output one should expect is of the form :
188 |
189 | .. code-block:: bash
190 |
191 | Optimization complete. Best validation score of 1.690000 % obtained at iteration 2070000, with test performance 1.650000 %
192 | The code for file mlp.py ran for 97.34m
193 |
194 | On an Intel(R) Core(TM) i7-2600K CPU @ 3.40GHz the code runs at
195 | approximately 10.3 epochs/minute and it took 828 epochs to reach a test
196 | error of 1.65%.
197 |
198 | To put this into perspective, we refer the reader to the results section of `this
199 | `_ page.
200 |
201 | Tips and Tricks for training MLPs
202 | +++++++++++++++++++++++++++++++++
203 |
204 | There are several hyper-parameters in the above code, which are not (and,
205 | generally speaking, cannot be) optimized by gradient descent. Strictly speaking,
206 | finding an optimal set of values for these
207 | hyper-parameters is not a feasible problem. First, we can't simply optimize
208 | each of them independently. Second, we cannot readily apply gradient
209 | techniques that we described previously (partly because some parameters are
210 | discrete values and others are real-valued). Third, the optimization problem
211 | is not convex and finding a (local) minimum would involve a non-trivial
212 | amount of work.
213 |
214 | The good news is that over the last 25 years, researchers have devised various
215 | rules of thumb for choosing hyper-parameters in a neural network. A very
216 | good overview of these tricks can be found in `Efficient
217 | BackProp `_ by Yann LeCun,
218 | Leon Bottou, Genevieve Orr, and Klaus-Robert Mueller. Here, we summarize
219 | the same issues, with an emphasis on the parameters and techniques that we
220 | actually used in our code.
221 |
222 | Nonlinearity
223 | --------------
224 |
225 | Two of the most common nonlinearities are the :math:`sigmoid` and the :math:`tanh` function. For
226 | reasons explained in `Section 4.4 `_, nonlinearities that
227 | are symmetric around the origin are preferred because they tend to produce
228 | zero-mean inputs to the next layer (which is a desirable property).
229 | Empirically, we have observed that the :math:`tanh` has better convergence
230 | properties.
231 |
232 | Weight initialization
233 | ---------------------
234 |
235 | At initialization we want the weights to be small enough around the origin
236 | so that the activation function operates in its linear regime, where gradients are
237 | the largest. Other desirable properties, especially for deep networks,
238 | are to conserve variance of the activation as well as variance of back-propagated gradients from layer to layer.
239 | This allows information to flow well upward and downward in the network and
240 | reduces discrepancies between layers.
241 | Under some assumptions, a compromise between these two constraints leads to the following
242 | initialization: :math:`uniform[-\frac{\sqrt{6}}{\sqrt{fan_{in}+fan_{out}}},\frac{\sqrt{6}}{\sqrt{fan_{in}+fan_{out}}}]`
243 | for tanh and :math:`uniform[-4*\frac{\sqrt{6}}{\sqrt{fan_{in}+fan_{out}}},4*\frac{\sqrt{6}}{\sqrt{fan_{in}+fan_{out}}}]`
244 | for sigmoid, where :math:`fan_{in}` is the number of inputs and :math:`fan_{out}` the number of hidden units.
245 | For mathematical considerations please refer to [Xavier10]_.
246 |
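A NumPy sketch of this initialization scheme (function names and shapes are just for illustration):

.. code-block:: python

    import numpy

    def init_tanh_weights(rng, fan_in, fan_out):
        # sample uniformly from [-sqrt(6/(fan_in+fan_out)), sqrt(6/(fan_in+fan_out))]
        bound = numpy.sqrt(6.0 / (fan_in + fan_out))
        return rng.uniform(low=-bound, high=bound, size=(fan_in, fan_out))

    def init_sigmoid_weights(rng, fan_in, fan_out):
        # for sigmoid units the interval is 4 times larger
        return 4.0 * init_tanh_weights(rng, fan_in, fan_out)

    rng = numpy.random.RandomState(1234)
    W_hidden = init_tanh_weights(rng, fan_in=28 * 28, fan_out=500)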
247 | Learning rate
248 | --------------
249 |
250 | There is a great deal of literature on choosing a good learning rate. The
251 | simplest solution is to use a constant rate. Rule of thumb: try
252 | several log-spaced values (:math:`10^{-1},10^{-2},\ldots`) and narrow the
253 | (logarithmic) grid search to the region where you obtain the lowest
254 | validation error.
255 |
256 | Decreasing the learning rate over time is sometimes a good idea. One simple
257 | rule for doing that is :math:`\frac{\mu_0}{1 + d\times t}` where
258 | :math:`\mu_0` is the initial rate (chosen, perhaps, using the grid search
259 | technique explained above), :math:`d` is a so-called "decrease constant"
260 | which controls the rate at which the learning rate decreases (typically a
261 | small positive number, :math:`10^{-3}` or smaller) and :math:`t` is the
262 | epoch/stage.
263 |
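For example, the decay rule above is just one line of code (the numbers below are only an illustration):

.. code-block:: python

    def decayed_learning_rate(mu_0, d, t):
        # learning rate at epoch t, for initial rate mu_0 and decrease constant d
        return mu_0 / (1.0 + d * t)

    # with mu_0 = 0.1 and d = 1e-3, the rate halves after 1000 epochs
    print(decayed_learning_rate(0.1, 1e-3, 1000))   # 0.05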
264 | `Section 4.7 `_ details
265 | procedures for choosing a learning rate for each parameter (weight) in our
266 | network and for choosing them adaptively based on the error of the
267 | classifier.
268 |
269 | Number of hidden units
270 | -----------------------
271 |
272 | This hyper-parameter is very much dataset-dependent. Vaguely speaking, the
273 | more complicated the input distribution is, the more capacity the network
274 | will require to model it, and so the larger the number of hidden units that
275 | will be needed. (Note that the number of weights in a layer, perhaps a more direct
276 | measure of capacity, is :math:`D\times D_h`, where :math:`D` is the number of
277 | inputs and :math:`D_h` is the number of hidden units.)
278 |
279 | Unless we employ some regularization scheme (early stopping or L1/L2
280 | penalties), a typical number of hidden units vs. generalization performance graph will be U-shaped.
281 |
282 | Regularization parameter
283 | ------------------------
284 |
285 | Typical values to try for the L1/L2 regularization parameter :math:`\lambda`
286 | are :math:`10^{-2},10^{-3},\ldots`. In the framework that we described so
287 | far, optimizing this parameter will not lead to significantly better
288 | solutions, but is worth exploring nonetheless.
289 |
290 |
--------------------------------------------------------------------------------
/doc/references.txt:
--------------------------------------------------------------------------------
1 | .. _references:
2 |
3 | ==========
4 | References
5 | ==========
6 |
7 | .. [Bengio07] Y. Bengio, P. Lamblin, D. Popovici and H. Larochelle, `Greedy Layer-Wise Training of Deep Networks `_, in Advances in Neural Information Processing Systems 19 (NIPS'06), pages 153-160, MIT Press 2007.
8 |
9 | .. [Bengio09] Y. Bengio, `Learning deep architectures for AI `_, Foundations and Trends in Machine Learning 1(2) pages 1-127.
10 |
11 | .. [BengioDelalleau09] Y. Bengio, O. Delalleau, Justifying and Generalizing Contrastive Divergence (2009), Neural Computation, 21(6): 1601-1621.
12 |
13 | .. [BoulangerLewandowski12] N Boulanger-Lewandowski, Y. Bengio and P. Vincent, `Modeling Temporal Dependencies in High-Dimensional Sequences: Application to Polyphonic Music Generation and Transcription `_, in Proceedings of the 29th International Conference on Machine Learning (ICML), 2012.
14 |
15 | .. [Fukushima] Fukushima, K. (1980). Neocognitron: A self-organizing neural network model for a mechanism of pattern recognition unaffected by shift in position. Biological Cybernetics, 36, 193–202.
16 |
17 | .. [Hinton06] G.E. Hinton and R.R. Salakhutdinov, `Reducing the Dimensionality of Data with Neural Networks `_, Science, 28 July 2006, Vol. 313. no. 5786, pp. 504 - 507.
18 |
19 | .. [Hinton07] G.E. Hinton, S. Osindero, and Y. Teh, "A fast learning algorithm for deep belief nets", Neural Computation, vol 18, 2006
20 |
21 | .. [Hubel68] Hubel, D. and Wiesel, T. (1968). Receptive fields and functional architecture of monkey striate cortex. Journal of Physiology (London), 195, 215–243.
22 |
23 | .. [LeCun98] LeCun, Y., Bottou, L., Bengio, Y., and Haffner, P. (1998d). Gradient-based learning applied to document recognition. Proceedings of the IEEE, 86(11), 2278–2324.
24 |
25 | .. [Lee08] H. Lee, C. Ekanadham, and A.Y. Ng., `Sparse deep belief net model for visual area V2 `_, in Advances in Neural Information Processing Systems (NIPS) 20, 2008.
26 |
27 | .. [Lee09] H. Lee, R. Grosse, R. Ranganath, and A.Y. Ng, "Convolutional deep belief networks for scalable unsupervised learning of hierarchical representations.", ICML 2009
28 |
29 | .. [Ranzato10] M. Ranzato, A. Krizhevsky, G. Hinton, "Factored 3-Way Restricted Boltzmann Machines for Modeling Natural Images". Proc. of the 13-th International Conference on Artificial Intelligence and Statistics (AISTATS 2010), Italy, 2010
30 |
31 | .. [Ranzato07] M.A. Ranzato, C. Poultney, S. Chopra and Y. LeCun, in J. Platt et al., `Efficient Learning of Sparse Representations with an Energy-Based Model `_, Advances in Neural Information Processing Systems (NIPS 2006), MIT Press, 2007.
32 |
33 | .. [Serre07] Serre, T., Wolf, L., Bileschi, S., and Riesenhuber, M. (2007). Robust object recog- nition with cortex-like mechanisms. IEEE Trans. Pattern Anal. Mach. Intell., 29(3), 411–426. Member-Poggio, Tomaso.
34 |
35 | .. [Vincent08] P. Vincent, H. Larochelle Y. Bengio and P.A. Manzagol, `Extracting and Composing Robust Features with Denoising Autoencoders `_, Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML'08), pages 1096 - 1103, ACM, 2008.
36 |
37 | .. [Tieleman08] T. Tieleman, Training restricted boltzmann machines using approximations to the likelihood gradient, ICML 2008.
38 |
39 | .. [Xavier10] Y. Bengio, X. Glorot, Understanding the difficulty of training deep feedforward neural networks, AISTATS 2010
40 |
--------------------------------------------------------------------------------
/doc/rnnrbm.txt:
--------------------------------------------------------------------------------
1 | .. _rnnrbm:
2 |
3 | Modeling and generating sequences of polyphonic music with the RNN-RBM
4 | ========================================================================
5 |
6 | .. note::
7 | This tutorial demonstrates a basic implementation of the RNN-RBM as described in [BoulangerLewandowski12]_
8 | (`pdf `_).
9 | We assume the reader is familiar with
10 | `recurrent neural networks using the scan op `_
11 | and `restricted Boltzmann machines (RBM) `_.
12 |
13 | .. note::
14 | The code for this section is available for download here: `rnnrbm.py `_.
15 |
16 | You will need the modified `Python MIDI package (GPL license) `_ in your ``$PYTHONPATH`` or in the working directory in order to convert MIDI files to and from piano-rolls.
17 | The script also assumes that the content of the `Nottingham Database of folk tunes `_ has been extracted in the ``../data`` directory.
18 | Alternative MIDI datasets are available `here `_.
19 |
20 | Note that both dependencies above can be set up automatically by running the ``download.sh`` script in the ``../data`` directory.
21 |
22 | .. caution::
23 | Requires Theano 0.6 or more recent.
24 |
25 |
26 | The RNN-RBM
27 | +++++++++++++++++++++++++
28 |
29 | The RNN-RBM is an energy-based model for density estimation of temporal sequences, where the feature vector :math:`v^{(t)}` at time step :math:`t` may be high-dimensional.
30 | It can describe multimodal conditional distributions of :math:`v^{(t)}|\mathcal A^{(t)}`, where :math:`\mathcal A^{(t)}\equiv \{v_\tau|\tau<t\}` denotes the sequence history.
146 |
147 | .. figure:: images/sample2.png
148 | :scale: 60%
149 |
150 | Listen to `sample2.mid `_
151 |
152 |
153 | How to improve this code
154 | +++++++++++++++++++++++++
155 |
156 | The code shown in this tutorial is a stripped-down version that can be improved in the following ways:
157 |
158 | * Preprocessing: transposing the sequences into a common tonality (e.g. C major / minor) and normalizing the tempo in beats (quarternotes) per minute can have the most effect on the generative quality of the model.
159 | * Pretraining techniques: initialize the :math:`W,b_v,b_h` parameters with independent RBMs with fully shuffled frames (i.e. :math:`W_{uh}=W_{uv}=W_{uu}=W_{vu}=0`); initialize the :math:`W_{uv},W_{uu},W_{vu},b_u` parameters of the RNN with the auxiliary cross-entropy objective via either SGD or, preferably, Hessian-free optimization [BoulangerLewandowski12]_.
160 | * Optimization techniques: gradient clipping, Nesterov momentum and the use of NADE for conditional density estimation.
161 | * Hyperparameter search: learning rate (separately for the RBM and RNN parts), learning rate schedules, batch size, number of hidden units (recurrent and RBM), momentum coefficient, momentum schedule, Gibbs chain length :math:`k` and early stopping.
162 | * Learn the initial condition :math:`u^{(0)}` as a model parameter.
163 |
164 |
165 | A few samples generated with code including these features are available here: `sequences.zip `_.
166 |
167 |
--------------------------------------------------------------------------------
/doc/scripts/docgen.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import sys
3 | import os
4 | import shutil
5 |
6 | import getopt
7 | from collections import defaultdict
8 |
9 | if __name__ == '__main__':
10 |
11 | throot = "/".join(sys.path[0].split("/")[:-2])
12 |
13 | options = defaultdict(bool)
14 | output_arg = getopt.getopt(sys.argv[1:], 'o:', ['rst', 'help', 'nopdf'])[0]
15 | options.update(dict([x, y or True] for x, y in output_arg))
16 | if options['--help']:
17 | print('Usage: %s [OPTIONS]' % sys.argv[0])
18 | print(' -o : output the html files in the specified dir')
19 | print(' --rst: only compile the doc (requires sphinx)')
20 | print(' --nopdf: do not produce a PDF file from the doc, only HTML')
21 | print(' --help: this help')
22 | sys.exit(0)
23 |
24 | options['--all'] = not bool(options['--rst'])
25 |
26 | def mkdir(path):
27 | try:
28 | os.mkdir(path)
29 | except OSError:
30 | pass
31 |
32 | outdir = options['-o'] or (throot + '/html')
33 | mkdir(outdir)
34 | os.chdir(outdir)
35 | mkdir("doc")
36 |
37 | # Make sure the appropriate 'deeplearning' directory is in the PYTHONPATH
38 | pythonpath = os.environ.get('PYTHONPATH', '')
39 | pythonpath = throot + ':' + pythonpath
40 | os.environ['PYTHONPATH'] = pythonpath
41 |
42 | if options['--all'] or options['--rst']:
43 | import sphinx
44 | sys.path[0:0] = [os.path.join(throot, 'doc')]
45 | sphinx.main(['', '-E', os.path.join(throot, 'doc'), '.'])
46 |
47 | if not options['--nopdf']:
48 | # Generate latex file in a temp directory
49 | import tempfile
50 | workdir = tempfile.mkdtemp()
51 | sphinx.main(['', '-E', '-b', 'latex',
52 | os.path.join(throot, 'doc'), workdir])
53 | # Compile to PDF
54 | os.chdir(workdir)
55 | os.system('make')
56 | try:
57 | shutil.copy(os.path.join(workdir, 'deeplearning.pdf'), outdir)
58 | os.chdir(outdir)
59 | shutil.rmtree(workdir)
60 | except OSError as e:
61 | print('OSError:', e)
62 | except IOError as e:
63 | print('IOError:', e)
64 |
--------------------------------------------------------------------------------
/doc/utilities.txt:
--------------------------------------------------------------------------------
1 | =============
2 | Miscellaneous
3 | =============
4 |
5 | .. _how-to-plot:
6 |
7 | Plotting Samples and Filters
8 | ++++++++++++++++++++++++++++
9 |
10 | .. note::
11 | The code for this section is available for download `here`_.
12 |
13 | .. _here: http://deeplearning.net/tutorial/code/utils.py
14 |
15 |
16 | To plot a sample, what we need to do is take the visible units, which
17 | are a flattened image (there is no 2D structure to the visible units,
18 | just a 1D string of unit activations), and reshape them into a 2D image. The order in
19 | which the points from the 1D array go into the 2D image is given by the
20 | order in which the initial MNIST images were converted into a 1D array.
21 | Luckily for us, this is just a call to the ``numpy.reshape`` function.
22 |
23 | Plotting the weights is a bit more tricky. We have ``n_hidden`` hidden
24 | units, each of them corresponding to a column of the weight matrix. A
25 | column has the same shape as the visible vector, where the weight corresponding
26 | to the connection with visible unit `j` is at position `j`. Therefore,
27 | if we reshape every such column, using ``numpy.reshape``, we get a
28 | filter image that tells us how that hidden unit is influenced by
29 | the input image.
30 |
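For example, assuming 28x28 MNIST images and a weight matrix with one hidden unit per column (the arrays below are random stand-ins):

.. code-block:: python

    import numpy

    # a flattened image (one row of the data matrix) reshaped back to 2D
    sample = numpy.random.rand(28 * 28)
    image = sample.reshape((28, 28))

    # the filter of hidden unit j is the j-th column of W, reshaped the same way
    W = numpy.random.rand(28 * 28, 500)
    filter_j = W[:, 3].reshape((28, 28))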
31 | We need a utility function that takes a minibatch, or the weight matrix,
32 | and converts each row (for the weight matrix we do a transpose) into a
33 | 2D image, and then tiles these images together. Once we have converted the
34 | minibatch or the weights into this image of tiles, we can use PIL to plot
35 | and save it. `PIL `_ is a standard
36 | Python library for dealing with images.
37 |
38 | Tiling minibatches together is done for us by the
39 | ``tile_raster_images`` function which we provide here.
40 |
41 | .. code-block:: python
42 |
43 |
44 | def scale_to_unit_interval(ndar, eps=1e-8):
45 | """ Scales all values in the ndarray ndar to be between 0 and 1 """
46 | ndar = ndar.copy()
47 | ndar -= ndar.min()
48 | ndar *= 1.0 / (ndar.max() + eps)
49 | return ndar
50 |
51 |
52 | def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0),
53 | scale_rows_to_unit_interval=True,
54 | output_pixel_vals=True):
55 | """
56 | Transform an array with one flattened image per row, into an array in
57 | which images are reshaped and layed out like tiles on a floor.
58 |
59 | This function is useful for visualizing datasets whose rows are images,
60 | and also columns of matrices for transforming those rows
61 | (such as the first layer of a neural net).
62 |
63 | :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can
64 | be 2-D ndarrays or None;
65 | :param X: a 2-D array in which every row is a flattened image.
66 |
67 | :type img_shape: tuple; (height, width)
68 | :param img_shape: the original shape of each image
69 |
70 | :type tile_shape: tuple; (rows, cols)
71 | :param tile_shape: the number of images to tile (rows, cols)
72 |
73 | :param output_pixel_vals: if output should be pixel values (i.e. int8
74 | values) or floats
75 |
76 | :param scale_rows_to_unit_interval: if the values need to be scaled before
77 | being plotted to [0,1] or not
78 |
79 |
80 | :returns: array suitable for viewing as an image.
81 | (See:`Image.fromarray`.)
82 | :rtype: a 2-d array with same dtype as X.
83 |
84 | """
85 |
86 | assert len(img_shape) == 2
87 | assert len(tile_shape) == 2
88 | assert len(tile_spacing) == 2
89 |
90 | # The expression below can be re-written in a more C style as
91 | # follows :
92 | #
93 | # out_shape = [0,0]
94 | # out_shape[0] = (img_shape[0] + tile_spacing[0]) * tile_shape[0] -
95 | # tile_spacing[0]
96 | # out_shape[1] = (img_shape[1] + tile_spacing[1]) * tile_shape[1] -
97 | # tile_spacing[1]
98 | out_shape = [(ishp + tsp) * tshp - tsp for ishp, tshp, tsp
99 | in zip(img_shape, tile_shape, tile_spacing)]
100 |
101 | if isinstance(X, tuple):
102 | assert len(X) == 4
103 | # Create an output numpy ndarray to store the image
104 | if output_pixel_vals:
105 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4), dtype='uint8')
106 | else:
107 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4), dtype=X.dtype)
108 |
109 | #colors default to 0, alpha defaults to 1 (opaque)
110 | if output_pixel_vals:
111 | channel_defaults = [0, 0, 0, 255]
112 | else:
113 | channel_defaults = [0., 0., 0., 1.]
114 |
115 | for i in xrange(4):
116 | if X[i] is None:
117 | # if channel is None, fill it with zeros of the correct
118 | # dtype
119 | out_array[:, :, i] = numpy.zeros(out_shape,
120 | dtype='uint8' if output_pixel_vals else out_array.dtype
121 | ) + channel_defaults[i]
122 | else:
123 | # use a recurrent call to compute the channel and store it
124 | # in the output
125 | out_array[:, :, i] = tile_raster_images(X[i], img_shape, tile_shape, tile_spacing, scale_rows_to_unit_interval, output_pixel_vals)
126 | return out_array
127 |
128 | else:
129 | # if we are dealing with only one channel
130 | H, W = img_shape
131 | Hs, Ws = tile_spacing
132 |
133 | # generate a matrix to store the output
134 | out_array = numpy.zeros(out_shape, dtype='uint8' if output_pixel_vals else X.dtype)
135 |
136 |
137 | for tile_row in xrange(tile_shape[0]):
138 | for tile_col in xrange(tile_shape[1]):
139 | if tile_row * tile_shape[1] + tile_col < X.shape[0]:
140 | if scale_rows_to_unit_interval:
141 | # if we should scale values to be between 0 and 1
142 | # do this by calling the `scale_to_unit_interval`
143 | # function
144 | this_img = scale_to_unit_interval(X[tile_row * tile_shape[1] + tile_col].reshape(img_shape))
145 | else:
146 | this_img = X[tile_row * tile_shape[1] + tile_col].reshape(img_shape)
147 | # add the slice to the corresponding position in the
148 | # output array
149 | out_array[
150 | tile_row * (H+Hs): tile_row * (H + Hs) + H,
151 | tile_col * (W+Ws): tile_col * (W + Ws) + W
152 | ] \
153 | = this_img * (255 if output_pixel_vals else 1)
154 | return out_array
155 |
--------------------------------------------------------------------------------
/issues_closed/2_RBM_cost_fn.txt:
--------------------------------------------------------------------------------
1 | Reported by : Razvan
2 |
3 | Cost function (delta of free energy) has a reversed sign (i.e. free_energy(positive) - free_energy(negative) ). I'm not sure
4 | where the minus pops in .. but it is confusing when going from theory to code.
5 |
6 |
7 | FIXED
8 |
--------------------------------------------------------------------------------
/issues_open/1_SdA_performance.txt:
--------------------------------------------------------------------------------
1 | Reported by : Razvan
2 |
3 | Best performance for SdA float64 CPU : 1.23%
4 | float32 CPU : 1.30%
5 | target : 1.10%
6 |
7 | Possible reasons:
8 | - bug !?
9 | - random seed / weights initialization / finetuning early stopping parameters
10 |
--------------------------------------------------------------------------------
/issues_open/3_RBM_scan_GPU.txt:
--------------------------------------------------------------------------------
1 | Reported by : Razvan
2 |
3 | Scan is not GPU ready.. making the RBM tutorial slow on GPU (not tested yet).
4 | Quick fix is an optimization that removes scan if you're doing CD-1.
5 |
--------------------------------------------------------------------------------
/issues_open/4_RBM_scan.txt:
--------------------------------------------------------------------------------
1 | Reported by : Razvan
2 |
3 | The bug can be reproduced if you do :
4 | z = scan(..)
5 | c = f(z[-1])
6 | gp = T.grad(c, p, consider_constant = [ z[-1] ] )
7 |
8 | In this case grad will not consider z[-1] constant. Workaround:
9 |
10 | z = scan(..)
11 | z_1 = z[-1]
12 | c = f(z_1)
13 | gp = T.grad(c,p, consider_constant = [z_1])
14 |
15 | Note : I need to make sure this actually happens .. it might have been an
16 | artifact of something else when I first got this.
17 |
--------------------------------------------------------------------------------
/issues_open/5_results.txt:
--------------------------------------------------------------------------------
1 | Reported by : Razvan
2 |
3 | We should produce results + time for CPU float32 / CPU float64 / GPU. We should also
4 | specify the batchsize (or number of updates), pointing out that you can't always just
5 | compare the number of epochs.
6 |
--------------------------------------------------------------------------------
/issues_open/6_benchmarking_pybrain.txt:
--------------------------------------------------------------------------------
1 | Reported by : Razvan
2 |
3 | Observations :
4 |
5 | 1. First thing, working with their dataset model is a pain ! Either I have
6 | not figured it out, or it allows you to add only one datapoint at a time
7 | to the dataset. This seems highly suboptimal to me ...
8 |
9 | 2. You do not get minibatches for SGD ! The only thing you can do is compare with
10 | a batch size of 1.
11 |
12 | 3. Their early stopping is different from ours. Differences :
13 | - You cannot set how often you do a pass on the validation set
14 | (i.e. ``patience`` in our case). You always do one epoch of training
15 | and then you go through the validation set.
16 | - You do not have an improvement threshold; any improvement in
17 | validation score leads to storing the new best parameters, and
18 | increasing the time you will still look for better parameters
19 | - The increase is not by multiplication but by summation. So if at
20 | epoch x you do better on the validation set, you will go on for
21 | x+y epochs to look for something better ( we do x*y )
22 |
23 | 4. The errors returned by pyBrain are divided by the number of
24 | classes. So if you do classification, you take the number of
25 | errors and divide it by the number of test examples times the
26 | number of classes. For MNIST this yields 10 times smaller
27 | errors. Is this something standard .. should we do it ? It
28 | definitely makes errors look smaller.
29 |
30 | 5. There is no straightforward way of adding L1/L2 regularization (from
31 | what I've seen), unless you go into their code and change it. That is not
32 | hard to do .. but for now I do not want to meddle with the library
33 |
34 | 6. The code for RBM is not ready (they say that it is work in progress). It seems to me that the
35 | code is wrong .. They have 3 loops, which to me would mean that the innermost is for CD-k (the
36 | second is for one epoch / the third for training). But they update the weights after each Gibbs
37 | step in CD-k .. which results in a strange form of CD-1 that sees the same example several times before
38 | moving to the next one. I could (?) potentially fix the code but it is outside the scope of
39 | benchmarking.
40 |
41 | 7. There are question marks over how easy it would be to implement an SdA ( autoassociators might be
42 | easy to do though).
43 |
44 |
45 | RESULTS :
46 | logistic_sgd on maggie46
47 |
48 | Total error: 0.015611011103
49 | Total error: 0.00966772673335
50 | Total error: 0.00860664508883
51 | Time spend per epoch: 43.32
52 | Final error is : 10.44
53 | Time spend per epoch: 43.32
54 | Final error is : 10.44
55 |
56 | Arac :
57 |
58 | Total error: 0.0366924968888
59 | Total error: 0.0366576944937
60 | Total error: 0.0367442383338
61 | Time spend per epoch: 24.71
62 | Final error is : 69.28
63 | Time spend per epoch: 24.71
64 | Final error is : 69.28
65 |
66 |
67 | ** Our thing with batchsize =1 **
68 |
69 | test error of best model 8.45
70 | time : 12.99
71 | 12.01
72 |
73 |
74 |
75 |
76 | Results :
77 | mlp on maggie46
78 |
79 |
80 | pybrain ::
81 |
82 | Total error: 0.0124744609817
83 | Total error: 0.00722484141084
84 | Total error: 0.00599591269763
85 | Time spend per epoch : 1226.69
86 | Final error is : 8.68
87 | Time spend per epoch: 1226.69
88 | Final error is : 8.68
89 |
90 | 20.4448 min
91 |
92 | arac::
93 |
94 | Total error: 0.0318599056504
95 | Total error: 0.0316029246672
96 | Total error: 0.0315542295953
97 | Time spend per epoch: 860.336666667 (s)
98 | Final error is : 58.59
99 |
100 | our thing::
101 |
102 | test error of best model 3.88
103 | time: 381.92
104 |
105 |
--------------------------------------------------------------------------------
/misc/do_nightly_build:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #we set the compiledir to the /Tmp dir to make the test faster by bypassing the nfs network.
3 | date
4 | ROOT_CWD=/Tmp/nightly_build
5 | COMPILEDIR=/Tmp/lisa_theano_compile_dir_deeplearning
6 | NOSETESTS=${ROOT_CWD}/Theano/bin/theano-nose
7 |
8 | FLAGS=warn.ignore_bug_before=0.5,compiledir=${COMPILEDIR}
9 | export PYTHONPATH=${ROOT_CWD}/Theano:${ROOT_CWD}/Pylearn:$PYTHONPATH
10 |
11 | cd ${ROOT_CWD}/DeepLearningTutorials/data
12 | ./download.sh
13 |
14 | cd ${ROOT_CWD}/Theano
15 | echo "git version for Theano:" `git rev-parse HEAD`
16 | cd ${ROOT_CWD}/DeepLearningTutorials/code
17 | echo "git version:" `git rev-parse HEAD`
18 |
19 | #echo "executing nosetests with mode=FAST_COMPILE"
20 | #THEANO_FLAGS=${FLAGS},mode=FAST_COMPILE ${NOSETESTS}
21 | echo "executing nosetests speed with mode=FAST_RUN"
22 | THEANO_FLAGS=${FLAGS},mode=FAST_RUN ${NOSETESTS} test.py:speed
23 | #echo "executing nosetests speed with mode=FAST_RUN and OMP_NUM_THREADS=2"
24 | #OMP_NUM_THREADS=2 THEANO_FLAGS=${FLAGS},mode=FAST_RUN ${NOSETESTS} test.py:speed
25 | echo "executing nosetests with mode=FAST_RUN,floatX=float32"
26 | THEANO_FLAGS=${FLAGS},mode=FAST_RUN,floatX=float32 ${NOSETESTS}
27 |
28 | #we change the seed and record it every day to test different combinations. We record it to be able to reproduce bugs caused by a different seed. We don't want multiple tests in DEBUG_MODE each day as this takes too long.
29 | #seed=$RANDOM
30 | #echo "executing nosetests with mode=DEBUG_MODE with seed of the day $seed"
31 | #THEANO_DEBUGMODE_CHECK_STRIDES=0 THEANO_DEBUGMODE_PATIENCE=3 THEANO_COMPILEDIR=/Tmp/lisa_theano_compile_dir_deeplearning THEANO_UNITTEST_SEED=$seed THEANO_DEFAULT_MODE=DEBUG_MODE ${NOSETESTS}
32 |
33 |
--------------------------------------------------------------------------------