├── .gitignore
├── .hgignore
├── .travis.yml
├── LICENSE.txt
├── README.rst
├── code
    ├── DBN.py
    ├── SdA.py
    ├── cA.py
    ├── convolutional_mlp.py
    ├── dA.py
    ├── hmc
    │   ├── __init__.py
    │   ├── hmc.py
    │   └── test_hmc.py
    ├── imdb.py
    ├── imdb_preprocess.py
    ├── logistic_cg.py
    ├── logistic_sgd.py
    ├── lstm.py
    ├── mlp.py
    ├── rbm.py
    ├── rnnrbm.py
    ├── rnnslu.py
    ├── test.py
    └── utils.py
├── data
    ├── download.sh
    └── training_colorpatches_16x16_demo.mat
├── doc
    ├── .templates
    │   └── layout.html
    ├── DBN.txt
    ├── LICENSE.txt
    ├── Makefile
    ├── SdA.txt
    ├── conf.py
    ├── contents.txt
    ├── dA.txt
    ├── gettingstarted.txt
    ├── hmc.txt
    ├── images
    │   ├── 3wolfmoon.jpg
    │   ├── 3wolfmoon_output.png
    │   ├── DBN3.png
    │   ├── bm.png
    │   ├── cnn_explained.png
    │   ├── conv_1D_nn.png
    │   ├── filters_at_epoch_14.png
    │   ├── filters_corruption_0.png
    │   ├── filters_corruption_30.png
    │   ├── lstm.png
    │   ├── lstm_memorycell.png
    │   ├── markov_chain.png
    │   ├── mlp.png
    │   ├── mnist_0.png
    │   ├── mnist_1.png
    │   ├── mnist_2.png
    │   ├── mnist_3.png
    │   ├── mnist_4.png
    │   ├── mnist_5.png
    │   ├── mylenet.png
    │   ├── rbm.png
    │   ├── rnnrbm.png
    │   ├── rnnrbm.svg
    │   ├── sample1.png
    │   ├── sample2.png
    │   ├── samples.png
    │   └── sparse_1D_nn.png
    ├── index.txt
    ├── lenet.txt
    ├── logreg.txt
    ├── lstm.txt
    ├── mlp.txt
    ├── rbm.txt
    ├── references.txt
    ├── rnnrbm.txt
    ├── rnnslu.txt
    ├── scripts
    │   └── docgen.py
    └── utilities.txt
├── issues_closed
    └── 2_RBM_cost_fn.txt
├── issues_open
    ├── 1_SdA_performance.txt
    ├── 3_RBM_scan_GPU.txt
    ├── 4_RBM_scan.txt
    ├── 5_results.txt
    └── 6_benchmarking_pybrain.txt
└── misc
    └── do_nightly_build


/.gitignore:
--------------------------------------------------------------------------------
 1 | code/*.pyc
 2 | code/*_plots
 3 | code/tmp*
 4 | code/midi
 5 | code/rnnslu
 6 | data/atis.*
 7 | data/mnist.pkl.gz
 8 | data/mnist_py3k.pkl.gz
 9 | data/Nottingham.zip
10 | data/Nottingham
11 | data/midi.zip
12 | html
13 | *.pyc
14 | *~
15 | *.swp
16 | 


--------------------------------------------------------------------------------
/.hgignore:
--------------------------------------------------------------------------------
1 | syntax: glob
2 | *.pyc
3 | *.png
4 | *~
5 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | # After changing this file, check it on:
 2 | # http://lint.travis-ci.org/
 3 | 
 4 | language: python
 5 | #python:
 6 | #  - "2.7"
 7 | #  - "3.2"
 8 | # command to install dependencies
 9 | before_install:
10 |   - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
11 |   - chmod +x miniconda.sh
12 |   - ./miniconda.sh -b
13 |   - export PATH=/home/travis/miniconda/bin:$PATH
14 |   - conda update --yes conda
15 | 
16 | install:
17 |   - conda create --yes -q -n pyenv mkl python=2.7 numpy scipy pip nose yaml pyflakes pillow pyparsing=1.5
18 |   - source activate pyenv
19 |   - pip install git+git://github.com/Theano/Theano.git
20 | 
21 | env:
22 |   - PART="test.py:test_logistic_sgd test.py:test_logistic_cg test.py:test_mlp test.py:test_convolutional_mlp test.py:test_dA"
23 |   - PART="test.py:test_SdA"
24 |   - PART="test.py:test_dbn"
25 |   - PART="test.py:test_rbm test.py:test_rnnrbm"
26 |   - PART="-e test.py"
27 | 
28 | #i7-2600K CPU @ 3.40GHz
29 | #166.572s   #8      test.test_rbm OK
30 | #155.114s   #7      test.test_dbn OK
31 | #152.365s   #9      test.test_rnnrbm OK
32 | #127.286s   #6      test.test_SdA OK
33 | #39.252s    #5      test.test_dA OK
34 | #27.56s     #4      test.test_convolutional_mlp OK
35 | #15.454s    #3      test.test_mlp OK
36 | #12.732s    #1      test.test_logistic_sgd OK
37 | #12.638s    #2      test.test_logistic_cg OK
38 | 
39 | #i7-920
40 | #296.475s   #7      code.test.test_dbn OK
41 | #257.272s   #6      code.test.test_SdA OK
42 | #234.776s   #9      code.test.test_rnnrbm OK
43 | #233.896s   #8      code.test.test_rbm OK
44 | #65.737s    #5      code.test.test_dA OK
45 | #37.658s    #4      code.test.test_convolutional_mlp OK
46 | #24.172s    #3      code.test.test_mlp OK
47 | #20.401s    #1      code.test.test_logistic_sgd OK
48 | #17.546s    #2      code.test.test_logistic_cg OK
49 | 
50 | # On Core2 duo E8500 with MRG
51 | #308.004s   #7      code.test.test_dbn OK
52 | #277.268s   #6      code.test.test_SdA OK
53 | #126.102s   #8      code.test.test_rbm OK
54 | #123.652s   #9      code.test.test_rnnrbm OK
55 | #77.101s    #5      code.test.test_dA OK
56 | #39.75s     #4      code.test.test_convolutional_mlp OK
57 | #30.406s    #3      code.test.test_mlp OK
58 | #21.132s    #2      code.test.test_logistic_cg OK
59 | #17.945s    #1      code.test.test_logistic_sgd OK
60 | 
61 | # Unknown computer with older version of Theano
62 | #569.882s   #9      code.test.test_rbm OK
63 | #298.992s   #8      code.test.test_dbn OK
64 | #268.901s   #7      code.test.test_SdA OK
65 | #67.292s    #6      code.test.test_dA OK
66 | #27.485s    #4      code.test.test_mlp OK
67 | #26.204s    #5      code.test.test_convolutional_mlp OK
68 | #14.676s    #3      code.test.test_logistic_cg OK
69 | #10.66s     #2      code.test.test_logistic_sgd OK
70 | #5.795s     #1      code.hmc.test_hmc.test_hmc OK
71 | 
72 | script:
73 |   - cd data
74 |   - ./download.sh
75 |   - ls
76 |   - cd ../code
77 |   - pwd
78 |   - ls
79 |   - export THEANO_FLAGS=warn.ignore_bug_before=all,on_opt_error=raise,on_shape_error=raise
80 |   - python --version
81 |   - nosetests -v $PART
82 | 
83 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | .. _license:
 2 | 
 3 | LICENSE
 4 | =======
 5 | 
 6 | Copyright (c) 2010--2015, Deep Learning Tutorials Development Team
 7 | All rights reserved.
 8 | 
 9 | Redistribution and use in source and binary forms, with or without
10 | modification, are permitted provided that the following conditions are met:
11 | 
12 |     * Redistributions of source code must retain the above copyright
13 |       notice, this list of conditions and the following disclaimer.
14 |     * Redistributions in binary form must reproduce the above copyright
15 |       notice, this list of conditions and the following disclaimer in the
16 |       documentation and/or other materials provided with the distribution.
17 |     * Neither the name of Theano nor the names of its contributors may be
18 |       used to endorse or promote products derived from this software without
19 |       specific prior written permission.
20 | 
21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
22 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
25 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
28 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | Deep Learning Tutorials
 2 | =======================
 3 | 
 4 | Deep Learning is a new area of Machine Learning research, which has been
 5 | introduced with the objective of moving Machine Learning closer to one of its
 6 | original goals: Artificial Intelligence.  Deep Learning is about learning
 7 | multiple levels of representation and abstraction that help to make sense of
 8 | data such as images, sound, and text.  The tutorials presented here will
 9 | introduce you to some of the most important deep learning algorithms and will
10 | also show you how to run them using Theano.  Theano is a python library that
11 | makes writing deep learning models easy, and gives the option of training them
12 | on a GPU.
13 | 
14 | The easiest way to follow the tutorials is to `browse them online
15 | <http://deeplearning.net/tutorial/>`_.
16 | 
17 | `Main development <http://github.com/lisa-lab/DeepLearningTutorials>`_
18 | of this project.
19 | 
20 | .. image:: https://secure.travis-ci.org/lisa-lab/DeepLearningTutorials.png
21 |    :target: http://travis-ci.org/lisa-lab/DeepLearningTutorials
22 | 
23 | Project Layout
24 | --------------
25 | 
26 | Subdirectories:
27 | 
28 | - code - Python files corresponding to each tutorial
29 | - data - data and scripts to download data that is used by the tutorials
30 | - doc  - restructured text used by Sphinx to build the tutorial website
31 | - html - built automatically by doc/Makefile, contains tutorial website
32 | - issues_closed - issue tracking
33 | - issues_open - issue tracking
34 | - misc - administrative scripts
35 | 
36 | 
37 | Build instructions
38 | ------------------
39 | 
40 | To build the html version of the tutorials, install sphinx and run doc/Makefile
41 | 


--------------------------------------------------------------------------------
/code/cA.py:
--------------------------------------------------------------------------------
  1 | """This tutorial introduces Contractive auto-encoders (cA) using Theano.
  2 | 
  3 |  They are based on auto-encoders as the ones used in Bengio et
  4 |  al. 2007.  An autoencoder takes an input x and first maps it to a
  5 |  hidden representation y = f_{\theta}(x) = s(Wx+b), parameterized by
  6 |  \theta={W,b}. The resulting latent representation y is then mapped
  7 |  back to a "reconstructed" vector z \in [0,1]^d in input space z =
  8 |  g_{\theta'}(y) = s(W'y + b').  The weight matrix W' can optionally be
  9 |  constrained such that W' = W^T, in which case the autoencoder is said
 10 |  to have tied weights. The network is trained such that to minimize
 11 |  the reconstruction error (the error between x and z).  Adding the
 12 |  squared Frobenius norm of the Jacobian of the hidden mapping h with
 13 |  respect to the visible units yields the contractive auto-encoder:
 14 | 
 15 |       - \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)]
 16 |       + \| \frac{\partial h(x)}{\partial x} \|^2
 17 | 
 18 |  References :
 19 |    - S. Rifai, P. Vincent, X. Muller, X. Glorot, Y. Bengio: Contractive
 20 |    Auto-Encoders: Explicit Invariance During Feature Extraction, ICML-11
 21 | 
 22 |    - S. Rifai, X. Muller, X. Glorot, G. Mesnil, Y. Bengio, and Pascal
 23 |      Vincent. Learning invariant features through local space
 24 |      contraction. Technical Report 1360, Universite de Montreal
 25 | 
 26 |    - Y. Bengio, P. Lamblin, D. Popovici, H. Larochelle: Greedy Layer-Wise
 27 |    Training of Deep Networks, Advances in Neural Information Processing
 28 |    Systems 19, 2007
 29 | 
 30 | """
 31 | import os
 32 | import sys
 33 | import timeit
 34 | 
 35 | import numpy
 36 | 
 37 | import theano
 38 | import theano.tensor as T
 39 | 
 40 | 
 41 | from logistic_sgd import load_data
 42 | from utils import tile_raster_images
 43 | 
 44 | try:
 45 |     import PIL.Image as Image
 46 | except ImportError:
 47 |     import Image
 48 | 
 49 | 
 50 | class cA(object):
 51 |     """ Contractive Auto-Encoder class (cA)
 52 | 
 53 |     The contractive autoencoder tries to reconstruct the input with an
 54 |     additional constraint on the latent space. With the objective of
 55 |     obtaining a robust representation of the input space, we
 56 |     regularize the L2 norm(Froebenius) of the jacobian of the hidden
 57 |     representation with respect to the input. Please refer to Rifai et
 58 |     al.,2011 for more details.
 59 | 
 60 |     If x is the input then equation (1) computes the projection of the
 61 |     input into the latent space h. Equation (2) computes the jacobian
 62 |     of h with respect to x.  Equation (3) computes the reconstruction
 63 |     of the input, while equation (4) computes the reconstruction
 64 |     error and the added regularization term from Eq.(2).
 65 | 
 66 |     .. math::
 67 | 
 68 |         h_i = s(W_i x + b_i)                                             (1)
 69 | 
 70 |         J_i = h_i (1 - h_i) * W_i                                        (2)
 71 | 
 72 |         x' = s(W' h  + b')                                               (3)
 73 | 
 74 |         L = -sum_{k=1}^d [x_k \log x'_k + (1-x_k) \log( 1-x'_k)]
 75 |              + lambda * sum_{i=1}^d sum_{j=1}^n J_{ij}^2                 (4)
 76 | 
 77 |     """
 78 | 
 79 |     def __init__(self, numpy_rng, input=None, n_visible=784, n_hidden=100,
 80 |                  n_batchsize=1, W=None, bhid=None, bvis=None):
 81 |         """Initialize the cA class by specifying the number of visible units
 82 |         (the dimension d of the input), the number of hidden units (the
 83 |         dimension d' of the latent or hidden space) and the contraction level.
 84 |         The constructor also receives symbolic variables for the input, weights
 85 |         and bias.
 86 | 
 87 |         :type numpy_rng: numpy.random.RandomState
 88 |         :param numpy_rng: number random generator used to generate weights
 89 | 
 90 |         :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
 91 |         :param theano_rng: Theano random generator; if None is given
 92 |                      one is generated based on a seed drawn from `rng`
 93 | 
 94 |         :type input: theano.tensor.TensorType
 95 |         :param input: a symbolic description of the input or None for
 96 |                       standalone cA
 97 | 
 98 |         :type n_visible: int
 99 |         :param n_visible: number of visible units
100 | 
101 |         :type n_hidden: int
102 |         :param n_hidden:  number of hidden units
103 | 
104 |         :type n_batchsize int
105 |         :param n_batchsize: number of examples per batch
106 | 
107 |         :type W: theano.tensor.TensorType
108 |         :param W: Theano variable pointing to a set of weights that should be
109 |                   shared belong the dA and another architecture; if dA should
110 |                   be standalone set this to None
111 | 
112 |         :type bhid: theano.tensor.TensorType
113 |         :param bhid: Theano variable pointing to a set of biases values (for
114 |                      hidden units) that should be shared belong dA and another
115 |                      architecture; if dA should be standalone set this to None
116 | 
117 |         :type bvis: theano.tensor.TensorType
118 |         :param bvis: Theano variable pointing to a set of biases values (for
119 |                      visible units) that should be shared belong dA and another
120 |                      architecture; if dA should be standalone set this to None
121 | 
122 |         """
123 |         self.n_visible = n_visible
124 |         self.n_hidden = n_hidden
125 |         self.n_batchsize = n_batchsize
126 |         # note : W' was written as `W_prime` and b' as `b_prime`
127 |         if not W:
128 |             # W is initialized with `initial_W` which is uniformely sampled
129 |             # from -4*sqrt(6./(n_visible+n_hidden)) and
130 |             # 4*sqrt(6./(n_hidden+n_visible))the output of uniform if
131 |             # converted using asarray to dtype
132 |             # theano.config.floatX so that the code is runable on GPU
133 |             initial_W = numpy.asarray(
134 |                 numpy_rng.uniform(
135 |                     low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)),
136 |                     high=4 * numpy.sqrt(6. / (n_hidden + n_visible)),
137 |                     size=(n_visible, n_hidden)
138 |                 ),
139 |                 dtype=theano.config.floatX
140 |             )
141 |             W = theano.shared(value=initial_W, name='W', borrow=True)
142 | 
143 |         if not bvis:
144 |             bvis = theano.shared(value=numpy.zeros(n_visible,
145 |                                                    dtype=theano.config.floatX),
146 |                                  borrow=True)
147 | 
148 |         if not bhid:
149 |             bhid = theano.shared(value=numpy.zeros(n_hidden,
150 |                                                    dtype=theano.config.floatX),
151 |                                  name='b',
152 |                                  borrow=True)
153 | 
154 |         self.W = W
155 |         # b corresponds to the bias of the hidden
156 |         self.b = bhid
157 |         # b_prime corresponds to the bias of the visible
158 |         self.b_prime = bvis
159 |         # tied weights, therefore W_prime is W transpose
160 |         self.W_prime = self.W.T
161 | 
162 |         # if no input is given, generate a variable representing the input
163 |         if input is None:
164 |             # we use a matrix because we expect a minibatch of several
165 |             # examples, each example being a row
166 |             self.x = T.dmatrix(name='input')
167 |         else:
168 |             self.x = input
169 | 
170 |         self.params = [self.W, self.b, self.b_prime]
171 | 
172 |     def get_hidden_values(self, input):
173 |         """ Computes the values of the hidden layer """
174 |         return T.nnet.sigmoid(T.dot(input, self.W) + self.b)
175 | 
176 |     def get_jacobian(self, hidden, W):
177 |         """Computes the jacobian of the hidden layer with respect to
178 |         the input, reshapes are necessary for broadcasting the
179 |         element-wise product on the right axis
180 | 
181 |         """
182 |         return T.reshape(hidden * (1 - hidden),
183 |                          (self.n_batchsize, 1, self.n_hidden)) * T.reshape(
184 |                              W, (1, self.n_visible, self.n_hidden))
185 | 
186 |     def get_reconstructed_input(self, hidden):
187 |         """Computes the reconstructed input given the values of the
188 |         hidden layer
189 | 
190 |         """
191 |         return T.nnet.sigmoid(T.dot(hidden, self.W_prime) + self.b_prime)
192 | 
193 |     def get_cost_updates(self, contraction_level, learning_rate):
194 |         """ This function computes the cost and the updates for one trainng
195 |         step of the cA """
196 | 
197 |         y = self.get_hidden_values(self.x)
198 |         z = self.get_reconstructed_input(y)
199 |         J = self.get_jacobian(y, self.W)
200 |         # note : we sum over the size of a datapoint; if we are using
201 |         #        minibatches, L will be a vector, with one entry per
202 |         #        example in minibatch
203 |         self.L_rec = - T.sum(self.x * T.log(z) +
204 |                              (1 - self.x) * T.log(1 - z),
205 |                              axis=1)
206 | 
207 |         # Compute the jacobian and average over the number of samples/minibatch
208 |         self.L_jacob = T.sum(J ** 2) / self.n_batchsize
209 | 
210 |         # note : L is now a vector, where each element is the
211 |         #        cross-entropy cost of the reconstruction of the
212 |         #        corresponding example of the minibatch. We need to
213 |         #        compute the average of all these to get the cost of
214 |         #        the minibatch
215 |         cost = T.mean(self.L_rec) + contraction_level * T.mean(self.L_jacob)
216 | 
217 |         # compute the gradients of the cost of the `cA` with respect
218 |         # to its parameters
219 |         gparams = T.grad(cost, self.params)
220 |         # generate the list of updates
221 |         updates = []
222 |         for param, gparam in zip(self.params, gparams):
223 |             updates.append((param, param - learning_rate * gparam))
224 | 
225 |         return (cost, updates)
226 | 
227 | 
228 | def test_cA(learning_rate=0.01, training_epochs=20,
229 |             dataset='mnist.pkl.gz',
230 |             batch_size=10, output_folder='cA_plots', contraction_level=.1):
231 |     """
232 |     This demo is tested on MNIST
233 | 
234 |     :type learning_rate: float
235 |     :param learning_rate: learning rate used for training the contracting
236 |                           AutoEncoder
237 | 
238 |     :type training_epochs: int
239 |     :param training_epochs: number of epochs used for training
240 | 
241 |     :type dataset: string
242 |     :param dataset: path to the picked dataset
243 | 
244 |     """
245 |     datasets = load_data(dataset)
246 |     train_set_x, train_set_y = datasets[0]
247 | 
248 |     # compute number of minibatches for training, validation and testing
249 |     n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
250 | 
251 |     # allocate symbolic variables for the data
252 |     index = T.lscalar()    # index to a [mini]batch
253 |     x = T.matrix('x')  # the data is presented as rasterized images
254 | 
255 |     if not os.path.isdir(output_folder):
256 |         os.makedirs(output_folder)
257 |     os.chdir(output_folder)
258 |     ####################################
259 |     #        BUILDING THE MODEL        #
260 |     ####################################
261 | 
262 |     rng = numpy.random.RandomState(123)
263 | 
264 |     ca = cA(numpy_rng=rng, input=x,
265 |             n_visible=28 * 28, n_hidden=500, n_batchsize=batch_size)
266 | 
267 |     cost, updates = ca.get_cost_updates(contraction_level=contraction_level,
268 |                                         learning_rate=learning_rate)
269 | 
270 |     train_ca = theano.function(
271 |         [index],
272 |         [T.mean(ca.L_rec), ca.L_jacob],
273 |         updates=updates,
274 |         givens={
275 |             x: train_set_x[index * batch_size: (index + 1) * batch_size]
276 |         }
277 |     )
278 | 
279 |     start_time = timeit.default_timer()
280 | 
281 |     ############
282 |     # TRAINING #
283 |     ############
284 | 
285 |     # go through training epochs
286 |     for epoch in xrange(training_epochs):
287 |         # go through trainng set
288 |         c = []
289 |         for batch_index in xrange(n_train_batches):
290 |             c.append(train_ca(batch_index))
291 | 
292 |         c_array = numpy.vstack(c)
293 |         print 'Training epoch %d, reconstruction cost ' % epoch, numpy.mean(
294 |             c_array[0]), ' jacobian norm ', numpy.mean(numpy.sqrt(c_array[1]))
295 | 
296 |     end_time = timeit.default_timer()
297 | 
298 |     training_time = (end_time - start_time)
299 | 
300 |     print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
301 |                           ' ran for %.2fm' % ((training_time) / 60.))
302 |     image = Image.fromarray(tile_raster_images(
303 |         X=ca.W.get_value(borrow=True).T,
304 |         img_shape=(28, 28), tile_shape=(10, 10),
305 |         tile_spacing=(1, 1)))
306 | 
307 |     image.save('cae_filters.png')
308 | 
309 |     os.chdir('../')
310 | 
311 | 
312 | if __name__ == '__main__':
313 |     test_cA()
314 | 


--------------------------------------------------------------------------------
/code/convolutional_mlp.py:
--------------------------------------------------------------------------------
  1 | """This tutorial introduces the LeNet5 neural network architecture
  2 | using Theano.  LeNet5 is a convolutional neural network, good for
  3 | classifying images. This tutorial shows how to build the architecture,
  4 | and comes with all the hyper-parameters you need to reproduce the
  5 | paper's MNIST results.
  6 | 
  7 | 
  8 | This implementation simplifies the model in the following ways:
  9 | 
 10 |  - LeNetConvPool doesn't implement location-specific gain and bias parameters
 11 |  - LeNetConvPool doesn't implement pooling by average, it implements pooling
 12 |    by max.
 13 |  - Digit classification is implemented with a logistic regression rather than
 14 |    an RBF network
 15 |  - LeNet5 was not fully-connected convolutions at second layer
 16 | 
 17 | References:
 18 |  - Y. LeCun, L. Bottou, Y. Bengio and P. Haffner:
 19 |    Gradient-Based Learning Applied to Document
 20 |    Recognition, Proceedings of the IEEE, 86(11):2278-2324, November 1998.
 21 |    http://yann.lecun.com/exdb/publis/pdf/lecun-98.pdf
 22 | 
 23 | """
 24 | import os
 25 | import sys
 26 | import timeit
 27 | 
 28 | import numpy
 29 | 
 30 | import theano
 31 | import theano.tensor as T
 32 | from theano.tensor.signal import downsample
 33 | from theano.tensor.nnet import conv
 34 | 
 35 | from logistic_sgd import LogisticRegression, load_data
 36 | from mlp import HiddenLayer
 37 | 
 38 | 
 39 | class LeNetConvPoolLayer(object):
 40 |     """Pool Layer of a convolutional network """
 41 | 
 42 |     def __init__(self, rng, input, filter_shape, image_shape, poolsize=(2, 2)):
 43 |         """
 44 |         Allocate a LeNetConvPoolLayer with shared variable internal parameters.
 45 | 
 46 |         :type rng: numpy.random.RandomState
 47 |         :param rng: a random number generator used to initialize weights
 48 | 
 49 |         :type input: theano.tensor.dtensor4
 50 |         :param input: symbolic image tensor, of shape image_shape
 51 | 
 52 |         :type filter_shape: tuple or list of length 4
 53 |         :param filter_shape: (number of filters, num input feature maps,
 54 |                               filter height, filter width)
 55 | 
 56 |         :type image_shape: tuple or list of length 4
 57 |         :param image_shape: (batch size, num input feature maps,
 58 |                              image height, image width)
 59 | 
 60 |         :type poolsize: tuple or list of length 2
 61 |         :param poolsize: the downsampling (pooling) factor (#rows, #cols)
 62 |         """
 63 | 
 64 |         assert image_shape[1] == filter_shape[1]
 65 |         self.input = input
 66 | 
 67 |         # there are "num input feature maps * filter height * filter width"
 68 |         # inputs to each hidden unit
 69 |         fan_in = numpy.prod(filter_shape[1:])
 70 |         # each unit in the lower layer receives a gradient from:
 71 |         # "num output feature maps * filter height * filter width" /
 72 |         #   pooling size
 73 |         fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]) /
 74 |                    numpy.prod(poolsize))
 75 |         # initialize weights with random weights
 76 |         W_bound = numpy.sqrt(6. / (fan_in + fan_out))
 77 |         self.W = theano.shared(
 78 |             numpy.asarray(
 79 |                 rng.uniform(low=-W_bound, high=W_bound, size=filter_shape),
 80 |                 dtype=theano.config.floatX
 81 |             ),
 82 |             borrow=True
 83 |         )
 84 | 
 85 |         # the bias is a 1D tensor -- one bias per output feature map
 86 |         b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX)
 87 |         self.b = theano.shared(value=b_values, borrow=True)
 88 | 
 89 |         # convolve input feature maps with filters
 90 |         conv_out = conv.conv2d(
 91 |             input=input,
 92 |             filters=self.W,
 93 |             filter_shape=filter_shape,
 94 |             image_shape=image_shape
 95 |         )
 96 | 
 97 |         # downsample each feature map individually, using maxpooling
 98 |         pooled_out = downsample.max_pool_2d(
 99 |             input=conv_out,
100 |             ds=poolsize,
101 |             ignore_border=True
102 |         )
103 | 
104 |         # add the bias term. Since the bias is a vector (1D array), we first
105 |         # reshape it to a tensor of shape (1, n_filters, 1, 1). Each bias will
106 |         # thus be broadcasted across mini-batches and feature map
107 |         # width & height
108 |         self.output = T.tanh(pooled_out + self.b.dimshuffle('x', 0, 'x', 'x'))
109 | 
110 |         # store parameters of this layer
111 |         self.params = [self.W, self.b]
112 | 
113 |         # keep track of model input
114 |         self.input = input
115 | 
116 | 
117 | def evaluate_lenet5(learning_rate=0.1, n_epochs=200,
118 |                     dataset='mnist.pkl.gz',
119 |                     nkerns=[20, 50], batch_size=500):
120 |     """ Demonstrates lenet on MNIST dataset
121 | 
122 |     :type learning_rate: float
123 |     :param learning_rate: learning rate used (factor for the stochastic
124 |                           gradient)
125 | 
126 |     :type n_epochs: int
127 |     :param n_epochs: maximal number of epochs to run the optimizer
128 | 
129 |     :type dataset: string
130 |     :param dataset: path to the dataset used for training /testing (MNIST here)
131 | 
132 |     :type nkerns: list of ints
133 |     :param nkerns: number of kernels on each layer
134 |     """
135 | 
136 |     rng = numpy.random.RandomState(23455)
137 | 
138 |     datasets = load_data(dataset)
139 | 
140 |     train_set_x, train_set_y = datasets[0]
141 |     valid_set_x, valid_set_y = datasets[1]
142 |     test_set_x, test_set_y = datasets[2]
143 | 
144 |     # compute number of minibatches for training, validation and testing
145 |     n_train_batches = train_set_x.get_value(borrow=True).shape[0]
146 |     n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
147 |     n_test_batches = test_set_x.get_value(borrow=True).shape[0]
148 |     n_train_batches /= batch_size
149 |     n_valid_batches /= batch_size
150 |     n_test_batches /= batch_size
151 | 
152 |     # allocate symbolic variables for the data
153 |     index = T.lscalar()  # index to a [mini]batch
154 | 
155 |     # start-snippet-1
156 |     x = T.matrix('x')   # the data is presented as rasterized images
157 |     y = T.ivector('y')  # the labels are presented as 1D vector of
158 |                         # [int] labels
159 | 
160 |     ######################
161 |     # BUILD ACTUAL MODEL #
162 |     ######################
163 |     print '... building the model'
164 | 
165 |     # Reshape matrix of rasterized images of shape (batch_size, 28 * 28)
166 |     # to a 4D tensor, compatible with our LeNetConvPoolLayer
167 |     # (28, 28) is the size of MNIST images.
168 |     layer0_input = x.reshape((batch_size, 1, 28, 28))
169 | 
170 |     # Construct the first convolutional pooling layer:
171 |     # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
172 |     # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
173 |     # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
174 |     layer0 = LeNetConvPoolLayer(
175 |         rng,
176 |         input=layer0_input,
177 |         image_shape=(batch_size, 1, 28, 28),
178 |         filter_shape=(nkerns[0], 1, 5, 5),
179 |         poolsize=(2, 2)
180 |     )
181 | 
182 |     # Construct the second convolutional pooling layer
183 |     # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
184 |     # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
185 |     # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
186 |     layer1 = LeNetConvPoolLayer(
187 |         rng,
188 |         input=layer0.output,
189 |         image_shape=(batch_size, nkerns[0], 12, 12),
190 |         filter_shape=(nkerns[1], nkerns[0], 5, 5),
191 |         poolsize=(2, 2)
192 |     )
193 | 
194 |     # the HiddenLayer being fully-connected, it operates on 2D matrices of
195 |     # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
196 |     # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
197 |     # or (500, 50 * 4 * 4) = (500, 800) with the default values.
198 |     layer2_input = layer1.output.flatten(2)
199 | 
200 |     # construct a fully-connected sigmoidal layer
201 |     layer2 = HiddenLayer(
202 |         rng,
203 |         input=layer2_input,
204 |         n_in=nkerns[1] * 4 * 4,
205 |         n_out=500,
206 |         activation=T.tanh
207 |     )
208 | 
209 |     # classify the values of the fully-connected sigmoidal layer
210 |     layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)
211 | 
212 |     # the cost we minimize during training is the NLL of the model
213 |     cost = layer3.negative_log_likelihood(y)
214 | 
215 |     # create a function to compute the mistakes that are made by the model
216 |     test_model = theano.function(
217 |         [index],
218 |         layer3.errors(y),
219 |         givens={
220 |             x: test_set_x[index * batch_size: (index + 1) * batch_size],
221 |             y: test_set_y[index * batch_size: (index + 1) * batch_size]
222 |         }
223 |     )
224 | 
225 |     validate_model = theano.function(
226 |         [index],
227 |         layer3.errors(y),
228 |         givens={
229 |             x: valid_set_x[index * batch_size: (index + 1) * batch_size],
230 |             y: valid_set_y[index * batch_size: (index + 1) * batch_size]
231 |         }
232 |     )
233 | 
234 |     # create a list of all model parameters to be fit by gradient descent
235 |     params = layer3.params + layer2.params + layer1.params + layer0.params
236 | 
237 |     # create a list of gradients for all model parameters
238 |     grads = T.grad(cost, params)
239 | 
240 |     # train_model is a function that updates the model parameters by
241 |     # SGD Since this model has many parameters, it would be tedious to
242 |     # manually create an update rule for each model parameter. We thus
243 |     # create the updates list by automatically looping over all
244 |     # (params[i], grads[i]) pairs.
245 |     updates = [
246 |         (param_i, param_i - learning_rate * grad_i)
247 |         for param_i, grad_i in zip(params, grads)
248 |     ]
249 | 
250 |     train_model = theano.function(
251 |         [index],
252 |         cost,
253 |         updates=updates,
254 |         givens={
255 |             x: train_set_x[index * batch_size: (index + 1) * batch_size],
256 |             y: train_set_y[index * batch_size: (index + 1) * batch_size]
257 |         }
258 |     )
259 |     # end-snippet-1
260 | 
261 |     ###############
262 |     # TRAIN MODEL #
263 |     ###############
264 |     print '... training'
265 |     # early-stopping parameters
266 |     patience = 10000  # look as this many examples regardless
267 |     patience_increase = 2  # wait this much longer when a new best is
268 |                            # found
269 |     improvement_threshold = 0.995  # a relative improvement of this much is
270 |                                    # considered significant
271 |     validation_frequency = min(n_train_batches, patience / 2)
272 |                                   # go through this many
273 |                                   # minibatche before checking the network
274 |                                   # on the validation set; in this case we
275 |                                   # check every epoch
276 | 
277 |     best_validation_loss = numpy.inf
278 |     best_iter = 0
279 |     test_score = 0.
280 |     start_time = timeit.default_timer()
281 | 
282 |     epoch = 0
283 |     done_looping = False
284 | 
285 |     while (epoch < n_epochs) and (not done_looping):
286 |         epoch = epoch + 1
287 |         for minibatch_index in xrange(n_train_batches):
288 | 
289 |             iter = (epoch - 1) * n_train_batches + minibatch_index
290 | 
291 |             if iter % 100 == 0:
292 |                 print 'training @ iter = ', iter
293 |             cost_ij = train_model(minibatch_index)
294 | 
295 |             if (iter + 1) % validation_frequency == 0:
296 | 
297 |                 # compute zero-one loss on validation set
298 |                 validation_losses = [validate_model(i) for i
299 |                                      in xrange(n_valid_batches)]
300 |                 this_validation_loss = numpy.mean(validation_losses)
301 |                 print('epoch %i, minibatch %i/%i, validation error %f %%' %
302 |                       (epoch, minibatch_index + 1, n_train_batches,
303 |                        this_validation_loss * 100.))
304 | 
305 |                 # if we got the best validation score until now
306 |                 if this_validation_loss < best_validation_loss:
307 | 
308 |                     #improve patience if loss improvement is good enough
309 |                     if this_validation_loss < best_validation_loss *  \
310 |                        improvement_threshold:
311 |                         patience = max(patience, iter * patience_increase)
312 | 
313 |                     # save best validation score and iteration number
314 |                     best_validation_loss = this_validation_loss
315 |                     best_iter = iter
316 | 
317 |                     # test it on the test set
318 |                     test_losses = [
319 |                         test_model(i)
320 |                         for i in xrange(n_test_batches)
321 |                     ]
322 |                     test_score = numpy.mean(test_losses)
323 |                     print(('     epoch %i, minibatch %i/%i, test error of '
324 |                            'best model %f %%') %
325 |                           (epoch, minibatch_index + 1, n_train_batches,
326 |                            test_score * 100.))
327 | 
328 |             if patience <= iter:
329 |                 done_looping = True
330 |                 break
331 | 
332 |     end_time = timeit.default_timer()
333 |     print('Optimization complete.')
334 |     print('Best validation score of %f %% obtained at iteration %i, '
335 |           'with test performance %f %%' %
336 |           (best_validation_loss * 100., best_iter + 1, test_score * 100.))
337 |     print >> sys.stderr, ('The code for file ' +
338 |                           os.path.split(__file__)[1] +
339 |                           ' ran for %.2fm' % ((end_time - start_time) / 60.))
340 | 
341 | if __name__ == '__main__':
342 |     evaluate_lenet5()
343 | 
344 | 
345 | def experiment(state, channel):
346 |     evaluate_lenet5(state.learning_rate, dataset=state.dataset)
347 | 


--------------------------------------------------------------------------------
/code/hmc/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/code/hmc/__init__.py


--------------------------------------------------------------------------------
/code/hmc/test_hmc.py:
--------------------------------------------------------------------------------
 1 | import numpy
 2 | from scipy import linalg
 3 | import theano
 4 | 
 5 | from hmc import HMC_sampler
 6 | 
 7 | 
 8 | def sampler_on_nd_gaussian(sampler_cls, burnin, n_samples, dim=10):
 9 |     batchsize = 3
10 | 
11 |     rng = numpy.random.RandomState(123)
12 | 
13 |     # Define a covariance and mu for a gaussian
14 |     mu = numpy.array(rng.rand(dim) * 10, dtype=theano.config.floatX)
15 |     cov = numpy.array(rng.rand(dim, dim), dtype=theano.config.floatX)
16 |     cov = (cov + cov.T) / 2.
17 |     cov[numpy.arange(dim), numpy.arange(dim)] = 1.0
18 |     cov_inv = linalg.inv(cov)
19 | 
20 |     # Define energy function for a multi-variate Gaussian
21 |     def gaussian_energy(x):
22 |         return 0.5 * (theano.tensor.dot((x - mu), cov_inv) *
23 |                       (x - mu)).sum(axis=1)
24 | 
25 |     # Declared shared random variable for positions
26 |     position = rng.randn(batchsize, dim).astype(theano.config.floatX)
27 |     position = theano.shared(position)
28 | 
29 |     # Create HMC sampler
30 |     sampler = sampler_cls(position, gaussian_energy,
31 |                           initial_stepsize=1e-3, stepsize_max=0.5)
32 | 
33 |     # Start with a burn-in process
34 |     garbage = [sampler.draw() for r in xrange(burnin)]  # burn-in Draw
35 |     # `n_samples`: result is a 3D tensor of dim [n_samples, batchsize,
36 |     # dim]
37 |     _samples = numpy.asarray([sampler.draw() for r in xrange(n_samples)])
38 |     # Flatten to [n_samples * batchsize, dim]
39 |     samples = _samples.T.reshape(dim, -1).T
40 | 
41 |     print '****** TARGET VALUES ******'
42 |     print 'target mean:', mu
43 |     print 'target cov:\n', cov
44 | 
45 |     print '****** EMPIRICAL MEAN/COV USING HMC ******'
46 |     print 'empirical mean: ', samples.mean(axis=0)
47 |     print 'empirical_cov:\n', numpy.cov(samples.T)
48 | 
49 |     print '****** HMC INTERNALS ******'
50 |     print 'final stepsize', sampler.stepsize.get_value()
51 |     print 'final acceptance_rate', sampler.avg_acceptance_rate.get_value()
52 | 
53 |     return sampler
54 | 
55 | 
56 | def test_hmc():
57 |     sampler = sampler_on_nd_gaussian(HMC_sampler.new_from_shared_positions,
58 |                                      burnin=1000, n_samples=1000, dim=5)
59 |     assert abs(sampler.avg_acceptance_rate.get_value() -
60 |                sampler.target_acceptance_rate) < .1
61 |     assert sampler.stepsize.get_value() >= sampler.stepsize_min
62 |     assert sampler.stepsize.get_value() <= sampler.stepsize_max
63 | 


--------------------------------------------------------------------------------
/code/imdb.py:
--------------------------------------------------------------------------------
  1 | import cPickle
  2 | import gzip
  3 | import os
  4 | 
  5 | import numpy
  6 | import theano
  7 | 
  8 | 
  9 | def prepare_data(seqs, labels, maxlen=None):
 10 |     """Create the matrices from the datasets.
 11 | 
 12 |     This pad each sequence to the same lenght: the lenght of the
 13 |     longuest sequence or maxlen.
 14 | 
 15 |     if maxlen is set, we will cut all sequence to this maximum
 16 |     lenght.
 17 | 
 18 |     This swap the axis!
 19 |     """
 20 |     # x: a list of sentences
 21 |     lengths = [len(s) for s in seqs]
 22 | 
 23 |     if maxlen is not None:
 24 |         new_seqs = []
 25 |         new_labels = []
 26 |         new_lengths = []
 27 |         for l, s, y in zip(lengths, seqs, labels):
 28 |             if l < maxlen:
 29 |                 new_seqs.append(s)
 30 |                 new_labels.append(y)
 31 |                 new_lengths.append(l)
 32 |         lengths = new_lengths
 33 |         labels = new_labels
 34 |         seqs = new_seqs
 35 | 
 36 |         if len(lengths) < 1:
 37 |             return None, None, None
 38 | 
 39 |     n_samples = len(seqs)
 40 |     maxlen = numpy.max(lengths)
 41 | 
 42 |     x = numpy.zeros((maxlen, n_samples)).astype('int64')
 43 |     x_mask = numpy.zeros((maxlen, n_samples)).astype(theano.config.floatX)
 44 |     for idx, s in enumerate(seqs):
 45 |         x[:lengths[idx], idx] = s
 46 |         x_mask[:lengths[idx], idx] = 1.
 47 | 
 48 |     return x, x_mask, labels
 49 | 
 50 | 
 51 | def get_dataset_file(dataset, default_dataset, origin):
 52 |     '''Look for it as if it was a full path, if not, try local file,
 53 |     if not try in the data directory.
 54 | 
 55 |     Download dataset if it is not present
 56 | 
 57 |     '''
 58 |     data_dir, data_file = os.path.split(dataset)
 59 |     if data_dir == "" and not os.path.isfile(dataset):
 60 |         # Check if dataset is in the data directory.
 61 |         new_path = os.path.join(
 62 |             os.path.split(__file__)[0],
 63 |             "..",
 64 |             "data",
 65 |             dataset
 66 |         )
 67 |         if os.path.isfile(new_path) or data_file == default_dataset:
 68 |             dataset = new_path
 69 | 
 70 |     if (not os.path.isfile(dataset)) and data_file == default_dataset:
 71 |         import urllib
 72 |         print 'Downloading data from %s' % origin
 73 |         urllib.urlretrieve(origin, dataset)
 74 |     return dataset
 75 | 
 76 | 
 77 | def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None,
 78 |               sort_by_len=True):
 79 |     '''Loads the dataset
 80 | 
 81 |     :type path: String
 82 |     :param path: The path to the dataset (here IMDB)
 83 |     :type n_words: int
 84 |     :param n_words: The number of word to keep in the vocabulary.
 85 |         All extra words are set to unknow (1).
 86 |     :type valid_portion: float
 87 |     :param valid_portion: The proportion of the full train set used for
 88 |         the validation set.
 89 |     :type maxlen: None or positive int
 90 |     :param maxlen: the max sequence length we use in the train/valid set.
 91 |     :type sort_by_len: bool
 92 |     :name sort_by_len: Sort by the sequence lenght for the train,
 93 |         valid and test set. This allow faster execution as it cause
 94 |         less padding per minibatch. Another mechanism must be used to
 95 |         shuffle the train set at each epoch.
 96 | 
 97 |     '''
 98 | 
 99 |     #############
100 |     # LOAD DATA #
101 |     #############
102 | 
103 |     # Load the dataset
104 |     path = get_dataset_file(
105 |         path, "imdb.pkl",
106 |         "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl")
107 | 
108 |     if path.endswith(".gz"):
109 |         f = gzip.open(path, 'rb')
110 |     else:
111 |         f = open(path, 'rb')
112 | 
113 |     train_set = cPickle.load(f)
114 |     test_set = cPickle.load(f)
115 |     f.close()
116 |     if maxlen:
117 |         new_train_set_x = []
118 |         new_train_set_y = []
119 |         for x, y in zip(train_set[0], train_set[1]):
120 |             if len(x) < maxlen:
121 |                 new_train_set_x.append(x)
122 |                 new_train_set_y.append(y)
123 |         train_set = (new_train_set_x, new_train_set_y)
124 |         del new_train_set_x, new_train_set_y
125 | 
126 |     # split training set into validation set
127 |     train_set_x, train_set_y = train_set
128 |     n_samples = len(train_set_x)
129 |     sidx = numpy.random.permutation(n_samples)
130 |     n_train = int(numpy.round(n_samples * (1. - valid_portion)))
131 |     valid_set_x = [train_set_x[s] for s in sidx[n_train:]]
132 |     valid_set_y = [train_set_y[s] for s in sidx[n_train:]]
133 |     train_set_x = [train_set_x[s] for s in sidx[:n_train]]
134 |     train_set_y = [train_set_y[s] for s in sidx[:n_train]]
135 | 
136 |     train_set = (train_set_x, train_set_y)
137 |     valid_set = (valid_set_x, valid_set_y)
138 | 
139 |     def remove_unk(x):
140 |         return [[1 if w >= n_words else w for w in sen] for sen in x]
141 | 
142 |     test_set_x, test_set_y = test_set
143 |     valid_set_x, valid_set_y = valid_set
144 |     train_set_x, train_set_y = train_set
145 | 
146 |     train_set_x = remove_unk(train_set_x)
147 |     valid_set_x = remove_unk(valid_set_x)
148 |     test_set_x = remove_unk(test_set_x)
149 | 
150 |     def len_argsort(seq):
151 |         return sorted(range(len(seq)), key=lambda x: len(seq[x]))
152 | 
153 |     if sort_by_len:
154 |         sorted_index = len_argsort(test_set_x)
155 |         test_set_x = [test_set_x[i] for i in sorted_index]
156 |         test_set_y = [test_set_y[i] for i in sorted_index]
157 | 
158 |         sorted_index = len_argsort(valid_set_x)
159 |         valid_set_x = [valid_set_x[i] for i in sorted_index]
160 |         valid_set_y = [valid_set_y[i] for i in sorted_index]
161 | 
162 |         sorted_index = len_argsort(train_set_x)
163 |         train_set_x = [train_set_x[i] for i in sorted_index]
164 |         train_set_y = [train_set_y[i] for i in sorted_index]
165 | 
166 |     train = (train_set_x, train_set_y)
167 |     valid = (valid_set_x, valid_set_y)
168 |     test = (test_set_x, test_set_y)
169 | 
170 |     return train, valid, test
171 | 


--------------------------------------------------------------------------------
/code/imdb_preprocess.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This script is what created the dataset pickled.
  3 | 
  4 | 1) You need to download this file and put it in the same directory as this file.
  5 | https://github.com/moses-smt/mosesdecoder/raw/master/scripts/tokenizer/tokenizer.perl . Give it execution permission.
  6 | 
  7 | 2) Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/ and extract it in the current directory.
  8 | 
  9 | 3) Then run this script.
 10 | """
 11 | 
 12 | dataset_path='/Tmp/bastienf/aclImdb/'
 13 | 
 14 | import numpy
 15 | import cPickle as pkl
 16 | 
 17 | from collections import OrderedDict
 18 | 
 19 | import glob
 20 | import os
 21 | 
 22 | from subprocess import Popen, PIPE
 23 | 
 24 | # tokenizer.perl is from Moses: https://github.com/moses-smt/mosesdecoder/tree/master/scripts/tokenizer
 25 | tokenizer_cmd = ['./tokenizer.perl', '-l', 'en', '-q', '-']
 26 | 
 27 | 
 28 | def tokenize(sentences):
 29 | 
 30 |     print 'Tokenizing..',
 31 |     text = "\n".join(sentences)
 32 |     tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE)
 33 |     tok_text, _ = tokenizer.communicate(text)
 34 |     toks = tok_text.split('\n')[:-1]
 35 |     print 'Done'
 36 | 
 37 |     return toks
 38 | 
 39 | 
 40 | def build_dict(path):
 41 |     sentences = []
 42 |     currdir = os.getcwd()
 43 |     os.chdir('%s/pos/' % path)
 44 |     for ff in glob.glob("*.txt"):
 45 |         with open(ff, 'r') as f:
 46 |             sentences.append(f.readline().strip())
 47 |     os.chdir('%s/neg/' % path)
 48 |     for ff in glob.glob("*.txt"):
 49 |         with open(ff, 'r') as f:
 50 |             sentences.append(f.readline().strip())
 51 |     os.chdir(currdir)
 52 | 
 53 |     sentences = tokenize(sentences)
 54 | 
 55 |     print 'Building dictionary..',
 56 |     wordcount = dict()
 57 |     for ss in sentences:
 58 |         words = ss.strip().lower().split()
 59 |         for w in words:
 60 |             if w not in wordcount:
 61 |                 wordcount[w] = 1
 62 |             else:
 63 |                 wordcount[w] += 1
 64 | 
 65 |     counts = wordcount.values()
 66 |     keys = wordcount.keys()
 67 | 
 68 |     sorted_idx = numpy.argsort(counts)[::-1]
 69 | 
 70 |     worddict = dict()
 71 | 
 72 |     for idx, ss in enumerate(sorted_idx):
 73 |         worddict[keys[ss]] = idx+2  # leave 0 and 1 (UNK)
 74 | 
 75 |     print numpy.sum(counts), ' total words ', len(keys), ' unique words'
 76 | 
 77 |     return worddict
 78 | 
 79 | 
 80 | def grab_data(path, dictionary):
 81 |     sentences = []
 82 |     currdir = os.getcwd()
 83 |     os.chdir(path)
 84 |     for ff in glob.glob("*.txt"):
 85 |         with open(ff, 'r') as f:
 86 |             sentences.append(f.readline().strip())
 87 |     os.chdir(currdir)
 88 |     sentences = tokenize(sentences)
 89 | 
 90 |     seqs = [None] * len(sentences)
 91 |     for idx, ss in enumerate(sentences):
 92 |         words = ss.strip().lower().split()
 93 |         seqs[idx] = [dictionary[w] if w in dictionary else 1 for w in words]
 94 | 
 95 |     return seqs
 96 | 
 97 | 
 98 | def main():
 99 |     # Get the dataset from http://ai.stanford.edu/~amaas/data/sentiment/
100 |     path = dataset_path
101 |     dictionary = build_dict(os.path.join(path, 'train'))
102 | 
103 |     train_x_pos = grab_data(path+'train/pos', dictionary)
104 |     train_x_neg = grab_data(path+'train/neg', dictionary)
105 |     train_x = train_x_pos + train_x_neg
106 |     train_y = [1] * len(train_x_pos) + [0] * len(train_x_neg)
107 | 
108 |     test_x_pos = grab_data(path+'test/pos', dictionary)
109 |     test_x_neg = grab_data(path+'test/neg', dictionary)
110 |     test_x = test_x_pos + test_x_neg
111 |     test_y = [1] * len(test_x_pos) + [0] * len(test_x_neg)
112 | 
113 |     f = open('imdb.pkl', 'wb')
114 |     pkl.dump((train_x, train_y), f, -1)
115 |     pkl.dump((test_x, test_y), f, -1)
116 |     f.close()
117 | 
118 |     f = open('imdb.dict.pkl', 'wb')
119 |     pkl.dump(dictionary, f, -1)
120 |     f.close()
121 | 
122 | if __name__ == '__main__':
123 |     main()
124 | 


--------------------------------------------------------------------------------
/code/logistic_cg.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This tutorial introduces logistic regression using Theano and conjugate
  3 | gradient descent.
  4 | 
  5 | Logistic regression is a probabilistic, linear classifier. It is parametrized
  6 | by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is
  7 | done by projecting data points onto a set of hyperplanes, the distance to
  8 | which is used to determine a class membership probability.
  9 | 
 10 | Mathematically, this can be written as:
 11 | 
 12 | .. math::
 13 |   P(Y=i|x, W,b) &= softmax_i(W x + b) \\
 14 |                 &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}}
 15 | 
 16 | 
 17 | The output of the model or prediction is then done by taking the argmax of
 18 | the vector whose i'th element is P(Y=i|x).
 19 | 
 20 | .. math::
 21 | 
 22 |   y_{pred} = argmax_i P(Y=i|x,W,b)
 23 | 
 24 | 
 25 | This tutorial presents a conjugate gradient optimization method that is
 26 | suitable for smaller datasets.
 27 | 
 28 | 
 29 | References:
 30 | 
 31 |    - textbooks: "Pattern Recognition and Machine Learning" -
 32 |                  Christopher M. Bishop, section 4.3.2
 33 | 
 34 | 
 35 | """
 36 | __docformat__ = 'restructedtext en'
 37 | 
 38 | 
 39 | import os
 40 | import sys
 41 | import timeit
 42 | 
 43 | import numpy
 44 | 
 45 | import theano
 46 | import theano.tensor as T
 47 | 
 48 | from logistic_sgd import load_data
 49 | 
 50 | 
 51 | class LogisticRegression(object):
 52 |     """Multi-class Logistic Regression Class
 53 | 
 54 |     The logistic regression is fully described by a weight matrix :math:`W`
 55 |     and bias vector :math:`b`. Classification is done by projecting data
 56 |     points onto a set of hyperplanes, the distance to which is used to
 57 |     determine a class membership probability.
 58 |     """
 59 | 
 60 |     def __init__(self, input, n_in, n_out):
 61 |         """ Initialize the parameters of the logistic regression
 62 | 
 63 |         :type input: theano.tensor.TensorType
 64 |         :param input: symbolic variable that describes the input of the
 65 |                       architecture ( one minibatch)
 66 | 
 67 |         :type n_in: int
 68 |         :param n_in: number of input units, the dimension of the space in
 69 |                      which the datapoint lies
 70 | 
 71 |         :type n_out: int
 72 |         :param n_out: number of output units, the dimension of the space in
 73 |                       which the target lies
 74 | 
 75 |         """
 76 | 
 77 |         # initialize theta = (W,b) with 0s; W gets the shape (n_in, n_out),
 78 |         # while b is a vector of n_out elements, making theta a vector of
 79 |         # n_in*n_out + n_out elements
 80 |         self.theta = theano.shared(
 81 |             value=numpy.zeros(
 82 |                 n_in * n_out + n_out,
 83 |                 dtype=theano.config.floatX
 84 |             ),
 85 |             name='theta',
 86 |             borrow=True
 87 |         )
 88 |         # W is represented by the fisr n_in*n_out elements of theta
 89 |         self.W = self.theta[0:n_in * n_out].reshape((n_in, n_out))
 90 |         # b is the rest (last n_out elements)
 91 |         self.b = self.theta[n_in * n_out:n_in * n_out + n_out]
 92 | 
 93 |         # compute vector of class-membership probabilities in symbolic form
 94 |         self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)
 95 | 
 96 |         # compute prediction as class whose probability is maximal in
 97 |         # symbolic form
 98 |         self.y_pred = T.argmax(self.p_y_given_x, axis=1)
 99 | 
100 |         # keep track of model input
101 |         self.input = input
102 | 
103 |     def negative_log_likelihood(self, y):
104 |         """Return the negative log-likelihood of the prediction of this model
105 |         under a given target distribution.
106 | 
107 |         .. math::
108 | 
109 |             \frac{1}{|\mathcal{D}|}\mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
110 |             \frac{1}{|\mathcal{D}|}\sum_{i=0}^{|\mathcal{D}|}
111 |                 \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
112 |             \ell (\theta=\{W,b\}, \mathcal{D})
113 | 
114 |         :type y: theano.tensor.TensorType
115 |         :param y: corresponds to a vector that gives for each example the
116 |                   correct label
117 |         """
118 |         return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
119 | 
120 |     def errors(self, y):
121 |         """Return a float representing the number of errors in the minibatch
122 |         over the total number of examples of the minibatch
123 | 
124 |         :type y: theano.tensor.TensorType
125 |         :param y: corresponds to a vector that gives for each example
126 |                   the correct label
127 |         """
128 | 
129 |         # check if y has same dimension of y_pred
130 |         if y.ndim != self.y_pred.ndim:
131 |             raise TypeError(
132 |                 'y should have the same shape as self.y_pred',
133 |                 ('y', y.type, 'y_pred', self.y_pred.type)
134 |             )
135 |         # check if y is of the correct datatype
136 |         if y.dtype.startswith('int'):
137 |             # the T.neq operator returns a vector of 0s and 1s, where 1
138 |             # represents a mistake in prediction
139 |             return T.mean(T.neq(self.y_pred, y))
140 |         else:
141 |             raise NotImplementedError()
142 | 
143 | 
144 | def cg_optimization_mnist(n_epochs=50, mnist_pkl_gz='mnist.pkl.gz'):
145 |     """Demonstrate conjugate gradient optimization of a log-linear model
146 | 
147 |     This is demonstrated on MNIST.
148 | 
149 |     :type n_epochs: int
150 |     :param n_epochs: number of epochs to run the optimizer
151 | 
152 |     :type mnist_pkl_gz: string
153 |     :param mnist_pkl_gz: the path of the mnist training file from
154 |                  http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
155 | 
156 |     """
157 |     #############
158 |     # LOAD DATA #
159 |     #############
160 |     datasets = load_data(mnist_pkl_gz)
161 | 
162 |     train_set_x, train_set_y = datasets[0]
163 |     valid_set_x, valid_set_y = datasets[1]
164 |     test_set_x, test_set_y = datasets[2]
165 | 
166 |     batch_size = 600    # size of the minibatch
167 | 
168 |     n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
169 |     n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
170 |     n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size
171 | 
172 |     n_in = 28 * 28  # number of input units
173 |     n_out = 10  # number of output units
174 | 
175 |     ######################
176 |     # BUILD ACTUAL MODEL #
177 |     ######################
178 |     print '... building the model'
179 | 
180 |     # allocate symbolic variables for the data
181 |     minibatch_offset = T.lscalar()  # offset to the start of a [mini]batch
182 |     x = T.matrix()   # the data is presented as rasterized images
183 |     y = T.ivector()  # the labels are presented as 1D vector of
184 |                      # [int] labels
185 | 
186 |     # construct the logistic regression class
187 |     classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)
188 | 
189 |     # the cost we minimize during training is the negative log likelihood of
190 |     # the model in symbolic format
191 |     cost = classifier.negative_log_likelihood(y).mean()
192 | 
193 |     # compile a theano function that computes the mistakes that are made by
194 |     # the model on a minibatch
195 |     test_model = theano.function(
196 |         [minibatch_offset],
197 |         classifier.errors(y),
198 |         givens={
199 |             x: test_set_x[minibatch_offset:minibatch_offset + batch_size],
200 |             y: test_set_y[minibatch_offset:minibatch_offset + batch_size]
201 |         },
202 |         name="test"
203 |     )
204 | 
205 |     validate_model = theano.function(
206 |         [minibatch_offset],
207 |         classifier.errors(y),
208 |         givens={
209 |             x: valid_set_x[minibatch_offset: minibatch_offset + batch_size],
210 |             y: valid_set_y[minibatch_offset: minibatch_offset + batch_size]
211 |         },
212 |         name="validate"
213 |     )
214 | 
215 |     #  compile a theano function that returns the cost of a minibatch
216 |     batch_cost = theano.function(
217 |         [minibatch_offset],
218 |         cost,
219 |         givens={
220 |             x: train_set_x[minibatch_offset: minibatch_offset + batch_size],
221 |             y: train_set_y[minibatch_offset: minibatch_offset + batch_size]
222 |         },
223 |         name="batch_cost"
224 |     )
225 | 
226 |     # compile a theano function that returns the gradient of the minibatch
227 |     # with respect to theta
228 |     batch_grad = theano.function(
229 |         [minibatch_offset],
230 |         T.grad(cost, classifier.theta),
231 |         givens={
232 |             x: train_set_x[minibatch_offset: minibatch_offset + batch_size],
233 |             y: train_set_y[minibatch_offset: minibatch_offset + batch_size]
234 |         },
235 |         name="batch_grad"
236 |     )
237 | 
238 |     # creates a function that computes the average cost on the training set
239 |     def train_fn(theta_value):
240 |         classifier.theta.set_value(theta_value, borrow=True)
241 |         train_losses = [batch_cost(i * batch_size)
242 |                         for i in xrange(n_train_batches)]
243 |         return numpy.mean(train_losses)
244 | 
245 |     # creates a function that computes the average gradient of cost with
246 |     # respect to theta
247 |     def train_fn_grad(theta_value):
248 |         classifier.theta.set_value(theta_value, borrow=True)
249 |         grad = batch_grad(0)
250 |         for i in xrange(1, n_train_batches):
251 |             grad += batch_grad(i * batch_size)
252 |         return grad / n_train_batches
253 | 
254 |     validation_scores = [numpy.inf, 0]
255 | 
256 |     # creates the validation function
257 |     def callback(theta_value):
258 |         classifier.theta.set_value(theta_value, borrow=True)
259 |         #compute the validation loss
260 |         validation_losses = [validate_model(i * batch_size)
261 |                              for i in xrange(n_valid_batches)]
262 |         this_validation_loss = numpy.mean(validation_losses)
263 |         print('validation error %f %%' % (this_validation_loss * 100.,))
264 | 
265 |         # check if it is better then best validation score got until now
266 |         if this_validation_loss < validation_scores[0]:
267 |             # if so, replace the old one, and compute the score on the
268 |             # testing dataset
269 |             validation_scores[0] = this_validation_loss
270 |             test_losses = [test_model(i * batch_size)
271 |                            for i in xrange(n_test_batches)]
272 |             validation_scores[1] = numpy.mean(test_losses)
273 | 
274 |     ###############
275 |     # TRAIN MODEL #
276 |     ###############
277 | 
278 |     # using scipy conjugate gradient optimizer
279 |     import scipy.optimize
280 |     print ("Optimizing using scipy.optimize.fmin_cg...")
281 |     start_time = timeit.default_timer()
282 |     best_w_b = scipy.optimize.fmin_cg(
283 |         f=train_fn,
284 |         x0=numpy.zeros((n_in + 1) * n_out, dtype=x.dtype),
285 |         fprime=train_fn_grad,
286 |         callback=callback,
287 |         disp=0,
288 |         maxiter=n_epochs
289 |     )
290 |     end_time = timeit.default_timer()
291 |     print(
292 |         (
293 |             'Optimization complete with best validation score of %f %%, with '
294 |             'test performance %f %%'
295 |         )
296 |         % (validation_scores[0] * 100., validation_scores[1] * 100.)
297 |     )
298 | 
299 |     print >> sys.stderr, ('The code for file ' +
300 |                           os.path.split(__file__)[1] +
301 |                           ' ran for %.1fs' % ((end_time - start_time)))
302 | 
303 | 
304 | if __name__ == '__main__':
305 |     cg_optimization_mnist()
306 | 


--------------------------------------------------------------------------------
/code/rnnrbm.py:
--------------------------------------------------------------------------------
  1 | # Author: Nicolas Boulanger-Lewandowski
  2 | # University of Montreal (2012)
  3 | # RNN-RBM deep learning tutorial
  4 | # More information at http://deeplearning.net/tutorial/rnnrbm.html
  5 | 
  6 | import glob
  7 | import os
  8 | import sys
  9 | 
 10 | import numpy
 11 | try:
 12 |     import pylab
 13 | except ImportError:
 14 |     print (
 15 |         "pylab isn't available. If you use its functionality, it will crash."
 16 |     )
 17 |     print "It can be installed with 'pip install -q Pillow'"
 18 | 
 19 | from midi.utils import midiread, midiwrite
 20 | import theano
 21 | import theano.tensor as T
 22 | from theano.tensor.shared_randomstreams import RandomStreams
 23 | 
 24 | #Don't use a python long as this don't work on 32 bits computers.
 25 | numpy.random.seed(0xbeef)
 26 | rng = RandomStreams(seed=numpy.random.randint(1 << 30))
 27 | theano.config.warn.subtensor_merge_bug = False
 28 | 
 29 | 
 30 | def build_rbm(v, W, bv, bh, k):
 31 |     '''Construct a k-step Gibbs chain starting at v for an RBM.
 32 | 
 33 |     v : Theano vector or matrix
 34 |         If a matrix, multiple chains will be run in parallel (batch).
 35 |     W : Theano matrix
 36 |         Weight matrix of the RBM.
 37 |     bv : Theano vector
 38 |         Visible bias vector of the RBM.
 39 |     bh : Theano vector
 40 |         Hidden bias vector of the RBM.
 41 |     k : scalar or Theano scalar
 42 |         Length of the Gibbs chain.
 43 | 
 44 |     Return a (v_sample, cost, monitor, updates) tuple:
 45 | 
 46 |     v_sample : Theano vector or matrix with the same shape as `v`
 47 |         Corresponds to the generated sample(s).
 48 |     cost : Theano scalar
 49 |         Expression whose gradient with respect to W, bv, bh is the CD-k
 50 |         approximation to the log-likelihood of `v` (training example) under the
 51 |         RBM. The cost is averaged in the batch case.
 52 |     monitor: Theano scalar
 53 |         Pseudo log-likelihood (also averaged in the batch case).
 54 |     updates: dictionary of Theano variable -> Theano variable
 55 |         The `updates` object returned by scan.'''
 56 | 
 57 |     def gibbs_step(v):
 58 |         mean_h = T.nnet.sigmoid(T.dot(v, W) + bh)
 59 |         h = rng.binomial(size=mean_h.shape, n=1, p=mean_h,
 60 |                          dtype=theano.config.floatX)
 61 |         mean_v = T.nnet.sigmoid(T.dot(h, W.T) + bv)
 62 |         v = rng.binomial(size=mean_v.shape, n=1, p=mean_v,
 63 |                          dtype=theano.config.floatX)
 64 |         return mean_v, v
 65 | 
 66 |     chain, updates = theano.scan(lambda v: gibbs_step(v)[1], outputs_info=[v],
 67 |                                  n_steps=k)
 68 |     v_sample = chain[-1]
 69 | 
 70 |     mean_v = gibbs_step(v_sample)[0]
 71 |     monitor = T.xlogx.xlogy0(v, mean_v) + T.xlogx.xlogy0(1 - v, 1 - mean_v)
 72 |     monitor = monitor.sum() / v.shape[0]
 73 | 
 74 |     def free_energy(v):
 75 |         return -(v * bv).sum() - T.log(1 + T.exp(T.dot(v, W) + bh)).sum()
 76 |     cost = (free_energy(v) - free_energy(v_sample)) / v.shape[0]
 77 | 
 78 |     return v_sample, cost, monitor, updates
 79 | 
 80 | 
 81 | def shared_normal(num_rows, num_cols, scale=1):
 82 |     '''Initialize a matrix shared variable with normally distributed
 83 |     elements.'''
 84 |     return theano.shared(numpy.random.normal(
 85 |         scale=scale, size=(num_rows, num_cols)).astype(theano.config.floatX))
 86 | 
 87 | 
 88 | def shared_zeros(*shape):
 89 |     '''Initialize a vector shared variable with zero elements.'''
 90 |     return theano.shared(numpy.zeros(shape, dtype=theano.config.floatX))
 91 | 
 92 | 
 93 | def build_rnnrbm(n_visible, n_hidden, n_hidden_recurrent):
 94 |     '''Construct a symbolic RNN-RBM and initialize parameters.
 95 | 
 96 |     n_visible : integer
 97 |         Number of visible units.
 98 |     n_hidden : integer
 99 |         Number of hidden units of the conditional RBMs.
100 |     n_hidden_recurrent : integer
101 |         Number of hidden units of the RNN.
102 | 
103 |     Return a (v, v_sample, cost, monitor, params, updates_train, v_t,
104 |     updates_generate) tuple:
105 | 
106 |     v : Theano matrix
107 |         Symbolic variable holding an input sequence (used during training)
108 |     v_sample : Theano matrix
109 |         Symbolic variable holding the negative particles for CD log-likelihood
110 |         gradient estimation (used during training)
111 |     cost : Theano scalar
112 |         Expression whose gradient (considering v_sample constant) corresponds
113 |         to the LL gradient of the RNN-RBM (used during training)
114 |     monitor : Theano scalar
115 |         Frame-level pseudo-likelihood (useful for monitoring during training)
116 |     params : tuple of Theano shared variables
117 |         The parameters of the model to be optimized during training.
118 |     updates_train : dictionary of Theano variable -> Theano variable
119 |         Update object that should be passed to theano.function when compiling
120 |         the training function.
121 |     v_t : Theano matrix
122 |         Symbolic variable holding a generated sequence (used during sampling)
123 |     updates_generate : dictionary of Theano variable -> Theano variable
124 |         Update object that should be passed to theano.function when compiling
125 |         the generation function.'''
126 | 
127 |     W = shared_normal(n_visible, n_hidden, 0.01)
128 |     bv = shared_zeros(n_visible)
129 |     bh = shared_zeros(n_hidden)
130 |     Wuh = shared_normal(n_hidden_recurrent, n_hidden, 0.0001)
131 |     Wuv = shared_normal(n_hidden_recurrent, n_visible, 0.0001)
132 |     Wvu = shared_normal(n_visible, n_hidden_recurrent, 0.0001)
133 |     Wuu = shared_normal(n_hidden_recurrent, n_hidden_recurrent, 0.0001)
134 |     bu = shared_zeros(n_hidden_recurrent)
135 | 
136 |     params = W, bv, bh, Wuh, Wuv, Wvu, Wuu, bu  # learned parameters as shared
137 |                                                 # variables
138 | 
139 |     v = T.matrix()  # a training sequence
140 |     u0 = T.zeros((n_hidden_recurrent,))  # initial value for the RNN hidden
141 |                                          # units
142 | 
143 |     # If `v_t` is given, deterministic recurrence to compute the variable
144 |     # biases bv_t, bh_t at each time step. If `v_t` is None, same recurrence
145 |     # but with a separate Gibbs chain at each time step to sample (generate)
146 |     # from the RNN-RBM. The resulting sample v_t is returned in order to be
147 |     # passed down to the sequence history.
148 |     def recurrence(v_t, u_tm1):
149 |         bv_t = bv + T.dot(u_tm1, Wuv)
150 |         bh_t = bh + T.dot(u_tm1, Wuh)
151 |         generate = v_t is None
152 |         if generate:
153 |             v_t, _, _, updates = build_rbm(T.zeros((n_visible,)), W, bv_t,
154 |                                            bh_t, k=25)
155 |         u_t = T.tanh(bu + T.dot(v_t, Wvu) + T.dot(u_tm1, Wuu))
156 |         return ([v_t, u_t], updates) if generate else [u_t, bv_t, bh_t]
157 | 
158 |     # For training, the deterministic recurrence is used to compute all the
159 |     # {bv_t, bh_t, 1 <= t <= T} given v. Conditional RBMs can then be trained
160 |     # in batches using those parameters.
161 |     (u_t, bv_t, bh_t), updates_train = theano.scan(
162 |         lambda v_t, u_tm1, *_: recurrence(v_t, u_tm1),
163 |         sequences=v, outputs_info=[u0, None, None], non_sequences=params)
164 |     v_sample, cost, monitor, updates_rbm = build_rbm(v, W, bv_t[:], bh_t[:],
165 |                                                      k=15)
166 |     updates_train.update(updates_rbm)
167 | 
168 |     # symbolic loop for sequence generation
169 |     (v_t, u_t), updates_generate = theano.scan(
170 |         lambda u_tm1, *_: recurrence(None, u_tm1),
171 |         outputs_info=[None, u0], non_sequences=params, n_steps=200)
172 | 
173 |     return (v, v_sample, cost, monitor, params, updates_train, v_t,
174 |             updates_generate)
175 | 
176 | 
177 | class RnnRbm:
178 |     '''Simple class to train an RNN-RBM from MIDI files and to generate sample
179 |     sequences.'''
180 | 
181 |     def __init__(
182 |         self,
183 |         n_hidden=150,
184 |         n_hidden_recurrent=100,
185 |         lr=0.001,
186 |         r=(21, 109),
187 |         dt=0.3
188 |     ):
189 |         '''Constructs and compiles Theano functions for training and sequence
190 |         generation.
191 | 
192 |         n_hidden : integer
193 |             Number of hidden units of the conditional RBMs.
194 |         n_hidden_recurrent : integer
195 |             Number of hidden units of the RNN.
196 |         lr : float
197 |             Learning rate
198 |         r : (integer, integer) tuple
199 |             Specifies the pitch range of the piano-roll in MIDI note numbers,
200 |             including r[0] but not r[1], such that r[1]-r[0] is the number of
201 |             visible units of the RBM at a given time step. The default (21,
202 |             109) corresponds to the full range of piano (88 notes).
203 |         dt : float
204 |             Sampling period when converting the MIDI files into piano-rolls, or
205 |             equivalently the time difference between consecutive time steps.'''
206 | 
207 |         self.r = r
208 |         self.dt = dt
209 |         (v, v_sample, cost, monitor, params, updates_train, v_t,
210 |             updates_generate) = build_rnnrbm(
211 |                 r[1] - r[0],
212 |                 n_hidden,
213 |                 n_hidden_recurrent
214 |             )
215 | 
216 |         gradient = T.grad(cost, params, consider_constant=[v_sample])
217 |         updates_train.update(
218 |             ((p, p - lr * g) for p, g in zip(params, gradient))
219 |         )
220 |         self.train_function = theano.function(
221 |             [v],
222 |             monitor,
223 |             updates=updates_train
224 |         )
225 |         self.generate_function = theano.function(
226 |             [],
227 |             v_t,
228 |             updates=updates_generate
229 |         )
230 | 
231 |     def train(self, files, batch_size=100, num_epochs=200):
232 |         '''Train the RNN-RBM via stochastic gradient descent (SGD) using MIDI
233 |         files converted to piano-rolls.
234 | 
235 |         files : list of strings
236 |             List of MIDI files that will be loaded as piano-rolls for training.
237 |         batch_size : integer
238 |             Training sequences will be split into subsequences of at most this
239 |             size before applying the SGD updates.
240 |         num_epochs : integer
241 |             Number of epochs (pass over the training set) performed. The user
242 |             can safely interrupt training with Ctrl+C at any time.'''
243 | 
244 |         assert len(files) > 0, 'Training set is empty!' \
245 |                                ' (did you download the data files?)'
246 |         dataset = [midiread(f, self.r,
247 |                             self.dt).piano_roll.astype(theano.config.floatX)
248 |                    for f in files]
249 | 
250 |         try:
251 |             for epoch in xrange(num_epochs):
252 |                 numpy.random.shuffle(dataset)
253 |                 costs = []
254 | 
255 |                 for s, sequence in enumerate(dataset):
256 |                     for i in xrange(0, len(sequence), batch_size):
257 |                         cost = self.train_function(sequence[i:i + batch_size])
258 |                         costs.append(cost)
259 | 
260 |                 print 'Epoch %i/%i' % (epoch + 1, num_epochs),
261 |                 print numpy.mean(costs)
262 |                 sys.stdout.flush()
263 | 
264 |         except KeyboardInterrupt:
265 |             print 'Interrupted by user.'
266 | 
267 |     def generate(self, filename, show=True):
268 |         '''Generate a sample sequence, plot the resulting piano-roll and save
269 |         it as a MIDI file.
270 | 
271 |         filename : string
272 |             A MIDI file will be created at this location.
273 |         show : boolean
274 |             If True, a piano-roll of the generated sequence will be shown.'''
275 | 
276 |         piano_roll = self.generate_function()
277 |         midiwrite(filename, piano_roll, self.r, self.dt)
278 |         if show:
279 |             extent = (0, self.dt * len(piano_roll)) + self.r
280 |             pylab.figure()
281 |             pylab.imshow(piano_roll.T, origin='lower', aspect='auto',
282 |                          interpolation='nearest', cmap=pylab.cm.gray_r,
283 |                          extent=extent)
284 |             pylab.xlabel('time (s)')
285 |             pylab.ylabel('MIDI note number')
286 |             pylab.title('generated piano-roll')
287 | 
288 | 
289 | def test_rnnrbm(batch_size=100, num_epochs=200):
290 |     model = RnnRbm()
291 |     re = os.path.join(os.path.split(os.path.dirname(__file__))[0],
292 |                       'data', 'Nottingham', 'train', '*.mid')
293 |     model.train(glob.glob(re),
294 |                 batch_size=batch_size, num_epochs=num_epochs)
295 |     return model
296 | 
297 | if __name__ == '__main__':
298 |     model = test_rnnrbm()
299 |     model.generate('sample1.mid')
300 |     model.generate('sample2.mid')
301 |     pylab.show()
302 | 


--------------------------------------------------------------------------------
/code/rnnslu.py:
--------------------------------------------------------------------------------
  1 | from collections import OrderedDict
  2 | import copy
  3 | import cPickle
  4 | import gzip
  5 | import os
  6 | import urllib
  7 | import random
  8 | import stat
  9 | import subprocess
 10 | import sys
 11 | import timeit
 12 | 
 13 | import numpy
 14 | 
 15 | import theano
 16 | from theano import tensor as T
 17 | 
 18 | # Otherwise the deepcopy fails
 19 | import sys
 20 | sys.setrecursionlimit(1500)
 21 | 
 22 | PREFIX = os.getenv(
 23 |     'ATISDATA',
 24 |     os.path.join(os.path.split(os.path.abspath(os.path.dirname(__file__)))[0],
 25 |                  'data'))
 26 | 
 27 | 
 28 | # utils functions
 29 | def shuffle(lol, seed):
 30 |     '''
 31 |     lol :: list of list as input
 32 |     seed :: seed the shuffling
 33 | 
 34 |     shuffle inplace each list in the same order
 35 |     '''
 36 |     for l in lol:
 37 |         random.seed(seed)
 38 |         random.shuffle(l)
 39 | 
 40 | 
 41 | # start-snippet-1
 42 | def contextwin(l, win):
 43 |     '''
 44 |     win :: int corresponding to the size of the window
 45 |     given a list of indexes composing a sentence
 46 | 
 47 |     l :: array containing the word indexes
 48 | 
 49 |     it will return a list of list of indexes corresponding
 50 |     to context windows surrounding each word in the sentence
 51 |     '''
 52 |     assert (win % 2) == 1
 53 |     assert win >= 1
 54 |     l = list(l)
 55 | 
 56 |     lpadded = win // 2 * [-1] + l + win // 2 * [-1]
 57 |     out = [lpadded[i:(i + win)] for i in range(len(l))]
 58 | 
 59 |     assert len(out) == len(l)
 60 |     return out
 61 | # end-snippet-1
 62 | 
 63 | 
 64 | # data loading functions
 65 | def atisfold(fold):
 66 |     assert fold in range(5)
 67 |     filename = os.path.join(PREFIX, 'atis.fold'+str(fold)+'.pkl.gz')
 68 |     f = gzip.open(filename, 'rb')
 69 |     train_set, valid_set, test_set, dicts = cPickle.load(f)
 70 |     return train_set, valid_set, test_set, dicts
 71 | 
 72 | 
 73 | # metrics function using conlleval.pl
 74 | def conlleval(p, g, w, filename, script_path):
 75 |     '''
 76 |     INPUT:
 77 |     p :: predictions
 78 |     g :: groundtruth
 79 |     w :: corresponding words
 80 | 
 81 |     OUTPUT:
 82 |     filename :: name of the file where the predictions
 83 |     are written. it will be the input of conlleval.pl script
 84 |     for computing the performance in terms of precision
 85 |     recall and f1 score
 86 | 
 87 |     OTHER:
 88 |     script_path :: path to the directory containing the
 89 |     conlleval.pl script
 90 |     '''
 91 |     out = ''
 92 |     for sl, sp, sw in zip(g, p, w):
 93 |         out += 'BOS O O\n'
 94 |         for wl, wp, w in zip(sl, sp, sw):
 95 |             out += w + ' ' + wl + ' ' + wp + '\n'
 96 |         out += 'EOS O O\n\n'
 97 | 
 98 |     f = open(filename, 'w')
 99 |     f.writelines(out)
100 |     f.close()
101 | 
102 |     return get_perf(filename, script_path)
103 | 
104 | 
105 | def download(origin, destination):
106 |     '''
107 |     download the corresponding atis file
108 |     from http://www-etud.iro.umontreal.ca/~mesnilgr/atis/
109 |     '''
110 |     print 'Downloading data from %s' % origin
111 |     urllib.urlretrieve(origin, destination)
112 | 
113 | 
114 | def get_perf(filename, folder):
115 |     ''' run conlleval.pl perl script to obtain
116 |     precision/recall and F1 score '''
117 |     _conlleval = os.path.join(folder, 'conlleval.pl')
118 |     if not os.path.isfile(_conlleval):
119 |         url = 'http://www-etud.iro.umontreal.ca/~mesnilgr/atis/conlleval.pl'
120 |         download(url, _conlleval)
121 |         os.chmod(_conlleval, stat.S_IRWXU)  # give the execute permissions
122 | 
123 |     proc = subprocess.Popen(["perl",
124 |                             _conlleval],
125 |                             stdin=subprocess.PIPE,
126 |                             stdout=subprocess.PIPE)
127 | 
128 |     stdout, _ = proc.communicate(''.join(open(filename).readlines()))
129 |     for line in stdout.split('\n'):
130 |         if 'accuracy' in line:
131 |             out = line.split()
132 |             break
133 | 
134 |     precision = float(out[6][:-2])
135 |     recall = float(out[8][:-2])
136 |     f1score = float(out[10])
137 | 
138 |     return {'p': precision, 'r': recall, 'f1': f1score}
139 | 
140 | 
141 | # start-snippet-2
142 | class RNNSLU(object):
143 |     ''' elman neural net model '''
144 |     def __init__(self, nh, nc, ne, de, cs):
145 |         '''
146 |         nh :: dimension of the hidden layer
147 |         nc :: number of classes
148 |         ne :: number of word embeddings in the vocabulary
149 |         de :: dimension of the word embeddings
150 |         cs :: word window context size
151 |         '''
152 |         # parameters of the model
153 |         self.emb = theano.shared(name='embeddings',
154 |                                  value=0.2 * numpy.random.uniform(-1.0, 1.0,
155 |                                  (ne+1, de))
156 |                                  # add one for padding at the end
157 |                                  .astype(theano.config.floatX))
158 |         self.wx = theano.shared(name='wx',
159 |                                 value=0.2 * numpy.random.uniform(-1.0, 1.0,
160 |                                 (de * cs, nh))
161 |                                 .astype(theano.config.floatX))
162 |         self.wh = theano.shared(name='wh',
163 |                                 value=0.2 * numpy.random.uniform(-1.0, 1.0,
164 |                                 (nh, nh))
165 |                                 .astype(theano.config.floatX))
166 |         self.w = theano.shared(name='w',
167 |                                value=0.2 * numpy.random.uniform(-1.0, 1.0,
168 |                                (nh, nc))
169 |                                .astype(theano.config.floatX))
170 |         self.bh = theano.shared(name='bh',
171 |                                 value=numpy.zeros(nh,
172 |                                 dtype=theano.config.floatX))
173 |         self.b = theano.shared(name='b',
174 |                                value=numpy.zeros(nc,
175 |                                dtype=theano.config.floatX))
176 |         self.h0 = theano.shared(name='h0',
177 |                                 value=numpy.zeros(nh,
178 |                                 dtype=theano.config.floatX))
179 | 
180 |         # bundle
181 |         self.params = [self.emb, self.wx, self.wh, self.w,
182 |                        self.bh, self.b, self.h0]
183 |         # end-snippet-2
184 |         # as many columns as context window size
185 |         # as many lines as words in the sentence
186 |         # start-snippet-3
187 |         idxs = T.imatrix()
188 |         x = self.emb[idxs].reshape((idxs.shape[0], de*cs))
189 |         y_sentence = T.ivector('y_sentence')  # labels
190 |         # end-snippet-3 start-snippet-4
191 | 
192 |         def recurrence(x_t, h_tm1):
193 |             h_t = T.nnet.sigmoid(T.dot(x_t, self.wx)
194 |                                  + T.dot(h_tm1, self.wh) + self.bh)
195 |             s_t = T.nnet.softmax(T.dot(h_t, self.w) + self.b)
196 |             return [h_t, s_t]
197 | 
198 |         [h, s], _ = theano.scan(fn=recurrence,
199 |                                 sequences=x,
200 |                                 outputs_info=[self.h0, None],
201 |                                 n_steps=x.shape[0])
202 | 
203 |         p_y_given_x_sentence = s[:, 0, :]
204 |         y_pred = T.argmax(p_y_given_x_sentence, axis=1)
205 |         # end-snippet-4
206 | 
207 |         # cost and gradients and learning rate
208 |         # start-snippet-5
209 |         lr = T.scalar('lr')
210 | 
211 |         sentence_nll = -T.mean(T.log(p_y_given_x_sentence)
212 |                                [T.arange(x.shape[0]), y_sentence])
213 |         sentence_gradients = T.grad(sentence_nll, self.params)
214 |         sentence_updates = OrderedDict((p, p - lr*g)
215 |                                        for p, g in
216 |                                        zip(self.params, sentence_gradients))
217 |         # end-snippet-5
218 | 
219 |         # theano functions to compile
220 |         # start-snippet-6
221 |         self.classify = theano.function(inputs=[idxs], outputs=y_pred)
222 |         self.sentence_train = theano.function(inputs=[idxs, y_sentence, lr],
223 |                                               outputs=sentence_nll,
224 |                                               updates=sentence_updates)
225 |         # end-snippet-6 start-snippet-7
226 |         self.normalize = theano.function(inputs=[],
227 |                                          updates={self.emb:
228 |                                                   self.emb /
229 |                                                   T.sqrt((self.emb**2)
230 |                                                   .sum(axis=1))
231 |                                                   .dimshuffle(0, 'x')})
232 |         # end-snippet-7
233 | 
234 |     def train(self, x, y, window_size, learning_rate):
235 | 
236 |         cwords = contextwin(x, window_size)
237 |         words = map(lambda x: numpy.asarray(x).astype('int32'), cwords)
238 |         labels = y
239 | 
240 |         self.sentence_train(words, labels, learning_rate)
241 |         self.normalize()
242 | 
243 |     def save(self, folder):
244 |         for param in self.params:
245 |             numpy.save(os.path.join(folder,
246 |                        param.name + '.npy'), param.get_value())
247 | 
248 |     def load(self, folder):
249 |         for param in self.params:
250 |             param.set_value(numpy.load(os.path.join(folder,
251 |                             param.name + '.npy')))
252 | 
253 | 
254 | def main(param=None):
255 |     if not param:
256 |         param = {
257 |             'fold': 3,
258 |             # 5 folds 0,1,2,3,4
259 |             'data': 'atis',
260 |             'lr': 0.0970806646812754,
261 |             'verbose': 1,
262 |             'decay': True,
263 |             # decay on the learning rate if improvement stops
264 |             'win': 7,
265 |             # number of words in the context window
266 |             'nhidden': 200,
267 |             # number of hidden units
268 |             'seed': 345,
269 |             'emb_dimension': 50,
270 |             # dimension of word embedding
271 |             'nepochs': 60,
272 |             # 60 is recommended
273 |             'savemodel': False}
274 |     print param
275 | 
276 |     folder_name = os.path.basename(__file__).split('.')[0]
277 |     folder = os.path.join(os.path.dirname(__file__), folder_name)
278 |     if not os.path.exists(folder):
279 |         os.mkdir(folder)
280 | 
281 |     # load the dataset
282 |     train_set, valid_set, test_set, dic = atisfold(param['fold'])
283 | 
284 |     idx2label = dict((k, v) for v, k in dic['labels2idx'].iteritems())
285 |     idx2word = dict((k, v) for v, k in dic['words2idx'].iteritems())
286 | 
287 |     train_lex, train_ne, train_y = train_set
288 |     valid_lex, valid_ne, valid_y = valid_set
289 |     test_lex, test_ne, test_y = test_set
290 | 
291 |     vocsize = len(set(reduce(lambda x, y: list(x) + list(y),
292 |                              train_lex + valid_lex + test_lex)))
293 |     nclasses = len(set(reduce(lambda x, y: list(x)+list(y),
294 |                               train_y + test_y + valid_y)))
295 |     nsentences = len(train_lex)
296 | 
297 |     groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
298 |     words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]
299 |     groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
300 |     words_test = [map(lambda x: idx2word[x], w) for w in test_lex]
301 | 
302 |     # instanciate the model
303 |     numpy.random.seed(param['seed'])
304 |     random.seed(param['seed'])
305 | 
306 |     rnn = RNNSLU(nh=param['nhidden'],
307 |                  nc=nclasses,
308 |                  ne=vocsize,
309 |                  de=param['emb_dimension'],
310 |                  cs=param['win'])
311 | 
312 |     # train with early stopping on validation set
313 |     best_f1 = -numpy.inf
314 |     param['clr'] = param['lr']
315 |     for e in xrange(param['nepochs']):
316 | 
317 |         # shuffle
318 |         shuffle([train_lex, train_ne, train_y], param['seed'])
319 | 
320 |         param['ce'] = e
321 |         tic = timeit.default_timer()
322 | 
323 |         for i, (x, y) in enumerate(zip(train_lex, train_y)):
324 |             rnn.train(x, y, param['win'], param['clr'])
325 |             print '[learning] epoch %i >> %2.2f%%' % (
326 |                 e, (i + 1) * 100. / nsentences),
327 |             print 'completed in %.2f (sec) <<\r' % (timeit.default_timer() - tic),
328 |             sys.stdout.flush()
329 | 
330 |         # evaluation // back into the real world : idx -> words
331 |         predictions_test = [map(lambda x: idx2label[x],
332 |                             rnn.classify(numpy.asarray(
333 |                             contextwin(x, param['win'])).astype('int32')))
334 |                             for x in test_lex]
335 |         predictions_valid = [map(lambda x: idx2label[x],
336 |                              rnn.classify(numpy.asarray(
337 |                              contextwin(x, param['win'])).astype('int32')))
338 |                              for x in valid_lex]
339 | 
340 |         # evaluation // compute the accuracy using conlleval.pl
341 |         res_test = conlleval(predictions_test,
342 |                              groundtruth_test,
343 |                              words_test,
344 |                              folder + '/current.test.txt',
345 |                              folder)
346 |         res_valid = conlleval(predictions_valid,
347 |                               groundtruth_valid,
348 |                               words_valid,
349 |                               folder + '/current.valid.txt',
350 |                               folder)
351 | 
352 |         if res_valid['f1'] > best_f1:
353 | 
354 |             if param['savemodel']:
355 |                 rnn.save(folder)
356 | 
357 |             best_rnn = copy.deepcopy(rnn)
358 |             best_f1 = res_valid['f1']
359 | 
360 |             if param['verbose']:
361 |                 print('NEW BEST: epoch', e,
362 |                       'valid F1', res_valid['f1'],
363 |                       'best test F1', res_test['f1'])
364 | 
365 |             param['vf1'], param['tf1'] = res_valid['f1'], res_test['f1']
366 |             param['vp'], param['tp'] = res_valid['p'], res_test['p']
367 |             param['vr'], param['tr'] = res_valid['r'], res_test['r']
368 |             param['be'] = e
369 | 
370 |             subprocess.call(['mv', folder + '/current.test.txt',
371 |                             folder + '/best.test.txt'])
372 |             subprocess.call(['mv', folder + '/current.valid.txt',
373 |                             folder + '/best.valid.txt'])
374 |         else:
375 |             if param['verbose']:
376 |                 print ''
377 | 
378 |         # learning rate decay if no improvement in 10 epochs
379 |         if param['decay'] and abs(param['be']-param['ce']) >= 10:
380 |             param['clr'] *= 0.5
381 |             rnn = best_rnn
382 | 
383 |         if param['clr'] < 1e-5:
384 |             break
385 | 
386 |     print('BEST RESULT: epoch', param['be'],
387 |           'valid F1', param['vf1'],
388 |           'best test F1', param['tf1'],
389 |           'with the model', folder)
390 | 
391 | 
392 | if __name__ == '__main__':
393 |     main()
394 | 


--------------------------------------------------------------------------------
/code/test.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | 
  3 | import numpy
  4 | 
  5 | import convolutional_mlp
  6 | import dA
  7 | import DBN
  8 | import logistic_cg
  9 | import logistic_sgd
 10 | import mlp
 11 | import rbm
 12 | import rnnrbm
 13 | import SdA
 14 | import rnnslu
 15 | import lstm
 16 | 
 17 | 
 18 | def test_rnnslu():
 19 |     rnnslu.main()
 20 | 
 21 | 
 22 | def test_logistic_sgd():
 23 |     logistic_sgd.sgd_optimization_mnist(n_epochs=10)
 24 | 
 25 | 
 26 | def test_logistic_cg():
 27 |     try:
 28 |         import scipy
 29 |         logistic_cg.cg_optimization_mnist(n_epochs=10)
 30 |     except ImportError:
 31 |         from nose.plugins.skip import SkipTest
 32 |         raise SkipTest(
 33 |             'SciPy not available. Needed for the logistic_cg example.')
 34 | 
 35 | 
 36 | def test_mlp():
 37 |     mlp.test_mlp(n_epochs=1)
 38 | 
 39 | 
 40 | def test_convolutional_mlp():
 41 |     convolutional_mlp.evaluate_lenet5(n_epochs=1, nkerns=[5, 5])
 42 | 
 43 | 
 44 | def test_dA():
 45 |     dA.test_dA(training_epochs=1, output_folder='tmp_dA_plots')
 46 | 
 47 | 
 48 | def test_SdA():
 49 |     SdA.test_SdA(pretraining_epochs=1, training_epochs=1, batch_size=300)
 50 | 
 51 | 
 52 | def test_dbn():
 53 |     DBN.test_DBN(pretraining_epochs=1, training_epochs=1, batch_size=300)
 54 | 
 55 | 
 56 | def test_rbm():
 57 |     rbm.test_rbm(training_epochs=1, batch_size=300, n_chains=1, n_samples=1,
 58 |                  n_hidden=20, output_folder='tmp_rbm_plots')
 59 | 
 60 | 
 61 | def test_rnnrbm():
 62 |     rnnrbm.test_rnnrbm(num_epochs=1)
 63 | 
 64 | 
 65 | def test_lstm():
 66 |     lstm.train_lstm(max_epochs=1, test_size=1000, saveto='')
 67 | 
 68 | 
 69 | def speed():
 70 |     """
 71 |     This fonction modify the configuration theano and don't restore it!
 72 |     """
 73 | 
 74 |     algo = ['logistic_sgd', 'logistic_cg', 'mlp', 'convolutional_mlp',
 75 |             'dA', 'SdA', 'DBN', 'rbm', 'rnnrbm', 'rnnslu', 'lstm']
 76 |     to_exec = [True] * len(algo)
 77 | #    to_exec = [False] * len(algo)
 78 | #    to_exec[-1] = True
 79 |     do_float64 = True
 80 |     do_float32 = True
 81 |     do_gpu = True
 82 | 
 83 |     algo_executed = [s for idx, s in enumerate(algo) if to_exec[idx]]
 84 |     #Timming expected are from the buildbot that have an i7-920 @
 85 |     # 2.67GHz with hyperthread enabled for the cpu, 12G of ram. An GeForce GTX
 86 |     # 580 for the GPU. OS=Fedora 14, gcc=4.5.1, python/BLAS from EPD
 87 |     # 7.1-2 (python 2.7.2, mkl unknow). BLAS with only 1 thread.
 88 | 
 89 |     expected_times_64 = numpy.asarray([9.8, 22.0, 76.1, 73.7, 116.4,
 90 |                                        346.9, 355.0, 558.1, 130.4, 50.8, 113.6])
 91 |     expected_times_32 = numpy.asarray([8.1, 17.9, 42.5, 66.5, 71,
 92 |                                        191.2, 199.0, 432.8, 119.5, 36.9, 78.0])
 93 | 
 94 |     # Number with just 1 decimal are new value that are faster with
 95 |     # the Theano version 0.5rc2 Other number are older. They are not
 96 |     # updated, as we where faster in the past!
 97 |     # TODO: find why and fix this!
 98 | 
 99 | # Here is the value for the buildbot on February 3th 2012 with a GTX 285
100 | #              sgd,         cg           mlp          conv        da
101 | #              sda          dbn          rbm
102 | #    gpu times[3.72957802,  9.94316864,  29.1772666,  9.13857198, 25.91144657,
103 | #              18.30802011, 53.38651466, 285.41386175]
104 | #    expected [3.076634879, 7.555234910, 18.99226785, 9.58915591, 24.130070450,
105 | #              24.77524018, 92.66246653, 322.340329170]
106 | #              sgd,         cg           mlp          conv        da
107 | #              sda          dbn          rbm
108 | #expected/get [0.82492841,  0.75984178,  0.65092691,  1.04930573, 0.93125138
109 | #              1.35324519 1.7356905   1.12937868]
110 | 
111 |     expected_times_gpu = numpy.asarray([3.0, 7.55523491, 18.99226785,
112 |                                         5.8, 20.0,
113 |                                         11.8, 18.2, 280.1, 132.8, 38.8, 10.5])
114 |     expected_times_64 = [s for idx, s in enumerate(expected_times_64)
115 |                          if to_exec[idx]]
116 |     expected_times_32 = [s for idx, s in enumerate(expected_times_32)
117 |                          if to_exec[idx]]
118 |     expected_times_gpu = [s for idx, s in enumerate(expected_times_gpu)
119 |                           if to_exec[idx]]
120 | 
121 |     def time_test(m, l, idx, f, **kwargs):
122 |         if not to_exec[idx]:
123 |             return
124 |         print algo[idx]
125 |         ts = m.call_time
126 |         try:
127 |             f(**kwargs)
128 |         except Exception, e:
129 |             print >> sys.stderr, 'test', algo[idx], 'FAILED', e
130 |             l.append(numpy.nan)
131 |             return
132 |         te = m.call_time
133 |         l.append(te - ts)
134 | 
135 |     def do_tests():
136 |         m = theano.compile.mode.get_default_mode()
137 |         l = []
138 |         time_test(m, l, 0, logistic_sgd.sgd_optimization_mnist, n_epochs=30)
139 |         time_test(m, l, 1, logistic_cg.cg_optimization_mnist, n_epochs=30)
140 |         time_test(m, l, 2, mlp.test_mlp, n_epochs=5)
141 |         time_test(m, l, 3, convolutional_mlp.evaluate_lenet5, n_epochs=5,
142 |                   nkerns=[5, 5])
143 |         time_test(m, l, 4, dA.test_dA, training_epochs=2,
144 |                   output_folder='tmp_dA_plots')
145 |         time_test(m, l, 5, SdA.test_SdA, pretraining_epochs=1,
146 |                   training_epochs=2, batch_size=300)
147 |         time_test(m, l, 6, DBN.test_DBN, pretraining_epochs=1,
148 |                   training_epochs=2, batch_size=300)
149 |         time_test(m, l, 7, rbm.test_rbm, training_epochs=1, batch_size=300,
150 |                   n_chains=1, n_samples=1, output_folder='tmp_rbm_plots')
151 |         time_test(m, l, 8, rnnrbm.test_rnnrbm, num_epochs=1)
152 |         s = {'fold': 3,
153 |              # 5 folds 0,1,2,3,4
154 |              'data': 'atis',
155 |              'lr': 0.0970806646812754,
156 |              'verbose': 1,
157 |              'decay': True,
158 |              # decay on the learning rate if improvement stops
159 |              'win': 7,
160 |              # number of words in the context window
161 |              'nhidden': 200,
162 |              # number of hidden units
163 |              'seed': 345,
164 |              'emb_dimension': 50,
165 |              # dimension of word embedding
166 |              'nepochs': 1,
167 |              # 60 is recommended
168 |              'savemodel': False}
169 |         time_test(m, l, 9, rnnslu.main, param=s)
170 |         time_test(m, l, 10, lstm.train_lstm, max_epochs=1, test_size=1000,
171 |                   saveto='')
172 |         return numpy.asarray(l)
173 | 
174 |     #test in float64 in FAST_RUN mode on the cpu
175 |     import theano
176 |     if do_float64:
177 |         theano.config.floatX = 'float64'
178 |         theano.config.mode = 'FAST_RUN'
179 |         float64_times = do_tests()
180 |         print >> sys.stderr, algo_executed
181 |         print >> sys.stderr, 'float64 times', float64_times
182 |         print >> sys.stderr, 'float64 expected', expected_times_64
183 |         print >> sys.stderr, 'float64 % expected/get', (
184 |             expected_times_64 / float64_times)
185 | 
186 |     #test in float32 in FAST_RUN mode on the cpu
187 |     theano.config.floatX = 'float32'
188 |     if do_float32:
189 |         float32_times = do_tests()
190 |         print >> sys.stderr, algo_executed
191 |         print >> sys.stderr, 'float32 times', float32_times
192 |         print >> sys.stderr, 'float32 expected', expected_times_32
193 |         print >> sys.stderr, 'float32 % expected/get', (
194 |             expected_times_32 / float32_times)
195 | 
196 |         if do_float64:
197 |             print >> sys.stderr, 'float64/float32', (
198 |                 float64_times / float32_times)
199 |             print >> sys.stderr
200 |             print >> sys.stderr, ('Duplicate the timing to have everything '
201 |                                   'in one place')
202 |             print >> sys.stderr, algo_executed
203 |             print >> sys.stderr, 'float64 times', float64_times
204 |             print >> sys.stderr, 'float64 expected', expected_times_64
205 |             print >> sys.stderr, 'float64 % expected/get', (
206 |                 expected_times_64 / float64_times)
207 |             print >> sys.stderr, 'float32 times', float32_times
208 |             print >> sys.stderr, 'float32 expected', expected_times_32
209 |             print >> sys.stderr, 'float32 % expected/get', (
210 |                 expected_times_32 / float32_times)
211 | 
212 |             print >> sys.stderr, 'float64/float32', (
213 |                 float64_times / float32_times)
214 |             print >> sys.stderr, 'expected float64/float32', (
215 |                 expected_times_64 / float32_times)
216 | 
217 |     #test in float32 in FAST_RUN mode on the gpu
218 |     import theano.sandbox.cuda
219 |     if do_gpu:
220 |         theano.sandbox.cuda.use('gpu')
221 |         gpu_times = do_tests()
222 |         print >> sys.stderr, algo_executed
223 |         print >> sys.stderr, 'gpu times', gpu_times
224 |         print >> sys.stderr, 'gpu expected', expected_times_gpu
225 |         print >> sys.stderr, 'gpu % expected/get', (
226 |             expected_times_gpu / gpu_times)
227 | 
228 |         if do_float64:
229 |             print >> sys.stderr, 'float64/gpu', float64_times / gpu_times
230 | 
231 |         if (do_float64 + do_float32 + do_gpu) > 1:
232 |             print >> sys.stderr
233 |             print >> sys.stderr, ('Duplicate the timing to have everything '
234 |                                   'in one place')
235 |             print >> sys.stderr, algo_executed
236 |             if do_float64:
237 |                 print >> sys.stderr, 'float64 times', float64_times
238 |                 print >> sys.stderr, 'float64 expected', expected_times_64
239 |                 print >> sys.stderr, 'float64 % expected/get', (
240 |                     expected_times_64 / float64_times)
241 |             if do_float32:
242 |                 print >> sys.stderr, 'float32 times', float32_times
243 |                 print >> sys.stderr, 'float32 expected', expected_times_32
244 |                 print >> sys.stderr, 'float32 % expected/get', (
245 |                     expected_times_32 / float32_times)
246 |             if do_gpu:
247 |                 print >> sys.stderr, 'gpu times', gpu_times
248 |                 print >> sys.stderr, 'gpu expected', expected_times_gpu
249 |                 print >> sys.stderr, 'gpu % expected/get', (
250 |                     expected_times_gpu / gpu_times)
251 | 
252 |             print
253 |             if do_float64 and do_float32:
254 |                 print >> sys.stderr, 'float64/float32', (
255 |                     float64_times / float32_times)
256 |                 print >> sys.stderr, 'expected float64/float32', (
257 |                     expected_times_64 / float32_times)
258 |             if do_float64 and do_gpu:
259 |                 print >> sys.stderr, 'float64/gpu', float64_times / gpu_times
260 |                 print >> sys.stderr, 'expected float64/gpu', (
261 |                     expected_times_64 / gpu_times)
262 |             if do_float32 and do_gpu:
263 |                 print >> sys.stderr, 'float32/gpu', float32_times / gpu_times
264 |                 print >> sys.stderr, 'expected float32/gpu', (
265 |                     expected_times_32 / gpu_times)
266 | 
267 |     def compare(x, y):
268 |         ratio = x / y
269 |         # If there is more then 5% difference between the expected
270 |         # time and the real time, we consider this an error.
271 |         return sum((ratio < 0.95) + (ratio > 1.05))
272 | 
273 |     print
274 |     if do_float64:
275 |         err = compare(expected_times_64, float64_times)
276 |         print >> sys.stderr, 'speed_failure_float64=' + str(err)
277 |     if do_float32:
278 |         err = compare(expected_times_32, float32_times)
279 |         print >> sys.stderr, 'speed_failure_float32=' + str(err)
280 |     if do_gpu:
281 |         err = compare(expected_times_gpu, gpu_times)
282 |         print >> sys.stderr, 'speed_failure_gpu=' + str(err)
283 | 
284 |         assert not numpy.isnan(gpu_times).any()
285 | 


--------------------------------------------------------------------------------
/code/utils.py:
--------------------------------------------------------------------------------
  1 | """ This file contains different utility functions that are not connected
  2 | in anyway to the networks presented in the tutorials, but rather help in
  3 | processing the outputs into a more understandable way.
  4 | 
  5 | For example ``tile_raster_images`` helps in generating a easy to grasp
  6 | image from a set of samples or weights.
  7 | """
  8 | 
  9 | 
 10 | import numpy
 11 | 
 12 | 
 13 | def scale_to_unit_interval(ndar, eps=1e-8):
 14 |     """ Scales all values in the ndarray ndar to be between 0 and 1 """
 15 |     ndar = ndar.copy()
 16 |     ndar -= ndar.min()
 17 |     ndar *= 1.0 / (ndar.max() + eps)
 18 |     return ndar
 19 | 
 20 | 
 21 | def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0),
 22 |                        scale_rows_to_unit_interval=True,
 23 |                        output_pixel_vals=True):
 24 |     """
 25 |     Transform an array with one flattened image per row, into an array in
 26 |     which images are reshaped and layed out like tiles on a floor.
 27 | 
 28 |     This function is useful for visualizing datasets whose rows are images,
 29 |     and also columns of matrices for transforming those rows
 30 |     (such as the first layer of a neural net).
 31 | 
 32 |     :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can
 33 |     be 2-D ndarrays or None;
 34 |     :param X: a 2-D array in which every row is a flattened image.
 35 | 
 36 |     :type img_shape: tuple; (height, width)
 37 |     :param img_shape: the original shape of each image
 38 | 
 39 |     :type tile_shape: tuple; (rows, cols)
 40 |     :param tile_shape: the number of images to tile (rows, cols)
 41 | 
 42 |     :param output_pixel_vals: if output should be pixel values (i.e. int8
 43 |     values) or floats
 44 | 
 45 |     :param scale_rows_to_unit_interval: if the values need to be scaled before
 46 |     being plotted to [0,1] or not
 47 | 
 48 | 
 49 |     :returns: array suitable for viewing as an image.
 50 |     (See:`Image.fromarray`.)
 51 |     :rtype: a 2-d array with same dtype as X.
 52 | 
 53 |     """
 54 | 
 55 |     assert len(img_shape) == 2
 56 |     assert len(tile_shape) == 2
 57 |     assert len(tile_spacing) == 2
 58 | 
 59 |     # The expression below can be re-written in a more C style as
 60 |     # follows :
 61 |     #
 62 |     # out_shape    = [0,0]
 63 |     # out_shape[0] = (img_shape[0]+tile_spacing[0])*tile_shape[0] -
 64 |     #                tile_spacing[0]
 65 |     # out_shape[1] = (img_shape[1]+tile_spacing[1])*tile_shape[1] -
 66 |     #                tile_spacing[1]
 67 |     out_shape = [
 68 |         (ishp + tsp) * tshp - tsp
 69 |         for ishp, tshp, tsp in zip(img_shape, tile_shape, tile_spacing)
 70 |     ]
 71 | 
 72 |     if isinstance(X, tuple):
 73 |         assert len(X) == 4
 74 |         # Create an output numpy ndarray to store the image
 75 |         if output_pixel_vals:
 76 |             out_array = numpy.zeros((out_shape[0], out_shape[1], 4),
 77 |                                     dtype='uint8')
 78 |         else:
 79 |             out_array = numpy.zeros((out_shape[0], out_shape[1], 4),
 80 |                                     dtype=X.dtype)
 81 | 
 82 |         #colors default to 0, alpha defaults to 1 (opaque)
 83 |         if output_pixel_vals:
 84 |             channel_defaults = [0, 0, 0, 255]
 85 |         else:
 86 |             channel_defaults = [0., 0., 0., 1.]
 87 | 
 88 |         for i in xrange(4):
 89 |             if X[i] is None:
 90 |                 # if channel is None, fill it with zeros of the correct
 91 |                 # dtype
 92 |                 dt = out_array.dtype
 93 |                 if output_pixel_vals:
 94 |                     dt = 'uint8'
 95 |                 out_array[:, :, i] = numpy.zeros(
 96 |                     out_shape,
 97 |                     dtype=dt
 98 |                 ) + channel_defaults[i]
 99 |             else:
100 |                 # use a recurrent call to compute the channel and store it
101 |                 # in the output
102 |                 out_array[:, :, i] = tile_raster_images(
103 |                     X[i], img_shape, tile_shape, tile_spacing,
104 |                     scale_rows_to_unit_interval, output_pixel_vals)
105 |         return out_array
106 | 
107 |     else:
108 |         # if we are dealing with only one channel
109 |         H, W = img_shape
110 |         Hs, Ws = tile_spacing
111 | 
112 |         # generate a matrix to store the output
113 |         dt = X.dtype
114 |         if output_pixel_vals:
115 |             dt = 'uint8'
116 |         out_array = numpy.zeros(out_shape, dtype=dt)
117 | 
118 |         for tile_row in xrange(tile_shape[0]):
119 |             for tile_col in xrange(tile_shape[1]):
120 |                 if tile_row * tile_shape[1] + tile_col < X.shape[0]:
121 |                     this_x = X[tile_row * tile_shape[1] + tile_col]
122 |                     if scale_rows_to_unit_interval:
123 |                         # if we should scale values to be between 0 and 1
124 |                         # do this by calling the `scale_to_unit_interval`
125 |                         # function
126 |                         this_img = scale_to_unit_interval(
127 |                             this_x.reshape(img_shape))
128 |                     else:
129 |                         this_img = this_x.reshape(img_shape)
130 |                     # add the slice to the corresponding position in the
131 |                     # output array
132 |                     c = 1
133 |                     if output_pixel_vals:
134 |                         c = 255
135 |                     out_array[
136 |                         tile_row * (H + Hs): tile_row * (H + Hs) + H,
137 |                         tile_col * (W + Ws): tile_col * (W + Ws) + W
138 |                     ] = this_img * c
139 |         return out_array
140 | 


--------------------------------------------------------------------------------
/data/download.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | which wget >/dev/null 2>&1
 4 | WGET=$?
 5 | which curl >/dev/null 2>&1
 6 | CURL=$?
 7 | if [ "$WGET" -eq 0 ]; then
 8 |     DL_CMD="wget -c"
 9 | elif [ "$CURL" -eq 0 ]; then
10 |     DL_CMD="curl -C - -O"
11 | else
12 |     echo "You need wget or curl installed to download"
13 |     exit 1
14 | fi
15 | 
16 | $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
17 | $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist_py3k.pkl.gz
18 | $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl.gz && gunzip imdb.pkl.gz
19 | $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/imdb.dict.pkl.gz && gunzip imdb.dict.pkl.gz
20 | $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/Nottingham.zip && unzip -u Nottingham.zip
21 | $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/midi.zip && unzip -u midi.zip -d ../code && echo "extracted Modified Python MIDI package (GPL)"
22 | $DL_CMD http://www-etud.iro.umontreal.ca/~mesnilgr/atis/atis.fold0.pkl.gz
23 | $DL_CMD http://www-etud.iro.umontreal.ca/~mesnilgr/atis/atis.fold1.pkl.gz
24 | $DL_CMD http://www-etud.iro.umontreal.ca/~mesnilgr/atis/atis.fold2.pkl.gz
25 | $DL_CMD http://www-etud.iro.umontreal.ca/~mesnilgr/atis/atis.fold3.pkl.gz
26 | $DL_CMD http://www-etud.iro.umontreal.ca/~mesnilgr/atis/atis.fold4.pkl.gz
27 | 


--------------------------------------------------------------------------------
/data/training_colorpatches_16x16_demo.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/data/training_colorpatches_16x16_demo.mat


--------------------------------------------------------------------------------
/doc/.templates/layout.html:
--------------------------------------------------------------------------------
 1 | {% extends "!layout.html" %}
 2 | 
 3 | {%- block extrahead %}
 4 | {{ super() }}
 5 | <script type="text/javascript">
 6 |   var _gaq = _gaq || [];
 7 |   _gaq.push(['_setAccount', 'UA-168290-9']);
 8 |   _gaq.push(['_trackPageview']);
 9 | </script>
10 | {% endblock %}
11 | 
12 | {% block footer %}
13 | {{ super() }}
14 | <script type="text/javascript">
15 |   (function() {
16 |     var ga = document.createElement('script');
17 |     ga.src = ('https:' == document.location.protocol ?
18 |               'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
19 |     ga.setAttribute('async', 'true');
20 |     document.documentElement.firstChild.appendChild(ga);
21 |   })();
22 | </script>
23 | {% endblock %}
24 | 
25 | 


--------------------------------------------------------------------------------
/doc/DBN.txt:
--------------------------------------------------------------------------------
  1 | .. _DBN:
  2 | 
  3 | Deep Belief Networks
  4 | ====================
  5 | 
  6 | .. note::
  7 |   This section assumes the reader has already read through :doc:`logreg`
  8 |   and :doc:`mlp` and :doc:`rbm`. Additionally it uses the following Theano
  9 |   functions and concepts : `T.tanh`_, `shared variables`_, `basic arithmetic
 10 |   ops`_, `T.grad`_, `Random numbers`_, `floatX`_. If you intend to run the
 11 |   code on GPU also read `GPU`_.
 12 | 
 13 | .. _T.tanh: http://deeplearning.net/software/theano/tutorial/examples.html?highlight=tanh
 14 | 
 15 | .. _shared variables: http://deeplearning.net/software/theano/tutorial/examples.html#using-shared-variables
 16 | 
 17 | .. _basic arithmetic ops: http://deeplearning.net/software/theano/tutorial/adding.html#adding-two-scalars
 18 | 
 19 | .. _T.grad: http://deeplearning.net/software/theano/tutorial/examples.html#computing-gradients
 20 | 
 21 | .. _floatX: http://deeplearning.net/software/theano/library/config.html#config.floatX
 22 | 
 23 | .. _GPU: http://deeplearning.net/software/theano/tutorial/using_gpu.html 
 24 | 
 25 | .. _Random numbers: http://deeplearning.net/software/theano/tutorial/examples.html#using-random-numbers
 26 | 
 27 | 
 28 | .. note::
 29 |     The code for this section is available for download `here`_.
 30 | 
 31 | .. _here: http://deeplearning.net/tutorial/code/DBN.py
 32 | 
 33 | 
 34 | Deep Belief Networks
 35 | ++++++++++++++++++++
 36 | 
 37 | [Hinton06]_ showed that RBMs can be stacked and trained in a greedy manner
 38 | to form so-called Deep Belief Networks (DBN). DBNs are graphical models which
 39 | learn to extract a deep hierarchical representation of the training data.
 40 | They model the joint distribution between observed vector :math:`x` and
 41 | the :math:`\ell` hidden layers :math:`h^k` as follows:
 42 | 
 43 | .. math::
 44 |     :label: dbn
 45 |  
 46 |     P(x, h^1, \ldots, h^{\ell}) = \left(\prod_{k=0}^{\ell-2} P(h^k|h^{k+1})\right) P(h^{\ell-1},h^{\ell})
 47 | 
 48 | where :math:`x=h^0`, :math:`P(h^{k-1} | h^k)` is a conditional distribution
 49 | for the visible units conditioned on the hidden units of the RBM at level
 50 | :math:`k`, and :math:`P(h^{\ell-1}, h^{\ell})` is the visible-hidden joint
 51 | distribution in the top-level RBM. This is illustrated in the figure below.
 52 | 
 53 | 
 54 | .. figure:: images/DBN3.png
 55 |     :align: center
 56 | 
 57 | The principle of greedy layer-wise unsupervised training can be applied to
 58 | DBNs with RBMs as the building blocks for each layer [Hinton06]_, [Bengio07]_.
 59 | The process is as follows:
 60 | 
 61 | 1. Train the first layer as an RBM that models the raw input :math:`x =
 62 | h^{(0)}` as its visible layer.
 63 | 
 64 | 2. Use that first layer to obtain a representation of the input that will
 65 | be used as data for the second layer. Two common solutions exist. This
 66 | representation can be chosen as being the mean activations
 67 | :math:`p(h^{(1)}=1|h^{(0)})` or samples of :math:`p(h^{(1)}|h^{(0)})`.
 68 | 
 69 | 3. Train the second layer as an RBM, taking the transformed data (samples or
 70 | mean activations) as training examples (for the visible layer of that RBM).
 71 | 
 72 | 4. Iterate (2 and 3) for the desired number of layers, each time propagating
 73 | upward either samples or mean values.
 74 | 
 75 | 5. Fine-tune all the parameters of this deep architecture with respect to a
 76 | proxy for the DBN log- likelihood, or with respect to a supervised training
 77 | criterion (after adding extra learning machinery to convert the learned
 78 | representation into supervised predictions, e.g. a linear classifier).
 79 | 
 80 | 
 81 | In this tutorial, we focus on fine-tuning via supervised gradient descent.
 82 | Specifically, we use a logistic regression classifier to classify the input
 83 | :math:`x` based on the output of the last hidden layer :math:`h^{(l)}` of the
 84 | DBN. Fine-tuning is then performed via supervised gradient descent of the
 85 | negative log-likelihood cost function. Since the supervised gradient is only
 86 | non-null for the weights and hidden layer biases of each layer (i.e. null for
 87 | the visible biases of each RBM), this procedure is equivalent to initializing
 88 | the parameters of a deep MLP with the weights and hidden layer biases obtained
 89 | with the unsupervised training strategy.
 90 | 
 91 | Justifying Greedy-Layer Wise Pre-Training
 92 | +++++++++++++++++++++++++++++++++++++++++
 93 | 
 94 | Why does such an algorithm work ? Taking as example a 2-layer DBN with hidden
 95 | layers :math:`h^{(1)}` and :math:`h^{(2)}` (with respective weight parameters
 96 | :math:`W^{(1)}` and :math:`W^{(2)}`), [Hinton06]_ established 
 97 | (see also Bengio09]_ for a detailed derivation) that :math:`\log
 98 | p(x)` can be rewritten as,
 99 | 
100 | .. math::
101 |     :label: dbn_bound
102 | 
103 |     \log p(x) = &KL(Q(h^{(1)}|x)||p(h^{(1)}|x)) + H_{Q(h^{(1)}|x)} + \\
104 |                 &\sum_h Q(h^{(1)}|x)(\log p(h^{(1)}) + \log p(x|h^{(1)})).
105 | 
106 | :math:`KL(Q(h^{(1)}|x) || p(h^{(1)}|x))` represents the KL divergence between
107 | the posterior :math:`Q(h^{(1)}|x)` of the first RBM if it were standalone, and the
108 | probability :math:`p(h^{(1)}|x)` for the same layer but defined by the entire DBN
109 | (i.e. taking into account the prior :math:`p(h^{(1)},h^{(2)})` defined by the
110 | top-level RBM). :math:`H_{Q(h^{(1)}|x)}` is the entropy of the distribution
111 | :math:`Q(h^{(1)}|x)`. 
112 | 
113 | It can be shown that if we initialize both hidden layers such that
114 | :math:`W^{(2)}={W^{(1)}}^T`, :math:`Q(h^{(1)}|x)=p(h^{(1)}|x)` and the KL
115 | divergence term is null. If we learn the first level RBM and then keep its
116 | parameters :math:`W^{(1)}` fixed, optimizing Eq. :eq:`dbn_bound` with respect
117 | to :math:`W^{(2)}` can thus only increase the likelihood :math:`p(x)`.  
118 | 
119 | Also, notice that if we isolate the terms which depend only on :math:`W^{(2)}`, we
120 | get: 
121 | 
122 | .. math:: 
123 |     \sum_h Q(h^{(1)}|x)p(h^{(1)})
124 |     
125 | Optimizing this with respect to :math:`W^{(2)}` amounts to training a second-stage
126 | RBM, using the output of :math:`Q(h^{(1)}|x)` as the training distribution,
127 | when :math:`x` is sampled from the training distribution for the first RBM.
128 | 
129 | Implementation
130 | ++++++++++++++
131 | 
132 | To implement DBNs in Theano, we will use the class defined in the :doc:`rbm`
133 | tutorial. One can also observe that the code for the DBN is very similar with the one
134 | for SdA, because both involve the principle of unsupervised layer-wise
135 | pre-training followed by supervised fine-tuning as a deep MLP. 
136 | The main difference is that we use the RBM class instead of the dA
137 | class.
138 | 
139 | We start off by defining the DBN class which will store the layers of the 
140 | MLP, along with their associated RBMs. Since we take the viewpoint of using
141 | the RBMs to initialize an MLP, the code will reflect this by seperating as
142 | much as possible the RBMs used to initialize the network and the MLP used for
143 | classification.
144 | 
145 | .. literalinclude:: ../code/DBN.py
146 |   :start-after: start-snippet-1
147 |   :end-before: end-snippet-1
148 | 
149 | ``self.sigmoid_layers`` will store the feed-forward graphs which together form
150 | the MLP, while ``self.rbm_layers`` will store the RBMs used to pretrain each
151 | layer of the MLP.
152 | 
153 | Next step, we construct ``n_layers`` sigmoid layers (we use the
154 | ``HiddenLayer`` class introduced in :ref:`mlp`, with the only modification
155 | that we replaced the non-linearity from ``tanh`` to the logistic function
156 | :math:`s(x) = \frac{1}{1+e^{-x}}`) and ``n_layers`` RBMs, where ``n_layers``
157 | is the depth of our model.  We link the sigmoid layers such that they form an
158 | MLP, and construct each RBM such that they share the weight matrix and the
159 | hidden bias with its corresponding sigmoid layer.
160 | 
161 | .. literalinclude:: ../code/DBN.py
162 |   :start-after: # MLP.
163 |   :end-before: # We now need to add a logistic layer on top of the MLP
164 | 
165 | All that is left is to stack one last logistic regression layer in order to
166 | form an MLP. We will use the ``LogisticRegression`` class introduced in
167 | :ref:`logreg`. 
168 | 
169 | .. literalinclude:: ../code/DBN.py
170 |   :start-after: # We now need to add a logistic layer on top of the MLP
171 |   :end-before: def pretraining_functions
172 | 
173 | The class also provides a method which generates training functions for each
174 | of the RBMs. They are returned as a list, where element :math:`i` is a
175 | function which implements one step of training for the ``RBM`` at layer
176 | :math:`i`.
177 | 
178 | .. literalinclude:: ../code/DBN.py
179 |   :start-after: self.errors = self.logLayer.errors(self.y)
180 |   :end-before: learning_rate = T.scalar('lr')
181 | 
182 | In order to be able to change the learning rate during training, we associate a
183 | Theano variable to it that has a default value.
184 | 
185 | .. literalinclude:: ../code/DBN.py
186 |   :start-after: index = T.lscalar('index')
187 |   :end-before: def build_finetune_functions
188 | 
189 | Now any function ``pretrain_fns[i]`` takes as arguments ``index`` and
190 | optionally ``lr`` -- the learning rate. Note that the names of the parameters
191 | are the names given to the Theano variables (e.g. ``lr``) when they are
192 | constructed and not the name of the python variables (e.g. ``learning_rate``). Keep
193 | this in mind when working with Theano. Optionally, if you provide ``k`` (the
194 | number of Gibbs steps to perform in CD or PCD) this will also become an
195 | argument of your function.
196 | 
197 | In the same fashion, the DBN class includes a method for building the
198 | functions required for finetuning ( a ``train_model``, a ``validate_model``
199 | and a ``test_model`` function). 
200 | 
201 | .. literalinclude:: ../code/DBN.py
202 |   :pyobject: DBN.build_finetune_functions
203 | 
204 | Note that the returned ``valid_score`` and ``test_score`` are not Theano
205 | functions, but rather Python functions. These loop over the entire 
206 | validation set and the entire test set to produce a list of the losses
207 | obtained over these sets.
208 | 
209 | 
210 | Putting it all together
211 | +++++++++++++++++++++++
212 | 
213 | The few lines of code below constructs the deep belief network : 
214 | 
215 | .. literalinclude:: ../code/DBN.py
216 |   :start-after: # numpy random generator
217 |   :end-before: start-snippet-2
218 | 
219 | There are two stages in training this network: (1) a layer-wise pre-training and
220 | (2) a fine-tuning stage.
221 | 
222 | For the pre-training stage, we loop over all the layers of the network. For
223 | each layer, we use the compiled theano function which determines the
224 | input to the ``i``-th level RBM and performs one step of CD-k within this RBM.
225 | This function is applied to the training set for a fixed number of epochs
226 | given by ``pretraining_epochs``.
227 | 
228 | .. literalinclude:: ../code/DBN.py
229 |   :start-after: start-snippet-2
230 |   :end-before: end-snippet-2
231 | 
232 | The fine-tuning loop is very similar to the one in the :ref:`mlp` tutorial,
233 | the only difference being that we now use the functions given by
234 | ``build_finetune_functions``.
235 | 
236 | Running the Code
237 | ++++++++++++++++
238 | 
239 | The user can run the code by calling:
240 | 
241 | .. code-block:: bash
242 |   
243 |   python code/DBN.py
244 | 
245 | With the default parameters, the code runs for 100 pre-training epochs with
246 | mini-batches of size 10. This corresponds to performing 500,000 unsupervised
247 | parameter updates. We use an unsupervised learning rate of 0.01, with a
248 | supervised learning rate of 0.1.  The DBN itself consists of three
249 | hidden layers with 1000 units per layer. With early-stopping, this configuration
250 | achieved a minimal validation error of 1.27 with corresponding test
251 | error of 1.34 after 46 supervised epochs.
252 | 
253 | On an Intel(R) Xeon(R) CPU X5560 running at 2.80GHz, using a multi-threaded MKL
254 | library (running on 4 cores), pretraining took 615 minutes with an average of
255 | 2.05 mins/(layer * epoch). Fine-tuning took only 101 minutes or approximately
256 | 2.20 mins/epoch.
257 | 
258 | Hyper-parameters were selected by optimizing on the validation error. We tested
259 | unsupervised learning rates in :math:`\{10^{-1}, ..., 10^{-5}\}` and supervised
260 | learning rates in :math:`\{10^{-1}, ..., 10^{-4}\}`. We did not use any form of
261 | regularization besides early-stopping, nor did we optimize over the number of
262 | pretraining updates.
263 | 
264 | 
265 | Tips and Tricks
266 | +++++++++++++++
267 | 
268 | One way to improve the running time of your code (given that you have
269 | sufficient memory available), is to compute the representation of the entire
270 | dataset at layer ``i`` in a single pass, once the weights of the
271 | :math:`i-1`-th layers have been fixed. Namely, start by training your first
272 | layer RBM. Once it is trained, you can compute the hidden units values for
273 | every example in the dataset and store this as a new dataset which is used to
274 | train the 2nd layer RBM. Once you trained the RBM for layer 2, you compute, in
275 | a similar fashion, the dataset for layer 3 and so on. This avoids calculating
276 | the intermediate (hidden layer) representations, ``pretraining_epochs`` times
277 | at the expense of increased memory usage.
278 | 


--------------------------------------------------------------------------------
/doc/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | .. _license:
 2 | 
 3 | LICENSE
 4 | =======
 5 | 
 6 | Copyright (c) 2008--2013, Theano Development Team
 7 | All rights reserved.
 8 | 
 9 | Redistribution and use in source and binary forms, with or without
10 | modification, are permitted provided that the following conditions are met:
11 | 
12 |     * Redistributions of source code must retain the above copyright
13 |       notice, this list of conditions and the following disclaimer.
14 |     * Redistributions in binary form must reproduce the above copyright
15 |       notice, this list of conditions and the following disclaimer in the
16 |       documentation and/or other materials provided with the distribution.
17 |     * Neither the name of Theano nor the names of its contributors may be
18 |       used to endorse or promote products derived from this software without
19 |       specific prior written permission.
20 | 
21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
22 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
25 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
28 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | 


--------------------------------------------------------------------------------
/doc/Makefile:
--------------------------------------------------------------------------------
1 | all:
2 | 	python scripts/docgen.py
3 | 


--------------------------------------------------------------------------------
/doc/SdA.txt:
--------------------------------------------------------------------------------
  1 | .. _SdA:
  2 | 
  3 | Stacked Denoising Autoencoders (SdA)
  4 | ====================================
  5 | 
  6 | .. note::
  7 |   This section assumes you have already read through :doc:`logreg`
  8 |   and :doc:`mlp`. Additionally it uses the following Theano functions
  9 |   and concepts : `T.tanh`_, `shared variables`_, `basic arithmetic ops`_, `T.grad`_, `Random numbers`_, `floatX`_. If you intend to run the code on GPU also read `GPU`_.
 10 | 
 11 | .. _T.tanh: http://deeplearning.net/software/theano/tutorial/examples.html?highlight=tanh
 12 | 
 13 | .. _shared variables: http://deeplearning.net/software/theano/tutorial/examples.html#using-shared-variables
 14 | 
 15 | .. _basic arithmetic ops: http://deeplearning.net/software/theano/tutorial/adding.html#adding-two-scalars
 16 | 
 17 | .. _T.grad: http://deeplearning.net/software/theano/tutorial/examples.html#computing-gradients
 18 | 
 19 | .. _floatX: http://deeplearning.net/software/theano/library/config.html#config.floatX
 20 | 
 21 | .. _GPU: http://deeplearning.net/software/theano/tutorial/using_gpu.html 
 22 | 
 23 | .. _Random numbers: http://deeplearning.net/software/theano/tutorial/examples.html#using-random-numbers
 24 | 
 25 | 
 26 | .. note::
 27 |     The code for this section is available for download `here`_.
 28 | 
 29 | .. _here: http://deeplearning.net/tutorial/code/SdA.py
 30 | 
 31 | 
 32 | The Stacked Denoising Autoencoder (SdA) is an extension of the stacked 
 33 | autoencoder [Bengio07]_ and it was introduced in [Vincent08]_. 
 34 | 
 35 | This tutorial builds on the previous tutorial :ref:`dA`.
 36 | Especially if you do not have experience with autoencoders, we recommend reading it
 37 | before going any further.
 38 | 
 39 | .. _stacked_autoencoders:
 40 | 
 41 | Stacked Autoencoders
 42 | ++++++++++++++++++++
 43 | 
 44 | Denoising autoencoders can be stacked to form a deep network by
 45 | feeding the latent representation (output code)
 46 | of the denoising autoencoder found on the layer 
 47 | below as input to the current layer. The **unsupervised pre-training** of such an 
 48 | architecture is done one layer at a time. Each layer is trained as 
 49 | a denoising autoencoder by minimizing the error in reconstructing its input
 50 | (which is the output code of the previous layer).
 51 | Once the first :math:`k` layers 
 52 | are trained, we can train the :math:`k+1`-th layer because we can now 
 53 | compute the code or latent representation from the layer below. 
 54 | 
 55 | Once all layers are pre-trained, the network goes through a second stage
 56 | of training called **fine-tuning**. Here we consider **supervised fine-tuning**
 57 | where we want to minimize prediction error on a supervised task.
 58 | For this, we first add a logistic regression 
 59 | layer on top of the network (more precisely on the output code of the
 60 | output layer). We then
 61 | train the entire network as we would train a multilayer 
 62 | perceptron. At this point, we only consider the encoding parts of
 63 | each auto-encoder.
 64 | This stage is supervised, since now we use the target class during
 65 | training. (See the :ref:`mlp` for details on the multilayer perceptron.)
 66 | 
 67 | This can be easily implemented in Theano, using the class defined
 68 | previously for a denoising autoencoder. We can see the stacked denoising
 69 | autoencoder as having two facades: a list of
 70 | autoencoders, and an MLP. During pre-training we use the first facade, i.e., we treat our model
 71 | as a list of autoencoders, and train each autoencoder seperately. In the 
 72 | second stage of training, we use the second facade. These two facades are linked because:
 73 | 
 74 | * the autoencoders and the sigmoid layers of the MLP share parameters, and
 75 | 
 76 | * the latent representations computed by intermediate layers of the MLP are fed as input to the autoencoders.
 77 | 
 78 | .. literalinclude:: ../code/SdA.py
 79 |   :start-after: start-snippet-1
 80 |   :end-before: end-snippet-1
 81 | 
 82 | ``self.sigmoid_layers`` will store the sigmoid layers of the MLP facade, while
 83 | ``self.dA_layers`` will store  the denoising autoencoder associated with the layers of the MLP. 
 84 | 
 85 | Next, we construct ``n_layers`` sigmoid layers and ``n_layers`` denoising 
 86 | autoencoders, where ``n_layers`` is the depth of our model. We use the
 87 | ``HiddenLayer`` class introduced in :ref:`mlp`, with one
 88 | modification: we replace the ``tanh`` non-linearity with the
 89 | logistic function :math:`s(x) = \frac{1}{1+e^{-x}}`).
 90 | We link the sigmoid layers to form an MLP, and construct
 91 | the denoising autoencoders such that each shares the weight matrix and the 
 92 | bias of its encoding part with its corresponding sigmoid layer.
 93 | 
 94 | .. literalinclude:: ../code/SdA.py
 95 |   :start-after: start-snippet-2
 96 |   :end-before: end-snippet-2
 97 | 
 98 | All we need now is to add a logistic layer on top of the sigmoid
 99 | layers such that we have an MLP. We will 
100 | use the ``LogisticRegression`` class introduced in :ref:`logreg`. 
101 | 
102 | .. literalinclude:: ../code/SdA.py
103 |   :start-after: end-snippet-2
104 |   :end-before: def pretraining_functions
105 | 
106 | The ``SdA`` class also provides a method that generates training functions for
107 | the denoising autoencoders in its layers. 
108 | They are returned as a list, where element :math:`i` is a function that
109 | implements one step of training the ``dA`` corresponding to layer 
110 | :math:`i`.
111 | 
112 | .. literalinclude:: ../code/SdA.py
113 |   :start-after: self.errors = self.logLayer.errors(self.y)
114 |   :end-before: corruption_level = T.scalar('corruption')
115 | 
116 | To be able to change the corruption level or the learning rate
117 | during training, we associate Theano variables with them.
118 | 
119 | .. literalinclude:: ../code/SdA.py
120 |   :start-after: index = T.lscalar('index')
121 |   :end-before: def build_finetune_functions
122 |  
123 | Now any function ``pretrain_fns[i]`` takes as arguments ``index`` and 
124 | optionally ``corruption``---the corruption level or ``lr``---the
125 | learning rate. Note that the names of the parameters are the names given 
126 | to the Theano variables when they are constructed, not the names of the 
127 | Python variables (``learning_rate`` or ``corruption_level``). Keep this 
128 | in mind when working with Theano. 
129 | 
130 | In the same fashion we build a method for constructing the functions required 
131 | during finetuning (``train_fn``, ``valid_score`` and
132 | ``test_score``).
133 | 
134 | .. literalinclude:: ../code/SdA.py
135 |   :pyobject: SdA.build_finetune_functions
136 | 
137 | Note that ``valid_score`` and ``test_score`` are not Theano
138 | functions, but rather Python functions that loop over the entire 
139 | validation set and the entire test set, respectively, producing a list of the losses
140 | over these sets.
141 | 
142 | Putting it all together
143 | +++++++++++++++++++++++
144 | 
145 | The few lines of code below construct the stacked denoising
146 | autoencoder: 
147 | 
148 | .. literalinclude:: ../code/SdA.py
149 |   :start-after: start-snippet-3
150 |   :end-before: end-snippet-3
151 | 
152 | There are two stages of training for this network: layer-wise pre-training 
153 | followed by fine-tuning. 
154 | 
155 | For the pre-training stage, we will loop over all the layers of the
156 | network. For each layer we will use the compiled Theano function that
157 | implements a SGD step towards optimizing the weights for reducing 
158 | the reconstruction cost of that layer. This function will be applied 
159 | to the training set for a fixed number of epochs given by
160 | ``pretraining_epochs``.
161 | 
162 | .. literalinclude:: ../code/SdA.py
163 |   :start-after: start-snippet-4
164 |   :end-before: end-snippet-4
165 | 
166 | The fine-tuning loop is very similar to the one in the :ref:`mlp`. The
167 | only difference is that it uses the functions given by
168 | ``build_finetune_functions``.
169 | 
170 | Running the Code
171 | ++++++++++++++++
172 | 
173 | The user can run the code by calling:
174 | 
175 | .. code-block:: bash
176 |   
177 |   python code/SdA.py
178 | 
179 | By default the code runs 15 pre-training epochs for each layer, with a batch
180 | size of 1. The corruption levels are 0.1 for the first layer, 0.2 for the second,
181 | and 0.3 for the third. The pretraining learning rate is 0.001 and 
182 | the finetuning learning rate is 0.1. Pre-training takes 585.01 minutes, with 
183 | an average of 13 minutes per epoch. Fine-tuning is completed after 36 epochs
184 | in 444.2 minutes, with an average of 12.34 minutes per epoch. The final 
185 | validation score is 1.39% with a testing score of 1.3%. 
186 | These results were obtained on a machine with an Intel
187 | Xeon E5430 @ 2.66GHz CPU, with a single-threaded GotoBLAS.
188 | 
189 | 
190 | Tips and Tricks
191 | +++++++++++++++
192 | 
193 | One way to improve the running time of your code (assuming you have
194 | sufficient memory available), is to compute how the network, up to layer
195 | :math:`k-1`, transforms your data. Namely, you start by training your first
196 | layer dA. Once it is trained, you can compute the hidden units values for
197 | every datapoint in your dataset and store this as a new dataset that you will
198 | use to train the dA corresponding to layer 2. Once you have trained the dA for
199 | layer 2, you compute, in a similar fashion, the dataset for layer 3 and so on.
200 | You can see now, that at this point, the dAs are trained individually, and
201 | they just provide (one to the other) a non-linear transformation of the input.
202 | Once all dAs are trained, you can start fine-tuning the model.
203 | 


--------------------------------------------------------------------------------
/doc/conf.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # theano documentation build configuration file, created by
  4 | # sphinx-quickstart on Tue Oct  7 16:34:06 2008.
  5 | #
  6 | # This file is execfile()d with the current directory set to its containing dir.
  7 | #
  8 | # The contents of this file are pickled, so don't put values in the namespace
  9 | # that aren't pickleable (module imports are okay, they're removed automatically).
 10 | #
 11 | # All configuration values have a default value; values that are commented out
 12 | # serve to show the default value.
 13 | import sys, os
 14 | 
 15 | # If your extensions are in another directory, add it here. If the directory
 16 | # is relative to the documentation root, use os.path.abspath to make it
 17 | # absolute, like shown here.
 18 | #sys.path.append(os.path.abspath('some/directory'))
 19 | 
 20 | # General configuration
 21 | # ---------------------
 22 | 
 23 | # Add any Sphinx extension module names here, as strings. They can be extensions
 24 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
 25 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.todo']
 26 | 
 27 | try:
 28 |     from sphinx.ext import pngmath
 29 |     extensions.append('sphinx.ext.pngmath')
 30 | except ImportError:
 31 |     print >>sys.stderr, 'Warning: could not import sphinx.ext.pngmath'
 32 |     pass
 33 | 
 34 | # Add any paths that contain templates here, relative to this directory.
 35 | templates_path = ['.templates']
 36 | 
 37 | # The suffix of source filenames.
 38 | source_suffix = '.txt'
 39 | 
 40 | # The master toctree document.
 41 | master_doc = 'contents'
 42 | 
 43 | # General substitutions.
 44 | project = 'DeepLearning'
 45 | copyright = '2008--2010, LISA lab'
 46 | 
 47 | # The default replacements for |version| and |release|, also used in various
 48 | # other places throughout the built documents.
 49 | #
 50 | # The short X.Y version.
 51 | version = '0.1'
 52 | # The full version, including alpha/beta/rc tags.
 53 | release = '0.1'
 54 | 
 55 | # There are two options for replacing |today|: either, you set today to some
 56 | # non-false value, then it is used:
 57 | #today = ''
 58 | # Else, today_fmt is used as the format for a strftime call.
 59 | today_fmt = '%B %d, %Y'
 60 | 
 61 | # List of documents that shouldn't be included in the build.
 62 | #unused_docs = []
 63 | 
 64 | # List of directories, relative to source directories, that shouldn't be searched
 65 | # for source files.
 66 | exclude_dirs = ['scripts']
 67 | 
 68 | # The reST default role (used for this markup: `text`) to use for all documents.
 69 | #default_role = None
 70 | 
 71 | # If true, '()' will be appended to :func: etc. cross-reference text.
 72 | #add_function_parentheses = True
 73 | 
 74 | # If true, the current module name will be prepended to all description
 75 | # unit titles (such as .. function::).
 76 | #add_module_names = True
 77 | 
 78 | # If true, sectionauthor and moduleauthor directives will be shown in the
 79 | # output. They are ignored by default.
 80 | #show_authors = False
 81 | 
 82 | # The name of the Pygments (syntax highlighting) style to use.
 83 | pygments_style = 'sphinx'
 84 | 
 85 | 
 86 | # Options for HTML output
 87 | # -----------------------
 88 | 
 89 | # The style sheet to use for HTML and HTML Help pages. A file of that name
 90 | # must exist either in Sphinx' static/ path, or in one of the custom paths
 91 | # given in html_static_path.
 92 | #html_style = 'default.css'
 93 | html_theme = 'sphinxdoc'
 94 | 
 95 | # The name for this set of Sphinx documents.  If None, it defaults to
 96 | # "<project> v<release> documentation".
 97 | #html_title = None
 98 | 
 99 | # A shorter title for the navigation bar.  Default is the same as html_title.
100 | #html_short_title = None
101 | 
102 | # The name of an image file (within the static path) to place at the top of
103 | # the sidebar.
104 | #html_logo = None
105 | 
106 | # The name of an image file (within the static path) to use as favicon of the
107 | # docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
108 | # pixels large.
109 | #html_favicon = None
110 | 
111 | # Add any paths that contain custom static files (such as style sheets) here,
112 | # relative to this directory. They are copied after the builtin static files,
113 | # so a file named "default.css" will overwrite the builtin "default.css".
114 | #html_static_path = ['.static', 'images']
115 | html_static_path = ['images']
116 | 
117 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
118 | # using the given strftime format.
119 | html_last_updated_fmt = '%b %d, %Y'
120 | 
121 | # If true, SmartyPants will be used to convert quotes and dashes to
122 | # typographically correct entities.
123 | html_use_smartypants = True
124 | 
125 | # Custom sidebar templates, maps document names to template names.
126 | #html_sidebars = {}
127 | 
128 | # Additional templates that should be rendered to pages, maps page names to
129 | # template names.
130 | #html_additional_pages = {}
131 | 
132 | # If false, no module index is generated.
133 | html_use_modindex = True
134 | 
135 | # If false, no index is generated.
136 | html_use_index = True
137 | 
138 | # If true, the index is split into individual pages for each letter.
139 | #html_split_index = False
140 | 
141 | # If true, the reST sources are included in the HTML build as _sources/<name>.
142 | #html_copy_source = True
143 | 
144 | # If true, an OpenSearch description file will be output, and all pages will
145 | # contain a <link> tag referring to it.  The value of this option must be the
146 | # base URL from which the finished HTML is served.
147 | #html_use_opensearch = ''
148 | 
149 | # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml").
150 | #html_file_suffix = ''
151 | 
152 | # Output file base name for HTML help builder.
153 | htmlhelp_basename = 'deeplearningdoc'
154 | 
155 | 
156 | # Options for LaTeX output
157 | # ------------------------
158 | 
159 | # The paper size ('letter' or 'a4').
160 | #latex_paper_size = 'letter'
161 | 
162 | # The font size ('10pt', '11pt' or '12pt').
163 | latex_font_size = '11pt'
164 | 
165 | # Grouping the document tree into LaTeX files. List of tuples
166 | # (source start file, target name, title, author, document class [howto/manual]).
167 | latex_documents = [
168 |   ('contents', 'deeplearning.tex', 'Deep Learning Tutorial',
169 |    'LISA lab, University of Montreal', 'manual'),
170 | ]
171 | 
172 | # The name of an image file (relative to this directory) to place at the top of
173 | # the title page.
174 | latex_logo = None
175 | 
176 | # For "manual" documents, if this is true, then toplevel headings are parts,
177 | # not chapters.
178 | #latex_use_parts = False
179 | 
180 | # Additional stuff for the LaTeX preamble.
181 | #latex_preamble = ''
182 | 
183 | # Documents to append as an appendix to all manuals.
184 | #latex_appendices = []
185 | 
186 | # If false, no module index is generated.
187 | #latex_use_modindex = True
188 | 
189 | default_role = 'math'
190 | pngmath_divpng_args = ['-gamma 1.5','-D 110']
191 | pngmath_latex_preamble =  '\\usepackage{amsmath}\n'+\
192 |                           '\\usepackage{amsfonts}\n'+\
193 |                           '\\usepackage{amssymb}\n'+\
194 |                           '\\def\\E{\\mathbf{E}}\n'+\
195 |                           '\\def\\F{\\mathbf{F}}\n'+\
196 |                           '\\def\\x{\\mathbf{x}}\n'+\
197 |                           '\\def\\h{\\mathbf{h}}\n'+\
198 |                           '\\def\\v{\\mathbf{v}}\n'+\
199 |                           '\\def\\nv{\\mathbf{v^{{\bf -}}}}\n'+\
200 |                           '\\def\\nh{\\mathbf{h^{{\bf -}}}}\n'+\
201 |                           '\\def\\s{\\mathbf{s}}\n'+\
202 |                           '\\def\\b{\\mathbf{b}}\n'+\
203 |                           '\\def\\c{\\mathbf{c}}\n'+\
204 |                           '\\def\\W{\\mathbf{W}}\n'+\
205 |                           '\\def\\C{\\mathbf{C}}\n'+\
206 |                           '\\def\\P{\\mathbf{P}}\n'+\
207 |                           '\\def\\T{{\\bf \\mathcal T}}\n'+\
208 |                           '\\def\\B{{\\bf \\mathcal B}}\n'
209 | 


--------------------------------------------------------------------------------
/doc/contents.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | .. _contents:
 3 | 
 4 | ========
 5 | Contents
 6 | ========
 7 | 
 8 | .. toctree::
 9 |    :maxdepth: 2
10 | 
11 |    LICENSE
12 |    index
13 |    gettingstarted
14 |    logreg
15 |    mlp
16 |    lenet
17 |    dA
18 |    SdA
19 |    rbm
20 |    DBN
21 |    hmc
22 |    rnnslu
23 |    lstm
24 |    rnnrbm
25 |    utilities
26 |    references
27 | 


--------------------------------------------------------------------------------
/doc/dA.txt:
--------------------------------------------------------------------------------
  1 | .. _daa:
  2 | 
  3 | Denoising Autoencoders (dA)
  4 | ===========================
  5 | 
  6 | .. note::
  7 |   This section assumes the reader has already read through :doc:`logreg`
  8 |   and :doc:`mlp`. Additionally it uses the following Theano functions
  9 |   and concepts : `T.tanh`_, `shared variables`_, `basic arithmetic ops`_, `T.grad`_, `Random numbers`_, `floatX`_. If you intend to run the code on GPU also read `GPU`_.
 10 | 
 11 | .. _T.tanh: http://deeplearning.net/software/theano/tutorial/examples.html?highlight=tanh
 12 | 
 13 | .. _shared variables: http://deeplearning.net/software/theano/tutorial/examples.html#using-shared-variables
 14 | 
 15 | .. _basic arithmetic ops: http://deeplearning.net/software/theano/tutorial/adding.html#adding-two-scalars
 16 | 
 17 | .. _T.grad: http://deeplearning.net/software/theano/tutorial/examples.html#computing-gradients
 18 | 
 19 | .. _floatX: http://deeplearning.net/software/theano/library/config.html#config.floatX
 20 | 
 21 | .. _GPU: http://deeplearning.net/software/theano/tutorial/using_gpu.html
 22 | 
 23 | .. _Random numbers: http://deeplearning.net/software/theano/tutorial/examples.html#using-random-numbers
 24 | 
 25 | 
 26 | .. note::
 27 |     The code for this section is available for download `here`_.
 28 | 
 29 | .. _here: http://deeplearning.net/tutorial/code/dA.py
 30 | 
 31 | 
 32 | The Denoising Autoencoder (dA) is an extension of a classical
 33 | autoencoder and it was introduced as a building block for deep networks
 34 | in [Vincent08]_. We will start the tutorial with a short discussion on
 35 | :ref:`autoencoders`.
 36 | 
 37 | .. _autoencoders:
 38 | 
 39 | Autoencoders
 40 | +++++++++++++
 41 | 
 42 | See section 4.6 of [Bengio09]_ for an overview of auto-encoders.
 43 | An autoencoder takes an input :math:`\mathbf{x} \in [0,1]^d` and first
 44 | maps it (with an *encoder)* to a hidden representation :math:`\mathbf{y} \in [0,1]^{d'}`
 45 | through a deterministic mapping, e.g.:
 46 | 
 47 | .. math::
 48 | 
 49 |   \mathbf{y} = s(\mathbf{W}\mathbf{x} + \mathbf{b})
 50 | 
 51 | Where :math:`s` is a non-linearity such as the sigmoid. The latent
 52 | representation :math:`\mathbf{y}`, or **code** is then mapped back (with a
 53 | *decoder)* into a **reconstruction** :math:`\mathbf{z}` of the same shape as
 54 | :math:`\mathbf{x}`. The mapping happens through a similar transformation, e.g.:
 55 | 
 56 | .. math::
 57 | 
 58 |   \mathbf{z} = s(\mathbf{W'}\mathbf{y} + \mathbf{b'})
 59 | 
 60 | (Here, the prime symbol does not indicate matrix transposition.)
 61 | :math:`\mathbf{z}` should be seen as a prediction of :math:`\mathbf{x}`, given
 62 | the code :math:`\mathbf{y}`. Optionally, the weight matrix :math:`\mathbf{W'}`
 63 | of the reverse mapping may be constrained to be the transpose of the forward
 64 | mapping: :math:`\mathbf{W'} = \mathbf{W}^T`. This is referred to as *tied
 65 | weights*. The parameters of this model (namely :math:`\mathbf{W}`,
 66 | :math:`\mathbf{b}`, :math:`\mathbf{b'}` and, if one doesn't use tied weights,
 67 | also :math:`\mathbf{W'}`) are optimized such that the average reconstruction
 68 | error is minimized.
 69 | 
 70 | The reconstruction error can be measured in many ways, depending on the
 71 | appropriate distributional assumptions on the input given the code. The
 72 | traditional *squared error* :math:`L(\mathbf{x} \mathbf{z}) = || \mathbf{x} -
 73 | \mathbf{z} ||^2`, can be used. If the input is interpreted as either bit
 74 | vectors or vectors of bit probabilities, *cross-entropy* of the reconstruction
 75 | can be used:
 76 | 
 77 | .. math::
 78 | 
 79 |   L_{H} (\mathbf{x}, \mathbf{z}) = - \sum^d_{k=1}[\mathbf{x}_k \log
 80 |           \mathbf{z}_k + (1 - \mathbf{x}_k)\log(1 - \mathbf{z}_k)]
 81 | 
 82 | The hope is that the code :math:`\mathbf{y}` is a *distributed* representation
 83 | that captures the coordinates along the main factors of variation in the data.
 84 | This is similar to the way the projection on principal components would capture
 85 | the main factors of variation in the data. Indeed, if there is one linear
 86 | hidden layer (the *code)* and the mean squared error criterion is used to train
 87 | the network, then the :math:`k` hidden units learn to project the input in the
 88 | span of the first :math:`k` principal components of the data. If the hidden
 89 | layer is non-linear, the auto-encoder behaves differently from PCA, with the
 90 | ability to capture multi-modal aspects of the input distribution. The departure
 91 | from PCA becomes even more important when we consider *stacking multiple
 92 | encoders* (and their corresponding decoders) when building a deep auto-encoder
 93 | [Hinton06]_.
 94 | 
 95 | Because :math:`\mathbf{y}` is viewed as a lossy compression of
 96 | :math:`\mathbf{x}`, it cannot be a good (small-loss) compression for all
 97 | :math:`\mathbf{x}`. Optimization makes it a good compression for training
 98 | examples, and hopefully for other inputs as well, but not for arbitrary inputs.
 99 | That is the sense in which an auto-encoder generalizes: it gives low
100 | reconstruction error on test examples from the same distribution as the
101 | training examples, but generally high reconstruction error on samples randomly
102 | chosen from the input space.
103 | 
104 | We want to implement an auto-encoder using Theano, in the form of a class, that
105 | could be afterwards used in constructing a stacked autoencoder. The first step
106 | is to create shared variables for the parameters of the autoencoder
107 | :math:`\mathbf{W}`, :math:`\mathbf{b}` and :math:`\mathbf{b'}`. (Since we are
108 | using tied weights in this tutorial, :math:`\mathbf{W}^T` will be used for
109 | :math:`\mathbf{W'}`):
110 | 
111 | .. literalinclude:: ../code/dA.py
112 |   :pyobject: dA.__init__
113 | 
114 | Note that we pass the symbolic ``input`` to the autoencoder as a parameter.
115 | This is so that we can concatenate layers of autoencoders to form a deep
116 | network: the symbolic output (the :math:`\mathbf{y}` above) of layer :math:`k` will
117 | be the symbolic input of layer :math:`k+1`.
118 | 
119 | Now we can express the computation of the latent representation and of the reconstructed
120 | signal:
121 | 
122 | .. literalinclude:: ../code/dA.py
123 |   :pyobject: dA.get_hidden_values
124 | 
125 | .. literalinclude:: ../code/dA.py
126 |   :pyobject: dA.get_reconstructed_input
127 | 
128 | And using these functions we can compute the cost and the updates of
129 | one stochastic gradient descent step :
130 | 
131 | .. literalinclude:: ../code/dA.py
132 |   :pyobject: dA.get_cost_updates
133 | 
134 | We can now define a function that applied iteratively will update the
135 | parameters ``W``, ``b`` and ``b_prime`` such that the
136 | reconstruction cost is approximately minimized.
137 | 
138 | .. literalinclude:: ../code/dA.py
139 |   :start-after: theano_rng = RandomStreams(rng.randint(2 ** 30))
140 |   :end-before: start_time = time.clock()
141 | 
142 | If there is no constraint besides minimizing the reconstruction error, one
143 | might expect an auto-encoder with :math:`n` inputs and an encoding of dimension
144 | :math:`n` (or greater) to learn the identity function, merely mapping an input
145 | to its copy. Such an autoencoder would not differentiate test examples (from
146 | the training distribution) from other input configurations. 
147 | 
148 | Surprisingly,
149 | experiments reported in [Bengio07]_ suggest that, in practice, when trained
150 | with stochastic gradient descent, non-linear auto-encoders with more hidden
151 | units than inputs (called overcomplete) yield useful representations. (Here,
152 | "useful" means that a network taking the encoding as input has low
153 | classification error.) 
154 | 
155 | A simple explanation is that stochastic gradient descent with early stopping is
156 | similar to an L2 regularization of the parameters. To achieve perfect
157 | reconstruction of continuous inputs, a one-hidden layer auto-encoder with
158 | non-linear hidden units (exactly like in the above code) needs very small
159 | weights in the first (encoding) layer, to bring the non-linearity of the hidden
160 | units into their linear regime, and very large weights in the second (decoding)
161 | layer. With binary inputs, very large weights are also needed to completely
162 | minimize the reconstruction error. Since the implicit or explicit
163 | regularization makes it difficult to reach large-weight solutions, the
164 | optimization algorithm finds encodings which only work well for examples
165 | similar to those in the training set, which is what we want. It means that the
166 | *representation is exploiting statistical regularities present in the training
167 | set,* rather than merely learning to replicate the input.
168 | 
169 | There are other ways by which an auto-encoder with more hidden units than inputs
170 | could be prevented from learning the identity function, capturing something
171 | useful about the input in its hidden representation. One is the addition of
172 | *sparsity* (forcing many of the hidden units to be zero or near-zero). Sparsity
173 | has been exploited very successfully by many [Ranzato07]_ [Lee08]_. Another is
174 | to add randomness in the transformation from input to reconstruction. This
175 | technique is used in Restricted Boltzmann Machines (discussed later in
176 | :ref:`rbm`), as well as in Denoising Auto-Encoders, discussed below.
177 | 
178 | .. _DA:
179 | 
180 | Denoising Autoencoders
181 | ++++++++++++++++++++++
182 | 
183 | The idea behind denoising autoencoders is simple. In order to force
184 | the hidden layer to discover more robust features and prevent it
185 | from simply learning the identity, we train the
186 | autoencoder to *reconstruct the input from a corrupted version of it*.
187 | 
188 | The denoising auto-encoder is a stochastic version of the auto-encoder.
189 | Intuitively, a denoising auto-encoder does two things: try to encode the input
190 | (preserve the information about the input), and try to undo the effect of a
191 | corruption process stochastically applied to the input of the auto-encoder. The
192 | latter can only be done by capturing the statistical dependencies between the
193 | inputs. The denoising auto-encoder can be understood from different
194 | perspectives (the manifold learning perspective, stochastic operator
195 | perspective, bottom-up -- information theoretic perspective, top-down --
196 | generative model perspective), all of which are explained in [Vincent08]_. See
197 | also section 7.2 of [Bengio09]_ for an overview of auto-encoders.
198 | 
199 | In [Vincent08]_, the stochastic corruption process randomly sets some of the
200 | inputs (as many as half of them) to zero. Hence the denoising auto-encoder is
201 | trying to *predict the corrupted (i.e. missing) values from the uncorrupted
202 | (i.e., non-missing) values*, for randomly selected subsets of missing patterns.
203 | Note how being able to predict any subset of variables from the rest is a
204 | sufficient condition for completely capturing the joint distribution between a
205 | set of variables (this is how Gibbs sampling works).
206 | 
207 | To convert the autoencoder class into a denoising autoencoder class, all we
208 | need to do is to add a stochastic corruption step operating on the input. The input can be
209 | corrupted in many ways, but in this tutorial we will stick to the original
210 | corruption mechanism of randomly masking entries of the input by making
211 | them zero. The code below
212 | does just that :
213 | 
214 | .. literalinclude:: ../code/dA.py
215 |   :pyobject: dA.get_corrupted_input
216 | 
217 | 
218 | In the stacked autoencoder class (:ref:`stacked_autoencoders`) the weights of
219 | the ``dA`` class have to be shared with those of a corresponding sigmoid layer.
220 | For this reason, the constructor of the ``dA`` also gets Theano variables
221 | pointing to the shared parameters. If those parameters are left to ``None``,
222 | new ones will be constructed.
223 | 
224 | The final denoising autoencoder class becomes :
225 | 
226 | .. literalinclude:: ../code/dA.py
227 |   :pyobject: dA
228 | 
229 | 
230 | 
231 | Putting it All Together
232 | +++++++++++++++++++++++
233 | 
234 | 
235 | It is easy now to construct an instance of our ``dA`` class and train
236 | it.
237 | 
238 | .. literalinclude:: ../code/dA.py
239 |   :language: python
240 |   :start-after: start-snippet-2
241 |   :end-before: end-snippet-2
242 | 
243 | .. literalinclude:: ../code/dA.py
244 |   :start-after: start-snippet-3
245 |   :end-before: end-snippet-3
246 | 
247 | 
248 | In order to get a feeling of what the network learned we are going to
249 | plot the filters (defined by the weight matrix). Bear in mind, however,
250 | that this does not provide the entire story,
251 | since we neglect the biases and plot the weights up to a multiplicative
252 | constant (weights are converted to values between 0 and 1).
253 | 
254 | To plot our filters we will need the help of ``tile_raster_images`` (see
255 | :ref:`how-to-plot`) so we urge the reader to study it. Also
256 | using the help of the Python Image Library, the following lines of code will
257 | save the filters as an image :
258 | 
259 | .. literalinclude:: ../code/dA.py
260 |   :start-after: start-snippet-4
261 |   :end-before: end-snippet-4
262 | 
263 | 
264 | Running the Code
265 | ++++++++++++++++
266 | 
267 | To run the code :
268 | 
269 | .. code-block:: bash
270 | 
271 |   python dA.py
272 | 
273 | The resulted filters when we do not use any noise are :
274 | 
275 | .. figure:: images/filters_corruption_0.png
276 |     :align: center
277 | 
278 | 
279 | 
280 | The filters for 30 percent noise :
281 | 
282 | 
283 | .. figure:: images/filters_corruption_30.png
284 |     :align: center
285 | 
286 | 
287 | 
288 | 


--------------------------------------------------------------------------------
/doc/images/3wolfmoon.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/3wolfmoon.jpg


--------------------------------------------------------------------------------
/doc/images/3wolfmoon_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/3wolfmoon_output.png


--------------------------------------------------------------------------------
/doc/images/DBN3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/DBN3.png


--------------------------------------------------------------------------------
/doc/images/bm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/bm.png


--------------------------------------------------------------------------------
/doc/images/cnn_explained.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/cnn_explained.png


--------------------------------------------------------------------------------
/doc/images/conv_1D_nn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/conv_1D_nn.png


--------------------------------------------------------------------------------
/doc/images/filters_at_epoch_14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/filters_at_epoch_14.png


--------------------------------------------------------------------------------
/doc/images/filters_corruption_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/filters_corruption_0.png


--------------------------------------------------------------------------------
/doc/images/filters_corruption_30.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/filters_corruption_30.png


--------------------------------------------------------------------------------
/doc/images/lstm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/lstm.png


--------------------------------------------------------------------------------
/doc/images/lstm_memorycell.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/lstm_memorycell.png


--------------------------------------------------------------------------------
/doc/images/markov_chain.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/markov_chain.png


--------------------------------------------------------------------------------
/doc/images/mlp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/mlp.png


--------------------------------------------------------------------------------
/doc/images/mnist_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/mnist_0.png


--------------------------------------------------------------------------------
/doc/images/mnist_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/mnist_1.png


--------------------------------------------------------------------------------
/doc/images/mnist_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/mnist_2.png


--------------------------------------------------------------------------------
/doc/images/mnist_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/mnist_3.png


--------------------------------------------------------------------------------
/doc/images/mnist_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/mnist_4.png


--------------------------------------------------------------------------------
/doc/images/mnist_5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/mnist_5.png


--------------------------------------------------------------------------------
/doc/images/mylenet.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/mylenet.png


--------------------------------------------------------------------------------
/doc/images/rbm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/rbm.png


--------------------------------------------------------------------------------
/doc/images/rnnrbm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/rnnrbm.png


--------------------------------------------------------------------------------
/doc/images/sample1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/sample1.png


--------------------------------------------------------------------------------
/doc/images/sample2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/sample2.png


--------------------------------------------------------------------------------
/doc/images/samples.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/samples.png


--------------------------------------------------------------------------------
/doc/images/sparse_1D_nn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamtrask/DeepLearningTutorials/c5d2aafc2a1448b412b7c5326a618fef6c24295a/doc/images/sparse_1D_nn.png


--------------------------------------------------------------------------------
/doc/index.txt:
--------------------------------------------------------------------------------
 1 | =======================
 2 | Deep Learning Tutorials
 3 | =======================
 4 | 
 5 | Deep Learning is a new area of Machine Learning research, which
 6 | has been introduced with the objective of moving Machine Learning
 7 | closer to one of its original goals: Artificial Intelligence.
 8 | See these course notes for a `brief introduction to Machine Learning for AI <http://www.iro.umontreal.ca/~pift6266/H10/notes/mlintro.html>`_
 9 | and an `introduction to Deep Learning algorithms <http://www.iro.umontreal.ca/~pift6266/H10/notes/deepintro.html>`_.
10 | 
11 | Deep Learning is about learning multiple levels of representation
12 | and abstraction that help to
13 | make sense of data such as images, sound, and text. 
14 | For more about deep learning algorithms, see for example:
15 | 
16 |  - The monograph or review paper `Learning Deep Architectures for AI <http://www.iro.umontreal.ca/~lisa/publications2/index.php/publications/show/239>`_ (Foundations & Trends in Machine Learning, 2009).
17 |  - The ICML 2009 Workshop on Learning Feature Hierarchies `webpage <http://www.cs.toronto.edu/~rsalakhu/deeplearning/index.html>`_ has a `list of references <http://www.cs.toronto.edu/~rsalakhu/deeplearning/references.html>`_.
18 |  - The LISA `public wiki <http://www.iro.umontreal.ca/~lisa/twiki/bin/view.cgi/Public/WebHome>`_ has a `reading list <http://www.iro.umontreal.ca/~lisa/twiki/bin/view.cgi/Public/ReadingOnDeepNetworks>`_ and a `bibliography <http://www.iro.umontreal.ca/~lisa/twiki/bin/view.cgi/Public/DeepNetworksBibliography>`_.
19 |  - Geoff Hinton has `readings <http://www.cs.toronto.edu/~hinton/deeprefs.html>`_ from 2009's `NIPS tutorial <http://videolectures.net/jul09_hinton_deeplearn/>`_.
20 | 
21 | The tutorials presented here will introduce you to some of the most important deep learning
22 | algorithms and will also show you how to run them using Theano_. Theano is a python library that makes writing deep learning models easy, and gives the option of
23 | training them on a GPU.
24 | 
25 | The algorithm tutorials have some prerequisites.  You should know some python,
26 | and be familiar with numpy. Since this tutorial is about using Theano, you
27 | should read over the `Theano basic tutorial`_ first.  Once you've done that,
28 | read through our :ref:`gettingstarted` chapter -- it introduces the notation, and [downloadable] datasets used in the algorithm tutorials, and the way we do optimization by stochastic gradient descent.  
29 | 
30 | The purely supervised learning algorithms are meant to be read in order:
31 | 
32 |   #. :ref:`Logistic Regression <logreg>` - using Theano for something simple
33 |   #. :ref:`Multilayer perceptron <mlp>` - introduction to layers
34 |   #. :ref:`Deep Convolutional Network <lenet>` - a simplified version of LeNet5
35 | 
36 | The unsupervised and semi-supervised learning algorithms can be read in any
37 | order (the auto-encoders can be read independently of the RBM/DBN thread):
38 | 
39 |   * :ref:`Auto Encoders, Denoising Autoencoders <daa>` - description of autoencoders
40 |   * :ref:`Stacked Denoising Auto-Encoders <SdA>` - easy steps into unsupervised pre-training for deep nets
41 |   * :ref:`Restricted Boltzmann Machines <rbm>` - single layer generative RBM model
42 |   * :ref:`Deep Belief Networks <DBN>` - unsupervised generative pre-training of stacked RBMs followed by supervised fine-tuning
43 | 
44 | Building towards including the mcRBM model, we have a new tutorial on sampling
45 | from energy models:
46 | 
47 |   * :ref:`HMC Sampling <HMC>` - hybrid (aka Hamiltonian) Monte-Carlo sampling with scan()
48 | 
49 | Building towards including the Contractive auto-encoders tutorial, we have the code for now:
50 |   * `Contractive auto-encoders`_ code - There is some basic doc in the code.
51 | 
52 | Recurrent neural networks with word embeddings and context window:
53 |   * :ref:`Semantic Parsing of Speech using Recurrent Net <rnnslu>`
54 | 
55 | LSTM network for sentiment analysis:
56 |   * :ref:`LSTM network <lstm>`
57 | 
58 | Energy-based recurrent neural network (RNN-RBM):
59 |   * :ref:`Modeling and generating sequences of polyphonic music <rnnrbm>`
60 | 
61 | .. _Theano: http://deeplearning.net/software/theano
62 | 
63 | .. _Theano basic tutorial: http://deeplearning.net/software/theano/tutorial
64 | 
65 | .. _Contractive auto-encoders: https://github.com/lisa-lab/DeepLearningTutorials/blob/master/code/cA.py
66 | 


--------------------------------------------------------------------------------
/doc/logreg.txt:
--------------------------------------------------------------------------------
  1 | .. index:: Logistic Regression
  2 | 
  3 | .. _logreg :
  4 | 
  5 | 
  6 | Classifying MNIST digits using Logistic Regression
  7 | ==================================================
  8 | 
  9 | .. note::
 10 |     This sections assumes familiarity with the following Theano
 11 |     concepts: `shared variables`_ , `basic arithmetic ops`_ , `T.grad`_ ,
 12 |     `floatX`_. If you intend to run the code on GPU also read `GPU`_.
 13 | 
 14 | .. note::
 15 |     The code for this section is available for download `here`_.
 16 | 
 17 | .. _here: http://deeplearning.net/tutorial/code/logistic_sgd.py
 18 | 
 19 | .. _shared variables: http://deeplearning.net/software/theano/tutorial/examples.html#using-shared-variables
 20 | 
 21 | .. _basic arithmetic ops: http://deeplearning.net/software/theano/tutorial/adding.html#adding-two-scalars
 22 | 
 23 | .. _T.grad: http://deeplearning.net/software/theano/tutorial/examples.html#computing-gradients
 24 | 
 25 | .. _floatX: http://deeplearning.net/software/theano/library/config.html#config.floatX
 26 | 
 27 | .. _GPU: http://deeplearning.net/software/theano/tutorial/using_gpu.html
 28 | 
 29 | In this section, we show how Theano can be used to implement the most basic
 30 | classifier: the logistic regression. We start off with a quick primer of the
 31 | model, which serves both as a refresher but also to anchor the notation and
 32 | show how mathematical expressions are mapped onto Theano graphs.
 33 | 
 34 | In the deepest of machine learning traditions, this tutorial will tackle the exciting
 35 | problem of MNIST digit classification.
 36 | 
 37 | The Model
 38 | +++++++++
 39 | 
 40 | Logistic regression is a probabilistic, linear classifier. It is parametrized
 41 | by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is
 42 | done by projecting an input vector onto a set of hyperplanes, each of which
 43 | corresponds to a class. The distance from the input to a hyperplane reflects
 44 | the probability that the input is a member of the corresponding class.
 45 | 
 46 | Mathematically, the probability that an input vector :math:`x` is a member of a
 47 | class :math:`i`, a value of a stochastic variable :math:`Y`, can be written as:
 48 | 
 49 | .. math::
 50 |   P(Y=i|x, W,b) &= softmax_i(W x + b) \\
 51 |                 &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}}
 52 | 
 53 | The model's prediction :math:`y_{pred}` is the class whose probability is maximal, specifically:
 54 | 
 55 | .. math::
 56 |   y_{pred} = {\rm argmax}_i P(Y=i|x,W,b)
 57 | 
 58 | The code to do this in Theano is the following:
 59 | 
 60 | .. literalinclude:: ../code/logistic_sgd.py
 61 |   :start-after: start-snippet-1
 62 |   :end-before: end-snippet-1
 63 | 
 64 | Since the parameters of the model must maintain a persistent state throughout
 65 | training, we allocate shared variables for :math:`W,b`. This declares them both
 66 | as being symbolic Theano variables, but also initializes their contents. The
 67 | dot and softmax operators are then used to compute the vector :math:`P(Y|x,
 68 | W,b)`. The result ``p_y_given_x`` is a symbolic variable of vector-type.
 69 | 
 70 | To get the actual model prediction, we can use the ``T.argmax`` operator, which
 71 | will return the index at which ``p_y_given_x`` is maximal (i.e. the class with
 72 | maximum probability).
 73 | 
 74 | Now of course, the model we have defined so far does not do anything useful
 75 | yet, since its parameters are still in their initial state. The following
 76 | section will thus cover how to learn the optimal parameters.
 77 | 
 78 | 
 79 | .. note::
 80 |     For a complete list of Theano ops, see: `list of ops <http://deeplearning.net/software/theano/library/tensor/basic.html#basic-tensor-functionality>`_
 81 | 
 82 | 
 83 | Defining a Loss Function
 84 | ++++++++++++++++++++++++
 85 | 
 86 | Learning optimal model parameters involves minimizing a loss function. In the
 87 | case of multi-class logistic regression, it is very common to use the negative
 88 | log-likelihood as the loss. This is equivalent to maximizing the likelihood of the
 89 | data set :math:`\cal{D}` under the model parameterized by :math:`\theta`. Let
 90 | us first start by defining the likelihood :math:`\cal{L}` and loss
 91 | :math:`\ell`:
 92 | 
 93 | .. math::
 94 | 
 95 |    \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
 96 |      \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
 97 |    \ell (\theta=\{W,b\}, \mathcal{D}) = - \mathcal{L} (\theta=\{W,b\}, \mathcal{D})
 98 | 
 99 | While entire books are dedicated to the topic of minimization, gradient
100 | descent is by far the simplest method for minimizing arbitrary non-linear
101 | functions. This tutorial will use the method of stochastic gradient method with
102 | mini-batches (MSGD). See :ref:`opt_SGD` for more details.
103 | 
104 | The following Theano code defines the (symbolic) loss for a given minibatch:
105 | 
106 | .. literalinclude:: ../code/logistic_sgd.py
107 |   :start-after: start-snippet-2
108 |   :end-before: end-snippet-2
109 | 
110 | .. note::
111 | 
112 |     Even though the loss is formally defined as the *sum*, over the data set,
113 |     of individual error terms, in practice, we use the *mean* (``T.mean``)
114 |     in the code. This allows for the learning rate choice to be less dependent
115 |     of the minibatch size.
116 | 
117 | 
118 | Creating a LogisticRegression class
119 | +++++++++++++++++++++++++++++++++++
120 | 
121 | We now have all the tools we need to define a ``LogisticRegression`` class, which
122 | encapsulates the basic behaviour of logistic regression. The code is very
123 | similar to what we have covered so far, and should be self explanatory.
124 | 
125 | .. literalinclude:: ../code/logistic_sgd.py
126 |   :pyobject: LogisticRegression
127 | 
128 | We instantiate this class as follows:
129 | 
130 | .. literalinclude:: ../code/logistic_sgd.py
131 |   :start-after: index = T.lscalar()  
132 |   :end-before: # the cost we minimize during
133 | 
134 | We start by allocating symbolic variables for the training inputs :math:`x` and
135 | their corresponding classes :math:`y`. Note that ``x`` and ``y`` are defined
136 | outside the scope of the ``LogisticRegression`` object. Since the class
137 | requires the input to build its graph, it is passed as a parameter of the
138 | ``__init__`` function. This is useful in case you want to connect instances of
139 | such classes to form a deep network. The output of one layer can be passed as
140 | the input of the layer above. (This tutorial does not build a multi-layer
141 | network, but this code will be reused in future tutorials that do.)
142 | 
143 | Finally, we define a (symbolic) ``cost`` variable to minimize, using the instance
144 | method ``classifier.negative_log_likelihood``.
145 | 
146 | .. literalinclude:: ../code/logistic_sgd.py
147 |   :start-after: classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10) 
148 |   :end-before: # compiling a Theano function that computes the mistakes
149 | 
150 | Note that ``x`` is an implicit symbolic input to the definition of ``cost``,
151 | because the symbolic variables of ``classifier`` were defined in terms of ``x``
152 | at initialization.
153 | 
154 | Learning the Model
155 | ++++++++++++++++++
156 | 
157 | To implement MSGD in most programming languages (C/C++, Matlab, Python), one
158 | would start by manually deriving the expressions for the gradient of the loss
159 | with respect to the parameters: in this case :math:`\partial{\ell}/\partial{W}`,
160 | and :math:`\partial{\ell}/\partial{b}`, This can get pretty tricky for complex
161 | models, as expressions for :math:`\partial{\ell}/\partial{\theta}` can get
162 | fairly complex, especially when taking into account problems of numerical
163 | stability.
164 | 
165 | With Theano, this work is greatly simplified. It performs
166 | automatic differentiation and applies certain math transforms to improve
167 | numerical stability.
168 | 
169 | To get the gradients :math:`\partial{\ell}/\partial{W}` and
170 | :math:`\partial{\ell}/\partial{b}` in Theano, simply do the following:
171 | 
172 | .. literalinclude:: ../code/logistic_sgd.py
173 |   :start-after: # compute the gradient of cost
174 |   :end-before: # start-snippet-3
175 | 
176 | ``g_W`` and ``g_b`` are symbolic variables, which can be used as part
177 | of a computation graph. The function ``train_model``, which performs one step
178 | of gradient descent, can then be defined as follows:
179 | 
180 | .. literalinclude:: ../code/logistic_sgd.py
181 |   :start-after: start-snippet-3
182 |   :end-before: end-snippet-3
183 | 
184 | ``updates`` is a list of pairs. In each pair, the first element is the symbolic
185 | variable to be updated in the step, and the second element is the symbolic
186 | function for calculating its new value. Similarly, ``givens`` is a dictionary
187 | whose keys are symbolic variables and whose values specify
188 | their replacements during the step. The function ``train_model`` is then defined such
189 | that:
190 | 
191 | * the input is the mini-batch index ``index`` that, together with the batch
192 |   size (which is not an input since it is fixed) defines :math:`x` with
193 |   corresponding labels :math:`y`
194 | * the return value is the cost/loss associated with the x, y defined by
195 |   the ``index``
196 | * on every function call, it will first replace ``x`` and ``y`` with the slices
197 |   from the training set specified by ``index``. Then, it will evaluate the cost
198 |   associated with that minibatch and apply the operations defined by the
199 |   ``updates`` list.
200 | 
201 | Each time ``train_model(index)`` is called, it will thus compute and return the
202 | cost of a minibatch, while also performing a step of MSGD. The entire learning
203 | algorithm thus consists in looping over all examples in the dataset, considering
204 | all the examples in one minibatch at a time,
205 | and repeatedly calling the ``train_model`` function.
206 | 
207 | 
208 | Testing the model
209 | +++++++++++++++++
210 | 
211 | As explained in :ref:`opt_learn_classifier`, when testing the model we are
212 | interested in the number of misclassified examples (and not only in the likelihood).
213 | The ``LogisticRegression`` class therefore has an extra instance method, which
214 | builds the symbolic graph for retrieving the number of misclassified examples in
215 | each minibatch.
216 | 
217 | The code is as follows:
218 | 
219 | .. literalinclude:: ../code/logistic_sgd.py
220 |   :pyobject: LogisticRegression.errors
221 | 
222 | We then create a function ``test_model`` and a function ``validate_model``,
223 | which we can call to retrieve this value. As you will see shortly,
224 | ``validate_model`` is key to our early-stopping implementation (see
225 | :ref:`opt_early_stopping`). These functions take a minibatch index and compute,
226 | for the examples in that minibatch, the number that were misclassified by the
227 | model. The only difference between them is that ``test_model`` draws its
228 | minibatches from the testing set, while ``validate_model`` draws its from the
229 | validation set.
230 | 
231 | .. literalinclude:: ../code/logistic_sgd.py
232 |   :start-after: cost = classifier.negative_log_likelihood(y)
233 |   :end-before: # compute the gradient of cost 
234 | 
235 | Putting it All Together
236 | +++++++++++++++++++++++
237 | 
238 | The finished product is as follows.
239 | 
240 | .. literalinclude:: ../code/logistic_sgd.py
241 | 
242 | The user can learn to classify MNIST digits with SGD logistic regression, by typing, from
243 | within the DeepLearningTutorials folder:
244 | 
245 | .. code-block:: bash
246 | 
247 |     python code/logistic_sgd.py
248 | 
249 | The output one should expect is of the form :
250 | 
251 | .. code-block:: bash
252 | 
253 |     ...
254 |     epoch 72, minibatch 83/83, validation error 7.510417 %
255 |          epoch 72, minibatch 83/83, test error of best model 7.510417 %
256 |     epoch 73, minibatch 83/83, validation error 7.500000 %
257 |          epoch 73, minibatch 83/83, test error of best model 7.489583 %
258 |     Optimization complete with best validation score of 7.500000 %,with test performance 7.489583 %
259 |     The code run for 74 epochs, with 1.936983 epochs/sec
260 | 
261 | 
262 | On an Intel(R) Core(TM)2 Duo CPU E8400 @ 3.00 Ghz  the code runs with
263 | approximately 1.936 epochs/sec and it took 75 epochs to reach a test
264 | error of 7.489%. On the GPU the code does almost 10.0 epochs/sec. For this
265 | instance we used a batch size of 600.
266 | 
267 | 
268 | Prediction Using a Trained Model
269 | ++++++++++++++++++++++++++++++++
270 | 
271 | ``sgd_optimization_mnist`` serialize and pickle the model each time new
272 | lowest validation error is reached. We can reload this model and predict
273 | labels of new data. ``predict`` function shows an example of how
274 | this could be done.
275 | 
276 | .. literalinclude:: ../code/logistic_sgd.py
277 |     :pyobject: predict
278 | 
279 | 
280 | .. rubric:: Footnotes
281 | 
282 | .. [#f1] For smaller datasets and simpler models, more sophisticated descent
283 |          algorithms can be more effective. The sample code
284 |          `logistic_cg.py <http://deeplearning.net/tutorial/code/logistic_cg.py>`_
285 |          demonstrates how to use SciPy's conjugate gradient solver with Theano
286 |          on the logistic regression task.
287 | 


--------------------------------------------------------------------------------
/doc/lstm.txt:
--------------------------------------------------------------------------------
  1 | .. _lstm:
  2 | 
  3 | LSTM Networks for Sentiment Analysis
  4 | **********************************************
  5 | 
  6 | Summary
  7 | +++++++
  8 | 
  9 | This tutorial aims to provide an example of how a Recurrent Neural Network
 10 | (RNN) using the Long Short Term Memory (LSTM) architecture can be implemented
 11 | using Theano. In this tutorial, this model is used to perform sentiment
 12 | analysis on movie reviews from the `Large Movie Review Dataset
 13 | <http://ai.stanford.edu/~amaas/data/sentiment/>`_, sometimes known as the
 14 | IMDB dataset.
 15 | 
 16 | In this task, given a movie review, the model attempts to predict whether it
 17 | is positive or negative. This is a binary classification task.
 18 | 
 19 | Data
 20 | ++++
 21 | 
 22 | As previously mentioned, the provided scripts are used to train a LSTM
 23 | recurrent neural network on the Large Movie Review Dataset dataset.
 24 | 
 25 | While the dataset is public, in this tutorial we provide a copy of the dataset
 26 | that has previously been preprocessed according to the needs of this LSTM
 27 | implementation. Running the code provided in this tutorial will automatically
 28 | download the data to the local directory. In order to use your own data, please
 29 | use a (`preprocessing script
 30 | <https://raw.githubusercontent.com/kyunghyuncho/DeepLearningTutorials/master/code/imdb_preprocess.py>`_)
 31 | provided as a part of this tutorial.
 32 | 
 33 | Once the model is trained, you can test it with your own corpus using the
 34 | word-index dictionary 
 35 | (`imdb.dict.pkl.gz <http://www.iro.umontreal.ca/~lisa/deep/data/imdb.dict.pkl.gz>`_)
 36 | provided as a part of this tutorial.
 37 | 
 38 | Model
 39 | +++++
 40 | 
 41 | LSTM
 42 | ====
 43 | 
 44 | In a *traditional* recurrent neural network, during the gradient
 45 | back-propagation phase, the gradient signal can end up being multiplied a
 46 | large number of times (as many as the number of timesteps) by the weight
 47 | matrix associated with the connections between the neurons of the recurrent
 48 | hidden layer. This means that, the magnitude of weights in the transition
 49 | matrix can have a strong impact on the learning process.
 50 | 
 51 | If the weights in this matrix are small (or, more formally, if the leading
 52 | eigenvalue of the weight matrix is smaller than 1.0), it can lead to a
 53 | situation called *vanishing gradients* where the gradient signal gets so small
 54 | that learning either becomes very slow or stops working altogether. It can
 55 | also make more difficult the task of learning long-term dependencies in the
 56 | data. Conversely, if the weights in this matrix are large (or, again, more
 57 | formally, if the leading eigenvalue of the weight matrix is larger than 1.0),
 58 | it can lead to a situation where the gradient signal is so large that it can
 59 | cause learning to diverge. This is often referred to as *exploding gradients*.
 60 | 
 61 | These issues are the main motivation behind the LSTM model which introduces a
 62 | new structure called a *memory cell* (see Figure 1 below). A memory cell is
 63 | composed of four main elements: an input gate, a neuron with a self-recurrent
 64 | connection (a connection to itself), a forget gate and an output gate. The
 65 | self-recurrent connection has a weight of 1.0 and ensures that, barring any
 66 | outside interference, the state of a memory cell can remain constant from one
 67 | timestep to another. The gates serve to modulate the interactions between the
 68 | memory cell itself and its environment. The input gate can allow incoming
 69 | signal to alter the state of the memory cell or block it. On the other hand,
 70 | the output gate can allow the state of the memory cell to have an effect on
 71 | other neurons or prevent it. Finally, the forget gate can modulate the memory
 72 | cell’s self-recurrent connection, allowing the cell to remember or forget its
 73 | previous state, as needed.
 74 | 
 75 | .. figure:: images/lstm_memorycell.png
 76 |     :align: center
 77 | 
 78 |     **Figure 1** : Illustration of an LSTM memory cell.
 79 | 
 80 | The equations below describe how a layer of memory cells is updated at every
 81 | timestep :math:`t`. In these equations :
 82 | 
 83 | *       :math:`x_t` is the input to the memory cell layer at time :math:`t`
 84 | *       :math:`W_i`, :math:`W_f`, :math:`W_c`, :math:`W_o`, :math:`U_i`,
 85 |         :math:`U_f`, :math:`U_c`, :math:`U_o` and :math:`V_o` are weight
 86 |         matrices
 87 | *       :math:`b_i`, :math:`b_f`, :math:`b_c` and :math:`b_o` are bias vectors
 88 | 
 89 | 
 90 | First, we compute the values for :math:`i_t`, the input gate, and
 91 | :math:`\widetilde{C_t}` the candidate value for the states of the memory
 92 | cells at time :math:`t` :
 93 | 
 94 | .. math::
 95 |     :label: 1
 96 | 
 97 |     i_t = \sigma(W_i x_t + U_i h_{t-1} + b_i)
 98 | 
 99 | .. math::
100 |     :label: 2
101 | 
102 |     \widetilde{C_t} = tanh(W_c x_t + U_c h_{t-1} + b_c)
103 | 
104 | Second, we compute the value for :math:`f_t`, the activation of the memory
105 | cells' forget gates at time :math:`t` :
106 | 
107 | .. math::
108 |     :label: 3
109 | 
110 |     f_t = \sigma(W_f x_t + U_f h_{t-1} + b_f)
111 | 
112 | Given the value of the input gate activation :math:`i_t`, the forget gate
113 | activation :math:`f_t` and the candidate state value :math:`\widetilde{C_t}`,
114 | we can compute :math:`C_t` the memory cells' new state at time :math:`t` :
115 | 
116 | .. math::
117 |     :label: 4
118 | 
119 |     C_t = i_t * \widetilde{C_t} + f_t * C_{t-1}
120 | 
121 | With the new state of the memory cells, we can compute the value of their
122 | output gates and, subsequently, their outputs :
123 | 
124 | .. math::
125 |     :label: 5
126 | 
127 |     o_t = \sigma(W_o x_t + U_o h_{t-1} + V_o C_t + b_o)
128 | 
129 | .. math::
130 |     :label: 6
131 | 
132 |     h_t = o_t * tanh(C_t)
133 | 
134 | Our model
135 | =========
136 | 
137 | The model we used in this tutorial is a variation of the standard LSTM model.
138 | In this variant, the activation of a cell’s output gate does not depend on the
139 | memory cell’s state :math:`C_t`. This allows us to perform part of the
140 | computation more efficiently (see the implementation note, below, for
141 | details). This means that, in the variant we have implemented, there is no
142 | matrix :math:`V_o` and equation :eq:`5` is replaced by equation :eq:`5-alt` :
143 | 
144 | .. math::
145 |     :label: 5-alt
146 | 
147 |     o_t = \sigma(W_o x_t + U_o h_{t-1} + b_o)
148 | 
149 | Our model is composed of a single LSTM layer followed by an average pooling
150 | and a logistic regression layer as illustrated in Figure 2 below. Thus, from
151 | an input sequence :math:`x_0, x_1, x_2, ..., x_n`, the memory cells in the
152 | LSTM layer will produce a representation sequence :math:`h_0, h_1, h_2, ...,
153 | h_n`. This representation sequence is then averaged over all timesteps
154 | resulting in representation h. Finally, this representation is fed to a
155 | logistic regression layer whose target is the class label associated with the
156 | input sequence.
157 | 
158 | .. figure:: images/lstm.png
159 |     :align: center
160 | 
161 |     **Figure 2** : Illustration of the model used in this tutorial. It is
162 |     composed of a single LSTM layer followed by mean pooling over time and
163 |     logistic regression.
164 | 
165 | **Implementation note** : In the code included this tutorial, the equations
166 | :eq:`1`, :eq:`2`, :eq:`3` and :eq:`5-alt` are performed in parallel to make
167 | the computation more efficient. This is possible because none of these
168 | equations rely on a result produced by the other ones. It is achieved by
169 | concatenating the four matrices :math:`W_*` into a single weight matrix
170 | :math:`W` and performing the same concatenation on the weight matrices
171 | :math:`U_*` to produce the matrix :math:`U` and the bias vectors :math:`b_*`
172 | to produce the vector :math:`b`. Then, the pre-nonlinearity activations can
173 | be computed with :
174 | 
175 | .. math::
176 | 
177 |     z = \sigma(W x_t + U h_{t-1} + b)
178 | 
179 | The result is then sliced to obtain the pre-nonlinearity activations for
180 | :math:`i`, :math:`f`, :math:`\widetilde{C_t}`, and :math:`o` and the
181 | non-linearities are then applied independently for each.
182 | 
183 | 
184 | Code - Citations - Contact
185 | ++++++++++++++++++++++++++
186 | 
187 | Code
188 | ====
189 | 
190 | The LSTM implementation can be found in the two following files :
191 | 
192 | * `lstm.py <http://deeplearning.net/tutorial/code/lstm.py>`_ : Main script. Defines and train the model.
193 | 
194 | * `imdb.py <http://deeplearning.net/tutorial/code/imdb.py>`_ : Secondary script. Handles the loading and preprocessing of the IMDB dataset.
195 | 
196 | After downloading both scripts and putting both in the same folder, the user
197 | can run the code by calling:
198 | 
199 | .. code-block:: bash
200 | 
201 |     THEANO_FLAGS="floatX=float32" python lstm.py
202 | 
203 | The script will automatically download the data and decompress it.
204 | 
205 | **Note** : The provided code supports the Stochastic Gradient Descent (SGD),
206 | AdaDelta and RMSProp optimization methods. You are advised to use AdaDelta or
207 | RMSProp because SGD appears to performs poorly on this task with this
208 | particular model.
209 | 
210 | Papers
211 | ======
212 | 
213 | If you use this tutorial, please cite the following papers.
214 | 
215 | Introduction of the LSTM model:
216 | 
217 | * `[pdf] <http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf>`__ Hochreiter, S., & Schmidhuber, J. (1997). Long short-term memory. Neural computation, 9(8), 1735-1780.
218 | 
219 | Addition of the forget gate to the LSTM model:
220 | 
221 | * `[pdf] <http://www.mitpressjournals.org/doi/pdf/10.1162/089976600300015015>`__ Gers, F. A., Schmidhuber, J., & Cummins, F. (2000). Learning to forget: Continual prediction with LSTM. Neural computation, 12(10), 2451-2471.
222 | 
223 | More recent LSTM paper:
224 | 
225 | * `[pdf] <http://www.cs.toronto.edu/~graves/preprint.pdf>`__ Graves, Alex. Supervised sequence labelling with recurrent neural networks. Vol. 385. Springer, 2012.
226 | 
227 | Papers related to Theano:
228 | 
229 | * `[pdf] <http://www.iro.umontreal.ca/~lisa/pointeurs/nips2012_deep_workshop_theano_final.pdf>`__ Bastien, Frédéric, Lamblin, Pascal, Pascanu, Razvan, Bergstra, James, Goodfellow, Ian, Bergeron, Arnaud, Bouchard, Nicolas, and Bengio, Yoshua. Theano: new features and speed improvements. NIPS Workshop on Deep Learning and Unsupervised Feature Learning, 2012.
230 | 
231 | * `[pdf] <http://www.iro.umontreal.ca/~lisa/pointeurs/theano_scipy2010.pdf>`__ Bergstra, James, Breuleux, Olivier, Bastien, Frédéric, Lamblin, Pascal, Pascanu, Razvan, Desjardins, Guillaume, Turian, Joseph, Warde-Farley, David, and Bengio, Yoshua. Theano: a CPU and GPU math expression compiler. In Proceedings of the Python for Scientific Computing Conference (SciPy), June 2010.
232 | 
233 | Thank you!
234 | 
235 | Contact
236 | =======
237 | 
238 | Please email `Pierre Luc Carrier <mailto:carriepl..at..iro.umontreal.ca>`_ or
239 | `Kyunghyun Cho <http://www.kyunghyuncho.me/>`_ for any problem report or
240 | feedback. We will be glad to hear from you.
241 | 
242 | References
243 | ++++++++++
244 | 
245 | * Hochreiter, S., & Schmidhuber, J. (1997). Long short-term memory. Neural computation, 9(8), 1735-1780.
246 | 
247 | * Gers, F. A., Schmidhuber, J., & Cummins, F. (2000). Learning to forget: Continual prediction with LSTM. Neural computation, 12(10), 2451-2471.
248 | 
249 | * Graves, A. (2012). Supervised sequence labelling with recurrent neural networks (Vol. 385). Springer.
250 | 
251 | * Hochreiter, S., Bengio, Y., Frasconi, P., & Schmidhuber, J. (2001). Gradient flow in recurrent nets: the difficulty of learning long-term dependencies.
252 | 
253 | * Bengio, Y., Simard, P., & Frasconi, P. (1994). Learning long-term dependencies with gradient descent is difficult. Neural Networks, IEEE Transactions on, 5(2), 157-166.
254 | 
255 | * Maas, A. L., Daly, R. E., Pham, P. T., Huang, D., Ng, A. Y., & Potts, C. (2011, June). Learning word vectors for sentiment analysis. In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies-Volume 1 (pp. 142-150). Association for Computational Linguistics.
256 | 


--------------------------------------------------------------------------------
/doc/mlp.txt:
--------------------------------------------------------------------------------
  1 | .. index:: Multilayer Perceptron
  2 | 
  3 | .. _mlp:
  4 | 
  5 | 
  6 | Multilayer Perceptron
  7 | =====================
  8 | 
  9 | .. note::
 10 |     This section assumes the reader has already read through :doc:`logreg`.
 11 |     Additionally, it uses the following new Theano functions and concepts:
 12 |     `T.tanh`_, `shared variables`_, `basic arithmetic ops`_, `T.grad`_,
 13 |     :ref:`L1_L2_regularization`, `floatX`_. If you intend to run the
 14 |     code on GPU also read `GPU`_.
 15 | 
 16 | .. note::
 17 |     The code for this section is available for download `here`_.
 18 | 
 19 | .. _here: http://deeplearning.net/tutorial/code/mlp.py
 20 | 
 21 | .. _T.tanh: http://deeplearning.net/software/theano/tutorial/examples.html?highlight=tanh
 22 | 
 23 | .. _shared variables: http://deeplearning.net/software/theano/tutorial/examples.html#using-shared-variables
 24 | 
 25 | .. _basic arithmetic ops: http://deeplearning.net/software/theano/tutorial/adding.html#adding-two-scalars
 26 | 
 27 | .. _T.grad: http://deeplearning.net/software/theano/tutorial/examples.html#computing-gradients
 28 | 
 29 | .. _floatX: http://deeplearning.net/software/theano/library/config.html#config.floatX
 30 | 
 31 | .. _GPU: http://deeplearning.net/software/theano/tutorial/using_gpu.html
 32 | 
 33 | 
 34 | The next architecture we are going to present using Theano is the
 35 | single-hidden-layer Multi-Layer Perceptron (MLP). An MLP can be viewed as a
 36 | logistic regression classifier where the input is first transformed using a
 37 | learnt non-linear transformation :math:`\Phi`. This transformation projects the
 38 | input data into a space where it becomes linearly separable. This intermediate
 39 | layer is referred to as a **hidden layer**. A single hidden layer is sufficient
 40 | to make MLPs a **universal approximator**. However we will see later on that
 41 | there are substantial benefits to using many such hidden layers, i.e. the very
 42 | premise of **deep learning**. See these course notes for an `introduction to
 43 | MLPs, the back-propagation algorithm, and how to train MLPs
 44 | <http://www.iro.umontreal.ca/~pift6266/H10/notes/mlp.html>`_.
 45 | 
 46 | This tutorial will again tackle the problem of MNIST digit classification.
 47 | 
 48 | The Model
 49 | +++++++++
 50 | 
 51 | An MLP (or Artificial Neural Network - ANN) with a single hidden layer
 52 | can be represented graphically as
 53 | follows:
 54 | 
 55 | .. figure:: images/mlp.png
 56 |     :align: center
 57 | 
 58 | Formally, a one-hidden-layer MLP is a function :math:`f: R^D \rightarrow
 59 | R^L`, where :math:`D` is the size of input vector :math:`x` and :math:`L` is
 60 | the size of the output vector :math:`f(x)`, such that, in matrix notation:
 61 | 
 62 | .. math::
 63 | 
 64 |     f(x) = G( b^{(2)} + W^{(2)}( s( b^{(1)} + W^{(1)} x))),
 65 | 
 66 | with bias vectors :math:`b^{(1)}`, :math:`b^{(2)}`; weight matrices
 67 | :math:`W^{(1)}`, :math:`W^{(2)}` and activation functions :math:`G` and :math:`s`.
 68 | 
 69 | The vector :math:`h(x) = \Phi(x) = s(b^{(1)} + W^{(1)} x)` constitutes the hidden layer.
 70 | :math:`W^{(1)} \in R^{D \times D_h}` is the weight matrix connecting the input vector
 71 | to the hidden layer.  Each column :math:`W^{(1)}_{\cdot i}` represents the weights
 72 | from the input units to the i-th hidden unit. Typical choices for :math:`s`
 73 | include :math:`tanh`, with :math:`tanh(a)=(e^a-e^{-a})/(e^a+e^{-a})`,
 74 | or the logistic :math:`sigmoid` function, with :math:`sigmoid(a)=1/(1+e^{-a})`. We will be using
 75 | :math:`tanh` in this tutorial because it typically yields to faster training
 76 | (and sometimes also to better local minima). Both the :math:`tanh`
 77 | and :math:`sigmoid` are scalar-to-scalar functions but their natural
 78 | extension to vectors and tensors consists in applying them element-wise
 79 | (e.g. separately on each element of the vector, yielding a same-size vector).
 80 | 
 81 | The output vector is then obtained as: :math:`o(x) = G(b^{(2)} + W^{(2)} h(x))`.
 82 | The reader should recognize the form we already used for
 83 | :doc:`logreg`. As before,
 84 | class-membership probabilities can be obtained by choosing :math:`G` as the
 85 | :math:`softmax` function (in the case of multi-class classification).
 86 | 
 87 | To train an MLP, we learn **all** parameters of the model, and here we use
 88 | :ref:`opt_SGD` with minibatches.
 89 | The set of parameters to learn is the set :math:`\theta =
 90 | \{W^{(2)},b^{(2)},W^{(1)},b^{(1)}\}`.  Obtaining the gradients
 91 | :math:`\partial{\ell}/\partial{\theta}` can be achieved through the
 92 | **backpropagation algorithm** (a special case of the chain-rule of derivation).
 93 | Thankfully, since Theano performs automatic differentation, we will not need to
 94 | cover this in the tutorial !
 95 | 
 96 | 
 97 | Going from logistic regression to MLP
 98 | +++++++++++++++++++++++++++++++++++++
 99 | 
100 | This tutorial will focus on a single-hidden-layer MLP. We start off by
101 | implementing a class that will represent a hidden layer. To
102 | construct the MLP we will then only need to throw a logistic regression
103 | layer on top.
104 | 
105 | .. literalinclude:: ../code/mlp.py
106 |   :start-after: start-snippet-1
107 |   :end-before: end-snippet-1
108 | 
109 | The initial values for the weights of a hidden layer :math:`i` should be uniformly
110 | sampled from a symmetric interval that depends on the activation function. For
111 | :math:`tanh` activation function results obtained in [Xavier10]_ show that the
112 | interval should be
113 | :math:`[-\sqrt{\frac{6}{fan_{in}+fan_{out}}},\sqrt{\frac{6}{fan_{in}+fan_{out}}}]`, where
114 | :math:`fan_{in}` is the number of units in the :math:`(i-1)`-th layer,
115 | and :math:`fan_{out}` is the number of units in the :math:`i`-th layer. For
116 | the sigmoid function the interval is :math:`[-4\sqrt{\frac{6}{fan_{in}+fan_{out}}},4\sqrt{\frac{6}{fan_{in}+fan_{out}}}]`.
117 | This initialization ensures that, early in training, each neuron operates in a
118 | regime of its activation function where information can easily be propagated
119 | both upward (activations flowing from inputs to outputs) and backward
120 | (gradients flowing from outputs to inputs).
121 | 
122 | .. literalinclude:: ../code/mlp.py
123 |   :start-after: end-snippet-1
124 |   :end-before:  lin_output = T.dot(input, self.W) + self.b
125 | 
126 | Note that we used a given non-linear function as the activation function of the hidden layer. By default this is ``tanh``, but in many cases we might want
127 | to use something else.
128 | 
129 | .. literalinclude:: ../code/mlp.py
130 |   :start-after: self.b = b
131 |   :end-before: # parameters of the model
132 | 
133 | If you look into theory this class implements the graph that computes
134 | the hidden layer value :math:`h(x) = \Phi(x) = s(b^{(1)} + W^{(1)} x)`.
135 | If you give this graph as input to the ``LogisticRegression`` class,
136 | implemented in the previous tutorial :doc:`logreg`, you get the output
137 | of the MLP. You can see this in the following short implementation of
138 | the ``MLP`` class.
139 | 
140 | .. literalinclude:: ../code/mlp.py
141 |   :start-after: start-snippet-2
142 |   :end-before: end-snippet-2
143 | 
144 | In this tutorial we will also use L1 and L2 regularization (see
145 | :ref:`L1_L2_regularization`). For this, we need to compute the L1 norm and the squared L2
146 | norm of the weights :math:`W^{(1)}, W^{(2)}`.
147 | 
148 | .. literalinclude:: ../code/mlp.py
149 |   :start-after: start-snippet-3
150 |   :end-before: end-snippet-3
151 | 
152 | As before, we train this model using stochastic gradient descent with
153 | mini-batches. The difference is that we modify the cost function to include the
154 | regularization term. ``L1_reg`` and ``L2_reg`` are the hyperparameters
155 | controlling the weight of these regularization terms in the total cost function.
156 | The code that computes the new cost is:
157 | 
158 | .. literalinclude:: ../code/mlp.py
159 |   :start-after: start-snippet-4
160 |   :end-before: end-snippet-4
161 | 
162 | We then update the parameters of the model using the gradient. This code is
163 | almost identical to the one for logistic regression. Only the number of
164 | parameters differ. To get around this ( and write code that could work
165 | for any number of parameters) we will use the list of parameters that
166 | we created with the model ``params`` and parse it, computing a gradient
167 | at each step.
168 | 
169 | .. literalinclude:: ../code/mlp.py
170 |   :start-after: start-snippet-5
171 |   :end-before: end-snippet-5
172 | 
173 | Putting it All Together
174 | +++++++++++++++++++++++
175 | 
176 | Having covered the basic concepts, writing an MLP class becomes quite easy.
177 | The code below shows how this can be done, in a way which is analogous to our previous logistic regression implementation.
178 | 
179 | .. literalinclude:: ../code/mlp.py
180 | 
181 | The user can then run the code by calling :
182 | 
183 | .. code-block:: bash
184 | 
185 |     python code/mlp.py
186 | 
187 | The output one should expect is of the form :
188 | 
189 | .. code-block:: bash
190 | 
191 |   Optimization complete. Best validation score of 1.690000 % obtained at iteration 2070000, with test performance 1.650000 %
192 |   The code for file mlp.py ran for 97.34m
193 | 
194 | On an Intel(R) Core(TM) i7-2600K CPU @ 3.40GHz the code runs with
195 | approximately 10.3 epoch/minute and it took 828 epochs to reach a test
196 | error of 1.65%.
197 | 
198 | To put this into perspective, we refer the reader to the results section of `this
199 | <http://yann.lecun.com/exdb/mnist>`_  page.
200 | 
201 | Tips and Tricks for training MLPs
202 | +++++++++++++++++++++++++++++++++
203 | 
204 | There are several hyper-parameters in the above code, which are not (and,
205 | generally speaking, cannot be) optimized by gradient descent. Strictly speaking,
206 | finding an optimal set of values for these
207 | hyper-parameters is not a feasible problem. First, we can't simply optimize
208 | each of them independently. Second, we cannot readily apply gradient
209 | techniques that we described previously (partly because some parameters are
210 | discrete values and others are real-valued). Third, the optimization problem
211 | is not convex and finding a (local) minimum would involve a non-trivial
212 | amount of work.
213 | 
214 | The good news is that over the last 25 years, researchers have devised various
215 | rules of thumb for choosing hyper-parameters in a neural network. A very
216 | good overview of these tricks can be found in `Efficient
217 | BackProp <http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf>`_ by Yann LeCun,
218 | Leon Bottou, Genevieve Orr, and Klaus-Robert Mueller. In here, we summarize
219 | the same issues, with an emphasis on the parameters and techniques that we
220 | actually used in our code.
221 | 
222 | Nonlinearity
223 | --------------
224 | 
225 | Two of the most common ones are the :math:`sigmoid` and the :math:`tanh` function. For
226 | reasons explained in `Section 4.4  <http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf>`_, nonlinearities that
227 | are symmetric around the origin are preferred because they tend to produce
228 | zero-mean inputs to the next layer (which is a desirable property).
229 | Empirically, we have observed that the :math:`tanh` has better convergence
230 | properties.
231 | 
232 | Weight initialization
233 | ---------------------
234 | 
235 | At initialization we want the weights to be small enough around the origin
236 | so that the activation function operates in its linear regime, where gradients are
237 | the largest. Other desirable properties, especially for deep networks,
238 | are to conserve variance of the activation as well as variance of back-propagated gradients from layer to layer.
239 | This allows information to flow well upward and downward in the network and
240 | reduces discrepancies between layers.
241 | Under some assumptions, a compromise between these two constraints leads to the following
242 | initialization: :math:`uniform[-\frac{\sqrt{6}}{\sqrt{fan_{in}+fan_{out}}},\frac{\sqrt{6}}{\sqrt{fan_{in}+fan_{out}}}]`
243 | for tanh and :math:`uniform[-4*\frac{\sqrt{6}}{\sqrt{fan_{in}+fan_{out}}},4*\frac{\sqrt{6}}{\sqrt{fan_{in}+fan_{out}}}]`
244 | for sigmoid. Where :math:`fan_{in}` is the number of inputs and :math:`fan_{out}` the number of hidden units.
245 | For mathematical considerations please refer to [Xavier10]_.
246 | 
247 | Learning rate
248 | --------------
249 | 
250 | There is a great deal of literature on choosing a good learning rate. The
251 | simplest solution is to simply have a constant rate. Rule of thumb: try
252 | several log-spaced values (:math:`10^{-1},10^{-2},\ldots`) and narrow the
253 | (logarithmic) grid search to the region where you obtain the lowest
254 | validation error.
255 | 
256 | Decreasing the learning rate over time is sometimes a good idea. One simple
257 | rule for doing that is :math:`\frac{\mu_0}{1 + d\times t}` where
258 | :math:`\mu_0` is the initial rate (chosen, perhaps, using the grid search
259 | technique explained above), :math:`d` is a so-called "decrease constant"
260 | which controls the rate at which the learning rate decreases (typically, a
261 | smaller positive number, :math:`10^{-3}` and smaller) and :math:`t` is the
262 | epoch/stage.
263 | 
264 | `Section 4.7 <http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf>`_ details
265 | procedures for choosing a learning rate for each parameter (weight) in our
266 | network and for choosing them adaptively based on the error of the
267 | classifier.
268 | 
269 | Number of hidden units
270 | -----------------------
271 | 
272 | This hyper-parameter is very much dataset-dependent. Vaguely speaking, the
273 | more complicated the input distribution is, the more capacity the network
274 | will require to model it, and so the larger the number of hidden units that
275 | will be needed (note that the number of weights in a layer, perhaps a more direct
276 | measure of capacity, is :math:`D\times D_h` (recall :math:`D` is the number of
277 | inputs and :math:`D_h` is the number of hidden units).
278 | 
279 | Unless we employ some regularization scheme (early stopping or L1/L2
280 | penalties), a typical number of hidden  units vs. generalization performance graph will be U-shaped.
281 | 
282 | Regularization parameter
283 | ------------------------
284 | 
285 | Typical values to try for the L1/L2 regularization parameter :math:`\lambda`
286 | are :math:`10^{-2},10^{-3},\ldots`. In the framework that we described so
287 | far, optimizing this parameter will not lead to significantly better
288 | solutions, but is worth exploring nonetheless.
289 | 
290 | 


--------------------------------------------------------------------------------
/doc/references.txt:
--------------------------------------------------------------------------------
 1 | .. _references:
 2 | 
 3 | ==========
 4 | References
 5 | ==========
 6 | 
 7 | .. [Bengio07] Y. Bengio, P. Lamblin, D. Popovici and H. Larochelle, `Greedy Layer-Wise Training of Deep Networks <http://www.iro.umontreal.ca/~lisa/publications2/index.php/publications/show/190>`_, in Advances in Neural Information Processing Systems 19 (NIPS'06), pages  153-160, MIT Press 2007.
 8 | 
 9 | .. [Bengio09] Y. Bengio, `Learning deep architectures for AI <http://www.iro.umontreal.ca/~lisa/publications2/index.php/publications/show/239>`_, Foundations and Trends in Machine Learning 1(2) pages 1-127.
10 | 
11 | .. [BengioDelalleau09] Y. Bengio, O. Delalleau, Justifying and Generalizing Contrastive Divergence (2009), Neural Computation, 21(6): 1601-1621.
12 | 
13 | .. [BoulangerLewandowski12] N Boulanger-Lewandowski, Y. Bengio and P. Vincent, `Modeling Temporal Dependencies in High-Dimensional Sequences: Application to Polyphonic Music Generation and Transcription <http://www-etud.iro.umontreal.ca/~boulanni/icml2012>`_, in Proceedings of the 29th International Conference on Machine Learning (ICML), 2012.
14 | 
15 | .. [Fukushima] Fukushima, K. (1980). Neocognitron: A self-organizing neural network model for a mechanism of pattern recognition unaffected by shift in position. Biological Cybernetics, 36, 193–202.
16 | 
17 | .. [Hinton06] G.E. Hinton and R.R. Salakhutdinov, `Reducing the Dimensionality of Data with Neural Networks <http://www.cs.toronto.edu/~rsalakhu/papers/science.pdf>`_, Science, 28 July 2006, Vol. 313. no. 5786, pp. 504 - 507.
18 | 
19 | .. [Hinton07] G.E. Hinton, S. Osindero, and Y. Teh, "A fast learning algorithm for deep belief nets", Neural Computation, vol 18, 2006
20 | 
21 | .. [Hubel68] Hubel, D. and Wiesel, T. (1968). Receptive fields and functional architecture of monkey striate cortex. Journal of Physiology (London), 195, 215–243.
22 | 
23 | .. [LeCun98] LeCun, Y., Bottou, L., Bengio, Y., and Haffner, P. (1998d).  Gradient-based learning applied to document recognition. Proceedings of the IEEE, 86(11), 2278–2324.
24 | 
25 | .. [Lee08] H. Lee, C. Ekanadham, and A.Y. Ng., `Sparse deep belief net model for visual area V2 <http://www.stanford.edu/~hllee/nips07-sparseDBN.pdf>`_, in Advances in Neural Information Processing Systems (NIPS) 20, 2008.
26 | 
27 | .. [Lee09] H. Lee, R. Grosse, R. Ranganath, and A.Y. Ng, "Convolutional deep belief networks for scalable unsupervised learning of hierarchical representations.", ICML 2009
28 | 
29 | .. [Ranzato10] M. Ranzato, A. Krizhevsky, G. Hinton, "Factored 3-Way Restricted Boltzmann Machines for Modeling Natural Images". Proc. of the 13-th International Conference on Artificial Intelligence and Statistics (AISTATS 2010), Italy, 2010
30 | 
31 | .. [Ranzato07] M.A. Ranzato, C. Poultney, S. Chopra and Y. LeCun, in J. Platt et al., `Efficient Learning of Sparse Representations with an Energy-Based Model <http://yann.lecun.com/exdb/publis/pdf/ranzato-06.pdf>`_, Advances in Neural Information Processing Systems (NIPS 2006), MIT Press, 2007.
32 | 
33 | .. [Serre07] Serre, T., Wolf, L., Bileschi, S., and Riesenhuber, M. (2007).  Robust object recog- nition with cortex-like mechanisms. IEEE Trans. Pattern Anal. Mach. Intell., 29(3), 411–426. Member-Poggio, Tomaso.
34 | 
35 | .. [Vincent08] P. Vincent, H. Larochelle Y. Bengio and P.A. Manzagol, `Extracting and Composing Robust Features with Denoising Autoencoders <http://www.iro.umontreal.ca/~lisa/publications2/index.php/publications/show/217>`_, Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML'08), pages 1096 - 1103, ACM, 2008.
36 | 
37 | .. [Tieleman08] T. Tieleman, Training restricted boltzmann machines using approximations to the likelihood gradient, ICML 2008.
38 | 
39 | .. [Xavier10] Y. Bengio, X. Glorot, Understanding the difficulty of training deep feedforward neuralnetworks, AISTATS 2010
40 | 


--------------------------------------------------------------------------------
/doc/rnnrbm.txt:
--------------------------------------------------------------------------------
  1 | .. _rnnrbm:
  2 | 
  3 | Modeling and generating sequences of polyphonic music with the RNN-RBM
  4 | ========================================================================
  5 | 
  6 | .. note::
  7 |   This tutorial demonstrates a basic implementation of the RNN-RBM as described in [BoulangerLewandowski12]_
  8 |   (`pdf <http://www-etud.iro.umontreal.ca/~boulanni/ICML2012.pdf>`_).
  9 |   We assume the reader is familiar with
 10 |   `recurrent neural networks using the scan op <http://deeplearning.net/software/theano/library/scan.html>`_
 11 |   and `restricted Boltzmann machines (RBM) <rbm.html>`_.
 12 | 
 13 | .. note::
 14 |   The code for this section is available for download here: `rnnrbm.py <code/rnnrbm.py>`_.
 15 | 
 16 |   You will need the modified `Python MIDI package (GPL license) <http://www.iro.umontreal.ca/~lisa/deep/midi.zip>`_ in your ``$PYTHONPATH`` or in the working directory in order to convert MIDI files to and from piano-rolls.
 17 |   The script also assumes that the content of the `Nottingham Database of folk tunes <http://www.iro.umontreal.ca/~lisa/deep/data/Nottingham.zip>`_ has been extracted in the ``../data`` directory.
 18 |   Alternative MIDI datasets are available `here <http://www-etud.iro.umontreal.ca/~boulanni/icml2012>`_.
 19 |   
 20 |   Note that both dependencies above can be setup automatically by running the ``download.sh`` script in the ``../data`` directory.
 21 | 
 22 | .. caution::
 23 |   Need Theano 0.6 or more recent.
 24 | 
 25 | 
 26 | The RNN-RBM
 27 | +++++++++++++++++++++++++
 28 | 
 29 | The RNN-RBM is an energy-based model for density estimation of temporal sequences, where the feature vector :math:`v^{(t)}` at time step :math:`t` may be high-dimensional.
 30 | It allows to describe multimodal conditional distributions of :math:`v^{(t)}|\mathcal A^{(t)}`, where :math:`\mathcal A^{(t)}\equiv \{v_\tau|\tau<t\}` denotes the *sequence history* at time :math:`t`, via a series of conditional RBMs (one a each time step) whose parameters :math:`b_v^{(t)},b_h^{(t)}` depend on the output of a deterministic RNN with hidden units :math:`u^{(t)}`:
 31 | 
 32 | .. math::
 33 |   :label: bv_t
 34 | 
 35 |   b_v^{(t)} = b_v + W_{uv} u^{(t-1)}
 36 | 
 37 | .. math::
 38 |   :label: bh_t
 39 | 
 40 |   b_h^{(t)} = b_h + W_{uh} u^{(t-1)}
 41 | 
 42 | and the single-layer RNN recurrence relation is defined by:
 43 | 
 44 | .. math::
 45 |   :label: u_t
 46 | 
 47 |   u^{(t)} = \tanh (b_u + W_{uu} u^{(t-1)} + W_{vu} v^{(t)})
 48 | 
 49 | The resulting model is unrolled in time in the following figure:
 50 | 
 51 | .. image:: images/rnnrbm.png
 52 |     :align: center
 53 | 
 54 | The overall probability distribution is given by the sum over the :math:`T` time steps in a given sequence:
 55 | 
 56 | .. math::
 57 |   :label: prob_rnnrbm
 58 | 
 59 |   P(\{v^{(t)}\}) = \sum_{t=1}^T P(v^{(t)} | \mathcal A^{(t)})
 60 | 
 61 | where the right-hand side multiplicand is the marginalized probability of the :math:`t^\mathrm{th}` RBM.
 62 | 
 63 | Note that for clarity of the implementation, contrarily to [BoulangerLewandowski12]_, we use the obvious naming convention for weight matrices and we use :math:`u^{(t)}` instead of :math:`\hat h^{(t)}` for the recurrent hidden units.
 64 | 
 65 | 
 66 | 
 67 | Implementation
 68 | ++++++++++++++
 69 | 
 70 | We wish to construct two Theano functions: one to train the RNN-RBM, and one to generate sample sequences from it.
 71 | 
 72 | For *training*, i.e. given :math:`\{v^{(t)}\}`, the RNN hidden state :math:`\{u^{(t)}\}` and the associated :math:`\{b_v^{(t)}, b_h^{(t)}\}` parameters are deterministic and can be readily computed for each training sequence.
 73 | A stochastic gradient descent (SGD) update on the parameters can then be estimated via contrastive divergence (CD) on the individual time steps of a sequence in the same way that individual training examples are treated in a mini-batch for regular RBMs.
 74 | 
 75 | *Sequence generation* is similar except that the :math:`v^{(t)}` must be sampled sequentially at each time step with a separate (non-batch) Gibbs chain before being passed down to the recurrence and the sequence history.
 76 | 
 77 | 
 78 | The RBM layer
 79 | ---------------
 80 | 
 81 | The ``build_rbm`` function shown below builds a Gibbs chain from an input mini-batch (a binary matrix) via the CD approximation.
 82 | Note that it also supports a single frame (a binary vector) in the non-batch case.
 83 | 
 84 | .. literalinclude:: ../code/rnnrbm.py
 85 |   :pyobject: build_rbm
 86 | 
 87 | The RNN layer
 88 | ---------------
 89 | 
 90 | The ``build_rnnrbm`` function defines the RNN recurrence relation to obtain the RBM parameters; the recurrence function is flexible enough to serve both in the training scenario where :math:`v^{(t)}` is given and the "batch" RBM is constructed at the end on the whole sequence at once, and in the generation scenario where :math:`v^{(t)}` is sampled separately at each time step using the Gibbs chain defined above.
 91 | 
 92 | 
 93 | .. literalinclude:: ../code/rnnrbm.py
 94 |   :pyobject: build_rnnrbm
 95 | 
 96 | Putting it all together
 97 | ---------------------------
 98 | 
 99 | We now have all the necessary ingredients to start training our network on real symbolic sequences of polyphonic music.
100 | 
101 | .. literalinclude:: ../code/rnnrbm.py
102 |   :pyobject: RnnRbm
103 | 
104 | Results
105 | ++++++++
106 | 
107 | We ran the code on the Nottingham database for 200 epochs; training took approximately 24 hours.
108 | 
109 | The output was the following:
110 | 
111 | .. code-block:: text
112 | 
113 |   Epoch 1/200 -15.0308940028
114 |   Epoch 2/200 -10.4892606673
115 |   Epoch 3/200 -10.2394696138
116 |   Epoch 4/200 -10.1431669994
117 |   Epoch 5/200 -9.7005382843
118 |   Epoch 6/200 -8.5985647524
119 |   Epoch 7/200 -8.35115428534
120 |   Epoch 8/200 -8.26453580552
121 |   Epoch 9/200 -8.21208991542
122 |   Epoch 10/200 -8.16847274143
123 | 
124 |   ... truncated for brevity ...
125 | 
126 |   Epoch 190/200 -4.74799179994
127 |   Epoch 191/200 -4.73488515216
128 |   Epoch 192/200 -4.7326138489
129 |   Epoch 193/200 -4.73841636884
130 |   Epoch 194/200 -4.70255511452
131 |   Epoch 195/200 -4.71872634914
132 |   Epoch 196/200 -4.7276415885
133 |   Epoch 197/200 -4.73497644728
134 |   Epoch 198/200 -inf
135 |   Epoch 199/200 -4.75554987143
136 |   Epoch 200/200 -4.72591935412
137 | 
138 | 
139 | 
140 | The figures below show the piano-rolls of two sample sequences and we provide the corresponding MIDI files:
141 | 
142 | .. figure:: images/sample1.png
143 |   :scale: 60%
144 | 
145 |   Listen to `sample1.mid <http://www-etud.iro.umontreal.ca/~boulanni/sample1.mid>`_
146 | 
147 | .. figure:: images/sample2.png
148 |   :scale: 60%
149 | 
150 |   Listen to `sample2.mid <http://www-etud.iro.umontreal.ca/~boulanni/sample2.mid>`_
151 | 
152 | 
153 | How to improve this code
154 | +++++++++++++++++++++++++
155 | 
156 | The code shown in this tutorial is a stripped-down version that can be improved in the following ways:
157 | 
158 | * Preprocessing: transposing the sequences in a common tonality (e.g. C major / minor) and normalizing the tempo in beats (quarternotes) per minute can have the most effect on the generative quality of the model.
159 | * Pretraining techniques: initialize the :math:`W,b_v,b_h` parameters with independent RBMs with fully shuffled frames (i.e. :math:`W_{uh}=W_{uv}=W_{uu}=W_{vu}=0`); initialize the :math:`W_{uv},W_{uu},W_{vu},b_u` parameters of the RNN with the auxiliary cross-entropy objective via either SGD or, preferably, Hessian-free optimization [BoulangerLewandowski12]_.
160 | * Optimization techniques: gradient clipping, Nesterov momentum and the use of NADE for conditional density estimation.
161 | * Hyperparameter search: learning rate (separately for the RBM and RNN parts), learning rate schedules, batch size, number of hidden units (recurrent and RBM), momentum coefficient, momentum schedule, Gibbs chain length :math:`k` and early stopping.
162 | * Learn the initial condition :math:`u^{(0)}` as a model parameter.
163 | 
164 | 
165 | A few samples generated with code including these features are available here: `sequences.zip <http://www-etud.iro.umontreal.ca/~boulanni/sequences.zip>`_.
166 | 
167 | 


--------------------------------------------------------------------------------
/doc/scripts/docgen.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | import sys
 3 | import os
 4 | import shutil
 5 | 
 6 | import getopt
 7 | from collections import defaultdict
 8 | 
 9 | if __name__ == '__main__':
10 | 
11 |     throot = "/".join(sys.path[0].split("/")[:-2])
12 | 
13 |     options = defaultdict(bool)
14 |     output_arg = getopt.getopt(sys.argv[1:], 'o:', ['rst', 'help', 'nopdf'])[0]
15 |     options.update(dict([x, y or True] for x, y in output_arg))
16 |     if options['--help']:
17 |         print('Usage: %s [OPTIONS]' % sys.argv[0])
18 |         print('  -o <dir>: output the html files in the specified dir')
19 |         print('  --rst: only compile the doc (requires sphinx)')
20 |         print('  --nopdf: do not produce a PDF file from the doc, only HTML')
21 |         print('  --help: this help')
22 |         sys.exit(0)
23 | 
24 |     options['--all'] = not bool(options['--rst'])
25 | 
26 |     def mkdir(path):
27 |         try:
28 |             os.mkdir(path)
29 |         except OSError:
30 |             pass
31 | 
32 |     outdir = options['-o'] or (throot + '/html')
33 |     mkdir(outdir)
34 |     os.chdir(outdir)
35 |     mkdir("doc")
36 | 
37 |     # Make sure the appropriate 'deeplearning' directory is in the PYTHONPATH
38 |     pythonpath = os.environ.get('PYTHONPATH', '')
39 |     pythonpath = throot + ':' + pythonpath
40 |     os.environ['PYTHONPATH'] = pythonpath
41 | 
42 |     if options['--all'] or options['--rst']:
43 |         import sphinx
44 |         sys.path[0:0] = [os.path.join(throot, 'doc')]
45 |         sphinx.main(['', '-E', os.path.join(throot, 'doc'), '.'])
46 | 
47 |         if not options['--nopdf']:
48 |             # Generate latex file in a temp directory
49 |             import tempfile
50 |             workdir = tempfile.mkdtemp()
51 |             sphinx.main(['', '-E', '-b', 'latex',
52 |                          os.path.join(throot, 'doc'), workdir])
53 |             # Compile to PDF
54 |             os.chdir(workdir)
55 |             os.system('make')
56 |             try:
57 |                 shutil.copy(os.path.join(workdir, 'deeplearning.pdf'), outdir)
58 |                 os.chdir(outdir)
59 |                 shutil.rmtree(workdir)
60 |             except OSError as e:
61 |                 print('OSError:', e)
62 |             except IOError as e:
63 |                 print('IOError:', e)
64 | 


--------------------------------------------------------------------------------
/doc/utilities.txt:
--------------------------------------------------------------------------------
  1 | =============
  2 | Miscellaneous
  3 | =============
  4 | 
  5 | .. _how-to-plot:
  6 | 
  7 | Plotting Samples and Filters
  8 | ++++++++++++++++++++++++++++
  9 | 
 10 | .. note::
 11 |     The code for this section is available for download `here`_.
 12 | 
 13 | .. _here: http://deeplearning.net/tutorial/code/utils.py
 14 | 
 15 | 
 16 | To plot a sample, what we need to do is to take the visible units, which
 17 | are a flattened image (there is no 2D structure to the visible units,
 18 | just a 1D string of unit activations) and reshape it into a 2D image. The order in
 19 | which the points from the 1D array go into the 2D image is given by the
 20 | order in which the inital MNIST images where converted into a 1D array.
 21 | Lucky for us this is just a call of the ``numpy.reshape`` function.
 22 | 
 23 | Plotting the weights is a bit more tricky. We have ``n_hidden`` hidden
 24 | units, each of them corresponding to a column of the weight matrix. A
 25 | column has the same shape as the visible, where the weight corresponding
 26 | to the connection with visible unit `j` is at position `j`. Therefore,
 27 | if we reshape every such column, using ``numpy.reshape``, we get a
 28 | filter image that tells us how this hidden unit is influenced by
 29 | the input image.
 30 | 
 31 | We need a utility function that takes a minibatch, or the weight matrix,
 32 | and converts each row ( for the weight matrix we do a transpose ) into a
 33 | 2D image and then tile these images together.  Once we converted the
 34 | minibatch or the weights in this image of tiles, we can use PIL to plot
 35 | and save. `PIL <http://www.pythonware.com/products/pil/>`_ is a standard
 36 | python libarary to deal with images.
 37 | 
 38 | Tiling minibatches together is done for us by the
 39 | ``tile_raster_image`` function which we provide here.
 40 | 
 41 | .. code-block:: python
 42 | 
 43 | 
 44 |   def scale_to_unit_interval(ndar, eps=1e-8):
 45 |     """ Scales all values in the ndarray ndar to be between 0 and 1 """
 46 |     ndar = ndar.copy()
 47 |     ndar -= ndar.min()
 48 |     ndar *= 1.0 / (ndar.max() + eps)
 49 |     return ndar
 50 | 
 51 | 
 52 |   def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0),
 53 |                          scale_rows_to_unit_interval=True,
 54 |                          output_pixel_vals=True):
 55 |     """
 56 |     Transform an array with one flattened image per row, into an array in
 57 |     which images are reshaped and layed out like tiles on a floor.
 58 | 
 59 |     This function is useful for visualizing datasets whose rows are images,
 60 |     and also columns of matrices for transforming those rows
 61 |     (such as the first layer of a neural net).
 62 | 
 63 |     :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can
 64 |     be 2-D ndarrays or None;
 65 |     :param X: a 2-D array in which every row is a flattened image.
 66 | 
 67 |     :type img_shape: tuple; (height, width)
 68 |     :param img_shape: the original shape of each image
 69 | 
 70 |     :type tile_shape: tuple; (rows, cols)
 71 |     :param tile_shape: the number of images to tile (rows, cols)
 72 | 
 73 |     :param output_pixel_vals: if output should be pixel values (i.e. int8
 74 |     values) or floats
 75 | 
 76 |     :param scale_rows_to_unit_interval: if the values need to be scaled before
 77 |     being plotted to [0,1] or not
 78 | 
 79 | 
 80 |     :returns: array suitable for viewing as an image.
 81 |     (See:`Image.fromarray`.)
 82 |     :rtype: a 2-d array with same dtype as X.
 83 | 
 84 |     """
 85 | 
 86 |     assert len(img_shape) == 2
 87 |     assert len(tile_shape) == 2
 88 |     assert len(tile_spacing) == 2
 89 | 
 90 |     # The expression below can be re-written in a more C style as
 91 |     # follows :
 92 |     #
 93 |     # out_shape = [0,0]
 94 |     # out_shape[0] = (img_shape[0] + tile_spacing[0]) * tile_shape[0] -
 95 |     #                tile_spacing[0]
 96 |     # out_shape[1] = (img_shape[1] + tile_spacing[1]) * tile_shape[1] -
 97 |     #                tile_spacing[1]
 98 |     out_shape = [(ishp + tsp) * tshp - tsp for ishp, tshp, tsp
 99 |                         in zip(img_shape, tile_shape, tile_spacing)]
100 | 
101 |     if isinstance(X, tuple):
102 |         assert len(X) == 4
103 |         # Create an output numpy ndarray to store the image
104 |         if output_pixel_vals:
105 |             out_array = numpy.zeros((out_shape[0], out_shape[1], 4), dtype='uint8')
106 |         else:
107 |             out_array = numpy.zeros((out_shape[0], out_shape[1], 4), dtype=X.dtype)
108 | 
109 |         #colors default to 0, alpha defaults to 1 (opaque)
110 |         if output_pixel_vals:
111 |             channel_defaults = [0, 0, 0, 255]
112 |         else:
113 |             channel_defaults = [0., 0., 0., 1.]
114 | 
115 |         for i in xrange(4):
116 |             if X[i] is None:
117 |                 # if channel is None, fill it with zeros of the correct
118 |                 # dtype
119 |                 out_array[:, :, i] = numpy.zeros(out_shape,
120 |                         dtype='uint8' if output_pixel_vals else out_array.dtype
121 |                         ) + channel_defaults[i]
122 |             else:
123 |                 # use a recurrent call to compute the channel and store it
124 |                 # in the output
125 |                 out_array[:, :, i] = tile_raster_images(X[i], img_shape, tile_shape, tile_spacing, scale_rows_to_unit_interval, output_pixel_vals)
126 |         return out_array
127 | 
128 |     else:
129 |         # if we are dealing with only one channel
130 |         H, W = img_shape
131 |         Hs, Ws = tile_spacing
132 | 
133 |         # generate a matrix to store the output
134 |         out_array = numpy.zeros(out_shape, dtype='uint8' if output_pixel_vals else X.dtype)
135 | 
136 | 
137 |         for tile_row in xrange(tile_shape[0]):
138 |             for tile_col in xrange(tile_shape[1]):
139 |                 if tile_row * tile_shape[1] + tile_col < X.shape[0]:
140 |                     if scale_rows_to_unit_interval:
141 |                         # if we should scale values to be between 0 and 1
142 |                         # do this by calling the `scale_to_unit_interval`
143 |                         # function
144 |                         this_img = scale_to_unit_interval(X[tile_row * tile_shape[1] + tile_col].reshape(img_shape))
145 |                     else:
146 |                         this_img = X[tile_row * tile_shape[1] + tile_col].reshape(img_shape)
147 |                     # add the slice to the corresponding position in the
148 |                     # output array
149 |                     out_array[
150 |                         tile_row * (H+Hs): tile_row * (H + Hs) + H,
151 |                         tile_col * (W+Ws): tile_col * (W + Ws) + W
152 |                         ] \
153 |                         = this_img * (255 if output_pixel_vals else 1)
154 |         return out_array
155 | 


--------------------------------------------------------------------------------
/issues_closed/2_RBM_cost_fn.txt:
--------------------------------------------------------------------------------
1 | Reported by : Razvan
2 | 
3 | Cost function (delta of free energy) has a reversed sign (i.e. free_energy(positive) - free_energy(negative) ). I'm not sure
4 | where the minus pops in .. but is confusing when going from theory to code. 
5 | 
6 | 
7 | FIXED 
8 | 


--------------------------------------------------------------------------------
/issues_open/1_SdA_performance.txt:
--------------------------------------------------------------------------------
 1 | Reported by : Razvan
 2 | 
 3 | Best performance for SdA float64 CPU : 1.23%
 4 |                          float32 CPU : 1.30%
 5 | target : 1.10%
 6 | 
 7 | Possible reasons:
 8 |     - bug !? 
 9 |     - random seed / weights initialization / finetuning early stopping parameters
10 | 


--------------------------------------------------------------------------------
/issues_open/3_RBM_scan_GPU.txt:
--------------------------------------------------------------------------------
1 | Reported by : Razvan
2 | 
3 | Scan is not GPU ready.. making RBM tutorial slow on GPU (not tested yet).
4 | Quick fix is a optimization that removes scan if you're doing CD-1.
5 | 


--------------------------------------------------------------------------------
/issues_open/4_RBM_scan.txt:
--------------------------------------------------------------------------------
 1 | Reported by : Razvan
 2 | 
 3 | The bug can be reproduced if you do : 
 4 |  z = scan(..)
 5 |  c = f(z[-1])
 6 |  gp = T.grad(c, p, consider_constant = [ z[-1] ] )
 7 | 
 8 | In this case grad will not consider z[-1] constant. Workaround: 
 9 | 
10 |  z = scan(..)
11 |  z_1 = z[-1]
12 |  c = f(z_1)
13 |  gp = T.grad(c,p, consider_constant = [z_1])
14 | 
15 |  Note : I need to make sure this actually happens .. it might have been an
16 |  artifact of something else when I first got this. 
17 | 


--------------------------------------------------------------------------------
/issues_open/5_results.txt:
--------------------------------------------------------------------------------
1 | Reported by : Razvan
2 | 
3 | We should produce results + time for CPU float32 / CPU float64 / GPU . We should also 
4 | specify the batchsize (or number of updates) pointing out that you can't always just 
5 | compare the number of epochs. 
6 | 


--------------------------------------------------------------------------------
/issues_open/6_benchmarking_pybrain.txt:
--------------------------------------------------------------------------------
  1 | Reported by : Razvan
  2 | 
  3 | Observations : 
  4 | 
  5 |     1.  First thing, working with their dataset model is a pain ! Either I had 
  6 |         not figure it out, or it allows you to add only one datapoint at a time 
  7 |         in the dataset. This seems to me highly unoptimal ...
  8 | 
  9 |     2.  You do not get batches for sgd ! The only thing you can do is compare with 
 10 |         batch size of 1.
 11 | 
 12 |     3.  Their early stopping is different from ours. Differences : 
 13 |             - You can not set how often you do a pass on the validation set 
 14 |               (i.e. ``patience`` in our case).  You always do one epoch of training 
 15 |               and then you go through the validation set.
 16 |             - You do not have an improvement thereshold, any improvement in
 17 |               validation score leads to storing the new best parameters, and
 18 |               increasing the time you will still look for better parameters
 19 |             - The increase is not by multiplication but summation. So if at
 20 |               epoch x you do better on the validation step, you will go on for 
 21 |               x+y epochs to look for something better ( we do x*y )
 22 | 
 23 |     4.  The errors return by pyBrain are divided by the number of
 24 |         classes. So if you do classification, you take the number of
 25 |         errors and divide it by the number of test examples times the
 26 |         number of classes. For MNIST this yields 10 times smaller
 27 |         errors. Is this something standard .. should we do it ? It
 28 |         definetelly makes error look smaller.
 29 | 
 30 |     5.  There is no straight forward way of adding L1/L2 regularization (from
 31 |         what I've seen), unless you go into their code and change it. That is not
 32 |         ard to do .. but for now I do not want to meangle with the library
 33 | 
 34 |     6.  The code for RBM is not ready (they say that it is work in progress). It seems to me that the 
 35 |         code is wrong .. They have 3 loops, which to me would mean that the inner most is for CD-k (
 36 |         second is for one epoch / third for training). But they update the weights after each Gibbs 
 37 |         step in CD-k .. which results in a strage form of CD-1 that sees same example several time before
 38 |         moving to the next one. I could (?) potentially fix the code but it is outside the scope of 
 39 |         benchmarking. 
 40 | 
 41 |     7.  There are question marks of how easy it would be to implement a SdA ( autoassociators might be 
 42 |         easy to do though).
 43 | 
 44 | 
 45 |     RESULTS : 
 46 |     logistic_sgd on maggie46
 47 | 
 48 | Total error: 0.015611011103
 49 | Total error: 0.00966772673335
 50 | Total error: 0.00860664508883
 51 | Time spend per epoch: 43.32
 52 | Final error is : 10.44
 53 | Time spend per epoch: 43.32
 54 | Final error is : 10.44
 55 | 
 56 |     Arac : 
 57 | 
 58 | Total error: 0.0366924968888
 59 | Total error: 0.0366576944937
 60 | Total error: 0.0367442383338
 61 | Time spend per epoch: 24.71
 62 | Final error is : 69.28
 63 | Time spend per epoch: 24.71
 64 | Final error is : 69.28
 65 | 
 66 | 
 67 |     ** Our thing with batchsize =1 **
 68 | 
 69 | test error of best model  8.45
 70 | time : 12.99
 71 | 12.01
 72 | 
 73 | 
 74 | 
 75 | 
 76 |     Results : 
 77 |     mlp on maggie46
 78 | 
 79 | 
 80 |     pybrain :: 
 81 | 
 82 | Total error: 0.0124744609817
 83 | Total error: 0.00722484141084
 84 | Total error: 0.00599591269763
 85 | Time spend per epoch : 1226.69
 86 | Final error is : 8.68
 87 | Time spend per epoch: 1226.69
 88 | Final error is : 8.68
 89 | 
 90 | 20.4448 min
 91 | 
 92 |     arac::
 93 | 
 94 | Total error: 0.0318599056504
 95 | Total error: 0.0316029246672
 96 | Total error: 0.0315542295953
 97 | Time spend per epoch: 860.336666667 (s)
 98 | Final error is : 58.59
 99 | 
100 |     our thing::
101 | 
102 | test error of best model  3.88
103 | time: 381.92
104 | 
105 | 


--------------------------------------------------------------------------------
/misc/do_nightly_build:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #we set the compiledir to the /Tmp dir to make the test faster by bypassing the nfs network.
 3 | date
 4 | ROOT_CWD=/Tmp/nightly_build
 5 | COMPILEDIR=/Tmp/lisa_theano_compile_dir_deeplearning
 6 | NOSETESTS=${ROOT_CWD}/Theano/bin/theano-nose
 7 | 
 8 | FLAGS=warn.ignore_bug_before=0.5,compiledir=${COMPILEDIR}
 9 | export PYTHONPATH=${ROOT_CWD}/Theano:${ROOT_CWD}/Pylearn:$PYTHONPATH
10 | 
11 | cd ${ROOT_CWD}/DeepLearningTutorials/data
12 | ./download.sh
13 | 
14 | cd ${ROOT_CWD}/Theano
15 | echo "git version for Theano:" `git rev-parse HEAD`
16 | cd ${ROOT_CWD}/DeepLearningTutorials/code
17 | echo "git version:" `git rev-parse HEAD`
18 | 
19 | #echo "executing nosetests with mode=FAST_COMPILE"
20 | #THEANO_FLAGS=${FLAGS},mode=FAST_COMPILE ${NOSETESTS}
21 | echo "executing nosetests speed with mode=FAST_RUN"
22 | THEANO_FLAGS=${FLAGS},mode=FAST_RUN ${NOSETESTS} test.py:speed
23 | #echo "executing nosetests speed with mode=FAST_RUN and OMP_NUM_THREADS=2"
24 | #OMP_NUM_THREADS=2 THEANO_FLAGS=${FLAGS},mode=FAST_RUN ${NOSETESTS} test.py:speed
25 | echo "executing nosetests with mode=FAST_RUN,floatX=float32"
26 | THEANO_FLAGS=${FLAGS},mode=FAST_RUN,floatX=float32 ${NOSETESTS}
27 | 
28 | #we change the seed and record it everyday to test different combination. We record it to be able to reproduce bug caused by different seed. We don't want multiple test in DEBUG_MODE each day as this take too long.
29 | #seed=$RANDOM
30 | #echo "executing nosetests with mode=DEBUG_MODE with seed of the day $seed"
31 | #THEANO_DEBUGMODE_CHECK_STRIDES=0 THEANO_DEBUGMODE_PATIENCE=3 THEANO_COMPILEDIR=/Tmp/lisa_theano_compile_dir_deeplearning THEANO_UNITTEST_SEED=$seed THEANO_DEFAULT_MODE=DEBUG_MODE ${NOSETESTS}
32 | 
33 | 


--------------------------------------------------------------------------------