├── code
│   ├── hmc
│   │   ├── __init__.py
│   │   ├── test_hmc.py
│   │   └── hmc.py
│   ├── utils.py
│   ├── test.py
│   ├── logistic_cg.py
│   ├── rnnrbm.py
│   ├── cA.py
│   ├── convolutional_mlp.py
│   ├── mlp.py
│   └── dA.py
├── .hgignore
├── doc
│   ├── Makefile
│   ├── images
│   │   ├── bm.png
│   │   ├── DBN3.png
│   │   ├── mlp.png
│   │   ├── rbm.png
│   │   ├── mnist_0.png
│   │   ├── mnist_1.png
│   │   ├── mnist_2.png
│   │   ├── mnist_3.png
│   │   ├── mnist_4.png
│   │   ├── mnist_5.png
│   │   ├── mylenet.png
│   │   ├── rnnrbm.png
│   │   ├── sample1.png
│   │   ├── sample2.png
│   │   ├── samples.png
│   │   ├── 3wolfmoon.jpg
│   │   ├── conv_1D_nn.png
│   │   ├── markov_chain.png
│   │   ├── sparse_1D_nn.png
│   │   ├── cnn_explained.png
│   │   ├── 3wolfmoon_output.png
│   │   ├── filters_at_epoch_14.png
│   │   ├── filters_corruption_0.png
│   │   └── filters_corruption_30.png
│   ├── contents.txt
│   ├── .templates
│   │   └── layout.html
│   ├── LICENSE.txt
│   ├── scripts
│   │   └── docgen.py
│   ├── references.txt
│   ├── intro.txt
│   ├── deep.txt
│   ├── utilities.txt
│   ├── conf.py
│   ├── rnnrbm.txt
│   ├── SdA.txt
│   ├── logreg.txt
│   ├── DBN.txt
│   └── mlp.txt
├── data
│   ├── training_colorpatches_16x16_demo.mat
│   └── download.sh
├── .gitignore
├── issues_open
│   ├── 3_RBM_scan_GPU.txt
│   ├── 5_results.txt
│   ├── 1_SdA_performance.txt
│   ├── 4_RBM_scan.txt
│   └── 6_benchmarking_pybrain.txt
├── issues_closed
│   └── 2_RBM_cost_fn.txt
├── README.rst
├── misc
│   └── do_nightly_build
└── .travis.yml
/code/hmc/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.hgignore:
--------------------------------------------------------------------------------
1 | syntax: glob
2 | *.pyc
3 | *.png
4 | *~
5 |
--------------------------------------------------------------------------------
/doc/Makefile:
--------------------------------------------------------------------------------
1 | all:
2 | python scripts/docgen.py
3 |
--------------------------------------------------------------------------------
/doc/images/bm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/findmyway/DeepLearningTutorials/master/doc/images/bm.png
--------------------------------------------------------------------------------
/doc/images/DBN3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/findmyway/DeepLearningTutorials/master/doc/images/DBN3.png
--------------------------------------------------------------------------------
/doc/images/mlp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/findmyway/DeepLearningTutorials/master/doc/images/mlp.png
--------------------------------------------------------------------------------
/doc/images/rbm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/findmyway/DeepLearningTutorials/master/doc/images/rbm.png
--------------------------------------------------------------------------------
/doc/images/mnist_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/findmyway/DeepLearningTutorials/master/doc/images/mnist_0.png
--------------------------------------------------------------------------------
/doc/images/mnist_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/findmyway/DeepLearningTutorials/master/doc/images/mnist_1.png
--------------------------------------------------------------------------------
/doc/images/mnist_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/findmyway/DeepLearningTutorials/master/doc/images/mnist_2.png
--------------------------------------------------------------------------------
/doc/images/mnist_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/findmyway/DeepLearningTutorials/master/doc/images/mnist_3.png
--------------------------------------------------------------------------------
/doc/images/mnist_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/findmyway/DeepLearningTutorials/master/doc/images/mnist_4.png
--------------------------------------------------------------------------------
/doc/images/mnist_5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/findmyway/DeepLearningTutorials/master/doc/images/mnist_5.png
--------------------------------------------------------------------------------
/doc/images/mylenet.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/findmyway/DeepLearningTutorials/master/doc/images/mylenet.png
--------------------------------------------------------------------------------
/doc/images/rnnrbm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/findmyway/DeepLearningTutorials/master/doc/images/rnnrbm.png
--------------------------------------------------------------------------------
/doc/images/sample1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/findmyway/DeepLearningTutorials/master/doc/images/sample1.png
--------------------------------------------------------------------------------
/doc/images/sample2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/findmyway/DeepLearningTutorials/master/doc/images/sample2.png
--------------------------------------------------------------------------------
/doc/images/samples.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/findmyway/DeepLearningTutorials/master/doc/images/samples.png
--------------------------------------------------------------------------------
/doc/images/3wolfmoon.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/findmyway/DeepLearningTutorials/master/doc/images/3wolfmoon.jpg
--------------------------------------------------------------------------------
/doc/images/conv_1D_nn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/findmyway/DeepLearningTutorials/master/doc/images/conv_1D_nn.png
--------------------------------------------------------------------------------
/doc/images/markov_chain.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/findmyway/DeepLearningTutorials/master/doc/images/markov_chain.png
--------------------------------------------------------------------------------
/doc/images/sparse_1D_nn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/findmyway/DeepLearningTutorials/master/doc/images/sparse_1D_nn.png
--------------------------------------------------------------------------------
/doc/images/cnn_explained.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/findmyway/DeepLearningTutorials/master/doc/images/cnn_explained.png
--------------------------------------------------------------------------------
/doc/images/3wolfmoon_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/findmyway/DeepLearningTutorials/master/doc/images/3wolfmoon_output.png
--------------------------------------------------------------------------------
/doc/images/filters_at_epoch_14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/findmyway/DeepLearningTutorials/master/doc/images/filters_at_epoch_14.png
--------------------------------------------------------------------------------
/doc/images/filters_corruption_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/findmyway/DeepLearningTutorials/master/doc/images/filters_corruption_0.png
--------------------------------------------------------------------------------
/doc/images/filters_corruption_30.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/findmyway/DeepLearningTutorials/master/doc/images/filters_corruption_30.png
--------------------------------------------------------------------------------
/data/training_colorpatches_16x16_demo.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/findmyway/DeepLearningTutorials/master/data/training_colorpatches_16x16_demo.mat
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | code/*.pyc
2 | code/tmp*
3 | code/midi
4 | data/mnist.pkl.gz
5 | data/mnist_py3k.pkl.gz
6 | data/Nottingham.zip
7 | data/Nottingham
8 | data/midi.zip
9 | html
10 | *.pyc
11 | *~
12 | *.swp
13 |
--------------------------------------------------------------------------------
/issues_open/3_RBM_scan_GPU.txt:
--------------------------------------------------------------------------------
1 | Reported by : Razvan
2 |
3 | Scan is not GPU ready... making the RBM tutorial slow on the GPU (not tested yet).
4 | A quick fix is an optimization that removes scan if you're doing CD-1.
5 |
--------------------------------------------------------------------------------
/issues_open/5_results.txt:
--------------------------------------------------------------------------------
1 | Reported by : Razvan
2 |
3 | We should produce results + time for CPU float32 / CPU float64 / GPU. We should also
4 | specify the batch size (or number of updates), pointing out that you can't always just
5 | compare the number of epochs.
6 |
--------------------------------------------------------------------------------
/issues_closed/2_RBM_cost_fn.txt:
--------------------------------------------------------------------------------
1 | Reported by : Razvan
2 |
3 | Cost function (delta of free energy) has a reversed sign (i.e. free_energy(positive) - free_energy(negative)). I'm not sure
4 | where the minus pops in ... but it is confusing when going from theory to code.
5 |
6 |
7 | FIXED
8 |
--------------------------------------------------------------------------------
/issues_open/1_SdA_performance.txt:
--------------------------------------------------------------------------------
1 | Reported by : Razvan
2 |
3 | Best performance for SdA float64 CPU : 1.23%
4 | float32 CPU : 1.30%
5 | target : 1.10%
6 |
7 | Possible reasons:
8 | - bug !?
9 | - random seed / weights initialization / finetuning early stopping parameters
10 |
--------------------------------------------------------------------------------
/doc/contents.txt:
--------------------------------------------------------------------------------
1 |
2 | .. _contents:
3 |
4 | ========
5 | Contents
6 | ========
7 |
8 | .. toctree::
9 | :maxdepth: 2
10 |
11 | LICENSE
12 | intro
13 | gettingstarted
14 | logreg
15 | mlp
16 | lenet
17 | dA
18 | SdA
19 | rbm
20 | DBN
21 | hmc
22 | rnnrbm
23 | utilities
24 | references
25 |
--------------------------------------------------------------------------------
/issues_open/4_RBM_scan.txt:
--------------------------------------------------------------------------------
1 | Reported by : Razvan
2 |
3 | The bug can be reproduced if you do :
4 | z = scan(..)
5 | c = f(z[-1])
6 | gp = T.grad(c, p, consider_constant = [ z[-1] ] )
7 |
8 | In this case grad will not consider z[-1] constant. Workaround:
9 |
10 | z = scan(..)
11 | z_1 = z[-1]
12 | c = f(z_1)
13 | gp = T.grad(c,p, consider_constant = [z_1])
14 |
15 | Note : I need to make sure this actually happens .. it might have been an
16 | artifact of something else when I first got this.
17 |
--------------------------------------------------------------------------------
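
A minimal, hypothetical sketch of the workaround above. The toy scan and cost
are invented just to show the ``consider_constant`` pattern; this is not the
tutorial's RBM code::

    import numpy
    import theano
    import theano.tensor as T

    # a shared parameter and a toy scan whose last output depends on it
    p = theano.shared(numpy.asarray(0.5, dtype=theano.config.floatX), name='p')
    x0 = T.scalar('x0')
    z, _ = theano.scan(lambda prev: prev * p, outputs_info=x0, n_steps=3)

    # workaround: bind z[-1] to its own variable first, then pass that
    # variable to consider_constant
    z_1 = z[-1]
    c = p * z_1                                  # cost uses p directly and via z_1
    gp = T.grad(c, p, consider_constant=[z_1])   # z_1 is treated as a constant
    f = theano.function([x0], gp)
    print(f(2.0))                                # expected: z_1 = 2 * 0.5**3 = 0.25
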
/data/download.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | which wget >/dev/null 2>&1
4 | WGET=$?
5 | which curl >/dev/null 2>&1
6 | CURL=$?
7 | if [ "$WGET" -eq 0 ]; then
8 | DL_CMD="wget -c"
9 | elif [ "$CURL" -eq 0 ]; then
10 | DL_CMD="curl -C - -O"
11 | else
12 | echo "You need wget or curl installed to download"
13 | exit 1
14 | fi
15 |
16 | $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
17 | $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist_py3k.pkl.gz
18 | $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/data/Nottingham.zip && unzip -u Nottingham.zip
19 | $DL_CMD http://www.iro.umontreal.ca/~lisa/deep/midi.zip && unzip -u midi.zip -d ../code && echo "extracted Modified Python MIDI package (GPL)"
20 |
--------------------------------------------------------------------------------
/doc/.templates/layout.html:
--------------------------------------------------------------------------------
1 | {% extends "!layout.html" %}
2 |
3 | {%- block extrahead %}
4 | {{ super() }}
5 |
10 | {% endblock %}
11 |
12 | {% block footer %}
13 | {{ super() }}
14 |
23 | {% endblock %}
24 |
25 |
--------------------------------------------------------------------------------
/doc/LICENSE.txt:
--------------------------------------------------------------------------------
1 | .. _license:
2 |
3 | LICENSE
4 | =======
5 |
6 | Copyright (c) 2008--2013, Theano Development Team
7 | All rights reserved.
8 |
9 | Redistribution and use in source and binary forms, with or without
10 | modification, are permitted provided that the following conditions are met:
11 |
12 | * Redistributions of source code must retain the above copyright
13 | notice, this list of conditions and the following disclaimer.
14 | * Redistributions in binary form must reproduce the above copyright
15 | notice, this list of conditions and the following disclaimer in the
16 | documentation and/or other materials provided with the distribution.
17 | * Neither the name of Theano nor the names of its contributors may be
18 | used to endorse or promote products derived from this software without
19 | specific prior written permission.
20 |
21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
22 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
25 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
28 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | Deep Learning Tutorials
2 | =======================
3 |
4 | Deep Learning is a new area of Machine Learning research, which has been
5 | introduced with the objective of moving Machine Learning closer to one of its
6 | original goals: Artificial Intelligence. Deep Learning is about learning
7 | multiple levels of representation and abstraction that help to make sense of
8 | data such as images, sound, and text. The tutorials presented here will
9 | introduce you to some of the most important deep learning algorithms and will
10 | also show you how to run them using Theano. Theano is a Python library that
11 | makes writing deep learning models easy and gives the option of training them
12 | on a GPU.
13 |
14 | The easiest way to follow the tutorials is to `browse them online
15 | <http://deeplearning.net/tutorial/>`_.
16 |
17 | `Main development <https://github.com/lisa-lab/DeepLearningTutorials>`_
18 | of this project.
19 |
20 | .. image:: https://secure.travis-ci.org/lisa-lab/DeepLearningTutorials.png
21 | :target: http://travis-ci.org/lisa-lab/DeepLearningTutorials
22 |
23 | Project Layout
24 | --------------
25 |
26 | Subdirectories:
27 |
28 | - code - Python files corresponding to each tutorial
29 | - data - data and scripts to download data that is used by the tutorials
30 | - doc - restructured text used by Sphinx to build the tutorial website
31 | - html - built automatically by doc/Makefile, contains tutorial website
32 | - issues_closed - issue tracking
33 | - issues_open - issue tracking
34 | - misc - administrative scripts
35 |
36 |
37 | Build instructions
38 | ------------------
39 |
40 | To build the HTML version of the tutorials, install Sphinx and run the doc/Makefile.
41 |
--------------------------------------------------------------------------------
/misc/do_nightly_build:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #we set the compiledir to the /Tmp dir to make the test faster by bypassing the nfs network.
3 | date
4 | ROOT_CWD=/Tmp/nightly_build
5 | COMPILEDIR=/Tmp/lisa_theano_compile_dir_deeplearning
6 | NOSETESTS=${ROOT_CWD}/Theano/bin/theano-nose
7 |
8 | FLAGS=warn.ignore_bug_before=0.5,compiledir=${COMPILEDIR}
9 | export PYTHONPATH=${ROOT_CWD}/Theano:${ROOT_CWD}/Pylearn:$PYTHONPATH
10 |
11 | cd ${ROOT_CWD}/DeepLearningTutorials/data
12 | ./download.sh
13 |
14 | cd ${ROOT_CWD}/Theano
15 | echo "git version for Theano:" `git rev-parse HEAD`
16 | cd ${ROOT_CWD}/DeepLearningTutorials/code
17 | echo "git version:" `git rev-parse HEAD`
18 |
19 | #echo "executing nosetests with mode=FAST_COMPILE"
20 | #THEANO_FLAGS=${FLAGS},mode=FAST_COMPILE ${NOSETESTS}
21 | echo "executing nosetests speed with mode=FAST_RUN"
22 | THEANO_FLAGS=${FLAGS},mode=FAST_RUN ${NOSETESTS} test.py:speed
23 | #echo "executing nosetests speed with mode=FAST_RUN and OMP_NUM_THREADS=2"
24 | #OMP_NUM_THREADS=2 THEANO_FLAGS=${FLAGS},mode=FAST_RUN ${NOSETESTS} test.py:speed
25 | echo "executing nosetests with mode=FAST_RUN,floatX=float32"
26 | THEANO_FLAGS=${FLAGS},mode=FAST_RUN,floatX=float32 ${NOSETESTS}
27 |
28 | #We change the seed and record it every day to test different combinations. We record it to be able to reproduce bugs caused by a given seed. We don't want multiple tests in DEBUG_MODE each day as this takes too long.
29 | #seed=$RANDOM
30 | #echo "executing nosetests with mode=DEBUG_MODE with seed of the day $seed"
31 | #THEANO_DEBUGMODE_CHECK_STRIDES=0 THEANO_DEBUGMODE_PATIENCE=3 THEANO_COMPILEDIR=/Tmp/lisa_theano_compile_dir_deeplearning THEANO_UNITTEST_SEED=$seed THEANO_DEFAULT_MODE=DEBUG_MODE ${NOSETESTS}
32 |
33 |
--------------------------------------------------------------------------------
/doc/scripts/docgen.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import sys
3 | import os
4 | import shutil
5 |
6 | import getopt
7 | from collections import defaultdict
8 |
9 | if __name__ == '__main__':
10 |
11 | throot = "/".join(sys.path[0].split("/")[:-2])
12 |
13 | options = defaultdict(bool)
14 | output_arg = getopt.getopt(sys.argv[1:], 'o:', ['rst', 'help', 'nopdf'])[0]
15 | options.update(dict([x, y or True] for x, y in output_arg))
16 | if options['--help']:
17 | print('Usage: %s [OPTIONS]' % sys.argv[0])
18 | print(' -o : output the html files in the specified dir')
19 | print(' --rst: only compile the doc (requires sphinx)')
20 | print(' --nopdf: do not produce a PDF file from the doc, only HTML')
21 | print(' --help: this help')
22 | sys.exit(0)
23 |
24 | options['--all'] = not bool(options['--rst'])
25 |
26 | def mkdir(path):
27 | try:
28 | os.mkdir(path)
29 | except OSError:
30 | pass
31 |
32 | outdir = options['-o'] or (throot + '/html')
33 | mkdir(outdir)
34 | os.chdir(outdir)
35 | mkdir("doc")
36 |
37 | # Make sure the appropriate 'deeplearning' directory is in the PYTHONPATH
38 | pythonpath = os.environ.get('PYTHONPATH', '')
39 | pythonpath = throot + ':' + pythonpath
40 | os.environ['PYTHONPATH'] = pythonpath
41 |
42 | if options['--all'] or options['--rst']:
43 | import sphinx
44 | sys.path[0:0] = [os.path.join(throot, 'doc')]
45 | sphinx.main(['', '-E', os.path.join(throot, 'doc'), '.'])
46 |
47 | if not options['--nopdf']:
48 | # Generate latex file in a temp directory
49 | import tempfile
50 | workdir = tempfile.mkdtemp()
51 | sphinx.main(['', '-E', '-b', 'latex',
52 | os.path.join(throot, 'doc'), workdir])
53 | # Compile to PDF
54 | os.chdir(workdir)
55 | os.system('make')
56 | try:
57 | shutil.copy(os.path.join(workdir, 'deeplearning.pdf'), outdir)
58 | os.chdir(outdir)
59 | shutil.rmtree(workdir)
60 | except OSError as e:
61 | print('OSError:', e)
62 | except IOError as e:
63 | print('IOError:', e)
64 |
--------------------------------------------------------------------------------
/code/hmc/test_hmc.py:
--------------------------------------------------------------------------------
1 | import numpy
2 | from scipy import linalg
3 | import theano
4 |
5 | from hmc import HMC_sampler
6 |
7 |
8 | def sampler_on_nd_gaussian(sampler_cls, burnin, n_samples, dim=10):
9 | batchsize = 3
10 |
11 | rng = numpy.random.RandomState(123)
12 |
13 | # Define a covariance and mu for a gaussian
14 | mu = numpy.array(rng.rand(dim) * 10, dtype=theano.config.floatX)
15 | cov = numpy.array(rng.rand(dim, dim), dtype=theano.config.floatX)
16 | cov = (cov + cov.T) / 2.
17 | cov[numpy.arange(dim), numpy.arange(dim)] = 1.0
18 | cov_inv = linalg.inv(cov)
19 |
20 | # Define energy function for a multi-variate Gaussian
21 | def gaussian_energy(x):
22 | return 0.5 * (theano.tensor.dot((x - mu), cov_inv) *
23 | (x - mu)).sum(axis=1)
24 |
25 | # Declared shared random variable for positions
26 | position = rng.randn(batchsize, dim).astype(theano.config.floatX)
27 | position = theano.shared(position)
28 |
29 | # Create HMC sampler
30 | sampler = sampler_cls(position, gaussian_energy,
31 | initial_stepsize=1e-3, stepsize_max=0.5)
32 |
33 | # Start with a burn-in process
34 | garbage = [sampler.draw() for r in xrange(burnin)]  # burn-in
35 | # Draw `n_samples`: the result is a 3D tensor of shape [n_samples,
36 | # batchsize, dim]
37 | _samples = numpy.asarray([sampler.draw() for r in xrange(n_samples)])
38 | # Flatten to [n_samples * batchsize, dim]
39 | samples = _samples.T.reshape(dim, -1).T
40 |
41 | print '****** TARGET VALUES ******'
42 | print 'target mean:', mu
43 | print 'target cov:\n', cov
44 |
45 | print '****** EMPIRICAL MEAN/COV USING HMC ******'
46 | print 'empirical mean: ', samples.mean(axis=0)
47 | print 'empirical_cov:\n', numpy.cov(samples.T)
48 |
49 | print '****** HMC INTERNALS ******'
50 | print 'final stepsize', sampler.stepsize.get_value()
51 | print 'final acceptance_rate', sampler.avg_acceptance_rate.get_value()
52 |
53 | return sampler
54 |
55 |
56 | def test_hmc():
57 | sampler = sampler_on_nd_gaussian(HMC_sampler.new_from_shared_positions,
58 | burnin=1000, n_samples=1000, dim=5)
59 | assert abs(sampler.avg_acceptance_rate.get_value() -
60 | sampler.target_acceptance_rate) < .1
61 | assert sampler.stepsize.get_value() >= sampler.stepsize_min
62 | assert sampler.stepsize.get_value() <= sampler.stepsize_max
63 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | # After changing this file, check it on:
2 | # http://lint.travis-ci.org/
3 |
4 | #We can't get scipy installed with the python language
5 | #So we will use the system python from the c language.
6 | language: c
7 | #language: python
8 | #python:
9 | # - "2.5"
10 | # - "2.7"
11 | # - "3.2"
12 | # command to install dependencies
13 | before_install:
14 | #zlib1g-dev is needed to allow PIL to uncompress the dataset.
15 | - sudo apt-get install -qq libatlas3gf-base libatlas-dev zlib1g-dev zip unzip zlibc libzip-dev libjpeg8 libjpeg62-dev libfreetype6 libfreetype6-dev python-numpy python-scipy python-pip python-nose python-yaml pyflakes python-imaging
16 |
17 | install:
18 | # - "pip install -q numpy --use-mirrors"
19 | # Use Pillow instead of PIL as it is better packaged
20 | # - "pip install -q Pillow --use-mirrors"
21 | #If we don't install numpy before SciPy 0.10.1, the SciPy installation fails.
22 | # - "pip install -q scipy --use-mirrors"
23 | - "sudo pip install --no-deps git+git://github.com/Theano/Theano.git"
24 |
25 | env:
26 | - PART="test.py:test_logistic_sgd test.py:test_logistic_cg test.py:test_mlp test.py:test_convolutional_mlp test.py:test_dA"
27 | - PART="test.py:test_SdA"
28 | - PART="test.py:test_dbn"
29 | - PART="test.py:test_rbm test.py:test_rnnrbm"
30 | - PART="-e test.py"
31 |
32 | #i7-2600K CPU @ 3.40GHz
33 | #166.572s #8 test.test_rbm OK
34 | #155.114s #7 test.test_dbn OK
35 | #152.365s #9 test.test_rnnrbm OK
36 | #127.286s #6 test.test_SdA OK
37 | #39.252s #5 test.test_dA OK
38 | #27.56s #4 test.test_convolutional_mlp OK
39 | #15.454s #3 test.test_mlp OK
40 | #12.732s #1 test.test_logistic_sgd OK
41 | #12.638s #2 test.test_logistic_cg OK
42 |
43 | #i7-920
44 | #296.475s #7 code.test.test_dbn OK
45 | #257.272s #6 code.test.test_SdA OK
46 | #234.776s #9 code.test.test_rnnrbm OK
47 | #233.896s #8 code.test.test_rbm OK
48 | #65.737s #5 code.test.test_dA OK
49 | #37.658s #4 code.test.test_convolutional_mlp OK
50 | #24.172s #3 code.test.test_mlp OK
51 | #20.401s #1 code.test.test_logistic_sgd OK
52 | #17.546s #2 code.test.test_logistic_cg OK
53 |
54 | # On Core2 duo E8500 with MRG
55 | #308.004s #7 code.test.test_dbn OK
56 | #277.268s #6 code.test.test_SdA OK
57 | #126.102s #8 code.test.test_rbm OK
58 | #123.652s #9 code.test.test_rnnrbm OK
59 | #77.101s #5 code.test.test_dA OK
60 | #39.75s #4 code.test.test_convolutional_mlp OK
61 | #30.406s #3 code.test.test_mlp OK
62 | #21.132s #2 code.test.test_logistic_cg OK
63 | #17.945s #1 code.test.test_logistic_sgd OK
64 |
65 | # Unknown computer with older version of Theano
66 | #569.882s #9 code.test.test_rbm OK
67 | #298.992s #8 code.test.test_dbn OK
68 | #268.901s #7 code.test.test_SdA OK
69 | #67.292s #6 code.test.test_dA OK
70 | #27.485s #4 code.test.test_mlp OK
71 | #26.204s #5 code.test.test_convolutional_mlp OK
72 | #14.676s #3 code.test.test_logistic_cg OK
73 | #10.66s #2 code.test.test_logistic_sgd OK
74 | #5.795s #1 code.hmc.test_hmc.test_hmc OK
75 |
76 | script:
77 | - cd data
78 | - ./download.sh
79 | - ls
80 | - cd ../code
81 | - pwd
82 | - ls
83 | - export THEANO_FLAGS=warn.ignore_bug_before=all,on_opt_error=raise,on_shape_error=raise
84 | - python --version
85 | - nosetests $PART
86 |
87 |
--------------------------------------------------------------------------------
/doc/references.txt:
--------------------------------------------------------------------------------
1 | .. _references:
2 |
3 | ==========
4 | References
5 | ==========
6 |
7 | .. [Bengio07] Y. Bengio, P. Lamblin, D. Popovici and H. Larochelle, `Greedy Layer-Wise Training of Deep Networks `_, in Advances in Neural Information Processing Systems 19 (NIPS'06), pages 153-160, MIT Press 2007.
8 |
9 | .. [Bengio09] Y. Bengio, `Learning deep architectures for AI `_, Foundations and Trends in Machine Learning 1(2) pages 1-127.
10 |
11 | .. [BengioDelalleau09] Y. Bengio, O. Delalleau, Justifying and Generalizing Contrastive Divergence (2009), Neural Computation, 21(6): 1601-1621.
12 |
13 | .. [BoulangerLewandowski12] N Boulanger-Lewandowski, Y. Bengio and P. Vincent, `Modeling Temporal Dependencies in High-Dimensional Sequences: Application to Polyphonic Music Generation and Transcription `_, in Proceedings of the 29th International Conference on Machine Learning (ICML), 2012.
14 |
15 | .. [Fukushima] Fukushima, K. (1980). Neocognitron: A self-organizing neural network model for a mechanism of pattern recognition unaffected by shift in position. Biological Cybernetics, 36, 193–202.
16 |
17 | .. [Hinton06] G.E. Hinton and R.R. Salakhutdinov, `Reducing the Dimensionality of Data with Neural Networks `_, Science, 28 July 2006, Vol. 313. no. 5786, pp. 504 - 507.
18 |
19 | .. [Hinton07] G.E. Hinton, S. Osindero, and Y. Teh, "A fast learning algorithm for deep belief nets", Neural Computation, vol 18, 2006
20 |
21 | .. [Hubel68] Hubel, D. and Wiesel, T. (1968). Receptive fields and functional architecture of monkey striate cortex. Journal of Physiology (London), 195, 215–243.
22 |
23 | .. [LeCun98] LeCun, Y., Bottou, L., Bengio, Y., and Haffner, P. (1998d). Gradient-based learning applied to document recognition. Proceedings of the IEEE, 86(11), 2278–2324.
24 |
25 | .. [Lee08] H. Lee, C. Ekanadham, and A.Y. Ng., `Sparse deep belief net model for visual area V2 `_, in Advances in Neural Information Processing Systems (NIPS) 20, 2008.
26 |
27 | .. [Lee09] H. Lee, R. Grosse, R. Ranganath, and A.Y. Ng, "Convolutional deep belief networks for scalable unsupervised learning of hierarchical representations.", ICML 2009
28 |
29 | .. [Ranzato10] M. Ranzato, A. Krizhevsky, G. Hinton, "Factored 3-Way Restricted Boltzmann Machines for Modeling Natural Images". Proc. of the 13-th International Conference on Artificial Intelligence and Statistics (AISTATS 2010), Italy, 2010
30 |
31 | .. [Ranzato07] M.A. Ranzato, C. Poultney, S. Chopra and Y. LeCun, in J. Platt et al., `Efficient Learning of Sparse Representations with an Energy-Based Model `_, Advances in Neural Information Processing Systems (NIPS 2006), MIT Press, 2007.
32 |
33 | .. [Serre07] Serre, T., Wolf, L., Bileschi, S., and Riesenhuber, M. (2007). Robust object recog- nition with cortex-like mechanisms. IEEE Trans. Pattern Anal. Mach. Intell., 29(3), 411–426. Member-Poggio, Tomaso.
34 |
35 | .. [Vincent08] P. Vincent, H. Larochelle Y. Bengio and P.A. Manzagol, `Extracting and Composing Robust Features with Denoising Autoencoders `_, Proceedings of the Twenty-fifth International Conference on Machine Learning (ICML'08), pages 1096 - 1103, ACM, 2008.
36 |
37 | .. [Tieleman08] T. Tieleman, Training restricted boltzmann machines using approximations to the likelihood gradient, ICML 2008.
38 |
39 | .. [Xavier10] Y. Bengio, X. Glorot, Understanding the difficulty of training deep feedforward neural networks, AISTATS 2010
40 |
--------------------------------------------------------------------------------
/issues_open/6_benchmarking_pybrain.txt:
--------------------------------------------------------------------------------
1 | Reported by : Razvan
2 |
3 | Observations :
4 |
5 | 1. First thing, working with their dataset model is a pain! Either I had
6 | not figured it out, or it allows you to add only one datapoint at a time
7 | to the dataset. This seems highly suboptimal to me ...
8 |
9 | 2. You do not get batches for SGD! The only thing you can do is compare with
10 | a batch size of 1.
11 |
12 | 3. Their early stopping is different from ours. Differences:
13 | - You cannot set how often you do a pass on the validation set
14 | (i.e. ``patience`` in our case). You always do one epoch of training
15 | and then you go through the validation set.
16 | - You do not have an improvement threshold: any improvement in
17 | validation score leads to storing the new best parameters and
18 | increasing the time you will keep looking for better parameters.
19 | - The increase is by summation, not multiplication. So if at
20 | epoch x you do better on the validation set, you will go on for
21 | x+y epochs to look for something better (we do x*y).
22 |
23 | 4. The errors returned by PyBrain are divided by the number of
24 | classes. So if you do classification, you take the number of
25 | errors and divide it by the number of test examples times the
26 | number of classes. For MNIST this yields 10 times smaller
27 | errors. Is this something standard ... should we do it? It
28 | definitely makes the error look smaller.
29 |
30 | 5. There is no straightforward way of adding L1/L2 regularization (from
31 | what I've seen), unless you go into their code and change it. That is not
32 | hard to do ... but for now I do not want to meddle with the library.
33 |
34 | 6. The code for RBM is not ready (they say that it is work in progress). It seems to me that the
35 | code is wrong ... They have 3 loops, which to me would mean that the innermost is for CD-k (the
36 | second is for one epoch / the third for training). But they update the weights after each Gibbs
37 | step in CD-k ... which results in a strange form of CD-1 that sees the same example several times before
38 | moving to the next one. I could (?) potentially fix the code, but it is outside the scope of
39 | benchmarking.
40 |
41 | 7. There are question marks over how easy it would be to implement an SdA (autoassociators might be
42 | easy to do, though).
43 |
44 |
45 | RESULTS :
46 | logistic_sgd on maggie46
47 |
48 | Total error: 0.015611011103
49 | Total error: 0.00966772673335
50 | Total error: 0.00860664508883
51 | Time spend per epoch: 43.32
52 | Final error is : 10.44
53 | Time spend per epoch: 43.32
54 | Final error is : 10.44
55 |
56 | Arac :
57 |
58 | Total error: 0.0366924968888
59 | Total error: 0.0366576944937
60 | Total error: 0.0367442383338
61 | Time spend per epoch: 24.71
62 | Final error is : 69.28
63 | Time spend per epoch: 24.71
64 | Final error is : 69.28
65 |
66 |
67 | ** Our thing with batchsize =1 **
68 |
69 | test error of best model 8.45
70 | time : 12.99
71 | 12.01
72 |
73 |
74 |
75 |
76 | Results :
77 | mlp on maggie46
78 |
79 |
80 | pybrain ::
81 |
82 | Total error: 0.0124744609817
83 | Total error: 0.00722484141084
84 | Total error: 0.00599591269763
85 | Time spend per epoch : 1226.69
86 | Final error is : 8.68
87 | Time spend per epoch: 1226.69
88 | Final error is : 8.68
89 |
90 | 20.4448 min
91 |
92 | arac::
93 |
94 | Total error: 0.0318599056504
95 | Total error: 0.0316029246672
96 | Total error: 0.0315542295953
97 | Time spend per epoch: 860.336666667 (s)
98 | Final error is : 58.59
99 |
100 | our thing::
101 |
102 | test error of best model 3.88
103 | time: 381.92
104 |
105 |
--------------------------------------------------------------------------------
/doc/intro.txt:
--------------------------------------------------------------------------------
1 | =======================
2 | Deep Learning Tutorials
3 | =======================
4 |
5 | Deep Learning is a new area of Machine Learning research, which
6 | has been introduced with the objective of moving Machine Learning
7 | closer to one of its original goals: Artificial Intelligence.
8 | See these course notes for a `brief introduction to Machine Learning for AI `_
9 | and an `introduction to Deep Learning algorithms `_.
10 |
11 | Deep Learning is about learning multiple levels of representation
12 | and abstraction that help to
13 | make sense of data such as images, sound, and text.
14 | For more about deep learning algorithms, see for example:
15 |
16 | - The monograph or review paper `Learning Deep Architectures for AI `_ (Foundations & Trends in Machine Learning, 2009).
17 | - The ICML 2009 Workshop on Learning Feature Hierarchies `webpage `_ has a `list of references `_.
18 | - The LISA `public wiki `_ has a `reading list `_ and a `bibliography `_.
19 | - Geoff Hinton has `readings `_ from last year's `NIPS tutorial `_.
20 |
21 | The tutorials presented here will introduce you to some of the most important deep learning
22 | algorithms and will also show you how to run them using Theano_. Theano is a Python library that makes writing deep learning models easy and gives the option of
23 | training them on a GPU.
24 |
25 | The algorithm tutorials have some prerequisites. You should know some Python
26 | and be familiar with numpy. Since this tutorial is about using Theano, you
27 | should read over the `Theano basic tutorial`_ first. Once you've done that,
28 | read through our :ref:`gettingstarted` chapter -- it introduces the notation, the (downloadable) datasets used in the algorithm tutorials, and the way we do optimization by stochastic gradient descent.
29 |
30 | The purely supervised learning algorithms are meant to be read in order:
31 |
32 | #. :ref:`Logistic Regression ` - using Theano for something simple
33 | #. :ref:`Multilayer perceptron ` - introduction to layers
34 | #. :ref:`Deep Convolutional Network ` - a simplified version of LeNet5
35 |
36 | The unsupervised and semi-supervised learning algorithms can be read in any
37 | order (the auto-encoders can be read independently of the RBM/DBN thread):
38 |
39 | * :ref:`Auto Encoders, Denoising Autoencoders ` - description of autoencoders
40 | * :ref:`Stacked Denoising Auto-Encoders ` - easy steps into unsupervised pre-training for deep nets
41 | * :ref:`Restricted Boltzmann Machines ` - single layer generative RBM model
42 | * :ref:`Deep Belief Networks ` - unsupervised generative pre-training of stacked RBMs followed by supervised fine-tuning
43 |
44 | Building towards including the mcRBM model, we have a new tutorial on sampling
45 | from energy models:
46 |
47 | * :ref:`HMC Sampling ` - hybrid (aka Hamiltonian) Monte-Carlo sampling with scan()
48 |
49 | Building towards including the Contractive auto-encoders tutorial, we have the code for now:
50 | * `Contractive auto-encoders`_ code - There is some basic doc in the code.
51 |
52 | Energy-based recurrent neural network (RNN-RBM):
53 | * :ref:`Modeling and generating sequences of polyphonic music `
54 |
55 | .. _Theano: http://deeplearning.net/software/theano
56 |
57 | .. _Theano basic tutorial: http://deeplearning.net/software/theano/tutorial
58 |
59 | .. _Contractive auto-encoders: https://github.com/lisa-lab/DeepLearningTutorials/blob/master/code/cA.py
60 |
--------------------------------------------------------------------------------
/doc/deep.txt:
--------------------------------------------------------------------------------
1 | .. _deep:
2 |
3 | Deep Learning
4 | =============
5 |
6 | The breakthrough to effective training strategies for deep architectures came in
7 | 2006 with the algorithms for training deep belief networks
8 | (DBN) [Hinton07]_ and stacked auto-encoders [Ranzato07]_ , [Bengio07]_ .
9 | All these methods are based on a similar approach: **greedy layer-wise unsupervised
10 | pre-training** followed by **supervised fine-tuning**.
11 |
12 | The pretraining strategy consists of using unsupervised learning to guide the
13 | training of intermediate levels of representation. Each layer is pre-trained
14 | with an unsupervised learning algorithm, which attempts to learn a nonlinear
15 | transformation of its input in order to capture its main variations. Higher
16 | levels of abstraction are created by feeding the output of one layer to the
17 | input of the subsequent layer.
18 |
19 | The resulting architecture can then be seen in two lights:
20 |
21 | * the pre-trained deep network can be used to initialize the weights of all but
22 | the last layer of a deep neural network. The weights are then further adapted
23 | to a supervised task (such as classification) through traditional gradient
24 | descent (see :ref:`Multilayer perceptron `). This is referred to as the
25 | fine-tuning step.
26 |
27 | * the pre-trained deep network can also serve solely as a feature extractor. The
28 | output of the last layer is fed to a classifier, such as logistic regression,
29 | which is trained independently. Better results can be obtained by
30 | concatenating the output of the last layer with the hidden representations of
31 | all intermediate layers [Lee09]_.
32 |
33 | For the purposes of this tutorial, we will focus on the first interpretation,
34 | as that is what was first proposed in [Hinton06]_.
35 |
36 | Deep Coding
37 | +++++++++++
38 |
39 | Since Deep Belief Networks (DBN) and Stacked Denoising-AutoEncoders (SDA) share
40 | much of the same architecture and have very similar training algorithms (in
41 | terms of pretraining and fine-tuning stages), it makes sense to implement them
42 | in a similar fashion, as part of a "Deep Learning" framework.
43 |
44 | We thus define a generic interface, which both of these architectures will
45 | share.
46 |
47 | .. code-block:: python
48 |
49 | class DeepLayerwiseModel(object):
50 |
51 | def layerwise_pretrain(self, layer_fns, pretrain_amounts):
52 | """
53 | """
54 |
55 | def finetune(self, datasets, lr, batch_size):
56 | """
57 | """
58 | class DBN(DeepLayerwiseModel):
59 | """
60 | """
61 |
62 | class StackedDAA(DeepLayerwiseModel):
63 | """
64 | """
65 |
66 | .. code-block:: python
67 |
68 | def deep_main(learning_rate=0.1,
69 | pretraining_epochs=20,
70 | pretrain_lr=0.1,
71 | training_epochs=1000,
72 | batch_size=20,
73 | mnist_file='mnist.pkl.gz'):
74 |
75 | n_train_examples, train_valid_test = load_mnist(mnist_file)
76 |
77 | # instantiate model
78 | deep_model = ...
79 |
80 | ####
81 | #### Phase 1: Pre-training
82 | ####
83 |
84 | # create an array of functions, which will be used for the greedy
85 | # layer-wise unsupervised training procedure
86 |
87 | pretrain_functions = deep_model.pretrain_functions(
88 | batch_size=batch_size,
89 | train_set_x=train_set_x,
90 | learning_rate=pretrain_lr,
91 | ...
92 | )
93 |
94 | # loop over all the layers in our network
95 | for layer_idx, pretrain_fn in enumerate(pretrain_functions):
96 |
97 | # iterate over a certain number of epochs
98 | for i in xrange(pretraining_epochs * n_train_examples / batch_size):
99 |
100 | # follow one step in the gradient of the unsupervised cost
101 | # function, at the given layer
102 | pretrain_fn(i)
103 |
104 |
105 | .. code-block:: python
106 |
107 | ####
108 | #### Phase 2: Fine Tuning
109 | ####
110 |
111 | # create theano functions for fine-tuning, as well as
112 | # validation and testing our model.
113 |
114 | train_fn, valid_scores, test_scores =\
115 | deep_model.finetune_functions(
116 | train_valid_test[0][0], # training dataset
117 | learning_rate=finetune_lr, # the learning rate
118 | batch_size=batch_size) # number of examples to use at once
119 |
120 |
121 | # use these functions as part of the generic early-stopping procedure
122 | for i in xrange(patience_max):
123 |
124 | if i >= patience:
125 | break
126 |
127 | cost_i = train_fn(i)
128 |
129 | ...
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
--------------------------------------------------------------------------------
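
As a complement to the pseudocode above, here is a self-contained toy sketch
(pure numpy; the class names and the placeholder update rule are invented for
illustration, they are not the tutorial's DBN/SdA code) of the greedy
layer-wise pattern: pretrain one layer at a time on the representation
produced by the layers below it::

    import numpy

    class ToyLayer(object):
        """One layer with a made-up, placeholder 'unsupervised' update."""
        def __init__(self, n_in, n_out, rng):
            self.W = 0.01 * rng.randn(n_in, n_out)

        def pretrain_step(self, x_batch, lr=0.1):
            # placeholder update (a crude power iteration on the batch
            # covariance); a real model would do CD for an RBM or a
            # reconstruction step for a dA here
            self.W += lr * numpy.dot(x_batch.T,
                                     numpy.dot(x_batch, self.W)) / len(x_batch)

        def transform(self, x):
            return numpy.tanh(numpy.dot(x, self.W))

    class ToyDeepLayerwiseModel(object):
        def __init__(self, layer_sizes, rng):
            self.layers = [ToyLayer(n_in, n_out, rng)
                           for n_in, n_out in zip(layer_sizes[:-1],
                                                  layer_sizes[1:])]

        def layerwise_pretrain(self, X, n_epochs=2, batch_size=20):
            rep = X
            for layer in self.layers:             # greedy: one layer at a time
                for epoch in xrange(n_epochs):
                    for start in xrange(0, len(rep), batch_size):
                        layer.pretrain_step(rep[start:start + batch_size])
                rep = layer.transform(rep)        # its output feeds the next layer

    rng = numpy.random.RandomState(0)
    X = rng.randn(200, 50)
    model = ToyDeepLayerwiseModel([50, 30, 10], rng)
    model.layerwise_pretrain(X)
    # supervised fine-tuning of the whole stack (Phase 2 above) would follow here
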
/code/utils.py:
--------------------------------------------------------------------------------
1 | """ This file contains different utility functions that are not connected
2 | in any way to the networks presented in the tutorials, but rather help in
3 | processing the outputs into a more understandable way.
4 |
5 | For example ``tile_raster_images`` helps in generating an easy-to-grasp
6 | image from a set of samples or weights.
7 | """
8 |
9 |
10 | import numpy
11 |
12 |
13 | def scale_to_unit_interval(ndar, eps=1e-8):
14 | """ Scales all values in the ndarray ndar to be between 0 and 1 """
15 | ndar = ndar.copy()
16 | ndar -= ndar.min()
17 | ndar *= 1.0 / (ndar.max() + eps)
18 | return ndar
19 |
20 |
21 | def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0),
22 | scale_rows_to_unit_interval=True,
23 | output_pixel_vals=True):
24 | """
25 | Transform an array with one flattened image per row, into an array in
26 | which images are reshaped and laid out like tiles on a floor.
27 |
28 | This function is useful for visualizing datasets whose rows are images,
29 | and also columns of matrices for transforming those rows
30 | (such as the first layer of a neural net).
31 |
32 | :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can
33 | be 2-D ndarrays or None;
34 | :param X: a 2-D array in which every row is a flattened image.
35 |
36 | :type img_shape: tuple; (height, width)
37 | :param img_shape: the original shape of each image
38 |
39 | :type tile_shape: tuple; (rows, cols)
40 | :param tile_shape: the number of images to tile (rows, cols)
41 |
42 | :param output_pixel_vals: if output should be pixel values (i.e. int8
43 | values) or floats
44 |
45 | :param scale_rows_to_unit_interval: if the values need to be scaled before
46 | being plotted to [0,1] or not
47 |
48 |
49 | :returns: array suitable for viewing as an image.
50 | (See:`Image.fromarray`.)
51 | :rtype: a 2-d array with same dtype as X.
52 |
53 | """
54 |
55 | assert len(img_shape) == 2
56 | assert len(tile_shape) == 2
57 | assert len(tile_spacing) == 2
58 |
59 | # The expression below can be re-written in a more C style as
60 | # follows :
61 | #
62 | # out_shape = [0,0]
63 | # out_shape[0] = (img_shape[0]+tile_spacing[0])*tile_shape[0] -
64 | # tile_spacing[0]
65 | # out_shape[1] = (img_shape[1]+tile_spacing[1])*tile_shape[1] -
66 | # tile_spacing[1]
67 | out_shape = [
68 | (ishp + tsp) * tshp - tsp
69 | for ishp, tshp, tsp in zip(img_shape, tile_shape, tile_spacing)
70 | ]
71 |
72 | if isinstance(X, tuple):
73 | assert len(X) == 4
74 | # Create an output numpy ndarray to store the image
75 | if output_pixel_vals:
76 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4),
77 | dtype='uint8')
78 | else:
79 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4),
80 | dtype=X.dtype)
81 |
82 | #colors default to 0, alpha defaults to 1 (opaque)
83 | if output_pixel_vals:
84 | channel_defaults = [0, 0, 0, 255]
85 | else:
86 | channel_defaults = [0., 0., 0., 1.]
87 |
88 | for i in xrange(4):
89 | if X[i] is None:
90 | # if channel is None, fill it with zeros of the correct
91 | # dtype
92 | dt = out_array.dtype
93 | if output_pixel_vals:
94 | dt = 'uint8'
95 | out_array[:, :, i] = numpy.zeros(
96 | out_shape,
97 | dtype=dt
98 | ) + channel_defaults[i]
99 | else:
100 | # use a recurrent call to compute the channel and store it
101 | # in the output
102 | out_array[:, :, i] = tile_raster_images(
103 | X[i], img_shape, tile_shape, tile_spacing,
104 | scale_rows_to_unit_interval, output_pixel_vals)
105 | return out_array
106 |
107 | else:
108 | # if we are dealing with only one channel
109 | H, W = img_shape
110 | Hs, Ws = tile_spacing
111 |
112 | # generate a matrix to store the output
113 | dt = X.dtype
114 | if output_pixel_vals:
115 | dt = 'uint8'
116 | out_array = numpy.zeros(out_shape, dtype=dt)
117 |
118 | for tile_row in xrange(tile_shape[0]):
119 | for tile_col in xrange(tile_shape[1]):
120 | if tile_row * tile_shape[1] + tile_col < X.shape[0]:
121 | this_x = X[tile_row * tile_shape[1] + tile_col]
122 | if scale_rows_to_unit_interval:
123 | # if we should scale values to be between 0 and 1
124 | # do this by calling the `scale_to_unit_interval`
125 | # function
126 | this_img = scale_to_unit_interval(
127 | this_x.reshape(img_shape))
128 | else:
129 | this_img = this_x.reshape(img_shape)
130 | # add the slice to the corresponding position in the
131 | # output array
132 | c = 1
133 | if output_pixel_vals:
134 | c = 255
135 | out_array[
136 | tile_row * (H + Hs): tile_row * (H + Hs) + H,
137 | tile_col * (W + Ws): tile_col * (W + Ws) + W
138 | ] = this_img * c
139 | return out_array
140 |
--------------------------------------------------------------------------------
/doc/utilities.txt:
--------------------------------------------------------------------------------
1 | =============
2 | Miscellaneous
3 | =============
4 |
5 | .. _how-to-plot:
6 |
7 | Plotting Samples and Filters
8 | ++++++++++++++++++++++++++++
9 |
10 | .. note::
11 | The code for this section is available for download `here`_.
12 |
13 | .. _here: http://deeplearning.net/tutorial/code/utils.py
14 |
15 |
16 | To plot a sample, we need to take the visible units, which
17 | are a flattened image (there is no 2D structure to the visible units,
18 | just a 1D string of unit activations), and reshape them into a 2D image. The order in
19 | which the points from the 1D array go into the 2D image is given by the
20 | order in which the initial MNIST images were converted into a 1D array.
21 | Luckily for us, this is just a call to the ``numpy.reshape`` function.
22 |
23 | Plotting the weights is a bit trickier. We have ``n_hidden`` hidden
24 | units, each of them corresponding to a column of the weight matrix. A
25 | column has the same shape as the visible input, where the weight corresponding
26 | to the connection with visible unit `j` is at position `j`. Therefore,
27 | if we reshape every such column using ``numpy.reshape``, we get a
28 | filter image that tells us how this hidden unit is influenced by
29 | the input image.
30 |
31 | We need a utility function that takes a minibatch, or the weight matrix,
32 | and converts each row (for the weight matrix we do a transpose) into a
33 | 2D image, and then tiles these images together. Once we have converted the
34 | minibatch or the weights into this image of tiles, we can use PIL to plot
35 | and save them. `PIL `_ is a standard
36 | Python library for dealing with images.
37 |
38 | Tiling minibatches together is done for us by the
39 | ``tile_raster_images`` function, which we provide here (see the usage sketch below).
40 |
41 | .. code-block:: python
42 |
43 |
44 | def scale_to_unit_interval(ndar, eps=1e-8):
45 | """ Scales all values in the ndarray ndar to be between 0 and 1 """
46 | ndar = ndar.copy()
47 | ndar -= ndar.min()
48 | ndar *= 1.0 / (ndar.max() + eps)
49 | return ndar
50 |
51 |
52 | def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0),
53 | scale_rows_to_unit_interval=True,
54 | output_pixel_vals=True):
55 | """
56 | Transform an array with one flattened image per row, into an array in
57 | which images are reshaped and laid out like tiles on a floor.
58 |
59 | This function is useful for visualizing datasets whose rows are images,
60 | and also columns of matrices for transforming those rows
61 | (such as the first layer of a neural net).
62 |
63 | :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can
64 | be 2-D ndarrays or None;
65 | :param X: a 2-D array in which every row is a flattened image.
66 |
67 | :type img_shape: tuple; (height, width)
68 | :param img_shape: the original shape of each image
69 |
70 | :type tile_shape: tuple; (rows, cols)
71 | :param tile_shape: the number of images to tile (rows, cols)
72 |
73 | :param output_pixel_vals: if output should be pixel values (i.e. int8
74 | values) or floats
75 |
76 | :param scale_rows_to_unit_interval: if the values need to be scaled before
77 | being plotted to [0,1] or not
78 |
79 |
80 | :returns: array suitable for viewing as an image.
81 | (See:`Image.fromarray`.)
82 | :rtype: a 2-d array with same dtype as X.
83 |
84 | """
85 |
86 | assert len(img_shape) == 2
87 | assert len(tile_shape) == 2
88 | assert len(tile_spacing) == 2
89 |
90 | # The expression below can be re-written in a more C style as
91 | # follows :
92 | #
93 | # out_shape = [0,0]
94 | # out_shape[0] = (img_shape[0] + tile_spacing[0]) * tile_shape[0] -
95 | # tile_spacing[0]
96 | # out_shape[1] = (img_shape[1] + tile_spacing[1]) * tile_shape[1] -
97 | # tile_spacing[1]
98 | out_shape = [(ishp + tsp) * tshp - tsp for ishp, tshp, tsp
99 | in zip(img_shape, tile_shape, tile_spacing)]
100 |
101 | if isinstance(X, tuple):
102 | assert len(X) == 4
103 | # Create an output numpy ndarray to store the image
104 | if output_pixel_vals:
105 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4), dtype='uint8')
106 | else:
107 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4), dtype=X.dtype)
108 |
109 | #colors default to 0, alpha defaults to 1 (opaque)
110 | if output_pixel_vals:
111 | channel_defaults = [0, 0, 0, 255]
112 | else:
113 | channel_defaults = [0., 0., 0., 1.]
114 |
115 | for i in xrange(4):
116 | if X[i] is None:
117 | # if channel is None, fill it with zeros of the correct
118 | # dtype
119 | out_array[:, :, i] = numpy.zeros(out_shape,
120 | dtype='uint8' if output_pixel_vals else out_array.dtype
121 | ) + channel_defaults[i]
122 | else:
123 | # use a recurrent call to compute the channel and store it
124 | # in the output
125 | out_array[:, :, i] = tile_raster_images(X[i], img_shape, tile_shape, tile_spacing, scale_rows_to_unit_interval, output_pixel_vals)
126 | return out_array
127 |
128 | else:
129 | # if we are dealing with only one channel
130 | H, W = img_shape
131 | Hs, Ws = tile_spacing
132 |
133 | # generate a matrix to store the output
134 | out_array = numpy.zeros(out_shape, dtype='uint8' if output_pixel_vals else X.dtype)
135 |
136 |
137 | for tile_row in xrange(tile_shape[0]):
138 | for tile_col in xrange(tile_shape[1]):
139 | if tile_row * tile_shape[1] + tile_col < X.shape[0]:
140 | if scale_rows_to_unit_interval:
141 | # if we should scale values to be between 0 and 1
142 | # do this by calling the `scale_to_unit_interval`
143 | # function
144 | this_img = scale_to_unit_interval(X[tile_row * tile_shape[1] + tile_col].reshape(img_shape))
145 | else:
146 | this_img = X[tile_row * tile_shape[1] + tile_col].reshape(img_shape)
147 | # add the slice to the corresponding position in the
148 | # output array
149 | out_array[
150 | tile_row * (H+Hs): tile_row * (H + Hs) + H,
151 | tile_col * (W+Ws): tile_col * (W + Ws) + W
152 | ] \
153 | = this_img * (255 if output_pixel_vals else 1)
154 | return out_array
155 |
--------------------------------------------------------------------------------
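
The usage sketch referred to above: a minimal, hypothetical example that tiles
a random "weight matrix" with ``tile_raster_images`` and saves it with PIL.
It assumes it is run from the code/ directory (next to utils.py) under the
same Python 2 / PIL setup as the tutorials; the filter sizes and output file
name are made up::

    import numpy
    from PIL import Image

    from utils import tile_raster_images

    # pretend "weights": 100 random filters over 28x28 inputs, one per *column*,
    # like the weight matrix of an RBM or dA trained on MNIST
    rng = numpy.random.RandomState(0)
    W = rng.randn(28 * 28, 100)

    # transpose so each *row* is one flattened filter, tile the filters on a
    # 10x10 grid with a 1-pixel gap, and save the resulting uint8 image as PNG
    image = Image.fromarray(tile_raster_images(X=W.T,
                                               img_shape=(28, 28),
                                               tile_shape=(10, 10),
                                               tile_spacing=(1, 1)))
    image.save('filters.png')
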
/doc/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # theano documentation build configuration file, created by
4 | # sphinx-quickstart on Tue Oct 7 16:34:06 2008.
5 | #
6 | # This file is execfile()d with the current directory set to its containing dir.
7 | #
8 | # The contents of this file are pickled, so don't put values in the namespace
9 | # that aren't pickleable (module imports are okay, they're removed automatically).
10 | #
11 | # All configuration values have a default value; values that are commented out
12 | # serve to show the default value.
13 | import sys, os
14 |
15 | # If your extensions are in another directory, add it here. If the directory
16 | # is relative to the documentation root, use os.path.abspath to make it
17 | # absolute, like shown here.
18 | #sys.path.append(os.path.abspath('some/directory'))
19 |
20 | # General configuration
21 | # ---------------------
22 |
23 | # Add any Sphinx extension module names here, as strings. They can be extensions
24 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
25 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.todo']
26 |
27 | try:
28 | from sphinx.ext import pngmath
29 | extensions.append('sphinx.ext.pngmath')
30 | except ImportError:
31 | print >>sys.stderr, 'Warning: could not import sphinx.ext.pngmath'
32 | pass
33 |
34 | # Add any paths that contain templates here, relative to this directory.
35 | templates_path = ['.templates']
36 |
37 | # The suffix of source filenames.
38 | source_suffix = '.txt'
39 |
40 | # The master toctree document.
41 | master_doc = 'contents'
42 |
43 | # General substitutions.
44 | project = 'DeepLearning'
45 | copyright = '2008--2010, LISA lab'
46 |
47 | # The default replacements for |version| and |release|, also used in various
48 | # other places throughout the built documents.
49 | #
50 | # The short X.Y version.
51 | version = '0.1'
52 | # The full version, including alpha/beta/rc tags.
53 | release = '0.1'
54 |
55 | # There are two options for replacing |today|: either, you set today to some
56 | # non-false value, then it is used:
57 | #today = ''
58 | # Else, today_fmt is used as the format for a strftime call.
59 | today_fmt = '%B %d, %Y'
60 |
61 | # List of documents that shouldn't be included in the build.
62 | #unused_docs = []
63 |
64 | # List of directories, relative to source directories, that shouldn't be searched
65 | # for source files.
66 | exclude_dirs = ['scripts']
67 |
68 | # The reST default role (used for this markup: `text`) to use for all documents.
69 | #default_role = None
70 |
71 | # If true, '()' will be appended to :func: etc. cross-reference text.
72 | #add_function_parentheses = True
73 |
74 | # If true, the current module name will be prepended to all description
75 | # unit titles (such as .. function::).
76 | #add_module_names = True
77 |
78 | # If true, sectionauthor and moduleauthor directives will be shown in the
79 | # output. They are ignored by default.
80 | #show_authors = False
81 |
82 | # The name of the Pygments (syntax highlighting) style to use.
83 | pygments_style = 'sphinx'
84 |
85 |
86 | # Options for HTML output
87 | # -----------------------
88 |
89 | # The style sheet to use for HTML and HTML Help pages. A file of that name
90 | # must exist either in Sphinx' static/ path, or in one of the custom paths
91 | # given in html_static_path.
92 | #html_style = 'default.css'
93 | html_theme = 'sphinxdoc'
94 |
95 | # The name for this set of Sphinx documents. If None, it defaults to
96 | # " v documentation".
97 | #html_title = None
98 |
99 | # A shorter title for the navigation bar. Default is the same as html_title.
100 | #html_short_title = None
101 |
102 | # The name of an image file (within the static path) to place at the top of
103 | # the sidebar.
104 | #html_logo = None
105 |
106 | # The name of an image file (within the static path) to use as favicon of the
107 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
108 | # pixels large.
109 | #html_favicon = None
110 |
111 | # Add any paths that contain custom static files (such as style sheets) here,
112 | # relative to this directory. They are copied after the builtin static files,
113 | # so a file named "default.css" will overwrite the builtin "default.css".
114 | #html_static_path = ['.static', 'images']
115 | html_static_path = ['images']
116 |
117 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
118 | # using the given strftime format.
119 | html_last_updated_fmt = '%b %d, %Y'
120 |
121 | # If true, SmartyPants will be used to convert quotes and dashes to
122 | # typographically correct entities.
123 | html_use_smartypants = True
124 |
125 | # Custom sidebar templates, maps document names to template names.
126 | #html_sidebars = {}
127 |
128 | # Additional templates that should be rendered to pages, maps page names to
129 | # template names.
130 | #html_additional_pages = {}
131 |
132 | # If false, no module index is generated.
133 | html_use_modindex = True
134 |
135 | # If false, no index is generated.
136 | html_use_index = True
137 |
138 | # If true, the index is split into individual pages for each letter.
139 | #html_split_index = False
140 |
141 | # If true, the reST sources are included in the HTML build as _sources/.
142 | #html_copy_source = True
143 |
144 | # If true, an OpenSearch description file will be output, and all pages will
145 | # contain a tag referring to it. The value of this option must be the
146 | # base URL from which the finished HTML is served.
147 | #html_use_opensearch = ''
148 |
149 | # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml").
150 | #html_file_suffix = ''
151 |
152 | # Output file base name for HTML help builder.
153 | htmlhelp_basename = 'deeplearningdoc'
154 |
155 |
156 | # Options for LaTeX output
157 | # ------------------------
158 |
159 | # The paper size ('letter' or 'a4').
160 | #latex_paper_size = 'letter'
161 |
162 | # The font size ('10pt', '11pt' or '12pt').
163 | latex_font_size = '11pt'
164 |
165 | # Grouping the document tree into LaTeX files. List of tuples
166 | # (source start file, target name, title, author, document class [howto/manual]).
167 | latex_documents = [
168 | ('contents', 'deeplearning.tex', 'Deep Learning Tutorial',
169 | 'LISA lab, University of Montreal', 'manual'),
170 | ]
171 |
172 | # The name of an image file (relative to this directory) to place at the top of
173 | # the title page.
174 | latex_logo = None
175 |
176 | # For "manual" documents, if this is true, then toplevel headings are parts,
177 | # not chapters.
178 | #latex_use_parts = False
179 |
180 | # Additional stuff for the LaTeX preamble.
181 | #latex_preamble = ''
182 |
183 | # Documents to append as an appendix to all manuals.
184 | #latex_appendices = []
185 |
186 | # If false, no module index is generated.
187 | #latex_use_modindex = True
188 |
189 | default_role = 'math'
190 | pngmath_dvipng_args = ['-gamma 1.5','-D 110']
191 | pngmath_latex_preamble = '\\usepackage{amsmath}\n'+\
192 | '\\usepackage{amsfonts}\n'+\
193 | '\\usepackage{amssymb}\n'+\
194 | '\\def\\E{\\mathbf{E}}\n'+\
195 | '\\def\\F{\\mathbf{F}}\n'+\
196 | '\\def\\x{\\mathbf{x}}\n'+\
197 | '\\def\\h{\\mathbf{h}}\n'+\
198 | '\\def\\v{\\mathbf{v}}\n'+\
199 | '\\def\\nv{\\mathbf{v^{{\\bf -}}}}\n'+\
200 | '\\def\\nh{\\mathbf{h^{{\\bf -}}}}\n'+\
201 | '\\def\\s{\\mathbf{s}}\n'+\
202 | '\\def\\b{\\mathbf{b}}\n'+\
203 | '\\def\\c{\\mathbf{c}}\n'+\
204 | '\\def\\W{\\mathbf{W}}\n'+\
205 | '\\def\\C{\\mathbf{C}}\n'+\
206 | '\\def\\P{\\mathbf{P}}\n'+\
207 | '\\def\\T{{\\bf \\mathcal T}}\n'+\
208 | '\\def\\B{{\\bf \\mathcal B}}\n'
209 |
--------------------------------------------------------------------------------
/doc/rnnrbm.txt:
--------------------------------------------------------------------------------
1 | .. _rnnrbm:
2 |
3 | Modeling and generating sequences of polyphonic music with the RNN-RBM
4 | ========================================================================
5 |
6 | .. note::
7 | This tutorial demonstrates a basic implementation of the RNN-RBM as described in [BoulangerLewandowski12]_
8 | (`pdf `_).
9 | We assume the reader is familiar with
10 | `recurrent neural networks using the scan op `_
11 | and `restricted Boltzmann machines (RBM) `_.
12 |
13 | .. note::
14 | The code for this section is available for download here: `rnnrbm.py `_.
15 |
16 | You will need the modified `Python MIDI package (GPL license) `_ in your ``$PYTHONPATH`` or in the working directory in order to convert MIDI files to and from piano-rolls.
17 | The script also assumes that the content of the `Nottingham Database of folk tunes `_ has been extracted in the ``../data`` directory.
18 | Alternative MIDI datasets are available `here `_.
19 |
20 | Note that both dependencies above can be setup automatically by running the ``download.sh`` script in the ``../data`` directory.
21 |
22 | .. caution::
23 | Need Theano 0.6 or more recent.
24 |
25 |
26 | The RNN-RBM
27 | +++++++++++++++++++++++++
28 |
29 | The RNN-RBM is an energy-based model for density estimation of temporal sequences, where the feature vector :math:`v^{(t)}` at time step :math:`t` may be high-dimensional.
30 | It allows us to describe multimodal conditional distributions of :math:`v^{(t)}|\mathcal A^{(t)}`, where :math:`\mathcal A^{(t)}\equiv \{v_\tau|\tau < t\}` denotes the *sequence history* at time :math:`t`.
146 |
147 | .. figure:: images/sample2.png
148 | :scale: 60%
149 |
150 | Listen to `sample2.mid `_
151 |
152 |
153 | How to improve this code
154 | +++++++++++++++++++++++++
155 |
156 | The code shown in this tutorial is a stripped-down version that can be improved in the following ways:
157 |
158 | * Preprocessing: transposing the sequences into a common tonality (e.g. C major/minor) and normalizing the tempo in beats (quarter notes) per minute can have the largest effect on the generative quality of the model.
159 | * Pretraining techniques: initialize the :math:`W,b_v,b_h` parameters with independent RBMs with fully shuffled frames (i.e. :math:`W_{uh}=W_{uv}=W_{uu}=W_{vu}=0`); initialize the :math:`W_{uv},W_{uu},W_{vu},b_u` parameters of the RNN with the auxiliary cross-entropy objective via either SGD or, preferably, Hessian-free optimization [BoulangerLewandowski12]_.
160 | * Optimization techniques: gradient clipping (see the sketch after this list), Nesterov momentum and the use of NADE for conditional density estimation.
161 | * Hyperparameter search: learning rate (separately for the RBM and RNN parts), learning rate schedules, batch size, number of hidden units (recurrent and RBM), momentum coefficient, momentum schedule, Gibbs chain length :math:`k` and early stopping.
162 | * Learn the initial condition :math:`u^{(0)}` as a model parameter.
163 |
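As a rough illustration of the gradient-clipping item above, here is a minimal,
hypothetical sketch (not part of ``rnnrbm.py``) of how the SGD updates could
rescale the gradients whenever their global norm exceeds a threshold. The helper
name ``clipped_sgd_updates`` and the toy parameter are assumptions made for this
example only.

.. code-block:: python

    import theano
    import theano.tensor as T

    def clipped_sgd_updates(cost, params, lr=0.001, threshold=1.0):
        # rescale the whole gradient vector if its norm exceeds `threshold`
        grads = T.grad(cost, params)
        norm = T.sqrt(sum((g ** 2).sum() for g in grads))
        scale = T.minimum(1.0, threshold / (norm + 1e-7))
        return [(p, p - lr * g * scale) for p, g in zip(params, grads)]

    # toy usage: a single scalar parameter and a quadratic cost
    w = theano.shared(5.0, name='w')
    cost = w ** 2
    step = theano.function([], cost, updates=clipped_sgd_updates(cost, [w]))
    print(step(), w.get_value())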
164 |
165 | A few samples generated with code including these features are available here: `sequences.zip `_.
166 |
167 |
--------------------------------------------------------------------------------
/doc/SdA.txt:
--------------------------------------------------------------------------------
1 | .. _SdA:
2 |
3 | Stacked Denoising Autoencoders (SdA)
4 | ====================================
5 |
6 | .. note::
7 | This section assumes the reader has already read through :doc:`logreg`
8 | and :doc:`mlp`. Additionally it uses the following Theano functions
9 | and concepts: `T.tanh`_, `shared variables`_, `basic arithmetic ops`_, `T.grad`_, `Random numbers`_, `floatX`_. If you intend to run the code on GPU also read `GPU`_.
10 |
11 | .. _T.tanh: http://deeplearning.net/software/theano/tutorial/examples.html?highlight=tanh
12 |
13 | .. _shared variables: http://deeplearning.net/software/theano/tutorial/examples.html#using-shared-variables
14 |
15 | .. _basic arithmetic ops: http://deeplearning.net/software/theano/tutorial/adding.html#adding-two-scalars
16 |
17 | .. _T.grad: http://deeplearning.net/software/theano/tutorial/examples.html#computing-gradients
18 |
19 | .. _floatX: http://deeplearning.net/software/theano/library/config.html#config.floatX
20 |
21 | .. _GPU: http://deeplearning.net/software/theano/tutorial/using_gpu.html
22 |
23 | .. _Random numbers: http://deeplearning.net/software/theano/tutorial/examples.html#using-random-numbers
24 |
25 |
26 | .. note::
27 | The code for this section is available for download `here`_.
28 |
29 | .. _here: http://deeplearning.net/tutorial/code/SdA.py
30 |
31 |
32 | The Stacked Denoising Autoencoder (SdA) is an extension of the stacked
33 | autoencoder [Bengio07]_ and it was introduced in [Vincent08]_.
34 |
35 | This tutorial builds on the previous tutorial :ref:`dA` and we recommend,
36 | especially if you do not have experience with autoencoders, to read it
37 | before going any further.
38 |
39 | .. _stacked_autoencoders:
40 |
41 | Stacked Autoencoders
42 | ++++++++++++++++++++
43 |
44 | The denoising autoencoders can be stacked to form a deep network by
45 | feeding the latent representation (output code)
46 | of the denoising auto-encoder found on the layer
47 | below as input to the current layer. The **unsupervised pre-training** of such an
48 | architecture is done one layer at a time. Each layer is trained as
49 | a denoising auto-encoder by minimizing the reconstruction of its input
50 | (which is the output code of the previous layer).
51 | Once the first :math:`k` layers
52 | are trained, we can train the :math:`k+1`-th layer because we can now
53 | compute the code or latent representation from the layer below.
54 | Once all layers are pre-trained, the network goes through a second stage
55 | of training called **fine-tuning**. Here we consider **supervised fine-tuning**
56 | where we want to minimize prediction error on a supervised task.
57 | For this we first add a logistic regression
58 | layer on top of the network (more precisely on the output code of the
59 | output layer). We then
60 | train the entire network as we would train a multilayer
61 | perceptron. At this point, we only consider the encoding parts of
62 | each auto-encoder.
63 | This stage is supervised, since now we use the target class during
64 | training (see the :ref:`mlp` for details on the multilayer perceptron).
65 |
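The overall training schedule described above can be summarized with the
following hypothetical sketch. ``train_da_layer`` and ``finetune_step`` are
illustrative stand-ins (stubbed out here so the control flow runs as-is), not
functions defined by the tutorial code; the epoch counts simply echo the
defaults reported later in this section.

.. code-block:: python

    def train_da_layer(layer_index, dataset):
        pass  # one epoch of denoising-autoencoder training for this layer

    def finetune_step(dataset, labels):
        pass  # one supervised SGD pass through the whole MLP

    n_layers = 3
    pretraining_epochs, finetuning_epochs = 15, 36
    dataset, labels = None, None  # placeholders for the real data

    # stage 1: greedy, layer-wise unsupervised pre-training
    for k in range(n_layers):
        for epoch in range(pretraining_epochs):
            train_da_layer(k, dataset)

    # stage 2: supervised fine-tuning of the whole network
    for epoch in range(finetuning_epochs):
        finetune_step(dataset, labels)
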
66 | This can be easily implemented in Theano, using the class defined
67 | before for a denoising autoencoder. We can see the stacked denoising
68 | autoencoder as having two facades: a list of
69 | autoencoders, and an MLP. During pre-training we use the first facade, i.e. we treat our model
70 | as a list of autoencoders, and train each autoencoder separately. In the
71 | second stage of training, we use the second facade. These two
72 | facades are linked by the fact that the autoencoders and the sigmoid layers of
73 | the MLP share parameters, and the fact that the autoencoders get as input the latent
74 | representations of intermediate layers of the MLP.
75 |
76 | .. literalinclude:: ../code/SdA.py
77 | :start-after: start-snippet-1
78 | :end-before: end-snippet-1
79 |
80 | ``self.sigmoid_layers`` will store the sigmoid layers of the MLP facade, while
81 | ``self.dA_layers`` will store the denoising autoencoder associated with the layers of the MLP.
82 |
83 | As a next step, we construct ``n_layers`` sigmoid layers (we use the
84 | ``HiddenLayer`` class introduced in :ref:`mlp`, with the only
85 | modification that we replace the ``tanh`` non-linearity with the
86 | logistic function :math:`s(x) = \frac{1}{1+e^{-x}}`) and ``n_layers``
87 | denoising autoencoders, where ``n_layers`` is the depth of our model.
88 | We link the sigmoid layers such that they form an MLP, and construct
89 | each denoising autoencoder such that they share the weight matrix and the
90 | bias of the encoding part with its corresponding sigmoid layer.
91 |
92 | .. literalinclude:: ../code/SdA.py
93 | :start-after: start-snippet-2
94 | :end-before: end-snippet-2
95 |
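The key point is that each dA and its corresponding sigmoid layer refer to the
*same* shared variables, so updating one updates the other. Below is a minimal
sketch of that sharing (the variable names and sizes are illustrative, not the
tutorial's API):

.. code-block:: python

    import numpy
    import theano

    rng = numpy.random.RandomState(123)
    W_shared = theano.shared(
        rng.uniform(size=(784, 500)).astype(theano.config.floatX), name='W')
    b_shared = theano.shared(
        numpy.zeros(500, dtype=theano.config.floatX), name='b')

    sigmoid_layer_params = [W_shared, b_shared]   # used by the MLP facade
    dA_encoder_params = [W_shared, b_shared]      # reused by the dA facade
    print(sigmoid_layer_params[0] is dA_encoder_params[0])  # True: one weight matrix
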
96 | All we need now is to add the logistic layer on top of the sigmoid
97 | layers such that we have an MLP. We will
98 | use the ``LogisticRegression`` class introduced in :ref:`logreg`.
99 |
100 | .. literalinclude:: ../code/SdA.py
101 | :start-after: end-snippet-2
102 | :end-before: def pretraining_functions
103 |
104 | The class also provides a method that generates training functions for
105 | each of the denoising autoencoders associated with the different layers.
106 | They are returned as a list, where element :math:`i` is a function that
107 | implements one step of training the ``dA`` corresponding to layer
108 | :math:`i`.
109 |
110 | .. literalinclude:: ../code/SdA.py
111 | :start-after: self.errors = self.logLayer.errors(self.y)
112 | :end-before: corruption_level = T.scalar('corruption')
113 |
114 | In order to be able to change the corruption level or the learning rate
115 | during training, we associate a Theano variable with each of them.
116 |
117 | .. literalinclude:: ../code/SdA.py
118 | :start-after: index = T.lscalar('index')
119 | :end-before: def build_finetune_functions
120 |
121 | Now any function ``pretrain_fns[i]`` takes as arguments ``index`` and,
122 | optionally, ``corruption`` -- the corruption level -- or ``lr`` -- the
123 | learning rate. Note that the names of the parameters are the names given
124 | to the Theano variables when they are constructed, not the names of the
125 | Python variables (``learning_rate`` or ``corruption_level``). Keep this
126 | in mind when working with Theano.
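
The following tiny, self-contained sketch (independent of the ``SdA`` class)
shows the same behaviour: a compiled Theano function accepts keyword arguments
by the *Theano* variable names, not the Python variable names.

.. code-block:: python

    import theano
    import theano.tensor as T

    # the names passed to the Theano variables ('corruption', 'lr'), not the
    # Python names (corruption_level, learning_rate), are the keywords accepted
    # by the compiled function
    corruption_level = T.scalar('corruption')
    learning_rate = T.scalar('lr')
    f = theano.function([corruption_level, learning_rate],
                        corruption_level * learning_rate)

    print(f(0.2, 0.1))                  # positional call
    print(f(corruption=0.3, lr=0.5))    # keyword call uses the Theano names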
127 |
128 | In the same fashion we build a method for constructing the functions required
129 | during finetuning (a ``train_model``, a ``validate_model`` and a
130 | ``test_model`` function).
131 |
132 | .. literalinclude:: ../code/SdA.py
133 | :pyobject: SdA.build_finetune_functions
134 |
135 | Note that the returned ``valid_score`` and ``test_score`` are not Theano
136 | functions, but rather Python functions that loop over the entire
137 | validation set and the entire test set, producing a list of the losses
138 | over these sets.
139 |
140 | Putting it all together
141 | +++++++++++++++++++++++
142 |
143 | The few lines of code below construct the stacked denoising
144 | autoencoder:
145 |
146 | .. literalinclude:: ../code/SdA.py
147 | :start-after: start-snippet-3
148 | :end-before: end-snippet-3
149 |
150 | There are two stages in training this network, a layer-wise pre-training and
151 | fine-tuning afterwards.
152 |
153 | For the pre-training stage, we will loop over all the layers of the
154 | network. For each layer we will use the compiled Theano function that
155 | implements an SGD step towards optimizing the weights so as to reduce
156 | the reconstruction cost of that layer. This function will be applied
157 | to the training set for a fixed number of epochs given by
158 | ``pretraining_epochs``.
159 |
160 | .. literalinclude:: ../code/SdA.py
161 | :start-after: start-snippet-4
162 | :end-before: end-snippet-4
163 |
164 | The fine-tuning loop is very similar to the one in :ref:`mlp`; the
165 | only difference is that we now use the functions returned by
166 | ``build_finetune_functions``.
167 |
168 | Running the Code
169 | ++++++++++++++++
170 |
171 | The user can run the code by calling:
172 |
173 | .. code-block:: bash
174 |
175 | python code/SdA.py
176 |
177 | By default the code runs 15 pre-training epochs for each layer, with a batch
178 | size of 1. The corruption level for the first layer is 0.1, for the second
179 | 0.2 and 0.3 for the third. The pretraining learning rate is 0.001 and
180 | the finetuning learning rate is 0.1. Pre-training takes 585.01 minutes, with
181 | an average of 13 minutes per epoch. Fine-tuning is completed after 36 epochs
182 | in 444.2 minutes, with an average of 12.34 minutes per epoch. The final
183 | validation score is 1.39% with a testing score of 1.3%.
184 | These results were obtained on a machine with an Intel
185 | Xeon E5430 @ 2.66GHz CPU, with a single-threaded GotoBLAS.
186 |
187 |
188 | Tips and Tricks
189 | +++++++++++++++
190 |
191 | One way to improve the running time of your code (given that you have
192 | sufficient memory available) is to compute how the network, up to layer
193 | :math:`k-1`, transforms your data. Namely, you start by training your first
194 | layer dA. Once it is trained, you can compute the hidden unit values for
195 | every datapoint in your dataset and store this as a new dataset that you will
196 | use to train the dA corresponding to layer 2. Once you have trained the dA for
197 | layer 2, you compute, in a similar fashion, the dataset for layer 3 and so on.
198 | You can see that, at this point, the dAs are trained individually, and
199 | they just provide (one to the other) a non-linear transformation of the input.
200 | Once all dAs are trained, you can start fine-tuning the model.
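
A minimal NumPy sketch of this trick (the function and variable names are
hypothetical, and a random matrix stands in for the trained layer-1 encoder
weights):

.. code-block:: python

    import numpy

    def sigmoid(z):
        return 1.0 / (1.0 + numpy.exp(-z))

    def transform_dataset(data, weights, biases):
        # propagate the data through the already-trained encoding layers
        for W, b in zip(weights, biases):
            data = sigmoid(numpy.dot(data, W) + b)
        return data

    rng = numpy.random.RandomState(0)
    data = rng.rand(1000, 784)                       # stand-in for the raw dataset
    W1, b1 = 0.01 * rng.randn(784, 500), numpy.zeros(500)

    # dataset used to train the layer-2 dA: the hidden code of layer 1
    data_for_layer2 = transform_dataset(data, [W1], [b1])
    print(data_for_layer2.shape)                     # (1000, 500)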
201 |
--------------------------------------------------------------------------------
/code/test.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import numpy
4 |
5 | import convolutional_mlp
6 | import dA
7 | import DBN
8 | import logistic_cg
9 | import logistic_sgd
10 | import mlp
11 | import rbm
12 | import rnnrbm
13 | import SdA
14 |
15 |
16 | def test_logistic_sgd():
17 | logistic_sgd.sgd_optimization_mnist(n_epochs=10)
18 |
19 |
20 | def test_logistic_cg():
21 | try:
22 | import scipy
23 | logistic_cg.cg_optimization_mnist(n_epochs=10)
24 | except ImportError:
25 | from nose.plugins.skip import SkipTest
26 | raise SkipTest(
27 | 'SciPy not available. Needed for the logistic_cg example.')
28 |
29 |
30 | def test_mlp():
31 | mlp.test_mlp(n_epochs=1)
32 |
33 |
34 | def test_convolutional_mlp():
35 | convolutional_mlp.evaluate_lenet5(n_epochs=1, nkerns=[5, 5])
36 |
37 |
38 | def test_dA():
39 | dA.test_dA(training_epochs=1, output_folder='tmp_dA_plots')
40 |
41 |
42 | def test_SdA():
43 | SdA.test_SdA(pretraining_epochs=1, training_epochs=1, batch_size=300)
44 |
45 |
46 | def test_dbn():
47 | DBN.test_DBN(pretraining_epochs=1, training_epochs=1, batch_size=300)
48 |
49 |
50 | def test_rbm():
51 | rbm.test_rbm(training_epochs=1, batch_size=300, n_chains=1, n_samples=1,
52 | n_hidden=20, output_folder='tmp_rbm_plots')
53 |
54 |
55 | def test_rnnrbm():
56 | rnnrbm.test_rnnrbm(num_epochs=1)
57 |
58 |
59 | def speed():
60 | """
61 | This function modifies the Theano configuration and does not restore it!
62 | """
63 |
64 | algo = ['logistic_sgd', 'logistic_cg', 'mlp', 'convolutional_mlp',
65 | 'dA', 'SdA', 'DBN', 'rbm', 'rnnrbm']
66 | to_exec = [True] * len(algo)
67 | # to_exec = [False] * len(algo)
68 | # to_exec[-1] = True
69 | do_float64 = True
70 | do_float32 = True
71 | do_gpu = True
72 |
73 | algo_executed = [s for idx, s in enumerate(algo) if to_exec[idx]]
74 | # Expected timings are from the buildbot, which has an i7-920 CPU @
75 | # 2.67GHz with hyperthreading enabled, 12G of RAM, and a GeForce GTX
76 | # 580 GPU. OS=Fedora 14, gcc=4.5.1, python/BLAS from EPD
77 | # 7.1-2 (python 2.7.2, mkl version unknown). BLAS with only 1 thread.
78 |
79 | expected_times_64 = numpy.asarray([9.8, 22.5, 76.1, 73.7, 116.4,
80 | 346.9, 381.9, 558.1, 186.3])
81 | expected_times_32 = numpy.asarray([8.1, 17.9, 42.5, 66.5, 71,
82 | 191.2, 226.8, 432.8, 176.2])
83 |
84 | # Numbers with just 1 decimal are new values that are faster with
85 | # Theano version 0.5rc2. Other numbers are older. They are not
86 | # updated, as we were faster in the past!
87 | # TODO: find out why and fix this!
88 |
89 | # Here are the values for the buildbot on February 3rd, 2012 with a GTX 285
90 | # sgd, cg mlp conv da
91 | # sda dbn rbm
92 | # gpu times[3.72957802, 9.94316864, 29.1772666, 9.13857198, 25.91144657,
93 | # 18.30802011, 53.38651466, 285.41386175]
94 | # expected [3.076634879, 7.555234910, 18.99226785, 9.58915591, 24.130070450,
95 | # 24.77524018, 92.66246653, 322.340329170]
96 | # sgd, cg mlp conv da
97 | # sda dbn rbm
98 | #expected/get [0.82492841, 0.75984178, 0.65092691, 1.04930573, 0.93125138
99 | # 1.35324519 1.7356905 1.12937868]
100 |
101 | expected_times_gpu = numpy.asarray([3.0, 7.55523491, 18.99226785,
102 | 5.8, 21.5,
103 | 11.8, 47.9, 290.1, 315.4])
104 |
105 | expected_times_64 = [s for idx, s in enumerate(expected_times_64)
106 | if to_exec[idx]]
107 | expected_times_32 = [s for idx, s in enumerate(expected_times_32)
108 | if to_exec[idx]]
109 | expected_times_gpu = [s for idx, s in enumerate(expected_times_gpu)
110 | if to_exec[idx]]
111 |
112 | def time_test(m, l, idx, f, **kwargs):
113 | if not to_exec[idx]:
114 | return
115 | print algo[idx]
116 | ts = m.call_time
117 | try:
118 | f(**kwargs)
119 | except Exception, e:
120 | print >> sys.stderr, 'test', algo[idx], 'FAILED', e
121 | l.append(numpy.nan)
122 | return
123 | te = m.call_time
124 | l.append(te - ts)
125 |
126 | def do_tests():
127 | m = theano.compile.mode.get_default_mode()
128 | l = []
129 | time_test(m, l, 0, logistic_sgd.sgd_optimization_mnist, n_epochs=30)
130 | time_test(m, l, 1, logistic_cg.cg_optimization_mnist, n_epochs=30)
131 | time_test(m, l, 2, mlp.test_mlp, n_epochs=5)
132 | time_test(m, l, 3, convolutional_mlp.evaluate_lenet5, n_epochs=5,
133 | nkerns=[5, 5])
134 | time_test(m, l, 4, dA.test_dA, training_epochs=2,
135 | output_folder='tmp_dA_plots')
136 | time_test(m, l, 5, SdA.test_SdA, pretraining_epochs=1,
137 | training_epochs=2, batch_size=300)
138 | time_test(m, l, 6, DBN.test_DBN, pretraining_epochs=1,
139 | training_epochs=2, batch_size=300)
140 | time_test(m, l, 7, rbm.test_rbm, training_epochs=1, batch_size=300,
141 | n_chains=1, n_samples=1, output_folder='tmp_rbm_plots')
142 | time_test(m, l, 8, rnnrbm.test_rnnrbm, num_epochs=1)
143 | return numpy.asarray(l)
144 |
145 | #test in float64 in FAST_RUN mode on the cpu
146 | import theano
147 | if do_float64:
148 | theano.config.floatX = 'float64'
149 | theano.config.mode = 'FAST_RUN'
150 | float64_times = do_tests()
151 | print >> sys.stderr, algo_executed
152 | print >> sys.stderr, 'float64 times', float64_times
153 | print >> sys.stderr, 'float64 expected', expected_times_64
154 | print >> sys.stderr, 'float64 % expected/get', (
155 | expected_times_64 / float64_times)
156 |
157 | #test in float32 in FAST_RUN mode on the cpu
158 | theano.config.floatX = 'float32'
159 | if do_float32:
160 | float32_times = do_tests()
161 | print >> sys.stderr, algo_executed
162 | print >> sys.stderr, 'float32 times', float32_times
163 | print >> sys.stderr, 'float32 expected', expected_times_32
164 | print >> sys.stderr, 'float32 % expected/get', (
165 | expected_times_32 / float32_times)
166 |
167 | if do_float64:
168 | print >> sys.stderr, 'float64/float32', (
169 | float64_times / float32_times)
170 | print >> sys.stderr
171 | print >> sys.stderr, ('Duplicate the timing to have everything '
172 | 'in one place')
173 | print >> sys.stderr, algo_executed
174 | print >> sys.stderr, 'float64 times', float64_times
175 | print >> sys.stderr, 'float64 expected', expected_times_64
176 | print >> sys.stderr, 'float64 % expected/get', (
177 | expected_times_64 / float64_times)
178 | print >> sys.stderr, 'float32 times', float32_times
179 | print >> sys.stderr, 'float32 expected', expected_times_32
180 | print >> sys.stderr, 'float32 % expected/get', (
181 | expected_times_32 / float32_times)
182 |
183 | print >> sys.stderr, 'float64/float32', (
184 | float64_times / float32_times)
185 | print >> sys.stderr, 'expected float64/float32', (
186 | expected_times_64 / float32_times)
187 |
188 | #test in float32 in FAST_RUN mode on the gpu
189 | import theano.sandbox.cuda
190 | if do_gpu:
191 | theano.sandbox.cuda.use('gpu')
192 | gpu_times = do_tests()
193 | print >> sys.stderr, algo_executed
194 | print >> sys.stderr, 'gpu times', gpu_times
195 | print >> sys.stderr, 'gpu expected', expected_times_gpu
196 | print >> sys.stderr, 'gpu % expected/get', (
197 | expected_times_gpu / gpu_times)
198 |
199 | if do_float64:
200 | print >> sys.stderr, 'float64/gpu', float64_times / gpu_times
201 |
202 | if (do_float64 + do_float32 + do_gpu) > 1:
203 | print >> sys.stderr
204 | print >> sys.stderr, ('Duplicate the timing to have everything '
205 | 'in one place')
206 | print >> sys.stderr, algo_executed
207 | if do_float64:
208 | print >> sys.stderr, 'float64 times', float64_times
209 | print >> sys.stderr, 'float64 expected', expected_times_64
210 | print >> sys.stderr, 'float64 % expected/get', (
211 | expected_times_64 / float64_times)
212 | if do_float32:
213 | print >> sys.stderr, 'float32 times', float32_times
214 | print >> sys.stderr, 'float32 expected', expected_times_32
215 | print >> sys.stderr, 'float32 % expected/get', (
216 | expected_times_32 / float32_times)
217 | if do_gpu:
218 | print >> sys.stderr, 'gpu times', gpu_times
219 | print >> sys.stderr, 'gpu expected', expected_times_gpu
220 | print >> sys.stderr, 'gpu % expected/get', (
221 | expected_times_gpu / gpu_times)
222 |
223 | print
224 | if do_float64 and do_float32:
225 | print >> sys.stderr, 'float64/float32', (
226 | float64_times / float32_times)
227 | print >> sys.stderr, 'expected float64/float32', (
228 | expected_times_64 / float32_times)
229 | if do_float64 and do_gpu:
230 | print >> sys.stderr, 'float64/gpu', float64_times / gpu_times
231 | print >> sys.stderr, 'expected float64/gpu', (
232 | expected_times_64 / gpu_times)
233 | if do_float32 and do_gpu:
234 | print >> sys.stderr, 'float32/gpu', float32_times / gpu_times
235 | print >> sys.stderr, 'expected float32/gpu', (
236 | expected_times_32 / gpu_times)
237 |
238 | def compare(x, y):
239 | ratio = x / y
240 | # If there is more than a 5% difference between the expected
241 | # time and the real time, we consider this an error.
242 | return sum((ratio < 0.95) + (ratio > 1.05))
243 |
244 | print
245 | if do_float64:
246 | err = compare(expected_times_64, float64_times)
247 | print >> sys.stderr, 'speed_failure_float64=' + str(err)
248 | if do_float32:
249 | err = compare(expected_times_32, float32_times)
250 | print >> sys.stderr, 'speed_failure_float32=' + str(err)
251 | if do_gpu:
252 | err = compare(expected_times_gpu, gpu_times)
253 | print >> sys.stderr, 'speed_failure_gpu=' + str(err)
254 |
255 | assert not numpy.isnan(gpu_times).any()
256 |
--------------------------------------------------------------------------------
/code/logistic_cg.py:
--------------------------------------------------------------------------------
1 | """
2 | This tutorial introduces logistic regression using Theano and conjugate
3 | gradient descent.
4 |
5 | Logistic regression is a probabilistic, linear classifier. It is parametrized
6 | by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is
7 | done by projecting data points onto a set of hyperplanes, the distance to
8 | which is used to determine a class membership probability.
9 |
10 | Mathematically, this can be written as:
11 |
12 | .. math::
13 | P(Y=i|x, W,b) &= softmax_i(W x + b) \\
14 | &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}}
15 |
16 |
17 | The output of the model or prediction is then done by taking the argmax of
18 | the vector whose i'th element is P(Y=i|x).
19 |
20 | .. math::
21 |
22 | y_{pred} = argmax_i P(Y=i|x,W,b)
23 |
24 |
25 | This tutorial presents a conjugate gradient optimization method that is
26 | suitable for smaller datasets.
27 |
28 |
29 | References:
30 |
31 | - textbooks: "Pattern Recognition and Machine Learning" -
32 | Christopher M. Bishop, section 4.3.2
33 |
34 |
35 | """
36 | __docformat__ = 'restructuredtext en'
37 |
38 |
39 | import os
40 | import sys
41 | import time
42 |
43 | import numpy
44 |
45 | import theano
46 | import theano.tensor as T
47 |
48 | from logistic_sgd import load_data
49 |
50 |
51 | class LogisticRegression(object):
52 | """Multi-class Logistic Regression Class
53 |
54 | The logistic regression is fully described by a weight matrix :math:`W`
55 | and bias vector :math:`b`. Classification is done by projecting data
56 | points onto a set of hyperplanes, the distance to which is used to
57 | determine a class membership probability.
58 | """
59 |
60 | def __init__(self, input, n_in, n_out):
61 | """ Initialize the parameters of the logistic regression
62 |
63 | :type input: theano.tensor.TensorType
64 | :param input: symbolic variable that describes the input of the
65 | architecture ( one minibatch)
66 |
67 | :type n_in: int
68 | :param n_in: number of input units, the dimension of the space in
69 | which the datapoint lies
70 |
71 | :type n_out: int
72 | :param n_out: number of output units, the dimension of the space in
73 | which the target lies
74 |
75 | """
76 |
77 | # initialize theta = (W,b) with 0s; W gets the shape (n_in, n_out),
78 | # while b is a vector of n_out elements, making theta a vector of
79 | # n_in*n_out + n_out elements
80 | self.theta = theano.shared(
81 | value=numpy.zeros(
82 | n_in * n_out + n_out,
83 | dtype=theano.config.floatX
84 | ),
85 | name='theta',
86 | borrow=True
87 | )
88 | # W is represented by the first n_in*n_out elements of theta
89 | self.W = self.theta[0:n_in * n_out].reshape((n_in, n_out))
90 | # b is the rest (last n_out elements)
91 | self.b = self.theta[n_in * n_out:n_in * n_out + n_out]
92 |
93 | # compute vector of class-membership probabilities in symbolic form
94 | self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)
95 |
96 | # compute prediction as class whose probability is maximal in
97 | # symbolic form
98 | self.y_pred = T.argmax(self.p_y_given_x, axis=1)
99 |
100 | def negative_log_likelihood(self, y):
101 | """Return the negative log-likelihood of the prediction of this model
102 | under a given target distribution.
103 |
104 | .. math::
105 |
106 | \frac{1}{|\mathcal{D}|}\mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
107 | \frac{1}{|\mathcal{D}|}\sum_{i=0}^{|\mathcal{D}|}
108 | \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
109 | \ell (\theta=\{W,b\}, \mathcal{D})
110 |
111 | :type y: theano.tensor.TensorType
112 | :param y: corresponds to a vector that gives for each example the
113 | correct label
114 | """
115 | return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
116 |
117 | def errors(self, y):
118 | """Return a float representing the number of errors in the minibatch
119 | over the total number of examples of the minibatch
120 |
121 | :type y: theano.tensor.TensorType
122 | :param y: corresponds to a vector that gives for each example
123 | the correct label
124 | """
125 |
126 | # check if y has same dimension of y_pred
127 | if y.ndim != self.y_pred.ndim:
128 | raise TypeError(
129 | 'y should have the same shape as self.y_pred',
130 | ('y', y.type, 'y_pred', self.y_pred.type)
131 | )
132 | # check if y is of the correct datatype
133 | if y.dtype.startswith('int'):
134 | # the T.neq operator returns a vector of 0s and 1s, where 1
135 | # represents a mistake in prediction
136 | return T.mean(T.neq(self.y_pred, y))
137 | else:
138 | raise NotImplementedError()
139 |
140 |
141 | def cg_optimization_mnist(n_epochs=50, mnist_pkl_gz='mnist.pkl.gz'):
142 | """Demonstrate conjugate gradient optimization of a log-linear model
143 |
144 | This is demonstrated on MNIST.
145 |
146 | :type n_epochs: int
147 | :param n_epochs: number of epochs to run the optimizer
148 |
149 | :type mnist_pkl_gz: string
150 | :param mnist_pkl_gz: the path of the mnist training file from
151 | http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
152 |
153 | """
154 | #############
155 | # LOAD DATA #
156 | #############
157 | datasets = load_data(mnist_pkl_gz)
158 |
159 | train_set_x, train_set_y = datasets[0]
160 | valid_set_x, valid_set_y = datasets[1]
161 | test_set_x, test_set_y = datasets[2]
162 |
163 | batch_size = 600 # size of the minibatch
164 |
165 | n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
166 | n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
167 | n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size
168 |
169 | n_in = 28 * 28 # number of input units
170 | n_out = 10 # number of output units
171 |
172 | ######################
173 | # BUILD ACTUAL MODEL #
174 | ######################
175 | print '... building the model'
176 |
177 | # allocate symbolic variables for the data
178 | minibatch_offset = T.lscalar() # offset to the start of a [mini]batch
179 | x = T.matrix() # the data is presented as rasterized images
180 | y = T.ivector() # the labels are presented as 1D vector of
181 | # [int] labels
182 |
183 | # construct the logistic regression class
184 | classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)
185 |
186 | # the cost we minimize during training is the negative log likelihood of
187 | # the model in symbolic format
188 | cost = classifier.negative_log_likelihood(y).mean()
189 |
190 | # compile a theano function that computes the mistakes that are made by
191 | # the model on a minibatch
192 | test_model = theano.function(
193 | [minibatch_offset],
194 | classifier.errors(y),
195 | givens={
196 | x: test_set_x[minibatch_offset:minibatch_offset + batch_size],
197 | y: test_set_y[minibatch_offset:minibatch_offset + batch_size]
198 | },
199 | name="test"
200 | )
201 |
202 | validate_model = theano.function(
203 | [minibatch_offset],
204 | classifier.errors(y),
205 | givens={
206 | x: valid_set_x[minibatch_offset: minibatch_offset + batch_size],
207 | y: valid_set_y[minibatch_offset: minibatch_offset + batch_size]
208 | },
209 | name="validate"
210 | )
211 |
212 | # compile a theano function that returns the cost of a minibatch
213 | batch_cost = theano.function(
214 | [minibatch_offset],
215 | cost,
216 | givens={
217 | x: train_set_x[minibatch_offset: minibatch_offset + batch_size],
218 | y: train_set_y[minibatch_offset: minibatch_offset + batch_size]
219 | },
220 | name="batch_cost"
221 | )
222 |
223 | # compile a theano function that returns the gradient of the minibatch
224 | # with respect to theta
225 | batch_grad = theano.function(
226 | [minibatch_offset],
227 | T.grad(cost, classifier.theta),
228 | givens={
229 | x: train_set_x[minibatch_offset: minibatch_offset + batch_size],
230 | y: train_set_y[minibatch_offset: minibatch_offset + batch_size]
231 | },
232 | name="batch_grad"
233 | )
234 |
235 | # creates a function that computes the average cost on the training set
236 | def train_fn(theta_value):
237 | classifier.theta.set_value(theta_value, borrow=True)
238 | train_losses = [batch_cost(i * batch_size)
239 | for i in xrange(n_train_batches)]
240 | return numpy.mean(train_losses)
241 |
242 | # creates a function that computes the average gradient of cost with
243 | # respect to theta
244 | def train_fn_grad(theta_value):
245 | classifier.theta.set_value(theta_value, borrow=True)
246 | grad = batch_grad(0)
247 | for i in xrange(1, n_train_batches):
248 | grad += batch_grad(i * batch_size)
249 | return grad / n_train_batches
250 |
251 | validation_scores = [numpy.inf, 0]
252 |
253 | # creates the validation function
254 | def callback(theta_value):
255 | classifier.theta.set_value(theta_value, borrow=True)
256 | #compute the validation loss
257 | validation_losses = [validate_model(i * batch_size)
258 | for i in xrange(n_valid_batches)]
259 | this_validation_loss = numpy.mean(validation_losses)
260 | print('validation error %f %%' % (this_validation_loss * 100.,))
261 |
262 | # check if it is better than the best validation score obtained so far
263 | if this_validation_loss < validation_scores[0]:
264 | # if so, replace the old one, and compute the score on the
265 | # testing dataset
266 | validation_scores[0] = this_validation_loss
267 | test_losses = [test_model(i * batch_size)
268 | for i in xrange(n_test_batches)]
269 | validation_scores[1] = numpy.mean(test_losses)
270 |
271 | ###############
272 | # TRAIN MODEL #
273 | ###############
274 |
275 | # using scipy conjugate gradient optimizer
276 | import scipy.optimize
277 | print ("Optimizing using scipy.optimize.fmin_cg...")
278 | start_time = time.clock()
279 | best_w_b = scipy.optimize.fmin_cg(
280 | f=train_fn,
281 | x0=numpy.zeros((n_in + 1) * n_out, dtype=x.dtype),
282 | fprime=train_fn_grad,
283 | callback=callback,
284 | disp=0,
285 | maxiter=n_epochs
286 | )
287 | end_time = time.clock()
288 | print(
289 | (
290 | 'Optimization complete with best validation score of %f %%, with '
291 | 'test performance %f %%'
292 | )
293 | % (validation_scores[0] * 100., validation_scores[1] * 100.)
294 | )
295 |
296 | print >> sys.stderr, ('The code for file ' +
297 | os.path.split(__file__)[1] +
298 | ' ran for %.1fs' % ((end_time - start_time)))
299 |
300 |
301 | if __name__ == '__main__':
302 | cg_optimization_mnist()
303 |
--------------------------------------------------------------------------------
/doc/logreg.txt:
--------------------------------------------------------------------------------
1 | .. index:: Logistic Regression
2 |
3 | .. _logreg :
4 |
5 |
6 | Classifying MNIST digits using Logistic Regression
7 | ==================================================
8 |
9 | .. note::
10 | This section assumes familiarity with the following Theano
11 | concepts: `shared variables`_ , `basic arithmetic ops`_ , `T.grad`_ ,
12 | `floatX`_. If you intend to run the code on GPU also read `GPU`_.
13 |
14 | .. note::
15 | The code for this section is available for download `here`_.
16 |
17 | .. _here: http://deeplearning.net/tutorial/code/logistic_sgd.py
18 |
19 | .. _shared variables: http://deeplearning.net/software/theano/tutorial/examples.html#using-shared-variables
20 |
21 | .. _basic arithmetic ops: http://deeplearning.net/software/theano/tutorial/adding.html#adding-two-scalars
22 |
23 | .. _T.grad: http://deeplearning.net/software/theano/tutorial/examples.html#computing-gradients
24 |
25 | .. _floatX: http://deeplearning.net/software/theano/library/config.html#config.floatX
26 |
27 | .. _GPU: http://deeplearning.net/software/theano/tutorial/using_gpu.html
28 |
29 | In this section, we show how Theano can be used to implement the most basic
30 | classifier: the logistic regression. We start off with a quick primer on the
31 | model, which serves both as a refresher and as a way to anchor the notation and
32 | show how mathematical expressions are mapped onto Theano graphs.
33 |
34 | In the deepest of machine learning traditions, this tutorial will tackle the exciting
35 | problem of MNIST digit classification.
36 |
37 | The Model
38 | +++++++++
39 |
40 | Logistic regression is a probabilistic, linear classifier. It is parametrized
41 | by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is
42 | done by projecting an input vector onto a set of hyperplanes, each of which
43 | corresponds to a class. The distance from the input to a hyperplane reflects
44 | the probability that the input is a member of the corresponding class.
45 |
46 | Mathematically, the probability that an input vector :math:`x` is a member of a
47 | class :math:`i`, a value of a stochastic variable :math:`Y`, can be written as:
48 |
49 | .. math::
50 | P(Y=i|x, W,b) &= softmax_i(W x + b) \\
51 | &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}}
52 |
53 | The model's prediction :math:`y_{pred}` is the class whose probability is maximal, specifically:
54 |
55 | .. math::
56 | y_{pred} = {\rm argmax}_i P(Y=i|x,W,b)
57 |
58 | The code to do this in Theano is the following:
59 |
60 | .. literalinclude:: ../code/logistic_sgd.py
61 | :start-after: start-snippet-1
62 | :end-before: end-snippet-1
63 |
64 | Since the parameters of the model must maintain a persistent state throughout
65 | training, we allocate shared variables for :math:`W,b`. This declares them both
66 | as being symbolic Theano variables, but also initializes their contents. The
67 | dot and softmax operators are then used to compute the vector :math:`P(Y|x,
68 | W,b)`. The result ``p_y_given_x`` is a symbolic variable of vector-type.
69 |
70 | To get the actual model prediction, we can use the ``T.argmax`` operator, which
71 | will return the index at which ``p_y_given_x`` is maximal (i.e. the class with
72 | maximum probability).
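
For readers who prefer to see the arithmetic outside of a symbolic graph, here
is a small NumPy sketch of the same computation (toy shapes, with ``W`` and
``b`` zero-initialized as in the tutorial):

.. code-block:: python

    import numpy

    def softmax(z):
        # subtract the row-wise maximum for numerical stability
        e = numpy.exp(z - z.max(axis=1, keepdims=True))
        return e / e.sum(axis=1, keepdims=True)

    rng = numpy.random.RandomState(0)
    x = rng.rand(5, 784)            # a toy minibatch of 5 rasterized "images"
    W = numpy.zeros((784, 10))      # one column of weights per class
    b = numpy.zeros(10)

    p_y_given_x = softmax(numpy.dot(x, W) + b)
    y_pred = p_y_given_x.argmax(axis=1)
    print(p_y_given_x.shape, y_pred)   # (5, 10) and five predicted classes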
73 |
74 | Now of course, the model we have defined so far does not do anything useful
75 | yet, since its parameters are still in their initial state. The following
76 | section will thus cover how to learn the optimal parameters.
77 |
78 |
79 | .. note::
80 | For a complete list of Theano ops, see: `list of ops `_
81 |
82 |
83 | Defining a Loss Function
84 | ++++++++++++++++++++++++
85 |
86 | Learning optimal model parameters involves minimizing a loss function. In the
87 | case of multi-class logistic regression, it is very common to use the negative
88 | log-likelihood as the loss. This is equivalent to maximizing the likelihood of the
89 | data set :math:`\cal{D}` under the model parameterized by :math:`\theta`. Let
90 | us first start by defining the likelihood :math:`\cal{L}` and loss
91 | :math:`\ell`:
92 |
93 | .. math::
94 |
95 | \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
96 | \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
97 | \ell (\theta=\{W,b\}, \mathcal{D}) = - \mathcal{L} (\theta=\{W,b\}, \mathcal{D})
98 |
99 | While entire books are dedicated to the topic of minimization, gradient
100 | descent is by far the simplest method for minimizing arbitrary non-linear
101 | functions. This tutorial will use the method of stochastic gradient descent with
102 | mini-batches (MSGD). See :ref:`opt_SGD` for more details.
103 |
104 | The following Theano code defines the (symbolic) loss for a given minibatch:
105 |
106 | .. literalinclude:: ../code/logistic_sgd.py
107 | :start-after: start-snippet-2
108 | :end-before: end-snippet-2
109 |
110 | .. note::
111 |
112 | Even though the loss is formally defined as the *sum*, over the data set,
113 | of individual error terms, in practice, we use the *mean* (``T.mean``)
114 | in the code. This allows for the choice of learning rate to be less dependent
115 | on the minibatch size.
116 |
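As a concrete, non-symbolic check of the expression above, the following NumPy
sketch mirrors ``-T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y])`` on a
toy example:

.. code-block:: python

    import numpy

    def negative_log_likelihood(p_y_given_x, y):
        # mean (not sum) of the log-probability assigned to the correct class
        return -numpy.mean(numpy.log(p_y_given_x[numpy.arange(y.shape[0]), y]))

    p = numpy.array([[0.7, 0.2, 0.1],
                     [0.1, 0.8, 0.1]])
    y = numpy.array([0, 1])
    print(negative_log_likelihood(p, y))   # -(log(0.7) + log(0.8)) / 2 ~= 0.29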
117 |
118 | Creating a LogisticRegression class
119 | +++++++++++++++++++++++++++++++++++
120 |
121 | We now have all the tools we need to define a ``LogisticRegression`` class, which
122 | encapsulates the basic behaviour of logistic regression. The code is very
123 | similar to what we have covered so far, and should be self-explanatory.
124 |
125 | .. literalinclude:: ../code/logistic_sgd.py
126 | :pyobject: LogisticRegression
127 |
128 | We instantiate this class as follows:
129 |
130 | .. literalinclude:: ../code/logistic_sgd.py
131 | :start-after: index = T.lscalar()
132 | :end-before: # the cost we minimize during
133 |
134 | We start by allocating symbolic variables for the training inputs :math:`x` and
135 | their corresponding classes :math:`y`. Note that ``x`` and ``y`` are defined
136 | outside the scope of the ``LogisticRegression`` object. Since the class
137 | requires the input to build its graph, it is passed as a parameter of the
138 | ``__init__`` function. This is useful in case you want to connect instances of
139 | such classes to form a deep network. The output of one layer can be passed as
140 | the input of the layer above. (This tutorial does not build a multi-layer
141 | network, but this code will be reused in future tutorials that do.)
142 |
143 | Finally, we define a (symbolic) ``cost`` variable to minimize, using the instance
144 | method ``classifier.negative_log_likelihood``.
145 |
146 | .. literalinclude:: ../code/logistic_sgd.py
147 | :start-after: classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)
148 | :end-before: # compiling a Theano function that computes the mistakes
149 |
150 | Note that ``x`` is an implicit symbolic input to the definition of ``cost``,
151 | because the symbolic variables of ``classifier`` were defined in terms of ``x``
152 | at initialization.
153 |
154 | Learning the Model
155 | ++++++++++++++++++
156 |
157 | To implement MSGD in most programming languages (C/C++, Matlab, Python), one
158 | would start by manually deriving the expressions for the gradient of the loss
159 | with respect to the parameters: in this case :math:`\partial{\ell}/\partial{W}`,
160 | and :math:`\partial{\ell}/\partial{b}`. This can get pretty tricky for complex
161 | models, as expressions for :math:`\partial{\ell}/\partial{\theta}` can get
162 | fairly complex, especially when taking into account problems of numerical
163 | stability.
164 |
165 | With Theano, this work is greatly simplified. It performs
166 | automatic differentiation and applies certain math transforms to improve
167 | numerical stability.
168 |
169 | To get the gradients :math:`\partial{\ell}/\partial{W}` and
170 | :math:`\partial{\ell}/\partial{b}` in Theano, simply do the following:
171 |
172 | .. literalinclude:: ../code/logistic_sgd.py
173 | :start-after: # compute the gradient of cost
174 | :end-before: # start-snippet-3
175 |
176 | ``g_W`` and ``g_b`` are symbolic variables, which can be used as part
177 | of a computation graph. The function ``train_model``, which performs one step
178 | of gradient descent, can then be defined as follows:
179 |
180 | .. literalinclude:: ../code/logistic_sgd.py
181 | :start-after: start-snippet-3
182 | :end-before: end-snippet-3
183 |
184 | ``updates`` is a list of pairs. In each pair, the first element is the symbolic
185 | variable to be updated in the step, and the second element is the symbolic
186 | function for calculating its new value. Similarly, ``givens`` is a dictionary
187 | whose keys are symbolic variables and whose values specify
188 | their replacements during the step. The function ``train_model`` is then defined such
189 | that:
190 |
191 | * the input is the mini-batch index ``index`` that, together with the batch
192 | size (which is not an input since it is fixed) defines :math:`x` with
193 | corresponding labels :math:`y`
194 | * the return value is the cost/loss associated with the x, y defined by
195 | the ``index``
196 | * on every function call, it will first replace ``x`` and ``y`` with the slices
197 | from the training set specified by ``index``. Then, it will evaluate the cost
198 | associated with that minibatch and apply the operations defined by the
199 | ``updates`` list.
200 |
201 | Each time ``train_model(index)`` is called, it will thus compute and return the
202 | cost of a minibatch, while also performing a step of MSGD. The entire learning
203 | algorithm thus consists in looping over all examples in the dataset, considering
204 | all the examples in one minibatch at a time,
205 | and repeatedly calling the ``train_model`` function.
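
The following condensed, self-contained sketch shows these mechanics end to
end, with toy random data standing in for MNIST (the shapes, batch size and
learning rate are illustrative only, not the tutorial's defaults):

.. code-block:: python

    import numpy
    import theano
    import theano.tensor as T

    rng = numpy.random.RandomState(0)
    train_x = theano.shared(rng.rand(600, 784).astype(theano.config.floatX))
    train_y = theano.shared(rng.randint(0, 10, size=600).astype('int32'))
    W = theano.shared(numpy.zeros((784, 10), dtype=theano.config.floatX), name='W')
    b = theano.shared(numpy.zeros(10, dtype=theano.config.floatX), name='b')

    index = T.lscalar()             # minibatch index
    x = T.matrix('x')
    y = T.ivector('y')
    batch_size, learning_rate = 60, 0.13

    p_y_given_x = T.nnet.softmax(T.dot(x, W) + b)
    cost = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y])
    g_W, g_b = T.grad(cost, [W, b])

    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        # updates: pairs of (shared variable, its new value)
        updates=[(W, W - learning_rate * g_W),
                 (b, b - learning_rate * g_b)],
        # givens: replace x, y by the minibatch selected by `index`
        givens={x: train_x[index * batch_size: (index + 1) * batch_size],
                y: train_y[index * batch_size: (index + 1) * batch_size]})

    print(train_model(0))           # one MSGD step on the first minibatch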
206 |
207 |
208 | Testing the model
209 | +++++++++++++++++
210 |
211 | As explained in :ref:`opt_learn_classifier`, when testing the model we are
212 | interested in the number of misclassified examples (and not only in the likelihood).
213 | The ``LogisticRegression`` class therefore has an extra instance method, which
214 | builds the symbolic graph for retrieving the number of misclassified examples in
215 | each minibatch.
216 |
217 | The code is as follows:
218 |
219 | .. literalinclude:: ../code/logistic_sgd.py
220 | :pyobject: LogisticRegression.errors
221 |
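The quantity being computed is just the mean of a 0/1 vector that marks the
misclassified examples; a tiny NumPy analogue of ``LogisticRegression.errors``
(with made-up predictions and labels) is:

.. code-block:: python

    import numpy

    y_pred = numpy.array([1, 0, 3, 3, 9])   # hypothetical predictions
    y_true = numpy.array([1, 0, 3, 8, 9])   # hypothetical labels
    error_rate = numpy.mean(y_pred != y_true)
    print(error_rate)                        # 0.2 -> one mistake in five examples
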
222 | We then create a function ``test_model`` and a function ``validate_model``,
223 | which we can call to retrieve this value. As you will see shortly,
224 | ``validate_model`` is key to our early-stopping implementation (see
225 | :ref:`opt_early_stopping`). These functions take a minibatch index and compute,
226 | for the examples in that minibatch, the number that were misclassified by the
227 | model. The only difference between them is that ``test_model`` draws its
228 | minibatches from the testing set, while ``validate_model`` draws its from the
229 | validation set.
230 |
231 | .. literalinclude:: ../code/logistic_sgd.py
232 | :start-after: cost = classifier.negative_log_likelihood(y)
233 | :end-before: # compute the gradient of cost
234 |
235 | Putting it All Together
236 | +++++++++++++++++++++++
237 |
238 | The finished product is as follows.
239 |
240 | .. literalinclude:: ../code/logistic_sgd.py
241 |
242 | The user can learn to classify MNIST digits with SGD logistic regression by typing, from
243 | within the DeepLearningTutorials folder:
244 |
245 | .. code-block:: bash
246 |
247 | python code/logistic_sgd.py
248 |
249 | The output one should expect is of the form:
250 |
251 | .. code-block:: bash
252 |
253 | ...
254 | epoch 72, minibatch 83/83, validation error 7.510417 %
255 | epoch 72, minibatch 83/83, test error of best model 7.510417 %
256 | epoch 73, minibatch 83/83, validation error 7.500000 %
257 | epoch 73, minibatch 83/83, test error of best model 7.489583 %
258 | Optimization complete with best validation score of 7.500000 %,with test performance 7.489583 %
259 | The code run for 74 epochs, with 1.936983 epochs/sec
260 |
261 |
262 | On an Intel(R) Core(TM)2 Duo CPU E8400 @ 3.00 GHz the code runs with
263 | approximately 1.936 epochs/sec and it took 75 epochs to reach a test
264 | error of 7.489%. On the GPU the code does almost 10.0 epochs/sec. For this
265 | instance we used a batch size of 600.
266 |
267 | .. rubric:: Footnotes
268 |
269 | .. [#f1] For smaller datasets and simpler models, more sophisticated descent
270 | algorithms can be more effective. The sample code
271 | `logistic_cg.py `_
272 | demonstrates how to use SciPy's conjugate gradient solver with Theano
273 | on the logistic regression task.
274 |
--------------------------------------------------------------------------------
/code/rnnrbm.py:
--------------------------------------------------------------------------------
1 | # Author: Nicolas Boulanger-Lewandowski
2 | # University of Montreal (2012)
3 | # RNN-RBM deep learning tutorial
4 | # More information at http://deeplearning.net/tutorial/rnnrbm.html
5 |
6 | import glob
7 | import os
8 | import sys
9 |
10 | import numpy
11 | try:
12 | import pylab
13 | except ImportError:
14 | print (
15 | "pylab isn't available. If you use its functionality, it will crash."
16 | )
17 | print "It can be installed with 'pip install -q Pillow'"
18 |
19 | from midi.utils import midiread, midiwrite
20 | import theano
21 | import theano.tensor as T
22 | from theano.tensor.shared_randomstreams import RandomStreams
23 |
24 | # Don't use a Python long as this doesn't work on 32-bit computers.
25 | numpy.random.seed(0xbeef)
26 | rng = RandomStreams(seed=numpy.random.randint(1 << 30))
27 | theano.config.warn.subtensor_merge_bug = False
28 |
29 |
30 | def build_rbm(v, W, bv, bh, k):
31 | '''Construct a k-step Gibbs chain starting at v for an RBM.
32 |
33 | v : Theano vector or matrix
34 | If a matrix, multiple chains will be run in parallel (batch).
35 | W : Theano matrix
36 | Weight matrix of the RBM.
37 | bv : Theano vector
38 | Visible bias vector of the RBM.
39 | bh : Theano vector
40 | Hidden bias vector of the RBM.
41 | k : scalar or Theano scalar
42 | Length of the Gibbs chain.
43 |
44 | Return a (v_sample, cost, monitor, updates) tuple:
45 |
46 | v_sample : Theano vector or matrix with the same shape as `v`
47 | Corresponds to the generated sample(s).
48 | cost : Theano scalar
49 | Expression whose gradient with respect to W, bv, bh is the CD-k
50 | approximation to the log-likelihood of `v` (training example) under the
51 | RBM. The cost is averaged in the batch case.
52 | monitor: Theano scalar
53 | Pseudo log-likelihood (also averaged in the batch case).
54 | updates: dictionary of Theano variable -> Theano variable
55 | The `updates` object returned by scan.'''
56 |
57 | def gibbs_step(v):
58 | mean_h = T.nnet.sigmoid(T.dot(v, W) + bh)
59 | h = rng.binomial(size=mean_h.shape, n=1, p=mean_h,
60 | dtype=theano.config.floatX)
61 | mean_v = T.nnet.sigmoid(T.dot(h, W.T) + bv)
62 | v = rng.binomial(size=mean_v.shape, n=1, p=mean_v,
63 | dtype=theano.config.floatX)
64 | return mean_v, v
65 |
66 | chain, updates = theano.scan(lambda v: gibbs_step(v)[1], outputs_info=[v],
67 | n_steps=k)
68 | v_sample = chain[-1]
69 |
70 | mean_v = gibbs_step(v_sample)[0]
71 | monitor = T.xlogx.xlogy0(v, mean_v) + T.xlogx.xlogy0(1 - v, 1 - mean_v)
72 | monitor = monitor.sum() / v.shape[0]
73 |
74 | def free_energy(v):
75 | return -(v * bv).sum() - T.log(1 + T.exp(T.dot(v, W) + bh)).sum()
76 | cost = (free_energy(v) - free_energy(v_sample)) / v.shape[0]
77 |
78 | return v_sample, cost, monitor, updates
79 |
80 |
81 | def shared_normal(num_rows, num_cols, scale=1):
82 | '''Initialize a matrix shared variable with normally distributed
83 | elements.'''
84 | return theano.shared(numpy.random.normal(
85 | scale=scale, size=(num_rows, num_cols)).astype(theano.config.floatX))
86 |
87 |
88 | def shared_zeros(*shape):
89 | '''Initialize a vector shared variable with zero elements.'''
90 | return theano.shared(numpy.zeros(shape, dtype=theano.config.floatX))
91 |
92 |
93 | def build_rnnrbm(n_visible, n_hidden, n_hidden_recurrent):
94 | '''Construct a symbolic RNN-RBM and initialize parameters.
95 |
96 | n_visible : integer
97 | Number of visible units.
98 | n_hidden : integer
99 | Number of hidden units of the conditional RBMs.
100 | n_hidden_recurrent : integer
101 | Number of hidden units of the RNN.
102 |
103 | Return a (v, v_sample, cost, monitor, params, updates_train, v_t,
104 | updates_generate) tuple:
105 |
106 | v : Theano matrix
107 | Symbolic variable holding an input sequence (used during training)
108 | v_sample : Theano matrix
109 | Symbolic variable holding the negative particles for CD log-likelihood
110 | gradient estimation (used during training)
111 | cost : Theano scalar
112 | Expression whose gradient (considering v_sample constant) corresponds
113 | to the LL gradient of the RNN-RBM (used during training)
114 | monitor : Theano scalar
115 | Frame-level pseudo-likelihood (useful for monitoring during training)
116 | params : tuple of Theano shared variables
117 | The parameters of the model to be optimized during training.
118 | updates_train : dictionary of Theano variable -> Theano variable
119 | Update object that should be passed to theano.function when compiling
120 | the training function.
121 | v_t : Theano matrix
122 | Symbolic variable holding a generated sequence (used during sampling)
123 | updates_generate : dictionary of Theano variable -> Theano variable
124 | Update object that should be passed to theano.function when compiling
125 | the generation function.'''
126 |
127 | W = shared_normal(n_visible, n_hidden, 0.01)
128 | bv = shared_zeros(n_visible)
129 | bh = shared_zeros(n_hidden)
130 | Wuh = shared_normal(n_hidden_recurrent, n_hidden, 0.0001)
131 | Wuv = shared_normal(n_hidden_recurrent, n_visible, 0.0001)
132 | Wvu = shared_normal(n_visible, n_hidden_recurrent, 0.0001)
133 | Wuu = shared_normal(n_hidden_recurrent, n_hidden_recurrent, 0.0001)
134 | bu = shared_zeros(n_hidden_recurrent)
135 |
136 | params = W, bv, bh, Wuh, Wuv, Wvu, Wuu, bu # learned parameters as shared
137 | # variables
138 |
139 | v = T.matrix() # a training sequence
140 | u0 = T.zeros((n_hidden_recurrent,)) # initial value for the RNN hidden
141 | # units
142 |
143 | # If `v_t` is given, deterministic recurrence to compute the variable
144 | # biases bv_t, bh_t at each time step. If `v_t` is None, same recurrence
145 | # but with a separate Gibbs chain at each time step to sample (generate)
146 | # from the RNN-RBM. The resulting sample v_t is returned in order to be
147 | # passed down to the sequence history.
148 | def recurrence(v_t, u_tm1):
149 | bv_t = bv + T.dot(u_tm1, Wuv)
150 | bh_t = bh + T.dot(u_tm1, Wuh)
151 | generate = v_t is None
152 | if generate:
153 | v_t, _, _, updates = build_rbm(T.zeros((n_visible,)), W, bv_t,
154 | bh_t, k=25)
155 | u_t = T.tanh(bu + T.dot(v_t, Wvu) + T.dot(u_tm1, Wuu))
156 | return ([v_t, u_t], updates) if generate else [u_t, bv_t, bh_t]
157 |
158 | # For training, the deterministic recurrence is used to compute all the
159 | # {bv_t, bh_t, 1 <= t <= T} given v. Conditional RBMs can then be trained
160 | # in batches using those parameters.
161 | (u_t, bv_t, bh_t), updates_train = theano.scan(
162 | lambda v_t, u_tm1, *_: recurrence(v_t, u_tm1),
163 | sequences=v, outputs_info=[u0, None, None], non_sequences=params)
164 | v_sample, cost, monitor, updates_rbm = build_rbm(v, W, bv_t[:], bh_t[:],
165 | k=15)
166 | updates_train.update(updates_rbm)
167 |
168 | # symbolic loop for sequence generation
169 | (v_t, u_t), updates_generate = theano.scan(
170 | lambda u_tm1, *_: recurrence(None, u_tm1),
171 | outputs_info=[None, u0], non_sequences=params, n_steps=200)
172 |
173 | return (v, v_sample, cost, monitor, params, updates_train, v_t,
174 | updates_generate)
175 |
176 |
177 | class RnnRbm:
178 | '''Simple class to train an RNN-RBM from MIDI files and to generate sample
179 | sequences.'''
180 |
181 | def __init__(
182 | self,
183 | n_hidden=150,
184 | n_hidden_recurrent=100,
185 | lr=0.001,
186 | r=(21, 109),
187 | dt=0.3
188 | ):
189 | '''Constructs and compiles Theano functions for training and sequence
190 | generation.
191 |
192 | n_hidden : integer
193 | Number of hidden units of the conditional RBMs.
194 | n_hidden_recurrent : integer
195 | Number of hidden units of the RNN.
196 | lr : float
197 | Learning rate
198 | r : (integer, integer) tuple
199 | Specifies the pitch range of the piano-roll in MIDI note numbers,
200 | including r[0] but not r[1], such that r[1]-r[0] is the number of
201 | visible units of the RBM at a given time step. The default (21,
202 | 109) corresponds to the full range of piano (88 notes).
203 | dt : float
204 | Sampling period when converting the MIDI files into piano-rolls, or
205 | equivalently the time difference between consecutive time steps.'''
206 |
207 | self.r = r
208 | self.dt = dt
209 | (v, v_sample, cost, monitor, params, updates_train, v_t,
210 | updates_generate) = build_rnnrbm(
211 | r[1] - r[0],
212 | n_hidden,
213 | n_hidden_recurrent
214 | )
215 |
216 | gradient = T.grad(cost, params, consider_constant=[v_sample])
217 | updates_train.update(
218 | ((p, p - lr * g) for p, g in zip(params, gradient))
219 | )
220 | self.train_function = theano.function(
221 | [v],
222 | monitor,
223 | updates=updates_train
224 | )
225 | self.generate_function = theano.function(
226 | [],
227 | v_t,
228 | updates=updates_generate
229 | )
230 |
231 | def train(self, files, batch_size=100, num_epochs=200):
232 | '''Train the RNN-RBM via stochastic gradient descent (SGD) using MIDI
233 | files converted to piano-rolls.
234 |
235 | files : list of strings
236 | List of MIDI files that will be loaded as piano-rolls for training.
237 | batch_size : integer
238 | Training sequences will be split into subsequences of at most this
239 | size before applying the SGD updates.
240 | num_epochs : integer
241 | Number of epochs (passes over the training set) performed. The user
242 | can safely interrupt training with Ctrl+C at any time.'''
243 |
244 | assert len(files) > 0, 'Training set is empty!' \
245 | ' (did you download the data files?)'
246 | dataset = [midiread(f, self.r,
247 | self.dt).piano_roll.astype(theano.config.floatX)
248 | for f in files]
249 |
250 | try:
251 | for epoch in xrange(num_epochs):
252 | numpy.random.shuffle(dataset)
253 | costs = []
254 |
255 | for s, sequence in enumerate(dataset):
256 | for i in xrange(0, len(sequence), batch_size):
257 | cost = self.train_function(sequence[i:i + batch_size])
258 | costs.append(cost)
259 |
260 | print 'Epoch %i/%i' % (epoch + 1, num_epochs),
261 | print numpy.mean(costs)
262 | sys.stdout.flush()
263 |
264 | except KeyboardInterrupt:
265 | print 'Interrupted by user.'
266 |
267 | def generate(self, filename, show=True):
268 | '''Generate a sample sequence, plot the resulting piano-roll and save
269 | it as a MIDI file.
270 |
271 | filename : string
272 | A MIDI file will be created at this location.
273 | show : boolean
274 | If True, a piano-roll of the generated sequence will be shown.'''
275 |
276 | piano_roll = self.generate_function()
277 | midiwrite(filename, piano_roll, self.r, self.dt)
278 | if show:
279 | extent = (0, self.dt * len(piano_roll)) + self.r
280 | pylab.figure()
281 | pylab.imshow(piano_roll.T, origin='lower', aspect='auto',
282 | interpolation='nearest', cmap=pylab.cm.gray_r,
283 | extent=extent)
284 | pylab.xlabel('time (s)')
285 | pylab.ylabel('MIDI note number')
286 | pylab.title('generated piano-roll')
287 |
288 |
289 | def test_rnnrbm(batch_size=100, num_epochs=200):
290 | model = RnnRbm()
291 | re = os.path.join(os.path.split(os.path.dirname(__file__))[0],
292 | 'data', 'Nottingham', 'train', '*.mid')
293 | model.train(glob.glob(re),
294 | batch_size=batch_size, num_epochs=num_epochs)
295 | return model
296 |
297 | if __name__ == '__main__':
298 | model = test_rnnrbm()
299 | model.generate('sample1.mid')
300 | model.generate('sample2.mid')
301 | pylab.show()
302 |
--------------------------------------------------------------------------------
/doc/DBN.txt:
--------------------------------------------------------------------------------
1 | .. _DBN:
2 |
3 | Deep Belief Networks
4 | ====================
5 |
6 | .. note::
7 | This section assumes the reader has already read through :doc:`logreg`
8 | and :doc:`mlp` and :doc:`rbm`. Additionally it uses the following Theano
9 | functions and concepts : `T.tanh`_, `shared variables`_, `basic arithmetic
10 | ops`_, `T.grad`_, `Random numbers`_, `floatX`_. If you intend to run the
11 | code on GPU also read `GPU`_.
12 |
13 | .. _T.tanh: http://deeplearning.net/software/theano/tutorial/examples.html?highlight=tanh
14 |
15 | .. _shared variables: http://deeplearning.net/software/theano/tutorial/examples.html#using-shared-variables
16 |
17 | .. _basic arithmetic ops: http://deeplearning.net/software/theano/tutorial/adding.html#adding-two-scalars
18 |
19 | .. _T.grad: http://deeplearning.net/software/theano/tutorial/examples.html#computing-gradients
20 |
21 | .. _floatX: http://deeplearning.net/software/theano/library/config.html#config.floatX
22 |
23 | .. _GPU: http://deeplearning.net/software/theano/tutorial/using_gpu.html
24 |
25 | .. _Random numbers: http://deeplearning.net/software/theano/tutorial/examples.html#using-random-numbers
26 |
27 |
28 | .. note::
29 | The code for this section is available for download `here`_.
30 |
31 | .. _here: http://deeplearning.net/tutorial/code/DBN.py
32 |
33 |
34 | Deep Belief Networks
35 | ++++++++++++++++++++
36 |
37 | [Hinton06]_ showed that RBMs can be stacked and trained in a greedy manner
38 | to form so-called Deep Belief Networks (DBN). DBNs are graphical models which
39 | learn to extract a deep hierarchical representation of the training data.
40 | They model the joint distribution between observed vector :math:`x` and
41 | the :math:`\ell` hidden layers :math:`h^k` as follows:
42 |
43 | .. math::
44 | :label: dbn
45 |
46 | P(x, h^1, \ldots, h^{\ell}) = \left(\prod_{k=0}^{\ell-2} P(h^k|h^{k+1})\right) P(h^{\ell-1},h^{\ell})
47 |
48 | where :math:`x=h^0`, :math:`P(h^{k-1} | h^k)` is a conditional distribution
49 | for the visible units conditioned on the hidden units of the RBM at level
50 | :math:`k`, and :math:`P(h^{\ell-1}, h^{\ell})` is the visible-hidden joint
51 | distribution in the top-level RBM. This is illustrated in the figure below.
52 |
53 |
54 | .. figure:: images/DBN3.png
55 | :align: center
56 |
57 | The principle of greedy layer-wise unsupervised training can be applied to
58 | DBNs with RBMs as the building blocks for each layer [Hinton06]_, [Bengio07]_.
59 | The process is as follows:
60 |
61 | 1. Train the first layer as an RBM that models the raw input :math:`x =
62 | h^{(0)}` as its visible layer.
63 |
64 | 2. Use that first layer to obtain a representation of the input that will
65 | be used as data for the second layer. Two common solutions exist: this
66 | representation can be either the mean activations
67 | :math:`p(h^{(1)}=1|h^{(0)})` or samples drawn from :math:`p(h^{(1)}|h^{(0)})`.
68 |
69 | 3. Train the second layer as an RBM, taking the transformed data (samples or
70 | mean activations) as training examples (for the visible layer of that RBM).
71 |
72 | 4. Iterate (2 and 3) for the desired number of layers, each time propagating
73 | upward either samples or mean values.
74 |
75 | 5. Fine-tune all the parameters of this deep architecture with respect to a
76 | proxy for the DBN log-likelihood, or with respect to a supervised training
77 | criterion (after adding extra learning machinery to convert the learned
78 | representation into supervised predictions, e.g. a linear classifier).
79 |
80 |
81 | In this tutorial, we focus on fine-tuning via supervised gradient descent.
82 | Specifically, we use a logistic regression classifier to classify the input
83 | :math:`x` based on the output of the last hidden layer :math:`h^{(l)}` of the
84 | DBN. Fine-tuning is then performed via supervised gradient descent of the
85 | negative log-likelihood cost function. Since the supervised gradient is only
86 | non-null for the weights and hidden layer biases of each layer (i.e. null for
87 | the visible biases of each RBM), this procedure is equivalent to initializing
88 | the parameters of a deep MLP with the weights and hidden layer biases obtained
89 | with the unsupervised training strategy.
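
As a rough illustration of steps 1-4 above (a sketch only, not the actual
``DBN.py`` code; ``RBM``, ``train_rbm`` and ``propagate_up`` are hypothetical
helpers), the greedy procedure looks like this:

.. code-block:: python

    def greedy_pretrain(data, layer_sizes):
        """Sketch of greedy layer-wise pre-training (steps 1-4)."""
        rbms = []
        layer_input = data                        # x = h^(0)
        for n_hidden in layer_sizes:
            rbm = RBM(n_visible=layer_input.shape[1], n_hidden=n_hidden)
            train_rbm(rbm, layer_input)           # steps 1 and 3: fit one RBM
            # steps 2 and 4: propagate the data upward (mean activations)
            layer_input = propagate_up(rbm, layer_input)
            rbms.append(rbm)
        return rbms                               # step 5 (fine-tuning) is done afterwards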
90 |
91 | Justifying Greedy-Layer Wise Pre-Training
92 | +++++++++++++++++++++++++++++++++++++++++
93 |
94 | Why does such an algorithm work? Taking as example a 2-layer DBN with hidden
95 | layers :math:`h^{(1)}` and :math:`h^{(2)}` (with respective weight parameters
96 | :math:`W^{(1)}` and :math:`W^{(2)}`), [Hinton06]_ established
97 | (see also [Bengio09]_ for a detailed derivation) that :math:`\log
98 | p(x)` can be rewritten as,
99 |
100 | .. math::
101 | :label: dbn_bound
102 |
103 | \log p(x) = &KL(Q(h^{(1)}|x)||p(h^{(1)}|x)) + H_{Q(h^{(1)}|x)} + \\
104 | &\sum_h Q(h^{(1)}|x)(\log p(h^{(1)}) + \log p(x|h^{(1)})).
105 |
106 | :math:`KL(Q(h^{(1)}|x) || p(h^{(1)}|x))` represents the KL divergence between
107 | the posterior :math:`Q(h^{(1)}|x)` of the first RBM if it were standalone, and the
108 | probability :math:`p(h^{(1)}|x)` for the same layer but defined by the entire DBN
109 | (i.e. taking into account the prior :math:`p(h^{(1)},h^{(2)})` defined by the
110 | top-level RBM). :math:`H_{Q(h^{(1)}|x)}` is the entropy of the distribution
111 | :math:`Q(h^{(1)}|x)`.
112 |
113 | It can be shown that if we initialize both hidden layers such that
114 | :math:`W^{(2)}={W^{(1)}}^T`, then :math:`Q(h^{(1)}|x)=p(h^{(1)}|x)` and the KL
115 | divergence term is null. If we learn the first level RBM and then keep its
116 | parameters :math:`W^{(1)}` fixed, optimizing Eq. :eq:`dbn_bound` with respect
117 | to :math:`W^{(2)}` can thus only increase the likelihood :math:`p(x)`.
118 |
119 | Also, notice that if we isolate the terms which depend only on :math:`W^{(2)}`, we
120 | get:
121 |
122 | .. math::
123 | \sum_h Q(h^{(1)}|x) \log p(h^{(1)})
124 |
125 | Optimizing this with respect to :math:`W^{(2)}` amounts to training a second-stage
126 | RBM, using the output of :math:`Q(h^{(1)}|x)` as the training distribution,
127 | when :math:`x` is sampled from the training distribution for the first RBM.
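
Written out explicitly (our restatement of the argument above, with
:math:`\hat{p}(x)` denoting the training distribution), the corresponding
optimization problem is

.. math::

    \max_{W^{(2)}} \sum_x \hat{p}(x) \sum_{h^{(1)}} Q(h^{(1)}|x) \log p(h^{(1)}),

which is precisely the log-likelihood objective of an RBM trained on samples
(or mean values) of :math:`Q(h^{(1)}|x)`.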
128 |
129 | Implementation
130 | ++++++++++++++
131 |
132 | To implement DBNs in Theano, we will use the class defined in the :doc:`rbm`
133 | tutorial. One can also observe that the code for the DBN is very similar to the one
134 | for SdA, because both involve the principle of unsupervised layer-wise
135 | pre-training followed by supervised fine-tuning as a deep MLP.
136 | The main difference is that we use the RBM class instead of the dA
137 | class.
138 |
139 | We start off by defining the DBN class which will store the layers of the
140 | MLP, along with their associated RBMs. Since we take the viewpoint of using
141 | the RBMs to initialize an MLP, the code will reflect this by separating as
142 | much as possible the RBMs used to initialize the network and the MLP used for
143 | classification.
144 |
145 | .. literalinclude:: ../code/DBN.py
146 | :start-after: start-snippet-1
147 | :end-before: end-snippet-1
148 |
149 | ``self.sigmoid_layers`` will store the feed-forward graphs which together form
150 | the MLP, while ``self.rbm_layers`` will store the RBMs used to pretrain each
151 | layer of the MLP.
152 |
153 | In the next step, we construct ``n_layers`` sigmoid layers (we use the
154 | ``HiddenLayer`` class introduced in :ref:`mlp`, with the only modification
155 | that we replaced the ``tanh`` non-linearity with the logistic function
156 | :math:`s(x) = \frac{1}{1+e^{-x}}`) and ``n_layers`` RBMs, where ``n_layers``
157 | is the depth of our model. We link the sigmoid layers so that they form an
158 | MLP, and construct each RBM so that it shares its weight matrix and
159 | hidden bias with the corresponding sigmoid layer.
160 |
161 | .. literalinclude:: ../code/DBN.py
162 | :start-after: # MLP.
163 | :end-before: # We now need to add a logistic layer on top of the MLP
164 |
165 | All that is left is to stack one last logistic regression layer in order to
166 | form an MLP. We will use the ``LogisticRegression`` class introduced in
167 | :ref:`logreg`.
168 |
169 | .. literalinclude:: ../code/DBN.py
170 | :start-after: # We now need to add a logistic layer on top of the MLP
171 | :end-before: def pretraining_functions
172 |
173 | The class also provides a method which generates training functions for each
174 | of the RBMs. They are returned as a list, where element :math:`i` is a
175 | function which implements one step of training for the ``RBM`` at layer
176 | :math:`i`.
177 |
178 | .. literalinclude:: ../code/DBN.py
179 | :start-after: self.errors = self.logLayer.errors(self.y)
180 | :end-before: learning_rate = T.scalar('lr')
181 |
182 | In order to be able to change the learning rate during training, we associate a
183 | Theano variable with it that has a default value.
184 |
185 | .. literalinclude:: ../code/DBN.py
186 | :start-after: index = T.lscalar('index')
187 | :end-before: def build_finetune_functions
188 |
189 | Now any function ``pretrain_fns[i]`` takes as arguments ``index`` and
190 | optionally ``lr`` -- the learning rate. Note that the names of the parameters
191 | are the names given to the Theano variables (e.g. ``lr``) when they are
192 | constructed and not the name of the python variables (e.g. ``learning_rate``). Keep
193 | this in mind when working with Theano. Optionally, if you provide ``k`` (the
194 | number of Gibbs steps to perform in CD or PCD) this will also become an
195 | argument of your function.
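
For example, a call inside the pre-training loop might look like the sketch
below (``pretrain_fns`` and ``n_train_batches`` are assumed to have been built
as described; ``k`` is only accepted if it was exposed when the functions were
compiled):

.. code-block:: python

    for i in xrange(len(pretrain_fns)):          # one function per RBM layer
        for batch_index in xrange(n_train_batches):
            cost = pretrain_fns[i](index=batch_index, lr=0.01)
            # with CD-k exposed:
            # cost = pretrain_fns[i](index=batch_index, lr=0.01, k=1)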
196 |
197 | In the same fashion, the DBN class includes a method for building the
198 | functions required for finetuning (a ``train_model``, a ``validate_model``
199 | and a ``test_model`` function).
200 |
201 | .. literalinclude:: ../code/DBN.py
202 | :pyobject: DBN.build_finetune_functions
203 |
204 | Note that the returned ``valid_score`` and ``test_score`` are not Theano
205 | functions, but rather Python functions. These loop over the entire
206 | validation set and the entire test set to produce a list of the losses
207 | obtained over these sets.
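
Conceptually, each of these is just a small Python closure over a compiled
per-minibatch Theano function; a simplified sketch (illustrative names, not the
actual ``DBN.py`` code) is:

.. code-block:: python

    def valid_score():
        # `validate_model` returns the zero-one loss on one validation
        # minibatch; looping over all minibatches gives the list of losses
        return [validate_model(i) for i in xrange(n_valid_batches)]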
208 |
209 |
210 | Putting it all together
211 | +++++++++++++++++++++++
212 |
213 | The few lines of code below construct the deep belief network:
214 |
215 | .. literalinclude:: ../code/DBN.py
216 | :start-after: # numpy random generator
217 | :end-before: start-snippet-2
218 |
219 | There are two stages in training this network: (1) a layer-wise pre-training and
220 | (2) a fine-tuning stage.
221 |
222 | For the pre-training stage, we loop over all the layers of the network. For
223 | each layer, we use the compiled Theano function which determines the
224 | input to the ``i``-th level RBM and performs one step of CD-k within this RBM.
225 | This function is applied to the training set for a fixed number of epochs
226 | given by ``pretraining_epochs``.
227 |
228 | .. literalinclude:: ../code/DBN.py
229 | :start-after: start-snippet-2
230 | :end-before: end-snippet-2
231 |
232 | The fine-tuning loop is very similar to the one in the :ref:`mlp` tutorial,
233 | the only difference being that we now use the functions given by
234 | ``build_finetune_functions``.
235 |
236 | Running the Code
237 | ++++++++++++++++
238 |
239 | The user can run the code by calling:
240 |
241 | .. code-block:: bash
242 |
243 | python code/DBN.py
244 |
245 | With the default parameters, the code runs for 100 pre-training epochs with
246 | mini-batches of size 10. This corresponds to performing 500,000 unsupervised
247 | parameter updates. We use an unsupervised learning rate of 0.01, with a
248 | supervised learning rate of 0.1. The DBN itself consists of three
249 | hidden layers with 1000 units per layer. With early-stopping, this configuration
250 | achieved a minimal validation error of 1.27% with a corresponding test
251 | error of 1.34% after 46 supervised epochs.
252 |
253 | On an Intel(R) Xeon(R) CPU X5560 running at 2.80GHz, using a multi-threaded MKL
254 | library (running on 4 cores), pretraining took 615 minutes with an average of
255 | 2.05 mins/(layer * epoch). Fine-tuning took only 101 minutes or approximately
256 | 2.20 mins/epoch.
257 |
258 | Hyper-parameters were selected by optimizing on the validation error. We tested
259 | unsupervised learning rates in :math:`\{10^{-1}, ..., 10^{-5}\}` and supervised
260 | learning rates in :math:`\{10^{-1}, ..., 10^{-4}\}`. We did not use any form of
261 | regularization besides early-stopping, nor did we optimize over the number of
262 | pretraining updates.
263 |
264 |
265 | Tips and Tricks
266 | +++++++++++++++
267 |
268 | One way to improve the running time of your code (given that you have
269 | sufficient memory available), is to compute the representation of the entire
270 | dataset at layer ``i`` in a single pass, once the weights of the
271 | :math:`i-1`-th layers have been fixed. Namely, start by training your first
272 | layer RBM. Once it is trained, you can compute the hidden units values for
273 | every example in the dataset and store this as a new dataset which is used to
274 | train the 2nd layer RBM. Once you have trained the RBM for layer 2, you compute, in
275 | a similar fashion, the dataset for layer 3 and so on. This avoids recomputing
276 | the intermediate (hidden layer) representations ``pretraining_epochs`` times,
277 | at the expense of increased memory usage.
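
A minimal sketch of this trick, assuming the learned parameters of the RBM
below the layer being trained are available as NumPy arrays ``W0`` and
``hbias0`` and the raw data as ``train_data`` (all names hypothetical):

.. code-block:: python

    import numpy

    def mean_activation(rbm_W, rbm_hbias, data):
        # p(h=1|v) for every example, i.e. the representation fed upward
        return 1. / (1. + numpy.exp(-(numpy.dot(data, rbm_W) + rbm_hbias)))

    # computed once, instead of pretraining_epochs times
    layer1_data = mean_activation(W0, hbias0, train_data)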
278 |
--------------------------------------------------------------------------------
/code/cA.py:
--------------------------------------------------------------------------------
1 | """This tutorial introduces Contractive auto-encoders (cA) using Theano.
2 |
3 | They are based on auto-encoders as the ones used in Bengio et
4 | al. 2007. An autoencoder takes an input x and first maps it to a
5 | hidden representation y = f_{\theta}(x) = s(Wx+b), parameterized by
6 | \theta={W,b}. The resulting latent representation y is then mapped
7 | back to a "reconstructed" vector z \in [0,1]^d in input space z =
8 | g_{\theta'}(y) = s(W'y + b'). The weight matrix W' can optionally be
9 | constrained such that W' = W^T, in which case the autoencoder is said
10 | to have tied weights. The network is trained to minimize
11 | the reconstruction error (the error between x and z). Adding the
12 | squared Frobenius norm of the Jacobian of the hidden mapping h with
13 | respect to the visible units yields the contractive auto-encoder:
14 |
15 | - \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)]
16 | + \| \frac{\partial h(x)}{\partial x} \|^2
17 |
18 | References :
19 | - S. Rifai, P. Vincent, X. Muller, X. Glorot, Y. Bengio: Contractive
20 | Auto-Encoders: Explicit Invariance During Feature Extraction, ICML-11
21 |
22 | - S. Rifai, X. Muller, X. Glorot, G. Mesnil, Y. Bengio, and Pascal
23 | Vincent. Learning invariant features through local space
24 | contraction. Technical Report 1360, Universite de Montreal
25 |
26 | - Y. Bengio, P. Lamblin, D. Popovici, H. Larochelle: Greedy Layer-Wise
27 | Training of Deep Networks, Advances in Neural Information Processing
28 | Systems 19, 2007
29 |
30 | """
31 | import os
32 | import sys
33 | import time
34 |
35 | import numpy
36 |
37 | import theano
38 | import theano.tensor as T
39 |
40 |
41 | from logistic_sgd import load_data
42 | from utils import tile_raster_images
43 |
44 | try:
45 | import PIL.Image as Image
46 | except ImportError:
47 | import Image
48 |
49 |
50 | class cA(object):
51 | """ Contractive Auto-Encoder class (cA)
52 |
53 | The contractive autoencoder tries to reconstruct the input with an
54 | additional constraint on the latent space. With the objective of
55 | obtaining a robust representation of the input space, we
56 | regularize the L2 (Frobenius) norm of the Jacobian of the hidden
57 | representation with respect to the input. Please refer to Rifai et
58 | al.,2011 for more details.
59 |
60 | If x is the input then equation (1) computes the projection of the
61 | input into the latent space h. Equation (2) computes the jacobian
62 | of h with respect to x. Equation (3) computes the reconstruction
63 | of the input, while equation (4) computes the reconstruction
64 | error and the added regularization term from Eq.(2).
65 |
66 | .. math::
67 |
68 | h_i = s(W_i x + b_i) (1)
69 |
70 | J_i = h_i (1 - h_i) * W_i (2)
71 |
72 | x' = s(W' h + b') (3)
73 |
74 | L = -sum_{k=1}^d [x_k \log x'_k + (1-x_k) \log( 1-x'_k)]
75 | + lambda * sum_{i=1}^d sum_{j=1}^n J_{ij}^2 (4)
76 |
77 | """
78 |
79 | def __init__(self, numpy_rng, input=None, n_visible=784, n_hidden=100,
80 | n_batchsize=1, W=None, bhid=None, bvis=None):
81 | """Initialize the cA class by specifying the number of visible units
82 | (the dimension d of the input), the number of hidden units (the
83 | dimension d' of the latent or hidden space) and the batch size.
84 | The constructor also receives symbolic variables for the input, weights
85 | and bias.
86 |
87 | :type numpy_rng: numpy.random.RandomState
88 | :param numpy_rng: random number generator used to generate weights
89 |
90 | :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
91 | :param theano_rng: Theano random generator; if None is given
92 | one is generated based on a seed drawn from `rng`
93 |
94 | :type input: theano.tensor.TensorType
95 | :param input: a symbolic description of the input or None for
96 | standalone cA
97 |
98 | :type n_visible: int
99 | :param n_visible: number of visible units
100 |
101 | :type n_hidden: int
102 | :param n_hidden: number of hidden units
103 |
104 | :type n_batchsize: int
105 | :param n_batchsize: number of examples per batch
106 |
107 | :type W: theano.tensor.TensorType
108 | :param W: Theano variable pointing to a set of weights that should be
109 | shared between the cA and another architecture; if the cA should
110 | be standalone set this to None
111 |
112 | :type bhid: theano.tensor.TensorType
113 | :param bhid: Theano variable pointing to a set of bias values (for
114 | hidden units) that should be shared between the cA and another
115 | architecture; if the cA should be standalone set this to None
116 |
117 | :type bvis: theano.tensor.TensorType
118 | :param bvis: Theano variable pointing to a set of bias values (for
119 | visible units) that should be shared between the cA and another
120 | architecture; if the cA should be standalone set this to None
121 |
122 | """
123 | self.n_visible = n_visible
124 | self.n_hidden = n_hidden
125 | self.n_batchsize = n_batchsize
126 | # note : W' was written as `W_prime` and b' as `b_prime`
127 | if not W:
128 | # W is initialized with `initial_W`, which is uniformly sampled
129 | # from -4*sqrt(6./(n_visible+n_hidden)) to
130 | # 4*sqrt(6./(n_hidden+n_visible)). The output of uniform is
131 | # converted using asarray to dtype
132 | # theano.config.floatX so that the code is runnable on GPU
133 | initial_W = numpy.asarray(
134 | numpy_rng.uniform(
135 | low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)),
136 | high=4 * numpy.sqrt(6. / (n_hidden + n_visible)),
137 | size=(n_visible, n_hidden)
138 | ),
139 | dtype=theano.config.floatX
140 | )
141 | W = theano.shared(value=initial_W, name='W', borrow=True)
142 |
143 | if not bvis:
144 | bvis = theano.shared(value=numpy.zeros(n_visible,
145 | dtype=theano.config.floatX),
146 | borrow=True)
147 |
148 | if not bhid:
149 | bhid = theano.shared(value=numpy.zeros(n_hidden,
150 | dtype=theano.config.floatX),
151 | name='b',
152 | borrow=True)
153 |
154 | self.W = W
155 | # b corresponds to the bias of the hidden
156 | self.b = bhid
157 | # b_prime corresponds to the bias of the visible
158 | self.b_prime = bvis
159 | # tied weights, therefore W_prime is W transpose
160 | self.W_prime = self.W.T
161 |
162 | # if no input is given, generate a variable representing the input
163 | if input is None:
164 | # we use a matrix because we expect a minibatch of several
165 | # examples, each example being a row
166 | self.x = T.dmatrix(name='input')
167 | else:
168 | self.x = input
169 |
170 | self.params = [self.W, self.b, self.b_prime]
171 |
172 | def get_hidden_values(self, input):
173 | """ Computes the values of the hidden layer """
174 | return T.nnet.sigmoid(T.dot(input, self.W) + self.b)
175 |
176 | def get_jacobian(self, hidden, W):
177 | """Computes the jacobian of the hidden layer with respect to
178 | the input, reshapes are necessary for broadcasting the
179 | element-wise product on the right axis
180 |
181 | """
182 | return T.reshape(hidden * (1 - hidden),
183 | (self.n_batchsize, 1, self.n_hidden)) * T.reshape(
184 | W, (1, self.n_visible, self.n_hidden))
185 |
186 | def get_reconstructed_input(self, hidden):
187 | """Computes the reconstructed input given the values of the
188 | hidden layer
189 |
190 | """
191 | return T.nnet.sigmoid(T.dot(hidden, self.W_prime) + self.b_prime)
192 |
193 | def get_cost_updates(self, contraction_level, learning_rate):
194 | """ This function computes the cost and the updates for one trainng
195 | step of the cA """
196 |
197 | y = self.get_hidden_values(self.x)
198 | z = self.get_reconstructed_input(y)
199 | J = self.get_jacobian(y, self.W)
200 | # note : we sum over the size of a datapoint; if we are using
201 | # minibatches, L will be a vector, with one entry per
202 | # example in minibatch
203 | self.L_rec = - T.sum(self.x * T.log(z) +
204 | (1 - self.x) * T.log(1 - z),
205 | axis=1)
206 |
207 | # Compute the jacobian and average over the number of samples/minibatch
208 | self.L_jacob = T.sum(J ** 2) / self.n_batchsize
209 |
210 | # note : L is now a vector, where each element is the
211 | # cross-entropy cost of the reconstruction of the
212 | # corresponding example of the minibatch. We need to
213 | # compute the average of all these to get the cost of
214 | # the minibatch
215 | cost = T.mean(self.L_rec) + contraction_level * T.mean(self.L_jacob)
216 |
217 | # compute the gradients of the cost of the `cA` with respect
218 | # to its parameters
219 | gparams = T.grad(cost, self.params)
220 | # generate the list of updates
221 | updates = []
222 | for param, gparam in zip(self.params, gparams):
223 | updates.append((param, param - learning_rate * gparam))
224 |
225 | return (cost, updates)
226 |
227 |
228 | def test_cA(learning_rate=0.01, training_epochs=20,
229 | dataset='mnist.pkl.gz',
230 | batch_size=10, output_folder='cA_plots', contraction_level=.1):
231 | """
232 | This demo is tested on MNIST
233 |
234 | :type learning_rate: float
235 | :param learning_rate: learning rate used for training the contracting
236 | AutoEncoder
237 |
238 | :type training_epochs: int
239 | :param training_epochs: number of epochs used for training
240 |
241 | :type dataset: string
242 | :param dataset: path to the pickled dataset
243 |
244 | """
245 | datasets = load_data(dataset)
246 | train_set_x, train_set_y = datasets[0]
247 |
248 | # compute number of minibatches for training, validation and testing
249 | n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
250 |
251 | # allocate symbolic variables for the data
252 | index = T.lscalar() # index to a [mini]batch
253 | x = T.matrix('x') # the data is presented as rasterized images
254 |
255 | if not os.path.isdir(output_folder):
256 | os.makedirs(output_folder)
257 | os.chdir(output_folder)
258 | ####################################
259 | # BUILDING THE MODEL #
260 | ####################################
261 |
262 | rng = numpy.random.RandomState(123)
263 |
264 | ca = cA(numpy_rng=rng, input=x,
265 | n_visible=28 * 28, n_hidden=500, n_batchsize=batch_size)
266 |
267 | cost, updates = ca.get_cost_updates(contraction_level=contraction_level,
268 | learning_rate=learning_rate)
269 |
270 | train_ca = theano.function(
271 | [index],
272 | [T.mean(ca.L_rec), ca.L_jacob],
273 | updates=updates,
274 | givens={
275 | x: train_set_x[index * batch_size: (index + 1) * batch_size]
276 | }
277 | )
278 |
279 | start_time = time.clock()
280 |
281 | ############
282 | # TRAINING #
283 | ############
284 |
285 | # go through training epochs
286 | for epoch in xrange(training_epochs):
288 | # go through training set
288 | c = []
289 | for batch_index in xrange(n_train_batches):
290 | c.append(train_ca(batch_index))
291 |
292 | c_array = numpy.vstack(c)
293 | print 'Training epoch %d, reconstruction cost ' % epoch, numpy.mean(
294 | c_array[0]), ' jacobian norm ', numpy.mean(numpy.sqrt(c_array[1]))
295 |
296 | end_time = time.clock()
297 |
298 | training_time = (end_time - start_time)
299 |
300 | print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
301 | ' ran for %.2fm' % ((training_time) / 60.))
302 | image = Image.fromarray(tile_raster_images(
303 | X=ca.W.get_value(borrow=True).T,
304 | img_shape=(28, 28), tile_shape=(10, 10),
305 | tile_spacing=(1, 1)))
306 |
307 | image.save('cae_filters.png')
308 |
309 | os.chdir('../')
310 |
311 |
312 | if __name__ == '__main__':
313 | test_cA()
314 |
--------------------------------------------------------------------------------
/code/convolutional_mlp.py:
--------------------------------------------------------------------------------
1 | """This tutorial introduces the LeNet5 neural network architecture
2 | using Theano. LeNet5 is a convolutional neural network, good for
3 | classifying images. This tutorial shows how to build the architecture,
4 | and comes with all the hyper-parameters you need to reproduce the
5 | paper's MNIST results.
6 |
7 |
8 | This implementation simplifies the model in the following ways:
9 |
10 | - LeNetConvPool doesn't implement location-specific gain and bias parameters
11 | - LeNetConvPool doesn't implement pooling by average, it implements pooling
12 | by max.
13 | - Digit classification is implemented with a logistic regression rather than
14 | an RBF network
15 | - LeNet5 used a sparse connection scheme between feature maps at the second layer; here the second-layer convolutions are fully connected
16 |
17 | References:
18 | - Y. LeCun, L. Bottou, Y. Bengio and P. Haffner:
19 | Gradient-Based Learning Applied to Document
20 | Recognition, Proceedings of the IEEE, 86(11):2278-2324, November 1998.
21 | http://yann.lecun.com/exdb/publis/pdf/lecun-98.pdf
22 |
23 | """
24 | import os
25 | import sys
26 | import time
27 |
28 | import numpy
29 |
30 | import theano
31 | import theano.tensor as T
32 | from theano.tensor.signal import downsample
33 | from theano.tensor.nnet import conv
34 |
35 | from logistic_sgd import LogisticRegression, load_data
36 | from mlp import HiddenLayer
37 |
38 |
39 | class LeNetConvPoolLayer(object):
40 | """Pool Layer of a convolutional network """
41 |
42 | def __init__(self, rng, input, filter_shape, image_shape, poolsize=(2, 2)):
43 | """
44 | Allocate a LeNetConvPoolLayer with shared variable internal parameters.
45 |
46 | :type rng: numpy.random.RandomState
47 | :param rng: a random number generator used to initialize weights
48 |
49 | :type input: theano.tensor.dtensor4
50 | :param input: symbolic image tensor, of shape image_shape
51 |
52 | :type filter_shape: tuple or list of length 4
53 | :param filter_shape: (number of filters, num input feature maps,
54 | filter height, filter width)
55 |
56 | :type image_shape: tuple or list of length 4
57 | :param image_shape: (batch size, num input feature maps,
58 | image height, image width)
59 |
60 | :type poolsize: tuple or list of length 2
61 | :param poolsize: the downsampling (pooling) factor (#rows, #cols)
62 | """
63 |
64 | assert image_shape[1] == filter_shape[1]
65 | self.input = input
66 |
67 | # there are "num input feature maps * filter height * filter width"
68 | # inputs to each hidden unit
69 | fan_in = numpy.prod(filter_shape[1:])
70 | # each unit in the lower layer receives a gradient from:
71 | # "num output feature maps * filter height * filter width" /
72 | # pooling size
73 | fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]) /
74 | numpy.prod(poolsize))
75 | # initialize weights with random weights
76 | W_bound = numpy.sqrt(6. / (fan_in + fan_out))
77 | self.W = theano.shared(
78 | numpy.asarray(
79 | rng.uniform(low=-W_bound, high=W_bound, size=filter_shape),
80 | dtype=theano.config.floatX
81 | ),
82 | borrow=True
83 | )
84 |
85 | # the bias is a 1D tensor -- one bias per output feature map
86 | b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX)
87 | self.b = theano.shared(value=b_values, borrow=True)
88 |
89 | # convolve input feature maps with filters
90 | conv_out = conv.conv2d(
91 | input=input,
92 | filters=self.W,
93 | filter_shape=filter_shape,
94 | image_shape=image_shape
95 | )
96 |
97 | # downsample each feature map individually, using maxpooling
98 | pooled_out = downsample.max_pool_2d(
99 | input=conv_out,
100 | ds=poolsize,
101 | ignore_border=True
102 | )
103 |
104 | # add the bias term. Since the bias is a vector (1D array), we first
105 | # reshape it to a tensor of shape (1, n_filters, 1, 1). Each bias will
106 | # thus be broadcasted across mini-batches and feature map
107 | # width & height
108 | self.output = T.tanh(pooled_out + self.b.dimshuffle('x', 0, 'x', 'x'))
109 |
110 | # store parameters of this layer
111 | self.params = [self.W, self.b]
112 |
113 |
114 | def evaluate_lenet5(learning_rate=0.1, n_epochs=200,
115 | dataset='mnist.pkl.gz',
116 | nkerns=[20, 50], batch_size=500):
117 | """ Demonstrates lenet on MNIST dataset
118 |
119 | :type learning_rate: float
120 | :param learning_rate: learning rate used (factor for the stochastic
121 | gradient)
122 |
123 | :type n_epochs: int
124 | :param n_epochs: maximal number of epochs to run the optimizer
125 |
126 | :type dataset: string
127 | :param dataset: path to the dataset used for training /testing (MNIST here)
128 |
129 | :type nkerns: list of ints
130 | :param nkerns: number of kernels on each layer
131 | """
132 |
133 | rng = numpy.random.RandomState(23455)
134 |
135 | datasets = load_data(dataset)
136 |
137 | train_set_x, train_set_y = datasets[0]
138 | valid_set_x, valid_set_y = datasets[1]
139 | test_set_x, test_set_y = datasets[2]
140 |
141 | # compute number of minibatches for training, validation and testing
142 | n_train_batches = train_set_x.get_value(borrow=True).shape[0]
143 | n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
144 | n_test_batches = test_set_x.get_value(borrow=True).shape[0]
145 | n_train_batches /= batch_size
146 | n_valid_batches /= batch_size
147 | n_test_batches /= batch_size
148 |
149 | # allocate symbolic variables for the data
150 | index = T.lscalar() # index to a [mini]batch
151 |
152 | # start-snippet-1
153 | x = T.matrix('x') # the data is presented as rasterized images
154 | y = T.ivector('y') # the labels are presented as 1D vector of
155 | # [int] labels
156 |
157 | ######################
158 | # BUILD ACTUAL MODEL #
159 | ######################
160 | print '... building the model'
161 |
162 | # Reshape matrix of rasterized images of shape (batch_size, 28 * 28)
163 | # to a 4D tensor, compatible with our LeNetConvPoolLayer
164 | # (28, 28) is the size of MNIST images.
165 | layer0_input = x.reshape((batch_size, 1, 28, 28))
166 |
167 | # Construct the first convolutional pooling layer:
168 | # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
169 | # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
170 | # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
171 | layer0 = LeNetConvPoolLayer(
172 | rng,
173 | input=layer0_input,
174 | image_shape=(batch_size, 1, 28, 28),
175 | filter_shape=(nkerns[0], 1, 5, 5),
176 | poolsize=(2, 2)
177 | )
178 |
179 | # Construct the second convolutional pooling layer
180 | # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
181 | # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
182 | # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
183 | layer1 = LeNetConvPoolLayer(
184 | rng,
185 | input=layer0.output,
186 | image_shape=(batch_size, nkerns[0], 12, 12),
187 | filter_shape=(nkerns[1], nkerns[0], 5, 5),
188 | poolsize=(2, 2)
189 | )
190 |
191 | # the HiddenLayer being fully-connected, it operates on 2D matrices of
192 | # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
193 | # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
194 | # or (500, 50 * 4 * 4) = (500, 800) with the default values.
195 | layer2_input = layer1.output.flatten(2)
196 |
197 | # construct a fully-connected sigmoidal layer
198 | layer2 = HiddenLayer(
199 | rng,
200 | input=layer2_input,
201 | n_in=nkerns[1] * 4 * 4,
202 | n_out=500,
203 | activation=T.tanh
204 | )
205 |
206 | # classify the values of the fully-connected sigmoidal layer
207 | layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)
208 |
209 | # the cost we minimize during training is the NLL of the model
210 | cost = layer3.negative_log_likelihood(y)
211 |
212 | # create a function to compute the mistakes that are made by the model
213 | test_model = theano.function(
214 | [index],
215 | layer3.errors(y),
216 | givens={
217 | x: test_set_x[index * batch_size: (index + 1) * batch_size],
218 | y: test_set_y[index * batch_size: (index + 1) * batch_size]
219 | }
220 | )
221 |
222 | validate_model = theano.function(
223 | [index],
224 | layer3.errors(y),
225 | givens={
226 | x: valid_set_x[index * batch_size: (index + 1) * batch_size],
227 | y: valid_set_y[index * batch_size: (index + 1) * batch_size]
228 | }
229 | )
230 |
231 | # create a list of all model parameters to be fit by gradient descent
232 | params = layer3.params + layer2.params + layer1.params + layer0.params
233 |
234 | # create a list of gradients for all model parameters
235 | grads = T.grad(cost, params)
236 |
237 | # train_model is a function that updates the model parameters by
238 | # SGD Since this model has many parameters, it would be tedious to
239 | # manually create an update rule for each model parameter. We thus
240 | # create the updates list by automatically looping over all
241 | # (params[i], grads[i]) pairs.
242 | updates = [
243 | (param_i, param_i - learning_rate * grad_i)
244 | for param_i, grad_i in zip(params, grads)
245 | ]
246 |
247 | train_model = theano.function(
248 | [index],
249 | cost,
250 | updates=updates,
251 | givens={
252 | x: train_set_x[index * batch_size: (index + 1) * batch_size],
253 | y: train_set_y[index * batch_size: (index + 1) * batch_size]
254 | }
255 | )
256 | # end-snippet-1
257 |
258 | ###############
259 | # TRAIN MODEL #
260 | ###############
261 | print '... training'
262 | # early-stopping parameters
263 | patience = 10000 # look at this many examples regardless
264 | patience_increase = 2 # wait this much longer when a new best is
265 | # found
266 | improvement_threshold = 0.995 # a relative improvement of this much is
267 | # considered significant
268 | validation_frequency = min(n_train_batches, patience / 2)
269 | # go through this many
270 | # minibatches before checking the network
271 | # on the validation set; in this case we
272 | # check every epoch
273 |
274 | best_validation_loss = numpy.inf
275 | best_iter = 0
276 | test_score = 0.
277 | start_time = time.clock()
278 |
279 | epoch = 0
280 | done_looping = False
281 |
282 | while (epoch < n_epochs) and (not done_looping):
283 | epoch = epoch + 1
284 | for minibatch_index in xrange(n_train_batches):
285 |
286 | iter = (epoch - 1) * n_train_batches + minibatch_index
287 |
288 | if iter % 100 == 0:
289 | print 'training @ iter = ', iter
290 | cost_ij = train_model(minibatch_index)
291 |
292 | if (iter + 1) % validation_frequency == 0:
293 |
294 | # compute zero-one loss on validation set
295 | validation_losses = [validate_model(i) for i
296 | in xrange(n_valid_batches)]
297 | this_validation_loss = numpy.mean(validation_losses)
298 | print('epoch %i, minibatch %i/%i, validation error %f %%' %
299 | (epoch, minibatch_index + 1, n_train_batches,
300 | this_validation_loss * 100.))
301 |
302 | # if we got the best validation score until now
303 | if this_validation_loss < best_validation_loss:
304 |
305 | #improve patience if loss improvement is good enough
306 | if this_validation_loss < best_validation_loss * \
307 | improvement_threshold:
308 | patience = max(patience, iter * patience_increase)
309 |
310 | # save best validation score and iteration number
311 | best_validation_loss = this_validation_loss
312 | best_iter = iter
313 |
314 | # test it on the test set
315 | test_losses = [
316 | test_model(i)
317 | for i in xrange(n_test_batches)
318 | ]
319 | test_score = numpy.mean(test_losses)
320 | print((' epoch %i, minibatch %i/%i, test error of '
321 | 'best model %f %%') %
322 | (epoch, minibatch_index + 1, n_train_batches,
323 | test_score * 100.))
324 |
325 | if patience <= iter:
326 | done_looping = True
327 | break
328 |
329 | end_time = time.clock()
330 | print('Optimization complete.')
331 | print('Best validation score of %f %% obtained at iteration %i, '
332 | 'with test performance %f %%' %
333 | (best_validation_loss * 100., best_iter + 1, test_score * 100.))
334 | print >> sys.stderr, ('The code for file ' +
335 | os.path.split(__file__)[1] +
336 | ' ran for %.2fm' % ((end_time - start_time) / 60.))
337 |
338 | if __name__ == '__main__':
339 | evaluate_lenet5()
340 |
341 |
342 | def experiment(state, channel):
343 | evaluate_lenet5(state.learning_rate, dataset=state.dataset)
344 |
--------------------------------------------------------------------------------
/doc/mlp.txt:
--------------------------------------------------------------------------------
1 | .. index:: Multilayer Perceptron
2 |
3 | .. _mlp:
4 |
5 |
6 | Multilayer Perceptron
7 | =====================
8 |
9 | .. note::
10 | This section assumes the reader has already read through :doc:`logreg`.
11 | Additionally, it uses the following new Theano functions and concepts:
12 | `T.tanh`_, `shared variables`_, `basic arithmetic ops`_, `T.grad`_,
13 | :ref:`L1_L2_regularization`, `floatX`_. If you intend to run the
14 | code on GPU also read `GPU`_.
15 |
16 | .. note::
17 | The code for this section is available for download `here`_.
18 |
19 | .. _here: http://deeplearning.net/tutorial/code/mlp.py
20 |
21 | .. _T.tanh: http://deeplearning.net/software/theano/tutorial/examples.html?highlight=tanh
22 |
23 | .. _shared variables: http://deeplearning.net/software/theano/tutorial/examples.html#using-shared-variables
24 |
25 | .. _basic arithmetic ops: http://deeplearning.net/software/theano/tutorial/adding.html#adding-two-scalars
26 |
27 | .. _T.grad: http://deeplearning.net/software/theano/tutorial/examples.html#computing-gradients
28 |
29 | .. _floatX: http://deeplearning.net/software/theano/library/config.html#config.floatX
30 |
31 | .. _GPU: http://deeplearning.net/software/theano/tutorial/using_gpu.html
32 |
33 |
34 | The next architecture we are going to present using Theano is the
35 | single-hidden-layer Multi-Layer Perceptron (MLP). An MLP can be viewed as a
36 | logistic regression classifier where the input is first transformed using a
37 | learnt non-linear transformation :math:`\Phi`. This transformation projects the
38 | input data into a space where it becomes linearly separable. This intermediate
39 | layer is referred to as a **hidden layer**. A single hidden layer is sufficient
40 | to make MLPs a **universal approximator**. However we will see later on that
41 | there are substantial benefits to using many such hidden layers, i.e. the very
42 | premise of **deep learning**. See these course notes for an introduction to
43 | MLPs, the back-propagation algorithm, and how to train MLPs.
44 |
45 |
46 | This tutorial will again tackle the problem of MNIST digit classification.
47 |
48 | The Model
49 | +++++++++
50 |
51 | An MLP (or Artificial Neural Network - ANN) with a single hidden layer
52 | can be represented graphically as
53 | follows:
54 |
55 | .. figure:: images/mlp.png
56 | :align: center
57 |
58 | Formally, a one-hidden-layer MLP is a function :math:`f: R^D \rightarrow
59 | R^L`, where :math:`D` is the size of input vector :math:`x` and :math:`L` is
60 | the size of the output vector :math:`f(x)`, such that, in matrix notation:
61 |
62 | .. math::
63 |
64 | f(x) = G( b^{(2)} + W^{(2)}( s( b^{(1)} + W^{(1)} x))),
65 |
66 | with bias vectors :math:`b^{(1)}`, :math:`b^{(2)}`; weight matrices
67 | :math:`W^{(1)}`, :math:`W^{(2)}` and activation functions :math:`G` and :math:`s`.
68 |
69 | The vector :math:`h(x) = \Phi(x) = s(b^{(1)} + W^{(1)} x)` constitutes the hidden layer.
70 | :math:`W^{(1)} \in R^{D_h \times D}` is the weight matrix connecting the input vector
71 | to the hidden layer. Each row :math:`W^{(1)}_{i \cdot}` represents the weights
72 | from the input units to the i-th hidden unit. Typical choices for :math:`s`
73 | include :math:`tanh`, with :math:`tanh(a)=(e^a-e^{-a})/(e^a+e^{-a})`,
74 | or the logistic :math:`sigmoid` function, with :math:`sigmoid(a)=1/(1+e^{-a})`. We will be using
75 | :math:`tanh` in this tutorial because it typically yields faster training
76 | (and sometimes also better local minima). Both the :math:`tanh`
77 | and :math:`sigmoid` are scalar-to-scalar functions but their natural
78 | extension to vectors and tensors consists of applying them element-wise
79 | (i.e. separately on each element of the vector, yielding a same-size vector).
80 |
81 | The output vector is then obtained as: :math:`o(x) = G(b^{(2)} + W^{(2)} h(x))`.
82 | The reader should recognize the form we already used for
83 | :doc:`logreg`. As before,
84 | class-membership probabilities can be obtained by choosing :math:`G` as the
85 | :math:`softmax` function (in the case of multi-class classification).
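
To make the notation concrete, here is a small NumPy-only sketch of this
forward pass for a single example :math:`x` (an illustration; the actual Theano
implementation follows later in this tutorial):

.. code-block:: python

    import numpy

    def softmax(a):
        e = numpy.exp(a - a.max())
        return e / e.sum()

    def mlp_forward(x, W1, b1, W2, b2):
        # h(x) = s(b1 + W1 x) with s = tanh
        h = numpy.tanh(b1 + numpy.dot(W1, x))
        # o(x) = G(b2 + W2 h(x)) with G = softmax
        return softmax(b2 + numpy.dot(W2, h))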
86 |
87 | To train an MLP, we learn **all** parameters of the model, and here we use
88 | :ref:`opt_SGD` with minibatches.
89 | The set of parameters to learn is the set :math:`\theta =
90 | \{W^{(2)},b^{(2)},W^{(1)},b^{(1)}\}`. Obtaining the gradients
91 | :math:`\partial{\ell}/\partial{\theta}` can be achieved through the
92 | **backpropagation algorithm** (a special case of the chain-rule of derivation).
93 | Thankfully, since Theano performs automatic differentiation, we will not need to
94 | cover this in the tutorial!
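
For instance, once the cost has been written as a symbolic scalar expression,
the gradient with respect to every parameter is obtained in a single call (a
generic sketch; ``cost`` and ``params`` stand for the expressions built later
in this tutorial):

.. code-block:: python

    import theano.tensor as T

    # one symbolic gradient expression per parameter, e.g. W1, b1, W2, b2
    gparams = [T.grad(cost, param) for param in params]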
95 |
96 |
97 | Going from logistic regression to MLP
98 | +++++++++++++++++++++++++++++++++++++
99 |
100 | This tutorial will focus on a single-hidden-layer MLP. We start off by
101 | implementing a class that will represent a hidden layer. To
102 | construct the MLP we will then only need to throw a logistic regression
103 | layer on top.
104 |
105 | .. literalinclude:: ../code/mlp.py
106 | :start-after: start-snippet-1
107 | :end-before: end-snippet-1
108 |
109 | The initial values for the weights of a hidden layer :math:`i` should be uniformly
110 | sampled from a symmetric interval that depends on the activation function. For
111 | the :math:`tanh` activation function, results obtained in [Xavier10]_ show that the
112 | interval should be
113 | :math:`[-\sqrt{\frac{6}{fan_{in}+fan_{out}}},\sqrt{\frac{6}{fan_{in}+fan_{out}}}]`, where
114 | :math:`fan_{in}` is the number of units in the :math:`(i-1)`-th layer,
115 | and :math:`fan_{out}` is the number of units in the :math:`i`-th layer. For
116 | the sigmoid function the interval is :math:`[-4\sqrt{\frac{6}{fan_{in}+fan_{out}}},4\sqrt{\frac{6}{fan_{in}+fan_{out}}}]`.
117 | This initialization ensures that, early in training, each neuron operates in a
118 | regime of its activation function where information can easily be propagated
119 | both upward (activations flowing from inputs to outputs) and backward
120 | (gradients flowing from outputs to inputs).
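
In plain NumPy, this recipe amounts to the following (a standalone sketch with
example sizes; the actual Theano version, which also casts to
``theano.config.floatX``, appears just below):

.. code-block:: python

    import numpy

    rng = numpy.random.RandomState(1234)
    fan_in, fan_out = 784, 500      # e.g. MNIST input to a 500-unit hidden layer
    bound = numpy.sqrt(6. / (fan_in + fan_out))
    W_values = rng.uniform(low=-bound, high=bound, size=(fan_in, fan_out))
    # for a sigmoid layer the interval is four times larger:
    # W_values *= 4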
121 |
122 | .. literalinclude:: ../code/mlp.py
123 | :start-after: end-snippet-1
124 | :end-before: lin_output = T.dot(input, self.W) + self.b
125 |
126 | Note that we used a given non-linear function as the activation function of the hidden layer. By default this is ``tanh``, but in many cases we might want
127 | to use something else.
128 |
129 | .. literalinclude:: ../code/mlp.py
130 | :start-after: self.b = b
131 | :end-before: # parameters of the model
132 |
133 | In terms of the theory, this class implements the graph that computes
134 | the hidden layer value :math:`h(x) = \Phi(x) = s(b^{(1)} + W^{(1)} x)`.
135 | If you give this graph as input to the ``LogisticRegression`` class,
136 | implemented in the previous tutorial :doc:`logreg`, you get the output
137 | of the MLP. You can see this in the following short implementation of
138 | the ``MLP`` class.
139 |
140 | .. literalinclude:: ../code/mlp.py
141 | :start-after: start-snippet-2
142 | :end-before: end-snippet-2
143 |
144 | In this tutorial we will also use L1 and L2 regularization (see
145 | :ref:`L1_L2_regularization`). For this, we need to compute the L1 norm and the squared L2
146 | norm of the weights :math:`W^{(1)}, W^{(2)}`.
147 |
148 | .. literalinclude:: ../code/mlp.py
149 | :start-after: start-snippet-3
150 | :end-before: end-snippet-3
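
For reference, the quantities built there are simply the following symbolic
expressions, where ``W1`` and ``W2`` stand for the hidden-layer and
output-layer weight matrices (illustrative names):

.. code-block:: python

    L1 = abs(W1).sum() + abs(W2).sum()          # L1 norm
    L2_sqr = (W1 ** 2).sum() + (W2 ** 2).sum()  # squared L2 norm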
151 |
152 | As before, we train this model using stochastic gradient descent with
153 | mini-batches. The difference is that we modify the cost function to include the
154 | regularization term. ``L1_reg`` and ``L2_reg`` are the hyperparameters
155 | controlling the weight of these regularization terms in the total cost function.
156 | The code that computes the new cost is:
157 |
158 | .. literalinclude:: ../code/mlp.py
159 | :start-after: start-snippet-4
160 | :end-before: end-snippet-4
161 |
162 | We then update the parameters of the model using the gradient. This code is
163 | almost identical to the one for logistic regression. Only the number of
164 | parameters differs. To get around this (and write code that works
165 | for any number of parameters) we will use the list of parameters
166 | ``params`` that we created with the model and iterate over it, computing a gradient
167 | at each step.
168 |
169 | .. literalinclude:: ../code/mlp.py
170 | :start-after: start-snippet-5
171 | :end-before: end-snippet-5
172 |
173 | Putting it All Together
174 | +++++++++++++++++++++++
175 |
176 | Having covered the basic concepts, we can now write an MLP class quite easily.
177 | The code below shows how this can be done, in a way which is analogous to our previous logistic regression implementation.
178 |
179 | .. literalinclude:: ../code/mlp.py
180 |
181 | The user can then run the code by calling:
182 |
183 | .. code-block:: bash
184 |
185 | python code/mlp.py
186 |
187 | The output one should expect is of the form:
188 |
189 | .. code-block:: bash
190 |
191 | Optimization complete. Best validation score of 1.690000 % obtained at iteration 2070000, with test performance 1.650000 %
192 | The code for file mlp.py ran for 97.34m
193 |
194 | On an Intel(R) Core(TM) i7-2600K CPU @ 3.40GHz the code runs with
195 | approximately 10.3 epoch/minute and it took 828 epochs to reach a test
196 | error of 1.65%.
197 |
198 | To put this into perspective, we refer the reader to the results other methods
199 | have obtained on MNIST.
200 |
201 | Tips and Tricks for training MLPs
202 | +++++++++++++++++++++++++++++++++
203 |
204 | There are several hyper-parameters in the above code, which are not (and,
205 | generally speaking, cannot be) optimized by gradient descent. Strictly speaking,
206 | finding an optimal set of values for these
207 | hyper-parameters is not a feasible problem. First, we can't simply optimize
208 | each of them independently. Second, we cannot readily apply gradient
209 | techniques that we described previously (partly because some parameters are
210 | discrete values and others are real-valued). Third, the optimization problem
211 | is not convex and finding a (local) minimum would involve a non-trivial
212 | amount of work.
213 |
214 | The good news is that over the last 25 years, researchers have devised various
215 | rules of thumb for choosing hyper-parameters in a neural network. A very
216 | good overview of these tricks can be found in *Efficient BackProp*
217 | by Yann LeCun,
218 | Leon Bottou, Genevieve Orr, and Klaus-Robert Mueller. Here, we summarize
219 | the same issues, with an emphasis on the parameters and techniques that we
220 | actually used in our code.
221 |
222 | Nonlinearity
223 | --------------
224 |
225 | Two of the most common activation functions are the :math:`sigmoid` and the :math:`tanh` function. For
226 | reasons explained in Section 4.4 of that work, nonlinearities that
227 | are symmetric around the origin are preferred because they tend to produce
228 | zero-mean inputs to the next layer (which is a desirable property).
229 | Empirically, we have observed that the :math:`tanh` has better convergence
230 | properties.
231 |
232 | Weight initialization
233 | ---------------------
234 |
235 | At initialization we want the weights to be small enough around the origin
236 | so that the activation function operates in its linear regime, where gradients are
237 | the largest. Other desirable properties, especially for deep networks,
238 | are to conserve variance of the activation as well as variance of back-propagated gradients from layer to layer.
239 | This allows information to flow well upward and downward in the network and
240 | reduces discrepancies between layers.
241 | Under some assumptions, a compromise between these two constraints leads to the following
242 | initialization: :math:`uniform[-\sqrt{\frac{6}{fan_{in}+fan_{out}}},\sqrt{\frac{6}{fan_{in}+fan_{out}}}]`
243 | for tanh and :math:`uniform[-4\sqrt{\frac{6}{fan_{in}+fan_{out}}},4\sqrt{\frac{6}{fan_{in}+fan_{out}}}]`
244 | for sigmoid, where :math:`fan_{in}` is the number of inputs and :math:`fan_{out}` the number of hidden units.
245 | For mathematical considerations please refer to [Xavier10]_.
246 |
247 | Learning rate
248 | --------------
249 |
250 | There is a great deal of literature on choosing a good learning rate. The
251 | simplest solution is to use a constant rate. A rule of thumb: try
252 | several log-spaced values (:math:`10^{-1},10^{-2},\ldots`) and narrow the
253 | (logarithmic) grid search to the region where you obtain the lowest
254 | validation error.
255 |
256 | Decreasing the learning rate over time is sometimes a good idea. One simple
257 | rule for doing that is :math:`\frac{\mu_0}{1 + d\times t}` where
258 | :math:`\mu_0` is the initial rate (chosen, perhaps, using the grid search
259 | technique explained above), :math:`d` is a so-called "decrease constant"
260 | which controls the rate at which the learning rate decreases (typically a
261 | small positive number, :math:`10^{-3}` or smaller) and :math:`t` is the
262 | epoch/stage.
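
A one-line sketch of this schedule, with ``mu_0`` and ``d`` as defined above
and ``t`` the current epoch:

.. code-block:: python

    def decayed_learning_rate(mu_0, d, t):
        # mu_0 / (1 + d * t): the rate is halved after 1/d epochs
        return mu_0 / (1. + d * t)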
263 |
264 | Section 4.7 of *Efficient BackProp* details
265 | procedures for choosing a learning rate for each parameter (weight) in our
266 | network and for choosing them adaptively based on the error of the
267 | classifier.
268 |
269 | Number of hidden units
270 | -----------------------
271 |
272 | This hyper-parameter is very much dataset-dependent. Vaguely speaking, the
273 | more complicated the input distribution is, the more capacity the network
274 | will require to model it, and so the larger the number of hidden units that
275 | will be needed (note that the number of weights in a layer, perhaps a more direct
276 | measure of capacity, is :math:`D\times D_h`, where :math:`D` is the number of
277 | inputs and :math:`D_h` is the number of hidden units).
278 |
279 | Unless we employ some regularization scheme (early stopping or L1/L2
280 | penalties), a typical number of hidden units vs. generalization performance graph will be U-shaped.
281 |
282 | Regularization parameter
283 | ------------------------
284 |
285 | Typical values to try for the L1/L2 regularization parameter :math:`\lambda`
286 | are :math:`10^{-2},10^{-3},\ldots`. In the framework that we described so
287 | far, optimizing this parameter will not lead to significantly better
288 | solutions, but is worth exploring nonetheless.
289 |
290 |
--------------------------------------------------------------------------------
/code/mlp.py:
--------------------------------------------------------------------------------
1 | """
2 | This tutorial introduces the multilayer perceptron using Theano.
3 |
4 | A multilayer perceptron is a logistic regressor where
5 | instead of feeding the input to the logistic regression you insert an
6 | intermediate layer, called the hidden layer, that has a nonlinear
7 | activation function (usually tanh or sigmoid). One can use many such
8 | hidden layers making the architecture deep. The tutorial will also tackle
9 | the problem of MNIST digit classification.
10 |
11 | .. math::
12 |
13 | f(x) = G( b^{(2)} + W^{(2)}( s( b^{(1)} + W^{(1)} x))),
14 |
15 | References:
16 |
17 | - textbooks: "Pattern Recognition and Machine Learning" -
18 | Christopher M. Bishop, section 5
19 |
20 | """
21 | __docformat__ = 'restructuredtext en'
22 |
23 |
24 | import os
25 | import sys
26 | import time
27 |
28 | import numpy
29 |
30 | import theano
31 | import theano.tensor as T
32 |
33 |
34 | from logistic_sgd import LogisticRegression, load_data
35 |
36 |
37 | # start-snippet-1
38 | class HiddenLayer(object):
39 | def __init__(self, rng, input, n_in, n_out, W=None, b=None,
40 | activation=T.tanh):
41 | """
42 |         Typical hidden layer of an MLP: units are fully-connected and have
43 |         a sigmoidal activation function. Weight matrix W is of shape (n_in, n_out)
44 | and the bias vector b is of shape (n_out,).
45 |
46 | NOTE : The nonlinearity used here is tanh
47 |
48 | Hidden unit activation is given by: tanh(dot(input,W) + b)
49 |
50 | :type rng: numpy.random.RandomState
51 | :param rng: a random number generator used to initialize weights
52 |
53 | :type input: theano.tensor.dmatrix
54 | :param input: a symbolic tensor of shape (n_examples, n_in)
55 |
56 | :type n_in: int
57 | :param n_in: dimensionality of input
58 |
59 | :type n_out: int
60 | :param n_out: number of hidden units
61 |
62 | :type activation: theano.Op or function
63 | :param activation: Non linearity to be applied in the hidden
64 | layer
65 | """
66 | self.input = input
67 | # end-snippet-1
68 |
69 |         # `W` is initialized with `W_values` which is uniformly sampled
70 |         # from -sqrt(6./(n_in+n_out)) to sqrt(6./(n_in+n_out))
71 |         # for the tanh activation function
72 |         # the output of uniform is converted using asarray to dtype
73 |         # theano.config.floatX so that the code is runnable on GPU
74 |         # Note : optimal initialization of weights is dependent on the
75 |         #        activation function used (among other things).
76 |         #        For example, results presented in [Xavier10] suggest that you
77 |         #        should use 4 times larger initial weights for sigmoid
78 |         #        compared to tanh
79 |         #        We have no info for other functions, so we use the same as
80 |         #        tanh.
81 | if W is None:
82 | W_values = numpy.asarray(
83 | rng.uniform(
84 | low=-numpy.sqrt(6. / (n_in + n_out)),
85 | high=numpy.sqrt(6. / (n_in + n_out)),
86 | size=(n_in, n_out)
87 | ),
88 | dtype=theano.config.floatX
89 | )
90 | if activation == theano.tensor.nnet.sigmoid:
91 | W_values *= 4
92 |
93 | W = theano.shared(value=W_values, name='W', borrow=True)
94 |
95 | if b is None:
96 | b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
97 | b = theano.shared(value=b_values, name='b', borrow=True)
98 |
99 | self.W = W
100 | self.b = b
101 |
102 | lin_output = T.dot(input, self.W) + self.b
103 | self.output = (
104 | lin_output if activation is None
105 | else activation(lin_output)
106 | )
107 | # parameters of the model
108 | self.params = [self.W, self.b]
109 |
110 |
111 | # start-snippet-2
112 | class MLP(object):
113 | """Multi-Layer Perceptron Class
114 |
115 | A multilayer perceptron is a feedforward artificial neural network model
116 |     that has one or more layers of hidden units and nonlinear activations.
117 |     Intermediate layers usually have tanh or the sigmoid function as their
118 |     activation function (defined here by a ``HiddenLayer`` class) while the
119 |     top layer is a softmax layer (defined here by a ``LogisticRegression``
120 | class).
121 | """
122 |
123 | def __init__(self, rng, input, n_in, n_hidden, n_out):
124 | """Initialize the parameters for the multilayer perceptron
125 |
126 | :type rng: numpy.random.RandomState
127 | :param rng: a random number generator used to initialize weights
128 |
129 | :type input: theano.tensor.TensorType
130 | :param input: symbolic variable that describes the input of the
131 | architecture (one minibatch)
132 |
133 | :type n_in: int
134 | :param n_in: number of input units, the dimension of the space in
135 | which the datapoints lie
136 |
137 | :type n_hidden: int
138 | :param n_hidden: number of hidden units
139 |
140 | :type n_out: int
141 | :param n_out: number of output units, the dimension of the space in
142 | which the labels lie
143 |
144 | """
145 |
146 | # Since we are dealing with a one hidden layer MLP, this will translate
147 | # into a HiddenLayer with a tanh activation function connected to the
148 | # LogisticRegression layer; the activation function can be replaced by
149 | # sigmoid or any other nonlinear function
150 | self.hiddenLayer = HiddenLayer(
151 | rng=rng,
152 | input=input,
153 | n_in=n_in,
154 | n_out=n_hidden,
155 | activation=T.tanh
156 | )
157 |
158 | # The logistic regression layer gets as input the hidden units
159 | # of the hidden layer
160 | self.logRegressionLayer = LogisticRegression(
161 | input=self.hiddenLayer.output,
162 | n_in=n_hidden,
163 | n_out=n_out
164 | )
165 | # end-snippet-2 start-snippet-3
166 | # L1 norm ; one regularization option is to enforce L1 norm to
167 | # be small
168 | self.L1 = (
169 | abs(self.hiddenLayer.W).sum()
170 | + abs(self.logRegressionLayer.W).sum()
171 | )
172 |
173 | # square of L2 norm ; one regularization option is to enforce
174 | # square of L2 norm to be small
175 | self.L2_sqr = (
176 | (self.hiddenLayer.W ** 2).sum()
177 | + (self.logRegressionLayer.W ** 2).sum()
178 | )
179 |
180 | # negative log likelihood of the MLP is given by the negative
181 | # log likelihood of the output of the model, computed in the
182 | # logistic regression layer
183 | self.negative_log_likelihood = (
184 | self.logRegressionLayer.negative_log_likelihood
185 | )
186 | # same holds for the function computing the number of errors
187 | self.errors = self.logRegressionLayer.errors
188 |
189 |         # the parameters of the model are the parameters of the two layers it is
190 | # made out of
191 | self.params = self.hiddenLayer.params + self.logRegressionLayer.params
192 | # end-snippet-3
193 |
194 |
195 | def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
196 | dataset='mnist.pkl.gz', batch_size=20, n_hidden=500):
197 | """
198 | Demonstrate stochastic gradient descent optimization for a multilayer
199 | perceptron
200 |
201 | This is demonstrated on MNIST.
202 |
203 | :type learning_rate: float
204 |     :param learning_rate: learning rate used (factor for the stochastic
205 |                           gradient)
206 |
207 | :type L1_reg: float
208 | :param L1_reg: L1-norm's weight when added to the cost (see
209 | regularization)
210 |
211 | :type L2_reg: float
212 | :param L2_reg: L2-norm's weight when added to the cost (see
213 | regularization)
214 |
215 | :type n_epochs: int
216 | :param n_epochs: maximal number of epochs to run the optimizer
217 |
218 | :type dataset: string
219 | :param dataset: the path of the MNIST dataset file from
220 | http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
221 |
222 |
223 | """
224 | datasets = load_data(dataset)
225 |
226 | train_set_x, train_set_y = datasets[0]
227 | valid_set_x, valid_set_y = datasets[1]
228 | test_set_x, test_set_y = datasets[2]
229 |
230 | # compute number of minibatches for training, validation and testing
231 | n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
232 | n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
233 | n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size
234 |
235 | ######################
236 | # BUILD ACTUAL MODEL #
237 | ######################
238 | print '... building the model'
239 |
240 | # allocate symbolic variables for the data
241 | index = T.lscalar() # index to a [mini]batch
242 | x = T.matrix('x') # the data is presented as rasterized images
243 | y = T.ivector('y') # the labels are presented as 1D vector of
244 | # [int] labels
245 |
246 | rng = numpy.random.RandomState(1234)
247 |
248 | # construct the MLP class
249 | classifier = MLP(
250 | rng=rng,
251 | input=x,
252 | n_in=28 * 28,
253 | n_hidden=n_hidden,
254 | n_out=10
255 | )
256 |
257 | # start-snippet-4
258 | # the cost we minimize during training is the negative log likelihood of
259 | # the model plus the regularization terms (L1 and L2); cost is expressed
260 | # here symbolically
261 | cost = (
262 | classifier.negative_log_likelihood(y)
263 | + L1_reg * classifier.L1
264 | + L2_reg * classifier.L2_sqr
265 | )
266 | # end-snippet-4
267 |
268 | # compiling a Theano function that computes the mistakes that are made
269 | # by the model on a minibatch
270 | test_model = theano.function(
271 | inputs=[index],
272 | outputs=classifier.errors(y),
273 | givens={
274 | x: test_set_x[index * batch_size:(index + 1) * batch_size],
275 | y: test_set_y[index * batch_size:(index + 1) * batch_size]
276 | }
277 | )
278 |
279 | validate_model = theano.function(
280 | inputs=[index],
281 | outputs=classifier.errors(y),
282 | givens={
283 | x: valid_set_x[index * batch_size:(index + 1) * batch_size],
284 | y: valid_set_y[index * batch_size:(index + 1) * batch_size]
285 | }
286 | )
287 |
288 | # start-snippet-5
289 |     # compute the gradient of cost with respect to theta (stored in params)
290 | # the resulting gradients will be stored in a list gparams
291 | gparams = [T.grad(cost, param) for param in classifier.params]
292 |
293 | # specify how to update the parameters of the model as a list of
294 | # (variable, update expression) pairs
295 |
296 |     # given two lists of the same length, A = [a1, a2, a3, a4] and
297 |     # B = [b1, b2, b3, b4], zip generates a list C of the same length, where
298 |     # each element is a pair formed from the two lists:
299 |     # C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
300 | updates = [
301 | (param, param - learning_rate * gparam)
302 | for param, gparam in zip(classifier.params, gparams)
303 | ]
304 |
305 |     # compiling a Theano function `train_model` that returns the cost, but
306 |     # at the same time updates the parameters of the model based on the rules
307 | # defined in `updates`
308 | train_model = theano.function(
309 | inputs=[index],
310 | outputs=cost,
311 | updates=updates,
312 | givens={
313 | x: train_set_x[index * batch_size: (index + 1) * batch_size],
314 | y: train_set_y[index * batch_size: (index + 1) * batch_size]
315 | }
316 | )
317 | # end-snippet-5
318 |
319 | ###############
320 | # TRAIN MODEL #
321 | ###############
322 | print '... training'
323 |
324 | # early-stopping parameters
325 |     patience = 10000  # look at this many examples regardless
326 | patience_increase = 2 # wait this much longer when a new best is
327 | # found
328 | improvement_threshold = 0.995 # a relative improvement of this much is
329 | # considered significant
330 |     validation_frequency = min(n_train_batches, patience / 2)
331 |                                   # go through this many
332 |                                   # minibatches before checking the network
333 |                                   # on the validation set; in this case we
334 |                                   # check every epoch
335 |
336 | best_validation_loss = numpy.inf
337 | best_iter = 0
338 | test_score = 0.
339 | start_time = time.clock()
340 |
341 | epoch = 0
342 | done_looping = False
343 |
344 | while (epoch < n_epochs) and (not done_looping):
345 | epoch = epoch + 1
346 | for minibatch_index in xrange(n_train_batches):
347 |
348 | minibatch_avg_cost = train_model(minibatch_index)
349 | # iteration number
350 | iter = (epoch - 1) * n_train_batches + minibatch_index
351 |
352 | if (iter + 1) % validation_frequency == 0:
353 | # compute zero-one loss on validation set
354 | validation_losses = [validate_model(i) for i
355 | in xrange(n_valid_batches)]
356 | this_validation_loss = numpy.mean(validation_losses)
357 |
358 | print(
359 | 'epoch %i, minibatch %i/%i, validation error %f %%' %
360 | (
361 | epoch,
362 | minibatch_index + 1,
363 | n_train_batches,
364 | this_validation_loss * 100.
365 | )
366 | )
367 |
368 | # if we got the best validation score until now
369 | if this_validation_loss < best_validation_loss:
370 | #improve patience if loss improvement is good enough
371 | if (
372 | this_validation_loss < best_validation_loss *
373 | improvement_threshold
374 | ):
375 | patience = max(patience, iter * patience_increase)
376 |
377 | best_validation_loss = this_validation_loss
378 | best_iter = iter
379 |
380 | # test it on the test set
381 | test_losses = [test_model(i) for i
382 | in xrange(n_test_batches)]
383 | test_score = numpy.mean(test_losses)
384 |
385 | print((' epoch %i, minibatch %i/%i, test error of '
386 | 'best model %f %%') %
387 | (epoch, minibatch_index + 1, n_train_batches,
388 | test_score * 100.))
389 |
390 | if patience <= iter:
391 | done_looping = True
392 | break
393 |
394 | end_time = time.clock()
395 | print(('Optimization complete. Best validation score of %f %% '
396 | 'obtained at iteration %i, with test performance %f %%') %
397 | (best_validation_loss * 100., best_iter + 1, test_score * 100.))
398 | print >> sys.stderr, ('The code for file ' +
399 | os.path.split(__file__)[1] +
400 | ' ran for %.2fm' % ((end_time - start_time) / 60.))
401 |
402 |
403 | if __name__ == '__main__':
404 | test_mlp()
405 |
--------------------------------------------------------------------------------
/code/dA.py:
--------------------------------------------------------------------------------
1 | """
2 | This tutorial introduces denoising auto-encoders (dA) using Theano.
3 |
4 | Denoising autoencoders are the building blocks for SdA.
5 | They are based on auto-encoders as the ones used in Bengio et al. 2007.
6 | An autoencoder takes an input x and first maps it to a hidden representation
7 | y = f_{\theta}(x) = s(Wx+b), parameterized by \theta={W,b}. The resulting
8 | latent representation y is then mapped back to a "reconstructed" vector
9 | z \in [0,1]^d in input space z = g_{\theta'}(y) = s(W'y + b'). The weight
10 | matrix W' can optionally be constrained such that W' = W^T, in which case
11 | the autoencoder is said to have tied weights. The network is trained
12 | to minimize the reconstruction error (the error between x and z).
13 |
14 | For the denoising autoencoder, during training, first x is corrupted into
15 | \tilde{x}, where \tilde{x} is a partially destroyed version of x by means
16 | of a stochastic mapping. Afterwards y is computed as before (using
17 | \tilde{x}), y = s(W\tilde{x} + b) and z as s(W'y + b'). The reconstruction
18 | error is now measured between z and the uncorrupted input x, which is
19 | computed as the cross-entropy :
20 | - \sum_{k=1}^d[ x_k \log z_k + (1-x_k) \log( 1-z_k)]
21 |
22 |
23 | References :
24 | - P. Vincent, H. Larochelle, Y. Bengio, P.A. Manzagol: Extracting and
25 | Composing Robust Features with Denoising Autoencoders, ICML'08, 1096-1103,
26 | 2008
27 | - Y. Bengio, P. Lamblin, D. Popovici, H. Larochelle: Greedy Layer-Wise
28 | Training of Deep Networks, Advances in Neural Information Processing
29 | Systems 19, 2007
30 |
31 | """
32 |
33 | import os
34 | import sys
35 | import time
36 |
37 | import numpy
38 |
39 | import theano
40 | import theano.tensor as T
41 | from theano.tensor.shared_randomstreams import RandomStreams
42 |
43 | from logistic_sgd import load_data
44 | from utils import tile_raster_images
45 |
46 | try:
47 | import PIL.Image as Image
48 | except ImportError:
49 | import Image
50 |
51 |
52 | # start-snippet-1
53 | class dA(object):
54 | """Denoising Auto-Encoder class (dA)
55 |
56 |     A denoising auto-encoder tries to reconstruct the input from a corrupted
57 | version of it by projecting it first in a latent space and reprojecting
58 | it afterwards back in the input space. Please refer to Vincent et al.,2008
59 | for more details. If x is the input then equation (1) computes a partially
60 | destroyed version of x by means of a stochastic mapping q_D. Equation (2)
61 | computes the projection of the input into the latent space. Equation (3)
62 | computes the reconstruction of the input, while equation (4) computes the
63 | reconstruction error.
64 |
65 | .. math::
66 |
67 | \tilde{x} ~ q_D(\tilde{x}|x) (1)
68 |
69 | y = s(W \tilde{x} + b) (2)
70 |
71 |         z = s(W' y + b')                                (3)
72 |
73 | L(x,z) = -sum_{k=1}^d [x_k \log z_k + (1-x_k) \log( 1-z_k)] (4)
74 |
75 | """
76 |
77 | def __init__(
78 | self,
79 | numpy_rng,
80 | theano_rng=None,
81 | input=None,
82 | n_visible=784,
83 | n_hidden=500,
84 | W=None,
85 | bhid=None,
86 | bvis=None
87 | ):
88 | """
89 |         Initialize the dA class by specifying the number of visible units (the
90 |         dimension d of the input), the number of hidden units (the dimension
91 |         d' of the latent or hidden space) and the corruption level. The
92 | constructor also receives symbolic variables for the input, weights and
93 |         bias. Such symbolic variables are useful when, for example, the input
94 | is the result of some computations, or when weights are shared between
95 | the dA and an MLP layer. When dealing with SdAs this always happens,
96 | the dA on layer 2 gets as input the output of the dA on layer 1,
97 | and the weights of the dA are used in the second stage of training
98 | to construct an MLP.
99 |
100 | :type numpy_rng: numpy.random.RandomState
101 |         :param numpy_rng: numpy random number generator used to generate weights
102 |
103 | :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
104 | :param theano_rng: Theano random generator; if None is given one is
105 | generated based on a seed drawn from `rng`
106 |
107 | :type input: theano.tensor.TensorType
108 | :param input: a symbolic description of the input or None for
109 | standalone dA
110 |
111 | :type n_visible: int
112 | :param n_visible: number of visible units
113 |
114 | :type n_hidden: int
115 | :param n_hidden: number of hidden units
116 |
117 | :type W: theano.tensor.TensorType
118 | :param W: Theano variable pointing to a set of weights that should be
119 |                   shared between the dA and another architecture; if dA should
120 | be standalone set this to None
121 |
122 | :type bhid: theano.tensor.TensorType
123 |         :param bhid: Theano variable pointing to a set of bias values (for
124 |                      hidden units) that should be shared between the dA and another
125 | architecture; if dA should be standalone set this to None
126 |
127 | :type bvis: theano.tensor.TensorType
128 |         :param bvis: Theano variable pointing to a set of bias values (for
129 |                      visible units) that should be shared between the dA and another
130 | architecture; if dA should be standalone set this to None
131 |
132 |
133 | """
134 | self.n_visible = n_visible
135 | self.n_hidden = n_hidden
136 |
137 | # create a Theano random generator that gives symbolic random values
138 | if not theano_rng:
139 | theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
140 |
141 | # note : W' was written as `W_prime` and b' as `b_prime`
142 | if not W:
143 |             # W is initialized with `initial_W` which is uniformly sampled
144 |             # from -4*sqrt(6./(n_visible+n_hidden)) and
145 |             # 4*sqrt(6./(n_hidden+n_visible)); the output of uniform is
146 |             # converted using asarray to dtype
147 |             # theano.config.floatX so that the code is runnable on GPU
148 | initial_W = numpy.asarray(
149 | numpy_rng.uniform(
150 | low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)),
151 | high=4 * numpy.sqrt(6. / (n_hidden + n_visible)),
152 | size=(n_visible, n_hidden)
153 | ),
154 | dtype=theano.config.floatX
155 | )
156 | W = theano.shared(value=initial_W, name='W', borrow=True)
157 |
158 | if not bvis:
159 | bvis = theano.shared(
160 | value=numpy.zeros(
161 | n_visible,
162 | dtype=theano.config.floatX
163 | ),
164 | borrow=True
165 | )
166 |
167 | if not bhid:
168 | bhid = theano.shared(
169 | value=numpy.zeros(
170 | n_hidden,
171 | dtype=theano.config.floatX
172 | ),
173 | name='b',
174 | borrow=True
175 | )
176 |
177 | self.W = W
178 | # b corresponds to the bias of the hidden
179 | self.b = bhid
180 | # b_prime corresponds to the bias of the visible
181 | self.b_prime = bvis
182 | # tied weights, therefore W_prime is W transpose
183 | self.W_prime = self.W.T
184 | self.theano_rng = theano_rng
185 | # if no input is given, generate a variable representing the input
186 | if input is None:
187 | # we use a matrix because we expect a minibatch of several
188 | # examples, each example being a row
189 | self.x = T.dmatrix(name='input')
190 | else:
191 | self.x = input
192 |
193 | self.params = [self.W, self.b, self.b_prime]
194 | # end-snippet-1
195 |
196 | def get_corrupted_input(self, input, corruption_level):
197 |         """This function keeps ``1-corruption_level`` entries of the inputs the
198 |         same and zeroes out a randomly selected subset of size ``corruption_level``.
199 | Note : first argument of theano.rng.binomial is the shape(size) of
200 | random numbers that it should produce
201 | second argument is the number of trials
202 | third argument is the probability of success of any trial
203 |
204 | this will produce an array of 0s and 1s where 1 has a
205 | probability of 1 - ``corruption_level`` and 0 with
206 | ``corruption_level``
207 |
208 |                 The binomial function returns int64 data type by
209 |                 default. int64 multiplied by the input
210 |                 type (floatX) always returns float64. To keep all data
211 |                 in floatX when floatX is float32, we set the dtype of
212 |                 the binomial to floatX. As in our case the value of
213 |                 the binomial is always 0 or 1, this doesn't change the
214 |                 result. This is needed to allow the gpu to work
215 |                 correctly as it only supports float32 for now.
216 |
217 | """
218 | return self.theano_rng.binomial(size=input.shape, n=1,
219 | p=1 - corruption_level,
220 | dtype=theano.config.floatX) * input
221 |
222 | def get_hidden_values(self, input):
223 | """ Computes the values of the hidden layer """
224 | return T.nnet.sigmoid(T.dot(input, self.W) + self.b)
225 |
226 | def get_reconstructed_input(self, hidden):
227 | """Computes the reconstructed input given the values of the
228 | hidden layer
229 |
230 | """
231 | return T.nnet.sigmoid(T.dot(hidden, self.W_prime) + self.b_prime)
232 |
233 | def get_cost_updates(self, corruption_level, learning_rate):
234 |         """ This function computes the cost and the updates for one training
235 | step of the dA """
236 |
237 | tilde_x = self.get_corrupted_input(self.x, corruption_level)
238 | y = self.get_hidden_values(tilde_x)
239 | z = self.get_reconstructed_input(y)
240 | # note : we sum over the size of a datapoint; if we are using
241 | # minibatches, L will be a vector, with one entry per
242 | # example in minibatch
243 | L = - T.sum(self.x * T.log(z) + (1 - self.x) * T.log(1 - z), axis=1)
244 | # note : L is now a vector, where each element is the
245 | # cross-entropy cost of the reconstruction of the
246 | # corresponding example of the minibatch. We need to
247 | # compute the average of all these to get the cost of
248 | # the minibatch
249 | cost = T.mean(L)
250 |
251 | # compute the gradients of the cost of the `dA` with respect
252 | # to its parameters
253 | gparams = T.grad(cost, self.params)
254 | # generate the list of updates
255 | updates = [
256 | (param, param - learning_rate * gparam)
257 | for param, gparam in zip(self.params, gparams)
258 | ]
259 |
260 | return (cost, updates)
261 |
262 |
263 | def test_dA(learning_rate=0.1, training_epochs=15,
264 | dataset='mnist.pkl.gz',
265 | batch_size=20, output_folder='dA_plots'):
266 |
267 | """
268 | This demo is tested on MNIST
269 |
270 | :type learning_rate: float
271 |     :param learning_rate: learning rate used for training the Denoising
272 | AutoEncoder
273 |
274 | :type training_epochs: int
275 | :param training_epochs: number of epochs used for training
276 |
277 | :type dataset: string
278 |     :param dataset: path to the pickled dataset
279 |
280 | """
281 | datasets = load_data(dataset)
282 | train_set_x, train_set_y = datasets[0]
283 |
284 | # compute number of minibatches for training, validation and testing
285 | n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
286 |
287 | # allocate symbolic variables for the data
288 | index = T.lscalar() # index to a [mini]batch
289 | x = T.matrix('x') # the data is presented as rasterized images
290 |
291 | if not os.path.isdir(output_folder):
292 | os.makedirs(output_folder)
293 | os.chdir(output_folder)
294 | ####################################
295 | # BUILDING THE MODEL NO CORRUPTION #
296 | ####################################
297 |
298 | rng = numpy.random.RandomState(123)
299 | theano_rng = RandomStreams(rng.randint(2 ** 30))
300 |
301 | da = dA(
302 | numpy_rng=rng,
303 | theano_rng=theano_rng,
304 | input=x,
305 | n_visible=28 * 28,
306 | n_hidden=500
307 | )
308 |
309 | cost, updates = da.get_cost_updates(
310 | corruption_level=0.,
311 | learning_rate=learning_rate
312 | )
313 |
314 | train_da = theano.function(
315 | [index],
316 | cost,
317 | updates=updates,
318 | givens={
319 | x: train_set_x[index * batch_size: (index + 1) * batch_size]
320 | }
321 | )
322 |
323 | start_time = time.clock()
324 |
325 | ############
326 | # TRAINING #
327 | ############
328 |
329 | # go through training epochs
330 | for epoch in xrange(training_epochs):
331 |         # go through training set
332 | c = []
333 | for batch_index in xrange(n_train_batches):
334 | c.append(train_da(batch_index))
335 |
336 | print 'Training epoch %d, cost ' % epoch, numpy.mean(c)
337 |
338 | end_time = time.clock()
339 |
340 | training_time = (end_time - start_time)
341 |
342 | print >> sys.stderr, ('The no corruption code for file ' +
343 | os.path.split(__file__)[1] +
344 | ' ran for %.2fm' % ((training_time) / 60.))
345 | image = Image.fromarray(
346 | tile_raster_images(X=da.W.get_value(borrow=True).T,
347 | img_shape=(28, 28), tile_shape=(10, 10),
348 | tile_spacing=(1, 1)))
349 | image.save('filters_corruption_0.png')
350 |
351 | #####################################
352 | # BUILDING THE MODEL CORRUPTION 30% #
353 | #####################################
354 |
355 | rng = numpy.random.RandomState(123)
356 | theano_rng = RandomStreams(rng.randint(2 ** 30))
357 |
358 | da = dA(
359 | numpy_rng=rng,
360 | theano_rng=theano_rng,
361 | input=x,
362 | n_visible=28 * 28,
363 | n_hidden=500
364 | )
365 |
366 | cost, updates = da.get_cost_updates(
367 | corruption_level=0.3,
368 | learning_rate=learning_rate
369 | )
370 |
371 | train_da = theano.function(
372 | [index],
373 | cost,
374 | updates=updates,
375 | givens={
376 | x: train_set_x[index * batch_size: (index + 1) * batch_size]
377 | }
378 | )
379 |
380 | start_time = time.clock()
381 |
382 | ############
383 | # TRAINING #
384 | ############
385 |
386 | # go through training epochs
387 | for epoch in xrange(training_epochs):
388 |         # go through training set
389 | c = []
390 | for batch_index in xrange(n_train_batches):
391 | c.append(train_da(batch_index))
392 |
393 | print 'Training epoch %d, cost ' % epoch, numpy.mean(c)
394 |
395 | end_time = time.clock()
396 |
397 | training_time = (end_time - start_time)
398 |
399 | print >> sys.stderr, ('The 30% corruption code for file ' +
400 | os.path.split(__file__)[1] +
401 | ' ran for %.2fm' % (training_time / 60.))
402 |
403 | image = Image.fromarray(tile_raster_images(
404 | X=da.W.get_value(borrow=True).T,
405 | img_shape=(28, 28), tile_shape=(10, 10),
406 | tile_spacing=(1, 1)))
407 | image.save('filters_corruption_30.png')
408 |
409 | os.chdir('../')
410 |
411 |
412 | if __name__ == '__main__':
413 | test_dA()
414 |
--------------------------------------------------------------------------------
/code/hmc/hmc.py:
--------------------------------------------------------------------------------
1 | """
2 | Hybrid / Hamiltonian Monte Carlo (HMC) sampling implemented with Theano.
3 | """
4 |
5 | import numpy
6 |
7 | from theano import function, shared
8 | from theano import tensor as TT
9 | import theano
10 |
11 | sharedX = (lambda X, name:
12 | shared(numpy.asarray(X, dtype=theano.config.floatX), name=name))
13 |
14 |
15 | def kinetic_energy(vel):
16 | """Returns the kinetic energy associated with the given velocity
17 | and mass of 1.
18 |
19 | Parameters
20 | ----------
21 | vel: theano matrix
22 | Symbolic matrix whose rows are velocity vectors.
23 |
24 | Returns
25 | -------
26 | return: theano vector
27 |         Vector whose i-th entry is the kinetic energy associated with vel[i].
28 |
29 | """
30 | return 0.5 * (vel ** 2).sum(axis=1)
31 |
32 |
33 | def hamiltonian(pos, vel, energy_fn):
34 | """
35 | Returns the Hamiltonian (sum of potential and kinetic energy) for the given
36 | velocity and position.
37 |
38 | Parameters
39 | ----------
40 | pos: theano matrix
41 | Symbolic matrix whose rows are position vectors.
42 | vel: theano matrix
43 | Symbolic matrix whose rows are velocity vectors.
44 | energy_fn: python function
45 |         Python function, operating on symbolic theano variables, used to
46 | compute the potential energy at a given position.
47 |
48 | Returns
49 | -------
50 | return: theano vector
51 | Vector whose i-th entry is the Hamiltonian at position pos[i] and
52 | velocity vel[i].
53 | """
54 | # assuming mass is 1
55 | return energy_fn(pos) + kinetic_energy(vel)
56 |
57 |
58 | def metropolis_hastings_accept(energy_prev, energy_next, s_rng):
59 | """
60 | Performs a Metropolis-Hastings accept-reject move.
61 |
62 | Parameters
63 | ----------
64 | energy_prev: theano vector
65 | Symbolic theano tensor which contains the energy associated with the
66 | configuration at time-step t.
67 | energy_next: theano vector
68 | Symbolic theano tensor which contains the energy associated with the
69 | proposed configuration at time-step t+1.
70 | s_rng: theano.tensor.shared_randomstreams.RandomStreams
71 | Theano shared random stream object used to generate the random number
72 | used in proposal.
73 |
74 | Returns
75 | -------
76 | return: boolean
77 | True if move is accepted, False otherwise
78 | """
79 | ediff = energy_prev - energy_next
80 | return (TT.exp(ediff) - s_rng.uniform(size=energy_prev.shape)) >= 0
81 |
82 |
83 | def simulate_dynamics(initial_pos, initial_vel, stepsize, n_steps, energy_fn):
84 | """
85 |     Return final (position, velocity) obtained after `n_steps` leapfrog
86 | updates, using Hamiltonian dynamics.
87 |
88 | Parameters
89 | ----------
90 | initial_pos: shared theano matrix
91 | Initial position at which to start the simulation
92 | initial_vel: shared theano matrix
93 | Initial velocity of particles
94 | stepsize: shared theano scalar
95 | Scalar value controlling amount by which to move
96 | energy_fn: python function
97 | Python function, operating on symbolic theano variables, used to
98 | compute the potential energy at a given position.
99 |
100 | Returns
101 | -------
102 | rval1: theano matrix
103 | Final positions obtained after simulation
104 | rval2: theano matrix
105 | Final velocity obtained after simulation
106 | """
107 |
108 | def leapfrog(pos, vel, step):
109 | """
110 | Inside loop of Scan. Performs one step of leapfrog update, using
111 | Hamiltonian dynamics.
112 |
113 | Parameters
114 | ----------
115 | pos: theano matrix
116 | in leapfrog update equations, represents pos(t), position at time t
117 | vel: theano matrix
118 | in leapfrog update equations, represents vel(t - stepsize/2),
119 | velocity at time (t - stepsize/2)
120 | step: theano scalar
121 | scalar value controlling amount by which to move
122 |
123 | Returns
124 | -------
125 | rval1: [theano matrix, theano matrix]
126 | Symbolic theano matrices for new position pos(t + stepsize), and
127 | velocity vel(t + stepsize/2)
128 | rval2: dictionary
129 | Dictionary of updates for the Scan Op
130 | """
131 | # from pos(t) and vel(t-stepsize/2), compute vel(t+stepsize/2)
132 | dE_dpos = TT.grad(energy_fn(pos).sum(), pos)
133 | new_vel = vel - step * dE_dpos
134 | # from vel(t+stepsize/2) compute pos(t+stepsize)
135 | new_pos = pos + step * new_vel
136 | return [new_pos, new_vel], {}
137 |
138 | # compute velocity at time-step: t + stepsize/2
139 | initial_energy = energy_fn(initial_pos)
140 | dE_dpos = TT.grad(initial_energy.sum(), initial_pos)
141 | vel_half_step = initial_vel - 0.5 * stepsize * dE_dpos
142 |
143 | # compute position at time-step: t + stepsize
144 | pos_full_step = initial_pos + stepsize * vel_half_step
145 |
146 | # perform leapfrog updates: the scan op is used to repeatedly compute
147 | # vel(t + (m-1/2)*stepsize) and pos(t + m*stepsize) for m in [2,n_steps].
148 | (all_pos, all_vel), scan_updates = theano.scan(
149 | leapfrog,
150 | outputs_info=[
151 | dict(initial=pos_full_step),
152 | dict(initial=vel_half_step),
153 | ],
154 | non_sequences=[stepsize],
155 | n_steps=n_steps - 1)
156 | final_pos = all_pos[-1]
157 | final_vel = all_vel[-1]
158 | # NOTE: Scan always returns an updates dictionary, in case the
159 | # scanned function draws samples from a RandomStream. These
160 | # updates must then be used when compiling the Theano function, to
161 | # avoid drawing the same random numbers each time the function is
162 | # called. In this case however, we consciously ignore
163 | # "scan_updates" because we know it is empty.
164 | assert not scan_updates
165 |
166 |     # The last velocity returned by scan is vel(t + (n_steps - 1/2) * stepsize).
167 |     # We therefore perform one more half-step to return
168 |     # vel(t + n_steps * stepsize).
169 | energy = energy_fn(final_pos)
170 | final_vel = final_vel - 0.5 * stepsize * TT.grad(energy.sum(), final_pos)
171 |
172 | # return new proposal state
173 | return final_pos, final_vel
174 |
175 |
176 | # start-snippet-1
177 | def hmc_move(s_rng, positions, energy_fn, stepsize, n_steps):
178 | """
179 |     This function performs one step of Hybrid Monte-Carlo sampling. We start by
180 | sampling a random velocity from a univariate Gaussian distribution, perform
181 | `n_steps` leap-frog updates using Hamiltonian dynamics and accept-reject
182 | using Metropolis-Hastings.
183 |
184 | Parameters
185 | ----------
186 | s_rng: theano shared random stream
187 | Symbolic random number generator used to draw random velocity and
188 | perform accept-reject move.
189 | positions: shared theano matrix
190 | Symbolic matrix whose rows are position vectors.
191 | energy_fn: python function
192 | Python function, operating on symbolic theano variables, used to
193 | compute the potential energy at a given position.
194 | stepsize: shared theano scalar
195 | Shared variable containing the stepsize to use for `n_steps` of HMC
196 | simulation steps.
197 | n_steps: integer
198 | Number of HMC steps to perform before proposing a new position.
199 |
200 | Returns
201 | -------
202 | rval1: boolean
203 | True if move is accepted, False otherwise
204 | rval2: theano matrix
205 | Matrix whose rows contain the proposed "new position"
206 | """
207 | # end-snippet-1 start-snippet-2
208 | # sample random velocity
209 | initial_vel = s_rng.normal(size=positions.shape)
210 | # end-snippet-2 start-snippet-3
211 | # perform simulation of particles subject to Hamiltonian dynamics
212 | final_pos, final_vel = simulate_dynamics(
213 | initial_pos=positions,
214 | initial_vel=initial_vel,
215 | stepsize=stepsize,
216 | n_steps=n_steps,
217 | energy_fn=energy_fn
218 | )
219 | # end-snippet-3 start-snippet-4
220 | # accept/reject the proposed move based on the joint distribution
221 | accept = metropolis_hastings_accept(
222 | energy_prev=hamiltonian(positions, initial_vel, energy_fn),
223 | energy_next=hamiltonian(final_pos, final_vel, energy_fn),
224 | s_rng=s_rng
225 | )
226 | # end-snippet-4
227 | return accept, final_pos
228 |
229 |
230 | # start-snippet-5
231 | def hmc_updates(positions, stepsize, avg_acceptance_rate, final_pos, accept,
232 | target_acceptance_rate, stepsize_inc, stepsize_dec,
233 | stepsize_min, stepsize_max, avg_acceptance_slowness):
234 | """This function is executed after `n_steps` of HMC sampling
235 | (`hmc_move` function). It creates the updates dictionary used by
236 | the `simulate` function. It takes care of updating: the position
237 | (if the move is accepted), the stepsize (to track a given target
238 | acceptance rate) and the average acceptance rate (computed as a
239 | moving average).
240 |
241 | Parameters
242 | ----------
243 | positions: shared variable, theano matrix
244 | Shared theano matrix whose rows contain the old position
245 | stepsize: shared variable, theano scalar
246 | Shared theano scalar containing current step size
247 | avg_acceptance_rate: shared variable, theano scalar
248 | Shared theano scalar containing the current average acceptance rate
249 | final_pos: shared variable, theano matrix
250 | Shared theano matrix whose rows contain the new position
251 | accept: theano scalar
252 |         Boolean-type variable representing whether or not the proposed HMC move
253 |         should be accepted.
254 | target_acceptance_rate: float
255 | The stepsize is modified in order to track this target acceptance rate.
256 | stepsize_inc: float
257 | Amount by which to increment stepsize when acceptance rate is too high.
258 | stepsize_dec: float
259 | Amount by which to decrement stepsize when acceptance rate is too low.
260 | stepsize_min: float
261 | Lower-bound on `stepsize`.
262 |     stepsize_max: float
263 | Upper-bound on `stepsize`.
264 | avg_acceptance_slowness: float
265 | Average acceptance rate is computed as an exponential moving average.
266 | (1-avg_acceptance_slowness) is the weight given to the newest
267 | observation.
268 |
269 | Returns
270 | -------
271 | rval1: dictionary-like
272 | A dictionary of updates to be used by the `HMC_Sampler.simulate`
273 | function. The updates target the position, stepsize and average
274 | acceptance rate.
275 |
276 | """
277 |
278 | ## POSITION UPDATES ##
279 | # broadcast `accept` scalar to tensor with the same dimensions as
280 | # final_pos.
281 | accept_matrix = accept.dimshuffle(0, *(('x',) * (final_pos.ndim - 1)))
282 | # if accept is True, update to `final_pos` else stay put
283 | new_positions = TT.switch(accept_matrix, final_pos, positions)
284 | # end-snippet-5 start-snippet-7
285 | ## STEPSIZE UPDATES ##
286 | # if acceptance rate is too low, our sampler is too "noisy" and we reduce
287 | # the stepsize. If it is too high, our sampler is too conservative, we can
288 | # get away with a larger stepsize (resulting in better mixing).
289 | _new_stepsize = TT.switch(avg_acceptance_rate > target_acceptance_rate,
290 | stepsize * stepsize_inc, stepsize * stepsize_dec)
291 | # maintain stepsize in [stepsize_min, stepsize_max]
292 | new_stepsize = TT.clip(_new_stepsize, stepsize_min, stepsize_max)
293 |
294 | # end-snippet-7 start-snippet-6
295 | ## ACCEPT RATE UPDATES ##
296 | # perform exponential moving average
297 | mean_dtype = theano.scalar.upcast(accept.dtype, avg_acceptance_rate.dtype)
298 | new_acceptance_rate = TT.add(
299 | avg_acceptance_slowness * avg_acceptance_rate,
300 | (1.0 - avg_acceptance_slowness) * accept.mean(dtype=mean_dtype))
301 | # end-snippet-6 start-snippet-8
302 | return [(positions, new_positions),
303 | (stepsize, new_stepsize),
304 | (avg_acceptance_rate, new_acceptance_rate)]
305 | # end-snippet-8
306 |
307 |
308 | class HMC_sampler(object):
309 | """
310 | Convenience wrapper for performing Hybrid Monte Carlo (HMC). It creates the
311 | symbolic graph for performing an HMC simulation (using `hmc_move` and
312 | `hmc_updates`). The graph is then compiled into the `simulate` function, a
313 | theano function which runs the simulation and updates the required shared
314 | variables.
315 |
316 |     Users should interface with the sampler through the `draw` function which
317 |     advances the Markov chain and returns the current sample by calling
318 | `simulate` and `get_position` in sequence.
319 |
320 | The hyper-parameters are the same as those used by Marc'Aurelio's
321 | 'train_mcRBM.py' file (available on his personal home page).
322 | """
323 |
324 | def __init__(self, **kwargs):
325 | self.__dict__.update(kwargs)
326 |
327 | @classmethod
328 | def new_from_shared_positions(
329 | cls,
330 | shared_positions,
331 | energy_fn,
332 | initial_stepsize=0.01,
333 | target_acceptance_rate=.9,
334 | n_steps=20,
335 | stepsize_dec=0.98,
336 | stepsize_min=0.001,
337 | stepsize_max=0.25,
338 | stepsize_inc=1.02,
339 | # used in geometric avg. 1.0 would be not moving at all
340 | avg_acceptance_slowness=0.9,
341 | seed=12345
342 | ):
343 | """
344 | :param shared_positions: theano ndarray shared var with
345 | many particle [initial] positions
346 |
347 | :param energy_fn:
348 | callable such that energy_fn(positions)
349 | returns theano vector of energies.
350 | The len of this vector is the batchsize.
351 |
352 | The sum of this energy vector must be differentiable (with
353 | theano.tensor.grad) with respect to the positions for HMC
354 | sampling to work.
355 |
356 | """
357 | # allocate shared variables
358 | stepsize = sharedX(initial_stepsize, 'hmc_stepsize')
359 | avg_acceptance_rate = sharedX(target_acceptance_rate,
360 | 'avg_acceptance_rate')
361 | s_rng = TT.shared_randomstreams.RandomStreams(seed)
362 |
363 | # define graph for an `n_steps` HMC simulation
364 | accept, final_pos = hmc_move(
365 | s_rng,
366 | shared_positions,
367 | energy_fn,
368 | stepsize,
369 | n_steps)
370 |
371 | # define the dictionary of updates, to apply on every `simulate` call
372 | simulate_updates = hmc_updates(
373 | shared_positions,
374 | stepsize,
375 | avg_acceptance_rate,
376 | final_pos=final_pos,
377 | accept=accept,
378 | stepsize_min=stepsize_min,
379 | stepsize_max=stepsize_max,
380 | stepsize_inc=stepsize_inc,
381 | stepsize_dec=stepsize_dec,
382 | target_acceptance_rate=target_acceptance_rate,
383 | avg_acceptance_slowness=avg_acceptance_slowness)
384 |
385 | # compile theano function
386 | simulate = function([], [], updates=simulate_updates)
387 |
388 | # create HMC_sampler object with the following attributes ...
389 | return cls(
390 | positions=shared_positions,
391 | stepsize=stepsize,
392 | stepsize_min=stepsize_min,
393 | stepsize_max=stepsize_max,
394 | avg_acceptance_rate=avg_acceptance_rate,
395 | target_acceptance_rate=target_acceptance_rate,
396 | s_rng=s_rng,
397 | _updates=simulate_updates,
398 | simulate=simulate)
399 |
400 | def draw(self, **kwargs):
401 | """
402 | Returns a new position obtained after `n_steps` of HMC simulation.
403 |
404 | Parameters
405 | ----------
406 | kwargs: dictionary
407 | The `kwargs` dictionary is passed to the shared variable
408 | (self.positions) `get_value()` function. For example, to avoid
409 | copying the shared variable value, consider passing `borrow=True`.
410 |
411 | Returns
412 | -------
413 | rval: numpy matrix
414 |             Numpy matrix whose dimensions are similar to `initial_position`.
415 | """
416 | self.simulate()
417 | return self.positions.get_value(borrow=False)
418 |
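# A minimal usage sketch (illustrative only; the energy function and sizes below
# are hypothetical, chosen so that the sum of energies is differentiable with
# theano.tensor.grad, as `new_from_shared_positions` requires):
#
#     rng = numpy.random.RandomState(123)
#     batchsize, dim = 100, 5
#     positions = shared(
#         rng.randn(batchsize, dim).astype(theano.config.floatX))
#
#     def gaussian_energy(x):
#         # isotropic standard Gaussian: E(x) = 0.5 * ||x||^2 per row
#         return 0.5 * (x ** 2).sum(axis=1)
#
#     sampler = HMC_sampler.new_from_shared_positions(positions, gaussian_energy)
#     samples = [sampler.draw() for _ in xrange(100)]  # each draw advances the chain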
--------------------------------------------------------------------------------