├── show_samples_tfd.py ├── .gitignore ├── show_inpaint_samples.py ├── show_samples_tfd_paper.py ├── show_samples_mnist_paper.py ├── show_samples_cifar_full_paper.py ├── show_samples_cifar_conv_paper.py ├── LICENSE ├── show_gen_weights.py ├── show_samples_inpaint.py ├── test_deconv.py ├── show_samples.py ├── README.md ├── ll_mnist.py ├── ll.py ├── mnist.yaml ├── cifar10_fully_connected.yaml ├── tfd_pretrain ├── train.yaml └── pretrain.yaml ├── parzen_ll.py ├── cifar10_convolutional.yaml ├── deconv.py ├── sgd.py ├── sgd_alt.py └── __init__.py /show_samples_tfd.py: -------------------------------------------------------------------------------- 1 | from pylearn2.utils import serial 2 | import sys 3 | _, model_path = sys.argv 4 | model = serial.load(model_path) 5 | from pylearn2.gui.patch_viewer import make_viewer 6 | space = model.generator.get_output_space() 7 | total_dimension = space.get_total_dimension() 8 | import numpy as np 9 | num_colors = 1 10 | #if total_dimension % 3 == 0: 11 | # num_colors = 3 12 | w = int(np.sqrt(total_dimension / num_colors)) 13 | from pylearn2.space import Conv2DSpace 14 | desired_space = Conv2DSpace(shape=[w, w], num_channels=num_colors, axes=('b',0,1,'c')) 15 | samples = space.format_as(batch=model.generator.sample(100), 16 | space=desired_space).eval() 17 | print (samples.min(), samples.mean(), samples.max()) 18 | viewer = make_viewer(samples * 2.0 - 1.0) 19 | viewer.show() 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | bin/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # Installer logs 26 | pip-log.txt 27 | pip-delete-this-directory.txt 28 | 29 | # Unit test / coverage reports 30 | htmlcov/ 31 | .tox/ 32 | .coverage 33 | .cache 34 | nosetests.xml 35 | coverage.xml 36 | 37 | # Translations 38 | *.mo 39 | 40 | # Mr Developer 41 | .mr.developer.cfg 42 | .project 43 | .pydevproject 44 | 45 | # Rope 46 | .ropeproject 47 | 48 | # Django stuff: 49 | *.log 50 | *.pot 51 | 52 | # Sphinx documentation 53 | docs/_build/ 54 | 55 | -------------------------------------------------------------------------------- /show_inpaint_samples.py: -------------------------------------------------------------------------------- 1 | from pylearn2.utils import serial 2 | import sys 3 | _, model_path = sys.argv 4 | model = serial.load(model_path) 5 | from pylearn2.gui.patch_viewer import make_viewer 6 | space = model.generator.get_output_space() 7 | from pylearn2.config import yaml_parse 8 | import numpy as np 9 | 10 | dataset = yaml_parse.load(model.dataset_yaml_src) 11 | dataset = dataset.get_test_set() 12 | 13 | grid_shape = None 14 | 15 | from pylearn2.utils import sharedX 16 | X = sharedX(dataset.get_batch_topo(100)) 17 | samples, ignore = model.generator.inpainting_sample_and_noise(X) 18 | samples = samples.eval() 19 | total_dimension = space.get_total_dimension() 20 | num_colors = 1 21 | if total_dimension % 3 == 0: 22 | num_colors = 3 23 | w = int(np.sqrt(total_dimension / num_colors)) 24 | from pylearn2.space import Conv2DSpace 25 | desired_space = Conv2DSpace(shape=[w, w], num_channels=num_colors, axes=('b',0,1,'c')) 26 | is_color = samples.shape[-1] == 3 27 
| print (samples.min(), samples.mean(), samples.max()) 28 | # Hack for detecting MNIST [0, 1] values. Otherwise we assume centered images 29 | if samples.min() >0: 30 | samples = samples * 2.0 - 1.0 31 | viewer = make_viewer(samples, grid_shape=grid_shape, is_color=is_color) 32 | viewer.show() 33 | -------------------------------------------------------------------------------- /show_samples_tfd_paper.py: -------------------------------------------------------------------------------- 1 | from pylearn2.utils import serial 2 | import sys 3 | _, model_path = sys.argv 4 | model = serial.load(model_path) 5 | from pylearn2.gui.patch_viewer import make_viewer 6 | space = model.generator.get_output_space() 7 | from pylearn2.config import yaml_parse 8 | from pylearn2.gui.patch_viewer import PatchViewer 9 | import numpy as np 10 | 11 | dataset = yaml_parse.load(model.dataset_yaml_src) 12 | 13 | grid_shape = None 14 | 15 | rows = 4 16 | sample_cols = 5 17 | 18 | # For some reason format_as from VectorSpace is not working right 19 | samples = model.generator.sample(rows * sample_cols).eval() 20 | topo_samples = dataset.get_topological_view(samples) 21 | 22 | pv = PatchViewer(grid_shape=(rows, sample_cols + 1), patch_shape=(48,48), 23 | is_color=False) 24 | 25 | X = dataset.X 26 | topo = dataset.get_topological_view() 27 | index = 0 28 | for i in xrange(samples.shape[0]): 29 | topo_sample = topo_samples[i, :, :, :] 30 | pv.add_patch(topo_sample * 2. - 1., rescale=False) 31 | 32 | if (i +1) % sample_cols == 0: 33 | sample = samples[i, :] 34 | dists = np.square(X - sample).sum(axis=1) 35 | j = np.argmin(dists) 36 | match = topo[j, :] 37 | pv.add_patch(match * 2 -1, rescale=False, activation=1) 38 | 39 | pv.show() 40 | -------------------------------------------------------------------------------- /show_samples_mnist_paper.py: -------------------------------------------------------------------------------- 1 | from pylearn2.utils import serial 2 | import sys 3 | _, model_path = sys.argv 4 | model = serial.load(model_path) 5 | from pylearn2.gui.patch_viewer import make_viewer 6 | space = model.generator.get_output_space() 7 | from pylearn2.config import yaml_parse 8 | from pylearn2.gui.patch_viewer import PatchViewer 9 | import numpy as np 10 | 11 | dataset = yaml_parse.load(model.dataset_yaml_src) 12 | 13 | grid_shape = None 14 | 15 | rows = 4 16 | sample_cols = 5 17 | 18 | # For some reason format_as from VectorSpace is not working right 19 | samples = model.generator.sample(rows * sample_cols).eval() 20 | topo_samples = dataset.get_topological_view(samples) 21 | 22 | pv = PatchViewer(grid_shape=(rows, sample_cols + 1), patch_shape=(28,28), 23 | is_color=False) 24 | 25 | X = dataset.X 26 | topo = dataset.get_topological_view() 27 | index = 0 28 | for i in xrange(samples.shape[0]): 29 | topo_sample = topo_samples[i, :, :, :] 30 | pv.add_patch(topo_sample * 2. 
- 1., rescale=False) 31 | 32 | if (i +1) % sample_cols == 0: 33 | sample = samples[i, :] 34 | dists = np.square(X - sample).sum(axis=1) 35 | j = np.argmin(dists) 36 | match = topo[j, :] 37 | pv.add_patch(match * 2 -1, rescale=False, activation=1) 38 | 39 | pv.show() 40 | -------------------------------------------------------------------------------- /show_samples_cifar_full_paper.py: -------------------------------------------------------------------------------- 1 | from pylearn2.utils import serial 2 | import sys 3 | _, model_path = sys.argv 4 | model = serial.load(model_path) 5 | space = model.generator.get_output_space() 6 | from pylearn2.config import yaml_parse 7 | from pylearn2.gui.patch_viewer import PatchViewer 8 | import numpy as np 9 | 10 | dataset = yaml_parse.load(model.dataset_yaml_src) 11 | 12 | grid_shape = None 13 | 14 | rows = 4 15 | sample_cols = 5 16 | 17 | # For some reason format_as from VectorSpace is not working right 18 | samples = model.generator.sample(rows * sample_cols).eval() 19 | topo_samples = dataset.get_topological_view(samples) 20 | 21 | pv = PatchViewer(grid_shape=(rows, sample_cols + 1), patch_shape=(32,32), 22 | is_color=True) 23 | scale = np.abs(samples).max() 24 | 25 | X = dataset.X 26 | topo = dataset.get_topological_view() 27 | index = 0 28 | for i in xrange(samples.shape[0]): 29 | topo_sample = topo_samples[i, :, :, :] 30 | print topo_sample.min(), topo_sample.max() 31 | pv.add_patch(topo_sample / scale, rescale=False) 32 | 33 | if (i +1) % sample_cols == 0: 34 | sample = samples[i, :] 35 | dists = np.square(X - sample).sum(axis=1) 36 | j = np.argmin(dists) 37 | match = topo[j, :] 38 | print match.min(), match.max() 39 | pv.add_patch(match / scale, rescale=False, activation=1) 40 | 41 | pv.show() 42 | -------------------------------------------------------------------------------- /show_samples_cifar_conv_paper.py: -------------------------------------------------------------------------------- 1 | from pylearn2.utils import serial 2 | import sys 3 | _, model_path = sys.argv 4 | model = serial.load(model_path) 5 | space = model.generator.get_output_space() 6 | from pylearn2.config import yaml_parse 7 | from pylearn2.gui.patch_viewer import PatchViewer 8 | import numpy as np 9 | 10 | dataset = yaml_parse.load(model.dataset_yaml_src) 11 | 12 | grid_shape = None 13 | 14 | rows = 4 15 | sample_cols = 5 16 | 17 | # For some reason format_as from VectorSpace is not working right 18 | topo_samples = model.generator.sample(rows * sample_cols).eval() 19 | samples = dataset.get_design_matrix(topo_samples) 20 | dataset.axes = ['b', 0, 1, 'c'] 21 | dataset.view_converter.axes = ['b', 0, 1, 'c'] 22 | topo_samples = dataset.get_topological_view(samples) 23 | 24 | pv = PatchViewer(grid_shape=(rows, sample_cols + 1), patch_shape=(32,32), 25 | is_color=True) 26 | scale = np.abs(samples).max() 27 | 28 | X = dataset.X 29 | topo = dataset.get_topological_view() 30 | index = 0 31 | for i in xrange(samples.shape[0]): 32 | topo_sample = topo_samples[i, :, :, :] 33 | print topo_sample.min(), topo_sample.max() 34 | pv.add_patch(topo_sample / scale, rescale=False) 35 | 36 | if (i +1) % sample_cols == 0: 37 | sample = samples[i, :] 38 | dists = np.square(X - sample).sum(axis=1) 39 | j = np.argmin(dists) 40 | match = topo[j, :] 41 | print match.min(), match.max() 42 | pv.add_patch(match / scale, rescale=False, activation=1) 43 | 44 | pv.show() 45 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Ian Goodfellow 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of the {organization} nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /show_gen_weights.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pylearn2.gui.patch_viewer import make_viewer 3 | from pylearn2.utils import serial 4 | model = serial.load(sys.argv[1]) 5 | generator = model.generator 6 | 7 | final = generator.mlp.layers[-1] 8 | success = False 9 | 10 | i = -1 11 | success = False 12 | to_search = generator.mlp 13 | while not success: 14 | print "while loop ", i 15 | final = to_search.layers[i] 16 | if 'Composite' in str(type(final)): 17 | i = input("which") 18 | elem = final.layers[i] 19 | if hasattr(elem, 'layers'): 20 | print "stepping into inner MLP" 21 | i = -1 22 | to_search = elem 23 | continue 24 | else: 25 | print "examining this element" 26 | final = elem 27 | 28 | try: 29 | print "Trying get_weights topo" 30 | topo = final.get_weights_topo() 31 | print "It worked" 32 | success = True 33 | except Exception: 34 | pass 35 | 36 | if success: 37 | print "Making the viewer and showing" 38 | make_viewer(topo).show() 39 | quit() 40 | 41 | try: 42 | print "Trying get_weights" 43 | weights = final.get_weights() 44 | print "It worked" 45 | success = True 46 | except NotImplementedError: 47 | i -= 1 # skip over SpaceConverter, etc. 
48 | print "Out of the while loop" 49 | 50 | 51 | print "weights shape ", weights.shape 52 | viewer = make_viewer(weights, is_color=weights.shape[1] % 3 == 0 and weights.shape[1] != 48*48) 53 | print "image shape ", viewer.image.shape 54 | 55 | print "made viewer" 56 | 57 | viewer.show() 58 | 59 | print "executed show" 60 | -------------------------------------------------------------------------------- /show_samples_inpaint.py: -------------------------------------------------------------------------------- 1 | import theano 2 | from pylearn2.utils import serial 3 | import sys 4 | from pylearn2.gui.patch_viewer import make_viewer 5 | from pylearn2.space import VectorSpace 6 | from pylearn2.config import yaml_parse 7 | import numpy as np 8 | import ipdb 9 | 10 | 11 | # TODO, only works for CIFAR10 for now 12 | 13 | grid_shape = None 14 | repeat_samples = 1 15 | num_samples = 5 16 | 17 | 18 | _, model_path = sys.argv 19 | model = serial.load(model_path) 20 | rng = np.random.RandomState(20232) 21 | 22 | def get_data_samples(dataset, n = num_samples): 23 | unique_y = np.unique(dataset.y) 24 | rval = [] 25 | for y in np.unique(dataset.y): 26 | ind = np.where(dataset.y == y)[0] 27 | ind = ind[rng.randint(0, len(ind), n)] 28 | rval.append(dataset.get_topological_view()[ind]) 29 | 30 | return np.concatenate(rval) 31 | 32 | dataset = yaml_parse.load(model.dataset_yaml_src) 33 | dataset = dataset.get_test_set() 34 | data = get_data_samples(dataset) 35 | 36 | output_space = model.generator.get_output_space() 37 | input_space = model.generator.mlp.input_space 38 | 39 | X = input_space.get_theano_batch() 40 | samples, _ = model.generator.inpainting_sample_and_noise(X) 41 | f = theano.function([X], samples) 42 | 43 | samples = [] 44 | for i in xrange(repeat_samples): 45 | samples.append(f(data)) 46 | 47 | samples = np.concatenate(samples) 48 | 49 | is_color = True 50 | 51 | 52 | print (samples.min(), samples.mean(), samples.max()) 53 | # Hack for detecting MNIST [0, 1] values. Otherwise we assume centered images 54 | if samples.min() >0: 55 | samples = samples * 2.0 - 1.0 56 | viewer = make_viewer(samples, grid_shape=grid_shape, is_color=is_color) 57 | viewer.show() 58 | -------------------------------------------------------------------------------- /test_deconv.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script visually test the deconv layer. 3 | Construct an MLP with conv ,and deconv layer, 4 | set their W to same values and show the original 5 | input and the output of the mlp side by side. 6 | They are supposed to look same. 
7 | """ 8 | 9 | 10 | import theano 11 | from adversarial.deconv import Deconv 12 | from pylearn2.datasets.mnist import MNIST 13 | from pylearn2.space import Conv2DSpace 14 | from pylearn2.models.mlp import MLP 15 | from pylearn2.models.maxout import MaxoutConvC01B 16 | from pylearn2.gui import patch_viewer 17 | import ipdb 18 | 19 | 20 | input_space = Conv2DSpace(shape = (28, 28), num_channels=1, axes = ('c', 0, 1, 'b')) 21 | conv = MaxoutConvC01B(layer_name = 'conv', 22 | num_channels = 16, 23 | num_pieces = 1, 24 | kernel_shape = (4, 4), 25 | pool_shape = (1, 1), 26 | pool_stride=(1, 1), 27 | irange = 0.05) 28 | deconv = Deconv(layer_name = 'deconv', 29 | num_channels = 1, 30 | kernel_shape = (4, 4), 31 | irange = 0.05) 32 | 33 | mlp = MLP(input_space =input_space, 34 | layers = [conv, deconv]) 35 | 36 | mlp.layers[1].transformer._filters.set_value(mlp.layers[0].transformer._filters.get_value()) 37 | 38 | x = input_space.get_theano_batch() 39 | out = mlp.fprop(x) 40 | f = theano.function([x], out) 41 | 42 | data = MNIST('test') 43 | data_specs = (input_space, 'features') 44 | iter = data.iterator(mode = 'sequential', batch_size = 2, data_specs = data_specs) 45 | pv = patch_viewer.PatchViewer((10, 10), (28, 28)) 46 | for item in iter: 47 | res = f(item) 48 | pv.add_patch(item[0,:,:,0]) 49 | pv.add_patch(res[0,:,:,0]) 50 | pv.show() 51 | break 52 | 53 | -------------------------------------------------------------------------------- /show_samples.py: -------------------------------------------------------------------------------- 1 | from pylearn2.utils import serial 2 | import sys 3 | _, model_path = sys.argv 4 | model = serial.load(model_path) 5 | from pylearn2.gui.patch_viewer import make_viewer 6 | space = model.generator.get_output_space() 7 | from pylearn2.space import VectorSpace 8 | from pylearn2.config import yaml_parse 9 | import numpy as np 10 | 11 | match_train = True 12 | if match_train: 13 | dataset = yaml_parse.load(model.dataset_yaml_src) 14 | 15 | grid_shape = None 16 | 17 | if isinstance(space, VectorSpace): 18 | # For some reason format_as from VectorSpace is not working right 19 | samples = model.generator.sample(100).eval() 20 | 21 | if match_train: 22 | grid_shape = (10, 20) 23 | matched = np.zeros((samples.shape[0] * 2, samples.shape[1])) 24 | X = dataset.X 25 | for i in xrange(samples.shape[0]): 26 | matched[2 * i, :] = samples[i, :].copy() 27 | dists = np.square(X - samples[i, :]).sum(axis=1) 28 | j = np.argmin(dists) 29 | matched[2 * i + 1, :] = X[j, :] 30 | samples = matched 31 | 32 | is_color = samples.shape[-1] % 3 == 0 and samples.shape[-1] != 48 * 48 33 | else: 34 | total_dimension = space.get_total_dimension() 35 | import numpy as np 36 | num_colors = 1 37 | if total_dimension % 3 == 0: 38 | num_colors = 3 39 | w = int(np.sqrt(total_dimension / num_colors)) 40 | from pylearn2.space import Conv2DSpace 41 | desired_space = Conv2DSpace(shape=[w, w], num_channels=num_colors, axes=('b',0,1,'c')) 42 | samples = space.format_as(batch=model.generator.sample(100), 43 | space=desired_space).eval() 44 | is_color = samples.shape[-1] == 3 45 | print (samples.min(), samples.mean(), samples.max()) 46 | # Hack for detecting MNIST [0, 1] values. 
Otherwise we assume centered images
47 | if samples.min() >0:
48 |     samples = samples * 2.0 - 1.0
49 | viewer = make_viewer(samples, grid_shape=grid_shape, is_color=is_color)
50 | viewer.show()
51 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Generative Adversarial Networks
2 | ===============================
3 | 
4 | This repository contains the code and hyperparameters for the paper:
5 | 
6 | "Generative Adversarial Networks." Ian J. Goodfellow, Jean Pouget-Abadie,
7 | Mehdi Mirza, Bing Xu, David Warde-Farley, Sherjil Ozair, Aaron Courville,
8 | Yoshua Bengio. ArXiv 2014.
9 | 
10 | Please cite this paper if you use the code in this repository as part of
11 | a published research project.
12 | 
13 | We are an academic lab, not a software company, and have no personnel
14 | devoted to documenting and maintaining this research code.
15 | Therefore this code is offered with absolutely no support.
16 | Exact reproduction of the numbers in the paper depends on exact
17 | reproduction of many factors,
18 | including the version of all software dependencies and the choice of
19 | underlying hardware (GPU model, etc.). We used NVIDIA GeForce GTX 580
20 | graphics cards; other hardware will use different tree structures for
21 | summation and incur different rounding error. If you do not reproduce our
22 | setup exactly you should expect to need to re-tune your hyperparameters
23 | slightly for your new setup.
24 | 
25 | Moreover, we have not integrated any unit tests for this code into Theano
26 | or Pylearn2, so subsequent changes to those libraries may break the code
27 | in this repository. If you encounter problems with this code, you should
28 | make sure that you are using the development branch of Pylearn2 and Theano,
29 | and use "git checkout" to go to a commit from approximately June 9, 2014.
30 | 
31 | This code itself requires no installation besides making sure that the
32 | "adversarial" directory is in a directory in your PYTHONPATH. If
33 | installed correctly, 'python -c "import adversarial"' will work. You
34 | must also install Pylearn2 and Pylearn2's dependencies (Theano, numpy,
35 | etc.)
36 | 
37 | parzen_ll.py is the script used to estimate the log likelihood of the
38 | model using the Parzen density technique.
39 | 
40 | Call pylearn2/scripts/train.py on the various yaml files in this repository
41 | to train the model for each dataset reported in the paper. The names of
42 | the *.yaml files are fairly self-explanatory.
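All of the show_samples*.py scripts in this repository follow the same basic
pattern: load the saved model pickle and draw samples from the generator. As a
rough sketch (assuming training with mnist.yaml has produced a pickle such as
mnist.pkl; the actual filename comes from PYLEARN2_TRAIN_FILE_FULL_STEM):

    from pylearn2.utils import serial
    from pylearn2.gui.patch_viewer import make_viewer

    # Load the trained AdversaryPair saved by pylearn2/scripts/train.py
    model = serial.load("mnist.pkl")  # hypothetical path to your saved model
    # Draw 100 samples from the generator as a design matrix (one image per row)
    samples = model.generator.sample(100).eval()
    # MNIST samples lie in [0, 1]; rescale to [-1, 1] before viewing
    make_viewer(samples * 2.0 - 1.0).show()

The dataset-specific scripts (show_samples_mnist_paper.py, the CIFAR-10 scripts,
etc.) build on this by reshaping the samples into image patches and displaying
the nearest training example alongside each group of samples.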
43 | -------------------------------------------------------------------------------- /ll_mnist.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | 4 | from theano import function 5 | from theano import tensor as T 6 | 7 | _, model_path, sigma = sys.argv 8 | from pylearn2.utils import serial 9 | model = serial.load(model_path) 10 | from pylearn2.config import yaml_parse 11 | dataset = yaml_parse.load(model.dataset_yaml_src) 12 | dataset = dataset.get_test_set() 13 | from pylearn2.utils import sharedX 14 | g = model.generator 15 | n = g.get_input_space().get_total_dimension() 16 | X = sharedX(dataset.X) 17 | from theano.sandbox.rng_mrg import MRG_RandomStreams 18 | theano_rng = MRG_RandomStreams(2014 + 6 * 24) 19 | assert False # Aaron says to do valid comparison we need to download the standard binarization, 20 | # and the model should also have been trained on the standard binarization 21 | f = function([], updates=[(X, theano_rng.binomial(p=X, size=X.shape, dtype=X.dtype))]) 22 | f() 23 | m = dataset.X.shape[0] 24 | accumulator = sharedX(np.zeros((m,))) 25 | z_samples = g.get_noise(1) 26 | x_samples = g.mlp.fprop(z_samples) 27 | # x_samples = X 28 | from theano.compat import OrderedDict 29 | updates = OrderedDict() 30 | from theano import shared 31 | num_samples = shared(1) 32 | sigma = sharedX(float(sigma)) 33 | prev = accumulator 34 | from theano.printing import Print 35 | #prev = Print('prev',attrs=['min','max'])(prev) 36 | # E_x log E_z exp(- sum_i softplus( (1 - 2 x_i) A(z)_i) ) 37 | from pylearn2.expr.nnet import arg_of_sigmoid 38 | A = arg_of_sigmoid(x_samples) 39 | cur = - T.nnet.softplus((1. - 2. * X) * A).sum(axis=1) 40 | #cur = Print('cur',attrs=['min','max'])(cur) 41 | ofs = T.maximum(prev, cur) 42 | num_samples_f = T.cast(num_samples, 'float32') 43 | updates[accumulator] = ofs + T.log((num_samples_f * T.exp(prev - ofs) + T.exp(cur - ofs)) / (num_samples_f + 1.)) 44 | updates[num_samples] = num_samples + 1 45 | f = function([], updates=updates) 46 | updates[accumulator] = cur 47 | del updates[num_samples] 48 | first = function([], updates=updates) 49 | avg_ll = accumulator.mean() 50 | 51 | import time 52 | prev_t = time.time() 53 | first() 54 | while True: 55 | v = avg_ll.eval() 56 | i = num_samples.get_value() 57 | if i == 1 or i % 1000 == 0: 58 | now_t = time.time() 59 | print i, v, now_t - prev_t 60 | prev_t = now_t 61 | if np.isnan(v) or np.isinf(v): 62 | break 63 | f() 64 | 65 | # E_x log p(x) 66 | # E_x log int p(x, z) dz 67 | # E_x log int p(z) p(x | z) dz 68 | # E_x log E_z p(x | z) 69 | # E_x log E_z prod_i p(x_i | z) 70 | # E_x log E_z prod_i sigmoid( (2 x_i - 1) A(z)_i) 71 | # E_x log E_z exp(log prod_i sigmoid( (2 x_i - 1) A(z)_i) ) 72 | # E_x log E_z exp(sum_i log sigmoid( (2 x_i - 1) A(z)_i) ) 73 | # E_x log E_z exp(- sum_i softplus( (1 - 2 x_i) A(z)_i) ) 74 | -------------------------------------------------------------------------------- /ll.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | 4 | from theano import function 5 | from theano import tensor as T 6 | 7 | _, model_path, sigma = sys.argv 8 | from pylearn2.utils import serial 9 | model = serial.load(model_path) 10 | from pylearn2.config import yaml_parse 11 | dataset = yaml_parse.load(model.dataset_yaml_src) 12 | dataset = dataset.get_test_set() 13 | from pylearn2.utils import sharedX 14 | g = model.generator 15 | n = 
g.get_input_space().get_total_dimension() 16 | X = sharedX(dataset.X) 17 | m = dataset.X.shape[0] 18 | accumulator = sharedX(np.zeros((m,))) 19 | z_samples = g.get_noise(1) 20 | x_samples = g.mlp.fprop(z_samples) 21 | from theano.compat import OrderedDict 22 | updates = OrderedDict() 23 | from theano import shared 24 | num_samples = shared(1) 25 | sigma = sharedX(float(sigma)) 26 | prev = accumulator 27 | cur = -0.5 * T.sqr(X - x_samples).sum(axis=1) / T.sqr(sigma) 28 | ofs = T.maximum(prev, cur) 29 | num_samples_f = T.cast(num_samples, 'float32') 30 | updates[accumulator] = ofs + T.log(num_samples_f * T.exp(prev - ofs) + T.exp(cur - ofs)) - T.log(num_samples_f + 1.) 31 | updates[num_samples] = num_samples + 1 32 | f = function([], updates=updates) 33 | updates[accumulator] = cur 34 | del updates[num_samples] 35 | first = function([], updates=updates) 36 | avg_ll = accumulator.mean() - 0.5 * X.shape[1] * T.log(2 * np.pi * T.sqr(sigma)) 37 | 38 | import time 39 | prev_t = time.time() 40 | first() 41 | while True: 42 | v = avg_ll.eval() 43 | i = num_samples.get_value() 44 | if i == 1 or i % 1000 == 0: 45 | now_t = time.time() 46 | print i, v, now_t - prev_t 47 | prev_t = now_t 48 | if np.isnan(v) or np.isinf(v): 49 | break 50 | f() 51 | 52 | # log p(x) 53 | # = log int p(z, x) dz 54 | # = log int p(z) p(x |z) dz 55 | # = log E_z p(x|z) 56 | # = log (1/m) sum_z p(x|z) 57 | # = log (1/m) sum_z prod_i sqrt(1/(2 pi sigma^2)) exp( -0.5 (x_i-g(z)_i)^2 / sigma^2) 58 | # = log sqrt(1/(2 pi sigma^2))^d (1/m) sum_z prod_iexp( -0.5 (x_i-g(z)_i)^2 / sigma^2) 59 | # = log sqrt(1/(2 pi sigma^2))^d (1/m) sum_z exp( sum_i -0.5 (x_i-g(z)_i)^2 / sigma^2) 60 | # = log sqrt(1/(2 pi sigma^2))^d + log (1/m) sum_z exp( sum_i -0.5 (x_i-g(z)_i)^2 / sigma^2) 61 | # = 0.5 d log 1/(2 pi sigma^2) + log (1/m) sum_z exp( sum_i -0.5 (x_i-g(z)_i)^2 / sigma^2) 62 | # = -0.5 d log (2 pi sigma^2) + log (1/m) sum_z exp( sum_i -0.5 (x_i-g(z)_i)^2 / sigma^2) 63 | 64 | # log (1/m) sum_j exp(v_j) 65 | # = log (1/m) [exp(v_m) + sum_{j=1}^{m-1} exp(v_j)] 66 | # = log (1/m) [exp(v_m) + (m-1) exp( prev )] 67 | # = log (1/m) [exp(v_m) exp(ofs-ofs) + (m-1) exp( prev ) exp(ofs -ofs)] 68 | # = log (1/m) [exp(v_m- ofs) exp(ofs) + (m-1) exp( prev -ofs) exp(ofs)] 69 | # = log exp(ofs) (1/m) [exp(v_m- ofs) + (m-1) exp( prev -ofs) ] 70 | # = ofs + log (1/m) [exp(v_m- ofs) + (m-1) exp( prev -ofs) ] 71 | # = ofs + log [exp(v_m- ofs) + (m-1) exp( prev -ofs) ] - log m 72 | -------------------------------------------------------------------------------- /mnist.yaml: -------------------------------------------------------------------------------- 1 | !obj:pylearn2.train.Train { 2 | dataset: &train !obj:pylearn2.datasets.mnist.MNIST { 3 | which_set: 'train', 4 | start: 0, 5 | stop: 50000 6 | }, 7 | model: !obj:adversarial.AdversaryPair { 8 | generator: !obj:adversarial.Generator { 9 | noise: 'uniform', 10 | monitor_ll: 1, 11 | mlp: !obj:pylearn2.models.mlp.MLP { 12 | layers: [ 13 | !obj:pylearn2.models.mlp.RectifiedLinear { 14 | layer_name: 'h0', 15 | dim: 1200, 16 | irange: .05, 17 | }, 18 | !obj:pylearn2.models.mlp.RectifiedLinear { 19 | layer_name: 'h1', 20 | dim: 1200, 21 | irange: .05, 22 | }, 23 | !obj:pylearn2.models.mlp.Sigmoid { 24 | init_bias: !obj:pylearn2.models.dbm.init_sigmoid_bias_from_marginals { dataset: *train}, 25 | layer_name: 'y', 26 | irange: .05, 27 | dim: 784 28 | } 29 | ], 30 | nvis: 100, 31 | }}, 32 | discriminator: 33 | !obj:pylearn2.models.mlp.MLP { 34 | layers: [ 35 | !obj:pylearn2.models.maxout.Maxout { 36 | layer_name: 
'h0', 37 | num_units: 240, 38 | num_pieces: 5, 39 | irange: .005, 40 | }, 41 | !obj:pylearn2.models.maxout.Maxout { 42 | layer_name: 'h1', 43 | num_units: 240, 44 | num_pieces: 5, 45 | irange: .005, 46 | }, 47 | !obj:pylearn2.models.mlp.Sigmoid { 48 | layer_name: 'y', 49 | dim: 1, 50 | irange: .005 51 | } 52 | ], 53 | nvis: 784, 54 | }, 55 | }, 56 | algorithm: !obj:pylearn2.training_algorithms.sgd.SGD { 57 | batch_size: 100, 58 | learning_rate: .1, 59 | learning_rule: !obj:pylearn2.training_algorithms.learning_rule.Momentum { 60 | init_momentum: .5, 61 | }, 62 | monitoring_dataset: 63 | { 64 | 'valid' : !obj:pylearn2.datasets.mnist.MNIST { 65 | which_set: 'train', 66 | start: 50000, 67 | stop: 60000 68 | }, 69 | }, 70 | cost: !obj:adversarial.AdversaryCost2 { 71 | scale_grads: 0, 72 | #target_scale: 1., 73 | discriminator_default_input_include_prob: .5, 74 | discriminator_input_include_probs: { 75 | 'h0': .8 76 | }, 77 | discriminator_default_input_scale: 2., 78 | discriminator_input_scales: { 79 | 'h0': 1.25 80 | } 81 | }, 82 | #!obj:pylearn2.costs.mlp.dropout.Dropout { 83 | # input_include_probs: { 'h0' : .8 }, 84 | # input_scales: { 'h0': 1. } 85 | #}, 86 | #termination_criterion: !obj:pylearn2.termination_criteria.MonitorBased { 87 | # channel_name: "valid_y_misclass", 88 | # prop_decrease: 0., 89 | # N: 100 90 | #}, 91 | update_callbacks: !obj:pylearn2.training_algorithms.sgd.ExponentialDecay { 92 | decay_factor: 1.000004, 93 | min_lr: .000001 94 | } 95 | }, 96 | extensions: [ 97 | #!obj:pylearn2.train_extensions.best_params.MonitorBasedSaveBest { 98 | # channel_name: 'valid_y_misclass', 99 | # save_path: "${PYLEARN2_TRAIN_FILE_FULL_STEM}_best.pkl" 100 | #}, 101 | !obj:pylearn2.training_algorithms.learning_rule.MomentumAdjustor { 102 | start: 1, 103 | saturate: 250, 104 | final_momentum: .7 105 | } 106 | ], 107 | save_path: "${PYLEARN2_TRAIN_FILE_FULL_STEM}.pkl", 108 | save_freq: 1 109 | } 110 | -------------------------------------------------------------------------------- /cifar10_fully_connected.yaml: -------------------------------------------------------------------------------- 1 | !obj:pylearn2.train.Train { 2 | dataset: &train !obj:pylearn2.datasets.cifar10.CIFAR10 { 3 | gcn: 55., 4 | which_set: 'train', 5 | start: 0, 6 | stop: 40000 7 | }, 8 | model: !obj:adversarial.AdversaryPair { 9 | generator: !obj:adversarial.Generator { 10 | mlp: !obj:pylearn2.models.mlp.MLP { 11 | layers: [ 12 | !obj:pylearn2.models.mlp.RectifiedLinear { 13 | layer_name: 'gh0', 14 | dim: 8000, 15 | irange: .05, 16 | }, 17 | !obj:pylearn2.models.mlp.Sigmoid { 18 | layer_name: 'h1', 19 | dim: 8000, 20 | irange: .05, 21 | }, 22 | !obj:pylearn2.models.mlp.Linear { 23 | # init_bias: !obj:pylearn2.models.dbm.init_sigmoid_bias_from_marginals { dataset: *train}, 24 | layer_name: 'y', 25 | irange: .5, 26 | dim: 3072 27 | } 28 | ], 29 | nvis: 100, 30 | }}, 31 | discriminator: 32 | !obj:pylearn2.models.mlp.MLP { 33 | layers: [ 34 | !obj:pylearn2.models.maxout.Maxout { 35 | layer_name: 'dh0', 36 | num_units: 1600, 37 | num_pieces: 5, 38 | irange: .005, 39 | }, 40 | !obj:pylearn2.models.maxout.Maxout { 41 | layer_name: 'h1', 42 | num_units: 1600, 43 | num_pieces: 5, 44 | irange: .005, 45 | }, 46 | !obj:pylearn2.models.mlp.Sigmoid { 47 | layer_name: 'y', 48 | dim: 1, 49 | irange: .005 50 | } 51 | ], 52 | nvis: 3072, 53 | }, 54 | }, 55 | algorithm: !obj:pylearn2.training_algorithms.sgd.SGD { 56 | batch_size: 100, 57 | learning_rate: .025, 58 | learning_rule: !obj:pylearn2.training_algorithms.learning_rule.Momentum { 
59 | init_momentum: .5, 60 | }, 61 | monitoring_dataset: 62 | { 63 | #'train' : *train, 64 | 'valid' : !obj:pylearn2.datasets.cifar10.CIFAR10 { 65 | gcn: 55., 66 | which_set: 'train', 67 | start: 40000, 68 | stop: 50000 69 | }, 70 | #'test' : !obj:pylearn2.datasets.cifar10.CIFAR10 { 71 | # which_set: 'test', 72 | # gcn: 55., 73 | # } 74 | }, 75 | cost: !obj:adversarial.AdversaryCost2 { 76 | scale_grads: 0, 77 | #target_scale: .1, 78 | discriminator_default_input_include_prob: .5, 79 | discriminator_input_include_probs: { 80 | 'dh0': .8 81 | }, 82 | discriminator_default_input_scale: 2., 83 | discriminator_input_scales: { 84 | 'dh0': 1.25 85 | } 86 | }, 87 | #!obj:pylearn2.costs.mlp.dropout.Dropout { 88 | # input_include_probs: { 'h0' : .8 }, 89 | # input_scales: { 'h0': 1. } 90 | #}, 91 | #termination_criterion: !obj:pylearn2.termination_criteria.MonitorBased { 92 | # channel_name: "valid_y_misclass", 93 | # prop_decrease: 0., 94 | # N: 100 95 | #}, 96 | update_callbacks: !obj:pylearn2.training_algorithms.sgd.ExponentialDecay { 97 | decay_factor: 1.000004, 98 | min_lr: .000001 99 | } 100 | }, 101 | extensions: [ 102 | #!obj:pylearn2.train_extensions.best_params.MonitorBasedSaveBest { 103 | # channel_name: 'valid_y_misclass', 104 | # save_path: "${PYLEARN2_TRAIN_FILE_FULL_STEM}_best.pkl" 105 | #}, 106 | !obj:pylearn2.training_algorithms.learning_rule.MomentumAdjustor { 107 | start: 1, 108 | saturate: 250, 109 | final_momentum: .7 110 | } 111 | ], 112 | save_path: "${PYLEARN2_TRAIN_FILE_FULL_STEM}.pkl", 113 | save_freq: 1 114 | } 115 | -------------------------------------------------------------------------------- /tfd_pretrain/train.yaml: -------------------------------------------------------------------------------- 1 | !obj:pylearn2.train.Train { 2 | dataset: &train !obj:pylearn2.datasets.tfd.TFD { 3 | which_set: 'unlabeled', 4 | scale: True, 5 | }, 6 | model: !obj:adversarial.AdversaryPair { 7 | generator: !obj:adversarial.Generator { 8 | monitor_ll: 1, 9 | mlp: !obj:adversarial.add_layers { 10 | mlp: !obj:pylearn2.models.mlp.MLP { 11 | layers: [ 12 | !obj:pylearn2.models.mlp.RectifiedLinear { 13 | layer_name: 'h0', 14 | dim: 8000, 15 | irange: .05, 16 | max_col_norm: 1.9365, 17 | }, 18 | !obj:pylearn2.models.mlp.Sigmoid { 19 | layer_name: 'h1', 20 | dim: 100, 21 | irange: .05, 22 | max_col_norm: 1.9365, 23 | init_bias: -2.0, 24 | }, 25 | ], 26 | nvis: 100, 27 | }, 28 | pretrained: "./pretrain.pkl", 29 | } 30 | }, 31 | discriminator: 32 | !obj:pylearn2.models.mlp.MLP { 33 | layers: [ 34 | !obj:pylearn2.models.maxout.Maxout { 35 | #W_lr_scale: .1, 36 | #b_lr_scale: .1, 37 | layer_name: 'h0', 38 | num_units: 1200, 39 | num_pieces: 5, 40 | irange: .005, 41 | max_col_norm: 1.9365, 42 | }, 43 | !obj:pylearn2.models.maxout.Maxout { 44 | #W_lr_scale: .1, 45 | #b_lr_scale: .1, 46 | layer_name: 'h1', 47 | num_units: 1200, 48 | num_pieces: 5, 49 | irange: .005, 50 | max_col_norm: 1.9365, 51 | }, 52 | !obj:pylearn2.models.mlp.Sigmoid { 53 | #W_lr_scale: .1, 54 | #b_lr_scale: .1, 55 | max_col_norm: 1.9365, 56 | layer_name: 'y', 57 | dim: 1, 58 | irange: .005 59 | } 60 | ], 61 | nvis: 2304, 62 | }, 63 | }, 64 | algorithm: !obj:pylearn2.training_algorithms.sgd.SGD { 65 | batch_size: 100, 66 | learning_rate: .05, 67 | learning_rule: !obj:pylearn2.training_algorithms.learning_rule.Momentum { 68 | init_momentum: .5, 69 | }, 70 | monitoring_dataset: 71 | { 72 | # 'train' : *train, 73 | 'valid' : !obj:pylearn2.datasets.tfd.TFD { 74 | which_set: 'valid', 75 | scale: True, 76 | }, 77 | # 'test' : 
!obj:pylearn2.datasets.tfd.TFD { 78 | # which_set: 'test', 79 | # scale: True, 80 | # } 81 | }, 82 | cost: !obj:adversarial.AdversaryCost2 { 83 | scale_grads: 0, 84 | #target_scale: 1., 85 | discriminator_default_input_include_prob: .5, 86 | discriminator_input_include_probs: { 87 | 'h0': .8 88 | }, 89 | discriminator_default_input_scale: 2., 90 | discriminator_input_scales: { 91 | 'h0': 1.25 92 | } 93 | }, 94 | #!obj:pylearn2.costs.mlp.dropout.Dropout { 95 | # input_include_probs: { 'h0' : .8 }, 96 | # input_scales: { 'h0': 1. } 97 | #}, 98 | termination_criterion: !obj:pylearn2.termination_criteria.EpochCounter { 99 | max_epochs: 22 100 | }, 101 | update_callbacks: !obj:pylearn2.training_algorithms.sgd.ExponentialDecay { 102 | decay_factor: 1.000004, 103 | min_lr: .000001 104 | } 105 | }, 106 | extensions: [ 107 | !obj:pylearn2.training_algorithms.learning_rule.MomentumAdjustor { 108 | start: 1, 109 | saturate: 250, 110 | final_momentum: .7 111 | }, 112 | #!obj:pylearn2.train_extensions.best_params.MonitorBasedSaveBest { 113 | # channel_name: 'valid_gen_ll', 114 | # name_base: 'save/train', 115 | # store_best_model: True 116 | #} 117 | 118 | ], 119 | save_path: "${PYLEARN2_TRAIN_FILE_FULL_STEM}.pkl", 120 | save_freq: 1 121 | } 122 | -------------------------------------------------------------------------------- /tfd_pretrain/pretrain.yaml: -------------------------------------------------------------------------------- 1 | !obj:pylearn2.train.Train { 2 | dataset: &train !obj:pylearn2.datasets.tfd.TFD { 3 | which_set: 'unlabeled', 4 | scale: True, 5 | }, 6 | model: !obj:adversarial.AdversaryPair { 7 | generator: !obj:adversarial.Generator { 8 | monitor_ll: 1, 9 | mlp: !obj:pylearn2.models.mlp.MLP { 10 | layers: [ 11 | !obj:pylearn2.models.mlp.RectifiedLinear { 12 | layer_name: 'h0', 13 | dim: 8000, 14 | irange: .05, 15 | max_col_norm: 1.9365, 16 | }, 17 | !obj:pylearn2.models.mlp.Sigmoid { 18 | layer_name: 'h1', 19 | dim: 8000, 20 | irange: .05, 21 | max_col_norm: 1.9365, 22 | init_bias: -2.0, 23 | }, 24 | !obj:pylearn2.models.mlp.Sigmoid { 25 | max_col_norm: 1.9365, 26 | init_bias: !obj:pylearn2.models.dbm.init_sigmoid_bias_from_marginals { dataset: *train}, 27 | layer_name: 'y', 28 | sparse_init: 100, 29 | dim: 2304 30 | } 31 | ], 32 | nvis: 100, 33 | }}, 34 | discriminator: 35 | !obj:pylearn2.models.mlp.MLP { 36 | layers: [ 37 | !obj:pylearn2.models.maxout.Maxout { 38 | #W_lr_scale: .1, 39 | #b_lr_scale: .1, 40 | layer_name: 'h0', 41 | num_units: 1200, 42 | num_pieces: 5, 43 | irange: .005, 44 | max_col_norm: 1.9365, 45 | }, 46 | !obj:pylearn2.models.maxout.Maxout { 47 | #W_lr_scale: .1, 48 | #b_lr_scale: .1, 49 | layer_name: 'h1', 50 | num_units: 1200, 51 | num_pieces: 5, 52 | irange: .005, 53 | max_col_norm: 1.9365, 54 | }, 55 | !obj:pylearn2.models.mlp.Sigmoid { 56 | #W_lr_scale: .1, 57 | #b_lr_scale: .1, 58 | max_col_norm: 1.9365, 59 | layer_name: 'y', 60 | dim: 1, 61 | irange: .005 62 | } 63 | ], 64 | nvis: 2304, 65 | }, 66 | }, 67 | algorithm: !obj:pylearn2.training_algorithms.sgd.SGD { 68 | batch_size: 100, 69 | learning_rate: .05, 70 | learning_rule: !obj:pylearn2.training_algorithms.learning_rule.Momentum { 71 | init_momentum: .5, 72 | }, 73 | monitoring_dataset: 74 | { 75 | # 'train' : *train, 76 | 'valid' : !obj:pylearn2.datasets.tfd.TFD { 77 | which_set: 'valid', 78 | scale: True, 79 | }, 80 | # 'test' : !obj:pylearn2.datasets.tfd.TFD { 81 | # which_set: 'test', 82 | # scale: True, 83 | # } 84 | }, 85 | cost: !obj:adversarial.AdversaryCost2 { 86 | scale_grads: 0, 87 | 
#target_scale: 1., 88 | discriminator_default_input_include_prob: .5, 89 | discriminator_input_include_probs: { 90 | 'h0': .8 91 | }, 92 | discriminator_default_input_scale: 2., 93 | discriminator_input_scales: { 94 | 'h0': 1.25 95 | } 96 | }, 97 | #!obj:pylearn2.costs.mlp.dropout.Dropout { 98 | # input_include_probs: { 'h0' : .8 }, 99 | # input_scales: { 'h0': 1. } 100 | #}, 101 | termination_criterion: !obj:pylearn2.termination_criteria.EpochCounter { 102 | max_epochs: 50 103 | }, 104 | update_callbacks: !obj:pylearn2.training_algorithms.sgd.ExponentialDecay { 105 | decay_factor: 1.000004, 106 | min_lr: .000001 107 | } 108 | }, 109 | extensions: [ 110 | !obj:pylearn2.training_algorithms.learning_rule.MomentumAdjustor { 111 | start: 1, 112 | saturate: 250, 113 | final_momentum: .7 114 | }, 115 | #!obj:pylearn2.train_extensions.best_params.MonitorBasedSaveBest { 116 | # channel_name: 'valid_gen_ll', 117 | # name_base: 'save/pretrain', 118 | # store_best_model: True 119 | #} 120 | ], 121 | save_path: "${PYLEARN2_TRAIN_FILE_FULL_STEM}.pkl", 122 | save_freq: 1 123 | } 124 | -------------------------------------------------------------------------------- /parzen_ll.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import time 3 | import gc 4 | import numpy 5 | import theano 6 | import theano.tensor as T 7 | from pylearn2.utils import serial 8 | from pylearn2.config import yaml_parse 9 | from pylearn2.datasets.mnist import MNIST 10 | from pylearn2.datasets.tfd import TFD 11 | 12 | 13 | 14 | def get_nll(x, parzen, batch_size=10): 15 | """ 16 | Credit: Yann N. Dauphin 17 | """ 18 | 19 | inds = range(x.shape[0]) 20 | n_batches = int(numpy.ceil(float(len(inds)) / batch_size)) 21 | 22 | times = [] 23 | nlls = [] 24 | for i in range(n_batches): 25 | begin = time.time() 26 | nll = parzen(x[inds[i::n_batches]]) 27 | end = time.time() 28 | times.append(end-begin) 29 | nlls.extend(nll) 30 | 31 | if i % 10 == 0: 32 | print i, numpy.mean(times), numpy.mean(nlls) 33 | 34 | return numpy.array(nlls) 35 | 36 | 37 | def log_mean_exp(a): 38 | """ 39 | Credit: Yann N. Dauphin 40 | """ 41 | 42 | max_ = a.max(1) 43 | 44 | return max_ + T.log(T.exp(a - max_.dimshuffle(0, 'x')).mean(1)) 45 | 46 | 47 | def theano_parzen(mu, sigma): 48 | """ 49 | Credit: Yann N. 
Dauphin
50 |     """
51 | 
52 |     x = T.matrix()
53 |     mu = theano.shared(mu)
54 |     a = ( x.dimshuffle(0, 'x', 1) - mu.dimshuffle('x', 0, 1) ) / sigma
55 |     E = log_mean_exp(-0.5*(a**2).sum(2))
56 |     Z = mu.shape[1] * T.log(sigma * numpy.sqrt(numpy.pi * 2))
57 | 
58 |     return theano.function([x], E - Z)
59 | 
60 | 
61 | def cross_validate_sigma(samples, data, sigmas, batch_size):
62 | 
63 |     lls = []
64 |     for sigma in sigmas:
65 |         print sigma
66 |         parzen = theano_parzen(samples, sigma)
67 |         tmp = get_nll(data, parzen, batch_size = batch_size)
68 |         lls.append(numpy.asarray(tmp).mean())
69 |         del parzen
70 |         gc.collect()
71 | 
72 |     ind = numpy.argmax(lls)
73 |     return sigmas[ind]
74 | 
75 | 
76 | def get_valid(ds, limit_size = -1, fold = 0):
77 |     if ds == 'mnist':
78 |         data = MNIST('train', start=50000, stop=60000)
79 |         return data.X[:limit_size]
80 |     elif ds == 'tfd':
81 |         data = TFD('valid', fold = fold, scale=True)
82 |         return data.X
83 |     else:
84 |         raise ValueError("Unknown dataset: {}".format(ds))
85 | 
86 | 
87 | def get_test(ds, test, fold=0):
88 |     if ds == 'mnist':
89 |         return test.get_test_set()
90 |     elif ds == 'tfd':
91 |         return test.get_test_set(fold=fold)
92 |     else:
93 |         raise ValueError("Unknown dataset: {}".format(ds))
94 | 
95 | 
96 | def main():
97 |     parser = argparse.ArgumentParser(description = 'Parzen window, log-likelihood estimator')
98 |     parser.add_argument('-p', '--path', help='model path')
99 |     parser.add_argument('-s', '--sigma', default = None)
100 |     parser.add_argument('-d', '--dataset', choices=['mnist', 'tfd'])
101 |     parser.add_argument('-f', '--fold', default = 0, type=int)
102 |     parser.add_argument('-v', '--valid', default = False, action='store_true')
103 |     parser.add_argument('-n', '--num_samples', default=10000, type=int)
104 |     parser.add_argument('-l', '--limit_size', default=1000, type=int)
105 |     parser.add_argument('-b', '--batch_size', default=100, type=int)
106 |     parser.add_argument('-c', '--cross_val', default=10, type=int,
107 |                         help="Number of cross validation folds")
108 |     parser.add_argument('--sigma_start', default=-1, type=float)
109 |     parser.add_argument('--sigma_end', default=0., type=float)
110 |     args = parser.parse_args()
111 | 
112 |     # load model
113 |     model = serial.load(args.path)
114 |     src = model.dataset_yaml_src
115 |     batch_size = args.batch_size
116 |     model.set_batch_size(batch_size)
117 | 
118 |     # load test set
119 |     test = yaml_parse.load(src)
120 |     test = get_test(args.dataset, test, args.fold)
121 | 
122 |     # generate samples
123 |     samples = model.generator.sample(args.num_samples).eval()
124 |     output_space = model.generator.mlp.get_output_space()
125 |     if 'Conv2D' in str(output_space):
126 |         samples = output_space.convert(samples, output_space.axes, ('b', 0, 1, 'c'))
127 |         samples = samples.reshape((samples.shape[0], numpy.prod(samples.shape[1:])))
128 |     del model
129 |     gc.collect()
130 | 
131 |     # cross validate sigma
132 |     if args.sigma is None:
133 |         valid = get_valid(args.dataset, limit_size = args.limit_size, fold = args.fold)
134 |         sigma_range = numpy.logspace(args.sigma_start, args.sigma_end, num=args.cross_val)
135 |         sigma = cross_validate_sigma(samples, valid, sigma_range, batch_size)
136 |     else:
137 |         sigma = float(args.sigma)
138 | 
139 |     print "Using Sigma: {}".format(sigma)
140 |     gc.collect()
141 | 
142 |     # fit and evaluate
143 |     parzen = theano_parzen(samples, sigma)
144 |     ll = get_nll(test.X, parzen, batch_size = batch_size)
145 |     se = ll.std() / numpy.sqrt(test.X.shape[0])
146 | 
147 |     print "Log-Likelihood of test set = {}, se:
{}".format(ll.mean(), se) 148 | 149 | # valid 150 | if args.valid: 151 | valid = get_valid(args.dataset) 152 | ll = get_nll(valid, parzen, batch_size = batch_size) 153 | se = ll.std() / numpy.sqrt(valid.shape[0]) 154 | print "Log-Likelihood of valid set = {}, se: {}".format(ll.mean(), se) 155 | 156 | 157 | if __name__ == "__main__": 158 | main() 159 | -------------------------------------------------------------------------------- /cifar10_convolutional.yaml: -------------------------------------------------------------------------------- 1 | !obj:pylearn2.train.Train { 2 | dataset: &train !obj:pylearn2.datasets.cifar10.CIFAR10 { 3 | axes: ['c', 0, 1, 'b'], 4 | gcn: 55., 5 | which_set: 'train', 6 | start: 0, 7 | stop: 40000 8 | }, 9 | model: !obj:adversarial.AdversaryPair { 10 | generator: !obj:adversarial.Generator { 11 | mlp: !obj:pylearn2.models.mlp.MLP { 12 | layers: [ 13 | !obj:pylearn2.models.mlp.RectifiedLinear { 14 | layer_name: 'gh0', 15 | dim: 8000, 16 | irange: .05, 17 | #max_col_norm: 1.9365, 18 | }, 19 | !obj:pylearn2.models.mlp.Sigmoid { 20 | layer_name: 'h1', 21 | dim: 8000, 22 | irange: .05, 23 | #max_col_norm: 1.9365, 24 | }, 25 | !obj:pylearn2.models.mlp.SpaceConverter { 26 | layer_name: 'converter', 27 | output_space: !obj:pylearn2.space.Conv2DSpace { 28 | shape: [10, 10], 29 | num_channels: 80, 30 | axes: ['c', 0, 1, 'b'], 31 | }}, 32 | !obj:adversarial.deconv.Deconv { 33 | #W_lr_scale: .05, 34 | #b_lr_scale: .05, 35 | num_channels: 3, 36 | output_stride: [3, 3], 37 | kernel_shape: [5, 5], 38 | pad_out: 0, 39 | #max_kernel_norm: 1.9365, 40 | # init_bias: !obj:pylearn2.models.dbm.init_sigmoid_bias_from_marginals { dataset: *train}, 41 | layer_name: 'y', 42 | irange: .05, 43 | tied_b: 0 44 | }, 45 | ], 46 | nvis: 100, 47 | }}, 48 | discriminator: 49 | !obj:pylearn2.models.mlp.MLP { 50 | layers: [ 51 | !obj:pylearn2.models.maxout.MaxoutConvC01B { 52 | layer_name: 'dh0', 53 | pad: 4, 54 | tied_b: 1, 55 | #W_lr_scale: .05, 56 | #b_lr_scale: .05, 57 | num_channels: 32, 58 | num_pieces: 2, 59 | kernel_shape: [8, 8], 60 | pool_shape: [4, 4], 61 | pool_stride: [2, 2], 62 | irange: .005, 63 | #max_kernel_norm: .9, 64 | partial_sum: 33, 65 | }, 66 | !obj:pylearn2.models.maxout.MaxoutConvC01B { 67 | layer_name: 'h1', 68 | pad: 3, 69 | tied_b: 1, 70 | #W_lr_scale: .05, 71 | #b_lr_scale: .05, 72 | num_channels: 32, # 192 ran out of memory 73 | num_pieces: 2, 74 | kernel_shape: [8, 8], 75 | pool_shape: [4, 4], 76 | pool_stride: [2, 2], 77 | irange: .005, 78 | #max_kernel_norm: 1.9365, 79 | partial_sum: 15, 80 | }, 81 | !obj:pylearn2.models.maxout.MaxoutConvC01B { 82 | pad: 3, 83 | layer_name: 'h2', 84 | tied_b: 1, 85 | #W_lr_scale: .05, 86 | #b_lr_scale: .05, 87 | num_channels: 192, 88 | num_pieces: 2, 89 | kernel_shape: [5, 5], 90 | pool_shape: [2, 2], 91 | pool_stride: [2, 2], 92 | irange: .005, 93 | #max_kernel_norm: 1.9365, 94 | }, 95 | !obj:pylearn2.models.maxout.Maxout { 96 | layer_name: 'h3', 97 | irange: .005, 98 | num_units: 500, 99 | num_pieces: 5, 100 | #max_col_norm: 1.9 101 | }, 102 | !obj:pylearn2.models.mlp.Sigmoid { 103 | #W_lr_scale: .1, 104 | #b_lr_scale: .1, 105 | #max_col_norm: 1.9365, 106 | layer_name: 'y', 107 | dim: 1, 108 | irange: .005 109 | } 110 | ], 111 | input_space: !obj:pylearn2.space.Conv2DSpace { 112 | shape: [32, 32], 113 | num_channels: 3, 114 | axes: ['c', 0, 1, 'b'], 115 | } 116 | }, 117 | }, 118 | algorithm: !obj:pylearn2.training_algorithms.sgd.SGD { 119 | batch_size: 128, 120 | learning_rate: .004, 121 | learning_rule: 
!obj:pylearn2.training_algorithms.learning_rule.Momentum { 122 | init_momentum: .5, 123 | }, 124 | monitoring_dataset: 125 | { 126 | #'train' : *train, 127 | 'valid' : !obj:pylearn2.datasets.cifar10.CIFAR10 { 128 | axes: ['c', 0, 1, 'b'], 129 | gcn: 55., 130 | which_set: 'train', 131 | start: 40000, 132 | stop: 50000 133 | }, 134 | #'test' : !obj:pylearn2.datasets.cifar10.CIFAR10 { 135 | # which_set: 'test', 136 | # gcn: 55., 137 | # } 138 | }, 139 | cost: !obj:adversarial.AdversaryCost2 { 140 | scale_grads: 0, 141 | #target_scale: .1, 142 | discriminator_default_input_include_prob: .5, 143 | discriminator_input_include_probs: { 144 | 'dh0': .8 145 | }, 146 | discriminator_default_input_scale: 2., 147 | discriminator_input_scales: { 148 | 'dh0': 1.25 149 | } 150 | }, 151 | #termination_criterion: !obj:pylearn2.termination_criteria.MonitorBased { 152 | # channel_name: "valid_y_misclass", 153 | # prop_decrease: 0., 154 | # N: 100 155 | #}, 156 | update_callbacks: !obj:pylearn2.training_algorithms.sgd.ExponentialDecay { 157 | decay_factor: 1.000004, 158 | min_lr: .000001 159 | } 160 | }, 161 | extensions: [ 162 | #!obj:pylearn2.train_extensions.best_params.MonitorBasedSaveBest { 163 | # channel_name: 'valid_y_misclass', 164 | # save_path: "${PYLEARN2_TRAIN_FILE_FULL_STEM}_best.pkl" 165 | #}, 166 | !obj:pylearn2.training_algorithms.learning_rule.MomentumAdjustor { 167 | start: 1, 168 | saturate: 250, 169 | final_momentum: .7 170 | } 171 | ], 172 | save_path: "${PYLEARN2_TRAIN_FILE_FULL_STEM}.pkl", 173 | save_freq: 1 174 | } 175 | -------------------------------------------------------------------------------- /deconv.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import logging 3 | import numpy as np 4 | 5 | from theano.compat import OrderedDict 6 | from theano import tensor as T 7 | 8 | from pylearn2.linear.conv2d_c01b import make_random_conv2D 9 | from pylearn2.models import Model 10 | from pylearn2.models.maxout import check_cuda # TODO: import from original path 11 | from pylearn2.models.mlp import Layer 12 | #from pylearn2.models.maxout import py_integer_types # TODO: import from orig path 13 | from pylearn2.space import Conv2DSpace 14 | from pylearn2.utils import sharedX 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | class Deconv(Layer): 19 | def __init__(self, 20 | num_channels, 21 | kernel_shape, 22 | layer_name, 23 | irange=None, 24 | init_bias=0., 25 | W_lr_scale=None, 26 | b_lr_scale=None, 27 | pad_out=0, 28 | fix_kernel_shape=False, 29 | partial_sum=1, 30 | tied_b=False, 31 | max_kernel_norm=None, 32 | output_stride=(1, 1)): 33 | check_cuda(str(type(self))) 34 | super(Deconv, self).__init__() 35 | 36 | detector_channels = num_channels 37 | 38 | self.__dict__.update(locals()) 39 | del self.self 40 | 41 | @functools.wraps(Model.get_lr_scalers) 42 | def get_lr_scalers(self): 43 | 44 | if not hasattr(self, 'W_lr_scale'): 45 | self.W_lr_scale = None 46 | 47 | if not hasattr(self, 'b_lr_scale'): 48 | self.b_lr_scale = None 49 | 50 | rval = OrderedDict() 51 | 52 | if self.W_lr_scale is not None: 53 | W, = self.transformer.get_params() 54 | rval[W] = self.W_lr_scale 55 | 56 | if self.b_lr_scale is not None: 57 | rval[self.b] = self.b_lr_scale 58 | 59 | return rval 60 | 61 | def set_input_space(self, space): 62 | """ 63 | Tells the layer to use the specified input space. 64 | 65 | This resets parameters! The kernel tensor is initialized with the 66 | size needed to receive input from this space. 
67 | 68 | Parameters 69 | ---------- 70 | space : Space 71 | The Space that the input will lie in. 72 | """ 73 | 74 | setup_deconv_detector_layer_c01b(layer=self, 75 | input_space=space, 76 | rng=self.mlp.rng) 77 | 78 | rng = self.mlp.rng 79 | 80 | detector_shape = self.detector_space.shape 81 | 82 | 83 | self.output_space = self.detector_space 84 | 85 | logger.info('Output space: {0}'.format(self.output_space.shape)) 86 | 87 | def _modify_updates(self, updates): 88 | """ 89 | Replaces the values in `updates` if needed to enforce the options set 90 | in the __init__ method, including `max_kernel_norm`. 91 | 92 | Parameters 93 | ---------- 94 | updates : OrderedDict 95 | A dictionary mapping parameters (including parameters not 96 | belonging to this model) to updated values of those parameters. 97 | The dictionary passed in contains the updates proposed by the 98 | learning algorithm. This function modifies the dictionary 99 | directly. The modified version will be compiled and executed 100 | by the learning algorithm. 101 | """ 102 | 103 | if self.max_kernel_norm is not None: 104 | W, = self.transformer.get_params() 105 | if W in updates: 106 | updated_W = updates[W] 107 | row_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=(0, 1, 2))) 108 | desired_norms = T.clip(row_norms, 0, self.max_kernel_norm) 109 | scales = desired_norms / (1e-7 + row_norms) 110 | updates[W] = (updated_W * scales.dimshuffle('x', 'x', 'x', 0)) 111 | 112 | @functools.wraps(Model.get_params) 113 | def get_params(self): 114 | assert self.b.name is not None 115 | W, = self.transformer.get_params() 116 | assert W.name is not None 117 | rval = self.transformer.get_params() 118 | assert not isinstance(rval, set) 119 | rval = list(rval) 120 | assert self.b not in rval 121 | rval.append(self.b) 122 | return rval 123 | 124 | @functools.wraps(Layer.get_weight_decay) 125 | def get_weight_decay(self, coeff): 126 | if isinstance(coeff, str): 127 | coeff = float(coeff) 128 | assert isinstance(coeff, float) or hasattr(coeff, 'dtype') 129 | W, = self.transformer.get_params() 130 | return coeff * T.sqr(W).sum() 131 | 132 | @functools.wraps(Layer.set_weights) 133 | def set_weights(self, weights): 134 | W, = self.transformer.get_params() 135 | W.set_value(weights) 136 | 137 | @functools.wraps(Layer.set_biases) 138 | def set_biases(self, biases): 139 | self.b.set_value(biases) 140 | 141 | @functools.wraps(Layer.get_biases) 142 | def get_biases(self): 143 | return self.b.get_value() 144 | 145 | @functools.wraps(Model.get_weights_topo) 146 | def get_weights_topo(self): 147 | return self.transformer.get_weights_topo() 148 | 149 | @functools.wraps(Layer.get_monitoring_channels) 150 | def get_layer_monitoring_channels(self, state_below=None, state=None, targets=None): 151 | 152 | W, = self.transformer.get_params() 153 | 154 | assert W.ndim == 4 155 | 156 | sq_W = T.sqr(W) 157 | 158 | row_norms = T.sqrt(sq_W.sum(axis=(0, 1, 2))) 159 | 160 | P = state 161 | 162 | rval = OrderedDict() 163 | 164 | vars_and_prefixes = [(P, '')] 165 | 166 | for var, prefix in vars_and_prefixes: 167 | if not hasattr(var, 'ndim') or var.ndim != 4: 168 | print "expected 4D tensor, got " 169 | print var 170 | print type(var) 171 | if isinstance(var, tuple): 172 | print "tuple length: ", len(var) 173 | assert False 174 | v_max = var.max(axis=(1, 2, 3)) 175 | v_min = var.min(axis=(1, 2, 3)) 176 | v_mean = var.mean(axis=(1, 2, 3)) 177 | v_range = v_max - v_min 178 | 179 | # max_x.mean_u is "the mean over *u*nits of the max over 180 | # e*x*amples" The x and u are included in 
the name because 181 | # otherwise its hard to remember which axis is which when reading 182 | # the monitor I use inner.outer rather than outer_of_inner or 183 | # something like that because I want mean_x.* to appear next to 184 | # each other in the alphabetical list, as these are commonly 185 | # plotted together 186 | for key, val in [('max_x.max_u', v_max.max()), 187 | ('max_x.mean_u', v_max.mean()), 188 | ('max_x.min_u', v_max.min()), 189 | ('min_x.max_u', v_min.max()), 190 | ('min_x.mean_u', v_min.mean()), 191 | ('min_x.min_u', v_min.min()), 192 | ('range_x.max_u', v_range.max()), 193 | ('range_x.mean_u', v_range.mean()), 194 | ('range_x.min_u', v_range.min()), 195 | ('mean_x.max_u', v_mean.max()), 196 | ('mean_x.mean_u', v_mean.mean()), 197 | ('mean_x.min_u', v_mean.min())]: 198 | rval[prefix+key] = val 199 | 200 | rval.update(OrderedDict([('kernel_norms_min', row_norms.min()), 201 | ('kernel_norms_mean', row_norms.mean()), 202 | ('kernel_norms_max', row_norms.max()), ])) 203 | 204 | return rval 205 | 206 | @functools.wraps(Layer.fprop) 207 | def fprop(self, state_below): 208 | check_cuda(str(type(self))) 209 | 210 | self.input_space.validate(state_below) 211 | 212 | z = self.transformer.lmul_T(state_below) 213 | 214 | self.output_space.validate(z) 215 | 216 | if not hasattr(self, 'tied_b'): 217 | self.tied_b = False 218 | if self.tied_b: 219 | b = self.b.dimshuffle(0, 'x', 'x', 'x') 220 | else: 221 | b = self.b.dimshuffle(0, 1, 2, 'x') 222 | 223 | return z + b 224 | 225 | 226 | 227 | def setup_deconv_detector_layer_c01b(layer, input_space, rng, irange="not specified"): 228 | """ 229 | layer. This function sets up only the detector layer. 230 | 231 | Does the following: 232 | 233 | * raises a RuntimeError if cuda is not available 234 | * sets layer.input_space to input_space 235 | * sets up addition of dummy channels for compatibility with cuda-convnet: 236 | 237 | - layer.dummy_channels: # of dummy channels that need to be added 238 | (You might want to check this and raise an Exception if it's not 0) 239 | - layer.dummy_space: The Conv2DSpace representing the input with dummy 240 | channels added 241 | 242 | * sets layer.detector_space to the space for the detector layer 243 | * sets layer.transformer to be a Conv2D instance 244 | * sets layer.b to the right value 245 | 246 | Parameters 247 | ---------- 248 | layer : object 249 | Any python object that allows the modifications described below and 250 | has the following attributes: 251 | 252 | * pad : int describing amount of zero padding to add 253 | * kernel_shape : 2-element tuple or list describing spatial shape of 254 | kernel 255 | * fix_kernel_shape : bool, if true, will shrink the kernel shape to 256 | make it feasible, as needed (useful for hyperparameter searchers) 257 | * detector_channels : The number of channels in the detector layer 258 | * init_bias : numeric constant added to a tensor of zeros to 259 | initialize the bias 260 | * tied_b : If true, biases are shared across all spatial locations 261 | input_space : WRITEME 262 | A Conv2DSpace to be used as input to the layer 263 | rng : WRITEME 264 | A numpy RandomState or equivalent 265 | """ 266 | 267 | if irange != "not specified": 268 | raise AssertionError( 269 | "There was a bug in setup_detector_layer_c01b." 270 | "It uses layer.irange instead of the irange parameter to the " 271 | "function. 
The irange parameter is now disabled by this " 272 | "AssertionError, so that this error message can alert you that " 273 | "the bug affected your code and explain why the interface is " 274 | "changing. The irange parameter to the function and this " 275 | "error message may be removed after April 21, 2014." 276 | ) 277 | 278 | # Use "self" to refer to layer from now on, so we can pretend we're 279 | # just running in the set_input_space method of the layer 280 | self = layer 281 | 282 | # Make sure cuda is available 283 | check_cuda(str(type(self))) 284 | 285 | # Validate input 286 | if not isinstance(input_space, Conv2DSpace): 287 | raise TypeError("The input to a convolutional layer should be a " 288 | "Conv2DSpace, but layer " + self.layer_name + " got " + 289 | str(type(self.input_space))) 290 | 291 | if not hasattr(self, 'detector_channels'): 292 | raise ValueError("layer argument must have a 'detector_channels' " 293 | "attribute specifying how many channels to put in " 294 | "the convolution kernel stack.") 295 | 296 | # Store the input space 297 | self.input_space = input_space 298 | 299 | # Make sure number of channels is supported by cuda-convnet 300 | # (multiple of 4 or <= 3) 301 | # If not supported, pad the input with dummy channels 302 | ch = self.detector_channels 303 | rem = ch % 4 304 | if ch > 3 and rem != 0: 305 | raise NotImplementedError("Need to do dummy channels on the output") 306 | # self.dummy_channels = 4 - rem 307 | #else: 308 | # self.dummy_channels = 0 309 | #self.dummy_space = Conv2DSpace( 310 | # shape=input_space.shape, 311 | # channels=input_space.num_channels + self.dummy_channels, 312 | # axes=('c', 0, 1, 'b') 313 | #) 314 | 315 | if hasattr(self, 'output_stride'): 316 | kernel_stride = self.output_stride 317 | else: 318 | assert False # not sure if I got the name right, remove this assert if I did 319 | kernel_stride = [1, 1] 320 | 321 | 322 | #o_sh = int(np.ceil((i_sh + 2. * self.pad - k_sh) / float(k_st))) + 1 323 | #o_sh -1 = np.ceil((i_sh + 2. * self.pad - k_sh) / float(k_st)) 324 | #inv_ceil(o_sh -1) = (i_sh + 2. 
* self.pad - k_sh) / float(k_st) 325 | #float(k_st) inv_cel(o_sh -1) = (i_sh + 2 * self.pad -k_sh) 326 | # i_sh = k_st inv_ceil(o_sh-1) - 2 * self.pad + k_sh 327 | 328 | output_shape = \ 329 | [k_st * (i_sh - 1) - 2 * self.pad_out + k_sh 330 | for i_sh, k_sh, k_st in zip(self.input_space.shape, 331 | self.kernel_shape, kernel_stride)] 332 | 333 | 334 | if self.input_space.num_channels < 16: 335 | raise ValueError("Cuda-convnet requires the input to lmul_T to have " 336 | "at least 16 channels.") 337 | 338 | self.detector_space = Conv2DSpace(shape=output_shape, 339 | num_channels=self.detector_channels, 340 | axes=('c', 0, 1, 'b')) 341 | 342 | if hasattr(self, 'partial_sum'): 343 | partial_sum = self.partial_sum 344 | else: 345 | partial_sum = 1 346 | 347 | if hasattr(self, 'sparse_init') and self.sparse_init is not None: 348 | self.transformer = \ 349 | checked_call(make_sparse_random_conv2D, 350 | OrderedDict([('num_nonzero', self.sparse_init), 351 | ('input_space', self.detector_space), 352 | ('output_space', self.input_space), 353 | ('kernel_shape', self.kernel_shape), 354 | ('pad', self.pad), 355 | ('partial_sum', partial_sum), 356 | ('kernel_stride', kernel_stride), 357 | ('rng', rng)])) 358 | else: 359 | self.transformer = make_random_conv2D( 360 | irange=self.irange, 361 | input_axes=self.detector_space.axes, 362 | output_axes=self.input_space.axes, 363 | input_channels=self.detector_space.num_channels, 364 | output_channels=self.input_space.num_channels, 365 | kernel_shape=self.kernel_shape, 366 | pad=self.pad_out, 367 | partial_sum=partial_sum, 368 | kernel_stride=kernel_stride, 369 | rng=rng, 370 | input_shape=self.detector_space.shape 371 | ) 372 | 373 | W, = self.transformer.get_params() 374 | W.name = self.layer_name + '_W' 375 | 376 | if self.tied_b: 377 | self.b = sharedX(np.zeros(self.detector_space.num_channels) + 378 | self.init_bias) 379 | else: 380 | self.b = sharedX(self.detector_space.get_origin() + self.init_bias) 381 | self.b.name = self.layer_name + '_b' 382 | 383 | logger.info('Input shape: {0}'.format(self.input_space.shape)) 384 | print layer.layer_name + ' detector space: {0}'.format(self.detector_space.shape) 385 | -------------------------------------------------------------------------------- /sgd.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copy of pylearn2's sgd.py, hacked to support doing steps on 3 | discriminator separately from the generator. Ideally this would 4 | be accomplished using pylearn2's FixedVarDescr implementation, 5 | but it is currently not very well supported. 
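
Schematically, setup() compiles two update functions and train() interleaves
them, once per minibatch (a sketch of the inner loop; `k` is the
`discriminator_steps` constructor argument):

    d_func(batch)              # one SGD step on the discriminator parameters
    i += 1
    if i == k:
        g_func(batch)          # then one SGD step on the generator parameters
        i = 0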
6 | """ 7 | from __future__ import division 8 | 9 | __authors__ = "Ian Goodfellow" 10 | __copyright__ = "Copyright 2010-2012, Universite de Montreal" 11 | __credits__ = ["Ian Goodfellow, David Warde-Farley"] 12 | __license__ = "3-clause BSD" 13 | __maintainer__ = "David Warde-Farley" 14 | __email__ = "pylearn-dev@googlegroups" 15 | 16 | import logging 17 | import warnings 18 | import numpy as np 19 | 20 | from theano import config 21 | from theano import function 22 | from theano.compat.python2x import OrderedDict 23 | from theano.gof.op import get_debug_values 24 | 25 | from pylearn2.monitor import Monitor 26 | from pylearn2.space import CompositeSpace, NullSpace 27 | from pylearn2.train_extensions import TrainExtension 28 | from pylearn2.training_algorithms.training_algorithm import TrainingAlgorithm 29 | from pylearn2.training_algorithms.learning_rule import Momentum 30 | from pylearn2.training_algorithms.learning_rule import MomentumAdjustor \ 31 | as LRMomentumAdjustor 32 | from pylearn2.utils.iteration import is_stochastic, has_uniform_batch_size 33 | from pylearn2.utils import py_integer_types, py_float_types 34 | from pylearn2.utils import safe_zip 35 | from pylearn2.utils import serial 36 | from pylearn2.utils import sharedX 37 | from pylearn2.utils.data_specs import DataSpecsMapping 38 | from pylearn2.utils.timing import log_timing 39 | from pylearn2.utils.rng import make_np_rng 40 | 41 | 42 | log = logging.getLogger(__name__) 43 | 44 | 45 | class SGD(TrainingAlgorithm): 46 | """ 47 | SGD = (Minibatch) Stochastic Gradient Descent. 48 | A TrainingAlgorithm that does stochastic gradient descent on minibatches 49 | of training examples. 50 | 51 | For theoretical background on this algorithm, see Yoshua Bengio's machine 52 | learning course notes on the subject: 53 | 54 | http://www.iro.umontreal.ca/~pift6266/H10/notes/gradient.html 55 | 56 | Parameters 57 | ---------- 58 | learning_rate : float 59 | The learning rate to use. Train object callbacks can change the 60 | learning rate after each epoch. SGD update_callbacks can change 61 | it after each minibatch. 62 | cost : pylearn2.costs.cost.Cost, optional 63 | Cost object specifying the objective function to be minimized. 64 | Optionally, may be None. In this case, SGD will call the model's 65 | get_default_cost method to obtain the objective function. 66 | batch_size : int, optional 67 | The size of the batch to be used. 68 | If not specified, the model will be asked for the batch size, so 69 | you must have specified the batch size there. 70 | (Some models are rigidly defined to only work with one batch size) 71 | monitoring_batch_size : int, optional 72 | The size of the monitoring batches. 73 | monitoring_batches : int, optional 74 | At the start of each epoch, we run "monitoring", to evaluate 75 | quantities such as the validation set error. 76 | monitoring_batches, if specified, determines the number of batches 77 | to draw from the iterator for each monitoring dataset. 78 | Unnecessary if not using monitoring or if `monitor_iteration_mode` 79 | is 'sequential' and `batch_size` is specified (number of 80 | batches will be calculated based on full dataset size). 81 | TODO: make it possible to specify different monitoring_batches 82 | for each monitoring dataset. The Monitor itself already supports 83 | this. 84 | monitoring_dataset : Dataset or dictionary, optional 85 | If not specified, no monitoring is used. 86 | If specified to be a Dataset, monitor on that Dataset. 
87 | If specified to be dictionary, the keys should be string names 88 | of datasets, and the values should be Datasets. All monitoring 89 | channels will be computed for all monitoring Datasets and will 90 | have the dataset name and an underscore prepended to them. 91 | monitor_iteration_mode : str, optional 92 | The iteration mode used to iterate over the examples in all 93 | monitoring datasets. If not specified, defaults to 'sequential'. 94 | TODO: make it possible to specify different modes for different 95 | datasets. 96 | termination_criterion : instance of \ 97 | pylearn2.termination_criteria.TerminationCriterion, optional 98 | 99 | Used to determine when the algorithm should stop running. 100 | If not specified, runs forever--or more realistically, until 101 | external factors halt the python process (Kansas 1977). 102 | update_callbacks : list, optional 103 | If specified, each member of the list should be a callable that 104 | accepts an SGD instance as its only argument. 105 | All callbacks will be called with this SGD instance after each 106 | SGD step. 107 | learning_rule : training_algorithms.learning_rule.LearningRule, optional 108 | A learning rule computes the new parameter values given old 109 | parameters and first-order gradients. If learning_rule is None, 110 | sgd.SGD will update parameters according to the standard SGD 111 | learning rule: 112 | 113 | .. code-block:: none 114 | 115 | param := param - learning_rate * d cost / d param 116 | 117 | This argument allows more sophisticated learning rules, such 118 | as SGD with momentum. 119 | init_momentum : float, **DEPRECATED** option 120 | Use learning_rule instead. 121 | If None, does not use momentum otherwise, use momentum and 122 | initialize the momentum coefficient to init_momentum. Callbacks 123 | can change this over time just like the learning rate. If the 124 | gradient is the same on every step, then the update taken by the 125 | SGD algorithm is scaled by a factor of 1/(1-momentum). See 126 | section 9 of Geoffrey Hinton's "A Practical Guide to Training 127 | Restricted Boltzmann Machines" for details. 128 | set_batch_size : bool, optional 129 | Defaults to False. 130 | If True, and batch_size conflicts with model.force_batch_size, 131 | will call model.set_batch_size(batch_size) in an attempt to 132 | change model.force_batch_size 133 | train_iteration_mode : str, optional 134 | Defaults to 'shuffled_sequential'. 135 | The iteration mode to use for iterating through training examples. 136 | batches_per_iter : int, optional 137 | The number of batches to draw from the iterator over training 138 | examples. 139 | If iteration mode is 'sequential' or 'shuffled_sequential', this 140 | is unnecessary; when unspecified we will iterate over all examples. 141 | theano_function_mode : a valid argument to theano.function's \ 142 | 'mode' parameter, optional 143 | 144 | The theano mode to compile the updates function with. Note that 145 | pylearn2 includes some wraplinker modes that are not bundled with 146 | theano. See pylearn2.devtools. These extra modes let you do 147 | things like check for NaNs at every step, or record md5 digests 148 | of all computations performed by the update function to help 149 | isolate problems with nondeterminism. 150 | monitoring_costs : list, optional 151 | a list of Cost instances. The Monitor will also include all 152 | channels defined by these Costs, even though we don't train 153 | using them. 
154 | seed : valid argument to np.random.RandomState, optional 155 | The seed used for the random number generate to be passed to the 156 | training dataset iterator (if any) 157 | """ 158 | def __init__(self, learning_rate, cost=None, batch_size=None, 159 | monitoring_batch_size=None, monitoring_batches=None, 160 | monitoring_dataset=None, monitor_iteration_mode='sequential', 161 | termination_criterion=None, update_callbacks=None, 162 | learning_rule = None, init_momentum = None, 163 | set_batch_size = False, 164 | train_iteration_mode = None, batches_per_iter=None, 165 | theano_function_mode = None, monitoring_costs=None, 166 | seed=[2012, 10, 5], discriminator_steps=1): 167 | self.discriminator_steps = discriminator_steps 168 | 169 | if isinstance(cost, (list, tuple, set)): 170 | raise TypeError("SGD no longer supports using collections of " + 171 | "Costs to represent a sum of Costs. Use " + 172 | "pylearn2.costs.cost.SumOfCosts instead.") 173 | 174 | if init_momentum: 175 | warnings.warn("init_momentum interface is deprecated and will " 176 | "become officially unsuported as of May 9, 2014. Please use the " 177 | "`learning_rule` parameter instead, providing an object of type " 178 | "`pylearn2.training_algorithms.learning_rule.Momentum` instead") 179 | # Convert to new interface under the hood. 180 | self.learning_rule = Momentum(init_momentum) 181 | else: 182 | self.learning_rule = learning_rule 183 | 184 | self.learning_rate = sharedX(learning_rate, 'learning_rate') 185 | self.cost = cost 186 | self.batch_size = batch_size 187 | self.set_batch_size = set_batch_size 188 | self.batches_per_iter = batches_per_iter 189 | self._set_monitoring_dataset(monitoring_dataset) 190 | self.monitoring_batch_size = monitoring_batch_size 191 | self.monitoring_batches = monitoring_batches 192 | self.monitor_iteration_mode = monitor_iteration_mode 193 | if monitoring_dataset is None: 194 | if monitoring_batch_size is not None: 195 | raise ValueError("Specified a monitoring batch size " + 196 | "but not a monitoring dataset.") 197 | if monitoring_batches is not None: 198 | raise ValueError("Specified an amount of monitoring batches " + 199 | "but not a monitoring dataset.") 200 | self.termination_criterion = termination_criterion 201 | self._register_update_callbacks(update_callbacks) 202 | if train_iteration_mode is None: 203 | train_iteration_mode = 'shuffled_sequential' 204 | self.train_iteration_mode = train_iteration_mode 205 | self.first = True 206 | self.rng = make_np_rng(seed, which_method=["randn","randint"]) 207 | self.theano_function_mode = theano_function_mode 208 | self.monitoring_costs = monitoring_costs 209 | 210 | def setup(self, model, dataset): 211 | """ 212 | Compiles the theano functions needed for the train method. 
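        In this copy, two functions are compiled: `d_func`, which updates only
        the discriminator parameters, and `g_func`, which updates only the
        generator parameters; `train` chooses which one to call for each batch.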
213 | 214 | Parameters 215 | ---------- 216 | model : a Model instance 217 | dataset : Dataset 218 | """ 219 | self.i = 0 220 | if self.cost is None: 221 | self.cost = model.get_default_cost() 222 | 223 | inf_params = [param for param in model.get_params() 224 | if np.any(np.isinf(param.get_value()))] 225 | if len(inf_params) > 0: 226 | raise ValueError("These params are Inf: "+str(inf_params)) 227 | if any([np.any(np.isnan(param.get_value())) 228 | for param in model.get_params()]): 229 | nan_params = [param for param in model.get_params() 230 | if np.any(np.isnan(param.get_value()))] 231 | raise ValueError("These params are NaN: "+str(nan_params)) 232 | self.model = model 233 | 234 | self._synchronize_batch_size(model) 235 | model._test_batch_size = self.batch_size 236 | self.monitor = Monitor.get_monitor(model) 237 | self.monitor._sanity_check() 238 | 239 | # test if force batch size and batch size 240 | if getattr(model, "force_batch_size", False) and \ 241 | any(dataset.get_design_matrix().shape[0] % self.batch_size != 0 for 242 | dataset in self.monitoring_dataset.values()) and \ 243 | not has_uniform_batch_size(self.monitor_iteration_mode): 244 | 245 | raise ValueError("Dataset size is not a multiple of batch size." 246 | "You should set monitor_iteration_mode to " 247 | "even_sequential, even_shuffled_sequential or " 248 | "even_batchwise_shuffled_sequential") 249 | 250 | data_specs = self.cost.get_data_specs(self.model) 251 | mapping = DataSpecsMapping(data_specs) 252 | space_tuple = mapping.flatten(data_specs[0], return_tuple=True) 253 | source_tuple = mapping.flatten(data_specs[1], return_tuple=True) 254 | 255 | # Build a flat tuple of Theano Variables, one for each space. 256 | # We want that so that if the same space/source is specified 257 | # more than once in data_specs, only one Theano Variable 258 | # is generated for it, and the corresponding value is passed 259 | # only once to the compiled Theano function. 260 | theano_args = [] 261 | for space, source in safe_zip(space_tuple, source_tuple): 262 | name = '%s[%s]' % (self.__class__.__name__, source) 263 | arg = space.make_theano_batch(name=name, 264 | batch_size=self.batch_size) 265 | theano_args.append(arg) 266 | theano_args = tuple(theano_args) 267 | 268 | # Methods of `self.cost` need args to be passed in a format compatible 269 | # with data_specs 270 | nested_args = mapping.nest(theano_args) 271 | fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args) 272 | self.on_load_batch = fixed_var_descr.on_load_batch 273 | 274 | cost_value = self.cost.expr(model, nested_args, 275 | ** fixed_var_descr.fixed_vars) 276 | 277 | if cost_value is not None and cost_value.name is None: 278 | # Concatenate the name of all tensors in theano_args !? 
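            # Give the otherwise anonymous cost tensor a readable name; this
            # also makes the gradient names built below, e.g.
            # 'grad(objective, <param>)', and Theano error messages easier
            # to follow.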
279 | cost_value.name = 'objective' 280 | 281 | # Set up monitor to model the objective value, learning rate, 282 | # momentum (if applicable), and extra channels defined by 283 | # the cost 284 | learning_rate = self.learning_rate 285 | if self.monitoring_dataset is not None: 286 | if (self.monitoring_batch_size is None and 287 | self.monitoring_batches is None): 288 | self.monitoring_batch_size = self.batch_size 289 | self.monitoring_batches = self.batches_per_iter 290 | self.monitor.setup(dataset=self.monitoring_dataset, 291 | cost=self.cost, 292 | batch_size=self.monitoring_batch_size, 293 | num_batches=self.monitoring_batches, 294 | extra_costs=self.monitoring_costs, 295 | mode=self.monitor_iteration_mode) 296 | dataset_name = self.monitoring_dataset.keys()[0] 297 | monitoring_dataset = self.monitoring_dataset[dataset_name] 298 | #TODO: have Monitor support non-data-dependent channels 299 | self.monitor.add_channel(name='learning_rate', 300 | ipt=None, 301 | val=learning_rate, 302 | data_specs=(NullSpace(), ''), 303 | dataset=monitoring_dataset) 304 | 305 | if self.learning_rule: 306 | self.learning_rule.add_channels_to_monitor( 307 | self.monitor, 308 | monitoring_dataset) 309 | 310 | params = list(model.get_params()) 311 | assert len(params) > 0 312 | for i, param in enumerate(params): 313 | if param.name is None: 314 | param.name = 'sgd_params[%d]' % i 315 | self.params = params 316 | 317 | 318 | grads, updates = self.cost.get_gradients(model, nested_args, 319 | ** fixed_var_descr.fixed_vars) 320 | if not isinstance(grads, OrderedDict): 321 | raise TypeError(str(type(self.cost)) + ".get_gradients returned " + 322 | "something with" + str(type(grads)) + "as its " + 323 | "first member. Expected OrderedDict.") 324 | 325 | for param in grads: 326 | assert param in params 327 | for param in params: 328 | assert param in grads 329 | 330 | lr_scalers = model.get_lr_scalers() 331 | 332 | for key in lr_scalers: 333 | if key not in params: 334 | raise ValueError("Tried to scale the learning rate on " +\ 335 | str(key)+" which is not an optimization parameter.") 336 | 337 | assert len(updates.keys()) == 0 338 | 339 | def get_func(learn_discriminator, learn_generator): 340 | 341 | updates = OrderedDict() 342 | 343 | assert (learn_discriminator or learn_generator) and not (learn_discriminator and learn_generator) 344 | 345 | if learn_discriminator: 346 | cur_params = model.discriminator.get_params() 347 | else: 348 | cur_params = model.generator.get_params() 349 | 350 | cur_grads = OrderedDict() 351 | for param in cur_params: 352 | cur_grads[param] = grads[param] 353 | 354 | for param in grads: 355 | if grads[param].name is None and cost_value is not None: 356 | grads[param].name = ('grad(%(costname)s, %(paramname)s)' % 357 | {'costname': cost_value.name, 358 | 'paramname': param.name}) 359 | assert grads[param].dtype == param.dtype 360 | 361 | cur_lr_scalers = OrderedDict() 362 | for param in cur_params: 363 | if param in lr_scalers: 364 | lr_scaler = lr_scalers[param] 365 | cur_lr_scalers[param] = lr_scaler 366 | 367 | log.info('Parameter and initial learning rate summary:') 368 | for param in cur_params: 369 | param_name = param.name 370 | if param_name is None: 371 | param_name = 'anon_param' 372 | lr = learning_rate.get_value() * cur_lr_scalers.get(param,1.) 
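                    # Effective step size for this parameter: the global
                    # learning rate times any model-supplied lr_scaler
                    # (1 if the model does not provide one).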
373 | log.info('\t' + param_name + ': ' + str(lr)) 374 | 375 | if self.learning_rule: 376 | updates.update(self.learning_rule.get_updates( 377 | learning_rate, cur_grads, cur_lr_scalers)) 378 | else: 379 | # Use standard SGD updates with fixed learning rate. 380 | updates.update( dict(safe_zip(params, [param - learning_rate * \ 381 | lr_scalers.get(param, 1.) * grads[param] 382 | for param in params]))) 383 | 384 | for param in cur_params: 385 | if updates[param].name is None: 386 | updates[param].name = 'sgd_update(' + param.name + ')' 387 | model.modify_updates(updates) 388 | for param in cur_params: 389 | update = updates[param] 390 | if update.name is None: 391 | update.name = 'censor(sgd_update(' + param.name + '))' 392 | for update_val in get_debug_values(update): 393 | if np.any(np.isinf(update_val)): 394 | raise ValueError("debug value of %s contains infs" % 395 | update.name) 396 | if np.any(np.isnan(update_val)): 397 | raise ValueError("debug value of %s contains nans" % 398 | update.name) 399 | 400 | 401 | with log_timing(log, 'Compiling sgd_update'): 402 | return function(theano_args, 403 | updates=updates, 404 | name='sgd_update', 405 | on_unused_input='ignore', 406 | mode=self.theano_function_mode) 407 | self.d_func = get_func(1, 0) 408 | self.g_func = get_func(0, 1) 409 | 410 | def train(self, dataset): 411 | """ 412 | Runs one epoch of SGD training on the specified dataset. 413 | 414 | Parameters 415 | ---------- 416 | dataset : Dataset 417 | """ 418 | if not hasattr(self, 'd_func'): 419 | raise Exception("train called without first calling setup") 420 | 421 | # Make sure none of the parameters have bad values 422 | for param in self.params: 423 | value = param.get_value(borrow=True) 424 | if np.any(np.isnan(value)) or np.any(np.isinf(value)): 425 | raise Exception("NaN in " + param.name) 426 | 427 | self.first = False 428 | rng = self.rng 429 | if not is_stochastic(self.train_iteration_mode): 430 | rng = None 431 | 432 | data_specs = self.cost.get_data_specs(self.model) 433 | 434 | # The iterator should be built from flat data specs, so it returns 435 | # flat, non-redundent tuples of data. 436 | mapping = DataSpecsMapping(data_specs) 437 | space_tuple = mapping.flatten(data_specs[0], return_tuple=True) 438 | source_tuple = mapping.flatten(data_specs[1], return_tuple=True) 439 | if len(space_tuple) == 0: 440 | # No data will be returned by the iterator, and it is impossible 441 | # to know the size of the actual batch. 442 | # It is not decided yet what the right thing to do should be. 443 | raise NotImplementedError("Unable to train with SGD, because " 444 | "the cost does not actually use data from the data set. " 445 | "data_specs: %s" % str(data_specs)) 446 | flat_data_specs = (CompositeSpace(space_tuple), source_tuple) 447 | 448 | iterator = dataset.iterator(mode=self.train_iteration_mode, 449 | batch_size=self.batch_size, 450 | data_specs=flat_data_specs, return_tuple=True, 451 | rng = rng, num_batches = self.batches_per_iter) 452 | 453 | on_load_batch = self.on_load_batch 454 | i = self.i 455 | for batch in iterator: 456 | for callback in on_load_batch: 457 | callback(*batch) 458 | self.d_func(*batch) 459 | i += 1 460 | if i == self.discriminator_steps: 461 | # Generator doesn't actually use the data so we want to 462 | # re-use this batch. Could save memory by making the 463 | # code not expect data in the interface. 
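                # With discriminator_steps == k, the schedule over minibatches
                # is therefore k discriminator updates followed by one
                # generator update on the k-th batch: D ... D G, D ... D G, ...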
464 | self.g_func(*batch) 465 | i = 0 466 | # iterator might return a smaller batch if dataset size 467 | # isn't divisible by batch_size 468 | # Note: if data_specs[0] is a NullSpace, there is no way to know 469 | # how many examples would actually have been in the batch, 470 | # since it was empty, so actual_batch_size would be reported as 0. 471 | actual_batch_size = flat_data_specs[0].np_batch_size(batch) 472 | self.monitor.report_batch(actual_batch_size) 473 | for callback in self.update_callbacks: 474 | callback(self) 475 | 476 | # Make sure none of the parameters have bad values 477 | for param in self.params: 478 | value = param.get_value(borrow=True) 479 | if np.any(np.isnan(value)) or np.any(np.isinf(value)): 480 | raise Exception("NaN in " + param.name) 481 | self.i = i 482 | 483 | def continue_learning(self, model): 484 | """ 485 | Returns True if the algorithm should continue running, or False 486 | if it has reached convergence / started overfitting and should 487 | stop. 488 | 489 | Parameters 490 | ---------- 491 | model : a Model instance 492 | """ 493 | if self.termination_criterion is None: 494 | return True 495 | else: 496 | return self.termination_criterion.continue_learning(self.model) 497 | 498 | class MonitorBasedLRAdjuster(TrainExtension): 499 | """ 500 | A TrainExtension that uses the on_monitor callback to adjust 501 | the learning rate on each epoch. It pulls out a channel 502 | from the model's monitor and adjusts the learning rate 503 | based on what happened to the monitoring channel on the last 504 | epoch. If the channel is greater than high_trigger times 505 | its previous value, the learning rate will be scaled by 506 | shrink_amt (which should be < 1 for this scheme to make 507 | sense). The idea is that in this case the learning algorithm 508 | is overshooting the bottom of the objective function. 509 | 510 | If the objective is less than high_trigger but 511 | greater than low_trigger times its previous value, the 512 | learning rate will be scaled by grow_amt (which should be > 1 513 | for this scheme to make sense). The idea is that the learning 514 | algorithm is making progress but at too slow of a rate. 515 | 516 | Parameters 517 | ---------- 518 | high_trigger : float, optional 519 | See class-level docstring 520 | low_trigger : float, optional 521 | See class-level docstring 522 | grow_amt : float, optional 523 | See class-level docstring 524 | min_lr : float, optional 525 | All updates to the learning rate are clipped to be at least 526 | this value. 527 | max_lr : float, optional 528 | All updates to the learning rate are clipped to be at most 529 | this value. 530 | dataset_name : str, optional 531 | If specified, use dataset_name + "_objective" as the channel 532 | to guide the learning rate adaptation. 533 | channel_name : str, optional 534 | If specified, use channel_name as the channel to guide the 535 | learning rate adaptation. Conflicts with dataset_name. 
536 | If neither dataset_name nor channel_name is specified, uses 537 | "objective" 538 | """ 539 | 540 | def __init__(self, high_trigger=1., shrink_amt=.99, 541 | low_trigger=.99, grow_amt=1.01, 542 | min_lr = 1e-7, max_lr = 1., 543 | dataset_name=None, channel_name=None): 544 | self.high_trigger = high_trigger 545 | self.shrink_amt = shrink_amt 546 | self.low_trigger = low_trigger 547 | self.grow_amt = grow_amt 548 | self.min_lr = min_lr 549 | self.max_lr = max_lr 550 | self.dataset_name = None 551 | if channel_name is not None: 552 | self.channel_name = channel_name 553 | else: 554 | if dataset_name is not None: 555 | self.channel_name = dataset_name + '_objective' 556 | self.dataset_name = dataset_name 557 | else: 558 | self.channel_name = None 559 | 560 | def on_monitor(self, model, dataset, algorithm): 561 | """ 562 | Adjusts the learning rate based on the contents of model.monitor 563 | 564 | Parameters 565 | ---------- 566 | model : a Model instance 567 | dataset : Dataset 568 | algorithm : WRITEME 569 | """ 570 | model = algorithm.model 571 | lr = algorithm.learning_rate 572 | current_learning_rate = lr.get_value() 573 | assert hasattr(model, 'monitor'), ("no monitor associated with " 574 | + str(model)) 575 | monitor = model.monitor 576 | monitor_channel_specified = True 577 | 578 | if self.channel_name is None: 579 | monitor_channel_specified = False 580 | channels = [elem for elem in monitor.channels 581 | if elem.endswith("objective")] 582 | if len(channels) < 1: 583 | raise ValueError("There are no monitoring channels that end " 584 | "with \"objective\". Please specify either " 585 | "channel_name or dataset_name.") 586 | elif len(channels) > 1: 587 | datasets = algorithm.monitoring_dataset.keys() 588 | raise ValueError("There are multiple monitoring channels that" 589 | "end with \"_objective\". The list of available " 590 | "datasets are: " + 591 | str(datasets) + " . Please specify either " 592 | "channel_name or dataset_name in the " 593 | "MonitorBasedLRAdjuster constructor to " 594 | 'disambiguate.') 595 | else: 596 | self.channel_name = channels[0] 597 | warnings.warn('The channel that has been chosen for ' 598 | 'monitoring is: ' + 599 | str(self.channel_name) + '.') 600 | 601 | try: 602 | v = monitor.channels[self.channel_name].val_record 603 | except KeyError: 604 | err_input = '' 605 | if monitor_channel_specified: 606 | if self.dataset_name: 607 | err_input = 'The dataset_name \'' + str( 608 | self.dataset_name) + '\' is not valid.' 609 | else: 610 | err_input = 'The channel_name \'' + str( 611 | self.channel_name) + '\' is not valid.' 612 | err_message = 'There is no monitoring channel named \'' + \ 613 | str(self.channel_name) + '\'. You probably need to ' + \ 614 | 'specify a valid monitoring channel by using either ' + \ 615 | 'dataset_name or channel_name in the ' + \ 616 | 'MonitorBasedLRAdjuster constructor. ' + err_input 617 | raise ValueError(err_message) 618 | 619 | if len(v) < 1: 620 | if monitor.dataset is None: 621 | assert len(v) == 0 622 | raise ValueError("You're trying to use a monitor-based " 623 | "learning rate adjustor but the monitor has no " 624 | "entries because you didn't specify a " 625 | "monitoring dataset.") 626 | 627 | raise ValueError("For some reason there are no monitor entries" 628 | "yet the MonitorBasedLRAdjuster has been " 629 | "called. This should never happen. The Train" 630 | " object should call the monitor once on " 631 | "initialization, then call the callbacks. 
" 632 | "It seems you are either calling the " 633 | "callback manually rather than as part of a " 634 | "training algorithm, or there is a problem " 635 | "with the Train object.") 636 | if len(v) == 1: 637 | #only the initial monitoring has happened 638 | #no learning has happened, so we can't adjust the learning rate yet 639 | #just do nothing 640 | return 641 | 642 | rval = current_learning_rate 643 | 644 | log.info("monitoring channel is {0}".format(self.channel_name)) 645 | 646 | if v[-1] > self.high_trigger * v[-2]: 647 | rval *= self.shrink_amt 648 | log.info("shrinking learning rate to %f" % rval) 649 | elif v[-1] > self.low_trigger * v[-2]: 650 | rval *= self.grow_amt 651 | log.info("growing learning rate to %f" % rval) 652 | 653 | rval = max(self.min_lr, rval) 654 | rval = min(self.max_lr, rval) 655 | 656 | lr.set_value(np.cast[lr.dtype](rval)) 657 | 658 | 659 | class PatienceBasedTermCrit(object): 660 | """ 661 | A monitor-based termination criterion using a geometrically increasing 662 | amount of patience. If the selected channel has decreased by a certain 663 | proportion when comparing to the lowest value seen yet, the patience is 664 | set to a factor of the number of examples seen, which by default 665 | (patience_increase=2.) ensures the model has seen as many examples as the 666 | number of examples that lead to the lowest value before concluding a local 667 | optima has been reached. 668 | 669 | Note: Technically, the patience corresponds to a number of epochs to be 670 | independent of the size of the dataset, so be aware of that when choosing 671 | initial_patience. 672 | 673 | Parameters 674 | ---------- 675 | prop_decrease : float 676 | The factor X in the (1 - X) * best_value threshold 677 | initial_patience : int 678 | Minimal number of epochs the model has to run before it can stop 679 | patience_increase : float, optional 680 | The factor X in the patience = X * n_iter update. 681 | channel_name : string, optional 682 | Name of the channel to examine. If None and the monitor 683 | has only one channel, this channel will be used; otherwise, an 684 | error will be raised. 685 | """ 686 | def __init__(self, prop_decrease, initial_patience, 687 | patience_increase=2., channel_name=None): 688 | self._channel_name = channel_name 689 | self.prop_decrease = prop_decrease 690 | self.patience = initial_patience 691 | self.best_value = np.inf 692 | self.patience_increase = patience_increase 693 | 694 | def __call__(self, model): 695 | """ 696 | Returns True or False depending on whether the optimization should 697 | stop or not. The optimization should stop if it has run for a number 698 | of epochs superior to the patience without any improvement. 699 | 700 | Parameters 701 | ---------- 702 | model : Model 703 | The model used in the experiment and from which the monitor used 704 | in the termination criterion will be extracted. 705 | 706 | Returns 707 | ------- 708 | bool 709 | True or False, indicating if the optimization should stop or not. 710 | """ 711 | monitor = model.monitor 712 | # In the case the monitor has only one channel, the channel_name can 713 | # be omitted and the criterion will examine the only channel 714 | # available. However, if the monitor has multiple channels, leaving 715 | # the channel_name unspecified will raise an error. 
716 | if self._channel_name is None: 717 | if len(monitor.channels) != 1: 718 | raise ValueError("Only single-channel monitors are supported " 719 | "for channel_name == None") 720 | v = monitor.channels.values()[0].val_record 721 | else: 722 | v = monitor.channels[self._channel_name].val_record 723 | # If the channel value decrease is higher than the threshold, we 724 | # update the best value to this value and we update the patience. 725 | if v[-1] < self.best_value * (1. - self.prop_decrease): 726 | # Using the max between actual patience and updated patience 727 | # ensures that the model will run for at least the initial 728 | # patience and that it would behave correctly if the user 729 | # chooses a dumb value (i.e. less than 1) 730 | self.patience = max(self.patience, len(v) * self.patience_increase) 731 | self.best_value = v[-1] 732 | 733 | return len(v) < self.patience 734 | 735 | 736 | class AnnealedLearningRate(object): 737 | """ 738 | This is a callback for the SGD algorithm rather than the Train object. 739 | This anneals the learning rate to decrease as 1/t where t is the number 740 | of gradient descent updates done so far. Use OneOverEpoch as Train object 741 | callback if you would prefer 1/t where t is epochs. 742 | 743 | Parameters 744 | ---------- 745 | anneal_start : int 746 | The epoch on which to begin annealing 747 | """ 748 | def __init__(self, anneal_start): 749 | self._initialized = False 750 | self._count = 0 751 | self._anneal_start = anneal_start 752 | 753 | def __call__(self, algorithm): 754 | """ 755 | Updates the learning rate according to the annealing schedule. 756 | 757 | Parameters 758 | ---------- 759 | algorithm : WRITEME 760 | """ 761 | if not self._initialized: 762 | self._base = algorithm.learning_rate.get_value() 763 | self._count += 1 764 | algorithm.learning_rate.set_value(self.current_learning_rate()) 765 | 766 | def current_learning_rate(self): 767 | """ 768 | Returns the current desired learning rate according to the 769 | annealing schedule. 770 | """ 771 | return self._base * min(1, self._anneal_start / self._count) 772 | 773 | class ExponentialDecay(object): 774 | """ 775 | This is a callback for the `SGD` algorithm rather than the `Train` object. 776 | This anneals the learning rate by dividing by decay_factor after each 777 | gradient descent step. It will not shrink the learning rate beyond 778 | `min_lr`. 779 | 780 | Parameters 781 | ---------- 782 | decay_factor : float 783 | The learning rate at step t is given by 784 | `init_learning_rate / (decay_factor ** t)` 785 | min_lr : float 786 | The learning rate will be clipped to be at least this value 787 | """ 788 | 789 | def __init__(self, decay_factor, min_lr): 790 | if isinstance(decay_factor, str): 791 | decay_factor = float(decay_factor) 792 | if isinstance(min_lr, str): 793 | min_lr = float(min_lr) 794 | assert isinstance(decay_factor, float) 795 | assert isinstance(min_lr, float) 796 | self.__dict__.update(locals()) 797 | del self.self 798 | self._count = 0 799 | self._min_reached = False 800 | 801 | def __call__(self, algorithm): 802 | """ 803 | Updates the learning rate according to the exponential decay schedule. 804 | 805 | Parameters 806 | ---------- 807 | algorithm : SGD 808 | The SGD instance whose `learning_rate` field should be modified. 
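
        Notes
        -----
        As stated in the class docstring, the schedule is
        `lr_t = init_lr / decay_factor ** t`, clipped from below at `min_lr`;
        for example, with a decay_factor of 1.00004 the rate halves roughly
        every 17,000 updates.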
809 | """ 810 | if self._count == 0: 811 | self._base_lr = algorithm.learning_rate.get_value() 812 | self._count += 1 813 | 814 | if not self._min_reached: 815 | # If we keep on executing the exponentiation on each mini-batch, 816 | # we will eventually get an OverflowError. So make sure we 817 | # only do the computation until min_lr is reached. 818 | new_lr = self._base_lr / (self.decay_factor ** self._count) 819 | if new_lr <= self.min_lr: 820 | self._min_reached = True 821 | new_lr = self.min_lr 822 | else: 823 | new_lr = self.min_lr 824 | 825 | new_lr = np.cast[config.floatX](new_lr) 826 | algorithm.learning_rate.set_value(new_lr) 827 | 828 | class LinearDecay(object): 829 | """ 830 | This is a callback for the SGD algorithm rather than the Train object. 831 | This anneals the learning rate to decay_factor times of the initial value 832 | during time start till saturate. 833 | 834 | Parameters 835 | ---------- 836 | start : int 837 | The step at which to start decreasing the learning rate 838 | saturate : int 839 | The step at which to stop decreating the learning rate 840 | decay_factor : float 841 | `final learning rate = decay_factor * initial learning rate` 842 | """ 843 | 844 | def __init__(self, start, saturate, decay_factor): 845 | if isinstance(decay_factor, str): 846 | decay_factor = float(decay_factor) 847 | if isinstance(start, str): 848 | start = float(start) 849 | if isinstance(saturate, str): 850 | saturate = float(saturate) 851 | assert isinstance(decay_factor, float) 852 | assert isinstance(start, (py_integer_types, py_float_types)) 853 | assert isinstance(saturate, (py_integer_types, py_float_types)) 854 | assert saturate > start 855 | assert start > 0 856 | self.__dict__.update(locals()) 857 | del self.self 858 | self._count = 0 859 | 860 | def __call__(self, algorithm): 861 | """ 862 | Adjusts the learning rate according to the linear decay schedule 863 | 864 | Parameters 865 | ---------- 866 | algorithm : WRITEME 867 | """ 868 | if self._count == 0: 869 | self._base_lr = algorithm.learning_rate.get_value() 870 | self._step = ((self._base_lr - self._base_lr * self.decay_factor) / 871 | (self.saturate - self.start + 1)) 872 | self._count += 1 873 | if self._count >= self.start: 874 | if self._count < self.saturate: 875 | new_lr = self._base_lr - self._step * (self._count 876 | - self.start + 1) 877 | else: 878 | new_lr = self._base_lr * self.decay_factor 879 | else: 880 | new_lr = self._base_lr 881 | assert new_lr > 0 882 | new_lr = np.cast[config.floatX](new_lr) 883 | algorithm.learning_rate.set_value(new_lr) 884 | 885 | 886 | def MomentumAdjustor(final_momentum, start, saturate): 887 | """ 888 | Deprecated class used with the deprecated init_momentum argument. 889 | Use learning_rule.MomentumAdjustor instead. 890 | 891 | Parameters 892 | ---------- 893 | final_momentum : WRITEME 894 | start : WRITEME 895 | saturate : WRITEME 896 | """ 897 | warnings.warn("sgd.MomentumAdjustor interface is deprecated and will " 898 | "become officially unsupported as of May 9, 2014. 
Please use " 899 | "`learning_rule.MomentumAdjustor` instead.") 900 | return LRMomentumAdjustor(final_momentum, start, saturate) 901 | 902 | 903 | class OneOverEpoch(TrainExtension): 904 | """ 905 | Scales the learning rate like one over # epochs 906 | 907 | Parameters 908 | ---------- 909 | start : int 910 | The epoch on which to start shrinking the learning rate 911 | half_life : int, optional 912 | How many epochs after start it will take for the learning rate to lose 913 | half its value for the first time (to lose the next half of its value 914 | will take twice as long) 915 | min_lr : float, optional 916 | The minimum value the learning rate can take on 917 | """ 918 | def __init__(self, start, half_life = None, min_lr = 1e-6): 919 | self.__dict__.update(locals()) 920 | del self.self 921 | self._initialized = False 922 | self._count = 0 923 | assert start >= 0 924 | if half_life is None: 925 | self.half_life = start + 1 926 | else: 927 | assert half_life > 0 928 | 929 | def on_monitor(self, model, dataset, algorithm): 930 | """ 931 | Adjusts the learning rate according to the decay schedule. 932 | 933 | Parameters 934 | ---------- 935 | model : a Model instance 936 | dataset : Dataset 937 | algorithm : WRITEME 938 | """ 939 | 940 | if not self._initialized: 941 | self._init_lr = algorithm.learning_rate.get_value() 942 | if self._init_lr < self.min_lr: 943 | raise ValueError("The initial learning rate is smaller than " + 944 | "the minimum allowed learning rate.") 945 | self._initialized = True 946 | self._count += 1 947 | algorithm.learning_rate.set_value(np.cast[config.floatX]( 948 | self.current_lr())) 949 | 950 | def current_lr(self): 951 | """ 952 | Returns the learning rate currently desired by the decay schedule. 953 | """ 954 | if self._count < self.start: 955 | scale = 1 956 | else: 957 | scale = float(self.half_life) / float(self._count - self.start 958 | + self.half_life) 959 | lr = self._init_lr * scale 960 | clipped = max(self.min_lr, lr) 961 | return clipped 962 | 963 | class LinearDecayOverEpoch(TrainExtension): 964 | """ 965 | Scales the learning rate linearly on each epochs 966 | 967 | Parameters 968 | ---------- 969 | start : int 970 | The epoch on which to start shrinking the learning rate 971 | saturate : int 972 | The epoch to saturate the shrinkage 973 | decay_factor : float 974 | The final value would be initial learning rate times decay_factor 975 | """ 976 | 977 | def __init__(self, start, saturate, decay_factor): 978 | self.__dict__.update(locals()) 979 | del self.self 980 | self._initialized = False 981 | self._count = 0 982 | assert isinstance(decay_factor, float) 983 | assert isinstance(start, (py_integer_types, py_float_types)) 984 | assert isinstance(saturate, (py_integer_types, py_float_types)) 985 | assert saturate > start 986 | assert start >= 0 987 | assert saturate >= start 988 | 989 | def on_monitor(self, model, dataset, algorithm): 990 | """ 991 | Updates the learning rate based on the linear decay schedule. 
992 | 993 | Parameters 994 | ---------- 995 | model : a Model instance 996 | dataset : Dataset 997 | algorithm : WRITEME 998 | """ 999 | if not self._initialized: 1000 | self._init_lr = algorithm.learning_rate.get_value() 1001 | self._step = ((self._init_lr - self._init_lr * self.decay_factor) / 1002 | (self.saturate - self.start + 1)) 1003 | self._initialized = True 1004 | self._count += 1 1005 | algorithm.learning_rate.set_value(np.cast[config.floatX]( 1006 | self.current_lr())) 1007 | 1008 | def current_lr(self): 1009 | """ 1010 | Returns the learning rate currently desired by the decay schedule. 1011 | """ 1012 | if self._count >= self.start: 1013 | if self._count < self.saturate: 1014 | new_lr = self._init_lr - self._step * (self._count 1015 | - self.start + 1) 1016 | else: 1017 | new_lr = self._init_lr * self.decay_factor 1018 | else: 1019 | new_lr = self._init_lr 1020 | assert new_lr > 0 1021 | return new_lr 1022 | 1023 | class _PolyakWorker(object): 1024 | """ 1025 | Only to be used by the PolyakAveraging TrainingCallback below. 1026 | Do not use directly. 1027 | A callback for the SGD class. 1028 | 1029 | Parameters 1030 | ---------- 1031 | model : a Model 1032 | The model whose parameters we want to train with Polyak averaging 1033 | """ 1034 | 1035 | def __init__(self, model): 1036 | avg_updates = OrderedDict() 1037 | t = sharedX(1.) 1038 | self.param_to_mean = OrderedDict() 1039 | for param in model.get_params(): 1040 | mean = sharedX(param.get_value()) 1041 | assert type(mean) == type(param) 1042 | self.param_to_mean[param] = mean 1043 | avg_updates[mean] = mean - (mean - param) / t 1044 | avg_updates[t] = t + 1. 1045 | self.avg = function([], updates = avg_updates) 1046 | 1047 | def __call__(self, algorithm): 1048 | """ 1049 | To be called after each SGD step. 1050 | Updates the Polyak averaged-parameters for this model 1051 | 1052 | Parameters 1053 | ---------- 1054 | algorithm : WRITEME 1055 | """ 1056 | self.avg() 1057 | 1058 | class PolyakAveraging(TrainExtension): 1059 | """ 1060 | See "A Tutorial on Stochastic Approximation Algorithms 1061 | for Training Restricted Boltzmann Machines and 1062 | Deep Belief Nets" by Kevin Swersky et al 1063 | 1064 | This functionality is still a work in progress. Currently, 1065 | your model needs to implement "add_polyak_channels" to 1066 | use it. 1067 | 1068 | The problem is that Polyak averaging shouldn't modify 1069 | the model parameters. It should keep a second copy 1070 | that it averages in the background. This second copy 1071 | doesn't get to come back in and affect the learning process 1072 | though. 1073 | 1074 | (IG tried having the second copy get pushed back into 1075 | the model once per epoch, but this turned out to be 1076 | harmful, at least in limited tests) 1077 | 1078 | So we need a cleaner interface for monitoring the 1079 | averaged copy of the parameters, and we need to make 1080 | sure the saved model at the end uses the averaged 1081 | parameters, not the parameters used for computing 1082 | the gradients during training. 1083 | 1084 | TODO: make use of the new on_save callback instead 1085 | of duplicating Train's save_freq flag 1086 | 1087 | Parameters 1088 | ---------- 1089 | start : int 1090 | The epoch after which to start averaging (0 = start averaging 1091 | immediately) 1092 | save_path : str, optional 1093 | WRITEME 1094 | save_freq : int, optional 1095 | WRITEME 1096 | 1097 | Notes 1098 | ----- 1099 | This is usually used with a fixed, rather than annealed learning 1100 | rate. 
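The averaged copy is maintained incrementally by the _PolyakWorker above,
    which applies mean := mean - (mean - param) / t and then increments t, so
    `mean` is the simple average of every parameter value seen since averaging
    began.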
It may be used in conjunction with momentum. 1101 | """ 1102 | 1103 | def __init__(self, start, save_path=None, save_freq=1): 1104 | self.__dict__.update(locals()) 1105 | del self.self 1106 | self._count = 0 1107 | assert isinstance(start, py_integer_types) 1108 | assert start >= 0 1109 | 1110 | def on_monitor(self, model, dataset, algorithm): 1111 | """ 1112 | Make sure Polyak-averaged model gets monitored. 1113 | Save the model if necessary. 1114 | 1115 | Parameters 1116 | ---------- 1117 | model : a Model instance 1118 | dataset : Dataset 1119 | algorithm : WRITEME 1120 | """ 1121 | if self._count == self.start: 1122 | self._worker = _PolyakWorker(model) 1123 | algorithm.update_callbacks.append(self._worker) 1124 | #HACK 1125 | try: 1126 | model.add_polyak_channels(self._worker.param_to_mean, 1127 | algorithm.monitoring_dataset) 1128 | except AttributeError: 1129 | pass 1130 | elif self.save_path is not None and self._count > self.start and \ 1131 | self._count % self.save_freq == 0: 1132 | saved_params = OrderedDict() 1133 | for param in model.get_params(): 1134 | saved_params[param] = param.get_value() 1135 | param.set_value(self._worker.param_to_mean[param].get_value()) 1136 | serial.save(self.save_path, model) 1137 | for param in model.get_params(): 1138 | param.set_value(saved_params[param]) 1139 | self._count += 1 1140 | -------------------------------------------------------------------------------- /sgd_alt.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copy of pylearn2's sgd.py, hacked to support alternating between 3 | epochs of updating only the discriminator and epochs of updating 4 | both discriminator and generator. Ideally this would 5 | be accomplished using pylearn2's FixedVarDescr implementation, 6 | but it is currently not very well supported. 7 | """ 8 | from __future__ import division 9 | 10 | __authors__ = "Ian Goodfellow" 11 | __copyright__ = "Copyright 2010-2012, Universite de Montreal" 12 | __credits__ = ["Ian Goodfellow, David Warde-Farley"] 13 | __license__ = "3-clause BSD" 14 | __maintainer__ = "David Warde-Farley" 15 | __email__ = "pylearn-dev@googlegroups" 16 | 17 | import logging 18 | import warnings 19 | import numpy as np 20 | 21 | from theano import config 22 | from theano import function 23 | from theano.compat.python2x import OrderedDict 24 | from theano.gof.op import get_debug_values 25 | 26 | from pylearn2.monitor import Monitor 27 | from pylearn2.space import CompositeSpace, NullSpace 28 | from pylearn2.train_extensions import TrainExtension 29 | from pylearn2.training_algorithms.training_algorithm import TrainingAlgorithm 30 | from pylearn2.training_algorithms.learning_rule import Momentum 31 | from pylearn2.training_algorithms.learning_rule import MomentumAdjustor \ 32 | as LRMomentumAdjustor 33 | from pylearn2.utils.iteration import is_stochastic, has_uniform_batch_size 34 | from pylearn2.utils import py_integer_types, py_float_types 35 | from pylearn2.utils import safe_zip 36 | from pylearn2.utils import serial 37 | from pylearn2.utils import sharedX 38 | from pylearn2.utils.data_specs import DataSpecsMapping 39 | from pylearn2.utils.timing import log_timing 40 | from pylearn2.utils.rng import make_np_rng 41 | 42 | 43 | log = logging.getLogger(__name__) 44 | 45 | 46 | class SGD(TrainingAlgorithm): 47 | """ 48 | SGD = (Minibatch) Stochastic Gradient Descent. 49 | A TrainingAlgorithm that does stochastic gradient descent on minibatches 50 | of training examples. 
51 | 52 | For theoretical background on this algorithm, see Yoshua Bengio's machine 53 | learning course notes on the subject: 54 | 55 | http://www.iro.umontreal.ca/~pift6266/H10/notes/gradient.html 56 | 57 | Parameters 58 | ---------- 59 | learning_rate : float 60 | The learning rate to use. Train object callbacks can change the 61 | learning rate after each epoch. SGD update_callbacks can change 62 | it after each minibatch. 63 | cost : pylearn2.costs.cost.Cost, optional 64 | Cost object specifying the objective function to be minimized. 65 | Optionally, may be None. In this case, SGD will call the model's 66 | get_default_cost method to obtain the objective function. 67 | batch_size : int, optional 68 | The size of the batch to be used. 69 | If not specified, the model will be asked for the batch size, so 70 | you must have specified the batch size there. 71 | (Some models are rigidly defined to only work with one batch size) 72 | monitoring_batch_size : int, optional 73 | The size of the monitoring batches. 74 | monitoring_batches : int, optional 75 | At the start of each epoch, we run "monitoring", to evaluate 76 | quantities such as the validation set error. 77 | monitoring_batches, if specified, determines the number of batches 78 | to draw from the iterator for each monitoring dataset. 79 | Unnecessary if not using monitoring or if `monitor_iteration_mode` 80 | is 'sequential' and `batch_size` is specified (number of 81 | batches will be calculated based on full dataset size). 82 | TODO: make it possible to specify different monitoring_batches 83 | for each monitoring dataset. The Monitor itself already supports 84 | this. 85 | monitoring_dataset : Dataset or dictionary, optional 86 | If not specified, no monitoring is used. 87 | If specified to be a Dataset, monitor on that Dataset. 88 | If specified to be dictionary, the keys should be string names 89 | of datasets, and the values should be Datasets. All monitoring 90 | channels will be computed for all monitoring Datasets and will 91 | have the dataset name and an underscore prepended to them. 92 | monitor_iteration_mode : str, optional 93 | The iteration mode used to iterate over the examples in all 94 | monitoring datasets. If not specified, defaults to 'sequential'. 95 | TODO: make it possible to specify different modes for different 96 | datasets. 97 | termination_criterion : instance of \ 98 | pylearn2.termination_criteria.TerminationCriterion, optional 99 | 100 | Used to determine when the algorithm should stop running. 101 | If not specified, runs forever--or more realistically, until 102 | external factors halt the python process (Kansas 1977). 103 | update_callbacks : list, optional 104 | If specified, each member of the list should be a callable that 105 | accepts an SGD instance as its only argument. 106 | All callbacks will be called with this SGD instance after each 107 | SGD step. 108 | learning_rule : training_algorithms.learning_rule.LearningRule, optional 109 | A learning rule computes the new parameter values given old 110 | parameters and first-order gradients. If learning_rule is None, 111 | sgd.SGD will update parameters according to the standard SGD 112 | learning rule: 113 | 114 | .. code-block:: none 115 | 116 | param := param - learning_rate * d cost / d param 117 | 118 | This argument allows more sophisticated learning rules, such 119 | as SGD with momentum. 120 | init_momentum : float, **DEPRECATED** option 121 | Use learning_rule instead. 
122 | If None, does not use momentum otherwise, use momentum and 123 | initialize the momentum coefficient to init_momentum. Callbacks 124 | can change this over time just like the learning rate. If the 125 | gradient is the same on every step, then the update taken by the 126 | SGD algorithm is scaled by a factor of 1/(1-momentum). See 127 | section 9 of Geoffrey Hinton's "A Practical Guide to Training 128 | Restricted Boltzmann Machines" for details. 129 | set_batch_size : bool, optional 130 | Defaults to False. 131 | If True, and batch_size conflicts with model.force_batch_size, 132 | will call model.set_batch_size(batch_size) in an attempt to 133 | change model.force_batch_size 134 | train_iteration_mode : str, optional 135 | Defaults to 'shuffled_sequential'. 136 | The iteration mode to use for iterating through training examples. 137 | batches_per_iter : int, optional 138 | The number of batches to draw from the iterator over training 139 | examples. 140 | If iteration mode is 'sequential' or 'shuffled_sequential', this 141 | is unnecessary; when unspecified we will iterate over all examples. 142 | theano_function_mode : a valid argument to theano.function's \ 143 | 'mode' parameter, optional 144 | 145 | The theano mode to compile the updates function with. Note that 146 | pylearn2 includes some wraplinker modes that are not bundled with 147 | theano. See pylearn2.devtools. These extra modes let you do 148 | things like check for NaNs at every step, or record md5 digests 149 | of all computations performed by the update function to help 150 | isolate problems with nondeterminism. 151 | monitoring_costs : list, optional 152 | a list of Cost instances. The Monitor will also include all 153 | channels defined by these Costs, even though we don't train 154 | using them. 155 | seed : valid argument to np.random.RandomState, optional 156 | The seed used for the random number generate to be passed to the 157 | training dataset iterator (if any) 158 | """ 159 | def __init__(self, learning_rate, cost=None, batch_size=None, 160 | monitoring_batch_size=None, monitoring_batches=None, 161 | monitoring_dataset=None, monitor_iteration_mode='sequential', 162 | termination_criterion=None, update_callbacks=None, 163 | learning_rule = None, init_momentum = None, 164 | set_batch_size = False, 165 | train_iteration_mode = None, batches_per_iter=None, 166 | theano_function_mode = None, monitoring_costs=None, 167 | seed=[2012, 10, 5], discriminator_steps=1): 168 | 169 | self.discriminator_steps = discriminator_steps 170 | self.train_generator = 0 171 | 172 | if isinstance(cost, (list, tuple, set)): 173 | raise TypeError("SGD no longer supports using collections of " + 174 | "Costs to represent a sum of Costs. Use " + 175 | "pylearn2.costs.cost.SumOfCosts instead.") 176 | 177 | if init_momentum: 178 | warnings.warn("init_momentum interface is deprecated and will " 179 | "become officially unsuported as of May 9, 2014. Please use the " 180 | "`learning_rule` parameter instead, providing an object of type " 181 | "`pylearn2.training_algorithms.learning_rule.Momentum` instead") 182 | # Convert to new interface under the hood. 
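            # Wrapping the deprecated scalar in a Momentum learning rule lets
            # the rest of the algorithm deal only with the learning_rule
            # interface.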
183 | self.learning_rule = Momentum(init_momentum) 184 | else: 185 | self.learning_rule = learning_rule 186 | 187 | self.learning_rate = sharedX(learning_rate, 'learning_rate') 188 | self.cost = cost 189 | self.batch_size = batch_size 190 | self.set_batch_size = set_batch_size 191 | self.batches_per_iter = batches_per_iter 192 | self._set_monitoring_dataset(monitoring_dataset) 193 | self.monitoring_batch_size = monitoring_batch_size 194 | self.monitoring_batches = monitoring_batches 195 | self.monitor_iteration_mode = monitor_iteration_mode 196 | if monitoring_dataset is None: 197 | if monitoring_batch_size is not None: 198 | raise ValueError("Specified a monitoring batch size " + 199 | "but not a monitoring dataset.") 200 | if monitoring_batches is not None: 201 | raise ValueError("Specified an amount of monitoring batches " + 202 | "but not a monitoring dataset.") 203 | self.termination_criterion = termination_criterion 204 | self._register_update_callbacks(update_callbacks) 205 | if train_iteration_mode is None: 206 | train_iteration_mode = 'shuffled_sequential' 207 | self.train_iteration_mode = train_iteration_mode 208 | self.first = True 209 | self.rng = make_np_rng(seed, which_method=["randn","randint"]) 210 | self.theano_function_mode = theano_function_mode 211 | self.monitoring_costs = monitoring_costs 212 | 213 | def setup(self, model, dataset): 214 | """ 215 | Compiles the theano functions needed for the train method. 216 | 217 | Parameters 218 | ---------- 219 | model : a Model instance 220 | dataset : Dataset 221 | """ 222 | if self.cost is None: 223 | self.cost = model.get_default_cost() 224 | 225 | inf_params = [param for param in model.get_params() 226 | if np.any(np.isinf(param.get_value()))] 227 | if len(inf_params) > 0: 228 | raise ValueError("These params are Inf: "+str(inf_params)) 229 | if any([np.any(np.isnan(param.get_value())) 230 | for param in model.get_params()]): 231 | nan_params = [param for param in model.get_params() 232 | if np.any(np.isnan(param.get_value()))] 233 | raise ValueError("These params are NaN: "+str(nan_params)) 234 | self.model = model 235 | 236 | self._synchronize_batch_size(model) 237 | model._test_batch_size = self.batch_size 238 | self.monitor = Monitor.get_monitor(model) 239 | self.monitor._sanity_check() 240 | 241 | # test if force batch size and batch size 242 | if getattr(model, "force_batch_size", False) and \ 243 | any(dataset.get_design_matrix().shape[0] % self.batch_size != 0 for 244 | dataset in self.monitoring_dataset.values()) and \ 245 | not has_uniform_batch_size(self.monitor_iteration_mode): 246 | 247 | raise ValueError("Dataset size is not a multiple of batch size." 248 | "You should set monitor_iteration_mode to " 249 | "even_sequential, even_shuffled_sequential or " 250 | "even_batchwise_shuffled_sequential") 251 | 252 | data_specs = self.cost.get_data_specs(self.model) 253 | mapping = DataSpecsMapping(data_specs) 254 | space_tuple = mapping.flatten(data_specs[0], return_tuple=True) 255 | source_tuple = mapping.flatten(data_specs[1], return_tuple=True) 256 | 257 | # Build a flat tuple of Theano Variables, one for each space. 258 | # We want that so that if the same space/source is specified 259 | # more than once in data_specs, only one Theano Variable 260 | # is generated for it, and the corresponding value is passed 261 | # only once to the compiled Theano function. 
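        # For example (a sketch): if the cost's data_specs were
        # (CompositeSpace([features_space, targets_space]),
        #  ('features', 'targets')), flatten() would yield the flat tuples
        # (features_space, targets_space) and ('features', 'targets'), and
        # the mapping.nest(theano_args) call below restores the nested
        # structure that self.cost.expr expects.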
262 | theano_args = [] 263 | for space, source in safe_zip(space_tuple, source_tuple): 264 | name = '%s[%s]' % (self.__class__.__name__, source) 265 | arg = space.make_theano_batch(name=name, 266 | batch_size=self.batch_size) 267 | theano_args.append(arg) 268 | theano_args = tuple(theano_args) 269 | 270 | # Methods of `self.cost` need args to be passed in a format compatible 271 | # with data_specs 272 | nested_args = mapping.nest(theano_args) 273 | fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args) 274 | self.on_load_batch = fixed_var_descr.on_load_batch 275 | 276 | cost_value = self.cost.expr(model, nested_args, 277 | ** fixed_var_descr.fixed_vars) 278 | 279 | if cost_value is not None and cost_value.name is None: 280 | # Concatenate the name of all tensors in theano_args !? 281 | cost_value.name = 'objective' 282 | 283 | # Set up monitor to model the objective value, learning rate, 284 | # momentum (if applicable), and extra channels defined by 285 | # the cost 286 | learning_rate = self.learning_rate 287 | if self.monitoring_dataset is not None: 288 | if (self.monitoring_batch_size is None and 289 | self.monitoring_batches is None): 290 | self.monitoring_batch_size = self.batch_size 291 | self.monitoring_batches = self.batches_per_iter 292 | self.monitor.setup(dataset=self.monitoring_dataset, 293 | cost=self.cost, 294 | batch_size=self.monitoring_batch_size, 295 | num_batches=self.monitoring_batches, 296 | extra_costs=self.monitoring_costs, 297 | mode=self.monitor_iteration_mode) 298 | dataset_name = self.monitoring_dataset.keys()[0] 299 | monitoring_dataset = self.monitoring_dataset[dataset_name] 300 | #TODO: have Monitor support non-data-dependent channels 301 | self.monitor.add_channel(name='learning_rate', 302 | ipt=None, 303 | val=learning_rate, 304 | data_specs=(NullSpace(), ''), 305 | dataset=monitoring_dataset) 306 | 307 | if self.learning_rule: 308 | self.learning_rule.add_channels_to_monitor( 309 | self.monitor, 310 | monitoring_dataset) 311 | 312 | params = list(model.get_params()) 313 | assert len(params) > 0 314 | for i, param in enumerate(params): 315 | if param.name is None: 316 | param.name = 'sgd_params[%d]' % i 317 | self.params = params 318 | 319 | 320 | grads, updates = self.cost.get_gradients(model, nested_args, 321 | ** fixed_var_descr.fixed_vars) 322 | if not isinstance(grads, OrderedDict): 323 | raise TypeError(str(type(self.cost)) + ".get_gradients returned " + 324 | "something with" + str(type(grads)) + "as its " + 325 | "first member. 
Expected OrderedDict.") 326 | 327 | for param in grads: 328 | assert param in params 329 | for param in params: 330 | assert param in grads 331 | 332 | lr_scalers = model.get_lr_scalers() 333 | 334 | for key in lr_scalers: 335 | if key not in params: 336 | raise ValueError("Tried to scale the learning rate on " +\ 337 | str(key)+" which is not an optimization parameter.") 338 | 339 | assert len(updates.keys()) == 0 340 | 341 | def get_func(learn_discriminator, learn_generator, dont_you_fucking_dare_touch_the_generator=False): 342 | 343 | updates = OrderedDict() 344 | 345 | assert (learn_discriminator or learn_generator) and not (learn_discriminator and learn_generator) 346 | 347 | if learn_discriminator: 348 | cur_params = model.discriminator.get_params() 349 | else: 350 | cur_params = model.generator.get_params() 351 | 352 | def check(): 353 | for param in params: 354 | if param not in cur_params: 355 | assert param not in updates 356 | 357 | cur_grads = OrderedDict() 358 | for param in cur_params: 359 | cur_grads[param] = grads[param] 360 | 361 | for param in grads: 362 | if grads[param].name is None and cost_value is not None: 363 | grads[param].name = ('grad(%(costname)s, %(paramname)s)' % 364 | {'costname': cost_value.name, 365 | 'paramname': param.name}) 366 | assert grads[param].dtype == param.dtype 367 | 368 | cur_lr_scalers = OrderedDict() 369 | for param in cur_params: 370 | if param in lr_scalers: 371 | lr_scaler = lr_scalers[param] 372 | cur_lr_scalers[param] = lr_scaler 373 | 374 | log.info('Parameter and initial learning rate summary:') 375 | for param in cur_params: 376 | param_name = param.name 377 | if param_name is None: 378 | param_name = 'anon_param' 379 | lr = learning_rate.get_value() * cur_lr_scalers.get(param,1.) 380 | log.info('\t' + param_name + ': ' + str(lr)) 381 | 382 | updates.update(self.learning_rule.get_updates( 383 | learning_rate, cur_grads, cur_lr_scalers)) 384 | check() 385 | 386 | for param in cur_params: 387 | if updates[param].name is None: 388 | updates[param].name = 'sgd_update(' + param.name + ')' 389 | check() 390 | model.modify_updates(updates) 391 | check() 392 | for param in cur_params: 393 | update = updates[param] 394 | if update.name is None: 395 | update.name = 'censor(sgd_update(' + param.name + '))' 396 | for update_val in get_debug_values(update): 397 | if np.any(np.isinf(update_val)): 398 | raise ValueError("debug value of %s contains infs" % 399 | update.name) 400 | if np.any(np.isnan(update_val)): 401 | raise ValueError("debug value of %s contains nans" % 402 | update.name) 403 | 404 | check() 405 | 406 | if dont_you_fucking_dare_touch_the_generator: 407 | for param in model.generator.get_params(): 408 | assert param not in updates 409 | 410 | with log_timing(log, 'Compiling sgd_update'): 411 | return function(theano_args, 412 | updates=updates, 413 | name='sgd_update', 414 | on_unused_input='ignore', 415 | mode=self.theano_function_mode) 416 | self.d_func = get_func(1, 0, dont_you_fucking_dare_touch_the_generator=True) 417 | self.g_func = get_func(0, 1) 418 | 419 | def train(self, dataset): 420 | """ 421 | Runs one epoch of SGD training on the specified dataset. 
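        Unlike the stock pylearn2 SGD, each batch updates only one of the
        two adversaries: while ``self.train_generator`` is set, the
        generator is updated once after every ``discriminator_steps``
        discriminator updates; otherwise the whole epoch trains only the
        discriminator. The flag is flipped at the end of each epoch.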
422 | 423 | Parameters 424 | ---------- 425 | dataset : Dataset 426 | """ 427 | 428 | 429 | if not hasattr(self, 'd_func'): 430 | raise Exception("train called without first calling setup") 431 | 432 | # Make sure none of the parameters have bad values 433 | for param in self.params: 434 | value = param.get_value(borrow=True) 435 | if np.any(np.isnan(value)) or np.any(np.isinf(value)): 436 | raise Exception("NaN in " + param.name) 437 | 438 | self.first = False 439 | rng = self.rng 440 | if not is_stochastic(self.train_iteration_mode): 441 | rng = None 442 | 443 | data_specs = self.cost.get_data_specs(self.model) 444 | 445 | # The iterator should be built from flat data specs, so it returns 446 | # flat, non-redundent tuples of data. 447 | mapping = DataSpecsMapping(data_specs) 448 | space_tuple = mapping.flatten(data_specs[0], return_tuple=True) 449 | source_tuple = mapping.flatten(data_specs[1], return_tuple=True) 450 | if len(space_tuple) == 0: 451 | # No data will be returned by the iterator, and it is impossible 452 | # to know the size of the actual batch. 453 | # It is not decided yet what the right thing to do should be. 454 | raise NotImplementedError("Unable to train with SGD, because " 455 | "the cost does not actually use data from the data set. " 456 | "data_specs: %s" % str(data_specs)) 457 | flat_data_specs = (CompositeSpace(space_tuple), source_tuple) 458 | 459 | iterator = dataset.iterator(mode=self.train_iteration_mode, 460 | batch_size=self.batch_size, 461 | data_specs=flat_data_specs, return_tuple=True, 462 | rng = rng, num_batches = self.batches_per_iter) 463 | 464 | 465 | on_load_batch = self.on_load_batch 466 | i = 0 467 | for batch in iterator: 468 | for callback in on_load_batch: 469 | callback(*batch) 470 | if self.train_generator and i == self.discriminator_steps: 471 | self.g_func(*batch) 472 | i = 0 473 | else: 474 | self.d_func(*batch) 475 | i += 1 476 | # iterator might return a smaller batch if dataset size 477 | # isn't divisible by batch_size 478 | # Note: if data_specs[0] is a NullSpace, there is no way to know 479 | # how many examples would actually have been in the batch, 480 | # since it was empty, so actual_batch_size would be reported as 0. 481 | actual_batch_size = flat_data_specs[0].np_batch_size(batch) 482 | self.monitor.report_batch(actual_batch_size) 483 | for callback in self.update_callbacks: 484 | callback(self) 485 | 486 | 487 | # Make sure none of the parameters have bad values 488 | for param in self.params: 489 | value = param.get_value(borrow=True) 490 | if np.any(np.isnan(value)) or np.any(np.isinf(value)): 491 | raise Exception("NaN in " + param.name) 492 | 493 | self.train_generator = not self.train_generator 494 | 495 | def continue_learning(self, model): 496 | """ 497 | Returns True if the algorithm should continue running, or False 498 | if it has reached convergence / started overfitting and should 499 | stop. 500 | 501 | Parameters 502 | ---------- 503 | model : a Model instance 504 | """ 505 | if self.termination_criterion is None: 506 | return True 507 | else: 508 | return self.termination_criterion.continue_learning(self.model) 509 | 510 | class MonitorBasedLRAdjuster(TrainExtension): 511 | """ 512 | A TrainExtension that uses the on_monitor callback to adjust 513 | the learning rate on each epoch. It pulls out a channel 514 | from the model's monitor and adjusts the learning rate 515 | based on what happened to the monitoring channel on the last 516 | epoch. 
If the channel is greater than high_trigger times 517 | its previous value, the learning rate will be scaled by 518 | shrink_amt (which should be < 1 for this scheme to make 519 | sense). The idea is that in this case the learning algorithm 520 | is overshooting the bottom of the objective function. 521 | 522 | If the objective is less than high_trigger but 523 | greater than low_trigger times its previous value, the 524 | learning rate will be scaled by grow_amt (which should be > 1 525 | for this scheme to make sense). The idea is that the learning 526 | algorithm is making progress but at too slow of a rate. 527 | 528 | Parameters 529 | ---------- 530 | high_trigger : float, optional 531 | See class-level docstring 532 | low_trigger : float, optional 533 | See class-level docstring 534 | grow_amt : float, optional 535 | See class-level docstring 536 | min_lr : float, optional 537 | All updates to the learning rate are clipped to be at least 538 | this value. 539 | max_lr : float, optional 540 | All updates to the learning rate are clipped to be at most 541 | this value. 542 | dataset_name : str, optional 543 | If specified, use dataset_name + "_objective" as the channel 544 | to guide the learning rate adaptation. 545 | channel_name : str, optional 546 | If specified, use channel_name as the channel to guide the 547 | learning rate adaptation. Conflicts with dataset_name. 548 | If neither dataset_name nor channel_name is specified, uses 549 | "objective" 550 | """ 551 | 552 | def __init__(self, high_trigger=1., shrink_amt=.99, 553 | low_trigger=.99, grow_amt=1.01, 554 | min_lr = 1e-7, max_lr = 1., 555 | dataset_name=None, channel_name=None): 556 | self.high_trigger = high_trigger 557 | self.shrink_amt = shrink_amt 558 | self.low_trigger = low_trigger 559 | self.grow_amt = grow_amt 560 | self.min_lr = min_lr 561 | self.max_lr = max_lr 562 | self.dataset_name = None 563 | if channel_name is not None: 564 | self.channel_name = channel_name 565 | else: 566 | if dataset_name is not None: 567 | self.channel_name = dataset_name + '_objective' 568 | self.dataset_name = dataset_name 569 | else: 570 | self.channel_name = None 571 | 572 | def on_monitor(self, model, dataset, algorithm): 573 | """ 574 | Adjusts the learning rate based on the contents of model.monitor 575 | 576 | Parameters 577 | ---------- 578 | model : a Model instance 579 | dataset : Dataset 580 | algorithm : WRITEME 581 | """ 582 | model = algorithm.model 583 | lr = algorithm.learning_rate 584 | current_learning_rate = lr.get_value() 585 | assert hasattr(model, 'monitor'), ("no monitor associated with " 586 | + str(model)) 587 | monitor = model.monitor 588 | monitor_channel_specified = True 589 | 590 | if self.channel_name is None: 591 | monitor_channel_specified = False 592 | channels = [elem for elem in monitor.channels 593 | if elem.endswith("objective")] 594 | if len(channels) < 1: 595 | raise ValueError("There are no monitoring channels that end " 596 | "with \"objective\". Please specify either " 597 | "channel_name or dataset_name.") 598 | elif len(channels) > 1: 599 | datasets = algorithm.monitoring_dataset.keys() 600 | raise ValueError("There are multiple monitoring channels that" 601 | "end with \"_objective\". The list of available " 602 | "datasets are: " + 603 | str(datasets) + " . 
Please specify either " 604 | "channel_name or dataset_name in the " 605 | "MonitorBasedLRAdjuster constructor to " 606 | 'disambiguate.') 607 | else: 608 | self.channel_name = channels[0] 609 | warnings.warn('The channel that has been chosen for ' 610 | 'monitoring is: ' + 611 | str(self.channel_name) + '.') 612 | 613 | try: 614 | v = monitor.channels[self.channel_name].val_record 615 | except KeyError: 616 | err_input = '' 617 | if monitor_channel_specified: 618 | if self.dataset_name: 619 | err_input = 'The dataset_name \'' + str( 620 | self.dataset_name) + '\' is not valid.' 621 | else: 622 | err_input = 'The channel_name \'' + str( 623 | self.channel_name) + '\' is not valid.' 624 | err_message = 'There is no monitoring channel named \'' + \ 625 | str(self.channel_name) + '\'. You probably need to ' + \ 626 | 'specify a valid monitoring channel by using either ' + \ 627 | 'dataset_name or channel_name in the ' + \ 628 | 'MonitorBasedLRAdjuster constructor. ' + err_input 629 | raise ValueError(err_message) 630 | 631 | if len(v) < 1: 632 | if monitor.dataset is None: 633 | assert len(v) == 0 634 | raise ValueError("You're trying to use a monitor-based " 635 | "learning rate adjustor but the monitor has no " 636 | "entries because you didn't specify a " 637 | "monitoring dataset.") 638 | 639 | raise ValueError("For some reason there are no monitor entries" 640 | "yet the MonitorBasedLRAdjuster has been " 641 | "called. This should never happen. The Train" 642 | " object should call the monitor once on " 643 | "initialization, then call the callbacks. " 644 | "It seems you are either calling the " 645 | "callback manually rather than as part of a " 646 | "training algorithm, or there is a problem " 647 | "with the Train object.") 648 | if len(v) == 1: 649 | #only the initial monitoring has happened 650 | #no learning has happened, so we can't adjust the learning rate yet 651 | #just do nothing 652 | return 653 | 654 | rval = current_learning_rate 655 | 656 | log.info("monitoring channel is {0}".format(self.channel_name)) 657 | 658 | if v[-1] > self.high_trigger * v[-2]: 659 | rval *= self.shrink_amt 660 | log.info("shrinking learning rate to %f" % rval) 661 | elif v[-1] > self.low_trigger * v[-2]: 662 | rval *= self.grow_amt 663 | log.info("growing learning rate to %f" % rval) 664 | 665 | rval = max(self.min_lr, rval) 666 | rval = min(self.max_lr, rval) 667 | 668 | lr.set_value(np.cast[lr.dtype](rval)) 669 | 670 | 671 | class PatienceBasedTermCrit(object): 672 | """ 673 | A monitor-based termination criterion using a geometrically increasing 674 | amount of patience. If the selected channel has decreased by a certain 675 | proportion when comparing to the lowest value seen yet, the patience is 676 | set to a factor of the number of examples seen, which by default 677 | (patience_increase=2.) ensures the model has seen as many examples as the 678 | number of examples that lead to the lowest value before concluding a local 679 | optima has been reached. 680 | 681 | Note: Technically, the patience corresponds to a number of epochs to be 682 | independent of the size of the dataset, so be aware of that when choosing 683 | initial_patience. 684 | 685 | Parameters 686 | ---------- 687 | prop_decrease : float 688 | The factor X in the (1 - X) * best_value threshold 689 | initial_patience : int 690 | Minimal number of epochs the model has to run before it can stop 691 | patience_increase : float, optional 692 | The factor X in the patience = X * n_iter update. 
693 | channel_name : string, optional 694 | Name of the channel to examine. If None and the monitor 695 | has only one channel, this channel will be used; otherwise, an 696 | error will be raised. 697 | """ 698 | def __init__(self, prop_decrease, initial_patience, 699 | patience_increase=2., channel_name=None): 700 | self._channel_name = channel_name 701 | self.prop_decrease = prop_decrease 702 | self.patience = initial_patience 703 | self.best_value = np.inf 704 | self.patience_increase = patience_increase 705 | 706 | def __call__(self, model): 707 | """ 708 | Returns True or False depending on whether the optimization should 709 | stop or not. The optimization should stop if it has run for a number 710 | of epochs superior to the patience without any improvement. 711 | 712 | Parameters 713 | ---------- 714 | model : Model 715 | The model used in the experiment and from which the monitor used 716 | in the termination criterion will be extracted. 717 | 718 | Returns 719 | ------- 720 | bool 721 | True or False, indicating if the optimization should stop or not. 722 | """ 723 | monitor = model.monitor 724 | # In the case the monitor has only one channel, the channel_name can 725 | # be omitted and the criterion will examine the only channel 726 | # available. However, if the monitor has multiple channels, leaving 727 | # the channel_name unspecified will raise an error. 728 | if self._channel_name is None: 729 | if len(monitor.channels) != 1: 730 | raise ValueError("Only single-channel monitors are supported " 731 | "for channel_name == None") 732 | v = monitor.channels.values()[0].val_record 733 | else: 734 | v = monitor.channels[self._channel_name].val_record 735 | # If the channel value decrease is higher than the threshold, we 736 | # update the best value to this value and we update the patience. 737 | if v[-1] < self.best_value * (1. - self.prop_decrease): 738 | # Using the max between actual patience and updated patience 739 | # ensures that the model will run for at least the initial 740 | # patience and that it would behave correctly if the user 741 | # chooses a dumb value (i.e. less than 1) 742 | self.patience = max(self.patience, len(v) * self.patience_increase) 743 | self.best_value = v[-1] 744 | 745 | return len(v) < self.patience 746 | 747 | 748 | class AnnealedLearningRate(object): 749 | """ 750 | This is a callback for the SGD algorithm rather than the Train object. 751 | This anneals the learning rate to decrease as 1/t where t is the number 752 | of gradient descent updates done so far. Use OneOverEpoch as Train object 753 | callback if you would prefer 1/t where t is epochs. 754 | 755 | Parameters 756 | ---------- 757 | anneal_start : int 758 | The epoch on which to begin annealing 759 | """ 760 | def __init__(self, anneal_start): 761 | self._initialized = False 762 | self._count = 0 763 | self._anneal_start = anneal_start 764 | 765 | def __call__(self, algorithm): 766 | """ 767 | Updates the learning rate according to the annealing schedule. 768 | 769 | Parameters 770 | ---------- 771 | algorithm : WRITEME 772 | """ 773 | if not self._initialized: 774 | self._base = algorithm.learning_rate.get_value() 775 | self._count += 1 776 | algorithm.learning_rate.set_value(self.current_learning_rate()) 777 | 778 | def current_learning_rate(self): 779 | """ 780 | Returns the current desired learning rate according to the 781 | annealing schedule. 
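        (Concretely: ``base_lr * min(1, anneal_start / t)`` where ``t`` is
        the number of updates taken so far, so the rate is held at its
        initial value for the first ``anneal_start`` updates and decays as
        1/t afterwards.)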
782 | """ 783 | return self._base * min(1, self._anneal_start / self._count) 784 | 785 | class ExponentialDecay(object): 786 | """ 787 | This is a callback for the `SGD` algorithm rather than the `Train` object. 788 | This anneals the learning rate by dividing by decay_factor after each 789 | gradient descent step. It will not shrink the learning rate beyond 790 | `min_lr`. 791 | 792 | Parameters 793 | ---------- 794 | decay_factor : float 795 | The learning rate at step t is given by 796 | `init_learning_rate / (decay_factor ** t)` 797 | min_lr : float 798 | The learning rate will be clipped to be at least this value 799 | """ 800 | 801 | def __init__(self, decay_factor, min_lr): 802 | if isinstance(decay_factor, str): 803 | decay_factor = float(decay_factor) 804 | if isinstance(min_lr, str): 805 | min_lr = float(min_lr) 806 | assert isinstance(decay_factor, float) 807 | assert isinstance(min_lr, float) 808 | self.__dict__.update(locals()) 809 | del self.self 810 | self._count = 0 811 | self._min_reached = False 812 | 813 | def __call__(self, algorithm): 814 | """ 815 | Updates the learning rate according to the exponential decay schedule. 816 | 817 | Parameters 818 | ---------- 819 | algorithm : SGD 820 | The SGD instance whose `learning_rate` field should be modified. 821 | """ 822 | if self._count == 0: 823 | self._base_lr = algorithm.learning_rate.get_value() 824 | self._count += 1 825 | 826 | if not self._min_reached: 827 | # If we keep on executing the exponentiation on each mini-batch, 828 | # we will eventually get an OverflowError. So make sure we 829 | # only do the computation until min_lr is reached. 830 | new_lr = self._base_lr / (self.decay_factor ** self._count) 831 | if new_lr <= self.min_lr: 832 | self._min_reached = True 833 | new_lr = self.min_lr 834 | else: 835 | new_lr = self.min_lr 836 | 837 | new_lr = np.cast[config.floatX](new_lr) 838 | algorithm.learning_rate.set_value(new_lr) 839 | 840 | class LinearDecay(object): 841 | """ 842 | This is a callback for the SGD algorithm rather than the Train object. 843 | This anneals the learning rate to decay_factor times of the initial value 844 | during time start till saturate. 
845 | 846 | Parameters 847 | ---------- 848 | start : int 849 | The step at which to start decreasing the learning rate 850 | saturate : int 851 | The step at which to stop decreating the learning rate 852 | decay_factor : float 853 | `final learning rate = decay_factor * initial learning rate` 854 | """ 855 | 856 | def __init__(self, start, saturate, decay_factor): 857 | if isinstance(decay_factor, str): 858 | decay_factor = float(decay_factor) 859 | if isinstance(start, str): 860 | start = float(start) 861 | if isinstance(saturate, str): 862 | saturate = float(saturate) 863 | assert isinstance(decay_factor, float) 864 | assert isinstance(start, (py_integer_types, py_float_types)) 865 | assert isinstance(saturate, (py_integer_types, py_float_types)) 866 | assert saturate > start 867 | assert start > 0 868 | self.__dict__.update(locals()) 869 | del self.self 870 | self._count = 0 871 | 872 | def __call__(self, algorithm): 873 | """ 874 | Adjusts the learning rate according to the linear decay schedule 875 | 876 | Parameters 877 | ---------- 878 | algorithm : WRITEME 879 | """ 880 | if self._count == 0: 881 | self._base_lr = algorithm.learning_rate.get_value() 882 | self._step = ((self._base_lr - self._base_lr * self.decay_factor) / 883 | (self.saturate - self.start + 1)) 884 | self._count += 1 885 | if self._count >= self.start: 886 | if self._count < self.saturate: 887 | new_lr = self._base_lr - self._step * (self._count 888 | - self.start + 1) 889 | else: 890 | new_lr = self._base_lr * self.decay_factor 891 | else: 892 | new_lr = self._base_lr 893 | assert new_lr > 0 894 | new_lr = np.cast[config.floatX](new_lr) 895 | algorithm.learning_rate.set_value(new_lr) 896 | 897 | 898 | def MomentumAdjustor(final_momentum, start, saturate): 899 | """ 900 | Deprecated class used with the deprecated init_momentum argument. 901 | Use learning_rule.MomentumAdjustor instead. 902 | 903 | Parameters 904 | ---------- 905 | final_momentum : WRITEME 906 | start : WRITEME 907 | saturate : WRITEME 908 | """ 909 | warnings.warn("sgd.MomentumAdjustor interface is deprecated and will " 910 | "become officially unsupported as of May 9, 2014. Please use " 911 | "`learning_rule.MomentumAdjustor` instead.") 912 | return LRMomentumAdjustor(final_momentum, start, saturate) 913 | 914 | 915 | class OneOverEpoch(TrainExtension): 916 | """ 917 | Scales the learning rate like one over # epochs 918 | 919 | Parameters 920 | ---------- 921 | start : int 922 | The epoch on which to start shrinking the learning rate 923 | half_life : int, optional 924 | How many epochs after start it will take for the learning rate to lose 925 | half its value for the first time (to lose the next half of its value 926 | will take twice as long) 927 | min_lr : float, optional 928 | The minimum value the learning rate can take on 929 | """ 930 | def __init__(self, start, half_life = None, min_lr = 1e-6): 931 | self.__dict__.update(locals()) 932 | del self.self 933 | self._initialized = False 934 | self._count = 0 935 | assert start >= 0 936 | if half_life is None: 937 | self.half_life = start + 1 938 | else: 939 | assert half_life > 0 940 | 941 | def on_monitor(self, model, dataset, algorithm): 942 | """ 943 | Adjusts the learning rate according to the decay schedule. 
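        Concretely, once ``start`` epochs have passed the learning rate is
        set to ``init_lr * half_life / (epochs_past_start + half_life)``,
        clipped below at ``min_lr``.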
944 | 945 | Parameters 946 | ---------- 947 | model : a Model instance 948 | dataset : Dataset 949 | algorithm : WRITEME 950 | """ 951 | 952 | if not self._initialized: 953 | self._init_lr = algorithm.learning_rate.get_value() 954 | if self._init_lr < self.min_lr: 955 | raise ValueError("The initial learning rate is smaller than " + 956 | "the minimum allowed learning rate.") 957 | self._initialized = True 958 | self._count += 1 959 | algorithm.learning_rate.set_value(np.cast[config.floatX]( 960 | self.current_lr())) 961 | 962 | def current_lr(self): 963 | """ 964 | Returns the learning rate currently desired by the decay schedule. 965 | """ 966 | if self._count < self.start: 967 | scale = 1 968 | else: 969 | scale = float(self.half_life) / float(self._count - self.start 970 | + self.half_life) 971 | lr = self._init_lr * scale 972 | clipped = max(self.min_lr, lr) 973 | return clipped 974 | 975 | class LinearDecayOverEpoch(TrainExtension): 976 | """ 977 | Scales the learning rate linearly on each epochs 978 | 979 | Parameters 980 | ---------- 981 | start : int 982 | The epoch on which to start shrinking the learning rate 983 | saturate : int 984 | The epoch to saturate the shrinkage 985 | decay_factor : float 986 | The final value would be initial learning rate times decay_factor 987 | """ 988 | 989 | def __init__(self, start, saturate, decay_factor): 990 | self.__dict__.update(locals()) 991 | del self.self 992 | self._initialized = False 993 | self._count = 0 994 | assert isinstance(decay_factor, float) 995 | assert isinstance(start, (py_integer_types, py_float_types)) 996 | assert isinstance(saturate, (py_integer_types, py_float_types)) 997 | assert saturate > start 998 | assert start >= 0 999 | assert saturate >= start 1000 | 1001 | def on_monitor(self, model, dataset, algorithm): 1002 | """ 1003 | Updates the learning rate based on the linear decay schedule. 1004 | 1005 | Parameters 1006 | ---------- 1007 | model : a Model instance 1008 | dataset : Dataset 1009 | algorithm : WRITEME 1010 | """ 1011 | if not self._initialized: 1012 | self._init_lr = algorithm.learning_rate.get_value() 1013 | self._step = ((self._init_lr - self._init_lr * self.decay_factor) / 1014 | (self.saturate - self.start + 1)) 1015 | self._initialized = True 1016 | self._count += 1 1017 | algorithm.learning_rate.set_value(np.cast[config.floatX]( 1018 | self.current_lr())) 1019 | 1020 | def current_lr(self): 1021 | """ 1022 | Returns the learning rate currently desired by the decay schedule. 1023 | """ 1024 | if self._count >= self.start: 1025 | if self._count < self.saturate: 1026 | new_lr = self._init_lr - self._step * (self._count 1027 | - self.start + 1) 1028 | else: 1029 | new_lr = self._init_lr * self.decay_factor 1030 | else: 1031 | new_lr = self._init_lr 1032 | assert new_lr > 0 1033 | return new_lr 1034 | 1035 | class _PolyakWorker(object): 1036 | """ 1037 | Only to be used by the PolyakAveraging TrainingCallback below. 1038 | Do not use directly. 1039 | A callback for the SGD class. 1040 | 1041 | Parameters 1042 | ---------- 1043 | model : a Model 1044 | The model whose parameters we want to train with Polyak averaging 1045 | """ 1046 | 1047 | def __init__(self, model): 1048 | avg_updates = OrderedDict() 1049 | t = sharedX(1.) 
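        # The update below, mean <- mean - (mean - param) / t with t
        # incremented on every call, keeps `mean` equal to the running
        # arithmetic average of all values `param` has taken so far.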
1050 | self.param_to_mean = OrderedDict() 1051 | for param in model.get_params(): 1052 | mean = sharedX(param.get_value()) 1053 | assert type(mean) == type(param) 1054 | self.param_to_mean[param] = mean 1055 | avg_updates[mean] = mean - (mean - param) / t 1056 | avg_updates[t] = t + 1. 1057 | self.avg = function([], updates = avg_updates) 1058 | 1059 | def __call__(self, algorithm): 1060 | """ 1061 | To be called after each SGD step. 1062 | Updates the Polyak averaged-parameters for this model 1063 | 1064 | Parameters 1065 | ---------- 1066 | algorithm : WRITEME 1067 | """ 1068 | self.avg() 1069 | 1070 | class PolyakAveraging(TrainExtension): 1071 | """ 1072 | See "A Tutorial on Stochastic Approximation Algorithms 1073 | for Training Restricted Boltzmann Machines and 1074 | Deep Belief Nets" by Kevin Swersky et al 1075 | 1076 | This functionality is still a work in progress. Currently, 1077 | your model needs to implement "add_polyak_channels" to 1078 | use it. 1079 | 1080 | The problem is that Polyak averaging shouldn't modify 1081 | the model parameters. It should keep a second copy 1082 | that it averages in the background. This second copy 1083 | doesn't get to come back in and affect the learning process 1084 | though. 1085 | 1086 | (IG tried having the second copy get pushed back into 1087 | the model once per epoch, but this turned out to be 1088 | harmful, at least in limited tests) 1089 | 1090 | So we need a cleaner interface for monitoring the 1091 | averaged copy of the parameters, and we need to make 1092 | sure the saved model at the end uses the averaged 1093 | parameters, not the parameters used for computing 1094 | the gradients during training. 1095 | 1096 | TODO: make use of the new on_save callback instead 1097 | of duplicating Train's save_freq flag 1098 | 1099 | Parameters 1100 | ---------- 1101 | start : int 1102 | The epoch after which to start averaging (0 = start averaging 1103 | immediately) 1104 | save_path : str, optional 1105 | WRITEME 1106 | save_freq : int, optional 1107 | WRITEME 1108 | 1109 | Notes 1110 | ----- 1111 | This is usually used with a fixed, rather than annealed learning 1112 | rate. It may be used in conjunction with momentum. 1113 | """ 1114 | 1115 | def __init__(self, start, save_path=None, save_freq=1): 1116 | self.__dict__.update(locals()) 1117 | del self.self 1118 | self._count = 0 1119 | assert isinstance(start, py_integer_types) 1120 | assert start >= 0 1121 | 1122 | def on_monitor(self, model, dataset, algorithm): 1123 | """ 1124 | Make sure Polyak-averaged model gets monitored. 1125 | Save the model if necessary. 
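        On epoch ``start`` the averaging worker is attached to the
        algorithm's update callbacks; on later epochs, every ``save_freq``
        epochs the averaged parameters are temporarily swapped into the
        model so the file written to ``save_path`` contains the averaged
        weights rather than the raw training weights.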
1126 | 1127 | Parameters 1128 | ---------- 1129 | model : a Model instance 1130 | dataset : Dataset 1131 | algorithm : WRITEME 1132 | """ 1133 | if self._count == self.start: 1134 | self._worker = _PolyakWorker(model) 1135 | algorithm.update_callbacks.append(self._worker) 1136 | #HACK 1137 | try: 1138 | model.add_polyak_channels(self._worker.param_to_mean, 1139 | algorithm.monitoring_dataset) 1140 | except AttributeError: 1141 | pass 1142 | elif self.save_path is not None and self._count > self.start and \ 1143 | self._count % self.save_freq == 0: 1144 | saved_params = OrderedDict() 1145 | for param in model.get_params(): 1146 | saved_params[param] = param.get_value() 1147 | param.set_value(self._worker.param_to_mean[param].get_value()) 1148 | serial.save(self.save_path, model) 1149 | for param in model.get_params(): 1150 | param.set_value(saved_params[param]) 1151 | self._count += 1 1152 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code for "Generative Adversarial Networks". Please cite the ArXiv paper in 3 | any published research work making use of this code. 4 | """ 5 | import functools 6 | wraps = functools.wraps 7 | import itertools 8 | import numpy 9 | np = numpy 10 | import theano 11 | import warnings 12 | 13 | from theano.compat import OrderedDict 14 | from theano.sandbox.rng_mrg import MRG_RandomStreams 15 | from theano import tensor as T 16 | 17 | from pylearn2.space import VectorSpace 18 | from pylearn2.costs.cost import Cost 19 | from pylearn2.costs.cost import DefaultDataSpecsMixin 20 | from pylearn2.models.mlp import Layer 21 | from pylearn2.models.mlp import Linear 22 | from pylearn2.models import Model 23 | from pylearn2.space import CompositeSpace 24 | from pylearn2.train_extensions import TrainExtension 25 | from pylearn2.utils import block_gradient 26 | from pylearn2.utils import safe_zip 27 | from pylearn2.utils import serial 28 | from pylearn2.utils import sharedX 29 | 30 | class AdversaryPair(Model): 31 | 32 | def __init__(self, generator, discriminator, inferer=None, 33 | inference_monitoring_batch_size=128, 34 | monitor_generator=True, 35 | monitor_discriminator=True, 36 | monitor_inference=True, 37 | shrink_d = 0.): 38 | Model.__init__(self) 39 | self.__dict__.update(locals()) 40 | del self.self 41 | 42 | def __setstate__(self, state): 43 | self.__dict__.update(state) 44 | if 'inferer' not in state: 45 | self.inferer = None 46 | if 'inference_monitoring_batch_size' not in state: 47 | self.inference_monitoring_batch_size = 128 # TODO: HACK 48 | if 'monitor_generator' not in state: 49 | self.monitor_generator = True 50 | if 'monitor_discriminator' not in state: 51 | self.monitor_discriminator = True 52 | if 'monitor_inference' not in state: 53 | self.monitor_inference = True 54 | 55 | def get_params(self): 56 | p = self.generator.get_params() + self.discriminator.get_params() 57 | if hasattr(self, 'inferer') and self.inferer is not None: 58 | p += self.inferer.get_params() 59 | return p 60 | 61 | def get_input_space(self): 62 | return self.discriminator.get_input_space() 63 | 64 | def get_weights_topo(self): 65 | return self.discriminator.get_weights_topo() 66 | 67 | def get_weights(self): 68 | return self.discriminator.get_weights() 69 | 70 | def get_weights_format(self): 71 | return self.discriminator.get_weights_format() 72 | 73 | def get_weights_view_shape(self): 74 | return self.discriminator.get_weights_view_shape() 
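    # The get_weights* methods above simply delegate to the discriminator,
    # so pylearn2 tooling that expects a single model's weights still works
    # on the pair.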
75 | 76 | def get_monitoring_channels(self, data): 77 | rval = OrderedDict() 78 | 79 | g_ch = self.generator.get_monitoring_channels(data) 80 | d_ch = self.discriminator.get_monitoring_channels((data, None)) 81 | samples = self.generator.sample(100) 82 | d_samp_ch = self.discriminator.get_monitoring_channels((samples, None)) 83 | 84 | i_ch = OrderedDict() 85 | if self.inferer is not None: 86 | batch_size = self.inference_monitoring_batch_size 87 | sample, noise, _ = self.generator.sample_and_noise(batch_size) 88 | i_ch.update(self.inferer.get_monitoring_channels((sample, noise))) 89 | 90 | if self.monitor_generator: 91 | for key in g_ch: 92 | rval['gen_' + key] = g_ch[key] 93 | if self.monitor_discriminator: 94 | for key in d_ch: 95 | rval['dis_on_data_' + key] = d_samp_ch[key] 96 | for key in d_ch: 97 | rval['dis_on_samp_' + key] = d_ch[key] 98 | if self.monitor_inference: 99 | for key in i_ch: 100 | rval['inf_' + key] = i_ch[key] 101 | return rval 102 | 103 | def get_monitoring_data_specs(self): 104 | 105 | space = self.discriminator.get_input_space() 106 | source = self.discriminator.get_input_source() 107 | return (space, source) 108 | 109 | def _modify_updates(self, updates): 110 | self.generator.modify_updates(updates) 111 | self.discriminator.modify_updates(updates) 112 | if self.shrink_d != 0.: 113 | for param in self.discriminator.get_params(): 114 | if param in updates: 115 | updates[param] = self.shrink_d * updates[param] 116 | if self.inferer is not None: 117 | self.inferer.modify_updates(updates) 118 | 119 | def get_lr_scalers(self): 120 | 121 | rval = self.generator.get_lr_scalers() 122 | rval.update(self.discriminator.get_lr_scalers()) 123 | return rval 124 | 125 | def add_layers(mlp, pretrained, start_layer=0): 126 | model = serial.load(pretrained) 127 | pretrained_layers = model.generator.mlp.layers 128 | assert pretrained_layers[start_layer].get_input_space() == mlp.layers[-1].get_output_space() 129 | mlp.layers.extend(pretrained_layers[start_layer:]) 130 | return mlp 131 | 132 | 133 | 134 | class Generator(Model): 135 | 136 | def __init__(self, mlp, noise = "gaussian", monitor_ll = False, ll_n_samples = 100, ll_sigma = 0.2): 137 | Model.__init__(self) 138 | self.__dict__.update(locals()) 139 | del self.self 140 | self.theano_rng = MRG_RandomStreams(2014 * 5 + 27) 141 | 142 | def get_input_space(self): 143 | return self.mlp.get_input_space() 144 | 145 | def sample_and_noise(self, num_samples, default_input_include_prob=1., default_input_scale=1., all_g_layers=False): 146 | n = self.mlp.get_input_space().get_total_dimension() 147 | noise = self.get_noise((num_samples, n)) 148 | formatted_noise = VectorSpace(n).format_as(noise, self.mlp.get_input_space()) 149 | if all_g_layers: 150 | rval = self.mlp.dropout_fprop(formatted_noise, default_input_include_prob=default_input_include_prob, default_input_scale=default_input_scale, return_all=all_g_layers) 151 | other_layers, rval = rval[:-1], rval[-1] 152 | else: 153 | rval = self.mlp.dropout_fprop(formatted_noise, default_input_include_prob=default_input_include_prob, default_input_scale=default_input_scale) 154 | other_layers = None 155 | return rval, formatted_noise, other_layers 156 | 157 | def sample(self, num_samples, default_input_include_prob=1., default_input_scale=1.): 158 | sample, _, _ = self.sample_and_noise(num_samples, default_input_include_prob, default_input_scale) 159 | return sample 160 | 161 | def inpainting_sample_and_noise(self, X, default_input_include_prob=1., default_input_scale=1.): 162 | # Very hacky! 
Specifically for inpainting right half of CIFAR-10 given left half 163 | # assumes X is b01c 164 | assert X.ndim == 4 165 | input_space = self.mlp.get_input_space() 166 | n = input_space.get_total_dimension() 167 | image_size = input_space.shape[0] 168 | half_image = int(image_size / 2) 169 | data_shape = (X.shape[0], image_size, half_image, input_space.num_channels) 170 | 171 | noise = self.theano_rng.normal(size=data_shape, dtype='float32') 172 | Xg = T.set_subtensor(X[:,:,half_image:,:], noise) 173 | sampled_part, noise = self.mlp.dropout_fprop(Xg, default_input_include_prob=default_input_include_prob, default_input_scale=default_input_scale), noise 174 | sampled_part = sampled_part.reshape(data_shape) 175 | rval = T.set_subtensor(X[:, :, half_image:, :], sampled_part) 176 | return rval, noise 177 | 178 | 179 | def get_monitoring_channels(self, data): 180 | if data is None: 181 | m = 100 182 | else: 183 | m = data.shape[0] 184 | n = self.mlp.get_input_space().get_total_dimension() 185 | noise = self.get_noise((m, n)) 186 | rval = OrderedDict() 187 | 188 | try: 189 | rval.update(self.mlp.get_monitoring_channels((noise, None))) 190 | except Exception: 191 | warnings.warn("something went wrong with generator.mlp's monitoring channels") 192 | 193 | if self.monitor_ll: 194 | rval['ll'] = T.cast(self.ll(data, self.ll_n_samples, self.ll_sigma), 195 | theano.config.floatX).mean() 196 | rval['nll'] = -rval['ll'] 197 | return rval 198 | 199 | def get_noise(self, size): 200 | 201 | # Allow just requesting batch size 202 | if isinstance(size, int): 203 | size = (size, self.get_input_space().get_total_dimension()) 204 | 205 | if not hasattr(self, 'noise'): 206 | self.noise = "gaussian" 207 | if self.noise == "uniform": 208 | return self.theano_rng.uniform(low=-np.sqrt(3), high=np.sqrt(3), size=size, dtype='float32') 209 | elif self.noise == "gaussian": 210 | return self.theano_rng.normal(size=size, dtype='float32') 211 | elif self.noise == "spherical": 212 | noise = self.theano_rng.normal(size=size, dtype='float32') 213 | noise = noise / T.maximum(1e-7, T.sqrt(T.sqr(noise).sum(axis=1))).dimshuffle(0, 'x') 214 | return noise 215 | else: 216 | raise NotImplementedError(self.noise) 217 | 218 | def get_params(self): 219 | return self.mlp.get_params() 220 | 221 | def get_output_space(self): 222 | return self.mlp.get_output_space() 223 | 224 | def ll(self, data, n_samples, sigma): 225 | 226 | samples = self.sample(n_samples) 227 | output_space = self.mlp.get_output_space() 228 | if 'Conv2D' in str(output_space): 229 | samples = output_space.convert(samples, output_space.axes, ('b', 0, 1, 'c')) 230 | samples = samples.flatten(2) 231 | data = output_space.convert(data, output_space.axes, ('b', 0, 1, 'c')) 232 | data = data.flatten(2) 233 | parzen = theano_parzen(data, samples, sigma) 234 | return parzen 235 | 236 | def _modify_updates(self, updates): 237 | self.mlp.modify_updates(updates) 238 | 239 | def get_lr_scalers(self): 240 | return self.mlp.get_lr_scalers() 241 | 242 | def __setstate__(self, state): 243 | self.__dict__.update(state) 244 | if 'monitor_ll' not in state: 245 | self.monitor_ll = False 246 | 247 | 248 | class IntrinsicDropoutGenerator(Generator): 249 | def __init__(self, default_input_include_prob, default_input_scale, 250 | input_include_probs=None, input_scales=None, **kwargs): 251 | super(IntrinsicDropoutGenerator, self).__init__(**kwargs) 252 | self.__dict__.update(locals()) 253 | del self.self 254 | 255 | def sample_and_noise(self, num_samples, default_input_include_prob=1., 
default_input_scale=1., all_g_layers=False): 256 | if all_g_layers: 257 | raise NotImplementedError() 258 | n = self.mlp.get_input_space().get_total_dimension() 259 | noise = self.theano_rng.normal(size=(num_samples, n), dtype='float32') 260 | formatted_noise = VectorSpace(n).format_as(noise, self.mlp.get_input_space()) 261 | # ignores dropout args 262 | default_input_include_prob = self.default_input_include_prob 263 | default_input_scale = self.default_input_scale 264 | input_include_probs = self.input_include_probs 265 | input_scales = self.input_scales 266 | return self.mlp.dropout_fprop(formatted_noise, 267 | default_input_include_prob=default_input_include_prob, 268 | default_input_scale=default_input_scale, 269 | input_include_probs=input_include_probs, 270 | input_scales=input_scales), formatted_noise, None 271 | 272 | class AdversaryCost2(DefaultDataSpecsMixin, Cost): 273 | """ 274 | """ 275 | 276 | # Supplies own labels, don't get them from the dataset 277 | supervised = False 278 | 279 | def __init__(self, scale_grads=1, target_scale=.1, 280 | discriminator_default_input_include_prob = 1., 281 | discriminator_input_include_probs=None, 282 | discriminator_default_input_scale=1., 283 | discriminator_input_scales=None, 284 | generator_default_input_include_prob = 1., 285 | generator_default_input_scale=1., 286 | inference_default_input_include_prob=None, 287 | inference_input_include_probs=None, 288 | inference_default_input_scale=1., 289 | inference_input_scales=None, 290 | init_now_train_generator=True, 291 | ever_train_discriminator=True, 292 | ever_train_generator=True, 293 | ever_train_inference=True, 294 | no_drop_in_d_for_g=False, 295 | alternate_g = False, 296 | infer_layer=None, 297 | noise_both = 0., 298 | blend_obj = False, 299 | minimax_coeff = 1., 300 | zurich_coeff = 1.): 301 | self.__dict__.update(locals()) 302 | del self.self 303 | # These allow you to dynamically switch off training parts. 304 | # If the corresponding ever_train_* is False, these have 305 | # no effect. 306 | self.now_train_generator = sharedX(init_now_train_generator) 307 | self.now_train_discriminator = sharedX(numpy.array(1., dtype='float32')) 308 | self.now_train_inference = sharedX(numpy.array(1., dtype='float32')) 309 | 310 | def expr(self, model, data, **kwargs): 311 | S, d_obj, g_obj, i_obj = self.get_samples_and_objectives(model, data) 312 | l = [] 313 | # This stops stuff from ever getting computed if we're not training 314 | # it. 315 | if self.ever_train_discriminator: 316 | l.append(d_obj) 317 | if self.ever_train_generator: 318 | l.append(g_obj) 319 | if self.ever_train_inference: 320 | l.append(i_obj) 321 | return sum(l) 322 | 323 | def get_samples_and_objectives(self, model, data): 324 | space, sources = self.get_data_specs(model) 325 | space.validate(data) 326 | assert isinstance(model, AdversaryPair) 327 | g = model.generator 328 | d = model.discriminator 329 | 330 | # Note: this assumes data is design matrix 331 | X = data 332 | m = data.shape[space.get_batch_axis()] 333 | y1 = T.alloc(1, m, 1) 334 | y0 = T.alloc(0, m, 1) 335 | # NOTE: if this changes to optionally use dropout, change the inference 336 | # code below to use a non-dropped-out version. 
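        # y1/y0 are the "real"/"fake" targets. Below, the discriminator is
        # trained toward y1 on data and toward y0 on generated samples,
        # while the generator is trained toward y1 on its own samples;
        # with the usual sigmoid output layer this is the non-saturating
        # -log D(G(z)) generator objective rather than the raw minimax one
        # (unless blend_obj mixes the minimax term back in).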
337 | S, z, other_layers = g.sample_and_noise(m, default_input_include_prob=self.generator_default_input_include_prob, default_input_scale=self.generator_default_input_scale, all_g_layers=(self.infer_layer is not None)) 338 | 339 | if self.noise_both != 0.: 340 | rng = MRG_RandomStreams(2014 / 6 + 2) 341 | S = S + rng.normal(size=S.shape, dtype=S.dtype) * self.noise_both 342 | X = X + rng.normal(size=X.shape, dtype=S.dtype) * self.noise_both 343 | 344 | y_hat1 = d.dropout_fprop(X, self.discriminator_default_input_include_prob, 345 | self.discriminator_input_include_probs, 346 | self.discriminator_default_input_scale, 347 | self.discriminator_input_scales) 348 | y_hat0 = d.dropout_fprop(S, self.discriminator_default_input_include_prob, 349 | self.discriminator_input_include_probs, 350 | self.discriminator_default_input_scale, 351 | self.discriminator_input_scales) 352 | 353 | d_obj = 0.5 * (d.layers[-1].cost(y1, y_hat1) + d.layers[-1].cost(y0, y_hat0)) 354 | 355 | if self.no_drop_in_d_for_g: 356 | y_hat0_no_drop = d.dropout_fprop(S) 357 | g_obj = d.layers[-1].cost(y1, y_hat0_no_drop) 358 | else: 359 | g_obj = d.layers[-1].cost(y1, y_hat0) 360 | 361 | if self.blend_obj: 362 | g_obj = (self.zurich_coeff * g_obj - self.minimax_coeff * d_obj) / (self.zurich_coeff + self.minimax_coeff) 363 | 364 | if model.inferer is not None: 365 | # Change this if we ever switch to using dropout in the 366 | # construction of S. 367 | S_nograd = block_gradient(S) # Redundant as long as we have custom get_gradients 368 | pred = model.inferer.dropout_fprop(S_nograd, self.inference_default_input_include_prob, 369 | self.inference_input_include_probs, 370 | self.inference_default_input_scale, 371 | self.inference_input_scales) 372 | if self.infer_layer is None: 373 | target = z 374 | else: 375 | target = other_layers[self.infer_layer] 376 | i_obj = model.inferer.layers[-1].cost(target, pred) 377 | else: 378 | i_obj = 0 379 | 380 | return S, d_obj, g_obj, i_obj 381 | 382 | def get_gradients(self, model, data, **kwargs): 383 | space, sources = self.get_data_specs(model) 384 | space.validate(data) 385 | assert isinstance(model, AdversaryPair) 386 | g = model.generator 387 | d = model.discriminator 388 | 389 | S, d_obj, g_obj, i_obj = self.get_samples_and_objectives(model, data) 390 | 391 | g_params = g.get_params() 392 | d_params = d.get_params() 393 | for param in g_params: 394 | assert param not in d_params 395 | for param in d_params: 396 | assert param not in g_params 397 | d_grads = T.grad(d_obj, d_params) 398 | g_grads = T.grad(g_obj, g_params) 399 | 400 | if self.scale_grads: 401 | S_grad = T.grad(g_obj, S) 402 | scale = T.maximum(1., self.target_scale / T.sqrt(T.sqr(S_grad).sum())) 403 | g_grads = [g_grad * scale for g_grad in g_grads] 404 | 405 | rval = OrderedDict() 406 | zeros = itertools.repeat(theano.tensor.constant(0., dtype='float32')) 407 | if self.ever_train_discriminator: 408 | rval.update(OrderedDict(safe_zip(d_params, [self.now_train_discriminator * dg for dg in d_grads]))) 409 | else: 410 | rval.update(OrderedDict(zip(d_params, zeros))) 411 | if self.ever_train_generator: 412 | rval.update(OrderedDict(safe_zip(g_params, [self.now_train_generator * gg for gg in g_grads]))) 413 | else: 414 | rval.update(OrderedDict(zip(g_params, zeros))) 415 | if self.ever_train_inference and model.inferer is not None: 416 | i_params = model.inferer.get_params() 417 | i_grads = T.grad(i_obj, i_params) 418 | rval.update(OrderedDict(safe_zip(i_params, [self.now_train_inference * ig for ig in i_grads]))) 419 | elif 
model.inferer is not None: 420 | rval.update(OrderedDict(model.inferer.get_params(), zeros)) 421 | 422 | updates = OrderedDict() 423 | 424 | # Two d steps for every g step 425 | if self.alternate_g: 426 | updates[self.now_train_generator] = 1. - self.now_train_generator 427 | 428 | return rval, updates 429 | 430 | def get_monitoring_channels(self, model, data, **kwargs): 431 | 432 | rval = OrderedDict() 433 | 434 | m = data.shape[0] 435 | 436 | g = model.generator 437 | d = model.discriminator 438 | 439 | y_hat = d.fprop(data) 440 | 441 | rval['false_negatives'] = T.cast((y_hat < 0.5).mean(), 'float32') 442 | 443 | samples = g.sample(m) 444 | y_hat = d.fprop(samples) 445 | rval['false_positives'] = T.cast((y_hat > 0.5).mean(), 'float32') 446 | # y = T.alloc(0., m, 1) 447 | cost = d.cost_from_X((samples, y_hat)) 448 | sample_grad = T.grad(-cost, samples) 449 | rval['sample_grad_norm'] = T.sqrt(T.sqr(sample_grad).sum()) 450 | _S, d_obj, g_obj, i_obj = self.get_samples_and_objectives(model, data) 451 | if model.monitor_inference and i_obj != 0: 452 | rval['objective_i'] = i_obj 453 | if model.monitor_discriminator: 454 | rval['objective_d'] = d_obj 455 | if model.monitor_generator: 456 | rval['objective_g'] = g_obj 457 | 458 | rval['now_train_generator'] = self.now_train_generator 459 | return rval 460 | 461 | def recapitate_discriminator(pair_path, new_head): 462 | pair = serial.load(pair_path) 463 | d = pair.discriminator 464 | del d.layers[-1] 465 | d.add_layers([new_head]) 466 | return d 467 | 468 | def theano_parzen(data, mu, sigma): 469 | """ 470 | Credit: Yann N. Dauphin 471 | """ 472 | x = data 473 | 474 | a = ( x.dimshuffle(0, 'x', 1) - mu.dimshuffle('x', 0, 1) ) / sigma 475 | 476 | E = log_mean_exp(-0.5*(a**2).sum(2)) 477 | 478 | Z = mu.shape[1] * T.log(sigma * numpy.sqrt(numpy.pi * 2)) 479 | 480 | #return theano.function([x], E - Z) 481 | return E - Z 482 | 483 | 484 | def log_mean_exp(a): 485 | """ 486 | Credit: Yann N. 
Dauphin 487 | """ 488 | 489 | max_ = a.max(1) 490 | 491 | return max_ + T.log(T.exp(a - max_.dimshuffle(0, 'x')).mean(1)) 492 | 493 | class Sum(Layer): 494 | """ 495 | Monitoring channels are hardcoded for C01B batches 496 | """ 497 | 498 | def __init__(self, layer_name): 499 | Model.__init__(self) 500 | self.__dict__.update(locals()) 501 | del self.self 502 | self._params = [] 503 | 504 | def set_input_space(self, space): 505 | self.input_space = space 506 | assert isinstance(space, CompositeSpace) 507 | self.output_space = space.components[0] 508 | 509 | def fprop(self, state_below): 510 | rval = state_below[0] 511 | for i in xrange(1, len(state_below)): 512 | rval = rval + state_below[i] 513 | rval.came_from_sum = True 514 | return rval 515 | 516 | @functools.wraps(Layer.get_layer_monitoring_channels) 517 | def get_layer_monitoring_channels(self, state_below=None, 518 | state=None, targets=None): 519 | rval = OrderedDict() 520 | 521 | if state is None: 522 | state = self.fprop(state_below) 523 | vars_and_prefixes = [(state, '')] 524 | 525 | for var, prefix in vars_and_prefixes: 526 | if not hasattr(var, 'ndim') or var.ndim != 4: 527 | print "expected 4D tensor, got " 528 | print var 529 | print type(var) 530 | if isinstance(var, tuple): 531 | print "tuple length: ", len(var) 532 | assert False 533 | v_max = var.max(axis=(1, 2, 3)) 534 | v_min = var.min(axis=(1, 2, 3)) 535 | v_mean = var.mean(axis=(1, 2, 3)) 536 | v_range = v_max - v_min 537 | 538 | # max_x.mean_u is "the mean over *u*nits of the max over 539 | # e*x*amples" The x and u are included in the name because 540 | # otherwise its hard to remember which axis is which when reading 541 | # the monitor I use inner.outer rather than outer_of_inner or 542 | # something like that because I want mean_x.* to appear next to 543 | # each other in the alphabetical list, as these are commonly 544 | # plotted together 545 | for key, val in [('max_x.max_u', v_max.max()), 546 | ('max_x.mean_u', v_max.mean()), 547 | ('max_x.min_u', v_max.min()), 548 | ('min_x.max_u', v_min.max()), 549 | ('min_x.mean_u', v_min.mean()), 550 | ('min_x.min_u', v_min.min()), 551 | ('range_x.max_u', v_range.max()), 552 | ('range_x.mean_u', v_range.mean()), 553 | ('range_x.min_u', v_range.min()), 554 | ('mean_x.max_u', v_mean.max()), 555 | ('mean_x.mean_u', v_mean.mean()), 556 | ('mean_x.min_u', v_mean.min())]: 557 | rval[prefix+key] = val 558 | 559 | return rval 560 | 561 | def marginals(dataset): 562 | return dataset.X.mean(axis=0) 563 | 564 | class ActivateGenerator(TrainExtension): 565 | def __init__(self, active_after, value=1.): 566 | self.__dict__.update(locals()) 567 | del self.self 568 | self.cur_epoch = 0 569 | 570 | def on_monitor(self, model, dataset, algorithm): 571 | if self.cur_epoch == self.active_after: 572 | algorithm.cost.now_train_generator.set_value(np.array(self.value, dtype='float32')) 573 | self.cur_epoch += 1 574 | 575 | class InpaintingAdversaryCost(DefaultDataSpecsMixin, Cost): 576 | """ 577 | """ 578 | 579 | # Supplies own labels, don't get them from the dataset 580 | supervised = False 581 | 582 | def __init__(self, scale_grads=1, target_scale=.1, 583 | discriminator_default_input_include_prob = 1., 584 | discriminator_input_include_probs=None, 585 | discriminator_default_input_scale=1., 586 | discriminator_input_scales=None, 587 | generator_default_input_include_prob = 1., 588 | generator_default_input_scale=1., 589 | inference_default_input_include_prob=None, 590 | inference_input_include_probs=None, 591 | 
inference_default_input_scale=1., 592 | inference_input_scales=None, 593 | init_now_train_generator=True, 594 | ever_train_discriminator=True, 595 | ever_train_generator=True, 596 | ever_train_inference=True, 597 | no_drop_in_d_for_g=False, 598 | alternate_g = False): 599 | self.__dict__.update(locals()) 600 | del self.self 601 | # These allow you to dynamically switch off training parts. 602 | # If the corresponding ever_train_* is False, these have 603 | # no effect. 604 | self.now_train_generator = sharedX(init_now_train_generator) 605 | self.now_train_discriminator = sharedX(numpy.array(1., dtype='float32')) 606 | self.now_train_inference = sharedX(numpy.array(1., dtype='float32')) 607 | 608 | def expr(self, model, data, **kwargs): 609 | S, d_obj, g_obj, i_obj = self.get_samples_and_objectives(model, data) 610 | return d_obj + g_obj + i_obj 611 | 612 | def get_samples_and_objectives(self, model, data): 613 | space, sources = self.get_data_specs(model) 614 | space.validate(data) 615 | assert isinstance(model, AdversaryPair) 616 | g = model.generator 617 | d = model.discriminator 618 | 619 | # Note: this assumes data is b01c 620 | X = data 621 | assert X.ndim == 4 622 | m = data.shape[space.get_batch_axis()] 623 | y1 = T.alloc(1, m, 1) 624 | y0 = T.alloc(0, m, 1) 625 | # NOTE: if this changes to optionally use dropout, change the inference 626 | # code below to use a non-dropped-out version. 627 | S, z = g.inpainting_sample_and_noise(X, default_input_include_prob=self.generator_default_input_include_prob, default_input_scale=self.generator_default_input_scale) 628 | y_hat1 = d.dropout_fprop(X, self.discriminator_default_input_include_prob, 629 | self.discriminator_input_include_probs, 630 | self.discriminator_default_input_scale, 631 | self.discriminator_input_scales) 632 | y_hat0 = d.dropout_fprop(S, self.discriminator_default_input_include_prob, 633 | self.discriminator_input_include_probs, 634 | self.discriminator_default_input_scale, 635 | self.discriminator_input_scales) 636 | 637 | d_obj = 0.5 * (d.layers[-1].cost(y1, y_hat1) + d.layers[-1].cost(y0, y_hat0)) 638 | 639 | if self.no_drop_in_d_for_g: 640 | y_hat0_no_drop = d.dropout_fprop(S) 641 | g_obj = d.layers[-1].cost(y1, y_hat0) 642 | else: 643 | g_obj = d.layers[-1].cost(y1, y_hat0) 644 | 645 | if model.inferer is not None: 646 | # Change this if we ever switch to using dropout in the 647 | # construction of S. 
648 | S_nograd = block_gradient(S) # Redundant as long as we have custom get_gradients 649 | z_hat = model.inferer.dropout_fprop(S_nograd, self.inference_default_input_include_prob, 650 | self.inference_input_include_probs, 651 | self.inference_default_input_scale, 652 | self.inference_input_scales) 653 | i_obj = model.inferer.layers[-1].cost(z, z_hat) 654 | else: 655 | i_obj = 0 656 | 657 | return S, d_obj, g_obj, i_obj 658 | 659 | def get_gradients(self, model, data, **kwargs): 660 | space, sources = self.get_data_specs(model) 661 | space.validate(data) 662 | assert isinstance(model, AdversaryPair) 663 | g = model.generator 664 | d = model.discriminator 665 | 666 | S, d_obj, g_obj, i_obj = self.get_samples_and_objectives(model, data) 667 | 668 | g_params = g.get_params() 669 | d_params = d.get_params() 670 | for param in g_params: 671 | assert param not in d_params 672 | for param in d_params: 673 | assert param not in g_params 674 | d_grads = T.grad(d_obj, d_params) 675 | g_grads = T.grad(g_obj, g_params) 676 | 677 | if self.scale_grads: 678 | S_grad = T.grad(g_obj, S) 679 | scale = T.maximum(1., self.target_scale / T.sqrt(T.sqr(S_grad).sum())) 680 | g_grads = [g_grad * scale for g_grad in g_grads] 681 | 682 | rval = OrderedDict() 683 | if self.ever_train_discriminator: 684 | rval.update(OrderedDict(safe_zip(d_params, [self.now_train_discriminator * dg for dg in d_grads]))) 685 | else: 686 | rval.update(OrderedDict(zip(d_params, itertools.repeat(theano.tensor.constant(0., dtype='float32'))))) 687 | 688 | if self.ever_train_generator: 689 | rval.update(OrderedDict(safe_zip(g_params, [self.now_train_generator * gg for gg in g_grads]))) 690 | else: 691 | rval.update(OrderedDict(zip(g_params, itertools.repeat(theano.tensor.constant(0., dtype='float32'))))) 692 | 693 | if self.ever_train_inference and model.inferer is not None: 694 | i_params = model.inferer.get_params() 695 | i_grads = T.grad(i_obj, i_params) 696 | rval.update(OrderedDict(safe_zip(i_params, [self.now_train_inference * ig for ig in i_grads]))) 697 | 698 | updates = OrderedDict() 699 | 700 | # Two d steps for every g step 701 | if self.alternate_g: 702 | updates[self.now_train_generator] = 1. 
- self.now_train_generator 703 | 704 | return rval, updates 705 | 706 | def get_monitoring_channels(self, model, data, **kwargs): 707 | 708 | rval = OrderedDict() 709 | 710 | m = data.shape[0] 711 | 712 | g = model.generator 713 | d = model.discriminator 714 | 715 | y_hat = d.fprop(data) 716 | 717 | rval['false_negatives'] = T.cast((y_hat < 0.5).mean(), 'float32') 718 | 719 | samples, noise = g.inpainting_sample_and_noise(data) 720 | y_hat = d.fprop(samples) 721 | rval['false_positives'] = T.cast((y_hat > 0.5).mean(), 'float32') 722 | # y = T.alloc(0., m, 1) 723 | cost = d.cost_from_X((samples, y_hat)) 724 | sample_grad = T.grad(-cost, samples) 725 | rval['sample_grad_norm'] = T.sqrt(T.sqr(sample_grad).sum()) 726 | _S, d_obj, g_obj, i_obj = self.get_samples_and_objectives(model, data) 727 | if i_obj != 0: 728 | rval['objective_i'] = i_obj 729 | rval['objective_d'] = d_obj 730 | rval['objective_g'] = g_obj 731 | 732 | rval['now_train_generator'] = self.now_train_generator 733 | return rval 734 | 735 | class Cycler(object): 736 | 737 | def __init__(self, k): 738 | self.__dict__.update(locals()) 739 | del self.self 740 | self.i = 0 741 | 742 | def __call__(self, sgd): 743 | self.i = (self.i + 1) % self.k 744 | sgd.cost.now_train_generator.set_value(np.cast['float32'](self.i == 0)) 745 | 746 | class NoiseCat(Layer): 747 | 748 | def __init__(self, new_dim, std, layer_name): 749 | Layer.__init__(self) 750 | self.__dict__.update(locals()) 751 | del self.self 752 | self._params = [] 753 | 754 | def set_input_space(self, space): 755 | assert isinstance(space, VectorSpace) 756 | self.input_space = space 757 | self.output_space = VectorSpace(space.dim + self.new_dim) 758 | self.theano_rng = MRG_RandomStreams(self.mlp.rng.randint(2 ** 16)) 759 | 760 | def fprop(self, state): 761 | noise = self.theano_rng.normal(std=self.std, avg=0., size=(state.shape[0], self.new_dim), 762 | dtype=state.dtype) 763 | return T.concatenate((state, noise), axis=1) 764 | 765 | class RectifiedLinear(Layer): 766 | 767 | def __init__(self, layer_name, left_slope=0.0, **kwargs): 768 | super(RectifiedLinear, self).__init__(**kwargs) 769 | self.__dict__.update(locals()) 770 | del self.self 771 | self._params = [] 772 | 773 | def set_input_space(self, space): 774 | self.input_space = space 775 | self.output_space = space 776 | 777 | def fprop(self, state_below): 778 | p = state_below 779 | p = T.switch(p > 0., p, self.left_slope * p) 780 | return p 781 | 782 | class Sigmoid(Layer): 783 | 784 | def __init__(self, layer_name, left_slope=0.0, **kwargs): 785 | super(Sigmoid, self).__init__(**kwargs) 786 | self.__dict__.update(locals()) 787 | del self.self 788 | self._params = [] 789 | 790 | def set_input_space(self, space): 791 | self.input_space = space 792 | self.output_space = space 793 | 794 | def fprop(self, state_below): 795 | p = T.nnet.sigmoid(state_below) 796 | return p 797 | 798 | class SubtractHalf(Layer): 799 | 800 | def __init__(self, layer_name, left_slope=0.0, **kwargs): 801 | super(SubtractHalf, self).__init__(**kwargs) 802 | self.__dict__.update(locals()) 803 | del self.self 804 | self._params = [] 805 | 806 | def set_input_space(self, space): 807 | self.input_space = space 808 | self.output_space = space 809 | 810 | def fprop(self, state_below): 811 | return state_below - 0.5 812 | 813 | def get_weights(self): 814 | return self.mlp.layers[1].get_weights() 815 | 816 | def get_weights_format(self): 817 | return self.mlp.layers[1].get_weights_format() 818 | 819 | def get_weights_view_shape(self): 820 | return 
self.mlp.layers[1].get_weights_view_shape()
821 |
822 | class SubtractRealMean(Layer):
823 |
824 |     def __init__(self, layer_name, dataset, also_sd = False, **kwargs):
825 |         super(SubtractRealMean, self).__init__(**kwargs)
826 |         self.__dict__.update(locals())
827 |         del self.self
828 |         self._params = []
829 |         self.mean = sharedX(dataset.X.mean(axis=0))
830 |         if also_sd:
831 |             self.sd = sharedX(dataset.X.std(axis=0))
832 |         del self.dataset
833 |
834 |     def set_input_space(self, space):
835 |         self.input_space = space
836 |         self.output_space = space
837 |
838 |     def fprop(self, state_below):
839 |         return (state_below - self.mean) / (self.sd if self.also_sd else 1.)  # self.sd only exists when also_sd=True
840 |
841 |     def get_weights(self):
842 |         return self.mlp.layers[1].get_weights()
843 |
844 |     def get_weights_format(self):
845 |         return self.mlp.layers[1].get_weights_format()
846 |
847 |     def get_weights_view_shape(self):
848 |         return self.mlp.layers[1].get_weights_view_shape()
849 |
850 |
851 | class Clusterize(Layer):
852 |
853 |     def __init__(self, scale, layer_name):
854 |         Layer.__init__(self)
855 |         self.__dict__.update(locals())
856 |         del self.self
857 |         self._params = []
858 |
859 |     def set_input_space(self, space):
860 |         assert isinstance(space, VectorSpace)
861 |         self.input_space = space
862 |         self.output_space = space
863 |         self.theano_rng = MRG_RandomStreams(self.mlp.rng.randint(2 ** 16))
864 |
865 |     def fprop(self, state):
866 |         noise = self.theano_rng.binomial(size=state.shape, p=0.5,
867 |                                          dtype=state.dtype) * 2. - 1.
868 |         return state + self.scale * noise
869 |
870 |
871 |
872 | class ThresholdedAdversaryCost(DefaultDataSpecsMixin, Cost):
873 |     """Adversarial cost in which the generator is penalized only on samples
874 |     that the discriminator still scores below 0.5, i.e. still detects as fake."""
875 |
876 |     # Supplies own labels, don't get them from the dataset
877 |     supervised = False
878 |
879 |     def __init__(self, scale_grads=1, target_scale=.1,
880 |                  discriminator_default_input_include_prob = 1.,
881 |                  discriminator_input_include_probs=None,
882 |                  discriminator_default_input_scale=1.,
883 |                  discriminator_input_scales=None,
884 |                  generator_default_input_include_prob = 1.,
885 |                  generator_default_input_scale=1.,
886 |                  inference_default_input_include_prob=None,
887 |                  inference_input_include_probs=None,
888 |                  inference_default_input_scale=1.,
889 |                  inference_input_scales=None,
890 |                  init_now_train_generator=True,
891 |                  ever_train_discriminator=True,
892 |                  ever_train_generator=True,
893 |                  ever_train_inference=True,
894 |                  no_drop_in_d_for_g=False,
895 |                  alternate_g = False,
896 |                  infer_layer=None,
897 |                  noise_both = 0.):
898 |         self.__dict__.update(locals())
899 |         del self.self
900 |         # These allow you to dynamically switch off training parts.
901 |         # If the corresponding ever_train_* is False, these have
902 |         # no effect.
903 |         self.now_train_generator = sharedX(init_now_train_generator)
904 |         self.now_train_discriminator = sharedX(numpy.array(1., dtype='float32'))
905 |         self.now_train_inference = sharedX(numpy.array(1., dtype='float32'))
906 |
907 |     def expr(self, model, data, **kwargs):
908 |         S, d_obj, g_obj, i_obj = self.get_samples_and_objectives(model, data)
909 |         l = []
910 |         # This stops stuff from ever getting computed if we're not training
911 |         # it.
912 | if self.ever_train_discriminator: 913 | l.append(d_obj) 914 | if self.ever_train_generator: 915 | l.append(g_obj) 916 | if self.ever_train_inference: 917 | l.append(i_obj) 918 | return sum(l) 919 | 920 | def get_samples_and_objectives(self, model, data): 921 | space, sources = self.get_data_specs(model) 922 | space.validate(data) 923 | assert isinstance(model, AdversaryPair) 924 | g = model.generator 925 | d = model.discriminator 926 | 927 | # Note: this assumes data is design matrix 928 | X = data 929 | m = data.shape[space.get_batch_axis()] 930 | y1 = T.alloc(1, m, 1) 931 | y0 = T.alloc(0, m, 1) 932 | # NOTE: if this changes to optionally use dropout, change the inference 933 | # code below to use a non-dropped-out version. 934 | S, z, other_layers = g.sample_and_noise(m, default_input_include_prob=self.generator_default_input_include_prob, default_input_scale=self.generator_default_input_scale, all_g_layers=(self.infer_layer is not None)) 935 | 936 | if self.noise_both != 0.: 937 | rng = MRG_RandomStreams(2014 / 6 + 2) 938 | S = S + rng.normal(size=S.shape, dtype=S.dtype) * self.noise_both 939 | X = X + rng.normal(size=X.shape, dtype=S.dtype) * self.noise_both 940 | 941 | y_hat1 = d.dropout_fprop(X, self.discriminator_default_input_include_prob, 942 | self.discriminator_input_include_probs, 943 | self.discriminator_default_input_scale, 944 | self.discriminator_input_scales) 945 | y_hat0 = d.dropout_fprop(S, self.discriminator_default_input_include_prob, 946 | self.discriminator_input_include_probs, 947 | self.discriminator_default_input_scale, 948 | self.discriminator_input_scales) 949 | 950 | d_obj = 0.5 * (d.layers[-1].cost(y1, y_hat1) + d.layers[-1].cost(y0, y_hat0)) 951 | 952 | if self.no_drop_in_d_for_g: 953 | y_hat0_no_drop = d.dropout_fprop(S) 954 | g_cost_mat = d.layers[-1].cost_matrix(y1, y_hat0_no_drop) 955 | else: 956 | g_cost_mat = d.layers[-1].cost_matrix(y1, y_hat0) 957 | assert g_cost_mat.ndim == 2 958 | assert y_hat0.ndim == 2 959 | 960 | mask = y_hat0 < 0.5 961 | masked_cost = g_cost_mat * mask 962 | g_obj = masked_cost.mean() 963 | 964 | 965 | if model.inferer is not None: 966 | # Change this if we ever switch to using dropout in the 967 | # construction of S. 
968 |             S_nograd = block_gradient(S) # Redundant as long as we have custom get_gradients
969 |             pred = model.inferer.dropout_fprop(S_nograd, self.inference_default_input_include_prob,
970 |                                                self.inference_input_include_probs,
971 |                                                self.inference_default_input_scale,
972 |                                                self.inference_input_scales)
973 |             if self.infer_layer is None:
974 |                 target = z
975 |             else:
976 |                 target = other_layers[self.infer_layer]
977 |             i_obj = model.inferer.layers[-1].cost(target, pred)
978 |         else:
979 |             i_obj = 0
980 |
981 |         return S, d_obj, g_obj, i_obj
982 |
983 |     def get_gradients(self, model, data, **kwargs):
984 |         space, sources = self.get_data_specs(model)
985 |         space.validate(data)
986 |         assert isinstance(model, AdversaryPair)
987 |         g = model.generator
988 |         d = model.discriminator
989 |
990 |         S, d_obj, g_obj, i_obj = self.get_samples_and_objectives(model, data)
991 |
992 |         g_params = g.get_params()
993 |         d_params = d.get_params()
994 |         for param in g_params:
995 |             assert param not in d_params
996 |         for param in d_params:
997 |             assert param not in g_params
998 |         d_grads = T.grad(d_obj, d_params)
999 |         g_grads = T.grad(g_obj, g_params)
1000 |
1001 |         if self.scale_grads:
1002 |             S_grad = T.grad(g_obj, S)
1003 |             scale = T.maximum(1., self.target_scale / T.sqrt(T.sqr(S_grad).sum()))
1004 |             g_grads = [g_grad * scale for g_grad in g_grads]
1005 |
1006 |         rval = OrderedDict()
1007 |         zeros = itertools.repeat(theano.tensor.constant(0., dtype='float32'))
1008 |         if self.ever_train_discriminator:
1009 |             rval.update(OrderedDict(safe_zip(d_params, [self.now_train_discriminator * dg for dg in d_grads])))
1010 |         else:
1011 |             rval.update(OrderedDict(zip(d_params, zeros)))
1012 |         if self.ever_train_generator:
1013 |             rval.update(OrderedDict(safe_zip(g_params, [self.now_train_generator * gg for gg in g_grads])))
1014 |         else:
1015 |             rval.update(OrderedDict(zip(g_params, zeros)))
1016 |         if self.ever_train_inference and model.inferer is not None:
1017 |             i_params = model.inferer.get_params()
1018 |             i_grads = T.grad(i_obj, i_params)
1019 |             rval.update(OrderedDict(safe_zip(i_params, [self.now_train_inference * ig for ig in i_grads])))
1020 |         elif model.inferer is not None:
1021 |             rval.update(OrderedDict(zip(model.inferer.get_params(), zeros)))
1022 |
1023 |         updates = OrderedDict()
1024 |
1025 |         # Two d steps for every g step
1026 |         if self.alternate_g:
1027 |             updates[self.now_train_generator] = 1.
- self.now_train_generator 1028 | 1029 | return rval, updates 1030 | 1031 | def get_monitoring_channels(self, model, data, **kwargs): 1032 | 1033 | rval = OrderedDict() 1034 | 1035 | m = data.shape[0] 1036 | 1037 | g = model.generator 1038 | d = model.discriminator 1039 | 1040 | y_hat = d.fprop(data) 1041 | 1042 | rval['false_negatives'] = T.cast((y_hat < 0.5).mean(), 'float32') 1043 | 1044 | samples = g.sample(m) 1045 | y_hat = d.fprop(samples) 1046 | rval['false_positives'] = T.cast((y_hat > 0.5).mean(), 'float32') 1047 | # y = T.alloc(0., m, 1) 1048 | cost = d.cost_from_X((samples, y_hat)) 1049 | sample_grad = T.grad(-cost, samples) 1050 | rval['sample_grad_norm'] = T.sqrt(T.sqr(sample_grad).sum()) 1051 | _S, d_obj, g_obj, i_obj = self.get_samples_and_objectives(model, data) 1052 | if model.monitor_inference and i_obj != 0: 1053 | rval['objective_i'] = i_obj 1054 | if model.monitor_discriminator: 1055 | rval['objective_d'] = d_obj 1056 | if model.monitor_generator: 1057 | rval['objective_g'] = g_obj 1058 | 1059 | rval['now_train_generator'] = self.now_train_generator 1060 | return rval 1061 | 1062 | 1063 | class HardSigmoid(Linear): 1064 | """ 1065 | Hard "sigmoid" (note: shifted along the x axis) 1066 | """ 1067 | 1068 | def __init__(self, left_slope=0.0, **kwargs): 1069 | super(HardSigmoid, self).__init__(**kwargs) 1070 | self.left_slope = left_slope 1071 | 1072 | @wraps(Layer.fprop) 1073 | def fprop(self, state_below): 1074 | 1075 | p = self._linear_part(state_below) 1076 | # Original: p = p * (p > 0.) + self.left_slope * p * (p < 0.) 1077 | # T.switch is faster. 1078 | # For details, see benchmarks in 1079 | # pylearn2/scripts/benchmark/time_relu.py 1080 | p = T.clip(p, 0., 1.) 1081 | return p 1082 | 1083 | @wraps(Layer.cost) 1084 | def cost(self, *args, **kwargs): 1085 | 1086 | raise NotImplementedError() 1087 | 1088 | 1089 | class LazyAdversaryCost(DefaultDataSpecsMixin, Cost): 1090 | """ 1091 | """ 1092 | 1093 | # Supplies own labels, don't get them from the dataset 1094 | supervised = False 1095 | 1096 | def __init__(self, scale_grads=1, target_scale=.1, 1097 | discriminator_default_input_include_prob = 1., 1098 | discriminator_input_include_probs=None, 1099 | discriminator_default_input_scale=1., 1100 | discriminator_input_scales=None, 1101 | generator_default_input_include_prob = 1., 1102 | generator_default_input_scale=1., 1103 | inference_default_input_include_prob=None, 1104 | inference_input_include_probs=None, 1105 | inference_default_input_scale=1., 1106 | inference_input_scales=None, 1107 | init_now_train_generator=True, 1108 | ever_train_discriminator=True, 1109 | ever_train_generator=True, 1110 | ever_train_inference=True, 1111 | no_drop_in_d_for_g=False, 1112 | alternate_g = False, 1113 | infer_layer=None, 1114 | noise_both = 0., 1115 | g_eps = 0., 1116 | d_eps =0.): 1117 | self.__dict__.update(locals()) 1118 | del self.self 1119 | # These allow you to dynamically switch off training parts. 1120 | # If the corresponding ever_train_* is False, these have 1121 | # no effect. 1122 | self.now_train_generator = sharedX(init_now_train_generator) 1123 | self.now_train_discriminator = sharedX(numpy.array(1., dtype='float32')) 1124 | self.now_train_inference = sharedX(numpy.array(1., dtype='float32')) 1125 | 1126 | def expr(self, model, data, **kwargs): 1127 | S, d_obj, g_obj, i_obj = self.get_samples_and_objectives(model, data) 1128 | l = [] 1129 | # This stops stuff from ever getting computed if we're not training 1130 | # it. 
1131 | if self.ever_train_discriminator: 1132 | l.append(d_obj) 1133 | if self.ever_train_generator: 1134 | l.append(g_obj) 1135 | if self.ever_train_inference: 1136 | l.append(i_obj) 1137 | return sum(l) 1138 | 1139 | def get_samples_and_objectives(self, model, data): 1140 | space, sources = self.get_data_specs(model) 1141 | space.validate(data) 1142 | assert isinstance(model, AdversaryPair) 1143 | g = model.generator 1144 | d = model.discriminator 1145 | 1146 | # Note: this assumes data is design matrix 1147 | X = data 1148 | m = data.shape[space.get_batch_axis()] 1149 | y1 = T.alloc(1, m, 1) 1150 | y0 = T.alloc(0, m, 1) 1151 | # NOTE: if this changes to optionally use dropout, change the inference 1152 | # code below to use a non-dropped-out version. 1153 | S, z, other_layers = g.sample_and_noise(m, default_input_include_prob=self.generator_default_input_include_prob, default_input_scale=self.generator_default_input_scale, all_g_layers=(self.infer_layer is not None)) 1154 | 1155 | if self.noise_both != 0.: 1156 | rng = MRG_RandomStreams(2014 / 6 + 2) 1157 | S = S + rng.normal(size=S.shape, dtype=S.dtype) * self.noise_both 1158 | X = X + rng.normal(size=X.shape, dtype=S.dtype) * self.noise_both 1159 | 1160 | y_hat1 = d.dropout_fprop(X, self.discriminator_default_input_include_prob, 1161 | self.discriminator_input_include_probs, 1162 | self.discriminator_default_input_scale, 1163 | self.discriminator_input_scales) 1164 | y_hat0 = d.dropout_fprop(S, self.discriminator_default_input_include_prob, 1165 | self.discriminator_input_include_probs, 1166 | self.discriminator_default_input_scale, 1167 | self.discriminator_input_scales) 1168 | 1169 | # d_obj = 0.5 * (d.layers[-1].cost(y1, y_hat1) + d.layers[-1].cost(y0, y_hat0)) 1170 | 1171 | pos_mask = y_hat1 < .5 + self.d_eps 1172 | neg_mask = y_hat0 > .5 - self.d_eps 1173 | 1174 | pos_cost_matrix = d.layers[-1].cost_matrix(y1, y_hat1) 1175 | neg_cost_matrix = d.layers[-1].cost_matrix(y0, y_hat0) 1176 | 1177 | pos_cost = (pos_mask * pos_cost_matrix).mean() 1178 | neg_cost = (neg_mask * neg_cost_matrix).mean() 1179 | 1180 | d_obj = 0.5 * (pos_cost + neg_cost) 1181 | 1182 | if self.no_drop_in_d_for_g: 1183 | y_hat0_no_drop = d.dropout_fprop(S) 1184 | g_cost_mat = d.layers[-1].cost_matrix(y1, y_hat0_no_drop) 1185 | else: 1186 | g_cost_mat = d.layers[-1].cost_matrix(y1, y_hat0) 1187 | assert g_cost_mat.ndim == 2 1188 | assert y_hat0.ndim == 2 1189 | 1190 | mask = y_hat0 < 0.5 + self.g_eps 1191 | masked_cost = g_cost_mat * mask 1192 | g_obj = masked_cost.mean() 1193 | 1194 | 1195 | if model.inferer is not None: 1196 | # Change this if we ever switch to using dropout in the 1197 | # construction of S. 
1198 |             S_nograd = block_gradient(S) # Redundant as long as we have custom get_gradients
1199 |             pred = model.inferer.dropout_fprop(S_nograd, self.inference_default_input_include_prob,
1200 |                                                self.inference_input_include_probs,
1201 |                                                self.inference_default_input_scale,
1202 |                                                self.inference_input_scales)
1203 |             if self.infer_layer is None:
1204 |                 target = z
1205 |             else:
1206 |                 target = other_layers[self.infer_layer]
1207 |             i_obj = model.inferer.layers[-1].cost(target, pred)
1208 |         else:
1209 |             i_obj = 0
1210 |
1211 |         return S, d_obj, g_obj, i_obj
1212 |
1213 |     def get_gradients(self, model, data, **kwargs):
1214 |         space, sources = self.get_data_specs(model)
1215 |         space.validate(data)
1216 |         assert isinstance(model, AdversaryPair)
1217 |         g = model.generator
1218 |         d = model.discriminator
1219 |
1220 |         S, d_obj, g_obj, i_obj = self.get_samples_and_objectives(model, data)
1221 |
1222 |         g_params = g.get_params()
1223 |         d_params = d.get_params()
1224 |         for param in g_params:
1225 |             assert param not in d_params
1226 |         for param in d_params:
1227 |             assert param not in g_params
1228 |         d_grads = T.grad(d_obj, d_params)
1229 |         g_grads = T.grad(g_obj, g_params)
1230 |
1231 |         if self.scale_grads:
1232 |             S_grad = T.grad(g_obj, S)
1233 |             scale = T.maximum(1., self.target_scale / T.sqrt(T.sqr(S_grad).sum()))
1234 |             g_grads = [g_grad * scale for g_grad in g_grads]
1235 |
1236 |         rval = OrderedDict()
1237 |         zeros = itertools.repeat(theano.tensor.constant(0., dtype='float32'))
1238 |         if self.ever_train_discriminator:
1239 |             rval.update(OrderedDict(safe_zip(d_params, [self.now_train_discriminator * dg for dg in d_grads])))
1240 |         else:
1241 |             rval.update(OrderedDict(zip(d_params, zeros)))
1242 |         if self.ever_train_generator:
1243 |             rval.update(OrderedDict(safe_zip(g_params, [self.now_train_generator * gg for gg in g_grads])))
1244 |         else:
1245 |             rval.update(OrderedDict(zip(g_params, zeros)))
1246 |         if self.ever_train_inference and model.inferer is not None:
1247 |             i_params = model.inferer.get_params()
1248 |             i_grads = T.grad(i_obj, i_params)
1249 |             rval.update(OrderedDict(safe_zip(i_params, [self.now_train_inference * ig for ig in i_grads])))
1250 |         elif model.inferer is not None:
1251 |             rval.update(OrderedDict(zip(model.inferer.get_params(), zeros)))
1252 |
1253 |         updates = OrderedDict()
1254 |
1255 |         # Two d steps for every g step
1256 |         if self.alternate_g:
1257 |             updates[self.now_train_generator] = 1.
- self.now_train_generator 1258 | 1259 | return rval, updates 1260 | 1261 | def get_monitoring_channels(self, model, data, **kwargs): 1262 | 1263 | rval = OrderedDict() 1264 | 1265 | m = data.shape[0] 1266 | 1267 | g = model.generator 1268 | d = model.discriminator 1269 | 1270 | y_hat = d.fprop(data) 1271 | 1272 | rval['false_negatives'] = T.cast((y_hat < 0.5).mean(), 'float32') 1273 | 1274 | samples = g.sample(m) 1275 | y_hat = d.fprop(samples) 1276 | rval['false_positives'] = T.cast((y_hat > 0.5).mean(), 'float32') 1277 | # y = T.alloc(0., m, 1) 1278 | cost = d.cost_from_X((samples, y_hat)) 1279 | sample_grad = T.grad(-cost, samples) 1280 | rval['sample_grad_norm'] = T.sqrt(T.sqr(sample_grad).sum()) 1281 | _S, d_obj, g_obj, i_obj = self.get_samples_and_objectives(model, data) 1282 | if model.monitor_inference and i_obj != 0: 1283 | rval['objective_i'] = i_obj 1284 | if model.monitor_discriminator: 1285 | rval['objective_d'] = d_obj 1286 | if model.monitor_generator: 1287 | rval['objective_g'] = g_obj 1288 | 1289 | rval['now_train_generator'] = self.now_train_generator 1290 | return rval 1291 | --------------------------------------------------------------------------------
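
Note on usage: the cost classes above only produce per-update gradients; the alternating schedule (e.g. "two d steps for every g step") is driven either by passing alternate_g=True, which toggles now_train_generator from inside get_gradients, or by attaching a Cycler to the training algorithm so that it flips now_train_generator between minibatches. The following is a minimal, illustrative sketch of that wiring with the stock pylearn2 SGD class; it is not one of this repository's scripts. The import path "adversarial", the model (an AdversaryPair) and dataset objects, and the hyperparameter values are assumptions for illustration, and the repository's own sgd.py variant could be substituted for the stock SGD.

# Illustrative sketch (assumptions flagged above): wire a cost and a Cycler
# into pylearn2's SGD so the generator is only updated on every k-th minibatch.
from pylearn2.train import Train
from pylearn2.training_algorithms.sgd import SGD

import adversarial  # assumed import path for the module listed above

cost = adversarial.ThresholdedAdversaryCost()
# Cycler(k=2) sets now_train_generator to 1 on every second update, so the
# discriminator trains on every minibatch and the generator on every other one.
cycler = adversarial.Cycler(k=2)

algorithm = SGD(learning_rate=.01,           # placeholder value
                batch_size=100,              # placeholder value
                cost=cost,
                update_callbacks=[cycler])   # SGD calls each callback with the SGD instance after every update

Train(dataset=dataset, model=model, algorithm=algorithm).main_loop()

A similar 1:1 alternation can be had without any callback by constructing the cost with alternate_g=True, since its get_gradients then flips now_train_generator on every update.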