├── show_samples_tfd.py ├── .gitignore ├── show_inpaint_samples.py ├── show_samples_tfd_paper.py ├── show_samples_mnist_paper.py ├── show_samples_cifar_full_paper.py ├── show_samples_cifar_conv_paper.py ├── LICENSE ├── show_gen_weights.py ├── show_samples_inpaint.py ├── test_deconv.py ├── show_samples.py ├── README.md ├── ll_mnist.py ├── ll.py ├── mnist.yaml ├── cifar10_fully_connected.yaml ├── tfd_pretrain ├── train.yaml └── pretrain.yaml ├── parzen_ll.py ├── cifar10_convolutional.yaml ├── deconv.py ├── sgd.py ├── sgd_alt.py └── __init__.py /show_samples_tfd.py: -------------------------------------------------------------------------------- 1 | from pylearn2.utils import serial 2 | import sys 3 | _, model_path = sys.argv 4 | model = serial.load(model_path) 5 | from pylearn2.gui.patch_viewer import make_viewer 6 | space = model.generator.get_output_space() 7 | total_dimension = space.get_total_dimension() 8 | import numpy as np 9 | num_colors = 1 10 | #if total_dimension % 3 == 0: 11 | # num_colors = 3 12 | w = int(np.sqrt(total_dimension / num_colors)) 13 | from pylearn2.space import Conv2DSpace 14 | desired_space = Conv2DSpace(shape=[w, w], num_channels=num_colors, axes=('b',0,1,'c')) 15 | samples = space.format_as(batch=model.generator.sample(100), 16 | space=desired_space).eval() 17 | print (samples.min(), samples.mean(), samples.max()) 18 | viewer = make_viewer(samples * 2.0 - 1.0) 19 | viewer.show() 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | bin/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # Installer logs 26 | pip-log.txt 27 | pip-delete-this-directory.txt 28 | 29 | # Unit test / coverage reports 30 | htmlcov/ 31 | .tox/ 32 | .coverage 33 | .cache 34 | nosetests.xml 35 | coverage.xml 36 | 37 | # Translations 38 | *.mo 39 | 40 | # Mr Developer 41 | .mr.developer.cfg 42 | .project 43 | .pydevproject 44 | 45 | # Rope 46 | .ropeproject 47 | 48 | # Django stuff: 49 | *.log 50 | *.pot 51 | 52 | # Sphinx documentation 53 | docs/_build/ 54 | 55 | -------------------------------------------------------------------------------- /show_inpaint_samples.py: -------------------------------------------------------------------------------- 1 | from pylearn2.utils import serial 2 | import sys 3 | _, model_path = sys.argv 4 | model = serial.load(model_path) 5 | from pylearn2.gui.patch_viewer import make_viewer 6 | space = model.generator.get_output_space() 7 | from pylearn2.config import yaml_parse 8 | import numpy as np 9 | 10 | dataset = yaml_parse.load(model.dataset_yaml_src) 11 | dataset = dataset.get_test_set() 12 | 13 | grid_shape = None 14 | 15 | from pylearn2.utils import sharedX 16 | X = sharedX(dataset.get_batch_topo(100)) 17 | samples, ignore = model.generator.inpainting_sample_and_noise(X) 18 | samples = samples.eval() 19 | total_dimension = space.get_total_dimension() 20 | num_colors = 1 21 | if total_dimension % 3 == 0: 22 | num_colors = 3 23 | w = int(np.sqrt(total_dimension / num_colors)) 24 | from pylearn2.space import Conv2DSpace 25 | desired_space = Conv2DSpace(shape=[w, w], num_channels=num_colors, axes=('b',0,1,'c')) 26 | is_color = samples.shape[-1] == 3 27 
| print (samples.min(), samples.mean(), samples.max()) 28 | # Hack for detecting MNIST [0, 1] values. Otherwise we assume centered images 29 | if samples.min() >0: 30 | samples = samples * 2.0 - 1.0 31 | viewer = make_viewer(samples, grid_shape=grid_shape, is_color=is_color) 32 | viewer.show() 33 | -------------------------------------------------------------------------------- /show_samples_tfd_paper.py: -------------------------------------------------------------------------------- 1 | from pylearn2.utils import serial 2 | import sys 3 | _, model_path = sys.argv 4 | model = serial.load(model_path) 5 | from pylearn2.gui.patch_viewer import make_viewer 6 | space = model.generator.get_output_space() 7 | from pylearn2.config import yaml_parse 8 | from pylearn2.gui.patch_viewer import PatchViewer 9 | import numpy as np 10 | 11 | dataset = yaml_parse.load(model.dataset_yaml_src) 12 | 13 | grid_shape = None 14 | 15 | rows = 4 16 | sample_cols = 5 17 | 18 | # For some reason format_as from VectorSpace is not working right 19 | samples = model.generator.sample(rows * sample_cols).eval() 20 | topo_samples = dataset.get_topological_view(samples) 21 | 22 | pv = PatchViewer(grid_shape=(rows, sample_cols + 1), patch_shape=(48,48), 23 | is_color=False) 24 | 25 | X = dataset.X 26 | topo = dataset.get_topological_view() 27 | index = 0 28 | for i in xrange(samples.shape[0]): 29 | topo_sample = topo_samples[i, :, :, :] 30 | pv.add_patch(topo_sample * 2. - 1., rescale=False) 31 | 32 | if (i +1) % sample_cols == 0: 33 | sample = samples[i, :] 34 | dists = np.square(X - sample).sum(axis=1) 35 | j = np.argmin(dists) 36 | match = topo[j, :] 37 | pv.add_patch(match * 2 -1, rescale=False, activation=1) 38 | 39 | pv.show() 40 | -------------------------------------------------------------------------------- /show_samples_mnist_paper.py: -------------------------------------------------------------------------------- 1 | from pylearn2.utils import serial 2 | import sys 3 | _, model_path = sys.argv 4 | model = serial.load(model_path) 5 | from pylearn2.gui.patch_viewer import make_viewer 6 | space = model.generator.get_output_space() 7 | from pylearn2.config import yaml_parse 8 | from pylearn2.gui.patch_viewer import PatchViewer 9 | import numpy as np 10 | 11 | dataset = yaml_parse.load(model.dataset_yaml_src) 12 | 13 | grid_shape = None 14 | 15 | rows = 4 16 | sample_cols = 5 17 | 18 | # For some reason format_as from VectorSpace is not working right 19 | samples = model.generator.sample(rows * sample_cols).eval() 20 | topo_samples = dataset.get_topological_view(samples) 21 | 22 | pv = PatchViewer(grid_shape=(rows, sample_cols + 1), patch_shape=(28,28), 23 | is_color=False) 24 | 25 | X = dataset.X 26 | topo = dataset.get_topological_view() 27 | index = 0 28 | for i in xrange(samples.shape[0]): 29 | topo_sample = topo_samples[i, :, :, :] 30 | pv.add_patch(topo_sample * 2. 
- 1., rescale=False) 31 | 32 | if (i +1) % sample_cols == 0: 33 | sample = samples[i, :] 34 | dists = np.square(X - sample).sum(axis=1) 35 | j = np.argmin(dists) 36 | match = topo[j, :] 37 | pv.add_patch(match * 2 -1, rescale=False, activation=1) 38 | 39 | pv.show() 40 | -------------------------------------------------------------------------------- /show_samples_cifar_full_paper.py: -------------------------------------------------------------------------------- 1 | from pylearn2.utils import serial 2 | import sys 3 | _, model_path = sys.argv 4 | model = serial.load(model_path) 5 | space = model.generator.get_output_space() 6 | from pylearn2.config import yaml_parse 7 | from pylearn2.gui.patch_viewer import PatchViewer 8 | import numpy as np 9 | 10 | dataset = yaml_parse.load(model.dataset_yaml_src) 11 | 12 | grid_shape = None 13 | 14 | rows = 4 15 | sample_cols = 5 16 | 17 | # For some reason format_as from VectorSpace is not working right 18 | samples = model.generator.sample(rows * sample_cols).eval() 19 | topo_samples = dataset.get_topological_view(samples) 20 | 21 | pv = PatchViewer(grid_shape=(rows, sample_cols + 1), patch_shape=(32,32), 22 | is_color=True) 23 | scale = np.abs(samples).max() 24 | 25 | X = dataset.X 26 | topo = dataset.get_topological_view() 27 | index = 0 28 | for i in xrange(samples.shape[0]): 29 | topo_sample = topo_samples[i, :, :, :] 30 | print topo_sample.min(), topo_sample.max() 31 | pv.add_patch(topo_sample / scale, rescale=False) 32 | 33 | if (i +1) % sample_cols == 0: 34 | sample = samples[i, :] 35 | dists = np.square(X - sample).sum(axis=1) 36 | j = np.argmin(dists) 37 | match = topo[j, :] 38 | print match.min(), match.max() 39 | pv.add_patch(match / scale, rescale=False, activation=1) 40 | 41 | pv.show() 42 | -------------------------------------------------------------------------------- /show_samples_cifar_conv_paper.py: -------------------------------------------------------------------------------- 1 | from pylearn2.utils import serial 2 | import sys 3 | _, model_path = sys.argv 4 | model = serial.load(model_path) 5 | space = model.generator.get_output_space() 6 | from pylearn2.config import yaml_parse 7 | from pylearn2.gui.patch_viewer import PatchViewer 8 | import numpy as np 9 | 10 | dataset = yaml_parse.load(model.dataset_yaml_src) 11 | 12 | grid_shape = None 13 | 14 | rows = 4 15 | sample_cols = 5 16 | 17 | # For some reason format_as from VectorSpace is not working right 18 | topo_samples = model.generator.sample(rows * sample_cols).eval() 19 | samples = dataset.get_design_matrix(topo_samples) 20 | dataset.axes = ['b', 0, 1, 'c'] 21 | dataset.view_converter.axes = ['b', 0, 1, 'c'] 22 | topo_samples = dataset.get_topological_view(samples) 23 | 24 | pv = PatchViewer(grid_shape=(rows, sample_cols + 1), patch_shape=(32,32), 25 | is_color=True) 26 | scale = np.abs(samples).max() 27 | 28 | X = dataset.X 29 | topo = dataset.get_topological_view() 30 | index = 0 31 | for i in xrange(samples.shape[0]): 32 | topo_sample = topo_samples[i, :, :, :] 33 | print topo_sample.min(), topo_sample.max() 34 | pv.add_patch(topo_sample / scale, rescale=False) 35 | 36 | if (i +1) % sample_cols == 0: 37 | sample = samples[i, :] 38 | dists = np.square(X - sample).sum(axis=1) 39 | j = np.argmin(dists) 40 | match = topo[j, :] 41 | print match.min(), match.max() 42 | pv.add_patch(match / scale, rescale=False, activation=1) 43 | 44 | pv.show() 45 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Ian Goodfellow 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of the {organization} nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /show_gen_weights.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pylearn2.gui.patch_viewer import make_viewer 3 | from pylearn2.utils import serial 4 | model = serial.load(sys.argv[1]) 5 | generator = model.generator 6 | 7 | final = generator.mlp.layers[-1] 8 | success = False 9 | 10 | i = -1 11 | success = False 12 | to_search = generator.mlp 13 | while not success: 14 | print "while loop ", i 15 | final = to_search.layers[i] 16 | if 'Composite' in str(type(final)): 17 | i = input("which") 18 | elem = final.layers[i] 19 | if hasattr(elem, 'layers'): 20 | print "stepping into inner MLP" 21 | i = -1 22 | to_search = elem 23 | continue 24 | else: 25 | print "examining this element" 26 | final = elem 27 | 28 | try: 29 | print "Trying get_weights topo" 30 | topo = final.get_weights_topo() 31 | print "It worked" 32 | success = True 33 | except Exception: 34 | pass 35 | 36 | if success: 37 | print "Making the viewer and showing" 38 | make_viewer(topo).show() 39 | quit() 40 | 41 | try: 42 | print "Trying get_weights" 43 | weights = final.get_weights() 44 | print "It worked" 45 | success = True 46 | except NotImplementedError: 47 | i -= 1 # skip over SpaceConverter, etc. 
48 | print "Out of the while loop" 49 | 50 | 51 | print "weights shape ", weights.shape 52 | viewer = make_viewer(weights, is_color=weights.shape[1] % 3 == 0 and weights.shape[1] != 48*48) 53 | print "image shape ", viewer.image.shape 54 | 55 | print "made viewer" 56 | 57 | viewer.show() 58 | 59 | print "executed show" 60 | -------------------------------------------------------------------------------- /show_samples_inpaint.py: -------------------------------------------------------------------------------- 1 | import theano 2 | from pylearn2.utils import serial 3 | import sys 4 | from pylearn2.gui.patch_viewer import make_viewer 5 | from pylearn2.space import VectorSpace 6 | from pylearn2.config import yaml_parse 7 | import numpy as np 8 | import ipdb 9 | 10 | 11 | # TODO, only works for CIFAR10 for now 12 | 13 | grid_shape = None 14 | repeat_samples = 1 15 | num_samples = 5 16 | 17 | 18 | _, model_path = sys.argv 19 | model = serial.load(model_path) 20 | rng = np.random.RandomState(20232) 21 | 22 | def get_data_samples(dataset, n = num_samples): 23 | unique_y = np.unique(dataset.y) 24 | rval = [] 25 | for y in np.unique(dataset.y): 26 | ind = np.where(dataset.y == y)[0] 27 | ind = ind[rng.randint(0, len(ind), n)] 28 | rval.append(dataset.get_topological_view()[ind]) 29 | 30 | return np.concatenate(rval) 31 | 32 | dataset = yaml_parse.load(model.dataset_yaml_src) 33 | dataset = dataset.get_test_set() 34 | data = get_data_samples(dataset) 35 | 36 | output_space = model.generator.get_output_space() 37 | input_space = model.generator.mlp.input_space 38 | 39 | X = input_space.get_theano_batch() 40 | samples, _ = model.generator.inpainting_sample_and_noise(X) 41 | f = theano.function([X], samples) 42 | 43 | samples = [] 44 | for i in xrange(repeat_samples): 45 | samples.append(f(data)) 46 | 47 | samples = np.concatenate(samples) 48 | 49 | is_color = True 50 | 51 | 52 | print (samples.min(), samples.mean(), samples.max()) 53 | # Hack for detecting MNIST [0, 1] values. Otherwise we assume centered images 54 | if samples.min() >0: 55 | samples = samples * 2.0 - 1.0 56 | viewer = make_viewer(samples, grid_shape=grid_shape, is_color=is_color) 57 | viewer.show() 58 | -------------------------------------------------------------------------------- /test_deconv.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script visually test the deconv layer. 3 | Construct an MLP with conv ,and deconv layer, 4 | set their W to same values and show the original 5 | input and the output of the mlp side by side. 6 | They are supposed to look same. 
7 | """ 8 | 9 | 10 | import theano 11 | from adversarial.deconv import Deconv 12 | from pylearn2.datasets.mnist import MNIST 13 | from pylearn2.space import Conv2DSpace 14 | from pylearn2.models.mlp import MLP 15 | from pylearn2.models.maxout import MaxoutConvC01B 16 | from pylearn2.gui import patch_viewer 17 | import ipdb 18 | 19 | 20 | input_space = Conv2DSpace(shape = (28, 28), num_channels=1, axes = ('c', 0, 1, 'b')) 21 | conv = MaxoutConvC01B(layer_name = 'conv', 22 | num_channels = 16, 23 | num_pieces = 1, 24 | kernel_shape = (4, 4), 25 | pool_shape = (1, 1), 26 | pool_stride=(1, 1), 27 | irange = 0.05) 28 | deconv = Deconv(layer_name = 'deconv', 29 | num_channels = 1, 30 | kernel_shape = (4, 4), 31 | irange = 0.05) 32 | 33 | mlp = MLP(input_space =input_space, 34 | layers = [conv, deconv]) 35 | 36 | mlp.layers[1].transformer._filters.set_value(mlp.layers[0].transformer._filters.get_value()) 37 | 38 | x = input_space.get_theano_batch() 39 | out = mlp.fprop(x) 40 | f = theano.function([x], out) 41 | 42 | data = MNIST('test') 43 | data_specs = (input_space, 'features') 44 | iter = data.iterator(mode = 'sequential', batch_size = 2, data_specs = data_specs) 45 | pv = patch_viewer.PatchViewer((10, 10), (28, 28)) 46 | for item in iter: 47 | res = f(item) 48 | pv.add_patch(item[0,:,:,0]) 49 | pv.add_patch(res[0,:,:,0]) 50 | pv.show() 51 | break 52 | 53 | -------------------------------------------------------------------------------- /show_samples.py: -------------------------------------------------------------------------------- 1 | from pylearn2.utils import serial 2 | import sys 3 | _, model_path = sys.argv 4 | model = serial.load(model_path) 5 | from pylearn2.gui.patch_viewer import make_viewer 6 | space = model.generator.get_output_space() 7 | from pylearn2.space import VectorSpace 8 | from pylearn2.config import yaml_parse 9 | import numpy as np 10 | 11 | match_train = True 12 | if match_train: 13 | dataset = yaml_parse.load(model.dataset_yaml_src) 14 | 15 | grid_shape = None 16 | 17 | if isinstance(space, VectorSpace): 18 | # For some reason format_as from VectorSpace is not working right 19 | samples = model.generator.sample(100).eval() 20 | 21 | if match_train: 22 | grid_shape = (10, 20) 23 | matched = np.zeros((samples.shape[0] * 2, samples.shape[1])) 24 | X = dataset.X 25 | for i in xrange(samples.shape[0]): 26 | matched[2 * i, :] = samples[i, :].copy() 27 | dists = np.square(X - samples[i, :]).sum(axis=1) 28 | j = np.argmin(dists) 29 | matched[2 * i + 1, :] = X[j, :] 30 | samples = matched 31 | 32 | is_color = samples.shape[-1] % 3 == 0 and samples.shape[-1] != 48 * 48 33 | else: 34 | total_dimension = space.get_total_dimension() 35 | import numpy as np 36 | num_colors = 1 37 | if total_dimension % 3 == 0: 38 | num_colors = 3 39 | w = int(np.sqrt(total_dimension / num_colors)) 40 | from pylearn2.space import Conv2DSpace 41 | desired_space = Conv2DSpace(shape=[w, w], num_channels=num_colors, axes=('b',0,1,'c')) 42 | samples = space.format_as(batch=model.generator.sample(100), 43 | space=desired_space).eval() 44 | is_color = samples.shape[-1] == 3 45 | print (samples.min(), samples.mean(), samples.max()) 46 | # Hack for detecting MNIST [0, 1] values. 
Otherwise we assume centered images
47 | if samples.min() >0:
48 |     samples = samples * 2.0 - 1.0
49 | viewer = make_viewer(samples, grid_shape=grid_shape, is_color=is_color)
50 | viewer.show()
51 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Generative Adversarial Networks
2 | ===============================
3 | 
4 | This repository contains the code and hyperparameters for the paper:
5 | 
6 | "Generative Adversarial Networks." Ian J. Goodfellow, Jean Pouget-Abadie,
7 | Mehdi Mirza, Bing Xu, David Warde-Farley, Sherjil Ozair, Aaron Courville,
8 | Yoshua Bengio. ArXiv 2014.
9 | 
10 | Please cite this paper if you use the code in this repository as part of
11 | a published research project.
12 | 
13 | We are an academic lab, not a software company, and have no personnel
14 | devoted to documenting and maintaining this research code.
15 | Therefore this code is offered with absolutely no support.
16 | Exact reproduction of the numbers in the paper depends on exact
17 | reproduction of many factors,
18 | including the version of all software dependencies and the choice of
19 | underlying hardware (GPU model, etc.). We used NVIDIA GeForce GTX 580
20 | graphics cards; other hardware will use different tree structures for
21 | summation and incur different rounding error. If you do not reproduce our
22 | setup exactly you should expect to need to re-tune your hyperparameters
23 | slightly for your new setup.
24 | 
25 | Moreover, we have not integrated any unit tests for this code into Theano
26 | or Pylearn2, so subsequent changes to those libraries may break the code
27 | in this repository. If you encounter problems with this code, you should
28 | make sure that you are using the development branch of Pylearn2 and Theano,
29 | and use "git checkout" to go to a commit from approximately June 9, 2014.
30 | 
31 | This code itself requires no installation besides making sure that the
32 | "adversarial" directory is in a directory in your PYTHONPATH. If
33 | installed correctly, 'python -c "import adversarial"' will work. You
34 | must also install Pylearn2 and Pylearn2's dependencies (Theano, numpy,
35 | etc.)
36 | 
37 | parzen_ll.py is the script used to estimate the log likelihood of the
38 | model using the Parzen density technique.
39 | 
40 | Call pylearn2/scripts/train.py on the various yaml files in this repository
41 | to train the model for each dataset reported in the paper. The names of
42 | the *.yaml files are fairly self-explanatory.
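All of the show_samples*.py scripts in this repository follow the same basic
pattern: load the saved model pickle and draw samples from the generator. As a
rough sketch (assuming training with mnist.yaml has produced a pickle such as
mnist.pkl; the actual filename comes from PYLEARN2_TRAIN_FILE_FULL_STEM):

    from pylearn2.utils import serial
    from pylearn2.gui.patch_viewer import make_viewer

    # Load the trained AdversaryPair saved by pylearn2/scripts/train.py
    model = serial.load("mnist.pkl")  # hypothetical path to your saved model
    # Draw 100 samples from the generator as a design matrix (one image per row)
    samples = model.generator.sample(100).eval()
    # MNIST samples lie in [0, 1]; rescale to [-1, 1] before viewing
    make_viewer(samples * 2.0 - 1.0).show()

The dataset-specific scripts (show_samples_mnist_paper.py, the CIFAR-10 scripts,
etc.) build on this by reshaping the samples into image patches and displaying
the nearest training example alongside each group of samples.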
43 | -------------------------------------------------------------------------------- /ll_mnist.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | 4 | from theano import function 5 | from theano import tensor as T 6 | 7 | _, model_path, sigma = sys.argv 8 | from pylearn2.utils import serial 9 | model = serial.load(model_path) 10 | from pylearn2.config import yaml_parse 11 | dataset = yaml_parse.load(model.dataset_yaml_src) 12 | dataset = dataset.get_test_set() 13 | from pylearn2.utils import sharedX 14 | g = model.generator 15 | n = g.get_input_space().get_total_dimension() 16 | X = sharedX(dataset.X) 17 | from theano.sandbox.rng_mrg import MRG_RandomStreams 18 | theano_rng = MRG_RandomStreams(2014 + 6 * 24) 19 | assert False # Aaron says to do valid comparison we need to download the standard binarization, 20 | # and the model should also have been trained on the standard binarization 21 | f = function([], updates=[(X, theano_rng.binomial(p=X, size=X.shape, dtype=X.dtype))]) 22 | f() 23 | m = dataset.X.shape[0] 24 | accumulator = sharedX(np.zeros((m,))) 25 | z_samples = g.get_noise(1) 26 | x_samples = g.mlp.fprop(z_samples) 27 | # x_samples = X 28 | from theano.compat import OrderedDict 29 | updates = OrderedDict() 30 | from theano import shared 31 | num_samples = shared(1) 32 | sigma = sharedX(float(sigma)) 33 | prev = accumulator 34 | from theano.printing import Print 35 | #prev = Print('prev',attrs=['min','max'])(prev) 36 | # E_x log E_z exp(- sum_i softplus( (1 - 2 x_i) A(z)_i) ) 37 | from pylearn2.expr.nnet import arg_of_sigmoid 38 | A = arg_of_sigmoid(x_samples) 39 | cur = - T.nnet.softplus((1. - 2. * X) * A).sum(axis=1) 40 | #cur = Print('cur',attrs=['min','max'])(cur) 41 | ofs = T.maximum(prev, cur) 42 | num_samples_f = T.cast(num_samples, 'float32') 43 | updates[accumulator] = ofs + T.log((num_samples_f * T.exp(prev - ofs) + T.exp(cur - ofs)) / (num_samples_f + 1.)) 44 | updates[num_samples] = num_samples + 1 45 | f = function([], updates=updates) 46 | updates[accumulator] = cur 47 | del updates[num_samples] 48 | first = function([], updates=updates) 49 | avg_ll = accumulator.mean() 50 | 51 | import time 52 | prev_t = time.time() 53 | first() 54 | while True: 55 | v = avg_ll.eval() 56 | i = num_samples.get_value() 57 | if i == 1 or i % 1000 == 0: 58 | now_t = time.time() 59 | print i, v, now_t - prev_t 60 | prev_t = now_t 61 | if np.isnan(v) or np.isinf(v): 62 | break 63 | f() 64 | 65 | # E_x log p(x) 66 | # E_x log int p(x, z) dz 67 | # E_x log int p(z) p(x | z) dz 68 | # E_x log E_z p(x | z) 69 | # E_x log E_z prod_i p(x_i | z) 70 | # E_x log E_z prod_i sigmoid( (2 x_i - 1) A(z)_i) 71 | # E_x log E_z exp(log prod_i sigmoid( (2 x_i - 1) A(z)_i) ) 72 | # E_x log E_z exp(sum_i log sigmoid( (2 x_i - 1) A(z)_i) ) 73 | # E_x log E_z exp(- sum_i softplus( (1 - 2 x_i) A(z)_i) ) 74 | -------------------------------------------------------------------------------- /ll.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | 4 | from theano import function 5 | from theano import tensor as T 6 | 7 | _, model_path, sigma = sys.argv 8 | from pylearn2.utils import serial 9 | model = serial.load(model_path) 10 | from pylearn2.config import yaml_parse 11 | dataset = yaml_parse.load(model.dataset_yaml_src) 12 | dataset = dataset.get_test_set() 13 | from pylearn2.utils import sharedX 14 | g = model.generator 15 | n = 
g.get_input_space().get_total_dimension() 16 | X = sharedX(dataset.X) 17 | m = dataset.X.shape[0] 18 | accumulator = sharedX(np.zeros((m,))) 19 | z_samples = g.get_noise(1) 20 | x_samples = g.mlp.fprop(z_samples) 21 | from theano.compat import OrderedDict 22 | updates = OrderedDict() 23 | from theano import shared 24 | num_samples = shared(1) 25 | sigma = sharedX(float(sigma)) 26 | prev = accumulator 27 | cur = -0.5 * T.sqr(X - x_samples).sum(axis=1) / T.sqr(sigma) 28 | ofs = T.maximum(prev, cur) 29 | num_samples_f = T.cast(num_samples, 'float32') 30 | updates[accumulator] = ofs + T.log(num_samples_f * T.exp(prev - ofs) + T.exp(cur - ofs)) - T.log(num_samples_f + 1.) 31 | updates[num_samples] = num_samples + 1 32 | f = function([], updates=updates) 33 | updates[accumulator] = cur 34 | del updates[num_samples] 35 | first = function([], updates=updates) 36 | avg_ll = accumulator.mean() - 0.5 * X.shape[1] * T.log(2 * np.pi * T.sqr(sigma)) 37 | 38 | import time 39 | prev_t = time.time() 40 | first() 41 | while True: 42 | v = avg_ll.eval() 43 | i = num_samples.get_value() 44 | if i == 1 or i % 1000 == 0: 45 | now_t = time.time() 46 | print i, v, now_t - prev_t 47 | prev_t = now_t 48 | if np.isnan(v) or np.isinf(v): 49 | break 50 | f() 51 | 52 | # log p(x) 53 | # = log int p(z, x) dz 54 | # = log int p(z) p(x |z) dz 55 | # = log E_z p(x|z) 56 | # = log (1/m) sum_z p(x|z) 57 | # = log (1/m) sum_z prod_i sqrt(1/(2 pi sigma^2)) exp( -0.5 (x_i-g(z)_i)^2 / sigma^2) 58 | # = log sqrt(1/(2 pi sigma^2))^d (1/m) sum_z prod_iexp( -0.5 (x_i-g(z)_i)^2 / sigma^2) 59 | # = log sqrt(1/(2 pi sigma^2))^d (1/m) sum_z exp( sum_i -0.5 (x_i-g(z)_i)^2 / sigma^2) 60 | # = log sqrt(1/(2 pi sigma^2))^d + log (1/m) sum_z exp( sum_i -0.5 (x_i-g(z)_i)^2 / sigma^2) 61 | # = 0.5 d log 1/(2 pi sigma^2) + log (1/m) sum_z exp( sum_i -0.5 (x_i-g(z)_i)^2 / sigma^2) 62 | # = -0.5 d log (2 pi sigma^2) + log (1/m) sum_z exp( sum_i -0.5 (x_i-g(z)_i)^2 / sigma^2) 63 | 64 | # log (1/m) sum_j exp(v_j) 65 | # = log (1/m) [exp(v_m) + sum_{j=1}^{m-1} exp(v_j)] 66 | # = log (1/m) [exp(v_m) + (m-1) exp( prev )] 67 | # = log (1/m) [exp(v_m) exp(ofs-ofs) + (m-1) exp( prev ) exp(ofs -ofs)] 68 | # = log (1/m) [exp(v_m- ofs) exp(ofs) + (m-1) exp( prev -ofs) exp(ofs)] 69 | # = log exp(ofs) (1/m) [exp(v_m- ofs) + (m-1) exp( prev -ofs) ] 70 | # = ofs + log (1/m) [exp(v_m- ofs) + (m-1) exp( prev -ofs) ] 71 | # = ofs + log [exp(v_m- ofs) + (m-1) exp( prev -ofs) ] - log m 72 | -------------------------------------------------------------------------------- /mnist.yaml: -------------------------------------------------------------------------------- 1 | !obj:pylearn2.train.Train { 2 | dataset: &train !obj:pylearn2.datasets.mnist.MNIST { 3 | which_set: 'train', 4 | start: 0, 5 | stop: 50000 6 | }, 7 | model: !obj:adversarial.AdversaryPair { 8 | generator: !obj:adversarial.Generator { 9 | noise: 'uniform', 10 | monitor_ll: 1, 11 | mlp: !obj:pylearn2.models.mlp.MLP { 12 | layers: [ 13 | !obj:pylearn2.models.mlp.RectifiedLinear { 14 | layer_name: 'h0', 15 | dim: 1200, 16 | irange: .05, 17 | }, 18 | !obj:pylearn2.models.mlp.RectifiedLinear { 19 | layer_name: 'h1', 20 | dim: 1200, 21 | irange: .05, 22 | }, 23 | !obj:pylearn2.models.mlp.Sigmoid { 24 | init_bias: !obj:pylearn2.models.dbm.init_sigmoid_bias_from_marginals { dataset: *train}, 25 | layer_name: 'y', 26 | irange: .05, 27 | dim: 784 28 | } 29 | ], 30 | nvis: 100, 31 | }}, 32 | discriminator: 33 | !obj:pylearn2.models.mlp.MLP { 34 | layers: [ 35 | !obj:pylearn2.models.maxout.Maxout { 36 | layer_name: 
'h0', 37 | num_units: 240, 38 | num_pieces: 5, 39 | irange: .005, 40 | }, 41 | !obj:pylearn2.models.maxout.Maxout { 42 | layer_name: 'h1', 43 | num_units: 240, 44 | num_pieces: 5, 45 | irange: .005, 46 | }, 47 | !obj:pylearn2.models.mlp.Sigmoid { 48 | layer_name: 'y', 49 | dim: 1, 50 | irange: .005 51 | } 52 | ], 53 | nvis: 784, 54 | }, 55 | }, 56 | algorithm: !obj:pylearn2.training_algorithms.sgd.SGD { 57 | batch_size: 100, 58 | learning_rate: .1, 59 | learning_rule: !obj:pylearn2.training_algorithms.learning_rule.Momentum { 60 | init_momentum: .5, 61 | }, 62 | monitoring_dataset: 63 | { 64 | 'valid' : !obj:pylearn2.datasets.mnist.MNIST { 65 | which_set: 'train', 66 | start: 50000, 67 | stop: 60000 68 | }, 69 | }, 70 | cost: !obj:adversarial.AdversaryCost2 { 71 | scale_grads: 0, 72 | #target_scale: 1., 73 | discriminator_default_input_include_prob: .5, 74 | discriminator_input_include_probs: { 75 | 'h0': .8 76 | }, 77 | discriminator_default_input_scale: 2., 78 | discriminator_input_scales: { 79 | 'h0': 1.25 80 | } 81 | }, 82 | #!obj:pylearn2.costs.mlp.dropout.Dropout { 83 | # input_include_probs: { 'h0' : .8 }, 84 | # input_scales: { 'h0': 1. } 85 | #}, 86 | #termination_criterion: !obj:pylearn2.termination_criteria.MonitorBased { 87 | # channel_name: "valid_y_misclass", 88 | # prop_decrease: 0., 89 | # N: 100 90 | #}, 91 | update_callbacks: !obj:pylearn2.training_algorithms.sgd.ExponentialDecay { 92 | decay_factor: 1.000004, 93 | min_lr: .000001 94 | } 95 | }, 96 | extensions: [ 97 | #!obj:pylearn2.train_extensions.best_params.MonitorBasedSaveBest { 98 | # channel_name: 'valid_y_misclass', 99 | # save_path: "${PYLEARN2_TRAIN_FILE_FULL_STEM}_best.pkl" 100 | #}, 101 | !obj:pylearn2.training_algorithms.learning_rule.MomentumAdjustor { 102 | start: 1, 103 | saturate: 250, 104 | final_momentum: .7 105 | } 106 | ], 107 | save_path: "${PYLEARN2_TRAIN_FILE_FULL_STEM}.pkl", 108 | save_freq: 1 109 | } 110 | -------------------------------------------------------------------------------- /cifar10_fully_connected.yaml: -------------------------------------------------------------------------------- 1 | !obj:pylearn2.train.Train { 2 | dataset: &train !obj:pylearn2.datasets.cifar10.CIFAR10 { 3 | gcn: 55., 4 | which_set: 'train', 5 | start: 0, 6 | stop: 40000 7 | }, 8 | model: !obj:adversarial.AdversaryPair { 9 | generator: !obj:adversarial.Generator { 10 | mlp: !obj:pylearn2.models.mlp.MLP { 11 | layers: [ 12 | !obj:pylearn2.models.mlp.RectifiedLinear { 13 | layer_name: 'gh0', 14 | dim: 8000, 15 | irange: .05, 16 | }, 17 | !obj:pylearn2.models.mlp.Sigmoid { 18 | layer_name: 'h1', 19 | dim: 8000, 20 | irange: .05, 21 | }, 22 | !obj:pylearn2.models.mlp.Linear { 23 | # init_bias: !obj:pylearn2.models.dbm.init_sigmoid_bias_from_marginals { dataset: *train}, 24 | layer_name: 'y', 25 | irange: .5, 26 | dim: 3072 27 | } 28 | ], 29 | nvis: 100, 30 | }}, 31 | discriminator: 32 | !obj:pylearn2.models.mlp.MLP { 33 | layers: [ 34 | !obj:pylearn2.models.maxout.Maxout { 35 | layer_name: 'dh0', 36 | num_units: 1600, 37 | num_pieces: 5, 38 | irange: .005, 39 | }, 40 | !obj:pylearn2.models.maxout.Maxout { 41 | layer_name: 'h1', 42 | num_units: 1600, 43 | num_pieces: 5, 44 | irange: .005, 45 | }, 46 | !obj:pylearn2.models.mlp.Sigmoid { 47 | layer_name: 'y', 48 | dim: 1, 49 | irange: .005 50 | } 51 | ], 52 | nvis: 3072, 53 | }, 54 | }, 55 | algorithm: !obj:pylearn2.training_algorithms.sgd.SGD { 56 | batch_size: 100, 57 | learning_rate: .025, 58 | learning_rule: !obj:pylearn2.training_algorithms.learning_rule.Momentum { 
59 | init_momentum: .5, 60 | }, 61 | monitoring_dataset: 62 | { 63 | #'train' : *train, 64 | 'valid' : !obj:pylearn2.datasets.cifar10.CIFAR10 { 65 | gcn: 55., 66 | which_set: 'train', 67 | start: 40000, 68 | stop: 50000 69 | }, 70 | #'test' : !obj:pylearn2.datasets.cifar10.CIFAR10 { 71 | # which_set: 'test', 72 | # gcn: 55., 73 | # } 74 | }, 75 | cost: !obj:adversarial.AdversaryCost2 { 76 | scale_grads: 0, 77 | #target_scale: .1, 78 | discriminator_default_input_include_prob: .5, 79 | discriminator_input_include_probs: { 80 | 'dh0': .8 81 | }, 82 | discriminator_default_input_scale: 2., 83 | discriminator_input_scales: { 84 | 'dh0': 1.25 85 | } 86 | }, 87 | #!obj:pylearn2.costs.mlp.dropout.Dropout { 88 | # input_include_probs: { 'h0' : .8 }, 89 | # input_scales: { 'h0': 1. } 90 | #}, 91 | #termination_criterion: !obj:pylearn2.termination_criteria.MonitorBased { 92 | # channel_name: "valid_y_misclass", 93 | # prop_decrease: 0., 94 | # N: 100 95 | #}, 96 | update_callbacks: !obj:pylearn2.training_algorithms.sgd.ExponentialDecay { 97 | decay_factor: 1.000004, 98 | min_lr: .000001 99 | } 100 | }, 101 | extensions: [ 102 | #!obj:pylearn2.train_extensions.best_params.MonitorBasedSaveBest { 103 | # channel_name: 'valid_y_misclass', 104 | # save_path: "${PYLEARN2_TRAIN_FILE_FULL_STEM}_best.pkl" 105 | #}, 106 | !obj:pylearn2.training_algorithms.learning_rule.MomentumAdjustor { 107 | start: 1, 108 | saturate: 250, 109 | final_momentum: .7 110 | } 111 | ], 112 | save_path: "${PYLEARN2_TRAIN_FILE_FULL_STEM}.pkl", 113 | save_freq: 1 114 | } 115 | -------------------------------------------------------------------------------- /tfd_pretrain/train.yaml: -------------------------------------------------------------------------------- 1 | !obj:pylearn2.train.Train { 2 | dataset: &train !obj:pylearn2.datasets.tfd.TFD { 3 | which_set: 'unlabeled', 4 | scale: True, 5 | }, 6 | model: !obj:adversarial.AdversaryPair { 7 | generator: !obj:adversarial.Generator { 8 | monitor_ll: 1, 9 | mlp: !obj:adversarial.add_layers { 10 | mlp: !obj:pylearn2.models.mlp.MLP { 11 | layers: [ 12 | !obj:pylearn2.models.mlp.RectifiedLinear { 13 | layer_name: 'h0', 14 | dim: 8000, 15 | irange: .05, 16 | max_col_norm: 1.9365, 17 | }, 18 | !obj:pylearn2.models.mlp.Sigmoid { 19 | layer_name: 'h1', 20 | dim: 100, 21 | irange: .05, 22 | max_col_norm: 1.9365, 23 | init_bias: -2.0, 24 | }, 25 | ], 26 | nvis: 100, 27 | }, 28 | pretrained: "./pretrain.pkl", 29 | } 30 | }, 31 | discriminator: 32 | !obj:pylearn2.models.mlp.MLP { 33 | layers: [ 34 | !obj:pylearn2.models.maxout.Maxout { 35 | #W_lr_scale: .1, 36 | #b_lr_scale: .1, 37 | layer_name: 'h0', 38 | num_units: 1200, 39 | num_pieces: 5, 40 | irange: .005, 41 | max_col_norm: 1.9365, 42 | }, 43 | !obj:pylearn2.models.maxout.Maxout { 44 | #W_lr_scale: .1, 45 | #b_lr_scale: .1, 46 | layer_name: 'h1', 47 | num_units: 1200, 48 | num_pieces: 5, 49 | irange: .005, 50 | max_col_norm: 1.9365, 51 | }, 52 | !obj:pylearn2.models.mlp.Sigmoid { 53 | #W_lr_scale: .1, 54 | #b_lr_scale: .1, 55 | max_col_norm: 1.9365, 56 | layer_name: 'y', 57 | dim: 1, 58 | irange: .005 59 | } 60 | ], 61 | nvis: 2304, 62 | }, 63 | }, 64 | algorithm: !obj:pylearn2.training_algorithms.sgd.SGD { 65 | batch_size: 100, 66 | learning_rate: .05, 67 | learning_rule: !obj:pylearn2.training_algorithms.learning_rule.Momentum { 68 | init_momentum: .5, 69 | }, 70 | monitoring_dataset: 71 | { 72 | # 'train' : *train, 73 | 'valid' : !obj:pylearn2.datasets.tfd.TFD { 74 | which_set: 'valid', 75 | scale: True, 76 | }, 77 | # 'test' : 
!obj:pylearn2.datasets.tfd.TFD { 78 | # which_set: 'test', 79 | # scale: True, 80 | # } 81 | }, 82 | cost: !obj:adversarial.AdversaryCost2 { 83 | scale_grads: 0, 84 | #target_scale: 1., 85 | discriminator_default_input_include_prob: .5, 86 | discriminator_input_include_probs: { 87 | 'h0': .8 88 | }, 89 | discriminator_default_input_scale: 2., 90 | discriminator_input_scales: { 91 | 'h0': 1.25 92 | } 93 | }, 94 | #!obj:pylearn2.costs.mlp.dropout.Dropout { 95 | # input_include_probs: { 'h0' : .8 }, 96 | # input_scales: { 'h0': 1. } 97 | #}, 98 | termination_criterion: !obj:pylearn2.termination_criteria.EpochCounter { 99 | max_epochs: 22 100 | }, 101 | update_callbacks: !obj:pylearn2.training_algorithms.sgd.ExponentialDecay { 102 | decay_factor: 1.000004, 103 | min_lr: .000001 104 | } 105 | }, 106 | extensions: [ 107 | !obj:pylearn2.training_algorithms.learning_rule.MomentumAdjustor { 108 | start: 1, 109 | saturate: 250, 110 | final_momentum: .7 111 | }, 112 | #!obj:pylearn2.train_extensions.best_params.MonitorBasedSaveBest { 113 | # channel_name: 'valid_gen_ll', 114 | # name_base: 'save/train', 115 | # store_best_model: True 116 | #} 117 | 118 | ], 119 | save_path: "${PYLEARN2_TRAIN_FILE_FULL_STEM}.pkl", 120 | save_freq: 1 121 | } 122 | -------------------------------------------------------------------------------- /tfd_pretrain/pretrain.yaml: -------------------------------------------------------------------------------- 1 | !obj:pylearn2.train.Train { 2 | dataset: &train !obj:pylearn2.datasets.tfd.TFD { 3 | which_set: 'unlabeled', 4 | scale: True, 5 | }, 6 | model: !obj:adversarial.AdversaryPair { 7 | generator: !obj:adversarial.Generator { 8 | monitor_ll: 1, 9 | mlp: !obj:pylearn2.models.mlp.MLP { 10 | layers: [ 11 | !obj:pylearn2.models.mlp.RectifiedLinear { 12 | layer_name: 'h0', 13 | dim: 8000, 14 | irange: .05, 15 | max_col_norm: 1.9365, 16 | }, 17 | !obj:pylearn2.models.mlp.Sigmoid { 18 | layer_name: 'h1', 19 | dim: 8000, 20 | irange: .05, 21 | max_col_norm: 1.9365, 22 | init_bias: -2.0, 23 | }, 24 | !obj:pylearn2.models.mlp.Sigmoid { 25 | max_col_norm: 1.9365, 26 | init_bias: !obj:pylearn2.models.dbm.init_sigmoid_bias_from_marginals { dataset: *train}, 27 | layer_name: 'y', 28 | sparse_init: 100, 29 | dim: 2304 30 | } 31 | ], 32 | nvis: 100, 33 | }}, 34 | discriminator: 35 | !obj:pylearn2.models.mlp.MLP { 36 | layers: [ 37 | !obj:pylearn2.models.maxout.Maxout { 38 | #W_lr_scale: .1, 39 | #b_lr_scale: .1, 40 | layer_name: 'h0', 41 | num_units: 1200, 42 | num_pieces: 5, 43 | irange: .005, 44 | max_col_norm: 1.9365, 45 | }, 46 | !obj:pylearn2.models.maxout.Maxout { 47 | #W_lr_scale: .1, 48 | #b_lr_scale: .1, 49 | layer_name: 'h1', 50 | num_units: 1200, 51 | num_pieces: 5, 52 | irange: .005, 53 | max_col_norm: 1.9365, 54 | }, 55 | !obj:pylearn2.models.mlp.Sigmoid { 56 | #W_lr_scale: .1, 57 | #b_lr_scale: .1, 58 | max_col_norm: 1.9365, 59 | layer_name: 'y', 60 | dim: 1, 61 | irange: .005 62 | } 63 | ], 64 | nvis: 2304, 65 | }, 66 | }, 67 | algorithm: !obj:pylearn2.training_algorithms.sgd.SGD { 68 | batch_size: 100, 69 | learning_rate: .05, 70 | learning_rule: !obj:pylearn2.training_algorithms.learning_rule.Momentum { 71 | init_momentum: .5, 72 | }, 73 | monitoring_dataset: 74 | { 75 | # 'train' : *train, 76 | 'valid' : !obj:pylearn2.datasets.tfd.TFD { 77 | which_set: 'valid', 78 | scale: True, 79 | }, 80 | # 'test' : !obj:pylearn2.datasets.tfd.TFD { 81 | # which_set: 'test', 82 | # scale: True, 83 | # } 84 | }, 85 | cost: !obj:adversarial.AdversaryCost2 { 86 | scale_grads: 0, 87 | 
#target_scale: 1., 88 | discriminator_default_input_include_prob: .5, 89 | discriminator_input_include_probs: { 90 | 'h0': .8 91 | }, 92 | discriminator_default_input_scale: 2., 93 | discriminator_input_scales: { 94 | 'h0': 1.25 95 | } 96 | }, 97 | #!obj:pylearn2.costs.mlp.dropout.Dropout { 98 | # input_include_probs: { 'h0' : .8 }, 99 | # input_scales: { 'h0': 1. } 100 | #}, 101 | termination_criterion: !obj:pylearn2.termination_criteria.EpochCounter { 102 | max_epochs: 50 103 | }, 104 | update_callbacks: !obj:pylearn2.training_algorithms.sgd.ExponentialDecay { 105 | decay_factor: 1.000004, 106 | min_lr: .000001 107 | } 108 | }, 109 | extensions: [ 110 | !obj:pylearn2.training_algorithms.learning_rule.MomentumAdjustor { 111 | start: 1, 112 | saturate: 250, 113 | final_momentum: .7 114 | }, 115 | #!obj:pylearn2.train_extensions.best_params.MonitorBasedSaveBest { 116 | # channel_name: 'valid_gen_ll', 117 | # name_base: 'save/pretrain', 118 | # store_best_model: True 119 | #} 120 | ], 121 | save_path: "${PYLEARN2_TRAIN_FILE_FULL_STEM}.pkl", 122 | save_freq: 1 123 | } 124 | -------------------------------------------------------------------------------- /parzen_ll.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import time 3 | import gc 4 | import numpy 5 | import theano 6 | import theano.tensor as T 7 | from pylearn2.utils import serial 8 | from pylearn2.config import yaml_parse 9 | from pylearn2.datasets.mnist import MNIST 10 | from pylearn2.datasets.tfd import TFD 11 | 12 | 13 | 14 | def get_nll(x, parzen, batch_size=10): 15 | """ 16 | Credit: Yann N. Dauphin 17 | """ 18 | 19 | inds = range(x.shape[0]) 20 | n_batches = int(numpy.ceil(float(len(inds)) / batch_size)) 21 | 22 | times = [] 23 | nlls = [] 24 | for i in range(n_batches): 25 | begin = time.time() 26 | nll = parzen(x[inds[i::n_batches]]) 27 | end = time.time() 28 | times.append(end-begin) 29 | nlls.extend(nll) 30 | 31 | if i % 10 == 0: 32 | print i, numpy.mean(times), numpy.mean(nlls) 33 | 34 | return numpy.array(nlls) 35 | 36 | 37 | def log_mean_exp(a): 38 | """ 39 | Credit: Yann N. Dauphin 40 | """ 41 | 42 | max_ = a.max(1) 43 | 44 | return max_ + T.log(T.exp(a - max_.dimshuffle(0, 'x')).mean(1)) 45 | 46 | 47 | def theano_parzen(mu, sigma): 48 | """ 49 | Credit: Yann N. 
Dauphin
50 |     """
51 | 
52 |     x = T.matrix()
53 |     mu = theano.shared(mu)
54 |     a = ( x.dimshuffle(0, 'x', 1) - mu.dimshuffle('x', 0, 1) ) / sigma
55 |     E = log_mean_exp(-0.5*(a**2).sum(2))
56 |     Z = mu.shape[1] * T.log(sigma * numpy.sqrt(numpy.pi * 2))
57 | 
58 |     return theano.function([x], E - Z)
59 | 
60 | 
61 | def cross_validate_sigma(samples, data, sigmas, batch_size):
62 | 
63 |     lls = []
64 |     for sigma in sigmas:
65 |         print sigma
66 |         parzen = theano_parzen(samples, sigma)
67 |         tmp = get_nll(data, parzen, batch_size = batch_size)
68 |         lls.append(numpy.asarray(tmp).mean())
69 |         del parzen
70 |         gc.collect()
71 | 
72 |     ind = numpy.argmax(lls)
73 |     return sigmas[ind]
74 | 
75 | 
76 | def get_valid(ds, limit_size = -1, fold = 0):
77 |     if ds == 'mnist':
78 |         data = MNIST('train', start=50000, stop=60000)
79 |         return data.X[:limit_size]
80 |     elif ds == 'tfd':
81 |         data = TFD('valid', fold = fold, scale=True)
82 |         return data.X
83 |     else:
84 |         raise ValueError("Unknown dataset: {}".format(ds))
85 | 
86 | 
87 | def get_test(ds, test, fold=0):
88 |     if ds == 'mnist':
89 |         return test.get_test_set()
90 |     elif ds == 'tfd':
91 |         return test.get_test_set(fold=fold)
92 |     else:
93 |         raise ValueError("Unknown dataset: {}".format(ds))
94 | 
95 | 
96 | def main():
97 |     parser = argparse.ArgumentParser(description = 'Parzen window, log-likelihood estimator')
98 |     parser.add_argument('-p', '--path', help='model path')
99 |     parser.add_argument('-s', '--sigma', default = None)
100 |     parser.add_argument('-d', '--dataset', choices=['mnist', 'tfd'])
101 |     parser.add_argument('-f', '--fold', default = 0, type=int)
102 |     parser.add_argument('-v', '--valid', default = False, action='store_true')
103 |     parser.add_argument('-n', '--num_samples', default=10000, type=int)
104 |     parser.add_argument('-l', '--limit_size', default=1000, type=int)
105 |     parser.add_argument('-b', '--batch_size', default=100, type=int)
106 |     parser.add_argument('-c', '--cross_val', default=10, type=int,
107 |                         help="Number of cross validation folds")
108 |     parser.add_argument('--sigma_start', default=-1, type=float)
109 |     parser.add_argument('--sigma_end', default=0., type=float)
110 |     args = parser.parse_args()
111 | 
112 |     # load model
113 |     model = serial.load(args.path)
114 |     src = model.dataset_yaml_src
115 |     batch_size = args.batch_size
116 |     model.set_batch_size(batch_size)
117 | 
118 |     # load test set
119 |     test = yaml_parse.load(src)
120 |     test = get_test(args.dataset, test, args.fold)
121 | 
122 |     # generate samples
123 |     samples = model.generator.sample(args.num_samples).eval()
124 |     output_space = model.generator.mlp.get_output_space()
125 |     if 'Conv2D' in str(output_space):
126 |         samples = output_space.convert(samples, output_space.axes, ('b', 0, 1, 'c'))
127 |         samples = samples.reshape((samples.shape[0], numpy.prod(samples.shape[1:])))
128 |     del model
129 |     gc.collect()
130 | 
131 |     # cross validate sigma
132 |     if args.sigma is None:
133 |         valid = get_valid(args.dataset, limit_size = args.limit_size, fold = args.fold)
134 |         sigma_range = numpy.logspace(args.sigma_start, args.sigma_end, num=args.cross_val)
135 |         sigma = cross_validate_sigma(samples, valid, sigma_range, batch_size)
136 |     else:
137 |         sigma = float(args.sigma)
138 | 
139 |     print "Using Sigma: {}".format(sigma)
140 |     gc.collect()
141 | 
142 |     # fit and evaluate
143 |     parzen = theano_parzen(samples, sigma)
144 |     ll = get_nll(test.X, parzen, batch_size = batch_size)
145 |     se = ll.std() / numpy.sqrt(test.X.shape[0])
146 | 
147 |     print "Log-Likelihood of test set = {}, se:
{}".format(ll.mean(), se) 148 | 149 | # valid 150 | if args.valid: 151 | valid = get_valid(args.dataset) 152 | ll = get_nll(valid, parzen, batch_size = batch_size) 153 | se = ll.std() / numpy.sqrt(valid.shape[0]) 154 | print "Log-Likelihood of valid set = {}, se: {}".format(ll.mean(), se) 155 | 156 | 157 | if __name__ == "__main__": 158 | main() 159 | -------------------------------------------------------------------------------- /cifar10_convolutional.yaml: -------------------------------------------------------------------------------- 1 | !obj:pylearn2.train.Train { 2 | dataset: &train !obj:pylearn2.datasets.cifar10.CIFAR10 { 3 | axes: ['c', 0, 1, 'b'], 4 | gcn: 55., 5 | which_set: 'train', 6 | start: 0, 7 | stop: 40000 8 | }, 9 | model: !obj:adversarial.AdversaryPair { 10 | generator: !obj:adversarial.Generator { 11 | mlp: !obj:pylearn2.models.mlp.MLP { 12 | layers: [ 13 | !obj:pylearn2.models.mlp.RectifiedLinear { 14 | layer_name: 'gh0', 15 | dim: 8000, 16 | irange: .05, 17 | #max_col_norm: 1.9365, 18 | }, 19 | !obj:pylearn2.models.mlp.Sigmoid { 20 | layer_name: 'h1', 21 | dim: 8000, 22 | irange: .05, 23 | #max_col_norm: 1.9365, 24 | }, 25 | !obj:pylearn2.models.mlp.SpaceConverter { 26 | layer_name: 'converter', 27 | output_space: !obj:pylearn2.space.Conv2DSpace { 28 | shape: [10, 10], 29 | num_channels: 80, 30 | axes: ['c', 0, 1, 'b'], 31 | }}, 32 | !obj:adversarial.deconv.Deconv { 33 | #W_lr_scale: .05, 34 | #b_lr_scale: .05, 35 | num_channels: 3, 36 | output_stride: [3, 3], 37 | kernel_shape: [5, 5], 38 | pad_out: 0, 39 | #max_kernel_norm: 1.9365, 40 | # init_bias: !obj:pylearn2.models.dbm.init_sigmoid_bias_from_marginals { dataset: *train}, 41 | layer_name: 'y', 42 | irange: .05, 43 | tied_b: 0 44 | }, 45 | ], 46 | nvis: 100, 47 | }}, 48 | discriminator: 49 | !obj:pylearn2.models.mlp.MLP { 50 | layers: [ 51 | !obj:pylearn2.models.maxout.MaxoutConvC01B { 52 | layer_name: 'dh0', 53 | pad: 4, 54 | tied_b: 1, 55 | #W_lr_scale: .05, 56 | #b_lr_scale: .05, 57 | num_channels: 32, 58 | num_pieces: 2, 59 | kernel_shape: [8, 8], 60 | pool_shape: [4, 4], 61 | pool_stride: [2, 2], 62 | irange: .005, 63 | #max_kernel_norm: .9, 64 | partial_sum: 33, 65 | }, 66 | !obj:pylearn2.models.maxout.MaxoutConvC01B { 67 | layer_name: 'h1', 68 | pad: 3, 69 | tied_b: 1, 70 | #W_lr_scale: .05, 71 | #b_lr_scale: .05, 72 | num_channels: 32, # 192 ran out of memory 73 | num_pieces: 2, 74 | kernel_shape: [8, 8], 75 | pool_shape: [4, 4], 76 | pool_stride: [2, 2], 77 | irange: .005, 78 | #max_kernel_norm: 1.9365, 79 | partial_sum: 15, 80 | }, 81 | !obj:pylearn2.models.maxout.MaxoutConvC01B { 82 | pad: 3, 83 | layer_name: 'h2', 84 | tied_b: 1, 85 | #W_lr_scale: .05, 86 | #b_lr_scale: .05, 87 | num_channels: 192, 88 | num_pieces: 2, 89 | kernel_shape: [5, 5], 90 | pool_shape: [2, 2], 91 | pool_stride: [2, 2], 92 | irange: .005, 93 | #max_kernel_norm: 1.9365, 94 | }, 95 | !obj:pylearn2.models.maxout.Maxout { 96 | layer_name: 'h3', 97 | irange: .005, 98 | num_units: 500, 99 | num_pieces: 5, 100 | #max_col_norm: 1.9 101 | }, 102 | !obj:pylearn2.models.mlp.Sigmoid { 103 | #W_lr_scale: .1, 104 | #b_lr_scale: .1, 105 | #max_col_norm: 1.9365, 106 | layer_name: 'y', 107 | dim: 1, 108 | irange: .005 109 | } 110 | ], 111 | input_space: !obj:pylearn2.space.Conv2DSpace { 112 | shape: [32, 32], 113 | num_channels: 3, 114 | axes: ['c', 0, 1, 'b'], 115 | } 116 | }, 117 | }, 118 | algorithm: !obj:pylearn2.training_algorithms.sgd.SGD { 119 | batch_size: 128, 120 | learning_rate: .004, 121 | learning_rule: 
!obj:pylearn2.training_algorithms.learning_rule.Momentum { 122 | init_momentum: .5, 123 | }, 124 | monitoring_dataset: 125 | { 126 | #'train' : *train, 127 | 'valid' : !obj:pylearn2.datasets.cifar10.CIFAR10 { 128 | axes: ['c', 0, 1, 'b'], 129 | gcn: 55., 130 | which_set: 'train', 131 | start: 40000, 132 | stop: 50000 133 | }, 134 | #'test' : !obj:pylearn2.datasets.cifar10.CIFAR10 { 135 | # which_set: 'test', 136 | # gcn: 55., 137 | # } 138 | }, 139 | cost: !obj:adversarial.AdversaryCost2 { 140 | scale_grads: 0, 141 | #target_scale: .1, 142 | discriminator_default_input_include_prob: .5, 143 | discriminator_input_include_probs: { 144 | 'dh0': .8 145 | }, 146 | discriminator_default_input_scale: 2., 147 | discriminator_input_scales: { 148 | 'dh0': 1.25 149 | } 150 | }, 151 | #termination_criterion: !obj:pylearn2.termination_criteria.MonitorBased { 152 | # channel_name: "valid_y_misclass", 153 | # prop_decrease: 0., 154 | # N: 100 155 | #}, 156 | update_callbacks: !obj:pylearn2.training_algorithms.sgd.ExponentialDecay { 157 | decay_factor: 1.000004, 158 | min_lr: .000001 159 | } 160 | }, 161 | extensions: [ 162 | #!obj:pylearn2.train_extensions.best_params.MonitorBasedSaveBest { 163 | # channel_name: 'valid_y_misclass', 164 | # save_path: "${PYLEARN2_TRAIN_FILE_FULL_STEM}_best.pkl" 165 | #}, 166 | !obj:pylearn2.training_algorithms.learning_rule.MomentumAdjustor { 167 | start: 1, 168 | saturate: 250, 169 | final_momentum: .7 170 | } 171 | ], 172 | save_path: "${PYLEARN2_TRAIN_FILE_FULL_STEM}.pkl", 173 | save_freq: 1 174 | } 175 | -------------------------------------------------------------------------------- /deconv.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import logging 3 | import numpy as np 4 | 5 | from theano.compat import OrderedDict 6 | from theano import tensor as T 7 | 8 | from pylearn2.linear.conv2d_c01b import make_random_conv2D 9 | from pylearn2.models import Model 10 | from pylearn2.models.maxout import check_cuda # TODO: import from original path 11 | from pylearn2.models.mlp import Layer 12 | #from pylearn2.models.maxout import py_integer_types # TODO: import from orig path 13 | from pylearn2.space import Conv2DSpace 14 | from pylearn2.utils import sharedX 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | class Deconv(Layer): 19 | def __init__(self, 20 | num_channels, 21 | kernel_shape, 22 | layer_name, 23 | irange=None, 24 | init_bias=0., 25 | W_lr_scale=None, 26 | b_lr_scale=None, 27 | pad_out=0, 28 | fix_kernel_shape=False, 29 | partial_sum=1, 30 | tied_b=False, 31 | max_kernel_norm=None, 32 | output_stride=(1, 1)): 33 | check_cuda(str(type(self))) 34 | super(Deconv, self).__init__() 35 | 36 | detector_channels = num_channels 37 | 38 | self.__dict__.update(locals()) 39 | del self.self 40 | 41 | @functools.wraps(Model.get_lr_scalers) 42 | def get_lr_scalers(self): 43 | 44 | if not hasattr(self, 'W_lr_scale'): 45 | self.W_lr_scale = None 46 | 47 | if not hasattr(self, 'b_lr_scale'): 48 | self.b_lr_scale = None 49 | 50 | rval = OrderedDict() 51 | 52 | if self.W_lr_scale is not None: 53 | W, = self.transformer.get_params() 54 | rval[W] = self.W_lr_scale 55 | 56 | if self.b_lr_scale is not None: 57 | rval[self.b] = self.b_lr_scale 58 | 59 | return rval 60 | 61 | def set_input_space(self, space): 62 | """ 63 | Tells the layer to use the specified input space. 64 | 65 | This resets parameters! The kernel tensor is initialized with the 66 | size needed to receive input from this space. 
67 | 68 | Parameters 69 | ---------- 70 | space : Space 71 | The Space that the input will lie in. 72 | """ 73 | 74 | setup_deconv_detector_layer_c01b(layer=self, 75 | input_space=space, 76 | rng=self.mlp.rng) 77 | 78 | rng = self.mlp.rng 79 | 80 | detector_shape = self.detector_space.shape 81 | 82 | 83 | self.output_space = self.detector_space 84 | 85 | logger.info('Output space: {0}'.format(self.output_space.shape)) 86 | 87 | def _modify_updates(self, updates): 88 | """ 89 | Replaces the values in `updates` if needed to enforce the options set 90 | in the __init__ method, including `max_kernel_norm`. 91 | 92 | Parameters 93 | ---------- 94 | updates : OrderedDict 95 | A dictionary mapping parameters (including parameters not 96 | belonging to this model) to updated values of those parameters. 97 | The dictionary passed in contains the updates proposed by the 98 | learning algorithm. This function modifies the dictionary 99 | directly. The modified version will be compiled and executed 100 | by the learning algorithm. 101 | """ 102 | 103 | if self.max_kernel_norm is not None: 104 | W, = self.transformer.get_params() 105 | if W in updates: 106 | updated_W = updates[W] 107 | row_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=(0, 1, 2))) 108 | desired_norms = T.clip(row_norms, 0, self.max_kernel_norm) 109 | scales = desired_norms / (1e-7 + row_norms) 110 | updates[W] = (updated_W * scales.dimshuffle('x', 'x', 'x', 0)) 111 | 112 | @functools.wraps(Model.get_params) 113 | def get_params(self): 114 | assert self.b.name is not None 115 | W, = self.transformer.get_params() 116 | assert W.name is not None 117 | rval = self.transformer.get_params() 118 | assert not isinstance(rval, set) 119 | rval = list(rval) 120 | assert self.b not in rval 121 | rval.append(self.b) 122 | return rval 123 | 124 | @functools.wraps(Layer.get_weight_decay) 125 | def get_weight_decay(self, coeff): 126 | if isinstance(coeff, str): 127 | coeff = float(coeff) 128 | assert isinstance(coeff, float) or hasattr(coeff, 'dtype') 129 | W, = self.transformer.get_params() 130 | return coeff * T.sqr(W).sum() 131 | 132 | @functools.wraps(Layer.set_weights) 133 | def set_weights(self, weights): 134 | W, = self.transformer.get_params() 135 | W.set_value(weights) 136 | 137 | @functools.wraps(Layer.set_biases) 138 | def set_biases(self, biases): 139 | self.b.set_value(biases) 140 | 141 | @functools.wraps(Layer.get_biases) 142 | def get_biases(self): 143 | return self.b.get_value() 144 | 145 | @functools.wraps(Model.get_weights_topo) 146 | def get_weights_topo(self): 147 | return self.transformer.get_weights_topo() 148 | 149 | @functools.wraps(Layer.get_monitoring_channels) 150 | def get_layer_monitoring_channels(self, state_below=None, state=None, targets=None): 151 | 152 | W, = self.transformer.get_params() 153 | 154 | assert W.ndim == 4 155 | 156 | sq_W = T.sqr(W) 157 | 158 | row_norms = T.sqrt(sq_W.sum(axis=(0, 1, 2))) 159 | 160 | P = state 161 | 162 | rval = OrderedDict() 163 | 164 | vars_and_prefixes = [(P, '')] 165 | 166 | for var, prefix in vars_and_prefixes: 167 | if not hasattr(var, 'ndim') or var.ndim != 4: 168 | print "expected 4D tensor, got " 169 | print var 170 | print type(var) 171 | if isinstance(var, tuple): 172 | print "tuple length: ", len(var) 173 | assert False 174 | v_max = var.max(axis=(1, 2, 3)) 175 | v_min = var.min(axis=(1, 2, 3)) 176 | v_mean = var.mean(axis=(1, 2, 3)) 177 | v_range = v_max - v_min 178 | 179 | # max_x.mean_u is "the mean over *u*nits of the max over 180 | # e*x*amples" The x and u are included in 
the name because 181 | # otherwise its hard to remember which axis is which when reading 182 | # the monitor I use inner.outer rather than outer_of_inner or 183 | # something like that because I want mean_x.* to appear next to 184 | # each other in the alphabetical list, as these are commonly 185 | # plotted together 186 | for key, val in [('max_x.max_u', v_max.max()), 187 | ('max_x.mean_u', v_max.mean()), 188 | ('max_x.min_u', v_max.min()), 189 | ('min_x.max_u', v_min.max()), 190 | ('min_x.mean_u', v_min.mean()), 191 | ('min_x.min_u', v_min.min()), 192 | ('range_x.max_u', v_range.max()), 193 | ('range_x.mean_u', v_range.mean()), 194 | ('range_x.min_u', v_range.min()), 195 | ('mean_x.max_u', v_mean.max()), 196 | ('mean_x.mean_u', v_mean.mean()), 197 | ('mean_x.min_u', v_mean.min())]: 198 | rval[prefix+key] = val 199 | 200 | rval.update(OrderedDict([('kernel_norms_min', row_norms.min()), 201 | ('kernel_norms_mean', row_norms.mean()), 202 | ('kernel_norms_max', row_norms.max()), ])) 203 | 204 | return rval 205 | 206 | @functools.wraps(Layer.fprop) 207 | def fprop(self, state_below): 208 | check_cuda(str(type(self))) 209 | 210 | self.input_space.validate(state_below) 211 | 212 | z = self.transformer.lmul_T(state_below) 213 | 214 | self.output_space.validate(z) 215 | 216 | if not hasattr(self, 'tied_b'): 217 | self.tied_b = False 218 | if self.tied_b: 219 | b = self.b.dimshuffle(0, 'x', 'x', 'x') 220 | else: 221 | b = self.b.dimshuffle(0, 1, 2, 'x') 222 | 223 | return z + b 224 | 225 | 226 | 227 | def setup_deconv_detector_layer_c01b(layer, input_space, rng, irange="not specified"): 228 | """ 229 | layer. This function sets up only the detector layer. 230 | 231 | Does the following: 232 | 233 | * raises a RuntimeError if cuda is not available 234 | * sets layer.input_space to input_space 235 | * sets up addition of dummy channels for compatibility with cuda-convnet: 236 | 237 | - layer.dummy_channels: # of dummy channels that need to be added 238 | (You might want to check this and raise an Exception if it's not 0) 239 | - layer.dummy_space: The Conv2DSpace representing the input with dummy 240 | channels added 241 | 242 | * sets layer.detector_space to the space for the detector layer 243 | * sets layer.transformer to be a Conv2D instance 244 | * sets layer.b to the right value 245 | 246 | Parameters 247 | ---------- 248 | layer : object 249 | Any python object that allows the modifications described below and 250 | has the following attributes: 251 | 252 | * pad : int describing amount of zero padding to add 253 | * kernel_shape : 2-element tuple or list describing spatial shape of 254 | kernel 255 | * fix_kernel_shape : bool, if true, will shrink the kernel shape to 256 | make it feasible, as needed (useful for hyperparameter searchers) 257 | * detector_channels : The number of channels in the detector layer 258 | * init_bias : numeric constant added to a tensor of zeros to 259 | initialize the bias 260 | * tied_b : If true, biases are shared across all spatial locations 261 | input_space : WRITEME 262 | A Conv2DSpace to be used as input to the layer 263 | rng : WRITEME 264 | A numpy RandomState or equivalent 265 | """ 266 | 267 | if irange != "not specified": 268 | raise AssertionError( 269 | "There was a bug in setup_detector_layer_c01b." 270 | "It uses layer.irange instead of the irange parameter to the " 271 | "function. 
The irange parameter is now disabled by this " 272 | "AssertionError, so that this error message can alert you that " 273 | "the bug affected your code and explain why the interface is " 274 | "changing. The irange parameter to the function and this " 275 | "error message may be removed after April 21, 2014." 276 | ) 277 | 278 | # Use "self" to refer to layer from now on, so we can pretend we're 279 | # just running in the set_input_space method of the layer 280 | self = layer 281 | 282 | # Make sure cuda is available 283 | check_cuda(str(type(self))) 284 | 285 | # Validate input 286 | if not isinstance(input_space, Conv2DSpace): 287 | raise TypeError("The input to a convolutional layer should be a " 288 | "Conv2DSpace, but layer " + self.layer_name + " got " + 289 | str(type(self.input_space))) 290 | 291 | if not hasattr(self, 'detector_channels'): 292 | raise ValueError("layer argument must have a 'detector_channels' " 293 | "attribute specifying how many channels to put in " 294 | "the convolution kernel stack.") 295 | 296 | # Store the input space 297 | self.input_space = input_space 298 | 299 | # Make sure number of channels is supported by cuda-convnet 300 | # (multiple of 4 or <= 3) 301 | # If not supported, pad the input with dummy channels 302 | ch = self.detector_channels 303 | rem = ch % 4 304 | if ch > 3 and rem != 0: 305 | raise NotImplementedError("Need to do dummy channels on the output") 306 | # self.dummy_channels = 4 - rem 307 | #else: 308 | # self.dummy_channels = 0 309 | #self.dummy_space = Conv2DSpace( 310 | # shape=input_space.shape, 311 | # channels=input_space.num_channels + self.dummy_channels, 312 | # axes=('c', 0, 1, 'b') 313 | #) 314 | 315 | if hasattr(self, 'output_stride'): 316 | kernel_stride = self.output_stride 317 | else: 318 | assert False # not sure if I got the name right, remove this assert if I did 319 | kernel_stride = [1, 1] 320 | 321 | 322 | #o_sh = int(np.ceil((i_sh + 2. * self.pad - k_sh) / float(k_st))) + 1 323 | #o_sh -1 = np.ceil((i_sh + 2. * self.pad - k_sh) / float(k_st)) 324 | #inv_ceil(o_sh -1) = (i_sh + 2. 
* self.pad - k_sh) / float(k_st) 325 | #float(k_st) inv_cel(o_sh -1) = (i_sh + 2 * self.pad -k_sh) 326 | # i_sh = k_st inv_ceil(o_sh-1) - 2 * self.pad + k_sh 327 | 328 | output_shape = \ 329 | [k_st * (i_sh - 1) - 2 * self.pad_out + k_sh 330 | for i_sh, k_sh, k_st in zip(self.input_space.shape, 331 | self.kernel_shape, kernel_stride)] 332 | 333 | 334 | if self.input_space.num_channels < 16: 335 | raise ValueError("Cuda-convnet requires the input to lmul_T to have " 336 | "at least 16 channels.") 337 | 338 | self.detector_space = Conv2DSpace(shape=output_shape, 339 | num_channels=self.detector_channels, 340 | axes=('c', 0, 1, 'b')) 341 | 342 | if hasattr(self, 'partial_sum'): 343 | partial_sum = self.partial_sum 344 | else: 345 | partial_sum = 1 346 | 347 | if hasattr(self, 'sparse_init') and self.sparse_init is not None: 348 | self.transformer = \ 349 | checked_call(make_sparse_random_conv2D, 350 | OrderedDict([('num_nonzero', self.sparse_init), 351 | ('input_space', self.detector_space), 352 | ('output_space', self.input_space), 353 | ('kernel_shape', self.kernel_shape), 354 | ('pad', self.pad), 355 | ('partial_sum', partial_sum), 356 | ('kernel_stride', kernel_stride), 357 | ('rng', rng)])) 358 | else: 359 | self.transformer = make_random_conv2D( 360 | irange=self.irange, 361 | input_axes=self.detector_space.axes, 362 | output_axes=self.input_space.axes, 363 | input_channels=self.detector_space.num_channels, 364 | output_channels=self.input_space.num_channels, 365 | kernel_shape=self.kernel_shape, 366 | pad=self.pad_out, 367 | partial_sum=partial_sum, 368 | kernel_stride=kernel_stride, 369 | rng=rng, 370 | input_shape=self.detector_space.shape 371 | ) 372 | 373 | W, = self.transformer.get_params() 374 | W.name = self.layer_name + '_W' 375 | 376 | if self.tied_b: 377 | self.b = sharedX(np.zeros(self.detector_space.num_channels) + 378 | self.init_bias) 379 | else: 380 | self.b = sharedX(self.detector_space.get_origin() + self.init_bias) 381 | self.b.name = self.layer_name + '_b' 382 | 383 | logger.info('Input shape: {0}'.format(self.input_space.shape)) 384 | print layer.layer_name + ' detector space: {0}'.format(self.detector_space.shape) 385 | -------------------------------------------------------------------------------- /sgd.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copy of pylearn2's sgd.py, hacked to support doing steps on 3 | discriminator separately from the generator. Ideally this would 4 | be accomplished using pylearn2's FixedVarDescr implementation, 5 | but it is currently not very well supported. 
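
Schematically, setup() compiles two update functions and train() interleaves
them, once per minibatch (a sketch of the inner loop; `k` is the
`discriminator_steps` constructor argument):

    d_func(batch)              # one SGD step on the discriminator parameters
    i += 1
    if i == k:
        g_func(batch)          # then one SGD step on the generator parameters
        i = 0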
6 | """ 7 | from __future__ import division 8 | 9 | __authors__ = "Ian Goodfellow" 10 | __copyright__ = "Copyright 2010-2012, Universite de Montreal" 11 | __credits__ = ["Ian Goodfellow, David Warde-Farley"] 12 | __license__ = "3-clause BSD" 13 | __maintainer__ = "David Warde-Farley" 14 | __email__ = "pylearn-dev@googlegroups" 15 | 16 | import logging 17 | import warnings 18 | import numpy as np 19 | 20 | from theano import config 21 | from theano import function 22 | from theano.compat.python2x import OrderedDict 23 | from theano.gof.op import get_debug_values 24 | 25 | from pylearn2.monitor import Monitor 26 | from pylearn2.space import CompositeSpace, NullSpace 27 | from pylearn2.train_extensions import TrainExtension 28 | from pylearn2.training_algorithms.training_algorithm import TrainingAlgorithm 29 | from pylearn2.training_algorithms.learning_rule import Momentum 30 | from pylearn2.training_algorithms.learning_rule import MomentumAdjustor \ 31 | as LRMomentumAdjustor 32 | from pylearn2.utils.iteration import is_stochastic, has_uniform_batch_size 33 | from pylearn2.utils import py_integer_types, py_float_types 34 | from pylearn2.utils import safe_zip 35 | from pylearn2.utils import serial 36 | from pylearn2.utils import sharedX 37 | from pylearn2.utils.data_specs import DataSpecsMapping 38 | from pylearn2.utils.timing import log_timing 39 | from pylearn2.utils.rng import make_np_rng 40 | 41 | 42 | log = logging.getLogger(__name__) 43 | 44 | 45 | class SGD(TrainingAlgorithm): 46 | """ 47 | SGD = (Minibatch) Stochastic Gradient Descent. 48 | A TrainingAlgorithm that does stochastic gradient descent on minibatches 49 | of training examples. 50 | 51 | For theoretical background on this algorithm, see Yoshua Bengio's machine 52 | learning course notes on the subject: 53 | 54 | http://www.iro.umontreal.ca/~pift6266/H10/notes/gradient.html 55 | 56 | Parameters 57 | ---------- 58 | learning_rate : float 59 | The learning rate to use. Train object callbacks can change the 60 | learning rate after each epoch. SGD update_callbacks can change 61 | it after each minibatch. 62 | cost : pylearn2.costs.cost.Cost, optional 63 | Cost object specifying the objective function to be minimized. 64 | Optionally, may be None. In this case, SGD will call the model's 65 | get_default_cost method to obtain the objective function. 66 | batch_size : int, optional 67 | The size of the batch to be used. 68 | If not specified, the model will be asked for the batch size, so 69 | you must have specified the batch size there. 70 | (Some models are rigidly defined to only work with one batch size) 71 | monitoring_batch_size : int, optional 72 | The size of the monitoring batches. 73 | monitoring_batches : int, optional 74 | At the start of each epoch, we run "monitoring", to evaluate 75 | quantities such as the validation set error. 76 | monitoring_batches, if specified, determines the number of batches 77 | to draw from the iterator for each monitoring dataset. 78 | Unnecessary if not using monitoring or if `monitor_iteration_mode` 79 | is 'sequential' and `batch_size` is specified (number of 80 | batches will be calculated based on full dataset size). 81 | TODO: make it possible to specify different monitoring_batches 82 | for each monitoring dataset. The Monitor itself already supports 83 | this. 84 | monitoring_dataset : Dataset or dictionary, optional 85 | If not specified, no monitoring is used. 86 | If specified to be a Dataset, monitor on that Dataset. 
87 | If specified to be dictionary, the keys should be string names 88 | of datasets, and the values should be Datasets. All monitoring 89 | channels will be computed for all monitoring Datasets and will 90 | have the dataset name and an underscore prepended to them. 91 | monitor_iteration_mode : str, optional 92 | The iteration mode used to iterate over the examples in all 93 | monitoring datasets. If not specified, defaults to 'sequential'. 94 | TODO: make it possible to specify different modes for different 95 | datasets. 96 | termination_criterion : instance of \ 97 | pylearn2.termination_criteria.TerminationCriterion, optional 98 | 99 | Used to determine when the algorithm should stop running. 100 | If not specified, runs forever--or more realistically, until 101 | external factors halt the python process (Kansas 1977). 102 | update_callbacks : list, optional 103 | If specified, each member of the list should be a callable that 104 | accepts an SGD instance as its only argument. 105 | All callbacks will be called with this SGD instance after each 106 | SGD step. 107 | learning_rule : training_algorithms.learning_rule.LearningRule, optional 108 | A learning rule computes the new parameter values given old 109 | parameters and first-order gradients. If learning_rule is None, 110 | sgd.SGD will update parameters according to the standard SGD 111 | learning rule: 112 | 113 | .. code-block:: none 114 | 115 | param := param - learning_rate * d cost / d param 116 | 117 | This argument allows more sophisticated learning rules, such 118 | as SGD with momentum. 119 | init_momentum : float, **DEPRECATED** option 120 | Use learning_rule instead. 121 | If None, does not use momentum otherwise, use momentum and 122 | initialize the momentum coefficient to init_momentum. Callbacks 123 | can change this over time just like the learning rate. If the 124 | gradient is the same on every step, then the update taken by the 125 | SGD algorithm is scaled by a factor of 1/(1-momentum). See 126 | section 9 of Geoffrey Hinton's "A Practical Guide to Training 127 | Restricted Boltzmann Machines" for details. 128 | set_batch_size : bool, optional 129 | Defaults to False. 130 | If True, and batch_size conflicts with model.force_batch_size, 131 | will call model.set_batch_size(batch_size) in an attempt to 132 | change model.force_batch_size 133 | train_iteration_mode : str, optional 134 | Defaults to 'shuffled_sequential'. 135 | The iteration mode to use for iterating through training examples. 136 | batches_per_iter : int, optional 137 | The number of batches to draw from the iterator over training 138 | examples. 139 | If iteration mode is 'sequential' or 'shuffled_sequential', this 140 | is unnecessary; when unspecified we will iterate over all examples. 141 | theano_function_mode : a valid argument to theano.function's \ 142 | 'mode' parameter, optional 143 | 144 | The theano mode to compile the updates function with. Note that 145 | pylearn2 includes some wraplinker modes that are not bundled with 146 | theano. See pylearn2.devtools. These extra modes let you do 147 | things like check for NaNs at every step, or record md5 digests 148 | of all computations performed by the update function to help 149 | isolate problems with nondeterminism. 150 | monitoring_costs : list, optional 151 | a list of Cost instances. The Monitor will also include all 152 | channels defined by these Costs, even though we don't train 153 | using them. 
154 | seed : valid argument to np.random.RandomState, optional 155 | The seed used for the random number generate to be passed to the 156 | training dataset iterator (if any) 157 | """ 158 | def __init__(self, learning_rate, cost=None, batch_size=None, 159 | monitoring_batch_size=None, monitoring_batches=None, 160 | monitoring_dataset=None, monitor_iteration_mode='sequential', 161 | termination_criterion=None, update_callbacks=None, 162 | learning_rule = None, init_momentum = None, 163 | set_batch_size = False, 164 | train_iteration_mode = None, batches_per_iter=None, 165 | theano_function_mode = None, monitoring_costs=None, 166 | seed=[2012, 10, 5], discriminator_steps=1): 167 | self.discriminator_steps = discriminator_steps 168 | 169 | if isinstance(cost, (list, tuple, set)): 170 | raise TypeError("SGD no longer supports using collections of " + 171 | "Costs to represent a sum of Costs. Use " + 172 | "pylearn2.costs.cost.SumOfCosts instead.") 173 | 174 | if init_momentum: 175 | warnings.warn("init_momentum interface is deprecated and will " 176 | "become officially unsuported as of May 9, 2014. Please use the " 177 | "`learning_rule` parameter instead, providing an object of type " 178 | "`pylearn2.training_algorithms.learning_rule.Momentum` instead") 179 | # Convert to new interface under the hood. 180 | self.learning_rule = Momentum(init_momentum) 181 | else: 182 | self.learning_rule = learning_rule 183 | 184 | self.learning_rate = sharedX(learning_rate, 'learning_rate') 185 | self.cost = cost 186 | self.batch_size = batch_size 187 | self.set_batch_size = set_batch_size 188 | self.batches_per_iter = batches_per_iter 189 | self._set_monitoring_dataset(monitoring_dataset) 190 | self.monitoring_batch_size = monitoring_batch_size 191 | self.monitoring_batches = monitoring_batches 192 | self.monitor_iteration_mode = monitor_iteration_mode 193 | if monitoring_dataset is None: 194 | if monitoring_batch_size is not None: 195 | raise ValueError("Specified a monitoring batch size " + 196 | "but not a monitoring dataset.") 197 | if monitoring_batches is not None: 198 | raise ValueError("Specified an amount of monitoring batches " + 199 | "but not a monitoring dataset.") 200 | self.termination_criterion = termination_criterion 201 | self._register_update_callbacks(update_callbacks) 202 | if train_iteration_mode is None: 203 | train_iteration_mode = 'shuffled_sequential' 204 | self.train_iteration_mode = train_iteration_mode 205 | self.first = True 206 | self.rng = make_np_rng(seed, which_method=["randn","randint"]) 207 | self.theano_function_mode = theano_function_mode 208 | self.monitoring_costs = monitoring_costs 209 | 210 | def setup(self, model, dataset): 211 | """ 212 | Compiles the theano functions needed for the train method. 
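        In this copy, two functions are compiled: `d_func`, which updates only
        the discriminator parameters, and `g_func`, which updates only the
        generator parameters; `train` chooses which one to call for each batch.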
213 | 214 | Parameters 215 | ---------- 216 | model : a Model instance 217 | dataset : Dataset 218 | """ 219 | self.i = 0 220 | if self.cost is None: 221 | self.cost = model.get_default_cost() 222 | 223 | inf_params = [param for param in model.get_params() 224 | if np.any(np.isinf(param.get_value()))] 225 | if len(inf_params) > 0: 226 | raise ValueError("These params are Inf: "+str(inf_params)) 227 | if any([np.any(np.isnan(param.get_value())) 228 | for param in model.get_params()]): 229 | nan_params = [param for param in model.get_params() 230 | if np.any(np.isnan(param.get_value()))] 231 | raise ValueError("These params are NaN: "+str(nan_params)) 232 | self.model = model 233 | 234 | self._synchronize_batch_size(model) 235 | model._test_batch_size = self.batch_size 236 | self.monitor = Monitor.get_monitor(model) 237 | self.monitor._sanity_check() 238 | 239 | # test if force batch size and batch size 240 | if getattr(model, "force_batch_size", False) and \ 241 | any(dataset.get_design_matrix().shape[0] % self.batch_size != 0 for 242 | dataset in self.monitoring_dataset.values()) and \ 243 | not has_uniform_batch_size(self.monitor_iteration_mode): 244 | 245 | raise ValueError("Dataset size is not a multiple of batch size." 246 | "You should set monitor_iteration_mode to " 247 | "even_sequential, even_shuffled_sequential or " 248 | "even_batchwise_shuffled_sequential") 249 | 250 | data_specs = self.cost.get_data_specs(self.model) 251 | mapping = DataSpecsMapping(data_specs) 252 | space_tuple = mapping.flatten(data_specs[0], return_tuple=True) 253 | source_tuple = mapping.flatten(data_specs[1], return_tuple=True) 254 | 255 | # Build a flat tuple of Theano Variables, one for each space. 256 | # We want that so that if the same space/source is specified 257 | # more than once in data_specs, only one Theano Variable 258 | # is generated for it, and the corresponding value is passed 259 | # only once to the compiled Theano function. 260 | theano_args = [] 261 | for space, source in safe_zip(space_tuple, source_tuple): 262 | name = '%s[%s]' % (self.__class__.__name__, source) 263 | arg = space.make_theano_batch(name=name, 264 | batch_size=self.batch_size) 265 | theano_args.append(arg) 266 | theano_args = tuple(theano_args) 267 | 268 | # Methods of `self.cost` need args to be passed in a format compatible 269 | # with data_specs 270 | nested_args = mapping.nest(theano_args) 271 | fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args) 272 | self.on_load_batch = fixed_var_descr.on_load_batch 273 | 274 | cost_value = self.cost.expr(model, nested_args, 275 | ** fixed_var_descr.fixed_vars) 276 | 277 | if cost_value is not None and cost_value.name is None: 278 | # Concatenate the name of all tensors in theano_args !? 
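            # Give the otherwise anonymous cost tensor a readable name; this
            # also makes the gradient names built below, e.g.
            # 'grad(objective, <param>)', and Theano error messages easier
            # to follow.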
279 | cost_value.name = 'objective' 280 | 281 | # Set up monitor to model the objective value, learning rate, 282 | # momentum (if applicable), and extra channels defined by 283 | # the cost 284 | learning_rate = self.learning_rate 285 | if self.monitoring_dataset is not None: 286 | if (self.monitoring_batch_size is None and 287 | self.monitoring_batches is None): 288 | self.monitoring_batch_size = self.batch_size 289 | self.monitoring_batches = self.batches_per_iter 290 | self.monitor.setup(dataset=self.monitoring_dataset, 291 | cost=self.cost, 292 | batch_size=self.monitoring_batch_size, 293 | num_batches=self.monitoring_batches, 294 | extra_costs=self.monitoring_costs, 295 | mode=self.monitor_iteration_mode) 296 | dataset_name = self.monitoring_dataset.keys()[0] 297 | monitoring_dataset = self.monitoring_dataset[dataset_name] 298 | #TODO: have Monitor support non-data-dependent channels 299 | self.monitor.add_channel(name='learning_rate', 300 | ipt=None, 301 | val=learning_rate, 302 | data_specs=(NullSpace(), ''), 303 | dataset=monitoring_dataset) 304 | 305 | if self.learning_rule: 306 | self.learning_rule.add_channels_to_monitor( 307 | self.monitor, 308 | monitoring_dataset) 309 | 310 | params = list(model.get_params()) 311 | assert len(params) > 0 312 | for i, param in enumerate(params): 313 | if param.name is None: 314 | param.name = 'sgd_params[%d]' % i 315 | self.params = params 316 | 317 | 318 | grads, updates = self.cost.get_gradients(model, nested_args, 319 | ** fixed_var_descr.fixed_vars) 320 | if not isinstance(grads, OrderedDict): 321 | raise TypeError(str(type(self.cost)) + ".get_gradients returned " + 322 | "something with" + str(type(grads)) + "as its " + 323 | "first member. Expected OrderedDict.") 324 | 325 | for param in grads: 326 | assert param in params 327 | for param in params: 328 | assert param in grads 329 | 330 | lr_scalers = model.get_lr_scalers() 331 | 332 | for key in lr_scalers: 333 | if key not in params: 334 | raise ValueError("Tried to scale the learning rate on " +\ 335 | str(key)+" which is not an optimization parameter.") 336 | 337 | assert len(updates.keys()) == 0 338 | 339 | def get_func(learn_discriminator, learn_generator): 340 | 341 | updates = OrderedDict() 342 | 343 | assert (learn_discriminator or learn_generator) and not (learn_discriminator and learn_generator) 344 | 345 | if learn_discriminator: 346 | cur_params = model.discriminator.get_params() 347 | else: 348 | cur_params = model.generator.get_params() 349 | 350 | cur_grads = OrderedDict() 351 | for param in cur_params: 352 | cur_grads[param] = grads[param] 353 | 354 | for param in grads: 355 | if grads[param].name is None and cost_value is not None: 356 | grads[param].name = ('grad(%(costname)s, %(paramname)s)' % 357 | {'costname': cost_value.name, 358 | 'paramname': param.name}) 359 | assert grads[param].dtype == param.dtype 360 | 361 | cur_lr_scalers = OrderedDict() 362 | for param in cur_params: 363 | if param in lr_scalers: 364 | lr_scaler = lr_scalers[param] 365 | cur_lr_scalers[param] = lr_scaler 366 | 367 | log.info('Parameter and initial learning rate summary:') 368 | for param in cur_params: 369 | param_name = param.name 370 | if param_name is None: 371 | param_name = 'anon_param' 372 | lr = learning_rate.get_value() * cur_lr_scalers.get(param,1.) 
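                    # Effective step size for this parameter: the global
                    # learning rate times any model-supplied lr_scaler
                    # (1 if the model does not provide one).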
373 | log.info('\t' + param_name + ': ' + str(lr)) 374 | 375 | if self.learning_rule: 376 | updates.update(self.learning_rule.get_updates( 377 | learning_rate, cur_grads, cur_lr_scalers)) 378 | else: 379 | # Use standard SGD updates with fixed learning rate. 380 | updates.update( dict(safe_zip(params, [param - learning_rate * \ 381 | lr_scalers.get(param, 1.) * grads[param] 382 | for param in params]))) 383 | 384 | for param in cur_params: 385 | if updates[param].name is None: 386 | updates[param].name = 'sgd_update(' + param.name + ')' 387 | model.modify_updates(updates) 388 | for param in cur_params: 389 | update = updates[param] 390 | if update.name is None: 391 | update.name = 'censor(sgd_update(' + param.name + '))' 392 | for update_val in get_debug_values(update): 393 | if np.any(np.isinf(update_val)): 394 | raise ValueError("debug value of %s contains infs" % 395 | update.name) 396 | if np.any(np.isnan(update_val)): 397 | raise ValueError("debug value of %s contains nans" % 398 | update.name) 399 | 400 | 401 | with log_timing(log, 'Compiling sgd_update'): 402 | return function(theano_args, 403 | updates=updates, 404 | name='sgd_update', 405 | on_unused_input='ignore', 406 | mode=self.theano_function_mode) 407 | self.d_func = get_func(1, 0) 408 | self.g_func = get_func(0, 1) 409 | 410 | def train(self, dataset): 411 | """ 412 | Runs one epoch of SGD training on the specified dataset. 413 | 414 | Parameters 415 | ---------- 416 | dataset : Dataset 417 | """ 418 | if not hasattr(self, 'd_func'): 419 | raise Exception("train called without first calling setup") 420 | 421 | # Make sure none of the parameters have bad values 422 | for param in self.params: 423 | value = param.get_value(borrow=True) 424 | if np.any(np.isnan(value)) or np.any(np.isinf(value)): 425 | raise Exception("NaN in " + param.name) 426 | 427 | self.first = False 428 | rng = self.rng 429 | if not is_stochastic(self.train_iteration_mode): 430 | rng = None 431 | 432 | data_specs = self.cost.get_data_specs(self.model) 433 | 434 | # The iterator should be built from flat data specs, so it returns 435 | # flat, non-redundent tuples of data. 436 | mapping = DataSpecsMapping(data_specs) 437 | space_tuple = mapping.flatten(data_specs[0], return_tuple=True) 438 | source_tuple = mapping.flatten(data_specs[1], return_tuple=True) 439 | if len(space_tuple) == 0: 440 | # No data will be returned by the iterator, and it is impossible 441 | # to know the size of the actual batch. 442 | # It is not decided yet what the right thing to do should be. 443 | raise NotImplementedError("Unable to train with SGD, because " 444 | "the cost does not actually use data from the data set. " 445 | "data_specs: %s" % str(data_specs)) 446 | flat_data_specs = (CompositeSpace(space_tuple), source_tuple) 447 | 448 | iterator = dataset.iterator(mode=self.train_iteration_mode, 449 | batch_size=self.batch_size, 450 | data_specs=flat_data_specs, return_tuple=True, 451 | rng = rng, num_batches = self.batches_per_iter) 452 | 453 | on_load_batch = self.on_load_batch 454 | i = self.i 455 | for batch in iterator: 456 | for callback in on_load_batch: 457 | callback(*batch) 458 | self.d_func(*batch) 459 | i += 1 460 | if i == self.discriminator_steps: 461 | # Generator doesn't actually use the data so we want to 462 | # re-use this batch. Could save memory by making the 463 | # code not expect data in the interface. 
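                # With discriminator_steps == k, the schedule over minibatches
                # is therefore k discriminator updates followed by one
                # generator update on the k-th batch: D ... D G, D ... D G, ...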
464 | self.g_func(*batch) 465 | i = 0 466 | # iterator might return a smaller batch if dataset size 467 | # isn't divisible by batch_size 468 | # Note: if data_specs[0] is a NullSpace, there is no way to know 469 | # how many examples would actually have been in the batch, 470 | # since it was empty, so actual_batch_size would be reported as 0. 471 | actual_batch_size = flat_data_specs[0].np_batch_size(batch) 472 | self.monitor.report_batch(actual_batch_size) 473 | for callback in self.update_callbacks: 474 | callback(self) 475 | 476 | # Make sure none of the parameters have bad values 477 | for param in self.params: 478 | value = param.get_value(borrow=True) 479 | if np.any(np.isnan(value)) or np.any(np.isinf(value)): 480 | raise Exception("NaN in " + param.name) 481 | self.i = i 482 | 483 | def continue_learning(self, model): 484 | """ 485 | Returns True if the algorithm should continue running, or False 486 | if it has reached convergence / started overfitting and should 487 | stop. 488 | 489 | Parameters 490 | ---------- 491 | model : a Model instance 492 | """ 493 | if self.termination_criterion is None: 494 | return True 495 | else: 496 | return self.termination_criterion.continue_learning(self.model) 497 | 498 | class MonitorBasedLRAdjuster(TrainExtension): 499 | """ 500 | A TrainExtension that uses the on_monitor callback to adjust 501 | the learning rate on each epoch. It pulls out a channel 502 | from the model's monitor and adjusts the learning rate 503 | based on what happened to the monitoring channel on the last 504 | epoch. If the channel is greater than high_trigger times 505 | its previous value, the learning rate will be scaled by 506 | shrink_amt (which should be < 1 for this scheme to make 507 | sense). The idea is that in this case the learning algorithm 508 | is overshooting the bottom of the objective function. 509 | 510 | If the objective is less than high_trigger but 511 | greater than low_trigger times its previous value, the 512 | learning rate will be scaled by grow_amt (which should be > 1 513 | for this scheme to make sense). The idea is that the learning 514 | algorithm is making progress but at too slow of a rate. 515 | 516 | Parameters 517 | ---------- 518 | high_trigger : float, optional 519 | See class-level docstring 520 | low_trigger : float, optional 521 | See class-level docstring 522 | grow_amt : float, optional 523 | See class-level docstring 524 | min_lr : float, optional 525 | All updates to the learning rate are clipped to be at least 526 | this value. 527 | max_lr : float, optional 528 | All updates to the learning rate are clipped to be at most 529 | this value. 530 | dataset_name : str, optional 531 | If specified, use dataset_name + "_objective" as the channel 532 | to guide the learning rate adaptation. 533 | channel_name : str, optional 534 | If specified, use channel_name as the channel to guide the 535 | learning rate adaptation. Conflicts with dataset_name. 
536 | If neither dataset_name nor channel_name is specified, uses 537 | "objective" 538 | """ 539 | 540 | def __init__(self, high_trigger=1., shrink_amt=.99, 541 | low_trigger=.99, grow_amt=1.01, 542 | min_lr = 1e-7, max_lr = 1., 543 | dataset_name=None, channel_name=None): 544 | self.high_trigger = high_trigger 545 | self.shrink_amt = shrink_amt 546 | self.low_trigger = low_trigger 547 | self.grow_amt = grow_amt 548 | self.min_lr = min_lr 549 | self.max_lr = max_lr 550 | self.dataset_name = None 551 | if channel_name is not None: 552 | self.channel_name = channel_name 553 | else: 554 | if dataset_name is not None: 555 | self.channel_name = dataset_name + '_objective' 556 | self.dataset_name = dataset_name 557 | else: 558 | self.channel_name = None 559 | 560 | def on_monitor(self, model, dataset, algorithm): 561 | """ 562 | Adjusts the learning rate based on the contents of model.monitor 563 | 564 | Parameters 565 | ---------- 566 | model : a Model instance 567 | dataset : Dataset 568 | algorithm : WRITEME 569 | """ 570 | model = algorithm.model 571 | lr = algorithm.learning_rate 572 | current_learning_rate = lr.get_value() 573 | assert hasattr(model, 'monitor'), ("no monitor associated with " 574 | + str(model)) 575 | monitor = model.monitor 576 | monitor_channel_specified = True 577 | 578 | if self.channel_name is None: 579 | monitor_channel_specified = False 580 | channels = [elem for elem in monitor.channels 581 | if elem.endswith("objective")] 582 | if len(channels) < 1: 583 | raise ValueError("There are no monitoring channels that end " 584 | "with \"objective\". Please specify either " 585 | "channel_name or dataset_name.") 586 | elif len(channels) > 1: 587 | datasets = algorithm.monitoring_dataset.keys() 588 | raise ValueError("There are multiple monitoring channels that" 589 | "end with \"_objective\". The list of available " 590 | "datasets are: " + 591 | str(datasets) + " . Please specify either " 592 | "channel_name or dataset_name in the " 593 | "MonitorBasedLRAdjuster constructor to " 594 | 'disambiguate.') 595 | else: 596 | self.channel_name = channels[0] 597 | warnings.warn('The channel that has been chosen for ' 598 | 'monitoring is: ' + 599 | str(self.channel_name) + '.') 600 | 601 | try: 602 | v = monitor.channels[self.channel_name].val_record 603 | except KeyError: 604 | err_input = '' 605 | if monitor_channel_specified: 606 | if self.dataset_name: 607 | err_input = 'The dataset_name \'' + str( 608 | self.dataset_name) + '\' is not valid.' 609 | else: 610 | err_input = 'The channel_name \'' + str( 611 | self.channel_name) + '\' is not valid.' 612 | err_message = 'There is no monitoring channel named \'' + \ 613 | str(self.channel_name) + '\'. You probably need to ' + \ 614 | 'specify a valid monitoring channel by using either ' + \ 615 | 'dataset_name or channel_name in the ' + \ 616 | 'MonitorBasedLRAdjuster constructor. ' + err_input 617 | raise ValueError(err_message) 618 | 619 | if len(v) < 1: 620 | if monitor.dataset is None: 621 | assert len(v) == 0 622 | raise ValueError("You're trying to use a monitor-based " 623 | "learning rate adjustor but the monitor has no " 624 | "entries because you didn't specify a " 625 | "monitoring dataset.") 626 | 627 | raise ValueError("For some reason there are no monitor entries" 628 | "yet the MonitorBasedLRAdjuster has been " 629 | "called. This should never happen. The Train" 630 | " object should call the monitor once on " 631 | "initialization, then call the callbacks. 
" 632 | "It seems you are either calling the " 633 | "callback manually rather than as part of a " 634 | "training algorithm, or there is a problem " 635 | "with the Train object.") 636 | if len(v) == 1: 637 | #only the initial monitoring has happened 638 | #no learning has happened, so we can't adjust the learning rate yet 639 | #just do nothing 640 | return 641 | 642 | rval = current_learning_rate 643 | 644 | log.info("monitoring channel is {0}".format(self.channel_name)) 645 | 646 | if v[-1] > self.high_trigger * v[-2]: 647 | rval *= self.shrink_amt 648 | log.info("shrinking learning rate to %f" % rval) 649 | elif v[-1] > self.low_trigger * v[-2]: 650 | rval *= self.grow_amt 651 | log.info("growing learning rate to %f" % rval) 652 | 653 | rval = max(self.min_lr, rval) 654 | rval = min(self.max_lr, rval) 655 | 656 | lr.set_value(np.cast[lr.dtype](rval)) 657 | 658 | 659 | class PatienceBasedTermCrit(object): 660 | """ 661 | A monitor-based termination criterion using a geometrically increasing 662 | amount of patience. If the selected channel has decreased by a certain 663 | proportion when comparing to the lowest value seen yet, the patience is 664 | set to a factor of the number of examples seen, which by default 665 | (patience_increase=2.) ensures the model has seen as many examples as the 666 | number of examples that lead to the lowest value before concluding a local 667 | optima has been reached. 668 | 669 | Note: Technically, the patience corresponds to a number of epochs to be 670 | independent of the size of the dataset, so be aware of that when choosing 671 | initial_patience. 672 | 673 | Parameters 674 | ---------- 675 | prop_decrease : float 676 | The factor X in the (1 - X) * best_value threshold 677 | initial_patience : int 678 | Minimal number of epochs the model has to run before it can stop 679 | patience_increase : float, optional 680 | The factor X in the patience = X * n_iter update. 681 | channel_name : string, optional 682 | Name of the channel to examine. If None and the monitor 683 | has only one channel, this channel will be used; otherwise, an 684 | error will be raised. 685 | """ 686 | def __init__(self, prop_decrease, initial_patience, 687 | patience_increase=2., channel_name=None): 688 | self._channel_name = channel_name 689 | self.prop_decrease = prop_decrease 690 | self.patience = initial_patience 691 | self.best_value = np.inf 692 | self.patience_increase = patience_increase 693 | 694 | def __call__(self, model): 695 | """ 696 | Returns True or False depending on whether the optimization should 697 | stop or not. The optimization should stop if it has run for a number 698 | of epochs superior to the patience without any improvement. 699 | 700 | Parameters 701 | ---------- 702 | model : Model 703 | The model used in the experiment and from which the monitor used 704 | in the termination criterion will be extracted. 705 | 706 | Returns 707 | ------- 708 | bool 709 | True or False, indicating if the optimization should stop or not. 710 | """ 711 | monitor = model.monitor 712 | # In the case the monitor has only one channel, the channel_name can 713 | # be omitted and the criterion will examine the only channel 714 | # available. However, if the monitor has multiple channels, leaving 715 | # the channel_name unspecified will raise an error. 
716 | if self._channel_name is None: 717 | if len(monitor.channels) != 1: 718 | raise ValueError("Only single-channel monitors are supported " 719 | "for channel_name == None") 720 | v = monitor.channels.values()[0].val_record 721 | else: 722 | v = monitor.channels[self._channel_name].val_record 723 | # If the channel value decrease is higher than the threshold, we 724 | # update the best value to this value and we update the patience. 725 | if v[-1] < self.best_value * (1. - self.prop_decrease): 726 | # Using the max between actual patience and updated patience 727 | # ensures that the model will run for at least the initial 728 | # patience and that it would behave correctly if the user 729 | # chooses a dumb value (i.e. less than 1) 730 | self.patience = max(self.patience, len(v) * self.patience_increase) 731 | self.best_value = v[-1] 732 | 733 | return len(v) < self.patience 734 | 735 | 736 | class AnnealedLearningRate(object): 737 | """ 738 | This is a callback for the SGD algorithm rather than the Train object. 739 | This anneals the learning rate to decrease as 1/t where t is the number 740 | of gradient descent updates done so far. Use OneOverEpoch as Train object 741 | callback if you would prefer 1/t where t is epochs. 742 | 743 | Parameters 744 | ---------- 745 | anneal_start : int 746 | The epoch on which to begin annealing 747 | """ 748 | def __init__(self, anneal_start): 749 | self._initialized = False 750 | self._count = 0 751 | self._anneal_start = anneal_start 752 | 753 | def __call__(self, algorithm): 754 | """ 755 | Updates the learning rate according to the annealing schedule. 756 | 757 | Parameters 758 | ---------- 759 | algorithm : WRITEME 760 | """ 761 | if not self._initialized: 762 | self._base = algorithm.learning_rate.get_value() 763 | self._count += 1 764 | algorithm.learning_rate.set_value(self.current_learning_rate()) 765 | 766 | def current_learning_rate(self): 767 | """ 768 | Returns the current desired learning rate according to the 769 | annealing schedule. 770 | """ 771 | return self._base * min(1, self._anneal_start / self._count) 772 | 773 | class ExponentialDecay(object): 774 | """ 775 | This is a callback for the `SGD` algorithm rather than the `Train` object. 776 | This anneals the learning rate by dividing by decay_factor after each 777 | gradient descent step. It will not shrink the learning rate beyond 778 | `min_lr`. 779 | 780 | Parameters 781 | ---------- 782 | decay_factor : float 783 | The learning rate at step t is given by 784 | `init_learning_rate / (decay_factor ** t)` 785 | min_lr : float 786 | The learning rate will be clipped to be at least this value 787 | """ 788 | 789 | def __init__(self, decay_factor, min_lr): 790 | if isinstance(decay_factor, str): 791 | decay_factor = float(decay_factor) 792 | if isinstance(min_lr, str): 793 | min_lr = float(min_lr) 794 | assert isinstance(decay_factor, float) 795 | assert isinstance(min_lr, float) 796 | self.__dict__.update(locals()) 797 | del self.self 798 | self._count = 0 799 | self._min_reached = False 800 | 801 | def __call__(self, algorithm): 802 | """ 803 | Updates the learning rate according to the exponential decay schedule. 804 | 805 | Parameters 806 | ---------- 807 | algorithm : SGD 808 | The SGD instance whose `learning_rate` field should be modified. 
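
        Notes
        -----
        As stated in the class docstring, the schedule is
        `lr_t = init_lr / decay_factor ** t`, clipped from below at `min_lr`;
        for example, with a decay_factor of 1.00004 the rate halves roughly
        every 17,000 updates.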
809 | """ 810 | if self._count == 0: 811 | self._base_lr = algorithm.learning_rate.get_value() 812 | self._count += 1 813 | 814 | if not self._min_reached: 815 | # If we keep on executing the exponentiation on each mini-batch, 816 | # we will eventually get an OverflowError. So make sure we 817 | # only do the computation until min_lr is reached. 818 | new_lr = self._base_lr / (self.decay_factor ** self._count) 819 | if new_lr <= self.min_lr: 820 | self._min_reached = True 821 | new_lr = self.min_lr 822 | else: 823 | new_lr = self.min_lr 824 | 825 | new_lr = np.cast[config.floatX](new_lr) 826 | algorithm.learning_rate.set_value(new_lr) 827 | 828 | class LinearDecay(object): 829 | """ 830 | This is a callback for the SGD algorithm rather than the Train object. 831 | This anneals the learning rate to decay_factor times of the initial value 832 | during time start till saturate. 833 | 834 | Parameters 835 | ---------- 836 | start : int 837 | The step at which to start decreasing the learning rate 838 | saturate : int 839 | The step at which to stop decreating the learning rate 840 | decay_factor : float 841 | `final learning rate = decay_factor * initial learning rate` 842 | """ 843 | 844 | def __init__(self, start, saturate, decay_factor): 845 | if isinstance(decay_factor, str): 846 | decay_factor = float(decay_factor) 847 | if isinstance(start, str): 848 | start = float(start) 849 | if isinstance(saturate, str): 850 | saturate = float(saturate) 851 | assert isinstance(decay_factor, float) 852 | assert isinstance(start, (py_integer_types, py_float_types)) 853 | assert isinstance(saturate, (py_integer_types, py_float_types)) 854 | assert saturate > start 855 | assert start > 0 856 | self.__dict__.update(locals()) 857 | del self.self 858 | self._count = 0 859 | 860 | def __call__(self, algorithm): 861 | """ 862 | Adjusts the learning rate according to the linear decay schedule 863 | 864 | Parameters 865 | ---------- 866 | algorithm : WRITEME 867 | """ 868 | if self._count == 0: 869 | self._base_lr = algorithm.learning_rate.get_value() 870 | self._step = ((self._base_lr - self._base_lr * self.decay_factor) / 871 | (self.saturate - self.start + 1)) 872 | self._count += 1 873 | if self._count >= self.start: 874 | if self._count < self.saturate: 875 | new_lr = self._base_lr - self._step * (self._count 876 | - self.start + 1) 877 | else: 878 | new_lr = self._base_lr * self.decay_factor 879 | else: 880 | new_lr = self._base_lr 881 | assert new_lr > 0 882 | new_lr = np.cast[config.floatX](new_lr) 883 | algorithm.learning_rate.set_value(new_lr) 884 | 885 | 886 | def MomentumAdjustor(final_momentum, start, saturate): 887 | """ 888 | Deprecated class used with the deprecated init_momentum argument. 889 | Use learning_rule.MomentumAdjustor instead. 890 | 891 | Parameters 892 | ---------- 893 | final_momentum : WRITEME 894 | start : WRITEME 895 | saturate : WRITEME 896 | """ 897 | warnings.warn("sgd.MomentumAdjustor interface is deprecated and will " 898 | "become officially unsupported as of May 9, 2014. 
Please use " 899 | "`learning_rule.MomentumAdjustor` instead.") 900 | return LRMomentumAdjustor(final_momentum, start, saturate) 901 | 902 | 903 | class OneOverEpoch(TrainExtension): 904 | """ 905 | Scales the learning rate like one over # epochs 906 | 907 | Parameters 908 | ---------- 909 | start : int 910 | The epoch on which to start shrinking the learning rate 911 | half_life : int, optional 912 | How many epochs after start it will take for the learning rate to lose 913 | half its value for the first time (to lose the next half of its value 914 | will take twice as long) 915 | min_lr : float, optional 916 | The minimum value the learning rate can take on 917 | """ 918 | def __init__(self, start, half_life = None, min_lr = 1e-6): 919 | self.__dict__.update(locals()) 920 | del self.self 921 | self._initialized = False 922 | self._count = 0 923 | assert start >= 0 924 | if half_life is None: 925 | self.half_life = start + 1 926 | else: 927 | assert half_life > 0 928 | 929 | def on_monitor(self, model, dataset, algorithm): 930 | """ 931 | Adjusts the learning rate according to the decay schedule. 932 | 933 | Parameters 934 | ---------- 935 | model : a Model instance 936 | dataset : Dataset 937 | algorithm : WRITEME 938 | """ 939 | 940 | if not self._initialized: 941 | self._init_lr = algorithm.learning_rate.get_value() 942 | if self._init_lr < self.min_lr: 943 | raise ValueError("The initial learning rate is smaller than " + 944 | "the minimum allowed learning rate.") 945 | self._initialized = True 946 | self._count += 1 947 | algorithm.learning_rate.set_value(np.cast[config.floatX]( 948 | self.current_lr())) 949 | 950 | def current_lr(self): 951 | """ 952 | Returns the learning rate currently desired by the decay schedule. 953 | """ 954 | if self._count < self.start: 955 | scale = 1 956 | else: 957 | scale = float(self.half_life) / float(self._count - self.start 958 | + self.half_life) 959 | lr = self._init_lr * scale 960 | clipped = max(self.min_lr, lr) 961 | return clipped 962 | 963 | class LinearDecayOverEpoch(TrainExtension): 964 | """ 965 | Scales the learning rate linearly on each epochs 966 | 967 | Parameters 968 | ---------- 969 | start : int 970 | The epoch on which to start shrinking the learning rate 971 | saturate : int 972 | The epoch to saturate the shrinkage 973 | decay_factor : float 974 | The final value would be initial learning rate times decay_factor 975 | """ 976 | 977 | def __init__(self, start, saturate, decay_factor): 978 | self.__dict__.update(locals()) 979 | del self.self 980 | self._initialized = False 981 | self._count = 0 982 | assert isinstance(decay_factor, float) 983 | assert isinstance(start, (py_integer_types, py_float_types)) 984 | assert isinstance(saturate, (py_integer_types, py_float_types)) 985 | assert saturate > start 986 | assert start >= 0 987 | assert saturate >= start 988 | 989 | def on_monitor(self, model, dataset, algorithm): 990 | """ 991 | Updates the learning rate based on the linear decay schedule. 
992 | 993 | Parameters 994 | ---------- 995 | model : a Model instance 996 | dataset : Dataset 997 | algorithm : WRITEME 998 | """ 999 | if not self._initialized: 1000 | self._init_lr = algorithm.learning_rate.get_value() 1001 | self._step = ((self._init_lr - self._init_lr * self.decay_factor) / 1002 | (self.saturate - self.start + 1)) 1003 | self._initialized = True 1004 | self._count += 1 1005 | algorithm.learning_rate.set_value(np.cast[config.floatX]( 1006 | self.current_lr())) 1007 | 1008 | def current_lr(self): 1009 | """ 1010 | Returns the learning rate currently desired by the decay schedule. 1011 | """ 1012 | if self._count >= self.start: 1013 | if self._count < self.saturate: 1014 | new_lr = self._init_lr - self._step * (self._count 1015 | - self.start + 1) 1016 | else: 1017 | new_lr = self._init_lr * self.decay_factor 1018 | else: 1019 | new_lr = self._init_lr 1020 | assert new_lr > 0 1021 | return new_lr 1022 | 1023 | class _PolyakWorker(object): 1024 | """ 1025 | Only to be used by the PolyakAveraging TrainingCallback below. 1026 | Do not use directly. 1027 | A callback for the SGD class. 1028 | 1029 | Parameters 1030 | ---------- 1031 | model : a Model 1032 | The model whose parameters we want to train with Polyak averaging 1033 | """ 1034 | 1035 | def __init__(self, model): 1036 | avg_updates = OrderedDict() 1037 | t = sharedX(1.) 1038 | self.param_to_mean = OrderedDict() 1039 | for param in model.get_params(): 1040 | mean = sharedX(param.get_value()) 1041 | assert type(mean) == type(param) 1042 | self.param_to_mean[param] = mean 1043 | avg_updates[mean] = mean - (mean - param) / t 1044 | avg_updates[t] = t + 1. 1045 | self.avg = function([], updates = avg_updates) 1046 | 1047 | def __call__(self, algorithm): 1048 | """ 1049 | To be called after each SGD step. 1050 | Updates the Polyak averaged-parameters for this model 1051 | 1052 | Parameters 1053 | ---------- 1054 | algorithm : WRITEME 1055 | """ 1056 | self.avg() 1057 | 1058 | class PolyakAveraging(TrainExtension): 1059 | """ 1060 | See "A Tutorial on Stochastic Approximation Algorithms 1061 | for Training Restricted Boltzmann Machines and 1062 | Deep Belief Nets" by Kevin Swersky et al 1063 | 1064 | This functionality is still a work in progress. Currently, 1065 | your model needs to implement "add_polyak_channels" to 1066 | use it. 1067 | 1068 | The problem is that Polyak averaging shouldn't modify 1069 | the model parameters. It should keep a second copy 1070 | that it averages in the background. This second copy 1071 | doesn't get to come back in and affect the learning process 1072 | though. 1073 | 1074 | (IG tried having the second copy get pushed back into 1075 | the model once per epoch, but this turned out to be 1076 | harmful, at least in limited tests) 1077 | 1078 | So we need a cleaner interface for monitoring the 1079 | averaged copy of the parameters, and we need to make 1080 | sure the saved model at the end uses the averaged 1081 | parameters, not the parameters used for computing 1082 | the gradients during training. 1083 | 1084 | TODO: make use of the new on_save callback instead 1085 | of duplicating Train's save_freq flag 1086 | 1087 | Parameters 1088 | ---------- 1089 | start : int 1090 | The epoch after which to start averaging (0 = start averaging 1091 | immediately) 1092 | save_path : str, optional 1093 | WRITEME 1094 | save_freq : int, optional 1095 | WRITEME 1096 | 1097 | Notes 1098 | ----- 1099 | This is usually used with a fixed, rather than annealed learning 1100 | rate. 
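The averaged copy is maintained incrementally by the _PolyakWorker above,
    which applies mean := mean - (mean - param) / t and then increments t, so
    `mean` is the simple average of every parameter value seen since averaging
    began.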
It may be used in conjunction with momentum. 1101 | """ 1102 | 1103 | def __init__(self, start, save_path=None, save_freq=1): 1104 | self.__dict__.update(locals()) 1105 | del self.self 1106 | self._count = 0 1107 | assert isinstance(start, py_integer_types) 1108 | assert start >= 0 1109 | 1110 | def on_monitor(self, model, dataset, algorithm): 1111 | """ 1112 | Make sure Polyak-averaged model gets monitored. 1113 | Save the model if necessary. 1114 | 1115 | Parameters 1116 | ---------- 1117 | model : a Model instance 1118 | dataset : Dataset 1119 | algorithm : WRITEME 1120 | """ 1121 | if self._count == self.start: 1122 | self._worker = _PolyakWorker(model) 1123 | algorithm.update_callbacks.append(self._worker) 1124 | #HACK 1125 | try: 1126 | model.add_polyak_channels(self._worker.param_to_mean, 1127 | algorithm.monitoring_dataset) 1128 | except AttributeError: 1129 | pass 1130 | elif self.save_path is not None and self._count > self.start and \ 1131 | self._count % self.save_freq == 0: 1132 | saved_params = OrderedDict() 1133 | for param in model.get_params(): 1134 | saved_params[param] = param.get_value() 1135 | param.set_value(self._worker.param_to_mean[param].get_value()) 1136 | serial.save(self.save_path, model) 1137 | for param in model.get_params(): 1138 | param.set_value(saved_params[param]) 1139 | self._count += 1 1140 | -------------------------------------------------------------------------------- /sgd_alt.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copy of pylearn2's sgd.py, hacked to support alternating between 3 | epochs of updating only the discriminator and epochs of updating 4 | both discriminator and generator. Ideally this would 5 | be accomplished using pylearn2's FixedVarDescr implementation, 6 | but it is currently not very well supported. 7 | """ 8 | from __future__ import division 9 | 10 | __authors__ = "Ian Goodfellow" 11 | __copyright__ = "Copyright 2010-2012, Universite de Montreal" 12 | __credits__ = ["Ian Goodfellow, David Warde-Farley"] 13 | __license__ = "3-clause BSD" 14 | __maintainer__ = "David Warde-Farley" 15 | __email__ = "pylearn-dev@googlegroups" 16 | 17 | import logging 18 | import warnings 19 | import numpy as np 20 | 21 | from theano import config 22 | from theano import function 23 | from theano.compat.python2x import OrderedDict 24 | from theano.gof.op import get_debug_values 25 | 26 | from pylearn2.monitor import Monitor 27 | from pylearn2.space import CompositeSpace, NullSpace 28 | from pylearn2.train_extensions import TrainExtension 29 | from pylearn2.training_algorithms.training_algorithm import TrainingAlgorithm 30 | from pylearn2.training_algorithms.learning_rule import Momentum 31 | from pylearn2.training_algorithms.learning_rule import MomentumAdjustor \ 32 | as LRMomentumAdjustor 33 | from pylearn2.utils.iteration import is_stochastic, has_uniform_batch_size 34 | from pylearn2.utils import py_integer_types, py_float_types 35 | from pylearn2.utils import safe_zip 36 | from pylearn2.utils import serial 37 | from pylearn2.utils import sharedX 38 | from pylearn2.utils.data_specs import DataSpecsMapping 39 | from pylearn2.utils.timing import log_timing 40 | from pylearn2.utils.rng import make_np_rng 41 | 42 | 43 | log = logging.getLogger(__name__) 44 | 45 | 46 | class SGD(TrainingAlgorithm): 47 | """ 48 | SGD = (Minibatch) Stochastic Gradient Descent. 49 | A TrainingAlgorithm that does stochastic gradient descent on minibatches 50 | of training examples. 
51 | 52 | For theoretical background on this algorithm, see Yoshua Bengio's machine 53 | learning course notes on the subject: 54 | 55 | http://www.iro.umontreal.ca/~pift6266/H10/notes/gradient.html 56 | 57 | Parameters 58 | ---------- 59 | learning_rate : float 60 | The learning rate to use. Train object callbacks can change the 61 | learning rate after each epoch. SGD update_callbacks can change 62 | it after each minibatch. 63 | cost : pylearn2.costs.cost.Cost, optional 64 | Cost object specifying the objective function to be minimized. 65 | Optionally, may be None. In this case, SGD will call the model's 66 | get_default_cost method to obtain the objective function. 67 | batch_size : int, optional 68 | The size of the batch to be used. 69 | If not specified, the model will be asked for the batch size, so 70 | you must have specified the batch size there. 71 | (Some models are rigidly defined to only work with one batch size) 72 | monitoring_batch_size : int, optional 73 | The size of the monitoring batches. 74 | monitoring_batches : int, optional 75 | At the start of each epoch, we run "monitoring", to evaluate 76 | quantities such as the validation set error. 77 | monitoring_batches, if specified, determines the number of batches 78 | to draw from the iterator for each monitoring dataset. 79 | Unnecessary if not using monitoring or if `monitor_iteration_mode` 80 | is 'sequential' and `batch_size` is specified (number of 81 | batches will be calculated based on full dataset size). 82 | TODO: make it possible to specify different monitoring_batches 83 | for each monitoring dataset. The Monitor itself already supports 84 | this. 85 | monitoring_dataset : Dataset or dictionary, optional 86 | If not specified, no monitoring is used. 87 | If specified to be a Dataset, monitor on that Dataset. 88 | If specified to be dictionary, the keys should be string names 89 | of datasets, and the values should be Datasets. All monitoring 90 | channels will be computed for all monitoring Datasets and will 91 | have the dataset name and an underscore prepended to them. 92 | monitor_iteration_mode : str, optional 93 | The iteration mode used to iterate over the examples in all 94 | monitoring datasets. If not specified, defaults to 'sequential'. 95 | TODO: make it possible to specify different modes for different 96 | datasets. 97 | termination_criterion : instance of \ 98 | pylearn2.termination_criteria.TerminationCriterion, optional 99 | 100 | Used to determine when the algorithm should stop running. 101 | If not specified, runs forever--or more realistically, until 102 | external factors halt the python process (Kansas 1977). 103 | update_callbacks : list, optional 104 | If specified, each member of the list should be a callable that 105 | accepts an SGD instance as its only argument. 106 | All callbacks will be called with this SGD instance after each 107 | SGD step. 108 | learning_rule : training_algorithms.learning_rule.LearningRule, optional 109 | A learning rule computes the new parameter values given old 110 | parameters and first-order gradients. If learning_rule is None, 111 | sgd.SGD will update parameters according to the standard SGD 112 | learning rule: 113 | 114 | .. code-block:: none 115 | 116 | param := param - learning_rate * d cost / d param 117 | 118 | This argument allows more sophisticated learning rules, such 119 | as SGD with momentum. 120 | init_momentum : float, **DEPRECATED** option 121 | Use learning_rule instead. 
122 | If None, does not use momentum otherwise, use momentum and 123 | initialize the momentum coefficient to init_momentum. Callbacks 124 | can change this over time just like the learning rate. If the 125 | gradient is the same on every step, then the update taken by the 126 | SGD algorithm is scaled by a factor of 1/(1-momentum). See 127 | section 9 of Geoffrey Hinton's "A Practical Guide to Training 128 | Restricted Boltzmann Machines" for details. 129 | set_batch_size : bool, optional 130 | Defaults to False. 131 | If True, and batch_size conflicts with model.force_batch_size, 132 | will call model.set_batch_size(batch_size) in an attempt to 133 | change model.force_batch_size 134 | train_iteration_mode : str, optional 135 | Defaults to 'shuffled_sequential'. 136 | The iteration mode to use for iterating through training examples. 137 | batches_per_iter : int, optional 138 | The number of batches to draw from the iterator over training 139 | examples. 140 | If iteration mode is 'sequential' or 'shuffled_sequential', this 141 | is unnecessary; when unspecified we will iterate over all examples. 142 | theano_function_mode : a valid argument to theano.function's \ 143 | 'mode' parameter, optional 144 | 145 | The theano mode to compile the updates function with. Note that 146 | pylearn2 includes some wraplinker modes that are not bundled with 147 | theano. See pylearn2.devtools. These extra modes let you do 148 | things like check for NaNs at every step, or record md5 digests 149 | of all computations performed by the update function to help 150 | isolate problems with nondeterminism. 151 | monitoring_costs : list, optional 152 | a list of Cost instances. The Monitor will also include all 153 | channels defined by these Costs, even though we don't train 154 | using them. 155 | seed : valid argument to np.random.RandomState, optional 156 | The seed used for the random number generate to be passed to the 157 | training dataset iterator (if any) 158 | """ 159 | def __init__(self, learning_rate, cost=None, batch_size=None, 160 | monitoring_batch_size=None, monitoring_batches=None, 161 | monitoring_dataset=None, monitor_iteration_mode='sequential', 162 | termination_criterion=None, update_callbacks=None, 163 | learning_rule = None, init_momentum = None, 164 | set_batch_size = False, 165 | train_iteration_mode = None, batches_per_iter=None, 166 | theano_function_mode = None, monitoring_costs=None, 167 | seed=[2012, 10, 5], discriminator_steps=1): 168 | 169 | self.discriminator_steps = discriminator_steps 170 | self.train_generator = 0 171 | 172 | if isinstance(cost, (list, tuple, set)): 173 | raise TypeError("SGD no longer supports using collections of " + 174 | "Costs to represent a sum of Costs. Use " + 175 | "pylearn2.costs.cost.SumOfCosts instead.") 176 | 177 | if init_momentum: 178 | warnings.warn("init_momentum interface is deprecated and will " 179 | "become officially unsuported as of May 9, 2014. Please use the " 180 | "`learning_rule` parameter instead, providing an object of type " 181 | "`pylearn2.training_algorithms.learning_rule.Momentum` instead") 182 | # Convert to new interface under the hood. 
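            # Wrapping the deprecated scalar in a Momentum learning rule lets
            # the rest of the algorithm deal only with the learning_rule
            # interface.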
183 | self.learning_rule = Momentum(init_momentum) 184 | else: 185 | self.learning_rule = learning_rule 186 | 187 | self.learning_rate = sharedX(learning_rate, 'learning_rate') 188 | self.cost = cost 189 | self.batch_size = batch_size 190 | self.set_batch_size = set_batch_size 191 | self.batches_per_iter = batches_per_iter 192 | self._set_monitoring_dataset(monitoring_dataset) 193 | self.monitoring_batch_size = monitoring_batch_size 194 | self.monitoring_batches = monitoring_batches 195 | self.monitor_iteration_mode = monitor_iteration_mode 196 | if monitoring_dataset is None: 197 | if monitoring_batch_size is not None: 198 | raise ValueError("Specified a monitoring batch size " + 199 | "but not a monitoring dataset.") 200 | if monitoring_batches is not None: 201 | raise ValueError("Specified an amount of monitoring batches " + 202 | "but not a monitoring dataset.") 203 | self.termination_criterion = termination_criterion 204 | self._register_update_callbacks(update_callbacks) 205 | if train_iteration_mode is None: 206 | train_iteration_mode = 'shuffled_sequential' 207 | self.train_iteration_mode = train_iteration_mode 208 | self.first = True 209 | self.rng = make_np_rng(seed, which_method=["randn","randint"]) 210 | self.theano_function_mode = theano_function_mode 211 | self.monitoring_costs = monitoring_costs 212 | 213 | def setup(self, model, dataset): 214 | """ 215 | Compiles the theano functions needed for the train method. 216 | 217 | Parameters 218 | ---------- 219 | model : a Model instance 220 | dataset : Dataset 221 | """ 222 | if self.cost is None: 223 | self.cost = model.get_default_cost() 224 | 225 | inf_params = [param for param in model.get_params() 226 | if np.any(np.isinf(param.get_value()))] 227 | if len(inf_params) > 0: 228 | raise ValueError("These params are Inf: "+str(inf_params)) 229 | if any([np.any(np.isnan(param.get_value())) 230 | for param in model.get_params()]): 231 | nan_params = [param for param in model.get_params() 232 | if np.any(np.isnan(param.get_value()))] 233 | raise ValueError("These params are NaN: "+str(nan_params)) 234 | self.model = model 235 | 236 | self._synchronize_batch_size(model) 237 | model._test_batch_size = self.batch_size 238 | self.monitor = Monitor.get_monitor(model) 239 | self.monitor._sanity_check() 240 | 241 | # test if force batch size and batch size 242 | if getattr(model, "force_batch_size", False) and \ 243 | any(dataset.get_design_matrix().shape[0] % self.batch_size != 0 for 244 | dataset in self.monitoring_dataset.values()) and \ 245 | not has_uniform_batch_size(self.monitor_iteration_mode): 246 | 247 | raise ValueError("Dataset size is not a multiple of batch size." 248 | "You should set monitor_iteration_mode to " 249 | "even_sequential, even_shuffled_sequential or " 250 | "even_batchwise_shuffled_sequential") 251 | 252 | data_specs = self.cost.get_data_specs(self.model) 253 | mapping = DataSpecsMapping(data_specs) 254 | space_tuple = mapping.flatten(data_specs[0], return_tuple=True) 255 | source_tuple = mapping.flatten(data_specs[1], return_tuple=True) 256 | 257 | # Build a flat tuple of Theano Variables, one for each space. 258 | # We want that so that if the same space/source is specified 259 | # more than once in data_specs, only one Theano Variable 260 | # is generated for it, and the corresponding value is passed 261 | # only once to the compiled Theano function. 
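        # For example (a sketch): if the cost's data_specs were
        # (CompositeSpace([features_space, targets_space]),
        #  ('features', 'targets')), flatten() would yield the flat tuples
        # (features_space, targets_space) and ('features', 'targets'), and
        # the mapping.nest(theano_args) call below restores the nested
        # structure that self.cost.expr expects.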
262 | theano_args = [] 263 | for space, source in safe_zip(space_tuple, source_tuple): 264 | name = '%s[%s]' % (self.__class__.__name__, source) 265 | arg = space.make_theano_batch(name=name, 266 | batch_size=self.batch_size) 267 | theano_args.append(arg) 268 | theano_args = tuple(theano_args) 269 | 270 | # Methods of `self.cost` need args to be passed in a format compatible 271 | # with data_specs 272 | nested_args = mapping.nest(theano_args) 273 | fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args) 274 | self.on_load_batch = fixed_var_descr.on_load_batch 275 | 276 | cost_value = self.cost.expr(model, nested_args, 277 | ** fixed_var_descr.fixed_vars) 278 | 279 | if cost_value is not None and cost_value.name is None: 280 | # Concatenate the name of all tensors in theano_args !? 281 | cost_value.name = 'objective' 282 | 283 | # Set up monitor to model the objective value, learning rate, 284 | # momentum (if applicable), and extra channels defined by 285 | # the cost 286 | learning_rate = self.learning_rate 287 | if self.monitoring_dataset is not None: 288 | if (self.monitoring_batch_size is None and 289 | self.monitoring_batches is None): 290 | self.monitoring_batch_size = self.batch_size 291 | self.monitoring_batches = self.batches_per_iter 292 | self.monitor.setup(dataset=self.monitoring_dataset, 293 | cost=self.cost, 294 | batch_size=self.monitoring_batch_size, 295 | num_batches=self.monitoring_batches, 296 | extra_costs=self.monitoring_costs, 297 | mode=self.monitor_iteration_mode) 298 | dataset_name = self.monitoring_dataset.keys()[0] 299 | monitoring_dataset = self.monitoring_dataset[dataset_name] 300 | #TODO: have Monitor support non-data-dependent channels 301 | self.monitor.add_channel(name='learning_rate', 302 | ipt=None, 303 | val=learning_rate, 304 | data_specs=(NullSpace(), ''), 305 | dataset=monitoring_dataset) 306 | 307 | if self.learning_rule: 308 | self.learning_rule.add_channels_to_monitor( 309 | self.monitor, 310 | monitoring_dataset) 311 | 312 | params = list(model.get_params()) 313 | assert len(params) > 0 314 | for i, param in enumerate(params): 315 | if param.name is None: 316 | param.name = 'sgd_params[%d]' % i 317 | self.params = params 318 | 319 | 320 | grads, updates = self.cost.get_gradients(model, nested_args, 321 | ** fixed_var_descr.fixed_vars) 322 | if not isinstance(grads, OrderedDict): 323 | raise TypeError(str(type(self.cost)) + ".get_gradients returned " + 324 | "something with" + str(type(grads)) + "as its " + 325 | "first member. 
Expected OrderedDict.") 326 | 327 | for param in grads: 328 | assert param in params 329 | for param in params: 330 | assert param in grads 331 | 332 | lr_scalers = model.get_lr_scalers() 333 | 334 | for key in lr_scalers: 335 | if key not in params: 336 | raise ValueError("Tried to scale the learning rate on " +\ 337 | str(key)+" which is not an optimization parameter.") 338 | 339 | assert len(updates.keys()) == 0 340 | 341 | def get_func(learn_discriminator, learn_generator, dont_you_fucking_dare_touch_the_generator=False): 342 | 343 | updates = OrderedDict() 344 | 345 | assert (learn_discriminator or learn_generator) and not (learn_discriminator and learn_generator) 346 | 347 | if learn_discriminator: 348 | cur_params = model.discriminator.get_params() 349 | else: 350 | cur_params = model.generator.get_params() 351 | 352 | def check(): 353 | for param in params: 354 | if param not in cur_params: 355 | assert param not in updates 356 | 357 | cur_grads = OrderedDict() 358 | for param in cur_params: 359 | cur_grads[param] = grads[param] 360 | 361 | for param in grads: 362 | if grads[param].name is None and cost_value is not None: 363 | grads[param].name = ('grad(%(costname)s, %(paramname)s)' % 364 | {'costname': cost_value.name, 365 | 'paramname': param.name}) 366 | assert grads[param].dtype == param.dtype 367 | 368 | cur_lr_scalers = OrderedDict() 369 | for param in cur_params: 370 | if param in lr_scalers: 371 | lr_scaler = lr_scalers[param] 372 | cur_lr_scalers[param] = lr_scaler 373 | 374 | log.info('Parameter and initial learning rate summary:') 375 | for param in cur_params: 376 | param_name = param.name 377 | if param_name is None: 378 | param_name = 'anon_param' 379 | lr = learning_rate.get_value() * cur_lr_scalers.get(param,1.) 380 | log.info('\t' + param_name + ': ' + str(lr)) 381 | 382 | updates.update(self.learning_rule.get_updates( 383 | learning_rate, cur_grads, cur_lr_scalers)) 384 | check() 385 | 386 | for param in cur_params: 387 | if updates[param].name is None: 388 | updates[param].name = 'sgd_update(' + param.name + ')' 389 | check() 390 | model.modify_updates(updates) 391 | check() 392 | for param in cur_params: 393 | update = updates[param] 394 | if update.name is None: 395 | update.name = 'censor(sgd_update(' + param.name + '))' 396 | for update_val in get_debug_values(update): 397 | if np.any(np.isinf(update_val)): 398 | raise ValueError("debug value of %s contains infs" % 399 | update.name) 400 | if np.any(np.isnan(update_val)): 401 | raise ValueError("debug value of %s contains nans" % 402 | update.name) 403 | 404 | check() 405 | 406 | if dont_you_fucking_dare_touch_the_generator: 407 | for param in model.generator.get_params(): 408 | assert param not in updates 409 | 410 | with log_timing(log, 'Compiling sgd_update'): 411 | return function(theano_args, 412 | updates=updates, 413 | name='sgd_update', 414 | on_unused_input='ignore', 415 | mode=self.theano_function_mode) 416 | self.d_func = get_func(1, 0, dont_you_fucking_dare_touch_the_generator=True) 417 | self.g_func = get_func(0, 1) 418 | 419 | def train(self, dataset): 420 | """ 421 | Runs one epoch of SGD training on the specified dataset. 
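        Unlike the stock pylearn2 SGD, each batch updates only one of the
        two adversaries: while ``self.train_generator`` is set, the
        generator is updated once after every ``discriminator_steps``
        discriminator updates; otherwise the whole epoch trains only the
        discriminator. The flag is flipped at the end of each epoch.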
422 | 423 | Parameters 424 | ---------- 425 | dataset : Dataset 426 | """ 427 | 428 | 429 | if not hasattr(self, 'd_func'): 430 | raise Exception("train called without first calling setup") 431 | 432 | # Make sure none of the parameters have bad values 433 | for param in self.params: 434 | value = param.get_value(borrow=True) 435 | if np.any(np.isnan(value)) or np.any(np.isinf(value)): 436 | raise Exception("NaN in " + param.name) 437 | 438 | self.first = False 439 | rng = self.rng 440 | if not is_stochastic(self.train_iteration_mode): 441 | rng = None 442 | 443 | data_specs = self.cost.get_data_specs(self.model) 444 | 445 | # The iterator should be built from flat data specs, so it returns 446 | # flat, non-redundent tuples of data. 447 | mapping = DataSpecsMapping(data_specs) 448 | space_tuple = mapping.flatten(data_specs[0], return_tuple=True) 449 | source_tuple = mapping.flatten(data_specs[1], return_tuple=True) 450 | if len(space_tuple) == 0: 451 | # No data will be returned by the iterator, and it is impossible 452 | # to know the size of the actual batch. 453 | # It is not decided yet what the right thing to do should be. 454 | raise NotImplementedError("Unable to train with SGD, because " 455 | "the cost does not actually use data from the data set. " 456 | "data_specs: %s" % str(data_specs)) 457 | flat_data_specs = (CompositeSpace(space_tuple), source_tuple) 458 | 459 | iterator = dataset.iterator(mode=self.train_iteration_mode, 460 | batch_size=self.batch_size, 461 | data_specs=flat_data_specs, return_tuple=True, 462 | rng = rng, num_batches = self.batches_per_iter) 463 | 464 | 465 | on_load_batch = self.on_load_batch 466 | i = 0 467 | for batch in iterator: 468 | for callback in on_load_batch: 469 | callback(*batch) 470 | if self.train_generator and i == self.discriminator_steps: 471 | self.g_func(*batch) 472 | i = 0 473 | else: 474 | self.d_func(*batch) 475 | i += 1 476 | # iterator might return a smaller batch if dataset size 477 | # isn't divisible by batch_size 478 | # Note: if data_specs[0] is a NullSpace, there is no way to know 479 | # how many examples would actually have been in the batch, 480 | # since it was empty, so actual_batch_size would be reported as 0. 481 | actual_batch_size = flat_data_specs[0].np_batch_size(batch) 482 | self.monitor.report_batch(actual_batch_size) 483 | for callback in self.update_callbacks: 484 | callback(self) 485 | 486 | 487 | # Make sure none of the parameters have bad values 488 | for param in self.params: 489 | value = param.get_value(borrow=True) 490 | if np.any(np.isnan(value)) or np.any(np.isinf(value)): 491 | raise Exception("NaN in " + param.name) 492 | 493 | self.train_generator = not self.train_generator 494 | 495 | def continue_learning(self, model): 496 | """ 497 | Returns True if the algorithm should continue running, or False 498 | if it has reached convergence / started overfitting and should 499 | stop. 500 | 501 | Parameters 502 | ---------- 503 | model : a Model instance 504 | """ 505 | if self.termination_criterion is None: 506 | return True 507 | else: 508 | return self.termination_criterion.continue_learning(self.model) 509 | 510 | class MonitorBasedLRAdjuster(TrainExtension): 511 | """ 512 | A TrainExtension that uses the on_monitor callback to adjust 513 | the learning rate on each epoch. It pulls out a channel 514 | from the model's monitor and adjusts the learning rate 515 | based on what happened to the monitoring channel on the last 516 | epoch. 
If the channel is greater than high_trigger times 517 | its previous value, the learning rate will be scaled by 518 | shrink_amt (which should be < 1 for this scheme to make 519 | sense). The idea is that in this case the learning algorithm 520 | is overshooting the bottom of the objective function. 521 | 522 | If the objective is less than high_trigger but 523 | greater than low_trigger times its previous value, the 524 | learning rate will be scaled by grow_amt (which should be > 1 525 | for this scheme to make sense). The idea is that the learning 526 | algorithm is making progress but at too slow of a rate. 527 | 528 | Parameters 529 | ---------- 530 | high_trigger : float, optional 531 | See class-level docstring 532 | low_trigger : float, optional 533 | See class-level docstring 534 | grow_amt : float, optional 535 | See class-level docstring 536 | min_lr : float, optional 537 | All updates to the learning rate are clipped to be at least 538 | this value. 539 | max_lr : float, optional 540 | All updates to the learning rate are clipped to be at most 541 | this value. 542 | dataset_name : str, optional 543 | If specified, use dataset_name + "_objective" as the channel 544 | to guide the learning rate adaptation. 545 | channel_name : str, optional 546 | If specified, use channel_name as the channel to guide the 547 | learning rate adaptation. Conflicts with dataset_name. 548 | If neither dataset_name nor channel_name is specified, uses 549 | "objective" 550 | """ 551 | 552 | def __init__(self, high_trigger=1., shrink_amt=.99, 553 | low_trigger=.99, grow_amt=1.01, 554 | min_lr = 1e-7, max_lr = 1., 555 | dataset_name=None, channel_name=None): 556 | self.high_trigger = high_trigger 557 | self.shrink_amt = shrink_amt 558 | self.low_trigger = low_trigger 559 | self.grow_amt = grow_amt 560 | self.min_lr = min_lr 561 | self.max_lr = max_lr 562 | self.dataset_name = None 563 | if channel_name is not None: 564 | self.channel_name = channel_name 565 | else: 566 | if dataset_name is not None: 567 | self.channel_name = dataset_name + '_objective' 568 | self.dataset_name = dataset_name 569 | else: 570 | self.channel_name = None 571 | 572 | def on_monitor(self, model, dataset, algorithm): 573 | """ 574 | Adjusts the learning rate based on the contents of model.monitor 575 | 576 | Parameters 577 | ---------- 578 | model : a Model instance 579 | dataset : Dataset 580 | algorithm : WRITEME 581 | """ 582 | model = algorithm.model 583 | lr = algorithm.learning_rate 584 | current_learning_rate = lr.get_value() 585 | assert hasattr(model, 'monitor'), ("no monitor associated with " 586 | + str(model)) 587 | monitor = model.monitor 588 | monitor_channel_specified = True 589 | 590 | if self.channel_name is None: 591 | monitor_channel_specified = False 592 | channels = [elem for elem in monitor.channels 593 | if elem.endswith("objective")] 594 | if len(channels) < 1: 595 | raise ValueError("There are no monitoring channels that end " 596 | "with \"objective\". Please specify either " 597 | "channel_name or dataset_name.") 598 | elif len(channels) > 1: 599 | datasets = algorithm.monitoring_dataset.keys() 600 | raise ValueError("There are multiple monitoring channels that" 601 | "end with \"_objective\". The list of available " 602 | "datasets are: " + 603 | str(datasets) + " . 
Please specify either " 604 | "channel_name or dataset_name in the " 605 | "MonitorBasedLRAdjuster constructor to " 606 | 'disambiguate.') 607 | else: 608 | self.channel_name = channels[0] 609 | warnings.warn('The channel that has been chosen for ' 610 | 'monitoring is: ' + 611 | str(self.channel_name) + '.') 612 | 613 | try: 614 | v = monitor.channels[self.channel_name].val_record 615 | except KeyError: 616 | err_input = '' 617 | if monitor_channel_specified: 618 | if self.dataset_name: 619 | err_input = 'The dataset_name \'' + str( 620 | self.dataset_name) + '\' is not valid.' 621 | else: 622 | err_input = 'The channel_name \'' + str( 623 | self.channel_name) + '\' is not valid.' 624 | err_message = 'There is no monitoring channel named \'' + \ 625 | str(self.channel_name) + '\'. You probably need to ' + \ 626 | 'specify a valid monitoring channel by using either ' + \ 627 | 'dataset_name or channel_name in the ' + \ 628 | 'MonitorBasedLRAdjuster constructor. ' + err_input 629 | raise ValueError(err_message) 630 | 631 | if len(v) < 1: 632 | if monitor.dataset is None: 633 | assert len(v) == 0 634 | raise ValueError("You're trying to use a monitor-based " 635 | "learning rate adjustor but the monitor has no " 636 | "entries because you didn't specify a " 637 | "monitoring dataset.") 638 | 639 | raise ValueError("For some reason there are no monitor entries" 640 | "yet the MonitorBasedLRAdjuster has been " 641 | "called. This should never happen. The Train" 642 | " object should call the monitor once on " 643 | "initialization, then call the callbacks. " 644 | "It seems you are either calling the " 645 | "callback manually rather than as part of a " 646 | "training algorithm, or there is a problem " 647 | "with the Train object.") 648 | if len(v) == 1: 649 | #only the initial monitoring has happened 650 | #no learning has happened, so we can't adjust the learning rate yet 651 | #just do nothing 652 | return 653 | 654 | rval = current_learning_rate 655 | 656 | log.info("monitoring channel is {0}".format(self.channel_name)) 657 | 658 | if v[-1] > self.high_trigger * v[-2]: 659 | rval *= self.shrink_amt 660 | log.info("shrinking learning rate to %f" % rval) 661 | elif v[-1] > self.low_trigger * v[-2]: 662 | rval *= self.grow_amt 663 | log.info("growing learning rate to %f" % rval) 664 | 665 | rval = max(self.min_lr, rval) 666 | rval = min(self.max_lr, rval) 667 | 668 | lr.set_value(np.cast[lr.dtype](rval)) 669 | 670 | 671 | class PatienceBasedTermCrit(object): 672 | """ 673 | A monitor-based termination criterion using a geometrically increasing 674 | amount of patience. If the selected channel has decreased by a certain 675 | proportion when comparing to the lowest value seen yet, the patience is 676 | set to a factor of the number of examples seen, which by default 677 | (patience_increase=2.) ensures the model has seen as many examples as the 678 | number of examples that lead to the lowest value before concluding a local 679 | optima has been reached. 680 | 681 | Note: Technically, the patience corresponds to a number of epochs to be 682 | independent of the size of the dataset, so be aware of that when choosing 683 | initial_patience. 684 | 685 | Parameters 686 | ---------- 687 | prop_decrease : float 688 | The factor X in the (1 - X) * best_value threshold 689 | initial_patience : int 690 | Minimal number of epochs the model has to run before it can stop 691 | patience_increase : float, optional 692 | The factor X in the patience = X * n_iter update. 
693 | channel_name : string, optional 694 | Name of the channel to examine. If None and the monitor 695 | has only one channel, this channel will be used; otherwise, an 696 | error will be raised. 697 | """ 698 | def __init__(self, prop_decrease, initial_patience, 699 | patience_increase=2., channel_name=None): 700 | self._channel_name = channel_name 701 | self.prop_decrease = prop_decrease 702 | self.patience = initial_patience 703 | self.best_value = np.inf 704 | self.patience_increase = patience_increase 705 | 706 | def __call__(self, model): 707 | """ 708 | Returns True or False depending on whether the optimization should 709 | stop or not. The optimization should stop if it has run for a number 710 | of epochs superior to the patience without any improvement. 711 | 712 | Parameters 713 | ---------- 714 | model : Model 715 | The model used in the experiment and from which the monitor used 716 | in the termination criterion will be extracted. 717 | 718 | Returns 719 | ------- 720 | bool 721 | True or False, indicating if the optimization should stop or not. 722 | """ 723 | monitor = model.monitor 724 | # In the case the monitor has only one channel, the channel_name can 725 | # be omitted and the criterion will examine the only channel 726 | # available. However, if the monitor has multiple channels, leaving 727 | # the channel_name unspecified will raise an error. 728 | if self._channel_name is None: 729 | if len(monitor.channels) != 1: 730 | raise ValueError("Only single-channel monitors are supported " 731 | "for channel_name == None") 732 | v = monitor.channels.values()[0].val_record 733 | else: 734 | v = monitor.channels[self._channel_name].val_record 735 | # If the channel value decrease is higher than the threshold, we 736 | # update the best value to this value and we update the patience. 737 | if v[-1] < self.best_value * (1. - self.prop_decrease): 738 | # Using the max between actual patience and updated patience 739 | # ensures that the model will run for at least the initial 740 | # patience and that it would behave correctly if the user 741 | # chooses a dumb value (i.e. less than 1) 742 | self.patience = max(self.patience, len(v) * self.patience_increase) 743 | self.best_value = v[-1] 744 | 745 | return len(v) < self.patience 746 | 747 | 748 | class AnnealedLearningRate(object): 749 | """ 750 | This is a callback for the SGD algorithm rather than the Train object. 751 | This anneals the learning rate to decrease as 1/t where t is the number 752 | of gradient descent updates done so far. Use OneOverEpoch as Train object 753 | callback if you would prefer 1/t where t is epochs. 754 | 755 | Parameters 756 | ---------- 757 | anneal_start : int 758 | The epoch on which to begin annealing 759 | """ 760 | def __init__(self, anneal_start): 761 | self._initialized = False 762 | self._count = 0 763 | self._anneal_start = anneal_start 764 | 765 | def __call__(self, algorithm): 766 | """ 767 | Updates the learning rate according to the annealing schedule. 768 | 769 | Parameters 770 | ---------- 771 | algorithm : WRITEME 772 | """ 773 | if not self._initialized: 774 | self._base = algorithm.learning_rate.get_value() 775 | self._count += 1 776 | algorithm.learning_rate.set_value(self.current_learning_rate()) 777 | 778 | def current_learning_rate(self): 779 | """ 780 | Returns the current desired learning rate according to the 781 | annealing schedule. 
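        (Concretely: ``base_lr * min(1, anneal_start / t)`` where ``t`` is
        the number of updates taken so far, so the rate is held at its
        initial value for the first ``anneal_start`` updates and decays as
        1/t afterwards.)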
782 | """ 783 | return self._base * min(1, self._anneal_start / self._count) 784 | 785 | class ExponentialDecay(object): 786 | """ 787 | This is a callback for the `SGD` algorithm rather than the `Train` object. 788 | This anneals the learning rate by dividing by decay_factor after each 789 | gradient descent step. It will not shrink the learning rate beyond 790 | `min_lr`. 791 | 792 | Parameters 793 | ---------- 794 | decay_factor : float 795 | The learning rate at step t is given by 796 | `init_learning_rate / (decay_factor ** t)` 797 | min_lr : float 798 | The learning rate will be clipped to be at least this value 799 | """ 800 | 801 | def __init__(self, decay_factor, min_lr): 802 | if isinstance(decay_factor, str): 803 | decay_factor = float(decay_factor) 804 | if isinstance(min_lr, str): 805 | min_lr = float(min_lr) 806 | assert isinstance(decay_factor, float) 807 | assert isinstance(min_lr, float) 808 | self.__dict__.update(locals()) 809 | del self.self 810 | self._count = 0 811 | self._min_reached = False 812 | 813 | def __call__(self, algorithm): 814 | """ 815 | Updates the learning rate according to the exponential decay schedule. 816 | 817 | Parameters 818 | ---------- 819 | algorithm : SGD 820 | The SGD instance whose `learning_rate` field should be modified. 821 | """ 822 | if self._count == 0: 823 | self._base_lr = algorithm.learning_rate.get_value() 824 | self._count += 1 825 | 826 | if not self._min_reached: 827 | # If we keep on executing the exponentiation on each mini-batch, 828 | # we will eventually get an OverflowError. So make sure we 829 | # only do the computation until min_lr is reached. 830 | new_lr = self._base_lr / (self.decay_factor ** self._count) 831 | if new_lr <= self.min_lr: 832 | self._min_reached = True 833 | new_lr = self.min_lr 834 | else: 835 | new_lr = self.min_lr 836 | 837 | new_lr = np.cast[config.floatX](new_lr) 838 | algorithm.learning_rate.set_value(new_lr) 839 | 840 | class LinearDecay(object): 841 | """ 842 | This is a callback for the SGD algorithm rather than the Train object. 843 | This anneals the learning rate to decay_factor times of the initial value 844 | during time start till saturate. 
845 | 846 | Parameters 847 | ---------- 848 | start : int 849 | The step at which to start decreasing the learning rate 850 | saturate : int 851 | The step at which to stop decreating the learning rate 852 | decay_factor : float 853 | `final learning rate = decay_factor * initial learning rate` 854 | """ 855 | 856 | def __init__(self, start, saturate, decay_factor): 857 | if isinstance(decay_factor, str): 858 | decay_factor = float(decay_factor) 859 | if isinstance(start, str): 860 | start = float(start) 861 | if isinstance(saturate, str): 862 | saturate = float(saturate) 863 | assert isinstance(decay_factor, float) 864 | assert isinstance(start, (py_integer_types, py_float_types)) 865 | assert isinstance(saturate, (py_integer_types, py_float_types)) 866 | assert saturate > start 867 | assert start > 0 868 | self.__dict__.update(locals()) 869 | del self.self 870 | self._count = 0 871 | 872 | def __call__(self, algorithm): 873 | """ 874 | Adjusts the learning rate according to the linear decay schedule 875 | 876 | Parameters 877 | ---------- 878 | algorithm : WRITEME 879 | """ 880 | if self._count == 0: 881 | self._base_lr = algorithm.learning_rate.get_value() 882 | self._step = ((self._base_lr - self._base_lr * self.decay_factor) / 883 | (self.saturate - self.start + 1)) 884 | self._count += 1 885 | if self._count >= self.start: 886 | if self._count < self.saturate: 887 | new_lr = self._base_lr - self._step * (self._count 888 | - self.start + 1) 889 | else: 890 | new_lr = self._base_lr * self.decay_factor 891 | else: 892 | new_lr = self._base_lr 893 | assert new_lr > 0 894 | new_lr = np.cast[config.floatX](new_lr) 895 | algorithm.learning_rate.set_value(new_lr) 896 | 897 | 898 | def MomentumAdjustor(final_momentum, start, saturate): 899 | """ 900 | Deprecated class used with the deprecated init_momentum argument. 901 | Use learning_rule.MomentumAdjustor instead. 902 | 903 | Parameters 904 | ---------- 905 | final_momentum : WRITEME 906 | start : WRITEME 907 | saturate : WRITEME 908 | """ 909 | warnings.warn("sgd.MomentumAdjustor interface is deprecated and will " 910 | "become officially unsupported as of May 9, 2014. Please use " 911 | "`learning_rule.MomentumAdjustor` instead.") 912 | return LRMomentumAdjustor(final_momentum, start, saturate) 913 | 914 | 915 | class OneOverEpoch(TrainExtension): 916 | """ 917 | Scales the learning rate like one over # epochs 918 | 919 | Parameters 920 | ---------- 921 | start : int 922 | The epoch on which to start shrinking the learning rate 923 | half_life : int, optional 924 | How many epochs after start it will take for the learning rate to lose 925 | half its value for the first time (to lose the next half of its value 926 | will take twice as long) 927 | min_lr : float, optional 928 | The minimum value the learning rate can take on 929 | """ 930 | def __init__(self, start, half_life = None, min_lr = 1e-6): 931 | self.__dict__.update(locals()) 932 | del self.self 933 | self._initialized = False 934 | self._count = 0 935 | assert start >= 0 936 | if half_life is None: 937 | self.half_life = start + 1 938 | else: 939 | assert half_life > 0 940 | 941 | def on_monitor(self, model, dataset, algorithm): 942 | """ 943 | Adjusts the learning rate according to the decay schedule. 
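        Concretely, once ``start`` epochs have passed the learning rate is
        set to ``init_lr * half_life / (epochs_past_start + half_life)``,
        clipped below at ``min_lr``.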
944 | 945 | Parameters 946 | ---------- 947 | model : a Model instance 948 | dataset : Dataset 949 | algorithm : WRITEME 950 | """ 951 | 952 | if not self._initialized: 953 | self._init_lr = algorithm.learning_rate.get_value() 954 | if self._init_lr < self.min_lr: 955 | raise ValueError("The initial learning rate is smaller than " + 956 | "the minimum allowed learning rate.") 957 | self._initialized = True 958 | self._count += 1 959 | algorithm.learning_rate.set_value(np.cast[config.floatX]( 960 | self.current_lr())) 961 | 962 | def current_lr(self): 963 | """ 964 | Returns the learning rate currently desired by the decay schedule. 965 | """ 966 | if self._count < self.start: 967 | scale = 1 968 | else: 969 | scale = float(self.half_life) / float(self._count - self.start 970 | + self.half_life) 971 | lr = self._init_lr * scale 972 | clipped = max(self.min_lr, lr) 973 | return clipped 974 | 975 | class LinearDecayOverEpoch(TrainExtension): 976 | """ 977 | Scales the learning rate linearly on each epochs 978 | 979 | Parameters 980 | ---------- 981 | start : int 982 | The epoch on which to start shrinking the learning rate 983 | saturate : int 984 | The epoch to saturate the shrinkage 985 | decay_factor : float 986 | The final value would be initial learning rate times decay_factor 987 | """ 988 | 989 | def __init__(self, start, saturate, decay_factor): 990 | self.__dict__.update(locals()) 991 | del self.self 992 | self._initialized = False 993 | self._count = 0 994 | assert isinstance(decay_factor, float) 995 | assert isinstance(start, (py_integer_types, py_float_types)) 996 | assert isinstance(saturate, (py_integer_types, py_float_types)) 997 | assert saturate > start 998 | assert start >= 0 999 | assert saturate >= start 1000 | 1001 | def on_monitor(self, model, dataset, algorithm): 1002 | """ 1003 | Updates the learning rate based on the linear decay schedule. 1004 | 1005 | Parameters 1006 | ---------- 1007 | model : a Model instance 1008 | dataset : Dataset 1009 | algorithm : WRITEME 1010 | """ 1011 | if not self._initialized: 1012 | self._init_lr = algorithm.learning_rate.get_value() 1013 | self._step = ((self._init_lr - self._init_lr * self.decay_factor) / 1014 | (self.saturate - self.start + 1)) 1015 | self._initialized = True 1016 | self._count += 1 1017 | algorithm.learning_rate.set_value(np.cast[config.floatX]( 1018 | self.current_lr())) 1019 | 1020 | def current_lr(self): 1021 | """ 1022 | Returns the learning rate currently desired by the decay schedule. 1023 | """ 1024 | if self._count >= self.start: 1025 | if self._count < self.saturate: 1026 | new_lr = self._init_lr - self._step * (self._count 1027 | - self.start + 1) 1028 | else: 1029 | new_lr = self._init_lr * self.decay_factor 1030 | else: 1031 | new_lr = self._init_lr 1032 | assert new_lr > 0 1033 | return new_lr 1034 | 1035 | class _PolyakWorker(object): 1036 | """ 1037 | Only to be used by the PolyakAveraging TrainingCallback below. 1038 | Do not use directly. 1039 | A callback for the SGD class. 1040 | 1041 | Parameters 1042 | ---------- 1043 | model : a Model 1044 | The model whose parameters we want to train with Polyak averaging 1045 | """ 1046 | 1047 | def __init__(self, model): 1048 | avg_updates = OrderedDict() 1049 | t = sharedX(1.) 
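        # The update below, mean <- mean - (mean - param) / t with t
        # incremented on every call, keeps `mean` equal to the running
        # arithmetic average of all values `param` has taken so far.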
1050 | self.param_to_mean = OrderedDict() 1051 | for param in model.get_params(): 1052 | mean = sharedX(param.get_value()) 1053 | assert type(mean) == type(param) 1054 | self.param_to_mean[param] = mean 1055 | avg_updates[mean] = mean - (mean - param) / t 1056 | avg_updates[t] = t + 1. 1057 | self.avg = function([], updates = avg_updates) 1058 | 1059 | def __call__(self, algorithm): 1060 | """ 1061 | To be called after each SGD step. 1062 | Updates the Polyak averaged-parameters for this model 1063 | 1064 | Parameters 1065 | ---------- 1066 | algorithm : WRITEME 1067 | """ 1068 | self.avg() 1069 | 1070 | class PolyakAveraging(TrainExtension): 1071 | """ 1072 | See "A Tutorial on Stochastic Approximation Algorithms 1073 | for Training Restricted Boltzmann Machines and 1074 | Deep Belief Nets" by Kevin Swersky et al 1075 | 1076 | This functionality is still a work in progress. Currently, 1077 | your model needs to implement "add_polyak_channels" to 1078 | use it. 1079 | 1080 | The problem is that Polyak averaging shouldn't modify 1081 | the model parameters. It should keep a second copy 1082 | that it averages in the background. This second copy 1083 | doesn't get to come back in and affect the learning process 1084 | though. 1085 | 1086 | (IG tried having the second copy get pushed back into 1087 | the model once per epoch, but this turned out to be 1088 | harmful, at least in limited tests) 1089 | 1090 | So we need a cleaner interface for monitoring the 1091 | averaged copy of the parameters, and we need to make 1092 | sure the saved model at the end uses the averaged 1093 | parameters, not the parameters used for computing 1094 | the gradients during training. 1095 | 1096 | TODO: make use of the new on_save callback instead 1097 | of duplicating Train's save_freq flag 1098 | 1099 | Parameters 1100 | ---------- 1101 | start : int 1102 | The epoch after which to start averaging (0 = start averaging 1103 | immediately) 1104 | save_path : str, optional 1105 | WRITEME 1106 | save_freq : int, optional 1107 | WRITEME 1108 | 1109 | Notes 1110 | ----- 1111 | This is usually used with a fixed, rather than annealed learning 1112 | rate. It may be used in conjunction with momentum. 1113 | """ 1114 | 1115 | def __init__(self, start, save_path=None, save_freq=1): 1116 | self.__dict__.update(locals()) 1117 | del self.self 1118 | self._count = 0 1119 | assert isinstance(start, py_integer_types) 1120 | assert start >= 0 1121 | 1122 | def on_monitor(self, model, dataset, algorithm): 1123 | """ 1124 | Make sure Polyak-averaged model gets monitored. 1125 | Save the model if necessary. 
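        On epoch ``start`` the averaging worker is attached to the
        algorithm's update callbacks; on later epochs, every ``save_freq``
        epochs the averaged parameters are temporarily swapped into the
        model so the file written to ``save_path`` contains the averaged
        weights rather than the raw training weights.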
1126 | 1127 | Parameters 1128 | ---------- 1129 | model : a Model instance 1130 | dataset : Dataset 1131 | algorithm : WRITEME 1132 | """ 1133 | if self._count == self.start: 1134 | self._worker = _PolyakWorker(model) 1135 | algorithm.update_callbacks.append(self._worker) 1136 | #HACK 1137 | try: 1138 | model.add_polyak_channels(self._worker.param_to_mean, 1139 | algorithm.monitoring_dataset) 1140 | except AttributeError: 1141 | pass 1142 | elif self.save_path is not None and self._count > self.start and \ 1143 | self._count % self.save_freq == 0: 1144 | saved_params = OrderedDict() 1145 | for param in model.get_params(): 1146 | saved_params[param] = param.get_value() 1147 | param.set_value(self._worker.param_to_mean[param].get_value()) 1148 | serial.save(self.save_path, model) 1149 | for param in model.get_params(): 1150 | param.set_value(saved_params[param]) 1151 | self._count += 1 1152 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code for "Generative Adversarial Networks". Please cite the ArXiv paper in 3 | any published research work making use of this code. 4 | """ 5 | import functools 6 | wraps = functools.wraps 7 | import itertools 8 | import numpy 9 | np = numpy 10 | import theano 11 | import warnings 12 | 13 | from theano.compat import OrderedDict 14 | from theano.sandbox.rng_mrg import MRG_RandomStreams 15 | from theano import tensor as T 16 | 17 | from pylearn2.space import VectorSpace 18 | from pylearn2.costs.cost import Cost 19 | from pylearn2.costs.cost import DefaultDataSpecsMixin 20 | from pylearn2.models.mlp import Layer 21 | from pylearn2.models.mlp import Linear 22 | from pylearn2.models import Model 23 | from pylearn2.space import CompositeSpace 24 | from pylearn2.train_extensions import TrainExtension 25 | from pylearn2.utils import block_gradient 26 | from pylearn2.utils import safe_zip 27 | from pylearn2.utils import serial 28 | from pylearn2.utils import sharedX 29 | 30 | class AdversaryPair(Model): 31 | 32 | def __init__(self, generator, discriminator, inferer=None, 33 | inference_monitoring_batch_size=128, 34 | monitor_generator=True, 35 | monitor_discriminator=True, 36 | monitor_inference=True, 37 | shrink_d = 0.): 38 | Model.__init__(self) 39 | self.__dict__.update(locals()) 40 | del self.self 41 | 42 | def __setstate__(self, state): 43 | self.__dict__.update(state) 44 | if 'inferer' not in state: 45 | self.inferer = None 46 | if 'inference_monitoring_batch_size' not in state: 47 | self.inference_monitoring_batch_size = 128 # TODO: HACK 48 | if 'monitor_generator' not in state: 49 | self.monitor_generator = True 50 | if 'monitor_discriminator' not in state: 51 | self.monitor_discriminator = True 52 | if 'monitor_inference' not in state: 53 | self.monitor_inference = True 54 | 55 | def get_params(self): 56 | p = self.generator.get_params() + self.discriminator.get_params() 57 | if hasattr(self, 'inferer') and self.inferer is not None: 58 | p += self.inferer.get_params() 59 | return p 60 | 61 | def get_input_space(self): 62 | return self.discriminator.get_input_space() 63 | 64 | def get_weights_topo(self): 65 | return self.discriminator.get_weights_topo() 66 | 67 | def get_weights(self): 68 | return self.discriminator.get_weights() 69 | 70 | def get_weights_format(self): 71 | return self.discriminator.get_weights_format() 72 | 73 | def get_weights_view_shape(self): 74 | return self.discriminator.get_weights_view_shape() 
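    # The get_weights* methods above simply delegate to the discriminator,
    # so pylearn2 tooling that expects a single model's weights still works
    # on the pair.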
75 | 76 | def get_monitoring_channels(self, data): 77 | rval = OrderedDict() 78 | 79 | g_ch = self.generator.get_monitoring_channels(data) 80 | d_ch = self.discriminator.get_monitoring_channels((data, None)) 81 | samples = self.generator.sample(100) 82 | d_samp_ch = self.discriminator.get_monitoring_channels((samples, None)) 83 | 84 | i_ch = OrderedDict() 85 | if self.inferer is not None: 86 | batch_size = self.inference_monitoring_batch_size 87 | sample, noise, _ = self.generator.sample_and_noise(batch_size) 88 | i_ch.update(self.inferer.get_monitoring_channels((sample, noise))) 89 | 90 | if self.monitor_generator: 91 | for key in g_ch: 92 | rval['gen_' + key] = g_ch[key] 93 | if self.monitor_discriminator: 94 | for key in d_ch: 95 | rval['dis_on_data_' + key] = d_samp_ch[key] 96 | for key in d_ch: 97 | rval['dis_on_samp_' + key] = d_ch[key] 98 | if self.monitor_inference: 99 | for key in i_ch: 100 | rval['inf_' + key] = i_ch[key] 101 | return rval 102 | 103 | def get_monitoring_data_specs(self): 104 | 105 | space = self.discriminator.get_input_space() 106 | source = self.discriminator.get_input_source() 107 | return (space, source) 108 | 109 | def _modify_updates(self, updates): 110 | self.generator.modify_updates(updates) 111 | self.discriminator.modify_updates(updates) 112 | if self.shrink_d != 0.: 113 | for param in self.discriminator.get_params(): 114 | if param in updates: 115 | updates[param] = self.shrink_d * updates[param] 116 | if self.inferer is not None: 117 | self.inferer.modify_updates(updates) 118 | 119 | def get_lr_scalers(self): 120 | 121 | rval = self.generator.get_lr_scalers() 122 | rval.update(self.discriminator.get_lr_scalers()) 123 | return rval 124 | 125 | def add_layers(mlp, pretrained, start_layer=0): 126 | model = serial.load(pretrained) 127 | pretrained_layers = model.generator.mlp.layers 128 | assert pretrained_layers[start_layer].get_input_space() == mlp.layers[-1].get_output_space() 129 | mlp.layers.extend(pretrained_layers[start_layer:]) 130 | return mlp 131 | 132 | 133 | 134 | class Generator(Model): 135 | 136 | def __init__(self, mlp, noise = "gaussian", monitor_ll = False, ll_n_samples = 100, ll_sigma = 0.2): 137 | Model.__init__(self) 138 | self.__dict__.update(locals()) 139 | del self.self 140 | self.theano_rng = MRG_RandomStreams(2014 * 5 + 27) 141 | 142 | def get_input_space(self): 143 | return self.mlp.get_input_space() 144 | 145 | def sample_and_noise(self, num_samples, default_input_include_prob=1., default_input_scale=1., all_g_layers=False): 146 | n = self.mlp.get_input_space().get_total_dimension() 147 | noise = self.get_noise((num_samples, n)) 148 | formatted_noise = VectorSpace(n).format_as(noise, self.mlp.get_input_space()) 149 | if all_g_layers: 150 | rval = self.mlp.dropout_fprop(formatted_noise, default_input_include_prob=default_input_include_prob, default_input_scale=default_input_scale, return_all=all_g_layers) 151 | other_layers, rval = rval[:-1], rval[-1] 152 | else: 153 | rval = self.mlp.dropout_fprop(formatted_noise, default_input_include_prob=default_input_include_prob, default_input_scale=default_input_scale) 154 | other_layers = None 155 | return rval, formatted_noise, other_layers 156 | 157 | def sample(self, num_samples, default_input_include_prob=1., default_input_scale=1.): 158 | sample, _, _ = self.sample_and_noise(num_samples, default_input_include_prob, default_input_scale) 159 | return sample 160 | 161 | def inpainting_sample_and_noise(self, X, default_input_include_prob=1., default_input_scale=1.): 162 | # Very hacky! 
Specifically for inpainting right half of CIFAR-10 given left half 163 | # assumes X is b01c 164 | assert X.ndim == 4 165 | input_space = self.mlp.get_input_space() 166 | n = input_space.get_total_dimension() 167 | image_size = input_space.shape[0] 168 | half_image = int(image_size / 2) 169 | data_shape = (X.shape[0], image_size, half_image, input_space.num_channels) 170 | 171 | noise = self.theano_rng.normal(size=data_shape, dtype='float32') 172 | Xg = T.set_subtensor(X[:,:,half_image:,:], noise) 173 | sampled_part, noise = self.mlp.dropout_fprop(Xg, default_input_include_prob=default_input_include_prob, default_input_scale=default_input_scale), noise 174 | sampled_part = sampled_part.reshape(data_shape) 175 | rval = T.set_subtensor(X[:, :, half_image:, :], sampled_part) 176 | return rval, noise 177 | 178 | 179 | def get_monitoring_channels(self, data): 180 | if data is None: 181 | m = 100 182 | else: 183 | m = data.shape[0] 184 | n = self.mlp.get_input_space().get_total_dimension() 185 | noise = self.get_noise((m, n)) 186 | rval = OrderedDict() 187 | 188 | try: 189 | rval.update(self.mlp.get_monitoring_channels((noise, None))) 190 | except Exception: 191 | warnings.warn("something went wrong with generator.mlp's monitoring channels") 192 | 193 | if self.monitor_ll: 194 | rval['ll'] = T.cast(self.ll(data, self.ll_n_samples, self.ll_sigma), 195 | theano.config.floatX).mean() 196 | rval['nll'] = -rval['ll'] 197 | return rval 198 | 199 | def get_noise(self, size): 200 | 201 | # Allow just requesting batch size 202 | if isinstance(size, int): 203 | size = (size, self.get_input_space().get_total_dimension()) 204 | 205 | if not hasattr(self, 'noise'): 206 | self.noise = "gaussian" 207 | if self.noise == "uniform": 208 | return self.theano_rng.uniform(low=-np.sqrt(3), high=np.sqrt(3), size=size, dtype='float32') 209 | elif self.noise == "gaussian": 210 | return self.theano_rng.normal(size=size, dtype='float32') 211 | elif self.noise == "spherical": 212 | noise = self.theano_rng.normal(size=size, dtype='float32') 213 | noise = noise / T.maximum(1e-7, T.sqrt(T.sqr(noise).sum(axis=1))).dimshuffle(0, 'x') 214 | return noise 215 | else: 216 | raise NotImplementedError(self.noise) 217 | 218 | def get_params(self): 219 | return self.mlp.get_params() 220 | 221 | def get_output_space(self): 222 | return self.mlp.get_output_space() 223 | 224 | def ll(self, data, n_samples, sigma): 225 | 226 | samples = self.sample(n_samples) 227 | output_space = self.mlp.get_output_space() 228 | if 'Conv2D' in str(output_space): 229 | samples = output_space.convert(samples, output_space.axes, ('b', 0, 1, 'c')) 230 | samples = samples.flatten(2) 231 | data = output_space.convert(data, output_space.axes, ('b', 0, 1, 'c')) 232 | data = data.flatten(2) 233 | parzen = theano_parzen(data, samples, sigma) 234 | return parzen 235 | 236 | def _modify_updates(self, updates): 237 | self.mlp.modify_updates(updates) 238 | 239 | def get_lr_scalers(self): 240 | return self.mlp.get_lr_scalers() 241 | 242 | def __setstate__(self, state): 243 | self.__dict__.update(state) 244 | if 'monitor_ll' not in state: 245 | self.monitor_ll = False 246 | 247 | 248 | class IntrinsicDropoutGenerator(Generator): 249 | def __init__(self, default_input_include_prob, default_input_scale, 250 | input_include_probs=None, input_scales=None, **kwargs): 251 | super(IntrinsicDropoutGenerator, self).__init__(**kwargs) 252 | self.__dict__.update(locals()) 253 | del self.self 254 | 255 | def sample_and_noise(self, num_samples, default_input_include_prob=1., 
default_input_scale=1., all_g_layers=False): 256 | if all_g_layers: 257 | raise NotImplementedError() 258 | n = self.mlp.get_input_space().get_total_dimension() 259 | noise = self.theano_rng.normal(size=(num_samples, n), dtype='float32') 260 | formatted_noise = VectorSpace(n).format_as(noise, self.mlp.get_input_space()) 261 | # ignores dropout args 262 | default_input_include_prob = self.default_input_include_prob 263 | default_input_scale = self.default_input_scale 264 | input_include_probs = self.input_include_probs 265 | input_scales = self.input_scales 266 | return self.mlp.dropout_fprop(formatted_noise, 267 | default_input_include_prob=default_input_include_prob, 268 | default_input_scale=default_input_scale, 269 | input_include_probs=input_include_probs, 270 | input_scales=input_scales), formatted_noise, None 271 | 272 | class AdversaryCost2(DefaultDataSpecsMixin, Cost): 273 | """ 274 | """ 275 | 276 | # Supplies own labels, don't get them from the dataset 277 | supervised = False 278 | 279 | def __init__(self, scale_grads=1, target_scale=.1, 280 | discriminator_default_input_include_prob = 1., 281 | discriminator_input_include_probs=None, 282 | discriminator_default_input_scale=1., 283 | discriminator_input_scales=None, 284 | generator_default_input_include_prob = 1., 285 | generator_default_input_scale=1., 286 | inference_default_input_include_prob=None, 287 | inference_input_include_probs=None, 288 | inference_default_input_scale=1., 289 | inference_input_scales=None, 290 | init_now_train_generator=True, 291 | ever_train_discriminator=True, 292 | ever_train_generator=True, 293 | ever_train_inference=True, 294 | no_drop_in_d_for_g=False, 295 | alternate_g = False, 296 | infer_layer=None, 297 | noise_both = 0., 298 | blend_obj = False, 299 | minimax_coeff = 1., 300 | zurich_coeff = 1.): 301 | self.__dict__.update(locals()) 302 | del self.self 303 | # These allow you to dynamically switch off training parts. 304 | # If the corresponding ever_train_* is False, these have 305 | # no effect. 306 | self.now_train_generator = sharedX(init_now_train_generator) 307 | self.now_train_discriminator = sharedX(numpy.array(1., dtype='float32')) 308 | self.now_train_inference = sharedX(numpy.array(1., dtype='float32')) 309 | 310 | def expr(self, model, data, **kwargs): 311 | S, d_obj, g_obj, i_obj = self.get_samples_and_objectives(model, data) 312 | l = [] 313 | # This stops stuff from ever getting computed if we're not training 314 | # it. 315 | if self.ever_train_discriminator: 316 | l.append(d_obj) 317 | if self.ever_train_generator: 318 | l.append(g_obj) 319 | if self.ever_train_inference: 320 | l.append(i_obj) 321 | return sum(l) 322 | 323 | def get_samples_and_objectives(self, model, data): 324 | space, sources = self.get_data_specs(model) 325 | space.validate(data) 326 | assert isinstance(model, AdversaryPair) 327 | g = model.generator 328 | d = model.discriminator 329 | 330 | # Note: this assumes data is design matrix 331 | X = data 332 | m = data.shape[space.get_batch_axis()] 333 | y1 = T.alloc(1, m, 1) 334 | y0 = T.alloc(0, m, 1) 335 | # NOTE: if this changes to optionally use dropout, change the inference 336 | # code below to use a non-dropped-out version. 
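        # y1/y0 are the "real"/"fake" targets. Below, the discriminator is
        # trained toward y1 on data and toward y0 on generated samples,
        # while the generator is trained toward y1 on its own samples;
        # with the usual sigmoid output layer this is the non-saturating
        # -log D(G(z)) generator objective rather than the raw minimax one
        # (unless blend_obj mixes the minimax term back in).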
337 | S, z, other_layers = g.sample_and_noise(m, default_input_include_prob=self.generator_default_input_include_prob, default_input_scale=self.generator_default_input_scale, all_g_layers=(self.infer_layer is not None)) 338 | 339 | if self.noise_both != 0.: 340 | rng = MRG_RandomStreams(2014 / 6 + 2) 341 | S = S + rng.normal(size=S.shape, dtype=S.dtype) * self.noise_both 342 | X = X + rng.normal(size=X.shape, dtype=S.dtype) * self.noise_both 343 | 344 | y_hat1 = d.dropout_fprop(X, self.discriminator_default_input_include_prob, 345 | self.discriminator_input_include_probs, 346 | self.discriminator_default_input_scale, 347 | self.discriminator_input_scales) 348 | y_hat0 = d.dropout_fprop(S, self.discriminator_default_input_include_prob, 349 | self.discriminator_input_include_probs, 350 | self.discriminator_default_input_scale, 351 | self.discriminator_input_scales) 352 | 353 | d_obj = 0.5 * (d.layers[-1].cost(y1, y_hat1) + d.layers[-1].cost(y0, y_hat0)) 354 | 355 | if self.no_drop_in_d_for_g: 356 | y_hat0_no_drop = d.dropout_fprop(S) 357 | g_obj = d.layers[-1].cost(y1, y_hat0_no_drop) 358 | else: 359 | g_obj = d.layers[-1].cost(y1, y_hat0) 360 | 361 | if self.blend_obj: 362 | g_obj = (self.zurich_coeff * g_obj - self.minimax_coeff * d_obj) / (self.zurich_coeff + self.minimax_coeff) 363 | 364 | if model.inferer is not None: 365 | # Change this if we ever switch to using dropout in the 366 | # construction of S. 367 | S_nograd = block_gradient(S) # Redundant as long as we have custom get_gradients 368 | pred = model.inferer.dropout_fprop(S_nograd, self.inference_default_input_include_prob, 369 | self.inference_input_include_probs, 370 | self.inference_default_input_scale, 371 | self.inference_input_scales) 372 | if self.infer_layer is None: 373 | target = z 374 | else: 375 | target = other_layers[self.infer_layer] 376 | i_obj = model.inferer.layers[-1].cost(target, pred) 377 | else: 378 | i_obj = 0 379 | 380 | return S, d_obj, g_obj, i_obj 381 | 382 | def get_gradients(self, model, data, **kwargs): 383 | space, sources = self.get_data_specs(model) 384 | space.validate(data) 385 | assert isinstance(model, AdversaryPair) 386 | g = model.generator 387 | d = model.discriminator 388 | 389 | S, d_obj, g_obj, i_obj = self.get_samples_and_objectives(model, data) 390 | 391 | g_params = g.get_params() 392 | d_params = d.get_params() 393 | for param in g_params: 394 | assert param not in d_params 395 | for param in d_params: 396 | assert param not in g_params 397 | d_grads = T.grad(d_obj, d_params) 398 | g_grads = T.grad(g_obj, g_params) 399 | 400 | if self.scale_grads: 401 | S_grad = T.grad(g_obj, S) 402 | scale = T.maximum(1., self.target_scale / T.sqrt(T.sqr(S_grad).sum())) 403 | g_grads = [g_grad * scale for g_grad in g_grads] 404 | 405 | rval = OrderedDict() 406 | zeros = itertools.repeat(theano.tensor.constant(0., dtype='float32')) 407 | if self.ever_train_discriminator: 408 | rval.update(OrderedDict(safe_zip(d_params, [self.now_train_discriminator * dg for dg in d_grads]))) 409 | else: 410 | rval.update(OrderedDict(zip(d_params, zeros))) 411 | if self.ever_train_generator: 412 | rval.update(OrderedDict(safe_zip(g_params, [self.now_train_generator * gg for gg in g_grads]))) 413 | else: 414 | rval.update(OrderedDict(zip(g_params, zeros))) 415 | if self.ever_train_inference and model.inferer is not None: 416 | i_params = model.inferer.get_params() 417 | i_grads = T.grad(i_obj, i_params) 418 | rval.update(OrderedDict(safe_zip(i_params, [self.now_train_inference * ig for ig in i_grads]))) 419 | elif 
model.inferer is not None: 420 | rval.update(OrderedDict(model.inferer.get_params(), zeros)) 421 | 422 | updates = OrderedDict() 423 | 424 | # Two d steps for every g step 425 | if self.alternate_g: 426 | updates[self.now_train_generator] = 1. - self.now_train_generator 427 | 428 | return rval, updates 429 | 430 | def get_monitoring_channels(self, model, data, **kwargs): 431 | 432 | rval = OrderedDict() 433 | 434 | m = data.shape[0] 435 | 436 | g = model.generator 437 | d = model.discriminator 438 | 439 | y_hat = d.fprop(data) 440 | 441 | rval['false_negatives'] = T.cast((y_hat < 0.5).mean(), 'float32') 442 | 443 | samples = g.sample(m) 444 | y_hat = d.fprop(samples) 445 | rval['false_positives'] = T.cast((y_hat > 0.5).mean(), 'float32') 446 | # y = T.alloc(0., m, 1) 447 | cost = d.cost_from_X((samples, y_hat)) 448 | sample_grad = T.grad(-cost, samples) 449 | rval['sample_grad_norm'] = T.sqrt(T.sqr(sample_grad).sum()) 450 | _S, d_obj, g_obj, i_obj = self.get_samples_and_objectives(model, data) 451 | if model.monitor_inference and i_obj != 0: 452 | rval['objective_i'] = i_obj 453 | if model.monitor_discriminator: 454 | rval['objective_d'] = d_obj 455 | if model.monitor_generator: 456 | rval['objective_g'] = g_obj 457 | 458 | rval['now_train_generator'] = self.now_train_generator 459 | return rval 460 | 461 | def recapitate_discriminator(pair_path, new_head): 462 | pair = serial.load(pair_path) 463 | d = pair.discriminator 464 | del d.layers[-1] 465 | d.add_layers([new_head]) 466 | return d 467 | 468 | def theano_parzen(data, mu, sigma): 469 | """ 470 | Credit: Yann N. Dauphin 471 | """ 472 | x = data 473 | 474 | a = ( x.dimshuffle(0, 'x', 1) - mu.dimshuffle('x', 0, 1) ) / sigma 475 | 476 | E = log_mean_exp(-0.5*(a**2).sum(2)) 477 | 478 | Z = mu.shape[1] * T.log(sigma * numpy.sqrt(numpy.pi * 2)) 479 | 480 | #return theano.function([x], E - Z) 481 | return E - Z 482 | 483 | 484 | def log_mean_exp(a): 485 | """ 486 | Credit: Yann N. 
Dauphin 487 | """ 488 | 489 | max_ = a.max(1) 490 | 491 | return max_ + T.log(T.exp(a - max_.dimshuffle(0, 'x')).mean(1)) 492 | 493 | class Sum(Layer): 494 | """ 495 | Monitoring channels are hardcoded for C01B batches 496 | """ 497 | 498 | def __init__(self, layer_name): 499 | Model.__init__(self) 500 | self.__dict__.update(locals()) 501 | del self.self 502 | self._params = [] 503 | 504 | def set_input_space(self, space): 505 | self.input_space = space 506 | assert isinstance(space, CompositeSpace) 507 | self.output_space = space.components[0] 508 | 509 | def fprop(self, state_below): 510 | rval = state_below[0] 511 | for i in xrange(1, len(state_below)): 512 | rval = rval + state_below[i] 513 | rval.came_from_sum = True 514 | return rval 515 | 516 | @functools.wraps(Layer.get_layer_monitoring_channels) 517 | def get_layer_monitoring_channels(self, state_below=None, 518 | state=None, targets=None): 519 | rval = OrderedDict() 520 | 521 | if state is None: 522 | state = self.fprop(state_below) 523 | vars_and_prefixes = [(state, '')] 524 | 525 | for var, prefix in vars_and_prefixes: 526 | if not hasattr(var, 'ndim') or var.ndim != 4: 527 | print "expected 4D tensor, got " 528 | print var 529 | print type(var) 530 | if isinstance(var, tuple): 531 | print "tuple length: ", len(var) 532 | assert False 533 | v_max = var.max(axis=(1, 2, 3)) 534 | v_min = var.min(axis=(1, 2, 3)) 535 | v_mean = var.mean(axis=(1, 2, 3)) 536 | v_range = v_max - v_min 537 | 538 | # max_x.mean_u is "the mean over *u*nits of the max over 539 | # e*x*amples" The x and u are included in the name because 540 | # otherwise its hard to remember which axis is which when reading 541 | # the monitor I use inner.outer rather than outer_of_inner or 542 | # something like that because I want mean_x.* to appear next to 543 | # each other in the alphabetical list, as these are commonly 544 | # plotted together 545 | for key, val in [('max_x.max_u', v_max.max()), 546 | ('max_x.mean_u', v_max.mean()), 547 | ('max_x.min_u', v_max.min()), 548 | ('min_x.max_u', v_min.max()), 549 | ('min_x.mean_u', v_min.mean()), 550 | ('min_x.min_u', v_min.min()), 551 | ('range_x.max_u', v_range.max()), 552 | ('range_x.mean_u', v_range.mean()), 553 | ('range_x.min_u', v_range.min()), 554 | ('mean_x.max_u', v_mean.max()), 555 | ('mean_x.mean_u', v_mean.mean()), 556 | ('mean_x.min_u', v_mean.min())]: 557 | rval[prefix+key] = val 558 | 559 | return rval 560 | 561 | def marginals(dataset): 562 | return dataset.X.mean(axis=0) 563 | 564 | class ActivateGenerator(TrainExtension): 565 | def __init__(self, active_after, value=1.): 566 | self.__dict__.update(locals()) 567 | del self.self 568 | self.cur_epoch = 0 569 | 570 | def on_monitor(self, model, dataset, algorithm): 571 | if self.cur_epoch == self.active_after: 572 | algorithm.cost.now_train_generator.set_value(np.array(self.value, dtype='float32')) 573 | self.cur_epoch += 1 574 | 575 | class InpaintingAdversaryCost(DefaultDataSpecsMixin, Cost): 576 | """ 577 | """ 578 | 579 | # Supplies own labels, don't get them from the dataset 580 | supervised = False 581 | 582 | def __init__(self, scale_grads=1, target_scale=.1, 583 | discriminator_default_input_include_prob = 1., 584 | discriminator_input_include_probs=None, 585 | discriminator_default_input_scale=1., 586 | discriminator_input_scales=None, 587 | generator_default_input_include_prob = 1., 588 | generator_default_input_scale=1., 589 | inference_default_input_include_prob=None, 590 | inference_input_include_probs=None, 591 | 
inference_default_input_scale=1., 592 | inference_input_scales=None, 593 | init_now_train_generator=True, 594 | ever_train_discriminator=True, 595 | ever_train_generator=True, 596 | ever_train_inference=True, 597 | no_drop_in_d_for_g=False, 598 | alternate_g = False): 599 | self.__dict__.update(locals()) 600 | del self.self 601 | # These allow you to dynamically switch off training parts. 602 | # If the corresponding ever_train_* is False, these have 603 | # no effect. 604 | self.now_train_generator = sharedX(init_now_train_generator) 605 | self.now_train_discriminator = sharedX(numpy.array(1., dtype='float32')) 606 | self.now_train_inference = sharedX(numpy.array(1., dtype='float32')) 607 | 608 | def expr(self, model, data, **kwargs): 609 | S, d_obj, g_obj, i_obj = self.get_samples_and_objectives(model, data) 610 | return d_obj + g_obj + i_obj 611 | 612 | def get_samples_and_objectives(self, model, data): 613 | space, sources = self.get_data_specs(model) 614 | space.validate(data) 615 | assert isinstance(model, AdversaryPair) 616 | g = model.generator 617 | d = model.discriminator 618 | 619 | # Note: this assumes data is b01c 620 | X = data 621 | assert X.ndim == 4 622 | m = data.shape[space.get_batch_axis()] 623 | y1 = T.alloc(1, m, 1) 624 | y0 = T.alloc(0, m, 1) 625 | # NOTE: if this changes to optionally use dropout, change the inference 626 | # code below to use a non-dropped-out version. 627 | S, z = g.inpainting_sample_and_noise(X, default_input_include_prob=self.generator_default_input_include_prob, default_input_scale=self.generator_default_input_scale) 628 | y_hat1 = d.dropout_fprop(X, self.discriminator_default_input_include_prob, 629 | self.discriminator_input_include_probs, 630 | self.discriminator_default_input_scale, 631 | self.discriminator_input_scales) 632 | y_hat0 = d.dropout_fprop(S, self.discriminator_default_input_include_prob, 633 | self.discriminator_input_include_probs, 634 | self.discriminator_default_input_scale, 635 | self.discriminator_input_scales) 636 | 637 | d_obj = 0.5 * (d.layers[-1].cost(y1, y_hat1) + d.layers[-1].cost(y0, y_hat0)) 638 | 639 | if self.no_drop_in_d_for_g: 640 | y_hat0_no_drop = d.dropout_fprop(S) 641 | g_obj = d.layers[-1].cost(y1, y_hat0) 642 | else: 643 | g_obj = d.layers[-1].cost(y1, y_hat0) 644 | 645 | if model.inferer is not None: 646 | # Change this if we ever switch to using dropout in the 647 | # construction of S. 
648 | S_nograd = block_gradient(S) # Redundant as long as we have custom get_gradients 649 | z_hat = model.inferer.dropout_fprop(S_nograd, self.inference_default_input_include_prob, 650 | self.inference_input_include_probs, 651 | self.inference_default_input_scale, 652 | self.inference_input_scales) 653 | i_obj = model.inferer.layers[-1].cost(z, z_hat) 654 | else: 655 | i_obj = 0 656 | 657 | return S, d_obj, g_obj, i_obj 658 | 659 | def get_gradients(self, model, data, **kwargs): 660 | space, sources = self.get_data_specs(model) 661 | space.validate(data) 662 | assert isinstance(model, AdversaryPair) 663 | g = model.generator 664 | d = model.discriminator 665 | 666 | S, d_obj, g_obj, i_obj = self.get_samples_and_objectives(model, data) 667 | 668 | g_params = g.get_params() 669 | d_params = d.get_params() 670 | for param in g_params: 671 | assert param not in d_params 672 | for param in d_params: 673 | assert param not in g_params 674 | d_grads = T.grad(d_obj, d_params) 675 | g_grads = T.grad(g_obj, g_params) 676 | 677 | if self.scale_grads: 678 | S_grad = T.grad(g_obj, S) 679 | scale = T.maximum(1., self.target_scale / T.sqrt(T.sqr(S_grad).sum())) 680 | g_grads = [g_grad * scale for g_grad in g_grads] 681 | 682 | rval = OrderedDict() 683 | if self.ever_train_discriminator: 684 | rval.update(OrderedDict(safe_zip(d_params, [self.now_train_discriminator * dg for dg in d_grads]))) 685 | else: 686 | rval.update(OrderedDict(zip(d_params, itertools.repeat(theano.tensor.constant(0., dtype='float32'))))) 687 | 688 | if self.ever_train_generator: 689 | rval.update(OrderedDict(safe_zip(g_params, [self.now_train_generator * gg for gg in g_grads]))) 690 | else: 691 | rval.update(OrderedDict(zip(g_params, itertools.repeat(theano.tensor.constant(0., dtype='float32'))))) 692 | 693 | if self.ever_train_inference and model.inferer is not None: 694 | i_params = model.inferer.get_params() 695 | i_grads = T.grad(i_obj, i_params) 696 | rval.update(OrderedDict(safe_zip(i_params, [self.now_train_inference * ig for ig in i_grads]))) 697 | 698 | updates = OrderedDict() 699 | 700 | # Two d steps for every g step 701 | if self.alternate_g: 702 | updates[self.now_train_generator] = 1. 
- self.now_train_generator 703 | 704 | return rval, updates 705 | 706 | def get_monitoring_channels(self, model, data, **kwargs): 707 | 708 | rval = OrderedDict() 709 | 710 | m = data.shape[0] 711 | 712 | g = model.generator 713 | d = model.discriminator 714 | 715 | y_hat = d.fprop(data) 716 | 717 | rval['false_negatives'] = T.cast((y_hat < 0.5).mean(), 'float32') 718 | 719 | samples, noise = g.inpainting_sample_and_noise(data) 720 | y_hat = d.fprop(samples) 721 | rval['false_positives'] = T.cast((y_hat > 0.5).mean(), 'float32') 722 | # y = T.alloc(0., m, 1) 723 | cost = d.cost_from_X((samples, y_hat)) 724 | sample_grad = T.grad(-cost, samples) 725 | rval['sample_grad_norm'] = T.sqrt(T.sqr(sample_grad).sum()) 726 | _S, d_obj, g_obj, i_obj = self.get_samples_and_objectives(model, data) 727 | if i_obj != 0: 728 | rval['objective_i'] = i_obj 729 | rval['objective_d'] = d_obj 730 | rval['objective_g'] = g_obj 731 | 732 | rval['now_train_generator'] = self.now_train_generator 733 | return rval 734 | 735 | class Cycler(object): 736 | 737 | def __init__(self, k): 738 | self.__dict__.update(locals()) 739 | del self.self 740 | self.i = 0 741 | 742 | def __call__(self, sgd): 743 | self.i = (self.i + 1) % self.k 744 | sgd.cost.now_train_generator.set_value(np.cast['float32'](self.i == 0)) 745 | 746 | class NoiseCat(Layer): 747 | 748 | def __init__(self, new_dim, std, layer_name): 749 | Layer.__init__(self) 750 | self.__dict__.update(locals()) 751 | del self.self 752 | self._params = [] 753 | 754 | def set_input_space(self, space): 755 | assert isinstance(space, VectorSpace) 756 | self.input_space = space 757 | self.output_space = VectorSpace(space.dim + self.new_dim) 758 | self.theano_rng = MRG_RandomStreams(self.mlp.rng.randint(2 ** 16)) 759 | 760 | def fprop(self, state): 761 | noise = self.theano_rng.normal(std=self.std, avg=0., size=(state.shape[0], self.new_dim), 762 | dtype=state.dtype) 763 | return T.concatenate((state, noise), axis=1) 764 | 765 | class RectifiedLinear(Layer): 766 | 767 | def __init__(self, layer_name, left_slope=0.0, **kwargs): 768 | super(RectifiedLinear, self).__init__(**kwargs) 769 | self.__dict__.update(locals()) 770 | del self.self 771 | self._params = [] 772 | 773 | def set_input_space(self, space): 774 | self.input_space = space 775 | self.output_space = space 776 | 777 | def fprop(self, state_below): 778 | p = state_below 779 | p = T.switch(p > 0., p, self.left_slope * p) 780 | return p 781 | 782 | class Sigmoid(Layer): 783 | 784 | def __init__(self, layer_name, left_slope=0.0, **kwargs): 785 | super(Sigmoid, self).__init__(**kwargs) 786 | self.__dict__.update(locals()) 787 | del self.self 788 | self._params = [] 789 | 790 | def set_input_space(self, space): 791 | self.input_space = space 792 | self.output_space = space 793 | 794 | def fprop(self, state_below): 795 | p = T.nnet.sigmoid(state_below) 796 | return p 797 | 798 | class SubtractHalf(Layer): 799 | 800 | def __init__(self, layer_name, left_slope=0.0, **kwargs): 801 | super(SubtractHalf, self).__init__(**kwargs) 802 | self.__dict__.update(locals()) 803 | del self.self 804 | self._params = [] 805 | 806 | def set_input_space(self, space): 807 | self.input_space = space 808 | self.output_space = space 809 | 810 | def fprop(self, state_below): 811 | return state_below - 0.5 812 | 813 | def get_weights(self): 814 | return self.mlp.layers[1].get_weights() 815 | 816 | def get_weights_format(self): 817 | return self.mlp.layers[1].get_weights_format() 818 | 819 | def get_weights_view_shape(self): 820 | return 
self.mlp.layers[1].get_weights_view_shape()
821 |
822 | class SubtractRealMean(Layer):
823 |
824 |     def __init__(self, layer_name, dataset, also_sd = False, **kwargs):
825 |         super(SubtractRealMean, self).__init__(**kwargs)
826 |         self.__dict__.update(locals())
827 |         del self.self
828 |         self._params = []
829 |         self.mean = sharedX(dataset.X.mean(axis=0))
830 |         if also_sd:
831 |             self.sd = sharedX(dataset.X.std(axis=0))
832 |         del self.dataset
833 |
834 |     def set_input_space(self, space):
835 |         self.input_space = space
836 |         self.output_space = space
837 |
838 |     def fprop(self, state_below):
839 |         return (state_below - self.mean) / (self.sd if self.also_sd else 1.)  # self.sd only exists when also_sd=True
840 |
841 |     def get_weights(self):
842 |         return self.mlp.layers[1].get_weights()
843 |
844 |     def get_weights_format(self):
845 |         return self.mlp.layers[1].get_weights_format()
846 |
847 |     def get_weights_view_shape(self):
848 |         return self.mlp.layers[1].get_weights_view_shape()
849 |
850 |
851 | class Clusterize(Layer):
852 |
853 |     def __init__(self, scale, layer_name):
854 |         Layer.__init__(self)
855 |         self.__dict__.update(locals())
856 |         del self.self
857 |         self._params = []
858 |
859 |     def set_input_space(self, space):
860 |         assert isinstance(space, VectorSpace)
861 |         self.input_space = space
862 |         self.output_space = space
863 |         self.theano_rng = MRG_RandomStreams(self.mlp.rng.randint(2 ** 16))
864 |
865 |     def fprop(self, state):
866 |         noise = self.theano_rng.binomial(size=state.shape, p=0.5,
867 |                                          dtype=state.dtype) * 2. - 1.
868 |         return state + self.scale * noise
869 |
870 |
871 |
872 | class ThresholdedAdversaryCost(DefaultDataSpecsMixin, Cost):
873 |     """Adversarial cost in which the generator is penalized only on samples
874 |     that the discriminator still scores below 0.5, i.e. still detects as fake."""
875 |
876 |     # Supplies own labels, don't get them from the dataset
877 |     supervised = False
878 |
879 |     def __init__(self, scale_grads=1, target_scale=.1,
880 |                  discriminator_default_input_include_prob = 1.,
881 |                  discriminator_input_include_probs=None,
882 |                  discriminator_default_input_scale=1.,
883 |                  discriminator_input_scales=None,
884 |                  generator_default_input_include_prob = 1.,
885 |                  generator_default_input_scale=1.,
886 |                  inference_default_input_include_prob=None,
887 |                  inference_input_include_probs=None,
888 |                  inference_default_input_scale=1.,
889 |                  inference_input_scales=None,
890 |                  init_now_train_generator=True,
891 |                  ever_train_discriminator=True,
892 |                  ever_train_generator=True,
893 |                  ever_train_inference=True,
894 |                  no_drop_in_d_for_g=False,
895 |                  alternate_g = False,
896 |                  infer_layer=None,
897 |                  noise_both = 0.):
898 |         self.__dict__.update(locals())
899 |         del self.self
900 |         # These allow you to dynamically switch off training parts.
901 |         # If the corresponding ever_train_* is False, these have
902 |         # no effect.
903 |         self.now_train_generator = sharedX(init_now_train_generator)
904 |         self.now_train_discriminator = sharedX(numpy.array(1., dtype='float32'))
905 |         self.now_train_inference = sharedX(numpy.array(1., dtype='float32'))
906 |
907 |     def expr(self, model, data, **kwargs):
908 |         S, d_obj, g_obj, i_obj = self.get_samples_and_objectives(model, data)
909 |         l = []
910 |         # This stops stuff from ever getting computed if we're not training
911 |         # it.
912 | if self.ever_train_discriminator: 913 | l.append(d_obj) 914 | if self.ever_train_generator: 915 | l.append(g_obj) 916 | if self.ever_train_inference: 917 | l.append(i_obj) 918 | return sum(l) 919 | 920 | def get_samples_and_objectives(self, model, data): 921 | space, sources = self.get_data_specs(model) 922 | space.validate(data) 923 | assert isinstance(model, AdversaryPair) 924 | g = model.generator 925 | d = model.discriminator 926 | 927 | # Note: this assumes data is design matrix 928 | X = data 929 | m = data.shape[space.get_batch_axis()] 930 | y1 = T.alloc(1, m, 1) 931 | y0 = T.alloc(0, m, 1) 932 | # NOTE: if this changes to optionally use dropout, change the inference 933 | # code below to use a non-dropped-out version. 934 | S, z, other_layers = g.sample_and_noise(m, default_input_include_prob=self.generator_default_input_include_prob, default_input_scale=self.generator_default_input_scale, all_g_layers=(self.infer_layer is not None)) 935 | 936 | if self.noise_both != 0.: 937 | rng = MRG_RandomStreams(2014 / 6 + 2) 938 | S = S + rng.normal(size=S.shape, dtype=S.dtype) * self.noise_both 939 | X = X + rng.normal(size=X.shape, dtype=S.dtype) * self.noise_both 940 | 941 | y_hat1 = d.dropout_fprop(X, self.discriminator_default_input_include_prob, 942 | self.discriminator_input_include_probs, 943 | self.discriminator_default_input_scale, 944 | self.discriminator_input_scales) 945 | y_hat0 = d.dropout_fprop(S, self.discriminator_default_input_include_prob, 946 | self.discriminator_input_include_probs, 947 | self.discriminator_default_input_scale, 948 | self.discriminator_input_scales) 949 | 950 | d_obj = 0.5 * (d.layers[-1].cost(y1, y_hat1) + d.layers[-1].cost(y0, y_hat0)) 951 | 952 | if self.no_drop_in_d_for_g: 953 | y_hat0_no_drop = d.dropout_fprop(S) 954 | g_cost_mat = d.layers[-1].cost_matrix(y1, y_hat0_no_drop) 955 | else: 956 | g_cost_mat = d.layers[-1].cost_matrix(y1, y_hat0) 957 | assert g_cost_mat.ndim == 2 958 | assert y_hat0.ndim == 2 959 | 960 | mask = y_hat0 < 0.5 961 | masked_cost = g_cost_mat * mask 962 | g_obj = masked_cost.mean() 963 | 964 | 965 | if model.inferer is not None: 966 | # Change this if we ever switch to using dropout in the 967 | # construction of S. 
968 |             S_nograd = block_gradient(S) # Redundant as long as we have custom get_gradients
969 |             pred = model.inferer.dropout_fprop(S_nograd, self.inference_default_input_include_prob,
970 |                                                self.inference_input_include_probs,
971 |                                                self.inference_default_input_scale,
972 |                                                self.inference_input_scales)
973 |             if self.infer_layer is None:
974 |                 target = z
975 |             else:
976 |                 target = other_layers[self.infer_layer]
977 |             i_obj = model.inferer.layers[-1].cost(target, pred)
978 |         else:
979 |             i_obj = 0
980 |
981 |         return S, d_obj, g_obj, i_obj
982 |
983 |     def get_gradients(self, model, data, **kwargs):
984 |         space, sources = self.get_data_specs(model)
985 |         space.validate(data)
986 |         assert isinstance(model, AdversaryPair)
987 |         g = model.generator
988 |         d = model.discriminator
989 |
990 |         S, d_obj, g_obj, i_obj = self.get_samples_and_objectives(model, data)
991 |
992 |         g_params = g.get_params()
993 |         d_params = d.get_params()
994 |         for param in g_params:
995 |             assert param not in d_params
996 |         for param in d_params:
997 |             assert param not in g_params
998 |         d_grads = T.grad(d_obj, d_params)
999 |         g_grads = T.grad(g_obj, g_params)
1000 |
1001 |         if self.scale_grads:
1002 |             S_grad = T.grad(g_obj, S)
1003 |             scale = T.maximum(1., self.target_scale / T.sqrt(T.sqr(S_grad).sum()))
1004 |             g_grads = [g_grad * scale for g_grad in g_grads]
1005 |
1006 |         rval = OrderedDict()
1007 |         zeros = itertools.repeat(theano.tensor.constant(0., dtype='float32'))
1008 |         if self.ever_train_discriminator:
1009 |             rval.update(OrderedDict(safe_zip(d_params, [self.now_train_discriminator * dg for dg in d_grads])))
1010 |         else:
1011 |             rval.update(OrderedDict(zip(d_params, zeros)))
1012 |         if self.ever_train_generator:
1013 |             rval.update(OrderedDict(safe_zip(g_params, [self.now_train_generator * gg for gg in g_grads])))
1014 |         else:
1015 |             rval.update(OrderedDict(zip(g_params, zeros)))
1016 |         if self.ever_train_inference and model.inferer is not None:
1017 |             i_params = model.inferer.get_params()
1018 |             i_grads = T.grad(i_obj, i_params)
1019 |             rval.update(OrderedDict(safe_zip(i_params, [self.now_train_inference * ig for ig in i_grads])))
1020 |         elif model.inferer is not None:
1021 |             rval.update(OrderedDict(zip(model.inferer.get_params(), zeros)))
1022 |
1023 |         updates = OrderedDict()
1024 |
1025 |         # Two d steps for every g step
1026 |         if self.alternate_g:
1027 |             updates[self.now_train_generator] = 1.
- self.now_train_generator 1028 | 1029 | return rval, updates 1030 | 1031 | def get_monitoring_channels(self, model, data, **kwargs): 1032 | 1033 | rval = OrderedDict() 1034 | 1035 | m = data.shape[0] 1036 | 1037 | g = model.generator 1038 | d = model.discriminator 1039 | 1040 | y_hat = d.fprop(data) 1041 | 1042 | rval['false_negatives'] = T.cast((y_hat < 0.5).mean(), 'float32') 1043 | 1044 | samples = g.sample(m) 1045 | y_hat = d.fprop(samples) 1046 | rval['false_positives'] = T.cast((y_hat > 0.5).mean(), 'float32') 1047 | # y = T.alloc(0., m, 1) 1048 | cost = d.cost_from_X((samples, y_hat)) 1049 | sample_grad = T.grad(-cost, samples) 1050 | rval['sample_grad_norm'] = T.sqrt(T.sqr(sample_grad).sum()) 1051 | _S, d_obj, g_obj, i_obj = self.get_samples_and_objectives(model, data) 1052 | if model.monitor_inference and i_obj != 0: 1053 | rval['objective_i'] = i_obj 1054 | if model.monitor_discriminator: 1055 | rval['objective_d'] = d_obj 1056 | if model.monitor_generator: 1057 | rval['objective_g'] = g_obj 1058 | 1059 | rval['now_train_generator'] = self.now_train_generator 1060 | return rval 1061 | 1062 | 1063 | class HardSigmoid(Linear): 1064 | """ 1065 | Hard "sigmoid" (note: shifted along the x axis) 1066 | """ 1067 | 1068 | def __init__(self, left_slope=0.0, **kwargs): 1069 | super(HardSigmoid, self).__init__(**kwargs) 1070 | self.left_slope = left_slope 1071 | 1072 | @wraps(Layer.fprop) 1073 | def fprop(self, state_below): 1074 | 1075 | p = self._linear_part(state_below) 1076 | # Original: p = p * (p > 0.) + self.left_slope * p * (p < 0.) 1077 | # T.switch is faster. 1078 | # For details, see benchmarks in 1079 | # pylearn2/scripts/benchmark/time_relu.py 1080 | p = T.clip(p, 0., 1.) 1081 | return p 1082 | 1083 | @wraps(Layer.cost) 1084 | def cost(self, *args, **kwargs): 1085 | 1086 | raise NotImplementedError() 1087 | 1088 | 1089 | class LazyAdversaryCost(DefaultDataSpecsMixin, Cost): 1090 | """ 1091 | """ 1092 | 1093 | # Supplies own labels, don't get them from the dataset 1094 | supervised = False 1095 | 1096 | def __init__(self, scale_grads=1, target_scale=.1, 1097 | discriminator_default_input_include_prob = 1., 1098 | discriminator_input_include_probs=None, 1099 | discriminator_default_input_scale=1., 1100 | discriminator_input_scales=None, 1101 | generator_default_input_include_prob = 1., 1102 | generator_default_input_scale=1., 1103 | inference_default_input_include_prob=None, 1104 | inference_input_include_probs=None, 1105 | inference_default_input_scale=1., 1106 | inference_input_scales=None, 1107 | init_now_train_generator=True, 1108 | ever_train_discriminator=True, 1109 | ever_train_generator=True, 1110 | ever_train_inference=True, 1111 | no_drop_in_d_for_g=False, 1112 | alternate_g = False, 1113 | infer_layer=None, 1114 | noise_both = 0., 1115 | g_eps = 0., 1116 | d_eps =0.): 1117 | self.__dict__.update(locals()) 1118 | del self.self 1119 | # These allow you to dynamically switch off training parts. 1120 | # If the corresponding ever_train_* is False, these have 1121 | # no effect. 1122 | self.now_train_generator = sharedX(init_now_train_generator) 1123 | self.now_train_discriminator = sharedX(numpy.array(1., dtype='float32')) 1124 | self.now_train_inference = sharedX(numpy.array(1., dtype='float32')) 1125 | 1126 | def expr(self, model, data, **kwargs): 1127 | S, d_obj, g_obj, i_obj = self.get_samples_and_objectives(model, data) 1128 | l = [] 1129 | # This stops stuff from ever getting computed if we're not training 1130 | # it. 
1131 | if self.ever_train_discriminator: 1132 | l.append(d_obj) 1133 | if self.ever_train_generator: 1134 | l.append(g_obj) 1135 | if self.ever_train_inference: 1136 | l.append(i_obj) 1137 | return sum(l) 1138 | 1139 | def get_samples_and_objectives(self, model, data): 1140 | space, sources = self.get_data_specs(model) 1141 | space.validate(data) 1142 | assert isinstance(model, AdversaryPair) 1143 | g = model.generator 1144 | d = model.discriminator 1145 | 1146 | # Note: this assumes data is design matrix 1147 | X = data 1148 | m = data.shape[space.get_batch_axis()] 1149 | y1 = T.alloc(1, m, 1) 1150 | y0 = T.alloc(0, m, 1) 1151 | # NOTE: if this changes to optionally use dropout, change the inference 1152 | # code below to use a non-dropped-out version. 1153 | S, z, other_layers = g.sample_and_noise(m, default_input_include_prob=self.generator_default_input_include_prob, default_input_scale=self.generator_default_input_scale, all_g_layers=(self.infer_layer is not None)) 1154 | 1155 | if self.noise_both != 0.: 1156 | rng = MRG_RandomStreams(2014 / 6 + 2) 1157 | S = S + rng.normal(size=S.shape, dtype=S.dtype) * self.noise_both 1158 | X = X + rng.normal(size=X.shape, dtype=S.dtype) * self.noise_both 1159 | 1160 | y_hat1 = d.dropout_fprop(X, self.discriminator_default_input_include_prob, 1161 | self.discriminator_input_include_probs, 1162 | self.discriminator_default_input_scale, 1163 | self.discriminator_input_scales) 1164 | y_hat0 = d.dropout_fprop(S, self.discriminator_default_input_include_prob, 1165 | self.discriminator_input_include_probs, 1166 | self.discriminator_default_input_scale, 1167 | self.discriminator_input_scales) 1168 | 1169 | # d_obj = 0.5 * (d.layers[-1].cost(y1, y_hat1) + d.layers[-1].cost(y0, y_hat0)) 1170 | 1171 | pos_mask = y_hat1 < .5 + self.d_eps 1172 | neg_mask = y_hat0 > .5 - self.d_eps 1173 | 1174 | pos_cost_matrix = d.layers[-1].cost_matrix(y1, y_hat1) 1175 | neg_cost_matrix = d.layers[-1].cost_matrix(y0, y_hat0) 1176 | 1177 | pos_cost = (pos_mask * pos_cost_matrix).mean() 1178 | neg_cost = (neg_mask * neg_cost_matrix).mean() 1179 | 1180 | d_obj = 0.5 * (pos_cost + neg_cost) 1181 | 1182 | if self.no_drop_in_d_for_g: 1183 | y_hat0_no_drop = d.dropout_fprop(S) 1184 | g_cost_mat = d.layers[-1].cost_matrix(y1, y_hat0_no_drop) 1185 | else: 1186 | g_cost_mat = d.layers[-1].cost_matrix(y1, y_hat0) 1187 | assert g_cost_mat.ndim == 2 1188 | assert y_hat0.ndim == 2 1189 | 1190 | mask = y_hat0 < 0.5 + self.g_eps 1191 | masked_cost = g_cost_mat * mask 1192 | g_obj = masked_cost.mean() 1193 | 1194 | 1195 | if model.inferer is not None: 1196 | # Change this if we ever switch to using dropout in the 1197 | # construction of S. 
1198 |             S_nograd = block_gradient(S) # Redundant as long as we have custom get_gradients
1199 |             pred = model.inferer.dropout_fprop(S_nograd, self.inference_default_input_include_prob,
1200 |                                                self.inference_input_include_probs,
1201 |                                                self.inference_default_input_scale,
1202 |                                                self.inference_input_scales)
1203 |             if self.infer_layer is None:
1204 |                 target = z
1205 |             else:
1206 |                 target = other_layers[self.infer_layer]
1207 |             i_obj = model.inferer.layers[-1].cost(target, pred)
1208 |         else:
1209 |             i_obj = 0
1210 |
1211 |         return S, d_obj, g_obj, i_obj
1212 |
1213 |     def get_gradients(self, model, data, **kwargs):
1214 |         space, sources = self.get_data_specs(model)
1215 |         space.validate(data)
1216 |         assert isinstance(model, AdversaryPair)
1217 |         g = model.generator
1218 |         d = model.discriminator
1219 |
1220 |         S, d_obj, g_obj, i_obj = self.get_samples_and_objectives(model, data)
1221 |
1222 |         g_params = g.get_params()
1223 |         d_params = d.get_params()
1224 |         for param in g_params:
1225 |             assert param not in d_params
1226 |         for param in d_params:
1227 |             assert param not in g_params
1228 |         d_grads = T.grad(d_obj, d_params)
1229 |         g_grads = T.grad(g_obj, g_params)
1230 |
1231 |         if self.scale_grads:
1232 |             S_grad = T.grad(g_obj, S)
1233 |             scale = T.maximum(1., self.target_scale / T.sqrt(T.sqr(S_grad).sum()))
1234 |             g_grads = [g_grad * scale for g_grad in g_grads]
1235 |
1236 |         rval = OrderedDict()
1237 |         zeros = itertools.repeat(theano.tensor.constant(0., dtype='float32'))
1238 |         if self.ever_train_discriminator:
1239 |             rval.update(OrderedDict(safe_zip(d_params, [self.now_train_discriminator * dg for dg in d_grads])))
1240 |         else:
1241 |             rval.update(OrderedDict(zip(d_params, zeros)))
1242 |         if self.ever_train_generator:
1243 |             rval.update(OrderedDict(safe_zip(g_params, [self.now_train_generator * gg for gg in g_grads])))
1244 |         else:
1245 |             rval.update(OrderedDict(zip(g_params, zeros)))
1246 |         if self.ever_train_inference and model.inferer is not None:
1247 |             i_params = model.inferer.get_params()
1248 |             i_grads = T.grad(i_obj, i_params)
1249 |             rval.update(OrderedDict(safe_zip(i_params, [self.now_train_inference * ig for ig in i_grads])))
1250 |         elif model.inferer is not None:
1251 |             rval.update(OrderedDict(zip(model.inferer.get_params(), zeros)))
1252 |
1253 |         updates = OrderedDict()
1254 |
1255 |         # Two d steps for every g step
1256 |         if self.alternate_g:
1257 |             updates[self.now_train_generator] = 1.
- self.now_train_generator 1258 | 1259 | return rval, updates 1260 | 1261 | def get_monitoring_channels(self, model, data, **kwargs): 1262 | 1263 | rval = OrderedDict() 1264 | 1265 | m = data.shape[0] 1266 | 1267 | g = model.generator 1268 | d = model.discriminator 1269 | 1270 | y_hat = d.fprop(data) 1271 | 1272 | rval['false_negatives'] = T.cast((y_hat < 0.5).mean(), 'float32') 1273 | 1274 | samples = g.sample(m) 1275 | y_hat = d.fprop(samples) 1276 | rval['false_positives'] = T.cast((y_hat > 0.5).mean(), 'float32') 1277 | # y = T.alloc(0., m, 1) 1278 | cost = d.cost_from_X((samples, y_hat)) 1279 | sample_grad = T.grad(-cost, samples) 1280 | rval['sample_grad_norm'] = T.sqrt(T.sqr(sample_grad).sum()) 1281 | _S, d_obj, g_obj, i_obj = self.get_samples_and_objectives(model, data) 1282 | if model.monitor_inference and i_obj != 0: 1283 | rval['objective_i'] = i_obj 1284 | if model.monitor_discriminator: 1285 | rval['objective_d'] = d_obj 1286 | if model.monitor_generator: 1287 | rval['objective_g'] = g_obj 1288 | 1289 | rval['now_train_generator'] = self.now_train_generator 1290 | return rval 1291 | --------------------------------------------------------------------------------
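
Note on usage: the cost classes above only produce per-update gradients; the alternating schedule (e.g. "two d steps for every g step") is driven either by passing alternate_g=True, which toggles now_train_generator from inside get_gradients, or by attaching a Cycler to the training algorithm so that it flips now_train_generator between minibatches. The following is a minimal, illustrative sketch of that wiring with the stock pylearn2 SGD class; it is not one of this repository's scripts. The import path "adversarial", the model (an AdversaryPair) and dataset objects, and the hyperparameter values are assumptions for illustration, and the repository's own sgd.py variant could be substituted for the stock SGD.

# Illustrative sketch (assumptions flagged above): wire a cost and a Cycler
# into pylearn2's SGD so the generator is only updated on every k-th minibatch.
from pylearn2.train import Train
from pylearn2.training_algorithms.sgd import SGD

import adversarial  # assumed import path for the module listed above

cost = adversarial.ThresholdedAdversaryCost()
# Cycler(k=2) sets now_train_generator to 1 on every second update, so the
# discriminator trains on every minibatch and the generator on every other one.
cycler = adversarial.Cycler(k=2)

algorithm = SGD(learning_rate=.01,           # placeholder value
                batch_size=100,              # placeholder value
                cost=cost,
                update_callbacks=[cycler])   # SGD calls each callback with the SGD instance after every update

Train(dataset=dataset, model=model, algorithm=algorithm).main_loop()

A similar 1:1 alternation can be had without any callback by constructing the cost with alternate_g=True, since its get_gradients then flips now_train_generator on every update.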