├── __init__.py ├── environment.yml ├── LICENSE ├── utils.py ├── README.md ├── nn.py ├── run.py └── ladder.py /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: ladder 2 | dependencies: 3 | - hdf5=1.8.15.1=2 4 | - h5py=2.5.0=np110py27_4 5 | - matplotlib=1.5.1=np110py27_0 6 | - nomkl=1.0 7 | - openblas=0.2.14 8 | - numpy=1.10.4=py27_nomkl_2 9 | - pandas=0.17.1=np110py27_0 10 | - pytables=3.2.2=np110py27_0 11 | - python=2.7.11=0 12 | - scipy=0.17.0=np110py27_nomkl_0 13 | - pip 14 | - pip: 15 | - git+git://github.com/Theano/Theano.git@rel-0.8.2 16 | - git+git://github.com/mila-udem/fuel.git@0.2.0 17 | - git+git://github.com/mila-udem/blocks.git@0.2 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 arasmus 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import logging 4 | import numpy as np 5 | import theano 6 | from pandas import DataFrame, read_hdf 7 | 8 | from blocks.extensions import Printing, SimpleExtension 9 | from blocks.main_loop import MainLoop 10 | from blocks.roles import add_role 11 | 12 | logger = logging.getLogger('main.utils') 13 | 14 | 15 | def shared_param(init, name, cast_float32, role, **kwargs): 16 | if cast_float32: 17 | v = np.float32(init) 18 | p = theano.shared(v, name=name, **kwargs) 19 | add_role(p, role) 20 | return p 21 | 22 | 23 | class AttributeDict(dict): 24 | __getattr__ = dict.__getitem__ 25 | 26 | def __setattr__(self, a, b): 27 | self.__setitem__(a, b) 28 | 29 | 30 | class DummyLoop(MainLoop): 31 | def __init__(self, extensions): 32 | return super(DummyLoop, self).__init__(algorithm=None, 33 | data_stream=None, 34 | extensions=extensions) 35 | 36 | def run(self): 37 | for extension in self.extensions: 38 | extension.main_loop = self 39 | self._run_extensions('before_training') 40 | self._run_extensions('after_training') 41 | 42 | 43 | class ShortPrinting(Printing): 44 | def __init__(self, to_print, use_log=True, **kwargs): 45 | self.to_print = to_print 46 | self.use_log = use_log 47 | super(ShortPrinting, self).__init__(**kwargs) 48 | 49 | def do(self, which_callback, *args): 50 | log = self.main_loop.log 51 | 52 | # Iteration 53 | msg = "e {}, i {}:".format( 54 | log.status['epochs_done'], 55 | log.status['iterations_done']) 56 | 57 | # Requested channels 58 | items = [] 59 | for k, vars in self.to_print.iteritems(): 60 | for shortname, vars in vars.iteritems(): 61 | if vars is None: 62 | continue 63 | if type(vars) is not list: 64 | vars = [vars] 65 | 66 | s = "" 67 | for var in vars: 68 | try: 69 | name = k + '_' + var.name 70 | val = log.current_row[name] 71 | except: 72 | continue 73 | try: 74 | s += ' ' + ' '.join(["%.3g" % v for v in val]) 75 | except: 76 | s += " %.3g" % val 77 | if s != "": 78 | items += [shortname + s] 79 | msg = msg + ", ".join(items) 80 | if self.use_log: 81 | logger.info(msg) 82 | else: 83 | print msg 84 | 85 | 86 | class SaveParams(SimpleExtension): 87 | """Finishes the training process when triggered.""" 88 | def __init__(self, trigger_var, params, save_path, **kwargs): 89 | super(SaveParams, self).__init__(**kwargs) 90 | if trigger_var is None: 91 | self.var_name = None 92 | else: 93 | self.var_name = trigger_var[0] + '_' + trigger_var[1].name 94 | self.save_path = save_path 95 | self.params = params 96 | self.to_save = {} 97 | self.best_value = None 98 | self.add_condition(['after_training'], self.save) 99 | self.add_condition(['on_interrupt'], self.save) 100 | 101 | def save(self, which_callback, *args): 102 | if self.var_name is None: 103 | self.to_save = {v.name: v.get_value() for v in self.params} 104 | path = self.save_path + '/trained_params' 105 | logger.info('Saving to %s' % path) 106 | np.savez_compressed(path, **self.to_save) 107 | 108 | def do(self, which_callback, *args): 109 | if self.var_name is None: 110 | return 111 | val = self.main_loop.log.current_row[self.var_name] 112 | if self.best_value is None or val < self.best_value: 113 | self.best_value = val 114 | self.to_save = {v.name: v.get_value() for v in self.params} 115 | 116 | 117 | class SaveExpParams(SimpleExtension): 118 | def __init__(self, experiment_params, dir, **kwargs): 119 | 
super(SaveExpParams, self).__init__(**kwargs) 120 | self.dir = dir 121 | self.experiment_params = experiment_params 122 | 123 | def do(self, which_callback, *args): 124 | df = DataFrame.from_dict(self.experiment_params, orient='index') 125 | df.to_hdf(os.path.join(self.dir, 'params'), 'params', mode='w', 126 | complevel=5, complib='blosc') 127 | 128 | 129 | class SaveLog(SimpleExtension): 130 | def __init__(self, dir, show=None, **kwargs): 131 | super(SaveLog, self).__init__(**kwargs) 132 | self.dir = dir 133 | self.show = show if show is not None else [] 134 | 135 | def do(self, which_callback, *args): 136 | df = DataFrame.from_dict(self.main_loop.log, orient='index') 137 | df.to_hdf(os.path.join(self.dir, 'log'), 'log', mode='w', 138 | complevel=5, complib='blosc') 139 | 140 | 141 | def prepare_dir(save_to, results_dir='results'): 142 | base = os.path.join(results_dir, save_to) 143 | i = 0 144 | 145 | while True: 146 | name = base + str(i) 147 | try: 148 | os.makedirs(name) 149 | break 150 | except: 151 | i += 1 152 | 153 | return name 154 | 155 | 156 | def load_df(dirpath, filename, varname=None): 157 | varname = filename if varname is None else varname 158 | fn = os.path.join(dirpath, filename) 159 | return read_hdf(fn, varname) 160 | 161 | 162 | def filter_funcs_prefix(d, pfx): 163 | pfx = 'cmd_' 164 | fp = lambda x: x.find(pfx) 165 | return {n[fp(n) + len(pfx):]: v for n, v in d.iteritems() if fp(n) >= 0} 166 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This repository contains source code for the experiments in a paper titled 2 | [_Semi-Supervised Learning with Ladder Networks_](http://arxiv.org/abs/1507.02672) by A Rasmus, H Valpola, M Honkala, 3 | M Berglund, and T Raiko. 4 | 5 | #### Required libraries 6 | ##### Install Theano, Blocks Stable 0.2, Fuel Stable 0.2 7 | Refer to the [Blocks installation instructions](http://blocks.readthedocs.org/en/latest/setup.html) for 8 | details but use tag v0.2 instead. Something along the lines of: 9 | ``` 10 | pip install git+git://github.com/mila-udem/blocks.git@v0.2 11 | pip install git+git://github.com/mila-udem/fuel.git@v0.2.0 12 | ``` 13 | Fuel comes with Blocks, but you need to download and convert the datasets. 14 | Refer to the Fuel documentation. One might need to rename the converted files. 15 | ``` 16 | fuel-download mnist 17 | fuel-convert mnist --dtype float32 18 | fuel-download cifar10 19 | fuel-convert cifar10 20 | ``` 21 | ##### Alternatively, one can use the environment.yml file that is provided in this repo to create a conda environment. 22 | 1. First install Anaconda from https://www.continuum.io/downloads. 23 | 2. `conda env create -f environment.yml` 24 | 3. `source activate ladder` 25 | 4. The environment should be good to go! 26 | 27 | #### Models in the paper 28 | 29 | The following commands train the models with seed 1. The reported numbers in the paper are averages over 30 | several random seeds. These commands use all the training samples for training (`--unlabeled-samples 60000`) 31 | and leave none for validation. This results in a lot of NaNs being printed during the training, since 32 | the validation statistics are not available. If you want to observe the validation error and costs during the 33 | training, use `--unlabeled-samples 50000`.
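Each entry of `--denoising-cost-x` is the weight of the denoising cost for one layer, counted from the input layer upward. `run.py` prepends the input layer to `--encoder-layers`, so the six-layer encoder specs below need seven comma-separated weights, and a weight of 0 simply disables the denoising cost for that layer (this is how the bottom-only, Gamma, and supervised-baseline variants are obtained). A minimal sketch of that correspondence, with illustrative values taken from the first command below (this snippet is not part of the repo):
```python
# Illustrative only: how --encoder-layers and --denoising-cost-x line up.
# run.py prepends the input layer to the encoder spec, so the cost list has
# one more entry than the spec passed on the command line.
encoder_spec = '1000-500-250-250-250-10'.split('-')            # six encoder layers
layers = ['input'] + encoder_spec                              # seven layers in total
denoising_cost_x = [1000., 1., 0.01, 0.01, 0.01, 0.01, 0.01]   # seven weights
assert len(layers) == len(denoising_cost_x)
for name, weight in zip(layers, denoising_cost_x):
    print('layer %-6s -> denoising cost weight %g' % (name, weight))
```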
34 | 35 | 36 | ##### MNIST all labels 37 | ``` 38 | # Full 39 | run.py train --encoder-layers 1000-500-250-250-250-10 --decoder-spec gauss --denoising-cost-x 1000,1,0.01,0.01,0.01,0.01,0.01 --labeled-samples 60000 --unlabeled-samples 60000 --seed 1 -- mnist_all_full 40 | # Bottom 41 | run.py train --encoder-layers 1000-500-250-250-250-10 --decoder-spec gauss --denoising-cost-x 2000,0,0,0,0,0,0 --labeled-samples 60000 --unlabeled-samples 60000 --seed 1 -- mnist_all_bottom 42 | # Gamma model 43 | run.py train --encoder-layers 1000-500-250-250-250-10 --decoder-spec 0-0-0-0-0-0-gauss --denoising-cost-x 0,0,0,0,0,0,2 --labeled-samples 60000 --unlabeled-samples 60000 --seed 1 -- mnist_all_gamma 44 | # Supervised baseline 45 | run.py train --encoder-layers 1000-500-250-250-250-10 --decoder-spec 0-0-0-0-0-0-0 --denoising-cost-x 0,0,0,0,0,0,0 --labeled-samples 60000 --unlabeled-samples 60000 --f-local-noise-std 0.5 --seed 1 -- mnist_all_baseline 46 | ``` 47 | 48 | ##### MNIST 100 labels 49 | ``` 50 | # Full 51 | run.py train --encoder-layers 1000-500-250-250-250-10 --decoder-spec gauss --denoising-cost-x 1000,10,0.1,0.1,0.1,0.1,0.1 --labeled-samples 100 --unlabeled-samples 60000 --seed 1 -- mnist_100_full 52 | # Bottom-only 53 | run.py train --encoder-layers 1000-500-250-250-250-10 --decoder-spec gauss --denoising-cost-x 5000,0,0,0,0,0,0 --labeled-samples 100 --unlabeled-samples 60000 --seed 1 -- mnist_100_bottom 54 | # Gamma 55 | run.py train --encoder-layers 1000-500-250-250-250-10 --decoder-spec 0-0-0-0-0-0-gauss --denoising-cost-x 0,0,0,0,0,0,0.5 --labeled-samples 100 --unlabeled-samples 60000 --seed 1 -- mnist_100_gamma 56 | # Supervised baseline 57 | run.py train --encoder-layers 1000-500-250-250-250-10 --decoder-spec 0-0-0-0-0-0-0 --denoising-cost-x 0,0,0,0,0,0,0 --labeled-samples 100 --unlabeled-samples 60000 --f-local-noise-std 0.5 --seed 1 -- mnist_100_baseline 58 | ``` 59 | 60 | ##### MNIST 1000 labels 61 | ``` 62 | # Full 63 | run.py train --encoder-layers 1000-500-250-250-250-10 --decoder-spec gauss --denoising-cost-x 2000,20,0.1,0.1,0.1,0.1,0.1 --f-local-noise-std 0.2 --labeled-samples 1000 --unlabeled-samples 60000 --seed 1 -- mnist_1000_full 64 | # Bottom-only 65 | run.py train --encoder-layers 1000-500-250-250-250-10 --decoder-spec gauss --denoising-cost-x 2000,0,0,0,0,0,0 --labeled-samples 1000 --unlabeled-samples 60000 --seed 1 -- mnist_1000_bottom 66 | # Gamma model 67 | run.py train --encoder-layers 1000-500-250-250-250-10 --decoder-spec 0-0-0-0-0-0-gauss --denoising-cost-x 0,0,0,0,0,0,10 --labeled-samples 1000 --unlabeled-samples 60000 --seed 1 -- mnist_1000_gamma 68 | # Supervised baseline 69 | run.py train --encoder-layers 1000-500-250-250-250-10 --decoder-spec 0-0-0-0-0-0-0 --denoising-cost-x 0,0,0,0,0,0,0 --labeled-samples 1000 --unlabeled-samples 60000 --f-local-noise-std 0.5 --seed 1 -- mnist_1000_baseline 70 | ``` 71 | 72 | ##### MNIST 50 labels 73 | ``` 74 | # Full model 75 | run.py train --encoder-layers 1000-500-250-250-250-10 --decoder-spec gauss --denoising-cost-x 2000,20,0.1,0.1,0.1,0.1,0.1 --labeled-samples 50 --unlabeled-samples 60000 --seed 1 -- mnist_50_full 76 | ``` 77 | 78 | ##### MNIST convolutional models 79 | ``` 80 | # Conv-FC 81 | run.py train --encoder-layers convv:1000:26:1:1-convv:500:1:1:1-convv:250:1:1:1-convv:250:1:1:1-convv:250:1:1:1-convv:10:1:1:1-globalmeanpool:0 --decoder-spec gauss --denoising-cost-x 1000,10,0.1,0.1,0.1,0.1,0.1,0.1 --labeled-samples 100 --unlabeled-samples 60000 --seed 1 -- mnist_100_conv_fc 82 | # Conv-Small, Gamma 83 | 
run.py train --encoder-layers convf:32:5:1:1-maxpool:2:2-convv:64:3:1:1-convf:64:3:1:1-maxpool:2:2-convv:128:3:1:1-convv:10:1:1:1-globalmeanpool:6:6-fc:10 --decoder-spec 0-0-0-0-0-0-0-0-0-gauss --denoising-cost-x 0,0,0,0,0,0,0,0,0,1 --labeled-samples 100 --unlabeled-samples 60000 --seed 1 -- mnist_100_conv_gamma 84 | # Conv-Small, supervised baseline. Overfits easily, so keep training short. 85 | run.py train --encoder-layers convf:32:5:1:1-maxpool:2:2-convv:64:3:1:1-convf:64:3:1:1-maxpool:2:2-convv:128:3:1:1-convv:10:1:1:1-globalmeanpool:6:6-fc:10 --decoder-spec 0-0-0-0-0-0-0-0-0-0 --denoising-cost-x 0,0,0,0,0,0,0,0,0,0 --num-epochs 20 --lrate-decay 0.5 --f-local-noise-std 0.45 --labeled-samples 100 --unlabeled-samples 60000 --seed 1 -- mnist_100_conv_baseline 86 | ``` 87 | 88 | ##### CIFAR models 89 | ``` 90 | # Conv-Large, Gamma 91 | ./run.py train --encoder-layers convv:96:3:1:1-convf:96:3:1:1-convf:96:3:1:1-maxpool:2:2-convv:192:3:1:1-convf:192:3:1:1-convv:192:3:1:1-maxpool:2:2-convv:192:3:1:1-convv:192:1:1:1-convv:10:1:1:1-globalmeanpool:0 --decoder-spec 0-0-0-0-0-0-0-0-0-0-0-0-gauss --dataset cifar10 --act leakyrelu --denoising-cost-x 0,0,0,0,0,0,0,0,0,0,0,0,4.0 --num-epochs 70 --lrate-decay 0.86 --seed 1 --whiten-zca 3072 --contrast-norm 55 --top-c False --labeled-samples 4000 --unlabeled-samples 50000 -- cifar_4k_gamma 92 | # Conv-Large, supervised baseline. Overfits easily, so keep training short. 93 | ./run.py train --encoder-layers convv:96:3:1:1-convf:96:3:1:1-convf:96:3:1:1-maxpool:2:2-convv:192:3:1:1-convf:192:3:1:1-convv:192:3:1:1-maxpool:2:2-convv:192:3:1:1-convv:192:1:1:1-convv:10:1:1:1-globalmeanpool:0 --decoder-spec 0-0-0-0-0-0-0-0-0-0-0-0-0 --dataset cifar10 --act leakyrelu --denoising-cost-x 0,0,0,0,0,0,0,0,0,0,0,0,0 --num-epochs 20 --lrate-decay 0.5 --seed 1 --whiten-zca 3072 --contrast-norm 55 --top-c False --labeled-samples 4000 --unlabeled-samples 50000 -- cifar_4k_baseline 94 | ``` 95 | 96 | ##### Evaluating models with testset 97 | After training a model, you can infer the results on a test set by performing the `evaluate` command. 
98 | An example use after training a model: 99 | ``` 100 | ./run.py evaluate results/mnist_all_bottom0 101 | ``` 102 | -------------------------------------------------------------------------------- /nn.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import logging 3 | 4 | import scipy 5 | import numpy as np 6 | from theano import tensor 7 | from theano.tensor.signal.pool import pool_2d, Pool 8 | 9 | from blocks.extensions import SimpleExtension 10 | from blocks.extensions.monitoring import (DataStreamMonitoring, 11 | MonitoringExtension) 12 | from blocks.filter import VariableFilter 13 | from blocks.graph import ComputationGraph 14 | from blocks.monitoring.evaluators import DatasetEvaluator 15 | from blocks.roles import VariableRole 16 | 17 | logger = logging.getLogger('main.nn') 18 | 19 | 20 | class BnParamRole(VariableRole): 21 | pass 22 | 23 | # Batch normalization parameters that have to be replaced when testing 24 | BNPARAM = BnParamRole() 25 | 26 | 27 | class ZCA(object): 28 | def __init__(self, n_components=None, data=None, filter_bias=0.1): 29 | self.filter_bias = np.float32(filter_bias) 30 | self.P = None 31 | self.P_inv = None 32 | self.n_components = 0 33 | self.is_fit = False 34 | if n_components and data: 35 | self.fit(n_components, data) 36 | 37 | def fit(self, n_components, data): 38 | if len(data.shape) == 2: 39 | self.reshape = None 40 | else: 41 | assert n_components == np.product(data.shape[1:]), \ 42 | 'ZCA whitening components should be %d for convolutional data'\ 43 | % np.product(data.shape[1:]) 44 | self.reshape = data.shape[1:] 45 | 46 | data = self._flatten_data(data) 47 | assert len(data.shape) == 2 48 | n, m = data.shape 49 | self.mean = np.mean(data, axis=0) 50 | 51 | bias = self.filter_bias * scipy.sparse.identity(m, 'float32') 52 | cov = np.cov(data, rowvar=0, bias=1) + bias 53 | eigs, eigv = scipy.linalg.eigh(cov) 54 | 55 | assert not np.isnan(eigs).any() 56 | assert not np.isnan(eigv).any() 57 | assert eigs.min() > 0 58 | 59 | if self.n_components: 60 | eigs = eigs[-self.n_components:] 61 | eigv = eigv[:, -self.n_components:] 62 | 63 | sqrt_eigs = np.sqrt(eigs) 64 | self.P = np.dot(eigv * (1.0 / sqrt_eigs), eigv.T) 65 | assert not np.isnan(self.P).any() 66 | self.P_inv = np.dot(eigv * sqrt_eigs, eigv.T) 67 | 68 | self.P = np.float32(self.P) 69 | self.P_inv = np.float32(self.P_inv) 70 | 71 | self.is_fit = True 72 | 73 | def apply(self, data, remove_mean=True): 74 | data = self._flatten_data(data) 75 | d = data - self.mean if remove_mean else data 76 | return self._reshape_data(np.dot(d, self.P)) 77 | 78 | def inv(self, data, add_mean=True): 79 | d = np.dot(self._flatten_data(data), self.P_inv) 80 | d += self.mean if add_mean else 0. 
81 | return self._reshape_data(d) 82 | 83 | def _flatten_data(self, data): 84 | if self.reshape is None: 85 | return data 86 | assert data.shape[1:] == self.reshape 87 | return data.reshape(data.shape[0], np.product(data.shape[1:])) 88 | 89 | def _reshape_data(self, data): 90 | assert len(data.shape) == 2 91 | if self.reshape is None: 92 | return data 93 | return np.reshape(data, (data.shape[0],) + self.reshape) 94 | 95 | 96 | class ContrastNorm(object): 97 | def __init__(self, scale=55, epsilon=1e-8): 98 | self.scale = np.float32(scale) 99 | self.epsilon = np.float32(epsilon) 100 | 101 | def apply(self, data, copy=False): 102 | if copy: 103 | data = np.copy(data) 104 | data_shape = data.shape 105 | if len(data.shape) > 2: 106 | data = data.reshape(data.shape[0], np.product(data.shape[1:])) 107 | 108 | assert len(data.shape) == 2, 'Contrast norm on flattened data' 109 | 110 | data -= data.mean(axis=1)[:, np.newaxis] 111 | 112 | norms = np.sqrt(np.sum(data ** 2, axis=1)) / self.scale 113 | norms[norms < self.epsilon] = np.float32(1.) 114 | 115 | data /= norms[:, np.newaxis] 116 | 117 | if data_shape != data.shape: 118 | data = data.reshape(data_shape) 119 | 120 | return data 121 | 122 | 123 | class TestMonitoring(object): 124 | def _get_bn_params(self, output_vars): 125 | # Pick out the nodes with batch normalization vars 126 | cg = ComputationGraph(output_vars) 127 | var_filter = VariableFilter(roles=[BNPARAM]) 128 | bn_ps = var_filter(cg.variables) 129 | 130 | if len(bn_ps) == 0: 131 | logger.warn('No batch normalization parameters found - is' + 132 | ' batch normalization turned off?') 133 | self._bn = False 134 | self._counter = None 135 | self._counter_max = None 136 | bn_share = [] 137 | output_vars_replaced = output_vars 138 | else: 139 | self._bn = True 140 | assert len(set([p.name for p in bn_ps])) == len(bn_ps), \ 141 | 'Some batch norm params have the same name' 142 | logger.info('Batch norm parameters: %s' % ', '.join([p.name for p in bn_ps])) 143 | 144 | # Filter out the shared variables from the model updates 145 | def filter_share(par): 146 | lst = [up for up in cg.updates if up.name == 'shared_%s' % par.name] 147 | assert len(lst) == 1 148 | return lst[0] 149 | bn_share = map(filter_share, bn_ps) 150 | 151 | # Replace the BN coefficients in the test data model - Replace the 152 | # theano variables in the test graph with the shareds 153 | output_vars_replaced = cg.replace(zip(bn_ps, bn_share)).outputs 154 | 155 | # Pick out the counter 156 | self._counter = self._param_from_updates(cg.updates, 'counter') 157 | self._counter_max = self._param_from_updates(cg.updates, 'counter_max') 158 | 159 | return bn_ps, bn_share, output_vars_replaced 160 | 161 | def _param_from_updates(self, updates, p_name): 162 | var_filter = VariableFilter(roles=[BNPARAM]) 163 | bn_ps = var_filter(updates.keys()) 164 | p = [p for p in bn_ps if p.name == p_name] 165 | assert len(p) == 1, 'No %s of more than one %s' % (p_name, p_name) 166 | return p[0] 167 | 168 | def reset_counter(self): 169 | if self._bn: 170 | self._counter.set_value(np.float32(1)) 171 | 172 | def replicate_vars(self, output_vars): 173 | # Problem in Blocks with multiple monitors monitoring the 174 | # same value in a graph. 
Therefore, they are all "replicated" to a new 175 | # Theano variable 176 | if isinstance(output_vars, (list, tuple)): 177 | return map(self.replicate_vars, output_vars) 178 | assert not hasattr(output_vars.tag, 'aggregation_scheme'), \ 179 | 'The variable %s already has an aggregator ' % output_vars.name + \ 180 | 'assigned to it - are you using a datasetmonitor with the same' + \ 181 | ' variable as output? This might cause trouble in Blocks' 182 | new_var = 1 * output_vars 183 | new_var.name = output_vars.name 184 | return new_var 185 | 186 | 187 | class ApproxTestMonitoring(DataStreamMonitoring, TestMonitoring): 188 | def __init__(self, output_vars, *args, **kwargs): 189 | output_vars = self.replicate_vars(output_vars) 190 | _, _, replaced_vars = self._get_bn_params(output_vars) 191 | super(ApproxTestMonitoring, self).__init__(replaced_vars, *args, 192 | **kwargs) 193 | 194 | def do(self, which_callback, *args, **kwargs): 195 | assert not which_callback == "after_batch", "Do not monitor each mb" 196 | self.reset_counter() 197 | super(ApproxTestMonitoring, self).do(which_callback, *args, **kwargs) 198 | 199 | 200 | class FinalTestMonitoring(SimpleExtension, MonitoringExtension, TestMonitoring): 201 | """Monitors validation and test set data with batch norm 202 | 203 | Calculates the training set statistics for batch normalization and adds 204 | them to the model before calculating the validation and test set values. 205 | This is done in two steps: First the training set is iterated and the 206 | statistics are saved in shared variables, then the model iterates through 207 | the test/validation set using the saved shared variables. 208 | When the training set is iterated, it is done for the full set, layer by 209 | layer so that the statistics are correct. 
This is expensive for very deep 210 | models, in which case some approximation could be in order 211 | """ 212 | def __init__(self, output_vars, train_data_stream, test_data_stream, 213 | **kwargs): 214 | output_vars = self.replicate_vars(output_vars) 215 | super(FinalTestMonitoring, self).__init__(**kwargs) 216 | self.trn_stream = train_data_stream 217 | self.tst_stream = test_data_stream 218 | 219 | bn_ps, bn_share, output_vars_replaced = self._get_bn_params(output_vars) 220 | 221 | if self._bn: 222 | updates = self._get_updates(bn_ps, bn_share) 223 | trn_evaluator = DatasetEvaluator(bn_ps, updates=updates) 224 | else: 225 | trn_evaluator = None 226 | 227 | self._trn_evaluator = trn_evaluator 228 | self._tst_evaluator = DatasetEvaluator(output_vars_replaced) 229 | 230 | def _get_updates(self, bn_ps, bn_share): 231 | cg = ComputationGraph(bn_ps) 232 | # Only store updates that relate to params or the counter 233 | updates = OrderedDict([(up, cg.updates[up]) for up in 234 | cg.updates if up.name == 'counter' or 235 | up in bn_share]) 236 | assert self._counter == self._param_from_updates(cg.updates, 'counter') 237 | assert self._counter_max == self._param_from_updates(cg.updates, 238 | 'counter_max') 239 | assert len(updates) == len(bn_ps) + 1, \ 240 | 'Counter or var missing from update' 241 | return updates 242 | 243 | def do(self, which_callback, *args): 244 | """Write the values of monitored variables to the log.""" 245 | assert not which_callback == "after_batch", "Do not monitor each mb" 246 | # Run on train data and get the statistics 247 | if self._bn: 248 | self._counter_max.set_value(np.float32(np.inf)) 249 | self.reset_counter() 250 | self._trn_evaluator.evaluate(self.trn_stream) 251 | self.reset_counter() 252 | 253 | value_dict = self._tst_evaluator.evaluate(self.tst_stream) 254 | self.add_records(self.main_loop.log, value_dict.items()) 255 | 256 | 257 | class LRDecay(SimpleExtension): 258 | def __init__(self, lr, decay_first, decay_last, **kwargs): 259 | super(LRDecay, self).__init__(**kwargs) 260 | self.iter = 0 261 | self.decay_first = decay_first 262 | self.decay_last = decay_last 263 | self.lr = lr 264 | self.lr_init = np.float32(lr) 265 | 266 | def do(self, which_callback, *args): 267 | self.iter += 1 268 | if self.iter > self.decay_first: 269 | ratio = 1.0 * (self.decay_last - self.iter) 270 | ratio = np.maximum(0, ratio / (self.decay_last - self.decay_first)) 271 | self.lr = np.float32(ratio * self.lr_init) 272 | logger.info("Iter %d, lr %f" % (self.iter, self.lr)) 273 | 274 | 275 | def global_meanpool_2d(x, num_filters): 276 | mean = tensor.mean(x.flatten(3), axis=2) 277 | mean = mean.dimshuffle(0, 1, 'x', 'x') 278 | return mean, (num_filters, 1, 1) 279 | 280 | 281 | def pool_2d(x, mode="average", ws=(2, 2), stride=(2, 2)): 282 | import theano.sandbox.cuda as cuda 283 | assert cuda.dnn.dnn_available() 284 | return cuda.dnn.dnn_pool(x, ws=ws, stride=stride, mode=mode) 285 | 286 | 287 | def maxpool_2d(z, in_dim, poolsize, poolstride): 288 | z = pool_2d(z, ds=poolsize, st=poolstride) 289 | output_size = tuple(Pool.out_shape(in_dim, poolsize, st=poolstride)) 290 | return z, output_size 291 | 292 | def softmax_n(x, axis=-1): 293 | e_x = tensor.exp(x - x.max(axis=axis, keepdims=True)) 294 | out = e_x / e_x.sum(axis=axis, keepdims=True) 295 | return out 296 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import 
functools 4 | import logging 5 | import os 6 | import subprocess 7 | from argparse import ArgumentParser, Action 8 | from collections import OrderedDict 9 | import sys 10 | from pandas import DataFrame 11 | 12 | import numpy 13 | import time 14 | import theano 15 | from theano.tensor.type import TensorType 16 | 17 | from blocks.algorithms import GradientDescent, Adam 18 | from blocks.extensions import FinishAfter 19 | from blocks.extensions.monitoring import TrainingDataMonitoring 20 | from blocks.filter import VariableFilter 21 | from blocks.graph import ComputationGraph 22 | from blocks.main_loop import MainLoop 23 | from blocks.model import Model 24 | from blocks.roles import PARAMETER 25 | from fuel.datasets import MNIST, CIFAR10 26 | from fuel.schemes import ShuffledScheme, SequentialScheme 27 | from fuel.streams import DataStream 28 | from fuel.transformers import Transformer 29 | 30 | from picklable_itertools import cycle, imap 31 | from itertools import izip, product, tee 32 | 33 | logger = logging.getLogger('main') 34 | 35 | from utils import ShortPrinting, prepare_dir, load_df, DummyLoop 36 | from utils import SaveExpParams, SaveLog, SaveParams, AttributeDict 37 | from nn import ZCA, ContrastNorm 38 | from nn import ApproxTestMonitoring, FinalTestMonitoring, TestMonitoring 39 | from nn import LRDecay 40 | from ladder import LadderAE 41 | 42 | 43 | class Whitening(Transformer): 44 | """ Makes a copy of the examples in the underlying dataset and whitens it 45 | if necessary. 46 | """ 47 | def __init__(self, data_stream, iteration_scheme, whiten, cnorm=None, 48 | **kwargs): 49 | super(Whitening, self).__init__(data_stream, 50 | iteration_scheme=iteration_scheme, 51 | **kwargs) 52 | data = data_stream.get_data(slice(data_stream.dataset.num_examples)) 53 | self.data = [] 54 | for s, d in zip(self.sources, data): 55 | if 'features' == s: 56 | # Fuel provides Cifar in uint8, convert to float32 57 | d = numpy.require(d, dtype=numpy.float32) 58 | if cnorm is not None: 59 | d = cnorm.apply(d) 60 | if whiten is not None: 61 | d = whiten.apply(d) 62 | self.data += [d] 63 | elif 'targets' == s: 64 | d = unify_labels(d) 65 | self.data += [d] 66 | else: 67 | raise Exception("Unsupported Fuel target: %s" % s) 68 | 69 | def get_data(self, request=None): 70 | return (s[request] for s in self.data) 71 | 72 | 73 | class SemiDataStream(Transformer): 74 | """ Combines two datastreams into one such that 'target' source (labels) 75 | is used only from the first one. The second one is renamed 76 | to avoid collision. Upon iteration, the first one is repeated until 77 | the second one depletes. 78 | """ 79 | def __init__(self, data_stream_labeled, data_stream_unlabeled, **kwargs): 80 | super(Transformer, self).__init__(**kwargs) 81 | self.ds_labeled = data_stream_labeled 82 | self.ds_unlabeled = data_stream_unlabeled 83 | # Rename the sources for clarity 84 | self.ds_labeled.sources = ('features_labeled', 'targets_labeled') 85 | # Rename the source for input pixels and hide its labels! 
86 | self.ds_unlabeled.sources = ('features_unlabeled',) 87 | 88 | @property 89 | def sources(self): 90 | if hasattr(self, '_sources'): 91 | return self._sources 92 | return self.ds_labeled.sources + self.ds_unlabeled.sources 93 | 94 | @sources.setter 95 | def sources(self, value): 96 | self._sources = value 97 | 98 | def close(self): 99 | self.ds_labeled.close() 100 | self.ds_unlabeled.close() 101 | 102 | def reset(self): 103 | self.ds_labeled.reset() 104 | self.ds_unlabeled.reset() 105 | 106 | def next_epoch(self): 107 | self.ds_labeled.next_epoch() 108 | self.ds_unlabeled.next_epoch() 109 | 110 | def get_epoch_iterator(self, **kwargs): 111 | unlabeled = self.ds_unlabeled.get_epoch_iterator(**kwargs) 112 | labeled = self.ds_labeled.get_epoch_iterator(**kwargs) 113 | assert type(labeled) == type(unlabeled) 114 | 115 | return imap(self.mergedicts, cycle(labeled), unlabeled) 116 | 117 | def mergedicts(self, x, y): 118 | return dict(list(x.items()) + list(y.items())) 119 | 120 | 121 | def unify_labels(y): 122 | """ Work-around for Fuel bug where MNIST and Cifar-10 123 | datasets have different dimensionalities for the targets: 124 | e.g. (50000, 1) vs (60000,) """ 125 | yshape = y.shape 126 | y = y.flatten() 127 | assert y.shape[0] == yshape[0] 128 | return y 129 | 130 | 131 | def make_datastream(dataset, indices, batch_size, 132 | n_labeled=None, n_unlabeled=None, 133 | balanced_classes=True, whiten=None, cnorm=None, 134 | scheme=ShuffledScheme): 135 | if n_labeled is None or n_labeled == 0: 136 | n_labeled = len(indices) 137 | if batch_size is None: 138 | batch_size = len(indices) 139 | if n_unlabeled is None: 140 | n_unlabeled = len(indices) 141 | assert n_labeled <= n_unlabeled, 'need less labeled than unlabeled' 142 | 143 | if balanced_classes and n_labeled < n_unlabeled: 144 | # Ensure each label is equally represented 145 | logger.info('Balancing %d labels...' 
% n_labeled) 146 | all_data = dataset.data_sources[dataset.sources.index('targets')] 147 | y = unify_labels(all_data)[indices] 148 | n_classes = y.max() + 1 149 | assert n_labeled % n_classes == 0 150 | n_from_each_class = n_labeled / n_classes 151 | 152 | i_labeled = [] 153 | for c in range(n_classes): 154 | i = (indices[y == c])[:n_from_each_class] 155 | i_labeled += list(i) 156 | else: 157 | i_labeled = indices[:n_labeled] 158 | 159 | # Get unlabeled indices 160 | i_unlabeled = indices[:n_unlabeled] 161 | 162 | ds = SemiDataStream( 163 | data_stream_labeled=Whitening( 164 | DataStream(dataset), 165 | iteration_scheme=scheme(i_labeled, batch_size), 166 | whiten=whiten, cnorm=cnorm), 167 | data_stream_unlabeled=Whitening( 168 | DataStream(dataset), 169 | iteration_scheme=scheme(i_unlabeled, batch_size), 170 | whiten=whiten, cnorm=cnorm) 171 | ) 172 | return ds 173 | 174 | 175 | def setup_model(p): 176 | ladder = LadderAE(p) 177 | # Setup inputs 178 | input_type = TensorType('float32', [False] * (len(p.encoder_layers[0]) + 1)) 179 | x_only = input_type('features_unlabeled') 180 | x = input_type('features_labeled') 181 | y = theano.tensor.lvector('targets_labeled') 182 | ladder.apply(x, y, x_only) 183 | 184 | # Load parameters if requested 185 | if p.get('load_from'): 186 | with open(p.load_from + '/trained_params.npz') as f: 187 | loaded = numpy.load(f) 188 | cg = ComputationGraph([ladder.costs.total]) 189 | current_params = VariableFilter(roles=[PARAMETER])(cg.variables) 190 | logger.info('Loading parameters: %s' % ', '.join(loaded.keys())) 191 | for param in current_params: 192 | assert param.get_value().shape == loaded[param.name].shape 193 | param.set_value(loaded[param.name]) 194 | 195 | return ladder 196 | 197 | 198 | def load_and_log_params(cli_params): 199 | cli_params = AttributeDict(cli_params) 200 | if cli_params.get('load_from'): 201 | p = load_df(cli_params.load_from, 'params').to_dict()[0] 202 | p = AttributeDict(p) 203 | for key in cli_params.iterkeys(): 204 | if key not in p: 205 | p[key] = None 206 | new_params = cli_params 207 | loaded = True 208 | else: 209 | p = cli_params 210 | new_params = {} 211 | loaded = False 212 | 213 | # Make dseed seed unless specified explicitly 214 | if p.get('dseed') is None and p.get('seed') is not None: 215 | p['dseed'] = p['seed'] 216 | 217 | logger.info('== COMMAND LINE ==') 218 | logger.info(' '.join(sys.argv)) 219 | 220 | logger.info('== PARAMETERS ==') 221 | for k, v in p.iteritems(): 222 | if new_params.get(k) is not None: 223 | p[k] = new_params[k] 224 | replace_str = "<- " + str(new_params.get(k)) 225 | else: 226 | replace_str = "" 227 | logger.info(" {:20}: {:<20} {}".format(k, v, replace_str)) 228 | return p, loaded 229 | 230 | 231 | def setup_data(p, test_set=False): 232 | dataset_class, training_set_size = { 233 | 'cifar10': (CIFAR10, 40000), 234 | 'mnist': (MNIST, 50000), 235 | }[p.dataset] 236 | 237 | # Allow overriding the default from command line 238 | if p.get('unlabeled_samples') is not None: 239 | training_set_size = p.unlabeled_samples 240 | 241 | train_set = dataset_class(["train"]) 242 | 243 | # Make sure the MNIST data is in right format 244 | if p.dataset == 'mnist': 245 | d = train_set.data_sources[train_set.sources.index('features')] 246 | assert numpy.all(d <= 1.0) and numpy.all(d >= 0.0), \ 247 | 'Make sure data is in float format and in range 0 to 1' 248 | 249 | # Take all indices and permutate them 250 | all_ind = numpy.arange(train_set.num_examples) 251 | if p.get('dseed'): 252 | rng = 
numpy.random.RandomState(seed=p.dseed) 253 | rng.shuffle(all_ind) 254 | 255 | d = AttributeDict() 256 | 257 | # Choose the training set 258 | d.train = train_set 259 | d.train_ind = all_ind[:training_set_size] 260 | 261 | # Then choose validation set from the remaining indices 262 | d.valid = train_set 263 | d.valid_ind = numpy.setdiff1d(all_ind, d.train_ind)[:p.valid_set_size] 264 | logger.info('Using %d examples for validation' % len(d.valid_ind)) 265 | 266 | # Only touch test data if requested 267 | if test_set: 268 | d.test = dataset_class(["test"]) 269 | d.test_ind = numpy.arange(d.test.num_examples) 270 | 271 | # Setup optional whitening, only used for Cifar-10 272 | in_dim = train_set.data_sources[train_set.sources.index('features')].shape[1:] 273 | if len(in_dim) > 1 and p.whiten_zca > 0: 274 | assert numpy.product(in_dim) == p.whiten_zca, \ 275 | 'Need %d whitening dimensions, not %d' % (numpy.product(in_dim), 276 | p.whiten_zca) 277 | cnorm = ContrastNorm(p.contrast_norm) if p.contrast_norm != 0 else None 278 | 279 | def get_data(d, i): 280 | data = d.get_data(request=list(i))[d.sources.index('features')] 281 | # Fuel provides Cifar in uint8, convert to float32 282 | data = numpy.require(data, dtype=numpy.float32) 283 | return data if cnorm is None else cnorm.apply(data) 284 | 285 | if p.whiten_zca > 0: 286 | logger.info('Whitening using %d ZCA components' % p.whiten_zca) 287 | whiten = ZCA() 288 | whiten.fit(p.whiten_zca, get_data(d.train, d.train_ind)) 289 | else: 290 | whiten = None 291 | 292 | return in_dim, d, whiten, cnorm 293 | 294 | 295 | def get_error(args): 296 | """ Calculate the classification error """ 297 | args['data_type'] = args.get('data_type', 'test') 298 | args['no_load'] = 'g_' 299 | 300 | targets, acts = analyze(args) 301 | guess = numpy.argmax(acts, axis=1) 302 | correct = numpy.sum(numpy.equal(guess, targets.flatten())) 303 | 304 | return (1. - correct / float(len(guess))) * 100. 
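# analyze() below runs a trained model over the split selected by 'data_type' and
# returns (targets, softmax outputs). For the 'valid' and 'test' splits it first
# estimates the batch normalization statistics from the training set (via
# FinalTestMonitoring inside a DummyLoop) and substitutes them into the test graph.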
305 | 306 | 307 | def analyze(cli_params): 308 | p, _ = load_and_log_params(cli_params) 309 | _, data, whiten, cnorm = setup_data(p, test_set=True) 310 | ladder = setup_model(p) 311 | 312 | # Analyze activations 313 | dset, indices, calc_batchnorm = { 314 | 'train': (data.train, data.train_ind, False), 315 | 'valid': (data.valid, data.valid_ind, True), 316 | 'test': (data.test, data.test_ind, True), 317 | }[p.data_type] 318 | 319 | if calc_batchnorm: 320 | logger.info('Calculating batch normalization for clean.labeled path') 321 | main_loop = DummyLoop( 322 | extensions=[ 323 | FinalTestMonitoring( 324 | [ladder.costs.class_clean, ladder.error.clean] 325 | + ladder.costs.denois.values(), 326 | make_datastream(data.train, data.train_ind, 327 | # These need to match with the training 328 | p.batch_size, 329 | n_labeled=p.labeled_samples, 330 | n_unlabeled=len(data.train_ind), 331 | cnorm=cnorm, 332 | whiten=whiten, scheme=ShuffledScheme), 333 | make_datastream(data.valid, data.valid_ind, 334 | p.valid_batch_size, 335 | n_labeled=len(data.valid_ind), 336 | n_unlabeled=len(data.valid_ind), 337 | cnorm=cnorm, 338 | whiten=whiten, scheme=ShuffledScheme), 339 | prefix="valid_final", before_training=True), 340 | ShortPrinting({ 341 | "valid_final": OrderedDict([ 342 | ('VF_C_class', ladder.costs.class_clean), 343 | ('VF_E', ladder.error.clean), 344 | ('VF_C_de', [ladder.costs.denois.get(0), 345 | ladder.costs.denois.get(1), 346 | ladder.costs.denois.get(2), 347 | ladder.costs.denois.get(3)]), 348 | ]), 349 | }, after_training=True, use_log=False), 350 | ]) 351 | main_loop.run() 352 | 353 | # Make a datastream that has all the indices in the labeled pathway 354 | ds = make_datastream(dset, indices, 355 | batch_size=p.get('batch_size'), 356 | n_labeled=len(indices), 357 | n_unlabeled=len(indices), 358 | balanced_classes=False, 359 | whiten=whiten, 360 | cnorm=cnorm, 361 | scheme=SequentialScheme) 362 | 363 | # We want out the values after softmax 364 | outputs = ladder.act.clean.labeled.h[len(ladder.layers) - 1] 365 | 366 | # Replace the batch normalization paramameters with the shared variables 367 | if calc_batchnorm: 368 | outputreplacer = TestMonitoring() 369 | _, _, outputs = outputreplacer._get_bn_params(outputs) 370 | 371 | cg = ComputationGraph(outputs) 372 | f = cg.get_theano_function() 373 | 374 | it = ds.get_epoch_iterator(as_dict=True) 375 | res = [] 376 | inputs = {'features_labeled': [], 377 | 'targets_labeled': [], 378 | 'features_unlabeled': []} 379 | # Loop over one epoch 380 | for d in it: 381 | # Store all inputs 382 | for k, v in d.iteritems(): 383 | inputs[k] += [v] 384 | # Store outputs 385 | res += [f(*[d[str(inp)] for inp in cg.inputs])] 386 | 387 | # Concatenate all minibatches 388 | res = [numpy.vstack(minibatches) for minibatches in zip(*res)] 389 | inputs = {k: numpy.vstack(v) for k, v in inputs.iteritems()} 390 | 391 | return inputs['targets_labeled'], res[0] 392 | 393 | 394 | def train(cli_params): 395 | cli_params['save_dir'] = prepare_dir(cli_params['save_to']) 396 | logfile = os.path.join(cli_params['save_dir'], 'log.txt') 397 | 398 | # Log also DEBUG to a file 399 | fh = logging.FileHandler(filename=logfile) 400 | fh.setLevel(logging.DEBUG) 401 | logger.addHandler(fh) 402 | 403 | logger.info('Logging into %s' % logfile) 404 | 405 | p, loaded = load_and_log_params(cli_params) 406 | in_dim, data, whiten, cnorm = setup_data(p, test_set=False) 407 | if not loaded: 408 | # Set the zero layer to match input dimensions 409 | p.encoder_layers = (in_dim,) + p.encoder_layers 
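        # encoder_layers now starts with the input dimension, so it has one more
        # entry than the --encoder-layers flag; LadderAE asserts that
        # --denoising-cost-x supplies a weight for every one of these layers.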
410 | 411 | ladder = setup_model(p) 412 | 413 | # Training 414 | all_params = ComputationGraph([ladder.costs.total]).parameters 415 | logger.info('Found the following parameters: %s' % str(all_params)) 416 | 417 | # Fetch all batch normalization updates. They are in the clean path. 418 | bn_updates = ComputationGraph([ladder.costs.class_clean]).updates 419 | assert 'counter' in [u.name for u in bn_updates.keys()], \ 420 | 'No batch norm params in graph - the graph has been cut?' 421 | 422 | training_algorithm = GradientDescent( 423 | cost=ladder.costs.total, parameters=all_params, 424 | step_rule=Adam(learning_rate=ladder.lr)) 425 | # In addition to actual training, also do BN variable approximations 426 | training_algorithm.add_updates(bn_updates) 427 | 428 | short_prints = { 429 | "train": { 430 | 'T_C_class': ladder.costs.class_corr, 431 | 'T_C_de': ladder.costs.denois.values(), 432 | }, 433 | "valid_approx": OrderedDict([ 434 | ('V_C_class', ladder.costs.class_clean), 435 | ('V_E', ladder.error.clean), 436 | ('V_C_de', ladder.costs.denois.values()), 437 | ]), 438 | "valid_final": OrderedDict([ 439 | ('VF_C_class', ladder.costs.class_clean), 440 | ('VF_E', ladder.error.clean), 441 | ('VF_C_de', ladder.costs.denois.values()), 442 | ]), 443 | } 444 | 445 | main_loop = MainLoop( 446 | training_algorithm, 447 | # Datastream used for training 448 | make_datastream(data.train, data.train_ind, 449 | p.batch_size, 450 | n_labeled=p.labeled_samples, 451 | n_unlabeled=p.unlabeled_samples, 452 | whiten=whiten, 453 | cnorm=cnorm), 454 | model=Model(ladder.costs.total), 455 | extensions=[ 456 | FinishAfter(after_n_epochs=p.num_epochs), 457 | 458 | # This will estimate the validation error using 459 | # running average estimates of the batch normalization 460 | # parameters, mean and variance 461 | ApproxTestMonitoring( 462 | [ladder.costs.class_clean, ladder.error.clean] 463 | + ladder.costs.denois.values(), 464 | make_datastream(data.valid, data.valid_ind, 465 | p.valid_batch_size, whiten=whiten, cnorm=cnorm, 466 | scheme=ShuffledScheme), 467 | prefix="valid_approx"), 468 | 469 | # This Monitor is slower, but more accurate since it will first 470 | # estimate batch normalization parameters from training data and 471 | # then do another pass to calculate the validation error. 
472 | FinalTestMonitoring( 473 | [ladder.costs.class_clean, ladder.error.clean] 474 | + ladder.costs.denois.values(), 475 | make_datastream(data.train, data.train_ind, 476 | p.batch_size, 477 | n_labeled=p.labeled_samples, 478 | whiten=whiten, cnorm=cnorm, 479 | scheme=ShuffledScheme), 480 | make_datastream(data.valid, data.valid_ind, 481 | p.valid_batch_size, 482 | n_labeled=len(data.valid_ind), 483 | whiten=whiten, cnorm=cnorm, 484 | scheme=ShuffledScheme), 485 | prefix="valid_final", 486 | after_n_epochs=p.num_epochs), 487 | 488 | TrainingDataMonitoring( 489 | [ladder.costs.total, ladder.costs.class_corr, 490 | training_algorithm.total_gradient_norm] 491 | + ladder.costs.denois.values(), 492 | prefix="train", after_epoch=True), 493 | 494 | SaveParams(None, all_params, p.save_dir, after_epoch=True), 495 | SaveExpParams(p, p.save_dir, before_training=True), 496 | SaveLog(p.save_dir, after_training=True), 497 | ShortPrinting(short_prints), 498 | LRDecay(ladder.lr, p.num_epochs * p.lrate_decay, p.num_epochs, 499 | after_epoch=True), 500 | ]) 501 | main_loop.run() 502 | 503 | # Get results 504 | df = DataFrame.from_dict(main_loop.log, orient='index') 505 | col = 'valid_final_error_rate_clean' 506 | logger.info('%s %g' % (col, df[col].iloc[-1])) 507 | 508 | if main_loop.log.status['epoch_interrupt_received']: 509 | return None 510 | return df 511 | 512 | if __name__ == "__main__": 513 | logging.basicConfig(level=logging.INFO) 514 | 515 | rep = lambda s: s.replace('-', ',') 516 | chop = lambda s: s.split(',') 517 | to_int = lambda ss: [int(s) for s in ss if s.isdigit()] 518 | to_float = lambda ss: [float(s) for s in ss] 519 | 520 | def to_bool(s): 521 | if s.lower() in ['true', 't']: 522 | return True 523 | elif s.lower() in ['false', 'f']: 524 | return False 525 | else: 526 | raise Exception("Unknown bool value %s" % s) 527 | 528 | def compose(*funs): 529 | return functools.reduce(lambda f, g: lambda x: f(g(x)), funs) 530 | 531 | # Functional parsing logic to allow flexible function compositions 532 | # as actions for ArgumentParser 533 | def funcs(additional_arg): 534 | class customAction(Action): 535 | def __call__(self, parser, args, values, option_string=None): 536 | 537 | def process(arg, func_list): 538 | if arg is None: 539 | return None 540 | elif type(arg) is list: 541 | return map(compose(*func_list), arg) 542 | else: 543 | return compose(*func_list)(arg) 544 | 545 | setattr(args, self.dest, process(values, additional_arg)) 546 | return customAction 547 | 548 | def add_train_params(parser, use_defaults): 549 | a = parser.add_argument 550 | default = lambda x: x if use_defaults else None 551 | 552 | # General hyper parameters and settings 553 | a("save_to", help="Destination to save the state and results", 554 | default=default("noname"), nargs="?") 555 | a("--num-epochs", help="Number of training epochs", 556 | type=int, default=default(150)) 557 | a("--seed", help="Seed", 558 | type=int, default=default([1]), nargs='+') 559 | a("--dseed", help="Data permutation seed, defaults to 'seed'", 560 | type=int, default=default([None]), nargs='+') 561 | a("--labeled-samples", help="How many supervised samples are used", 562 | type=int, default=default(None), nargs='+') 563 | a("--unlabeled-samples", help="How many unsupervised samples are used", 564 | type=int, default=default(None), nargs='+') 565 | a("--dataset", type=str, default=default(['mnist']), nargs='+', 566 | choices=['mnist', 'cifar10'], help="Which dataset to use") 567 | a("--lr", help="Initial learning rate", 568 | type=float, 
default=default([0.002]), nargs='+') 569 | a("--lrate-decay", help="When to linearly start decaying lrate (0-1)", 570 | type=float, default=default([0.67]), nargs='+') 571 | a("--batch-size", help="Minibatch size", 572 | type=int, default=default([100]), nargs='+') 573 | a("--valid-batch-size", help="Minibatch size for validation data", 574 | type=int, default=default([100]), nargs='+') 575 | a("--valid-set-size", help="Number of examples in validation set", 576 | type=int, default=default([10000]), nargs='+') 577 | 578 | # Hyperparameters controlling supervised path 579 | a("--super-noise-std", help="Noise added to supervised learning path", 580 | type=float, default=default([0.3]), nargs='+') 581 | a("--f-local-noise-std", help="Noise added encoder path", 582 | type=str, default=default([0.3]), nargs='+', 583 | action=funcs([tuple, to_float, chop])) 584 | a("--act", nargs='+', type=str, action=funcs([tuple, chop, rep]), 585 | default=default(["relu"]), help="List of activation functions") 586 | a("--encoder-layers", help="List of layers for f", 587 | type=str, default=default(()), action=funcs([tuple, chop, rep])) 588 | 589 | # Hyperparameters controlling unsupervised training 590 | a("--denoising-cost-x", help="Weight of the denoising cost.", 591 | type=str, default=default([(0.,)]), nargs='+', 592 | action=funcs([tuple, to_float, chop])) 593 | a("--decoder-spec", help="List of decoding function types", nargs='+', 594 | type=str, default=default(['sig']), action=funcs([tuple, chop, rep])) 595 | a("--zestbn", type=str, default=default(['bugfix']), nargs='+', 596 | choices=['bugfix', 'no'], help="How to do zest bn") 597 | 598 | # Hyperparameters used for Cifar training 599 | a("--contrast-norm", help="Scale of contrast normalization (0=off)", 600 | type=int, default=default([0]), nargs='+') 601 | a("--top-c", help="Have c at softmax?", action=funcs([to_bool]), 602 | default=default([True]), nargs='+') 603 | a("--whiten-zca", help="Whether to whiten the data with ZCA", 604 | type=int, default=default([0]), nargs='+') 605 | 606 | ap = ArgumentParser("Semisupervised experiment") 607 | subparsers = ap.add_subparsers(dest='cmd', help='sub-command help') 608 | 609 | # TRAIN 610 | train_cmd = subparsers.add_parser('train', help='Train a new model') 611 | add_train_params(train_cmd, use_defaults=True) 612 | 613 | # EVALUATE 614 | load_cmd = subparsers.add_parser('evaluate', help='Evaluate test error') 615 | load_cmd.add_argument('load_from', type=str, 616 | help="Destination to load the state from") 617 | load_cmd.add_argument('--data-type', type=str, default='test', 618 | help="Data set to evaluate on") 619 | 620 | args = ap.parse_args() 621 | 622 | subp = subprocess.Popen(['git', 'rev-parse', 'HEAD'], 623 | stdin=subprocess.PIPE, stdout=subprocess.PIPE, 624 | stderr=subprocess.PIPE) 625 | out, err = subp.communicate() 626 | args.commit = out.strip() 627 | if err.strip(): 628 | logger.error('Subprocess returned %s' % err.strip()) 629 | 630 | t_start = time.time() 631 | if args.cmd == 'evaluate': 632 | for k, v in vars(args).iteritems(): 633 | if type(v) is list: 634 | assert len(v) == 1, "should not be a list when loading: %s" % k 635 | logger.info("%s" % str(v[0])) 636 | vars(args)[k] = v[0] 637 | 638 | err = get_error(vars(args)) 639 | logger.info('Test error: %f' % err) 640 | 641 | elif args.cmd == "train": 642 | listdicts = {k: v for k, v in vars(args).iteritems() if type(v) is list} 643 | therest = {k: v for k, v in vars(args).iteritems() if type(v) is not list} 644 | 645 | gen1, gen2 = 
tee(product(*listdicts.itervalues())) 646 | 647 | l = len(list(gen1)) 648 | for i, d in enumerate(dict(izip(listdicts, x)) for x in gen2): 649 | if l > 1: 650 | logger.info('Training configuration %d / %d' % (i+1, l)) 651 | d.update(therest) 652 | if train(d) is None: 653 | break 654 | logger.info('Took %.1f minutes' % ((time.time() - t_start) / 60.)) 655 | -------------------------------------------------------------------------------- /ladder.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import numpy as np 4 | from collections import OrderedDict 5 | 6 | import theano 7 | import theano.tensor as T 8 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 9 | from theano.tensor.nnet.conv import conv2d, ConvOp 10 | from theano.sandbox.cuda.blas import GpuCorrMM 11 | from theano.sandbox.cuda.basic_ops import gpu_contiguous 12 | 13 | from blocks.bricks.cost import SquaredError 14 | from blocks.bricks.cost import CategoricalCrossEntropy, MisclassificationRate 15 | from blocks.graph import add_annotation, Annotation 16 | from blocks.roles import add_role, PARAMETER, WEIGHT, BIAS 17 | 18 | from utils import shared_param, AttributeDict 19 | from nn import maxpool_2d, global_meanpool_2d, BNPARAM, softmax_n 20 | 21 | logger = logging.getLogger('main.model') 22 | floatX = theano.config.floatX 23 | 24 | 25 | class LadderAE(): 26 | def __init__(self, p): 27 | self.p = p 28 | self.init_weights_transpose = False 29 | self.default_lr = p.lr 30 | self.shareds = OrderedDict() 31 | self.rstream = RandomStreams(seed=p.seed) 32 | self.rng = np.random.RandomState(seed=p.seed) 33 | 34 | n_layers = len(p.encoder_layers) 35 | assert n_layers > 1, "Need to define encoder layers" 36 | assert n_layers == len(p.denoising_cost_x), ( 37 | "Number of denoising costs does not match with %d layers: %s" % 38 | (n_layers, str(p.denoising_cost_x))) 39 | 40 | def one_to_all(x): 41 | """ (5.,) -> 5 -> (5., 5., 5.) 42 | ('relu',) -> 'relu' -> ('relu', 'relu', 'relu') 43 | """ 44 | if type(x) is tuple and len(x) == 1: 45 | x = x[0] 46 | 47 | if type(x) is float: 48 | x = (np.float32(x),) * n_layers 49 | 50 | if type(x) is str: 51 | x = (x,) * n_layers 52 | return x 53 | 54 | p.decoder_spec = one_to_all(p.decoder_spec) 55 | p.f_local_noise_std = one_to_all(p.f_local_noise_std) 56 | acts = one_to_all(p.get('act', 'relu')) 57 | 58 | assert n_layers == len(p.decoder_spec), "f and g need to match" 59 | assert (n_layers == len(acts)), ( 60 | "Not enough activations given. Requires %d. 
Got: %s" % 61 | (n_layers, str(acts))) 62 | acts = acts[:-1] + ('softmax',) 63 | 64 | def parse_layer(spec): 65 | """ 'fc:5' -> ('fc', 5) 66 | '5' -> ('fc', 5) 67 | 5 -> ('fc', 5) 68 | 'convv:3:2:2' -> ('convv', [3,2,2]) 69 | """ 70 | if type(spec) is not str: 71 | return "fc", spec 72 | spec = spec.split(':') 73 | l_type = spec.pop(0) if len(spec) >= 2 else "fc" 74 | spec = map(int, spec) 75 | spec = spec[0] if len(spec) == 1 else spec 76 | return l_type, spec 77 | 78 | enc = map(parse_layer, p.encoder_layers) 79 | self.layers = list(enumerate(zip(enc, p.decoder_spec, acts))) 80 | 81 | def weight(self, init, name, cast_float32=True, for_conv=False): 82 | weight = self.shared(init, name, cast_float32, role=WEIGHT) 83 | if for_conv: 84 | return weight.dimshuffle('x', 0, 'x', 'x') 85 | return weight 86 | 87 | def bias(self, init, name, cast_float32=True, for_conv=False): 88 | b = self.shared(init, name, cast_float32, role=BIAS) 89 | if for_conv: 90 | return b.dimshuffle('x', 0, 'x', 'x') 91 | return b 92 | 93 | def shared(self, init, name, cast_float32=True, role=PARAMETER, **kwargs): 94 | p = self.shareds.get(name) 95 | if p is None: 96 | p = shared_param(init, name, cast_float32, role, **kwargs) 97 | self.shareds[name] = p 98 | return p 99 | 100 | def counter(self): 101 | name = 'counter' 102 | p = self.shareds.get(name) 103 | update = [] 104 | if p is None: 105 | p_max_val = np.float32(10) 106 | p = self.shared(np.float32(1), name, role=BNPARAM) 107 | p_max = self.shared(p_max_val, name + '_max', role=BNPARAM) 108 | update = [(p, T.clip(p + np.float32(1), np.float32(0), p_max)), 109 | (p_max, p_max_val)] 110 | return (p, update) 111 | 112 | def noise_like(self, x): 113 | noise = self.rstream.normal(size=x.shape, avg=0.0, std=1.0) 114 | return T.cast(noise, dtype=floatX) 115 | 116 | def rand_init(self, in_dim, out_dim): 117 | """ Random initialization for fully connected layers """ 118 | W = self.rng.randn(in_dim, out_dim) / np.sqrt(in_dim) 119 | return W 120 | 121 | def rand_init_conv(self, dim): 122 | """ Random initialization for convolution filters """ 123 | fan_in = np.prod(dtype=floatX, a=dim[1:]) 124 | bound = np.sqrt(3. / max(1.0, (fan_in))) 125 | W = np.asarray( 126 | self.rng.uniform(low=-bound, high=bound, size=dim), dtype=floatX) 127 | return W 128 | 129 | def new_activation_dict(self): 130 | return AttributeDict({'z': {}, 'h': {}, 's': {}, 'm': {}}) 131 | 132 | def annotate_update(self, update, tag_to): 133 | a = Annotation() 134 | for (var, up) in update: 135 | a.updates[var] = up 136 | add_annotation(tag_to, a) 137 | 138 | def apply(self, input_labeled, target_labeled, input_unlabeled): 139 | self.layer_counter = 0 140 | input_dim = self.p.encoder_layers[0] 141 | 142 | # Store the dimension tuples in the same order as layers. 
143 | layers = self.layers 144 | self.layer_dims = {0: input_dim} 145 | 146 | self.lr = self.default_lr 147 | 148 | self.costs = costs = AttributeDict() 149 | self.costs.denois = AttributeDict() 150 | 151 | self.act = AttributeDict() 152 | self.error = AttributeDict() 153 | 154 | top = len(layers) - 1 155 | 156 | if input_labeled is None: 157 | N = 0 158 | else: 159 | N = input_labeled.shape[0] 160 | self.join = lambda l, u: T.concatenate([l, u], axis=0) if l else u 161 | self.labeled = lambda x: x[:N] if x is not None else x 162 | self.unlabeled = lambda x: x[N:] if x is not None else x 163 | self.split_lu = lambda x: (self.labeled(x), self.unlabeled(x)) 164 | 165 | input_concat = self.join(input_labeled, input_unlabeled) 166 | 167 | def encoder(input_, path_name, input_noise_std=0, noise_std=[]): 168 | h = input_ 169 | 170 | logger.info(' 0: noise %g' % input_noise_std) 171 | if input_noise_std > 0.: 172 | h = h + self.noise_like(h) * input_noise_std 173 | 174 | d = AttributeDict() 175 | d.unlabeled = self.new_activation_dict() 176 | d.labeled = self.new_activation_dict() 177 | d.labeled.z[0] = self.labeled(h) 178 | d.unlabeled.z[0] = self.unlabeled(h) 179 | prev_dim = input_dim 180 | for i, (spec, _, act_f) in layers[1:]: 181 | d.labeled.h[i - 1], d.unlabeled.h[i - 1] = self.split_lu(h) 182 | noise = noise_std[i] if i < len(noise_std) else 0. 183 | curr_dim, z, m, s, h = self.f(h, prev_dim, spec, i, act_f, 184 | path_name=path_name, 185 | noise_std=noise) 186 | assert self.layer_dims.get(i) in (None, curr_dim) 187 | self.layer_dims[i] = curr_dim 188 | d.labeled.z[i], d.unlabeled.z[i] = self.split_lu(z) 189 | d.unlabeled.s[i] = s 190 | d.unlabeled.m[i] = m 191 | prev_dim = curr_dim 192 | d.labeled.h[i], d.unlabeled.h[i] = self.split_lu(h) 193 | return d 194 | 195 | # Clean, supervised 196 | logger.info('Encoder: clean, labeled') 197 | clean = self.act.clean = encoder(input_concat, 'clean') 198 | 199 | # Corrupted, supervised 200 | logger.info('Encoder: corr, labeled') 201 | corr = self.act.corr = encoder(input_concat, 'corr', 202 | input_noise_std=self.p.super_noise_std, 203 | noise_std=self.p.f_local_noise_std) 204 | est = self.act.est = self.new_activation_dict() 205 | 206 | # Decoder path in opposite order 207 | logger.info('Decoder: z_corr -> z_est') 208 | for i, ((_, spec), l_type, act_f) in layers[::-1]: 209 | z_corr = corr.unlabeled.z[i] 210 | z_clean = clean.unlabeled.z[i] 211 | z_clean_s = clean.unlabeled.s.get(i) 212 | z_clean_m = clean.unlabeled.m.get(i) 213 | fspec = layers[i+1][1][0] if len(layers) > i+1 else (None, None) 214 | 215 | if i == top: 216 | ver = corr.unlabeled.h[i] 217 | ver_dim = self.layer_dims[i] 218 | top_g = True 219 | else: 220 | ver = est.z.get(i + 1) 221 | ver_dim = self.layer_dims.get(i + 1) 222 | top_g = False 223 | 224 | z_est = self.g(z_lat=z_corr, 225 | z_ver=ver, 226 | in_dims=ver_dim, 227 | out_dims=self.layer_dims[i], 228 | l_type=l_type, 229 | num=i, 230 | fspec=fspec, 231 | top_g=top_g) 232 | 233 | if z_est is not None: 234 | # Denoising cost 235 | 236 | if z_clean_s and self.p.zestbn == 'bugfix': 237 | z_est_norm = (z_est - z_clean_m) / T.sqrt(z_clean_s + np.float32(1e-10)) 238 | elif z_clean_s is None or self.p.zestbn == 'no': 239 | z_est_norm = z_est 240 | else: 241 | assert False, 'Not supported path' 242 | 243 | se = SquaredError('denois' + str(i)) 244 | costs.denois[i] = se.apply(z_est_norm.flatten(2), 245 | z_clean.flatten(2)) \ 246 | / np.prod(self.layer_dims[i], dtype=floatX) 247 | costs.denois[i].name = 'denois' + str(i) 248 | 
denois_print = 'denois %.2f' % self.p.denoising_cost_x[i] 249 | else: 250 | denois_print = '' 251 | 252 | # Store references for later use 253 | est.h[i] = self.apply_act(z_est, act_f) 254 | est.z[i] = z_est 255 | est.s[i] = None 256 | est.m[i] = None 257 | logger.info(' g%d: %10s, %s, dim %s -> %s' % ( 258 | i, l_type, 259 | denois_print, 260 | self.layer_dims.get(i+1), 261 | self.layer_dims.get(i) 262 | )) 263 | 264 | # Costs 265 | y = target_labeled.flatten() 266 | 267 | costs.class_clean = CategoricalCrossEntropy().apply(y, clean.labeled.h[top]) 268 | costs.class_clean.name = 'cost_class_clean' 269 | 270 | costs.class_corr = CategoricalCrossEntropy().apply(y, corr.labeled.h[top]) 271 | costs.class_corr.name = 'cost_class_corr' 272 | 273 | # This will be used for training 274 | costs.total = costs.class_corr * 1.0 275 | for i in range(top + 1): 276 | if costs.denois.get(i) and self.p.denoising_cost_x[i] > 0: 277 | costs.total += costs.denois[i] * self.p.denoising_cost_x[i] 278 | costs.total.name = 'cost_total' 279 | 280 | # Classification error 281 | mr = MisclassificationRate() 282 | self.error.clean = mr.apply(y, clean.labeled.h[top]) * np.float32(100.) 283 | self.error.clean.name = 'error_rate_clean' 284 | 285 | def apply_act(self, input, act_name): 286 | if input is None: 287 | return input 288 | act = { 289 | 'relu': lambda x: T.maximum(0, x), 290 | 'leakyrelu': lambda x: T.switch(x > 0., x, 0.1 * x), 291 | 'linear': lambda x: x, 292 | 'softplus': lambda x: T.log(1. + T.exp(x)), 293 | 'sigmoid': lambda x: T.nnet.sigmoid(x), 294 | 'softmax': lambda x: softmax_n(x), 295 | }.get(act_name) 296 | assert act, 'unknown act %s' % act_name 297 | if act_name == 'softmax': 298 | input = input.flatten(2) 299 | return act(input) 300 | 301 | def annotate_bn(self, var, id, var_type, mb_size, size, norm_ax): 302 | var_shape = np.array((1,) + size) 303 | out_dim = np.prod(var_shape) / np.prod(var_shape[list(norm_ax)]) 304 | # Flatten the var - shared variable updating is not trivial otherwise, 305 | # as theano seems to believe a row vector is a matrix and will complain 306 | # about the updates 307 | orig_shape = var.shape 308 | var = var.flatten() 309 | # Here we add the name and role, the variables will later be identified 310 | # by these values 311 | var.name = id + '_%s_clean' % var_type 312 | add_role(var, BNPARAM) 313 | shared_var = self.shared(np.zeros(out_dim), 314 | name='shared_%s' % var.name, role=None) 315 | 316 | # Update running average estimates. When the counter is reset to 1, it 317 | # will clear its memory 318 | cntr, c_up = self.counter() 319 | one = np.float32(1) 320 | run_avg = lambda new, old: one / cntr * new + (one - one / cntr) * old 321 | if var_type == 'mean': 322 | new_value = run_avg(var, shared_var) 323 | elif var_type == 'var': 324 | mb_size = T.cast(mb_size, 'float32') 325 | new_value = run_avg(mb_size / (mb_size - one) * var, shared_var) 326 | else: 327 | raise NotImplementedError('Unknown batch norm var %s' % var_type) 328 | # Add the counter update to the annotated update if it is the first 329 | # instance of a counter 330 | self.annotate_update([(shared_var, new_value)] + c_up, var) 331 | 332 | return var.reshape(orig_shape) 333 | 334 | def f(self, h, in_dim, spec, num, act_f, path_name, noise_std=0): 335 | # Generates identifiers used for referencing shared variables. 336 | # E.g.
clean and corrupted encoders will end up using the same 337 | # variable name and hence sharing parameters 338 | gen_id = lambda s: '_'.join(['f', str(num), s]) 339 | layer_type, _ = spec 340 | 341 | # Pooling 342 | if layer_type in ['maxpool', 'globalmeanpool']: 343 | z, output_size = self.f_pool(h, spec, in_dim) 344 | norm_ax = (0, -2, -1) 345 | # after pooling, no activation func for now unless it's softmax 346 | act_f = "linear" if act_f != "softmax" else act_f 347 | 348 | # Convolution 349 | elif layer_type in ['convv', 'convf']: 350 | z, output_size = self.f_conv(h, spec, in_dim, gen_id('W')) 351 | norm_ax = (0, -2, -1) 352 | 353 | # Fully connected 354 | elif layer_type == "fc": 355 | h = h.flatten(2) if h.ndim > 2 else h 356 | _, dim = spec 357 | W = self.weight(self.rand_init(np.prod(in_dim), dim), gen_id('W')) 358 | z, output_size = T.dot(h, W), (dim,) 359 | norm_ax = (0,) 360 | else: 361 | raise ValueError("Unknown layer spec: %s" % layer_type) 362 | 363 | m = s = None 364 | is_normalizing = True 365 | if is_normalizing: 366 | keep_dims = True 367 | z_l = self.labeled(z) 368 | z_u = self.unlabeled(z) 369 | m = z_u.mean(norm_ax, keepdims=keep_dims) 370 | s = z_u.var(norm_ax, keepdims=keep_dims) 371 | 372 | m_l = z_l.mean(norm_ax, keepdims=keep_dims) 373 | s_l = z_l.var(norm_ax, keepdims=keep_dims) 374 | if path_name == 'clean': 375 | # Batch normalization estimates the mean and variance of 376 | # validation and test sets based on the training set 377 | # statistics. The following annotates the computation of 378 | # the running average into the graph. 379 | m_l = self.annotate_bn(m_l, gen_id('bn'), 'mean', z_l.shape[0], 380 | output_size, norm_ax) 381 | s_l = self.annotate_bn(s_l, gen_id('bn'), 'var', z_l.shape[0], 382 | output_size, norm_ax) 383 | z = self.join( 384 | (z_l - m_l) / T.sqrt(s_l + np.float32(1e-10)), 385 | (z_u - m) / T.sqrt(s + np.float32(1e-10))) 386 | 387 | if noise_std > 0: 388 | z += self.noise_like(z) * noise_std 389 | 390 | # z for lateral connection 391 | z_lat = z 392 | b_init, c_init = 0.0, 1.0 393 | b_c_size = output_size[0] 394 | 395 | # Add bias 396 | if act_f != 'linear': 397 | z += self.bias(b_init * np.ones(b_c_size), gen_id('b'), 398 | for_conv=len(output_size) > 1) 399 | 400 | if is_normalizing: 401 | # Add free parameter (gamma in the original Batch Normalization paper) 402 | # if needed by the activation. For instance ReLU doesn't need one 403 | # and we only add it to softmax if hyperparameter top_c is set.
404 | if (act_f not in ['relu', 'leakyrelu', 'linear', 'softmax'] or 405 | (act_f == 'softmax' and self.p.top_c is True)): 406 | c = self.weight(c_init * np.ones(b_c_size), gen_id('c'), 407 | for_conv=len(output_size) > 1) 408 | z *= c 409 | 410 | h = self.apply_act(z, act_f) 411 | 412 | logger.info(' f%d: %s, %s,%s noise %.2f, params %s, dim %s -> %s' % ( 413 | num, layer_type, act_f, ' BN,' if is_normalizing else '', 414 | noise_std, spec[1], in_dim, output_size)) 415 | return output_size, z_lat, m, s, h 416 | 417 | def f_pool(self, x, spec, in_dim): 418 | layer_type, dims = spec 419 | num_filters = in_dim[0] 420 | if "globalmeanpool" == layer_type: 421 | y, output_size = global_meanpool_2d(x, num_filters) 422 | # scale the variance to match normal conv layers with xavier init 423 | y = y * np.float32(in_dim[-1]) * np.float32(np.sqrt(3)) 424 | else: 425 | assert dims[0] != 1 or dims[1] != 1 426 | y, output_size = maxpool_2d(x, in_dim, 427 | poolsize=(dims[1], dims[1]), 428 | poolstride=(dims[0], dims[0])) 429 | return y, output_size 430 | 431 | def f_conv(self, x, spec, in_dim, weight_name): 432 | layer_type, dims = spec 433 | num_filters = dims[0] 434 | filter_size = (dims[1], dims[1]) 435 | stride = (dims[2], dims[2]) 436 | 437 | bm = 'full' if 'convf' in layer_type else 'valid' 438 | 439 | num_channels = in_dim[0] 440 | 441 | W = self.weight(self.rand_init_conv( 442 | (num_filters, num_channels) + filter_size), weight_name) 443 | 444 | if stride != (1, 1): 445 | f = GpuCorrMM(subsample=stride, border_mode=bm, pad=(0, 0)) 446 | y = f(gpu_contiguous(x), gpu_contiguous(W)) 447 | else: 448 | assert self.p.batch_size == self.p.valid_batch_size 449 | y = conv2d(x, W, image_shape=(2*self.p.batch_size, ) + in_dim, 450 | filter_shape=((num_filters, num_channels) + 451 | filter_size), border_mode=bm) 452 | output_size = ((num_filters,) + 453 | ConvOp.getOutputShape(in_dim[1:], filter_size, 454 | stride, bm)) 455 | 456 | return y, output_size 457 | 458 | def g(self, z_lat, z_ver, in_dims, out_dims, l_type, num, fspec, top_g): 459 | f_layer_type, dims = fspec 460 | is_conv = f_layer_type is not None and ('conv' in f_layer_type or 461 | 'pool' in f_layer_type) 462 | gen_id = lambda s: '_'.join(['g', str(num), s]) 463 | 464 | in_dim = np.prod(dtype=floatX, a=in_dims) 465 | out_dim = np.prod(dtype=floatX, a=out_dims) 466 | num_filters = out_dims[0] if is_conv else out_dim 467 | 468 | if l_type[-1] in ['0']: 469 | g_type, u_type = l_type[:-1], l_type[-1] 470 | else: 471 | g_type, u_type = l_type, None 472 | 473 | # Mapping from layer above: u 474 | if u_type in ['0'] or z_ver is None: 475 | if z_ver is None and u_type not in ['0']: 476 | logger.warn('Decoder %d:%s without vertical input' % 477 | (num, g_type)) 478 | u = None 479 | else: 480 | if top_g: 481 | u = z_ver 482 | elif is_conv: 483 | u = self.g_deconv(z_ver, in_dims, out_dims, gen_id('W'), fspec) 484 | else: 485 | W = self.weight(self.rand_init(in_dim, out_dim), gen_id('W')) 486 | u = T.dot(z_ver, W) 487 | 488 | # Batch-normalize u 489 | if u is not None: 490 | norm_ax = (0,) if u.ndim <= 2 else (0, -2, -1) 491 | keep_dims = True 492 | u -= u.mean(norm_ax, keepdims=keep_dims) 493 | u /= T.sqrt(u.var(norm_ax, keepdims=keep_dims) + 494 | np.float32(1e-10)) 495 | 496 | # Define the g function 497 | if not is_conv: 498 | z_lat = z_lat.flatten(2) 499 | bi = lambda inits, name: self.bias(inits * np.ones(num_filters), 500 | gen_id(name), for_conv=is_conv) 501 | wi = lambda inits, name: self.weight(inits * np.ones(num_filters), 502 | gen_id(name), 
for_conv=is_conv) 503 | 504 | if g_type == '': 505 | z_est = None 506 | 507 | elif g_type == 'i': 508 | z_est = z_lat 509 | 510 | elif g_type in ['sig']: 511 | sigval = bi(0., 'c1') + wi(1., 'c2') * z_lat 512 | if u is not None: 513 | sigval += wi(0., 'c3') * u + wi(0., 'c4') * z_lat * u 514 | sigval = T.nnet.sigmoid(sigval) 515 | 516 | z_est = bi(0., 'a1') + wi(1., 'a2') * z_lat + wi(1., 'b1') * sigval 517 | if u is not None: 518 | z_est += wi(0., 'a3') * u + wi(0., 'a4') * z_lat * u 519 | 520 | elif g_type in ['lin']: 521 | a1 = wi(1.0, 'a1') 522 | b = bi(0.0, 'b') 523 | 524 | z_est = a1 * z_lat + b 525 | 526 | elif g_type in ['relu']: 527 | assert u is not None 528 | b = bi(0., 'b') 529 | x = u + b 530 | z_est = self.apply_act(x, 'relu') 531 | 532 | elif g_type in ['sigmoid']: 533 | assert u is not None 534 | b = bi(0., 'b') 535 | c = wi(1., 'c') 536 | z_est = self.apply_act((u + b) * c, 'sigmoid') 537 | 538 | elif g_type in ['comparison_g2']: 539 | # sig without the uz cross term 540 | sigval = bi(0., 'c1') + wi(1., 'c2') * z_lat 541 | if u is not None: 542 | sigval += wi(0., 'c3') * u 543 | sigval = T.nnet.sigmoid(sigval) 544 | 545 | z_est = bi(0., 'a1') + wi(1., 'a2') * z_lat + wi(1., 'b1') * sigval 546 | if u is not None: 547 | z_est += wi(0., 'a3') * u 548 | 549 | elif g_type in ['comparison_g3']: 550 | # sig without the sigmoid nonlinearity 551 | z_est = bi(0., 'a1') + wi(1., 'a2') * z_lat 552 | if u is not None: 553 | z_est += wi(0., 'a3') * u + wi(0., 'a4') * z_lat * u 554 | 555 | elif g_type in ['comparison_g4']: 556 | # No mixing between z_lat and u before final sum, otherwise similar 557 | # to sig 558 | def nonlin(inp, in_name='input', add_bias=True): 559 | w1 = wi(1., 'w1_%s' % in_name) 560 | b1 = bi(0., 'b1') 561 | w2 = wi(1., 'w2_%s' % in_name) 562 | b2 = bi(0., 'b2') if add_bias else 0 563 | w3 = wi(0., 'w3_%s' % in_name) 564 | return w2 * T.nnet.sigmoid(b1 + w1 * inp) + w3 * inp + b2 565 | 566 | z_est = nonlin(z_lat, 'lat') if u is None else \ 567 | nonlin(z_lat, 'lat') + nonlin(u, 'ver', False) 568 | 569 | elif g_type in ['comparison_g5', 'gauss']: 570 | # Gaussian assumption on z: (z - mu) * v + mu 571 | if u is None: 572 | b1 = bi(0., 'b1') 573 | w1 = wi(1., 'w1') 574 | z_est = w1 * z_lat + b1 575 | else: 576 | a1 = bi(0., 'a1') 577 | a2 = wi(1., 'a2') 578 | a3 = bi(0., 'a3') 579 | a4 = bi(0., 'a4') 580 | a5 = bi(0., 'a5') 581 | 582 | a6 = bi(0., 'a6') 583 | a7 = wi(1., 'a7') 584 | a8 = bi(0., 'a8') 585 | a9 = bi(0., 'a9') 586 | a10 = bi(0., 'a10') 587 | 588 | mu = a1 * T.nnet.sigmoid(a2 * u + a3) + a4 * u + a5 589 | v = a6 * T.nnet.sigmoid(a7 * u + a8) + a9 * u + a10 590 | 591 | z_est = (z_lat - mu) * v + mu 592 | elif 'gauss_stable_v' in g_type: 593 | # Gaussian assumption on z: (z - mu) * v + mu 594 | if u is None: 595 | b1 = bi(0., 'b1') 596 | w1 = wi(1., 'w1') 597 | z_est = w1 * z_lat + b1 598 | elif z_lat is None: 599 | b1 = bi(0., 'b1') 600 | w1 = wi(1., 'w1') 601 | z_est = w1 * u + b1 602 | else: 603 | a1 = bi(0., 'a1') 604 | a2 = wi(1., 'a2') 605 | a3 = bi(0., 'a3') 606 | a4 = bi(0., 'a4') 607 | a5 = bi(0., 'a5') 608 | a6 = bi(0., 'a6') 609 | a7 = wi(1., 'a7') 610 | a8 = bi(0., 'a8') 611 | a9 = bi(0., 'a9') 612 | a10 = bi(0., 'a10') 613 | 614 | mu = a1 * T.nnet.sigmoid(a2 * u + a3) + a4 * u + a5 615 | v = a6 * T.nnet.sigmoid(a7 * u + a8) + a9 * u + a10 616 | v = T.nnet.sigmoid(v) 617 | 618 | z_est = (z_lat - mu) * v + mu 619 | else: 620 | raise NotImplementedError("unknown g type: %s" % str(g_type)) 621 | 622 | # Reshape the output if z is for conv but u 
from fc layer 623 | if (z_est is not None and type(out_dims) == tuple and 624 | len(out_dims) > 1.0 and z_est.ndim < 4): 625 | z_est = z_est.reshape((z_est.shape[0],) + out_dims) 626 | 627 | return z_est 628 | 629 | def g_deconv(self, z_ver, in_dims, out_dims, weight_name, fspec): 630 | """ Inverse operation for each type of f used in convnets """ 631 | f_type, f_dims = fspec 632 | assert z_ver is not None 633 | num_channels = in_dims[0] if in_dims is not None else None 634 | num_filters, width, height = out_dims[:3] 635 | 636 | if f_type in ['globalmeanpool']: 637 | u = T.addbroadcast(z_ver, 2, 3) 638 | assert in_dims[1] == 1 and in_dims[2] == 1, \ 639 | "global pooling needs in_dims (1,1): %s" % str(in_dims) 640 | 641 | elif f_type in ['maxpool']: 642 | sh, str, size = z_ver.shape, f_dims[0], f_dims[1] 643 | assert str == size, "depooling requires stride == size" 644 | u = T.zeros((sh[0], sh[1], sh[2] * str, sh[3] * str), 645 | dtype=z_ver.dtype) 646 | for x in xrange(str): 647 | for y in xrange(str): 648 | u = T.set_subtensor(u[:, :, x::str, y::str], z_ver) 649 | u = u[:, :, :width, :height] 650 | 651 | elif f_type in ['convv', 'convf']: 652 | filter_size, str = (f_dims[1], f_dims[1]), f_dims[2] 653 | W_shape = (num_filters, num_channels) + filter_size 654 | W = self.weight(self.rand_init_conv(W_shape), weight_name) 655 | if str > 1: 656 | # upsample if strided version 657 | sh = z_ver.shape 658 | u = T.zeros((sh[0], sh[1], sh[2] * str, sh[3] * str), 659 | dtype=z_ver.dtype) 660 | u = T.set_subtensor(u[:, :, ::str, ::str], z_ver) 661 | else: 662 | u = z_ver # no strides, only deconv 663 | u = conv2d(u, W, filter_shape=W_shape, 664 | border_mode='valid' if 'convf' in f_type else 'full') 665 | u = u[:, :, :width, :height] 666 | else: 667 | raise NotImplementedError('Layer %s has no convolutional decoder' 668 | % f_type) 669 | 670 | return u 671 | --------------------------------------------------------------------------------
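Editor's note: for reference, below is a minimal NumPy sketch (not part of the repository) of the 'gauss' combinator computed by g() in ladder.py. In the real code a1..a10 are trainable per-unit shared variables created with bi/wi; here they are collapsed to plain floats with the same initial values (biases 0, the multiplicative weights a2 and a7 set to 1), so the arithmetic of z_est = (z_lat - mu) * v + mu can be checked in isolation. The function name gauss_combinator is illustrative only.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def gauss_combinator(z_lat, u,
                     a=(0., 1., 0., 0., 0., 0., 1., 0., 0., 0.)):
    # a holds (a1..a10); defaults mirror the bi(0.)/wi(1.) initializations above.
    a1, a2, a3, a4, a5, a6, a7, a8, a9, a10 = a
    mu = a1 * sigmoid(a2 * u + a3) + a4 * u + a5   # mean estimated from the vertical signal u
    v = a6 * sigmoid(a7 * u + a8) + a9 * u + a10   # gating term applied to the lateral z_lat
    return (z_lat - mu) * v + mu                   # shrink z_lat towards mu

# Example usage: with the default initialization both mu and v are zero, so
# z_est starts at zero; training adjusts a1..a10 per unit.
z_lat = np.random.randn(4, 10).astype('float32')
u = np.random.randn(4, 10).astype('float32')
print(gauss_combinator(z_lat, u).shape)   # (4, 10)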