├── __init__.py ├── environment.yml ├── LICENSE ├── utils.py ├── README.md ├── nn.py ├── run.py └── ladder.py /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: ladder 2 | dependencies: 3 | - hdf5=1.8.15.1=2 4 | - h5py=2.5.0=np110py27_4 5 | - matplotlib=1.5.1=np110py27_0 6 | - nomkl=1.0 7 | - openblas=0.2.14 8 | - numpy=1.10.4=py27_nomkl_2 9 | - pandas=0.17.1=np110py27_0 10 | - pytables=3.2.2=np110py27_0 11 | - python=2.7.11=0 12 | - scipy=0.17.0=np110py27_nomkl_0 13 | - pip 14 | - pip: 15 | - git+git://github.com/Theano/Theano.git@rel-0.8.2 16 | - git+git://github.com/mila-udem/fuel.git@0.2.0 17 | - git+git://github.com/mila-udem/blocks.git@0.2 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 arasmus 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import logging 4 | import numpy as np 5 | import theano 6 | from pandas import DataFrame, read_hdf 7 | 8 | from blocks.extensions import Printing, SimpleExtension 9 | from blocks.main_loop import MainLoop 10 | from blocks.roles import add_role 11 | 12 | logger = logging.getLogger('main.utils') 13 | 14 | 15 | def shared_param(init, name, cast_float32, role, **kwargs): 16 | if cast_float32: 17 | v = np.float32(init) 18 | p = theano.shared(v, name=name, **kwargs) 19 | add_role(p, role) 20 | return p 21 | 22 | 23 | class AttributeDict(dict): 24 | __getattr__ = dict.__getitem__ 25 | 26 | def __setattr__(self, a, b): 27 | self.__setitem__(a, b) 28 | 29 | 30 | class DummyLoop(MainLoop): 31 | def __init__(self, extensions): 32 | return super(DummyLoop, self).__init__(algorithm=None, 33 | data_stream=None, 34 | extensions=extensions) 35 | 36 | def run(self): 37 | for extension in self.extensions: 38 | extension.main_loop = self 39 | self._run_extensions('before_training') 40 | self._run_extensions('after_training') 41 | 42 | 43 | class ShortPrinting(Printing): 44 | def __init__(self, to_print, use_log=True, **kwargs): 45 | self.to_print = to_print 46 | self.use_log = use_log 47 | super(ShortPrinting, self).__init__(**kwargs) 48 | 49 | def do(self, which_callback, *args): 50 | log = self.main_loop.log 51 | 52 | # Iteration 53 | msg = "e {}, i {}:".format( 54 | log.status['epochs_done'], 55 | log.status['iterations_done']) 56 | 57 | # Requested channels 58 | items = [] 59 | for k, vars in self.to_print.iteritems(): 60 | for shortname, vars in vars.iteritems(): 61 | if vars is None: 62 | continue 63 | if type(vars) is not list: 64 | vars = [vars] 65 | 66 | s = "" 67 | for var in vars: 68 | try: 69 | name = k + '_' + var.name 70 | val = log.current_row[name] 71 | except: 72 | continue 73 | try: 74 | s += ' ' + ' '.join(["%.3g" % v for v in val]) 75 | except: 76 | s += " %.3g" % val 77 | if s != "": 78 | items += [shortname + s] 79 | msg = msg + ", ".join(items) 80 | if self.use_log: 81 | logger.info(msg) 82 | else: 83 | print msg 84 | 85 | 86 | class SaveParams(SimpleExtension): 87 | """Finishes the training process when triggered.""" 88 | def __init__(self, trigger_var, params, save_path, **kwargs): 89 | super(SaveParams, self).__init__(**kwargs) 90 | if trigger_var is None: 91 | self.var_name = None 92 | else: 93 | self.var_name = trigger_var[0] + '_' + trigger_var[1].name 94 | self.save_path = save_path 95 | self.params = params 96 | self.to_save = {} 97 | self.best_value = None 98 | self.add_condition(['after_training'], self.save) 99 | self.add_condition(['on_interrupt'], self.save) 100 | 101 | def save(self, which_callback, *args): 102 | if self.var_name is None: 103 | self.to_save = {v.name: v.get_value() for v in self.params} 104 | path = self.save_path + '/trained_params' 105 | logger.info('Saving to %s' % path) 106 | np.savez_compressed(path, **self.to_save) 107 | 108 | def do(self, which_callback, *args): 109 | if self.var_name is None: 110 | return 111 | val = self.main_loop.log.current_row[self.var_name] 112 | if self.best_value is None or val < self.best_value: 113 | self.best_value = val 114 | self.to_save = {v.name: v.get_value() for v in self.params} 115 | 116 | 117 | class SaveExpParams(SimpleExtension): 118 | def __init__(self, experiment_params, dir, **kwargs): 119 | 
super(SaveExpParams, self).__init__(**kwargs) 120 | self.dir = dir 121 | self.experiment_params = experiment_params 122 | 123 | def do(self, which_callback, *args): 124 | df = DataFrame.from_dict(self.experiment_params, orient='index') 125 | df.to_hdf(os.path.join(self.dir, 'params'), 'params', mode='w', 126 | complevel=5, complib='blosc') 127 | 128 | 129 | class SaveLog(SimpleExtension): 130 | def __init__(self, dir, show=None, **kwargs): 131 | super(SaveLog, self).__init__(**kwargs) 132 | self.dir = dir 133 | self.show = show if show is not None else [] 134 | 135 | def do(self, which_callback, *args): 136 | df = DataFrame.from_dict(self.main_loop.log, orient='index') 137 | df.to_hdf(os.path.join(self.dir, 'log'), 'log', mode='w', 138 | complevel=5, complib='blosc') 139 | 140 | 141 | def prepare_dir(save_to, results_dir='results'): 142 | base = os.path.join(results_dir, save_to) 143 | i = 0 144 | 145 | while True: 146 | name = base + str(i) 147 | try: 148 | os.makedirs(name) 149 | break 150 | except: 151 | i += 1 152 | 153 | return name 154 | 155 | 156 | def load_df(dirpath, filename, varname=None): 157 | varname = filename if varname is None else varname 158 | fn = os.path.join(dirpath, filename) 159 | return read_hdf(fn, varname) 160 | 161 | 162 | def filter_funcs_prefix(d, pfx): 163 | pfx = 'cmd_' 164 | fp = lambda x: x.find(pfx) 165 | return {n[fp(n) + len(pfx):]: v for n, v in d.iteritems() if fp(n) >= 0} 166 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This repository contains source code for the experiments in a paper titled 2 | [_Semi-Supervised Learning with Ladder Networks_](http://arxiv.org/abs/1507.02672) by A Rasmus, H Valpola, M Honkala, 3 | M Berglund, and T Raiko. 4 | 5 | #### Required libraries 6 | ##### Install Theano, Blocks Stable 0.2, Fuel Stable 0.2 7 | Refer to the [Blocks installation instructions](http://blocks.readthedocs.org/en/latest/setup.html) for 8 | details but use tag v0.2 instead. Something along the lines of: 9 | ``` 10 | pip install git+git://github.com/mila-udem/blocks.git@v0.2 11 | pip install git+git://github.com/mila-udem/fuel.git@v0.2.0 12 | ``` 13 | Fuel comes with Blocks, but you need to download and convert the datasets. 14 | Refer to the Fuel documentation. One might need to rename the converted files. 15 | ``` 16 | fuel-download mnist 17 | fuel-convert mnist --dtype float32 18 | fuel-download cifar10 19 | fuel-convert cifar10 20 | ``` 21 | ##### Alternatively, one can use the environment.yml file that is provided in this repo to create a conda environment. 22 | 1. First install Anaconda from https://www.continuum.io/downloads. 23 | 2. `conda env create -f environment.yml` 24 | 3. `source activate ladder` 25 | 4. The environment should be good to go! 26 | 27 | #### Models in the paper 28 | 29 | The following commands train the models with seed 1. The reported numbers in the paper are averages over 30 | several random seeds. These commands use all the training samples for training (`--unlabeled-samples 60000`) 31 | and leave none for validation. This results in a lot of NaNs being printed during the training, since 32 | the validation statistics are not available. If you want to observe the validation error and costs during the 33 | training, use `--unlabeled-samples 50000`.
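Each entry of `--denoising-cost-x` is the weight of the denoising cost for one layer, counted from the input layer upward. `run.py` prepends the input layer to `--encoder-layers`, so the six-layer encoder specs below need seven comma-separated weights, and a weight of 0 simply disables the denoising cost for that layer (this is how the bottom-only, Gamma, and supervised-baseline variants are obtained). A minimal sketch of that correspondence, with illustrative values taken from the first command below (this snippet is not part of the repo):
```python
# Illustrative only: how --encoder-layers and --denoising-cost-x line up.
# run.py prepends the input layer to the encoder spec, so the cost list has
# one more entry than the spec passed on the command line.
encoder_spec = '1000-500-250-250-250-10'.split('-')            # six encoder layers
layers = ['input'] + encoder_spec                              # seven layers in total
denoising_cost_x = [1000., 1., 0.01, 0.01, 0.01, 0.01, 0.01]   # seven weights
assert len(layers) == len(denoising_cost_x)
for name, weight in zip(layers, denoising_cost_x):
    print('layer %-6s -> denoising cost weight %g' % (name, weight))
```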
34 | 35 | 36 | ##### MNIST all labels 37 | ``` 38 | # Full 39 | run.py train --encoder-layers 1000-500-250-250-250-10 --decoder-spec gauss --denoising-cost-x 1000,1,0.01,0.01,0.01,0.01,0.01 --labeled-samples 60000 --unlabeled-samples 60000 --seed 1 -- mnist_all_full 40 | # Bottom 41 | run.py train --encoder-layers 1000-500-250-250-250-10 --decoder-spec gauss --denoising-cost-x 2000,0,0,0,0,0,0 --labeled-samples 60000 --unlabeled-samples 60000 --seed 1 -- mnist_all_bottom 42 | # Gamma model 43 | run.py train --encoder-layers 1000-500-250-250-250-10 --decoder-spec 0-0-0-0-0-0-gauss --denoising-cost-x 0,0,0,0,0,0,2 --labeled-samples 60000 --unlabeled-samples 60000 --seed 1 -- mnist_all_gamma 44 | # Supervised baseline 45 | run.py train --encoder-layers 1000-500-250-250-250-10 --decoder-spec 0-0-0-0-0-0-0 --denoising-cost-x 0,0,0,0,0,0,0 --labeled-samples 60000 --unlabeled-samples 60000 --f-local-noise-std 0.5 --seed 1 -- mnist_all_baseline 46 | ``` 47 | 48 | ##### MNIST 100 labels 49 | ``` 50 | # Full 51 | run.py train --encoder-layers 1000-500-250-250-250-10 --decoder-spec gauss --denoising-cost-x 1000,10,0.1,0.1,0.1,0.1,0.1 --labeled-samples 100 --unlabeled-samples 60000 --seed 1 -- mnist_100_full 52 | # Bottom-only 53 | run.py train --encoder-layers 1000-500-250-250-250-10 --decoder-spec gauss --denoising-cost-x 5000,0,0,0,0,0,0 --labeled-samples 100 --unlabeled-samples 60000 --seed 1 -- mnist_100_bottom 54 | # Gamma 55 | run.py train --encoder-layers 1000-500-250-250-250-10 --decoder-spec 0-0-0-0-0-0-gauss --denoising-cost-x 0,0,0,0,0,0,0.5 --labeled-samples 100 --unlabeled-samples 60000 --seed 1 -- mnist_100_gamma 56 | # Supervised baseline 57 | run.py train --encoder-layers 1000-500-250-250-250-10 --decoder-spec 0-0-0-0-0-0-0 --denoising-cost-x 0,0,0,0,0,0,0 --labeled-samples 100 --unlabeled-samples 60000 --f-local-noise-std 0.5 --seed 1 -- mnist_100_baseline 58 | ``` 59 | 60 | ##### MNIST 1000 labels 61 | ``` 62 | # Full 63 | run.py train --encoder-layers 1000-500-250-250-250-10 --decoder-spec gauss --denoising-cost-x 2000,20,0.1,0.1,0.1,0.1,0.1 --f-local-noise-std 0.2 --labeled-samples 1000 --unlabeled-samples 60000 --seed 1 -- mnist_1000_full 64 | # Bottom-only 65 | run.py train --encoder-layers 1000-500-250-250-250-10 --decoder-spec gauss --denoising-cost-x 2000,0,0,0,0,0,0 --labeled-samples 1000 --unlabeled-samples 60000 --seed 1 -- mnist_1000_bottom 66 | # Gamma model 67 | run.py train --encoder-layers 1000-500-250-250-250-10 --decoder-spec 0-0-0-0-0-0-gauss --denoising-cost-x 0,0,0,0,0,0,10 --labeled-samples 1000 --unlabeled-samples 60000 --seed 1 -- mnist_1000_gamma 68 | # Supervised baseline 69 | run.py train --encoder-layers 1000-500-250-250-250-10 --decoder-spec 0-0-0-0-0-0-0 --denoising-cost-x 0,0,0,0,0,0,0 --labeled-samples 1000 --unlabeled-samples 60000 --f-local-noise-std 0.5 --seed 1 -- mnist_1000_baseline 70 | ``` 71 | 72 | ##### MNIST 50 labels 73 | ``` 74 | # Full model 75 | run.py train --encoder-layers 1000-500-250-250-250-10 --decoder-spec gauss --denoising-cost-x 2000,20,0.1,0.1,0.1,0.1,0.1 --labeled-samples 50 --unlabeled-samples 60000 --seed 1 -- mnist_50_full 76 | ``` 77 | 78 | ##### MNIST convolutional models 79 | ``` 80 | # Conv-FC 81 | run.py train --encoder-layers convv:1000:26:1:1-convv:500:1:1:1-convv:250:1:1:1-convv:250:1:1:1-convv:250:1:1:1-convv:10:1:1:1-globalmeanpool:0 --decoder-spec gauss --denoising-cost-x 1000,10,0.1,0.1,0.1,0.1,0.1,0.1 --labeled-samples 100 --unlabeled-samples 60000 --seed 1 -- mnist_100_conv_fc 82 | # Conv-Small, Gamma 83 | 
run.py train --encoder-layers convf:32:5:1:1-maxpool:2:2-convv:64:3:1:1-convf:64:3:1:1-maxpool:2:2-convv:128:3:1:1-convv:10:1:1:1-globalmeanpool:6:6-fc:10 --decoder-spec 0-0-0-0-0-0-0-0-0-gauss --denoising-cost-x 0,0,0,0,0,0,0,0,0,1 --labeled-samples 100 --unlabeled-samples 60000 --seed 1 -- mnist_100_conv_gamma 84 | # Conv-Small, supervised baseline. Overfits easily, so keep training short. 85 | run.py train --encoder-layers convf:32:5:1:1-maxpool:2:2-convv:64:3:1:1-convf:64:3:1:1-maxpool:2:2-convv:128:3:1:1-convv:10:1:1:1-globalmeanpool:6:6-fc:10 --decoder-spec 0-0-0-0-0-0-0-0-0-0 --denoising-cost-x 0,0,0,0,0,0,0,0,0,0 --num-epochs 20 --lrate-decay 0.5 --f-local-noise-std 0.45 --labeled-samples 100 --unlabeled-samples 60000 --seed 1 -- mnist_100_conv_baseline 86 | ``` 87 | 88 | ##### CIFAR models 89 | ``` 90 | # Conv-Large, Gamma 91 | ./run.py train --encoder-layers convv:96:3:1:1-convf:96:3:1:1-convf:96:3:1:1-maxpool:2:2-convv:192:3:1:1-convf:192:3:1:1-convv:192:3:1:1-maxpool:2:2-convv:192:3:1:1-convv:192:1:1:1-convv:10:1:1:1-globalmeanpool:0 --decoder-spec 0-0-0-0-0-0-0-0-0-0-0-0-gauss --dataset cifar10 --act leakyrelu --denoising-cost-x 0,0,0,0,0,0,0,0,0,0,0,0,4.0 --num-epochs 70 --lrate-decay 0.86 --seed 1 --whiten-zca 3072 --contrast-norm 55 --top-c False --labeled-samples 4000 --unlabeled-samples 50000 -- cifar_4k_gamma 92 | # Conv-Large, supervised baseline. Overfits easily, so keep training short. 93 | ./run.py train --encoder-layers convv:96:3:1:1-convf:96:3:1:1-convf:96:3:1:1-maxpool:2:2-convv:192:3:1:1-convf:192:3:1:1-convv:192:3:1:1-maxpool:2:2-convv:192:3:1:1-convv:192:1:1:1-convv:10:1:1:1-globalmeanpool:0 --decoder-spec 0-0-0-0-0-0-0-0-0-0-0-0-0 --dataset cifar10 --act leakyrelu --denoising-cost-x 0,0,0,0,0,0,0,0,0,0,0,0,0 --num-epochs 20 --lrate-decay 0.5 --seed 1 --whiten-zca 3072 --contrast-norm 55 --top-c False --labeled-samples 4000 --unlabeled-samples 50000 -- cifar_4k_baseline 94 | ``` 95 | 96 | ##### Evaluating models with testset 97 | After training a model, you can infer the results on a test set by performing the `evaluate` command. 
98 | An example use after training a model: 99 | ``` 100 | ./run.py evaluate results/mnist_all_bottom0 101 | ``` 102 | -------------------------------------------------------------------------------- /nn.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import logging 3 | 4 | import scipy 5 | import numpy as np 6 | from theano import tensor 7 | from theano.tensor.signal.pool import pool_2d, Pool 8 | 9 | from blocks.extensions import SimpleExtension 10 | from blocks.extensions.monitoring import (DataStreamMonitoring, 11 | MonitoringExtension) 12 | from blocks.filter import VariableFilter 13 | from blocks.graph import ComputationGraph 14 | from blocks.monitoring.evaluators import DatasetEvaluator 15 | from blocks.roles import VariableRole 16 | 17 | logger = logging.getLogger('main.nn') 18 | 19 | 20 | class BnParamRole(VariableRole): 21 | pass 22 | 23 | # Batch normalization parameters that have to be replaced when testing 24 | BNPARAM = BnParamRole() 25 | 26 | 27 | class ZCA(object): 28 | def __init__(self, n_components=None, data=None, filter_bias=0.1): 29 | self.filter_bias = np.float32(filter_bias) 30 | self.P = None 31 | self.P_inv = None 32 | self.n_components = 0 33 | self.is_fit = False 34 | if n_components and data: 35 | self.fit(n_components, data) 36 | 37 | def fit(self, n_components, data): 38 | if len(data.shape) == 2: 39 | self.reshape = None 40 | else: 41 | assert n_components == np.product(data.shape[1:]), \ 42 | 'ZCA whitening components should be %d for convolutional data'\ 43 | % np.product(data.shape[1:]) 44 | self.reshape = data.shape[1:] 45 | 46 | data = self._flatten_data(data) 47 | assert len(data.shape) == 2 48 | n, m = data.shape 49 | self.mean = np.mean(data, axis=0) 50 | 51 | bias = self.filter_bias * scipy.sparse.identity(m, 'float32') 52 | cov = np.cov(data, rowvar=0, bias=1) + bias 53 | eigs, eigv = scipy.linalg.eigh(cov) 54 | 55 | assert not np.isnan(eigs).any() 56 | assert not np.isnan(eigv).any() 57 | assert eigs.min() > 0 58 | 59 | if self.n_components: 60 | eigs = eigs[-self.n_components:] 61 | eigv = eigv[:, -self.n_components:] 62 | 63 | sqrt_eigs = np.sqrt(eigs) 64 | self.P = np.dot(eigv * (1.0 / sqrt_eigs), eigv.T) 65 | assert not np.isnan(self.P).any() 66 | self.P_inv = np.dot(eigv * sqrt_eigs, eigv.T) 67 | 68 | self.P = np.float32(self.P) 69 | self.P_inv = np.float32(self.P_inv) 70 | 71 | self.is_fit = True 72 | 73 | def apply(self, data, remove_mean=True): 74 | data = self._flatten_data(data) 75 | d = data - self.mean if remove_mean else data 76 | return self._reshape_data(np.dot(d, self.P)) 77 | 78 | def inv(self, data, add_mean=True): 79 | d = np.dot(self._flatten_data(data), self.P_inv) 80 | d += self.mean if add_mean else 0. 
81 | return self._reshape_data(d) 82 | 83 | def _flatten_data(self, data): 84 | if self.reshape is None: 85 | return data 86 | assert data.shape[1:] == self.reshape 87 | return data.reshape(data.shape[0], np.product(data.shape[1:])) 88 | 89 | def _reshape_data(self, data): 90 | assert len(data.shape) == 2 91 | if self.reshape is None: 92 | return data 93 | return np.reshape(data, (data.shape[0],) + self.reshape) 94 | 95 | 96 | class ContrastNorm(object): 97 | def __init__(self, scale=55, epsilon=1e-8): 98 | self.scale = np.float32(scale) 99 | self.epsilon = np.float32(epsilon) 100 | 101 | def apply(self, data, copy=False): 102 | if copy: 103 | data = np.copy(data) 104 | data_shape = data.shape 105 | if len(data.shape) > 2: 106 | data = data.reshape(data.shape[0], np.product(data.shape[1:])) 107 | 108 | assert len(data.shape) == 2, 'Contrast norm on flattened data' 109 | 110 | data -= data.mean(axis=1)[:, np.newaxis] 111 | 112 | norms = np.sqrt(np.sum(data ** 2, axis=1)) / self.scale 113 | norms[norms < self.epsilon] = np.float32(1.) 114 | 115 | data /= norms[:, np.newaxis] 116 | 117 | if data_shape != data.shape: 118 | data = data.reshape(data_shape) 119 | 120 | return data 121 | 122 | 123 | class TestMonitoring(object): 124 | def _get_bn_params(self, output_vars): 125 | # Pick out the nodes with batch normalization vars 126 | cg = ComputationGraph(output_vars) 127 | var_filter = VariableFilter(roles=[BNPARAM]) 128 | bn_ps = var_filter(cg.variables) 129 | 130 | if len(bn_ps) == 0: 131 | logger.warn('No batch normalization parameters found - is' + 132 | ' batch normalization turned off?') 133 | self._bn = False 134 | self._counter = None 135 | self._counter_max = None 136 | bn_share = [] 137 | output_vars_replaced = output_vars 138 | else: 139 | self._bn = True 140 | assert len(set([p.name for p in bn_ps])) == len(bn_ps), \ 141 | 'Some batch norm params have the same name' 142 | logger.info('Batch norm parameters: %s' % ', '.join([p.name for p in bn_ps])) 143 | 144 | # Filter out the shared variables from the model updates 145 | def filter_share(par): 146 | lst = [up for up in cg.updates if up.name == 'shared_%s' % par.name] 147 | assert len(lst) == 1 148 | return lst[0] 149 | bn_share = map(filter_share, bn_ps) 150 | 151 | # Replace the BN coefficients in the test data model - Replace the 152 | # theano variables in the test graph with the shareds 153 | output_vars_replaced = cg.replace(zip(bn_ps, bn_share)).outputs 154 | 155 | # Pick out the counter 156 | self._counter = self._param_from_updates(cg.updates, 'counter') 157 | self._counter_max = self._param_from_updates(cg.updates, 'counter_max') 158 | 159 | return bn_ps, bn_share, output_vars_replaced 160 | 161 | def _param_from_updates(self, updates, p_name): 162 | var_filter = VariableFilter(roles=[BNPARAM]) 163 | bn_ps = var_filter(updates.keys()) 164 | p = [p for p in bn_ps if p.name == p_name] 165 | assert len(p) == 1, 'No %s of more than one %s' % (p_name, p_name) 166 | return p[0] 167 | 168 | def reset_counter(self): 169 | if self._bn: 170 | self._counter.set_value(np.float32(1)) 171 | 172 | def replicate_vars(self, output_vars): 173 | # Problem in Blocks with multiple monitors monitoring the 174 | # same value in a graph. 
Therefore, they are all "replicated" to a new 175 | # Theano variable 176 | if isinstance(output_vars, (list, tuple)): 177 | return map(self.replicate_vars, output_vars) 178 | assert not hasattr(output_vars.tag, 'aggregation_scheme'), \ 179 | 'The variable %s already has an aggregator ' % output_vars.name + \ 180 | 'assigned to it - are you using a datasetmonitor with the same' + \ 181 | ' variable as output? This might cause trouble in Blocks' 182 | new_var = 1 * output_vars 183 | new_var.name = output_vars.name 184 | return new_var 185 | 186 | 187 | class ApproxTestMonitoring(DataStreamMonitoring, TestMonitoring): 188 | def __init__(self, output_vars, *args, **kwargs): 189 | output_vars = self.replicate_vars(output_vars) 190 | _, _, replaced_vars = self._get_bn_params(output_vars) 191 | super(ApproxTestMonitoring, self).__init__(replaced_vars, *args, 192 | **kwargs) 193 | 194 | def do(self, which_callback, *args, **kwargs): 195 | assert not which_callback == "after_batch", "Do not monitor each mb" 196 | self.reset_counter() 197 | super(ApproxTestMonitoring, self).do(which_callback, *args, **kwargs) 198 | 199 | 200 | class FinalTestMonitoring(SimpleExtension, MonitoringExtension, TestMonitoring): 201 | """Monitors validation and test set data with batch norm 202 | 203 | Calculates the training set statistics for batch normalization and adds 204 | them to the model before calculating the validation and test set values. 205 | This is done in two steps: First the training set is iterated and the 206 | statistics are saved in shared variables, then the model iterates through 207 | the test/validation set using the saved shared variables. 208 | When the training set is iterated, it is done for the full set, layer by 209 | layer so that the statistics are correct. 
This is expensive for very deep 210 | models, in which case some approximation could be in order 211 | """ 212 | def __init__(self, output_vars, train_data_stream, test_data_stream, 213 | **kwargs): 214 | output_vars = self.replicate_vars(output_vars) 215 | super(FinalTestMonitoring, self).__init__(**kwargs) 216 | self.trn_stream = train_data_stream 217 | self.tst_stream = test_data_stream 218 | 219 | bn_ps, bn_share, output_vars_replaced = self._get_bn_params(output_vars) 220 | 221 | if self._bn: 222 | updates = self._get_updates(bn_ps, bn_share) 223 | trn_evaluator = DatasetEvaluator(bn_ps, updates=updates) 224 | else: 225 | trn_evaluator = None 226 | 227 | self._trn_evaluator = trn_evaluator 228 | self._tst_evaluator = DatasetEvaluator(output_vars_replaced) 229 | 230 | def _get_updates(self, bn_ps, bn_share): 231 | cg = ComputationGraph(bn_ps) 232 | # Only store updates that relate to params or the counter 233 | updates = OrderedDict([(up, cg.updates[up]) for up in 234 | cg.updates if up.name == 'counter' or 235 | up in bn_share]) 236 | assert self._counter == self._param_from_updates(cg.updates, 'counter') 237 | assert self._counter_max == self._param_from_updates(cg.updates, 238 | 'counter_max') 239 | assert len(updates) == len(bn_ps) + 1, \ 240 | 'Counter or var missing from update' 241 | return updates 242 | 243 | def do(self, which_callback, *args): 244 | """Write the values of monitored variables to the log.""" 245 | assert not which_callback == "after_batch", "Do not monitor each mb" 246 | # Run on train data and get the statistics 247 | if self._bn: 248 | self._counter_max.set_value(np.float32(np.inf)) 249 | self.reset_counter() 250 | self._trn_evaluator.evaluate(self.trn_stream) 251 | self.reset_counter() 252 | 253 | value_dict = self._tst_evaluator.evaluate(self.tst_stream) 254 | self.add_records(self.main_loop.log, value_dict.items()) 255 | 256 | 257 | class LRDecay(SimpleExtension): 258 | def __init__(self, lr, decay_first, decay_last, **kwargs): 259 | super(LRDecay, self).__init__(**kwargs) 260 | self.iter = 0 261 | self.decay_first = decay_first 262 | self.decay_last = decay_last 263 | self.lr = lr 264 | self.lr_init = np.float32(lr) 265 | 266 | def do(self, which_callback, *args): 267 | self.iter += 1 268 | if self.iter > self.decay_first: 269 | ratio = 1.0 * (self.decay_last - self.iter) 270 | ratio = np.maximum(0, ratio / (self.decay_last - self.decay_first)) 271 | self.lr = np.float32(ratio * self.lr_init) 272 | logger.info("Iter %d, lr %f" % (self.iter, self.lr)) 273 | 274 | 275 | def global_meanpool_2d(x, num_filters): 276 | mean = tensor.mean(x.flatten(3), axis=2) 277 | mean = mean.dimshuffle(0, 1, 'x', 'x') 278 | return mean, (num_filters, 1, 1) 279 | 280 | 281 | def pool_2d(x, mode="average", ws=(2, 2), stride=(2, 2)): 282 | import theano.sandbox.cuda as cuda 283 | assert cuda.dnn.dnn_available() 284 | return cuda.dnn.dnn_pool(x, ws=ws, stride=stride, mode=mode) 285 | 286 | 287 | def maxpool_2d(z, in_dim, poolsize, poolstride): 288 | z = pool_2d(z, ds=poolsize, st=poolstride) 289 | output_size = tuple(Pool.out_shape(in_dim, poolsize, st=poolstride)) 290 | return z, output_size 291 | 292 | def softmax_n(x, axis=-1): 293 | e_x = tensor.exp(x - x.max(axis=axis, keepdims=True)) 294 | out = e_x / e_x.sum(axis=axis, keepdims=True) 295 | return out 296 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import 
functools 4 | import logging 5 | import os 6 | import subprocess 7 | from argparse import ArgumentParser, Action 8 | from collections import OrderedDict 9 | import sys 10 | from pandas import DataFrame 11 | 12 | import numpy 13 | import time 14 | import theano 15 | from theano.tensor.type import TensorType 16 | 17 | from blocks.algorithms import GradientDescent, Adam 18 | from blocks.extensions import FinishAfter 19 | from blocks.extensions.monitoring import TrainingDataMonitoring 20 | from blocks.filter import VariableFilter 21 | from blocks.graph import ComputationGraph 22 | from blocks.main_loop import MainLoop 23 | from blocks.model import Model 24 | from blocks.roles import PARAMETER 25 | from fuel.datasets import MNIST, CIFAR10 26 | from fuel.schemes import ShuffledScheme, SequentialScheme 27 | from fuel.streams import DataStream 28 | from fuel.transformers import Transformer 29 | 30 | from picklable_itertools import cycle, imap 31 | from itertools import izip, product, tee 32 | 33 | logger = logging.getLogger('main') 34 | 35 | from utils import ShortPrinting, prepare_dir, load_df, DummyLoop 36 | from utils import SaveExpParams, SaveLog, SaveParams, AttributeDict 37 | from nn import ZCA, ContrastNorm 38 | from nn import ApproxTestMonitoring, FinalTestMonitoring, TestMonitoring 39 | from nn import LRDecay 40 | from ladder import LadderAE 41 | 42 | 43 | class Whitening(Transformer): 44 | """ Makes a copy of the examples in the underlying dataset and whitens it 45 | if necessary. 46 | """ 47 | def __init__(self, data_stream, iteration_scheme, whiten, cnorm=None, 48 | **kwargs): 49 | super(Whitening, self).__init__(data_stream, 50 | iteration_scheme=iteration_scheme, 51 | **kwargs) 52 | data = data_stream.get_data(slice(data_stream.dataset.num_examples)) 53 | self.data = [] 54 | for s, d in zip(self.sources, data): 55 | if 'features' == s: 56 | # Fuel provides Cifar in uint8, convert to float32 57 | d = numpy.require(d, dtype=numpy.float32) 58 | if cnorm is not None: 59 | d = cnorm.apply(d) 60 | if whiten is not None: 61 | d = whiten.apply(d) 62 | self.data += [d] 63 | elif 'targets' == s: 64 | d = unify_labels(d) 65 | self.data += [d] 66 | else: 67 | raise Exception("Unsupported Fuel target: %s" % s) 68 | 69 | def get_data(self, request=None): 70 | return (s[request] for s in self.data) 71 | 72 | 73 | class SemiDataStream(Transformer): 74 | """ Combines two datastreams into one such that 'target' source (labels) 75 | is used only from the first one. The second one is renamed 76 | to avoid collision. Upon iteration, the first one is repeated until 77 | the second one depletes. 78 | """ 79 | def __init__(self, data_stream_labeled, data_stream_unlabeled, **kwargs): 80 | super(Transformer, self).__init__(**kwargs) 81 | self.ds_labeled = data_stream_labeled 82 | self.ds_unlabeled = data_stream_unlabeled 83 | # Rename the sources for clarity 84 | self.ds_labeled.sources = ('features_labeled', 'targets_labeled') 85 | # Rename the source for input pixels and hide its labels! 
86 | self.ds_unlabeled.sources = ('features_unlabeled',) 87 | 88 | @property 89 | def sources(self): 90 | if hasattr(self, '_sources'): 91 | return self._sources 92 | return self.ds_labeled.sources + self.ds_unlabeled.sources 93 | 94 | @sources.setter 95 | def sources(self, value): 96 | self._sources = value 97 | 98 | def close(self): 99 | self.ds_labeled.close() 100 | self.ds_unlabeled.close() 101 | 102 | def reset(self): 103 | self.ds_labeled.reset() 104 | self.ds_unlabeled.reset() 105 | 106 | def next_epoch(self): 107 | self.ds_labeled.next_epoch() 108 | self.ds_unlabeled.next_epoch() 109 | 110 | def get_epoch_iterator(self, **kwargs): 111 | unlabeled = self.ds_unlabeled.get_epoch_iterator(**kwargs) 112 | labeled = self.ds_labeled.get_epoch_iterator(**kwargs) 113 | assert type(labeled) == type(unlabeled) 114 | 115 | return imap(self.mergedicts, cycle(labeled), unlabeled) 116 | 117 | def mergedicts(self, x, y): 118 | return dict(list(x.items()) + list(y.items())) 119 | 120 | 121 | def unify_labels(y): 122 | """ Work-around for Fuel bug where MNIST and Cifar-10 123 | datasets have different dimensionalities for the targets: 124 | e.g. (50000, 1) vs (60000,) """ 125 | yshape = y.shape 126 | y = y.flatten() 127 | assert y.shape[0] == yshape[0] 128 | return y 129 | 130 | 131 | def make_datastream(dataset, indices, batch_size, 132 | n_labeled=None, n_unlabeled=None, 133 | balanced_classes=True, whiten=None, cnorm=None, 134 | scheme=ShuffledScheme): 135 | if n_labeled is None or n_labeled == 0: 136 | n_labeled = len(indices) 137 | if batch_size is None: 138 | batch_size = len(indices) 139 | if n_unlabeled is None: 140 | n_unlabeled = len(indices) 141 | assert n_labeled <= n_unlabeled, 'need less labeled than unlabeled' 142 | 143 | if balanced_classes and n_labeled < n_unlabeled: 144 | # Ensure each label is equally represented 145 | logger.info('Balancing %d labels...' 
% n_labeled) 146 | all_data = dataset.data_sources[dataset.sources.index('targets')] 147 | y = unify_labels(all_data)[indices] 148 | n_classes = y.max() + 1 149 | assert n_labeled % n_classes == 0 150 | n_from_each_class = n_labeled / n_classes 151 | 152 | i_labeled = [] 153 | for c in range(n_classes): 154 | i = (indices[y == c])[:n_from_each_class] 155 | i_labeled += list(i) 156 | else: 157 | i_labeled = indices[:n_labeled] 158 | 159 | # Get unlabeled indices 160 | i_unlabeled = indices[:n_unlabeled] 161 | 162 | ds = SemiDataStream( 163 | data_stream_labeled=Whitening( 164 | DataStream(dataset), 165 | iteration_scheme=scheme(i_labeled, batch_size), 166 | whiten=whiten, cnorm=cnorm), 167 | data_stream_unlabeled=Whitening( 168 | DataStream(dataset), 169 | iteration_scheme=scheme(i_unlabeled, batch_size), 170 | whiten=whiten, cnorm=cnorm) 171 | ) 172 | return ds 173 | 174 | 175 | def setup_model(p): 176 | ladder = LadderAE(p) 177 | # Setup inputs 178 | input_type = TensorType('float32', [False] * (len(p.encoder_layers[0]) + 1)) 179 | x_only = input_type('features_unlabeled') 180 | x = input_type('features_labeled') 181 | y = theano.tensor.lvector('targets_labeled') 182 | ladder.apply(x, y, x_only) 183 | 184 | # Load parameters if requested 185 | if p.get('load_from'): 186 | with open(p.load_from + '/trained_params.npz') as f: 187 | loaded = numpy.load(f) 188 | cg = ComputationGraph([ladder.costs.total]) 189 | current_params = VariableFilter(roles=[PARAMETER])(cg.variables) 190 | logger.info('Loading parameters: %s' % ', '.join(loaded.keys())) 191 | for param in current_params: 192 | assert param.get_value().shape == loaded[param.name].shape 193 | param.set_value(loaded[param.name]) 194 | 195 | return ladder 196 | 197 | 198 | def load_and_log_params(cli_params): 199 | cli_params = AttributeDict(cli_params) 200 | if cli_params.get('load_from'): 201 | p = load_df(cli_params.load_from, 'params').to_dict()[0] 202 | p = AttributeDict(p) 203 | for key in cli_params.iterkeys(): 204 | if key not in p: 205 | p[key] = None 206 | new_params = cli_params 207 | loaded = True 208 | else: 209 | p = cli_params 210 | new_params = {} 211 | loaded = False 212 | 213 | # Make dseed seed unless specified explicitly 214 | if p.get('dseed') is None and p.get('seed') is not None: 215 | p['dseed'] = p['seed'] 216 | 217 | logger.info('== COMMAND LINE ==') 218 | logger.info(' '.join(sys.argv)) 219 | 220 | logger.info('== PARAMETERS ==') 221 | for k, v in p.iteritems(): 222 | if new_params.get(k) is not None: 223 | p[k] = new_params[k] 224 | replace_str = "<- " + str(new_params.get(k)) 225 | else: 226 | replace_str = "" 227 | logger.info(" {:20}: {:<20} {}".format(k, v, replace_str)) 228 | return p, loaded 229 | 230 | 231 | def setup_data(p, test_set=False): 232 | dataset_class, training_set_size = { 233 | 'cifar10': (CIFAR10, 40000), 234 | 'mnist': (MNIST, 50000), 235 | }[p.dataset] 236 | 237 | # Allow overriding the default from command line 238 | if p.get('unlabeled_samples') is not None: 239 | training_set_size = p.unlabeled_samples 240 | 241 | train_set = dataset_class(["train"]) 242 | 243 | # Make sure the MNIST data is in right format 244 | if p.dataset == 'mnist': 245 | d = train_set.data_sources[train_set.sources.index('features')] 246 | assert numpy.all(d <= 1.0) and numpy.all(d >= 0.0), \ 247 | 'Make sure data is in float format and in range 0 to 1' 248 | 249 | # Take all indices and permutate them 250 | all_ind = numpy.arange(train_set.num_examples) 251 | if p.get('dseed'): 252 | rng = 
numpy.random.RandomState(seed=p.dseed) 253 | rng.shuffle(all_ind) 254 | 255 | d = AttributeDict() 256 | 257 | # Choose the training set 258 | d.train = train_set 259 | d.train_ind = all_ind[:training_set_size] 260 | 261 | # Then choose validation set from the remaining indices 262 | d.valid = train_set 263 | d.valid_ind = numpy.setdiff1d(all_ind, d.train_ind)[:p.valid_set_size] 264 | logger.info('Using %d examples for validation' % len(d.valid_ind)) 265 | 266 | # Only touch test data if requested 267 | if test_set: 268 | d.test = dataset_class(["test"]) 269 | d.test_ind = numpy.arange(d.test.num_examples) 270 | 271 | # Setup optional whitening, only used for Cifar-10 272 | in_dim = train_set.data_sources[train_set.sources.index('features')].shape[1:] 273 | if len(in_dim) > 1 and p.whiten_zca > 0: 274 | assert numpy.product(in_dim) == p.whiten_zca, \ 275 | 'Need %d whitening dimensions, not %d' % (numpy.product(in_dim), 276 | p.whiten_zca) 277 | cnorm = ContrastNorm(p.contrast_norm) if p.contrast_norm != 0 else None 278 | 279 | def get_data(d, i): 280 | data = d.get_data(request=list(i))[d.sources.index('features')] 281 | # Fuel provides Cifar in uint8, convert to float32 282 | data = numpy.require(data, dtype=numpy.float32) 283 | return data if cnorm is None else cnorm.apply(data) 284 | 285 | if p.whiten_zca > 0: 286 | logger.info('Whitening using %d ZCA components' % p.whiten_zca) 287 | whiten = ZCA() 288 | whiten.fit(p.whiten_zca, get_data(d.train, d.train_ind)) 289 | else: 290 | whiten = None 291 | 292 | return in_dim, d, whiten, cnorm 293 | 294 | 295 | def get_error(args): 296 | """ Calculate the classification error """ 297 | args['data_type'] = args.get('data_type', 'test') 298 | args['no_load'] = 'g_' 299 | 300 | targets, acts = analyze(args) 301 | guess = numpy.argmax(acts, axis=1) 302 | correct = numpy.sum(numpy.equal(guess, targets.flatten())) 303 | 304 | return (1. - correct / float(len(guess))) * 100. 
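# analyze() below runs a trained model over the split selected by 'data_type' and
# returns (targets, softmax outputs). For the 'valid' and 'test' splits it first
# estimates the batch normalization statistics from the training set (via
# FinalTestMonitoring inside a DummyLoop) and substitutes them into the test graph.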
305 | 306 | 307 | def analyze(cli_params): 308 | p, _ = load_and_log_params(cli_params) 309 | _, data, whiten, cnorm = setup_data(p, test_set=True) 310 | ladder = setup_model(p) 311 | 312 | # Analyze activations 313 | dset, indices, calc_batchnorm = { 314 | 'train': (data.train, data.train_ind, False), 315 | 'valid': (data.valid, data.valid_ind, True), 316 | 'test': (data.test, data.test_ind, True), 317 | }[p.data_type] 318 | 319 | if calc_batchnorm: 320 | logger.info('Calculating batch normalization for clean.labeled path') 321 | main_loop = DummyLoop( 322 | extensions=[ 323 | FinalTestMonitoring( 324 | [ladder.costs.class_clean, ladder.error.clean] 325 | + ladder.costs.denois.values(), 326 | make_datastream(data.train, data.train_ind, 327 | # These need to match with the training 328 | p.batch_size, 329 | n_labeled=p.labeled_samples, 330 | n_unlabeled=len(data.train_ind), 331 | cnorm=cnorm, 332 | whiten=whiten, scheme=ShuffledScheme), 333 | make_datastream(data.valid, data.valid_ind, 334 | p.valid_batch_size, 335 | n_labeled=len(data.valid_ind), 336 | n_unlabeled=len(data.valid_ind), 337 | cnorm=cnorm, 338 | whiten=whiten, scheme=ShuffledScheme), 339 | prefix="valid_final", before_training=True), 340 | ShortPrinting({ 341 | "valid_final": OrderedDict([ 342 | ('VF_C_class', ladder.costs.class_clean), 343 | ('VF_E', ladder.error.clean), 344 | ('VF_C_de', [ladder.costs.denois.get(0), 345 | ladder.costs.denois.get(1), 346 | ladder.costs.denois.get(2), 347 | ladder.costs.denois.get(3)]), 348 | ]), 349 | }, after_training=True, use_log=False), 350 | ]) 351 | main_loop.run() 352 | 353 | # Make a datastream that has all the indices in the labeled pathway 354 | ds = make_datastream(dset, indices, 355 | batch_size=p.get('batch_size'), 356 | n_labeled=len(indices), 357 | n_unlabeled=len(indices), 358 | balanced_classes=False, 359 | whiten=whiten, 360 | cnorm=cnorm, 361 | scheme=SequentialScheme) 362 | 363 | # We want out the values after softmax 364 | outputs = ladder.act.clean.labeled.h[len(ladder.layers) - 1] 365 | 366 | # Replace the batch normalization paramameters with the shared variables 367 | if calc_batchnorm: 368 | outputreplacer = TestMonitoring() 369 | _, _, outputs = outputreplacer._get_bn_params(outputs) 370 | 371 | cg = ComputationGraph(outputs) 372 | f = cg.get_theano_function() 373 | 374 | it = ds.get_epoch_iterator(as_dict=True) 375 | res = [] 376 | inputs = {'features_labeled': [], 377 | 'targets_labeled': [], 378 | 'features_unlabeled': []} 379 | # Loop over one epoch 380 | for d in it: 381 | # Store all inputs 382 | for k, v in d.iteritems(): 383 | inputs[k] += [v] 384 | # Store outputs 385 | res += [f(*[d[str(inp)] for inp in cg.inputs])] 386 | 387 | # Concatenate all minibatches 388 | res = [numpy.vstack(minibatches) for minibatches in zip(*res)] 389 | inputs = {k: numpy.vstack(v) for k, v in inputs.iteritems()} 390 | 391 | return inputs['targets_labeled'], res[0] 392 | 393 | 394 | def train(cli_params): 395 | cli_params['save_dir'] = prepare_dir(cli_params['save_to']) 396 | logfile = os.path.join(cli_params['save_dir'], 'log.txt') 397 | 398 | # Log also DEBUG to a file 399 | fh = logging.FileHandler(filename=logfile) 400 | fh.setLevel(logging.DEBUG) 401 | logger.addHandler(fh) 402 | 403 | logger.info('Logging into %s' % logfile) 404 | 405 | p, loaded = load_and_log_params(cli_params) 406 | in_dim, data, whiten, cnorm = setup_data(p, test_set=False) 407 | if not loaded: 408 | # Set the zero layer to match input dimensions 409 | p.encoder_layers = (in_dim,) + p.encoder_layers 
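        # encoder_layers now starts with the input dimension, so it has one more
        # entry than the --encoder-layers flag; LadderAE asserts that
        # --denoising-cost-x supplies a weight for every one of these layers.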
410 | 411 | ladder = setup_model(p) 412 | 413 | # Training 414 | all_params = ComputationGraph([ladder.costs.total]).parameters 415 | logger.info('Found the following parameters: %s' % str(all_params)) 416 | 417 | # Fetch all batch normalization updates. They are in the clean path. 418 | bn_updates = ComputationGraph([ladder.costs.class_clean]).updates 419 | assert 'counter' in [u.name for u in bn_updates.keys()], \ 420 | 'No batch norm params in graph - the graph has been cut?' 421 | 422 | training_algorithm = GradientDescent( 423 | cost=ladder.costs.total, parameters=all_params, 424 | step_rule=Adam(learning_rate=ladder.lr)) 425 | # In addition to actual training, also do BN variable approximations 426 | training_algorithm.add_updates(bn_updates) 427 | 428 | short_prints = { 429 | "train": { 430 | 'T_C_class': ladder.costs.class_corr, 431 | 'T_C_de': ladder.costs.denois.values(), 432 | }, 433 | "valid_approx": OrderedDict([ 434 | ('V_C_class', ladder.costs.class_clean), 435 | ('V_E', ladder.error.clean), 436 | ('V_C_de', ladder.costs.denois.values()), 437 | ]), 438 | "valid_final": OrderedDict([ 439 | ('VF_C_class', ladder.costs.class_clean), 440 | ('VF_E', ladder.error.clean), 441 | ('VF_C_de', ladder.costs.denois.values()), 442 | ]), 443 | } 444 | 445 | main_loop = MainLoop( 446 | training_algorithm, 447 | # Datastream used for training 448 | make_datastream(data.train, data.train_ind, 449 | p.batch_size, 450 | n_labeled=p.labeled_samples, 451 | n_unlabeled=p.unlabeled_samples, 452 | whiten=whiten, 453 | cnorm=cnorm), 454 | model=Model(ladder.costs.total), 455 | extensions=[ 456 | FinishAfter(after_n_epochs=p.num_epochs), 457 | 458 | # This will estimate the validation error using 459 | # running average estimates of the batch normalization 460 | # parameters, mean and variance 461 | ApproxTestMonitoring( 462 | [ladder.costs.class_clean, ladder.error.clean] 463 | + ladder.costs.denois.values(), 464 | make_datastream(data.valid, data.valid_ind, 465 | p.valid_batch_size, whiten=whiten, cnorm=cnorm, 466 | scheme=ShuffledScheme), 467 | prefix="valid_approx"), 468 | 469 | # This Monitor is slower, but more accurate since it will first 470 | # estimate batch normalization parameters from training data and 471 | # then do another pass to calculate the validation error. 
472 | FinalTestMonitoring( 473 | [ladder.costs.class_clean, ladder.error.clean] 474 | + ladder.costs.denois.values(), 475 | make_datastream(data.train, data.train_ind, 476 | p.batch_size, 477 | n_labeled=p.labeled_samples, 478 | whiten=whiten, cnorm=cnorm, 479 | scheme=ShuffledScheme), 480 | make_datastream(data.valid, data.valid_ind, 481 | p.valid_batch_size, 482 | n_labeled=len(data.valid_ind), 483 | whiten=whiten, cnorm=cnorm, 484 | scheme=ShuffledScheme), 485 | prefix="valid_final", 486 | after_n_epochs=p.num_epochs), 487 | 488 | TrainingDataMonitoring( 489 | [ladder.costs.total, ladder.costs.class_corr, 490 | training_algorithm.total_gradient_norm] 491 | + ladder.costs.denois.values(), 492 | prefix="train", after_epoch=True), 493 | 494 | SaveParams(None, all_params, p.save_dir, after_epoch=True), 495 | SaveExpParams(p, p.save_dir, before_training=True), 496 | SaveLog(p.save_dir, after_training=True), 497 | ShortPrinting(short_prints), 498 | LRDecay(ladder.lr, p.num_epochs * p.lrate_decay, p.num_epochs, 499 | after_epoch=True), 500 | ]) 501 | main_loop.run() 502 | 503 | # Get results 504 | df = DataFrame.from_dict(main_loop.log, orient='index') 505 | col = 'valid_final_error_rate_clean' 506 | logger.info('%s %g' % (col, df[col].iloc[-1])) 507 | 508 | if main_loop.log.status['epoch_interrupt_received']: 509 | return None 510 | return df 511 | 512 | if __name__ == "__main__": 513 | logging.basicConfig(level=logging.INFO) 514 | 515 | rep = lambda s: s.replace('-', ',') 516 | chop = lambda s: s.split(',') 517 | to_int = lambda ss: [int(s) for s in ss if s.isdigit()] 518 | to_float = lambda ss: [float(s) for s in ss] 519 | 520 | def to_bool(s): 521 | if s.lower() in ['true', 't']: 522 | return True 523 | elif s.lower() in ['false', 'f']: 524 | return False 525 | else: 526 | raise Exception("Unknown bool value %s" % s) 527 | 528 | def compose(*funs): 529 | return functools.reduce(lambda f, g: lambda x: f(g(x)), funs) 530 | 531 | # Functional parsing logic to allow flexible function compositions 532 | # as actions for ArgumentParser 533 | def funcs(additional_arg): 534 | class customAction(Action): 535 | def __call__(self, parser, args, values, option_string=None): 536 | 537 | def process(arg, func_list): 538 | if arg is None: 539 | return None 540 | elif type(arg) is list: 541 | return map(compose(*func_list), arg) 542 | else: 543 | return compose(*func_list)(arg) 544 | 545 | setattr(args, self.dest, process(values, additional_arg)) 546 | return customAction 547 | 548 | def add_train_params(parser, use_defaults): 549 | a = parser.add_argument 550 | default = lambda x: x if use_defaults else None 551 | 552 | # General hyper parameters and settings 553 | a("save_to", help="Destination to save the state and results", 554 | default=default("noname"), nargs="?") 555 | a("--num-epochs", help="Number of training epochs", 556 | type=int, default=default(150)) 557 | a("--seed", help="Seed", 558 | type=int, default=default([1]), nargs='+') 559 | a("--dseed", help="Data permutation seed, defaults to 'seed'", 560 | type=int, default=default([None]), nargs='+') 561 | a("--labeled-samples", help="How many supervised samples are used", 562 | type=int, default=default(None), nargs='+') 563 | a("--unlabeled-samples", help="How many unsupervised samples are used", 564 | type=int, default=default(None), nargs='+') 565 | a("--dataset", type=str, default=default(['mnist']), nargs='+', 566 | choices=['mnist', 'cifar10'], help="Which dataset to use") 567 | a("--lr", help="Initial learning rate", 568 | type=float, 
default=default([0.002]), nargs='+') 569 | a("--lrate-decay", help="When to linearly start decaying lrate (0-1)", 570 | type=float, default=default([0.67]), nargs='+') 571 | a("--batch-size", help="Minibatch size", 572 | type=int, default=default([100]), nargs='+') 573 | a("--valid-batch-size", help="Minibatch size for validation data", 574 | type=int, default=default([100]), nargs='+') 575 | a("--valid-set-size", help="Number of examples in validation set", 576 | type=int, default=default([10000]), nargs='+') 577 | 578 | # Hyperparameters controlling supervised path 579 | a("--super-noise-std", help="Noise added to supervised learning path", 580 | type=float, default=default([0.3]), nargs='+') 581 | a("--f-local-noise-std", help="Noise added encoder path", 582 | type=str, default=default([0.3]), nargs='+', 583 | action=funcs([tuple, to_float, chop])) 584 | a("--act", nargs='+', type=str, action=funcs([tuple, chop, rep]), 585 | default=default(["relu"]), help="List of activation functions") 586 | a("--encoder-layers", help="List of layers for f", 587 | type=str, default=default(()), action=funcs([tuple, chop, rep])) 588 | 589 | # Hyperparameters controlling unsupervised training 590 | a("--denoising-cost-x", help="Weight of the denoising cost.", 591 | type=str, default=default([(0.,)]), nargs='+', 592 | action=funcs([tuple, to_float, chop])) 593 | a("--decoder-spec", help="List of decoding function types", nargs='+', 594 | type=str, default=default(['sig']), action=funcs([tuple, chop, rep])) 595 | a("--zestbn", type=str, default=default(['bugfix']), nargs='+', 596 | choices=['bugfix', 'no'], help="How to do zest bn") 597 | 598 | # Hyperparameters used for Cifar training 599 | a("--contrast-norm", help="Scale of contrast normalization (0=off)", 600 | type=int, default=default([0]), nargs='+') 601 | a("--top-c", help="Have c at softmax?", action=funcs([to_bool]), 602 | default=default([True]), nargs='+') 603 | a("--whiten-zca", help="Whether to whiten the data with ZCA", 604 | type=int, default=default([0]), nargs='+') 605 | 606 | ap = ArgumentParser("Semisupervised experiment") 607 | subparsers = ap.add_subparsers(dest='cmd', help='sub-command help') 608 | 609 | # TRAIN 610 | train_cmd = subparsers.add_parser('train', help='Train a new model') 611 | add_train_params(train_cmd, use_defaults=True) 612 | 613 | # EVALUATE 614 | load_cmd = subparsers.add_parser('evaluate', help='Evaluate test error') 615 | load_cmd.add_argument('load_from', type=str, 616 | help="Destination to load the state from") 617 | load_cmd.add_argument('--data-type', type=str, default='test', 618 | help="Data set to evaluate on") 619 | 620 | args = ap.parse_args() 621 | 622 | subp = subprocess.Popen(['git', 'rev-parse', 'HEAD'], 623 | stdin=subprocess.PIPE, stdout=subprocess.PIPE, 624 | stderr=subprocess.PIPE) 625 | out, err = subp.communicate() 626 | args.commit = out.strip() 627 | if err.strip(): 628 | logger.error('Subprocess returned %s' % err.strip()) 629 | 630 | t_start = time.time() 631 | if args.cmd == 'evaluate': 632 | for k, v in vars(args).iteritems(): 633 | if type(v) is list: 634 | assert len(v) == 1, "should not be a list when loading: %s" % k 635 | logger.info("%s" % str(v[0])) 636 | vars(args)[k] = v[0] 637 | 638 | err = get_error(vars(args)) 639 | logger.info('Test error: %f' % err) 640 | 641 | elif args.cmd == "train": 642 | listdicts = {k: v for k, v in vars(args).iteritems() if type(v) is list} 643 | therest = {k: v for k, v in vars(args).iteritems() if type(v) is not list} 644 | 645 | gen1, gen2 = 
tee(product(*listdicts.itervalues())) 646 | 647 | l = len(list(gen1)) 648 | for i, d in enumerate(dict(izip(listdicts, x)) for x in gen2): 649 | if l > 1: 650 | logger.info('Training configuration %d / %d' % (i+1, l)) 651 | d.update(therest) 652 | if train(d) is None: 653 | break 654 | logger.info('Took %.1f minutes' % ((time.time() - t_start) / 60.)) 655 | -------------------------------------------------------------------------------- /ladder.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import numpy as np 4 | from collections import OrderedDict 5 | 6 | import theano 7 | import theano.tensor as T 8 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 9 | from theano.tensor.nnet.conv import conv2d, ConvOp 10 | from theano.sandbox.cuda.blas import GpuCorrMM 11 | from theano.sandbox.cuda.basic_ops import gpu_contiguous 12 | 13 | from blocks.bricks.cost import SquaredError 14 | from blocks.bricks.cost import CategoricalCrossEntropy, MisclassificationRate 15 | from blocks.graph import add_annotation, Annotation 16 | from blocks.roles import add_role, PARAMETER, WEIGHT, BIAS 17 | 18 | from utils import shared_param, AttributeDict 19 | from nn import maxpool_2d, global_meanpool_2d, BNPARAM, softmax_n 20 | 21 | logger = logging.getLogger('main.model') 22 | floatX = theano.config.floatX 23 | 24 | 25 | class LadderAE(): 26 | def __init__(self, p): 27 | self.p = p 28 | self.init_weights_transpose = False 29 | self.default_lr = p.lr 30 | self.shareds = OrderedDict() 31 | self.rstream = RandomStreams(seed=p.seed) 32 | self.rng = np.random.RandomState(seed=p.seed) 33 | 34 | n_layers = len(p.encoder_layers) 35 | assert n_layers > 1, "Need to define encoder layers" 36 | assert n_layers == len(p.denoising_cost_x), ( 37 | "Number of denoising costs does not match with %d layers: %s" % 38 | (n_layers, str(p.denoising_cost_x))) 39 | 40 | def one_to_all(x): 41 | """ (5.,) -> 5 -> (5., 5., 5.) 42 | ('relu',) -> 'relu' -> ('relu', 'relu', 'relu') 43 | """ 44 | if type(x) is tuple and len(x) == 1: 45 | x = x[0] 46 | 47 | if type(x) is float: 48 | x = (np.float32(x),) * n_layers 49 | 50 | if type(x) is str: 51 | x = (x,) * n_layers 52 | return x 53 | 54 | p.decoder_spec = one_to_all(p.decoder_spec) 55 | p.f_local_noise_std = one_to_all(p.f_local_noise_std) 56 | acts = one_to_all(p.get('act', 'relu')) 57 | 58 | assert n_layers == len(p.decoder_spec), "f and g need to match" 59 | assert (n_layers == len(acts)), ( 60 | "Not enough activations given. Requires %d. 
Got: %s" % 61 | (n_layers, str(acts))) 62 | acts = acts[:-1] + ('softmax',) 63 | 64 | def parse_layer(spec): 65 | """ 'fc:5' -> ('fc', 5) 66 | '5' -> ('fc', 5) 67 | 5 -> ('fc', 5) 68 | 'convv:3:2:2' -> ('convv', [3,2,2]) 69 | """ 70 | if type(spec) is not str: 71 | return "fc", spec 72 | spec = spec.split(':') 73 | l_type = spec.pop(0) if len(spec) >= 2 else "fc" 74 | spec = map(int, spec) 75 | spec = spec[0] if len(spec) == 1 else spec 76 | return l_type, spec 77 | 78 | enc = map(parse_layer, p.encoder_layers) 79 | self.layers = list(enumerate(zip(enc, p.decoder_spec, acts))) 80 | 81 | def weight(self, init, name, cast_float32=True, for_conv=False): 82 | weight = self.shared(init, name, cast_float32, role=WEIGHT) 83 | if for_conv: 84 | return weight.dimshuffle('x', 0, 'x', 'x') 85 | return weight 86 | 87 | def bias(self, init, name, cast_float32=True, for_conv=False): 88 | b = self.shared(init, name, cast_float32, role=BIAS) 89 | if for_conv: 90 | return b.dimshuffle('x', 0, 'x', 'x') 91 | return b 92 | 93 | def shared(self, init, name, cast_float32=True, role=PARAMETER, **kwargs): 94 | p = self.shareds.get(name) 95 | if p is None: 96 | p = shared_param(init, name, cast_float32, role, **kwargs) 97 | self.shareds[name] = p 98 | return p 99 | 100 | def counter(self): 101 | name = 'counter' 102 | p = self.shareds.get(name) 103 | update = [] 104 | if p is None: 105 | p_max_val = np.float32(10) 106 | p = self.shared(np.float32(1), name, role=BNPARAM) 107 | p_max = self.shared(p_max_val, name + '_max', role=BNPARAM) 108 | update = [(p, T.clip(p + np.float32(1), np.float32(0), p_max)), 109 | (p_max, p_max_val)] 110 | return (p, update) 111 | 112 | def noise_like(self, x): 113 | noise = self.rstream.normal(size=x.shape, avg=0.0, std=1.0) 114 | return T.cast(noise, dtype=floatX) 115 | 116 | def rand_init(self, in_dim, out_dim): 117 | """ Random initialization for fully connected layers """ 118 | W = self.rng.randn(in_dim, out_dim) / np.sqrt(in_dim) 119 | return W 120 | 121 | def rand_init_conv(self, dim): 122 | """ Random initialization for convolution filters """ 123 | fan_in = np.prod(dtype=floatX, a=dim[1:]) 124 | bound = np.sqrt(3. / max(1.0, (fan_in))) 125 | W = np.asarray( 126 | self.rng.uniform(low=-bound, high=bound, size=dim), dtype=floatX) 127 | return W 128 | 129 | def new_activation_dict(self): 130 | return AttributeDict({'z': {}, 'h': {}, 's': {}, 'm': {}}) 131 | 132 | def annotate_update(self, update, tag_to): 133 | a = Annotation() 134 | for (var, up) in update: 135 | a.updates[var] = up 136 | add_annotation(tag_to, a) 137 | 138 | def apply(self, input_labeled, target_labeled, input_unlabeled): 139 | self.layer_counter = 0 140 | input_dim = self.p.encoder_layers[0] 141 | 142 | # Store the dimension tuples in the same order as layers. 
143 | layers = self.layers 144 | self.layer_dims = {0: input_dim} 145 | 146 | self.lr = self.default_lr 147 | 148 | self.costs = costs = AttributeDict() 149 | self.costs.denois = AttributeDict() 150 | 151 | self.act = AttributeDict() 152 | self.error = AttributeDict() 153 | 154 | top = len(layers) - 1 155 | 156 | if input_labeled is None: 157 | N = 0 158 | else: 159 | N = input_labeled.shape[0] 160 | self.join = lambda l, u: T.concatenate([l, u], axis=0) if l else u 161 | self.labeled = lambda x: x[:N] if x is not None else x 162 | self.unlabeled = lambda x: x[N:] if x is not None else x 163 | self.split_lu = lambda x: (self.labeled(x), self.unlabeled(x)) 164 | 165 | input_concat = self.join(input_labeled, input_unlabeled) 166 | 167 | def encoder(input_, path_name, input_noise_std=0, noise_std=[]): 168 | h = input_ 169 | 170 | logger.info(' 0: noise %g' % input_noise_std) 171 | if input_noise_std > 0.: 172 | h = h + self.noise_like(h) * input_noise_std 173 | 174 | d = AttributeDict() 175 | d.unlabeled = self.new_activation_dict() 176 | d.labeled = self.new_activation_dict() 177 | d.labeled.z[0] = self.labeled(h) 178 | d.unlabeled.z[0] = self.unlabeled(h) 179 | prev_dim = input_dim 180 | for i, (spec, _, act_f) in layers[1:]: 181 | d.labeled.h[i - 1], d.unlabeled.h[i - 1] = self.split_lu(h) 182 | noise = noise_std[i] if i < len(noise_std) else 0. 183 | curr_dim, z, m, s, h = self.f(h, prev_dim, spec, i, act_f, 184 | path_name=path_name, 185 | noise_std=noise) 186 | assert self.layer_dims.get(i) in (None, curr_dim) 187 | self.layer_dims[i] = curr_dim 188 | d.labeled.z[i], d.unlabeled.z[i] = self.split_lu(z) 189 | d.unlabeled.s[i] = s 190 | d.unlabeled.m[i] = m 191 | prev_dim = curr_dim 192 | d.labeled.h[i], d.unlabeled.h[i] = self.split_lu(h) 193 | return d 194 | 195 | # Clean, supervised 196 | logger.info('Encoder: clean, labeled') 197 | clean = self.act.clean = encoder(input_concat, 'clean') 198 | 199 | # Corrupted, supervised 200 | logger.info('Encoder: corr, labeled') 201 | corr = self.act.corr = encoder(input_concat, 'corr', 202 | input_noise_std=self.p.super_noise_std, 203 | noise_std=self.p.f_local_noise_std) 204 | est = self.act.est = self.new_activation_dict() 205 | 206 | # Decoder path in opposite order 207 | logger.info('Decoder: z_corr -> z_est') 208 | for i, ((_, spec), l_type, act_f) in layers[::-1]: 209 | z_corr = corr.unlabeled.z[i] 210 | z_clean = clean.unlabeled.z[i] 211 | z_clean_s = clean.unlabeled.s.get(i) 212 | z_clean_m = clean.unlabeled.m.get(i) 213 | fspec = layers[i+1][1][0] if len(layers) > i+1 else (None, None) 214 | 215 | if i == top: 216 | ver = corr.unlabeled.h[i] 217 | ver_dim = self.layer_dims[i] 218 | top_g = True 219 | else: 220 | ver = est.z.get(i + 1) 221 | ver_dim = self.layer_dims.get(i + 1) 222 | top_g = False 223 | 224 | z_est = self.g(z_lat=z_corr, 225 | z_ver=ver, 226 | in_dims=ver_dim, 227 | out_dims=self.layer_dims[i], 228 | l_type=l_type, 229 | num=i, 230 | fspec=fspec, 231 | top_g=top_g) 232 | 233 | if z_est is not None: 234 | # Denoising cost 235 | 236 | if z_clean_s and self.p.zestbn == 'bugfix': 237 | z_est_norm = (z_est - z_clean_m) / T.sqrt(z_clean_s + np.float32(1e-10)) 238 | elif z_clean_s is None or self.p.zestbn == 'no': 239 | z_est_norm = z_est 240 | else: 241 | assert False, 'Not supported path' 242 | 243 | se = SquaredError('denois' + str(i)) 244 | costs.denois[i] = se.apply(z_est_norm.flatten(2), 245 | z_clean.flatten(2)) \ 246 | / np.prod(self.layer_dims[i], dtype=floatX) 247 | costs.denois[i].name = 'denois' + str(i) 248 | 
denois_print = 'denois %.2f' % self.p.denoising_cost_x[i] 249 | else: 250 | denois_print = '' 251 | 252 | # Store references for later use 253 | est.h[i] = self.apply_act(z_est, act_f) 254 | est.z[i] = z_est 255 | est.s[i] = None 256 | est.m[i] = None 257 | logger.info(' g%d: %10s, %s, dim %s -> %s' % ( 258 | i, l_type, 259 | denois_print, 260 | self.layer_dims.get(i+1), 261 | self.layer_dims.get(i) 262 | )) 263 | 264 | # Costs 265 | y = target_labeled.flatten() 266 | 267 | costs.class_clean = CategoricalCrossEntropy().apply(y, clean.labeled.h[top]) 268 | costs.class_clean.name = 'cost_class_clean' 269 | 270 | costs.class_corr = CategoricalCrossEntropy().apply(y, corr.labeled.h[top]) 271 | costs.class_corr.name = 'cost_class_corr' 272 | 273 | # This will be used for training 274 | costs.total = costs.class_corr * 1.0 275 | for i in range(top + 1): 276 | if costs.denois.get(i) and self.p.denoising_cost_x[i] > 0: 277 | costs.total += costs.denois[i] * self.p.denoising_cost_x[i] 278 | costs.total.name = 'cost_total' 279 | 280 | # Classification error 281 | mr = MisclassificationRate() 282 | self.error.clean = mr.apply(y, clean.labeled.h[top]) * np.float32(100.) 283 | self.error.clean.name = 'error_rate_clean' 284 | 285 | def apply_act(self, input, act_name): 286 | if input is None: 287 | return input 288 | act = { 289 | 'relu': lambda x: T.maximum(0, x), 290 | 'leakyrelu': lambda x: T.switch(x > 0., x, 0.1 * x), 291 | 'linear': lambda x: x, 292 | 'softplus': lambda x: T.log(1. + T.exp(x)), 293 | 'sigmoid': lambda x: T.nnet.sigmoid(x), 294 | 'softmax': lambda x: softmax_n(x), 295 | }.get(act_name) 296 | assert act, 'unknown act %s' % act_name 297 | if act_name == 'softmax': 298 | input = input.flatten(2) 299 | return act(input) 300 | 301 | def annotate_bn(self, var, id, var_type, mb_size, size, norm_ax): 302 | var_shape = np.array((1,) + size) 303 | out_dim = np.prod(var_shape) / np.prod(var_shape[list(norm_ax)]) 304 | # Flatten the var - shared variable updating is not trivial otherwise, 305 | # as theano seems to believe a row vector is a matrix and will complain 306 | # about the updates 307 | orig_shape = var.shape 308 | var = var.flatten() 309 | # Here we add the name and role, the variables will later be identified 310 | # by these values 311 | var.name = id + '_%s_clean' % var_type 312 | add_role(var, BNPARAM) 313 | shared_var = self.shared(np.zeros(out_dim), 314 | name='shared_%s' % var.name, role=None) 315 | 316 | # Update running average estimates. When the counter is reset to 1, it 317 | # will clear its memory 318 | cntr, c_up = self.counter() 319 | one = np.float32(1) 320 | run_avg = lambda new, old: one / cntr * new + (one - one / cntr) * old 321 | if var_type == 'mean': 322 | new_value = run_avg(var, shared_var) 323 | elif var_type == 'var': 324 | mb_size = T.cast(mb_size, 'float32') 325 | new_value = run_avg(mb_size / (mb_size - one) * var, shared_var) 326 | else: 327 | raise NotImplementedError('Unknown batch norm var %s' % var_type) 328 | # Add the counter update to the annotated update if it is the first 329 | # instance of a counter 330 | self.annotate_update([(shared_var, new_value)] + c_up, var) 331 | 332 | return var.reshape(orig_shape) 333 | 334 | def f(self, h, in_dim, spec, num, act_f, path_name, noise_std=0): 335 | # Generates identifiers used for referencing shared variables. 336 | # E.g.
clean and corrupted encoders will end up using the same 337 | # variable name and hence sharing parameters 338 | gen_id = lambda s: '_'.join(['f', str(num), s]) 339 | layer_type, _ = spec 340 | 341 | # Pooling 342 | if layer_type in ['maxpool', 'globalmeanpool']: 343 | z, output_size = self.f_pool(h, spec, in_dim) 344 | norm_ax = (0, -2, -1) 345 | # after pooling, no activation func for now unless it's softmax 346 | act_f = "linear" if act_f != "softmax" else act_f 347 | 348 | # Convolution 349 | elif layer_type in ['convv', 'convf']: 350 | z, output_size = self.f_conv(h, spec, in_dim, gen_id('W')) 351 | norm_ax = (0, -2, -1) 352 | 353 | # Fully connected 354 | elif layer_type == "fc": 355 | h = h.flatten(2) if h.ndim > 2 else h 356 | _, dim = spec 357 | W = self.weight(self.rand_init(np.prod(in_dim), dim), gen_id('W')) 358 | z, output_size = T.dot(h, W), (dim,) 359 | norm_ax = (0,) 360 | else: 361 | raise ValueError("Unknown layer spec: %s" % layer_type) 362 | 363 | m = s = None 364 | is_normalizing = True 365 | if is_normalizing: 366 | keep_dims = True 367 | z_l = self.labeled(z) 368 | z_u = self.unlabeled(z) 369 | m = z_u.mean(norm_ax, keepdims=keep_dims) 370 | s = z_u.var(norm_ax, keepdims=keep_dims) 371 | 372 | m_l = z_l.mean(norm_ax, keepdims=keep_dims) 373 | s_l = z_l.var(norm_ax, keepdims=keep_dims) 374 | if path_name == 'clean': 375 | # Batch normalization estimates the mean and variance of 376 | # validation and test sets based on the training set 377 | # statistics. The following annotates the computation of 378 | # the running average into the graph. 379 | m_l = self.annotate_bn(m_l, gen_id('bn'), 'mean', z_l.shape[0], 380 | output_size, norm_ax) 381 | s_l = self.annotate_bn(s_l, gen_id('bn'), 'var', z_l.shape[0], 382 | output_size, norm_ax) 383 | z = self.join( 384 | (z_l - m_l) / T.sqrt(s_l + np.float32(1e-10)), 385 | (z_u - m) / T.sqrt(s + np.float32(1e-10))) 386 | 387 | if noise_std > 0: 388 | z += self.noise_like(z) * noise_std 389 | 390 | # z for lateral connection 391 | z_lat = z 392 | b_init, c_init = 0.0, 1.0 393 | b_c_size = output_size[0] 394 | 395 | # Add bias 396 | if act_f != 'linear': 397 | z += self.bias(b_init * np.ones(b_c_size), gen_id('b'), 398 | for_conv=len(output_size) > 1) 399 | 400 | if is_normalizing: 401 | # Add free parameter (gamma in the original Batch Normalization paper) 402 | # if needed by the activation. For instance ReLU doesn't need one 403 | # and we only add it to softmax if hyperparameter top_c is set.
404 | if (act_f not in ['relu', 'leakyrelu', 'linear', 'softmax'] or 405 | (act_f == 'softmax' and self.p.top_c is True)): 406 | c = self.weight(c_init * np.ones(b_c_size), gen_id('c'), 407 | for_conv=len(output_size) > 1) 408 | z *= c 409 | 410 | h = self.apply_act(z, act_f) 411 | 412 | logger.info(' f%d: %s, %s,%s noise %.2f, params %s, dim %s -> %s' % ( 413 | num, layer_type, act_f, ' BN,' if is_normalizing else '', 414 | noise_std, spec[1], in_dim, output_size)) 415 | return output_size, z_lat, m, s, h 416 | 417 | def f_pool(self, x, spec, in_dim): 418 | layer_type, dims = spec 419 | num_filters = in_dim[0] 420 | if "globalmeanpool" == layer_type: 421 | y, output_size = global_meanpool_2d(x, num_filters) 422 | # scale the variance to match normal conv layers with xavier init 423 | y = y * np.float32(in_dim[-1]) * np.float32(np.sqrt(3)) 424 | else: 425 | assert dims[0] != 1 or dims[1] != 1 426 | y, output_size = maxpool_2d(x, in_dim, 427 | poolsize=(dims[1], dims[1]), 428 | poolstride=(dims[0], dims[0])) 429 | return y, output_size 430 | 431 | def f_conv(self, x, spec, in_dim, weight_name): 432 | layer_type, dims = spec 433 | num_filters = dims[0] 434 | filter_size = (dims[1], dims[1]) 435 | stride = (dims[2], dims[2]) 436 | 437 | bm = 'full' if 'convf' in layer_type else 'valid' 438 | 439 | num_channels = in_dim[0] 440 | 441 | W = self.weight(self.rand_init_conv( 442 | (num_filters, num_channels) + filter_size), weight_name) 443 | 444 | if stride != (1, 1): 445 | f = GpuCorrMM(subsample=stride, border_mode=bm, pad=(0, 0)) 446 | y = f(gpu_contiguous(x), gpu_contiguous(W)) 447 | else: 448 | assert self.p.batch_size == self.p.valid_batch_size 449 | y = conv2d(x, W, image_shape=(2*self.p.batch_size, ) + in_dim, 450 | filter_shape=((num_filters, num_channels) + 451 | filter_size), border_mode=bm) 452 | output_size = ((num_filters,) + 453 | ConvOp.getOutputShape(in_dim[1:], filter_size, 454 | stride, bm)) 455 | 456 | return y, output_size 457 | 458 | def g(self, z_lat, z_ver, in_dims, out_dims, l_type, num, fspec, top_g): 459 | f_layer_type, dims = fspec 460 | is_conv = f_layer_type is not None and ('conv' in f_layer_type or 461 | 'pool' in f_layer_type) 462 | gen_id = lambda s: '_'.join(['g', str(num), s]) 463 | 464 | in_dim = np.prod(dtype=floatX, a=in_dims) 465 | out_dim = np.prod(dtype=floatX, a=out_dims) 466 | num_filters = out_dims[0] if is_conv else out_dim 467 | 468 | if l_type[-1] in ['0']: 469 | g_type, u_type = l_type[:-1], l_type[-1] 470 | else: 471 | g_type, u_type = l_type, None 472 | 473 | # Mapping from layer above: u 474 | if u_type in ['0'] or z_ver is None: 475 | if z_ver is None and u_type not in ['0']: 476 | logger.warn('Decoder %d:%s without vertical input' % 477 | (num, g_type)) 478 | u = None 479 | else: 480 | if top_g: 481 | u = z_ver 482 | elif is_conv: 483 | u = self.g_deconv(z_ver, in_dims, out_dims, gen_id('W'), fspec) 484 | else: 485 | W = self.weight(self.rand_init(in_dim, out_dim), gen_id('W')) 486 | u = T.dot(z_ver, W) 487 | 488 | # Batch-normalize u 489 | if u is not None: 490 | norm_ax = (0,) if u.ndim <= 2 else (0, -2, -1) 491 | keep_dims = True 492 | u -= u.mean(norm_ax, keepdims=keep_dims) 493 | u /= T.sqrt(u.var(norm_ax, keepdims=keep_dims) + 494 | np.float32(1e-10)) 495 | 496 | # Define the g function 497 | if not is_conv: 498 | z_lat = z_lat.flatten(2) 499 | bi = lambda inits, name: self.bias(inits * np.ones(num_filters), 500 | gen_id(name), for_conv=is_conv) 501 | wi = lambda inits, name: self.weight(inits * np.ones(num_filters), 502 | gen_id(name), 
for_conv=is_conv) 503 | 504 | if g_type == '': 505 | z_est = None 506 | 507 | elif g_type == 'i': 508 | z_est = z_lat 509 | 510 | elif g_type in ['sig']: 511 | sigval = bi(0., 'c1') + wi(1., 'c2') * z_lat 512 | if u is not None: 513 | sigval += wi(0., 'c3') * u + wi(0., 'c4') * z_lat * u 514 | sigval = T.nnet.sigmoid(sigval) 515 | 516 | z_est = bi(0., 'a1') + wi(1., 'a2') * z_lat + wi(1., 'b1') * sigval 517 | if u is not None: 518 | z_est += wi(0., 'a3') * u + wi(0., 'a4') * z_lat * u 519 | 520 | elif g_type in ['lin']: 521 | a1 = wi(1.0, 'a1') 522 | b = bi(0.0, 'b') 523 | 524 | z_est = a1 * z_lat + b 525 | 526 | elif g_type in ['relu']: 527 | assert u is not None 528 | b = bi(0., 'b') 529 | x = u + b 530 | z_est = self.apply_act(x, 'relu') 531 | 532 | elif g_type in ['sigmoid']: 533 | assert u is not None 534 | b = bi(0., 'b') 535 | c = wi(1., 'c') 536 | z_est = self.apply_act((u + b) * c, 'sigmoid') 537 | 538 | elif g_type in ['comparison_g2']: 539 | # sig without the uz cross term 540 | sigval = bi(0., 'c1') + wi(1., 'c2') * z_lat 541 | if u is not None: 542 | sigval += wi(0., 'c3') * u 543 | sigval = T.nnet.sigmoid(sigval) 544 | 545 | z_est = bi(0., 'a1') + wi(1., 'a2') * z_lat + wi(1., 'b1') * sigval 546 | if u is not None: 547 | z_est += wi(0., 'a3') * u 548 | 549 | elif g_type in ['comparison_g3']: 550 | # sig without the sigmoid nonlinearity 551 | z_est = bi(0., 'a1') + wi(1., 'a2') * z_lat 552 | if u is not None: 553 | z_est += wi(0., 'a3') * u + wi(0., 'a4') * z_lat * u 554 | 555 | elif g_type in ['comparison_g4']: 556 | # No mixing between z_lat and u before final sum, otherwise similar 557 | # to sig 558 | def nonlin(inp, in_name='input', add_bias=True): 559 | w1 = wi(1., 'w1_%s' % in_name) 560 | b1 = bi(0., 'b1') 561 | w2 = wi(1., 'w2_%s' % in_name) 562 | b2 = bi(0., 'b2') if add_bias else 0 563 | w3 = wi(0., 'w3_%s' % in_name) 564 | return w2 * T.nnet.sigmoid(b1 + w1 * inp) + w3 * inp + b2 565 | 566 | z_est = nonlin(z_lat, 'lat') if u is None else \ 567 | nonlin(z_lat, 'lat') + nonlin(u, 'ver', False) 568 | 569 | elif g_type in ['comparison_g5', 'gauss']: 570 | # Gaussian assumption on z: (z - mu) * v + mu 571 | if u is None: 572 | b1 = bi(0., 'b1') 573 | w1 = wi(1., 'w1') 574 | z_est = w1 * z_lat + b1 575 | else: 576 | a1 = bi(0., 'a1') 577 | a2 = wi(1., 'a2') 578 | a3 = bi(0., 'a3') 579 | a4 = bi(0., 'a4') 580 | a5 = bi(0., 'a5') 581 | 582 | a6 = bi(0., 'a6') 583 | a7 = wi(1., 'a7') 584 | a8 = bi(0., 'a8') 585 | a9 = bi(0., 'a9') 586 | a10 = bi(0., 'a10') 587 | 588 | mu = a1 * T.nnet.sigmoid(a2 * u + a3) + a4 * u + a5 589 | v = a6 * T.nnet.sigmoid(a7 * u + a8) + a9 * u + a10 590 | 591 | z_est = (z_lat - mu) * v + mu 592 | elif 'gauss_stable_v' in g_type: 593 | # Gaussian assumption on z: (z - mu) * v + mu 594 | if u is None: 595 | b1 = bi(0., 'b1') 596 | w1 = wi(1., 'w1') 597 | z_est = w1 * z_lat + b1 598 | elif z_lat is None: 599 | b1 = bi(0., 'b1') 600 | w1 = wi(1., 'w1') 601 | z_est = w1 * u + b1 602 | else: 603 | a1 = bi(0., 'a1') 604 | a2 = wi(1., 'a2') 605 | a3 = bi(0., 'a3') 606 | a4 = bi(0., 'a4') 607 | a5 = bi(0., 'a5') 608 | a6 = bi(0., 'a6') 609 | a7 = wi(1., 'a7') 610 | a8 = bi(0., 'a8') 611 | a9 = bi(0., 'a9') 612 | a10 = bi(0., 'a10') 613 | 614 | mu = a1 * T.nnet.sigmoid(a2 * u + a3) + a4 * u + a5 615 | v = a6 * T.nnet.sigmoid(a7 * u + a8) + a9 * u + a10 616 | v = T.nnet.sigmoid(v) 617 | 618 | z_est = (z_lat - mu) * v + mu 619 | else: 620 | raise NotImplementedError("unknown g type: %s" % str(g_type)) 621 | 622 | # Reshape the output if z is for conv but u 
from fc layer 623 | if (z_est is not None and type(out_dims) == tuple and 624 | len(out_dims) > 1.0 and z_est.ndim < 4): 625 | z_est = z_est.reshape((z_est.shape[0],) + out_dims) 626 | 627 | return z_est 628 | 629 | def g_deconv(self, z_ver, in_dims, out_dims, weight_name, fspec): 630 | """ Inverse operation for each type of f used in convnets """ 631 | f_type, f_dims = fspec 632 | assert z_ver is not None 633 | num_channels = in_dims[0] if in_dims is not None else None 634 | num_filters, width, height = out_dims[:3] 635 | 636 | if f_type in ['globalmeanpool']: 637 | u = T.addbroadcast(z_ver, 2, 3) 638 | assert in_dims[1] == 1 and in_dims[2] == 1, \ 639 | "global pooling needs in_dims (1,1): %s" % str(in_dims) 640 | 641 | elif f_type in ['maxpool']: 642 | sh, str, size = z_ver.shape, f_dims[0], f_dims[1] 643 | assert str == size, "depooling requires stride == size" 644 | u = T.zeros((sh[0], sh[1], sh[2] * str, sh[3] * str), 645 | dtype=z_ver.dtype) 646 | for x in xrange(str): 647 | for y in xrange(str): 648 | u = T.set_subtensor(u[:, :, x::str, y::str], z_ver) 649 | u = u[:, :, :width, :height] 650 | 651 | elif f_type in ['convv', 'convf']: 652 | filter_size, str = (f_dims[1], f_dims[1]), f_dims[2] 653 | W_shape = (num_filters, num_channels) + filter_size 654 | W = self.weight(self.rand_init_conv(W_shape), weight_name) 655 | if str > 1: 656 | # upsample if strided version 657 | sh = z_ver.shape 658 | u = T.zeros((sh[0], sh[1], sh[2] * str, sh[3] * str), 659 | dtype=z_ver.dtype) 660 | u = T.set_subtensor(u[:, :, ::str, ::str], z_ver) 661 | else: 662 | u = z_ver # no strides, only deconv 663 | u = conv2d(u, W, filter_shape=W_shape, 664 | border_mode='valid' if 'convf' in f_type else 'full') 665 | u = u[:, :, :width, :height] 666 | else: 667 | raise NotImplementedError('Layer %s has no convolutional decoder' 668 | % f_type) 669 | 670 | return u 671 | --------------------------------------------------------------------------------
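Editor's note: for reference, below is a minimal NumPy sketch (not part of the repository) of the 'gauss' combinator computed by g() in ladder.py. In the real code a1..a10 are trainable per-unit shared variables created with bi/wi; here they are collapsed to plain floats with the same initial values (biases 0, the multiplicative weights a2 and a7 set to 1), so the arithmetic of z_est = (z_lat - mu) * v + mu can be checked in isolation. The function name gauss_combinator is illustrative only.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def gauss_combinator(z_lat, u,
                     a=(0., 1., 0., 0., 0., 0., 1., 0., 0., 0.)):
    # a holds (a1..a10); defaults mirror the bi(0.)/wi(1.) initializations above.
    a1, a2, a3, a4, a5, a6, a7, a8, a9, a10 = a
    mu = a1 * sigmoid(a2 * u + a3) + a4 * u + a5   # mean estimated from the vertical signal u
    v = a6 * sigmoid(a7 * u + a8) + a9 * u + a10   # gating term applied to the lateral z_lat
    return (z_lat - mu) * v + mu                   # shrink z_lat towards mu

# Example usage: with the default initialization both mu and v are zero, so
# z_est starts at zero; training adjusts a1..a10 per unit.
z_lat = np.random.randn(4, 10).astype('float32')
u = np.random.randn(4, 10).astype('float32')
print(gauss_combinator(z_lat, u).shape)   # (4, 10)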