├── ml ├── __init__.py ├── loss_functions.py ├── data_streams.py ├── step_strategies.py ├── ensembles.py ├── neural_nets.py └── trainers.py ├── util ├── __init__.py ├── misc.py ├── io.py ├── ml.py ├── math.py └── plot.py ├── .gitignore ├── README.md ├── run.py ├── LICENCE.txt └── bnn_demo.py /ml/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # specific files and foders 2 | .idea/ 3 | 4 | # python bytecode files 5 | *.pyc 6 | *.pyo 7 | 8 | # text editor's temporary files 9 | *~ 10 | 11 | -------------------------------------------------------------------------------- /util/misc.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | 4 | 5 | def remove_whitespace(str): 6 | """ 7 | Returns the string str with all whitespace removed. 8 | """ 9 | 10 | p = re.compile(r'\s+') 11 | return p.sub('', str) 12 | 13 | 14 | def get_environment(): 15 | """ 16 | Returns a string identifying the current environment. 17 | """ 18 | 19 | try: 20 | hostname = os.environ['SHORT_HOSTNAME'] 21 | 22 | except KeyError: 23 | 24 | try: 25 | hostname = os.environ['HOSTNAME'] 26 | 27 | except KeyError: 28 | return 'scratch' 29 | 30 | if re.match('charles[0-9]{2,}|renown|anne', hostname): 31 | return 'cluster' 32 | 33 | else: 34 | return 'scratch' 35 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bayesian neural networks demo 2 | 3 | A demo of Bayesian neural networks on a toy binary classification problem. 4 | 5 | Two ways of implementing Bayesian neural networks are demonstrated: 6 | - Stochastic Variational Inference using local reparameterization [1] 7 | - Hamiltonian Monte Carlo [2] 8 | 9 | [1] Kingma et al., _Variational Dropout and the Local Reparameterization Trick_, NeurIPS 2015. [[arXiv]](https://arxiv.org/abs/1506.02557) 10 | 11 | [2] Neal, _MCMC using Hamiltonian dynamics_, Handbook of Markov Chain Monte Carlo, 2011. [[arXiv]](https://arxiv.org/abs/1206.1901) 12 | 13 | ## How to run the code 14 | 15 | Display the training data: 16 | ``` 17 | python run.py --show 18 | ``` 19 | 20 | Train a non-Bayesian neural network by minimizing cross-entropy: 21 | ``` 22 | python run.py --mle 23 | ``` 24 | 25 | Train a Bayesian neural network using Stochastic Variarional Inference: 26 | ``` 27 | python run.py --svi 28 | ``` 29 | 30 | Train a Bayesian neural network using Hamiltonian Monte Carlo: 31 | ``` 32 | python run.py --hmc 33 | ``` 34 | 35 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import bnn_demo 3 | 4 | 5 | def parse_args(): 6 | """ 7 | Returns an object describing the command line. 
8 | """ 9 | 10 | parser = argparse.ArgumentParser(description='Bayesian neural networks demo.') 11 | group = parser.add_mutually_exclusive_group() 12 | 13 | group.add_argument('--show', action='store_true', help='show the dataset') 14 | group.add_argument('--mle', action='store_true', help='train a non-bayesian net using maximum likelihood') 15 | group.add_argument('--svi', action='store_true', help='train a bayesian net using stochastic variational inference') 16 | group.add_argument('--hmc', action='store_true', help='train a bayesian net using hamiltonian monte carlo') 17 | 18 | return parser.parse_args() 19 | 20 | 21 | def main(): 22 | 23 | args = parse_args() 24 | 25 | if args.show: 26 | bnn_demo.show_train_data() 27 | 28 | elif args.mle: 29 | bnn_demo.fit_neural_net_demo() 30 | 31 | elif args.svi: 32 | bnn_demo.bayesian_neural_net_svi_demo() 33 | 34 | elif args.hmc: 35 | bnn_demo.bayesian_neural_net_hmc_demo() 36 | 37 | else: 38 | print('No action specified.') 39 | 40 | 41 | if __name__ == '__main__': 42 | main() 43 | -------------------------------------------------------------------------------- /LICENCE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018, George Papamakarios 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | 24 | The views and conclusions contained in the software and documentation are those 25 | of the authors and should not be interpreted as representing official policies, 26 | either expressed or implied, of anybody else. 27 | 28 | -------------------------------------------------------------------------------- /util/io.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import os 3 | import sys 4 | 5 | 6 | def save(data, file): 7 | """ 8 | Saves data to a file. 9 | """ 10 | 11 | dir = os.path.dirname(file) 12 | if dir: 13 | make_folder(dir) 14 | 15 | with open(file + '.pkl', 'w') as f: 16 | pickle.dump(data, f) 17 | 18 | 19 | def load(file): 20 | """ 21 | Loads data from file. 22 | """ 23 | 24 | with open(file + '.pkl', 'r') as f: 25 | data = pickle.load(f) 26 | 27 | return data 28 | 29 | 30 | def save_txt(str, file): 31 | """ 32 | Saves string to a text file. 
33 | """ 34 | 35 | dir = os.path.dirname(file) 36 | if dir: 37 | make_folder(dir) 38 | 39 | with open(file, 'w') as f: 40 | f.write(str) 41 | 42 | 43 | def load_txt(file): 44 | """ 45 | Loads string from text file. 46 | """ 47 | 48 | with open(file, 'r') as f: 49 | str = f.read() 50 | 51 | return str 52 | 53 | 54 | def make_folder(folder): 55 | """ 56 | Creates given folder (or path) if it doesn't exist. 57 | """ 58 | 59 | if not os.path.exists(folder): 60 | os.makedirs(folder) 61 | 62 | 63 | class Logger: 64 | """ 65 | Implements an object that logs messages to a file, as well as printing them on the sceen. 66 | """ 67 | 68 | def __init__(self, filename): 69 | """ 70 | :param filename: file to be created for logging 71 | """ 72 | self.f = open(filename, 'w') 73 | 74 | def write(self, msg): 75 | """ 76 | :param msg: string to be logged and printed on screen 77 | """ 78 | sys.stdout.write(msg) 79 | self.f.write(msg) 80 | 81 | def __enter__(self): 82 | """ 83 | Context management enter function. 84 | """ 85 | return self 86 | 87 | def __exit__(self, exc_type, exc_val, exc_tb): 88 | """ 89 | Context management exit function. Closes the file. 90 | """ 91 | self.f.close() 92 | return False 93 | -------------------------------------------------------------------------------- /ml/loss_functions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano.tensor as tt 3 | 4 | 5 | def SquareError(x): 6 | """Square error loss function.""" 7 | 8 | if x.ndim == 1: 9 | y = tt.vector('y') 10 | L = tt.mean((x - y) ** 2) 11 | 12 | elif x.ndim == 2: 13 | y = tt.matrix('y') 14 | L = tt.mean(tt.sum((x - y) ** 2, axis=1)) 15 | 16 | else: 17 | raise ValueError('x must be either a vector or a matrix.') 18 | 19 | L.name = 'loss' 20 | 21 | return y, L 22 | 23 | 24 | def CrossEntropy(x): 25 | """Cross entropy loss function. Only works for networks with one output.""" 26 | 27 | if x.ndim == 1: 28 | pass 29 | 30 | elif x.ndim == 2: 31 | x = x[:, 0] 32 | 33 | else: 34 | raise ValueError('x must be either a vector or a matrix.') 35 | 36 | y = tt.vector('y') 37 | L = -tt.mean(y * tt.log(x) + (1-y) * tt.log(1-x)) 38 | L.name = 'loss' 39 | 40 | return y, L 41 | 42 | 43 | def MultiCrossEntropy(x): 44 | """Cross entropy loss function with multiple outputs.""" 45 | 46 | assert x.ndim == 2, 'x must be a matrix.' 47 | 48 | y = tt.matrix('y') 49 | L = -tt.mean(tt.sum(y * tt.log(x), axis=1)) 50 | L.name = 'loss' 51 | 52 | return y, L 53 | 54 | 55 | def Accuracy(x): 56 | """Accuracy loss function. Mainly useful for validation.""" 57 | 58 | if x.ndim == 1: 59 | pass 60 | 61 | elif x.ndim == 2: 62 | x = x.argmax(axis=1) 63 | 64 | else: 65 | raise ValueError('x must be either a vector or a matrix.') 66 | 67 | y = tt.vector('y') 68 | L = tt.mean(tt.eq(y, x)) 69 | L.name = 'loss' 70 | 71 | return y, L 72 | 73 | 74 | def WeightDecay(ws, wdecay): 75 | """Weight decay regularization.""" 76 | 77 | assert wdecay > 0.0 78 | 79 | L = (wdecay / 2.0) * sum([tt.sum(w**2) for w in ws]) 80 | return L 81 | 82 | 83 | def SviRegularizer(mps, sps, wdecay): 84 | """ 85 | The type of regularization that is used in stochastic variational inference. Here, we assume that the prior is 86 | a spherical zero-centred gaussian whose precision corresponds to the weight decay parameter. 
87 | """ 88 | 89 | assert wdecay > 0.0 90 | 91 | n_params = sum([mp.get_value().size for mp in mps]) 92 | 93 | L1 = 0.5 * wdecay * (sum([tt.sum(mp**2) for mp in mps]) + sum([tt.sum(tt.exp(sp*2)) for sp in sps])) 94 | L2 = sum([tt.sum(sp) for sp in sps]) 95 | Lc = 0.5 * n_params * (1.0 + np.log(wdecay)) 96 | 97 | L = L1 - L2 - Lc 98 | 99 | return L 100 | -------------------------------------------------------------------------------- /util/ml.py: -------------------------------------------------------------------------------- 1 | from itertools import izip 2 | import numpy as np 3 | import theano 4 | import theano.tensor as tt 5 | 6 | 7 | def select_theano_act_function(name, dtype=theano.config.floatX): 8 | """ 9 | Given the name of an activation function, returns a handle for the corresponding function in theano. 10 | """ 11 | 12 | if name == 'logistic': 13 | clip = 15.0 if dtype == 'float32' else 19.0 14 | f = lambda x: tt.nnet.sigmoid(tt.clip(x, -clip, clip)) 15 | 16 | elif name == 'tanh': 17 | clip = 9.0 if dtype == 'float32' else 19.0 18 | f = lambda x: tt.tanh(tt.clip(x, -clip, clip)) 19 | 20 | elif name == 'linear': 21 | f = lambda x: x 22 | 23 | elif name == 'relu': 24 | f = tt.nnet.relu 25 | 26 | elif name == 'softplus': 27 | f = tt.nnet.softplus 28 | 29 | elif name == 'softmax': 30 | f = tt.nnet.softmax 31 | 32 | else: 33 | raise ValueError(name + ' is not a supported activation function type.') 34 | 35 | return f 36 | 37 | 38 | def copy_model_parms(source_model, target_model): 39 | """ 40 | Copies the parameters of source_model to target_model. 41 | """ 42 | 43 | for sp, tp in izip(source_model.parms, target_model.parms): 44 | tp.set_value(sp.get_value()) 45 | 46 | 47 | def one_hot_encode(labels, n_labels): 48 | """ 49 | Transforms numeric labels to 1-hot encoded labels. Assumes numeric labels are in the range 0, 1, ..., n_labels-1. 50 | """ 51 | 52 | assert np.min(labels) >= 0 and np.max(labels) < n_labels 53 | 54 | y = np.zeros([labels.size, n_labels]) 55 | y[xrange(labels.size), labels] = 1 56 | 57 | return y 58 | 59 | 60 | def prepare_cond_input(xy, dtype): 61 | """ 62 | Prepares the conditional input for model evaluation. 63 | :param xy: tuple (x, y) for evaluating p(y|x) 64 | :param dtype: data type 65 | :return: prepared x, y and flag whether single datapoint input 66 | """ 67 | 68 | x, y = xy 69 | x = np.asarray(x, dtype=dtype) 70 | y = np.asarray(y, dtype=dtype) 71 | 72 | one_datapoint = False 73 | 74 | if x.ndim == 1: 75 | 76 | if y.ndim == 1: 77 | x = x[np.newaxis, :] 78 | y = y[np.newaxis, :] 79 | one_datapoint = True 80 | 81 | else: 82 | x = np.tile(x, [y.shape[0], 1]) 83 | 84 | else: 85 | 86 | if y.ndim == 1: 87 | y = np.tile(y, [x.shape[0], 1]) 88 | 89 | else: 90 | assert x.shape[0] == y.shape[0], 'wrong sizes' 91 | 92 | return x, y, one_datapoint 93 | 94 | 95 | def are_parms_finite(model): 96 | """ 97 | Check whether all parameters of a model are finite. 98 | :param model: an ml model 99 | :return: False if at least one parameter is inf or nan 100 | """ 101 | 102 | check = True 103 | 104 | for p in model.parms: 105 | check = check and np.all(np.isfinite(p.get_value())) 106 | 107 | return check 108 | -------------------------------------------------------------------------------- /ml/data_streams.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | 4 | 5 | class DataStream: 6 | """Abstract class. Specifies the interface of a data stream. 
7 | The user can request from the stream to generate a new data batch of a 8 | specified size. Useful for online learning.""" 9 | 10 | def gen(self, N): 11 | """Generates a new data batch of size N.""" 12 | raise NotImplementedError('This is an abstract method and should be overriden.') 13 | 14 | 15 | class DataSubSampler(DataStream): 16 | """Given a data set, subsamples mini-batches from it.""" 17 | 18 | def __init__(self, xs): 19 | 20 | # check that input is of the right type 21 | check = lambda t: isinstance(t, np.ndarray) and t.size and t.ndim 22 | assert isinstance(xs, list) and xs, 'Input must be a non-empty list.' 23 | assert check(xs[0]), 'Data must be given as real nonempty arrays.' 24 | N = xs[0].shape[0] 25 | for x in xs[1:]: 26 | assert check(x), 'Data must be given as real nonempty arrays.' 27 | Nk = x.shape[0] 28 | assert N == Nk, 'All data arrays must have the same number of elements in their first dimension.' 29 | 30 | # set remaining class properties 31 | self.index_stream = IndexSubSampler(N) 32 | self.xs = [ theano.shared(x.astype(theano.config.floatX), name='data'+str(i)) for i, x in enumerate(xs) ] 33 | 34 | def gen(self, N): 35 | """Generates a new data batch of size N from the data set.""" 36 | 37 | assert isinstance(N, int) and N > 0, 'Batch size must be a positive integer.' 38 | 39 | n = self.index_stream.gen(N) 40 | return [x[n] for x in self.xs] 41 | 42 | 43 | class IndexSubSampler(DataStream): 44 | """Subsamples minibatches of indices.""" 45 | 46 | def __init__(self, num_idx, rng=np.random): 47 | 48 | assert isinstance(num_idx, int) and num_idx > 0, 'Number of indices must be a positive integer.' 49 | 50 | self.num_idx = num_idx 51 | self.nn = range(num_idx) 52 | rng.shuffle(self.nn) 53 | self.i = 0 54 | self.rng = rng 55 | 56 | def gen(self, N): 57 | """Generates a new index batch of size N from 0:num_idx-1.""" 58 | 59 | assert isinstance(N, int) and N > 0, 'Batch size must be a positive integer.' 60 | 61 | j = self.i + N 62 | times = j // self.num_idx 63 | new_i = j % self.num_idx 64 | n = [] 65 | 66 | for t in xrange(times): 67 | n += self.nn[self.i:] 68 | self.rng.shuffle(self.nn) 69 | self.i = 0 70 | 71 | n += self.nn[self.i:new_i] 72 | self.i = new_i 73 | 74 | return n 75 | 76 | 77 | class IndexSubSamplerSeq(DataStream): 78 | """Subsamples minibatches of indices. Indices are sequentially grouped into minibatches.""" 79 | 80 | def __init__(self, num_idx): 81 | 82 | assert isinstance(num_idx, int) and num_idx > 0, 'Number of indices must be a positive integer.' 83 | 84 | self.num_idx = num_idx 85 | self.nn = range(num_idx) 86 | self.i = 0 87 | 88 | def gen(self, N): 89 | """Generates a new index batch of size N from 0:num_idx-1.""" 90 | 91 | assert isinstance(N, int) and N > 0, 'Batch size must be a positive integer.' 92 | 93 | j = self.i + N 94 | times = j // self.num_idx 95 | new_i = j % self.num_idx 96 | n = [] 97 | 98 | for t in xrange(times): 99 | n += self.nn[self.i:] 100 | self.i = 0 101 | 102 | n += self.nn[self.i:new_i] 103 | self.i = new_i 104 | 105 | return n 106 | -------------------------------------------------------------------------------- /util/math.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.misc 3 | 4 | 5 | def isposint(n): 6 | """ 7 | Determines whether number n is a positive integer. 
8 | :param n: number 9 | :return: bool 10 | """ 11 | return isinstance(n, int) and n > 0 12 | 13 | 14 | def isdistribution(p): 15 | """ 16 | :param p: a vector representing a discrete probability distribution 17 | :return: True if p is a valid probability distribution 18 | """ 19 | return np.all(p >= 0.0) and np.isclose(np.sum(p), 1.0) 20 | 21 | 22 | def logistic(x): 23 | """ 24 | Elementwise logistic sigmoid. 25 | :param x: numpy array 26 | :return: numpy array 27 | """ 28 | return 1.0 / (1.0 + np.exp(-x)) 29 | 30 | 31 | def logit(x): 32 | """ 33 | Elementwise logit (inverse logistic sigmoid). 34 | :param x: numpy array 35 | :return: numpy array 36 | """ 37 | return np.log(x / (1.0 - x)) 38 | 39 | 40 | def discrete_sample(p, n_samples=None, rng=np.random): 41 | """ 42 | Samples from a discrete distribution. 43 | :param p: a distribution with N elements 44 | :param n_samples: number of samples, only 1 if None 45 | :return: vector of samples 46 | """ 47 | 48 | # check distribution 49 | # assert isdistribution(p), 'Probabilities must be non-negative and sum to one.' 50 | 51 | one_sample = n_samples is None 52 | 53 | # cumulative distribution 54 | c = np.cumsum(p[:-1])[np.newaxis, :] 55 | 56 | # get the samples 57 | r = rng.rand(1 if one_sample else n_samples, 1) 58 | samples = np.sum((r > c).astype(int), axis=1) 59 | 60 | return samples[0] if one_sample else samples 61 | 62 | 63 | def importance_sample(target, proposal, n_samples, rng=np.random): 64 | """ 65 | Importance sampling. 66 | :param target: target distribution 67 | :param proposal: proposal distribution 68 | :param n_samples: number of samples 69 | :param rng: random generator to use 70 | :return: samples, normalized log weights 71 | """ 72 | 73 | xs = proposal.gen(n_samples, rng=rng) 74 | log_ws = target.eval(xs, log=True) - proposal.eval(xs, log=True) 75 | log_ws -= scipy.misc.logsumexp(log_ws) 76 | 77 | return xs, log_ws 78 | 79 | 80 | def ess_importance(ws): 81 | """ 82 | Calculates the effective sample size of a set of weighted independent samples (e.g. as given by importance 83 | sampling or sequential monte carlo). Takes as input the normalized sample weights. 84 | """ 85 | 86 | ess = 1.0 / np.sum(ws ** 2) 87 | return ess 88 | 89 | 90 | def ess_mcmc(xs): 91 | """ 92 | Calculates the effective sample size of a correlated sequence of samples, e.g. as given by markov chain monte 93 | carlo. 94 | """ 95 | 96 | n_samples, n_dim = xs.shape 97 | 98 | mean = np.mean(xs, axis=0) 99 | xms = xs - mean 100 | 101 | acors = np.zeros_like(xms) 102 | for i in xrange(n_dim): 103 | for lag in xrange(n_samples): 104 | acor = np.sum(xms[:n_samples-lag, i] * xms[lag:, i]) / (n_samples - lag) 105 | if acor <= 0.0: break 106 | acors[lag, i] = acor 107 | 108 | act = 1.0 + 2.0 * np.sum(acors[1:], axis=0) / acors[0] 109 | ess = n_samples / act 110 | 111 | return np.min(ess) 112 | 113 | 114 | def calc_whitening_transform(xs): 115 | """ 116 | Calculates the parameters that whiten a dataset. 117 | """ 118 | 119 | assert xs.ndim == 2, 'Data must be a matrix' 120 | N = xs.shape[0] 121 | 122 | means = np.mean(xs, axis=0) 123 | ys = xs - means 124 | 125 | cov = np.dot(ys.T, ys) / N 126 | vars, U = np.linalg.eig(cov) 127 | istds = np.sqrt(1.0 / vars) 128 | 129 | return means, U, istds 130 | 131 | 132 | def whiten(xs, params): 133 | """ 134 | Whitens a given dataset using the whitening transform provided. 
135 | """ 136 | 137 | means, U, istds = params 138 | 139 | ys = xs.copy() 140 | ys -= means 141 | ys = np.dot(ys, U) 142 | ys *= istds 143 | 144 | return ys 145 | 146 | 147 | def de_whiten(xs, params): 148 | """ 149 | De-whitens a given dataset using the whitening transform provided. 150 | """ 151 | 152 | means, U, istds = params 153 | 154 | ys = xs.copy() 155 | ys /= istds 156 | ys = np.dot(ys, U.T) 157 | ys += means 158 | 159 | return ys 160 | -------------------------------------------------------------------------------- /ml/step_strategies.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from itertools import izip 3 | import numpy as np 4 | import theano 5 | import theano.tensor as tt 6 | 7 | 8 | class StepStrategy: 9 | """Abstract class for the step size strategy of stochastic gradient training.""" 10 | 11 | def updates(self, parms, grads): 12 | """Given current gradient, return a list of updates to be made.""" 13 | raise NotImplementedError('This is an abstract method and should be overriden.') 14 | 15 | 16 | class ConstantStep(StepStrategy): 17 | """Step size strategy where the learning rate is held constant.""" 18 | 19 | def __init__(self, step): 20 | """ 21 | Constructor. 22 | :param step: the constant step size to be used 23 | """ 24 | assert step > 0.0, 'Step size must be positive.' 25 | self.step = step 26 | 27 | def updates(self, parms, grads): 28 | """No updates to be made; step size is held constant throughout.""" 29 | new_parms = [p - self.step*g for p, g in izip(parms, grads)] 30 | return zip(parms, new_parms) 31 | 32 | 33 | class LinearDecay(StepStrategy): 34 | """Step size strategy where the learning rate is linearly decreased so as to 35 | hit zero after a specified number of iterations.""" 36 | 37 | def __init__(self, init, maxiter): 38 | """ 39 | Constructor. 40 | :param init: initial step size 41 | :param maxiter: maximum number of iterations. 42 | """ 43 | assert init > 0.0, 'Step size must be positive.' 44 | assert isinstance(maxiter, int) and maxiter > 0, 'Maximum number of iterations must be a positive integer.' 45 | 46 | self.init = init 47 | self.maxiter = maxiter 48 | 49 | def updates(self, parms, grads): 50 | """Next step is linearly decayed.""" 51 | step = theano.shared(np.asarray(self.init, dtype=theano.config.floatX), name='step') 52 | new_step = step - self.init / self.maxiter 53 | new_parms = [p - step*g for p, g in izip(parms, grads)] 54 | return [(step, new_step)] + zip(parms, new_parms) 55 | 56 | 57 | class AdaDelta(StepStrategy): 58 | """ADADELTA step size strategy. For details, see: 59 | M. D. Zeiler, "ADADELTA: An adaptive learning rate method", arXiv, 2012.""" 60 | 61 | def __init__(self, rho=0.95, eps=1.0e-6): 62 | """Constructor. Sets adadelta's hyperparameters.""" 63 | assert eps > 0, 'eps must be positive.' 64 | assert 0 < rho < 1, 'rho must be strictly between 0 and 1.' 
65 | 66 | self.eps = eps 67 | self.rho = rho 68 | 69 | def updates(self, parms, grads): 70 | """Return a list of updates to be made, both to adadelta's accumulators and the parameters.""" 71 | 72 | acc_gs = [theano.shared(np.zeros_like(p.get_value(borrow=True)), borrow=True) for p in parms] 73 | acc_ds = [theano.shared(np.zeros_like(p.get_value(borrow=True)), borrow=True) for p in parms] 74 | 75 | new_acc_gs = [self.rho * ag + (1-self.rho) * g**2 for g, ag in izip(grads, acc_gs)] 76 | ds = [tt.sqrt((ad + self.eps) / (ag + self.eps)) * g for g, ag, ad in izip(grads, new_acc_gs, acc_ds)] 77 | new_acc_ds = [self.rho * ad + (1-self.rho) * d**2 for d, ad in izip(ds, acc_ds)] 78 | new_parms = [p - d for p, d in izip(parms, ds)] 79 | 80 | return zip(acc_gs, new_acc_gs) + zip(acc_ds, new_acc_ds) + zip(parms, new_parms) 81 | 82 | 83 | class Adam(StepStrategy): 84 | """Adam step size strategy. For details, see: 85 | D. Kingma and J. Ba, "Adam: A Method for Stochastic Optimization", ICLR, 2015.""" 86 | 87 | def __init__(self, a=0.001, bm=0.9, bv=0.999, eps=1.0e-8): 88 | """Constructor. Sets adam's hyperparameters.""" 89 | assert a > 0, 'a must be positive.' 90 | assert 0 < bm < 1, 'bm must be strictly between 0 and 1.' 91 | assert 0 < bv < 1, 'bv must be strictly between 0 and 1.' 92 | assert eps > 0, 'eps must be positive.' 93 | 94 | self.a = a 95 | self.bm = bm 96 | self.bv = bv 97 | self.eps = eps 98 | 99 | def updates(self, parms, grads): 100 | """Return a list of updates to be made, both to adams's running averages and the parameters.""" 101 | 102 | bm_t = theano.shared(np.asarray(self.bm).astype(theano.config.floatX)) 103 | bv_t = theano.shared(np.asarray(self.bv).astype(theano.config.floatX)) 104 | 105 | new_bm_t = bm_t * self.bm 106 | new_bv_t = bv_t * self.bv 107 | 108 | acc_m = [theano.shared(np.zeros_like(p.get_value(borrow=True)), borrow=True) for p in parms] 109 | acc_v = [theano.shared(np.zeros_like(p.get_value(borrow=True)), borrow=True) for p in parms] 110 | 111 | new_acc_m = [self.bm * am + (1-self.bm) * g for g, am in izip(grads, acc_m)] 112 | new_acc_v = [self.bv * av + (1-self.bv) * g**2 for g, av in izip(grads, acc_v)] 113 | 114 | step = self.a * tt.sqrt(1-new_bv_t) / (1-new_bm_t) 115 | eps = self.eps * (1-new_bv_t) 116 | ds = [step * am / tt.sqrt(av + eps) for am, av in izip(new_acc_m, new_acc_v)] 117 | 118 | new_parms = [p - d for p, d in izip(parms, ds)] 119 | 120 | return zip([bm_t, bv_t], [new_bm_t, new_bv_t]) + zip(acc_m, new_acc_m) + zip(acc_v, new_acc_v) + zip(parms, new_parms) 121 | -------------------------------------------------------------------------------- /ml/ensembles.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from scipy.misc import logsumexp 3 | from copy import deepcopy 4 | from itertools import izip 5 | import numpy as np 6 | 7 | 8 | class Ensemble: 9 | """ 10 | Implements an ensemble of other models. 
11 | """ 12 | 13 | def __init__(self): 14 | """Initializes the ensemble as empty.""" 15 | 16 | self.models = [] 17 | self.n_copies = [] 18 | self.n_models = 0 19 | self.n_diff_models = 0 20 | self.n_inputs = 0 21 | self.n_outputs = 0 22 | 23 | def add_new(self, model, copy=False): 24 | """Adds a new model to the ensemble.""" 25 | 26 | if self.n_models == 0: 27 | self.n_inputs = model.n_inputs 28 | self.n_outputs = model.n_outputs 29 | 30 | else: 31 | assert self.n_inputs == model.n_inputs 32 | assert self.n_outputs == model.n_outputs 33 | 34 | if copy: 35 | self.models.append(deepcopy(model)) 36 | else: 37 | self.models.append(model) 38 | self.n_copies.append(1) 39 | self.n_models += 1 40 | self.n_diff_models += 1 41 | 42 | def add_existing(self, i): 43 | """Adds an extra copy of model i in the ensemble.""" 44 | 45 | self.n_copies[i] += 1 46 | self.n_models += 1 47 | 48 | def remove(self, i): 49 | """Removes a model at position i from the ensemble.""" 50 | 51 | self.n_copies[i] -= 1 52 | if self.n_copies[i] == 0: 53 | del self.models[i] 54 | del self.n_copies[i] 55 | self.n_diff_models -= 1 56 | 57 | self.n_models -= 1 58 | if self.n_models == 0: 59 | self.n_inputs = 0 60 | self.n_outputs = 0 61 | 62 | def eval(self, x): 63 | """Evaluates ensemble at given input x.""" 64 | 65 | # NOTE that there is the potential drawback in this implementation that x is moved back and forth to the gpu 66 | 67 | assert self.n_models > 0, 'Ensemble is empty.' 68 | 69 | y = 0.0 70 | 71 | for model, copies in izip(self.models, self.n_copies): 72 | y += copies * model.eval(x) 73 | 74 | y /= self.n_models 75 | 76 | return y 77 | 78 | def eval_model(self, i, x): 79 | """Evaluates model i in the ensemble at given input x.""" 80 | 81 | return self.models[i].eval(x) 82 | 83 | 84 | class FastEnsemble: 85 | """ 86 | Implements an ensemble of other models. Maintains only a single model, and a list of different parameter matrices. 87 | As a result, it is faster to create and more memory efficient, but slower to evaluate. 88 | """ 89 | 90 | def __init__(self, model, copy=False): 91 | """Initializes the ensemble as empty.""" 92 | 93 | self.model = deepcopy(model) if copy else model 94 | self.parms = [] 95 | self.n_copies = [] 96 | self.n_models = 0 97 | self.n_diff_models = 0 98 | self.n_inputs = model.n_inputs 99 | self.n_outputs = model.n_outputs 100 | 101 | def _load_model(self, i): 102 | """Loads parameters for model i.""" 103 | 104 | for j, p in enumerate(self.parms[i]): 105 | self.model.parms[j].set_value(p) 106 | 107 | def add_new(self, parms, copy=False): 108 | """Adds a new set of parameters to the ensemble.""" 109 | 110 | if copy: 111 | self.parms.append([x.get_value().copy() for x in parms]) 112 | else: 113 | self.parms.append([x.get_value() for x in parms]) 114 | self.n_copies.append(1) 115 | self.n_models += 1 116 | self.n_diff_models += 1 117 | 118 | def add_existing(self, i): 119 | """Adds an extra copy of model i in the ensemble.""" 120 | 121 | self.n_copies[i] += 1 122 | self.n_models += 1 123 | 124 | def remove(self, i): 125 | """Removes a model at position i from the ensemble.""" 126 | 127 | self.n_copies[i] -= 1 128 | 129 | if self.n_copies[i] == 0: 130 | del self.parms[i] 131 | del self.n_copies[i] 132 | self.n_diff_models -= 1 133 | 134 | def eval(self, x, mode='mean'): 135 | """Evaluates ensemble at given input x.""" 136 | 137 | # NOTE that there is the potential drawback in this implementation that x is moved back and forth to the gpu 138 | 139 | assert self.n_models > 0, 'Ensemble is empty.' 
140 | 141 | if mode == 'mean': 142 | 143 | y = 0.0 144 | 145 | for i, copies in enumerate(self.n_copies): 146 | self._load_model(i) 147 | y += copies * self.model.eval(x) 148 | 149 | y /= self.n_models 150 | 151 | elif mode == 'logmeanexp': 152 | 153 | y = [] 154 | 155 | for i, copies in enumerate(self.n_copies): 156 | self._load_model(i) 157 | y.append(np.log(copies) + self.model.eval(x)) 158 | 159 | y = logsumexp(np.array(y), axis=0) - np.log(self.n_models) 160 | 161 | else: 162 | raise ValueError('Unknown averaging mode.') 163 | 164 | return y 165 | 166 | def eval_model(self, i, x): 167 | """Evaluates model i in the ensemble at given input x.""" 168 | 169 | self._load_model(i) 170 | return self.model.eval(x) 171 | 172 | def get_traces(self): 173 | """Returns matrices whose columns are traces of parameters, in the order they where added to the ensemble.""" 174 | 175 | all_traces = [] 176 | 177 | for i in xrange(len(self.model.parms)): 178 | 179 | traces = [] 180 | 181 | for params, copies in izip(self.parms, self.n_copies): 182 | for n in xrange(copies): 183 | traces.append(params[i].flatten()) 184 | 185 | all_traces.append(np.array(traces)) 186 | 187 | return all_traces -------------------------------------------------------------------------------- /util/plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | def disp_imdata(xs, imsize, layout=(1, 1)): 6 | """ 7 | Displays an array of images, a page at a time. The user can navigate pages with 8 | left and right arrows, start over by pressing space, or close the figure by esc. 9 | :param xs: an numpy array with images as rows 10 | :param imsize: size of the images 11 | :param layout: layout of images in a page 12 | :return: none 13 | """ 14 | 15 | num_plots = np.prod(layout) 16 | num_xs = xs.shape[0] 17 | idx = [0] 18 | 19 | # create a figure with subplots 20 | fig, axs = plt.subplots(layout[0], layout[1]) 21 | 22 | if isinstance(axs, np.ndarray): 23 | axs = axs.flatten() 24 | else: 25 | axs = [axs] 26 | 27 | for ax in axs: 28 | ax.axes.get_xaxis().set_visible(False) 29 | ax.axes.get_yaxis().set_visible(False) 30 | 31 | def plot_page(): 32 | """Plots the next page.""" 33 | 34 | ii = np.arange(idx[0], idx[0]+num_plots) % num_xs 35 | 36 | for ax, i in zip(axs, ii): 37 | ax.imshow(xs[i].reshape(imsize), cmap='gray', interpolation='none') 38 | ax.set_title(str(i)) 39 | 40 | fig.canvas.draw() 41 | 42 | def on_key_event(event): 43 | """Event handler after key press.""" 44 | 45 | key = event.key 46 | 47 | if key == 'right': 48 | # show next page 49 | idx[0] = (idx[0] + num_plots) % num_xs 50 | plot_page() 51 | 52 | elif key == 'left': 53 | # show previous page 54 | idx[0] = (idx[0] - num_plots) % num_xs 55 | plot_page() 56 | 57 | elif key == ' ': 58 | # show first page 59 | idx[0] = 0 60 | plot_page() 61 | 62 | elif key == 'escape': 63 | # close figure 64 | plt.close(fig) 65 | 66 | fig.canvas.mpl_connect('key_press_event', on_key_event) 67 | plot_page() 68 | 69 | 70 | def probs2contours(probs, levels): 71 | """ 72 | Takes an array of probabilities and produces an array of contours at specified percentile levels 73 | :param probs: probability array. doesn't have to sum to 1, but it is assumed it contains all the mass 74 | :param levels: percentile levels. 
have to be in [0.0, 1.0] 75 | :return: array of same shape as probs with percentile labels 76 | """ 77 | 78 | # make sure all contour levels are in [0.0, 1.0] 79 | levels = np.asarray(levels) 80 | assert np.all(levels <= 1.0) and np.all(levels >= 0.0) 81 | 82 | # flatten probability array 83 | shape = probs.shape 84 | probs = probs.flatten() 85 | 86 | # sort probabilities in descending order 87 | idx_sort = probs.argsort()[::-1] 88 | idx_unsort = idx_sort.argsort() 89 | probs = probs[idx_sort] 90 | 91 | # cumulative probabilities 92 | cum_probs = probs.cumsum() 93 | cum_probs /= cum_probs[-1] 94 | 95 | # create contours at levels 96 | contours = np.ones_like(cum_probs) 97 | levels = np.sort(levels)[::-1] 98 | for level in levels: 99 | contours[cum_probs <= level] = level 100 | 101 | # make sure contours have the order and the shape of the original probability array 102 | contours = np.reshape(contours[idx_unsort], shape) 103 | 104 | return contours 105 | 106 | 107 | def plot_pdf_marginals(pdf, lims, gt=None, levels=(0.68, 0.95)): 108 | """ 109 | Plots marginals of a pdf, for each variable and pair of variables. 110 | """ 111 | 112 | if pdf.ndim == 1: 113 | 114 | fig, ax = plt.subplots(1, 1) 115 | xx = np.linspace(lims[0], lims[1], 200) 116 | 117 | pp = pdf.eval(xx[:, np.newaxis], log=False) 118 | ax.plot(xx, pp) 119 | ax.set_xlim(lims) 120 | ax.set_ylim([0, ax.get_ylim()[1]]) 121 | if gt is not None: ax.vlines(gt, 0, ax.get_ylim()[1], color='r') 122 | 123 | else: 124 | 125 | fig = plt.figure() 126 | 127 | lims = np.asarray(lims) 128 | lims = np.tile(lims, [pdf.ndim, 1]) if lims.ndim == 1 else lims 129 | 130 | for i in xrange(pdf.ndim): 131 | for j in xrange(i + 1): 132 | 133 | ax = fig.add_subplot(pdf.ndim, pdf.ndim, i * pdf.ndim + j + 1) 134 | 135 | if i == j: 136 | xx = np.linspace(lims[i, 0], lims[i, 1], 500) 137 | pp = pdf.eval(xx[:, np.newaxis], ii=[i], log=False) 138 | ax.plot(xx, pp) 139 | ax.set_xlim(lims[i]) 140 | ax.set_ylim([0, ax.get_ylim()[1]]) 141 | if gt is not None: ax.vlines(gt[i], 0, ax.get_ylim()[1], color='r') 142 | 143 | else: 144 | xx = np.linspace(lims[i, 0], lims[i, 1], 200) 145 | yy = np.linspace(lims[j ,0], lims[j, 1], 200) 146 | X, Y = np.meshgrid(xx, yy) 147 | xy = np.concatenate([X.reshape([-1, 1]), Y.reshape([-1, 1])], axis=1) 148 | pp = pdf.eval(xy, ii=[i, j], log=False) 149 | pp = pp.reshape(list(X.shape)) 150 | ax.contour(X, Y, probs2contours(pp, levels), levels) 151 | ax.set_xlim(lims[i]) 152 | ax.set_ylim(lims[j]) 153 | if gt is not None: ax.plot(gt[i], gt[j], 'r.', ms=8) 154 | 155 | plt.show(block=False) 156 | 157 | return fig 158 | 159 | 160 | def plot_hist_marginals(data, weights=None, lims=None, gt=None): 161 | """ 162 | Plots marginal histograms and pairwise scatter plots of a dataset. 
163 | """ 164 | 165 | n_bins = int(np.sqrt(data.shape[0])) 166 | 167 | if data.ndim == 1: 168 | 169 | fig, ax = plt.subplots(1, 1) 170 | ax.hist(data, weights=weights, bins=n_bins, normed=True) 171 | ax.set_ylim([0, ax.get_ylim()[1]]) 172 | if lims is not None: ax.set_xlim(lims) 173 | if gt is not None: ax.vlines(gt, 0, ax.get_ylim()[1], color='r') 174 | 175 | else: 176 | 177 | n_dim = data.shape[1] 178 | fig = plt.figure() 179 | 180 | if weights is None: 181 | col = 'k' 182 | vmin, vmax = None, None 183 | else: 184 | col = weights 185 | vmin, vmax = 0., np.max(weights) 186 | 187 | if lims is not None: 188 | lims = np.asarray(lims) 189 | lims = np.tile(lims, [n_dim, 1]) if lims.ndim == 1 else lims 190 | 191 | for i in xrange(n_dim): 192 | for j in xrange(i + 1): 193 | 194 | ax = fig.add_subplot(n_dim, n_dim, i * n_dim + j + 1) 195 | 196 | if i == j: 197 | ax.hist(data[:, i], weights=weights, bins=n_bins, normed=True) 198 | ax.set_ylim([0, ax.get_ylim()[1]]) 199 | if lims is not None: ax.set_xlim(lims[i]) 200 | if gt is not None: ax.vlines(gt[i], 0, ax.get_ylim()[1], color='r') 201 | 202 | else: 203 | ax.scatter(data[:, i], data[:, j], c=col, s=3, marker='o', vmin=vmin, vmax=vmax, cmap='binary', edgecolors='none') 204 | if lims is not None: 205 | ax.set_xlim(lims[i]) 206 | ax.set_ylim(lims[j]) 207 | if gt is not None: ax.scatter(gt[i], gt[j], c='r', s=12, marker='o', edgecolors='none') 208 | 209 | plt.show(block=False) 210 | 211 | return fig 212 | 213 | 214 | def plot_traces(xs): 215 | """ 216 | Plots sample traces. Useful for MCMC. 217 | :param xs: # samples x # vars numpy array 218 | :return: figure and axes handles 219 | """ 220 | 221 | N = xs.shape[1] 222 | fig, ax = plt.subplots(N, 1, sharex=True) 223 | 224 | for i in xrange(N): 225 | ax[i].plot(xs[:, i]) 226 | 227 | ax[-1].set_xlabel('samples') 228 | plt.show(block=False) 229 | 230 | return fig, ax 231 | -------------------------------------------------------------------------------- /bnn_demo.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import numpy as np 4 | import numpy.random as rng 5 | import matplotlib.pyplot as plt 6 | import matplotlib.cm as cm 7 | from mpl_toolkits.mplot3d import Axes3D 8 | 9 | import ml.trainers as trainers 10 | import ml.neural_nets as nn 11 | import ml.loss_functions as lf 12 | 13 | wdecay = 0.0001 14 | 15 | 16 | def create_dataset(): 17 | """ 18 | Creates a small dataset of 2d points in two linearly separable classes. 19 | :return: datapoints, labels 20 | """ 21 | 22 | data_per_class = 12 23 | 24 | rng_state = rng.get_state() 25 | rng.seed(0) 26 | 27 | x1 = rng.multivariate_normal([-6, 0], np.eye(2), data_per_class) 28 | x2 = rng.multivariate_normal([+6, 0], np.eye(2), data_per_class) 29 | 30 | y1 = np.zeros(data_per_class) 31 | y2 = np.ones(data_per_class) 32 | 33 | xs = np.concatenate([x1, x2], axis=0) 34 | ys = np.concatenate([y1, y2], axis=0) 35 | 36 | rng.set_state(rng_state) 37 | 38 | return xs, ys 39 | 40 | 41 | def create_net(svi=False): 42 | """ 43 | Creates a feedforward neural net. 44 | :param svi: whether the neural net should be SVI enabled 45 | :return: the net 46 | """ 47 | 48 | if svi: 49 | net = nn.FeedforwardNet_SVI(2) 50 | else: 51 | net = nn.FeedforwardNet(2) 52 | 53 | net.addLayer(10, 'relu') 54 | net.addLayer(1, 'logistic') 55 | 56 | return net 57 | 58 | 59 | def create_grid(xmin, xmax, N): 60 | """ 61 | Creates a grid for 3d plotting. 
62 | :param xmin: lower limit 63 | :param xmax: upper limit 64 | :param N: number of points in the grid per dimension 65 | :return: the grid 66 | """ 67 | 68 | xx = np.linspace(xmin, xmax, N) 69 | X, Y = np.meshgrid(xx, xx) 70 | data = np.concatenate([X.reshape([-1, 1]), Y.reshape([-1, 1])], axis=1) 71 | 72 | return data, X, Y 73 | 74 | 75 | def show_train_data(): 76 | """ 77 | Plots the training data. 78 | """ 79 | 80 | xs, ys = create_dataset() 81 | 82 | plt.figure() 83 | plt.plot(xs[ys == 0, 0], xs[ys == 0, 1], 'b.', ms=12) 84 | plt.plot(xs[ys == 1, 0], xs[ys == 1, 1], 'r.', ms=12) 85 | plt.xlabel('x1') 86 | plt.ylabel('x2') 87 | plt.axis('equal') 88 | plt.axis([-12, 12, -12, 12]) 89 | plt.title('Training data') 90 | 91 | plt.show() 92 | 93 | 94 | def fit_neural_net_demo(): 95 | """ 96 | Fits a non-bayesian neural net to the training data by minimizing cross entropy. 97 | """ 98 | 99 | xs, ys = create_dataset() 100 | net = create_net() 101 | 102 | # train the net 103 | trn_target, trn_loss = lf.CrossEntropy(net.output) 104 | regularizer = lf.WeightDecay(net.parms, wdecay) 105 | trainer = trainers.SGD( 106 | model=net, 107 | trn_data=[xs, ys], 108 | trn_loss=trn_loss + regularizer / xs.shape[0], 109 | trn_target=trn_target 110 | ) 111 | trainer.train(tol=1.0e-9, monitor_every=10, show_progress=True) 112 | 113 | # make predictions 114 | tst_data, X, Y = create_grid(-12, 12, 50) 115 | pred = net.eval(tst_data) 116 | 117 | # plot the prediction surface 118 | fig = plt.figure() 119 | ax = fig.gca(projection='3d') 120 | Z = pred.reshape(list(X.shape)) 121 | ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0) 122 | ax.plot(xs[ys == 0, 0], xs[ys == 0, 1], 'b.', ms=12) 123 | ax.plot(xs[ys == 1, 0], xs[ys == 1, 1], 'r.', ms=12) 124 | ax.view_init(elev=90, azim=-90) 125 | plt.xlabel('x1') 126 | plt.ylabel('x2') 127 | plt.axis('equal') 128 | ax.axis([-12, 12, -12, 12]) 129 | fig.suptitle('Prediction surface of trained net') 130 | 131 | plt.show() 132 | 133 | 134 | def bayesian_neural_net_svi_demo(): 135 | """ 136 | Trains a bayesian neural net on the training set using Stochastic Variational Inference. 
137 | """ 138 | 139 | xs, ys = create_dataset() 140 | net = create_net(svi=True) 141 | tst_data, X, Y = create_grid(-12, 12, 50) 142 | 143 | # train the net 144 | trn_target, trn_loss = lf.CrossEntropy(net.output) 145 | regularizer = lf.SviRegularizer(net.mps, net.sps, wdecay) 146 | trainer = trainers.SGD( 147 | model=net, 148 | trn_data=[xs, ys], 149 | trn_loss=trn_loss + regularizer / xs.shape[0], 150 | trn_target=trn_target 151 | ) 152 | trainer.train(maxepochs=80000, monitor_every=10, show_progress=True) 153 | 154 | # make predictions with zero noise 155 | base_pred = net.eval(tst_data, rand=False) 156 | 157 | # make predictions by averaging samples 158 | n_samples = 1000 159 | avg_pred = 0.0 160 | for _ in xrange(n_samples): 161 | avg_pred += net.eval(tst_data, rand=True) 162 | avg_pred /= n_samples 163 | 164 | # plot the base prediction surface 165 | fig = plt.figure() 166 | ax = fig.gca(projection='3d') 167 | Z = base_pred.reshape(list(X.shape)) 168 | ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0) 169 | ax.plot(xs[ys == 0, 0], xs[ys == 0, 1], 'b.', ms=12) 170 | ax.plot(xs[ys == 1, 0], xs[ys == 1, 1], 'r.', ms=12) 171 | ax.view_init(elev=90, azim=-90) 172 | plt.xlabel('x1') 173 | plt.ylabel('x2') 174 | plt.axis('equal') 175 | ax.axis([-12, 12, -12, 12]) 176 | fig.suptitle('Prediction surface using average weights') 177 | 178 | # plot the average prediction surface 179 | fig = plt.figure() 180 | ax = fig.gca(projection='3d') 181 | Z = avg_pred.reshape(list(X.shape)) 182 | ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0) 183 | ax.plot(xs[ys == 0, 0], xs[ys == 0, 1], 'b.', ms=12) 184 | ax.plot(xs[ys == 1, 0], xs[ys == 1, 1], 'r.', ms=12) 185 | ax.view_init(elev=90, azim=-90) 186 | plt.xlabel('x1') 187 | plt.ylabel('x2') 188 | plt.axis('equal') 189 | ax.axis([-12, 12, -12, 12]) 190 | fig.suptitle('Bayesian prediction surface') 191 | 192 | # plot the sample prediction surfaces 193 | fig = plt.figure() 194 | fig.suptitle('Sample prediction surfaces') 195 | 196 | for i in xrange(6): 197 | 198 | sample_pred = net.eval(tst_data, rand=True) 199 | 200 | ax = fig.add_subplot(2, 3, i+1, projection='3d') 201 | Z = sample_pred.reshape(list(X.shape)) 202 | ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0) 203 | ax.plot(xs[ys == 0, 0], xs[ys == 0, 1], 'b.', ms=12) 204 | ax.plot(xs[ys == 1, 0], xs[ys == 1, 1], 'r.', ms=12) 205 | ax.view_init(elev=90, azim=-90) 206 | plt.xlabel('x1') 207 | plt.ylabel('x2') 208 | plt.axis('equal') 209 | ax.axis([-12, 12, -12, 12]) 210 | 211 | plt.show() 212 | 213 | 214 | def bayesian_neural_net_hmc_demo(): 215 | """ 216 | Trains a bayesian neural net on the training set using Hamiltonian Monte Carlo. 
217 | """ 218 | 219 | xs, ys = create_dataset() 220 | net = create_net() 221 | tst_data, X, Y = create_grid(-12, 12, 50) 222 | 223 | # make predictions on a grid of points 224 | trn_target, trn_loss = lf.CrossEntropy(net.output) 225 | regularizer = lf.WeightDecay(net.parms, wdecay) 226 | sampler = trainers.HMC( 227 | model=net, 228 | trn_data=[xs, ys], 229 | trn_loss=xs.shape[0] * trn_loss + regularizer, 230 | trn_target=trn_target 231 | ) 232 | ensemble = sampler.gen( 233 | n_samples=2000, 234 | L=100, 235 | me=0.3, 236 | show_traces=True 237 | ) 238 | avg_pred = ensemble.eval(tst_data) 239 | 240 | # plot the prediction surface 241 | fig = plt.figure() 242 | ax = fig.gca(projection='3d') 243 | Z = avg_pred.reshape(list(X.shape)) 244 | ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0) 245 | ax.plot(xs[ys == 0, 0], xs[ys == 0, 1], 'b.', ms=12) 246 | ax.plot(xs[ys == 1, 0], xs[ys == 1, 1], 'r.', ms=12) 247 | ax.view_init(elev=90, azim=-90) 248 | plt.xlabel('x1') 249 | plt.ylabel('x2') 250 | plt.axis('equal') 251 | ax.axis([-12, 12, -12, 12]) 252 | fig.suptitle('Bayesian prediction surface') 253 | 254 | # plot the prediction surfaces of a few sample networks 255 | fig = plt.figure() 256 | fig.suptitle('Sample prediction surfaces') 257 | 258 | for c, i in enumerate(rng.randint(0, ensemble.n_diff_models, 6)): 259 | 260 | ax = fig.add_subplot(2, 3, c+1, projection='3d') 261 | Z = ensemble.eval_model(i, tst_data).reshape(list(X.shape)) 262 | ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0) 263 | ax.plot(xs[ys == 0, 0], xs[ys == 0, 1], 'b.', ms=12) 264 | ax.plot(xs[ys == 1, 0], xs[ys == 1, 1], 'r.', ms=12) 265 | ax.view_init(elev=90, azim=-90) 266 | plt.xlabel('x1') 267 | plt.ylabel('x2') 268 | plt.axis('equal') 269 | ax.axis([-12, 12, -12, 12]) 270 | 271 | plt.show() 272 | -------------------------------------------------------------------------------- /ml/neural_nets.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import numpy as np 4 | import theano 5 | import theano.tensor as tt 6 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 7 | import matplotlib.pyplot as plt 8 | 9 | import util.math 10 | import util.ml 11 | import util.plot 12 | 13 | dtype = theano.config.floatX 14 | 15 | 16 | class FeedforwardNet: 17 | """Implements a feedforward neural network. 18 | Supports various types of layers and loss functions.""" 19 | 20 | def __init__(self, n_inputs, input=None): 21 | """Constructs a net with a given number of inputs and no layers.""" 22 | 23 | assert util.math.isposint(n_inputs), 'Number of inputs must be a positive integer.' 24 | 25 | self.n_inputs = n_inputs 26 | self.n_outputs = n_inputs 27 | self.n_units = [n_inputs] 28 | self.n_layers = 0 29 | self.n_params = 0 30 | 31 | self.Ws = [] 32 | self.bs = [] 33 | self.hs = [tt.matrix('x') if input is None else input] 34 | self.parms = self.Ws + self.bs 35 | self.input = self.hs[0] 36 | self.output = self.hs[-1] 37 | 38 | self.eval_f = None 39 | 40 | def addLayer(self, n_units, type, rng=np.random): 41 | """Adds a new layer to the network, 42 | :param n_units: number of units in the layer 43 | :param type: a string specification of the activation function 44 | """ 45 | 46 | # check number of units 47 | assert util.math.isposint(n_units), 'Number of units must be a positive integer.' 
48 | 49 | # choose activation function 50 | actfun = util.ml.select_theano_act_function(type, dtype) 51 | 52 | n_prev_units = self.n_outputs 53 | self.n_outputs = n_units 54 | self.n_units.append(n_units) 55 | self.n_layers += 1 56 | self.n_params += (n_prev_units + 1) * n_units 57 | 58 | W = theano.shared((rng.randn(n_prev_units, n_units) / np.sqrt(n_prev_units + 1)).astype(dtype), name='W' + str(self.n_layers), borrow=True) 59 | b = theano.shared(np.zeros(n_units, dtype=dtype), name='b' + str(self.n_layers), borrow=True) 60 | h = actfun(tt.dot(self.hs[-1], W) + b) 61 | h.name = 'h' + str(self.n_layers) 62 | 63 | self.Ws.append(W) 64 | self.bs.append(b) 65 | self.hs.append(h) 66 | self.parms = self.Ws + self.bs 67 | self.output = self.hs[-1] 68 | 69 | self.eval_f = None 70 | 71 | def removeLayer(self): 72 | """Removes a layer from the network.""" 73 | 74 | assert self.n_layers > 0, 'There is no layer to remove.' 75 | 76 | n_params_to_rem = self.n_outputs * (self.n_units[-2] + 1) 77 | self.n_outputs = self.n_units[-2] 78 | self.n_units.pop() 79 | self.n_layers -= 1 80 | self.n_params -= n_params_to_rem 81 | 82 | self.Ws.pop() 83 | self.bs.pop() 84 | self.hs.pop() 85 | self.parms = self.Ws + self.bs 86 | self.output = self.hs[-1] 87 | 88 | self.eval_f = None 89 | 90 | def eval(self, x): 91 | """Evaluate net at locations in x.""" 92 | 93 | # compile theano computation graph, if haven't already done so 94 | if self.eval_f is None: 95 | self.eval_f = theano.function( 96 | inputs=[self.hs[0]], 97 | outputs=self.hs[-1] 98 | ) 99 | 100 | x = np.asarray(x, dtype=dtype) 101 | 102 | return self.eval_f(x[np.newaxis, :])[0] if x.ndim == 1 else self.eval_f(x) 103 | 104 | def printInfo(self): 105 | """Prints some useful info about the net.""" 106 | 107 | print 'Number of inputs =', self.n_inputs 108 | print 'Number of outputs =', self.n_outputs 109 | print 'Number of units =', self.n_units 110 | print 'Number of layers =', self.n_layers 111 | print 'Number of params =', self.n_params 112 | print 'Data type =', dtype 113 | 114 | def visualize_weights(self, layer, imsize, layout): 115 | """ 116 | Displays the weights of a specified layer as images. 117 | :param layer: the layer whose weights to display 118 | :param imsize: the image size 119 | :param layout: number of rows and columns for each page 120 | :return: none 121 | """ 122 | 123 | util.plot.disp_imdata(self.Ws[layer].get_value().T, imsize, layout) 124 | plt.show(block=False) 125 | 126 | def visualize_activations(self, x, layers=None): 127 | """ 128 | Visualizes the activations of specified layers caused by a given data minibatch. 129 | :param x: a minibatch of data 130 | :param layers: list of layers to visualize activations of; defaults to the whole net except the input layer 131 | :return: none 132 | """ 133 | 134 | if layers is None: 135 | layers = xrange(self.n_layers) 136 | 137 | forwprop = theano.function( 138 | inputs=[self.hs[0]], 139 | outputs=self.hs[1:] 140 | ) 141 | hs = forwprop(x.astype(dtype)) 142 | 143 | for l in layers: 144 | 145 | fig = plt.figure() 146 | ax = fig.add_subplot(1, 1, 1) 147 | ax.imshow(hs[l], cmap='gray', interpolation='none') 148 | ax.set_title('Layer ' + str(l)) 149 | ax.set_xlabel('layer units') 150 | ax.set_ylabel('data points') 151 | 152 | plt.show(block=False) 153 | 154 | def param_hist(self, layers=None): 155 | """ 156 | Displays a histogram of weights and biases for specified layers. 
157 | :param layers: list of layers to show histograms for; defaults to the whole net 158 | :return: none 159 | """ 160 | 161 | if layers is None: 162 | layers = xrange(self.n_layers) 163 | 164 | for l in layers: 165 | 166 | fig, (ax1, ax2) = plt.subplots(1, 2) 167 | 168 | nbins = int(np.sqrt(self.Ws[l].get_value().size)) 169 | ax1.hist(self.Ws[l].get_value().flatten(), nbins, normed=True) 170 | ax1.set_title('weights, layer ' + str(l)) 171 | 172 | nbins = int(np.sqrt(self.bs[l].get_value().size)) 173 | ax2.hist(self.bs[l].get_value(), nbins, normed=True) 174 | ax2.set_title('biases, layer ' + str(l)) 175 | 176 | plt.show(block=False) 177 | 178 | 179 | class FeedforwardNet_SVI: 180 | """Implements a feedforward neural network trained using stochastic variational inference. 181 | Supports various types of layers and loss functions.""" 182 | 183 | def __init__(self, n_inputs): 184 | """Constructs a net with a given number of inputs and no layers.""" 185 | 186 | assert util.math.isposint(n_inputs), 'Number of inputs must be a positive integer.' 187 | 188 | self.n_inputs = n_inputs 189 | self.n_outputs = n_inputs 190 | self.n_units = [n_inputs] 191 | self.n_layers = 0 192 | self.n_params = 0 193 | 194 | self.mWs = [] 195 | self.mbs = [] 196 | self.sWs = [] 197 | self.sbs = [] 198 | self.uas = [] 199 | self.mas = [] 200 | self.zas = [] 201 | self.hs = [tt.matrix('x')] 202 | 203 | self.mps = self.mWs + self.mbs 204 | self.sps = self.sWs + self.sbs 205 | self.parms = self.mps + self.sps 206 | self.input = self.hs[0] 207 | self.output = self.hs[-1] 208 | 209 | self.srng = RandomStreams() 210 | 211 | self.eval_f = None 212 | self.eval_f_rand = None 213 | 214 | def addLayer(self, n_units, type, rng=np.random): 215 | """Adds a new layer to the network, 216 | :param n_units: number of units in the layer 217 | :param type: a string specification of the activation function 218 | """ 219 | 220 | # check number of units 221 | assert util.math.isposint(n_units), 'Number of units must be a positive integer.' 
222 | 223 | # choose activation function 224 | actfun = util.ml.select_theano_act_function(type, dtype) 225 | 226 | n_prev_units = self.n_outputs 227 | self.n_outputs = n_units 228 | self.n_units.append(n_units) 229 | self.n_layers += 1 230 | self.n_params += 2 * (n_prev_units + 1) * n_units 231 | 232 | mW = theano.shared((rng.randn(n_prev_units, n_units) / np.sqrt(n_prev_units + 1)).astype(dtype), name='mW' + str(self.n_layers), borrow=True) 233 | mb = theano.shared(np.zeros(n_units, dtype=dtype), name='mb' + str(self.n_layers), borrow=True) 234 | sW = theano.shared(-5.0 * np.ones([n_prev_units, n_units], dtype=dtype), name='sW' + str(self.n_layers), borrow=True) 235 | sb = theano.shared(-5.0 * np.ones(n_units, dtype=dtype), name='sb' + str(self.n_layers), borrow=True) 236 | ua = self.srng.normal((self.hs[-1].shape[0], n_units), dtype=dtype) 237 | ma = tt.dot(self.hs[-1], mW) + mb 238 | sa = tt.dot(self.hs[-1]**2, tt.exp(2*sW)) + tt.exp(2*sb) 239 | za = tt.sqrt(sa) * ua + ma 240 | h = actfun(za) 241 | h.name = 'h' + str(self.n_layers) 242 | 243 | self.mWs.append(mW) 244 | self.mbs.append(mb) 245 | self.sWs.append(sW) 246 | self.sbs.append(sb) 247 | self.uas.append(ua) 248 | self.mas.append(ma) 249 | self.zas.append(za) 250 | self.hs.append(h) 251 | 252 | self.mps = self.mWs + self.mbs 253 | self.sps = self.sWs + self.sbs 254 | self.parms = self.mps + self.sps 255 | self.output = self.hs[-1] 256 | 257 | self.eval_f = None 258 | self.eval_f_rand = None 259 | 260 | def removeLayer(self): 261 | """Removes a layer from the network.""" 262 | 263 | assert self.n_layers > 0, 'There is no layer to remove.' 264 | 265 | n_params_to_rem = 2 * self.n_outputs * (self.n_units[-2] + 1) 266 | self.n_outputs = self.n_units[-2] 267 | self.n_units.pop() 268 | self.n_layers -= 1 269 | self.n_params -= n_params_to_rem 270 | 271 | self.mWs.pop() 272 | self.mbs.pop() 273 | self.sWs.pop() 274 | self.sbs.pop() 275 | self.uas.pop() 276 | self.mas.pop() 277 | self.zas.pop() 278 | self.hs.pop() 279 | 280 | self.mps = self.mWs + self.mbs 281 | self.sps = self.sWs + self.sbs 282 | self.parms = self.mps + self.sps 283 | self.output = self.hs[-1] 284 | 285 | self.eval_f = None 286 | self.eval_f_rand = None 287 | 288 | def eval(self, x, rand=False): 289 | """Evaluate net at locations in x.""" 290 | 291 | x = np.asarray(x, dtype=dtype) 292 | 293 | if rand: 294 | 295 | # compile theano computation graph, if haven't already done so 296 | if self.eval_f_rand is None: 297 | 298 | n_data = tt.iscalar('n_data') 299 | uas = [tt.tile(self.srng.normal((n_units,), dtype=dtype), [n_data, 1]) for n_units in self.n_units[1:]] 300 | 301 | self.eval_f_rand = theano.function( 302 | inputs=[self.hs[0], n_data], 303 | outputs=self.hs[-1], 304 | givens=zip(self.uas, uas) 305 | ) 306 | 307 | return self.eval_f_rand(x[np.newaxis, :], 1)[0] if x.ndim == 1 else self.eval_f_rand(x, x.shape[0]) 308 | 309 | else: 310 | 311 | # compile theano computation graph, if haven't already done so 312 | if self.eval_f is None: 313 | self.eval_f = theano.function( 314 | inputs=[self.hs[0]], 315 | outputs=self.hs[-1], 316 | givens=zip(self.zas, self.mas) 317 | ) 318 | 319 | return self.eval_f(x[np.newaxis, :])[0] if x.ndim == 1 else self.eval_f(x) 320 | 321 | def printInfo(self): 322 | """Prints some useful info about the net.""" 323 | 324 | print 'Number of inputs =', self.n_inputs 325 | print 'Number of outputs =', self.n_outputs 326 | print 'Number of units =', self.n_units 327 | print 'Number of layers =', self.n_layers 328 | print 'Number of params =', 
self.n_params 329 | print 'Data type =', dtype 330 | 331 | def visualize_weights(self, layer, imsize, layout): 332 | """ 333 | Displays the weights of a specified layer as images. 334 | :param layer: the layer whose weights to display 335 | :param imsize: the image size 336 | :param layout: number of rows and columns for each page 337 | :return: none 338 | """ 339 | 340 | util.plot.disp_imdata(self.mWs[layer].get_value().T, imsize, layout) 341 | plt.show(block=False) 342 | 343 | def visualize_activations(self, x, layers=None): 344 | """ 345 | Visualizes the activations of specified layers caused by a given data minibatch. 346 | :param x: a minibatch of data 347 | :param layers: list of layers to visualize activations of; defaults to the whole net except the input layer 348 | :return: none 349 | """ 350 | 351 | if layers is None: 352 | layers = xrange(self.n_layers) 353 | 354 | forwprop = theano.function( 355 | inputs=[self.hs[0]], 356 | outputs=self.hs[1:] 357 | ) 358 | hs = forwprop(x.astype(dtype)) 359 | 360 | for l in layers: 361 | 362 | fig = plt.figure() 363 | ax = fig.add_subplot(1, 1, 1) 364 | ax.imshow(hs[l], cmap='gray', interpolation='none') 365 | ax.set_title('Layer ' + str(l)) 366 | ax.set_xlabel('layer units') 367 | ax.set_ylabel('data points') 368 | 369 | plt.show(block=False) 370 | 371 | def param_hist(self, layers=None): 372 | """ 373 | Displays a histogram of weights and biases for specified layers. 374 | :param layers: list of layers to show histograms for; defaults to the whole net 375 | :return: none 376 | """ 377 | 378 | if layers is None: 379 | layers = xrange(self.n_layers) 380 | 381 | for l in layers: 382 | 383 | fig, axs = plt.subplots(2, 2) 384 | 385 | nbins = int(np.sqrt(self.mWs[l].get_value().size)) 386 | axs[0, 0].hist(self.mWs[l].get_value().flatten(), nbins, normed=True) 387 | axs[0, 0].set_title('weight means, layer ' + str(l)) 388 | axs[1, 0].hist(self.sWs[l].get_value().flatten(), nbins, normed=True) 389 | axs[1, 0].set_title('weight log stds, layer ' + str(l)) 390 | 391 | nbins = int(np.sqrt(self.mbs[l].get_value().size)) 392 | axs[0, 1].hist(self.mbs[l].get_value(), nbins, normed=True) 393 | axs[0, 1].set_title('bias means, layer ' + str(l)) 394 | axs[1, 1].hist(self.sbs[l].get_value(), nbins, normed=True) 395 | axs[1, 1].set_title('bias log stds, layer ' + str(l)) 396 | 397 | plt.show(block=False) 398 | -------------------------------------------------------------------------------- /ml/trainers.py: -------------------------------------------------------------------------------- 1 | from itertools import izip 2 | 3 | import os 4 | import sys 5 | import numpy as np 6 | import theano 7 | import theano.tensor as tt 8 | import matplotlib.pyplot as plt 9 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 10 | 11 | import ml.step_strategies as ss 12 | import ml.data_streams as ds 13 | import ml.ensembles as ensembles 14 | import util.math 15 | 16 | 17 | dtype = theano.config.floatX 18 | 19 | 20 | class SGD: 21 | """ 22 | Minibatch stochastic gradient descent. Can work with a variety of step strategies, and supports early stopping on 23 | validation set. 24 | """ 25 | 26 | def __init__(self, model, trn_data, trn_loss, trn_target=None, val_data=None, val_loss=None, val_target=None, step=ss.Adam()): 27 | """ 28 | Constructs and configures the trainer. 
29 | :param model: the model to be trained 30 | :param trn_data: train inputs and (possibly) train targets 31 | :param trn_loss: theano variable representing the train loss to minimize 32 | :param trn_target: theano variable representing the train target 33 | :param val_data: validation inputs and (possibly) validation targets 34 | :param val_loss: theano variable representing the validation loss 35 | :param val_target: theano variable representing the validation target 36 | :param step: step size strategy object 37 | :return: None 38 | """ 39 | 40 | # parse input 41 | # TODO: it would be good to type check the other inputs too 42 | assert isinstance(step, ss.StepStrategy), 'Step must be a step strategy object.' 43 | 44 | # prepare train data 45 | n_trn_data_list = set([x.shape[0] for x in trn_data]) 46 | assert len(n_trn_data_list) == 1, 'Number of train data is not consistent.' 47 | self.n_trn_data = list(n_trn_data_list)[0] 48 | trn_data = [theano.shared(x.astype(dtype), borrow=True) for x in trn_data] 49 | 50 | # compile theano function for a single training update 51 | grads = tt.grad(trn_loss, model.parms) 52 | idx = tt.ivector('idx') 53 | trn_inputs = [model.input] if trn_target is None else [model.input, trn_target] 54 | self.make_update = theano.function( 55 | inputs=[idx], 56 | outputs=trn_loss, 57 | givens=zip(trn_inputs, [x[idx] for x in trn_data]), 58 | updates=step.updates(model.parms, grads) 59 | ) 60 | 61 | # if model uses batch norm, compile a theano function for setting up stats 62 | if getattr(model, 'batch_norm', False): 63 | batch_norm_givens = [(bn.m, bn.bm) for bn in model.bns] + [(bn.v, bn.bv) for bn in model.bns] 64 | self.set_batch_norm_stats = theano.function( 65 | inputs=[], 66 | givens=zip(trn_inputs, trn_data), 67 | updates=[(bn.bm, bn.m) for bn in model.bns] + [(bn.bv, bn.v) for bn in model.bns] 68 | ) 69 | else: 70 | self.set_batch_norm_stats = None 71 | batch_norm_givens = [] 72 | 73 | # if validation data is given, then set up validation too 74 | self.do_validation = val_data is not None 75 | 76 | if self.do_validation: 77 | 78 | # prepare validation data 79 | n_val_data_list = set([x.shape[0] for x in val_data]) 80 | assert len(n_val_data_list) == 1, 'Number of validation data is not consistent.' 81 | self.n_val_data = list(n_val_data_list)[0] 82 | val_data = [theano.shared(x.astype(dtype), borrow=True) for x in val_data] 83 | 84 | # compile theano function for validation 85 | val_inputs = [model.input] if val_target is None else [model.input, val_target] 86 | self.validate = theano.function( 87 | inputs=[], 88 | outputs=val_loss, 89 | givens=zip(val_inputs, val_data) + batch_norm_givens 90 | ) 91 | 92 | # create checkpointer to store best model 93 | self.checkpointer = ModelCheckpointer(model) 94 | self.best_val_loss = float('inf') 95 | 96 | # initialize some variables 97 | self.trn_loss = float('inf') 98 | self.idx_stream = ds.IndexSubSampler(self.n_trn_data, rng=np.random.RandomState(42)) 99 | 100 | def train(self, minibatch=None, tol=None, maxepochs=None, monitor_every=None, patience=None, logger=sys.stdout, show_progress=False, val_in_same_plot=True): 101 | """ 102 | Trains the model. 103 | :param minibatch: minibatch size 104 | :param tol: tolerance 105 | :param maxepochs: maximum number of epochs 106 | :param monitor_every: monitoring frequency 107 | :param patience: maximum number of validation steps to wait for improvement before early stopping 108 | :param logger: logger for logging messages. 
If None, no logging takes place
109 |         :param show_progress: if True, plot training and validation progress
110 |         :param val_in_same_plot: if True, plot validation progress in same plot as training progress
111 |         :return: None
112 |         """
113 | 
114 |         # parse input
115 |         assert minibatch is None or util.math.isposint(minibatch), 'Minibatch size must be a positive integer or None.'
116 |         assert tol is None or tol > 0.0, 'Tolerance must be positive or None.'
117 |         assert maxepochs is None or maxepochs > 0.0, 'Maximum number of epochs must be positive or None.'
118 |         assert monitor_every is None or monitor_every > 0.0, 'Monitoring frequency must be positive or None.'
119 |         assert patience is None or util.math.isposint(patience), 'Patience must be a positive integer or None.'
120 |         assert isinstance(show_progress, bool), 'show_progress must be boolean.'
121 |         assert isinstance(val_in_same_plot, bool), 'val_in_same_plot must be boolean.'
122 | 
123 |         # initialize some variables
124 |         iter = 0
125 |         progress_epc = []
126 |         progress_trn = []
127 |         progress_val = []
128 |         minibatch = self.n_trn_data if minibatch is None else minibatch
129 |         maxiter = float('inf') if maxepochs is None else np.ceil(maxepochs * self.n_trn_data / float(minibatch))
130 |         monitor_every = float('inf') if monitor_every is None else np.ceil(monitor_every * self.n_trn_data / float(minibatch))
131 |         patience = float('inf') if patience is None else patience
132 |         patience_left = patience
133 |         best_epoch = None
134 |         logger = open(os.devnull, 'w') if logger is None else logger
135 | 
136 |         # main training loop
137 |         while True:
138 | 
139 |             # make update to parameters
140 |             trn_loss = self.make_update(self.idx_stream.gen(minibatch))
141 |             diff = self.trn_loss - trn_loss
142 |             iter += 1
143 |             self.trn_loss = trn_loss
144 | 
145 |             if iter % monitor_every == 0:
146 | 
147 |                 epoch = iter * float(minibatch) / self.n_trn_data
148 | 
149 |                 # do validation
150 |                 if self.do_validation:
151 |                     if self.set_batch_norm_stats is not None: self.set_batch_norm_stats()
152 |                     val_loss = self.validate()
153 |                     patience_left -= 1
154 | 
155 |                     if val_loss < self.best_val_loss:
156 |                         self.best_val_loss = val_loss
157 |                         self.checkpointer.checkpoint()
158 |                         best_epoch = epoch
159 |                         patience_left = patience
160 | 
161 |                 # monitor progress
162 |                 if show_progress:
163 |                     progress_epc.append(epoch)
164 |                     progress_trn.append(trn_loss)
165 |                     if self.do_validation: progress_val.append(val_loss)
166 | 
167 |                 # log info
168 |                 if self.do_validation:
169 |                     logger.write('Epoch = {0:.2f}, train loss = {1}, validation loss = {2}\n'.format(epoch, trn_loss, val_loss))
170 |                 else:
171 |                     logger.write('Epoch = {0:.2f}, train loss = {1}\n'.format(epoch, trn_loss))
172 | 
173 |             # check for convergence
174 |             if abs(diff) < tol or iter >= maxiter or patience_left <= 0:
175 |                 if self.do_validation: self.checkpointer.restore()
176 |                 if self.set_batch_norm_stats is not None: self.set_batch_norm_stats()
177 |                 break
178 | 
179 |         # plot progress
180 |         if show_progress:
181 | 
182 |             if self.do_validation:
183 | 
184 |                 if val_in_same_plot:
185 |                     fig, ax = plt.subplots(1, 1)
186 |                     ax.semilogx(progress_epc, progress_trn, 'b', label='training')
187 |                     ax.semilogx(progress_epc, progress_val, 'r', label='validation')
188 |                     ax.vlines(best_epoch, ax.get_ylim()[0], ax.get_ylim()[1], color='g', linestyles='dashed', label='best')
189 |                     ax.set_xlabel('epochs')
190 |                     ax.set_ylabel('loss')
191 |                     ax.legend()
192 |                     ax.set_title('Training progress')
193 | 
194 |                 else:
195 |                     fig, (ax1, ax2) = plt.subplots(2, 1,
sharex=True) 196 | ax1.semilogx(progress_epc, progress_trn, 'b') 197 | ax2.semilogx(progress_epc, progress_val, 'r') 198 | ax1.vlines(best_epoch, ax1.get_ylim()[0], ax1.get_ylim()[1], color='g', linestyles='dashed', label='best') 199 | ax2.vlines(best_epoch, ax2.get_ylim()[0], ax2.get_ylim()[1], color='g', linestyles='dashed', label='best') 200 | ax2.set_xlabel('epochs') 201 | ax1.set_ylabel('training loss') 202 | ax2.set_ylabel('validation loss') 203 | fig.suptitle('Training progress') 204 | 205 | else: 206 | fig, ax = plt.subplots(1, 1) 207 | ax.semilogx(progress_epc, progress_trn, 'b') 208 | ax.set_xlabel('epochs') 209 | ax.set_ylabel('training loss') 210 | ax.legend() 211 | ax.set_title('Training progress') 212 | 213 | plt.show(block=False) 214 | 215 | 216 | class HMC: 217 | """ 218 | Hamiltonian Monte Carlo training of models. Uses a quadratic kinetic energy. Trained model is an ensemble of 219 | posterior model samples. 220 | """ 221 | 222 | def __init__(self, model, trn_data, trn_loss, trn_target): 223 | """ 224 | :param model: model to train 225 | :param trn_data: train data 226 | :param trn_loss: train loss 227 | :param trn_target: train target 228 | """ 229 | 230 | # prepare train data 231 | n_trn_data_list = set([x.shape[0] for x in trn_data]) 232 | assert len(n_trn_data_list) == 1, 'Number of train data is not consistent.' 233 | trn_data = [theano.shared(x.astype(dtype)) for x in trn_data] 234 | 235 | # prepare train inputs 236 | trn_inputs = [model.input] if trn_target is None else [model.input, trn_target] 237 | 238 | # potential energy 239 | self.U = theano.function( 240 | inputs=[], 241 | outputs=trn_loss, 242 | givens=zip(trn_inputs, trn_data) 243 | ) 244 | 245 | # theano variables 246 | step = tt.scalar('step') 247 | mass = tt.scalar('mass') 248 | srng = RandomStreams() 249 | 250 | # theano function for drawing random momentum variables 251 | ps = [theano.shared(np.zeros_like(x.get_value(borrow=True)), borrow=True) for x in model.parms] 252 | ps_rand = [srng.normal(x.get_value().shape, std=tt.sqrt(mass), dtype=dtype) for x in model.parms] 253 | ps_rand = [tt.unbroadcast(pr, *range(x.get_value().ndim)) for pr, x in izip(ps_rand, model.parms)] 254 | self.draw_momentum = theano.function( 255 | inputs=[mass], 256 | updates=zip(ps, ps_rand), 257 | allow_input_downcast=True 258 | ) 259 | 260 | # theano function for calculating kinetic energy 261 | K = sum([tt.sum(p**2) for p in ps]) / (2.0 * mass) 262 | self.calc_kinetic = theano.function( 263 | inputs=[mass], 264 | outputs=K, 265 | allow_input_downcast=True 266 | ) 267 | 268 | # theano function for updating momentum variables 269 | dUs = tt.grad(trn_loss, model.parms) 270 | new_ps = [p - step * dU for p, dU in izip(ps, dUs)] 271 | self.update_momentum = theano.function( 272 | inputs=[step], 273 | updates=zip(ps, new_ps), 274 | givens=zip(trn_inputs, trn_data), 275 | allow_input_downcast=True 276 | ) 277 | 278 | # theano function for updating model parameters 279 | new_parms = [x + step / mass * p for x, p in izip(model.parms, ps)] 280 | self.update_parms = theano.function( 281 | inputs=[step, mass], 282 | updates=zip(model.parms, new_parms), 283 | allow_input_downcast=True 284 | ) 285 | 286 | # initialize 287 | self.U_prev = self.U() 288 | self.model = model 289 | 290 | def gen(self, n_samples, L, me, m=1.0, logger=sys.stdout, show_traces=False, rng=np.random): 291 | """ 292 | Generates HMC samples. 
293 | :param n_samples: number of samples 294 | :param L: number of leapfrog steps 295 | :param me: mean of time step 296 | :param m: mass 297 | :param logger: logger for logging messages. If None, no logging takes place 298 | :param show_traces: whether to plot info at the end of sampling 299 | :param rng: random number generator to use 300 | :return: an ensemble of model samples 301 | """ 302 | 303 | # initialize 304 | n_acc = 0 305 | U_trace = [] 306 | H_error_trace = [] 307 | acc_rate_trace = [] 308 | xs = self.model.parms 309 | ensemble = ensembles.FastEnsemble(self.model, copy=True) 310 | ensemble.add_new(xs, copy=True) 311 | logger = open(os.devnull, 'w') if logger is None else logger 312 | 313 | for n in xrange(n_samples): 314 | 315 | # sample momentum from a gaussian 316 | self.draw_momentum(m) 317 | K_prev = self.calc_kinetic(m) 318 | 319 | # simulate hamiltonian dynamics with leapfrog method 320 | e = -me * np.log(1 - rng.rand()) 321 | self.update_momentum(0.5 * e) 322 | for _ in xrange(L-1): 323 | self.update_parms(e, m) 324 | self.update_momentum(e) 325 | self.update_parms(e, m) 326 | self.update_momentum(0.5 * e) 327 | # negating p is not necessary, because kinetic energy is symmetric 328 | 329 | # metropolis acceptance rule 330 | U_new = self.U() 331 | K_new = self.calc_kinetic(m) 332 | H_err = (U_new + K_new) - (self.U_prev + K_prev) 333 | if rng.rand() < np.exp(-H_err): 334 | self.U_prev = U_new 335 | n_acc += 1 336 | ensemble.add_new(xs, copy=True) 337 | else: 338 | for i, x in enumerate(ensemble.parms[-1]): 339 | xs[i].set_value(x.copy()) 340 | ensemble.add_existing(-1) 341 | 342 | # acceptance rate 343 | acc_rate = n_acc / float(n+1) 344 | logger.write('sample = {0}, acc rate = {1:.2%}, hamiltonian error = {2:.2}\n'.format(n+1, acc_rate, H_err)) 345 | 346 | # record traces 347 | if show_traces: 348 | U_trace.append(self.U_prev) 349 | H_error_trace.append(H_err) 350 | acc_rate_trace.append(acc_rate) 351 | 352 | ensemble.remove(0) 353 | 354 | # show plot with the traces 355 | if show_traces: 356 | 357 | fig, ax = plt.subplots(3, 1, sharex=True) 358 | ax[0].plot(U_trace) 359 | ax[0].set_ylabel('potential energy') 360 | ax[1].plot(H_error_trace) 361 | ax[1].set_ylabel('hamiltonian error') 362 | ax[2].plot(acc_rate_trace) 363 | ax[2].set_ylim([0, 1]) 364 | ax[2].set_ylabel('acceptance rate') 365 | ax[2].set_xlabel('samples') 366 | fig.suptitle('HMC progress') 367 | 368 | parm_traces = ensemble.get_traces() 369 | fig, axs = plt.subplots(len(parm_traces), sharex=True) 370 | for ax, p in izip(axs, parm_traces): 371 | ax.plot(p) 372 | axs[-1].set_xlabel('samples') 373 | fig.suptitle('Parameter traces') 374 | 375 | plt.show(block=False) 376 | 377 | return ensemble 378 | 379 | 380 | class ModelCheckpointer: 381 | """ 382 | Helper class which makes checkpoints of a given model. 383 | Currently one checkpoint is supported; checkpointing twice overwrites previous checkpoint. 384 | """ 385 | 386 | def __init__(self, model): 387 | """ 388 | :param model: A machine learning model to be checkpointed. 389 | """ 390 | self.model = model 391 | self.checkpointed_parms = [np.empty_like(p.get_value()) for p in model.parms] 392 | 393 | def checkpoint(self): 394 | """ 395 | Checkpoints current model. Overwrites previous checkpoint. 396 | """ 397 | for i, p in enumerate(self.model.parms): 398 | self.checkpointed_parms[i] = p.get_value().copy() 399 | 400 | def restore(self): 401 | """ 402 | Restores last checkpointed model. 
403 | """ 404 | for i, p in enumerate(self.checkpointed_parms): 405 | self.model.parms[i].set_value(p) 406 | --------------------------------------------------------------------------------
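The pieces above are wired together by `bnn_demo.py` (not shown in this section). For orientation only, here is a minimal usage sketch, not part of the repository, written in Python 2 to match the codebase: it builds a `FeedforwardNet_SVI`, trains it with the `SGD` trainer on toy data, and draws stochastic predictions. The activation strings `'relu'` and `'logistic'` and the plain cross-entropy loss are assumptions for illustration; the accepted activation names are defined in `util/ml.py`, and the actual demo additionally adds a KL term from `ml/loss_functions.py` to obtain the full SVI objective.

```
# usage sketch -- illustrative only, not part of the repository
import numpy as np
import theano.tensor as tt

import ml.neural_nets as nn
import ml.trainers as trainers

# toy binary classification data
xs = np.random.randn(200, 2)
ys = (xs[:, 0] + xs[:, 1] > 0).astype(float)[:, np.newaxis]

# build a Bayesian net: 2 inputs -> 10 hidden units -> 1 output
net = nn.FeedforwardNet_SVI(2)
net.addLayer(10, 'relu')      # activation string assumed; valid names live in util/ml.py
net.addLayer(1, 'logistic')   # likewise assumed

# cross-entropy between net output and targets; the real demo also adds the
# KL regularizer from ml/loss_functions.py to get the full SVI objective
y = tt.matrix('y')
trn_loss = -tt.mean(y * tt.log(net.output) + (1 - y) * tt.log(1 - net.output))

# train with minibatch stochastic gradient descent (Adam step strategy by default)
trainer = trainers.SGD(model=net, trn_data=[xs, ys], trn_loss=trn_loss, trn_target=y)
trainer.train(minibatch=20, maxepochs=500, monitor_every=10)

# a stochastic forward pass draws a fresh set of weights each time
print net.eval(xs[:5], rand=True)
```

Swapping the `SGD` trainer for `HMC(model, trn_data, trn_loss, trn_target).gen(n_samples, L, me)` would instead return an ensemble of posterior parameter samples, as implemented by the `HMC` class above.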