├── ml ├── __init__.py ├── loss_functions.py ├── data_streams.py ├── step_strategies.py ├── ensembles.py ├── neural_nets.py └── trainers.py ├── util ├── __init__.py ├── misc.py ├── io.py ├── ml.py ├── math.py └── plot.py ├── .gitignore ├── README.md ├── run.py ├── LICENCE.txt └── bnn_demo.py /ml/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # specific files and foders 2 | .idea/ 3 | 4 | # python bytecode files 5 | *.pyc 6 | *.pyo 7 | 8 | # text editor's temporary files 9 | *~ 10 | 11 | -------------------------------------------------------------------------------- /util/misc.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | 4 | 5 | def remove_whitespace(str): 6 | """ 7 | Returns the string str with all whitespace removed. 8 | """ 9 | 10 | p = re.compile(r'\s+') 11 | return p.sub('', str) 12 | 13 | 14 | def get_environment(): 15 | """ 16 | Returns a string identifying the current environment. 17 | """ 18 | 19 | try: 20 | hostname = os.environ['SHORT_HOSTNAME'] 21 | 22 | except KeyError: 23 | 24 | try: 25 | hostname = os.environ['HOSTNAME'] 26 | 27 | except KeyError: 28 | return 'scratch' 29 | 30 | if re.match('charles[0-9]{2,}|renown|anne', hostname): 31 | return 'cluster' 32 | 33 | else: 34 | return 'scratch' 35 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bayesian neural networks demo 2 | 3 | A demo of Bayesian neural networks on a toy binary classification problem. 4 | 5 | Two ways of implementing Bayesian neural networks are demonstrated: 6 | - Stochastic Variational Inference using local reparameterization [1] 7 | - Hamiltonian Monte Carlo [2] 8 | 9 | [1] Kingma et al., _Variational Dropout and the Local Reparameterization Trick_, NeurIPS 2015. [[arXiv]](https://arxiv.org/abs/1506.02557) 10 | 11 | [2] Neal, _MCMC using Hamiltonian dynamics_, Handbook of Markov Chain Monte Carlo, 2011. [[arXiv]](https://arxiv.org/abs/1206.1901) 12 | 13 | ## How to run the code 14 | 15 | Display the training data: 16 | ``` 17 | python run.py --show 18 | ``` 19 | 20 | Train a non-Bayesian neural network by minimizing cross-entropy: 21 | ``` 22 | python run.py --mle 23 | ``` 24 | 25 | Train a Bayesian neural network using Stochastic Variarional Inference: 26 | ``` 27 | python run.py --svi 28 | ``` 29 | 30 | Train a Bayesian neural network using Hamiltonian Monte Carlo: 31 | ``` 32 | python run.py --hmc 33 | ``` 34 | 35 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import bnn_demo 3 | 4 | 5 | def parse_args(): 6 | """ 7 | Returns an object describing the command line. 
8 | """ 9 | 10 | parser = argparse.ArgumentParser(description='Bayesian neural networks demo.') 11 | group = parser.add_mutually_exclusive_group() 12 | 13 | group.add_argument('--show', action='store_true', help='show the dataset') 14 | group.add_argument('--mle', action='store_true', help='train a non-bayesian net using maximum likelihood') 15 | group.add_argument('--svi', action='store_true', help='train a bayesian net using stochastic variational inference') 16 | group.add_argument('--hmc', action='store_true', help='train a bayesian net using hamiltonian monte carlo') 17 | 18 | return parser.parse_args() 19 | 20 | 21 | def main(): 22 | 23 | args = parse_args() 24 | 25 | if args.show: 26 | bnn_demo.show_train_data() 27 | 28 | elif args.mle: 29 | bnn_demo.fit_neural_net_demo() 30 | 31 | elif args.svi: 32 | bnn_demo.bayesian_neural_net_svi_demo() 33 | 34 | elif args.hmc: 35 | bnn_demo.bayesian_neural_net_hmc_demo() 36 | 37 | else: 38 | print('No action specified.') 39 | 40 | 41 | if __name__ == '__main__': 42 | main() 43 | -------------------------------------------------------------------------------- /LICENCE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018, George Papamakarios 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | 24 | The views and conclusions contained in the software and documentation are those 25 | of the authors and should not be interpreted as representing official policies, 26 | either expressed or implied, of anybody else. 27 | 28 | -------------------------------------------------------------------------------- /util/io.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import os 3 | import sys 4 | 5 | 6 | def save(data, file): 7 | """ 8 | Saves data to a file. 9 | """ 10 | 11 | dir = os.path.dirname(file) 12 | if dir: 13 | make_folder(dir) 14 | 15 | with open(file + '.pkl', 'w') as f: 16 | pickle.dump(data, f) 17 | 18 | 19 | def load(file): 20 | """ 21 | Loads data from file. 22 | """ 23 | 24 | with open(file + '.pkl', 'r') as f: 25 | data = pickle.load(f) 26 | 27 | return data 28 | 29 | 30 | def save_txt(str, file): 31 | """ 32 | Saves string to a text file. 
33 | """ 34 | 35 | dir = os.path.dirname(file) 36 | if dir: 37 | make_folder(dir) 38 | 39 | with open(file, 'w') as f: 40 | f.write(str) 41 | 42 | 43 | def load_txt(file): 44 | """ 45 | Loads string from text file. 46 | """ 47 | 48 | with open(file, 'r') as f: 49 | str = f.read() 50 | 51 | return str 52 | 53 | 54 | def make_folder(folder): 55 | """ 56 | Creates given folder (or path) if it doesn't exist. 57 | """ 58 | 59 | if not os.path.exists(folder): 60 | os.makedirs(folder) 61 | 62 | 63 | class Logger: 64 | """ 65 | Implements an object that logs messages to a file, as well as printing them on the sceen. 66 | """ 67 | 68 | def __init__(self, filename): 69 | """ 70 | :param filename: file to be created for logging 71 | """ 72 | self.f = open(filename, 'w') 73 | 74 | def write(self, msg): 75 | """ 76 | :param msg: string to be logged and printed on screen 77 | """ 78 | sys.stdout.write(msg) 79 | self.f.write(msg) 80 | 81 | def __enter__(self): 82 | """ 83 | Context management enter function. 84 | """ 85 | return self 86 | 87 | def __exit__(self, exc_type, exc_val, exc_tb): 88 | """ 89 | Context management exit function. Closes the file. 90 | """ 91 | self.f.close() 92 | return False 93 | -------------------------------------------------------------------------------- /ml/loss_functions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano.tensor as tt 3 | 4 | 5 | def SquareError(x): 6 | """Square error loss function.""" 7 | 8 | if x.ndim == 1: 9 | y = tt.vector('y') 10 | L = tt.mean((x - y) ** 2) 11 | 12 | elif x.ndim == 2: 13 | y = tt.matrix('y') 14 | L = tt.mean(tt.sum((x - y) ** 2, axis=1)) 15 | 16 | else: 17 | raise ValueError('x must be either a vector or a matrix.') 18 | 19 | L.name = 'loss' 20 | 21 | return y, L 22 | 23 | 24 | def CrossEntropy(x): 25 | """Cross entropy loss function. Only works for networks with one output.""" 26 | 27 | if x.ndim == 1: 28 | pass 29 | 30 | elif x.ndim == 2: 31 | x = x[:, 0] 32 | 33 | else: 34 | raise ValueError('x must be either a vector or a matrix.') 35 | 36 | y = tt.vector('y') 37 | L = -tt.mean(y * tt.log(x) + (1-y) * tt.log(1-x)) 38 | L.name = 'loss' 39 | 40 | return y, L 41 | 42 | 43 | def MultiCrossEntropy(x): 44 | """Cross entropy loss function with multiple outputs.""" 45 | 46 | assert x.ndim == 2, 'x must be a matrix.' 47 | 48 | y = tt.matrix('y') 49 | L = -tt.mean(tt.sum(y * tt.log(x), axis=1)) 50 | L.name = 'loss' 51 | 52 | return y, L 53 | 54 | 55 | def Accuracy(x): 56 | """Accuracy loss function. Mainly useful for validation.""" 57 | 58 | if x.ndim == 1: 59 | pass 60 | 61 | elif x.ndim == 2: 62 | x = x.argmax(axis=1) 63 | 64 | else: 65 | raise ValueError('x must be either a vector or a matrix.') 66 | 67 | y = tt.vector('y') 68 | L = tt.mean(tt.eq(y, x)) 69 | L.name = 'loss' 70 | 71 | return y, L 72 | 73 | 74 | def WeightDecay(ws, wdecay): 75 | """Weight decay regularization.""" 76 | 77 | assert wdecay > 0.0 78 | 79 | L = (wdecay / 2.0) * sum([tt.sum(w**2) for w in ws]) 80 | return L 81 | 82 | 83 | def SviRegularizer(mps, sps, wdecay): 84 | """ 85 | The type of regularization that is used in stochastic variational inference. Here, we assume that the prior is 86 | a spherical zero-centred gaussian whose precision corresponds to the weight decay parameter. 
87 | """ 88 | 89 | assert wdecay > 0.0 90 | 91 | n_params = sum([mp.get_value().size for mp in mps]) 92 | 93 | L1 = 0.5 * wdecay * (sum([tt.sum(mp**2) for mp in mps]) + sum([tt.sum(tt.exp(sp*2)) for sp in sps])) 94 | L2 = sum([tt.sum(sp) for sp in sps]) 95 | Lc = 0.5 * n_params * (1.0 + np.log(wdecay)) 96 | 97 | L = L1 - L2 - Lc 98 | 99 | return L 100 | -------------------------------------------------------------------------------- /util/ml.py: -------------------------------------------------------------------------------- 1 | from itertools import izip 2 | import numpy as np 3 | import theano 4 | import theano.tensor as tt 5 | 6 | 7 | def select_theano_act_function(name, dtype=theano.config.floatX): 8 | """ 9 | Given the name of an activation function, returns a handle for the corresponding function in theano. 10 | """ 11 | 12 | if name == 'logistic': 13 | clip = 15.0 if dtype == 'float32' else 19.0 14 | f = lambda x: tt.nnet.sigmoid(tt.clip(x, -clip, clip)) 15 | 16 | elif name == 'tanh': 17 | clip = 9.0 if dtype == 'float32' else 19.0 18 | f = lambda x: tt.tanh(tt.clip(x, -clip, clip)) 19 | 20 | elif name == 'linear': 21 | f = lambda x: x 22 | 23 | elif name == 'relu': 24 | f = tt.nnet.relu 25 | 26 | elif name == 'softplus': 27 | f = tt.nnet.softplus 28 | 29 | elif name == 'softmax': 30 | f = tt.nnet.softmax 31 | 32 | else: 33 | raise ValueError(name + ' is not a supported activation function type.') 34 | 35 | return f 36 | 37 | 38 | def copy_model_parms(source_model, target_model): 39 | """ 40 | Copies the parameters of source_model to target_model. 41 | """ 42 | 43 | for sp, tp in izip(source_model.parms, target_model.parms): 44 | tp.set_value(sp.get_value()) 45 | 46 | 47 | def one_hot_encode(labels, n_labels): 48 | """ 49 | Transforms numeric labels to 1-hot encoded labels. Assumes numeric labels are in the range 0, 1, ..., n_labels-1. 50 | """ 51 | 52 | assert np.min(labels) >= 0 and np.max(labels) < n_labels 53 | 54 | y = np.zeros([labels.size, n_labels]) 55 | y[xrange(labels.size), labels] = 1 56 | 57 | return y 58 | 59 | 60 | def prepare_cond_input(xy, dtype): 61 | """ 62 | Prepares the conditional input for model evaluation. 63 | :param xy: tuple (x, y) for evaluating p(y|x) 64 | :param dtype: data type 65 | :return: prepared x, y and flag whether single datapoint input 66 | """ 67 | 68 | x, y = xy 69 | x = np.asarray(x, dtype=dtype) 70 | y = np.asarray(y, dtype=dtype) 71 | 72 | one_datapoint = False 73 | 74 | if x.ndim == 1: 75 | 76 | if y.ndim == 1: 77 | x = x[np.newaxis, :] 78 | y = y[np.newaxis, :] 79 | one_datapoint = True 80 | 81 | else: 82 | x = np.tile(x, [y.shape[0], 1]) 83 | 84 | else: 85 | 86 | if y.ndim == 1: 87 | y = np.tile(y, [x.shape[0], 1]) 88 | 89 | else: 90 | assert x.shape[0] == y.shape[0], 'wrong sizes' 91 | 92 | return x, y, one_datapoint 93 | 94 | 95 | def are_parms_finite(model): 96 | """ 97 | Check whether all parameters of a model are finite. 98 | :param model: an ml model 99 | :return: False if at least one parameter is inf or nan 100 | """ 101 | 102 | check = True 103 | 104 | for p in model.parms: 105 | check = check and np.all(np.isfinite(p.get_value())) 106 | 107 | return check 108 | -------------------------------------------------------------------------------- /ml/data_streams.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | 4 | 5 | class DataStream: 6 | """Abstract class. Specifies the interface of a data stream. 
7 | The user can request from the stream to generate a new data batch of a 8 | specified size. Useful for online learning.""" 9 | 10 | def gen(self, N): 11 | """Generates a new data batch of size N.""" 12 | raise NotImplementedError('This is an abstract method and should be overriden.') 13 | 14 | 15 | class DataSubSampler(DataStream): 16 | """Given a data set, subsamples mini-batches from it.""" 17 | 18 | def __init__(self, xs): 19 | 20 | # check that input is of the right type 21 | check = lambda t: isinstance(t, np.ndarray) and t.size and t.ndim 22 | assert isinstance(xs, list) and xs, 'Input must be a non-empty list.' 23 | assert check(xs[0]), 'Data must be given as real nonempty arrays.' 24 | N = xs[0].shape[0] 25 | for x in xs[1:]: 26 | assert check(x), 'Data must be given as real nonempty arrays.' 27 | Nk = x.shape[0] 28 | assert N == Nk, 'All data arrays must have the same number of elements in their first dimension.' 29 | 30 | # set remaining class properties 31 | self.index_stream = IndexSubSampler(N) 32 | self.xs = [ theano.shared(x.astype(theano.config.floatX), name='data'+str(i)) for i, x in enumerate(xs) ] 33 | 34 | def gen(self, N): 35 | """Generates a new data batch of size N from the data set.""" 36 | 37 | assert isinstance(N, int) and N > 0, 'Batch size must be a positive integer.' 38 | 39 | n = self.index_stream.gen(N) 40 | return [x[n] for x in self.xs] 41 | 42 | 43 | class IndexSubSampler(DataStream): 44 | """Subsamples minibatches of indices.""" 45 | 46 | def __init__(self, num_idx, rng=np.random): 47 | 48 | assert isinstance(num_idx, int) and num_idx > 0, 'Number of indices must be a positive integer.' 49 | 50 | self.num_idx = num_idx 51 | self.nn = range(num_idx) 52 | rng.shuffle(self.nn) 53 | self.i = 0 54 | self.rng = rng 55 | 56 | def gen(self, N): 57 | """Generates a new index batch of size N from 0:num_idx-1.""" 58 | 59 | assert isinstance(N, int) and N > 0, 'Batch size must be a positive integer.' 60 | 61 | j = self.i + N 62 | times = j // self.num_idx 63 | new_i = j % self.num_idx 64 | n = [] 65 | 66 | for t in xrange(times): 67 | n += self.nn[self.i:] 68 | self.rng.shuffle(self.nn) 69 | self.i = 0 70 | 71 | n += self.nn[self.i:new_i] 72 | self.i = new_i 73 | 74 | return n 75 | 76 | 77 | class IndexSubSamplerSeq(DataStream): 78 | """Subsamples minibatches of indices. Indices are sequentially grouped into minibatches.""" 79 | 80 | def __init__(self, num_idx): 81 | 82 | assert isinstance(num_idx, int) and num_idx > 0, 'Number of indices must be a positive integer.' 83 | 84 | self.num_idx = num_idx 85 | self.nn = range(num_idx) 86 | self.i = 0 87 | 88 | def gen(self, N): 89 | """Generates a new index batch of size N from 0:num_idx-1.""" 90 | 91 | assert isinstance(N, int) and N > 0, 'Batch size must be a positive integer.' 92 | 93 | j = self.i + N 94 | times = j // self.num_idx 95 | new_i = j % self.num_idx 96 | n = [] 97 | 98 | for t in xrange(times): 99 | n += self.nn[self.i:] 100 | self.i = 0 101 | 102 | n += self.nn[self.i:new_i] 103 | self.i = new_i 104 | 105 | return n 106 | -------------------------------------------------------------------------------- /util/math.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.misc 3 | 4 | 5 | def isposint(n): 6 | """ 7 | Determines whether number n is a positive integer. 
8 | :param n: number 9 | :return: bool 10 | """ 11 | return isinstance(n, int) and n > 0 12 | 13 | 14 | def isdistribution(p): 15 | """ 16 | :param p: a vector representing a discrete probability distribution 17 | :return: True if p is a valid probability distribution 18 | """ 19 | return np.all(p >= 0.0) and np.isclose(np.sum(p), 1.0) 20 | 21 | 22 | def logistic(x): 23 | """ 24 | Elementwise logistic sigmoid. 25 | :param x: numpy array 26 | :return: numpy array 27 | """ 28 | return 1.0 / (1.0 + np.exp(-x)) 29 | 30 | 31 | def logit(x): 32 | """ 33 | Elementwise logit (inverse logistic sigmoid). 34 | :param x: numpy array 35 | :return: numpy array 36 | """ 37 | return np.log(x / (1.0 - x)) 38 | 39 | 40 | def discrete_sample(p, n_samples=None, rng=np.random): 41 | """ 42 | Samples from a discrete distribution. 43 | :param p: a distribution with N elements 44 | :param n_samples: number of samples, only 1 if None 45 | :return: vector of samples 46 | """ 47 | 48 | # check distribution 49 | # assert isdistribution(p), 'Probabilities must be non-negative and sum to one.' 50 | 51 | one_sample = n_samples is None 52 | 53 | # cumulative distribution 54 | c = np.cumsum(p[:-1])[np.newaxis, :] 55 | 56 | # get the samples 57 | r = rng.rand(1 if one_sample else n_samples, 1) 58 | samples = np.sum((r > c).astype(int), axis=1) 59 | 60 | return samples[0] if one_sample else samples 61 | 62 | 63 | def importance_sample(target, proposal, n_samples, rng=np.random): 64 | """ 65 | Importance sampling. 66 | :param target: target distribution 67 | :param proposal: proposal distribution 68 | :param n_samples: number of samples 69 | :param rng: random generator to use 70 | :return: samples, normalized log weights 71 | """ 72 | 73 | xs = proposal.gen(n_samples, rng=rng) 74 | log_ws = target.eval(xs, log=True) - proposal.eval(xs, log=True) 75 | log_ws -= scipy.misc.logsumexp(log_ws) 76 | 77 | return xs, log_ws 78 | 79 | 80 | def ess_importance(ws): 81 | """ 82 | Calculates the effective sample size of a set of weighted independent samples (e.g. as given by importance 83 | sampling or sequential monte carlo). Takes as input the normalized sample weights. 84 | """ 85 | 86 | ess = 1.0 / np.sum(ws ** 2) 87 | return ess 88 | 89 | 90 | def ess_mcmc(xs): 91 | """ 92 | Calculates the effective sample size of a correlated sequence of samples, e.g. as given by markov chain monte 93 | carlo. 94 | """ 95 | 96 | n_samples, n_dim = xs.shape 97 | 98 | mean = np.mean(xs, axis=0) 99 | xms = xs - mean 100 | 101 | acors = np.zeros_like(xms) 102 | for i in xrange(n_dim): 103 | for lag in xrange(n_samples): 104 | acor = np.sum(xms[:n_samples-lag, i] * xms[lag:, i]) / (n_samples - lag) 105 | if acor <= 0.0: break 106 | acors[lag, i] = acor 107 | 108 | act = 1.0 + 2.0 * np.sum(acors[1:], axis=0) / acors[0] 109 | ess = n_samples / act 110 | 111 | return np.min(ess) 112 | 113 | 114 | def calc_whitening_transform(xs): 115 | """ 116 | Calculates the parameters that whiten a dataset. 117 | """ 118 | 119 | assert xs.ndim == 2, 'Data must be a matrix' 120 | N = xs.shape[0] 121 | 122 | means = np.mean(xs, axis=0) 123 | ys = xs - means 124 | 125 | cov = np.dot(ys.T, ys) / N 126 | vars, U = np.linalg.eig(cov) 127 | istds = np.sqrt(1.0 / vars) 128 | 129 | return means, U, istds 130 | 131 | 132 | def whiten(xs, params): 133 | """ 134 | Whitens a given dataset using the whitening transform provided. 
135 | """ 136 | 137 | means, U, istds = params 138 | 139 | ys = xs.copy() 140 | ys -= means 141 | ys = np.dot(ys, U) 142 | ys *= istds 143 | 144 | return ys 145 | 146 | 147 | def de_whiten(xs, params): 148 | """ 149 | De-whitens a given dataset using the whitening transform provided. 150 | """ 151 | 152 | means, U, istds = params 153 | 154 | ys = xs.copy() 155 | ys /= istds 156 | ys = np.dot(ys, U.T) 157 | ys += means 158 | 159 | return ys 160 | -------------------------------------------------------------------------------- /ml/step_strategies.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from itertools import izip 3 | import numpy as np 4 | import theano 5 | import theano.tensor as tt 6 | 7 | 8 | class StepStrategy: 9 | """Abstract class for the step size strategy of stochastic gradient training.""" 10 | 11 | def updates(self, parms, grads): 12 | """Given current gradient, return a list of updates to be made.""" 13 | raise NotImplementedError('This is an abstract method and should be overriden.') 14 | 15 | 16 | class ConstantStep(StepStrategy): 17 | """Step size strategy where the learning rate is held constant.""" 18 | 19 | def __init__(self, step): 20 | """ 21 | Constructor. 22 | :param step: the constant step size to be used 23 | """ 24 | assert step > 0.0, 'Step size must be positive.' 25 | self.step = step 26 | 27 | def updates(self, parms, grads): 28 | """No updates to be made; step size is held constant throughout.""" 29 | new_parms = [p - self.step*g for p, g in izip(parms, grads)] 30 | return zip(parms, new_parms) 31 | 32 | 33 | class LinearDecay(StepStrategy): 34 | """Step size strategy where the learning rate is linearly decreased so as to 35 | hit zero after a specified number of iterations.""" 36 | 37 | def __init__(self, init, maxiter): 38 | """ 39 | Constructor. 40 | :param init: initial step size 41 | :param maxiter: maximum number of iterations. 42 | """ 43 | assert init > 0.0, 'Step size must be positive.' 44 | assert isinstance(maxiter, int) and maxiter > 0, 'Maximum number of iterations must be a positive integer.' 45 | 46 | self.init = init 47 | self.maxiter = maxiter 48 | 49 | def updates(self, parms, grads): 50 | """Next step is linearly decayed.""" 51 | step = theano.shared(np.asarray(self.init, dtype=theano.config.floatX), name='step') 52 | new_step = step - self.init / self.maxiter 53 | new_parms = [p - step*g for p, g in izip(parms, grads)] 54 | return [(step, new_step)] + zip(parms, new_parms) 55 | 56 | 57 | class AdaDelta(StepStrategy): 58 | """ADADELTA step size strategy. For details, see: 59 | M. D. Zeiler, "ADADELTA: An adaptive learning rate method", arXiv, 2012.""" 60 | 61 | def __init__(self, rho=0.95, eps=1.0e-6): 62 | """Constructor. Sets adadelta's hyperparameters.""" 63 | assert eps > 0, 'eps must be positive.' 64 | assert 0 < rho < 1, 'rho must be strictly between 0 and 1.' 
65 | 66 | self.eps = eps 67 | self.rho = rho 68 | 69 | def updates(self, parms, grads): 70 | """Return a list of updates to be made, both to adadelta's accumulators and the parameters.""" 71 | 72 | acc_gs = [theano.shared(np.zeros_like(p.get_value(borrow=True)), borrow=True) for p in parms] 73 | acc_ds = [theano.shared(np.zeros_like(p.get_value(borrow=True)), borrow=True) for p in parms] 74 | 75 | new_acc_gs = [self.rho * ag + (1-self.rho) * g**2 for g, ag in izip(grads, acc_gs)] 76 | ds = [tt.sqrt((ad + self.eps) / (ag + self.eps)) * g for g, ag, ad in izip(grads, new_acc_gs, acc_ds)] 77 | new_acc_ds = [self.rho * ad + (1-self.rho) * d**2 for d, ad in izip(ds, acc_ds)] 78 | new_parms = [p - d for p, d in izip(parms, ds)] 79 | 80 | return zip(acc_gs, new_acc_gs) + zip(acc_ds, new_acc_ds) + zip(parms, new_parms) 81 | 82 | 83 | class Adam(StepStrategy): 84 | """Adam step size strategy. For details, see: 85 | D. Kingma and J. Ba, "Adam: A Method for Stochastic Optimization", ICLR, 2015.""" 86 | 87 | def __init__(self, a=0.001, bm=0.9, bv=0.999, eps=1.0e-8): 88 | """Constructor. Sets adam's hyperparameters.""" 89 | assert a > 0, 'a must be positive.' 90 | assert 0 < bm < 1, 'bm must be strictly between 0 and 1.' 91 | assert 0 < bv < 1, 'bv must be strictly between 0 and 1.' 92 | assert eps > 0, 'eps must be positive.' 93 | 94 | self.a = a 95 | self.bm = bm 96 | self.bv = bv 97 | self.eps = eps 98 | 99 | def updates(self, parms, grads): 100 | """Return a list of updates to be made, both to adams's running averages and the parameters.""" 101 | 102 | bm_t = theano.shared(np.asarray(self.bm).astype(theano.config.floatX)) 103 | bv_t = theano.shared(np.asarray(self.bv).astype(theano.config.floatX)) 104 | 105 | new_bm_t = bm_t * self.bm 106 | new_bv_t = bv_t * self.bv 107 | 108 | acc_m = [theano.shared(np.zeros_like(p.get_value(borrow=True)), borrow=True) for p in parms] 109 | acc_v = [theano.shared(np.zeros_like(p.get_value(borrow=True)), borrow=True) for p in parms] 110 | 111 | new_acc_m = [self.bm * am + (1-self.bm) * g for g, am in izip(grads, acc_m)] 112 | new_acc_v = [self.bv * av + (1-self.bv) * g**2 for g, av in izip(grads, acc_v)] 113 | 114 | step = self.a * tt.sqrt(1-new_bv_t) / (1-new_bm_t) 115 | eps = self.eps * (1-new_bv_t) 116 | ds = [step * am / tt.sqrt(av + eps) for am, av in izip(new_acc_m, new_acc_v)] 117 | 118 | new_parms = [p - d for p, d in izip(parms, ds)] 119 | 120 | return zip([bm_t, bv_t], [new_bm_t, new_bv_t]) + zip(acc_m, new_acc_m) + zip(acc_v, new_acc_v) + zip(parms, new_parms) 121 | -------------------------------------------------------------------------------- /ml/ensembles.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from scipy.misc import logsumexp 3 | from copy import deepcopy 4 | from itertools import izip 5 | import numpy as np 6 | 7 | 8 | class Ensemble: 9 | """ 10 | Implements an ensemble of other models. 
11 | """ 12 | 13 | def __init__(self): 14 | """Initializes the ensemble as empty.""" 15 | 16 | self.models = [] 17 | self.n_copies = [] 18 | self.n_models = 0 19 | self.n_diff_models = 0 20 | self.n_inputs = 0 21 | self.n_outputs = 0 22 | 23 | def add_new(self, model, copy=False): 24 | """Adds a new model to the ensemble.""" 25 | 26 | if self.n_models == 0: 27 | self.n_inputs = model.n_inputs 28 | self.n_outputs = model.n_outputs 29 | 30 | else: 31 | assert self.n_inputs == model.n_inputs 32 | assert self.n_outputs == model.n_outputs 33 | 34 | if copy: 35 | self.models.append(deepcopy(model)) 36 | else: 37 | self.models.append(model) 38 | self.n_copies.append(1) 39 | self.n_models += 1 40 | self.n_diff_models += 1 41 | 42 | def add_existing(self, i): 43 | """Adds an extra copy of model i in the ensemble.""" 44 | 45 | self.n_copies[i] += 1 46 | self.n_models += 1 47 | 48 | def remove(self, i): 49 | """Removes a model at position i from the ensemble.""" 50 | 51 | self.n_copies[i] -= 1 52 | if self.n_copies[i] == 0: 53 | del self.models[i] 54 | del self.n_copies[i] 55 | self.n_diff_models -= 1 56 | 57 | self.n_models -= 1 58 | if self.n_models == 0: 59 | self.n_inputs = 0 60 | self.n_outputs = 0 61 | 62 | def eval(self, x): 63 | """Evaluates ensemble at given input x.""" 64 | 65 | # NOTE that there is the potential drawback in this implementation that x is moved back and forth to the gpu 66 | 67 | assert self.n_models > 0, 'Ensemble is empty.' 68 | 69 | y = 0.0 70 | 71 | for model, copies in izip(self.models, self.n_copies): 72 | y += copies * model.eval(x) 73 | 74 | y /= self.n_models 75 | 76 | return y 77 | 78 | def eval_model(self, i, x): 79 | """Evaluates model i in the ensemble at given input x.""" 80 | 81 | return self.models[i].eval(x) 82 | 83 | 84 | class FastEnsemble: 85 | """ 86 | Implements an ensemble of other models. Maintains only a single model, and a list of different parameter matrices. 87 | As a result, it is faster to create and more memory efficient, but slower to evaluate. 88 | """ 89 | 90 | def __init__(self, model, copy=False): 91 | """Initializes the ensemble as empty.""" 92 | 93 | self.model = deepcopy(model) if copy else model 94 | self.parms = [] 95 | self.n_copies = [] 96 | self.n_models = 0 97 | self.n_diff_models = 0 98 | self.n_inputs = model.n_inputs 99 | self.n_outputs = model.n_outputs 100 | 101 | def _load_model(self, i): 102 | """Loads parameters for model i.""" 103 | 104 | for j, p in enumerate(self.parms[i]): 105 | self.model.parms[j].set_value(p) 106 | 107 | def add_new(self, parms, copy=False): 108 | """Adds a new set of parameters to the ensemble.""" 109 | 110 | if copy: 111 | self.parms.append([x.get_value().copy() for x in parms]) 112 | else: 113 | self.parms.append([x.get_value() for x in parms]) 114 | self.n_copies.append(1) 115 | self.n_models += 1 116 | self.n_diff_models += 1 117 | 118 | def add_existing(self, i): 119 | """Adds an extra copy of model i in the ensemble.""" 120 | 121 | self.n_copies[i] += 1 122 | self.n_models += 1 123 | 124 | def remove(self, i): 125 | """Removes a model at position i from the ensemble.""" 126 | 127 | self.n_copies[i] -= 1 128 | 129 | if self.n_copies[i] == 0: 130 | del self.parms[i] 131 | del self.n_copies[i] 132 | self.n_diff_models -= 1 133 | 134 | def eval(self, x, mode='mean'): 135 | """Evaluates ensemble at given input x.""" 136 | 137 | # NOTE that there is the potential drawback in this implementation that x is moved back and forth to the gpu 138 | 139 | assert self.n_models > 0, 'Ensemble is empty.' 
140 | 141 | if mode == 'mean': 142 | 143 | y = 0.0 144 | 145 | for i, copies in enumerate(self.n_copies): 146 | self._load_model(i) 147 | y += copies * self.model.eval(x) 148 | 149 | y /= self.n_models 150 | 151 | elif mode == 'logmeanexp': 152 | 153 | y = [] 154 | 155 | for i, copies in enumerate(self.n_copies): 156 | self._load_model(i) 157 | y.append(np.log(copies) + self.model.eval(x)) 158 | 159 | y = logsumexp(np.array(y), axis=0) - np.log(self.n_models) 160 | 161 | else: 162 | raise ValueError('Unknown averaging mode.') 163 | 164 | return y 165 | 166 | def eval_model(self, i, x): 167 | """Evaluates model i in the ensemble at given input x.""" 168 | 169 | self._load_model(i) 170 | return self.model.eval(x) 171 | 172 | def get_traces(self): 173 | """Returns matrices whose columns are traces of parameters, in the order they where added to the ensemble.""" 174 | 175 | all_traces = [] 176 | 177 | for i in xrange(len(self.model.parms)): 178 | 179 | traces = [] 180 | 181 | for params, copies in izip(self.parms, self.n_copies): 182 | for n in xrange(copies): 183 | traces.append(params[i].flatten()) 184 | 185 | all_traces.append(np.array(traces)) 186 | 187 | return all_traces -------------------------------------------------------------------------------- /util/plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | def disp_imdata(xs, imsize, layout=(1, 1)): 6 | """ 7 | Displays an array of images, a page at a time. The user can navigate pages with 8 | left and right arrows, start over by pressing space, or close the figure by esc. 9 | :param xs: an numpy array with images as rows 10 | :param imsize: size of the images 11 | :param layout: layout of images in a page 12 | :return: none 13 | """ 14 | 15 | num_plots = np.prod(layout) 16 | num_xs = xs.shape[0] 17 | idx = [0] 18 | 19 | # create a figure with subplots 20 | fig, axs = plt.subplots(layout[0], layout[1]) 21 | 22 | if isinstance(axs, np.ndarray): 23 | axs = axs.flatten() 24 | else: 25 | axs = [axs] 26 | 27 | for ax in axs: 28 | ax.axes.get_xaxis().set_visible(False) 29 | ax.axes.get_yaxis().set_visible(False) 30 | 31 | def plot_page(): 32 | """Plots the next page.""" 33 | 34 | ii = np.arange(idx[0], idx[0]+num_plots) % num_xs 35 | 36 | for ax, i in zip(axs, ii): 37 | ax.imshow(xs[i].reshape(imsize), cmap='gray', interpolation='none') 38 | ax.set_title(str(i)) 39 | 40 | fig.canvas.draw() 41 | 42 | def on_key_event(event): 43 | """Event handler after key press.""" 44 | 45 | key = event.key 46 | 47 | if key == 'right': 48 | # show next page 49 | idx[0] = (idx[0] + num_plots) % num_xs 50 | plot_page() 51 | 52 | elif key == 'left': 53 | # show previous page 54 | idx[0] = (idx[0] - num_plots) % num_xs 55 | plot_page() 56 | 57 | elif key == ' ': 58 | # show first page 59 | idx[0] = 0 60 | plot_page() 61 | 62 | elif key == 'escape': 63 | # close figure 64 | plt.close(fig) 65 | 66 | fig.canvas.mpl_connect('key_press_event', on_key_event) 67 | plot_page() 68 | 69 | 70 | def probs2contours(probs, levels): 71 | """ 72 | Takes an array of probabilities and produces an array of contours at specified percentile levels 73 | :param probs: probability array. doesn't have to sum to 1, but it is assumed it contains all the mass 74 | :param levels: percentile levels. 
have to be in [0.0, 1.0] 75 | :return: array of same shape as probs with percentile labels 76 | """ 77 | 78 | # make sure all contour levels are in [0.0, 1.0] 79 | levels = np.asarray(levels) 80 | assert np.all(levels <= 1.0) and np.all(levels >= 0.0) 81 | 82 | # flatten probability array 83 | shape = probs.shape 84 | probs = probs.flatten() 85 | 86 | # sort probabilities in descending order 87 | idx_sort = probs.argsort()[::-1] 88 | idx_unsort = idx_sort.argsort() 89 | probs = probs[idx_sort] 90 | 91 | # cumulative probabilities 92 | cum_probs = probs.cumsum() 93 | cum_probs /= cum_probs[-1] 94 | 95 | # create contours at levels 96 | contours = np.ones_like(cum_probs) 97 | levels = np.sort(levels)[::-1] 98 | for level in levels: 99 | contours[cum_probs <= level] = level 100 | 101 | # make sure contours have the order and the shape of the original probability array 102 | contours = np.reshape(contours[idx_unsort], shape) 103 | 104 | return contours 105 | 106 | 107 | def plot_pdf_marginals(pdf, lims, gt=None, levels=(0.68, 0.95)): 108 | """ 109 | Plots marginals of a pdf, for each variable and pair of variables. 110 | """ 111 | 112 | if pdf.ndim == 1: 113 | 114 | fig, ax = plt.subplots(1, 1) 115 | xx = np.linspace(lims[0], lims[1], 200) 116 | 117 | pp = pdf.eval(xx[:, np.newaxis], log=False) 118 | ax.plot(xx, pp) 119 | ax.set_xlim(lims) 120 | ax.set_ylim([0, ax.get_ylim()[1]]) 121 | if gt is not None: ax.vlines(gt, 0, ax.get_ylim()[1], color='r') 122 | 123 | else: 124 | 125 | fig = plt.figure() 126 | 127 | lims = np.asarray(lims) 128 | lims = np.tile(lims, [pdf.ndim, 1]) if lims.ndim == 1 else lims 129 | 130 | for i in xrange(pdf.ndim): 131 | for j in xrange(i + 1): 132 | 133 | ax = fig.add_subplot(pdf.ndim, pdf.ndim, i * pdf.ndim + j + 1) 134 | 135 | if i == j: 136 | xx = np.linspace(lims[i, 0], lims[i, 1], 500) 137 | pp = pdf.eval(xx[:, np.newaxis], ii=[i], log=False) 138 | ax.plot(xx, pp) 139 | ax.set_xlim(lims[i]) 140 | ax.set_ylim([0, ax.get_ylim()[1]]) 141 | if gt is not None: ax.vlines(gt[i], 0, ax.get_ylim()[1], color='r') 142 | 143 | else: 144 | xx = np.linspace(lims[i, 0], lims[i, 1], 200) 145 | yy = np.linspace(lims[j ,0], lims[j, 1], 200) 146 | X, Y = np.meshgrid(xx, yy) 147 | xy = np.concatenate([X.reshape([-1, 1]), Y.reshape([-1, 1])], axis=1) 148 | pp = pdf.eval(xy, ii=[i, j], log=False) 149 | pp = pp.reshape(list(X.shape)) 150 | ax.contour(X, Y, probs2contours(pp, levels), levels) 151 | ax.set_xlim(lims[i]) 152 | ax.set_ylim(lims[j]) 153 | if gt is not None: ax.plot(gt[i], gt[j], 'r.', ms=8) 154 | 155 | plt.show(block=False) 156 | 157 | return fig 158 | 159 | 160 | def plot_hist_marginals(data, weights=None, lims=None, gt=None): 161 | """ 162 | Plots marginal histograms and pairwise scatter plots of a dataset. 
163 | """ 164 | 165 | n_bins = int(np.sqrt(data.shape[0])) 166 | 167 | if data.ndim == 1: 168 | 169 | fig, ax = plt.subplots(1, 1) 170 | ax.hist(data, weights=weights, bins=n_bins, normed=True) 171 | ax.set_ylim([0, ax.get_ylim()[1]]) 172 | if lims is not None: ax.set_xlim(lims) 173 | if gt is not None: ax.vlines(gt, 0, ax.get_ylim()[1], color='r') 174 | 175 | else: 176 | 177 | n_dim = data.shape[1] 178 | fig = plt.figure() 179 | 180 | if weights is None: 181 | col = 'k' 182 | vmin, vmax = None, None 183 | else: 184 | col = weights 185 | vmin, vmax = 0., np.max(weights) 186 | 187 | if lims is not None: 188 | lims = np.asarray(lims) 189 | lims = np.tile(lims, [n_dim, 1]) if lims.ndim == 1 else lims 190 | 191 | for i in xrange(n_dim): 192 | for j in xrange(i + 1): 193 | 194 | ax = fig.add_subplot(n_dim, n_dim, i * n_dim + j + 1) 195 | 196 | if i == j: 197 | ax.hist(data[:, i], weights=weights, bins=n_bins, normed=True) 198 | ax.set_ylim([0, ax.get_ylim()[1]]) 199 | if lims is not None: ax.set_xlim(lims[i]) 200 | if gt is not None: ax.vlines(gt[i], 0, ax.get_ylim()[1], color='r') 201 | 202 | else: 203 | ax.scatter(data[:, i], data[:, j], c=col, s=3, marker='o', vmin=vmin, vmax=vmax, cmap='binary', edgecolors='none') 204 | if lims is not None: 205 | ax.set_xlim(lims[i]) 206 | ax.set_ylim(lims[j]) 207 | if gt is not None: ax.scatter(gt[i], gt[j], c='r', s=12, marker='o', edgecolors='none') 208 | 209 | plt.show(block=False) 210 | 211 | return fig 212 | 213 | 214 | def plot_traces(xs): 215 | """ 216 | Plots sample traces. Useful for MCMC. 217 | :param xs: # samples x # vars numpy array 218 | :return: figure and axes handles 219 | """ 220 | 221 | N = xs.shape[1] 222 | fig, ax = plt.subplots(N, 1, sharex=True) 223 | 224 | for i in xrange(N): 225 | ax[i].plot(xs[:, i]) 226 | 227 | ax[-1].set_xlabel('samples') 228 | plt.show(block=False) 229 | 230 | return fig, ax 231 | -------------------------------------------------------------------------------- /bnn_demo.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import numpy as np 4 | import numpy.random as rng 5 | import matplotlib.pyplot as plt 6 | import matplotlib.cm as cm 7 | from mpl_toolkits.mplot3d import Axes3D 8 | 9 | import ml.trainers as trainers 10 | import ml.neural_nets as nn 11 | import ml.loss_functions as lf 12 | 13 | wdecay = 0.0001 14 | 15 | 16 | def create_dataset(): 17 | """ 18 | Creates a small dataset of 2d points in two linearly separable classes. 19 | :return: datapoints, labels 20 | """ 21 | 22 | data_per_class = 12 23 | 24 | rng_state = rng.get_state() 25 | rng.seed(0) 26 | 27 | x1 = rng.multivariate_normal([-6, 0], np.eye(2), data_per_class) 28 | x2 = rng.multivariate_normal([+6, 0], np.eye(2), data_per_class) 29 | 30 | y1 = np.zeros(data_per_class) 31 | y2 = np.ones(data_per_class) 32 | 33 | xs = np.concatenate([x1, x2], axis=0) 34 | ys = np.concatenate([y1, y2], axis=0) 35 | 36 | rng.set_state(rng_state) 37 | 38 | return xs, ys 39 | 40 | 41 | def create_net(svi=False): 42 | """ 43 | Creates a feedforward neural net. 44 | :param svi: whether the neural net should be SVI enabled 45 | :return: the net 46 | """ 47 | 48 | if svi: 49 | net = nn.FeedforwardNet_SVI(2) 50 | else: 51 | net = nn.FeedforwardNet(2) 52 | 53 | net.addLayer(10, 'relu') 54 | net.addLayer(1, 'logistic') 55 | 56 | return net 57 | 58 | 59 | def create_grid(xmin, xmax, N): 60 | """ 61 | Creates a grid for 3d plotting. 
62 | :param xmin: lower limit 63 | :param xmax: upper limit 64 | :param N: number of points in the grid per dimension 65 | :return: the grid 66 | """ 67 | 68 | xx = np.linspace(xmin, xmax, N) 69 | X, Y = np.meshgrid(xx, xx) 70 | data = np.concatenate([X.reshape([-1, 1]), Y.reshape([-1, 1])], axis=1) 71 | 72 | return data, X, Y 73 | 74 | 75 | def show_train_data(): 76 | """ 77 | Plots the training data. 78 | """ 79 | 80 | xs, ys = create_dataset() 81 | 82 | plt.figure() 83 | plt.plot(xs[ys == 0, 0], xs[ys == 0, 1], 'b.', ms=12) 84 | plt.plot(xs[ys == 1, 0], xs[ys == 1, 1], 'r.', ms=12) 85 | plt.xlabel('x1') 86 | plt.ylabel('x2') 87 | plt.axis('equal') 88 | plt.axis([-12, 12, -12, 12]) 89 | plt.title('Training data') 90 | 91 | plt.show() 92 | 93 | 94 | def fit_neural_net_demo(): 95 | """ 96 | Fits a non-bayesian neural net to the training data by minimizing cross entropy. 97 | """ 98 | 99 | xs, ys = create_dataset() 100 | net = create_net() 101 | 102 | # train the net 103 | trn_target, trn_loss = lf.CrossEntropy(net.output) 104 | regularizer = lf.WeightDecay(net.parms, wdecay) 105 | trainer = trainers.SGD( 106 | model=net, 107 | trn_data=[xs, ys], 108 | trn_loss=trn_loss + regularizer / xs.shape[0], 109 | trn_target=trn_target 110 | ) 111 | trainer.train(tol=1.0e-9, monitor_every=10, show_progress=True) 112 | 113 | # make predictions 114 | tst_data, X, Y = create_grid(-12, 12, 50) 115 | pred = net.eval(tst_data) 116 | 117 | # plot the prediction surface 118 | fig = plt.figure() 119 | ax = fig.gca(projection='3d') 120 | Z = pred.reshape(list(X.shape)) 121 | ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0) 122 | ax.plot(xs[ys == 0, 0], xs[ys == 0, 1], 'b.', ms=12) 123 | ax.plot(xs[ys == 1, 0], xs[ys == 1, 1], 'r.', ms=12) 124 | ax.view_init(elev=90, azim=-90) 125 | plt.xlabel('x1') 126 | plt.ylabel('x2') 127 | plt.axis('equal') 128 | ax.axis([-12, 12, -12, 12]) 129 | fig.suptitle('Prediction surface of trained net') 130 | 131 | plt.show() 132 | 133 | 134 | def bayesian_neural_net_svi_demo(): 135 | """ 136 | Trains a bayesian neural net on the training set using Stochastic Variational Inference. 
137 | """ 138 | 139 | xs, ys = create_dataset() 140 | net = create_net(svi=True) 141 | tst_data, X, Y = create_grid(-12, 12, 50) 142 | 143 | # train the net 144 | trn_target, trn_loss = lf.CrossEntropy(net.output) 145 | regularizer = lf.SviRegularizer(net.mps, net.sps, wdecay) 146 | trainer = trainers.SGD( 147 | model=net, 148 | trn_data=[xs, ys], 149 | trn_loss=trn_loss + regularizer / xs.shape[0], 150 | trn_target=trn_target 151 | ) 152 | trainer.train(maxepochs=80000, monitor_every=10, show_progress=True) 153 | 154 | # make predictions with zero noise 155 | base_pred = net.eval(tst_data, rand=False) 156 | 157 | # make predictions by averaging samples 158 | n_samples = 1000 159 | avg_pred = 0.0 160 | for _ in xrange(n_samples): 161 | avg_pred += net.eval(tst_data, rand=True) 162 | avg_pred /= n_samples 163 | 164 | # plot the base prediction surface 165 | fig = plt.figure() 166 | ax = fig.gca(projection='3d') 167 | Z = base_pred.reshape(list(X.shape)) 168 | ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0) 169 | ax.plot(xs[ys == 0, 0], xs[ys == 0, 1], 'b.', ms=12) 170 | ax.plot(xs[ys == 1, 0], xs[ys == 1, 1], 'r.', ms=12) 171 | ax.view_init(elev=90, azim=-90) 172 | plt.xlabel('x1') 173 | plt.ylabel('x2') 174 | plt.axis('equal') 175 | ax.axis([-12, 12, -12, 12]) 176 | fig.suptitle('Prediction surface using average weights') 177 | 178 | # plot the average prediction surface 179 | fig = plt.figure() 180 | ax = fig.gca(projection='3d') 181 | Z = avg_pred.reshape(list(X.shape)) 182 | ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0) 183 | ax.plot(xs[ys == 0, 0], xs[ys == 0, 1], 'b.', ms=12) 184 | ax.plot(xs[ys == 1, 0], xs[ys == 1, 1], 'r.', ms=12) 185 | ax.view_init(elev=90, azim=-90) 186 | plt.xlabel('x1') 187 | plt.ylabel('x2') 188 | plt.axis('equal') 189 | ax.axis([-12, 12, -12, 12]) 190 | fig.suptitle('Bayesian prediction surface') 191 | 192 | # plot the sample prediction surfaces 193 | fig = plt.figure() 194 | fig.suptitle('Sample prediction surfaces') 195 | 196 | for i in xrange(6): 197 | 198 | sample_pred = net.eval(tst_data, rand=True) 199 | 200 | ax = fig.add_subplot(2, 3, i+1, projection='3d') 201 | Z = sample_pred.reshape(list(X.shape)) 202 | ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0) 203 | ax.plot(xs[ys == 0, 0], xs[ys == 0, 1], 'b.', ms=12) 204 | ax.plot(xs[ys == 1, 0], xs[ys == 1, 1], 'r.', ms=12) 205 | ax.view_init(elev=90, azim=-90) 206 | plt.xlabel('x1') 207 | plt.ylabel('x2') 208 | plt.axis('equal') 209 | ax.axis([-12, 12, -12, 12]) 210 | 211 | plt.show() 212 | 213 | 214 | def bayesian_neural_net_hmc_demo(): 215 | """ 216 | Trains a bayesian neural net on the training set using Hamiltonian Monte Carlo. 
217 | """ 218 | 219 | xs, ys = create_dataset() 220 | net = create_net() 221 | tst_data, X, Y = create_grid(-12, 12, 50) 222 | 223 | # make predictions on a grid of points 224 | trn_target, trn_loss = lf.CrossEntropy(net.output) 225 | regularizer = lf.WeightDecay(net.parms, wdecay) 226 | sampler = trainers.HMC( 227 | model=net, 228 | trn_data=[xs, ys], 229 | trn_loss=xs.shape[0] * trn_loss + regularizer, 230 | trn_target=trn_target 231 | ) 232 | ensemble = sampler.gen( 233 | n_samples=2000, 234 | L=100, 235 | me=0.3, 236 | show_traces=True 237 | ) 238 | avg_pred = ensemble.eval(tst_data) 239 | 240 | # plot the prediction surface 241 | fig = plt.figure() 242 | ax = fig.gca(projection='3d') 243 | Z = avg_pred.reshape(list(X.shape)) 244 | ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0) 245 | ax.plot(xs[ys == 0, 0], xs[ys == 0, 1], 'b.', ms=12) 246 | ax.plot(xs[ys == 1, 0], xs[ys == 1, 1], 'r.', ms=12) 247 | ax.view_init(elev=90, azim=-90) 248 | plt.xlabel('x1') 249 | plt.ylabel('x2') 250 | plt.axis('equal') 251 | ax.axis([-12, 12, -12, 12]) 252 | fig.suptitle('Bayesian prediction surface') 253 | 254 | # plot the prediction surfaces of a few sample networks 255 | fig = plt.figure() 256 | fig.suptitle('Sample prediction surfaces') 257 | 258 | for c, i in enumerate(rng.randint(0, ensemble.n_diff_models, 6)): 259 | 260 | ax = fig.add_subplot(2, 3, c+1, projection='3d') 261 | Z = ensemble.eval_model(i, tst_data).reshape(list(X.shape)) 262 | ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=0) 263 | ax.plot(xs[ys == 0, 0], xs[ys == 0, 1], 'b.', ms=12) 264 | ax.plot(xs[ys == 1, 0], xs[ys == 1, 1], 'r.', ms=12) 265 | ax.view_init(elev=90, azim=-90) 266 | plt.xlabel('x1') 267 | plt.ylabel('x2') 268 | plt.axis('equal') 269 | ax.axis([-12, 12, -12, 12]) 270 | 271 | plt.show() 272 | -------------------------------------------------------------------------------- /ml/neural_nets.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import numpy as np 4 | import theano 5 | import theano.tensor as tt 6 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 7 | import matplotlib.pyplot as plt 8 | 9 | import util.math 10 | import util.ml 11 | import util.plot 12 | 13 | dtype = theano.config.floatX 14 | 15 | 16 | class FeedforwardNet: 17 | """Implements a feedforward neural network. 18 | Supports various types of layers and loss functions.""" 19 | 20 | def __init__(self, n_inputs, input=None): 21 | """Constructs a net with a given number of inputs and no layers.""" 22 | 23 | assert util.math.isposint(n_inputs), 'Number of inputs must be a positive integer.' 24 | 25 | self.n_inputs = n_inputs 26 | self.n_outputs = n_inputs 27 | self.n_units = [n_inputs] 28 | self.n_layers = 0 29 | self.n_params = 0 30 | 31 | self.Ws = [] 32 | self.bs = [] 33 | self.hs = [tt.matrix('x') if input is None else input] 34 | self.parms = self.Ws + self.bs 35 | self.input = self.hs[0] 36 | self.output = self.hs[-1] 37 | 38 | self.eval_f = None 39 | 40 | def addLayer(self, n_units, type, rng=np.random): 41 | """Adds a new layer to the network, 42 | :param n_units: number of units in the layer 43 | :param type: a string specification of the activation function 44 | """ 45 | 46 | # check number of units 47 | assert util.math.isposint(n_units), 'Number of units must be a positive integer.' 
48 | 49 | # choose activation function 50 | actfun = util.ml.select_theano_act_function(type, dtype) 51 | 52 | n_prev_units = self.n_outputs 53 | self.n_outputs = n_units 54 | self.n_units.append(n_units) 55 | self.n_layers += 1 56 | self.n_params += (n_prev_units + 1) * n_units 57 | 58 | W = theano.shared((rng.randn(n_prev_units, n_units) / np.sqrt(n_prev_units + 1)).astype(dtype), name='W' + str(self.n_layers), borrow=True) 59 | b = theano.shared(np.zeros(n_units, dtype=dtype), name='b' + str(self.n_layers), borrow=True) 60 | h = actfun(tt.dot(self.hs[-1], W) + b) 61 | h.name = 'h' + str(self.n_layers) 62 | 63 | self.Ws.append(W) 64 | self.bs.append(b) 65 | self.hs.append(h) 66 | self.parms = self.Ws + self.bs 67 | self.output = self.hs[-1] 68 | 69 | self.eval_f = None 70 | 71 | def removeLayer(self): 72 | """Removes a layer from the network.""" 73 | 74 | assert self.n_layers > 0, 'There is no layer to remove.' 75 | 76 | n_params_to_rem = self.n_outputs * (self.n_units[-2] + 1) 77 | self.n_outputs = self.n_units[-2] 78 | self.n_units.pop() 79 | self.n_layers -= 1 80 | self.n_params -= n_params_to_rem 81 | 82 | self.Ws.pop() 83 | self.bs.pop() 84 | self.hs.pop() 85 | self.parms = self.Ws + self.bs 86 | self.output = self.hs[-1] 87 | 88 | self.eval_f = None 89 | 90 | def eval(self, x): 91 | """Evaluate net at locations in x.""" 92 | 93 | # compile theano computation graph, if haven't already done so 94 | if self.eval_f is None: 95 | self.eval_f = theano.function( 96 | inputs=[self.hs[0]], 97 | outputs=self.hs[-1] 98 | ) 99 | 100 | x = np.asarray(x, dtype=dtype) 101 | 102 | return self.eval_f(x[np.newaxis, :])[0] if x.ndim == 1 else self.eval_f(x) 103 | 104 | def printInfo(self): 105 | """Prints some useful info about the net.""" 106 | 107 | print 'Number of inputs =', self.n_inputs 108 | print 'Number of outputs =', self.n_outputs 109 | print 'Number of units =', self.n_units 110 | print 'Number of layers =', self.n_layers 111 | print 'Number of params =', self.n_params 112 | print 'Data type =', dtype 113 | 114 | def visualize_weights(self, layer, imsize, layout): 115 | """ 116 | Displays the weights of a specified layer as images. 117 | :param layer: the layer whose weights to display 118 | :param imsize: the image size 119 | :param layout: number of rows and columns for each page 120 | :return: none 121 | """ 122 | 123 | util.plot.disp_imdata(self.Ws[layer].get_value().T, imsize, layout) 124 | plt.show(block=False) 125 | 126 | def visualize_activations(self, x, layers=None): 127 | """ 128 | Visualizes the activations of specified layers caused by a given data minibatch. 129 | :param x: a minibatch of data 130 | :param layers: list of layers to visualize activations of; defaults to the whole net except the input layer 131 | :return: none 132 | """ 133 | 134 | if layers is None: 135 | layers = xrange(self.n_layers) 136 | 137 | forwprop = theano.function( 138 | inputs=[self.hs[0]], 139 | outputs=self.hs[1:] 140 | ) 141 | hs = forwprop(x.astype(dtype)) 142 | 143 | for l in layers: 144 | 145 | fig = plt.figure() 146 | ax = fig.add_subplot(1, 1, 1) 147 | ax.imshow(hs[l], cmap='gray', interpolation='none') 148 | ax.set_title('Layer ' + str(l)) 149 | ax.set_xlabel('layer units') 150 | ax.set_ylabel('data points') 151 | 152 | plt.show(block=False) 153 | 154 | def param_hist(self, layers=None): 155 | """ 156 | Displays a histogram of weights and biases for specified layers. 
157 | :param layers: list of layers to show histograms for; defaults to the whole net 158 | :return: none 159 | """ 160 | 161 | if layers is None: 162 | layers = xrange(self.n_layers) 163 | 164 | for l in layers: 165 | 166 | fig, (ax1, ax2) = plt.subplots(1, 2) 167 | 168 | nbins = int(np.sqrt(self.Ws[l].get_value().size)) 169 | ax1.hist(self.Ws[l].get_value().flatten(), nbins, normed=True) 170 | ax1.set_title('weights, layer ' + str(l)) 171 | 172 | nbins = int(np.sqrt(self.bs[l].get_value().size)) 173 | ax2.hist(self.bs[l].get_value(), nbins, normed=True) 174 | ax2.set_title('biases, layer ' + str(l)) 175 | 176 | plt.show(block=False) 177 | 178 | 179 | class FeedforwardNet_SVI: 180 | """Implements a feedforward neural network trained using stochastic variational inference. 181 | Supports various types of layers and loss functions.""" 182 | 183 | def __init__(self, n_inputs): 184 | """Constructs a net with a given number of inputs and no layers.""" 185 | 186 | assert util.math.isposint(n_inputs), 'Number of inputs must be a positive integer.' 187 | 188 | self.n_inputs = n_inputs 189 | self.n_outputs = n_inputs 190 | self.n_units = [n_inputs] 191 | self.n_layers = 0 192 | self.n_params = 0 193 | 194 | self.mWs = [] 195 | self.mbs = [] 196 | self.sWs = [] 197 | self.sbs = [] 198 | self.uas = [] 199 | self.mas = [] 200 | self.zas = [] 201 | self.hs = [tt.matrix('x')] 202 | 203 | self.mps = self.mWs + self.mbs 204 | self.sps = self.sWs + self.sbs 205 | self.parms = self.mps + self.sps 206 | self.input = self.hs[0] 207 | self.output = self.hs[-1] 208 | 209 | self.srng = RandomStreams() 210 | 211 | self.eval_f = None 212 | self.eval_f_rand = None 213 | 214 | def addLayer(self, n_units, type, rng=np.random): 215 | """Adds a new layer to the network, 216 | :param n_units: number of units in the layer 217 | :param type: a string specification of the activation function 218 | """ 219 | 220 | # check number of units 221 | assert util.math.isposint(n_units), 'Number of units must be a positive integer.' 
222 | 223 | # choose activation function 224 | actfun = util.ml.select_theano_act_function(type, dtype) 225 | 226 | n_prev_units = self.n_outputs 227 | self.n_outputs = n_units 228 | self.n_units.append(n_units) 229 | self.n_layers += 1 230 | self.n_params += 2 * (n_prev_units + 1) * n_units 231 | 232 | mW = theano.shared((rng.randn(n_prev_units, n_units) / np.sqrt(n_prev_units + 1)).astype(dtype), name='mW' + str(self.n_layers), borrow=True) 233 | mb = theano.shared(np.zeros(n_units, dtype=dtype), name='mb' + str(self.n_layers), borrow=True) 234 | sW = theano.shared(-5.0 * np.ones([n_prev_units, n_units], dtype=dtype), name='sW' + str(self.n_layers), borrow=True) 235 | sb = theano.shared(-5.0 * np.ones(n_units, dtype=dtype), name='sb' + str(self.n_layers), borrow=True) 236 | ua = self.srng.normal((self.hs[-1].shape[0], n_units), dtype=dtype) 237 | ma = tt.dot(self.hs[-1], mW) + mb 238 | sa = tt.dot(self.hs[-1]**2, tt.exp(2*sW)) + tt.exp(2*sb) 239 | za = tt.sqrt(sa) * ua + ma 240 | h = actfun(za) 241 | h.name = 'h' + str(self.n_layers) 242 | 243 | self.mWs.append(mW) 244 | self.mbs.append(mb) 245 | self.sWs.append(sW) 246 | self.sbs.append(sb) 247 | self.uas.append(ua) 248 | self.mas.append(ma) 249 | self.zas.append(za) 250 | self.hs.append(h) 251 | 252 | self.mps = self.mWs + self.mbs 253 | self.sps = self.sWs + self.sbs 254 | self.parms = self.mps + self.sps 255 | self.output = self.hs[-1] 256 | 257 | self.eval_f = None 258 | self.eval_f_rand = None 259 | 260 | def removeLayer(self): 261 | """Removes a layer from the network.""" 262 | 263 | assert self.n_layers > 0, 'There is no layer to remove.' 264 | 265 | n_params_to_rem = 2 * self.n_outputs * (self.n_units[-2] + 1) 266 | self.n_outputs = self.n_units[-2] 267 | self.n_units.pop() 268 | self.n_layers -= 1 269 | self.n_params -= n_params_to_rem 270 | 271 | self.mWs.pop() 272 | self.mbs.pop() 273 | self.sWs.pop() 274 | self.sbs.pop() 275 | self.uas.pop() 276 | self.mas.pop() 277 | self.zas.pop() 278 | self.hs.pop() 279 | 280 | self.mps = self.mWs + self.mbs 281 | self.sps = self.sWs + self.sbs 282 | self.parms = self.mps + self.sps 283 | self.output = self.hs[-1] 284 | 285 | self.eval_f = None 286 | self.eval_f_rand = None 287 | 288 | def eval(self, x, rand=False): 289 | """Evaluate net at locations in x.""" 290 | 291 | x = np.asarray(x, dtype=dtype) 292 | 293 | if rand: 294 | 295 | # compile theano computation graph, if haven't already done so 296 | if self.eval_f_rand is None: 297 | 298 | n_data = tt.iscalar('n_data') 299 | uas = [tt.tile(self.srng.normal((n_units,), dtype=dtype), [n_data, 1]) for n_units in self.n_units[1:]] 300 | 301 | self.eval_f_rand = theano.function( 302 | inputs=[self.hs[0], n_data], 303 | outputs=self.hs[-1], 304 | givens=zip(self.uas, uas) 305 | ) 306 | 307 | return self.eval_f_rand(x[np.newaxis, :], 1)[0] if x.ndim == 1 else self.eval_f_rand(x, x.shape[0]) 308 | 309 | else: 310 | 311 | # compile theano computation graph, if haven't already done so 312 | if self.eval_f is None: 313 | self.eval_f = theano.function( 314 | inputs=[self.hs[0]], 315 | outputs=self.hs[-1], 316 | givens=zip(self.zas, self.mas) 317 | ) 318 | 319 | return self.eval_f(x[np.newaxis, :])[0] if x.ndim == 1 else self.eval_f(x) 320 | 321 | def printInfo(self): 322 | """Prints some useful info about the net.""" 323 | 324 | print 'Number of inputs =', self.n_inputs 325 | print 'Number of outputs =', self.n_outputs 326 | print 'Number of units =', self.n_units 327 | print 'Number of layers =', self.n_layers 328 | print 'Number of params =', 
self.n_params 329 | print 'Data type =', dtype 330 | 331 | def visualize_weights(self, layer, imsize, layout): 332 | """ 333 | Displays the weights of a specified layer as images. 334 | :param layer: the layer whose weights to display 335 | :param imsize: the image size 336 | :param layout: number of rows and columns for each page 337 | :return: none 338 | """ 339 | 340 | util.plot.disp_imdata(self.mWs[layer].get_value().T, imsize, layout) 341 | plt.show(block=False) 342 | 343 | def visualize_activations(self, x, layers=None): 344 | """ 345 | Visualizes the activations of specified layers caused by a given data minibatch. 346 | :param x: a minibatch of data 347 | :param layers: list of layers to visualize activations of; defaults to the whole net except the input layer 348 | :return: none 349 | """ 350 | 351 | if layers is None: 352 | layers = xrange(self.n_layers) 353 | 354 | forwprop = theano.function( 355 | inputs=[self.hs[0]], 356 | outputs=self.hs[1:] 357 | ) 358 | hs = forwprop(x.astype(dtype)) 359 | 360 | for l in layers: 361 | 362 | fig = plt.figure() 363 | ax = fig.add_subplot(1, 1, 1) 364 | ax.imshow(hs[l], cmap='gray', interpolation='none') 365 | ax.set_title('Layer ' + str(l)) 366 | ax.set_xlabel('layer units') 367 | ax.set_ylabel('data points') 368 | 369 | plt.show(block=False) 370 | 371 | def param_hist(self, layers=None): 372 | """ 373 | Displays a histogram of weights and biases for specified layers. 374 | :param layers: list of layers to show histograms for; defaults to the whole net 375 | :return: none 376 | """ 377 | 378 | if layers is None: 379 | layers = xrange(self.n_layers) 380 | 381 | for l in layers: 382 | 383 | fig, axs = plt.subplots(2, 2) 384 | 385 | nbins = int(np.sqrt(self.mWs[l].get_value().size)) 386 | axs[0, 0].hist(self.mWs[l].get_value().flatten(), nbins, normed=True) 387 | axs[0, 0].set_title('weight means, layer ' + str(l)) 388 | axs[1, 0].hist(self.sWs[l].get_value().flatten(), nbins, normed=True) 389 | axs[1, 0].set_title('weight log stds, layer ' + str(l)) 390 | 391 | nbins = int(np.sqrt(self.mbs[l].get_value().size)) 392 | axs[0, 1].hist(self.mbs[l].get_value(), nbins, normed=True) 393 | axs[0, 1].set_title('bias means, layer ' + str(l)) 394 | axs[1, 1].hist(self.sbs[l].get_value(), nbins, normed=True) 395 | axs[1, 1].set_title('bias log stds, layer ' + str(l)) 396 | 397 | plt.show(block=False) 398 | -------------------------------------------------------------------------------- /ml/trainers.py: -------------------------------------------------------------------------------- 1 | from itertools import izip 2 | 3 | import os 4 | import sys 5 | import numpy as np 6 | import theano 7 | import theano.tensor as tt 8 | import matplotlib.pyplot as plt 9 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 10 | 11 | import ml.step_strategies as ss 12 | import ml.data_streams as ds 13 | import ml.ensembles as ensembles 14 | import util.math 15 | 16 | 17 | dtype = theano.config.floatX 18 | 19 | 20 | class SGD: 21 | """ 22 | Minibatch stochastic gradient descent. Can work with a variety of step strategies, and supports early stopping on 23 | validation set. 24 | """ 25 | 26 | def __init__(self, model, trn_data, trn_loss, trn_target=None, val_data=None, val_loss=None, val_target=None, step=ss.Adam()): 27 | """ 28 | Constructs and configures the trainer. 
29 | :param model: the model to be trained 30 | :param trn_data: train inputs and (possibly) train targets 31 | :param trn_loss: theano variable representing the train loss to minimize 32 | :param trn_target: theano variable representing the train target 33 | :param val_data: validation inputs and (possibly) validation targets 34 | :param val_loss: theano variable representing the validation loss 35 | :param val_target: theano variable representing the validation target 36 | :param step: step size strategy object 37 | :return: None 38 | """ 39 | 40 | # parse input 41 | # TODO: it would be good to type check the other inputs too 42 | assert isinstance(step, ss.StepStrategy), 'Step must be a step strategy object.' 43 | 44 | # prepare train data 45 | n_trn_data_list = set([x.shape[0] for x in trn_data]) 46 | assert len(n_trn_data_list) == 1, 'Number of train data is not consistent.' 47 | self.n_trn_data = list(n_trn_data_list)[0] 48 | trn_data = [theano.shared(x.astype(dtype), borrow=True) for x in trn_data] 49 | 50 | # compile theano function for a single training update 51 | grads = tt.grad(trn_loss, model.parms) 52 | idx = tt.ivector('idx') 53 | trn_inputs = [model.input] if trn_target is None else [model.input, trn_target] 54 | self.make_update = theano.function( 55 | inputs=[idx], 56 | outputs=trn_loss, 57 | givens=zip(trn_inputs, [x[idx] for x in trn_data]), 58 | updates=step.updates(model.parms, grads) 59 | ) 60 | 61 | # if model uses batch norm, compile a theano function for setting up stats 62 | if getattr(model, 'batch_norm', False): 63 | batch_norm_givens = [(bn.m, bn.bm) for bn in model.bns] + [(bn.v, bn.bv) for bn in model.bns] 64 | self.set_batch_norm_stats = theano.function( 65 | inputs=[], 66 | givens=zip(trn_inputs, trn_data), 67 | updates=[(bn.bm, bn.m) for bn in model.bns] + [(bn.bv, bn.v) for bn in model.bns] 68 | ) 69 | else: 70 | self.set_batch_norm_stats = None 71 | batch_norm_givens = [] 72 | 73 | # if validation data is given, then set up validation too 74 | self.do_validation = val_data is not None 75 | 76 | if self.do_validation: 77 | 78 | # prepare validation data 79 | n_val_data_list = set([x.shape[0] for x in val_data]) 80 | assert len(n_val_data_list) == 1, 'Number of validation data is not consistent.' 81 | self.n_val_data = list(n_val_data_list)[0] 82 | val_data = [theano.shared(x.astype(dtype), borrow=True) for x in val_data] 83 | 84 | # compile theano function for validation 85 | val_inputs = [model.input] if val_target is None else [model.input, val_target] 86 | self.validate = theano.function( 87 | inputs=[], 88 | outputs=val_loss, 89 | givens=zip(val_inputs, val_data) + batch_norm_givens 90 | ) 91 | 92 | # create checkpointer to store best model 93 | self.checkpointer = ModelCheckpointer(model) 94 | self.best_val_loss = float('inf') 95 | 96 | # initialize some variables 97 | self.trn_loss = float('inf') 98 | self.idx_stream = ds.IndexSubSampler(self.n_trn_data, rng=np.random.RandomState(42)) 99 | 100 | def train(self, minibatch=None, tol=None, maxepochs=None, monitor_every=None, patience=None, logger=sys.stdout, show_progress=False, val_in_same_plot=True): 101 | """ 102 | Trains the model. 103 | :param minibatch: minibatch size 104 | :param tol: tolerance 105 | :param maxepochs: maximum number of epochs 106 | :param monitor_every: monitoring frequency 107 | :param patience: maximum number of validation steps to wait for improvement before early stopping 108 | :param logger: logger for logging messages. 
If None, no logging takes place
109 |         :param show_progress: if True, plot training and validation progress
110 |         :param val_in_same_plot: if True, plot validation progress in same plot as training progress
111 |         :return: None
112 |         """
113 | 
114 |         # parse input
115 |         assert minibatch is None or util.math.isposint(minibatch), 'Minibatch size must be a positive integer or None.'
116 |         assert tol is None or tol > 0.0, 'Tolerance must be positive or None.'
117 |         assert maxepochs is None or maxepochs > 0.0, 'Maximum number of epochs must be positive or None.'
118 |         assert monitor_every is None or monitor_every > 0.0, 'Monitoring frequency must be positive or None.'
119 |         assert patience is None or util.math.isposint(patience), 'Patience must be a positive integer or None.'
120 |         assert isinstance(show_progress, bool), 'show_progress must be boolean.'
121 |         assert isinstance(val_in_same_plot, bool), 'val_in_same_plot must be boolean.'
122 | 
123 |         # initialize some variables
124 |         iter = 0
125 |         progress_epc = []
126 |         progress_trn = []
127 |         progress_val = []
128 |         minibatch = self.n_trn_data if minibatch is None else minibatch
129 |         maxiter = float('inf') if maxepochs is None else np.ceil(maxepochs * self.n_trn_data / float(minibatch))
130 |         monitor_every = float('inf') if monitor_every is None else np.ceil(monitor_every * self.n_trn_data / float(minibatch))
131 |         patience = float('inf') if patience is None else patience
132 |         patience_left = patience
133 |         best_epoch = None
134 |         logger = open(os.devnull, 'w') if logger is None else logger
135 | 
136 |         # main training loop
137 |         while True:
138 | 
139 |             # make update to parameters
140 |             trn_loss = self.make_update(self.idx_stream.gen(minibatch))
141 |             diff = self.trn_loss - trn_loss
142 |             iter += 1
143 |             self.trn_loss = trn_loss
144 | 
145 |             if iter % monitor_every == 0:
146 | 
147 |                 epoch = iter * float(minibatch) / self.n_trn_data
148 | 
149 |                 # do validation
150 |                 if self.do_validation:
151 |                     if self.set_batch_norm_stats is not None: self.set_batch_norm_stats()
152 |                     val_loss = self.validate()
153 |                     patience_left -= 1
154 | 
155 |                     if val_loss < self.best_val_loss:
156 |                         self.best_val_loss = val_loss
157 |                         self.checkpointer.checkpoint()
158 |                         best_epoch = epoch
159 |                         patience_left = patience
160 | 
161 |                 # monitor progress
162 |                 if show_progress:
163 |                     progress_epc.append(epoch)
164 |                     progress_trn.append(trn_loss)
165 |                     if self.do_validation: progress_val.append(val_loss)
166 | 
167 |                 # log info
168 |                 if self.do_validation:
169 |                     logger.write('Epoch = {0:.2f}, train loss = {1}, validation loss = {2}\n'.format(epoch, trn_loss, val_loss))
170 |                 else:
171 |                     logger.write('Epoch = {0:.2f}, train loss = {1}\n'.format(epoch, trn_loss))
172 | 
173 |             # check for convergence
174 |             if abs(diff) < tol or iter >= maxiter or patience_left <= 0:
175 |                 if self.do_validation: self.checkpointer.restore()
176 |                 if self.set_batch_norm_stats is not None: self.set_batch_norm_stats()
177 |                 break
178 | 
179 |         # plot progress
180 |         if show_progress:
181 | 
182 |             if self.do_validation:
183 | 
184 |                 if val_in_same_plot:
185 |                     fig, ax = plt.subplots(1, 1)
186 |                     ax.semilogx(progress_epc, progress_trn, 'b', label='training')
187 |                     ax.semilogx(progress_epc, progress_val, 'r', label='validation')
188 |                     ax.vlines(best_epoch, ax.get_ylim()[0], ax.get_ylim()[1], color='g', linestyles='dashed', label='best')
189 |                     ax.set_xlabel('epochs')
190 |                     ax.set_ylabel('loss')
191 |                     ax.legend()
192 |                     ax.set_title('Training progress')
193 | 
194 |                 else:
195 |                     fig, (ax1, ax2) = plt.subplots(2, 1,
sharex=True) 196 | ax1.semilogx(progress_epc, progress_trn, 'b') 197 | ax2.semilogx(progress_epc, progress_val, 'r') 198 | ax1.vlines(best_epoch, ax1.get_ylim()[0], ax1.get_ylim()[1], color='g', linestyles='dashed', label='best') 199 | ax2.vlines(best_epoch, ax2.get_ylim()[0], ax2.get_ylim()[1], color='g', linestyles='dashed', label='best') 200 | ax2.set_xlabel('epochs') 201 | ax1.set_ylabel('training loss') 202 | ax2.set_ylabel('validation loss') 203 | fig.suptitle('Training progress') 204 | 205 | else: 206 | fig, ax = plt.subplots(1, 1) 207 | ax.semilogx(progress_epc, progress_trn, 'b') 208 | ax.set_xlabel('epochs') 209 | ax.set_ylabel('training loss') 210 | ax.legend() 211 | ax.set_title('Training progress') 212 | 213 | plt.show(block=False) 214 | 215 | 216 | class HMC: 217 | """ 218 | Hamiltonian Monte Carlo training of models. Uses a quadratic kinetic energy. Trained model is an ensemble of 219 | posterior model samples. 220 | """ 221 | 222 | def __init__(self, model, trn_data, trn_loss, trn_target): 223 | """ 224 | :param model: model to train 225 | :param trn_data: train data 226 | :param trn_loss: train loss 227 | :param trn_target: train target 228 | """ 229 | 230 | # prepare train data 231 | n_trn_data_list = set([x.shape[0] for x in trn_data]) 232 | assert len(n_trn_data_list) == 1, 'Number of train data is not consistent.' 233 | trn_data = [theano.shared(x.astype(dtype)) for x in trn_data] 234 | 235 | # prepare train inputs 236 | trn_inputs = [model.input] if trn_target is None else [model.input, trn_target] 237 | 238 | # potential energy 239 | self.U = theano.function( 240 | inputs=[], 241 | outputs=trn_loss, 242 | givens=zip(trn_inputs, trn_data) 243 | ) 244 | 245 | # theano variables 246 | step = tt.scalar('step') 247 | mass = tt.scalar('mass') 248 | srng = RandomStreams() 249 | 250 | # theano function for drawing random momentum variables 251 | ps = [theano.shared(np.zeros_like(x.get_value(borrow=True)), borrow=True) for x in model.parms] 252 | ps_rand = [srng.normal(x.get_value().shape, std=tt.sqrt(mass), dtype=dtype) for x in model.parms] 253 | ps_rand = [tt.unbroadcast(pr, *range(x.get_value().ndim)) for pr, x in izip(ps_rand, model.parms)] 254 | self.draw_momentum = theano.function( 255 | inputs=[mass], 256 | updates=zip(ps, ps_rand), 257 | allow_input_downcast=True 258 | ) 259 | 260 | # theano function for calculating kinetic energy 261 | K = sum([tt.sum(p**2) for p in ps]) / (2.0 * mass) 262 | self.calc_kinetic = theano.function( 263 | inputs=[mass], 264 | outputs=K, 265 | allow_input_downcast=True 266 | ) 267 | 268 | # theano function for updating momentum variables 269 | dUs = tt.grad(trn_loss, model.parms) 270 | new_ps = [p - step * dU for p, dU in izip(ps, dUs)] 271 | self.update_momentum = theano.function( 272 | inputs=[step], 273 | updates=zip(ps, new_ps), 274 | givens=zip(trn_inputs, trn_data), 275 | allow_input_downcast=True 276 | ) 277 | 278 | # theano function for updating model parameters 279 | new_parms = [x + step / mass * p for x, p in izip(model.parms, ps)] 280 | self.update_parms = theano.function( 281 | inputs=[step, mass], 282 | updates=zip(model.parms, new_parms), 283 | allow_input_downcast=True 284 | ) 285 | 286 | # initialize 287 | self.U_prev = self.U() 288 | self.model = model 289 | 290 | def gen(self, n_samples, L, me, m=1.0, logger=sys.stdout, show_traces=False, rng=np.random): 291 | """ 292 | Generates HMC samples. 
293 | :param n_samples: number of samples 294 | :param L: number of leapfrog steps 295 | :param me: mean of time step 296 | :param m: mass 297 | :param logger: logger for logging messages. If None, no logging takes place 298 | :param show_traces: whether to plot info at the end of sampling 299 | :param rng: random number generator to use 300 | :return: an ensemble of model samples 301 | """ 302 | 303 | # initialize 304 | n_acc = 0 305 | U_trace = [] 306 | H_error_trace = [] 307 | acc_rate_trace = [] 308 | xs = self.model.parms 309 | ensemble = ensembles.FastEnsemble(self.model, copy=True) 310 | ensemble.add_new(xs, copy=True) 311 | logger = open(os.devnull, 'w') if logger is None else logger 312 | 313 | for n in xrange(n_samples): 314 | 315 | # sample momentum from a gaussian 316 | self.draw_momentum(m) 317 | K_prev = self.calc_kinetic(m) 318 | 319 | # simulate hamiltonian dynamics with leapfrog method 320 | e = -me * np.log(1 - rng.rand()) 321 | self.update_momentum(0.5 * e) 322 | for _ in xrange(L-1): 323 | self.update_parms(e, m) 324 | self.update_momentum(e) 325 | self.update_parms(e, m) 326 | self.update_momentum(0.5 * e) 327 | # negating p is not necessary, because kinetic energy is symmetric 328 | 329 | # metropolis acceptance rule 330 | U_new = self.U() 331 | K_new = self.calc_kinetic(m) 332 | H_err = (U_new + K_new) - (self.U_prev + K_prev) 333 | if rng.rand() < np.exp(-H_err): 334 | self.U_prev = U_new 335 | n_acc += 1 336 | ensemble.add_new(xs, copy=True) 337 | else: 338 | for i, x in enumerate(ensemble.parms[-1]): 339 | xs[i].set_value(x.copy()) 340 | ensemble.add_existing(-1) 341 | 342 | # acceptance rate 343 | acc_rate = n_acc / float(n+1) 344 | logger.write('sample = {0}, acc rate = {1:.2%}, hamiltonian error = {2:.2}\n'.format(n+1, acc_rate, H_err)) 345 | 346 | # record traces 347 | if show_traces: 348 | U_trace.append(self.U_prev) 349 | H_error_trace.append(H_err) 350 | acc_rate_trace.append(acc_rate) 351 | 352 | ensemble.remove(0) 353 | 354 | # show plot with the traces 355 | if show_traces: 356 | 357 | fig, ax = plt.subplots(3, 1, sharex=True) 358 | ax[0].plot(U_trace) 359 | ax[0].set_ylabel('potential energy') 360 | ax[1].plot(H_error_trace) 361 | ax[1].set_ylabel('hamiltonian error') 362 | ax[2].plot(acc_rate_trace) 363 | ax[2].set_ylim([0, 1]) 364 | ax[2].set_ylabel('acceptance rate') 365 | ax[2].set_xlabel('samples') 366 | fig.suptitle('HMC progress') 367 | 368 | parm_traces = ensemble.get_traces() 369 | fig, axs = plt.subplots(len(parm_traces), sharex=True) 370 | for ax, p in izip(axs, parm_traces): 371 | ax.plot(p) 372 | axs[-1].set_xlabel('samples') 373 | fig.suptitle('Parameter traces') 374 | 375 | plt.show(block=False) 376 | 377 | return ensemble 378 | 379 | 380 | class ModelCheckpointer: 381 | """ 382 | Helper class which makes checkpoints of a given model. 383 | Currently one checkpoint is supported; checkpointing twice overwrites previous checkpoint. 384 | """ 385 | 386 | def __init__(self, model): 387 | """ 388 | :param model: A machine learning model to be checkpointed. 389 | """ 390 | self.model = model 391 | self.checkpointed_parms = [np.empty_like(p.get_value()) for p in model.parms] 392 | 393 | def checkpoint(self): 394 | """ 395 | Checkpoints current model. Overwrites previous checkpoint. 396 | """ 397 | for i, p in enumerate(self.model.parms): 398 | self.checkpointed_parms[i] = p.get_value().copy() 399 | 400 | def restore(self): 401 | """ 402 | Restores last checkpointed model. 
403 | """ 404 | for i, p in enumerate(self.checkpointed_parms): 405 | self.model.parms[i].set_value(p) 406 | --------------------------------------------------------------------------------
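The pieces above are wired together by `bnn_demo.py` (not shown in this section). For orientation only, here is a minimal usage sketch, not part of the repository, written in Python 2 to match the codebase: it builds a `FeedforwardNet_SVI`, trains it with the `SGD` trainer on toy data, and draws stochastic predictions. The activation strings `'relu'` and `'logistic'` and the plain cross-entropy loss are assumptions for illustration; the accepted activation names are defined in `util/ml.py`, and the actual demo additionally adds a KL term from `ml/loss_functions.py` to obtain the full SVI objective.

```
# usage sketch -- illustrative only, not part of the repository
import numpy as np
import theano.tensor as tt

import ml.neural_nets as nn
import ml.trainers as trainers

# toy binary classification data
xs = np.random.randn(200, 2)
ys = (xs[:, 0] + xs[:, 1] > 0).astype(float)[:, np.newaxis]

# build a Bayesian net: 2 inputs -> 10 hidden units -> 1 output
net = nn.FeedforwardNet_SVI(2)
net.addLayer(10, 'relu')      # activation string assumed; valid names live in util/ml.py
net.addLayer(1, 'logistic')   # likewise assumed

# cross-entropy between net output and targets; the real demo also adds the
# KL regularizer from ml/loss_functions.py to get the full SVI objective
y = tt.matrix('y')
trn_loss = -tt.mean(y * tt.log(net.output) + (1 - y) * tt.log(1 - net.output))

# train with minibatch stochastic gradient descent (Adam step strategy by default)
trainer = trainers.SGD(model=net, trn_data=[xs, ys], trn_loss=trn_loss, trn_target=y)
trainer.train(minibatch=20, maxepochs=500, monitor_every=10)

# a stochastic forward pass draws a fresh set of weights each time
print net.eval(xs[:5], rand=True)
```

Swapping the `SGD` trainer for `HMC(model, trn_data, trn_loss, trn_target).gen(n_samples, L, me)` would instead return an ensemble of posterior parameter samples, as implemented by the `HMC` class above.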