├── util ├── __init__.py └── data.py ├── models ├── distributions │ ├── __init__.py │ ├── operations.py │ └── distributions.py ├── __init__.py ├── layers │ ├── __init__.py │ ├── shape.py │ ├── misc.py │ ├── conv.py │ ├── test_sampling.py │ └── sampling.py ├── helpers.py ├── vae.py ├── adgm.py ├── sbn.py ├── model.py ├── usbn.py ├── abstractrbm.py ├── dadgm.py ├── rbm_dadgm.py ├── variational_rbm.py ├── udadgm.py └── aux_variational_rbm.py ├── set_env.sh ├── log └── README.md ├── requirements.txt ├── Makefile ├── LICENSE ├── .gitignore ├── README.md └── run.py /util/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/distributions/__init__.py: -------------------------------------------------------------------------------- 1 | from distributions import * -------------------------------------------------------------------------------- /set_env.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH="$PYTHONPATH:`pwd`" 2 | echo "Environment variables set!" 3 | -------------------------------------------------------------------------------- /log/README.md: -------------------------------------------------------------------------------- 1 | Log folder 2 | ========== 3 | 4 | The models will save their logs into this folder. 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cycler==0.10.0 2 | Lasagne==0.2.dev1 3 | matplotlib==1.5.3 4 | numpy==1.11.2 5 | pyparsing==2.1.10 6 | python-dateutil==2.5.3 7 | pytz==2016.7 8 | scipy==0.18.1 9 | scikit-learn==0.18.1 10 | six==1.10.0 11 | Theano==0.9.0.dev3 12 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from vae import VAE 2 | from sbn import SBN 3 | from usbn import USBN 4 | from adgm import ADGM 5 | from dadgm import DADGM 6 | from udadgm import UDADGM 7 | from rbm import RBM 8 | from variational_rbm import VariationalRBM 9 | from aux_variational_rbm import AuxiliaryVariationalRBM -------------------------------------------------------------------------------- /models/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from shape import RepeatLayer 2 | from sampling import ( 3 | GaussianSampleLayer, 4 | GaussianMultiSampleLayer, 5 | BernoulliSampleLayer, 6 | GumbelSoftmaxSampleLayer, 7 | GumbelSampleLayer, 8 | LogisticSampleLayer 9 | ) 10 | from conv import Deconv2DLayer 11 | from misc import ( 12 | ArgmaxLayer, RoundLayer, 13 | SoftmaxLayer, SigmoidLayer 14 | ) -------------------------------------------------------------------------------- /models/layers/shape.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano.tensor as T 3 | import lasagne 4 | 5 | # ---------------------------------------------------------------------------- 6 | 7 | class RepeatLayer(lasagne.layers.Layer): 8 | def __init__(self, l_in, n_ax, n_rep, rng=None, **kwargs): 9 | self.n_ax = n_ax 10 | self.n_rep = n_rep 11 | super(RepeatLayer, self).__init__(l_in, **kwargs) 12 | 13 | def get_output_shape_for(self, input_shapes): 14 | return tuple(np.insert(input_shapes, self.n_ax, self.n_rep)) 15 | 16 | def 
get_output_for(self, inputs, deterministic=False, **kwargs): 17 | return T.shape_padaxis(inputs, axis=self.n_ax).repeat(self.n_rep, self.n_ax) 18 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PYTHON=/usr/bin/python27 2 | PYTHON=python 3 | 4 | EPOCHS=100 5 | NAME=experiment2 6 | 7 | LOGFOLDER=log 8 | DATASET=mnist 9 | MODEL=discrete-vae-rbm 10 | ALG=adam 11 | 12 | LR=3e-4 13 | B1=0.9 14 | B2=0.999 15 | SUPERBATCH=1024 16 | NB=128 17 | 18 | # ---------------------------------------------------------------------------- 19 | 20 | train: 21 | $(PYTHON) run.py train \ 22 | --dataset $(DATASET) \ 23 | --model $(MODEL) \ 24 | -e $(EPOCHS) \ 25 | --logname $(LOGFOLDER)/$(DATASET).$(MODEL).$(ALG).$(LR).$(NB).$(NAME) \ 26 | --plotname $(LOGFOLDER)/$(DATASET)_$(MODEL)_$(ALG)_$(LR)_$(NB)_$(NAME).png \ 27 | --alg $(ALG) \ 28 | --lr $(LR) \ 29 | --b1 $(B1) \ 30 | --b2 $(B2) \ 31 | --n_superbatch $(SUPERBATCH) \ 32 | --n_batch $(NB) 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Volodymyr Kuleshov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.png 2 | *.json 3 | 4 | # ML datasets 5 | *.gz 6 | env/ 7 | log/ 8 | 9 | # Byte-compiled / optimized / DLL files 10 | __pycache__/ 11 | *.py[cod] 12 | *$py.class 13 | 14 | # C extensions 15 | *.so 16 | 17 | # Distribution / packaging 18 | .Python 19 | env/ 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *,cover 54 | .hypothesis/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # IPython Notebook 78 | .ipynb_checkpoints 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # dotenv 87 | .env 88 | 89 | # virtualenv 90 | venv/ 91 | ENV/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | 99 | # Mac stuff 100 | .DS_Store 101 | 102 | *.bbl 103 | *.blg 104 | *.aux 105 | *.zip 106 | -------------------------------------------------------------------------------- /models/layers/misc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | import lasagne 5 | 6 | # ---------------------------------------------------------------------------- 7 | 8 | class ArgmaxLayer(lasagne.layers.Layer): 9 | """Argmax along the last dimension""" 10 | def __init__(self, incoming, **kwargs): 11 | super(ArgmaxLayer, self).__init__(incoming, **kwargs) 12 | 13 | def get_output_shape_for(self, input_shape): 14 | return input_shape[:-1] 15 | 16 | def get_output_for(self, input, **kwargs): 17 | return T.argmax(input, axis=-1) 18 | 19 | class RoundLayer(lasagne.layers.Layer): 20 | """Argmax along the last dimension""" 21 | def __init__(self, incoming, **kwargs): 22 | super(RoundLayer, self).__init__(incoming, **kwargs) 23 | 24 | def get_output_shape_for(self, input_shape): 25 | return input_shape 26 | 27 | def get_output_for(self, input, **kwargs): 28 | x = T.sgn(input) 29 | x = .5*(x + 1.) 
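        # (added note) T.sgn maps the input to {-1, +1} (and 0 for exactly-zero entries);
        # the affine map .5*(x + 1.) rescales this to a hard {0, 1} code, i.e. the layer
        # thresholds its input at zero.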
30 | return x 31 | 32 | class SoftmaxLayer(lasagne.layers.Layer): 33 | """Argmax along the last dimension""" 34 | def __init__(self, incoming, tau=1., **kwargs): 35 | super(SoftmaxLayer, self).__init__(incoming, **kwargs) 36 | self.tau = tau 37 | 38 | def get_output_shape_for(self, input_shape): 39 | return input_shape 40 | 41 | def get_output_for(self, input, **kwargs): 42 | return T.nnet.softmax(input / self.tau) 43 | 44 | class SigmoidLayer(lasagne.layers.Layer): 45 | """Argmax along the last dimension""" 46 | def __init__(self, incoming, tau=1., **kwargs): 47 | super(SigmoidLayer, self).__init__(incoming, **kwargs) 48 | self.tau = tau 49 | 50 | def get_output_shape_for(self, input_shape): 51 | return input_shape 52 | 53 | def get_output_for(self, input, **kwargs): 54 | return T.nnet.sigmoid(input / self.tau) -------------------------------------------------------------------------------- /models/layers/conv.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | import lasagne 5 | 6 | # ---------------------------------------------------------------------------- 7 | 8 | class Deconv2DLayer(lasagne.layers.Layer): 9 | 10 | def __init__(self, incoming, num_filters, filter_size, stride=1, pad=0, 11 | nonlinearity=lasagne.nonlinearities.rectify, **kwargs): 12 | super(Deconv2DLayer, self).__init__(incoming, **kwargs) 13 | self.num_filters = num_filters 14 | self.filter_size = lasagne.utils.as_tuple(filter_size, 2, int) 15 | self.stride = lasagne.utils.as_tuple(stride, 2, int) 16 | self.pad = lasagne.utils.as_tuple(pad, 2, int) 17 | self.W = self.add_param(lasagne.init.Orthogonal(), 18 | (self.input_shape[1], num_filters) + self.filter_size, 19 | name='W') 20 | self.b = self.add_param(lasagne.init.Constant(0), 21 | (num_filters,), 22 | name='b') 23 | if nonlinearity is None: 24 | nonlinearity = lasagne.nonlinearities.identity 25 | self.nonlinearity = nonlinearity 26 | 27 | def get_output_shape_for(self, input_shape): 28 | shape = tuple(i*s - 2*p + f - 1 29 | for i, s, p, f in zip(input_shape[2:], 30 | self.stride, 31 | self.pad, 32 | self.filter_size)) 33 | return (input_shape[0], self.num_filters) + shape 34 | 35 | def get_output_for(self, input, **kwargs): 36 | op = T.nnet.abstract_conv.AbstractConv2d_gradInputs( 37 | imshp=self.output_shape, 38 | kshp=(self.input_shape[1], self.num_filters) + self.filter_size, 39 | subsample=self.stride, border_mode=self.pad) 40 | conved = op(self.W, input, self.output_shape[2:]) 41 | if self.b is not None: 42 | conved += self.b.dimshuffle('x', 0, 'x', 'x') 43 | return self.nonlinearity(conved) -------------------------------------------------------------------------------- /models/distributions/operations.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano.tensor as T 3 | 4 | # ---------------------------------------------------------------------------- 5 | # (stolen from the parmesan lib) 6 | 7 | def log_sum_exp(A, axis=None, sum_op=T.sum): 8 | """Computes `log(exp(A).sum(axis=axis))` avoiding numerical issues using the log-sum-exp trick. 9 | Direct calculation of :math:`\log \sum_i \exp A_i` can result in underflow or overflow numerical 10 | issues. Big positive values can cause overflow :math:`\exp A_i = \inf`, and big negative values 11 | can cause underflow :math:`\exp A_i = 0`. 
The latter can eventually cause the sum to go to zero 12 | and finally resulting in :math:`\log 0 = -\inf`. 13 | The log-sum-exp trick avoids these issues by using the identity, 14 | .. math:: 15 | \log \sum_i \exp A_i = \log \sum_i \exp(A_i - c) + c, \text{using}, \\ 16 | c = \max A. 17 | This avoids overflow, and while underflow can still happen for individual elements it avoids 18 | the sum being zero. 19 | 20 | Parameters 21 | ---------- 22 | A : Theano tensor 23 | Tensor of which we wish to compute the log-sum-exp. 24 | axis : int, tuple, list, None 25 | Axis or axes to sum over; None (default) sums over all axes. 26 | sum_op : function 27 | Summing function to apply; default is T.sum, but can also be T.mean for log-mean-exp. 28 | 29 | Returns 30 | ------- 31 | Theano tensor 32 | The log-sum-exp of `A`, dimensions over which is summed will be dropped. 33 | """ 34 | A_max = T.max(A, axis=axis, keepdims=True) 35 | B = T.log(sum_op(T.exp(A - A_max), axis=axis, keepdims=True)) + A_max 36 | 37 | if axis is None: 38 | return B.dimshuffle(()) # collapse to scalar 39 | else: 40 | if not hasattr(axis, '__iter__'): axis = [axis] 41 | return B.dimshuffle([d for d in range(B.ndim) if d not in axis]) # drop summed axes 42 | 43 | def log_mean_exp(A, axis=None): 44 | """Computes `log(exp(A).mean(axis=axis))` avoiding numerical issues using the log-sum-exp trick. 45 | See also 46 | -------- 47 | log_sum_exp 48 | """ 49 | return log_sum_exp(A, axis, sum_op=T.mean) -------------------------------------------------------------------------------- /models/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano.tensor as T 3 | 4 | # ---------------------------------------------------------------------------- 5 | # iteration 6 | 7 | def iterate_minibatch_idx(n_inputs, batchsize,): 8 | for start_idx in range(0, n_inputs - batchsize + 1, batchsize): 9 | yield start_idx, min(start_idx + batchsize, n_inputs) 10 | 11 | def iterate_minibatches(inputs, targets, batchsize, shuffle=False): 12 | assert len(inputs) == len(targets) 13 | if shuffle: 14 | indices = np.arange(len(inputs)) 15 | np.random.shuffle(indices) 16 | for start_idx in range(0, len(inputs) - batchsize + 1, batchsize): 17 | if shuffle: 18 | excerpt = indices[start_idx:start_idx + batchsize] 19 | else: 20 | excerpt = slice(start_idx, start_idx + batchsize) 21 | yield inputs[excerpt], targets[excerpt] 22 | 23 | def random_subbatch(inputs, targets, batchsize): 24 | assert len(inputs) == len(targets) 25 | indices = np.arange(len(inputs)) 26 | np.random.shuffle(indices) 27 | excerpt = indices[:batchsize] 28 | return inputs[excerpt], targets[excerpt] 29 | 30 | # ---------------------------------------------------------------------------- 31 | # eval 32 | 33 | def evaluate(eval_f, X, Y, batchsize=1000): 34 | tot_err, tot_acc, batches = 0, 0, 0 35 | print X.shape 36 | for inputs, targets in iterate_minibatches(X, Y, batchsize, shuffle=False): 37 | err, acc = eval_f(inputs, targets) 38 | tot_err += err 39 | tot_acc += acc 40 | batches += 1 41 | return tot_err / batches, tot_acc / batches 42 | 43 | def log_metrics(logname, metrics): 44 | logfile = '%s.log' % logname 45 | with open(logfile, 'a') as f: 46 | f.write('\t'.join([str(m) for m in metrics]) + '\n') 47 | 48 | # ---------------------------------------------------------------------------- 49 | # math 50 | 51 | def Tlogsumexp(L, axis=None, safe=True): 52 | """Logsumexp in Theano.""" 53 | if safe: 54 | # b = T.max(L, axis=axis) 
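    # (added note) the commented-out choices of b above and below are max-based stabilizers;
    # with b = 0 (the value used here) this reduces to a plain, unstabilized log-sum-exp.
    # The numerically stable variant, which subtracts the per-axis max, is log_sum_exp in
    # models/distributions/operations.py.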
55 | # b = 1e6 56 | # b = T.max(L) 57 | b = 0 58 | lse = b + T.log(T.sum(T.exp(L - b), axis=axis)) 59 | else: 60 | lse = T.log(T.sum(T.exp(L), axis=axis)) 61 | return lse 62 | 63 | def Tlogaddexp(a, b): 64 | """Logsumexp in Theano.""" 65 | # m = T.max((a,b)) # FIXME: why are there NaNs here? 66 | m = 0 67 | return m + T.log(T.exp(a-m) + T.exp(b-m)) -------------------------------------------------------------------------------- /models/layers/test_sampling.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano.tensor as T 3 | import matplotlib.pyplot as plt 4 | 5 | # MY CODE 6 | 7 | temperature = 0.01 8 | logits = np.linspace(-2, 2, 10).reshape([1, -1]) 9 | 10 | def softmax(x): 11 | """Compute softmax values for each sets of scores in x.""" 12 | e_x = np.exp(x - np.max(x)) 13 | return e_x / e_x.sum(axis=0) 14 | 15 | def sample_gumbel(shape, eps=1e-20): 16 | """ Sample from Gumbel(0, 1) """ 17 | U = np.random.uniform(size=shape, low=0, high=1) 18 | return -np.log(-np.log(U + eps) + eps) 19 | 20 | def sample_gumbel_softmax(logits, temperature): 21 | """ Sample from Gumbel-Softmax distribution """ 22 | y = logits + sample_gumbel(logits.shape) 23 | return softmax(y / temperature) 24 | 25 | plt.title('gumbel-softmax samples') 26 | for i in range(100): 27 | plt.plot( 28 | range(10), 29 | sample_gumbel_softmax(logits,temperature=temperature), 30 | marker='o', 31 | alpha=0.25 32 | ) 33 | 34 | plt.ylim(0,1) 35 | plt.show() 36 | 37 | gsm_samples = [sample_gumbel_softmax(logits,temperature=temperature) for _ in range(500)] 38 | plt.title('average over samples') 39 | plt.plot( 40 | range(10), 41 | np.mean(gsm_samples, axis=0)[0], 42 | marker='o', 43 | label='gumbel-softmax average' 44 | ) 45 | plt.plot( 46 | softmax(sample_gumbel_softmax(logits,temperature=temperature)[0]), 47 | marker='+', 48 | label='regular softmax' 49 | ) 50 | plt.legend(loc='best') 51 | plt.show() 52 | 53 | # --------------------------------------------------------------------- 54 | 55 | # THEIR CODE 56 | temperature = 0.01 57 | logits = np.linspace(-2, 2, 10) 58 | 59 | def softmax(x): 60 | e_x = np.exp(x - np.max(x)) 61 | return e_x / e_x.sum(axis=0) 62 | 63 | def GumbelSoftmax(logits, temperature): 64 | uniform = np.random.uniform(size=logits.shape,low=0,high=1) 65 | gumbel = -np.log(-np.log(uniform + 1e-20) + 1e-20) 66 | return softmax((logits + gumbel) / temperature) 67 | 68 | plt.title('gumbel-softmax samples') 69 | for i in range(100): 70 | plt.plot( 71 | range(10), 72 | GumbelSoftmax(logits, temperature), 73 | marker='o', 74 | alpha=0.25 75 | ) 76 | 77 | plt.ylim(0,1) 78 | plt.show() 79 | 80 | gsm_samples = [sample_gumbel_softmax(logits,temperature=temperature) for _ in range(500)] 81 | plt.title('average over samples') 82 | plt.plot( 83 | range(10), 84 | np.mean(gsm_samples, axis=0)[0], 85 | marker='o', 86 | label='gumbel-softmax average' 87 | ) 88 | plt.plot( 89 | softmax(sample_gumbel_softmax(logits,temperature=temperature)[0]), 90 | marker='+', 91 | label='regular softmax' 92 | ) 93 | plt.legend(loc='best') 94 | plt.show() 95 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Neural Variational Inference in Undirected Models 2 | ================================================= 3 | 4 | This repository contains code accompanying the paper 5 | 6 | ``` 7 | Neural variational inference and learning in undirected graphical 
models. 8 | Volodymyr Kuleshov and Stefano Ermon. 9 | Neural Information Processing Systems, 2017 10 | ``` 11 | 12 | ## Installation 13 | 14 | The code uses Theano and Lasagne. 15 | To install this package, clone the git repo, and update your `PYTHONPATH` to include the folder. 16 | 17 | ``` 18 | git clone https://github.com/kuleshov/neural-variational-inference.git; 19 | cd neural-variational-inference/; 20 | source set_env.sh 21 | ``` 22 | 23 | ## Models and Datasets 24 | 25 | The repository implements the following models: 26 | 27 | * `vae`: Regular variational autoencoder. 28 | * `discrete-vae`: Variational autoencoder with a `Bernoulli(200)` prior. 29 | * `discrete-vae-rbm`: Variational autoencoder with an `RBM(64,8)` prior, 30 | trained using neural variational inference. 31 | * `adgm`: Auxiliary-variable deep generative model (ADGM; Maaløe et al. 2016). 32 | * `discrete-adgm`: ADGM with a `Bernoulli(200)` prior. 33 | * `discrete-adgm-rbm`: ADGM with an `RBM(64,8)` prior, 34 | trained using neural variational inference. 35 | * `rbm`: Regular Restricted Boltzmann Machine (RBM) trained with persistent contrastive divergence. 36 | * `vrbm`: Variational RBM, i.e. an RBM trained with neural variational inference using a mixture of ten Bernoullis as the auxiliary helper distribution `q`. 37 | * `avrbm`: Auxiliary-variable Variational RBM, i.e. an RBM trained with neural variational inference using an auxiliary-variable distribution `q(x,a)` (parametrized with a neural network) as the helper distribution `q`. 38 | 39 | The models can be run on the following datasets: 40 | 41 | * `digits`: The UCI digits dataset. Use this for the RBM models (otherwise you'll get numerical issues). 42 | * `mnist`: Regular MNIST. 43 | * `binmnist`: Binarized MNIST, using the binarization from the IWAE (Burda et al.) paper. 44 | * `omniglot`: The Omniglot dataset. 45 | 46 | The `run.py` script takes these names as input. 47 | 48 | ## Running the Code 49 | 50 | To run a model, you may use the `run.py` launch script. 51 | 52 | ``` 53 | usage: run.py train [-h] [--dataset DATASET] [--model MODEL] [--pkl PKL] 54 | [-e EPOCHS] [-l LOGNAME] [-p PLOTNAME] [--alg ALG] 55 | [--lr LR] [--b1 B1] [--b2 B2] [--n_batch N_BATCH] 56 | [--n_superbatch N_SUPERBATCH] 57 | 58 | optional arguments: 59 | -h, --help show this help message and exit 60 | --dataset DATASET 61 | --model MODEL 62 | --pkl PKL 63 | -e EPOCHS, --epochs EPOCHS 64 | -l LOGNAME, --logname LOGNAME 65 | -p PLOTNAME, --plotname PLOTNAME 66 | --alg ALG 67 | --lr LR 68 | --b1 B1 69 | --b2 B2 70 | --n_batch N_BATCH 71 | --n_superbatch N_SUPERBATCH 72 | ``` 73 | 74 | The simplest way to use it is via the `Makefile` provided in the root directory; typing `make train` will start training. 75 | You can specify the model, dataset, and other parameters by modifying the defaults in the `Makefile` (see the example invocation at the end of this README). 76 | 77 | Note that the default hyper-parameters for `avrbm` on the `digits` dataset are currently set incorrectly and cause problems. 78 | 79 | ## Feedback 80 | 81 | Send feedback to [Volodymyr Kuleshov](http://www.stanford.edu/~kuleshov).
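
## Example Invocation

The following is an illustrative sketch rather than a tested recipe; names such as `myrun` and `digits_vrbm_example.png` are placeholders, and the numeric values simply mirror the `Makefile` defaults. Instead of editing the `Makefile`, its variables can also be overridden on the command line:

```
make train MODEL=discrete-vae-rbm DATASET=binmnist EPOCHS=200 NAME=myrun
```

Equivalently, `run.py` can be called directly with the arguments listed in the usage string above:

```
python run.py train --dataset digits --model vrbm -e 100 \
    --alg adam --lr 3e-4 --b1 0.9 --b2 0.999 \
    --n_batch 128 --n_superbatch 1024 \
    --logname log/digits.vrbm.example --plotname log/digits_vrbm_example.png
```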
82 | -------------------------------------------------------------------------------- /models/distributions/distributions.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import theano.tensor as T 4 | 5 | # ---------------------------------------------------------------------------- 6 | # this is all taken from the parmesan lib 7 | 8 | c = - 0.5 * math.log(2*math.pi) 9 | 10 | def log_gumbel_softmax(x, mu, tau=1.0, eps=1e-6): 11 | """ 12 | Compute logpdf of a Gumbel Softmax distribution with parameters p, at values x. 13 | .. See Appendix B.[1:2] https://arxiv.org/pdf/1611.01144v2.pdf 14 | """ 15 | k = mu.shape[-1] 16 | logpdf = T.gammaln(k) + (k - 1) * T.log(tau + eps) \ 17 | - k * T.log(T.sum(T.exp(mu) / T.power(x, tau), axis=2) + eps) \ 18 | + T.sum(mu - (tau + 1) * T.log(x + eps), axis=2) 19 | return logpdf 20 | 21 | def log_logistic_sigmoid(x, mu, tau=1.0, eps=1e-6): 22 | """ 23 | Compute logpdf of a Gumbel Softmax distribution with parameters p, at values x. 24 | .. See Appendix B.[1:2] https://arxiv.org/pdf/1611.01144v2.pdf 25 | """ 26 | mu = T.clip(mu, -10., 10.) 27 | logpdf = mu + T.log(tau + eps) \ 28 | - (tau+1.) * ( T.log(x + eps) + T.log( 1.-x + eps) ) \ 29 | - 2. * T.log( T.exp(mu) * T.power(x,-tau) + T.power(1.-x,-tau) + eps ) 30 | return logpdf 31 | 32 | def log_logistic_sigmoid2(y, mu, tau=1.0, eps=1e-6): 33 | """ 34 | Compute logpdf of a Gumbel Softmax distribution with parameters p, at values x. 35 | .. See Appendix B.[1:2] https://arxiv.org/pdf/1611.01144v2.pdf 36 | """ 37 | mu = T.clip(mu, -10., 10.) 38 | logpdf = mu + T.log(tau + eps) - tau*y \ 39 | - 2. * T.log( 1. + T.exp( -tau*y + mu ) + eps ) 40 | return logpdf 41 | 42 | def log_bernoulli(x, p, eps=1e-6): 43 | """ 44 | Compute log pdf of a Bernoulli distribution with success probability p, at values x. 45 | .. math:: \log p(x; p) = \log \mathcal{B}(x; p) 46 | Parameters 47 | ---------- 48 | x : Theano tensor 49 | Values at which to evaluate pdf. 50 | p : Theano tensor 51 | Success probability :math:`p(x=1)`, which is also the mean of the Bernoulli distribution. 52 | eps : float 53 | Small number used to avoid NaNs by clipping p in range [eps;1-eps]. 54 | Returns 55 | ------- 56 | Theano tensor 57 | Element-wise log probability, this has to be summed for multi-variate distributions. 58 | """ 59 | p = T.clip(p, eps, 1.0 - eps) 60 | return -T.nnet.binary_crossentropy(p, x) 61 | 62 | 63 | def log_categorical(x, p, eps=1e-6): 64 | """ 65 | Compute log pdf of a Categorical distribution with success probability p, at values x. 66 | .. math:: \log p(x; p) = \log \mathcal{B}(x; p) 67 | Parameters 68 | ---------- 69 | x : Theano tensor 70 | Values at which to evaluate pdf. 71 | p : Theano tensor 72 | Success probability :math:`p(x=1)`, which is also the mean of the Bernoulli distribution. 73 | eps : float 74 | Small number used to avoid NaNs by clipping p in range [eps;1-eps]. 75 | Returns 76 | ------- 77 | Theano tensor 78 | Element-wise log probability, this has to be summed for multi-variate distributions. 79 | """ 80 | p = T.clip(p, eps, 1.0 - eps) 81 | return -T.nnet.categorical_crossentropy(p, x) 82 | 83 | 84 | def log_normal(x, mean, std, eps=1e-6): 85 | """ 86 | Compute log pdf of a Gaussian distribution with diagonal covariance, at values x. 87 | Variance is parameterized as standard deviation. 88 | .. 
math:: \log p(x) = \log \mathcal{N}(x; \mu, \sigma^2I) 89 | 90 | Parameters 91 | ---------- 92 | x : Theano tensor 93 | Values at which to evaluate pdf. 94 | mean : Theano tensor 95 | Mean of the Gaussian distribution. 96 | std : Theano tensor 97 | Standard deviation of the diagonal covariance Gaussian. 98 | eps : float 99 | Small number added to standard deviation to avoid NaNs. 100 | Returns 101 | ------- 102 | Theano tensor 103 | Element-wise log probability, this has to be summed for multi-variate distributions. 104 | See also 105 | -------- 106 | log_normal1 : using variance parameterization 107 | log_normal2 : using log variance parameterization 108 | """ 109 | std += eps 110 | return c - T.log(T.abs_(std)) - (x - mean)**2 / (2 * std**2) 111 | 112 | 113 | def log_normal2(x, mean, log_var, eps=1e-6): 114 | """ 115 | Compute log pdf of a Gaussian distribution with diagonal covariance, at values x. 116 | Variance is parameterized as log variance rather than standard deviation, which ensures :math:`\sigma > 0`. 117 | .. math:: \log p(x) = \log \mathcal{N}(x; \mu, \sigma^2I) 118 | 119 | Parameters 120 | ---------- 121 | x : Theano tensor 122 | Values at which to evaluate pdf. 123 | mean : Theano tensor 124 | Mean of the Gaussian distribution. 125 | log_var : Theano tensor 126 | Log variance of the diagonal covariance Gaussian. 127 | eps : float 128 | Small number added to denominator to avoid NaNs. 129 | Returns 130 | ------- 131 | Theano tensor 132 | Element-wise log probability, this has to be summed for multi-variate distributions. 133 | See also 134 | -------- 135 | log_normal : using standard deviation parameterization 136 | log_normal1 : using variance parameterization 137 | """ 138 | return c - log_var/2 - (x - mean)**2 / (2 * T.exp(log_var) + eps) 139 | -------------------------------------------------------------------------------- /models/layers/sampling.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano.tensor as T 3 | import lasagne 4 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 5 | 6 | # ---------------------------------------------------------------------------- 7 | 8 | class GumbelSoftmax: 9 | def __init__(self, tau, softmax=True, eps=1e-6): 10 | """ 11 | Bottom of page 10. 12 | https://arxiv.org/pdf/1611.01144v2.pdf 13 | """ 14 | assert tau != 0 15 | self.temperature=tau 16 | self.eps=eps 17 | self.softmax=softmax 18 | self._srng = RandomStreams(lasagne.random.get_rng().randint(1, 2147462579)) 19 | 20 | def __call__(self, logits): 21 | # sample from Gumbel(0, 1) 22 | uniform = self._srng.uniform(logits.shape,low=0,high=1) 23 | gumbel = -T.log(-T.log(uniform + self.eps) + self.eps) 24 | 25 | # draw a sample from the Gumbel-Softmax distribution 26 | if self.softmax: 27 | return T.nnet.softmax((logits + gumbel) / self.temperature) 28 | else: 29 | return (logits + gumbel) / self.temperature 30 | 31 | class LogisticSigmoid: 32 | def __init__(self, tau, sigmoid=True, eps=1e-6): 33 | """ 34 | Bottom of page 10. 
35 | https://arxiv.org/pdf/1611.01144v2.pdf 36 | """ 37 | assert tau != 0 38 | self.temperature=tau 39 | self.eps=eps 40 | self.sigmoid=sigmoid 41 | self._srng = RandomStreams(lasagne.random.get_rng().randint(1, 2147462579)) 42 | 43 | def __call__(self, logits): 44 | # sample from Gumbel(0, 1) 45 | uniform = self._srng.uniform(logits.shape,low=0,high=1) 46 | logistic = T.log(uniform + self.eps) - T.log(1.-uniform + self.eps) 47 | 48 | # draw a sample from the logistic-sigmoid distribution 49 | if self.sigmoid: 50 | return T.nnet.sigmoid((logits + logistic) / self.temperature) 51 | else: 52 | return (logits + logistic) / self.temperature 53 | 54 | def onehot_argmax(logits): 55 | return T.extra_ops.to_one_hot(T.argmax(logits,-1),logits.shape[-1]) 56 | 57 | class GumbelSoftmaxSampleLayer(lasagne.layers.Layer): 58 | def __init__(self, incoming, tau, eps=1e-6, **kwargs): 59 | super(GumbelSoftmaxSampleLayer, self).__init__(incoming, **kwargs) 60 | self.gumbel_softmax = GumbelSoftmax(tau, eps=eps) 61 | 62 | def get_output_for(self, input, hard_max=False, **kwargs): 63 | if hard_max: 64 | return onehot_argmax(input) 65 | else: 66 | return self.gumbel_softmax(input) 67 | 68 | class GumbelSampleLayer(lasagne.layers.Layer): 69 | def __init__(self, incoming, tau, eps=1e-6, **kwargs): 70 | super(GumbelSampleLayer, self).__init__(incoming, **kwargs) 71 | self.gumbel_softmax = GumbelSoftmax(tau, eps=eps, softmax=False) 72 | 73 | def get_output_for(self, input, hard_max=False, **kwargs): 74 | return self.gumbel_softmax(input) 75 | 76 | class LogisticSampleLayer(lasagne.layers.Layer): 77 | def __init__(self, incoming, tau, eps=1e-6, **kwargs): 78 | super(LogisticSampleLayer, self).__init__(incoming, **kwargs) 79 | self.logistic_sigmoid = LogisticSigmoid(tau, eps=eps, sigmoid=False) 80 | 81 | def get_output_for(self, input, hard_max=False, **kwargs): 82 | return self.logistic_sigmoid(input) 83 | 84 | 85 | class GaussianSampleLayer(lasagne.layers.MergeLayer): 86 | def __init__(self, mu, logsigma, rng=None, **kwargs): 87 | self.rng = rng if rng else RandomStreams(lasagne.random.get_rng().randint(1,2147462579)) 88 | super(GaussianSampleLayer, self).__init__([mu, logsigma], **kwargs) 89 | 90 | def get_output_shape_for(self, input_shapes): 91 | return input_shapes[0] 92 | 93 | def get_output_for(self, inputs, deterministic=False, **kwargs): 94 | mu, logsigma = inputs 95 | shape=(self.input_shapes[0][0] or inputs[0].shape[0], 96 | self.input_shapes[0][1] or inputs[0].shape[1]) 97 | if deterministic: 98 | return mu 99 | return mu + T.exp(logsigma) * self.rng.normal(shape) 100 | 101 | 102 | class GaussianMultiSampleLayer(lasagne.layers.MergeLayer): 103 | def __init__(self, mu, logsigma, n_samples, rng=None, **kwargs): 104 | self.rng = rng if rng else RandomStreams(lasagne.random.get_rng().randint(1,2147462579)) 105 | self.n_samples = n_samples 106 | super(GaussianMultiSampleLayer, self).__init__([mu, logsigma], **kwargs) 107 | 108 | def get_output_shape_for(self, input_shapes): 109 | n_batch, n_dim = input_shapes[0] 110 | return (n_batch, self.n_samples, n_dim) 111 | 112 | def get_output_for(self, inputs, deterministic=False, **kwargs): 113 | mu, logsigma = inputs 114 | 115 | # get dimensions 116 | n_bat = self.input_shapes[0][0] or inputs[0].shape[0] 117 | n_dim = self.input_shapes[0][1] or inputs[0].shape[1] 118 | n_sam = self.n_samples 119 | 120 | # reshape inputs into rank-3 tensors 121 | mu3 = mu.reshape((n_bat,1,n_dim)).repeat(n_sam, axis=1) 122 | ls3 = logsigma.reshape((n_bat,1,n_dim)).repeat(n_sam, axis=1) 
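        # (added note) mu3 and ls3 now have shape (n_bat, n_sam, n_dim): the same mean and
        # log-sigma are broadcast across the n_sam Monte Carlo samples drawn below.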
123 | 124 | # return reshape means if layer is deterministic 125 | if deterministic: return mu3 126 | 127 | # otherwise, take samples 128 | shape = (n_bat, n_sam, n_dim) 129 | return mu3 + T.exp(ls3) * self.rng.normal(shape) 130 | 131 | 132 | class BernoulliSampleLayer(lasagne.layers.Layer): 133 | def __init__(self, mean, 134 | seed=lasagne.random.get_rng().randint(1, 2147462579), 135 | **kwargs): 136 | super(BernoulliSampleLayer, self).__init__(mean, **kwargs) 137 | self._srng = RandomStreams(seed) 138 | 139 | def seed(self, seed=lasagne.random.get_rng().randint(1, 2147462579)): 140 | self._srng.seed(seed) 141 | 142 | def get_output_shape_for(self, input_shape): 143 | return input_shape 144 | 145 | def get_output_for(self, mu, **kwargs): 146 | return self._srng.binomial(size=mu.shape, p=mu, dtype=mu.dtype) 147 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import pdb 2 | import argparse 3 | import matplotlib 4 | # Force matplotlib to not use any Xwindows backend. 5 | matplotlib.use('Agg') 6 | import matplotlib.pyplot as plt 7 | from util import data 8 | 9 | # ---------------------------------------------------------------------------- 10 | 11 | def make_parser(): 12 | parser = argparse.ArgumentParser() 13 | subparsers = parser.add_subparsers(title='Commands') 14 | 15 | # train 16 | train_parser = subparsers.add_parser('train', help='Train model') 17 | train_parser.set_defaults(func=train) 18 | 19 | train_parser.add_argument('--dataset', default='mnist') 20 | train_parser.add_argument('--model', default='vae') 21 | train_parser.add_argument('--pkl') 22 | train_parser.add_argument('-e', '--epochs', type=int, default=10) 23 | train_parser.add_argument('-l', '--logname', default='mnist-run') 24 | train_parser.add_argument('-p', '--plotname', default='mnist-plot.png') 25 | train_parser.add_argument('--alg', default='adam') 26 | train_parser.add_argument('--lr', type=float, default=1e-3) 27 | train_parser.add_argument('--b1', type=float, default=0.9) 28 | train_parser.add_argument('--b2', type=float, default=0.999) 29 | train_parser.add_argument('--n_batch', type=int, default=128) 30 | train_parser.add_argument('--n_superbatch', type=int, default=1280) 31 | 32 | return parser 33 | 34 | # ---------------------------------------------------------------------------- 35 | 36 | def train(args): 37 | import models 38 | import numpy as np 39 | np.random.seed(1234) 40 | 41 | if args.dataset == 'mnist': 42 | n_dim, n_out, n_channels = 28, 10, 1 43 | X_train, Y_train, X_val, Y_val, _, _ = data.load_mnist() 44 | elif args.dataset == 'binmnist': 45 | n_dim, n_out, n_channels = 28, 10, 1 46 | X_train, X_val, _ = data.load_mnist_binarized() 47 | X_train = X_train.reshape((-1, 1, 28, 28)) 48 | X_val = X_val.reshape((-1, 1, 28, 28)) 49 | Y_train = np.empty((X_train.shape[0],), dtype='int32') 50 | Y_val = np.empty((X_val.shape[0],), dtype='int32') 51 | elif args.dataset == 'omniglot': 52 | n_dim, n_out, n_channels = 28, 10, 1 53 | X_train, Y_train, X_val, Y_val = data.load_omniglot_iwae() 54 | X_train = X_train.reshape((-1, 1, 28, 28)) 55 | X_val = X_val.reshape((-1, 1, 28, 28)) 56 | Y_train = np.empty((X_train.shape[0],), dtype='int32') 57 | Y_val = np.empty((X_val.shape[0],), dtype='int32') 58 | elif args.dataset == 'digits': 59 | n_dim, n_out, n_channels = 8, 10, 1 60 | X_train, Y_train, X_val, Y_val, _, _ = data.load_digits() 61 | else: 62 | X_train, Y_train = 
data.load_h5(args.train) 63 | X_val, Y_val = data.load_h5(args.test) 64 | 65 | # also get the data dimensions 66 | print 'dataset loaded.' 67 | 68 | # set up optimization params 69 | p = { 'lr' : args.lr, 'b1': args.b1, 'b2': args.b2, 'nb': args.n_batch } 70 | 71 | # create model 72 | if args.model == 'vae': 73 | model = models.VAE( 74 | n_dim=n_dim, n_out=n_out, n_chan=n_channels, 75 | n_superbatch=args.n_superbatch, opt_alg=args.alg, opt_params=p, 76 | ) 77 | elif args.model == 'discrete-vae': 78 | model = models.SBN( 79 | n_dim=n_dim, n_out=n_out, n_chan=n_channels, 80 | n_superbatch=args.n_superbatch, opt_alg=args.alg, opt_params=p, 81 | ) 82 | elif args.model == 'discrete-vae-rbm': 83 | model = models.USBN( 84 | n_dim=n_dim, n_out=n_out, n_chan=n_channels, 85 | n_superbatch=args.n_superbatch, opt_alg=args.alg, opt_params=p, 86 | ) 87 | elif args.model == 'adgm': 88 | model = models.ADGM( 89 | n_dim=n_dim, n_out=n_out, n_chan=n_channels, 90 | n_superbatch=args.n_superbatch, opt_alg=args.alg, opt_params=p, 91 | ) 92 | elif args.model == 'discrete-adgm': 93 | model = models.DADGM( 94 | n_dim=n_dim, n_out=n_out, n_chan=n_channels, 95 | n_superbatch=args.n_superbatch, opt_alg=args.alg, opt_params=p, 96 | ) 97 | elif args.model == 'discrete-adgm-rbm': 98 | model = models.UDADGM( 99 | n_dim=n_dim, n_out=n_out, n_chan=n_channels, 100 | n_superbatch=args.n_superbatch, opt_alg=args.alg, opt_params=p, 101 | ) 102 | elif args.model == 'rbm': 103 | model = models.RBM( 104 | n_dim=n_dim, n_out=n_out, n_chan=n_channels, 105 | n_superbatch=args.n_superbatch, opt_alg=args.alg, opt_params=p, 106 | ) 107 | elif args.model == 'vrbm': 108 | model = models.VariationalRBM( 109 | n_dim=n_dim, n_out=n_out, n_chan=n_channels, 110 | n_superbatch=args.n_superbatch, opt_alg=args.alg, opt_params=p, 111 | ) 112 | elif args.model == 'avrbm': 113 | model = models.AuxiliaryVariationalRBM( 114 | n_dim=n_dim, n_out=n_out, n_chan=n_channels, 115 | n_superbatch=args.n_superbatch, opt_alg=args.alg, opt_params=p, 116 | ) 117 | else: 118 | raise ValueError('Invalid model') 119 | 120 | if args.pkl: 121 | model.load(args.pkl) 122 | 123 | # generate samples 124 | samples = model.hallucinate_chain() 125 | 126 | # plot them 127 | plt.figure(figsize=(5, 5)) 128 | plt.imshow(samples, cmap=plt.cm.gray, interpolation='none') 129 | plt.title('Hallucinated Samples') 130 | plt.tight_layout() 131 | plt.savefig(args.plotname) 132 | 133 | exit('hello') 134 | 135 | # train model 136 | model.fit( 137 | X_train, Y_train, X_val, Y_val, 138 | n_epoch=args.epochs, n_batch=args.n_batch, 139 | logname=args.logname 140 | ) 141 | 142 | # generate samples 143 | samples = model.hallucinate() 144 | 145 | # plot them 146 | plt.figure(figsize=(5, 5)) 147 | plt.imshow(samples, cmap=plt.cm.gray, interpolation='none') 148 | plt.title('Hallucinated Samples') 149 | plt.tight_layout() 150 | plt.savefig(args.plotname) 151 | 152 | def main(): 153 | parser = make_parser() 154 | args = parser.parse_args() 155 | args.func(args) 156 | 157 | if __name__ == '__main__': 158 | main() 159 | -------------------------------------------------------------------------------- /models/vae.py: -------------------------------------------------------------------------------- 1 | import pdb 2 | import time 3 | import pickle 4 | import numpy as np 5 | 6 | import theano 7 | import theano.tensor as T 8 | import lasagne 9 | 10 | from model import Model 11 | from layers import GaussianSampleLayer 12 | 13 | 14 | class VAE(Model): 15 | """Variational Autoencoder with Gaussian visible and 
latent variables""" 16 | def __init__( 17 | self, n_dim, n_out, n_chan=1, n_superbatch=12800, model='bernoulli', 18 | opt_alg='adam', opt_params={'lr' : 1e-3, 'b1': 0.9, 'b2': 0.99} 19 | ): 20 | # save model that wil be created 21 | self.model = model 22 | # invoke parent constructor 23 | Model.__init__(self, n_dim, n_chan, n_out, n_superbatch, opt_alg, opt_params) 24 | 25 | def create_model(self, X, Y, n_dim, n_out, n_chan=1): 26 | # params 27 | n_lat = 64 # latent stochastic variables 28 | n_hid = 100 # size of hidden layer in encoder/decoder 29 | n_out = n_dim * n_dim * n_chan # total dimensionality of output 30 | hid_nl = lasagne.nonlinearities.tanh if self.model == 'bernoulli' \ 31 | else T.nnet.softplus 32 | 33 | # create the encoder network 34 | l_q_in = lasagne.layers.InputLayer( 35 | shape=(None, n_chan, n_dim, n_dim), 36 | input_var=X, 37 | ) 38 | l_q_hid = lasagne.layers.DenseLayer( 39 | l_q_in, num_units=n_hid, 40 | W=lasagne.init.Orthogonal(), 41 | b=lasagne.init.Constant(0.0), 42 | nonlinearity=hid_nl, 43 | name='q_hid', 44 | ) 45 | l_q_mu = lasagne.layers.DenseLayer( 46 | l_q_hid, num_units=n_lat, 47 | W=lasagne.init.Orthogonal(), 48 | b=lasagne.init.Constant(0.0), 49 | nonlinearity=None, 50 | name='q_mu', 51 | ) 52 | l_q_logsigma = lasagne.layers.DenseLayer( 53 | l_q_hid, num_units=n_lat, 54 | W=lasagne.init.Orthogonal(), 55 | b=lasagne.init.Constant(0.0), 56 | nonlinearity=None, 57 | name='q_logsigma', 58 | ) 59 | l_q = GaussianSampleLayer(l_q_mu, l_q_logsigma) 60 | 61 | # create the decoder network 62 | l_p_in = lasagne.layers.InputLayer((None, n_lat)) 63 | l_p_hid = lasagne.layers.DenseLayer( 64 | l_p_in, num_units=n_hid, 65 | nonlinearity=hid_nl, 66 | W=lasagne.init.Orthogonal(), 67 | b=lasagne.init.Constant(0.0), 68 | name='p_hid', 69 | ) 70 | l_p_mu, l_p_logsigma = None, None 71 | 72 | if self.model == 'bernoulli': 73 | l_p_mu = lasagne.layers.DenseLayer( 74 | l_p_hid, num_units=n_out, 75 | nonlinearity = lasagne.nonlinearities.sigmoid, 76 | W=lasagne.init.Orthogonal(), 77 | b=lasagne.init.Constant(0.0), 78 | name='p_sigma', 79 | ) 80 | elif self.model == 'gaussian': 81 | l_p_mu = lasagne.layers.DenseLayer( 82 | l_p_hid, num_units=n_out, 83 | nonlinearity=None, 84 | ) 85 | 86 | # relu_shift is for numerical stability - if training data has any 87 | # dimensions where stdev=0, allowing logsigma to approach -inf 88 | # will cause the loss function to become NAN. 
So we set the limit 89 | # stdev >= exp(-1 * relu_shift) 90 | relu_shift = 10 91 | l_p_logsigma = lasagne.layers.DenseLayer( 92 | l_p_hid, num_units=n_out, 93 | nonlinearity = lambda a: T.nnet.relu(a+relu_shift)-relu_shift 94 | ) 95 | 96 | self.input_layers = (l_q_in, l_p_in) 97 | self.n_lat = n_lat 98 | self.n_hid = n_hid 99 | 100 | return l_p_mu, l_p_logsigma, l_q_mu, l_q_logsigma, l_q 101 | 102 | def create_objectives(self, deterministic=False): 103 | # load network input 104 | X = self.inputs[0] 105 | x = X.flatten(2) 106 | 107 | l_p_mu, l_p_logsigma, l_q_mu, l_q_logsigma, l_q = self.network 108 | l_q_in, l_p_in = self.input_layers 109 | 110 | # load network output 111 | q_mu, q_logsigma, q_sample = lasagne.layers.get_output( 112 | [l_q_mu, l_q_logsigma, l_q], 113 | deterministic=deterministic, 114 | ) 115 | if self.model == 'bernoulli': 116 | p_mu = lasagne.layers.get_output( 117 | l_p_mu, {l_p_in : q_sample}, 118 | deterministic=deterministic, 119 | ) 120 | elif self.model == 'gaussian': 121 | p_mu, p_logsigma = lasagne.layers.get_output( 122 | [l_p_mu, l_p_logsigma], 123 | {l_p_in : q_sample}, 124 | deterministic=deterministic, 125 | ) 126 | 127 | # first term of the ELBO: kl-divergence (using the closed form expression) 128 | kl_div = 0.5 * T.sum(1 + 2 * q_logsigma - T.sqr(q_mu) 129 | - T.exp(2 * q_logsigma), axis=1).mean() 130 | 131 | # second term: log-likelihood of the data under the model 132 | if self.model == 'bernoulli': 133 | logpxz = -lasagne.objectives.binary_crossentropy( 134 | p_mu, x, 135 | ).sum(axis=1).mean() 136 | elif self.model == 'gaussian': 137 | def log_lik(x, mu, log_sig): 138 | return T.sum(-(np.float32(0.5 * np.log(2 * np.pi)) + log_sig) 139 | - 0.5 * T.sqr(x - mu) / T.exp(2 * log_sig), axis=1) 140 | logpxz = log_lik(x, p_mu, p_logsigma).mean() 141 | 142 | loss = -1 * (logpxz + kl_div) 143 | # we don't use the spearate accuracy metric right now 144 | return loss, -kl_div 145 | 146 | def gen_samples(self, deterministic=False): 147 | s = self.inputs[-1] 148 | # put it through the decoder 149 | _, l_p_in = self.input_layers 150 | l_p_mu = self.network[0] 151 | p_mu = lasagne.layers.get_output(l_p_mu, {l_p_in : s}) 152 | 153 | return p_mu 154 | 155 | def gen_noise(self, size, n_lat): 156 | noise = np.random.randn(size, n_lat) 157 | return noise 158 | 159 | def get_params(self): 160 | if self.model == 'bernoulli': 161 | l_p_mu, _, _, _, l_q = self.network 162 | elif self.model == 'gaussian': 163 | raise NotImplementedError() 164 | p_params = lasagne.layers.get_all_params(l_p_mu, trainable=True) 165 | q_params = lasagne.layers.get_all_params(l_q, trainable=True) 166 | 167 | return p_params + q_params 168 | -------------------------------------------------------------------------------- /models/adgm.py: -------------------------------------------------------------------------------- 1 | import time 2 | import pickle 3 | import numpy as np 4 | 5 | import theano 6 | import theano.tensor as T 7 | import lasagne 8 | 9 | from model import Model 10 | from layers import GaussianSampleLayer, GaussianMultiSampleLayer 11 | from layers.shape import RepeatLayer 12 | from distributions import log_bernoulli, log_normal, log_normal2 13 | 14 | 15 | class ADGM(Model): 16 | """Auxiliary Deep Generative Model (unsupervised version)""" 17 | def __init__( 18 | self, n_dim, n_out, n_chan=1, n_superbatch=12800, model='bernoulli', 19 | opt_alg='adam', opt_params={'lr' : 1e-3, 'b1': 0.9, 'b2': 0.99} 20 | ): 21 | # save model that wil be created 22 | self.model = model 23 | self.n_sample = 
1 # adjustable parameter, though 1 works best in practice 24 | Model.__init__(self, n_dim, n_chan, n_out, n_superbatch, opt_alg, opt_params) 25 | 26 | def create_model(self, X, Y, n_dim, n_out, n_chan=1): 27 | # params 28 | n_lat = 200 # latent stochastic variables 29 | n_aux = 10 # auxiliary variables 30 | n_hid = 499 # size of hidden layer in encoder/decoder 31 | n_sam = self.n_sample # number of monte-carlo samples 32 | n_out = n_dim * n_dim * n_chan # total dimensionality of ouput 33 | hid_nl = lasagne.nonlinearities.rectify 34 | relu_shift = lambda av: T.nnet.relu(av + 10) - 10 # for numerical stability 35 | 36 | # create the encoder network 37 | # create q(a|x) 38 | l_qa_in = lasagne.layers.InputLayer( 39 | shape=(None, n_chan, n_dim, n_dim), 40 | input_var=X, 41 | ) 42 | l_qa_hid = lasagne.layers.DenseLayer( 43 | l_qa_in, num_units=n_hid, 44 | W=lasagne.init.GlorotNormal('relu'), 45 | b=lasagne.init.Normal(1e-3), 46 | nonlinearity=hid_nl, 47 | ) 48 | l_qa_mu = lasagne.layers.DenseLayer( 49 | l_qa_hid, num_units=n_aux, 50 | W=lasagne.init.GlorotNormal(), 51 | b=lasagne.init.Normal(1e-3), 52 | nonlinearity=None, 53 | ) 54 | l_qa_logsigma = lasagne.layers.DenseLayer( 55 | l_qa_hid, num_units=n_aux, 56 | W=lasagne.init.GlorotNormal(), 57 | b=lasagne.init.Normal(1e-3), 58 | nonlinearity=relu_shift, 59 | ) 60 | # repeatedly sample 61 | l_qa_mu = lasagne.layers.ReshapeLayer( 62 | RepeatLayer(l_qa_mu, n_ax=1, n_rep=n_sam), 63 | shape=(-1, n_aux), 64 | ) 65 | l_qa_logsigma = lasagne.layers.ReshapeLayer( 66 | RepeatLayer(l_qa_logsigma, n_ax=1, n_rep=n_sam), 67 | shape=(-1, n_aux), 68 | ) 69 | l_qa = GaussianSampleLayer(l_qa_mu, l_qa_logsigma) 70 | 71 | # create q(z|a,x) 72 | l_qz_hid1a = lasagne.layers.DenseLayer( 73 | l_qa, num_units=n_hid, 74 | W=lasagne.init.GlorotNormal('relu'), 75 | b=lasagne.init.Normal(1e-3), 76 | nonlinearity=hid_nl, 77 | ) 78 | l_qz_hid1b = lasagne.layers.DenseLayer( 79 | l_qa_in, num_units=n_hid, 80 | W=lasagne.init.GlorotNormal('relu'), 81 | b=lasagne.init.Normal(1e-3), 82 | nonlinearity=hid_nl, 83 | ) 84 | l_qz_hid1b = lasagne.layers.ReshapeLayer( 85 | RepeatLayer(l_qz_hid1b, n_ax=1, n_rep=n_sam), 86 | shape=(-1, n_hid), 87 | ) 88 | l_qz_hid2 = lasagne.layers.ElemwiseSumLayer([l_qz_hid1a, l_qz_hid1b]) 89 | l_qz_hid2 = lasagne.layers.NonlinearityLayer(l_qz_hid2, hid_nl) 90 | l_qz_mu = lasagne.layers.DenseLayer( 91 | l_qz_hid2, num_units=n_lat, 92 | W=lasagne.init.GlorotNormal(), 93 | b=lasagne.init.Normal(1e-3), 94 | nonlinearity=None, 95 | ) 96 | l_qz_logsigma = lasagne.layers.DenseLayer( 97 | l_qz_hid2, num_units=n_lat, 98 | W=lasagne.init.GlorotNormal(), 99 | b=lasagne.init.Normal(1e-3), 100 | nonlinearity=relu_shift, 101 | ) 102 | l_qz = GaussianSampleLayer(l_qz_mu, l_qz_logsigma) 103 | 104 | # create the decoder network 105 | # create p(x|z) 106 | l_px_in = lasagne.layers.InputLayer((None, n_lat)) 107 | l_px_hid = lasagne.layers.DenseLayer( 108 | l_px_in, num_units=n_hid, 109 | W=lasagne.init.GlorotNormal('relu'), 110 | b=lasagne.init.Normal(1e-3), 111 | nonlinearity=hid_nl, 112 | ) 113 | l_px_mu, l_px_logsigma = None, None 114 | 115 | if self.model == 'bernoulli': 116 | l_px_mu = lasagne.layers.DenseLayer( 117 | l_px_hid, num_units=n_out, 118 | nonlinearity = lasagne.nonlinearities.sigmoid, 119 | W=lasagne.init.GlorotUniform(), 120 | b=lasagne.init.Normal(1e-3), 121 | ) 122 | elif self.model == 'gaussian': 123 | l_px_mu = lasagne.layers.DenseLayer( 124 | l_px_hid, num_units=n_out, 125 | nonlinearity=None, 126 | ) 127 | l_px_logsigma = 
lasagne.layers.DenseLayer( 128 | l_px_hid, num_units=n_out, 129 | nonlinearity=relu_shift, 130 | ) 131 | 132 | # create p(a|z) 133 | l_pa_hid = lasagne.layers.DenseLayer( 134 | l_px_in, num_units=n_hid, 135 | nonlinearity=hid_nl, 136 | W=lasagne.init.GlorotNormal('relu'), 137 | b=lasagne.init.Normal(1e-3), 138 | ) 139 | l_pa_mu = lasagne.layers.DenseLayer( 140 | l_pa_hid, num_units=n_aux, 141 | W=lasagne.init.GlorotNormal(), 142 | b=lasagne.init.Normal(1e-3), 143 | nonlinearity=None, 144 | ) 145 | l_pa_logsigma = lasagne.layers.DenseLayer( 146 | l_pa_hid, num_units=n_aux, 147 | W=lasagne.init.GlorotNormal(), 148 | b=lasagne.init.Normal(1e-3), 149 | nonlinearity=relu_shift, 150 | ) 151 | 152 | self.input_layers = (l_qa_in, l_px_in) 153 | self.n_lat = n_lat 154 | self.n_hid = n_hid 155 | 156 | return l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \ 157 | l_qz_mu, l_qz_logsigma, l_qa_mu, l_qa_logsigma, \ 158 | l_qa, l_qz 159 | 160 | def create_objectives(self, deterministic=False): 161 | # load network input 162 | X = self.inputs[0] 163 | x = X.flatten(2) 164 | 165 | # duplicate entries to take into account multiple mc samples 166 | n_sam = self.n_sample 167 | n_out = x.shape[1] 168 | x = x.dimshuffle(0,'x',1).repeat(n_sam, axis=1).reshape((-1, n_out)) 169 | 170 | # load network 171 | l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \ 172 | l_qz_mu, l_qz_logsigma, l_qa_mu, l_qa_logsigma, \ 173 | l_qa, l_qz = self.network 174 | l_qa_in, l_px_in = self.input_layers 175 | 176 | # load network output 177 | qz_mu, qz_logsigma, qa_mu, qa_logsigma, a, z \ 178 | = lasagne.layers.get_output( 179 | [l_qz_mu, l_qz_logsigma, l_qa_mu, l_qa_logsigma, l_qa, l_qz], 180 | deterministic=deterministic, 181 | ) 182 | pa_mu, pa_logsigma = lasagne.layers.get_output( 183 | [l_pa_mu, l_pa_logsigma], 184 | {l_px_in: z}, 185 | deterministic=deterministic, 186 | ) 187 | 188 | if self.model == 'bernoulli': 189 | px_mu = lasagne.layers.get_output( 190 | l_px_mu, {l_px_in: z}, 191 | deterministic=deterministic 192 | ) 193 | elif self.model == 'gaussian': 194 | px_mu, px_logsigma = lasagne.layers.get_output( 195 | [l_px_mu, l_px_logsigma], 196 | {l_px_in: z}, 197 | deterministic=deterministic, 198 | ) 199 | 200 | # entropy term 201 | log_qa_given_x = log_normal2(a, qa_mu, qa_logsigma).sum(axis=1) 202 | log_qz_given_ax = log_normal2(z, qz_mu, qz_logsigma).sum(axis=1) 203 | log_qza_given_x = log_qz_given_ax + log_qa_given_x 204 | 205 | # log-probability term 206 | z_prior_sigma = T.cast(T.ones_like(qz_logsigma), dtype=theano.config.floatX) 207 | z_prior_mu = T.cast(T.zeros_like(qz_mu), dtype=theano.config.floatX) 208 | log_pz = log_normal(z, z_prior_mu, z_prior_sigma).sum(axis=1) 209 | log_pa_given_z = log_normal2(a, pa_mu, pa_logsigma).sum(axis=1) 210 | 211 | if self.model == 'bernoulli': 212 | log_px_given_z = log_bernoulli(x, px_mu).sum(axis=1) 213 | elif self.model == 'gaussian': 214 | log_px_given_z = log_normal2(x, px_mu, px_logsigma).sum(axis=1) 215 | 216 | log_paxz = log_pa_given_z + log_px_given_z + log_pz 217 | 218 | # compute the evidence lower bound 219 | elbo = T.mean(log_paxz - log_qza_given_x) 220 | 221 | # we don't use a spearate accuracy metric right now 222 | return -elbo, T.mean(qz_logsigma) 223 | 224 | def create_gradients(self, loss, deterministic=False): 225 | grads = Model.create_gradients(self, loss, deterministic) 226 | 227 | # combine and clip gradients 228 | clip_grad = 1 229 | max_norm = 5 230 | mgrads = lasagne.updates.total_norm_constraint(grads, max_norm=max_norm) 231 | cgrads = [T.clip(g, 
-clip_grad, clip_grad) for g in mgrads] 232 | 233 | return cgrads 234 | 235 | def gen_samples(self, deterministic=False): 236 | s = self.inputs[-1] 237 | # put it through the decoder 238 | _, l_px_in = self.input_layers 239 | l_px_mu = self.network[0] 240 | px_mu = lasagne.layers.get_output(l_px_mu, {l_px_in : s}) 241 | 242 | return px_mu 243 | 244 | def gen_noise(self, size, n_lat): 245 | noise = np.random.randn(size, n_lat) 246 | return noise 247 | 248 | def get_params(self): 249 | l_px_mu, _, l_pa_mu, l_pa_logsigma, \ 250 | _, _, _, _, l_qa, l_qz = self.network 251 | 252 | p_params = lasagne.layers.get_all_params( 253 | [l_px_mu, l_pa_mu, l_pa_logsigma], 254 | trainable=True, 255 | ) 256 | qa_params = lasagne.layers.get_all_params(l_qa, trainable=True) 257 | qz_params = lasagne.layers.get_all_params(l_qz, trainable=True) 258 | 259 | return p_params + qa_params + qz_params 260 | -------------------------------------------------------------------------------- /models/sbn.py: -------------------------------------------------------------------------------- 1 | import time 2 | import pickle 3 | import numpy as np 4 | from collections import OrderedDict 5 | 6 | import theano 7 | import theano.tensor as T 8 | from theano.gradient import disconnected_grad as dg 9 | import lasagne 10 | 11 | from model import Model 12 | 13 | from layers import BernoulliSampleLayer 14 | from distributions import log_bernoulli 15 | from helpers import Tlogsumexp 16 | 17 | from theano.tensor.shared_randomstreams import RandomStreams 18 | 19 | # ---------------------------------------------------------------------------- 20 | 21 | class SBN(Model): 22 | """Sigmoid Belief Network trained using Neural Variational Inference 23 | Epoch 200 of 200 took 26.052s (192 minibatches) 24 | training loss/acc: 125.989901 107.652437 25 | validation loss/acc: 126.220432 108.006230 26 | """ 27 | 28 | def __init__( 29 | self, n_dim, n_out, n_chan=1, n_superbatch=12800, 30 | opt_alg='adam', opt_params={'lr': 1e-3, 'b1': 0.9, 'b2': 0.99} 31 | ): 32 | # invoke parent constructor 33 | Model.__init__(self, n_dim, n_chan, n_out, n_superbatch, opt_alg, opt_params) 34 | 35 | # random number generators 36 | self.numpy_rng = np.random.RandomState(1234) 37 | self.theano_rng = RandomStreams(self.numpy_rng.randint(2 ** 30)) 38 | 39 | (loss, acc) = self.objectives 40 | llik = self.create_llik() 41 | (X, Y, idx1, idx2, S) = self.inputs 42 | # self.llik = theano.function( 43 | # [idx1, idx2], llik, updates=self.updates, 44 | # {X: self.train_set_x[idx1:idx2], Y: self.train_set_y[idx1:idx2]} 45 | # ) 46 | self.loss = theano.function( 47 | [X, Y], [loss, llik], 48 | on_unused_input='warn', 49 | ) 50 | 51 | def create_model(self, X, Y, n_dim, n_out, n_chan=1): 52 | # params 53 | n_lat = 200 # latent stochastic variables 54 | n_hid = 500 # size of hidden layer in encoder/decoder 55 | n_hid_cv = 500 # size of hidden layer in control variate net 56 | n_out = n_dim * n_dim * n_chan # total dimensionality of ouput 57 | hid_nl = lasagne.nonlinearities.tanh 58 | 59 | # create the encoder network 60 | l_q_in = lasagne.layers.InputLayer( 61 | shape=(None, n_chan, n_dim, n_dim), 62 | input_var=X, 63 | ) 64 | l_q_hid = lasagne.layers.DenseLayer( 65 | l_q_in, num_units=n_hid, 66 | nonlinearity=hid_nl, 67 | ) 68 | l_q_out = lasagne.layers.DenseLayer( 69 | l_q_hid, num_units=n_lat, 70 | nonlinearity=None, 71 | ) 72 | l_q_mu = lasagne.layers.DenseLayer( 73 | l_q_hid, num_units=n_lat, 74 | nonlinearity=T.nnet.sigmoid, 75 | ) 76 | l_q_sample = 
BernoulliSampleLayer(l_q_mu) 77 | 78 | # create the decoder network 79 | # note that we currently only handle Bernoulli x variables 80 | l_p_in = lasagne.layers.InputLayer((None, n_lat)) 81 | l_p_hid = lasagne.layers.DenseLayer( 82 | l_p_in, num_units=n_hid, 83 | nonlinearity=hid_nl, 84 | W=lasagne.init.GlorotUniform(), 85 | ) 86 | l_p_mu = lasagne.layers.DenseLayer(l_p_hid, num_units=n_out, 87 | nonlinearity = lasagne.nonlinearities.sigmoid, 88 | W=lasagne.init.GlorotUniform(), 89 | b=lasagne.init.Constant(0.), 90 | ) 91 | 92 | # create control variate (baseline) network 93 | l_cv_in = lasagne.layers.InputLayer( 94 | shape=(None, n_chan, n_dim, n_dim), 95 | input_var=X, 96 | ) 97 | l_cv_hid = lasagne.layers.DenseLayer( 98 | l_cv_in, num_units=n_hid_cv, 99 | nonlinearity=hid_nl, 100 | ) 101 | l_cv = lasagne.layers.DenseLayer( 102 | l_cv_hid, num_units=1, 103 | nonlinearity=None, 104 | ) 105 | 106 | # create variables for centering signal 107 | c = theano.shared(np.zeros((1,1), dtype=np.float64), broadcastable=(True,True)) 108 | v = theano.shared(np.zeros((1,1), dtype=np.float64), broadcastable=(True,True)) 109 | 110 | self.input_layers = (l_q_in, l_p_in, l_cv_in) 111 | self.n_lat = n_lat 112 | self.n_hid = n_hid 113 | 114 | return l_p_mu, l_q_mu, l_q_sample, l_cv, c, v 115 | 116 | def _create_components(self, deterministic=False): 117 | # load network input 118 | X = self.inputs[0] 119 | x = X.flatten(2) 120 | 121 | # load networks 122 | l_p_mu, l_q_mu, l_q_sample, _, _, _ = self.network 123 | l_q_in, l_p_in, l_cv_in = self.input_layers 124 | 125 | # load network output 126 | z, q_mu = lasagne.layers.get_output( 127 | [l_q_sample, l_q_mu], deterministic=deterministic) 128 | p_mu = lasagne.layers.get_output( 129 | l_p_mu, {l_p_in: z}, 130 | deterministic=deterministic, 131 | ) 132 | 133 | # entropy term 134 | log_qz_given_x = log_bernoulli(dg(z), q_mu).sum(axis=1) 135 | 136 | # expected p(x,z) term 137 | z_prior = T.ones_like(z)*np.float32(0.5) 138 | log_pz = log_bernoulli(z, z_prior).sum(axis=1) 139 | log_px_given_z = log_bernoulli(x, p_mu).sum(axis=1) 140 | log_pxz = log_pz + log_px_given_z 141 | 142 | # save them for later 143 | self.log_pxz = log_pxz 144 | self.log_qz_given_x = log_qz_given_x 145 | 146 | return log_pxz.flatten(), log_qz_given_x.flatten() 147 | 148 | def create_objectives(self, deterministic=False): 149 | # load probabilities 150 | log_pxz, log_qz_given_x = self._create_components(deterministic=deterministic) 151 | 152 | # compute the lower bound 153 | elbo = T.mean(log_pxz - log_qz_given_x) 154 | 155 | # we don't use the second accuracy metric right now 156 | return -elbo, -T.mean(log_qz_given_x) 157 | 158 | def create_gradients(self, loss, deterministic=False): 159 | from theano.gradient import disconnected_grad as dg 160 | 161 | # load networks 162 | l_p_mu, l_q_mu, _, l_cv, c, v = self.network 163 | 164 | # load params 165 | p_params = lasagne.layers.get_all_params(l_p_mu, trainable=True) 166 | q_params = lasagne.layers.get_all_params(l_q_mu, trainable=True) 167 | cv_params = lasagne.layers.get_all_params(l_cv, trainable=True) 168 | 169 | # load neural net outputs (probabilities have been precomputed) 170 | log_pxz, log_qz_given_x = self.log_pxz, self.log_qz_given_x 171 | cv = T.addbroadcast(lasagne.layers.get_output(l_cv),1) 172 | 173 | # compute learning signals 174 | l = log_pxz - log_qz_given_x - cv 175 | l_avg, l_var = l.mean(), l.var() 176 | c_new = 0.8*c + 0.2*l_avg 177 | v_new = 0.8*v + 0.2*l_var 178 | l = (l - c_new) / T.maximum(1, T.sqrt(v_new)) 179 | 180 | 
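        # (added note) l above is the NVIL-style learning signal: the ELBO integrand
        # log p(x,z) - log q(z|x) minus the input-dependent baseline cv, centered and scaled
        # by running estimates (c, v) of its mean and variance; it multiplies log q(z|x)
        # below to form a variance-reduced score-function (REINFORCE) gradient for q.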
# compute grad wrt p 181 | p_grads = T.grad(-log_pxz.mean(), p_params) 182 | 183 | # compute grad wrt q 184 | q_target = T.mean(dg(l) * log_qz_given_x) 185 | q_grads = T.grad(-0.2*q_target, q_params) # 5x slower rate for q 186 | 187 | # compute grad of cv net 188 | cv_target = T.mean(l**2) 189 | cv_grads = T.grad(cv_target, cv_params) 190 | 191 | # combine and clip gradients 192 | clip_grad = 1 193 | max_norm = 5 194 | grads = p_grads + q_grads + cv_grads 195 | mgrads = lasagne.updates.total_norm_constraint(grads, max_norm=max_norm) 196 | cgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads] 197 | 198 | return cgrads 199 | 200 | def gen_samples(self, deterministic=False): 201 | s = self.inputs[-1] 202 | # put it through the decoder 203 | _, l_p_in, _ = self.input_layers 204 | l_p_mu = self.network[0] 205 | p_mu = lasagne.layers.get_output(l_p_mu, {l_p_in : s}) 206 | 207 | return p_mu 208 | 209 | def get_params(self): 210 | l_p_mu, l_q_mu, _, l_cv, _, _ = self.network 211 | p_params = lasagne.layers.get_all_params(l_p_mu, trainable=True) 212 | q_params = lasagne.layers.get_all_params(l_q_mu, trainable=True) 213 | cv_params = lasagne.layers.get_all_params(l_cv, trainable=True) 214 | return p_params + q_params + cv_params #+ [c] 215 | 216 | def create_updates(self, grads, params, alpha, opt_alg, opt_params): 217 | # call super-class to generate SGD/ADAM updates 218 | grad_updates = Model.create_updates( 219 | self, grads, params, alpha, opt_alg, opt_params, 220 | ) 221 | 222 | # create updates for centering signal 223 | 224 | # load neural net outputs (probabilities have been precomputed) 225 | _, _, _, l_cv, c, v = self.network 226 | log_pxz, log_qz_given_x = self.log_pxz, self.log_qz_given_x 227 | cv = T.addbroadcast(lasagne.layers.get_output(l_cv),1) 228 | 229 | # compute learning signals 230 | l = log_pxz - log_qz_given_x - cv 231 | l_avg, l_var = l.mean(), l.var() 232 | c_new = 0.8*c + 0.2*l_avg 233 | v_new = 0.8*v + 0.2*l_var 234 | 235 | # compute update for centering signal 236 | cv_updates = {c: c_new, v: v_new} 237 | 238 | return OrderedDict(grad_updates.items() + cv_updates.items()) 239 | 240 | def create_llik(self): 241 | # load inputs 242 | X = self.inputs[0] 243 | x = X.flatten(2) 244 | 245 | # load network params 246 | n_cat = self.n_lat 247 | n_rep = 10 248 | 249 | # load networks 250 | l_p_mu, l_q_mu, l_q_sample, _, _, _ = self.network 251 | l_q_in, l_p_in, l_cv_in = self.input_layers 252 | 253 | # load network output 254 | q_mu = lasagne.layers.get_output(l_q_mu) 255 | 256 | q_mu_rep = T.tile( q_mu.dimshuffle((0,'x',1)), reps=(1,n_rep,1) ) # (n_bat, n_rep, n_cat) 257 | q_sample_hard = self.theano_rng.binomial(size=q_mu_rep.shape, p=q_mu_rep, dtype=q_mu_rep.dtype) # (n_bat, n_rep, n_cat) 258 | q_sample_hard2 = q_sample_hard.reshape([100*n_rep, n_cat]) # (n_bat*n_rep, n_cat) 259 | 260 | p_mu = lasagne.layers.get_output(l_p_mu, {l_p_in: q_sample_hard2}) # (n_bat*n_rep, 784) 261 | x_rep = T.tile( x.dimshuffle((0,'x',1)), reps=(1,n_rep,1) ) # (n_bat, n_rep, 784) 262 | p_mu = T.reshape(p_mu, (100, n_rep, 784)) # (n_bat, n_rep, 784) 263 | 264 | # define the loss components 265 | log_p_x = log_bernoulli(x_rep, p_mu).sum(axis=2) # (n_bat, n_rep) 266 | z_prior = T.ones_like(q_sample_hard)*np.float32(0.5) # (n_bat, n_rep, n_cat) 267 | log_p_z = log_bernoulli(q_sample_hard, z_prior).sum(axis=2) # (n_bat, n_rep) 268 | log_q_z = log_bernoulli(q_sample_hard, q_mu_rep).sum(axis=2) # (n_bat, n_rep) 269 | 270 | # compute loss 271 | llik = Tlogsumexp( log_p_x + log_p_z - log_q_z, axis=1) 
# (n_bat,) 272 | 273 | return T.mean(llik) 274 | -------------------------------------------------------------------------------- /util/data.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import pickle 4 | import tarfile 5 | import numpy as np 6 | import urllib 7 | import zipfile 8 | import fnmatch 9 | import shutil 10 | import gzip 11 | import cPickle as cPkl 12 | import pickle as pkl 13 | 14 | 15 | def whiten(X_train, X_valid): 16 | offset = np.mean(X_train, 0) 17 | scale = np.std(X_train, 0).clip(min=1) 18 | X_train = (X_train - offset) / scale 19 | X_valid = (X_valid - offset) / scale 20 | return X_train, X_valid 21 | 22 | 23 | def load_cifar10(): 24 | """Download and extract the tarball from Alex's website.""" 25 | dest_directory = '.' 26 | DATA_URL = 'http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz' 27 | if not os.path.exists(dest_directory): 28 | os.makedirs(dest_directory) 29 | filename = DATA_URL.split('/')[-1] 30 | filepath = os.path.join(dest_directory, filename) 31 | if not os.path.exists(filepath): 32 | if sys.version_info[0] == 2: 33 | from urllib import urlretrieve 34 | else: 35 | from urllib.request import urlretrieve 36 | 37 | def _progress(count, block_size, total_size): 38 | sys.stdout.write('\r>> Downloading %s %.1f%%' % (filename, 39 | float(count * block_size) / float(total_size) * 100.0)) 40 | sys.stdout.flush() 41 | 42 | filepath, _ = urlretrieve(DATA_URL, filepath, _progress) 43 | print() 44 | statinfo = os.stat(filepath) 45 | print('Successfully downloaded', filename, statinfo.st_size, 'bytes.') 46 | tarfile.open(filepath, 'r:gz').extractall(dest_directory) 47 | 48 | def load_CIFAR_batch(filename): 49 | """ load single batch of cifar """ 50 | with open(filename, 'rb') as f: 51 | datadict = pickle.load(f) 52 | X = datadict['data'] 53 | Y = datadict['labels'] 54 | X = X.reshape(10000, 3, 32, 32).astype("float32") 55 | Y = np.array(Y, dtype=np.uint8) 56 | return X, Y 57 | 58 | xs, ys = [], [] 59 | for b in range(1,6): 60 | f = 'cifar-10-batches-py/data_batch_%d' % b 61 | X, Y = load_CIFAR_batch(f) 62 | xs.append(X) 63 | ys.append(Y) 64 | 65 | Xtr = np.concatenate(xs) 66 | Ytr = np.concatenate(ys) 67 | del X, Y 68 | Xte, Yte = load_CIFAR_batch('cifar-10-batches-py/test_batch') 69 | 70 | return Xtr, Ytr, Xte, Yte 71 | 72 | def load_mnist(): 73 | # We first define a download function, supporting both Python 2 and 3. 74 | if sys.version_info[0] == 2: 75 | from urllib import urlretrieve 76 | else: 77 | from urllib.request import urlretrieve 78 | 79 | def download(filename, source='http://yann.lecun.com/exdb/mnist/'): 80 | print "Downloading %s" % filename 81 | urlretrieve(source + filename, filename) 82 | 83 | # We then define functions for loading MNIST images and labels. 84 | # For convenience, they also download the requested files if needed. 85 | import gzip 86 | 87 | def load_mnist_images(filename): 88 | if not os.path.exists(filename): 89 | download(filename) 90 | 91 | # Read the inputs in Yann LeCun's binary format. 92 | with gzip.open(filename, 'rb') as f: 93 | data = np.frombuffer(f.read(), np.uint8, offset=16) 94 | 95 | # The inputs are vectors now, we reshape them to monochrome 2D images, 96 | # following the shape convention: (examples, channels, rows, columns) 97 | data = data.reshape(-1, 1, 28, 28) 98 | # The inputs come as bytes, we convert them to float32 in range [0,1]. 
99 | # (Actually to range [0, 255/256], for compatibility to the version 100 | # provided at http://deeplearning.net/data/mnist/mnist.pkl.gz.) 101 | return data / np.float32(256) 102 | 103 | def load_mnist_labels(filename): 104 | if not os.path.exists(filename): 105 | download(filename) 106 | # Read the labels in Yann LeCun's binary format. 107 | with gzip.open(filename, 'rb') as f: 108 | data = np.frombuffer(f.read(), np.uint8, offset=8) 109 | # The labels are vectors of integers now, that's exactly what we want. 110 | return data 111 | 112 | # We can now download and read the training and test set images and labels. 113 | X_train = load_mnist_images('train-images-idx3-ubyte.gz') 114 | y_train = load_mnist_labels('train-labels-idx1-ubyte.gz') 115 | X_test = load_mnist_images('t10k-images-idx3-ubyte.gz') 116 | y_test = load_mnist_labels('t10k-labels-idx1-ubyte.gz') 117 | 118 | # We reserve the last 10000 training examples for validation. 119 | X_train, X_val = X_train[:-10000], X_train[-10000:] 120 | y_train, y_val = y_train[:-10000], y_train[-10000:] 121 | 122 | # We just return all the arrays in order, as expected in main(). 123 | # (It doesn't matter how we do this as long as we can read them again.) 124 | return X_train, y_train, X_val, y_val, X_test, y_test 125 | 126 | def _download_omniglot_iwae(dataset): 127 | """ 128 | Download the MNIST dataset if it is not present. 129 | :return: The train, test and validation set. 130 | """ 131 | origin = ( 132 | 'https://github.com/yburda/iwae/raw/' 133 | 'master/datasets/OMNIGLOT/chardata.mat' 134 | ) 135 | print 'Downloading data from %s' % origin 136 | urllib.urlretrieve(origin, dataset + '/chardata.mat') 137 | 138 | def load_omniglot_iwae(dataset='./omniglot_iwae'): 139 | ''' 140 | Loads the real valued MNIST dataset 141 | :param dataset: path to dataset file 142 | :return: None 143 | ''' 144 | from scipy.io import loadmat 145 | if not os.path.exists(dataset): 146 | os.makedirs(dataset) 147 | _download_omniglot_iwae(dataset) 148 | 149 | data = loadmat(dataset+'/chardata.mat') 150 | 151 | train_x = data['data'].astype('float32').T 152 | train_t = np.argmax(data['target'].astype('float32').T,axis=1) 153 | train_char = data['targetchar'].astype('float32') 154 | test_x = data['testdata'].astype('float32').T 155 | test_t = np.argmax(data['testtarget'].astype('float32').T,axis=1) 156 | test_char = data['testtargetchar'].astype('float32') 157 | 158 | 159 | return train_x, train_t, test_x, test_t 160 | 161 | def _download_mnist_binarized(datapath): 162 | """ 163 | Download the fized binzarized MNIST dataset if it is not present. 164 | :return: The train, test and validation set. 165 | """ 166 | datafiles = { 167 | "train": "http://www.cs.toronto.edu/~larocheh/public/" 168 | "datasets/binarized_mnist/binarized_mnist_train.amat", 169 | "valid": "http://www.cs.toronto.edu/~larocheh/public/datasets/" 170 | "binarized_mnist/binarized_mnist_valid.amat", 171 | "test": "http://www.cs.toronto.edu/~larocheh/public/datasets/" 172 | "binarized_mnist/binarized_mnist_test.amat" 173 | } 174 | datasplits = {} 175 | for split in datafiles.keys(): 176 | print "Downloading %s data..." 
%(split) 177 | local_file = datapath + '/binarized_mnist_%s.npy'%(split) 178 | datasplits[split] = np.loadtxt(urllib.urlretrieve(datafiles[split])[0]) 179 | 180 | f = gzip.open(datapath +'/mnist.pkl.gz', 'w') 181 | pkl.dump([datasplits['train'],datasplits['valid'],datasplits['test']],f) 182 | 183 | # def load_mnist_binarized(dataset='./mnist_binarized/mnist.pkl.gz'): 184 | def load_mnist_binarized(dataset='./mnist_binarized/mnist.pkl'): 185 | ''' 186 | Loads the fixed binarized MNIST dataset provided by Hugo Larochelle. 187 | :param dataset: path to dataset file 188 | :return: None 189 | ''' 190 | if not os.path.isfile(dataset): 191 | datasetfolder = os.path.dirname(dataset) 192 | if not os.path.exists(datasetfolder): 193 | os.makedirs(datasetfolder) 194 | _download_mnist_binarized(datasetfolder) 195 | 196 | # f = gzip.open(dataset, 'rb') 197 | f = open(dataset, 'rb') 198 | x_train, x_valid, x_test = pkl.load(f) 199 | f.close() 200 | 201 | x_train = x_train.astype('float32') 202 | x_valid = x_valid.astype('float32') 203 | x_test = x_test.astype('float32') 204 | 205 | return x_train, x_valid, x_test 206 | 207 | def load_digits(): 208 | from sklearn.datasets import load_digits as _load_digits 209 | from sklearn.cross_validation import train_test_split 210 | 211 | digits = _load_digits() 212 | X = np.asarray(digits.data, 'float32') 213 | X, Y = nudge_dataset(X, digits.target) 214 | X = (X - np.min(X, 0)) / (np.max(X, 0) + 0.0001) # 0-1 scaling 215 | n, d2 = X.shape 216 | d = int(np.sqrt(d2)) 217 | X = X.reshape((n,1,d,d)) 218 | Y = np.array(Y, dtype=np.uint8) 219 | 220 | X_train, X_val, y_train, y_val = train_test_split(X, Y, 221 | test_size=0.2, 222 | random_state=0) 223 | # TODO: We don't use these right now 224 | X_test, y_test = None, None 225 | 226 | # We just return all the arrays in order, as expected in main(). 227 | # (It doesn't matter how we do this as long as we can read them again.) 228 | return X_train, y_train, X_val, y_val, X_test, y_test 229 | 230 | # def load_digits(): 231 | # from sklearn.datasets import load_digits as _load_digits 232 | # data = _load_digits() 233 | # X, y = data.images.astype('float32'), data.target.astype('int32') 234 | # X = X[:, np.newaxis, :, :] 235 | 236 | # # We reserve the last 300 / ~1800 for validation. 237 | # X_train, X_val = X[:-300], X[-300:] 238 | # y_train, y_val = y[:-300], y[-300:] 239 | 240 | # # TODO: We don't use these right now 241 | # X_test, y_test = None, None 242 | 243 | # # We just return all the arrays in order, as expected in main(). 244 | # # (It doesn't matter how we do this as long as we can read them again.) 
245 | # return X_train, y_train, X_val, y_val, X_test, y_test 246 | 247 | 248 | def split_semisup(X, y, n_lbl): 249 | n_tot = len(X) 250 | idx = np.random.permutation(n_tot) 251 | 252 | X_lbl = X[idx[:n_lbl]].copy() 253 | X_unl = X[idx[n_lbl:]].copy() 254 | y_lbl = y[idx[:n_lbl]].copy() 255 | y_unl = y[idx[n_lbl:]].copy() 256 | 257 | return X_lbl, y_lbl, X_unl, y_unl 258 | 259 | 260 | def load_noise(n=100,d=5): 261 | """For debugging""" 262 | X = np.random.randint(2,size=(n,1,d,d)).astype('float32') 263 | Y = np.random.randint(2,size=(n,)).astype(np.uint8) 264 | 265 | return X, Y 266 | 267 | 268 | def load_h5(h5_path): 269 | """This was untested""" 270 | import h5py 271 | # load training data 272 | with h5py.File(h5_path, 'r') as hf: 273 | print 'List of arrays in input file:', hf.keys() 274 | X = np.array(hf.get('data')) 275 | Y = np.array(hf.get('label')) 276 | print 'Shape of X: \n', X.shape 277 | print 'Shape of Y: \n', Y.shape 278 | 279 | return X, Y 280 | 281 | 282 | def nudge_dataset(X, Y): 283 | """ 284 | This produces a dataset 5 times bigger than the original one, 285 | by moving the 8x8 images in X around by 1px to left, right, down, up 286 | """ 287 | from scipy.ndimage import convolve 288 | direction_vectors = [ 289 | [[0, 1, 0], 290 | [0, 0, 0], 291 | [0, 0, 0]], 292 | 293 | [[0, 0, 0], 294 | [1, 0, 0], 295 | [0, 0, 0]], 296 | 297 | [[0, 0, 0], 298 | [0, 0, 1], 299 | [0, 0, 0]], 300 | 301 | [[0, 0, 0], 302 | [0, 0, 0], 303 | [0, 1, 0]]] 304 | 305 | shift = lambda x, w: convolve(x.reshape((8, 8)), mode='constant', 306 | weights=w).ravel() 307 | X = np.concatenate([X] + 308 | [np.apply_along_axis(shift, 1, X, vector) 309 | for vector in direction_vectors]) 310 | Y = np.concatenate([Y for _ in range(5)], axis=0) 311 | return X, Y 312 | -------------------------------------------------------------------------------- /models/model.py: -------------------------------------------------------------------------------- 1 | import time 2 | import pickle 3 | from collections import OrderedDict 4 | 5 | import numpy as np 6 | import theano 7 | import theano.tensor as T 8 | import lasagne 9 | 10 | from helpers import ( 11 | iterate_minibatch_idx, 12 | iterate_minibatches, 13 | evaluate, 14 | log_metrics, 15 | ) 16 | 17 | from theano.compile.nanguardmode import NanGuardMode 18 | 19 | 20 | class Model(object): 21 | """Model superclass that includes training code""" 22 | def __init__( 23 | self, n_dim, n_chan, n_out, n_superbatch, opt_alg, opt_params 24 | ): 25 | # create shared data variables 26 | train_set_x = theano.shared(np.empty( 27 | (n_superbatch, n_chan, n_dim, n_dim), 28 | dtype=theano.config.floatX), 29 | borrow=False, 30 | ) 31 | val_set_x = theano.shared(np.empty( 32 | (n_superbatch, n_chan, n_dim, n_dim), 33 | dtype=theano.config.floatX), 34 | borrow=False, 35 | ) 36 | 37 | # create y-variables 38 | train_set_y = theano.shared(np.empty( 39 | (n_superbatch,), dtype='int32'), borrow=False) 40 | val_set_y = theano.shared(np.empty( 41 | (n_superbatch,), dtype='int32'), borrow=False) 42 | # train_set_y_int = T.cast(train_set_y, 'int32') 43 | # val_set_y_int = T.cast(val_set_y, 'int32') 44 | 45 | # save data variables 46 | self.train_set_x = train_set_x 47 | self.train_set_y = train_set_y 48 | self.val_set_x = val_set_x 49 | self.val_set_y = val_set_y 50 | self.data_loaded = False 51 | 52 | # create input vars 53 | (X, Y, idx1, idx2, S) = self.create_inputs() 54 | self.inputs = (X, Y, idx1, idx2, S) 55 | 56 | # create lasagne model 57 | self.network = self.create_model(X, Y, n_dim, 
n_out, n_chan) 58 | 59 | # create objectives 60 | loss, acc = self.create_objectives(deterministic=False) 61 | loss_test, acc_test = self.create_objectives(deterministic=True) 62 | self.objectives = (loss, acc) 63 | self.objectives_test = (loss_test, acc_test) 64 | 65 | # load params 66 | params = self.get_params() 67 | 68 | # create hallucinations 69 | sample = self.gen_samples(deterministic=False) 70 | self.dream = theano.function([S], sample, on_unused_input='warn') 71 | 72 | # create gradients 73 | grads = self.create_gradients(loss, deterministic=False) 74 | grads_test = self.create_gradients(loss_test, deterministic=True) 75 | 76 | # create updates 77 | self.alpha = T.scalar(dtype=theano.config.floatX) # learning rate 78 | self.updates = self.create_updates( 79 | grads, params, self.alpha, opt_alg, opt_params 80 | ) 81 | 82 | # create givens 83 | my_givens = self.create_givens() 84 | 85 | # create methods for training / prediction 86 | mode = None 87 | # mode = NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False) 88 | self.train = theano.function( 89 | [idx1, idx2, self.alpha], [loss, acc], updates=self.updates, 90 | givens=my_givens, on_unused_input='warn', mode=mode 91 | ) 92 | self.train_batch = theano.function( 93 | [X, Y, self.alpha], [loss, acc], updates=self.updates, 94 | on_unused_input='warn', mode=mode 95 | ) 96 | self.loss = theano.function( 97 | [X, Y], [loss, acc], 98 | on_unused_input='warn', 99 | ) 100 | 101 | # TODO: implement a create_predictions method 102 | # self.predict = theano.function([X], P) 103 | 104 | # save config 105 | self.n_class = 2 106 | self.n_dim = n_dim 107 | self.n_out = n_out 108 | self.n_superbatch = n_superbatch 109 | self.alg = opt_alg 110 | 111 | # save neural network 112 | self.params = self.get_params() 113 | self.grads = (grads, grads_test) 114 | self.metrics = (loss, acc) 115 | 116 | def create_inputs(self): 117 | X = T.tensor4(dtype=theano.config.floatX) 118 | S = T.matrix(dtype=theano.config.floatX) 119 | Y = T.ivector() 120 | idx1, idx2 = T.lscalar(), T.lscalar() 121 | return X, Y, idx1, idx2, S 122 | 123 | def create_objectives(self, deterministic=False): 124 | # load network 125 | l_out = self.network 126 | Y = self.inputs[1] 127 | 128 | # create predictions 129 | P = lasagne.layers.get_output(l_out) 130 | P_test = lasagne.layers.get_output(l_out, deterministic=True) 131 | 132 | # create loss 133 | loss = lasagne.objectives.categorical_crossentropy(P, Y).mean() 134 | loss_test = lasagne.objectives.categorical_crossentropy(P_test, Y).mean() 135 | 136 | # measure accuracy 137 | top = theano.tensor.argmax(P, axis=-1) 138 | top_test = theano.tensor.argmax(P_test, axis=-1) 139 | acc = theano.tensor.eq(top, Y).mean() 140 | acc_test = theano.tensor.eq(top_test, Y).mean() 141 | 142 | if deterministic: 143 | return loss_test, acc_test 144 | else: 145 | return loss, acc 146 | 147 | def create_gradients(self, loss, deterministic=False): 148 | params = self.get_params() 149 | return theano.grad(loss, params) 150 | 151 | def create_updates(self, grads, params, alpha, opt_alg, opt_params): 152 | scaled_grads = [grad * alpha for grad in grads] 153 | lr = opt_params.get('lr', 1e-3) 154 | if opt_alg == 'sgd': 155 | grad_updates = lasagne.updates.sgd( 156 | scaled_grads, params, learning_rate=lr) 157 | elif opt_alg == 'adam': 158 | b1, b2 = opt_params.get('b1', 0.9), opt_params.get('b2', 0.999) 159 | grad_updates = lasagne.updates.adam( 160 | scaled_grads, params, learning_rate=lr, beta1=b1, beta2=b2) 161 | else: 162 | grad_updates = 
OrderedDict() 163 | 164 | return grad_updates 165 | 166 | def create_givens(self): 167 | (X, Y, idx1, idx2, S) = self.inputs 168 | return {X: self.train_set_x[idx1:idx2], Y: self.train_set_y[idx1:idx2]} 169 | 170 | def get_params(self): 171 | l_out = self.network 172 | return lasagne.layers.get_all_params(l_out, trainable=True) 173 | 174 | def gen_samples(self, deterministic=False): 175 | pass # To be implemented by inherited models 176 | 177 | def gen_noise(self, size, n_lat): 178 | noise = np.zeros((size, n_lat)) 179 | noise[range(size), np.random.choice(n_lat, size)] = 1 180 | return noise 181 | 182 | def hallucinate(self): 183 | """Generate new samples by passing noise into the decoder""" 184 | # load network params 185 | size, n_lat, n_dim = 100, self.n_lat, self.n_dim 186 | img_size = int(np.sqrt(size)) 187 | 188 | # generate noisy inputs 189 | noise = self.gen_noise(size, n_lat).astype('float32') 190 | p_mu = self.dream(noise) 191 | if p_mu is None: 192 | return None 193 | p_mu = p_mu.reshape((img_size, img_size, n_dim, n_dim)) 194 | # split into img_size (1,img_size,n_dim,n_dim) images, 195 | # concat along columns -> 1,img_size,n_dim,n_dim*img_size 196 | p_mu = np.concatenate(np.split(p_mu, img_size, axis=0), axis=3) 197 | # split into img_size (1,1,n_dim,n_dim*img_size) images, 198 | # concat along rows -> 1,1,n_dim*img_size,n_dim*img_size 199 | p_mu = np.concatenate(np.split(p_mu, img_size, axis=1), axis=2) 200 | return np.squeeze(p_mu) 201 | 202 | def fit( 203 | self, X_train, Y_train, X_val, Y_val, 204 | n_epoch=10, n_batch=100, logname='run', 205 | ): 206 | """Train the model""" 207 | alpha = 1.0 # learning rate, which can be adjusted later 208 | n_data = len(X_train) 209 | n_superbatch = self.n_superbatch 210 | 211 | for epoch in range(n_epoch): 212 | # In each epoch, we do a full pass over the training data: 213 | train_batches, train_err, train_acc = 0, 0, 0 214 | start_time = time.time() 215 | 216 | # iterate over superbatches to save time on GPU memory transfer 217 | for X_sb, Y_sb in self.iterate_superbatches( 218 | X_train, Y_train, n_superbatch, 219 | datatype='train', shuffle=True, 220 | ): 221 | for idx1, idx2 in iterate_minibatch_idx(len(X_sb), n_batch): 222 | err, acc = self.train(idx1, idx2, alpha) 223 | # collect metrics 224 | train_batches += 1 225 | train_err += err 226 | train_acc += acc 227 | if train_batches % 100 == 0: 228 | n_total = epoch * n_data + n_batch * train_batches 229 | metrics = [ 230 | n_total, 231 | train_err / train_batches, 232 | train_acc / train_batches, 233 | ] 234 | log_metrics(logname, metrics) 235 | 236 | print "Epoch {} of {} took {:.3f}s ({} minibatches)".format( 237 | epoch + 1, n_epoch, time.time() - start_time, train_batches) 238 | 239 | # make a full pass over the training data and record metrics: 240 | train_err, train_acc = evaluate( 241 | self.loss, X_train, Y_train, batchsize=100, 242 | ) 243 | val_err, val_acc = evaluate( 244 | self.loss, X_val, Y_val, batchsize=100, 245 | ) 246 | 247 | print " training:\t\t{:.6f}\t{:.6f}".format( 248 | train_err, train_acc 249 | ) 250 | print " validation:\t\t{:.6f}\t{:.6f}".format( 251 | val_err, val_acc 252 | ) 253 | 254 | metrics = [epoch, train_err, train_acc, val_err, val_acc] 255 | log_metrics(logname + '.val', metrics) 256 | 257 | def dump(self, fname): 258 | """Pickle weights to a file""" 259 | params = lasagne.layers.get_all_param_values(self.network) 260 | with open(fname, 'w') as f: 261 | pickle.dump(params, f) 262 | 263 | def load(self, fname): 264 | """Load pickled network""" 
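        # note: dump() above writes with open(fname, 'w') and pickle's default
        # ASCII protocol, so reading back in text mode here works under Python 2;
        # a binary pickle protocol would require opening with 'wb'/'rb' instead.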
265 | with open(fname) as f: 266 | params = pickle.load(f) 267 | self.load_params(params) 268 | 269 | def load_params(self, params): 270 | """Load a given set of parameters""" 271 | lasagne.layers.set_all_param_values(self.network, params) 272 | 273 | def dump_params(self): 274 | """Dump a given set of parameters""" 275 | return lasagne.layers.get_all_param_values(self.network) 276 | 277 | def load_data(self, X, Y, dest='train'): 278 | assert dest in ('train', 'val') 279 | if dest == 'train': 280 | self.train_set_x.set_value(X, borrow=False) 281 | if Y is not None: 282 | self.train_set_y.set_value(Y, borrow=False) 283 | elif dest == 'val': 284 | self.val_set_x.set_value(X, borrow=False) 285 | if Y is not None: 286 | self.val_set_y.set_value(Y, borrow=False) 287 | 288 | def iterate_superbatches( 289 | self, X, Y, batchsize, 290 | datatype='train', shuffle=False, 291 | ): 292 | assert datatype in ('train', 'val') 293 | assert len(X) == len(Y) 294 | assert batchsize <= len(X) 295 | 296 | # if we are loading entire dataset, only load it once 297 | if batchsize == len(X): 298 | if not self.data_loaded: 299 | self.load_data(X, Y, dest=datatype) 300 | self.data_loaded = True 301 | yield X, Y 302 | else: 303 | # otherwise iterate over superbatches 304 | for superbatch in iterate_minibatches( 305 | X, Y, batchsize, shuffle=shuffle 306 | ): 307 | inputs, targets = superbatch 308 | self.load_data(inputs, targets, dest=datatype) 309 | yield inputs, targets 310 | -------------------------------------------------------------------------------- /models/usbn.py: -------------------------------------------------------------------------------- 1 | 2 | import time 3 | import pickle 4 | import numpy as np 5 | from collections import OrderedDict 6 | 7 | import theano 8 | import theano.tensor as T 9 | from theano.gradient import disconnected_grad as dg 10 | import lasagne 11 | 12 | from model import Model 13 | 14 | from helpers import ( 15 | iterate_minibatch_idx, 16 | iterate_minibatches, 17 | evaluate, 18 | log_metrics 19 | ) 20 | 21 | from layers import BernoulliSampleLayer 22 | from distributions import log_bernoulli 23 | from helpers import Tlogsumexp 24 | 25 | from theano.tensor.shared_randomstreams import RandomStreams 26 | 27 | from rbm import RBM 28 | from aux_variational_rbm import AuxiliaryVariationalRBM 29 | 30 | # ---------------------------------------------------------------------------- 31 | 32 | class USBN(Model): 33 | """Unidected Sigmoid Belief Network trained using Neural Variational Inference 34 | Epoch 200 of 200 took 26.052s (192 minibatches) 35 | training loss/acc: 125.989901 107.652437 36 | validation loss/acc: 126.220432 108.006230 37 | """ 38 | 39 | def __init__( 40 | self, n_dim, n_chan, n_out, n_superbatch, opt_alg, opt_params 41 | ): 42 | Model.__init__(self, n_dim, n_chan, n_out, n_superbatch, opt_alg, opt_params) 43 | 44 | # random number generators 45 | self.numpy_rng = np.random.RandomState(1234) 46 | self.theano_rng = RandomStreams(self.numpy_rng.randint(2 ** 30)) 47 | 48 | (X, Y, idx1, idx2, S) = self.inputs 49 | loss, acc = self.objectives 50 | self.train = theano.function( 51 | [idx1, idx2, self.alpha], [loss, self.log_e, self.z], updates=self.updates, 52 | givens={X: self.train_set_x[idx1:idx2], Y: self.train_set_y[idx1:idx2]}, 53 | on_unused_input='warn' 54 | ) 55 | llik = self.create_llik() 56 | self.loss = theano.function( 57 | [X, Y], [loss, llik], 58 | on_unused_input='warn', 59 | ) 60 | 61 | def create_model(self, X, Y, n_dim, n_out, n_chan=1): 62 | # params 63 | 
n_lat = 64 # latent stochastic variables 64 | n_hid = 500 # size of hidden layer in encoder/decoder 65 | n_hid_cv = 500 # size of hidden layer in control variate net 66 | n_out = n_dim * n_dim * n_chan # total dimensionality of ouput 67 | hid_nl = lasagne.nonlinearities.tanh 68 | 69 | # self.rbm = RBM(n_dim=int(np.sqrt(n_lat)), n_out=10, n_chan=1, opt_params={'nb':128}) 70 | self.rbm = AuxiliaryVariationalRBM(n_dim=int(np.sqrt(n_lat)), n_out=10, n_chan=1, opt_params={'nb':128}) 71 | 72 | # create the encoder network 73 | l_q_in = lasagne.layers.InputLayer( 74 | shape=(None, n_chan, n_dim, n_dim), 75 | input_var=X, 76 | ) 77 | l_q_hid = lasagne.layers.DenseLayer( 78 | l_q_in, num_units=n_hid, 79 | nonlinearity=hid_nl, 80 | ) 81 | l_q_out = lasagne.layers.DenseLayer( 82 | l_q_hid, num_units=n_lat, 83 | nonlinearity=None, 84 | ) 85 | l_q_mu = lasagne.layers.DenseLayer( 86 | l_q_hid, num_units=n_lat, 87 | nonlinearity=T.nnet.sigmoid, 88 | ) 89 | l_q_sample = BernoulliSampleLayer(l_q_mu) 90 | 91 | # create the decoder network 92 | # note that we currently only handle Bernoulli x variables 93 | l_p_in = lasagne.layers.InputLayer((None, n_lat)) 94 | l_p_hid = lasagne.layers.DenseLayer( 95 | l_p_in, num_units=n_hid, 96 | nonlinearity=hid_nl, 97 | W=lasagne.init.GlorotUniform(), 98 | ) 99 | l_p_mu = lasagne.layers.DenseLayer(l_p_hid, num_units=n_out, 100 | nonlinearity = lasagne.nonlinearities.sigmoid, 101 | W=lasagne.init.GlorotUniform(), 102 | b=lasagne.init.Constant(0.), 103 | ) 104 | 105 | # create control variate (baseline) network 106 | l_cv_in = lasagne.layers.InputLayer( 107 | shape=(None, n_chan, n_dim, n_dim), 108 | input_var=X, 109 | ) 110 | l_cv_hid = lasagne.layers.DenseLayer( 111 | l_cv_in, num_units=n_hid_cv, 112 | nonlinearity=hid_nl, 113 | ) 114 | l_cv = lasagne.layers.DenseLayer( 115 | l_cv_hid, num_units=1, 116 | nonlinearity=None, 117 | ) 118 | 119 | # create variables for centering signal 120 | c = theano.shared(np.zeros((1,1), dtype=np.float64), broadcastable=(True,True)) 121 | v = theano.shared(np.zeros((1,1), dtype=np.float64), broadcastable=(True,True)) 122 | 123 | self.input_layers = (l_q_in, l_p_in, l_cv_in) 124 | self.n_lat = n_lat 125 | self.n_hid = n_hid 126 | 127 | return l_p_mu, l_q_mu, l_q_sample, l_cv, c, v 128 | 129 | def _create_components(self, deterministic=False): 130 | # load network input 131 | X = self.inputs[0] 132 | x = X.flatten(2) 133 | 134 | # load networks 135 | l_p_mu, l_q_mu, l_q_sample, _, _, _ = self.network 136 | l_q_in, l_p_in, l_cv_in = self.input_layers 137 | 138 | # load network output 139 | z, q_mu = lasagne.layers.get_output( 140 | [l_q_sample, l_q_mu], deterministic=deterministic) 141 | p_mu = lasagne.layers.get_output( 142 | l_p_mu, {l_p_in: z}, 143 | deterministic=deterministic, 144 | ) 145 | 146 | # entropy term 147 | log_qz_given_x = log_bernoulli(dg(z), q_mu).sum(axis=1) 148 | 149 | # expected p(x,z) term 150 | z_prior = T.ones_like(z)*np.float32(0.5) 151 | log_pz = log_bernoulli(z, z_prior).sum(axis=1) 152 | log_e = -self.rbm.free_energy(z.reshape((128,self.n_lat))) 153 | # log_e = log_pz 154 | log_px_given_z = log_bernoulli(x, p_mu).sum(axis=1) 155 | log_pxz = log_e + log_px_given_z 156 | 157 | # save them for later 158 | self.z = z 159 | self.log_e = log_e.mean() 160 | self.log_pxz = log_pxz 161 | self.log_qz_given_x = log_qz_given_x 162 | 163 | return log_pxz.flatten(), log_qz_given_x.flatten() 164 | 165 | def create_objectives(self, deterministic=False): 166 | # load probabilities 167 | log_pxz, log_qz_given_x = 
self._create_components(deterministic=deterministic) 168 | 169 | # compute the lower bound 170 | elbo = T.mean(log_pxz - log_qz_given_x) 171 | 172 | # we don't use the second accuracy metric right now 173 | return -elbo, -T.mean(log_qz_given_x) 174 | 175 | def create_gradients(self, loss, deterministic=False): 176 | from theano.gradient import disconnected_grad as dg 177 | 178 | # load networks 179 | l_p_mu, l_q_mu, _, l_cv, c, v = self.network 180 | 181 | # load params 182 | p_params = lasagne.layers.get_all_params(l_p_mu, trainable=True) 183 | q_params = lasagne.layers.get_all_params(l_q_mu, trainable=True) 184 | cv_params = lasagne.layers.get_all_params(l_cv, trainable=True) 185 | 186 | # load neural net outputs (probabilities have been precomputed) 187 | log_pxz, log_qz_given_x = self.log_pxz, self.log_qz_given_x 188 | cv = T.addbroadcast(lasagne.layers.get_output(l_cv),1) 189 | 190 | # compute learning signals 191 | l = log_pxz - log_qz_given_x - cv 192 | l_avg, l_var = l.mean(), l.var() 193 | c_new = 0.8*c + 0.2*l_avg 194 | v_new = 0.8*v + 0.2*l_var 195 | l = (l - c_new) / T.maximum(1, T.sqrt(v_new)) 196 | 197 | # compute grad wrt p 198 | p_grads = T.grad(-log_pxz.mean(), p_params) 199 | 200 | # compute grad wrt q 201 | q_target = T.mean(dg(l) * log_qz_given_x) 202 | q_grads = T.grad(-0.2*q_target, q_params) # 5x slower rate for q 203 | 204 | # compute grad of cv net 205 | cv_target = T.mean(l**2) 206 | cv_grads = T.grad(cv_target, cv_params) 207 | 208 | # combine and clip gradients 209 | clip_grad = 1 210 | max_norm = 5 211 | grads = p_grads + q_grads + cv_grads 212 | mgrads = lasagne.updates.total_norm_constraint(grads, max_norm=max_norm) 213 | cgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads] 214 | 215 | return cgrads 216 | 217 | def gen_samples(self, deterministic=False): 218 | s = self.inputs[-1] 219 | # put it through the decoder 220 | _, l_p_in, _ = self.input_layers 221 | l_p_mu = self.network[0] 222 | p_mu = lasagne.layers.get_output(l_p_mu, {l_p_in : s}) 223 | 224 | return p_mu 225 | 226 | def gen_noise(self, size, n_lat): 227 | return self.rbm.sample() 228 | noise = np.zeros((size, n_lat)) 229 | noise[range(size), np.random.choice(n_lat, size)] = 1 230 | return noise 231 | 232 | def get_params(self): 233 | l_p_mu, l_q_mu, _, l_cv, _, _ = self.network 234 | p_params = lasagne.layers.get_all_params(l_p_mu, trainable=True) 235 | q_params = lasagne.layers.get_all_params(l_q_mu, trainable=True) 236 | cv_params = lasagne.layers.get_all_params(l_cv, trainable=True) 237 | 238 | return p_params + q_params + cv_params #+ [c] 239 | 240 | def create_updates(self, grads, params, alpha, opt_alg, opt_params): 241 | # call super-class to generate SGD/ADAM updates 242 | grad_updates = Model.create_updates( 243 | self, grads, params, alpha, opt_alg, opt_params, 244 | ) 245 | 246 | # create updates for centering signal 247 | 248 | # load neural net outputs (probabilities have been precomputed) 249 | _, _, _, l_cv, c, v = self.network 250 | log_pxz, log_qz_given_x = self.log_pxz, self.log_qz_given_x 251 | cv = T.addbroadcast(lasagne.layers.get_output(l_cv),1) 252 | 253 | # compute learning signals 254 | l = log_pxz - log_qz_given_x - cv 255 | l_avg, l_var = l.mean(), l.var() 256 | c_new = 0.8*c + 0.2*l_avg 257 | v_new = 0.8*v + 0.2*l_var 258 | 259 | # compute update for centering signal 260 | cv_updates = {c: c_new, v: v_new} 261 | 262 | return OrderedDict(grad_updates.items() + cv_updates.items()) 263 | 264 | def hallucinate(self): 265 | """Generate new samples by passing noise 
into the decoder""" 266 | # load network params 267 | size, n_lat, n_dim = 100, self.n_lat, self.n_dim 268 | img_size = int(np.sqrt(size)) 269 | 270 | # # generate noisy inputs 271 | # noise = self.gen_noise(size, n_lat).astype('float32') 272 | 273 | # sample noise from RBM 274 | noise = self.rbm.sample() 275 | 276 | p_mu = self.dream(noise) 277 | if p_mu is None: 278 | return None 279 | p_mu = p_mu.reshape((img_size, img_size, n_dim, n_dim)) 280 | # split into img_size (1,img_size,n_dim,n_dim) images, 281 | # concat along columns -> 1,img_size,n_dim,n_dim*img_size 282 | p_mu = np.concatenate(np.split(p_mu, img_size, axis=0), axis=3) 283 | # split into img_size (1,1,n_dim,n_dim*img_size) images, 284 | # concat along rows -> 1,1,n_dim*img_size,n_dim*img_size 285 | p_mu = np.concatenate(np.split(p_mu, img_size, axis=1), axis=2) 286 | return np.squeeze(p_mu) 287 | 288 | def create_llik(self): 289 | # load inputs 290 | X = self.inputs[0] 291 | x = X.flatten(2) 292 | 293 | # load network params 294 | n_cat = self.n_lat 295 | n_rep = 10 296 | 297 | # load networks 298 | l_p_mu, l_q_mu, l_q_sample, _, _, _ = self.network 299 | l_q_in, l_p_in, l_cv_in = self.input_layers 300 | 301 | # load network output 302 | q_mu = lasagne.layers.get_output(l_q_mu) 303 | 304 | q_mu_rep = T.tile( q_mu.dimshuffle((0,'x',1)), reps=(1,n_rep,1) ) # (n_bat, n_rep, n_cat) 305 | q_sample_hard = self.theano_rng.binomial(size=q_mu_rep.shape, p=q_mu_rep, dtype=q_mu_rep.dtype) # (n_bat, n_rep, n_cat) 306 | q_sample_hard2 = q_sample_hard.reshape([128*n_rep, n_cat]) # (n_bat*n_rep, n_cat) 307 | 308 | p_mu = lasagne.layers.get_output(l_p_mu, {l_p_in: q_sample_hard2}) # (n_bat*n_rep, 784) 309 | x_rep = T.tile( x.dimshuffle((0,'x',1)), reps=(1,n_rep,1) ) # (n_bat, n_rep, 784) 310 | p_mu = T.reshape(p_mu, (128, n_rep, 784)) # (n_bat, n_rep, 784) 311 | 312 | # define the loss components 313 | log_p_x = log_bernoulli(x_rep, p_mu).sum(axis=2) # (n_bat, n_rep) 314 | # z_prior = T.ones_like(q_sample_hard)*np.float32(0.5) # (n_bat, n_rep, n_cat) 315 | # log_p_z = log_bernoulli(q_sample_hard, z_prior).sum(axis=2) # (n_bat, n_rep) 316 | log_e = -self.rbm.free_energy(q_sample_hard.reshape((128*n_rep,self.n_lat))) 317 | log_e = T.reshape(log_e, [128, n_rep]) 318 | log_q_z = log_bernoulli(q_sample_hard, q_mu_rep).sum(axis=2) # (n_bat, n_rep) 319 | 320 | # compute loss 321 | llik = Tlogsumexp( log_p_x + log_e - log_q_z, axis=1) # (n_bat,) 322 | 323 | return T.mean(llik) 324 | 325 | def fit( 326 | self, X_train, Y_train, X_val, Y_val, 327 | n_epoch=10, n_batch=100, logname='run', 328 | ): 329 | """Train the model""" 330 | alpha = 1.0 # learning rate, which can be adjusted later 331 | n_data = len(X_train) 332 | n_superbatch = self.n_superbatch 333 | 334 | # print '...', self.rbm.logZ_exact() 335 | # print '...', self.rbm.logZ_exact(marg='h') 336 | 337 | for epoch in range(n_epoch): 338 | # In each epoch, we do a full pass over the training data: 339 | train_batches, train_err, train_acc = 0, 0, 0 340 | start_time = time.time() 341 | 342 | # iterate over superbatches to save time on GPU memory transfer 343 | for X_sb, Y_sb in self.iterate_superbatches( 344 | X_train, Y_train, n_superbatch, 345 | datatype='train', shuffle=True, 346 | ): 347 | for idx1, idx2 in iterate_minibatch_idx(len(X_sb), n_batch): 348 | # train model 349 | self.rbm.reset_averages() 350 | err, acc, z = self.train(idx1, idx2, alpha) 351 | 352 | # train rbm 353 | z_rbm = z.reshape((128,1,8,8)) 354 | # self.rbm.train_batch(z.reshape((128,1,8,8)), np.zeros((n_batch,), 
dtype='int32'), alpha) 355 | 356 | # q steps 357 | for i in range(2): self.rbm.train_q0(z_rbm) 358 | 359 | # p steps 360 | self.rbm.train_p_batch(z_rbm, alpha) 361 | 362 | # collect metrics 363 | # print err, acc 364 | train_batches += 1 365 | train_err += err 366 | train_acc += acc 367 | if train_batches % 100 == 0: 368 | n_total = epoch * n_data + n_batch * train_batches 369 | metrics = [ 370 | n_total, 371 | train_err / train_batches, 372 | train_acc / train_batches, 373 | ] 374 | log_metrics(logname, metrics) 375 | 376 | print "Epoch {} of {} took {:.3f}s ({} minibatches)".format( 377 | epoch + 1, n_epoch, time.time() - start_time, train_batches) 378 | 379 | # make a full pass over the training data and record metrics: 380 | train_err, train_acc = evaluate( 381 | self.loss, X_train, Y_train, batchsize=128, 382 | ) 383 | val_err, val_acc = evaluate( 384 | self.loss, X_val, Y_val, batchsize=128, 385 | ) 386 | 387 | print " training:\t\t{:.6f}\t{:.6f}".format( 388 | train_err, train_acc 389 | ) 390 | print " validation:\t\t{:.6f}\t{:.6f}".format( 391 | val_err, val_acc 392 | ) 393 | 394 | metrics = [epoch, train_err, train_acc, val_err, val_acc] 395 | log_metrics(logname + '.val', metrics) 396 | 397 | # reserve N of training data points to kick start hallucinations 398 | self.rbm.hallu_set = np.asarray( 399 | z[:100,:].reshape((100,1,8,8)), 400 | dtype=theano.config.floatX 401 | ) 402 | -------------------------------------------------------------------------------- /models/abstractrbm.py: -------------------------------------------------------------------------------- 1 | import pdb 2 | import time 3 | import numpy as np 4 | import theano 5 | import lasagne 6 | import theano.tensor as T 7 | from theano.tensor.shared_randomstreams import RandomStreams 8 | 9 | from model import Model 10 | from helpers import iterate_minibatch_idx, evaluate, log_metrics 11 | 12 | from sklearn.neural_network import BernoulliRBM 13 | 14 | 15 | class AbstractRBM(Model): 16 | """Restricted Boltzmann Machine 17 | RBM code adapted from http://deeplearning.net/tutorial/rbm.html 18 | 19 | Epoch 15 of 15 took 635.792s (2448 minibatches) 20 | training loss/acc: -62.311016 -62.311016 21 | 22 | Training Params 23 | --------------- 24 | batch_size: 20 25 | learning_rate: 0.1 26 | """ 27 | def __init__( 28 | self, n_dim, n_out, n_chan=1, n_superbatch=12800, opt_alg='adam', 29 | opt_params={'lr': 1e-3, 'b1': 0.9, 'b2': 0.99} 30 | ): 31 | """RBM constructor. 32 | Defines the parameters of the model along with 33 | basic operations for inferring hidden from visible (and vice-versa), 34 | as well as for performing CD updates. 
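        Note that this abstract class expects the concrete subclass to define
        self.n_hidden, self.n_batch, self.n_visible, self.n_dim, self.W,
        self.vbias, self.hbias, self.theano_rng and self.hallu_set; both this
        constructor and the sampling helpers below refer to them directly.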
35 | """ 36 | 37 | # store sklearn RBM instance 38 | self.rbm = BernoulliRBM(random_state=0, n_components=self.n_hidden) 39 | 40 | self.n_chain = 100 41 | 42 | # initialize storage for the persistent chain (state = hidden 43 | # layer of chain) 44 | self.persistent_chain = theano.shared( 45 | np.zeros( 46 | (self.n_batch, self.n_hidden), 47 | dtype=theano.config.floatX 48 | ), borrow=True, 49 | ) 50 | 51 | self.bit_i_idx = theano.shared(value=0, name='bit_i_idx') 52 | 53 | # create shared data variables 54 | self.train_set_x = theano.shared(np.empty( 55 | (n_superbatch, n_chan, n_dim, n_dim), 56 | dtype=theano.config.floatX), 57 | borrow=False, 58 | ) 59 | self.val_set_x = theano.shared(np.empty( 60 | (n_superbatch, n_chan, n_dim, n_dim), 61 | dtype=theano.config.floatX), 62 | borrow=False, 63 | ) 64 | 65 | # create y-variables 66 | self.train_set_y = theano.shared(np.empty( 67 | (n_superbatch,), dtype='int32'), borrow=False) 68 | self.val_set_y = theano.shared(np.empty( 69 | (n_superbatch,), dtype='int32'), borrow=False) 70 | # train_set_y_int = T.cast(train_set_y, 'int32') 71 | # val_set_y_int = T.cast(val_set_y, 'int32') 72 | 73 | def free_energy(self, v_sample): 74 | """Function to compute the free energy""" 75 | wx_b = T.dot(v_sample, self.W) + self.hbias 76 | vbias_term = T.dot(v_sample, self.vbias) 77 | hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=1) 78 | return -hidden_term - vbias_term 79 | 80 | def propup(self, vis): 81 | """This function propagates the visible units activation upwards to 82 | the hidden units 83 | 84 | Note that we return also the pre-sigmoid activation of the 85 | layer. As it will turn out later, due to how Theano deals with 86 | optimizations, this symbolic variable will be needed to write 87 | down a more stable computational graph (see details in the 88 | reconstruction cost function) 89 | """ 90 | pre_sigmoid_activation = T.dot(vis, self.W) + self.hbias 91 | return [pre_sigmoid_activation, T.nnet.sigmoid(pre_sigmoid_activation)] 92 | 93 | def sample_h_given_v(self, v0_sample): 94 | """This function infers state of hidden units given visible units""" 95 | # compute the activation of the hidden units given a sample of 96 | # the visibles 97 | pre_sigmoid_h1, h1_mean = self.propup(v0_sample) 98 | # get a sample of the hiddens given their activation 99 | # Note that theano_rng.binomial returns a symbolic sample of dtype 100 | # int64 by default. If we want to keep our computations in floatX 101 | # for the GPU we need to specify to return the dtype floatX 102 | h1_sample = self.theano_rng.binomial( 103 | size=h1_mean.shape, 104 | n=1, p=h1_mean, 105 | dtype=theano.config.floatX, 106 | ) 107 | return [pre_sigmoid_h1, h1_mean, h1_sample] 108 | 109 | def propdown(self, hid): 110 | """This function propagates the hidden units activation downwards to 111 | the visible units 112 | 113 | Note that we return also the pre_sigmoid_activation of the 114 | layer. 
As it will turn out later, due to how Theano deals with 115 | optimizations, this symbolic variable will be needed to write 116 | down a more stable computational graph (see details in the 117 | reconstruction cost function) 118 | """ 119 | pre_sigmoid_activation = T.dot(hid, self.W.T) + self.vbias 120 | return [pre_sigmoid_activation, T.nnet.sigmoid(pre_sigmoid_activation)] 121 | 122 | def sample_v_given_h(self, h0_sample): 123 | """This function infers state of visible units given hidden units""" 124 | # compute the activation of the visible given the hidden sample 125 | pre_sigmoid_v1, v1_mean = self.propdown(h0_sample) 126 | # get a sample of the visible given their activation 127 | # Note that theano_rng.binomial returns a symbolic sample of dtype 128 | # int64 by default. If we want to keep our computations in floatX 129 | # for the GPU we need to specify to return the dtype floatX 130 | v1_sample = self.theano_rng.binomial( 131 | size=v1_mean.shape, 132 | n=1, p=v1_mean, 133 | dtype=theano.config.floatX, 134 | ) 135 | return [pre_sigmoid_v1, v1_mean, v1_sample] 136 | 137 | def gibbs_hvh(self, h0_sample): 138 | """This function implements one step of Gibbs sampling, 139 | starting from the hidden state 140 | """ 141 | pre_sigmoid_v1, v1_mean, v1_sample = self.sample_v_given_h(h0_sample) 142 | pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v1_sample) 143 | return [pre_sigmoid_v1, v1_mean, v1_sample, 144 | pre_sigmoid_h1, h1_mean, h1_sample] 145 | 146 | def gibbs_vhv(self, v0_sample): 147 | """This function implements one step of Gibbs sampling, 148 | starting from the visible state 149 | """ 150 | pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v0_sample) 151 | pre_sigmoid_v1, v1_mean, v1_sample = self.sample_v_given_h(h1_sample) 152 | return [pre_sigmoid_h1, h1_mean, h1_sample, 153 | pre_sigmoid_v1, v1_mean, v1_sample] 154 | 155 | def pseudolikelihood(self, data): 156 | self.rbm.components_ = self.W.get_value().T 157 | self.rbm.intercept_visible_ = self.vbias.get_value() 158 | self.rbm.intercept_hidden_ = self.hbias.get_value() 159 | return self.rbm.score_samples(data).mean() 160 | 161 | def get_pseudo_likelihood_cost(self, X, updates): 162 | """Stochastic approximation to the pseudo-likelihood""" 163 | # index of bit i in expression p(x_i | x_{\i}) 164 | bit_i_idx = self.bit_i_idx 165 | # bit_i_idx = theano.shared(value=0, name='bit_i_idx') 166 | 167 | # binarize the input image by rounding to nearest integer 168 | xi = T.round(X) 169 | 170 | # calculate free energy for the given bit configuration 171 | fe_xi = self.free_energy(xi) 172 | 173 | # flip bit x_i of matrix xi and preserve all other bits x_{\i} 174 | # Equivalent to xi[:,bit_i_idx] = 1-xi[:, bit_i_idx], but assigns 175 | # the result to xi_flip, instead of working in place on xi. 176 | xi_flip = T.set_subtensor(xi[:, bit_i_idx], 1 - xi[:, bit_i_idx]) 177 | 178 | # calculate free energy with bit flipped 179 | fe_xi_flip = self.free_energy(xi_flip) 180 | 181 | # equivalent to e^(-FE(x_i)) / (e^(-FE(x_i)) + e^(-FE(x_{\i}))) 182 | cost = T.mean(self.n_visible * T.log(T.nnet.sigmoid(fe_xi_flip-fe_xi))) 183 | 184 | return cost 185 | 186 | def hallucinate(self): 187 | """Once the RBM is trained, we can then use the gibbs_vhv function to 188 | implement the Gibbs chain required for sampling. This overwrites the 189 | hallucinate function in Model completely. 
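        Starting from self.hallu_set, it advances a persistent Gibbs chain
        (10 rounds of 1000 gibbs_vhv steps) and returns the final mean-field
        visible activations tiled into a single square image of shape
        (n_dim * sqrt(n_chain), n_dim * sqrt(n_chain)).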
190 | """ 191 | n_samples = 10 192 | hallu_set = self.hallu_set.reshape((-1, self.n_visible)) 193 | persistent_vis_chain = theano.shared(hallu_set) 194 | # define one step of Gibbs sampling (mf = mean-field) define a 195 | # function that does `1000` steps before returning the 196 | # sample for plotting 197 | ( 198 | [ 199 | presig_hids, 200 | hid_mfs, 201 | hid_samples, 202 | presig_vis, 203 | vis_mfs, 204 | vis_samples 205 | ], 206 | updates 207 | ) = theano.scan( 208 | self.gibbs_vhv, 209 | outputs_info=[None, None, None, None, None, persistent_vis_chain], 210 | n_steps=1000, 211 | name="gibbs_vhv", 212 | ) 213 | 214 | # add to updates that takes care of our persistent chain : 215 | updates.update({persistent_vis_chain: vis_samples[-1]}) 216 | # construct the function that implements our persistent chain. 217 | # we generate the "mean field" activations for plotting and the actual 218 | # samples for reinitializing the state of our persistent chain 219 | sample_fn = theano.function( 220 | [], 221 | [ 222 | vis_mfs[-1], 223 | vis_samples[-1] 224 | ], 225 | updates=updates, 226 | name='sample_fn', 227 | ) 228 | 229 | for idx in range(n_samples): 230 | # generate `plot_every` intermediate samples that we discard, 231 | # because successive samples in the chain are too correlated 232 | vis_mf, vis_sample = sample_fn() 233 | 234 | img_size = int(np.sqrt(self.n_chain)) 235 | vis_mf = vis_mf.reshape((img_size, img_size, self.n_dim, self.n_dim)) 236 | vis_mf = np.concatenate(np.split(vis_mf, img_size, axis=0), axis=3) 237 | # split into img_size (1,1,n_dim,n_dim*img_size) images, 238 | # concat along rows -> 1,1,n_dim*img_size,n_dim*img_size 239 | vis_mf = np.concatenate(np.split(vis_mf, img_size, axis=1), axis=2) 240 | return np.squeeze(vis_mf) 241 | 242 | def hallucinate_chain(self): 243 | """Once the RBM is trained, we can then use the gibbs_vhv function to 244 | implement the Gibbs chain required for sampling. This overwrites the 245 | hallucinate function in Model completely. 246 | """ 247 | n_samples = 10 248 | # hallu_set = self.hallu_set.reshape((-1, self.n_visible)) 249 | hallu_set = np.random.rand(1, 64).astype('float32') 250 | # hallu_set = self.Phi.get_value()[:,0] 251 | persistent_vis_chain = theano.shared(hallu_set) 252 | # define one step of Gibbs sampling (mf = mean-field) define a 253 | # function that does `1000` steps before returning the 254 | # sample for plotting 255 | ( 256 | [ 257 | presig_hids, 258 | hid_mfs, 259 | hid_samples, 260 | presig_vis, 261 | vis_mfs, 262 | vis_samples 263 | ], 264 | updates 265 | ) = theano.scan( 266 | self.gibbs_vhv, 267 | outputs_info=[None, None, None, None, None, persistent_vis_chain], 268 | n_steps=100, 269 | name="gibbs_vhv", 270 | ) 271 | 272 | # add to updates that takes care of our persistent chain : 273 | updates.update({persistent_vis_chain: vis_samples[-1]}) 274 | # construct the function that implements our persistent chain. 
275 | # we generate the "mean field" activations for plotting and the actual 276 | # samples for reinitializing the state of our persistent chain 277 | sample_fn = theano.function([],vis_mfs,updates=updates,name='sample_fn') 278 | 279 | vis_mf = sample_fn() 280 | print vis_mf.shape 281 | # vis_mf = vis_mf[::10] 282 | print vis_mf.shape 283 | 284 | img_size = 10 285 | vis_mf = vis_mf.reshape((img_size, img_size, self.n_dim, self.n_dim)) 286 | vis_mf = np.transpose(vis_mf, [1,0,2,3]) 287 | vis_mf = np.concatenate(np.split(vis_mf, img_size, axis=0), axis=3) 288 | # split into img_size (1,1,n_dim,n_dim*img_size) images, 289 | # concat along rows -> 1,1,n_dim*img_size,n_dim*img_size 290 | vis_mf = np.concatenate(np.split(vis_mf, img_size, axis=1), axis=2) 291 | return np.squeeze(vis_mf) 292 | 293 | def E_np_h(self,h): 294 | bv_vec = self.vbias.get_value().reshape((self.n_visible,1)) 295 | bh_vec = self.hbias.get_value().reshape((self.n_hidden,1)) 296 | W = self.W.get_value() 297 | return (np.dot(bh_vec.T, h) + np.sum(np.log(1. + np.exp(bv_vec + np.dot(W, h))), axis=0)).flatten() 298 | 299 | def E_np_v(self,v): 300 | bv_vec = self.vbias.get_value().reshape((self.n_visible,1)) 301 | bh_vec = self.hbias.get_value().reshape((self.n_hidden,1)) 302 | W = self.W.get_value() 303 | return (np.dot(bv_vec.T, v)+ np.sum(np.log(1. + np.exp(bh_vec + np.dot(W.T, v))), axis=0)).flatten() 304 | 305 | def logZ_exact(self, marg='v'): 306 | # get the next binary vector 307 | t = self.n_hidden if marg == 'v' else self.n_visible 308 | def inc(x): 309 | for i in xrange(t): 310 | x[i,0]+=1 311 | if x[i,0]<=1: return True 312 | x[i,0]=0 313 | 314 | return False 315 | 316 | #compute the normalizing constant 317 | if marg == 'v': x=np.zeros((self.n_hidden,1)) 318 | elif marg == 'h': x=np.zeros((self.n_visible,1)) 319 | logZ=-np.inf 320 | while True: 321 | if marg == 'v': logF = self.E_np_h(x) 322 | elif marg == 'h': logF = self.E_np_v(x) 323 | # print ''.join([str(xi) for xi in x]), logF, logZ 324 | logZ=np.logaddexp(logZ,logF) 325 | if not inc(x): break 326 | 327 | # print 328 | return logZ 329 | 330 | def sample(self): 331 | """Once the RBM is trained, we can then use the gibbs_vhv function to 332 | implement the Gibbs chain required for sampling. This overwrites the 333 | hallucinate function in Model completely. 334 | """ 335 | n_samples = 10 336 | hallu_set = self.hallu_set.reshape((-1, self.n_visible)) 337 | persistent_vis_chain = theano.shared(hallu_set) 338 | # define one step of Gibbs sampling (mf = mean-field) define a 339 | # function that does `1000` steps before returning the 340 | # sample for plotting 341 | ( 342 | [ 343 | presig_hids, 344 | hid_mfs, 345 | hid_samples, 346 | presig_vis, 347 | vis_mfs, 348 | vis_samples 349 | ], 350 | updates 351 | ) = theano.scan( 352 | self.gibbs_vhv, 353 | outputs_info=[None, None, None, None, None, persistent_vis_chain], 354 | n_steps=1000, 355 | name="gibbs_vhv", 356 | ) 357 | 358 | # add to updates that takes care of our persistent chain : 359 | updates.update({persistent_vis_chain: vis_samples[-1]}) 360 | # construct the function that implements our persistent chain. 
361 | # we generate the "mean field" activations for plotting and the actual 362 | # samples for reinitializing the state of our persistent chain 363 | sample_fn = theano.function( 364 | [], 365 | [ 366 | vis_mfs[-1], 367 | vis_samples[-1] 368 | ], 369 | updates=updates, 370 | name='sample_fn', 371 | ) 372 | 373 | for idx in range(n_samples): 374 | # generate `plot_every` intermediate samples that we discard, 375 | # because successive samples in the chain are too correlated 376 | vis_mf, vis_sample = sample_fn() 377 | 378 | img_size = int(np.sqrt(self.n_chain)) 379 | vis_mf = vis_mf.reshape((self.n_chain, self.n_dim*self.n_dim)) 380 | 381 | return vis_mf -------------------------------------------------------------------------------- /models/dadgm.py: -------------------------------------------------------------------------------- 1 | import time 2 | import pickle 3 | import numpy as np 4 | from collections import OrderedDict 5 | 6 | import theano 7 | import theano.tensor as T 8 | import lasagne 9 | from theano.gradient import disconnected_grad as dg 10 | 11 | from model import Model 12 | from layers.shape import RepeatLayer 13 | from layers import ( 14 | GaussianSampleLayer, 15 | BernoulliSampleLayer, 16 | GaussianMultiSampleLayer, 17 | ) 18 | from distributions import log_bernoulli, log_normal2 19 | from helpers import Tlogsumexp 20 | 21 | from theano.tensor.shared_randomstreams import RandomStreams 22 | 23 | # ---------------------------------------------------------------------------- 24 | 25 | class DADGM(Model): 26 | """Auxiliary Deep Generative Model (unsupervised version) with discrete z""" 27 | def __init__( 28 | self, n_dim, n_out, n_chan=1, n_superbatch=12800, model='bernoulli', 29 | opt_alg='adam', opt_params={'lr' : 1e-3, 'b1': 0.9, 'b2': 0.99} 30 | ): 31 | # save model that wil be created 32 | self.model = model 33 | self.n_sample = 5 # adjustable parameter, though 1 works best in practice 34 | Model.__init__(self, n_dim, n_chan, n_out, n_superbatch, opt_alg, opt_params) 35 | 36 | # random number generators 37 | self.numpy_rng = np.random.RandomState(1234) 38 | self.theano_rng = RandomStreams(self.numpy_rng.randint(2 ** 30)) 39 | 40 | (X, Y, idx1, idx2, S) = self.inputs 41 | loss, acc = self.objectives 42 | llik = self.create_llik() 43 | self.loss = theano.function( 44 | [X, Y], [loss, llik], 45 | on_unused_input='warn', 46 | ) 47 | 48 | def create_model(self, X, Y, n_dim, n_out, n_chan=1): 49 | # params 50 | n_lat = 200 # latent stochastic variables 51 | n_aux = 10 # auxiliary variables 52 | n_hid = 500 # size of hidden layer in encoder/decoder 53 | n_sam = self.n_sample # number of monte-carlo samples 54 | n_hid_cv = 500 # size of hidden layer in control variate net 55 | n_out = n_dim * n_dim * n_chan # total dimensionality of ouput 56 | hid_nl = lasagne.nonlinearities.tanh 57 | relu_shift = lambda av: T.nnet.relu(av+10)-10 # for numerical stability 58 | 59 | # create the encoder network 60 | # create q(a|x) 61 | l_qa_in = lasagne.layers.InputLayer( 62 | shape=(None, n_chan, n_dim, n_dim), 63 | input_var=X, 64 | ) 65 | l_qa_hid = lasagne.layers.DenseLayer( 66 | l_qa_in, num_units=n_hid, 67 | nonlinearity=hid_nl, 68 | ) 69 | l_qa_mu = lasagne.layers.DenseLayer( 70 | l_qa_in, num_units=n_aux, 71 | nonlinearity=None, 72 | ) 73 | l_qa_logsigma = lasagne.layers.DenseLayer( 74 | l_qa_in, num_units=n_aux, 75 | nonlinearity=relu_shift, 76 | ) 77 | # repeatedly sample 78 | l_qa_mu = lasagne.layers.ReshapeLayer( 79 | RepeatLayer(l_qa_mu, n_ax=1, n_rep=n_sam), 80 | shape=(-1, n_aux), 
81 | ) 82 | l_qa_logsigma = lasagne.layers.ReshapeLayer( 83 | RepeatLayer(l_qa_logsigma, n_ax=1, n_rep=n_sam), 84 | shape=(-1, n_aux), 85 | ) 86 | l_qa = GaussianSampleLayer(l_qa_mu, l_qa_logsigma) 87 | 88 | # create q(z|a,x) 89 | l_qz_in = lasagne.layers.InputLayer((None, n_aux)) 90 | l_qz_hid1a = lasagne.layers.DenseLayer( 91 | l_qz_in, num_units=n_hid, 92 | nonlinearity=hid_nl, 93 | ) 94 | l_qz_hid1b = lasagne.layers.DenseLayer( 95 | l_qa_in, num_units=n_hid, 96 | nonlinearity=hid_nl, 97 | ) 98 | l_qz_hid1b = lasagne.layers.ReshapeLayer( 99 | RepeatLayer(l_qz_hid1b, n_ax=1, n_rep=n_sam), 100 | shape=(-1, n_hid), 101 | ) 102 | l_qz_hid2 = lasagne.layers.ElemwiseSumLayer([l_qz_hid1a, l_qz_hid1b]) 103 | l_qz_hid3 = lasagne.layers.DenseLayer( 104 | l_qz_hid2, num_units=n_hid, 105 | nonlinearity=hid_nl, 106 | ) 107 | l_qz_mu = lasagne.layers.DenseLayer( 108 | l_qz_hid3, num_units=n_lat, 109 | nonlinearity=T.nnet.sigmoid, 110 | ) 111 | l_qz = BernoulliSampleLayer(l_qz_mu) 112 | l_qz_logsigma = None 113 | 114 | # create the decoder network 115 | # create p(x|z) 116 | l_px_in = lasagne.layers.InputLayer((None, n_lat)) 117 | l_px_hid = lasagne.layers.DenseLayer( 118 | l_px_in, num_units=n_hid, 119 | W=lasagne.init.GlorotUniform(), 120 | nonlinearity=hid_nl, 121 | ) 122 | l_px_mu, l_px_logsigma = None, None 123 | 124 | if self.model == 'bernoulli': 125 | l_px_mu = lasagne.layers.DenseLayer( 126 | l_px_hid, num_units=n_out, 127 | nonlinearity=lasagne.nonlinearities.sigmoid, 128 | ) 129 | elif self.model == 'gaussian': 130 | l_px_mu = lasagne.layers.DenseLayer( 131 | l_px_hid, num_units=n_out, 132 | nonlinearity=None, 133 | ) 134 | l_px_logsigma = lasagne.layers.DenseLayer( 135 | l_px_hid, num_units=n_out, 136 | nonlinearity=relu_shift, 137 | ) 138 | 139 | # create p(a|z) 140 | l_pa_hid = lasagne.layers.DenseLayer( 141 | l_px_in, num_units=n_hid, 142 | nonlinearity=hid_nl, 143 | ) 144 | l_pa_mu = lasagne.layers.DenseLayer( 145 | l_pa_hid, num_units=n_aux, 146 | nonlinearity=None, 147 | ) 148 | l_pa_logsigma = lasagne.layers.DenseLayer( 149 | l_pa_hid, num_units=n_aux, 150 | W=lasagne.init.GlorotNormal(), 151 | b=lasagne.init.Normal(1e-3), 152 | nonlinearity=relu_shift, 153 | ) 154 | 155 | # create control variate (baseline) network 156 | l_cv_in = lasagne.layers.InputLayer( 157 | shape=(None, n_chan, n_dim, n_dim), 158 | input_var=X, 159 | ) 160 | l_cv_hid = lasagne.layers.DenseLayer( 161 | l_cv_in, num_units=n_hid_cv, 162 | nonlinearity=hid_nl, 163 | ) 164 | l_cv = lasagne.layers.DenseLayer( 165 | l_cv_hid, num_units=1, 166 | nonlinearity=None, 167 | ) 168 | 169 | # create variables for centering signal 170 | c = theano.shared(np.zeros((1,1), dtype=np.float64), broadcastable=(True,True)) 171 | v = theano.shared(np.zeros((1,1), dtype=np.float64), broadcastable=(True,True)) 172 | 173 | # store certain input layers for downstream (quick hack) 174 | self.input_layers = (l_qa_in, l_qz_in, l_px_in, l_cv_in) 175 | self.n_lat = n_lat 176 | self.n_hid = n_hid 177 | 178 | return l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \ 179 | l_qa_mu, l_qa_logsigma, l_qz_mu, l_qz_logsigma, \ 180 | l_qa, l_qz, l_cv, c, v 181 | 182 | def _create_components(self, deterministic=False): 183 | # load network input 184 | X = self.inputs[0] 185 | x = X.flatten(2) 186 | 187 | # duplicate entries to take into account multiple mc samples 188 | n_sam = self.n_sample 189 | n_out = x.shape[1] 190 | x = x.dimshuffle(0,'x',1).repeat(n_sam, axis=1).reshape((-1, n_out)) 191 | 192 | # load networks 193 | l_px_mu, l_px_logsigma, 
l_pa_mu, l_pa_logsigma, \ 194 | l_qa_mu, l_qa_logsigma, l_qz_mu, l_qz_logsigma, l_qa, l_qz, _, _, _ = self.network 195 | l_qa_in, l_qz_in, l_px_in, l_cv_in = self.input_layers 196 | 197 | # load network output 198 | qa_mu, qa_logsigma, a = lasagne.layers.get_output( 199 | [l_qa_mu, l_qa_logsigma, l_qa], 200 | deterministic=deterministic, 201 | ) 202 | qz_mu, z = lasagne.layers.get_output( 203 | [l_qz_mu, l_qz], 204 | {l_qz_in: a, l_qa_in: X}, 205 | deterministic=deterministic, 206 | ) 207 | pa_mu, pa_logsigma = lasagne.layers.get_output( 208 | [l_pa_mu, l_pa_logsigma], {l_px_in: z}, 209 | deterministic=deterministic, 210 | ) 211 | 212 | if self.model == 'bernoulli': 213 | px_mu = lasagne.layers.get_output( 214 | l_px_mu, {l_px_in: z}, deterministic=deterministic) 215 | elif self.model == 'gaussian': 216 | px_mu, px_logsigma = lasagne.layers.get_output( 217 | [l_px_mu, l_px_logsigma], {l_px_in: z}, 218 | deterministic=deterministic, 219 | ) 220 | 221 | # entropy term 222 | log_qa_given_x = log_normal2(a, qa_mu, qa_logsigma).sum(axis=1) 223 | log_qz_given_x = log_bernoulli(z, qz_mu).sum(axis=1) 224 | log_qz_given_x_dgz = log_bernoulli(dg(z), qz_mu).sum(axis=1) 225 | log_qza_given_x = log_qz_given_x + log_qa_given_x 226 | 227 | # log-probability term 228 | z_prior = T.ones_like(z)*np.float32(0.5) 229 | log_pz = log_bernoulli(z, z_prior).sum(axis=1) 230 | log_px_given_z = log_bernoulli(x, px_mu).sum(axis=1) 231 | log_pa_given_z = log_normal2(a, pa_mu, pa_logsigma).sum(axis=1) 232 | 233 | log_pxz = log_pa_given_z + log_px_given_z + log_pz 234 | 235 | # save them for later 236 | if deterministic == False: 237 | self.log_pxz = log_pxz 238 | self.log_px_given_z = log_px_given_z 239 | self.log_pz = log_pz 240 | self.log_qza_given_x = log_qza_given_x 241 | self.log_qa_given_x = log_qa_given_x 242 | self.log_qz_given_x = log_qz_given_x 243 | self.log_qz_given_x_dgz = log_qz_given_x_dgz 244 | 245 | # return log_paxz, log_qza_given_x 246 | return log_pxz, log_qza_given_x 247 | 248 | def create_objectives(self, deterministic=False): 249 | # load probabilities 250 | log_paxz, log_qza_given_x = self._create_components(deterministic=deterministic) 251 | 252 | # compute the evidence lower bound 253 | elbo = T.mean(log_paxz - log_qza_given_x) 254 | log_qa_given_x = self.log_qa_given_x 255 | 256 | # we don't use a spearate accuracy metric right now 257 | return -elbo, T.mean(log_qa_given_x) 258 | 259 | def create_gradients(self, loss, deterministic=False): 260 | # load networks 261 | l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \ 262 | l_qa_mu, l_qa_logsigma, l_qz_mu, l_qz_logsigma, l_qa, l_qz, l_cv, c, v = self.network 263 | 264 | # load params 265 | p_params = lasagne.layers.get_all_params( 266 | [l_px_mu, l_pa_mu, l_pa_logsigma], trainable=True) 267 | qa_params = lasagne.layers.get_all_params(l_qa_mu, trainable=True) 268 | qz_params = lasagne.layers.get_all_params(l_qz, trainable=True) 269 | cv_params = lasagne.layers.get_all_params(l_cv, trainable=True) 270 | 271 | # load neural net outputs (probabilities have been precomputed) 272 | log_pxz, log_px_given_z, log_pz = self.log_pxz, self.log_px_given_z, self.log_pz 273 | log_qza_given_x = self.log_qza_given_x 274 | log_qz_given_x = self.log_qz_given_x 275 | log_qz_given_x_dgz = self.log_qz_given_x_dgz 276 | cv = T.addbroadcast(lasagne.layers.get_output(l_cv),1) 277 | 278 | # compute learning signals 279 | l0 = log_px_given_z + log_pz - log_qz_given_x - cv # NOTE: this disn't have q(a) 280 | l_avg, l_var = l0.mean(), l0.var() 281 | c_new = 0.8*c + 
0.2*l_avg 282 | v_new = 0.8*v + 0.2*l_var 283 | l = (l0 - c_new) / T.maximum(1, T.sqrt(v_new)) 284 | l_target = (l0 - c_new) / T.maximum(1, T.sqrt(v_new)) 285 | 286 | # compute grad wrt p 287 | p_grads = T.grad(-log_pxz.mean(), p_params) 288 | 289 | # compute grad wrt q_a 290 | elbo = T.mean(log_pxz - log_qza_given_x) 291 | qa_grads = T.grad(-elbo, qa_params) 292 | 293 | # compute grad wrt q_z 294 | qz_target = T.mean(dg(l_target) * log_qz_given_x_dgz) 295 | qz_grads = T.grad(-0.2*qz_target, qz_params) # 5x slower rate for q 296 | 297 | # compute grad of cv net 298 | cv_target = T.mean(l0**2) 299 | cv_grads = [0.2*g for g in T.grad(cv_target, cv_params)] 300 | 301 | # combine and clip gradients 302 | clip_grad = 1 303 | max_norm = 5 304 | grads = p_grads + qa_grads + qz_grads + cv_grads 305 | mgrads = lasagne.updates.total_norm_constraint(grads, max_norm=max_norm) 306 | cgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads] 307 | 308 | return cgrads 309 | 310 | def gen_samples(self, deterministic=False): 311 | s = self.inputs[-1] 312 | # put it through the decoder 313 | _, _, l_px_in, _ = self.input_layers 314 | l_px_mu = self.network[0] 315 | px_mu = lasagne.layers.get_output(l_px_mu, {l_px_in : s}) 316 | 317 | return px_mu 318 | 319 | def get_params(self): 320 | # load networks 321 | l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \ 322 | l_qa_mu, l_qa_logsigma, l_qz_mu, l_qz_logsigma, \ 323 | l_qa, l_qz, l_cv, c, v = self.network 324 | 325 | if self.model == 'gaussian': 326 | raise NotImplementedError('The code below needs to implement Gaussians') 327 | 328 | # load params 329 | p_params = lasagne.layers.get_all_params( 330 | [l_px_mu, l_pa_mu, l_pa_logsigma], trainable=True) 331 | qa_params = lasagne.layers.get_all_params(l_qa_mu, trainable=True) 332 | qz_params = lasagne.layers.get_all_params(l_qz, trainable=True) 333 | cv_params = lasagne.layers.get_all_params(l_cv, trainable=True) 334 | 335 | return p_params + qa_params + qz_params + cv_params 336 | 337 | def create_updates(self, grads, params, alpha, opt_alg, opt_params): 338 | # call super-class to generate SGD/ADAM updates 339 | grad_updates = Model.create_updates(self, grads, params, alpha, opt_alg, opt_params) 340 | 341 | # create updates for centering signal 342 | # load neural net outputs (probabilities have been precomputed) 343 | l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \ 344 | l_qa_mu, l_qa_logsigma, l_qz_mu, l_qz_logsigma, l_qa, l_qz, l_cv, c, v = self.network 345 | 346 | # load neural net outputs (probabilities have been precomputed) 347 | log_pxz, log_px_given_z, log_pz = self.log_pxz, self.log_px_given_z, self.log_pz 348 | log_qz_given_x = self.log_qz_given_x 349 | cv = T.addbroadcast(lasagne.layers.get_output(l_cv),1) 350 | 351 | # compute learning signals 352 | l = log_px_given_z + log_pz - log_qz_given_x - cv 353 | l_avg, l_var = l.mean(), l.var() 354 | c_new = 0.8*c + 0.2*l_avg 355 | v_new = 0.8*v + 0.2*l_var 356 | 357 | # compute update for centering signal 358 | cv_updates = {c: c_new, v: v_new} 359 | 360 | return OrderedDict(grad_updates.items() + cv_updates.items()) 361 | 362 | def create_llik(self): 363 | # load network input 364 | X = self.inputs[0] 365 | x = X.flatten(2) 366 | 367 | # duplicate entries to take into account multiple mc samples 368 | n_sam = self.n_sample 369 | n_out = x.shape[1] 370 | x = x.dimshuffle(0,'x',1).repeat(n_sam, axis=1).reshape((-1, n_out)) 371 | 372 | # load networks 373 | l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \ 374 | l_qa_mu, l_qa_logsigma, l_qz_mu, 
l_qz_logsigma, l_qa, l_qz, _, _, _ = self.network 375 | l_qa_in, l_qz_in, l_px_in, l_cv_in = self.input_layers 376 | 377 | # load network output 378 | qa_mu, qa_logsigma, a = lasagne.layers.get_output( 379 | [l_qa_mu, l_qa_logsigma, l_qa], 380 | ) 381 | qz_mu, z = lasagne.layers.get_output( 382 | [l_qz_mu, l_qz], 383 | {l_qz_in: a, l_qa_in: X}, 384 | ) 385 | pa_mu, pa_logsigma = lasagne.layers.get_output( 386 | [l_pa_mu, l_pa_logsigma], {l_px_in: z}, 387 | ) 388 | 389 | px_mu = lasagne.layers.get_output(l_px_mu, {l_px_in: z}) 390 | 391 | # entropy term 392 | log_qa_given_x = log_normal2(a, qa_mu, qa_logsigma).sum(axis=1) 393 | log_qz_given_x = log_bernoulli(z, qz_mu).sum(axis=1) 394 | log_qz_given_x_dgz = log_bernoulli(dg(z), qz_mu).sum(axis=1) 395 | log_qza_given_x = log_qz_given_x + log_qa_given_x 396 | 397 | # log-probability term 398 | z_prior = T.ones_like(z)*np.float32(0.5) 399 | log_pz = log_bernoulli(z, z_prior).sum(axis=1) 400 | log_px_given_z = log_bernoulli(x, px_mu).sum(axis=1) 401 | log_pa_given_z = log_normal2(a, pa_mu, pa_logsigma).sum(axis=1) 402 | 403 | t = log_pa_given_z + log_px_given_z + log_pz - log_qz_given_x - log_qa_given_x 404 | t = t.reshape([100, n_sam]) 405 | 406 | # compute loss 407 | llik = Tlogsumexp( t, axis=1) # (n_bat,) 408 | 409 | return T.mean(llik) -------------------------------------------------------------------------------- /models/rbm_dadgm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano.tensor as T 3 | from lasagne.layers import ( 4 | InputLayer, DenseLayer, ElemwiseSumLayer, 5 | reshape, flatten, get_all_params, get_output, 6 | ) 7 | from lasagne.updates import total_norm_constraint 8 | from lasagne.init import GlorotNormal, Normal 9 | from layers import GumbelSoftmaxSampleLayer, GaussianSampleLayer 10 | from distributions import log_bernoulli, log_normal2 11 | from gsm import GSM 12 | from rbm import RBM 13 | 14 | import theano 15 | import lasagne 16 | from theano.tensor.shared_randomstreams import RandomStreams 17 | 18 | 19 | class RBM_DADGM(GSM, RBM): 20 | """Restricted Boltzmann Machine trained with persistent contrastive 21 | divergence for learning p; Auxiliary Deep Generative Model trained 22 | using Gumbel Softmax Reparametrization for learning q. 
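    A rough intended usage, with hypothetical values (binarized MNIST-style input):

        model = RBM_DADGM(n_dim=28, n_out=10, n_chan=1,
                          opt_params={'lr': 1e-3, 'nb': 100})
        model.fit(X_train, Y_train, X_val, Y_val, n_epoch=10, n_batch=100)

    fit() alternates PCD updates of the RBM (the model p) with several epochs of
    Gumbel-Softmax training of the auxiliary inference network (the model q).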
23 | """ 24 | 25 | def __init__( 26 | self, n_dim, n_out, n_chan=1, n_superbatch=12800, opt_alg='adam', 27 | opt_params={'lr': 1e-3, 'b1': 0.9, 'b2': 0.99} 28 | ): 29 | self.numpy_rng = np.random.RandomState(1234) 30 | self.theano_rng = RandomStreams(self.numpy_rng.randint(2 ** 30)) 31 | 32 | self.n_dim = n_dim 33 | self.n_out = n_out 34 | self.n_superbatch = n_superbatch 35 | self.alg = opt_alg 36 | self.n_class = 10 37 | 38 | lr = opt_params.get('lr') 39 | n_batch = opt_params.get('nb') 40 | 41 | train_set_x = theano.shared( 42 | np.empty( 43 | (n_superbatch, n_chan, n_dim, n_dim), 44 | dtype=theano.config.floatX 45 | ), borrow=False, 46 | ) 47 | val_set_x = theano.shared(np.empty( 48 | (n_superbatch, n_chan, n_dim, n_dim), 49 | dtype=theano.config.floatX), 50 | borrow=False, 51 | ) 52 | train_set_y = theano.shared( 53 | np.empty( 54 | (n_superbatch,), 55 | dtype=theano.config.floatX 56 | ), borrow=False, 57 | ) 58 | val_set_y = theano.shared( 59 | np.empty( 60 | (n_superbatch,), 61 | dtype=theano.config.floatX 62 | ), borrow=False, 63 | ) 64 | train_set_y_int = T.cast(train_set_y, 'int32') 65 | val_set_y_int = T.cast(val_set_y, 'int32') 66 | 67 | train_rbm_px_mu = theano.shared( 68 | np.empty( 69 | (n_superbatch, self.n_aux), 70 | dtype=theano.config.floatX 71 | ), borrow=False, 72 | ) 73 | 74 | X = T.tensor4(dtype=theano.config.floatX) 75 | S = T.tensor3(dtype=theano.config.floatX) 76 | Y = T.ivector() 77 | px_mu = T.lscalar(dtype=config.floatX) 78 | idx1, idx2 = T.lscalar(), T.lscalar() 79 | alpha = T.scalar(dtype=theano.config.floatX) # learning rate 80 | self.inputs = (X, Y, idx1, idx2, S, px_mu) 81 | 82 | # ---------------------------- 83 | # Begin RBM-only 84 | self.rbm_network = self.create_rbm_model(n_dim, n_out, n_chan) 85 | persistent_chain = theano.shared( 86 | np.zeros( 87 | (n_batch, self.n_hidden), 88 | dtype=theano.config.floatX 89 | ), borrow=True, 90 | ) 91 | rbm_cost, rbm_acc, rbm_updates = self.get_rbm_objective_and_updates( 92 | alpha, lr=lr, persistent=persistent_chain, 93 | ) 94 | self.rbm_objectives = (rbm_cost, rbm_acc) 95 | self.rbm_train = theano.function( 96 | [idx1, idx2, alpha], 97 | [rbm_cost, rbm_acc], 98 | updates=rbm_updates, 99 | givens={ 100 | X: train_set_x[idx1:idx2], 101 | Y: train_set_y_int[idx1:idx2] 102 | }, 103 | on_unused_input='warn', 104 | ) 105 | # End RBM-only 106 | # ---------------------------- 107 | # Begin DADGM-only 108 | tau = theano.shared( 109 | np.float32(5.0), name='temperature', 110 | allow_downcast=True, borrow=False, 111 | ) 112 | self.tau = tau 113 | self.dadgm_network = self.create_dadgm_model( 114 | X, Y, n_dim, n_out, n_chan, 115 | ) 116 | dadgm_loss, dadgm_acc = self.create_dadgm_objectives(False) 117 | self.dadgm_objectives = (dadgm_loss, dadgm_acc) 118 | dadgm_params = self.get_dadgm_params() 119 | dadgm_grads = self.create_dadgm_gradients(dadgm_loss, False) 120 | dadgm_updates = self.create_dadgm_updates( 121 | dadgm_grads, dadgm_params, alpha, opt_alg, opt_params, 122 | ) 123 | self.dadgm_train = theano.function( 124 | [idx1, idx2, alpha], [dadgm_loss, dadgm_acc], 125 | updates=dadgm_updates, 126 | givens={ 127 | X: train_set_x[idx1:idx2], 128 | Y: train_set_y_int[idx1:idx2], 129 | px_mu: train_rbm_px_mu, 130 | }, 131 | on_unused_input='warn', 132 | ) 133 | self.dadgm_loss = theano.function( 134 | [X, Y], [dadgm_loss, dadgm_acc], 135 | on_unused_input='warn', 136 | ) 137 | # End DADGM-only 138 | # ---------------------------- 139 | self.n_batch = n_batch 140 | # parameters for sampling 141 | self.n_chain = 100 142 | 
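        # NOTE (added): at this point two compiled training functions coexist:
        #   self.rbm_train(idx1, idx2, alpha)   -- one PCD/CD-k update of (W, hbias, vbias)
        #   self.dadgm_train(idx1, idx2, alpha) -- one step on the Gumbel-Softmax ELBO of
        #                                          the auxiliary inference network, using
        #                                          the shared temperature self.tau
        # fit() below interleaves the two.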
143 | # save data variables 144 | self.train_set_x = train_set_x 145 | self.train_set_y = train_set_y 146 | self.val_set_x = val_set_x 147 | self.val_set_y = val_set_y 148 | self.train_rbm_px_mu = train_rbm_px_mu 149 | self.data_loaded = False 150 | 151 | def create_rbm_model(self, n_dim, n_out, n_chan=1, n_class=10): 152 | n_visible = n_chan*n_dim*n_dim # size of visible layer 153 | n_hidden = 500 # size of hidden layer 154 | k_steps = 15 # number of steps during CD/PCD 155 | 156 | initial_W = np.asarray( 157 | self.numpy_rng.uniform( 158 | low=-4 * np.sqrt(6. / (n_hidden + n_visible)), 159 | high=4 * np.sqrt(6. / (n_hidden + n_visible)), 160 | size=(n_visible, n_hidden) 161 | ), dtype=theano.config.floatX, 162 | ) 163 | W = theano.shared(value=initial_W, name='W', borrow=True) 164 | hbias = theano.shared( 165 | value=np.zeros( 166 | n_hidden, 167 | dtype=theano.config.floatX 168 | ), name='hbias', 169 | borrow=True, 170 | ) 171 | vbias = theano.shared( 172 | value=np.zeros( 173 | n_visible, 174 | dtype=theano.config.floatX 175 | ), name='vbias', 176 | borrow=True, 177 | ) 178 | 179 | self.W = W 180 | self.hbias = hbias 181 | self.vbias = vbias 182 | 183 | self.rbm_params = [self.W, self.hbias, self.vbias] 184 | 185 | # network params 186 | self.n_visible = n_visible 187 | self.n_hidden = n_hidden 188 | self.k_steps = k_steps 189 | 190 | return None 191 | 192 | def create_dadgm_model(self, X, Y, n_dim, n_out, n_chan=1, n_class=10): 193 | n_cat = 20 # number of categorical distributions 194 | n_lat = n_class*n_cat # latent stochastic variables 195 | n_aux = 10 # number of auxiliary variables 196 | n_hid = 500 # size of hidden layer in encoder/decoder 197 | n_in = n_out = n_dim * n_dim * n_chan 198 | tau = self.tau 199 | hid_nl = T.nnet.relu 200 | relu_shift = lambda av: T.nnet.relu(av+10)-10 201 | 202 | # create the encoder network 203 | # - create q(a|x) 204 | qa_net_in = InputLayer(shape=(None, n_in), input_var=X) 205 | qa_net = DenseLayer( 206 | qa_net_in, num_units=n_hid, 207 | W=GlorotNormal('relu'), b=Normal(1e-3), 208 | nonlinearity=hid_nl, 209 | ) 210 | qa_net_mu = DenseLayer( 211 | qa_net, num_units=n_aux, 212 | W=GlorotNormal(), b=Normal(1e-3), 213 | nonlinearity=None, 214 | ) 215 | qa_net_logsigma = DenseLayer( 216 | qa_net, num_units=n_aux, 217 | W=GlorotNormal(), b=Normal(1e-3), 218 | nonlinearity=relu_shift, 219 | ) 220 | qa_net_sample = GaussianSampleLayer(qa_net_mu, qa_net_logsigma) 221 | # - create q(z|a, x) 222 | qz_net_in = lasagne.layers.InputLayer((None, n_aux)) 223 | qz_net_a = DenseLayer( 224 | qz_net_in, num_units=n_hid, 225 | nonlinearity=hid_nl, 226 | ) 227 | qz_net_b = DenseLayer( 228 | qa_net_in, num_units=n_hid, 229 | nonlinearity=hid_nl, 230 | ) 231 | qz_net = ElemwiseSumLayer([qz_net_a, qz_net_b]) 232 | qz_net = DenseLayer( 233 | qz_net, num_units=n_hid, 234 | nonlinearity=hid_nl 235 | ) 236 | qz_net_mu = DenseLayer( 237 | qz_net, num_units=n_lat, 238 | nonlinearity=None, 239 | ) 240 | qz_net_mu = reshape(qz_net_mu, (-1, n_class)) 241 | qz_net_sample = GumbelSoftmaxSampleLayer(qz_net_mu, tau) 242 | qz_net_sample = reshape(qz_net_sample, (-1, n_cat, n_class)) 243 | # create the decoder network 244 | # - create p(x|z) 245 | px_net_in = lasagne.layers.InputLayer((None, n_cat, n_class)) 246 | # --- rest is created from RBM --- 247 | # - create p(a|z) 248 | pa_net = DenseLayer( 249 | flatten(px_net_in), num_units=n_hid, 250 | W=GlorotNormal('relu'), b=Normal(1e-3), 251 | nonlinearity=hid_nl, 252 | ) 253 | pa_net_mu = DenseLayer( 254 | pa_net, num_units=n_aux, 255 
| W=GlorotNormal(), 256 | b=Normal(1e-3), 257 | nonlinearity=None, 258 | ) 259 | pa_net_logsigma = DenseLayer( 260 | pa_net, num_units=n_aux, 261 | W=GlorotNormal(), 262 | b=Normal(1e-3), 263 | nonlinearity=relu_shift, 264 | ) 265 | # save network params 266 | self.n_cat = n_cat 267 | self.input_layers = (qa_net_in, qz_net_in, px_net_in) 268 | 269 | return pa_net_mu, pa_net_logsigma, qz_net_mu, \ 270 | qa_net_mu, qa_net_logsigma, qz_net_sample, qa_net_sample, 271 | 272 | def get_rbm_objective_and_updates(self, alpha, lr=0.1, persistent=None): 273 | X = self.inputs[0] 274 | x = X.reshape((-1, self.n_visible)) 275 | # compute positive phase 276 | pre_sigmoid_ph, ph_mean, ph_sample = self.sample_h_given_v(x) 277 | # for PCD, we initialize from the old state of the chain 278 | chain_start = persistent 279 | ( 280 | [ 281 | pre_sigmoid_nvs, 282 | nv_means, 283 | nv_samples, 284 | pre_sigmoid_nhs, 285 | nh_means, 286 | nh_samples 287 | ], 288 | updates 289 | ) = theano.scan( 290 | self.gibbs_hvh, 291 | outputs_info=[None, None, None, None, None, chain_start], 292 | n_steps=15, 293 | name="gibbs_hvh", 294 | ) 295 | 296 | chain_end = nv_samples[-1] 297 | cost = T.mean(self.free_energy(x))-T.mean(self.free_energy(chain_end)) 298 | # We must not compute the gradient through the gibbs sampling 299 | params = self.get_params() 300 | gparams = T.grad(cost, self.params, consider_constant=[chain_end]) 301 | gparams = [grad * alpha for grad in gparams] 302 | 303 | for gparam, param in zip(gparams, params): 304 | updates[param] = param - gparam * T.cast(lr, dtype=theano.config.floatX) 305 | 306 | updates[persistent] = nh_samples[-1] 307 | monitoring_cost = self.get_pseudo_likelihood_cost(x, updates) 308 | 309 | return monitoring_cost, monitoring_cost, updates 310 | 311 | def create_dadgm_objectives(self, deterministic=False): 312 | X = self.inputs[0] 313 | x = X.reshape((-1, self.n_out)) 314 | px_mu = self.inputs[-1] 315 | 316 | # load network params 317 | n_class = self.n_class 318 | n_cat = self.n_cat 319 | 320 | pa_net_mu, pa_net_logsigma, qz_net_mu, qa_net_mu, \ 321 | qa_net_logsigma, qz_net_sample, qa_net_sample = self.network 322 | qa_net_in, qz_net_in, px_net_in = self.input_layers 323 | 324 | qa_mu, qa_logsigma, qa_sample = get_output( 325 | [qa_net_mu, qa_net_logsigma, qa_net_sample], 326 | deterministic=deterministic, 327 | ) 328 | qz_mu, qz_sample = get_output( 329 | [qz_net_mu, qz_net_sample], 330 | {qz_net_in: qa_sample, qa_net_in: x}, 331 | deterministic=deterministic, 332 | ) 333 | pa_mu, pa_logsigma = get_output( 334 | [pa_net_mu, pa_net_logsigma], 335 | {px_net_in: qz_sample}, 336 | deterministic=deterministic, 337 | ) 338 | # Load this from RBM 339 | px_mu = self.inputs[-1] 340 | 341 | qz_given_ax = T.nnet.softmax(qz_mu) 342 | log_qz_given_ax = T.log(qz_given_ax + 1e-20) 343 | entropy = T.reshape( 344 | qz_given_ax * (log_qz_given_ax - T.log(1.0 / n_class)), 345 | (-1, n_cat, n_class), 346 | ) 347 | entropy = T.sum(entropy, axis=[1, 2]) 348 | 349 | log_px_given_z = log_bernoulli(x, px_mu).sum(axis=1) 350 | log_pa_given_z = log_normal2(qa_sample, pa_mu, pa_logsigma).sum(axis=1) 351 | log_paxz = log_pa_given_z + log_px_given_z 352 | 353 | # logp(z)+logp(a|z)-logq(a)-logq(z|a) 354 | elbo = T.mean(log_paxz - entropy) 355 | 356 | return -elbo, -T.mean(entropy) 357 | 358 | def create_dadgm_gradients(self, loss, deterministic=False): 359 | grads = GSM.create_gradients(self, loss, deterministic) 360 | 361 | # combine and clip gradients 362 | clip_grad, max_norm = 1, 5 363 | mgrads = 
total_norm_constraint(grads, max_norm=max_norm)
364 |         cgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
365 | 
366 |         return cgrads
367 | 
368 |     def create_dadgm_updates(self, grads, params, alpha, opt_alg, opt_params):
369 |         return GSM.create_updates(
370 |             self, grads, params, alpha, opt_alg, opt_params
371 |         )
372 | 
373 |     def get_rbm_params(self):
374 |         return self.rbm_params
375 | 
376 |     def get_dadgm_params(self):
377 |         pa_net_mu, pa_net_logsigma, qz_net_mu, \
378 |             qa_net_mu, qa_net_logsigma, \
379 |             qz_net_sample, qa_net_sample = self.dadgm_network
380 | 
381 |         p_params = get_all_params(
382 |             [pa_net_mu, pa_net_logsigma],
383 |             trainable=True,
384 |         )
385 |         qa_params = get_all_params(qa_net_sample, trainable=True)
386 |         qz_params = get_all_params(qz_net_sample, trainable=True)
387 | 
388 |         return p_params + qa_params + qz_params
389 | 
390 |     def fit_dadgm(
391 |         self, X_train, Y_train, X_val, Y_val,
392 |         n_epoch=10, n_batch=100, logname='run',
393 |     ):
394 |         alpha = 1.0  # learning rate, which can be adjusted later
395 |         tau0 = 1.0  # initial temp
396 |         MIN_TEMP = 0.5  # minimum temp
397 |         ANNEAL_RATE = 0.00003  # annealing rate
398 |         np_temp = tau0
399 |         n_data = len(X_train)
400 |         n_superbatch = self.n_superbatch
401 |         i = 1  # track # of train() executions
402 | 
403 |         n_flat_dim = np.prod(X_train.shape[1:])
404 |         X_train = X_train.reshape(-1, n_flat_dim)
405 |         X_val = X_val.reshape(-1, n_flat_dim)
406 | 
407 |         for epoch in range(n_epoch):
408 |             # In each epoch, we do a full pass over the training data:
409 |             train_batches, train_err, train_acc = 0, 0, 0
410 |             start_time = time.time()
411 | 
412 |             # iterate over superbatches to save time on GPU memory transfer
413 |             for X_sb, Y_sb in self.iterate_superbatches(
414 |                 X_train, Y_train, n_superbatch,
415 |                 datatype='train', shuffle=True,
416 |             ):
417 |                 for idx1, idx2 in iterate_minibatch_idx(len(X_sb), n_batch):
418 |                     dadgm_err, dadgm_acc = self.dadgm_train(idx1, idx2, alpha)
419 | 
420 |                     # anneal temp and learning rate
421 |                     if i % 1000 == 1:
422 |                         alpha *= 0.9
423 |                         np_temp = np.maximum(tau0*np.exp(-ANNEAL_RATE*i), MIN_TEMP)
424 |                         self.tau.set_value(np_temp, borrow=False)
425 | 
426 |                     # collect metrics
427 |                     i += 1
428 |                     train_batches += 1
429 |                     train_err += dadgm_err
430 |                     train_acc += dadgm_acc
431 | 
432 |     def fit(
433 |         self, X_train, Y_train, X_val, Y_val,
434 |         n_epoch=10, n_batch=100, logname='run'
435 |     ):
436 |         """Train 1 epoch of PCD to learn p, followed by K epochs
437 |         of DADGM via GSM to fit q to p.
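
        Per minibatch this amounts to (sketch of the loop below):

            rbm_err, rbm_acc = self.rbm_train(idx1, idx2, alpha)    # one PCD update of p
            self.fit_dadgm(X_train, Y_train, X_val, Y_val,
                           n_epoch=n_loop_q_per_p, n_batch=n_batch) # refit q to the new p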
438 | """ 439 | alpha = 1.0 # learning rate, which can be adjusted later 440 | n_data = len(X_train) 441 | n_superbatch = self.n_superbatch 442 | n_loop_q_per_p = 10 443 | 444 | n_flat_dim = np.prod(X_train.shape[1:]) 445 | X_train = X_train.reshape(-1, n_flat_dim) 446 | X_val = X_val.reshape(-1, n_flat_dim) 447 | 448 | for epoch in range(n_epoch): 449 | # In each epoch, we do a full pass over the training data: 450 | train_batches, train_err, train_acc = 0, 0, 0 451 | start_time = time.time() 452 | 453 | # iterate over superbatches to save time on GPU memory transfer 454 | for X_sb, Y_sb in self.iterate_superbatches( 455 | X_train, Y_train, n_superbatch, 456 | datatype='train', shuffle=True, 457 | ): 458 | for idx1, idx2 in iterate_minibatch_idx(len(X_sb), n_batch): 459 | rbm_err, rbm_acc = self.rbm_train(idx1, idx2, alpha) 460 | self.train_rbm_px_mu.set_value(rbm_err) 461 | # Fit n_loop epochs of fitting q to current p 462 | self.fit_dadgm( 463 | X_train, Y_train, X_val, Y_val, 464 | n_epoch=n_loop_q_per_p, n_batch=n_batch, logname=logname, 465 | ) 466 | 467 | train_batches += 1 468 | train_err += err 469 | train_acc += acc 470 | 471 | def hallucinate(self): 472 | pass 473 | 474 | def load_params(self, rbm_params, dadgm_params): 475 | """Load a given set of parameters""" 476 | self.rbm_params = rbm_params 477 | self.dadgm_params = dadgm_params 478 | 479 | def dump_params(self): 480 | """Dump a given set of parameters""" 481 | return (self.rbm_params, self.dadgm_params) 482 | -------------------------------------------------------------------------------- /models/variational_rbm.py: -------------------------------------------------------------------------------- 1 | import pdb 2 | import time 3 | import pickle 4 | import numpy as np 5 | import theano 6 | import lasagne 7 | import theano.tensor as T 8 | from theano.tensor.shared_randomstreams import RandomStreams 9 | from collections import OrderedDict 10 | 11 | from model import Model 12 | from abstractrbm import AbstractRBM 13 | from helpers import iterate_minibatch_idx, evaluate, log_metrics 14 | from helpers import Tlogsumexp, Tlogaddexp 15 | 16 | from theano.compile.nanguardmode import NanGuardMode 17 | 18 | create_zmat = lambda x, y: np.zeros((x, y)).astype(theano.config.floatX) 19 | create_vec = lambda y: np.zeros(y,).astype(theano.config.floatX) 20 | create_sca = lambda x: (np.cast[theano.config.floatX](x)).astype(theano.config.floatX) 21 | 22 | class VariationalRBM(AbstractRBM): 23 | """Restricted Boltzmann Machine with VI""" 24 | def __init__( 25 | self, n_dim, n_out, n_chan=1, n_superbatch=12800, opt_alg='adam', 26 | opt_params={'lr': 1e-3, 'b1': 0.9, 'b2': 0.99} 27 | ): 28 | """RBM constructor. 29 | Defines the parameters of the model along with 30 | basic operations for inferring hidden from visible (and vice-versa), 31 | as well as for performing CD updates. 
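
        In this variational variant there is no CD/PCD chain; instead a mixture of
        n_qk fully factorized Bernoulli distributions q(x) (parameterized by Phi)
        is trained alongside the RBM, and log Z is estimated by importance sampling
        with n_mc samples per mixture component, roughly
        logsumexp_s[log F(x_s) - log q(x_s)] - log(n_qk * n_mc),
        as set up in create_objectives below.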
32 | """ 33 | self.numpy_rng = np.random.RandomState(1234) 34 | self.theano_rng = RandomStreams(self.numpy_rng.randint(2 ** 30)) 35 | self.create_mat = lambda x, y: self.numpy_rng.normal(0, 0.01, (x, y)).astype(theano.config.floatX) 36 | 37 | # save config 38 | n_batch = opt_params.get('nb') 39 | self.n_hidden = 100 40 | self.n_visible = n_chan*n_dim*n_dim # size of visible layer 41 | self.n_batch = n_batch 42 | self.n_qk = 10 # num of components in MoB used of q 43 | self.n_mc = 30 # num of monte carlo samples from each MoB component 44 | 45 | self.n_dim = n_dim 46 | self.n_out = n_out 47 | self.n_superbatch = n_superbatch 48 | self.alg = opt_alg 49 | 50 | # set up general RBM methods 51 | AbstractRBM.__init__(self, n_dim, n_chan, n_out, n_superbatch, opt_alg, opt_params) 52 | 53 | # create updates 54 | alpha = T.scalar(dtype=theano.config.floatX) # learning rate 55 | 56 | # save config 57 | self.n_class = 2 58 | self.n_dim = n_dim 59 | self.n_out = n_out 60 | 61 | 62 | self.n_components = self.n_qk 63 | self.n_samples = self.n_mc 64 | self.n_tot_samples = self.n_samples*self.n_components 65 | 66 | 67 | # create input variables 68 | D, idx1, idx2 = self.create_inputs() 69 | 70 | # create model 71 | self.network = self.create_model() 72 | 73 | # create objectives 74 | loglik, plik = self.create_objectives(D) 75 | 76 | # create gradients 77 | dL_Theta, dE_Theta, dlogZ_Theta, dL_Phi = self.create_gradients() 78 | grads = dL_Theta, dE_Theta, dlogZ_Theta, dL_Phi 79 | 80 | # create updates 81 | uL_Theta, uL_Phi, avg_updates, avg_Theta_updates \ 82 | = self.create_updates(grads, None, alpha, opt_alg, opt_params) 83 | 84 | # logF_avg, Z_avg = self.create_llik_estimate(D) 85 | 86 | mode = NanGuardMode(nan_is_error=True, inf_is_error=False, big_is_error=False) 87 | mode = None 88 | 89 | common_update1 = OrderedDict(avg_updates.items() + uL_Phi.items()) 90 | self.train_q = theano.function([idx1, idx2], [loglik, plik], 91 | updates=common_update1, mode=mode, 92 | givens={D: self.train_set_x[idx1:idx2]}) 93 | 94 | common_update2 = OrderedDict(avg_Theta_updates.items() + uL_Theta.items()) 95 | self.train_p = theano.function([idx1, idx2], [loglik, plik], 96 | updates=common_update2, mode=mode, on_unused_input='warn', 97 | givens={D: self.train_set_x[idx1:idx2]}) 98 | # self.llik = theano.function([D], logF_avg - T.log(Z_avg), mode=mode) 99 | 100 | common_update3 = OrderedDict(common_update1.items() + common_update2.items()) 101 | self.train = theano.function([idx1, idx2], [loglik, plik], 102 | updates=common_update3, mode=mode, 103 | givens={D: self.train_set_x[idx1:idx2]}) 104 | 105 | def create_inputs(self): 106 | # allocate symbolic variables for the data 107 | idx1, idx2 = T.lscalar(), T.lscalar() 108 | D = T.tensor4('D', dtype=theano.config.floatX) # data matrixX) # learning rate 109 | # self.Z = T.tensor2(dtype=theano.config.floatX) 110 | return D, idx1, idx2 111 | 112 | def _free_energy(self, X, avg=False): 113 | bv_vec = self.vbias.reshape((self.n_visible,1)) 114 | bh_vec = self.hbias.reshape((self.n_hidden,1)) 115 | 116 | W = self.W_avg if avg else self.W 117 | W = W.T 118 | 119 | logF = T.dot(bv_vec.T, X) \ 120 | + T.sum(T.log(1. 
+ T.exp(bh_vec + T.dot(W, X))), axis=0) 121 | 122 | return logF 123 | 124 | @property 125 | def Z(self): 126 | return np.exp(self.logZ_avg.get_value()) 127 | 128 | def create_model(self): 129 | self.W = theano.shared(self.create_mat(self.n_visible, self.n_hidden)) 130 | self.vbias = theano.shared(create_vec(self.n_visible)) 131 | self.hbias = theano.shared(create_vec(self.n_hidden)) 132 | self.params_p = [self.W, self.vbias, self.hbias] 133 | 134 | # running averages 135 | self.inner_it = theano.shared(create_sca(0.0)) 136 | self.loga = theano.shared(create_sca(0.0)) 137 | self.logZ_avg = theano.shared(create_sca(0.0)) 138 | self.grad_logZ_avg = [ 139 | theano.shared(create_zmat(self.n_visible, self.n_hidden)), 140 | theano.shared(create_vec(self.n_visible)), 141 | theano.shared(create_vec(self.n_hidden)) 142 | ] 143 | self.grad_p_avg = [ 144 | theano.shared(create_zmat(self.n_visible, self.n_hidden)), 145 | theano.shared(create_vec(self.n_visible)), 146 | theano.shared(create_vec(self.n_hidden)) 147 | ] 148 | self.W_avg = theano.shared(self.W.get_value().copy()) 149 | self.vbias_avg = theano.shared(self.vbias.get_value().copy()) 150 | self.hbias_avg = theano.shared(self.hbias.get_value().copy()) 151 | 152 | self.bit_i_idx = theano.shared(value=0, name='bit_i_idx') 153 | 154 | # create approximating distribution 155 | create_mat = lambda x, y: self.numpy_rng.normal(0, 0.01, (x, y)).astype(theano.config.floatX) 156 | self.pi = np.full( (self.n_qk,1), 1./float(self.n_qk) ).astype(theano.config.floatX) 157 | self.Phi = theano.shared(create_mat(self.n_visible,self.n_qk)) 158 | self.params_q = [self.Phi] 159 | 160 | # create symbolic variables representing the samples 161 | Qk = 1./(1+T.exp(-self.Phi)) # (n,k) component probabilities 162 | Qt = T.tile(T.reshape(Qk, (self.n_visible, self.n_qk, 1)), (1, 1, self.n_mc)) 163 | Qs = self.theano_rng.binomial(size=Qt.shape, p=Qt, dtype=Qt.dtype) 164 | self.q_samples = Qs 165 | self.Qk = Qk 166 | 167 | return None 168 | 169 | def _create_components(self, D): 170 | # shorthands 171 | k, t, s = self.n_components, self.n_tot_samples, self.n_samples 172 | n = self.n_visible 173 | 174 | ## compute bound objective 175 | 176 | X = self.q_samples 177 | D = D.reshape((-1, self.n_visible)).T 178 | X2 = T.flatten(X, outdim=2) 179 | 180 | # compute the probability of each datapoint under each component 181 | 182 | # reshape tensors 183 | QQ = self.Qk.reshape((n,k,1)) 184 | XX = X2.dimshuffle(0,'x',1) 185 | self.XX = XX 186 | T.addbroadcast(QQ, 2) 187 | T.addbroadcast(XX, 1) 188 | 189 | # compute the probabilities (log version is 2x slower...) 
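        # NOTE (added): shapes here are QQ: (n_visible, n_qk, 1) and
        # XX: (n_visible, 1, n_qk*n_mc); broadcasting the Bernoulli likelihood
        # QQ*XX + (1-QQ)*(1-XX) and taking the product over axis 0 gives the
        # probability of every sample under every mixture component,
        # Q_tot: (n_qk, n_qk*n_mc). The commented-out log-space variant below is
        # numerically safer but was found to be ~2x slower; it can be swapped in
        # if Q underflows.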
190 | # logQ_prob = T.log(QQ*XX + (1.-QQ)*(1.-XX)) # (n, k, t) 191 | # logQ_tot = T.sum(logQ_prob, axis=0) # (k, t) 192 | # logQ = Tlogsumexp(np.log(self.pi_uniform) + logQ_tot, axis=0) # (t,) 193 | Q_tot = T.prod(QQ*XX + (1-QQ)*(1-XX), axis=0) 194 | Q = T.sum(self.pi * Q_tot, axis=0) # (t,) 195 | logQ = T.log(Q) 196 | 197 | # compute energies of the samples, dim=(1, n_tot_samples) 198 | logF = self._free_energy(X2) 199 | 200 | # free energy of the data 201 | logF_D = self._free_energy(D) 202 | 203 | self._components = (logF, logQ, logF_D) 204 | 205 | def create_objectives(self, D, deterministic=False): 206 | self._create_components(D) 207 | (logF, logQ, logF_D_vec) = self._components 208 | t = self.n_qk * self.n_mc 209 | 210 | # set up bound 211 | t = self.n_qk * self.n_mc 212 | logFQ = logF - logQ # (t,) 213 | logFQ2 = 2.*logF - 2.*logQ # (t,) 214 | logFQ = logFQ.flatten() 215 | logFQ2 = logFQ2.flatten() 216 | logZ = Tlogsumexp(logFQ, axis=0) - T.log(t) 217 | logbnd = Tlogsumexp(logFQ2, axis=0) - T.log(t) 218 | 219 | logF_D = logF_D_vec.mean() 220 | 221 | loglik = logF_D - 0.5*logZ 222 | loglikbnd = logF_D - 0.5*logbnd 223 | 224 | self.logbnd = logbnd 225 | self.logZ = logZ 226 | 227 | # set up pseudolikelihood 228 | plik = self.create_pseudoliklihood(D) 229 | 230 | return loglik, plik 231 | 232 | def get_params(self): 233 | return [self.W, self.vbias, self.hbias, self.Phi] 234 | 235 | def get_params_p(self): 236 | return [self.W, self.vbias, self.hbias] 237 | 238 | def get_params_q(self): 239 | return [self.Phi] 240 | 241 | def create_gradients(self): 242 | k, s, n = self.n_qk, self.n_mc, self.n_visible 243 | (logF, logQ, logF_D_vec) = self._components 244 | logbnd, logZ = self.logbnd, self.logZ 245 | logF_D = logF_D_vec.mean() 246 | X = self.q_samples 247 | QQ = self.Qk.reshape((n,k,1)) 248 | 249 | logF2 = 2.*logF 250 | logQ2 = 2.*logQ 251 | logFQ = logF - logQ # (t,) 252 | S = T.exp(logFQ - logZ) # (t,) 253 | S2 = T.exp(logF2 - logbnd - logQ2) # (t,) 254 | target = T.mean(S * logF) 255 | 256 | # get grads wrt params_p 257 | dlogB_W, dlogB_bv, dlogB_bh = theano.grad(logbnd, [self.W, self.vbias, self.hbias]) 258 | dlogZ_W, dlogZ_bv, dlogZ_bh = theano.grad(target, [self.W, self.vbias, self.hbias], consider_constant=[S]) 259 | dE_W, dE_bv, dE_bh = theano.grad(logF_D, [self.W, self.vbias, self.hbias]) 260 | 261 | # get graps wrt params_q 262 | loga = - logbnd 263 | cv = T.exp(logZ + 0.5*loga) 264 | logF2a = logF2 + loga 265 | F2a = T.exp( logF2a ) 266 | Q2 = T.exp(logQ2) 267 | cv_adj = cv**2. 
* Q2 268 | Scv = (F2a - cv_adj)/Q2 269 | # S = (F2a)/Q2 270 | # S = FQ2a 271 | 272 | Dq = X - QQ # (n, K, s) 273 | Dq *= (-Scv).reshape((1,k,s)) 274 | dlogB_Phi = T.mean(Dq, axis=2) * self.pi.reshape(1,k) # (n, k) 275 | 276 | from theano.gradient import disconnected_grad as dg 277 | dlogB_target = T.mean(-dg(S2) * logQ) 278 | dlogB_Phi = theano.grad(dlogB_target, self.Phi, consider_constant=[self.XX]) 279 | 280 | # log-likelihood / bound gradients (combine the above) 281 | 282 | # dL_Phi = -0.5*a*dB_Phi 283 | dL_Phi = -0.5*dlogB_Phi 284 | dL_W = dE_W - dlogZ_W 285 | dL_bv = dE_bv - dlogZ_bv 286 | dL_bh = dE_bh - dlogZ_bh 287 | # dL_W = dE_W - 0.5 * dlogB_W 288 | # dL_bv = dE_bv - 0.5 * dlogB_bv 289 | # dL_bh = dE_bh - 0.5 * dlogB_bh 290 | 291 | dL_Theta = [dL_W, dL_bv, dL_bh] 292 | dlogZ_Theta = [dlogZ_W, dlogZ_bv, dlogZ_bh] 293 | dE_Theta = [dE_W, dE_bv, dE_bh] 294 | 295 | return dL_Theta, dlogZ_Theta, dE_Theta, dL_Phi 296 | 297 | def create_pseudoliklihood(self, X): 298 | """Stochastic approximation to the pseudo-likelihood""" 299 | # X = self.inputs[0] 300 | X = X.reshape((-1, self.n_visible)) 301 | 302 | # index of bit i in expression p(x_i | x_{\i}) 303 | bit_i_idx = self.bit_i_idx 304 | # bit_i_idx = theano.shared(value=0, name='bit_i_idx') 305 | 306 | # binarize the input image by rounding to nearest integer 307 | xi = T.round(X) 308 | 309 | # calculate free energy for the given bit configuration 310 | fe_xi = self.free_energy(xi) 311 | 312 | # flip bit x_i of matrix xi and preserve all other bits x_{\i} 313 | # Equivalent to xi[:,bit_i_idx] = 1-xi[:, bit_i_idx], but assigns 314 | # the result to xi_flip, instead of working in place on xi. 315 | xi_flip = T.set_subtensor(xi[:, bit_i_idx], 1 - xi[:, bit_i_idx]) 316 | 317 | # calculate free energy with bit flipped 318 | fe_xi_flip = self.free_energy(xi_flip) 319 | 320 | # equivalent to e^(-FE(x_i)) / (e^(-FE(x_i)) + e^(-FE(x_{\i}))) 321 | cost = T.mean(self.n_visible * T.log(T.nnet.sigmoid(fe_xi_flip-fe_xi))) 322 | 323 | return cost 324 | 325 | def create_updates(self, grads, params, alpha, opt_alg, opt_params): 326 | # def create_updates(self, alpha, opt_alg, opt_params): 327 | # scaled_grads = [grad * alpha for grad in grads] 328 | # scaled_grads = [T.clip(grad, -1., 1.) for grad in scaled_grads] 329 | dL_Theta, dE_Theta, dlogZ_Theta, dL_Phi = grads 330 | 331 | # create update for main parameters 332 | # TODO: Scale grads with alpha! 
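        # NOTE (added): the Theta update below runs Adam on the running-average
        # gradients self.grad_p_avg (filled in by create_avg_updates), while the
        # variational parameters Phi take plain SGD steps on -dL_Phi. The lr/b1/b2
        # read from opt_params on the next lines are currently shadowed by the
        # hard-coded values passed to lasagne.updates.adam / sgd.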
333 | lr = opt_params.get('lr', 1e-3) 334 | b1, b2 = opt_params.get('b1', 0.9), opt_params.get('b2', 0.999) 335 | uL_Theta = lasagne.updates.adam(self.grad_p_avg, self.params_p, learning_rate=1e-3, beta1=7e-1, beta2=1e-3) 336 | # learning_rate=lr, beta1=b1, beta2=b2) 337 | uL_Phi = lasagne.updates.sgd([-dL_Phi], [self.Phi], learning_rate=5e-1) # TODO: anneal lr 338 | 339 | # create updates for the running averages 340 | avg_updates = self.create_avg_updates(self.logZ, dlogZ_Theta, dL_Theta) 341 | 342 | # avg Theta updates 343 | avg_Theta_updates = OrderedDict() 344 | avg_Theta_updates[self.W_avg] = 0.95*self.W_avg + 0.05*self.W 345 | avg_Theta_updates[self.vbias_avg] = 0.95*self.vbias_avg + 0.05*self.vbias 346 | avg_Theta_updates[self.hbias_avg] = 0.95*self.hbias_avg + 0.05*self.hbias 347 | 348 | # increment bit_i_idx % number as part of updates 349 | uL_Phi[self.bit_i_idx] = (self.bit_i_idx + 1) % self.n_visible 350 | 351 | return uL_Theta, uL_Phi, avg_updates, avg_Theta_updates 352 | 353 | def create_avg_updates(self, logZ, dlogZ_Theta, dL_Theta): 354 | inner_it = self.inner_it 355 | avg_updates = OrderedDict() 356 | 357 | new_logZ_estimate = Tlogaddexp( 358 | self.logZ_avg + T.log((inner_it) / (inner_it+1)), 359 | logZ - T.log(inner_it+1.) 360 | ) 361 | # self.logZ_avg * ((inner_it) / (inner_it+1)) + logZ / (inner_it+1) 362 | 363 | avg_updates[inner_it] = inner_it + 1. 364 | avg_updates[self.logZ_avg] = new_logZ_estimate 365 | avg_updates[self.loga] = -2.*new_logZ_estimate # 1./(new_Z_estimate**2) 366 | 367 | for g, g_avg in zip(dL_Theta, self.grad_p_avg): 368 | avg_updates[g_avg] = g_avg * ((inner_it) / (inner_it+1)) - g / (inner_it+1) 369 | 370 | return avg_updates 371 | 372 | def reset_averages(self): 373 | self.inner_it.set_value(0.0) 374 | 375 | def free_energy(self, v_sample): 376 | """Function to compute the free energy""" 377 | wx_b = T.dot(v_sample, self.W) + self.hbias 378 | vbias_term = T.dot(v_sample, self.vbias) 379 | hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=1) 380 | return -hidden_term - vbias_term 381 | 382 | def pseudolikelihood(self, data): 383 | self.rbm.components_ = self.W.get_value().T 384 | self.rbm.intercept_visible_ = self.vbias.get_value() 385 | self.rbm.intercept_hidden_ = self.hbias.get_value() 386 | return self.rbm.score_samples(data).mean() 387 | 388 | def dump(self, fname): 389 | """Pickle weights to a file""" 390 | all_params = [self.W.get_value(), self.vbias.get_value(), self.hbias.get_value(), self.Phi.get_value()] 391 | with open(fname, 'w') as f: 392 | pickle.dump(all_params, f) 393 | 394 | def load(self, fname): 395 | """Load pickled network""" 396 | with open(fname) as f: 397 | params = pickle.load(f) 398 | self.load_params(params) 399 | 400 | def load_params(self, params): 401 | """Load a given set of parameters""" 402 | W, vbias, hbias, Phi = params 403 | self.W.set_value(W) 404 | self.vbias.set_value(vbias) 405 | self.hbias.set_value(hbias) 406 | self.Phi.set_value(Phi) 407 | 408 | def fit( 409 | self, X_train, Y_train, X_val, Y_val, 410 | n_epoch=10, n_batch=100, logname='run' 411 | ): 412 | """Train the model""" 413 | alpha = 1.0 # learning rate, which can be adjusted later 414 | n_data = len(X_train) 415 | n_superbatch = self.n_superbatch 416 | 417 | for epoch in range(n_epoch): 418 | # In each epoch, we do a full pass over the training data: 419 | train_batches, train_err, train_acc = 0, 0, 0 420 | start_time = time.time() 421 | 422 | # if epoch > 100: alpha = 0.5 423 | 424 | # iterate over superbatches to save time on GPU memory 
transfer 425 | for X_sb, Y_sb in self.iterate_superbatches( 426 | X_train, Y_train, n_superbatch, 427 | datatype='train', shuffle=True, 428 | ): 429 | for idx1, idx2 in iterate_minibatch_idx(len(X_sb), n_batch): 430 | # q steps 431 | err, acc = self.train_q(idx1, idx2) 432 | err, acc = self.train_q(idx1, idx2) 433 | err, acc = self.train_q(idx1, idx2) 434 | 435 | # p steps 436 | err, acc = self.train_p(idx1, idx2) 437 | self.reset_averages() 438 | 439 | # err, acc = self.train(idx1, idx2) 440 | # self.reset_averages() 441 | # collect metrics 442 | err = self.pseudolikelihood(X_sb[idx1:idx2].reshape(-1, 64)) 443 | train_batches += 1 444 | train_err += err 445 | train_acc += acc 446 | 447 | # print train_batches, err, acc 448 | 449 | if train_batches % 100 == 0: 450 | n_total = epoch * n_data + n_batch * train_batches 451 | metrics = [ 452 | n_total, train_err / train_batches, 453 | train_acc / train_batches, 454 | ] 455 | log_metrics(logname, metrics) 456 | 457 | print "Epoch {} of {} took {:.3f}s ({} minibatches)".format( 458 | epoch + 1, n_epoch, time.time() - start_time, train_batches) 459 | 460 | print " training:\t\t{:.6f}\t{:.6f}".format( 461 | train_err / train_batches, train_acc / train_batches) 462 | 463 | # reserve N of training data points to kick start hallucinations 464 | self.dump(logname + '.pkl') 465 | hallu_i = self.numpy_rng.randint(n_data - self.n_chain) 466 | self.hallu_set = np.asarray( 467 | X_train[hallu_i:hallu_i + self.n_chain], 468 | dtype=theano.config.floatX 469 | ) -------------------------------------------------------------------------------- /models/udadgm.py: -------------------------------------------------------------------------------- 1 | import time 2 | import pickle 3 | import numpy as np 4 | from collections import OrderedDict 5 | 6 | import theano 7 | import theano.tensor as T 8 | import lasagne 9 | from theano.gradient import disconnected_grad as dg 10 | 11 | from model import Model 12 | from layers.shape import RepeatLayer 13 | from layers import ( 14 | GaussianSampleLayer, 15 | BernoulliSampleLayer, 16 | GaussianMultiSampleLayer, 17 | ) 18 | from helpers import ( 19 | iterate_minibatch_idx, 20 | iterate_minibatches, 21 | evaluate, 22 | log_metrics 23 | ) 24 | from distributions import log_bernoulli, log_normal2 25 | 26 | from helpers import Tlogsumexp 27 | from theano.tensor.shared_randomstreams import RandomStreams 28 | 29 | from rbm import RBM 30 | from aux_variational_rbm import AuxiliaryVariationalRBM 31 | 32 | # ---------------------------------------------------------------------------- 33 | 34 | class UDADGM(Model): 35 | """Auxiliary Deep Generative Model (unsupervised version) with discrete z""" 36 | def __init__( 37 | self, n_dim, n_out, n_chan=1, n_superbatch=12800, model='bernoulli', 38 | opt_alg='adam', opt_params={'lr' : 1e-3, 'b1': 0.9, 'b2': 0.99} 39 | ): 40 | # save model that wil be created 41 | self.model = model 42 | self.n_sample = 3 # adjustable parameter, though 1 works best in practice 43 | Model.__init__(self, n_dim, n_chan, n_out, n_superbatch, opt_alg, opt_params) 44 | 45 | # random number generators 46 | self.numpy_rng = np.random.RandomState(1234) 47 | self.theano_rng = RandomStreams(self.numpy_rng.randint(2 ** 30)) 48 | 49 | (X, Y, idx1, idx2, S) = self.inputs 50 | loss, acc = self.objectives 51 | self.train = theano.function( 52 | [idx1, idx2, self.alpha], [loss, self.log_e, self.z], updates=self.updates, 53 | givens={X: self.train_set_x[idx1:idx2], Y: self.train_set_y[idx1:idx2]}, 54 | on_unused_input='warn' 55 | ) 
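        # NOTE (added): compared to DADGM above, the factorized Bernoulli prior on z
        # is replaced by an RBM prior: the ELBO uses log_e = -self.rbm.free_energy(z)
        # in place of log p(z), and fit() below interleaves gradient steps on this
        # model with q-steps (self.rbm.train_q0) and p-steps (self.rbm.train_p_batch)
        # on the AuxiliaryVariationalRBM that defines the prior.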
56 | llik = self.create_llik() 57 | self.loss = theano.function( 58 | [X, Y], [loss, llik], 59 | on_unused_input='warn', 60 | ) 61 | 62 | def create_model(self, X, Y, n_dim, n_out, n_chan=1): 63 | # params 64 | n_lat = 64 # latent stochastic variables 65 | n_aux = 10 # auxiliary variables 66 | n_hid = 500 # size of hidden layer in encoder/decoder 67 | n_sam = self.n_sample # number of monte-carlo samples 68 | n_hid_cv = 500 # size of hidden layer in control variate net 69 | n_out = n_dim * n_dim * n_chan # total dimensionality of ouput 70 | hid_nl = lasagne.nonlinearities.tanh 71 | relu_shift = lambda av: T.nnet.relu(av+10)-10 # for numerical stability 72 | 73 | # self.rbm = RBM(n_dim=int(np.sqrt(n_lat)), n_out=10, n_chan=1, opt_params={'nb':128}) 74 | self.rbm = AuxiliaryVariationalRBM(n_dim=int(np.sqrt(n_lat)), n_out=10, n_chan=1, opt_params={'nb':128*n_sam}) 75 | 76 | # create the encoder network 77 | # create q(a|x) 78 | l_qa_in = lasagne.layers.InputLayer( 79 | shape=(None, n_chan, n_dim, n_dim), 80 | input_var=X, 81 | ) 82 | l_qa_hid = lasagne.layers.DenseLayer( 83 | l_qa_in, num_units=n_hid, 84 | nonlinearity=hid_nl, 85 | ) 86 | l_qa_mu = lasagne.layers.DenseLayer( 87 | l_qa_in, num_units=n_aux, 88 | nonlinearity=None, 89 | ) 90 | l_qa_logsigma = lasagne.layers.DenseLayer( 91 | l_qa_in, num_units=n_aux, 92 | nonlinearity=relu_shift, 93 | ) 94 | # repeatedly sample 95 | l_qa_mu = lasagne.layers.ReshapeLayer( 96 | RepeatLayer(l_qa_mu, n_ax=1, n_rep=n_sam), 97 | shape=(-1, n_aux), 98 | ) 99 | l_qa_logsigma = lasagne.layers.ReshapeLayer( 100 | RepeatLayer(l_qa_logsigma, n_ax=1, n_rep=n_sam), 101 | shape=(-1, n_aux), 102 | ) 103 | l_qa = GaussianSampleLayer(l_qa_mu, l_qa_logsigma) 104 | 105 | # create q(z|a,x) 106 | l_qz_in = lasagne.layers.InputLayer((None, n_aux)) 107 | l_qz_hid1a = lasagne.layers.DenseLayer( 108 | l_qz_in, num_units=n_hid, 109 | nonlinearity=hid_nl, 110 | ) 111 | l_qz_hid1b = lasagne.layers.DenseLayer( 112 | l_qa_in, num_units=n_hid, 113 | nonlinearity=hid_nl, 114 | ) 115 | l_qz_hid1b = lasagne.layers.ReshapeLayer( 116 | RepeatLayer(l_qz_hid1b, n_ax=1, n_rep=n_sam), 117 | shape=(-1, n_hid), 118 | ) 119 | l_qz_hid2 = lasagne.layers.ElemwiseSumLayer([l_qz_hid1a, l_qz_hid1b]) 120 | l_qz_hid3 = lasagne.layers.DenseLayer( 121 | l_qz_hid2, num_units=n_hid, 122 | nonlinearity=hid_nl, 123 | ) 124 | l_qz_mu = lasagne.layers.DenseLayer( 125 | l_qz_hid3, num_units=n_lat, 126 | nonlinearity=T.nnet.sigmoid, 127 | ) 128 | l_qz = BernoulliSampleLayer(l_qz_mu) 129 | l_qz_logsigma = None 130 | 131 | # create the decoder network 132 | # create p(x|z) 133 | l_px_in = lasagne.layers.InputLayer((None, n_lat)) 134 | l_px_hid = lasagne.layers.DenseLayer( 135 | l_px_in, num_units=n_hid, 136 | W=lasagne.init.GlorotUniform(), 137 | nonlinearity=hid_nl, 138 | ) 139 | l_px_mu, l_px_logsigma = None, None 140 | 141 | if self.model == 'bernoulli': 142 | l_px_mu = lasagne.layers.DenseLayer( 143 | l_px_hid, num_units=n_out, 144 | nonlinearity=lasagne.nonlinearities.sigmoid, 145 | ) 146 | elif self.model == 'gaussian': 147 | l_px_mu = lasagne.layers.DenseLayer( 148 | l_px_hid, num_units=n_out, 149 | nonlinearity=None, 150 | ) 151 | l_px_logsigma = lasagne.layers.DenseLayer( 152 | l_px_hid, num_units=n_out, 153 | nonlinearity=relu_shift, 154 | ) 155 | 156 | # create p(a|z) 157 | l_pa_hid = lasagne.layers.DenseLayer( 158 | l_px_in, num_units=n_hid, 159 | nonlinearity=hid_nl, 160 | ) 161 | l_pa_mu = lasagne.layers.DenseLayer( 162 | l_pa_hid, num_units=n_aux, 163 | nonlinearity=None, 164 | ) 165 | 
l_pa_logsigma = lasagne.layers.DenseLayer( 166 | l_pa_hid, num_units=n_aux, 167 | W=lasagne.init.GlorotNormal(), 168 | b=lasagne.init.Normal(1e-3), 169 | nonlinearity=relu_shift, 170 | ) 171 | 172 | # create control variate (baseline) network 173 | l_cv_in = lasagne.layers.InputLayer( 174 | shape=(None, n_chan, n_dim, n_dim), 175 | input_var=X, 176 | ) 177 | l_cv_hid = lasagne.layers.DenseLayer( 178 | l_cv_in, num_units=n_hid_cv, 179 | nonlinearity=hid_nl, 180 | ) 181 | l_cv = lasagne.layers.DenseLayer( 182 | l_cv_hid, num_units=1, 183 | nonlinearity=None, 184 | ) 185 | 186 | # create variables for centering signal 187 | c = theano.shared(np.zeros((1,1), dtype=np.float64), broadcastable=(True,True)) 188 | v = theano.shared(np.zeros((1,1), dtype=np.float64), broadcastable=(True,True)) 189 | 190 | # store certain input layers for downstream (quick hack) 191 | self.input_layers = (l_qa_in, l_qz_in, l_px_in, l_cv_in) 192 | self.n_lat = n_lat 193 | self.n_lat2 = int(np.sqrt(n_lat)) 194 | self.n_hid = n_hid 195 | 196 | return l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \ 197 | l_qa_mu, l_qa_logsigma, l_qz_mu, l_qz_logsigma, \ 198 | l_qa, l_qz, l_cv, c, v 199 | 200 | def _create_components(self, deterministic=False): 201 | # load network input 202 | X = self.inputs[0] 203 | x = X.flatten(2) 204 | 205 | # duplicate entries to take into account multiple mc samples 206 | n_sam = self.n_sample 207 | n_out = x.shape[1] 208 | x = x.dimshuffle(0,'x',1).repeat(n_sam, axis=1).reshape((-1, n_out)) 209 | 210 | # load networks 211 | l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \ 212 | l_qa_mu, l_qa_logsigma, l_qz_mu, l_qz_logsigma, l_qa, l_qz, _, _, _ = self.network 213 | l_qa_in, l_qz_in, l_px_in, l_cv_in = self.input_layers 214 | 215 | # load network output 216 | qa_mu, qa_logsigma, a = lasagne.layers.get_output( 217 | [l_qa_mu, l_qa_logsigma, l_qa], 218 | deterministic=deterministic, 219 | ) 220 | qz_mu, z = lasagne.layers.get_output( 221 | [l_qz_mu, l_qz], 222 | {l_qz_in: a, l_qa_in: X}, 223 | deterministic=deterministic, 224 | ) 225 | pa_mu, pa_logsigma = lasagne.layers.get_output( 226 | [l_pa_mu, l_pa_logsigma], {l_px_in: z}, 227 | deterministic=deterministic, 228 | ) 229 | 230 | if self.model == 'bernoulli': 231 | px_mu = lasagne.layers.get_output( 232 | l_px_mu, {l_px_in: z}, deterministic=deterministic) 233 | elif self.model == 'gaussian': 234 | px_mu, px_logsigma = lasagne.layers.get_output( 235 | [l_px_mu, l_px_logsigma], {l_px_in: z}, 236 | deterministic=deterministic, 237 | ) 238 | 239 | # entropy term 240 | log_qa_given_x = log_normal2(a, qa_mu, qa_logsigma).sum(axis=1) 241 | log_qz_given_x = log_bernoulli(z, qz_mu).sum(axis=1) 242 | log_qz_given_x_dgz = log_bernoulli(dg(z), qz_mu).sum(axis=1) 243 | log_qza_given_x = log_qz_given_x + log_qa_given_x 244 | 245 | # log-probability term 246 | z_prior = T.ones_like(z)*np.float32(0.5) 247 | log_pz = log_bernoulli(z, z_prior).sum(axis=1) 248 | log_e = -self.rbm.free_energy(z.reshape((128*n_sam,self.n_lat))) 249 | log_px_given_z = log_bernoulli(x, px_mu).sum(axis=1) 250 | log_pa_given_z = log_normal2(a, pa_mu, pa_logsigma).sum(axis=1) 251 | 252 | log_pxz = log_pa_given_z + log_px_given_z + log_e 253 | 254 | # save them for later 255 | if deterministic == False: 256 | self.log_pxz = log_pxz 257 | self.log_px_given_z = log_px_given_z 258 | self.log_pz = log_pz 259 | self.log_qza_given_x = log_qza_given_x 260 | self.log_qa_given_x = log_qa_given_x 261 | self.log_qz_given_x = log_qz_given_x 262 | self.log_qz_given_x_dgz = log_qz_given_x_dgz 263 | 
self.log_e = log_e.mean() 264 | self.z = z 265 | 266 | # return log_paxz, log_qza_given_x 267 | return log_pxz, log_qza_given_x 268 | 269 | def create_objectives(self, deterministic=False): 270 | # load probabilities 271 | log_paxz, log_qza_given_x = self._create_components(deterministic=deterministic) 272 | 273 | # compute the evidence lower bound 274 | elbo = T.mean(log_paxz - log_qza_given_x) 275 | log_qa_given_x = self.log_qa_given_x 276 | 277 | # we don't use a spearate accuracy metric right now 278 | return -elbo, T.mean(log_qa_given_x) 279 | 280 | def create_gradients(self, loss, deterministic=False): 281 | # load networks 282 | l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \ 283 | l_qa_mu, l_qa_logsigma, l_qz_mu, l_qz_logsigma, l_qa, l_qz, l_cv, c, v = self.network 284 | 285 | # load params 286 | p_params = lasagne.layers.get_all_params( 287 | [l_px_mu, l_pa_mu, l_pa_logsigma], trainable=True) 288 | qa_params = lasagne.layers.get_all_params(l_qa_mu, trainable=True) 289 | qz_params = lasagne.layers.get_all_params(l_qz, trainable=True) 290 | cv_params = lasagne.layers.get_all_params(l_cv, trainable=True) 291 | 292 | # load neural net outputs (probabilities have been precomputed) 293 | log_pxz, log_px_given_z, log_pz = self.log_pxz, self.log_px_given_z, self.log_pz 294 | log_qza_given_x = self.log_qza_given_x 295 | log_qz_given_x = self.log_qz_given_x 296 | log_qz_given_x_dgz = self.log_qz_given_x_dgz 297 | cv = T.addbroadcast(lasagne.layers.get_output(l_cv),1) 298 | 299 | # compute learning signals 300 | l0 = log_px_given_z + log_pz - log_qz_given_x - cv # NOTE: this disn't have q(a) 301 | l_avg, l_var = l0.mean(), l0.var() 302 | c_new = 0.8*c + 0.2*l_avg 303 | v_new = 0.8*v + 0.2*l_var 304 | l = (l0 - c_new) / T.maximum(1, T.sqrt(v_new)) 305 | l_target = (l0 - c_new) / T.maximum(1, T.sqrt(v_new)) 306 | 307 | # compute grad wrt p 308 | p_grads = T.grad(-log_pxz.mean(), p_params) 309 | 310 | # compute grad wrt q_a 311 | elbo = T.mean(log_pxz - log_qza_given_x) 312 | qa_grads = T.grad(-elbo, qa_params) 313 | 314 | # compute grad wrt q_z 315 | qz_target = T.mean(dg(l_target) * log_qz_given_x_dgz) 316 | qz_grads = T.grad(-0.2*qz_target, qz_params) # 5x slower rate for q 317 | 318 | # compute grad of cv net 319 | cv_target = T.mean(l0**2) 320 | cv_grads = [0.2*g for g in T.grad(cv_target, cv_params)] 321 | 322 | # combine and clip gradients 323 | clip_grad = 1 324 | max_norm = 5 325 | grads = p_grads + qa_grads + qz_grads + cv_grads 326 | mgrads = lasagne.updates.total_norm_constraint(grads, max_norm=max_norm) 327 | cgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads] 328 | 329 | return cgrads 330 | 331 | def gen_samples(self, deterministic=False): 332 | s = self.inputs[-1] 333 | # put it through the decoder 334 | _, _, l_px_in, _ = self.input_layers 335 | l_px_mu = self.network[0] 336 | px_mu = lasagne.layers.get_output(l_px_mu, {l_px_in : s}) 337 | 338 | return px_mu 339 | 340 | def get_params(self): 341 | # load networks 342 | l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \ 343 | l_qa_mu, l_qa_logsigma, l_qz_mu, l_qz_logsigma, \ 344 | l_qa, l_qz, l_cv, c, v = self.network 345 | 346 | if self.model == 'gaussian': 347 | raise NotImplementedError('The code below needs to implement Gaussians') 348 | 349 | # load params 350 | p_params = lasagne.layers.get_all_params( 351 | [l_px_mu, l_pa_mu, l_pa_logsigma], trainable=True) 352 | qa_params = lasagne.layers.get_all_params(l_qa_mu, trainable=True) 353 | qz_params = lasagne.layers.get_all_params(l_qz, trainable=True) 354 | cv_params 
= lasagne.layers.get_all_params(l_cv, trainable=True) 355 | 356 | return p_params + qa_params + qz_params + cv_params 357 | 358 | def create_updates(self, grads, params, alpha, opt_alg, opt_params): 359 | # call super-class to generate SGD/ADAM updates 360 | grad_updates = Model.create_updates(self, grads, params, alpha, opt_alg, opt_params) 361 | 362 | # create updates for centering signal 363 | # load neural net outputs (probabilities have been precomputed) 364 | l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \ 365 | l_qa_mu, l_qa_logsigma, l_qz_mu, l_qz_logsigma, l_qa, l_qz, l_cv, c, v = self.network 366 | 367 | # load neural net outputs (probabilities have been precomputed) 368 | log_pxz, log_px_given_z, log_pz = self.log_pxz, self.log_px_given_z, self.log_pz 369 | log_qz_given_x = self.log_qz_given_x 370 | cv = T.addbroadcast(lasagne.layers.get_output(l_cv),1) 371 | 372 | # compute learning signals 373 | l = log_px_given_z + log_pz - log_qz_given_x - cv 374 | l_avg, l_var = l.mean(), l.var() 375 | c_new = 0.8*c + 0.2*l_avg 376 | v_new = 0.8*v + 0.2*l_var 377 | 378 | # compute update for centering signal 379 | cv_updates = {c: c_new, v: v_new} 380 | 381 | return OrderedDict(grad_updates.items() + cv_updates.items()) 382 | 383 | def hallucinate(self): 384 | """Generate new samples by passing noise into the decoder""" 385 | # load network params 386 | size, n_lat, n_dim = 100, self.n_lat, self.n_dim 387 | img_size = int(np.sqrt(size)) 388 | 389 | # # generate noisy inputs 390 | # noise = self.gen_noise(size, n_lat).astype('float32') 391 | 392 | # sample noise from RBM 393 | noise = self.rbm.sample() 394 | 395 | p_mu = self.dream(noise) 396 | if p_mu is None: 397 | return None 398 | p_mu = p_mu.reshape((img_size, img_size, n_dim, n_dim)) 399 | # split into img_size (1,img_size,n_dim,n_dim) images, 400 | # concat along columns -> 1,img_size,n_dim,n_dim*img_size 401 | p_mu = np.concatenate(np.split(p_mu, img_size, axis=0), axis=3) 402 | # split into img_size (1,1,n_dim,n_dim*img_size) images, 403 | # concat along rows -> 1,1,n_dim*img_size,n_dim*img_size 404 | p_mu = np.concatenate(np.split(p_mu, img_size, axis=1), axis=2) 405 | return np.squeeze(p_mu) 406 | 407 | def create_llik(self): 408 | # load network input 409 | X = self.inputs[0] 410 | x = X.flatten(2) 411 | 412 | # duplicate entries to take into account multiple mc samples 413 | n_sam = self.n_sample 414 | n_out = x.shape[1] 415 | x = x.dimshuffle(0,'x',1).repeat(n_sam, axis=1).reshape((-1, n_out)) 416 | 417 | # load networks 418 | l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \ 419 | l_qa_mu, l_qa_logsigma, l_qz_mu, l_qz_logsigma, l_qa, l_qz, _, _, _ = self.network 420 | l_qa_in, l_qz_in, l_px_in, l_cv_in = self.input_layers 421 | 422 | # load network output 423 | qa_mu, qa_logsigma, a = lasagne.layers.get_output( 424 | [l_qa_mu, l_qa_logsigma, l_qa], 425 | ) 426 | qz_mu, z = lasagne.layers.get_output( 427 | [l_qz_mu, l_qz], 428 | {l_qz_in: a, l_qa_in: X}, 429 | ) 430 | pa_mu, pa_logsigma = lasagne.layers.get_output( 431 | [l_pa_mu, l_pa_logsigma], {l_px_in: z}, 432 | ) 433 | 434 | px_mu = lasagne.layers.get_output(l_px_mu, {l_px_in: z}) 435 | 436 | # entropy term 437 | log_qa_given_x = log_normal2(a, qa_mu, qa_logsigma).sum(axis=1) 438 | log_qz_given_x = log_bernoulli(z, qz_mu).sum(axis=1) 439 | log_qz_given_x_dgz = log_bernoulli(dg(z), qz_mu).sum(axis=1) 440 | log_qza_given_x = log_qz_given_x + log_qa_given_x 441 | 442 | # log-probability term 443 | # z_prior = T.ones_like(z)*np.float32(0.5) 444 | # log_pz = log_bernoulli(z, 
z_prior).sum(axis=1) 445 | log_e = -self.rbm.free_energy(z.reshape((128*n_sam,self.n_lat))) 446 | log_px_given_z = log_bernoulli(x, px_mu).sum(axis=1) 447 | log_pa_given_z = log_normal2(a, pa_mu, pa_logsigma).sum(axis=1) 448 | 449 | t = log_pa_given_z + log_px_given_z + log_e - log_qz_given_x - log_qa_given_x 450 | t = t.reshape([128, n_sam]) 451 | 452 | # compute loss 453 | llik = Tlogsumexp( t, axis=1) # (n_bat,) 454 | 455 | return T.mean(llik) 456 | 457 | def fit( 458 | self, X_train, Y_train, X_val, Y_val, 459 | n_epoch=10, n_batch=100, logname='run', 460 | ): 461 | """Train the model""" 462 | alpha = 1.0 # learning rate, which can be adjusted later 463 | n_data = len(X_train) 464 | n_superbatch = self.n_superbatch 465 | 466 | # print '...', self.rbm.logZ_exact() 467 | # print '...', self.rbm.logZ_exact(marg='h') 468 | 469 | for epoch in range(n_epoch): 470 | # In each epoch, we do a full pass over the training data: 471 | train_batches, train_err, train_acc = 0, 0, 0 472 | start_time = time.time() 473 | 474 | # iterate over superbatches to save time on GPU memory transfer 475 | for X_sb, Y_sb in self.iterate_superbatches( 476 | X_train, Y_train, n_superbatch, 477 | datatype='train', shuffle=True, 478 | ): 479 | for idx1, idx2 in iterate_minibatch_idx(len(X_sb), n_batch): 480 | # train model 481 | self.rbm.reset_averages() 482 | err, acc, z = self.train(idx1, idx2, alpha) 483 | 484 | # train rbm 485 | z_rbm = z.reshape((128*self.n_sample,1,8,8)) 486 | # self.rbm.train_batch(z.reshape((128,1,8,8)), np.zeros((n_batch,), dtype='int32'), alpha) 487 | 488 | # q steps 489 | for i in range(2): self.rbm.train_q0(z_rbm) 490 | 491 | # p steps 492 | self.rbm.train_p_batch(z_rbm, alpha) 493 | 494 | # collect metrics 495 | # print err, acc 496 | train_batches += 1 497 | train_err += err 498 | train_acc += acc 499 | if train_batches % 100 == 0: 500 | n_total = epoch * n_data + n_batch * train_batches 501 | metrics = [ 502 | n_total, 503 | train_err / train_batches, 504 | train_acc / train_batches, 505 | ] 506 | log_metrics(logname, metrics) 507 | 508 | print "Epoch {} of {} took {:.3f}s ({} minibatches)".format( 509 | epoch + 1, n_epoch, time.time() - start_time, train_batches) 510 | 511 | # make a full pass over the training data and record metrics: 512 | train_err, train_acc = evaluate( 513 | self.loss, X_train, Y_train, batchsize=128, 514 | ) 515 | val_err, val_acc = evaluate( 516 | self.loss, X_val, Y_val, batchsize=128, 517 | ) 518 | 519 | print " training:\t\t{:.6f}\t{:.6f}".format( 520 | train_err, train_acc 521 | ) 522 | print " validation:\t\t{:.6f}\t{:.6f}".format( 523 | val_err, val_acc 524 | ) 525 | 526 | metrics = [epoch, train_err, train_acc, val_err, val_acc] 527 | log_metrics(logname + '.val', metrics) 528 | 529 | # reserve N of training data points to kick start hallucinations 530 | self.rbm.hallu_set = np.asarray( 531 | z[:100,:].reshape((100,1,8,8)), 532 | dtype=theano.config.floatX 533 | ) 534 | -------------------------------------------------------------------------------- /models/aux_variational_rbm.py: -------------------------------------------------------------------------------- 1 | import pdb 2 | import time 3 | import pickle 4 | import numpy as np 5 | import theano 6 | import lasagne 7 | import theano.tensor as T 8 | from theano.tensor.shared_randomstreams import RandomStreams 9 | from collections import OrderedDict 10 | from lasagne.layers import batch_norm 11 | 12 | from model import Model 13 | from abstractrbm import AbstractRBM 14 | from helpers import 
iterate_minibatch_idx, evaluate, log_metrics 15 | from helpers import Tlogsumexp, Tlogaddexp 16 | from layers import BernoulliSampleLayer, Deconv2DLayer 17 | from distributions import log_bernoulli, log_normal, log_normal2 18 | 19 | from theano.compile.nanguardmode import NanGuardMode 20 | 21 | create_zmat = lambda x, y: np.zeros((x, y)).astype(theano.config.floatX) 22 | create_vec = lambda y: np.zeros(y,).astype(theano.config.floatX) 23 | create_sca = lambda x: (np.cast[theano.config.floatX](x)).astype(theano.config.floatX) 24 | 25 | class AuxiliaryVariationalRBM(AbstractRBM): 26 | """Restricted Boltzmann Machine with VI""" 27 | def __init__( 28 | self, n_dim, n_out, n_chan=1, n_superbatch=12800, opt_alg='adam', 29 | opt_params={'lr': 1e-3, 'b1': 0.9, 'b2': 0.99} 30 | ): 31 | """RBM constructor. 32 | Defines the parameters of the model along with 33 | basic operations for inferring hidden from visible (and vice-versa), 34 | as well as for performing CD updates. 35 | """ 36 | self.numpy_rng = np.random.RandomState(1234) 37 | self.theano_rng = RandomStreams(self.numpy_rng.randint(2 ** 30)) 38 | self.create_mat = lambda x, y: self.numpy_rng.normal(0, 0.01, (x, y)).astype(theano.config.floatX) 39 | 40 | # save config 41 | n_batch = opt_params.get('nb') 42 | self.n_hidden = 8 43 | self.n_visible = n_chan*n_dim*n_dim # size of visible layer 44 | self.n_batch = n_batch 45 | self.n_mc = 300 # num of monte carlo samples from each MoB component 46 | 47 | self.n_dim = n_dim 48 | self.n_out = n_out 49 | self.n_superbatch = n_superbatch 50 | self.alg = opt_alg 51 | 52 | # set up general RBM methods 53 | AbstractRBM.__init__(self, n_dim, n_chan, n_out, n_superbatch, opt_alg, opt_params) 54 | 55 | # create updates 56 | alpha = T.scalar(dtype=theano.config.floatX) # learning rate 57 | 58 | # save config 59 | self.n_class = 2 60 | self.n_dim = n_dim 61 | self.n_out = n_out 62 | 63 | self.marginalize = 'h' 64 | self.n_samples = self.n_mc 65 | self.n_latent = self.n_visible if self.marginalize == 'h' else self.n_hidden 66 | 67 | # create input variables 68 | D, idx1, idx2 = self.create_inputs() 69 | 70 | # create model 71 | self.network = self.create_model() 72 | 73 | # create objectives 74 | loglik, plik = self.create_objectives(D) 75 | 76 | # create gradients 77 | dL_Theta, dE_Theta, dlogZ_Theta, dL_qx, dL_pa = self.create_gradients() 78 | grads = dL_Theta, dE_Theta, dlogZ_Theta, dL_qx, dL_pa 79 | 80 | # create updates 81 | uL_Theta, uL_Phi, avg_updates, avg_Theta_updates \ 82 | = self.create_updates(grads, None, alpha, opt_alg, opt_params) 83 | 84 | # logF_avg, Z_avg = self.create_llik_estimate(D) 85 | 86 | mode = NanGuardMode(nan_is_error=True, inf_is_error=False, big_is_error=False) 87 | mode = None 88 | 89 | common_update1 = OrderedDict(avg_updates.items() + uL_Phi.items()) 90 | self.train_q = theano.function([idx1, idx2], [loglik, self.logbnd], 91 | updates=common_update1, mode=mode, 92 | givens={D: self.train_set_x[idx1:idx2]}) 93 | self.train_q0 = theano.function([D], self.logbnd, updates=common_update1, mode=mode) 94 | 95 | common_update2 = OrderedDict(avg_Theta_updates.items() + uL_Theta.items()) 96 | self.train_p = theano.function([idx1, idx2, alpha], [loglik, self.logbnd], 97 | updates=common_update2, mode=mode, on_unused_input='warn', 98 | givens={D: self.train_set_x[idx1:idx2]}) 99 | self.train_p_batch = theano.function([D, alpha], [loglik, self.logbnd], 100 | updates=common_update2, mode=mode, on_unused_input='warn') 101 | # self.llik = theano.function([D], logF_avg - T.log(Z_avg), 
mode=mode) 102 | 103 | def create_inputs(self): 104 | # allocate symbolic variables for the data 105 | idx1, idx2 = T.lscalar(), T.lscalar() 106 | D = T.tensor4('D', dtype=theano.config.floatX) # data matrixX) # learning rate 107 | # self.Z = T.tensor2(dtype=theano.config.floatX) 108 | return D, idx1, idx2 109 | 110 | def _free_energy(self, X, marginalize='h', avg=False): 111 | bv_vec = self.vbias.reshape((self.n_visible,1)) 112 | bh_vec = self.hbias.reshape((self.n_hidden,1)) 113 | 114 | W = self.W_avg if avg else self.W 115 | W = W.T 116 | 117 | if marginalize == 'h': 118 | logF = T.dot(bv_vec.T, X) \ 119 | + T.sum(T.log(1. + T.exp(bh_vec + T.dot(W, X))), axis=0) 120 | elif marginalize == 'v': 121 | logF = T.dot(bh_vec.T, X) \ 122 | + T.sum(T.log(1. + T.exp(bv_vec + T.dot(W.T, X))), axis=0) 123 | else: 124 | raise ValueError('invalid argument') 125 | 126 | return logF 127 | 128 | @property 129 | def Z(self): 130 | return np.exp(self.logZ_avg.get_value()) 131 | 132 | def create_model(self): 133 | # create the RBM 134 | self.W = theano.shared(self.create_mat(self.n_visible, self.n_hidden)) 135 | self.vbias = theano.shared(create_vec(self.n_visible)) 136 | self.hbias = theano.shared(create_vec(self.n_hidden)) 137 | self.params_p = [self.W, self.vbias, self.hbias] 138 | 139 | # running averages 140 | self.inner_it = theano.shared(create_sca(0.0)) 141 | self.loga = theano.shared(create_sca(0.0)) 142 | self.logZ_avg = theano.shared(create_sca(0.0)) 143 | self.grad_logZ_avg = [ 144 | theano.shared(create_zmat(self.n_visible, self.n_hidden)), 145 | theano.shared(create_vec(self.n_visible)), 146 | theano.shared(create_vec(self.n_hidden)) 147 | ] 148 | self.grad_p_avg = [ 149 | theano.shared(create_zmat(self.n_visible, self.n_hidden)), 150 | theano.shared(create_vec(self.n_visible)), 151 | theano.shared(create_vec(self.n_hidden)) 152 | ] 153 | self.W_avg = theano.shared(self.W.get_value().copy()) 154 | self.vbias_avg = theano.shared(self.vbias.get_value().copy()) 155 | self.hbias_avg = theano.shared(self.hbias.get_value().copy()) 156 | 157 | # for pseudo-likelihood 158 | self.bit_i_idx = theano.shared(value=0, name='bit_i_idx') 159 | 160 | # create approximating distribution 161 | n_aux = 10 # dimensionality of latent auxiliary space 162 | self.A = self.theano_rng.normal(size=(self.n_mc, n_aux)) 163 | 164 | # create q(x|a) 165 | l_qx_in = lasagne.layers.InputLayer((None, n_aux)) 166 | 167 | # digits dataset 168 | if self.n_visible == 64: 169 | # l_qx_hid = lasagne.layers.DenseLayer(l_qx_in, num_units=64, 170 | # nonlinearity=lasagne.nonlinearities.rectify) 171 | l_qx_hid = lasagne.layers.DenseLayer(l_qx_in, num_units=32, 172 | nonlinearity=lasagne.nonlinearities.rectify) 173 | l_qx = lasagne.layers.DenseLayer(l_qx_hid, num_units=self.n_latent, 174 | nonlinearity=lasagne.nonlinearities.sigmoid) 175 | l_qx_samp = BernoulliSampleLayer(l_qx) 176 | 177 | # l_g_hid1 = lasagne.layers.DenseLayer(l_qx_in, 8*4*4) 178 | # l_g_hid2 = lasagne.layers.ReshapeLayer(l_g_hid1, ([0], 8, 4, 4)) 179 | # # l_g_dc1 = Deconv2DLayer(l_g_hid2, 8, 3, stride=2, pad=1) 180 | # l_g = Deconv2DLayer(l_g_hid2, 1, 3, stride=2, pad=1, 181 | # nonlinearity=lasagne.nonlinearities.sigmoid) 182 | # l_qx = lasagne.layers.flatten(l_g) 183 | # l_qx_samp = BernoulliSampleLayer(l_qx) 184 | 185 | # mnist dataset 186 | elif self.n_visible == 784: 187 | # dense 188 | l_qx_hid = lasagne.layers.DenseLayer(l_qx_in, num_units=2048, 189 | nonlinearity=lasagne.nonlinearities.rectify) 190 | # l_qx_hid = batch_norm(lasagne.layers.DenseLayer(l_qx_in, 
num_units=128, 191 | # nonlinearity=lasagne.nonlinearities.rectify)) 192 | # l_qx_hid = batch_norm(lasagne.layers.DenseLayer(l_qx_hid, num_units=256, 193 | # nonlinearity=lasagne.nonlinearities.rectify)) 194 | l_qx = lasagne.layers.DenseLayer(l_qx_hid, num_units=self.n_latent, 195 | nonlinearity=lasagne.nonlinearities.sigmoid) 196 | l_qx_samp = BernoulliSampleLayer(l_qx) 197 | 198 | # # deconv 199 | # l_g_hid1 = batch_norm(lasagne.layers.DenseLayer(l_qx_in, 64*7*7)) 200 | # l_g_hid2 = lasagne.layers.ReshapeLayer(l_g_hid1, ([0], 64, 7, 7)) 201 | # # l_g_dc = Deconv2DLayer(l_g_hid2, 1, 5, stride=4, pad=2, 202 | # # nonlinearity=lasagne.nonlinearities.sigmoid) 203 | 204 | # # l_g_dc1 = Deconv2DLayer(l_g_hid2, 8, 3, stride=2, pad=1) 205 | # l_g_dc1 = batch_norm(Deconv2DLayer(l_g_hid2, 32, 5, stride=2, pad=2, 206 | # nonlinearity=lasagne.nonlinearities.rectify)) 207 | # l_g_dc = Deconv2DLayer(l_g_dc1, 1, 5, stride=2, pad=2, 208 | # nonlinearity=lasagne.nonlinearities.sigmoid) 209 | 210 | # l_qx = lasagne.layers.flatten(l_g_dc) 211 | # l_qx_samp = BernoulliSampleLayer(l_qx) 212 | 213 | # create p(a|x) 214 | relu_shift = lambda av: T.nnet.relu(av+10)-10 # for numerical stability 215 | l_pa_in = lasagne.layers.InputLayer((None, self.n_latent)) 216 | l_pa_hid = lasagne.layers.DenseLayer(l_pa_in, num_units=32, 217 | nonlinearity=lasagne.nonlinearities.rectify) 218 | l_pa_mu = lasagne.layers.DenseLayer(l_pa_hid, num_units=n_aux, 219 | nonlinearity=None) 220 | l_pa_logsigma = lasagne.layers.DenseLayer(l_pa_hid, num_units=n_aux, 221 | nonlinearity=relu_shift) 222 | 223 | return (l_qx, l_qx_samp, l_pa_mu, l_pa_logsigma) 224 | 225 | def _create_components(self, D): 226 | # collect samples 227 | (l_qx, l_qx_samp, l_pa_mu, l_pa_logsigma) = self.network 228 | a = self.A 229 | qx, x = lasagne.layers.get_output([l_qx, l_qx_samp], a) 230 | pa_mu, pa_logsigma = lasagne.layers.get_output([l_pa_mu, l_pa_logsigma], x) 231 | 232 | # compute logQ 233 | logQa = T.sum(log_normal(a, 0., 1.), axis=1) 234 | logQx_given_a = T.sum(log_bernoulli(x, qx), axis=1) 235 | logQ = logQa + logQx_given_a 236 | 237 | # compute energies of the samples, dim=(1, n_tot_samples) 238 | logFx = self._free_energy(x.T, marginalize=self.marginalize) 239 | logpa = T.sum(log_normal2(a, pa_mu, pa_logsigma), axis=1) 240 | # logF = logFx + logpa 241 | 242 | # free energy of the data 243 | D = D.reshape((-1, self.n_visible)).T 244 | logF_D = self._free_energy(D) 245 | 246 | self._components = (logFx, logpa, logQ, logF_D) 247 | 248 | def create_objectives(self, D, deterministic=False): 249 | self._create_components(D) 250 | (logFx, logpa, logQ, logF_D_vec) = self._components 251 | t = self.n_mc 252 | logFxa = logFx + logpa 253 | 254 | # set up bound 255 | logFQ = logFx - logQ # (t,) 256 | logFQ2 = 2.*logFxa - 2.*logQ # (t,) 257 | logZ = Tlogsumexp(logFQ.flatten(), axis=0) - T.log(t) 258 | logbnd = Tlogsumexp(logFQ2.flatten(), axis=0) - T.log(t) 259 | 260 | logF_D = logF_D_vec.mean() 261 | 262 | loglik = logF_D - 0.5*logZ 263 | loglikbnd = logF_D - 0.5*logbnd 264 | 265 | self.logbnd = logbnd 266 | self.logZ = logZ 267 | 268 | # set up pseudolikelihood 269 | plik = self.create_pseudoliklihood(D) 270 | 271 | return loglik, plik 272 | 273 | # def get_params(self): 274 | # return [self.W, self.vbias, self.hbias, self.Phi] 275 | 276 | def get_params_p(self): 277 | return [self.W, self.vbias, self.hbias] 278 | 279 | def get_params_qx(self): 280 | (l_qx, l_qx_samp, l_pa_mu, l_pa_logsigma) = self.network 281 | return lasagne.layers.get_all_params(l_qx, 
trainable=True) 282 | 283 | def get_params_pa(self): 284 | (l_qx, l_qx_samp, l_pa_mu, l_pa_logsigma) = self.network 285 | return lasagne.layers.get_all_params(l_pa_mu, trainable=True) 286 | 287 | def create_gradients(self): 288 | (logFx, logpa, logQ, logF_D_vec) = self._components 289 | logFxa = logFx + logpa 290 | logbnd, logZ = self.logbnd, self.logZ 291 | logF_D = logF_D_vec.mean() 292 | 293 | logQ2 = 2.*logQ 294 | logFQ = logFx - logQ # (t,) 295 | S = T.exp(logFQ - logZ) # (t,) 296 | S2 = T.exp(2*logFxa - logbnd - logQ2) # (t,) 297 | target = T.mean(S * logFx) 298 | 299 | # get grads wrt params_p 300 | dlogB_W, dlogB_bv, dlogB_bh = theano.grad(logbnd, [self.W, self.vbias, self.hbias]) 301 | dlogZ_W, dlogZ_bv, dlogZ_bh = theano.grad(target, [self.W, self.vbias, self.hbias], consider_constant=[S]) 302 | dE_W, dE_bv, dE_bh = theano.grad(logF_D, [self.W, self.vbias, self.hbias]) 303 | 304 | # get grads wrt params_qx 305 | from theano.gradient import disconnected_grad as dg 306 | dlogB_target = T.mean(-dg(S2) * logQ) 307 | dlogB_qx = theano.grad(dlogB_target, self.get_params_qx()) 308 | 309 | # get grads wrt params_pa 310 | dlogB_pa = theano.grad(T.mean(S2), self.get_params_pa(), consider_constant=[logbnd]) 311 | 312 | # log-likelihood / bound gradients (combine the above) 313 | 314 | dL_qx = [-0.5*g for g in dlogB_qx] 315 | dL_pa = [-0.5*g for g in dlogB_pa] 316 | dL_W = dE_W - dlogZ_W 317 | dL_bv = dE_bv - dlogZ_bv 318 | dL_bh = dE_bh - dlogZ_bh 319 | # dL_W = dE_W - 0.5 * dlogB_W 320 | # dL_bv = dE_bv - 0.5 * dlogB_bv 321 | # dL_bh = dE_bh - 0.5 * dlogB_bh 322 | 323 | dL_Theta = [dL_W, dL_bv, dL_bh] 324 | dlogZ_Theta = [dlogZ_W, dlogZ_bv, dlogZ_bh] 325 | dE_Theta = [dE_W, dE_bv, dE_bh] 326 | 327 | return dL_Theta, dlogZ_Theta, dE_Theta, dL_qx, dL_pa 328 | 329 | def create_pseudoliklihood(self, X): 330 | """Stochastic approximation to the pseudo-likelihood""" 331 | # X = self.inputs[0] 332 | X = X.reshape((-1, self.n_visible)) 333 | 334 | # index of bit i in expression p(x_i | x_{\i}) 335 | bit_i_idx = self.bit_i_idx 336 | # bit_i_idx = theano.shared(value=0, name='bit_i_idx') 337 | 338 | # binarize the input image by rounding to nearest integer 339 | xi = T.round(X) 340 | 341 | # calculate free energy for the given bit configuration 342 | fe_xi = self.free_energy(xi) 343 | 344 | # flip bit x_i of matrix xi and preserve all other bits x_{\i} 345 | # Equivalent to xi[:,bit_i_idx] = 1-xi[:, bit_i_idx], but assigns 346 | # the result to xi_flip, instead of working in place on xi. 347 | xi_flip = T.set_subtensor(xi[:, bit_i_idx], 1 - xi[:, bit_i_idx]) 348 | 349 | # calculate free energy with bit flipped 350 | fe_xi_flip = self.free_energy(xi_flip) 351 | 352 | # equivalent to e^(-FE(x_i)) / (e^(-FE(x_i)) + e^(-FE(x_{\i}))) 353 | cost = T.mean(self.n_visible * T.log(T.nnet.sigmoid(fe_xi_flip-fe_xi))) 354 | 355 | return cost 356 | 357 | def create_updates(self, grads, params, alpha, opt_alg, opt_params): 358 | # def create_updates(self, alpha, opt_alg, opt_params): 359 | scaled_p_grads = [grad * alpha for grad in self.grad_p_avg] 360 | dL_Theta, dE_Theta, dlogZ_Theta, dL_qx, dL_pa = grads 361 | 362 | # create update for main parameters 363 | # TODO: Scale grads with alpha! 
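        # (added annotation) Two separate update rules are assembled below:
        #  * uL_Theta: Adam applied to the running-average model gradients
        #    self.grad_p_avg (maintained by create_avg_updates), so the RBM
        #    parameters (W, vbias, hbias) follow an averaged estimate of
        #    d loglik / d Theta.  Note that alpha is not actually applied
        #    here: scaled_p_grads is computed above but never passed to
        #    Adam, hence the TODO.
        #  * uL_qx / uL_pa: plain SGD steps that descend logbnd (dL_qx and
        #    dL_pa hold -0.5 * d logbnd and are negated again before being
        #    handed to lasagne.updates.sgd).  Since 0.5 * logbnd estimates an
        #    upper bound on log Z, descending it tightens the lower bound
        #    loglikbnd = logF_D - 0.5 * logbnd.  Only uL_qx is returned as
        #    uL_Phi; the p(a|x) updates are currently left out.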
364 | # lr = opt_params.get('lr', 1e-3) 365 | # b1, b2 = opt_params.get('b1', 0.7), opt_params.get('b2', 1e-3) 366 | lr, b1, b2 = 1e-3, 0.7, 1e-3 367 | uL_Theta = lasagne.updates.adam(self.grad_p_avg, self.params_p, learning_rate=lr, beta1=b1, beta2=b2) 368 | # uL_Theta = lasagne.updates.adam(self.grad_p_avg, self.params_p, learning_rate=1e-3, beta1=7e-1, beta2=1e-3) 369 | # uL_Theta = lasagne.updates.adam(scaled_p_grads, self.params_p, learning_rate=1e-3, beta1=0.9, beta2=0.999) 370 | # learning_rate=lr, beta1=b1, beta2=b2) 371 | uL_qx = lasagne.updates.sgd([-1*g for g in dL_qx], self.get_params_qx(), learning_rate=5e-2) # TODO: anneal lr 372 | # uL_qx = lasagne.updates.adam([-1*g for g in dL_qx], self.get_params_qx(), learning_rate=5e-2, beta1=0.0, beta2=0.999) # TODO: anneal lr 373 | uL_pa = lasagne.updates.sgd([-1*g for g in dL_pa], self.get_params_pa(), learning_rate=5e-2) # TODO: anneal lr 374 | # uL_Phi = OrderedDict(uL_qx.items() + uL_pa.items()) 375 | uL_Phi = uL_qx 376 | 377 | # create updates for the running averages 378 | avg_updates = self.create_avg_updates(self.logZ, dlogZ_Theta, dL_Theta) 379 | 380 | # avg Theta updates 381 | avg_Theta_updates = OrderedDict() 382 | avg_Theta_updates[self.W_avg] = 0.95*self.W_avg + 0.05*self.W 383 | avg_Theta_updates[self.vbias_avg] = 0.95*self.vbias_avg + 0.05*self.vbias 384 | avg_Theta_updates[self.hbias_avg] = 0.95*self.hbias_avg + 0.05*self.hbias 385 | # avg_Theta_updates[self.W_avg] = self.W 386 | # avg_Theta_updates[self.vbias_avg] = self.vbias 387 | # avg_Theta_updates[self.hbias_avg] = self.hbias 388 | 389 | # increment bit_i_idx % number as part of updates 390 | uL_Phi[self.bit_i_idx] = (self.bit_i_idx + 1) % self.n_visible 391 | 392 | return uL_Theta, uL_Phi, avg_updates, avg_Theta_updates 393 | 394 | def create_avg_updates(self, logZ, dlogZ_Theta, dL_Theta): 395 | inner_it = self.inner_it 396 | avg_updates = OrderedDict() 397 | 398 | new_logZ_estimate = Tlogaddexp( 399 | self.logZ_avg + T.log((inner_it) / (inner_it+1)), 400 | logZ - T.log(inner_it+1.) 401 | ) 402 | # self.logZ_avg * ((inner_it) / (inner_it+1)) + logZ / (inner_it+1) 403 | 404 | avg_updates[inner_it] = inner_it + 1. 
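        # (added annotation) new_logZ_estimate above is the running
        # arithmetic mean of the per-step partition-function estimates
        # exp(logZ), kept in log space for numerical stability:
        #     log( (n/(n+1)) * Z_avg + (1/(n+1)) * Z_new )
        #   = logaddexp( logZ_avg + log(n/(n+1)), logZ - log(n+1) ),
        # with n = inner_it counting steps since the last reset_averages().
        # loga caches -2 * logZ_avg, i.e. log(1 / Z_avg**2).  The loop below
        # keeps the same running mean of the *negated* likelihood gradients
        # dL_Theta, so that Adam (a minimizer) applied to grad_p_avg in
        # create_updates performs gradient ascent on the log-likelihood.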
405 | avg_updates[self.logZ_avg] = new_logZ_estimate 406 | avg_updates[self.loga] = -2.*new_logZ_estimate # 1./(new_Z_estimate**2) 407 | 408 | for g, g_avg in zip(dL_Theta, self.grad_p_avg): 409 | avg_updates[g_avg] = g_avg * ((inner_it) / (inner_it+1)) - g / (inner_it+1) 410 | 411 | return avg_updates 412 | 413 | def reset_averages(self): 414 | self.inner_it.set_value(0.0) 415 | 416 | def free_energy(self, v_sample): 417 | """Function to compute the free energy""" 418 | wx_b = T.dot(v_sample, self.W) + self.hbias 419 | vbias_term = T.dot(v_sample, self.vbias) 420 | hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=1) 421 | return -hidden_term - vbias_term 422 | 423 | def pseudolikelihood(self, data): 424 | self.rbm.components_ = self.W.get_value().T 425 | self.rbm.intercept_visible_ = self.vbias.get_value() 426 | self.rbm.intercept_hidden_ = self.hbias.get_value() 427 | return self.rbm.score_samples(data).mean() 428 | 429 | def dump(self, fname): 430 | """Pickle weights to a file""" 431 | params = lasagne.layers.get_all_param_values(self.network) 432 | all_params = [params, self.W.get_value(), self.vbias.get_value(), self.hbias.get_value()] 433 | with open(fname, 'w') as f: 434 | pickle.dump(all_params, f) 435 | 436 | def load(self, fname): 437 | """Load pickled network""" 438 | with open(fname) as f: 439 | params = pickle.load(f) 440 | self.load_params(params) 441 | 442 | def load_params(self, params): 443 | """Load a given set of parameters""" 444 | net_params, W, vbias, hbias = params 445 | self.W.set_value(W) 446 | self.vbias.set_value(vbias) 447 | self.hbias.set_value(hbias) 448 | lasagne.layers.set_all_param_values(self.network, net_params) 449 | 450 | def fit( 451 | self, X_train, Y_train, X_val, Y_val, 452 | n_epoch=10, n_batch=100, logname='run' 453 | ): 454 | """Train the model""" 455 | alpha = 1.0 # learning rate, which can be adjusted later 456 | n_data = len(X_train) 457 | n_superbatch = self.n_superbatch 458 | 459 | for epoch in range(n_epoch): 460 | # In each epoch, we do a full pass over the training data: 461 | train_batches, train_err, train_acc = 0, 0, 0 462 | start_time = time.time() 463 | 464 | # if epoch > 30: alpha = 0.33 465 | # if epoch > 20: alpha = 0.1 466 | 467 | # iterate over superbatches to save time on GPU memory transfer 468 | for X_sb, Y_sb in self.iterate_superbatches( 469 | X_train, Y_train, n_superbatch, 470 | datatype='train', shuffle=True, 471 | ): 472 | for idx1, idx2 in iterate_minibatch_idx(len(X_sb), n_batch): 473 | # q steps 474 | for i in range(2): 475 | err, acc = self.train_q(idx1, idx2) 476 | # print epoch, i, err, acc 477 | # err, acc = self.train_q(idx1, idx2) 478 | 479 | # p steps 480 | err, acc = self.train_p(idx1, idx2, alpha) 481 | # print epoch, 'P', self.pseudolikelihood(X_sb[idx1:idx2].reshape(-1, self.n_visible)), acc 482 | # print 483 | self.reset_averages() 484 | 485 | # err, acc = self.train(idx1, idx2, alpha) 486 | # collect metrics 487 | err = self.pseudolikelihood(X_sb[idx1:idx2].reshape(-1, self.n_visible)) 488 | train_batches += 1 489 | train_err += err 490 | train_acc += acc 491 | 492 | # print train_batches, err, acc 493 | 494 | if train_batches % 100 == 0: 495 | n_total = epoch * n_data + n_batch * train_batches 496 | metrics = [ 497 | n_total, train_err / train_batches, 498 | train_acc / train_batches, 499 | ] 500 | log_metrics(logname, metrics) 501 | 502 | print "Epoch {} of {} took {:.3f}s ({} minibatches)".format( 503 | epoch + 1, n_epoch, time.time() - start_time, train_batches) 504 | 505 | print " 
training:\t\t{:.6f}\t{:.6f}".format( 506 | train_err / train_batches, train_acc / train_batches) 507 | 508 | # reserve N of training data points to kick start hallucinations 509 | self.dump(logname + '.pkl') 510 | hallu_i = self.numpy_rng.randint(n_data - self.n_chain) 511 | self.hallu_set = np.asarray( 512 | X_train[hallu_i:hallu_i + self.n_chain], 513 | dtype=theano.config.floatX 514 | ) --------------------------------------------------------------------------------
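A note on the estimator at the heart of aux_variational_rbm.py: create_objectives forms log importance weights logFQ for Monte Carlo samples drawn through the auxiliary construction a ~ N(0, I), x ~ q(x|a), and estimates the log-partition function as logZ = Tlogsumexp(logFQ) - T.log(t), with logFx computed by _free_energy(..., marginalize='h'), i.e. the hidden units summed out. The standalone NumPy sketch below illustrates the same logsumexp estimator in its simplest form, under stated assumptions: toy random RBM parameters and a fixed factorized Bernoulli proposal q(x) with a tractable density in place of the learned q(x|a). The helper names log_unnorm_marginal and np_logsumexp are illustrative only, not code from the repository.

import numpy as np

rng = np.random.RandomState(0)
n_visible, n_hidden, n_mc = 64, 8, 300  # mirrors n_visible, n_hidden, n_mc above

# toy stand-ins for self.W, self.vbias, self.hbias
W = 0.01 * rng.randn(n_visible, n_hidden)
vbias = np.zeros(n_visible)
hbias = np.zeros(n_hidden)

def log_unnorm_marginal(x):
    """log p*(x) with the hidden units summed out, as in
    _free_energy(..., marginalize='h'):
        b_v^T x + sum_j log(1 + exp(b_h_j + (W^T x)_j))"""
    return np.dot(x, vbias) + np.sum(np.logaddexp(0.0, hbias + np.dot(x, W)), axis=1)

def np_logsumexp(v):
    """numerically stable log(sum(exp(v)))"""
    m = v.max()
    return m + np.log(np.sum(np.exp(v - m)))

# fixed factorized Bernoulli proposal q(x); the model instead amortizes
# q(x|a) through a small neural net fed with a ~ N(0, I)
q_probs = np.full(n_visible, 0.5)
x = (rng.rand(n_mc, n_visible) < q_probs).astype('float32')
log_q = np.sum(x * np.log(q_probs) + (1.0 - x) * np.log(1.0 - q_probs), axis=1)

# logZ ~= logsumexp_t( logF(x_t) - logQ(x_t) ) - log(t), cf. create_objectives
log_w = log_unnorm_marginal(x) - log_q
log_Z_hat = np_logsumexp(log_w) - np.log(n_mc)
print(log_Z_hat)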