├── util ├── __init__.py └── data.py ├── models ├── distributions │ ├── __init__.py │ ├── operations.py │ └── distributions.py ├── __init__.py ├── layers │ ├── __init__.py │ ├── shape.py │ ├── misc.py │ ├── conv.py │ ├── test_sampling.py │ └── sampling.py ├── helpers.py ├── vae.py ├── adgm.py ├── sbn.py ├── model.py ├── usbn.py ├── abstractrbm.py ├── dadgm.py ├── rbm_dadgm.py ├── variational_rbm.py ├── udadgm.py └── aux_variational_rbm.py ├── set_env.sh ├── log └── README.md ├── requirements.txt ├── Makefile ├── LICENSE ├── .gitignore ├── README.md └── run.py /util/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/distributions/__init__.py: -------------------------------------------------------------------------------- 1 | from distributions import * -------------------------------------------------------------------------------- /set_env.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH="$PYTHONPATH:`pwd`" 2 | echo "Environment variables set!" 3 | -------------------------------------------------------------------------------- /log/README.md: -------------------------------------------------------------------------------- 1 | Log folder 2 | ========== 3 | 4 | The models will save their logs into this folder. 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cycler==0.10.0 2 | Lasagne==0.2.dev1 3 | matplotlib==1.5.3 4 | numpy==1.11.2 5 | pyparsing==2.1.10 6 | python-dateutil==2.5.3 7 | pytz==2016.7 8 | scipy==0.18.1 9 | scikit-learn==0.18.1 10 | six==1.10.0 11 | Theano==0.9.0.dev3 12 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from vae import VAE 2 | from sbn import SBN 3 | from usbn import USBN 4 | from adgm import ADGM 5 | from dadgm import DADGM 6 | from udadgm import UDADGM 7 | from rbm import RBM 8 | from variational_rbm import VariationalRBM 9 | from aux_variational_rbm import AuxiliaryVariationalRBM -------------------------------------------------------------------------------- /models/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from shape import RepeatLayer 2 | from sampling import ( 3 | GaussianSampleLayer, 4 | GaussianMultiSampleLayer, 5 | BernoulliSampleLayer, 6 | GumbelSoftmaxSampleLayer, 7 | GumbelSampleLayer, 8 | LogisticSampleLayer 9 | ) 10 | from conv import Deconv2DLayer 11 | from misc import ( 12 | ArgmaxLayer, RoundLayer, 13 | SoftmaxLayer, SigmoidLayer 14 | ) -------------------------------------------------------------------------------- /models/layers/shape.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano.tensor as T 3 | import lasagne 4 | 5 | # ---------------------------------------------------------------------------- 6 | 7 | class RepeatLayer(lasagne.layers.Layer): 8 | def __init__(self, l_in, n_ax, n_rep, rng=None, **kwargs): 9 | self.n_ax = n_ax 10 | self.n_rep = n_rep 11 | super(RepeatLayer, self).__init__(l_in, **kwargs) 12 | 13 | def get_output_shape_for(self, input_shapes): 14 | return tuple(np.insert(input_shapes, self.n_ax, self.n_rep)) 15 | 16 | def 
get_output_for(self, inputs, deterministic=False, **kwargs): 17 | return T.shape_padaxis(inputs, axis=self.n_ax).repeat(self.n_rep, self.n_ax) 18 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PYTHON=/usr/bin/python27 2 | PYTHON=python 3 | 4 | EPOCHS=100 5 | NAME=experiment2 6 | 7 | LOGFOLDER=log 8 | DATASET=mnist 9 | MODEL=discrete-vae-rbm 10 | ALG=adam 11 | 12 | LR=3e-4 13 | B1=0.9 14 | B2=0.999 15 | SUPERBATCH=1024 16 | NB=128 17 | 18 | # ---------------------------------------------------------------------------- 19 | 20 | train: 21 | $(PYTHON) run.py train \ 22 | --dataset $(DATASET) \ 23 | --model $(MODEL) \ 24 | -e $(EPOCHS) \ 25 | --logname $(LOGFOLDER)/$(DATASET).$(MODEL).$(ALG).$(LR).$(NB).$(NAME) \ 26 | --plotname $(LOGFOLDER)/$(DATASET)_$(MODEL)_$(ALG)_$(LR)_$(NB)_$(NAME).png \ 27 | --alg $(ALG) \ 28 | --lr $(LR) \ 29 | --b1 $(B1) \ 30 | --b2 $(B2) \ 31 | --n_superbatch $(SUPERBATCH) \ 32 | --n_batch $(NB) 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Volodymyr Kuleshov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.png 2 | *.json 3 | 4 | # ML datasets 5 | *.gz 6 | env/ 7 | log/ 8 | 9 | # Byte-compiled / optimized / DLL files 10 | __pycache__/ 11 | *.py[cod] 12 | *$py.class 13 | 14 | # C extensions 15 | *.so 16 | 17 | # Distribution / packaging 18 | .Python 19 | env/ 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *,cover 54 | .hypothesis/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # IPython Notebook 78 | .ipynb_checkpoints 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # dotenv 87 | .env 88 | 89 | # virtualenv 90 | venv/ 91 | ENV/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | 99 | # Mac stuff 100 | .DS_Store 101 | 102 | *.bbl 103 | *.blg 104 | *.aux 105 | *.zip 106 | -------------------------------------------------------------------------------- /models/layers/misc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | import lasagne 5 | 6 | # ---------------------------------------------------------------------------- 7 | 8 | class ArgmaxLayer(lasagne.layers.Layer): 9 | """Argmax along the last dimension""" 10 | def __init__(self, incoming, **kwargs): 11 | super(ArgmaxLayer, self).__init__(incoming, **kwargs) 12 | 13 | def get_output_shape_for(self, input_shape): 14 | return input_shape[:-1] 15 | 16 | def get_output_for(self, input, **kwargs): 17 | return T.argmax(input, axis=-1) 18 | 19 | class RoundLayer(lasagne.layers.Layer): 20 | """Argmax along the last dimension""" 21 | def __init__(self, incoming, **kwargs): 22 | super(RoundLayer, self).__init__(incoming, **kwargs) 23 | 24 | def get_output_shape_for(self, input_shape): 25 | return input_shape 26 | 27 | def get_output_for(self, input, **kwargs): 28 | x = T.sgn(input) 29 | x = .5*(x + 1.) 
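        # (added note) T.sgn maps the input to {-1, +1} (and 0 for exactly-zero entries);
        # the affine map .5*(x + 1.) rescales this to a hard {0, 1} code, i.e. the layer
        # thresholds its input at zero.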
30 | return x 31 | 32 | class SoftmaxLayer(lasagne.layers.Layer): 33 | """Argmax along the last dimension""" 34 | def __init__(self, incoming, tau=1., **kwargs): 35 | super(SoftmaxLayer, self).__init__(incoming, **kwargs) 36 | self.tau = tau 37 | 38 | def get_output_shape_for(self, input_shape): 39 | return input_shape 40 | 41 | def get_output_for(self, input, **kwargs): 42 | return T.nnet.softmax(input / self.tau) 43 | 44 | class SigmoidLayer(lasagne.layers.Layer): 45 | """Argmax along the last dimension""" 46 | def __init__(self, incoming, tau=1., **kwargs): 47 | super(SigmoidLayer, self).__init__(incoming, **kwargs) 48 | self.tau = tau 49 | 50 | def get_output_shape_for(self, input_shape): 51 | return input_shape 52 | 53 | def get_output_for(self, input, **kwargs): 54 | return T.nnet.sigmoid(input / self.tau) -------------------------------------------------------------------------------- /models/layers/conv.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | import lasagne 5 | 6 | # ---------------------------------------------------------------------------- 7 | 8 | class Deconv2DLayer(lasagne.layers.Layer): 9 | 10 | def __init__(self, incoming, num_filters, filter_size, stride=1, pad=0, 11 | nonlinearity=lasagne.nonlinearities.rectify, **kwargs): 12 | super(Deconv2DLayer, self).__init__(incoming, **kwargs) 13 | self.num_filters = num_filters 14 | self.filter_size = lasagne.utils.as_tuple(filter_size, 2, int) 15 | self.stride = lasagne.utils.as_tuple(stride, 2, int) 16 | self.pad = lasagne.utils.as_tuple(pad, 2, int) 17 | self.W = self.add_param(lasagne.init.Orthogonal(), 18 | (self.input_shape[1], num_filters) + self.filter_size, 19 | name='W') 20 | self.b = self.add_param(lasagne.init.Constant(0), 21 | (num_filters,), 22 | name='b') 23 | if nonlinearity is None: 24 | nonlinearity = lasagne.nonlinearities.identity 25 | self.nonlinearity = nonlinearity 26 | 27 | def get_output_shape_for(self, input_shape): 28 | shape = tuple(i*s - 2*p + f - 1 29 | for i, s, p, f in zip(input_shape[2:], 30 | self.stride, 31 | self.pad, 32 | self.filter_size)) 33 | return (input_shape[0], self.num_filters) + shape 34 | 35 | def get_output_for(self, input, **kwargs): 36 | op = T.nnet.abstract_conv.AbstractConv2d_gradInputs( 37 | imshp=self.output_shape, 38 | kshp=(self.input_shape[1], self.num_filters) + self.filter_size, 39 | subsample=self.stride, border_mode=self.pad) 40 | conved = op(self.W, input, self.output_shape[2:]) 41 | if self.b is not None: 42 | conved += self.b.dimshuffle('x', 0, 'x', 'x') 43 | return self.nonlinearity(conved) -------------------------------------------------------------------------------- /models/distributions/operations.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano.tensor as T 3 | 4 | # ---------------------------------------------------------------------------- 5 | # (stolen from the parmesan lib) 6 | 7 | def log_sum_exp(A, axis=None, sum_op=T.sum): 8 | """Computes `log(exp(A).sum(axis=axis))` avoiding numerical issues using the log-sum-exp trick. 9 | Direct calculation of :math:`\log \sum_i \exp A_i` can result in underflow or overflow numerical 10 | issues. Big positive values can cause overflow :math:`\exp A_i = \inf`, and big negative values 11 | can cause underflow :math:`\exp A_i = 0`. 
The latter can eventually cause the sum to go to zero 12 | and finally resulting in :math:`\log 0 = -\inf`. 13 | The log-sum-exp trick avoids these issues by using the identity, 14 | .. math:: 15 | \log \sum_i \exp A_i = \log \sum_i \exp(A_i - c) + c, \text{using}, \\ 16 | c = \max A. 17 | This avoids overflow, and while underflow can still happen for individual elements it avoids 18 | the sum being zero. 19 | 20 | Parameters 21 | ---------- 22 | A : Theano tensor 23 | Tensor of which we wish to compute the log-sum-exp. 24 | axis : int, tuple, list, None 25 | Axis or axes to sum over; None (default) sums over all axes. 26 | sum_op : function 27 | Summing function to apply; default is T.sum, but can also be T.mean for log-mean-exp. 28 | 29 | Returns 30 | ------- 31 | Theano tensor 32 | The log-sum-exp of `A`, dimensions over which is summed will be dropped. 33 | """ 34 | A_max = T.max(A, axis=axis, keepdims=True) 35 | B = T.log(sum_op(T.exp(A - A_max), axis=axis, keepdims=True)) + A_max 36 | 37 | if axis is None: 38 | return B.dimshuffle(()) # collapse to scalar 39 | else: 40 | if not hasattr(axis, '__iter__'): axis = [axis] 41 | return B.dimshuffle([d for d in range(B.ndim) if d not in axis]) # drop summed axes 42 | 43 | def log_mean_exp(A, axis=None): 44 | """Computes `log(exp(A).mean(axis=axis))` avoiding numerical issues using the log-sum-exp trick. 45 | See also 46 | -------- 47 | log_sum_exp 48 | """ 49 | return log_sum_exp(A, axis, sum_op=T.mean) -------------------------------------------------------------------------------- /models/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano.tensor as T 3 | 4 | # ---------------------------------------------------------------------------- 5 | # iteration 6 | 7 | def iterate_minibatch_idx(n_inputs, batchsize,): 8 | for start_idx in range(0, n_inputs - batchsize + 1, batchsize): 9 | yield start_idx, min(start_idx + batchsize, n_inputs) 10 | 11 | def iterate_minibatches(inputs, targets, batchsize, shuffle=False): 12 | assert len(inputs) == len(targets) 13 | if shuffle: 14 | indices = np.arange(len(inputs)) 15 | np.random.shuffle(indices) 16 | for start_idx in range(0, len(inputs) - batchsize + 1, batchsize): 17 | if shuffle: 18 | excerpt = indices[start_idx:start_idx + batchsize] 19 | else: 20 | excerpt = slice(start_idx, start_idx + batchsize) 21 | yield inputs[excerpt], targets[excerpt] 22 | 23 | def random_subbatch(inputs, targets, batchsize): 24 | assert len(inputs) == len(targets) 25 | indices = np.arange(len(inputs)) 26 | np.random.shuffle(indices) 27 | excerpt = indices[:batchsize] 28 | return inputs[excerpt], targets[excerpt] 29 | 30 | # ---------------------------------------------------------------------------- 31 | # eval 32 | 33 | def evaluate(eval_f, X, Y, batchsize=1000): 34 | tot_err, tot_acc, batches = 0, 0, 0 35 | print X.shape 36 | for inputs, targets in iterate_minibatches(X, Y, batchsize, shuffle=False): 37 | err, acc = eval_f(inputs, targets) 38 | tot_err += err 39 | tot_acc += acc 40 | batches += 1 41 | return tot_err / batches, tot_acc / batches 42 | 43 | def log_metrics(logname, metrics): 44 | logfile = '%s.log' % logname 45 | with open(logfile, 'a') as f: 46 | f.write('\t'.join([str(m) for m in metrics]) + '\n') 47 | 48 | # ---------------------------------------------------------------------------- 49 | # math 50 | 51 | def Tlogsumexp(L, axis=None, safe=True): 52 | """Logsumexp in Theano.""" 53 | if safe: 54 | # b = T.max(L, axis=axis) 
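    # (added note) the commented-out choices of b above and below are max-based stabilizers;
    # with b = 0 (the value used here) this reduces to a plain, unstabilized log-sum-exp.
    # The numerically stable variant, which subtracts the per-axis max, is log_sum_exp in
    # models/distributions/operations.py.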
55 | # b = 1e6 56 | # b = T.max(L) 57 | b = 0 58 | lse = b + T.log(T.sum(T.exp(L - b), axis=axis)) 59 | else: 60 | lse = T.log(T.sum(T.exp(L), axis=axis)) 61 | return lse 62 | 63 | def Tlogaddexp(a, b): 64 | """Logsumexp in Theano.""" 65 | # m = T.max((a,b)) # FIXME: why are there NaNs here? 66 | m = 0 67 | return m + T.log(T.exp(a-m) + T.exp(b-m)) -------------------------------------------------------------------------------- /models/layers/test_sampling.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano.tensor as T 3 | import matplotlib.pyplot as plt 4 | 5 | # MY CODE 6 | 7 | temperature = 0.01 8 | logits = np.linspace(-2, 2, 10).reshape([1, -1]) 9 | 10 | def softmax(x): 11 | """Compute softmax values for each sets of scores in x.""" 12 | e_x = np.exp(x - np.max(x)) 13 | return e_x / e_x.sum(axis=0) 14 | 15 | def sample_gumbel(shape, eps=1e-20): 16 | """ Sample from Gumbel(0, 1) """ 17 | U = np.random.uniform(size=shape, low=0, high=1) 18 | return -np.log(-np.log(U + eps) + eps) 19 | 20 | def sample_gumbel_softmax(logits, temperature): 21 | """ Sample from Gumbel-Softmax distribution """ 22 | y = logits + sample_gumbel(logits.shape) 23 | return softmax(y / temperature) 24 | 25 | plt.title('gumbel-softmax samples') 26 | for i in range(100): 27 | plt.plot( 28 | range(10), 29 | sample_gumbel_softmax(logits,temperature=temperature), 30 | marker='o', 31 | alpha=0.25 32 | ) 33 | 34 | plt.ylim(0,1) 35 | plt.show() 36 | 37 | gsm_samples = [sample_gumbel_softmax(logits,temperature=temperature) for _ in range(500)] 38 | plt.title('average over samples') 39 | plt.plot( 40 | range(10), 41 | np.mean(gsm_samples, axis=0)[0], 42 | marker='o', 43 | label='gumbel-softmax average' 44 | ) 45 | plt.plot( 46 | softmax(sample_gumbel_softmax(logits,temperature=temperature)[0]), 47 | marker='+', 48 | label='regular softmax' 49 | ) 50 | plt.legend(loc='best') 51 | plt.show() 52 | 53 | # --------------------------------------------------------------------- 54 | 55 | # THEIR CODE 56 | temperature = 0.01 57 | logits = np.linspace(-2, 2, 10) 58 | 59 | def softmax(x): 60 | e_x = np.exp(x - np.max(x)) 61 | return e_x / e_x.sum(axis=0) 62 | 63 | def GumbelSoftmax(logits, temperature): 64 | uniform = np.random.uniform(size=logits.shape,low=0,high=1) 65 | gumbel = -np.log(-np.log(uniform + 1e-20) + 1e-20) 66 | return softmax((logits + gumbel) / temperature) 67 | 68 | plt.title('gumbel-softmax samples') 69 | for i in range(100): 70 | plt.plot( 71 | range(10), 72 | GumbelSoftmax(logits, temperature), 73 | marker='o', 74 | alpha=0.25 75 | ) 76 | 77 | plt.ylim(0,1) 78 | plt.show() 79 | 80 | gsm_samples = [sample_gumbel_softmax(logits,temperature=temperature) for _ in range(500)] 81 | plt.title('average over samples') 82 | plt.plot( 83 | range(10), 84 | np.mean(gsm_samples, axis=0)[0], 85 | marker='o', 86 | label='gumbel-softmax average' 87 | ) 88 | plt.plot( 89 | softmax(sample_gumbel_softmax(logits,temperature=temperature)[0]), 90 | marker='+', 91 | label='regular softmax' 92 | ) 93 | plt.legend(loc='best') 94 | plt.show() 95 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Neural Variational Inference in Undirected Models 2 | ================================================= 3 | 4 | This repository contains code accompanying the paper 5 | 6 | ``` 7 | Neural variational inference and learning in undirected graphical 
models. 8 | Volodymyr Kuleshov and Stefano Ermon. 9 | Neural Information Processing Systems, 2017 10 | ``` 11 | 12 | ## Installation 13 | 14 | The code uses Theano and Lasagne. 15 | To install this package, clone the git repo, and update your `PYTHONPATH` to include the folder. 16 | 17 | ``` 18 | git clone https://github.com/kuleshov/neural-variational-inference.git; 19 | cd neural-variational-inference/; 20 | source set_env.sh 21 | ``` 22 | 23 | ## Models and Datasets 24 | 25 | The repository implements the following models: 26 | 27 | * `vae`: Regular variational autoencoder. 28 | * `discrete-vae`: Variational autoencoder with a `Bernoulli(200)` prior. 29 | * `discrete-vae-rbm`: Variational autoencoder with an `RBM(64,8)` prior, 30 | trained using neural variational inference. 31 | * `adgm`: Auxiliary-variable deep generative model (ADGM; Maaløe et al. 2016). 32 | * `discrete-adgm`: ADGM with a `Bernoulli(200)` prior. 33 | * `discrete-adgm-rbm`: ADGM with an `RBM(64,8)` prior, 34 | trained using neural variational inference. 35 | * `rbm`: Regular Restricted Boltzmann Machine (RBM) trained with persistent contrastive divergence. 36 | * `vrbm`: Variational RBM, i.e. an RBM trained with neural variational inference using a mixture of ten Bernoullis as the auxiliary helper distribution `q`. 37 | * `avrbm`: Auxiliary-variable Variational RBM, i.e. an RBM trained with neural variational inference using an auxiliary-variable distribution `q(x,a)` (parametrized with a neural network) as the helper distribution `q`. 38 | 39 | The models can be run on the following datasets: 40 | 41 | * `digits`: The UCI digits dataset. Use this for the RBM models (otherwise you'll get numerical issues). 42 | * `mnist`: Regular MNIST. 43 | * `binmnist`: Binarized MNIST, using the binarization from the IWAE (Burda et al.) paper. 44 | * `omniglot`: The Omniglot dataset. 45 | 46 | The `run.py` script takes these names as input. 47 | 48 | ## Running the Code 49 | 50 | To run a model, you may use the `run.py` launch script. 51 | 52 | ``` 53 | usage: run.py train [-h] [--dataset DATASET] [--model MODEL] [--pkl PKL] 54 | [-e EPOCHS] [-l LOGNAME] [-p PLOTNAME] [--alg ALG] 55 | [--lr LR] [--b1 B1] [--b2 B2] [--n_batch N_BATCH] 56 | [--n_superbatch N_SUPERBATCH] 57 | 58 | optional arguments: 59 | -h, --help show this help message and exit 60 | --dataset DATASET 61 | --model MODEL 62 | --pkl PKL 63 | -e EPOCHS, --epochs EPOCHS 64 | -l LOGNAME, --logname LOGNAME 65 | -p PLOTNAME, --plotname PLOTNAME 66 | --alg ALG 67 | --lr LR 68 | --b1 B1 69 | --b2 B2 70 | --n_batch N_BATCH 71 | --n_superbatch N_SUPERBATCH 72 | ``` 73 | 74 | The simplest way to use it is via the `Makefile` provided in the root directory; typing `make train` will start training. 75 | You can specify the model, dataset, and other parameters by modifying the defaults in the `Makefile` (see the example invocation at the end of this README). 76 | 77 | Note that the default hyper-parameters for `avrbm` on the `digits` dataset are currently set incorrectly and cause problems. 78 | 79 | ## Feedback 80 | 81 | Send feedback to [Volodymyr Kuleshov](http://www.stanford.edu/~kuleshov).
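
## Example Invocation

The following is an illustrative sketch rather than a tested recipe; names such as `myrun` and `digits_vrbm_example.png` are placeholders, and the numeric values simply mirror the `Makefile` defaults. Instead of editing the `Makefile`, its variables can also be overridden on the command line:

```
make train MODEL=discrete-vae-rbm DATASET=binmnist EPOCHS=200 NAME=myrun
```

Equivalently, `run.py` can be called directly with the arguments listed in the usage string above:

```
python run.py train --dataset digits --model vrbm -e 100 \
    --alg adam --lr 3e-4 --b1 0.9 --b2 0.999 \
    --n_batch 128 --n_superbatch 1024 \
    --logname log/digits.vrbm.example --plotname log/digits_vrbm_example.png
```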
82 | -------------------------------------------------------------------------------- /models/distributions/distributions.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import theano.tensor as T 4 | 5 | # ---------------------------------------------------------------------------- 6 | # this is all taken from the parmesan lib 7 | 8 | c = - 0.5 * math.log(2*math.pi) 9 | 10 | def log_gumbel_softmax(x, mu, tau=1.0, eps=1e-6): 11 | """ 12 | Compute logpdf of a Gumbel Softmax distribution with parameters p, at values x. 13 | .. See Appendix B.[1:2] https://arxiv.org/pdf/1611.01144v2.pdf 14 | """ 15 | k = mu.shape[-1] 16 | logpdf = T.gammaln(k) + (k - 1) * T.log(tau + eps) \ 17 | - k * T.log(T.sum(T.exp(mu) / T.power(x, tau), axis=2) + eps) \ 18 | + T.sum(mu - (tau + 1) * T.log(x + eps), axis=2) 19 | return logpdf 20 | 21 | def log_logistic_sigmoid(x, mu, tau=1.0, eps=1e-6): 22 | """ 23 | Compute logpdf of a Gumbel Softmax distribution with parameters p, at values x. 24 | .. See Appendix B.[1:2] https://arxiv.org/pdf/1611.01144v2.pdf 25 | """ 26 | mu = T.clip(mu, -10., 10.) 27 | logpdf = mu + T.log(tau + eps) \ 28 | - (tau+1.) * ( T.log(x + eps) + T.log( 1.-x + eps) ) \ 29 | - 2. * T.log( T.exp(mu) * T.power(x,-tau) + T.power(1.-x,-tau) + eps ) 30 | return logpdf 31 | 32 | def log_logistic_sigmoid2(y, mu, tau=1.0, eps=1e-6): 33 | """ 34 | Compute logpdf of a Gumbel Softmax distribution with parameters p, at values x. 35 | .. See Appendix B.[1:2] https://arxiv.org/pdf/1611.01144v2.pdf 36 | """ 37 | mu = T.clip(mu, -10., 10.) 38 | logpdf = mu + T.log(tau + eps) - tau*y \ 39 | - 2. * T.log( 1. + T.exp( -tau*y + mu ) + eps ) 40 | return logpdf 41 | 42 | def log_bernoulli(x, p, eps=1e-6): 43 | """ 44 | Compute log pdf of a Bernoulli distribution with success probability p, at values x. 45 | .. math:: \log p(x; p) = \log \mathcal{B}(x; p) 46 | Parameters 47 | ---------- 48 | x : Theano tensor 49 | Values at which to evaluate pdf. 50 | p : Theano tensor 51 | Success probability :math:`p(x=1)`, which is also the mean of the Bernoulli distribution. 52 | eps : float 53 | Small number used to avoid NaNs by clipping p in range [eps;1-eps]. 54 | Returns 55 | ------- 56 | Theano tensor 57 | Element-wise log probability, this has to be summed for multi-variate distributions. 58 | """ 59 | p = T.clip(p, eps, 1.0 - eps) 60 | return -T.nnet.binary_crossentropy(p, x) 61 | 62 | 63 | def log_categorical(x, p, eps=1e-6): 64 | """ 65 | Compute log pdf of a Categorical distribution with success probability p, at values x. 66 | .. math:: \log p(x; p) = \log \mathcal{B}(x; p) 67 | Parameters 68 | ---------- 69 | x : Theano tensor 70 | Values at which to evaluate pdf. 71 | p : Theano tensor 72 | Success probability :math:`p(x=1)`, which is also the mean of the Bernoulli distribution. 73 | eps : float 74 | Small number used to avoid NaNs by clipping p in range [eps;1-eps]. 75 | Returns 76 | ------- 77 | Theano tensor 78 | Element-wise log probability, this has to be summed for multi-variate distributions. 79 | """ 80 | p = T.clip(p, eps, 1.0 - eps) 81 | return -T.nnet.categorical_crossentropy(p, x) 82 | 83 | 84 | def log_normal(x, mean, std, eps=1e-6): 85 | """ 86 | Compute log pdf of a Gaussian distribution with diagonal covariance, at values x. 87 | Variance is parameterized as standard deviation. 88 | .. 
math:: \log p(x) = \log \mathcal{N}(x; \mu, \sigma^2I) 89 | 90 | Parameters 91 | ---------- 92 | x : Theano tensor 93 | Values at which to evaluate pdf. 94 | mean : Theano tensor 95 | Mean of the Gaussian distribution. 96 | std : Theano tensor 97 | Standard deviation of the diagonal covariance Gaussian. 98 | eps : float 99 | Small number added to standard deviation to avoid NaNs. 100 | Returns 101 | ------- 102 | Theano tensor 103 | Element-wise log probability, this has to be summed for multi-variate distributions. 104 | See also 105 | -------- 106 | log_normal1 : using variance parameterization 107 | log_normal2 : using log variance parameterization 108 | """ 109 | std += eps 110 | return c - T.log(T.abs_(std)) - (x - mean)**2 / (2 * std**2) 111 | 112 | 113 | def log_normal2(x, mean, log_var, eps=1e-6): 114 | """ 115 | Compute log pdf of a Gaussian distribution with diagonal covariance, at values x. 116 | Variance is parameterized as log variance rather than standard deviation, which ensures :math:`\sigma > 0`. 117 | .. math:: \log p(x) = \log \mathcal{N}(x; \mu, \sigma^2I) 118 | 119 | Parameters 120 | ---------- 121 | x : Theano tensor 122 | Values at which to evaluate pdf. 123 | mean : Theano tensor 124 | Mean of the Gaussian distribution. 125 | log_var : Theano tensor 126 | Log variance of the diagonal covariance Gaussian. 127 | eps : float 128 | Small number added to denominator to avoid NaNs. 129 | Returns 130 | ------- 131 | Theano tensor 132 | Element-wise log probability, this has to be summed for multi-variate distributions. 133 | See also 134 | -------- 135 | log_normal : using standard deviation parameterization 136 | log_normal1 : using variance parameterization 137 | """ 138 | return c - log_var/2 - (x - mean)**2 / (2 * T.exp(log_var) + eps) 139 | -------------------------------------------------------------------------------- /models/layers/sampling.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano.tensor as T 3 | import lasagne 4 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 5 | 6 | # ---------------------------------------------------------------------------- 7 | 8 | class GumbelSoftmax: 9 | def __init__(self, tau, softmax=True, eps=1e-6): 10 | """ 11 | Bottom of page 10. 12 | https://arxiv.org/pdf/1611.01144v2.pdf 13 | """ 14 | assert tau != 0 15 | self.temperature=tau 16 | self.eps=eps 17 | self.softmax=softmax 18 | self._srng = RandomStreams(lasagne.random.get_rng().randint(1, 2147462579)) 19 | 20 | def __call__(self, logits): 21 | # sample from Gumbel(0, 1) 22 | uniform = self._srng.uniform(logits.shape,low=0,high=1) 23 | gumbel = -T.log(-T.log(uniform + self.eps) + self.eps) 24 | 25 | # draw a sample from the Gumbel-Softmax distribution 26 | if self.softmax: 27 | return T.nnet.softmax((logits + gumbel) / self.temperature) 28 | else: 29 | return (logits + gumbel) / self.temperature 30 | 31 | class LogisticSigmoid: 32 | def __init__(self, tau, sigmoid=True, eps=1e-6): 33 | """ 34 | Bottom of page 10. 
35 | https://arxiv.org/pdf/1611.01144v2.pdf 36 | """ 37 | assert tau != 0 38 | self.temperature=tau 39 | self.eps=eps 40 | self.sigmoid=sigmoid 41 | self._srng = RandomStreams(lasagne.random.get_rng().randint(1, 2147462579)) 42 | 43 | def __call__(self, logits): 44 | # sample from Gumbel(0, 1) 45 | uniform = self._srng.uniform(logits.shape,low=0,high=1) 46 | logistic = T.log(uniform + self.eps) - T.log(1.-uniform + self.eps) 47 | 48 | # draw a sample from the logistic-sigmoid distribution 49 | if self.sigmoid: 50 | return T.nnet.sigmoid((logits + logistic) / self.temperature) 51 | else: 52 | return (logits + logistic) / self.temperature 53 | 54 | def onehot_argmax(logits): 55 | return T.extra_ops.to_one_hot(T.argmax(logits,-1),logits.shape[-1]) 56 | 57 | class GumbelSoftmaxSampleLayer(lasagne.layers.Layer): 58 | def __init__(self, incoming, tau, eps=1e-6, **kwargs): 59 | super(GumbelSoftmaxSampleLayer, self).__init__(incoming, **kwargs) 60 | self.gumbel_softmax = GumbelSoftmax(tau, eps=eps) 61 | 62 | def get_output_for(self, input, hard_max=False, **kwargs): 63 | if hard_max: 64 | return onehot_argmax(input) 65 | else: 66 | return self.gumbel_softmax(input) 67 | 68 | class GumbelSampleLayer(lasagne.layers.Layer): 69 | def __init__(self, incoming, tau, eps=1e-6, **kwargs): 70 | super(GumbelSampleLayer, self).__init__(incoming, **kwargs) 71 | self.gumbel_softmax = GumbelSoftmax(tau, eps=eps, softmax=False) 72 | 73 | def get_output_for(self, input, hard_max=False, **kwargs): 74 | return self.gumbel_softmax(input) 75 | 76 | class LogisticSampleLayer(lasagne.layers.Layer): 77 | def __init__(self, incoming, tau, eps=1e-6, **kwargs): 78 | super(LogisticSampleLayer, self).__init__(incoming, **kwargs) 79 | self.logistic_sigmoid = LogisticSigmoid(tau, eps=eps, sigmoid=False) 80 | 81 | def get_output_for(self, input, hard_max=False, **kwargs): 82 | return self.logistic_sigmoid(input) 83 | 84 | 85 | class GaussianSampleLayer(lasagne.layers.MergeLayer): 86 | def __init__(self, mu, logsigma, rng=None, **kwargs): 87 | self.rng = rng if rng else RandomStreams(lasagne.random.get_rng().randint(1,2147462579)) 88 | super(GaussianSampleLayer, self).__init__([mu, logsigma], **kwargs) 89 | 90 | def get_output_shape_for(self, input_shapes): 91 | return input_shapes[0] 92 | 93 | def get_output_for(self, inputs, deterministic=False, **kwargs): 94 | mu, logsigma = inputs 95 | shape=(self.input_shapes[0][0] or inputs[0].shape[0], 96 | self.input_shapes[0][1] or inputs[0].shape[1]) 97 | if deterministic: 98 | return mu 99 | return mu + T.exp(logsigma) * self.rng.normal(shape) 100 | 101 | 102 | class GaussianMultiSampleLayer(lasagne.layers.MergeLayer): 103 | def __init__(self, mu, logsigma, n_samples, rng=None, **kwargs): 104 | self.rng = rng if rng else RandomStreams(lasagne.random.get_rng().randint(1,2147462579)) 105 | self.n_samples = n_samples 106 | super(GaussianMultiSampleLayer, self).__init__([mu, logsigma], **kwargs) 107 | 108 | def get_output_shape_for(self, input_shapes): 109 | n_batch, n_dim = input_shapes[0] 110 | return (n_batch, self.n_samples, n_dim) 111 | 112 | def get_output_for(self, inputs, deterministic=False, **kwargs): 113 | mu, logsigma = inputs 114 | 115 | # get dimensions 116 | n_bat = self.input_shapes[0][0] or inputs[0].shape[0] 117 | n_dim = self.input_shapes[0][1] or inputs[0].shape[1] 118 | n_sam = self.n_samples 119 | 120 | # reshape inputs into rank-3 tensors 121 | mu3 = mu.reshape((n_bat,1,n_dim)).repeat(n_sam, axis=1) 122 | ls3 = logsigma.reshape((n_bat,1,n_dim)).repeat(n_sam, axis=1) 
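        # (added note) mu3 and ls3 now have shape (n_bat, n_sam, n_dim): the same mean and
        # log-sigma are broadcast across the n_sam Monte Carlo samples drawn below.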
123 | 124 | # return reshape means if layer is deterministic 125 | if deterministic: return mu3 126 | 127 | # otherwise, take samples 128 | shape = (n_bat, n_sam, n_dim) 129 | return mu3 + T.exp(ls3) * self.rng.normal(shape) 130 | 131 | 132 | class BernoulliSampleLayer(lasagne.layers.Layer): 133 | def __init__(self, mean, 134 | seed=lasagne.random.get_rng().randint(1, 2147462579), 135 | **kwargs): 136 | super(BernoulliSampleLayer, self).__init__(mean, **kwargs) 137 | self._srng = RandomStreams(seed) 138 | 139 | def seed(self, seed=lasagne.random.get_rng().randint(1, 2147462579)): 140 | self._srng.seed(seed) 141 | 142 | def get_output_shape_for(self, input_shape): 143 | return input_shape 144 | 145 | def get_output_for(self, mu, **kwargs): 146 | return self._srng.binomial(size=mu.shape, p=mu, dtype=mu.dtype) 147 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import pdb 2 | import argparse 3 | import matplotlib 4 | # Force matplotlib to not use any Xwindows backend. 5 | matplotlib.use('Agg') 6 | import matplotlib.pyplot as plt 7 | from util import data 8 | 9 | # ---------------------------------------------------------------------------- 10 | 11 | def make_parser(): 12 | parser = argparse.ArgumentParser() 13 | subparsers = parser.add_subparsers(title='Commands') 14 | 15 | # train 16 | train_parser = subparsers.add_parser('train', help='Train model') 17 | train_parser.set_defaults(func=train) 18 | 19 | train_parser.add_argument('--dataset', default='mnist') 20 | train_parser.add_argument('--model', default='vae') 21 | train_parser.add_argument('--pkl') 22 | train_parser.add_argument('-e', '--epochs', type=int, default=10) 23 | train_parser.add_argument('-l', '--logname', default='mnist-run') 24 | train_parser.add_argument('-p', '--plotname', default='mnist-plot.png') 25 | train_parser.add_argument('--alg', default='adam') 26 | train_parser.add_argument('--lr', type=float, default=1e-3) 27 | train_parser.add_argument('--b1', type=float, default=0.9) 28 | train_parser.add_argument('--b2', type=float, default=0.999) 29 | train_parser.add_argument('--n_batch', type=int, default=128) 30 | train_parser.add_argument('--n_superbatch', type=int, default=1280) 31 | 32 | return parser 33 | 34 | # ---------------------------------------------------------------------------- 35 | 36 | def train(args): 37 | import models 38 | import numpy as np 39 | np.random.seed(1234) 40 | 41 | if args.dataset == 'mnist': 42 | n_dim, n_out, n_channels = 28, 10, 1 43 | X_train, Y_train, X_val, Y_val, _, _ = data.load_mnist() 44 | elif args.dataset == 'binmnist': 45 | n_dim, n_out, n_channels = 28, 10, 1 46 | X_train, X_val, _ = data.load_mnist_binarized() 47 | X_train = X_train.reshape((-1, 1, 28, 28)) 48 | X_val = X_val.reshape((-1, 1, 28, 28)) 49 | Y_train = np.empty((X_train.shape[0],), dtype='int32') 50 | Y_val = np.empty((X_val.shape[0],), dtype='int32') 51 | elif args.dataset == 'omniglot': 52 | n_dim, n_out, n_channels = 28, 10, 1 53 | X_train, Y_train, X_val, Y_val = data.load_omniglot_iwae() 54 | X_train = X_train.reshape((-1, 1, 28, 28)) 55 | X_val = X_val.reshape((-1, 1, 28, 28)) 56 | Y_train = np.empty((X_train.shape[0],), dtype='int32') 57 | Y_val = np.empty((X_val.shape[0],), dtype='int32') 58 | elif args.dataset == 'digits': 59 | n_dim, n_out, n_channels = 8, 10, 1 60 | X_train, Y_train, X_val, Y_val, _, _ = data.load_digits() 61 | else: 62 | X_train, Y_train = 
data.load_h5(args.train) 63 | X_val, Y_val = data.load_h5(args.test) 64 | 65 | # also get the data dimensions 66 | print 'dataset loaded.' 67 | 68 | # set up optimization params 69 | p = { 'lr' : args.lr, 'b1': args.b1, 'b2': args.b2, 'nb': args.n_batch } 70 | 71 | # create model 72 | if args.model == 'vae': 73 | model = models.VAE( 74 | n_dim=n_dim, n_out=n_out, n_chan=n_channels, 75 | n_superbatch=args.n_superbatch, opt_alg=args.alg, opt_params=p, 76 | ) 77 | elif args.model == 'discrete-vae': 78 | model = models.SBN( 79 | n_dim=n_dim, n_out=n_out, n_chan=n_channels, 80 | n_superbatch=args.n_superbatch, opt_alg=args.alg, opt_params=p, 81 | ) 82 | elif args.model == 'discrete-vae-rbm': 83 | model = models.USBN( 84 | n_dim=n_dim, n_out=n_out, n_chan=n_channels, 85 | n_superbatch=args.n_superbatch, opt_alg=args.alg, opt_params=p, 86 | ) 87 | elif args.model == 'adgm': 88 | model = models.ADGM( 89 | n_dim=n_dim, n_out=n_out, n_chan=n_channels, 90 | n_superbatch=args.n_superbatch, opt_alg=args.alg, opt_params=p, 91 | ) 92 | elif args.model == 'discrete-adgm': 93 | model = models.DADGM( 94 | n_dim=n_dim, n_out=n_out, n_chan=n_channels, 95 | n_superbatch=args.n_superbatch, opt_alg=args.alg, opt_params=p, 96 | ) 97 | elif args.model == 'discrete-adgm-rbm': 98 | model = models.UDADGM( 99 | n_dim=n_dim, n_out=n_out, n_chan=n_channels, 100 | n_superbatch=args.n_superbatch, opt_alg=args.alg, opt_params=p, 101 | ) 102 | elif args.model == 'rbm': 103 | model = models.RBM( 104 | n_dim=n_dim, n_out=n_out, n_chan=n_channels, 105 | n_superbatch=args.n_superbatch, opt_alg=args.alg, opt_params=p, 106 | ) 107 | elif args.model == 'vrbm': 108 | model = models.VariationalRBM( 109 | n_dim=n_dim, n_out=n_out, n_chan=n_channels, 110 | n_superbatch=args.n_superbatch, opt_alg=args.alg, opt_params=p, 111 | ) 112 | elif args.model == 'avrbm': 113 | model = models.AuxiliaryVariationalRBM( 114 | n_dim=n_dim, n_out=n_out, n_chan=n_channels, 115 | n_superbatch=args.n_superbatch, opt_alg=args.alg, opt_params=p, 116 | ) 117 | else: 118 | raise ValueError('Invalid model') 119 | 120 | if args.pkl: 121 | model.load(args.pkl) 122 | 123 | # generate samples 124 | samples = model.hallucinate_chain() 125 | 126 | # plot them 127 | plt.figure(figsize=(5, 5)) 128 | plt.imshow(samples, cmap=plt.cm.gray, interpolation='none') 129 | plt.title('Hallucinated Samples') 130 | plt.tight_layout() 131 | plt.savefig(args.plotname) 132 | 133 | exit('hello') 134 | 135 | # train model 136 | model.fit( 137 | X_train, Y_train, X_val, Y_val, 138 | n_epoch=args.epochs, n_batch=args.n_batch, 139 | logname=args.logname 140 | ) 141 | 142 | # generate samples 143 | samples = model.hallucinate() 144 | 145 | # plot them 146 | plt.figure(figsize=(5, 5)) 147 | plt.imshow(samples, cmap=plt.cm.gray, interpolation='none') 148 | plt.title('Hallucinated Samples') 149 | plt.tight_layout() 150 | plt.savefig(args.plotname) 151 | 152 | def main(): 153 | parser = make_parser() 154 | args = parser.parse_args() 155 | args.func(args) 156 | 157 | if __name__ == '__main__': 158 | main() 159 | -------------------------------------------------------------------------------- /models/vae.py: -------------------------------------------------------------------------------- 1 | import pdb 2 | import time 3 | import pickle 4 | import numpy as np 5 | 6 | import theano 7 | import theano.tensor as T 8 | import lasagne 9 | 10 | from model import Model 11 | from layers import GaussianSampleLayer 12 | 13 | 14 | class VAE(Model): 15 | """Variational Autoencoder with Gaussian visible and 
latent variables""" 16 | def __init__( 17 | self, n_dim, n_out, n_chan=1, n_superbatch=12800, model='bernoulli', 18 | opt_alg='adam', opt_params={'lr' : 1e-3, 'b1': 0.9, 'b2': 0.99} 19 | ): 20 | # save model that wil be created 21 | self.model = model 22 | # invoke parent constructor 23 | Model.__init__(self, n_dim, n_chan, n_out, n_superbatch, opt_alg, opt_params) 24 | 25 | def create_model(self, X, Y, n_dim, n_out, n_chan=1): 26 | # params 27 | n_lat = 64 # latent stochastic variables 28 | n_hid = 100 # size of hidden layer in encoder/decoder 29 | n_out = n_dim * n_dim * n_chan # total dimensionality of output 30 | hid_nl = lasagne.nonlinearities.tanh if self.model == 'bernoulli' \ 31 | else T.nnet.softplus 32 | 33 | # create the encoder network 34 | l_q_in = lasagne.layers.InputLayer( 35 | shape=(None, n_chan, n_dim, n_dim), 36 | input_var=X, 37 | ) 38 | l_q_hid = lasagne.layers.DenseLayer( 39 | l_q_in, num_units=n_hid, 40 | W=lasagne.init.Orthogonal(), 41 | b=lasagne.init.Constant(0.0), 42 | nonlinearity=hid_nl, 43 | name='q_hid', 44 | ) 45 | l_q_mu = lasagne.layers.DenseLayer( 46 | l_q_hid, num_units=n_lat, 47 | W=lasagne.init.Orthogonal(), 48 | b=lasagne.init.Constant(0.0), 49 | nonlinearity=None, 50 | name='q_mu', 51 | ) 52 | l_q_logsigma = lasagne.layers.DenseLayer( 53 | l_q_hid, num_units=n_lat, 54 | W=lasagne.init.Orthogonal(), 55 | b=lasagne.init.Constant(0.0), 56 | nonlinearity=None, 57 | name='q_logsigma', 58 | ) 59 | l_q = GaussianSampleLayer(l_q_mu, l_q_logsigma) 60 | 61 | # create the decoder network 62 | l_p_in = lasagne.layers.InputLayer((None, n_lat)) 63 | l_p_hid = lasagne.layers.DenseLayer( 64 | l_p_in, num_units=n_hid, 65 | nonlinearity=hid_nl, 66 | W=lasagne.init.Orthogonal(), 67 | b=lasagne.init.Constant(0.0), 68 | name='p_hid', 69 | ) 70 | l_p_mu, l_p_logsigma = None, None 71 | 72 | if self.model == 'bernoulli': 73 | l_p_mu = lasagne.layers.DenseLayer( 74 | l_p_hid, num_units=n_out, 75 | nonlinearity = lasagne.nonlinearities.sigmoid, 76 | W=lasagne.init.Orthogonal(), 77 | b=lasagne.init.Constant(0.0), 78 | name='p_sigma', 79 | ) 80 | elif self.model == 'gaussian': 81 | l_p_mu = lasagne.layers.DenseLayer( 82 | l_p_hid, num_units=n_out, 83 | nonlinearity=None, 84 | ) 85 | 86 | # relu_shift is for numerical stability - if training data has any 87 | # dimensions where stdev=0, allowing logsigma to approach -inf 88 | # will cause the loss function to become NAN. 
So we set the limit 89 | # stdev >= exp(-1 * relu_shift) 90 | relu_shift = 10 91 | l_p_logsigma = lasagne.layers.DenseLayer( 92 | l_p_hid, num_units=n_out, 93 | nonlinearity = lambda a: T.nnet.relu(a+relu_shift)-relu_shift 94 | ) 95 | 96 | self.input_layers = (l_q_in, l_p_in) 97 | self.n_lat = n_lat 98 | self.n_hid = n_hid 99 | 100 | return l_p_mu, l_p_logsigma, l_q_mu, l_q_logsigma, l_q 101 | 102 | def create_objectives(self, deterministic=False): 103 | # load network input 104 | X = self.inputs[0] 105 | x = X.flatten(2) 106 | 107 | l_p_mu, l_p_logsigma, l_q_mu, l_q_logsigma, l_q = self.network 108 | l_q_in, l_p_in = self.input_layers 109 | 110 | # load network output 111 | q_mu, q_logsigma, q_sample = lasagne.layers.get_output( 112 | [l_q_mu, l_q_logsigma, l_q], 113 | deterministic=deterministic, 114 | ) 115 | if self.model == 'bernoulli': 116 | p_mu = lasagne.layers.get_output( 117 | l_p_mu, {l_p_in : q_sample}, 118 | deterministic=deterministic, 119 | ) 120 | elif self.model == 'gaussian': 121 | p_mu, p_logsigma = lasagne.layers.get_output( 122 | [l_p_mu, l_p_logsigma], 123 | {l_p_in : q_sample}, 124 | deterministic=deterministic, 125 | ) 126 | 127 | # first term of the ELBO: kl-divergence (using the closed form expression) 128 | kl_div = 0.5 * T.sum(1 + 2 * q_logsigma - T.sqr(q_mu) 129 | - T.exp(2 * q_logsigma), axis=1).mean() 130 | 131 | # second term: log-likelihood of the data under the model 132 | if self.model == 'bernoulli': 133 | logpxz = -lasagne.objectives.binary_crossentropy( 134 | p_mu, x, 135 | ).sum(axis=1).mean() 136 | elif self.model == 'gaussian': 137 | def log_lik(x, mu, log_sig): 138 | return T.sum(-(np.float32(0.5 * np.log(2 * np.pi)) + log_sig) 139 | - 0.5 * T.sqr(x - mu) / T.exp(2 * log_sig), axis=1) 140 | logpxz = log_lik(x, p_mu, p_logsigma).mean() 141 | 142 | loss = -1 * (logpxz + kl_div) 143 | # we don't use the spearate accuracy metric right now 144 | return loss, -kl_div 145 | 146 | def gen_samples(self, deterministic=False): 147 | s = self.inputs[-1] 148 | # put it through the decoder 149 | _, l_p_in = self.input_layers 150 | l_p_mu = self.network[0] 151 | p_mu = lasagne.layers.get_output(l_p_mu, {l_p_in : s}) 152 | 153 | return p_mu 154 | 155 | def gen_noise(self, size, n_lat): 156 | noise = np.random.randn(size, n_lat) 157 | return noise 158 | 159 | def get_params(self): 160 | if self.model == 'bernoulli': 161 | l_p_mu, _, _, _, l_q = self.network 162 | elif self.model == 'gaussian': 163 | raise NotImplementedError() 164 | p_params = lasagne.layers.get_all_params(l_p_mu, trainable=True) 165 | q_params = lasagne.layers.get_all_params(l_q, trainable=True) 166 | 167 | return p_params + q_params 168 | -------------------------------------------------------------------------------- /models/adgm.py: -------------------------------------------------------------------------------- 1 | import time 2 | import pickle 3 | import numpy as np 4 | 5 | import theano 6 | import theano.tensor as T 7 | import lasagne 8 | 9 | from model import Model 10 | from layers import GaussianSampleLayer, GaussianMultiSampleLayer 11 | from layers.shape import RepeatLayer 12 | from distributions import log_bernoulli, log_normal, log_normal2 13 | 14 | 15 | class ADGM(Model): 16 | """Auxiliary Deep Generative Model (unsupervised version)""" 17 | def __init__( 18 | self, n_dim, n_out, n_chan=1, n_superbatch=12800, model='bernoulli', 19 | opt_alg='adam', opt_params={'lr' : 1e-3, 'b1': 0.9, 'b2': 0.99} 20 | ): 21 | # save model that wil be created 22 | self.model = model 23 | self.n_sample = 
1 # adjustable parameter, though 1 works best in practice 24 | Model.__init__(self, n_dim, n_chan, n_out, n_superbatch, opt_alg, opt_params) 25 | 26 | def create_model(self, X, Y, n_dim, n_out, n_chan=1): 27 | # params 28 | n_lat = 200 # latent stochastic variables 29 | n_aux = 10 # auxiliary variables 30 | n_hid = 499 # size of hidden layer in encoder/decoder 31 | n_sam = self.n_sample # number of monte-carlo samples 32 | n_out = n_dim * n_dim * n_chan # total dimensionality of ouput 33 | hid_nl = lasagne.nonlinearities.rectify 34 | relu_shift = lambda av: T.nnet.relu(av + 10) - 10 # for numerical stability 35 | 36 | # create the encoder network 37 | # create q(a|x) 38 | l_qa_in = lasagne.layers.InputLayer( 39 | shape=(None, n_chan, n_dim, n_dim), 40 | input_var=X, 41 | ) 42 | l_qa_hid = lasagne.layers.DenseLayer( 43 | l_qa_in, num_units=n_hid, 44 | W=lasagne.init.GlorotNormal('relu'), 45 | b=lasagne.init.Normal(1e-3), 46 | nonlinearity=hid_nl, 47 | ) 48 | l_qa_mu = lasagne.layers.DenseLayer( 49 | l_qa_hid, num_units=n_aux, 50 | W=lasagne.init.GlorotNormal(), 51 | b=lasagne.init.Normal(1e-3), 52 | nonlinearity=None, 53 | ) 54 | l_qa_logsigma = lasagne.layers.DenseLayer( 55 | l_qa_hid, num_units=n_aux, 56 | W=lasagne.init.GlorotNormal(), 57 | b=lasagne.init.Normal(1e-3), 58 | nonlinearity=relu_shift, 59 | ) 60 | # repeatedly sample 61 | l_qa_mu = lasagne.layers.ReshapeLayer( 62 | RepeatLayer(l_qa_mu, n_ax=1, n_rep=n_sam), 63 | shape=(-1, n_aux), 64 | ) 65 | l_qa_logsigma = lasagne.layers.ReshapeLayer( 66 | RepeatLayer(l_qa_logsigma, n_ax=1, n_rep=n_sam), 67 | shape=(-1, n_aux), 68 | ) 69 | l_qa = GaussianSampleLayer(l_qa_mu, l_qa_logsigma) 70 | 71 | # create q(z|a,x) 72 | l_qz_hid1a = lasagne.layers.DenseLayer( 73 | l_qa, num_units=n_hid, 74 | W=lasagne.init.GlorotNormal('relu'), 75 | b=lasagne.init.Normal(1e-3), 76 | nonlinearity=hid_nl, 77 | ) 78 | l_qz_hid1b = lasagne.layers.DenseLayer( 79 | l_qa_in, num_units=n_hid, 80 | W=lasagne.init.GlorotNormal('relu'), 81 | b=lasagne.init.Normal(1e-3), 82 | nonlinearity=hid_nl, 83 | ) 84 | l_qz_hid1b = lasagne.layers.ReshapeLayer( 85 | RepeatLayer(l_qz_hid1b, n_ax=1, n_rep=n_sam), 86 | shape=(-1, n_hid), 87 | ) 88 | l_qz_hid2 = lasagne.layers.ElemwiseSumLayer([l_qz_hid1a, l_qz_hid1b]) 89 | l_qz_hid2 = lasagne.layers.NonlinearityLayer(l_qz_hid2, hid_nl) 90 | l_qz_mu = lasagne.layers.DenseLayer( 91 | l_qz_hid2, num_units=n_lat, 92 | W=lasagne.init.GlorotNormal(), 93 | b=lasagne.init.Normal(1e-3), 94 | nonlinearity=None, 95 | ) 96 | l_qz_logsigma = lasagne.layers.DenseLayer( 97 | l_qz_hid2, num_units=n_lat, 98 | W=lasagne.init.GlorotNormal(), 99 | b=lasagne.init.Normal(1e-3), 100 | nonlinearity=relu_shift, 101 | ) 102 | l_qz = GaussianSampleLayer(l_qz_mu, l_qz_logsigma) 103 | 104 | # create the decoder network 105 | # create p(x|z) 106 | l_px_in = lasagne.layers.InputLayer((None, n_lat)) 107 | l_px_hid = lasagne.layers.DenseLayer( 108 | l_px_in, num_units=n_hid, 109 | W=lasagne.init.GlorotNormal('relu'), 110 | b=lasagne.init.Normal(1e-3), 111 | nonlinearity=hid_nl, 112 | ) 113 | l_px_mu, l_px_logsigma = None, None 114 | 115 | if self.model == 'bernoulli': 116 | l_px_mu = lasagne.layers.DenseLayer( 117 | l_px_hid, num_units=n_out, 118 | nonlinearity = lasagne.nonlinearities.sigmoid, 119 | W=lasagne.init.GlorotUniform(), 120 | b=lasagne.init.Normal(1e-3), 121 | ) 122 | elif self.model == 'gaussian': 123 | l_px_mu = lasagne.layers.DenseLayer( 124 | l_px_hid, num_units=n_out, 125 | nonlinearity=None, 126 | ) 127 | l_px_logsigma = 
lasagne.layers.DenseLayer( 128 | l_px_hid, num_units=n_out, 129 | nonlinearity=relu_shift, 130 | ) 131 | 132 | # create p(a|z) 133 | l_pa_hid = lasagne.layers.DenseLayer( 134 | l_px_in, num_units=n_hid, 135 | nonlinearity=hid_nl, 136 | W=lasagne.init.GlorotNormal('relu'), 137 | b=lasagne.init.Normal(1e-3), 138 | ) 139 | l_pa_mu = lasagne.layers.DenseLayer( 140 | l_pa_hid, num_units=n_aux, 141 | W=lasagne.init.GlorotNormal(), 142 | b=lasagne.init.Normal(1e-3), 143 | nonlinearity=None, 144 | ) 145 | l_pa_logsigma = lasagne.layers.DenseLayer( 146 | l_pa_hid, num_units=n_aux, 147 | W=lasagne.init.GlorotNormal(), 148 | b=lasagne.init.Normal(1e-3), 149 | nonlinearity=relu_shift, 150 | ) 151 | 152 | self.input_layers = (l_qa_in, l_px_in) 153 | self.n_lat = n_lat 154 | self.n_hid = n_hid 155 | 156 | return l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \ 157 | l_qz_mu, l_qz_logsigma, l_qa_mu, l_qa_logsigma, \ 158 | l_qa, l_qz 159 | 160 | def create_objectives(self, deterministic=False): 161 | # load network input 162 | X = self.inputs[0] 163 | x = X.flatten(2) 164 | 165 | # duplicate entries to take into account multiple mc samples 166 | n_sam = self.n_sample 167 | n_out = x.shape[1] 168 | x = x.dimshuffle(0,'x',1).repeat(n_sam, axis=1).reshape((-1, n_out)) 169 | 170 | # load network 171 | l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \ 172 | l_qz_mu, l_qz_logsigma, l_qa_mu, l_qa_logsigma, \ 173 | l_qa, l_qz = self.network 174 | l_qa_in, l_px_in = self.input_layers 175 | 176 | # load network output 177 | qz_mu, qz_logsigma, qa_mu, qa_logsigma, a, z \ 178 | = lasagne.layers.get_output( 179 | [l_qz_mu, l_qz_logsigma, l_qa_mu, l_qa_logsigma, l_qa, l_qz], 180 | deterministic=deterministic, 181 | ) 182 | pa_mu, pa_logsigma = lasagne.layers.get_output( 183 | [l_pa_mu, l_pa_logsigma], 184 | {l_px_in: z}, 185 | deterministic=deterministic, 186 | ) 187 | 188 | if self.model == 'bernoulli': 189 | px_mu = lasagne.layers.get_output( 190 | l_px_mu, {l_px_in: z}, 191 | deterministic=deterministic 192 | ) 193 | elif self.model == 'gaussian': 194 | px_mu, px_logsigma = lasagne.layers.get_output( 195 | [l_px_mu, l_px_logsigma], 196 | {l_px_in: z}, 197 | deterministic=deterministic, 198 | ) 199 | 200 | # entropy term 201 | log_qa_given_x = log_normal2(a, qa_mu, qa_logsigma).sum(axis=1) 202 | log_qz_given_ax = log_normal2(z, qz_mu, qz_logsigma).sum(axis=1) 203 | log_qza_given_x = log_qz_given_ax + log_qa_given_x 204 | 205 | # log-probability term 206 | z_prior_sigma = T.cast(T.ones_like(qz_logsigma), dtype=theano.config.floatX) 207 | z_prior_mu = T.cast(T.zeros_like(qz_mu), dtype=theano.config.floatX) 208 | log_pz = log_normal(z, z_prior_mu, z_prior_sigma).sum(axis=1) 209 | log_pa_given_z = log_normal2(a, pa_mu, pa_logsigma).sum(axis=1) 210 | 211 | if self.model == 'bernoulli': 212 | log_px_given_z = log_bernoulli(x, px_mu).sum(axis=1) 213 | elif self.model == 'gaussian': 214 | log_px_given_z = log_normal2(x, px_mu, px_logsigma).sum(axis=1) 215 | 216 | log_paxz = log_pa_given_z + log_px_given_z + log_pz 217 | 218 | # compute the evidence lower bound 219 | elbo = T.mean(log_paxz - log_qza_given_x) 220 | 221 | # we don't use a spearate accuracy metric right now 222 | return -elbo, T.mean(qz_logsigma) 223 | 224 | def create_gradients(self, loss, deterministic=False): 225 | grads = Model.create_gradients(self, loss, deterministic) 226 | 227 | # combine and clip gradients 228 | clip_grad = 1 229 | max_norm = 5 230 | mgrads = lasagne.updates.total_norm_constraint(grads, max_norm=max_norm) 231 | cgrads = [T.clip(g, 
-clip_grad, clip_grad) for g in mgrads] 232 | 233 | return cgrads 234 | 235 | def gen_samples(self, deterministic=False): 236 | s = self.inputs[-1] 237 | # put it through the decoder 238 | _, l_px_in = self.input_layers 239 | l_px_mu = self.network[0] 240 | px_mu = lasagne.layers.get_output(l_px_mu, {l_px_in : s}) 241 | 242 | return px_mu 243 | 244 | def gen_noise(self, size, n_lat): 245 | noise = np.random.randn(size, n_lat) 246 | return noise 247 | 248 | def get_params(self): 249 | l_px_mu, _, l_pa_mu, l_pa_logsigma, \ 250 | _, _, _, _, l_qa, l_qz = self.network 251 | 252 | p_params = lasagne.layers.get_all_params( 253 | [l_px_mu, l_pa_mu, l_pa_logsigma], 254 | trainable=True, 255 | ) 256 | qa_params = lasagne.layers.get_all_params(l_qa, trainable=True) 257 | qz_params = lasagne.layers.get_all_params(l_qz, trainable=True) 258 | 259 | return p_params + qa_params + qz_params 260 | -------------------------------------------------------------------------------- /models/sbn.py: -------------------------------------------------------------------------------- 1 | import time 2 | import pickle 3 | import numpy as np 4 | from collections import OrderedDict 5 | 6 | import theano 7 | import theano.tensor as T 8 | from theano.gradient import disconnected_grad as dg 9 | import lasagne 10 | 11 | from model import Model 12 | 13 | from layers import BernoulliSampleLayer 14 | from distributions import log_bernoulli 15 | from helpers import Tlogsumexp 16 | 17 | from theano.tensor.shared_randomstreams import RandomStreams 18 | 19 | # ---------------------------------------------------------------------------- 20 | 21 | class SBN(Model): 22 | """Sigmoid Belief Network trained using Neural Variational Inference 23 | Epoch 200 of 200 took 26.052s (192 minibatches) 24 | training loss/acc: 125.989901 107.652437 25 | validation loss/acc: 126.220432 108.006230 26 | """ 27 | 28 | def __init__( 29 | self, n_dim, n_out, n_chan=1, n_superbatch=12800, 30 | opt_alg='adam', opt_params={'lr': 1e-3, 'b1': 0.9, 'b2': 0.99} 31 | ): 32 | # invoke parent constructor 33 | Model.__init__(self, n_dim, n_chan, n_out, n_superbatch, opt_alg, opt_params) 34 | 35 | # random number generators 36 | self.numpy_rng = np.random.RandomState(1234) 37 | self.theano_rng = RandomStreams(self.numpy_rng.randint(2 ** 30)) 38 | 39 | (loss, acc) = self.objectives 40 | llik = self.create_llik() 41 | (X, Y, idx1, idx2, S) = self.inputs 42 | # self.llik = theano.function( 43 | # [idx1, idx2], llik, updates=self.updates, 44 | # {X: self.train_set_x[idx1:idx2], Y: self.train_set_y[idx1:idx2]} 45 | # ) 46 | self.loss = theano.function( 47 | [X, Y], [loss, llik], 48 | on_unused_input='warn', 49 | ) 50 | 51 | def create_model(self, X, Y, n_dim, n_out, n_chan=1): 52 | # params 53 | n_lat = 200 # latent stochastic variables 54 | n_hid = 500 # size of hidden layer in encoder/decoder 55 | n_hid_cv = 500 # size of hidden layer in control variate net 56 | n_out = n_dim * n_dim * n_chan # total dimensionality of ouput 57 | hid_nl = lasagne.nonlinearities.tanh 58 | 59 | # create the encoder network 60 | l_q_in = lasagne.layers.InputLayer( 61 | shape=(None, n_chan, n_dim, n_dim), 62 | input_var=X, 63 | ) 64 | l_q_hid = lasagne.layers.DenseLayer( 65 | l_q_in, num_units=n_hid, 66 | nonlinearity=hid_nl, 67 | ) 68 | l_q_out = lasagne.layers.DenseLayer( 69 | l_q_hid, num_units=n_lat, 70 | nonlinearity=None, 71 | ) 72 | l_q_mu = lasagne.layers.DenseLayer( 73 | l_q_hid, num_units=n_lat, 74 | nonlinearity=T.nnet.sigmoid, 75 | ) 76 | l_q_sample = 
BernoulliSampleLayer(l_q_mu) 77 | 78 | # create the decoder network 79 | # note that we currently only handle Bernoulli x variables 80 | l_p_in = lasagne.layers.InputLayer((None, n_lat)) 81 | l_p_hid = lasagne.layers.DenseLayer( 82 | l_p_in, num_units=n_hid, 83 | nonlinearity=hid_nl, 84 | W=lasagne.init.GlorotUniform(), 85 | ) 86 | l_p_mu = lasagne.layers.DenseLayer(l_p_hid, num_units=n_out, 87 | nonlinearity = lasagne.nonlinearities.sigmoid, 88 | W=lasagne.init.GlorotUniform(), 89 | b=lasagne.init.Constant(0.), 90 | ) 91 | 92 | # create control variate (baseline) network 93 | l_cv_in = lasagne.layers.InputLayer( 94 | shape=(None, n_chan, n_dim, n_dim), 95 | input_var=X, 96 | ) 97 | l_cv_hid = lasagne.layers.DenseLayer( 98 | l_cv_in, num_units=n_hid_cv, 99 | nonlinearity=hid_nl, 100 | ) 101 | l_cv = lasagne.layers.DenseLayer( 102 | l_cv_hid, num_units=1, 103 | nonlinearity=None, 104 | ) 105 | 106 | # create variables for centering signal 107 | c = theano.shared(np.zeros((1,1), dtype=np.float64), broadcastable=(True,True)) 108 | v = theano.shared(np.zeros((1,1), dtype=np.float64), broadcastable=(True,True)) 109 | 110 | self.input_layers = (l_q_in, l_p_in, l_cv_in) 111 | self.n_lat = n_lat 112 | self.n_hid = n_hid 113 | 114 | return l_p_mu, l_q_mu, l_q_sample, l_cv, c, v 115 | 116 | def _create_components(self, deterministic=False): 117 | # load network input 118 | X = self.inputs[0] 119 | x = X.flatten(2) 120 | 121 | # load networks 122 | l_p_mu, l_q_mu, l_q_sample, _, _, _ = self.network 123 | l_q_in, l_p_in, l_cv_in = self.input_layers 124 | 125 | # load network output 126 | z, q_mu = lasagne.layers.get_output( 127 | [l_q_sample, l_q_mu], deterministic=deterministic) 128 | p_mu = lasagne.layers.get_output( 129 | l_p_mu, {l_p_in: z}, 130 | deterministic=deterministic, 131 | ) 132 | 133 | # entropy term 134 | log_qz_given_x = log_bernoulli(dg(z), q_mu).sum(axis=1) 135 | 136 | # expected p(x,z) term 137 | z_prior = T.ones_like(z)*np.float32(0.5) 138 | log_pz = log_bernoulli(z, z_prior).sum(axis=1) 139 | log_px_given_z = log_bernoulli(x, p_mu).sum(axis=1) 140 | log_pxz = log_pz + log_px_given_z 141 | 142 | # save them for later 143 | self.log_pxz = log_pxz 144 | self.log_qz_given_x = log_qz_given_x 145 | 146 | return log_pxz.flatten(), log_qz_given_x.flatten() 147 | 148 | def create_objectives(self, deterministic=False): 149 | # load probabilities 150 | log_pxz, log_qz_given_x = self._create_components(deterministic=deterministic) 151 | 152 | # compute the lower bound 153 | elbo = T.mean(log_pxz - log_qz_given_x) 154 | 155 | # we don't use the second accuracy metric right now 156 | return -elbo, -T.mean(log_qz_given_x) 157 | 158 | def create_gradients(self, loss, deterministic=False): 159 | from theano.gradient import disconnected_grad as dg 160 | 161 | # load networks 162 | l_p_mu, l_q_mu, _, l_cv, c, v = self.network 163 | 164 | # load params 165 | p_params = lasagne.layers.get_all_params(l_p_mu, trainable=True) 166 | q_params = lasagne.layers.get_all_params(l_q_mu, trainable=True) 167 | cv_params = lasagne.layers.get_all_params(l_cv, trainable=True) 168 | 169 | # load neural net outputs (probabilities have been precomputed) 170 | log_pxz, log_qz_given_x = self.log_pxz, self.log_qz_given_x 171 | cv = T.addbroadcast(lasagne.layers.get_output(l_cv),1) 172 | 173 | # compute learning signals 174 | l = log_pxz - log_qz_given_x - cv 175 | l_avg, l_var = l.mean(), l.var() 176 | c_new = 0.8*c + 0.2*l_avg 177 | v_new = 0.8*v + 0.2*l_var 178 | l = (l - c_new) / T.maximum(1, T.sqrt(v_new)) 179 | 180 | 
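        # (added note) l above is the NVIL-style learning signal: the ELBO integrand
        # log p(x,z) - log q(z|x) minus the input-dependent baseline cv, centered and scaled
        # by running estimates (c, v) of its mean and variance; it multiplies log q(z|x)
        # below to form a variance-reduced score-function (REINFORCE) gradient for q.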
# compute grad wrt p 181 | p_grads = T.grad(-log_pxz.mean(), p_params) 182 | 183 | # compute grad wrt q 184 | q_target = T.mean(dg(l) * log_qz_given_x) 185 | q_grads = T.grad(-0.2*q_target, q_params) # 5x slower rate for q 186 | 187 | # compute grad of cv net 188 | cv_target = T.mean(l**2) 189 | cv_grads = T.grad(cv_target, cv_params) 190 | 191 | # combine and clip gradients 192 | clip_grad = 1 193 | max_norm = 5 194 | grads = p_grads + q_grads + cv_grads 195 | mgrads = lasagne.updates.total_norm_constraint(grads, max_norm=max_norm) 196 | cgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads] 197 | 198 | return cgrads 199 | 200 | def gen_samples(self, deterministic=False): 201 | s = self.inputs[-1] 202 | # put it through the decoder 203 | _, l_p_in, _ = self.input_layers 204 | l_p_mu = self.network[0] 205 | p_mu = lasagne.layers.get_output(l_p_mu, {l_p_in : s}) 206 | 207 | return p_mu 208 | 209 | def get_params(self): 210 | l_p_mu, l_q_mu, _, l_cv, _, _ = self.network 211 | p_params = lasagne.layers.get_all_params(l_p_mu, trainable=True) 212 | q_params = lasagne.layers.get_all_params(l_q_mu, trainable=True) 213 | cv_params = lasagne.layers.get_all_params(l_cv, trainable=True) 214 | return p_params + q_params + cv_params #+ [c] 215 | 216 | def create_updates(self, grads, params, alpha, opt_alg, opt_params): 217 | # call super-class to generate SGD/ADAM updates 218 | grad_updates = Model.create_updates( 219 | self, grads, params, alpha, opt_alg, opt_params, 220 | ) 221 | 222 | # create updates for centering signal 223 | 224 | # load neural net outputs (probabilities have been precomputed) 225 | _, _, _, l_cv, c, v = self.network 226 | log_pxz, log_qz_given_x = self.log_pxz, self.log_qz_given_x 227 | cv = T.addbroadcast(lasagne.layers.get_output(l_cv),1) 228 | 229 | # compute learning signals 230 | l = log_pxz - log_qz_given_x - cv 231 | l_avg, l_var = l.mean(), l.var() 232 | c_new = 0.8*c + 0.2*l_avg 233 | v_new = 0.8*v + 0.2*l_var 234 | 235 | # compute update for centering signal 236 | cv_updates = {c: c_new, v: v_new} 237 | 238 | return OrderedDict(grad_updates.items() + cv_updates.items()) 239 | 240 | def create_llik(self): 241 | # load inputs 242 | X = self.inputs[0] 243 | x = X.flatten(2) 244 | 245 | # load network params 246 | n_cat = self.n_lat 247 | n_rep = 10 248 | 249 | # load networks 250 | l_p_mu, l_q_mu, l_q_sample, _, _, _ = self.network 251 | l_q_in, l_p_in, l_cv_in = self.input_layers 252 | 253 | # load network output 254 | q_mu = lasagne.layers.get_output(l_q_mu) 255 | 256 | q_mu_rep = T.tile( q_mu.dimshuffle((0,'x',1)), reps=(1,n_rep,1) ) # (n_bat, n_rep, n_cat) 257 | q_sample_hard = self.theano_rng.binomial(size=q_mu_rep.shape, p=q_mu_rep, dtype=q_mu_rep.dtype) # (n_bat, n_rep, n_cat) 258 | q_sample_hard2 = q_sample_hard.reshape([100*n_rep, n_cat]) # (n_bat*n_rep, n_cat) 259 | 260 | p_mu = lasagne.layers.get_output(l_p_mu, {l_p_in: q_sample_hard2}) # (n_bat*n_rep, 784) 261 | x_rep = T.tile( x.dimshuffle((0,'x',1)), reps=(1,n_rep,1) ) # (n_bat, n_rep, 784) 262 | p_mu = T.reshape(p_mu, (100, n_rep, 784)) # (n_bat, n_rep, 784) 263 | 264 | # define the loss components 265 | log_p_x = log_bernoulli(x_rep, p_mu).sum(axis=2) # (n_bat, n_rep) 266 | z_prior = T.ones_like(q_sample_hard)*np.float32(0.5) # (n_bat, n_rep, n_cat) 267 | log_p_z = log_bernoulli(q_sample_hard, z_prior).sum(axis=2) # (n_bat, n_rep) 268 | log_q_z = log_bernoulli(q_sample_hard, q_mu_rep).sum(axis=2) # (n_bat, n_rep) 269 | 270 | # compute loss 271 | llik = Tlogsumexp( log_p_x + log_p_z - log_q_z, axis=1) 
# (n_bat,) 272 | 273 | return T.mean(llik) 274 | -------------------------------------------------------------------------------- /util/data.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import pickle 4 | import tarfile 5 | import numpy as np 6 | import urllib 7 | import zipfile 8 | import fnmatch 9 | import shutil 10 | import gzip 11 | import cPickle as cPkl 12 | import pickle as pkl 13 | 14 | 15 | def whiten(X_train, X_valid): 16 | offset = np.mean(X_train, 0) 17 | scale = np.std(X_train, 0).clip(min=1) 18 | X_train = (X_train - offset) / scale 19 | X_valid = (X_valid - offset) / scale 20 | return X_train, X_valid 21 | 22 | 23 | def load_cifar10(): 24 | """Download and extract the tarball from Alex's website.""" 25 | dest_directory = '.' 26 | DATA_URL = 'http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz' 27 | if not os.path.exists(dest_directory): 28 | os.makedirs(dest_directory) 29 | filename = DATA_URL.split('/')[-1] 30 | filepath = os.path.join(dest_directory, filename) 31 | if not os.path.exists(filepath): 32 | if sys.version_info[0] == 2: 33 | from urllib import urlretrieve 34 | else: 35 | from urllib.request import urlretrieve 36 | 37 | def _progress(count, block_size, total_size): 38 | sys.stdout.write('\r>> Downloading %s %.1f%%' % (filename, 39 | float(count * block_size) / float(total_size) * 100.0)) 40 | sys.stdout.flush() 41 | 42 | filepath, _ = urlretrieve(DATA_URL, filepath, _progress) 43 | print() 44 | statinfo = os.stat(filepath) 45 | print('Successfully downloaded', filename, statinfo.st_size, 'bytes.') 46 | tarfile.open(filepath, 'r:gz').extractall(dest_directory) 47 | 48 | def load_CIFAR_batch(filename): 49 | """ load single batch of cifar """ 50 | with open(filename, 'rb') as f: 51 | datadict = pickle.load(f) 52 | X = datadict['data'] 53 | Y = datadict['labels'] 54 | X = X.reshape(10000, 3, 32, 32).astype("float32") 55 | Y = np.array(Y, dtype=np.uint8) 56 | return X, Y 57 | 58 | xs, ys = [], [] 59 | for b in range(1,6): 60 | f = 'cifar-10-batches-py/data_batch_%d' % b 61 | X, Y = load_CIFAR_batch(f) 62 | xs.append(X) 63 | ys.append(Y) 64 | 65 | Xtr = np.concatenate(xs) 66 | Ytr = np.concatenate(ys) 67 | del X, Y 68 | Xte, Yte = load_CIFAR_batch('cifar-10-batches-py/test_batch') 69 | 70 | return Xtr, Ytr, Xte, Yte 71 | 72 | def load_mnist(): 73 | # We first define a download function, supporting both Python 2 and 3. 74 | if sys.version_info[0] == 2: 75 | from urllib import urlretrieve 76 | else: 77 | from urllib.request import urlretrieve 78 | 79 | def download(filename, source='http://yann.lecun.com/exdb/mnist/'): 80 | print "Downloading %s" % filename 81 | urlretrieve(source + filename, filename) 82 | 83 | # We then define functions for loading MNIST images and labels. 84 | # For convenience, they also download the requested files if needed. 85 | import gzip 86 | 87 | def load_mnist_images(filename): 88 | if not os.path.exists(filename): 89 | download(filename) 90 | 91 | # Read the inputs in Yann LeCun's binary format. 92 | with gzip.open(filename, 'rb') as f: 93 | data = np.frombuffer(f.read(), np.uint8, offset=16) 94 | 95 | # The inputs are vectors now, we reshape them to monochrome 2D images, 96 | # following the shape convention: (examples, channels, rows, columns) 97 | data = data.reshape(-1, 1, 28, 28) 98 | # The inputs come as bytes, we convert them to float32 in range [0,1]. 
99 | # (Actually to range [0, 255/256], for compatibility to the version 100 | # provided at http://deeplearning.net/data/mnist/mnist.pkl.gz.) 101 | return data / np.float32(256) 102 | 103 | def load_mnist_labels(filename): 104 | if not os.path.exists(filename): 105 | download(filename) 106 | # Read the labels in Yann LeCun's binary format. 107 | with gzip.open(filename, 'rb') as f: 108 | data = np.frombuffer(f.read(), np.uint8, offset=8) 109 | # The labels are vectors of integers now, that's exactly what we want. 110 | return data 111 | 112 | # We can now download and read the training and test set images and labels. 113 | X_train = load_mnist_images('train-images-idx3-ubyte.gz') 114 | y_train = load_mnist_labels('train-labels-idx1-ubyte.gz') 115 | X_test = load_mnist_images('t10k-images-idx3-ubyte.gz') 116 | y_test = load_mnist_labels('t10k-labels-idx1-ubyte.gz') 117 | 118 | # We reserve the last 10000 training examples for validation. 119 | X_train, X_val = X_train[:-10000], X_train[-10000:] 120 | y_train, y_val = y_train[:-10000], y_train[-10000:] 121 | 122 | # We just return all the arrays in order, as expected in main(). 123 | # (It doesn't matter how we do this as long as we can read them again.) 124 | return X_train, y_train, X_val, y_val, X_test, y_test 125 | 126 | def _download_omniglot_iwae(dataset): 127 | """ 128 | Download the MNIST dataset if it is not present. 129 | :return: The train, test and validation set. 130 | """ 131 | origin = ( 132 | 'https://github.com/yburda/iwae/raw/' 133 | 'master/datasets/OMNIGLOT/chardata.mat' 134 | ) 135 | print 'Downloading data from %s' % origin 136 | urllib.urlretrieve(origin, dataset + '/chardata.mat') 137 | 138 | def load_omniglot_iwae(dataset='./omniglot_iwae'): 139 | ''' 140 | Loads the real valued MNIST dataset 141 | :param dataset: path to dataset file 142 | :return: None 143 | ''' 144 | from scipy.io import loadmat 145 | if not os.path.exists(dataset): 146 | os.makedirs(dataset) 147 | _download_omniglot_iwae(dataset) 148 | 149 | data = loadmat(dataset+'/chardata.mat') 150 | 151 | train_x = data['data'].astype('float32').T 152 | train_t = np.argmax(data['target'].astype('float32').T,axis=1) 153 | train_char = data['targetchar'].astype('float32') 154 | test_x = data['testdata'].astype('float32').T 155 | test_t = np.argmax(data['testtarget'].astype('float32').T,axis=1) 156 | test_char = data['testtargetchar'].astype('float32') 157 | 158 | 159 | return train_x, train_t, test_x, test_t 160 | 161 | def _download_mnist_binarized(datapath): 162 | """ 163 | Download the fized binzarized MNIST dataset if it is not present. 164 | :return: The train, test and validation set. 165 | """ 166 | datafiles = { 167 | "train": "http://www.cs.toronto.edu/~larocheh/public/" 168 | "datasets/binarized_mnist/binarized_mnist_train.amat", 169 | "valid": "http://www.cs.toronto.edu/~larocheh/public/datasets/" 170 | "binarized_mnist/binarized_mnist_valid.amat", 171 | "test": "http://www.cs.toronto.edu/~larocheh/public/datasets/" 172 | "binarized_mnist/binarized_mnist_test.amat" 173 | } 174 | datasplits = {} 175 | for split in datafiles.keys(): 176 | print "Downloading %s data..." 
%(split) 177 | local_file = datapath + '/binarized_mnist_%s.npy'%(split) 178 | datasplits[split] = np.loadtxt(urllib.urlretrieve(datafiles[split])[0]) 179 | 180 | f = gzip.open(datapath +'/mnist.pkl.gz', 'w') 181 | pkl.dump([datasplits['train'],datasplits['valid'],datasplits['test']],f) 182 | 183 | # def load_mnist_binarized(dataset='./mnist_binarized/mnist.pkl.gz'): 184 | def load_mnist_binarized(dataset='./mnist_binarized/mnist.pkl'): 185 | ''' 186 | Loads the fixed binarized MNIST dataset provided by Hugo Larochelle. 187 | :param dataset: path to dataset file 188 | :return: None 189 | ''' 190 | if not os.path.isfile(dataset): 191 | datasetfolder = os.path.dirname(dataset) 192 | if not os.path.exists(datasetfolder): 193 | os.makedirs(datasetfolder) 194 | _download_mnist_binarized(datasetfolder) 195 | 196 | # f = gzip.open(dataset, 'rb') 197 | f = open(dataset, 'rb') 198 | x_train, x_valid, x_test = pkl.load(f) 199 | f.close() 200 | 201 | x_train = x_train.astype('float32') 202 | x_valid = x_valid.astype('float32') 203 | x_test = x_test.astype('float32') 204 | 205 | return x_train, x_valid, x_test 206 | 207 | def load_digits(): 208 | from sklearn.datasets import load_digits as _load_digits 209 | from sklearn.cross_validation import train_test_split 210 | 211 | digits = _load_digits() 212 | X = np.asarray(digits.data, 'float32') 213 | X, Y = nudge_dataset(X, digits.target) 214 | X = (X - np.min(X, 0)) / (np.max(X, 0) + 0.0001) # 0-1 scaling 215 | n, d2 = X.shape 216 | d = int(np.sqrt(d2)) 217 | X = X.reshape((n,1,d,d)) 218 | Y = np.array(Y, dtype=np.uint8) 219 | 220 | X_train, X_val, y_train, y_val = train_test_split(X, Y, 221 | test_size=0.2, 222 | random_state=0) 223 | # TODO: We don't use these right now 224 | X_test, y_test = None, None 225 | 226 | # We just return all the arrays in order, as expected in main(). 227 | # (It doesn't matter how we do this as long as we can read them again.) 228 | return X_train, y_train, X_val, y_val, X_test, y_test 229 | 230 | # def load_digits(): 231 | # from sklearn.datasets import load_digits as _load_digits 232 | # data = _load_digits() 233 | # X, y = data.images.astype('float32'), data.target.astype('int32') 234 | # X = X[:, np.newaxis, :, :] 235 | 236 | # # We reserve the last 300 / ~1800 for validation. 237 | # X_train, X_val = X[:-300], X[-300:] 238 | # y_train, y_val = y[:-300], y[-300:] 239 | 240 | # # TODO: We don't use these right now 241 | # X_test, y_test = None, None 242 | 243 | # # We just return all the arrays in order, as expected in main(). 244 | # # (It doesn't matter how we do this as long as we can read them again.) 
245 | # return X_train, y_train, X_val, y_val, X_test, y_test 246 | 247 | 248 | def split_semisup(X, y, n_lbl): 249 | n_tot = len(X) 250 | idx = np.random.permutation(n_tot) 251 | 252 | X_lbl = X[idx[:n_lbl]].copy() 253 | X_unl = X[idx[n_lbl:]].copy() 254 | y_lbl = y[idx[:n_lbl]].copy() 255 | y_unl = y[idx[n_lbl:]].copy() 256 | 257 | return X_lbl, y_lbl, X_unl, y_unl 258 | 259 | 260 | def load_noise(n=100,d=5): 261 | """For debugging""" 262 | X = np.random.randint(2,size=(n,1,d,d)).astype('float32') 263 | Y = np.random.randint(2,size=(n,)).astype(np.uint8) 264 | 265 | return X, Y 266 | 267 | 268 | def load_h5(h5_path): 269 | """This was untested""" 270 | import h5py 271 | # load training data 272 | with h5py.File(h5_path, 'r') as hf: 273 | print 'List of arrays in input file:', hf.keys() 274 | X = np.array(hf.get('data')) 275 | Y = np.array(hf.get('label')) 276 | print 'Shape of X: \n', X.shape 277 | print 'Shape of Y: \n', Y.shape 278 | 279 | return X, Y 280 | 281 | 282 | def nudge_dataset(X, Y): 283 | """ 284 | This produces a dataset 5 times bigger than the original one, 285 | by moving the 8x8 images in X around by 1px to left, right, down, up 286 | """ 287 | from scipy.ndimage import convolve 288 | direction_vectors = [ 289 | [[0, 1, 0], 290 | [0, 0, 0], 291 | [0, 0, 0]], 292 | 293 | [[0, 0, 0], 294 | [1, 0, 0], 295 | [0, 0, 0]], 296 | 297 | [[0, 0, 0], 298 | [0, 0, 1], 299 | [0, 0, 0]], 300 | 301 | [[0, 0, 0], 302 | [0, 0, 0], 303 | [0, 1, 0]]] 304 | 305 | shift = lambda x, w: convolve(x.reshape((8, 8)), mode='constant', 306 | weights=w).ravel() 307 | X = np.concatenate([X] + 308 | [np.apply_along_axis(shift, 1, X, vector) 309 | for vector in direction_vectors]) 310 | Y = np.concatenate([Y for _ in range(5)], axis=0) 311 | return X, Y 312 | -------------------------------------------------------------------------------- /models/model.py: -------------------------------------------------------------------------------- 1 | import time 2 | import pickle 3 | from collections import OrderedDict 4 | 5 | import numpy as np 6 | import theano 7 | import theano.tensor as T 8 | import lasagne 9 | 10 | from helpers import ( 11 | iterate_minibatch_idx, 12 | iterate_minibatches, 13 | evaluate, 14 | log_metrics, 15 | ) 16 | 17 | from theano.compile.nanguardmode import NanGuardMode 18 | 19 | 20 | class Model(object): 21 | """Model superclass that includes training code""" 22 | def __init__( 23 | self, n_dim, n_chan, n_out, n_superbatch, opt_alg, opt_params 24 | ): 25 | # create shared data variables 26 | train_set_x = theano.shared(np.empty( 27 | (n_superbatch, n_chan, n_dim, n_dim), 28 | dtype=theano.config.floatX), 29 | borrow=False, 30 | ) 31 | val_set_x = theano.shared(np.empty( 32 | (n_superbatch, n_chan, n_dim, n_dim), 33 | dtype=theano.config.floatX), 34 | borrow=False, 35 | ) 36 | 37 | # create y-variables 38 | train_set_y = theano.shared(np.empty( 39 | (n_superbatch,), dtype='int32'), borrow=False) 40 | val_set_y = theano.shared(np.empty( 41 | (n_superbatch,), dtype='int32'), borrow=False) 42 | # train_set_y_int = T.cast(train_set_y, 'int32') 43 | # val_set_y_int = T.cast(val_set_y, 'int32') 44 | 45 | # save data variables 46 | self.train_set_x = train_set_x 47 | self.train_set_y = train_set_y 48 | self.val_set_x = val_set_x 49 | self.val_set_y = val_set_y 50 | self.data_loaded = False 51 | 52 | # create input vars 53 | (X, Y, idx1, idx2, S) = self.create_inputs() 54 | self.inputs = (X, Y, idx1, idx2, S) 55 | 56 | # create lasagne model 57 | self.network = self.create_model(X, Y, n_dim, 
n_out, n_chan) 58 | 59 | # create objectives 60 | loss, acc = self.create_objectives(deterministic=False) 61 | loss_test, acc_test = self.create_objectives(deterministic=True) 62 | self.objectives = (loss, acc) 63 | self.objectives_test = (loss_test, acc_test) 64 | 65 | # load params 66 | params = self.get_params() 67 | 68 | # create hallucinations 69 | sample = self.gen_samples(deterministic=False) 70 | self.dream = theano.function([S], sample, on_unused_input='warn') 71 | 72 | # create gradients 73 | grads = self.create_gradients(loss, deterministic=False) 74 | grads_test = self.create_gradients(loss_test, deterministic=True) 75 | 76 | # create updates 77 | self.alpha = T.scalar(dtype=theano.config.floatX) # learning rate 78 | self.updates = self.create_updates( 79 | grads, params, self.alpha, opt_alg, opt_params 80 | ) 81 | 82 | # create givens 83 | my_givens = self.create_givens() 84 | 85 | # create methods for training / prediction 86 | mode = None 87 | # mode = NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False) 88 | self.train = theano.function( 89 | [idx1, idx2, self.alpha], [loss, acc], updates=self.updates, 90 | givens=my_givens, on_unused_input='warn', mode=mode 91 | ) 92 | self.train_batch = theano.function( 93 | [X, Y, self.alpha], [loss, acc], updates=self.updates, 94 | on_unused_input='warn', mode=mode 95 | ) 96 | self.loss = theano.function( 97 | [X, Y], [loss, acc], 98 | on_unused_input='warn', 99 | ) 100 | 101 | # TODO: implement a create_predictions method 102 | # self.predict = theano.function([X], P) 103 | 104 | # save config 105 | self.n_class = 2 106 | self.n_dim = n_dim 107 | self.n_out = n_out 108 | self.n_superbatch = n_superbatch 109 | self.alg = opt_alg 110 | 111 | # save neural network 112 | self.params = self.get_params() 113 | self.grads = (grads, grads_test) 114 | self.metrics = (loss, acc) 115 | 116 | def create_inputs(self): 117 | X = T.tensor4(dtype=theano.config.floatX) 118 | S = T.matrix(dtype=theano.config.floatX) 119 | Y = T.ivector() 120 | idx1, idx2 = T.lscalar(), T.lscalar() 121 | return X, Y, idx1, idx2, S 122 | 123 | def create_objectives(self, deterministic=False): 124 | # load network 125 | l_out = self.network 126 | Y = self.inputs[1] 127 | 128 | # create predictions 129 | P = lasagne.layers.get_output(l_out) 130 | P_test = lasagne.layers.get_output(l_out, deterministic=True) 131 | 132 | # create loss 133 | loss = lasagne.objectives.categorical_crossentropy(P, Y).mean() 134 | loss_test = lasagne.objectives.categorical_crossentropy(P_test, Y).mean() 135 | 136 | # measure accuracy 137 | top = theano.tensor.argmax(P, axis=-1) 138 | top_test = theano.tensor.argmax(P_test, axis=-1) 139 | acc = theano.tensor.eq(top, Y).mean() 140 | acc_test = theano.tensor.eq(top_test, Y).mean() 141 | 142 | if deterministic: 143 | return loss_test, acc_test 144 | else: 145 | return loss, acc 146 | 147 | def create_gradients(self, loss, deterministic=False): 148 | params = self.get_params() 149 | return theano.grad(loss, params) 150 | 151 | def create_updates(self, grads, params, alpha, opt_alg, opt_params): 152 | scaled_grads = [grad * alpha for grad in grads] 153 | lr = opt_params.get('lr', 1e-3) 154 | if opt_alg == 'sgd': 155 | grad_updates = lasagne.updates.sgd( 156 | scaled_grads, params, learning_rate=lr) 157 | elif opt_alg == 'adam': 158 | b1, b2 = opt_params.get('b1', 0.9), opt_params.get('b2', 0.999) 159 | grad_updates = lasagne.updates.adam( 160 | scaled_grads, params, learning_rate=lr, beta1=b1, beta2=b2) 161 | else: 162 | grad_updates = 
OrderedDict() 163 | 164 | return grad_updates 165 | 166 | def create_givens(self): 167 | (X, Y, idx1, idx2, S) = self.inputs 168 | return {X: self.train_set_x[idx1:idx2], Y: self.train_set_y[idx1:idx2]} 169 | 170 | def get_params(self): 171 | l_out = self.network 172 | return lasagne.layers.get_all_params(l_out, trainable=True) 173 | 174 | def gen_samples(self, deterministic=False): 175 | pass # To be implemented by inherited models 176 | 177 | def gen_noise(self, size, n_lat): 178 | noise = np.zeros((size, n_lat)) 179 | noise[range(size), np.random.choice(n_lat, size)] = 1 180 | return noise 181 | 182 | def hallucinate(self): 183 | """Generate new samples by passing noise into the decoder""" 184 | # load network params 185 | size, n_lat, n_dim = 100, self.n_lat, self.n_dim 186 | img_size = int(np.sqrt(size)) 187 | 188 | # generate noisy inputs 189 | noise = self.gen_noise(size, n_lat).astype('float32') 190 | p_mu = self.dream(noise) 191 | if p_mu is None: 192 | return None 193 | p_mu = p_mu.reshape((img_size, img_size, n_dim, n_dim)) 194 | # split into img_size (1,img_size,n_dim,n_dim) images, 195 | # concat along columns -> 1,img_size,n_dim,n_dim*img_size 196 | p_mu = np.concatenate(np.split(p_mu, img_size, axis=0), axis=3) 197 | # split into img_size (1,1,n_dim,n_dim*img_size) images, 198 | # concat along rows -> 1,1,n_dim*img_size,n_dim*img_size 199 | p_mu = np.concatenate(np.split(p_mu, img_size, axis=1), axis=2) 200 | return np.squeeze(p_mu) 201 | 202 | def fit( 203 | self, X_train, Y_train, X_val, Y_val, 204 | n_epoch=10, n_batch=100, logname='run', 205 | ): 206 | """Train the model""" 207 | alpha = 1.0 # learning rate, which can be adjusted later 208 | n_data = len(X_train) 209 | n_superbatch = self.n_superbatch 210 | 211 | for epoch in range(n_epoch): 212 | # In each epoch, we do a full pass over the training data: 213 | train_batches, train_err, train_acc = 0, 0, 0 214 | start_time = time.time() 215 | 216 | # iterate over superbatches to save time on GPU memory transfer 217 | for X_sb, Y_sb in self.iterate_superbatches( 218 | X_train, Y_train, n_superbatch, 219 | datatype='train', shuffle=True, 220 | ): 221 | for idx1, idx2 in iterate_minibatch_idx(len(X_sb), n_batch): 222 | err, acc = self.train(idx1, idx2, alpha) 223 | # collect metrics 224 | train_batches += 1 225 | train_err += err 226 | train_acc += acc 227 | if train_batches % 100 == 0: 228 | n_total = epoch * n_data + n_batch * train_batches 229 | metrics = [ 230 | n_total, 231 | train_err / train_batches, 232 | train_acc / train_batches, 233 | ] 234 | log_metrics(logname, metrics) 235 | 236 | print "Epoch {} of {} took {:.3f}s ({} minibatches)".format( 237 | epoch + 1, n_epoch, time.time() - start_time, train_batches) 238 | 239 | # make a full pass over the training data and record metrics: 240 | train_err, train_acc = evaluate( 241 | self.loss, X_train, Y_train, batchsize=100, 242 | ) 243 | val_err, val_acc = evaluate( 244 | self.loss, X_val, Y_val, batchsize=100, 245 | ) 246 | 247 | print " training:\t\t{:.6f}\t{:.6f}".format( 248 | train_err, train_acc 249 | ) 250 | print " validation:\t\t{:.6f}\t{:.6f}".format( 251 | val_err, val_acc 252 | ) 253 | 254 | metrics = [epoch, train_err, train_acc, val_err, val_acc] 255 | log_metrics(logname + '.val', metrics) 256 | 257 | def dump(self, fname): 258 | """Pickle weights to a file""" 259 | params = lasagne.layers.get_all_param_values(self.network) 260 | with open(fname, 'w') as f: 261 | pickle.dump(params, f) 262 | 263 | def load(self, fname): 264 | """Load pickled network""" 
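        # note: dump() above writes with open(fname, 'w') and pickle's default
        # ASCII protocol, so reading back in text mode here works under Python 2;
        # a binary pickle protocol would require opening with 'wb'/'rb' instead.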
265 | with open(fname) as f: 266 | params = pickle.load(f) 267 | self.load_params(params) 268 | 269 | def load_params(self, params): 270 | """Load a given set of parameters""" 271 | lasagne.layers.set_all_param_values(self.network, params) 272 | 273 | def dump_params(self): 274 | """Dump a given set of parameters""" 275 | return lasagne.layers.get_all_param_values(self.network) 276 | 277 | def load_data(self, X, Y, dest='train'): 278 | assert dest in ('train', 'val') 279 | if dest == 'train': 280 | self.train_set_x.set_value(X, borrow=False) 281 | if Y is not None: 282 | self.train_set_y.set_value(Y, borrow=False) 283 | elif dest == 'val': 284 | self.val_set_x.set_value(X, borrow=False) 285 | if Y is not None: 286 | self.val_set_y.set_value(Y, borrow=False) 287 | 288 | def iterate_superbatches( 289 | self, X, Y, batchsize, 290 | datatype='train', shuffle=False, 291 | ): 292 | assert datatype in ('train', 'val') 293 | assert len(X) == len(Y) 294 | assert batchsize <= len(X) 295 | 296 | # if we are loading entire dataset, only load it once 297 | if batchsize == len(X): 298 | if not self.data_loaded: 299 | self.load_data(X, Y, dest=datatype) 300 | self.data_loaded = True 301 | yield X, Y 302 | else: 303 | # otherwise iterate over superbatches 304 | for superbatch in iterate_minibatches( 305 | X, Y, batchsize, shuffle=shuffle 306 | ): 307 | inputs, targets = superbatch 308 | self.load_data(inputs, targets, dest=datatype) 309 | yield inputs, targets 310 | -------------------------------------------------------------------------------- /models/usbn.py: -------------------------------------------------------------------------------- 1 | 2 | import time 3 | import pickle 4 | import numpy as np 5 | from collections import OrderedDict 6 | 7 | import theano 8 | import theano.tensor as T 9 | from theano.gradient import disconnected_grad as dg 10 | import lasagne 11 | 12 | from model import Model 13 | 14 | from helpers import ( 15 | iterate_minibatch_idx, 16 | iterate_minibatches, 17 | evaluate, 18 | log_metrics 19 | ) 20 | 21 | from layers import BernoulliSampleLayer 22 | from distributions import log_bernoulli 23 | from helpers import Tlogsumexp 24 | 25 | from theano.tensor.shared_randomstreams import RandomStreams 26 | 27 | from rbm import RBM 28 | from aux_variational_rbm import AuxiliaryVariationalRBM 29 | 30 | # ---------------------------------------------------------------------------- 31 | 32 | class USBN(Model): 33 | """Unidected Sigmoid Belief Network trained using Neural Variational Inference 34 | Epoch 200 of 200 took 26.052s (192 minibatches) 35 | training loss/acc: 125.989901 107.652437 36 | validation loss/acc: 126.220432 108.006230 37 | """ 38 | 39 | def __init__( 40 | self, n_dim, n_chan, n_out, n_superbatch, opt_alg, opt_params 41 | ): 42 | Model.__init__(self, n_dim, n_chan, n_out, n_superbatch, opt_alg, opt_params) 43 | 44 | # random number generators 45 | self.numpy_rng = np.random.RandomState(1234) 46 | self.theano_rng = RandomStreams(self.numpy_rng.randint(2 ** 30)) 47 | 48 | (X, Y, idx1, idx2, S) = self.inputs 49 | loss, acc = self.objectives 50 | self.train = theano.function( 51 | [idx1, idx2, self.alpha], [loss, self.log_e, self.z], updates=self.updates, 52 | givens={X: self.train_set_x[idx1:idx2], Y: self.train_set_y[idx1:idx2]}, 53 | on_unused_input='warn' 54 | ) 55 | llik = self.create_llik() 56 | self.loss = theano.function( 57 | [X, Y], [loss, llik], 58 | on_unused_input='warn', 59 | ) 60 | 61 | def create_model(self, X, Y, n_dim, n_out, n_chan=1): 62 | # params 63 | 
n_lat = 64 # latent stochastic variables 64 | n_hid = 500 # size of hidden layer in encoder/decoder 65 | n_hid_cv = 500 # size of hidden layer in control variate net 66 | n_out = n_dim * n_dim * n_chan # total dimensionality of ouput 67 | hid_nl = lasagne.nonlinearities.tanh 68 | 69 | # self.rbm = RBM(n_dim=int(np.sqrt(n_lat)), n_out=10, n_chan=1, opt_params={'nb':128}) 70 | self.rbm = AuxiliaryVariationalRBM(n_dim=int(np.sqrt(n_lat)), n_out=10, n_chan=1, opt_params={'nb':128}) 71 | 72 | # create the encoder network 73 | l_q_in = lasagne.layers.InputLayer( 74 | shape=(None, n_chan, n_dim, n_dim), 75 | input_var=X, 76 | ) 77 | l_q_hid = lasagne.layers.DenseLayer( 78 | l_q_in, num_units=n_hid, 79 | nonlinearity=hid_nl, 80 | ) 81 | l_q_out = lasagne.layers.DenseLayer( 82 | l_q_hid, num_units=n_lat, 83 | nonlinearity=None, 84 | ) 85 | l_q_mu = lasagne.layers.DenseLayer( 86 | l_q_hid, num_units=n_lat, 87 | nonlinearity=T.nnet.sigmoid, 88 | ) 89 | l_q_sample = BernoulliSampleLayer(l_q_mu) 90 | 91 | # create the decoder network 92 | # note that we currently only handle Bernoulli x variables 93 | l_p_in = lasagne.layers.InputLayer((None, n_lat)) 94 | l_p_hid = lasagne.layers.DenseLayer( 95 | l_p_in, num_units=n_hid, 96 | nonlinearity=hid_nl, 97 | W=lasagne.init.GlorotUniform(), 98 | ) 99 | l_p_mu = lasagne.layers.DenseLayer(l_p_hid, num_units=n_out, 100 | nonlinearity = lasagne.nonlinearities.sigmoid, 101 | W=lasagne.init.GlorotUniform(), 102 | b=lasagne.init.Constant(0.), 103 | ) 104 | 105 | # create control variate (baseline) network 106 | l_cv_in = lasagne.layers.InputLayer( 107 | shape=(None, n_chan, n_dim, n_dim), 108 | input_var=X, 109 | ) 110 | l_cv_hid = lasagne.layers.DenseLayer( 111 | l_cv_in, num_units=n_hid_cv, 112 | nonlinearity=hid_nl, 113 | ) 114 | l_cv = lasagne.layers.DenseLayer( 115 | l_cv_hid, num_units=1, 116 | nonlinearity=None, 117 | ) 118 | 119 | # create variables for centering signal 120 | c = theano.shared(np.zeros((1,1), dtype=np.float64), broadcastable=(True,True)) 121 | v = theano.shared(np.zeros((1,1), dtype=np.float64), broadcastable=(True,True)) 122 | 123 | self.input_layers = (l_q_in, l_p_in, l_cv_in) 124 | self.n_lat = n_lat 125 | self.n_hid = n_hid 126 | 127 | return l_p_mu, l_q_mu, l_q_sample, l_cv, c, v 128 | 129 | def _create_components(self, deterministic=False): 130 | # load network input 131 | X = self.inputs[0] 132 | x = X.flatten(2) 133 | 134 | # load networks 135 | l_p_mu, l_q_mu, l_q_sample, _, _, _ = self.network 136 | l_q_in, l_p_in, l_cv_in = self.input_layers 137 | 138 | # load network output 139 | z, q_mu = lasagne.layers.get_output( 140 | [l_q_sample, l_q_mu], deterministic=deterministic) 141 | p_mu = lasagne.layers.get_output( 142 | l_p_mu, {l_p_in: z}, 143 | deterministic=deterministic, 144 | ) 145 | 146 | # entropy term 147 | log_qz_given_x = log_bernoulli(dg(z), q_mu).sum(axis=1) 148 | 149 | # expected p(x,z) term 150 | z_prior = T.ones_like(z)*np.float32(0.5) 151 | log_pz = log_bernoulli(z, z_prior).sum(axis=1) 152 | log_e = -self.rbm.free_energy(z.reshape((128,self.n_lat))) 153 | # log_e = log_pz 154 | log_px_given_z = log_bernoulli(x, p_mu).sum(axis=1) 155 | log_pxz = log_e + log_px_given_z 156 | 157 | # save them for later 158 | self.z = z 159 | self.log_e = log_e.mean() 160 | self.log_pxz = log_pxz 161 | self.log_qz_given_x = log_qz_given_x 162 | 163 | return log_pxz.flatten(), log_qz_given_x.flatten() 164 | 165 | def create_objectives(self, deterministic=False): 166 | # load probabilities 167 | log_pxz, log_qz_given_x = 
self._create_components(deterministic=deterministic) 168 | 169 | # compute the lower bound 170 | elbo = T.mean(log_pxz - log_qz_given_x) 171 | 172 | # we don't use the second accuracy metric right now 173 | return -elbo, -T.mean(log_qz_given_x) 174 | 175 | def create_gradients(self, loss, deterministic=False): 176 | from theano.gradient import disconnected_grad as dg 177 | 178 | # load networks 179 | l_p_mu, l_q_mu, _, l_cv, c, v = self.network 180 | 181 | # load params 182 | p_params = lasagne.layers.get_all_params(l_p_mu, trainable=True) 183 | q_params = lasagne.layers.get_all_params(l_q_mu, trainable=True) 184 | cv_params = lasagne.layers.get_all_params(l_cv, trainable=True) 185 | 186 | # load neural net outputs (probabilities have been precomputed) 187 | log_pxz, log_qz_given_x = self.log_pxz, self.log_qz_given_x 188 | cv = T.addbroadcast(lasagne.layers.get_output(l_cv),1) 189 | 190 | # compute learning signals 191 | l = log_pxz - log_qz_given_x - cv 192 | l_avg, l_var = l.mean(), l.var() 193 | c_new = 0.8*c + 0.2*l_avg 194 | v_new = 0.8*v + 0.2*l_var 195 | l = (l - c_new) / T.maximum(1, T.sqrt(v_new)) 196 | 197 | # compute grad wrt p 198 | p_grads = T.grad(-log_pxz.mean(), p_params) 199 | 200 | # compute grad wrt q 201 | q_target = T.mean(dg(l) * log_qz_given_x) 202 | q_grads = T.grad(-0.2*q_target, q_params) # 5x slower rate for q 203 | 204 | # compute grad of cv net 205 | cv_target = T.mean(l**2) 206 | cv_grads = T.grad(cv_target, cv_params) 207 | 208 | # combine and clip gradients 209 | clip_grad = 1 210 | max_norm = 5 211 | grads = p_grads + q_grads + cv_grads 212 | mgrads = lasagne.updates.total_norm_constraint(grads, max_norm=max_norm) 213 | cgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads] 214 | 215 | return cgrads 216 | 217 | def gen_samples(self, deterministic=False): 218 | s = self.inputs[-1] 219 | # put it through the decoder 220 | _, l_p_in, _ = self.input_layers 221 | l_p_mu = self.network[0] 222 | p_mu = lasagne.layers.get_output(l_p_mu, {l_p_in : s}) 223 | 224 | return p_mu 225 | 226 | def gen_noise(self, size, n_lat): 227 | return self.rbm.sample() 228 | noise = np.zeros((size, n_lat)) 229 | noise[range(size), np.random.choice(n_lat, size)] = 1 230 | return noise 231 | 232 | def get_params(self): 233 | l_p_mu, l_q_mu, _, l_cv, _, _ = self.network 234 | p_params = lasagne.layers.get_all_params(l_p_mu, trainable=True) 235 | q_params = lasagne.layers.get_all_params(l_q_mu, trainable=True) 236 | cv_params = lasagne.layers.get_all_params(l_cv, trainable=True) 237 | 238 | return p_params + q_params + cv_params #+ [c] 239 | 240 | def create_updates(self, grads, params, alpha, opt_alg, opt_params): 241 | # call super-class to generate SGD/ADAM updates 242 | grad_updates = Model.create_updates( 243 | self, grads, params, alpha, opt_alg, opt_params, 244 | ) 245 | 246 | # create updates for centering signal 247 | 248 | # load neural net outputs (probabilities have been precomputed) 249 | _, _, _, l_cv, c, v = self.network 250 | log_pxz, log_qz_given_x = self.log_pxz, self.log_qz_given_x 251 | cv = T.addbroadcast(lasagne.layers.get_output(l_cv),1) 252 | 253 | # compute learning signals 254 | l = log_pxz - log_qz_given_x - cv 255 | l_avg, l_var = l.mean(), l.var() 256 | c_new = 0.8*c + 0.2*l_avg 257 | v_new = 0.8*v + 0.2*l_var 258 | 259 | # compute update for centering signal 260 | cv_updates = {c: c_new, v: v_new} 261 | 262 | return OrderedDict(grad_updates.items() + cv_updates.items()) 263 | 264 | def hallucinate(self): 265 | """Generate new samples by passing noise 
into the decoder""" 266 | # load network params 267 | size, n_lat, n_dim = 100, self.n_lat, self.n_dim 268 | img_size = int(np.sqrt(size)) 269 | 270 | # # generate noisy inputs 271 | # noise = self.gen_noise(size, n_lat).astype('float32') 272 | 273 | # sample noise from RBM 274 | noise = self.rbm.sample() 275 | 276 | p_mu = self.dream(noise) 277 | if p_mu is None: 278 | return None 279 | p_mu = p_mu.reshape((img_size, img_size, n_dim, n_dim)) 280 | # split into img_size (1,img_size,n_dim,n_dim) images, 281 | # concat along columns -> 1,img_size,n_dim,n_dim*img_size 282 | p_mu = np.concatenate(np.split(p_mu, img_size, axis=0), axis=3) 283 | # split into img_size (1,1,n_dim,n_dim*img_size) images, 284 | # concat along rows -> 1,1,n_dim*img_size,n_dim*img_size 285 | p_mu = np.concatenate(np.split(p_mu, img_size, axis=1), axis=2) 286 | return np.squeeze(p_mu) 287 | 288 | def create_llik(self): 289 | # load inputs 290 | X = self.inputs[0] 291 | x = X.flatten(2) 292 | 293 | # load network params 294 | n_cat = self.n_lat 295 | n_rep = 10 296 | 297 | # load networks 298 | l_p_mu, l_q_mu, l_q_sample, _, _, _ = self.network 299 | l_q_in, l_p_in, l_cv_in = self.input_layers 300 | 301 | # load network output 302 | q_mu = lasagne.layers.get_output(l_q_mu) 303 | 304 | q_mu_rep = T.tile( q_mu.dimshuffle((0,'x',1)), reps=(1,n_rep,1) ) # (n_bat, n_rep, n_cat) 305 | q_sample_hard = self.theano_rng.binomial(size=q_mu_rep.shape, p=q_mu_rep, dtype=q_mu_rep.dtype) # (n_bat, n_rep, n_cat) 306 | q_sample_hard2 = q_sample_hard.reshape([128*n_rep, n_cat]) # (n_bat*n_rep, n_cat) 307 | 308 | p_mu = lasagne.layers.get_output(l_p_mu, {l_p_in: q_sample_hard2}) # (n_bat*n_rep, 784) 309 | x_rep = T.tile( x.dimshuffle((0,'x',1)), reps=(1,n_rep,1) ) # (n_bat, n_rep, 784) 310 | p_mu = T.reshape(p_mu, (128, n_rep, 784)) # (n_bat, n_rep, 784) 311 | 312 | # define the loss components 313 | log_p_x = log_bernoulli(x_rep, p_mu).sum(axis=2) # (n_bat, n_rep) 314 | # z_prior = T.ones_like(q_sample_hard)*np.float32(0.5) # (n_bat, n_rep, n_cat) 315 | # log_p_z = log_bernoulli(q_sample_hard, z_prior).sum(axis=2) # (n_bat, n_rep) 316 | log_e = -self.rbm.free_energy(q_sample_hard.reshape((128*n_rep,self.n_lat))) 317 | log_e = T.reshape(log_e, [128, n_rep]) 318 | log_q_z = log_bernoulli(q_sample_hard, q_mu_rep).sum(axis=2) # (n_bat, n_rep) 319 | 320 | # compute loss 321 | llik = Tlogsumexp( log_p_x + log_e - log_q_z, axis=1) # (n_bat,) 322 | 323 | return T.mean(llik) 324 | 325 | def fit( 326 | self, X_train, Y_train, X_val, Y_val, 327 | n_epoch=10, n_batch=100, logname='run', 328 | ): 329 | """Train the model""" 330 | alpha = 1.0 # learning rate, which can be adjusted later 331 | n_data = len(X_train) 332 | n_superbatch = self.n_superbatch 333 | 334 | # print '...', self.rbm.logZ_exact() 335 | # print '...', self.rbm.logZ_exact(marg='h') 336 | 337 | for epoch in range(n_epoch): 338 | # In each epoch, we do a full pass over the training data: 339 | train_batches, train_err, train_acc = 0, 0, 0 340 | start_time = time.time() 341 | 342 | # iterate over superbatches to save time on GPU memory transfer 343 | for X_sb, Y_sb in self.iterate_superbatches( 344 | X_train, Y_train, n_superbatch, 345 | datatype='train', shuffle=True, 346 | ): 347 | for idx1, idx2 in iterate_minibatch_idx(len(X_sb), n_batch): 348 | # train model 349 | self.rbm.reset_averages() 350 | err, acc, z = self.train(idx1, idx2, alpha) 351 | 352 | # train rbm 353 | z_rbm = z.reshape((128,1,8,8)) 354 | # self.rbm.train_batch(z.reshape((128,1,8,8)), np.zeros((n_batch,), 
dtype='int32'), alpha) 355 | 356 | # q steps 357 | for i in range(2): self.rbm.train_q0(z_rbm) 358 | 359 | # p steps 360 | self.rbm.train_p_batch(z_rbm, alpha) 361 | 362 | # collect metrics 363 | # print err, acc 364 | train_batches += 1 365 | train_err += err 366 | train_acc += acc 367 | if train_batches % 100 == 0: 368 | n_total = epoch * n_data + n_batch * train_batches 369 | metrics = [ 370 | n_total, 371 | train_err / train_batches, 372 | train_acc / train_batches, 373 | ] 374 | log_metrics(logname, metrics) 375 | 376 | print "Epoch {} of {} took {:.3f}s ({} minibatches)".format( 377 | epoch + 1, n_epoch, time.time() - start_time, train_batches) 378 | 379 | # make a full pass over the training data and record metrics: 380 | train_err, train_acc = evaluate( 381 | self.loss, X_train, Y_train, batchsize=128, 382 | ) 383 | val_err, val_acc = evaluate( 384 | self.loss, X_val, Y_val, batchsize=128, 385 | ) 386 | 387 | print " training:\t\t{:.6f}\t{:.6f}".format( 388 | train_err, train_acc 389 | ) 390 | print " validation:\t\t{:.6f}\t{:.6f}".format( 391 | val_err, val_acc 392 | ) 393 | 394 | metrics = [epoch, train_err, train_acc, val_err, val_acc] 395 | log_metrics(logname + '.val', metrics) 396 | 397 | # reserve N of training data points to kick start hallucinations 398 | self.rbm.hallu_set = np.asarray( 399 | z[:100,:].reshape((100,1,8,8)), 400 | dtype=theano.config.floatX 401 | ) 402 | -------------------------------------------------------------------------------- /models/abstractrbm.py: -------------------------------------------------------------------------------- 1 | import pdb 2 | import time 3 | import numpy as np 4 | import theano 5 | import lasagne 6 | import theano.tensor as T 7 | from theano.tensor.shared_randomstreams import RandomStreams 8 | 9 | from model import Model 10 | from helpers import iterate_minibatch_idx, evaluate, log_metrics 11 | 12 | from sklearn.neural_network import BernoulliRBM 13 | 14 | 15 | class AbstractRBM(Model): 16 | """Restricted Boltzmann Machine 17 | RBM code adapted from http://deeplearning.net/tutorial/rbm.html 18 | 19 | Epoch 15 of 15 took 635.792s (2448 minibatches) 20 | training loss/acc: -62.311016 -62.311016 21 | 22 | Training Params 23 | --------------- 24 | batch_size: 20 25 | learning_rate: 0.1 26 | """ 27 | def __init__( 28 | self, n_dim, n_out, n_chan=1, n_superbatch=12800, opt_alg='adam', 29 | opt_params={'lr': 1e-3, 'b1': 0.9, 'b2': 0.99} 30 | ): 31 | """RBM constructor. 32 | Defines the parameters of the model along with 33 | basic operations for inferring hidden from visible (and vice-versa), 34 | as well as for performing CD updates. 
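        Note that this abstract class expects the concrete subclass to define
        self.n_hidden, self.n_batch, self.n_visible, self.n_dim, self.W,
        self.vbias, self.hbias, self.theano_rng and self.hallu_set; both this
        constructor and the sampling helpers below refer to them directly.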
35 | """ 36 | 37 | # store sklearn RBM instance 38 | self.rbm = BernoulliRBM(random_state=0, n_components=self.n_hidden) 39 | 40 | self.n_chain = 100 41 | 42 | # initialize storage for the persistent chain (state = hidden 43 | # layer of chain) 44 | self.persistent_chain = theano.shared( 45 | np.zeros( 46 | (self.n_batch, self.n_hidden), 47 | dtype=theano.config.floatX 48 | ), borrow=True, 49 | ) 50 | 51 | self.bit_i_idx = theano.shared(value=0, name='bit_i_idx') 52 | 53 | # create shared data variables 54 | self.train_set_x = theano.shared(np.empty( 55 | (n_superbatch, n_chan, n_dim, n_dim), 56 | dtype=theano.config.floatX), 57 | borrow=False, 58 | ) 59 | self.val_set_x = theano.shared(np.empty( 60 | (n_superbatch, n_chan, n_dim, n_dim), 61 | dtype=theano.config.floatX), 62 | borrow=False, 63 | ) 64 | 65 | # create y-variables 66 | self.train_set_y = theano.shared(np.empty( 67 | (n_superbatch,), dtype='int32'), borrow=False) 68 | self.val_set_y = theano.shared(np.empty( 69 | (n_superbatch,), dtype='int32'), borrow=False) 70 | # train_set_y_int = T.cast(train_set_y, 'int32') 71 | # val_set_y_int = T.cast(val_set_y, 'int32') 72 | 73 | def free_energy(self, v_sample): 74 | """Function to compute the free energy""" 75 | wx_b = T.dot(v_sample, self.W) + self.hbias 76 | vbias_term = T.dot(v_sample, self.vbias) 77 | hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=1) 78 | return -hidden_term - vbias_term 79 | 80 | def propup(self, vis): 81 | """This function propagates the visible units activation upwards to 82 | the hidden units 83 | 84 | Note that we return also the pre-sigmoid activation of the 85 | layer. As it will turn out later, due to how Theano deals with 86 | optimizations, this symbolic variable will be needed to write 87 | down a more stable computational graph (see details in the 88 | reconstruction cost function) 89 | """ 90 | pre_sigmoid_activation = T.dot(vis, self.W) + self.hbias 91 | return [pre_sigmoid_activation, T.nnet.sigmoid(pre_sigmoid_activation)] 92 | 93 | def sample_h_given_v(self, v0_sample): 94 | """This function infers state of hidden units given visible units""" 95 | # compute the activation of the hidden units given a sample of 96 | # the visibles 97 | pre_sigmoid_h1, h1_mean = self.propup(v0_sample) 98 | # get a sample of the hiddens given their activation 99 | # Note that theano_rng.binomial returns a symbolic sample of dtype 100 | # int64 by default. If we want to keep our computations in floatX 101 | # for the GPU we need to specify to return the dtype floatX 102 | h1_sample = self.theano_rng.binomial( 103 | size=h1_mean.shape, 104 | n=1, p=h1_mean, 105 | dtype=theano.config.floatX, 106 | ) 107 | return [pre_sigmoid_h1, h1_mean, h1_sample] 108 | 109 | def propdown(self, hid): 110 | """This function propagates the hidden units activation downwards to 111 | the visible units 112 | 113 | Note that we return also the pre_sigmoid_activation of the 114 | layer. 
As it will turn out later, due to how Theano deals with 115 | optimizations, this symbolic variable will be needed to write 116 | down a more stable computational graph (see details in the 117 | reconstruction cost function) 118 | """ 119 | pre_sigmoid_activation = T.dot(hid, self.W.T) + self.vbias 120 | return [pre_sigmoid_activation, T.nnet.sigmoid(pre_sigmoid_activation)] 121 | 122 | def sample_v_given_h(self, h0_sample): 123 | """This function infers state of visible units given hidden units""" 124 | # compute the activation of the visible given the hidden sample 125 | pre_sigmoid_v1, v1_mean = self.propdown(h0_sample) 126 | # get a sample of the visible given their activation 127 | # Note that theano_rng.binomial returns a symbolic sample of dtype 128 | # int64 by default. If we want to keep our computations in floatX 129 | # for the GPU we need to specify to return the dtype floatX 130 | v1_sample = self.theano_rng.binomial( 131 | size=v1_mean.shape, 132 | n=1, p=v1_mean, 133 | dtype=theano.config.floatX, 134 | ) 135 | return [pre_sigmoid_v1, v1_mean, v1_sample] 136 | 137 | def gibbs_hvh(self, h0_sample): 138 | """This function implements one step of Gibbs sampling, 139 | starting from the hidden state 140 | """ 141 | pre_sigmoid_v1, v1_mean, v1_sample = self.sample_v_given_h(h0_sample) 142 | pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v1_sample) 143 | return [pre_sigmoid_v1, v1_mean, v1_sample, 144 | pre_sigmoid_h1, h1_mean, h1_sample] 145 | 146 | def gibbs_vhv(self, v0_sample): 147 | """This function implements one step of Gibbs sampling, 148 | starting from the visible state 149 | """ 150 | pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v0_sample) 151 | pre_sigmoid_v1, v1_mean, v1_sample = self.sample_v_given_h(h1_sample) 152 | return [pre_sigmoid_h1, h1_mean, h1_sample, 153 | pre_sigmoid_v1, v1_mean, v1_sample] 154 | 155 | def pseudolikelihood(self, data): 156 | self.rbm.components_ = self.W.get_value().T 157 | self.rbm.intercept_visible_ = self.vbias.get_value() 158 | self.rbm.intercept_hidden_ = self.hbias.get_value() 159 | return self.rbm.score_samples(data).mean() 160 | 161 | def get_pseudo_likelihood_cost(self, X, updates): 162 | """Stochastic approximation to the pseudo-likelihood""" 163 | # index of bit i in expression p(x_i | x_{\i}) 164 | bit_i_idx = self.bit_i_idx 165 | # bit_i_idx = theano.shared(value=0, name='bit_i_idx') 166 | 167 | # binarize the input image by rounding to nearest integer 168 | xi = T.round(X) 169 | 170 | # calculate free energy for the given bit configuration 171 | fe_xi = self.free_energy(xi) 172 | 173 | # flip bit x_i of matrix xi and preserve all other bits x_{\i} 174 | # Equivalent to xi[:,bit_i_idx] = 1-xi[:, bit_i_idx], but assigns 175 | # the result to xi_flip, instead of working in place on xi. 176 | xi_flip = T.set_subtensor(xi[:, bit_i_idx], 1 - xi[:, bit_i_idx]) 177 | 178 | # calculate free energy with bit flipped 179 | fe_xi_flip = self.free_energy(xi_flip) 180 | 181 | # equivalent to e^(-FE(x_i)) / (e^(-FE(x_i)) + e^(-FE(x_{\i}))) 182 | cost = T.mean(self.n_visible * T.log(T.nnet.sigmoid(fe_xi_flip-fe_xi))) 183 | 184 | return cost 185 | 186 | def hallucinate(self): 187 | """Once the RBM is trained, we can then use the gibbs_vhv function to 188 | implement the Gibbs chain required for sampling. This overwrites the 189 | hallucinate function in Model completely. 
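        Starting from self.hallu_set, it advances a persistent Gibbs chain
        (10 rounds of 1000 gibbs_vhv steps) and returns the final mean-field
        visible activations tiled into a single square image of shape
        (n_dim * sqrt(n_chain), n_dim * sqrt(n_chain)).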
190 | """ 191 | n_samples = 10 192 | hallu_set = self.hallu_set.reshape((-1, self.n_visible)) 193 | persistent_vis_chain = theano.shared(hallu_set) 194 | # define one step of Gibbs sampling (mf = mean-field) define a 195 | # function that does `1000` steps before returning the 196 | # sample for plotting 197 | ( 198 | [ 199 | presig_hids, 200 | hid_mfs, 201 | hid_samples, 202 | presig_vis, 203 | vis_mfs, 204 | vis_samples 205 | ], 206 | updates 207 | ) = theano.scan( 208 | self.gibbs_vhv, 209 | outputs_info=[None, None, None, None, None, persistent_vis_chain], 210 | n_steps=1000, 211 | name="gibbs_vhv", 212 | ) 213 | 214 | # add to updates that takes care of our persistent chain : 215 | updates.update({persistent_vis_chain: vis_samples[-1]}) 216 | # construct the function that implements our persistent chain. 217 | # we generate the "mean field" activations for plotting and the actual 218 | # samples for reinitializing the state of our persistent chain 219 | sample_fn = theano.function( 220 | [], 221 | [ 222 | vis_mfs[-1], 223 | vis_samples[-1] 224 | ], 225 | updates=updates, 226 | name='sample_fn', 227 | ) 228 | 229 | for idx in range(n_samples): 230 | # generate `plot_every` intermediate samples that we discard, 231 | # because successive samples in the chain are too correlated 232 | vis_mf, vis_sample = sample_fn() 233 | 234 | img_size = int(np.sqrt(self.n_chain)) 235 | vis_mf = vis_mf.reshape((img_size, img_size, self.n_dim, self.n_dim)) 236 | vis_mf = np.concatenate(np.split(vis_mf, img_size, axis=0), axis=3) 237 | # split into img_size (1,1,n_dim,n_dim*img_size) images, 238 | # concat along rows -> 1,1,n_dim*img_size,n_dim*img_size 239 | vis_mf = np.concatenate(np.split(vis_mf, img_size, axis=1), axis=2) 240 | return np.squeeze(vis_mf) 241 | 242 | def hallucinate_chain(self): 243 | """Once the RBM is trained, we can then use the gibbs_vhv function to 244 | implement the Gibbs chain required for sampling. This overwrites the 245 | hallucinate function in Model completely. 246 | """ 247 | n_samples = 10 248 | # hallu_set = self.hallu_set.reshape((-1, self.n_visible)) 249 | hallu_set = np.random.rand(1, 64).astype('float32') 250 | # hallu_set = self.Phi.get_value()[:,0] 251 | persistent_vis_chain = theano.shared(hallu_set) 252 | # define one step of Gibbs sampling (mf = mean-field) define a 253 | # function that does `1000` steps before returning the 254 | # sample for plotting 255 | ( 256 | [ 257 | presig_hids, 258 | hid_mfs, 259 | hid_samples, 260 | presig_vis, 261 | vis_mfs, 262 | vis_samples 263 | ], 264 | updates 265 | ) = theano.scan( 266 | self.gibbs_vhv, 267 | outputs_info=[None, None, None, None, None, persistent_vis_chain], 268 | n_steps=100, 269 | name="gibbs_vhv", 270 | ) 271 | 272 | # add to updates that takes care of our persistent chain : 273 | updates.update({persistent_vis_chain: vis_samples[-1]}) 274 | # construct the function that implements our persistent chain. 
275 | # we generate the "mean field" activations for plotting and the actual 276 | # samples for reinitializing the state of our persistent chain 277 | sample_fn = theano.function([],vis_mfs,updates=updates,name='sample_fn') 278 | 279 | vis_mf = sample_fn() 280 | print vis_mf.shape 281 | # vis_mf = vis_mf[::10] 282 | print vis_mf.shape 283 | 284 | img_size = 10 285 | vis_mf = vis_mf.reshape((img_size, img_size, self.n_dim, self.n_dim)) 286 | vis_mf = np.transpose(vis_mf, [1,0,2,3]) 287 | vis_mf = np.concatenate(np.split(vis_mf, img_size, axis=0), axis=3) 288 | # split into img_size (1,1,n_dim,n_dim*img_size) images, 289 | # concat along rows -> 1,1,n_dim*img_size,n_dim*img_size 290 | vis_mf = np.concatenate(np.split(vis_mf, img_size, axis=1), axis=2) 291 | return np.squeeze(vis_mf) 292 | 293 | def E_np_h(self,h): 294 | bv_vec = self.vbias.get_value().reshape((self.n_visible,1)) 295 | bh_vec = self.hbias.get_value().reshape((self.n_hidden,1)) 296 | W = self.W.get_value() 297 | return (np.dot(bh_vec.T, h) + np.sum(np.log(1. + np.exp(bv_vec + np.dot(W, h))), axis=0)).flatten() 298 | 299 | def E_np_v(self,v): 300 | bv_vec = self.vbias.get_value().reshape((self.n_visible,1)) 301 | bh_vec = self.hbias.get_value().reshape((self.n_hidden,1)) 302 | W = self.W.get_value() 303 | return (np.dot(bv_vec.T, v)+ np.sum(np.log(1. + np.exp(bh_vec + np.dot(W.T, v))), axis=0)).flatten() 304 | 305 | def logZ_exact(self, marg='v'): 306 | # get the next binary vector 307 | t = self.n_hidden if marg == 'v' else self.n_visible 308 | def inc(x): 309 | for i in xrange(t): 310 | x[i,0]+=1 311 | if x[i,0]<=1: return True 312 | x[i,0]=0 313 | 314 | return False 315 | 316 | #compute the normalizing constant 317 | if marg == 'v': x=np.zeros((self.n_hidden,1)) 318 | elif marg == 'h': x=np.zeros((self.n_visible,1)) 319 | logZ=-np.inf 320 | while True: 321 | if marg == 'v': logF = self.E_np_h(x) 322 | elif marg == 'h': logF = self.E_np_v(x) 323 | # print ''.join([str(xi) for xi in x]), logF, logZ 324 | logZ=np.logaddexp(logZ,logF) 325 | if not inc(x): break 326 | 327 | # print 328 | return logZ 329 | 330 | def sample(self): 331 | """Once the RBM is trained, we can then use the gibbs_vhv function to 332 | implement the Gibbs chain required for sampling. This overwrites the 333 | hallucinate function in Model completely. 334 | """ 335 | n_samples = 10 336 | hallu_set = self.hallu_set.reshape((-1, self.n_visible)) 337 | persistent_vis_chain = theano.shared(hallu_set) 338 | # define one step of Gibbs sampling (mf = mean-field) define a 339 | # function that does `1000` steps before returning the 340 | # sample for plotting 341 | ( 342 | [ 343 | presig_hids, 344 | hid_mfs, 345 | hid_samples, 346 | presig_vis, 347 | vis_mfs, 348 | vis_samples 349 | ], 350 | updates 351 | ) = theano.scan( 352 | self.gibbs_vhv, 353 | outputs_info=[None, None, None, None, None, persistent_vis_chain], 354 | n_steps=1000, 355 | name="gibbs_vhv", 356 | ) 357 | 358 | # add to updates that takes care of our persistent chain : 359 | updates.update({persistent_vis_chain: vis_samples[-1]}) 360 | # construct the function that implements our persistent chain. 
361 | # we generate the "mean field" activations for plotting and the actual 362 | # samples for reinitializing the state of our persistent chain 363 | sample_fn = theano.function( 364 | [], 365 | [ 366 | vis_mfs[-1], 367 | vis_samples[-1] 368 | ], 369 | updates=updates, 370 | name='sample_fn', 371 | ) 372 | 373 | for idx in range(n_samples): 374 | # generate `plot_every` intermediate samples that we discard, 375 | # because successive samples in the chain are too correlated 376 | vis_mf, vis_sample = sample_fn() 377 | 378 | img_size = int(np.sqrt(self.n_chain)) 379 | vis_mf = vis_mf.reshape((self.n_chain, self.n_dim*self.n_dim)) 380 | 381 | return vis_mf -------------------------------------------------------------------------------- /models/dadgm.py: -------------------------------------------------------------------------------- 1 | import time 2 | import pickle 3 | import numpy as np 4 | from collections import OrderedDict 5 | 6 | import theano 7 | import theano.tensor as T 8 | import lasagne 9 | from theano.gradient import disconnected_grad as dg 10 | 11 | from model import Model 12 | from layers.shape import RepeatLayer 13 | from layers import ( 14 | GaussianSampleLayer, 15 | BernoulliSampleLayer, 16 | GaussianMultiSampleLayer, 17 | ) 18 | from distributions import log_bernoulli, log_normal2 19 | from helpers import Tlogsumexp 20 | 21 | from theano.tensor.shared_randomstreams import RandomStreams 22 | 23 | # ---------------------------------------------------------------------------- 24 | 25 | class DADGM(Model): 26 | """Auxiliary Deep Generative Model (unsupervised version) with discrete z""" 27 | def __init__( 28 | self, n_dim, n_out, n_chan=1, n_superbatch=12800, model='bernoulli', 29 | opt_alg='adam', opt_params={'lr' : 1e-3, 'b1': 0.9, 'b2': 0.99} 30 | ): 31 | # save model that wil be created 32 | self.model = model 33 | self.n_sample = 5 # adjustable parameter, though 1 works best in practice 34 | Model.__init__(self, n_dim, n_chan, n_out, n_superbatch, opt_alg, opt_params) 35 | 36 | # random number generators 37 | self.numpy_rng = np.random.RandomState(1234) 38 | self.theano_rng = RandomStreams(self.numpy_rng.randint(2 ** 30)) 39 | 40 | (X, Y, idx1, idx2, S) = self.inputs 41 | loss, acc = self.objectives 42 | llik = self.create_llik() 43 | self.loss = theano.function( 44 | [X, Y], [loss, llik], 45 | on_unused_input='warn', 46 | ) 47 | 48 | def create_model(self, X, Y, n_dim, n_out, n_chan=1): 49 | # params 50 | n_lat = 200 # latent stochastic variables 51 | n_aux = 10 # auxiliary variables 52 | n_hid = 500 # size of hidden layer in encoder/decoder 53 | n_sam = self.n_sample # number of monte-carlo samples 54 | n_hid_cv = 500 # size of hidden layer in control variate net 55 | n_out = n_dim * n_dim * n_chan # total dimensionality of ouput 56 | hid_nl = lasagne.nonlinearities.tanh 57 | relu_shift = lambda av: T.nnet.relu(av+10)-10 # for numerical stability 58 | 59 | # create the encoder network 60 | # create q(a|x) 61 | l_qa_in = lasagne.layers.InputLayer( 62 | shape=(None, n_chan, n_dim, n_dim), 63 | input_var=X, 64 | ) 65 | l_qa_hid = lasagne.layers.DenseLayer( 66 | l_qa_in, num_units=n_hid, 67 | nonlinearity=hid_nl, 68 | ) 69 | l_qa_mu = lasagne.layers.DenseLayer( 70 | l_qa_in, num_units=n_aux, 71 | nonlinearity=None, 72 | ) 73 | l_qa_logsigma = lasagne.layers.DenseLayer( 74 | l_qa_in, num_units=n_aux, 75 | nonlinearity=relu_shift, 76 | ) 77 | # repeatedly sample 78 | l_qa_mu = lasagne.layers.ReshapeLayer( 79 | RepeatLayer(l_qa_mu, n_ax=1, n_rep=n_sam), 80 | shape=(-1, n_aux), 
81 | ) 82 | l_qa_logsigma = lasagne.layers.ReshapeLayer( 83 | RepeatLayer(l_qa_logsigma, n_ax=1, n_rep=n_sam), 84 | shape=(-1, n_aux), 85 | ) 86 | l_qa = GaussianSampleLayer(l_qa_mu, l_qa_logsigma) 87 | 88 | # create q(z|a,x) 89 | l_qz_in = lasagne.layers.InputLayer((None, n_aux)) 90 | l_qz_hid1a = lasagne.layers.DenseLayer( 91 | l_qz_in, num_units=n_hid, 92 | nonlinearity=hid_nl, 93 | ) 94 | l_qz_hid1b = lasagne.layers.DenseLayer( 95 | l_qa_in, num_units=n_hid, 96 | nonlinearity=hid_nl, 97 | ) 98 | l_qz_hid1b = lasagne.layers.ReshapeLayer( 99 | RepeatLayer(l_qz_hid1b, n_ax=1, n_rep=n_sam), 100 | shape=(-1, n_hid), 101 | ) 102 | l_qz_hid2 = lasagne.layers.ElemwiseSumLayer([l_qz_hid1a, l_qz_hid1b]) 103 | l_qz_hid3 = lasagne.layers.DenseLayer( 104 | l_qz_hid2, num_units=n_hid, 105 | nonlinearity=hid_nl, 106 | ) 107 | l_qz_mu = lasagne.layers.DenseLayer( 108 | l_qz_hid3, num_units=n_lat, 109 | nonlinearity=T.nnet.sigmoid, 110 | ) 111 | l_qz = BernoulliSampleLayer(l_qz_mu) 112 | l_qz_logsigma = None 113 | 114 | # create the decoder network 115 | # create p(x|z) 116 | l_px_in = lasagne.layers.InputLayer((None, n_lat)) 117 | l_px_hid = lasagne.layers.DenseLayer( 118 | l_px_in, num_units=n_hid, 119 | W=lasagne.init.GlorotUniform(), 120 | nonlinearity=hid_nl, 121 | ) 122 | l_px_mu, l_px_logsigma = None, None 123 | 124 | if self.model == 'bernoulli': 125 | l_px_mu = lasagne.layers.DenseLayer( 126 | l_px_hid, num_units=n_out, 127 | nonlinearity=lasagne.nonlinearities.sigmoid, 128 | ) 129 | elif self.model == 'gaussian': 130 | l_px_mu = lasagne.layers.DenseLayer( 131 | l_px_hid, num_units=n_out, 132 | nonlinearity=None, 133 | ) 134 | l_px_logsigma = lasagne.layers.DenseLayer( 135 | l_px_hid, num_units=n_out, 136 | nonlinearity=relu_shift, 137 | ) 138 | 139 | # create p(a|z) 140 | l_pa_hid = lasagne.layers.DenseLayer( 141 | l_px_in, num_units=n_hid, 142 | nonlinearity=hid_nl, 143 | ) 144 | l_pa_mu = lasagne.layers.DenseLayer( 145 | l_pa_hid, num_units=n_aux, 146 | nonlinearity=None, 147 | ) 148 | l_pa_logsigma = lasagne.layers.DenseLayer( 149 | l_pa_hid, num_units=n_aux, 150 | W=lasagne.init.GlorotNormal(), 151 | b=lasagne.init.Normal(1e-3), 152 | nonlinearity=relu_shift, 153 | ) 154 | 155 | # create control variate (baseline) network 156 | l_cv_in = lasagne.layers.InputLayer( 157 | shape=(None, n_chan, n_dim, n_dim), 158 | input_var=X, 159 | ) 160 | l_cv_hid = lasagne.layers.DenseLayer( 161 | l_cv_in, num_units=n_hid_cv, 162 | nonlinearity=hid_nl, 163 | ) 164 | l_cv = lasagne.layers.DenseLayer( 165 | l_cv_hid, num_units=1, 166 | nonlinearity=None, 167 | ) 168 | 169 | # create variables for centering signal 170 | c = theano.shared(np.zeros((1,1), dtype=np.float64), broadcastable=(True,True)) 171 | v = theano.shared(np.zeros((1,1), dtype=np.float64), broadcastable=(True,True)) 172 | 173 | # store certain input layers for downstream (quick hack) 174 | self.input_layers = (l_qa_in, l_qz_in, l_px_in, l_cv_in) 175 | self.n_lat = n_lat 176 | self.n_hid = n_hid 177 | 178 | return l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \ 179 | l_qa_mu, l_qa_logsigma, l_qz_mu, l_qz_logsigma, \ 180 | l_qa, l_qz, l_cv, c, v 181 | 182 | def _create_components(self, deterministic=False): 183 | # load network input 184 | X = self.inputs[0] 185 | x = X.flatten(2) 186 | 187 | # duplicate entries to take into account multiple mc samples 188 | n_sam = self.n_sample 189 | n_out = x.shape[1] 190 | x = x.dimshuffle(0,'x',1).repeat(n_sam, axis=1).reshape((-1, n_out)) 191 | 192 | # load networks 193 | l_px_mu, l_px_logsigma, 
l_pa_mu, l_pa_logsigma, \ 194 | l_qa_mu, l_qa_logsigma, l_qz_mu, l_qz_logsigma, l_qa, l_qz, _, _, _ = self.network 195 | l_qa_in, l_qz_in, l_px_in, l_cv_in = self.input_layers 196 | 197 | # load network output 198 | qa_mu, qa_logsigma, a = lasagne.layers.get_output( 199 | [l_qa_mu, l_qa_logsigma, l_qa], 200 | deterministic=deterministic, 201 | ) 202 | qz_mu, z = lasagne.layers.get_output( 203 | [l_qz_mu, l_qz], 204 | {l_qz_in: a, l_qa_in: X}, 205 | deterministic=deterministic, 206 | ) 207 | pa_mu, pa_logsigma = lasagne.layers.get_output( 208 | [l_pa_mu, l_pa_logsigma], {l_px_in: z}, 209 | deterministic=deterministic, 210 | ) 211 | 212 | if self.model == 'bernoulli': 213 | px_mu = lasagne.layers.get_output( 214 | l_px_mu, {l_px_in: z}, deterministic=deterministic) 215 | elif self.model == 'gaussian': 216 | px_mu, px_logsigma = lasagne.layers.get_output( 217 | [l_px_mu, l_px_logsigma], {l_px_in: z}, 218 | deterministic=deterministic, 219 | ) 220 | 221 | # entropy term 222 | log_qa_given_x = log_normal2(a, qa_mu, qa_logsigma).sum(axis=1) 223 | log_qz_given_x = log_bernoulli(z, qz_mu).sum(axis=1) 224 | log_qz_given_x_dgz = log_bernoulli(dg(z), qz_mu).sum(axis=1) 225 | log_qza_given_x = log_qz_given_x + log_qa_given_x 226 | 227 | # log-probability term 228 | z_prior = T.ones_like(z)*np.float32(0.5) 229 | log_pz = log_bernoulli(z, z_prior).sum(axis=1) 230 | log_px_given_z = log_bernoulli(x, px_mu).sum(axis=1) 231 | log_pa_given_z = log_normal2(a, pa_mu, pa_logsigma).sum(axis=1) 232 | 233 | log_pxz = log_pa_given_z + log_px_given_z + log_pz 234 | 235 | # save them for later 236 | if deterministic == False: 237 | self.log_pxz = log_pxz 238 | self.log_px_given_z = log_px_given_z 239 | self.log_pz = log_pz 240 | self.log_qza_given_x = log_qza_given_x 241 | self.log_qa_given_x = log_qa_given_x 242 | self.log_qz_given_x = log_qz_given_x 243 | self.log_qz_given_x_dgz = log_qz_given_x_dgz 244 | 245 | # return log_paxz, log_qza_given_x 246 | return log_pxz, log_qza_given_x 247 | 248 | def create_objectives(self, deterministic=False): 249 | # load probabilities 250 | log_paxz, log_qza_given_x = self._create_components(deterministic=deterministic) 251 | 252 | # compute the evidence lower bound 253 | elbo = T.mean(log_paxz - log_qza_given_x) 254 | log_qa_given_x = self.log_qa_given_x 255 | 256 | # we don't use a spearate accuracy metric right now 257 | return -elbo, T.mean(log_qa_given_x) 258 | 259 | def create_gradients(self, loss, deterministic=False): 260 | # load networks 261 | l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \ 262 | l_qa_mu, l_qa_logsigma, l_qz_mu, l_qz_logsigma, l_qa, l_qz, l_cv, c, v = self.network 263 | 264 | # load params 265 | p_params = lasagne.layers.get_all_params( 266 | [l_px_mu, l_pa_mu, l_pa_logsigma], trainable=True) 267 | qa_params = lasagne.layers.get_all_params(l_qa_mu, trainable=True) 268 | qz_params = lasagne.layers.get_all_params(l_qz, trainable=True) 269 | cv_params = lasagne.layers.get_all_params(l_cv, trainable=True) 270 | 271 | # load neural net outputs (probabilities have been precomputed) 272 | log_pxz, log_px_given_z, log_pz = self.log_pxz, self.log_px_given_z, self.log_pz 273 | log_qza_given_x = self.log_qza_given_x 274 | log_qz_given_x = self.log_qz_given_x 275 | log_qz_given_x_dgz = self.log_qz_given_x_dgz 276 | cv = T.addbroadcast(lasagne.layers.get_output(l_cv),1) 277 | 278 | # compute learning signals 279 | l0 = log_px_given_z + log_pz - log_qz_given_x - cv # NOTE: this disn't have q(a) 280 | l_avg, l_var = l0.mean(), l0.var() 281 | c_new = 0.8*c + 
0.2*l_avg 282 | v_new = 0.8*v + 0.2*l_var 283 | l = (l0 - c_new) / T.maximum(1, T.sqrt(v_new)) 284 | l_target = (l0 - c_new) / T.maximum(1, T.sqrt(v_new)) 285 | 286 | # compute grad wrt p 287 | p_grads = T.grad(-log_pxz.mean(), p_params) 288 | 289 | # compute grad wrt q_a 290 | elbo = T.mean(log_pxz - log_qza_given_x) 291 | qa_grads = T.grad(-elbo, qa_params) 292 | 293 | # compute grad wrt q_z 294 | qz_target = T.mean(dg(l_target) * log_qz_given_x_dgz) 295 | qz_grads = T.grad(-0.2*qz_target, qz_params) # 5x slower rate for q 296 | 297 | # compute grad of cv net 298 | cv_target = T.mean(l0**2) 299 | cv_grads = [0.2*g for g in T.grad(cv_target, cv_params)] 300 | 301 | # combine and clip gradients 302 | clip_grad = 1 303 | max_norm = 5 304 | grads = p_grads + qa_grads + qz_grads + cv_grads 305 | mgrads = lasagne.updates.total_norm_constraint(grads, max_norm=max_norm) 306 | cgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads] 307 | 308 | return cgrads 309 | 310 | def gen_samples(self, deterministic=False): 311 | s = self.inputs[-1] 312 | # put it through the decoder 313 | _, _, l_px_in, _ = self.input_layers 314 | l_px_mu = self.network[0] 315 | px_mu = lasagne.layers.get_output(l_px_mu, {l_px_in : s}) 316 | 317 | return px_mu 318 | 319 | def get_params(self): 320 | # load networks 321 | l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \ 322 | l_qa_mu, l_qa_logsigma, l_qz_mu, l_qz_logsigma, \ 323 | l_qa, l_qz, l_cv, c, v = self.network 324 | 325 | if self.model == 'gaussian': 326 | raise NotImplementedError('The code below needs to implement Gaussians') 327 | 328 | # load params 329 | p_params = lasagne.layers.get_all_params( 330 | [l_px_mu, l_pa_mu, l_pa_logsigma], trainable=True) 331 | qa_params = lasagne.layers.get_all_params(l_qa_mu, trainable=True) 332 | qz_params = lasagne.layers.get_all_params(l_qz, trainable=True) 333 | cv_params = lasagne.layers.get_all_params(l_cv, trainable=True) 334 | 335 | return p_params + qa_params + qz_params + cv_params 336 | 337 | def create_updates(self, grads, params, alpha, opt_alg, opt_params): 338 | # call super-class to generate SGD/ADAM updates 339 | grad_updates = Model.create_updates(self, grads, params, alpha, opt_alg, opt_params) 340 | 341 | # create updates for centering signal 342 | # load neural net outputs (probabilities have been precomputed) 343 | l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \ 344 | l_qa_mu, l_qa_logsigma, l_qz_mu, l_qz_logsigma, l_qa, l_qz, l_cv, c, v = self.network 345 | 346 | # load neural net outputs (probabilities have been precomputed) 347 | log_pxz, log_px_given_z, log_pz = self.log_pxz, self.log_px_given_z, self.log_pz 348 | log_qz_given_x = self.log_qz_given_x 349 | cv = T.addbroadcast(lasagne.layers.get_output(l_cv),1) 350 | 351 | # compute learning signals 352 | l = log_px_given_z + log_pz - log_qz_given_x - cv 353 | l_avg, l_var = l.mean(), l.var() 354 | c_new = 0.8*c + 0.2*l_avg 355 | v_new = 0.8*v + 0.2*l_var 356 | 357 | # compute update for centering signal 358 | cv_updates = {c: c_new, v: v_new} 359 | 360 | return OrderedDict(grad_updates.items() + cv_updates.items()) 361 | 362 | def create_llik(self): 363 | # load network input 364 | X = self.inputs[0] 365 | x = X.flatten(2) 366 | 367 | # duplicate entries to take into account multiple mc samples 368 | n_sam = self.n_sample 369 | n_out = x.shape[1] 370 | x = x.dimshuffle(0,'x',1).repeat(n_sam, axis=1).reshape((-1, n_out)) 371 | 372 | # load networks 373 | l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \ 374 | l_qa_mu, l_qa_logsigma, l_qz_mu, 
l_qz_logsigma, l_qa, l_qz, _, _, _ = self.network 375 | l_qa_in, l_qz_in, l_px_in, l_cv_in = self.input_layers 376 | 377 | # load network output 378 | qa_mu, qa_logsigma, a = lasagne.layers.get_output( 379 | [l_qa_mu, l_qa_logsigma, l_qa], 380 | ) 381 | qz_mu, z = lasagne.layers.get_output( 382 | [l_qz_mu, l_qz], 383 | {l_qz_in: a, l_qa_in: X}, 384 | ) 385 | pa_mu, pa_logsigma = lasagne.layers.get_output( 386 | [l_pa_mu, l_pa_logsigma], {l_px_in: z}, 387 | ) 388 | 389 | px_mu = lasagne.layers.get_output(l_px_mu, {l_px_in: z}) 390 | 391 | # entropy term 392 | log_qa_given_x = log_normal2(a, qa_mu, qa_logsigma).sum(axis=1) 393 | log_qz_given_x = log_bernoulli(z, qz_mu).sum(axis=1) 394 | log_qz_given_x_dgz = log_bernoulli(dg(z), qz_mu).sum(axis=1) 395 | log_qza_given_x = log_qz_given_x + log_qa_given_x 396 | 397 | # log-probability term 398 | z_prior = T.ones_like(z)*np.float32(0.5) 399 | log_pz = log_bernoulli(z, z_prior).sum(axis=1) 400 | log_px_given_z = log_bernoulli(x, px_mu).sum(axis=1) 401 | log_pa_given_z = log_normal2(a, pa_mu, pa_logsigma).sum(axis=1) 402 | 403 | t = log_pa_given_z + log_px_given_z + log_pz - log_qz_given_x - log_qa_given_x 404 | t = t.reshape([100, n_sam]) 405 | 406 | # compute loss 407 | llik = Tlogsumexp( t, axis=1) # (n_bat,) 408 | 409 | return T.mean(llik) -------------------------------------------------------------------------------- /models/rbm_dadgm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano.tensor as T 3 | from lasagne.layers import ( 4 | InputLayer, DenseLayer, ElemwiseSumLayer, 5 | reshape, flatten, get_all_params, get_output, 6 | ) 7 | from lasagne.updates import total_norm_constraint 8 | from lasagne.init import GlorotNormal, Normal 9 | from layers import GumbelSoftmaxSampleLayer, GaussianSampleLayer 10 | from distributions import log_bernoulli, log_normal2 11 | from gsm import GSM 12 | from rbm import RBM 13 | 14 | import theano 15 | import lasagne 16 | from theano.tensor.shared_randomstreams import RandomStreams 17 | 18 | 19 | class RBM_DADGM(GSM, RBM): 20 | """Restricted Boltzmann Machine trained with persistent contrastive 21 | divergence for learning p; Auxiliary Deep Generative Model trained 22 | using Gumbel Softmax Reparametrization for learning q. 
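    A rough intended usage, with hypothetical values (binarized MNIST-style input):

        model = RBM_DADGM(n_dim=28, n_out=10, n_chan=1,
                          opt_params={'lr': 1e-3, 'nb': 100})
        model.fit(X_train, Y_train, X_val, Y_val, n_epoch=10, n_batch=100)

    fit() alternates PCD updates of the RBM (the model p) with several epochs of
    Gumbel-Softmax training of the auxiliary inference network (the model q).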
23 | """ 24 | 25 | def __init__( 26 | self, n_dim, n_out, n_chan=1, n_superbatch=12800, opt_alg='adam', 27 | opt_params={'lr': 1e-3, 'b1': 0.9, 'b2': 0.99} 28 | ): 29 | self.numpy_rng = np.random.RandomState(1234) 30 | self.theano_rng = RandomStreams(self.numpy_rng.randint(2 ** 30)) 31 | 32 | self.n_dim = n_dim 33 | self.n_out = n_out 34 | self.n_superbatch = n_superbatch 35 | self.alg = opt_alg 36 | self.n_class = 10 37 | 38 | lr = opt_params.get('lr') 39 | n_batch = opt_params.get('nb') 40 | 41 | train_set_x = theano.shared( 42 | np.empty( 43 | (n_superbatch, n_chan, n_dim, n_dim), 44 | dtype=theano.config.floatX 45 | ), borrow=False, 46 | ) 47 | val_set_x = theano.shared(np.empty( 48 | (n_superbatch, n_chan, n_dim, n_dim), 49 | dtype=theano.config.floatX), 50 | borrow=False, 51 | ) 52 | train_set_y = theano.shared( 53 | np.empty( 54 | (n_superbatch,), 55 | dtype=theano.config.floatX 56 | ), borrow=False, 57 | ) 58 | val_set_y = theano.shared( 59 | np.empty( 60 | (n_superbatch,), 61 | dtype=theano.config.floatX 62 | ), borrow=False, 63 | ) 64 | train_set_y_int = T.cast(train_set_y, 'int32') 65 | val_set_y_int = T.cast(val_set_y, 'int32') 66 | 67 | train_rbm_px_mu = theano.shared( 68 | np.empty( 69 | (n_superbatch, self.n_aux), 70 | dtype=theano.config.floatX 71 | ), borrow=False, 72 | ) 73 | 74 | X = T.tensor4(dtype=theano.config.floatX) 75 | S = T.tensor3(dtype=theano.config.floatX) 76 | Y = T.ivector() 77 | px_mu = T.lscalar(dtype=config.floatX) 78 | idx1, idx2 = T.lscalar(), T.lscalar() 79 | alpha = T.scalar(dtype=theano.config.floatX) # learning rate 80 | self.inputs = (X, Y, idx1, idx2, S, px_mu) 81 | 82 | # ---------------------------- 83 | # Begin RBM-only 84 | self.rbm_network = self.create_rbm_model(n_dim, n_out, n_chan) 85 | persistent_chain = theano.shared( 86 | np.zeros( 87 | (n_batch, self.n_hidden), 88 | dtype=theano.config.floatX 89 | ), borrow=True, 90 | ) 91 | rbm_cost, rbm_acc, rbm_updates = self.get_rbm_objective_and_updates( 92 | alpha, lr=lr, persistent=persistent_chain, 93 | ) 94 | self.rbm_objectives = (rbm_cost, rbm_acc) 95 | self.rbm_train = theano.function( 96 | [idx1, idx2, alpha], 97 | [rbm_cost, rbm_acc], 98 | updates=rbm_updates, 99 | givens={ 100 | X: train_set_x[idx1:idx2], 101 | Y: train_set_y_int[idx1:idx2] 102 | }, 103 | on_unused_input='warn', 104 | ) 105 | # End RBM-only 106 | # ---------------------------- 107 | # Begin DADGM-only 108 | tau = theano.shared( 109 | np.float32(5.0), name='temperature', 110 | allow_downcast=True, borrow=False, 111 | ) 112 | self.tau = tau 113 | self.dadgm_network = self.create_dadgm_model( 114 | X, Y, n_dim, n_out, n_chan, 115 | ) 116 | dadgm_loss, dadgm_acc = self.create_dadgm_objectives(False) 117 | self.dadgm_objectives = (dadgm_loss, dadgm_acc) 118 | dadgm_params = self.get_dadgm_params() 119 | dadgm_grads = self.create_dadgm_gradients(dadgm_loss, False) 120 | dadgm_updates = self.create_dadgm_updates( 121 | dadgm_grads, dadgm_params, alpha, opt_alg, opt_params, 122 | ) 123 | self.dadgm_train = theano.function( 124 | [idx1, idx2, alpha], [dadgm_loss, dadgm_acc], 125 | updates=dadgm_updates, 126 | givens={ 127 | X: train_set_x[idx1:idx2], 128 | Y: train_set_y_int[idx1:idx2], 129 | px_mu: train_rbm_px_mu, 130 | }, 131 | on_unused_input='warn', 132 | ) 133 | self.dadgm_loss = theano.function( 134 | [X, Y], [dadgm_loss, dadgm_acc], 135 | on_unused_input='warn', 136 | ) 137 | # End DADGM-only 138 | # ---------------------------- 139 | self.n_batch = n_batch 140 | # parameters for sampling 141 | self.n_chain = 100 142 | 
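        # NOTE (added): at this point two compiled training functions coexist:
        #   self.rbm_train(idx1, idx2, alpha)   -- one PCD/CD-k update of (W, hbias, vbias)
        #   self.dadgm_train(idx1, idx2, alpha) -- one step on the Gumbel-Softmax ELBO of
        #                                          the auxiliary inference network, using
        #                                          the shared temperature self.tau
        # fit() below interleaves the two.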
143 | # save data variables 144 | self.train_set_x = train_set_x 145 | self.train_set_y = train_set_y 146 | self.val_set_x = val_set_x 147 | self.val_set_y = val_set_y 148 | self.train_rbm_px_mu = train_rbm_px_mu 149 | self.data_loaded = False 150 | 151 | def create_rbm_model(self, n_dim, n_out, n_chan=1, n_class=10): 152 | n_visible = n_chan*n_dim*n_dim # size of visible layer 153 | n_hidden = 500 # size of hidden layer 154 | k_steps = 15 # number of steps during CD/PCD 155 | 156 | initial_W = np.asarray( 157 | self.numpy_rng.uniform( 158 | low=-4 * np.sqrt(6. / (n_hidden + n_visible)), 159 | high=4 * np.sqrt(6. / (n_hidden + n_visible)), 160 | size=(n_visible, n_hidden) 161 | ), dtype=theano.config.floatX, 162 | ) 163 | W = theano.shared(value=initial_W, name='W', borrow=True) 164 | hbias = theano.shared( 165 | value=np.zeros( 166 | n_hidden, 167 | dtype=theano.config.floatX 168 | ), name='hbias', 169 | borrow=True, 170 | ) 171 | vbias = theano.shared( 172 | value=np.zeros( 173 | n_visible, 174 | dtype=theano.config.floatX 175 | ), name='vbias', 176 | borrow=True, 177 | ) 178 | 179 | self.W = W 180 | self.hbias = hbias 181 | self.vbias = vbias 182 | 183 | self.rbm_params = [self.W, self.hbias, self.vbias] 184 | 185 | # network params 186 | self.n_visible = n_visible 187 | self.n_hidden = n_hidden 188 | self.k_steps = k_steps 189 | 190 | return None 191 | 192 | def create_dadgm_model(self, X, Y, n_dim, n_out, n_chan=1, n_class=10): 193 | n_cat = 20 # number of categorical distributions 194 | n_lat = n_class*n_cat # latent stochastic variables 195 | n_aux = 10 # number of auxiliary variables 196 | n_hid = 500 # size of hidden layer in encoder/decoder 197 | n_in = n_out = n_dim * n_dim * n_chan 198 | tau = self.tau 199 | hid_nl = T.nnet.relu 200 | relu_shift = lambda av: T.nnet.relu(av+10)-10 201 | 202 | # create the encoder network 203 | # - create q(a|x) 204 | qa_net_in = InputLayer(shape=(None, n_in), input_var=X) 205 | qa_net = DenseLayer( 206 | qa_net_in, num_units=n_hid, 207 | W=GlorotNormal('relu'), b=Normal(1e-3), 208 | nonlinearity=hid_nl, 209 | ) 210 | qa_net_mu = DenseLayer( 211 | qa_net, num_units=n_aux, 212 | W=GlorotNormal(), b=Normal(1e-3), 213 | nonlinearity=None, 214 | ) 215 | qa_net_logsigma = DenseLayer( 216 | qa_net, num_units=n_aux, 217 | W=GlorotNormal(), b=Normal(1e-3), 218 | nonlinearity=relu_shift, 219 | ) 220 | qa_net_sample = GaussianSampleLayer(qa_net_mu, qa_net_logsigma) 221 | # - create q(z|a, x) 222 | qz_net_in = lasagne.layers.InputLayer((None, n_aux)) 223 | qz_net_a = DenseLayer( 224 | qz_net_in, num_units=n_hid, 225 | nonlinearity=hid_nl, 226 | ) 227 | qz_net_b = DenseLayer( 228 | qa_net_in, num_units=n_hid, 229 | nonlinearity=hid_nl, 230 | ) 231 | qz_net = ElemwiseSumLayer([qz_net_a, qz_net_b]) 232 | qz_net = DenseLayer( 233 | qz_net, num_units=n_hid, 234 | nonlinearity=hid_nl 235 | ) 236 | qz_net_mu = DenseLayer( 237 | qz_net, num_units=n_lat, 238 | nonlinearity=None, 239 | ) 240 | qz_net_mu = reshape(qz_net_mu, (-1, n_class)) 241 | qz_net_sample = GumbelSoftmaxSampleLayer(qz_net_mu, tau) 242 | qz_net_sample = reshape(qz_net_sample, (-1, n_cat, n_class)) 243 | # create the decoder network 244 | # - create p(x|z) 245 | px_net_in = lasagne.layers.InputLayer((None, n_cat, n_class)) 246 | # --- rest is created from RBM --- 247 | # - create p(a|z) 248 | pa_net = DenseLayer( 249 | flatten(px_net_in), num_units=n_hid, 250 | W=GlorotNormal('relu'), b=Normal(1e-3), 251 | nonlinearity=hid_nl, 252 | ) 253 | pa_net_mu = DenseLayer( 254 | pa_net, num_units=n_aux, 255 
| W=GlorotNormal(), 256 | b=Normal(1e-3), 257 | nonlinearity=None, 258 | ) 259 | pa_net_logsigma = DenseLayer( 260 | pa_net, num_units=n_aux, 261 | W=GlorotNormal(), 262 | b=Normal(1e-3), 263 | nonlinearity=relu_shift, 264 | ) 265 | # save network params 266 | self.n_cat = n_cat 267 | self.input_layers = (qa_net_in, qz_net_in, px_net_in) 268 | 269 | return pa_net_mu, pa_net_logsigma, qz_net_mu, \ 270 | qa_net_mu, qa_net_logsigma, qz_net_sample, qa_net_sample, 271 | 272 | def get_rbm_objective_and_updates(self, alpha, lr=0.1, persistent=None): 273 | X = self.inputs[0] 274 | x = X.reshape((-1, self.n_visible)) 275 | # compute positive phase 276 | pre_sigmoid_ph, ph_mean, ph_sample = self.sample_h_given_v(x) 277 | # for PCD, we initialize from the old state of the chain 278 | chain_start = persistent 279 | ( 280 | [ 281 | pre_sigmoid_nvs, 282 | nv_means, 283 | nv_samples, 284 | pre_sigmoid_nhs, 285 | nh_means, 286 | nh_samples 287 | ], 288 | updates 289 | ) = theano.scan( 290 | self.gibbs_hvh, 291 | outputs_info=[None, None, None, None, None, chain_start], 292 | n_steps=15, 293 | name="gibbs_hvh", 294 | ) 295 | 296 | chain_end = nv_samples[-1] 297 | cost = T.mean(self.free_energy(x))-T.mean(self.free_energy(chain_end)) 298 | # We must not compute the gradient through the gibbs sampling 299 | params = self.get_params() 300 | gparams = T.grad(cost, self.params, consider_constant=[chain_end]) 301 | gparams = [grad * alpha for grad in gparams] 302 | 303 | for gparam, param in zip(gparams, params): 304 | updates[param] = param - gparam * T.cast(lr, dtype=theano.config.floatX) 305 | 306 | updates[persistent] = nh_samples[-1] 307 | monitoring_cost = self.get_pseudo_likelihood_cost(x, updates) 308 | 309 | return monitoring_cost, monitoring_cost, updates 310 | 311 | def create_dadgm_objectives(self, deterministic=False): 312 | X = self.inputs[0] 313 | x = X.reshape((-1, self.n_out)) 314 | px_mu = self.inputs[-1] 315 | 316 | # load network params 317 | n_class = self.n_class 318 | n_cat = self.n_cat 319 | 320 | pa_net_mu, pa_net_logsigma, qz_net_mu, qa_net_mu, \ 321 | qa_net_logsigma, qz_net_sample, qa_net_sample = self.network 322 | qa_net_in, qz_net_in, px_net_in = self.input_layers 323 | 324 | qa_mu, qa_logsigma, qa_sample = get_output( 325 | [qa_net_mu, qa_net_logsigma, qa_net_sample], 326 | deterministic=deterministic, 327 | ) 328 | qz_mu, qz_sample = get_output( 329 | [qz_net_mu, qz_net_sample], 330 | {qz_net_in: qa_sample, qa_net_in: x}, 331 | deterministic=deterministic, 332 | ) 333 | pa_mu, pa_logsigma = get_output( 334 | [pa_net_mu, pa_net_logsigma], 335 | {px_net_in: qz_sample}, 336 | deterministic=deterministic, 337 | ) 338 | # Load this from RBM 339 | px_mu = self.inputs[-1] 340 | 341 | qz_given_ax = T.nnet.softmax(qz_mu) 342 | log_qz_given_ax = T.log(qz_given_ax + 1e-20) 343 | entropy = T.reshape( 344 | qz_given_ax * (log_qz_given_ax - T.log(1.0 / n_class)), 345 | (-1, n_cat, n_class), 346 | ) 347 | entropy = T.sum(entropy, axis=[1, 2]) 348 | 349 | log_px_given_z = log_bernoulli(x, px_mu).sum(axis=1) 350 | log_pa_given_z = log_normal2(qa_sample, pa_mu, pa_logsigma).sum(axis=1) 351 | log_paxz = log_pa_given_z + log_px_given_z 352 | 353 | # logp(z)+logp(a|z)-logq(a)-logq(z|a) 354 | elbo = T.mean(log_paxz - entropy) 355 | 356 | return -elbo, -T.mean(entropy) 357 | 358 | def create_dadgm_gradients(self, loss, deterministic=False): 359 | grads = GSM.create_gradients(self, loss, deterministic) 360 | 361 | # combine and clip gradients 362 | clip_grad, max_norm = 1, 5 363 | mgrads = 
total_norm_constraint(grads, max_norm=max_norm)
364 |         cgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
365 | 
366 |         return cgrads
367 | 
368 |     def create_dadgm_updates(self, grads, params, alpha, opt_alg, opt_params):
369 |         return GSM.create_updates(
370 |             self, grads, params, alpha, opt_alg, opt_params
371 |         )
372 | 
373 |     def get_rbm_params(self):
374 |         return self.rbm_params
375 | 
376 |     def get_dadgm_params(self):
377 |         pa_net_mu, pa_net_logsigma, qz_net_mu, \
378 |             qa_net_mu, qa_net_logsigma, \
379 |             qz_net_sample, qa_net_sample = self.dadgm_network
380 | 
381 |         p_params = get_all_params(
382 |             [pa_net_mu, pa_net_logsigma],
383 |             trainable=True,
384 |         )
385 |         qa_params = get_all_params(qa_net_sample, trainable=True)
386 |         qz_params = get_all_params(qz_net_sample, trainable=True)
387 | 
388 |         return p_params + qa_params + qz_params
389 | 
390 |     def fit_dadgm(
391 |         self, X_train, Y_train, X_val, Y_val,
392 |         n_epoch=10, n_batch=100, logname='run',
393 |     ):
394 |         alpha = 1.0  # learning rate, which can be adjusted later
395 |         tau0 = 1.0  # initial temp
396 |         MIN_TEMP = 0.5  # minimum temp
397 |         ANNEAL_RATE = 0.00003  # annealing rate
398 |         np_temp = tau0
399 |         n_data = len(X_train)
400 |         n_superbatch = self.n_superbatch
401 |         i = 1  # track # of train() executions
402 | 
403 |         n_flat_dim = np.prod(X_train.shape[1:])
404 |         X_train = X_train.reshape(-1, n_flat_dim)
405 |         X_val = X_val.reshape(-1, n_flat_dim)
406 | 
407 |         for epoch in range(n_epoch):
408 |             # In each epoch, we do a full pass over the training data:
409 |             train_batches, train_err, train_acc = 0, 0, 0
410 |             start_time = time.time()
411 | 
412 |             # iterate over superbatches to save time on GPU memory transfer
413 |             for X_sb, Y_sb in self.iterate_superbatches(
414 |                 X_train, Y_train, n_superbatch,
415 |                 datatype='train', shuffle=True,
416 |             ):
417 |                 for idx1, idx2 in iterate_minibatch_idx(len(X_sb), n_batch):
418 |                     dadgm_err, dadgm_acc = self.dadgm_train(idx1, idx2, alpha)
419 | 
420 |                     # anneal temp and learning rate
421 |                     if i % 1000 == 1:
422 |                         alpha *= 0.9
423 |                         np_temp = np.maximum(tau0*np.exp(-ANNEAL_RATE*i), MIN_TEMP)
424 |                         self.tau.set_value(np_temp, borrow=False)
425 | 
426 |                     # collect metrics
427 |                     i += 1
428 |                     train_batches += 1
429 |                     train_err += dadgm_err
430 |                     train_acc += dadgm_acc
431 | 
432 |     def fit(
433 |         self, X_train, Y_train, X_val, Y_val,
434 |         n_epoch=10, n_batch=100, logname='run'
435 |     ):
436 |         """Train 1 epoch of PCD to learn p, followed by K epochs
437 |         of DADGM via GSM to fit q to p.
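
        Per minibatch this amounts to (sketch of the loop below):

            rbm_err, rbm_acc = self.rbm_train(idx1, idx2, alpha)    # one PCD update of p
            self.fit_dadgm(X_train, Y_train, X_val, Y_val,
                           n_epoch=n_loop_q_per_p, n_batch=n_batch) # refit q to the new p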
438 | """ 439 | alpha = 1.0 # learning rate, which can be adjusted later 440 | n_data = len(X_train) 441 | n_superbatch = self.n_superbatch 442 | n_loop_q_per_p = 10 443 | 444 | n_flat_dim = np.prod(X_train.shape[1:]) 445 | X_train = X_train.reshape(-1, n_flat_dim) 446 | X_val = X_val.reshape(-1, n_flat_dim) 447 | 448 | for epoch in range(n_epoch): 449 | # In each epoch, we do a full pass over the training data: 450 | train_batches, train_err, train_acc = 0, 0, 0 451 | start_time = time.time() 452 | 453 | # iterate over superbatches to save time on GPU memory transfer 454 | for X_sb, Y_sb in self.iterate_superbatches( 455 | X_train, Y_train, n_superbatch, 456 | datatype='train', shuffle=True, 457 | ): 458 | for idx1, idx2 in iterate_minibatch_idx(len(X_sb), n_batch): 459 | rbm_err, rbm_acc = self.rbm_train(idx1, idx2, alpha) 460 | self.train_rbm_px_mu.set_value(rbm_err) 461 | # Fit n_loop epochs of fitting q to current p 462 | self.fit_dadgm( 463 | X_train, Y_train, X_val, Y_val, 464 | n_epoch=n_loop_q_per_p, n_batch=n_batch, logname=logname, 465 | ) 466 | 467 | train_batches += 1 468 | train_err += err 469 | train_acc += acc 470 | 471 | def hallucinate(self): 472 | pass 473 | 474 | def load_params(self, rbm_params, dadgm_params): 475 | """Load a given set of parameters""" 476 | self.rbm_params = rbm_params 477 | self.dadgm_params = dadgm_params 478 | 479 | def dump_params(self): 480 | """Dump a given set of parameters""" 481 | return (self.rbm_params, self.dadgm_params) 482 | -------------------------------------------------------------------------------- /models/variational_rbm.py: -------------------------------------------------------------------------------- 1 | import pdb 2 | import time 3 | import pickle 4 | import numpy as np 5 | import theano 6 | import lasagne 7 | import theano.tensor as T 8 | from theano.tensor.shared_randomstreams import RandomStreams 9 | from collections import OrderedDict 10 | 11 | from model import Model 12 | from abstractrbm import AbstractRBM 13 | from helpers import iterate_minibatch_idx, evaluate, log_metrics 14 | from helpers import Tlogsumexp, Tlogaddexp 15 | 16 | from theano.compile.nanguardmode import NanGuardMode 17 | 18 | create_zmat = lambda x, y: np.zeros((x, y)).astype(theano.config.floatX) 19 | create_vec = lambda y: np.zeros(y,).astype(theano.config.floatX) 20 | create_sca = lambda x: (np.cast[theano.config.floatX](x)).astype(theano.config.floatX) 21 | 22 | class VariationalRBM(AbstractRBM): 23 | """Restricted Boltzmann Machine with VI""" 24 | def __init__( 25 | self, n_dim, n_out, n_chan=1, n_superbatch=12800, opt_alg='adam', 26 | opt_params={'lr': 1e-3, 'b1': 0.9, 'b2': 0.99} 27 | ): 28 | """RBM constructor. 29 | Defines the parameters of the model along with 30 | basic operations for inferring hidden from visible (and vice-versa), 31 | as well as for performing CD updates. 
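
        In this variational variant there is no CD/PCD chain; instead a mixture of
        n_qk fully factorized Bernoulli distributions q(x) (parameterized by Phi)
        is trained alongside the RBM, and log Z is estimated by importance sampling
        with n_mc samples per mixture component, roughly
        logsumexp_s[log F(x_s) - log q(x_s)] - log(n_qk * n_mc),
        as set up in create_objectives below.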
32 | """ 33 | self.numpy_rng = np.random.RandomState(1234) 34 | self.theano_rng = RandomStreams(self.numpy_rng.randint(2 ** 30)) 35 | self.create_mat = lambda x, y: self.numpy_rng.normal(0, 0.01, (x, y)).astype(theano.config.floatX) 36 | 37 | # save config 38 | n_batch = opt_params.get('nb') 39 | self.n_hidden = 100 40 | self.n_visible = n_chan*n_dim*n_dim # size of visible layer 41 | self.n_batch = n_batch 42 | self.n_qk = 10 # num of components in MoB used of q 43 | self.n_mc = 30 # num of monte carlo samples from each MoB component 44 | 45 | self.n_dim = n_dim 46 | self.n_out = n_out 47 | self.n_superbatch = n_superbatch 48 | self.alg = opt_alg 49 | 50 | # set up general RBM methods 51 | AbstractRBM.__init__(self, n_dim, n_chan, n_out, n_superbatch, opt_alg, opt_params) 52 | 53 | # create updates 54 | alpha = T.scalar(dtype=theano.config.floatX) # learning rate 55 | 56 | # save config 57 | self.n_class = 2 58 | self.n_dim = n_dim 59 | self.n_out = n_out 60 | 61 | 62 | self.n_components = self.n_qk 63 | self.n_samples = self.n_mc 64 | self.n_tot_samples = self.n_samples*self.n_components 65 | 66 | 67 | # create input variables 68 | D, idx1, idx2 = self.create_inputs() 69 | 70 | # create model 71 | self.network = self.create_model() 72 | 73 | # create objectives 74 | loglik, plik = self.create_objectives(D) 75 | 76 | # create gradients 77 | dL_Theta, dE_Theta, dlogZ_Theta, dL_Phi = self.create_gradients() 78 | grads = dL_Theta, dE_Theta, dlogZ_Theta, dL_Phi 79 | 80 | # create updates 81 | uL_Theta, uL_Phi, avg_updates, avg_Theta_updates \ 82 | = self.create_updates(grads, None, alpha, opt_alg, opt_params) 83 | 84 | # logF_avg, Z_avg = self.create_llik_estimate(D) 85 | 86 | mode = NanGuardMode(nan_is_error=True, inf_is_error=False, big_is_error=False) 87 | mode = None 88 | 89 | common_update1 = OrderedDict(avg_updates.items() + uL_Phi.items()) 90 | self.train_q = theano.function([idx1, idx2], [loglik, plik], 91 | updates=common_update1, mode=mode, 92 | givens={D: self.train_set_x[idx1:idx2]}) 93 | 94 | common_update2 = OrderedDict(avg_Theta_updates.items() + uL_Theta.items()) 95 | self.train_p = theano.function([idx1, idx2], [loglik, plik], 96 | updates=common_update2, mode=mode, on_unused_input='warn', 97 | givens={D: self.train_set_x[idx1:idx2]}) 98 | # self.llik = theano.function([D], logF_avg - T.log(Z_avg), mode=mode) 99 | 100 | common_update3 = OrderedDict(common_update1.items() + common_update2.items()) 101 | self.train = theano.function([idx1, idx2], [loglik, plik], 102 | updates=common_update3, mode=mode, 103 | givens={D: self.train_set_x[idx1:idx2]}) 104 | 105 | def create_inputs(self): 106 | # allocate symbolic variables for the data 107 | idx1, idx2 = T.lscalar(), T.lscalar() 108 | D = T.tensor4('D', dtype=theano.config.floatX) # data matrixX) # learning rate 109 | # self.Z = T.tensor2(dtype=theano.config.floatX) 110 | return D, idx1, idx2 111 | 112 | def _free_energy(self, X, avg=False): 113 | bv_vec = self.vbias.reshape((self.n_visible,1)) 114 | bh_vec = self.hbias.reshape((self.n_hidden,1)) 115 | 116 | W = self.W_avg if avg else self.W 117 | W = W.T 118 | 119 | logF = T.dot(bv_vec.T, X) \ 120 | + T.sum(T.log(1. 
+ T.exp(bh_vec + T.dot(W, X))), axis=0) 121 | 122 | return logF 123 | 124 | @property 125 | def Z(self): 126 | return np.exp(self.logZ_avg.get_value()) 127 | 128 | def create_model(self): 129 | self.W = theano.shared(self.create_mat(self.n_visible, self.n_hidden)) 130 | self.vbias = theano.shared(create_vec(self.n_visible)) 131 | self.hbias = theano.shared(create_vec(self.n_hidden)) 132 | self.params_p = [self.W, self.vbias, self.hbias] 133 | 134 | # running averages 135 | self.inner_it = theano.shared(create_sca(0.0)) 136 | self.loga = theano.shared(create_sca(0.0)) 137 | self.logZ_avg = theano.shared(create_sca(0.0)) 138 | self.grad_logZ_avg = [ 139 | theano.shared(create_zmat(self.n_visible, self.n_hidden)), 140 | theano.shared(create_vec(self.n_visible)), 141 | theano.shared(create_vec(self.n_hidden)) 142 | ] 143 | self.grad_p_avg = [ 144 | theano.shared(create_zmat(self.n_visible, self.n_hidden)), 145 | theano.shared(create_vec(self.n_visible)), 146 | theano.shared(create_vec(self.n_hidden)) 147 | ] 148 | self.W_avg = theano.shared(self.W.get_value().copy()) 149 | self.vbias_avg = theano.shared(self.vbias.get_value().copy()) 150 | self.hbias_avg = theano.shared(self.hbias.get_value().copy()) 151 | 152 | self.bit_i_idx = theano.shared(value=0, name='bit_i_idx') 153 | 154 | # create approximating distribution 155 | create_mat = lambda x, y: self.numpy_rng.normal(0, 0.01, (x, y)).astype(theano.config.floatX) 156 | self.pi = np.full( (self.n_qk,1), 1./float(self.n_qk) ).astype(theano.config.floatX) 157 | self.Phi = theano.shared(create_mat(self.n_visible,self.n_qk)) 158 | self.params_q = [self.Phi] 159 | 160 | # create symbolic variables representing the samples 161 | Qk = 1./(1+T.exp(-self.Phi)) # (n,k) component probabilities 162 | Qt = T.tile(T.reshape(Qk, (self.n_visible, self.n_qk, 1)), (1, 1, self.n_mc)) 163 | Qs = self.theano_rng.binomial(size=Qt.shape, p=Qt, dtype=Qt.dtype) 164 | self.q_samples = Qs 165 | self.Qk = Qk 166 | 167 | return None 168 | 169 | def _create_components(self, D): 170 | # shorthands 171 | k, t, s = self.n_components, self.n_tot_samples, self.n_samples 172 | n = self.n_visible 173 | 174 | ## compute bound objective 175 | 176 | X = self.q_samples 177 | D = D.reshape((-1, self.n_visible)).T 178 | X2 = T.flatten(X, outdim=2) 179 | 180 | # compute the probability of each datapoint under each component 181 | 182 | # reshape tensors 183 | QQ = self.Qk.reshape((n,k,1)) 184 | XX = X2.dimshuffle(0,'x',1) 185 | self.XX = XX 186 | T.addbroadcast(QQ, 2) 187 | T.addbroadcast(XX, 1) 188 | 189 | # compute the probabilities (log version is 2x slower...) 
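        # NOTE (added): shapes here are QQ: (n_visible, n_qk, 1) and
        # XX: (n_visible, 1, n_qk*n_mc); broadcasting the Bernoulli likelihood
        # QQ*XX + (1-QQ)*(1-XX) and taking the product over axis 0 gives the
        # probability of every sample under every mixture component,
        # Q_tot: (n_qk, n_qk*n_mc). The commented-out log-space variant below is
        # numerically safer but was found to be ~2x slower; it can be swapped in
        # if Q underflows.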
190 | # logQ_prob = T.log(QQ*XX + (1.-QQ)*(1.-XX)) # (n, k, t) 191 | # logQ_tot = T.sum(logQ_prob, axis=0) # (k, t) 192 | # logQ = Tlogsumexp(np.log(self.pi_uniform) + logQ_tot, axis=0) # (t,) 193 | Q_tot = T.prod(QQ*XX + (1-QQ)*(1-XX), axis=0) 194 | Q = T.sum(self.pi * Q_tot, axis=0) # (t,) 195 | logQ = T.log(Q) 196 | 197 | # compute energies of the samples, dim=(1, n_tot_samples) 198 | logF = self._free_energy(X2) 199 | 200 | # free energy of the data 201 | logF_D = self._free_energy(D) 202 | 203 | self._components = (logF, logQ, logF_D) 204 | 205 | def create_objectives(self, D, deterministic=False): 206 | self._create_components(D) 207 | (logF, logQ, logF_D_vec) = self._components 208 | t = self.n_qk * self.n_mc 209 | 210 | # set up bound 211 | t = self.n_qk * self.n_mc 212 | logFQ = logF - logQ # (t,) 213 | logFQ2 = 2.*logF - 2.*logQ # (t,) 214 | logFQ = logFQ.flatten() 215 | logFQ2 = logFQ2.flatten() 216 | logZ = Tlogsumexp(logFQ, axis=0) - T.log(t) 217 | logbnd = Tlogsumexp(logFQ2, axis=0) - T.log(t) 218 | 219 | logF_D = logF_D_vec.mean() 220 | 221 | loglik = logF_D - 0.5*logZ 222 | loglikbnd = logF_D - 0.5*logbnd 223 | 224 | self.logbnd = logbnd 225 | self.logZ = logZ 226 | 227 | # set up pseudolikelihood 228 | plik = self.create_pseudoliklihood(D) 229 | 230 | return loglik, plik 231 | 232 | def get_params(self): 233 | return [self.W, self.vbias, self.hbias, self.Phi] 234 | 235 | def get_params_p(self): 236 | return [self.W, self.vbias, self.hbias] 237 | 238 | def get_params_q(self): 239 | return [self.Phi] 240 | 241 | def create_gradients(self): 242 | k, s, n = self.n_qk, self.n_mc, self.n_visible 243 | (logF, logQ, logF_D_vec) = self._components 244 | logbnd, logZ = self.logbnd, self.logZ 245 | logF_D = logF_D_vec.mean() 246 | X = self.q_samples 247 | QQ = self.Qk.reshape((n,k,1)) 248 | 249 | logF2 = 2.*logF 250 | logQ2 = 2.*logQ 251 | logFQ = logF - logQ # (t,) 252 | S = T.exp(logFQ - logZ) # (t,) 253 | S2 = T.exp(logF2 - logbnd - logQ2) # (t,) 254 | target = T.mean(S * logF) 255 | 256 | # get grads wrt params_p 257 | dlogB_W, dlogB_bv, dlogB_bh = theano.grad(logbnd, [self.W, self.vbias, self.hbias]) 258 | dlogZ_W, dlogZ_bv, dlogZ_bh = theano.grad(target, [self.W, self.vbias, self.hbias], consider_constant=[S]) 259 | dE_W, dE_bv, dE_bh = theano.grad(logF_D, [self.W, self.vbias, self.hbias]) 260 | 261 | # get graps wrt params_q 262 | loga = - logbnd 263 | cv = T.exp(logZ + 0.5*loga) 264 | logF2a = logF2 + loga 265 | F2a = T.exp( logF2a ) 266 | Q2 = T.exp(logQ2) 267 | cv_adj = cv**2. 
* Q2 268 | Scv = (F2a - cv_adj)/Q2 269 | # S = (F2a)/Q2 270 | # S = FQ2a 271 | 272 | Dq = X - QQ # (n, K, s) 273 | Dq *= (-Scv).reshape((1,k,s)) 274 | dlogB_Phi = T.mean(Dq, axis=2) * self.pi.reshape(1,k) # (n, k) 275 | 276 | from theano.gradient import disconnected_grad as dg 277 | dlogB_target = T.mean(-dg(S2) * logQ) 278 | dlogB_Phi = theano.grad(dlogB_target, self.Phi, consider_constant=[self.XX]) 279 | 280 | # log-likelihood / bound gradients (combine the above) 281 | 282 | # dL_Phi = -0.5*a*dB_Phi 283 | dL_Phi = -0.5*dlogB_Phi 284 | dL_W = dE_W - dlogZ_W 285 | dL_bv = dE_bv - dlogZ_bv 286 | dL_bh = dE_bh - dlogZ_bh 287 | # dL_W = dE_W - 0.5 * dlogB_W 288 | # dL_bv = dE_bv - 0.5 * dlogB_bv 289 | # dL_bh = dE_bh - 0.5 * dlogB_bh 290 | 291 | dL_Theta = [dL_W, dL_bv, dL_bh] 292 | dlogZ_Theta = [dlogZ_W, dlogZ_bv, dlogZ_bh] 293 | dE_Theta = [dE_W, dE_bv, dE_bh] 294 | 295 | return dL_Theta, dlogZ_Theta, dE_Theta, dL_Phi 296 | 297 | def create_pseudoliklihood(self, X): 298 | """Stochastic approximation to the pseudo-likelihood""" 299 | # X = self.inputs[0] 300 | X = X.reshape((-1, self.n_visible)) 301 | 302 | # index of bit i in expression p(x_i | x_{\i}) 303 | bit_i_idx = self.bit_i_idx 304 | # bit_i_idx = theano.shared(value=0, name='bit_i_idx') 305 | 306 | # binarize the input image by rounding to nearest integer 307 | xi = T.round(X) 308 | 309 | # calculate free energy for the given bit configuration 310 | fe_xi = self.free_energy(xi) 311 | 312 | # flip bit x_i of matrix xi and preserve all other bits x_{\i} 313 | # Equivalent to xi[:,bit_i_idx] = 1-xi[:, bit_i_idx], but assigns 314 | # the result to xi_flip, instead of working in place on xi. 315 | xi_flip = T.set_subtensor(xi[:, bit_i_idx], 1 - xi[:, bit_i_idx]) 316 | 317 | # calculate free energy with bit flipped 318 | fe_xi_flip = self.free_energy(xi_flip) 319 | 320 | # equivalent to e^(-FE(x_i)) / (e^(-FE(x_i)) + e^(-FE(x_{\i}))) 321 | cost = T.mean(self.n_visible * T.log(T.nnet.sigmoid(fe_xi_flip-fe_xi))) 322 | 323 | return cost 324 | 325 | def create_updates(self, grads, params, alpha, opt_alg, opt_params): 326 | # def create_updates(self, alpha, opt_alg, opt_params): 327 | # scaled_grads = [grad * alpha for grad in grads] 328 | # scaled_grads = [T.clip(grad, -1., 1.) for grad in scaled_grads] 329 | dL_Theta, dE_Theta, dlogZ_Theta, dL_Phi = grads 330 | 331 | # create update for main parameters 332 | # TODO: Scale grads with alpha! 
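        # NOTE (added): the Theta update below runs Adam on the running-average
        # gradients self.grad_p_avg (filled in by create_avg_updates), while the
        # variational parameters Phi take plain SGD steps on -dL_Phi. The lr/b1/b2
        # read from opt_params on the next lines are currently shadowed by the
        # hard-coded values passed to lasagne.updates.adam / sgd.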
333 | lr = opt_params.get('lr', 1e-3) 334 | b1, b2 = opt_params.get('b1', 0.9), opt_params.get('b2', 0.999) 335 | uL_Theta = lasagne.updates.adam(self.grad_p_avg, self.params_p, learning_rate=1e-3, beta1=7e-1, beta2=1e-3) 336 | # learning_rate=lr, beta1=b1, beta2=b2) 337 | uL_Phi = lasagne.updates.sgd([-dL_Phi], [self.Phi], learning_rate=5e-1) # TODO: anneal lr 338 | 339 | # create updates for the running averages 340 | avg_updates = self.create_avg_updates(self.logZ, dlogZ_Theta, dL_Theta) 341 | 342 | # avg Theta updates 343 | avg_Theta_updates = OrderedDict() 344 | avg_Theta_updates[self.W_avg] = 0.95*self.W_avg + 0.05*self.W 345 | avg_Theta_updates[self.vbias_avg] = 0.95*self.vbias_avg + 0.05*self.vbias 346 | avg_Theta_updates[self.hbias_avg] = 0.95*self.hbias_avg + 0.05*self.hbias 347 | 348 | # increment bit_i_idx % number as part of updates 349 | uL_Phi[self.bit_i_idx] = (self.bit_i_idx + 1) % self.n_visible 350 | 351 | return uL_Theta, uL_Phi, avg_updates, avg_Theta_updates 352 | 353 | def create_avg_updates(self, logZ, dlogZ_Theta, dL_Theta): 354 | inner_it = self.inner_it 355 | avg_updates = OrderedDict() 356 | 357 | new_logZ_estimate = Tlogaddexp( 358 | self.logZ_avg + T.log((inner_it) / (inner_it+1)), 359 | logZ - T.log(inner_it+1.) 360 | ) 361 | # self.logZ_avg * ((inner_it) / (inner_it+1)) + logZ / (inner_it+1) 362 | 363 | avg_updates[inner_it] = inner_it + 1. 364 | avg_updates[self.logZ_avg] = new_logZ_estimate 365 | avg_updates[self.loga] = -2.*new_logZ_estimate # 1./(new_Z_estimate**2) 366 | 367 | for g, g_avg in zip(dL_Theta, self.grad_p_avg): 368 | avg_updates[g_avg] = g_avg * ((inner_it) / (inner_it+1)) - g / (inner_it+1) 369 | 370 | return avg_updates 371 | 372 | def reset_averages(self): 373 | self.inner_it.set_value(0.0) 374 | 375 | def free_energy(self, v_sample): 376 | """Function to compute the free energy""" 377 | wx_b = T.dot(v_sample, self.W) + self.hbias 378 | vbias_term = T.dot(v_sample, self.vbias) 379 | hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=1) 380 | return -hidden_term - vbias_term 381 | 382 | def pseudolikelihood(self, data): 383 | self.rbm.components_ = self.W.get_value().T 384 | self.rbm.intercept_visible_ = self.vbias.get_value() 385 | self.rbm.intercept_hidden_ = self.hbias.get_value() 386 | return self.rbm.score_samples(data).mean() 387 | 388 | def dump(self, fname): 389 | """Pickle weights to a file""" 390 | all_params = [self.W.get_value(), self.vbias.get_value(), self.hbias.get_value(), self.Phi.get_value()] 391 | with open(fname, 'w') as f: 392 | pickle.dump(all_params, f) 393 | 394 | def load(self, fname): 395 | """Load pickled network""" 396 | with open(fname) as f: 397 | params = pickle.load(f) 398 | self.load_params(params) 399 | 400 | def load_params(self, params): 401 | """Load a given set of parameters""" 402 | W, vbias, hbias, Phi = params 403 | self.W.set_value(W) 404 | self.vbias.set_value(vbias) 405 | self.hbias.set_value(hbias) 406 | self.Phi.set_value(Phi) 407 | 408 | def fit( 409 | self, X_train, Y_train, X_val, Y_val, 410 | n_epoch=10, n_batch=100, logname='run' 411 | ): 412 | """Train the model""" 413 | alpha = 1.0 # learning rate, which can be adjusted later 414 | n_data = len(X_train) 415 | n_superbatch = self.n_superbatch 416 | 417 | for epoch in range(n_epoch): 418 | # In each epoch, we do a full pass over the training data: 419 | train_batches, train_err, train_acc = 0, 0, 0 420 | start_time = time.time() 421 | 422 | # if epoch > 100: alpha = 0.5 423 | 424 | # iterate over superbatches to save time on GPU memory 
transfer 425 | for X_sb, Y_sb in self.iterate_superbatches( 426 | X_train, Y_train, n_superbatch, 427 | datatype='train', shuffle=True, 428 | ): 429 | for idx1, idx2 in iterate_minibatch_idx(len(X_sb), n_batch): 430 | # q steps 431 | err, acc = self.train_q(idx1, idx2) 432 | err, acc = self.train_q(idx1, idx2) 433 | err, acc = self.train_q(idx1, idx2) 434 | 435 | # p steps 436 | err, acc = self.train_p(idx1, idx2) 437 | self.reset_averages() 438 | 439 | # err, acc = self.train(idx1, idx2) 440 | # self.reset_averages() 441 | # collect metrics 442 | err = self.pseudolikelihood(X_sb[idx1:idx2].reshape(-1, 64)) 443 | train_batches += 1 444 | train_err += err 445 | train_acc += acc 446 | 447 | # print train_batches, err, acc 448 | 449 | if train_batches % 100 == 0: 450 | n_total = epoch * n_data + n_batch * train_batches 451 | metrics = [ 452 | n_total, train_err / train_batches, 453 | train_acc / train_batches, 454 | ] 455 | log_metrics(logname, metrics) 456 | 457 | print "Epoch {} of {} took {:.3f}s ({} minibatches)".format( 458 | epoch + 1, n_epoch, time.time() - start_time, train_batches) 459 | 460 | print " training:\t\t{:.6f}\t{:.6f}".format( 461 | train_err / train_batches, train_acc / train_batches) 462 | 463 | # reserve N of training data points to kick start hallucinations 464 | self.dump(logname + '.pkl') 465 | hallu_i = self.numpy_rng.randint(n_data - self.n_chain) 466 | self.hallu_set = np.asarray( 467 | X_train[hallu_i:hallu_i + self.n_chain], 468 | dtype=theano.config.floatX 469 | ) -------------------------------------------------------------------------------- /models/udadgm.py: -------------------------------------------------------------------------------- 1 | import time 2 | import pickle 3 | import numpy as np 4 | from collections import OrderedDict 5 | 6 | import theano 7 | import theano.tensor as T 8 | import lasagne 9 | from theano.gradient import disconnected_grad as dg 10 | 11 | from model import Model 12 | from layers.shape import RepeatLayer 13 | from layers import ( 14 | GaussianSampleLayer, 15 | BernoulliSampleLayer, 16 | GaussianMultiSampleLayer, 17 | ) 18 | from helpers import ( 19 | iterate_minibatch_idx, 20 | iterate_minibatches, 21 | evaluate, 22 | log_metrics 23 | ) 24 | from distributions import log_bernoulli, log_normal2 25 | 26 | from helpers import Tlogsumexp 27 | from theano.tensor.shared_randomstreams import RandomStreams 28 | 29 | from rbm import RBM 30 | from aux_variational_rbm import AuxiliaryVariationalRBM 31 | 32 | # ---------------------------------------------------------------------------- 33 | 34 | class UDADGM(Model): 35 | """Auxiliary Deep Generative Model (unsupervised version) with discrete z""" 36 | def __init__( 37 | self, n_dim, n_out, n_chan=1, n_superbatch=12800, model='bernoulli', 38 | opt_alg='adam', opt_params={'lr' : 1e-3, 'b1': 0.9, 'b2': 0.99} 39 | ): 40 | # save model that wil be created 41 | self.model = model 42 | self.n_sample = 3 # adjustable parameter, though 1 works best in practice 43 | Model.__init__(self, n_dim, n_chan, n_out, n_superbatch, opt_alg, opt_params) 44 | 45 | # random number generators 46 | self.numpy_rng = np.random.RandomState(1234) 47 | self.theano_rng = RandomStreams(self.numpy_rng.randint(2 ** 30)) 48 | 49 | (X, Y, idx1, idx2, S) = self.inputs 50 | loss, acc = self.objectives 51 | self.train = theano.function( 52 | [idx1, idx2, self.alpha], [loss, self.log_e, self.z], updates=self.updates, 53 | givens={X: self.train_set_x[idx1:idx2], Y: self.train_set_y[idx1:idx2]}, 54 | on_unused_input='warn' 55 | ) 
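        # NOTE (added): compared to DADGM above, the factorized Bernoulli prior on z
        # is replaced by an RBM prior: the ELBO uses log_e = -self.rbm.free_energy(z)
        # in place of log p(z), and fit() below interleaves gradient steps on this
        # model with q-steps (self.rbm.train_q0) and p-steps (self.rbm.train_p_batch)
        # on the AuxiliaryVariationalRBM that defines the prior.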
56 | llik = self.create_llik() 57 | self.loss = theano.function( 58 | [X, Y], [loss, llik], 59 | on_unused_input='warn', 60 | ) 61 | 62 | def create_model(self, X, Y, n_dim, n_out, n_chan=1): 63 | # params 64 | n_lat = 64 # latent stochastic variables 65 | n_aux = 10 # auxiliary variables 66 | n_hid = 500 # size of hidden layer in encoder/decoder 67 | n_sam = self.n_sample # number of monte-carlo samples 68 | n_hid_cv = 500 # size of hidden layer in control variate net 69 | n_out = n_dim * n_dim * n_chan # total dimensionality of ouput 70 | hid_nl = lasagne.nonlinearities.tanh 71 | relu_shift = lambda av: T.nnet.relu(av+10)-10 # for numerical stability 72 | 73 | # self.rbm = RBM(n_dim=int(np.sqrt(n_lat)), n_out=10, n_chan=1, opt_params={'nb':128}) 74 | self.rbm = AuxiliaryVariationalRBM(n_dim=int(np.sqrt(n_lat)), n_out=10, n_chan=1, opt_params={'nb':128*n_sam}) 75 | 76 | # create the encoder network 77 | # create q(a|x) 78 | l_qa_in = lasagne.layers.InputLayer( 79 | shape=(None, n_chan, n_dim, n_dim), 80 | input_var=X, 81 | ) 82 | l_qa_hid = lasagne.layers.DenseLayer( 83 | l_qa_in, num_units=n_hid, 84 | nonlinearity=hid_nl, 85 | ) 86 | l_qa_mu = lasagne.layers.DenseLayer( 87 | l_qa_in, num_units=n_aux, 88 | nonlinearity=None, 89 | ) 90 | l_qa_logsigma = lasagne.layers.DenseLayer( 91 | l_qa_in, num_units=n_aux, 92 | nonlinearity=relu_shift, 93 | ) 94 | # repeatedly sample 95 | l_qa_mu = lasagne.layers.ReshapeLayer( 96 | RepeatLayer(l_qa_mu, n_ax=1, n_rep=n_sam), 97 | shape=(-1, n_aux), 98 | ) 99 | l_qa_logsigma = lasagne.layers.ReshapeLayer( 100 | RepeatLayer(l_qa_logsigma, n_ax=1, n_rep=n_sam), 101 | shape=(-1, n_aux), 102 | ) 103 | l_qa = GaussianSampleLayer(l_qa_mu, l_qa_logsigma) 104 | 105 | # create q(z|a,x) 106 | l_qz_in = lasagne.layers.InputLayer((None, n_aux)) 107 | l_qz_hid1a = lasagne.layers.DenseLayer( 108 | l_qz_in, num_units=n_hid, 109 | nonlinearity=hid_nl, 110 | ) 111 | l_qz_hid1b = lasagne.layers.DenseLayer( 112 | l_qa_in, num_units=n_hid, 113 | nonlinearity=hid_nl, 114 | ) 115 | l_qz_hid1b = lasagne.layers.ReshapeLayer( 116 | RepeatLayer(l_qz_hid1b, n_ax=1, n_rep=n_sam), 117 | shape=(-1, n_hid), 118 | ) 119 | l_qz_hid2 = lasagne.layers.ElemwiseSumLayer([l_qz_hid1a, l_qz_hid1b]) 120 | l_qz_hid3 = lasagne.layers.DenseLayer( 121 | l_qz_hid2, num_units=n_hid, 122 | nonlinearity=hid_nl, 123 | ) 124 | l_qz_mu = lasagne.layers.DenseLayer( 125 | l_qz_hid3, num_units=n_lat, 126 | nonlinearity=T.nnet.sigmoid, 127 | ) 128 | l_qz = BernoulliSampleLayer(l_qz_mu) 129 | l_qz_logsigma = None 130 | 131 | # create the decoder network 132 | # create p(x|z) 133 | l_px_in = lasagne.layers.InputLayer((None, n_lat)) 134 | l_px_hid = lasagne.layers.DenseLayer( 135 | l_px_in, num_units=n_hid, 136 | W=lasagne.init.GlorotUniform(), 137 | nonlinearity=hid_nl, 138 | ) 139 | l_px_mu, l_px_logsigma = None, None 140 | 141 | if self.model == 'bernoulli': 142 | l_px_mu = lasagne.layers.DenseLayer( 143 | l_px_hid, num_units=n_out, 144 | nonlinearity=lasagne.nonlinearities.sigmoid, 145 | ) 146 | elif self.model == 'gaussian': 147 | l_px_mu = lasagne.layers.DenseLayer( 148 | l_px_hid, num_units=n_out, 149 | nonlinearity=None, 150 | ) 151 | l_px_logsigma = lasagne.layers.DenseLayer( 152 | l_px_hid, num_units=n_out, 153 | nonlinearity=relu_shift, 154 | ) 155 | 156 | # create p(a|z) 157 | l_pa_hid = lasagne.layers.DenseLayer( 158 | l_px_in, num_units=n_hid, 159 | nonlinearity=hid_nl, 160 | ) 161 | l_pa_mu = lasagne.layers.DenseLayer( 162 | l_pa_hid, num_units=n_aux, 163 | nonlinearity=None, 164 | ) 165 | 
l_pa_logsigma = lasagne.layers.DenseLayer( 166 | l_pa_hid, num_units=n_aux, 167 | W=lasagne.init.GlorotNormal(), 168 | b=lasagne.init.Normal(1e-3), 169 | nonlinearity=relu_shift, 170 | ) 171 | 172 | # create control variate (baseline) network 173 | l_cv_in = lasagne.layers.InputLayer( 174 | shape=(None, n_chan, n_dim, n_dim), 175 | input_var=X, 176 | ) 177 | l_cv_hid = lasagne.layers.DenseLayer( 178 | l_cv_in, num_units=n_hid_cv, 179 | nonlinearity=hid_nl, 180 | ) 181 | l_cv = lasagne.layers.DenseLayer( 182 | l_cv_hid, num_units=1, 183 | nonlinearity=None, 184 | ) 185 | 186 | # create variables for centering signal 187 | c = theano.shared(np.zeros((1,1), dtype=np.float64), broadcastable=(True,True)) 188 | v = theano.shared(np.zeros((1,1), dtype=np.float64), broadcastable=(True,True)) 189 | 190 | # store certain input layers for downstream (quick hack) 191 | self.input_layers = (l_qa_in, l_qz_in, l_px_in, l_cv_in) 192 | self.n_lat = n_lat 193 | self.n_lat2 = int(np.sqrt(n_lat)) 194 | self.n_hid = n_hid 195 | 196 | return l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \ 197 | l_qa_mu, l_qa_logsigma, l_qz_mu, l_qz_logsigma, \ 198 | l_qa, l_qz, l_cv, c, v 199 | 200 | def _create_components(self, deterministic=False): 201 | # load network input 202 | X = self.inputs[0] 203 | x = X.flatten(2) 204 | 205 | # duplicate entries to take into account multiple mc samples 206 | n_sam = self.n_sample 207 | n_out = x.shape[1] 208 | x = x.dimshuffle(0,'x',1).repeat(n_sam, axis=1).reshape((-1, n_out)) 209 | 210 | # load networks 211 | l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \ 212 | l_qa_mu, l_qa_logsigma, l_qz_mu, l_qz_logsigma, l_qa, l_qz, _, _, _ = self.network 213 | l_qa_in, l_qz_in, l_px_in, l_cv_in = self.input_layers 214 | 215 | # load network output 216 | qa_mu, qa_logsigma, a = lasagne.layers.get_output( 217 | [l_qa_mu, l_qa_logsigma, l_qa], 218 | deterministic=deterministic, 219 | ) 220 | qz_mu, z = lasagne.layers.get_output( 221 | [l_qz_mu, l_qz], 222 | {l_qz_in: a, l_qa_in: X}, 223 | deterministic=deterministic, 224 | ) 225 | pa_mu, pa_logsigma = lasagne.layers.get_output( 226 | [l_pa_mu, l_pa_logsigma], {l_px_in: z}, 227 | deterministic=deterministic, 228 | ) 229 | 230 | if self.model == 'bernoulli': 231 | px_mu = lasagne.layers.get_output( 232 | l_px_mu, {l_px_in: z}, deterministic=deterministic) 233 | elif self.model == 'gaussian': 234 | px_mu, px_logsigma = lasagne.layers.get_output( 235 | [l_px_mu, l_px_logsigma], {l_px_in: z}, 236 | deterministic=deterministic, 237 | ) 238 | 239 | # entropy term 240 | log_qa_given_x = log_normal2(a, qa_mu, qa_logsigma).sum(axis=1) 241 | log_qz_given_x = log_bernoulli(z, qz_mu).sum(axis=1) 242 | log_qz_given_x_dgz = log_bernoulli(dg(z), qz_mu).sum(axis=1) 243 | log_qza_given_x = log_qz_given_x + log_qa_given_x 244 | 245 | # log-probability term 246 | z_prior = T.ones_like(z)*np.float32(0.5) 247 | log_pz = log_bernoulli(z, z_prior).sum(axis=1) 248 | log_e = -self.rbm.free_energy(z.reshape((128*n_sam,self.n_lat))) 249 | log_px_given_z = log_bernoulli(x, px_mu).sum(axis=1) 250 | log_pa_given_z = log_normal2(a, pa_mu, pa_logsigma).sum(axis=1) 251 | 252 | log_pxz = log_pa_given_z + log_px_given_z + log_e 253 | 254 | # save them for later 255 | if deterministic == False: 256 | self.log_pxz = log_pxz 257 | self.log_px_given_z = log_px_given_z 258 | self.log_pz = log_pz 259 | self.log_qza_given_x = log_qza_given_x 260 | self.log_qa_given_x = log_qa_given_x 261 | self.log_qz_given_x = log_qz_given_x 262 | self.log_qz_given_x_dgz = log_qz_given_x_dgz 263 | 
self.log_e = log_e.mean() 264 | self.z = z 265 | 266 | # return log_paxz, log_qza_given_x 267 | return log_pxz, log_qza_given_x 268 | 269 | def create_objectives(self, deterministic=False): 270 | # load probabilities 271 | log_paxz, log_qza_given_x = self._create_components(deterministic=deterministic) 272 | 273 | # compute the evidence lower bound 274 | elbo = T.mean(log_paxz - log_qza_given_x) 275 | log_qa_given_x = self.log_qa_given_x 276 | 277 | # we don't use a spearate accuracy metric right now 278 | return -elbo, T.mean(log_qa_given_x) 279 | 280 | def create_gradients(self, loss, deterministic=False): 281 | # load networks 282 | l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \ 283 | l_qa_mu, l_qa_logsigma, l_qz_mu, l_qz_logsigma, l_qa, l_qz, l_cv, c, v = self.network 284 | 285 | # load params 286 | p_params = lasagne.layers.get_all_params( 287 | [l_px_mu, l_pa_mu, l_pa_logsigma], trainable=True) 288 | qa_params = lasagne.layers.get_all_params(l_qa_mu, trainable=True) 289 | qz_params = lasagne.layers.get_all_params(l_qz, trainable=True) 290 | cv_params = lasagne.layers.get_all_params(l_cv, trainable=True) 291 | 292 | # load neural net outputs (probabilities have been precomputed) 293 | log_pxz, log_px_given_z, log_pz = self.log_pxz, self.log_px_given_z, self.log_pz 294 | log_qza_given_x = self.log_qza_given_x 295 | log_qz_given_x = self.log_qz_given_x 296 | log_qz_given_x_dgz = self.log_qz_given_x_dgz 297 | cv = T.addbroadcast(lasagne.layers.get_output(l_cv),1) 298 | 299 | # compute learning signals 300 | l0 = log_px_given_z + log_pz - log_qz_given_x - cv # NOTE: this disn't have q(a) 301 | l_avg, l_var = l0.mean(), l0.var() 302 | c_new = 0.8*c + 0.2*l_avg 303 | v_new = 0.8*v + 0.2*l_var 304 | l = (l0 - c_new) / T.maximum(1, T.sqrt(v_new)) 305 | l_target = (l0 - c_new) / T.maximum(1, T.sqrt(v_new)) 306 | 307 | # compute grad wrt p 308 | p_grads = T.grad(-log_pxz.mean(), p_params) 309 | 310 | # compute grad wrt q_a 311 | elbo = T.mean(log_pxz - log_qza_given_x) 312 | qa_grads = T.grad(-elbo, qa_params) 313 | 314 | # compute grad wrt q_z 315 | qz_target = T.mean(dg(l_target) * log_qz_given_x_dgz) 316 | qz_grads = T.grad(-0.2*qz_target, qz_params) # 5x slower rate for q 317 | 318 | # compute grad of cv net 319 | cv_target = T.mean(l0**2) 320 | cv_grads = [0.2*g for g in T.grad(cv_target, cv_params)] 321 | 322 | # combine and clip gradients 323 | clip_grad = 1 324 | max_norm = 5 325 | grads = p_grads + qa_grads + qz_grads + cv_grads 326 | mgrads = lasagne.updates.total_norm_constraint(grads, max_norm=max_norm) 327 | cgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads] 328 | 329 | return cgrads 330 | 331 | def gen_samples(self, deterministic=False): 332 | s = self.inputs[-1] 333 | # put it through the decoder 334 | _, _, l_px_in, _ = self.input_layers 335 | l_px_mu = self.network[0] 336 | px_mu = lasagne.layers.get_output(l_px_mu, {l_px_in : s}) 337 | 338 | return px_mu 339 | 340 | def get_params(self): 341 | # load networks 342 | l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \ 343 | l_qa_mu, l_qa_logsigma, l_qz_mu, l_qz_logsigma, \ 344 | l_qa, l_qz, l_cv, c, v = self.network 345 | 346 | if self.model == 'gaussian': 347 | raise NotImplementedError('The code below needs to implement Gaussians') 348 | 349 | # load params 350 | p_params = lasagne.layers.get_all_params( 351 | [l_px_mu, l_pa_mu, l_pa_logsigma], trainable=True) 352 | qa_params = lasagne.layers.get_all_params(l_qa_mu, trainable=True) 353 | qz_params = lasagne.layers.get_all_params(l_qz, trainable=True) 354 | cv_params 
= lasagne.layers.get_all_params(l_cv, trainable=True) 355 | 356 | return p_params + qa_params + qz_params + cv_params 357 | 358 | def create_updates(self, grads, params, alpha, opt_alg, opt_params): 359 | # call super-class to generate SGD/ADAM updates 360 | grad_updates = Model.create_updates(self, grads, params, alpha, opt_alg, opt_params) 361 | 362 | # create updates for centering signal 363 | # load neural net outputs (probabilities have been precomputed) 364 | l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \ 365 | l_qa_mu, l_qa_logsigma, l_qz_mu, l_qz_logsigma, l_qa, l_qz, l_cv, c, v = self.network 366 | 367 | # load neural net outputs (probabilities have been precomputed) 368 | log_pxz, log_px_given_z, log_pz = self.log_pxz, self.log_px_given_z, self.log_pz 369 | log_qz_given_x = self.log_qz_given_x 370 | cv = T.addbroadcast(lasagne.layers.get_output(l_cv),1) 371 | 372 | # compute learning signals 373 | l = log_px_given_z + log_pz - log_qz_given_x - cv 374 | l_avg, l_var = l.mean(), l.var() 375 | c_new = 0.8*c + 0.2*l_avg 376 | v_new = 0.8*v + 0.2*l_var 377 | 378 | # compute update for centering signal 379 | cv_updates = {c: c_new, v: v_new} 380 | 381 | return OrderedDict(grad_updates.items() + cv_updates.items()) 382 | 383 | def hallucinate(self): 384 | """Generate new samples by passing noise into the decoder""" 385 | # load network params 386 | size, n_lat, n_dim = 100, self.n_lat, self.n_dim 387 | img_size = int(np.sqrt(size)) 388 | 389 | # # generate noisy inputs 390 | # noise = self.gen_noise(size, n_lat).astype('float32') 391 | 392 | # sample noise from RBM 393 | noise = self.rbm.sample() 394 | 395 | p_mu = self.dream(noise) 396 | if p_mu is None: 397 | return None 398 | p_mu = p_mu.reshape((img_size, img_size, n_dim, n_dim)) 399 | # split into img_size (1,img_size,n_dim,n_dim) images, 400 | # concat along columns -> 1,img_size,n_dim,n_dim*img_size 401 | p_mu = np.concatenate(np.split(p_mu, img_size, axis=0), axis=3) 402 | # split into img_size (1,1,n_dim,n_dim*img_size) images, 403 | # concat along rows -> 1,1,n_dim*img_size,n_dim*img_size 404 | p_mu = np.concatenate(np.split(p_mu, img_size, axis=1), axis=2) 405 | return np.squeeze(p_mu) 406 | 407 | def create_llik(self): 408 | # load network input 409 | X = self.inputs[0] 410 | x = X.flatten(2) 411 | 412 | # duplicate entries to take into account multiple mc samples 413 | n_sam = self.n_sample 414 | n_out = x.shape[1] 415 | x = x.dimshuffle(0,'x',1).repeat(n_sam, axis=1).reshape((-1, n_out)) 416 | 417 | # load networks 418 | l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \ 419 | l_qa_mu, l_qa_logsigma, l_qz_mu, l_qz_logsigma, l_qa, l_qz, _, _, _ = self.network 420 | l_qa_in, l_qz_in, l_px_in, l_cv_in = self.input_layers 421 | 422 | # load network output 423 | qa_mu, qa_logsigma, a = lasagne.layers.get_output( 424 | [l_qa_mu, l_qa_logsigma, l_qa], 425 | ) 426 | qz_mu, z = lasagne.layers.get_output( 427 | [l_qz_mu, l_qz], 428 | {l_qz_in: a, l_qa_in: X}, 429 | ) 430 | pa_mu, pa_logsigma = lasagne.layers.get_output( 431 | [l_pa_mu, l_pa_logsigma], {l_px_in: z}, 432 | ) 433 | 434 | px_mu = lasagne.layers.get_output(l_px_mu, {l_px_in: z}) 435 | 436 | # entropy term 437 | log_qa_given_x = log_normal2(a, qa_mu, qa_logsigma).sum(axis=1) 438 | log_qz_given_x = log_bernoulli(z, qz_mu).sum(axis=1) 439 | log_qz_given_x_dgz = log_bernoulli(dg(z), qz_mu).sum(axis=1) 440 | log_qza_given_x = log_qz_given_x + log_qa_given_x 441 | 442 | # log-probability term 443 | # z_prior = T.ones_like(z)*np.float32(0.5) 444 | # log_pz = log_bernoulli(z, 
z_prior).sum(axis=1) 445 | log_e = -self.rbm.free_energy(z.reshape((128*n_sam,self.n_lat))) 446 | log_px_given_z = log_bernoulli(x, px_mu).sum(axis=1) 447 | log_pa_given_z = log_normal2(a, pa_mu, pa_logsigma).sum(axis=1) 448 | 449 | t = log_pa_given_z + log_px_given_z + log_e - log_qz_given_x - log_qa_given_x 450 | t = t.reshape([128, n_sam]) 451 | 452 | # compute loss 453 | llik = Tlogsumexp( t, axis=1) # (n_bat,) 454 | 455 | return T.mean(llik) 456 | 457 | def fit( 458 | self, X_train, Y_train, X_val, Y_val, 459 | n_epoch=10, n_batch=100, logname='run', 460 | ): 461 | """Train the model""" 462 | alpha = 1.0 # learning rate, which can be adjusted later 463 | n_data = len(X_train) 464 | n_superbatch = self.n_superbatch 465 | 466 | # print '...', self.rbm.logZ_exact() 467 | # print '...', self.rbm.logZ_exact(marg='h') 468 | 469 | for epoch in range(n_epoch): 470 | # In each epoch, we do a full pass over the training data: 471 | train_batches, train_err, train_acc = 0, 0, 0 472 | start_time = time.time() 473 | 474 | # iterate over superbatches to save time on GPU memory transfer 475 | for X_sb, Y_sb in self.iterate_superbatches( 476 | X_train, Y_train, n_superbatch, 477 | datatype='train', shuffle=True, 478 | ): 479 | for idx1, idx2 in iterate_minibatch_idx(len(X_sb), n_batch): 480 | # train model 481 | self.rbm.reset_averages() 482 | err, acc, z = self.train(idx1, idx2, alpha) 483 | 484 | # train rbm 485 | z_rbm = z.reshape((128*self.n_sample,1,8,8)) 486 | # self.rbm.train_batch(z.reshape((128,1,8,8)), np.zeros((n_batch,), dtype='int32'), alpha) 487 | 488 | # q steps 489 | for i in range(2): self.rbm.train_q0(z_rbm) 490 | 491 | # p steps 492 | self.rbm.train_p_batch(z_rbm, alpha) 493 | 494 | # collect metrics 495 | # print err, acc 496 | train_batches += 1 497 | train_err += err 498 | train_acc += acc 499 | if train_batches % 100 == 0: 500 | n_total = epoch * n_data + n_batch * train_batches 501 | metrics = [ 502 | n_total, 503 | train_err / train_batches, 504 | train_acc / train_batches, 505 | ] 506 | log_metrics(logname, metrics) 507 | 508 | print "Epoch {} of {} took {:.3f}s ({} minibatches)".format( 509 | epoch + 1, n_epoch, time.time() - start_time, train_batches) 510 | 511 | # make a full pass over the training data and record metrics: 512 | train_err, train_acc = evaluate( 513 | self.loss, X_train, Y_train, batchsize=128, 514 | ) 515 | val_err, val_acc = evaluate( 516 | self.loss, X_val, Y_val, batchsize=128, 517 | ) 518 | 519 | print " training:\t\t{:.6f}\t{:.6f}".format( 520 | train_err, train_acc 521 | ) 522 | print " validation:\t\t{:.6f}\t{:.6f}".format( 523 | val_err, val_acc 524 | ) 525 | 526 | metrics = [epoch, train_err, train_acc, val_err, val_acc] 527 | log_metrics(logname + '.val', metrics) 528 | 529 | # reserve N of training data points to kick start hallucinations 530 | self.rbm.hallu_set = np.asarray( 531 | z[:100,:].reshape((100,1,8,8)), 532 | dtype=theano.config.floatX 533 | ) 534 | -------------------------------------------------------------------------------- /models/aux_variational_rbm.py: -------------------------------------------------------------------------------- 1 | import pdb 2 | import time 3 | import pickle 4 | import numpy as np 5 | import theano 6 | import lasagne 7 | import theano.tensor as T 8 | from theano.tensor.shared_randomstreams import RandomStreams 9 | from collections import OrderedDict 10 | from lasagne.layers import batch_norm 11 | 12 | from model import Model 13 | from abstractrbm import AbstractRBM 14 | from helpers import 
iterate_minibatch_idx, evaluate, log_metrics 15 | from helpers import Tlogsumexp, Tlogaddexp 16 | from layers import BernoulliSampleLayer, Deconv2DLayer 17 | from distributions import log_bernoulli, log_normal, log_normal2 18 | 19 | from theano.compile.nanguardmode import NanGuardMode 20 | 21 | create_zmat = lambda x, y: np.zeros((x, y)).astype(theano.config.floatX) 22 | create_vec = lambda y: np.zeros(y,).astype(theano.config.floatX) 23 | create_sca = lambda x: (np.cast[theano.config.floatX](x)).astype(theano.config.floatX) 24 | 25 | class AuxiliaryVariationalRBM(AbstractRBM): 26 | """Restricted Boltzmann Machine with VI""" 27 | def __init__( 28 | self, n_dim, n_out, n_chan=1, n_superbatch=12800, opt_alg='adam', 29 | opt_params={'lr': 1e-3, 'b1': 0.9, 'b2': 0.99} 30 | ): 31 | """RBM constructor. 32 | Defines the parameters of the model along with 33 | basic operations for inferring hidden from visible (and vice-versa), 34 | as well as for performing CD updates. 35 | """ 36 | self.numpy_rng = np.random.RandomState(1234) 37 | self.theano_rng = RandomStreams(self.numpy_rng.randint(2 ** 30)) 38 | self.create_mat = lambda x, y: self.numpy_rng.normal(0, 0.01, (x, y)).astype(theano.config.floatX) 39 | 40 | # save config 41 | n_batch = opt_params.get('nb') 42 | self.n_hidden = 8 43 | self.n_visible = n_chan*n_dim*n_dim # size of visible layer 44 | self.n_batch = n_batch 45 | self.n_mc = 300 # num of monte carlo samples from each MoB component 46 | 47 | self.n_dim = n_dim 48 | self.n_out = n_out 49 | self.n_superbatch = n_superbatch 50 | self.alg = opt_alg 51 | 52 | # set up general RBM methods 53 | AbstractRBM.__init__(self, n_dim, n_chan, n_out, n_superbatch, opt_alg, opt_params) 54 | 55 | # create updates 56 | alpha = T.scalar(dtype=theano.config.floatX) # learning rate 57 | 58 | # save config 59 | self.n_class = 2 60 | self.n_dim = n_dim 61 | self.n_out = n_out 62 | 63 | self.marginalize = 'h' 64 | self.n_samples = self.n_mc 65 | self.n_latent = self.n_visible if self.marginalize == 'h' else self.n_hidden 66 | 67 | # create input variables 68 | D, idx1, idx2 = self.create_inputs() 69 | 70 | # create model 71 | self.network = self.create_model() 72 | 73 | # create objectives 74 | loglik, plik = self.create_objectives(D) 75 | 76 | # create gradients 77 | dL_Theta, dE_Theta, dlogZ_Theta, dL_qx, dL_pa = self.create_gradients() 78 | grads = dL_Theta, dE_Theta, dlogZ_Theta, dL_qx, dL_pa 79 | 80 | # create updates 81 | uL_Theta, uL_Phi, avg_updates, avg_Theta_updates \ 82 | = self.create_updates(grads, None, alpha, opt_alg, opt_params) 83 | 84 | # logF_avg, Z_avg = self.create_llik_estimate(D) 85 | 86 | mode = NanGuardMode(nan_is_error=True, inf_is_error=False, big_is_error=False) 87 | mode = None 88 | 89 | common_update1 = OrderedDict(avg_updates.items() + uL_Phi.items()) 90 | self.train_q = theano.function([idx1, idx2], [loglik, self.logbnd], 91 | updates=common_update1, mode=mode, 92 | givens={D: self.train_set_x[idx1:idx2]}) 93 | self.train_q0 = theano.function([D], self.logbnd, updates=common_update1, mode=mode) 94 | 95 | common_update2 = OrderedDict(avg_Theta_updates.items() + uL_Theta.items()) 96 | self.train_p = theano.function([idx1, idx2, alpha], [loglik, self.logbnd], 97 | updates=common_update2, mode=mode, on_unused_input='warn', 98 | givens={D: self.train_set_x[idx1:idx2]}) 99 | self.train_p_batch = theano.function([D, alpha], [loglik, self.logbnd], 100 | updates=common_update2, mode=mode, on_unused_input='warn') 101 | # self.llik = theano.function([D], logF_avg - T.log(Z_avg), 
mode=mode) 102 | 103 | def create_inputs(self): 104 | # allocate symbolic variables for the data 105 | idx1, idx2 = T.lscalar(), T.lscalar() 106 | D = T.tensor4('D', dtype=theano.config.floatX) # data matrixX) # learning rate 107 | # self.Z = T.tensor2(dtype=theano.config.floatX) 108 | return D, idx1, idx2 109 | 110 | def _free_energy(self, X, marginalize='h', avg=False): 111 | bv_vec = self.vbias.reshape((self.n_visible,1)) 112 | bh_vec = self.hbias.reshape((self.n_hidden,1)) 113 | 114 | W = self.W_avg if avg else self.W 115 | W = W.T 116 | 117 | if marginalize == 'h': 118 | logF = T.dot(bv_vec.T, X) \ 119 | + T.sum(T.log(1. + T.exp(bh_vec + T.dot(W, X))), axis=0) 120 | elif marginalize == 'v': 121 | logF = T.dot(bh_vec.T, X) \ 122 | + T.sum(T.log(1. + T.exp(bv_vec + T.dot(W.T, X))), axis=0) 123 | else: 124 | raise ValueError('invalid argument') 125 | 126 | return logF 127 | 128 | @property 129 | def Z(self): 130 | return np.exp(self.logZ_avg.get_value()) 131 | 132 | def create_model(self): 133 | # create the RBM 134 | self.W = theano.shared(self.create_mat(self.n_visible, self.n_hidden)) 135 | self.vbias = theano.shared(create_vec(self.n_visible)) 136 | self.hbias = theano.shared(create_vec(self.n_hidden)) 137 | self.params_p = [self.W, self.vbias, self.hbias] 138 | 139 | # running averages 140 | self.inner_it = theano.shared(create_sca(0.0)) 141 | self.loga = theano.shared(create_sca(0.0)) 142 | self.logZ_avg = theano.shared(create_sca(0.0)) 143 | self.grad_logZ_avg = [ 144 | theano.shared(create_zmat(self.n_visible, self.n_hidden)), 145 | theano.shared(create_vec(self.n_visible)), 146 | theano.shared(create_vec(self.n_hidden)) 147 | ] 148 | self.grad_p_avg = [ 149 | theano.shared(create_zmat(self.n_visible, self.n_hidden)), 150 | theano.shared(create_vec(self.n_visible)), 151 | theano.shared(create_vec(self.n_hidden)) 152 | ] 153 | self.W_avg = theano.shared(self.W.get_value().copy()) 154 | self.vbias_avg = theano.shared(self.vbias.get_value().copy()) 155 | self.hbias_avg = theano.shared(self.hbias.get_value().copy()) 156 | 157 | # for pseudo-likelihood 158 | self.bit_i_idx = theano.shared(value=0, name='bit_i_idx') 159 | 160 | # create approximating distribution 161 | n_aux = 10 # dimensionality of latent auxiliary space 162 | self.A = self.theano_rng.normal(size=(self.n_mc, n_aux)) 163 | 164 | # create q(x|a) 165 | l_qx_in = lasagne.layers.InputLayer((None, n_aux)) 166 | 167 | # digits dataset 168 | if self.n_visible == 64: 169 | # l_qx_hid = lasagne.layers.DenseLayer(l_qx_in, num_units=64, 170 | # nonlinearity=lasagne.nonlinearities.rectify) 171 | l_qx_hid = lasagne.layers.DenseLayer(l_qx_in, num_units=32, 172 | nonlinearity=lasagne.nonlinearities.rectify) 173 | l_qx = lasagne.layers.DenseLayer(l_qx_hid, num_units=self.n_latent, 174 | nonlinearity=lasagne.nonlinearities.sigmoid) 175 | l_qx_samp = BernoulliSampleLayer(l_qx) 176 | 177 | # l_g_hid1 = lasagne.layers.DenseLayer(l_qx_in, 8*4*4) 178 | # l_g_hid2 = lasagne.layers.ReshapeLayer(l_g_hid1, ([0], 8, 4, 4)) 179 | # # l_g_dc1 = Deconv2DLayer(l_g_hid2, 8, 3, stride=2, pad=1) 180 | # l_g = Deconv2DLayer(l_g_hid2, 1, 3, stride=2, pad=1, 181 | # nonlinearity=lasagne.nonlinearities.sigmoid) 182 | # l_qx = lasagne.layers.flatten(l_g) 183 | # l_qx_samp = BernoulliSampleLayer(l_qx) 184 | 185 | # mnist dataset 186 | elif self.n_visible == 784: 187 | # dense 188 | l_qx_hid = lasagne.layers.DenseLayer(l_qx_in, num_units=2048, 189 | nonlinearity=lasagne.nonlinearities.rectify) 190 | # l_qx_hid = batch_norm(lasagne.layers.DenseLayer(l_qx_in, 
num_units=128, 191 | # nonlinearity=lasagne.nonlinearities.rectify)) 192 | # l_qx_hid = batch_norm(lasagne.layers.DenseLayer(l_qx_hid, num_units=256, 193 | # nonlinearity=lasagne.nonlinearities.rectify)) 194 | l_qx = lasagne.layers.DenseLayer(l_qx_hid, num_units=self.n_latent, 195 | nonlinearity=lasagne.nonlinearities.sigmoid) 196 | l_qx_samp = BernoulliSampleLayer(l_qx) 197 | 198 | # # deconv 199 | # l_g_hid1 = batch_norm(lasagne.layers.DenseLayer(l_qx_in, 64*7*7)) 200 | # l_g_hid2 = lasagne.layers.ReshapeLayer(l_g_hid1, ([0], 64, 7, 7)) 201 | # # l_g_dc = Deconv2DLayer(l_g_hid2, 1, 5, stride=4, pad=2, 202 | # # nonlinearity=lasagne.nonlinearities.sigmoid) 203 | 204 | # # l_g_dc1 = Deconv2DLayer(l_g_hid2, 8, 3, stride=2, pad=1) 205 | # l_g_dc1 = batch_norm(Deconv2DLayer(l_g_hid2, 32, 5, stride=2, pad=2, 206 | # nonlinearity=lasagne.nonlinearities.rectify)) 207 | # l_g_dc = Deconv2DLayer(l_g_dc1, 1, 5, stride=2, pad=2, 208 | # nonlinearity=lasagne.nonlinearities.sigmoid) 209 | 210 | # l_qx = lasagne.layers.flatten(l_g_dc) 211 | # l_qx_samp = BernoulliSampleLayer(l_qx) 212 | 213 | # create p(a|x) 214 | relu_shift = lambda av: T.nnet.relu(av+10)-10 # for numerical stability 215 | l_pa_in = lasagne.layers.InputLayer((None, self.n_latent)) 216 | l_pa_hid = lasagne.layers.DenseLayer(l_pa_in, num_units=32, 217 | nonlinearity=lasagne.nonlinearities.rectify) 218 | l_pa_mu = lasagne.layers.DenseLayer(l_pa_hid, num_units=n_aux, 219 | nonlinearity=None) 220 | l_pa_logsigma = lasagne.layers.DenseLayer(l_pa_hid, num_units=n_aux, 221 | nonlinearity=relu_shift) 222 | 223 | return (l_qx, l_qx_samp, l_pa_mu, l_pa_logsigma) 224 | 225 | def _create_components(self, D): 226 | # collect samples 227 | (l_qx, l_qx_samp, l_pa_mu, l_pa_logsigma) = self.network 228 | a = self.A 229 | qx, x = lasagne.layers.get_output([l_qx, l_qx_samp], a) 230 | pa_mu, pa_logsigma = lasagne.layers.get_output([l_pa_mu, l_pa_logsigma], x) 231 | 232 | # compute logQ 233 | logQa = T.sum(log_normal(a, 0., 1.), axis=1) 234 | logQx_given_a = T.sum(log_bernoulli(x, qx), axis=1) 235 | logQ = logQa + logQx_given_a 236 | 237 | # compute energies of the samples, dim=(1, n_tot_samples) 238 | logFx = self._free_energy(x.T, marginalize=self.marginalize) 239 | logpa = T.sum(log_normal2(a, pa_mu, pa_logsigma), axis=1) 240 | # logF = logFx + logpa 241 | 242 | # free energy of the data 243 | D = D.reshape((-1, self.n_visible)).T 244 | logF_D = self._free_energy(D) 245 | 246 | self._components = (logFx, logpa, logQ, logF_D) 247 | 248 | def create_objectives(self, D, deterministic=False): 249 | self._create_components(D) 250 | (logFx, logpa, logQ, logF_D_vec) = self._components 251 | t = self.n_mc 252 | logFxa = logFx + logpa 253 | 254 | # set up bound 255 | logFQ = logFx - logQ # (t,) 256 | logFQ2 = 2.*logFxa - 2.*logQ # (t,) 257 | logZ = Tlogsumexp(logFQ.flatten(), axis=0) - T.log(t) 258 | logbnd = Tlogsumexp(logFQ2.flatten(), axis=0) - T.log(t) 259 | 260 | logF_D = logF_D_vec.mean() 261 | 262 | loglik = logF_D - 0.5*logZ 263 | loglikbnd = logF_D - 0.5*logbnd 264 | 265 | self.logbnd = logbnd 266 | self.logZ = logZ 267 | 268 | # set up pseudolikelihood 269 | plik = self.create_pseudoliklihood(D) 270 | 271 | return loglik, plik 272 | 273 | # def get_params(self): 274 | # return [self.W, self.vbias, self.hbias, self.Phi] 275 | 276 | def get_params_p(self): 277 | return [self.W, self.vbias, self.hbias] 278 | 279 | def get_params_qx(self): 280 | (l_qx, l_qx_samp, l_pa_mu, l_pa_logsigma) = self.network 281 | return lasagne.layers.get_all_params(l_qx, 
trainable=True) 282 | 283 | def get_params_pa(self): 284 | (l_qx, l_qx_samp, l_pa_mu, l_pa_logsigma) = self.network 285 | return lasagne.layers.get_all_params(l_pa_mu, trainable=True) 286 | 287 | def create_gradients(self): 288 | (logFx, logpa, logQ, logF_D_vec) = self._components 289 | logFxa = logFx + logpa 290 | logbnd, logZ = self.logbnd, self.logZ 291 | logF_D = logF_D_vec.mean() 292 | 293 | logQ2 = 2.*logQ 294 | logFQ = logFx - logQ # (t,) 295 | S = T.exp(logFQ - logZ) # (t,) 296 | S2 = T.exp(2*logFxa - logbnd - logQ2) # (t,) 297 | target = T.mean(S * logFx) 298 | 299 | # get grads wrt params_p 300 | dlogB_W, dlogB_bv, dlogB_bh = theano.grad(logbnd, [self.W, self.vbias, self.hbias]) 301 | dlogZ_W, dlogZ_bv, dlogZ_bh = theano.grad(target, [self.W, self.vbias, self.hbias], consider_constant=[S]) 302 | dE_W, dE_bv, dE_bh = theano.grad(logF_D, [self.W, self.vbias, self.hbias]) 303 | 304 | # get grads wrt params_qx 305 | from theano.gradient import disconnected_grad as dg 306 | dlogB_target = T.mean(-dg(S2) * logQ) 307 | dlogB_qx = theano.grad(dlogB_target, self.get_params_qx()) 308 | 309 | # get grads wrt params_pa 310 | dlogB_pa = theano.grad(T.mean(S2), self.get_params_pa(), consider_constant=[logbnd]) 311 | 312 | # log-likelihood / bound gradients (combine the above) 313 | 314 | dL_qx = [-0.5*g for g in dlogB_qx] 315 | dL_pa = [-0.5*g for g in dlogB_pa] 316 | dL_W = dE_W - dlogZ_W 317 | dL_bv = dE_bv - dlogZ_bv 318 | dL_bh = dE_bh - dlogZ_bh 319 | # dL_W = dE_W - 0.5 * dlogB_W 320 | # dL_bv = dE_bv - 0.5 * dlogB_bv 321 | # dL_bh = dE_bh - 0.5 * dlogB_bh 322 | 323 | dL_Theta = [dL_W, dL_bv, dL_bh] 324 | dlogZ_Theta = [dlogZ_W, dlogZ_bv, dlogZ_bh] 325 | dE_Theta = [dE_W, dE_bv, dE_bh] 326 | 327 | return dL_Theta, dlogZ_Theta, dE_Theta, dL_qx, dL_pa 328 | 329 | def create_pseudoliklihood(self, X): 330 | """Stochastic approximation to the pseudo-likelihood""" 331 | # X = self.inputs[0] 332 | X = X.reshape((-1, self.n_visible)) 333 | 334 | # index of bit i in expression p(x_i | x_{\i}) 335 | bit_i_idx = self.bit_i_idx 336 | # bit_i_idx = theano.shared(value=0, name='bit_i_idx') 337 | 338 | # binarize the input image by rounding to nearest integer 339 | xi = T.round(X) 340 | 341 | # calculate free energy for the given bit configuration 342 | fe_xi = self.free_energy(xi) 343 | 344 | # flip bit x_i of matrix xi and preserve all other bits x_{\i} 345 | # Equivalent to xi[:,bit_i_idx] = 1-xi[:, bit_i_idx], but assigns 346 | # the result to xi_flip, instead of working in place on xi. 347 | xi_flip = T.set_subtensor(xi[:, bit_i_idx], 1 - xi[:, bit_i_idx]) 348 | 349 | # calculate free energy with bit flipped 350 | fe_xi_flip = self.free_energy(xi_flip) 351 | 352 | # equivalent to e^(-FE(x_i)) / (e^(-FE(x_i)) + e^(-FE(x_{\i}))) 353 | cost = T.mean(self.n_visible * T.log(T.nnet.sigmoid(fe_xi_flip-fe_xi))) 354 | 355 | return cost 356 | 357 | def create_updates(self, grads, params, alpha, opt_alg, opt_params): 358 | # def create_updates(self, alpha, opt_alg, opt_params): 359 | scaled_p_grads = [grad * alpha for grad in self.grad_p_avg] 360 | dL_Theta, dE_Theta, dlogZ_Theta, dL_qx, dL_pa = grads 361 | 362 | # create update for main parameters 363 | # TODO: Scale grads with alpha! 
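        # (added annotation) Two separate update rules are assembled below:
        #  * uL_Theta: Adam applied to the running-average model gradients
        #    self.grad_p_avg (maintained by create_avg_updates), so the RBM
        #    parameters (W, vbias, hbias) follow an averaged estimate of
        #    d loglik / d Theta.  Note that alpha is not actually applied
        #    here: scaled_p_grads is computed above but never passed to
        #    Adam, hence the TODO.
        #  * uL_qx / uL_pa: plain SGD steps that descend logbnd (dL_qx and
        #    dL_pa hold -0.5 * d logbnd and are negated again before being
        #    handed to lasagne.updates.sgd).  Since 0.5 * logbnd estimates an
        #    upper bound on log Z, descending it tightens the lower bound
        #    loglikbnd = logF_D - 0.5 * logbnd.  Only uL_qx is returned as
        #    uL_Phi; the p(a|x) updates are currently left out.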
364 | # lr = opt_params.get('lr', 1e-3) 365 | # b1, b2 = opt_params.get('b1', 0.7), opt_params.get('b2', 1e-3) 366 | lr, b1, b2 = 1e-3, 0.7, 1e-3 367 | uL_Theta = lasagne.updates.adam(self.grad_p_avg, self.params_p, learning_rate=lr, beta1=b1, beta2=b2) 368 | # uL_Theta = lasagne.updates.adam(self.grad_p_avg, self.params_p, learning_rate=1e-3, beta1=7e-1, beta2=1e-3) 369 | # uL_Theta = lasagne.updates.adam(scaled_p_grads, self.params_p, learning_rate=1e-3, beta1=0.9, beta2=0.999) 370 | # learning_rate=lr, beta1=b1, beta2=b2) 371 | uL_qx = lasagne.updates.sgd([-1*g for g in dL_qx], self.get_params_qx(), learning_rate=5e-2) # TODO: anneal lr 372 | # uL_qx = lasagne.updates.adam([-1*g for g in dL_qx], self.get_params_qx(), learning_rate=5e-2, beta1=0.0, beta2=0.999) # TODO: anneal lr 373 | uL_pa = lasagne.updates.sgd([-1*g for g in dL_pa], self.get_params_pa(), learning_rate=5e-2) # TODO: anneal lr 374 | # uL_Phi = OrderedDict(uL_qx.items() + uL_pa.items()) 375 | uL_Phi = uL_qx 376 | 377 | # create updates for the running averages 378 | avg_updates = self.create_avg_updates(self.logZ, dlogZ_Theta, dL_Theta) 379 | 380 | # avg Theta updates 381 | avg_Theta_updates = OrderedDict() 382 | avg_Theta_updates[self.W_avg] = 0.95*self.W_avg + 0.05*self.W 383 | avg_Theta_updates[self.vbias_avg] = 0.95*self.vbias_avg + 0.05*self.vbias 384 | avg_Theta_updates[self.hbias_avg] = 0.95*self.hbias_avg + 0.05*self.hbias 385 | # avg_Theta_updates[self.W_avg] = self.W 386 | # avg_Theta_updates[self.vbias_avg] = self.vbias 387 | # avg_Theta_updates[self.hbias_avg] = self.hbias 388 | 389 | # increment bit_i_idx % number as part of updates 390 | uL_Phi[self.bit_i_idx] = (self.bit_i_idx + 1) % self.n_visible 391 | 392 | return uL_Theta, uL_Phi, avg_updates, avg_Theta_updates 393 | 394 | def create_avg_updates(self, logZ, dlogZ_Theta, dL_Theta): 395 | inner_it = self.inner_it 396 | avg_updates = OrderedDict() 397 | 398 | new_logZ_estimate = Tlogaddexp( 399 | self.logZ_avg + T.log((inner_it) / (inner_it+1)), 400 | logZ - T.log(inner_it+1.) 401 | ) 402 | # self.logZ_avg * ((inner_it) / (inner_it+1)) + logZ / (inner_it+1) 403 | 404 | avg_updates[inner_it] = inner_it + 1. 
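        # (added annotation) new_logZ_estimate above is the running
        # arithmetic mean of the per-step partition-function estimates
        # exp(logZ), kept in log space for numerical stability:
        #     log( (n/(n+1)) * Z_avg + (1/(n+1)) * Z_new )
        #   = logaddexp( logZ_avg + log(n/(n+1)), logZ - log(n+1) ),
        # with n = inner_it counting steps since the last reset_averages().
        # loga caches -2 * logZ_avg, i.e. log(1 / Z_avg**2).  The loop below
        # keeps the same running mean of the *negated* likelihood gradients
        # dL_Theta, so that Adam (a minimizer) applied to grad_p_avg in
        # create_updates performs gradient ascent on the log-likelihood.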
405 | avg_updates[self.logZ_avg] = new_logZ_estimate 406 | avg_updates[self.loga] = -2.*new_logZ_estimate # 1./(new_Z_estimate**2) 407 | 408 | for g, g_avg in zip(dL_Theta, self.grad_p_avg): 409 | avg_updates[g_avg] = g_avg * ((inner_it) / (inner_it+1)) - g / (inner_it+1) 410 | 411 | return avg_updates 412 | 413 | def reset_averages(self): 414 | self.inner_it.set_value(0.0) 415 | 416 | def free_energy(self, v_sample): 417 | """Function to compute the free energy""" 418 | wx_b = T.dot(v_sample, self.W) + self.hbias 419 | vbias_term = T.dot(v_sample, self.vbias) 420 | hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=1) 421 | return -hidden_term - vbias_term 422 | 423 | def pseudolikelihood(self, data): 424 | self.rbm.components_ = self.W.get_value().T 425 | self.rbm.intercept_visible_ = self.vbias.get_value() 426 | self.rbm.intercept_hidden_ = self.hbias.get_value() 427 | return self.rbm.score_samples(data).mean() 428 | 429 | def dump(self, fname): 430 | """Pickle weights to a file""" 431 | params = lasagne.layers.get_all_param_values(self.network) 432 | all_params = [params, self.W.get_value(), self.vbias.get_value(), self.hbias.get_value()] 433 | with open(fname, 'w') as f: 434 | pickle.dump(all_params, f) 435 | 436 | def load(self, fname): 437 | """Load pickled network""" 438 | with open(fname) as f: 439 | params = pickle.load(f) 440 | self.load_params(params) 441 | 442 | def load_params(self, params): 443 | """Load a given set of parameters""" 444 | net_params, W, vbias, hbias = params 445 | self.W.set_value(W) 446 | self.vbias.set_value(vbias) 447 | self.hbias.set_value(hbias) 448 | lasagne.layers.set_all_param_values(self.network, net_params) 449 | 450 | def fit( 451 | self, X_train, Y_train, X_val, Y_val, 452 | n_epoch=10, n_batch=100, logname='run' 453 | ): 454 | """Train the model""" 455 | alpha = 1.0 # learning rate, which can be adjusted later 456 | n_data = len(X_train) 457 | n_superbatch = self.n_superbatch 458 | 459 | for epoch in range(n_epoch): 460 | # In each epoch, we do a full pass over the training data: 461 | train_batches, train_err, train_acc = 0, 0, 0 462 | start_time = time.time() 463 | 464 | # if epoch > 30: alpha = 0.33 465 | # if epoch > 20: alpha = 0.1 466 | 467 | # iterate over superbatches to save time on GPU memory transfer 468 | for X_sb, Y_sb in self.iterate_superbatches( 469 | X_train, Y_train, n_superbatch, 470 | datatype='train', shuffle=True, 471 | ): 472 | for idx1, idx2 in iterate_minibatch_idx(len(X_sb), n_batch): 473 | # q steps 474 | for i in range(2): 475 | err, acc = self.train_q(idx1, idx2) 476 | # print epoch, i, err, acc 477 | # err, acc = self.train_q(idx1, idx2) 478 | 479 | # p steps 480 | err, acc = self.train_p(idx1, idx2, alpha) 481 | # print epoch, 'P', self.pseudolikelihood(X_sb[idx1:idx2].reshape(-1, self.n_visible)), acc 482 | # print 483 | self.reset_averages() 484 | 485 | # err, acc = self.train(idx1, idx2, alpha) 486 | # collect metrics 487 | err = self.pseudolikelihood(X_sb[idx1:idx2].reshape(-1, self.n_visible)) 488 | train_batches += 1 489 | train_err += err 490 | train_acc += acc 491 | 492 | # print train_batches, err, acc 493 | 494 | if train_batches % 100 == 0: 495 | n_total = epoch * n_data + n_batch * train_batches 496 | metrics = [ 497 | n_total, train_err / train_batches, 498 | train_acc / train_batches, 499 | ] 500 | log_metrics(logname, metrics) 501 | 502 | print "Epoch {} of {} took {:.3f}s ({} minibatches)".format( 503 | epoch + 1, n_epoch, time.time() - start_time, train_batches) 504 | 505 | print " 
training:\t\t{:.6f}\t{:.6f}".format( 506 | train_err / train_batches, train_acc / train_batches) 507 | 508 | # reserve N of training data points to kick start hallucinations 509 | self.dump(logname + '.pkl') 510 | hallu_i = self.numpy_rng.randint(n_data - self.n_chain) 511 | self.hallu_set = np.asarray( 512 | X_train[hallu_i:hallu_i + self.n_chain], 513 | dtype=theano.config.floatX 514 | ) --------------------------------------------------------------------------------
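A note on the estimator at the heart of aux_variational_rbm.py: create_objectives forms log importance weights logFQ for Monte Carlo samples drawn through the auxiliary construction a ~ N(0, I), x ~ q(x|a), and estimates the log-partition function as logZ = Tlogsumexp(logFQ) - T.log(t), with logFx computed by _free_energy(..., marginalize='h'), i.e. the hidden units summed out. The standalone NumPy sketch below illustrates the same logsumexp estimator in its simplest form, under stated assumptions: toy random RBM parameters and a fixed factorized Bernoulli proposal q(x) with a tractable density in place of the learned q(x|a). The helper names log_unnorm_marginal and np_logsumexp are illustrative only, not code from the repository.

import numpy as np

rng = np.random.RandomState(0)
n_visible, n_hidden, n_mc = 64, 8, 300  # mirrors n_visible, n_hidden, n_mc above

# toy stand-ins for self.W, self.vbias, self.hbias
W = 0.01 * rng.randn(n_visible, n_hidden)
vbias = np.zeros(n_visible)
hbias = np.zeros(n_hidden)

def log_unnorm_marginal(x):
    """log p*(x) with the hidden units summed out, as in
    _free_energy(..., marginalize='h'):
        b_v^T x + sum_j log(1 + exp(b_h_j + (W^T x)_j))"""
    return np.dot(x, vbias) + np.sum(np.logaddexp(0.0, hbias + np.dot(x, W)), axis=1)

def np_logsumexp(v):
    """numerically stable log(sum(exp(v)))"""
    m = v.max()
    return m + np.log(np.sum(np.exp(v - m)))

# fixed factorized Bernoulli proposal q(x); the model instead amortizes
# q(x|a) through a small neural net fed with a ~ N(0, I)
q_probs = np.full(n_visible, 0.5)
x = (rng.rand(n_mc, n_visible) < q_probs).astype('float32')
log_q = np.sum(x * np.log(q_probs) + (1.0 - x) * np.log(1.0 - q_probs), axis=1)

# logZ ~= logsumexp_t( logF(x_t) - logQ(x_t) ) - log(t), cf. create_objectives
log_w = log_unnorm_marginal(x) - log_q
log_Z_hat = np_logsumexp(log_w) - np.log(n_mc)
print(log_Z_hat)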