├── .gitignore
├── README.rst
├── lab1
│   ├── confusionmatrix.py
│   ├── confusionmatrix.pyc
│   ├── lab1_FFN.ipynb
│   ├── mnist.npz
│   └── out.png
├── lab2
│   ├── lab2_AE.ipynb
│   └── out.png
├── lab3
│   ├── VAE.png
│   ├── lab3_VAE.ipynb
│   ├── out.png
│   └── samplelayer.py
└── lab4
    ├── lab4_VAESSL.ipynb
    ├── out.png
    └── samplelayer.py

/.gitignore:
--------------------------------------------------------------------------------
1 | */.ipynb_checkpoints/*
2 | *.pyc
3 | 
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | Variational Auto-encoders Summer School 2016
 2 | =============================================
 3 | This repository contains the exercises on Variational Auto-encoders for the summer school on `semi-supervised learning `_.
 4 | 
 5 | Installation
 6 | ------------
 7 | Please make sure you read this section thoroughly before getting started with the exercises.
 8 | 
 9 | All exercises are written in the Python programming language and packaged as Jupyter notebooks. To run the exercises, complete the following steps:
10 | 
11 | **1. Install Anaconda (Python distribution)**
12 | 
13 | - Install Anaconda for Python 2.7 from https://www.continuum.io/downloads.
14 | 
15 | - The installer will ask whether you want to make this Python your default and add it to your environment path. Tick both options.
16 | 
17 | - Restart your terminal/command prompt so the environment variables are updated.
18 | 
19 | **2. Install and upgrade Theano**
20 | 
21 | - In a terminal/command prompt run: pip install theano
22 | 
23 | - In a terminal/command prompt run: pip install --upgrade https://github.com/Theano/Theano/archive/master.zip --no-deps
24 | 
25 | - If you run into problems installing Theano, please refer to: http://deeplearning.net/software/theano/index.html.
26 | 
27 | **3. Install and upgrade Lasagne**
28 | 
29 | - In a terminal/command prompt run: pip install lasagne
30 | 
31 | - In a terminal/command prompt run: pip install --upgrade https://github.com/Lasagne/Lasagne/archive/master.zip --no-deps
32 | 
33 | Run the notebooks
34 | -----------------
35 | Download the repository from GitHub and, in a terminal/command prompt, run: jupyter notebook. This should start your default browser and you should be up and running. If you have issues while running the notebooks, we recommend using the newest version of Chrome.
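As an optional sanity check before the first lab (not part of the official requirements), the snippet below imports both libraries and compiles a trivial Theano function; if it runs without errors, your setup should be ready for the notebooks::

    import theano
    import theano.tensor as T
    import lasagne

    print(theano.__version__, lasagne.__version__)
    x = T.scalar('x')
    f = theano.function([x], 2 * x)   # forces Theano to compile, which exercises the full toolchain
    print(f(21.0))                    # should print 42.0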
36 | 
--------------------------------------------------------------------------------
/lab1/confusionmatrix.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | class ConfusionMatrix:
5 |     """
6 |     Simple confusion matrix class
7 |     row is the true class, column is the predicted class
8 |     """
9 |     def __init__(self, num_classes, class_names=None):
10 |         self.n_classes = num_classes
11 |         if class_names is None:
12 |             self.class_names = list(map(str, range(num_classes)))
13 |         else:
14 |             self.class_names = class_names
15 | 
16 |         # find max class_name and pad
17 |         max_len = max(map(len, self.class_names))
18 |         self.max_len = max_len
19 |         for idx, name in enumerate(self.class_names):
20 |             if len(name) < max_len:
21 |                 self.class_names[idx] = name + " "*(max_len-len(name))
22 | 
23 |         self.mat = np.zeros((num_classes,num_classes),dtype='int')
24 | 
25 |     def __str__(self):
26 |         # calculate row and column sums
27 |         col_sum = np.sum(self.mat, axis=1)
28 |         row_sum = np.sum(self.mat, axis=0)
29 | 
30 |         s = []
31 | 
32 |         mat_str = self.mat.__str__()
33 |         mat_str = mat_str.replace('[','').replace(']','').split('\n')
34 | 
35 |         for idx, row in enumerate(mat_str):
36 |             if idx == 0:
37 |                 pad = " "
38 |             else:
39 |                 pad = ""
40 |             class_name = self.class_names[idx]
41 |             class_name = " " + class_name + " |"
42 |             row_str = class_name + pad + row
43 |             row_str += " |" + str(col_sum[idx])
44 |             s.append(row_str)
45 | 
46 |         row_sum = [(self.max_len+4)*" "+" ".join(map(str, row_sum))]
47 |         hline = [(1+self.max_len)*" "+"-"*len(row_sum[0])]
48 | 
49 |         s = hline + s + hline + row_sum
50 | 
51 |         # add linebreaks
52 |         s_out = [line+'\n' for line in s]
53 |         return "".join(s_out)
54 | 
55 |     def batch_add(self, targets, preds):
56 |         assert targets.shape == preds.shape
57 |         assert len(targets) == len(preds)
58 |         assert max(targets) < self.n_classes
59 |         assert max(preds) < self.n_classes
60 |         targets = targets.flatten()
61 |         preds = preds.flatten()
62 |         for i in range(len(targets)):
63 |             self.mat[targets[i], preds[i]] += 1
64 | 
65 |     def get_errors(self):
66 |         tp = np.asarray(np.diag(self.mat).flatten(),dtype='float')
67 |         fn = np.asarray(np.sum(self.mat, axis=1).flatten(),dtype='float') - tp
68 |         fp = np.asarray(np.sum(self.mat, axis=0).flatten(),dtype='float') - tp
69 |         tn = np.asarray(np.sum(self.mat)*np.ones(self.n_classes).flatten(),
70 |                         dtype='float') - tp - fn - fp
71 |         return tp, fn, fp, tn
72 | 
73 |     def accuracy(self):
74 |         """
75 |         Calculates global accuracy
76 |         :return: accuracy
77 |         :example: >>> conf = ConfusionMatrix(3)
78 |                   >>> conf.batch_add([0,0,1],[0,0,2])
79 |                   >>> print(conf.accuracy())
80 |         """
81 |         tp, _, _, _ = self.get_errors()
82 |         n_samples = np.sum(self.mat)
83 |         return np.sum(tp) / n_samples
84 | 
85 |     def sensitivity(self):
86 |         tp, fn, fp, tn = self.get_errors()
87 |         res = tp / (tp + fn)
88 |         res = res[~np.isnan(res)]
89 |         return res
90 | 
91 |     def specificity(self):
92 |         tp, fn, fp, tn = self.get_errors()
93 |         res = tn / (tn + fp)
94 |         res = res[~np.isnan(res)]
95 |         return res
96 | 
97 |     def positive_predictive_value(self):
98 |         tp, fn, fp, tn = self.get_errors()
99 |         res = tp / (tp + fp)
100 |         res = res[~np.isnan(res)]
101 |         return res
102 | 
103 |     def negative_predictive_value(self):
104 |         tp, fn, fp, tn = self.get_errors()
105 |         res = tn / (tn + fn)
106 |         res = res[~np.isnan(res)]
107 |         return res
108 | 
109 |     def false_positive_rate(self):
110 |         tp, fn, fp, tn = self.get_errors()
111 |         res = fp / (fp + tn)
112 |         res = res[~np.isnan(res)]
113 |         return res
114 | 
115 |     def false_discovery_rate(self):
116 |         tp, fn, fp, tn = self.get_errors()
117 |         res = fp / (tp + fp)
118 |         res = res[~np.isnan(res)]
119 |         return res
120 | 
121 |     def F1(self):
122 |         tp, fn, fp, tn = self.get_errors()
123 |         res = (2*tp) / (2*tp + fp + fn)
124 |         res = res[~np.isnan(res)]
125 |         return res
126 | 
127 |     def matthews_correlation(self):
128 |         tp, fn, fp, tn = self.get_errors()
129 |         numerator = tp*tn - fp*fn
130 |         denominator = np.sqrt((tp + fp)*(tp + fn)*(tn + fp)*(tn + fn))
131 |         res = numerator / denominator
132 |         res = res[~np.isnan(res)]
133 |         return res
134 | 
--------------------------------------------------------------------------------
/lab1/confusionmatrix.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepLearningDTU/variational-autoencoders-summerschool-2016/00e2c92f5d4562cb6ac24471d28cd513c7d6aec2/lab1/confusionmatrix.pyc
--------------------------------------------------------------------------------
/lab1/mnist.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepLearningDTU/variational-autoencoders-summerschool-2016/00e2c92f5d4562cb6ac24471d28cd513c7d6aec2/lab1/mnist.npz
--------------------------------------------------------------------------------
/lab1/out.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepLearningDTU/variational-autoencoders-summerschool-2016/00e2c92f5d4562cb6ac24471d28cd513c7d6aec2/lab1/out.png
--------------------------------------------------------------------------------
/lab2/out.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepLearningDTU/variational-autoencoders-summerschool-2016/00e2c92f5d4562cb6ac24471d28cd513c7d6aec2/lab2/out.png
--------------------------------------------------------------------------------
/lab3/VAE.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepLearningDTU/variational-autoencoders-summerschool-2016/00e2c92f5d4562cb6ac24471d28cd513c7d6aec2/lab3/VAE.png
--------------------------------------------------------------------------------
/lab3/out.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepLearningDTU/variational-autoencoders-summerschool-2016/00e2c92f5d4562cb6ac24471d28cd513c7d6aec2/lab3/out.png
--------------------------------------------------------------------------------
/lab3/samplelayer.py:
--------------------------------------------------------------------------------
1 | import lasagne
2 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
3 | import theano.tensor as T
4 | import theano
5 | 
6 | 
7 | class SimpleSampleLayer(lasagne.layers.MergeLayer):
8 |     """
9 |     Simple sampling layer drawing a single Monte Carlo sample to approximate
10 |     E_q [log( p(x,z) / q(z|x) )]. This is the approach described in [KINGMA]_.
11 | 
12 |     Parameters
13 |     ----------
14 |     mu, log_var : :class:`Layer` instances
15 |         Parameterizing the mean and log(variance) of the distribution to sample
16 |         from as described in [KINGMA]_. The code assumes that these have the
17 |         same number of dimensions.
18 | 19 | seed : int 20 | seed to random stream 21 | 22 | Methods 23 | ---------- 24 | seed : Helper function to change the random seed after init is called 25 | 26 | References 27 | ---------- 28 | .. [KINGMA] Kingma, Diederik P., and Max Welling. 29 | "Auto-Encoding Variational Bayes." 30 | arXiv preprint arXiv:1312.6114 (2013). 31 | """ 32 | def __init__(self, mean, log_var, 33 | seed=lasagne.random.get_rng().randint(1, 2147462579), 34 | **kwargs): 35 | super(SimpleSampleLayer, self).__init__([mean, log_var], **kwargs) 36 | 37 | self._srng = RandomStreams(seed) 38 | 39 | def seed(self, seed=lasagne.random.get_rng().randint(1, 2147462579)): 40 | self._srng.seed(seed) 41 | 42 | def get_output_shape_for(self, input_shapes): 43 | return input_shapes[0] 44 | 45 | def get_output_for(self, input, **kwargs): 46 | mu, log_var = input 47 | eps = self._srng.normal(mu.shape) 48 | z = mu + T.exp(0.5 * log_var) * eps 49 | return z 50 | 51 | 52 | class SampleLayer(lasagne.layers.MergeLayer): 53 | """ 54 | Sampling layer supporting importance sampling as described in [BURDA]_ and 55 | multiple Monte Carlo samples for the approximation of 56 | E_q [log( p(x,z) / q(z|x) )]. 57 | 58 | Parameters 59 | ---------- 60 | mu : class:`Layer` instance 61 | Parameterizing the mean of the distribution to sample 62 | from as described in [BURDA]_. 63 | 64 | log_var : class:`Layer` instance 65 | By default assumed to parametrize log(sigma^2) of the distribution to 66 | sample from as described in [BURDA]_ which is transformed to sigma using 67 | the nonlinearity function as described below. Effectively this means 68 | that the nonlinearity function controls what log_var parametrizes. A few 69 | common examples: 70 | -nonlinearity = lambda x: T.exp(0.5*x) => log_var = log(sigma^2)[default] 71 | -nonlinearity = lambda x: T.sqrt(x) => log_var = sigma^2 72 | -nonlinearity = lambda x: x => log_var = sigma 73 | 74 | eq_samples : int or T.scalar 75 | Number of Monte Carlo samples used to estimate the expectation over 76 | q(z|x) in eq. (8) in [BURDA]_. 77 | 78 | iw_samples : int or T.scalar 79 | Number of importance samples in the sum over k in eq. (8) in [BURDA]_. 80 | 81 | nonlinearity : callable or None 82 | The nonlinearity that is applied to the log_var input layer to transform 83 | it into a standard deviation. By default we assume that 84 | log_var = log(sigma^2) and hence the corresponding nonlinearity is 85 | f(x) = T.exp(0.5*x) such that T.exp(0.5*log(sigma^2)) = sigma 86 | 87 | seed : int 88 | seed to random stream 89 | 90 | Methods 91 | ---------- 92 | seed : Helper function to change the random seed after init is called 93 | 94 | References 95 | ---------- 96 | .. [BURDA] Burda, Yuri, Roger Grosse, and Ruslan Salakhutdinov. 97 | "Importance Weighted Autoencoders." 98 | arXiv preprint arXiv:1509.00519 (2015). 
99 | """ 100 | 101 | def __init__(self, mean, log_var, 102 | eq_samples=1, 103 | iw_samples=1, 104 | nonlinearity=lambda x: T.exp(0.5*x), 105 | seed=lasagne.random.get_rng().randint(1, 2147462579), 106 | **kwargs): 107 | super(SampleLayer, self).__init__([mean, log_var], **kwargs) 108 | 109 | self.eq_samples = eq_samples 110 | self.iw_samples = iw_samples 111 | self.nonlinearity = nonlinearity 112 | 113 | self._srng = RandomStreams(seed) 114 | 115 | def seed(self, seed=lasagne.random.get_rng().randint(1, 2147462579)): 116 | self._srng.seed(seed) 117 | 118 | def get_output_shape_for(self, input_shapes): 119 | batch_size, num_latent = input_shapes[0] 120 | if isinstance(batch_size, int) and \ 121 | isinstance(self.iw_samples, int) and \ 122 | isinstance(self.eq_samples, int): 123 | out_dim = (batch_size*self.eq_samples*self.iw_samples, num_latent) 124 | else: 125 | out_dim = (None, num_latent) 126 | return out_dim 127 | 128 | def get_output_for(self, input, **kwargs): 129 | mu, log_var = input 130 | batch_size, num_latent = mu.shape 131 | eps = self._srng.normal( 132 | [batch_size, self.eq_samples, self.iw_samples, num_latent], 133 | dtype=theano.config.floatX) 134 | 135 | z = mu.dimshuffle(0,'x','x',1) + \ 136 | self.nonlinearity( log_var.dimshuffle(0,'x','x',1)) * eps 137 | 138 | return z.reshape((-1,num_latent)) 139 | 140 | 141 | class SimpleBernoulliSampleLayer(lasagne.layers.Layer): 142 | """ 143 | Simple sampling layer drawing samples from bernoulli distributions. 144 | 145 | Parameters 146 | ---------- 147 | mean : :class:`Layer` instances 148 | Parameterizing the mean value of each bernoulli distribution 149 | seed : int 150 | seed to random stream 151 | Methods 152 | ---------- 153 | seed : Helper function to change the random seed after init is called 154 | """ 155 | 156 | def __init__(self, mean, 157 | seed=lasagne.random.get_rng().randint(1, 2147462579), 158 | **kwargs): 159 | super(SimpleBernoulliSampleLayer, self).__init__(mean, **kwargs) 160 | 161 | self._srng = RandomStreams(seed) 162 | 163 | def seed(self, seed=lasagne.random.get_rng().randint(1, 2147462579)): 164 | self._srng.seed(seed) 165 | 166 | def get_output_shape_for(self, input_shape): 167 | return input_shape 168 | 169 | def get_output_for(self, mu, **kwargs): 170 | return self._srng.binomial(size=mu.shape, p=mu, dtype=mu.dtype) 171 | 172 | 173 | class BernoulliSampleLayer(lasagne.layers.Layer): 174 | """ 175 | Bernoulli Sampling layer supporting importance sampling 176 | Parameters 177 | ---------- 178 | mean : class:`Layer` instance 179 | Parameterizing the mean value of each bernoulli distribution 180 | eq_samples : int or T.scalar 181 | Number of Monte Carlo samples used to estimate the expectation over 182 | iw_samples : int or T.scalar 183 | Number of importance samples in the sum over k 184 | seed : int 185 | seed to random stream 186 | Methods 187 | ---------- 188 | seed : Helper function to change the random seed after init is called 189 | """ 190 | 191 | def __init__(self, mean, 192 | eq_samples=1, 193 | iw_samples=1, 194 | seed=lasagne.random.get_rng().randint(1, 2147462579), 195 | **kwargs): 196 | super(BernoulliSampleLayer, self).__init__(mean, **kwargs) 197 | 198 | self.eq_samples = eq_samples 199 | self.iw_samples = iw_samples 200 | 201 | self._srng = RandomStreams(seed) 202 | 203 | def seed(self, seed=lasagne.random.get_rng().randint(1, 2147462579)): 204 | self._srng.seed(seed) 205 | 206 | def get_output_shape_for(self, input_shape): 207 | batch_size, num_latent = input_shape 208 | if 
isinstance(batch_size, int) and \ 209 | isinstance(self.iw_samples, int) and \ 210 | isinstance(self.eq_samples, int): 211 | out_dim = (batch_size*self.eq_samples*self.iw_samples, num_latent) 212 | else: 213 | out_dim = (None, num_latent) 214 | return out_dim 215 | 216 | def get_output_for(self, input, **kwargs): 217 | mu = input 218 | batch_size, num_latent = mu.shape 219 | shp = (batch_size, self.eq_samples, self.iw_samples, num_latent) 220 | mu_shp = mu.dimshuffle(0,'x','x',1) 221 | mu_shp = T.repeat(mu_shp, axis=1, repeats=self.eq_samples) 222 | mu_shp = T.repeat(mu_shp, axis=2, repeats=self.iw_samples) 223 | samples = self._srng.binomial( 224 | size=shp, p=mu_shp, dtype=theano.config.floatX) 225 | return samples.reshape((-1, num_latent)) 226 | -------------------------------------------------------------------------------- /lab4/lab4_VAESSL.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from __future__ import division, print_function\n", 12 | "import matplotlib\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "from IPython.display import Image, display, clear_output\n", 15 | "%matplotlib nbagg\n", 16 | "%matplotlib inline \n", 17 | "import numpy as np\n", 18 | "import sklearn.datasets\n", 19 | "import theano\n", 20 | "import theano.tensor as T\n", 21 | "import lasagne\n", 22 | "import math" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "# Variational Auto-encoders and Semi-supervised Learning\n", 30 | "\n", 31 | "Now we are going to extend the VAE from lab3 with a variable $y$, so that we can apply supervised and semi-supervised learning. We'll model the supervised VAE in which we model $p(x|y,z)$ and $q(z|x,y)$ (cf. [Kingma et al.](https://arxiv.org/abs/1406.5298)). This way we condition our latent variable and our generated sample on our label $y$. Next we'll define semi-supervised learning in which $y$ is given for the labeled data and as a latent variable that is marginalized out for the unlabeled data." 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "## MNIST: Supervised VAE\n", 39 | "First we'll load the MNIST dataset and plot a few examples. We only load a limited amount of number classes, so that we can speed up training." 
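Before running the loading cell, here is a tiny NumPy-only sketch (shapes and values are made up; it is not part of the lab code) of what conditioning on $y$ means in practice: the one-hot label is simply concatenated onto the encoder input and onto the latent code fed to the decoder, which is exactly what the ConcatLayer calls in the model definition below do.

import numpy as np

x = np.random.rand(1, 784).astype('float32')       # one flattened 28x28 image (dummy values)
y = np.array([[0., 1., 0.]], dtype='float32')      # one-hot label: class 1 of 3
z = np.random.randn(1, 2).astype('float32')        # one latent sample, num_latent_z = 2

encoder_input = np.concatenate([x, y], axis=1)     # what ConcatLayer([l_in_x, l_in_y]) produces
decoder_input = np.concatenate([z, y], axis=1)     # what ConcatLayer([l_in_z, l_in_y]) produces
print(encoder_input.shape, decoder_input.shape)    # (1, 787) (1, 5)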
40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": { 46 | "collapsed": false 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "from sklearn.utils import shuffle\n", 51 | "#To speed up training we'll only work on a subset of the data\n", 52 | "#We discretize the data to 0 and 1 in order to use it with a bernoulli observation model p(x|z) = Ber(mu(z))\n", 53 | "\n", 54 | "def bernoullisample(x):\n", 55 | " return np.random.binomial(1,x,size=x.shape).astype(theano.config.floatX)\n", 56 | "\n", 57 | "def onehot(t, num_classes):\n", 58 | " out = np.zeros((t.shape[0], num_classes)).astype('float32')\n", 59 | " for row, col in enumerate(t):\n", 60 | " out[row, col] = 1\n", 61 | " return out\n", 62 | "\n", 63 | "data = np.load('../lab1/mnist.npz')\n", 64 | "num_classes = 3\n", 65 | "idxs_train = []\n", 66 | "idxs_valid = []\n", 67 | "idxs_test = []\n", 68 | "for i in range(num_classes):\n", 69 | " idxs_train += np.where(data['y_train'] == i)[0].tolist()\n", 70 | " idxs_valid += np.where(data['y_valid'] == i)[0].tolist()\n", 71 | " idxs_test += np.where(data['y_test'] == i)[0].tolist()\n", 72 | "\n", 73 | "x_train = bernoullisample(data['X_train'][idxs_train]).astype('float32')\n", 74 | "targets_train = data['y_train'][idxs_train].astype('int32') # Since this is unsupervised, the targets are only used for validation.\n", 75 | "x_train, targets_train = shuffle(x_train, targets_train, random_state=1234)\n", 76 | "t_train = onehot(targets_train, num_classes)\n", 77 | "\n", 78 | "x_valid = bernoullisample(data['X_valid'][idxs_valid]).astype('float32')\n", 79 | "targets_valid = data['y_valid'][idxs_valid].astype('int32')\n", 80 | "t_valid = onehot(targets_valid, num_classes)\n", 81 | "\n", 82 | "x_test = bernoullisample(data['X_test'][idxs_test]).astype('float32')\n", 83 | "targets_test = data['y_test'][idxs_test].astype('int32')\n", 84 | "t_test = onehot(targets_test, num_classes)\n", 85 | "\n", 86 | "print(\"training set dim(%i, %i).\" % x_train.shape)\n", 87 | "print(\"validation set dim(%i, %i).\" % x_valid.shape)\n", 88 | "print(\"test set dim(%i, %i).\" % x_test.shape)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": { 95 | "collapsed": false 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "#plot a few MNIST examples\n", 100 | "def plot_samples(x,title=''):\n", 101 | " idx = 0\n", 102 | " canvas = np.zeros((28*10, 10*28))\n", 103 | " for i in range(10):\n", 104 | " for j in range(10):\n", 105 | " canvas[i*28:(i+1)*28, j*28:(j+1)*28] = x_train[idx].reshape((28, 28))\n", 106 | " idx += 1\n", 107 | " plt.figure(figsize=(7, 7))\n", 108 | " plt.imshow(canvas, cmap='gray')\n", 109 | " plt.title(title)\n", 110 | " plt.show()\n", 111 | "\n", 112 | "plot_samples(x_train[:100],title='MNIST handwritten digits')" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "As for lab3, we define the helper functions for calculating the lowerbound." 
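For reference, the two helpers in the next cell implement these standard closed-form expressions (per dimension; the code then sums them over the feature and latent dimensions respectively):

$$\log \mathrm{Ber}(x \mid p) = x \log p + (1-x) \log (1-p)$$

$$KL\big(\mathcal{N}(\mu, \sigma^2) \,\|\, \mathcal{N}(0, 1)\big) = -\tfrac{1}{2}\big(1 + \log \sigma^2 - \mu^2 - \sigma^2\big),$$

where the code works with $\log \sigma^2$ directly (the log_var argument).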
120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "collapsed": true 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "#defined a couple of helper functions\n", 131 | "c = - 0.5 * math.log(2*math.pi)\n", 132 | "def log_bernoulli(x, p, eps=0.0):\n", 133 | " p = T.clip(p, eps, 1.0 - eps)\n", 134 | " return -T.nnet.binary_crossentropy(p, x)\n", 135 | "\n", 136 | "def kl_normal2_stdnormal(mean, log_var):\n", 137 | " return -0.5*(1 + log_var - mean**2 - T.exp(log_var))" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "The model should now be modified to condition on the labeled data $y$." 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": { 151 | "collapsed": false 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "from lasagne.layers import InputLayer, DenseLayer, get_output, get_all_params, ConcatLayer\n", 156 | "from lasagne.nonlinearities import rectify, sigmoid\n", 157 | "from samplelayer import SimpleSampleLayer\n", 158 | "\n", 159 | "num_features = x_train.shape[-1]\n", 160 | "num_latent_z = 2\n", 161 | "\n", 162 | "#MODEL SPECIFICATION\n", 163 | "\n", 164 | "#ENCODER\n", 165 | "l_in_x = InputLayer(shape=(None, num_features))\n", 166 | "l_in_y = InputLayer(shape=(None, num_classes))\n", 167 | "l_enc = ConcatLayer([l_in_x, l_in_y])\n", 168 | "l_enc = DenseLayer(l_enc, num_units=128, nonlinearity=rectify)\n", 169 | "l_muq = DenseLayer(l_enc, num_units=num_latent_z, nonlinearity=None) #mu(x)\n", 170 | "l_logvarq = DenseLayer(l_enc, num_units=num_latent_z, nonlinearity=lambda x: T.clip(x,-10,10)) #logvar(x), \n", 171 | "l_z = SimpleSampleLayer(mean=l_muq, log_var=l_logvarq) #sample a latent representation z \\sim q(z|x) = N(mu(x),logvar(x))\n", 172 | "#we split the in two parts to allow sampling from the decoder model separately\n", 173 | "#DECODER\n", 174 | "l_in_z = InputLayer(shape=(None, num_latent_z))\n", 175 | "l_dec = ConcatLayer([l_in_z, l_in_y])\n", 176 | "l_dec = DenseLayer(l_dec, num_units=128, nonlinearity=rectify) \n", 177 | "l_mux = DenseLayer(l_dec, num_units=num_features, nonlinearity=sigmoid) #reconstruction of input using a sigmoid output since mux \\in [0,1] " 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "The objective functions are defined similarly to lab3." 
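As an optional aside (assuming the helper cell above has been run), a one-line numerical check confirms that a posterior equal to the prior, $\mu = 0$ and $\log \sigma^2 = 0$, gives zero KL divergence:

m, lv = T.matrix('m'), T.matrix('lv')
zeros = np.zeros((1, 2), dtype='float32')
print(kl_normal2_stdnormal(m, lv).sum(axis=1).eval({m: zeros, lv: zeros}))   # [ 0.]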
185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "collapsed": false 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "\n", 196 | "sym_x = T.matrix('x')\n", 197 | "sym_y = T.matrix('y')\n", 198 | "sym_z = T.matrix('z')\n", 199 | "\n", 200 | "z_train, muq_train, logvarq_train = get_output([l_z,l_muq,l_logvarq],{l_in_x:sym_x, l_in_y:sym_y},deterministic=False)\n", 201 | "mux_train = get_output(l_mux,{l_in_z:z_train, l_in_y:sym_y},deterministic=False)\n", 202 | "\n", 203 | "z_eval, muq_eval, logvarq_eval = get_output([l_z,l_muq,l_logvarq],{l_in_x:sym_x, l_in_y:sym_y},deterministic=True)\n", 204 | "mux_eval = get_output(l_mux,{l_in_z:z_eval, l_in_y:sym_y},deterministic=True)\n", 205 | "\n", 206 | "mux_sample = get_output(l_mux,{l_in_z:sym_z, l_in_y:sym_y},deterministic=True)\n", 207 | "\n", 208 | "#define the cost function\n", 209 | "def LogLikelihood(mux,x,muq,logvarq):\n", 210 | " log_px_given_z = log_bernoulli(x, mux, eps=1e-6).sum(axis=1).mean() #note that we sum the latent dimension and mean over the samples\n", 211 | " KL_qp = kl_normal2_stdnormal(muq, logvarq).sum(axis=1).mean()\n", 212 | " LL = log_px_given_z - KL_qp\n", 213 | " return LL, log_px_given_z, KL_qp\n", 214 | "\n", 215 | "LL_train, logpx_train, KL_train = LogLikelihood(mux_train, sym_x, muq_train, logvarq_train)\n", 216 | "LL_eval, logpx_eval, KL_eval = LogLikelihood(mux_eval, sym_x, muq_eval, logvarq_eval)\n", 217 | "\n", 218 | "all_params = get_all_params([l_z,l_mux],trainable=True)\n", 219 | "\n", 220 | "# Let Theano do its magic and get all the gradients we need for training\n", 221 | "all_grads = T.grad(-LL_train, all_params)\n", 222 | "\n", 223 | "# Set the update function for parameters. The Adam optimizer works really well with VAEs.\n", 224 | "updates = lasagne.updates.adam(all_grads, all_params, learning_rate=1e-3)\n", 225 | "\n", 226 | "f_train = theano.function(inputs=[sym_x, sym_y],\n", 227 | " outputs=[LL_train, logpx_train, KL_train],\n", 228 | " updates=updates)\n", 229 | "\n", 230 | "f_eval = theano.function(inputs=[sym_x, sym_y],\n", 231 | " outputs=[LL_eval, logpx_eval, KL_eval])\n", 232 | "\n", 233 | "f_z = theano.function(inputs=[sym_x, sym_y],\n", 234 | " outputs=[z_eval])\n", 235 | "\n", 236 | "f_sample = theano.function(inputs=[sym_z, sym_y],\n", 237 | " outputs=[mux_sample])\n", 238 | "\n", 239 | "f_recon = theano.function(inputs=[sym_x, sym_y],\n", 240 | " outputs=[mux_eval])" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": { 247 | "collapsed": false 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "batch_size = 100\n", 252 | "samples_to_process = 1e4\n", 253 | "val_interval = 5e2\n", 254 | " \n", 255 | "LL_train, KL_train, logpx_train = [],[],[]\n", 256 | "LL_valid, KL_valid, logpx_valid = [],[],[]\n", 257 | "samples_processed = 0\n", 258 | "plt.figure(figsize=(12, 24))\n", 259 | "valid_samples_processed = []\n", 260 | "\n", 261 | "try:\n", 262 | " while samples_processed < samples_to_process:\n", 263 | " _LL_train, _KL_train, _logpx_train = [],[],[]\n", 264 | " idxs = np.random.choice(range(x_train.shape[0]), size=(batch_size), replace=False) \n", 265 | " x_batch = x_train[idxs]\n", 266 | " t_batch = t_train[idxs]\n", 267 | " out = f_train(x_batch, t_batch)\n", 268 | " samples_processed += batch_size\n", 269 | " \n", 270 | " if samples_processed % val_interval == 0:\n", 271 | " valid_samples_processed += [samples_processed]\n", 272 | " out = f_eval(x_train, t_train)\n", 273 | 
" LL_train += [out[0]] \n", 274 | " logpx_train += [out[1]]\n", 275 | " KL_train += [out[2]]\n", 276 | " \n", 277 | " out = f_eval(x_valid, t_valid)\n", 278 | " LL_valid += [out[0]]\n", 279 | " logpx_valid += [out[1]]\n", 280 | " KL_valid += [out[2]]\n", 281 | " \n", 282 | " z_eval = f_z(x_valid, t_valid)[0]\n", 283 | " x_sample = f_sample(np.random.normal(size=(100, num_latent_z)).astype('float32'), onehot(np.random.randint(0, num_classes, size=100), num_classes))[0]\n", 284 | " x_recon = f_recon(x_valid, t_valid)[0]\n", 285 | " \n", 286 | " plt.subplot(num_classes+2,2,1)\n", 287 | " plt.legend(['LL', 'log(p(x))'], loc=2)\n", 288 | " plt.xlabel('Updates')\n", 289 | " plt.plot(valid_samples_processed, LL_train, color=\"black\")\n", 290 | " plt.plot(valid_samples_processed, logpx_train, color=\"red\")\n", 291 | " plt.plot(valid_samples_processed, LL_valid, color=\"black\", linestyle=\"--\")\n", 292 | " plt.plot(valid_samples_processed, logpx_valid, color=\"red\", linestyle=\"--\")\n", 293 | " plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0))\n", 294 | " plt.grid('on')\n", 295 | " \n", 296 | " plt.subplot(num_classes+2,2,2)\n", 297 | " plt.cla()\n", 298 | " plt.xlabel('z0'), plt.ylabel('z1')\n", 299 | " color = iter(plt.get_cmap('brg')(np.linspace(0, 1.0, num_classes)))\n", 300 | " for i in range(num_classes):\n", 301 | " clr = next(color)\n", 302 | " plt.scatter(z_eval[targets_valid==i, 0], z_eval[targets_valid==i, 1], c=clr, s=5., lw=0, marker='o', )\n", 303 | " plt.grid('on')\n", 304 | " \n", 305 | " \n", 306 | " plt.subplot(num_classes+2,2,3)\n", 307 | " plt.legend(['KL(q||p)'])\n", 308 | " plt.xlabel('Updates')\n", 309 | " plt.plot(valid_samples_processed, KL_train, color=\"blue\")\n", 310 | " plt.plot(valid_samples_processed, KL_valid, color=\"blue\", linestyle=\"--\")\n", 311 | " plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0))\n", 312 | " plt.grid('on')\n", 313 | " \n", 314 | " plt.subplot(num_classes+2,2,4)\n", 315 | " plt.cla()\n", 316 | " plt.title('Samples')\n", 317 | " plt.axis('off')\n", 318 | " idx = 0\n", 319 | " canvas = np.zeros((28*10, 10*28))\n", 320 | " for i in range(10):\n", 321 | " for j in range(10):\n", 322 | " canvas[i*28:(i+1)*28, j*28:(j+1)*28] = x_sample[idx].reshape((28, 28))\n", 323 | " idx += 1\n", 324 | " plt.imshow(canvas, cmap='gray')\n", 325 | " \n", 326 | " c=0\n", 327 | " for k in range(5, 5 + num_classes*2, 2):\n", 328 | " plt.subplot(num_classes+2,2,k)\n", 329 | " plt.cla()\n", 330 | " plt.title('Inputs for %i' % c)\n", 331 | " plt.axis('off')\n", 332 | " idx = 0\n", 333 | " canvas = np.zeros((28*10, 10*28))\n", 334 | " for i in range(10):\n", 335 | " for j in range(10):\n", 336 | " canvas[i*28:(i+1)*28, j*28:(j+1)*28] = x_valid[targets_valid==c][idx].reshape((28, 28))\n", 337 | " idx += 1\n", 338 | " plt.imshow(canvas, cmap='gray')\n", 339 | "\n", 340 | " plt.subplot(num_classes+2,2,k+1)\n", 341 | " plt.cla()\n", 342 | " plt.title('Reconstructions for %i' % c)\n", 343 | " plt.axis('off')\n", 344 | " idx = 0\n", 345 | " canvas = np.zeros((28*10, 10*28))\n", 346 | " for i in range(10):\n", 347 | " for j in range(10):\n", 348 | " canvas[i*28:(i+1)*28, j*28:(j+1)*28] = x_recon[targets_valid==c][idx].reshape((28, 28))\n", 349 | " idx += 1\n", 350 | " plt.imshow(canvas, cmap='gray')\n", 351 | " c += 1\n", 352 | " \n", 353 | " plt.savefig(\"out.png\")\n", 354 | " display(Image(filename=\"out.png\"))\n", 355 | " clear_output(wait=True)\n", 356 | " \n", 357 | "except KeyboardInterrupt:\n", 358 | " pass\n", 359 | "\n" 360 | ] 361 | }, 
362 | { 363 | "cell_type": "markdown", 364 | "metadata": {}, 365 | "source": [ 366 | "### Exercise 1 - Analyzing the supervised VAE\n", 367 | "1. Test different network structures and evaluate the performance of the model.\n", 368 | "2. Investigate the latent space $z$ of the supervised VAE by performing a random walk and generating samples in the same way as [Kingma et al.](https://arxiv.org/abs/1406.5298)." 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "metadata": {}, 374 | "source": [ 375 | "## MNIST: Semi-supervised VAE (for the very ambitous)" 376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "metadata": {}, 381 | "source": [ 382 | "For the semi-supervised modeling, we generate a labeled subset of our training data." 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": { 389 | "collapsed": false 390 | }, 391 | "outputs": [], 392 | "source": [ 393 | "def bernoullisample(x):\n", 394 | " return np.random.binomial(1,x,size=x.shape).astype(theano.config.floatX)\n", 395 | "\n", 396 | "data = np.load('../lab1/mnist.npz')\n", 397 | "num_classes = 5\n", 398 | "idxs_train = []\n", 399 | "idxs_valid = []\n", 400 | "idxs_test = []\n", 401 | "for i in range(num_classes):\n", 402 | " idxs_train += np.where(data['y_train'] == i)[0].tolist()\n", 403 | " idxs_valid += np.where(data['y_valid'] == i)[0].tolist()\n", 404 | " idxs_test += np.where(data['y_test'] == i)[0].tolist()\n", 405 | "\n", 406 | "x_train = bernoullisample(data['X_train'][idxs_train]).astype('float32')\n", 407 | "targets_train = data['y_train'][idxs_train].astype('int32') # Since this is unsupervised, the targets are only used for validation.\n", 408 | "x_train, targets_train = shuffle(x_train, targets_train, random_state=1234)\n", 409 | "t_train = onehot(targets_train, num_classes)\n", 410 | "\n", 411 | "x_valid = bernoullisample(data['X_valid'][idxs_valid]).astype('float32')\n", 412 | "targets_valid = data['y_valid'][idxs_valid].astype('int32')\n", 413 | "t_valid = onehot(targets_valid, num_classes)\n", 414 | "\n", 415 | "x_test = bernoullisample(data['X_test'][idxs_test]).astype('float32')\n", 416 | "targets_test = data['y_test'][idxs_test].astype('int32')\n", 417 | "t_test = onehot(targets_test, num_classes)\n", 418 | "\n", 419 | "print(\"training set dim(%i, %i).\" % x_train.shape)\n", 420 | "print(\"validation set dim(%i, %i).\" % x_valid.shape)\n", 421 | "print(\"test set dim(%i, %i).\" % x_test.shape)" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": { 428 | "collapsed": false 429 | }, 430 | "outputs": [], 431 | "source": [ 432 | "# Generate a subset of labeled data points\n", 433 | "\n", 434 | "num_labeled = 2 # You decide on the size of the fraction...\n", 435 | "\n", 436 | "idxs_train_l = []\n", 437 | "for i in range(num_classes):\n", 438 | " idxs = np.where(targets_train == i)[0]\n", 439 | " idxs_train_l += np.random.choice(idxs, size=num_labeled).tolist()\n", 440 | "\n", 441 | "x_train_l = x_train[idxs_train_l]\n", 442 | "t_train_l = onehot(targets_train[idxs_train_l], num_classes)\n", 443 | "print(\"labeled training set dim(%i, %i).\" % x_train_l.shape)\n", 444 | "\n", 445 | "plt.figure(figsize=(12, 7))\n", 446 | "for i in range(num_classes*num_labeled):\n", 447 | " im = x_train_l[i].reshape((28, 28))\n", 448 | " plt.subplot(1, num_classes*num_labeled, i+1)\n", 449 | " plt.imshow(im, cmap='gray')\n", 450 | " plt.axis('off')" 451 | ] 452 | }, 453 | { 454 | "cell_type": "markdown", 455 | 
"metadata": {}, 456 | "source": [ 457 | "For the semi-supervised case, the lower bound is divided into two, so that for the labeled data we have:\n", 458 | "$$\n", 459 | "\\begin{align}\n", 460 | "\\log p(x,y) &= \\int_z \\log p(x,y,z) dz \\\\\n", 461 | "&\\ge \\int_z q(z|x,y) \\log \\bigg[ \\frac{p(x,y,z)}{q(z|x,y)} \\bigg]dz \\\\\n", 462 | "&= \\mathbb{E}_{q(z|x,y)} [\\log p(x,y,z) - \\log q(z|x,y)]dz \\equiv -\\mathcal{L}(x,y)\\ ,\n", 463 | "\\end{align}\n", 464 | "$$\n", 465 | "\n", 466 | "where in our case the $p(x,y,z)=p(x|y,z)p(y)p(z)$. For the unlabeled data $y$ is now a latent variable that needs to be marginalized out:\n", 467 | "$$\n", 468 | "\\begin{align}\n", 469 | "\\log p(x,y) &= \\int_y \\int_z \\log p(x,y,z) dzdy \\\\\n", 470 | "&\\ge \\int_y q(y|x) \\int_z q(z|x,y) \\log \\bigg[ \\frac{p(x,y,z)}{q(y|x)q(z|x,y)} \\bigg]dzdy \\\\\n", 471 | "&= \\int_y q(y|x) \\int_z q(z|x,y) [\\log p(x,y,z) - \\log q(y|x) - \\log q(z|x,y) ]dzdy \\\\\n", 472 | "&= \\int_y q(y|x) \\int_z q(z|x,y) [\\log p(x,y,z) - \\log q(z|x,y)]dzdy - \\int_y \\int_z q(y|x)q(z|x,y) \\log q(y|x)dzdy \\\\\n", 473 | "&= \\int_y q(y|x) \\int_z q(z|x,y) [\\log p(x,y,z) - \\log q(z|x,y)]dzdy - \\int_y q(y|x) \\log q(y|x) \\int_z q(z|x,y)dzdy \\\\\n", 474 | "&= \\int_y q(y|x) \\int_z q(z|x,y) [\\log p(x,y,z) - \\log q(z|x,y)]dzdy - \\int_y q(y|x) \\log q(y|x)dy \\cdot 1 \\\\\n", 475 | "&= \\int_y q(y|x) \\int_z q(z|x,y) [\\log p(x,y,z) - \\log q(z|x,y)]dzdy + \\mathcal{H}(q(y|x)) \\\\\n", 476 | "&= \\sum_y q(y|x) (-\\mathcal{L}(x,y)) + \\mathcal{H}(q(y|x)) \\equiv -\\mathcal{U}(x)\\ .\n", 477 | "\\end{align}\n", 478 | "$$\n", 479 | "\n", 480 | "We collect the two lowerbounds by:\n", 481 | "$$\n", 482 | "\\begin{align}\n", 483 | "\\mathcal{J}^\\alpha = \\sum_{x_l,y_l} \\mathcal{L}(x_l, y_l) - \\alpha \\cdot \\log q(y_l|x_l) + \\sum_{x_u, y_u} \\mathcal{U}(x_u, y_u) \\ ,\n", 484 | "\\end{align}\n", 485 | "$$\n", 486 | "\n", 487 | "where the right hand-side is the addition of the cross-entropy scaled by an alpha constant which is equivalent to the fraction between unlabeled and labeled data points." 
488 | ] 489 | }, 490 | { 491 | "cell_type": "markdown", 492 | "metadata": {}, 493 | "source": [ 494 | "Following you are provided with a code snippet for computing the lowerbound for the semi-supervised VAE:" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": null, 500 | "metadata": { 501 | "collapsed": false 502 | }, 503 | "outputs": [], 504 | "source": [ 505 | "from lasagne.layers import InputLayer, DenseLayer, get_output, get_all_params, ConcatLayer\n", 506 | "from lasagne.nonlinearities import rectify, sigmoid, softmax\n", 507 | "from samplelayer import SimpleSampleLayer\n", 508 | "\n", 509 | "num_features = x_train.shape[-1]\n", 510 | "num_latent_z = 16\n", 511 | "\n", 512 | "#MODEL SPECIFICATION\n", 513 | "l_in_x = InputLayer(shape=(None, num_features))\n", 514 | "l_in_y = InputLayer(shape=(None, num_classes))\n", 515 | "\n", 516 | "#Classifier\n", 517 | "l_y = DenseLayer(l_in_x, num_units=128, nonlinearity=rectify)\n", 518 | "l_y = DenseLayer(l_y, num_units=num_classes, nonlinearity=softmax)\n", 519 | "\n", 520 | "#ENCODER\n", 521 | "l_enc = ConcatLayer([l_in_x, l_in_y])\n", 522 | "l_enc = DenseLayer(l_enc, num_units=128, nonlinearity=rectify)\n", 523 | "l_muq = DenseLayer(l_enc, num_units=num_latent_z, nonlinearity=None) #mu(x)\n", 524 | "l_logvarq = DenseLayer(l_enc, num_units=num_latent_z, nonlinearity=lambda x: T.clip(x,-10,10)) #logvar(x), \n", 525 | "l_z = SimpleSampleLayer(mean=l_muq, log_var=l_logvarq) #sample a latent representation z \\sim q(z|x) = N(mu(x),logvar(x))\n", 526 | "#we split the in two parts to allow sampling from the decoder model separately\n", 527 | "#DECODER\n", 528 | "l_in_z = InputLayer(shape=(None, num_latent_z))\n", 529 | "l_dec = ConcatLayer([l_in_z, l_in_y])\n", 530 | "l_dec = DenseLayer(l_dec, num_units=128, nonlinearity=rectify)\n", 531 | "l_mux = DenseLayer(l_dec, num_units=num_features, nonlinearity=sigmoid) #reconstruction of input using a sigmoid output since mux \\in [0,1] " 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": null, 537 | "metadata": { 538 | "collapsed": false 539 | }, 540 | "outputs": [], 541 | "source": [ 542 | "from lasagne.objectives import categorical_crossentropy, categorical_accuracy\n", 543 | "\n", 544 | "sym_x_l = T.matrix('x_l')\n", 545 | "sym_x = T.matrix('x')\n", 546 | "sym_y = T.matrix('y')\n", 547 | "sym_z = T.matrix('z')\n", 548 | "\n", 549 | "y_eval = get_output(l_y, {l_in_x:sym_x}, deterministic=True)\n", 550 | "z_eval = get_output(l_z,{l_in_x:sym_x, l_in_y:sym_y},deterministic=True)\n", 551 | "\n", 552 | "#define the cost function for labeled data points\n", 553 | "y_train_l = get_output(l_y, {l_in_x:sym_x_l}, deterministic=False)\n", 554 | "z_train_l, muq_train_l, logvarq_train_l = get_output([l_z,l_muq,l_logvarq],{l_in_x:sym_x_l, l_in_y:sym_y},deterministic=False)\n", 555 | "mux_train_l = get_output(l_mux,{l_in_z:z_train_l, l_in_y:sym_y},deterministic=False)\n", 556 | "\n", 557 | "log_px_l = log_bernoulli(sym_x_l, mux_train_l, eps=1e-6).sum(axis=-1)\n", 558 | "KL_qp_l = kl_normal2_stdnormal(muq_train_l, logvarq_train_l).sum(axis=-1)\n", 559 | "log_qy_l = T.sum(sym_y * T.log(y_train_l+1e-8), axis=1)\n", 560 | "py_l = softmax(T.zeros((sym_x_l.shape[0], num_classes)))\n", 561 | "log_py_l = -categorical_crossentropy(py_l, sym_y)\n", 562 | "alpha = 0.1*(x_train.shape[0]/x_train_l.shape[0])\n", 563 | "LL_l_eval = T.mean(log_px_l + log_py_l - KL_qp_l)\n", 564 | "LL_l = T.mean(log_px_l + log_py_l - KL_qp_l + alpha * log_qy_l)\n", 565 | "\n", 566 | "\n", 567 | 
"#define the cost function for unlabeled data points\n", 568 | "\n", 569 | "# For the integrating out approach, we repeat the input matrix x, and construct a target (bs * n_y) x n_y\n", 570 | "# Example of input and target matrix for a 3 class problem and batch_size=2. 2D tensors of the form\n", 571 | "# x_repeat t_repeat\n", 572 | "# [[x[0,0], x[0,1], ..., x[0,n_x]] [[1, 0, 0]\n", 573 | "# [x[1,0], x[1,1], ..., x[1,n_x]] [1, 0, 0]\n", 574 | "# [x[0,0], x[0,1], ..., x[0,n_x]] [0, 1, 0]\n", 575 | "# [x[1,0], x[1,1], ..., x[1,n_x]] [0, 1, 0]\n", 576 | "# [x[0,0], x[0,1], ..., x[0,n_x]] [0, 0, 1]\n", 577 | "# [x[1,0], x[1,1], ..., x[1,n_x]]] [0, 0, 1]]\n", 578 | "t_eye = T.eye(num_classes, k=0)\n", 579 | "t_u = t_eye.reshape((num_classes, 1, num_classes)).repeat(sym_x.shape[0], axis=1).reshape((-1, num_classes))\n", 580 | "x_u = sym_x.reshape((1, sym_x.shape[0], sym_x.shape[1])).repeat(num_classes, axis=0).reshape((-1, sym_x.shape[1]))\n", 581 | "\n", 582 | "y_train = get_output(l_y, {l_in_x:sym_x}, deterministic=False)\n", 583 | "z_train, muq_train, logvarq_train = get_output([l_z,l_muq,l_logvarq],{l_in_x:x_u, l_in_y:t_u},deterministic=False)\n", 584 | "mux_train = get_output(l_mux,{l_in_z:z_train, l_in_y:t_u},deterministic=False)\n", 585 | "\n", 586 | "log_px_given_z_u = log_bernoulli(x_u, mux_train, eps=1e-6).sum(axis=-1)\n", 587 | "KL_qp_u = kl_normal2_stdnormal(muq_train, logvarq_train).sum(axis=-1)\n", 588 | "py_u = softmax(T.zeros((x_u.shape[0], num_classes)))\n", 589 | "log_py_u = -categorical_crossentropy(py_u, t_u)\n", 590 | "LL_u = log_px_given_z_u + log_py_u - KL_qp_u\n", 591 | "LL_u = LL_u.reshape((num_classes, sym_x.shape[0])).T\n", 592 | "LL_u = T.sum(y_train * (LL_u - T.log(y_train+1e-8)), axis=-1).mean()\n", 593 | "\n", 594 | "LL = LL_u + LL_l\n", 595 | "\n", 596 | "eval_acc = categorical_accuracy(y_eval, sym_y).mean()\n", 597 | "\n", 598 | "all_params = get_all_params([l_z,l_mux,l_y],trainable=True)\n", 599 | "\n", 600 | "# Let Theano do its magic and get all the gradients we need for training\n", 601 | "all_grads = T.grad(-LL, all_params)\n", 602 | "\n", 603 | "# Set the update function for parameters. 
The Adam optimizer works really well with VAEs.\n", 604 | "updates = lasagne.updates.adam(all_grads, all_params, learning_rate=3e-5)\n", 605 | "\n", 606 | "f_train = theano.function(inputs=[sym_x, sym_x_l, sym_y],\n", 607 | " outputs=[LL],\n", 608 | " updates=updates)\n", 609 | "\n", 610 | "f_train_eval = theano.function(inputs=[sym_x, sym_x_l, sym_y],\n", 611 | " outputs=[LL_l_eval, LL_u, KL_qp_l.mean(), KL_qp_u.mean()])\n", 612 | "\n", 613 | "f_eval = theano.function(inputs=[sym_x, sym_y],\n", 614 | " outputs=[eval_acc])\n", 615 | "\n", 616 | "f_z = theano.function(inputs=[sym_x, sym_y],\n", 617 | " outputs=[z_eval])\n" 618 | ] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": null, 623 | "metadata": { 624 | "collapsed": false 625 | }, 626 | "outputs": [], 627 | "source": [ 628 | "batch_size = 100\n", 629 | "samples_to_process = 9e4\n", 630 | "val_interval = 5e2\n", 631 | " \n", 632 | "LL_l_train, LL_u_train, KL_l_train, KL_u_train = [], [], [], []\n", 633 | "acc_valid = []\n", 634 | "samples_processed = 0\n", 635 | "plt.figure(figsize=(12, 6))\n", 636 | "valid_samples_processed = []\n", 637 | "\n", 638 | "try:\n", 639 | " while samples_processed < samples_to_process:\n", 640 | " idxs = np.random.choice(range(x_train.shape[0]), size=(batch_size), replace=False) \n", 641 | " x_batch = x_train[idxs]\n", 642 | " t_batch = t_train[idxs]\n", 643 | " out = f_train(x_batch, x_train_l, t_train_l)\n", 644 | " samples_processed += batch_size\n", 645 | " \n", 646 | " if samples_processed % val_interval == 0:\n", 647 | " valid_samples_processed += [samples_processed]\n", 648 | " out = f_train_eval(x_train, x_train_l, t_train_l)\n", 649 | " LL_l_train += [out[0]]\n", 650 | " LL_u_train += [out[1]]\n", 651 | " KL_l_train += [out[2]]\n", 652 | " KL_u_train += [out[3]]\n", 653 | " \n", 654 | " out = f_eval(x_valid, t_valid)\n", 655 | " acc_valid += [out[0]]\n", 656 | " \n", 657 | " #z_eval = f_z(x_valid, t_valid)[0]\n", 658 | " \n", 659 | " plt.subplot(1, 2, 1)\n", 660 | " plt.legend(['acc. eval'], loc=2)\n", 661 | " plt.xlabel('Updates')\n", 662 | " plt.plot(valid_samples_processed, acc_valid, color=\"black\")\n", 663 | " plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0))\n", 664 | " plt.grid('on')\n", 665 | " \n", 666 | " plt.subplot(1, 2, 2)\n", 667 | " plt.legend(['LL_l train', 'LL_u train'], loc=2)\n", 668 | " plt.xlabel('Updates')\n", 669 | " plt.plot(valid_samples_processed, LL_l_train, color=\"black\")\n", 670 | " plt.plot(valid_samples_processed, LL_u_train, color=\"red\")\n", 671 | " plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0))\n", 672 | " plt.grid('on')\n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " plt.savefig(\"out.png\")\n", 677 | " display(Image(filename=\"out.png\"))\n", 678 | " clear_output(wait=True)\n", 679 | " \n", 680 | " \n", 681 | "except KeyboardInterrupt:\n", 682 | " pass\n", 683 | "\n", 684 | "\n" 685 | ] 686 | }, 687 | { 688 | "cell_type": "markdown", 689 | "metadata": {}, 690 | "source": [ 691 | "### Exercise 2 - Analyzing the semi-supervised VAE\n", 692 | "1. Understand how the above code works.\n", 693 | "2. Test different network structures and fractions of labeled/unlabeled data and evaluate the performance of the model.\n", 694 | "3. What happens when you remove the labeled lowerbound from the objective.\n", 695 | "4. What does the scaling of the labeled cross-entropy do to the training.\n", 696 | "3. Try to benchmark this model against other semi-supervised approaches." 
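When working through these exercises it is useful to have a quick way of scoring the classifier part of the model on held-out data. The sketch below assumes the training cells above have been run and reuses f_eval and the validation/test arrays from them:

valid_acc = f_eval(x_valid, t_valid)[0]
test_acc = f_eval(x_test, t_test)[0]
print("validation accuracy: %.3f, test accuracy: %.3f" % (valid_acc, test_acc))

# To change the labeled fraction, edit num_labeled in the cell that builds x_train_l / t_train_l
# and re-run the model, objective and training cells from that point on.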
697 | ] 698 | }, 699 | { 700 | "cell_type": "code", 701 | "execution_count": null, 702 | "metadata": { 703 | "collapsed": true 704 | }, 705 | "outputs": [], 706 | "source": [] 707 | } 708 | ], 709 | "metadata": { 710 | "kernelspec": { 711 | "display_name": "Python [Root]", 712 | "language": "python", 713 | "name": "Python [Root]" 714 | }, 715 | "language_info": { 716 | "codemirror_mode": { 717 | "name": "ipython", 718 | "version": 3 719 | }, 720 | "file_extension": ".py", 721 | "mimetype": "text/x-python", 722 | "name": "python", 723 | "nbconvert_exporter": "python", 724 | "pygments_lexer": "ipython3", 725 | "version": "3.5.2" 726 | } 727 | }, 728 | "nbformat": 4, 729 | "nbformat_minor": 0 730 | } 731 | -------------------------------------------------------------------------------- /lab4/out.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeepLearningDTU/variational-autoencoders-summerschool-2016/00e2c92f5d4562cb6ac24471d28cd513c7d6aec2/lab4/out.png -------------------------------------------------------------------------------- /lab4/samplelayer.py: -------------------------------------------------------------------------------- 1 | import lasagne 2 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 3 | import theano.tensor as T 4 | import theano 5 | 6 | 7 | class SimpleSampleLayer(lasagne.layers.MergeLayer): 8 | """ 9 | Simple sampling layer drawing a single Monte Carlo sample to approximate 10 | E_q [log( p(x,z) / q(z|x) )]. This is the approach described in [KINGMA]_. 11 | 12 | Parameters 13 | ---------- 14 | mu, log_var : :class:`Layer` instances 15 | Parameterizing the mean and log(variance) of the distribution to sample 16 | from as described in [KINGMA]_. The code assumes that these have the 17 | same number of dimensions. 18 | 19 | seed : int 20 | seed to random stream 21 | 22 | Methods 23 | ---------- 24 | seed : Helper function to change the random seed after init is called 25 | 26 | References 27 | ---------- 28 | .. [KINGMA] Kingma, Diederik P., and Max Welling. 29 | "Auto-Encoding Variational Bayes." 30 | arXiv preprint arXiv:1312.6114 (2013). 31 | """ 32 | def __init__(self, mean, log_var, 33 | seed=lasagne.random.get_rng().randint(1, 2147462579), 34 | **kwargs): 35 | super(SimpleSampleLayer, self).__init__([mean, log_var], **kwargs) 36 | 37 | self._srng = RandomStreams(seed) 38 | 39 | def seed(self, seed=lasagne.random.get_rng().randint(1, 2147462579)): 40 | self._srng.seed(seed) 41 | 42 | def get_output_shape_for(self, input_shapes): 43 | return input_shapes[0] 44 | 45 | def get_output_for(self, input, **kwargs): 46 | mu, log_var = input 47 | eps = self._srng.normal(mu.shape) 48 | z = mu + T.exp(0.5 * log_var) * eps 49 | return z 50 | 51 | 52 | class SampleLayer(lasagne.layers.MergeLayer): 53 | """ 54 | Sampling layer supporting importance sampling as described in [BURDA]_ and 55 | multiple Monte Carlo samples for the approximation of 56 | E_q [log( p(x,z) / q(z|x) )]. 57 | 58 | Parameters 59 | ---------- 60 | mu : class:`Layer` instance 61 | Parameterizing the mean of the distribution to sample 62 | from as described in [BURDA]_. 63 | 64 | log_var : class:`Layer` instance 65 | By default assumed to parametrize log(sigma^2) of the distribution to 66 | sample from as described in [BURDA]_ which is transformed to sigma using 67 | the nonlinearity function as described below. Effectively this means 68 | that the nonlinearity function controls what log_var parametrizes. 
A few 69 | common examples: 70 | -nonlinearity = lambda x: T.exp(0.5*x) => log_var = log(sigma^2)[default] 71 | -nonlinearity = lambda x: T.sqrt(x) => log_var = sigma^2 72 | -nonlinearity = lambda x: x => log_var = sigma 73 | 74 | eq_samples : int or T.scalar 75 | Number of Monte Carlo samples used to estimate the expectation over 76 | q(z|x) in eq. (8) in [BURDA]_. 77 | 78 | iw_samples : int or T.scalar 79 | Number of importance samples in the sum over k in eq. (8) in [BURDA]_. 80 | 81 | nonlinearity : callable or None 82 | The nonlinearity that is applied to the log_var input layer to transform 83 | it into a standard deviation. By default we assume that 84 | log_var = log(sigma^2) and hence the corresponding nonlinearity is 85 | f(x) = T.exp(0.5*x) such that T.exp(0.5*log(sigma^2)) = sigma 86 | 87 | seed : int 88 | seed to random stream 89 | 90 | Methods 91 | ---------- 92 | seed : Helper function to change the random seed after init is called 93 | 94 | References 95 | ---------- 96 | .. [BURDA] Burda, Yuri, Roger Grosse, and Ruslan Salakhutdinov. 97 | "Importance Weighted Autoencoders." 98 | arXiv preprint arXiv:1509.00519 (2015). 99 | """ 100 | 101 | def __init__(self, mean, log_var, 102 | eq_samples=1, 103 | iw_samples=1, 104 | nonlinearity=lambda x: T.exp(0.5*x), 105 | seed=lasagne.random.get_rng().randint(1, 2147462579), 106 | **kwargs): 107 | super(SampleLayer, self).__init__([mean, log_var], **kwargs) 108 | 109 | self.eq_samples = eq_samples 110 | self.iw_samples = iw_samples 111 | self.nonlinearity = nonlinearity 112 | 113 | self._srng = RandomStreams(seed) 114 | 115 | def seed(self, seed=lasagne.random.get_rng().randint(1, 2147462579)): 116 | self._srng.seed(seed) 117 | 118 | def get_output_shape_for(self, input_shapes): 119 | batch_size, num_latent = input_shapes[0] 120 | if isinstance(batch_size, int) and \ 121 | isinstance(self.iw_samples, int) and \ 122 | isinstance(self.eq_samples, int): 123 | out_dim = (batch_size*self.eq_samples*self.iw_samples, num_latent) 124 | else: 125 | out_dim = (None, num_latent) 126 | return out_dim 127 | 128 | def get_output_for(self, input, **kwargs): 129 | mu, log_var = input 130 | batch_size, num_latent = mu.shape 131 | eps = self._srng.normal( 132 | [batch_size, self.eq_samples, self.iw_samples, num_latent], 133 | dtype=theano.config.floatX) 134 | 135 | z = mu.dimshuffle(0,'x','x',1) + \ 136 | self.nonlinearity( log_var.dimshuffle(0,'x','x',1)) * eps 137 | 138 | return z.reshape((-1,num_latent)) 139 | 140 | 141 | class SimpleBernoulliSampleLayer(lasagne.layers.Layer): 142 | """ 143 | Simple sampling layer drawing samples from bernoulli distributions. 
144 | 145 | Parameters 146 | ---------- 147 | mean : :class:`Layer` instances 148 | Parameterizing the mean value of each bernoulli distribution 149 | seed : int 150 | seed to random stream 151 | Methods 152 | ---------- 153 | seed : Helper function to change the random seed after init is called 154 | """ 155 | 156 | def __init__(self, mean, 157 | seed=lasagne.random.get_rng().randint(1, 2147462579), 158 | **kwargs): 159 | super(SimpleBernoulliSampleLayer, self).__init__(mean, **kwargs) 160 | 161 | self._srng = RandomStreams(seed) 162 | 163 | def seed(self, seed=lasagne.random.get_rng().randint(1, 2147462579)): 164 | self._srng.seed(seed) 165 | 166 | def get_output_shape_for(self, input_shape): 167 | return input_shape 168 | 169 | def get_output_for(self, mu, **kwargs): 170 | return self._srng.binomial(size=mu.shape, p=mu, dtype=mu.dtype) 171 | 172 | 173 | class BernoulliSampleLayer(lasagne.layers.Layer): 174 | """ 175 | Bernoulli Sampling layer supporting importance sampling 176 | Parameters 177 | ---------- 178 | mean : class:`Layer` instance 179 | Parameterizing the mean value of each bernoulli distribution 180 | eq_samples : int or T.scalar 181 | Number of Monte Carlo samples used to estimate the expectation over 182 | iw_samples : int or T.scalar 183 | Number of importance samples in the sum over k 184 | seed : int 185 | seed to random stream 186 | Methods 187 | ---------- 188 | seed : Helper function to change the random seed after init is called 189 | """ 190 | 191 | def __init__(self, mean, 192 | eq_samples=1, 193 | iw_samples=1, 194 | seed=lasagne.random.get_rng().randint(1, 2147462579), 195 | **kwargs): 196 | super(BernoulliSampleLayer, self).__init__(mean, **kwargs) 197 | 198 | self.eq_samples = eq_samples 199 | self.iw_samples = iw_samples 200 | 201 | self._srng = RandomStreams(seed) 202 | 203 | def seed(self, seed=lasagne.random.get_rng().randint(1, 2147462579)): 204 | self._srng.seed(seed) 205 | 206 | def get_output_shape_for(self, input_shape): 207 | batch_size, num_latent = input_shape 208 | if isinstance(batch_size, int) and \ 209 | isinstance(self.iw_samples, int) and \ 210 | isinstance(self.eq_samples, int): 211 | out_dim = (batch_size*self.eq_samples*self.iw_samples, num_latent) 212 | else: 213 | out_dim = (None, num_latent) 214 | return out_dim 215 | 216 | def get_output_for(self, input, **kwargs): 217 | mu = input 218 | batch_size, num_latent = mu.shape 219 | shp = (batch_size, self.eq_samples, self.iw_samples, num_latent) 220 | mu_shp = mu.dimshuffle(0,'x','x',1) 221 | mu_shp = T.repeat(mu_shp, axis=1, repeats=self.eq_samples) 222 | mu_shp = T.repeat(mu_shp, axis=2, repeats=self.iw_samples) 223 | samples = self._srng.binomial( 224 | size=shp, p=mu_shp, dtype=theano.config.floatX) 225 | return samples.reshape((-1, num_latent)) 226 | --------------------------------------------------------------------------------
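To round off, here is a minimal usage sketch for the SampleLayer defined above (an illustration added for this write-up, not a file in the repository; run it from the lab4 folder so samplelayer.py is importable). It wires mean and log-variance layers into a SampleLayer with several eq/iw samples and checks that the output is reshaped to (batch_size * eq_samples * iw_samples, num_latent), as promised by get_output_shape_for:

import numpy as np
import theano
import theano.tensor as T
from lasagne.layers import InputLayer, DenseLayer, get_output
from samplelayer import SampleLayer

l_in = InputLayer(shape=(None, 784))
l_mu = DenseLayer(l_in, num_units=8, nonlinearity=None)        # parametrizes mu
l_logvar = DenseLayer(l_in, num_units=8, nonlinearity=None)    # parametrizes log(sigma^2)
l_z = SampleLayer(mean=l_mu, log_var=l_logvar, eq_samples=2, iw_samples=3)

sym_x = T.matrix('x')
z = get_output(l_z, {l_in: sym_x})
x = np.random.rand(5, 784).astype(theano.config.floatX)
print(z.eval({sym_x: x}).shape)    # (30, 8) = (5 * 2 * 3, 8)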