├── .gitignore
├── README.rst
├── lab1
│   ├── confusionmatrix.py
│   ├── confusionmatrix.pyc
│   ├── lab1_FFN.ipynb
│   ├── mnist.npz
│   └── out.png
├── lab2
│   ├── lab2_AE.ipynb
│   └── out.png
├── lab3
│   ├── VAE.png
│   ├── lab3_VAE.ipynb
│   ├── out.png
│   └── samplelayer.py
└── lab4
    ├── lab4_VAESSL.ipynb
    ├── out.png
    └── samplelayer.py

/.gitignore:
--------------------------------------------------------------------------------
1 | */.ipynb_checkpoints/*
2 | *.pyc
3 | 
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | Variational Auto-encoders Summer School 2016
 2 | =============================================
 3 | This repository contains the exercises on Variational Auto-encoders for the summer school on `semi-supervised learning `_.
 4 | 
 5 | Installation
 6 | ------------
 7 | Please make sure you read this section thoroughly before getting started with the exercises.
 8 | 
 9 | All exercises are written in the Python programming language and packaged as Jupyter notebooks. To run the exercises, complete the following steps:
10 | 
11 | **1. Install Anaconda (Python distribution)**
12 | 
13 | - Install Anaconda for Python 2.7 from https://www.continuum.io/downloads.
14 | 
15 | - The installer will ask whether you want to make this Python your default and add it to your environment path. Tick both options.
16 | 
17 | - Restart your terminal/command prompt so the environment variables are updated.
18 | 
19 | **2. Install and upgrade Theano**
20 | 
21 | - In a terminal/command prompt run: pip install theano
22 | 
23 | - In a terminal/command prompt run: pip install --upgrade https://github.com/Theano/Theano/archive/master.zip --no-deps
24 | 
25 | - If you run into problems installing Theano, please refer to: http://deeplearning.net/software/theano/index.html.
26 | 
27 | **3. Install and upgrade Lasagne**
28 | 
29 | - In a terminal/command prompt run: pip install lasagne
30 | 
31 | - In a terminal/command prompt run: pip install --upgrade https://github.com/Lasagne/Lasagne/archive/master.zip --no-deps
32 | 
33 | Run the notebooks
34 | -----------------
35 | Download the repository from GitHub and, in a terminal/command prompt, run: jupyter notebook. This should start your default browser and you should be up and running. If you have issues while running the notebooks, we recommend using the newest version of Chrome.
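As an optional sanity check before the first lab (not part of the official requirements), the snippet below imports both libraries and compiles a trivial Theano function; if it runs without errors, your setup should be ready for the notebooks::

    import theano
    import theano.tensor as T
    import lasagne

    print(theano.__version__, lasagne.__version__)
    x = T.scalar('x')
    f = theano.function([x], 2 * x)   # forces Theano to compile, which exercises the full toolchain
    print(f(21.0))                    # should print 42.0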
36 | 
--------------------------------------------------------------------------------
/lab1/confusionmatrix.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | class ConfusionMatrix:
5 |     """
6 |     Simple confusion matrix class
7 |     row is the true class, column is the predicted class
8 |     """
9 |     def __init__(self, num_classes, class_names=None):
10 |         self.n_classes = num_classes
11 |         if class_names is None:
12 |             self.class_names = list(map(str, range(num_classes)))
13 |         else:
14 |             self.class_names = class_names
15 | 
16 |         # find max class_name and pad
17 |         max_len = max(map(len, self.class_names))
18 |         self.max_len = max_len
19 |         for idx, name in enumerate(self.class_names):
20 |             if len(name) < max_len:
21 |                 self.class_names[idx] = name + " "*(max_len-len(name))
22 | 
23 |         self.mat = np.zeros((num_classes,num_classes),dtype='int')
24 | 
25 |     def __str__(self):
26 |         # calculate row and column sums
27 |         col_sum = np.sum(self.mat, axis=1)
28 |         row_sum = np.sum(self.mat, axis=0)
29 | 
30 |         s = []
31 | 
32 |         mat_str = self.mat.__str__()
33 |         mat_str = mat_str.replace('[','').replace(']','').split('\n')
34 | 
35 |         for idx, row in enumerate(mat_str):
36 |             if idx == 0:
37 |                 pad = " "
38 |             else:
39 |                 pad = ""
40 |             class_name = self.class_names[idx]
41 |             class_name = " " + class_name + " |"
42 |             row_str = class_name + pad + row
43 |             row_str += " |" + str(col_sum[idx])
44 |             s.append(row_str)
45 | 
46 |         row_sum = [(self.max_len+4)*" "+" ".join(map(str, row_sum))]
47 |         hline = [(1+self.max_len)*" "+"-"*len(row_sum[0])]
48 | 
49 |         s = hline + s + hline + row_sum
50 | 
51 |         # add linebreaks
52 |         s_out = [line+'\n' for line in s]
53 |         return "".join(s_out)
54 | 
55 |     def batch_add(self, targets, preds):
56 |         assert targets.shape == preds.shape
57 |         assert len(targets) == len(preds)
58 |         assert max(targets) < self.n_classes
59 |         assert max(preds) < self.n_classes
60 |         targets = targets.flatten()
61 |         preds = preds.flatten()
62 |         for i in range(len(targets)):
63 |             self.mat[targets[i], preds[i]] += 1
64 | 
65 |     def get_errors(self):
66 |         tp = np.asarray(np.diag(self.mat).flatten(),dtype='float')
67 |         fn = np.asarray(np.sum(self.mat, axis=1).flatten(),dtype='float') - tp
68 |         fp = np.asarray(np.sum(self.mat, axis=0).flatten(),dtype='float') - tp
69 |         tn = np.asarray(np.sum(self.mat)*np.ones(self.n_classes).flatten(),
70 |                         dtype='float') - tp - fn - fp
71 |         return tp, fn, fp, tn
72 | 
73 |     def accuracy(self):
74 |         """
75 |         Calculates global accuracy
76 |         :return: accuracy
77 |         :example: >>> conf = ConfusionMatrix(3)
78 |                   >>> conf.batch_add([0,0,1],[0,0,2])
79 |                   >>> print(conf.accuracy())
80 |         """
81 |         tp, _, _, _ = self.get_errors()
82 |         n_samples = np.sum(self.mat)
83 |         return np.sum(tp) / n_samples
84 | 
85 |     def sensitivity(self):
86 |         tp, fn, fp, tn = self.get_errors()
87 |         res = tp / (tp + fn)
88 |         res = res[~np.isnan(res)]
89 |         return res
90 | 
91 |     def specificity(self):
92 |         tp, fn, fp, tn = self.get_errors()
93 |         res = tn / (tn + fp)
94 |         res = res[~np.isnan(res)]
95 |         return res
96 | 
97 |     def positive_predictive_value(self):
98 |         tp, fn, fp, tn = self.get_errors()
99 |         res = tp / (tp + fp)
100 |         res = res[~np.isnan(res)]
101 |         return res
102 | 
103 |     def negative_predictive_value(self):
104 |         tp, fn, fp, tn = self.get_errors()
105 |         res = tn / (tn + fn)
106 |         res = res[~np.isnan(res)]
107 |         return res
108 | 
109 |     def false_positive_rate(self):
110 |         tp, fn, fp, tn = self.get_errors()
111 |         res = fp / (fp + tn)
112 |         res = res[~np.isnan(res)]
113 |         return res
114 | 
115 |     def false_discovery_rate(self):
116 |         tp, fn, fp, tn = self.get_errors()
117 |         res = fp / (tp + fp)
118 |         res = res[~np.isnan(res)]
119 |         return res
120 | 
121 |     def F1(self):
122 |         tp, fn, fp, tn = self.get_errors()
123 |         res = (2*tp) / (2*tp + fp + fn)
124 |         res = res[~np.isnan(res)]
125 |         return res
126 | 
127 |     def matthews_correlation(self):
128 |         tp, fn, fp, tn = self.get_errors()
129 |         numerator = tp*tn - fp*fn
130 |         denominator = np.sqrt((tp + fp)*(tp + fn)*(tn + fp)*(tn + fn))
131 |         res = numerator / denominator
132 |         res = res[~np.isnan(res)]
133 |         return res
134 | 
--------------------------------------------------------------------------------
/lab1/confusionmatrix.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepLearningDTU/variational-autoencoders-summerschool-2016/00e2c92f5d4562cb6ac24471d28cd513c7d6aec2/lab1/confusionmatrix.pyc
--------------------------------------------------------------------------------
/lab1/mnist.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepLearningDTU/variational-autoencoders-summerschool-2016/00e2c92f5d4562cb6ac24471d28cd513c7d6aec2/lab1/mnist.npz
--------------------------------------------------------------------------------
/lab1/out.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepLearningDTU/variational-autoencoders-summerschool-2016/00e2c92f5d4562cb6ac24471d28cd513c7d6aec2/lab1/out.png
--------------------------------------------------------------------------------
/lab2/out.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepLearningDTU/variational-autoencoders-summerschool-2016/00e2c92f5d4562cb6ac24471d28cd513c7d6aec2/lab2/out.png
--------------------------------------------------------------------------------
/lab3/VAE.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepLearningDTU/variational-autoencoders-summerschool-2016/00e2c92f5d4562cb6ac24471d28cd513c7d6aec2/lab3/VAE.png
--------------------------------------------------------------------------------
/lab3/out.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepLearningDTU/variational-autoencoders-summerschool-2016/00e2c92f5d4562cb6ac24471d28cd513c7d6aec2/lab3/out.png
--------------------------------------------------------------------------------
/lab3/samplelayer.py:
--------------------------------------------------------------------------------
1 | import lasagne
2 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
3 | import theano.tensor as T
4 | import theano
5 | 
6 | 
7 | class SimpleSampleLayer(lasagne.layers.MergeLayer):
8 |     """
9 |     Simple sampling layer drawing a single Monte Carlo sample to approximate
10 |     E_q [log( p(x,z) / q(z|x) )]. This is the approach described in [KINGMA]_.
11 | 
12 |     Parameters
13 |     ----------
14 |     mu, log_var : :class:`Layer` instances
15 |         Parameterizing the mean and log(variance) of the distribution to sample
16 |         from as described in [KINGMA]_. The code assumes that these have the
17 |         same number of dimensions.
18 | 19 | seed : int 20 | seed to random stream 21 | 22 | Methods 23 | ---------- 24 | seed : Helper function to change the random seed after init is called 25 | 26 | References 27 | ---------- 28 | .. [KINGMA] Kingma, Diederik P., and Max Welling. 29 | "Auto-Encoding Variational Bayes." 30 | arXiv preprint arXiv:1312.6114 (2013). 31 | """ 32 | def __init__(self, mean, log_var, 33 | seed=lasagne.random.get_rng().randint(1, 2147462579), 34 | **kwargs): 35 | super(SimpleSampleLayer, self).__init__([mean, log_var], **kwargs) 36 | 37 | self._srng = RandomStreams(seed) 38 | 39 | def seed(self, seed=lasagne.random.get_rng().randint(1, 2147462579)): 40 | self._srng.seed(seed) 41 | 42 | def get_output_shape_for(self, input_shapes): 43 | return input_shapes[0] 44 | 45 | def get_output_for(self, input, **kwargs): 46 | mu, log_var = input 47 | eps = self._srng.normal(mu.shape) 48 | z = mu + T.exp(0.5 * log_var) * eps 49 | return z 50 | 51 | 52 | class SampleLayer(lasagne.layers.MergeLayer): 53 | """ 54 | Sampling layer supporting importance sampling as described in [BURDA]_ and 55 | multiple Monte Carlo samples for the approximation of 56 | E_q [log( p(x,z) / q(z|x) )]. 57 | 58 | Parameters 59 | ---------- 60 | mu : class:`Layer` instance 61 | Parameterizing the mean of the distribution to sample 62 | from as described in [BURDA]_. 63 | 64 | log_var : class:`Layer` instance 65 | By default assumed to parametrize log(sigma^2) of the distribution to 66 | sample from as described in [BURDA]_ which is transformed to sigma using 67 | the nonlinearity function as described below. Effectively this means 68 | that the nonlinearity function controls what log_var parametrizes. A few 69 | common examples: 70 | -nonlinearity = lambda x: T.exp(0.5*x) => log_var = log(sigma^2)[default] 71 | -nonlinearity = lambda x: T.sqrt(x) => log_var = sigma^2 72 | -nonlinearity = lambda x: x => log_var = sigma 73 | 74 | eq_samples : int or T.scalar 75 | Number of Monte Carlo samples used to estimate the expectation over 76 | q(z|x) in eq. (8) in [BURDA]_. 77 | 78 | iw_samples : int or T.scalar 79 | Number of importance samples in the sum over k in eq. (8) in [BURDA]_. 80 | 81 | nonlinearity : callable or None 82 | The nonlinearity that is applied to the log_var input layer to transform 83 | it into a standard deviation. By default we assume that 84 | log_var = log(sigma^2) and hence the corresponding nonlinearity is 85 | f(x) = T.exp(0.5*x) such that T.exp(0.5*log(sigma^2)) = sigma 86 | 87 | seed : int 88 | seed to random stream 89 | 90 | Methods 91 | ---------- 92 | seed : Helper function to change the random seed after init is called 93 | 94 | References 95 | ---------- 96 | .. [BURDA] Burda, Yuri, Roger Grosse, and Ruslan Salakhutdinov. 97 | "Importance Weighted Autoencoders." 98 | arXiv preprint arXiv:1509.00519 (2015). 
99 | """ 100 | 101 | def __init__(self, mean, log_var, 102 | eq_samples=1, 103 | iw_samples=1, 104 | nonlinearity=lambda x: T.exp(0.5*x), 105 | seed=lasagne.random.get_rng().randint(1, 2147462579), 106 | **kwargs): 107 | super(SampleLayer, self).__init__([mean, log_var], **kwargs) 108 | 109 | self.eq_samples = eq_samples 110 | self.iw_samples = iw_samples 111 | self.nonlinearity = nonlinearity 112 | 113 | self._srng = RandomStreams(seed) 114 | 115 | def seed(self, seed=lasagne.random.get_rng().randint(1, 2147462579)): 116 | self._srng.seed(seed) 117 | 118 | def get_output_shape_for(self, input_shapes): 119 | batch_size, num_latent = input_shapes[0] 120 | if isinstance(batch_size, int) and \ 121 | isinstance(self.iw_samples, int) and \ 122 | isinstance(self.eq_samples, int): 123 | out_dim = (batch_size*self.eq_samples*self.iw_samples, num_latent) 124 | else: 125 | out_dim = (None, num_latent) 126 | return out_dim 127 | 128 | def get_output_for(self, input, **kwargs): 129 | mu, log_var = input 130 | batch_size, num_latent = mu.shape 131 | eps = self._srng.normal( 132 | [batch_size, self.eq_samples, self.iw_samples, num_latent], 133 | dtype=theano.config.floatX) 134 | 135 | z = mu.dimshuffle(0,'x','x',1) + \ 136 | self.nonlinearity( log_var.dimshuffle(0,'x','x',1)) * eps 137 | 138 | return z.reshape((-1,num_latent)) 139 | 140 | 141 | class SimpleBernoulliSampleLayer(lasagne.layers.Layer): 142 | """ 143 | Simple sampling layer drawing samples from bernoulli distributions. 144 | 145 | Parameters 146 | ---------- 147 | mean : :class:`Layer` instances 148 | Parameterizing the mean value of each bernoulli distribution 149 | seed : int 150 | seed to random stream 151 | Methods 152 | ---------- 153 | seed : Helper function to change the random seed after init is called 154 | """ 155 | 156 | def __init__(self, mean, 157 | seed=lasagne.random.get_rng().randint(1, 2147462579), 158 | **kwargs): 159 | super(SimpleBernoulliSampleLayer, self).__init__(mean, **kwargs) 160 | 161 | self._srng = RandomStreams(seed) 162 | 163 | def seed(self, seed=lasagne.random.get_rng().randint(1, 2147462579)): 164 | self._srng.seed(seed) 165 | 166 | def get_output_shape_for(self, input_shape): 167 | return input_shape 168 | 169 | def get_output_for(self, mu, **kwargs): 170 | return self._srng.binomial(size=mu.shape, p=mu, dtype=mu.dtype) 171 | 172 | 173 | class BernoulliSampleLayer(lasagne.layers.Layer): 174 | """ 175 | Bernoulli Sampling layer supporting importance sampling 176 | Parameters 177 | ---------- 178 | mean : class:`Layer` instance 179 | Parameterizing the mean value of each bernoulli distribution 180 | eq_samples : int or T.scalar 181 | Number of Monte Carlo samples used to estimate the expectation over 182 | iw_samples : int or T.scalar 183 | Number of importance samples in the sum over k 184 | seed : int 185 | seed to random stream 186 | Methods 187 | ---------- 188 | seed : Helper function to change the random seed after init is called 189 | """ 190 | 191 | def __init__(self, mean, 192 | eq_samples=1, 193 | iw_samples=1, 194 | seed=lasagne.random.get_rng().randint(1, 2147462579), 195 | **kwargs): 196 | super(BernoulliSampleLayer, self).__init__(mean, **kwargs) 197 | 198 | self.eq_samples = eq_samples 199 | self.iw_samples = iw_samples 200 | 201 | self._srng = RandomStreams(seed) 202 | 203 | def seed(self, seed=lasagne.random.get_rng().randint(1, 2147462579)): 204 | self._srng.seed(seed) 205 | 206 | def get_output_shape_for(self, input_shape): 207 | batch_size, num_latent = input_shape 208 | if 
isinstance(batch_size, int) and \ 209 | isinstance(self.iw_samples, int) and \ 210 | isinstance(self.eq_samples, int): 211 | out_dim = (batch_size*self.eq_samples*self.iw_samples, num_latent) 212 | else: 213 | out_dim = (None, num_latent) 214 | return out_dim 215 | 216 | def get_output_for(self, input, **kwargs): 217 | mu = input 218 | batch_size, num_latent = mu.shape 219 | shp = (batch_size, self.eq_samples, self.iw_samples, num_latent) 220 | mu_shp = mu.dimshuffle(0,'x','x',1) 221 | mu_shp = T.repeat(mu_shp, axis=1, repeats=self.eq_samples) 222 | mu_shp = T.repeat(mu_shp, axis=2, repeats=self.iw_samples) 223 | samples = self._srng.binomial( 224 | size=shp, p=mu_shp, dtype=theano.config.floatX) 225 | return samples.reshape((-1, num_latent)) 226 | -------------------------------------------------------------------------------- /lab4/lab4_VAESSL.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from __future__ import division, print_function\n", 12 | "import matplotlib\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "from IPython.display import Image, display, clear_output\n", 15 | "%matplotlib nbagg\n", 16 | "%matplotlib inline \n", 17 | "import numpy as np\n", 18 | "import sklearn.datasets\n", 19 | "import theano\n", 20 | "import theano.tensor as T\n", 21 | "import lasagne\n", 22 | "import math" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "# Variational Auto-encoders and Semi-supervised Learning\n", 30 | "\n", 31 | "Now we are going to extend the VAE from lab3 with a variable $y$, so that we can apply supervised and semi-supervised learning. We'll model the supervised VAE in which we model $p(x|y,z)$ and $q(z|x,y)$ (cf. [Kingma et al.](https://arxiv.org/abs/1406.5298)). This way we condition our latent variable and our generated sample on our label $y$. Next we'll define semi-supervised learning in which $y$ is given for the labeled data and as a latent variable that is marginalized out for the unlabeled data." 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "## MNIST: Supervised VAE\n", 39 | "First we'll load the MNIST dataset and plot a few examples. We only load a limited amount of number classes, so that we can speed up training." 
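Before running the loading cell, here is a tiny NumPy-only sketch (shapes and values are made up; it is not part of the lab code) of what conditioning on $y$ means in practice: the one-hot label is simply concatenated onto the encoder input and onto the latent code fed to the decoder, which is exactly what the ConcatLayer calls in the model definition below do.

import numpy as np

x = np.random.rand(1, 784).astype('float32')       # one flattened 28x28 image (dummy values)
y = np.array([[0., 1., 0.]], dtype='float32')      # one-hot label: class 1 of 3
z = np.random.randn(1, 2).astype('float32')        # one latent sample, num_latent_z = 2

encoder_input = np.concatenate([x, y], axis=1)     # what ConcatLayer([l_in_x, l_in_y]) produces
decoder_input = np.concatenate([z, y], axis=1)     # what ConcatLayer([l_in_z, l_in_y]) produces
print(encoder_input.shape, decoder_input.shape)    # (1, 787) (1, 5)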
40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": { 46 | "collapsed": false 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "from sklearn.utils import shuffle\n", 51 | "#To speed up training we'll only work on a subset of the data\n", 52 | "#We discretize the data to 0 and 1 in order to use it with a bernoulli observation model p(x|z) = Ber(mu(z))\n", 53 | "\n", 54 | "def bernoullisample(x):\n", 55 | " return np.random.binomial(1,x,size=x.shape).astype(theano.config.floatX)\n", 56 | "\n", 57 | "def onehot(t, num_classes):\n", 58 | " out = np.zeros((t.shape[0], num_classes)).astype('float32')\n", 59 | " for row, col in enumerate(t):\n", 60 | " out[row, col] = 1\n", 61 | " return out\n", 62 | "\n", 63 | "data = np.load('../lab1/mnist.npz')\n", 64 | "num_classes = 3\n", 65 | "idxs_train = []\n", 66 | "idxs_valid = []\n", 67 | "idxs_test = []\n", 68 | "for i in range(num_classes):\n", 69 | " idxs_train += np.where(data['y_train'] == i)[0].tolist()\n", 70 | " idxs_valid += np.where(data['y_valid'] == i)[0].tolist()\n", 71 | " idxs_test += np.where(data['y_test'] == i)[0].tolist()\n", 72 | "\n", 73 | "x_train = bernoullisample(data['X_train'][idxs_train]).astype('float32')\n", 74 | "targets_train = data['y_train'][idxs_train].astype('int32') # Since this is unsupervised, the targets are only used for validation.\n", 75 | "x_train, targets_train = shuffle(x_train, targets_train, random_state=1234)\n", 76 | "t_train = onehot(targets_train, num_classes)\n", 77 | "\n", 78 | "x_valid = bernoullisample(data['X_valid'][idxs_valid]).astype('float32')\n", 79 | "targets_valid = data['y_valid'][idxs_valid].astype('int32')\n", 80 | "t_valid = onehot(targets_valid, num_classes)\n", 81 | "\n", 82 | "x_test = bernoullisample(data['X_test'][idxs_test]).astype('float32')\n", 83 | "targets_test = data['y_test'][idxs_test].astype('int32')\n", 84 | "t_test = onehot(targets_test, num_classes)\n", 85 | "\n", 86 | "print(\"training set dim(%i, %i).\" % x_train.shape)\n", 87 | "print(\"validation set dim(%i, %i).\" % x_valid.shape)\n", 88 | "print(\"test set dim(%i, %i).\" % x_test.shape)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": { 95 | "collapsed": false 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "#plot a few MNIST examples\n", 100 | "def plot_samples(x,title=''):\n", 101 | " idx = 0\n", 102 | " canvas = np.zeros((28*10, 10*28))\n", 103 | " for i in range(10):\n", 104 | " for j in range(10):\n", 105 | " canvas[i*28:(i+1)*28, j*28:(j+1)*28] = x_train[idx].reshape((28, 28))\n", 106 | " idx += 1\n", 107 | " plt.figure(figsize=(7, 7))\n", 108 | " plt.imshow(canvas, cmap='gray')\n", 109 | " plt.title(title)\n", 110 | " plt.show()\n", 111 | "\n", 112 | "plot_samples(x_train[:100],title='MNIST handwritten digits')" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "As for lab3, we define the helper functions for calculating the lowerbound." 
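For reference, the two helpers in the next cell implement these standard closed-form expressions (per dimension; the code then sums them over the feature and latent dimensions respectively):

$$\log \mathrm{Ber}(x \mid p) = x \log p + (1-x) \log (1-p)$$

$$KL\big(\mathcal{N}(\mu, \sigma^2) \,\|\, \mathcal{N}(0, 1)\big) = -\tfrac{1}{2}\big(1 + \log \sigma^2 - \mu^2 - \sigma^2\big),$$

where the code works with $\log \sigma^2$ directly (the log_var argument).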
120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "collapsed": true 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "#defined a couple of helper functions\n", 131 | "c = - 0.5 * math.log(2*math.pi)\n", 132 | "def log_bernoulli(x, p, eps=0.0):\n", 133 | " p = T.clip(p, eps, 1.0 - eps)\n", 134 | " return -T.nnet.binary_crossentropy(p, x)\n", 135 | "\n", 136 | "def kl_normal2_stdnormal(mean, log_var):\n", 137 | " return -0.5*(1 + log_var - mean**2 - T.exp(log_var))" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "The model should now be modified to condition on the labeled data $y$." 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": { 151 | "collapsed": false 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "from lasagne.layers import InputLayer, DenseLayer, get_output, get_all_params, ConcatLayer\n", 156 | "from lasagne.nonlinearities import rectify, sigmoid\n", 157 | "from samplelayer import SimpleSampleLayer\n", 158 | "\n", 159 | "num_features = x_train.shape[-1]\n", 160 | "num_latent_z = 2\n", 161 | "\n", 162 | "#MODEL SPECIFICATION\n", 163 | "\n", 164 | "#ENCODER\n", 165 | "l_in_x = InputLayer(shape=(None, num_features))\n", 166 | "l_in_y = InputLayer(shape=(None, num_classes))\n", 167 | "l_enc = ConcatLayer([l_in_x, l_in_y])\n", 168 | "l_enc = DenseLayer(l_enc, num_units=128, nonlinearity=rectify)\n", 169 | "l_muq = DenseLayer(l_enc, num_units=num_latent_z, nonlinearity=None) #mu(x)\n", 170 | "l_logvarq = DenseLayer(l_enc, num_units=num_latent_z, nonlinearity=lambda x: T.clip(x,-10,10)) #logvar(x), \n", 171 | "l_z = SimpleSampleLayer(mean=l_muq, log_var=l_logvarq) #sample a latent representation z \\sim q(z|x) = N(mu(x),logvar(x))\n", 172 | "#we split the in two parts to allow sampling from the decoder model separately\n", 173 | "#DECODER\n", 174 | "l_in_z = InputLayer(shape=(None, num_latent_z))\n", 175 | "l_dec = ConcatLayer([l_in_z, l_in_y])\n", 176 | "l_dec = DenseLayer(l_dec, num_units=128, nonlinearity=rectify) \n", 177 | "l_mux = DenseLayer(l_dec, num_units=num_features, nonlinearity=sigmoid) #reconstruction of input using a sigmoid output since mux \\in [0,1] " 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "The objective functions are defined similarly to lab3." 
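As an optional aside (assuming the helper cell above has been run), a one-line numerical check confirms that a posterior equal to the prior, $\mu = 0$ and $\log \sigma^2 = 0$, gives zero KL divergence:

m, lv = T.matrix('m'), T.matrix('lv')
zeros = np.zeros((1, 2), dtype='float32')
print(kl_normal2_stdnormal(m, lv).sum(axis=1).eval({m: zeros, lv: zeros}))   # [ 0.]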
185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "collapsed": false 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "\n", 196 | "sym_x = T.matrix('x')\n", 197 | "sym_y = T.matrix('y')\n", 198 | "sym_z = T.matrix('z')\n", 199 | "\n", 200 | "z_train, muq_train, logvarq_train = get_output([l_z,l_muq,l_logvarq],{l_in_x:sym_x, l_in_y:sym_y},deterministic=False)\n", 201 | "mux_train = get_output(l_mux,{l_in_z:z_train, l_in_y:sym_y},deterministic=False)\n", 202 | "\n", 203 | "z_eval, muq_eval, logvarq_eval = get_output([l_z,l_muq,l_logvarq],{l_in_x:sym_x, l_in_y:sym_y},deterministic=True)\n", 204 | "mux_eval = get_output(l_mux,{l_in_z:z_eval, l_in_y:sym_y},deterministic=True)\n", 205 | "\n", 206 | "mux_sample = get_output(l_mux,{l_in_z:sym_z, l_in_y:sym_y},deterministic=True)\n", 207 | "\n", 208 | "#define the cost function\n", 209 | "def LogLikelihood(mux,x,muq,logvarq):\n", 210 | " log_px_given_z = log_bernoulli(x, mux, eps=1e-6).sum(axis=1).mean() #note that we sum the latent dimension and mean over the samples\n", 211 | " KL_qp = kl_normal2_stdnormal(muq, logvarq).sum(axis=1).mean()\n", 212 | " LL = log_px_given_z - KL_qp\n", 213 | " return LL, log_px_given_z, KL_qp\n", 214 | "\n", 215 | "LL_train, logpx_train, KL_train = LogLikelihood(mux_train, sym_x, muq_train, logvarq_train)\n", 216 | "LL_eval, logpx_eval, KL_eval = LogLikelihood(mux_eval, sym_x, muq_eval, logvarq_eval)\n", 217 | "\n", 218 | "all_params = get_all_params([l_z,l_mux],trainable=True)\n", 219 | "\n", 220 | "# Let Theano do its magic and get all the gradients we need for training\n", 221 | "all_grads = T.grad(-LL_train, all_params)\n", 222 | "\n", 223 | "# Set the update function for parameters. The Adam optimizer works really well with VAEs.\n", 224 | "updates = lasagne.updates.adam(all_grads, all_params, learning_rate=1e-3)\n", 225 | "\n", 226 | "f_train = theano.function(inputs=[sym_x, sym_y],\n", 227 | " outputs=[LL_train, logpx_train, KL_train],\n", 228 | " updates=updates)\n", 229 | "\n", 230 | "f_eval = theano.function(inputs=[sym_x, sym_y],\n", 231 | " outputs=[LL_eval, logpx_eval, KL_eval])\n", 232 | "\n", 233 | "f_z = theano.function(inputs=[sym_x, sym_y],\n", 234 | " outputs=[z_eval])\n", 235 | "\n", 236 | "f_sample = theano.function(inputs=[sym_z, sym_y],\n", 237 | " outputs=[mux_sample])\n", 238 | "\n", 239 | "f_recon = theano.function(inputs=[sym_x, sym_y],\n", 240 | " outputs=[mux_eval])" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": { 247 | "collapsed": false 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "batch_size = 100\n", 252 | "samples_to_process = 1e4\n", 253 | "val_interval = 5e2\n", 254 | " \n", 255 | "LL_train, KL_train, logpx_train = [],[],[]\n", 256 | "LL_valid, KL_valid, logpx_valid = [],[],[]\n", 257 | "samples_processed = 0\n", 258 | "plt.figure(figsize=(12, 24))\n", 259 | "valid_samples_processed = []\n", 260 | "\n", 261 | "try:\n", 262 | " while samples_processed < samples_to_process:\n", 263 | " _LL_train, _KL_train, _logpx_train = [],[],[]\n", 264 | " idxs = np.random.choice(range(x_train.shape[0]), size=(batch_size), replace=False) \n", 265 | " x_batch = x_train[idxs]\n", 266 | " t_batch = t_train[idxs]\n", 267 | " out = f_train(x_batch, t_batch)\n", 268 | " samples_processed += batch_size\n", 269 | " \n", 270 | " if samples_processed % val_interval == 0:\n", 271 | " valid_samples_processed += [samples_processed]\n", 272 | " out = f_eval(x_train, t_train)\n", 273 | 
" LL_train += [out[0]] \n", 274 | " logpx_train += [out[1]]\n", 275 | " KL_train += [out[2]]\n", 276 | " \n", 277 | " out = f_eval(x_valid, t_valid)\n", 278 | " LL_valid += [out[0]]\n", 279 | " logpx_valid += [out[1]]\n", 280 | " KL_valid += [out[2]]\n", 281 | " \n", 282 | " z_eval = f_z(x_valid, t_valid)[0]\n", 283 | " x_sample = f_sample(np.random.normal(size=(100, num_latent_z)).astype('float32'), onehot(np.random.randint(0, num_classes, size=100), num_classes))[0]\n", 284 | " x_recon = f_recon(x_valid, t_valid)[0]\n", 285 | " \n", 286 | " plt.subplot(num_classes+2,2,1)\n", 287 | " plt.legend(['LL', 'log(p(x))'], loc=2)\n", 288 | " plt.xlabel('Updates')\n", 289 | " plt.plot(valid_samples_processed, LL_train, color=\"black\")\n", 290 | " plt.plot(valid_samples_processed, logpx_train, color=\"red\")\n", 291 | " plt.plot(valid_samples_processed, LL_valid, color=\"black\", linestyle=\"--\")\n", 292 | " plt.plot(valid_samples_processed, logpx_valid, color=\"red\", linestyle=\"--\")\n", 293 | " plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0))\n", 294 | " plt.grid('on')\n", 295 | " \n", 296 | " plt.subplot(num_classes+2,2,2)\n", 297 | " plt.cla()\n", 298 | " plt.xlabel('z0'), plt.ylabel('z1')\n", 299 | " color = iter(plt.get_cmap('brg')(np.linspace(0, 1.0, num_classes)))\n", 300 | " for i in range(num_classes):\n", 301 | " clr = next(color)\n", 302 | " plt.scatter(z_eval[targets_valid==i, 0], z_eval[targets_valid==i, 1], c=clr, s=5., lw=0, marker='o', )\n", 303 | " plt.grid('on')\n", 304 | " \n", 305 | " \n", 306 | " plt.subplot(num_classes+2,2,3)\n", 307 | " plt.legend(['KL(q||p)'])\n", 308 | " plt.xlabel('Updates')\n", 309 | " plt.plot(valid_samples_processed, KL_train, color=\"blue\")\n", 310 | " plt.plot(valid_samples_processed, KL_valid, color=\"blue\", linestyle=\"--\")\n", 311 | " plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0))\n", 312 | " plt.grid('on')\n", 313 | " \n", 314 | " plt.subplot(num_classes+2,2,4)\n", 315 | " plt.cla()\n", 316 | " plt.title('Samples')\n", 317 | " plt.axis('off')\n", 318 | " idx = 0\n", 319 | " canvas = np.zeros((28*10, 10*28))\n", 320 | " for i in range(10):\n", 321 | " for j in range(10):\n", 322 | " canvas[i*28:(i+1)*28, j*28:(j+1)*28] = x_sample[idx].reshape((28, 28))\n", 323 | " idx += 1\n", 324 | " plt.imshow(canvas, cmap='gray')\n", 325 | " \n", 326 | " c=0\n", 327 | " for k in range(5, 5 + num_classes*2, 2):\n", 328 | " plt.subplot(num_classes+2,2,k)\n", 329 | " plt.cla()\n", 330 | " plt.title('Inputs for %i' % c)\n", 331 | " plt.axis('off')\n", 332 | " idx = 0\n", 333 | " canvas = np.zeros((28*10, 10*28))\n", 334 | " for i in range(10):\n", 335 | " for j in range(10):\n", 336 | " canvas[i*28:(i+1)*28, j*28:(j+1)*28] = x_valid[targets_valid==c][idx].reshape((28, 28))\n", 337 | " idx += 1\n", 338 | " plt.imshow(canvas, cmap='gray')\n", 339 | "\n", 340 | " plt.subplot(num_classes+2,2,k+1)\n", 341 | " plt.cla()\n", 342 | " plt.title('Reconstructions for %i' % c)\n", 343 | " plt.axis('off')\n", 344 | " idx = 0\n", 345 | " canvas = np.zeros((28*10, 10*28))\n", 346 | " for i in range(10):\n", 347 | " for j in range(10):\n", 348 | " canvas[i*28:(i+1)*28, j*28:(j+1)*28] = x_recon[targets_valid==c][idx].reshape((28, 28))\n", 349 | " idx += 1\n", 350 | " plt.imshow(canvas, cmap='gray')\n", 351 | " c += 1\n", 352 | " \n", 353 | " plt.savefig(\"out.png\")\n", 354 | " display(Image(filename=\"out.png\"))\n", 355 | " clear_output(wait=True)\n", 356 | " \n", 357 | "except KeyboardInterrupt:\n", 358 | " pass\n", 359 | "\n" 360 | ] 361 | }, 
362 | { 363 | "cell_type": "markdown", 364 | "metadata": {}, 365 | "source": [ 366 | "### Exercise 1 - Analyzing the supervised VAE\n", 367 | "1. Test different network structures and evaluate the performance of the model.\n", 368 | "2. Investigate the latent space $z$ of the supervised VAE by performing a random walk and generating samples in the same way as [Kingma et al.](https://arxiv.org/abs/1406.5298)." 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "metadata": {}, 374 | "source": [ 375 | "## MNIST: Semi-supervised VAE (for the very ambitous)" 376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "metadata": {}, 381 | "source": [ 382 | "For the semi-supervised modeling, we generate a labeled subset of our training data." 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": { 389 | "collapsed": false 390 | }, 391 | "outputs": [], 392 | "source": [ 393 | "def bernoullisample(x):\n", 394 | " return np.random.binomial(1,x,size=x.shape).astype(theano.config.floatX)\n", 395 | "\n", 396 | "data = np.load('../lab1/mnist.npz')\n", 397 | "num_classes = 5\n", 398 | "idxs_train = []\n", 399 | "idxs_valid = []\n", 400 | "idxs_test = []\n", 401 | "for i in range(num_classes):\n", 402 | " idxs_train += np.where(data['y_train'] == i)[0].tolist()\n", 403 | " idxs_valid += np.where(data['y_valid'] == i)[0].tolist()\n", 404 | " idxs_test += np.where(data['y_test'] == i)[0].tolist()\n", 405 | "\n", 406 | "x_train = bernoullisample(data['X_train'][idxs_train]).astype('float32')\n", 407 | "targets_train = data['y_train'][idxs_train].astype('int32') # Since this is unsupervised, the targets are only used for validation.\n", 408 | "x_train, targets_train = shuffle(x_train, targets_train, random_state=1234)\n", 409 | "t_train = onehot(targets_train, num_classes)\n", 410 | "\n", 411 | "x_valid = bernoullisample(data['X_valid'][idxs_valid]).astype('float32')\n", 412 | "targets_valid = data['y_valid'][idxs_valid].astype('int32')\n", 413 | "t_valid = onehot(targets_valid, num_classes)\n", 414 | "\n", 415 | "x_test = bernoullisample(data['X_test'][idxs_test]).astype('float32')\n", 416 | "targets_test = data['y_test'][idxs_test].astype('int32')\n", 417 | "t_test = onehot(targets_test, num_classes)\n", 418 | "\n", 419 | "print(\"training set dim(%i, %i).\" % x_train.shape)\n", 420 | "print(\"validation set dim(%i, %i).\" % x_valid.shape)\n", 421 | "print(\"test set dim(%i, %i).\" % x_test.shape)" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": { 428 | "collapsed": false 429 | }, 430 | "outputs": [], 431 | "source": [ 432 | "# Generate a subset of labeled data points\n", 433 | "\n", 434 | "num_labeled = 2 # You decide on the size of the fraction...\n", 435 | "\n", 436 | "idxs_train_l = []\n", 437 | "for i in range(num_classes):\n", 438 | " idxs = np.where(targets_train == i)[0]\n", 439 | " idxs_train_l += np.random.choice(idxs, size=num_labeled).tolist()\n", 440 | "\n", 441 | "x_train_l = x_train[idxs_train_l]\n", 442 | "t_train_l = onehot(targets_train[idxs_train_l], num_classes)\n", 443 | "print(\"labeled training set dim(%i, %i).\" % x_train_l.shape)\n", 444 | "\n", 445 | "plt.figure(figsize=(12, 7))\n", 446 | "for i in range(num_classes*num_labeled):\n", 447 | " im = x_train_l[i].reshape((28, 28))\n", 448 | " plt.subplot(1, num_classes*num_labeled, i+1)\n", 449 | " plt.imshow(im, cmap='gray')\n", 450 | " plt.axis('off')" 451 | ] 452 | }, 453 | { 454 | "cell_type": "markdown", 455 | 
"metadata": {}, 456 | "source": [ 457 | "For the semi-supervised case, the lower bound is divided into two, so that for the labeled data we have:\n", 458 | "$$\n", 459 | "\\begin{align}\n", 460 | "\\log p(x,y) &= \\int_z \\log p(x,y,z) dz \\\\\n", 461 | "&\\ge \\int_z q(z|x,y) \\log \\bigg[ \\frac{p(x,y,z)}{q(z|x,y)} \\bigg]dz \\\\\n", 462 | "&= \\mathbb{E}_{q(z|x,y)} [\\log p(x,y,z) - \\log q(z|x,y)]dz \\equiv -\\mathcal{L}(x,y)\\ ,\n", 463 | "\\end{align}\n", 464 | "$$\n", 465 | "\n", 466 | "where in our case the $p(x,y,z)=p(x|y,z)p(y)p(z)$. For the unlabeled data $y$ is now a latent variable that needs to be marginalized out:\n", 467 | "$$\n", 468 | "\\begin{align}\n", 469 | "\\log p(x,y) &= \\int_y \\int_z \\log p(x,y,z) dzdy \\\\\n", 470 | "&\\ge \\int_y q(y|x) \\int_z q(z|x,y) \\log \\bigg[ \\frac{p(x,y,z)}{q(y|x)q(z|x,y)} \\bigg]dzdy \\\\\n", 471 | "&= \\int_y q(y|x) \\int_z q(z|x,y) [\\log p(x,y,z) - \\log q(y|x) - \\log q(z|x,y) ]dzdy \\\\\n", 472 | "&= \\int_y q(y|x) \\int_z q(z|x,y) [\\log p(x,y,z) - \\log q(z|x,y)]dzdy - \\int_y \\int_z q(y|x)q(z|x,y) \\log q(y|x)dzdy \\\\\n", 473 | "&= \\int_y q(y|x) \\int_z q(z|x,y) [\\log p(x,y,z) - \\log q(z|x,y)]dzdy - \\int_y q(y|x) \\log q(y|x) \\int_z q(z|x,y)dzdy \\\\\n", 474 | "&= \\int_y q(y|x) \\int_z q(z|x,y) [\\log p(x,y,z) - \\log q(z|x,y)]dzdy - \\int_y q(y|x) \\log q(y|x)dy \\cdot 1 \\\\\n", 475 | "&= \\int_y q(y|x) \\int_z q(z|x,y) [\\log p(x,y,z) - \\log q(z|x,y)]dzdy + \\mathcal{H}(q(y|x)) \\\\\n", 476 | "&= \\sum_y q(y|x) (-\\mathcal{L}(x,y)) + \\mathcal{H}(q(y|x)) \\equiv -\\mathcal{U}(x)\\ .\n", 477 | "\\end{align}\n", 478 | "$$\n", 479 | "\n", 480 | "We collect the two lowerbounds by:\n", 481 | "$$\n", 482 | "\\begin{align}\n", 483 | "\\mathcal{J}^\\alpha = \\sum_{x_l,y_l} \\mathcal{L}(x_l, y_l) - \\alpha \\cdot \\log q(y_l|x_l) + \\sum_{x_u, y_u} \\mathcal{U}(x_u, y_u) \\ ,\n", 484 | "\\end{align}\n", 485 | "$$\n", 486 | "\n", 487 | "where the right hand-side is the addition of the cross-entropy scaled by an alpha constant which is equivalent to the fraction between unlabeled and labeled data points." 
488 | ] 489 | }, 490 | { 491 | "cell_type": "markdown", 492 | "metadata": {}, 493 | "source": [ 494 | "Following you are provided with a code snippet for computing the lowerbound for the semi-supervised VAE:" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": null, 500 | "metadata": { 501 | "collapsed": false 502 | }, 503 | "outputs": [], 504 | "source": [ 505 | "from lasagne.layers import InputLayer, DenseLayer, get_output, get_all_params, ConcatLayer\n", 506 | "from lasagne.nonlinearities import rectify, sigmoid, softmax\n", 507 | "from samplelayer import SimpleSampleLayer\n", 508 | "\n", 509 | "num_features = x_train.shape[-1]\n", 510 | "num_latent_z = 16\n", 511 | "\n", 512 | "#MODEL SPECIFICATION\n", 513 | "l_in_x = InputLayer(shape=(None, num_features))\n", 514 | "l_in_y = InputLayer(shape=(None, num_classes))\n", 515 | "\n", 516 | "#Classifier\n", 517 | "l_y = DenseLayer(l_in_x, num_units=128, nonlinearity=rectify)\n", 518 | "l_y = DenseLayer(l_y, num_units=num_classes, nonlinearity=softmax)\n", 519 | "\n", 520 | "#ENCODER\n", 521 | "l_enc = ConcatLayer([l_in_x, l_in_y])\n", 522 | "l_enc = DenseLayer(l_enc, num_units=128, nonlinearity=rectify)\n", 523 | "l_muq = DenseLayer(l_enc, num_units=num_latent_z, nonlinearity=None) #mu(x)\n", 524 | "l_logvarq = DenseLayer(l_enc, num_units=num_latent_z, nonlinearity=lambda x: T.clip(x,-10,10)) #logvar(x), \n", 525 | "l_z = SimpleSampleLayer(mean=l_muq, log_var=l_logvarq) #sample a latent representation z \\sim q(z|x) = N(mu(x),logvar(x))\n", 526 | "#we split the in two parts to allow sampling from the decoder model separately\n", 527 | "#DECODER\n", 528 | "l_in_z = InputLayer(shape=(None, num_latent_z))\n", 529 | "l_dec = ConcatLayer([l_in_z, l_in_y])\n", 530 | "l_dec = DenseLayer(l_dec, num_units=128, nonlinearity=rectify)\n", 531 | "l_mux = DenseLayer(l_dec, num_units=num_features, nonlinearity=sigmoid) #reconstruction of input using a sigmoid output since mux \\in [0,1] " 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": null, 537 | "metadata": { 538 | "collapsed": false 539 | }, 540 | "outputs": [], 541 | "source": [ 542 | "from lasagne.objectives import categorical_crossentropy, categorical_accuracy\n", 543 | "\n", 544 | "sym_x_l = T.matrix('x_l')\n", 545 | "sym_x = T.matrix('x')\n", 546 | "sym_y = T.matrix('y')\n", 547 | "sym_z = T.matrix('z')\n", 548 | "\n", 549 | "y_eval = get_output(l_y, {l_in_x:sym_x}, deterministic=True)\n", 550 | "z_eval = get_output(l_z,{l_in_x:sym_x, l_in_y:sym_y},deterministic=True)\n", 551 | "\n", 552 | "#define the cost function for labeled data points\n", 553 | "y_train_l = get_output(l_y, {l_in_x:sym_x_l}, deterministic=False)\n", 554 | "z_train_l, muq_train_l, logvarq_train_l = get_output([l_z,l_muq,l_logvarq],{l_in_x:sym_x_l, l_in_y:sym_y},deterministic=False)\n", 555 | "mux_train_l = get_output(l_mux,{l_in_z:z_train_l, l_in_y:sym_y},deterministic=False)\n", 556 | "\n", 557 | "log_px_l = log_bernoulli(sym_x_l, mux_train_l, eps=1e-6).sum(axis=-1)\n", 558 | "KL_qp_l = kl_normal2_stdnormal(muq_train_l, logvarq_train_l).sum(axis=-1)\n", 559 | "log_qy_l = T.sum(sym_y * T.log(y_train_l+1e-8), axis=1)\n", 560 | "py_l = softmax(T.zeros((sym_x_l.shape[0], num_classes)))\n", 561 | "log_py_l = -categorical_crossentropy(py_l, sym_y)\n", 562 | "alpha = 0.1*(x_train.shape[0]/x_train_l.shape[0])\n", 563 | "LL_l_eval = T.mean(log_px_l + log_py_l - KL_qp_l)\n", 564 | "LL_l = T.mean(log_px_l + log_py_l - KL_qp_l + alpha * log_qy_l)\n", 565 | "\n", 566 | "\n", 567 | 
"#define the cost function for unlabeled data points\n", 568 | "\n", 569 | "# For the integrating out approach, we repeat the input matrix x, and construct a target (bs * n_y) x n_y\n", 570 | "# Example of input and target matrix for a 3 class problem and batch_size=2. 2D tensors of the form\n", 571 | "# x_repeat t_repeat\n", 572 | "# [[x[0,0], x[0,1], ..., x[0,n_x]] [[1, 0, 0]\n", 573 | "# [x[1,0], x[1,1], ..., x[1,n_x]] [1, 0, 0]\n", 574 | "# [x[0,0], x[0,1], ..., x[0,n_x]] [0, 1, 0]\n", 575 | "# [x[1,0], x[1,1], ..., x[1,n_x]] [0, 1, 0]\n", 576 | "# [x[0,0], x[0,1], ..., x[0,n_x]] [0, 0, 1]\n", 577 | "# [x[1,0], x[1,1], ..., x[1,n_x]]] [0, 0, 1]]\n", 578 | "t_eye = T.eye(num_classes, k=0)\n", 579 | "t_u = t_eye.reshape((num_classes, 1, num_classes)).repeat(sym_x.shape[0], axis=1).reshape((-1, num_classes))\n", 580 | "x_u = sym_x.reshape((1, sym_x.shape[0], sym_x.shape[1])).repeat(num_classes, axis=0).reshape((-1, sym_x.shape[1]))\n", 581 | "\n", 582 | "y_train = get_output(l_y, {l_in_x:sym_x}, deterministic=False)\n", 583 | "z_train, muq_train, logvarq_train = get_output([l_z,l_muq,l_logvarq],{l_in_x:x_u, l_in_y:t_u},deterministic=False)\n", 584 | "mux_train = get_output(l_mux,{l_in_z:z_train, l_in_y:t_u},deterministic=False)\n", 585 | "\n", 586 | "log_px_given_z_u = log_bernoulli(x_u, mux_train, eps=1e-6).sum(axis=-1)\n", 587 | "KL_qp_u = kl_normal2_stdnormal(muq_train, logvarq_train).sum(axis=-1)\n", 588 | "py_u = softmax(T.zeros((x_u.shape[0], num_classes)))\n", 589 | "log_py_u = -categorical_crossentropy(py_u, t_u)\n", 590 | "LL_u = log_px_given_z_u + log_py_u - KL_qp_u\n", 591 | "LL_u = LL_u.reshape((num_classes, sym_x.shape[0])).T\n", 592 | "LL_u = T.sum(y_train * (LL_u - T.log(y_train+1e-8)), axis=-1).mean()\n", 593 | "\n", 594 | "LL = LL_u + LL_l\n", 595 | "\n", 596 | "eval_acc = categorical_accuracy(y_eval, sym_y).mean()\n", 597 | "\n", 598 | "all_params = get_all_params([l_z,l_mux,l_y],trainable=True)\n", 599 | "\n", 600 | "# Let Theano do its magic and get all the gradients we need for training\n", 601 | "all_grads = T.grad(-LL, all_params)\n", 602 | "\n", 603 | "# Set the update function for parameters. 
The Adam optimizer works really well with VAEs.\n", 604 | "updates = lasagne.updates.adam(all_grads, all_params, learning_rate=3e-5)\n", 605 | "\n", 606 | "f_train = theano.function(inputs=[sym_x, sym_x_l, sym_y],\n", 607 | " outputs=[LL],\n", 608 | " updates=updates)\n", 609 | "\n", 610 | "f_train_eval = theano.function(inputs=[sym_x, sym_x_l, sym_y],\n", 611 | " outputs=[LL_l_eval, LL_u, KL_qp_l.mean(), KL_qp_u.mean()])\n", 612 | "\n", 613 | "f_eval = theano.function(inputs=[sym_x, sym_y],\n", 614 | " outputs=[eval_acc])\n", 615 | "\n", 616 | "f_z = theano.function(inputs=[sym_x, sym_y],\n", 617 | " outputs=[z_eval])\n" 618 | ] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": null, 623 | "metadata": { 624 | "collapsed": false 625 | }, 626 | "outputs": [], 627 | "source": [ 628 | "batch_size = 100\n", 629 | "samples_to_process = 9e4\n", 630 | "val_interval = 5e2\n", 631 | " \n", 632 | "LL_l_train, LL_u_train, KL_l_train, KL_u_train = [], [], [], []\n", 633 | "acc_valid = []\n", 634 | "samples_processed = 0\n", 635 | "plt.figure(figsize=(12, 6))\n", 636 | "valid_samples_processed = []\n", 637 | "\n", 638 | "try:\n", 639 | " while samples_processed < samples_to_process:\n", 640 | " idxs = np.random.choice(range(x_train.shape[0]), size=(batch_size), replace=False) \n", 641 | " x_batch = x_train[idxs]\n", 642 | " t_batch = t_train[idxs]\n", 643 | " out = f_train(x_batch, x_train_l, t_train_l)\n", 644 | " samples_processed += batch_size\n", 645 | " \n", 646 | " if samples_processed % val_interval == 0:\n", 647 | " valid_samples_processed += [samples_processed]\n", 648 | " out = f_train_eval(x_train, x_train_l, t_train_l)\n", 649 | " LL_l_train += [out[0]]\n", 650 | " LL_u_train += [out[1]]\n", 651 | " KL_l_train += [out[2]]\n", 652 | " KL_u_train += [out[3]]\n", 653 | " \n", 654 | " out = f_eval(x_valid, t_valid)\n", 655 | " acc_valid += [out[0]]\n", 656 | " \n", 657 | " #z_eval = f_z(x_valid, t_valid)[0]\n", 658 | " \n", 659 | " plt.subplot(1, 2, 1)\n", 660 | " plt.legend(['acc. eval'], loc=2)\n", 661 | " plt.xlabel('Updates')\n", 662 | " plt.plot(valid_samples_processed, acc_valid, color=\"black\")\n", 663 | " plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0))\n", 664 | " plt.grid('on')\n", 665 | " \n", 666 | " plt.subplot(1, 2, 2)\n", 667 | " plt.legend(['LL_l train', 'LL_u train'], loc=2)\n", 668 | " plt.xlabel('Updates')\n", 669 | " plt.plot(valid_samples_processed, LL_l_train, color=\"black\")\n", 670 | " plt.plot(valid_samples_processed, LL_u_train, color=\"red\")\n", 671 | " plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0))\n", 672 | " plt.grid('on')\n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " plt.savefig(\"out.png\")\n", 677 | " display(Image(filename=\"out.png\"))\n", 678 | " clear_output(wait=True)\n", 679 | " \n", 680 | " \n", 681 | "except KeyboardInterrupt:\n", 682 | " pass\n", 683 | "\n", 684 | "\n" 685 | ] 686 | }, 687 | { 688 | "cell_type": "markdown", 689 | "metadata": {}, 690 | "source": [ 691 | "### Exercise 2 - Analyzing the semi-supervised VAE\n", 692 | "1. Understand how the above code works.\n", 693 | "2. Test different network structures and fractions of labeled/unlabeled data and evaluate the performance of the model.\n", 694 | "3. What happens when you remove the labeled lowerbound from the objective.\n", 695 | "4. What does the scaling of the labeled cross-entropy do to the training.\n", 696 | "3. Try to benchmark this model against other semi-supervised approaches." 
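When working through these exercises it is useful to have a quick way of scoring the classifier part of the model on held-out data. The sketch below assumes the training cells above have been run and reuses f_eval and the validation/test arrays from them:

valid_acc = f_eval(x_valid, t_valid)[0]
test_acc = f_eval(x_test, t_test)[0]
print("validation accuracy: %.3f, test accuracy: %.3f" % (valid_acc, test_acc))

# To change the labeled fraction, edit num_labeled in the cell that builds x_train_l / t_train_l
# and re-run the model, objective and training cells from that point on.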
697 | ] 698 | }, 699 | { 700 | "cell_type": "code", 701 | "execution_count": null, 702 | "metadata": { 703 | "collapsed": true 704 | }, 705 | "outputs": [], 706 | "source": [] 707 | } 708 | ], 709 | "metadata": { 710 | "kernelspec": { 711 | "display_name": "Python [Root]", 712 | "language": "python", 713 | "name": "Python [Root]" 714 | }, 715 | "language_info": { 716 | "codemirror_mode": { 717 | "name": "ipython", 718 | "version": 3 719 | }, 720 | "file_extension": ".py", 721 | "mimetype": "text/x-python", 722 | "name": "python", 723 | "nbconvert_exporter": "python", 724 | "pygments_lexer": "ipython3", 725 | "version": "3.5.2" 726 | } 727 | }, 728 | "nbformat": 4, 729 | "nbformat_minor": 0 730 | } 731 | -------------------------------------------------------------------------------- /lab4/out.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeepLearningDTU/variational-autoencoders-summerschool-2016/00e2c92f5d4562cb6ac24471d28cd513c7d6aec2/lab4/out.png -------------------------------------------------------------------------------- /lab4/samplelayer.py: -------------------------------------------------------------------------------- 1 | import lasagne 2 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 3 | import theano.tensor as T 4 | import theano 5 | 6 | 7 | class SimpleSampleLayer(lasagne.layers.MergeLayer): 8 | """ 9 | Simple sampling layer drawing a single Monte Carlo sample to approximate 10 | E_q [log( p(x,z) / q(z|x) )]. This is the approach described in [KINGMA]_. 11 | 12 | Parameters 13 | ---------- 14 | mu, log_var : :class:`Layer` instances 15 | Parameterizing the mean and log(variance) of the distribution to sample 16 | from as described in [KINGMA]_. The code assumes that these have the 17 | same number of dimensions. 18 | 19 | seed : int 20 | seed to random stream 21 | 22 | Methods 23 | ---------- 24 | seed : Helper function to change the random seed after init is called 25 | 26 | References 27 | ---------- 28 | .. [KINGMA] Kingma, Diederik P., and Max Welling. 29 | "Auto-Encoding Variational Bayes." 30 | arXiv preprint arXiv:1312.6114 (2013). 31 | """ 32 | def __init__(self, mean, log_var, 33 | seed=lasagne.random.get_rng().randint(1, 2147462579), 34 | **kwargs): 35 | super(SimpleSampleLayer, self).__init__([mean, log_var], **kwargs) 36 | 37 | self._srng = RandomStreams(seed) 38 | 39 | def seed(self, seed=lasagne.random.get_rng().randint(1, 2147462579)): 40 | self._srng.seed(seed) 41 | 42 | def get_output_shape_for(self, input_shapes): 43 | return input_shapes[0] 44 | 45 | def get_output_for(self, input, **kwargs): 46 | mu, log_var = input 47 | eps = self._srng.normal(mu.shape) 48 | z = mu + T.exp(0.5 * log_var) * eps 49 | return z 50 | 51 | 52 | class SampleLayer(lasagne.layers.MergeLayer): 53 | """ 54 | Sampling layer supporting importance sampling as described in [BURDA]_ and 55 | multiple Monte Carlo samples for the approximation of 56 | E_q [log( p(x,z) / q(z|x) )]. 57 | 58 | Parameters 59 | ---------- 60 | mu : class:`Layer` instance 61 | Parameterizing the mean of the distribution to sample 62 | from as described in [BURDA]_. 63 | 64 | log_var : class:`Layer` instance 65 | By default assumed to parametrize log(sigma^2) of the distribution to 66 | sample from as described in [BURDA]_ which is transformed to sigma using 67 | the nonlinearity function as described below. Effectively this means 68 | that the nonlinearity function controls what log_var parametrizes. 
A few 69 | common examples: 70 | -nonlinearity = lambda x: T.exp(0.5*x) => log_var = log(sigma^2)[default] 71 | -nonlinearity = lambda x: T.sqrt(x) => log_var = sigma^2 72 | -nonlinearity = lambda x: x => log_var = sigma 73 | 74 | eq_samples : int or T.scalar 75 | Number of Monte Carlo samples used to estimate the expectation over 76 | q(z|x) in eq. (8) in [BURDA]_. 77 | 78 | iw_samples : int or T.scalar 79 | Number of importance samples in the sum over k in eq. (8) in [BURDA]_. 80 | 81 | nonlinearity : callable or None 82 | The nonlinearity that is applied to the log_var input layer to transform 83 | it into a standard deviation. By default we assume that 84 | log_var = log(sigma^2) and hence the corresponding nonlinearity is 85 | f(x) = T.exp(0.5*x) such that T.exp(0.5*log(sigma^2)) = sigma 86 | 87 | seed : int 88 | seed to random stream 89 | 90 | Methods 91 | ---------- 92 | seed : Helper function to change the random seed after init is called 93 | 94 | References 95 | ---------- 96 | .. [BURDA] Burda, Yuri, Roger Grosse, and Ruslan Salakhutdinov. 97 | "Importance Weighted Autoencoders." 98 | arXiv preprint arXiv:1509.00519 (2015). 99 | """ 100 | 101 | def __init__(self, mean, log_var, 102 | eq_samples=1, 103 | iw_samples=1, 104 | nonlinearity=lambda x: T.exp(0.5*x), 105 | seed=lasagne.random.get_rng().randint(1, 2147462579), 106 | **kwargs): 107 | super(SampleLayer, self).__init__([mean, log_var], **kwargs) 108 | 109 | self.eq_samples = eq_samples 110 | self.iw_samples = iw_samples 111 | self.nonlinearity = nonlinearity 112 | 113 | self._srng = RandomStreams(seed) 114 | 115 | def seed(self, seed=lasagne.random.get_rng().randint(1, 2147462579)): 116 | self._srng.seed(seed) 117 | 118 | def get_output_shape_for(self, input_shapes): 119 | batch_size, num_latent = input_shapes[0] 120 | if isinstance(batch_size, int) and \ 121 | isinstance(self.iw_samples, int) and \ 122 | isinstance(self.eq_samples, int): 123 | out_dim = (batch_size*self.eq_samples*self.iw_samples, num_latent) 124 | else: 125 | out_dim = (None, num_latent) 126 | return out_dim 127 | 128 | def get_output_for(self, input, **kwargs): 129 | mu, log_var = input 130 | batch_size, num_latent = mu.shape 131 | eps = self._srng.normal( 132 | [batch_size, self.eq_samples, self.iw_samples, num_latent], 133 | dtype=theano.config.floatX) 134 | 135 | z = mu.dimshuffle(0,'x','x',1) + \ 136 | self.nonlinearity( log_var.dimshuffle(0,'x','x',1)) * eps 137 | 138 | return z.reshape((-1,num_latent)) 139 | 140 | 141 | class SimpleBernoulliSampleLayer(lasagne.layers.Layer): 142 | """ 143 | Simple sampling layer drawing samples from bernoulli distributions. 
144 | 145 | Parameters 146 | ---------- 147 | mean : :class:`Layer` instances 148 | Parameterizing the mean value of each bernoulli distribution 149 | seed : int 150 | seed to random stream 151 | Methods 152 | ---------- 153 | seed : Helper function to change the random seed after init is called 154 | """ 155 | 156 | def __init__(self, mean, 157 | seed=lasagne.random.get_rng().randint(1, 2147462579), 158 | **kwargs): 159 | super(SimpleBernoulliSampleLayer, self).__init__(mean, **kwargs) 160 | 161 | self._srng = RandomStreams(seed) 162 | 163 | def seed(self, seed=lasagne.random.get_rng().randint(1, 2147462579)): 164 | self._srng.seed(seed) 165 | 166 | def get_output_shape_for(self, input_shape): 167 | return input_shape 168 | 169 | def get_output_for(self, mu, **kwargs): 170 | return self._srng.binomial(size=mu.shape, p=mu, dtype=mu.dtype) 171 | 172 | 173 | class BernoulliSampleLayer(lasagne.layers.Layer): 174 | """ 175 | Bernoulli Sampling layer supporting importance sampling 176 | Parameters 177 | ---------- 178 | mean : class:`Layer` instance 179 | Parameterizing the mean value of each bernoulli distribution 180 | eq_samples : int or T.scalar 181 | Number of Monte Carlo samples used to estimate the expectation over 182 | iw_samples : int or T.scalar 183 | Number of importance samples in the sum over k 184 | seed : int 185 | seed to random stream 186 | Methods 187 | ---------- 188 | seed : Helper function to change the random seed after init is called 189 | """ 190 | 191 | def __init__(self, mean, 192 | eq_samples=1, 193 | iw_samples=1, 194 | seed=lasagne.random.get_rng().randint(1, 2147462579), 195 | **kwargs): 196 | super(BernoulliSampleLayer, self).__init__(mean, **kwargs) 197 | 198 | self.eq_samples = eq_samples 199 | self.iw_samples = iw_samples 200 | 201 | self._srng = RandomStreams(seed) 202 | 203 | def seed(self, seed=lasagne.random.get_rng().randint(1, 2147462579)): 204 | self._srng.seed(seed) 205 | 206 | def get_output_shape_for(self, input_shape): 207 | batch_size, num_latent = input_shape 208 | if isinstance(batch_size, int) and \ 209 | isinstance(self.iw_samples, int) and \ 210 | isinstance(self.eq_samples, int): 211 | out_dim = (batch_size*self.eq_samples*self.iw_samples, num_latent) 212 | else: 213 | out_dim = (None, num_latent) 214 | return out_dim 215 | 216 | def get_output_for(self, input, **kwargs): 217 | mu = input 218 | batch_size, num_latent = mu.shape 219 | shp = (batch_size, self.eq_samples, self.iw_samples, num_latent) 220 | mu_shp = mu.dimshuffle(0,'x','x',1) 221 | mu_shp = T.repeat(mu_shp, axis=1, repeats=self.eq_samples) 222 | mu_shp = T.repeat(mu_shp, axis=2, repeats=self.iw_samples) 223 | samples = self._srng.binomial( 224 | size=shp, p=mu_shp, dtype=theano.config.floatX) 225 | return samples.reshape((-1, num_latent)) 226 | --------------------------------------------------------------------------------
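To round off, here is a minimal usage sketch for the SampleLayer defined above (an illustration added for this write-up, not a file in the repository; run it from the lab4 folder so samplelayer.py is importable). It wires mean and log-variance layers into a SampleLayer with several eq/iw samples and checks that the output is reshaped to (batch_size * eq_samples * iw_samples, num_latent), as promised by get_output_shape_for:

import numpy as np
import theano
import theano.tensor as T
from lasagne.layers import InputLayer, DenseLayer, get_output
from samplelayer import SampleLayer

l_in = InputLayer(shape=(None, 784))
l_mu = DenseLayer(l_in, num_units=8, nonlinearity=None)        # parametrizes mu
l_logvar = DenseLayer(l_in, num_units=8, nonlinearity=None)    # parametrizes log(sigma^2)
l_z = SampleLayer(mean=l_mu, log_var=l_logvar, eq_samples=2, iw_samples=3)

sym_x = T.matrix('x')
z = get_output(l_z, {l_in: sym_x})
x = np.random.rand(5, 784).astype(theano.config.floatX)
print(z.eval({sym_x: x}).shape)    # (30, 8) = (5 * 2 * 3, 8)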