├── .gitignore ├── README.md ├── lab1 ├── confusionmatrix.py ├── lab1_FFN.ipynb └── mnist.npz ├── lab2 ├── .ipynb_checkpoints │ └── lab2_CNN-checkpoint.ipynb ├── confusionmatrix.py ├── lab2_CNN.ipynb └── mnist.npz ├── lab3 ├── .ipynb_checkpoints │ └── RNN-checkpoint.ipynb ├── RNN.ipynb ├── confusionmatrix.py ├── data_generator.py ├── decoder_attention.py └── enc-dec.png └── lab6 ├── .ipynb_checkpoints └── Lab6-checkpoint.ipynb ├── Lab6.ipynb ├── VAE.png ├── lab6 ├── mnist.npz └── samplelayer.py /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | *.pyc 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Nvidia Deep Learning Summercamp 2016 2 | by *Casper Sønderby, University of Copenhagen* 3 | 4 | Parts of the code are based on contributions from Lars Maaløe and Søren Kaae Sønderby 5 | -------------------------------------------------------------------------------- /lab1/confusionmatrix.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class ConfusionMatrix: 5 | """ 6 | Simple confusion matrix class 7 | row is the true class, column is the predicted class 8 | """ 9 | def __init__(self, num_classes, class_names=None): 10 | self.n_classes = num_classes 11 | if class_names is None: 12 | self.class_names = map(str, range(num_classes)) 13 | else: 14 | self.class_names = class_names 15 | 16 | # find max class_name and pad 17 | max_len = max(map(len, self.class_names)) 18 | self.max_len = max_len 19 | for idx, name in enumerate(self.class_names): 20 | if len(self.class_names) < max_len: 21 | self.class_names[idx] = name + " "*(max_len-len(name)) 22 | 23 | self.mat = np.zeros((num_classes,num_classes),dtype='int') 24 | 25 | def __str__(self): 26 | # calucate row and column sums 27 | col_sum = np.sum(self.mat, axis=1) 28 | 
row_sum = np.sum(self.mat, axis=0) 29 | 30 | s = [] 31 | 32 | mat_str = self.mat.__str__() 33 | mat_str = mat_str.replace('[','').replace(']','').split('\n') 34 | 35 | for idx, row in enumerate(mat_str): 36 | if idx == 0: 37 | pad = " " 38 | else: 39 | pad = "" 40 | class_name = self.class_names[idx] 41 | class_name = " " + class_name + " |" 42 | row_str = class_name + pad + row 43 | row_str += " |" + str(col_sum[idx]) 44 | s.append(row_str) 45 | 46 | row_sum = [(self.max_len+4)*" "+" ".join(map(str, row_sum))] 47 | hline = [(1+self.max_len)*" "+"-"*len(row_sum[0])] 48 | 49 | s = hline + s + hline + row_sum 50 | 51 | # add linebreaks 52 | s_out = [line+'\n' for line in s] 53 | return "".join(s_out) 54 | 55 | def batch_add(self, targets, preds): 56 | assert targets.shape == preds.shape 57 | assert len(targets) == len(preds) 58 | assert max(targets) < self.n_classes 59 | assert max(preds) < self.n_classes 60 | targets = targets.flatten() 61 | preds = preds.flatten() 62 | for i in range(len(targets)): 63 | self.mat[targets[i], preds[i]] += 1 64 | 65 | def get_errors(self): 66 | tp = np.asarray(np.diag(self.mat).flatten(),dtype='float') 67 | fn = np.asarray(np.sum(self.mat, axis=1).flatten(),dtype='float') - tp 68 | fp = np.asarray(np.sum(self.mat, axis=0).flatten(),dtype='float') - tp 69 | tn = np.asarray(np.sum(self.mat)*np.ones(self.n_classes).flatten(), 70 | dtype='float') - tp - fn - fp 71 | return tp, fn, fp, tn 72 | 73 | def accuracy(self): 74 | """ 75 | Calculates global accuracy 76 | :return: accuracy 77 | :example: >>> conf = ConfusionMatrix(3) 78 | >>> conf.batchAdd([0,0,1],[0,0,2]) 79 | >>> print conf.accuracy() 80 | """ 81 | tp, _, _, _ = self.get_errors() 82 | n_samples = np.sum(self.mat) 83 | return np.sum(tp) / n_samples 84 | 85 | def sensitivity(self): 86 | tp, tn, fp, fn = self.get_errors() 87 | res = tp / (tp + fn) 88 | res = res[~np.isnan(res)] 89 | return res 90 | 91 | def specificity(self): 92 | tp, tn, fp, fn = self.get_errors() 93 | res = tn / 
(tn + fp) 94 | res = res[~np.isnan(res)] 95 | return res 96 | 97 | def positive_predictive_value(self): 98 | tp, tn, fp, fn = self.get_errors() 99 | res = tp / (tp + fp) 100 | res = res[~np.isnan(res)] 101 | return res 102 | 103 | def negative_predictive_value(self): 104 | tp, tn, fp, fn = self.get_errors() 105 | res = tn / (tn + fn) 106 | res = res[~np.isnan(res)] 107 | return res 108 | 109 | def false_positive_rate(self): 110 | tp, tn, fp, fn = self.get_errors() 111 | res = fp / (fp + tn) 112 | res = res[~np.isnan(res)] 113 | return res 114 | 115 | def false_discovery_rate(self): 116 | tp, tn, fp, fn = self.get_errors() 117 | res = fp / (tp + fp) 118 | res = res[~np.isnan(res)] 119 | return res 120 | 121 | def F1(self): 122 | tp, tn, fp, fn = self.get_errors() 123 | res = (2*tp) / (2*tp + fp + fn) 124 | res = res[~np.isnan(res)] 125 | return res 126 | 127 | def matthews_correlation(self): 128 | tp, tn, fp, fn = self.get_errors() 129 | numerator = tp*tn - fp*fn 130 | denominator = np.sqrt((tp + fp)*(tp + fn)*(tn + fp)*(tn + fn)) 131 | res = numerator / denominator 132 | res = res[~np.isnan(res)] 133 | return res 134 | -------------------------------------------------------------------------------- /lab1/lab1_FFN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "import matplotlib\n", 13 | "import numpy as np\n", 14 | "import matplotlib.pyplot as plt\n", 15 | "import sklearn.datasets\n", 16 | "import theano\n", 17 | "import theano.tensor as T\n", 18 | "import lasagne\n", 19 | "\n", 20 | "def plot_decision_boundary(pred_func, X, y):\n", 21 | " #from https://github.com/dennybritz/nn-from-scratch/blob/master/nn-from-scratch.ipynb\n", 22 | " # Set min and max values and give it some padding\n", 23 | " x_min, x_max = X[:, 0].min() - .5, 
X[:, 0].max() + .5\n", 24 | " y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5\n", 25 | " \n", 26 | " h = 0.01\n", 27 | " # Generate a grid of points with distance h between them\n", 28 | " xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n", 29 | " \n", 30 | " yy = yy.astype('float32')\n", 31 | " xx = xx.astype('float32')\n", 32 | " # Predict the function value for the whole gid\n", 33 | " Z = pred_func(np.c_[xx.ravel(), yy.ravel()])[:,0]\n", 34 | " Z = Z.reshape(xx.shape)\n", 35 | " # Plot the contour and training examples\n", 36 | " plt.figure()\n", 37 | " plt.contourf(xx, yy, Z, cmap=plt.cm.RdBu)\n", 38 | " plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Spectral)\n", 39 | "\n", 40 | "def onehot(t, num_classes):\n", 41 | " out = np.zeros((t.shape[0], num_classes))\n", 42 | " for row, col in enumerate(t):\n", 43 | " out[row, col] = 1\n", 44 | " return out\n", 45 | "\n", 46 | " " 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "# Neural networks 101\n", 54 | "In this notebook you will implement a simple neural network in Lasagne utilizing the automatic differentiation engine of Theano. We assume that you are already familiar with backpropation (if not please see [Andrej Karpathy](http://cs.stanford.edu/people/karpathy/) or [Michal Nielsen](http://neuralnetworksanddeeplearning.com/chap2.html).\n", 55 | "We'll not spend much time on how Theano works, but you can refer to [this short tutorial](http://nbviewer.jupyter.org/github/craffel/theano-tutorial/blob/master/Theano%20Tutorial.ipynb) if you are interested.\n", 56 | "\n", 57 | "Additionally, for the ambitious people we have previously made an assignment where you will implement both the forward and backpropagation in a neural network by hand, https://github.com/DTU-deeplearning/day1-NN/blob/master/exercises_1.ipynb \n", 58 | "\n", 59 | "In this exercise we'll start right away by defining logistic regression model in Lasagne/Theano. 
Some details of Theano can be a bit confusing; however, you'll pick them up once you have worked with it for some time. For now you should pay most attention to the high-level network construction in Lasagne. We'll initially start with a simple 2-D and 2-class classification problem where the class decision boundary can be visualized. Initially we show that logistic regression can only separate classes linearly. Adding a non-linear hidden layer to the algorithm permits nonlinear class separation. If time permits we'll continue on to implement a fully connected neural network to classify the (in)famous MNIST dataset consisting of images of handwritten digits. \n" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "# Problem \n", 67 | "We'll initially demonstrate that MLPs can classify non-linear problems whereas simple logistic regression cannot. For ease of visualization and computational speed we initially experiment on the simple 2D half-moon dataset." 
68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "# Generate a dataset and plot it\n", 79 | "np.random.seed(0)\n", 80 | "num_samples = 300\n", 81 | "\n", 82 | "X, y = sklearn.datasets.make_moons(num_samples, noise=0.20)\n", 83 | "\n", 84 | "X_tr = X[:100].astype('float32')\n", 85 | "X_val = X[100:200].astype('float32')\n", 86 | "X_te = X[200:].astype('float32')\n", 87 | "\n", 88 | "y_tr = y[:100].astype('int32')\n", 89 | "y_val = y[100:200].astype('int32')\n", 90 | "y_te = y[200:].astype('int32')\n", 91 | "\n", 92 | "plt.scatter(X_tr[:,0], X_tr[:,1], s=40, c=y_tr, cmap=plt.cm.BuGn)\n", 93 | "\n", 94 | "print X.shape, y.shape\n", 95 | "\n", 96 | "num_features = X_tr.shape[-1]" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "# From Logistic Regression to \"Deep Learning\" in Lasagne\n", 104 | "The code implements logistic regression in lasagne. In section __Assignments Half Moon__ you are asked to modify the code into a neural network. \n", 105 | "\n", 106 | "The building block in lasagne is the Layer. To get started the most important layers are the DenseLayer and the InputLayer. \n", 107 | "\n", 108 | "The [InputLayer](http://lasagne.readthedocs.io/en/latest/modules/layers/input.html) is a \"special\" layer which lets you input data to the network. The InputLayer is initialized with a tuple specifying the shape of the input data. Note that it is common to provide ``None`` for the first dimension which allows you to vary the batch size at runtime. \n", 109 | "\n", 110 | "The [DenseLayer](http://lasagne.readthedocs.io/en/latest/modules/layers/dense.html) implements the computation: \n", 111 | "\n", 112 | "$$y = nonlinearity(xW + b)$$\n", 113 | "\n", 114 | "where $x$ is the layer input, $y$ is the layer output and $\\{W, b\\}$ are the layer parameters. 
The DenseLayer is initialized with a pointer to the previous layer, the desired number of units in the layer and the nonlinearity. \n", 115 | "x has shape ```[batchsize, num_features]```. From this we can infer the size of ```W``` as ```[num_features, num_units]``` and b as ```[num_units]```. y is then ```[batch_size, num_units]```.\n", 116 | "\n", 117 | "\n", 118 | "A layer in Lasagne does the following:\n", 119 | "1. Given the shape of the input $x$ and the number of units in the layer lasagne infers the shapes of $W$ and $b$ and keep track of the layer parameters.\n", 120 | "2. Setup the computation $y = nonlinearity(xW + b)$" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": { 127 | "collapsed": false 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "from lasagne.updates import sgd\n", 132 | "from lasagne.nonlinearities import leaky_rectify, softmax, tanh, elu\n", 133 | "from lasagne.layers import InputLayer, DenseLayer\n", 134 | "\n", 135 | "\n", 136 | "#MODEL SPECIFICATION\n", 137 | "l_in = InputLayer(shape=(None, num_features))\n", 138 | "#INSERT HIDDEL LAYER HERE\n", 139 | "#l = DenseLayer(incoming=l,.....\n", 140 | "l_out = DenseLayer(incoming=l_in, num_units=2, nonlinearity=softmax, name='outputlayer') \n", 141 | "#We use two output units since we have two classes. the softmax function ensures that the the class probabilities sum to 1." 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "After we have built the network we can use lasagnes helper functions to \n", 149 | "\n", 150 | "1. Build the computation graph: __[lasagne.layers.get_output](http://lasagne.readthedocs.io/en/latest/modules/layers/helper.html#lasagne.layers.get_output)__ . The ``deterministic`` flag tells lasagne if we are in training mode or evaluation mode. When you build more complicated networks this is very important to remember! 
(Two important layers that behave differently in training mode and evaluation mode are the [DropoutLayer](http://lasagne.readthedocs.io/en/latest/modules/layers/noise.html#lasagne.layers.DropoutLayer) and the [BatchNormalizationLayer](http://lasagne.readthedocs.io/en/latest/modules/layers/normalization.html?highlight=batchnorm#lasagne.layers.BatchNormLayer)). Building the computation graph gives us the forward pass of the network. \n", 151 | "2. Collect the network parameters: __[lasagne.layers.get_all_params](http://lasagne.readthedocs.io/en/latest/modules/layers/helper.html#lasagne.layers.get_all_params)__ (Note the trainable flag which will only return parameters that are trainable. You'll get errors if you are using batchnorm and you forget this)\n", 152 | "\n", 153 | "Note that all the helper functions are called with the output layer or a list of output layers if you have multiple output layers. " 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": { 160 | "collapsed": false 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "sym_x = T.matrix('X') # a symbolic variable taking on the value of a input batch.\n", 165 | "sym_t = T.ivector('target') # a symbolic variable taking on the value of the target batch.\n", 166 | "\n", 167 | "\n", 168 | "# Get network output\n", 169 | "train_out = lasagne.layers.get_output(l_out, {l_in: sym_x}, deterministic=False)\n", 170 | "eval_out = lasagne.layers.get_output(l_out, {l_in: sym_x}, deterministic=True)\n", 171 | "\n", 172 | "\n", 173 | "# Get list of all trainable parameters in the network.\n", 174 | "all_params = lasagne.layers.get_all_params(l_out, trainable=True)\n", 175 | "\n", 176 | "# print shapes of all the paramters in the network.\n", 177 | "for p in all_params:\n", 178 | " print p, p.get_value().shape" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "``train_out`` will be a symbolic variable representing the 
network output. Using ``train_out`` we can define the [crossentropy error](http://deeplearning.net/software/theano/library/tensor/nnet/nnet.html#tensor.nnet.categorical_crossentropy) used for training the network.\n", 186 | "We ```mean``` over all the samples in the mini-batch.\n" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": { 193 | "collapsed": true 194 | }, 195 | "outputs": [], 196 | "source": [ 197 | "cost_train = T.nnet.categorical_crossentropy(train_out, sym_t).mean()\n", 198 | "cost_eval = T.nnet.categorical_crossentropy(eval_out, sym_t).mean()" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "When we train a neural network we update the parameters in direction of the negative gradient w.r.t the cost.\n", 206 | "We can use ``T.grad`` to get the gradients for all parameters in the network w.r.t ``cost_train``.\n", 207 | "Imagine that ```cost_train``` is a function and we want to go downhill. We go downhill by changing the value of the parameters in direction of the negative gradient. \n", 208 | "\n", 209 | "Finally we can use __[lasagne.updates.sgd](http://lasagne.readthedocs.io/en/latest/modules/updates.html#lasagne.updates.sgd)__ to calculate the stochastic gradient descent (SGD) update rule for each parameter in the network. ``updates`` is a dictionary of the parameter update rules.\n", 210 | "\n", 211 | "Here's a small animation of [different optimizers doing](http://lasagne.readthedocs.io/en/latest/modules/updates.html) gradient descent: http://imgur.com/a/Hqolp . It shows, e.g., why saddle points might be difficult." 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": { 218 | "collapsed": false 219 | }, 220 | "outputs": [], 221 | "source": [ 222 | "# Let Theano do its magic and get all the gradients we need for training. Essentially T.grad does backprop i.e. get the \n", 223 | "# gradient of cost_train w.r.t. 
the parameters.\n", 224 | "all_grads = T.grad(cost_train, all_params)\n", 225 | "\n", 226 | "# Set the update function for parameters \n", 227 | "# you might want to experiment with more advanced update schemes like rmsprop, adadelta etc.\n", 228 | "updates = lasagne.updates.sgd(all_grads, all_params, learning_rate=1.0)\n" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "The final step is to compile Theano functions for the network. For theano functions we need to specify which inputs the function should take. For our network that is ``sym_x`` which is the input data and ``sym_t`` which is the targets. Secondly we need to specify which outputs we want the network to return. In our case that is the crossentropy cost and the network output.\n", 236 | "\n", 237 | "When we compile ``f_train`` we additionally give the updates dictionary as input. This tells Theano to update the network parameters with the update rules every time we call ``f_train``. " 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": { 244 | "collapsed": true 245 | }, 246 | "outputs": [], 247 | "source": [ 248 | "f_eval = theano.function(inputs=[sym_x, sym_t],\n", 249 | " outputs=[cost_eval, eval_out])\n", 250 | "\n", 251 | "f_train = theano.function(inputs=[sym_x, sym_t],\n", 252 | " outputs=[cost_train, eval_out],\n", 253 | " updates=updates)\n", 254 | "\n", 255 | "\n", 256 | "\n", 257 | "#now you have three functions. 
\n", 258 | "# f_train(X,y) -> cost, y_pred which will update the parameters using backprop each time you call it, only use this on the training data!\n", 259 | "# f_test(X,y) -> cost, y_pred which only calculates the forward pass\n", 260 | "\n", 261 | "\n", 262 | "#This us just a helper function for plotting the decision boundaries between the two classes\n", 263 | "f_pred = theano.function(inputs=[sym_x],\n", 264 | " outputs=eval_out)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": { 271 | "collapsed": false 272 | }, 273 | "outputs": [], 274 | "source": [ 275 | "# Training loop\n", 276 | "plot_decision_boundary(lambda x: f_pred(x), X_val,y_val)\n", 277 | "plt.title(\"Untrained Classifier\")\n", 278 | "\n", 279 | "num_epochs = 1000\n", 280 | "\n", 281 | "train_cost, val_cost = [],[]\n", 282 | "for e in range(num_epochs):\n", 283 | " out = f_train(X_tr,y_tr)\n", 284 | " #out = [cost, y_pred]\n", 285 | " train_cost += [out[0]]\n", 286 | " \n", 287 | " out = f_eval(X_val,y_val)\n", 288 | " val_cost += [out[0]]\n", 289 | "\n", 290 | " if e % 100 == 0:\n", 291 | " print \"Epoch %i, Train Cost: %0.3f\\tVal Cost: %0.3f\"%(e, train_cost[-1],val_cost[-1])\n", 292 | " \n", 293 | " \n", 294 | "out = f_eval(X_te,y_te)\n", 295 | "test_cost = out[0]\n", 296 | "print \"\\nTest Cost: %0.3f\"%(test_cost)\n", 297 | "\n", 298 | "plot_decision_boundary(lambda x: f_pred(x), X_te, y_te)\n", 299 | "plt.title(\"Trained Classifier\")\n", 300 | "\n", 301 | "epoch = np.arange(len(train_cost))\n", 302 | "plt.figure()\n", 303 | "plt.plot(epoch,train_cost,'r',epoch,val_cost,'b')\n", 304 | "plt.legend(['Train Loss','Val Loss'])\n", 305 | "plt.xlabel('Updates'), plt.ylabel('Loss')\n", 306 | "\n" 307 | ] 308 | }, 309 | { 310 | "cell_type": "markdown", 311 | "metadata": {}, 312 | "source": [ 313 | "# Assignments Half Moon\n", 314 | "\n", 315 | " 1) A linear logistic classifier is only able to create a linear decision boundary. 
Change the Logistic classifier into a (non-linear) Neural network by inserting a dense hidden layer between the input and output layers of the model\n", 316 | " \n", 317 | " 2) Experiment with multiple hidden layers or more / less hidden units. What happens to the decision boundary?\n", 318 | " \n", 319 | " 3) Overfitting: When increasing the number of hidden layers / units the neural network will fit the training data better by creating a highly nonlinear decision boundary. If the model is too complex it will often generalize poorly to new data (validation and test set). Can you observe this from the training and validation errors? \n", 320 | " \n", 321 | " 4) We used the vanilla stochastic gradient descent algorithm for parameter updates. This is usually slow to converge and more sophisticated pseudo-second-order methods usually work better. Try changing the optimizer to [adam or adamax](http://lasagne.readthedocs.io/en/latest/modules/updates.html) (lasagne.updates.adam, lasagne.updates.adamax)" 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "metadata": {}, 327 | "source": [ 328 | "# Optional: MNIST dataset\n", 329 | "MNIST is a dataset that is often used for benchmarking. The MNIST dataset consists of 70,000 images of handwritten digits from 0-9. The dataset is split into a 50,000 images training set, 10,000 images validation set and 10,000 images test set. The images are 28x28 pixels, where each pixel represents a normalised value between 0-255 (0=black and 255=white).\n", 330 | "\n", 331 | "### Primer for the afternoon...\n", 332 | "We use a feedforward neural network to classify the 28x28 mnist images. ``num_features`` is therefore 28x28=784.\n", 333 | "That is we represent each image as a vector. The ordering of the pixels in the vector does not matter, so we could permute all images using the same permutation and still get the same performance. 
(You are of course encouraged to try this using ``numpy.random.permutation`` to get a random permutation :)). This task is therefore called the _permutation invariant_ MNIST. Obviously this throws away a lot of structure in the data. After lunch we'll fix this with the convolutional neural network which encodes prior knowledge about data that has either spatial or temporal structure. \n", 334 | "\n", 335 | "### Ballpark estimates of hyperparameters\n", 336 | "__Optimizers:__\n", 337 | " 1. SGD + Momentum: learning rate 1.0 - 0.1 \n", 338 | " 2. ADAM: learning rate 3*1e-4 - 1e-5\n", 339 | " 3. RMSPROP: somewhere between SGD and ADAM\n", 340 | "\n", 341 | "__Regularization:__\n", 342 | " 1. Dropout. Dropout rate 0.1-0.5 \n", 343 | " 2. L2/L1 regularization. http://lasagne.readthedocs.io/en/latest/modules/regularization.html . I don't use this that often but 1e-4 - 1e-8.\n", 344 | " \n", 345 | " 3. Batchnorm: Batchnorm also acts as a regularizer\n", 346 | " \n", 347 | "__Parameter initialization__\n", 348 | " Parameter initialization is extremely important. [Lasagne has a lot of different initializers](http://lasagne.readthedocs.io/en/latest/modules/init.html). Often used initializers are\n", 349 | " 1. He\n", 350 | " 2. Glorot\n", 351 | " 3. Uniform or Normal with small scale. (0.1 - 0.01)\n", 352 | " 4. Orthogonal (I find that this works very well for RNNs)\n", 353 | "\n", 354 | "Bias is nearly always initialized to zero. \n", 355 | "\n", 356 | "__Number of hidden units and network structure__\n", 357 | " Probably as big network as possible and then apply regularization. You'll have to experiment :). One rarely goes below 512 units for feedforward networks unless you are training on CPU...\n", 358 | " There is some research into stochastic depth networks: https://arxiv.org/pdf/1603.09382v2.pdf, but in general this is trial and error. 
\n", 359 | "\n", 360 | "__Nonlinearity__: [The most commonly used nonlinearities are](http://lasagne.readthedocs.io/en/latest/modules/nonlinearities.html)\n", 361 | " \n", 362 | " 1. ReLU\n", 363 | " 2. Leaky ReLU. Same as ReLU, but with a small non-zero slope for negative inputs\n", 364 | " 3. Elu\n", 365 | " 4. Sigmoids are used if your output is binary. It is not used in the hidden layers. Squashes the output between 0 and 1\n", 366 | " 5. Softmax used as output if you have a classification problem. Normalizes the output to sum to 1.\n", 367 | "\n", 368 | "\n", 369 | "See the plot below.\n", 370 | "\n", 371 | "__mini-batch size__\n", 372 | " Usually people use 16-256. Bigger is not always better. With smaller mini-batch size you get more updates and your model might converge faster. Also small batch sizes use less memory -> you can use a bigger model.\n", 373 | "\n", 374 | "Hyperparameters can be found by experience (guessing) or some search procedure. Random search is easy to implement and performs decently: http://www.jmlr.org/papers/volume13/bergstra12a/bergstra12a.pdf . \n", 375 | "More advanced search procedures include [SPEARMINT](https://github.com/JasperSnoek/spearmint) and many others. 
" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 2, 381 | "metadata": { 382 | "collapsed": false 383 | }, 384 | "outputs": [ 385 | { 386 | "ename": "NameError", 387 | "evalue": "name 'np' is not defined", 388 | "output_type": "error", 389 | "traceback": [ 390 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 391 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 392 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# PLOT OF DIFFERENT OUTPUT USNITS\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlinspace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m6\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m6\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m100\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mrelu\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmaximum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mleaky_relu\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmaximum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m0.1\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mminimum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# probably a slow 
implementation....\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0melu\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mx\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexp\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 393 | "\u001b[0;31mNameError\u001b[0m: name 'np' is not defined" 394 | ] 395 | } 396 | ], 397 | "source": [ 398 | "# PLOT OF DIFFERENT OUTPUT USNITS\n", 399 | "x = np.linspace(-6, 6, 100)\n", 400 | "relu = lambda x: np.maximum(0, x)\n", 401 | "leaky_relu = lambda x: np.maximum(0, x) + 0.1*np.minimum(0, x) # probably a slow implementation....\n", 402 | "elu = lambda x: (x > 0)*x + (1 - (x > 0))*(np.exp(x) - 1) \n", 403 | "sigmoid = lambda x: (1+np.exp(-x))**(-1)\n", 404 | "\n", 405 | "plt.figure(figsize=(6,6))\n", 406 | "plt.plot(x, relu(x), label='ReLU', lw=2)\n", 407 | "plt.plot(x, leaky_relu(x), label='Leaky ReLU',lw=2)\n", 408 | "plt.plot(x, elu(x), label='Elu', lw=2)\n", 409 | "plt.plot(x, sigmoid(x), label='Sigmoid',lw=2)\n", 410 | "plt.legend(loc=2, fontsize=16)\n", 411 | "plt.title('Non-linearities', fontsize=20)\n", 412 | "plt.ylim([-2, 5])\n", 413 | "plt.xlim([-6, 6])\n", 414 | "\n", 415 | "# softmax\n", 416 | "# assert that all class probablities sum to one\n", 417 | "assert np.all(abs(1.0 - x_softmax.sum(axis=1)) < 1e-8)" 418 | ] 419 | }, 420 | { 421 | "cell_type": "markdown", 422 | "metadata": {}, 423 | "source": [ 424 | "## 
MNIST\n", 425 | "First let's load the MNIST dataset and plot a few examples:" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": null, 431 | "metadata": { 432 | "collapsed": false 433 | }, 434 | "outputs": [], 435 | "source": [ 436 | "#To speed up training we'll only work on a subset of the data\n", 437 | "data = np.load('mnist.npz')\n", 438 | "num_classes = 10\n", 439 | "x_train = data['X_train'][:1000].astype('float32')\n", 440 | "targets_train = data['y_train'][:1000].astype('int32')\n", 441 | "\n", 442 | "x_valid = data['X_valid'][:500].astype('float32')\n", 443 | "targets_valid = data['y_valid'][:500].astype('int32')\n", 444 | "\n", 445 | "x_test = data['X_test'][:500].astype('float32')\n", 446 | "targets_test = data['y_test'][:500].astype('int32')" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "metadata": { 453 | "collapsed": false 454 | }, 455 | "outputs": [], 456 | "source": [ 457 | "#plot a few MNIST examples\n", 458 | "idx = 0\n", 459 | "canvas = np.zeros((28*10, 10*28))\n", 460 | "for i in range(10):\n", 461 | " for j in range(10):\n", 462 | " canvas[i*28:(i+1)*28, j*28:(j+1)*28] = x_train[idx].reshape((28, 28))\n", 463 | " idx += 1\n", 464 | "plt.figure(figsize=(7, 7))\n", 465 | "plt.axis('off')\n", 466 | "plt.imshow(canvas, cmap='gray')\n", 467 | "plt.title('MNIST handwritten digits')\n", 468 | "plt.show()" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": null, 474 | "metadata": { 475 | "collapsed": false 476 | }, 477 | "outputs": [], 478 | "source": [ 479 | "#defined the model\n", 480 | "num_class = 10\n", 481 | "num_features = x_train.shape[1]\n", 482 | "\n", 483 | "l_in = InputLayer(shape=(None,num_features))\n", 484 | "l_hid = DenseLayer(incoming=l_in, num_units=500, nonlinearity=elu)\n", 485 | "l_out = DenseLayer(incoming=l_hid, num_units=num_class, nonlinearity=softmax)" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": 
null, 491 | "metadata": { 492 | "collapsed": true 493 | }, 494 | "outputs": [], 495 | "source": [ 496 | "sym_x = T.matrix('sym_x') # a symbolic variable taking on the value of a input batch.\n", 497 | "sym_t = T.ivector('sym_t') # a symbolic variable taking on the value of the target batch.\n", 498 | "\n", 499 | "# Get network output\n", 500 | "train_out = lasagne.layers.get_output(l_out, sym_x, deterministic=False)\n", 501 | "eval_out = lasagne.layers.get_output(l_out, sym_x, deterministic=True)\n", 502 | "\n", 503 | "\n", 504 | "# Get list of all trainable parameters in the network.\n", 505 | "all_params = lasagne.layers.get_all_params(l_out, trainable=True)\n", 506 | "\n", 507 | "cost = T.nnet.categorical_crossentropy(train_out+1e-8, sym_t).mean()\n", 508 | "# Let Theano do its magic and get all the gradients we need for training\n", 509 | "all_grads = T.grad(cost, all_params)\n", 510 | "\n", 511 | "\n", 512 | "# Set the update function for parameters \n", 513 | "# you might wan't to experiment with more advanded update schemes like rmsprob, adadelta etc.\n", 514 | "updates = lasagne.updates.sgd(all_grads, all_params, learning_rate=0.1)\n", 515 | "\n", 516 | "\n", 517 | "f_eval = theano.function([sym_x],\n", 518 | " eval_out, on_unused_input='warn')\n", 519 | "\n", 520 | "f_train = theano.function([sym_x, sym_t],\n", 521 | " [cost],\n", 522 | " updates=updates, on_unused_input='warn')" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": null, 528 | "metadata": { 529 | "collapsed": false 530 | }, 531 | "outputs": [], 532 | "source": [ 533 | "#Test the forward pass\n", 534 | "x = np.random.normal(0,1, (45, 28*28)).astype('float32') #dummy data\n", 535 | "\n", 536 | "model = lasagne.layers.get_output(l_out, sym_x)\n", 537 | "out = model.eval({sym_x:x}) #this could also include mask etc if used\n", 538 | "print \"l_out\", out.shape" 539 | ] 540 | }, 541 | { 542 | "cell_type": "markdown", 543 | "metadata": {}, 544 | "source": [ 545 | "# Build 
the training loop.\n", 546 | "We train the network by calculating the gradient of the cost function w.r.t. the parameters and updating the parameters in the direction of the negative gradient. \n", 547 | "\n", 548 | "\n", 549 | "When training neural networks you always use mini-batches. Instead of calculating the average gradient using the entire dataset you approximate the gradient using a mini-batch of typically 16 to 256 samples. The parameters are updated after each mini-batch. Networks converge much faster using mini-batches because the parameters are updated more often.\n", 550 | "\n", 551 | "We build a loop that iterates over the training data. Remember that the parameters are updated each time ``f_train`` is called." 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": null, 557 | "metadata": { 558 | "collapsed": false 559 | }, 560 | "outputs": [], 561 | "source": [ 562 | "from confusionmatrix import ConfusionMatrix\n", 563 | "batch_size = 100\n", 564 | "num_epochs = 100\n", 565 | "learning_rate = 0.1\n", 566 | "num_samples_train = x_train.shape[0]\n", 567 | "num_batches_train = num_samples_train // batch_size\n", 568 | "num_samples_valid = x_valid.shape[0]\n", 569 | "num_batches_valid = num_samples_valid // batch_size\n", 570 | "\n", 571 | "train_acc, train_loss = [], []\n", 572 | "valid_acc, valid_loss = [], []\n", 573 | "test_acc, test_loss = [], []\n", 574 | "cur_loss = 0\n", 575 | "loss = []\n", 576 | "for epoch in range(num_epochs):\n", 577 | " #Forward->Backprob->Update params\n", 578 | " cur_loss = 0\n", 579 | " for i in range(num_batches_train):\n", 580 | " idx = range(i*batch_size, (i+1)*batch_size)\n", 581 | " x_batch = x_train[idx]\n", 582 | " target_batch = targets_train[idx] \n", 583 | " batch_loss = f_train(x_batch,target_batch) #this will do the complete backprob pass\n", 584 | " cur_loss += batch_loss[0]\n", 585 | " loss += [cur_loss/batch_size]\n", 586 | " \n", 587 | " confusion_valid = ConfusionMatrix(num_classes)\n", 588 | " confusion_train 
= ConfusionMatrix(num_classes)\n", 589 | "\n", 590 | " for i in range(num_batches_train):\n", 591 | " idx = range(i*batch_size, (i+1)*batch_size)\n", 592 | " x_batch = x_train[idx]\n", 593 | " targets_batch = targets_train[idx]\n", 594 | " net_out = f_eval(x_batch) \n", 595 | " preds = np.argmax(net_out, axis=-1) \n", 596 | " confusion_train.batch_add(targets_batch, preds)\n", 597 | "\n", 598 | " confusion_valid = ConfusionMatrix(num_classes)\n", 599 | " for i in range(num_batches_valid):\n", 600 | " idx = range(i*batch_size, (i+1)*batch_size)\n", 601 | " x_batch = x_valid[idx]\n", 602 | " targets_batch = targets_valid[idx]\n", 603 | " net_out = f_eval(x_batch) \n", 604 | " preds = np.argmax(net_out, axis=-1) \n", 605 | " \n", 606 | " confusion_valid.batch_add(targets_batch, preds)\n", 607 | " \n", 608 | " train_acc_cur = confusion_train.accuracy()\n", 609 | " valid_acc_cur = confusion_valid.accuracy()\n", 610 | "\n", 611 | " train_acc += [train_acc_cur]\n", 612 | " valid_acc += [valid_acc_cur]\n", 613 | " print \"Epoch %i : Train Loss %e , Train acc %f, Valid acc %f \" \\\n", 614 | " % (epoch+1, loss[-1], train_acc_cur, valid_acc_cur)\n", 615 | " \n", 616 | " \n", 617 | "epoch = np.arange(len(train_acc))\n", 618 | "plt.figure()\n", 619 | "plt.plot(epoch,train_acc,'r',epoch,valid_acc,'b')\n", 620 | "plt.legend(['Train Acc','Val Acc'])\n", 621 | "plt.xlabel('Updates'), plt.ylabel('Acc')" 622 | ] 623 | }, 624 | { 625 | "cell_type": "markdown", 626 | "metadata": {}, 627 | "source": [ 628 | "#More questions" 629 | ] 630 | }, 631 | { 632 | "cell_type": "markdown", 633 | "metadata": {}, 634 | "source": [ 635 | "1. Do you see overfitting? Google overfitting if you don't know how to spot it\n", 636 | "2. Regularization is a method to reduce overfitting. Adding noise to your network is a popular method to fight overfitting. Try using Dropout in your network. 
[Lasagne DropoutLayer](http://lasagne.readthedocs.io/en/latest/modules/layers/noise.html#lasagne.layers.DropoutLayer).\n", 637 | "3. Alternatively you can regularize your network by penalizing the L2 or L1 norm of the network parameters. [Read the docs for more info](http://lasagne.readthedocs.io/en/latest/modules/regularization.html). " 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": null, 643 | "metadata": { 644 | "collapsed": true 645 | }, 646 | "outputs": [], 647 | "source": [] 648 | } 649 | ], 650 | "metadata": { 651 | "kernelspec": { 652 | "display_name": "Python 2", 653 | "language": "python", 654 | "name": "python2" 655 | }, 656 | "language_info": { 657 | "codemirror_mode": { 658 | "name": "ipython", 659 | "version": 2 660 | }, 661 | "file_extension": ".py", 662 | "mimetype": "text/x-python", 663 | "name": "python", 664 | "nbconvert_exporter": "python", 665 | "pygments_lexer": "ipython2", 666 | "version": "2.7.11" 667 | } 668 | }, 669 | "nbformat": 4, 670 | "nbformat_minor": 0 671 | } 672 | -------------------------------------------------------------------------------- /lab1/mnist.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeepLearningDTU/nvidia_deep_learning_summercamp_2016/9d87d409eeebed4b8cf39c42987fb5e786d3ae28/lab1/mnist.npz -------------------------------------------------------------------------------- /lab2/confusionmatrix.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class ConfusionMatrix: 5 | """ 6 | Simple confusion matrix class 7 | row is the true class, column is the predicted class 8 | """ 9 | def __init__(self, num_classes, class_names=None): 10 | self.n_classes = num_classes 11 | if class_names is None: 12 | self.class_names = map(str, range(num_classes)) 13 | else: 14 | self.class_names = class_names 15 | 16 | # find max class_name and pad 17 | max_len = 
max(map(len, self.class_names)) 18 | self.max_len = max_len 19 | for idx, name in enumerate(self.class_names): 20 | if len(self.class_names) < max_len: 21 | self.class_names[idx] = name + " "*(max_len-len(name)) 22 | 23 | self.mat = np.zeros((num_classes,num_classes),dtype='int') 24 | 25 | def __str__(self): 26 | # calucate row and column sums 27 | col_sum = np.sum(self.mat, axis=1) 28 | row_sum = np.sum(self.mat, axis=0) 29 | 30 | s = [] 31 | 32 | mat_str = self.mat.__str__() 33 | mat_str = mat_str.replace('[','').replace(']','').split('\n') 34 | 35 | for idx, row in enumerate(mat_str): 36 | if idx == 0: 37 | pad = " " 38 | else: 39 | pad = "" 40 | class_name = self.class_names[idx] 41 | class_name = " " + class_name + " |" 42 | row_str = class_name + pad + row 43 | row_str += " |" + str(col_sum[idx]) 44 | s.append(row_str) 45 | 46 | row_sum = [(self.max_len+4)*" "+" ".join(map(str, row_sum))] 47 | hline = [(1+self.max_len)*" "+"-"*len(row_sum[0])] 48 | 49 | s = hline + s + hline + row_sum 50 | 51 | # add linebreaks 52 | s_out = [line+'\n' for line in s] 53 | return "".join(s_out) 54 | 55 | def batch_add(self, targets, preds): 56 | assert targets.shape == preds.shape 57 | assert len(targets) == len(preds) 58 | assert max(targets) < self.n_classes 59 | assert max(preds) < self.n_classes 60 | targets = targets.flatten() 61 | preds = preds.flatten() 62 | for i in range(len(targets)): 63 | self.mat[targets[i], preds[i]] += 1 64 | 65 | def get_errors(self): 66 | tp = np.asarray(np.diag(self.mat).flatten(),dtype='float') 67 | fn = np.asarray(np.sum(self.mat, axis=1).flatten(),dtype='float') - tp 68 | fp = np.asarray(np.sum(self.mat, axis=0).flatten(),dtype='float') - tp 69 | tn = np.asarray(np.sum(self.mat)*np.ones(self.n_classes).flatten(), 70 | dtype='float') - tp - fn - fp 71 | return tp, fn, fp, tn 72 | 73 | def accuracy(self): 74 | """ 75 | Calculates global accuracy 76 | :return: accuracy 77 | :example: >>> conf = ConfusionMatrix(3) 78 | >>> 
conf.batchAdd([0,0,1],[0,0,2]) 79 | >>> print conf.accuracy() 80 | """ 81 | tp, _, _, _ = self.get_errors() 82 | n_samples = np.sum(self.mat) 83 | return np.sum(tp) / n_samples 84 | 85 | def sensitivity(self): 86 | tp, tn, fp, fn = self.get_errors() 87 | res = tp / (tp + fn) 88 | res = res[~np.isnan(res)] 89 | return res 90 | 91 | def specificity(self): 92 | tp, tn, fp, fn = self.get_errors() 93 | res = tn / (tn + fp) 94 | res = res[~np.isnan(res)] 95 | return res 96 | 97 | def positive_predictive_value(self): 98 | tp, tn, fp, fn = self.get_errors() 99 | res = tp / (tp + fp) 100 | res = res[~np.isnan(res)] 101 | return res 102 | 103 | def negative_predictive_value(self): 104 | tp, tn, fp, fn = self.get_errors() 105 | res = tn / (tn + fn) 106 | res = res[~np.isnan(res)] 107 | return res 108 | 109 | def false_positive_rate(self): 110 | tp, tn, fp, fn = self.get_errors() 111 | res = fp / (fp + tn) 112 | res = res[~np.isnan(res)] 113 | return res 114 | 115 | def false_discovery_rate(self): 116 | tp, tn, fp, fn = self.get_errors() 117 | res = fp / (tp + fp) 118 | res = res[~np.isnan(res)] 119 | return res 120 | 121 | def F1(self): 122 | tp, tn, fp, fn = self.get_errors() 123 | res = (2*tp) / (2*tp + fp + fn) 124 | res = res[~np.isnan(res)] 125 | return res 126 | 127 | def matthews_correlation(self): 128 | tp, tn, fp, fn = self.get_errors() 129 | numerator = tp*tn - fp*fn 130 | denominator = np.sqrt((tp + fp)*(tp + fn)*(tn + fp)*(tn + fn)) 131 | res = numerator / denominator 132 | res = res[~np.isnan(res)] 133 | return res 134 | -------------------------------------------------------------------------------- /lab2/lab2_CNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "import matplotlib\n", 13 | "import numpy as np\n", 14 | "import 
matplotlib.pyplot as plt\n", 15 | "import sklearn.datasets\n", 16 | "import theano\n", 17 | "import theano.tensor as T\n", 18 | "import lasagne\n", 19 | " " 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "# Convolutional Neural networks 101\n", 27 | "\n", 28 | "Convolutional neural networks are one of the most successful types of neural networks for image recognition and an integral part of reigniting the interest in neural networks. \n", 29 | "\n", 30 | "In this lab we'll experiment with inserting 2D-convolution layers in the fully connected neural networks introduced in LAB1. We'll further experiment with stacking of convolution layers, max pooling and strided convolutions which are all important techniques in current convolutional neural network architectures. Lastly we'll try to visualize the learned convolution filters and try to understand what kind of features they learn to recognize.\n", 31 | "\n", 32 | "\n", 33 | "If you are unfamiliar with the convolution operation, https://github.com/vdumoulin/conv_arithmetic has a nice visualization of different convolution variants. For a more in-depth tutorial please see http://cs231n.github.io/convolutional-networks/ or http://neuralnetworksanddeeplearning.com/chap6.html. Lastly if you are ambitious and want to implement a convolutional neural network from scratch please see an exercise from our Deep Learning summer school last year https://github.com/DTU-deeplearning/day2-Conv" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": { 40 | "collapsed": false 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "#LOAD the mnist data. 
To speed up training we'll only work on a subset of the data.\n", 45 | "#Note that we reshape the data from (nsamples, num_features)= (nsamples, nchannels*rows*cols) -> (nsamples, nchannels, rows, cols)\n", 46 | "# in order to retain the spatial arrangements of the pixels\n", 47 | "data = np.load('mnist.npz')\n", 48 | "num_classes = 10\n", 49 | "nchannels,rows,cols = 1,28,28\n", 50 | "x_train = data['X_train'][:10000].astype('float32')\n", 51 | "x_train = x_train.reshape((-1,nchannels,rows,cols))\n", 52 | "targets_train = data['y_train'][:10000].astype('int32')\n", 53 | "\n", 54 | "x_valid = data['X_valid'][:500].astype('float32')\n", 55 | "x_valid = x_valid.reshape((-1,nchannels,rows,cols))\n", 56 | "targets_valid = data['y_valid'][:500].astype('int32')\n", 57 | "\n", 58 | "x_test = data['X_test'][:500].astype('float32')\n", 59 | "x_test = x_test.reshape((-1,nchannels,rows,cols))\n", 60 | "targets_test = data['y_test'][:500].astype('int32')" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "#plot a few MNIST examples\n", 72 | "idx = 0\n", 73 | "canvas = np.zeros((28*10, 10*28))\n", 74 | "for i in range(10):\n", 75 | " for j in range(10):\n", 76 | " canvas[i*28:(i+1)*28, j*28:(j+1)*28] = x_train[idx].reshape((28, 28))\n", 77 | " idx += 1\n", 78 | "plt.figure(figsize=(7, 7))\n", 79 | "plt.imshow(canvas, cmap='gray')\n", 80 | "plt.title('MNIST handwritten digits')\n", 81 | "plt.show()" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": false 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "#Define a simple feed forward neural network\n", 93 | "\n", 94 | "from lasagne.nonlinearities import leaky_rectify, softmax, tanh, elu\n", 95 | "from lasagne.layers import InputLayer, DenseLayer, Conv2DLayer, batch_norm, DropoutLayer, MaxPool2DLayer\n", 96 | "\n", 97 | "#defined the model\n", 98 | 
"num_class = 10\n", 99 | "num_features = x_train.shape[1]\n", 100 | "\n", 101 | "l_in = InputLayer(shape=(None,nchannels,rows,cols)) #note that we use a 4D input since we need to retain the spatial arrangement of the pixels when working with convolutions.\n", 102 | "#l_conv = Conv2DLayer(l_in,num_filters=16,filter_size=5)\n", 103 | "l_hid = DenseLayer(l_in, num_units=100, nonlinearity=elu) #remember to connect the new conv-layer here\n", 104 | "l_out = DenseLayer(l_hid, num_units=num_class, nonlinearity=softmax)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": { 111 | "collapsed": false 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "#Setting up the graph in theano\n", 116 | "sym_x = T.tensor4('sym_x') # a symbolic variable, this is now a 4-D tensor.\n", 117 | "sym_t = T.ivector('sym_t') # a symbolic variable taking on the value of the target batch.\n", 118 | "\n", 119 | "# Get network output\n", 120 | "train_out = lasagne.layers.get_output(l_out, sym_x, deterministic=False)\n", 121 | "eval_out = lasagne.layers.get_output(l_out, sym_x, deterministic=True)\n", 122 | "\n", 123 | "\n", 124 | "# Get list of all trainable parameters in the network.\n", 125 | "all_params = lasagne.layers.get_all_params(l_out, trainable=True)\n", 126 | "\n", 127 | "cost = T.nnet.categorical_crossentropy(train_out+1e-8, sym_t).mean()\n", 128 | "# Let Theano do its magic and get all the gradients we need for training\n", 129 | "all_grads = T.grad(cost, all_params)\n", 130 | "\n", 131 | "\n", 132 | "# Set the update function for parameters \n", 133 | "# you might wan't to experiment with more advanded update schemes like rmsprob, adadelta etc.\n", 134 | "updates = lasagne.updates.adam(all_grads, all_params, learning_rate=0.001)\n", 135 | "\n", 136 | "\n", 137 | "f_eval = theano.function([sym_x],\n", 138 | " eval_out, on_unused_input='warn')\n", 139 | "\n", 140 | "f_train = theano.function([sym_x, sym_t],\n", 141 | " [cost],\n", 142 
| " updates=updates, on_unused_input='warn')" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": { 149 | "collapsed": false 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "#Test the forward pass\n", 154 | "x = np.random.normal(0,1, (45, 1,28,28)).astype('float32') #dummy data\n", 155 | "\n", 156 | "model = lasagne.layers.get_output(l_out, sym_x)\n", 157 | "out = model.eval({sym_x:x}) #this could also include mask etc if used\n", 158 | "print(\"l_out\", out.shape)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": { 165 | "collapsed": false 166 | }, 167 | "outputs": [], 168 | "source": [ 169 | "#Training Loop\n", 170 | "from confusionmatrix import ConfusionMatrix\n", 171 | "batch_size = 100\n", 172 | "num_epochs = 10\n", 173 | "num_samples_train = x_train.shape[0]\n", 174 | "num_batches_train = num_samples_train // batch_size\n", 175 | "num_samples_valid = x_valid.shape[0]\n", 176 | "num_batches_valid = num_samples_valid // batch_size\n", 177 | "\n", 178 | "train_acc, train_loss = [], []\n", 179 | "valid_acc, valid_loss = [], []\n", 180 | "test_acc, test_loss = [], []\n", 181 | "cur_loss = 0\n", 182 | "loss = []\n", 183 | "try:\n", 184 | " for epoch in range(num_epochs):\n", 185 | " #Forward->Backprob->Update params\n", 186 | " cur_loss = 0\n", 187 | " for i in range(num_batches_train):\n", 188 | " idx = range(i*batch_size, (i+1)*batch_size)\n", 189 | " x_batch = x_train[idx]\n", 190 | " target_batch = targets_train[idx] \n", 191 | " batch_loss = f_train(x_batch,target_batch) #this will do the complete backprob pass\n", 192 | " cur_loss += batch_loss[0]\n", 193 | " loss += [cur_loss/batch_size]\n", 194 | "\n", 195 | " confusion_valid = ConfusionMatrix(num_classes)\n", 196 | " confusion_train = ConfusionMatrix(num_classes)\n", 197 | "\n", 198 | " for i in range(num_batches_train):\n", 199 | " idx = range(i*batch_size, (i+1)*batch_size)\n", 200 | " x_batch = 
x_train[idx]\n", 201 | " targets_batch = targets_train[idx]\n", 202 | " net_out = f_eval(x_batch) \n", 203 | " preds = np.argmax(net_out, axis=-1) \n", 204 | " confusion_train.batch_add(targets_batch, preds)\n", 205 | "\n", 206 | " confusion_valid = ConfusionMatrix(num_classes)\n", 207 | " for i in range(num_batches_valid):\n", 208 | " idx = range(i*batch_size, (i+1)*batch_size)\n", 209 | " x_batch = x_valid[idx]\n", 210 | " targets_batch = targets_valid[idx]\n", 211 | " net_out = f_eval(x_batch) \n", 212 | " preds = np.argmax(net_out, axis=-1) \n", 213 | "\n", 214 | " confusion_valid.batch_add(targets_batch, preds)\n", 215 | "\n", 216 | " train_acc_cur = confusion_train.accuracy()\n", 217 | " valid_acc_cur = confusion_valid.accuracy()\n", 218 | "\n", 219 | " train_acc += [train_acc_cur]\n", 220 | " valid_acc += [valid_acc_cur]\n", 221 | " print \"Epoch %i : Train Loss %e , Train acc %f, Valid acc %f \" \\\n", 222 | " % (epoch+1, loss[-1], train_acc_cur, valid_acc_cur)\n", 223 | "except KeyboardInterrupt:\n", 224 | " pass\n", 225 | " \n", 226 | "\n", 227 | "#get test set score\n", 228 | "confusion_test = ConfusionMatrix(num_classes)\n", 229 | "net_out = f_eval(x_test) \n", 230 | "preds = np.argmax(net_out, axis=-1) \n", 231 | "confusion_test.batch_add(targets_test, preds)\n", 232 | "print \"\\nTest set Acc: %f\" %(confusion_test.accuracy())\n", 233 | "\n", 234 | "\n", 235 | "epoch = np.arange(len(train_acc))\n", 236 | "plt.figure()\n", 237 | "plt.plot(epoch,train_acc,'r',epoch,valid_acc,'b')\n", 238 | "plt.legend(['Train Acc','Val Acc'])\n", 239 | "plt.xlabel('Epochs'), plt.ylabel('Acc'), plt.ylim([0.75,1.03])" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "# Assignments 1\n", 247 | "\n", 248 | " 1) Note the performance of the standard feedforward neural network. 
Add a 2D convolution layer before the dense hidden layer and confirm that it increases the generalization performance of the network (try num_filters=16 and filter_size=5 as a starting point). \n", 249 | " \n", 250 | " 2) Can the performance be increased even further by stacking more convolution layers?\n", 251 | " \n", 252 | " 3) Maxpooling is a technique for decreasing the spatial resolution of an image while retaining the important features. Effectively this gives a local translational invariance, which is usually desirable in a classification algorithm, and reduces the computation by a factor of four. Try to either: \n", 253 | " \n", 254 | " a) add a maxpool layer (with argument pool_size=2) after the convolution layer or\n", 255 | " b) add stride=2 to the arguments of the convolution layer. \n", 256 | " Verify that this decreases the spatial dimensions of the image (print l_conv.output_shape or print l_maxpool.output_shape). Does this increase the performance of the network (you may need to stack multiple layers or increase the number of filters to increase performance)?\n", 257 | " \n" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "# Visualization of filters\n", 265 | "Convolution filters can be interpreted as spatial feature detectors picking up different image features such as edges, corners etc. Below we provide code for visualization of the filters. The best results are obtained with fairly large filters of size 9 and either 16 or 36 filters. " 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": { 272 | "collapsed": false 273 | }, 274 | "outputs": [], 275 | "source": [ 276 | "### If you get an error with l_conv not being defined you need define l_conv when the network is defined!\n", 277 | "np_W = l_conv.W.get_value() #get the filter values from the conv layer\n", 278 | "print np_W.shape, \"i.e. 
the shape is num_filters, num_channels, filter_size, filter_size\"\n", 279 | "num_filters,num_channels,filter_size,_= np_W.shape\n", 280 | "n = int(num_filters**0.5)\n", 281 | "\n", 282 | "np_W_res = np_W.reshape(n,n,num_channels,filter_size,filter_size)\n", 283 | "fig, ax = plt.subplots(n,n)\n", 284 | "print \"learned filter values\"\n", 285 | "for i in range(n):\n", 286 | " for j in range(n):\n", 287 | " ax[i,j].imshow(np_W_res[i,j,0], cmap='gray',interpolation='none')\n", 288 | " ax[i,j].xaxis.set_major_formatter(plt.NullFormatter())\n", 289 | " ax[i,j].yaxis.set_major_formatter(plt.NullFormatter())\n", 290 | "\n", 291 | "\n", 292 | "idx = 1\n", 293 | "plt.figure()\n", 294 | "plt.imshow(x_train[idx,0],cmap='gray',interpolation='none')\n", 295 | "plt.title('Inut Image')\n", 296 | "plt.show()\n", 297 | "\n", 298 | "#visalize the filters convolved with an input image\n", 299 | "from scipy.signal import convolve2d\n", 300 | "np_W_res = np_W.reshape(n,n,num_channels,filter_size,filter_size)\n", 301 | "fig, ax = plt.subplots(n,n,figsize=(9,9))\n", 302 | "print \"Response from input image convolved with the filters\"\n", 303 | "for i in range(n):\n", 304 | " for j in range(n):\n", 305 | " ax[i,j].imshow(convolve2d(x_train[1,0],np_W_res[i,j,0],mode='same'), cmap='gray',interpolation='none')\n", 306 | " ax[i,j].xaxis.set_major_formatter(plt.NullFormatter())\n", 307 | " ax[i,j].yaxis.set_major_formatter(plt.NullFormatter())\n", 308 | "\n" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": {}, 314 | "source": [ 315 | "# Assignment 2\n", 316 | "\n", 317 | "The visualized filters will likely look most like noise due to the small amount of training data.\n", 318 | "\n", 319 | " 1) Try to use 10000 traning examples instead and visualise the filters again\n", 320 | " \n", 321 | " 2) Dropout is a very usefull technique for preventing overfitting. Try to add a DropoutLayer after the convolution layer and hidden layer. 
This should increase both performance and the \"visual appeal\" of the filters\n", 322 | " \n", 323 | " 3) Batch normalization is a recent innovation for improving generalization performance. Try to insert batch normalization layers into the network to improve performance. \n", 324 | " \n", 325 | " \n" 326 | ] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "metadata": {}, 331 | "source": [ 332 | "# More Fun with convolutional networks\n", 333 | "### Get the data" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": { 340 | "collapsed": false 341 | }, 342 | "outputs": [], 343 | "source": [ 344 | "!wget -N https://s3.amazonaws.com/lasagne/recipes/datasets/mnist_cluttered_60x60_6distortions.npz" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": {}, 350 | "source": [ 351 | "In this dataset each MNIST digit (20x20 pixels) has been placed randomly in a 60x60 canvas. To make the task harder each canvas has then been cluttered with small pieces of digits. In this task it is helpful for a network if it can focus only on the digit and ignore the rest.\n", 352 | "\n", 353 | "The ``TransformerLayer`` lets us do this. The transformer layer learns an affine transformation which lets the network zoom, rotate and skew. If you are interested you should read the paper, but the main idea is that you can let a small convolutional network determine the parameters of the affine transformation. You then apply the affine transformation to the input data. Usually this also involves downsampling which forces the model to zoom in on the relevant parts of the data. After the affine transformation we can use a larger conv net to do the classification. \n", 354 | "This is possible because you can backprop through an affine transformation if you use bilinear interpolation." 
355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "metadata": { 361 | "collapsed": false 362 | }, 363 | "outputs": [], 364 | "source": [ 365 | "import os\n", 366 | "import matplotlib\n", 367 | "import numpy as np\n", 368 | "np.random.seed(123)\n", 369 | "import matplotlib.pyplot as plt\n", 370 | "import lasagne\n", 371 | "import theano\n", 372 | "import theano.tensor as T\n", 373 | "conv = lasagne.layers.Conv2DLayer\n", 374 | "pool = lasagne.layers.MaxPool2DLayer\n", 375 | "NUM_EPOCHS = 500\n", 376 | "BATCH_SIZE = 256\n", 377 | "LEARNING_RATE = 0.001\n", 378 | "DIM = 60\n", 379 | "NUM_CLASSES = 10\n", 380 | "mnist_cluttered = \"mnist_cluttered_60x60_6distortions.npz\"\n", 381 | "\n", 382 | "\n" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": { 389 | "collapsed": false 390 | }, 391 | "outputs": [], 392 | "source": [ 393 | "def load_data():\n", 394 | " data = np.load(mnist_cluttered)\n", 395 | " X_train, y_train = data['x_train'], np.argmax(data['y_train'], axis=-1)\n", 396 | " X_valid, y_valid = data['x_valid'], np.argmax(data['y_valid'], axis=-1)\n", 397 | " X_test, y_test = data['x_test'], np.argmax(data['y_test'], axis=-1)\n", 398 | "\n", 399 | " # reshape for convolutions\n", 400 | " X_train = X_train.reshape((X_train.shape[0], 1, DIM, DIM))\n", 401 | " X_valid = X_valid.reshape((X_valid.shape[0], 1, DIM, DIM))\n", 402 | " X_test = X_test.reshape((X_test.shape[0], 1, DIM, DIM))\n", 403 | " \n", 404 | " print \"Train samples:\", X_train.shape\n", 405 | " print \"Validation samples:\", X_valid.shape\n", 406 | " print \"Test samples:\", X_test.shape\n", 407 | "\n", 408 | " return dict(\n", 409 | " X_train=lasagne.utils.floatX(X_train),\n", 410 | " y_train=y_train.astype('int32'),\n", 411 | " X_valid=lasagne.utils.floatX(X_valid),\n", 412 | " y_valid=y_valid.astype('int32'),\n", 413 | " X_test=lasagne.utils.floatX(X_test),\n", 414 | " y_test=y_test.astype('int32'),\n", 415 | 
" num_examples_train=X_train.shape[0],\n", 416 | " num_examples_valid=X_valid.shape[0],\n", 417 | " num_examples_test=X_test.shape[0],\n", 418 | " input_height=X_train.shape[2],\n", 419 | " input_width=X_train.shape[3],\n", 420 | " output_dim=10,)\n", 421 | "data = load_data()\n", 422 | "\n", 423 | "idx = 0\n", 424 | "canvas = np.zeros((DIM*10, 10*DIM))\n", 425 | "for i in range(10):\n", 426 | " for j in range(10):\n", 427 | " canvas[i*DIM:(i+1)*DIM, j*DIM:(j+1)*DIM] = data['X_train'][idx].reshape((DIM, DIM))\n", 428 | " idx += 1\n", 429 | "plt.figure(figsize=(10, 10))\n", 430 | "plt.imshow(canvas, cmap='gray')\n", 431 | "plt.title('Cluttered handwritten digits')\n", 432 | "plt.axis('off')\n", 433 | "\n", 434 | "plt.show()" 435 | ] 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "metadata": {}, 440 | "source": [ 441 | "## Building the model\n", 442 | "\n", 443 | "We use a model where the localization network is a two layer convolution network which operates directly on the image input. The output from the localization network is a 6 dimensional vector specifying the parameters in the affine transformation.\n", 444 | "\n", 445 | "We set up the transformer layer to initially do the identity transform, similarly to [1]. If the output from the localization networks is [t1, t2, t3, t4, t5, t6] then t1 and t5 determines zoom, t2 and t4 determines skewness, and t3 and t6 move the center position. By setting the initial values of the bias vector to \n", 446 | "\n", 447 | "```\n", 448 | "|1, 0, 0|\n", 449 | "|0, 1, 0|\n", 450 | "```\n", 451 | "and the final W of the localization network to all zeros we ensure that in the beginning of training the network works as a pooling layer. \n", 452 | "\n", 453 | "The output of the localization layer feeds into the transformer layer which applies the transformation to the image input. 
In our setup the transformer layer downsamples the input by a factor 3.\n", 454 | "\n", 455 | "Finally a 2 layer convolution layer and 2 fully connected layers calculates the output probabilities.\n", 456 | "\n", 457 | "\n", 458 | "### The model\n", 459 | "```\n", 460 | "Input -> localization_network -> TransformerLayer -> output_network -> predictions\n", 461 | " | |\n", 462 | " >--------------------------------^\n", 463 | "```\n", 464 | "\n", 465 | "\n" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": null, 471 | "metadata": { 472 | "collapsed": false 473 | }, 474 | "outputs": [], 475 | "source": [ 476 | "def build_model(input_width, input_height, output_dim,\n", 477 | " batch_size=BATCH_SIZE):\n", 478 | " ini = lasagne.init.HeUniform()\n", 479 | " l_in = lasagne.layers.InputLayer(shape=(None, 1, input_width, input_height),)\n", 480 | "\n", 481 | " # Localization network\n", 482 | " b = np.zeros((2, 3), dtype=theano.config.floatX)\n", 483 | " b[0, 0] = 1\n", 484 | " b[1, 1] = 1\n", 485 | " b = b.flatten()\n", 486 | " loc_l1 = pool(l_in, pool_size=(2, 2))\n", 487 | " loc_l2 = conv(\n", 488 | " loc_l1, num_filters=8, filter_size=(5, 5), W=ini)\n", 489 | " loc_l3 = pool(loc_l2, pool_size=(2, 2))\n", 490 | " loc_l4 = conv(loc_l3, num_filters=8, filter_size=(5, 5), W=ini)\n", 491 | " loc_l5 = lasagne.layers.DenseLayer(\n", 492 | " loc_l4, num_units=50, W=lasagne.init.HeUniform('relu'))\n", 493 | " loc_out = lasagne.layers.DenseLayer(\n", 494 | " loc_l5, num_units=6, b=b, W=lasagne.init.Constant(0.0), \n", 495 | " nonlinearity=lasagne.nonlinearities.identity)\n", 496 | " \n", 497 | " # Transformer network\n", 498 | " l_trans1 = lasagne.layers.TransformerLayer(l_in, loc_out, downsample_factor=3.0)\n", 499 | " print \"Transformer network output shape: \", l_trans1.output_shape\n", 500 | " \n", 501 | " # Classification network\n", 502 | " class_l1 = conv(\n", 503 | " l_trans1,\n", 504 | " num_filters=16,\n", 505 | " filter_size=(3, 3),\n", 506 
| " nonlinearity=lasagne.nonlinearities.rectify,\n", 507 | " W=ini,\n", 508 | " )\n", 509 | " class_l2 = pool(class_l1, pool_size=(2, 2))\n", 510 | " class_l3 = conv(\n", 511 | " class_l2,\n", 512 | " num_filters=16,\n", 513 | " filter_size=(3, 3),\n", 514 | " nonlinearity=lasagne.nonlinearities.rectify,\n", 515 | " W=ini,\n", 516 | " )\n", 517 | " class_l4 = pool(class_l3, pool_size=(2, 2))\n", 518 | " class_l5 = lasagne.layers.DenseLayer(\n", 519 | " class_l4,\n", 520 | " num_units=256,\n", 521 | " nonlinearity=lasagne.nonlinearities.rectify,\n", 522 | " W=ini,\n", 523 | " )\n", 524 | "\n", 525 | " l_out = lasagne.layers.DenseLayer(\n", 526 | " class_l5,\n", 527 | " num_units=output_dim,\n", 528 | " nonlinearity=lasagne.nonlinearities.softmax,\n", 529 | " W=ini,\n", 530 | " )\n", 531 | "\n", 532 | " return l_out, l_trans1\n", 533 | "\n", 534 | "model, l_transform = build_model(DIM, DIM, NUM_CLASSES)\n", 535 | "model_params = lasagne.layers.get_all_params(model, trainable=True)\n" 536 | ] 537 | }, 538 | { 539 | "cell_type": "code", 540 | "execution_count": null, 541 | "metadata": { 542 | "collapsed": false 543 | }, 544 | "outputs": [], 545 | "source": [ 546 | "X = T.tensor4()\n", 547 | "y = T.ivector()\n", 548 | "\n", 549 | "# training output\n", 550 | "output_train = lasagne.layers.get_output(model, X, deterministic=False)\n", 551 | "\n", 552 | "# evaluation output. 
Also includes output of transform for plotting\n", 553 | "output_eval, transform_eval = lasagne.layers.get_output([model, l_transform], X, deterministic=True)\n", 554 | "\n", 555 | "sh_lr = theano.shared(lasagne.utils.floatX(LEARNING_RATE))\n", 556 | "cost = T.mean(T.nnet.categorical_crossentropy(output_train, y))\n", 557 | "updates = lasagne.updates.adam(cost, model_params, learning_rate=sh_lr)\n", 558 | "\n", 559 | "train = theano.function([X, y], [cost, output_train], updates=updates)\n", 560 | "eval = theano.function([X], [output_eval, transform_eval])" 561 | ] 562 | }, 563 | { 564 | "cell_type": "markdown", 565 | "metadata": {}, 566 | "source": [ 567 | "### Training the model\n", 568 | "Unfortunately NVIDIA has yet to squeeze a TitanX into a laptop, and training convnets on CPU is painfully slow. After 10 epochs you should see that the model starts to zoom in on the digits. " 569 | ] 570 | }, 571 | { 572 | "cell_type": "code", 573 | "execution_count": null, 574 | "metadata": { 575 | "collapsed": true 576 | }, 577 | "outputs": [], 578 | "source": [ 579 | "def train_epoch(X, y):\n", 580 | " num_samples = X.shape[0]\n", 581 | " num_batches = int(np.ceil(num_samples / float(BATCH_SIZE)))\n", 582 | " costs = []\n", 583 | " correct = 0\n", 584 | " for i in range(num_batches):\n", 585 | " if i % 10 == 0:\n", 586 | " print i,\n", 587 | " idx = range(i*BATCH_SIZE, np.minimum((i+1)*BATCH_SIZE, num_samples))\n", 588 | " X_batch = X[idx]\n", 589 | " y_batch = y[idx]\n", 590 | " cost_batch, output_train = train(X_batch, y_batch)\n", 591 | " costs += [cost_batch]\n", 592 | " preds = np.argmax(output_train, axis=-1)\n", 593 | " correct += np.sum(y_batch == preds)\n", 594 | " print \"\"\n", 595 | " return np.mean(costs), correct / float(num_samples)\n", 596 | "\n", 597 | "\n", 598 | "def eval_epoch(X, y):\n", 599 | " output_eval, transform_eval = eval(X)\n", 600 | " preds = np.argmax(output_eval, axis=-1)\n", 601 | " acc = np.mean(preds == y)\n", 602 | " return acc, 
transform_eval" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": null, 608 | "metadata": { 609 | "collapsed": false 610 | }, 611 | "outputs": [], 612 | "source": [ 613 | "valid_accs, train_accs, test_accs = [], [], []\n", 614 | "try:\n", 615 | " for n in range(NUM_EPOCHS):\n", 616 | " train_cost, train_acc = train_epoch(data['X_train'], data['y_train'])\n", 617 | " valid_acc, valid_trainsform = eval_epoch(data['X_valid'], data['y_valid'])\n", 618 | " test_acc, test_transform = eval_epoch(data['X_test'], data['y_test'])\n", 619 | " valid_accs += [valid_acc]\n", 620 | " test_accs += [test_acc]\n", 621 | " train_accs += [train_acc]\n", 622 | "\n", 623 | " if (n+1) % 20 == 0:\n", 624 | " new_lr = sh_lr.get_value() * 0.7\n", 625 | " print \"New LR:\", new_lr\n", 626 | " sh_lr.set_value(lasagne.utils.floatX(new_lr))\n", 627 | "\n", 628 | " print \"Epoch {0}: Train cost {1}, Train acc {2}, val acc {3}, test acc {4}\".format(\n", 629 | " n, train_cost, train_acc, valid_acc, test_acc)\n", 630 | "except KeyboardInterrupt:\n", 631 | " pass" 632 | ] 633 | }, 634 | { 635 | "cell_type": "markdown", 636 | "metadata": {}, 637 | "source": [ 638 | "### Plot errors and zoom" 639 | ] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": null, 644 | "metadata": { 645 | "collapsed": false 646 | }, 647 | "outputs": [], 648 | "source": [ 649 | "plt.figure(figsize=(9,9))\n", 650 | "plt.plot(1-np.array(train_accs), label='Training Error')\n", 651 | "plt.plot(1-np.array(valid_accs), label='Validation Error')\n", 652 | "plt.legend(fontsize=20)\n", 653 | "plt.xlabel('Epoch', fontsize=20)\n", 654 | "plt.ylabel('Error', fontsize=20)\n", 655 | "plt.show()" 656 | ] 657 | }, 658 | { 659 | "cell_type": "code", 660 | "execution_count": null, 661 | "metadata": { 662 | "collapsed": false 663 | }, 664 | "outputs": [], 665 | "source": [ 666 | "plt.figure(figsize=(7,14))\n", 667 | "for i in range(3):\n", 668 | " plt.subplot(321+i*2)\n", 669 | " 
plt.imshow(data['X_test'][i].reshape(DIM, DIM), cmap='gray', interpolation='none')\n", 670 | " if i == 0:\n", 671 | " plt.title('Original 60x60', fontsize=20)\n", 672 | " plt.axis('off')\n", 673 | " plt.subplot(322+i*2)\n", 674 | " plt.imshow(test_transform[i].reshape(DIM//3, DIM//3), cmap='gray', interpolation='none')\n", 675 | " if i == 0:\n", 676 | " plt.title('Transformed 20x20', fontsize=20)\n", 677 | " plt.axis('off')\n", 678 | " \n", 679 | " \n", 680 | "plt.tight_layout()" 681 | ] 682 | }, 683 | { 684 | "cell_type": "markdown", 685 | "metadata": { 686 | "collapsed": true 687 | }, 688 | "source": [ 689 | "# A few pointers for image classification\n", 690 | "If you want to do image classification, using a pretrained model is often a good choice, especially if you have limited amounts of labeled data. \n", 691 | "\n", 692 | "Often used pretrained networks are VGG16 and VGG19. Lasagne has pretrained models in the [modelzoo](https://github.com/Lasagne/Recipes/tree/master/modelzoo). Torch7 and Tensorflow have similar pretrained models that you can find with google. \n", 693 | "\n", 694 | "Currently the best performing image network is [ResNet](https://arxiv.org/pdf/1512.03385v1.pdf). Torch7 has an interesting blog post about Residual nets. 
http://torch.ch/blog/2016/02/04/resnets.html\n", 695 | "\n", 696 | "\n" 697 | ] 698 | }, 699 | { 700 | "cell_type": "code", 701 | "execution_count": null, 702 | "metadata": { 703 | "collapsed": true 704 | }, 705 | "outputs": [], 706 | "source": [] 707 | } 708 | ], 709 | "metadata": { 710 | "kernelspec": { 711 | "display_name": "Python 2", 712 | "language": "python", 713 | "name": "python2" 714 | }, 715 | "language_info": { 716 | "codemirror_mode": { 717 | "name": "ipython", 718 | "version": 2 719 | }, 720 | "file_extension": ".py", 721 | "mimetype": "text/x-python", 722 | "name": "python", 723 | "nbconvert_exporter": "python", 724 | "pygments_lexer": "ipython2", 725 | "version": "2.7.11" 726 | } 727 | }, 728 | "nbformat": 4, 729 | "nbformat_minor": 0 730 | } 731 | -------------------------------------------------------------------------------- /lab2/mnist.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DeepLearningDTU/nvidia_deep_learning_summercamp_2016/9d87d409eeebed4b8cf39c42987fb5e786d3ae28/lab2/mnist.npz -------------------------------------------------------------------------------- /lab3/.ipynb_checkpoints/RNN-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline \n", 12 | "%matplotlib nbagg\n", 13 | "import lasagne\n", 14 | "import theano\n", 15 | "import theano.tensor as T\n", 16 | "import matplotlib\n", 17 | "import numpy as np\n", 18 | "import matplotlib.pyplot as plt\n", 19 | "from IPython import display\n", 20 | "from data_generator import get_batch, print_valid_characters\n" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "# Recurrent Neural Networks\n", 28 | "\n", 29 | "Recurrent neural networks are the 
natural type of neural network to use for sequential data, e.g. time series analysis, translation, speech recognition, biological sequence analysis etc. Recurrent neural networks work by recursively applying the same operation at each time step of the data sequence and having layers that pass information from the previous time step to the current one. They can therefore naturally handle input of varying length. Recurrent networks can be used for several prediction tasks including: sequence-to-class, sequence tagging, and sequence-to-sequence predictions.\n", 30 | "\n", 31 | "In this exercise we'll implement an Encoder-Decoder RNN based on the GRU unit for a simple sequence to sequence translation task. This type of model has shown impressive performance in Neural Machine Translation and Image Caption generation. \n", 32 | "\n", 33 | "For more in-depth background material on RNNs please see [Supervised Sequence Labelling with Recurrent\n", 34 | "Neural Networks](https://www.cs.toronto.edu/~graves/preprint.pdf) by Alex Graves" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "# Encoder-Decoder\n", 42 | "In the encoder-decoder structure one RNN (blue) encodes the input and a second RNN (red) calculates the target values. One essential step is to let the encoder and decoder communicate. In the simplest approach you use the last hidden state of the encoder to initialize the decoder. Other approaches let the decoder attend to different parts of the encoded input at different timesteps in the decoding process. \n", 43 | "\n", 44 | "\n", 45 | "\n", 46 | "In our implementation we use an RNN with gated recurrent units (GRU) as encoder. We then use the last hidden state of the encoder ($h^{enc}_T$) as input to the decoder which is also a GRU RNN. \n", 47 | "\n", 48 | "### RNNs in Lasagne\n", 49 | "Lasagne has implementations of LSTM and GRU units. 
Both layers assume that the input from the layer below has the shape **(Batch_size, seq_len, num_features)**. In this exercise we will use the GRU unit since it only stores a single hidden value per neuron (LSTMs store two) and is approximately twice as fast as the LSTM unit.\n", 50 | "\n", 51 | "As stated above we will implement an Encoder-Decoder model. The simplest way to do this is to encode the input sequence using the Encoder model. We will then use the last hidden state of the Encoder $h^{enc}_T$ as input to the decoder model which then uses this information (simply a fixed length vector of numbers) to produce the targets. There are (at least) two ways to input $h^{enc}_T$ into the decoder:\n", 52 | "\n", 53 | "1. Repeatedly use $h^{enc}_T$ as input to the Decoder at each decode time step\n", 54 | "2. Initialize the decoder using $h^{enc}_T$ and run the decoder without any inputs\n", 55 | "\n", 56 | "In this exercise we will follow the first approach because it's easier to implement. To do this we need to create a Lasagne layer that takes $h^{enc}_T$ and repeats it *n_decode_steps* times. Below is an implementation of the RepeatLayer. You don't need to know the exact way it works, however, make sure that you understand that it takes an input of size *(Batch_size x num_units)* and produces an output of size (Batch_size x n_decode_steps x num_units).\n" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "collapsed": true 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "class RepeatLayer(lasagne.layers.Layer):\n", 68 | " def __init__(self, incoming, n, **kwargs):\n", 69 | " '''\n", 70 | " The input is expected to be a 2D tensor of shape \n", 71 | " (num_batch, num_features). 
The input is repeated\n", 72 | " n times such that the output will be \n", 73 | " (num_batch, n, num_features)\n", 74 | " '''\n", 75 | " super(RepeatLayer, self).__init__(incoming, **kwargs)\n", 76 | " self.n = n\n", 77 | "\n", 78 | " def get_output_shape_for(self, input_shape):\n", 79 | " return tuple([input_shape[0], self.n] + list(input_shape[1:]))\n", 80 | "\n", 81 | " def get_output_for(self, input, **kwargs):\n", 82 | " #repeat the input n times\n", 83 | " tensors = [input]*self.n\n", 84 | " stacked = theano.tensor.stack(*tensors)\n", 85 | " dim = [1, 0] + range(2, input.ndim + 1)\n", 86 | " return stacked.dimshuffle(dim)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "### The Data\n", 94 | "Since RNN models can be very slow to train on large real datasets, we will generate some simpler training data for this exercise. The task for the RNN is simply to translate a string of letters spelling the numbers between 0-9 into the corresponding numbers, i.e.\n", 95 | "\n", 96 | "\"one two five\" --> \"125#\" (we use # as a special stop of sequence character)\n", 97 | "\n", 98 | "To input the strings into the RNN model we translate the characters into a vector of integers using a simple translation table (e.g. 'h'->16, 'o'-> 17 etc). The code below prints a few input/output pairs using the *get_batch* function which randomly produces the data." 
99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": { 105 | "collapsed": false 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "batch_size = 3\n", 110 | "inputs, input_masks, targets, target_masks, text_inputs, text_targets = \\\n", 111 | " get_batch(batch_size=batch_size,max_digits=2,min_digits=1)\n", 112 | "\n", 113 | "print \"input types:\", inputs.dtype, input_masks.dtype, targets.dtype, target_masks.dtype\n", 114 | "print print_valid_characters()\n", 115 | "print \"Stop character = #\"\n", 116 | "\n", 117 | "\n", 118 | "for i in range(batch_size):\n", 119 | " print \"\\nSAMPLE\",i\n", 120 | " print \"TEXT INPUTS:\\t\\t\", text_inputs[i]\n", 121 | " print \"TEXT TARGETS:\\t\\t\", text_targets[i]\n", 122 | " print \"ENCODED INPUTS:\\t\\t\", inputs[i]\n", 123 | " print \"MASK INPUTS:\\t\\t\", input_masks[i]\n", 124 | " print \"ENCODED TARGETS:\\t\", targets[i]\n", 125 | " print \"MASK TARGETS:\\t\\t\", target_masks[i]" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "### Encoder Decoder model setup\n", 133 | "Below is the Lasagne model definition. We use an embedding layer to go from integer representation to vector representation of the input.\n", 134 | "\n", 135 | "Note that the layer has a lot of print statements which we used for debugging during setup." 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": { 142 | "collapsed": false 143 | }, 144 | "outputs": [], 145 | "source": [ 146 | "BATCH_SIZE = 100\n", 147 | "NUM_UNITS_ENC = 10\n", 148 | "NUM_UNITS_DEC = 10\n", 149 | "MAX_DIGITS = 20 \n", 150 | "MIN_DIGITS = MAX_DIGITS #currently only support for same length outputs - we'll leave it for an exercise to add support for varying length targets\n", 151 | "NUM_INPUTS = 27\n", 152 | "NUM_OUTPUTS = 11 #(0-9 + '#')\n", 153 | "\n", 154 | "\n", 155 | "#symbolic theano variables. 
Note that we are using imatrix for X since it goes into the embedding layer\n", 156 | "x_sym = T.imatrix()\n", 157 | "y_sym = T.imatrix()\n", 158 | "xmask_sym = T.matrix()\n", 159 | "\n", 160 | "#dummy data to test implementation - We advise to check the output-dimensions of all layers.\n", 161 | "#One way to do this in lasagne/theano is to forward pass some data through the model and \n", 162 | "#check the output dimensions of these.\n", 163 | "#Create some random testdata\n", 164 | "X = np.random.randint(0,10,size=(BATCH_SIZE,MIN_DIGITS)).astype('int32')\n", 165 | "Xmask = np.ones((BATCH_SIZE,MIN_DIGITS)).astype('float32')\n", 166 | "\n", 167 | "##### ENCODER START #####\n", 168 | "l_in = lasagne.layers.InputLayer((None, None))\n", 169 | "l_emb = lasagne.layers.EmbeddingLayer(l_in, NUM_INPUTS, NUM_INPUTS, \n", 170 | " W=np.eye(NUM_INPUTS,dtype='float32'),\n", 171 | " name='Embedding')\n", 172 | "#Here we'll remove the trainable parameters from the embeding layer to constrain \n", 173 | "#it to a simple \"one-hot-encoding\". 
You can experiment with removing this line\n", 174 | "l_emb.params[l_emb.W].remove('trainable') \n", 175 | "#forward pass some data throug the inputlayer-embedding layer and print the output shape\n", 176 | "print lasagne.layers.get_output(l_emb, inputs={l_in: x_sym}).eval({x_sym: X}).shape\n", 177 | "\n", 178 | "l_mask_enc = lasagne.layers.InputLayer((None, None))\n", 179 | "l_enc = lasagne.layers.GRULayer(l_emb, num_units=NUM_UNITS_ENC, name='GRUEncoder', mask_input=l_mask_enc)\n", 180 | "print lasagne.layers.get_output(l_enc, inputs={l_in: x_sym, l_mask_enc: xmask_sym}).eval(\n", 181 | " {x_sym: X, xmask_sym: Xmask}).shape\n", 182 | "\n", 183 | "# slice last index of dimension 1\n", 184 | "l_last_hid = lasagne.layers.SliceLayer(l_enc, indices=-1, axis=1)\n", 185 | "print lasagne.layers.get_output(l_last_hid, inputs={l_in: x_sym, l_mask_enc: xmask_sym}).eval(\n", 186 | " {x_sym: X, xmask_sym: Xmask}).shape\n", 187 | "##### END OF ENCODER######\n", 188 | "\n", 189 | "\n", 190 | "##### START OF DECODER######\n", 191 | "l_in_rep = RepeatLayer(l_last_hid, n=MAX_DIGITS+1) #we add one to allow space for the end of sequence character\n", 192 | "print lasagne.layers.get_output(l_in_rep, inputs={l_in: x_sym, l_mask_enc: xmask_sym}).eval(\n", 193 | " {x_sym: X, xmask_sym: Xmask}).shape\n", 194 | "\n", 195 | "l_dec = lasagne.layers.GRULayer(l_in_rep, num_units=NUM_UNITS_DEC, name='GRUDecoder')\n", 196 | "print lasagne.layers.get_output(l_dec, inputs={l_in: x_sym, l_mask_enc: xmask_sym}).eval(\n", 197 | " {x_sym: X, xmask_sym: Xmask}).shape\n", 198 | "\n", 199 | "\n", 200 | "# We need to do some reshape voodo to connect a softmax layer to the decoder.\n", 201 | "# See http://lasagne.readthedocs.org/en/latest/modules/layers/recurrent.html#examples \n", 202 | "# In short this line changes the shape from \n", 203 | "# (batch_size, decode_len, num_dec_units) -> (batch_size*decodelen,num_dec_units). 
\n", 204 | "# We need to do this since the softmax is applied to the last dimension and we want to \n", 205 | "# softmax the output at each position individually\n", 206 | "l_reshape = lasagne.layers.ReshapeLayer(l_dec, (-1, [2]))\n", 207 | "print lasagne.layers.get_output(l_reshape, inputs={l_in: x_sym, l_mask_enc: xmask_sym}).eval(\n", 208 | " {x_sym: X, xmask_sym: Xmask}).shape\n", 209 | "\n", 210 | "l_softmax = lasagne.layers.DenseLayer(l_reshape, num_units=NUM_OUTPUTS, \n", 211 | " nonlinearity=lasagne.nonlinearities.softmax,\n", 212 | " name='SoftmaxOutput')\n", 213 | "print lasagne.layers.get_output(l_softmax, inputs={l_in: x_sym, l_mask_enc: xmask_sym}).eval(\n", 214 | " {x_sym: X, xmask_sym: Xmask}).shape\n", 215 | "\n", 216 | "# reshape back to 3d format (batch_size, decode_len, num_dec_units). Here we tied the batch size to the shape of the symbolic variable for X allowing \n", 217 | "#us to use different batch sizes in the model.\n", 218 | "l_out = lasagne.layers.ReshapeLayer(l_softmax, (x_sym.shape[0], -1, NUM_OUTPUTS))\n", 219 | "print lasagne.layers.get_output(l_out, inputs={l_in: x_sym, l_mask_enc: xmask_sym}).eval(\n", 220 | " {x_sym: X, xmask_sym: Xmask}).shape\n", 221 | "###END OF DECODER######\n" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "### Defining the cost function and theano functions\n", 229 | "Becasue the targets are categorical we use cross entropy error. We use the Adam optimizer but you\n", 230 | "can experiment with the different optimizers implemented in [Lasagne](http://lasagne.readthedocs.org/en/latest/modules/updates.html). 
" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": { 237 | "collapsed": false 238 | }, 239 | "outputs": [], 240 | "source": [ 241 | "output_decoder_train = lasagne.layers.get_output(l_out, inputs={l_in: x_sym, l_mask_enc: xmask_sym}, \n", 242 | " deterministic=False)\n", 243 | "\n", 244 | "#cost function\n", 245 | "total_cost = T.nnet.categorical_crossentropy(\n", 246 | " T.reshape(output_decoder_train, (-1, NUM_OUTPUTS)), y_sym.flatten())\n", 247 | "mean_cost = T.mean(total_cost)\n", 248 | "#accuracy function\n", 249 | "argmax = T.argmax(output_decoder_train,axis=-1)\n", 250 | "eq = T.eq(argmax,y_sym)\n", 251 | "acc = T.mean(eq) # gives float64 because eq is uint8, T.cast(eq, 'float32') will fix that...\n", 252 | "\n", 253 | "#Get parameters of both encoder and decoder\n", 254 | "all_parameters = lasagne.layers.get_all_params([l_out], trainable=True)\n", 255 | "\n", 256 | "print \"Trainable Model Parameters\"\n", 257 | "print \"-\"*40\n", 258 | "for param in all_parameters:\n", 259 | " print param, param.get_value().shape\n", 260 | "print \"-\"*40\n", 261 | "\n", 262 | "#add grad clipping to avoid exploding gradients\n", 263 | "all_grads = [T.clip(g,-3,3) for g in T.grad(mean_cost, all_parameters)]\n", 264 | "all_grads = lasagne.updates.total_norm_constraint(all_grads,3)\n", 265 | "\n", 266 | "#Compile Theano functions.\n", 267 | "#The two first two inputs to theano.functions is \n", 268 | "#1) a list of theano shared variables and \n", 269 | "#2) a list of functions(graphs) to calculate the values of most importanly the cost function. \n", 270 | "#3) for the training function the update argument should be given as the output from one of \n", 271 | "#4) lasagnes optimizers. 
of this argument is not set no parameters will be updated and only the values if 2) will be calculated\n", 272 | "updates = lasagne.updates.adam(all_grads, all_parameters, learning_rate=0.005)\n", 273 | "train_func = theano.function([x_sym, y_sym, xmask_sym], [mean_cost, acc, output_decoder_train], updates=updates)\n", 274 | "#since we don't have any stochasticity in the network we will just use the training graph without any updates given\n", 275 | "test_func = theano.function([x_sym, y_sym, xmask_sym], [acc, output_decoder_train])" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": { 282 | "collapsed": false 283 | }, 284 | "outputs": [], 285 | "source": [ 286 | "#Generate some validation data\n", 287 | "Xval, Xmask_val, Yval, Ymask_val, text_inputs_val, text_targets_val = \\\n", 288 | " get_batch(batch_size=5000, max_digits=MAX_DIGITS,min_digits=MIN_DIGITS)" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": {}, 294 | "source": [ 295 | "# Training" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": { 302 | "collapsed": false 303 | }, 304 | "outputs": [], 305 | "source": [ 306 | "val_interval = 5000\n", 307 | "samples_to_process = 3e5\n", 308 | "samples_processed = 0\n", 309 | "\n", 310 | "val_samples = []\n", 311 | "costs, accs = [], []\n", 312 | "plt.figure()\n", 313 | "try:\n", 314 | " while samples_processed < samples_to_process:\n", 315 | " inputs, input_masks, targets, target_masks, _, _ = \\\n", 316 | " get_batch(batch_size=BATCH_SIZE,max_digits=MAX_DIGITS,min_digits=MIN_DIGITS)\n", 317 | " batch_cost, batch_acc, batch_output = train_func(inputs, targets, input_masks)\n", 318 | " costs += [batch_cost]\n", 319 | " samples_processed += BATCH_SIZE\n", 320 | " #validation data\n", 321 | " if samples_processed % val_interval == 0:\n", 322 | " #print \"validating\"\n", 323 | " val_acc, val_output = test_func(Xval, Yval, Xmask_val)\n", 324 | " 
val_samples += [samples_processed]\n", 325 | " accs += [val_acc]\n", 326 | " plt.plot(val_samples,accs)\n", 327 | " plt.ylabel('Validation Accuracy', fontsize=15)\n", 328 | " plt.xlabel('Processed samples', fontsize=15)\n", 329 | " plt.title('', fontsize=20)\n", 330 | " plt.grid('on')\n", 331 | " display.display(plt.gcf())\n", 332 | " display.clear_output(wait=True)\n", 333 | " plt.show()\n", 334 | "except KeyboardInterrupt:\n", 335 | " pass\n" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": null, 341 | "metadata": { 342 | "collapsed": false 343 | }, 344 | "outputs": [], 345 | "source": [ 346 | "#plot of validation accuracy for each target position\n", 347 | "plt.figure(figsize=(7,7))\n", 348 | "plt.plot(np.mean(np.argmax(val_output,axis=2)==Yval,axis=0))\n", 349 | "plt.ylabel('Accuracy', fontsize=15)\n", 350 | "plt.xlabel('Target position', fontsize=15)\n", 351 | "#plt.title('', fontsize=20)\n", 352 | "plt.grid('on')\n", 353 | "plt.show()\n", 354 | "#why do the plot look like this?" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "metadata": { 361 | "collapsed": false 362 | }, 363 | "outputs": [], 364 | "source": [ 365 | "# plot training cost\n", 366 | "#plt.figure(figsize=(7,7))\n", 367 | "#plt.plot(costs)\n", 368 | "#plt.ylabel('Cost', fontsize=15)\n", 369 | "#plt.xlabel('Number of updates', fontsize=15)\n", 370 | "#plt.title('Training', fontsize=20)\n", 371 | "#plt.show()" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "metadata": {}, 377 | "source": [ 378 | "# Exercises:\n", 379 | "1. What is the final validation performance? Why do you think it is not better? Comment on the accuracy for each position in of the output symbols?\n", 380 | "\n", 381 | "2. Why do you think the validation performance looks more \"jig-saw\" like compared to FFN and CNN models?\n", 382 | "\n", 383 | "3. 
Optional: Bidirectional Encoder. In Lasagne, bidirectional RNNs are implemented by running a forward model and a backward model separately and then concatenating their outputs before passing them on to the next layer. You can experiment with using a different merging layer than concat, e.g. sum or multiplication; see [lasagne merge layers](http://lasagne.readthedocs.org/en/latest/modules/layers/merge.html).\n", 384 | "\n", 385 | "```\n", 386 | "l_rec_fwd = lasagne.layers.GRULayer(...,backwards=False)\n", 387 | "l_rec_bwd = lasagne.layers.GRULayer(...,backwards=True)\n", 388 | "l_rec = lasagne.layers.ConcatLayer([l_rec_fwd, l_rec_bwd], axis=2)\n", 389 | "```\n", 390 | "\n", 391 | "4. Optional: Add support for different lengths of targets (hint: add the target_mask to the cost function and only calculate the cost for the non-masked targets)\n" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": {}, 397 | "source": [ 398 | "#### Attention Decoder (LSTM)\n", 399 | "Selective attention for recurrent neural networks has recently attracted a lot of interest. These methods let the Decoder model selectively focus on which part of the encoder sequence it will use for each decoded output symbol. This relieves the encoder from having to compress the input sequence into a fixed size vector representation passed on to the decoder. Secondly we can interrogate the decoder network about where it attends while producing the outputs. Below we'll implement an LSTM-decoder with selective attention and show that it significantly improves the performance of the toy translation task." 
400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "metadata": { 406 | "collapsed": false 407 | }, 408 | "outputs": [], 409 | "source": [ 410 | "from decoder_attention import LSTMAttentionDecodeFeedbackLayer\n", 411 | "\n", 412 | "# you can acces the attetion weights alpha by adding l_dec.alpha \n", 413 | "# to the output variables in the theano function\n", 414 | "\n", 415 | "BATCH_SIZE = 100\n", 416 | "NUM_UNITS_ENC = 10\n", 417 | "NUM_UNITS_DEC = 10\n", 418 | "MAX_DIGITS = 20 \n", 419 | "MIN_DIGITS = MAX_DIGITS #currently only support for same length outputs - we'll leave it for an exercise to add support for varying length targets\n", 420 | "NUM_INPUTS = 27\n", 421 | "NUM_OUTPUTS = 11 #(0-9 + '#')\n", 422 | "\n", 423 | "\n", 424 | "x_sym = T.imatrix()\n", 425 | "y_sym = T.imatrix()\n", 426 | "xmask_sym = T.matrix()\n", 427 | " \n", 428 | "\n", 429 | "#dummy data to test implementation\n", 430 | "#X = np.random.randint(0,10,size=(BATCH_SIZE,15)).astype('int32')\n", 431 | "#Xmask = np.ones((BATCH_SIZE,NUM_INPUTS)).astype('float32')\n", 432 | "\n", 433 | "l_in = lasagne.layers.InputLayer((None, None))\n", 434 | "l_emb = lasagne.layers.EmbeddingLayer(l_in, NUM_INPUTS, NUM_INPUTS, \n", 435 | " W=np.eye(NUM_INPUTS,dtype='float32'),\n", 436 | " name='Embedding')\n", 437 | "##### ENCODER START #####\n", 438 | "l_in = lasagne.layers.InputLayer((None, None))\n", 439 | "l_emb = lasagne.layers.EmbeddingLayer(l_in, NUM_INPUTS, NUM_INPUTS, \n", 440 | " W=np.eye(NUM_INPUTS,dtype='float32'),\n", 441 | " name='Embedding')\n", 442 | "#Here we'll remove the trainable parameters from the embeding layer to constrain \n", 443 | "#it to a simple \"one-hot-encoding\". 
You can experiment with removing this line\n", 444 | "l_emb.params[l_emb.W].remove('trainable') \n", 445 | "print lasagne.layers.get_output(l_emb, inputs={l_in: x_sym}).eval(\n", 446 | " {x_sym: X}).shape\n", 447 | "T.grad(lasagne.layers.get_output(l_emb, inputs={l_in: x_sym}).sum(), \n", 448 | " lasagne.layers.get_all_params(l_emb, trainable=True))\n", 449 | "\n", 450 | "\n", 451 | "\n", 452 | "\n", 453 | "l_mask_enc = lasagne.layers.InputLayer((None, None))\n", 454 | "l_enc = lasagne.layers.GRULayer(l_emb, num_units=NUM_UNITS_ENC, name='GRUEncoder', mask_input=l_mask_enc)\n", 455 | "print lasagne.layers.get_output(l_enc, inputs={l_in: x_sym, l_mask_enc: xmask_sym}).eval(\n", 456 | " {x_sym: X, xmask_sym: Xmask}).shape\n", 457 | "T.grad(lasagne.layers.get_output(l_enc, inputs={l_in: x_sym, l_mask_enc: xmask_sym}).sum(), \n", 458 | " lasagne.layers.get_all_params(l_enc, trainable=True))\n", 459 | "####END OF ENCODER######\n", 460 | "\n", 461 | "\n", 462 | "####START OF DECODER######\n", 463 | "#note that the decoder have its own input layer, we'll use that to plug in the output \n", 464 | "#from the encoder later\n", 465 | "l_dec = LSTMAttentionDecodeFeedbackLayer(l_enc,\n", 466 | " num_units=NUM_UNITS_DEC, \n", 467 | " aln_num_units=20,\n", 468 | " n_decodesteps=MAX_DIGITS+1,\n", 469 | " name='LSTMDecoder')\n", 470 | "print lasagne.layers.get_output(l_dec, inputs={l_in: x_sym, l_mask_enc: xmask_sym}).eval(\n", 471 | " {x_sym: X, xmask_sym: Xmask}).shape\n", 472 | "T.grad(lasagne.layers.get_output(l_dec, inputs={l_in: x_sym, l_mask_enc: xmask_sym}).sum(), \n", 473 | " lasagne.layers.get_all_params(l_dec, trainable=True))\n", 474 | "\n", 475 | "# We need to do some reshape voodo to connect a softmax layer to the decoder.\n", 476 | "# See http://lasagne.readthedocs.org/en/latest/modules/layers/recurrent.html#examples \n", 477 | "l_reshape = lasagne.layers.ReshapeLayer(l_dec, (-1, [2]))\n", 478 | "l_softmax = lasagne.layers.DenseLayer(l_reshape, num_units=NUM_OUTPUTS, 
\n", 479 | " nonlinearity=lasagne.nonlinearities.softmax,\n", 480 | " name='SoftmaxOutput')\n", 481 | "# print lasagne.layers.get_output(l_softmax, x_sym).eval({x_sym: X}).shape\n", 482 | "# reshape back to 3d format (here we tied the batch size to the shape of the symbolic variable for X allowing \n", 483 | "#us to use different batch sizes in the model)\n", 484 | "l_out = lasagne.layers.ReshapeLayer(l_softmax, (x_sym.shape[0], -1, NUM_OUTPUTS))\n", 485 | "print lasagne.layers.get_output(l_out, inputs={l_in: x_sym, l_mask_enc: xmask_sym}, deterministic=False).eval(\n", 486 | " {x_sym: X, xmask_sym: Xmask}).shape\n", 487 | "T.grad(lasagne.layers.get_output(l_out, inputs={l_in: x_sym, l_mask_enc: xmask_sym}).sum(), \n", 488 | " lasagne.layers.get_all_params(l_dec, trainable=True))\n", 489 | "\n", 490 | "print \"\"\n", 491 | "###END OF DECODER######\n", 492 | "\n" 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": null, 498 | "metadata": { 499 | "collapsed": true 500 | }, 501 | "outputs": [], 502 | "source": [ 503 | "#Generate some validation data\n", 504 | "Xval, Xmask_val, Yval, Ymask_val, text_inputs_val, text_targets_val = \\\n", 505 | " get_batch(batch_size=5000, max_digits=MAX_DIGITS,min_digits=MIN_DIGITS)" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": null, 511 | "metadata": { 512 | "collapsed": false 513 | }, 514 | "outputs": [], 515 | "source": [ 516 | "#get output of encoder using X and Xmask as input\n", 517 | "output_decoder_train = lasagne.layers.get_output(l_out, inputs={l_in: x_sym, l_mask_enc: xmask_sym}, \n", 518 | " deterministic=False)\n", 519 | "\n", 520 | "#cost function\n", 521 | "total_cost = T.nnet.categorical_crossentropy(\n", 522 | " T.reshape(output_decoder_train, (-1, NUM_OUTPUTS)), y_sym.flatten())\n", 523 | "mean_cost = T.mean(total_cost)\n", 524 | "#accuracy function\n", 525 | "acc = T.mean(T.eq(T.argmax(output_decoder_train,axis=-1),y_sym))\n", 526 | "\n", 527 | "#Get parameters 
of both encoder and decoder\n", 528 | "all_parameters = lasagne.layers.get_all_params(l_out, trainable=True)\n", 529 | "\n", 530 | "print \"Trainable Model Parameters\"\n", 531 | "print \"-\"*40\n", 532 | "for param in all_parameters:\n", 533 | " print param, param.get_value().shape\n", 534 | "print \"-\"*40\n", 535 | "\n", 536 | "#add grad clipping to avoid exploding gradients\n", 537 | "all_grads = [T.clip(g,-3,3) for g in T.grad(mean_cost, all_parameters)]\n", 538 | "all_grads = lasagne.updates.total_norm_constraint(all_grads,3)\n", 539 | "\n", 540 | "#Compile Theano functions\n", 541 | "updates = lasagne.updates.adam(all_grads, all_parameters, learning_rate=0.005)\n", 542 | "train_func = theano.function([x_sym, y_sym, xmask_sym], [mean_cost, acc, output_decoder_train], updates=updates)\n", 543 | "#since we don't have any stochasticity in the network we will just use the training graph without any updates given\n", 544 | "test_func = theano.function([x_sym, y_sym, xmask_sym], [acc, output_decoder_train, l_dec.alpha])\n" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": null, 550 | "metadata": { 551 | "collapsed": false 552 | }, 553 | "outputs": [], 554 | "source": [ 555 | "val_interval = 5000\n", 556 | "samples_to_process = 1.5e5\n", 557 | "samples_processed = 0\n", 558 | "val_samples = []\n", 559 | "costs, accs = [], []\n", 560 | "plt.figure()\n", 561 | "try:\n", 562 | " while samples_processed < samples_to_process:\n", 563 | " inputs, input_masks, targets, target_masks, _, _ = \\\n", 564 | " get_batch(batch_size=BATCH_SIZE,max_digits=MAX_DIGITS,min_digits=MIN_DIGITS)\n", 565 | " batch_cost, batch_acc, batch_output = train_func(inputs, targets, input_masks)\n", 566 | " costs += [batch_cost]\n", 567 | " samples_processed += BATCH_SIZE\n", 568 | " #print i, samples_processed\n", 569 | " #validation data\n", 570 | " if samples_processed % val_interval == 0:\n", 571 | " #print \"validating\"\n", 572 | " val_acc, val_output, alpha = 
test_func(Xval, Yval, Xmask_val)\n", 573 | " val_samples += [samples_processed]\n", 574 | " accs += [val_acc]\n", 575 | " plt.plot(val_samples,accs)\n", 576 | " plt.ylabel('', fontsize=15)\n", 577 | " plt.xlabel('Processed samples', fontsize=15)\n", 578 | " plt.title('Validation Accuracy', fontsize=20)\n", 579 | " plt.grid('on')\n", 580 | " display.display(plt.gcf())\n", 581 | " display.clear_output(wait=True)\n", 582 | " plt.show()\n", 583 | "except KeyboardInterrupt:\n", 584 | " pass\n", 585 | " " 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": null, 591 | "metadata": { 592 | "collapsed": false 593 | }, 594 | "outputs": [], 595 | "source": [ 596 | "#plot of validation accuracy for each target position\n", 597 | "plt.figure(figsize=(7,7))\n", 598 | "plt.plot(np.mean(np.argmax(val_output,axis=2)==Yval,axis=0))\n", 599 | "plt.ylabel('Accuracy', fontsize=15)\n", 600 | "plt.xlabel('Target position', fontsize=15)\n", 601 | "#plt.title('', fontsize=20)\n", 602 | "plt.grid('on')\n", 603 | "plt.show()\n", 604 | "#why do the plot look like this?" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": null, 610 | "metadata": { 611 | "collapsed": false 612 | }, 613 | "outputs": [], 614 | "source": [ 615 | "#Plot of average attention weight as a function of the sequence position for each of \n", 616 | "#the 21 targets in the output sequence i.e. 
import numpy as np


class ConfusionMatrix(object):
    """
    Simple confusion matrix class
    row is the true class, column is the predicted class

    Parameters
    ----------
    num_classes : int
        Number of classes; the matrix is ``num_classes x num_classes``.
    class_names : sequence of str, optional
        Display names, one per class. Defaults to ``"0", "1", ...``.
        The sequence is copied, never modified in place.

    Attributes
    ----------
    mat : numpy.ndarray of int
        ``mat[t, p]`` counts samples of true class ``t`` predicted as ``p``.
    class_names : list of str
        Names right-padded with spaces to a common width (``max_len``).
    """

    def __init__(self, num_classes, class_names=None):
        self.n_classes = num_classes
        if class_names is None:
            names = [str(i) for i in range(num_classes)]
        else:
            # Copy so that padding never leaks into the caller's list.
            names = list(class_names)

        # Pad every name to the width of the longest one so that the rows
        # of the printed table line up.
        self.max_len = max([len(name) for name in names] or [0])
        self.class_names = [name + " " * (self.max_len - len(name))
                            for name in names]

        self.mat = np.zeros((num_classes, num_classes), dtype='int')

    def __str__(self):
        # Per-true-class totals (row sums) and per-predicted-class totals
        # (column sums).
        true_totals = np.sum(self.mat, axis=1)
        pred_totals = np.sum(self.mat, axis=0)

        mat_rows = str(self.mat).replace('[', '').replace(']', '').split('\n')

        lines = []
        for idx, row in enumerate(mat_rows):
            # numpy indents every row but the first by one space (the width
            # of the stripped outer bracket); compensate on the first row.
            pad = " " if idx == 0 else ""
            lines.append(" " + self.class_names[idx] + " |" + pad + row +
                         " |" + str(true_totals[idx]))

        totals = [(self.max_len + 4) * " " + " ".join(map(str, pred_totals))]
        hline = [(1 + self.max_len) * " " + "-" * len(totals[0])]

        return "".join(line + '\n' for line in hline + lines + hline + totals)

    def batch_add(self, targets, preds):
        """
        Add a batch of (true label, predicted label) pairs to the matrix.

        Parameters
        ----------
        targets, preds : array-like of int
            Same shape; any dimensionality (both are flattened).

        Raises
        ------
        ValueError
            If the shapes differ or a label is outside
            ``[0, num_classes)``.
        """
        targets = np.asarray(targets)
        preds = np.asarray(preds)
        if targets.shape != preds.shape:
            raise ValueError("targets and preds must have the same shape, "
                             "got %s and %s" % (targets.shape, preds.shape))
        targets = targets.ravel()
        preds = preds.ravel()
        if targets.size == 0:
            return
        for name, labels in (("targets", targets), ("preds", preds)):
            # Negative labels would silently wrap around as numpy indices.
            if labels.min() < 0 or labels.max() >= self.n_classes:
                raise ValueError("%s must lie in [0, %d)"
                                 % (name, self.n_classes))
        # np.add.at accumulates correctly when the same cell occurs several
        # times in one batch (plain fancy-index += would count it once).
        np.add.at(self.mat, (targets, preds), 1)

    def get_errors(self):
        """
        Per-class one-vs-rest counts.

        Returns
        -------
        tp, fn, fp, tn : numpy.ndarray of float, each of length num_classes
            NOTE the order: true positives, false negatives, false
            positives, true negatives.
        """
        tp = np.asarray(np.diag(self.mat).flatten(), dtype='float')
        fn = np.asarray(np.sum(self.mat, axis=1).flatten(), dtype='float') - tp
        fp = np.asarray(np.sum(self.mat, axis=0).flatten(), dtype='float') - tp
        tn = np.asarray(np.sum(self.mat) * np.ones(self.n_classes).flatten(),
                        dtype='float') - tp - fn - fp
        return tp, fn, fp, tn

    @staticmethod
    def _ratio(numerator, denominator):
        """Element-wise ratio with undefined (0/0) entries removed."""
        with np.errstate(divide='ignore', invalid='ignore'):
            res = numerator / denominator
        return res[~np.isnan(res)]

    def accuracy(self):
        """
        Calculates global accuracy
        :return: accuracy (nan when no samples have been added)
        :example: >>> conf = ConfusionMatrix(3)
                  >>> conf.batch_add([0,0,1],[0,0,2])
                  >>> print(conf.accuracy())
        """
        tp, _, _, _ = self.get_errors()
        n_samples = np.sum(self.mat)
        with np.errstate(divide='ignore', invalid='ignore'):
            return np.sum(tp) / n_samples

    # All per-class metrics below drop classes for which the metric is
    # undefined (0/0), so the result may be shorter than num_classes.

    def sensitivity(self):
        """Per-class recall: tp / (tp + fn)."""
        tp, fn, fp, tn = self.get_errors()
        return self._ratio(tp, tp + fn)

    def specificity(self):
        """Per-class true negative rate: tn / (tn + fp)."""
        tp, fn, fp, tn = self.get_errors()
        return self._ratio(tn, tn + fp)

    def positive_predictive_value(self):
        """Per-class precision: tp / (tp + fp)."""
        tp, fn, fp, tn = self.get_errors()
        return self._ratio(tp, tp + fp)

    def negative_predictive_value(self):
        """Per-class tn / (tn + fn)."""
        tp, fn, fp, tn = self.get_errors()
        return self._ratio(tn, tn + fn)

    def false_positive_rate(self):
        """Per-class fp / (fp + tn)."""
        tp, fn, fp, tn = self.get_errors()
        return self._ratio(fp, fp + tn)

    def false_discovery_rate(self):
        """Per-class fp / (tp + fp)."""
        tp, fn, fp, tn = self.get_errors()
        return self._ratio(fp, tp + fp)

    def F1(self):
        """Per-class F1 score: 2*tp / (2*tp + fp + fn)."""
        tp, fn, fp, tn = self.get_errors()
        return self._ratio(2 * tp, 2 * tp + fp + fn)

    def matthews_correlation(self):
        """Per-class (one-vs-rest) Matthews correlation coefficient."""
        tp, fn, fp, tn = self.get_errors()
        numerator = tp * tn - fp * fn
        denominator = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
        return self._ratio(numerator, denominator)
__author__ = 'casperkaae'
import numpy as np

# Spelled-out form of every digit; the model input is built from these words.
target_to_text = {
    '0': 'zero',
    '1': 'one',
    '2': 'two',
    '3': 'three',
    '4': 'four',
    '5': 'five',
    '6': 'six',
    '7': 'seven',
    '8': 'eight',
    '9': 'nine',
}

# Appended to every target sequence to mark its end.
stop_character = '#'

input_characters = " ".join(target_to_text.values())
# Vocabulary: digits, stop character, then every character that can occur in
# the text input. The tail is sorted because set iteration order is not
# guaranteed to be the same across interpreters / hash seeds, and the index of
# a character is what the network is trained on.
valid_characters = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '#'] + \
    sorted(set(input_characters))

# character -> index into valid_characters (entries are unique, so this is
# equivalent to valid_characters.index(c) but O(1) per lookup).
_char_to_index = dict((c, i) for i, c in enumerate(valid_characters))


def print_valid_characters():
    """Print the vocabulary size and each character with its integer code."""
    listing = "".join("'%s'=%i,\t" % (c, i)
                      for i, c in enumerate(valid_characters))
    print("Number of valid characters: %d" % len(valid_characters))
    print(listing)


ninput_chars = len(valid_characters)


def get_batch(batch_size=100, min_digits = 3, max_digits=3):
    '''
    Generates random sequences of integers and translates them to text i.e. 1->'one'.

    Draws from the global numpy random state (seed it with np.random.seed for
    reproducible batches).

    :param batch_size: number of samples to return
    :param min_digits: minimum length of target
    :param max_digits: maximum length of target
    :return: tuple of
        inputs       int32   (batch_size, longest text input), zero padded
        input_masks  float32 same shape, 1 where inputs holds a real character
        targets      int32   (batch_size, max_digits+1), zero padded
        target_masks float32 same shape, 1 on digits and the stop character
        text_inputs  list of str, e.g. 'one two three'
        text_targets list of str, e.g. '123#'
    '''
    text_inputs = []
    int_inputs = []
    text_targets = []
    int_targets = []
    for _ in range(batch_size):
        # draw the number of digits, then the digits themselves
        tar_len = np.random.randint(min_digits, max_digits + 1)
        digits = "".join(map(str, np.random.randint(0, 10, tar_len)))
        text_target = digits + stop_character

        # spell the digits out, separated by single spaces
        text_input = " ".join(target_to_text[d] for d in digits)

        text_inputs.append(text_input)
        int_inputs.append([_char_to_index[c] for c in text_input])
        text_targets.append(text_target)
        int_targets.append([_char_to_index[c] for c in text_target])

    # create the input matrix and mask - note that we zero pad the shorter
    # sequences (index 0 is also the code of '0'; the mask disambiguates).
    max_input_len = max(len(seq) for seq in int_inputs) if int_inputs else 0
    inputs = np.zeros((batch_size, max_input_len), dtype='int32')
    input_masks = np.zeros((batch_size, max_input_len), dtype='float32')
    for i, seq in enumerate(int_inputs):
        inputs[i, :len(seq)] = seq
        input_masks[i, :len(seq)] = 1

    # +1 to allow space for stop character
    targets = np.zeros((batch_size, max_digits + 1), dtype='int32')
    target_masks = np.zeros((batch_size, max_digits + 1), dtype='float32')
    for i, seq in enumerate(int_targets):
        targets[i, :len(seq)] = seq
        target_masks[i, :len(seq)] = 1

    return inputs, input_masks, targets, target_masks, \
        text_inputs, text_targets
29 | # | /^ | /^ | /^ 30 | # attention_network | / | / | / 31 | # | / |/ | / 32 | # weighted encoder hidden(output) whid_enc_t-1 -> whid_enc__t -> wh_t+1 33 | # 34 | # This model also allows for adden "pre-steps" to the decoder where the model can 35 | # "comprehend the input data". basically this is just adding extra steps to the 36 | # decoder before producing the targets 37 | # 38 | # 39 | 40 | 41 | class LSTMAttentionDecodeLayer(MergeLayer): 42 | r"""A long short-term memory (LSTM) layer. 43 | 44 | Includes optional "peephole connections" and a forget gate. Based on the 45 | definition in [1]_, which is the current common definition. The output is 46 | computed by 47 | 48 | .. math :: 49 | 50 | i_t &= \sigma_i(W_{xi}x_t + W_{hi}h_{t-1} 51 | + w_{ci}\odot c_{t-1} + b_i)\\ 52 | f_t &= \sigma_f(W_{xf}x_t + W_{hf}h_{t-1} 53 | + w_{cf}\odot c_{t-1} + b_f)\\ 54 | c_t &= f_t \odot c_{t - 1} 55 | + i_t\sigma_c(W_{xc}x_t + W_{hc} h_{t-1} + b_c)\\ 56 | o_t &= \sigma_o(W_{xo}x_t + W_{ho}h_{t-1} + w_{co}\odot c_t + b_o)\\ 57 | h_t &= o_t \odot \sigma_h(c_t) 58 | 59 | Parameters 60 | ---------- 61 | incoming : a :class:`lasagne.layers.Layer` instance or a tuple 62 | The layer feeding into this layer, or the expected input shape. 63 | num_units : int 64 | Number of hidden/cell units in the layer. 65 | W_in_to_ingate : Theano shared variable, numpy array or callable 66 | Initializer for input-to-input gate weight matrix (:math:`W_{xi}`). 67 | W_hid_to_ingate : Theano shared variable, numpy array or callable 68 | Initializer for hidden-to-input gate weight matrix (:math:`W_{hi}`). 69 | W_cell_to_ingate : Theano shared variable, numpy array or callable 70 | Initializer for cell-to-input gate weight vector (:math:`w_{ci}`). 71 | b_ingate : Theano shared variable, numpy array or callable 72 | Initializer for input gate bias vector (:math:`b_i`). 73 | nonlinearity_ingate : callable or None 74 | The nonlinearity that is applied to the input gate activation 75 | (:math:`\sigma_i`). 
If None is provided, no nonlinearity will be 76 | applied. 77 | W_in_to_forgetgate : Theano shared variable, numpy array or callable 78 | Initializer for input-to-forget gate weight matrix (:math:`W_{xf}`). 79 | W_hid_to_forgetgate : Theano shared variable, numpy array or callable 80 | Initializer for hidden-to-forget gate weight matrix (:math:`W_{hf}`). 81 | W_cell_to_forgetgate : Theano shared variable, numpy array or callable 82 | Initializer for cell-to-forget gate weight vector (:math:`w_{cf}`). 83 | b_forgetgate : Theano shared variable, numpy array or callable 84 | Initializer for forget gate bias vector (:math:`b_f`). 85 | nonlinearity_forgetgate : callable or None 86 | The nonlinearity that is applied to the forget gate activation 87 | (:math:`\sigma_f`). If None is provided, no nonlinearity will be 88 | applied. 89 | W_in_to_cell : Theano shared variable, numpy array or callable 90 | Initializer for input-to-cell weight matrix (:math:`W_{ic}`). 91 | W_hid_to_cell : Theano shared variable, numpy array or callable 92 | Initializer for hidden-to-cell weight matrix (:math:`W_{hc}`). 93 | b_cell : Theano shared variable, numpy array or callable 94 | Initializer for cell bias vector (:math:`b_c`). 95 | nonlinearity_cell : callable or None 96 | The nonlinearity that is applied to the cell activation 97 | (;math:`\sigma_c`). If None is provided, no nonlinearity will be 98 | applied. 99 | W_in_to_outgate : Theano shared variable, numpy array or callable 100 | Initializer for input-to-output gate weight matrix (:math:`W_{io}`). 101 | W_hid_to_outgate : Theano shared variable, numpy array or callable 102 | Initializer for hidden-to-output gate weight matrix (:math:`W_{ho}`). 103 | W_cell_to_outgate : Theano shared variable, numpy array or callable 104 | Initializer for cell-to-output gate weight vector (:math:`w_{co}`). 105 | b_outgate : Theano shared variable, numpy array or callable 106 | Initializer for hidden-to-input gate weight matrix (:math:`b_o`). 
107 | nonlinearity_outgate : callable or None 108 | The nonlinearity that is applied to the output gate activation 109 | (:math:`\sigma_o`). If None is provided, no nonlinearity will be 110 | applied. 111 | nonlinearity_out : callable or None 112 | The nonlinearity that is applied to the output (:math:`\sigma_h`). If 113 | None is provided, no nonlinearity will be applied. 114 | cell_init : callable, np.ndarray, theano.shared or TensorVariable 115 | Passing in a TensorVariable allows the user to specify 116 | the value of `cell_init` (:math:`c_0`). In this mode `learn_init` is 117 | ignored for the cell state. 118 | hid_init : callable, np.ndarray, theano.shared or TensorVariable 119 | Passing in a TensorVariable allows the user to specify 120 | the value of `hid_init` (:math:`h_0`). In this mode `learn_init` is 121 | ignored for the hidden state. 122 | backwards : bool 123 | If True, process the sequence backwards and then reverse the 124 | output again such that the output from the layer is always 125 | from :math:`x_1` to :math:`x_n`. 126 | learn_init : bool 127 | If True, initial hidden values are learned. If `hid_init` or 128 | `cell_init` are TensorVariables then the TensorVariable is used and 129 | `learn_init` is ignored for that initial state. 130 | peepholes : bool 131 | If True, the LSTM uses peephole connections. 132 | When False, `W_cell_to_ingate`, `W_cell_to_forgetgate` and 133 | `W_cell_to_outgate` are ignored. 134 | gradient_steps : int 135 | Number of timesteps to include in the backpropagated gradient. 136 | If -1, backpropagate through the entire sequence. 137 | grad_clipping: False or float 138 | If a float is provided, the gradient messages are clipped during the 139 | backward pass. If False, the gradients will not be clipped. See [1]_ 140 | (p. 6) for further explanation. 141 | unroll_scan : bool 142 | If True the recursion is unrolled instead of using scan. 
For some 143 | graphs this gives a significant speed up but it might also consume 144 | more memory. When `unroll_scan` is true then the `gradient_steps` 145 | setting is ignored. 146 | precompute_input : bool 147 | If True, precompute input_to_hid before iterating through 148 | the sequence. This can result in a speedup at the expense of 149 | an increase in memory usage. 150 | 151 | References 152 | ---------- 153 | .. [1] Graves, Alex: "Generating sequences with recurrent neural networks." 154 | arXiv preprint arXiv:1308.0850 (2013). 155 | """ 156 | def __init__(self, incoming, 157 | num_units, 158 | aln_num_units, 159 | n_decodesteps, 160 | W_align=init.Normal(0.1), 161 | U_align=init.Normal(0.1), 162 | v_align=init.Normal(0.1), 163 | nonlinearity_align=nonlinearities.tanh, 164 | W_hid_to_ingate=init.Normal(0.1), 165 | W_cell_to_ingate=init.Normal(0.1), 166 | b_ingate=init.Constant(0.), 167 | nonlinearity_ingate=nonlinearities.sigmoid, 168 | #W_in_to_forgetgate=init.Normal(0.1), 169 | W_hid_to_forgetgate=init.Normal(0.1), 170 | W_cell_to_forgetgate=init.Normal(0.1), 171 | b_forgetgate=init.Constant(0.), 172 | nonlinearity_forgetgate=nonlinearities.sigmoid, 173 | #W_in_to_cell=init.Normal(0.1), 174 | W_hid_to_cell=init.Normal(0.1), 175 | b_cell=init.Constant(0.), 176 | nonlinearity_cell=nonlinearities.tanh, 177 | #W_in_to_outgate=init.Normal(0.1), 178 | W_hid_to_outgate=init.Normal(0.1), 179 | W_cell_to_outgate=init.Normal(0.1), 180 | b_outgate=init.Constant(0.), 181 | nonlinearity_outgate=nonlinearities.sigmoid, 182 | nonlinearity_out=nonlinearities.tanh, 183 | cell_init=init.Constant(0.), 184 | hid_init=init.Constant(0.), 185 | backwards=False, 186 | learn_init=False, 187 | peepholes=True, 188 | gradient_steps=-1, 189 | grad_clipping=False, 190 | unroll_scan=False, 191 | mask_input=None, 192 | #precompute_input=True, 193 | **kwargs): 194 | 195 | # Initialize parent layer 196 | # This layer inherits from a MergeLayer, because it can have two 197 | # inputs - 
the layer input, and the mask. We will just provide the 198 | # layer input as incomings, unless a mask input was provided. 199 | incomings = [incoming] 200 | if mask_input is not None: 201 | incomings.append(mask_input) 202 | super(LSTMAttentionDecodeLayer, self).__init__(incomings, **kwargs) 203 | 204 | # For any of the nonlinearities, if None is supplied, use identity 205 | if nonlinearity_ingate is None: 206 | self.nonlinearity_ingate = nonlinearities.identity 207 | else: 208 | self.nonlinearity_ingate = nonlinearity_ingate 209 | 210 | if nonlinearity_forgetgate is None: 211 | self.nonlinearity_forgetgate = nonlinearities.identity 212 | else: 213 | self.nonlinearity_forgetgate = nonlinearity_forgetgate 214 | 215 | if nonlinearity_cell is None: 216 | self.nonlinearity_cell = nonlinearities.identity 217 | else: 218 | self.nonlinearity_cell = nonlinearity_cell 219 | 220 | if nonlinearity_outgate is None: 221 | self.nonlinearity_outgate = nonlinearities.identity 222 | else: 223 | self.nonlinearity_outgate = nonlinearity_outgate 224 | 225 | if nonlinearity_out is None: 226 | self.nonlinearity_out = nonlinearities.identity 227 | else: 228 | self.nonlinearity_out = nonlinearity_out 229 | 230 | self.learn_init = learn_init 231 | self.num_units = num_units 232 | self.backwards = backwards 233 | self.peepholes = peepholes 234 | self.gradient_steps = gradient_steps 235 | self.grad_clipping = grad_clipping 236 | self.unroll_scan = unroll_scan 237 | self.n_decodesteps = n_decodesteps 238 | self.aln_num_units = aln_num_units 239 | self.nonlinearity_align = nonlinearity_align 240 | 241 | # Retrieve the dimensionality of the incoming layer 242 | input_shape = self.input_shapes[0] 243 | if unroll_scan and input_shape[1] is None: 244 | raise ValueError("Input sequence length cannot be specified as " 245 | "None when unroll_scan is True") 246 | 247 | num_inputs = np.prod(self.input_shape[2:]) 248 | 249 | # Initialize parameters using the supplied args 250 | #self.W_in_to_ingate = 
self.add_param( 251 | # W_in_to_ingate, (num_inputs, num_units), name="W_in_to_ingate") 252 | 253 | self.W_hid_to_ingate = self.add_param( 254 | W_hid_to_ingate, (num_units, num_units), name="W_hid_to_ingate") 255 | 256 | self.b_ingate = self.add_param( 257 | b_ingate, (num_units,), name="b_ingate", regularizable=False) 258 | 259 | #self.W_in_to_forgetgate = self.add_param( 260 | # W_in_to_forgetgate, (num_inputs, num_units), 261 | # name="W_in_to_forgetgate") 262 | 263 | self.W_hid_to_forgetgate = self.add_param( 264 | W_hid_to_forgetgate, (num_units, num_units), 265 | name="W_hid_to_forgetgate") 266 | 267 | self.b_forgetgate = self.add_param( 268 | b_forgetgate, (num_units,), name="b_forgetgate", 269 | regularizable=False) 270 | 271 | #self.W_in_to_cell = self.add_param( 272 | # W_in_to_cell, (num_inputs, num_units), name="W_in_to_cell") 273 | 274 | self.W_hid_to_cell = self.add_param( 275 | W_hid_to_cell, (num_units, num_units), name="W_hid_to_cell") 276 | 277 | self.b_cell = self.add_param( 278 | b_cell, (num_units,), name="b_cell", regularizable=False) 279 | 280 | #self.W_in_to_outgate = self.add_param( 281 | # W_in_to_outgate, (num_inputs, num_units), name="W_in_to_outgate") 282 | 283 | self.W_hid_to_outgate = self.add_param( 284 | W_hid_to_outgate, (num_units, num_units), name="W_hid_to_outgate") 285 | 286 | self.b_outgate = self.add_param( 287 | b_outgate, (num_units,), name="b_outgate", regularizable=False) 288 | 289 | # Stack input weight matrices into a (num_inputs, 4*num_units) 290 | # matrix, which speeds up computation 291 | #self.W_in_stacked = T.concatenate( 292 | # [self.W_in_to_ingate, self.W_in_to_forgetgate, 293 | # self.W_in_to_cell, self.W_in_to_outgate], axis=1) 294 | 295 | # Same for hidden weight matrices 296 | self.W_hid_stacked = T.concatenate( 297 | [self.W_hid_to_ingate, self.W_hid_to_forgetgate, 298 | self.W_hid_to_cell, self.W_hid_to_outgate], axis=1) 299 | 300 | # Stack biases into a (4*num_units) vector 301 | self.b_stacked = 
T.concatenate( 302 | [self.b_ingate, self.b_forgetgate, 303 | self.b_cell, self.b_outgate], axis=0) 304 | 305 | # If peephole (cell to gate) connections were enabled, initialize 306 | # peephole connections. These are elementwise products with the cell 307 | # state, so they are represented as vectors. 308 | if self.peepholes: 309 | self.W_cell_to_ingate = self.add_param( 310 | W_cell_to_ingate, (num_units, ), name="W_cell_to_ingate") 311 | 312 | self.W_cell_to_forgetgate = self.add_param( 313 | W_cell_to_forgetgate, (num_units, ), 314 | name="W_cell_to_forgetgate") 315 | 316 | self.W_cell_to_outgate = self.add_param( 317 | W_cell_to_outgate, (num_units, ), name="W_cell_to_outgate") 318 | 319 | self.W_align = self.add_param(W_align, (num_units, self.aln_num_units), 320 | name="AlignSeqOutputLayer: (aln) W_a") 321 | self.U_align = self.add_param(U_align, (num_inputs, self.aln_num_units), 322 | name="AlignSeqOutputLayer: (aln) U_a") 323 | self.v_align = self.add_param(v_align, (self.aln_num_units, 1), 324 | name="AlignSeqOutputLayer: v_a") 325 | 326 | 327 | # Setup initial values for the cell and the hidden units 328 | if isinstance(cell_init, T.TensorVariable): 329 | if cell_init.ndim != 2: 330 | raise ValueError( 331 | "When cell_init is provided as a TensorVariable, it should" 332 | " have 2 dimensions and have shape (num_batch, num_units)") 333 | self.cell_init = cell_init 334 | else: 335 | self.cell_init = self.add_param( 336 | cell_init, (1, num_units), name="cell_init", 337 | trainable=learn_init, regularizable=False) 338 | 339 | if isinstance(hid_init, T.TensorVariable): 340 | if hid_init.ndim != 2: 341 | raise ValueError( 342 | "When hid_init is provided as a TensorVariable, it should " 343 | "have 2 dimensions and have shape (num_batch, num_units)") 344 | self.hid_init = hid_init 345 | else: 346 | self.hid_init = self.add_param( 347 | hid_init, (1, self.num_units), name="hid_init", 348 | trainable=learn_init, regularizable=False) 349 | 350 | def 
get_output_shape_for(self, input_shapes): 351 | input_shape = input_shapes[0] 352 | return input_shape[0], None, self.num_units 353 | 354 | def get_output_for(self, inputs, **kwargs): 355 | """ 356 | Compute this layer's output function given a symbolic input variable 357 | 358 | Parameters 359 | ---------- 360 | input : theano.TensorType 361 | Symbolic input variable. 362 | mask : theano.TensorType 363 | Theano variable denoting whether each time step in each 364 | sequence in the batch is part of the sequence or not. If ``None``, 365 | then it is assumed that all sequences are of the same length. If 366 | not all sequences are of the same length, then it must be 367 | supplied as a matrix of shape ``(n_batch, n_time_steps)`` where 368 | ``mask[i, j] = 1`` when ``j <= (length of sequence i)`` and 369 | ``mask[i, j] = 0`` when ``j > (length of sequence i)``. 370 | 371 | Returns 372 | ------- 373 | layer_output : theano.TensorType 374 | Symblic output variable. 375 | """ 376 | input = inputs[0] 377 | # Retrieve the mask when it is supplied 378 | mask = inputs[1] if len(inputs) > 1 else None 379 | 380 | # Treat all dimensions after the second as flattened feature dimensions 381 | # Retrieve the layer input 382 | if input.ndim > 3: 383 | input = input.reshape((input.shape[0], input.shape[1], 384 | T.prod(input.shape[2:]))) 385 | num_batch = input.shape[0] 386 | encode_seqlen = input.shape[1] 387 | 388 | # At each call to scan, input_n will be (n_time_steps, 4*num_units). 
389 | # We define a slicing function that extract the input to each LSTM gate 390 | def slice_w(x, n): 391 | return x[:, n*self.num_units:(n+1)*self.num_units] 392 | 393 | # Create single recurrent computation step function 394 | # input_n is the n'th vector of the input 395 | def step(cell_previous, hid_previous, a_prev, 396 | hUa, W_align, v_align, 397 | W_hid_stacked, W_cell_to_ingate, W_cell_to_forgetgate, 398 | W_cell_to_outgate, b_stacked): 399 | 400 | # Calculate gates pre-activations and slice 401 | gates = T.dot(hid_previous, W_hid_stacked) + b_stacked 402 | 403 | # Clip gradients 404 | if self.grad_clipping is not False: 405 | gates = theano.gradient.grad_clip( 406 | gates, -self.grad_clipping, self.grad_clipping) 407 | 408 | # Extract the pre-activation gate values 409 | ingate = slice_w(gates, 0) 410 | forgetgate = slice_w(gates, 1) 411 | cell_input = slice_w(gates, 2) 412 | outgate = slice_w(gates, 3) 413 | 414 | if self.peepholes: 415 | # Compute peephole connections 416 | ingate += cell_previous*W_cell_to_ingate 417 | forgetgate += cell_previous*W_cell_to_forgetgate 418 | 419 | # Apply nonlinearities 420 | ingate = self.nonlinearity_ingate(ingate) 421 | forgetgate = self.nonlinearity_forgetgate(forgetgate) 422 | cell_input = self.nonlinearity_cell(cell_input) 423 | outgate = self.nonlinearity_outgate(outgate) 424 | 425 | # Compute new cell value 426 | cell = forgetgate*cell_previous + ingate*cell_input 427 | 428 | if self.peepholes: 429 | outgate += cell*W_cell_to_outgate 430 | 431 | # W_align: (num_units, aln_num_units) 432 | # U_align: (num_feats, aln_num_units) 433 | # v_align: (aln_num_units, 1) 434 | # hUa: (BS, Seqlen, aln_num_units) 435 | # hid: (BS, num_units_dec) 436 | # input: (BS, Seqlen, num_inputs) 437 | 438 | # Compute new hidden unit activation 439 | hid = outgate*self.nonlinearity_out(cell) 440 | 441 | #compute (unormalized) attetion vector 442 | sWa = T.dot(hid, W_align) # (BS, aln_num_units) 443 | sWa = sWa.dimshuffle(0, 'x', 1) # 
(BS, 1, aln_num_units) 444 | tanh_sWahUa = self.nonlinearity_align(sWa + hUa) 445 | # (BS, seqlen, num_units_aln) 446 | 447 | # CALCULATE WEIGHT FOR EACH HIDDEN STATE VECTOR 448 | a = T.dot(tanh_sWahUa, v_align) # (BS, Seqlen, 1) 449 | a = T.reshape(a, (a.shape[0], a.shape[1])) 450 | # # (BS, Seqlen) 451 | # # ->(BS, seq_len) 452 | #a = a.squeeze() 453 | #a = a*a 454 | #a = a*mask - (1-mask)*10000 #this line does not work 455 | #a = T.reshape(a, (input.shape[0], input.shape[1])) 456 | 457 | #alpha = T.nnet.softmax(a) 458 | #alpha = T.reshape(alpha, (input.shape[0], input.shape[1])) 459 | 460 | # 461 | # # create alpha in dim (batch_size, seq_len, 1) 462 | 463 | # 464 | #weighted_hidden = input * alpha.dimshuffle(0, 1, 'x') 465 | #weighted_hidden = T.sum(weighted_hidden, axis=1) #sum seqlen out 466 | 467 | return [cell, hid, a] 468 | 469 | sequences = [] 470 | step_fun = step 471 | 472 | ones = T.ones((num_batch, 1)) 473 | if isinstance(self.cell_init, T.TensorVariable): 474 | cell_init = self.cell_init 475 | else: 476 | # Dot against a 1s vector to repeat to shape (num_batch, num_units) 477 | cell_init = T.dot(ones, self.cell_init) 478 | 479 | if isinstance(self.hid_init, T.TensorVariable): 480 | hid_init = self.hid_init 481 | else: 482 | # Dot against a 1s vector to repeat to shape (num_batch, num_units) 483 | hid_init = T.dot(ones, self.hid_init) 484 | 485 | #weighted_hidden_init = T.zeros((num_batch, input.shape[2])) 486 | alpha_init = T.zeros((num_batch, encode_seqlen)) 487 | 488 | # The hidden-to-hidden weight matrix is always used in step 489 | 490 | hUa = T.dot(input, self.U_align) # (num_batch, seq_len, num_units_aln) 491 | 492 | non_seqs = [hUa, self.W_align, self.v_align, 493 | self.W_hid_stacked] 494 | # The "peephole" weight matrices are only used when self.peepholes=True 495 | if self.peepholes: 496 | non_seqs += [self.W_cell_to_ingate, 497 | self.W_cell_to_forgetgate, 498 | self.W_cell_to_outgate] 499 | # theano.scan only allows for positional 
arguments, so when 500 | # self.peepholes is False, we need to supply fake placeholder arguments 501 | # for the three peephole matrices. 502 | else: 503 | non_seqs += [(), (), ()] 504 | # When we aren't precomputing the input outside of scan, we need to 505 | # provide the input weights and biases to the step function 506 | non_seqs += [self.b_stacked] 507 | 508 | if self.unroll_scan: 509 | # Explicitly unroll the recurrence instead of using scan 510 | cell_out, hid_out, a_out = unroll_scan( 511 | fn=step_fun, 512 | sequences=sequences, 513 | outputs_info=[cell_init, hid_init, alpha_init], 514 | go_backwards=self.backwards, 515 | non_sequences=non_seqs, 516 | n_steps=self.n_decodesteps) 517 | else: 518 | # Scan op iterates over first dimension of input and repeatedly 519 | # applies the step function 520 | cell_out, hid_out, a_out = theano.scan( 521 | fn=step_fun, 522 | sequences=sequences, 523 | outputs_info=[cell_init, hid_init, alpha_init], 524 | go_backwards=self.backwards, 525 | truncate_gradient=self.gradient_steps, 526 | non_sequences=non_seqs, 527 | n_steps=self.n_decodesteps, 528 | strict=True)[0] 529 | 530 | # dimshuffle back to (n_batch, n_time_steps, n_features)) 531 | 532 | #a_out - (n_decodesteps, bs, seqlen) 533 | #hid_out - (n_decode_steps, bs, num_units) 534 | 535 | 536 | # mask: (BS, encode_seqlen 537 | # a_out; (n_decodesteps, BS, encode_seqlen) 538 | cell_out = cell_out.dimshuffle(1, 0, 2) 539 | mask = mask.dimshuffle(0, 'x', 1) 540 | a_out = a_out.dimshuffle(1, 0, 2) # (BS, n_decodesteps, encode_seqlen) 541 | 542 | # set masked positions to large negative value 543 | a_out = a_out*mask - (1-mask)*10000 544 | 545 | # normalize over encode_seqlen (->large negative values = 0) 546 | a_out = T.reshape(a_out, (num_batch*self.n_decodesteps, encode_seqlen)) 547 | alpha = T.nnet.softmax(a_out) 548 | alpha = T.reshape(alpha, (num_batch, self.n_decodesteps, encode_seqlen)) 549 | 550 | # (BS, encode_seqlen, num_units) -> (BS, num_units, 1 encode_seqlen,) 
class LSTMAttentionDecodeFeedbackLayer(MergeLayer):
    r"""LSTM decoder that re-attends over an encoded sequence at every step.

    The layer receives the encoder output ``x`` (one vector ``x_j`` per
    encoder position) and runs an LSTM for a fixed number of decode steps.
    The LSTM has no per-step input of its own: at every step the previous
    hidden state is used to compute attention weights over the encoder
    positions, and the resulting weighted sum of encoder vectors is fed into
    the gates.

    .. math ::

        e_{tj} &= v_a^\top \sigma_a(W_a h_{t-1} + U_a x_j)\\
        \alpha_t &= \mathrm{softmax}(e_t)
            \quad\text{(masked positions pushed to } -10000)\\
        z_t &= \sum_j \alpha_{tj} x_j\\
        i_t &= \sigma_i(W_{zi}z_t + W_{hi}h_{t-1}
               + w_{ci}\odot c_{t-1} + b_i)\\
        f_t &= \sigma_f(W_{zf}z_t + W_{hf}h_{t-1}
               + w_{cf}\odot c_{t-1} + b_f)\\
        c_t &= f_t \odot c_{t - 1}
               + i_t\sigma_c(W_{zc}z_t + W_{hc} h_{t-1} + b_c)\\
        o_t &= \sigma_o(W_{zo}z_t + W_{ho}h_{t-1} + w_{co}\odot c_t + b_o)\\
        h_t &= o_t \odot \sigma_h(c_t)

    Parameters
    ----------
    incoming : a :class:`lasagne.layers.Layer` instance or a tuple
        The encoder sequence, shape ``(batch, encode_seqlen, ...)``. All
        dimensions after the second are flattened into ``num_inputs``
        features.
    num_units : int
        Number of hidden/cell units in the decoder LSTM.
    aln_num_units : int
        Size of the hidden layer of the alignment (attention) network.
    n_decodesteps : int
        Number of decode steps returned by the layer.
    W_align : initializer
        Projection of the previous decoder hidden state, shape
        ``(num_units, aln_num_units)`` (:math:`W_a`).
    U_align : initializer
        Projection of the encoder vectors, shape
        ``(num_inputs, aln_num_units)`` (:math:`U_a`).
    v_align : initializer
        Alignment output vector, shape ``(aln_num_units, 1)``
        (:math:`v_a`).
    U_conv_align : initializer
        Accepted for signature compatibility; not used by this layer.
    nonlinearity_align : callable or None
        Nonlinearity of the alignment network (:math:`\sigma_a`). If None,
        the identity is used.
    W_hid_to_ingate, W_hid_to_forgetgate, W_hid_to_cell, W_hid_to_outgate :
        initializer
        Hidden-to-gate weights, shape ``(num_units, num_units)``. The same
        initializer is also used for the matching context-to-gate matrix of
        shape ``(num_inputs, num_units)``, so a callable initializer is the
        safe choice unless ``num_inputs == num_units``.
    W_cell_to_ingate, W_cell_to_forgetgate, W_cell_to_outgate : initializer
        Peephole weight vectors, shape ``(num_units,)``. Only created when
        `peepholes` is True.
    b_ingate, b_forgetgate, b_cell, b_outgate : initializer
        Gate bias vectors, shape ``(num_units,)``.
    nonlinearity_ingate, nonlinearity_forgetgate, nonlinearity_cell,
    nonlinearity_outgate, nonlinearity_out : callable or None
        Nonlinearities :math:`\sigma_i, \sigma_f, \sigma_c, \sigma_o,
        \sigma_h`. If None is provided, no nonlinearity is applied.
    cell_init : callable, np.ndarray, theano.shared or TensorVariable
        Initial cell state (:math:`c_0`). A TensorVariable must have shape
        ``(num_batch, num_units)``; in that mode `learn_init` is ignored for
        the cell state.
    hid_init : callable, np.ndarray, theano.shared or TensorVariable
        Initial hidden state (:math:`h_0`), same rules as `cell_init`.
    backwards : bool
        If True, the step outputs are returned in reversed order.
    learn_init : bool
        If True, initial hidden/cell values created by this layer are
        trainable.
    peepholes : bool
        If True, the LSTM uses peephole connections.
    gradient_steps : int
        Number of timesteps to include in the backpropagated gradient.
        If -1, backpropagate through the entire sequence. Ignored when
        `unroll_scan` is True.
    grad_clipping : False or float
        If a float is provided, the gradient messages through the gate
        pre-activations are clipped to ``[-grad_clipping, grad_clipping]``.
    unroll_scan : bool
        If True the recursion is unrolled instead of using scan. Requires
        the encoder sequence length to be known.
    attention_softmax_function : callable
        Function that normalises the masked attention scores of shape
        ``(batch, encode_seqlen)``.
    decode_pre_steps : int
        Number of extra warm-up steps run before the `n_decodesteps` steps
        that are returned; their outputs are discarded.
    return_decodehid : bool
        If True the layer outputs the decoder hidden states, otherwise the
        attention-weighted encoder vectors.
    mask_input : :class:`lasagne.layers.Layer` or None
        Optional layer providing a ``(batch, encode_seqlen)`` mask over the
        encoder positions.

    References
    ----------
    .. [1] Graves, Alex: "Generating sequences with recurrent neural networks."
           arXiv preprint arXiv:1308.0850 (2013).
    """
    def __init__(self, incoming,
                 num_units,
                 aln_num_units,
                 n_decodesteps,
                 W_align=init.Normal(0.1),
                 U_align=init.Normal(0.1),
                 v_align=init.Normal(0.1),
                 U_conv_align=init.Normal(0.1),
                 nonlinearity_align=nonlinearities.tanh,
                 W_hid_to_ingate=init.Normal(0.1),
                 W_cell_to_ingate=init.Normal(0.1),
                 b_ingate=init.Constant(0.),
                 nonlinearity_ingate=nonlinearities.sigmoid,
                 W_hid_to_forgetgate=init.Normal(0.1),
                 W_cell_to_forgetgate=init.Normal(0.1),
                 b_forgetgate=init.Constant(0.),
                 nonlinearity_forgetgate=nonlinearities.sigmoid,
                 W_hid_to_cell=init.Normal(0.1),
                 b_cell=init.Constant(0.),
                 nonlinearity_cell=nonlinearities.tanh,
                 W_hid_to_outgate=init.Normal(0.1),
                 W_cell_to_outgate=init.Normal(0.1),
                 b_outgate=init.Constant(0.),
                 nonlinearity_outgate=nonlinearities.sigmoid,
                 nonlinearity_out=nonlinearities.tanh,
                 cell_init=init.Constant(0.),
                 hid_init=init.Constant(0.),
                 backwards=False,
                 learn_init=False,
                 peepholes=True,
                 gradient_steps=-1,
                 grad_clipping=False,
                 unroll_scan=False,
                 attention_softmax_function=T.nnet.softmax,
                 decode_pre_steps=0,
                 return_decodehid=False,
                 mask_input=None,
                 **kwargs):

        # Initialize parent layer; the mask (if any) is the second incoming.
        incomings = [incoming]
        if mask_input is not None:
            incomings.append(mask_input)
        super(LSTMAttentionDecodeFeedbackLayer, self).__init__(
            incomings, **kwargs)

        # For any of the nonlinearities, if None is supplied, use identity
        def _or_identity(nonlinearity):
            if nonlinearity is None:
                return nonlinearities.identity
            return nonlinearity

        self.nonlinearity_ingate = _or_identity(nonlinearity_ingate)
        self.nonlinearity_forgetgate = _or_identity(nonlinearity_forgetgate)
        self.nonlinearity_cell = _or_identity(nonlinearity_cell)
        self.nonlinearity_outgate = _or_identity(nonlinearity_outgate)
        self.nonlinearity_out = _or_identity(nonlinearity_out)
        self.nonlinearity_align = _or_identity(nonlinearity_align)

        self.attention_softmax_function = attention_softmax_function

        self.learn_init = learn_init
        self.num_units = num_units
        self.backwards = backwards
        self.peepholes = peepholes
        self.gradient_steps = gradient_steps
        self.grad_clipping = grad_clipping
        self.unroll_scan = unroll_scan
        self.n_decodesteps = n_decodesteps
        self.aln_num_units = aln_num_units
        self.decode_pre_steps = decode_pre_steps
        self.return_decodehid = return_decodehid

        input_shape = self.input_shapes[0]
        if unroll_scan and input_shape[1] is None:
            raise ValueError("Input sequence length cannot be specified as "
                             "None when unroll_scan is True")

        # Everything after (batch, seqlen) counts as encoder features.
        num_inputs = np.prod(input_shape[2:])
        self.num_inputs = num_inputs

        # NOTE: the order of the add_param calls below defines the order of
        # get_params(); keep it stable so stored parameter lists still load.

        # Hidden-to-gate weights and gate biases.
        self.W_hid_to_ingate = self.add_param(
            W_hid_to_ingate, (num_units, num_units), name="W_hid_to_ingate")

        self.b_ingate = self.add_param(
            b_ingate, (num_units,), name="b_ingate", regularizable=False)

        self.W_hid_to_forgetgate = self.add_param(
            W_hid_to_forgetgate, (num_units, num_units),
            name="W_hid_to_forgetgate")

        self.b_forgetgate = self.add_param(
            b_forgetgate, (num_units,), name="b_forgetgate",
            regularizable=False)

        self.W_hid_to_cell = self.add_param(
            W_hid_to_cell, (num_units, num_units), name="W_hid_to_cell")

        self.b_cell = self.add_param(
            b_cell, (num_units,), name="b_cell", regularizable=False)

        self.W_hid_to_outgate = self.add_param(
            W_hid_to_outgate, (num_units, num_units), name="W_hid_to_outgate")

        self.b_outgate = self.add_param(
            b_outgate, (num_units,), name="b_outgate", regularizable=False)

        # Context-to-gate weights. They reuse the W_hid_to_* initializers
        # but have shape (num_inputs, num_units).
        self.W_weightedhid_to_ingate = self.add_param(
            W_hid_to_ingate, (num_inputs, num_units),
            name="W_weightedhid_to_ingate")

        self.W_weightedhid_to_forgetgate = self.add_param(
            W_hid_to_forgetgate, (num_inputs, num_units),
            name="W_weightedhid_to_forgetgate")

        self.W_weightedhid_to_cell = self.add_param(
            W_hid_to_cell, (num_inputs, num_units),
            name="W_weightedhid_to_cell")

        self.W_weightedhid_to_outgate = self.add_param(
            W_hid_to_outgate, (num_inputs, num_units),
            name="W_weightedhid_to_outgate")

        # Stack the per-gate matrices into (*, 4*num_units) matrices so each
        # step needs a single dot product per source.
        self.W_hid_stacked = T.concatenate(
            [self.W_hid_to_ingate, self.W_hid_to_forgetgate,
             self.W_hid_to_cell, self.W_hid_to_outgate], axis=1)

        self.W_weightedhid_stacked = T.concatenate(
            [self.W_weightedhid_to_ingate, self.W_weightedhid_to_forgetgate,
             self.W_weightedhid_to_cell, self.W_weightedhid_to_outgate],
            axis=1)

        # Stack biases into a (4*num_units) vector
        self.b_stacked = T.concatenate(
            [self.b_ingate, self.b_forgetgate,
             self.b_cell, self.b_outgate], axis=0)

        # If peephole (cell to gate) connections were enabled, initialize
        # peephole connections. These are elementwise products with the cell
        # state, so they are represented as vectors.
        if self.peepholes:
            self.W_cell_to_ingate = self.add_param(
                W_cell_to_ingate, (num_units, ), name="W_cell_to_ingate")

            self.W_cell_to_forgetgate = self.add_param(
                W_cell_to_forgetgate, (num_units, ),
                name="W_cell_to_forgetgate")

            self.W_cell_to_outgate = self.add_param(
                W_cell_to_outgate, (num_units, ), name="W_cell_to_outgate")

        # Alignment (attention) network parameters.
        self.W_align = self.add_param(W_align, (num_units, self.aln_num_units),
                                      name="AlignSeqOutputLayer: (aln) W_a")
        self.U_align = self.add_param(U_align, (num_inputs, self.aln_num_units),
                                      name="AlignSeqOutputLayer: (aln) U_a")
        self.v_align = self.add_param(v_align, (self.aln_num_units, 1),
                                      name="AlignSeqOutputLayer: v_a")

        # Setup initial values for the cell and the hidden units
        if isinstance(cell_init, T.TensorVariable):
            if cell_init.ndim != 2:
                raise ValueError(
                    "When cell_init is provided as a TensorVariable, it should"
                    " have 2 dimensions and have shape (num_batch, num_units)")
            self.cell_init = cell_init
        else:
            self.cell_init = self.add_param(
                cell_init, (1, num_units), name="cell_init",
                trainable=learn_init, regularizable=False)

        if isinstance(hid_init, T.TensorVariable):
            if hid_init.ndim != 2:
                raise ValueError(
                    "When hid_init is provided as a TensorVariable, it should "
                    "have 2 dimensions and have shape (num_batch, num_units)")
            self.hid_init = hid_init
        else:
            self.hid_init = self.add_param(
                hid_init, (1, self.num_units), name="hid_init",
                trainable=learn_init, regularizable=False)

    def get_output_shape_for(self, input_shapes):
        """Return the output shape ``(batch, None, features)``.

        The feature size follows what ``get_output_for`` returns:
        ``num_units`` for the decoder hidden states (``return_decodehid``),
        otherwise ``num_inputs`` for the attention-weighted encoder vectors.
        The time dimension is left unspecified.
        """
        input_shape = input_shapes[0]
        if self.return_decodehid:
            num_features = self.num_units
        else:
            num_features = self.num_inputs
        return input_shape[0], None, num_features

    def get_params(self, **tags):
        """Return this layer's parameters, filtered by `tags`.

        Delegates unchanged to the parent implementation.
        """
        return super(LSTMAttentionDecodeFeedbackLayer, self).get_params(**tags)

    def get_output_for(self, inputs, **kwargs):
        """
        Compute this layer's output function given symbolic input variables

        Parameters
        ----------
        inputs : list of theano.TensorType
            ``inputs[0]`` is the encoder sequence of shape
            ``(n_batch, encode_seqlen, ...)``. ``inputs[1]``, when present,
            is the mask: a matrix of shape ``(n_batch, encode_seqlen)``
            where ``mask[i, j] = 1`` when ``j <= (length of sequence i)``
            and ``mask[i, j] = 0`` when ``j > (length of sequence i)``.
            Without a mask all encoder positions are attended to.

        Returns
        -------
        layer_output : theano.TensorType
            The decoder hidden states ``(n_batch, n_decodesteps, num_units)``
            when ``return_decodehid`` is True, otherwise the
            attention-weighted encoder vectors
            ``(n_batch, n_decodesteps, num_inputs)``.

        Notes
        -----
        As a side effect the symbolic results are stored on the layer as
        ``hid_out``, ``cell_out``, ``weighted_hidden_out`` and ``alpha``.
        """
        enc_states = inputs[0]
        # Retrieve the mask when it is supplied
        mask = inputs[1] if len(inputs) > 1 else None

        # Treat all dimensions after the second as flattened feature dimensions
        if enc_states.ndim > 3:
            enc_states = enc_states.reshape(
                (enc_states.shape[0], enc_states.shape[1],
                 T.prod(enc_states.shape[2:])))
        num_batch = enc_states.shape[0]
        encode_seqlen = enc_states.shape[1]

        if mask is None:
            mask = T.ones((num_batch, encode_seqlen), dtype='float32')

        # The stacked pre-activations are (n_batch, 4*num_units); this
        # extracts the slice that belongs to gate number n.
        def slice_w(x, n):
            return x[:, n*self.num_units:(n+1)*self.num_units]

        # Single recurrent step. The first four arguments are the recurrent
        # outputs of the previous step (outputs_info), the rest are the
        # non-sequences in the order they are listed in `non_seqs` below.
        def step(cell_previous, hid_previous, alpha_prev, weighted_hidden_prev,
                 enc_states, mask, hUa, W_align, v_align,
                 W_hid_stacked, W_weightedhid_stacked, W_cell_to_ingate,
                 W_cell_to_forgetgate, W_cell_to_outgate,
                 b_stacked, *args):

            # Unnormalised attention scores over the encoder positions.
            sWa = T.dot(hid_previous, W_align)       # (BS, aln_num_units)
            sWa = sWa.dimshuffle(0, 'x', 1)          # (BS, 1, aln_num_units)
            align_act = sWa + hUa                    # (BS, seqlen, aln_num_units)
            tanh_sWahUa = self.nonlinearity_align(align_act)

            a = T.dot(tanh_sWahUa, v_align)          # (BS, seqlen, 1)
            a = T.reshape(a, (a.shape[0], a.shape[1]))  # (BS, seqlen)

            # Push masked positions to a large negative value so that they
            # receive (almost) no weight after normalisation.
            a = a*mask - (1-mask)*10000

            alpha = self.attention_softmax_function(a)

            # Attention-weighted sum of the encoder vectors:
            # (BS, seqlen, num_inputs) -> (BS, num_inputs)
            weighted_hidden = enc_states * alpha.dimshuffle(0, 1, 'x')
            weighted_hidden = T.sum(weighted_hidden, axis=1)

            # Gate pre-activations from the previous hidden state ...
            # (BS, num_units) x (num_units, 4*num_units)
            gates = T.dot(hid_previous, W_hid_stacked) + b_stacked
            # ... and from the attention context
            # (BS, num_inputs) x (num_inputs, 4*num_units)
            gates += T.dot(weighted_hidden, W_weightedhid_stacked)

            # Clip gradients
            if self.grad_clipping is not False:
                gates = theano.gradient.grad_clip(
                    gates, -self.grad_clipping, self.grad_clipping)

            # Extract the pre-activation gate values
            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            cell_input = slice_w(gates, 2)
            outgate = slice_w(gates, 3)

            if self.peepholes:
                # Compute peephole connections
                ingate += cell_previous*W_cell_to_ingate
                forgetgate += cell_previous*W_cell_to_forgetgate

            # Apply nonlinearities
            ingate = self.nonlinearity_ingate(ingate)
            forgetgate = self.nonlinearity_forgetgate(forgetgate)
            cell_input = self.nonlinearity_cell(cell_input)

            # Compute new cell value
            cell = forgetgate*cell_previous + ingate*cell_input

            # The output gate peeks at the *new* cell state, and the peephole
            # term belongs inside the gate nonlinearity (see the o_t formula
            # in the class docstring).
            if self.peepholes:
                outgate += cell*W_cell_to_outgate
            outgate = self.nonlinearity_outgate(outgate)

            # Compute new hidden unit activation
            hid = outgate*self.nonlinearity_out(cell)

            return [cell, hid, alpha, weighted_hidden]

        # The decoder has no per-step input sequence.
        sequences = []
        step_fun = step

        ones = T.ones((num_batch, 1))
        if isinstance(self.cell_init, T.TensorVariable):
            cell_init = self.cell_init
        else:
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            cell_init = T.dot(ones, self.cell_init)

        if isinstance(self.hid_init, T.TensorVariable):
            hid_init = self.hid_init
        else:
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            hid_init = T.dot(ones, self.hid_init)

        alpha_init = T.zeros((num_batch, encode_seqlen))
        weighted_hidden_init = T.zeros((num_batch, self.num_inputs))

        # The encoder projection does not depend on the decoder state, so it
        # is computed once outside the recurrence.
        hUa = T.dot(enc_states, self.U_align)  # (BS, seqlen, aln_num_units)

        non_seqs = [enc_states, mask, hUa, self.W_align, self.v_align,
                    self.W_hid_stacked, self.W_weightedhid_stacked]
        # The "peephole" weight matrices are only used when self.peepholes=True
        if self.peepholes:
            non_seqs += [self.W_cell_to_ingate,
                         self.W_cell_to_forgetgate,
                         self.W_cell_to_outgate]
        # theano.scan only allows for positional arguments, so when
        # self.peepholes is False, we need to supply fake placeholder arguments
        # for the three peephole matrices.
        else:
            non_seqs += [(), (), ()]

        non_seqs += [self.b_stacked]

        n_steps = self.n_decodesteps + self.decode_pre_steps
        outputs_info = [cell_init, hid_init, alpha_init, weighted_hidden_init]

        if self.unroll_scan:
            # Explicitly unroll the recurrence instead of using scan
            cell_out, hid_out, alpha_out, weighted_hidden_out = unroll_scan(
                fn=step_fun,
                sequences=sequences,
                outputs_info=outputs_info,
                go_backwards=self.backwards,
                non_sequences=non_seqs,
                n_steps=n_steps)
        else:
            # Scan repeatedly applies the step function for n_steps steps
            cell_out, hid_out, alpha_out, weighted_hidden_out = theano.scan(
                fn=step_fun,
                sequences=sequences,
                outputs_info=outputs_info,
                go_backwards=self.backwards,
                truncate_gradient=self.gradient_steps,
                non_sequences=non_seqs,
                n_steps=n_steps,
                strict=True)[0]

        # Scan stacks results along the step axis first; move the batch axis
        # to the front.
        cell_out = cell_out.dimshuffle(1, 0, 2)    # (BS, n_steps, num_units)
        hid_out = hid_out.dimshuffle(1, 0, 2)      # (BS, n_steps, num_units)
        alpha_out = alpha_out.dimshuffle(1, 0, 2)  # (BS, n_steps, seqlen)
        # (BS, n_steps, num_inputs)
        weighted_hidden_out = weighted_hidden_out.dimshuffle(1, 0, 2)

        # Drop the warm-up steps. This is done before any reversal because
        # the warm-up steps are the first ones the recurrence executes.
        if self.decode_pre_steps > 0:
            pre = self.decode_pre_steps
            hid_out = hid_out[:, pre:]
            cell_out = cell_out[:, pre:]
            weighted_hidden_out = weighted_hidden_out[:, pre:]
            alpha_out = alpha_out[:, pre:]

        # if scan is backward reverse the output
        if self.backwards:
            hid_out = hid_out[:, ::-1]
            cell_out = cell_out[:, ::-1]
            weighted_hidden_out = weighted_hidden_out[:, ::-1]
            alpha_out = alpha_out[:, ::-1]

        self.hid_out = hid_out
        self.cell_out = cell_out
        self.weighted_hidden_out = weighted_hidden_out
        self.alpha = alpha_out

        if self.return_decodehid:
            return hid_out
        else:
            return weighted_hidden_out
}, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "# Variational Autoencoders (VAE)\n", 27 | "\n", 28 | "In this exercise we'll implement a variational autoencoder. Very briefly, an autoencoder encodes some input into a new and usually more compact representation which can be used to reconstruct the input data again. A variational autoencoder makes the further assumption that the compact representation follows a probabilistic distribution (usually a Gaussian), which makes it possible to sample new data from a trained variational autoencoder. The \"variational\" part of the name comes from the fact that these models are trained using variational inference.\n", 29 | "\n", 30 | "The mathematical details of the training can be a bit challenging; however, we believe that probabilistic deep learning will be an important part of future deep learning developments, which is why we find it important to introduce the concepts.\n", 31 | "\n", 32 | "As background material we recommend reading [Tutorial on Variational Autoencoder](http://arxiv.org/abs/1606.05908). 
For the implementation of the model you must read the article \"Auto-Encoding Variational Bayes\", Kingma & Welling, ICLR 2014: http://arxiv.org/pdf/1312.6114v10.pdf and \"Stochastic Backpropagation and Approximate Inference in Deep Generative Models\", Rezende et al, ICML 2014:\n", 33 | "http://arxiv.org/pdf/1401.4082v3.pdf\n", 34 | "\n" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "## VAE crash course\n", 42 | "\n", 43 | "VAEs consist of two parts:\n", 44 | "\n", 45 | " * Encoder (also known as recognition, inference or Q-model): Maps the input data into a probabilistic latent space by calculating the mean and variance parameters of a Gaussian distribution as a function of the input data x: $q(z|x) = \\mathcal{N}(z|\\mu_\\theta(x), \\sigma_\\phi(x)I)$\n", 46 | " * Decoder (also known as generative or P-model): Reconstructs the input image using a sample from the latent space defined by the encoder model: $p(x|z)$\n", 47 | "\"Drawing\"\n", 48 | "\n", 49 | "\n", 50 | "In more mathematical detail we have (this can be a bit challenging)\n", 51 | "\n", 52 | "$p(x) = \\int_z p(x|z)p(z)dz$\n", 53 | "\n", 54 | "$p(x) = \\int_z p(x|z)p(z)\\frac{q(z|x)}{q(z|x)}dz$\n", 55 | "\n", 56 | "\n", 57 | "$p(x) = \\int_z q(z|x) \\frac{p(x|z)p(z)}{q(z|x)}dz$\n", 58 | "\n", 59 | "\n", 60 | "$\\log p(x) = \\log \\int_z q(z|x) \\frac{p(x|z)p(z)}{q(z|x)}dz$\n", 61 | "\n", 62 | "$\\log p(x) \\geq \\int_z q(z|x)\\log \\frac{p(x|z)p(z)}{q(z|x)}dz$\n", 63 | "\n", 64 | "This is known as the variational lower bound (the inequality follows from Jensen's inequality, since the logarithm is concave). 
We continue with a bit of rewriting\n", 65 | "\n", 66 | "$\\log p(x) \\geq E_{q(z|x)} \\left[\\log \\frac{p(x|z)p(z)}{q(z|x)}\\right]$\n", 67 | "\n", 68 | "$\\log p(x) \\geq E_{q(z|x)} \\left[\\log p(x|z)\\right] - KL(q(z|x) \\| p(z))$\n", 69 | "\n", 70 | "Here the first term on the right hand side is the data reconstruction term and the second term is the Kullback-Leibler divergence between the approximate posterior $q(z|x)$ and the prior $p(z)$, which acts as a probabilistic regularizer.\n", 71 | "\n", 72 | "### Training a VAE \n", 73 | "The VAE is similar to a deterministic autoencoder except that we assume that the hidden units follow some distribution. Usually we just assume that the units are independent and standard Gaussian distributed.\n", 74 | "\n", 75 | "Above we defined a lower bound on the log likelihood of the data. We can train the model by pushing up the lower bound, i.e. we do gradient ascent on the lower bound. By using the _reparameterization trick_ we can directly backprop through the model and optimize the lower bound. If you are interested in the technical details you can look at the references given above.\n", 76 | "\n", 77 | "### Setting up the network\n", 78 | "\n", 79 | "We set up the network like an autoencoder except that the bottleneck layer is the __SimpleSampleLayer__ which samples the hidden units. \n", 80 | "\n", 81 | "The lower bound is calculated in the ```LogLikelihood``` function. 
" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": false 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "#To speed up training we'll only work on a subset of the data\n", 93 | "#We discretize the data to 0 and 1 in order to use it with a bernoulli observation model p(x|z) = Ber(mu(z))\n", 94 | "\n", 95 | "def bernoullisample(x):\n", 96 | " return np.random.binomial(1,x,size=x.shape).astype(theano.config.floatX)\n", 97 | "\n", 98 | "\n", 99 | "data = np.load('mnist.npz')\n", 100 | "num_classes = 10\n", 101 | "x_train = bernoullisample(data['X_train'][:50000]).astype('float32')\n", 102 | "targets_train = data['y_train'][:50000].astype('int32')\n", 103 | "\n", 104 | "x_valid = bernoullisample(data['X_valid'][:500]).astype('float32')\n", 105 | "targets_valid = data['y_valid'][:500].astype('int32')\n", 106 | "\n", 107 | "x_test = bernoullisample(data['X_test'][:500]).astype('float32')\n", 108 | "targets_test = data['y_test'][:500].astype('int32')" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "#plot a few MNIST examples\n", 120 | "\n", 121 | "def plot_samples(x,title=''):\n", 122 | " idx = 0\n", 123 | " canvas = np.zeros((28*10, 10*28))\n", 124 | " for i in range(10):\n", 125 | " for j in range(10):\n", 126 | " canvas[i*28:(i+1)*28, j*28:(j+1)*28] = x[idx].reshape((28, 28))\n", 127 | " idx += 1\n", 128 | " plt.figure(figsize=(7, 7))\n", 129 | " plt.imshow(canvas, cmap='gray')\n", 130 | " plt.title(title)\n", 131 | " plt.show()\n", 132 | "\n", 133 | "plot_samples(x_train[:100],title='MNIST handwritten digits')" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "collapsed": false 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "#defined a couple of helper functions\n", 145 | "c = - 0.5 * 
math.log(2*math.pi)\n", 146 | "def log_bernoulli(x, p, eps=0.0):\n", 147 | " p = T.clip(p, eps, 1.0 - eps)\n", 148 | " return -T.nnet.binary_crossentropy(p, x)\n", 149 | "\n", 150 | "def kl_normal2_stdnormal(mean, log_var):\n", 151 | " return -0.5*(1 + log_var - mean**2 - T.exp(log_var))\n" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "Construct the lasagne layer." 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": { 165 | "collapsed": false 166 | }, 167 | "outputs": [], 168 | "source": [ 169 | "from lasagne.layers import InputLayer,DenseLayer,get_output, get_all_params\n", 170 | "from lasagne.nonlinearities import elu, identity, sigmoid\n", 171 | "from samplelayer import SimpleSampleLayer\n", 172 | "\n", 173 | "num_features = x_train.shape[-1]\n", 174 | "num_latent_z = 64\n", 175 | "\n", 176 | "#MODEL SPECIFICATION\n", 177 | "\n", 178 | "#ENCODER\n", 179 | "l_in_x = InputLayer(shape=(None, num_features))\n", 180 | "l_enc = DenseLayer(l_in_x, num_units=256, nonlinearity=elu)\n", 181 | "l_enc = DenseLayer(l_enc, num_units=256, nonlinearity=elu) \n", 182 | "l_muq = DenseLayer(l_enc, num_units=num_latent_z, nonlinearity=identity) #mu(x)\n", 183 | "l_logvarq = DenseLayer(l_enc, num_units=num_latent_z, nonlinearity=lambda x: T.clip(x,-10,10)) #logvar(x), \n", 184 | "\n", 185 | "l_z = SimpleSampleLayer(mean=l_muq, log_var=l_logvarq) #sample a latent representation z \\sim q(z|x) = N(mu(x),logvar(x))\n", 186 | "\n", 187 | "#we split the in two parts to allow sampling from the decoder model separately\n", 188 | "\n", 189 | "#DECODER\n", 190 | "l_in_z = InputLayer(shape=(None, num_latent_z))\n", 191 | "l_dec = DenseLayer(l_in_z, num_units=256, nonlinearity=elu) \n", 192 | "l_dec = DenseLayer(l_dec, num_units=256, nonlinearity=elu) \n", 193 | "l_mux = DenseLayer(l_dec, num_units=num_features, nonlinearity=sigmoid) #reconstruction of input using a sigmoid output since 
sym_x = T.matrix('x')
sym_z = T.matrix('z')

# Training graph: encode x, sample z and decode it again.
z_train, muq_train, logvarq_train = get_output(
    [l_z, l_muq, l_logvarq], {l_in_x: sym_x}, deterministic=False)
mux_train = get_output(l_mux, {l_in_z: z_train}, deterministic=False)

# Evaluation graph: same computation, built with deterministic=True.
z_eval, muq_eval, logvarq_eval = get_output(
    [l_z, l_muq, l_logvarq], {l_in_x: sym_x}, deterministic=True)
mux_eval = get_output(l_mux, {l_in_z: z_eval}, deterministic=True)

# Decoder only: maps an externally supplied z to the Bernoulli means.
mux_sample = get_output(l_mux, {l_in_z: sym_z}, deterministic=True)


# define the cost function

def LogLikelihood(mux, x, muq, logvarq):
    """Return (LL, log_px_given_z, KL_qp) averaged over the mini-batch.

    ``LL = log_px_given_z - KL_qp``, where ``log_px_given_z`` is the
    Bernoulli log-probability of ``x`` under the decoder means ``mux`` and
    ``KL_qp`` is the KL term computed from ``muq`` and ``logvarq``.
    """
    # Sum over the feature (pixel) axis, then average over the samples.
    log_px_given_z = log_bernoulli(x, mux, eps=1e-6).sum(axis=1).mean()
    # Sum over the latent axis, then average over the samples.
    KL_qp = kl_normal2_stdnormal(muq, logvarq).sum(axis=1).mean()
    LL = log_px_given_z - KL_qp
    return LL, log_px_given_z, KL_qp


LL_train, logpx_train, KL_train = LogLikelihood(
    mux_train, sym_x, muq_train, logvarq_train)
LL_eval, logpx_eval, KL_eval = LogLikelihood(
    mux_eval, sym_x, muq_eval, logvarq_eval)


all_params = get_all_params([l_z, l_mux], trainable=True)

# Let Theano do its magic and get all the gradients we need for training.
# LL is maximised, so the gradients are taken of its negation.
all_grads = T.grad(-LL_train, all_params)


# Set the update function for the parameters.
# You might want to experiment with other update schemes such as rmsprop,
# adadelta etc.
updates = lasagne.updates.adam(all_grads, all_params, learning_rate=1e-3)


f_train = theano.function(inputs=[sym_x],
                          outputs=[LL_train, logpx_train, KL_train],
                          updates=updates)

# Evaluation must use the *_eval expressions. Compiling it on the training
# outputs (as before) left the evaluation graph built above unused.
f_eval = theano.function(inputs=[sym_x],
                         outputs=[LL_eval, logpx_eval, KL_eval])

f_sample = theano.function(inputs=[sym_z],
                           outputs=[mux_sample])

f_recon = theano.function(inputs=[sym_x],
                          outputs=[mux_eval])


# Test the forward pass.
# f_eval is used on purpose: f_train would also apply a parameter update,
# i.e. take one optimisation step on the validation data.
print(f_eval(x_valid))


# plot some samples from the untrained model
z = np.random.normal(0, 1, size=(100, num_latent_z)).astype('float32')
mux_sample = f_sample(z)[0]

plot_samples(mux_sample, title='MNIST handwritten samples, untrained model')
# Train the model.
num_epochs = 10
batch_size = 64
# Integer division: a trailing partial mini-batch is not used.
num_batch_train = x_train.shape[0] // batch_size

# Per-epoch histories. NOTE: these names were previously bound to the symbolic
# cost expressions; rebinding them is safe only because f_train / f_eval have
# already been compiled.
LL_train, KL_train, logpx_train = [], [], []
LL_valid, KL_valid, logpx_valid = [], [], []

for e in range(num_epochs):
    _LL_train, _KL_train, _logpx_train = [], [], []
    for i in range(num_batch_train):
        out = f_train(x_train[batch_size * i:(i + 1) * batch_size])
        # out = [LL, logpx, KL_qp]
        _LL_train.append(out[0])
        _logpx_train.append(out[1])
        _KL_train.append(out[2])

    # Training statistics are the mean over the mini-batches of this epoch.
    LL_train.append(np.mean(_LL_train))
    KL_train.append(np.mean(_KL_train))
    logpx_train.append(np.mean(_logpx_train))

    # The validation set is evaluated in a single pass after every epoch.
    out = f_eval(x_valid)
    LL_valid.append(out[0])
    logpx_valid.append(out[1])
    KL_valid.append(out[2])

    # Parenthesised single-argument print works on Python 2 and Python 3.
    print("Epoch %i\t" % (e) +
          "Train: LL: %0.1f\tKL %0.1f\tlogpx: %0.1f\t" % (LL_train[-1], KL_train[-1], logpx_train[-1]) +
          "Valid: LL: %0.1f\tKL %0.1f\tlogpx: %0.1f" % (LL_valid[-1], KL_valid[-1], logpx_valid[-1]))


epoch = np.arange(len(LL_train))
plt.figure()
plt.plot(epoch, LL_train, 'r', epoch, LL_valid, 'b')
plt.legend(['Train LL', 'Val LL'], loc='best')
# One point per epoch is plotted, so the axis counts epochs, not updates.
plt.xlabel('Epochs')
plt.ylabel('LL')


# plot some samples from the trained model
mux_sample = f_sample(z)[0]
# Raw string keeps the LaTeX backslash without relying on an unknown escape.
plot_samples(mux_sample, title=r'MNIST handwritten samples, $z\sim p(z)$')
reconstructions from the trained model\n", 350 | "mux_recon = f_recon(x_test[:100])[0]\n", 351 | "plot_samples(mux_recon,title='MNIST handwritten reconstructions, $z\\sim q(z|x)$')\n" 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "metadata": {}, 357 | "source": [ 358 | "## Assignments\n", 359 | "Remember that the model defines the probability distribution $p(x,z) = p(x|z)p(z)$. We additionally have the inference network $q(z|x)$ which allows us to infer the latent variables, $z$, for specific input data values $x$.\n", 360 | "\n", 361 | "\n", 362 | "\n", 363 | "1. Explain how you could sample from the model. Which function does this in the code? \n", 364 | "2. Explain how you could get reconstructions from the model. Remember that you have the inference network $q(z|x)$.\n", 365 | "3. Use the original paper http://arxiv.org/pdf/1312.6114v10.pdf or [this blog](http://blog.shakirm.com/2015/10/machine-learning-trick-of-the-day-4-reparameterisation-tricks/) to explain what the reparameterization trick does. \n", 366 | "4. The VAE is a probabilistic model. We could model $p(x,z,y)$ where $y$ is the label information. How could this model handle semi-supervised learning? You can look at the papers https://arxiv.org/pdf/1406.5298.pdf or https://arxiv.org/pdf/1602.05473v4.pdf. 
import lasagne
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
import theano.tensor as T
import theano


def _fresh_seed():
    """Draw a new seed from Lasagne's global random number generator."""
    return lasagne.random.get_rng().randint(1, 2147462579)


class SimpleSampleLayer(lasagne.layers.MergeLayer):
    """
    Simple sampling layer drawing a single Monte Carlo sample to approximate
    E_q [log( p(x,z) / q(z|x) )]. This is the approach described in [KINGMA]_.

    Parameters
    ----------
    mean, log_var : :class:`Layer` instances
        Parameterizing the mean and log(variance) of the distribution to sample
        from as described in [KINGMA]_. The code assumes that these have the
        same number of dimensions.

    seed : int or None
        Seed for the random stream. If None (default) a fresh seed is drawn
        from Lasagne's global RNG when the layer is constructed.

    Methods
    ----------
    seed : Helper function to change the random seed after init is called

    References
    ----------
        ..  [KINGMA] Kingma, Diederik P., and Max Welling.
            "Auto-Encoding Variational Bayes."
            arXiv preprint arXiv:1312.6114 (2013).
    """
    def __init__(self, mean, log_var, seed=None, **kwargs):
        super(SimpleSampleLayer, self).__init__([mean, log_var], **kwargs)

        # The seed is drawn here rather than in the signature: a default
        # argument is evaluated only once, at import time, which would hand
        # the same seed to every layer and ignore lasagne.random.set_rng().
        self._srng = RandomStreams(_fresh_seed() if seed is None else seed)

    def seed(self, seed=None):
        """Re-seed the random stream; draws a fresh seed if ``seed`` is None."""
        self._srng.seed(_fresh_seed() if seed is None else seed)

    def get_output_shape_for(self, input_shapes):
        # The sample has the same shape as the mean input.
        return input_shapes[0]

    def get_output_for(self, input, **kwargs):
        # Reparameterised sample: z = mu + sigma * eps with eps ~ N(0, 1) and
        # sigma = exp(0.5 * log_var). No keyword argument is inspected, so a
        # sample is drawn regardless of any ``deterministic`` flag.
        mu, log_var = input
        eps = self._srng.normal(mu.shape)
        z = mu + T.exp(0.5 * log_var) * eps
        return z


class SampleLayer(lasagne.layers.MergeLayer):
    """
    Sampling layer supporting importance sampling as described in [BURDA]_ and
    multiple Monte Carlo samples for the approximation of
    E_q [log( p(x,z) / q(z|x) )].

    Parameters
    ----------
    mean : class:`Layer` instance
        Parameterizing the mean of the distribution to sample
        from as described in [BURDA]_.

    log_var : class:`Layer` instance
        By default assumed to parametrize log(sigma^2) of the distribution to
        sample from as described in [BURDA]_ which is transformed to sigma using
        the nonlinearity function as described below. Effectively this means
        that the nonlinearity function controls what log_var parametrizes. A few
        common examples:
        -nonlinearity = lambda x: T.exp(0.5*x) => log_var = log(sigma^2)[default]
        -nonlinearity = lambda x: T.sqrt(x) => log_var = sigma^2
        -nonlinearity = lambda x: x => log_var = sigma

    eq_samples : int or T.scalar
        Number of Monte Carlo samples used to estimate the expectation over
        q(z|x) in eq. (8) in [BURDA]_.

    iw_samples : int or T.scalar
        Number of importance samples in the sum over k in eq. (8) in [BURDA]_.

    nonlinearity : callable or None
        The nonlinearity that is applied to the log_var input layer to transform
        it into a standard deviation. By default we assume that
        log_var = log(sigma^2) and hence the corresponding nonlinearity is
        f(x) = T.exp(0.5*x) such that T.exp(0.5*log(sigma^2)) = sigma.
        If None, the identity is used (log_var = sigma).

    seed : int or None
        Seed for the random stream. If None (default) a fresh seed is drawn
        from Lasagne's global RNG when the layer is constructed.

    Methods
    ----------
    seed : Helper function to change the random seed after init is called

    References
    ----------
        ..  [BURDA] Burda, Yuri, Roger Grosse, and Ruslan Salakhutdinov.
            "Importance Weighted Autoencoders."
            arXiv preprint arXiv:1509.00519 (2015).
    """

    def __init__(self, mean, log_var,
                 eq_samples=1,
                 iw_samples=1,
                 nonlinearity=lambda x: T.exp(0.5*x),
                 seed=None,
                 **kwargs):
        super(SampleLayer, self).__init__([mean, log_var], **kwargs)

        self.eq_samples = eq_samples
        self.iw_samples = iw_samples
        # None is documented as accepted; map it to the identity so that
        # get_output_for never tries to call None.
        if nonlinearity is None:
            nonlinearity = lasagne.nonlinearities.identity
        self.nonlinearity = nonlinearity

        # Drawn per instance, not once at import time in the signature.
        self._srng = RandomStreams(_fresh_seed() if seed is None else seed)

    def seed(self, seed=None):
        """Re-seed the random stream; draws a fresh seed if ``seed`` is None."""
        self._srng.seed(_fresh_seed() if seed is None else seed)

    def get_output_shape_for(self, input_shapes):
        batch_size, num_latent = input_shapes[0]
        # The leading dimension is only known statically when the batch size
        # and both sample counts are plain Python ints.
        if isinstance(batch_size, int) and \
           isinstance(self.iw_samples, int) and \
           isinstance(self.eq_samples, int):
            out_dim = (batch_size*self.eq_samples*self.iw_samples, num_latent)
        else:
            out_dim = (None, num_latent)
        return out_dim

    def get_output_for(self, input, **kwargs):
        mu, log_var = input
        batch_size, num_latent = mu.shape
        eps = self._srng.normal(
            [batch_size, self.eq_samples, self.iw_samples, num_latent],
            dtype=theano.config.floatX)

        # Broadcast mean and standard deviation over the two sample axes.
        z = mu.dimshuffle(0, 'x', 'x', 1) + \
            self.nonlinearity(log_var.dimshuffle(0, 'x', 'x', 1)) * eps

        # Fold the sample axes into the batch axis.
        return z.reshape((-1, num_latent))


class SimpleBernoulliSampleLayer(lasagne.layers.Layer):
    """
    Simple sampling layer drawing samples from bernoulli distributions.

    Parameters
    ----------
    mean : :class:`Layer` instances
        Parameterizing the mean value of each bernoulli distribution
    seed : int or None
        Seed for the random stream. If None (default) a fresh seed is drawn
        from Lasagne's global RNG when the layer is constructed.
    Methods
    ----------
    seed : Helper function to change the random seed after init is called
    """

    def __init__(self, mean, seed=None, **kwargs):
        super(SimpleBernoulliSampleLayer, self).__init__(mean, **kwargs)

        # Drawn per instance, not once at import time in the signature.
        self._srng = RandomStreams(_fresh_seed() if seed is None else seed)

    def seed(self, seed=None):
        """Re-seed the random stream; draws a fresh seed if ``seed`` is None."""
        self._srng.seed(_fresh_seed() if seed is None else seed)

    def get_output_shape_for(self, input_shape):
        return input_shape

    def get_output_for(self, mu, **kwargs):
        # One binomial draw per entry, with success probability mu.
        return self._srng.binomial(size=mu.shape, p=mu, dtype=mu.dtype)


class BernoulliSampleLayer(lasagne.layers.Layer):
    """
    Bernoulli Sampling layer supporting importance sampling
    Parameters
    ----------
    mean : class:`Layer` instance
        Parameterizing the mean value of each bernoulli distribution
    eq_samples : int or T.scalar
        Number of Monte Carlo samples drawn per input row
    iw_samples : int or T.scalar
        Number of importance samples in the sum over k
    seed : int or None
        Seed for the random stream. If None (default) a fresh seed is drawn
        from Lasagne's global RNG when the layer is constructed.
    Methods
    ----------
    seed : Helper function to change the random seed after init is called
    """

    def __init__(self, mean,
                 eq_samples=1,
                 iw_samples=1,
                 seed=None,
                 **kwargs):
        super(BernoulliSampleLayer, self).__init__(mean, **kwargs)

        self.eq_samples = eq_samples
        self.iw_samples = iw_samples

        # Drawn per instance, not once at import time in the signature.
        self._srng = RandomStreams(_fresh_seed() if seed is None else seed)

    def seed(self, seed=None):
        """Re-seed the random stream; draws a fresh seed if ``seed`` is None."""
        self._srng.seed(_fresh_seed() if seed is None else seed)

    def get_output_shape_for(self, input_shape):
        batch_size, num_latent = input_shape
        # The leading dimension is only known statically when the batch size
        # and both sample counts are plain Python ints.
        if isinstance(batch_size, int) and \
           isinstance(self.iw_samples, int) and \
           isinstance(self.eq_samples, int):
            out_dim = (batch_size*self.eq_samples*self.iw_samples, num_latent)
        else:
            out_dim = (None, num_latent)
        return out_dim

    def get_output_for(self, input, **kwargs):
        mu = input
        batch_size, num_latent = mu.shape
        shp = (batch_size, self.eq_samples, self.iw_samples, num_latent)
        # Tile the means over the two sample axes so that p matches shp.
        mu_shp = mu.dimshuffle(0, 'x', 'x', 1)
        mu_shp = T.repeat(mu_shp, axis=1, repeats=self.eq_samples)
        mu_shp = T.repeat(mu_shp, axis=2, repeats=self.iw_samples)
        samples = self._srng.binomial(
            size=shp, p=mu_shp, dtype=theano.config.floatX)
        # Fold the sample axes into the batch axis.
        return samples.reshape((-1, num_latent))