├── .gitignore
├── README.md
├── lab1
    ├── confusionmatrix.py
    ├── lab1_FFN.ipynb
    └── mnist.npz
├── lab2
    ├── .ipynb_checkpoints
    │   └── lab2_CNN-checkpoint.ipynb
    ├── confusionmatrix.py
    ├── lab2_CNN.ipynb
    └── mnist.npz
├── lab3
    ├── .ipynb_checkpoints
    │   └── RNN-checkpoint.ipynb
    ├── RNN.ipynb
    ├── confusionmatrix.py
    ├── data_generator.py
    ├── decoder_attention.py
    └── enc-dec.png
└── lab6
    ├── .ipynb_checkpoints
        └── Lab6-checkpoint.ipynb
    ├── Lab6.ipynb
    ├── VAE.png
    ├── lab6
    ├── mnist.npz
    └── samplelayer.py


/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | *.pyc
3 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Nvidia Deep Learning Summercamp 2016
2 | by *Casper Sønderby, University of Copenhagen* 
3 | 
4 | Parts of the code are based on contributions from Lars Maaløe and Søren Kaae Sønderby
5 | 


--------------------------------------------------------------------------------
/lab1/confusionmatrix.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | 
  3 | 
  4 | class ConfusionMatrix:
  5 |     """
  6 |        Simple confusion matrix class
  7 |        row is the true class, column is the predicted class
  8 |     """
  9 |     def __init__(self, num_classes, class_names=None):
 10 |         self.n_classes = num_classes
 11 |         if class_names is None:
 12 |             self.class_names = map(str, range(num_classes))
 13 |         else:
 14 |             self.class_names = class_names
 15 | 
 16 |         # find max class_name and pad
 17 |         max_len = max(map(len, self.class_names))
 18 |         self.max_len = max_len
 19 |         for idx, name in enumerate(self.class_names):
 20 |             if len(self.class_names) < max_len:
 21 |                 self.class_names[idx] = name + " "*(max_len-len(name))
 22 | 
 23 |         self.mat = np.zeros((num_classes,num_classes),dtype='int')
 24 | 
 25 |     def __str__(self):
 26 |         # calucate row and column sums
 27 |         col_sum = np.sum(self.mat, axis=1)
 28 |         row_sum = np.sum(self.mat, axis=0)
 29 | 
 30 |         s = []
 31 | 
 32 |         mat_str = self.mat.__str__()
 33 |         mat_str = mat_str.replace('[','').replace(']','').split('\n')
 34 | 
 35 |         for idx, row in enumerate(mat_str):
 36 |             if idx == 0:
 37 |                 pad = " "
 38 |             else:
 39 |                 pad = ""
 40 |             class_name = self.class_names[idx]
 41 |             class_name = " " + class_name + " |"
 42 |             row_str = class_name + pad + row
 43 |             row_str += " |" + str(col_sum[idx])
 44 |             s.append(row_str)
 45 | 
 46 |         row_sum = [(self.max_len+4)*" "+" ".join(map(str, row_sum))]
 47 |         hline = [(1+self.max_len)*" "+"-"*len(row_sum[0])]
 48 | 
 49 |         s = hline + s + hline + row_sum
 50 | 
 51 |         # add linebreaks
 52 |         s_out = [line+'\n' for line in s]
 53 |         return "".join(s_out)
 54 | 
 55 |     def batch_add(self, targets, preds):
 56 |         assert targets.shape == preds.shape
 57 |         assert len(targets) == len(preds)
 58 |         assert max(targets) < self.n_classes
 59 |         assert max(preds) < self.n_classes
 60 |         targets = targets.flatten()
 61 |         preds = preds.flatten()
 62 |         for i in range(len(targets)):
 63 |                 self.mat[targets[i], preds[i]] += 1
 64 | 
 65 |     def get_errors(self):
 66 |         tp = np.asarray(np.diag(self.mat).flatten(),dtype='float')
 67 |         fn = np.asarray(np.sum(self.mat, axis=1).flatten(),dtype='float') - tp
 68 |         fp = np.asarray(np.sum(self.mat, axis=0).flatten(),dtype='float') - tp
 69 |         tn = np.asarray(np.sum(self.mat)*np.ones(self.n_classes).flatten(),
 70 |                         dtype='float') - tp - fn - fp
 71 |         return tp, fn, fp, tn
 72 | 
 73 |     def accuracy(self):
 74 |         """
 75 |         Calculates global accuracy
 76 |         :return: accuracy
 77 |         :example: >>> conf = ConfusionMatrix(3)
 78 |                   >>> conf.batchAdd([0,0,1],[0,0,2])
 79 |                   >>> print conf.accuracy()
 80 |         """
 81 |         tp, _, _, _ = self.get_errors()
 82 |         n_samples = np.sum(self.mat)
 83 |         return np.sum(tp) / n_samples
 84 | 
 85 |     def sensitivity(self):
 86 |         tp, tn, fp, fn = self.get_errors()
 87 |         res = tp / (tp + fn)
 88 |         res = res[~np.isnan(res)]
 89 |         return res
 90 | 
 91 |     def specificity(self):
 92 |         tp, tn, fp, fn = self.get_errors()
 93 |         res = tn / (tn + fp)
 94 |         res = res[~np.isnan(res)]
 95 |         return res
 96 | 
 97 |     def positive_predictive_value(self):
 98 |         tp, tn, fp, fn = self.get_errors()
 99 |         res = tp / (tp + fp)
100 |         res = res[~np.isnan(res)]
101 |         return res
102 | 
103 |     def negative_predictive_value(self):
104 |         tp, tn, fp, fn = self.get_errors()
105 |         res = tn / (tn + fn)
106 |         res = res[~np.isnan(res)]
107 |         return res
108 | 
109 |     def false_positive_rate(self):
110 |         tp, tn, fp, fn = self.get_errors()
111 |         res = fp / (fp + tn)
112 |         res = res[~np.isnan(res)]
113 |         return res
114 | 
115 |     def false_discovery_rate(self):
116 |         tp, tn, fp, fn = self.get_errors()
117 |         res = fp / (tp + fp)
118 |         res = res[~np.isnan(res)]
119 |         return res
120 | 
121 |     def F1(self):
122 |         tp, tn, fp, fn = self.get_errors()
123 |         res = (2*tp) / (2*tp + fp + fn)
124 |         res = res[~np.isnan(res)]
125 |         return res
126 | 
127 |     def matthews_correlation(self):
128 |         tp, tn, fp, fn = self.get_errors()
129 |         numerator = tp*tn - fp*fn
130 |         denominator = np.sqrt((tp + fp)*(tp + fn)*(tn + fp)*(tn + fn))
131 |         res = numerator / denominator
132 |         res = res[~np.isnan(res)]
133 |         return res
134 | 


--------------------------------------------------------------------------------
/lab1/lab1_FFN.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {
  7 |     "collapsed": false
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "%matplotlib inline\n",
 12 |     "import matplotlib\n",
 13 |     "import numpy as np\n",
 14 |     "import matplotlib.pyplot as plt\n",
 15 |     "import sklearn.datasets\n",
 16 |     "import theano\n",
 17 |     "import theano.tensor as T\n",
 18 |     "import lasagne\n",
 19 |     "\n",
 20 |     "def plot_decision_boundary(pred_func, X, y):\n",
 21 |     "    #from https://github.com/dennybritz/nn-from-scratch/blob/master/nn-from-scratch.ipynb\n",
 22 |     "    # Set min and max values and give it some padding\n",
 23 |     "    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5\n",
 24 |     "    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5\n",
 25 |     "    \n",
 26 |     "    h = 0.01\n",
 27 |     "    # Generate a grid of points with distance h between them\n",
 28 |     "    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n",
 29 |     "    \n",
 30 |     "    yy = yy.astype('float32')\n",
 31 |     "    xx = xx.astype('float32')\n",
 32 |     "    # Predict the function value for the whole gid\n",
 33 |     "    Z = pred_func(np.c_[xx.ravel(), yy.ravel()])[:,0]\n",
 34 |     "    Z = Z.reshape(xx.shape)\n",
 35 |     "    # Plot the contour and training examples\n",
 36 |     "    plt.figure()\n",
 37 |     "    plt.contourf(xx, yy, Z, cmap=plt.cm.RdBu)\n",
 38 |     "    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Spectral)\n",
 39 |     "\n",
 40 |     "def onehot(t, num_classes):\n",
 41 |     "    out = np.zeros((t.shape[0], num_classes))\n",
 42 |     "    for row, col in enumerate(t):\n",
 43 |     "        out[row, col] = 1\n",
 44 |     "    return out\n",
 45 |     "\n",
 46 |     "    "
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "markdown",
 51 |    "metadata": {},
 52 |    "source": [
 53 |     "# Neural networks 101\n",
 54 |     "In this notebook you will implement a simple neural network in Lasagne utilizing the automatic differentiation engine of Theano. We assume that you are already familiar with backpropagation (if not, please see [Andrej Karpathy](http://cs.stanford.edu/people/karpathy/) or [Michael Nielsen](http://neuralnetworksanddeeplearning.com/chap2.html)).\n",
 55 |     "We'll not spend much time on how Theano works, but you can refer to [this short tutorial](http://nbviewer.jupyter.org/github/craffel/theano-tutorial/blob/master/Theano%20Tutorial.ipynb) if you are interested.\n",
 56 |     "\n",
 57 |     "Additionally, for the ambitious people we have previously made an assignment where you will implement both the forward and backpropagation in a neural network by hand, https://github.com/DTU-deeplearning/day1-NN/blob/master/exercises_1.ipynb \n",
 58 |     "\n",
 59 |     "In this exercise we'll start right away by defining a logistic regression model in Lasagne/Theano. Some details of Theano can be a bit confusing, however you'll pick them up when you have worked with it for some time. For now you should pay most attention to the high-level network construction in Lasagne. We'll initially start with a simple 2-D and 2-class classification problem where the class decision boundary can be visualized. Initially we show that logistic regression can only separate classes linearly. Adding a non-linear hidden layer to the algorithm permits nonlinear class separation. If time permits we'll continue on to implement a fully connected neural network to classify the (in)famous MNIST dataset consisting of images of handwritten digits. \n"
 60 |    ]
 61 |   },
 62 |   {
 63 |    "cell_type": "markdown",
 64 |    "metadata": {},
 65 |    "source": [
 66 |     "# Problem \n",
 67 |     "We'll initially demonstrate that MLPs can classify non-linear problems whereas simple logistic regression cannot. For ease of visualization and computational speed we initially experiment on the simple 2D half-moon dataset."
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": null,
 73 |    "metadata": {
 74 |     "collapsed": false
 75 |    },
 76 |    "outputs": [],
 77 |    "source": [
 78 |     "# Generate a dataset and plot it\n",
 79 |     "np.random.seed(0)\n",
 80 |     "num_samples = 300\n",
 81 |     "\n",
 82 |     "X, y = sklearn.datasets.make_moons(num_samples, noise=0.20)\n",
 83 |     "\n",
 84 |     "X_tr = X[:100].astype('float32')\n",
 85 |     "X_val = X[100:200].astype('float32')\n",
 86 |     "X_te = X[200:].astype('float32')\n",
 87 |     "\n",
 88 |     "y_tr = y[:100].astype('int32')\n",
 89 |     "y_val = y[100:200].astype('int32')\n",
 90 |     "y_te = y[200:].astype('int32')\n",
 91 |     "\n",
 92 |     "plt.scatter(X_tr[:,0], X_tr[:,1], s=40, c=y_tr, cmap=plt.cm.BuGn)\n",
 93 |     "\n",
 94 |     "print X.shape, y.shape\n",
 95 |     "\n",
 96 |     "num_features = X_tr.shape[-1]"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "markdown",
101 |    "metadata": {},
102 |    "source": [
103 |     "# From Logistic Regression to \"Deep Learning\" in Lasagne\n",
104 |     "The code implements logistic regression in lasagne. In section __Assignments Half Moon__ you are asked to modify the code into a neural network. \n",
105 |     "\n",
106 |     "The building block in lasagne is the Layer. To get started the most important layers are the DenseLayer and the InputLayer. \n",
107 |     "\n",
108 |     "The [InputLayer](http://lasagne.readthedocs.io/en/latest/modules/layers/input.html) is a \"special\" layer which lets you input data to the network. The InputLayer is initialized with a tuple specifying the shape of the input data. Note that it is common to provide ``None`` for the first dimension which allows you to vary the batch size at runtime. \n",
109 |     "\n",
110 |     "The [DenseLayer](http://lasagne.readthedocs.io/en/latest/modules/layers/dense.html) implements the computation: \n",
111 |     "\n",
112 |     "$$y = nonlinearity(xW + b)$$\n",
113 |     "\n",
114 |     "where $x$ is the layer input, $y$ is the layer output and $\\{W, b\\}$ are the layer parameters. The DenseLayer is initialized with a pointer to the previous layer, the desired number of units in the layer and the nonlinearity. \n",
115 |     "x has shape ```[batchsize, num_features]```. From this we can infer the size of ```W``` as ```[num_features, num_units]``` and b as ```[num_units]```. y is then ```[batch_size, num_units]```.\n",
116 |     "\n",
117 |     "\n",
118 |     "A layer in Lasagne does the following:\n",
119 |     "1. Given the shape of the input $x$ and the number of units in the layer lasagne infers the shapes of $W$ and $b$ and keep track of the layer parameters.\n",
120 |     "2. Setup the computation $y = nonlinearity(xW + b)$"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "code",
125 |    "execution_count": null,
126 |    "metadata": {
127 |     "collapsed": false
128 |    },
129 |    "outputs": [],
130 |    "source": [
131 |     "from lasagne.updates import sgd\n",
132 |     "from lasagne.nonlinearities import leaky_rectify, softmax, tanh, elu\n",
133 |     "from lasagne.layers import InputLayer, DenseLayer\n",
134 |     "\n",
135 |     "\n",
136 |     "#MODEL SPECIFICATION\n",
137 |     "l_in = InputLayer(shape=(None, num_features))\n",
138 |     "#INSERT HIDDEL LAYER HERE\n",
139 |     "#l = DenseLayer(incoming=l,.....\n",
140 |     "l_out = DenseLayer(incoming=l_in, num_units=2, nonlinearity=softmax, name='outputlayer') \n",
141 |     "#We use two output units since we have two classes. the softmax function ensures that the the class probabilities sum to 1."
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "markdown",
146 |    "metadata": {},
147 |    "source": [
148 |     "After we have built the network we can use lasagnes helper functions to \n",
149 |     "\n",
150 |     "1. Build the computation graph: __[lasagne.layers.get_output](http://lasagne.readthedocs.io/en/latest/modules/layers/helper.html#lasagne.layers.get_output)__ . The ``deterministic`` flag tells lasagne if we are in training mode or evaluation mode. When you build more complicated networks this is very important to remember! (Two important layers that behave differently in training mode and evaluation mode are the [DropoutLayer](http://lasagne.readthedocs.io/en/latest/modules/layers/noise.html#lasagne.layers.DropoutLayer) and the [BatchNormalizationLayer](http://lasagne.readthedocs.io/en/latest/modules/layers/normalization.html?highlight=batchnorm#lasagne.layers.BatchNormLayer)). Building the computation graph gives us the forward pass of the network.  \n",
151 |     "2. Collect the network parameters: __[lasagne.layers.get_all_params](http://lasagne.readthedocs.io/en/latest/modules/layers/helper.html#lasagne.layers.get_all_params)__ (Note the trainable flag which will only return parameters that are trainable. You'll get errors if you are using batchnorm and you forget this)\n",
152 |     "\n",
153 |     "Note that all the helper functions are called with the output layer or a list of outputlayers if you have multiple output layers. "
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "code",
158 |    "execution_count": null,
159 |    "metadata": {
160 |     "collapsed": false
161 |    },
162 |    "outputs": [],
163 |    "source": [
164 |     "sym_x = T.matrix('X') # a symbolic variable taking on the value of a input batch.\n",
165 |     "sym_t = T.ivector('target') # a symbolic variable taking on the value of the target batch.\n",
166 |     "\n",
167 |     "\n",
168 |     "# Get network output\n",
169 |     "train_out = lasagne.layers.get_output(l_out, {l_in: sym_x}, deterministic=False)\n",
170 |     "eval_out = lasagne.layers.get_output(l_out, {l_in: sym_x}, deterministic=True)\n",
171 |     "\n",
172 |     "\n",
173 |     "# Get list of all trainable parameters in the network.\n",
174 |     "all_params = lasagne.layers.get_all_params(l_out, trainable=True)\n",
175 |     "\n",
176 |     "# print shapes of all the paramters in the network.\n",
177 |     "for p in all_params:\n",
178 |     "    print p, p.get_value().shape"
179 |    ]
180 |   },
181 |   {
182 |    "cell_type": "markdown",
183 |    "metadata": {},
184 |    "source": [
185 |     "``train_out`` will be a symbolic variable representing the network output. Using ``train_out`` we  can define the [crossentropy error](http://deeplearning.net/software/theano/library/tensor/nnet/nnet.html#tensor.nnet.categorical_crossentropy) used for training the network.\n",
186 |     "We ```mean``` over all the samples in the mini-batch.\n"
187 |    ]
188 |   },
189 |   {
190 |    "cell_type": "code",
191 |    "execution_count": null,
192 |    "metadata": {
193 |     "collapsed": true
194 |    },
195 |    "outputs": [],
196 |    "source": [
197 |     "cost_train = T.nnet.categorical_crossentropy(train_out, sym_t).mean()\n",
198 |     "cost_eval = T.nnet.categorical_crossentropy(eval_out, sym_t).mean()"
199 |    ]
200 |   },
201 |   {
202 |    "cell_type": "markdown",
203 |    "metadata": {},
204 |    "source": [
205 |     "When we train a neural network we update the parameters in direction of the negative gradient w.r.t the cost.\n",
206 |     "We can use ``T.grad`` to get the gradients for all parameters in the network w.r.t ``cost_train``.\n",
207 |     "Imagine that ```cost_train``` is a function and we want to go downhill. We go downhill by changing the value of the parameters in direction of the negative gradient. \n",
208 |     "\n",
209 |     "Finally we can use __[lasagne.updates.sgd](http://lasagne.readthedocs.io/en/latest/modules/updates.html#lasagne.updates.sgd)__ to calculate the stochastic gradient descent (SGD) update rule for each parameter in the network. ``updates`` is a dictionary of the parameter update rules.\n",
210 |     "\n",
211 |     "Here's a small animation of [different optimizers doing](http://lasagne.readthedocs.io/en/latest/modules/updates.html) gradient descent: http://imgur.com/a/Hqolp . It shows, e.g., why saddle points might be difficult."
212 |    ]
213 |   },
214 |   {
215 |    "cell_type": "code",
216 |    "execution_count": null,
217 |    "metadata": {
218 |     "collapsed": false
219 |    },
220 |    "outputs": [],
221 |    "source": [
222 |     "# Let Theano do its magic and get all the gradients we need for training. Essentially T.grad does backprop i.e. get the \n",
223 |     "# gradient of cost_train w.r.t. the parameters.\n",
224 |     "all_grads = T.grad(cost_train, all_params)\n",
225 |     "\n",
226 |     "# Set the update function for parameters \n",
227 |     "# you might wan't to experiment with more advanded update schemes like rmsprob, adadelta etc.\n",
228 |     "updates = lasagne.updates.sgd(all_grads, all_params, learning_rate=1.0)\n"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "markdown",
233 |    "metadata": {},
234 |    "source": [
235 |     "The final step is to compile Theano functions for the network. For theano functions we need to specify which inputs the function should take. For our network that is ``sym_x`` which is the input data and ``sym_t`` which is the targets. Secondly we need to specify which outputs we want the network to return. In our case that is the crossentropy cost and the network output.\n",
236 |     "\n",
237 |     "When we compile ``f_train`` we additionally give the updates dictionary as input. This tells Theano to update the network parameters with the update rules every time we call ``f_train``. "
238 |    ]
239 |   },
240 |   {
241 |    "cell_type": "code",
242 |    "execution_count": null,
243 |    "metadata": {
244 |     "collapsed": true
245 |    },
246 |    "outputs": [],
247 |    "source": [
248 |     "f_eval = theano.function(inputs=[sym_x, sym_t],\n",
249 |     "                         outputs=[cost_eval, eval_out])\n",
250 |     "\n",
251 |     "f_train = theano.function(inputs=[sym_x, sym_t],\n",
252 |     "                          outputs=[cost_train, eval_out],\n",
253 |     "                          updates=updates)\n",
254 |     "\n",
255 |     "\n",
256 |     "\n",
257 |     "#now you have three functions: \n",
258 |     "# f_train(X,y) -> cost, y_pred which will update the parameters using backprop each time you call it, only use this on the training data!\n",
259 |     "# f_eval(X,y) -> cost, y_pred which only calculates the forward pass\n",
260 |     "\n",
261 |     "\n",
262 |     "# f_pred(X) -> y_pred. This is just a helper function for plotting the decision boundaries between the two classes\n",
263 |     "f_pred = theano.function(inputs=[sym_x],\n",
264 |     "                         outputs=eval_out)"
265 |    ]
266 |   },
267 |   {
268 |    "cell_type": "code",
269 |    "execution_count": null,
270 |    "metadata": {
271 |     "collapsed": false
272 |    },
273 |    "outputs": [],
274 |    "source": [
275 |     "# Training loop\n",
276 |     "plot_decision_boundary(lambda x: f_pred(x), X_val,y_val)\n",
277 |     "plt.title(\"Untrained Classifier\")\n",
278 |     "\n",
279 |     "num_epochs = 1000\n",
280 |     "\n",
281 |     "train_cost, val_cost = [],[]\n",
282 |     "for e in range(num_epochs):\n",
283 |     "    out = f_train(X_tr,y_tr)\n",
284 |     "    #out = [cost, y_pred]\n",
285 |     "    train_cost += [out[0]]\n",
286 |     "    \n",
287 |     "    out = f_eval(X_val,y_val)\n",
288 |     "    val_cost += [out[0]]\n",
289 |     "\n",
290 |     "    if e % 100 == 0:\n",
291 |     "        print \"Epoch %i, Train Cost: %0.3f\\tVal Cost: %0.3f\"%(e, train_cost[-1],val_cost[-1])\n",
292 |     "    \n",
293 |     "    \n",
294 |     "out = f_eval(X_te,y_te)\n",
295 |     "test_cost = out[0]\n",
296 |     "print \"\\nTest Cost: %0.3f\"%(test_cost)\n",
297 |     "\n",
298 |     "plot_decision_boundary(lambda x: f_pred(x), X_te, y_te)\n",
299 |     "plt.title(\"Trained Classifier\")\n",
300 |     "\n",
301 |     "epoch = np.arange(len(train_cost))\n",
302 |     "plt.figure()\n",
303 |     "plt.plot(epoch,train_cost,'r',epoch,val_cost,'b')\n",
304 |     "plt.legend(['Train Loss','Val Loss'])\n",
305 |     "plt.xlabel('Updates'), plt.ylabel('Loss')\n",
306 |     "\n"
307 |    ]
308 |   },
309 |   {
310 |    "cell_type": "markdown",
311 |    "metadata": {},
312 |    "source": [
313 |     "# Assignments Half Moon\n",
314 |     "\n",
315 |     " 1) A linear logistic classifier is only able to create a linear decision boundary. Change the Logistic classifier into a (non-linear) Neural network by inserting a dense hidden layer between the input and output layers of the model\n",
316 |     " \n",
317 |     " 2) Experiment with multiple hidden layers or more / less hidden units. What happens to the decision boundary?\n",
318 |     " \n",
319 |     " 3) Overfitting: When increasing the number of hidden layers / units the neural network will fit the training data better by creating a highly nonlinear decision boundary. If the model is too complex it will often generalize poorly to new data (validation and test set). Can you observe this from the training and validation errors? \n",
320 |     " \n",
321 |     " 4) We used the vanilla stochastic gradient descent algorithm for parameter updates. This is usually slow to converge and more sophisticated pseudo-second-order methods usually work better. Try changing the optimizer to [adam or adamax](http://lasagne.readthedocs.io/en/latest/modules/updates.html) (lasagne.updates.adam, lasagne.updates.adamax)"
322 |    ]
323 |   },
324 |   {
325 |    "cell_type": "markdown",
326 |    "metadata": {},
327 |    "source": [
328 |     "# Optional:  MNIST dataset\n",
329 |     "MNIST is a dataset that is often used for benchmarking. The MNIST dataset consists of 70,000 images of handwritten digits from 0-9. The dataset is split into a 50,000 images training set, 10,000 images validation set and 10,000 images test set. The images are 28x28 pixels, where each pixel represents a normalised value between 0-255 (0=black and 255=white).\n",
330 |     "\n",
331 |     "### Primer for the afternoon...\n",
332 |     "We use a feedforward neural network to classify the 28x28 mnist images. ``num_features`` is therefore 28x28=784.\n",
333 |     "That is we represent each image as a vector. The ordering of the pixels in the vector does not matter, so we could permute all images using the same permutation and still get the same performance. (You are of course encouraged to try this using ``numpy.random.permutation`` to get a random permutation :)). This task is therefore called the _permutation invariant_ MNIST. Obviously this throws away a lot of structure in the data. After lunch we'll fix this with the convolutional neural network which encodes prior knowledge about data that has either spatial or temporal structure.  \n",
334 |     "\n",
335 |     "### Ballpark estimates of hyperparameters\n",
336 |     "__Optimizers:__\n",
337 |     "    1. SGD + Momentum: learning rate 1.0 - 0.1 \n",
338 |     "    2. ADAM: learning rate 3*1e-4 - 1e-5\n",
339 |     "    3. RMSPROP: somewhere between SGD and ADAM\n",
340 |     "\n",
341 |     "__Regularization:__\n",
342 |     "    1. Dropout. Dropout rate 0.1-0.5 \n",
343 |     "    2. L2/L1 regularization.  http://lasagne.readthedocs.io/en/latest/modules/regularization.html . I don't use this that often but 1e-4  -  1e-8.\n",
344 |     "    \n",
345 |     "    3. Batchnorm: Batchnorm also acts as a regularizer\n",
346 |     "    \n",
347 |     "__Parameter initialization__\n",
348 |     "    Parameter initialization is extremely important. [Lasagne has a lot of different initializers](http://lasagne.readthedocs.io/en/latest/modules/init.html). Commonly used initializers are\n",
349 |     "    1. He\n",
350 |     "    2. Glorot\n",
351 |     "    3. Uniform or Normal with small scale. (0.1 - 0.01)\n",
352 |     "    4. Orthogonal (I find that this works very well for RNNs)\n",
353 |     "\n",
354 |     "Bias is nearly always initialized to zero. \n",
355 |     "\n",
356 |     "__Number of hidden units and network structure__\n",
357 |     "   Probably as big a network as possible and then apply regularization. You'll have to experiment :). One rarely goes below 512 units for feedforward networks unless you are training on CPU...\n",
358 |     "   There is some research into stochastic depth networks: https://arxiv.org/pdf/1603.09382v2.pdf, but in general this is trial and error. \n",
359 |     "\n",
360 |     "__Nonlinearity__: [The most commonly used nonlinearities are](http://lasagne.readthedocs.io/en/latest/modules/nonlinearities.html)\n",
361 |     "    \n",
362 |     "    1. ReLU\n",
363 |     "    2. Leaky ReLU. Same as ReLU but with a small non-zero slope for negative inputs\n",
364 |     "    3. Elu\n",
365 |     "    4. Sigmoids are used if your output is binary. It is not used in the hidden layers. Squashes the output between 0 and 1\n",
366 |     "    5. Softmax used as output if you have a classification problem. Normalizes the output to sum to 1.\n",
367 |     "\n",
368 |     "\n",
369 |     "See the plot below.\n",
370 |     "\n",
371 |     "__mini-batch size__\n",
372 |     "   Usually people use 16-256. Bigger is not always better. With smaller mini-batch size you get more updates and your model might converge faster. Also small batch sizes use less memory  -> you can use a bigger model.\n",
373 |     "\n",
374 |     "Hyperparameters can be found by experience (guessing) or some search procedure. Random search is easy to implement and performs decently: http://www.jmlr.org/papers/volume13/bergstra12a/bergstra12a.pdf . \n",
375 |     "More advanced search procedures include [SPEARMINT](https://github.com/JasperSnoek/spearmint) and many others. "
376 |    ]
377 |   },
378 |   {
379 |    "cell_type": "code",
380 |    "execution_count": 2,
381 |    "metadata": {
382 |     "collapsed": false
383 |    },
384 |    "outputs": [
385 |     {
386 |      "ename": "NameError",
387 |      "evalue": "name 'np' is not defined",
388 |      "output_type": "error",
389 |      "traceback": [
390 |       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
391 |       "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
392 |       "\u001b[0;32m<ipython-input-2-3e178072354c>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m# PLOT OF DIFFERENT OUTPUT USNITS\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlinspace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m6\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m6\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m100\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m \u001b[0mrelu\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmaximum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mleaky_relu\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmaximum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m0.1\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mminimum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# probably a slow implementation....\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0melu\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mx\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m 
\u001b[0;34m-\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexp\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
393 |       "\u001b[0;31mNameError\u001b[0m: name 'np' is not defined"
394 |      ]
395 |     }
396 |    ],
397 |    "source": [
398 |     "# PLOT OF DIFFERENT OUTPUT USNITS\n",
399 |     "x = np.linspace(-6, 6, 100)\n",
400 |     "relu = lambda x: np.maximum(0, x)\n",
401 |     "leaky_relu = lambda x: np.maximum(0, x) + 0.1*np.minimum(0, x) # probably a slow implementation....\n",
402 |     "elu = lambda x: (x > 0)*x + (1 - (x > 0))*(np.exp(x) - 1) \n",
403 |     "sigmoid = lambda x: (1+np.exp(-x))**(-1)\n",
404 |     "\n",
405 |     "plt.figure(figsize=(6,6))\n",
406 |     "plt.plot(x, relu(x), label='ReLU', lw=2)\n",
407 |     "plt.plot(x, leaky_relu(x), label='Leaky ReLU',lw=2)\n",
408 |     "plt.plot(x, elu(x), label='Elu', lw=2)\n",
409 |     "plt.plot(x, sigmoid(x), label='Sigmoid',lw=2)\n",
410 |     "plt.legend(loc=2, fontsize=16)\n",
411 |     "plt.title('Non-linearities', fontsize=20)\n",
412 |     "plt.ylim([-2, 5])\n",
413 |     "plt.xlim([-6, 6])\n",
414 |     "\n",
415 |     "# softmax\n",
416 |     "# assert that all class probablities sum to one\n",
417 |     "assert np.all(abs(1.0 - x_softmax.sum(axis=1)) < 1e-8)"
418 |    ]
419 |   },
420 |   {
421 |    "cell_type": "markdown",
422 |    "metadata": {},
423 |    "source": [
424 |     "## MNIST\n",
425 |     "First let's load the MNIST dataset and plot a few examples:"
426 |    ]
427 |   },
428 |   {
429 |    "cell_type": "code",
430 |    "execution_count": null,
431 |    "metadata": {
432 |     "collapsed": false
433 |    },
434 |    "outputs": [],
435 |    "source": [
436 |     "#To speed up training we'll only work on a subset of the data\n",
437 |     "data = np.load('mnist.npz')\n",
438 |     "num_classes = 10\n",
439 |     "x_train = data['X_train'][:1000].astype('float32')\n",
440 |     "targets_train = data['y_train'][:1000].astype('int32')\n",
441 |     "\n",
442 |     "x_valid = data['X_valid'][:500].astype('float32')\n",
443 |     "targets_valid = data['y_valid'][:500].astype('int32')\n",
444 |     "\n",
445 |     "x_test = data['X_test'][:500].astype('float32')\n",
446 |     "targets_test = data['y_test'][:500].astype('int32')"
447 |    ]
448 |   },
449 |   {
450 |    "cell_type": "code",
451 |    "execution_count": null,
452 |    "metadata": {
453 |     "collapsed": false
454 |    },
455 |    "outputs": [],
456 |    "source": [
457 |     "#plot a few MNIST examples\n",
458 |     "idx = 0\n",
459 |     "canvas = np.zeros((28*10, 10*28))\n",
460 |     "for i in range(10):\n",
461 |     "    for j in range(10):\n",
462 |     "        canvas[i*28:(i+1)*28, j*28:(j+1)*28] = x_train[idx].reshape((28, 28))\n",
463 |     "        idx += 1\n",
464 |     "plt.figure(figsize=(7, 7))\n",
465 |     "plt.axis('off')\n",
466 |     "plt.imshow(canvas, cmap='gray')\n",
467 |     "plt.title('MNIST handwritten digits')\n",
468 |     "plt.show()"
469 |    ]
470 |   },
471 |   {
472 |    "cell_type": "code",
473 |    "execution_count": null,
474 |    "metadata": {
475 |     "collapsed": false
476 |    },
477 |    "outputs": [],
478 |    "source": [
479 |     "#defined the model\n",
480 |     "num_class = 10\n",
481 |     "num_features = x_train.shape[1]\n",
482 |     "\n",
483 |     "l_in = InputLayer(shape=(None,num_features))\n",
484 |     "l_hid = DenseLayer(incoming=l_in, num_units=500, nonlinearity=elu)\n",
485 |     "l_out = DenseLayer(incoming=l_hid, num_units=num_class, nonlinearity=softmax)"
486 |    ]
487 |   },
488 |   {
489 |    "cell_type": "code",
490 |    "execution_count": null,
491 |    "metadata": {
492 |     "collapsed": true
493 |    },
494 |    "outputs": [],
495 |    "source": [
496 |     "sym_x = T.matrix('sym_x') # a symbolic variable taking on the value of a input batch.\n",
497 |     "sym_t = T.ivector('sym_t') # a symbolic variable taking on the value of the target batch.\n",
498 |     "\n",
499 |     "# Get network output\n",
500 |     "train_out = lasagne.layers.get_output(l_out, sym_x, deterministic=False)\n",
501 |     "eval_out = lasagne.layers.get_output(l_out, sym_x, deterministic=True)\n",
502 |     "\n",
503 |     "\n",
504 |     "# Get list of all trainable parameters in the network.\n",
505 |     "all_params = lasagne.layers.get_all_params(l_out, trainable=True)\n",
506 |     "\n",
507 |     "cost = T.nnet.categorical_crossentropy(train_out+1e-8, sym_t).mean()\n",
508 |     "# Let Theano do its magic and get all the gradients we need for training\n",
509 |     "all_grads = T.grad(cost, all_params)\n",
510 |     "\n",
511 |     "\n",
512 |     "# Set the update function for parameters \n",
513 |     "# you might wan't to experiment with more advanded update schemes like rmsprob, adadelta etc.\n",
514 |     "updates = lasagne.updates.sgd(all_grads, all_params, learning_rate=0.1)\n",
515 |     "\n",
516 |     "\n",
517 |     "f_eval = theano.function([sym_x],\n",
518 |     "                     eval_out, on_unused_input='warn')\n",
519 |     "\n",
520 |     "f_train = theano.function([sym_x, sym_t],\n",
521 |     "                          [cost],\n",
522 |     "                          updates=updates, on_unused_input='warn')"
523 |    ]
524 |   },
525 |   {
526 |    "cell_type": "code",
527 |    "execution_count": null,
528 |    "metadata": {
529 |     "collapsed": false
530 |    },
531 |    "outputs": [],
532 |    "source": [
533 |     "#Test the forward pass\n",
534 |     "x = np.random.normal(0,1, (45, 28*28)).astype('float32') #dummy data\n",
535 |     "\n",
536 |     "model = lasagne.layers.get_output(l_out, sym_x)\n",
537 |     "out = model.eval({sym_x:x}) #this could also include mask etc if used\n",
538 |     "print \"l_out\", out.shape"
539 |    ]
540 |   },
541 |   {
542 |    "cell_type": "markdown",
543 |    "metadata": {},
544 |    "source": [
545 |     "# Build the training loop.\n",
546 |     "We train the network by calculating the gradient of the cost function w.r.t. the parameters and updating the parameters in the direction of the negative gradient. \n",
547 |     "\n",
548 |     "\n",
549 |     "When training neural networks you always use mini-batches. Instead of calculating the average gradient using the entire dataset, you approximate the gradient using a mini-batch of typically 16 to 256 samples. The parameters are updated after each mini-batch. Networks converge much faster using mini-batches because the parameters are updated more often.\n",
550 |     "\n",
551 |     "We build a loop that iterates over the training data. Remember that the parameters are updated each time ``f_train`` is called."
552 |    ]
553 |   },
554 |   {
555 |    "cell_type": "code",
556 |    "execution_count": null,
557 |    "metadata": {
558 |     "collapsed": false
559 |    },
560 |    "outputs": [],
561 |    "source": [
562 |     "from confusionmatrix import ConfusionMatrix\n",
563 |     "batch_size = 100\n",
564 |     "num_epochs = 100\n",
565 |     "learning_rate = 0.1\n",
566 |     "num_samples_train = x_train.shape[0]\n",
567 |     "num_batches_train = num_samples_train // batch_size\n",
568 |     "num_samples_valid = x_valid.shape[0]\n",
569 |     "num_batches_valid = num_samples_valid // batch_size\n",
570 |     "\n",
571 |     "train_acc, train_loss = [], []\n",
572 |     "valid_acc, valid_loss = [], []\n",
573 |     "test_acc, test_loss = [], []\n",
574 |     "cur_loss = 0\n",
575 |     "loss = []\n",
576 |     "for epoch in range(num_epochs):\n",
577 |     "    #Forward->Backprob->Update params\n",
578 |     "    cur_loss = 0\n",
579 |     "    for i in range(num_batches_train):\n",
580 |     "        idx = range(i*batch_size, (i+1)*batch_size)\n",
581 |     "        x_batch = x_train[idx]\n",
582 |     "        target_batch = targets_train[idx]    \n",
583 |     "        batch_loss = f_train(x_batch,target_batch) #this will do the complete backprob pass\n",
584 |     "        cur_loss += batch_loss[0]\n",
585 |     "    loss += [cur_loss/batch_size]\n",
586 |     "    \n",
587 |     "    confusion_valid = ConfusionMatrix(num_classes)\n",
588 |     "    confusion_train = ConfusionMatrix(num_classes)\n",
589 |     "\n",
590 |     "    for i in range(num_batches_train):\n",
591 |     "        idx = range(i*batch_size, (i+1)*batch_size)\n",
592 |     "        x_batch = x_train[idx]\n",
593 |     "        targets_batch = targets_train[idx]\n",
594 |     "        net_out = f_eval(x_batch)   \n",
595 |     "        preds = np.argmax(net_out, axis=-1) \n",
596 |     "        confusion_train.batch_add(targets_batch, preds)\n",
597 |     "\n",
598 |     "    confusion_valid = ConfusionMatrix(num_classes)\n",
599 |     "    for i in range(num_batches_valid):\n",
600 |     "        idx = range(i*batch_size, (i+1)*batch_size)\n",
601 |     "        x_batch = x_valid[idx]\n",
602 |     "        targets_batch = targets_valid[idx]\n",
603 |     "        net_out = f_eval(x_batch)   \n",
604 |     "        preds = np.argmax(net_out, axis=-1) \n",
605 |     "        \n",
606 |     "        confusion_valid.batch_add(targets_batch, preds)\n",
607 |     "    \n",
608 |     "    train_acc_cur = confusion_train.accuracy()\n",
609 |     "    valid_acc_cur = confusion_valid.accuracy()\n",
610 |     "\n",
611 |     "    train_acc += [train_acc_cur]\n",
612 |     "    valid_acc += [valid_acc_cur]\n",
613 |     "    print \"Epoch %i : Train Loss %e , Train acc %f,  Valid acc %f \" \\\n",
614 |     "    % (epoch+1, loss[-1], train_acc_cur, valid_acc_cur)\n",
615 |     "    \n",
616 |     "    \n",
617 |     "epoch = np.arange(len(train_acc))\n",
618 |     "plt.figure()\n",
619 |     "plt.plot(epoch,train_acc,'r',epoch,valid_acc,'b')\n",
620 |     "plt.legend(['Train Acc','Val Acc'])\n",
621 |     "plt.xlabel('Updates'), plt.ylabel('Acc')"
622 |    ]
623 |   },
624 |   {
625 |    "cell_type": "markdown",
626 |    "metadata": {},
627 |    "source": [
628 |     "# More questions"
629 |    ]
630 |   },
631 |   {
632 |    "cell_type": "markdown",
633 |    "metadata": {},
634 |    "source": [
635 |     "1. Do you see overfitting? Google overfitting if you don't know how to spot it\n",
636 |     "2. Regularization is a method to reduce overfitting. Adding noise to your network is a popular method to fight overfitting. Try using Dropout in your network. [Lasagne DropoutLayer](http://lasagne.readthedocs.io/en/latest/modules/layers/noise.html#lasagne.layers.DropoutLayer).\n",
637 |     "3. Alternatively you can regularize your network by penalizing the L2 or L1 norm of the network parameters. [Read the docs for more info](http://lasagne.readthedocs.io/en/latest/modules/regularization.html). "
638 |    ]
639 |   },
640 |   {
641 |    "cell_type": "code",
642 |    "execution_count": null,
643 |    "metadata": {
644 |     "collapsed": true
645 |    },
646 |    "outputs": [],
647 |    "source": []
648 |   }
649 |  ],
650 |  "metadata": {
651 |   "kernelspec": {
652 |    "display_name": "Python 2",
653 |    "language": "python",
654 |    "name": "python2"
655 |   },
656 |   "language_info": {
657 |    "codemirror_mode": {
658 |     "name": "ipython",
659 |     "version": 2
660 |    },
661 |    "file_extension": ".py",
662 |    "mimetype": "text/x-python",
663 |    "name": "python",
664 |    "nbconvert_exporter": "python",
665 |    "pygments_lexer": "ipython2",
666 |    "version": "2.7.11"
667 |   }
668 |  },
669 |  "nbformat": 4,
670 |  "nbformat_minor": 0
671 | }
672 | 


--------------------------------------------------------------------------------
/lab1/mnist.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepLearningDTU/nvidia_deep_learning_summercamp_2016/9d87d409eeebed4b8cf39c42987fb5e786d3ae28/lab1/mnist.npz


--------------------------------------------------------------------------------
/lab2/confusionmatrix.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | 
  3 | 
  4 | class ConfusionMatrix:
  5 |     """
  6 |        Simple confusion matrix class
  7 |        row is the true class, column is the predicted class
  8 |     """
  9 |     def __init__(self, num_classes, class_names=None):
 10 |         self.n_classes = num_classes
 11 |         if class_names is None:
 12 |             self.class_names = map(str, range(num_classes))
 13 |         else:
 14 |             self.class_names = class_names
 15 | 
 16 |         # find max class_name and pad
 17 |         max_len = max(map(len, self.class_names))
 18 |         self.max_len = max_len
 19 |         for idx, name in enumerate(self.class_names):
 20 |             if len(self.class_names) < max_len:
 21 |                 self.class_names[idx] = name + " "*(max_len-len(name))
 22 | 
 23 |         self.mat = np.zeros((num_classes,num_classes),dtype='int')
 24 | 
 25 |     def __str__(self):
 26 |         # calucate row and column sums
 27 |         col_sum = np.sum(self.mat, axis=1)
 28 |         row_sum = np.sum(self.mat, axis=0)
 29 | 
 30 |         s = []
 31 | 
 32 |         mat_str = self.mat.__str__()
 33 |         mat_str = mat_str.replace('[','').replace(']','').split('\n')
 34 | 
 35 |         for idx, row in enumerate(mat_str):
 36 |             if idx == 0:
 37 |                 pad = " "
 38 |             else:
 39 |                 pad = ""
 40 |             class_name = self.class_names[idx]
 41 |             class_name = " " + class_name + " |"
 42 |             row_str = class_name + pad + row
 43 |             row_str += " |" + str(col_sum[idx])
 44 |             s.append(row_str)
 45 | 
 46 |         row_sum = [(self.max_len+4)*" "+" ".join(map(str, row_sum))]
 47 |         hline = [(1+self.max_len)*" "+"-"*len(row_sum[0])]
 48 | 
 49 |         s = hline + s + hline + row_sum
 50 | 
 51 |         # add linebreaks
 52 |         s_out = [line+'\n' for line in s]
 53 |         return "".join(s_out)
 54 | 
 55 |     def batch_add(self, targets, preds):
 56 |         assert targets.shape == preds.shape
 57 |         assert len(targets) == len(preds)
 58 |         assert max(targets) < self.n_classes
 59 |         assert max(preds) < self.n_classes
 60 |         targets = targets.flatten()
 61 |         preds = preds.flatten()
 62 |         for i in range(len(targets)):
 63 |                 self.mat[targets[i], preds[i]] += 1
 64 | 
 65 |     def get_errors(self):
 66 |         tp = np.asarray(np.diag(self.mat).flatten(),dtype='float')
 67 |         fn = np.asarray(np.sum(self.mat, axis=1).flatten(),dtype='float') - tp
 68 |         fp = np.asarray(np.sum(self.mat, axis=0).flatten(),dtype='float') - tp
 69 |         tn = np.asarray(np.sum(self.mat)*np.ones(self.n_classes).flatten(),
 70 |                         dtype='float') - tp - fn - fp
 71 |         return tp, fn, fp, tn
 72 | 
 73 |     def accuracy(self):
 74 |         """
 75 |         Calculates global accuracy
 76 |         :return: accuracy
 77 |         :example: >>> conf = ConfusionMatrix(3)
 78 |                   >>> conf.batchAdd([0,0,1],[0,0,2])
 79 |                   >>> print conf.accuracy()
 80 |         """
 81 |         tp, _, _, _ = self.get_errors()
 82 |         n_samples = np.sum(self.mat)
 83 |         return np.sum(tp) / n_samples
 84 | 
 85 |     def sensitivity(self):
 86 |         tp, tn, fp, fn = self.get_errors()
 87 |         res = tp / (tp + fn)
 88 |         res = res[~np.isnan(res)]
 89 |         return res
 90 | 
 91 |     def specificity(self):
 92 |         tp, tn, fp, fn = self.get_errors()
 93 |         res = tn / (tn + fp)
 94 |         res = res[~np.isnan(res)]
 95 |         return res
 96 | 
 97 |     def positive_predictive_value(self):
 98 |         tp, tn, fp, fn = self.get_errors()
 99 |         res = tp / (tp + fp)
100 |         res = res[~np.isnan(res)]
101 |         return res
102 | 
103 |     def negative_predictive_value(self):
104 |         tp, tn, fp, fn = self.get_errors()
105 |         res = tn / (tn + fn)
106 |         res = res[~np.isnan(res)]
107 |         return res
108 | 
109 |     def false_positive_rate(self):
110 |         tp, tn, fp, fn = self.get_errors()
111 |         res = fp / (fp + tn)
112 |         res = res[~np.isnan(res)]
113 |         return res
114 | 
115 |     def false_discovery_rate(self):
116 |         tp, tn, fp, fn = self.get_errors()
117 |         res = fp / (tp + fp)
118 |         res = res[~np.isnan(res)]
119 |         return res
120 | 
121 |     def F1(self):
122 |         tp, tn, fp, fn = self.get_errors()
123 |         res = (2*tp) / (2*tp + fp + fn)
124 |         res = res[~np.isnan(res)]
125 |         return res
126 | 
127 |     def matthews_correlation(self):
128 |         tp, tn, fp, fn = self.get_errors()
129 |         numerator = tp*tn - fp*fn
130 |         denominator = np.sqrt((tp + fp)*(tp + fn)*(tn + fp)*(tn + fn))
131 |         res = numerator / denominator
132 |         res = res[~np.isnan(res)]
133 |         return res
134 | 


--------------------------------------------------------------------------------
/lab2/lab2_CNN.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {
  7 |     "collapsed": false
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "%matplotlib inline\n",
 12 |     "import matplotlib\n",
 13 |     "import numpy as np\n",
 14 |     "import matplotlib.pyplot as plt\n",
 15 |     "import sklearn.datasets\n",
 16 |     "import theano\n",
 17 |     "import theano.tensor as T\n",
 18 |     "import lasagne\n",
 19 |     " "
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "markdown",
 24 |    "metadata": {},
 25 |    "source": [
 26 |     "# Convolutional Neural networks 101\n",
 27 |     "\n",
 28 |     "Convolutional neural networks are one of the most successful types of neural networks for image recognition and were an integral part of reigniting the interest in neural networks. \n",
 29 |     "\n",
 30 |     "In this lab we'll experiment with inserting 2D-convolution layers in the fully connected neural networks introduced in LAB1. We'll further experiment with stacking of convolution layers, max pooling and strided convolutions which are all important techniques in current convolutional neural network architectures. Lastly we'll try to visualize the learned convolution filters and try to understand what kind of features they learn to recognize.\n",
 31 |     "\n",
 32 |     "\n",
 33 |     "If you are unfamiliar with the convolution operation, https://github.com/vdumoulin/conv_arithmetic has a nice visualization of different convolution variants. For a more in-depth tutorial please see http://cs231n.github.io/convolutional-networks/ or http://neuralnetworksanddeeplearning.com/chap6.html. Lastly, if you are ambitious and want to implement a convolutional neural network from scratch, please see an exercise from our Deep Learning summer school last year: https://github.com/DTU-deeplearning/day2-Conv"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": null,
 39 |    "metadata": {
 40 |     "collapsed": false
 41 |    },
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "#LOAD the mnist data. To speed up training we'll only work on a subset of the data.\n",
 45 |     "#Note that we reshape the data from (nsamples, num_features)= (nsamples, nchannels*rows*cols)  -> (nsamples, nchannels, rows, cols)\n",
 46 |     "# in order to retain the spatial arrangements of the pixels\n",
 47 |     "data = np.load('mnist.npz')\n",
 48 |     "num_classes = 10\n",
 49 |     "nchannels,rows,cols = 1,28,28\n",
 50 |     "x_train = data['X_train'][:10000].astype('float32')\n",
 51 |     "x_train = x_train.reshape((-1,nchannels,rows,cols))\n",
 52 |     "targets_train = data['y_train'][:10000].astype('int32')\n",
 53 |     "\n",
 54 |     "x_valid = data['X_valid'][:500].astype('float32')\n",
 55 |     "x_valid = x_valid.reshape((-1,nchannels,rows,cols))\n",
 56 |     "targets_valid = data['y_valid'][:500].astype('int32')\n",
 57 |     "\n",
 58 |     "x_test = data['X_test'][:500].astype('float32')\n",
 59 |     "x_test = x_test.reshape((-1,nchannels,rows,cols))\n",
 60 |     "targets_test = data['y_test'][:500].astype('int32')"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": null,
 66 |    "metadata": {
 67 |     "collapsed": false
 68 |    },
 69 |    "outputs": [],
 70 |    "source": [
 71 |     "#plot a few MNIST examples\n",
 72 |     "idx = 0\n",
 73 |     "canvas = np.zeros((28*10, 10*28))\n",
 74 |     "for i in range(10):\n",
 75 |     "    for j in range(10):\n",
 76 |     "        canvas[i*28:(i+1)*28, j*28:(j+1)*28] = x_train[idx].reshape((28, 28))\n",
 77 |     "        idx += 1\n",
 78 |     "plt.figure(figsize=(7, 7))\n",
 79 |     "plt.imshow(canvas, cmap='gray')\n",
 80 |     "plt.title('MNIST handwritten digits')\n",
 81 |     "plt.show()"
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "code",
 86 |    "execution_count": null,
 87 |    "metadata": {
 88 |     "collapsed": false
 89 |    },
 90 |    "outputs": [],
 91 |    "source": [
 92 |     "#Define a simple feed forward neural network\n",
 93 |     "\n",
 94 |     "from lasagne.nonlinearities import leaky_rectify, softmax, tanh, elu\n",
 95 |     "from lasagne.layers import InputLayer, DenseLayer, Conv2DLayer, batch_norm, DropoutLayer, MaxPool2DLayer\n",
 96 |     "\n",
 97 |     "#defined the model\n",
 98 |     "num_class = 10\n",
 99 |     "num_features = x_train.shape[1]\n",
100 |     "\n",
101 |     "l_in = InputLayer(shape=(None,nchannels,rows,cols)) #note that we use a 4D input since we need to retain the spatial arrangement of the pixels when working with convolutions.\n",
102 |     "#l_conv = Conv2DLayer(l_in,num_filters=16,filter_size=5)\n",
103 |     "l_hid = DenseLayer(l_in, num_units=100, nonlinearity=elu) #remember to connect the new conv-layer here\n",
104 |     "l_out = DenseLayer(l_hid, num_units=num_class, nonlinearity=softmax)"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "code",
109 |    "execution_count": null,
110 |    "metadata": {
111 |     "collapsed": false
112 |    },
113 |    "outputs": [],
114 |    "source": [
115 |     "#Setting up the graph in theano\n",
116 |     "sym_x = T.tensor4('sym_x') # a symbolic variable, this is now a 4-D tensor.\n",
117 |     "sym_t = T.ivector('sym_t') # a symbolic variable taking on the value of the target batch.\n",
118 |     "\n",
119 |     "# Get network output\n",
120 |     "train_out = lasagne.layers.get_output(l_out, sym_x, deterministic=False)\n",
121 |     "eval_out = lasagne.layers.get_output(l_out, sym_x, deterministic=True)\n",
122 |     "\n",
123 |     "\n",
124 |     "# Get list of all trainable parameters in the network.\n",
125 |     "all_params = lasagne.layers.get_all_params(l_out, trainable=True)\n",
126 |     "\n",
127 |     "cost = T.nnet.categorical_crossentropy(train_out+1e-8, sym_t).mean()\n",
128 |     "# Let Theano do its magic and get all the gradients we need for training\n",
129 |     "all_grads = T.grad(cost, all_params)\n",
130 |     "\n",
131 |     "\n",
132 |     "# Set the update function for parameters \n",
133 |     "# you might wan't to experiment with more advanded update schemes like rmsprob, adadelta etc.\n",
134 |     "updates = lasagne.updates.adam(all_grads, all_params, learning_rate=0.001)\n",
135 |     "\n",
136 |     "\n",
137 |     "f_eval = theano.function([sym_x],\n",
138 |     "                     eval_out, on_unused_input='warn')\n",
139 |     "\n",
140 |     "f_train = theano.function([sym_x, sym_t],\n",
141 |     "                          [cost],\n",
142 |     "                          updates=updates, on_unused_input='warn')"
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "code",
147 |    "execution_count": null,
148 |    "metadata": {
149 |     "collapsed": false
150 |    },
151 |    "outputs": [],
152 |    "source": [
153 |     "#Test the forward pass\n",
154 |     "x = np.random.normal(0,1, (45, 1,28,28)).astype('float32') #dummy data\n",
155 |     "\n",
156 |     "model = lasagne.layers.get_output(l_out, sym_x)\n",
157 |     "out = model.eval({sym_x:x}) #this could also include mask etc if used\n",
158 |     "print(\"l_out\", out.shape)"
159 |    ]
160 |   },
161 |   {
162 |    "cell_type": "code",
163 |    "execution_count": null,
164 |    "metadata": {
165 |     "collapsed": false
166 |    },
167 |    "outputs": [],
168 |    "source": [
169 |     "#Training Loop\n",
170 |     "from confusionmatrix import ConfusionMatrix\n",
171 |     "batch_size = 100\n",
172 |     "num_epochs = 10\n",
173 |     "num_samples_train = x_train.shape[0]\n",
174 |     "num_batches_train = num_samples_train // batch_size\n",
175 |     "num_samples_valid = x_valid.shape[0]\n",
176 |     "num_batches_valid = num_samples_valid // batch_size\n",
177 |     "\n",
178 |     "train_acc, train_loss = [], []\n",
179 |     "valid_acc, valid_loss = [], []\n",
180 |     "test_acc, test_loss = [], []\n",
181 |     "cur_loss = 0\n",
182 |     "loss = []\n",
183 |     "try:\n",
184 |     "    for epoch in range(num_epochs):\n",
185 |     "        #Forward->Backprob->Update params\n",
186 |     "        cur_loss = 0\n",
187 |     "        for i in range(num_batches_train):\n",
188 |     "            idx = range(i*batch_size, (i+1)*batch_size)\n",
189 |     "            x_batch = x_train[idx]\n",
190 |     "            target_batch = targets_train[idx]    \n",
191 |     "            batch_loss = f_train(x_batch,target_batch) #this will do the complete backprob pass\n",
192 |     "            cur_loss += batch_loss[0]\n",
193 |     "        loss += [cur_loss/batch_size]\n",
194 |     "\n",
195 |     "        confusion_valid = ConfusionMatrix(num_classes)\n",
196 |     "        confusion_train = ConfusionMatrix(num_classes)\n",
197 |     "\n",
198 |     "        for i in range(num_batches_train):\n",
199 |     "            idx = range(i*batch_size, (i+1)*batch_size)\n",
200 |     "            x_batch = x_train[idx]\n",
201 |     "            targets_batch = targets_train[idx]\n",
202 |     "            net_out = f_eval(x_batch)   \n",
203 |     "            preds = np.argmax(net_out, axis=-1) \n",
204 |     "            confusion_train.batch_add(targets_batch, preds)\n",
205 |     "\n",
206 |     "        confusion_valid = ConfusionMatrix(num_classes)\n",
207 |     "        for i in range(num_batches_valid):\n",
208 |     "            idx = range(i*batch_size, (i+1)*batch_size)\n",
209 |     "            x_batch = x_valid[idx]\n",
210 |     "            targets_batch = targets_valid[idx]\n",
211 |     "            net_out = f_eval(x_batch)   \n",
212 |     "            preds = np.argmax(net_out, axis=-1) \n",
213 |     "\n",
214 |     "            confusion_valid.batch_add(targets_batch, preds)\n",
215 |     "\n",
216 |     "        train_acc_cur = confusion_train.accuracy()\n",
217 |     "        valid_acc_cur = confusion_valid.accuracy()\n",
218 |     "\n",
219 |     "        train_acc += [train_acc_cur]\n",
220 |     "        valid_acc += [valid_acc_cur]\n",
221 |     "        print \"Epoch %i : Train Loss %e , Train acc %f,  Valid acc %f \" \\\n",
222 |     "        % (epoch+1, loss[-1], train_acc_cur, valid_acc_cur)\n",
223 |     "except KeyboardInterrupt:\n",
224 |     "    pass\n",
225 |     "    \n",
226 |     "\n",
227 |     "#get test set score\n",
228 |     "confusion_test = ConfusionMatrix(num_classes)\n",
229 |     "net_out = f_eval(x_test)    \n",
230 |     "preds = np.argmax(net_out, axis=-1) \n",
231 |     "confusion_test.batch_add(targets_test, preds)\n",
232 |     "print \"\\nTest set Acc:  %f\" %(confusion_test.accuracy())\n",
233 |     "\n",
234 |     "\n",
235 |     "epoch = np.arange(len(train_acc))\n",
236 |     "plt.figure()\n",
237 |     "plt.plot(epoch,train_acc,'r',epoch,valid_acc,'b')\n",
238 |     "plt.legend(['Train Acc','Val Acc'])\n",
239 |     "plt.xlabel('Epochs'), plt.ylabel('Acc'), plt.ylim([0.75,1.03])"
240 |    ]
241 |   },
242 |   {
243 |    "cell_type": "markdown",
244 |    "metadata": {},
245 |    "source": [
246 |     "# Assignments 1\n",
247 |     "\n",
248 |     " 1) Note the performance of the standard feedforward neural network. Add a 2D convolution layer before the dense hidden layer and confirm that it increases the generalization performance of the network (try num_filters=16 and filter_size=5 as a starting point). \n",
249 |     " \n",
250 |     " 2) Can the performance be increased even further by stacking more convolution layers?\n",
251 |     " \n",
252 |     " 3) Maxpooling is a technique for decreasing the spatial resolution of an image while retaining the important features. Effectively this gives a local translational invariance and reduces the computation by a factor of four, which is usually desirable in a classification algorithm. Try to either: \n",
253 |     " \n",
254 |     "     a) add a maxpool layer (add argument pool_size=2) after the convolution layer or\n",
255 |     "     b) add stride=2 to the arguments of the convolution layer. \n",
256 |     "  Verify that this decreases the spatial dimensions of the image (print l_conv.output_shape or print l_maxpool.output_shape). Does this increase the performance of the network (you may need to stack multiple layers or increase the number of filters to increase performance)?\n",
257 |     "  \n"
258 |    ]
259 |   },
260 |   {
261 |    "cell_type": "markdown",
262 |    "metadata": {},
263 |    "source": [
264 |     "# Visualization of filters\n",
265 |     "Convolution filters can be interpreted as spatial feature detectors picking up different image features such as edges, corners etc. Below we provide code for visualization of the filters. The best results are obtained with fairly large filters of size 9 and either 16 or 36 filters. "
266 |    ]
267 |   },
268 |   {
269 |    "cell_type": "code",
270 |    "execution_count": null,
271 |    "metadata": {
272 |     "collapsed": false
273 |    },
274 |    "outputs": [],
275 |    "source": [
276 |     "### If you get an error with l_conv not being defined you need define l_conv when the network is defined!\n",
277 |     "np_W = l_conv.W.get_value() #get the filter values from the conv layer\n",
278 |     "print np_W.shape, \"i.e. the shape is num_filters, num_channels, filter_size, filter_size\"\n",
279 |     "num_filters,num_channels,filter_size,_= np_W.shape\n",
280 |     "n = int(num_filters**0.5)\n",
281 |     "\n",
282 |     "np_W_res = np_W.reshape(n,n,num_channels,filter_size,filter_size)\n",
283 |     "fig, ax = plt.subplots(n,n)\n",
284 |     "print \"learned filter values\"\n",
285 |     "for i in range(n):\n",
286 |     "    for j in range(n):\n",
287 |     "        ax[i,j].imshow(np_W_res[i,j,0], cmap='gray',interpolation='none')\n",
288 |     "        ax[i,j].xaxis.set_major_formatter(plt.NullFormatter())\n",
289 |     "        ax[i,j].yaxis.set_major_formatter(plt.NullFormatter())\n",
290 |     "\n",
291 |     "\n",
292 |     "idx = 1\n",
293 |     "plt.figure()\n",
294 |     "plt.imshow(x_train[idx,0],cmap='gray',interpolation='none')\n",
295 |     "plt.title('Inut Image')\n",
296 |     "plt.show()\n",
297 |     "\n",
298 |     "#visalize the filters convolved with an input image\n",
299 |     "from scipy.signal import convolve2d\n",
300 |     "np_W_res = np_W.reshape(n,n,num_channels,filter_size,filter_size)\n",
301 |     "fig, ax = plt.subplots(n,n,figsize=(9,9))\n",
302 |     "print \"Response from input image convolved with the filters\"\n",
303 |     "for i in range(n):\n",
304 |     "    for j in range(n):\n",
305 |     "        ax[i,j].imshow(convolve2d(x_train[1,0],np_W_res[i,j,0],mode='same'), cmap='gray',interpolation='none')\n",
306 |     "        ax[i,j].xaxis.set_major_formatter(plt.NullFormatter())\n",
307 |     "        ax[i,j].yaxis.set_major_formatter(plt.NullFormatter())\n",
308 |     "\n"
309 |    ]
310 |   },
311 |   {
312 |    "cell_type": "markdown",
313 |    "metadata": {},
314 |    "source": [
315 |     "# Assignment 2\n",
316 |     "\n",
317 |     "The visualized filters will likely look most like noise due to the small amount of training data.\n",
318 |     "\n",
319 |     " 1) Try to use 10000 training examples instead and visualise the filters again\n",
320 |     " \n",
321 |     " 2) Dropout is a very useful technique for preventing overfitting. Try to add a DropoutLayer after the convolution layer and hidden layer. This should increase both performance and the \"visual appeal\" of the filters\n",
322 |     " \n",
323 |     " 3) Batch normalization is a recent innovation for improving generalization performance. Try to insert batch normalization layers into the network to improve performance. \n",
324 |     " \n",
325 |     " \n"
326 |    ]
327 |   },
328 |   {
329 |    "cell_type": "markdown",
330 |    "metadata": {},
331 |    "source": [
332 |     "# More Fun with convolutional networks\n",
333 |     "### Get the data"
334 |    ]
335 |   },
336 |   {
337 |    "cell_type": "code",
338 |    "execution_count": null,
339 |    "metadata": {
340 |     "collapsed": false
341 |    },
342 |    "outputs": [],
343 |    "source": [
344 |     "!wget -N https://s3.amazonaws.com/lasagne/recipes/datasets/mnist_cluttered_60x60_6distortions.npz"
345 |    ]
346 |   },
347 |   {
348 |    "cell_type": "markdown",
349 |    "metadata": {},
350 |    "source": [
351 |     "In the data each MNIST digit (20x20 pixels) has been placed randomly in a 60x60 canvas. To make the task harder each canvas has then been cluttered with small pieces of digits. In this task it is helpful for a network if it can focus only on the digit and ignore the rest.\n",
352 |     "\n",
353 |     "The ``TransformerLayer`` lets us do this. The transformer layer learns an affine transformation which lets the network zoom, rotate and skew. If you are interested you should read the paper, but the main idea is that you can let a small convolutional network determine the parameters of the affine transformation. You then apply the affine transformation to the input data. Usually this also involves downsampling which forces the model to zoom in on the relevant parts of the data. After the affine transformation we can use a larger conv net to do the classification. \n",
354 |     "This is possible because you can backprop through an affine transformation if you use bilinear interpolation."
355 |    ]
356 |   },
357 |   {
358 |    "cell_type": "code",
359 |    "execution_count": null,
360 |    "metadata": {
361 |     "collapsed": false
362 |    },
363 |    "outputs": [],
364 |    "source": [
365 |     "import os\n",
366 |     "import matplotlib\n",
367 |     "import numpy as np\n",
368 |     "np.random.seed(123)\n",
369 |     "import matplotlib.pyplot as plt\n",
370 |     "import lasagne\n",
371 |     "import theano\n",
372 |     "import theano.tensor as T\n",
373 |     "conv = lasagne.layers.Conv2DLayer\n",
374 |     "pool = lasagne.layers.MaxPool2DLayer\n",
375 |     "NUM_EPOCHS = 500\n",
376 |     "BATCH_SIZE = 256\n",
377 |     "LEARNING_RATE = 0.001\n",
378 |     "DIM = 60\n",
379 |     "NUM_CLASSES = 10\n",
380 |     "mnist_cluttered = \"mnist_cluttered_60x60_6distortions.npz\"\n",
381 |     "\n",
382 |     "\n"
383 |    ]
384 |   },
385 |   {
386 |    "cell_type": "code",
387 |    "execution_count": null,
388 |    "metadata": {
389 |     "collapsed": false
390 |    },
391 |    "outputs": [],
392 |    "source": [
393 |     "def load_data():\n",
394 |     "    data = np.load(mnist_cluttered)\n",
395 |     "    X_train, y_train = data['x_train'], np.argmax(data['y_train'], axis=-1)\n",
396 |     "    X_valid, y_valid = data['x_valid'], np.argmax(data['y_valid'], axis=-1)\n",
397 |     "    X_test, y_test = data['x_test'], np.argmax(data['y_test'], axis=-1)\n",
398 |     "\n",
399 |     "    # reshape for convolutions\n",
400 |     "    X_train = X_train.reshape((X_train.shape[0], 1, DIM, DIM))\n",
401 |     "    X_valid = X_valid.reshape((X_valid.shape[0], 1, DIM, DIM))\n",
402 |     "    X_test = X_test.reshape((X_test.shape[0], 1, DIM, DIM))\n",
403 |     "    \n",
404 |     "    print \"Train samples:\", X_train.shape\n",
405 |     "    print \"Validation samples:\", X_valid.shape\n",
406 |     "    print \"Test samples:\", X_test.shape\n",
407 |     "\n",
408 |     "    return dict(\n",
409 |     "        X_train=lasagne.utils.floatX(X_train),\n",
410 |     "        y_train=y_train.astype('int32'),\n",
411 |     "        X_valid=lasagne.utils.floatX(X_valid),\n",
412 |     "        y_valid=y_valid.astype('int32'),\n",
413 |     "        X_test=lasagne.utils.floatX(X_test),\n",
414 |     "        y_test=y_test.astype('int32'),\n",
415 |     "        num_examples_train=X_train.shape[0],\n",
416 |     "        num_examples_valid=X_valid.shape[0],\n",
417 |     "        num_examples_test=X_test.shape[0],\n",
418 |     "        input_height=X_train.shape[2],\n",
419 |     "        input_width=X_train.shape[3],\n",
420 |     "        output_dim=10,)\n",
421 |     "data = load_data()\n",
422 |     "\n",
423 |     "idx = 0\n",
424 |     "canvas = np.zeros((DIM*10, 10*DIM))\n",
425 |     "for i in range(10):\n",
426 |     "    for j in range(10):\n",
427 |     "        canvas[i*DIM:(i+1)*DIM, j*DIM:(j+1)*DIM] = data['X_train'][idx].reshape((DIM, DIM))\n",
428 |     "        idx += 1\n",
429 |     "plt.figure(figsize=(10, 10))\n",
430 |     "plt.imshow(canvas, cmap='gray')\n",
431 |     "plt.title('Cluttered handwritten digits')\n",
432 |     "plt.axis('off')\n",
433 |     "\n",
434 |     "plt.show()"
435 |    ]
436 |   },
437 |   {
438 |    "cell_type": "markdown",
439 |    "metadata": {},
440 |    "source": [
441 |     "## Building the model\n",
442 |     "\n",
443 |     "We use a model where the localization network is a two layer convolution network which operates directly on the image input. The output from the localization network is a 6 dimensional vector specifying the parameters in the affine transformation.\n",
444 |     "\n",
445 |     "We set up the transformer layer to initially do the identity transform, similarly to [1]. If the output from the localization network is [t1, t2, t3, t4, t5, t6] then t1 and t5 determine zoom, t2 and t4 determine skewness, and t3 and t6 move the center position. By setting the initial values of the bias vector to \n",
446 |     "\n",
447 |     "```\n",
448 |     "|1, 0, 0|\n",
449 |     "|0, 1, 0|\n",
450 |     "```\n",
451 |     "and the final W of the localization network to all zeros we ensure that in the beginning of training the network works as a pooling layer. \n",
452 |     "\n",
453 |     "The output of the localization layer feeds into the transformer layer which applies the transformation to the image input. In our setup the transformer layer downsamples the input by a factor 3.\n",
454 |     "\n",
455 |     "Finally, a 2-layer convolutional network and 2 fully connected layers calculate the output probabilities.\n",
456 |     "\n",
457 |     "\n",
458 |     "### The model\n",
459 |     "```\n",
460 |     "Input -> localization_network -> TransformerLayer -> output_network -> predictions\n",
461 |     "   |                                |\n",
462 |     "   >--------------------------------^\n",
463 |     "```\n",
464 |     "\n",
465 |     "\n"
466 |    ]
467 |   },
468 |   {
469 |    "cell_type": "code",
470 |    "execution_count": null,
471 |    "metadata": {
472 |     "collapsed": false
473 |    },
474 |    "outputs": [],
475 |    "source": [
476 |     "def build_model(input_width, input_height, output_dim,\n",
477 |     "                batch_size=BATCH_SIZE):\n",
478 |     "    ini = lasagne.init.HeUniform()\n",
479 |     "    l_in = lasagne.layers.InputLayer(shape=(None, 1, input_width, input_height),)\n",
480 |     "\n",
481 |     "    # Localization network\n",
482 |     "    b = np.zeros((2, 3), dtype=theano.config.floatX)\n",
483 |     "    b[0, 0] = 1\n",
484 |     "    b[1, 1] = 1\n",
485 |     "    b = b.flatten()\n",
486 |     "    loc_l1 = pool(l_in, pool_size=(2, 2))\n",
487 |     "    loc_l2 = conv(\n",
488 |     "        loc_l1, num_filters=8, filter_size=(5, 5), W=ini)\n",
489 |     "    loc_l3 = pool(loc_l2, pool_size=(2, 2))\n",
490 |     "    loc_l4 = conv(loc_l3, num_filters=8, filter_size=(5, 5), W=ini)\n",
491 |     "    loc_l5 = lasagne.layers.DenseLayer(\n",
492 |     "        loc_l4, num_units=50, W=lasagne.init.HeUniform('relu'))\n",
493 |     "    loc_out = lasagne.layers.DenseLayer(\n",
494 |     "        loc_l5, num_units=6, b=b, W=lasagne.init.Constant(0.0), \n",
495 |     "        nonlinearity=lasagne.nonlinearities.identity)\n",
496 |     "    \n",
497 |     "    # Transformer network\n",
498 |     "    l_trans1 = lasagne.layers.TransformerLayer(l_in, loc_out, downsample_factor=3.0)\n",
499 |     "    print \"Transformer network output shape: \", l_trans1.output_shape\n",
500 |     "    \n",
501 |     "    # Classification network\n",
502 |     "    class_l1 = conv(\n",
503 |     "        l_trans1,\n",
504 |     "        num_filters=16,\n",
505 |     "        filter_size=(3, 3),\n",
506 |     "        nonlinearity=lasagne.nonlinearities.rectify,\n",
507 |     "        W=ini,\n",
508 |     "    )\n",
509 |     "    class_l2 = pool(class_l1, pool_size=(2, 2))\n",
510 |     "    class_l3 = conv(\n",
511 |     "        class_l2,\n",
512 |     "        num_filters=16,\n",
513 |     "        filter_size=(3, 3),\n",
514 |     "        nonlinearity=lasagne.nonlinearities.rectify,\n",
515 |     "        W=ini,\n",
516 |     "    )\n",
517 |     "    class_l4 = pool(class_l3, pool_size=(2, 2))\n",
518 |     "    class_l5 = lasagne.layers.DenseLayer(\n",
519 |     "        class_l4,\n",
520 |     "        num_units=256,\n",
521 |     "        nonlinearity=lasagne.nonlinearities.rectify,\n",
522 |     "        W=ini,\n",
523 |     "    )\n",
524 |     "\n",
525 |     "    l_out = lasagne.layers.DenseLayer(\n",
526 |     "        class_l5,\n",
527 |     "        num_units=output_dim,\n",
528 |     "        nonlinearity=lasagne.nonlinearities.softmax,\n",
529 |     "        W=ini,\n",
530 |     "    )\n",
531 |     "\n",
532 |     "    return l_out, l_trans1\n",
533 |     "\n",
534 |     "model, l_transform = build_model(DIM, DIM, NUM_CLASSES)\n",
535 |     "model_params = lasagne.layers.get_all_params(model, trainable=True)\n"
536 |    ]
537 |   },
538 |   {
539 |    "cell_type": "code",
540 |    "execution_count": null,
541 |    "metadata": {
542 |     "collapsed": false
543 |    },
544 |    "outputs": [],
545 |    "source": [
546 |     "X = T.tensor4()\n",
547 |     "y = T.ivector()\n",
548 |     "\n",
549 |     "# training output\n",
550 |     "output_train = lasagne.layers.get_output(model, X, deterministic=False)\n",
551 |     "\n",
552 |     "# evaluation output. Also includes output of transform for plotting\n",
553 |     "output_eval, transform_eval = lasagne.layers.get_output([model, l_transform], X, deterministic=True)\n",
554 |     "\n",
555 |     "sh_lr = theano.shared(lasagne.utils.floatX(LEARNING_RATE))\n",
556 |     "cost = T.mean(T.nnet.categorical_crossentropy(output_train, y))\n",
557 |     "updates = lasagne.updates.adam(cost, model_params, learning_rate=sh_lr)\n",
558 |     "\n",
559 |     "train = theano.function([X, y], [cost, output_train], updates=updates)\n",
560 |     "eval = theano.function([X], [output_eval, transform_eval])"
561 |    ]
562 |   },
563 |   {
564 |    "cell_type": "markdown",
565 |    "metadata": {},
566 |    "source": [
567 |     "### Training the model\n",
568 |     "Unfortunately NVIDIA has yet to squeeze a TitanX into a laptop and training convnets on CPU is painfully slow. After 10 epochs you should see that the model starts to zoom in on the digits. "
569 |    ]
570 |   },
571 |   {
572 |    "cell_type": "code",
573 |    "execution_count": null,
574 |    "metadata": {
575 |     "collapsed": true
576 |    },
577 |    "outputs": [],
578 |    "source": [
579 |     "def train_epoch(X, y):\n",
580 |     "    num_samples = X.shape[0]\n",
581 |     "    num_batches = int(np.ceil(num_samples / float(BATCH_SIZE)))\n",
582 |     "    costs = []\n",
583 |     "    correct = 0\n",
584 |     "    for i in range(num_batches):\n",
585 |     "        if i % 10 == 0:\n",
586 |     "            print i,\n",
587 |     "        idx = range(i*BATCH_SIZE, np.minimum((i+1)*BATCH_SIZE, num_samples))\n",
588 |     "        X_batch = X[idx]\n",
589 |     "        y_batch = y[idx]\n",
590 |     "        cost_batch, output_train = train(X_batch, y_batch)\n",
591 |     "        costs += [cost_batch]\n",
592 |     "        preds = np.argmax(output_train, axis=-1)\n",
593 |     "        correct += np.sum(y_batch == preds)\n",
594 |     "    print \"\"\n",
595 |     "    return np.mean(costs), correct / float(num_samples)\n",
596 |     "\n",
597 |     "\n",
598 |     "def eval_epoch(X, y):\n",
599 |     "    output_eval, transform_eval = eval(X)\n",
600 |     "    preds = np.argmax(output_eval, axis=-1)\n",
601 |     "    acc = np.mean(preds == y)\n",
602 |     "    return acc, transform_eval"
603 |    ]
604 |   },
605 |   {
606 |    "cell_type": "code",
607 |    "execution_count": null,
608 |    "metadata": {
609 |     "collapsed": false
610 |    },
611 |    "outputs": [],
612 |    "source": [
613 |     "valid_accs, train_accs, test_accs = [], [], []\n",
614 |     "try:\n",
615 |     "    for n in range(NUM_EPOCHS):\n",
616 |     "        train_cost, train_acc = train_epoch(data['X_train'], data['y_train'])\n",
617 |     "        valid_acc, valid_trainsform = eval_epoch(data['X_valid'], data['y_valid'])\n",
618 |     "        test_acc, test_transform = eval_epoch(data['X_test'], data['y_test'])\n",
619 |     "        valid_accs += [valid_acc]\n",
620 |     "        test_accs += [test_acc]\n",
621 |     "        train_accs += [train_acc]\n",
622 |     "\n",
623 |     "        if (n+1) % 20 == 0:\n",
624 |     "            new_lr = sh_lr.get_value() * 0.7\n",
625 |     "            print \"New LR:\", new_lr\n",
626 |     "            sh_lr.set_value(lasagne.utils.floatX(new_lr))\n",
627 |     "\n",
628 |     "        print \"Epoch {0}: Train cost {1}, Train acc {2}, val acc {3}, test acc {4}\".format(\n",
629 |     "                n, train_cost, train_acc, valid_acc, test_acc)\n",
630 |     "except KeyboardInterrupt:\n",
631 |     "    pass"
632 |    ]
633 |   },
634 |   {
635 |    "cell_type": "markdown",
636 |    "metadata": {},
637 |    "source": [
638 |     "### Plot errors and zoom"
639 |    ]
640 |   },
641 |   {
642 |    "cell_type": "code",
643 |    "execution_count": null,
644 |    "metadata": {
645 |     "collapsed": false
646 |    },
647 |    "outputs": [],
648 |    "source": [
649 |     "plt.figure(figsize=(9,9))\n",
650 |     "plt.plot(1-np.array(train_accs), label='Training Error')\n",
651 |     "plt.plot(1-np.array(valid_accs), label='Validation Error')\n",
652 |     "plt.legend(fontsize=20)\n",
653 |     "plt.xlabel('Epoch', fontsize=20)\n",
654 |     "plt.ylabel('Error', fontsize=20)\n",
655 |     "plt.show()"
656 |    ]
657 |   },
658 |   {
659 |    "cell_type": "code",
660 |    "execution_count": null,
661 |    "metadata": {
662 |     "collapsed": false
663 |    },
664 |    "outputs": [],
665 |    "source": [
666 |     "plt.figure(figsize=(7,14))\n",
667 |     "for i in range(3):\n",
668 |     "    plt.subplot(321+i*2)\n",
669 |     "    plt.imshow(data['X_test'][i].reshape(DIM, DIM), cmap='gray', interpolation='none')\n",
670 |     "    if i == 0:\n",
671 |     "        plt.title('Original 60x60', fontsize=20)\n",
672 |     "    plt.axis('off')\n",
673 |     "    plt.subplot(322+i*2)\n",
674 |     "    plt.imshow(test_transform[i].reshape(DIM//3, DIM//3), cmap='gray', interpolation='none')\n",
675 |     "    if i == 0:\n",
676 |     "        plt.title('Transformed 20x20', fontsize=20)\n",
677 |     "    plt.axis('off')\n",
678 |     "    \n",
679 |     "    \n",
680 |     "plt.tight_layout()"
681 |    ]
682 |   },
683 |   {
684 |    "cell_type": "markdown",
685 |    "metadata": {
686 |     "collapsed": true
687 |    },
688 |    "source": [
689 |     "# A few pointers for image classification\n",
690 |     "If you want to do image classification, using a pretrained model is often a good choice, especially if you have limited amounts of labeled data. \n",
691 |     "\n",
692 |     "Often used pretrained networks are VGG16 and VGG19. Lasagne has pretrained models in the [modelzoo](https://github.com/Lasagne/Recipes/tree/master/modelzoo). Torch7 and Tensorflow have similar pretrained models that you can find with google. \n",
693 |     "\n",
694 |     "Currently the best performing image network is [ResNet](https://arxiv.org/pdf/1512.03385v1.pdf). Torch7 has an interesting blog post about Residual nets. http://torch.ch/blog/2016/02/04/resnets.html\n",
695 |     "\n",
696 |     "\n"
697 |    ]
698 |   },
699 |   {
700 |    "cell_type": "code",
701 |    "execution_count": null,
702 |    "metadata": {
703 |     "collapsed": true
704 |    },
705 |    "outputs": [],
706 |    "source": []
707 |   }
708 |  ],
709 |  "metadata": {
710 |   "kernelspec": {
711 |    "display_name": "Python 2",
712 |    "language": "python",
713 |    "name": "python2"
714 |   },
715 |   "language_info": {
716 |    "codemirror_mode": {
717 |     "name": "ipython",
718 |     "version": 2
719 |    },
720 |    "file_extension": ".py",
721 |    "mimetype": "text/x-python",
722 |    "name": "python",
723 |    "nbconvert_exporter": "python",
724 |    "pygments_lexer": "ipython2",
725 |    "version": "2.7.11"
726 |   }
727 |  },
728 |  "nbformat": 4,
729 |  "nbformat_minor": 0
730 | }
731 | 


--------------------------------------------------------------------------------
/lab2/mnist.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepLearningDTU/nvidia_deep_learning_summercamp_2016/9d87d409eeebed4b8cf39c42987fb5e786d3ae28/lab2/mnist.npz


--------------------------------------------------------------------------------
/lab3/.ipynb_checkpoints/RNN-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {
  7 |     "collapsed": false
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "%matplotlib inline \n",
 12 |     "%matplotlib nbagg\n",
 13 |     "import lasagne\n",
 14 |     "import theano\n",
 15 |     "import theano.tensor as T\n",
 16 |     "import matplotlib\n",
 17 |     "import numpy as np\n",
 18 |     "import matplotlib.pyplot as plt\n",
 19 |     "from IPython import display\n",
 20 |     "from data_generator import get_batch, print_valid_characters\n"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "markdown",
 25 |    "metadata": {},
 26 |    "source": [
 27 |     "# Recurrent Neural Networks\n",
 28 |     "\n",
 29 |     "Recurrent neural networks are the natural type of neural network to use for sequential data i.e. time series analysis, translation, speech recognition, biological sequence analysis etc. Recurrent neural networks work by recursively applying the same operation at each time step of the data sequence and having layers that pass information from previous time step to the current. It can therefore naturally handle input of varying length. Recurrent networks can be used for several prediction tasks including: sequence-to-class, sequence tagging, and sequence-to-sequence predictions.\n",
 30 |     "\n",
 31 |     "In this exercise we'll implement an Encoder-Decoder RNN based on the GRU unit for a simple sequence to sequence translation task. This type of model has shown impressive performance in Neural Machine Translation and Image Caption generation. \n",
 32 |     "\n",
 33 |     "For more in depth background material on RNNs please see [Supervised Sequence Labelling with Recurrent\n",
 34 |     "Neural Networks](https://www.cs.toronto.edu/~graves/preprint.pdf) by Alex Graves"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "markdown",
 39 |    "metadata": {},
 40 |    "source": [
 41 |     "# Encoder-Decoder\n",
 42 |     "In the encoder-decoder structure one RNN (blue) encodes the input and a second RNN (red) calculates the target values. One essential step is to let the encoder and decoder communicate. In the simplest approach you use the last hidden state of the encoder to initialize the decoder. Other approaches let the decoder attend to different parts of the encoded input at different timesteps in the decoding process. \n",
 43 |     "\n",
 44 |     "<img src=\"files/enc-dec.png\", width=400>\n",
 45 |     "\n",
 46 |     "In our implementation we use an RNN with gated recurrent units (GRU) as encoder. We then use the last hidden state of the encoder ($h^{enc}_T$) as input to the decoder which is also a GRU RNN. \n",
 47 |     "\n",
 48 |     "### RNNs in Lasagne\n",
 49 |     "Lasagne has implementations of the LSTM and GRU units. Both layers assume that the input from the layer below has the shape **(Batch_size, seq_len, num_features)**. In this exercise we will use the GRU unit since it only stores a single hidden value per neuron (LSTMs store two) and is approximately twice as fast as the LSTM unit.\n",
 50 |     "\n",
 51 |     "As stated above we will implement an Encoder-Decoder model. The simplest way to do this is to encode the input sequence using the Encoder model. We will then use the last hidden state of the Encoder $h^{enc}_T$ as input to the decoder model which then uses this information (simply a fixed length vector of numbers) to produce the targets. There are (at least) two ways to input $h^{enc}_T$ into the decoder\n",
 52 |     "\n",
 53 |     "1. Repeatedly use $h^{enc}_T$ as input to the Decoder at each decode time step\n",
 54 |     "2. Initialize the decoder using $h^{enc}_T$ and run the decoder without any inputs\n",
 55 |     "\n",
 56 |     "In this exercise we will follow the first approach because it's easier to implement. To do this we need to create a lasagne layer that takes $h^{enc}_T$ and repeats it *N_decode_step* times. Below is an implementation of the RepeatLayer. You don't need to know the exact way it works, however make sure that you understand that it takes an input of size *(Batch_size x num_units)* and produces an output of size (Batch_size x n_decode_steps x num_units).\n"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "code",
 61 |    "execution_count": null,
 62 |    "metadata": {
 63 |     "collapsed": true
 64 |    },
 65 |    "outputs": [],
 66 |    "source": [
 67 |     "class RepeatLayer(lasagne.layers.Layer):\n",
 68 |     "    def __init__(self, incoming, n, **kwargs):\n",
 69 |     "        '''\n",
 70 |     "        The input is expected to be a 2D tensor of shape \n",
 71 |     "        (num_batch, num_features). The input is repeated\n",
 72 |     "        n times such that the output will be \n",
 73 |     "        (num_batch, n, num_features)\n",
 74 |     "        '''\n",
 75 |     "        super(RepeatLayer, self).__init__(incoming, **kwargs)\n",
 76 |     "        self.n = n\n",
 77 |     "\n",
 78 |     "    def get_output_shape_for(self, input_shape):\n",
 79 |     "        return tuple([input_shape[0], self.n] + list(input_shape[1:]))\n",
 80 |     "\n",
 81 |     "    def get_output_for(self, input, **kwargs):\n",
 82 |     "        #repeat the input n times\n",
 83 |     "        tensors = [input]*self.n\n",
 84 |     "        stacked = theano.tensor.stack(*tensors)\n",
 85 |     "        dim = [1, 0] + range(2, input.ndim + 1)\n",
 86 |     "        return stacked.dimshuffle(dim)"
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "markdown",
 91 |    "metadata": {},
 92 |    "source": [
 93 |     "### The Data\n",
 94 |     "Since RNN models can be very slow to train on real large datasets we will generate some simpler training data for this exercise. The task for the RNN is simply to translate a string of letters spelling the numbers between 0-9 into the corresponding numbers i.e\n",
 95 |     "\n",
 96 |     "\"one two five\" --> \"125#\" (we use # as a special stop of sequence character)\n",
 97 |     "\n",
 98 |     "To input the strings into the RNN model we translate the characters into a vector of integers using a simple translation table (i.e. 'h'->16, 'o'-> 17 etc). The code below prints a few input/output pairs using the *get_batch* function which randomly produces the data."
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": null,
104 |    "metadata": {
105 |     "collapsed": false
106 |    },
107 |    "outputs": [],
108 |    "source": [
109 |     "batch_size = 3\n",
110 |     "inputs, input_masks, targets, target_masks, text_inputs, text_targets = \\\n",
111 |     "    get_batch(batch_size=batch_size,max_digits=2,min_digits=1)\n",
112 |     "\n",
113 |     "print \"input types:\", inputs.dtype,  input_masks.dtype, targets.dtype, target_masks.dtype\n",
114 |     "print print_valid_characters()\n",
115 |     "print \"Stop character = #\"\n",
116 |     "\n",
117 |     "\n",
118 |     "for i in range(batch_size):\n",
119 |     "    print \"\\nSAMPLE\",i\n",
120 |     "    print \"TEXT INPUTS:\\t\\t\", text_inputs[i]\n",
121 |     "    print \"TEXT TARGETS:\\t\\t\", text_targets[i]\n",
122 |     "    print \"ENCODED INPUTS:\\t\\t\", inputs[i]\n",
123 |     "    print \"MASK INPUTS:\\t\\t\", input_masks[i]\n",
124 |     "    print \"ENCODED TARGETS:\\t\", targets[i]\n",
125 |     "    print \"MASK TARGETS:\\t\\t\", target_masks[i]"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "markdown",
130 |    "metadata": {},
131 |    "source": [
132 |     "### Encoder Decoder model setup\n",
133 |     "Below is the Lasagne model definition. We use an embedding layer to go from integer representation to vector representation of the input.\n",
134 |     "\n",
135 |     "Note that the layer has a lot of print statements which we used for debugging during setup."
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "code",
140 |    "execution_count": null,
141 |    "metadata": {
142 |     "collapsed": false
143 |    },
144 |    "outputs": [],
145 |    "source": [
146 |     "BATCH_SIZE = 100\n",
147 |     "NUM_UNITS_ENC = 10\n",
148 |     "NUM_UNITS_DEC = 10\n",
149 |     "MAX_DIGITS = 20 \n",
150 |     "MIN_DIGITS = MAX_DIGITS #currently only support for same length outputs - we'll leave it for an exercise to add support for varying length targets\n",
151 |     "NUM_INPUTS = 27\n",
152 |     "NUM_OUTPUTS = 11 #(0-9 + '#')\n",
153 |     "\n",
154 |     "\n",
155 |     "#symbolic theano variables. Note that we are using imatrix for X since it goes into the embedding layer\n",
156 |     "x_sym = T.imatrix()\n",
157 |     "y_sym = T.imatrix()\n",
158 |     "xmask_sym = T.matrix()\n",
159 |     "\n",
160 |     "#dummy data to test implementation - We advise to check the output-dimensions of all layers.\n",
161 |     "#One way to do this in lasagne/theano is to forward pass some data through the model and \n",
162 |     "#check the output dimensions of these.\n",
163 |     "#Create some random testdata\n",
164 |     "X = np.random.randint(0,10,size=(BATCH_SIZE,MIN_DIGITS)).astype('int32')\n",
165 |     "Xmask = np.ones((BATCH_SIZE,MIN_DIGITS)).astype('float32')\n",
166 |     "\n",
167 |     "##### ENCODER START #####\n",
168 |     "l_in = lasagne.layers.InputLayer((None, None))\n",
169 |     "l_emb = lasagne.layers.EmbeddingLayer(l_in, NUM_INPUTS, NUM_INPUTS, \n",
170 |     "                                      W=np.eye(NUM_INPUTS,dtype='float32'),\n",
171 |     "                                      name='Embedding')\n",
172 |     "#Here we'll remove the trainable parameters from the embedding layer to constrain \n",
173 |     "#it to a simple \"one-hot-encoding\". You can experiment with removing this line\n",
174 |     "l_emb.params[l_emb.W].remove('trainable') \n",
175 |     "#forward pass some data through the inputlayer-embedding layer and print the output shape\n",
176 |     "print lasagne.layers.get_output(l_emb, inputs={l_in: x_sym}).eval({x_sym: X}).shape\n",
177 |     "\n",
178 |     "l_mask_enc = lasagne.layers.InputLayer((None, None))\n",
179 |     "l_enc = lasagne.layers.GRULayer(l_emb, num_units=NUM_UNITS_ENC, name='GRUEncoder', mask_input=l_mask_enc)\n",
180 |     "print lasagne.layers.get_output(l_enc, inputs={l_in: x_sym, l_mask_enc: xmask_sym}).eval(\n",
181 |     "    {x_sym: X, xmask_sym: Xmask}).shape\n",
182 |     "\n",
183 |     "# slice last index of dimension 1\n",
184 |     "l_last_hid = lasagne.layers.SliceLayer(l_enc, indices=-1, axis=1)\n",
185 |     "print lasagne.layers.get_output(l_last_hid, inputs={l_in: x_sym, l_mask_enc: xmask_sym}).eval(\n",
186 |     "    {x_sym: X, xmask_sym: Xmask}).shape\n",
187 |     "##### END OF ENCODER######\n",
188 |     "\n",
189 |     "\n",
190 |     "##### START OF DECODER######\n",
191 |     "l_in_rep = RepeatLayer(l_last_hid, n=MAX_DIGITS+1) #we add one to allow space for the end of sequence character\n",
192 |     "print lasagne.layers.get_output(l_in_rep, inputs={l_in: x_sym, l_mask_enc: xmask_sym}).eval(\n",
193 |     "    {x_sym: X, xmask_sym: Xmask}).shape\n",
194 |     "\n",
195 |     "l_dec = lasagne.layers.GRULayer(l_in_rep, num_units=NUM_UNITS_DEC, name='GRUDecoder')\n",
196 |     "print lasagne.layers.get_output(l_dec, inputs={l_in: x_sym, l_mask_enc: xmask_sym}).eval(\n",
197 |     "    {x_sym: X, xmask_sym: Xmask}).shape\n",
198 |     "\n",
199 |     "\n",
200 |     "# We need to do some reshape voodoo to connect a softmax layer to the decoder.\n",
201 |     "# See http://lasagne.readthedocs.org/en/latest/modules/layers/recurrent.html#examples \n",
202 |     "# In short this line changes the shape from \n",
203 |     "# (batch_size, decode_len, num_dec_units) -> (batch_size*decodelen,num_dec_units). \n",
204 |     "# We need to do this since the softmax is applied to the last dimension and we want to \n",
205 |     "# softmax the output at each position individually\n",
206 |     "l_reshape = lasagne.layers.ReshapeLayer(l_dec, (-1, [2]))\n",
207 |     "print lasagne.layers.get_output(l_reshape, inputs={l_in: x_sym, l_mask_enc: xmask_sym}).eval(\n",
208 |     "    {x_sym: X, xmask_sym: Xmask}).shape\n",
209 |     "\n",
210 |     "l_softmax = lasagne.layers.DenseLayer(l_reshape, num_units=NUM_OUTPUTS, \n",
211 |     "                                      nonlinearity=lasagne.nonlinearities.softmax,\n",
212 |     "                                      name='SoftmaxOutput')\n",
213 |     "print lasagne.layers.get_output(l_softmax, inputs={l_in: x_sym, l_mask_enc: xmask_sym}).eval(\n",
214 |     "    {x_sym: X, xmask_sym: Xmask}).shape\n",
215 |     "\n",
216 |     "# reshape back to 3d format (batch_size, decode_len, NUM_OUTPUTS). Here we tied the batch size to the shape of the symbolic variable for X allowing \n",
217 |     "#us to use different batch sizes in the model.\n",
218 |     "l_out = lasagne.layers.ReshapeLayer(l_softmax, (x_sym.shape[0], -1, NUM_OUTPUTS))\n",
219 |     "print lasagne.layers.get_output(l_out, inputs={l_in: x_sym, l_mask_enc: xmask_sym}).eval(\n",
220 |     "    {x_sym: X, xmask_sym: Xmask}).shape\n",
221 |     "###END OF DECODER######\n"
222 |    ]
223 |   },
224 |   {
225 |    "cell_type": "markdown",
226 |    "metadata": {},
227 |    "source": [
228 |     "### Defining the cost function and theano functions\n",
229 |     "Because the targets are categorical we use cross entropy error. We use the Adam optimizer but you\n",
230 |     "can experiment with the different optimizers implemented in [Lasagne](http://lasagne.readthedocs.org/en/latest/modules/updates.html). "
231 |    ]
232 |   },
233 |   {
234 |    "cell_type": "code",
235 |    "execution_count": null,
236 |    "metadata": {
237 |     "collapsed": false
238 |    },
239 |    "outputs": [],
240 |    "source": [
241 |     "output_decoder_train = lasagne.layers.get_output(l_out, inputs={l_in: x_sym, l_mask_enc: xmask_sym}, \n",
242 |     "                                                deterministic=False)\n",
243 |     "\n",
244 |     "#cost function\n",
245 |     "total_cost = T.nnet.categorical_crossentropy(\n",
246 |     "    T.reshape(output_decoder_train, (-1, NUM_OUTPUTS)), y_sym.flatten())\n",
247 |     "mean_cost = T.mean(total_cost)\n",
248 |     "#accuracy function\n",
249 |     "argmax = T.argmax(output_decoder_train,axis=-1)\n",
250 |     "eq = T.eq(argmax,y_sym)\n",
251 |     "acc = T.mean(eq)  # gives float64 because eq is uint8, T.cast(eq, 'float32') will fix that...\n",
252 |     "\n",
253 |     "#Get parameters of both encoder and decoder\n",
254 |     "all_parameters = lasagne.layers.get_all_params([l_out], trainable=True)\n",
255 |     "\n",
256 |     "print \"Trainable Model Parameters\"\n",
257 |     "print \"-\"*40\n",
258 |     "for param in all_parameters:\n",
259 |     "    print param, param.get_value().shape\n",
260 |     "print \"-\"*40\n",
261 |     "\n",
262 |     "#add grad clipping to avoid exploding gradients\n",
263 |     "all_grads = [T.clip(g,-3,3) for g in T.grad(mean_cost, all_parameters)]\n",
264 |     "all_grads = lasagne.updates.total_norm_constraint(all_grads,3)\n",
265 |     "\n",
266 |     "#Compile Theano functions.\n",
267 |     "#The two first two inputs to theano.functions is \n",
268 |     "#1) a list of theano shared variables and \n",
269 |     "#2) a list of functions(graphs) to calculate the values of most importanly the cost function. \n",
270 |     "#3) for the training function the update argument should be given as the output from one of \n",
271 |     "#4) lasagnes optimizers. of this argument is not set no parameters will be updated and only the values if 2) will be calculated\n",
272 |     "updates = lasagne.updates.adam(all_grads, all_parameters, learning_rate=0.005)\n",
273 |     "train_func = theano.function([x_sym, y_sym, xmask_sym], [mean_cost, acc, output_decoder_train], updates=updates)\n",
274 |     "#since we don't have any stochasticity in the network we will just use the training graph without any updates given\n",
275 |     "test_func = theano.function([x_sym, y_sym, xmask_sym], [acc, output_decoder_train])"
276 |    ]
277 |   },
278 |   {
279 |    "cell_type": "code",
280 |    "execution_count": null,
281 |    "metadata": {
282 |     "collapsed": false
283 |    },
284 |    "outputs": [],
285 |    "source": [
286 |     "#Generate some validation data\n",
287 |     "Xval, Xmask_val, Yval, Ymask_val, text_inputs_val, text_targets_val = \\\n",
288 |     "    get_batch(batch_size=5000, max_digits=MAX_DIGITS,min_digits=MIN_DIGITS)"
289 |    ]
290 |   },
291 |   {
292 |    "cell_type": "markdown",
293 |    "metadata": {},
294 |    "source": [
295 |     "# Training"
296 |    ]
297 |   },
298 |   {
299 |    "cell_type": "code",
300 |    "execution_count": null,
301 |    "metadata": {
302 |     "collapsed": false
303 |    },
304 |    "outputs": [],
305 |    "source": [
306 |     "val_interval = 5000\n",
307 |     "samples_to_process = 3e5\n",
308 |     "samples_processed = 0\n",
309 |     "\n",
310 |     "val_samples = []\n",
311 |     "costs, accs = [], []\n",
312 |     "plt.figure()\n",
313 |     "try:\n",
314 |     "    while samples_processed < samples_to_process:\n",
315 |     "        inputs, input_masks, targets, target_masks, _, _ = \\\n",
316 |     "            get_batch(batch_size=BATCH_SIZE,max_digits=MAX_DIGITS,min_digits=MIN_DIGITS)\n",
317 |     "        batch_cost, batch_acc, batch_output = train_func(inputs, targets, input_masks)\n",
318 |     "        costs += [batch_cost]\n",
319 |     "        samples_processed += BATCH_SIZE\n",
320 |     "        #validation data\n",
321 |     "        if samples_processed % val_interval == 0:\n",
322 |     "            #print \"validating\"\n",
323 |     "            val_acc, val_output = test_func(Xval, Yval, Xmask_val)\n",
324 |     "            val_samples += [samples_processed]\n",
325 |     "            accs += [val_acc]\n",
326 |     "            plt.plot(val_samples,accs)\n",
327 |     "            plt.ylabel('Validation Accuracy', fontsize=15)\n",
328 |     "            plt.xlabel('Processed samples', fontsize=15)\n",
329 |     "            plt.title('', fontsize=20)\n",
330 |     "            plt.grid('on')\n",
331 |     "            display.display(plt.gcf())\n",
332 |     "            display.clear_output(wait=True)\n",
333 |     "            plt.show()\n",
334 |     "except KeyboardInterrupt:\n",
335 |     "    pass\n"
336 |    ]
337 |   },
338 |   {
339 |    "cell_type": "code",
340 |    "execution_count": null,
341 |    "metadata": {
342 |     "collapsed": false
343 |    },
344 |    "outputs": [],
345 |    "source": [
346 |     "#plot of validation accuracy for each target position\n",
347 |     "plt.figure(figsize=(7,7))\n",
348 |     "plt.plot(np.mean(np.argmax(val_output,axis=2)==Yval,axis=0))\n",
349 |     "plt.ylabel('Accuracy', fontsize=15)\n",
350 |     "plt.xlabel('Target position', fontsize=15)\n",
351 |     "#plt.title('', fontsize=20)\n",
352 |     "plt.grid('on')\n",
353 |     "plt.show()\n",
354 |     "#why do the plot look like this?"
355 |    ]
356 |   },
357 |   {
358 |    "cell_type": "code",
359 |    "execution_count": null,
360 |    "metadata": {
361 |     "collapsed": false
362 |    },
363 |    "outputs": [],
364 |    "source": [
365 |     "# plot training cost\n",
366 |     "#plt.figure(figsize=(7,7))\n",
367 |     "#plt.plot(costs)\n",
368 |     "#plt.ylabel('Cost', fontsize=15)\n",
369 |     "#plt.xlabel('Number of updates', fontsize=15)\n",
370 |     "#plt.title('Training', fontsize=20)\n",
371 |     "#plt.show()"
372 |    ]
373 |   },
374 |   {
375 |    "cell_type": "markdown",
376 |    "metadata": {},
377 |    "source": [
378 |     "# Exercises:\n",
379 |     "1. What is the final validation performance? Why do you think it is not better? Comment on the accuracy for each position of the output symbols.\n",
380 |     "\n",
381 |     "2. Why do you think the validation performance looks more \"jig-saw\" like compared to FFN and CNN models?\n",
382 |     "\n",
383 |     "3. Optional: Bidirectional encoder. In Lasagne, bidirectional RNNs are implemented by running a forward model and a backward model separately and then concatenating their outputs before passing them on to the next layer. You can experiment with a merging layer other than concat, e.g. sum or multiplication; see [lasagne merge layers](http://lasagne.readthedocs.org/en/latest/modules/layers/merge.html).\n",
384 |     "\n",
385 |     "```\n",
386 |     "l_rec_fwd = lasagne.layers.GRULayer(...,backwards=False)\n",
387 |     "l_rec_bwd = lasagne.layers.GRULayer(...,backwards=True)\n",
388 |     "l_rec = lasagne.layers.ConcatLayer([l_rec_fwd, l_rec_bwd], axis=2)\n",
389 |     "```\n",
390 |     "\n",
391 |     "4. Optional: Add support for different lengths of targets (hint: add the target_mask to the cost function and only calculate the cost for the non-masked targets)\n"
392 |    ]
393 |   },
394 |   {
395 |    "cell_type": "markdown",
396 |    "metadata": {},
397 |    "source": [
398 |     "#### Attention Decoder (LSTM)\n",
399 |     "Selective attention for recurrent neural networks has recently attracted a lot of interest. These methods let the decoder model selectively focus on which part of the encoder sequence it will use for each decoded output symbol. This relieves the encoder from having to compress the input sequence into a fixed-size vector representation passed on to the decoder. Secondly, we can interrogate the decoder network about where it attends while producing the outputs. Below we'll implement an LSTM decoder with selective attention and show that it significantly improves the performance on the toy translation task."
400 |    ]
401 |   },
402 |   {
403 |    "cell_type": "code",
404 |    "execution_count": null,
405 |    "metadata": {
406 |     "collapsed": false
407 |    },
408 |    "outputs": [],
409 |    "source": [
410 |     "from decoder_attention import LSTMAttentionDecodeFeedbackLayer\n",
411 |     "\n",
412 |     "# you can acces the attetion weights alpha by adding l_dec.alpha \n",
413 |     "# to the output variables in the theano function\n",
414 |     "\n",
415 |     "BATCH_SIZE = 100\n",
416 |     "NUM_UNITS_ENC = 10\n",
417 |     "NUM_UNITS_DEC = 10\n",
418 |     "MAX_DIGITS = 20 \n",
419 |     "MIN_DIGITS = MAX_DIGITS #currently only support for same length outputs - we'll leave it for an exercise to add support for varying length targets\n",
420 |     "NUM_INPUTS = 27\n",
421 |     "NUM_OUTPUTS = 11 #(0-9 + '#')\n",
422 |     "\n",
423 |     "\n",
424 |     "x_sym = T.imatrix()\n",
425 |     "y_sym = T.imatrix()\n",
426 |     "xmask_sym = T.matrix()\n",
427 |     "    \n",
428 |     "\n",
429 |     "#dummy data to test implementation\n",
430 |     "#X = np.random.randint(0,10,size=(BATCH_SIZE,15)).astype('int32')\n",
431 |     "#Xmask = np.ones((BATCH_SIZE,NUM_INPUTS)).astype('float32')\n",
432 |     "\n",
433 |     "l_in = lasagne.layers.InputLayer((None, None))\n",
434 |     "l_emb = lasagne.layers.EmbeddingLayer(l_in, NUM_INPUTS, NUM_INPUTS, \n",
435 |     "                                      W=np.eye(NUM_INPUTS,dtype='float32'),\n",
436 |     "                                      name='Embedding')\n",
437 |     "##### ENCODER START #####\n",
438 |     "l_in = lasagne.layers.InputLayer((None, None))\n",
439 |     "l_emb = lasagne.layers.EmbeddingLayer(l_in, NUM_INPUTS, NUM_INPUTS, \n",
440 |     "                                      W=np.eye(NUM_INPUTS,dtype='float32'),\n",
441 |     "                                      name='Embedding')\n",
442 |     "#Here we'll remove the trainable parameters from the embeding layer to constrain \n",
443 |     "#it to a simple \"one-hot-encoding\". You can experiment with removing this line\n",
444 |     "l_emb.params[l_emb.W].remove('trainable') \n",
445 |     "print lasagne.layers.get_output(l_emb, inputs={l_in: x_sym}).eval(\n",
446 |     "    {x_sym: X}).shape\n",
447 |     "T.grad(lasagne.layers.get_output(l_emb, inputs={l_in: x_sym}).sum(), \n",
448 |     "       lasagne.layers.get_all_params(l_emb, trainable=True))\n",
449 |     "\n",
450 |     "\n",
451 |     "\n",
452 |     "\n",
453 |     "l_mask_enc = lasagne.layers.InputLayer((None, None))\n",
454 |     "l_enc = lasagne.layers.GRULayer(l_emb, num_units=NUM_UNITS_ENC, name='GRUEncoder', mask_input=l_mask_enc)\n",
455 |     "print lasagne.layers.get_output(l_enc, inputs={l_in: x_sym, l_mask_enc: xmask_sym}).eval(\n",
456 |     "    {x_sym: X, xmask_sym: Xmask}).shape\n",
457 |     "T.grad(lasagne.layers.get_output(l_enc, inputs={l_in: x_sym, l_mask_enc: xmask_sym}).sum(), \n",
458 |     "       lasagne.layers.get_all_params(l_enc, trainable=True))\n",
459 |     "####END OF ENCODER######\n",
460 |     "\n",
461 |     "\n",
462 |     "####START OF DECODER######\n",
463 |     "#note that the decoder have its own input layer, we'll use that to plug in the output \n",
464 |     "#from the encoder later\n",
465 |     "l_dec = LSTMAttentionDecodeFeedbackLayer(l_enc,\n",
466 |     "                                        num_units=NUM_UNITS_DEC, \n",
467 |     "                                        aln_num_units=20,\n",
468 |     "                                        n_decodesteps=MAX_DIGITS+1,\n",
469 |     "                                        name='LSTMDecoder')\n",
470 |     "print lasagne.layers.get_output(l_dec, inputs={l_in: x_sym, l_mask_enc: xmask_sym}).eval(\n",
471 |     "    {x_sym: X, xmask_sym: Xmask}).shape\n",
472 |     "T.grad(lasagne.layers.get_output(l_dec, inputs={l_in: x_sym, l_mask_enc: xmask_sym}).sum(), \n",
473 |     "       lasagne.layers.get_all_params(l_dec, trainable=True))\n",
474 |     "\n",
475 |     "# We need to do some reshape voodo to connect a softmax layer to the decoder.\n",
476 |     "# See http://lasagne.readthedocs.org/en/latest/modules/layers/recurrent.html#examples \n",
477 |     "l_reshape = lasagne.layers.ReshapeLayer(l_dec, (-1, [2]))\n",
478 |     "l_softmax = lasagne.layers.DenseLayer(l_reshape, num_units=NUM_OUTPUTS, \n",
479 |     "                                      nonlinearity=lasagne.nonlinearities.softmax,\n",
480 |     "                                      name='SoftmaxOutput')\n",
481 |     "# print lasagne.layers.get_output(l_softmax, x_sym).eval({x_sym: X}).shape\n",
482 |     "# reshape back to 3d format (here we tied the batch size to the shape of the symbolic variable for X allowing \n",
483 |     "#us to use different batch sizes in the model)\n",
484 |     "l_out = lasagne.layers.ReshapeLayer(l_softmax, (x_sym.shape[0], -1, NUM_OUTPUTS))\n",
485 |     "print lasagne.layers.get_output(l_out, inputs={l_in: x_sym, l_mask_enc: xmask_sym}, deterministic=False).eval(\n",
486 |     "    {x_sym: X, xmask_sym: Xmask}).shape\n",
487 |     "T.grad(lasagne.layers.get_output(l_out, inputs={l_in: x_sym, l_mask_enc: xmask_sym}).sum(), \n",
488 |     "       lasagne.layers.get_all_params(l_dec, trainable=True))\n",
489 |     "\n",
490 |     "print \"\"\n",
491 |     "###END OF DECODER######\n",
492 |     "\n"
493 |    ]
494 |   },
495 |   {
496 |    "cell_type": "code",
497 |    "execution_count": null,
498 |    "metadata": {
499 |     "collapsed": true
500 |    },
501 |    "outputs": [],
502 |    "source": [
503 |     "#Generate some validation data\n",
504 |     "Xval, Xmask_val, Yval, Ymask_val, text_inputs_val, text_targets_val = \\\n",
505 |     "    get_batch(batch_size=5000, max_digits=MAX_DIGITS,min_digits=MIN_DIGITS)"
506 |    ]
507 |   },
508 |   {
509 |    "cell_type": "code",
510 |    "execution_count": null,
511 |    "metadata": {
512 |     "collapsed": false
513 |    },
514 |    "outputs": [],
515 |    "source": [
516 |     "#get output of encoder using X and Xmask as input\n",
517 |     "output_decoder_train = lasagne.layers.get_output(l_out, inputs={l_in: x_sym, l_mask_enc: xmask_sym}, \n",
518 |     "                                                 deterministic=False)\n",
519 |     "\n",
520 |     "#cost function\n",
521 |     "total_cost = T.nnet.categorical_crossentropy(\n",
522 |     "    T.reshape(output_decoder_train, (-1, NUM_OUTPUTS)), y_sym.flatten())\n",
523 |     "mean_cost = T.mean(total_cost)\n",
524 |     "#accuracy function\n",
525 |     "acc = T.mean(T.eq(T.argmax(output_decoder_train,axis=-1),y_sym))\n",
526 |     "\n",
527 |     "#Get parameters of both encoder and decoder\n",
528 |     "all_parameters = lasagne.layers.get_all_params(l_out, trainable=True)\n",
529 |     "\n",
530 |     "print \"Trainable Model Parameters\"\n",
531 |     "print \"-\"*40\n",
532 |     "for param in all_parameters:\n",
533 |     "    print param, param.get_value().shape\n",
534 |     "print \"-\"*40\n",
535 |     "\n",
536 |     "#add grad clipping to avoid exploding gradients\n",
537 |     "all_grads = [T.clip(g,-3,3) for g in T.grad(mean_cost, all_parameters)]\n",
538 |     "all_grads = lasagne.updates.total_norm_constraint(all_grads,3)\n",
539 |     "\n",
540 |     "#Compile Theano functions\n",
541 |     "updates = lasagne.updates.adam(all_grads, all_parameters, learning_rate=0.005)\n",
542 |     "train_func = theano.function([x_sym, y_sym, xmask_sym], [mean_cost, acc, output_decoder_train], updates=updates)\n",
543 |     "#since we don't have any stochasticity in the network we will just use the training graph without any updates given\n",
544 |     "test_func = theano.function([x_sym, y_sym, xmask_sym], [acc, output_decoder_train, l_dec.alpha])\n"
545 |    ]
546 |   },
547 |   {
548 |    "cell_type": "code",
549 |    "execution_count": null,
550 |    "metadata": {
551 |     "collapsed": false
552 |    },
553 |    "outputs": [],
554 |    "source": [
555 |     "val_interval = 5000\n",
556 |     "samples_to_process = 1.5e5\n",
557 |     "samples_processed = 0\n",
558 |     "val_samples = []\n",
559 |     "costs, accs = [], []\n",
560 |     "plt.figure()\n",
561 |     "try:\n",
562 |     "    while samples_processed < samples_to_process:\n",
563 |     "        inputs, input_masks, targets, target_masks, _, _ = \\\n",
564 |     "            get_batch(batch_size=BATCH_SIZE,max_digits=MAX_DIGITS,min_digits=MIN_DIGITS)\n",
565 |     "        batch_cost, batch_acc, batch_output = train_func(inputs, targets, input_masks)\n",
566 |     "        costs += [batch_cost]\n",
567 |     "        samples_processed += BATCH_SIZE\n",
568 |     "        #print i, samples_processed\n",
569 |     "        #validation data\n",
570 |     "        if samples_processed % val_interval == 0:\n",
571 |     "            #print \"validating\"\n",
572 |     "            val_acc, val_output, alpha = test_func(Xval, Yval, Xmask_val)\n",
573 |     "            val_samples += [samples_processed]\n",
574 |     "            accs += [val_acc]\n",
575 |     "            plt.plot(val_samples,accs)\n",
576 |     "            plt.ylabel('', fontsize=15)\n",
577 |     "            plt.xlabel('Processed samples', fontsize=15)\n",
578 |     "            plt.title('Validation Accuracy', fontsize=20)\n",
579 |     "            plt.grid('on')\n",
580 |     "            display.display(plt.gcf())\n",
581 |     "            display.clear_output(wait=True)\n",
582 |     "            plt.show()\n",
583 |     "except KeyboardInterrupt:\n",
584 |     "    pass\n",
585 |     "        "
586 |    ]
587 |   },
588 |   {
589 |    "cell_type": "code",
590 |    "execution_count": null,
591 |    "metadata": {
592 |     "collapsed": false
593 |    },
594 |    "outputs": [],
595 |    "source": [
596 |     "#plot of validation accuracy for each target position\n",
597 |     "plt.figure(figsize=(7,7))\n",
598 |     "plt.plot(np.mean(np.argmax(val_output,axis=2)==Yval,axis=0))\n",
599 |     "plt.ylabel('Accuracy', fontsize=15)\n",
600 |     "plt.xlabel('Target position', fontsize=15)\n",
601 |     "#plt.title('', fontsize=20)\n",
602 |     "plt.grid('on')\n",
603 |     "plt.show()\n",
604 |     "#why do the plot look like this?"
605 |    ]
606 |   },
607 |   {
608 |    "cell_type": "code",
609 |    "execution_count": null,
610 |    "metadata": {
611 |     "collapsed": false
612 |    },
613 |    "outputs": [],
614 |    "source": [
615 |     "#Plot of average attention weight as a function of the sequence position for each of \n",
616 |     "#the 21 targets in the output sequence i.e. each line is the mean postion of the \n",
617 |     "#attention for each target position.\n",
618 |     "\n",
619 |     "np.mean(alpha,axis=0).shape\n",
620 |     "plt.figure()\n",
621 |     "plt.plot(np.mean(alpha,axis=0).T)\n",
622 |     "plt.ylabel('alpha', fontsize=15)\n",
623 |     "plt.xlabel('Input Sequence position', fontsize=15)\n",
624 |     "plt.title('Alpha weights', fontsize=20)\n",
625 |     "plt.legend(map(str,range(1,22)), bbox_to_anchor=(1.125,1.0), fontsize=10)\n",
626 |     "plt.show()\n"
627 |    ]
628 |   },
629 |   {
630 |    "cell_type": "code",
631 |    "execution_count": null,
632 |    "metadata": {
633 |     "collapsed": true
634 |    },
635 |    "outputs": [],
636 |    "source": []
637 |   }
638 |  ],
639 |  "metadata": {
640 |   "kernelspec": {
641 |    "display_name": "Python 2",
642 |    "language": "python",
643 |    "name": "python2"
644 |   },
645 |   "language_info": {
646 |    "codemirror_mode": {
647 |     "name": "ipython",
648 |     "version": 2
649 |    },
650 |    "file_extension": ".py",
651 |    "mimetype": "text/x-python",
652 |    "name": "python",
653 |    "nbconvert_exporter": "python",
654 |    "pygments_lexer": "ipython2",
655 |    "version": "2.7.11"
656 |   }
657 |  },
658 |  "nbformat": 4,
659 |  "nbformat_minor": 0
660 | }
661 | 


--------------------------------------------------------------------------------
/lab3/confusionmatrix.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | 
  3 | 
  4 | class ConfusionMatrix:
  5 |     """
  6 |        Simple confusion matrix class
  7 |        row is the true class, column is the predicted class
  8 |     """
  9 |     def __init__(self, num_classes, class_names=None):
 10 |         self.n_classes = num_classes
 11 |         if class_names is None:
 12 |             self.class_names = map(str, range(num_classes))
 13 |         else:
 14 |             self.class_names = class_names
 15 | 
 16 |         # find max class_name and pad
 17 |         max_len = max(map(len, self.class_names))
 18 |         self.max_len = max_len
 19 |         for idx, name in enumerate(self.class_names):
 20 |             if len(self.class_names) < max_len:
 21 |                 self.class_names[idx] = name + " "*(max_len-len(name))
 22 | 
 23 |         self.mat = np.zeros((num_classes,num_classes),dtype='int')
 24 | 
 25 |     def __str__(self):
 26 |         # calucate row and column sums
 27 |         col_sum = np.sum(self.mat, axis=1)
 28 |         row_sum = np.sum(self.mat, axis=0)
 29 | 
 30 |         s = []
 31 | 
 32 |         mat_str = self.mat.__str__()
 33 |         mat_str = mat_str.replace('[','').replace(']','').split('\n')
 34 | 
 35 |         for idx, row in enumerate(mat_str):
 36 |             if idx == 0:
 37 |                 pad = " "
 38 |             else:
 39 |                 pad = ""
 40 |             class_name = self.class_names[idx]
 41 |             class_name = " " + class_name + " |"
 42 |             row_str = class_name + pad + row
 43 |             row_str += " |" + str(col_sum[idx])
 44 |             s.append(row_str)
 45 | 
 46 |         row_sum = [(self.max_len+4)*" "+" ".join(map(str, row_sum))]
 47 |         hline = [(1+self.max_len)*" "+"-"*len(row_sum[0])]
 48 | 
 49 |         s = hline + s + hline + row_sum
 50 | 
 51 |         # add linebreaks
 52 |         s_out = [line+'\n' for line in s]
 53 |         return "".join(s_out)
 54 | 
 55 |     def batch_add(self, targets, preds):
 56 |         assert targets.shape == preds.shape
 57 |         assert len(targets) == len(preds)
 58 |         assert max(targets) < self.n_classes
 59 |         assert max(preds) < self.n_classes
 60 |         targets = targets.flatten()
 61 |         preds = preds.flatten()
 62 |         for i in range(len(targets)):
 63 |                 self.mat[targets[i], preds[i]] += 1
 64 | 
 65 |     def get_errors(self):
 66 |         tp = np.asarray(np.diag(self.mat).flatten(),dtype='float')
 67 |         fn = np.asarray(np.sum(self.mat, axis=1).flatten(),dtype='float') - tp
 68 |         fp = np.asarray(np.sum(self.mat, axis=0).flatten(),dtype='float') - tp
 69 |         tn = np.asarray(np.sum(self.mat)*np.ones(self.n_classes).flatten(),
 70 |                         dtype='float') - tp - fn - fp
 71 |         return tp, fn, fp, tn
 72 | 
 73 |     def accuracy(self):
 74 |         """
 75 |         Calculates global accuracy
 76 |         :return: accuracy
 77 |         :example: >>> conf = ConfusionMatrix(3)
 78 |                   >>> conf.batchAdd([0,0,1],[0,0,2])
 79 |                   >>> print conf.accuracy()
 80 |         """
 81 |         tp, _, _, _ = self.get_errors()
 82 |         n_samples = np.sum(self.mat)
 83 |         return np.sum(tp) / n_samples
 84 | 
 85 |     def sensitivity(self):
 86 |         tp, tn, fp, fn = self.get_errors()
 87 |         res = tp / (tp + fn)
 88 |         res = res[~np.isnan(res)]
 89 |         return res
 90 | 
 91 |     def specificity(self):
 92 |         tp, tn, fp, fn = self.get_errors()
 93 |         res = tn / (tn + fp)
 94 |         res = res[~np.isnan(res)]
 95 |         return res
 96 | 
 97 |     def positive_predictive_value(self):
 98 |         tp, tn, fp, fn = self.get_errors()
 99 |         res = tp / (tp + fp)
100 |         res = res[~np.isnan(res)]
101 |         return res
102 | 
103 |     def negative_predictive_value(self):
104 |         tp, tn, fp, fn = self.get_errors()
105 |         res = tn / (tn + fn)
106 |         res = res[~np.isnan(res)]
107 |         return res
108 | 
109 |     def false_positive_rate(self):
110 |         tp, tn, fp, fn = self.get_errors()
111 |         res = fp / (fp + tn)
112 |         res = res[~np.isnan(res)]
113 |         return res
114 | 
115 |     def false_discovery_rate(self):
116 |         tp, tn, fp, fn = self.get_errors()
117 |         res = fp / (tp + fp)
118 |         res = res[~np.isnan(res)]
119 |         return res
120 | 
121 |     def F1(self):
122 |         tp, tn, fp, fn = self.get_errors()
123 |         res = (2*tp) / (2*tp + fp + fn)
124 |         res = res[~np.isnan(res)]
125 |         return res
126 | 
127 |     def matthews_correlation(self):
128 |         tp, tn, fp, fn = self.get_errors()
129 |         numerator = tp*tn - fp*fn
130 |         denominator = np.sqrt((tp + fp)*(tp + fn)*(tn + fp)*(tn + fn))
131 |         res = numerator / denominator
132 |         res = res[~np.isnan(res)]
133 |         return res
134 | 


--------------------------------------------------------------------------------
/lab3/data_generator.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'casperkaae'
 2 | import numpy as np
 3 | 
 4 | target_to_text = {
 5 |     '0':'zero',
 6 |     '1':'one',
 7 |     '2':'two',
 8 |     '3':'three',
 9 |     '4':'four',
10 |     '5':'five',
11 |     '6':'six',
12 |     '7':'seven',
13 |     '8':'eight',
14 |     '9':'nine',
15 | }
16 | 
17 | stop_character = '#'
18 | 
19 | input_characters = " ".join(target_to_text.values())
20 | valid_characters = ['0', '1', '2', '3',  '4',  '5',  '6',  '7',  '8',  '9',  '#'] + \
21 |               list(set(input_characters))
22 | 
def print_valid_characters():
    """
    Print the number of valid characters, followed by one line listing every
    character together with its integer code as tab-separated 'char'=code
    entries.
    """
    listing = "".join("'%s'=%i,\t" % (char, code)
                      for code, char in enumerate(valid_characters))
    # A single pre-formatted argument in parentheses prints the same text
    # under both Python 2 and Python 3.
    print("Number of valid characters: %d" % len(valid_characters))
    print(listing)
29 | 
# Size of the character vocabulary (target symbols plus input characters).
ninput_chars = len(valid_characters)


def get_batch(batch_size=100, min_digits=3, max_digits=3):
    '''
    Generates random sequences of integers and translates them to text i.e. 1->'one'.
    :param batch_size: number of samples to return
    :param min_digits: minimum length of target
    :param max_digits: maximum length of target
    :return: tuple (inputs, input_masks, targets, target_masks,
             text_inputs, text_targets) where
             inputs / targets are int32 matrices of character codes,
             zero-padded to the longest input resp. to max_digits+1,
             input_masks / target_masks are float32 matrices of the same
             shapes with 1 at real positions and 0 at padding, and
             text_inputs / text_targets are lists of the corresponding strings
    '''
    # character -> code lookup; the first occurrence wins, which gives the
    # same code as valid_characters.index(char)
    char_to_code = {}
    for code, char in enumerate(valid_characters):
        char_to_code.setdefault(char, code)

    text_inputs, int_inputs = [], []
    text_targets, int_targets = [], []
    for _ in range(batch_size):
        # draw the target length, then that many random digits
        tar_len = np.random.randint(min_digits, max_digits + 1)
        digits = "".join(str(d) for d in np.random.randint(0, 10, tar_len))

        # the target is the digit string followed by the stop character,
        # the input is the digits spelled out as space-separated words
        text_target = digits + stop_character
        text_input = " ".join(target_to_text[d] for d in digits)

        text_inputs.append(text_input)
        int_inputs.append([char_to_code[c] for c in text_input])
        text_targets.append(text_target)
        int_targets.append([char_to_code[c] for c in text_target])

    # create the input matrix and mask - note that we zero pad the shorter
    # sequences (code 0 is also the code of the character '0', so the mask
    # is needed to tell padding from data)
    max_input_len = max([len(seq) for seq in int_inputs] or [0])
    inputs = np.zeros((batch_size, max_input_len), dtype='int32')
    input_masks = np.zeros((batch_size, max_input_len), dtype='float32')
    for row, seq in enumerate(int_inputs):
        inputs[row, :len(seq)] = seq
        input_masks[row, :len(seq)] = 1

    # +1 to allow space for the stop character
    targets = np.zeros((batch_size, max_digits + 1), dtype='int32')
    target_masks = np.zeros((batch_size, max_digits + 1), dtype='float32')
    for row, seq in enumerate(int_targets):
        targets[row, :len(seq)] = seq
        target_masks[row, :len(seq)] = 1

    return inputs, input_masks, targets, target_masks, \
        text_inputs, text_targets
83 | 
84 | 


--------------------------------------------------------------------------------
/lab3/decoder_attention.py:
--------------------------------------------------------------------------------
   1 | import numpy as np
   2 | import theano
   3 | import theano.tensor as T
   4 | from lasagne import nonlinearities
   5 | from lasagne import init
   6 | from lasagne.utils import unroll_scan
   7 | from lasagne.layers import MergeLayer
   8 | from lasagne.layers.base import Layer
   9 | from lasagne.layers import helper
  10 | 
  11 | import numpy as np
  12 | import theano
  13 | import theano.tensor as T
  14 | import lasagne.init as init
  15 | import lasagne.nonlinearities as nonlinearities
  16 | 
  17 | from lasagne.layers import Layer
  18 | import lasagne
  19 | 
  20 | 
  21 | # LSTMAttentionDecodeLayer
  22 | # Model: Encoder -> Decoder    Decoder-LSTM: ... hid_t-1 ->  hid_t  -> h_t+1 ....
  23 | #                         attention_network         |         |        |
  24 | #          weighted encoder hidden(output)      whid_t-1 -> whid_t -> wh_t+1
  25 | 
  26 | 
  27 | # LSTMAttentionDecodeFeedBackLayer
  28 | # Model: Encoder -> Decoder    Decoder-LSTM: ... hid_dec_t-1-> hid_dec_t-> hid_dec_t+1 ....
  29 | #                                                   |     /^  |    /^ |     /^
  30 | #                         attention_network         |   /     |  /    |   /
  31 | #                                                   | /       |/      | /
  32 | #          weighted encoder hidden(output)      whid_enc_t-1 -> whid_enc__t -> wh_t+1
  33 | #
  34 | # This model also allows for added "pre-steps" to the decoder where the model can
  35 | # "comprehend the input data". Basically, this just adds extra steps to the
  36 | # decoder before it produces the targets.
  37 | #
  38 | #
  39 | 
  40 | 
  41 | class LSTMAttentionDecodeLayer(MergeLayer):
  42 |     r"""A long short-term memory (LSTM) layer.
  43 | 
  44 |     Includes optional "peephole connections" and a forget gate.  Based on the
  45 |     definition in [1]_, which is the current common definition.  The output is
  46 |     computed by
  47 | 
  48 |     .. math ::
  49 | 
  50 |         i_t &= \sigma_i(W_{xi}x_t + W_{hi}h_{t-1}
  51 |                + w_{ci}\odot c_{t-1} + b_i)\\
  52 |         f_t &= \sigma_f(W_{xf}x_t + W_{hf}h_{t-1}
  53 |                + w_{cf}\odot c_{t-1} + b_f)\\
  54 |         c_t &= f_t \odot c_{t - 1}
  55 |                + i_t\sigma_c(W_{xc}x_t + W_{hc} h_{t-1} + b_c)\\
  56 |         o_t &= \sigma_o(W_{xo}x_t + W_{ho}h_{t-1} + w_{co}\odot c_t + b_o)\\
  57 |         h_t &= o_t \odot \sigma_h(c_t)
  58 | 
  59 |     Parameters
  60 |     ----------
  61 |     incoming : a :class:`lasagne.layers.Layer` instance or a tuple
  62 |         The layer feeding into this layer, or the expected input shape.
  63 |     num_units : int
  64 |         Number of hidden/cell units in the layer.
  65 |     W_in_to_ingate : Theano shared variable, numpy array or callable
  66 |         Initializer for input-to-input gate weight matrix (:math:`W_{xi}`).
  67 |     W_hid_to_ingate : Theano shared variable, numpy array or callable
  68 |         Initializer for hidden-to-input gate weight matrix (:math:`W_{hi}`).
  69 |     W_cell_to_ingate : Theano shared variable, numpy array or callable
  70 |         Initializer for cell-to-input gate weight vector (:math:`w_{ci}`).
  71 |     b_ingate : Theano shared variable, numpy array or callable
  72 |         Initializer for input gate bias vector (:math:`b_i`).
  73 |     nonlinearity_ingate : callable or None
  74 |         The nonlinearity that is applied to the input gate activation
  75 |         (:math:`\sigma_i`). If None is provided, no nonlinearity will be
  76 |         applied.
  77 |     W_in_to_forgetgate : Theano shared variable, numpy array or callable
  78 |         Initializer for input-to-forget gate weight matrix (:math:`W_{xf}`).
  79 |     W_hid_to_forgetgate : Theano shared variable, numpy array or callable
  80 |         Initializer for hidden-to-forget gate weight matrix (:math:`W_{hf}`).
  81 |     W_cell_to_forgetgate : Theano shared variable, numpy array or callable
  82 |         Initializer for cell-to-forget gate weight vector (:math:`w_{cf}`).
  83 |     b_forgetgate : Theano shared variable, numpy array or callable
  84 |         Initializer for forget gate bias vector (:math:`b_f`).
  85 |     nonlinearity_forgetgate : callable or None
  86 |         The nonlinearity that is applied to the forget gate activation
  87 |         (:math:`\sigma_f`). If None is provided, no nonlinearity will be
  88 |         applied.
  89 |     W_in_to_cell : Theano shared variable, numpy array or callable
  90 |         Initializer for input-to-cell weight matrix (:math:`W_{xc}`).
  91 |     W_hid_to_cell : Theano shared variable, numpy array or callable
  92 |         Initializer for hidden-to-cell weight matrix (:math:`W_{hc}`).
  93 |     b_cell : Theano shared variable, numpy array or callable
  94 |         Initializer for cell bias vector (:math:`b_c`).
  95 |     nonlinearity_cell : callable or None
  96 |         The nonlinearity that is applied to the cell activation
  97 |         (:math:`\sigma_c`). If None is provided, no nonlinearity will be
  98 |         applied.
  99 |     W_in_to_outgate : Theano shared variable, numpy array or callable
 100 |         Initializer for input-to-output gate weight matrix (:math:`W_{xo}`).
 101 |     W_hid_to_outgate : Theano shared variable, numpy array or callable
 102 |         Initializer for hidden-to-output gate weight matrix (:math:`W_{ho}`).
 103 |     W_cell_to_outgate : Theano shared variable, numpy array or callable
 104 |         Initializer for cell-to-output gate weight vector (:math:`w_{co}`).
 105 |     b_outgate : Theano shared variable, numpy array or callable
 106 |         Initializer for output gate bias vector (:math:`b_o`).
 107 |     nonlinearity_outgate : callable or None
 108 |         The nonlinearity that is applied to the output gate activation
 109 |         (:math:`\sigma_o`). If None is provided, no nonlinearity will be
 110 |         applied.
 111 |     nonlinearity_out : callable or None
 112 |         The nonlinearity that is applied to the output (:math:`\sigma_h`). If
 113 |         None is provided, no nonlinearity will be applied.
 114 |     cell_init : callable, np.ndarray, theano.shared or TensorVariable
 115 |         Passing in a TensorVariable allows the user to specify
 116 |         the value of `cell_init` (:math:`c_0`). In this mode `learn_init` is
 117 |         ignored for the cell state.
 118 |     hid_init : callable, np.ndarray, theano.shared or TensorVariable
 119 |         Passing in a TensorVariable allows the user to specify
 120 |         the value of `hid_init` (:math:`h_0`). In this mode `learn_init` is
 121 |         ignored for the hidden state.
 122 |     backwards : bool
 123 |         If True, process the sequence backwards and then reverse the
 124 |         output again such that the output from the layer is always
 125 |         from :math:`x_1` to :math:`x_n`.
 126 |     learn_init : bool
 127 |         If True, initial hidden values are learned. If `hid_init` or
 128 |         `cell_init` are TensorVariables then the TensorVariable is used and
 129 |         `learn_init` is ignored for that initial state.
 130 |     peepholes : bool
 131 |         If True, the LSTM uses peephole connections.
 132 |         When False, `W_cell_to_ingate`, `W_cell_to_forgetgate` and
 133 |         `W_cell_to_outgate` are ignored.
 134 |     gradient_steps : int
 135 |         Number of timesteps to include in the backpropagated gradient.
 136 |         If -1, backpropagate through the entire sequence.
 137 |     grad_clipping: False or float
 138 |         If a float is provided, the gradient messages are clipped during the
 139 |         backward pass.  If False, the gradients will not be clipped.  See [1]_
 140 |         (p. 6) for further explanation.
 141 |     unroll_scan : bool
 142 |         If True the recursion is unrolled instead of using scan. For some
 143 |         graphs this gives a significant speed up but it might also consume
 144 |         more memory. When `unroll_scan` is true then the `gradient_steps`
 145 |         setting is ignored.
 146 |     precompute_input : bool
 147 |         If True, precompute input_to_hid before iterating through
 148 |         the sequence. This can result in a speedup at the expense of
 149 |         an increase in memory usage.
 150 | 
 151 |     References
 152 |     ----------
 153 |     .. [1] Graves, Alex: "Generating sequences with recurrent neural networks."
 154 |            arXiv preprint arXiv:1308.0850 (2013).
 155 |     """
 156 |     def __init__(self, incoming,
 157 |                  num_units,
 158 |                  aln_num_units,
 159 |                  n_decodesteps,
 160 |                  W_align=init.Normal(0.1),
 161 |                  U_align=init.Normal(0.1),
 162 |                  v_align=init.Normal(0.1),
 163 |                  nonlinearity_align=nonlinearities.tanh,
 164 |                  W_hid_to_ingate=init.Normal(0.1),
 165 |                  W_cell_to_ingate=init.Normal(0.1),
 166 |                  b_ingate=init.Constant(0.),
 167 |                  nonlinearity_ingate=nonlinearities.sigmoid,
 168 |                  #W_in_to_forgetgate=init.Normal(0.1),
 169 |                  W_hid_to_forgetgate=init.Normal(0.1),
 170 |                  W_cell_to_forgetgate=init.Normal(0.1),
 171 |                  b_forgetgate=init.Constant(0.),
 172 |                  nonlinearity_forgetgate=nonlinearities.sigmoid,
 173 |                  #W_in_to_cell=init.Normal(0.1),
 174 |                  W_hid_to_cell=init.Normal(0.1),
 175 |                  b_cell=init.Constant(0.),
 176 |                  nonlinearity_cell=nonlinearities.tanh,
 177 |                  #W_in_to_outgate=init.Normal(0.1),
 178 |                  W_hid_to_outgate=init.Normal(0.1),
 179 |                  W_cell_to_outgate=init.Normal(0.1),
 180 |                  b_outgate=init.Constant(0.),
 181 |                  nonlinearity_outgate=nonlinearities.sigmoid,
 182 |                  nonlinearity_out=nonlinearities.tanh,
 183 |                  cell_init=init.Constant(0.),
 184 |                  hid_init=init.Constant(0.),
 185 |                  backwards=False,
 186 |                  learn_init=False,
 187 |                  peepholes=True,
 188 |                  gradient_steps=-1,
 189 |                  grad_clipping=False,
 190 |                  unroll_scan=False,
 191 |                  mask_input=None,
 192 |                  #precompute_input=True,
 193 |                  **kwargs):
 194 | 
 195 |         # Initialize parent layer
 196 |         # This layer inherits from a MergeLayer, because it can have two
 197 |         # inputs - the layer input, and the mask.  We will just provide the
 198 |         # layer input as incomings, unless a mask input was provided.
 199 |         incomings = [incoming]
 200 |         if mask_input is not None:
 201 |             incomings.append(mask_input)
 202 |         super(LSTMAttentionDecodeLayer, self).__init__(incomings, **kwargs)
 203 | 
 204 |         # For any of the nonlinearities, if None is supplied, use identity
 205 |         if nonlinearity_ingate is None:
 206 |             self.nonlinearity_ingate = nonlinearities.identity
 207 |         else:
 208 |             self.nonlinearity_ingate = nonlinearity_ingate
 209 | 
 210 |         if nonlinearity_forgetgate is None:
 211 |             self.nonlinearity_forgetgate = nonlinearities.identity
 212 |         else:
 213 |             self.nonlinearity_forgetgate = nonlinearity_forgetgate
 214 | 
 215 |         if nonlinearity_cell is None:
 216 |             self.nonlinearity_cell = nonlinearities.identity
 217 |         else:
 218 |             self.nonlinearity_cell = nonlinearity_cell
 219 | 
 220 |         if nonlinearity_outgate is None:
 221 |             self.nonlinearity_outgate = nonlinearities.identity
 222 |         else:
 223 |             self.nonlinearity_outgate = nonlinearity_outgate
 224 | 
 225 |         if nonlinearity_out is None:
 226 |             self.nonlinearity_out = nonlinearities.identity
 227 |         else:
 228 |             self.nonlinearity_out = nonlinearity_out
 229 | 
 230 |         self.learn_init = learn_init
 231 |         self.num_units = num_units
 232 |         self.backwards = backwards
 233 |         self.peepholes = peepholes
 234 |         self.gradient_steps = gradient_steps
 235 |         self.grad_clipping = grad_clipping
 236 |         self.unroll_scan = unroll_scan
 237 |         self.n_decodesteps = n_decodesteps
 238 |         self.aln_num_units = aln_num_units
 239 |         self.nonlinearity_align = nonlinearity_align
 240 | 
 241 |         # Retrieve the dimensionality of the incoming layer
 242 |         input_shape = self.input_shapes[0]
 243 |         if unroll_scan and input_shape[1] is None:
 244 |             raise ValueError("Input sequence length cannot be specified as "
 245 |                              "None when unroll_scan is True")
 246 | 
 247 |         num_inputs = np.prod(self.input_shape[2:])
 248 | 
 249 |         # Initialize parameters using the supplied args
 250 |         #self.W_in_to_ingate = self.add_param(
 251 |         #    W_in_to_ingate, (num_inputs, num_units), name="W_in_to_ingate")
 252 | 
 253 |         self.W_hid_to_ingate = self.add_param(
 254 |             W_hid_to_ingate, (num_units, num_units), name="W_hid_to_ingate")
 255 | 
 256 |         self.b_ingate = self.add_param(
 257 |             b_ingate, (num_units,), name="b_ingate", regularizable=False)
 258 | 
 259 |         #self.W_in_to_forgetgate = self.add_param(
 260 |         #    W_in_to_forgetgate, (num_inputs, num_units),
 261 |         #    name="W_in_to_forgetgate")
 262 | 
 263 |         self.W_hid_to_forgetgate = self.add_param(
 264 |             W_hid_to_forgetgate, (num_units, num_units),
 265 |             name="W_hid_to_forgetgate")
 266 | 
 267 |         self.b_forgetgate = self.add_param(
 268 |             b_forgetgate, (num_units,), name="b_forgetgate",
 269 |             regularizable=False)
 270 | 
 271 |         #self.W_in_to_cell = self.add_param(
 272 |         #    W_in_to_cell, (num_inputs, num_units), name="W_in_to_cell")
 273 | 
 274 |         self.W_hid_to_cell = self.add_param(
 275 |             W_hid_to_cell, (num_units, num_units), name="W_hid_to_cell")
 276 | 
 277 |         self.b_cell = self.add_param(
 278 |             b_cell, (num_units,), name="b_cell", regularizable=False)
 279 | 
 280 |         #self.W_in_to_outgate = self.add_param(
 281 |         #    W_in_to_outgate, (num_inputs, num_units), name="W_in_to_outgate")
 282 | 
 283 |         self.W_hid_to_outgate = self.add_param(
 284 |             W_hid_to_outgate, (num_units, num_units), name="W_hid_to_outgate")
 285 | 
 286 |         self.b_outgate = self.add_param(
 287 |             b_outgate, (num_units,), name="b_outgate", regularizable=False)
 288 | 
 289 |         # Stack input weight matrices into a (num_inputs, 4*num_units)
 290 |         # matrix, which speeds up computation
 291 |         #self.W_in_stacked = T.concatenate(
 292 |         #    [self.W_in_to_ingate, self.W_in_to_forgetgate,
 293 |         #     self.W_in_to_cell, self.W_in_to_outgate], axis=1)
 294 | 
 295 |         # Same for hidden weight matrices
 296 |         self.W_hid_stacked = T.concatenate(
 297 |             [self.W_hid_to_ingate, self.W_hid_to_forgetgate,
 298 |              self.W_hid_to_cell, self.W_hid_to_outgate], axis=1)
 299 | 
 300 |         # Stack biases into a (4*num_units) vector
 301 |         self.b_stacked = T.concatenate(
 302 |             [self.b_ingate, self.b_forgetgate,
 303 |              self.b_cell, self.b_outgate], axis=0)
 304 | 
 305 |         # If peephole (cell to gate) connections were enabled, initialize
 306 |         # peephole connections.  These are elementwise products with the cell
 307 |         # state, so they are represented as vectors.
 308 |         if self.peepholes:
 309 |             self.W_cell_to_ingate = self.add_param(
 310 |                 W_cell_to_ingate, (num_units, ), name="W_cell_to_ingate")
 311 | 
 312 |             self.W_cell_to_forgetgate = self.add_param(
 313 |                 W_cell_to_forgetgate, (num_units, ),
 314 |                 name="W_cell_to_forgetgate")
 315 | 
 316 |             self.W_cell_to_outgate = self.add_param(
 317 |                 W_cell_to_outgate, (num_units, ), name="W_cell_to_outgate")
 318 | 
 319 |         self.W_align = self.add_param(W_align, (num_units, self.aln_num_units),
 320 |                                    name="AlignSeqOutputLayer: (aln) W_a")
 321 |         self.U_align = self.add_param(U_align, (num_inputs, self.aln_num_units),
 322 |                            name="AlignSeqOutputLayer: (aln) U_a")
 323 |         self.v_align = self.add_param(v_align, (self.aln_num_units, 1),
 324 |                                  name="AlignSeqOutputLayer: v_a")
 325 | 
 326 | 
 327 |         # Setup initial values for the cell and the hidden units
 328 |         if isinstance(cell_init, T.TensorVariable):
 329 |             if cell_init.ndim != 2:
 330 |                 raise ValueError(
 331 |                     "When cell_init is provided as a TensorVariable, it should"
 332 |                     " have 2 dimensions and have shape (num_batch, num_units)")
 333 |             self.cell_init = cell_init
 334 |         else:
 335 |             self.cell_init = self.add_param(
 336 |                 cell_init, (1, num_units), name="cell_init",
 337 |                 trainable=learn_init, regularizable=False)
 338 | 
 339 |         if isinstance(hid_init, T.TensorVariable):
 340 |             if hid_init.ndim != 2:
 341 |                 raise ValueError(
 342 |                     "When hid_init is provided as a TensorVariable, it should "
 343 |                     "have 2 dimensions and have shape (num_batch, num_units)")
 344 |             self.hid_init = hid_init
 345 |         else:
 346 |             self.hid_init = self.add_param(
 347 |                 hid_init, (1, self.num_units), name="hid_init",
 348 |                 trainable=learn_init, regularizable=False)
 349 | 
 350 |     def get_output_shape_for(self, input_shapes):
 351 |         input_shape = input_shapes[0]
 352 |         return input_shape[0], None, self.num_units
 353 | 
 354 |     def get_output_for(self, inputs, **kwargs):
 355 |         """
 356 |         Compute this layer's output function given a symbolic input variable
 357 | 
 358 |         Parameters
 359 |         ----------
 360 |         input : theano.TensorType
 361 |             Symbolic input variable.
 362 |         mask : theano.TensorType
 363 |             Theano variable denoting whether each time step in each
 364 |             sequence in the batch is part of the sequence or not.  If ``None``,
 365 |             then it is assumed that all sequences are of the same length.  If
 366 |             not all sequences are of the same length, then it must be
 367 |             supplied as a matrix of shape ``(n_batch, n_time_steps)`` where
 368 |             ``mask[i, j] = 1`` when ``j <= (length of sequence i)`` and
 369 |             ``mask[i, j] = 0`` when ``j > (length of sequence i)``.
 370 | 
 371 |         Returns
 372 |         -------
 373 |         layer_output : theano.TensorType
 374 |             Symblic output variable.
 375 |         """
 376 |         input = inputs[0]
 377 |         # Retrieve the mask when it is supplied
 378 |         mask = inputs[1] if len(inputs) > 1 else None
 379 | 
 380 |         # Treat all dimensions after the second as flattened feature dimensions
 381 |         # Retrieve the layer input
 382 |         if input.ndim > 3:
 383 |             input = input.reshape((input.shape[0], input.shape[1],
 384 |                                    T.prod(input.shape[2:])))
 385 |         num_batch = input.shape[0]
 386 |         encode_seqlen = input.shape[1]
 387 | 
 388 |         # At each call to scan, input_n will be (n_time_steps, 4*num_units).
 389 |         # We define a slicing function that extract the input to each LSTM gate
 390 |         def slice_w(x, n):
 391 |             return x[:, n*self.num_units:(n+1)*self.num_units]
 392 | 
 393 |         # Create single recurrent computation step function
 394 |         # input_n is the n'th vector of the input
 395 |         def step(cell_previous, hid_previous, a_prev,
 396 |                  hUa, W_align, v_align,
 397 |                  W_hid_stacked, W_cell_to_ingate, W_cell_to_forgetgate,
 398 |                  W_cell_to_outgate, b_stacked):
 399 | 
 400 |             # Calculate gates pre-activations and slice
 401 |             gates = T.dot(hid_previous, W_hid_stacked) + b_stacked
 402 | 
 403 |             # Clip gradients
 404 |             if self.grad_clipping is not False:
 405 |                 gates = theano.gradient.grad_clip(
 406 |                     gates, -self.grad_clipping, self.grad_clipping)
 407 | 
 408 |             # Extract the pre-activation gate values
 409 |             ingate = slice_w(gates, 0)
 410 |             forgetgate = slice_w(gates, 1)
 411 |             cell_input = slice_w(gates, 2)
 412 |             outgate = slice_w(gates, 3)
 413 | 
 414 |             if self.peepholes:
 415 |                 # Compute peephole connections
 416 |                 ingate += cell_previous*W_cell_to_ingate
 417 |                 forgetgate += cell_previous*W_cell_to_forgetgate
 418 | 
 419 |             # Apply nonlinearities
 420 |             ingate = self.nonlinearity_ingate(ingate)
 421 |             forgetgate = self.nonlinearity_forgetgate(forgetgate)
 422 |             cell_input = self.nonlinearity_cell(cell_input)
 423 |             outgate = self.nonlinearity_outgate(outgate)
 424 | 
 425 |             # Compute new cell value
 426 |             cell = forgetgate*cell_previous + ingate*cell_input
 427 | 
 428 |             if self.peepholes:
 429 |                 outgate += cell*W_cell_to_outgate
 430 | 
 431 |             # W_align:  (num_units, aln_num_units)
 432 |             # U_align:  (num_feats, aln_num_units)
 433 |             # v_align:  (aln_num_units, 1)
 434 |             # hUa:      (BS, Seqlen, aln_num_units)
 435 |             # hid:      (BS, num_units_dec)
 436 |             # input:    (BS, Seqlen, num_inputs)
 437 | 
 438 |             # Compute new hidden unit activation
 439 |             hid = outgate*self.nonlinearity_out(cell)
 440 | 
 441 |             #compute (unormalized) attetion vector
 442 |             sWa = T.dot(hid, W_align)       # (BS, aln_num_units)
 443 |             sWa = sWa.dimshuffle(0, 'x', 1)   # (BS, 1, aln_num_units)
 444 |             tanh_sWahUa = self.nonlinearity_align(sWa + hUa)
 445 |                                             # (BS, seqlen, num_units_aln)
 446 | 
 447 |             # CALCULATE WEIGHT FOR EACH HIDDEN STATE VECTOR
 448 |             a = T.dot(tanh_sWahUa, v_align)  # (BS, Seqlen, 1)
 449 |             a = T.reshape(a, (a.shape[0], a.shape[1]))
 450 |             #                                # (BS, Seqlen)
 451 |             # # ->(BS, seq_len)
 452 |             #a = a.squeeze()
 453 |             #a = a*a
 454 |             #a = a*mask - (1-mask)*10000 #this line does not work
 455 |             #a = T.reshape(a, (input.shape[0], input.shape[1]))
 456 | 
 457 |             #alpha = T.nnet.softmax(a)
 458 |             #alpha = T.reshape(alpha, (input.shape[0], input.shape[1]))
 459 | 
 460 |             #
 461 |             # # create alpha in dim (batch_size, seq_len, 1)
 462 | 
 463 |             #
 464 |             #weighted_hidden = input * alpha.dimshuffle(0, 1, 'x')
 465 |             #weighted_hidden = T.sum(weighted_hidden, axis=1)  #sum seqlen out
 466 | 
 467 |             return [cell, hid, a]
 468 | 
 469 |         sequences = []
 470 |         step_fun = step
 471 | 
 472 |         ones = T.ones((num_batch, 1))
 473 |         if isinstance(self.cell_init, T.TensorVariable):
 474 |             cell_init = self.cell_init
 475 |         else:
 476 |             # Dot against a 1s vector to repeat to shape (num_batch, num_units)
 477 |             cell_init = T.dot(ones, self.cell_init)
 478 | 
 479 |         if isinstance(self.hid_init, T.TensorVariable):
 480 |             hid_init = self.hid_init
 481 |         else:
 482 |             # Dot against a 1s vector to repeat to shape (num_batch, num_units)
 483 |             hid_init = T.dot(ones, self.hid_init)
 484 | 
 485 |         #weighted_hidden_init = T.zeros((num_batch, input.shape[2]))
 486 |         alpha_init = T.zeros((num_batch, encode_seqlen))
 487 | 
 488 |         # The hidden-to-hidden weight matrix is always used in step
 489 | 
 490 |         hUa = T.dot(input, self.U_align)   # (num_batch, seq_len, num_units_aln)
 491 | 
 492 |         non_seqs = [hUa, self.W_align, self.v_align,
 493 |                     self.W_hid_stacked]
 494 |         # The "peephole" weight matrices are only used when self.peepholes=True
 495 |         if self.peepholes:
 496 |             non_seqs += [self.W_cell_to_ingate,
 497 |                          self.W_cell_to_forgetgate,
 498 |                          self.W_cell_to_outgate]
 499 |         # theano.scan only allows for positional arguments, so when
 500 |         # self.peepholes is False, we need to supply fake placeholder arguments
 501 |         # for the three peephole matrices.
 502 |         else:
 503 |             non_seqs += [(), (), ()]
 504 |         # When we aren't precomputing the input outside of scan, we need to
 505 |         # provide the input weights and biases to the step function
 506 |         non_seqs += [self.b_stacked]
 507 | 
 508 |         if self.unroll_scan:
 509 |             # Explicitly unroll the recurrence instead of using scan
 510 |             cell_out, hid_out, a_out = unroll_scan(
 511 |                 fn=step_fun,
 512 |                 sequences=sequences,
 513 |                 outputs_info=[cell_init, hid_init, alpha_init],
 514 |                 go_backwards=self.backwards,
 515 |                 non_sequences=non_seqs,
 516 |                 n_steps=self.n_decodesteps)
 517 |         else:
 518 |             # Scan op iterates over first dimension of input and repeatedly
 519 |             # applies the step function
 520 |             cell_out, hid_out, a_out = theano.scan(
 521 |                 fn=step_fun,
 522 |                 sequences=sequences,
 523 |                 outputs_info=[cell_init, hid_init, alpha_init],
 524 |                 go_backwards=self.backwards,
 525 |                 truncate_gradient=self.gradient_steps,
 526 |                 non_sequences=non_seqs,
 527 |                 n_steps=self.n_decodesteps,
 528 |                 strict=True)[0]
 529 | 
 530 |         # dimshuffle back to (n_batch, n_time_steps, n_features))
 531 | 
 532 |         #a_out - (n_decodesteps, bs, seqlen)
 533 |         #hid_out -   (n_decode_steps, bs, num_units)
 534 | 
 535 | 
 536 |         # mask:  (BS, encode_seqlen
 537 |         # a_out; (n_decodesteps, BS, encode_seqlen)
 538 |         cell_out = cell_out.dimshuffle(1, 0, 2)
 539 |         mask = mask.dimshuffle(0, 'x', 1)
 540 |         a_out = a_out.dimshuffle(1, 0, 2)  # (BS, n_decodesteps, encode_seqlen)
 541 | 
 542 |         # set masked positions to large negative value
 543 |         a_out = a_out*mask - (1-mask)*10000
 544 | 
 545 |         # normalize over encode_seqlen (->large negative values = 0)
 546 |         a_out = T.reshape(a_out, (num_batch*self.n_decodesteps, encode_seqlen))
 547 |         alpha = T.nnet.softmax(a_out)
 548 |         alpha = T.reshape(alpha, (num_batch, self.n_decodesteps, encode_seqlen))
 549 | 
 550 |         # (BS, encode_seqlen, num_units) -> (BS, num_units, 1 encode_seqlen,)
 551 |         input = input.dimshuffle(0, 2, 'x',  1)
 552 |         # (BS, n_decodesteps, encode_seqlen) -> (BS, '1', n_decodesteps, encode_seqlen)
 553 |         alpha = alpha.dimshuffle(0, 'x', 1, 2)
 554 |         weighted_hidden_out = input*alpha
 555 | 
 556 |         weighted_hidden_out = T.sum(weighted_hidden_out, axis=3)
 557 |         # (BS, n_decodesteps, num_encode_units)
 558 | 
 559 |         # if scan is backward reverse the output
 560 |         if self.backwards:
 561 |             hid_out = hid_out[:, ::-1]
 562 |             cell_out = cell_out[:, ::-1]
 563 |             weighted_hidden_out = weighted_hidden_out[:, ::-1]
 564 |             alpha = alpha[:, ::-1]
 565 | 
 566 |         self.hid_out = hid_out
 567 |         self.cell_out = cell_out
 568 |         self.weighted_hidden_out = weighted_hidden_out
 569 |         self.alpha = alpha
 570 | 
 571 |         return self.weighted_hidden_out
 572 | 
 573 | 
 574 | class LSTMAttentionDecodeFeedbackLayer(MergeLayer):
 575 |     r"""A long short-term memory (LSTM) layer.
 576 | 
 577 |     Includes optional "peephole connections" and a forget gate.  Based on the
 578 |     definition in [1]_, which is the current common definition.  The output is
 579 |     computed by
 580 | 
 581 |     .. math ::
 582 | 
 583 |         i_t &= \sigma_i(W_{xi}x_t + W_{hi}h_{t-1}
 584 |                + w_{ci}\odot c_{t-1} + b_i)\\
 585 |         f_t &= \sigma_f(W_{xf}x_t + W_{hf}h_{t-1}
 586 |                + w_{cf}\odot c_{t-1} + b_f)\\
 587 |         c_t &= f_t \odot c_{t - 1}
 588 |                + i_t\sigma_c(W_{xc}x_t + W_{hc} h_{t-1} + b_c)\\
 589 |         o_t &= \sigma_o(W_{xo}x_t + W_{ho}h_{t-1} + w_{co}\odot c_t + b_o)\\
 590 |         h_t &= o_t \odot \sigma_h(c_t)
 591 | 
 592 |     Parameters
 593 |     ----------
 594 |     incoming : a :class:`lasagne.layers.Layer` instance or a tuple
 595 |         The layer feeding into this layer, or the expected input shape.
 596 |     num_units : int
 597 |         Number of hidden/cell units in the layer.
 598 |     W_in_to_ingate : Theano shared variable, numpy array or callable
 599 |         Initializer for input-to-input gate weight matrix (:math:`W_{xi}`).
 600 |     W_hid_to_ingate : Theano shared variable, numpy array or callable
 601 |         Initializer for hidden-to-input gate weight matrix (:math:`W_{hi}`).
 602 |     W_cell_to_ingate : Theano shared variable, numpy array or callable
 603 |         Initializer for cell-to-input gate weight vector (:math:`w_{ci}`).
 604 |     b_ingate : Theano shared variable, numpy array or callable
 605 |         Initializer for input gate bias vector (:math:`b_i`).
 606 |     nonlinearity_ingate : callable or None
 607 |         The nonlinearity that is applied to the input gate activation
 608 |         (:math:`\sigma_i`). If None is provided, no nonlinearity will be
 609 |         applied.
 610 |     W_in_to_forgetgate : Theano shared variable, numpy array or callable
 611 |         Initializer for input-to-forget gate weight matrix (:math:`W_{xf}`).
 612 |     W_hid_to_forgetgate : Theano shared variable, numpy array or callable
 613 |         Initializer for hidden-to-forget gate weight matrix (:math:`W_{hf}`).
 614 |     W_cell_to_forgetgate : Theano shared variable, numpy array or callable
 615 |         Initializer for cell-to-forget gate weight vector (:math:`w_{cf}`).
 616 |     b_forgetgate : Theano shared variable, numpy array or callable
 617 |         Initializer for forget gate bias vector (:math:`b_f`).
 618 |     nonlinearity_forgetgate : callable or None
 619 |         The nonlinearity that is applied to the forget gate activation
 620 |         (:math:`\sigma_f`). If None is provided, no nonlinearity will be
 621 |         applied.
 622 |     W_in_to_cell : Theano shared variable, numpy array or callable
 623 |         Initializer for input-to-cell weight matrix (:math:`W_{xc}`).
 624 |     W_hid_to_cell : Theano shared variable, numpy array or callable
 625 |         Initializer for hidden-to-cell weight matrix (:math:`W_{hc}`).
 626 |     b_cell : Theano shared variable, numpy array or callable
 627 |         Initializer for cell bias vector (:math:`b_c`).
 628 |     nonlinearity_cell : callable or None
 629 |         The nonlinearity that is applied to the cell activation
 630 |         (:math:`\sigma_c`). If None is provided, no nonlinearity will be
 631 |         applied.
 632 |     W_in_to_outgate : Theano shared variable, numpy array or callable
 633 |         Initializer for input-to-output gate weight matrix (:math:`W_{xo}`).
 634 |     W_hid_to_outgate : Theano shared variable, numpy array or callable
 635 |         Initializer for hidden-to-output gate weight matrix (:math:`W_{ho}`).
 636 |     W_cell_to_outgate : Theano shared variable, numpy array or callable
 637 |         Initializer for cell-to-output gate weight vector (:math:`w_{co}`).
 638 |     b_outgate : Theano shared variable, numpy array or callable
 639 |         Initializer for output gate bias vector (:math:`b_o`).
 640 |     nonlinearity_outgate : callable or None
 641 |         The nonlinearity that is applied to the output gate activation
 642 |         (:math:`\sigma_o`). If None is provided, no nonlinearity will be
 643 |         applied.
 644 |     nonlinearity_out : callable or None
 645 |         The nonlinearity that is applied to the output (:math:`\sigma_h`). If
 646 |         None is provided, no nonlinearity will be applied.
 647 |     cell_init : callable, np.ndarray, theano.shared or TensorVariable
 648 |         Passing in a TensorVariable allows the user to specify
 649 |         the value of `cell_init` (:math:`c_0`). In this mode `learn_init` is
 650 |         ignored for the cell state.
 651 |     hid_init : callable, np.ndarray, theano.shared or TensorVariable
 652 |         Passing in a TensorVariable allows the user to specify
 653 |         the value of `hid_init` (:math:`h_0`). In this mode `learn_init` is
 654 |         ignored for the hidden state.
 655 |     backwards : bool
 656 |         If True, process the sequence backwards and then reverse the
 657 |         output again such that the output from the layer is always
 658 |         from :math:`x_1` to :math:`x_n`.
 659 |     learn_init : bool
 660 |         If True, initial hidden values are learned. If `hid_init` or
 661 |         `cell_init` are TensorVariables then the TensorVariable is used and
 662 |         `learn_init` is ignored for that initial state.
 663 |     peepholes : bool
 664 |         If True, the LSTM uses peephole connections.
 665 |         When False, `W_cell_to_ingate`, `W_cell_to_forgetgate` and
 666 |         `W_cell_to_outgate` are ignored.
 667 |     gradient_steps : int
 668 |         Number of timesteps to include in the backpropagated gradient.
 669 |         If -1, backpropagate through the entire sequence.
 670 |     grad_clipping: False or float
 671 |         If a float is provided, the gradient messages are clipped during the
 672 |         backward pass.  If False, the gradients will not be clipped.  See [1]_
 673 |         (p. 6) for further explanation.
 674 |     unroll_scan : bool
 675 |         If True the recursion is unrolled instead of using scan. For some
 676 |         graphs this gives a significant speed up but it might also consume
 677 |         more memory. When `unroll_scan` is true then the `gradient_steps`
 678 |         setting is ignored.
 679 |     precompute_input : bool
 680 |         If True, precompute input_to_hid before iterating through
 681 |         the sequence. This can result in a speedup at the expense of
 682 |         an increase in memory usage.
 683 | 
 684 |     References
 685 |     ----------
 686 |     .. [1] Graves, Alex: "Generating sequences with recurrent neural networks."
 687 |            arXiv preprint arXiv:1308.0850 (2013).
 688 |     """
 689 |     def __init__(self, incoming,
 690 |                  num_units,
 691 |                  aln_num_units,
 692 |                  n_decodesteps,
 693 |                  W_align=init.Normal(0.1),
 694 |                  U_align=init.Normal(0.1),
 695 |                  v_align=init.Normal(0.1),
 696 |                  U_conv_align=init.Normal(0.1),
 697 |                  nonlinearity_align=nonlinearities.tanh,
 698 |                  W_hid_to_ingate=init.Normal(0.1),
 699 |                  W_cell_to_ingate=init.Normal(0.1),
 700 |                  b_ingate=init.Constant(0.),
 701 |                  nonlinearity_ingate=nonlinearities.sigmoid,
 702 |                  #W_in_to_forgetgate=init.Normal(0.1),
 703 |                  W_hid_to_forgetgate=init.Normal(0.1),
 704 |                  W_cell_to_forgetgate=init.Normal(0.1),
 705 |                  b_forgetgate=init.Constant(0.),
 706 |                  nonlinearity_forgetgate=nonlinearities.sigmoid,
 707 |                  #W_in_to_cell=init.Normal(0.1),
 708 |                  W_hid_to_cell=init.Normal(0.1),
 709 |                  b_cell=init.Constant(0.),
 710 |                  nonlinearity_cell=nonlinearities.tanh,
 711 |                  #W_in_to_outgate=init.Normal(0.1),
 712 |                  W_hid_to_outgate=init.Normal(0.1),
 713 |                  W_cell_to_outgate=init.Normal(0.1),
 714 |                  b_outgate=init.Constant(0.),
 715 |                  nonlinearity_outgate=nonlinearities.sigmoid,
 716 |                  nonlinearity_out=nonlinearities.tanh,
 717 |                  cell_init=init.Constant(0.),
 718 |                  hid_init=init.Constant(0.),
 719 |                  backwards=False,
 720 |                  learn_init=False,
 721 |                  peepholes=True,
 722 |                  gradient_steps=-1,
 723 |                  grad_clipping=False,
 724 |                  unroll_scan=False,
 725 |                  attention_softmax_function=T.nnet.softmax,
 726 |                  #precompute_input=True,
 727 |                  decode_pre_steps=0,
 728 |                  return_decodehid=False,
 729 |                  mask_input=None,
 730 |                  **kwargs):
 731 | 
 732 |         # Initialize parent layer
 733 |         incomings = [incoming]
 734 |         if mask_input is not None:
 735 |             incomings.append(mask_input)
 736 |         super(LSTMAttentionDecodeFeedbackLayer, self).__init__(
 737 |             incomings, **kwargs)
 738 | 
 739 |         # For any of the nonlinearities, if None is supplied, use identity
 740 |         if nonlinearity_ingate is None:
 741 |             self.nonlinearity_ingate = nonlinearities.identity
 742 |         else:
 743 |             self.nonlinearity_ingate = nonlinearity_ingate
 744 | 
 745 |         if nonlinearity_forgetgate is None:
 746 |             self.nonlinearity_forgetgate = nonlinearities.identity
 747 |         else:
 748 |             self.nonlinearity_forgetgate = nonlinearity_forgetgate
 749 | 
 750 |         if nonlinearity_cell is None:
 751 |             self.nonlinearity_cell = nonlinearities.identity
 752 |         else:
 753 |             self.nonlinearity_cell = nonlinearity_cell
 754 | 
 755 |         if nonlinearity_outgate is None:
 756 |             self.nonlinearity_outgate = nonlinearities.identity
 757 |         else:
 758 |             self.nonlinearity_outgate = nonlinearity_outgate
 759 | 
 760 |         if nonlinearity_out is None:
 761 |             self.nonlinearity_out = nonlinearities.identity
 762 |         else:
 763 |             self.nonlinearity_out = nonlinearity_out
 764 | 
 765 |         self.attention_softmax_function = attention_softmax_function
 766 | 
 767 |         self.learn_init = learn_init
 768 |         self.num_units = num_units
 769 |         self.backwards = backwards
 770 |         self.peepholes = peepholes
 771 |         self.gradient_steps = gradient_steps
 772 |         self.grad_clipping = grad_clipping
 773 |         self.unroll_scan = unroll_scan
 774 |         self.n_decodesteps = n_decodesteps
 775 |         self.aln_num_units = aln_num_units
 776 |         self.nonlinearity_align = nonlinearity_align
 777 |         self.decode_pre_steps = decode_pre_steps
 778 |         self.return_decodehid = return_decodehid
 779 | 
 780 |         input_shape = self.input_shapes[0]
 781 |         if unroll_scan and input_shape[1] is None:
 782 |             raise ValueError("Input sequence length cannot be specified as "
 783 |                              "None when unroll_scan is True")
 784 | 
 785 |         num_inputs = np.prod(input_shape[2:])
 786 |         self.num_inputs = num_inputs
 787 |         # Initialize parameters using the supplied args
 788 |         #self.W_in_to_ingate = self.add_param(
 789 |         #    W_in_to_ingate, (num_inputs, num_units), name="W_in_to_ingate")
 790 | 
 791 |         self.W_hid_to_ingate = self.add_param(
 792 |             W_hid_to_ingate, (num_units, num_units), name="W_hid_to_ingate")
 793 | 
 794 |         self.b_ingate = self.add_param(
 795 |             b_ingate, (num_units,), name="b_ingate", regularizable=False)
 796 | 
 797 |         #self.W_in_to_forgetgate = self.add_param(
 798 |         #    W_in_to_forgetgate, (num_inputs, num_units),
 799 |         #    name="W_in_to_forgetgate")
 800 | 
 801 |         self.W_hid_to_forgetgate = self.add_param(
 802 |             W_hid_to_forgetgate, (num_units, num_units),
 803 |             name="W_hid_to_forgetgate")
 804 | 
 805 |         self.b_forgetgate = self.add_param(
 806 |             b_forgetgate, (num_units,), name="b_forgetgate",
 807 |             regularizable=False)
 808 | 
 809 |         #self.W_in_to_cell = self.add_param(
 810 |         #    W_in_to_cell, (num_inputs, num_units), name="W_in_to_cell")
 811 | 
 812 |         self.W_hid_to_cell = self.add_param(
 813 |             W_hid_to_cell, (num_units, num_units), name="W_hid_to_cell")
 814 | 
 815 |         self.b_cell = self.add_param(
 816 |             b_cell, (num_units,), name="b_cell", regularizable=False)
 817 | 
 818 |         #self.W_in_to_outgate = self.add_param(
 819 |         #    W_in_to_outgate, (num_inputs, num_units), name="W_in_to_outgate")
 820 | 
 821 |         self.W_hid_to_outgate = self.add_param(
 822 |             W_hid_to_outgate, (num_units, num_units), name="W_hid_to_outgate")
 823 | 
 824 |         self.b_outgate = self.add_param(
 825 |             b_outgate, (num_units,), name="b_outgate", regularizable=False)
 826 | 
 827 | 
 828 |         self.W_weightedhid_to_ingate = self.add_param(
 829 |             W_hid_to_ingate, (num_inputs, num_units), name="W_weightedhid_to_ingate")
 830 | 
 831 |         self.W_weightedhid_to_forgetgate = self.add_param(
 832 |             W_hid_to_forgetgate, (num_inputs, num_units),
 833 |             name="W_weightedhid_to_forgetgate")
 834 | 
 835 |         self.W_weightedhid_to_cell = self.add_param(
 836 |             W_hid_to_cell, (num_inputs, num_units), name="W_weightedhid_to_cell")
 837 | 
 838 |         self.W_weightedhid_to_outgate = self.add_param(
 839 |             W_hid_to_outgate, (num_inputs, num_units), name="W_weightedhid_to_outgate")
 840 | 
 841 | 
 842 | 
 843 | 
 844 |         # Stack input weight matrices into a (num_inputs, 4*num_units)
 845 |         # matrix, which speeds up computation
 846 |         #self.W_in_stacked = T.concatenate(
 847 |         #    [self.W_in_to_ingate, self.W_in_to_forgetgate,
 848 |         #     self.W_in_to_cell, self.W_in_to_outgate], axis=1)
 849 | 
 850 |         # Same for hidden weight matrices
 851 |         self.W_hid_stacked = T.concatenate(
 852 |             [self.W_hid_to_ingate, self.W_hid_to_forgetgate,
 853 |              self.W_hid_to_cell, self.W_hid_to_outgate], axis=1)
 854 | 
 855 |         self.W_weightedhid_stacked = T.concatenate(
 856 |             [self.W_weightedhid_to_ingate, self.W_weightedhid_to_forgetgate,
 857 |              self.W_weightedhid_to_cell, self.W_weightedhid_to_outgate], axis=1)
 858 | 
 859 |         # Stack biases into a (4*num_units) vector
 860 |         self.b_stacked = T.concatenate(
 861 |             [self.b_ingate, self.b_forgetgate,
 862 |              self.b_cell, self.b_outgate], axis=0)
 863 | 
 864 |         # If peephole (cell to gate) connections were enabled, initialize
 865 |         # peephole connections.  These are elementwise products with the cell
 866 |         # state, so they are represented as vectors.
 867 |         if self.peepholes:
 868 |             self.W_cell_to_ingate = self.add_param(
 869 |                 W_cell_to_ingate, (num_units, ), name="W_cell_to_ingate")
 870 | 
 871 |             self.W_cell_to_forgetgate = self.add_param(
 872 |                 W_cell_to_forgetgate, (num_units, ),
 873 |                 name="W_cell_to_forgetgate")
 874 | 
 875 |             self.W_cell_to_outgate = self.add_param(
 876 |                 W_cell_to_outgate, (num_units, ), name="W_cell_to_outgate")
 877 | 
 878 |         self.W_align = self.add_param(W_align, (num_units, self.aln_num_units),
 879 |                                    name="AlignSeqOutputLayer: (aln) W_a")
 880 |         self.U_align = self.add_param(U_align, (num_inputs, self.aln_num_units),
 881 |                            name="AlignSeqOutputLayer: (aln) U_a")
 882 |         self.v_align = self.add_param(v_align, (self.aln_num_units, 1),
 883 |                                  name="AlignSeqOutputLayer: v_a")
 884 | 
 885 | 
 886 |         # Setup initial values for the cell and the hidden units
 887 |         if isinstance(cell_init, T.TensorVariable):
 888 |             if cell_init.ndim != 2:
 889 |                 raise ValueError(
 890 |                     "When cell_init is provided as a TensorVariable, it should"
 891 |                     " have 2 dimensions and have shape (num_batch, num_units)")
 892 |             self.cell_init = cell_init
 893 |         else:
 894 |             self.cell_init = self.add_param(
 895 |                 cell_init, (1, num_units), name="cell_init",
 896 |                 trainable=learn_init, regularizable=False)
 897 | 
 898 |         if isinstance(hid_init, T.TensorVariable):
 899 |             if hid_init.ndim != 2:
 900 |                 raise ValueError(
 901 |                     "When hid_init is provided as a TensorVariable, it should "
 902 |                     "have 2 dimensions and have shape (num_batch, num_units)")
 903 |             self.hid_init = hid_init
 904 |         else:
 905 |             self.hid_init = self.add_param(
 906 |                 hid_init, (1, self.num_units), name="hid_init",
 907 |                 trainable=learn_init, regularizable=False)
 908 | 
 909 |     def get_output_shape_for(self, input_shapes):
 910 |         input_shape = input_shapes[0]
 911 |         return input_shape[0], None, self.num_units
 912 | 
 913 |     def get_params(self, **tags):
 914 |         # Get all parameters from this layer, the master layer
 915 |         params = super(LSTMAttentionDecodeFeedbackLayer, self).get_params(**tags)
 916 |         # Combine with all parameters from the child layers
 917 |         return params
 918 | 
 919 |     def get_output_for(self, inputs, **kwargs):
 920 |         """
 921 |         Compute this layer's output function given a symbolic input variable
 922 | 
 923 |         Parameters
 924 |         ----------
 925 |         input : theano.TensorType
 926 |             Symbolic input variable.
 927 |         mask : theano.TensorType
 928 |             Theano variable denoting whether each time step in each
 929 |             sequence in the batch is part of the sequence or not.  If ``None``,
 930 |             then it is assumed that all sequences are of the same length.  If
 931 |             not all sequences are of the same length, then it must be
 932 |             supplied as a matrix of shape ``(n_batch, n_time_steps)`` where
 933 |             ``mask[i, j] = 1`` when ``j <= (length of sequence i)`` and
 934 |             ``mask[i, j] = 0`` when ``j > (length of sequence i)``.
 935 | 
 936 |         Returns
 937 |         -------
 938 |         layer_output : theano.TensorType
 939 |             Symblic output variable.
 940 |         """
 941 |         input = inputs[0]
 942 |         # Retrieve the mask when it is supplied
 943 |         mask = inputs[1] if len(inputs) > 1 else None
 944 | 
 945 |         # Treat all dimensions after the second as flattened feature dimensions
 946 |         if input.ndim > 3:
 947 |             input = input.reshape((input.shape[0], input.shape[1],
 948 |                                    T.prod(input.shape[2:])))
 949 |         num_batch = input.shape[0]
 950 |         encode_seqlen = input.shape[1]
 951 | 
 952 |         if mask is None:
 953 |             mask = T.ones((num_batch, encode_seqlen),dtype='float32')
 954 |         # At each call to scan, input_n will be (n_time_steps, 4*num_units).
 955 |         # We define a slicing function that extract the input to each LSTM gate
 956 |         def slice_w(x, n):
 957 |             return x[:, n*self.num_units:(n+1)*self.num_units]
 958 | 
 959 |         # Create single recurrent computation step function
 960 |         # input_n is the n'th vector of the input
 961 |         def step(cell_previous, hid_previous, alpha_prev, weighted_hidden_prev,
 962 |                  input, mask, hUa, W_align, v_align,
 963 |                  W_hid_stacked, W_weightedhid_stacked, W_cell_to_ingate,
 964 |                  W_cell_to_forgetgate, W_cell_to_outgate,
 965 |                  b_stacked, *args):
 966 | 
 967 |             #compute (unormalized) attetion vector
 968 |             sWa = T.dot(hid_previous, W_align)       # (BS, aln_num_units)
 969 |             sWa = sWa.dimshuffle(0, 'x', 1)   # (BS, 1, aln_num_units)
 970 |             align_act = sWa + hUa
 971 |             tanh_sWahUa = self.nonlinearity_align(align_act)
 972 |                                             # (BS, seqlen, num_units_aln)
 973 | 
 974 |             # CALCULATE WEIGHT FOR EACH HIDDEN STATE VECTOR
 975 |             a = T.dot(tanh_sWahUa, v_align)  # (BS, Seqlen, 1)
 976 |             a = T.reshape(a, (a.shape[0], a.shape[1]))
 977 |             #                                # (BS, Seqlen)
 978 |             # # ->(BS, seq_len)
 979 | 
 980 |             a = a*mask - (1-mask)*10000
 981 | 
 982 |             alpha = self.attention_softmax_function(a)
 983 |             #alpha = T.reshape(alpha, (input.shape[0], input.shape[1]))
 984 | 
 985 |             # input: (BS, Seqlen, num_units)
 986 |             weighted_hidden = input * alpha.dimshuffle(0, 1, 'x')
 987 |             weighted_hidden = T.sum(weighted_hidden, axis=1)  #sum seqlen out
 988 | 
 989 | 
 990 |             # Calculate gates pre-activations and slice
 991 | 
 992 |             # (BS, dec_hid) x (dec_hid, dec_hid)
 993 |             gates = T.dot(hid_previous, W_hid_stacked) + b_stacked
 994 |             # (BS, enc_hid) x (enc_hid, dec_hid)
 995 |             gates += T.dot(weighted_hidden, W_weightedhid_stacked)
 996 | 
 997 |             # Clip gradients
 998 |             if self.grad_clipping is not False:
 999 |                 gates = theano.gradient.grad_clip(
1000 |                     gates, -self.grad_clipping, self.grad_clipping)
1001 | 
1002 |             # Extract the pre-activation gate values
1003 |             ingate = slice_w(gates, 0)
1004 |             forgetgate = slice_w(gates, 1)
1005 |             cell_input = slice_w(gates, 2)
1006 |             outgate = slice_w(gates, 3)
1007 | 
1008 |             if self.peepholes:
1009 |                 # Compute peephole connections
1010 |                 ingate += cell_previous*W_cell_to_ingate
1011 |                 forgetgate += cell_previous*W_cell_to_forgetgate
1012 | 
1013 |             # Apply nonlinearities
1014 |             ingate = self.nonlinearity_ingate(ingate)
1015 |             forgetgate = self.nonlinearity_forgetgate(forgetgate)
1016 |             cell_input = self.nonlinearity_cell(cell_input)
1017 |             outgate = self.nonlinearity_outgate(outgate)
1018 | 
1019 |             # Compute new cell value
1020 |             cell = forgetgate*cell_previous + ingate*cell_input
1021 | 
1022 |             if self.peepholes:
1023 |                 outgate += cell*W_cell_to_outgate
1024 | 
1025 |             # W_align:  (num_units, aln_num_units)
1026 |             # U_align:  (num_feats, aln_num_units)
1027 |             # v_align:  (aln_num_units, 1)
1028 |             # hUa:      (BS, Seqlen, aln_num_units)
1029 |             # hid:      (BS, num_units_dec)
1030 |             # input:    (BS, Seqlen, num_inputs)
1031 | 
1032 |             # Compute new hidden unit activation
1033 |             hid = outgate*self.nonlinearity_out(cell)
1034 | 
1035 |             return [cell, hid, alpha, weighted_hidden]
1036 | 
1037 |         sequences = []
1038 |         step_fun = step
1039 | 
1040 |         ones = T.ones((num_batch, 1))
1041 |         if isinstance(self.cell_init, T.TensorVariable):
1042 |             cell_init = self.cell_init
1043 |         else:
1044 |             # Dot against a 1s vector to repeat to shape (num_batch, num_units)
1045 |             cell_init = T.dot(ones, self.cell_init)
1046 | 
1047 |         if isinstance(self.hid_init, T.TensorVariable):
1048 |             hid_init = self.hid_init
1049 |         else:
1050 |             # Dot against a 1s vector to repeat to shape (num_batch, num_units)
1051 |             hid_init = T.dot(ones, self.hid_init)
1052 | 
1053 |         #weighted_hidden_init = T.zeros((num_batch, input.shape[2]))
1054 |         alpha_init = T.zeros((num_batch, encode_seqlen))
1055 | 
1056 |         weighted_hidden_init = T.zeros((num_batch, self.num_inputs))
1057 | 
1058 |         # The hidden-to-hidden weight matrix is always used in step
1059 | 
1060 |         hUa = T.dot(input, self.U_align)   # (num_batch, seq_len, num_units_aln)
1061 | 
1062 |         non_seqs = [input, mask, hUa, self.W_align, self.v_align,
1063 |                     self.W_hid_stacked, self.W_weightedhid_stacked]
1064 |         # The "peephole" weight matrices are only used when self.peepholes=True
1065 |         if self.peepholes:
1066 |             non_seqs += [self.W_cell_to_ingate,
1067 |                          self.W_cell_to_forgetgate,
1068 |                          self.W_cell_to_outgate]
1069 |         # theano.scan only allows for positional arguments, so when
1070 |         # self.peepholes is False, we need to supply fake placeholder arguments
1071 |         # for the three peephole matrices.
1072 |         else:
1073 |             non_seqs += [(), (), ()]
1074 |         # When we aren't precomputing the input outside of scan, we need to
1075 |         # provide the input weights and biases to the step function
1076 | 
1077 |         non_seqs += [self.b_stacked]
1078 | 
1079 |         if self.unroll_scan:
1080 |             # Explicitly unroll the recurrence instead of using scan
1081 |             cell_out, hid_out, alpha_out, weighted_hidden_out = unroll_scan(
1082 |                 fn=step_fun,
1083 |                 sequences=sequences,
1084 |                 outputs_info=[cell_init, hid_init, alpha_init, weighted_hidden_init],
1085 |                 go_backwards=self.backwards,
1086 |                 non_sequences=non_seqs,
1087 |                 n_steps=self.n_decodesteps + self.decode_pre_steps)
1088 |         else:
1089 |             # Scan op iterates over first dimension of input and repeatedly
1090 |             # applies the step function
1091 |             cell_out, hid_out, alpha_out, weighted_hidden_out = theano.scan(
1092 |                 fn=step_fun,
1093 |                 sequences=sequences,
1094 |                 outputs_info=[cell_init, hid_init, alpha_init, weighted_hidden_init],
1095 |                 go_backwards=self.backwards,
1096 |                 truncate_gradient=self.gradient_steps,
1097 |                 non_sequences=non_seqs,
1098 |                 n_steps=self.n_decodesteps + self.decode_pre_steps,
1099 |                 strict=True)[0]
1100 | 
1101 |         # dimshuffle back to (n_batch, n_time_steps, n_features))
1102 | 
1103 |         #a_out - (n_decodesteps, bs, seqlen)
1104 |         #hid_out -   (n_decode_steps, bs, num_units)
1105 | 
1106 | 
1107 |         # mask:  (BS, encode_seqlen
1108 |         # a_out; (n_decodesteps, BS, encode_seqlen)
1109 |         cell_out = cell_out.dimshuffle(1, 0, 2)
1110 |         hid_out = hid_out.dimshuffle(1, 0, 2)  # (BS, n_decodesteps, encode_seqlen)
1111 |         mask = mask.dimshuffle(0, 'x', 1)
1112 |         alpha_out = alpha_out.dimshuffle(1, 0, 2)  # (BS, n_decodesteps, encode_seqlen)
1113 | 
1114 |         weighted_hidden_out = weighted_hidden_out.dimshuffle(1, 0, 2)
1115 | 
1116 |         # if scan is backward reverse the output
1117 |         if self.backwards:
1118 |             hid_out = hid_out[:, ::-1]
1119 |             cell_out = cell_out[:, ::-1]
1120 |             weighted_hidden_out = weighted_hidden_out[:, ::-1]
1121 |             alpha_out = alpha_out[:, ::-1]
1122 | 
1123 |         if self.decode_pre_steps > 0:
1124 |             hid_out = hid_out[:, self.decode_pre_steps:]
1125 |             cell_out = hid_out[:, self.decode_pre_steps:]
1126 |             weighted_hidden_out = weighted_hidden_out[:, self.decode_pre_steps:]
1127 |             alpha_out = hid_out[:, self.decode_pre_steps:]
1128 | 
1129 |         self.hid_out = hid_out
1130 |         self.cell_out = cell_out
1131 |         self.weighted_hidden_out = weighted_hidden_out
1132 |         self.alpha = alpha_out
1133 | 
1134 |         if self.return_decodehid:
1135 |             return hid_out
1136 |         else:
1137 |             return weighted_hidden_out
1138 | 
1139 | 
1140 | 


--------------------------------------------------------------------------------
/lab3/enc-dec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepLearningDTU/nvidia_deep_learning_summercamp_2016/9d87d409eeebed4b8cf39c42987fb5e786d3ae28/lab3/enc-dec.png


--------------------------------------------------------------------------------
/lab6/Lab6.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {
  7 |     "collapsed": false
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "%matplotlib inline\n",
 12 |     "import matplotlib\n",
 13 |     "import numpy as np\n",
 14 |     "import matplotlib.pyplot as plt\n",
 15 |     "import sklearn.datasets\n",
 16 |     "import theano\n",
 17 |     "import theano.tensor as T\n",
 18 |     "import lasagne\n",
 19 |     "import math\n"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "markdown",
 24 |    "metadata": {},
 25 |    "source": [
 26 |     "# Variational Autoencoders (VAE)\n",
 27 |     "\n",
 28 |     "In this exercise we'll implement a variational autoencoder. Very briefly, an autoencoder encodes some input into a new, usually more compact, representation which can be used to reconstruct the input data again. A variational autoencoder makes the further assumption that the compact representation follows a probabilistic distribution (usually a Gaussian), which makes it possible to sample new data from a trained variational autoencoder. The \"variational\" part of the name comes from the fact that these models are trained using variational inference.\n",
 29 |     "\n",
 30 |     "The mathematical details of the training can be a bit challenging; however, we believe that probabilistic deep learning will be an important part of future deep learning developments, which is why we find it important to introduce the concepts.\n",
 31 |     "\n",
 32 |     "As background material we recommend reading [Tutorial on Variational Autoencoder](http://arxiv.org/abs/1606.05908). For the implementation of the model you must read the article \"Auto-Encoding Variational Bayes\", Kingma & Welling, ICLR 2014: http://arxiv.org/pdf/1312.6114v10.pdf and \"Stochastic Backpropagation and Approximate Inference in Deep Generative Models\", Rezende et al, ICML 2014:\n",
 33 |     "http://arxiv.org/pdf/1401.4082v3.pdf\n",
 34 |     "\n"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "markdown",
 39 |    "metadata": {},
 40 |    "source": [
 41 |     "## VAE crash course\n",
 42 |     "\n",
 43 |     "VAEs consist of two parts:\n",
 44 |     "\n",
 45 |     " * Encoder (also known as recognition, inference or Q-model): Maps the input data into a probabilistic latent space by calculating the mean and variance parameters of a gaussian distribution as a function of the input data x:  $q(z|x) = \\mathcal{N}(z|\\mu_\\theta(x), \\sigma_\\phi(x)I)$\n",
 46 |     " * Decoder (also known as generative or P-model): Reconstructs the input image using a sample from the latent space defined by the encoder model: $p(x|z)$\n",
 47 |     "<img src=\"VAE.png\" alt=\"Drawing\" style=\"width: 300px;\"/>\n",
 48 |     "\n",
 49 |     "\n",
 50 |     "In more mathematical details we have (this can be a bit challenging)\n",
 51 |     "\n",
 52 |     "$p(x) = \\int_z p(x|z)p(z)dz$\n",
 53 |     "\n",
 54 |     "$p(x) = \\int_z p(x|z)p(z)\\frac{q(z|x)}{q(z|x)}dz$\n",
 55 |     "\n",
 56 |     "\n",
 57 |     "$p(x) = \\int_z q(z|x) \\frac{p(x|z)p(z)}{q(z|x)}dz$\n",
 58 |     "\n",
 59 |     "\n",
 60 |     "$\\log p(x) = \\log \\int_z q(z|x) \\frac{p(x|z)p(z)}{q(z|x)}dz$\n",
 61 |     "\n",
 62 |     "$\\log p(x) \\geq  \\int_z q(z|x)\\log \\frac{p(x|z)p(z)}{q(z|x)}dz$\n",
 63 |     "\n",
 64 |     "This is known as the variational lower bound; the inequality follows from Jensen's inequality because the logarithm is concave. We continue with a bit of rewriting\n",
 65 |     "\n",
 66 |     "$\\log p(x) \\geq E_{q(z|x)} \\left[\\log \\frac{p(x|z)p(z)}{q(z|x)}\\right]$\n",
 67 |     "\n",
 68 |     "$\\log p(x) \\geq E_{q(z|x)} \\left[\\log p(x|z)\\right] - KL(q(z|x) | p(z))$\n",
 69 |     "\n",
 70 |     "Here the first term on the right-hand side is the data reconstruction term and the second term is the Kullback-Leibler divergence between the approximate posterior $q(z|x)$ and the prior $p(z)$, which acts as a probabilistic regularizer.\n",
 71 |     "\n",
 72 |     "### Training a VAE \n",
 73 |     "The VAE is similar to a deterministic autoencoder except that we assume that the hidden units follow some distribution. Usually we just assume that the units are independent and standard Gaussian distributed.\n",
 74 |     "\n",
 75 |     "Above we defined a lower bound on the log likelihood of the data. We can train the model by pushing up the lower bound, i.e. we do gradient ascent on the lower bound. By using the _reparameterization trick_ we can directly backpropagate through the model and optimize the lower bound. If you are interested in the technical details you can look at the references given above.\n",
 76 |     "\n",
 77 |     "### Setting up the network\n",
 78 |     "\n",
 79 |     "We set up the network like an autoencoder except that the bottleneck layer is the __SimpleSampleLayer__, which samples the hidden units. \n",
 80 |     "\n",
 81 |     "The lower bound is calculated in the ```LogLikelihood```. "
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "code",
 86 |    "execution_count": null,
 87 |    "metadata": {
 88 |     "collapsed": false
 89 |    },
 90 |    "outputs": [],
 91 |    "source": [
 92 |     "#To speed up training we'll only work on a subset of the data\n",
 93 |     "#We discretize the data to 0 and 1 in order to use it with a bernoulli observation model p(x|z) = Ber(mu(z))\n",
 94 |     "\n",
 95 |     "def bernoullisample(x):\n",
 96 |     "    return np.random.binomial(1,x,size=x.shape).astype(theano.config.floatX)\n",
 97 |     "\n",
 98 |     "\n",
 99 |     "data = np.load('mnist.npz')\n",
100 |     "num_classes = 10\n",
101 |     "x_train = bernoullisample(data['X_train'][:50000]).astype('float32')\n",
102 |     "targets_train = data['y_train'][:50000].astype('int32')\n",
103 |     "\n",
104 |     "x_valid = bernoullisample(data['X_valid'][:500]).astype('float32')\n",
105 |     "targets_valid = data['y_valid'][:500].astype('int32')\n",
106 |     "\n",
107 |     "x_test = bernoullisample(data['X_test'][:500]).astype('float32')\n",
108 |     "targets_test = data['y_test'][:500].astype('int32')"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": null,
114 |    "metadata": {
115 |     "collapsed": false
116 |    },
117 |    "outputs": [],
118 |    "source": [
119 |     "#plot a few MNIST examples\n",
120 |     "\n",
121 |     "def plot_samples(x,title=''):\n",
122 |     "    idx = 0\n",
123 |     "    canvas = np.zeros((28*10, 10*28))\n",
124 |     "    for i in range(10):\n",
125 |     "        for j in range(10):\n",
126 |     "            canvas[i*28:(i+1)*28, j*28:(j+1)*28] = x[idx].reshape((28, 28))\n",
127 |     "            idx += 1\n",
128 |     "    plt.figure(figsize=(7, 7))\n",
129 |     "    plt.imshow(canvas, cmap='gray')\n",
130 |     "    plt.title(title)\n",
131 |     "    plt.show()\n",
132 |     "\n",
133 |     "plot_samples(x_train[:100],title='MNIST handwritten digits')"
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "code",
138 |    "execution_count": null,
139 |    "metadata": {
140 |     "collapsed": false
141 |    },
142 |    "outputs": [],
143 |    "source": [
144 |     "#defined a couple of helper functions\n",
145 |     "c = - 0.5 * math.log(2*math.pi)\n",
146 |     "def log_bernoulli(x, p, eps=0.0):\n",
147 |     "    p = T.clip(p, eps, 1.0 - eps)\n",
148 |     "    return -T.nnet.binary_crossentropy(p, x)\n",
149 |     "\n",
150 |     "def kl_normal2_stdnormal(mean, log_var):\n",
151 |     "    return -0.5*(1 + log_var - mean**2 - T.exp(log_var))\n"
152 |    ]
153 |   },
154 |   {
155 |    "cell_type": "markdown",
156 |    "metadata": {},
157 |    "source": [
158 |     "Construct the lasagne layer."
159 |    ]
160 |   },
161 |   {
162 |    "cell_type": "code",
163 |    "execution_count": null,
164 |    "metadata": {
165 |     "collapsed": false
166 |    },
167 |    "outputs": [],
168 |    "source": [
169 |     "from lasagne.layers import InputLayer,DenseLayer,get_output, get_all_params\n",
170 |     "from lasagne.nonlinearities import elu, identity, sigmoid\n",
171 |     "from samplelayer import SimpleSampleLayer\n",
172 |     "\n",
173 |     "num_features = x_train.shape[-1]\n",
174 |     "num_latent_z = 64\n",
175 |     "\n",
176 |     "#MODEL SPECIFICATION\n",
177 |     "\n",
178 |     "#ENCODER\n",
179 |     "l_in_x = InputLayer(shape=(None, num_features))\n",
180 |     "l_enc = DenseLayer(l_in_x, num_units=256, nonlinearity=elu)\n",
181 |     "l_enc = DenseLayer(l_enc, num_units=256, nonlinearity=elu) \n",
182 |     "l_muq = DenseLayer(l_enc, num_units=num_latent_z, nonlinearity=identity)     #mu(x)\n",
183 |     "l_logvarq = DenseLayer(l_enc, num_units=num_latent_z, nonlinearity=lambda x: T.clip(x,-10,10)) #logvar(x), \n",
184 |     "\n",
185 |     "l_z = SimpleSampleLayer(mean=l_muq, log_var=l_logvarq) #sample a latent representation z \\sim q(z|x) = N(mu(x),logvar(x))\n",
186 |     "\n",
187 |     "#we split the model in two parts to allow sampling from the decoder model separately\n",
188 |     "\n",
189 |     "#DECODER\n",
190 |     "l_in_z = InputLayer(shape=(None, num_latent_z))\n",
191 |     "l_dec = DenseLayer(l_in_z, num_units=256, nonlinearity=elu) \n",
192 |     "l_dec = DenseLayer(l_dec, num_units=256, nonlinearity=elu) \n",
193 |     "l_mux = DenseLayer(l_dec, num_units=num_features, nonlinearity=sigmoid)  #reconstruction of input using a sigmoid output since mux \\in [0,1] "
194 |    ]
195 |   },
196 |   {
197 |    "cell_type": "code",
198 |    "execution_count": null,
199 |    "metadata": {
200 |     "collapsed": false
201 |    },
202 |    "outputs": [],
203 |    "source": [
204 |     "\n",
205 |     "sym_x = T.matrix('x')\n",
206 |     "sym_z = T.matrix('z')\n",
207 |     "\n",
208 |     "z_train, muq_train, logvarq_train = get_output([l_z,l_muq,l_logvarq],{l_in_x:sym_x},deterministic=False)\n",
209 |     "mux_train = get_output(l_mux,{l_in_z:z_train},deterministic=False)\n",
210 |     "\n",
211 |     "z_eval, muq_eval, logvarq_eval = get_output([l_z,l_muq,l_logvarq],{l_in_x:sym_x},deterministic=True)\n",
212 |     "mux_eval = get_output(l_mux,{l_in_z:z_eval},deterministic=True)\n",
213 |     "\n",
214 |     "mux_sample = get_output(l_mux,{l_in_z:sym_z},deterministic=True)\n",
215 |     "\n",
216 |     "\n",
217 |     "\n",
218 |     "#define the cost function\n",
219 |     "\n",
220 |     "def LogLikelihood(mux,x,muq,logvarq):\n",
221 |     "    log_px_given_z = log_bernoulli(x, mux, eps=1e-6).sum(axis=1).mean() #note that we sum over the feature (pixel) dimension and average over the samples\n",
222 |     "    KL_qp = kl_normal2_stdnormal(muq, logvarq).sum(axis=1).mean()\n",
223 |     "    LL = log_px_given_z - KL_qp\n",
224 |     "    return LL, log_px_given_z, KL_qp\n",
225 |     "\n",
226 |     "\n",
227 |     "LL_train, logpx_train, KL_train = LogLikelihood(mux_train, sym_x, muq_train, logvarq_train)\n",
228 |     "LL_eval, logpx_eval, KL_eval = LogLikelihood(mux_eval, sym_x, muq_eval, logvarq_eval)\n",
229 |     "\n",
230 |     "\n",
231 |     "all_params = get_all_params([l_z,l_mux],trainable=True)\n",
232 |     "\n",
233 |     "# Let Theano do its magic and get all the gradients we need for training\n",
234 |     "all_grads = T.grad(-LL_train, all_params)\n",
235 |     "\n",
236 |     "\n",
237 |     "# Set the update function for parameters \n",
238 |     "# you might want to experiment with more advanced update schemes like rmsprop, adadelta etc.\n",
239 |     "updates = lasagne.updates.adam(all_grads, all_params, learning_rate=1e-3)\n",
240 |     "\n",
241 |     "\n",
242 |     "f_train = theano.function(inputs=[sym_x],\n",
243 |     "                          outputs=[LL_train, logpx_train, KL_train],\n",
244 |     "                          updates=updates)\n",
245 |     "\n",
246 |     "f_eval = theano.function(inputs=[sym_x],\n",
247 |     "                         outputs=[LL_train, logpx_train, KL_train])\n",
248 |     "\n",
249 |     "f_sample= theano.function(inputs=[sym_z],\n",
250 |     "                         outputs=[mux_sample])\n",
251 |     "\n",
252 |     "f_recon= theano.function(inputs=[sym_x],\n",
253 |     "                         outputs=[mux_eval])\n",
254 |     "\n"
255 |    ]
256 |   },
257 |   {
258 |    "cell_type": "code",
259 |    "execution_count": null,
260 |    "metadata": {
261 |     "collapsed": false
262 |    },
263 |    "outputs": [],
264 |    "source": [
265 |     "#Test the forward pass\n",
266 |     "print  f_train(x_valid)"
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "code",
271 |    "execution_count": null,
272 |    "metadata": {
273 |     "collapsed": false
274 |    },
275 |    "outputs": [],
276 |    "source": [
277 |     "#plot some samples from the untrained model\n",
278 |     "z = np.random.normal(0,1,size=(100,num_latent_z)).astype('float32')\n",
279 |     "mux_sample = f_sample(z)[0]\n",
280 |     "\n",
281 |     "plot_samples(mux_sample,title='MNIST handwritten samples, untrained model')"
282 |    ]
283 |   },
284 |   {
285 |    "cell_type": "markdown",
286 |    "metadata": {},
287 |    "source": [
288 |     "Train the model."
289 |    ]
290 |   },
291 |   {
292 |    "cell_type": "code",
293 |    "execution_count": null,
294 |    "metadata": {
295 |     "collapsed": false
296 |    },
297 |    "outputs": [],
298 |    "source": [
299 |     "num_epochs = 10\n",
300 |     "batch_size = 64\n",
301 |     "num_batch_train = x_train.shape[0] // batch_size\n",
302 |     " \n",
303 |     "LL_train, KL_train, logpx_train = [],[],[]\n",
304 |     "LL_valid, KL_valid, logpx_valid = [],[],[]\n",
305 |     "\n",
306 |     "for e in range(num_epochs):\n",
307 |     "    _LL_train, _KL_train, _logpx_train = [],[],[]\n",
308 |     "    for i in range(num_batch_train):\n",
309 |     "        out = f_train(x_train[batch_size*i:(i+1)*batch_size])\n",
310 |     "        #out = [LL, logpx,KL_qp]\n",
311 |     "        _LL_train += [out[0]]\n",
312 |     "        _logpx_train += [out[1]]\n",
313 |     "        _KL_train += [out[2]]\n",
314 |     "        \n",
315 |     "    LL_train += [np.mean(_LL_train)] \n",
316 |     "    KL_train += [np.mean(_KL_train)] \n",
317 |     "    logpx_train += [np.mean(_logpx_train)] \n",
318 |     "    \n",
319 |     "    out = f_eval(x_valid)\n",
320 |     "    LL_valid += [out[0]]\n",
321 |     "    logpx_valid += [out[1]]\n",
322 |     "    KL_valid += [out[2]]\n",
323 |     "\n",
324 |     "    print \"Epoch %i\\t\"%(e) + \\\n",
325 |     "    \"Train: LL: %0.1f\\tKL %0.1f\\tlogpx: %0.1f\\t\"%(LL_train[-1],KL_train[-1],logpx_train[-1]) + \\\n",
326 |     "    \"Valid: LL: %0.1f\\tKL %0.1f\\tlogpx: %0.1f\"%(LL_valid[-1],KL_valid[-1],logpx_valid[-1])\n",
327 |     "\n",
328 |     "\n",
329 |     "epoch = np.arange(len(LL_train))\n",
330 |     "plt.figure()\n",
331 |     "plt.plot(epoch,LL_train,'r',epoch,LL_valid,'b')\n",
332 |     "plt.legend(['Train LL','Val LL'],loc='best')\n",
333 |     "plt.xlabel('Updates'), plt.ylabel('LL')\n",
334 |     "\n"
335 |    ]
336 |   },
337 |   {
338 |    "cell_type": "code",
339 |    "execution_count": null,
340 |    "metadata": {
341 |     "collapsed": false
342 |    },
343 |    "outputs": [],
344 |    "source": [
345 |     "#plot some samples from the trained model\n",
346 |     "mux_sample = f_sample(z)[0]\n",
347 |     "plot_samples(mux_sample,title='MNIST handwritten samples, $z\\sim p(z)$')\n",
348 |     "\n",
349 |     "#plot some reconstructions of test images from the trained model\n",
350 |     "mux_recon = f_recon(x_test[:100])[0]\n",
351 |     "plot_samples(mux_recon,title='MNIST handwritten reconstructions, $z\\sim q(z|x)$')\n"
352 |    ]
353 |   },
354 |   {
355 |    "cell_type": "markdown",
356 |    "metadata": {},
357 |    "source": [
358 |     "## Assignments\n",
359 |     "Remember that the model defines the probability distribution $p(x,z) = p(x|z)p(z)$. We additionally have the inference network $q(z|x)$ which allows us to infer the latent variables, $z$, for specific input data values $x$.\n",
360 |     "\n",
361 |     "\n",
362 |     "\n",
363 |     "1. Explain how you could sample from the model. Which function does this in the code? \n",
364 |     "2. Explain how you could get reconstructions from the model. Remember that you have the inference network $q(z|x)$\n",
365 |     "3. Use the original paper http://arxiv.org/pdf/1312.6114v10.pdf or [this blog](http://blog.shakirm.com/2015/10/machine-learning-trick-of-the-day-4-reparameterisation-tricks/) to explain what the reparameterization trick does. \n",
366 |     "4. The VAE is a probabilistic model. We could model $p(x,z,y)$ where $y$ is the label information. How could this model handle semi-supervised learning? You can look at the papers https://arxiv.org/pdf/1406.5298.pdf or  https://arxiv.org/pdf/1602.05473v4.pdf. "
367 |    ]
368 |   },
369 |   {
370 |    "cell_type": "code",
371 |    "execution_count": null,
372 |    "metadata": {
373 |     "collapsed": true
374 |    },
375 |    "outputs": [],
376 |    "source": []
377 |   }
378 |  ],
379 |  "metadata": {
380 |   "kernelspec": {
381 |    "display_name": "Python 2",
382 |    "language": "python",
383 |    "name": "python2"
384 |   },
385 |   "language_info": {
386 |    "codemirror_mode": {
387 |     "name": "ipython",
388 |     "version": 2
389 |    },
390 |    "file_extension": ".py",
391 |    "mimetype": "text/x-python",
392 |    "name": "python",
393 |    "nbconvert_exporter": "python",
394 |    "pygments_lexer": "ipython2",
395 |    "version": "2.7.11"
396 |   }
397 |  },
398 |  "nbformat": 4,
399 |  "nbformat_minor": 0
400 | }
401 | 


--------------------------------------------------------------------------------
/lab6/VAE.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepLearningDTU/nvidia_deep_learning_summercamp_2016/9d87d409eeebed4b8cf39c42987fb5e786d3ae28/lab6/VAE.png


--------------------------------------------------------------------------------
/lab6/mnist.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DeepLearningDTU/nvidia_deep_learning_summercamp_2016/9d87d409eeebed4b8cf39c42987fb5e786d3ae28/lab6/mnist.npz


--------------------------------------------------------------------------------
/lab6/samplelayer.py:
--------------------------------------------------------------------------------
  1 | import lasagne
  2 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
  3 | import theano.tensor as T
  4 | import theano
  5 | 
  6 | 
  7 | class SimpleSampleLayer(lasagne.layers.MergeLayer):
  8 |     """
  9 |     Simple sampling layer drawing a single Monte Carlo sample to approximate
 10 |     E_q [log( p(x,z) / q(z|x) )]. This is the approach described in [KINGMA]_.
 11 | 
 12 |     Parameters
 13 |     ----------
 14 |     mu, log_var : :class:`Layer` instances
 15 |         Parameterizing the mean and log(variance) of the distribution to sample
 16 |         from as described in [KINGMA]_. The code assumes that these have the
 17 |         same number of dimensions.
 18 | 
 19 |     seed : int
 20 |         seed to random stream
 21 | 
 22 |     Methods
 23 |     ----------
 24 |     seed : Helper function to change the random seed after init is called
 25 | 
 26 |     References
 27 |     ----------
 28 |         ..  [KINGMA] Kingma, Diederik P., and Max Welling.
 29 |             "Auto-Encoding Variational Bayes."
 30 |             arXiv preprint arXiv:1312.6114 (2013).
 31 |     """
 32 |     def __init__(self, mean, log_var,
 33 |                  seed=lasagne.random.get_rng().randint(1, 2147462579),
 34 |                  **kwargs):
 35 |         super(SimpleSampleLayer, self).__init__([mean, log_var], **kwargs)
 36 | 
 37 |         self._srng = RandomStreams(seed)
 38 | 
 39 |     def seed(self, seed=lasagne.random.get_rng().randint(1, 2147462579)):
 40 |        self._srng.seed(seed)
 41 | 
 42 |     def get_output_shape_for(self, input_shapes):
 43 |         return input_shapes[0]
 44 | 
 45 |     def get_output_for(self, input, **kwargs):
 46 |         mu, log_var = input
 47 |         eps = self._srng.normal(mu.shape)
 48 |         z = mu + T.exp(0.5 * log_var) * eps
 49 |         return z
 50 | 
 51 | 
 52 | class SampleLayer(lasagne.layers.MergeLayer):
 53 |     """
 54 |     Sampling layer supporting importance sampling as described in [BURDA]_ and
 55 |     multiple Monte Carlo samples for the approximation of
 56 |     E_q [log( p(x,z) / q(z|x) )].
 57 | 
 58 |     Parameters
 59 |     ----------
 60 |     mu : class:`Layer` instance
 61 |         Parameterizing the mean of the distribution to sample
 62 |         from as described in [BURDA]_.
 63 | 
 64 |     log_var : class:`Layer` instance
 65 |         By default assumed to parametrize log(sigma^2) of the distribution to
 66 |         sample from as described in [BURDA]_ which is transformed to sigma using
 67 |         the nonlinearity function as described below. Effectively this means
 68 |         that the nonlinearity function controls what log_var parametrizes. A few
 69 |         common examples:
 70 |         -nonlinearity = lambda x: T.exp(0.5*x) => log_var = log(sigma^2)[default]
 71 |         -nonlinearity = lambda x: T.sqrt(x) => log_var = sigma^2
 72 |         -nonlinearity = lambda x: x => log_var = sigma
 73 | 
 74 |     eq_samples : int or T.scalar
 75 |         Number of Monte Carlo samples used to estimate the expectation over
 76 |         q(z|x) in eq. (8) in [BURDA]_.
 77 | 
 78 |     iw_samples : int or T.scalar
 79 |         Number of importance samples in the sum over k in eq. (8) in [BURDA]_.
 80 | 
 81 |     nonlinearity : callable or None
 82 |         The nonlinearity that is applied to the log_var input layer to transform
 83 |         it into a standard deviation. By default we assume that
 84 |         log_var = log(sigma^2) and hence the corresponding nonlinearity is
 85 |         f(x) = T.exp(0.5*x) such that T.exp(0.5*log(sigma^2)) = sigma
 86 | 
 87 |     seed : int
 88 |         seed to random stream
 89 | 
 90 |     Methods
 91 |     ----------
 92 |     seed : Helper function to change the random seed after init is called
 93 | 
 94 |     References
 95 |     ----------
 96 |         ..  [BURDA] Burda, Yuri, Roger Grosse, and Ruslan Salakhutdinov.
 97 |             "Importance Weighted Autoencoders."
 98 |             arXiv preprint arXiv:1509.00519 (2015).
 99 |     """
100 | 
101 |     def __init__(self, mean, log_var,
102 |                  eq_samples=1,
103 |                  iw_samples=1,
104 |                  nonlinearity=lambda x: T.exp(0.5*x),
105 |                  seed=lasagne.random.get_rng().randint(1, 2147462579),
106 |                   **kwargs):
107 |         super(SampleLayer, self).__init__([mean, log_var], **kwargs)
108 | 
109 |         self.eq_samples = eq_samples
110 |         self.iw_samples = iw_samples
111 |         self.nonlinearity = nonlinearity
112 | 
113 |         self._srng = RandomStreams(seed)
114 | 
115 |     def seed(self, seed=lasagne.random.get_rng().randint(1, 2147462579)):
116 |         self._srng.seed(seed)
117 | 
118 |     def get_output_shape_for(self, input_shapes):
119 |         batch_size, num_latent = input_shapes[0]
120 |         if isinstance(batch_size, int) and \
121 |            isinstance(self.iw_samples, int) and \
122 |            isinstance(self.eq_samples, int):
123 |             out_dim = (batch_size*self.eq_samples*self.iw_samples, num_latent)
124 |         else:
125 |             out_dim = (None, num_latent)
126 |         return out_dim
127 | 
128 |     def get_output_for(self, input, **kwargs):
129 |         mu, log_var = input
130 |         batch_size, num_latent = mu.shape
131 |         eps = self._srng.normal(
132 |             [batch_size, self.eq_samples, self.iw_samples, num_latent],
133 |              dtype=theano.config.floatX)
134 | 
135 |         z = mu.dimshuffle(0,'x','x',1) + \
136 |             self.nonlinearity( log_var.dimshuffle(0,'x','x',1)) * eps
137 | 
138 |         return z.reshape((-1,num_latent))
139 | 
140 | 
class SimpleBernoulliSampleLayer(lasagne.layers.Layer):
    """
    Simple sampling layer drawing samples from bernoulli distributions.

    The output has the same shape and dtype as the incoming mean.

    Parameters
    ----------
    mean : :class:`Layer` instances
          Parameterizing the mean value of each bernoulli distribution
    seed : int or None
        Seed for the random stream. If None (the default) a seed is drawn
        from ``lasagne.random.get_rng()`` when the layer is constructed.
    Methods
    ----------
    seed : Helper function to change the random seed after init is called
    """

    def __init__(self, mean, seed=None, **kwargs):
        super(SimpleBernoulliSampleLayer, self).__init__(mean, **kwargs)

        # Draw the seed at construction time. A default written in the
        # signature is evaluated only once (when the class is defined), which
        # would hand the very same seed to every layer built without one.
        if seed is None:
            seed = lasagne.random.get_rng().randint(1, 2147462579)
        self._srng = RandomStreams(seed)

    def seed(self, seed=None):
        """Re-seed the random stream; draws a fresh seed when `seed` is None."""
        if seed is None:
            seed = lasagne.random.get_rng().randint(1, 2147462579)
        self._srng.seed(seed)

    def get_output_shape_for(self, input_shape):
        return input_shape

    def get_output_for(self, mu, **kwargs):
        # One draw per element, with success probability given by mu.
        return self._srng.binomial(size=mu.shape, p=mu, dtype=mu.dtype)
171 | 
172 | 
class BernoulliSampleLayer(lasagne.layers.Layer):
    """
    Bernoulli Sampling layer supporting importance sampling

    For an input of shape (batch_size, num_latent) the output has shape
    (batch_size * eq_samples * iw_samples, num_latent).

    Parameters
    ----------
    mean : class:`Layer` instance
           Parameterizing the mean value of each bernoulli distribution
    eq_samples : int or T.scalar
        Number of Monte Carlo samples drawn for each input row
    iw_samples : int or T.scalar
        Number of importance samples drawn for each Monte Carlo sample
    seed : int or None
        Seed for the random stream. If None (the default) a seed is drawn
        from ``lasagne.random.get_rng()`` when the layer is constructed.
    Methods
    ----------
    seed : Helper function to change the random seed after init is called
    """

    def __init__(self, mean,
                 eq_samples=1,
                 iw_samples=1,
                 seed=None,
                 **kwargs):
        super(BernoulliSampleLayer, self).__init__(mean, **kwargs)

        self.eq_samples = eq_samples
        self.iw_samples = iw_samples

        # Draw the seed at construction time. A default written in the
        # signature is evaluated only once (when the class is defined), which
        # would hand the very same seed to every layer built without one.
        if seed is None:
            seed = lasagne.random.get_rng().randint(1, 2147462579)
        self._srng = RandomStreams(seed)

    def seed(self, seed=None):
        """Re-seed the random stream; draws a fresh seed when `seed` is None."""
        if seed is None:
            seed = lasagne.random.get_rng().randint(1, 2147462579)
        self._srng.seed(seed)

    def get_output_shape_for(self, input_shape):
        batch_size, num_latent = input_shape
        # The leading dimension is only known statically when the batch size
        # and both sample counts are plain Python ints.
        if isinstance(batch_size, int) and \
           isinstance(self.iw_samples, int) and \
           isinstance(self.eq_samples, int):
            out_dim = (batch_size*self.eq_samples*self.iw_samples, num_latent)
        else:
            out_dim = (None, num_latent)
        return out_dim

    def get_output_for(self, input, **kwargs):
        mu = input
        batch_size, num_latent = mu.shape
        shp = (batch_size, self.eq_samples, self.iw_samples, num_latent)
        # Tile the means along the two sample axes so that p matches `shp`.
        mu_shp = mu.dimshuffle(0, 'x', 'x', 1)
        mu_shp = T.repeat(mu_shp, axis=1, repeats=self.eq_samples)
        mu_shp = T.repeat(mu_shp, axis=2, repeats=self.iw_samples)
        samples = self._srng.binomial(
            size=shp, p=mu_shp, dtype=theano.config.floatX)
        # Collapse (batch, eq, iw) into a single leading axis.
        return samples.reshape((-1, num_latent))
226 | 


--------------------------------------------------------------------------------