├── .gitignore ├── 1_novice ├── 0-what-is-pytorch.ipynb ├── 1-automatic-differentiation.ipynb ├── 2-neural-networks.ipynb ├── 3-cifar10.ipynb └── README.md ├── 2_intermediate ├── 1.1-FFN.ipynb ├── 1.2-FFN.ipynb ├── 2.1-CNN.ipynb ├── 2.2-CNN.ipynb ├── 3.1-AE.ipynb ├── 4.1-Sequences.ipynb ├── 4.2-RNN.ipynb ├── 5.1-Kaggle.ipynb ├── 6.1-final_exam_SNLI.ipynb ├── README.md └── data_utils.py ├── 3_advanced └── README.md ├── Dockerfile.cpu ├── Dockerfile.gpu ├── README.md ├── jupyter_run.sh └── static_files ├── autograd-variable.png ├── cifar10.png ├── lstm_cell.png ├── mnist.png └── rnn-unfold.png /.gitignore: -------------------------------------------------------------------------------- 1 | MANIFEST 2 | node_modules 3 | *.py[co] 4 | *.egg-info 5 | *.bak 6 | .ipynb_checkpoints 7 | .tox 8 | .coverage 9 | 10 | *.map 11 | .idea/ 12 | 13 | /.project 14 | /.pydevproject 15 | 16 | build/ 17 | dist/ 18 | **/__pycache__ 19 | **/*.pyc 20 | **/*.so* 21 | **/*.dylib* 22 | 23 | # IPython notebook checkpoints 24 | .ipynb_checkpoints 25 | 26 | # Editor temporaries 27 | *.swn 28 | *.swo 29 | *.swp 30 | *~ 31 | 32 | # macOS dir files 33 | .DS_Store 34 | 35 | # Ninja files 36 | .ninja_deps 37 | .ninja_log 38 | compile_commands.json 39 | 40 | **/data 41 | **/images 42 | **/*.npz 43 | **/*.png 44 | **/*.[jJ][pP]*[eE][gG] 45 | **/.data 46 | **/.vector_cache 47 | **/*.zip 48 | **/*.vec 49 | **/*.pt 50 | **/*.csv 51 | -------------------------------------------------------------------------------- /1_novice/0-what-is-pytorch.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Credits\n", 8 | "\n", 9 | "This is heavily influenced or copied from https://github.com/pytorch/tutorials" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# What is PyTorch?\n", 17 | "\n", 18 | "It’s a Python based scientific computing 
package targeted at two sets of\n", 19 | "audiences:\n", 20 | "- A replacement for numpy to use the power of GPUs\n", 21 | "- a deep learning research platform that provides maximum flexibility\n", 22 | " and speed\n" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "# Getting Started\n", 30 | "\n", 31 | "In this lab you will get a quick start on what pytorch is and how to use it.\n", 32 | "\n", 33 | "## 1. Tensors\n", 34 | "\n", 35 | "Tensors are similar to numpy’s ndarrays, with the addition being that\n", 36 | "Tensors can also be used on a GPU to accelerate computing." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "import torch" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "Construct a 5x3 matrix, uninitialized" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": { 59 | "scrolled": false 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "x = torch.Tensor(5, 3)\n", 64 | "print(x)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "Construct a randomly initialized matrix" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "x = torch.rand(5, 3)\n", 81 | "print(x)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "Get its size" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "print(x.size())" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "**NOTE** `torch.Size` is in fact a tuple, so it supports the same operations" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": 
{}, 111 | "outputs": [], 112 | "source": [ 113 | "x[1:3] = 2\n", 114 | "print(x)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "# Assignment\n", 122 | "\n", 123 | "1. Make a tensor of size (2, 17)\n", 124 | "2. Make a torch.FloatTensor of size (3, 1)\n", 125 | "3. Make a torch.LongTensor of size (5, 2, 1)\n", 126 | " - fill the entire tensor with 7s\n", 127 | "4. Make a torch.ByteTensor of size (5,)\n", 128 | " - fill the middle 3 indices with ones such that it records [0, 1, 1, 1, 0]" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "## 2. Operations\n", 136 | "There are multiple syntaxes for operations. Let's see addition as an example:\n", 137 | "\n", 138 | "### 2.1 Addition: syntax 1" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "y = torch.rand(5, 3)\n", 148 | "print(x + y)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "### 2.2 Addition: syntax 2" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "print(torch.add(x, y))" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "### 2.3 Addition: giving an output tensor" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "result = torch.Tensor(5, 3)\n", 181 | "torch.add(x, y, out=result)\n", 182 | "print(result)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "### 2.4 Addition: in-place\n", 190 | "\n", 191 | "adds `x` to `y`" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199
| "source": [ 200 | "y.add_(x)\n", 201 | "print(y)" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "**NOTE** Any operation that mutates a tensor in-place is post-fixed with an `_`. For example: `x.copy_(y)`, `x.t_()`, will change `x`." 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "You can use standard numpy-like indexing with all bells and whistles!" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "print(x[:, 1])" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "**Read later** 100+ Tensor operations, including transposing, indexing, slicing, mathematical operations, linear algebra, random numbers, etc. are described [here](http://pytorch.org/docs/torch)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "# Assignment\n", 239 | "\n", 240 | "1. multiply two tensors (see [torch.Tensor.mul](http://pytorch.org/docs/master/tensors.html#torch.Tensor.mul))\n", 241 | "2. do the same, but in-place\n", 242 | "3. divide two tensors (see [torch.Tensor.div](http://pytorch.org/docs/master/tensors.html#torch.Tensor.div))\n", 243 | "4. perform a matrix multiplication of two tensors of size (2, 4) and (4, 2)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "## 3.
Numpy Bridge\n", 251 | "\n", 252 | "Converting a torch Tensor to a numpy array and vice versa is a breeze.\n", 253 | "\n", 254 | "The torch Tensor and numpy array will share their underlying memory locations, and changing one will change the other.\n", 255 | "\n", 256 | "### 3.1 Converting torch Tensor to numpy Array" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "a = torch.ones(5)\n", 266 | "print(a)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "b = a.numpy()\n", 276 | "print(b)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "See how the numpy array changed in value." 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "a.add_(1)\n", 293 | "print(a)\n", 294 | "print(b)" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "### 3.2 Converting numpy Array to torch Tensor\n", 302 | "\n", 303 | "See how changing the np array changed the torch Tensor automatically" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "import numpy as np\n", 313 | "a = np.ones(5)\n", 314 | "b = torch.from_numpy(a)\n", 315 | "np.add(a, 1, out=a)\n", 316 | "print(a)\n", 317 | "print(b)" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "# Assignment\n", 325 | "\n", 326 | "1. create a tensor of size (5, 2) containing ones\n", 327 | "2. now convert it to a numpy array\n", 328 | "3. 
now convert it back to a torch tensor" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": {}, 334 | "source": [ 335 | "All the Tensors on the CPU except a CharTensor support converting to NumPy and back.\n", 336 | "\n", 337 | "## 4 CUDA Tensors\n", 338 | "\n", 339 | "Tensors can be moved onto GPU using the `.cuda` function.\n", 340 | "This is not necessary, but check the `README.md` for details on how to use a GPU with docker." 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "# let us run this cell only if CUDA is available\n", 350 | "if torch.cuda.is_available():\n", 351 | " x = x.cuda()\n", 352 | " y = y.cuda()\n", 353 | " z = x + y\n", 354 | " # notice that the tensors are now of type torch.cuda.FloatTensor (notice the cuda in there)\n", 355 | " # this is meant as a tensor to be run on the GPU.\n", 356 | " # the .cuda() does this to any parameter it is applied to\n", 357 | " print(x)\n", 358 | " print(y)\n", 359 | " print(z)" 360 | ] 361 | } 362 | ], 363 | "metadata": { 364 | "kernelspec": { 365 | "display_name": "Python 3", 366 | "language": "python", 367 | "name": "python3" 368 | }, 369 | "language_info": { 370 | "codemirror_mode": { 371 | "name": "ipython", 372 | "version": 3 373 | }, 374 | "file_extension": ".py", 375 | "mimetype": "text/x-python", 376 | "name": "python", 377 | "nbconvert_exporter": "python", 378 | "pygments_lexer": "ipython3", 379 | "version": "3.6.4" 380 | } 381 | }, 382 | "nbformat": 4, 383 | "nbformat_minor": 2 384 | } 385 | -------------------------------------------------------------------------------- /1_novice/1-automatic-differentiation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Credits\n", 8 | "\n", 9 | "This is heavily influenced or copied from 
https://github.com/pytorch/tutorials" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Autograd: automatic differentiation\n", 17 | "\n", 18 | "Central to all neural networks in PyTorch is the ``autograd`` package.\n", 19 | "Let’s first briefly visit this, and we will then go to training our first neural network.\n", 20 | "\n", 21 | "The `autograd` package provides automatic differentiation for all operations on Tensors.\n", 22 | "It is a define-by-run framework, which means that your backprop is defined by how your code is run, and that every single iteration can be different.\n", 23 | "\n", 24 | "Let us see this in more simple terms with some examples." 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## 1. Variable\n", 32 | "\n", 33 | "`autograd.Variable` is the central class of the package. It wraps a Tensor, and supports nearly all of operations defined on it.\n", 34 | "Once you finish your computation you can call `.backward()` and have all the gradients computed automatically.\n", 35 | "\n", 36 | "You can access the raw tensor through the `.data` attribute, while the gradient w.r.t. this variable is accumulated into `.grad`.\n", 37 | "\n", 38 | "![autograd.Variable](../static_files/autograd-variable.png)\n", 39 | "\n", 40 | "There’s one more class which is very important for autograd implementation - a `Function`.\n", 41 | "\n", 42 | "`Variable` and `Function` are interconnected and build up an acyclic graph, that encodes a complete history of computation. Each variable has a `.grad_fn` attribute that references a `Function` that has created the `Variable` (except for `Variable`s created by the user - their `grad_fn` is `None`).\n", 43 | "\n", 44 | "If you want to compute the derivatives, you can call `.backward()` on a `Variable`. If `Variable` is a scalar (i.e. 
it holds a one element data), you don’t need to specify any arguments to `backward()`, however if it has more elements, you need to specify a `grad_output` argument that is a tensor of matching shape." 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "import torch\n", 54 | "from torch.autograd import Variable" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "Create a variable" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "x = Variable(torch.ones(2, 2), requires_grad=True)\n", 71 | "print(x)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "Do an operation of variable:" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "y = x + 2\n", 88 | "print(y)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "`y` was created as a result of an operation, so it has a `grad_fn`." 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "print(y.grad_fn)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "Do more operations on y" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "z = y * y * 3\n", 121 | "out = z.mean()\n", 122 | "\n", 123 | "print(z, out)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "# Assignments\n", 131 | "\n", 132 | "1. Create a Variable that `requires_grad` containing a tensor of size (5, 5)\n", 133 | "2. 
Sum the values in the Variable" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "## 2. Gradients\n", 141 | "\n", 142 | "Let’s backprop now. `out.backward()` is equivalent to doing `out.backward(torch.Tensor([1.0]))`" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "out.backward()" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "Print gradients d(out)/dx" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "print(x.grad)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "You should have got a matrix of `4.5`. Let’s call the `out` *Variable* $o$.\n", 175 | "\n", 176 | "We have that $o = \\frac{1}{4}\\sum_i z_i$,\n", 177 | "$z_i = 3(x_i+2)^2$ and $z_i\\bigr\\rvert_{x_i=1} = 27$.\n", 178 | "\n", 179 | "Therefore, $\\frac{\\partial o}{\\partial x_i} = \\frac{3}{2}(x_i+2)$,\n", 180 | "hence $\\frac{\\partial o}{\\partial x_i}\\bigr\\rvert_{x_i=1} = \\frac{9}{2} = 4.5$.\n", 181 | "\n", 182 | "You can do many crazy things with autograd!"
183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "x = torch.randn(3)\n", 192 | "x = Variable(x, requires_grad=True)\n", 193 | "\n", 194 | "y = x * 2\n", 195 | "while y.data.norm() < 1000:\n", 196 | " y = y * 2\n", 197 | "\n", 198 | "print(y)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "gradients = torch.FloatTensor([0.1, 1.0, 0.0001])\n", 208 | "y.backward(gradients)\n", 209 | "\n", 210 | "print(x.grad)" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "**Read later**\n", 218 | "Documentation of `Variable` and `Function` is at http://pytorch.org/docs/autograd" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "# Assignments\n", 226 | "\n", 227 | "1. Define a tensor\n", 228 | "2. Convert the tensor to a torch.Variable that requires_grad\n", 229 | "3. Multiply the torch.Variable with 2 and assign the result to a new python variable (i.e. `x = result`)\n", 230 | "4. Sum the variable's elements and assign to a new python variable\n", 231 | "5. Print the gradients of all the variables\n", 232 | "6. Now perform a backward pass on the last variable\n", 233 | "7. Print all gradients again\n", 234 | " - what did you notice?" 
235 | ] 236 | } 237 | ], 238 | "metadata": { 239 | "kernelspec": { 240 | "display_name": "Python 3", 241 | "language": "python", 242 | "name": "python3" 243 | }, 244 | "language_info": { 245 | "codemirror_mode": { 246 | "name": "ipython", 247 | "version": 3 248 | }, 249 | "file_extension": ".py", 250 | "mimetype": "text/x-python", 251 | "name": "python", 252 | "nbconvert_exporter": "python", 253 | "pygments_lexer": "ipython3", 254 | "version": "3.6.4" 255 | } 256 | }, 257 | "nbformat": 4, 258 | "nbformat_minor": 2 259 | } 260 | -------------------------------------------------------------------------------- /1_novice/2-neural-networks.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Credits\n", 8 | "\n", 9 | "This is heavily influenced or copied from https://github.com/pytorch/tutorials" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import torch\n", 19 | "from torch.autograd import Variable\n", 20 | "import torch.nn as nn\n", 21 | "import torch.nn.functional as F" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "# Neural Networks\n", 29 | "\n", 30 | "Neural networks can be constructed using the ``torch.nn`` package.\n", 31 | "\n", 32 | "Now that you had a glimpse of ``autograd``, ``nn`` depends on\n", 33 | "``autograd`` to define models and differentiate them.\n", 34 | "An ``nn.Module`` contains layers, and a method ``forward(input)`` that\n", 35 | "returns the ``output``.\n", 36 | "\n", 37 | "For example, look at this network that classifies digit images:\n", 38 | "\n", 39 | "![convnet](../static_files/mnist.png)\n", 40 | "\n", 41 | "It is a simple feed-forward network.
It takes the input, feeds it\n", 42 | "through several layers one after the other, and then finally gives the\n", 43 | "output.\n", 44 | "\n", 45 | "A typical training procedure for a neural network is as follows:\n", 46 | "\n", 47 | "- Define the neural network that has some learnable parameters (or\n", 48 | " weights)\n", 49 | "- Iterate over a dataset of inputs\n", 50 | "- Process input through the network\n", 51 | "- Compute the loss (how far is the output from being correct)\n", 52 | "- Propagate gradients back into the network’s parameters\n", 53 | "- Update the weights of the network, typically using a simple update rule:\n", 54 | "\n", 55 | " ``weight = weight - learning_rate * gradient``\n", 56 | "\n", 57 | "## 1. Define the network\n", 58 | "\n", 59 | "Let’s define this network:" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "class Net(nn.Module):\n", 69 | "\n", 70 | " def __init__(self):\n", 71 | " super(Net, self).__init__()\n", 72 | " # 1 input image channel, 6 output channels, 5x5 square convolution\n", 73 | " # kernel\n", 74 | " self.conv1 = nn.Conv2d(1, 6, 5)\n", 75 | " self.conv2 = nn.Conv2d(6, 16, 5)\n", 76 | " # an affine operation: y = Wx + b\n", 77 | " self.fc1 = nn.Linear(16 * 5 * 5, 120)\n", 78 | " self.fc2 = nn.Linear(120, 84)\n", 79 | " self.fc3 = nn.Linear(84, 10)\n", 80 | "\n", 81 | " def forward(self, x):\n", 82 | " # Max pooling over a (2, 2) window\n", 83 | " x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))\n", 84 | " # If the size is a square you can only specify a single number\n", 85 | " x = F.max_pool2d(F.relu(self.conv2(x)), 2)\n", 86 | " x = x.view(-1, self.num_flat_features(x))\n", 87 | " x = F.relu(self.fc1(x))\n", 88 | " x = F.relu(self.fc2(x))\n", 89 | " x = self.fc3(x)\n", 90 | " return x\n", 91 | "\n", 92 | " def num_flat_features(self, x):\n", 93 | " size = x.size()[1:] # all dimensions except the batch dimension\n", 94 | " 
num_features = 1\n", 95 | " for s in size:\n", 96 | " num_features *= s\n", 97 | " return num_features\n", 98 | "\n", 99 | "\n", 100 | "net = Net()\n", 101 | "print(net)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "You just have to define the ``forward`` function, and the ``backward``\n", 109 | "function (where gradients are computed) is automatically defined for you\n", 110 | "using ``autograd``.\n", 111 | "You can use any of the Tensor operations in the ``forward`` function.\n", 112 | "\n", 113 | "The learnable parameters of a model are returned by ``net.parameters()``\n", 114 | "\n", 115 | "**Note** the ``.named_parameters()`` method also gives the names of each parameter (useful for debugging later)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "params = list(net.parameters())\n", 125 | "print(len(params))\n", 126 | "print(params[0].size()) # conv1's .weight" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "The input to the forward is an ``autograd.Variable``, and so is the output.\n", 134 | "Note: Expected input size to this net(LeNet) is 32x32. 
To use this net on\n", 135 | "the MNIST dataset, please resize the images from the dataset to 32x32.\n", 136 | "\n" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "input = Variable(torch.randn(1, 1, 32, 32))\n", 146 | "out = net(input)\n", 147 | "print(out)" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "Zero the gradient buffers of all parameters and backprop with random\n", 155 | "gradients:\n", 156 | "\n" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "net.zero_grad()\n", 166 | "out.backward(torch.randn(1, 10))" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "
\n", 174 | "

Note

\n", 175 | "

\n", 176 | " ``torch.nn`` only supports mini-batches The entire ``torch.nn``\n", 177 | " package only supports inputs that are a mini-batch of samples, and not a single sample.\n", 178 | "\n", 179 | " For example, ``nn.Conv2d`` will take in a 4D Tensor of\n", 180 | " ``nSamples x nChannels x Height x Width``.\n", 181 | "\n", 182 | " If you have a single sample, just use ``input.unsqueeze(0)`` to add\n", 183 | " a fake batch dimension.\n", 184 | "

\n", 185 | "
\n", 186 | "\n", 187 | "Before proceeding further, let's recap all the classes you’ve seen so far.\n", 188 | "\n", 189 | "**Recap:**\n", 190 | " - ``torch.Tensor`` - A *multi-dimensional array*.\n", 191 | " - ``autograd.Variable`` - *Wraps a Tensor and records the history of\n", 192 | " operations* applied to it. Has the same API as a ``Tensor``, with\n", 193 | " some additions like ``backward()``. Also *holds the gradient*\n", 194 | " w.r.t. the tensor.\n", 195 | " - ``nn.Module`` - Neural network module. *Convenient way of\n", 196 | " encapsulating parameters*, with helpers for moving them to GPU,\n", 197 | " exporting, loading, etc.\n", 198 | " - ``nn.Parameter`` - A kind of Variable, that is *automatically\n", 199 | " registered as a parameter when assigned as an attribute to a*\n", 200 | " ``Module``.\n", 201 | " - ``autograd.Function`` - Implements *forward and backward definitions\n", 202 | " of an autograd operation*. Every ``Variable`` operation, creates at\n", 203 | " least a single ``Function`` node, that connects to functions that\n", 204 | " created a ``Variable`` and *encodes its history*.\n", 205 | "\n", 206 | "**At this point, we covered:**\n", 207 | " - Defining a neural network\n", 208 | " - Processing inputs and calling backward.\n", 209 | "\n", 210 | "**Still Left:**\n", 211 | " - Computing the loss\n", 212 | " - Updating the weights of the network\n", 213 | "\n", 214 | "## 2. 
Loss Function\n", 215 | "\n", 216 | "A loss function takes the (output, target) pair of inputs, and computes a value that estimates how far away the output is from the target.\n", 217 | "\n", 218 | "There are several different [loss functions](http://pytorch.org/docs/master/nn.html#loss-functions) under the `nn` package.\n", 219 | "\n", 220 | "A simple loss is: ``nn.MSELoss`` which computes the mean-squared error\n", 221 | "between the input and the target.\n", 222 | "\n", 223 | "For example:" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "output = net(input)\n", 233 | "target = Variable(torch.arange(1, 11)) # a dummy target, for example\n", 234 | "criterion = nn.MSELoss()\n", 235 | "\n", 236 | "loss = criterion(output, target)\n", 237 | "print(loss)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "Now, if you follow ``loss`` in the backward direction, using its\n", 245 | "``.grad_fn`` attribute, you will see a graph of computations that looks\n", 246 | "like this:\n", 247 | "\n", 248 | "```\n", 249 | "input -> conv2d -> relu -> maxpool2d -> conv2d -> relu -> maxpool2d\n", 250 | " -> view -> linear -> relu -> linear -> relu -> linear\n", 251 | " -> MSELoss\n", 252 | " -> loss\n", 253 | "```\n", 254 | "\n", 255 | "So, when we call ``loss.backward()``, the whole graph is differentiated\n", 256 | "w.r.t.
the loss, and all Variables in the graph will have their\n", 257 | "``.grad`` Variable accumulated with the gradient.\n", 258 | "\n", 259 | "For illustration, let us follow a few steps backward:\n", 260 | "\n" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "print(loss.grad_fn) # MSELoss\n", 270 | "print(loss.grad_fn.next_functions[0][0]) # Linear\n", 271 | "print(loss.grad_fn.next_functions[0][0].next_functions[0][0]) # ReLU" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "## 3. Backprop\n", 279 | "\n", 280 | "To backpropagate the error all we have to do is to ``loss.backward()``.\n", 281 | "You need to clear the existing gradients though, else gradients will be\n", 282 | "accumulated to existing gradients\n", 283 | "\n", 284 | "Now we shall call ``loss.backward()``, and have a look at conv1's bias\n", 285 | "gradients before and after the backward." 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "net.zero_grad() # zeroes the gradient buffers of all parameters\n", 295 | "\n", 296 | "print('conv1.bias.grad before backward')\n", 297 | "print(net.conv1.bias.grad)\n", 298 | "\n", 299 | "loss.backward()\n", 300 | "\n", 301 | "print('conv1.bias.grad after backward')\n", 302 | "print(net.conv1.bias.grad)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "Now, we have seen how to use loss functions.\n", 310 | "\n", 311 | "**Read later:**\n", 312 | "\n", 313 | "The neural network package contains various modules and loss functions that form the building blocks of deep neural networks. 
A full list with documentation is [here](http://pytorch.org/docs/nn)\n", 314 | "\n", 315 | "**The only thing left to learn is:**\n", 316 | "\n", 317 | " - updating the weights of the network\n", 318 | "\n", 319 | "## 3.1 Update the weights\n", 320 | "\n", 321 | "The simplest update rule used in practice is the Stochastic Gradient Descent (SGD):\n", 322 | "\n", 323 | " ``weight = weight - learning_rate * gradient``\n", 324 | "\n", 325 | "We can implement this using simple python code:\n", 326 | "\n", 327 | "```\n", 328 | " learning_rate = 0.01\n", 329 | " for f in net.parameters():\n", 330 | " f.data.sub_(f.grad.data * learning_rate)\n", 331 | "```\n", 332 | "\n", 333 | "However, as you use neural networks, you want to use various different\n", 334 | "update rules such as SGD, Nesterov-SGD, Adam, RMSProp, etc.\n", 335 | "To enable this, we built a small package: ``torch.optim`` that\n", 336 | "implements all these methods. Using it is very straight forward:" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": null, 342 | "metadata": {}, 343 | "outputs": [], 344 | "source": [ 345 | "import torch.optim as optim\n", 346 | "\n", 347 | "# create your optimizer\n", 348 | "optimizer = optim.SGD(net.parameters(), lr=0.01)\n", 349 | "\n", 350 | "# in your training loop:\n", 351 | "optimizer.zero_grad() # zero the gradient buffers\n", 352 | "output = net(input)\n", 353 | "loss = criterion(output, target)\n", 354 | "loss.backward()\n", 355 | "optimizer.step() # Does the update" 356 | ] 357 | } 358 | ], 359 | "metadata": { 360 | "kernelspec": { 361 | "display_name": "Python 3", 362 | "language": "python", 363 | "name": "python3" 364 | }, 365 | "language_info": { 366 | "codemirror_mode": { 367 | "name": "ipython", 368 | "version": 3 369 | }, 370 | "file_extension": ".py", 371 | "mimetype": "text/x-python", 372 | "name": "python", 373 | "nbconvert_exporter": "python", 374 | "pygments_lexer": "ipython3", 375 | "version": "3.6.4" 376 | } 377 | }, 378 | 
"nbformat": 4, 379 | "nbformat_minor": 1 380 | } 381 | -------------------------------------------------------------------------------- /1_novice/3-cifar10.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Credits\n", 8 | "\n", 9 | "This is heavily influenced or copied from https://github.com/pytorch/tutorials" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Training a classifier\n", 17 | "\n", 18 | "This is it. You have seen how to define neural networks, compute loss and make\n", 19 | "updates to the weights of the network.\n", 20 | "\n", 21 | "Now you might be thinking,\n", 22 | "\n", 23 | "## What about data?\n", 24 | "\n", 25 | "Generally, when you have to deal with image, text, audio or video data,\n", 26 | "you can use standard python packages that load data into a numpy array.\n", 27 | "Then you can convert this array into a ``torch.*Tensor``.\n", 28 | "\n", 29 | "- For images, packages such as Pillow, OpenCV are useful.\n", 30 | "- For audio, packages such as scipy and librosa\n", 31 | "- For text, either raw Python or Cython based loading, or NLTK and\n", 32 | " SpaCy are useful.\n", 33 | "\n", 34 | "Specifically for ``vision``, we have created a package called\n", 35 | "``torchvision``, that has data loaders for common datasets such as\n", 36 | "Imagenet, CIFAR10, MNIST, etc. and data transformers for images, viz.,\n", 37 | "``torchvision.datasets`` and ``torch.utils.data.DataLoader``.\n", 38 | "\n", 39 | "This provides a huge convenience and avoids writing boilerplate code.\n", 40 | "\n", 41 | "For this tutorial, we will use the CIFAR10 dataset.\n", 42 | "It has the classes: ‘airplane’, ‘automobile’, ‘bird’, ‘cat’, ‘deer’,\n", 43 | "‘dog’, ‘frog’, ‘horse’, ‘ship’, ‘truck’. The images in CIFAR-10 are of\n", 44 | "size 3x32x32, i.e. 
3-channel color images of 32x32 pixels in size.\n", 45 | "\n", 46 | "![cifar10](../static_files/cifar10.png)\n", 47 | "\n", 48 | "\n", 49 | "## Training an image classifier\n", 50 | "\n", 51 | "We will do the following steps in order:\n", 52 | "\n", 53 | "1. Load and normalizing the CIFAR10 training and test datasets using\n", 54 | " ``torchvision``\n", 55 | "2. Define a Convolution Neural Network\n", 56 | "3. Define a loss function\n", 57 | "4. Train the network on the training data\n", 58 | "5. Test the network on the test data\n", 59 | "\n", 60 | "## 1. Loading and normalizing CIFAR10\n", 61 | "\n", 62 | "Using ``torchvision``, it’s extremely easy to load CIFAR10." 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "import torch\n", 72 | "import torchvision\n", 73 | "import torchvision.transforms as transforms" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "The output of torchvision datasets are PILImage images of range [0, 1].\n", 81 | "We transform them to Tensors of normalized range [-1, 1]\n", 82 | "\n" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "transform = transforms.Compose(\n", 92 | " [transforms.ToTensor(),\n", 93 | " transforms.Normalize((0.5, 0.5, 0.5),\n", 94 | " (0.5, 0.5, 0.5))]\n", 95 | ")\n", 96 | "\n", 97 | "trainset = torchvision.datasets.CIFAR10(root='./data', train=True,\n", 98 | " download=True, transform=transform)\n", 99 | "trainloader = torch.utils.data.DataLoader(trainset, batch_size=4,\n", 100 | " shuffle=True, num_workers=2)\n", 101 | "\n", 102 | "testset = torchvision.datasets.CIFAR10(root='./data', train=False,\n", 103 | " download=True, transform=transform)\n", 104 | "testloader = torch.utils.data.DataLoader(testset, batch_size=4,\n", 105 | " shuffle=False, num_workers=2)\n", 106 | "\n", 107 | 
"classes = ('plane', 'car', 'bird', 'cat', 'deer',\n", 108 | " 'dog', 'frog', 'horse', 'ship', 'truck')" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "Let us show some of the training images, for fun.\n", 116 | "\n" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "%matplotlib inline\n", 126 | "import matplotlib.pyplot as plt\n", 127 | "import numpy as np\n", 128 | "\n", 129 | "def imshow(img):\n", 130 | " \"\"\" show an image \"\"\"\n", 131 | " img = img / 2 + 0.5 # unnormalize\n", 132 | " npimg = img.numpy()\n", 133 | " plt.imshow(np.transpose(npimg, (1, 2, 0)))\n", 134 | "\n", 135 | "# get some random training images\n", 136 | "dataiter = iter(trainloader)\n", 137 | "images, labels = dataiter.next()\n", 138 | "\n", 139 | "# show images\n", 140 | "imshow(torchvision.utils.make_grid(images))\n", 141 | "# print labels\n", 142 | "print(' '.join('%5s' % classes[labels[j]] for j in range(4)))" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "## 2. Define a Convolution Neural Network\n", 150 | "\n", 151 | "Copy the neural network from the Neural Networks section before and modify it to take 3-channel images (instead of 1-channel images as it was defined)." 
152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "from torch.autograd import Variable\n", 161 | "import torch.nn as nn\n", 162 | "import torch.nn.functional as F\n", 163 | "\n", 164 | "\n", 165 | "class Net(nn.Module):\n", 166 | " def __init__(self):\n", 167 | " super(Net, self).__init__()\n", 168 | " self.conv1 = nn.Conv2d(3, 6, 5)\n", 169 | " self.pool = nn.MaxPool2d(2, 2)\n", 170 | " self.conv2 = nn.Conv2d(6, 16, 5)\n", 171 | " self.fc1 = nn.Linear(16 * 5 * 5, 120)\n", 172 | " self.fc2 = nn.Linear(120, 84)\n", 173 | " self.fc3 = nn.Linear(84, 10)\n", 174 | "\n", 175 | " def forward(self, x):\n", 176 | " x = self.pool(F.relu(self.conv1(x)))\n", 177 | " x = self.pool(F.relu(self.conv2(x)))\n", 178 | " x = x.view(-1, 16 * 5 * 5)\n", 179 | " x = F.relu(self.fc1(x))\n", 180 | " x = F.relu(self.fc2(x))\n", 181 | " x = self.fc3(x)\n", 182 | " return x\n", 183 | "\n", 184 | "\n", 185 | "net = Net()\n", 186 | "print(net)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "## 3. Define a Loss function and optimizer\n", 194 | "\n", 195 | "Let's use a Classification Cross-Entropy loss and SGD with momentum" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "import torch.optim as optim\n", 205 | "\n", 206 | "criterion = nn.CrossEntropyLoss()\n", 207 | "optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "## 4. 
Train the network\n", 215 | "\n", 216 | "This is when things start to get interesting.\n", 217 | "We simply have to loop over our data iterator, and feed the inputs to the network and optimize" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "for epoch in range(2): # loop over the dataset multiple times\n", 227 | "\n", 228 | " running_loss = 0.0\n", 229 | " for i, data in enumerate(trainloader, 0):\n", 230 | " # get the inputs\n", 231 | " inputs, labels = data\n", 232 | "\n", 233 | " # wrap them in Variable\n", 234 | " inputs, labels = Variable(inputs), Variable(labels)\n", 235 | "\n", 236 | " # zero the parameter gradients\n", 237 | " optimizer.zero_grad()\n", 238 | "\n", 239 | " # forward + backward + optimize\n", 240 | " outputs = net(inputs)\n", 241 | " loss = criterion(outputs, labels)\n", 242 | " loss.backward()\n", 243 | " optimizer.step()\n", 244 | "\n", 245 | " # print statistics\n", 246 | " running_loss += loss.data[0]\n", 247 | " if i % 2000 == 1999: # print every 2000 mini-batches\n", 248 | " print('[%d, %5d] loss: %.3f' %\n", 249 | " (epoch + 1, i + 1, running_loss / 2000))\n", 250 | " running_loss = 0.0\n", 251 | "\n", 252 | "print('Finished Training')" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "## 5. Test the network on the test data\n", 260 | "\n", 261 | "We have trained the network for 2 passes over the training dataset.\n", 262 | "But we need to check if the network has learnt anything at all.\n", 263 | "\n", 264 | "We will check this by predicting the class label that the neural network outputs, and checking it against the ground-truth. If the prediction is correct, we add the sample to the list of correct predictions.\n", 265 | "\n", 266 | "Okay, first step. Let us display an image from the test set to get familiar." 
267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "dataiter = iter(testloader)\n", 276 | "images, labels = dataiter.next()\n", 277 | "\n", 278 | "# print images\n", 279 | "imshow(torchvision.utils.make_grid(images))\n", 280 | "print('GroundTruth: ', ' '.join('%5s' % classes[labels[j]] for j in range(4)))" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": {}, 286 | "source": [ 287 | "Okay, now let us see what the neural network thinks these examples above are:\n", 288 | "\n" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [ 297 | "outputs = net(Variable(images))\n", 298 | "print(outputs)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "The outputs are energies for the 10 classes.\n", 306 | "Higher the energy for a class, the more the network\n", 307 | "thinks that the image is of the particular class.\n", 308 | "So, let's get the index of the highest energy:\n", 309 | "\n" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "_, predicted = torch.max(outputs.data, 1)\n", 319 | "\n", 320 | "print('Predicted: ', ' '.join('%5s' % classes[predicted[j]] for j in range(4)))" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": {}, 326 | "source": [ 327 | "The results seem pretty good.\n", 328 | "\n", 329 | "Let us look at how the network performs on the whole dataset.\n", 330 | "\n" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "correct = 0\n", 340 | "total = 0\n", 341 | "for data in testloader:\n", 342 | " images, labels = data\n", 343 | " outputs = net(Variable(images))\n", 344 | " _, 
predicted = torch.max(outputs.data, 1)\n", 345 | " total += labels.size(0)\n", 346 | " correct += (predicted == labels).sum()\n", 347 | "\n", 348 | "print('Accuracy of the network on the 10000 test images: %d %%' % (\n", 349 | " 100 * correct / total))" 350 | ] 351 | }, 352 | { 353 | "cell_type": "markdown", 354 | "metadata": {}, 355 | "source": [ 356 | "That looks waaay better than chance, which is 10% accuracy (randomly picking\n", 357 | "a class out of 10 classes).\n", 358 | "Seems like the network learnt something.\n", 359 | "\n", 360 | "Hmmm, what are the classes that performed well, and the classes that did\n", 361 | "not perform well:\n", 362 | "\n" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [ 371 | "class_correct = list(0. for i in range(10))\n", 372 | "class_total = list(0. for i in range(10))\n", 373 | "for data in testloader:\n", 374 | " images, labels = data\n", 375 | " outputs = net(Variable(images))\n", 376 | " _, predicted = torch.max(outputs.data, 1)\n", 377 | " c = (predicted == labels).squeeze()\n", 378 | " for i in range(4):\n", 379 | " label = labels[i]\n", 380 | " class_correct[label] += c[i]\n", 381 | " class_total[label] += 1\n", 382 | "\n", 383 | "\n", 384 | "for i in range(10):\n", 385 | " print('Accuracy of %5s : %2d %%' % (\n", 386 | " classes[i], 100 * class_correct[i] / class_total[i]))" 387 | ] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "metadata": {}, 392 | "source": [ 393 | "Okay, so what next?\n", 394 | "\n", 395 | "How do we run these neural networks on the GPU?\n", 396 | "\n", 397 | "## 6. 
Training on GPU\n", 398 | "\n", 399 | "Just like how you transfer a Tensor on to the GPU, you transfer the neural\n", 400 | "net onto the GPU.\n", 401 | "This will recursively go over all modules and convert their parameters and\n", 402 | "buffers to CUDA tensors:\n", 403 | "\n", 404 | "```\n", 405 | " net.cuda()\n", 406 | "```\n", 407 | "\n", 408 | "Remember that you will have to send the inputs and targets at every step\n", 409 | "to the GPU too:\n", 410 | "\n", 411 | "```\n", 412 | " inputs, labels = Variable(inputs.cuda()), Variable(labels.cuda())\n", 413 | "```\n", 414 | "\n", 415 | "Why dont I notice MASSIVE speedup compared to CPU? Because your network\n", 416 | "is realllly small.\n", 417 | "\n", 418 | "**Exercise:** Try increasing the width of your network (argument 2 of\n", 419 | "the first ``nn.Conv2d``, and argument 1 of the second ``nn.Conv2d`` –\n", 420 | "they need to be the same number), see what kind of speedup you get.\n", 421 | "\n", 422 | "**Goals achieved**:\n", 423 | "\n", 424 | "- Understanding PyTorch's Tensor library and neural networks at a high level.\n", 425 | "- Train a small neural network to classify images\n", 426 | "\n", 427 | "### 6.1 Training on multiple GPUs\n", 428 | "\n", 429 | "If you want to see even more MASSIVE speedup using all of your GPUs, check out [the data parallel example](http://pytorch.org/tutorials/beginner/former_torchies/parallelism_tutorial.html).\n", 430 | "\n", 431 | "## 7. 
Additional content\n", 432 | "\n", 433 | "Our tutorial will continue in the next folder.\n", 434 | "If you're interested in some PyTorch codebases check out the following links (reinforcement learning, GANs, ResNet, etc).\n", 435 | "\n", 436 | "- [Train neural nets to play video games](http://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html)\n", 437 | "- [Train a state-of-the-art ResNet network on imagenet](https://github.com/pytorch/examples/tree/master/imagenet)\n", 438 | "- [Train a face generator using Generative Adversarial Networks](https://github.com/pytorch/examples/tree/master/dcgan)\n", 439 | "- [Train a word-level language model using Recurrent LSTM networks](https://github.com/pytorch/examples/tree/master/word_language_model)\n", 440 | "- [More examples](https://github.com/pytorch/examples)\n", 441 | "- [More tutorials](https://github.com/pytorch/tutorials)\n", 442 | "- [Discuss PyTorch on the Forums](https://discuss.pytorch.org/)\n", 443 | "- [Chat with other users on Slack](http://pytorch.slack.com/messages/beginner/)" 444 | ] 445 | } 446 | ], 447 | "metadata": { 448 | "kernelspec": { 449 | "display_name": "Python 3", 450 | "language": "python", 451 | "name": "python3" 452 | }, 453 | "language_info": { 454 | "codemirror_mode": { 455 | "name": "ipython", 456 | "version": 3 457 | }, 458 | "file_extension": ".py", 459 | "mimetype": "text/x-python", 460 | "name": "python", 461 | "nbconvert_exporter": "python", 462 | "pygments_lexer": "ipython3", 463 | "version": "3.6.4" 464 | } 465 | }, 466 | "nbformat": 4, 467 | "nbformat_minor": 1 468 | } 469 | -------------------------------------------------------------------------------- /1_novice/README.md: -------------------------------------------------------------------------------- 1 | # Credits 2 | 3 | This is a remix from https://github.com/pytorch/tutorials 4 | 5 | # Content 6 | 7 | These are the first steps of the PyTorch tutorial.
8 | The purpose is to get the audience familiar with PyTorch. 9 | 10 | ## Lab 0: 0-what-is-pytorch 11 | 12 | A brief introduction to the basics of PyTorch. 13 | 14 | **NOTE** In the last part of this lab cuda is used. If you have a cuda enabled machine, read the README.md in the root of this repo on how to use nvidia-docker. 15 | 16 | ## Lab 1: 1-automatic-differentiation 17 | 18 | An introduction to automatic differentiation in PyTorch. 19 | 20 | ## Lab 2: 2-neural-networks 21 | 22 | An introduction to building a simple neural network with PyTorch. 23 | 24 | ### Reading material 25 | 26 | Requires fundamental understanding of regression, classification, and neural networks. 27 | If needed then check out week 1 through 6 in [Andrew Ng's machine learning course](https://www.coursera.org/learn/machine-learning). 28 | 29 | ## Lab 3: 3-cifar10 30 | 31 | An introduction to data loading, image classification, and training with PyTorch. 32 | 33 | ### Reading material 34 | 35 | Requires understanding of convolutional neural networks. 36 | If needed then check out [Stanford's cs231 lecture 6](https://www.youtube.com/watch?v=bNb2fEVKeEo&list=PL3FW7Lu3i5JvHM8ljYj-zLfQRF3EO8sYv). -------------------------------------------------------------------------------- /2_intermediate/1.1-FFN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Credits\n", 8 | "\n", 9 | "See main readme for credits." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Dependencies and supporting functions\n", 17 | "Loading dependencies and supporting functions by running the code block below."
18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "%matplotlib inline\n", 27 | "import matplotlib\n", 28 | "import numpy as np\n", 29 | "import matplotlib.pyplot as plt\n", 30 | "import sklearn.datasets\n", 31 | "\n", 32 | "# Do not worry about the code below for now, it is used for plotting later\n", 33 | "def plot_decision_boundary(pred_func, X, y):\n", 34 | " #from https://github.com/dennybritz/nn-from-scratch/blob/master/nn-from-scratch.ipynb\n", 35 | " # Set min and max values and give it some padding\n", 36 | " x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5\n", 37 | " y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5\n", 38 | " \n", 39 | " h = 0.01\n", 40 | " # Generate a grid of points with distance h between them\n", 41 | " xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n", 42 | " \n", 43 | " yy = yy.astype('float32')\n", 44 | " xx = xx.astype('float32')\n", 45 | " # Predict the function value for the whole gid\n", 46 | " Z = pred_func(np.c_[xx.ravel(), yy.ravel()])[:,0]\n", 47 | " Z = Z.reshape(xx.shape)\n", 48 | " # Plot the contour and training examples\n", 49 | " plt.figure()\n", 50 | " plt.contourf(xx, yy, Z, cmap=plt.cm.RdBu)\n", 51 | " plt.scatter(X[:, 0], X[:, 1], c=-y, cmap=plt.cm.Spectral)\n", 52 | "\n", 53 | "def onehot(t, num_classes):\n", 54 | " out = np.zeros((t.shape[0], num_classes))\n", 55 | " for row, col in enumerate(t):\n", 56 | " out[row, col] = 1\n", 57 | " return out" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "# Purpose and goals\n", 65 | "In this notebook you will implement a simple neural network in PyTorch.\n", 66 | "\n", 67 | "> We assume that you are already familiar with backpropagation (if not please see [Andrej Karpathy](http://cs.stanford.edu/people/karpathy/) or [Michal Nielsen](http://neuralnetworksanddeeplearning.com/chap2.html)).\n", 68 | 
"\n", 69 | "In this exercise we'll start right away by defining a logistic regression model.\n", 70 | "We'll initially start with a simple 2D and two-class classification problem where the class decision boundary can be visualized.\n", 71 | "Initially we show that logistic regression can only separate classes linearly.\n", 72 | "Adding a nonlinear hidden layer to the algorithm permits nonlinear class separation.\n", 73 | "If time permits we'll continue on to implement a fully connected neural network to classify the MNIST dataset consisting of images of hand written digits." 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": { 79 | "collapsed": true 80 | }, 81 | "source": [ 82 | "# Problem \n", 83 | "We'll initially demonstrate that Multi-layer Perceptrons (MLPs) can classify nonlinear problems, whereas simple logistic regression cannot. For ease of visualization and computational speed we initially experiment on the simple 2D half-moon dataset." 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "# Generate a dataset and plot it\n", 93 | "np.random.seed(0)\n", 94 | "num_samples = 300\n", 95 | "\n", 96 | "X, y = sklearn.datasets.make_moons(num_samples, noise=0.20)\n", 97 | "\n", 98 | "# define train, validation, and test sets\n", 99 | "X_tr = X[:100].astype('float32')\n", 100 | "X_val = X[100:200].astype('float32')\n", 101 | "X_te = X[200:].astype('float32')\n", 102 | "\n", 103 | "# and labels\n", 104 | "y_tr = y[:100].astype('int32')\n", 105 | "y_val = y[100:200].astype('int32')\n", 106 | "y_te = y[200:].astype('int32')\n", 107 | "\n", 108 | "plt.scatter(X_tr[:,0], X_tr[:,1], s=40, c=y_tr, cmap=plt.cm.BuGn)\n", 109 | "\n", 110 | "print(X.shape, y.shape)\n", 111 | "\n", 112 | "num_features = X_tr.shape[-1]\n", 113 | "num_output = 2" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "# From Logistic
Regression to \"Deep Learning\"\n", 121 | "The code implements logistic regression. In section [__Assignments Half Moon__](#Assignments-Half-Moon) you are asked to modify the code into a neural network.\n", 122 | "\n", 123 | "The building blocks of PyTorch are Tensors, Variables and Operations, with these we can form dynamic computational graphs that form neural networks.\n", 124 | "\n", 125 | "$$y = \\mathrm{nonlinearity}(xW + b)$$\n", 126 | "\n", 127 | "where $x$ is the input tensor, $y$ is the output tensor, and $W, b$ are the weights (variable tensors). The weights are initialized with an initializer of our choice.\n", 128 | "\n", 129 | "- $x$ has shape `[batch_size, num_features]`,\n", 130 | "- $W$ has shape `[num_features, num_units]`,\n", 131 | "- $b$ has `[num_units]`, and\n", 132 | "- $y$ has then `[batch_size, num_units]`" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "## PyTorch 101\n", 140 | "\n", 141 | "In this first exercise we will use basic PyTorch functions so that you can learn how to build it from scratch. This will help you later if you want to build your own custom operations." 
142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "import torch\n", 151 | "from torch.autograd import Variable\n", 152 | "from torch.nn.parameter import Parameter\n", 153 | "import torch.nn as nn\n", 154 | "import torch.nn.functional as F" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "[`Parameters`](http://pytorch.org/docs/0.1.12/nn.html#torch.nn.Parameter) are [`Variable`](http://pytorch.org/docs/0.1.12/autograd.html#torch.autograd.Variable) subclasses, that have a very special property when used with [`Module`](http://pytorch.org/docs/0.1.12/nn.html#torch.nn.Module)'s - when they’re assigned as `Module` attributes they are automatically added to the list of its parameters, and will appear e.g. in [`.parameters()`](http://pytorch.org/docs/0.1.12/nn.html#torch.nn.Module.parameters) iterator. Assigning a Variable doesn’t have such effect. This is because one might want to cache some temporary state, like last hidden state of the RNN, in the model. If there was no such class as `Parameter`, these temporaries would get registered too." 
162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "class Net(nn.Module):\n", 171 | "\n", 172 | " def __init__(self):\n", 173 | " super(Net, self).__init__()\n", 174 | " # Setting up variables, these variables are weights in your \n", 175 | " # network that can be updated while running our graph.\n", 176 | " # Notice, to make a hidden layer, the weights need to have the \n", 177 | " # following dimensionality:\n", 178 | " # W[number_of_units_going_out, number_of_units_going_in]\n", 179 | " # b[number_of_units_going_out]\n", 180 | " # in the example below we have 2 input units (num_features) and 2 output units (num_output)\n", 181 | " # so our weights become W[2, 2], b[2]\n", 182 | " # if we want to make a hidden layer with 100 units, we need to define the shape of the\n", 183 | " # first weight to W[100, 2], b[100] and the shape of the second weight to W[2, 100], b[2]\n", 184 | " self.W_1 = Parameter(torch.randn(num_output, num_features)) # change num_output to hidden units in 2nd layer\n", 185 | " self.b_1 = Parameter(torch.randn(num_output)) # change num_output to hidden units in second layer\n", 186 | " \n", 187 | " # second layer\n", 188 | " #self.W_2 = Parameter(torch.randn(num_output, 100))\n", 189 | " #self.b_2 = Parameter(torch.randn(num_output))\n", 190 | "\n", 191 | " def forward(self, x):\n", 192 | " # Setting up ops, these ops will define edges along our computational graph\n", 193 | " # The below ops will compute a logistic regression, \n", 194 | " # but can be modified to compute a neural network\n", 195 | " x = F.linear(x, self.W_1, self.b_1) # remember to wrap your nonlinearity F.tanh(...)\n", 196 | " #x = F.linear(x, self.W_2, self.b_2)\n", 197 | " return F.softmax(x, dim=1) # softmax to be performed on the second dimension\n", 198 | "\n", 199 | "\n", 200 | "net = Net()" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 |
"source": [ 207 | "Knowing how to print your tensors is useful" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "# list all parameters in your network\n", 217 | "print(\"NAMED PARAMETERS\")\n", 218 | "print(list(net.named_parameters()))\n", 219 | "print()\n", 220 | "# the .parameters() method simply gives the Tensors in the list\n", 221 | "print(\"PARAMETERS\")\n", 222 | "print(list(net.parameters()))\n", 223 | "print()\n", 224 | "\n", 225 | "# list individual parameters by name\n", 226 | "print('WEIGHTS')\n", 227 | "print(net.W_1)\n", 228 | "print(net.W_1.size())\n", 229 | "print('\\nBIAS')\n", 230 | "print(net.b_1)\n", 231 | "print(net.b_1.size())" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "# Exploring Parameter\n", 239 | "\n", 240 | "Ok, let's investigate what a Parameter/Variable is" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "param = net.W_1\n", 250 | "print(\"## this is the tensor\")\n", 251 | "print(param.data)\n", 252 | "print(\"\\n## this is the tensor's gradient\")\n", 253 | "print(param.grad)\n", 254 | "# notice, the gradient is undefined because we have not yet run a backward pass\n", 255 | "\n", 256 | "print(\"\\n## is it a leaf in the graph?\")\n", 257 | "print(param.is_leaf)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "## Excluding subgraphs from backward propagation\n", 265 | "\n", 266 | "`Variable`s have two properties which allow you to define if a graph is going to be used for training or inference, `requires_grad` and `volatile`.\n", 267 | "During training we might not want to compute the gradients for all layers, e.g. 
if we have a pretrained model or embeddings that we do not want to change the values of.\n", 268 | "To compute gradients we need to store activations and compute the backward pass for the given layer.\n", 269 | "Setting `requires_grad = False` lets you skip storing activations and computing the backward pass for that layer.\n", 270 | "If any parameter in an operation / layer requires gradient then the entire output of the operation will also require gradient.\n", 271 | "\n", 272 | "The `volatile` property is mostly used when you want to run inference with your model, and if it is set to `True` the entire graph will not require gradient. This means that you expect to never call `.backward()` on the network.\n", 273 | "\n", 274 | "See http://pytorch.org/docs/master/notes/autograd.html for an in-depth explanation." 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "# Test network\n", 282 | "\n", 283 | "To use our network we can simply call our graph, and it will dynamically be created. Here is an example of running the network's forward pass."
284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "X = Variable(torch.randn(5, num_features))\n", 293 | "# the net.__call__ runs some pre-defined functions\n", 294 | "# both before and after running net.forward()\n", 295 | "# see http://pytorch.org/docs/master/_modules/torch/nn/modules/module.html\n", 296 | "print(net(X))" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "Parameters is a special case of Variable " 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "# let's take a look at the gradients\n", 313 | "for p in net.parameters():\n", 314 | " print(p.data)\n", 315 | " print(p.grad)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [ 324 | "X = Variable(torch.randn(7, num_features))\n", 325 | "out = net(X)\n", 326 | "# we need to give a tensor of gradients to .backward,\n", 327 | "# we give a dummy tensor\n", 328 | "# for details on .backward(), see http://pytorch.org/docs/master/autograd.html#torch.autograd.backward\n", 329 | "out.backward(torch.randn(7, num_output))" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "for p in net.parameters():\n", 339 | " print(p.data)\n", 340 | " print(p.grad)" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "# ok, let's try and zero the accumulated gradients\n", 350 | "net.zero_grad()\n", 351 | "for p in net.parameters():\n", 352 | " print(p.data)\n", 353 | " print(p.grad)" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "# Loss 
function\n", 361 | "\n", 362 | "Let's define a custom loss function to compute how well our graph is doing." 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [ 371 | "def cross_entropy(ys, ts):\n", 372 | " # computing cross entropy per sample\n", 373 | " cross_entropy = -torch.sum(ts * torch.log(ys), dim=1, keepdim=False)\n", 374 | " # averaging over samples\n", 375 | " return torch.mean(cross_entropy)" 376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "metadata": {}, 381 | "source": [ 382 | "To train our neural network we need to update the parameters in direction of the negative gradient w.r.t the cost function we defined earlier.\n", 383 | "We can use [`torch.optim`](http://pytorch.org/docs/master/optim.html) to apply an update rule to all parameters in the network, using the gradients computed by `.backward()`.\n", 384 | "\n", 385 | "Here's a small animation of gradient descent: http://imgur.com/a/Hqolp, which also illustrates which challenges optimizers might face, e.g. saddle points."
386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "import torch.optim as optim\n", 395 | "\n", 396 | "optimizer = optim.SGD(net.parameters(), lr=0.01)" 397 | ] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "metadata": {}, 402 | "source": [ 403 | "Next, we make the prediction functions, such that we can get an accuracy measure over a batch" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "metadata": {}, 410 | "outputs": [], 411 | "source": [ 412 | "def accuracy(ys, ts):\n", 413 | " # making a binary vector of correct (1) and incorrect (0) predictions\n", 414 | " correct_prediction = torch.eq(torch.max(ys, 1)[1], torch.max(ts, 1)[1])\n", 415 | " # averaging the binary vector\n", 416 | " return torch.mean(correct_prediction.float())" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "The next step is to utilize our `optimizer` repeatedly in order to optimize our weights `W_1` and `b_1` to make the best possible linear separation of the half moon dataset."
424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": {}, 430 | "outputs": [], 431 | "source": [ 432 | "# number of training passses\n", 433 | "num_epochs = 1000\n", 434 | "# store loss and accuracy for information\n", 435 | "train_losses, val_losses, val_accs = [], [], []\n", 436 | "\n", 437 | "def pred(X):\n", 438 | " \"\"\" Compute graph's prediction and return numpy array\n", 439 | " \n", 440 | " Parameters\n", 441 | " ----------\n", 442 | " X : numpy.ndarray\n", 443 | " \n", 444 | " Returns\n", 445 | " -------\n", 446 | " numpy.ndarray\n", 447 | " \"\"\"\n", 448 | " X = Variable(torch.from_numpy(X))\n", 449 | " y = net(X)\n", 450 | " return y.data.numpy()\n", 451 | "\n", 452 | "# plot boundary on testset before training session\n", 453 | "plot_decision_boundary(lambda x: pred(x), X_te, y_te)\n", 454 | "plt.title(\"Untrained Classifier\")\n", 455 | "\n", 456 | "# training loop\n", 457 | "for e in range(num_epochs):\n", 458 | " # get training input and expected output as torch Variables and make sure type is correct\n", 459 | " tr_input = Variable(torch.from_numpy(X_tr))\n", 460 | " tr_targets = Variable(torch.from_numpy(onehot(y_tr, num_output))).float()\n", 461 | " \n", 462 | " # zeroize accumulated gradients in parameters\n", 463 | " optimizer.zero_grad()\n", 464 | " # predict by running forward pass\n", 465 | " tr_output = net(tr_input)\n", 466 | " # compute cross entropy loss\n", 467 | " tr_loss = cross_entropy(tr_output, tr_targets)\n", 468 | " # compute gradients given loss\n", 469 | " tr_loss.backward()\n", 470 | " # update the parameters given the computed gradients\n", 471 | " optimizer.step()\n", 472 | " \n", 473 | " # store training loss\n", 474 | " train_losses.append(tr_loss.data.numpy())\n", 475 | " \n", 476 | " # get validation input and expected output as torch Variables and make sure type is correct\n", 477 | " val_input = Variable(torch.from_numpy(X_val))\n", 478 | " val_targets = 
Variable(torch.from_numpy(onehot(y_val, num_output))).float()\n", 479 | " \n", 480 | " # predict with validation input\n", 481 | " val_output = net(val_input)\n", 482 | " # compute loss and accuracy\n", 483 | " val_loss = cross_entropy(val_output, val_targets)\n", 484 | " val_acc = accuracy(val_output, val_targets)\n", 485 | " \n", 486 | " # store loss and accuracy\n", 487 | " val_losses.append(val_loss.data.numpy())\n", 488 | " val_accs.append(val_acc.data.numpy())\n", 489 | " \n", 490 | " if e % 100 == 0:\n", 491 | " print(\"Epoch %i, \"\n", 492 | " \"Train Cost: %0.3f\"\n", 493 | " \"\\tVal Cost: %0.3f\"\n", 494 | " \"\\t Val acc: %0.3f\" % (e, \n", 495 | " train_losses[-1],\n", 496 | " val_losses[-1],\n", 497 | " val_accs[-1]))\n", 498 | "\n", 499 | "# get test input and expected output\n", 500 | "te_input = Variable(torch.from_numpy(X_te))\n", 501 | "te_targets = Variable(torch.from_numpy(onehot(y_te, num_output))).float()\n", 502 | "# predict on testset\n", 503 | "te_output = net(te_input)\n", 504 | "# compute loss and accuracy\n", 505 | "te_loss = cross_entropy(te_output, te_targets)\n", 506 | "te_acc = accuracy(te_output, te_targets)\n", 507 | "print(\"\\nTest Cost: %0.3f\\tTest Accuracy: %0.3f\" % (te_loss.data.numpy(), te_acc.data.numpy()))\n", 508 | "\n", 509 | "# plot boundary on testset after training session\n", 510 | "plot_decision_boundary(lambda x: pred(x), X_te, y_te)\n", 511 | "plt.title(\"Trained Classifier\")\n", 512 | "\n", 513 | "\n", 514 | "epoch = np.arange(len(train_losses))\n", 515 | "plt.figure()\n", 516 | "plt.plot(epoch, train_losses, 'r', epoch, val_losses, 'b')\n", 517 | "plt.legend(['Train Loss', 'Val Loss'])\n", 518 | "plt.xlabel('Updates')\n", 519 | "plt.ylabel('Loss')" 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": {}, 525 | "source": [ 526 | "# Assignments Half Moon\n", 527 | "\n", 528 | "1. A linear logistic classifier is only able to create a linear decision boundary. 
Change the Logistic classifier into a (nonlinear) Neural network by inserting a dense hidden layer between the input and output layers of the model.\n", 529 | " \n", 530 | "2. Experiment with multiple hidden layers or more / less hidden units. What happens to the decision boundary?\n", 531 | " \n", 532 | "3. Overfitting: When increasing the number of hidden layers / units the neural network will fit the training data better by creating a highly nonlinear decision boundary. If the model is too complex it will often generalize poorly to new data (validation and test set). Can you observe this from the training and validation errors? \n", 533 | " \n", 534 | "4. We used the vanilla stochastic gradient descent algorithm for parameter updates. This is usually slow to converge and more sophisticated pseudo-second-order methods usually work better. Try changing the optimizer to [adam or momentum](http://pytorch.org/docs/master/optim.html#torch.optim.Adam)" 535 | ] 536 | }, 537 | { 538 | "cell_type": "markdown", 539 | "metadata": {}, 540 | "source": [ 541 | "When you're done, continue to the [next part of this lab](http://0.0.0.0:8888/notebooks/intermediate/1.2-FFN.ipynb)." 
542 | ] 543 | } 544 | ], 545 | "metadata": { 546 | "kernelspec": { 547 | "display_name": "Python 3", 548 | "language": "python", 549 | "name": "python3" 550 | }, 551 | "language_info": { 552 | "codemirror_mode": { 553 | "name": "ipython", 554 | "version": 3 555 | }, 556 | "file_extension": ".py", 557 | "mimetype": "text/x-python", 558 | "name": "python", 559 | "nbconvert_exporter": "python", 560 | "pygments_lexer": "ipython3", 561 | "version": "3.6.4" 562 | } 563 | }, 564 | "nbformat": 4, 565 | "nbformat_minor": 1 566 | } 567 | -------------------------------------------------------------------------------- /2_intermediate/1.2-FFN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Credits\n", 8 | "\n", 9 | "See main readme for credits." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import torch\n", 19 | "from torch.autograd import Variable\n", 20 | "from torch.nn.parameter import Parameter\n", 21 | "import torch.nn as nn\n", 22 | "import torch.nn.functional as F\n", 23 | "import torch.optim as optim\n", 24 | "import torch.nn.init as init\n", 25 | "\n", 26 | "%matplotlib inline\n", 27 | "import matplotlib\n", 28 | "import numpy as np\n", 29 | "import matplotlib.pyplot as plt" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "# MNIST dataset\n", 37 | "MNIST is a dataset that is often used for benchmarking. The MNIST dataset consists of 70,000 images of handwritten digits from 0-9. The dataset is split into a 50,000 images training set, 10,000 images validation set and 10,000 images test set. 
The images are 28x28 pixels, where each pixel represents a normalised value between 0-255 (0=black and 255=white).\n", 38 | "\n", 39 | "## Primer\n", 40 | "We use a feedforward neural network to classify the 28x28 mnist images. `num_features` is therefore $28 * 28=784$, i.e. we represent each image as a vector. The ordering of the pixels in the vector does not matter, so we could permute all images using the same permutation and still get the same performance. (You are of course encouraged to try this using ``numpy.random.permutation`` to get a random permutation.) This task is therefore called the _permutation invariant_ MNIST. Obviously this throws away a lot of structure in the data. In the next module we'll fix this with the convolutional neural network which encodes prior knowledge about data that has either spatial or temporal structure. \n", 41 | "\n", 42 | "## Ballpark estimates of hyperparameters\n", 43 | "__Optimizers:__\n", 44 | " 1. SGD + Momentum: learning rate 1.0 - 0.1 \n", 45 | " 2. ADAM: learning rate 3*1e-4 - 1e-5\n", 46 | " 3. RMSPROP: somewhere between SGD and ADAM\n", 47 | "\n", 48 | "__Regularization:__\n", 49 | "1. [Dropout](http://pytorch.org/docs/master/nn.html?highlight=dropout#torch.nn.Dropout). Dropout rate 0.1-0.5\n", 50 | " - Remember to pick the correct version according to the input dimensionality\n", 51 | " - **NOTE** call `net.train()` before training to activate random dropout, and call `net.eval()` to deactivate dropout while validating or running inference with model.\n", 52 | "2. L2 (weight decay of optimization functions, e.g. [Adam](http://pytorch.org/docs/master/optim.html#torch.optim.Adam)) and [L1 regularization](http://pytorch.org/docs/master/nn.html#torch.nn.L1Loss).\n", 53 | " - Not used that often in deep learning, but 1e-4 - 1e-8\n", 54 | "3. 
[Batchnorm](http://pytorch.org/docs/master/nn.html#torch.nn.BatchNorm1d): Batchnorm also acts as a regularizer - Often very useful (faster and better convergence)\n", 55 | " - Remember to pick the correct version according to the input dimensionality\n", 56 | " - **NOTE** call `net.train()` before training to activate, and call `net.eval()` to have a non-stochastic variant while validating or running inference with model.\n", 57 | " \n", 58 | " \n", 59 | "__Parameter initialization:__\n", 60 | "Parameter initialization is extremely important. PyTorch has a lot of different initializers, check the [PyTorch API](http://pytorch.org/docs/master/nn.html#torch-nn-init). Commonly used initializers are\n", 61 | "1. Kaiming He\n", 62 | "2. Xavier Glorot\n", 63 | "3. Uniform or Normal with small scale (0.1 - 0.01)\n", 64 | "4. Orthogonal (this usually works very well for RNNs)\n", 65 | "\n", 66 | "Bias is nearly always initialized to zero using the [torch.nn.init.constant(tensor, val)](http://pytorch.org/docs/master/nn.html#torch.nn.init.constant)\n", 67 | "\n", 68 | "__Number of hidden units and network structure:__\n", 69 | "Probably as big a network as possible and then apply regularization. You'll have to experiment. One rarely goes below 512 units for feedforward networks unless you are training on CPU...\n", 70 | "There's some research into stochastic depth networks: https://arxiv.org/pdf/1603.09382v2.pdf, but in general this is trial and error.\n", 71 | "\n", 72 | "__Nonlinearity:__ [The most commonly used nonlinearities are](http://pytorch.org/docs/master/nn.html#non-linear-activations)\n", 73 | " \n", 74 | "1. ReLU\n", 75 | "2. Leaky ReLU\n", 76 | "3. Elu\n", 77 | "4. Sigmoids squash the output to [0, 1], and are used if your output is binary (not used in the hidden layers)\n", 78 | "5. 
Softmax normalizes the output so that it sums to 1, and is used as output if you have a classification problem\n", 79 | "\n", 80 | "See the plot below.\n", 81 | "\n", 82 | "__Mini-batch size:__\n", 83 | "Usually people use 16-256. Bigger is not always better. With smaller mini-batch size you get more updates and your model might converge faster. Also small batch sizes use less memory, which means you can train a model with more parameters.\n", 84 | "\n", 85 | "Hyperparameters can be found by experience (guessing) or some search procedure. Random search is easy to implement and performs decently: http://www.jmlr.org/papers/volume13/bergstra12a/bergstra12a.pdf . \n", 86 | "More advanced search procedures include [Spearmint](https://github.com/JasperSnoek/spearmint) and many others." 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "# Illustrate different output units\n", 96 | "x = np.linspace(-6, 6, 100)\n", 97 | "units = {\n", 98 | " \"ReLU\": lambda x: np.maximum(0, x),\n", 99 | " \"Leaky ReLU\": lambda x: np.maximum(0, x) + 0.1 * np.minimum(0, x),\n", 100 | " \"Elu\": lambda x: (x > 0) * x + (1 - (x > 0)) * (np.exp(x) - 1),\n", 101 | " \"Sigmoid\": lambda x: (1 + np.exp(-x))**(-1),\n", 102 | " \"tanh\": lambda x: (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))\n", 103 | "}\n", 104 | "\n", 105 | "plt.figure(figsize=(5, 5))\n", 106 | "[plt.plot(x, unit(x), label=unit_name, lw=2) for unit_name, unit in units.items()]\n", 107 | "plt.legend(loc=2, fontsize=16)\n", 108 | "plt.title('Non-linearities', fontsize=20)\n", 109 | "plt.ylim([-2, 5])\n", 110 | "plt.xlim([-6, 6])\n", 111 | "\n", 112 | "# assert that all class probablities sum to one\n", 113 | "softmax = lambda x: np.exp(x) / np.sum(np.exp(x))\n", 114 | "print(\"softmax should sum to one (approxiamtely):\", np.sum(softmax(x)))" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": { 120 | "collapsed": true 121 | 
}, 122 | "source": [ 123 | "## MNIST\n", 124 | "First let's load the MNIST dataset and plot a few examples:" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "!if [ ! -f mnist.npz ]; then wget -N https://www.dropbox.com/s/qxywaq7nx19z72p/mnist.npz; else echo \"mnist.npz already downloaded\"; fi" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "#To speed up training we'll only work on a subset of the data\n", 143 | "data = np.load('mnist.npz')\n", 144 | "num_classes = 10\n", 145 | "x_train = data['X_train'][:1000].astype('float32')\n", 146 | "targets_train = data['y_train'][:1000].astype('int32')\n", 147 | "\n", 148 | "x_valid = data['X_valid'][:500].astype('float32')\n", 149 | "targets_valid = data['y_valid'][:500].astype('int32')\n", 150 | "\n", 151 | "x_test = data['X_test'][:500].astype('float32')\n", 152 | "targets_test = data['y_test'][:500].astype('int32')\n", 153 | "\n", 154 | "print(\"Information on dataset\")\n", 155 | "print(\"x_train\", x_train.shape)\n", 156 | "print(\"targets_train\", targets_train.shape)\n", 157 | "print(\"x_valid\", x_valid.shape)\n", 158 | "print(\"targets_valid\", targets_valid.shape)\n", 159 | "print(\"x_test\", x_test.shape)\n", 160 | "print(\"targets_test\", targets_test.shape)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": { 167 | "scrolled": false 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "#plot a few MNIST examples\n", 172 | "idx, dim, classes = 0, 28, 10\n", 173 | "# create empty canvas\n", 174 | "canvas = np.zeros((dim*classes, classes*dim))\n", 175 | "\n", 176 | "# fill with tensors\n", 177 | "for i in range(classes):\n", 178 | " for j in range(classes):\n", 179 | " canvas[i*dim:(i+1)*dim, j*dim:(j+1)*dim] = x_train[idx].reshape((dim, dim))\n", 180 | " idx 
+= 1\n", 181 | "\n", 182 | "# visualize matrix of tensors as gray scale image\n", 183 | "plt.figure(figsize=(6, 6))\n", 184 | "plt.axis('off')\n", 185 | "plt.imshow(canvas, cmap='gray')\n", 186 | "plt.title('MNIST handwritten digits')\n", 187 | "plt.show()" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "#Hyperparameters\n", 197 | "num_classes = 10\n", 198 | "num_l1 = 512\n", 199 | "num_features = x_train.shape[1]\n", 200 | "\n", 201 | "# define network\n", 202 | "class Net(nn.Module):\n", 203 | "\n", 204 | " def __init__(self, num_features, num_hidden, num_output):\n", 205 | " super(Net, self).__init__() \n", 206 | " # input layer\n", 207 | " self.W_1 = Parameter(init.xavier_normal(torch.Tensor(num_hidden, num_features)))\n", 208 | " self.b_1 = Parameter(init.constant(torch.Tensor(num_hidden), 0))\n", 209 | " # hidden layer\n", 210 | " self.W_2 = Parameter(init.xavier_normal(torch.Tensor(num_output, num_hidden)))\n", 211 | " self.b_2 = Parameter(init.constant(torch.Tensor(num_output), 0))\n", 212 | " # define activation function in constructor\n", 213 | " self.activation = torch.nn.ELU()\n", 214 | "\n", 215 | " def forward(self, x):\n", 216 | " x = F.linear(x, self.W_1, self.b_1)\n", 217 | " x = self.activation(x)\n", 218 | " x = F.linear(x, self.W_2, self.b_2)\n", 219 | " return F.softmax(x, dim=1)\n", 220 | "\n", 221 | "\n", 222 | "net = Net(num_features, num_l1, num_classes)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "optimizer = optim.SGD(net.parameters(), lr=0.1)\n", 232 | "criterion = nn.CrossEntropyLoss()" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "#Test the forward pass with dummy data\n", 242 | "x = np.random.normal(0, 1, (45, 
dim*dim)).astype('float32')\n", 243 | "\n", 244 | "print(net(Variable(torch.from_numpy(x))).size())" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "# Build the training loop\n", 252 | "\n", 253 | "We train the network by calculating the gradient of the cost function w.r.t. the parameters and updating the parameters in the direction of the negative gradient. \n", 254 | "\n", 255 | "\n", 256 | "When training a neural network you always use mini batches. Instead of calculating the average gradient using the entire dataset you approximate the gradient using a mini-batch of typically 16 to 256 samples. The parameters are updated after each mini batch. Networks converge much faster using mini batches because the parameters are updated more often.\n", 257 | "\n", 258 | "We build a loop that iterates over the training data. Remember that the parameters are updated each time ``optimizer.step()`` is called." 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "# we could have done this ourselves,\n", 268 | "# but we should be aware of sklearn and it's tools\n", 269 | "from sklearn.metrics import accuracy_score\n", 270 | "\n", 271 | "# setting hyperparameters and gettings epoch sizes\n", 272 | "batch_size = 100\n", 273 | "num_epochs = 100\n", 274 | "num_samples_train = x_train.shape[0]\n", 275 | "num_batches_train = num_samples_train // batch_size\n", 276 | "num_samples_valid = x_valid.shape[0]\n", 277 | "num_batches_valid = num_samples_valid // batch_size\n", 278 | "\n", 279 | "# setting up lists for handling loss/accuracy\n", 280 | "train_acc, train_loss = [], []\n", 281 | "valid_acc, valid_loss = [], []\n", 282 | "test_acc, test_loss = [], []\n", 283 | "cur_loss = 0\n", 284 | "losses = []\n", 285 | "\n", 286 | "get_slice = lambda i, size: range(i * size, (i + 1) * size)\n", 287 | "\n", 288 | "for epoch in range(num_epochs):\n", 289 | " # Forward -> 
Backprob -> Update params\n", 290 | " ## Train\n", 291 | " cur_loss = 0\n", 292 | " net.train()\n", 293 | " for i in range(num_batches_train):\n", 294 | " slce = get_slice(i, batch_size)\n", 295 | " x_batch = Variable(torch.from_numpy(x_train[slce]))\n", 296 | " output = net(x_batch)\n", 297 | " \n", 298 | " # compute gradients given loss\n", 299 | " target_batch = Variable(torch.from_numpy(targets_train[slce]).long())\n", 300 | " batch_loss = criterion(output, target_batch)\n", 301 | " optimizer.zero_grad()\n", 302 | " batch_loss.backward()\n", 303 | " optimizer.step()\n", 304 | " \n", 305 | " cur_loss += batch_loss \n", 306 | " losses.append(cur_loss / batch_size)\n", 307 | "\n", 308 | " net.eval()\n", 309 | " ### Evaluate training\n", 310 | " train_preds, train_targs = [], []\n", 311 | " for i in range(num_batches_train):\n", 312 | " slce = get_slice(i, batch_size)\n", 313 | " x_batch = Variable(torch.from_numpy(x_train[slce]))\n", 314 | " \n", 315 | " output = net(x_batch)\n", 316 | " preds = torch.max(output, 1)[1]\n", 317 | " \n", 318 | " train_targs += list(targets_train[slce])\n", 319 | " train_preds += list(preds.data.numpy())\n", 320 | " \n", 321 | " ### Evaluate validation\n", 322 | " val_preds, val_targs = [], []\n", 323 | " for i in range(num_batches_valid):\n", 324 | " slce = get_slice(i, batch_size)\n", 325 | " x_batch = Variable(torch.from_numpy(x_valid[slce]))\n", 326 | " \n", 327 | " output = net(x_batch)\n", 328 | " preds = torch.max(output, 1)[1]\n", 329 | " val_preds += list(preds.data.numpy())\n", 330 | " val_targs += list(targets_valid[slce])\n", 331 | "\n", 332 | " train_acc_cur = accuracy_score(train_targs, train_preds)\n", 333 | " valid_acc_cur = accuracy_score(val_targs, val_preds)\n", 334 | " \n", 335 | " train_acc.append(train_acc_cur)\n", 336 | " valid_acc.append(valid_acc_cur)\n", 337 | " \n", 338 | " if epoch % 10 == 0:\n", 339 | " print(\"Epoch %2i : Train Loss %f , Train acc %f, Valid acc %f\" % (\n", 340 | " epoch+1, losses[-1], 
train_acc_cur, valid_acc_cur))\n", 341 | "\n", 342 | "epoch = np.arange(len(train_acc))\n", 343 | "plt.figure()\n", 344 | "plt.plot(epoch, train_acc, 'r', epoch, valid_acc, 'b')\n", 345 | "plt.legend(['Train Accucary','Validation Accuracy'])\n", 346 | "plt.xlabel('Updates'), plt.ylabel('Acc')" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": { 352 | "collapsed": true 353 | }, 354 | "source": [ 355 | "# Questions\n", 356 | "\n", 357 | "Try and add these modifications:\n", 358 | "- Kaiming He initialization instead of Xavier Glorot\n", 359 | "- add an extra layer\n", 360 | "- use the relu activation function\n", 361 | "- add dropout to the network (**note** the `net.train()` and `net.eval()` already in the code)\n", 362 | "- add momentum to the optimizer\n", 363 | "- use the ADAM optimizer instead of stochastic gradient descent\n", 364 | "- add L2/weight decay to the optimizer\n", 365 | "- add L1 regularization" 366 | ] 367 | } 368 | ], 369 | "metadata": { 370 | "kernelspec": { 371 | "display_name": "Python 3", 372 | "language": "python", 373 | "name": "python3" 374 | }, 375 | "language_info": { 376 | "codemirror_mode": { 377 | "name": "ipython", 378 | "version": 3 379 | }, 380 | "file_extension": ".py", 381 | "mimetype": "text/x-python", 382 | "name": "python", 383 | "nbconvert_exporter": "python", 384 | "pygments_lexer": "ipython3", 385 | "version": "3.6.4" 386 | } 387 | }, 388 | "nbformat": 4, 389 | "nbformat_minor": 1 390 | } 391 | -------------------------------------------------------------------------------- /2_intermediate/2.1-CNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Credits\n", 8 | "\n", 9 | "See main readme for credits." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Dependencies and supporting functions\n", 17 | "Load dependencies and supporting functions by running the code block below." 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "%matplotlib inline\n", 27 | "import matplotlib\n", 28 | "import numpy as np\n", 29 | "import matplotlib.pyplot as plt" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "# Convolutional Neural networks 101\n", 37 | "\n", 38 | "Convolutional neural networks are one of the most successful types of neural networks for image recognition and an integral part of reigniting the interest in neural networks. \n", 39 | "\n", 40 | "In this lab we'll experiment with inserting 2D-convolution layers in the fully connected neural networks introduced in `1.2-FFN`. We'll further experiment with stacking of convolution layers, max pooling and strided convolutions which are all important techniques in current convolution neural network architectures. Lastly we'll try to visualize the learned convolution filters and try to understand what kind of features they learn to recognize.\n", 41 | "\n", 42 | "\n", 43 | "If you are unfamiliar with the convolution operation, https://github.com/vdumoulin/conv_arithmetic has a nice visualization of different convolution variants. For a more in-depth tutorial please see http://cs231n.github.io/convolutional-networks/ or http://neuralnetworksanddeeplearning.com/chap6.html." 
44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "# Assignment 1\n", 51 | "\n", 52 | "Given the following 3D tensor input `(channel, height, width)` , a given amount (`channels_out`) of filters `(channels_in, filter_height, filter_width)`, stride `(height, width)` and padding `(height, width)`, calculate the output dimensionality if it's valid.\n", 53 | "\n", 54 | "1. input tensor with dimensionality (1, 28, 28) and 16 filters of size (1, 5, 5) with stride (1, 1) and padding (0, 0)\n", 55 | "2. input tensor with dimensionality (3, 32, 32) and 24 filters of size (2, 3, 3) with stride (1, 1) and padding (0, 0)\n", 56 | "3. input tensor with dimensionality (10, 32, 32) and 3 filters of size (10, 2, 2) with stride (2, 2) and padding (0, 0)\n", 57 | "4. input tensor with dimensionality (11, 8, 16) and 7 filters of size (11, 3, 3) with stride (2, 2) and padding (1, 1)\n", 58 | "5. input tensor with dimensionality (128, 256, 256) and 112 filters of size (128, 3, 3) with stride (1, 1) and padding (1, 1)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "# Convolutional network\n", 66 | "\n", 67 | "Making a convolutional network we load the data as `(num_samples, num_channels, height, width)`" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "!if [ ! -f mnist.npz ]; then wget -N https://www.dropbox.com/s/qxywaq7nx19z72p/mnist.npz; else echo \"mnist.npz already downloaded\"; fi" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "# LOAD the mnist data. 
To speed up training we'll only work on a subset of the data.\n", 86 | "# Note that we reshape the data from \n", 87 | "# (nsamples, num_features) = (nsamples, nchannels*rows*cols)\n", 88 | "# -> (nsamples, nchannels, rows, cols)\n", 89 | "# in order to retain the spatial arrangements of the pixels\n", 90 | "data = np.load('mnist.npz')\n", 91 | "num_classes = 10\n", 92 | "nchannels, rows, cols = 1, 28, 28\n", 93 | "x_train = data['X_train'][:1000].astype('float32')\n", 94 | "x_train = x_train.reshape((-1, nchannels, rows, cols))\n", 95 | "targets_train = data['y_train'][:1000].astype('int32')\n", 96 | "\n", 97 | "x_valid = data['X_valid'][:500].astype('float32')\n", 98 | "x_valid = x_valid.reshape((-1, nchannels, rows, cols))\n", 99 | "targets_valid = data['y_valid'][:500].astype('int32')\n", 100 | "\n", 101 | "x_test = data['X_test'][:500].astype('float32')\n", 102 | "x_test = x_test.reshape((-1, nchannels, rows, cols))\n", 103 | "targets_test = data['y_test'][:500].astype('int32')\n", 104 | "\n", 105 | "print(\"Information on dataset\")\n", 106 | "print(\"x_train\", x_train.shape)\n", 107 | "print(\"targets_train\", targets_train.shape)\n", 108 | "print(\"x_valid\", x_valid.shape)\n", 109 | "print(\"targets_valid\", targets_valid.shape)\n", 110 | "print(\"x_test\", x_test.shape)\n", 111 | "print(\"targets_test\", targets_test.shape)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "#plot a few MNIST examples\n", 121 | "idx, dim, classes = 0, 28, 10\n", 122 | "# create empty canvas\n", 123 | "canvas = np.zeros((dim*classes, classes*dim))\n", 124 | "\n", 125 | "# fill with tensors\n", 126 | "for i in range(classes):\n", 127 | " for j in range(classes):\n", 128 | " canvas[i*dim:(i+1)*dim, j*dim:(j+1)*dim] = x_train[idx].reshape((dim, dim))\n", 129 | " idx += 1\n", 130 | "\n", 131 | "# visualize matrix of tensors as gray scale image\n", 132 | "plt.figure(figsize=(6, 
6))\n", 133 | "plt.axis('off')\n", 134 | "plt.imshow(canvas, cmap='gray')\n", 135 | "plt.title('MNIST handwritten digits')\n", 136 | "plt.show()" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "# Define a simple feed forward neural network" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "import torch\n", 153 | "from torch.autograd import Variable\n", 154 | "from torch.nn.parameter import Parameter\n", 155 | "import torch.nn as nn\n", 156 | "import torch.nn.functional as F\n", 157 | "import torch.optim as optim\n", 158 | "import torch.nn.init as init\n", 159 | "\n", 160 | "from torch.nn import Linear, Conv2d, BatchNorm2d, MaxPool2d, Dropout2d\n", 161 | "from torch.nn.functional import relu, elu, relu6, sigmoid, tanh, softmax" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "# hyperameters of the model\n", 171 | "num_classes = 10\n", 172 | "channels = x_train.shape[1]\n", 173 | "height = x_train.shape[2]\n", 174 | "width = x_train.shape[3]\n", 175 | "num_filters_conv1 = 16\n", 176 | "kernel_size_conv1 = 5 # [height, width]\n", 177 | "stride_conv1 = 1 # [stride_height, stride_width]\n", 178 | "num_l1 = 100\n", 179 | "padding_conv1 = 0\n", 180 | " \n", 181 | "def compute_conv_dim(dim_size):\n", 182 | " return int((dim_size - kernel_size_conv1 + 2 * padding_conv1) / stride_conv1 + 1)\n", 183 | "\n", 184 | "# define network\n", 185 | "class Net(nn.Module):\n", 186 | "\n", 187 | " def __init__(self):\n", 188 | " super(Net, self).__init__()\n", 189 | " # out_dim = (input_dim - filter_dim + 2padding) / stride + 1\n", 190 | " #self.conv_1 = Conv2d(in_channels=channels,\n", 191 | " # out_channels=num_filters_conv1,\n", 192 | " # kernel_size=kernel_size_conv1,\n", 193 | " # stride=stride_conv1)\n", 194 | " \n", 195 | " 
#self.conv_out_height = compute_conv_dim(height)\n", 196 | " #self.conv_out_width = compute_conv_dim(width)\n", 197 | " \n", 198 | " # add dropout to network\n", 199 | " #self.dropout = Dropout2d(p=0.5)\n", 200 | " #self.l1_in_features = num_filters_conv1 * self.conv_out_height * self.conv_out_width\n", 201 | " self.l1_in_features = channels * height * width\n", 202 | " \n", 203 | " self.l_1 = Linear(in_features=self.l1_in_features, \n", 204 | " out_features=num_l1,\n", 205 | " bias=True)\n", 206 | " self.l_out = Linear(in_features=num_l1, \n", 207 | " out_features=num_classes,\n", 208 | " bias=False)\n", 209 | " \n", 210 | " def forward(self, x): # x.size() = [batch, channel, height, width]\n", 211 | " #x = relu(self.conv_1(x))\n", 212 | " # torch.Tensor.view: http://pytorch.org/docs/master/tensors.html?highlight=view#torch.Tensor.view\n", 213 | " # Returns a new tensor with the same data as the self tensor,\n", 214 | " # but of a different size.\n", 215 | " # the size -1 is inferred from other dimensions \n", 216 | " x = x.view(-1, self.l1_in_features)\n", 217 | " #x = self.dropout(relu(self.l_1(x)))\n", 218 | " x = relu(self.l_1(x))\n", 219 | " return softmax(self.l_out(x), dim=1)\n", 220 | "\n", 221 | "\n", 222 | "net = Net()\n", 223 | "print(net)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "criterion = nn.CrossEntropyLoss()\n", 233 | "optimizer = optim.Adam(net.parameters(), lr=0.001)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "#Test the forward pass with dummy data\n", 243 | "x = np.random.normal(0,1, (5, 1, 28, 28)).astype('float32')\n", 244 | "out = net(Variable(torch.from_numpy(x)))\n", 245 | "out.size(), out" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "Notice how the output's 
probabilities are nicely distributed.\n", 253 | "The built-in nn functions (layers) already have a sane initialization of the weights." 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "# we could have done this ourselves,\n", 263 | "# but we should be aware of sklearn and its tools\n", 264 | "from sklearn.metrics import accuracy_score\n", 265 | "\n", 266 | "batch_size = 100\n", 267 | "num_epochs = 50\n", 268 | "num_samples_train = x_train.shape[0]\n", 269 | "num_batches_train = num_samples_train // batch_size\n", 270 | "num_samples_valid = x_valid.shape[0]\n", 271 | "num_batches_valid = num_samples_valid // batch_size\n", 272 | "\n", 273 | "train_acc, train_loss = [], []\n", 274 | "valid_acc, valid_loss = [], []\n", 275 | "test_acc, test_loss = [], []\n", 276 | "cur_loss = 0\n", 277 | "losses = []\n", 278 | "\n", 279 | "get_slice = lambda i, size: range(i * size, (i + 1) * size)\n", 280 | "\n", 281 | "for epoch in range(num_epochs):\n", 282 | " # Forward -> Backprop -> Update params\n", 283 | " ## Train\n", 284 | " cur_loss = 0\n", 285 | " net.train()\n", 286 | " for i in range(num_batches_train):\n", 287 | " slce = get_slice(i, batch_size)\n", 288 | " x_batch = Variable(torch.from_numpy(x_train[slce]))\n", 289 | " output = net(x_batch)\n", 290 | " \n", 291 | " # compute gradients given loss\n", 292 | " target_batch = Variable(torch.from_numpy(targets_train[slce]).long())\n", 293 | " batch_loss = criterion(output, target_batch)\n", 294 | " optimizer.zero_grad()\n", 295 | " batch_loss.backward()\n", 296 | " optimizer.step()\n", 297 | " \n", 298 | " cur_loss += batch_loss \n", 299 | " losses.append(cur_loss / batch_size)\n", 300 | "\n", 301 | " net.eval()\n", 302 | " ### Evaluate training\n", 303 | " train_preds, train_targs = [], []\n", 304 | " for i in range(num_batches_train):\n", 305 | " slce = get_slice(i, batch_size)\n", 306 | " x_batch = 
Variable(torch.from_numpy(x_train[slce]))\n", 307 | " \n", 308 | " output = net(x_batch)\n", 309 | " preds = torch.max(output, 1)[1]\n", 310 | " \n", 311 | " train_targs += list(targets_train[slce])\n", 312 | " train_preds += list(preds.data.numpy())\n", 313 | " \n", 314 | " ### Evaluate validation\n", 315 | " val_preds, val_targs = [], []\n", 316 | " for i in range(num_batches_valid):\n", 317 | " slce = get_slice(i, batch_size)\n", 318 | " x_batch = Variable(torch.from_numpy(x_valid[slce]))\n", 319 | " \n", 320 | " output = net(x_batch)\n", 321 | " preds = torch.max(output, 1)[1]\n", 322 | " val_preds += list(preds.data.numpy())\n", 323 | " val_targs += list(targets_valid[slce])\n", 324 | "\n", 325 | " train_acc_cur = accuracy_score(train_targs, train_preds)\n", 326 | " valid_acc_cur = accuracy_score(val_targs, val_preds)\n", 327 | " \n", 328 | " train_acc.append(train_acc_cur)\n", 329 | " valid_acc.append(valid_acc_cur)\n", 330 | " \n", 331 | " if epoch % 10 == 0:\n", 332 | " print(\"Epoch %2i : Train Loss %f , Train acc %f, Valid acc %f\" % (\n", 333 | " epoch+1, losses[-1], train_acc_cur, valid_acc_cur))\n", 334 | " \n", 335 | "epoch = np.arange(len(train_acc))\n", 336 | "plt.figure()\n", 337 | "plt.plot(epoch, train_acc, 'r', epoch, valid_acc, 'b')\n", 338 | "plt.legend(['Train Acc', 'Val Acc'])\n", 339 | "plt.xlabel('Epochs')\n", 340 | "plt.ylabel('Acc')\n", 341 | "\n", 342 | "### Evaluate test set\n", 343 | "x_batch = Variable(torch.from_numpy(x_test))\n", 344 | "output = net(x_batch)\n", 345 | "preds = torch.max(output, 1)[1]\n", 346 | "print(\"\\nTest set Acc: %f\" % (accuracy_score(list(targets_test), list(preds.data.numpy()))))" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "# Assignment 2\n", 354 | "\n", 355 | "1. Note the performance of the standard feedforward neural network. 
Add a 2D convolution layer before the dense hidden layer and confirm that it increases the generalization performance of the network (try num_filters=16 and filter_size=5 as a starting point). \n", 356 | " \n", 357 | "2. Notice that the size of the image reduces. This can cause loss of information in convolutional networks that apply many convolutional layers. To avoid this, add adequate padding to the convolutional layer.\n", 358 | " \n", 359 | "3. Can the performance be increased even further by stacking more convolution layers ?\n", 360 | " \n", 361 | "4. Maxpooling is a technique for decreasing the spatial resolution of an image while retaining the important features. Effectively this gives a local translational invariance and reduces the computation by a factor of four. In classification algorithms this is usually desirable. Try to either: \n", 362 | " \n", 363 | " - add a maxpool layer (add arguments kernel_size=2, stride=2) after the convolution layer, or\n", 364 | " - add stride=2 to the arguments of the convolution layer, make it fit with the kernel size\n", 365 | " \n", 366 | " Verify that this decreases spatial dimension of the image (`print(l_conv_x.size())` or `print(l_maxpool_x.size())` in your forward pass). Does this increase the performance of the network (you may need to stack multiple layers or increase the number of filters to increase performance) ?" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": {}, 372 | "source": [ 373 | "# Visualization of filters\n", 374 | "Convolution filters can be interpreted as spatial feature detectors picking up different image features such as edges, corners etc. Below we provide code for visualization of the filters. The best results are obtained with fairly large filters of size 9 and either 16 or 36 filters. 
" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "metadata": {}, 381 | "outputs": [], 382 | "source": [ 383 | "# to start with we print the names of the weights in our network\n", 384 | "names_and_vars = {x[0]: x[1] for x in net.named_parameters()}\n", 385 | "print(names_and_vars.keys())" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "### ERROR - If you get a key error, then you need to define l_conv1 in your model!\n", 395 | "if not 'conv_1.weight' in names_and_vars:\n", 396 | " print(\"You need to go back and define a convolutional layer in the network.\")\n", 397 | "else:\n", 398 | " np_W = names_and_vars['conv_1.weight'].data.numpy() # get the filter values from the first conv layer\n", 399 | " print(np_W.shape, \"i.e. the shape is (channels_out, channels_in, filter_height, filter_width)\")\n", 400 | " channels_out, channels_in, filter_size, _ = np_W.shape\n", 401 | " n = int(channels_out**0.5)\n", 402 | "\n", 403 | " # reshaping the last dimension to be n by n\n", 404 | " np_W_res = np_W.reshape(filter_size, filter_size, channels_in, n, n)\n", 405 | " fig, ax = plt.subplots(n,n)\n", 406 | " print(\"learned filter values\")\n", 407 | " for i in range(n):\n", 408 | " for j in range(n):\n", 409 | " ax[i,j].imshow(np_W_res[:,:,0,i,j], cmap='gray',interpolation='none')\n", 410 | " ax[i,j].xaxis.set_major_formatter(plt.NullFormatter())\n", 411 | " ax[i,j].yaxis.set_major_formatter(plt.NullFormatter())\n", 412 | "\n", 413 | " idx = 1\n", 414 | " plt.figure()\n", 415 | " plt.imshow(x_train[idx,0],cmap='gray',interpolation='none')\n", 416 | " plt.title('Inut Image')\n", 417 | " plt.show()\n", 418 | "\n", 419 | " #visalize the filters convolved with an input image\n", 420 | " from scipy.signal import convolve2d\n", 421 | " np_W_res = np_W.reshape(filter_size, filter_size, channels_in, n, n)\n", 422 | " fig, ax = 
plt.subplots(n,n,figsize=(9,9))\n", 423 | " print(\"Response from input image convolved with the filters\")\n", 424 | " for i in range(n):\n", 425 | " for j in range(n):\n", 426 | " ax[i,j].imshow(convolve2d(x_train[1,0],np_W_res[:,:,0,i,j],mode='same'),\n", 427 | " cmap='gray',interpolation='none')\n", 428 | " ax[i,j].xaxis.set_major_formatter(plt.NullFormatter())\n", 429 | " ax[i,j].yaxis.set_major_formatter(plt.NullFormatter())" 430 | ] 431 | }, 432 | { 433 | "cell_type": "markdown", 434 | "metadata": {}, 435 | "source": [ 436 | "# Assignment 3\n", 437 | "\n", 438 | "The visualized filters will likely look most like noise due to the small amount of training data.\n", 439 | "\n", 440 | "1. Try to use 10000 traning examples instead and visualise the filters again\n", 441 | " \n", 442 | "2. Dropout is a very usefull technique for preventing overfitting. Try to add a DropoutLayer after the convolution layer and hidden layer. This should increase both performance and the \"visual appeal\" of the filters\n", 443 | " - remember to use `net.train()` and `net.eval()` properly.\n", 444 | " \n", 445 | "3. Batch normalization is a recent innovation for improving generalization performance. Try to insert batch normalization layers into the network to improve performance. \n", 446 | " - remember to use `net.train()` and `net.eval()` properly." 
447 | ] 448 | } 449 | ], 450 | "metadata": { 451 | "kernelspec": { 452 | "display_name": "Python 3", 453 | "language": "python", 454 | "name": "python3" 455 | }, 456 | "language_info": { 457 | "codemirror_mode": { 458 | "name": "ipython", 459 | "version": 3 460 | }, 461 | "file_extension": ".py", 462 | "mimetype": "text/x-python", 463 | "name": "python", 464 | "nbconvert_exporter": "python", 465 | "pygments_lexer": "ipython3", 466 | "version": "3.6.4" 467 | } 468 | }, 469 | "nbformat": 4, 470 | "nbformat_minor": 1 471 | } 472 | -------------------------------------------------------------------------------- /2_intermediate/2.2-CNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Credits\n", 8 | "\n", 9 | "See main readme for credits." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Dependancies and supporting functions\n", 17 | "Loading dependancies and supporting functions by running the code block below." 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "%matplotlib inline\n", 27 | "import matplotlib\n", 28 | "import numpy as np\n", 29 | "import matplotlib.pyplot as plt" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "# Cuda semantics in PyTorch\n", 37 | "\n", 38 | "When we're making larger models it becomes infeasible to run on CPUs.\n", 39 | "Thankfully, GPU support is straightforward in PyTorch.\n", 40 | "\n", 41 | "> **NOTE** remember to run on a cuda enabled machine (e.g. 
use nvidia-docker and run the gpu image).\n", 42 | "\n", 43 | "> You can see this repository's `README.md` for details.\n", 44 | "\n", 45 | "See the [docs on cuda functionality](http://pytorch.org/docs/master/cuda.html) for an overview of useful functions for use with cuda.\n", 46 | "For more examples on cuda semantics see [the notes here](http://pytorch.org/docs/master/notes/cuda.html#cuda-semantics).\n", 47 | "\n", 48 | "\n", 49 | "## Transfer tensor to GPU\n", 50 | "\n", 51 | "Tensors can be transferred to the GPU by applying the `.cuda()` method to the [`torch.Tensor`s](http://pytorch.org/docs/master/tensors.html#torch-tensor) and [`torch.autograd.Variable`s](http://pytorch.org/docs/master/autograd.html#torch.autograd.Variable), e.g. `inputs.cuda()`.\n", 52 | "\n", 53 | "## Transfer network to GPU\n", 54 | "\n", 55 | "Just like how you transfer a Tensor on to the GPU, you transfer the entire network onto the GPU.\n", 56 | "This will recursively go over all modules and convert their parameters and buffers to CUDA tensors:\n", 57 | "\n", 58 | " net.cuda()\n", 59 | " \n", 60 | "Remember that you will have to send the inputs and targets at every step to the GPU too:\n", 61 | "\n", 62 | " inputs, labels = Variable(inputs.cuda()), Variable(labels.cuda())\n", 63 | " outputs = net(inputs)\n", 64 | " loss = criterion(outputs, labels)\n", 65 | "\n", 66 | "## Use the cuda tensors / variables on CPU\n", 67 | "\n", 68 | "When tensors, modules, etc. 
are on a GPU they are no longer regular `torch.Tensor`s, but `torch.cuda.Tensor`s (see [torch's tensor overview](http://pytorch.org/docs/master/tensors.html#torch-tensor)).\n", 69 | "To be able to work with them on the host's memory, you need to fetch them from the GPU first.\n", 70 | "\n", 71 | "Given a cuda tensor or variable `ct` you can move it to host memory with `ct.cpu()`.\n", 72 | "If for instance you want to convert the tensor in a variable to numpy, you call `ct.cpu().data.numpy()`.\n", 73 | "If it was not on the GPU, we would not need the `.cpu()` call.\n", 74 | "\n", 75 | "## Multiple GPUs\n", 76 | "\n", 77 | "When you need to run a model on multiple GPUs you can use either [torch.nn.DataParallel](http://pytorch.org/docs/master/nn.html#torch.nn.DataParallel) or [torch.nn.parallel.DistributedDataParallel](http://pytorch.org/docs/master/nn.html#torch.nn.parallel.DistributedDataParallel).\n", 78 | "Which one to use depends on whether the GPUs are on multiple machines and how many CPUs you can utilize for handling the training.\n", 79 | "If you have more than two GPUs DistributedDataParallel is usually the better choice.\n", 80 | "\n", 81 | "It is straightforward to use this. Simply wrap the model as follows\n", 82 | "\n", 83 | "```\n", 84 | "model = Net() # define the model as always\n", 85 | "net = torch.nn.DataParallel(model, device_ids=None) # tell pytorch to use all available cuda devices\n", 86 | "output = net(input_var) # run on all devices\n", 87 | "```" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "def get_variable(x):\n", 97 | " \"\"\" Converts tensors to cuda, if available. \"\"\"\n", 98 | " if torch.cuda.is_available():\n", 99 | " return x.cuda()\n", 100 | " return x\n", 101 | "\n", 102 | "def get_numpy(x):\n", 103 | " \"\"\" Get numpy array for both cuda and not. 
\"\"\"\n", 104 | " if torch.cuda.is_available():\n", 105 | " return x.cpu().data.numpy()\n", 106 | " return x.data.numpy()" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "# More Fun with convolutional networks\n", 114 | "## Get the data" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "!if [ ! -f mnist_cluttered_60x60_6distortions.npz ]; then wget -N https://www.dropbox.com/s/rvvo1vtjjrryr7e/mnist_cluttered_60x60_6distortions.npz; else echo \"mnist_cluttered_60x60_6distortions.npz already downloaded\"; fi" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "## Spatial Transformer Networks\n", 131 | "\n", 132 | "In the data the each mnist digit (20x20 pixels) has been placed randomly in a 60x60 canvas. To make the task harder each canvas has then been cluttered with small pieces of digits. In this task it is helpfull for a network if it can focus only on the digit and ignore the rest.\n", 133 | "\n", 134 | "The ``TransformerLayer`` lets us do this. The transformer layer learns an affine transformation which lets the network zoom, rotate and skew. If you are interested you should read the [paper](https://arxiv.org/abs/1506.02025), but the main idea is that you can let a small convolutional network determine the the parameters of the affine transformation. You then apply the affine transformation to the input data. Usually this also involves downsampling which forces the model to zoom in on the relevant parts of the data. After the affine transformation we can use a larger conv net to do the classification. \n", 135 | "This is possible because you can backprop through an affine transformation if you use bilinear interpolation." 
136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "NUM_EPOCHS = 15\n", 145 | "BATCH_SIZE = 256\n", 146 | "DIM = 60\n", 147 | "NUM_CLASSES = 10\n", 148 | "mnist_cluttered = \"mnist_cluttered_60x60_6distortions.npz\"" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "def load_data():\n", 158 | " data = np.load(mnist_cluttered)\n", 159 | " X_train, y_train = data['x_train'], np.argmax(data['y_train'], axis=-1)\n", 160 | " X_valid, y_valid = data['x_valid'], np.argmax(data['y_valid'], axis=-1)\n", 161 | " X_test, y_test = data['x_test'], np.argmax(data['y_test'], axis=-1)\n", 162 | "\n", 163 | " # reshape for convolutions\n", 164 | " X_train = X_train.reshape((X_train.shape[0], 1, DIM, DIM))\n", 165 | " X_valid = X_valid.reshape((X_valid.shape[0], 1, DIM, DIM))\n", 166 | " X_test = X_test.reshape((X_test.shape[0], 1, DIM, DIM))\n", 167 | " \n", 168 | " print(\"Train samples:\", X_train.shape)\n", 169 | " print(\"Validation samples:\", X_valid.shape)\n", 170 | " print(\"Test samples:\", X_test.shape)\n", 171 | "\n", 172 | " return dict(\n", 173 | " X_train=np.asarray(X_train, dtype='float32'),\n", 174 | " y_train=y_train.astype('int32'),\n", 175 | " X_valid=np.asarray(X_valid, dtype='float32'),\n", 176 | " y_valid=y_valid.astype('int32'),\n", 177 | " X_test=np.asarray(X_test, dtype='float32'),\n", 178 | " y_test=y_test.astype('int32'),\n", 179 | " num_examples_train=X_train.shape[0],\n", 180 | " num_examples_valid=X_valid.shape[0],\n", 181 | " num_examples_test=X_test.shape[0],\n", 182 | " input_height=X_train.shape[2],\n", 183 | " input_width=X_train.shape[3],\n", 184 | " output_dim=10,)\n", 185 | "data = load_data()\n", 186 | "\n", 187 | "idx = 0\n", 188 | "canvas = np.zeros((DIM*NUM_CLASSES, NUM_CLASSES*DIM))\n", 189 | "for i in range(NUM_CLASSES):\n", 190 | " for j in 
range(NUM_CLASSES):\n", 191 | " canvas[i*DIM:(i+1)*DIM, j*DIM:(j+1)*DIM] = data['X_train'][idx].reshape((DIM, DIM))\n", 192 | " idx += 1\n", 193 | "plt.figure(figsize=(10, 10))\n", 194 | "plt.imshow(canvas, cmap='gray')\n", 195 | "plt.title('Cluttered handwritten digits')\n", 196 | "plt.axis('off')\n", 197 | "\n", 198 | "plt.show()" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "## Building the model\n", 206 | "\n", 207 | "We use a model where the localization network is a two layer convolution network which operates directly on the image input. The output from the localization network is a 6 dimensional vector specifying the parameters in the affine transformation.\n", 208 | "\n", 209 | "We set up the transformer layer to initially do the identity transform, similarly to [1]. If the output from the localization networks is [t1, t2, t3, t4, t5, t6] then t1 and t5 determines zoom, t2 and t4 determines skewness, and t3 and t6 move the center position. By setting the initial values of the bias vector to \n", 210 | "\n", 211 | "```\n", 212 | "|1, 0, 0|\n", 213 | "|0, 1, 0|\n", 214 | "```\n", 215 | "and the final W of the localization network to all zeros we ensure that in the beginning of training the network works as a pooling layer. \n", 216 | "\n", 217 | "The output of the localization layer feeds into the transformer layer which applies the transformation to the image input. 
In our setup the transformer layer downsamples the input by a factor 3.\n", 218 | "\n", 219 | "Finally a 2 layer convolution layer and 2 fully connected layers calculates the output probabilities.\n", 220 | "\n", 221 | "\n", 222 | "### The model\n", 223 | "```\n", 224 | "Input -> localization_network -> TransformerLayer -> output_network -> predictions\n", 225 | " | |\n", 226 | " >--------------------------------^\n", 227 | "```\n", 228 | "\n", 229 | "\n" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "import torch\n", 239 | "import torch.nn as nn\n", 240 | "import torch.nn.functional as F\n", 241 | "import torch.optim as optim\n", 242 | "\n", 243 | "from torch.autograd import Variable" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "class Net(nn.Module):\n", 253 | " \n", 254 | " def __init__(self, input_channels, input_height, input_width, num_classes, num_zoom=3):\n", 255 | " super(Net, self).__init__()\n", 256 | " self.input_channels = input_channels\n", 257 | " self.input_height = input_height\n", 258 | " self.input_width = input_width\n", 259 | " self.num_classes = num_classes\n", 260 | " self.num_zoom = num_zoom\n", 261 | " \n", 262 | " # Spatial transformer localization-network\n", 263 | " # nn.Sequential http://pytorch.org/docs/master/nn.html#torch.nn.Sequential\n", 264 | " # A sequential container. 
\n", 265 | " # Modules will be added to it in the order they are passed in the constructor.\n", 266 | " self.localization = nn.Sequential(\n", 267 | " nn.Conv2d(in_channels=input_channels,\n", 268 | " out_channels=8, \n", 269 | " kernel_size=7, \n", 270 | " padding=3),\n", 271 | " nn.MaxPool2d(kernel_size=2, \n", 272 | " stride=2),\n", 273 | " nn.ReLU(inplace=True),\n", 274 | " nn.Conv2d(in_channels=8, \n", 275 | " out_channels=10, \n", 276 | " kernel_size=5, \n", 277 | " padding=2),\n", 278 | " nn.MaxPool2d(kernel_size=2,\n", 279 | " stride=2),\n", 280 | " nn.ReLU(inplace=True)\n", 281 | " )\n", 282 | "\n", 283 | " # Regressor for the 3 * 2 affine matrix that we use \n", 284 | " # to make the bilinear interpolation for the spatial transformer\n", 285 | " self.fc_loc = nn.Sequential(\n", 286 | " nn.Linear(in_features=10 * input_height//4 * input_width//4, \n", 287 | " out_features=32,\n", 288 | " bias=True),\n", 289 | " nn.ReLU(inplace=True),\n", 290 | " nn.Linear(in_features=32, \n", 291 | " out_features=3 * 2,\n", 292 | " bias=True)\n", 293 | " )\n", 294 | "\n", 295 | " # Initialize the weights/bias with identity transformation\n", 296 | " # see the article for a definition and explanation for this\n", 297 | " self.fc_loc[2].weight.data.fill_(0)\n", 298 | " self.fc_loc[2].bias.data = torch.FloatTensor([1, 0, 0, 0, 1, 0])\n", 299 | " \n", 300 | " # The classification network based on the transformed (cropped) image\n", 301 | " self.conv1 = nn.Conv2d(in_channels=input_channels,\n", 302 | " out_channels=16,\n", 303 | " kernel_size=5,\n", 304 | " padding=2)\n", 305 | " self.conv2 = nn.Conv2d(in_channels=16, \n", 306 | " out_channels=32,\n", 307 | " kernel_size=5,\n", 308 | " padding=2)\n", 309 | " self.conv2_drop = nn.Dropout2d()\n", 310 | " \n", 311 | " # fully connected output layers\n", 312 | " self.fc1_features = 32 * input_height//num_zoom//4 * input_width//num_zoom//4\n", 313 | " self.fc1 = nn.Linear(in_features=self.fc1_features, \n", 314 | " 
out_features=50)\n", 315 | " self.fc2 = nn.Linear(in_features=50,\n", 316 | " out_features=num_classes)\n", 317 | "\n", 318 | " # Spatial transformer network forward function\n", 319 | " def stn(self, x):\n", 320 | " \"\"\" Spatial Transformer Network \"\"\"\n", 321 | " # creates distributed embeddings of the image with the location network.\n", 322 | " xs = self.localization(x)\n", 323 | " xs = xs.view(-1, 10 * self.input_height//4 * self.input_width//4)\n", 324 | " # project from distributed embeddings to bilinear interpolation space\n", 325 | " theta = self.fc_loc(xs)\n", 326 | " theta = theta.view(-1, 2, 3)\n", 327 | " \n", 328 | " # define the output size of the cropped tensor\n", 329 | " # notice that we divide the height and width with the amount of zoom\n", 330 | " output_size = torch.Size((x.size()[0],\n", 331 | " x.size()[1], \n", 332 | " x.size()[2]//self.num_zoom,\n", 333 | " x.size()[3]//self.num_zoom))\n", 334 | " # magic pytorch functions that are used for transformer networks\n", 335 | " grid = F.affine_grid(theta, output_size) # http://pytorch.org/docs/master/nn.html#torch.nn.functional.affine_grid\n", 336 | " x = F.grid_sample(x, grid) # http://pytorch.org/docs/master/nn.html#torch.nn.functional.grid_sample\n", 337 | " return x\n", 338 | "\n", 339 | " def forward(self, x):\n", 340 | " # transform the input\n", 341 | " x = self.stn(x)\n", 342 | " # save transformation\n", 343 | " l_trans1 = Variable(x.data)\n", 344 | "\n", 345 | " # Perform the usual forward pass\n", 346 | " x = F.relu(F.max_pool2d(self.conv1(x), 2))\n", 347 | " x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))\n", 348 | " x = x.view(-1, self.fc1_features)\n", 349 | " x = F.relu(self.fc1(x))\n", 350 | " # note use of Functional.dropout, where training must be explicitly defined (default: False)\n", 351 | " x = F.dropout(x, training=self.training)\n", 352 | " x = self.fc2(x)\n", 353 | " # return output and batch of bilinear interpolated images\n", 354 | " return 
F.log_softmax(x, dim=1), l_trans1\n", 355 | "\n", 356 | "\n", 357 | "net = Net(1, DIM, DIM, NUM_CLASSES)\n", 358 | "if torch.cuda.is_available():\n", 359 | " print('##converting network to cuda-enabled')\n", 360 | " net.cuda()\n", 361 | "print(net)" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": { 368 | "scrolled": true 369 | }, 370 | "outputs": [], 371 | "source": [ 372 | "# test forward pass on dummy data\n", 373 | "x = np.random.normal(0,1, (45, 1, 60, 60)).astype('float32')\n", 374 | "x = Variable(torch.from_numpy(x))\n", 375 | "if torch.cuda.is_available():\n", 376 | " x = x.cuda()\n", 377 | "output = net(x)\n", 378 | "print([x.size() for x in output])" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "criterion = nn.CrossEntropyLoss()\n", 388 | "optimizer = optim.Adam(net.parameters(), lr=0.001)" 389 | ] 390 | }, 391 | { 392 | "cell_type": "markdown", 393 | "metadata": {}, 394 | "source": [ 395 | "## Training the model\n", 396 | "Training convnets on CPU is painfully slow.\n", 397 | "After 10 epochs you should see that model starts to zoom in on the digits. 
" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": {}, 404 | "outputs": [], 405 | "source": [ 406 | "def train_epoch(X, y,):\n", 407 | " num_samples = X.shape[0]\n", 408 | " num_batches = int(np.ceil(num_samples / float(BATCH_SIZE)))\n", 409 | " costs = []\n", 410 | " correct = 0\n", 411 | " net.train()\n", 412 | " for i in range(num_batches):\n", 413 | " if i % 10 == 0:\n", 414 | " print(\"{}, \".format(i), end='')\n", 415 | " idx = range(i*BATCH_SIZE, np.minimum((i+1)*BATCH_SIZE, num_samples))\n", 416 | " X_batch_tr = get_variable(Variable(torch.from_numpy(X[idx])))\n", 417 | " y_batch_tr = get_variable(Variable(torch.from_numpy(y[idx]).long()))\n", 418 | "\n", 419 | " optimizer.zero_grad()\n", 420 | " output, _ = net(X_batch_tr)\n", 421 | " batch_loss = criterion(output, y_batch_tr)\n", 422 | " \n", 423 | " batch_loss.backward()\n", 424 | " optimizer.step()\n", 425 | " \n", 426 | " costs.append(get_numpy(batch_loss))\n", 427 | " preds = np.argmax(get_numpy(output), axis=-1)\n", 428 | " correct += np.sum(get_numpy(y_batch_tr) == preds)\n", 429 | " print()\n", 430 | " return np.mean(costs), correct / float(num_samples)\n", 431 | "\n", 432 | "def eval_epoch(X, y):\n", 433 | " num_samples = X.shape[0]\n", 434 | " num_batches = int(np.ceil(num_samples / float(BATCH_SIZE)))\n", 435 | " pred_list = []\n", 436 | " transform_list = []\n", 437 | " net.eval()\n", 438 | " for i in range(num_batches):\n", 439 | " if i % 10 == 0:\n", 440 | " print(\"{}, \".format(i), end='')\n", 441 | " idx = range(i*BATCH_SIZE, np.minimum((i+1)*BATCH_SIZE, num_samples))\n", 442 | " X_batch_val = get_variable(Variable(torch.from_numpy(X[idx])))\n", 443 | " output, transformation = net(X_batch_val)\n", 444 | " pred_list.append(get_numpy(output))\n", 445 | " transform_list.append(get_numpy(transformation))\n", 446 | " transform_eval = np.concatenate(transform_list, axis=0)\n", 447 | " preds = np.concatenate(pred_list, axis=0)\n", 448 | " preds 
= np.argmax(preds, axis=-1)\n", 449 | " acc = np.mean(preds == y)\n", 450 | " print()\n", 451 | " return acc, transform_eval" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": null, 457 | "metadata": { 458 | "scrolled": true 459 | }, 460 | "outputs": [], 461 | "source": [ 462 | "valid_accs, train_accs, test_accs = [], [], []\n", 463 | "\n", 464 | "for n in range(NUM_EPOCHS):\n", 465 | " print(\"Epoch %d:\" % n)\n", 466 | " print('train: ')\n", 467 | " train_cost, train_acc = train_epoch(data['X_train'], data['y_train'])\n", 468 | " print('valid ')\n", 469 | " valid_acc, valid_trainsform = eval_epoch(data['X_valid'], data['y_valid'])\n", 470 | " print('test ')\n", 471 | " test_acc, test_transform = eval_epoch(data['X_test'], data['y_test'])\n", 472 | " valid_accs += [valid_acc]\n", 473 | " test_accs += [test_acc]\n", 474 | " train_accs += [train_acc]\n", 475 | "\n", 476 | " print(\"train cost {0:.2}, train acc {1:.2}, val acc {2:.2}, test acc {3:.2}\".format(\n", 477 | " train_cost, train_acc, valid_acc, test_acc))" 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": {}, 483 | "source": [ 484 | "### Plot errors and zoom" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "metadata": {}, 491 | "outputs": [], 492 | "source": [ 493 | "plt.figure(figsize=(9, 9))\n", 494 | "plt.plot(1 - np.array(train_accs), label='Training Error')\n", 495 | "plt.plot(1 - np.array(valid_accs), label='Validation Error')\n", 496 | "plt.legend(fontsize=20)\n", 497 | "plt.xlabel('Epoch', fontsize=20)\n", 498 | "plt.ylabel('Error', fontsize=20)\n", 499 | "plt.show()" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "plt.figure(figsize=(7, 14))\n", 509 | "for i in range(3):\n", 510 | " plt.subplot(321 + i * 2)\n", 511 | " plt.imshow(data['X_test'][i].reshape(DIM, DIM), cmap='gray', interpolation='none')\n", 
512 | " if i == 0:\n", 513 | " plt.title('Original 60x60', fontsize=20)\n", 514 | " plt.axis('off')\n", 515 | " plt.subplot(322+i*2)\n", 516 | " plt.imshow(test_transform[i].reshape(DIM//3, DIM//3), cmap='gray', interpolation='none')\n", 517 | " if i == 0:\n", 518 | " plt.title('Transformed 20x20', fontsize=20)\n", 519 | " plt.axis('off')\n", 520 | "\n", 521 | "plt.tight_layout()\n", 522 | "plt.show()" 523 | ] 524 | }, 525 | { 526 | "cell_type": "markdown", 527 | "metadata": { 528 | "collapsed": true 529 | }, 530 | "source": [ 531 | "# A few pointers for image classification\n", 532 | "\n", 533 | "If you want to do image classification, using a pretrained model is often a good choice, especially if you have limited amounts of labelled data.\n", 534 | "\n", 535 | "An often used pretrained network is the Google Inception model. PyTorch has a [guide for using their current state-of-the-art pretrained model](http://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html) in their [model repository](https://github.com/pytorch/vision/tree/master/torchvision/models).\n", 536 | "\n", 537 | "Currently the best performing image network is the [ResNet](https://arxiv.org/pdf/1512.03385v1.pdf) model. Torch7 has [an interesting blog post about residual nets](http://torch.ch/blog/2016/02/04/resnets.html)."
def get_variable(x):
    """ Moves x to the GPU when CUDA is available, otherwise hands it back untouched. """
    return x.cuda() if torch.cuda.is_available() else x

def get_numpy(x):
    """ Returns the data of x as a numpy array, going through the CPU first when CUDA is available. """
    on_host = x.cpu() if torch.cuda.is_available() else x
    return on_host.data.numpy()
from sklearn.utils import shuffle

# To speed up training we'll only work on a subset of the data containing only the numbers 0, 1.
data = np.load('mnist.npz')
num_classes = 2

# Row indices of the kept digits for every split, gathered class by class
# (all rows of digit 0 first, then all rows of digit 1, ...).
kept_digits = range(num_classes)
idxs_train = [row for digit in kept_digits
              for row in np.where(data['y_train'] == digit)[0].tolist()]
idxs_valid = [row for digit in kept_digits
              for row in np.where(data['y_valid'] == digit)[0].tolist()]
idxs_test = [row for digit in kept_digits
             for row in np.where(data['y_test'] == digit)[0].tolist()]

# Training split: inputs and targets are shuffled together with a fixed seed.
# Since this is unsupervised, the targets are only used for validation.
x_train, targets_train = shuffle(
    data['X_train'][idxs_train].astype('float32'),
    data['y_train'][idxs_train].astype('int32'),
    random_state=1234,
)

# Validation and test splits keep the class-by-class order.
x_valid = data['X_valid'][idxs_valid].astype('float32')
targets_valid = data['y_valid'][idxs_valid].astype('int32')

x_test = data['X_test'][idxs_test].astype('float32')
targets_test = data['y_test'][idxs_test].astype('int32')

print("training set dim(%i, %i)." % x_train.shape)
print("validation set dim(%i, %i)." % x_valid.shape)
print("test set dim(%i, %i)." % x_test.shape)
class Net(nn.Module):
    """Auto-encoder with one hidden layer on each side of a linear bottleneck.

    The layer widths are read from the module-level variables num_features,
    l_enc_features, l_z_features and l_dec_features when an instance is built.
    """

    def __init__(self):
        super(Net, self).__init__()
        # encoder: input -> hidden
        self.l_enc = Linear(num_features, l_enc_features)
        # bottleneck: hidden -> latent code
        self.l_z = Linear(l_enc_features, l_z_features)
        # decoder: latent code -> hidden
        self.l_dec = Linear(l_z_features, l_dec_features)
        # output: hidden -> same width as the input
        self.l_out = Linear(l_dec_features, num_features)

    def forward(self, x):
        """Encode x, decode it again and return both stages.

        Returns
        -------
        dict with 'l_z' (bottleneck output, no activation applied) and
        'l_out' (sigmoid of the output layer, i.e. values between 0 and 1)
        """
        hidden = relu(self.l_enc(x))
        # the bottleneck stays linear
        z = self.l_z(hidden)
        decoded = relu(self.l_dec(z))
        # sigmoid keeps the reconstruction in the range of pixel intensities
        reconstruction = sigmoid(self.l_out(decoded))
        return {'l_z': z, 'l_out': reconstruction}
def mse_pixel_loss(y, t):
    """ Mean Squared Error (MSE)

    Squares the element-wise difference between target and prediction and
    averages the result over all elements.

    Parameters
    ----------
    y: torch.autograd.Variable
        prediction
    t: torch.autograd.Variable
        target

    Returns
    -------
    torch.autograd.Variable
    """
    squared_diff = (t - y) ** 2
    return squared_diff.mean()
when .backward() is applied, the gradients are computed and stored on __y\n", 274 | "__y = Variable(_y, requires_grad=True)\n", 275 | "# no need to store grad for target Variable\n", 276 | "__t = Variable(_t)\n", 277 | "\n", 278 | "# apply mean squared error by comparing the two Variables\n", 279 | "x = mse_pixel_loss(__y, __t)\n", 280 | "# returns Variable containing torch.FloatTensor of size 1\n", 281 | "print(type(x))\n", 282 | "print(x)\n", 283 | "\n", 284 | "# the Variable should not have any grad before applying .backward()\n", 285 | "print(__y.grad)\n", 286 | "x.backward()\n", 287 | "print(__y.grad)" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "# test the forward pass\n", 297 | "_x_test = np.zeros(shape=(32, num_features), dtype='float32')\n", 298 | "# expect output size of [32, num_features]\n", 299 | "print(num_features)\n", 300 | "out_dict = net(get_variable(Variable(torch.from_numpy(_x_test))))\n", 301 | "print(out_dict['l_out'].size())" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": {}, 307 | "source": [ 308 | "In the training loop we sample each batch and evaluate the error, latent space, and reconstructions on every epoch.\n", 309 | "\n", 310 | "**NOTE** this will take a while on CPU." 
import os

batch_size = 100
num_epochs = 100
num_samples_train = x_train.shape[0]
num_batches_train = num_samples_train // batch_size
num_samples_valid = x_valid.shape[0]
num_batches_valid = num_samples_valid // batch_size
updates = []
tmp_img = "tmp_ae_out.png"

train_loss = []
valid_loss = []
plt.figure(figsize=(12, 24))


def _tile_digits(images, rows=10, cols=10, side=28):
    """Paste the first rows*cols flattened side x side images onto one canvas."""
    canvas = np.zeros((side * rows, cols * side))
    for r in range(rows):
        for col in range(cols):
            canvas[r*side:(r+1)*side, col*side:(col+1)*side] = \
                images[r * cols + col].reshape((side, side))
    return canvas


# The validation targets never change, so build the per-class masks once
# instead of once per pasted tile.
valid_masks = [targets_valid == c for c in range(num_classes)]

for epoch in range(num_epochs):
    # ---- train: num_batches_train random batches, drawn without replacement within a batch
    cur_loss = []
    net.train()
    for i in range(num_batches_train):
        idxs = np.random.choice(num_samples_train, size=batch_size, replace=False)
        x_batch = x_train[idxs]
        _x = get_variable(Variable(torch.from_numpy(x_batch)))
        out_dict = net(_x)
        # note, target is the original tensor, as we're working with auto-encoders
        loss = loss_function(out_dict['l_out'], _x)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        cur_loss.append(get_numpy(loss))
    train_loss.append(np.mean(cur_loss))
    updates.append(batch_size * num_batches_train * (epoch + 1))

    # ---- evaluate: the whole validation set in a single forward pass
    net.eval()
    _target = get_variable(Variable(torch.from_numpy(x_valid)))
    out_dict = net(_target)
    loss = loss_function(out_dict['l_out'], _target)

    # used later
    eval_out = get_numpy(out_dict['l_out'])
    eval_z = get_numpy(out_dict['l_z'])

    valid_loss.append(get_numpy(loss))

    # nothing is drawn after the very first epoch
    if epoch == 0:
        continue

    # ---- plot: error curves
    plt.subplot(num_classes + 1, 2, 1)
    # clear first, otherwise every epoch stacks another copy of both curves
    plt.cla()
    plt.title('Error')
    plt.xlabel('Updates'), plt.ylabel('Error')
    plt.plot(updates, train_loss, color="black")
    plt.plot(updates, valid_loss, color="grey")
    # the legend has to be created after the lines it labels exist
    plt.legend(['Train Error', 'Valid Error'])
    plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0))
    plt.grid(True)

    # ---- plot: latent space, one colour per class
    plt.subplot(num_classes+1,2,2)
    plt.cla()
    plt.title('Latent space')
    plt.xlabel('z0'), plt.ylabel('z1')
    color = iter(plt.get_cmap('brg')(np.linspace(0, 1.0, num_classes)))
    for i in range(num_classes):
        clr = next(color)
        # color= (not c=): clr is one RGBA row, not one value per point
        plt.scatter(eval_z[valid_masks[i], 0], eval_z[valid_masks[i], 1],
                    color=clr, s=5., lw=0, marker='o')
    plt.grid(True)

    # ---- plot: 10x10 grids of inputs and their reconstructions, one row of subplots per class
    for c, k in enumerate(range(3, 3 + num_classes * 2, 2)):
        plt.subplot(num_classes + 1, 2, k)
        plt.cla()
        plt.title('Inputs for %i' % c)
        plt.axis('off')
        plt.imshow(_tile_digits(x_valid[valid_masks[c]]), cmap='gray')

        plt.subplot(num_classes+1,2,k+1)
        plt.cla()
        plt.title('Reconstructions for %i' % c)
        plt.axis('off')
        plt.imshow(_tile_digits(eval_out[valid_masks[c]]), cmap='gray')

    # Render through a temporary file so the notebook output is replaced in place.
    # (The original ended the loop body with an unconditional `break`, which stopped
    # training after the second epoch regardless of num_epochs.)
    plt.savefig(tmp_img)
    display(Image(filename=tmp_img))
    clear_output(wait=True)

# the image only exists if at least one plot was saved (i.e. num_epochs > 1)
if os.path.exists(tmp_img):
    os.remove(tmp_img)
421 | }, 422 | { 423 | "cell_type": "markdown", 424 | "metadata": {}, 425 | "source": [ 426 | "# Assignments" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": { 432 | "collapsed": true 433 | }, 434 | "source": [ 435 | "## Assignment 1 - Analyzing the AE\n", 436 | "1. The above implementation of an AE is very simple.\n", 437 | " - Experiment with the number of layers and non-linearities in order to improve the reconstructions\n", 438 | " - What happens with the network when we change the non-linearities in the latent layer (e.g. sigmoid)?\n", 439 | " - Try to increase the number of digit classes in the training set and analyze the results\n", 440 | " - Test different optimization algorithms and decide whether you should use regularizers\n", 441 | " \n", 442 | "2. Currently we optimize w.r.t. mean squared error. \n", 443 | " - Find another error function that could fit this problem better\n", 444 | " - Evaluate whether the error function is a better choice and explain your findings\n", 445 | "\n", 446 | "3. Complexity of the bottleneck.\n", 447 | " - Increase the number of units in the latent layer and train\n", 448 | " - Visualize by using [PCA](http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html) or [t-SNE](http://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html)" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": { 454 | "collapsed": true 455 | }, 456 | "source": [ 457 | "## Assignment 2 - Adding classification\n", 458 | "\n", 459 | "The above training has been performed unsupervised. Now let us assume that we only have a fraction of labeled data points from each class (implemented below). As we know, semi-supervised learning can be utilized by combining unsupervised and supervised learning. Now you must analyze whether a trained AE from the above exercise can aid a classifier.\n", 460 | "\n", 461 | "1. 
def onehot(t, num_classes):
    """One-hot encode the class indices in t.

    Parameters
    ----------
    t: 1-D numpy array of integer class indices
    num_classes: number of columns of the encoding

    Returns
    -------
    numpy array of shape (t.shape[0], num_classes) that is zero everywhere
    except for a 1 in column t[row] of every row
    """
    n_rows = t.shape[0]
    encoded = np.zeros((n_rows, num_classes))
    # one fancy-indexed assignment: row r gets its 1 in column t[r]
    encoded[np.arange(n_rows), t] = 1
    return encoded
"nbconvert_exporter": "python", 517 | "pygments_lexer": "ipython3", 518 | "version": "3.6.4" 519 | } 520 | }, 521 | "nbformat": 4, 522 | "nbformat_minor": 1 523 | } 524 | -------------------------------------------------------------------------------- /2_intermediate/4.1-Sequences.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Sequences\n", 8 | "\n", 9 | "In this lab we will introduce data which has internal dependencies on previous relations in a sequence of data points, and how to model such data.\n", 10 | "\n", 11 | "Examples of data with a sequence dimension are stock prices, weather data, protein sequences, speech, text, and many more.\n", 12 | "In previous labs we mainly considered data $x \\in \\mathrm{R}^d$, where $d$ is the feature space.\n", 13 | "With time sequences our data can be represented as $x \\in \\mathrm{R}^{t \\, \\times \\, d}$, where $t$ is the sequence length. This emphasises sequence dependence and that the samples along the sequence are not independent and identically distributed (i.i.d.).\n", 14 | "\n", 15 | "In the following we will exemplify methods on text given the same challenges as presented in [learning when to skim and when to read](https://einstein.ai/research/learning-when-to-skim-and-when-to-read)." 
16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "# Text classification: Sentiment analysis\n", 23 | "\n", 24 | "In our first work on sequences we will classify sequences of text.\n", 25 | "We will model functions as $\\mathrm{R}^{t \\, \\times \\, d} \\rightarrow \\mathrm{R}^c$, where $c$ is the amount of classes in the output.\n", 26 | "\n", 27 | "With text the challenge is how to represent a word as the feature $d$, as it is required to represent text with decimal numbers.\n", 28 | "Currently, two popular approaches exist; one-hot encoding and embeddings." 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## One-hot encoding over vocabulary\n", 36 | "\n", 37 | "One way to represent a fixed amount of words is by making a one-hot encoded vector, which consists of 0s in all cells with the exception of a single 1 in a cell used uniquely to identify each word.\n", 38 | "\n", 39 | "| vocabulary | one-hot encoded vector |\n", 40 | "| ------------- |--------------------------|\n", 41 | "| Copenhagen | $= [0, 0, 1, \\ldots, 0]$ |\n", 42 | "| Paris | $= [1, 0, 0, \\ldots, 0]$ |\n", 43 | "| Rome | $= [0, 1, 0, \\ldots, 0]$ |\n", 44 | "\n", 45 | "Representing a large vocabulary with one-hot encodings often becomes inefficient because of the size of each sparse vector.\n", 46 | "To overcome this challenge it is common practice to truncate the vocabulary to contain the $k$ most used words and represent the rest with a special symbol, $\\mathtt{UNK}$, to define unknown/unimportant words.\n", 47 | "This often causes entities such as names to be represented with $\\mathtt{UNK}$.\n", 48 | "\n", 49 | "Consider the following text\n", 50 | "> I love the corny jokes in Spielberg's new movie.\n", 51 | "\n", 52 | "where an example result would be similar to\n", 53 | "> I love the corny jokes in $\\mathtt{UNK}$'s new movie." 
54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "## Embeddings\n", 61 | "\n", 62 | "Word embeddings try to tackle the intractability of one-hot encoded vectors, as $k$ is often in the range of 50k to 100k elements.\n", 63 | "Furthermore, one-hot encoding of vectors assumes orthogonality between all words, which makes it unable to capture relationships between words, e.g. `ran` and `run` should be related, whereas e.g. `awkward` and `space` should be far apart in the vector space.\n", 64 | "\n", 65 | "An embedding is defined as $\\mathrm{R}^d \\rightarrow \\mathrm{R}^{d'}$, where $d' \\ll d$.\n", 66 | "In practice this is often achieved by having a lookup table with $d'$-dimensional embeddings, similar to the following matrix operation $\\mathrm{R}^d \\cdot \\mathrm{R}^{d \\, \\times \\, d'}$.\n", 67 | "\n", 68 | "For visualizations and more intuition check out [learning when to skim and when to read](https://einstein.ai/research/learning-when-to-skim-and-when-to-read)." 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "## Bag of Words\n", 76 | "\n", 77 | "A simple way to model sequences of words is by averaging the word embeddings across the sequence dimension.\n", 78 | "This gives us a vector which has a little information about each word, although completely disregarding the order of the words. Even though this might seem like a lossy approach to condense information it works surprisingly well.\n", 79 | "\n", 80 | "A bag of words model is represented as $\\mathrm{R}^{t \\, \\times \\, d'} \\rightarrow \\mathrm{R}^{d'}$, afterwards the representation can be used to do classification $\\mathrm{R}^{d'} \\rightarrow \\mathrm{R}^{c}$."
81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "# Stanford sentiment treebank\n", 88 | "\n", 89 | "A great public dataset for sentiment analysis is the Stanford sentiment treebank (SST).\n", 90 | "The SST provides not only the class (positive, negative) for a sentence, but also each of its grammatical subphrases.\n", 91 | "We will not utilize any tree information.\n", 92 | "The original SST constitutes five classes: *very positive*, *positive*, *neutral*, *negative* and *very negative*.\n", 93 | "We consider the simpler task of binary classification where *very positive* is combined with *positive*, *very negative* is combined with *negative* and all *neutrals* are removed.\n", 94 | "\n", 95 | "## positive examples\n", 96 | "\n", 97 | "
\n", 98 | " \n", 103 | "
\n", 104 | "\n", 105 | "## negative examples\n", 106 | "\n", 107 | "
\n", 108 | " \n", 113 | "
" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "# import stuff\n", 123 | "from pprint import pprint\n", 124 | "\n", 125 | "import numpy as np\n", 126 | "\n", 127 | "from torchtext import data\n", 128 | "from torchtext import datasets\n", 129 | "from torchtext.vocab import Vectors\n", 130 | "\n", 131 | "import torch\n", 132 | "from torch.autograd import Variable\n", 133 | "import torch.nn as nn\n", 134 | "import torch.optim as optim\n", 135 | "from torch.nn import Linear\n", 136 | "from torch.nn.functional import softmax, relu\n", 137 | "\n", 138 | "from sklearn.manifold import TSNE\n", 139 | "\n", 140 | "# we'll use the bokeh library to create beautiful plots\n", 141 | "# *_notebook functions are needed for correct use in jupyter\n", 142 | "from bokeh.plotting import figure, ColumnDataSource\n", 143 | "from bokeh.models import HoverTool\n", 144 | "from bokeh.io import output_notebook, show, push_notebook\n", 145 | "output_notebook()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "use_cuda = torch.cuda.is_available()\n", 155 | "\n", 156 | "def get_variable(x):\n", 157 | " \"\"\" Converts tensors to cuda, if available. \"\"\"\n", 158 | " if use_cuda:\n", 159 | " return x.cuda()\n", 160 | " return x\n", 161 | "\n", 162 | "def get_numpy(x):\n", 163 | " \"\"\" Get numpy array for both cuda and not. \"\"\"\n", 164 | " if use_cuda:\n", 165 | " return x.cpu().data.numpy()\n", 166 | " return x.data.numpy()" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "# Data loader - `torchtext`\n", 174 | "\n", 175 | "Creating data loaders for NLP is quite a hassle.\n", 176 | "[torchtext](https://github.com/pytorch/text/) is a convenient library with builtin functionality useful when working with text, e.g. 
building vocabularies and padding sequences to max length." 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "## `torchtext` - Fields and Dataset\n", 184 | "\n", 185 | "Our dataset must have a predefined structure, e.g. similar to a database table.\n", 186 | "\n", 187 | "- `torchtext.data.Field()` defines a column in our dataset table\n", 188 | "- `torchtext.datasets.SST` is a data loader for the Stanford Sentiment Treebank (SST) dataset\n", 189 | "- `torchtext.datasets.SST.splits()` is a function to create train/validation/test sets" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "# we assume that all fields are sequential, i.e. there will be a sequence of data\n", 199 | "# however, the label field will not contain any sequence\n", 200 | "TEXT = data.Field(sequential=True)\n", 201 | "LABEL = data.Field(sequential=False)\n", 202 | "# create SST dataset splits\n", 203 | "# note, we remove samples with neutral labels\n", 204 | "train_set, validation_set, _ = datasets.SST.splits(TEXT,\n", 205 | " LABEL,\n", 206 | " fine_grained=False,\n", 207 | " train_subtrees=True,\n", 208 | " filter_pred=lambda ex: ex.label != 'neutral')" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": { 215 | "scrolled": true 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "print('train_set.fields:', list(train_set.fields.keys()))\n", 220 | "print('validation_set.fields:', list(validation_set.fields.keys()))\n", 221 | "print()\n", 222 | "print('size of training set', len(train_set))\n", 223 | "print('size of validation set', len(validation_set))\n", 224 | "print()\n", 225 | "print('content of first training sample:')\n", 226 | "pprint(vars(train_set[0]))" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "## `torchtext` - Vocabulary\n",
234 | "\n", 235 | "For each `Field` we build a vocabulary to numericalize the symbols, e.g. `\"fun\" => 471`.\n", 236 | "When building a vocabulary we can attach embedding vectors, e.g. GloVe, FastText, etc.\n", 237 | "Many of these are already built into `torchtext`." 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "# build the vocabularies\n", 247 | "url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'\n", 248 | "TEXT.build_vocab(train_set, max_size=None, vectors=Vectors('wiki.simple.vec', url=url))\n", 249 | "LABEL.build_vocab(train_set)" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "print('Text fields:')\n", 259 | "#print('keys of TEXT.vocab:', list(TEXT.vocab.__dict__.keys()))\n", 260 | "print(' size of vocabulary:', len(TEXT.vocab))\n", 261 | "print(\" vocabulary's embedding dimension:\", TEXT.vocab.vectors.size())\n", 262 | "print(' no. times the \"fun\" appear in the dataset:', TEXT.vocab.freqs['fun'])\n", 263 | "\n", 264 | "print('\\nLabel fields:')\n", 265 | "#print('keys of LABEL.vocab:', list(LABEL.vocab.__dict__.keys()))\n", 266 | "print(\" list of vocabulary (int-to-str):\", LABEL.vocab.itos)\n", 267 | "print(\" list of vocabulary (str-to-int):\", dict(LABEL.vocab.stoi))" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "## `torchtext` - Iterator over datasets\n", 275 | "\n", 276 | "`torchtext.data.Iterator` is a class which can be used to create iterators.\n", 277 | "These iterators have various useful functionality, e.g. to shuffle at every epoch, or to generate data endlessly.\n", 278 | "It is useful to be able to generate endless batches of training data." 
279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "# make iterator for splits\n", 288 | "# device gives a CUDA enabled device (-1 disables it)\n", 289 | "train_iter, val_iter, _ = data.BucketIterator.splits((train_set, validation_set, _),\n", 290 | " batch_size=128, \n", 291 | " device=0 if use_cuda else -1)" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "# print batch information\n", 301 | "batch = next(iter(train_iter))\n", 302 | "print(\"dimension of batch's text:\", batch.text.size())\n", 303 | "print(\"first sequence in text:\", batch.text[:,0])\n", 304 | "print(\"correct label index:\", batch.label[0])\n", 305 | "print(\"the actual label:\", LABEL.vocab.itos[get_numpy(batch.label[0])[0]])" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "# Build the model" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "# size of embeddings\n", 322 | "embedding_dim = TEXT.vocab.vectors.size()[1]\n", 323 | "num_embeddings = TEXT.vocab.vectors.size()[0]\n", 324 | "num_classes = len(LABEL.vocab.itos)\n", 325 | "\n", 326 | "class Net(nn.Module):\n", 327 | "\n", 328 | " def __init__(self):\n", 329 | " super(Net, self).__init__()\n", 330 | " self.embeddings = nn.Embedding(num_embeddings, embedding_dim)\n", 331 | " # use pretrained embeddings\n", 332 | " self.embeddings.weight.data.copy_(TEXT.vocab.vectors)\n", 333 | " \n", 334 | " # add hidden layers\n", 335 | "# self.l_1 = Linear(in_features=embedding_dim,\n", 336 | "# out_features=30,\n", 337 | "# bias=True)\n", 338 | "# self.l_2 = Linear(in_features=30,\n", 339 | "# out_features=30,\n", 340 | "# bias=True)\n", 341 | " # output layer\n", 342 | " self.l_out = 
Linear(in_features=embedding_dim,\n", 343 | " out_features=num_classes,\n", 344 | " bias=False)\n", 345 | " \n", 346 | " def forward(self, x):\n", 347 | " out = {}\n", 348 | " # get embeddings\n", 349 | " x = self.embeddings(x)\n", 350 | " # mean embeddings, this is the bag of words trick\n", 351 | " out['bow'] = x = torch.mean(x, dim=0)\n", 352 | " # apply layer\n", 353 | " #out['l1_activations'] = x = relu(self.l_1(x))\n", 354 | " #out['l2_activations'] = x = relu(self.l_2(x))\n", 355 | " # classify\n", 356 | " out['out'] = softmax(self.l_out(x), dim=1)\n", 357 | " return out\n", 358 | "\n", 359 | "net = Net()\n", 360 | "if use_cuda:\n", 361 | " net.cuda()\n", 362 | "print(net)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [ 371 | "criterion = nn.CrossEntropyLoss()\n", 372 | "optimizer = optim.Adam(net.parameters(), lr=0.001)\n", 373 | "\n", 374 | "def accuracy(ys, ts):\n", 375 | " # making a one-hot encoded vector of correct (1) and incorrect (0) predictions\n", 376 | " correct_prediction = torch.eq(torch.max(ys, 1)[1], ts)\n", 377 | " # averaging the one-hot encoded vector\n", 378 | " return torch.mean(correct_prediction.float())" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "def construct_sentences(batch):\n", 388 | " \"\"\" \n", 389 | " Parameters\n", 390 | " ----------\n", 391 | " batch: torchtext.data.batch.Batch\n", 392 | " \n", 393 | " Returns\n", 394 | " -------\n", 395 | " [str]\n", 396 | " \"\"\"\n", 397 | " return [\" \".join([TEXT.vocab.itos[elm] \n", 398 | " for elm in get_numpy(batch.text[:,i])])\n", 399 | " for i in range(batch.text.size()[1])]\n", 400 | "\n", 401 | "def get_labels(batch):\n", 402 | " \"\"\" \n", 403 | " Parameters\n", 404 | " ----------\n", 405 | " batch: torchtext.data.batch.Batch\n", 406 | " \n", 407 | " Returns\n", 408 | " 
-------\n", 409 | " [str]\n", 410 | " \"\"\"\n", 411 | " return [LABEL.vocab.itos[get_numpy(batch.label[i])[0]] for i in range(len(batch.label))]\n", 412 | "#construct_sentences(batch)\n", 413 | "#get_labels(batch)" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": null, 419 | "metadata": {}, 420 | "outputs": [], 421 | "source": [ 422 | "# to project our hidden embeddings to a visualizable space\n", 423 | "tsne = TSNE(perplexity=10.0, learning_rate=5.0, n_iter=2000)\n", 424 | "\n", 425 | "# index for each label\n", 426 | "colormap = {1: 'DodgerBlue', 2: 'FireBrick'}\n", 427 | "# create a tmp source to be updated later\n", 428 | "validation_set_size = len(validation_set)\n", 429 | "source = ColumnDataSource(data={'x': np.random.randn(validation_set_size),\n", 430 | " 'y': np.random.randn(validation_set_size),\n", 431 | " 'colors': ['green']*validation_set_size,\n", 432 | " 'sentences': [\"tmp\"]*validation_set_size,\n", 433 | " 'labels': [\"unk\"]*validation_set_size})\n", 434 | "# instance to define hover logic in plot\n", 435 | "hover = HoverTool(tooltips=[(\"Sentence\", \"@sentences\"), (\"Label\", \"@labels\")])\n", 436 | "\n", 437 | "# set up the bokeh figure for later visualizations\n", 438 | "p = figure(tools=[hover])\n", 439 | "p.circle(x='x', y='y', fill_color='colors', size=5, line_color=None, source=source)\n", 440 | "\n", 441 | "def update_plot(meta, layer, handle):\n", 442 | " \"\"\" Update existing plot\n", 443 | " \n", 444 | " Parameters\n", 445 | " ----------\n", 446 | " meta: dict\n", 447 | " layer: str\n", 448 | " \"\"\"\n", 449 | " tsne_acts = tsne.fit_transform(meta[layer])\n", 450 | " source.data['x'] = tsne_acts[:,0]\n", 451 | " source.data['y'] = tsne_acts[:,1]\n", 452 | " source.data['colors'] = [colormap[l] for l in meta['label_idx']]\n", 453 | " \n", 454 | " source.data['sentences'] = meta['sentences']\n", 455 | " source.data['labels'] = meta['labels']\n", 456 | " \n", 457 | " # this updates the given plot\n", 458 
| " push_notebook(handle=handle)" 459 | ] 460 | }, 461 | { 462 | "cell_type": "markdown", 463 | "metadata": {}, 464 | "source": [ 465 | "## Run the bag of words model\n", 466 | "\n", 467 | "**Warning** this might take a while on CPU.\n", 468 | "Go get a cup of coffee, and enjoy the visualizations.\n", 469 | "\n", 470 | "Notice that each data point in the plot corresponds to an entire sentence in the validation set." 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": null, 476 | "metadata": { 477 | "scrolled": false 478 | }, 479 | "outputs": [], 480 | "source": [ 481 | "max_iter = 3000\n", 482 | "eval_every = 1000\n", 483 | "log_every = 200\n", 484 | "\n", 485 | "# will be updated while iterating\n", 486 | "tsne_plot = show(p, notebook_handle=True)\n", 487 | "\n", 488 | "train_loss, train_accs = [], []\n", 489 | "\n", 490 | "net.train()\n", 491 | "for i, batch in enumerate(train_iter):\n", 492 | " if i % eval_every == 0:\n", 493 | " net.eval()\n", 494 | " val_losses, val_accs, val_lengths = 0, 0, 0\n", 495 | " val_meta = {'label_idx': [], 'sentences': [], 'labels': []}\n", 496 | " for val_batch in val_iter:\n", 497 | " output = net(val_batch.text)\n", 498 | " # batches sizes might vary, which is why we cannot just mean the batch's loss\n", 499 | " # we multiply the loss and accuracies with the batch's size,\n", 500 | " # to later divide by the total size\n", 501 | " val_losses += criterion(output['out'], val_batch.label) * val_batch.batch_size\n", 502 | " val_accs += accuracy(output['out'], val_batch.label) * val_batch.batch_size\n", 503 | " val_lengths += val_batch.batch_size\n", 504 | " \n", 505 | " for key, _val in output.items():\n", 506 | " if key not in val_meta:\n", 507 | " val_meta[key] = []\n", 508 | " val_meta[key].append(get_numpy(_val)) \n", 509 | " val_meta['label_idx'].append(get_numpy(val_batch.label))\n", 510 | " val_meta['sentences'].append(construct_sentences(val_batch))\n", 511 | " 
val_meta['labels'].append(get_labels(val_batch))\n", 512 | " \n", 513 | " for key, _val in val_meta.items():\n", 514 | " val_meta[key] = np.concatenate(_val)\n", 515 | " \n", 516 | " # divide by the total accumulated batch sizes\n", 517 | " val_losses /= val_lengths\n", 518 | " val_accs /= val_lengths\n", 519 | " \n", 520 | " print(\"it: {} loss: {:.2f} accs: {:.2f}\".format(i, get_numpy(val_losses)[0], get_numpy(val_accs)[0]))\n", 521 | " update_plot(val_meta, 'bow', tsne_plot)\n", 522 | " \n", 523 | " net.train()\n", 524 | " \n", 525 | " output = net(batch.text)\n", 526 | " batch_loss = criterion(output['out'], batch.label)\n", 527 | " \n", 528 | " train_loss.append(get_numpy(batch_loss))\n", 529 | " train_accs.append(get_numpy(accuracy(output['out'], batch.label)))\n", 530 | " \n", 531 | " optimizer.zero_grad()\n", 532 | " batch_loss.backward()\n", 533 | " optimizer.step()\n", 534 | " \n", 535 | " if i % log_every == 0: \n", 536 | " print(\"train, it: {} loss: {:.2f} accs: {:.2f}\".format(i, \n", 537 | " np.mean(train_loss), \n", 538 | " np.mean(train_accs)))\n", 539 | " # reset\n", 540 | " train_loss, train_accs = [], []\n", 541 | " \n", 542 | " if max_iter < i:\n", 543 | " break" 544 | ] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "metadata": {}, 549 | "source": [ 550 | "# Assignments\n", 551 | "\n", 552 | "## Assignment 1 - add hidden layer\n", 553 | "\n", 554 | "- add one hidden layer to the bag of words (BoW) model\n", 555 | " - plot the hidden layer's activations instead of the BoW representation\n", 556 | "- add a second hidden layer\n", 557 | " - try and plot the activations of the second hidden layer\n", 558 | "\n", 559 | "Notice any difference in the plots?\n", 560 | "Describe what you see.\n", 561 | "Hover over the data points." 
562 | ] 563 | } 564 | ], 565 | "metadata": { 566 | "kernelspec": { 567 | "display_name": "Python 3", 568 | "language": "python", 569 | "name": "python3" 570 | }, 571 | "language_info": { 572 | "codemirror_mode": { 573 | "name": "ipython", 574 | "version": 3 575 | }, 576 | "file_extension": ".py", 577 | "mimetype": "text/x-python", 578 | "name": "python", 579 | "nbconvert_exporter": "python", 580 | "pygments_lexer": "ipython3", 581 | "version": "3.6.4" 582 | } 583 | }, 584 | "nbformat": 4, 585 | "nbformat_minor": 2 586 | } 587 | -------------------------------------------------------------------------------- /2_intermediate/4.2-RNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Recurrent Neural Networks\n", 8 | "\n", 9 | "A recurrent neural network (RNN) is a type of neural network that has been successful in modelling sequential data, e.g. language, speech, protein sequences, etc.\n", 10 | "\n", 11 | "## Vanilla RNN\n", 12 | "\n", 13 | "An RNN performs its computations in a cyclic manner, where the same computation is applied to every sample of a given sequence.\n", 14 | "The idea is that the network should be able to use the previous computations as some form of memory and apply this to the future computation.\n", 15 | "An image may best explain how this is to be understood," 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "![rnn-unfold](../static_files/rnn-unfold.png)" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "\n", 30 | "where the network contains the following elements:\n", 31 | "\n", 32 | "- $x$ is the input sequence of samples, \n", 33 | "- $U$ is a weight matrix applied to the given input sample,\n", 34 | "- $V$ is a weight matrix used for the recurrent computation in order to pass memory along the sequence,\n", 35 | "- 
$W$ is a weight matrix used to compute the output of every timestep (given that every timestep requires an output),\n", 36 | "- $h$ is the hidden state (the network's memory) for a given time step, and\n", 37 | "- $o$ is the resulting output.\n", 38 | "\n", 39 | "When the network is unfolded as shown, it is easier to refer to a timestep, $t$.\n", 40 | "We have the following computations through the network:\n", 41 | "\n", 42 | "- $h_t = f(U x_t + V h_{t-1})$, where $f$ usually is an activation function, e.g. $\\mathrm{tanh}$.\n", 43 | "- $o_t = \\mathrm{softmax}(W h_t)$" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## Using Better memory\n", 51 | "\n", 52 | "The vanilla RNN has issues with vanishing gradients which give challenges in saving memory over longer sequences.\n", 53 | "\n", 54 | "To battle these issues the gated hidden units were created.\n", 55 | "We have Long Short-Term Memory (LSTM) (see [Christopher Olah's walk through](http://colah.github.io/posts/2015-08-Understanding-LSTMs/)) and Gated Recurrent Unit (GRU) which have shown increased performance in saving and reusing memory in later timesteps." 
56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "![lstm](../static_files/lstm_cell.png)\n", 63 | "source: https://arxiv.org/abs/1412.7828" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "### LSTM\n", 71 | "\n", 72 | "The LSTM contains three gates, input, forget, output gates and a memory cell.\n", 73 | "The output of the LSTM unit is computed with the following functions, where $\\sigma = \\mathrm{sigmoid}$.\n", 74 | "We have input gate $i$, forget gate $f$, and output gate $o$ defined as\n", 75 | "\n", 76 | "- $i = \\sigma (x_t U^i + h_{t-1} W^i)$\n", 77 | "\n", 78 | "- $f = \\sigma (x_t U^f + h_{t-1} W^f)$\n", 79 | "\n", 80 | "- $o = \\sigma (x_t U^o + h_{t-1} W^o)$\n", 81 | "\n", 82 | "where $U^i, U^f, U^o$ are weight matrices applied to $x_t$, and\n", 83 | "$W^i, W^f, W^o$ are weight matrices applied to $h_{t-1}$ for each respective gate.\n", 84 | "\n", 85 | "$h_{t-1}$, from the previous time step along with the current input $x_t$ are used to compute a candidate $g$\n", 86 | "\n", 87 | "- $g = \\mathrm{tanh}(x_t U^g + h_{t-1} W^g)$\n", 88 | "\n", 89 | "The value of the cell's memory, $c_t$, is updated as\n", 90 | "\n", 91 | "- $c_t = c_{t-1} \\circ f + g \\circ i$\n", 92 | "\n", 93 | "where $c_{t-1}$ is the previous memory, and $\\circ$ refers to element-wise multiplication.\n", 94 | "\n", 95 | "The output, $h_t$, is computed as\n", 96 | "\n", 97 | "- $h_t = \\mathrm{tanh}(c_t) \\circ o$\n", 98 | "\n", 99 | "and it is used for both the timestep's output and the next timestep, whereas $c_t$ is exclusively sent to the next timestep.\n", 100 | "This makes $c_t$ a memory feature, and is not used directly to compute the output of the timestep." 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "# Stanford sentiment treebank\n", 108 | "\n", 109 | "We will continue with the SST. 
See [lab 4.1](http://0.0.0.0:8888/notebooks/intermediate/4.1-Sequences.ipynb) for more information." 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "# import stuff\n", 119 | "import numpy as np\n", 120 | "\n", 121 | "from torchtext import data\n", 122 | "from torchtext import datasets\n", 123 | "from torchtext.vocab import Vectors, GloVe\n", 124 | "\n", 125 | "import torch\n", 126 | "from torch.autograd import Variable\n", 127 | "import torch.nn as nn\n", 128 | "import torch.optim as optim\n", 129 | "from torch.nn import Linear, RNN, LSTM\n", 130 | "from torch.nn.functional import softmax, relu\n", 131 | "\n", 132 | "from sklearn.manifold import TSNE\n", 133 | "\n", 134 | "# we'll use the bokeh library to create beautiful plots\n", 135 | "# *_notebook functions are needed for correct use in jupyter\n", 136 | "from bokeh.plotting import figure, ColumnDataSource\n", 137 | "from bokeh.models import HoverTool\n", 138 | "from bokeh.io import output_notebook, show, push_notebook\n", 139 | "output_notebook()" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "use_cuda = torch.cuda.is_available()\n", 149 | "\n", 150 | "def get_variable(x):\n", 151 | " \"\"\" Converts tensors to cuda, if available. \"\"\"\n", 152 | " if use_cuda:\n", 153 | " return x.cuda()\n", 154 | " return x\n", 155 | "\n", 156 | "def get_numpy(x):\n", 157 | " \"\"\" Get numpy array for both cuda and not. \"\"\"\n", 158 | " if use_cuda:\n", 159 | " return x.cpu().data.numpy()\n", 160 | " return x.data.numpy()" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "# we assume that all fields are sequential, i.e. 
there will be a sequence of data\n", 170 | "# however, the label field will not contain any sequence\n", 171 | "TEXT = data.Field(sequential=True)\n", 172 | "LABEL = data.Field(sequential=False)\n", 173 | "# create SST dataset splits\n", 174 | "# note, we remove samples with neutral labels\n", 175 | "train_set, validation_set, _ = datasets.SST.splits(TEXT,\n", 176 | " LABEL,\n", 177 | " fine_grained=False,\n", 178 | " train_subtrees=True,\n", 179 | " filter_pred=lambda ex: ex.label != 'neutral')\n", 180 | "# build the vocabularies\n", 181 | "# NOTE you should download the GloVe vocabulary, it is quite large..\n", 182 | "#url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'\n", 183 | "#TEXT.build_vocab(train_set, max_size=None, vectors=Vectors('wiki.simple.vec', url=url))\n", 184 | "TEXT.build_vocab(train_set, max_size=None, vectors=[GloVe(name='840B', dim='300')])\n", 185 | "LABEL.build_vocab(train_set)\n", 186 | "# make iterator for splits\n", 187 | "# device gives a CUDA enabled device (-1 disables it)\n", 188 | "train_iter, val_iter, _ = data.BucketIterator.splits((train_set, validation_set, _),\n", 189 | " batch_size=128, \n", 190 | " device=0 if use_cuda else -1)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "# Build the model" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "# size of embeddings\n", 207 | "embedding_dim = TEXT.vocab.vectors.size()[1]\n", 208 | "num_embeddings = TEXT.vocab.vectors.size()[0]\n", 209 | "num_classes = len(LABEL.vocab.itos)\n", 210 | "\n", 211 | "class Net(nn.Module):\n", 212 | "\n", 213 | " def __init__(self):\n", 214 | " super(Net, self).__init__()\n", 215 | " self.embeddings = nn.Embedding(num_embeddings, embedding_dim)\n", 216 | " # use pretrained embeddings\n", 217 | " self.embeddings.weight.data.copy_(TEXT.vocab.vectors)\n", 218 | " 
self.embeddings.weight.detach_()\n", 219 | " \n", 220 | " self.rnn_1 = RNN(input_size=embedding_dim,\n", 221 | " hidden_size=100,\n", 222 | " num_layers=1,\n", 223 | " bidirectional=False)\n", 224 | " self.l_out = Linear(in_features=200,\n", 225 | " out_features=num_classes,\n", 226 | " bias=False)\n", 227 | " \n", 228 | " def forward(self, x):\n", 229 | " out = {}\n", 230 | " # get embeddings\n", 231 | " x = self.embeddings(x)\n", 232 | " # rnn returns output and last hidden state\n", 233 | " x, hn = self.rnn_1(x)\n", 234 | " # get a fixed sized hidden representation of the entire sequence\n", 235 | " out['hidden'] = x = torch.cat((torch.mean(x, dim=0), torch.max(x, dim=0)[0]), dim=1)\n", 236 | " # classify\n", 237 | " out['out'] = softmax(self.l_out(x), dim=1)\n", 238 | " return out\n", 239 | "\n", 240 | "net = Net()\n", 241 | "if use_cuda:\n", 242 | " net.cuda()\n", 243 | "print(net)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "# check which params require grad\n", 253 | "{p[0]: p[1].requires_grad for p in net.named_parameters()}" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "criterion = nn.CrossEntropyLoss()\n", 263 | "# we filter the model's parameters such that we can remove the embedding layer, \n", 264 | "# which does not have requires_grad\n", 265 | "optimizer = optim.SGD(filter(lambda p: p.requires_grad, net.parameters()), lr=0.001)\n", 266 | "\n", 267 | "def accuracy(ys, ts):\n", 268 | " # making a one-hot encoded vector of correct (1) and incorrect (0) predictions\n", 269 | " correct_prediction = torch.eq(torch.max(ys, 1)[1], ts)\n", 270 | " # averaging the one-hot encoded vector\n", 271 | " return torch.mean(correct_prediction.float())\n", 272 | "\n", 273 | "def construct_sentences(batch):\n", 274 | " \"\"\" \n", 275 | " Parameters\n", 276 | " 
----------\n", 277 | " batch: torchtext.data.batch.Batch\n", 278 | " \n", 279 | " Returns\n", 280 | " -------\n", 281 | " [str]\n", 282 | " \"\"\"\n", 283 | " return [\" \".join([TEXT.vocab.itos[elm] \n", 284 | " for elm in get_numpy(batch.text[:,i])])\n", 285 | " for i in range(batch.text.size()[1])]\n", 286 | "\n", 287 | "def get_labels(batch):\n", 288 | " \"\"\" \n", 289 | " Parameters\n", 290 | " ----------\n", 291 | " batch: torchtext.data.batch.Batch\n", 292 | " \n", 293 | " Returns\n", 294 | " -------\n", 295 | " [str]\n", 296 | " \"\"\"\n", 297 | " return [LABEL.vocab.itos[get_numpy(batch.label[i])[0]] for i in range(len(batch.label))]" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [ 306 | "# to project our hidden embeddings to a visualizable space\n", 307 | "tsne = TSNE(perplexity=10.0, learning_rate=5.0, n_iter=2000)\n", 308 | "\n", 309 | "# index for each label\n", 310 | "colormap = {1: 'DodgerBlue', 2: 'FireBrick'}\n", 311 | "# create a tmp source to be updated later\n", 312 | "validation_set_size = len(validation_set)\n", 313 | "source = ColumnDataSource(data={'x': np.random.randn(validation_set_size),\n", 314 | " 'y': np.random.randn(validation_set_size),\n", 315 | " 'colors': ['green']*validation_set_size,\n", 316 | " 'sentences': [\"tmp\"]*validation_set_size,\n", 317 | " 'labels': [\"unk\"]*validation_set_size})\n", 318 | "# instance to define hover logic in plot\n", 319 | "hover = HoverTool(tooltips=[(\"Sentence\", \"@sentences\"), (\"Label\", \"@labels\")])\n", 320 | "\n", 321 | "# set up the bokeh figure for later visualizations\n", 322 | "p = figure(tools=[hover])\n", 323 | "p.circle(x='x', y='y', fill_color='colors', size=5, line_color=None, source=source)\n", 324 | "\n", 325 | "def update_plot(meta, layer, handle):\n", 326 | " \"\"\" Update existing plot\n", 327 | " \n", 328 | " Parameters\n", 329 | " ----------\n", 330 | " meta: dict\n", 331 | " 
layer: str\n", 332 | " \"\"\"\n", 333 | " tsne_acts = tsne.fit_transform(meta[layer])\n", 334 | " source.data['x'] = tsne_acts[:,0]\n", 335 | " source.data['y'] = tsne_acts[:,1]\n", 336 | " source.data['colors'] = [colormap[l] for l in meta['label_idx']]\n", 337 | " \n", 338 | " source.data['sentences'] = meta['sentences']\n", 339 | " source.data['labels'] = meta['labels']\n", 340 | " \n", 341 | " # this updates the given plot\n", 342 | " push_notebook(handle=handle)" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": {}, 348 | "source": [ 349 | "## Run the RNN model\n", 350 | "\n", 351 | "**Warning** this might take a while.\n", 352 | "Go get a cup of coffee, and enjoy the visualizations.\n", 353 | "\n", 354 | "Notice that each data point in the plot corresponds to an entire sentence in the validation set." 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "metadata": { 361 | "scrolled": false 362 | }, 363 | "outputs": [], 364 | "source": [ 365 | "max_iter = 25000\n", 366 | "eval_every = 1000\n", 367 | "log_every = 500\n", 368 | "tsne_every = eval_every * 5\n", 369 | "\n", 370 | "# will be updated while iterating\n", 371 | "tsne_plot = show(p, notebook_handle=True)\n", 372 | "\n", 373 | "train_loss, train_accs = [], []\n", 374 | "\n", 375 | "net.train()\n", 376 | "for i, batch in enumerate(train_iter):\n", 377 | " if i % eval_every == 0:\n", 378 | " net.eval()\n", 379 | " val_losses, val_accs, val_lengths = 0, 0, 0\n", 380 | " val_meta = {'label_idx': [], 'sentences': [], 'labels': []}\n", 381 | " for val_batch in val_iter:\n", 382 | " output = net(val_batch.text)\n", 383 | " # batches sizes might vary, which is why we cannot just mean the batch's loss\n", 384 | " # we multiply the loss and accuracies with the batch's size,\n", 385 | " # to later divide by the total size\n", 386 | " val_losses += criterion(output['out'], val_batch.label) * val_batch.batch_size\n", 387 | " val_accs += 
accuracy(output['out'], val_batch.label) * val_batch.batch_size\n", 388 | " val_lengths += val_batch.batch_size\n", 389 | " \n", 390 | " for key, _val in output.items():\n", 391 | " if key not in val_meta:\n", 392 | " val_meta[key] = []\n", 393 | " val_meta[key].append(get_numpy(_val)) \n", 394 | " val_meta['label_idx'].append(get_numpy(val_batch.label))\n", 395 | " val_meta['sentences'].append(construct_sentences(val_batch))\n", 396 | " val_meta['labels'].append(get_labels(val_batch))\n", 397 | " \n", 398 | " for key, _val in val_meta.items():\n", 399 | " val_meta[key] = np.concatenate(_val)\n", 400 | " \n", 401 | " # divide by the total accumulated batch sizes\n", 402 | " val_losses /= val_lengths\n", 403 | " val_accs /= val_lengths\n", 404 | " \n", 405 | " print(\"### EVAL loss: {:.2f} accs: {:.2f}\".format(get_numpy(val_losses)[0],\n", 406 | " get_numpy(val_accs)[0]))\n", 407 | " if i % tsne_every == 0:\n", 408 | " update_plot(val_meta, 'hidden', tsne_plot)\n", 409 | " \n", 410 | " net.train()\n", 411 | " \n", 412 | " output = net(batch.text)\n", 413 | " batch_loss = criterion(output['out'], batch.label)\n", 414 | " \n", 415 | " train_loss.append(get_numpy(batch_loss))\n", 416 | " train_accs.append(get_numpy(accuracy(output['out'], batch.label)))\n", 417 | " \n", 418 | " optimizer.zero_grad()\n", 419 | " batch_loss.backward()\n", 420 | " optimizer.step()\n", 421 | " \n", 422 | " if i % log_every == 0: \n", 423 | " print(\"train, it: {} loss: {:.2f} accs: {:.2f}\".format(i, \n", 424 | " np.mean(train_loss), \n", 425 | " np.mean(train_accs)))\n", 426 | " # reset\n", 427 | " train_loss, train_accs = [], []\n", 428 | " \n", 429 | " \n", 430 | " if max_iter < i:\n", 431 | " break" 432 | ] 433 | }, 434 | { 435 | "cell_type": "markdown", 436 | "metadata": {}, 437 | "source": [ 438 | "The above vanilla model should achieve below 80% accuracy in evaluation." 
439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "metadata": {}, 444 | "source": [ 445 | "# Assignments\n", 446 | "\n", 447 | "## Assignment 1\n", 448 | "\n", 449 | "Upgrade the model such that it is equivalent to the model presented in [Johansen & Socher](https://arxiv.org/abs/1712.05483).\n", 450 | "See figure A3 for an illustration.\n", 451 | "\n", 452 | "- Note, batch and sequence dimensions are flipped\n", 453 | "- A *projection layer* is a fully connected layer, which does not necessarily have a non-linearity\n", 454 | "- Notice that you also need to update the optimizer\n", 455 | "\n", 456 | "> **Goal** you should see evaluation accuracy around 86%" 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": {}, 462 | "source": [ 463 | "# Optional reading material and lab\n", 464 | "\n", 465 | "- follow [pytorch's seq2seq lab](http://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html) for a complete implementation of a seq2seq model\n", 466 | "\n", 467 | "also, you might want to read the following articles (which are also mentioned in the above lab):\n", 468 | "\n", 469 | "- [Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation](https://arxiv.org/abs/1406.1078)\n", 470 | "- [Sequence to Sequence Learning with Neural Networks](https://arxiv.org/abs/1409.3215)\n", 471 | "- [Neural Machine Translation by Jointly Learning to Align and Translate](https://arxiv.org/abs/1409.0473)\n", 472 | "- [A Neural Conversational Model](https://arxiv.org/abs/1506.05869)" 473 | ] 474 | } 475 | ], 476 | "metadata": { 477 | "kernelspec": { 478 | "display_name": "Python 3", 479 | "language": "python", 480 | "name": "python3" 481 | }, 482 | "language_info": { 483 | "codemirror_mode": { 484 | "name": "ipython", 485 | "version": 3 486 | }, 487 | "file_extension": ".py", 488 | "mimetype": "text/x-python", 489 | "name": "python", 490 | "nbconvert_exporter": "python", 491 | "pygments_lexer": 
"ipython3", 492 | "version": "3.6.4" 493 | } 494 | }, 495 | "nbformat": 4, 496 | "nbformat_minor": 2 497 | } 498 | -------------------------------------------------------------------------------- /2_intermediate/6.1-final_exam_SNLI.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Textual Entailment\n", 8 | "\n", 9 | "With Textual Entailment (TE) the goal is to take a pair of sentences and predict whether the facts in the first sentence necessarily imply the facts in the second one.\n", 10 | "\n", 11 | "> An **entailment** is a deduction or implication, that is, something that follows logically from or is implied by something else.\n", 12 | "\n", 13 | "### Contradiction example\n", 14 | "- **Sentence one:** Two women are wandering along the shore drinking iced tea.\n", 15 | "- **Sentence two:** Two women are sitting on a blanket near some rocks talking about politics.\n", 16 | "\n", 17 | "### Entailment example\n", 18 | "- **Sentence one:** An interplanetary spacecraft is in orbit around a gas giant's icy moon.\n", 19 | "- **Sentence two:** The spacecraft has the ability to travel between planets.\n", 20 | "\n", 21 | "### Neutral example\n", 22 | "- **Sentence one:** A large, gray elephant walked beside a herd of zebras.\n", 23 | "- **Sentence two:** The elephant was lost.\n", 24 | "\n", 25 | "## Stanford Natural Language Inference corpus\n", 26 | "\n", 27 | "With $570{,}152$ sentence pairs the Stanford Natural Language Inference (SNLI) corpus is a large collection of sentence pairs labeled for entailment, contradiction, and semantic independence. 
See [Bowman et al.](https://nlp.stanford.edu/pubs/snli_paper.pdf) for more info" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "# Final exam\n", 35 | "\n", 36 | "In this final exam, we expect you to implement three different variants of models that can solve the TE challenge.\n", 37 | "\n", 38 | "1. Follow [Bowman et al.](https://nlp.stanford.edu/pubs/snli_paper.pdf)'s solution to build a simple Bag of Words model\n", 39 | " - Accuracy goal: 70%\n", 40 | "2. Follow [Bowman et al.](https://nlp.stanford.edu/pubs/snli_paper.pdf) (same as above) to build a model with an LSTM RNN\n", 41 | " - Accuracy goal: 75%\n", 42 | "3. Follow [McCann et al.](https://arxiv.org/abs/1708.00107)'s Biattentive Classification Network (BCN) model elaborated in section five (this is advanced and time-consuming)\n", 43 | " - Accuracy goal: 80%" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "# The data loader\n", 51 | "\n", 52 | "`torchtext` has made a convenient data loader for the SNLI dataset.\n", 53 | "See https://github.com/pytorch/text/blob/master/test/snli.py for details on how to use it.\n", 54 | "\n", 55 | "For word vectors use `GloVe 840B` with 300 dimensions." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "# use torchtext's data loader for snli\n" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "# import stuff\n", 74 | "import torch" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "use_cuda = torch.cuda.is_available()\n", 84 | "\n", 85 | "def get_variable(x):\n", 86 | " \"\"\" Converts tensors to cuda, if available. 
\"\"\"\n", 87 | " return x.cuda() if use_cuda else x\n", 88 | "\n", 89 | "def get_numpy(x):\n", 90 | " \"\"\" Get numpy array for both cuda and not. \"\"\"\n", 91 | " return x.cpu().data.numpy() if use_cuda else x.data.numpy()" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "# Bag of Words model" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "# build the BoW model\n", 108 | "class BoWNet(nn.Module):\n", 109 | "\n", 110 | " def __init__(self):\n", 111 | " super(BoWNet, self).__init__()\n", 112 | " \n", 113 | " def forward(self, x):\n", 114 | " pass" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "# LSTM RNN model" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "# build the LSTM model\n", 131 | "class LSTMNet(nn.Module):\n", 132 | "\n", 133 | " def __init__(self):\n", 134 | " super(LSTMNet, self).__init__()\n", 135 | " \n", 136 | " def forward(self, x):\n", 137 | " pass" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "# BCN model" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "# build the BCN model\n", 154 | "class BCNNet(nn.Module):\n", 155 | "\n", 156 | " def __init__(self):\n", 157 | " super(BCNNet, self).__init__()\n", 158 | " \n", 159 | " def forward(self, x):\n", 160 | " pass" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "# Begin training" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "# define loss function, optimizer, and accuracy metric\n", 
177 | "# note, you might want to think about the model first\n" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "\n", 187 | "net = None # the net you want to use\n", 188 | "if use_cuda:\n", 189 | " net.cuda()" 190 | ] 191 | } 192 | ], 193 | "metadata": { 194 | "kernelspec": { 195 | "display_name": "Python 3", 196 | "language": "python", 197 | "name": "python3" 198 | }, 199 | "language_info": { 200 | "codemirror_mode": { 201 | "name": "ipython", 202 | "version": 3 203 | }, 204 | "file_extension": ".py", 205 | "mimetype": "text/x-python", 206 | "name": "python", 207 | "nbconvert_exporter": "python", 208 | "pygments_lexer": "ipython3", 209 | "version": "3.6.4" 210 | } 211 | }, 212 | "nbformat": 4, 213 | "nbformat_minor": 2 214 | } 215 | -------------------------------------------------------------------------------- /2_intermediate/README.md: -------------------------------------------------------------------------------- 1 | # Credits 2 | 3 | This is a remix from https://github.com/DeepLearningDTU/02456-deep-learning 4 | 5 | # Content 6 | 7 | These are the next steps in learning how to use PyTorch. 8 | The purpose is to get the audience familiar with building machine learning models in PyTorch. 9 | 10 | **NOTE** In these labs cuda is used extensively. 11 | If you have a cuda enabled machine, read the README.md in the root of this repo on how to use nvidia-docker. 12 | 13 | ## Lab 1: Feed Forward Network 14 | 15 | In this lab you will implement a simple feed forward neural network in PyTorch. 16 | 17 | ### Reading material 18 | 19 | - http://neuralnetworksanddeeplearning.com/chap1.html 20 | - http://neuralnetworksanddeeplearning.com/chap2.html 21 | 22 | ### 1.1-FFN 23 | 24 | Train a FFN classifier to solve the half moon problem. 25 | 26 | ### 1.2-FFN 27 | 28 | Train a FFN classifier to recognize handwritten digits. 
29 | 30 | ## Lab 2: Convolutional Neural Network 31 | 32 | In this lab you will implement a convolutional neural network in PyTorch. 33 | 34 | ### Reading material 35 | 36 | - http://cs231n.github.io/convolutional-networks/ 37 | - http://neuralnetworksanddeeplearning.com/chap6.html 38 | 39 | ### 2.1-CNN 40 | 41 | Train a CNN classifier to recognize handwritten digits. 42 | 43 | ### 2.2-CNN 44 | 45 | Use the cluttered MNIST dataset along with a Spatial Transformer Network to make a CNN classifier to recognize handwritten digits. 46 | 47 | This lab contains first real use of cuda support. 48 | 49 | ## Lab 3: Autoencoders 50 | 51 | Autoencoders and MNIST. 52 | 53 | ### Reading material 54 | 55 | - https://www.cs.toronto.edu/~hinton/science.pdf 56 | 57 | ### 3.1-AE 58 | 59 | Train an autoencoder on MNIST to encode a target given some input. 60 | 61 | ## Lab 4: Recurrent Neural Networks and Sequences 62 | 63 | This lab shows how sequences can be used with recurrent neural networks to learn to classify text. 64 | 65 | ### Reading material 66 | 67 | - https://www.youtube.com/watch?v=yCC09vCHzF8 68 | - http://karpathy.github.io/2015/05/21/rnn-effectiveness/ 69 | - https://einstein.ai/research/learning-when-to-skim-and-when-to-read 70 | - http://papers.nips.cc/paper/5021-distributed-representations-of-words-andphrases 71 | 72 | ### 4.1-Sequences 73 | 74 | Train a bag of words model to classify positive and negative sentences. 75 | 76 | ### 4.2-RNN 77 | 78 | Train a recurrent neural network to classify positive and negative sentences. 79 | Note this lab is very slow without a GPU. See `README.md` in root folder. 80 | 81 | ## Lab 5: Leaf Classification with Kaggle 82 | 83 | Combine all previous labs (FFN, CNN, RNN) to compete in the [leaf classification challenge](https://www.kaggle.com/c/leaf-classification). 84 | 85 | ### Reading material 86 | 87 | *Nothing.* 88 | 89 | ### 5.1-Kaggle 90 | 91 | Train different models and submit the results to Kaggle. 
def onehot(t, num_classes):
    """One-hot encode an array of class indices.

    Args:
        t: array (anything with ``shape`` that can be iterated) whose entries
            are class indices; every entry is converted with ``int()``.
        num_classes: width of the encoding.

    Returns:
        Array of shape ``(t.shape[0], num_classes)`` filled with zeros and a
        single 1 at ``[row, t[row]]`` for every row.
    """
    out = np.zeros((t.shape[0], num_classes))
    for row, col in enumerate(t):
        out[row, int(col)] = 1
    return out


class load_data():
    """Read the train/test CSV tables and their images into numpy arrays.

    The tables must have an ``id`` column; the training table additionally
    needs a ``species`` column.  The remaining columns are split into three
    feature groups (``margin``, ``shape``, ``texture``).
    NOTE(review): presumably this is the Kaggle leaf-classification data used
    by lab 5 -- confirm against the notebook that calls it.

    Public attributes after construction:
        le:    ``LabelEncoder`` fitted on the training ``species`` column.
        train: dict with ``images``, ``margins``, ``shapes``, ``textures``, ``ts``.
        test:  dict with ``images``, ``margins``, ``shapes``, ``textures``, ``ids``.
    """

    def __init__(self, train_path, test_path, image_paths, image_shape=(128, 128)):
        """Load both CSV files and every image they reference.

        Args:
            train_path: path of the training CSV.
            test_path: path of the test CSV.
            image_paths: iterable of image file paths named ``<id>.<ext>``.
            image_shape: size every image is resized to.
        """
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)
        self._load(train_df, test_df, image_paths, image_shape)

    def _load(self, train_df, test_df, image_paths, image_shape):
        """Build ``self.le``, ``self.train`` and ``self.test`` from the data frames."""
        # enumerate image paths: {numeric id -> path}
        path_dict = self._path_to_dict(image_paths)
        # attach the image path of every row to the data frames
        train_image_df = self._merge_image_df(train_df, path_dict)
        test_image_df = self._merge_image_df(test_df, path_dict)
        # label encoder-decoder (kept on self because it is needed later)
        self.le = LabelEncoder().fit(train_image_df['species'])
        # integer labels for the training rows
        t_train = self.le.transform(train_image_df['species'])
        # per-sample dictionaries
        train_data = self._make_dataset(train_image_df, image_shape, t_train)
        test_data = self._make_dataset(test_image_df, image_shape)
        # dense arrays are needed for the validation split in batch_generator
        self.train = self._format_dataset(train_data, for_train=True)
        self.test = self._format_dataset(test_data, for_train=False)

    def _path_to_dict(self, image_paths):
        """Map the integer file stem of every image path to the path itself.

        The extension is removed with ``os.path.splitext`` so it may have any
        length (the previous ``[:-4]`` slice only handled 3-letter extensions).
        """
        path_dict = dict()
        for image_path in image_paths:
            stem, _ext = os.path.splitext(os.path.basename(image_path))
            path_dict[int(stem)] = image_path
        return path_dict

    def _merge_image_df(self, df, path_dict):
        """Return ``df`` with a leading ``image`` column holding each row's path.

        The path is looked up per row through the ``id`` column and the new
        column shares ``df``'s index, so rows stay aligned even with duplicate
        ids or a non-default index.

        Raises:
            KeyError: if an id has no entry in ``path_dict``.
        """
        images = [path_dict[sample_id] for sample_id in df['id']]
        image_frame = pd.DataFrame({'image': images}, index=df.index)
        return pd.concat([image_frame, df], axis=1)

    def _make_dataset(self, df, image_shape, t_train=None):
        """Turn every row of ``df`` into a sample dict keyed by the row's id.

        Args:
            df: frame produced by ``_merge_image_df``.
            image_shape: target size passed to ``resize``.
            t_train: integer labels in row order, or ``None`` for unlabeled data.

        Returns:
            ``{id: {'margin', 'shape', 'texture', 'image'[, 't']}}``.
        """
        data = dict()
        has_labels = t_train is not None
        # columns that are not numeric features
        non_feature_cols = ['id', 'species', 'image'] if has_labels else ['id', 'image']
        for i, (index, row) in enumerate(df.iterrows()):
            features = row.drop(non_feature_cols, axis=0).values
            sample = dict()
            # the feature vector is three consecutive groups of 64 values
            sample['margin'] = features[:64]
            sample['shape'] = features[64:128]
            sample['texture'] = features[128:]
            if has_labels:
                sample['t'] = np.asarray(t_train[i], dtype='int32')
            # NOTE(review): newer scikit-image releases spell this keyword
            # `as_gray`; kept as-is to match the version the file was written for.
            image = imread(row['image'], as_grey=True)
            image = resize(image, output_shape=image_shape)
            # add a trailing channel axis
            image = np.expand_dims(image, axis=2)
            sample['image'] = image
            data[row['id']] = sample
            if i % 100 == 0:
                print("\t%d of %d" % (i, len(df)))
        return data

    def _format_dataset(self, df, for_train):
        """Stack the sample dicts of ``_make_dataset`` into dense arrays.

        Args:
            df: ``{id: sample}`` mapping (not a DataFrame, despite the name).
            for_train: when true the result holds labels (``ts``), otherwise
                the sample ids (``ids``).

        Returns:
            dict of float32 arrays ``images``, ``margins``, ``shapes``,
            ``textures`` plus an int32 ``ts`` or ``ids`` array.
        """
        data = dict()
        value = list(df.values())[0]
        img_tot_shp = tuple([len(df)] + list(value['image'].shape))
        data['images'] = np.zeros(img_tot_shp, dtype='float32')
        feature_tot_shp = (len(df), 64)
        data['margins'] = np.zeros(feature_tot_shp, dtype='float32')
        data['shapes'] = np.zeros(feature_tot_shp, dtype='float32')
        data['textures'] = np.zeros(feature_tot_shp, dtype='float32')
        if for_train:
            data['ts'] = np.zeros((len(df),), dtype='int32')
        else:
            data['ids'] = np.zeros((len(df),), dtype='int32')
        for i, (key, value) in enumerate(df.items()):
            data['images'][i] = value['image']
            data['margins'][i] = value['margin']
            data['shapes'][i] = value['shape']
            data['textures'][i] = value['texture']
            if for_train:
                data['ts'][i] = value['t']
            else:
                data['ids'][i] = key
        return data


class batch_generator():
    """Serve fixed-size batches from the arrays held by a ``load_data`` object."""

    def __init__(self, data, batch_size=64, num_classes=99,
                 num_iterations=5e3, num_features=64, seed=42, val_size=0.1):
        """Store the settings and split the training data into train/validation.

        Args:
            data: object exposing ``train`` and ``test`` dicts as built by
                ``load_data``.
            batch_size: number of rows per batch.
            num_classes: width of the one-hot targets.
            num_iterations: number of batches ``gen_train`` produces.
            num_features: width of each of the three feature groups.
            seed: random state of the validation split.
            val_size: fraction of the training data held out for validation.
        """
        self._train = data.train
        self._test = data.test
        # image size is taken from the first training image
        value = self._train['images'][0]
        self._image_shape = list(value.shape)
        self._batch_size = batch_size
        self._num_classes = num_classes
        self._num_iterations = num_iterations
        self._num_features = num_features
        self._seed = seed
        # honour the argument (it used to be overwritten with a literal 0.1)
        self._val_size = val_size
        self._valid_split()

    def _valid_split(self):
        """Create stratified train/validation index arrays over the training set.

        Prefers ``sklearn.model_selection``; falls back to the legacy
        ``sklearn.cross_validation`` class imported at the top of the file
        when the modern module is unavailable.  The top-of-file import still
        targets the legacy module and has to be updated separately for the
        file to import on scikit-learn releases that no longer ship it.
        """
        labels = self._train['ts']
        try:
            from sklearn.model_selection import StratifiedShuffleSplit as _ModernSplit
        except ImportError:
            _ModernSplit = None
        if _ModernSplit is not None:
            splitter = _ModernSplit(n_splits=1,
                                    test_size=self._val_size,
                                    random_state=self._seed)
            # only the labels matter for stratification; X is a placeholder
            splits = splitter.split(np.zeros(len(labels)), labels)
        else:
            splits = iter(StratifiedShuffleSplit(labels,
                                                 n_iter=1,
                                                 test_size=self._val_size,
                                                 random_state=self._seed))
        self._idcs_train, self._idcs_valid = next(splits)

    def _shuffle_train(self):
        """Shuffle the training indices in place."""
        np.random.shuffle(self._idcs_train)

    def _batch_init(self, purpose):
        """Return an empty batch dict for ``'train'``, ``'valid'`` or ``'test'``.

        Train/valid batches carry a ``ts`` one-hot target array, test batches
        carry an ``ids`` list instead.
        """
        assert purpose in ['train', 'valid', 'test']
        batch_holder = dict()
        batch_holder['margins'] = np.zeros((self._batch_size, self._num_features), dtype='float32')
        batch_holder['shapes'] = np.zeros((self._batch_size, self._num_features), dtype='float32')
        batch_holder['textures'] = np.zeros((self._batch_size, self._num_features), dtype='float32')
        batch_holder['images'] = np.zeros(tuple([self._batch_size] + self._image_shape), dtype='float32')
        if (purpose == "train") or (purpose == "valid"):
            batch_holder['ts'] = np.zeros((self._batch_size, self._num_classes), dtype='float32')
        else:
            batch_holder['ids'] = []
        return batch_holder

    def _copy_features(self, batch, i, source, idx):
        # Copy the feature vectors and image of sample `idx` of `source` into row `i` of `batch`.
        for key in ('margins', 'shapes', 'textures', 'images'):
            batch[key][i] = source[key][idx]

    def _copy_target(self, batch, i, idx):
        # Write the one-hot target of training sample `idx` into row `i` of `batch`.
        label = np.asarray([self._train['ts'][idx]], dtype='float32')
        batch['ts'][i] = onehot(label, self._num_classes)[0]

    def gen_valid(self):
        """Yield ``(batch, n_rows)`` over the validation split, in order.

        The last batch may be partial: only its first ``n_rows`` rows are
        filled, the rest stay zero.
        """
        batch = self._batch_init(purpose='valid')
        i = 0
        for idx in self._idcs_valid:
            self._copy_features(batch, i, self._train, idx)
            self._copy_target(batch, i, idx)
            i += 1
            if i >= self._batch_size:
                yield batch, i
                batch = self._batch_init(purpose='valid')
                i = 0
        if i != 0:
            yield batch, i

    def gen_test(self):
        """Yield ``(batch, n_rows)`` over the test set, in order.

        ``batch['ids']`` lists the ids of the filled rows; the last batch may
        be partial.
        """
        batch = self._batch_init(purpose='test')
        i = 0
        for idx in range(len(self._test['ids'])):
            self._copy_features(batch, i, self._test, idx)
            batch['ids'].append(self._test['ids'][idx])
            i += 1
            if i >= self._batch_size:
                yield batch, i
                batch = self._batch_init(purpose='test')
                i = 0
        if i != 0:
            yield batch, i

    def gen_train(self):
        """Yield full training batches until ``num_iterations`` have been produced.

        The training indices are reshuffled before every pass; a batch that
        is still incomplete at the end of a pass is continued with samples
        from the next one.  At least one batch is produced even when
        ``num_iterations`` is 0, because the limit is only checked after a
        batch has been yielded.  Nothing is yielded when there are no
        training indices.
        """
        if len(self._idcs_train) == 0:
            # nothing to draw from; avoid spinning forever
            return
        batch = self._batch_init(purpose='train')
        iteration = 0
        i = 0
        while True:
            # new pass: shuffle the sample order
            self._shuffle_train()
            for idx in self._idcs_train:
                self._copy_features(batch, i, self._train, idx)
                self._copy_target(batch, i, idx)
                i += 1
                if i >= self._batch_size:
                    yield batch
                    batch = self._batch_init(purpose='train')
                    i = 0
                    iteration += 1
                    if iteration >= self._num_iterations:
                        # stop the generator itself, not just the current pass
                        return
15 | -------------------------------------------------------------------------------- /Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM conda/miniconda3 2 | 3 | ENV LANG=C.UTF-8 \ 4 | LC_ALL=C.UTF-8 5 | 6 | RUN export CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" 7 | RUN conda install -y numpy pyyaml mkl setuptools cmake cffi 8 | RUN conda install -y -c pytorch magma-cuda80 9 | 10 | RUN apt-get update -y && \ 11 | apt-get install -y --no-install-recommends \ 12 | git \ 13 | wget \ 14 | build-essential \ 15 | && \ 16 | apt-get clean && \ 17 | rm -rf /var/lib/apt/lists/* 18 | 19 | RUN git clone --recursive https://github.com/pytorch/pytorch /tmp/pytorch 20 | RUN cd /tmp/pytorch/ && python3 setup.py install 21 | 22 | RUN pip install https://github.com/pytorch/text/archive/master.zip 23 | 24 | RUN conda install -y torchvision -c pytorch 25 | RUN conda install -y jupyter matplotlib scikit-learn nltk bokeh scikit-image 26 | 27 | WORKDIR /work 28 | CMD ["bash"] 29 | -------------------------------------------------------------------------------- /Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:8.0-devel-ubuntu16.04 2 | 3 | ARG CONDA_DIR=/opt/conda 4 | ARG CONDA_DOWNLOAD_SCRIPT_URL=https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh 5 | 6 | ENV LANG=C.UTF-8 \ 7 | LC_ALL=C.UTF-8 \ 8 | PATH=$CONDA_DIR/bin:$PATH 9 | 10 | RUN apt-get update -y && \ 11 | apt-get install -y --no-install-recommends \ 12 | git \ 13 | wget \ 14 | ca-certificates \ 15 | build-essential \ 16 | && \ 17 | apt-get clean && \ 18 | rm -rf /var/lib/apt/lists/* 19 | 20 | RUN wget $CONDA_DOWNLOAD_SCRIPT_URL -qO /tmp/miniconda.sh && \ 21 | /bin/bash /tmp/miniconda.sh -b -p $CONDA_DIR && \ 22 | rm -rf /tmp/* 23 | 24 | RUN export CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" 25 | RUN conda install -y numpy pyyaml mkl setuptools cmake cffi 26 | RUN conda install 
-y -c pytorch magma-cuda80 27 | 28 | RUN git clone --recursive https://github.com/pytorch/pytorch /tmp/pytorch 29 | RUN cd /tmp/pytorch/ && python3 setup.py install 30 | 31 | RUN pip install https://github.com/pytorch/text/archive/master.zip 32 | 33 | RUN conda install -y torchvision -c pytorch 34 | RUN conda install -y jupyter matplotlib scikit-learn nltk bokeh scikit-image 35 | 36 | WORKDIR /work 37 | CMD ["bash"] 38 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Most complete PyTorch and NLP tutorial in existence 2 | 3 | This repo is a remix of the most popular online courses starting from applied deep learning and increasingly moving into more theoretical content with a pytorch translation of Stanford NLP's CS224d 2017 course. 4 | This course is a natural extension of fast.ai's 2018 deep learning tutorial. 5 | Lastly, the tutorial encompasses the AllenNLP library for building advanced deep learning systems and researching in NLP. 6 | 7 | Among the courses whose content is remixed are 8 | 9 | - [PyTorch Tutorial](https://github.com/pytorch/tutorials) 10 | - [Technical University of Denmark's Deep Learning tutorial](https://github.com/DeepLearningDTU/02456-deep-learning) 11 | - [Stanford's CS224 Deep Learning for NLP](http://cs224d.stanford.edu/) 12 | 13 | **Estimated time for completion of all content, given prerequisites are satisfied** 2 months (full time). 14 | 15 | # Prerequisites 16 | 17 | Prerequisites for these tutorials are an understanding of linear algebra and Python. 18 | If you do not understand linear algebra, we recommend one of the two options below. 19 | 20 | ## Linear Algebra 21 | 22 | 1. Crash course (often sufficient for engineers) 23 | - Andrew Ng's [linear algebra crash course](https://www.coursera.org/learn/machine-learning/lecture/38jIT/matrices-and-vectors). 24 | Watch all the videos, which should be around an hour total. 25 | 2. 
Thorough introduction (recommended for researchers) 26 | - [Khan Academy's Precalculus](https://www.khanacademy.org/math/precalculus) 27 | - note, if you do not have prerequisites for the precalculus, you can start from scratch with Khan Academy 28 | - [Single Variable Calculus](https://ocw.mit.edu/courses/mathematics/18-01sc-single-variable-calculus-fall-2010/) 29 | - [Multivariable Calculus](https://ocw.mit.edu/courses/mathematics/18-02sc-multivariable-calculus-fall-2010/) 30 | - [Linear Algebra](https://ocw.mit.edu/courses/mathematics/18-06sc-linear-algebra-fall-2011/) 31 | 32 | Going through the thorough introduction to linear algebra from scratch takes about 2 months (full time). 33 | The crash course should take about 1 day to complete. 34 | 35 | ## Python 36 | 37 | 1. Crash course 38 | - https://learnxinyminutes.com/docs/python3/ 39 | - https://www.codecademy.com/learn/learn-python 40 | 2. Thorough introduction 41 | - [Introduction to Computer Science and Programming in Python](https://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-0001-introduction-to-computer-science-and-programming-in-python-fall-2016/) 42 | - [Introduction to Algorithms](https://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-006-introduction-to-algorithms-fall-2011/) 43 | 44 | Going through the thorough introduction to computer science from scratch takes about 1 month (full time). 45 | The crash course should take about 1 day to complete. 46 | 47 | # Docker 48 | 49 | To run the labs we will use Docker, which is elaborated below. 50 | 51 | See https://docs.docker.com/install/linux/docker-ce/ubuntu/#install-docker-ce on how to install `docker`. 52 | 53 | ## Build image 54 | 55 | ### CPU 56 | 57 | ``` 58 | docker build -t munkai/pytorch:cpu -f Dockerfile.cpu . 59 | ``` 60 | 61 | ### GPU 62 | 63 | Also available on dockerhub, so simply use the `docker run` command to fetch the remote version. 
64 | 65 | ``` 66 | docker build -t munkai/pytorch:gpu -f Dockerfile.gpu . 67 | ``` 68 | 69 | ## Start container 70 | 71 | ``` 72 | docker run -it -p 8888:8888 -v `pwd`:/work munkai/pytorch:cpu ./jupyter_run.sh 73 | ``` 74 | 75 | ## Running docker with a CUDA-enabled machine 76 | 77 | You need Nvidia and nvidia-docker installed for this. 78 | 79 | ``` 80 | nvidia-docker run -it -p 8888:8888 -v `pwd`:/work munkai/pytorch:gpu ./jupyter_run.sh 81 | ``` 82 | 83 | ### Install Nvidia 84 | 85 | Make sure you have Nvidia's drivers installed for your system. 86 | Following is install instructions for ubuntu 16.04 87 | 88 | ``` 89 | DISTRO=ubuntu 90 | VERSION=1604 91 | ARCH=x86_64 92 | sudo apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/${DISTRO}${VERSION}/${ARCH}/7fa2af80.pub 93 | sudo sh -c 'echo "deb http://developer.download.nvidia.com/compute/cuda/repos/${DISTRO}${VERSION}/${ARCH}/" > /etc/apt/sources.list.d/cuda.list' 94 | sudo apt-get update && sudo apt-get install -y --no-install-recommends cuda-drivers 95 | ``` 96 | 97 | ### Install nvidia-docker 98 | 99 | See https://github.com/NVIDIA/nvidia-docker on how to install `nvidia-docker`. 100 | -------------------------------------------------------------------------------- /jupyter_run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | #============ 3 | # FILE: jupyter_run.sh 4 | # USAGE: ./jupyter_run.sh 5 | # DESCRIPTION: Start jupyter notebook. Note the commands used. 
6 | # AUTHOR: Elias Obeid, elias@munk.ai 7 | # REQUIREMENT: 8 | #============ 9 | 10 | jupyter notebook --ip=0.0.0.0 --allow-root 11 | -------------------------------------------------------------------------------- /static_files/autograd-variable.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/munkai/pytorch-tutorial/f4cadc166fa365257c7b94a3e4ceedf930dc762e/static_files/autograd-variable.png -------------------------------------------------------------------------------- /static_files/cifar10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/munkai/pytorch-tutorial/f4cadc166fa365257c7b94a3e4ceedf930dc762e/static_files/cifar10.png -------------------------------------------------------------------------------- /static_files/lstm_cell.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/munkai/pytorch-tutorial/f4cadc166fa365257c7b94a3e4ceedf930dc762e/static_files/lstm_cell.png -------------------------------------------------------------------------------- /static_files/mnist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/munkai/pytorch-tutorial/f4cadc166fa365257c7b94a3e4ceedf930dc762e/static_files/mnist.png -------------------------------------------------------------------------------- /static_files/rnn-unfold.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/munkai/pytorch-tutorial/f4cadc166fa365257c7b94a3e4ceedf930dc762e/static_files/rnn-unfold.png --------------------------------------------------------------------------------