├── README.md ├── pytorch ├── images │ ├── detach.png │ ├── Variable.png │ ├── dcgan_d.png │ ├── dcgan_g.png │ └── dynamic_graph.gif ├── data │ └── nmt_license.txt ├── 3. Introduction to the Torch Neural Network Library.ipynb ├── 1. The Torch Tensor Library and Basic Operations.ipynb ├── 2. Autograd.ipynb ├── 5. Neural Machine Translation.ipynb └── 4. Image Classification with Convnets and ResNets.ipynb ├── LICENSE └── tensorflow ├── tensorflow_exercise_solution.py └── tensorflow.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # welcome_tutorials 2 | Various tutorials given for welcoming new students at MILA. 3 | -------------------------------------------------------------------------------- /pytorch/images/detach.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mila-iqia/welcome_tutorials/HEAD/pytorch/images/detach.png -------------------------------------------------------------------------------- /pytorch/images/Variable.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mila-iqia/welcome_tutorials/HEAD/pytorch/images/Variable.png -------------------------------------------------------------------------------- /pytorch/images/dcgan_d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mila-iqia/welcome_tutorials/HEAD/pytorch/images/dcgan_d.png -------------------------------------------------------------------------------- /pytorch/images/dcgan_g.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mila-iqia/welcome_tutorials/HEAD/pytorch/images/dcgan_g.png -------------------------------------------------------------------------------- /pytorch/images/dynamic_graph.gif: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mila-iqia/welcome_tutorials/HEAD/pytorch/images/dynamic_graph.gif -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 mila-udem 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tensorflow/tensorflow_exercise_solution.py: -------------------------------------------------------------------------------- 1 | # Avoid polluting the default graph by using an alternate graph 2 | with tf.Graph().as_default(): 3 | tf.set_random_seed(1234) 4 | 5 | # Create two scalar variables, x and y, initialized at random. 
6 | x = tf.get_variable(name='x', shape=[], dtype=tf.float32, 7 | initializer=tf.random_normal_initializer()) 8 | y = tf.get_variable(name='y', shape=[], dtype=tf.float32, 9 | initializer=tf.random_normal_initializer()) 10 | 11 | # Create a tensor z whose value represents the expression 12 | # 2(x - 2)^2 + 2(y + 3)^2 13 | z = 2 * (x - 2) ** 2 + 2 * (y + 3) ** 2 14 | 15 | # Compute the gradients of z with respect to x and y. 16 | dx, dy = tf.gradients(z, [x, y]) 17 | 18 | # Create an assignment expression for x using the update rule 19 | # x <- x - 0.1 * dz/dx 20 | # and do the same for y. 21 | x_update = tf.assign_sub(x, 0.1 * dx) 22 | y_update = tf.assign_sub(y, 0.1 * dy) 23 | 24 | with tf.Session() as session: 25 | # Run the global initializer op for x and y. 26 | session.run(tf.global_variables_initializer()) 27 | 28 | for _ in range(10): 29 | # Run the update ops for x and y. 30 | session.run([x_update, y_update]) 31 | 32 | # Retrieve the values for x, y, and z, and print them. 33 | x_val, y_val, z_val = session.run([x, y, z]) 34 | print('x = {:4.2f}, y = {:4.2f}, z = {:4.2f}'.format(x_val, y_val, z_val)) 35 | -------------------------------------------------------------------------------- /pytorch/data/nmt_license.txt: -------------------------------------------------------------------------------- 1 | ** Info ** 2 | 3 | Check for newest version here: 4 | http://www.manythings.org/anki/ 5 | Date of this file: 6 | 2017-10-30 7 | 8 | This data is from the sentences_detailed.csv file from tatoeba.org. 9 | http://tatoeba.org/files/downloads/sentences_detailed.csv 10 | 11 | 12 | 13 | ** Terms of Use ** 14 | 15 | See the terms of use. 16 | These files have been released under the same license as the 17 | source. 18 | 19 | http://tatoeba.org/eng/terms_of_use 20 | http://creativecommons.org/licenses/by/2.0 21 | 22 | Attribution: www.manythings.org/anki and tatoeba.org 23 | 24 | 25 | 26 | ** Warnings ** 27 | 28 | The data from the Tatoeba Project contains errors. 
29 | 30 | To lower the number of errors you are likely to see, only 31 | sentences by native speakers and proofread sentences have 32 | been included. 33 | 34 | For the non-English language, I made these (possibly wrong) 35 | assumptions. 36 | Assumption 1: Sentences written by native speakers can be 37 | trusted. 38 | Assumption 2: Contributors to the Tatoeba Project are honest 39 | about what their native language is. 40 | 41 | For English, I used the sentences that I have proofread 42 | and thought were OK. 43 | Of course, I may have missed a few errors. 44 | 45 | 46 | 47 | ** Downloading Anki ** 48 | 49 | See http://ankisrs.net/ 50 | 51 | 52 | 53 | ** Importing into Anki ** 54 | 55 | Information is at http://ankisrs.net/docs/manual.html#importing 56 | 57 | Of particular interest may be about "duplicates" at http://ankisrs.net/docs/manual.html#duplicates-and-updating. 58 | You can choose: 59 | 1. not to allow duplicates (alternate translations) as cards. 60 | 2. allow duplicates (alternate translations) as cards. 61 | -------------------------------------------------------------------------------- /pytorch/3. 
Introduction to the Torch Neural Network Library.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## PyTorch Tutorial\n", 8 | "MILA, November 2017\n", 9 | "\n", 10 | "By Sandeep Subramanian" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "## An introduction to the PyTorch neural network library\n", 18 | "\n", 19 | "### `torch.nn` & `torch.optim`" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 37, 25 | "metadata": { 26 | "collapsed": true 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "import numpy as np\n", 31 | "from __future__ import print_function" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 38, 37 | "metadata": { 38 | "collapsed": true 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "import torch\n", 43 | "import torch.nn as nn\n", 44 | "import torch.optim as optim\n", 45 | "import torch.nn.init as init\n", 46 | "import torch.nn.functional as F\n", 47 | "from torch.autograd import Variable" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "### torch.nn\n", 55 | "\n", 56 | "Neural networks can be constructed using the `torch.nn` package.\n", 57 | "\n", 58 | "Provides pretty much all neural network related functionalities such as :\n", 59 | "\n", 60 | "1. Linear layers - `nn.Linear`, `nn.Bilinear`\n", 61 | "2. Convolution Layers - `nn.Conv1d`, `nn.Conv2d`, `nn.Conv3d`, `nn.ConvTranspose2d`\n", 62 | "3. Nonlinearities - `nn.Sigmoid`, `nn.Tanh`, `nn.ReLU`, `nn.LeakyReLU`\n", 63 | "4. Pooling Layers - `nn.MaxPool1d`, `nn.AvgPool2d`\n", 64 | "5. Recurrent Networks - `nn.LSTM`, `nn.GRU`\n", 65 | "6. Normalization - `nn.BatchNorm2d`\n", 66 | "7. Dropout - `nn.Dropout`, `nn.Dropout2d`\n", 67 | "8. Embedding - `nn.Embedding`\n", 68 | "9. 
Loss Functions - `nn.MSELoss`, `nn.CrossEntropyLoss`, `nn.NLLLoss`\n", 69 | "\n", 70 | "Instances of these classes have a built-in `__call__` method, so an instance can be called directly to run an input through the layer." 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "### Linear, Bilinear & Nonlinearities" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 39, 83 | "metadata": { 84 | "collapsed": false 85 | }, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "Linear output size : torch.Size([32, 20])\n", 92 | "Bilinear output size : torch.Size([32, 50])\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "x = Variable(torch.randn(32, 10))\n", 98 | "y = Variable(torch.randn(32, 30))\n", 99 | "\n", 100 | "sigmoid = nn.Sigmoid()\n", 101 | "\n", 102 | "linear = nn.Linear(in_features=10, out_features=20, bias=True)\n", 103 | "output_linear = linear(x)\n", 104 | "print('Linear output size : ', output_linear.size())\n", 105 | "\n", 106 | "bilinear = nn.Bilinear(in1_features=10, in2_features=30, out_features=50, bias=True)\n", 107 | "output_bilinear = bilinear(x, y)\n", 108 | "print('Bilinear output size : ', output_bilinear.size())" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "### Convolution, BatchNorm & Pooling Layers" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 40, 121 | "metadata": { 122 | "collapsed": false 123 | }, 124 | "outputs": [ 125 | { 126 | "name": "stdout", 127 | "output_type": "stream", 128 | "text": [ 129 | "Conv output size : torch.Size([10, 32, 28, 28])\n", 130 | "Pool output size : torch.Size([10, 32, 14, 14])\n" 131 | ] 132 | } 133 | ], 134 | "source": [ 135 | "x = Variable(torch.randn(10, 3, 28, 28))\n", 136 | "\n", 137 | "conv = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=(3, 3), stride=1, padding=1, bias=True)\n", 138 | "bn = 
nn.BatchNorm2d(num_features=32)\n", 139 | "pool = nn.MaxPool2d(kernel_size=(2, 2), stride=2)\n", 140 | "\n", 141 | "output_conv = bn(conv(x))\n", 142 | "outpout_pool = pool(conv(x))\n", 143 | "\n", 144 | "print('Conv output size : ', output_conv.size())\n", 145 | "print('Pool output size : ', outpout_pool.size())" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "### Recurrent, Embedding & Dropout Layers" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 41, 158 | "metadata": { 159 | "collapsed": false 160 | }, 161 | "outputs": [ 162 | { 163 | "name": "stdout", 164 | "output_type": "stream", 165 | "text": [ 166 | "Embedding size : torch.Size([5, 3, 20])\n", 167 | "GRU hidden states size : torch.Size([5, 3, 100])\n", 168 | "GRU last hidden state size : torch.Size([4, 5, 50])\n" 169 | ] 170 | } 171 | ], 172 | "source": [ 173 | "inputs = [[1, 2, 3], [1, 0, 4], [1, 2, 4], [1, 4, 0], [1, 3, 3]]\n", 174 | "x = Variable(torch.LongTensor(inputs))\n", 175 | "\n", 176 | "embedding = nn.Embedding(num_embeddings=5, embedding_dim=20, padding_idx=1)\n", 177 | "drop = nn.Dropout(p=0.5)\n", 178 | "gru = nn.GRU(input_size=20, hidden_size=50, num_layers=2, batch_first=True, bidirectional=True, dropout=0.3)\n", 179 | "\n", 180 | "emb = drop(embedding(x))\n", 181 | "gru_h, gru_h_t = gru(emb)\n", 182 | "\n", 183 | "print('Embedding size : ', emb.size())\n", 184 | "print('GRU hidden states size : ', gru_h.size())\n", 185 | "print('GRU last hidden state size : ', gru_h_t.size())" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "### torch.nn.functional\n", 193 | "\n", 194 | "Using the above classes requires defining an instance of the class and then running inputs through the instance.\n", 195 | "\n", 196 | "The functional API provides users a way to use these classes in a `functional` way. 
Such as\n", 197 | "\n", 198 | "`import torch.nn.functional as F`\n", 199 | "\n", 200 | "1. Linear layers - `F.linear(input=x, weight=W, bias=b)`\n", 201 | "2. Convolution Layers - `F.conv2d(input=x, weight=W, bias=b, stride=1, padding=0, dilation=1, groups=1)`\n", 202 | "3. Nonlinearities - `F.sigmoid(x), F.tanh(x), F.relu(x), F.softmax(x)`\n", 203 | "4. Dropout - `F.dropout(x, p=0.5, training=True)`" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "### A few examples of the functional API" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 42, 216 | "metadata": { 217 | "collapsed": false 218 | }, 219 | "outputs": [ 220 | { 221 | "name": "stdout", 222 | "output_type": "stream", 223 | "text": [ 224 | "Conv output size : torch.Size([10, 32, 28, 28])\n" 225 | ] 226 | } 227 | ], 228 | "source": [ 229 | "x = Variable(torch.randn(10, 3, 28, 28))\n", 230 | "filters = Variable(torch.randn(32, 3, 3, 3))\n", 231 | "conv_out = F.relu(F.dropout(F.conv2d(input=x, weight=filters, padding=1), p=0.5, training=True))\n", 232 | "\n", 233 | "print('Conv output size : ', conv_out.size())" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "### torch.nn.init\n", 241 | "\n", 242 | "Provides a set of functions for standard weight initialization techniques\n", 243 | "\n", 244 | "`import torch.nn.init as init`\n", 245 | "\n", 246 | "1. Calculate the gain of a layer based on the activation function - `init.calculate_gain('sigmoid')`\n", 247 | "2. Uniform init - `init.uniform(tensor, low, high)`\n", 248 | "3. Xavier uniform - `init.xavier_uniform(tensor, gain=init.calculate_gain('sigmoid'))`\n", 249 | "4. Xavier normal - `init.xavier_normal(tensor, gain=init.calculate_gain('tanh'))`\n", 250 | "5. Orthogonal - `init.orthogonal(tensor, gain=init.calculate_gain('tanh'))`\n", 251 | "6. 
Kaiming normal - `init.kaiming_normal(tensor, mode='fan_in')`" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "### Initializing convolution kernels" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 43, 264 | "metadata": { 265 | "collapsed": false 266 | }, 267 | "outputs": [], 268 | "source": [ 269 | "conv_layer = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=(3, 3), padding=1)\n", 270 | "for k,v in conv_layer.named_parameters():\n", 271 | " if k == 'weight':\n", 272 | " init.kaiming_normal(v)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "### torch.optim\n", 280 | "\n", 281 | "Provides implementations of standard stochastic optimization techniques\n", 282 | "\n", 283 | "`import torch.optim as optim`\n", 284 | "\n", 285 | " W1 = Variable(torch.randn(10, 20), requires_grad=True)\n", 286 | " W2 = Variable(torch.randn(10, 20), requires_grad=True)\n", 287 | "\n", 288 | "1. SGD - `optim.SGD([W1, W2], lr=0.01, momentum=0.9, dampening=0, weight_decay=1e-2, nesterov=True)`\n", 289 | "2. Adam - `optim.Adam([W1, W2], lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)`\n", 290 | "\n", 291 | "#### Learning Rate Scheduling\n", 292 | "\n", 293 | "`optim.lr_scheduler`\n", 294 | "\n", 295 | "1. `optim.lr_scheduler.MultiStepLR(optimizer, milestones=[30,80], gamma=0.1)`\n", 296 | "2. 
`optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, verbose=True, threshold=1e-04, threshold_mode='rel', min_lr=1e-05, eps=1e-08)`" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": { 302 | "collapsed": true 303 | }, 304 | "source": [ 305 | "### We'll look at how to use `torch.optim` in the following tutorial" 306 | ] 307 | } 308 | ], 309 | "metadata": { 310 | "kernelspec": { 311 | "display_name": "Python 2", 312 | "language": "python", 313 | "name": "python2" 314 | }, 315 | "language_info": { 316 | "codemirror_mode": { 317 | "name": "ipython", 318 | "version": 2 319 | }, 320 | "file_extension": ".py", 321 | "mimetype": "text/x-python", 322 | "name": "python", 323 | "nbconvert_exporter": "python", 324 | "pygments_lexer": "ipython2", 325 | "version": "2.7.13" 326 | } 327 | }, 328 | "nbformat": 4, 329 | "nbformat_minor": 2 330 | } 331 | -------------------------------------------------------------------------------- /pytorch/1. The Torch Tensor Library and Basic Operations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## PyTorch Tutorial\n", 8 | "MILA, November 2017\n", 9 | "\n", 10 | "By Sandeep Subramanian" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "## 1. 
Introduction to the torch tensor library\n", 18 | "### Torch's numpy equivalent with GPU support" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": { 25 | "collapsed": true 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "import numpy as np\n", 30 | "from __future__ import print_function" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": { 37 | "collapsed": true 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "import torch" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "### Create an uninitialized tensor" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 4, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "text/plain": [ 61 | "\n", 62 | " 2.4878e+04 4.5692e-41 2.4878e+04\n", 63 | " 4.5692e-41 -2.9205e+19 4.5691e-41\n", 64 | " 1.2277e-02 4.5692e-41 -4.0170e+19\n", 65 | " 4.5691e-41 1.2277e-02 4.5692e-41\n", 66 | " 0.0000e+00 0.0000e+00 0.0000e+00\n", 67 | "[torch.FloatTensor of size 5x3]" 68 | ] 69 | }, 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "output_type": "execute_result" 73 | } 74 | ], 75 | "source": [ 76 | "torch.Tensor(5, 3)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "### Initialize from a uniform distribution" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 5, 89 | "metadata": { 90 | "collapsed": false 91 | }, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "\n", 97 | "-0.2767 -0.1082 -0.1339\n", 98 | "-0.6477 0.3098 0.1642\n", 99 | "-0.1125 -0.2104 0.8962\n", 100 | "-0.6573 0.9669 -0.3806\n", 101 | " 0.8008 -0.3860 0.6816\n", 102 | "[torch.FloatTensor of size 5x3]" 103 | ] 104 | }, 105 | "execution_count": 5, 106 | "metadata": {}, 107 | "output_type": "execute_result" 108 | } 109 | ], 110 | "source": [ 111 | "torch.Tensor(5, 3).uniform_(-1, 1)" 112 | ] 113 | }, 114 
| { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "### Get its shape" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 6, 124 | "metadata": { 125 | "collapsed": false 126 | }, 127 | "outputs": [ 128 | { 129 | "name": "stdout", 130 | "output_type": "stream", 131 | "text": [ 132 | "torch.Size([5, 3])\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "x = torch.Tensor(5, 3).uniform_(-1, 1)\n", 138 | "print(x.size())" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "### Tensor Types\n", 146 | "source: http://pytorch.org/docs/master/tensors.html" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "|Data type |Tensor|\n", 154 | "|----------|------|\n", 155 | "|32-bit floating point|\ttorch.FloatTensor|\n", 156 | "|64-bit floating point|\ttorch.DoubleTensor|\n", 157 | "|16-bit floating point|\ttorch.HalfTensor|\n", 158 | "|8-bit integer (unsigned)|torch.ByteTensor|\n", 159 | "|8-bit integer (signed)|torch.CharTensor|\n", 160 | "|16-bit integer (signed)|torch.ShortTensor|\n", 161 | "|32-bit integer (signed)|torch.IntTensor|\n", 162 | "|64-bit integer (signed)|torch.LongTensor|" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "### Creation from lists & numpy" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 7, 175 | "metadata": { 176 | "collapsed": false 177 | }, 178 | "outputs": [ 179 | { 180 | "name": "stdout", 181 | "output_type": "stream", 182 | "text": [ 183 | "torch.LongTensor\n", 184 | "int64\n" 185 | ] 186 | } 187 | ], 188 | "source": [ 189 | "z = torch.LongTensor([[1, 3], [2, 9]])\n", 190 | "print(z.type())\n", 191 | "# Cast to numpy ndarray\n", 192 | "print(z.numpy().dtype)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 8, 198 | "metadata": { 199 | "collapsed": false 200 | }, 201 | 
"outputs": [ 202 | { 203 | "name": "stdout", 204 | "output_type": "stream", 205 | "text": [ 206 | "torch.DoubleTensor\n", 207 | "torch.FloatTensor\n" 208 | ] 209 | } 210 | ], 211 | "source": [ 212 | "# Data type inferred from numpy\n", 213 | "print(torch.from_numpy(np.random.rand(5, 3)).type())\n", 214 | "print(torch.from_numpy(np.random.rand(5, 3).astype(np.float32)).type())" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "### Simple mathematical operations" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 9, 227 | "metadata": { 228 | "collapsed": false 229 | }, 230 | "outputs": [ 231 | { 232 | "name": "stdout", 233 | "output_type": "stream", 234 | "text": [ 235 | "\n", 236 | " 0.2200 -0.0368 0.4494\n", 237 | "-0.2577 -0.0343 0.1587\n", 238 | "-0.7503 -0.1729 0.0453\n", 239 | " 0.9296 -0.1067 -0.6402\n", 240 | "-0.3276 0.0158 -0.0552\n", 241 | "[torch.FloatTensor of size 5x3]\n", 242 | "\n" 243 | ] 244 | } 245 | ], 246 | "source": [ 247 | "y = x * torch.randn(5, 3)\n", 248 | "print(y)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 10, 254 | "metadata": { 255 | "collapsed": false 256 | }, 257 | "outputs": [ 258 | { 259 | "name": "stdout", 260 | "output_type": "stream", 261 | "text": [ 262 | "\n", 263 | " 0.2820 -0.1633 -4.4346\n", 264 | "-1.6809 0.2066 -0.8261\n", 265 | "-0.6464 0.9758 0.2542\n", 266 | " 0.5789 0.1890 -0.4662\n", 267 | " 5.3183 0.0236 -0.1403\n", 268 | "[torch.FloatTensor of size 5x3]\n", 269 | "\n" 270 | ] 271 | } 272 | ], 273 | "source": [ 274 | "y = x / torch.sqrt(torch.randn(5, 3) ** 2)\n", 275 | "print(y)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "### Broadcasting" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 11, 288 | "metadata": { 289 | "collapsed": false 290 | }, 291 | "outputs": [ 292 | { 293 | "name": "stdout", 294 | "output_type": 
"stream", 295 | "text": [ 296 | "torch.Size([5, 3])\n", 297 | "\n", 298 | " 0.1919 -0.5006 -1.2410\n", 299 | "-0.8080 0.1407 -0.6193\n", 300 | "-1.6629 -0.1580 -0.3921\n", 301 | " 1.0395 0.7069 -0.1459\n", 302 | " 1.9027 1.4343 1.2299\n", 303 | "[torch.FloatTensor of size 5x3]\n", 304 | "\n" 305 | ] 306 | } 307 | ], 308 | "source": [ 309 | "print (x.size())\n", 310 | "y = x + torch.randn(5, 1)\n", 311 | "print(y)" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": { 317 | "collapsed": true 318 | }, 319 | "source": [ 320 | "### Reshape" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 12, 326 | "metadata": { 327 | "collapsed": false 328 | }, 329 | "outputs": [ 330 | { 331 | "name": "stdout", 332 | "output_type": "stream", 333 | "text": [ 334 | "torch.Size([5, 10, 15])\n", 335 | "torch.Size([50, 15])\n", 336 | "torch.Size([50, 1, 15])\n", 337 | "torch.Size([50, 15])\n", 338 | "\n", 339 | "torch.Size([10, 5, 15])\n", 340 | "torch.Size([5, 15, 10])\n", 341 | "torch.Size([10, 15, 5])\n", 342 | "torch.Size([10, 15, 5])\n" 343 | ] 344 | } 345 | ], 346 | "source": [ 347 | "y = torch.randn(5, 10, 15)\n", 348 | "print(y.size())\n", 349 | "print(y.view(-1, 15).size()) # Same as doing y.view(50, 15)\n", 350 | "print(y.view(-1, 15).unsqueeze(1).size()) # Adds a dimension at index 1.\n", 351 | "print(y.view(-1, 15).unsqueeze(1).squeeze().size())\n", 352 | "# If input is of shape: (Ax1xBxCx1xD)(Ax1xBxCx1xD) then the out Tensor will be of shape: (AxBxCxD)(AxBxCxD)\n", 353 | "print()\n", 354 | "print(y.transpose(0, 1).size())\n", 355 | "print(y.transpose(1, 2).size())\n", 356 | "print(y.transpose(0, 1).transpose(1, 2).size())\n", 357 | "print(y.permute(1, 2, 0).size())" 358 | ] 359 | }, 360 | { 361 | "cell_type": "markdown", 362 | "metadata": {}, 363 | "source": [ 364 | "### Repeat" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 13, 370 | "metadata": { 371 | "collapsed": false 372 | }, 373 | "outputs": 
[ 374 | { 375 | "name": "stdout", 376 | "output_type": "stream", 377 | "text": [ 378 | "torch.Size([50, 100, 15])\n", 379 | "torch.Size([50, 100, 15])\n" 380 | ] 381 | } 382 | ], 383 | "source": [ 384 | "print(y.view(-1, 15).unsqueeze(1).expand(50, 100, 15).size())\n", 385 | "print(y.view(-1, 15).unsqueeze(1).expand_as(torch.randn(50, 100, 15)).size())" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": {}, 391 | "source": [ 392 | "### Concatenate" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 14, 398 | "metadata": { 399 | "collapsed": false 400 | }, 401 | "outputs": [ 402 | { 403 | "name": "stdout", 404 | "output_type": "stream", 405 | "text": [ 406 | "torch.Size([5, 10, 30])\n", 407 | "torch.Size([2, 5, 10, 15])\n" 408 | ] 409 | } 410 | ], 411 | "source": [ 412 | "# 2 is the dimension over which the tensors are concatenated\n", 413 | "print(torch.cat([y, y], 2).size())\n", 414 | "# stack concatenates the sequence of tensors along a new dimension.\n", 415 | "print(torch.stack([y, y], 0).size())" 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "metadata": {}, 421 | "source": [ 422 | "### Advanced Indexing" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": 15, 428 | "metadata": { 429 | "collapsed": false 430 | }, 431 | "outputs": [ 432 | { 433 | "name": "stdout", 434 | "output_type": "stream", 435 | "text": [ 436 | "torch.Size([4, 3, 4])\n", 437 | "torch.Size([2, 3, 4])\n" 438 | ] 439 | } 440 | ], 441 | "source": [ 442 | "y = torch.randn(2, 3, 4)\n", 443 | "print(y[[1, 0, 1, 1]].size())\n", 444 | "\n", 445 | "# PyTorch doesn't support negative strides yet so ::-1 does not work.\n", 446 | "rev_idx = torch.arange(1, -1, -1).long()\n", 447 | "print(y[rev_idx].size())" 448 | ] 449 | }, 450 | { 451 | "cell_type": "markdown", 452 | "metadata": {}, 453 | "source": [ 454 | "### GPU support" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": 16, 460 | 
"metadata": { 461 | "collapsed": false 462 | }, 463 | "outputs": [ 464 | { 465 | "data": { 466 | "text/plain": [ 467 | "\n", 468 | " 0.2456 1.1543 0.5376 0.4358 -0.0369\n", 469 | " 0.8247 -0.4143 -0.7188 0.3953 0.2573\n", 470 | "-0.1346 0.7329 0.5156 0.0864 -0.1349\n", 471 | "-0.3555 0.3135 0.3921 -0.1428 -0.1368\n", 472 | "-0.4385 0.5601 0.6533 -0.2793 -0.5220\n", 473 | "[torch.cuda.HalfTensor of size 5x5 (GPU 0)]" 474 | ] 475 | }, 476 | "execution_count": 16, 477 | "metadata": {}, 478 | "output_type": "execute_result" 479 | } 480 | ], 481 | "source": [ 482 | "x = torch.cuda.HalfTensor(5, 3).uniform_(-1, 1)\n", 483 | "y = torch.cuda.HalfTensor(3, 5).uniform_(-1, 1)\n", 484 | "torch.matmul(x, y)" 485 | ] 486 | }, 487 | { 488 | "cell_type": "markdown", 489 | "metadata": {}, 490 | "source": [ 491 | "### Move tensors on the CPU -> GPU" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": 17, 497 | "metadata": { 498 | "collapsed": false 499 | }, 500 | "outputs": [ 501 | { 502 | "name": "stdout", 503 | "output_type": "stream", 504 | "text": [ 505 | "\n", 506 | "-0.3758 -0.1090 0.7911\n", 507 | " 0.2839 -0.9136 0.1070\n", 508 | " 0.9184 0.5113 -0.8040\n", 509 | "-0.3412 -0.8895 -0.5780\n", 510 | "-0.0992 0.0983 0.6074\n", 511 | "[torch.FloatTensor of size 5x3]\n", 512 | "\n", 513 | "\n", 514 | "-0.3758 -0.1090 0.7911\n", 515 | " 0.2839 -0.9136 0.1070\n", 516 | " 0.9184 0.5113 -0.8040\n", 517 | "-0.3412 -0.8895 -0.5780\n", 518 | "-0.0992 0.0983 0.6074\n", 519 | "[torch.cuda.FloatTensor of size 5x3 (GPU 0)]\n", 520 | "\n", 521 | "\n", 522 | "-0.3758 -0.1090 0.7911\n", 523 | " 0.2839 -0.9136 0.1070\n", 524 | " 0.9184 0.5113 -0.8040\n", 525 | "-0.3412 -0.8895 -0.5780\n", 526 | "-0.0992 0.0983 0.6074\n", 527 | "[torch.FloatTensor of size 5x3]\n", 528 | "\n" 529 | ] 530 | } 531 | ], 532 | "source": [ 533 | "x = torch.FloatTensor(5, 3).uniform_(-1, 1)\n", 534 | "print(x)\n", 535 | "x = x.cuda(device=0)\n", 536 | "print(x)\n", 537 | "x = x.cpu()\n", 538 | 
"print(x)" 539 | ] 540 | }, 541 | { 542 | "cell_type": "markdown", 543 | "metadata": { 544 | "collapsed": true 545 | }, 546 | "source": [ 547 | "### Contiguity in memory" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": 18, 553 | "metadata": { 554 | "collapsed": false 555 | }, 556 | "outputs": [ 557 | { 558 | "name": "stdout", 559 | "output_type": "stream", 560 | "text": [ 561 | "\n", 562 | " 0.4740 -0.9209 0.4143\n", 563 | "-0.3473 0.4474 -0.8159\n", 564 | "-0.7654 -0.0956 0.6145\n", 565 | "-0.0846 -0.6239 0.8609\n", 566 | "-0.8142 0.9289 -0.7020\n", 567 | "[torch.FloatTensor of size 5x3]\n", 568 | "\n", 569 | "\n", 570 | " 0.4740 -0.9209 0.4143\n", 571 | "-0.3473 0.4474 -0.8159\n", 572 | "-0.7654 -0.0956 0.6145\n", 573 | "-0.0846 -0.6239 0.8609\n", 574 | "-0.8142 0.9289 -0.7020\n", 575 | "[torch.cuda.FloatTensor of size 5x3 (GPU 0)]\n", 576 | "\n", 577 | "Contiguity : True \n", 578 | "Contiguity : False \n", 579 | "Contiguity : True \n" 580 | ] 581 | } 582 | ], 583 | "source": [ 584 | "x = torch.FloatTensor(5, 3).uniform_(-1, 1)\n", 585 | "print(x)\n", 586 | "x = x.cuda(device=0)\n", 587 | "print(x)\n", 588 | "print('Contiguity : %s ' % (x.is_contiguous()))\n", 589 | "x = x.unsqueeze(0).expand(30, 5, 3)\n", 590 | "print('Contiguity : %s ' % (x.is_contiguous()))\n", 591 | "x = x.contiguous()\n", 592 | "print('Contiguity : %s ' % (x.is_contiguous()))" 593 | ] 594 | } 595 | ], 596 | "metadata": { 597 | "kernelspec": { 598 | "display_name": "Python 2", 599 | "language": "python", 600 | "name": "python2" 601 | }, 602 | "language_info": { 603 | "codemirror_mode": { 604 | "name": "ipython", 605 | "version": 2 606 | }, 607 | "file_extension": ".py", 608 | "mimetype": "text/x-python", 609 | "name": "python", 610 | "nbconvert_exporter": "python", 611 | "pygments_lexer": "ipython2", 612 | "version": "2.7.13" 613 | } 614 | }, 615 | "nbformat": 4, 616 | "nbformat_minor": 2 617 | } 618 | 
-------------------------------------------------------------------------------- /pytorch/2. Autograd.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## PyTorch Tutorial\n", 8 | "MILA, November 2017\n", 9 | "\n", 10 | "By Sandeep Subramanian" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "### Torch Autograd, Variables, Define-by-run & Execution Paradigm\n", 18 | "\n", 19 | "Adapted from\n", 20 | "1. http://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html#sphx-glr-beginner-blitz-autograd-tutorial-py \n", 21 | "2. http://pytorch.org/docs/master/notes/autograd.html" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## Variables : Thin wrappers around tensors to facilitate autograd\n", 29 | "\n", 30 | "Supports almost all operations that can be performed on regular tensors" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 1, 36 | "metadata": { 37 | "collapsed": true 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "import numpy as np\n", 42 | "from __future__ import print_function" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": { 49 | "collapsed": false 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "import torch \n", 54 | "from torch.autograd import Variable" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "![caption](images/Variable.png)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "### Wrap tensors in a Variable" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 3, 74 | "metadata": { 75 | "collapsed": false 76 | }, 77 | "outputs": [ 78 | { 79 | "name": "stdout", 80 | "output_type": "stream", 81 | "text": [ 82 | "Variable containing:\n", 83 
| "-0.2456 -0.0608 -0.7359\n", 84 | "-0.8375 -0.3687 -0.6179\n", 85 | "-0.1984 -0.2076 -0.7292\n", 86 | " 0.4198 -0.3215 0.9470\n", 87 | "-0.3811 -0.0531 0.9047\n", 88 | "[torch.FloatTensor of size 5x3]\n", 89 | "\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "z = Variable(torch.Tensor(5, 3).uniform_(-1, 1))\n", 95 | "print(z)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "### Properties of Variables : Requiring gradients, Volatility, Data & Grad\n", 103 | "\n", 104 | "1. You can access the raw tensor through the .data attribute\n", 105 | "2. Gradient of the loss w.r.t. this variable is accumulated into .grad.\n", 106 | "3. Stay tuned for requires_grad and volatile" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 4, 112 | "metadata": { 113 | "collapsed": false 114 | }, 115 | "outputs": [ 116 | { 117 | "name": "stdout", 118 | "output_type": "stream", 119 | "text": [ 120 | "Requires Gradient : False \n", 121 | "Volatile : False \n", 122 | "Gradient : None \n", 123 | "\n", 124 | "-0.2456 -0.0608 -0.7359\n", 125 | "-0.8375 -0.3687 -0.6179\n", 126 | "-0.1984 -0.2076 -0.7292\n", 127 | " 0.4198 -0.3215 0.9470\n", 128 | "-0.3811 -0.0531 0.9047\n", 129 | "[torch.FloatTensor of size 5x3]\n", 130 | "\n" 131 | ] 132 | } 133 | ], 134 | "source": [ 135 | "print('Requires Gradient : %s ' % (z.requires_grad))\n", 136 | "print('Volatile : %s ' % (z.volatile))\n", 137 | "print('Gradient : %s ' % (z.grad))\n", 138 | "print(z.data)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 5, 144 | "metadata": { 145 | "collapsed": false 146 | }, 147 | "outputs": [ 148 | { 149 | "name": "stdout", 150 | "output_type": "stream", 151 | "text": [ 152 | "torch.Size([5, 5])\n" 153 | ] 154 | } 155 | ], 156 | "source": [ 157 | "### Operations on Variables\n", 158 | "x = Variable(torch.Tensor(5, 3).uniform_(-1, 1))\n", 159 | "y = Variable(torch.Tensor(3, 5).uniform_(-1, 1))\n", 160 | "# matrix 
multiplication\n", 161 | "z = torch.mm(x, y)\n", 162 | "print(z.size())" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "## Define-by-run Paradigm\n", 170 | "\n", 171 | "The torch autograd package provides automatic differentiation for all operations on Tensors.\n", 172 | "\n", 173 | "PyTorch's autograd is a reverse mode automatic differentiation system.\n", 174 | "\n", 175 | "Backprop is defined by how your code is run, and every single iteration can be different.\n", 176 | "\n", 177 | "Other frameworks that adopt a similar approach :\n", 178 | "\n", 179 | "1. Chainer - https://github.com/chainer/chainer\n", 180 | "2. DyNet - https://github.com/clab/dynet\n", 181 | "3. Tensorflow Eager - https://research.googleblog.com/2017/10/eager-execution-imperative-define-by.html\n", 182 | "\n", 183 | "### How autograd encodes execution history\n", 184 | "\n", 185 | "\n", 186 | "Conceptually, autograd maintains a graph that records all of the operations performed on variables as you execute your operations. This results in a directed acyclic graph whose leaves are the input variables and roots are the output variables. By tracing this graph from roots to leaves, you can automatically compute the gradients using the chain rule." 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "![caption](images/dynamic_graph.gif)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "GIF source: https://github.com/pytorch/pytorch" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "Internally, autograd represents this graph as a graph of Function objects (really expressions), which can be `apply()` ed to compute the result of evaluating the graph. 
When computing the forwards pass, autograd simultaneously performs the requested computations and builds up a graph representing the function that computes the gradient (the `.grad_fn` attribute of each Variable is an entry point into this graph). When the forwards pass is completed, we evaluate this graph in the backwards pass to compute the gradients." 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 6, 213 | "metadata": { 214 | "collapsed": false 215 | }, 216 | "outputs": [ 217 | { 218 | "name": "stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | "\n" 222 | ] 223 | } 224 | ], 225 | "source": [ 226 | "x = Variable(torch.Tensor(5, 3).uniform_(-1, 1))\n", 227 | "y = Variable(torch.Tensor(3, 5).uniform_(-1, 1))\n", 228 | "z = torch.mm(x, y)\n", 229 | "print(z.grad_fn)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "An important thing to note is that the graph is recreated from scratch at every iteration, and this is exactly what allows for using arbitrary Python control flow statements, that can change the overall shape and size of the graph at every iteration. You don’t have to encode all possible paths before you launch the training - what you run is what you differentiate." 
237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "## Getting gradients : `backward()` & `torch.autograd.grad`" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 7, 249 | "metadata": { 250 | "collapsed": false 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "x = Variable(torch.Tensor(5, 3).uniform_(-1, 1), requires_grad=True)\n", 255 | "y = Variable(torch.Tensor(5, 3).uniform_(-1, 1), requires_grad=True)\n", 256 | "z = x ** 2 + 3 * y\n", 257 | "z.backward(gradient=torch.ones(5, 3))" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 8, 263 | "metadata": { 264 | "collapsed": false 265 | }, 266 | "outputs": [ 267 | { 268 | "data": { 269 | "text/plain": [ 270 | "Variable containing:\n", 271 | " 1 1 1\n", 272 | " 1 1 1\n", 273 | " 1 1 1\n", 274 | " 1 1 1\n", 275 | " 1 1 1\n", 276 | "[torch.ByteTensor of size 5x3]" 277 | ] 278 | }, 279 | "execution_count": 10, 280 | "metadata": {}, 281 | "output_type": "execute_result" 282 | } 283 | ], 284 | "source": [ 285 | "# eq computes element-wise equality\n", 286 | "torch.eq(x.grad, 2 * x)" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 11, 292 | "metadata": { 293 | "collapsed": false 294 | }, 295 | "outputs": [ 296 | { 297 | "data": { 298 | "text/plain": [ 299 | "Variable containing:\n", 300 | " 3 3 3\n", 301 | " 3 3 3\n", 302 | " 3 3 3\n", 303 | " 3 3 3\n", 304 | " 3 3 3\n", 305 | "[torch.FloatTensor of size 5x3]" 306 | ] 307 | }, 308 | "execution_count": 9, 309 | "metadata": {}, 310 | "output_type": "execute_result" 311 | } 312 | ], 313 | "source": [ 314 | "y.grad" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 10, 320 | "metadata": { 321 | "collapsed": false 322 | }, 323 | "outputs": [], 324 | "source": [ 325 | "x = Variable(torch.Tensor(5, 3).uniform_(-1, 1), requires_grad=True)\n", 326 | "y = Variable(torch.Tensor(5, 3).uniform_(-1, 1), 
requires_grad=True)\n", 327 | "z = x ** 2 + 3 * y\n", 328 | "dz_dx = torch.autograd.grad(z, x, grad_outputs=torch.ones(5, 3))\n", 329 | "dz_dy = torch.autograd.grad(z, y, grad_outputs=torch.ones(5, 3))" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": {}, 335 | "source": [ 336 | "## Define-by-run example" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": {}, 342 | "source": [ 343 | "### Common Variable definition" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 18, 349 | "metadata": { 350 | "collapsed": true 351 | }, 352 | "outputs": [], 353 | "source": [ 354 | "x = Variable(torch.Tensor(5, 3).uniform_(-1, 1), requires_grad=True)\n", 355 | "w = Variable(torch.Tensor(3, 10).uniform_(-1, 1), requires_grad=True)\n", 356 | "b = Variable(torch.Tensor(10,).uniform_(-1, 1), requires_grad=True)" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": {}, 362 | "source": [ 363 | "### Graph 1 : `wx + b`" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 19, 369 | "metadata": { 370 | "collapsed": false 371 | }, 372 | "outputs": [], 373 | "source": [ 374 | "o = torch.matmul(x, w) + b\n", 375 | "do_dinputs_1 = torch.autograd.grad(o, [x, w, b], grad_outputs=torch.ones(5, 10))" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 20, 381 | "metadata": { 382 | "collapsed": false 383 | }, 384 | "outputs": [ 385 | { 386 | "name": "stdout", 387 | "output_type": "stream", 388 | "text": [ 389 | "Gradients of o w.r.t inputs in Graph 1\n", 390 | "do/dx : \n", 391 | "\n", 392 | " Variable containing:\n", 393 | " 1.5226 -1.3615 0.7991\n", 394 | " 1.5226 -1.3615 0.7991\n", 395 | " 1.5226 -1.3615 0.7991\n", 396 | " 1.5226 -1.3615 0.7991\n", 397 | " 1.5226 -1.3615 0.7991\n", 398 | "[torch.FloatTensor of size 5x3]\n", 399 | " \n", 400 | "do/dw : \n", 401 | "\n", 402 | " Variable containing:\n", 403 | " 0.3361 0.3361 0.3361 0.3361 0.3361 0.3361 
0.3361 0.3361 0.3361 0.3361\n", 404 | "-1.1158 -1.1158 -1.1158 -1.1158 -1.1158 -1.1158 -1.1158 -1.1158 -1.1158 -1.1158\n", 405 | " 1.2694 1.2694 1.2694 1.2694 1.2694 1.2694 1.2694 1.2694 1.2694 1.2694\n", 406 | "[torch.FloatTensor of size 3x10]\n", 407 | " \n", 408 | "do/db : \n", 409 | "\n", 410 | " Variable containing:\n", 411 | " 5\n", 412 | " 5\n", 413 | " 5\n", 414 | " 5\n", 415 | " 5\n", 416 | " 5\n", 417 | " 5\n", 418 | " 5\n", 419 | " 5\n", 420 | " 5\n", 421 | "[torch.FloatTensor of size 10]\n", 422 | " \n" 423 | ] 424 | } 425 | ], 426 | "source": [ 427 | "print('Gradients of o w.r.t inputs in Graph 1')\n", 428 | "print('do/dx : \\n\\n %s ' % (do_dinputs_1[0]))\n", 429 | "print('do/dw : \\n\\n %s ' % (do_dinputs_1[1]))\n", 430 | "print('do/db : \\n\\n %s ' % (do_dinputs_1[2]))" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": {}, 436 | "source": [ 437 | "### Graph 2 : wx / b" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 21, 443 | "metadata": { 444 | "collapsed": false 445 | }, 446 | "outputs": [], 447 | "source": [ 448 | "o = torch.matmul(x, w) / b\n", 449 | "do_dinputs_2 = torch.autograd.grad(o, [x, w, b], grad_outputs=torch.ones(5, 10))" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": 22, 455 | "metadata": { 456 | "collapsed": false 457 | }, 458 | "outputs": [ 459 | { 460 | "name": "stdout", 461 | "output_type": "stream", 462 | "text": [ 463 | "Gradients of o w.r.t inputs in Graph 2\n", 464 | "do/dx : \n", 465 | " Variable containing:\n", 466 | " 47.6666 -31.6417 -54.9581\n", 467 | " 47.6666 -31.6417 -54.9581\n", 468 | " 47.6666 -31.6417 -54.9581\n", 469 | " 47.6666 -31.6417 -54.9581\n", 470 | " 47.6666 -31.6417 -54.9581\n", 471 | "[torch.FloatTensor of size 5x3]\n", 472 | " \n", 473 | "do/dw : \n", 474 | " Variable containing:\n", 475 | "\n", 476 | "Columns 0 to 7 \n", 477 | " 25.7204 -1.4251 0.5816 0.7336 0.3829 -0.5467 0.3904 0.3968\n", 478 | "-85.3804 4.7306 -1.9306 
-2.4353 -1.2709 1.8149 -1.2960 -1.3172\n", 479 | " 97.1318 -5.3817 2.1963 2.7705 1.4459 -2.0647 1.4743 1.4985\n", 480 | "\n", 481 | "Columns 8 to 9 \n", 482 | " -4.2812 -0.4352\n", 483 | " 14.2118 1.4446\n", 484 | "-16.1679 -1.6434\n", 485 | "[torch.FloatTensor of size 3x10]\n", 486 | " \n", 487 | "do/db : \n", 488 | " Variable containing:\n", 489 | "-173.4977\n", 490 | " 5.7707\n", 491 | " 5.5410\n", 492 | " -2.9863\n", 493 | " -0.3088\n", 494 | " -1.7527\n", 495 | " -0.5234\n", 496 | " -1.5729\n", 497 | "-261.3063\n", 498 | " -0.8931\n", 499 | "[torch.FloatTensor of size 10]\n", 500 | " \n" 501 | ] 502 | } 503 | ], 504 | "source": [ 505 | "print('Gradients of o w.r.t inputs in Graph 2')\n", 506 | "print('do/dx : \\n %s ' % (do_dinputs_2[0]))\n", 507 | "print('do/dw : \\n %s ' % (do_dinputs_2[1]))\n", 508 | "print('do/db : \\n %s ' % (do_dinputs_2[2]))" 509 | ] 510 | }, 511 | { 512 | "cell_type": "markdown", 513 | "metadata": {}, 514 | "source": [ 515 | "## Gradient buffers: `.backward()` and `retain_graph=True`\n", 516 | "\n", 517 | "1. Calling `.backward()` clears the current computation graph.\n", 518 | "2. Once `.backward()` is called, intermediate variables used in the construction of the graph are removed.\n", 519 | "3. This is used implicitly to let PyTorch know when a new graph is to be built for a new minibatch. This is built around the forward and backward pass paradigm.\n", 520 | "4. To retain the graph after the backward pass use `loss.backward(retain_graph=True)`. This lets you re-use intermediate variables to potentially compute a secondary loss after the initial gradients are computed. 
This is useful to implement things like the gradient penalty in WGANs (https://arxiv.org/abs/1704.00028)" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": 23, 526 | "metadata": { 527 | "collapsed": false 528 | }, 529 | "outputs": [], 530 | "source": [ 531 | "o = torch.mm(x, w) + b\n", 532 | "o.backward(torch.ones(5, 10))" 533 | ] 534 | }, 535 | { 536 | "cell_type": "markdown", 537 | "metadata": {}, 538 | "source": [ 539 | "### Call backward again -> This fails" 540 | ] 541 | }, 542 | { 543 | "cell_type": "code", 544 | "execution_count": 24, 545 | "metadata": { 546 | "collapsed": false 547 | }, 548 | "outputs": [ 549 | { 550 | "ename": "RuntimeError", 551 | "evalue": "Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time.", 552 | "output_type": "error", 553 | "traceback": [ 554 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 555 | "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", 556 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mo\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mo\u001b[0m \u001b[0;34m**\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mo\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mones\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 557 | "\u001b[0;32m/home/sandeep/anaconda2/lib/python2.7/site-packages/torch/autograd/variable.pyc\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(self, gradient, retain_graph, create_graph, retain_variables)\u001b[0m\n\u001b[1;32m 156\u001b[0m 
\u001b[0mVariable\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 157\u001b[0m \"\"\"\n\u001b[0;32m--> 158\u001b[0;31m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mautograd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgradient\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_variables\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 159\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 160\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mregister_hook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhook\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 558 | "\u001b[0;32m/home/sandeep/anaconda2/lib/python2.7/site-packages/torch/autograd/__init__.pyc\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(variables, grad_variables, retain_graph, create_graph, retain_variables)\u001b[0m\n\u001b[1;32m 97\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 98\u001b[0m Variable._execution_engine.run_backward(\n\u001b[0;32m---> 99\u001b[0;31m variables, grad_variables, retain_graph)\n\u001b[0m\u001b[1;32m 100\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 101\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 559 | "\u001b[0;32m/home/sandeep/anaconda2/lib/python2.7/site-packages/torch/autograd/function.pyc\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 89\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 90\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 91\u001b[0;31m \u001b[0;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_forward_cls\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 92\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 93\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 560 | "\u001b[0;32m/home/sandeep/anaconda2/lib/python2.7/site-packages/torch/autograd/_functions/blas.pyc\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(ctx, grad_output)\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mstaticmethod\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 34\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mctx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgrad_output\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 35\u001b[0;31m \u001b[0mmatrix1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmatrix2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mctx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msaved_variables\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 36\u001b[0m \u001b[0mgrad_add_matrix\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgrad_matrix1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgrad_matrix2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 561 | "\u001b[0;31mRuntimeError\u001b[0m: Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time." 
562 | ] 563 | } 564 | ], 565 | "source": [ 566 | "o = o ** 3\n", 567 | "o.backward(torch.ones(5, 10))" 568 | ] 569 | }, 570 | { 571 | "cell_type": "markdown", 572 | "metadata": {}, 573 | "source": [ 574 | "### But with `retain_graph=True`" 575 | ] 576 | }, 577 | { 578 | "cell_type": "code", 579 | "execution_count": 26, 580 | "metadata": { 581 | "collapsed": false 582 | }, 583 | "outputs": [], 584 | "source": [ 585 | "o = torch.mm(x, w) + b\n", 586 | "o.backward(torch.ones(5, 10), retain_graph=True)" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": 28, 592 | "metadata": { 593 | "collapsed": false 594 | }, 595 | "outputs": [], 596 | "source": [ 597 | "o = o ** 3\n", 598 | "o.backward(torch.ones(5, 10))" 599 | ] 600 | }, 601 | { 602 | "cell_type": "markdown", 603 | "metadata": {}, 604 | "source": [ 605 | "## WARNING: Calling `.backward()` multiple times will accumulate gradients into `.grad` and NOT overwrite them." 606 | ] 607 | }, 608 | { 609 | "cell_type": "markdown", 610 | "metadata": {}, 611 | "source": [ 612 | "## Excluding subgraphs from backward: requires_grad=False, volatile=True & .detach\n", 613 | "\n", 614 | "### `requires_grad=False`\n", 615 | "\n", 616 | "1. If there’s a single input to an operation that requires gradient, its output will also require gradient.\n", 617 | "\n", 618 | "2. Conversely, if all inputs don’t require gradient, the output won’t require it.\n", 619 | "\n", 620 | "3. Backward computation is never performed in the subgraphs, where all Variables didn’t require gradients.\n", 621 | "\n", 622 | "4. This is potentially useful when you have part of a network that is pretrained and not fine-tuned, for example word embeddings or a pretrained imagenet model." 
623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": 30, 628 | "metadata": { 629 | "collapsed": true 630 | }, 631 | "outputs": [], 632 | "source": [ 633 | "x = Variable(torch.Tensor(3, 5).uniform_(-1, 1), requires_grad=False)\n", 634 | "y = Variable(torch.Tensor(3, 5).uniform_(-1, 1), requires_grad=False)\n", 635 | "z = Variable(torch.Tensor(3, 5).uniform_(-1, 1), requires_grad=True)" 636 | ] 637 | }, 638 | { 639 | "cell_type": "code", 640 | "execution_count": 31, 641 | "metadata": { 642 | "collapsed": false 643 | }, 644 | "outputs": [ 645 | { 646 | "name": "stdout", 647 | "output_type": "stream", 648 | "text": [ 649 | " o = x + y requires grad ? : False \n", 650 | " o = x + y + z requires grad ? : True \n" 651 | ] 652 | } 653 | ], 654 | "source": [ 655 | "o = x + y\n", 656 | "print(' o = x + y requires grad ? : %s ' % (o.requires_grad))\n", 657 | "o = x + y + z\n", 658 | "print(' o = x + y + z requires grad ? : %s ' % (o.requires_grad))" 659 | ] 660 | }, 661 | { 662 | "cell_type": "markdown", 663 | "metadata": {}, 664 | "source": [ 665 | "### `volatile=True`\n", 666 | "\n", 667 | "1. If a single input to an operation is volatile, the resulting variable will not have a `grad_fn` and so, the result will not be a node in the computation graph.\n", 668 | "\n", 669 | "2. Conversely, only if all inputs are not volatile, the output will have a `grad_fn` and be included in the computation graph.\n", 670 | "\n", 671 | "3. Volatile is useful when running Variables through your network during inference. Since it is fairly uncommon to go backwards through the network during inference, `.backward()` is rarely invoked. This means graphs are never cleared and hence it is common to run out of memory pretty quickly. Operations on `volatile` variables are not recorded on the tape and therefore save memory." 
672 | ] 673 | }, 674 | { 675 | "cell_type": "code", 676 | "execution_count": 195, 677 | "metadata": { 678 | "collapsed": true 679 | }, 680 | "outputs": [], 681 | "source": [ 682 | "x = Variable(torch.Tensor(3, 5).uniform_(-1, 1), volatile=True)\n", 683 | "y = Variable(torch.Tensor(3, 5).uniform_(-1, 1), volatile=True)\n", 684 | "z = Variable(torch.Tensor(3, 5).uniform_(-1, 1), requires_grad=True)" 685 | ] 686 | }, 687 | { 688 | "cell_type": "code", 689 | "execution_count": 196, 690 | "metadata": { 691 | "collapsed": false 692 | }, 693 | "outputs": [ 694 | { 695 | "name": "stdout", 696 | "output_type": "stream", 697 | "text": [ 698 | "Graph : x + y\n", 699 | "o.requires_grad : False \n", 700 | "o.grad_fn : None \n", 701 | "\n", 702 | "\n", 703 | "Graph : x + y + z\n", 704 | "o.requires_grad : False \n", 705 | "o.grad_fn : None \n" 706 | ] 707 | } 708 | ], 709 | "source": [ 710 | "print('Graph : x + y')\n", 711 | "o = x + y\n", 712 | "print('o.requires_grad : %s ' % (o.requires_grad))\n", 713 | "print('o.grad_fn : %s ' % (o.grad_fn))\n", 714 | "print('\\n\\nGraph : x + y + z')\n", 715 | "o = x + y + z\n", 716 | "print('o.requires_grad : %s ' % (o.requires_grad))\n", 717 | "print('o.grad_fn : %s ' % (o.grad_fn))" 718 | ] 719 | }, 720 | { 721 | "cell_type": "markdown", 722 | "metadata": {}, 723 | "source": [ 724 | "### `.detach()`\n", 725 | "\n", 726 | "1. It is possible to detach variables from the graph by calling `.detach()`. \n", 727 | "2. This could lead to disconnected graphs, in which case PyTorch will only backpropagate gradients until the point of disconnection." 
728 | ] 729 | }, 730 | { 731 | "cell_type": "code", 732 | "execution_count": 41, 733 | "metadata": { 734 | "collapsed": false 735 | }, 736 | "outputs": [], 737 | "source": [ 738 | "x = Variable(torch.Tensor(3, 5).uniform_(-1, 1), requires_grad=True)\n", 739 | "y = Variable(torch.Tensor(3, 5).uniform_(-1, 1), requires_grad=True)\n", 740 | "z = Variable(torch.Tensor(3, 5).uniform_(-1, 1), requires_grad=True)" 741 | ] 742 | }, 743 | { 744 | "cell_type": "markdown", 745 | "metadata": {}, 746 | "source": [ 747 | "![caption](images/detach.png)" 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": 42, 753 | "metadata": { 754 | "collapsed": false 755 | }, 756 | "outputs": [], 757 | "source": [ 758 | "m1 = x + y\n", 759 | "m2 = z ** 2\n", 760 | "m1 = m1.detach()\n", 761 | "m3 = m1 + m2\n", 762 | "m3.backward(torch.ones(3, 5))" 763 | ] 764 | }, 765 | { 766 | "cell_type": "code", 767 | "execution_count": 43, 768 | "metadata": { 769 | "collapsed": false 770 | }, 771 | "outputs": [ 772 | { 773 | "name": "stdout", 774 | "output_type": "stream", 775 | "text": [ 776 | "dm3/dx \n", 777 | "\n", 778 | " None \n", 779 | "\n", 780 | "dm3/dy \n", 781 | "\n", 782 | " None \n", 783 | "\n", 784 | "dm3/dz \n", 785 | "\n", 786 | " Variable containing:\n", 787 | " 1.2043 1.9914 0.1340 -1.8074 1.3064\n", 788 | "-0.1923 0.9834 -1.9299 1.4948 0.6174\n", 789 | " 1.0566 -1.1677 -1.5411 0.5598 -1.9467\n", 790 | "[torch.FloatTensor of size 3x5]\n", 791 | " \n" 792 | ] 793 | } 794 | ], 795 | "source": [ 796 | "print('dm3/dx \\n\\n %s ' % (x.grad))\n", 797 | "print('\\ndm3/dy \\n\\n %s ' % (y.grad))\n", 798 | "print('\\ndm3/dz \\n\\n %s ' % (z.grad))" 799 | ] 800 | }, 801 | { 802 | "cell_type": "markdown", 803 | "metadata": { 804 | "collapsed": true 805 | }, 806 | "source": [ 807 | "## Gradients w.r.t intermediate variables in the graph\n", 808 | "\n", 809 | "1. 
By default, in PyTorch, all gradient computations w.r.t intermediate nodes in the graph are ad-hoc, i.e. those gradients are not kept in the `.grad` attribute of the intermediate Variables.\n", 810 | "\n", 811 | "2. This is in the interest of saving memory.\n", 812 | "\n", 813 | "3. To compute gradients w.r.t intermediate variables, use `.retain_grad()` or explicitly compute gradients using `torch.autograd.grad`\n", 814 | "\n", 815 | "4. `.retain_grad()` populates the `.grad` attribute of the Variable while `torch.autograd.grad` returns a Variable that contains the gradients." 816 | ] 817 | }, 818 | { 819 | "cell_type": "code", 820 | "execution_count": 44, 821 | "metadata": { 822 | "collapsed": true 823 | }, 824 | "outputs": [], 825 | "source": [ 826 | "x = Variable(torch.Tensor(3, 5).uniform_(-1, 1), requires_grad=True)\n", 827 | "y = Variable(torch.Tensor(3, 5).uniform_(-1, 1), requires_grad=True)\n", 828 | "z = Variable(torch.Tensor(3, 5).uniform_(-1, 1), requires_grad=True)" 829 | ] 830 | }, 831 | { 832 | "cell_type": "code", 833 | "execution_count": 45, 834 | "metadata": { 835 | "collapsed": false 836 | }, 837 | "outputs": [], 838 | "source": [ 839 | "m1 = x + y\n", 840 | "m2 = z ** 2\n", 841 | "m1.retain_grad()\n", 842 | "m2.retain_grad()\n", 843 | "m3 = m1 * m2\n", 844 | "m3.backward(torch.ones(3, 5))" 845 | ] 846 | }, 847 | { 848 | "cell_type": "code", 849 | "execution_count": 46, 850 | "metadata": { 851 | "collapsed": false 852 | }, 853 | "outputs": [ 854 | { 855 | "name": "stdout", 856 | "output_type": "stream", 857 | "text": [ 858 | "dm3/dm1 \n", 859 | "\n", 860 | " Variable containing:\n", 861 | " 0.6986 0.0007 0.0314 0.3346 0.4070\n", 862 | " 0.7087 0.7066 0.0344 0.3643 0.4734\n", 863 | " 0.6196 0.0079 0.0240 0.4427 0.0531\n", 864 | "[torch.FloatTensor of size 3x5]\n", 865 | " \n", 866 | "dm3/dm2 \n", 867 | "\n", 868 | " Variable containing:\n", 869 | "-1.0104 1.6305 0.3076 -0.2957 0.1597\n", 870 | "-0.1984 -1.2168 0.4246 -1.3702 0.8474\n", 871 | "-1.1777 1.6642 -1.2514 0.8266 0.0997\n", 872 | "[torch.FloatTensor of size 3x5]\n", 873 | " \n" 
874 | ] 875 | } 876 | ], 877 | "source": [ 878 | "print('dm3/dm1 \\n\\n %s ' % (m1.grad))\n", 879 | "print('dm3/dm2 \\n\\n %s ' % (m2.grad))" 880 | ] 881 | }, 882 | { 883 | "cell_type": "markdown", 884 | "metadata": {}, 885 | "source": [ 886 | "### In place operations on variables in a graph\n", 887 | "\n", 888 | "source: http://pytorch.org/docs/master/notes/autograd.html\n", 889 | "\n", 890 | "In place operations are suffixed by `_` ex: `log_`, `uniform_` etc.\n", 891 | "\n", 892 | "1. Supporting in-place operations in autograd is difficult and PyTorch discourages their use in most cases.\n", 893 | "\n", 894 | "2. Autograd’s aggressive buffer freeing and reuse makes it very efficient and there are very few occasions when in-place operations actually lower memory usage by any significant amount. Unless you’re operating under heavy memory pressure, you might never need to use them.\n", 895 | "\n", 896 | "### There are two main reasons that limit the applicability of in-place operations:\n", 897 | "\n", 898 | "(a) Overwriting values required to compute gradients. This is why variables don’t support `log_`. Its gradient formula requires the original input, and while it is possible to recreate it by computing the inverse operation, it is numerically unstable, and requires additional work that often defeats the purpose of using these functions.\n", 899 | "\n", 900 | "(b) Every in-place operation actually requires the implementation to rewrite the computational graph. Out-of-place versions simply allocate new objects and keep references to the old graph, while in-place operations, require changing the creator of all inputs to the Function representing this operation. This can be tricky, especially if there are many Variables that reference the same storage (e.g. 
created by indexing or transposing), and in-place functions will actually raise an error if the storage of modified inputs is referenced by any other Variable.\n", 901 | "In-place correctness checks" 902 | ] 903 | }, 904 | { 905 | "cell_type": "markdown", 906 | "metadata": {}, 907 | "source": [ 908 | "## Second and higher order derivatives\n", 909 | "\n", 910 | "### Computing gradients w.r.t gradients\n", 911 | "\n", 912 | "1. `o = xy + z ** 2`\n", 913 | "2. `l = o ** 3 + do_dz`\n", 914 | "\n", 915 | "### Practical application of this in WGAN-GP later in the tutorial" 916 | ] 917 | }, 918 | { 919 | "cell_type": "code", 920 | "execution_count": 178, 921 | "metadata": { 922 | "collapsed": true 923 | }, 924 | "outputs": [], 925 | "source": [ 926 | "x = Variable(torch.Tensor(5, 3).uniform_(-1, 1), requires_grad=True)\n", 927 | "y = Variable(torch.Tensor(3, 5).uniform_(-1, 1), requires_grad=True)\n", 928 | "z = Variable(torch.Tensor(5, 5).uniform_(-1, 1), requires_grad=True)" 929 | ] 930 | }, 931 | { 932 | "cell_type": "code", 933 | "execution_count": 181, 934 | "metadata": { 935 | "collapsed": false 936 | }, 937 | "outputs": [ 938 | { 939 | "name": "stdout", 940 | "output_type": "stream", 941 | "text": [ 942 | "do/dz \n", 943 | "\n", 944 | " : Variable containing:\n", 945 | "-0.0465 1.6061 0.6523 0.0604 -1.0861\n", 946 | "-0.1614 -1.8736 1.6718 1.7629 -1.4649\n", 947 | "-0.7555 0.2532 -1.8296 -1.5360 0.2838\n", 948 | " 1.1525 0.8089 -1.7133 -1.3501 1.7537\n", 949 | " 0.6360 1.3759 -1.7214 0.2242 0.5220\n", 950 | "[torch.FloatTensor of size 5x5]\n", 951 | " \n", 952 | "dl/dz \n", 953 | "\n", 954 | " : Variable containing:\n", 955 | " 1.9749 2.1041 2.5755 2.0140 1.6410\n", 956 | " 1.7873 1.8495 3.9286 10.6421 0.3371\n", 957 | " 0.2702 3.0303 -6.2525 1.6083 2.0623\n", 958 | " 2.8351 3.7686 -4.1183 1.6259 2.7881\n", 959 | " 2.2092 2.0311 -3.4778 2.0041 2.0320\n", 960 | "[torch.FloatTensor of size 5x5]\n", 961 | " \n" 962 | ] 963 | } 964 | ], 965 | "source": [ 966 | "o = torch.mm(x, 
y) + z ** 2\n", 967 | "# if create_graph=False then the resulting gradient is volatile and cannot be used further to compute a second loss.\n", 968 | "do_dz = torch.autograd.grad(o, z, grad_outputs=torch.ones(5, 5), retain_graph=True, create_graph=True)\n", 969 | "print('do/dz \\n\\n : %s ' % (do_dz[0]))\n", 970 | "l = o ** 3 + do_dz[0]\n", 971 | "dl_dz = torch.autograd.grad(l, z, grad_outputs=torch.ones(5, 5))\n", 972 | "print('dl/dz \\n\\n : %s ' % (dl_dz[0]))" 973 | ] 974 | } 975 | ], 976 | "metadata": { 977 | "kernelspec": { 978 | "display_name": "Python 2", 979 | "language": "python", 980 | "name": "python2" 981 | }, 982 | "language_info": { 983 | "codemirror_mode": { 984 | "name": "ipython", 985 | "version": 2 986 | }, 987 | "file_extension": ".py", 988 | "mimetype": "text/x-python", 989 | "name": "python", 990 | "nbconvert_exporter": "python", 991 | "pygments_lexer": "ipython2", 992 | "version": "2.7.13" 993 | } 994 | }, 995 | "nbformat": 4, 996 | "nbformat_minor": 2 997 | } 998 | -------------------------------------------------------------------------------- /pytorch/5. 
Neural Machine Translation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## PyTorch Tutorial\n", 8 | "MILA, November 2017\n", 9 | "\n", 10 | "By Sandeep Subramanian" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "## Neural Machine Translation (Seq2Seq)" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 1, 23 | "metadata": { 24 | "collapsed": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "import time\n", 29 | "import numpy as np\n", 30 | "from __future__ import print_function" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": { 37 | "collapsed": true 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "import torch\n", 42 | "import torch.nn as nn\n", 43 | "import torch.optim as optim\n", 44 | "import torch.nn.init as init\n", 45 | "import torch.nn.functional as F\n", 46 | "from torch.autograd import Variable\n", 47 | "from torch.nn.utils.rnn import pack_padded_sequence" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": { 54 | "collapsed": true 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "import numpy as np\n", 59 | "import codecs\n", 60 | "import nltk" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "### Read training, validation & test data\n", 68 | "\n", 69 | "Training data was obtained from http://www.manythings.org/anki/ and partitioned randomly into train, dev and test" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 4, 75 | "metadata": { 76 | "collapsed": true 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "train_lines = [line.strip().split('\\t') for line in codecs.open('data/jpn-train.txt', 'r', encoding='utf-8')]\n", 81 | "dev_lines = [line.strip().split('\\t') for line in 
codecs.open('data/jpn-dev.txt', 'r', encoding='utf-8')]\n", 82 | "test_lines = [line.strip().split('\\t') for line in codecs.open('data/jpn-test.txt', 'r', encoding='utf-8')]" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "### Compute source and target vocabularies" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 5, 95 | "metadata": { 96 | "collapsed": false 97 | }, 98 | "outputs": [ 99 | { 100 | "name": "stdout", 101 | "output_type": "stream", 102 | "text": [ 103 | "Number of unique Japanese words : 2367 \n", 104 | "Number of unique English words : 16065 \n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "src_vocab = set()\n", 110 | "trg_vocab = set()\n", 111 | "for line in train_lines:\n", 112 | " for word in line[1]:\n", 113 | " if word not in src_vocab:\n", 114 | " src_vocab.add(word)\n", 115 | " for word in line[0].split():\n", 116 | " if word not in trg_vocab:\n", 117 | " trg_vocab.add(word)\n", 118 | "\n", 119 | "# Add special tokens to the source and target vocabularies\n", 120 | "src_vocab.add('')\n", 121 | "src_vocab.add('')\n", 122 | "src_vocab.add('')\n", 123 | "src_vocab.add('')\n", 124 | "\n", 125 | "trg_vocab.add('')\n", 126 | "trg_vocab.add('')\n", 127 | "trg_vocab.add('')\n", 128 | "trg_vocab.add('')\n", 129 | "\n", 130 | "src_word2id = {word: idx for idx, word in enumerate(src_vocab)}\n", 131 | "src_id2word = {idx: word for idx, word in enumerate(src_vocab)}\n", 132 | "\n", 133 | "trg_word2id = {word: idx for idx, word in enumerate(trg_vocab)}\n", 134 | "trg_id2word = {idx: word for idx, word in enumerate(trg_vocab)}\n", 135 | "\n", 136 | "print('Number of unique Japanese words : %d ' % (len(src_vocab)))\n", 137 | "print('Number of unique English words : %d ' % (len(trg_vocab)))" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": { 144 | "collapsed": true 145 | }, 146 | "outputs": [], 147 | "source": [] 148 | }, 149 | { 150 
| "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "### Create Seq2Seq model with GRUs" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 6, 159 | "metadata": { 160 | "collapsed": false 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "class Seq2Seq(nn.Module):\n", 165 | " \"\"\"A Vanilla Sequence to Sequence (Seq2Seq) model with GRUs.\n", 166 | " Ref: Sequence to Sequence Learning with Neural Nets\n", 167 | " https://arxiv.org/abs/1409.3215\n", 168 | " \"\"\"\n", 169 | "\n", 170 | " def __init__(\n", 171 | " self, src_emb_dim, trg_emb_dim, src_vocab_size,\n", 172 | " trg_vocab_size, src_hidden_dim, trg_hidden_dim,\n", 173 | " pad_token_src, pad_token_trg, bidirectional=False,\n", 174 | " nlayers_src=1, nlayers_trg=1\n", 175 | " ):\n", 176 | " \"\"\"Initialize Seq2Seq Model.\"\"\"\n", 177 | " super(Seq2Seq, self).__init__()\n", 178 | " self.src_vocab_size = src_vocab_size\n", 179 | " self.trg_vocab_size = trg_vocab_size\n", 180 | " self.src_emb_dim = src_emb_dim\n", 181 | " self.trg_emb_dim = trg_emb_dim\n", 182 | " self.src_hidden_dim = src_hidden_dim\n", 183 | " self.trg_hidden_dim = trg_hidden_dim\n", 184 | " self.bidirectional = bidirectional\n", 185 | " self.nlayers_src = nlayers_src\n", 186 | " self.nlayers_trg = nlayers_trg\n", 187 | " self.pad_token_src = pad_token_src\n", 188 | " self.pad_token_trg = pad_token_trg\n", 189 | " \n", 190 | " # Word Embedding look-up table for the source language\n", 191 | " self.src_embedding = nn.Embedding(\n", 192 | " self.src_vocab_size,\n", 193 | " self.src_emb_dim,\n", 194 | " self.pad_token_src,\n", 195 | " )\n", 196 | "\n", 197 | " # Word Embedding look-up table for the target language\n", 198 | " self.trg_embedding = nn.Embedding(\n", 199 | " self.trg_vocab_size,\n", 200 | " self.trg_emb_dim,\n", 201 | " self.pad_token_trg,\n", 202 | " )\n", 203 | "\n", 204 | " # Encoder GRU\n", 205 | " self.encoder = nn.GRU(\n", 206 | " self.src_emb_dim // 2 if self.bidirectional 
else self.src_emb_dim,\n", 207 | " self.src_hidden_dim,\n", 208 | " self.nlayers_src,\n", 209 | " bidirectional=bidirectional,\n", 210 | " batch_first=True,\n", 211 | " )\n", 212 | "\n", 213 | " # Decoder GRU\n", 214 | " self.decoder = nn.GRU(\n", 215 | " self.trg_emb_dim,\n", 216 | " self.trg_hidden_dim,\n", 217 | " self.nlayers_trg,\n", 218 | " batch_first=True\n", 219 | " )\n", 220 | " \n", 221 | " # Projection layer from decoder hidden states to target language vocabulary\n", 222 | " self.decoder2vocab = nn.Linear(trg_hidden_dim, trg_vocab_size)\n", 223 | "\n", 224 | " def forward(self, input_src, input_trg, src_lengths):\n", 225 | " # Lookup word embeddings in source and target minibatch\n", 226 | " src_emb = self.src_embedding(input_src)\n", 227 | " trg_emb = self.trg_embedding(input_trg)\n", 228 | " \n", 229 | " # Pack padded sequence for length masking in encoder RNN (This requires sorting input sequence by length)\n", 230 | " src_emb = pack_padded_sequence(src_emb, src_lengths, batch_first=True)\n", 231 | " \n", 232 | " # Run sequence of embeddings through the encoder GRU\n", 233 | " _, src_h_t = self.encoder(src_emb)\n", 234 | " \n", 235 | " # Extract the last hidden state of the GRU\n", 236 | " h_t = torch.cat((src_h_t[-1], src_h_t[-2]), 1) if self.bidirectional else src_h_t[-1]\n", 237 | "\n", 238 | " # Initialize the decoder GRU with the last hidden state of the encoder and \n", 239 | " # run target inputs through the decoder.\n", 240 | " trg_h, _ = self.decoder(trg_emb, h_t.unsqueeze(0).expand(self.nlayers_trg, h_t.size(0), h_t.size(1)))\n", 241 | " \n", 242 | " # Merge batch and time dimensions to pass to a linear layer\n", 243 | " trg_h_reshape = trg_h.contiguous().view(\n", 244 | " trg_h.size(0) * trg_h.size(1), trg_h.size(2)\n", 245 | " )\n", 246 | " \n", 247 | " # Affine transformation of all decoder hidden states\n", 248 | " decoder2vocab = self.decoder2vocab(trg_h_reshape)\n", 249 | " \n", 250 | " # Reshape\n", 251 | " decoder2vocab = 
decoder2vocab.view(\n", 252 | " trg_h.size(0), trg_h.size(1), decoder2vocab.size(1)\n", 253 | " )\n", 254 | "\n", 255 | " return decoder2vocab\n", 256 | " \n", 257 | " def decode(self, decoder2vocab):\n", 258 | " # Turn decoder output into a probability distribution over vocabulary\n", 259 | " decoder2vocab_reshape = decoder2vocab.view(-1, decoder2vocab.size(2))\n", 260 | " word_probs = F.softmax(decoder2vocab_reshape)\n", 261 | " word_probs = word_probs.view(\n", 262 | " decoder2vocab.size(0), decoder2vocab.size(1), decoder2vocab.size(2)\n", 263 | " )\n", 264 | "\n", 265 | " return word_probs" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 7, 271 | "metadata": { 272 | "collapsed": false 273 | }, 274 | "outputs": [], 275 | "source": [ 276 | "def get_parallel_minibatch(lines, src_word2id, trg_word2id, index, batch_size, volatile=False):\n", 277 | " \n", 278 | " # Get source sentences for this minibatch\n", 279 | " src_lines = [\n", 280 | " [''] + list(line[1]) + ['']\n", 281 | " for line in lines[index: index + batch_size]\n", 282 | " ]\n", 283 | "\n", 284 | " # Get target sentences for this minibatch\n", 285 | " trg_lines = [\n", 286 | " [''] + line[0].split() + ['']\n", 287 | " for line in lines[index: index + batch_size]\n", 288 | " ]\n", 289 | " \n", 290 | " # Sort source sentences by length for length masking in RNNs\n", 291 | " src_lens = [len(line) for line in src_lines]\n", 292 | " sorted_indices = np.argsort(src_lens)[::-1]\n", 293 | " \n", 294 | " # Reorder sentences based on source lengths\n", 295 | " sorted_src_lines = [src_lines[idx] for idx in sorted_indices]\n", 296 | " sorted_trg_lines = [trg_lines[idx] for idx in sorted_indices]\n", 297 | " \n", 298 | " # Compute new sentence lengths\n", 299 | " sorted_src_lens = [len(line) for line in sorted_src_lines]\n", 300 | " sorted_trg_lens = [len(line) for line in sorted_trg_lines]\n", 301 | " \n", 302 | " # Get max source and target lengths to pad input and output sequences\n", 
303 | " max_src_len = max(sorted_src_lens)\n", 304 | " max_trg_len = max(sorted_trg_lens)\n", 305 | " \n", 306 | " # Construct padded source input sequence\n", 307 | " input_lines_src = [\n", 308 | " [src_word2id[w] if w in src_word2id else src_word2id[''] for w in line] +\n", 309 | " [src_word2id['']] * (max_src_len - len(line))\n", 310 | " for line in sorted_src_lines\n", 311 | " ]\n", 312 | "\n", 313 | " # Construct padded target input sequence\n", 314 | " input_lines_trg = [\n", 315 | " [trg_word2id[w] if w in trg_word2id else trg_word2id[''] for w in line[:-1]] +\n", 316 | " [trg_word2id['']] * (max_trg_len - len(line))\n", 317 | " for line in sorted_trg_lines\n", 318 | " ]\n", 319 | "\n", 320 | " # Construct padded target output sequence (Note: Output sequence is just the input shifted by 1 position)\n", 321 | " # This is for teacher-forcing\n", 322 | " output_lines_trg = [\n", 323 | " [trg_word2id[w] if w in trg_word2id else trg_word2id[''] for w in line[1:]] +\n", 324 | " [trg_word2id['']] * (max_trg_len - len(line))\n", 325 | " for line in sorted_trg_lines\n", 326 | " ]\n", 327 | "\n", 328 | " input_lines_src = Variable(torch.LongTensor(input_lines_src), volatile=volatile)\n", 329 | " input_lines_trg = Variable(torch.LongTensor(input_lines_trg), volatile=volatile)\n", 330 | " output_lines_trg = Variable(torch.LongTensor(output_lines_trg), volatile=volatile)\n", 331 | "\n", 332 | " return {\n", 333 | " 'input_src': input_lines_src,\n", 334 | " 'input_trg': input_lines_trg,\n", 335 | " 'output_trg': output_lines_trg,\n", 336 | " 'src_lens': sorted_src_lens\n", 337 | " }" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 8, 343 | "metadata": { 344 | "collapsed": true 345 | }, 346 | "outputs": [], 347 | "source": [ 348 | "cuda_available = torch.cuda.is_available()" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": 10, 354 | "metadata": { 355 | "collapsed": false 356 | }, 357 | "outputs": [], 358 | "source": 
[ 359 | "seq2seq = Seq2Seq(\n", 360 | " src_emb_dim=128, trg_emb_dim=128,\n", 361 | " src_vocab_size=len(src_word2id), trg_vocab_size=len(trg_word2id),\n", 362 | " src_hidden_dim=512, trg_hidden_dim=512,\n", 363 | " pad_token_src=src_word2id[''],\n", 364 | " pad_token_trg=trg_word2id[''],\n", 365 | ")\n", 366 | "\n", 367 | "if cuda_available:\n", 368 | " seq2seq = seq2seq.cuda()" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 11, 374 | "metadata": { 375 | "collapsed": false 376 | }, 377 | "outputs": [], 378 | "source": [ 379 | "optimizer = optim.Adam(seq2seq.parameters(), lr=4e-4)\n", 380 | "weight_mask = torch.ones(len(trg_word2id))\n", 381 | "if cuda_available:\n", 382 | " weight_mask = weight_mask.cuda()\n", 383 | "weight_mask[trg_word2id['']] = 0\n", 384 | "loss_criterion = nn.CrossEntropyLoss(weight=weight_mask)\n", 385 | "batch_size = 64" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 12, 391 | "metadata": { 392 | "collapsed": false 393 | }, 394 | "outputs": [ 395 | { 396 | "name": "stdout", 397 | "output_type": "stream", 398 | "text": [ 399 | "Epoch : 0 Training Loss : 5.507\n", 400 | "Epoch : 0 Dev Loss : 4.752\n", 401 | "Epoch : 0 Test Loss : 4.679\n", 402 | "-------------------------------------------------------------\n", 403 | "Epoch : 1 Training Loss : 4.257\n", 404 | "Epoch : 1 Dev Loss : 4.223\n", 405 | "Epoch : 1 Test Loss : 4.122\n", 406 | "-------------------------------------------------------------\n", 407 | "Epoch : 2 Training Loss : 3.636\n", 408 | "Epoch : 2 Dev Loss : 3.913\n", 409 | "Epoch : 2 Test Loss : 3.787\n", 410 | "-------------------------------------------------------------\n", 411 | "Epoch : 3 Training Loss : 3.156\n", 412 | "Epoch : 3 Dev Loss : 3.702\n", 413 | "Epoch : 3 Test Loss : 3.555\n", 414 | "-------------------------------------------------------------\n", 415 | "Epoch : 4 Training Loss : 2.757\n", 416 | "Epoch : 4 Dev Loss : 3.554\n", 417 | "Epoch : 4 Test Loss 
: 3.392\n", 418 | "-------------------------------------------------------------\n", 419 | "Epoch : 5 Training Loss : 2.418\n", 420 | "Epoch : 5 Dev Loss : 3.450\n", 421 | "Epoch : 5 Test Loss : 3.276\n", 422 | "-------------------------------------------------------------\n", 423 | "Epoch : 6 Training Loss : 2.127\n", 424 | "Epoch : 6 Dev Loss : 3.381\n", 425 | "Epoch : 6 Test Loss : 3.191\n", 426 | "-------------------------------------------------------------\n", 427 | "Epoch : 7 Training Loss : 1.875\n", 428 | "Epoch : 7 Dev Loss : 3.343\n", 429 | "Epoch : 7 Test Loss : 3.141\n", 430 | "-------------------------------------------------------------\n", 431 | "Epoch : 8 Training Loss : 1.660\n", 432 | "Epoch : 8 Dev Loss : 3.322\n", 433 | "Epoch : 8 Test Loss : 3.115\n", 434 | "-------------------------------------------------------------\n", 435 | "Epoch : 9 Training Loss : 1.468\n", 436 | "Epoch : 9 Dev Loss : 3.297\n", 437 | "Epoch : 9 Test Loss : 3.086\n", 438 | "-------------------------------------------------------------\n", 439 | "Epoch : 10 Training Loss : 1.293\n", 440 | "Epoch : 10 Dev Loss : 3.286\n", 441 | "Epoch : 10 Test Loss : 3.066\n", 442 | "-------------------------------------------------------------\n", 443 | "Epoch : 11 Training Loss : 1.138\n", 444 | "Epoch : 11 Dev Loss : 3.311\n", 445 | "Epoch : 11 Test Loss : 3.085\n", 446 | "-------------------------------------------------------------\n", 447 | "Epoch : 12 Training Loss : 1.005\n", 448 | "Epoch : 12 Dev Loss : 3.322\n", 449 | "Epoch : 12 Test Loss : 3.090\n", 450 | "-------------------------------------------------------------\n" 451 | ] 452 | }, 453 | { 454 | "ename": "KeyboardInterrupt", 455 | "evalue": "", 456 | "output_type": "error", 457 | "traceback": [ 458 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 459 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 460 | "\u001b[0;32m\u001b[0m in 
\u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0mloss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0;31m# Gradient clipping to avoid exploding gradients\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 27\u001b[0;31m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclip_grad_norm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseq2seq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparameters\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5.\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 28\u001b[0m \u001b[0moptimizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0mlosses\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mloss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 461 | "\u001b[0;32m/home/sandeep/anaconda2/lib/python2.7/site-packages/torch/nn/utils/clip_grad.pyc\u001b[0m in \u001b[0;36mclip_grad_norm\u001b[0;34m(parameters, max_norm, norm_type)\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0mtotal_norm\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mp\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mparameters\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 26\u001b[0;31m \u001b[0mparam_norm\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgrad\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnorm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnorm_type\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 27\u001b[0m \u001b[0mtotal_norm\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mparam_norm\u001b[0m \u001b[0;34m**\u001b[0m \u001b[0mnorm_type\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[0mtotal_norm\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtotal_norm\u001b[0m \u001b[0;34m**\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;36m1.\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mnorm_type\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 462 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: " 463 | ] 464 | } 465 | ], 466 | "source": [ 467 | "for epoch in range(15):\n", 468 | " losses = []\n", 469 | " for j in range(0, len(train_lines), batch_size):\n", 470 | " # Get minibatch of examples\n", 471 | " minibatch = get_parallel_minibatch(\n", 472 | " lines=train_lines, src_word2id=src_word2id,\n", 473 | " trg_word2id=trg_word2id, index=j, batch_size=batch_size\n", 474 | " )\n", 475 | " \n", 476 | " if cuda_available:\n", 477 | " minibatch['input_src'] = minibatch['input_src'].cuda()\n", 478 | " minibatch['input_trg'] = minibatch['input_trg'].cuda()\n", 479 | " minibatch['output_trg'] = minibatch['output_trg'].cuda()\n", 480 | " \n", 481 | " decoder_out = seq2seq(\n", 482 | " input_src=minibatch['input_src'], input_trg=minibatch['input_trg'], src_lengths=minibatch['src_lens']\n", 483 | " )\n", 484 | " \n", 485 | " loss = loss_criterion(\n", 486 | " decoder_out.contiguous().view(-1, decoder_out.size(2)),\n", 487 | " minibatch['output_trg'].contiguous().view(-1)\n", 488 | " )\n", 489 | "\n", 490 | " optimizer.zero_grad()\n", 491 | " loss.backward()\n", 492 | " # Gradient clipping to avoid exploding gradients\n", 493 | " torch.nn.utils.clip_grad_norm(seq2seq.parameters(), 5.)\n", 494 | " 
optimizer.step()\n", 495 | " losses.append(loss.data[0])\n", 496 | " \n", 497 | " dev_nll = []\n", 498 | " for j in range(0, len(dev_lines), batch_size):\n", 499 | " # Get minibatch of examples\n", 500 | " minibatch = get_parallel_minibatch(\n", 501 | " lines=dev_lines, src_word2id=src_word2id,\n", 502 | " trg_word2id=trg_word2id, index=j, batch_size=batch_size,\n", 503 | " volatile=True\n", 504 | " )\n", 505 | " \n", 506 | " if cuda_available:\n", 507 | " minibatch['input_src'] = minibatch['input_src'].cuda()\n", 508 | " minibatch['input_trg'] = minibatch['input_trg'].cuda()\n", 509 | " minibatch['output_trg'] = minibatch['output_trg'].cuda()\n", 510 | " \n", 511 | " decoder_out = seq2seq(\n", 512 | " input_src=minibatch['input_src'], input_trg=minibatch['input_trg'], src_lengths=minibatch['src_lens']\n", 513 | " )\n", 514 | " \n", 515 | " loss = loss_criterion(\n", 516 | " decoder_out.contiguous().view(-1, decoder_out.size(2)),\n", 517 | " minibatch['output_trg'].contiguous().view(-1)\n", 518 | " )\n", 519 | "\n", 520 | " dev_nll.append(loss.data[0])\n", 521 | " \n", 522 | " test_nll = []\n", 523 | " for j in range(0, len(test_lines), batch_size):\n", 524 | " # Get minibatch of examples\n", 525 | " minibatch = get_parallel_minibatch(\n", 526 | " lines=test_lines, src_word2id=src_word2id,\n", 527 | " trg_word2id=trg_word2id, index=j, batch_size=batch_size,\n", 528 | " volatile=True\n", 529 | " )\n", 530 | " \n", 531 | " if cuda_available:\n", 532 | " minibatch['input_src'] = minibatch['input_src'].cuda()\n", 533 | " minibatch['input_trg'] = minibatch['input_trg'].cuda()\n", 534 | " minibatch['output_trg'] = minibatch['output_trg'].cuda()\n", 535 | " \n", 536 | " decoder_out = seq2seq(\n", 537 | " input_src=minibatch['input_src'], input_trg=minibatch['input_trg'], src_lengths=minibatch['src_lens']\n", 538 | " )\n", 539 | " \n", 540 | " loss = loss_criterion(\n", 541 | " decoder_out.contiguous().view(-1, decoder_out.size(2)),\n", 542 | " 
minibatch['output_trg'].contiguous().view(-1)\n", 543 | " )\n", 544 | "\n", 545 | " test_nll.append(loss.data[0])\n", 546 | " \n", 547 | " print('Epoch : %d Training Loss : %.3f' % (epoch, np.mean(losses)))\n", 548 | " print('Epoch : %d Dev Loss : %.3f' % (epoch, np.mean(dev_nll)))\n", 549 | " print('Epoch : %d Test Loss : %.3f' % (epoch, np.mean(test_nll)))\n", 550 | " print('-------------------------------------------------------------')" 551 | ] 552 | }, 553 | { 554 | "cell_type": "markdown", 555 | "metadata": {}, 556 | "source": [ 557 | "### Let's see what the model produces for a few sentences in our dev set" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": 13, 563 | "metadata": { 564 | "collapsed": false 565 | }, 566 | "outputs": [ 567 | { 568 | "name": "stdout", 569 | "output_type": "stream", 570 | "text": [ 571 | "Prediction : Tom found Tom unlikely unlikely that Tom would interested planning by Mary has eat able to see the party \n", 572 | "Gold : I think it's highly unlikely that Tom was not aware that he wouldn't be allowed to enter the museum without his parents. \n", 573 | "---------------\n", 574 | "Prediction : It news was me a times I I the grandmother. \n", 575 | "Gold : The photo brought back many happy memories of my childhood. \n", 576 | "---------------\n", 577 | "Prediction : I thought to find a test in I was in the \n", 578 | "Gold : I expected to make new friends when I moved to Boston. \n", 579 | "---------------\n", 580 | "Prediction : I've never heard him speak of the job. \n", 581 | "Gold : I've never heard him complaining about his meals. \n", 582 | "---------------\n", 583 | "Prediction : I thought it would be an to one to go to \n", 584 | "Gold : I thought it would be fun for us to go skiing together. \n", 585 | "---------------\n", 586 | "Prediction : The man was out of his hand to take me to the station. \n", 587 | "Gold : The man went out of his way to take me to the station. 
\n", 588 | "---------------\n", 589 | "Prediction : Nobody must have had my cold. with mistake. \n", 590 | "Gold : Someone must have taken my umbrella by mistake. \n", 591 | "---------------\n", 592 | "Prediction : It's would surprised to she had to a good cook. \n", 593 | "Gold : I am surprised that she refused such a good offer. \n", 594 | "---------------\n", 595 | "Prediction : Tom has never been to to do this \n", 596 | "Gold : Tom has never been able to beat me. \n", 597 | "---------------\n", 598 | "Prediction : The was the night yesterday. had yesterday. \n", 599 | "Gold : Everyone loved the I baked yesterday. \n", 600 | "---------------\n", 601 | "Prediction : The been a picture from the \n", 602 | "Gold : I've ordered a book from Amazon.com. \n", 603 | "---------------\n", 604 | "Prediction : The factory produces CD in the in day. \n", 605 | "Gold : The factory produces thousands of bottles every month. \n", 606 | "---------------\n", 607 | "Prediction : The is the earth's surface is out in snow. \n", 608 | "Gold : Three-fourths of the earth's surface is covered with water. \n", 609 | "---------------\n", 610 | "Prediction : He not cut good but he bit \n", 611 | "Gold : He's not a doctor, but a nurse. \n", 612 | "---------------\n", 613 | "Prediction : I'll be in this tomorrow here afternoon. \n", 614 | "Gold : She'll be up and around this afternoon. \n", 615 | "---------------\n", 616 | "Prediction : The sun was so up in school. with. \n", 617 | "Gold : The situation was getting difficult to deal with. \n", 618 | "---------------\n", 619 | "Prediction : I told him to go the room. \n", 620 | "Gold : I told him to leave the room. \n", 621 | "---------------\n", 622 | "Prediction : She plays a every a older for day. \n", 623 | "Gold : She spends time with her grandmother every Sunday. \n", 624 | "---------------\n", 625 | "Prediction : Tom doesn't had no pretty with Mary. else. \n", 626 | "Gold : Tom hasn't had a fight with anybody lately. 
\n", 627 | "---------------\n", 628 | "Prediction : He asked me to pass him for salt. \n", 629 | "Gold : He asked me to pass him the salt. \n", 630 | "---------------\n", 631 | "Prediction : She's is just to letter to \n", 632 | "Gold : She is writing a letter now. \n", 633 | "---------------\n", 634 | "Prediction : Let's careful to to catch anything much. \n", 635 | "Gold : Be sure not to eat too much. \n", 636 | "---------------\n", 637 | "Prediction : It was not a joke. \n", 638 | "Gold : It was only a partial success. \n", 639 | "---------------\n", 640 | "Prediction : We won't not be \n", 641 | "Gold : We will never agree. \n", 642 | "---------------\n", 643 | "Prediction : She saw to the sight of a town. \n", 644 | "Gold : She froze at the sight of the bear. \n", 645 | "---------------\n", 646 | "Prediction : The bag's is \n", 647 | "Gold : Your reputation you. \n", 648 | "---------------\n", 649 | "Prediction : I need a bad for you. \n", 650 | "Gold : I have a gift for you. \n", 651 | "---------------\n", 652 | "Prediction : Quite man of people attended when the middle \n", 653 | "Gold : A bunch of people died in the explosion. \n", 654 | "---------------\n", 655 | "Prediction : We helped a new \n", 656 | "Gold : We bought a round table. \n", 657 | "---------------\n", 658 | "Prediction : Tom came up his about late. \n", 659 | "Gold : Tom showed up 15 minutes late. \n", 660 | "---------------\n", 661 | "Prediction : There is to to \n", 662 | "Gold : Everything has its limit. \n", 663 | "---------------\n", 664 | "Prediction : Tom doesn't like French. \n", 665 | "Gold : Tom doesn't like cheese. \n", 666 | "---------------\n", 667 | "Prediction : You go now. now. \n", 668 | "Gold : Don't go there now. \n", 669 | "---------------\n", 670 | "Prediction : Are you take a car? \n", 671 | "Gold : Can you ride a bicycle? \n", 672 | "---------------\n", 673 | "Prediction : She has a eye for antiques. beautiful. \n", 674 | "Gold : She has an eye for the beautiful. 
\n", 675 | "---------------\n", 676 | "Prediction : It was a very beautiful flower. \n", 677 | "Gold : It was a very beautiful flower. \n", 678 | "---------------\n", 679 | "Prediction : Look him as \n", 680 | "Gold : Give him time. \n", 681 | "---------------\n", 682 | "Prediction : Can me the salt, please. you? \n", 683 | "Gold : Pass me the salt, will you? \n", 684 | "---------------\n", 685 | "Prediction : They dragged their bottles of wine. \n", 686 | "Gold : They drank two bottles of wine. \n", 687 | "---------------\n", 688 | "Prediction : Where can going to a reason. \n", 689 | "Gold : How about going for a swim? \n", 690 | "---------------\n", 691 | "Prediction : He got to letter to his right. \n", 692 | "Gold : He moved the desk to the right. \n", 693 | "---------------\n", 694 | "Prediction : Dinner is ready. \n", 695 | "Gold : Dinner is ready. \n", 696 | "---------------\n", 697 | "Prediction : Do you know him? he is? \n", 698 | "Gold : Do you know who he is? \n", 699 | "---------------\n", 700 | "Prediction : I feed to my diary every day. \n", 701 | "Gold : I write in my diary every day. \n", 702 | "---------------\n", 703 | "Prediction : People one on \n", 704 | "Gold : No allowed. \n", 705 | "---------------\n", 706 | "Prediction : He is what with the way \n", 707 | "Gold : He is familiar with the subject. \n", 708 | "---------------\n", 709 | "Prediction : It would you were \n", 710 | "Gold : I wish you success. \n", 711 | "---------------\n", 712 | "Prediction : He fell off hard. \n", 713 | "Gold : He got very drunk. \n", 714 | "---------------\n", 715 | "Prediction : Tom's house was late. of fashion. \n", 716 | "Gold : Tom's clothes are out of fashion. \n", 717 | "---------------\n", 718 | "Prediction : I don't like any God. sports. sort of thing. \n", 719 | "Gold : I don't go in for that sort of thing. \n", 720 | "---------------\n", 721 | "Prediction : She's is a beautiful beauty. \n", 722 | "Gold : She is a real beauty. 
\n", 723 | "---------------\n", 724 | "Prediction : Speech is silver, silence \n", 725 | "Gold : Speech is silver, silence is gold. \n", 726 | "---------------\n", 727 | "Prediction : Tom is a short \n", 728 | "Gold : Tom has a hangover. \n", 729 | "---------------\n", 730 | "Prediction : I said mistaken. \n", 731 | "Gold : I was \n", 732 | "---------------\n", 733 | "Prediction : I'm a same \n", 734 | "Gold : I'm the youngest child in the family. \n", 735 | "---------------\n", 736 | "Prediction : Don't it easy. \n", 737 | "Gold : Take it \n", 738 | "---------------\n", 739 | "Prediction : Don't be angry. \n", 740 | "Gold : Don't be mad at me. \n", 741 | "---------------\n", 742 | "Prediction : I have a fever. fever. \n", 743 | "Gold : I have a high temperature. \n", 744 | "---------------\n", 745 | "Prediction : He is a good temper. \n", 746 | "Gold : He has a bad heart. \n", 747 | "---------------\n", 748 | "Prediction : What're are you doing? \n", 749 | "Gold : What are you doing? \n", 750 | "---------------\n", 751 | "Prediction : It's already seven. o'clock. \n", 752 | "Gold : It's already nine o'clock. \n", 753 | "---------------\n", 754 | "Prediction : Get out. \n", 755 | "Gold : Get out! \n", 756 | "---------------\n", 757 | "Prediction : Where have I? \n", 758 | "Gold : Where am I? \n", 759 | "---------------\n", 760 | "Prediction : How are you? \n", 761 | "Gold : How are you? 
\n", 762 | "---------------\n" 763 | ] 764 | } 765 | ], 766 | "source": [ 767 | "# Get the first minibatch in the dev set.\n", 768 | "minibatch = get_parallel_minibatch(\n", 769 | " lines=dev_lines, src_word2id=src_word2id,\n", 770 | " trg_word2id=trg_word2id, index=0, batch_size=batch_size,\n", 771 | " volatile=True\n", 772 | ")\n", 773 | "\n", 774 | "if cuda_available:\n", 775 | " minibatch['input_src'] = minibatch['input_src'].cuda()\n", 776 | " minibatch['input_trg'] = minibatch['input_trg'].cuda()\n", 777 | " minibatch['output_trg'] = minibatch['output_trg'].cuda()\n", 778 | "\n", 779 | "# Run it through our model (in teacher forcing mode)\n", 780 | "res = seq2seq(\n", 781 | " input_src=minibatch['input_src'], input_trg=minibatch['input_trg'], src_lengths=minibatch['src_lens']\n", 782 | ")\n", 783 | "\n", 784 | "# Pick the most likely word at each time step\n", 785 | "res = res.data.cpu().numpy().argmax(axis=-1)\n", 786 | "\n", 787 | "# Cast targets to numpy\n", 788 | "gold = minibatch['output_trg'].data.cpu().numpy()\n", 789 | "\n", 790 | "# Decode indices to words for predictions and gold\n", 791 | "res = [[trg_id2word[x] for x in line] for line in res]\n", 792 | "gold = [[trg_id2word[x] for x in line] for line in gold]\n", 793 | "\n", 794 | "for r, g in zip(res, gold):\n", 795 | " if '' in r:\n", 796 | " index = r.index('')\n", 797 | " else:\n", 798 | " index = len(r)\n", 799 | " \n", 800 | " print('Prediction : %s ' % (' '.join(r[:index])))\n", 801 | "\n", 802 | " index = g.index('')\n", 803 | " print('Gold : %s ' % (' '.join(g[:index])))\n", 804 | " print('---------------')" 805 | ] 806 | }, 807 | { 808 | "cell_type": "code", 809 | "execution_count": null, 810 | "metadata": { 811 | "collapsed": true 812 | }, 813 | "outputs": [], 814 | "source": [] 815 | } 816 | ], 817 | "metadata": { 818 | "kernelspec": { 819 | "display_name": "Python 2", 820 | "language": "python", 821 | "name": "python2" 822 | }, 823 | "language_info": { 824 | "codemirror_mode": { 825 | 
"name": "ipython", 826 | "version": 2 827 | }, 828 | "file_extension": ".py", 829 | "mimetype": "text/x-python", 830 | "name": "python", 831 | "nbconvert_exporter": "python", 832 | "pygments_lexer": "ipython2", 833 | "version": "2.7.13" 834 | } 835 | }, 836 | "nbformat": 4, 837 | "nbformat_minor": 2 838 | } 839 | -------------------------------------------------------------------------------- /pytorch/4. Image Classification with Convnets and ResNets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## PyTorch Tutorial\n", 8 | "MILA, November 2017\n", 9 | "\n", 10 | "By Sandeep Subramanian" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "## Classifying MNIST & CIFAR-10 with Convnets & ResNets" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 1, 23 | "metadata": { 24 | "collapsed": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "import time\n", 29 | "import numpy as np\n", 30 | "from __future__ import print_function" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": { 37 | "collapsed": true 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "import torch\n", 42 | "import torch.nn as nn\n", 43 | "import torch.optim as optim\n", 44 | "import torch.nn.init as init\n", 45 | "import torch.nn.functional as F\n", 46 | "from torch.autograd import Variable" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "import torchvision\n", 58 | "import torchvision.transforms" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 4, 64 | "metadata": { 65 | "collapsed": true 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "import matplotlib.pyplot as plt" 70 | ] 71 | }, 72 | { 73 | "cell_type": 
class Classifier(nn.Module):
    """Convnet classifier: four conv stages followed by a linear layer.

    Every stage is Conv2d(3x3, padding=1) -> Dropout(p=0.5) -> ReLU ->
    MaxPool2d(2x2, stride=2); the channel counts go 1 -> 16 -> 32 -> 64 -> 128.
    The final ``nn.Linear(128, 10)`` expects exactly 128 features per sample,
    i.e. a 1x1 spatial map after the four poolings (28 -> 14 -> 7 -> 3 -> 1
    for the 28x28 MNIST images this notebook trains on).
    """

    @staticmethod
    def _conv_stage(in_channels, out_channels):
        """Return the four layers of one stage: conv -> dropout -> ReLU -> max-pool."""
        return [
            nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=(3, 3), padding=1),
            nn.Dropout(p=0.5),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 2), stride=2),
        ]

    def __init__(self):
        super(Classifier, self).__init__()
        # Layers 1-4. They are appended in order so the nn.Sequential indices
        # (and therefore the state_dict keys conv.0, conv.4, ...) are unchanged.
        layers = []
        for in_channels, out_channels in ((1, 16), (16, 32), (32, 64), (64, 128)):
            layers.extend(self._conv_stage(in_channels, out_channels))
        self.conv = nn.Sequential(*layers)
        # Logistic Regression
        self.clf = nn.Linear(128, 10)

    def forward(self, x):
        """Return class scores of shape (batch, 10) for a (batch, 1, H, W) input.

        The conv features are flattened per sample with ``view`` rather than
        ``squeeze()``: a bare ``squeeze()`` also removes the batch axis when
        the batch holds a single image, which made the output ``(10,)``
        instead of ``(1, 10)``.
        """
        features = self.conv(x)
        return self.clf(features.view(features.size(0), -1))
"--------------------------------------------------------------\n", 208 | "Epoch : 3 Loss : 0.214 \n", 209 | "Epoch : 3 Test Acc : 96.530\n", 210 | "--------------------------------------------------------------\n", 211 | "Epoch : 4 Loss : 0.189 \n", 212 | "Epoch : 4 Test Acc : 97.180\n", 213 | "--------------------------------------------------------------\n", 214 | "Epoch : 5 Loss : 0.171 \n", 215 | "Epoch : 5 Test Acc : 97.600\n", 216 | "--------------------------------------------------------------\n", 217 | "Epoch : 6 Loss : 0.159 \n", 218 | "Epoch : 6 Test Acc : 97.720\n", 219 | "--------------------------------------------------------------\n" 220 | ] 221 | }, 222 | { 223 | "name": "stderr", 224 | "output_type": "stream", 225 | "text": [ 226 | "Process Process-29:\n", 227 | "Traceback (most recent call last):\n", 228 | " File \"/home/sandeep/anaconda2/lib/python2.7/multiprocessing/process.py\", line 258, in _bootstrap\n", 229 | "Process Process-30:\n", 230 | " self.run()\n", 231 | "Traceback (most recent call last):\n", 232 | " File \"/home/sandeep/anaconda2/lib/python2.7/multiprocessing/process.py\", line 258, in _bootstrap\n", 233 | " File \"/home/sandeep/anaconda2/lib/python2.7/multiprocessing/process.py\", line 114, in run\n", 234 | " self.run()\n", 235 | " File \"/home/sandeep/anaconda2/lib/python2.7/multiprocessing/process.py\", line 114, in run\n", 236 | " File \"/home/sandeep/anaconda2/lib/python2.7/site-packages/torch/utils/data/dataloader.py\", line 45, in _worker_loop\n", 237 | " self._target(*self._args, **self._kwargs)\n", 238 | " self._target(*self._args, **self._kwargs)\n", 239 | " File \"/home/sandeep/anaconda2/lib/python2.7/site-packages/torch/utils/data/dataloader.py\", line 41, in _worker_loop\n", 240 | " samples = collate_fn([dataset[i] for i in batch_indices])\n", 241 | " data_queue.put((idx, samples))\n", 242 | " File \"/home/sandeep/anaconda2/lib/python2.7/site-packages/torchvision/datasets/mnist.py\", line 52, in __getitem__\n", 243 | 
" File \"/home/sandeep/anaconda2/lib/python2.7/multiprocessing/queues.py\", line 392, in put\n", 244 | " return send(obj)\n", 245 | " File \"/home/sandeep/anaconda2/lib/python2.7/site-packages/torch/multiprocessing/queue.py\", line 17, in send\n", 246 | " ForkingPickler(buf, pickle.HIGHEST_PROTOCOL).dump(obj)\n", 247 | " File \"/home/sandeep/anaconda2/lib/python2.7/pickle.py\", line 224, in dump\n", 248 | " self.save(obj)\n", 249 | " File \"/home/sandeep/anaconda2/lib/python2.7/pickle.py\", line 286, in save\n", 250 | " f(self, obj) # Call unbound method with explicit self\n", 251 | " File \"/home/sandeep/anaconda2/lib/python2.7/pickle.py\", line 554, in save_tuple\n", 252 | " save(element)\n", 253 | " File \"/home/sandeep/anaconda2/lib/python2.7/pickle.py\", line 286, in save\n", 254 | " f(self, obj) # Call unbound method with explicit self\n", 255 | " File \"/home/sandeep/anaconda2/lib/python2.7/pickle.py\", line 454, in save_int\n", 256 | " self.write(\"%c%c%c\" % (BININT2, obj&0xff, obj>>8))\n", 257 | "KeyboardInterrupt\n", 258 | " img = Image.fromarray(img.numpy(), mode='L')\n", 259 | " File \"/home/sandeep/anaconda2/lib/python2.7/site-packages/PIL/Image.py\", line 2284, in fromarray\n", 260 | " arr = obj.__array_interface__\n", 261 | "KeyboardInterrupt\n" 262 | ] 263 | }, 264 | { 265 | "ename": "KeyboardInterrupt", 266 | "evalue": "", 267 | "output_type": "error", 268 | "traceback": [ 269 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 270 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 271 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mlosses\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;31m# Train\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;32mfor\u001b[0m 
\u001b[0mbatch_idx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtargets\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrainloader\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcuda_available\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtargets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcuda\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtargets\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcuda\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 272 | "\u001b[0;32m/home/sandeep/anaconda2/lib/python2.7/site-packages/torch/utils/data/dataloader.pyc\u001b[0m in \u001b[0;36m__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 194\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 195\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshutdown\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbatches_outstanding\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 196\u001b[0;31m \u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbatch\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata_queue\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 197\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbatches_outstanding\u001b[0m 
\u001b[0;34m-=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 198\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0midx\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrcvd_idx\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 273 | "\u001b[0;32m/home/sandeep/anaconda2/lib/python2.7/multiprocessing/queues.pyc\u001b[0m in \u001b[0;36mget\u001b[0;34m()\u001b[0m\n\u001b[1;32m 376\u001b[0m \u001b[0mracquire\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 377\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 378\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mrecv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 379\u001b[0m \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 380\u001b[0m \u001b[0mrrelease\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 274 | "\u001b[0;32m/home/sandeep/anaconda2/lib/python2.7/site-packages/torch/multiprocessing/queue.pyc\u001b[0m in \u001b[0;36mrecv\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mrecv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0mbuf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_bytes\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 22\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 23\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0;32mdef\u001b[0m 
\u001b[0m__getattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 275 | "\u001b[0;32m/home/sandeep/anaconda2/lib/python2.7/pickle.pyc\u001b[0m in \u001b[0;36mloads\u001b[0;34m(str)\u001b[0m\n\u001b[1;32m 1386\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1387\u001b[0m \u001b[0mfile\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mStringIO\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1388\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mUnpickler\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1389\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1390\u001b[0m \u001b[0;31m# Doctest\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 276 | "\u001b[0;32m/home/sandeep/anaconda2/lib/python2.7/pickle.pyc\u001b[0m in \u001b[0;36mload\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 862\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 863\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 864\u001b[0;31m \u001b[0mdispatch\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 865\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0m_Stop\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mstopinst\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 866\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mstopinst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 277 | "\u001b[0;32m/home/sandeep/anaconda2/lib/python2.7/pickle.pyc\u001b[0m in \u001b[0;36mload_reduce\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1137\u001b[0m \u001b[0margs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstack\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1138\u001b[0m \u001b[0mfunc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstack\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1139\u001b[0;31m \u001b[0mvalue\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1140\u001b[0m \u001b[0mstack\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1141\u001b[0m \u001b[0mdispatch\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mREDUCE\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_reduce\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 278 | "\u001b[0;32m/home/sandeep/anaconda2/lib/python2.7/site-packages/torch/multiprocessing/reductions.pyc\u001b[0m in \u001b[0;36mrebuild_storage_fd\u001b[0;34m(cls, df, size)\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mrebuild_storage_fd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcls\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msize\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 67\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mversion_info\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 68\u001b[0;31m \u001b[0mfd\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmultiprocessing\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreduction\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrebuild_handle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 69\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 70\u001b[0m \u001b[0mfd\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdetach\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 279 | "\u001b[0;32m/home/sandeep/anaconda2/lib/python2.7/multiprocessing/reduction.pyc\u001b[0m in \u001b[0;36mrebuild_handle\u001b[0;34m(pickled_data)\u001b[0m\n\u001b[1;32m 153\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mhandle\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 154\u001b[0m \u001b[0msub_debug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'rebuilding handle %d'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhandle\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 155\u001b[0;31m \u001b[0mconn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mClient\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maddress\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mauthkey\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcurrent_process\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mauthkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 156\u001b[0m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhandle\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetpid\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 157\u001b[0m \u001b[0mnew_handle\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrecv_handle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 280 | "\u001b[0;32m/home/sandeep/anaconda2/lib/python2.7/multiprocessing/connection.pyc\u001b[0m in \u001b[0;36mClient\u001b[0;34m(address, family, authkey)\u001b[0m\n\u001b[1;32m 173\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 174\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mauthkey\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 175\u001b[0;31m \u001b[0manswer_challenge\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mauthkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 176\u001b[0m \u001b[0mdeliver_challenge\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mauthkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 177\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 281 | "\u001b[0;32m/home/sandeep/anaconda2/lib/python2.7/multiprocessing/connection.pyc\u001b[0m in \u001b[0;36manswer_challenge\u001b[0;34m(connection, authkey)\u001b[0m\n\u001b[1;32m 430\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mhmac\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 431\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mauthkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbytes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 432\u001b[0;31m \u001b[0mmessage\u001b[0m \u001b[0;34m=\u001b[0m 
# Train the MNIST classifier for 50 epochs; after every epoch report the mean
# training loss and the accuracy on the test loader.
for epoch in range(50):
    # ---- Training pass ----
    losses = []
    for batch_idx, batch in enumerate(trainloader):
        images, labels = batch
        if cuda_available:
            images = images.cuda()
            labels = labels.cuda()

        optimizer.zero_grad()
        logits = clf(Variable(images))
        loss = criterion(logits, Variable(labels))
        loss.backward()
        optimizer.step()
        losses.append(loss.data[0])

    print('Epoch : %d Loss : %.3f ' % (epoch, np.mean(losses)))

    # ---- Evaluation pass (model switched to eval mode, restored below) ----
    clf.eval()
    total, correct = 0, 0
    for batch_idx, batch in enumerate(testloader):
        images, labels = batch
        if cuda_available:
            images = images.cuda()
            labels = labels.cuda()

        # Wrapped as volatile Variables, as in the training-free decode path;
        # presumably this marks them inference-only in the targeted PyTorch
        # version - confirm before upgrading.
        images = Variable(images, volatile=True)
        labels = Variable(labels, volatile=True)
        logits = clf(images)
        # Index of the highest score along dim 1 is the predicted class.
        predicted = torch.max(logits.data, 1)[1]
        total += labels.size(0)
        correct += predicted.eq(labels.data).cpu().sum()

    print('Epoch : %d Test Acc : %.3f' % (epoch, 100.*correct/total))
    print('--------------------------------------------------------------')
    clf.train()

# CIFAR-10 input pipelines. Both splits share the same Normalize arguments;
# only the training split adds random crop (32, padding=4) and horizontal flip.
cifar_mean = (0.4914, 0.4822, 0.4465)
cifar_std = (0.2023, 0.1994, 0.2010)

cifar_train_transform = torchvision.transforms.Compose([
    torchvision.transforms.RandomCrop(32, padding=4),
    torchvision.transforms.RandomHorizontalFlip(),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(cifar_mean, cifar_std),
])

cifar_test_transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(cifar_mean, cifar_std),
])

# Training split: shuffled batches of 128, loaded by 2 workers.
trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=cifar_train_transform
)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=128, shuffle=True, num_workers=2
)

# Test split: same batch size and workers, fixed order.
testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=cifar_test_transform
)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=128, shuffle=False, num_workers=2
)
# Adapted from https://github.com/kuangliu/pytorch-cifar and
# https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
class ResidualBlock(nn.Module):
    """Two 3x3 conv + batch-norm layers with a shortcut added before the last ReLU.

    The first conv applies ``stride`` and maps ``in_channels`` to
    ``out_channels``; the second keeps both stride 1 and the channel count.
    """
    expansion = 1

    def __init__(self, in_channels, out_channels, stride=1):
        super(ResidualBlock, self).__init__()

        # First conv/bn pair: carries the stride and the channel change.
        self.conv1 = nn.Conv2d(
            in_channels=in_channels, out_channels=out_channels,
            kernel_size=(3, 3), stride=stride, padding=1, bias=False
        )
        self.bn1 = nn.BatchNorm2d(out_channels)

        # Second conv/bn pair: shape-preserving.
        self.conv2 = nn.Conv2d(
            in_channels=out_channels, out_channels=out_channels,
            kernel_size=(3, 3), stride=1, padding=1, bias=False
        )
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Shortcut branch: an empty Sequential (identity) when the block keeps
        # its input shape, otherwise a strided 1x1 conv + batch-norm projection.
        if stride == 1 and in_channels == out_channels:
            self.shortcut = nn.Sequential()
        else:
            projection = nn.Conv2d(
                in_channels=in_channels, out_channels=out_channels,
                kernel_size=(1, 1), stride=stride, bias=False
            )
            self.shortcut = nn.Sequential(projection, nn.BatchNorm2d(out_channels))

    def forward(self, x):
        """Return relu(bn2(conv2(relu(bn1(conv1(x))))) + shortcut(x))."""
        hidden = self.conv1(x)
        hidden = F.relu(self.bn1(hidden))
        hidden = self.conv2(hidden)
        hidden = self.bn2(hidden)
        hidden += self.shortcut(x)
        return F.relu(hidden)


class CIFARResNet18(nn.Module):
    """Input conv + four stages of two ResidualBlocks each + linear classifier.

    The stages widen 64 -> 128 -> 256 -> 512 channels; stages 2-4 use stride 2.
    ``forward`` average-pools with a fixed window of 4 and feeds 512 features
    to the linear layer.
    """

    def __init__(self, num_classes=10):
        super(CIFARResNet18, self).__init__()

        # Stem: 3 input channels to 64, spatial size unchanged.
        self.conv1 = nn.Conv2d(
            in_channels=3, out_channels=64, kernel_size=(3, 3),
            stride=1, padding=1, bias=False
        )
        self.bn1 = nn.BatchNorm2d(64)

        # Stages 1-4 (created in order, so parameter initialisation order is kept).
        self.stage1 = self._create_stage(64, 64, stride=1)
        self.stage2 = self._create_stage(64, 128, stride=2)
        self.stage3 = self._create_stage(128, 256, stride=2)
        self.stage4 = self._create_stage(256, 512, stride=2)
        self.linear = nn.Linear(512, num_classes)

    def _create_stage(self, in_channels, out_channels, stride):
        """Build one stage: a (possibly strided) block followed by a stride-1 block."""
        first = ResidualBlock(in_channels, out_channels, stride)
        second = ResidualBlock(out_channels, out_channels, 1)
        return nn.Sequential(first, second)

    def forward(self, x):
        """Return per-class scores of shape (batch, num_classes)."""
        out = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.stage1, self.stage2, self.stage3, self.stage4):
            out = stage(out)
        out = F.avg_pool2d(out, 4)
        # Flatten everything but the batch axis for the linear layer.
        flat = out.view(out.size(0), -1)
        return self.linear(flat)
}, 482 | "outputs": [ 483 | { 484 | "name": "stdout", 485 | "output_type": "stream", 486 | "text": [ 487 | "Epoch : 0 Loss : 1.631 Time : 35.428 seconds \n", 488 | "Epoch : 0 Test Acc : 46.180\n", 489 | "--------------------------------------------------------------\n", 490 | "Epoch : 1 Loss : 1.120 Time : 36.086 seconds \n", 491 | "Epoch : 1 Test Acc : 57.270\n", 492 | "--------------------------------------------------------------\n" 493 | ] 494 | }, 495 | { 496 | "name": "stderr", 497 | "output_type": "stream", 498 | "text": [ 499 | "Process Process-40:\n", 500 | "KeyboardInterrupt\n", 501 | "Process Process-39:\n", 502 | "Traceback (most recent call last):\n", 503 | "Traceback (most recent call last):\n", 504 | " File \"/home/sandeep/anaconda2/lib/python2.7/multiprocessing/process.py\", line 258, in _bootstrap\n", 505 | " File \"/home/sandeep/anaconda2/lib/python2.7/multiprocessing/process.py\", line 258, in _bootstrap\n" 506 | ] 507 | }, 508 | { 509 | "ename": "KeyboardInterrupt", 510 | "evalue": "", 511 | "output_type": "error", 512 | "traceback": [ 513 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 514 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 515 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0mloss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0moptimizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 16\u001b[0;31m 
# Train the CIFAR-10 ResNet for 200 epochs; after every epoch report the mean
# training loss, the wall-clock time of the training pass and the test accuracy.
for epoch in range(200):
    losses = []
    # Advance the learning-rate schedule once per epoch, before the batches.
    scheduler.step()

    # ---- Training pass (timed) ----
    start = time.time()
    for batch_idx, batch in enumerate(trainloader):
        images, labels = batch
        if cuda_available:
            images = images.cuda()
            labels = labels.cuda()

        optimizer.zero_grad()
        logits = clf(Variable(images))
        loss = criterion(logits, Variable(labels))
        loss.backward()
        optimizer.step()
        losses.append(loss.data[0])
    end = time.time()

    print('Epoch : %d Loss : %.3f Time : %.3f seconds ' % (epoch, np.mean(losses), end - start))

    # ---- Evaluation pass (model switched to eval mode, restored below) ----
    clf.eval()
    total, correct = 0, 0
    for batch_idx, batch in enumerate(testloader):
        images, labels = batch
        if cuda_available:
            images = images.cuda()
            labels = labels.cuda()

        # Wrapped as volatile Variables; presumably inference-only in the
        # targeted PyTorch version - confirm before upgrading.
        images = Variable(images, volatile=True)
        labels = Variable(labels, volatile=True)
        logits = clf(images)
        # Index of the highest score along dim 1 is the predicted class.
        predicted = torch.max(logits.data, 1)[1]
        total += labels.size(0)
        correct += predicted.eq(labels.data).cpu().sum()

    print('Epoch : %d Test Acc : %.3f' % (epoch, 100.*correct/total))
    print('--------------------------------------------------------------')
    clf.train()
} 610 | }, 611 | "nbformat": 4, 612 | "nbformat_minor": 2 613 | } 614 | -------------------------------------------------------------------------------- /tensorflow/tensorflow.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# TensorFlow tutorial\n", 8 | "_MILA, November 2017_\n", 9 | "\n", 10 | "## Protip: browsing through the TensorFlow API\n", 11 | "\n", 12 | "The [devdocs.io](http://devdocs.io/) website is an amazing resource to browse through the TensorFlow Python API (as well as many other APIs such as numpy or the Python API itself).\n", 13 | "\n", 14 | "# Using TensorFlow at MILA\n", 15 | "\n", 16 | "The most straightforward way to access TensorFlow using the MILA software stack is through the `tf1.4` conda environment. To activate the `tf1.4` environment, use the command\n", 17 | "\n", 18 | "```bash\n", 19 | "source activate tf1.4\n", 20 | "```\n", 21 | "\n", 22 | "To return back to normal, simply use the bash command\n", 23 | "\n", 24 | "```bash\n", 25 | "source deactivate\n", 26 | "```\n", 27 | "\n", 28 | "# Installing TensorFlow outside MILA\n", 29 | "\n", 30 | "Follow the [online documentation](https://www.tensorflow.org/install/), which describes how to install TensorFlow for all major platforms (Linux, macOS, Windows) in various ways (`virtualenv`, native `pip`, Docker, Anaconda).\n", 31 | "\n", 32 | "# Importing TensorFlow\n", 33 | "\n", 34 | "TensorFlow is imported as a Python package using the following statement:" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 1, 40 | "metadata": { 41 | "collapsed": true 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "import tensorflow as tf" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "_Note: the `tensorflow` package is usually aliased to `tf` for convenience._" 53 | ] 54 | }, 55 | { 56 | "cell_type": 
"markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "# Paradigm\n", 60 | "\n", 61 | "TensorFlow separates the _definition_ of computation from its _execution_.\n", 62 | "\n", 63 | "Computation is defined via a [_dataflow_](https://en.wikipedia.org/wiki/Dataflow_programming) _graph_, i.e., a graph where nodes represent units of computation and the edges represent the data consumed or produced by the computation.\n", 64 | "\n", 65 | "TensorFlow calls these edges _tensors_ (not to be confused with the mathematical object of the same name). In TensorFlow parlance, a tensor is simply a multi-dimensional array of a certain data type.\n", 66 | "\n", 67 | "# Constant, variable, placeholder, and random tensors\n", 68 | "\n", 69 | "Many types of tensors may be used as input to the computation graph. We will cover four of them here: constant, variable, placeholder, and random tensors.\n", 70 | "\n", 71 | "## Constant\n", 72 | "\n", 73 | "A constant tensor always evaluates to the same value. It can be created using the `tf.constant` function:" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 2, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "Tensor(\"c:0\", shape=(), dtype=float32)\n" 86 | ] 87 | } 88 | ], 89 | "source": [ 90 | "c = tf.constant(value=42.0, name='c')\n", 91 | "print(c)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "The `name` argument is not strictly necessary, but it is considered good practice to name things in TensorFlow, as it facilitates visualizing the computation graph and debugging.\n", 99 | "\n", 100 | "To get the value associated with a constant tensor, we evaluate it within a session:" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 3, 106 | "metadata": {}, 107 | "outputs": [ 108 | { 109 | "name": "stdout", 110 | "output_type": "stream", 111 | "text": [ 112 | "The value for c 
is 42.0\n" 113 | ] 114 | } 115 | ], 116 | "source": [ 117 | "with tf.Session() as session:\n", 118 | " print('The value for c is {}'.format(session.run(c)))" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "The value for a constant _always_ stays the same, be it within the same session or across different sessions:" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 4, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "name": "stdout", 135 | "output_type": "stream", 136 | "text": [ 137 | "The value for c is 42.0\n", 138 | "The value for c is 42.0\n", 139 | "The value for c is 42.0\n" 140 | ] 141 | } 142 | ], 143 | "source": [ 144 | "with tf.Session() as session:\n", 145 | " print('The value for c is {}'.format(session.run(c)))\n", 146 | " print('The value for c is {}'.format(session.run(c)))\n", 147 | "\n", 148 | "with tf.Session() as session:\n", 149 | " print('The value for c is {}'.format(session.run(c)))" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "## Variable\n", 157 | "\n", 158 | "It can be useful for an input tensor's value to evolve across the lifetime of a session. For instance, a tensor's value can represent the weights of a neural network which we want to update using gradient descent.\n", 159 | "\n", 160 | "Tensors with this property are called _variables_. 
The preferred way to create variables is via `tf.get_variable`:" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 5, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "\n" 173 | ] 174 | } 175 | ], 176 | "source": [ 177 | "v = tf.get_variable(\n", 178 | " name='v', shape=[2], dtype=tf.float32,\n", 179 | " initializer=tf.zeros_initializer())\n", 180 | "print(v)" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "This time, the `name` argument is required. This is because TensorFlow refers to variables by name. As such, TensorFlow expects the name for the variable to be unique. Trying to create a variable with the same name will result in an error:" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 6, 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "Variable v already exists, disallowed. Did you mean to set reuse=True or reuse=tf.AUTO_REUSE in VarScope? Originally defined at:\n" 200 | ] 201 | } 202 | ], 203 | "source": [ 204 | "# Throughout this tutorial, we will wrap statements that\n", 205 | "# we know will cause an error to be raised with a try-except\n", 206 | "# block to print the error message only, and not the whole\n", 207 | "# stack trace.\n", 208 | "try:\n", 209 | " tf.get_variable(name='v')\n", 210 | "except ValueError as e:\n", 211 | " print(str(e).split('\\n')[0])" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "_(In case you are wondering, there is a way to bypass this behavior and retrieve by name a variable which has already been created. More on that later.)_\n", 219 | "\n", 220 | "Note that a variable's value only makes sense _within the context of a session_. Furthermore, a variable's initial value has to be set before it can be used. 
See what happens if we try to evaluate `v` within a session:" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 7, 226 | "metadata": {}, 227 | "outputs": [ 228 | { 229 | "name": "stdout", 230 | "output_type": "stream", 231 | "text": [ 232 | "Attempting to use uninitialized value v\n", 233 | "\t [[Node: _retval_v_0_0 = _Retval[T=DT_FLOAT, index=0, _device=\"/job:localhost/replica:0/task:0/device:CPU:0\"](v)]]\n" 234 | ] 235 | } 236 | ], 237 | "source": [ 238 | "with tf.Session() as session:\n", 239 | " try:\n", 240 | " session.run(v)\n", 241 | " except tf.errors.FailedPreconditionError as e:\n", 242 | " print(e)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "TensorFlow provides a function, `tf.global_variables_initializer`, which returns an op that can be evaluated to do just that:" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 8, 255 | "metadata": {}, 256 | "outputs": [ 257 | { 258 | "name": "stdout", 259 | "output_type": "stream", 260 | "text": [ 261 | "The value for v is [ 0. 0.]\n" 262 | ] 263 | } 264 | ], 265 | "source": [ 266 | "with tf.Session() as session:\n", 267 | " session.run(tf.global_variables_initializer())\n", 268 | " print('The value for v is {}'.format(session.run(v)))" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": {}, 274 | "source": [ 275 | "A variable's value persists across a session unless it is updated by running an assignment op. For instance, the op returned by `tf.assign_add` can be used to increment a variable's value:" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 9, 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "name": "stdout", 285 | "output_type": "stream", 286 | "text": [ 287 | "The value for v is [ 0. 0.]\n", 288 | "The value for v is [ 0. 0.]\n", 289 | "The value for v is [ 1. 
2.]\n" 290 | ] 291 | } 292 | ], 293 | "source": [ 294 | "with tf.Session() as session:\n", 295 | " session.run(tf.global_variables_initializer())\n", 296 | " # The value for v persists across session.run calls...\n", 297 | " print('The value for v is {}'.format(session.run(v)))\n", 298 | " print('The value for v is {}'.format(session.run(v)))\n", 299 | " # ... until it is updated by running an assignment op.\n", 300 | " session.run(v.assign_add([1, 2]))\n", 301 | " print('The value for v is {}'.format(session.run(v)))" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": {}, 307 | "source": [ 308 | "In addition to `tf.assign_add`, the `tf.assign_sub` and `tf.assign` functions return ops which decrement and assign a variable's value, respectively.\n", 309 | "\n", 310 | "## Placeholder\n", 311 | "\n", 312 | "Oftentimes the computation we define depends on data which we don't yet have. For instance, the output of a neural network depends on a user-defined input which is only specified at runtime.\n", 313 | "\n", 314 | "_Placeholder_ tensors are used to represent this data. 
They can be created via `tf.placeholder`:" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 10, 320 | "metadata": {}, 321 | "outputs": [ 322 | { 323 | "name": "stdout", 324 | "output_type": "stream", 325 | "text": [ 326 | "Tensor(\"p:0\", shape=(), dtype=float32)\n" 327 | ] 328 | } 329 | ], 330 | "source": [ 331 | "p = tf.placeholder(dtype=tf.float32, shape=[], name='p')\n", 332 | "print(p)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "Once again, the `name` argument is optional, but it is good practice to provide it.\n", 340 | "\n", 341 | "Because it has no pre-defined value, evaluating a placeholder tensor raises an error:" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 11, 347 | "metadata": {}, 348 | "outputs": [ 349 | { 350 | "name": "stdout", 351 | "output_type": "stream", 352 | "text": [ 353 | "InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor 'p' with dtype float\n", 354 | "\t [[Node: p = Placeholder[dtype=DT_FLOAT, shape=[], _device=\"/job:localhost/replica:0/task:0/device:CPU:0\"]()]]\n", 355 | "\n" 356 | ] 357 | } 358 | ], 359 | "source": [ 360 | "with tf.Session() as session:\n", 361 | " try:\n", 362 | " session.run(p)\n", 363 | " except tf.errors.InvalidArgumentError as e:\n", 364 | " # Cutting through the error message...\n", 365 | " print('\\n'.join(str(e).split('\\n')[-3:]))" 366 | ] 367 | }, 368 | { 369 | "cell_type": "markdown", 370 | "metadata": {}, 371 | "source": [ 372 | "Its value must be _explicitly_ passed to `session.run` via the `feed_dict` argument, which expects a `dict` mapping tensors to their value:" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": 12, 378 | "metadata": {}, 379 | "outputs": [ 380 | { 381 | "name": "stdout", 382 | "output_type": "stream", 383 | "text": [ 384 | "The value for p is 42.0\n", 385 | "The value for p is 21.0\n" 386 | ] 387 | 
} 388 | ], 389 | "source": [ 390 | "with tf.Session() as session:\n", 391 | " # feed p the value 42\n", 392 | " print('The value for p is {}'.format(session.run(p, feed_dict={p: 42})))\n", 393 | " # feed p the value 21\n", 394 | " print('The value for p is {}'.format(session.run(p, feed_dict={p: 21})))" 395 | ] 396 | }, 397 | { 398 | "cell_type": "markdown", 399 | "metadata": {}, 400 | "source": [ 401 | "## Random\n", 402 | "\n", 403 | "Another useful input tensor to have in our toolbox is the random tensor. The random seed can be set globally via `tf.set_random_seed`:" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 13, 409 | "metadata": { 410 | "collapsed": true 411 | }, 412 | "outputs": [], 413 | "source": [ 414 | "tf.set_random_seed(1234)" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": {}, 420 | "source": [ 421 | "There are many random distributions to choose from in TensorFlow. Let's look at `tf.random_uniform`:" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 14, 427 | "metadata": { 428 | "collapsed": true 429 | }, 430 | "outputs": [], 431 | "source": [ 432 | "r = tf.random_uniform(\n", 433 | " shape=[], minval=0.0, maxval=1.0, dtype=tf.float32, name='r')" 434 | ] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "metadata": {}, 439 | "source": [ 440 | "A random tensor's value changes randomly between `session.run` calls, but the sequence of those random values stays the same across different sessions:" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 15, 446 | "metadata": {}, 447 | "outputs": [ 448 | { 449 | "name": "stdout", 450 | "output_type": "stream", 451 | "text": [ 452 | "The value for r is 0.8478444814682007\n", 453 | "The value for r is 0.23446130752563477\n", 454 | "The value for r is 0.8478444814682007\n", 455 | "The value for r is 0.23446130752563477\n" 456 | ] 457 | } 458 | ], 459 | "source": [ 460 | "with tf.Session() as 
session:\n", 461 | " print('The value for r is {}'.format(session.run(r)))\n", 462 | " print('The value for r is {}'.format(session.run(r)))\n", 463 | "\n", 464 | "with tf.Session() as session:\n", 465 | " print('The value for r is {}'.format(session.run(r)))\n", 466 | " print('The value for r is {}'.format(session.run(r)))" 467 | ] 468 | }, 469 | { 470 | "cell_type": "markdown", 471 | "metadata": {}, 472 | "source": [ 473 | "# Combining tensors\n", 474 | "\n", 475 | "Tensors can be combined in various ways using what TensorFlow calls operations, or _ops_. Ops can take zero or more tensors as input and produce zero or more tensors as output, with or without side effects.\n", 476 | "\n", 477 | "We have already dealt with ops when initializing or assigning values to variables, but there are _many_ more TensorFlow functions which can be used to create ops.\n", 478 | "\n", 479 | "The best way to discover new useful ops is to browse the [TensorFlow Python API](https://www.tensorflow.org/api_docs/python/). 
For instance, we can discover that there exists a function, `tf.add`, which adds two tensors together and returns a tensor representing the output:" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 16, 485 | "metadata": {}, 486 | "outputs": [ 487 | { 488 | "name": "stdout", 489 | "output_type": "stream", 490 | "text": [ 491 | "1 + 3 = 4\n" 492 | ] 493 | } 494 | ], 495 | "source": [ 496 | "one_plus_three = tf.add(1, 3)\n", 497 | "\n", 498 | "with tf.Session() as session:\n", 499 | " print('1 + 3 = {}'.format(session.run(one_plus_three)))" 500 | ] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "metadata": {}, 505 | "source": [ 506 | "Note that TensorFlow also offers syntactic sugar by overriding some Python operators like `+`, `-`, `*`, and `/`:" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": 17, 512 | "metadata": {}, 513 | "outputs": [ 514 | { 515 | "name": "stdout", 516 | "output_type": "stream", 517 | "text": [ 518 | "1 + 4 = 5\n" 519 | ] 520 | } 521 | ], 522 | "source": [ 523 | "one_plus_four = tf.constant(1) + tf.constant(4)\n", 524 | "\n", 525 | "with tf.Session() as session:\n", 526 | " print('1 + 4 = {}'.format(session.run(one_plus_four)))" 527 | ] 528 | }, 529 | { 530 | "cell_type": "markdown", 531 | "metadata": {}, 532 | "source": [ 533 | "# Where tensors live\n", 534 | "\n", 535 | "So far we created tensors and combined them together, but we have not explicitly dealt with the computation graph itself. 
Which computation graph are these tensors part of, then?\n", 536 | "\n", 537 | "By default, TensorFlow stores all tensors and operations in a **default graph**, which can be accessed as follows:" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": 18, 543 | "metadata": {}, 544 | "outputs": [ 545 | { 546 | "name": "stdout", 547 | "output_type": "stream", 548 | "text": [ 549 | "[, , , ]\n" 550 | ] 551 | } 552 | ], 553 | "source": [ 554 | "# Access the default graph\n", 555 | "default_graph = tf.get_default_graph()\n", 556 | "# Print first four operations defined in the graph\n", 557 | "print(default_graph.get_operations()[:4])" 558 | ] 559 | }, 560 | { 561 | "cell_type": "markdown", 562 | "metadata": {}, 563 | "source": [ 564 | "We can change that default graph to be another graph:" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": 19, 570 | "metadata": {}, 571 | "outputs": [ 572 | { 573 | "name": "stdout", 574 | "output_type": "stream", 575 | "text": [ 576 | "default graph contains 23 operations, alternate graph contains 0 operations\n", 577 | "[, ]\n", 578 | "default graph contains 23 operations, alternate graph contains 2 operations\n" 579 | ] 580 | } 581 | ], 582 | "source": [ 583 | "# Create a new Graph\n", 584 | "alternate_graph = tf.Graph()\n", 585 | "\n", 586 | "# Print the number of operations defined in the default and alternate graphs\n", 587 | "print('default graph contains {} operations, '.format(len(default_graph.get_operations())) +\n", 588 | " 'alternate graph contains {} operations'.format(len(alternate_graph.get_operations())))\n", 589 | "\n", 590 | "# Use the alternate graph as the default graph\n", 591 | "with alternate_graph.as_default():\n", 592 | " tf.constant(0, name='a')\n", 593 | " tf.constant(1, name='b')\n", 594 | " print(alternate_graph.get_operations())\n", 595 | "\n", 596 | "# Print again the number of operations defined in the default and alternate graphs\n", 597 | "print('default 
graph contains {} operations, '.format(len(default_graph.get_operations())) +\n", 598 | " 'alternate graph contains {} operations'.format(len(alternate_graph.get_operations())))" 599 | ] 600 | }, 601 | { 602 | "cell_type": "markdown", 603 | "metadata": {}, 604 | "source": [ 605 | "In practice, you usually won't have to deal with multiple computation graphs. We will however use multiple computation graphs in this tutorial to isolate ops in their own namespace where appropriate." 606 | ] 607 | }, 608 | { 609 | "cell_type": "markdown", 610 | "metadata": {}, 611 | "source": [ 612 | "# Gradients" 613 | ] 614 | }, 615 | { 616 | "cell_type": "markdown", 617 | "metadata": {}, 618 | "source": [ 619 | "TensorFlow supports _automatic differentiation_, i.e., it can compute the derivative of scalars with respect to tensors in the graph and represent the result as a symbolic expression.\n", 620 | "\n", 621 | "Take for instance the linear equation" 622 | ] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "execution_count": 20, 627 | "metadata": { 628 | "collapsed": true 629 | }, 630 | "outputs": [], 631 | "source": [ 632 | "x = tf.placeholder(dtype=tf.float32, shape=[], name='p')\n", 633 | "y = 3 * x + 2" 634 | ] 635 | }, 636 | { 637 | "cell_type": "markdown", 638 | "metadata": {}, 639 | "source": [ 640 | "We can compute the derivative of `y` with respect to `x` with the `tf.gradients` function:" 641 | ] 642 | }, 643 | { 644 | "cell_type": "code", 645 | "execution_count": 21, 646 | "metadata": { 647 | "collapsed": true 648 | }, 649 | "outputs": [], 650 | "source": [ 651 | "dy_dx, = tf.gradients(ys=y, xs=[x])" 652 | ] 653 | }, 654 | { 655 | "cell_type": "markdown", 656 | "metadata": {}, 657 | "source": [ 658 | "We can verify that the gradient evaluates to 3 as expected:" 659 | ] 660 | }, 661 | { 662 | "cell_type": "code", 663 | "execution_count": 22, 664 | "metadata": {}, 665 | "outputs": [ 666 | { 667 | "name": "stdout", 668 | "output_type": "stream", 669 | "text": [ 670 | "The 
gradient of y with respect to x is 3.0\n" 671 | ] 672 | } 673 | ], 674 | "source": [ 675 | "with tf.Session() as session:\n", 676 | " dy_dx_val = session.run(dy_dx)\n", 677 | " print('The gradient of y with respect '\n", 678 | " 'to x is {}'.format(dy_dx_val))" 679 | ] 680 | }, 681 | { 682 | "cell_type": "markdown", 683 | "metadata": {}, 684 | "source": [ 685 | "_Note: Some of you may have noticed that TensorFlow did not complain despite no value being provided for the `x` placeholder. This is because even though `x` is part of the computation graph, the derivative of `y` with respect to `x` does not involve `x`, and therefore evaluating it does not require a value to be passed for `x`._" 686 | ] 687 | }, 688 | { 689 | "cell_type": "markdown", 690 | "metadata": {}, 691 | "source": [ 692 | "# Exercise\n", 693 | "\n", 694 | "Find the minimum of the expression\n", 695 | "\n", 696 | "$$2(x - 2)^2 + 2(y + 3)^2$$\n", 697 | "\n", 698 | "using gradient descent by filling in the following code block:" 699 | ] 700 | }, 701 | { 702 | "cell_type": "code", 703 | "execution_count": 23, 704 | "metadata": { 705 | "collapsed": true 706 | }, 707 | "outputs": [], 708 | "source": [ 709 | "# Avoid polluting the default graph by using an alternate graph\n", 710 | "with tf.Graph().as_default():\n", 711 | " tf.set_random_seed(1234)\n", 712 | "\n", 713 | " # Create two scalar variables, x and y, initialized at random.\n", 714 | " # x = WRITEME.\n", 715 | " # y = WRITEME.\n", 716 | "\n", 717 | " # Create a tensor z whose value represents the expression\n", 718 | " # 2(x - 2)^2 + 2(y + 3)^2\n", 719 | " # z = WRITEME.\n", 720 | " \n", 721 | " # Compute the gradients of z with respect to x and y.\n", 722 | " # dx, dy = WRITEME.\n", 723 | " \n", 724 | " # Create an assignment expression for x using the update rule\n", 725 | " # x <- x - 0.1 * dz/dx\n", 726 | " # and do the same for y.\n", 727 | " # x_update = WRITEME.\n", 728 | " # y_update = WRITEME.\n", 729 | " \n", 730 | " with tf.Session() 
as session:\n", 731 | " # Run the global initializer op for x and y.\n", 732 | " # WRITEME.\n", 733 | " \n", 734 | " for _ in range(10):\n", 735 | " pass\n", 736 | " # Run the update ops for x and y.\n", 737 | " # WRITEME.\n", 738 | " \n", 739 | " # Retrieve the values for x, y, and z, and print them.\n", 740 | " # x_val, y_val, z_val = WRITEME.\n", 741 | " # print('x = {:4.2f}, y = {:4.2f}, z = {:4.2f}'.format(x_val, y_val, z_val))" 742 | ] 743 | }, 744 | { 745 | "cell_type": "markdown", 746 | "metadata": {}, 747 | "source": [ 748 | "## Solution" 749 | ] 750 | }, 751 | { 752 | "cell_type": "code", 753 | "execution_count": null, 754 | "metadata": {}, 755 | "outputs": [], 756 | "source": [ 757 | "%load tensorflow_exercise_solution.py" 758 | ] 759 | }, 760 | { 761 | "cell_type": "markdown", 762 | "metadata": {}, 763 | "source": [ 764 | "# Optimization made easy\n", 765 | "\n", 766 | "The solution to the exercise above can be shortened quite a bit by taking advantage of TensorFlow's optimization features. Here is the graph we were working with:" 767 | ] 768 | }, 769 | { 770 | "cell_type": "code", 771 | "execution_count": 25, 772 | "metadata": { 773 | "collapsed": true 774 | }, 775 | "outputs": [], 776 | "source": [ 777 | "tf.set_random_seed(1234)\n", 778 | "x = tf.get_variable(name='x', shape=[], dtype=tf.float32,\n", 779 | " initializer=tf.random_normal_initializer())\n", 780 | "y = tf.get_variable(name='y', shape=[], dtype=tf.float32,\n", 781 | " initializer=tf.random_normal_initializer())\n", 782 | "\n", 783 | "z = 2 * (x - 2) ** 2 + 2 * (y + 3) ** 2" 784 | ] 785 | }, 786 | { 787 | "cell_type": "markdown", 788 | "metadata": {}, 789 | "source": [ 790 | "TensorFlow provides utility classes to facilitate optimization in computation graphs. These classes inherit from `tf.train.Optimizer`. Let's look at the simplest one, `tf.train.GradientDescentOptimizer`.\n", 791 | "\n", 792 | "We instantiate the `tf.train.GradientDescentOptimizer` by passing it a scalar learning rate. 
Note that the learning rate itself can be symbolic, and is allowed to vary across a session." 793 | ] 794 | }, 795 | { 796 | "cell_type": "code", 797 | "execution_count": 26, 798 | "metadata": { 799 | "collapsed": true 800 | }, 801 | "outputs": [], 802 | "source": [ 803 | "optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1)" 804 | ] 805 | }, 806 | { 807 | "cell_type": "markdown", 808 | "metadata": {}, 809 | "source": [ 810 | "We can then call the optimizer's `minimize` function to obtain an op which, when evaluated, does a gradient descent step on the variables we specified:" 811 | ] 812 | }, 813 | { 814 | "cell_type": "code", 815 | "execution_count": 27, 816 | "metadata": { 817 | "collapsed": true 818 | }, 819 | "outputs": [], 820 | "source": [ 821 | "update_op = optimizer.minimize(loss=z, var_list=tf.trainable_variables())" 822 | ] 823 | }, 824 | { 825 | "cell_type": "markdown", 826 | "metadata": {}, 827 | "source": [ 828 | "Here we took advantage of the fact that all variables created via `tf.get_variable` can be accessed as a list using the `tf.trainable_variables` function.\n", 829 | "\n", 830 | "(Note: you can pass `trainable=False` to `tf.get_variable` to exclude a certain variable from ending up in that list.)\n", 831 | "\n", 832 | "The code then proceeds as before:" 833 | ] 834 | }, 835 | { 836 | "cell_type": "code", 837 | "execution_count": 28, 838 | "metadata": {}, 839 | "outputs": [ 840 | { 841 | "name": "stdout", 842 | "output_type": "stream", 843 | "text": [ 844 | "x = 0.52, y = -1.75, z = 7.49\n", 845 | "x = 1.11, y = -2.25, z = 2.70\n", 846 | "x = 1.47, y = -2.55, z = 0.97\n", 847 | "x = 1.68, y = -2.73, z = 0.35\n", 848 | "x = 1.81, y = -2.84, z = 0.13\n", 849 | "x = 1.89, y = -2.90, z = 0.05\n", 850 | "x = 1.93, y = -2.94, z = 0.02\n", 851 | "x = 1.96, y = -2.97, z = 0.01\n", 852 | "x = 1.98, y = -2.98, z = 0.00\n", 853 | "x = 1.99, y = -2.99, z = 0.00\n" 854 | ] 855 | } 856 | ], 857 | "source": [ 858 | "with tf.Session() as 
session:\n", 859 | " session.run(tf.global_variables_initializer())\n", 860 | "\n", 861 | " for _ in range(10):\n", 862 | " session.run(update_op)\n", 863 | " x_val, y_val, z_val = session.run([x, y, z])\n", 864 | " print('x = {:4.2f}, y = {:4.2f}, z = {:4.2f}'.format(x_val, y_val, z_val))" 865 | ] 866 | }, 867 | { 868 | "cell_type": "markdown", 869 | "metadata": {}, 870 | "source": [ 871 | "# Control flow\n", 872 | "\n", 873 | "TensorFlow implements symbolic operations for executing common control flow structures, like if-statements and while loops.\n", 874 | "\n", 875 | "This is usually where people get confused: despite appearances, there is a *big* difference between a regular control flow statement and its symbolic counterpart. **When working with control flow ops, you should always keep in mind that the code you write _defines the computation graph_, it does not _execute_ that computation.**\n", 876 | "\n", 877 | "## If-statement\n", 878 | "\n", 879 | "To illustrate this, let's look at a regular if-statement:" 880 | ] 881 | }, 882 | { 883 | "cell_type": "code", 884 | "execution_count": 29, 885 | "metadata": {}, 886 | "outputs": [ 887 | { 888 | "name": "stdout", 889 | "output_type": "stream", 890 | "text": [ 891 | "1.0\n" 892 | ] 893 | } 894 | ], 895 | "source": [ 896 | "regular_if_graph = tf.Graph()\n", 897 | "with regular_if_graph.as_default():\n", 898 | " # Define computation graph\n", 899 | " x = 3\n", 900 | " y = tf.placeholder(dtype=tf.float32, shape=[], name='y')\n", 901 | " if x < 4:\n", 902 | " z = y + 1\n", 903 | " else:\n", 904 | " z = y - 1\n", 905 | " \n", 906 | " # Run the computation graph on some input\n", 907 | " with tf.Session() as session:\n", 908 | " print(session.run(z, feed_dict={y: 0}))\n" 909 | ] 910 | }, 911 | { 912 | "cell_type": "markdown", 913 | "metadata": {}, 914 | "source": [ 915 | "This piece of code is fairly simple: depending on the value of `x`, we either add 1 to `y` or subtract 1 from it. 
However, it depends on the fact that *the value of `x` was known when the graph was created*. In fact, because the `else` statement never gets executed, **the expression `y - 1` doesn't even appear in the computation graph**:" 916 | ] 917 | }, 918 | { 919 | "cell_type": "code", 920 | "execution_count": 30, 921 | "metadata": {}, 922 | "outputs": [ 923 | { 924 | "name": "stdout", 925 | "output_type": "stream", 926 | "text": [ 927 | "[, , ]\n" 928 | ] 929 | } 930 | ], 931 | "source": [ 932 | "print(regular_if_graph.get_operations())" 933 | ] 934 | }, 935 | { 936 | "cell_type": "markdown", 937 | "metadata": {}, 938 | "source": [ 939 | "Clearly, this approach does not work when the value of `x` is not known in advance. Try the same code with a placeholder `x` and you will be greeted with an error message:" 940 | ] 941 | }, 942 | { 943 | "cell_type": "code", 944 | "execution_count": 31, 945 | "metadata": {}, 946 | "outputs": [ 947 | { 948 | "name": "stdout", 949 | "output_type": "stream", 950 | "text": [ 951 | "Using a `tf.Tensor` as a Python `bool` is not allowed. 
Use `if t is not None:` instead of `if t:` to test if a tensor is defined, and use TensorFlow ops such as tf.cond to execute subgraphs conditioned on the value of a tensor.\n" 952 | ] 953 | } 954 | ], 955 | "source": [ 956 | "with tf.Graph().as_default():\n", 957 | " x = tf.placeholder(dtype=tf.float32, shape=[], name='x')\n", 958 | " y = tf.placeholder(dtype=tf.float32, shape=[], name='y')\n", 959 | " try:\n", 960 | " if x < 4:\n", 961 | " z = y + 1\n", 962 | " else:\n", 963 | " z = y - 1\n", 964 | " except TypeError as e:\n", 965 | " print(str(e).split('\\n')[0])" 966 | ] 967 | }, 968 | { 969 | "cell_type": "markdown", 970 | "metadata": {}, 971 | "source": [ 972 | "For this use case, TensorFlow implements a function called `tf.cond` which acts as a symbolic counterpart to the if-statement:" 973 | ] 974 | }, 975 | { 976 | "cell_type": "code", 977 | "execution_count": 32, 978 | "metadata": {}, 979 | "outputs": [ 980 | { 981 | "name": "stdout", 982 | "output_type": "stream", 983 | "text": [ 984 | "1.0\n", 985 | "-1.0\n" 986 | ] 987 | } 988 | ], 989 | "source": [ 990 | "symbolic_if_graph = tf.Graph()\n", 991 | "with symbolic_if_graph.as_default():\n", 992 | " # Define computation graph\n", 993 | " x = tf.placeholder(dtype=tf.float32, shape=[], name='x')\n", 994 | " y = tf.placeholder(dtype=tf.float32, shape=[], name='y')\n", 995 | " z = tf.cond(\n", 996 | " pred=x < 4,\n", 997 | " true_fn=lambda: y + 1,\n", 998 | " false_fn=lambda: y - 1)\n", 999 | " \n", 1000 | " # Run the computation graph on some inputs\n", 1001 | " with tf.Session() as session:\n", 1002 | " print(session.run(z, feed_dict={x: 3, y: 0}))\n", 1003 | " print(session.run(z, feed_dict={x: 5, y: 0}))" 1004 | ] 1005 | }, 1006 | { 1007 | "cell_type": "markdown", 1008 | "metadata": {}, 1009 | "source": [ 1010 | "The `tf.cond` function takes a predicate `pred`, a subgraph-creating function `true_fn`, and a subgraph-creating function `false_fn` as input. 
The predicate is a *symbolic* boolean which is used to decide which branch of the conditional is executed. The two graph-creating functions take no argument as input, create a computation subgraph, and return its symbolic output.\n", 1011 | "\n", 1012 | "Looking at the operations defined in the graph above reveals a very different picture:" 1013 | ] 1014 | }, 1015 | { 1016 | "cell_type": "code", 1017 | "execution_count": 33, 1018 | "metadata": {}, 1019 | "outputs": [ 1020 | { 1021 | "name": "stdout", 1022 | "output_type": "stream", 1023 | "text": [ 1024 | "[, , , , , , , , , , , , , , ]\n" 1025 | ] 1026 | } 1027 | ], 1028 | "source": [ 1029 | "print(symbolic_if_graph.get_operations())" 1030 | ] 1031 | }, 1032 | { 1033 | "cell_type": "markdown", 1034 | "metadata": {}, 1035 | "source": [ 1036 | "**The takeaway here is that because TensorFlow separates graph definition from graph execution, we have to adjust our mental model of what happens behind the scenes, even for a seemingly innocuous if-statement.**\n", 1037 | "\n", 1038 | "Now that this warning is out of the way, we can look at some other symbolic control-flow functions TensorFlow implements." 1039 | ] 1040 | }, 1041 | { 1042 | "cell_type": "markdown", 1043 | "metadata": {}, 1044 | "source": [ 1045 | "## Case\n", 1046 | "\n", 1047 | "The `tf.case` function implements a symbolic counterpart to the `case` control flow statement. 
It takes a sequence of (predicate, subgraph-creating function) tuples as input:" 1048 | ] 1049 | }, 1050 | { 1051 | "cell_type": "code", 1052 | "execution_count": 34, 1053 | "metadata": {}, 1054 | "outputs": [ 1055 | { 1056 | "name": "stdout", 1057 | "output_type": "stream", 1058 | "text": [ 1059 | "3.0\n", 1060 | "1.0\n", 1061 | "4.0\n", 1062 | "1.0\n" 1063 | ] 1064 | } 1065 | ], 1066 | "source": [ 1067 | "with tf.Graph().as_default():\n", 1068 | " # Define computation graph\n", 1069 | " x = tf.placeholder(dtype=tf.int32, shape=[], name='x')\n", 1070 | " y = tf.placeholder(dtype=tf.float32, shape=[], name='y')\n", 1071 | " z = tf.case(\n", 1072 | " pred_fn_pairs=[(tf.equal(x, 0), lambda: y + 1),\n", 1073 | " (tf.equal(x, 1), lambda: y - 1),\n", 1074 | " (tf.equal(x, 2), lambda: y * 2)],\n", 1075 | " default=lambda: y / 2)\n", 1076 | " \n", 1077 | " # Run the computation graph on some inputs\n", 1078 | " with tf.Session() as session:\n", 1079 | " print(session.run(z, feed_dict={x: 0, y: 2}))\n", 1080 | " print(session.run(z, feed_dict={x: 1, y: 2}))\n", 1081 | " print(session.run(z, feed_dict={x: 2, y: 2}))\n", 1082 | " print(session.run(z, feed_dict={x: 3, y: 2}))" 1083 | ] 1084 | }, 1085 | { 1086 | "cell_type": "markdown", 1087 | "metadata": {}, 1088 | "source": [ 1089 | "## While loop\n", 1090 | "\n", 1091 | "The `tf.while_loop` function implements a symbolic counterpart to the `while` control flow statement. It takes a `cond` subgraph-creating function, a `body` subgraph-creating function and a `loop_vars` sequence of tensors.\n", 1092 | "\n", 1093 | "You can think of `loop_vars` as the initial state of all tensors which change from one iteration of the loop to the next. The `cond` and `body` functions take a sequence of tensors with the same length as `loop_vars` as input; you can think of them as the current state of the `loop_vars`. The `cond` function returns a symbolic boolean telling whether the loop should be executed or not. 
The `body` function returns a sequence of tensors representing the new values of the `loop_vars`.\n", 1094 | "\n", 1095 | "Here is an example of a sequential implementation of the Fibonacci sequence using `tf.while_loop`:" 1096 | ] 1097 | }, 1098 | { 1099 | "cell_type": "code", 1100 | "execution_count": 35, 1101 | "metadata": {}, 1102 | "outputs": [ 1103 | { 1104 | "name": "stdout", 1105 | "output_type": "stream", 1106 | "text": [ 1107 | "5\n", 1108 | "13\n" 1109 | ] 1110 | } 1111 | ], 1112 | "source": [ 1113 | "with tf.Graph().as_default():\n", 1114 | " # Define computation graph\n", 1115 | " n = tf.placeholder(tf.int32, shape=[], name='n')\n", 1116 | " i = tf.constant(2)\n", 1117 | " a = tf.constant(0)\n", 1118 | " b = tf.constant(1)\n", 1119 | "\n", 1120 | " _, _, nth_fib = tf.while_loop(\n", 1121 | " cond=lambda i, a, b: i < n,\n", 1122 | " body=lambda i, a, b: (i + 1, b, a + b),\n", 1123 | " loop_vars=(i, a, b))\n", 1124 | "\n", 1125 | " # Run the computation graph on some inputs\n", 1126 | " with tf.Session() as session:\n", 1127 | " print(session.run(nth_fib, feed_dict={n: 6}))\n", 1128 | " print(session.run(nth_fib, feed_dict={n: 8}))" 1129 | ] 1130 | }, 1131 | { 1132 | "cell_type": "markdown", 1133 | "metadata": {}, 1134 | "source": [ 1135 | "We define a placeholder tensor `n` representing which element of the Fibonacci sequence to compute. We then instantiate three constant tensors that act as the `loop_vars`: a counter `i` and two values `a` and `b` representing the elements $i - 1$ and $i$ of the Fibonacci sequence, respectively.\n", 1136 | "\n", 1137 | "The `cond` argument passed to `tf.while_loop` indicates that the body should execute as long as $i < n$. 
The `body` argument itself increments `i` by one and updates the value for `a` and `b` so they reflect elements $i$ and $i + 1$ of the Fibonacci sequence, respectively (reminder: $fib(i + 1) = fib(i) + fib(i - 1)$).\n", 1138 | "\n", 1139 | "## Scan\n", 1140 | "\n", 1141 | "Another useful function TensorFlow implements is `tf.scan`, which \"scans\" over the elements of its input and applies some possibly stateful function to them.\n", 1142 | "\n", 1143 | "Here is an example of a cumulative sum implemented using `tf.scan`:" 1144 | ] 1145 | }, 1146 | { 1147 | "cell_type": "code", 1148 | "execution_count": 36, 1149 | "metadata": {}, 1150 | "outputs": [ 1151 | { 1152 | "name": "stdout", 1153 | "output_type": "stream", 1154 | "text": [ 1155 | "[ 1 3 6 10]\n" 1156 | ] 1157 | } 1158 | ], 1159 | "source": [ 1160 | "with tf.Graph().as_default():\n", 1161 | " # Define computation graph\n", 1162 | " x = tf.placeholder(tf.int32, shape=[None], name='x')\n", 1163 | "\n", 1164 | " c = tf.scan(\n", 1165 | " fn=lambda a, x_t: a + x_t,\n", 1166 | " elems=x,\n", 1167 | " initializer=tf.constant(0))\n", 1168 | " \n", 1169 | " # Run the computation graph on some input\n", 1170 | " with tf.Session() as session:\n", 1171 | " print(session.run(c, feed_dict={x: [1, 2, 3, 4]}))" 1172 | ] 1173 | }, 1174 | { 1175 | "cell_type": "markdown", 1176 | "metadata": {}, 1177 | "source": [ 1178 | "We pass `tf.scan` three arguments as inputs: a subgraph-creating function `fn`, a tensor (or sequence of tensors) `elems` to loop over, and a tensor (or list of tensors) `initializer` of initial accumulator values.\n", 1179 | "\n", 1180 | "The scan function loops over the first axis of `elems` (or, if it's a sequence, over the first axes of every element in the sequence). The `fn` function takes the accumulator value and the current loop element as input, and returns a new value for the accumulator.\n", 1181 | "\n", 1182 | "The `tf.scan` function itself returns a sequence of accumulated values." 
1183 | ] 1184 | }, 1185 | { 1186 | "cell_type": "markdown", 1187 | "metadata": {}, 1188 | "source": [ 1189 | "# Scaling up to large computation graphs\n", 1190 | "\n", 1191 | "We have now covered the bare minimum that would allow you to do machine learning with TensorFlow. We have not covered _every_ TensorFlow op, but you now possess the knowledge required to browse through the [TensorFlow Python API](https://www.tensorflow.org/api_docs/python/) and find what you need.\n", 1192 | "\n", 1193 | "We will now concentrate on ways to scale what you learned to actual machine learning problems without increasing the maintenance complexity too much.\n", 1194 | "\n", 1195 | "## Variable and name scopes\n", 1196 | "\n", 1197 | "TensorFlow uses a soft convention for op and variable names: an op or variable that is part of a hierarchy should have a name that conveys its location in the hierarchy, with the `'/'` character being used to separate different levels in the hierarchy. For instance, a good name for the bias vector of the second layer of the model would be `'model/layer2/b'`.\n", 1198 | "\n", 1199 | "In order to reduce code duplication and facilitate maintenance, TensorFlow provides two context managers, named `tf.name_scope` and `tf.variable_scope`, inside which variables and ops that are created see their name prepended with the name of the enclosing scope. 
The difference between the two is that `tf.variable_scope` operates on _all_ names, whereas `tf.name_scope` operates on all _but_ variable names:" 1200 | ] 1201 | }, 1202 | { 1203 | "cell_type": "code", 1204 | "execution_count": 37, 1205 | "metadata": {}, 1206 | "outputs": [ 1207 | { 1208 | "name": "stdout", 1209 | "output_type": "stream", 1210 | "text": [ 1211 | "foo/bar/a:0\n", 1212 | "foo/bar/b:0\n", 1213 | "a:0\n", 1214 | "machine/learning/b:0\n" 1215 | ] 1216 | } 1217 | ], 1218 | "source": [ 1219 | "# Variable scopes operate on all tensors\n", 1220 | "with tf.variable_scope('foo'):\n", 1221 | " # Scopes can be nested\n", 1222 | " with tf.variable_scope('bar'):\n", 1223 | " print(tf.get_variable('a', shape=[]).name)\n", 1224 | " print(tf.constant(0.0, name='b').name)\n", 1225 | "# Name scopes do not operate on variables\n", 1226 | "with tf.name_scope('machine'):\n", 1227 | " with tf.name_scope('learning'):\n", 1228 | " print(tf.get_variable('a', shape=[]).name)\n", 1229 | " print(tf.constant(0.0, name='b').name)" 1230 | ] 1231 | }, 1232 | { 1233 | "cell_type": "markdown", 1234 | "metadata": {}, 1235 | "source": [ 1236 | "## Device placement\n", 1237 | "\n", 1238 | "In this tutorial we have not bothered with the specific placement (CPU or GPU) of our ops, i.e., the device on which they are executed, mostly because the examples we considered were so small in scale that GPU acceleration makes little to no sense.\n", 1239 | "\n", 1240 | "However, in general, we would like most of our large operations (such as matrix-matrix multiplications) to take place on the GPU. Fortunately for us, TensorFlow already handles device placement for us behind the scenes. The short story is that TensorFlow will try to place all possible ops on all available GPUs, which is a good default to have.\n", 1241 | "\n", 1242 | "Because we are sharing workstations with multiple GPUs, this means we need to be careful in allowing TensorFlow to see only the GPUs we want to use. 
This is achieved by setting the `CUDA_VISIBLE_DEVICES` environment variable. For instance, to allow TensorFlow to see GPUs 0 and 2, set it to\n", 1243 | "\n", 1244 | "```bash\n", 1245 | "CUDA_VISIBLE_DEVICES=0,2\n", 1246 | "```\n", 1247 | "\n", 1248 | "To allow TensorFlow to see GPU 0 only, set it to\n", 1249 | "\n", 1250 | "```bash\n", 1251 | "CUDA_VISIBLE_DEVICES=0\n", 1252 | "```\n", 1253 | "\n", 1254 | "To prevent it from seeing any GPU, set it to\n", 1255 | "\n", 1256 | "```bash\n", 1257 | "CUDA_VISIBLE_DEVICES=\"\"\n", 1258 | "```\n", 1259 | "\n", 1260 | "In addition, there may be situations in which we want fine-grained control over device placement. For instance, we may want on-the-fly input pre-processing to be computed on the CPU and reserve the GPU for inference. In that case, use the `tf.device` context manager:\n", 1261 | "\n", 1262 | "```python\n", 1263 | "# All ops created within this context are forced to be placed on CPU\n", 1264 | "with tf.device('/cpu:0'):\n", 1265 | " x = # some tensor\n", 1266 | " preprocessed_x = # some pre-processing on x\n", 1267 | "\n", 1268 | "# All ops created within this context are forced to be placed on GPU\n", 1269 | "with tf.device('/gpu:0'):\n", 1270 | " y = # some mapping from x to y\n", 1271 | "```\n", 1272 | "\n", 1273 | "You can find more information on manual device placement in the [TensorFlow documentation](https://www.tensorflow.org/tutorials/using_gpu#manual_device_placement)." 1274 | ] 1275 | }, 1276 | { 1277 | "cell_type": "markdown", 1278 | "metadata": {}, 1279 | "source": [ 1280 | "# Advanced topics\n", 1281 | "\n", 1282 | "This tutorial does not cover all topics in the [TensorFlow programmer's guide](https://www.tensorflow.org/programmers_guide/). 
The guide as a whole is a great follow-up read; in particular, you will find the following sections useful:\n", 1283 | "\n", 1284 | "* [Data management](https://www.tensorflow.org/programmers_guide/datasets)\n", 1285 | "* [Estimators](https://www.tensorflow.org/programmers_guide/estimators)\n", 1286 | "* [Eager mode](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager)\n", 1287 | "\n", 1288 | "You may also want to have a look at the [performance guide](https://www.tensorflow.org/performance/performance_guide), as well as the tutorial series on TensorBoard:\n", 1289 | "\n", 1290 | "* [Visualizing learning](https://www.tensorflow.org/get_started/summaries_and_tensorboard)\n", 1291 | "* [Graph visualization](https://www.tensorflow.org/get_started/graph_viz)\n", 1292 | "* [Histogram dashboard](https://www.tensorflow.org/get_started/tensorboard_histograms)" 1293 | ] 1294 | } 1295 | ], 1296 | "metadata": { 1297 | "kernelspec": { 1298 | "display_name": "Python 3", 1299 | "language": "python", 1300 | "name": "python3" 1301 | }, 1302 | "language_info": { 1303 | "codemirror_mode": { 1304 | "name": "ipython", 1305 | "version": 3 1306 | }, 1307 | "file_extension": ".py", 1308 | "mimetype": "text/x-python", 1309 | "name": "python", 1310 | "nbconvert_exporter": "python", 1311 | "pygments_lexer": "ipython3", 1312 | "version": "3.6.3" 1313 | } 1314 | }, 1315 | "nbformat": 4, 1316 | "nbformat_minor": 2 1317 | } 1318 | --------------------------------------------------------------------------------