├── README.md ├── LICENSE ├── .gitignore ├── 3_regularization.ipynb ├── 4_convolutions.ipynb ├── 5_word2vec.ipynb ├── 6_lstm.ipynb └── 2_fullyconnected.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # deep-learning-udacity 2 | Awesome solutions to [Udacity's Deep Learning course: ud70](https://www.udacity.com/course/deep-learning--ud730) 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Bas Nijholt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /3_regularization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "kR-4eNdK6lYS" 8 | }, 9 | "source": [ 10 | "Deep Learning\n", 11 | "=============\n", 12 | "\n", 13 | "Assignment 3\n", 14 | "------------\n", 15 | "\n", 16 | "Previously in `2_fullyconnected.ipynb`, you trained a logistic regression and a neural network model.\n", 17 | "\n", 18 | "The goal of this assignment is to explore regularization techniques." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "cellView": "both", 26 | "colab": { 27 | "autoexec": { 28 | "startup": false, 29 | "wait_interval": 0 30 | } 31 | }, 32 | "colab_type": "code", 33 | "collapsed": false, 34 | "id": "JLpLa8Jt7Vu4" 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "# These are all the modules we'll be using later. Make sure you can import them\n", 39 | "# before proceeding further.\n", 40 | "from __future__ import print_function\n", 41 | "import numpy as np\n", 42 | "import tensorflow as tf\n", 43 | "from six.moves import cPickle as pickle" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": { 49 | "colab_type": "text", 50 | "id": "1HrCK6e17WzV" 51 | }, 52 | "source": [ 53 | "First reload the data we generated in _notmist.ipynb_." 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": { 60 | "cellView": "both", 61 | "colab": { 62 | "autoexec": { 63 | "startup": false, 64 | "wait_interval": 0 65 | }, 66 | "output_extras": [ 67 | { 68 | "item_id": 1 69 | } 70 | ] 71 | }, 72 | "colab_type": "code", 73 | "collapsed": false, 74 | "executionInfo": { 75 | "elapsed": 11777, 76 | "status": "ok", 77 | "timestamp": 1449849322348, 78 | "user": { 79 | "color": "", 80 | "displayName": "", 81 | "isAnonymous": false, 82 | "isMe": true, 83 | "permissionId": "", 84 | "photoUrl": "", 85 | "sessionId": "0", 86 | "userId": "" 87 | }, 88 | "user_tz": 480 89 | }, 90 | "id": "y3-cj1bpmuxc", 91 | "outputId": "e03576f1-ebbe-4838-c388-f1777bcc9873" 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "pickle_file = 'notMNIST.pickle'\n", 96 | "\n", 97 | "with open(pickle_file, 'rb') as f:\n", 98 | " save = pickle.load(f)\n", 99 | " train_dataset = save['train_dataset']\n", 100 | " train_labels = save['train_labels']\n", 101 | " valid_dataset = save['valid_dataset']\n", 102 | " valid_labels = save['valid_labels']\n", 103 | " test_dataset = save['test_dataset']\n", 104 | " test_labels = save['test_labels']\n", 105 | " del save # hint to help gc free up memory\n", 106 | " print('Training set', train_dataset.shape, train_labels.shape)\n", 107 | " print('Validation set', valid_dataset.shape, valid_labels.shape)\n", 108 | " print('Test set', test_dataset.shape, test_labels.shape)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": { 114 | "colab_type": "text", 115 | "id": "L7aHrm6nGDMB" 116 | }, 117 | "source": [ 118 | "Reformat into a shape that's more adapted to the models we're going to train:\n", 119 | "- data as a flat matrix,\n", 120 | "- labels as float 1-hot encodings." 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": { 127 | "cellView": "both", 128 | "colab": { 129 | "autoexec": { 130 | "startup": false, 131 | "wait_interval": 0 132 | }, 133 | "output_extras": [ 134 | { 135 | "item_id": 1 136 | } 137 | ] 138 | }, 139 | "colab_type": "code", 140 | "collapsed": false, 141 | "executionInfo": { 142 | "elapsed": 11728, 143 | "status": "ok", 144 | "timestamp": 1449849322356, 145 | "user": { 146 | "color": "", 147 | "displayName": "", 148 | "isAnonymous": false, 149 | "isMe": true, 150 | "permissionId": "", 151 | "photoUrl": "", 152 | "sessionId": "0", 153 | "userId": "" 154 | }, 155 | "user_tz": 480 156 | }, 157 | "id": "IRSyYiIIGIzS", 158 | "outputId": "3f8996ee-3574-4f44-c953-5c8a04636582" 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "image_size = 28\n", 163 | "num_labels = 10\n", 164 | "\n", 165 | "def reformat(dataset, labels):\n", 166 | " dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)\n", 167 | " # Map 2 to [0.0, 1.0, 0.0 ...], 3 to [0.0, 0.0, 1.0 ...]\n", 168 | " labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)\n", 169 | " return dataset, labels\n", 170 | "train_dataset, train_labels = reformat(train_dataset, train_labels)\n", 171 | "valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)\n", 172 | "test_dataset, test_labels = reformat(test_dataset, test_labels)\n", 173 | "print('Training set', train_dataset.shape, train_labels.shape)\n", 174 | "print('Validation set', valid_dataset.shape, valid_labels.shape)\n", 175 | "print('Test set', test_dataset.shape, test_labels.shape)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": { 182 | "cellView": "both", 183 | "colab": { 184 | "autoexec": { 185 | "startup": false, 186 | "wait_interval": 0 187 | } 188 | }, 189 | "colab_type": "code", 190 | "collapsed": false, 191 | "id": "RajPLaL_ZW6w" 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "def accuracy(predictions, labels):\n", 196 | " return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))\n", 197 | " / predictions.shape[0])" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": { 203 | "colab_type": "text", 204 | "id": "sgLbUAQ1CW-1" 205 | }, 206 | "source": [ 207 | "---\n", 208 | "Problem 1\n", 209 | "---------\n", 210 | "\n", 211 | "Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor `t` using `nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.\n", 212 | "\n", 213 | "---" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": { 219 | "colab_type": "text", 220 | "id": "na8xX2yHZzNF" 221 | }, 222 | "source": [ 223 | "---\n", 224 | "Problem 2\n", 225 | "---------\n", 226 | "Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?\n", 227 | "\n", 228 | "---" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": { 234 | "colab_type": "text", 235 | "id": "ww3SCBUdlkRc" 236 | }, 237 | "source": [ 238 | "---\n", 239 | "Problem 3\n", 240 | "---------\n", 241 | "Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides `nn.dropout()` for that, but you have to make sure it's only inserted during training.\n", 242 | "\n", 243 | "What happens to our extreme overfitting case?\n", 244 | "\n", 245 | "---" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": { 251 | "colab_type": "text", 252 | "id": "-b1hTz3VWZjw" 253 | }, 254 | "source": [ 255 | "---\n", 256 | "Problem 4\n", 257 | "---------\n", 258 | "\n", 259 | "Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is [97.1%](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html?showComment=1391023266211#c8758720086795711595).\n", 260 | "\n", 261 | "One avenue you can explore is to add multiple layers.\n", 262 | "\n", 263 | "Another one is to use learning rate decay:\n", 264 | "\n", 265 | " global_step = tf.Variable(0) # count the number of steps taken.\n", 266 | " learning_rate = tf.train.exponential_decay(0.5, global_step, ...)\n", 267 | " optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)\n", 268 | " \n", 269 | " ---\n" 270 | ] 271 | } 272 | ], 273 | "metadata": { 274 | "colab": { 275 | "default_view": {}, 276 | "name": "3_regularization.ipynb", 277 | "provenance": [], 278 | "version": "0.3.2", 279 | "views": {} 280 | }, 281 | "kernelspec": { 282 | "display_name": "Python 3", 283 | "language": "python", 284 | "name": "python3" 285 | }, 286 | "language_info": { 287 | "codemirror_mode": { 288 | "name": "ipython", 289 | "version": 3 290 | }, 291 | "file_extension": ".py", 292 | "mimetype": "text/x-python", 293 | "name": "python", 294 | "nbconvert_exporter": "python", 295 | "pygments_lexer": "ipython3", 296 | "version": "3.5.1" 297 | } 298 | }, 299 | "nbformat": 4, 300 | "nbformat_minor": 0 301 | } 302 | -------------------------------------------------------------------------------- /4_convolutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "4embtkV0pNxM" 8 | }, 9 | "source": [ 10 | "Deep Learning\n", 11 | "=============\n", 12 | "\n", 13 | "Assignment 4\n", 14 | "------------\n", 15 | "\n", 16 | "Previously in `2_fullyconnected.ipynb` and `3_regularization.ipynb`, we trained fully connected networks to classify [notMNIST](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html) characters.\n", 17 | "\n", 18 | "The goal of this assignment is make the neural network convolutional." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "cellView": "both", 26 | "colab": { 27 | "autoexec": { 28 | "startup": false, 29 | "wait_interval": 0 30 | } 31 | }, 32 | "colab_type": "code", 33 | "collapsed": false, 34 | "id": "tm2CQN_Cpwj0" 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "# These are all the modules we'll be using later. Make sure you can import them\n", 39 | "# before proceeding further.\n", 40 | "from __future__ import print_function\n", 41 | "import numpy as np\n", 42 | "import tensorflow as tf\n", 43 | "from six.moves import cPickle as pickle\n", 44 | "from six.moves import range" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "cellView": "both", 52 | "colab": { 53 | "autoexec": { 54 | "startup": false, 55 | "wait_interval": 0 56 | }, 57 | "output_extras": [ 58 | { 59 | "item_id": 1 60 | } 61 | ] 62 | }, 63 | "colab_type": "code", 64 | "collapsed": false, 65 | "executionInfo": { 66 | "elapsed": 11948, 67 | "status": "ok", 68 | "timestamp": 1446658914837, 69 | "user": { 70 | "color": "", 71 | "displayName": "", 72 | "isAnonymous": false, 73 | "isMe": true, 74 | "permissionId": "", 75 | "photoUrl": "", 76 | "sessionId": "0", 77 | "userId": "" 78 | }, 79 | "user_tz": 480 80 | }, 81 | "id": "y3-cj1bpmuxc", 82 | "outputId": "016b1a51-0290-4b08-efdb-8c95ffc3cd01" 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "pickle_file = 'notMNIST.pickle'\n", 87 | "\n", 88 | "with open(pickle_file, 'rb') as f:\n", 89 | " save = pickle.load(f)\n", 90 | " train_dataset = save['train_dataset']\n", 91 | " train_labels = save['train_labels']\n", 92 | " valid_dataset = save['valid_dataset']\n", 93 | " valid_labels = save['valid_labels']\n", 94 | " test_dataset = save['test_dataset']\n", 95 | " test_labels = save['test_labels']\n", 96 | " del save # hint to help gc free up memory\n", 97 | " print('Training set', train_dataset.shape, train_labels.shape)\n", 98 | " print('Validation set', valid_dataset.shape, valid_labels.shape)\n", 99 | " print('Test set', test_dataset.shape, test_labels.shape)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": { 105 | "colab_type": "text", 106 | "id": "L7aHrm6nGDMB" 107 | }, 108 | "source": [ 109 | "Reformat into a TensorFlow-friendly shape:\n", 110 | "- convolutions need the image data formatted as a cube (width by height by #channels)\n", 111 | "- labels as float 1-hot encodings." 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "cellView": "both", 119 | "colab": { 120 | "autoexec": { 121 | "startup": false, 122 | "wait_interval": 0 123 | }, 124 | "output_extras": [ 125 | { 126 | "item_id": 1 127 | } 128 | ] 129 | }, 130 | "colab_type": "code", 131 | "collapsed": false, 132 | "executionInfo": { 133 | "elapsed": 11952, 134 | "status": "ok", 135 | "timestamp": 1446658914857, 136 | "user": { 137 | "color": "", 138 | "displayName": "", 139 | "isAnonymous": false, 140 | "isMe": true, 141 | "permissionId": "", 142 | "photoUrl": "", 143 | "sessionId": "0", 144 | "userId": "" 145 | }, 146 | "user_tz": 480 147 | }, 148 | "id": "IRSyYiIIGIzS", 149 | "outputId": "650a208c-8359-4852-f4f5-8bf10e80ef6c" 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "image_size = 28\n", 154 | "num_labels = 10\n", 155 | "num_channels = 1 # grayscale\n", 156 | "\n", 157 | "import numpy as np\n", 158 | "\n", 159 | "def reformat(dataset, labels):\n", 160 | " dataset = dataset.reshape(\n", 161 | " (-1, image_size, image_size, num_channels)).astype(np.float32)\n", 162 | " labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)\n", 163 | " return dataset, labels\n", 164 | "train_dataset, train_labels = reformat(train_dataset, train_labels)\n", 165 | "valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)\n", 166 | "test_dataset, test_labels = reformat(test_dataset, test_labels)\n", 167 | "print('Training set', train_dataset.shape, train_labels.shape)\n", 168 | "print('Validation set', valid_dataset.shape, valid_labels.shape)\n", 169 | "print('Test set', test_dataset.shape, test_labels.shape)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": { 176 | "cellView": "both", 177 | "colab": { 178 | "autoexec": { 179 | "startup": false, 180 | "wait_interval": 0 181 | } 182 | }, 183 | "colab_type": "code", 184 | "collapsed": false, 185 | "id": "AgQDIREv02p1" 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "def accuracy(predictions, labels):\n", 190 | " return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))\n", 191 | " / predictions.shape[0])" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": { 197 | "colab_type": "text", 198 | "id": "5rhgjmROXu2O" 199 | }, 200 | "source": [ 201 | "Let's build a small network with two convolutional layers, followed by one fully connected layer. Convolutional networks are more expensive computationally, so we'll limit its depth and number of fully connected nodes." 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": { 208 | "cellView": "both", 209 | "colab": { 210 | "autoexec": { 211 | "startup": false, 212 | "wait_interval": 0 213 | } 214 | }, 215 | "colab_type": "code", 216 | "collapsed": false, 217 | "id": "IZYv70SvvOan" 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "batch_size = 16\n", 222 | "patch_size = 5\n", 223 | "depth = 16\n", 224 | "num_hidden = 64\n", 225 | "\n", 226 | "graph = tf.Graph()\n", 227 | "\n", 228 | "with graph.as_default():\n", 229 | "\n", 230 | " # Input data.\n", 231 | " tf_train_dataset = tf.placeholder(\n", 232 | " tf.float32, shape=(batch_size, image_size, image_size, num_channels))\n", 233 | " tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))\n", 234 | " tf_valid_dataset = tf.constant(valid_dataset)\n", 235 | " tf_test_dataset = tf.constant(test_dataset)\n", 236 | " \n", 237 | " # Variables.\n", 238 | " layer1_weights = tf.Variable(tf.truncated_normal(\n", 239 | " [patch_size, patch_size, num_channels, depth], stddev=0.1))\n", 240 | " layer1_biases = tf.Variable(tf.zeros([depth]))\n", 241 | " layer2_weights = tf.Variable(tf.truncated_normal(\n", 242 | " [patch_size, patch_size, depth, depth], stddev=0.1))\n", 243 | " layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))\n", 244 | " layer3_weights = tf.Variable(tf.truncated_normal(\n", 245 | " [image_size // 4 * image_size // 4 * depth, num_hidden], stddev=0.1))\n", 246 | " layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))\n", 247 | " layer4_weights = tf.Variable(tf.truncated_normal(\n", 248 | " [num_hidden, num_labels], stddev=0.1))\n", 249 | " layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))\n", 250 | " \n", 251 | " # Model.\n", 252 | " def model(data):\n", 253 | " conv = tf.nn.conv2d(data, layer1_weights, [1, 2, 2, 1], padding='SAME')\n", 254 | " hidden = tf.nn.relu(conv + layer1_biases)\n", 255 | " conv = tf.nn.conv2d(hidden, layer2_weights, [1, 2, 2, 1], padding='SAME')\n", 256 | " hidden = tf.nn.relu(conv + layer2_biases)\n", 257 | " shape = hidden.get_shape().as_list()\n", 258 | " reshape = tf.reshape(hidden, [shape[0], shape[1] * shape[2] * shape[3]])\n", 259 | " hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)\n", 260 | " return tf.matmul(hidden, layer4_weights) + layer4_biases\n", 261 | " \n", 262 | " # Training computation.\n", 263 | " logits = model(tf_train_dataset)\n", 264 | " loss = tf.reduce_mean(\n", 265 | " tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))\n", 266 | " \n", 267 | " # Optimizer.\n", 268 | " optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)\n", 269 | " \n", 270 | " # Predictions for the training, validation, and test data.\n", 271 | " train_prediction = tf.nn.softmax(logits)\n", 272 | " valid_prediction = tf.nn.softmax(model(tf_valid_dataset))\n", 273 | " test_prediction = tf.nn.softmax(model(tf_test_dataset))" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": { 280 | "cellView": "both", 281 | "colab": { 282 | "autoexec": { 283 | "startup": false, 284 | "wait_interval": 0 285 | }, 286 | "output_extras": [ 287 | { 288 | "item_id": 37 289 | } 290 | ] 291 | }, 292 | "colab_type": "code", 293 | "collapsed": false, 294 | "executionInfo": { 295 | "elapsed": 63292, 296 | "status": "ok", 297 | "timestamp": 1446658966251, 298 | "user": { 299 | "color": "", 300 | "displayName": "", 301 | "isAnonymous": false, 302 | "isMe": true, 303 | "permissionId": "", 304 | "photoUrl": "", 305 | "sessionId": "0", 306 | "userId": "" 307 | }, 308 | "user_tz": 480 309 | }, 310 | "id": "noKFb2UovVFR", 311 | "outputId": "28941338-2ef9-4088-8bd1-44295661e628" 312 | }, 313 | "outputs": [], 314 | "source": [ 315 | "num_steps = 1001\n", 316 | "\n", 317 | "with tf.Session(graph=graph) as session:\n", 318 | " tf.initialize_all_variables().run()\n", 319 | " print('Initialized')\n", 320 | " for step in range(num_steps):\n", 321 | " offset = (step * batch_size) % (train_labels.shape[0] - batch_size)\n", 322 | " batch_data = train_dataset[offset:(offset + batch_size), :, :, :]\n", 323 | " batch_labels = train_labels[offset:(offset + batch_size), :]\n", 324 | " feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}\n", 325 | " _, l, predictions = session.run(\n", 326 | " [optimizer, loss, train_prediction], feed_dict=feed_dict)\n", 327 | " if (step % 50 == 0):\n", 328 | " print('Minibatch loss at step %d: %f' % (step, l))\n", 329 | " print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))\n", 330 | " print('Validation accuracy: %.1f%%' % accuracy(\n", 331 | " valid_prediction.eval(), valid_labels))\n", 332 | " print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": { 338 | "colab_type": "text", 339 | "id": "KedKkn4EutIK" 340 | }, 341 | "source": [ 342 | "---\n", 343 | "Problem 1\n", 344 | "---------\n", 345 | "\n", 346 | "The convolutional model above uses convolutions with stride 2 to reduce the dimensionality. Replace the strides by a max pooling operation (`nn.max_pool()`) of stride 2 and kernel size 2.\n", 347 | "\n", 348 | "---" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": { 354 | "colab_type": "text", 355 | "id": "klf21gpbAgb-" 356 | }, 357 | "source": [ 358 | "---\n", 359 | "Problem 2\n", 360 | "---------\n", 361 | "\n", 362 | "Try to get the best performance you can using a convolutional net. Look for example at the classic [LeNet5](http://yann.lecun.com/exdb/lenet/) architecture, adding Dropout, and/or adding learning rate decay.\n", 363 | "\n", 364 | "---" 365 | ] 366 | } 367 | ], 368 | "metadata": { 369 | "colab": { 370 | "default_view": {}, 371 | "name": "4_convolutions.ipynb", 372 | "provenance": [], 373 | "version": "0.3.2", 374 | "views": {} 375 | }, 376 | "kernelspec": { 377 | "display_name": "Python 3", 378 | "language": "python", 379 | "name": "python3" 380 | }, 381 | "language_info": { 382 | "codemirror_mode": { 383 | "name": "ipython", 384 | "version": 3 385 | }, 386 | "file_extension": ".py", 387 | "mimetype": "text/x-python", 388 | "name": "python", 389 | "nbconvert_exporter": "python", 390 | "pygments_lexer": "ipython3", 391 | "version": "3.5.1" 392 | } 393 | }, 394 | "nbformat": 4, 395 | "nbformat_minor": 0 396 | } 397 | -------------------------------------------------------------------------------- /5_word2vec.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "D7tqLMoKF6uq" 8 | }, 9 | "source": [ 10 | "Deep Learning\n", 11 | "=============\n", 12 | "\n", 13 | "Assignment 5\n", 14 | "------------\n", 15 | "\n", 16 | "The goal of this assignment is to train a Word2Vec skip-gram model over [Text8](http://mattmahoney.net/dc/textdata) data." 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": { 23 | "cellView": "both", 24 | "colab": { 25 | "autoexec": { 26 | "startup": false, 27 | "wait_interval": 0 28 | } 29 | }, 30 | "colab_type": "code", 31 | "collapsed": false, 32 | "id": "0K1ZyLn04QZf" 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "# These are all the modules we'll be using later. Make sure you can import them\n", 37 | "# before proceeding further.\n", 38 | "%matplotlib inline\n", 39 | "from __future__ import print_function\n", 40 | "import collections\n", 41 | "import math\n", 42 | "import numpy as np\n", 43 | "import os\n", 44 | "import random\n", 45 | "import tensorflow as tf\n", 46 | "import zipfile\n", 47 | "from matplotlib import pylab\n", 48 | "from six.moves import range\n", 49 | "from six.moves.urllib.request import urlretrieve\n", 50 | "from sklearn.manifold import TSNE" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": { 56 | "colab_type": "text", 57 | "id": "aCjPJE944bkV" 58 | }, 59 | "source": [ 60 | "Download the data from the source website if necessary." 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": { 67 | "cellView": "both", 68 | "colab": { 69 | "autoexec": { 70 | "startup": false, 71 | "wait_interval": 0 72 | }, 73 | "output_extras": [ 74 | { 75 | "item_id": 1 76 | } 77 | ] 78 | }, 79 | "colab_type": "code", 80 | "collapsed": false, 81 | "executionInfo": { 82 | "elapsed": 14640, 83 | "status": "ok", 84 | "timestamp": 1445964482948, 85 | "user": { 86 | "color": "#1FA15D", 87 | "displayName": "Vincent Vanhoucke", 88 | "isAnonymous": false, 89 | "isMe": true, 90 | "permissionId": "05076109866853157986", 91 | "photoUrl": "//lh6.googleusercontent.com/-cCJa7dTDcgQ/AAAAAAAAAAI/AAAAAAAACgw/r2EZ_8oYer4/s50-c-k-no/photo.jpg", 92 | "sessionId": "2f1ffade4c9f20de", 93 | "userId": "102167687554210253930" 94 | }, 95 | "user_tz": 420 96 | }, 97 | "id": "RJ-o3UBUFtCw", 98 | "outputId": "c4ec222c-80b5-4298-e635-93ca9f79c3b7" 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "url = 'http://mattmahoney.net/dc/'\n", 103 | "\n", 104 | "def maybe_download(filename, expected_bytes):\n", 105 | " \"\"\"Download a file if not present, and make sure it's the right size.\"\"\"\n", 106 | " if not os.path.exists(filename):\n", 107 | " filename, _ = urlretrieve(url + filename, filename)\n", 108 | " statinfo = os.stat(filename)\n", 109 | " if statinfo.st_size == expected_bytes:\n", 110 | " print('Found and verified %s' % filename)\n", 111 | " else:\n", 112 | " print(statinfo.st_size)\n", 113 | " raise Exception(\n", 114 | " 'Failed to verify ' + filename + '. Can you get to it with a browser?')\n", 115 | " return filename\n", 116 | "\n", 117 | "filename = maybe_download('text8.zip', 31344016)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": { 123 | "colab_type": "text", 124 | "id": "Zqz3XiqI4mZT" 125 | }, 126 | "source": [ 127 | "Read the data into a string." 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": { 134 | "cellView": "both", 135 | "colab": { 136 | "autoexec": { 137 | "startup": false, 138 | "wait_interval": 0 139 | }, 140 | "output_extras": [ 141 | { 142 | "item_id": 1 143 | } 144 | ] 145 | }, 146 | "colab_type": "code", 147 | "collapsed": false, 148 | "executionInfo": { 149 | "elapsed": 28844, 150 | "status": "ok", 151 | "timestamp": 1445964497165, 152 | "user": { 153 | "color": "#1FA15D", 154 | "displayName": "Vincent Vanhoucke", 155 | "isAnonymous": false, 156 | "isMe": true, 157 | "permissionId": "05076109866853157986", 158 | "photoUrl": "//lh6.googleusercontent.com/-cCJa7dTDcgQ/AAAAAAAAAAI/AAAAAAAACgw/r2EZ_8oYer4/s50-c-k-no/photo.jpg", 159 | "sessionId": "2f1ffade4c9f20de", 160 | "userId": "102167687554210253930" 161 | }, 162 | "user_tz": 420 163 | }, 164 | "id": "Mvf09fjugFU_", 165 | "outputId": "e3a928b4-1645-4fe8-be17-fcf47de5716d" 166 | }, 167 | "outputs": [], 168 | "source": [ 169 | "def read_data(filename):\n", 170 | " \"\"\"Extract the first file enclosed in a zip file as a list of words\"\"\"\n", 171 | " with zipfile.ZipFile(filename) as f:\n", 172 | " data = tf.compat.as_str(f.read(f.namelist()[0])).split()\n", 173 | " return data\n", 174 | " \n", 175 | "words = read_data(filename)\n", 176 | "print('Data size %d' % len(words))" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": { 182 | "colab_type": "text", 183 | "id": "Zdw6i4F8glpp" 184 | }, 185 | "source": [ 186 | "Build the dictionary and replace rare words with UNK token." 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": { 193 | "cellView": "both", 194 | "colab": { 195 | "autoexec": { 196 | "startup": false, 197 | "wait_interval": 0 198 | }, 199 | "output_extras": [ 200 | { 201 | "item_id": 1 202 | } 203 | ] 204 | }, 205 | "colab_type": "code", 206 | "collapsed": false, 207 | "executionInfo": { 208 | "elapsed": 28849, 209 | "status": "ok", 210 | "timestamp": 1445964497178, 211 | "user": { 212 | "color": "#1FA15D", 213 | "displayName": "Vincent Vanhoucke", 214 | "isAnonymous": false, 215 | "isMe": true, 216 | "permissionId": "05076109866853157986", 217 | "photoUrl": "//lh6.googleusercontent.com/-cCJa7dTDcgQ/AAAAAAAAAAI/AAAAAAAACgw/r2EZ_8oYer4/s50-c-k-no/photo.jpg", 218 | "sessionId": "2f1ffade4c9f20de", 219 | "userId": "102167687554210253930" 220 | }, 221 | "user_tz": 420 222 | }, 223 | "id": "gAL1EECXeZsD", 224 | "outputId": "3fb4ecd1-df67-44b6-a2dc-2291730970b2" 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "vocabulary_size = 50000\n", 229 | "\n", 230 | "def build_dataset(words):\n", 231 | " count = [['UNK', -1]]\n", 232 | " count.extend(collections.Counter(words).most_common(vocabulary_size - 1))\n", 233 | " dictionary = dict()\n", 234 | " for word, _ in count:\n", 235 | " dictionary[word] = len(dictionary)\n", 236 | " data = list()\n", 237 | " unk_count = 0\n", 238 | " for word in words:\n", 239 | " if word in dictionary:\n", 240 | " index = dictionary[word]\n", 241 | " else:\n", 242 | " index = 0 # dictionary['UNK']\n", 243 | " unk_count = unk_count + 1\n", 244 | " data.append(index)\n", 245 | " count[0][1] = unk_count\n", 246 | " reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) \n", 247 | " return data, count, dictionary, reverse_dictionary\n", 248 | "\n", 249 | "data, count, dictionary, reverse_dictionary = build_dataset(words)\n", 250 | "print('Most common words (+UNK)', count[:5])\n", 251 | "print('Sample data', data[:10])\n", 252 | "del words # Hint to reduce memory." 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": { 258 | "colab_type": "text", 259 | "id": "lFwoyygOmWsL" 260 | }, 261 | "source": [ 262 | "Function to generate a training batch for the skip-gram model." 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": { 269 | "cellView": "both", 270 | "colab": { 271 | "autoexec": { 272 | "startup": false, 273 | "wait_interval": 0 274 | }, 275 | "output_extras": [ 276 | { 277 | "item_id": 1 278 | } 279 | ] 280 | }, 281 | "colab_type": "code", 282 | "collapsed": false, 283 | "executionInfo": { 284 | "elapsed": 113, 285 | "status": "ok", 286 | "timestamp": 1445964901989, 287 | "user": { 288 | "color": "#1FA15D", 289 | "displayName": "Vincent Vanhoucke", 290 | "isAnonymous": false, 291 | "isMe": true, 292 | "permissionId": "05076109866853157986", 293 | "photoUrl": "//lh6.googleusercontent.com/-cCJa7dTDcgQ/AAAAAAAAAAI/AAAAAAAACgw/r2EZ_8oYer4/s50-c-k-no/photo.jpg", 294 | "sessionId": "2f1ffade4c9f20de", 295 | "userId": "102167687554210253930" 296 | }, 297 | "user_tz": 420 298 | }, 299 | "id": "w9APjA-zmfjV", 300 | "outputId": "67cccb02-cdaf-4e47-d489-43bcc8d57bb8" 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "data_index = 0\n", 305 | "\n", 306 | "def generate_batch(batch_size, num_skips, skip_window):\n", 307 | " global data_index\n", 308 | " assert batch_size % num_skips == 0\n", 309 | " assert num_skips <= 2 * skip_window\n", 310 | " batch = np.ndarray(shape=(batch_size), dtype=np.int32)\n", 311 | " labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)\n", 312 | " span = 2 * skip_window + 1 # [ skip_window target skip_window ]\n", 313 | " buffer = collections.deque(maxlen=span)\n", 314 | " for _ in range(span):\n", 315 | " buffer.append(data[data_index])\n", 316 | " data_index = (data_index + 1) % len(data)\n", 317 | " for i in range(batch_size // num_skips):\n", 318 | " target = skip_window # target label at the center of the buffer\n", 319 | " targets_to_avoid = [ skip_window ]\n", 320 | " for j in range(num_skips):\n", 321 | " while target in targets_to_avoid:\n", 322 | " target = random.randint(0, span - 1)\n", 323 | " targets_to_avoid.append(target)\n", 324 | " batch[i * num_skips + j] = buffer[skip_window]\n", 325 | " labels[i * num_skips + j, 0] = buffer[target]\n", 326 | " buffer.append(data[data_index])\n", 327 | " data_index = (data_index + 1) % len(data)\n", 328 | " return batch, labels\n", 329 | "\n", 330 | "print('data:', [reverse_dictionary[di] for di in data[:8]])\n", 331 | "\n", 332 | "for num_skips, skip_window in [(2, 1), (4, 2)]:\n", 333 | " data_index = 0\n", 334 | " batch, labels = generate_batch(batch_size=8, num_skips=num_skips, skip_window=skip_window)\n", 335 | " print('\\nwith num_skips = %d and skip_window = %d:' % (num_skips, skip_window))\n", 336 | " print(' batch:', [reverse_dictionary[bi] for bi in batch])\n", 337 | " print(' labels:', [reverse_dictionary[li] for li in labels.reshape(8)])" 338 | ] 339 | }, 340 | { 341 | "cell_type": "markdown", 342 | "metadata": { 343 | "colab_type": "text", 344 | "id": "Ofd1MbBuwiva" 345 | }, 346 | "source": [ 347 | "Train a skip-gram model." 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": { 354 | "cellView": "both", 355 | "colab": { 356 | "autoexec": { 357 | "startup": false, 358 | "wait_interval": 0 359 | } 360 | }, 361 | "colab_type": "code", 362 | "collapsed": false, 363 | "id": "8pQKsV4Vwlzy" 364 | }, 365 | "outputs": [], 366 | "source": [ 367 | "batch_size = 128\n", 368 | "embedding_size = 128 # Dimension of the embedding vector.\n", 369 | "skip_window = 1 # How many words to consider left and right.\n", 370 | "num_skips = 2 # How many times to reuse an input to generate a label.\n", 371 | "# We pick a random validation set to sample nearest neighbors. here we limit the\n", 372 | "# validation samples to the words that have a low numeric ID, which by\n", 373 | "# construction are also the most frequent. \n", 374 | "valid_size = 16 # Random set of words to evaluate similarity on.\n", 375 | "valid_window = 100 # Only pick dev samples in the head of the distribution.\n", 376 | "valid_examples = np.array(random.sample(range(valid_window), valid_size))\n", 377 | "num_sampled = 64 # Number of negative examples to sample.\n", 378 | "\n", 379 | "graph = tf.Graph()\n", 380 | "\n", 381 | "with graph.as_default(), tf.device('/cpu:0'):\n", 382 | "\n", 383 | " # Input data.\n", 384 | " train_dataset = tf.placeholder(tf.int32, shape=[batch_size])\n", 385 | " train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])\n", 386 | " valid_dataset = tf.constant(valid_examples, dtype=tf.int32)\n", 387 | " \n", 388 | " # Variables.\n", 389 | " embeddings = tf.Variable(\n", 390 | " tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))\n", 391 | " softmax_weights = tf.Variable(\n", 392 | " tf.truncated_normal([vocabulary_size, embedding_size],\n", 393 | " stddev=1.0 / math.sqrt(embedding_size)))\n", 394 | " softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))\n", 395 | " \n", 396 | " # Model.\n", 397 | " # Look up embeddings for inputs.\n", 398 | " embed = tf.nn.embedding_lookup(embeddings, train_dataset)\n", 399 | " # Compute the softmax loss, using a sample of the negative labels each time.\n", 400 | " loss = tf.reduce_mean(\n", 401 | " tf.nn.sampled_softmax_loss(softmax_weights, softmax_biases, embed,\n", 402 | " train_labels, num_sampled, vocabulary_size))\n", 403 | "\n", 404 | " # Optimizer.\n", 405 | " # Note: The optimizer will optimize the softmax_weights AND the embeddings.\n", 406 | " # This is because the embeddings are defined as a variable quantity and the\n", 407 | " # optimizer's `minimize` method will by default modify all variable quantities \n", 408 | " # that contribute to the tensor it is passed.\n", 409 | " # See docs on `tf.train.Optimizer.minimize()` for more details.\n", 410 | " optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)\n", 411 | " \n", 412 | " # Compute the similarity between minibatch examples and all embeddings.\n", 413 | " # We use the cosine distance:\n", 414 | " norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))\n", 415 | " normalized_embeddings = embeddings / norm\n", 416 | " valid_embeddings = tf.nn.embedding_lookup(\n", 417 | " normalized_embeddings, valid_dataset)\n", 418 | " similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": { 425 | "cellView": "both", 426 | "colab": { 427 | "autoexec": { 428 | "startup": false, 429 | "wait_interval": 0 430 | }, 431 | "output_extras": [ 432 | { 433 | "item_id": 23 434 | }, 435 | { 436 | "item_id": 48 437 | }, 438 | { 439 | "item_id": 61 440 | } 441 | ] 442 | }, 443 | "colab_type": "code", 444 | "collapsed": false, 445 | "executionInfo": { 446 | "elapsed": 436189, 447 | "status": "ok", 448 | "timestamp": 1445965429787, 449 | "user": { 450 | "color": "#1FA15D", 451 | "displayName": "Vincent Vanhoucke", 452 | "isAnonymous": false, 453 | "isMe": true, 454 | "permissionId": "05076109866853157986", 455 | "photoUrl": "//lh6.googleusercontent.com/-cCJa7dTDcgQ/AAAAAAAAAAI/AAAAAAAACgw/r2EZ_8oYer4/s50-c-k-no/photo.jpg", 456 | "sessionId": "2f1ffade4c9f20de", 457 | "userId": "102167687554210253930" 458 | }, 459 | "user_tz": 420 460 | }, 461 | "id": "1bQFGceBxrWW", 462 | "outputId": "5ebd6d9a-33c6-4bcd-bf6d-252b0b6055e4" 463 | }, 464 | "outputs": [], 465 | "source": [ 466 | "num_steps = 100001\n", 467 | "\n", 468 | "with tf.Session(graph=graph) as session:\n", 469 | " tf.initialize_all_variables().run()\n", 470 | " print('Initialized')\n", 471 | " average_loss = 0\n", 472 | " for step in range(num_steps):\n", 473 | " batch_data, batch_labels = generate_batch(\n", 474 | " batch_size, num_skips, skip_window)\n", 475 | " feed_dict = {train_dataset : batch_data, train_labels : batch_labels}\n", 476 | " _, l = session.run([optimizer, loss], feed_dict=feed_dict)\n", 477 | " average_loss += l\n", 478 | " if step % 2000 == 0:\n", 479 | " if step > 0:\n", 480 | " average_loss = average_loss / 2000\n", 481 | " # The average loss is an estimate of the loss over the last 2000 batches.\n", 482 | " print('Average loss at step %d: %f' % (step, average_loss))\n", 483 | " average_loss = 0\n", 484 | " # note that this is expensive (~20% slowdown if computed every 500 steps)\n", 485 | " if step % 10000 == 0:\n", 486 | " sim = similarity.eval()\n", 487 | " for i in range(valid_size):\n", 488 | " valid_word = reverse_dictionary[valid_examples[i]]\n", 489 | " top_k = 8 # number of nearest neighbors\n", 490 | " nearest = (-sim[i, :]).argsort()[1:top_k+1]\n", 491 | " log = 'Nearest to %s:' % valid_word\n", 492 | " for k in range(top_k):\n", 493 | " close_word = reverse_dictionary[nearest[k]]\n", 494 | " log = '%s %s,' % (log, close_word)\n", 495 | " print(log)\n", 496 | " final_embeddings = normalized_embeddings.eval()" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": null, 502 | "metadata": { 503 | "cellView": "both", 504 | "colab": { 505 | "autoexec": { 506 | "startup": false, 507 | "wait_interval": 0 508 | } 509 | }, 510 | "colab_type": "code", 511 | "collapsed": false, 512 | "id": "jjJXYA_XzV79" 513 | }, 514 | "outputs": [], 515 | "source": [ 516 | "num_points = 400\n", 517 | "\n", 518 | "tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)\n", 519 | "two_d_embeddings = tsne.fit_transform(final_embeddings[1:num_points+1, :])" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": null, 525 | "metadata": { 526 | "cellView": "both", 527 | "colab": { 528 | "autoexec": { 529 | "startup": false, 530 | "wait_interval": 0 531 | }, 532 | "output_extras": [ 533 | { 534 | "item_id": 1 535 | } 536 | ] 537 | }, 538 | "colab_type": "code", 539 | "collapsed": false, 540 | "executionInfo": { 541 | "elapsed": 4763, 542 | "status": "ok", 543 | "timestamp": 1445965465525, 544 | "user": { 545 | "color": "#1FA15D", 546 | "displayName": "Vincent Vanhoucke", 547 | "isAnonymous": false, 548 | "isMe": true, 549 | "permissionId": "05076109866853157986", 550 | "photoUrl": "//lh6.googleusercontent.com/-cCJa7dTDcgQ/AAAAAAAAAAI/AAAAAAAACgw/r2EZ_8oYer4/s50-c-k-no/photo.jpg", 551 | "sessionId": "2f1ffade4c9f20de", 552 | "userId": "102167687554210253930" 553 | }, 554 | "user_tz": 420 555 | }, 556 | "id": "o_e0D_UezcDe", 557 | "outputId": "df22e4a5-e8ec-4e5e-d384-c6cf37c68c34" 558 | }, 559 | "outputs": [], 560 | "source": [ 561 | "def plot(embeddings, labels):\n", 562 | " assert embeddings.shape[0] >= len(labels), 'More labels than embeddings'\n", 563 | " pylab.figure(figsize=(15,15)) # in inches\n", 564 | " for i, label in enumerate(labels):\n", 565 | " x, y = embeddings[i,:]\n", 566 | " pylab.scatter(x, y)\n", 567 | " pylab.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points',\n", 568 | " ha='right', va='bottom')\n", 569 | " pylab.show()\n", 570 | "\n", 571 | "words = [reverse_dictionary[i] for i in range(1, num_points+1)]\n", 572 | "plot(two_d_embeddings, words)" 573 | ] 574 | }, 575 | { 576 | "cell_type": "markdown", 577 | "metadata": { 578 | "colab_type": "text", 579 | "id": "QB5EFrBnpNnc" 580 | }, 581 | "source": [ 582 | "---\n", 583 | "\n", 584 | "Problem\n", 585 | "-------\n", 586 | "\n", 587 | "An alternative to skip-gram is another Word2Vec model called [CBOW](http://arxiv.org/abs/1301.3781) (Continuous Bag of Words). In the CBOW model, instead of predicting a context word from a word vector, you predict a word from the sum of all the word vectors in its context. Implement and evaluate a CBOW model trained on the text8 dataset.\n", 588 | "\n", 589 | "---" 590 | ] 591 | } 592 | ], 593 | "metadata": { 594 | "colab": { 595 | "default_view": {}, 596 | "name": "5_word2vec.ipynb", 597 | "provenance": [], 598 | "version": "0.3.2", 599 | "views": {} 600 | }, 601 | "kernelspec": { 602 | "display_name": "Python 3", 603 | "language": "python", 604 | "name": "python3" 605 | }, 606 | "language_info": { 607 | "codemirror_mode": { 608 | "name": "ipython", 609 | "version": 3 610 | }, 611 | "file_extension": ".py", 612 | "mimetype": "text/x-python", 613 | "name": "python", 614 | "nbconvert_exporter": "python", 615 | "pygments_lexer": "ipython3", 616 | "version": "3.5.1" 617 | } 618 | }, 619 | "nbformat": 4, 620 | "nbformat_minor": 0 621 | } 622 | -------------------------------------------------------------------------------- /6_lstm.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "8tQJd2YSCfWR" 8 | }, 9 | "source": [] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": { 14 | "colab_type": "text", 15 | "id": "D7tqLMoKF6uq" 16 | }, 17 | "source": [ 18 | "Deep Learning\n", 19 | "=============\n", 20 | "\n", 21 | "Assignment 6\n", 22 | "------------\n", 23 | "\n", 24 | "After training a skip-gram model in `5_word2vec.ipynb`, the goal of this notebook is to train a LSTM character model over [Text8](http://mattmahoney.net/dc/textdata) data." 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": { 31 | "cellView": "both", 32 | "colab": { 33 | "autoexec": { 34 | "startup": false, 35 | "wait_interval": 0 36 | } 37 | }, 38 | "colab_type": "code", 39 | "collapsed": false, 40 | "id": "MvEblsgEXxrd" 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "# These are all the modules we'll be using later. Make sure you can import them\n", 45 | "# before proceeding further.\n", 46 | "from __future__ import print_function\n", 47 | "import os\n", 48 | "import numpy as np\n", 49 | "import random\n", 50 | "import string\n", 51 | "import tensorflow as tf\n", 52 | "import zipfile\n", 53 | "from six.moves import range\n", 54 | "from six.moves.urllib.request import urlretrieve" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "cellView": "both", 62 | "colab": { 63 | "autoexec": { 64 | "startup": false, 65 | "wait_interval": 0 66 | }, 67 | "output_extras": [ 68 | { 69 | "item_id": 1 70 | } 71 | ] 72 | }, 73 | "colab_type": "code", 74 | "collapsed": false, 75 | "executionInfo": { 76 | "elapsed": 5993, 77 | "status": "ok", 78 | "timestamp": 1445965582896, 79 | "user": { 80 | "color": "#1FA15D", 81 | "displayName": "Vincent Vanhoucke", 82 | "isAnonymous": false, 83 | "isMe": true, 84 | "permissionId": "05076109866853157986", 85 | "photoUrl": "//lh6.googleusercontent.com/-cCJa7dTDcgQ/AAAAAAAAAAI/AAAAAAAACgw/r2EZ_8oYer4/s50-c-k-no/photo.jpg", 86 | "sessionId": "6f6f07b359200c46", 87 | "userId": "102167687554210253930" 88 | }, 89 | "user_tz": 420 90 | }, 91 | "id": "RJ-o3UBUFtCw", 92 | "outputId": "d530534e-0791-4a94-ca6d-1c8f1b908a9e" 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "url = 'http://mattmahoney.net/dc/'\n", 97 | "\n", 98 | "def maybe_download(filename, expected_bytes):\n", 99 | " \"\"\"Download a file if not present, and make sure it's the right size.\"\"\"\n", 100 | " if not os.path.exists(filename):\n", 101 | " filename, _ = urlretrieve(url + filename, filename)\n", 102 | " statinfo = os.stat(filename)\n", 103 | " if statinfo.st_size == expected_bytes:\n", 104 | " print('Found and verified %s' % filename)\n", 105 | " else:\n", 106 | " print(statinfo.st_size)\n", 107 | " raise Exception(\n", 108 | " 'Failed to verify ' + filename + '. Can you get to it with a browser?')\n", 109 | " return filename\n", 110 | "\n", 111 | "filename = maybe_download('text8.zip', 31344016)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "cellView": "both", 119 | "colab": { 120 | "autoexec": { 121 | "startup": false, 122 | "wait_interval": 0 123 | }, 124 | "output_extras": [ 125 | { 126 | "item_id": 1 127 | } 128 | ] 129 | }, 130 | "colab_type": "code", 131 | "collapsed": false, 132 | "executionInfo": { 133 | "elapsed": 5982, 134 | "status": "ok", 135 | "timestamp": 1445965582916, 136 | "user": { 137 | "color": "#1FA15D", 138 | "displayName": "Vincent Vanhoucke", 139 | "isAnonymous": false, 140 | "isMe": true, 141 | "permissionId": "05076109866853157986", 142 | "photoUrl": "//lh6.googleusercontent.com/-cCJa7dTDcgQ/AAAAAAAAAAI/AAAAAAAACgw/r2EZ_8oYer4/s50-c-k-no/photo.jpg", 143 | "sessionId": "6f6f07b359200c46", 144 | "userId": "102167687554210253930" 145 | }, 146 | "user_tz": 420 147 | }, 148 | "id": "Mvf09fjugFU_", 149 | "outputId": "8f75db58-3862-404b-a0c3-799380597390" 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "def read_data(filename):\n", 154 | " f = zipfile.ZipFile(filename)\n", 155 | " for name in f.namelist():\n", 156 | " return tf.compat.as_str(f.read(name))\n", 157 | " f.close()\n", 158 | " \n", 159 | "text = read_data(filename)\n", 160 | "print('Data size %d' % len(text))" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": { 166 | "colab_type": "text", 167 | "id": "ga2CYACE-ghb" 168 | }, 169 | "source": [ 170 | "Create a small validation set." 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": { 177 | "cellView": "both", 178 | "colab": { 179 | "autoexec": { 180 | "startup": false, 181 | "wait_interval": 0 182 | }, 183 | "output_extras": [ 184 | { 185 | "item_id": 1 186 | } 187 | ] 188 | }, 189 | "colab_type": "code", 190 | "collapsed": false, 191 | "executionInfo": { 192 | "elapsed": 6184, 193 | "status": "ok", 194 | "timestamp": 1445965583138, 195 | "user": { 196 | "color": "#1FA15D", 197 | "displayName": "Vincent Vanhoucke", 198 | "isAnonymous": false, 199 | "isMe": true, 200 | "permissionId": "05076109866853157986", 201 | "photoUrl": "//lh6.googleusercontent.com/-cCJa7dTDcgQ/AAAAAAAAAAI/AAAAAAAACgw/r2EZ_8oYer4/s50-c-k-no/photo.jpg", 202 | "sessionId": "6f6f07b359200c46", 203 | "userId": "102167687554210253930" 204 | }, 205 | "user_tz": 420 206 | }, 207 | "id": "w-oBpfFG-j43", 208 | "outputId": "bdb96002-d021-4379-f6de-a977924f0d02" 209 | }, 210 | "outputs": [], 211 | "source": [ 212 | "valid_size = 1000\n", 213 | "valid_text = text[:valid_size]\n", 214 | "train_text = text[valid_size:]\n", 215 | "train_size = len(train_text)\n", 216 | "print(train_size, train_text[:64])\n", 217 | "print(valid_size, valid_text[:64])" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": { 223 | "colab_type": "text", 224 | "id": "Zdw6i4F8glpp" 225 | }, 226 | "source": [ 227 | "Utility functions to map characters to vocabulary IDs and back." 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": { 234 | "cellView": "both", 235 | "colab": { 236 | "autoexec": { 237 | "startup": false, 238 | "wait_interval": 0 239 | }, 240 | "output_extras": [ 241 | { 242 | "item_id": 1 243 | } 244 | ] 245 | }, 246 | "colab_type": "code", 247 | "collapsed": false, 248 | "executionInfo": { 249 | "elapsed": 6276, 250 | "status": "ok", 251 | "timestamp": 1445965583249, 252 | "user": { 253 | "color": "#1FA15D", 254 | "displayName": "Vincent Vanhoucke", 255 | "isAnonymous": false, 256 | "isMe": true, 257 | "permissionId": "05076109866853157986", 258 | "photoUrl": "//lh6.googleusercontent.com/-cCJa7dTDcgQ/AAAAAAAAAAI/AAAAAAAACgw/r2EZ_8oYer4/s50-c-k-no/photo.jpg", 259 | "sessionId": "6f6f07b359200c46", 260 | "userId": "102167687554210253930" 261 | }, 262 | "user_tz": 420 263 | }, 264 | "id": "gAL1EECXeZsD", 265 | "outputId": "88fc9032-feb9-45ff-a9a0-a26759cc1f2e" 266 | }, 267 | "outputs": [], 268 | "source": [ 269 | "vocabulary_size = len(string.ascii_lowercase) + 1 # [a-z] + ' '\n", 270 | "first_letter = ord(string.ascii_lowercase[0])\n", 271 | "\n", 272 | "def char2id(char):\n", 273 | " if char in string.ascii_lowercase:\n", 274 | " return ord(char) - first_letter + 1\n", 275 | " elif char == ' ':\n", 276 | " return 0\n", 277 | " else:\n", 278 | " print('Unexpected character: %s' % char)\n", 279 | " return 0\n", 280 | " \n", 281 | "def id2char(dictid):\n", 282 | " if dictid > 0:\n", 283 | " return chr(dictid + first_letter - 1)\n", 284 | " else:\n", 285 | " return ' '\n", 286 | "\n", 287 | "print(char2id('a'), char2id('z'), char2id(' '), char2id('ï'))\n", 288 | "print(id2char(1), id2char(26), id2char(0))" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": { 294 | "colab_type": "text", 295 | "id": "lFwoyygOmWsL" 296 | }, 297 | "source": [ 298 | "Function to generate a training batch for the LSTM model." 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": { 305 | "cellView": "both", 306 | "colab": { 307 | "autoexec": { 308 | "startup": false, 309 | "wait_interval": 0 310 | }, 311 | "output_extras": [ 312 | { 313 | "item_id": 1 314 | } 315 | ] 316 | }, 317 | "colab_type": "code", 318 | "collapsed": false, 319 | "executionInfo": { 320 | "elapsed": 6473, 321 | "status": "ok", 322 | "timestamp": 1445965583467, 323 | "user": { 324 | "color": "#1FA15D", 325 | "displayName": "Vincent Vanhoucke", 326 | "isAnonymous": false, 327 | "isMe": true, 328 | "permissionId": "05076109866853157986", 329 | "photoUrl": "//lh6.googleusercontent.com/-cCJa7dTDcgQ/AAAAAAAAAAI/AAAAAAAACgw/r2EZ_8oYer4/s50-c-k-no/photo.jpg", 330 | "sessionId": "6f6f07b359200c46", 331 | "userId": "102167687554210253930" 332 | }, 333 | "user_tz": 420 334 | }, 335 | "id": "d9wMtjy5hCj9", 336 | "outputId": "3dd79c80-454a-4be0-8b71-4a4a357b3367" 337 | }, 338 | "outputs": [], 339 | "source": [ 340 | "batch_size=64\n", 341 | "num_unrollings=10\n", 342 | "\n", 343 | "class BatchGenerator(object):\n", 344 | " def __init__(self, text, batch_size, num_unrollings):\n", 345 | " self._text = text\n", 346 | " self._text_size = len(text)\n", 347 | " self._batch_size = batch_size\n", 348 | " self._num_unrollings = num_unrollings\n", 349 | " segment = self._text_size // batch_size\n", 350 | " self._cursor = [ offset * segment for offset in range(batch_size)]\n", 351 | " self._last_batch = self._next_batch()\n", 352 | " \n", 353 | " def _next_batch(self):\n", 354 | " \"\"\"Generate a single batch from the current cursor position in the data.\"\"\"\n", 355 | " batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float)\n", 356 | " for b in range(self._batch_size):\n", 357 | " batch[b, char2id(self._text[self._cursor[b]])] = 1.0\n", 358 | " self._cursor[b] = (self._cursor[b] + 1) % self._text_size\n", 359 | " return batch\n", 360 | " \n", 361 | " def next(self):\n", 362 | " \"\"\"Generate the next array of batches from the data. The array consists of\n", 363 | " the last batch of the previous array, followed by num_unrollings new ones.\n", 364 | " \"\"\"\n", 365 | " batches = [self._last_batch]\n", 366 | " for step in range(self._num_unrollings):\n", 367 | " batches.append(self._next_batch())\n", 368 | " self._last_batch = batches[-1]\n", 369 | " return batches\n", 370 | "\n", 371 | "def characters(probabilities):\n", 372 | " \"\"\"Turn a 1-hot encoding or a probability distribution over the possible\n", 373 | " characters back into its (most likely) character representation.\"\"\"\n", 374 | " return [id2char(c) for c in np.argmax(probabilities, 1)]\n", 375 | "\n", 376 | "def batches2string(batches):\n", 377 | " \"\"\"Convert a sequence of batches back into their (most likely) string\n", 378 | " representation.\"\"\"\n", 379 | " s = [''] * batches[0].shape[0]\n", 380 | " for b in batches:\n", 381 | " s = [''.join(x) for x in zip(s, characters(b))]\n", 382 | " return s\n", 383 | "\n", 384 | "train_batches = BatchGenerator(train_text, batch_size, num_unrollings)\n", 385 | "valid_batches = BatchGenerator(valid_text, 1, 1)\n", 386 | "\n", 387 | "print(batches2string(train_batches.next()))\n", 388 | "print(batches2string(train_batches.next()))\n", 389 | "print(batches2string(valid_batches.next()))\n", 390 | "print(batches2string(valid_batches.next()))" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": null, 396 | "metadata": { 397 | "cellView": "both", 398 | "colab": { 399 | "autoexec": { 400 | "startup": false, 401 | "wait_interval": 0 402 | } 403 | }, 404 | "colab_type": "code", 405 | "collapsed": false, 406 | "id": "KyVd8FxT5QBc" 407 | }, 408 | "outputs": [], 409 | "source": [ 410 | "def logprob(predictions, labels):\n", 411 | " \"\"\"Log-probability of the true labels in a predicted batch.\"\"\"\n", 412 | " predictions[predictions < 1e-10] = 1e-10\n", 413 | " return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]\n", 414 | "\n", 415 | "def sample_distribution(distribution):\n", 416 | " \"\"\"Sample one element from a distribution assumed to be an array of normalized\n", 417 | " probabilities.\n", 418 | " \"\"\"\n", 419 | " r = random.uniform(0, 1)\n", 420 | " s = 0\n", 421 | " for i in range(len(distribution)):\n", 422 | " s += distribution[i]\n", 423 | " if s >= r:\n", 424 | " return i\n", 425 | " return len(distribution) - 1\n", 426 | "\n", 427 | "def sample(prediction):\n", 428 | " \"\"\"Turn a (column) prediction into 1-hot encoded samples.\"\"\"\n", 429 | " p = np.zeros(shape=[1, vocabulary_size], dtype=np.float)\n", 430 | " p[0, sample_distribution(prediction[0])] = 1.0\n", 431 | " return p\n", 432 | "\n", 433 | "def random_distribution():\n", 434 | " \"\"\"Generate a random column of probabilities.\"\"\"\n", 435 | " b = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])\n", 436 | " return b/np.sum(b, 1)[:,None]" 437 | ] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | "metadata": { 442 | "colab_type": "text", 443 | "id": "K8f67YXaDr4C" 444 | }, 445 | "source": [ 446 | "Simple LSTM Model." 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "metadata": { 453 | "cellView": "both", 454 | "colab": { 455 | "autoexec": { 456 | "startup": false, 457 | "wait_interval": 0 458 | } 459 | }, 460 | "colab_type": "code", 461 | "collapsed": false, 462 | "id": "Q5rxZK6RDuGe" 463 | }, 464 | "outputs": [], 465 | "source": [ 466 | "num_nodes = 64\n", 467 | "\n", 468 | "graph = tf.Graph()\n", 469 | "with graph.as_default():\n", 470 | " \n", 471 | " # Parameters:\n", 472 | " # Input gate: input, previous output, and bias.\n", 473 | " ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))\n", 474 | " im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))\n", 475 | " ib = tf.Variable(tf.zeros([1, num_nodes]))\n", 476 | " # Forget gate: input, previous output, and bias.\n", 477 | " fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))\n", 478 | " fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))\n", 479 | " fb = tf.Variable(tf.zeros([1, num_nodes]))\n", 480 | " # Memory cell: input, state and bias. \n", 481 | " cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))\n", 482 | " cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))\n", 483 | " cb = tf.Variable(tf.zeros([1, num_nodes]))\n", 484 | " # Output gate: input, previous output, and bias.\n", 485 | " ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))\n", 486 | " om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))\n", 487 | " ob = tf.Variable(tf.zeros([1, num_nodes]))\n", 488 | " # Variables saving state across unrollings.\n", 489 | " saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)\n", 490 | " saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)\n", 491 | " # Classifier weights and biases.\n", 492 | " w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))\n", 493 | " b = tf.Variable(tf.zeros([vocabulary_size]))\n", 494 | " \n", 495 | " # Definition of the cell computation.\n", 496 | " def lstm_cell(i, o, state):\n", 497 | " \"\"\"Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf\n", 498 | " Note that in this formulation, we omit the various connections between the\n", 499 | " previous state and the gates.\"\"\"\n", 500 | " input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)\n", 501 | " forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)\n", 502 | " update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb\n", 503 | " state = forget_gate * state + input_gate * tf.tanh(update)\n", 504 | " output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)\n", 505 | " return output_gate * tf.tanh(state), state\n", 506 | "\n", 507 | " # Input data.\n", 508 | " train_data = list()\n", 509 | " for _ in range(num_unrollings + 1):\n", 510 | " train_data.append(\n", 511 | " tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))\n", 512 | " train_inputs = train_data[:num_unrollings]\n", 513 | " train_labels = train_data[1:] # labels are inputs shifted by one time step.\n", 514 | "\n", 515 | " # Unrolled LSTM loop.\n", 516 | " outputs = list()\n", 517 | " output = saved_output\n", 518 | " state = saved_state\n", 519 | " for i in train_inputs:\n", 520 | " output, state = lstm_cell(i, output, state)\n", 521 | " outputs.append(output)\n", 522 | "\n", 523 | " # State saving across unrollings.\n", 524 | " with tf.control_dependencies([saved_output.assign(output),\n", 525 | " saved_state.assign(state)]):\n", 526 | " # Classifier.\n", 527 | " logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b)\n", 528 | " loss = tf.reduce_mean(\n", 529 | " tf.nn.softmax_cross_entropy_with_logits(\n", 530 | " logits, tf.concat(0, train_labels)))\n", 531 | "\n", 532 | " # Optimizer.\n", 533 | " global_step = tf.Variable(0)\n", 534 | " learning_rate = tf.train.exponential_decay(\n", 535 | " 10.0, global_step, 5000, 0.1, staircase=True)\n", 536 | " optimizer = tf.train.GradientDescentOptimizer(learning_rate)\n", 537 | " gradients, v = zip(*optimizer.compute_gradients(loss))\n", 538 | " gradients, _ = tf.clip_by_global_norm(gradients, 1.25)\n", 539 | " optimizer = optimizer.apply_gradients(\n", 540 | " zip(gradients, v), global_step=global_step)\n", 541 | "\n", 542 | " # Predictions.\n", 543 | " train_prediction = tf.nn.softmax(logits)\n", 544 | " \n", 545 | " # Sampling and validation eval: batch 1, no unrolling.\n", 546 | " sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])\n", 547 | " saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))\n", 548 | " saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))\n", 549 | " reset_sample_state = tf.group(\n", 550 | " saved_sample_output.assign(tf.zeros([1, num_nodes])),\n", 551 | " saved_sample_state.assign(tf.zeros([1, num_nodes])))\n", 552 | " sample_output, sample_state = lstm_cell(\n", 553 | " sample_input, saved_sample_output, saved_sample_state)\n", 554 | " with tf.control_dependencies([saved_sample_output.assign(sample_output),\n", 555 | " saved_sample_state.assign(sample_state)]):\n", 556 | " sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": null, 562 | "metadata": { 563 | "cellView": "both", 564 | "colab": { 565 | "autoexec": { 566 | "startup": false, 567 | "wait_interval": 0 568 | }, 569 | "output_extras": [ 570 | { 571 | "item_id": 41 572 | }, 573 | { 574 | "item_id": 80 575 | }, 576 | { 577 | "item_id": 126 578 | }, 579 | { 580 | "item_id": 144 581 | } 582 | ] 583 | }, 584 | "colab_type": "code", 585 | "collapsed": false, 586 | "executionInfo": { 587 | "elapsed": 199909, 588 | "status": "ok", 589 | "timestamp": 1445965877333, 590 | "user": { 591 | "color": "#1FA15D", 592 | "displayName": "Vincent Vanhoucke", 593 | "isAnonymous": false, 594 | "isMe": true, 595 | "permissionId": "05076109866853157986", 596 | "photoUrl": "//lh6.googleusercontent.com/-cCJa7dTDcgQ/AAAAAAAAAAI/AAAAAAAACgw/r2EZ_8oYer4/s50-c-k-no/photo.jpg", 597 | "sessionId": "6f6f07b359200c46", 598 | "userId": "102167687554210253930" 599 | }, 600 | "user_tz": 420 601 | }, 602 | "id": "RD9zQCZTEaEm", 603 | "outputId": "5e868466-2532-4545-ce35-b403cf5d9de6" 604 | }, 605 | "outputs": [], 606 | "source": [ 607 | "num_steps = 7001\n", 608 | "summary_frequency = 100\n", 609 | "\n", 610 | "with tf.Session(graph=graph) as session:\n", 611 | " tf.initialize_all_variables().run()\n", 612 | " print('Initialized')\n", 613 | " mean_loss = 0\n", 614 | " for step in range(num_steps):\n", 615 | " batches = train_batches.next()\n", 616 | " feed_dict = dict()\n", 617 | " for i in range(num_unrollings + 1):\n", 618 | " feed_dict[train_data[i]] = batches[i]\n", 619 | " _, l, predictions, lr = session.run(\n", 620 | " [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)\n", 621 | " mean_loss += l\n", 622 | " if step % summary_frequency == 0:\n", 623 | " if step > 0:\n", 624 | " mean_loss = mean_loss / summary_frequency\n", 625 | " # The mean loss is an estimate of the loss over the last few batches.\n", 626 | " print(\n", 627 | " 'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))\n", 628 | " mean_loss = 0\n", 629 | " labels = np.concatenate(list(batches)[1:])\n", 630 | " print('Minibatch perplexity: %.2f' % float(\n", 631 | " np.exp(logprob(predictions, labels))))\n", 632 | " if step % (summary_frequency * 10) == 0:\n", 633 | " # Generate some samples.\n", 634 | " print('=' * 80)\n", 635 | " for _ in range(5):\n", 636 | " feed = sample(random_distribution())\n", 637 | " sentence = characters(feed)[0]\n", 638 | " reset_sample_state.run()\n", 639 | " for _ in range(79):\n", 640 | " prediction = sample_prediction.eval({sample_input: feed})\n", 641 | " feed = sample(prediction)\n", 642 | " sentence += characters(feed)[0]\n", 643 | " print(sentence)\n", 644 | " print('=' * 80)\n", 645 | " # Measure validation set perplexity.\n", 646 | " reset_sample_state.run()\n", 647 | " valid_logprob = 0\n", 648 | " for _ in range(valid_size):\n", 649 | " b = valid_batches.next()\n", 650 | " predictions = sample_prediction.eval({sample_input: b[0]})\n", 651 | " valid_logprob = valid_logprob + logprob(predictions, b[1])\n", 652 | " print('Validation set perplexity: %.2f' % float(np.exp(\n", 653 | " valid_logprob / valid_size)))" 654 | ] 655 | }, 656 | { 657 | "cell_type": "markdown", 658 | "metadata": { 659 | "colab_type": "text", 660 | "id": "pl4vtmFfa5nn" 661 | }, 662 | "source": [ 663 | "---\n", 664 | "Problem 1\n", 665 | "---------\n", 666 | "\n", 667 | "You might have noticed that the definition of the LSTM cell involves 4 matrix multiplications with the input, and 4 matrix multiplications with the output. Simplify the expression by using a single matrix multiply for each, and variables that are 4 times larger.\n", 668 | "\n", 669 | "---" 670 | ] 671 | }, 672 | { 673 | "cell_type": "markdown", 674 | "metadata": { 675 | "colab_type": "text", 676 | "id": "4eErTCTybtph" 677 | }, 678 | "source": [ 679 | "---\n", 680 | "Problem 2\n", 681 | "---------\n", 682 | "\n", 683 | "We want to train a LSTM over bigrams, that is pairs of consecutive characters like 'ab' instead of single characters like 'a'. Since the number of possible bigrams is large, feeding them directly to the LSTM using 1-hot encodings will lead to a very sparse representation that is very wasteful computationally.\n", 684 | "\n", 685 | "a- Introduce an embedding lookup on the inputs, and feed the embeddings to the LSTM cell instead of the inputs themselves.\n", 686 | "\n", 687 | "b- Write a bigram-based LSTM, modeled on the character LSTM above.\n", 688 | "\n", 689 | "c- Introduce Dropout. For best practices on how to use Dropout in LSTMs, refer to this [article](http://arxiv.org/abs/1409.2329).\n", 690 | "\n", 691 | "---" 692 | ] 693 | }, 694 | { 695 | "cell_type": "markdown", 696 | "metadata": { 697 | "colab_type": "text", 698 | "id": "Y5tapX3kpcqZ" 699 | }, 700 | "source": [ 701 | "---\n", 702 | "Problem 3\n", 703 | "---------\n", 704 | "\n", 705 | "(difficult!)\n", 706 | "\n", 707 | "Write a sequence-to-sequence LSTM which mirrors all the words in a sentence. For example, if your input is:\n", 708 | "\n", 709 | " the quick brown fox\n", 710 | " \n", 711 | "the model should attempt to output:\n", 712 | "\n", 713 | " eht kciuq nworb xof\n", 714 | " \n", 715 | "Refer to the lecture on how to put together a sequence-to-sequence model, as well as [this article](http://arxiv.org/abs/1409.3215) for best practices.\n", 716 | "\n", 717 | "---" 718 | ] 719 | } 720 | ], 721 | "metadata": { 722 | "colab": { 723 | "default_view": {}, 724 | "name": "6_lstm.ipynb", 725 | "provenance": [], 726 | "version": "0.3.2", 727 | "views": {} 728 | }, 729 | "kernelspec": { 730 | "display_name": "Python 3", 731 | "language": "python", 732 | "name": "python3" 733 | }, 734 | "language_info": { 735 | "codemirror_mode": { 736 | "name": "ipython", 737 | "version": 3 738 | }, 739 | "file_extension": ".py", 740 | "mimetype": "text/x-python", 741 | "name": "python", 742 | "nbconvert_exporter": "python", 743 | "pygments_lexer": "ipython3", 744 | "version": "3.5.1" 745 | } 746 | }, 747 | "nbformat": 4, 748 | "nbformat_minor": 0 749 | } 750 | -------------------------------------------------------------------------------- /2_fullyconnected.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "kR-4eNdK6lYS" 8 | }, 9 | "source": [ 10 | "Deep Learning\n", 11 | "=============\n", 12 | "\n", 13 | "Assignment 2\n", 14 | "------------\n", 15 | "\n", 16 | "Previously in `1_notmnist.ipynb`, we created a pickle with formatted datasets for training, development and testing on the [notMNIST dataset](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html).\n", 17 | "\n", 18 | "The goal of this assignment is to progressively train deeper and more accurate models using TensorFlow." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": { 25 | "cellView": "both", 26 | "colab": { 27 | "autoexec": { 28 | "startup": false, 29 | "wait_interval": 0 30 | } 31 | }, 32 | "colab_type": "code", 33 | "collapsed": false, 34 | "id": "JLpLa8Jt7Vu4" 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "# These are all the modules we'll be using later. Make sure you can import them\n", 39 | "# before proceeding further.\n", 40 | "from __future__ import print_function\n", 41 | "import numpy as np\n", 42 | "import tensorflow as tf\n", 43 | "from six.moves import cPickle as pickle\n", 44 | "from six.moves import range" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": { 50 | "colab_type": "text", 51 | "id": "1HrCK6e17WzV" 52 | }, 53 | "source": [ 54 | "First reload the data we generated in `1_notmnist.ipynb`." 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 2, 60 | "metadata": { 61 | "cellView": "both", 62 | "colab": { 63 | "autoexec": { 64 | "startup": false, 65 | "wait_interval": 0 66 | }, 67 | "output_extras": [ 68 | { 69 | "item_id": 1 70 | } 71 | ] 72 | }, 73 | "colab_type": "code", 74 | "collapsed": false, 75 | "executionInfo": { 76 | "elapsed": 19456, 77 | "status": "ok", 78 | "timestamp": 1449847956073, 79 | "user": { 80 | "color": "", 81 | "displayName": "", 82 | "isAnonymous": false, 83 | "isMe": true, 84 | "permissionId": "", 85 | "photoUrl": "", 86 | "sessionId": "0", 87 | "userId": "" 88 | }, 89 | "user_tz": 480 90 | }, 91 | "id": "y3-cj1bpmuxc", 92 | "outputId": "0ddb1607-1fc4-4ddb-de28-6c7ab7fb0c33" 93 | }, 94 | "outputs": [ 95 | { 96 | "name": "stdout", 97 | "output_type": "stream", 98 | "text": [ 99 | "Training set (200000, 28, 28) (200000,)\n", 100 | "Validation set (10000, 28, 28) (10000,)\n", 101 | "Test set (10000, 28, 28) (10000,)\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "pickle_file = 'notMNIST.pickle'\n", 107 | "\n", 108 | "with open(pickle_file, 'rb') as f:\n", 109 | " save = pickle.load(f)\n", 110 | " train_dataset = save['train_dataset']\n", 111 | " train_labels = save['train_labels']\n", 112 | " valid_dataset = save['valid_dataset']\n", 113 | " valid_labels = save['valid_labels']\n", 114 | " test_dataset = save['test_dataset']\n", 115 | " test_labels = save['test_labels']\n", 116 | " del save # hint to help gc free up memory\n", 117 | " print('Training set', train_dataset.shape, train_labels.shape)\n", 118 | " print('Validation set', valid_dataset.shape, valid_labels.shape)\n", 119 | " print('Test set', test_dataset.shape, test_labels.shape)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": { 125 | "colab_type": "text", 126 | "id": "L7aHrm6nGDMB" 127 | }, 128 | "source": [ 129 | "Reformat into a shape that's more adapted to the models we're going to train:\n", 130 | "- data as a flat matrix,\n", 131 | "- labels as float 1-hot encodings." 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 3, 137 | "metadata": { 138 | "cellView": "both", 139 | "colab": { 140 | "autoexec": { 141 | "startup": false, 142 | "wait_interval": 0 143 | }, 144 | "output_extras": [ 145 | { 146 | "item_id": 1 147 | } 148 | ] 149 | }, 150 | "colab_type": "code", 151 | "collapsed": false, 152 | "executionInfo": { 153 | "elapsed": 19723, 154 | "status": "ok", 155 | "timestamp": 1449847956364, 156 | "user": { 157 | "color": "", 158 | "displayName": "", 159 | "isAnonymous": false, 160 | "isMe": true, 161 | "permissionId": "", 162 | "photoUrl": "", 163 | "sessionId": "0", 164 | "userId": "" 165 | }, 166 | "user_tz": 480 167 | }, 168 | "id": "IRSyYiIIGIzS", 169 | "outputId": "2ba0fc75-1487-4ace-a562-cf81cae82793" 170 | }, 171 | "outputs": [ 172 | { 173 | "name": "stdout", 174 | "output_type": "stream", 175 | "text": [ 176 | "Training set (200000, 784) (200000, 10)\n", 177 | "Validation set (10000, 784) (10000, 10)\n", 178 | "Test set (10000, 784) (10000, 10)\n" 179 | ] 180 | } 181 | ], 182 | "source": [ 183 | "image_size = 28\n", 184 | "num_labels = 10\n", 185 | "\n", 186 | "def reformat(dataset, labels):\n", 187 | " dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)\n", 188 | " # Map 0 to [1.0, 0.0, 0.0 ...], 1 to [0.0, 1.0, 0.0 ...]\n", 189 | " labels = (np.arange(num_labels) == labels[:, None]).astype(np.float32)\n", 190 | " return dataset, labels\n", 191 | "train_dataset, train_labels = reformat(train_dataset, train_labels)\n", 192 | "valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)\n", 193 | "test_dataset, test_labels = reformat(test_dataset, test_labels)\n", 194 | "print('Training set', train_dataset.shape, train_labels.shape)\n", 195 | "print('Validation set', valid_dataset.shape, valid_labels.shape)\n", 196 | "print('Test set', test_dataset.shape, test_labels.shape)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": { 202 | "colab_type": "text", 203 | "id": "nCLVqyQ5vPPH" 204 | }, 205 | "source": [ 206 | "We're first going to train a multinomial logistic regression using simple gradient descent.\n", 207 | "\n", 208 | "TensorFlow works like this:\n", 209 | "* First you describe the computation that you want to see performed: what the inputs, the variables, and the operations look like. These get created as nodes over a computation graph. This description is all contained within the block below:\n", 210 | "\n", 211 | " with graph.as_default():\n", 212 | " ...\n", 213 | "\n", 214 | "* Then you can run the operations on this graph as many times as you want by calling `session.run()`, providing it outputs to fetch from the graph that get returned. This runtime operation is all contained in the block below:\n", 215 | "\n", 216 | " with tf.Session(graph=graph) as session:\n", 217 | " ...\n", 218 | "\n", 219 | "Let's load all the data into TensorFlow and build the computation graph corresponding to our training:" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 4, 225 | "metadata": { 226 | "cellView": "both", 227 | "colab": { 228 | "autoexec": { 229 | "startup": false, 230 | "wait_interval": 0 231 | } 232 | }, 233 | "colab_type": "code", 234 | "collapsed": false, 235 | "id": "Nfv39qvtvOl_" 236 | }, 237 | "outputs": [], 238 | "source": [ 239 | "# With gradient descent training, even this much data is prohibitive.\n", 240 | "# Subset the training data for faster turnaround.\n", 241 | "train_subset = 10000\n", 242 | "\n", 243 | "graph = tf.Graph()\n", 244 | "with graph.as_default():\n", 245 | "\n", 246 | " # Input data.\n", 247 | " # Load the training, validation and test data into constants that are\n", 248 | " # attached to the graph.\n", 249 | " tf_train_dataset = tf.constant(train_dataset[:train_subset, :])\n", 250 | " tf_train_labels = tf.constant(train_labels[:train_subset])\n", 251 | " tf_valid_dataset = tf.constant(valid_dataset)\n", 252 | " tf_test_dataset = tf.constant(test_dataset)\n", 253 | "\n", 254 | " # Variables.\n", 255 | " # These are the parameters that we are going to be training. The weight\n", 256 | " # matrix will be initialized using random valued following a (truncated)\n", 257 | " # normal distribution. The biases get initialized to zero.\n", 258 | " weights = tf.Variable(\n", 259 | " tf.truncated_normal([image_size * image_size, num_labels]))\n", 260 | " biases = tf.Variable(tf.zeros([num_labels]))\n", 261 | "\n", 262 | " # Training computation.\n", 263 | " # We multiply the inputs with the weight matrix, and add biases. We compute\n", 264 | " # the softmax and cross-entropy (it's one operation in TensorFlow, because\n", 265 | " # it's very common, and it can be optimized). We take the average of this\n", 266 | " # cross-entropy across all training examples: that's our loss.\n", 267 | " logits = tf.matmul(tf_train_dataset, weights) + biases\n", 268 | " loss = tf.reduce_mean(\n", 269 | " tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))\n", 270 | "\n", 271 | " # Optimizer.\n", 272 | " # We are going to find the minimum of this loss using gradient descent.\n", 273 | " optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)\n", 274 | "\n", 275 | " # Predictions for the training, validation, and test data.\n", 276 | " # These are not part of training, but merely here so that we can report\n", 277 | " # accuracy figures as we train.\n", 278 | " train_prediction = tf.nn.softmax(logits)\n", 279 | " valid_prediction = tf.nn.softmax(\n", 280 | " tf.matmul(tf_valid_dataset, weights) + biases)\n", 281 | " test_prediction = tf.nn.softmax(\n", 282 | " tf.matmul(tf_test_dataset, weights) + biases)" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": { 288 | "colab_type": "text", 289 | "id": "KQcL4uqISHjP" 290 | }, 291 | "source": [ 292 | "Let's run this computation and iterate:" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 5, 298 | "metadata": { 299 | "cellView": "both", 300 | "colab": { 301 | "autoexec": { 302 | "startup": false, 303 | "wait_interval": 0 304 | }, 305 | "output_extras": [ 306 | { 307 | "item_id": 9 308 | } 309 | ] 310 | }, 311 | "colab_type": "code", 312 | "collapsed": false, 313 | "executionInfo": { 314 | "elapsed": 57454, 315 | "status": "ok", 316 | "timestamp": 1449847994134, 317 | "user": { 318 | "color": "", 319 | "displayName": "", 320 | "isAnonymous": false, 321 | "isMe": true, 322 | "permissionId": "", 323 | "photoUrl": "", 324 | "sessionId": "0", 325 | "userId": "" 326 | }, 327 | "user_tz": 480 328 | }, 329 | "id": "z2cjdenH869W", 330 | "outputId": "4c037ba1-b526-4d8e-e632-91e2a0333267" 331 | }, 332 | "outputs": [ 333 | { 334 | "name": "stdout", 335 | "output_type": "stream", 336 | "text": [ 337 | "Initialized\n", 338 | "Loss at step 0: 15.285550\n", 339 | "Training accuracy: 14.3%\n", 340 | "Validation accuracy: 16.5%\n", 341 | "Loss at step 100: 2.357812\n", 342 | "Training accuracy: 72.0%\n", 343 | "Validation accuracy: 70.7%\n", 344 | "Loss at step 200: 1.876882\n", 345 | "Training accuracy: 74.8%\n", 346 | "Validation accuracy: 73.2%\n", 347 | "Loss at step 300: 1.612335\n", 348 | "Training accuracy: 76.0%\n", 349 | "Validation accuracy: 74.1%\n", 350 | "Loss at step 400: 1.436991\n", 351 | "Training accuracy: 77.0%\n", 352 | "Validation accuracy: 74.6%\n", 353 | "Loss at step 500: 1.308938\n", 354 | "Training accuracy: 77.8%\n", 355 | "Validation accuracy: 75.0%\n", 356 | "Loss at step 600: 1.209363\n", 357 | "Training accuracy: 78.6%\n", 358 | "Validation accuracy: 75.3%\n", 359 | "Loss at step 700: 1.128800\n", 360 | "Training accuracy: 79.2%\n", 361 | "Validation accuracy: 75.5%\n", 362 | "Loss at step 800: 1.061904\n", 363 | "Training accuracy: 79.5%\n", 364 | "Validation accuracy: 75.7%\n", 365 | "Test accuracy: 83.4%\n" 366 | ] 367 | } 368 | ], 369 | "source": [ 370 | "num_steps = 801\n", 371 | "\n", 372 | "\n", 373 | "def accuracy(predictions, labels):\n", 374 | " return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))\n", 375 | " / predictions.shape[0])\n", 376 | "\n", 377 | "with tf.Session(graph=graph) as session:\n", 378 | " # This is a one-time operation which ensures the parameters get initialized as\n", 379 | " # we described in the graph: random weights for the matrix, zeros for the\n", 380 | " # biases.\n", 381 | " tf.initialize_all_variables().run()\n", 382 | " print('Initialized')\n", 383 | " for step in range(num_steps):\n", 384 | " # Run the computations. We tell .run() that we want to run the optimizer,\n", 385 | " # and get the loss value and the training predictions returned as numpy\n", 386 | " # arrays.\n", 387 | " _, l, predictions = session.run([optimizer, loss, train_prediction])\n", 388 | " if (step % 100 == 0):\n", 389 | " print('Loss at step %d: %f' % (step, l))\n", 390 | " print('Training accuracy: %.1f%%' % accuracy(\n", 391 | " predictions, train_labels[:train_subset, :]))\n", 392 | " # Calling .eval() on valid_prediction is basically like calling run(), but\n", 393 | " # just to get that one numpy array. Note that it recomputes all its graph\n", 394 | " # dependencies.\n", 395 | " print('Validation accuracy: %.1f%%' % accuracy(\n", 396 | " valid_prediction.eval(), valid_labels))\n", 397 | " print('Test accuracy: %.1f%%' %\n", 398 | " accuracy(test_prediction.eval(), test_labels))" 399 | ] 400 | }, 401 | { 402 | "cell_type": "markdown", 403 | "metadata": { 404 | "colab_type": "text", 405 | "id": "x68f-hxRGm3H" 406 | }, 407 | "source": [ 408 | "Let's now switch to stochastic gradient descent training instead, which is much faster.\n", 409 | "\n", 410 | "The graph will be similar, except that instead of holding all the training data into a constant node, we create a `Placeholder` node which will be fed actual data at every call of `session.run()`." 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 6, 416 | "metadata": { 417 | "cellView": "both", 418 | "colab": { 419 | "autoexec": { 420 | "startup": false, 421 | "wait_interval": 0 422 | } 423 | }, 424 | "colab_type": "code", 425 | "collapsed": false, 426 | "id": "qhPMzWYRGrzM" 427 | }, 428 | "outputs": [], 429 | "source": [ 430 | "batch_size = 128\n", 431 | "\n", 432 | "graph = tf.Graph()\n", 433 | "with graph.as_default():\n", 434 | "\n", 435 | " # Input data. For the training data, we use a placeholder that will be fed\n", 436 | " # at run time with a training minibatch.\n", 437 | " tf_train_dataset = tf.placeholder(tf.float32,\n", 438 | " shape=(batch_size, image_size * image_size))\n", 439 | " tf_train_labels = tf.placeholder(\n", 440 | " tf.float32, shape=(batch_size, num_labels))\n", 441 | " tf_valid_dataset = tf.constant(valid_dataset)\n", 442 | " tf_test_dataset = tf.constant(test_dataset)\n", 443 | "\n", 444 | " # Variables.\n", 445 | " weights = tf.Variable(\n", 446 | " tf.truncated_normal([image_size * image_size, num_labels]))\n", 447 | " biases = tf.Variable(tf.zeros([num_labels]))\n", 448 | "\n", 449 | " # Training computation.\n", 450 | " logits = tf.matmul(tf_train_dataset, weights) + biases\n", 451 | " loss = tf.reduce_mean(\n", 452 | " tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))\n", 453 | "\n", 454 | " # Optimizer.\n", 455 | " optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)\n", 456 | "\n", 457 | " # Predictions for the training, validation, and test data.\n", 458 | " train_prediction = tf.nn.softmax(logits)\n", 459 | " valid_prediction = tf.nn.softmax(\n", 460 | " tf.matmul(tf_valid_dataset, weights) + biases)\n", 461 | " test_prediction = tf.nn.softmax(\n", 462 | " tf.matmul(tf_test_dataset, weights) + biases)" 463 | ] 464 | }, 465 | { 466 | "cell_type": "markdown", 467 | "metadata": { 468 | "colab_type": "text", 469 | "id": "XmVZESmtG4JH" 470 | }, 471 | "source": [ 472 | "Let's run it:" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": 7, 478 | "metadata": { 479 | "cellView": "both", 480 | "colab": { 481 | "autoexec": { 482 | "startup": false, 483 | "wait_interval": 0 484 | }, 485 | "output_extras": [ 486 | { 487 | "item_id": 6 488 | } 489 | ] 490 | }, 491 | "colab_type": "code", 492 | "collapsed": false, 493 | "executionInfo": { 494 | "elapsed": 66292, 495 | "status": "ok", 496 | "timestamp": 1449848003013, 497 | "user": { 498 | "color": "", 499 | "displayName": "", 500 | "isAnonymous": false, 501 | "isMe": true, 502 | "permissionId": "", 503 | "photoUrl": "", 504 | "sessionId": "0", 505 | "userId": "" 506 | }, 507 | "user_tz": 480 508 | }, 509 | "id": "FoF91pknG_YW", 510 | "outputId": "d255c80e-954d-4183-ca1c-c7333ce91d0a" 511 | }, 512 | "outputs": [ 513 | { 514 | "name": "stdout", 515 | "output_type": "stream", 516 | "text": [ 517 | "Initialized\n", 518 | "Minibatch loss at step 0: 16.295612\n", 519 | "Minibatch accuracy: 11.7%\n", 520 | "Validation accuracy: 15.4%\n", 521 | "Minibatch loss at step 500: 1.773802\n", 522 | "Minibatch accuracy: 78.9%\n", 523 | "Validation accuracy: 76.3%\n", 524 | "Minibatch loss at step 1000: 0.957548\n", 525 | "Minibatch accuracy: 80.5%\n", 526 | "Validation accuracy: 76.9%\n", 527 | "Minibatch loss at step 1500: 1.148340\n", 528 | "Minibatch accuracy: 76.6%\n", 529 | "Validation accuracy: 77.2%\n", 530 | "Minibatch loss at step 2000: 0.769025\n", 531 | "Minibatch accuracy: 82.0%\n", 532 | "Validation accuracy: 78.2%\n", 533 | "Minibatch loss at step 2500: 1.294803\n", 534 | "Minibatch accuracy: 70.3%\n", 535 | "Validation accuracy: 78.4%\n", 536 | "Minibatch loss at step 3000: 0.863623\n", 537 | "Minibatch accuracy: 82.8%\n", 538 | "Validation accuracy: 78.5%\n", 539 | "Test accuracy: 85.8%\n" 540 | ] 541 | } 542 | ], 543 | "source": [ 544 | "num_steps = 3001\n", 545 | "\n", 546 | "with tf.Session(graph=graph) as session:\n", 547 | " tf.initialize_all_variables().run()\n", 548 | " print(\"Initialized\")\n", 549 | " for step in range(num_steps):\n", 550 | " # Pick an offset within the training data, which has been randomized.\n", 551 | " # Note: we could use better randomization across epochs.\n", 552 | " offset = (step * batch_size) % (train_labels.shape[0] - batch_size)\n", 553 | " # Generate a minibatch.\n", 554 | " batch_data = train_dataset[offset:(offset + batch_size), :]\n", 555 | " batch_labels = train_labels[offset:(offset + batch_size), :]\n", 556 | " # Prepare a dictionary telling the session where to feed the minibatch.\n", 557 | " # The key of the dictionary is the placeholder node of the graph to be fed,\n", 558 | " # and the value is the numpy array to feed to it.\n", 559 | " feed_dict = {\n", 560 | " tf_train_dataset: batch_data, tf_train_labels: batch_labels}\n", 561 | " _, l, predictions = session.run(\n", 562 | " [optimizer, loss, train_prediction], feed_dict=feed_dict)\n", 563 | " if (step % 500 == 0):\n", 564 | " print(\"Minibatch loss at step %d: %f\" % (step, l))\n", 565 | " print(\"Minibatch accuracy: %.1f%%\" %\n", 566 | " accuracy(predictions, batch_labels))\n", 567 | " print(\"Validation accuracy: %.1f%%\" % accuracy(\n", 568 | " valid_prediction.eval(), valid_labels))\n", 569 | " print(\"Test accuracy: %.1f%%\" %\n", 570 | " accuracy(test_prediction.eval(), test_labels))" 571 | ] 572 | }, 573 | { 574 | "cell_type": "markdown", 575 | "metadata": { 576 | "colab_type": "text", 577 | "id": "7omWxtvLLxik" 578 | }, 579 | "source": [ 580 | "---\n", 581 | "Problem\n", 582 | "-------\n", 583 | "\n", 584 | "Turn the logistic regression example with SGD into a 1-hidden layer neural network with rectified linear units [nn.relu()](https://www.tensorflow.org/versions/r0.7/api_docs/python/nn.html#relu) and 1024 hidden nodes. This model should improve your validation / test accuracy.\n", 585 | "\n", 586 | "---" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": 8, 592 | "metadata": { 593 | "collapsed": false 594 | }, 595 | "outputs": [], 596 | "source": [ 597 | "batch_size = 128\n", 598 | "hidden_layer_size = 1024\n", 599 | "\n", 600 | "graph = tf.Graph()\n", 601 | "with graph.as_default():\n", 602 | "\n", 603 | " # Input data. For the training data, we use a placeholder that will be fed\n", 604 | " # at run time with a training minibatch.\n", 605 | " tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))\n", 606 | " tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))\n", 607 | " tf_valid_dataset = tf.constant(valid_dataset)\n", 608 | " tf_test_dataset = tf.constant(test_dataset)\n", 609 | "\n", 610 | " # Variables.\n", 611 | " W_1 = tf.Variable(tf.truncated_normal([image_size * image_size, hidden_layer_size]))\n", 612 | " b_1 = tf.Variable(tf.zeros([hidden_layer_size]))\n", 613 | " W_2 = tf.Variable(tf.truncated_normal([hidden_layer_size, num_labels]))\n", 614 | " b_2 = tf.Variable(tf.constant(0.1, shape=[num_labels])) # Initialize > 0 to avoid dead neurons\n", 615 | " \n", 616 | " # Model\n", 617 | " h_relu = tf.nn.relu(tf.matmul(tf_train_dataset, W_1) + b_1)\n", 618 | " logits = tf.matmul(h_relu, W_2) + b_2\n", 619 | " loss = tf.reduce_mean(\n", 620 | " tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))\n", 621 | "\n", 622 | " # Optimizer.\n", 623 | " optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)\n", 624 | "\n", 625 | " # Predictions for the training, validation, and test data.\n", 626 | " train_prediction = tf.nn.softmax(logits)\n", 627 | " valid_prediction = tf.nn.softmax(\n", 628 | " tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, W_1) + b_1), W_2) + b_2)\n", 629 | " test_prediction = tf.nn.softmax(\n", 630 | " tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, W_1) + b_1), W_2) + b_2)" 631 | ] 632 | }, 633 | { 634 | "cell_type": "code", 635 | "execution_count": 9, 636 | "metadata": { 637 | "collapsed": false 638 | }, 639 | "outputs": [ 640 | { 641 | "name": "stdout", 642 | "output_type": "stream", 643 | "text": [ 644 | "Initialized\n", 645 | "Minibatch loss at step 0: 382.219482\n", 646 | "Minibatch accuracy: 4.7%\n", 647 | "Validation accuracy: 23.5%\n", 648 | "Minibatch loss at step 500: 16.568413\n", 649 | "Minibatch accuracy: 79.7%\n", 650 | "Validation accuracy: 82.4%\n", 651 | "Minibatch loss at step 1000: 4.991648\n", 652 | "Minibatch accuracy: 87.5%\n", 653 | "Validation accuracy: 81.1%\n", 654 | "Minibatch loss at step 1500: 10.653100\n", 655 | "Minibatch accuracy: 82.8%\n", 656 | "Validation accuracy: 80.5%\n", 657 | "Minibatch loss at step 2000: 2.832264\n", 658 | "Minibatch accuracy: 86.7%\n", 659 | "Validation accuracy: 81.8%\n", 660 | "Minibatch loss at step 2500: 3.929306\n", 661 | "Minibatch accuracy: 77.3%\n", 662 | "Validation accuracy: 80.8%\n", 663 | "Minibatch loss at step 3000: 3.857058\n", 664 | "Minibatch accuracy: 80.5%\n", 665 | "Validation accuracy: 82.8%\n", 666 | "Test accuracy: 89.2%\n" 667 | ] 668 | } 669 | ], 670 | "source": [ 671 | "num_steps = 3001\n", 672 | "\n", 673 | "with tf.Session(graph=graph) as session:\n", 674 | " tf.initialize_all_variables().run()\n", 675 | " print(\"Initialized\")\n", 676 | " for step in range(num_steps):\n", 677 | " # Pick an offset within the training data, which has been randomized.\n", 678 | " # Note: we could use better randomization across epochs.\n", 679 | " offset = (step * batch_size) % (train_labels.shape[0] - batch_size)\n", 680 | " # Generate a minibatch.\n", 681 | " batch_data = train_dataset[offset:(offset + batch_size), :]\n", 682 | " batch_labels = train_labels[offset:(offset + batch_size), :]\n", 683 | " # Prepare a dictionary telling the session where to feed the minibatch.\n", 684 | " # The key of the dictionary is the placeholder node of the graph to be fed,\n", 685 | " # and the value is the numpy array to feed to it.\n", 686 | " feed_dict = {\n", 687 | " tf_train_dataset: batch_data, tf_train_labels: batch_labels}\n", 688 | " _, l, predictions = session.run(\n", 689 | " [optimizer, loss, train_prediction], feed_dict=feed_dict)\n", 690 | " if (step % 500 == 0):\n", 691 | " print(\"Minibatch loss at step %d: %f\" % (step, l))\n", 692 | " print(\"Minibatch accuracy: %.1f%%\" %\n", 693 | " accuracy(predictions, batch_labels))\n", 694 | " print(\"Validation accuracy: %.1f%%\" % accuracy(\n", 695 | " valid_prediction.eval(), valid_labels))\n", 696 | " print(\"Test accuracy: %.1f%%\" %\n", 697 | " accuracy(test_prediction.eval(), test_labels))" 698 | ] 699 | } 700 | ], 701 | "metadata": { 702 | "colab": { 703 | "default_view": {}, 704 | "name": "2_fullyconnected.ipynb", 705 | "provenance": [], 706 | "version": "0.3.2", 707 | "views": {} 708 | }, 709 | "kernelspec": { 710 | "display_name": "Python 3", 711 | "language": "python", 712 | "name": "python3" 713 | }, 714 | "language_info": { 715 | "codemirror_mode": { 716 | "name": "ipython", 717 | "version": 3 718 | }, 719 | "file_extension": ".py", 720 | "mimetype": "text/x-python", 721 | "name": "python", 722 | "nbconvert_exporter": "python", 723 | "pygments_lexer": "ipython3", 724 | "version": "3.5.1" 725 | } 726 | }, 727 | "nbformat": 4, 728 | "nbformat_minor": 0 729 | } 730 | --------------------------------------------------------------------------------