├── LICENSE2 ├── Verify Installation.ipynb ├── README.md ├── LICENSE1 ├── Notebook 4 TextClassification - Sentiment Analysis.ipynb ├── Notebook 1 - Newsflash.ipynb ├── Notebook 3 - Movie Critics.ipynb └── Notebook 2 - Raining Cats and Dogs.ipynb /LICENSE2: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 François Chollet 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /Verify Installation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "'1.8.0'" 12 | ] 13 | }, 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "output_type": "execute_result" 17 | } 18 | ], 19 | "source": [ 20 | "import tensorflow as tf\n", 21 | "tf.__version__" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "name": "stderr", 31 | "output_type": "stream", 32 | "text": [ 33 | "Using TensorFlow backend.\n" 34 | ] 35 | }, 36 | { 37 | "data": { 38 | "text/plain": [ 39 | "'2.2.0'" 40 | ] 41 | }, 42 | "execution_count": 2, 43 | "metadata": {}, 44 | "output_type": "execute_result" 45 | } 46 | ], 47 | "source": [ 48 | "import keras\n", 49 | "keras.__version__" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 7, 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/plain": [ 60 | "'5.2.0'" 61 | ] 62 | }, 63 | "execution_count": 7, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ], 68 | "source": [ 69 | "import PIL\n", 70 | "PIL.__version__" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [] 79 | } 80 | ], 81 | "metadata": { 82 | "kernelspec": { 83 | "display_name": "Python 2", 84 | "language": "python", 85 | "name": "python2" 86 | }, 87 | "language_info": { 88 | "codemirror_mode": { 89 | "name": "ipython", 90 | "version": 2 91 | }, 92 | "file_extension": ".py", 93 | "mimetype": "text/x-python", 94 | "name": "python", 95 | "nbconvert_exporter": "python", 96 | "pygments_lexer": "ipython2", 97 | "version": "2.7.15" 98 | } 99 | }, 100 | "nbformat": 4, 101 | 
"nbformat_minor": 2 102 | } 103 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deep Learning with Keras - Hands-on Workshop 2 | 3 | ## What to expect: 4 | 5 | This workshop is intended for an audience that is new to Keras. 6 | 7 | Workshop Agenda: 8 | 9 | Section 1: 10 | 11 | * Introduction to Keras 12 | * How to build and train a model using Keras APIs 13 | * Using Callbacks 14 | 15 | 16 | Section 2 17 | 18 | * Building a CNN model using Keras Layers 19 | * Data augmentation with ImageDataGenerator 20 | * Feature extraction with pre-trained CNN model 21 | * Introduction to Fine Tuning 22 | 23 | 24 | Section 3 25 | 26 | * Word Embeddings in Keras 27 | * Using pre-trained word embeddings with the Embedding layer 28 | * Introduction to RNN layers 29 | 30 | 31 | Section 4 32 | 33 | * Workflow for solving Text Classification problems 34 | * Using an N-gram model 35 | * Using a Sequence model 36 | 37 | 38 | 39 | ## Installation instructions 40 | 41 | Required packages: 42 | 43 | * Install [Anaconda](https://www.anaconda.com/download/#macos) and create an environment that you can use for this workshop. 44 | 45 | * Install [TensorFlow](https://www.tensorflow.org/install/). There is a section specific to installing TensorFlow in a conda environment. 46 | 47 | * Install [Keras](https://keras.io/#installation). 48 | 49 | * Install [Pillow](https://pypi.org/project/Pillow/). 50 | 51 | * Install [Pandas](https://pandas.pydata.org/pandas-docs/stable/install.html) 52 | 53 | * Confirm that TensorBoard is installed. 54 | 55 | Required Datasets: 56 | 57 | * Download the Cats Vs Dogs dataset from the [Kaggle website](https://www.kaggle.com/c/dogs-vs-cats/data). You will have to create an account if you don't have one already. 58 | 59 | * Download the GloVe pre-trained embedding from the [GloVe website](https://nlp.stanford.edu/projects/glove/). 
60 | 61 | * Download the IMDB dataset from this [website](http://ai.stanford.edu/~amaas/data/sentiment/) 62 | 63 | * Download the Rotten Tomatoes reviews from the [Kaggle website](https://www.kaggle.com/c/3810/download/train.tsv.zip) 64 | 65 | ## Verify Installation: 66 | 67 | * git clone https://github.com/anj-s/kdd2018.git 68 | 69 | * Activate your conda environment 70 | 71 | * Start the Jupyter notebook by running "jupyter notebook" 72 | 73 | * Verify that you can run Keras and TensorFlow by running the "Verify Installation" jupyter notebook. 74 | 75 | NOTE: 76 | During the workshop you will need to modify the data directory paths in the notebooks since you will be pointing to a local directory. 77 | 78 | -------------------------------------------------------------------------------- /LICENSE1: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Notebook 4 TextClassification - Sentiment Analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Import Keras and verify that the TensorFlow backend is set as the default.\n", 10 | "import keras\n", 11 | "keras.__version__" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## Machine Learning Guide for Text Classification\n", 19 | "This has been adapted from the [text classification guide](https://developers.google.com/machine-learning/guides/text-classification/)." 
20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "# S/W < 1500 -> MLP Model" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "import os\n", 36 | "import random\n", 37 | "import numpy as np\n", 38 | "import sklearn\n", 39 | "import tensorflow as tf\n", 40 | "from tensorflow.python.keras.preprocessing import sequence\n", 41 | "from tensorflow.python.keras.preprocessing import text\n", 42 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 43 | "from sklearn.feature_selection import SelectKBest\n", 44 | "from sklearn.feature_selection import f_classif" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "# The first step is to gather data. The more training examples you\n", 54 | "# have the better you will be able to train a generalized model.\n", 55 | "# You should also make sure that the samples for every class are not\n", 56 | "# imbalanced. There should be a fairly even representation of\n", 57 | "# all samples. 
To illustrate this workflow we will use the IMDB\n", 58 | "# dataset.\n", 59 | "data_path = '/Users/anjalisridhar/kdd2018/workshop/datasets'\n", 60 | "imdb_data_path = os.path.join(data_path, 'aclImdb')\n", 61 | "seed = 123\n", 62 | "# Load the training data\n", 63 | "train_texts = []\n", 64 | "train_labels = []\n", 65 | "for category in ['pos', 'neg']:\n", 66 | " train_path = os.path.join(imdb_data_path, 'train', category)\n", 67 | " for fname in sorted(os.listdir(train_path)):\n", 68 | " if fname.endswith('.txt'):\n", 69 | " with open(os.path.join(train_path, fname)) as f:\n", 70 | " train_texts.append(f.read())\n", 71 | " train_labels.append(0 if category == 'neg' else 1)\n", 72 | "\n", 73 | "# Load the validation data.\n", 74 | "test_texts = []\n", 75 | "test_labels = []\n", 76 | "for category in ['pos', 'neg']:\n", 77 | " test_path = os.path.join(imdb_data_path, 'test', category)\n", 78 | " for fname in sorted(os.listdir(test_path)):\n", 79 | " if fname.endswith('.txt'):\n", 80 | " with open(os.path.join(test_path, fname)) as f:\n", 81 | " test_texts.append(f.read())\n", 82 | " test_labels.append(0 if category == 'neg' else 1)\n", 83 | "\n", 84 | "# Shuffle the training data and labels. 
\n", 85 | "# The data gathered may be in a specific order and we should\n", 86 | "# shuffle the data before doing anything else.\n", 87 | "random.seed(seed)\n", 88 | "random.shuffle(train_texts)\n", 89 | "random.seed(seed)\n", 90 | "random.shuffle(train_labels)\n", 91 | "\n", 92 | "data = ((train_texts, np.array(train_labels)),\n", 93 | " (test_texts, np.array(test_labels)))" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "# Get the data.\n", 103 | "(train_texts, train_labels), (val_texts, val_labels) = data" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "# Verify that validation labels are in the same range as training labels.\n", 113 | "num_classes = max(train_labels) + 1\n", 114 | "missing_classes = [i for i in range(num_classes) if i not in train_labels]\n", 115 | "if len(missing_classes):\n", 116 | " raise ValueError('Missing samples with label value(s) '\n", 117 | " '{missing_classes}. Please make sure you have '\n", 118 | " 'at least one sample for every label value '\n", 119 | " 'in the range(0, {max_class})'.format(\n", 120 | " missing_classes=missing_classes,\n", 121 | " max_class=num_classes - 1))\n", 122 | "\n", 123 | "if num_classes <= 1:\n", 124 | " raise ValueError('Invalid number of labels: {num_classes}.'\n", 125 | " 'Please make sure there are at least two classes '\n", 126 | " 'of samples'.format(num_classes=num_classes))" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "unexpected_labels = [v for v in val_labels if v not in range(num_classes)]\n", 136 | "if len(unexpected_labels):\n", 137 | " raise ValueError('Unexpected label values found in the validation set:'\n", 138 | " ' {unexpected_labels}. 
Please make sure that the '\n", 139 | " 'labels in the validation set are in the same range '\n", 140 | " 'as training labels.'.format(\n", 141 | " unexpected_labels=unexpected_labels))" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "# Tokenization and Vectorization\n", 151 | "# Vectorization parameters\n", 152 | "# Range (inclusive) of n-gram sizes for tokenizing text.\n", 153 | "NGRAM_RANGE = (1, 2)\n", 154 | "# Whether text should be split into word or character n-grams.\n", 155 | "# One of 'word', 'char'.\n", 156 | "TOKEN_MODE = 'word'\n", 157 | "# Minimum document/corpus frequency below which a token will be discarded.\n", 158 | "MIN_DOCUMENT_FREQUENCY = 2\n", 159 | "# Limit on the number of features. We use the top 20K features.\n", 160 | "TOP_K = 20000\n", 161 | "# Create keyword arguments to pass to the 'tf-idf' vectorizer.\n", 162 | "kwargs = {\n", 163 | " 'ngram_range': NGRAM_RANGE, # Use 1-grams + 2-grams.\n", 164 | " 'dtype': 'int32',\n", 165 | " 'strip_accents': 'unicode',\n", 166 | " 'decode_error': 'replace',\n", 167 | " 'analyzer': TOKEN_MODE, # Split text into word tokens.\n", 168 | " 'min_df': MIN_DOCUMENT_FREQUENCY,\n", 169 | "}\n", 170 | "# Tokenizing samples into unigrams + bigrams provides good accuracy\n", 171 | "# while taking less compute time.\n", 172 | "# We use Tf-idf encoding for vectorization. This does better than\n", 173 | "# one-hot encoding and count encoding in terms of accuracy\n", 174 | "# (on average: 0.25-15% higher). 
Tf-idf uses floating point \n", 175 | "# representation and takes more time to compute and uses more\n", 176 | "# memory.\n", 177 | "vectorizer = TfidfVectorizer(**kwargs)\n", 178 | "\n", 179 | "# Learn vocabulary from training texts and vectorize training texts.\n", 180 | "x_train = vectorizer.fit_transform(train_texts)\n", 181 | "\n", 182 | "# Vectorize validation texts.\n", 183 | "x_val = vectorizer.transform(val_texts)\n", 184 | "\n", 185 | "# When we convert texts to tokens we may end up with a large\n", 186 | "# number of tokens. We want to drop rarely occurring tokens\n", 187 | "# as well as tokens that don't contribute heavily to label \n", 188 | "# predictions. \n", 189 | "# We use the `f_classif` function to identify the top 20K features.\n", 190 | "# Select top 'k' of the vectorized features.\n", 191 | "selector = SelectKBest(f_classif, k=min(TOP_K, x_train.shape[1]))\n", 192 | "selector.fit(x_train, train_labels)\n", 193 | "x_train = selector.transform(x_train)\n", 194 | "x_val = selector.transform(x_val)\n", 195 | "\n", 196 | "x_train = x_train.astype('float32')\n", 197 | "x_val = x_val.astype('float32')" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "# Create model instance.\n", 207 | "learning_rate=1e-3\n", 208 | "#TODO: Do I have to train epochs=1000?\n", 209 | "# Try for more epochs\n", 210 | "epochs=10\n", 211 | "batch_size=128\n", 212 | "layers=2\n", 213 | "units=64\n", 214 | "dropout_rate=0.2\n", 215 | "input_shape=x_train.shape[1:]\n", 216 | "\n", 217 | "if num_classes == 2:\n", 218 | " op_activation = 'sigmoid'\n", 219 | " op_units = 1\n", 220 | "else:\n", 221 | " op_activation = 'softmax'\n", 222 | " op_units = num_classes\n", 223 | "\n", 224 | "model = keras.models.Sequential()\n", 225 | "model.add(keras.layers.Dropout(rate=dropout_rate, input_shape=input_shape))\n", 226 | "\n", 227 | "for _ in range(layers-1):\n", 228 | " 
model.add(keras.layers.Dense(units=units, activation='relu'))\n", 229 | " model.add(keras.layers.Dropout(rate=dropout_rate))\n", 230 | "\n", 231 | "model.add(keras.layers.Dense(units=op_units, activation=op_activation))" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "# Compile model with learning parameters.\n", 241 | "if num_classes == 2:\n", 242 | " loss = 'binary_crossentropy'\n", 243 | "else:\n", 244 | " loss = 'sparse_categorical_crossentropy'\n", 245 | "optimizer = keras.optimizers.Adam(lr=learning_rate)\n", 246 | "model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "# Create callback for early stopping on validation loss. If the loss does\n", 256 | "# not decrease in two consecutive tries, stop training.\n", 257 | "callbacks = [keras.callbacks.EarlyStopping(\n", 258 | " monitor='val_loss', patience=2)]\n", 259 | "\n", 260 | "# Train and validate model.\n", 261 | "history = model.fit(\n", 262 | " x_train,\n", 263 | " train_labels,\n", 264 | " epochs=epochs,\n", 265 | " callbacks=callbacks,\n", 266 | " validation_data=(x_val, val_labels),\n", 267 | " verbose=2, # Logs once per epoch.\n", 268 | " batch_size=batch_size)\n", 269 | "\n", 270 | "# Print results.\n", 271 | "history = history.history\n", 272 | "print('Validation accuracy: {acc}, loss: {loss}'.format(\n", 273 | " acc=history['val_acc'][-1], loss=history['val_loss'][-1]))\n", 274 | "\n", 275 | "# Save model.\n", 276 | "model.save('imdb_mlp_model.h5')\n", 277 | "print(history['val_acc'][-1], history['val_loss'][-1])" 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "metadata": {}, 283 | "source": [ 284 | "# S/W > 1500 -> sepCNN Model" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | 
"metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "import os, shutil, zipfile\n", 294 | "import random\n", 295 | "\n", 296 | "import numpy as np\n", 297 | "import pandas as pd\n", 298 | "import keras" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "columns = (2, 3) # 2 - Phrases, 3 - Sentiment.\n", 308 | "data_path = '/Users/anjalisridhar/kdd2018/workshop/datasets'\n", 309 | "file_name = 'rotten_tomatoes_train.tsv'\n", 310 | "seed = 123\n", 311 | "validation_split = 0.2\n", 312 | "separator = '\\t'\n", 313 | "header = 0\n", 314 | "\n", 315 | "# Using the Rotten tomatoes movie reviews dataset to demonstrate\n", 316 | "# training sequence model.\n", 317 | "np.random.seed(seed)\n", 318 | "data_path = os.path.join(data_path, file_name)\n", 319 | "data = pd.read_csv(data_path, usecols=columns, sep=separator, header=header)\n", 320 | "data = data.reindex(np.random.permutation(data.index))\n", 321 | "\n", 322 | "# Get the review phrase and sentiment values.\n", 323 | "texts = list(data['Phrase'])\n", 324 | "labels = np.array(data['Sentiment'])\n", 325 | "num_training_samples = int((1 - validation_split) * len(texts))\n", 326 | "data = ((texts[:num_training_samples], labels[:num_training_samples]),\n", 327 | " (texts[num_training_samples:], labels[num_training_samples:]))" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "learning_rate=1e-3\n", 337 | "epochs=1000\n", 338 | "batch_size=128\n", 339 | "blocks=2\n", 340 | "filters=64\n", 341 | "dropout_rate=0.2\n", 342 | "embedding_dim=200\n", 343 | "kernel_size=3\n", 344 | "pool_size=3\n", 345 | "\n", 346 | "(train_texts, train_labels), (val_texts, val_labels) = data\n", 347 | "\n", 348 | "# Verify that validation labels are in the same range as training labels.\n", 349 | "num_classes = max(train_labels) + 1\n", 
350 | "missing_classes = [i for i in range(num_classes) if i not in train_labels]\n", 351 | "if len(missing_classes):\n", 352 | " raise ValueError('Missing samples with label value(s) '\n", 353 | " '{missing_classes}. Please make sure you have '\n", 354 | " 'at least one sample for every label value '\n", 355 | " 'in the range(0, {max_class})'.format(\n", 356 | " missing_classes=missing_classes,\n", 357 | " max_class=num_classes - 1))\n", 358 | "\n", 359 | "if num_classes <= 1:\n", 360 | " raise ValueError('Invalid number of labels: {num_classes}.'\n", 361 | " 'Please make sure there are at least two classes '\n", 362 | " 'of samples'.format(num_classes=num_classes))\n", 363 | "unexpected_labels = [v for v in val_labels if v not in range(num_classes)]\n", 364 | "if len(unexpected_labels):\n", 365 | " raise ValueError('Unexpected label values found in the validation set:'\n", 366 | " ' {unexpected_labels}. Please make sure that the '\n", 367 | " 'labels in the validation set are in the same range '\n", 368 | " 'as training labels.'.format(\n", 369 | " unexpected_labels=unexpected_labels))" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "from tensorflow.python.keras.preprocessing import sequence\n", 379 | "from tensorflow.python.keras.preprocessing import text\n", 380 | "\n", 381 | "# Vectorize texts.\n", 382 | "# Limit on the number of features. We use the top 20K features.\n", 383 | "TOP_K = 20000\n", 384 | "\n", 385 | "# Limit on the length of text sequences. Sequences longer than this\n", 386 | "# will be truncated.\n", 387 | "MAX_SEQUENCE_LENGTH = 500\n", 388 | "\n", 389 | "# We need to convert our text samples into numerical vectors.\n", 390 | "# We first build a vocabulary of the 20K most frequently occurring\n", 391 | "# words. 
Each word in the vocab is associated with an index.\n", 392 | "# Create vocabulary with training texts.\n", 393 | "tokenizer = text.Tokenizer(num_words=TOP_K)\n", 394 | "tokenizer.fit_on_texts(train_texts)\n", 395 | "\n", 396 | "# Vectorize training and validation texts.\n", 397 | "x_train = tokenizer.texts_to_sequences(train_texts)\n", 398 | "x_val = tokenizer.texts_to_sequences(val_texts)\n", 399 | "\n", 400 | "# Get max sequence length.\n", 401 | "max_length = len(max(x_train, key=len))\n", 402 | "if max_length > MAX_SEQUENCE_LENGTH:\n", 403 | " max_length = MAX_SEQUENCE_LENGTH\n", 404 | "\n", 405 | "# Fix sequence length to max value. Sequences shorter than the length are\n", 406 | "# padded in the beginning and sequences longer are truncated\n", 407 | "# at the beginning.\n", 408 | "x_train = sequence.pad_sequences(x_train, maxlen=max_length)\n", 409 | "x_val = sequence.pad_sequences(x_val, maxlen=max_length)\n", 410 | "word_index = tokenizer.word_index\n", 411 | "\n", 412 | "# Number of features will be the embedding input dimension. 
Add 1 for the\n", 413 | "# reserved index 0.\n", 414 | "num_features = min(len(word_index) + 1, TOP_K)" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": null, 420 | "metadata": {}, 421 | "outputs": [], 422 | "source": [ 423 | "from keras import models\n", 424 | "from keras import initializers\n", 425 | "from keras import regularizers\n", 426 | "\n", 427 | "from keras.layers import Dense\n", 428 | "from keras.layers import Dropout\n", 429 | "from keras.layers import Embedding\n", 430 | "from keras.layers import SeparableConv1D\n", 431 | "from keras.layers import MaxPooling1D\n", 432 | "from keras.layers import GlobalAveragePooling1D\n", 433 | "\n", 434 | "# Create model instance.\n", 435 | "input_shape=x_train.shape[1:]\n", 436 | "use_pretrained_embedding=False\n", 437 | "is_embedding_trainable=False\n", 438 | "embedding_matrix=None\n", 439 | "\n", 440 | "if num_classes == 2:\n", 441 | " op_activation = 'sigmoid'\n", 442 | " op_units = 1\n", 443 | "else:\n", 444 | " op_activation = 'softmax'\n", 445 | " op_units = num_classes\n", 446 | "\n", 447 | "model = models.Sequential()\n", 448 | "\n", 449 | "# Add embedding layer. If pre-trained embedding is used add weights to the\n", 450 | "# embeddings layer and set trainable to input is_embedding_trainable flag.\n", 451 | "# Sequence models often have such an embedding layer as their first layer. 
\n", 452 | "# This layer learns to turn word index sequences into word embedding vectors \n", 453 | "# during the training process, such that each word index gets mapped to a \n", 454 | "# dense vector of real values representing that word’s location in semantic space.\n", 455 | "if use_pretrained_embedding:\n", 456 | " model.add(Embedding(input_dim=num_features,\n", 457 | " output_dim=embedding_dim,\n", 458 | " input_length=input_shape[0],\n", 459 | " weights=[embedding_matrix],\n", 460 | " trainable=is_embedding_trainable))\n", 461 | "else:\n", 462 | " model.add(Embedding(input_dim=num_features,\n", 463 | " output_dim=embedding_dim,\n", 464 | " input_length=input_shape[0]))\n", 465 | "\n", 466 | "for _ in range(blocks-1):\n", 467 | " model.add(Dropout(rate=dropout_rate))\n", 468 | " model.add(SeparableConv1D(filters=filters,\n", 469 | " kernel_size=kernel_size,\n", 470 | " activation='relu',\n", 471 | " bias_initializer='random_uniform',\n", 472 | " depthwise_initializer='random_uniform',\n", 473 | " padding='same'))\n", 474 | " model.add(SeparableConv1D(filters=filters,\n", 475 | " kernel_size=kernel_size,\n", 476 | " activation='relu',\n", 477 | " bias_initializer='random_uniform',\n", 478 | " depthwise_initializer='random_uniform',\n", 479 | " padding='same'))\n", 480 | " model.add(MaxPooling1D(pool_size=pool_size))\n", 481 | "\n", 482 | "model.add(SeparableConv1D(filters=filters * 2,\n", 483 | " kernel_size=kernel_size,\n", 484 | " activation='relu',\n", 485 | " bias_initializer='random_uniform',\n", 486 | " depthwise_initializer='random_uniform',\n", 487 | " padding='same'))\n", 488 | "model.add(SeparableConv1D(filters=filters * 2,\n", 489 | " kernel_size=kernel_size,\n", 490 | " activation='relu',\n", 491 | " bias_initializer='random_uniform',\n", 492 | " depthwise_initializer='random_uniform',\n", 493 | " padding='same'))\n", 494 | "model.add(GlobalAveragePooling1D())\n", 495 | "model.add(Dropout(rate=dropout_rate))\n", 496 | "model.add(Dense(op_units, 
activation=op_activation))" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": null, 502 | "metadata": {}, 503 | "outputs": [], 504 | "source": [ 505 | "# Compile model with learning parameters.\n", 506 | "if num_classes == 2:\n", 507 | " loss = 'binary_crossentropy'\n", 508 | "else:\n", 509 | " loss = 'sparse_categorical_crossentropy'\n", 510 | "optimizer = keras.optimizers.Adam(lr=learning_rate)\n", 511 | "model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])\n", 512 | "\n", 513 | "# Create callback for early stopping on validation loss. If the loss does\n", 514 | "# not decrease in two consecutive tries, stop training.\n", 515 | "callbacks = [keras.callbacks.EarlyStopping(\n", 516 | " monitor='val_loss', patience=2)]\n", 517 | "\n", 518 | "# Train and validate model.\n", 519 | "history = model.fit(\n", 520 | " x_train,\n", 521 | " train_labels,\n", 522 | " epochs=epochs,\n", 523 | " callbacks=callbacks,\n", 524 | " validation_data=(x_val, val_labels),\n", 525 | " verbose=2, # Logs once per epoch.\n", 526 | " batch_size=batch_size)\n", 527 | "\n", 528 | "# Print results.\n", 529 | "history = history.history\n", 530 | "print('Validation accuracy: {acc}, loss: {loss}'.format(\n", 531 | " acc=history['val_acc'][-1], loss=history['val_loss'][-1]))\n", 532 | "\n", 533 | "# Save model.\n", 534 | "model.save('rotten_tomatoes_sepcnn_model.h5')\n", 535 | "print(history['val_acc'][-1], history['val_loss'][-1])" 536 | ] 537 | } 538 | ], 539 | "metadata": { 540 | "kernelspec": { 541 | "display_name": "Python 2", 542 | "language": "python", 543 | "name": "python2" 544 | }, 545 | "language_info": { 546 | "codemirror_mode": { 547 | "name": "ipython", 548 | "version": 2 549 | }, 550 | "file_extension": ".py", 551 | "mimetype": "text/x-python", 552 | "name": "python", 553 | "nbconvert_exporter": "python", 554 | "pygments_lexer": "ipython2", 555 | "version": "2.7.15" 556 | } 557 | }, 558 | "nbformat": 4, 559 | "nbformat_minor": 2 560 | } 561 | 
-------------------------------------------------------------------------------- /Notebook 1 - Newsflash.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Import Keras and verify that the TensorFlow backend is set as the default.\n", 10 | "import keras\n", 11 | "keras.__version__" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "# Notebook 1" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## Classifying newswires: a multi-class classification example\n", 26 | "\n", 27 | "This notebook contains code samples that have been adapted from Francois Chollet's Deep Learning With Python.\n", 28 | "\n", 29 | "----\n", 30 | "In this notebook we are going to introduce how to build a [`Sequential`](https://keras.io/getting-started/sequential-model-guide/) model using [`Dense`](https://keras.io/layers/core/) layers. A Sequential model is a linear stack of layers with a single input and output. A Dense layer is a fully connected layer in your model. We are also going to introduce the following APIs:\n", 31 | "* `compile`\n", 32 | "* `fit`\n", 33 | "* `evaluate`\n", 34 | "* `predict`\n", 35 | "\n", 36 | "We will also introduce `callbacks` and how you can use them as part of model training. " 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "## Problem\n", 44 | "To get familiar with Keras APIs we are going to build a network to classify Reuters newswires into 46 different mutually-exclusive topics. 
Since we have many \n", 45 | "classes, this problem is an instance of \"multi-class classification\", and since each data point should be classified into only one \n", 46 | "category, the problem is more specifically an instance of \"single-label, multi-class classification\"." 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "## Dataset\n", 54 | "\n", 55 | "\n", 56 | "We will be working with the _Reuters dataset_, a set of short newswires and their topics, published by Reuters in 1986. It's a very simple, \n", 57 | "widely used toy dataset for text classification. There are 46 different topics; some topics are more represented than others, but each topic has at least 10 examples in the training set.\n", 58 | "\n", 59 | "A number of datasets come packaged as part of Keras. A few examples of these datasets can be found [here](https://keras.io/datasets/). Some example datasets are IMDB, MNIST, CIFAR10 etc. The _Reuters dataset_ also comes prepackaged with Keras. Let's take a look at what the data looks like." 
60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "from keras.datasets import reuters\n", 69 | "\n", 70 | "(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "The argument `num_words=10000` restricts the data to the 10,000 most frequently occurring words found in the data.\n", 78 | "\n", 79 | "We have 8,982 training examples and 2,246 test examples:" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "len(train_data)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "len(test_data)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "Each example is a list of integers (word indices):" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "train_data[10]" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "Here's how you can decode it back to words, in case you are curious:" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "word_index = reuters.get_word_index()\n", 130 | "reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])\n", 131 | "# Note that our indices were offset by 3\n", 132 | "# because 0, 1 and 2 are reserved indices for \"padding\", \"start of sequence\", and \"unknown\".\n", 133 | "decoded_newswire = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[0]])" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | 
"execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "decoded_newswire" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "Since we limit the number of word index dictionary to the first 10000 words, there is a chance\n", 150 | "that the newswire will have words that do not have an index associated with it. This is what the ? symbol\n", 151 | "represents in the above newswire." 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "The label associated with an example is an integer between 0 and 45: a topic index." 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "train_labels[10]" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "## Preparing the data\n", 175 | "\n", 176 | "We can vectorize the data with the exact same code as in our previous example:" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "import numpy as np\n", 186 | "\n", 187 | "def vectorize_sequences(sequences, dimension=10000):\n", 188 | " results = np.zeros((len(sequences), dimension))\n", 189 | " for i, sequence in enumerate(sequences):\n", 190 | " results[i, sequence] = 1.\n", 191 | " return results\n", 192 | "\n", 193 | "# Our vectorized training data\n", 194 | "x_train = vectorize_sequences(train_data)\n", 195 | "# Our vectorized test data\n", 196 | "x_test = vectorize_sequences(test_data)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "x_train[0][:10]" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "\n", 213 | "To vectorize the labels, there are two 
possibilities: we could just cast the label list as an integer tensor, or we could use a \"one-hot\" \n", 214 | "encoding. One-hot encoding of our labels consists in embedding each label as an all-zero vector with a 1 in the place of the label index. You can do this using a built-in Keras function:" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "from keras.utils.np_utils import to_categorical\n", 224 | "\n", 225 | "one_hot_train_labels = to_categorical(train_labels)\n", 226 | "one_hot_test_labels = to_categorical(test_labels)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "## Building our network\n", 234 | "\n", 235 | "\n", 236 | "In this topic classification problem we are trying to classify short snippets of text. The difference between a binary classification problem and this one is that the number of output classes is 46, i.e. the dimensionality of the output space is much larger. (A binary classification problem is one in which there are two possible output classes.)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "## Build a Sequential model:" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "from keras import models\n", 253 | "from keras import layers\n", 254 | "\n", 255 | "# Instantiate a Sequential Model\n", 256 | "model = models.Sequential()\n", 257 | "# Add 2 Dense layers of 64 units each to the model. 
Let `relu` be the activation function.\n", 258 | "# Note: Specify the input_shape argument for the first layer.\n", 259 | "model.add(layers.Dense(64, activation='relu', input_shape=(10000,)))\n", 260 | "model.add(layers.Dense(64, activation='relu'))\n", 261 | "# Add a final Dense layer that classifies the output\n", 262 | "# Note: You should use 46 units since our output dimension is going to be the number of output classes. \n", 263 | "# Let `softmax` be the activation function.\n", 264 | "model.add(layers.Dense(46, activation='softmax'))" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "Some things to note about this architecture:\n", 272 | "\n", 273 | "* We are ending the network with a `Dense` layer of size 46. This means that for each input sample, our network will output a \n", 274 | "46-dimensional vector. Each entry in this vector (each dimension) will encode a different output class.\n", 275 | "* We are using the ReLU function for activation.\n", 276 | "* The last layer uses a `softmax` activation. It means that the network will \n", 277 | "output a _probability distribution_ over the 46 different output classes, i.e. for every input sample, the network will produce a \n", 278 | "46-dimensional output vector where `output[i]` is the probability that the sample belongs to class `i`. The 46 scores will sum to 1.\n", 279 | "\n" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "## Compile the model:" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "Compiling the model is \"freezing\" the model with certain attributes set such as `loss`, `optimizer`, `metrics` etc. 
" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "# Compile the model with a \"rmsprop\" optimizer, \"categorical_crossentropy\" loss and \"accuracy\" \\\n", 303 | "# metric.\n", 304 | "model.compile(optimizer='rmsprop',\n", 305 | " loss='categorical_crossentropy',\n", 306 | " metrics=['accuracy'])" 307 | ] 308 | }, 309 | { 310 | "cell_type": "markdown", 311 | "metadata": {}, 312 | "source": [ 313 | "In Keras you can specify the different parameters above as strings such as \"rmsprop\" or as keras.optimizers.RMSprop. You can do the same for loss and metrics. Here are a list of [optimizers](https://keras.io/optimizers/), [loss functions](https://keras.io/losses/) and [metrics](https://keras.io/metrics/) that are available in Keras. \n", 314 | "\n", 315 | "In our example above the best loss function to use `categorical_crossentropy` since it measures the distance between \n", 316 | "the probability distribution output by our network and the true distribution of the labels. By minimizing the \n", 317 | "distance between these two distributions, we train our network to output something as close as possible to the true labels.\n", 318 | "\n", 319 | "Another point to note is if we had used output labels as integer tensors instead of one-hot encoded vectors, then we should use`sparse_categorical_crossentropy` instead of `categorical_crossentropy`." 
320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [ 326 | "## Training our model\n", 327 | "\n", 328 | "Let's set apart 1,000 samples in our training data to use as a validation set:" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "# Split the training data into train and validation datasets.\n", 338 | "x_val = x_train[:1000]\n", 339 | "partial_x_train = x_train[1000:]\n", 340 | "\n", 341 | "y_val = one_hot_train_labels[:1000]\n", 342 | "partial_y_train = one_hot_train_labels[1000:]" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": {}, 348 | "source": [ 349 | "## Now let's train our network for 20 epochs:" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": null, 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "# Use the `fit` call with the above training and validation datasets. 
\n", 359 | "# Use can use a batch size of 512.\n", 360 | "history = model.fit(partial_x_train,\n", 361 | " partial_y_train,\n", 362 | " epochs=20,\n", 363 | " batch_size=512,\n", 364 | " validation_data=(x_val, y_val))" 365 | ] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "metadata": {}, 370 | "source": [ 371 | "Let's display its loss and accuracy curves:" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [ 380 | "import matplotlib.pyplot as plt\n", 381 | "\n", 382 | "loss = history.history['loss']\n", 383 | "val_loss = history.history['val_loss']\n", 384 | "\n", 385 | "epochs = range(1, len(loss) + 1)\n", 386 | "\n", 387 | "plt.plot(epochs, loss, 'bo', label='Training loss')\n", 388 | "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", 389 | "plt.title('Training and validation loss')\n", 390 | "plt.xlabel('Epochs')\n", 391 | "plt.ylabel('Loss')\n", 392 | "plt.legend()\n", 393 | "\n", 394 | "plt.show()" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "plt.clf() # clear figure\n", 404 | "\n", 405 | "acc = history.history['acc']\n", 406 | "val_acc = history.history['val_acc']\n", 407 | "\n", 408 | "plt.plot(epochs, acc, 'bo', label='Training acc')\n", 409 | "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", 410 | "plt.title('Training and validation accuracy')\n", 411 | "plt.xlabel('Epochs')\n", 412 | "plt.ylabel('Loss')\n", 413 | "plt.legend()\n", 414 | "\n", 415 | "plt.show()" 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "metadata": {}, 421 | "source": [ 422 | "It seems that the network starts overfitting after 8 epochs. 
\n", 423 | "\n", 424 | "## Let's train a new network from scratch for 8 epochs, then let's evaluate it on the test set:" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [ 433 | "# Use the same model from before and train the model as before. \n", 434 | "# However for this run let the number of epochs be 8.\n", 435 | "model = models.Sequential()\n", 436 | "model.add(layers.Dense(64, activation='relu', input_shape=(10000,)))\n", 437 | "model.add(layers.Dense(64, activation='relu'))\n", 438 | "model.add(layers.Dense(46, activation='softmax'))\n", 439 | "\n", 440 | "model.compile(optimizer='rmsprop',\n", 441 | " loss='categorical_crossentropy',\n", 442 | " metrics=['accuracy'])\n", 443 | "\n", 444 | "history = model.fit(partial_x_train,\n", 445 | " partial_y_train,\n", 446 | " epochs=8,\n", 447 | " batch_size=512,\n", 448 | " validation_data=(x_val, y_val))" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": {}, 454 | "source": [ 455 | "## Evaluate the model:" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": null, 461 | "metadata": {}, 462 | "outputs": [], 463 | "source": [ 464 | "# Use `evaluate` the model with the test training input(x_test) and \n", 465 | "# the one-hot encoded labels(one_hot_test_labels).\n", 466 | "results = model.evaluate(x_test, one_hot_test_labels)" 467 | ] 468 | }, 469 | { 470 | "cell_type": "markdown", 471 | "metadata": {}, 472 | "source": [ 473 | "\n", 474 | "Our approach reaches an accuracy of ~78%. 
With a balanced binary classification problem, the accuracy reached by a purely random classifier \n", 475 | "would be 50%, but in our case it is closer to 19%, so our results seem pretty good, at least when compared to a random baseline:" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": null, 481 | "metadata": {}, 482 | "outputs": [], 483 | "source": [ 484 | "import copy\n", 485 | "\n", 486 | "test_labels_copy = copy.copy(test_labels)\n", 487 | "np.random.shuffle(test_labels_copy)\n", 488 | "float(np.sum(np.array(test_labels) == np.array(test_labels_copy))) / len(test_labels)" 489 | ] 490 | }, 491 | { 492 | "cell_type": "markdown", 493 | "metadata": {}, 494 | "source": [ 495 | "## Exercise 1: Now let us try with a larger network with 128 Dense units:" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": null, 501 | "metadata": {}, 502 | "outputs": [], 503 | "source": [ 504 | "# Use the same model as before but with 128 units set for the first two Dense layers. `compile`, `fit` and `evaluate`\n", 505 | "# the model as before. 
You can also plot the loss and accuracy to see the behavior of the model.\n", 506 | "\n", 507 | "from keras import models\n", 508 | "from keras import layers\n", 509 | "\n", 510 | "# Define your model here\n", 511 | "# ...\n", 512 | "\n", 513 | "model.compile(optimizer='rmsprop',\n", 514 | " loss='categorical_crossentropy',\n", 515 | " metrics=['accuracy'])\n", 516 | "\n", 517 | "history = model.fit(partial_x_train,\n", 518 | " partial_y_train,\n", 519 | " epochs=20,\n", 520 | " batch_size=512,\n", 521 | " validation_data=(x_val, y_val))\n", 522 | "results = model.evaluate(x_test, one_hot_test_labels)\n", 523 | "\n", 524 | "plt.clf() # clear figure\n", 525 | "\n", 526 | "acc = history.history['acc']\n", 527 | "val_acc = history.history['val_acc']\n", 528 | "\n", 529 | "plt.plot(epochs, acc, 'bo', label='Training acc')\n", 530 | "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", 531 | "plt.title('Training and validation accuracy')\n", 532 | "plt.xlabel('Epochs')\n", 533 | "plt.ylabel('Loss')\n", 534 | "plt.legend()\n", 535 | "\n", 536 | "plt.show()" 537 | ] 538 | }, 539 | { 540 | "cell_type": "markdown", 541 | "metadata": {}, 542 | "source": [ 543 | "As you can see, with the larger network we get only a slight increase in accuracy; overall the network seems to perform about as well as it did previously." 544 | ] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "metadata": {}, 549 | "source": [ 550 | "## Exercise 2: Now let us train the model with a single Dense layer network:" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": null, 556 | "metadata": {}, 557 | "outputs": [], 558 | "source": [ 559 | "# Use the same model as before but with 1 unit set for a single Dense layer. `compile`, `fit` and `evaluate`\n", 560 | "# the model as before. 
You can also plot the loss and accuracy to see the behavior of the model.\n", 561 | "\n", 562 | "from keras import models\n", 563 | "from keras import layers\n", 564 | "\n", 565 | "# Define your model here\n", 566 | "# ...\n", 567 | "\n", 568 | "model.compile(optimizer='rmsprop',\n", 569 | " loss='categorical_crossentropy',\n", 570 | " metrics=['accuracy'])\n", 571 | "\n", 572 | "history = model.fit(partial_x_train,\n", 573 | " partial_y_train,\n", 574 | " epochs=20,\n", 575 | " batch_size=512,\n", 576 | " validation_data=(x_val, y_val))\n", 577 | "results = model.evaluate(x_test, one_hot_test_labels)\n", 578 | "\n", 579 | "plt.clf() # clear figure\n", 580 | "\n", 581 | "acc = history.history['acc']\n", 582 | "val_acc = history.history['val_acc']\n", 583 | "\n", 584 | "plt.plot(epochs, acc, 'bo', label='Training acc')\n", 585 | "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", 586 | "plt.title('Training and validation accuracy')\n", 587 | "plt.xlabel('Epochs')\n", 588 | "plt.ylabel('Loss')\n", 589 | "plt.legend()\n", 590 | "\n", 591 | "plt.show()" 592 | ] 593 | }, 594 | { 595 | "cell_type": "markdown", 596 | "metadata": {}, 597 | "source": [ 598 | "You can see that the accuracy stagnates at around 35% since the network is unable to learn any new information from\n", 599 | "the data." 600 | ] 601 | }, 602 | { 603 | "cell_type": "markdown", 604 | "metadata": {}, 605 | "source": [ 606 | "## Generating predictions on new data\n", 607 | "\n", 608 | "We can verify that the `predict` method of our model instance returns a probability distribution over all 46 topics. 
Let's generate topic \n", 609 | "predictions for all of the test data:" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": null, 615 | "metadata": {}, 616 | "outputs": [], 617 | "source": [ 618 | "predictions = model.predict(x_test)" 619 | ] 620 | }, 621 | { 622 | "cell_type": "markdown", 623 | "metadata": {}, 624 | "source": [ 625 | "Each entry in `predictions` is a vector of length 46:" 626 | ] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": null, 631 | "metadata": {}, 632 | "outputs": [], 633 | "source": [ 634 | "predictions[0].shape" 635 | ] 636 | }, 637 | { 638 | "cell_type": "markdown", 639 | "metadata": {}, 640 | "source": [ 641 | "The coefficients in this vector sum to 1:" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": null, 647 | "metadata": {}, 648 | "outputs": [], 649 | "source": [ 650 | "np.sum(predictions[0])" 651 | ] 652 | }, 653 | { 654 | "cell_type": "markdown", 655 | "metadata": {}, 656 | "source": [ 657 | "The largest entry is the predicted class, i.e. the class with the highest probability:" 658 | ] 659 | }, 660 | { 661 | "cell_type": "code", 662 | "execution_count": null, 663 | "metadata": {}, 664 | "outputs": [], 665 | "source": [ 666 | "np.argmax(predictions[0])" 667 | ] 668 | }, 669 | { 670 | "cell_type": "markdown", 671 | "metadata": {}, 672 | "source": [ 673 | "## Dealing with overfitting:" 674 | ] 675 | }, 676 | { 677 | "cell_type": "markdown", 678 | "metadata": {}, 679 | "source": [ 680 | "Common ways to deal with overfitting are to train the model on more data or to reduce the capacity of the network.\n", 681 | "A bigger network gets its training loss near zero very quickly. The more capacity the network has, the quicker it will be able to model the training data (resulting in a low training loss), but the more susceptible it is to overfitting (resulting in a large difference between the training and validation loss). 
Other techniques to deal with overfitting are weight regularization and dropout. Keras provides APIs that make it very simple to use these techniques. " 682 | ] 683 | }, 684 | { 685 | "cell_type": "markdown", 686 | "metadata": {}, 687 | "source": [ 688 | "### Adding weight regularization:\n", 689 | "A common way to mitigate overfitting is to put constraints on the complexity of a network by forcing its weights to only take small values, which makes the distribution of weight values more \"regular\". This is called \"weight regularization\", and it is done by adding to the loss function of the network a cost associated with having large weights. This cost comes in two flavors:\n", 690 | "\n", 691 | "L1 regularization, where the cost added is proportional to the absolute value of the weights coefficients (i.e. to what is called the \"L1 norm\" of the weights).\n", 692 | "L2 regularization, where the cost added is proportional to the square of the value of the weights coefficients (i.e. to what is called the \"L2 norm\" of the weights). L2 regularization is also called weight decay in the context of neural networks. Don't let the different name confuse you: weight decay is mathematically the exact same as L2 regularization.\n", 693 | "In Keras, [weight regularization](https://keras.io/regularizers/) is added by passing weight regularizer instances to layers as keyword arguments. Let's add L2 weight regularization to an example Dense layer:" 694 | ] 695 | }, 696 | { 697 | "cell_type": "code", 698 | "execution_count": null, 699 | "metadata": {}, 700 | "outputs": [], 701 | "source": [ 702 | "mock_dense_layer = layers.Dense(64, kernel_regularizer=regularizers.l2(0.001), activation='relu', input_shape=(10000,)))" 703 | ] 704 | }, 705 | { 706 | "cell_type": "markdown", 707 | "metadata": {}, 708 | "source": [ 709 | "### Adding dropout:\n", 710 | "[Dropout](https://keras.io/layers/core/), applied to a layer, consists of randomly \"dropping out\" (i.e. 
setting to zero) a number of output features of the layer during training. Let's say a given layer would normally have returned a vector [0.2, 0.5, 1.3, 0.8, 1.1] for a given input sample during training; after applying dropout, this vector will have a few zero entries distributed at random, e.g. [0, 0.5, 1.3, 0, 1.1]. The \"dropout rate\" is the fraction of the features that are being zeroed-out; it is usually set between 0.2 and 0.5. At test time, no units are dropped out, and instead the layer's output values are scaled down by a factor equal to the dropout rate, so as to balance for the fact that more units are active than at training time." 711 | ] 712 | }, 713 | { 714 | "cell_type": "code", 715 | "execution_count": null, 716 | "metadata": {}, 717 | "outputs": [], 718 | "source": [ 719 | "# An example Dropout layer:\n", 720 | "#...\n", 721 | "mock_model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))\n", 722 | "mock_model.add(layers.Dropout(0.5)) # 0.5 is the dropout rate\n", 723 | "#..." 724 | ] 725 | }, 726 | { 727 | "cell_type": "markdown", 728 | "metadata": {}, 729 | "source": [ 730 | "## Callbacks\n", 731 | "You can use Keras built-in [callbacks](https://keras.io/callbacks/) to checkpoint your model, save TensorBoard summaries, adjust learning rate during training etc.\n", 732 | "Callbacks are a way to perform an action, view internal state or model statistics at the beginning/end of the training loop, an epoch or a step in the fit loop.\n", 733 | "Let's instantiate the ModelCheckpoint and TensorBoard callback:" 734 | ] 735 | }, 736 | { 737 | "cell_type": "markdown", 738 | "metadata": {}, 739 | "source": [ 740 | "# Exercise 3: Create a ModelCheckpoint and TensorBoard callback." 
741 | ] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": null, 746 | "metadata": {}, 747 | "outputs": [], 748 | "source": [ 749 | "from keras import callbacks\n", 750 | "# Create a ModelCheckpoint callback by specifying a filepath such as '/tmp/checkpoints'. You can look at some of the\n", 751 | "# other parameters that you can set in the link above.\n", 752 | "# ...\n", 753 | "\n", 754 | "# Create a ModelCheckpoint callback\n", 755 | "# model_checkpoint_cb = \n", 756 | "\n", 757 | "# Create a TensorBoard callback that writes model summaries to a given directory such as '/tmp/logs'.\n", 758 | "# tensorboard_cb = \n", 759 | "\n", 760 | "# Let us use our model from above:\n", 761 | "from keras import models\n", 762 | "from keras import layers\n", 763 | "\n", 764 | "model = models.Sequential()\n", 765 | "model.add(layers.Dense(64, activation='relu', input_shape=(10000,)))\n", 766 | "model.add(layers.Dense(64, activation='relu'))\n", 767 | "model.add(layers.Dense(46, activation='softmax'))\n", 768 | "\n", 769 | "model.compile(optimizer='rmsprop',\n", 770 | " loss='categorical_crossentropy',\n", 771 | " metrics=['accuracy'])\n", 772 | "\n", 773 | "# Pass the callbacks instantiated above to the `callbacks` parameter in the `fit` call.\n", 774 | "history = model.fit(partial_x_train,\n", 775 | " partial_y_train,\n", 776 | " epochs=20,\n", 777 | " batch_size=512,\n", 778 | " validation_data=(x_val, y_val),\n", 779 | " callbacks=[model_checkpoint_cb, tensorboard_cb])" 780 | ] 781 | }, 782 | { 783 | "cell_type": "markdown", 784 | "metadata": {}, 785 | "source": [ 786 | "You should be able to view TensorBoard summaries by using the following command:\n", 787 | "`tensorboard --logdir=/full_path_to_your_logs`. Make sure to have installed the TensorBoard pip package." 
788 | ] 789 | }, 790 | { 791 | "cell_type": "code", 792 | "execution_count": null, 793 | "metadata": {}, 794 | "outputs": [], 795 | "source": [ 796 | "# Exercise 1: Solution\n", 797 | "# model = models.Sequential()\n", 798 | "# model.add(layers.Dense(128, activation='relu', input_shape=(10000,)))\n", 799 | "# model.add(layers.Dense(128, activation='relu'))\n", 800 | "# model.add(layers.Dense(46, activation='softmax'))" 801 | ] 802 | }, 803 | { 804 | "cell_type": "code", 805 | "execution_count": null, 806 | "metadata": {}, 807 | "outputs": [], 808 | "source": [ 809 | "# Exercise 2: Solution\n", 810 | "# model = models.Sequential()\n", 811 | "# model.add(layers.Dense(1, activation='relu', input_shape=(10000,)))\n", 812 | "# model.add(layers.Dense(46, activation='softmax'))" 813 | ] 814 | }, 815 | { 816 | "cell_type": "code", 817 | "execution_count": null, 818 | "metadata": {}, 819 | "outputs": [], 820 | "source": [ 821 | "# Exercise 3: Solution\n", 822 | "# model_checkpoint_cb = callbacks.ModelCheckpoint('/tmp/checkpoints',\n", 823 | "# monitor='val_loss',\n", 824 | "# period=1)\n", 825 | "\n", 826 | "# tensorboard_cb = callbacks.TensorBoard(log_dir='/tmp/logs')" 827 | ] 828 | } 829 | ], 830 | "metadata": { 831 | "kernelspec": { 832 | "display_name": "Python 2", 833 | "language": "python", 834 | "name": "python2" 835 | }, 836 | "language_info": { 837 | "codemirror_mode": { 838 | "name": "ipython", 839 | "version": 2 840 | }, 841 | "file_extension": ".py", 842 | "mimetype": "text/x-python", 843 | "name": "python", 844 | "nbconvert_exporter": "python", 845 | "pygments_lexer": "ipython2", 846 | "version": "2.7.15" 847 | } 848 | }, 849 | "nbformat": 4, 850 | "nbformat_minor": 2 851 | } 852 | -------------------------------------------------------------------------------- /Notebook 3 - Movie Critics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | 
"metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import keras\n", 10 | "keras.__version__" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "## Notebook 3" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "## Using WordEmbeddings and RNNs to classify IMDB reviews: a binary classification problem\n", 25 | "\n", 26 | "This notebook contains code samples that have been adapted from Francois Chollet's Deep Learning With Python.\n", 27 | "\n", 28 | "----\n", 29 | "In this notebook we are going to introduce how to use Embeddings and [Recurrent Neural Network(RNN)](https://keras.io/layers/recurrent/) layers in Keras. We are going to introduce the following layers:\n", 30 | "* `Embedding`\n", 31 | "* `SimpleRNN`\n", 32 | "* `LSTM`\n", 33 | "\n", 34 | "We are going look at how to train networks with limited input text data using pre-trained embeddings. Tokenizing and preprocessing input text data will also be illustrated end to end. " 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "## Problem \n", 42 | "In this notebook, we will learn to classify movie reviews into \"positive\" reviews and \"negative\" reviews, just based on the text content of the reviews." 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "# First, a note on using word embeddings\n", 50 | "\n", 51 | "Another popular and powerful way to associate a vector with a word is the use of dense \"word vectors\", also called \"word embeddings\". \n", 52 | "While the vectors obtained through one-hot encoding are binary, sparse (mostly made of zeros) and very high-dimensional (same dimensionality as the \n", 53 | "number of words in the vocabulary), \"word embeddings\" are low-dimensional floating point vectors \n", 54 | "(i.e. \"dense\" vectors, as opposed to sparse vectors). 
\n", 55 | "Unlike word vectors obtained via one-hot encoding, word embeddings are learned from data. \n", 56 | "It is common to see word embeddings that are 256-dimensional, 512-dimensional, or 1024-dimensional when dealing with very large vocabularies. \n", 57 | "On the other hand, one-hot encoding words generally leads to vectors that are 20,000-dimensional or higher (capturing a vocabulary of 20,000 \n", 58 | "token in this case). So, word embeddings pack more information into far fewer dimensions. " 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "![word embeddings vs. one hot encoding](https://s3.amazonaws.com/book.keras.io/img/ch6/word_embeddings.png)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "There are two ways to obtain word embeddings:\n", 73 | "\n", 74 | "* Learn word embeddings jointly with the main task you care about (e.g. document classification or sentiment prediction). \n", 75 | "In this setup, you would start with random word vectors, then learn your word vectors in the same way that you learn the weights of a neural network.\n", 76 | "* Load into your model word embeddings that were pre-computed using a different machine learning task than the one you are trying to solve. \n", 77 | "These are called \"pre-trained word embeddings\". \n", 78 | "\n", 79 | "Let's take a look at both." 
80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "from keras.layers import Embedding\n", 89 | "\n", 90 | "# The Embedding layer takes at least two arguments:\n", 91 | "# the number of possible tokens, here 1000 (1 + maximum word index),\n", 92 | "# and the dimensionality of the embeddings, here 64.\n", 93 | "embedding_layer = Embedding(1000, 64)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "\n", 101 | "The `Embedding` layer is best understood as a dictionary mapping integer indices (which stand for specific words) to dense vectors. It takes \n", 102 | "as input integers, it looks up these integers into an internal dictionary, and it returns the associated vectors. It's effectively a dictionary lookup." 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "\n", 110 | "The `Embedding` layer takes as input a 2D tensor of integers, of shape `(samples, sequence_length)`, where each entry is a sequence of \n", 111 | "integers. It can embed sequences of variable lengths, so for instance we could feed into our embedding layer above batches that could have \n", 112 | "shapes `(32, 10)` (batch of 32 sequences of length 10) or `(64, 15)` (batch of 64 sequences of length 15). All sequences in a batch must \n", 113 | "have the same length, though (since we need to pack them into a single tensor), so sequences that are shorter than others should be padded \n", 114 | "with zeros, and sequences that are longer should be truncated.\n", 115 | "\n", 116 | "This layer returns a 3D floating point tensor, of shape `(samples, sequence_length, embedding_dimensionality)`. 
Such a 3D tensor can then \n", 117 | "be processed by a RNN layer or a 1D convolution layer.\n", 118 | "\n", 119 | "When you instantiate an `Embedding` layer, its weights (its internal dictionary of token vectors) are initially random, just like with any \n", 120 | "other layer. During training, these word vectors will be gradually adjusted via backpropagation, structuring the space into something that the \n", 121 | "downstream model can exploit. Once fully trained, your embedding space will show a lot of structure -- a kind of structure specialized for \n", 122 | "the specific problem you were training your model for." 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "## Dataset\n", 130 | "Let's consider IMDB movie review sentiment prediction task that you are already familiar with. Let's quickly prepare \n", 131 | "the data. We will restrict the movie reviews to the top 10,000 most common words, \n", 132 | "and cut the reviews after only 20 words. Our network will simply learn 8-dimensional embeddings for each of the 10,000 words, turn the \n", 133 | "input integer sequences (2D integer tensor) into embedded sequences (3D float tensor), flatten the tensor to 2D, and train a single `Dense` layer on top for classification." 
134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "from keras.datasets import imdb\n", 143 | "from keras import preprocessing" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "# Number of words to consider as features\n", 153 | "max_features = 10000\n", 154 | "# Cut texts after this number of words \n", 155 | "# (among top max_features most common words)\n", 156 | "maxlen = 20\n", 157 | "\n", 158 | "# Load the data as lists of integers.\n", 159 | "(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)\n", 160 | "\n", 161 | "# This turns our lists of integers\n", 162 | "# into a 2D integer tensor of shape `(samples, maxlen)`\n", 163 | "x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)\n", 164 | "x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "## Exercise 1: Build and train a Sequential model with the following specs:\n", 172 | "* Embedding Layer with maximum number of tokens to be 10000 and embedding dimensionality as 8. Let the input_length be the maximum length of each review i.e. 20 as seen previously.\n", 173 | "* Flatten the 3D embedding output to 2D.\n", 174 | "* Dense Layer which is the classifier.\n", 175 | "* Compile the model with a 'rmsprop' optimizer. Can you guess what loss we need to use?\n", 176 | "* Let accuracy be one of the metrics we are interested in.\n", 177 | "* Run the model on the above training data. 
" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "from keras.models import Sequential\n", 187 | "from keras.layers import Flatten, Dense\n", 188 | "\n", 189 | "model = Sequential()\n", 190 | "# Add an Embedding Layer with maximum number of tokens to be 10000 and embedding dimensionality as 8. \n", 191 | "# Let the input_length be the maximum length of each review i.e. 20 as seen previously.\n", 192 | "# After the Embedding layer, \n", 193 | "# our activations have shape `(samples, maxlen, 8)`.\n", 194 | "\n", 195 | "# We flatten the 3D tensor of embeddings \n", 196 | "# into a 2D tensor of shape `(samples, maxlen * 8)`\n", 197 | "# ...\n", 198 | "# We add a Dense classifier on top\n", 199 | "# ...\n", 200 | "\n", 201 | "model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])\n", 202 | "model.summary()\n", 203 | "\n", 204 | "history = model.fit(x_train, y_train,\n", 205 | " epochs=10,\n", 206 | " batch_size=32,\n", 207 | " validation_split=0.2)" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "If you used a batch size of 32 and ran the model for 10 epochs you should get to a validation accuracy of ~76%, which is pretty good considering that we only look at the first 20 words in every review. But \n", 215 | "note that merely flattening the embedded sequences and training a single `Dense` layer on top leads to a model that treats each word in the \n", 216 | "input sequence separately, without considering inter-word relationships and sentence structure (e.g. it would likely treat both _\"this movie \n", 217 | "is shit\"_ and _\"this movie is the shit\"_ as being negative \"reviews\"). It would be much better to add recurrent layers or 1D convolutional \n", 218 | "layers on top of the embedded sequences to learn features that take into account each sequence as a whole." 
219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "## What if we increase the number of words that we consider in each IMDB review? Let maxlen now be 200 instead of 20." 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "from keras.models import Sequential\n", 235 | "from keras.layers import Flatten, Dense\n", 236 | "# Number of words to consider as features\n", 237 | "max_features = 10000\n", 238 | "# Cut texts after this number of words \n", 239 | "# (among top max_features most common words)\n", 240 | "maxlen = 200\n", 241 | "\n", 242 | "# Load the data as lists of integers.\n", 243 | "#(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)\n", 244 | "\n", 245 | "# This turns our lists of integers\n", 246 | "# into a 2D integer tensor of shape `(samples, maxlen)`\n", 247 | "x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)\n", 248 | "x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)\n", 249 | "\n", 250 | "model = Sequential()\n", 251 | "model.add(Embedding(10000, 8, input_length=maxlen))\n", 252 | "model.add(Flatten())\n", 253 | "\n", 254 | "model.add(Dense(1, activation='sigmoid'))\n", 255 | "model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])\n", 256 | "model.summary()\n", 257 | "\n", 258 | "history = model.fit(x_train, y_train,\n", 259 | " epochs=10,\n", 260 | " batch_size=32,\n", 261 | " validation_split=0.2)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": {}, 267 | "source": [ 268 | "With a higher number of words considered per review the validation accuracy can increase up to 88%." 
269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": {}, 274 | "source": [ 275 | "## Using pre-trained word embeddings\n", 276 | "\n", 277 | "\n", 278 | "When you have very little training data you can leverage pre-computed embeddings that are learnt on a different dataset. The \n", 279 | "rationale behind using pre-trained word embeddings in natural language processing is very much the same as for using pre-trained convnets \n", 280 | "in image classification: we don't have enough data available to learn truly powerful features on our own, but we expect the features that \n", 281 | "we need to be fairly generic, i.e. common visual features or semantic features. In this case it makes sense to reuse features learned on a \n", 282 | "different problem.\n", 283 | "\n", 284 | "There are various pre-computed databases of word embeddings that you can download and start using in a Keras `Embedding` layer. Word2Vec is one \n", 285 | "of them. Another popular one is called \"GloVe\", developed by Stanford researchers in 2014. It stands for \"Global Vectors for Word \n", 286 | "Representation\", and it is an embedding technique based on factorizing a matrix of word co-occurrence statistics. Its developers have made \n", 287 | "available pre-computed embeddings for millions of English tokens, obtained from Wikipedia data or from Common Crawl data.\n", 288 | "\n", 289 | "Let's take a look at how you can get started using GloVe embeddings in a Keras model. The same method will of course be valid for Word2Vec \n", 290 | "embeddings or any other word embedding database that you can download." 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": {}, 296 | "source": [ 297 | "## Putting it all together: from raw text to word embeddings\n", 298 | "\n", 299 | "\n", 300 | "We will be using a model similar to the one we just went over -- embedding sentences in sequences of vectors, flattening them and training a \n", 301 | "`Dense` layer on top. 
But we will do it using pre-trained word embeddings, and instead of using the pre-tokenized IMDB data packaged in \n", 302 | "Keras, we will start from scratch, by downloading the original text data." 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "### Download the IMDB data as raw text\n", 310 | "\n", 311 | "\n", 312 | "First, head to `http://ai.stanford.edu/~amaas/data/sentiment/` and download the raw IMDB dataset (if the URL isn't working anymore, just \n", 313 | "Google \"IMDB dataset\"). Uncompress it.\n", 314 | "\n", 315 | "Now let's collect the individual training reviews into a list of strings, one string per review, and let's also collect the review labels \n", 316 | "(positive / negative) into a `labels` list:" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "import os\n", 326 | "\n", 327 | "imdb_dir = '/Users/anjalisridhar/kdd2018/workshop/datasets/aclImdb'\n", 328 | "train_dir = os.path.join(imdb_dir, 'train')\n", 329 | "\n", 330 | "labels = []\n", 331 | "texts = []\n", 332 | "\n", 333 | "for label_type in ['neg', 'pos']:\n", 334 | " dir_name = os.path.join(train_dir, label_type)\n", 335 | " for fname in os.listdir(dir_name):\n", 336 | " if fname[-4:] == '.txt':\n", 337 | " f = open(os.path.join(dir_name, fname))\n", 338 | " texts.append(f.read())\n", 339 | " f.close()\n", 340 | " if label_type == 'neg':\n", 341 | " labels.append(0)\n", 342 | " else:\n", 343 | " labels.append(1)" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": {}, 349 | "source": [ 350 | "### Tokenize the data\n", 351 | "\n", 352 | "\n", 353 | "Let's vectorize the texts we collected, and prepare a training and validation split.\n", 354 | "We will merely be using the concepts we introduced earlier in this section.\n", 355 | "\n", 356 | "Because pre-trained word embeddings are meant to be particularly useful 
on problems where little training data is available (otherwise, \n", 357 | "task-specific embeddings are likely to outperform them), we will add the following twist: we restrict the training data to its first 200 \n", 358 | "samples.\n" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "from keras.preprocessing.text import Tokenizer\n", 368 | "from keras.preprocessing.sequence import pad_sequences\n", 369 | "import numpy as np\n", 370 | "\n", 371 | "maxlen = 100 # We will cut reviews after 100 words\n", 372 | "training_samples = 200 # We will be training on 200 samples\n", 373 | "validation_samples = 10000 # We will be validating on 10000 samples\n", 374 | "max_words = 10000 # We will only consider the top 10,000 words in the dataset\n", 375 | "\n", 376 | "tokenizer = Tokenizer(num_words=max_words)\n", 377 | "tokenizer.fit_on_texts(texts)\n", 378 | "sequences = tokenizer.texts_to_sequences(texts)\n", 379 | "\n", 380 | "word_index = tokenizer.word_index\n", 381 | "print('Found %s unique tokens.' 
% len(word_index))\n", 382 | "\n", 383 | "data = pad_sequences(sequences, maxlen=maxlen)\n", 384 | "\n", 385 | "labels = np.asarray(labels)\n", 386 | "print('Shape of data tensor:', data.shape)\n", 387 | "print('Shape of label tensor:', labels.shape)\n", 388 | "\n", 389 | "# Split the data into a training set and a validation set\n", 390 | "# But first, shuffle the data, since we started from data\n", 391 | "# where sample are ordered (all negative first, then all positive).\n", 392 | "indices = np.arange(data.shape[0])\n", 393 | "np.random.shuffle(indices)\n", 394 | "data = data[indices]\n", 395 | "labels = labels[indices]\n", 396 | "\n", 397 | "x_train = data[:training_samples]\n", 398 | "y_train = labels[:training_samples]\n", 399 | "x_val = data[training_samples: training_samples + validation_samples]\n", 400 | "y_val = labels[training_samples: training_samples + validation_samples]" 401 | ] 402 | }, 403 | { 404 | "cell_type": "markdown", 405 | "metadata": {}, 406 | "source": [ 407 | "### Download the GloVe word embeddings\n", 408 | "\n", 409 | "\n", 410 | "Head to `https://nlp.stanford.edu/projects/glove/` (where you can learn more about the GloVe algorithm), and download the pre-computed \n", 411 | "embeddings from 2014 English Wikipedia. It's a 822MB zip file named `glove.6B.zip`, containing 100-dimensional embedding vectors for \n", 412 | "400,000 words (or non-word tokens). Un-zip it." 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": {}, 418 | "source": [ 419 | "### Pre-process the embeddings\n", 420 | "\n", 421 | "\n", 422 | "Let's parse the un-zipped file (it's a `txt` file) to build an index mapping words (as strings) to their vector representation (as number \n", 423 | "vectors)." 
424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": {}, 430 | "outputs": [], 431 | "source": [ 432 | "glove_dir = '/Users/anjalisridhar/kdd2018/workshop/datasets/glove.6B'\n", 433 | "\n", 434 | "embeddings_index = {}\n", 435 | "f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'))\n", 436 | "for line in f:\n", 437 | " values = line.split()\n", 438 | " word = values[0]\n", 439 | " coefs = np.asarray(values[1:], dtype='float32')\n", 440 | " embeddings_index[word] = coefs\n", 441 | "f.close()\n", 442 | "\n", 443 | "print('Found %s word vectors.' % len(embeddings_index))" 444 | ] 445 | }, 446 | { 447 | "cell_type": "markdown", 448 | "metadata": {}, 449 | "source": [ 450 | "\n", 451 | "Now let's build an embedding matrix that we will be able to load into an `Embedding` layer. It must be a matrix of shape `(max_words, embedding_dim)`, where each entry `i` contains the `embedding_dim`-dimensional vector for the word of index `i` in our reference word index \n", 452 | "(built during tokenization). Note that the index `0` is not supposed to stand for any word or token -- it's a placeholder." 
453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": null, 458 | "metadata": {}, 459 | "outputs": [], 460 | "source": [ 461 | "embedding_dim = 100\n", 462 | "\n", 463 | "embedding_matrix = np.zeros((max_words, embedding_dim))\n", 464 | "for word, i in word_index.items():\n", 465 | " embedding_vector = embeddings_index.get(word)\n", 466 | " if i < max_words:\n", 467 | " if embedding_vector is not None:\n", 468 | " # Words not found in embedding index will be all-zeros.\n", 469 | " embedding_matrix[i] = embedding_vector" 470 | ] 471 | }, 472 | { 473 | "cell_type": "markdown", 474 | "metadata": {}, 475 | "source": [ 476 | "### Define a model\n", 477 | "\n", 478 | "We will be using the same model architecture as before:" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "metadata": {}, 485 | "outputs": [], 486 | "source": [ 487 | "from keras.models import Sequential\n", 488 | "from keras.layers import Embedding, Flatten, Dense\n", 489 | "\n", 490 | "model = Sequential()\n", 491 | "model.add(Embedding(max_words, embedding_dim, input_length=maxlen))\n", 492 | "model.add(Flatten())\n", 493 | "model.add(Dense(32, activation='relu'))\n", 494 | "model.add(Dense(1, activation='sigmoid'))\n", 495 | "model.summary()" 496 | ] 497 | }, 498 | { 499 | "cell_type": "markdown", 500 | "metadata": {}, 501 | "source": [ 502 | "### Load the GloVe embeddings in the model\n", 503 | "\n", 504 | "\n", 505 | "The `Embedding` layer has a single weight matrix: a 2D float matrix where each entry `i` is the word vector meant to be associated with \n", 506 | "index `i`. Simple enough. 
Let's just load the GloVe matrix we prepared into our `Embedding` layer, the first layer in our model:" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": null, 512 | "metadata": {}, 513 | "outputs": [], 514 | "source": [ 515 | "model.layers[0].set_weights([embedding_matrix])\n", 516 | "model.layers[0].trainable = False" 517 | ] 518 | }, 519 | { 520 | "cell_type": "markdown", 521 | "metadata": {}, 522 | "source": [ 523 | "\n", 524 | "Additionally, we freeze the embedding layer (we set its `trainable` attribute to `False`), following the same rationale as what you are \n", 525 | "already familiar with in the context of pre-trained convnet features: when parts of a model are pre-trained (like our `Embedding` layer), \n", 526 | "and parts are randomly initialized (like our classifier), the pre-trained parts should not be updated during training to avoid forgetting \n", 527 | "what they already know. The large gradient update triggered by the randomly initialized layers would be very disruptive to the already \n", 528 | "learned features." 
529 | ] 530 | }, 531 | { 532 | "cell_type": "markdown", 533 | "metadata": {}, 534 | "source": [ 535 | "### Train and evaluate\n", 536 | "\n", 537 | "Let's compile our model and train it:" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": null, 543 | "metadata": {}, 544 | "outputs": [], 545 | "source": [ 546 | "model.compile(optimizer='rmsprop',\n", 547 | " loss='binary_crossentropy',\n", 548 | " metrics=['acc'])\n", 549 | "history = model.fit(x_train, y_train,\n", 550 | " epochs=10,\n", 551 | " batch_size=32,\n", 552 | " validation_data=(x_val, y_val))\n", 553 | "model.save_weights('pre_trained_glove_model.h5')" 554 | ] 555 | }, 556 | { 557 | "cell_type": "markdown", 558 | "metadata": {}, 559 | "source": [ 560 | "Let's plot its performance over time:" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": null, 566 | "metadata": {}, 567 | "outputs": [], 568 | "source": [ 569 | "import matplotlib.pyplot as plt\n", 570 | "\n", 571 | "acc = history.history['acc']\n", 572 | "val_acc = history.history['val_acc']\n", 573 | "loss = history.history['loss']\n", 574 | "val_loss = history.history['val_loss']\n", 575 | "\n", 576 | "epochs = range(1, len(acc) + 1)\n", 577 | "\n", 578 | "plt.plot(epochs, acc, 'bo', label='Training acc')\n", 579 | "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", 580 | "plt.title('Training and validation accuracy')\n", 581 | "plt.legend()\n", 582 | "\n", 583 | "plt.figure()\n", 584 | "\n", 585 | "plt.plot(epochs, loss, 'bo', label='Training loss')\n", 586 | "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", 587 | "plt.title('Training and validation loss')\n", 588 | "plt.legend()\n", 589 | "\n", 590 | "plt.show()" 591 | ] 592 | }, 593 | { 594 | "cell_type": "markdown", 595 | "metadata": {}, 596 | "source": [ 597 | "\n", 598 | "The model quickly starts overfitting, unsurprisingly given the small number of training samples. 
Validation accuracy has high variance for \n", 599 | "the same reason, but seems to reach high 50s.\n", 600 | "\n", 601 | "Note that your mileage may vary: since we have so few training samples, performance is heavily dependent on which exact 200 samples we \n", 602 | "picked, and we picked them at random. If it worked really poorly for you, try picking a different random set of 200 samples, just for the \n", 603 | "sake of the exercise (in real life you don't get to pick your training data).\n", 604 | "\n", 605 | "We can also try to train the same model without loading the pre-trained word embeddings and without freezing the embedding layer. In that \n", 606 | "case, we would be learning a task-specific embedding of our input tokens, which is generally more powerful than pre-trained word embeddings \n", 607 | "when lots of data is available. However, in our case, we have only 200 training samples. Let's try it:" 608 | ] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "execution_count": null, 613 | "metadata": {}, 614 | "outputs": [], 615 | "source": [ 616 | "from keras.models import Sequential\n", 617 | "from keras.layers import Embedding, Flatten, Dense\n", 618 | "\n", 619 | "model = Sequential()\n", 620 | "model.add(Embedding(max_words, embedding_dim, input_length=maxlen))\n", 621 | "model.add(Flatten())\n", 622 | "model.add(Dense(32, activation='relu'))\n", 623 | "model.add(Dense(1, activation='sigmoid'))\n", 624 | "model.summary()\n", 625 | "\n", 626 | "model.compile(optimizer='rmsprop',\n", 627 | " loss='binary_crossentropy',\n", 628 | " metrics=['acc'])\n", 629 | "history = model.fit(x_train, y_train,\n", 630 | " epochs=10,\n", 631 | " batch_size=32,\n", 632 | " validation_data=(x_val, y_val))" 633 | ] 634 | }, 635 | { 636 | "cell_type": "code", 637 | "execution_count": null, 638 | "metadata": {}, 639 | "outputs": [], 640 | "source": [ 641 | "acc = history.history['acc']\n", 642 | "val_acc = history.history['val_acc']\n", 643 | "loss = 
history.history['loss']\n", 644 | "val_loss = history.history['val_loss']\n", 645 | "\n", 646 | "epochs = range(1, len(acc) + 1)\n", 647 | "\n", 648 | "plt.plot(epochs, acc, 'bo', label='Training acc')\n", 649 | "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", 650 | "plt.title('Training and validation accuracy')\n", 651 | "plt.legend()\n", 652 | "\n", 653 | "plt.figure()\n", 654 | "\n", 655 | "plt.plot(epochs, loss, 'bo', label='Training loss')\n", 656 | "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", 657 | "plt.title('Training and validation loss')\n", 658 | "plt.legend()\n", 659 | "\n", 660 | "plt.show()" 661 | ] 662 | }, 663 | { 664 | "cell_type": "markdown", 665 | "metadata": {}, 666 | "source": [ 667 | "\n", 668 | "Validation accuracy stalls in the low 50s. So in our case, pre-trained word embeddings do outperform jointly learned embeddings. If you \n", 669 | "increase the number of training samples, this will quickly stop being the case -- try it as an exercise.\n", 670 | "\n", 671 | "Finally, let's evaluate the model on the test data. 
First, we will need to tokenize the test data:" 672 | ] 673 | }, 674 | { 675 | "cell_type": "code", 676 | "execution_count": null, 677 | "metadata": {}, 678 | "outputs": [], 679 | "source": [ 680 | "test_dir = os.path.join(imdb_dir, 'test')\n", 681 | "\n", 682 | "labels = []\n", 683 | "texts = []\n", 684 | "\n", 685 | "for label_type in ['neg', 'pos']:\n", 686 | " dir_name = os.path.join(test_dir, label_type)\n", 687 | " for fname in sorted(os.listdir(dir_name)):\n", 688 | " if fname[-4:] == '.txt':\n", 689 | " f = open(os.path.join(dir_name, fname))\n", 690 | " texts.append(f.read())\n", 691 | " f.close()\n", 692 | " if label_type == 'neg':\n", 693 | " labels.append(0)\n", 694 | " else:\n", 695 | " labels.append(1)\n", 696 | "\n", 697 | "sequences = tokenizer.texts_to_sequences(texts)\n", 698 | "x_test = pad_sequences(sequences, maxlen=maxlen)\n", 699 | "y_test = np.asarray(labels)" 700 | ] 701 | }, 702 | { 703 | "cell_type": "markdown", 704 | "metadata": {}, 705 | "source": [ 706 | "And let's load and evaluate the first model:" 707 | ] 708 | }, 709 | { 710 | "cell_type": "code", 711 | "execution_count": null, 712 | "metadata": {}, 713 | "outputs": [], 714 | "source": [ 715 | "model.load_weights('pre_trained_glove_model.h5')\n", 716 | "model.evaluate(x_test, y_test)" 717 | ] 718 | }, 719 | { 720 | "cell_type": "markdown", 721 | "metadata": {}, 722 | "source": [ 723 | "We get an appalling test accuracy of ~55%." 
724 | ] 725 | }, 726 | { 727 | "cell_type": "markdown", 728 | "metadata": {}, 729 | "source": [ 730 | "## A first recurrent layer in Keras" 731 | ] 732 | }, 733 | { 734 | "cell_type": "code", 735 | "execution_count": null, 736 | "metadata": {}, 737 | "outputs": [], 738 | "source": [ 739 | "from keras.layers import SimpleRNN" 740 | ] 741 | }, 742 | { 743 | "cell_type": "markdown", 744 | "metadata": {}, 745 | "source": [ 746 | "`SimpleRNN` processes batches of sequences, like all other Keras layers, not just a single sequence.\n", 747 | "Like all recurrent layers in Keras, `SimpleRNN` can be run in two different modes: it can return either the full sequences of successive \n", 748 | "outputs for each timestep (a 3D tensor of shape `(batch_size, timesteps, output_features)`), or it can return only the last output for each \n", 749 | "input sequence (a 2D tensor of shape `(batch_size, output_features)`). These two modes are controlled by the `return_sequences` constructor \n", 750 | "argument. Let's take a look at an example:" 751 | ] 752 | }, 753 | { 754 | "cell_type": "markdown", 755 | "metadata": {}, 756 | "source": [ 757 | "## Exercise 2: Let's add a SimpleRNN layer to our model with embedding dimension 32" 758 | ] 759 | }, 760 | { 761 | "cell_type": "code", 762 | "execution_count": null, 763 | "metadata": {}, 764 | "outputs": [], 765 | "source": [ 766 | "from keras.models import Sequential\n", 767 | "from keras.layers import Embedding, SimpleRNN\n", 768 | "\n", 769 | "model = Sequential()\n", 770 | "# Add an Embedding layer of 10000 vocab size(or max features) and 32 dimensions\n", 771 | "# ...\n", 772 | "# Add a SimpleRNN layer of output 32 dimensions\n", 773 | "# ..." 774 | ] 775 | }, 776 | { 777 | "cell_type": "markdown", 778 | "metadata": {}, 779 | "source": [ 780 | "It is sometimes useful to stack several recurrent layers one after the other in order to increase the representational power of a network. 
\n", 781 | "In such a setup, you have to get all intermediate layers to return full sequences:" 782 | ] 783 | }, 784 | { 785 | "cell_type": "code", 786 | "execution_count": null, 787 | "metadata": {}, 788 | "outputs": [], 789 | "source": [ 790 | "# Let us add 3 more SimpleRNN layers. This time we also want to set the \n", 791 | "# `return_sequences` parameter to be True.\n", 792 | "# This will return the output for each timestep as opposed to returning the output \n", 793 | "# for only the last timestep.\n", 794 | "# Compare model.summary() to see this difference.\n", 795 | "model = Sequential()\n", 796 | "model.add(Embedding(10000, 32))\n", 797 | "model.add(SimpleRNN(32, return_sequences=True))\n", 798 | "model.add(SimpleRNN(32, return_sequences=True))\n", 799 | "model.add(SimpleRNN(32, return_sequences=True))\n", 800 | "model.add(SimpleRNN(32)) # This last layer only returns the last outputs.\n", 801 | "model.summary()" 802 | ] 803 | }, 804 | { 805 | "cell_type": "markdown", 806 | "metadata": {}, 807 | "source": [ 808 | "Now let's try to use such a model on the IMDB movie review classification problem. 
First, let's preprocess the data:" 809 | ] 810 | }, 811 | { 812 | "cell_type": "code", 813 | "execution_count": null, 814 | "metadata": {}, 815 | "outputs": [], 816 | "source": [ 817 | "from keras.datasets import imdb\n", 818 | "from keras.preprocessing import sequence\n", 819 | "\n", 820 | "max_features = 10000 # number of words to consider as features\n", 821 | "maxlen = 500 # cut texts after this number of words (among top max_features most common words)\n", 822 | "batch_size = 32\n", 823 | "\n", 824 | "print('Loading data...')\n", 825 | "(input_train, y_train), (input_test, y_test) = imdb.load_data(num_words=max_features)\n", 826 | "print(len(input_train), 'train sequences')\n", 827 | "print(len(input_test), 'test sequences')\n", 828 | "\n", 829 | "print('Pad sequences (samples x time)')\n", 830 | "input_train = sequence.pad_sequences(input_train, maxlen=maxlen)\n", 831 | "input_test = sequence.pad_sequences(input_test, maxlen=maxlen)\n", 832 | "print('input_train shape:', input_train.shape)\n", 833 | "print('input_test shape:', input_test.shape)" 834 | ] 835 | }, 836 | { 837 | "cell_type": "markdown", 838 | "metadata": {}, 839 | "source": [ 840 | "Let's train a simple recurrent network using an `Embedding` layer and a `SimpleRNN` layer:" 841 | ] 842 | }, 843 | { 844 | "cell_type": "code", 845 | "execution_count": null, 846 | "metadata": {}, 847 | "outputs": [], 848 | "source": [ 849 | "from keras.layers import Dense\n", 850 | "\n", 851 | "model = Sequential()\n", 852 | "model.add(Embedding(max_features, 32))\n", 853 | "model.add(SimpleRNN(32))\n", 854 | "model.add(Dense(1, activation='sigmoid'))\n", 855 | "\n", 856 | "model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])\n", 857 | "history = model.fit(input_train, y_train,\n", 858 | " epochs=10,\n", 859 | " batch_size=128,\n", 860 | " validation_split=0.2)" 861 | ] 862 | }, 863 | { 864 | "cell_type": "markdown", 865 | "metadata": {}, 866 | "source": [ 867 | "Let's display the training 
and validation loss and accuracy:" 868 | ] 869 | }, 870 | { 871 | "cell_type": "code", 872 | "execution_count": null, 873 | "metadata": {}, 874 | "outputs": [], 875 | "source": [ 876 | "import matplotlib.pyplot as plt\n", 877 | "\n", 878 | "acc = history.history['acc']\n", 879 | "val_acc = history.history['val_acc']\n", 880 | "loss = history.history['loss']\n", 881 | "val_loss = history.history['val_loss']\n", 882 | "\n", 883 | "epochs = range(len(acc))\n", 884 | "\n", 885 | "plt.plot(epochs, acc, 'bo', label='Training acc')\n", 886 | "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", 887 | "plt.title('Training and validation accuracy')\n", 888 | "plt.legend()\n", 889 | "\n", 890 | "plt.figure()\n", 891 | "\n", 892 | "plt.plot(epochs, loss, 'bo', label='Training loss')\n", 893 | "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", 894 | "plt.title('Training and validation loss')\n", 895 | "plt.legend()\n", 896 | "\n", 897 | "plt.show()" 898 | ] 899 | }, 900 | { 901 | "cell_type": "markdown", 902 | "metadata": {}, 903 | "source": [ 904 | "Unfortunately, our small \n", 905 | "recurrent network doesn't perform very well at all compared to this baseline (only up to 85% validation accuracy). Part of the problem is \n", 906 | "that our inputs only consider the first 500 words rather than the full sequences -- \n", 907 | "hence our RNN has access to less information than our earlier baseline model. The remainder of the problem is simply that `SimpleRNN` isn't very good at processing long sequences, like text. Other types of recurrent layers perform much better. Let's take a look at some \n", 908 | "more advanced layers." 909 | ] 910 | }, 911 | { 912 | "cell_type": "markdown", 913 | "metadata": {}, 914 | "source": [ 915 | "## A concrete LSTM example in Keras\n", 916 | "Now let's switch to more practical concerns: we will set up a model using a LSTM layer and train it on the IMDB data. 
Here's the network, \n", 917 | "similar to the one with `SimpleRNN` that we just presented. We only specify the output dimensionality of the LSTM layer, and leave every \n", 918 | "other argument (there are lots) to the Keras defaults. Keras has good defaults, and things will almost always \"just work\" without you \n", 919 | "having to spend time tuning parameters by hand." 920 | ] 921 | }, 922 | { 923 | "cell_type": "markdown", 924 | "metadata": {}, 925 | "source": [ 926 | "## Exercise 3: Use an LSTM layer instead of a SimpleRNN layer" 927 | ] 928 | }, 929 | { 930 | "cell_type": "code", 931 | "execution_count": null, 932 | "metadata": {}, 933 | "outputs": [], 934 | "source": [ 935 | "from keras.layers import LSTM\n", 936 | "\n", 937 | "model = Sequential()\n", 938 | "# Add an Embedding layer as before with 10000 vocab size(max features) and 32 output dimensions\n", 939 | "# ...\n", 940 | "# Add a LSTM layer of 32 dimensions\n", 941 | "# ...\n", 942 | "\n", 943 | "\n", 944 | "model.compile(optimizer='rmsprop',\n", 945 | " loss='binary_crossentropy',\n", 946 | " metrics=['acc'])\n", 947 | "history = model.fit(input_train, y_train,\n", 948 | " epochs=10,\n", 949 | " batch_size=128,\n", 950 | " validation_split=0.2)" 951 | ] 952 | }, 953 | { 954 | "cell_type": "code", 955 | "execution_count": null, 956 | "metadata": {}, 957 | "outputs": [], 958 | "source": [ 959 | "acc = history.history['acc']\n", 960 | "val_acc = history.history['val_acc']\n", 961 | "loss = history.history['loss']\n", 962 | "val_loss = history.history['val_loss']\n", 963 | "\n", 964 | "epochs = range(len(acc))\n", 965 | "\n", 966 | "plt.plot(epochs, acc, 'bo', label='Training acc')\n", 967 | "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", 968 | "plt.title('Training and validation accuracy')\n", 969 | "plt.legend()\n", 970 | "\n", 971 | "plt.figure()\n", 972 | "\n", 973 | "plt.plot(epochs, loss, 'bo', label='Training loss')\n", 974 | "plt.plot(epochs, val_loss, 'b', label='Validation 
loss')\n", 975 | "plt.title('Training and validation loss')\n", 976 | "plt.legend()\n", 977 | "\n", 978 | "plt.show()" 979 | ] 980 | }, 981 | { 982 | "cell_type": "markdown", 983 | "metadata": {}, 984 | "source": [ 985 | "You can see that the accuracy is 87%, much higher than what we got with a SimpleRNN layer. " 986 | ] 987 | }, 988 | { 989 | "cell_type": "code", 990 | "execution_count": null, 991 | "metadata": {}, 992 | "outputs": [], 993 | "source": [ 994 | "# Exercise 1: Solution\n", 995 | "# model.add(Embedding(10000, 8, input_length=maxlen))\n", 996 | "# model.add(Flatten())\n", 997 | "# model.add(Dense(1, activation='sigmoid'))" 998 | ] 999 | }, 1000 | { 1001 | "cell_type": "code", 1002 | "execution_count": null, 1003 | "metadata": {}, 1004 | "outputs": [], 1005 | "source": [ 1006 | "# Exercise 2: Solution\n", 1007 | "# model.add(Embedding(10000, 32))\n", 1008 | "# model.add(SimpleRNN(32))\n", 1009 | "# model.summary()" 1010 | ] 1011 | }, 1012 | { 1013 | "cell_type": "code", 1014 | "execution_count": null, 1015 | "metadata": {}, 1016 | "outputs": [], 1017 | "source": [ 1018 | "# Exercise 3: Solution\n", 1019 | "# model.add(Embedding(max_features, 32))\n", 1020 | "# model.add(LSTM(32))\n", 1021 | "# model.add(Dense(1, activation='sigmoid'))" 1022 | ] 1023 | } 1024 | ], 1025 | "metadata": { 1026 | "kernelspec": { 1027 | "display_name": "Python 2", 1028 | "language": "python", 1029 | "name": "python2" 1030 | }, 1031 | "language_info": { 1032 | "codemirror_mode": { 1033 | "name": "ipython", 1034 | "version": 2 1035 | }, 1036 | "file_extension": ".py", 1037 | "mimetype": "text/x-python", 1038 | "name": "python", 1039 | "nbconvert_exporter": "python", 1040 | "pygments_lexer": "ipython2", 1041 | "version": "2.7.15" 1042 | } 1043 | }, 1044 | "nbformat": 4, 1045 | "nbformat_minor": 2 1046 | } 1047 | -------------------------------------------------------------------------------- /Notebook 2 - Raining Cats and Dogs.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import keras\n", 10 | "keras.__version__" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "# Notebook 2" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "## Using Convolutional Neural Networks(CNNs) to identify dogs and cats: a binary classification problem\n", 25 | "\n", 26 | "This notebook contains code samples that have been adapted from Francois Chollet's Deep Learning With Python.\n", 27 | "\n", 28 | "----\n", 29 | "In this notebook we are going to introduce how to build a Convolutional Neural Network using [convolutional](https://keras.io/layers/convolutional/) layers. We are going to introduce the following layers:\n", 30 | "* `Conv2D`\n", 31 | "* `MaxPooling2D`\n", 32 | "* `Flatten`\n", 33 | "* `Dropout` (you have seen a small introduction to this in the previous notebook).\n", 34 | "\n", 35 | "We are also going to introduce training using Keras generator APIs such as `fit_generator`. Using Keras's `ImageDataGenerator` we will use image augmentation to fight overfitting. " 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## Problem\n", 43 | "\n", 44 | "In our example CNN model we will classify images as \"dogs\" or \"cats\", in a dataset containing 4000 pictures of cats and dogs (2000 cats, 2000 dogs). We will use 2000 pictures for training, 1000 for validation, and finally 1000 for testing.\n", 45 | "\n", 46 | "In this section, we will start by naively training a small convnet on our 2000 training samples, without any regularization, to set a baseline for what can be achieved. 
We will introduce two techniques to train a model on small datasets:\n", 47 | "* Data augmentation [improves accuracy from 71% to 82%]\n", 48 | "* Feature extraction with a pre-trained network [improves accuracy to ~93%]\n", 49 | "\n", 50 | "There is a third technique which can increase accuracy further. But we will not be covering this since it requires training on a GPU.\n", 51 | "* Fine tuning a pre-trained network [improves accuracy to 95%]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "## Dataset\n", 59 | "\n", 60 | "You should have downloaded the cats vs dogs dataset from Kaggle as part of the Workshop setup instructions. If you haven't you can do so at this link: \n", 61 | "`https://www.kaggle.com/c/dogs-vs-cats/data` (you will need to create a Kaggle account if you don't already have one).\n", 62 | "\n", 63 | "The pictures are medium-resolution color JPEGs. They look like this:\n", 64 | "\n", 65 | "![cats_vs_dogs_samples](https://s3.amazonaws.com/book.keras.io/img/ch5/cats_vs_dogs_samples.jpg)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "This original dataset contains 25,000 images of dogs and cats (12,500 from each class) and is 543MB large (compressed). 
After downloading \n", 73 | "and uncompressing it, we will create a new dataset containing three subsets: a training set with 1000 samples of each class, a validation \n", 74 | "set with 500 samples of each class, and finally a test set with 500 samples of each class.\n", 75 | "\n", 76 | "Here are a few lines of code to do this:" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "import os, shutil" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "# The path to the directory where the original\n", 95 | "# dataset was uncompressed\n", 96 | "original_dataset_dir = '/Users/anjalisridhar/kdd2018/workshop/datasets/cats_vs_dogs_original_data'\n", 97 | "\n", 98 | "# The directory where we will\n", 99 | "# store our smaller dataset\n", 100 | "base_dir = '/Users/anjalisridhar/kdd2018/workshop/datasets/cats_and_dogs_small_2'\n", 101 | "os.mkdir(base_dir)\n", 102 | "\n", 103 | "# Directories for our training,\n", 104 | "# validation and test splits\n", 105 | "train_dir = os.path.join(base_dir, 'train')\n", 106 | "os.mkdir(train_dir)\n", 107 | "validation_dir = os.path.join(base_dir, 'validation')\n", 108 | "os.mkdir(validation_dir)\n", 109 | "test_dir = os.path.join(base_dir, 'test')\n", 110 | "os.mkdir(test_dir)\n", 111 | "\n", 112 | "# Directory with our training cat pictures\n", 113 | "train_cats_dir = os.path.join(train_dir, 'cats')\n", 114 | "os.mkdir(train_cats_dir)\n", 115 | "\n", 116 | "# Directory with our training dog pictures\n", 117 | "train_dogs_dir = os.path.join(train_dir, 'dogs')\n", 118 | "os.mkdir(train_dogs_dir)\n", 119 | "\n", 120 | "# Directory with our validation cat pictures\n", 121 | "validation_cats_dir = os.path.join(validation_dir, 'cats')\n", 122 | "os.mkdir(validation_cats_dir)\n", 123 | "\n", 124 | "# Directory with our validation dog pictures\n", 125 | 
"validation_dogs_dir = os.path.join(validation_dir, 'dogs')\n", 126 | "os.mkdir(validation_dogs_dir)\n", 127 | "\n", 128 | "# Directory with our testing cat pictures\n", 129 | "test_cats_dir = os.path.join(test_dir, 'cats')\n", 130 | "os.mkdir(test_cats_dir)\n", 131 | "\n", 132 | "# Directory with our testing dog pictures\n", 133 | "test_dogs_dir = os.path.join(test_dir, 'dogs')\n", 134 | "os.mkdir(test_dogs_dir)\n", 135 | "\n", 136 | "# Copy first 1000 cat images to train_cats_dir\n", 137 | "fnames = ['cat.{}.jpg'.format(i) for i in range(1000)]\n", 138 | "for fname in fnames:\n", 139 | " src = os.path.join(original_dataset_dir, fname)\n", 140 | " dst = os.path.join(train_cats_dir, fname)\n", 141 | " shutil.copyfile(src, dst)\n", 142 | "\n", 143 | "# Copy next 500 cat images to validation_cats_dir\n", 144 | "fnames = ['cat.{}.jpg'.format(i) for i in range(1000, 1500)]\n", 145 | "for fname in fnames:\n", 146 | " src = os.path.join(original_dataset_dir, fname)\n", 147 | " dst = os.path.join(validation_cats_dir, fname)\n", 148 | " shutil.copyfile(src, dst)\n", 149 | " \n", 150 | "# Copy next 500 cat images to test_cats_dir\n", 151 | "fnames = ['cat.{}.jpg'.format(i) for i in range(1500, 2000)]\n", 152 | "for fname in fnames:\n", 153 | " src = os.path.join(original_dataset_dir, fname)\n", 154 | " dst = os.path.join(test_cats_dir, fname)\n", 155 | " shutil.copyfile(src, dst)\n", 156 | " \n", 157 | "# Copy first 1000 dog images to train_dogs_dir\n", 158 | "fnames = ['dog.{}.jpg'.format(i) for i in range(1000)]\n", 159 | "for fname in fnames:\n", 160 | " src = os.path.join(original_dataset_dir, fname)\n", 161 | " dst = os.path.join(train_dogs_dir, fname)\n", 162 | " shutil.copyfile(src, dst)\n", 163 | " \n", 164 | "# Copy next 500 dog images to validation_dogs_dir\n", 165 | "fnames = ['dog.{}.jpg'.format(i) for i in range(1000, 1500)]\n", 166 | "for fname in fnames:\n", 167 | " src = os.path.join(original_dataset_dir, fname)\n", 168 | " dst = 
os.path.join(validation_dogs_dir, fname)\n", 169 | " shutil.copyfile(src, dst)\n", 170 | " \n", 171 | "# Copy next 500 dog images to test_dogs_dir\n", 172 | "fnames = ['dog.{}.jpg'.format(i) for i in range(1500, 2000)]\n", 173 | "for fname in fnames:\n", 174 | " src = os.path.join(original_dataset_dir, fname)\n", 175 | " dst = os.path.join(test_dogs_dir, fname)\n", 176 | " shutil.copyfile(src, dst)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "As a sanity check, let's count how many pictures we have in each training split (train/validation/test):" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "assert 1000 == len(os.listdir(train_cats_dir))\n", 193 | "assert 1000 == len(os.listdir(train_dogs_dir))\n", 194 | "\n", 195 | "assert 500 == len(os.listdir(validation_cats_dir))\n", 196 | "assert 500 == len(os.listdir(validation_dogs_dir))\n", 197 | "\n", 198 | "assert 500 == len(os.listdir(test_cats_dir))\n", 199 | "assert 500 == len(os.listdir(test_dogs_dir))" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "\n", 207 | "So we have indeed 2000 training images, and then 1000 validation images and 1000 test images. In each split, there is the same number of \n", 208 | "samples from each class: this is a balanced binary classification problem, which means that classification accuracy will be an appropriate \n", 209 | "measure of success." 
210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "## Building our network\n", 217 | "\n", 218 | "The general structure of our convnet will be a stack of alternated `Conv2D` (with `relu` activation) and `MaxPooling2D` layers.\n", 219 | "\n", 220 | "However, since we are dealing with bigger images and a more complex problem, we will make our network accordingly larger: it will have 4 `Conv2D` + `MaxPooling2D` stages. \n", 221 | "\n", 222 | "A `Conv2D` layer creates a convolution kernel that is convolved with the layer input to produce a tensor of outputs. You can add a bias vector and activation to the output if needed. A `MaxPooling2D` layer is used to downscale input in\n", 223 | "both the vertical and horizontal dimensions." 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "## Exercise 1: Build your model with Conv2D and MaxPooling2D layers" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "from keras import layers\n", 240 | "from keras import models\n", 241 | "\n", 242 | "# Instantiate a Sequential model\n", 243 | "model = models.Sequential() \n", 244 | "# Add a Conv2D layer with 32 filters, kernel_size of (3, 3) and `relu` activation. \n", 245 | "# Given that this is the first layer you need to specify the input shape of the input \n", 246 | "# without the sample size axis. 
For example: (128, 128, 3) represents\n", 247 | "# an input image of shape 128 X 128 RGB images and a data_format of \"channels_last\".\n", 248 | "model.add(layers.Conv2D(32, (3, 3), activation='relu',\n", 249 | " input_shape=(150, 150, 3)))\n", 250 | "# Add a MaxPooling2D layer with a pool size of (2, 2).\n", 251 | "model.add(layers.MaxPooling2D((2, 2)))\n", 252 | "# Add a Conv2D layer with 64 filters, kernel_size of (3, 3) and `relu` activation.\n", 253 | "# ...\n", 254 | "# Add a MaxPooling2D layer with a pool size of (2, 2).\n", 255 | "# ...\n", 256 | "# Add a Conv2D layer with 128 filters, kernel_size of (3, 3) and `relu` activation.\n", 257 | "# ...\n", 258 | "# Add a MaxPooling2D layer with a pool size of (2, 2).\n", 259 | "# ...\n", 260 | "# Add a Conv2D layer with 128 filters, kernel_size of (3, 3) and `relu` activation.\n", 261 | "# ...\n", 262 | "# Add a MaxPooling2D layer with a pool size of (2, 2).\n", 263 | "# ..." 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "## Exercise 2: Add a Flatten layer followed by 2 Dense layers:\n", 271 | "The next step would be to feed our last output tensor into a densely-connected classifier network like those you are already familiar with: a stack of Dense layers. These classifiers process vectors, which are 1D, whereas our current output is a 3D tensor. So first, we will have to flatten our 3D outputs to 1D using the `Flatten` layer and then add a few Dense layers on top to classify the image." 
272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "from keras import layers\n", 281 | "from keras import models\n", 282 | "\n", 283 | "# Use our model with Conv2D and MaxPooling2D layers from the previous section\n", 284 | "# ...\n", 285 | "# Add a Flatten layer\n", 286 | "# ...\n", 287 | "# Add a Dense layer of 512 units and `relu` activation.\n", 288 | "# ...\n", 289 | "# Add a Dense layer of 1 unit and `sigmoid` activation." 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "Let's take a look at how the dimensions of the feature maps change with every successive layer:" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "model.summary()" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "Note that the depth of the feature maps is progressively increasing in the network (from 32 to 128), while the size of the feature maps is \n", 313 | "decreasing (from 148x148 to 7x7). This is a pattern that you will see in almost all convnets.\n", 314 | "\n", 315 | "Since we are attacking a binary classification problem, we are ending the network with a single unit (a `Dense` layer of size 1) and a \n", 316 | "`sigmoid` activation. This unit will encode the probability that the network is looking at one class or the other." 
317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": {}, 322 | "source": [ 323 | "## Compile the model:" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "from keras import optimizers\n", 333 | "\n", 334 | "# `compile` the model using `binary_crossentropy` loss, `RMSProp` optimizer and \n", 335 | "# `accuracy` metric.\n", 336 | "model.compile(loss='binary_crossentropy',\n", 337 | " optimizer=optimizers.RMSprop(lr=1e-4),\n", 338 | " metrics=['acc'])" 339 | ] 340 | }, 341 | { 342 | "cell_type": "markdown", 343 | "metadata": {}, 344 | "source": [ 345 | "## ImageDataGenerator\n", 346 | "\n", 347 | "As you already know by now, data should be formatted into appropriately pre-processed floating point tensors before being fed into our \n", 348 | "network. Currently, our data sits on a drive as JPEG files, so the steps for getting it into our network are roughly:\n", 349 | "\n", 350 | "* Read the picture files.\n", 351 | "* Decode the JPEG content to RGB grids of pixels.\n", 352 | "* Convert these into floating point tensors.\n", 353 | "* Rescale the pixel values (between 0 and 255) to the [0, 1] interval (as you know, neural networks prefer to deal with small input values).\n", 354 | "\n", 355 | "Keras has utilities to take care of these steps automatically. Keras has a module with image \n", 356 | "processing helper tools, located at `keras.preprocessing.image`. In particular, it contains the class [`ImageDataGenerator`](https://keras.io/preprocessing/image/) which allows you to quickly set up Python generators that can automatically turn image files on disk into batches of pre-processed tensors. This is what we will use here." 
357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "from keras.preprocessing.image import ImageDataGenerator\n", 366 | "\n", 367 | "# All images will be rescaled by 1./255\n", 368 | "train_datagen = ImageDataGenerator(rescale=1./255)\n", 369 | "test_datagen = ImageDataGenerator(rescale=1./255)\n", 370 | "\n", 371 | "train_generator = train_datagen.flow_from_directory(\n", 372 | " # This is the target directory\n", 373 | " train_dir,\n", 374 | " # All images will be resized to 150x150\n", 375 | " target_size=(150, 150),\n", 376 | " batch_size=20,\n", 377 | " # Since we use binary_crossentropy loss, we need binary labels\n", 378 | " class_mode='binary')\n", 379 | "\n", 380 | "validation_generator = test_datagen.flow_from_directory(\n", 381 | " validation_dir,\n", 382 | " target_size=(150, 150),\n", 383 | " batch_size=20,\n", 384 | " class_mode='binary')" 385 | ] 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "metadata": {}, 390 | "source": [ 391 | "Let's take a look at the output of one of these generators: it yields batches of 150x150 RGB images (shape `(20, 150, 150, 3)`) and binary \n", 392 | "labels (shape `(20,)`). 20 is the number of samples in each batch (the batch size). Note that the generator yields these batches \n", 393 | "indefinitely: it just loops endlessly over the images present in the target folder. For this reason, we need to `break` the iteration loop \n", 394 | "at some point." 
395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "for data_batch, labels_batch in train_generator:\n", 404 | " print('data batch shape:', data_batch.shape)\n", 405 | " print('labels batch shape:', labels_batch.shape)\n", 406 | " break" 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "metadata": {}, 412 | "source": [ 413 | "Let's fit our model to the data using the generator. We do it using the `fit_generator` method, the equivalent of `fit` for data generators like ours." 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": {}, 419 | "source": [ 420 | "## Exercise 3: Train the model using the `fit_generator` method:\n", 421 | "\n", 422 | "We need the `steps_per_epoch` and `validation_steps` arguments to specify the batches drawn from the generator since the generator yields data endlessly.\n", 423 | "Note: To run through the entire input dataset, the `steps_per_epoch` argument should be num_inputs_samples/generator_batch_size = 2000/20 = 100." 
424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": {}, 430 | "outputs": [], 431 | "source": [ 432 | "# Train the model using the `fit_generator` method with the `train_generator` created \n", 433 | "# above as the first argument.\n", 434 | "# The other parameters to this API are similar to the `fit` function.\n", 435 | "# `steps_per_epoch` to specify the number of batches to draw from the generator.\n", 436 | "# `epochs` that we want to train the model for.\n", 437 | "# `validation_data` argument which will be the validation generator\n", 438 | "# `validation_steps` to specify the number of batches to draw from the generator.\n", 439 | "# history = model.fit_generator(...)" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": { 445 | "collapsed": true 446 | }, 447 | "source": [ 448 | "It is good practice to always save your models after training:" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "model.save('cats_and_dogs_small_1.h5')" 458 | ] 459 | }, 460 | { 461 | "cell_type": "markdown", 462 | "metadata": {}, 463 | "source": [ 464 | "Keras also provides APIs to load your model from an existing h5 file using `keras.models.load_model`. You can also use the `save_weights` and `load_weights` APIs to only save/restore layer weights. 
" 465 | ] 466 | }, 467 | { 468 | "cell_type": "markdown", 469 | "metadata": {}, 470 | "source": [ 471 | "Let's plot the loss and accuracy of the model over the training and validation data during training:" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "metadata": {}, 478 | "outputs": [], 479 | "source": [ 480 | "import matplotlib.pyplot as plt\n", 481 | "\n", 482 | "acc = history.history['acc']\n", 483 | "val_acc = history.history['val_acc']\n", 484 | "loss = history.history['loss']\n", 485 | "val_loss = history.history['val_loss']\n", 486 | "\n", 487 | "epochs = range(len(acc))\n", 488 | "\n", 489 | "plt.plot(epochs, acc, 'bo', label='Training acc')\n", 490 | "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", 491 | "plt.title('Training and validation accuracy')\n", 492 | "plt.legend()\n", 493 | "\n", 494 | "plt.figure()\n", 495 | "\n", 496 | "plt.plot(epochs, loss, 'bo', label='Training loss')\n", 497 | "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", 498 | "plt.title('Training and validation loss')\n", 499 | "plt.legend()\n", 500 | "\n", 501 | "plt.show()" 502 | ] 503 | }, 504 | { 505 | "cell_type": "markdown", 506 | "metadata": {}, 507 | "source": [ 508 | "Verify that training accuracy increases linearly to reach almost 100%. Training loss should also decrease linearly until it reaches nearly 0.\n", 509 | "Verify that validation accuracy stalls at 70-72% and validation loss reaches its minima after only 5 epochs and stalls.\n", 510 | "These plots are characteristic of overfitting. \n", 511 | "\n", 512 | "We are now going to introduce the first technique to deal with overfitting, specific to computer vision, and used almost universally when processing images with deep learning models: *data augmentation*." 
513 | ] 514 | }, 515 | { 516 | "cell_type": "markdown", 517 | "metadata": {}, 518 | "source": [ 519 | "## Using Data Augmentation\n", 520 | "\n", 521 | "Overfitting is caused by having too few samples to learn from, rendering us unable to train a model able to generalize to new data. Data augmentation takes the approach of generating more training data from existing training samples, by \"augmenting\" the samples via a number of random transformations that yield believable-looking images. The goal is that at training time, our model would never see the exact same picture twice. This helps the model get exposed to more aspects of the data and generalize better.\n", 522 | "\n", 523 | "In Keras, this can be done by configuring a number of random transformations to be performed on the images read by our `ImageDataGenerator` instance. Let's get started with an example:" 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": null, 529 | "metadata": {}, 530 | "outputs": [], 531 | "source": [ 532 | "datagen = ImageDataGenerator(\n", 533 | " rotation_range=40,\n", 534 | " width_shift_range=0.2,\n", 535 | " height_shift_range=0.2,\n", 536 | " shear_range=0.2,\n", 537 | " zoom_range=0.2,\n", 538 | " horizontal_flip=True,\n", 539 | " fill_mode='nearest')" 540 | ] 541 | }, 542 | { 543 | "cell_type": "markdown", 544 | "metadata": {}, 545 | "source": [ 546 | "These are just a few of the options available (for more, see the Keras documentation). 
Let's quickly go over what we just wrote:\n", 547 | "\n", 548 | "* `rotation_range` is a value in degrees (0-180), a range within which to randomly rotate pictures.\n", 549 | "* `width_shift_range` and `height_shift_range` are ranges (as a fraction of total width or height) within which to randomly translate pictures \n", 550 | "horizontally or vertically.\n", 551 | "* `shear_range` is for randomly applying shearing transformations.\n", 552 | "* `zoom_range` is for randomly zooming inside pictures.\n", 553 | "* `horizontal_flip` is for randomly flipping half of the images horizontally -- relevant when there are no assumptions of horizontal \n", 554 | "asymmetry (e.g. real-world pictures).\n", 555 | "* `fill_mode` is the strategy used for filling in newly created pixels, which can appear after a rotation or a width/height shift.\n", 556 | "\n", 557 | "Let's take a look at our augmented images:" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": null, 563 | "metadata": {}, 564 | "outputs": [], 565 | "source": [ 566 | "# This is a module with image preprocessing utilities\n", 567 | "from keras.preprocessing import image\n", 568 | "\n", 569 | "fnames = [os.path.join(train_cats_dir, fname) for fname in os.listdir(train_cats_dir)]\n", 570 | "\n", 571 | "# We pick one image to \"augment\"\n", 572 | "img_path = fnames[3]\n", 573 | "\n", 574 | "# Read the image and resize it\n", 575 | "img = image.load_img(img_path, target_size=(150, 150))\n", 576 | "\n", 577 | "# Convert it to a Numpy array with shape (150, 150, 3)\n", 578 | "x = image.img_to_array(img)\n", 579 | "\n", 580 | "# Reshape it to (1, 150, 150, 3)\n", 581 | "x = x.reshape((1,) + x.shape)\n", 582 | "\n", 583 | "# The .flow() command below generates batches of randomly transformed images.\n", 584 | "# It will loop indefinitely, so we need to `break` the loop at some point!\n", 585 | "i = 0\n", 586 | "for batch in datagen.flow(x, batch_size=1):\n", 587 | " plt.figure(i)\n", 588 | " imgplot = 
plt.imshow(image.array_to_img(batch[0]))\n", 589 | " i += 1\n", 590 | " if i % 4 == 0:\n", 591 | " break\n", 592 | "\n", 593 | "plt.show()" 594 | ] 595 | }, 596 | { 597 | "cell_type": "markdown", 598 | "metadata": {}, 599 | "source": [ 600 | "If we train a new network using this data augmentation configuration, our network will never see the same input twice. However, the inputs \n", 601 | "that it sees are still heavily intercorrelated, since they come from a small number of original images -- we cannot produce new information, \n", 602 | "we can only remix existing information. As such, this might not be quite enough to completely get rid of overfitting. To further fight \n", 603 | "overfitting, we will also add a Dropout layer to our model, right before the densely-connected classifier." 604 | ] 605 | }, 606 | { 607 | "cell_type": "markdown", 608 | "metadata": {}, 609 | "source": [ 610 | "## Exercise 4: Add a Dropout layer right before the fully connected Dense layers with a rate value of 0.5." 
611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": null, 616 | "metadata": {}, 617 | "outputs": [], 618 | "source": [ 619 | "model = models.Sequential()\n", 620 | "model.add(layers.Conv2D(32, (3, 3), activation='relu',\n", 621 | " input_shape=(150, 150, 3)))\n", 622 | "model.add(layers.MaxPooling2D((2, 2)))\n", 623 | "model.add(layers.Conv2D(64, (3, 3), activation='relu'))\n", 624 | "model.add(layers.MaxPooling2D((2, 2)))\n", 625 | "model.add(layers.Conv2D(128, (3, 3), activation='relu'))\n", 626 | "model.add(layers.MaxPooling2D((2, 2)))\n", 627 | "model.add(layers.Conv2D(128, (3, 3), activation='relu'))\n", 628 | "model.add(layers.MaxPooling2D((2, 2)))\n", 629 | "model.add(layers.Flatten())\n", 630 | "# Add the Dropout layer with a dropout rate of 0.5\n", 631 | "# ...\n", 632 | "\n", 633 | "# FCC Dense layers\n", 634 | "model.add(layers.Dense(512, activation='relu'))\n", 635 | "model.add(layers.Dense(1, activation='sigmoid'))" 636 | ] 637 | }, 638 | { 639 | "cell_type": "code", 640 | "execution_count": null, 641 | "metadata": {}, 642 | "outputs": [], 643 | "source": [ 644 | "# Compile your model using a `binary_crossentropy` loss, `RMSprop` \n", 645 | "# optimizer (learning rate is 1e-4) and `accuracy` as one of the metrics.\n", 646 | "model.compile(loss='binary_crossentropy',\n", 647 | " optimizer=optimizers.RMSprop(lr=1e-4),\n", 648 | " metrics=['acc'])" 649 | ] 650 | }, 651 | { 652 | "cell_type": "markdown", 653 | "metadata": {}, 654 | "source": [ 655 | "## Let's train our network using data augmentation and dropout:" 656 | ] 657 | }, 658 | { 659 | "cell_type": "code", 660 | "execution_count": null, 661 | "metadata": {}, 662 | "outputs": [], 663 | "source": [ 664 | "# Define our train ImageDataGenerator from above\n", 665 | "train_datagen = ImageDataGenerator(\n", 666 | " rescale=1./255,\n", 667 | " rotation_range=40,\n", 668 | " width_shift_range=0.2,\n", 669 | " height_shift_range=0.2,\n", 670 | " shear_range=0.2,\n", 671 | " 
zoom_range=0.2,\n", 672 | " horizontal_flip=True,)\n", 673 | "\n", 674 | "# Define our test ImageDataGenerator\n", 675 | "# Note that the validation data should not be augmented!\n", 676 | "test_datagen = ImageDataGenerator(rescale=1./255)\n", 677 | "\n", 678 | "# Define parameters for batch generation from train data\n", 679 | "train_generator = train_datagen.flow_from_directory(\n", 680 | " # This is the target directory\n", 681 | " train_dir,\n", 682 | " # All images will be resized to 150x150\n", 683 | " target_size=(150, 150),\n", 684 | " batch_size=32,\n", 685 | " # Since we use binary_crossentropy loss, we need binary labels\n", 686 | " class_mode='binary')\n", 687 | "\n", 688 | "# Define parameters for batch generation from test data\n", 689 | "validation_generator = test_datagen.flow_from_directory(\n", 690 | " validation_dir,\n", 691 | " target_size=(150, 150),\n", 692 | " batch_size=32,\n", 693 | " class_mode='binary')\n", 694 | "\n", 695 | "# Train the model using `fit_generator`\n", 696 | "history = model.fit_generator(\n", 697 | " train_generator,\n", 698 | " steps_per_epoch=20,\n", 699 | " epochs=20,\n", 700 | " validation_data=validation_generator,\n", 701 | " validation_steps=50)\n", 702 | "# Note you can train for steps_per_epoch=100 and epochs=100 to get an accuracy of 82%.\n", 703 | "# However in the interest of time you can reduce the steps_per_epoch and number of epochs to\n", 704 | "# 50 and 20 respectively." 
705 | ] 706 | }, 707 | { 708 | "cell_type": "markdown", 709 | "metadata": {}, 710 | "source": [ 711 | "Let's plot our results again:" 712 | ] 713 | }, 714 | { 715 | "cell_type": "code", 716 | "execution_count": null, 717 | "metadata": { 718 | "scrolled": true 719 | }, 720 | "outputs": [], 721 | "source": [ 722 | "acc = history.history['acc']\n", 723 | "val_acc = history.history['val_acc']\n", 724 | "loss = history.history['loss']\n", 725 | "val_loss = history.history['val_loss']\n", 726 | "\n", 727 | "epochs = range(len(acc))\n", 728 | "\n", 729 | "plt.plot(epochs, acc, 'bo', label='Training acc')\n", 730 | "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", 731 | "plt.title('Training and validation accuracy')\n", 732 | "plt.legend()\n", 733 | "\n", 734 | "plt.figure()\n", 735 | "\n", 736 | "plt.plot(epochs, loss, 'bo', label='Training loss')\n", 737 | "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", 738 | "plt.title('Training and validation loss')\n", 739 | "plt.legend()\n", 740 | "\n", 741 | "plt.show()" 742 | ] 743 | }, 744 | { 745 | "cell_type": "markdown", 746 | "metadata": {}, 747 | "source": [ 748 | "Verify that the training and validation curves are closely tracking each other.\n", 749 | "Verify that we can now reach an accuracy of 72% for 20 epochs with 50 steps per epoch.\n", 750 | "\n", 751 | "Thanks to data augmentation and dropout, we are no longer overfitting: the training curves are rather closely tracking the validation \n", 752 | "curves. We are now able to reach an accuracy of ~72% if we train for 20 epochs. If we train for 100 epochs we can achieve an accuracy of 82%, a 15% relative improvement over the non-regularized model." 753 | ] 754 | }, 755 | { 756 | "cell_type": "code", 757 | "execution_count": null, 758 | "metadata": {}, 759 | "outputs": [], 760 | "source": [ 761 | "## Exercise(optional): Use L2 regularization in our layers to see if we can achieve even better accuracy." 
762 | ] 763 | }, 764 | { 765 | "cell_type": "markdown", 766 | "metadata": {}, 767 | "source": [ 768 | "## Feature Extraction\n", 769 | "\n", 770 | "The second technique that we will use to train an image classification model on a small dataset is Feature Extraction using a pre-trained network. A pre-trained network is simply a saved network previously trained on a large dataset, typically on a large-scale image classification task. If this original dataset is large enough and general enough, then the spatial feature hierarchy learned by the pre-trained network can effectively act as a generic model of our visual world, and hence its features can prove useful for many different computer vision problems, even though these new problems might involve completely different classes from those of the original task.\n", 771 | "\n", 772 | "We will use the VGG16 architecture, developed by Karen Simonyan and Andrew Zisserman in 2014, a simple and widely used convnet architecture for the ImageNet dataset (1.4 million labeled images and 1000 different classes). Keras comes prepackaged with a number of [image classification models](https://keras.io/applications/). VGG16 is one such model that is available as part of `keras.applications`. All the models are pre-trained on the ImageNet dataset." 773 | ] 774 | }, 775 | { 776 | "cell_type": "markdown", 777 | "metadata": {}, 778 | "source": [ 779 | "Feature extraction consists of using the representations learned by a previous network to extract interesting features from new samples. \n", 780 | "These features are then run through a new classifier, which is trained from scratch.\n", 781 | "\n", 782 | "![swapping FC classifiers](https://s3.amazonaws.com/book.keras.io/img/ch5/swapping_fc_classifier.png)\n", 783 | "\n", 784 | "Note that the level of generality (and therefore reusability) of the representations extracted by specific convolution layers depends on \n", 785 | "the depth of the layer in the model. 
Layers that come earlier in the model extract local, highly generic feature maps (such as visual \n", 786 | "edges, colors, and textures), while layers higher-up extract more abstract concepts (such as \"cat ear\" or \"dog eye\"). So if your new \n", 787 | "dataset differs a lot from the dataset that the original model was trained on, you may be better off using only the first few layers of the \n", 788 | "model to do feature extraction, rather than using the entire convolutional base." 789 | ] 790 | }, 791 | { 792 | "cell_type": "markdown", 793 | "metadata": {}, 794 | "source": [ 795 | "Let's instantiate the VGG16 model:" 796 | ] 797 | }, 798 | { 799 | "cell_type": "code", 800 | "execution_count": null, 801 | "metadata": {}, 802 | "outputs": [], 803 | "source": [ 804 | "from keras.applications import VGG16\n", 805 | "\n", 806 | "conv_base = VGG16(weights='imagenet',\n", 807 | " include_top=False,\n", 808 | " input_shape=(150, 150, 3))" 809 | ] 810 | }, 811 | { 812 | "cell_type": "markdown", 813 | "metadata": {}, 814 | "source": [ 815 | "We passed three arguments to the constructor:\n", 816 | "\n", 817 | "* `weights`, to specify which weight checkpoint to initialize the model from\n", 818 | "* `include_top`, which refers to including or not the densely-connected classifier on top of the network. By default, this \n", 819 | "densely-connected classifier would correspond to the 1000 classes from ImageNet. Since we intend to use our own densely-connected \n", 820 | "classifier (with only two classes, cat and dog), we don't need to include it.\n", 821 | "* `input_shape`, the shape of the image tensors that we will feed to the network. This argument is purely optional: if we don't pass it, \n", 822 | "then the network will be able to process inputs of any size.\n", 823 | "\n", 824 | "Here's the detail of the architecture of the VGG16 convolutional base: it's very similar to the simple convnets that you are already \n", 825 | "familiar with." 
826 | ] 827 | }, 828 | { 829 | "cell_type": "code", 830 | "execution_count": null, 831 | "metadata": {}, 832 | "outputs": [], 833 | "source": [ 834 | "conv_base.summary()" 835 | ] 836 | }, 837 | { 838 | "cell_type": "markdown", 839 | "metadata": {}, 840 | "source": [ 841 | "The final feature map has shape `(4, 4, 512)`. That's the feature on top of which we will stick a densely-connected classifier.\n", 842 | "\n", 843 | "At this point, there are two ways we could proceed: \n", 844 | "\n", 845 | "* Running the convolutional base over our dataset, recording its output to a Numpy array on disk, then using this data as input to a standalone densely-connected classifier. This involves running the convolutional base once for every input image. However, for the exact same reason, this technique would not allow us to leverage data augmentation at \n", 846 | "all.\n", 847 | "* Extending the model we have (`conv_base`) by adding `Dense` layers on top, and running the whole thing end-to-end on the input data. This allows us to use data augmentation, because every input image is going through the convolutional base every time it is seen by the model. However, for this same reason, this technique is far more expensive than the first one.\n", 848 | "\n", 849 | "Let's walk through the code required to set-up the first one: recording the output of `conv_base` on our \n", 850 | "data and using these outputs as inputs to a new model.\n", 851 | "\n", 852 | "We will start by simply running instances of the previously-introduced `ImageDataGenerator` to extract images as Numpy arrays as well as their labels. We will extract features from these images simply by calling the `predict` method of the `conv_base` model." 
853 | ] 854 | }, 855 | { 856 | "cell_type": "code", 857 | "execution_count": null, 858 | "metadata": {}, 859 | "outputs": [], 860 | "source": [ 861 | "import os\n", 862 | "import numpy as np\n", 863 | "from keras.preprocessing.image import ImageDataGenerator\n", 864 | "\n", 865 | "base_dir = '/Users/anjalisridhar/Downloads/cats_and_dogs_small'\n", 866 | "\n", 867 | "train_dir = os.path.join(base_dir, 'train')\n", 868 | "validation_dir = os.path.join(base_dir, 'validation')\n", 869 | "test_dir = os.path.join(base_dir, 'test')\n", 870 | "\n", 871 | "datagen = ImageDataGenerator(rescale=1./255)\n", 872 | "batch_size = 20\n", 873 | "\n", 874 | "def extract_features(directory, sample_count):\n", 875 | " features = np.zeros(shape=(sample_count, 4, 4, 512))\n", 876 | " labels = np.zeros(shape=(sample_count))\n", 877 | " generator = datagen.flow_from_directory(\n", 878 | " directory,\n", 879 | " target_size=(150, 150),\n", 880 | " batch_size=batch_size,\n", 881 | " class_mode='binary')\n", 882 | " i = 0\n", 883 | " for inputs_batch, labels_batch in generator:\n", 884 | " features_batch = conv_base.predict(inputs_batch)\n", 885 | " features[i * batch_size : (i + 1) * batch_size] = features_batch\n", 886 | " labels[i * batch_size : (i + 1) * batch_size] = labels_batch\n", 887 | " i += 1\n", 888 | " if i * batch_size >= sample_count:\n", 889 | " # Note that since generators yield data indefinitely in a loop,\n", 890 | " # we must `break` after every image has been seen once.\n", 891 | " break\n", 892 | " return features, labels\n", 893 | "\n", 894 | "train_features, train_labels = extract_features(train_dir, 2000)\n", 895 | "validation_features, validation_labels = extract_features(validation_dir, 1000)\n", 896 | "test_features, test_labels = extract_features(test_dir, 1000)" 897 | ] 898 | }, 899 | { 900 | "cell_type": "markdown", 901 | "metadata": {}, 902 | "source": [ 903 | "The extracted features are currently of shape `(samples, 4, 4, 512)`. 
We will feed them to a densely-connected classifier, so first we must \n", 904 | "flatten them to `(samples, 8192)`:" 905 | ] 906 | }, 907 | { 908 | "cell_type": "code", 909 | "execution_count": null, 910 | "metadata": {}, 911 | "outputs": [], 912 | "source": [ 913 | "train_features = np.reshape(train_features, (2000, 4 * 4 * 512))\n", 914 | "validation_features = np.reshape(validation_features, (1000, 4 * 4 * 512))\n", 915 | "test_features = np.reshape(test_features, (1000, 4 * 4 * 512))" 916 | ] 917 | }, 918 | { 919 | "cell_type": "markdown", 920 | "metadata": {}, 921 | "source": [ 922 | "At this point, we can define our densely-connected classifier (note the use of dropout for regularization), and train it on the data and labels that we just recorded:" 923 | ] 924 | }, 925 | { 926 | "cell_type": "markdown", 927 | "metadata": {}, 928 | "source": [ 929 | "## Run the trained features through a Dense classifier:" 930 | ] 931 | }, 932 | { 933 | "cell_type": "code", 934 | "execution_count": null, 935 | "metadata": {}, 936 | "outputs": [], 937 | "source": [ 938 | "from keras import models\n", 939 | "from keras import layers\n", 940 | "from keras import optimizers\n", 941 | "\n", 942 | "model = models.Sequential()\n", 943 | "model.add(layers.Dense(256, activation='relu', input_dim=4 * 4 * 512))\n", 944 | "model.add(layers.Dropout(0.5))\n", 945 | "model.add(layers.Dense(1, activation='sigmoid'))\n", 946 | "\n", 947 | "# Compile the model\n", 948 | "model.compile(optimizer=optimizers.RMSprop(lr=2e-5),\n", 949 | " loss='binary_crossentropy',\n", 950 | " metrics=['acc'])\n", 951 | "\n", 952 | "# Train the model using the extracted features and labels.\n", 953 | "history = model.fit(train_features, train_labels,\n", 954 | " epochs=30,\n", 955 | " batch_size=20,\n", 956 | " validation_data=(validation_features, validation_labels))" 957 | ] 958 | }, 959 | { 960 | "cell_type": "markdown", 961 | "metadata": {}, 962 | "source": [ 963 | "Training is very fast, since we only have to 
deal with two `Dense` layers -- an epoch takes less than one second even on CPU. You should have reached a validation accuracy of ~90% in 30 epochs.\n", 964 | "\n", 965 | "Let's take a look at the loss and accuracy curves during training:" 966 | ] 967 | }, 968 | { 969 | "cell_type": "code", 970 | "execution_count": null, 971 | "metadata": {}, 972 | "outputs": [], 973 | "source": [ 974 | "import matplotlib.pyplot as plt\n", 975 | "\n", 976 | "acc = history.history['acc']\n", 977 | "val_acc = history.history['val_acc']\n", 978 | "loss = history.history['loss']\n", 979 | "val_loss = history.history['val_loss']\n", 980 | "\n", 981 | "epochs = range(len(acc))\n", 982 | "\n", 983 | "plt.plot(epochs, acc, 'bo', label='Training acc')\n", 984 | "plt.plot(epochs, val_acc, 'b', label='Validation acc')\n", 985 | "plt.title('Training and validation accuracy')\n", 986 | "plt.legend()\n", 987 | "\n", 988 | "plt.figure()\n", 989 | "\n", 990 | "plt.plot(epochs, loss, 'bo', label='Training loss')\n", 991 | "plt.plot(epochs, val_loss, 'b', label='Validation loss')\n", 992 | "plt.title('Training and validation loss')\n", 993 | "plt.legend()\n", 994 | "\n", 995 | "plt.show()" 996 | ] 997 | }, 998 | { 999 | "cell_type": "markdown", 1000 | "metadata": {}, 1001 | "source": [ 1002 | "We reach a validation accuracy of about 90%, much better than what we could achieve in the previous section (~82%) with our small model trained from \n", 1003 | "scratch. However, our plots also indicate that we are overfitting almost from the start -- despite using dropout with a fairly large rate. \n", 1004 | "This is because this technique does not leverage data augmentation, which is essential to preventing overfitting with small image datasets.\n", 1005 | "\n", 1006 | "\n", 1007 | "Another technique used to deal with small datasets is *Fine Tuning*. As mentioned in the beginning of the notebook, this technique is extremely slow and is practically impossible to run on CPU. 
You will need a GPU for this technique. You can create a `Sequential` model and add the `conv_base` as one of its layers. The output of the base is then flattened and run through a Dense fully connected network as we have seen before.\n", 1008 | "\n", 1009 | "The steps for fine-tuning a network are as follows:\n", 1010 | "\n", 1011 | "* 1) Add your custom network on top of an already trained base network.\n", 1012 | "* 2) Freeze the base network.\n", 1013 | "* 3) Train the part you added.\n", 1014 | "* 4) Unfreeze some layers in the base network.\n", 1015 | "* 5) Jointly train both these layers and the part you added.\n", 1016 | "\n", 1017 | "\n", 1018 | "We have already completed the first 3 steps when doing feature extraction. To proceed with the 4th step we will need to unfreeze our `conv_base`, and then freeze individual layers inside of it. You can set the `trainable` property of a layer as needed. \n", 1019 | "\n", 1020 | "Note: We need to first train the dense classifier we added to the pre-trained network before unfreezing the convolutional base. This is to prevent a large error signal from propagating through the network during training. The representations learned by the network previously will be destroyed otherwise." 
1021 | ] 1022 | }, 1023 | { 1024 | "cell_type": "code", 1025 | "execution_count": null, 1026 | "metadata": {}, 1027 | "outputs": [], 1028 | "source": [ 1029 | "# Exercise 1: Solution\n", 1030 | "# model.add(layers.Conv2D(64, (3, 3), activation='relu'))\n", 1031 | "# model.add(layers.MaxPooling2D((2, 2)))\n", 1032 | "# model.add(layers.Conv2D(128, (3, 3), activation='relu'))\n", 1033 | "# model.add(layers.MaxPooling2D((2, 2)))\n", 1034 | "# model.add(layers.Conv2D(128, (3, 3), activation='relu'))" 1035 | ] 1036 | }, 1037 | { 1038 | "cell_type": "code", 1039 | "execution_count": null, 1040 | "metadata": {}, 1041 | "outputs": [], 1042 | "source": [ 1043 | "# Exercise 2: Solution\n", 1044 | "# Our model from the previous section\n", 1045 | "# model = models.Sequential()\n", 1046 | "# model.add(layers.Conv2D(32, (3, 3), activation='relu',\n", 1047 | "# input_shape=(150, 150, 3)))\n", 1048 | "# model.add(layers.MaxPooling2D((2, 2)))\n", 1049 | "# model.add(layers.Conv2D(64, (3, 3), activation='relu'))\n", 1050 | "# model.add(layers.MaxPooling2D((2, 2)))\n", 1051 | "# model.add(layers.Conv2D(128, (3, 3), activation='relu'))\n", 1052 | "# model.add(layers.MaxPooling2D((2, 2)))\n", 1053 | "# model.add(layers.Conv2D(128, (3, 3), activation='relu'))\n", 1054 | "# model.add(layers.MaxPooling2D((2, 2)))\n", 1055 | "\n", 1056 | "# The newly added Flatten and Dense layers\n", 1057 | "# model.add(layers.Flatten())\n", 1058 | "# model.add(layers.Dense(512, activation='relu'))\n", 1059 | "# model.add(layers.Dense(1, activation='sigmoid'))" 1060 | ] 1061 | }, 1062 | { 1063 | "cell_type": "code", 1064 | "execution_count": null, 1065 | "metadata": {}, 1066 | "outputs": [], 1067 | "source": [ 1068 | "# Exercise 3: Solution\n", 1069 | "# history = model.fit_generator(\n", 1070 | "# train_generator,\n", 1071 | "# steps_per_epoch=50,\n", 1072 | "# epochs=30,\n", 1073 | "# validation_data=validation_generator,\n", 1074 | "# validation_steps=50)" 1075 | ] 1076 | }, 1077 | { 1078 | 
"cell_type": "code", 1079 | "execution_count": null, 1080 | "metadata": {}, 1081 | "outputs": [], 1082 | "source": [ 1083 | "# Exercise 4: Solution\n", 1084 | "# model.add(layers.Dropout(0.5))" 1085 | ] 1086 | } 1087 | ], 1088 | "metadata": { 1089 | "kernelspec": { 1090 | "display_name": "Python 2", 1091 | "language": "python", 1092 | "name": "python2" 1093 | }, 1094 | "language_info": { 1095 | "codemirror_mode": { 1096 | "name": "ipython", 1097 | "version": 2 1098 | }, 1099 | "file_extension": ".py", 1100 | "mimetype": "text/x-python", 1101 | "name": "python", 1102 | "nbconvert_exporter": "python", 1103 | "pygments_lexer": "ipython2", 1104 | "version": "2.7.15" 1105 | } 1106 | }, 1107 | "nbformat": 4, 1108 | "nbformat_minor": 2 1109 | } 1110 | --------------------------------------------------------------------------------