├── Attention_is_All_You_Need.ipynb ├── CNN_Text_classification.ipynb ├── CNN_image_Classification.ipynb ├── Intro_to_Deep_Neural_Networks.ipynb ├── PCA_and_LDA.ipynb ├── README.md ├── Word_Embeddings.ipynb ├── classification.ipynb ├── classification_test_bench.ipynb ├── regression_&_minimization.ipynb ├── sentiment_analysis.ipynb └── training_a_neural_network.ipynb /Attention_is_All_You_Need.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.8.5" 21 | }, 22 | "colab": { 23 | "name": "Copy of Copy of 6 - Attention is All You Need.ipynb", 24 | "provenance": [], 25 | "collapsed_sections": [ 26 | "3WKffF0Gy_x6", 27 | "q12JMc8by_x7", 28 | "3XbLg8mty_x7", 29 | "CCX5MMGky_x8" 30 | ], 31 | "toc_visible": true 32 | }, 33 | "accelerator": "GPU" 34 | }, 35 | "cells": [ 36 | { 37 | "cell_type": "markdown", 38 | "metadata": { 39 | "id": "sOd68EzV2wcI" 40 | }, 41 | "source": [ 42 | "# Huggingface Transformers intro" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "metadata": { 48 | "colab": { 49 | "base_uri": "https://localhost:8080/" 50 | }, 51 | "id": "p3dmhxc_3nXu", 52 | "outputId": "f55fe066-0b12-457a-bb1f-d155da34c142" 53 | }, 54 | "source": [ 55 | "!pip install transformers" 56 | ], 57 | "execution_count": 2, 58 | "outputs": [ 59 | { 60 | "output_type": "stream", 61 | "text": [ 62 | "Collecting transformers\n", 63 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)\n", 64 | "\u001b[K |████████████████████████████████| 2.3MB 7.4MB/s \n", 65 | "\u001b[?25hRequirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from transformers) (20.9)\n", 66 | "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers) (3.0.12)\n", 67 | "Collecting tokenizers<0.11,>=0.10.1\n", 68 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)\n", 69 | "\u001b[K |████████████████████████████████| 3.3MB 47.5MB/s \n", 70 | "\u001b[?25hCollecting huggingface-hub==0.0.8\n", 71 | " Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl\n", 72 | "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers) (4.41.1)\n", 73 | "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (2019.12.20)\n", 74 | "Collecting sacremoses\n", 75 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)\n", 76 | "\u001b[K |████████████████████████████████| 901kB 50.8MB/s \n", 77 | "\u001b[?25hRequirement already satisfied: importlib-metadata; 
python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from transformers) (4.5.0)\n", 78 | "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (1.19.5)\n", 79 | "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers) (2.23.0)\n", 80 | "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->transformers) (2.4.7)\n", 81 | "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (7.1.2)\n", 82 | "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.15.0)\n", 83 | "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.0.1)\n", 84 | "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->transformers) (3.4.1)\n", 85 | "Requirement already satisfied: typing-extensions>=3.6.4; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->transformers) (3.7.4.3)\n", 86 | "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2.10)\n", 87 | "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (1.24.3)\n", 88 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2021.5.30)\n", 89 | "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (3.0.4)\n", 90 | "Installing collected packages: tokenizers, huggingface-hub, sacremoses, transformers\n", 91 | "Successfully installed huggingface-hub-0.0.8 sacremoses-0.0.45 tokenizers-0.10.3 transformers-4.6.1\n" 92 | ], 93 | "name": "stdout" 94 | } 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "metadata": { 100 | "colab": { 101 | "base_uri": "https://localhost:8080/" 102 | }, 103 | "id": "Pa2vsNYi2tzK", 104 | "outputId": "cd0a66ae-0dab-435b-b01c-8481d0482141" 105 | }, 106 | "source": [ 107 | "from transformers import BertTokenizer, BertForMaskedLM\n", 108 | "from torch.nn import functional as F\n", 109 | "import torch\n", 110 | "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n", 111 | "model = BertForMaskedLM.from_pretrained('bert-base-uncased', return_dict = True)\n", 112 | "text = \"The capital of France, [MASK] contains the Eiffel Tower.\"\n", 113 | "\n", 114 | "\n", 115 | "input = tokenizer.encode_plus(text, return_tensors = \"pt\")\n", 116 | "mask_index = torch.where(input[\"input_ids\"][0] == tokenizer.mask_token_id)\n", 117 | "output = model(**input)\n", 118 | "logits = output.logits\n", 119 | "softmax = F.softmax(logits, dim = -1)\n", 120 | "mask_word = softmax[0, mask_index, :]\n", 121 | "top_10 = torch.topk(mask_word, 10, dim = 1)[1][0]\n", 122 | "\n", 123 | "print('\\n\\nSuggestions:')\n", 124 | "for token in top_10:\n", 125 | " word = tokenizer.decode([token])\n", 126 | " new_sentence = text.replace(tokenizer.mask_token, word)\n", 127 | " print(new_sentence)" 128 | ], 129 | "execution_count": 19, 130 | "outputs": [ 131 | { 132 | "output_type": "stream", 133 | "text": [ 134 | "Some weights of the model checkpoint at bert-base-uncased were not used when initializing 
BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n", 135 | "- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", 136 | "- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" 137 | ], 138 | "name": "stderr" 139 | }, 140 | { 141 | "output_type": "stream", 142 | "text": [ 143 | "\n", 144 | "\n", 145 | "Suggestions:\n", 146 | "The capital of France, paris contains the Eiffel Tower.\n", 147 | "The capital of France, it contains the Eiffel Tower.\n", 148 | "The capital of France, which contains the Eiffel Tower.\n", 149 | "The capital of France, versailles contains the Eiffel Tower.\n", 150 | "The capital of France, brussels contains the Eiffel Tower.\n", 151 | "The capital of France, monaco contains the Eiffel Tower.\n", 152 | "The capital of France, and contains the Eiffel Tower.\n", 153 | "The capital of France, orleans contains the Eiffel Tower.\n", 154 | "The capital of France, lyon contains the Eiffel Tower.\n", 155 | "The capital of France, france contains the Eiffel Tower.\n" 156 | ], 157 | "name": "stdout" 158 | } 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "metadata": { 164 | "id": "DmjYmuge3fKB" 165 | }, 166 | "source": [ 167 | "" 168 | ], 169 | "execution_count": 10, 170 | "outputs": [] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "metadata": { 175 | "id": "dHAsY11Q3fVr" 176 | }, 177 | "source": [ 178 | "" 179 | ], 180 | "execution_count": null, 181 | "outputs": [] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": { 186 | "id": "yxdChIEky_xy" 187 | }, 188 | "source": [ 189 | "## Preparing the Data" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "metadata": { 195 | "id": "dCawn1l9y_xz" 196 | }, 197 | "source": [ 198 | "import torch\n", 199 | "import torch.nn as nn\n", 200 | "import torch.optim as optim\n", 201 | "\n", 202 | "import torchtext\n", 203 | "from torchtext.legacy.datasets import Multi30k\n", 204 | "from torchtext.legacy.data import Field, BucketIterator\n", 205 | "\n", 206 | "import matplotlib.pyplot as plt\n", 207 | "import matplotlib.ticker as ticker\n", 208 | "\n", 209 | "import spacy\n", 210 | "import numpy as np\n", 211 | "\n", 212 | "import random\n", 213 | "import math\n", 214 | "import time" 215 | ], 216 | "execution_count": null, 217 | "outputs": [] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "metadata": { 222 | "id": "4BsXoOXTy_xz" 223 | }, 224 | "source": [ 225 | "SEED = 1234\n", 226 | "\n", 227 | "random.seed(SEED)\n", 228 | "np.random.seed(SEED)\n", 229 | "torch.manual_seed(SEED)\n", 230 | "torch.cuda.manual_seed(SEED)\n", 231 | "torch.backends.cudnn.deterministic = True" 232 | ], 233 | "execution_count": null, 234 | "outputs": [] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "metadata": { 239 | "id": "V-zSIxEWy_x0", 240 | "colab": { 241 | "base_uri": "https://localhost:8080/" 242 | }, 243 | "outputId": "9eb3f52a-3be8-4495-d638-b871d37ac750" 244 | }, 245 | "source": [ 246 | "!python -m spacy download de_core_news_sm\n", 247 | "!python -m spacy download en_core_web_sm\n", 248 | "\n", 249 | "import de_core_news_sm\n", 250 | "spacy_de = de_core_news_sm.load()\n", 251 | "spacy_en = spacy.load('en_core_web_sm')" 252 | ], 253 
| "execution_count": null, 254 | "outputs": [ 255 | { 256 | "output_type": "stream", 257 | "text": [ 258 | "Collecting de_core_news_sm==2.2.5\n", 259 | "\u001b[?25l Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9MB)\n", 260 | "\u001b[K |████████████████████████████████| 14.9MB 5.8MB/s \n", 261 | "\u001b[?25hRequirement already satisfied: spacy>=2.2.2 in /usr/local/lib/python3.7/dist-packages (from de_core_news_sm==2.2.5) (2.2.4)\n", 262 | "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->de_core_news_sm==2.2.5) (3.0.5)\n", 263 | "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->de_core_news_sm==2.2.5) (1.0.0)\n", 264 | "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->de_core_news_sm==2.2.5) (2.23.0)\n", 265 | "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->de_core_news_sm==2.2.5) (2.0.5)\n", 266 | "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->de_core_news_sm==2.2.5) (0.8.2)\n", 267 | "Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->de_core_news_sm==2.2.5) (1.0.5)\n", 268 | "Requirement already satisfied: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->de_core_news_sm==2.2.5) (1.1.3)\n", 269 | "Requirement already satisfied: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->de_core_news_sm==2.2.5) (0.4.1)\n", 270 | "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->de_core_news_sm==2.2.5) (1.0.5)\n", 271 | "Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->de_core_news_sm==2.2.5) (1.19.5)\n", 272 | "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->de_core_news_sm==2.2.5) (57.0.0)\n", 273 | "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->de_core_news_sm==2.2.5) (4.41.1)\n", 274 | "Requirement already satisfied: thinc==7.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->de_core_news_sm==2.2.5) (7.4.0)\n", 275 | "Requirement already satisfied: importlib-metadata>=0.20; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->de_core_news_sm==2.2.5) (4.5.0)\n", 276 | "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->de_core_news_sm==2.2.5) (3.0.4)\n", 277 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->de_core_news_sm==2.2.5) (2021.5.30)\n", 278 | "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->de_core_news_sm==2.2.5) (2.10)\n", 279 | "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->de_core_news_sm==2.2.5) (1.24.3)\n", 280 | "Requirement already satisfied: zipp>=0.5 in 
/usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20; python_version < \"3.8\"->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->de_core_news_sm==2.2.5) (3.4.1)\n", 281 | "Requirement already satisfied: typing-extensions>=3.6.4; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20; python_version < \"3.8\"->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->de_core_news_sm==2.2.5) (3.7.4.3)\n", 282 | "Building wheels for collected packages: de-core-news-sm\n", 283 | " Building wheel for de-core-news-sm (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 284 | " Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-cp37-none-any.whl size=14907055 sha256=cdb065979574a22e5404ade47691dfce21af87caf602f3dc07bd7e9c593d0d56\n", 285 | " Stored in directory: /tmp/pip-ephem-wheel-cache-9vxxhv5b/wheels/ba/3f/ed/d4aa8e45e7191b7f32db4bfad565e7da1edbf05c916ca7a1ca\n", 286 | "Successfully built de-core-news-sm\n", 287 | "Installing collected packages: de-core-news-sm\n", 288 | "Successfully installed de-core-news-sm-2.2.5\n", 289 | "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n", 290 | "You can now load the model via spacy.load('de_core_news_sm')\n", 291 | "Requirement already satisfied: en_core_web_sm==2.2.5 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz#egg=en_core_web_sm==2.2.5 in /usr/local/lib/python3.7/dist-packages (2.2.5)\n", 292 | "Requirement already satisfied: spacy>=2.2.2 in /usr/local/lib/python3.7/dist-packages (from en_core_web_sm==2.2.5) (2.2.4)\n", 293 | "Requirement already satisfied: thinc==7.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (7.4.0)\n", 294 | "Requirement already satisfied: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (0.4.1)\n", 295 | "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.0.5)\n", 296 | "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (0.8.2)\n", 297 | "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (4.41.1)\n", 298 | "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (2.0.5)\n", 299 | "Requirement already satisfied: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.1.3)\n", 300 | "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (57.0.0)\n", 301 | "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.0.0)\n", 302 | "Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.19.5)\n", 303 | "Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (1.0.5)\n", 304 | "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (2.23.0)\n", 305 | "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in 
/usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->en_core_web_sm==2.2.5) (3.0.5)\n", 306 | "Requirement already satisfied: importlib-metadata>=0.20; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->en_core_web_sm==2.2.5) (4.5.0)\n", 307 | "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (2.10)\n", 308 | "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (1.24.3)\n", 309 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (2021.5.30)\n", 310 | "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->en_core_web_sm==2.2.5) (3.0.4)\n", 311 | "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20; python_version < \"3.8\"->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->en_core_web_sm==2.2.5) (3.4.1)\n", 312 | "Requirement already satisfied: typing-extensions>=3.6.4; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20; python_version < \"3.8\"->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->en_core_web_sm==2.2.5) (3.7.4.3)\n", 313 | "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n", 314 | "You can now load the model via spacy.load('en_core_web_sm')\n" 315 | ], 316 | "name": "stdout" 317 | } 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "metadata": { 323 | "id": "kLB6GI7ry_x0" 324 | }, 325 | "source": [ 326 | "def tokenize_de(text):\n", 327 | " \"\"\"\n", 328 | " Tokenizes German text from a string into a list of strings\n", 329 | " \"\"\"\n", 330 | " return [tok.text for tok in spacy_de.tokenizer(text)]\n", 331 | "\n", 332 | "def tokenize_en(text):\n", 333 | " \"\"\"\n", 334 | " Tokenizes English text from a string into a list of strings\n", 335 | " \"\"\"\n", 336 | " return [tok.text for tok in spacy_en.tokenizer(text)]" 337 | ], 338 | "execution_count": null, 339 | "outputs": [] 340 | }, 341 | { 342 | "cell_type": "markdown", 343 | "metadata": { 344 | "id": "D6y3fPJmy_x0" 345 | }, 346 | "source": [ 347 | "Our fields are the same as the previous notebook. The model expects data to be fed in with the batch dimension first, so we use `batch_first = True`. 
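As a quick sanity check, the tokenizers can be tried on a sample pair (the sentences and outputs here are illustrative):

```python
print(tokenize_de('Zwei Männer stehen am Herd.'))
# e.g. ['Zwei', 'Männer', 'stehen', 'am', 'Herd', '.']

print(tokenize_en('Two men are standing by the stove.'))
# e.g. ['Two', 'men', 'are', 'standing', 'by', 'the', 'stove', '.']
```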
" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "metadata": { 353 | "id": "yXCOKl5Fy_x1" 354 | }, 355 | "source": [ 356 | "SRC = Field(tokenize = tokenize_de, \n", 357 | " init_token = '', \n", 358 | " eos_token = '', \n", 359 | " lower = True, \n", 360 | " batch_first = True)\n", 361 | "\n", 362 | "TRG = Field(tokenize = tokenize_en, \n", 363 | " init_token = '', \n", 364 | " eos_token = '', \n", 365 | " lower = True, \n", 366 | " batch_first = True)" 367 | ], 368 | "execution_count": null, 369 | "outputs": [] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "metadata": { 374 | "id": "QKvxT6bLy_x2" 375 | }, 376 | "source": [ 377 | "train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), \n", 378 | " fields = (SRC, TRG))" 379 | ], 380 | "execution_count": null, 381 | "outputs": [] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "metadata": { 386 | "id": "T3MFok3Ly_x2" 387 | }, 388 | "source": [ 389 | "SRC.build_vocab(train_data, min_freq = 2)\n", 390 | "TRG.build_vocab(train_data, min_freq = 2)" 391 | ], 392 | "execution_count": null, 393 | "outputs": [] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "metadata": { 398 | "id": "jZ1XMthqy_x3" 399 | }, 400 | "source": [ 401 | "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')" 402 | ], 403 | "execution_count": null, 404 | "outputs": [] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "metadata": { 409 | "id": "wUXHZuuCy_x3" 410 | }, 411 | "source": [ 412 | "BATCH_SIZE = 32\n", 413 | "\n", 414 | "train_iterator, valid_iterator, test_iterator = BucketIterator.splits(\n", 415 | " (train_data, valid_data, test_data), \n", 416 | " batch_size = BATCH_SIZE,\n", 417 | " device = device)" 418 | ], 419 | "execution_count": null, 420 | "outputs": [] 421 | }, 422 | { 423 | "cell_type": "markdown", 424 | "metadata": { 425 | "id": "c-bHe9BUy_x3" 426 | }, 427 | "source": [ 428 | "## Building the Model\n", 429 | "\n", 430 | "### Encoder\n", 431 | "\n", 432 | "Similar to the ConvSeq2Seq model, the Transformer's encoder does not attempt to compress the entire source sentence, $X = (x_1, ... ,x_n)$, into a single context vector, $z$. Instead it produces a sequence of context vectors, $Z = (z_1, ... , z_n)$. So, if our input sequence was 5 tokens long we would have $Z = (z_1, z_2, z_3, z_4, z_5)$. Why do we call this a sequence of context vectors and not a sequence of hidden states? A hidden state at time $t$ in an RNN has only seen tokens $x_t$ and all the tokens before it. However, each context vector here has seen all tokens at all positions within the input sequence.\n", 433 | "\n", 434 | "![](https://github.com/bentrevett/pytorch-seq2seq/blob/master/assets/transformer-encoder.png?raw=1)\n", 435 | "\n", 436 | "First, the tokens are passed through a standard embedding layer. Next, as the model has no recurrent it has no idea about the order of the tokens within the sequence. We solve this by using a second embedding layer called a *positional embedding layer*. This is a standard embedding layer where the input is not the token itself but the position of the token within the sequence, starting with the first token, the `` (start of sequence) token, in position 0. The position embedding has a \"vocabulary\" size of 100, which means our model can accept sentences up to 100 tokens long. This can be increased if we want to handle longer sentences.\n", 437 | "\n", 438 | "The original Transformer implementation from the Attention is All You Need paper does not learn positional embeddings. 
Modern Transformer architectures, like BERT, use learned positional embeddings instead, hence we have decided to use them in these tutorials. Check out [this](http://nlp.seas.harvard.edu/2018/04/03/attention.html#positional-encoding) section to read more about the positional embeddings used in the original Transformer model.\n", 439 | "\n", 440 | "Next, the token and positional embeddings are elementwise summed together to get a vector which contains information about the token and also its position within the sequence. However, before they are summed, the token embeddings are multiplied by a scaling factor which is $\sqrt{d_{model}}$, where $d_{model}$ is the hidden dimension size, `hid_dim`. This supposedly reduces variance in the embeddings and the model is difficult to train reliably without this scaling factor. Dropout is then applied to the combined embeddings.\n", 441 | "\n", 442 | "The combined embeddings are then passed through $N$ *encoder layers* to get $Z$, which is then output and can be used by the decoder.\n", 443 | "\n", 444 | "The source mask, `src_mask`, is simply the same shape as the source sentence but has a value of 1 when the token in the source sentence is not a `<pad>` token and 0 when it is a `<pad>` token. This is used in the encoder layers to mask the multi-head attention mechanisms, which are used to calculate and apply attention over the source sentence, so the model does not pay attention to `<pad>` tokens, which contain no useful information." 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "metadata": { 450 | "id": "5qMAHun7y_x4" 451 | }, 452 | "source": [ 453 | "class Encoder(nn.Module):\n", 454 | " def __init__(self, \n", 455 | " input_dim, \n", 456 | " hid_dim, \n", 457 | " n_layers, \n", 458 | " n_heads, \n", 459 | " pf_dim,\n", 460 | " dropout, \n", 461 | " device,\n", 462 | " max_length = 100):\n", 463 | " super().__init__()\n", 464 | "\n", 465 | " self.device = device\n", 466 | " \n", 467 | " self.tok_embedding = nn.Embedding(input_dim, hid_dim)\n", 468 | " self.pos_embedding = nn.Embedding(max_length, hid_dim)\n", 469 | " \n", 470 | " self.layers = nn.ModuleList([EncoderLayer(hid_dim, \n", 471 | " n_heads, \n", 472 | " pf_dim,\n", 473 | " dropout, \n", 474 | " device) \n", 475 | " for _ in range(n_layers)])\n", 476 | " \n", 477 | " self.dropout = nn.Dropout(dropout)\n", 478 | " \n", 479 | " self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)\n", 480 | " \n", 481 | " def forward(self, src, src_mask):\n", 482 | " \n", 483 | " #src = [batch size, src len]\n", 484 | " #src_mask = [batch size, 1, 1, src len]\n", 485 | " \n", 486 | " batch_size = src.shape[0]\n", 487 | " src_len = src.shape[1]\n", 488 | " \n", 489 | " pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)\n", 490 | " \n", 491 | " #pos = [batch size, src len]\n", 492 | " \n", 493 | " src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))\n", 494 | " \n", 495 | " #src = [batch size, src len, hid dim]\n", 496 | " \n", 497 | " for layer in self.layers:\n", 498 | " src = layer(src, src_mask)\n", 499 | " \n", 500 | " #src = [batch size, src len, hid dim]\n", 501 | " \n", 502 | " return src" 503 | ], 504 | "execution_count": null, 505 | "outputs": [] 506 | }, 507 | { 508 | "cell_type": "markdown", 509 | "metadata": { 510 | "id": "BZ0--Yd2y_x5" 511 | }, 512 | "source": [ 513 | "### Encoder Layer\n", 514 | "\n", 515 | "The encoder layers are where all of the \"meat\" of the encoder is contained. 
We first pass the source sentence and its mask into the *multi-head attention layer*, then perform dropout on it, apply a residual connection and pass it through a [Layer Normalization](https://arxiv.org/abs/1607.06450) layer. We then pass it through a *position-wise feedforward* layer and then, again, apply dropout, a residual connection and then layer normalization to get the output of this layer, which is fed into the next layer. The parameters are not shared between layers. \n", 516 | "\n", 517 | "The multi-head attention layer is used by the encoder layer to attend to the source sentence, i.e. it is calculating and applying attention over itself instead of another sequence, hence we call it *self attention*.\n", 518 | "\n", 519 | "[This](https://mlexplained.com/2018/01/13/weight-normalization-and-layer-normalization-explained-normalization-in-deep-learning-part-2/) article goes into more detail about layer normalization, but the gist is that it normalizes the values of the features, i.e. across the hidden dimension, so each feature has a mean of 0 and a standard deviation of 1. This allows neural networks with a larger number of layers, like the Transformer, to be trained more easily." 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "metadata": { 525 | "id": "HMJlQoNCy_x6" 526 | }, 527 | "source": [ 528 | "class EncoderLayer(nn.Module):\n", 529 | " def __init__(self, \n", 530 | " hid_dim, \n", 531 | " n_heads, \n", 532 | " pf_dim, \n", 533 | " dropout, \n", 534 | " device):\n", 535 | " super().__init__()\n", 536 | " \n", 537 | " self.self_attn_layer_norm = nn.LayerNorm(hid_dim)\n", 538 | " self.ff_layer_norm = nn.LayerNorm(hid_dim)\n", 539 | " self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)\n", 540 | " self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, \n", 541 | " pf_dim, \n", 542 | " dropout)\n", 543 | " self.dropout = nn.Dropout(dropout)\n", 544 | " \n", 545 | " def forward(self, src, src_mask):\n", 546 | " \n", 547 | " #src = [batch size, src len, hid dim]\n", 548 | " #src_mask = [batch size, 1, 1, src len] \n", 549 | " \n", 550 | " #self attention\n", 551 | " _src, _ = self.self_attention(src, src, src, src_mask)\n", 552 | " \n", 553 | " #dropout, residual connection and layer norm\n", 554 | " src = self.self_attn_layer_norm(src + self.dropout(_src))\n", 555 | " \n", 556 | " #src = [batch size, src len, hid dim]\n", 557 | " \n", 558 | " #positionwise feedforward\n", 559 | " _src = self.positionwise_feedforward(src)\n", 560 | " \n", 561 | " #dropout, residual and layer norm\n", 562 | " src = self.ff_layer_norm(src + self.dropout(_src))\n", 563 | " \n", 564 | " #src = [batch size, src len, hid dim]\n", 565 | " \n", 566 | " return src" 567 | ], 568 | "execution_count": null, 569 | "outputs": [] 570 | }, 571 | { 572 | "cell_type": "markdown", 573 | "metadata": { 574 | "id": "3WKffF0Gy_x6" 575 | }, 576 | "source": [ 577 | "### Multi-Head Attention Layer\n", 578 | "\n", 579 | "One of the key, novel concepts introduced by the Transformer paper is the *multi-head attention layer*. 
\n", 580 | "\n", 581 | "![](https://github.com/bentrevett/pytorch-seq2seq/blob/master/assets/transformer-attention.png?raw=1)\n", 582 | "\n", 583 | "Attention can be though of as *queries*, *keys* and *values* - where the query is used with the key to get an attention vector (usually the output of a *softmax* operation and has all values between 0 and 1 which sum to 1) which is then used to get a weighted sum of the values.\n", 584 | "\n", 585 | "The Transformer uses *scaled dot-product attention*, where the query and key are combined by taking the dot product between them, then applying the softmax operation and scaling by $d_k$ before finally then multiplying by the value. $d_k$ is the *head dimension*, `head_dim`, which we will shortly explain further.\n", 586 | "\n", 587 | "$$ \\text{Attention}(Q, K, V) = \\text{Softmax} \\big( \\frac{QK^T}{\\sqrt{d_k}} \\big)V $$ \n", 588 | "\n", 589 | "This is similar to standard *dot product attention* but is scaled by $d_k$, which the paper states is used to stop the results of the dot products growing large, causing gradients to become too small.\n", 590 | "\n", 591 | "However, the scaled dot-product attention isn't simply applied to the queries, keys and values. Instead of doing a single attention application the queries, keys and values have their `hid_dim` split into $h$ *heads* and the scaled dot-product attention is calculated over all heads in parallel. This means instead of paying attention to one concept per attention application, we pay attention to $h$. We then re-combine the heads into their `hid_dim` shape, thus each `hid_dim` is potentially paying attention to $h$ different concepts.\n", 592 | "\n", 593 | "$$ \\text{MultiHead}(Q, K, V) = \\text{Concat}(\\text{head}_1,...,\\text{head}_h)W^O $$\n", 594 | "\n", 595 | "$$\\text{head}_i = \\text{Attention}(QW_i^Q, KW_i^K, VW_i^V) $$\n", 596 | "\n", 597 | "$W^O$ is the linear layer applied at the end of the multi-head attention layer, `fc`. $W^Q, W^K, W^V$ are the linear layers `fc_q`, `fc_k` and `fc_v`.\n", 598 | "\n", 599 | "Walking through the module, first we calculate $QW^Q$, $KW^K$ and $VW^V$ with the linear layers, `fc_q`, `fc_k` and `fc_v`, to give us `Q`, `K` and `V`. Next, we split the `hid_dim` of the query, key and value into `n_heads` using `.view` and correctly permute them so they can be multiplied together. We then calculate the `energy` (the un-normalized attention) by multiplying `Q` and `K` together and scaling it by the square root of `head_dim`, which is calulated as `hid_dim // n_heads`. We then mask the energy so we do not pay attention over any elements of the sequeuence we shouldn't, then apply the softmax and dropout. We then apply the attention to the value heads, `V`, before combining the `n_heads` together. Finally, we multiply this $W^O$, represented by `fc_o`. \n", 600 | "\n", 601 | "Note that in our implementation the lengths of the keys and values are always the same, thus when matrix multiplying the output of the softmax, `attention`, with `V` we will always have valid dimension sizes for matrix multiplication. This multiplication is carried out using `torch.matmul` which, when both tensors are >2-dimensional, does a batched matrix multiplication over the last two dimensions of each tensor. 
This will be a **[query len, key len] x [value len, head dim]** batched matrix multiplication over the batch size and each head, which provides the **[batch size, n heads, query len, head dim]** result.\n", 602 | "\n", 603 | "One thing that looks strange at first is that dropout is applied directly to the attention. This means that our attention vector will most probably not sum to 1 and we may pay full attention to a token but have the attention over that token set to 0 by dropout. This is never explained, or even mentioned, in the paper; however, it is used by the [official implementation](https://github.com/tensorflow/tensor2tensor/) and every Transformer implementation since, [including BERT](https://github.com/google-research/bert/)." 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "metadata": { 609 | "id": "U0VFTLGRy_x6" 610 | }, 611 | "source": [ 612 | "class MultiHeadAttentionLayer(nn.Module):\n", 613 | " def __init__(self, hid_dim, n_heads, dropout, device):\n", 614 | " super().__init__()\n", 615 | " \n", 616 | " assert hid_dim % n_heads == 0\n", 617 | " \n", 618 | " self.hid_dim = hid_dim\n", 619 | " self.n_heads = n_heads\n", 620 | " self.head_dim = hid_dim // n_heads\n", 621 | " \n", 622 | " self.fc_q = nn.Linear(hid_dim, hid_dim)\n", 623 | " self.fc_k = nn.Linear(hid_dim, hid_dim)\n", 624 | " self.fc_v = nn.Linear(hid_dim, hid_dim)\n", 625 | " \n", 626 | " self.fc_o = nn.Linear(hid_dim, hid_dim)\n", 627 | " \n", 628 | " self.dropout = nn.Dropout(dropout)\n", 629 | " \n", 630 | " self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)\n", 631 | " \n", 632 | " def forward(self, query, key, value, mask = None):\n", 633 | " \n", 634 | " batch_size = query.shape[0]\n", 635 | " \n", 636 | " #query = [batch size, query len, hid dim]\n", 637 | " #key = [batch size, key len, hid dim]\n", 638 | " #value = [batch size, value len, hid dim]\n", 639 | " \n", 640 | " Q = self.fc_q(query)\n", 641 | " K = self.fc_k(key)\n", 642 | " V = self.fc_v(value)\n", 643 | " \n", 644 | " #Q = [batch size, query len, hid dim]\n", 645 | " #K = [batch size, key len, hid dim]\n", 646 | " #V = [batch size, value len, hid dim]\n", 647 | " \n", 648 | " Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)\n", 649 | " K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)\n", 650 | " V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)\n", 651 | " \n", 652 | " #Q = [batch size, n heads, query len, head dim]\n", 653 | " #K = [batch size, n heads, key len, head dim]\n", 654 | " #V = [batch size, n heads, value len, head dim]\n", 655 | " \n", 656 | " energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale\n", 657 | " \n", 658 | " #energy = [batch size, n heads, query len, key len]\n", 659 | " \n", 660 | " if mask is not None:\n", 661 | " energy = energy.masked_fill(mask == 0, -1e10)\n", 662 | " \n", 663 | " attention = torch.softmax(energy, dim = -1)\n", 664 | " \n", 665 | " #attention = [batch size, n heads, query len, key len]\n", 666 | " \n", 667 | " x = torch.matmul(self.dropout(attention), V)\n", 668 | " \n", 669 | " #x = [batch size, n heads, query len, head dim]\n", 670 | " \n", 671 | " x = x.permute(0, 2, 1, 3).contiguous()\n", 672 | " \n", 673 | " #x = [batch size, query len, n heads, head dim]\n", 674 | " \n", 675 | " x = x.view(batch_size, -1, self.hid_dim)\n", 676 | " \n", 677 | " #x = [batch size, query len, hid dim]\n", 678 | " \n", 679 | " x = self.fc_o(x)\n", 680 | " \n", 681 | " #x = [batch size, query len, hid dim]\n", 682 | " \n", 683 | " return x, attention" 684 | ], 685 | "execution_count": null, 686 | "outputs": [] 687 | },
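As a quick sanity check of the layer above (an illustrative sketch; the sizes are arbitrary, and passing the same tensor as query, key and value gives self-attention):

```python
mha = MultiHeadAttentionLayer(hid_dim = 256, n_heads = 8, dropout = 0.1, device = 'cpu')

x = torch.rand(2, 10, 256)   # [batch size, seq len, hid dim]
out, attn = mha(x, x, x)     # query = key = value -> self-attention

print(out.shape)             # torch.Size([2, 10, 256])   = [batch size, seq len, hid dim]
print(attn.shape)            # torch.Size([2, 8, 10, 10]) = [batch size, n heads, query len, key len]
```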
dim]\n", 682 | " \n", 683 | " return x, attention" 684 | ], 685 | "execution_count": null, 686 | "outputs": [] 687 | }, 688 | { 689 | "cell_type": "markdown", 690 | "metadata": { 691 | "id": "q12JMc8by_x7" 692 | }, 693 | "source": [ 694 | "### Position-wise Feedforward Layer\n", 695 | "\n", 696 | "The other main block inside the encoder layer is the *position-wise feedforward layer* This is relatively simple compared to the multi-head attention layer. The input is transformed from `hid_dim` to `pf_dim`, where `pf_dim` is usually a lot larger than `hid_dim`. The original Transformer used a `hid_dim` of 512 and a `pf_dim` of 2048. The ReLU activation function and dropout are applied before it is transformed back into a `hid_dim` representation. \n", 697 | "\n", 698 | "Why is this used? Unfortunately, it is never explained in the paper.\n", 699 | "\n", 700 | "BERT uses the [GELU](https://arxiv.org/abs/1606.08415) activation function, which can be used by simply switching `torch.relu` for `F.gelu`. Why did they use GELU? Again, it is never explained." 701 | ] 702 | }, 703 | { 704 | "cell_type": "code", 705 | "metadata": { 706 | "id": "tJfnEHOwy_x7" 707 | }, 708 | "source": [ 709 | "class PositionwiseFeedforwardLayer(nn.Module):\n", 710 | " def __init__(self, hid_dim, pf_dim, dropout):\n", 711 | " super().__init__()\n", 712 | " \n", 713 | " self.fc_1 = nn.Linear(hid_dim, pf_dim)\n", 714 | " self.fc_2 = nn.Linear(pf_dim, hid_dim)\n", 715 | " \n", 716 | " self.dropout = nn.Dropout(dropout)\n", 717 | " \n", 718 | " def forward(self, x):\n", 719 | " \n", 720 | " #x = [batch size, seq len, hid dim]\n", 721 | " \n", 722 | " x = self.dropout(torch.relu(self.fc_1(x)))\n", 723 | " \n", 724 | " #x = [batch size, seq len, pf dim]\n", 725 | " \n", 726 | " x = self.fc_2(x)\n", 727 | " \n", 728 | " #x = [batch size, seq len, hid dim]\n", 729 | " \n", 730 | " return x" 731 | ], 732 | "execution_count": null, 733 | "outputs": [] 734 | }, 735 | { 736 | "cell_type": "markdown", 737 | "metadata": { 738 | "id": "3XbLg8mty_x7" 739 | }, 740 | "source": [ 741 | "### Decoder\n", 742 | "\n", 743 | "The objective of the decoder is to take the encoded representation of the source sentence, $Z$, and convert it into predicted tokens in the target sentence, $\\hat{Y}$. We then compare $\\hat{Y}$ with the actual tokens in the target sentence, $Y$, to calculate our loss, which will be used to calculate the gradients of our parameters and then use our optimizer to update our weights in order to improve our predictions. \n", 744 | "\n", 745 | "![](https://github.com/bentrevett/pytorch-seq2seq/blob/master/assets/transformer-decoder.png?raw=1)\n", 746 | "\n", 747 | "The decoder is similar to encoder, however it now has two multi-head attention layers. A *masked multi-head attention layer* over the target sequence, and a multi-head attention layer which uses the decoder representation as the query and the encoder representation as the key and value.\n", 748 | "\n", 749 | "The decoder uses positional embeddings and combines - via an elementwise sum - them with the scaled embedded target tokens, followed by dropout. Again, our positional encodings have a \"vocabulary\" of 100, which means they can accept sequences up to 100 tokens long. This can be increased if desired.\n", 750 | "\n", 751 | "The combined embeddings are then passed through the $N$ decoder layers, along with the encoded source, `enc_src`, and the source and target masks. 
Note that the number of layers in the encoder does not have to be equal to the number of layers in the decoder, even though they are both denoted by $N$.\n", 752 | "\n", 753 | "The decoder representation after the $N^{th}$ layer is then passed through a linear layer, `fc_out`. In PyTorch, the softmax operation is contained within our loss function, so we do not explicitly need to use a softmax layer here.\n", 754 | "\n", 755 | "As well as using the source mask, as we did in the encoder to prevent our model attending to `<pad>` tokens, we also use a target mask. This will be explained further in the `Seq2Seq` model which encapsulates both the encoder and decoder, but the gist of it is that it performs a similar operation as the decoder padding in the convolutional sequence-to-sequence model. As we are processing all of the target tokens at once in parallel, we need a method of stopping the decoder from \"cheating\" by simply \"looking\" at what the next token in the target sequence is and outputting it. \n", 756 | "\n", 757 | "Our decoder layer also outputs the normalized attention values so we can later plot them to see what our model is actually paying attention to." 758 | ] 759 | }, 760 | { 761 | "cell_type": "code", 762 | "metadata": { 763 | "id": "szwZ39lSy_x7" 764 | }, 765 | "source": [ 766 | "class Decoder(nn.Module):\n", 767 | " def __init__(self, \n", 768 | " output_dim, \n", 769 | " hid_dim, \n", 770 | " n_layers, \n", 771 | " n_heads, \n", 772 | " pf_dim, \n", 773 | " dropout, \n", 774 | " device,\n", 775 | " max_length = 100):\n", 776 | " super().__init__()\n", 777 | " \n", 778 | " self.device = device\n", 779 | " \n", 780 | " self.tok_embedding = nn.Embedding(output_dim, hid_dim)\n", 781 | " self.pos_embedding = nn.Embedding(max_length, hid_dim)\n", 782 | " \n", 783 | " self.layers = nn.ModuleList([DecoderLayer(hid_dim, \n", 784 | " n_heads, \n", 785 | " pf_dim, \n", 786 | " dropout, \n", 787 | " device)\n", 788 | " for _ in range(n_layers)])\n", 789 | " \n", 790 | " self.fc_out = nn.Linear(hid_dim, output_dim)\n", 791 | " \n", 792 | " self.dropout = nn.Dropout(dropout)\n", 793 | " \n", 794 | " self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)\n", 795 | " \n", 796 | " def forward(self, trg, enc_src, trg_mask, src_mask):\n", 797 | " \n", 798 | " #trg = [batch size, trg len]\n", 799 | " #enc_src = [batch size, src len, hid dim]\n", 800 | " #trg_mask = [batch size, 1, trg len, trg len]\n", 801 | " #src_mask = [batch size, 1, 1, src len]\n", 802 | " \n", 803 | " batch_size = trg.shape[0]\n", 804 | " trg_len = trg.shape[1]\n", 805 | " \n", 806 | " pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)\n", 807 | " \n", 808 | " #pos = [batch size, trg len]\n", 809 | " \n", 810 | " trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))\n", 811 | " \n", 812 | " #trg = [batch size, trg len, hid dim]\n", 813 | " \n", 814 | " for layer in self.layers:\n", 815 | " trg, attention = layer(trg, enc_src, trg_mask, src_mask)\n", 816 | " \n", 817 | " #trg = [batch size, trg len, hid dim]\n", 818 | " #attention = [batch size, n heads, trg len, src len]\n", 819 | " \n", 820 | " output = self.fc_out(trg)\n", 821 | " \n", 822 | " #output = [batch size, trg len, output dim]\n", 823 | " \n", 824 | " return output, attention" 825 | ], 826 | "execution_count": null, 827 | "outputs": [] 828 | },
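A tiny worked example of the "subsequent" part of that target mask (a sketch; the full `make_trg_mask` in the `Seq2Seq` model below also combines this with the target padding mask):

```python
# Row t of the lower-triangular mask is True only for positions <= t,
# so target position t can attend to itself and earlier positions only:
trg_len = 5
trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len))).bool()
print(trg_sub_mask)
# tensor([[ True, False, False, False, False],
#         [ True,  True, False, False, False],
#         [ True,  True,  True, False, False],
#         [ True,  True,  True,  True, False],
#         [ True,  True,  True,  True,  True]])
```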
"\n", 837 | "As mentioned previously, the decoder layer is similar to the encoder layer except that it now has two multi-head attention layers, `self_attention` and `encoder_attention`. \n", 838 | "\n", 839 | "The first performs self-attention, as in the encoder, by using the decoder representation so far as the query, key and value. This is followed by dropout, residual connection and layer normalization. This `self_attention` layer uses the target sequence mask, `trg_mask`, in order to prevent the decoder from \"cheating\" by paying attention to tokens that are \"ahead\" of the one it is currently processing as it processes all tokens in the target sentence in parallel.\n", 840 | "\n", 841 | "The second is how we actually feed the encoded source sentence, `enc_src`, into our decoder. In this multi-head attention layer the queries are the decoder representations and the keys and values are the encoder representations. Here, the source mask, `src_mask` is used to prevent the multi-head attention layer from attending to `` tokens within the source sentence. This is then followed by the dropout, residual connection and layer normalization layers. \n", 842 | "\n", 843 | "Finally, we pass this through the position-wise feedforward layer and yet another sequence of dropout, residual connection and layer normalization.\n", 844 | "\n", 845 | "The decoder layer isn't introducing any new concepts, just using the same set of layers as the encoder in a slightly different way." 846 | ] 847 | }, 848 | { 849 | "cell_type": "code", 850 | "metadata": { 851 | "id": "9iviBr13y_x8" 852 | }, 853 | "source": [ 854 | "class DecoderLayer(nn.Module):\n", 855 | " def __init__(self, \n", 856 | " hid_dim, \n", 857 | " n_heads, \n", 858 | " pf_dim, \n", 859 | " dropout, \n", 860 | " device):\n", 861 | " super().__init__()\n", 862 | " \n", 863 | " self.self_attn_layer_norm = nn.LayerNorm(hid_dim)\n", 864 | " self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)\n", 865 | " self.ff_layer_norm = nn.LayerNorm(hid_dim)\n", 866 | " self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)\n", 867 | " self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)\n", 868 | " self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, \n", 869 | " pf_dim, \n", 870 | " dropout)\n", 871 | " self.dropout = nn.Dropout(dropout)\n", 872 | " \n", 873 | " def forward(self, trg, enc_src, trg_mask, src_mask):\n", 874 | " \n", 875 | " #trg = [batch size, trg len, hid dim]\n", 876 | " #enc_src = [batch size, src len, hid dim]\n", 877 | " #trg_mask = [batch size, 1, trg len, trg len]\n", 878 | " #src_mask = [batch size, 1, 1, src len]\n", 879 | " \n", 880 | " #self attention\n", 881 | " _trg, _ = self.self_attention(trg, trg, trg, trg_mask)\n", 882 | " \n", 883 | " #dropout, residual connection and layer norm\n", 884 | " trg = self.self_attn_layer_norm(trg + self.dropout(_trg))\n", 885 | " \n", 886 | " #trg = [batch size, trg len, hid dim]\n", 887 | " \n", 888 | " #encoder attention\n", 889 | " _trg, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask)\n", 890 | " \n", 891 | " #dropout, residual connection and layer norm\n", 892 | " trg = self.enc_attn_layer_norm(trg + self.dropout(_trg))\n", 893 | " \n", 894 | " #trg = [batch size, trg len, hid dim]\n", 895 | " \n", 896 | " #positionwise feedforward\n", 897 | " _trg = self.positionwise_feedforward(trg)\n", 898 | " \n", 899 | " #dropout, residual and layer norm\n", 900 | " trg = self.ff_layer_norm(trg + 
self.dropout(_trg))\n", 901 | " \n", 902 | " #trg = [batch size, trg len, hid dim]\n", 903 | " #attention = [batch size, n heads, trg len, src len]\n", 904 | " \n", 905 | " return trg, attention" 906 | ], 907 | "execution_count": null, 908 | "outputs": [] 909 | }, 910 | { 911 | "cell_type": "code", 912 | "metadata": { 913 | "id": "_OBeq20Ky_x9" 914 | }, 915 | "source": [ 916 | "class Seq2Seq(nn.Module):\n", 917 | " def __init__(self, \n", 918 | " encoder, \n", 919 | " decoder, \n", 920 | " src_pad_idx, \n", 921 | " trg_pad_idx, \n", 922 | " device):\n", 923 | " super().__init__()\n", 924 | " \n", 925 | " self.encoder = encoder\n", 926 | " self.decoder = decoder\n", 927 | " self.src_pad_idx = src_pad_idx\n", 928 | " self.trg_pad_idx = trg_pad_idx\n", 929 | " self.device = device\n", 930 | " \n", 931 | " def make_src_mask(self, src):\n", 932 | " \n", 933 | " #src = [batch size, src len]\n", 934 | " \n", 935 | " src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)\n", 936 | "\n", 937 | " #src_mask = [batch size, 1, 1, src len]\n", 938 | "\n", 939 | " return src_mask\n", 940 | " \n", 941 | " def make_trg_mask(self, trg):\n", 942 | " \n", 943 | " #trg = [batch size, trg len]\n", 944 | " \n", 945 | " trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)\n", 946 | " \n", 947 | " #trg_pad_mask = [batch size, 1, 1, trg len]\n", 948 | " \n", 949 | " trg_len = trg.shape[1]\n", 950 | " \n", 951 | " trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device = self.device)).bool()\n", 952 | " \n", 953 | " #trg_sub_mask = [trg len, trg len]\n", 954 | " \n", 955 | " trg_mask = trg_pad_mask & trg_sub_mask\n", 956 | " \n", 957 | " #trg_mask = [batch size, 1, trg len, trg len]\n", 958 | " \n", 959 | " return trg_mask\n", 960 | "\n", 961 | " def forward(self, src, trg):\n", 962 | " \n", 963 | " #src = [batch size, src len]\n", 964 | " #trg = [batch size, trg len]\n", 965 | " \n", 966 | " src_mask = self.make_src_mask(src)\n", 967 | " trg_mask = self.make_trg_mask(trg)\n", 968 | " \n", 969 | " #src_mask = [batch size, 1, 1, src len]\n", 970 | " #trg_mask = [batch size, 1, trg len, trg len]\n", 971 | " \n", 972 | " enc_src = self.encoder(src, src_mask)\n", 973 | " \n", 974 | " #enc_src = [batch size, src len, hid dim]\n", 975 | " \n", 976 | " output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)\n", 977 | " \n", 978 | " #output = [batch size, trg len, output dim]\n", 979 | " #attention = [batch size, n heads, trg len, src len]\n", 980 | " \n", 981 | " return output, attention" 982 | ], 983 | "execution_count": null, 984 | "outputs": [] 985 | }, 986 | { 987 | "cell_type": "code", 988 | "metadata": { 989 | "id": "rJVTCJBTy_x9" 990 | }, 991 | "source": [ 992 | "INPUT_DIM = len(SRC.vocab)\n", 993 | "OUTPUT_DIM = len(TRG.vocab)\n", 994 | "HID_DIM = 256\n", 995 | "ENC_LAYERS = 3\n", 996 | "DEC_LAYERS = 3\n", 997 | "ENC_HEADS = 8\n", 998 | "DEC_HEADS = 8\n", 999 | "ENC_PF_DIM = 512\n", 1000 | "DEC_PF_DIM = 512\n", 1001 | "ENC_DROPOUT = 0.1\n", 1002 | "DEC_DROPOUT = 0.1\n", 1003 | "\n", 1004 | "enc = Encoder(INPUT_DIM, \n", 1005 | " HID_DIM, \n", 1006 | " ENC_LAYERS, \n", 1007 | " ENC_HEADS, \n", 1008 | " ENC_PF_DIM, \n", 1009 | " ENC_DROPOUT, \n", 1010 | " device)\n", 1011 | "\n", 1012 | "dec = Decoder(OUTPUT_DIM, \n", 1013 | " HID_DIM, \n", 1014 | " DEC_LAYERS, \n", 1015 | " DEC_HEADS, \n", 1016 | " DEC_PF_DIM, \n", 1017 | " DEC_DROPOUT, \n", 1018 | " device)" 1019 | ], 1020 | "execution_count": null, 1021 | "outputs": [] 1022 | }, 1023 | { 1024 | "cell_type": "markdown", 1025 
| "metadata": { 1026 | "id": "8yBBerzFy_x9" 1027 | }, 1028 | "source": [ 1029 | "Then, use them to define our whole sequence-to-sequence encapsulating model." 1030 | ] 1031 | }, 1032 | { 1033 | "cell_type": "code", 1034 | "metadata": { 1035 | "id": "JYJ39bNAy_x9" 1036 | }, 1037 | "source": [ 1038 | "SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]\n", 1039 | "TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]\n", 1040 | "\n", 1041 | "model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)" 1042 | ], 1043 | "execution_count": null, 1044 | "outputs": [] 1045 | }, 1046 | { 1047 | "cell_type": "code", 1048 | "metadata": { 1049 | "id": "J1zCByqIy_x-", 1050 | "colab": { 1051 | "base_uri": "https://localhost:8080/" 1052 | }, 1053 | "outputId": "32500c1c-cfc4-4a2e-ec8b-f9fd13cbcff2" 1054 | }, 1055 | "source": [ 1056 | "def count_parameters(model):\n", 1057 | " return sum(p.numel() for p in model.parameters() if p.requires_grad)\n", 1058 | "\n", 1059 | "print(f'The model has {count_parameters(model):,} trainable parameters')" 1060 | ], 1061 | "execution_count": null, 1062 | "outputs": [ 1063 | { 1064 | "output_type": "stream", 1065 | "text": [ 1066 | "The model has 9,038,853 trainable parameters\n" 1067 | ], 1068 | "name": "stdout" 1069 | } 1070 | ] 1071 | }, 1072 | { 1073 | "cell_type": "code", 1074 | "metadata": { 1075 | "id": "7WtaK3KZy_x-" 1076 | }, 1077 | "source": [ 1078 | "def initialize_weights(m):\n", 1079 | " if hasattr(m, 'weight') and m.weight.dim() > 1:\n", 1080 | " nn.init.xavier_uniform_(m.weight.data)" 1081 | ], 1082 | "execution_count": null, 1083 | "outputs": [] 1084 | }, 1085 | { 1086 | "cell_type": "code", 1087 | "metadata": { 1088 | "id": "chTWTXGDy_x-" 1089 | }, 1090 | "source": [ 1091 | "model.apply(initialize_weights);" 1092 | ], 1093 | "execution_count": null, 1094 | "outputs": [] 1095 | }, 1096 | { 1097 | "cell_type": "code", 1098 | "metadata": { 1099 | "id": "C9FFqqbNy_x_" 1100 | }, 1101 | "source": [ 1102 | "LEARNING_RATE = 0.0005\n", 1103 | "\n", 1104 | "optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)" 1105 | ], 1106 | "execution_count": null, 1107 | "outputs": [] 1108 | }, 1109 | { 1110 | "cell_type": "code", 1111 | "metadata": { 1112 | "id": "7D1VEFeoy_x_" 1113 | }, 1114 | "source": [ 1115 | "criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)" 1116 | ], 1117 | "execution_count": null, 1118 | "outputs": [] 1119 | }, 1120 | { 1121 | "cell_type": "code", 1122 | "metadata": { 1123 | "id": "UQYJNk2_y_x_" 1124 | }, 1125 | "source": [ 1126 | "def train(model, iterator, optimizer, criterion, clip):\n", 1127 | " \n", 1128 | " model.train()\n", 1129 | " \n", 1130 | " epoch_loss = 0\n", 1131 | " \n", 1132 | " for i, batch in enumerate(iterator):\n", 1133 | " \n", 1134 | " src = batch.src\n", 1135 | " trg = batch.trg\n", 1136 | " \n", 1137 | " optimizer.zero_grad()\n", 1138 | " \n", 1139 | " output, _ = model(src, trg[:,:-1])\n", 1140 | " \n", 1141 | " #output = [batch size, trg len - 1, output dim]\n", 1142 | " #trg = [batch size, trg len]\n", 1143 | " \n", 1144 | " output_dim = output.shape[-1]\n", 1145 | " \n", 1146 | " output = output.contiguous().view(-1, output_dim)\n", 1147 | " trg = trg[:,1:].contiguous().view(-1)\n", 1148 | " \n", 1149 | " #output = [batch size * trg len - 1, output dim]\n", 1150 | " #trg = [batch size * trg len - 1]\n", 1151 | " \n", 1152 | " loss = criterion(output, trg)\n", 1153 | " \n", 1154 | " loss.backward()\n", 1155 | " \n", 1156 | " torch.nn.utils.clip_grad_norm_(model.parameters(), clip)\n", 1157 | " \n", 1158 | 
" optimizer.step()\n", 1159 | " \n", 1160 | " epoch_loss += loss.item()\n", 1161 | " \n", 1162 | " return epoch_loss / len(iterator)" 1163 | ], 1164 | "execution_count": null, 1165 | "outputs": [] 1166 | }, 1167 | { 1168 | "cell_type": "code", 1169 | "metadata": { 1170 | "id": "EWG1iBkJy_yA" 1171 | }, 1172 | "source": [ 1173 | "def evaluate(model, iterator, criterion):\n", 1174 | " \n", 1175 | " model.eval()\n", 1176 | " \n", 1177 | " epoch_loss = 0\n", 1178 | " \n", 1179 | " with torch.no_grad():\n", 1180 | " \n", 1181 | " for i, batch in enumerate(iterator):\n", 1182 | "\n", 1183 | " src = batch.src\n", 1184 | " trg = batch.trg\n", 1185 | "\n", 1186 | " output, _ = model(src, trg[:,:-1])\n", 1187 | " \n", 1188 | " #output = [batch size, trg len - 1, output dim]\n", 1189 | " #trg = [batch size, trg len]\n", 1190 | " \n", 1191 | " output_dim = output.shape[-1]\n", 1192 | " \n", 1193 | " output = output.contiguous().view(-1, output_dim)\n", 1194 | " trg = trg[:,1:].contiguous().view(-1)\n", 1195 | " \n", 1196 | " #output = [batch size * trg len - 1, output dim]\n", 1197 | " #trg = [batch size * trg len - 1]\n", 1198 | " \n", 1199 | " loss = criterion(output, trg)\n", 1200 | "\n", 1201 | " epoch_loss += loss.item()\n", 1202 | " \n", 1203 | " return epoch_loss / len(iterator)" 1204 | ], 1205 | "execution_count": null, 1206 | "outputs": [] 1207 | }, 1208 | { 1209 | "cell_type": "code", 1210 | "metadata": { 1211 | "id": "AFmtwEy1y_yA" 1212 | }, 1213 | "source": [ 1214 | "def epoch_time(start_time, end_time):\n", 1215 | " elapsed_time = end_time - start_time\n", 1216 | " elapsed_mins = int(elapsed_time / 60)\n", 1217 | " elapsed_secs = int(elapsed_time - (elapsed_mins * 60))\n", 1218 | " return elapsed_mins, elapsed_secs" 1219 | ], 1220 | "execution_count": null, 1221 | "outputs": [] 1222 | }, 1223 | { 1224 | "cell_type": "code", 1225 | "metadata": { 1226 | "id": "suKcAoNby_yB", 1227 | "colab": { 1228 | "base_uri": "https://localhost:8080/" 1229 | }, 1230 | "outputId": "8797a47f-4627-40e8-845f-932e12cfb95a" 1231 | }, 1232 | "source": [ 1233 | "N_EPOCHS = 10\n", 1234 | "CLIP = 1\n", 1235 | "\n", 1236 | "best_valid_loss = float('inf')\n", 1237 | "\n", 1238 | "for epoch in range(N_EPOCHS):\n", 1239 | " \n", 1240 | " start_time = time.time()\n", 1241 | " \n", 1242 | " train_loss = train(model, train_iterator, optimizer, criterion, CLIP)\n", 1243 | " valid_loss = evaluate(model, valid_iterator, criterion)\n", 1244 | " \n", 1245 | " end_time = time.time()\n", 1246 | " \n", 1247 | " epoch_mins, epoch_secs = epoch_time(start_time, end_time)\n", 1248 | " \n", 1249 | " if valid_loss < best_valid_loss:\n", 1250 | " best_valid_loss = valid_loss\n", 1251 | " torch.save(model.state_dict(), 'tut6-model.pt')\n", 1252 | " \n", 1253 | " print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')\n", 1254 | " print(f'\\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')\n", 1255 | " print(f'\\t Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f}')" 1256 | ], 1257 | "execution_count": null, 1258 | "outputs": [ 1259 | { 1260 | "output_type": "stream", 1261 | "text": [ 1262 | "Epoch: 01 | Time: 0m 59s\n", 1263 | "\tTrain Loss: 3.636 | Train PPL: 37.923\n", 1264 | "\t Val. Loss: 2.623 | Val. PPL: 13.772\n", 1265 | "Epoch: 02 | Time: 0m 59s\n", 1266 | "\tTrain Loss: 2.463 | Train PPL: 11.740\n", 1267 | "\t Val. Loss: 2.061 | Val. PPL: 7.853\n", 1268 | "Epoch: 03 | Time: 0m 59s\n", 1269 | "\tTrain Loss: 1.979 | Train PPL: 7.232\n", 1270 | "\t Val. Loss: 1.822 | Val. 
PPL: 6.182\n", 1271 | "Epoch: 04 | Time: 1m 0s\n", 1272 | "\tTrain Loss: 1.689 | Train PPL: 5.415\n", 1273 | "\t Val. Loss: 1.705 | Val. PPL: 5.500\n", 1274 | "Epoch: 05 | Time: 0m 59s\n", 1275 | "\tTrain Loss: 1.485 | Train PPL: 4.415\n", 1276 | "\t Val. Loss: 1.664 | Val. PPL: 5.282\n", 1277 | "Epoch: 06 | Time: 0m 59s\n", 1278 | "\tTrain Loss: 1.326 | Train PPL: 3.767\n", 1279 | "\t Val. Loss: 1.647 | Val. PPL: 5.192\n", 1280 | "Epoch: 07 | Time: 0m 59s\n", 1281 | "\tTrain Loss: 1.200 | Train PPL: 3.321\n", 1282 | "\t Val. Loss: 1.639 | Val. PPL: 5.148\n", 1283 | "Epoch: 08 | Time: 0m 59s\n", 1284 | "\tTrain Loss: 1.095 | Train PPL: 2.988\n", 1285 | "\t Val. Loss: 1.651 | Val. PPL: 5.212\n", 1286 | "Epoch: 09 | Time: 0m 59s\n", 1287 | "\tTrain Loss: 1.003 | Train PPL: 2.725\n", 1288 | "\t Val. Loss: 1.659 | Val. PPL: 5.253\n", 1289 | "Epoch: 10 | Time: 0m 59s\n", 1290 | "\tTrain Loss: 0.927 | Train PPL: 2.526\n", 1291 | "\t Val. Loss: 1.710 | Val. PPL: 5.527\n" 1292 | ], 1293 | "name": "stdout" 1294 | } 1295 | ] 1296 | }, 1297 | { 1298 | "cell_type": "markdown", 1299 | "metadata": { 1300 | "id": "rO_sS7Qby_yB" 1301 | }, 1302 | "source": [ 1303 | "We load our \"best\" parameters and manage to achieve a better test perplexity than all previous models." 1304 | ] 1305 | }, 1306 | { 1307 | "cell_type": "code", 1308 | "metadata": { 1309 | "id": "p7QirlGyy_yB", 1310 | "colab": { 1311 | "base_uri": "https://localhost:8080/" 1312 | }, 1313 | "outputId": "da4ba35b-d36f-4395-e3b3-7de394965d38" 1314 | }, 1315 | "source": [ 1316 | "model.load_state_dict(torch.load('tut6-model.pt'))\n", 1317 | "\n", 1318 | "test_loss = evaluate(model, test_iterator, criterion)\n", 1319 | "\n", 1320 | "print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')" 1321 | ], 1322 | "execution_count": null, 1323 | "outputs": [ 1324 | { 1325 | "output_type": "stream", 1326 | "text": [ 1327 | "| Test Loss: 1.693 | Test PPL: 5.435 |\n" 1328 | ], 1329 | "name": "stdout" 1330 | } 1331 | ] 1332 | }, 1333 | { 1334 | "cell_type": "code", 1335 | "metadata": { 1336 | "id": "skoJQ43Gy_yC" 1337 | }, 1338 | "source": [ 1339 | "def translate_sentence(sentence, src_field, trg_field, model, device, max_len = 50):\n", 1340 | " \n", 1341 | " model.eval()\n", 1342 | " \n", 1343 | " if isinstance(sentence, str):\n", 1344 | " nlp = spacy.load('de_core_news_sm')\n", 1345 | " tokens = [token.text.lower() for token in nlp(sentence)]\n", 1346 | " else:\n", 1347 | " tokens = [token.lower() for token in sentence]\n", 1348 | "\n", 1349 | " tokens = [src_field.init_token] + tokens + [src_field.eos_token]\n", 1350 | " \n", 1351 | " src_indexes = [src_field.vocab.stoi[token] for token in tokens]\n", 1352 | "\n", 1353 | " src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)\n", 1354 | " \n", 1355 | " src_mask = model.make_src_mask(src_tensor)\n", 1356 | " \n", 1357 | " with torch.no_grad():\n", 1358 | " enc_src = model.encoder(src_tensor, src_mask)\n", 1359 | "\n", 1360 | " trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]\n", 1361 | "\n", 1362 | " for i in range(max_len):\n", 1363 | "\n", 1364 | " trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)\n", 1365 | "\n", 1366 | " trg_mask = model.make_trg_mask(trg_tensor)\n", 1367 | " \n", 1368 | " with torch.no_grad():\n", 1369 | " output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)\n", 1370 | " \n", 1371 | " pred_token = output.argmax(2)[:,-1].item()\n", 1372 | " \n", 1373 | " trg_indexes.append(pred_token)\n", 1374 | "\n", 
1375 | " if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:\n", 1376 | " break\n", 1377 | " \n", 1378 | " trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]\n", 1379 | " \n", 1380 | " return trg_tokens[1:], attention" 1381 | ], 1382 | "execution_count": null, 1383 | "outputs": [] 1384 | }, 1385 | { 1386 | "cell_type": "code", 1387 | "metadata": { 1388 | "id": "kPzaC1o5y_yD", 1389 | "colab": { 1390 | "base_uri": "https://localhost:8080/" 1391 | }, 1392 | "outputId": "6f09e97e-2e60-4bb3-ad5d-09fe699373e4" 1393 | }, 1394 | "source": [ 1395 | "example_idx = 8\n", 1396 | "\n", 1397 | "src = vars(train_data.examples[example_idx])['src']\n", 1398 | "trg = vars(train_data.examples[example_idx])['trg']\n", 1399 | "\n", 1400 | "print(f'src = {src}')\n", 1401 | "print(f'trg = {trg}')" 1402 | ], 1403 | "execution_count": null, 1404 | "outputs": [ 1405 | { 1406 | "output_type": "stream", 1407 | "text": [ 1408 | "src = ['eine', 'frau', 'mit', 'einer', 'großen', 'geldbörse', 'geht', 'an', 'einem', 'tor', 'vorbei', '.']\n", 1409 | "trg = ['a', 'woman', 'with', 'a', 'large', 'purse', 'is', 'walking', 'by', 'a', 'gate', '.']\n" 1410 | ], 1411 | "name": "stdout" 1412 | } 1413 | ] 1414 | }, 1415 | { 1416 | "cell_type": "code", 1417 | "metadata": { 1418 | "id": "IP0dOHYKy_yD", 1419 | "colab": { 1420 | "base_uri": "https://localhost:8080/" 1421 | }, 1422 | "outputId": "d04ad06f-cc79-4e1d-8766-52fec9fbfade" 1423 | }, 1424 | "source": [ 1425 | "translation, attention = translate_sentence(src, SRC, TRG, model, device)\n", 1426 | "\n", 1427 | "print(f'predicted trg = {translation}')" 1428 | ], 1429 | "execution_count": null, 1430 | "outputs": [ 1431 | { 1432 | "output_type": "stream", 1433 | "text": [ 1434 | "predicted trg = ['a', 'woman', 'with', 'a', 'large', 'purse', 'walks', 'by', 'a', 'gate', '.', '']\n" 1435 | ], 1436 | "name": "stdout" 1437 | } 1438 | ] 1439 | }, 1440 | { 1441 | "cell_type": "code", 1442 | "metadata": { 1443 | "id": "6Uqgbssvy_yE", 1444 | "colab": { 1445 | "base_uri": "https://localhost:8080/" 1446 | }, 1447 | "outputId": "2a500fa9-71ae-486f-cbad-9dec7a6e6589" 1448 | }, 1449 | "source": [ 1450 | "example_idx = 6\n", 1451 | "\n", 1452 | "src = vars(valid_data.examples[example_idx])['src']\n", 1453 | "trg = vars(valid_data.examples[example_idx])['trg']\n", 1454 | "\n", 1455 | "print(f'src = {src}')\n", 1456 | "print(f'trg = {trg}')" 1457 | ], 1458 | "execution_count": null, 1459 | "outputs": [ 1460 | { 1461 | "output_type": "stream", 1462 | "text": [ 1463 | "src = ['ein', 'brauner', 'hund', 'rennt', 'dem', 'schwarzen', 'hund', 'hinterher', '.']\n", 1464 | "trg = ['a', 'brown', 'dog', 'is', 'running', 'after', 'the', 'black', 'dog', '.']\n" 1465 | ], 1466 | "name": "stdout" 1467 | } 1468 | ] 1469 | }, 1470 | { 1471 | "cell_type": "code", 1472 | "metadata": { 1473 | "id": "6bEYG7mGy_yE", 1474 | "colab": { 1475 | "base_uri": "https://localhost:8080/" 1476 | }, 1477 | "outputId": "787dd41e-adcc-4aee-d36f-44929849fa08" 1478 | }, 1479 | "source": [ 1480 | "translation, attention = translate_sentence(src, SRC, TRG, model, device)\n", 1481 | "\n", 1482 | "print(f'predicted trg = {translation}')" 1483 | ], 1484 | "execution_count": null, 1485 | "outputs": [ 1486 | { 1487 | "output_type": "stream", 1488 | "text": [ 1489 | "predicted trg = ['a', 'brown', 'dog', 'chases', 'the', 'black', 'dog', '.', '']\n" 1490 | ], 1491 | "name": "stdout" 1492 | } 1493 | ] 1494 | }, 1495 | { 1496 | "cell_type": "code", 1497 | "metadata": { 1498 | "id": "F__v88CPy_yF", 1499 | "colab": { 1500 | 
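The notebook spot-checks individual translations; a corpus-level metric is the natural next step. Below is a hedged sketch of a BLEU computation built on the `translate_sentence` defined above. It assumes a torchtext version that ships `torchtext.data.metrics.bleu_score`, so treat it as illustrative rather than as part of the original notebook.

```python
from torchtext.data.metrics import bleu_score

def calculate_bleu(data, src_field, trg_field, model, device, max_len=50):
    trgs, pred_trgs = [], []
    for example in data:
        src = vars(example)['src']
        trg = vars(example)['trg']
        pred_trg, _ = translate_sentence(src, src_field, trg_field,
                                         model, device, max_len)
        pred_trgs.append(pred_trg[:-1])  # cut off the trailing <eos> token
        trgs.append([trg])               # each candidate may have several references
    return bleu_score(pred_trgs, trgs)

# bleu = calculate_bleu(test_data, SRC, TRG, model, device)
# print(f'BLEU score = {bleu * 100:.2f}')
```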
"base_uri": "https://localhost:8080/" 1501 | }, 1502 | "outputId": "fdfcbc59-8e35-45f3-88be-3d13badb2849" 1503 | }, 1504 | "source": [ 1505 | "example_idx = 10\n", 1506 | "\n", 1507 | "src = vars(test_data.examples[example_idx])['src']\n", 1508 | "trg = vars(test_data.examples[example_idx])['trg']\n", 1509 | "\n", 1510 | "print(f'src = {src}')\n", 1511 | "print(f'trg = {trg}')" 1512 | ], 1513 | "execution_count": null, 1514 | "outputs": [ 1515 | { 1516 | "output_type": "stream", 1517 | "text": [ 1518 | "src = ['eine', 'mutter', 'und', 'ihr', 'kleiner', 'sohn', 'genießen', 'einen', 'schönen', 'tag', 'im', 'freien', '.']\n", 1519 | "trg = ['a', 'mother', 'and', 'her', 'young', 'song', 'enjoying', 'a', 'beautiful', 'day', 'outside', '.']\n" 1520 | ], 1521 | "name": "stdout" 1522 | } 1523 | ] 1524 | }, 1525 | { 1526 | "cell_type": "code", 1527 | "metadata": { 1528 | "id": "AEqV6ykQy_yG", 1529 | "colab": { 1530 | "base_uri": "https://localhost:8080/" 1531 | }, 1532 | "outputId": "abd9ddd5-8455-4aaf-efa4-236624a3eca3" 1533 | }, 1534 | "source": [ 1535 | "translation, attention = translate_sentence(src, SRC, TRG, model, device)\n", 1536 | "\n", 1537 | "print(f'predicted trg = {translation}')" 1538 | ], 1539 | "execution_count": null, 1540 | "outputs": [ 1541 | { 1542 | "output_type": "stream", 1543 | "text": [ 1544 | "predicted trg = ['a', 'mother', 'and', 'her', 'young', 'son', 'enjoy', 'a', 'beautiful', 'day', '.', '']\n" 1545 | ], 1546 | "name": "stdout" 1547 | } 1548 | ] 1549 | } 1550 | ] 1551 | } -------------------------------------------------------------------------------- /CNN_Text_classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "CNN Text classification", 7 | "provenance": [], 8 | "authorship_tag": "ABX9TyPEvVbblKTRTV5G7YlGCnPX", 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "language_info": { 16 | "name": "python" 17 | }, 18 | "accelerator": "GPU" 19 | }, 20 | "cells": [ 21 | { 22 | "cell_type": "markdown", 23 | "metadata": { 24 | "id": "view-in-github", 25 | "colab_type": "text" 26 | }, 27 | "source": [ 28 | "\"Open" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": { 34 | "id": "lUgxNO4aFcWj" 35 | }, 36 | "source": [ 37 | "# Using Convolutional Neural Networks for sentence Classification\n", 38 | "\n", 39 | "This code uses a version of the model described in ) [CNN For Sentence Classification (Yoon Kim, 2014)](https://www.aclweb.org/anthology/D14-1181/\n", 40 | ") in order to perform sentence classification on the IMDB dataset" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "metadata": { 46 | "colab": { 47 | "base_uri": "https://localhost:8080/" 48 | }, 49 | "id": "NudorOZQ9FTA", 50 | "outputId": "d897b564-2e79-4f62-858f-a04355c97440" 51 | }, 52 | "source": [ 53 | "!pip install -q torchtext==0.2.3\n", 54 | "import torchtext\n", 55 | "print (torchtext.__version__)\n" 56 | ], 57 | "execution_count": 1, 58 | "outputs": [ 59 | { 60 | "output_type": "stream", 61 | "text": [ 62 | "0.2.3\n" 63 | ], 64 | "name": "stdout" 65 | } 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "metadata": { 71 | "colab": { 72 | "base_uri": "https://localhost:8080/" 73 | }, 74 | "id": "IcReTQg-8Qa1", 75 | "outputId": "93c65f37-9b47-475b-b564-2a7eb0e0d5d8" 76 | }, 77 | "source": [ 78 | "!pip install -q torch==0.4.1 \n", 79 | "import torch\n", 80 | "print 
(torch.__version__)" 81 | ], 82 | "execution_count": 2, 83 | "outputs": [ 84 | { 85 | "output_type": "stream", 86 | "text": [ 87 | "0.4.1\n" 88 | ], 89 | "name": "stdout" 90 | } 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "metadata": { 96 | "colab": { 97 | "base_uri": "https://localhost:8080/" 98 | }, 99 | "id": "PTR2rKr35UXP", 100 | "outputId": "d15b28af-8d7a-4799-b07f-5190b2890bc3" 101 | }, 102 | "source": [ 103 | "import os\n", 104 | "import sys\n", 105 | "import time\n", 106 | "import torch\n", 107 | "import torch.nn as nn\n", 108 | "import torch.nn.functional as F\n", 109 | "from torch.autograd import Variable\n", 110 | "import torch.optim as optim\n", 111 | "import numpy as np\n", 112 | "import torchtext\n", 113 | "from torchtext import data\n", 114 | "from torchtext import datasets\n", 115 | "from torchtext.vocab import Vectors, GloVe\n", 116 | "print (torch.__version__)\n", 117 | "print (torchtext.__version__)" 118 | ], 119 | "execution_count": 3, 120 | "outputs": [ 121 | { 122 | "output_type": "stream", 123 | "text": [ 124 | "0.4.1\n", 125 | "0.2.3\n" 126 | ], 127 | "name": "stdout" 128 | } 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": { 134 | "id": "9ZTQj_UjGYkU" 135 | }, 136 | "source": [ 137 | "The function to load the dataset, tokenize the sentences, and fetch the GloVe embeddings for the vocabulary" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "metadata": { 143 | "id": "IDWE5AlY5XQA" 144 | }, 145 | "source": [ 146 | "def load_dataset(test_sen=None):\n", 147 | "\n", 148 | " \"\"\"\n", 149 | " tokenizer : Breaks sentences into a list of words. If sequential=False, no tokenization is applied\n", 150 | " Field : A class that stores information about the way of preprocessing\n", 151 | " fix_length : An important property of TorchText is that we can let the input be of variable length, and TorchText will\n", 152 | " dynamically pad each sequence to the longest sequence in that \"batch\". 
But here we are using fix_length, which\n", 153 | " will pad each sequence to have a fixed length of 200.\n", 154 | " \n", 155 | " build_vocab : It will first make a vocabulary or dictionary mapping all the unique words present in the train_data to an\n", 156 | " idx, and then it will use the GloVe word embeddings to map each index to the corresponding word embedding.\n", 157 | " \n", 158 | " vocab.vectors : This returns a torch tensor of shape (vocab_size x embedding_dim) containing the pre-trained word embeddings.\n", 159 | " BucketIterator : Defines an iterator that batches examples of similar lengths together to minimize the amount of padding needed.\n", 160 | " \n", 161 | " \"\"\"\n", 162 | " \n", 163 | " tokenize = lambda x: x.split()\n", 164 | " TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True, batch_first=True, fix_length=200)\n", 165 | " LABEL = data.LabelField()\n", 166 | " train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)\n", 167 | " TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))\n", 168 | " LABEL.build_vocab(train_data)\n", 169 | "\n", 170 | " word_embeddings = TEXT.vocab.vectors\n", 171 | " print (\"Length of Text Vocabulary: \" + str(len(TEXT.vocab)))\n", 172 | " print (\"Vector size of Text Vocabulary: \", TEXT.vocab.vectors.size())\n", 173 | " print (\"Label Length: \" + str(len(LABEL.vocab)))\n", 174 | "\n", 175 | " train_data, valid_data = train_data.split() # Further splitting of training_data to create new training_data & validation_data\n", 176 | " train_iter, valid_iter, test_iter = data.BucketIterator.splits((train_data, valid_data, test_data), batch_size=32, sort_key=lambda x: len(x.text), repeat=False, shuffle=True)\n", 177 | "\n", 178 | " '''Alternatively we can also use the default configurations'''\n", 179 | " # train_iter, test_iter = datasets.IMDB.iters(batch_size=32)\n", 180 | "\n", 181 | " vocab_size = len(TEXT.vocab)\n", 182 | "\n", 183 | " return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter" 184 | ], 185 | "execution_count": 5, 186 | "outputs": [] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "metadata": { 191 | "colab": { 192 | "base_uri": "https://localhost:8080/" 193 | }, 194 | "id": "UpUkOqkF5afM", 195 | "outputId": "4f689868-eff9-4b26-d7b8-db474690a003" 196 | }, 197 | "source": [ 198 | "TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter = load_dataset()" 199 | ], 200 | "execution_count": 6, 201 | "outputs": [ 202 | { 203 | "output_type": "stream", 204 | "text": [ 205 | "downloading aclImdb_v1.tar.gz\n" 206 | ], 207 | "name": "stdout" 208 | }, 209 | { 210 | "output_type": "stream", 211 | "text": [ 212 | ".vector_cache/glove.6B.zip: 862MB [02:41, 5.35MB/s] \n", 213 | "100%|██████████| 400000/400000 [00:37<00:00, 10768.76it/s]\n" 214 | ], 215 | "name": "stderr" 216 | }, 217 | { 218 | "output_type": "stream", 219 | "text": [ 220 | "Length of Text Vocabulary: 251639\n", 221 | "Vector size of Text Vocabulary: torch.Size([251639, 300])\n", 222 | "Label Length: 2\n" 223 | ], 224 | "name": "stdout" 225 | } 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": { 231 | "id": "kPnsfFJtHDnY" 232 | }, 233 | "source": [ 234 | "Clipping the gradient to a maximum and minimum possible value to prevent the exploding gradient problem" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "metadata": { 240 | "id": "gsQLjnVd5dBW" 241 | }, 242 | "source": [ 243 | "def clip_gradient(model, clip_value):\n", 244 | " params = list(filter(lambda p: 
p.grad is not None, model.parameters()))\n", 245 | " for p in params:\n", 246 | " p.grad.data.clamp_(-clip_value, clip_value)" 247 | ], 248 | "execution_count": 7, 249 | "outputs": [] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": { 254 | "id": "g8hCKhKGHOpa" 255 | }, 256 | "source": [ 257 | "This function trains the given model using an Adam optimiser." 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "metadata": { 263 | "id": "T1Auu6r75e2j" 264 | }, 265 | "source": [ 266 | "def train_model(model, train_iter, epoch):\n", 267 | " total_epoch_loss = 0\n", 268 | " total_epoch_acc = 0\n", 269 | " model.cuda()\n", 270 | " optim = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))\n", 271 | " steps = 0\n", 272 | " model.train()\n", 273 | " for idx, batch in enumerate(train_iter):\n", 274 | " text = batch.text[0]\n", 275 | " target = batch.label\n", 276 | " target = torch.autograd.Variable(target).long()\n", 277 | " if torch.cuda.is_available():\n", 278 | " text = text.cuda()\n", 279 | " target = target.cuda()\n", 280 | " if (text.size()[0] != 32):# One of the batches returned by BucketIterator has a length different from 32.\n", 281 | " continue\n", 282 | " optim.zero_grad()\n", 283 | " prediction = model(text)\n", 284 | " loss = loss_fn(prediction, target)\n", 285 | " num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).float().sum()\n", 286 | " acc = 100.0 * num_corrects/len(batch)\n", 287 | " loss.backward()\n", 288 | " clip_gradient(model, 1e-1)\n", 289 | " optim.step()\n", 290 | " steps += 1\n", 291 | " \n", 292 | " if steps % 100 == 0:\n", 293 | " print (f'Epoch: {epoch+1}, Idx: {idx+1}, Training Loss: {loss.item():.4f}, Training Accuracy: {acc.item(): .2f}%')\n", 294 | " \n", 295 | " total_epoch_loss += loss.item()\n", 296 | " total_epoch_acc += acc.item()\n", 297 | " \n", 298 | " return total_epoch_loss/len(train_iter), total_epoch_acc/len(train_iter)" 299 | ], 300 | "execution_count": 8, 301 | "outputs": [] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "metadata": { 306 | "id": "xtgF3JW45haa" 307 | }, 308 | "source": [ 309 | "def eval_model(model, val_iter):\n", 310 | " total_epoch_loss = 0\n", 311 | " total_epoch_acc = 0\n", 312 | " model.eval()\n", 313 | " with torch.no_grad():\n", 314 | " for idx, batch in enumerate(val_iter):\n", 315 | " text = batch.text[0]\n", 316 | " if (text.size()[0] != 32):\n", 317 | " continue\n", 318 | " target = batch.label\n", 319 | " target = torch.autograd.Variable(target).long()\n", 320 | " if torch.cuda.is_available():\n", 321 | " text = text.cuda()\n", 322 | " target = target.cuda()\n", 323 | " prediction = model(text)\n", 324 | " loss = loss_fn(prediction, target)\n", 325 | " num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).sum()\n", 326 | " acc = 100.0 * num_corrects/len(batch)\n", 327 | " total_epoch_loss += loss.item()\n", 328 | " total_epoch_acc += acc.item()\n", 329 | "\n", 330 | " return total_epoch_loss/len(val_iter), total_epoch_acc/len(val_iter)" 331 | ], 332 | "execution_count": 9, 333 | "outputs": [] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": { 338 | "id": "5dPb1zy4H2mz" 339 | }, 340 | "source": [ 341 | "Defining the model class" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "metadata": { 347 | "id": "LIQApSTW5klv" 348 | }, 349 | "source": [ 350 | "class CNN(nn.Module):\n", 351 | "\tdef __init__(self, batch_size, output_size, in_channels, out_channels, kernel_heights, 
stride, padding, keep_probab, vocab_size, embedding_length, weights):\n", 352 | "\t\tsuper(CNN, self).__init__()\n", 353 | "\t\t\n", 354 | "\t\t\"\"\"\n", 355 | "\t\tArguments\n", 356 | "\t\t---------\n", 357 | "\t\tbatch_size : Size of each batch, which is the same as the batch_size of the data returned by the TorchText BucketIterator\n", 358 | "\t\toutput_size : 2 = (pos, neg)\n", 359 | "\t\tin_channels : Number of input channels. Here it is 1 as the input data has dimension = (batch_size, num_seq, embedding_length)\n", 360 | "\t\tout_channels : Number of output channels after convolution operation performed on the input matrix\n", 361 | "\t\tkernel_heights : A list consisting of 3 different kernel_heights. Convolution will be performed 3 times and finally results from each kernel_height will be concatenated.\n", 362 | "\t\tkeep_probab : Dropout probability, i.e. the probability of zeroing an activation (despite the name, this value is passed directly to nn.Dropout as p)\n", 363 | "\t\tvocab_size : Size of the vocabulary containing unique words\n", 364 | "\t\tembedding_length : Embedding dimension of GloVe word embeddings\n", 365 | "\t\tweights : Pre-trained GloVe word_embeddings which we will use to create our word_embedding look-up table\n", 366 | "\t\t--------\n", 367 | "\t\t\n", 368 | "\t\t\"\"\"\n", 369 | "\t\tself.batch_size = batch_size\n", 370 | "\t\tself.output_size = output_size\n", 371 | "\t\tself.in_channels = in_channels\n", 372 | "\t\tself.out_channels = out_channels\n", 373 | "\t\tself.kernel_heights = kernel_heights\n", 374 | "\t\tself.stride = stride\n", 375 | "\t\tself.padding = padding\n", 376 | "\t\tself.vocab_size = vocab_size\n", 377 | "\t\tself.embedding_length = embedding_length\n", 378 | "\t\t\n", 379 | "\t\tself.word_embeddings = nn.Embedding(vocab_size, embedding_length)\n", 380 | "\t\tself.word_embeddings.weight = nn.Parameter(weights, requires_grad=False)\n", 381 | "\t\tself.conv1 = nn.Conv2d(in_channels, out_channels, (kernel_heights[0], embedding_length), stride, padding)\n", 382 | "\t\tself.conv2 = nn.Conv2d(in_channels, out_channels, (kernel_heights[1], embedding_length), stride, padding)\n", 383 | "\t\tself.conv3 = nn.Conv2d(in_channels, out_channels, (kernel_heights[2], embedding_length), stride, padding)\n", 384 | "\t\tself.dropout = nn.Dropout(keep_probab)\n", 385 | "\t\tself.label = nn.Linear(len(kernel_heights)*out_channels, output_size)\n", 386 | "\t\n", 387 | "\tdef conv_block(self, input, conv_layer):\n", 388 | "\t\tconv_out = conv_layer(input)# conv_out.size() = (batch_size, out_channels, dim, 1)\n", 389 | "\t\tactivation = F.relu(conv_out.squeeze(3))# activation.size() = (batch_size, out_channels, dim1)\n", 390 | "\t\tmax_out = F.max_pool1d(activation, activation.size()[2]).squeeze(2)# maxpool_out.size() = (batch_size, out_channels)\n", 391 | "\t\t\n", 392 | "\t\treturn max_out\n", 393 | "\t\n", 394 | "\tdef forward(self, input_sentences, batch_size=None):\n", 395 | "\t\t\n", 396 | "\t\t\"\"\"\n", 397 | "\t\tThe idea of the Convolutional Neural Network for Text Classification is very simple. We perform convolution operation on the embedding matrix \n", 398 | "\t\twhose shape for each batch is (num_seq, embedding_length) with kernels of varying height but constant width, which is the same as the embedding_length.\n", 399 | "\t\tWe will be using ReLU activation after the convolution operation and then for each kernel height, we will use a max_pool operation on each tensor \n", 400 | "\t\tand keep the maximum activation for every channel, and then we will concatenate the resulting tensors. This output is then fully connected\n", 401 | "\t\tto the output layer consisting of two units, which gives us the logits for both positive and negative classes.\n", 402 | "\t\t\n", 403 | "\t\tParameters\n", 404 | "\t\t----------\n", 405 | "\t\tinput_sentences: input_sentences of shape = (batch_size, num_sequences)\n", 406 | "\t\tbatch_size : default = None. Used only for prediction on a single sentence after training (batch_size = 1)\n", 407 | "\t\t\n", 408 | "\t\tReturns\n", 409 | "\t\t-------\n", 410 | "\t\tOutput of the linear layer containing logits for pos & neg class.\n", 411 | "\t\tlogits.size() = (batch_size, output_size)\n", 412 | "\t\t\n", 413 | "\t\t\"\"\"\n", 414 | "\t\t\n", 415 | "\t\tinput = self.word_embeddings(input_sentences)\n", 416 | "\t\t# input.size() = (batch_size, num_seq, embedding_length)\n", 417 | "\t\tinput = input.unsqueeze(1)\n", 418 | "\t\t# input.size() = (batch_size, 1, num_seq, embedding_length)\n", 419 | "\t\tmax_out1 = self.conv_block(input, self.conv1)\n", 420 | "\t\tmax_out2 = self.conv_block(input, self.conv2)\n", 421 | "\t\tmax_out3 = self.conv_block(input, self.conv3)\n", 422 | "\t\t\n", 423 | "\t\tall_out = torch.cat((max_out1, max_out2, max_out3), 1)\n", 424 | "\t\t# all_out.size() = (batch_size, num_kernels*out_channels)\n", 425 | "\t\tfc_in = self.dropout(all_out)\n", 426 | "\t\t# fc_in.size() = (batch_size, num_kernels*out_channels)\n", 427 | "\t\tlogits = self.label(fc_in)\n", 428 | "\t\t\n", 429 | "\t\treturn logits" 430 | ], 431 | "execution_count": 10, 432 | "outputs": [] 433 | }, 
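To make the shape bookkeeping in `conv_block` and `forward` concrete, here is a standalone sketch with toy sizes (a batch of 2 sentences, 7 tokens each, a single kernel height), independent of the class defined above:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

# One conv branch: batch of 2, 1 input channel, 7 tokens, embedding width 300.
x = torch.randn(2, 1, 7, 300)        # (batch, in_channels, num_seq, embedding_length)
conv = nn.Conv2d(1, 100, (3, 300))   # kernel spans the full embedding width
out = conv(x)                        # (2, 100, 5, 1), since 5 = 7 - 3 + 1
act = F.relu(out.squeeze(3))         # (2, 100, 5)
pooled = F.max_pool1d(act, act.size(2)).squeeze(2)  # (2, 100): one max per channel
print(out.shape, pooled.shape)
```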
{ 434 | "cell_type": "markdown", 435 | "metadata": { 436 | "id": "rHvNfrkqH6R5" 437 | }, 438 | "source": [ 439 | "Listing all Hyper-parameters" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "metadata": { 445 | "id": "k7Bfyz1l5m8y" 446 | }, 447 | "source": [ 448 | "\n", 449 | "learning_rate = 2e-5\n", 450 | "batch_size = 32\n", 451 | "output_size = 2\n", 452 | "in_channels = 1\n", 453 | "out_channels = 100\n", 454 | "kernel_heights = [2,3,4]\n", 455 | "keep_probab = 0.5\n", 456 | "stride = 1\n", 457 | "padding = 0\n", 458 | "vocab_size = len(TEXT.vocab)\n", 459 | "embedding_length = 300\n", 460 | "word_embeddings = TEXT.vocab.vectors\n", 461 | "\n", 462 | "loss_fn = F.cross_entropy\n", 463 | "\n", 464 | "\n", 465 | "model = CNN(batch_size, output_size, in_channels, out_channels, kernel_heights, stride, padding, keep_probab, vocab_size, embedding_length, word_embeddings)" 466 | ], 467 | "execution_count": 23, 468 | "outputs": [] 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "metadata": { 473 | "id": "YyhIsWJRIBkL" 474 | }, 475 | "source": [ 476 | "Training the model (note: train_model above builds its own Adam optimiser with default settings, so learning_rate is listed but unused)" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "metadata": { 482 | "colab": { 483 | "base_uri": "https://localhost:8080/" 484 | }, 485 | "id": "uu-DpGZ15rIs", 486 | "outputId": "2319db60-152f-44fc-c727-05466817eabf" 487 | }, 488 | "source": [ 489 | "\n", 490 | "for epoch in range(10):\n", 491 | " train_loss, train_acc = train_model(model, train_iter, epoch)\n", 492 | " val_loss, val_acc = eval_model(model, valid_iter)\n", 493 | " \n", 494 | " print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:.3f}, Val. 
Acc: {val_acc:.2f}%')\n", 496 | " \n", 497 | "test_loss, test_acc = eval_model(model, test_iter)\n", 498 | "print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')\n" 499 | ], 500 | "execution_count": 24, 501 | "outputs": [ 502 | { 503 | "output_type": "stream", 504 | "text": [ 505 | "Epoch: 1, Idx: 100, Training Loss: 0.5618, Training Accuracy: 78.12%\n", 506 | "Epoch: 1, Idx: 200, Training Loss: 0.5752, Training Accuracy: 68.75%\n", 507 | "Epoch: 1, Idx: 300, Training Loss: 0.3646, Training Accuracy: 84.38%\n", 508 | "Epoch: 1, Idx: 400, Training Loss: 0.4188, Training Accuracy: 81.25%\n", 509 | "Epoch: 1, Idx: 500, Training Loss: 0.8118, Training Accuracy: 68.75%\n" 510 | ], 511 | "name": "stdout" 512 | }, 513 | { 514 | "output_type": "stream", 515 | "text": [ 516 | "/usr/local/lib/python3.7/dist-packages/torchtext/data/field.py:321: UserWarning: volatile was removed and now has no effect. Use `with torch.no_grad():` instead.\n", 517 | " return Variable(arr, volatile=not train), lengths\n", 518 | "/usr/local/lib/python3.7/dist-packages/torchtext/data/field.py:322: UserWarning: volatile was removed and now has no effect. Use `with torch.no_grad():` instead.\n", 519 | " return Variable(arr, volatile=not train)\n" 520 | ], 521 | "name": "stderr" 522 | }, 523 | { 524 | "output_type": "stream", 525 | "text": [ 526 | "Epoch: 01, Train Loss: 0.500, Train Acc: 74.68%, Val. Loss: 0.409318, Val. Acc: 79.84%\n", 527 | "Epoch: 2, Idx: 100, Training Loss: 0.3630, Training Accuracy: 84.38%\n", 528 | "Epoch: 2, Idx: 200, Training Loss: 0.2427, Training Accuracy: 84.38%\n", 529 | "Epoch: 2, Idx: 300, Training Loss: 0.6100, Training Accuracy: 75.00%\n", 530 | "Epoch: 2, Idx: 400, Training Loss: 0.5128, Training Accuracy: 68.75%\n", 531 | "Epoch: 2, Idx: 500, Training Loss: 0.3353, Training Accuracy: 84.38%\n", 532 | "Epoch: 02, Train Loss: 0.404, Train Acc: 81.58%, Val. Loss: 0.363430, Val. Acc: 83.05%\n", 533 | "Epoch: 3, Idx: 100, Training Loss: 0.4209, Training Accuracy: 81.25%\n", 534 | "Epoch: 3, Idx: 200, Training Loss: 0.3722, Training Accuracy: 81.25%\n", 535 | "Epoch: 3, Idx: 300, Training Loss: 0.4726, Training Accuracy: 81.25%\n", 536 | "Epoch: 3, Idx: 400, Training Loss: 0.4197, Training Accuracy: 75.00%\n", 537 | "Epoch: 3, Idx: 500, Training Loss: 0.5725, Training Accuracy: 75.00%\n", 538 | "Epoch: 03, Train Loss: 0.352, Train Acc: 84.21%, Val. Loss: 0.364105, Val. Acc: 83.29%\n", 539 | "Epoch: 4, Idx: 100, Training Loss: 0.3276, Training Accuracy: 81.25%\n", 540 | "Epoch: 4, Idx: 200, Training Loss: 0.3190, Training Accuracy: 84.38%\n", 541 | "Epoch: 4, Idx: 300, Training Loss: 0.3192, Training Accuracy: 81.25%\n", 542 | "Epoch: 4, Idx: 400, Training Loss: 0.3468, Training Accuracy: 84.38%\n", 543 | "Epoch: 4, Idx: 500, Training Loss: 0.3960, Training Accuracy: 84.38%\n", 544 | "Epoch: 04, Train Loss: 0.312, Train Acc: 86.39%, Val. Loss: 0.371037, Val. Acc: 82.59%\n", 545 | "Epoch: 5, Idx: 100, Training Loss: 0.2751, Training Accuracy: 93.75%\n", 546 | "Epoch: 5, Idx: 200, Training Loss: 0.2091, Training Accuracy: 93.75%\n", 547 | "Epoch: 5, Idx: 300, Training Loss: 0.5730, Training Accuracy: 81.25%\n", 548 | "Epoch: 5, Idx: 400, Training Loss: 0.2439, Training Accuracy: 84.38%\n", 549 | "Epoch: 5, Idx: 500, Training Loss: 0.2319, Training Accuracy: 93.75%\n", 550 | "Epoch: 05, Train Loss: 0.269, Train Acc: 88.48%, Val. Loss: 0.369516, Val. 
Acc: 83.85%\n", 551 | "Epoch: 6, Idx: 100, Training Loss: 0.3177, Training Accuracy: 81.25%\n", 552 | "Epoch: 6, Idx: 200, Training Loss: 0.4831, Training Accuracy: 78.12%\n", 553 | "Epoch: 6, Idx: 300, Training Loss: 0.0979, Training Accuracy: 100.00%\n", 554 | "Epoch: 6, Idx: 400, Training Loss: 0.1686, Training Accuracy: 93.75%\n", 555 | "Epoch: 6, Idx: 500, Training Loss: 0.1559, Training Accuracy: 96.88%\n", 556 | "Epoch: 06, Train Loss: 0.239, Train Acc: 90.35%, Val. Loss: 0.379501, Val. Acc: 83.43%\n", 557 | "Epoch: 7, Idx: 100, Training Loss: 0.2812, Training Accuracy: 87.50%\n", 558 | "Epoch: 7, Idx: 200, Training Loss: 0.1574, Training Accuracy: 96.88%\n", 559 | "Epoch: 7, Idx: 300, Training Loss: 0.0601, Training Accuracy: 100.00%\n", 560 | "Epoch: 7, Idx: 400, Training Loss: 0.2383, Training Accuracy: 90.62%\n", 561 | "Epoch: 7, Idx: 500, Training Loss: 0.3109, Training Accuracy: 87.50%\n", 562 | "Epoch: 07, Train Loss: 0.211, Train Acc: 91.16%, Val. Loss: 0.398822, Val. Acc: 82.96%\n", 563 | "Epoch: 8, Idx: 100, Training Loss: 0.1824, Training Accuracy: 93.75%\n", 564 | "Epoch: 8, Idx: 200, Training Loss: 0.1994, Training Accuracy: 90.62%\n", 565 | "Epoch: 8, Idx: 300, Training Loss: 0.2334, Training Accuracy: 90.62%\n", 566 | "Epoch: 8, Idx: 400, Training Loss: 0.2106, Training Accuracy: 87.50%\n", 567 | "Epoch: 8, Idx: 500, Training Loss: 0.4483, Training Accuracy: 78.12%\n", 568 | "Epoch: 08, Train Loss: 0.184, Train Acc: 92.53%, Val. Loss: 0.407115, Val. Acc: 83.46%\n", 569 | "Epoch: 9, Idx: 100, Training Loss: 0.0677, Training Accuracy: 96.88%\n", 570 | "Epoch: 9, Idx: 200, Training Loss: 0.1273, Training Accuracy: 93.75%\n", 571 | "Epoch: 9, Idx: 300, Training Loss: 0.1377, Training Accuracy: 93.75%\n", 572 | "Epoch: 9, Idx: 400, Training Loss: 0.1893, Training Accuracy: 87.50%\n", 573 | "Epoch: 9, Idx: 500, Training Loss: 0.2026, Training Accuracy: 93.75%\n", 574 | "Epoch: 09, Train Loss: 0.173, Train Acc: 93.00%, Val. Loss: 0.410676, Val. Acc: 83.50%\n", 575 | "Epoch: 10, Idx: 100, Training Loss: 0.1268, Training Accuracy: 96.88%\n", 576 | "Epoch: 10, Idx: 200, Training Loss: 0.3121, Training Accuracy: 81.25%\n", 577 | "Epoch: 10, Idx: 300, Training Loss: 0.0726, Training Accuracy: 96.88%\n", 578 | "Epoch: 10, Idx: 400, Training Loss: 0.1684, Training Accuracy: 90.62%\n", 579 | "Epoch: 10, Idx: 500, Training Loss: 0.1953, Training Accuracy: 87.50%\n", 580 | "Epoch: 10, Train Loss: 0.160, Train Acc: 93.57%, Val. Loss: 0.436646, Val. Acc: 83.19%\n", 581 | "Test Loss: 0.445, Test Acc: 82.39%\n" 582 | ], 583 | "name": "stdout" 584 | } 585 | ] 586 | }, 587 | { 588 | "cell_type": "markdown", 589 | "metadata": { 590 | "id": "wE1afAC4IE4H" 591 | }, 592 | "source": [ 593 | "Testing the model on custom input" 594 | ] 595 | }, 596 | { 597 | "cell_type": "code", 598 | "metadata": { 599 | "colab": { 600 | "base_uri": "https://localhost:8080/" 601 | }, 602 | "id": "N1kodbbG5tvz", 603 | "outputId": "695582fd-bd4d-4396-fdea-46c33d0a926e" 604 | }, 605 | "source": [ 606 | "test_sen1 = \"This is one of the best creation of Nolan. I can say, it's his magnum opus. Loved the soundtrack and especially those creative dialogues.\"\n", 607 | "test_sen2 = \"Ohh, such a ridiculous movie. Not gonna recommend it to anyone. 
Complete waste of time and money.\"\n", 608 | "\n", 609 | "test_sen1 = TEXT.preprocess(test_sen1)\n", 610 | "test_sen1 = [[TEXT.vocab.stoi[x] for x in test_sen1]]\n", 611 | "\n", 612 | "test_sen2 = TEXT.preprocess(test_sen2)\n", 613 | "test_sen2 = [[TEXT.vocab.stoi[x] for x in test_sen2]]\n", 614 | "\n", 615 | "with torch.no_grad():\n", 616 | " test_sen = np.asarray(test_sen1)\n", 617 | " test_sen = torch.LongTensor(test_sen)\n", 618 | " test_tensor = Variable(test_sen)\n", 619 | " test_tensor = test_tensor.cuda()\n", 620 | " model.eval()\n", 621 | " output = model(test_tensor, 1)\n", 622 | " out = F.softmax(output, 1)\n", 623 | " if (torch.argmax(out[0]) == 1):\n", 624 | " print (\"Sentiment: Positive\")\n", 625 | " else:\n", 626 | " print (\"Sentiment: Negative\")\n", 627 | "\n", 628 | "with torch.no_grad():\n", 629 | " test_sen = np.asarray(test_sen2)\n", 630 | " test_sen = torch.LongTensor(test_sen)\n", 631 | " test_tensor = Variable(test_sen)\n", 632 | " test_tensor = test_tensor.cuda()\n", 633 | " model.eval()\n", 634 | " output = model(test_tensor, 1)\n", 635 | " out = F.softmax(output, 1)\n", 636 | " if (torch.argmax(out[0]) == 1):\n", 637 | " print (\"Sentiment: Positive\")\n", 638 | " else:\n", 639 | " print (\"Sentiment: Negative\")\n", 640 | "\n" 641 | ], 642 | "execution_count": 30, 643 | "outputs": [ 644 | { 645 | "output_type": "stream", 646 | "text": [ 647 | "Sentiment: Positive\n", 648 | "Sentiment: Negative\n" 649 | ], 650 | "name": "stdout" 651 | } 652 | ] 653 | } 654 | ] 655 | } -------------------------------------------------------------------------------- /Intro_to_Deep_Neural_Networks.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Intro_to_Deep_Neural_Networks.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [], 9 | "authorship_tag": "ABX9TyOSv6Xq37HO7FMDzNlq1PKr", 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "view-in-github", 22 | "colab_type": "text" 23 | }, 24 | "source": [ 25 | "\"Open" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "metadata": { 31 | "id": "5fB5G5hIhZ0H", 32 | "colab_type": "code", 33 | "colab": {} 34 | }, 35 | "source": [ 36 | "import numpy as np" 37 | ], 38 | "execution_count": 0, 39 | "outputs": [] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": { 44 | "id": "_9Eedm9HZExP", 45 | "colab_type": "text" 46 | }, 47 | "source": [ 48 | "### $$ Perceptron\\ as\\ Linear\\ Classifier $$\n", 49 | "\n", 50 | "
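In symbols, the unit built in the next cell computes a thresholded linear function of its inputs:

$$ \hat{y} = sign(w \cdot x - \theta) $$

The decision boundary $ w \cdot x = \theta $ is a hyperplane, which is exactly why a single perceptron acts as a linear classifier.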
\n", 51 | "
\n", 52 | "\n", 53 | "
\n", 54 | "
" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "metadata": { 60 | "id": "MgaVwufvgo6m", 61 | "colab_type": "code", 62 | "colab": {} 63 | }, 64 | "source": [ 65 | "# Perceptron as a Boolean Gate\n", 66 | "\n", 67 | "def make_perceptron(weights, threshold):\n", 68 | " # basic unit of computation\n", 69 | " # compute sum of all inputs\n", 70 | " # if the sum is greater than the threshold, output 1\n", 71 | " # else, output -1\n", 72 | "\n", 73 | " return (lambda x : int(np.sign(np.matmul(np.asmatrix(weights), np.asmatrix(x).T)[0, 0] - threshold)))" 74 | ], 75 | "execution_count": 0, 76 | "outputs": [] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": { 81 | "id": "wxwlWGutaaD9", 82 | "colab_type": "text" 83 | }, 84 | "source": [ 85 | "### $$ Not\\ perceptron $$\n", 86 | "\n", 87 | "
\n", 88 | "
\n", 89 | "\n", 90 | "
\n", 91 | "
" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "metadata": { 97 | "id": "40tgNlnYhqWS", 98 | "colab_type": "code", 99 | "outputId": "546eec42-bac5-44a7-8950-4898bbe40943", 100 | "colab": { 101 | "base_uri": "https://localhost:8080/", 102 | "height": 53 103 | } 104 | }, 105 | "source": [ 106 | "# 1 input NOT Gate\n", 107 | "# threshold = 0\n", 108 | "# weights = [-1]\n", 109 | "\n", 110 | "not_tron = make_perceptron([-1], 0)\n", 111 | "print(\"NOT(1):\", not_tron([1]))\n", 112 | "print(\"NOT(-1):\", not_tron([-1]))" 113 | ], 114 | "execution_count": 3, 115 | "outputs": [ 116 | { 117 | "output_type": "stream", 118 | "text": [ 119 | "NOT(1): -1\n", 120 | "NOT(-1): 1\n" 121 | ], 122 | "name": "stdout" 123 | } 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": { 129 | "id": "Ajb9AD1xat1i", 130 | "colab_type": "text" 131 | }, 132 | "source": [ 133 | "### $$ Or\\ perceptron $$\n", 134 | "\n", 135 | "
\n", 136 | "
\n", 137 | "\n", 138 | "
\n", 139 | "
" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "metadata": { 145 | "id": "7s5y6tOwi0SO", 146 | "colab_type": "code", 147 | "outputId": "c4e99055-70e0-43c9-f69c-141d02653318", 148 | "colab": { 149 | "base_uri": "https://localhost:8080/", 150 | "height": 88 151 | } 152 | }, 153 | "source": [ 154 | "# 2 input OR Gate\n", 155 | "# threshold = -0.5\n", 156 | "# weights = [1, 1]\n", 157 | "\n", 158 | "or_tron = make_perceptron([1, 1], -0.5)\n", 159 | "print(\"OR(-1, -1):\", or_tron([-1, -1]))\n", 160 | "print(\"OR(-1, 1):\", or_tron([-1, 1]))\n", 161 | "print(\"OR(1, -1):\", or_tron([1, -1]))\n", 162 | "print(\"OR(1, 1):\", or_tron([1, 1]))" 163 | ], 164 | "execution_count": 4, 165 | "outputs": [ 166 | { 167 | "output_type": "stream", 168 | "text": [ 169 | "OR(-1, -1): -1\n", 170 | "OR(-1, 1): 1\n", 171 | "OR(1, -1): 1\n", 172 | "OR(1, 1): 1\n" 173 | ], 174 | "name": "stdout" 175 | } 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": { 181 | "id": "tElWcM1QbD-P", 182 | "colab_type": "text" 183 | }, 184 | "source": [ 185 | "### $$ And\\ perceptron $$\n", 186 | "\n", 187 | "
\n", 188 | "
\n", 189 | "\n", 190 | "
\n", 191 | "
" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "metadata": { 197 | "id": "f6tau4fakKki", 198 | "colab_type": "code", 199 | "outputId": "6d8b8753-972a-4627-e884-79358a4a95c7", 200 | "colab": { 201 | "base_uri": "https://localhost:8080/", 202 | "height": 88 203 | } 204 | }, 205 | "source": [ 206 | "# 2 input AND Gate\n", 207 | "# threshold = 1.5\n", 208 | "# weights = [1, 1]\n", 209 | "\n", 210 | "and_tron = make_perceptron([1, 1], 0.5)\n", 211 | "print(\"AND(-1, -1):\", and_tron([-1, -1]))\n", 212 | "print(\"AND(-1, 1):\", and_tron([-1, 1]))\n", 213 | "print(\"AND(1, -1):\", and_tron([1, -1]))\n", 214 | "print(\"AND(1, 1):\", and_tron([1, 1]))" 215 | ], 216 | "execution_count": 5, 217 | "outputs": [ 218 | { 219 | "output_type": "stream", 220 | "text": [ 221 | "AND(-1, -1): -1\n", 222 | "AND(-1, 1): -1\n", 223 | "AND(1, -1): -1\n", 224 | "AND(1, 1): 1\n" 225 | ], 226 | "name": "stdout" 227 | } 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": { 233 | "id": "egK06_GGbNPe", 234 | "colab_type": "text" 235 | }, 236 | "source": [ 237 | "### $$ Xor\\ problem $$\n", 238 | "\n", 239 | "
\n", 240 | "
\n", 241 | "\n", 242 | "
\n", 243 | "
" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": { 249 | "id": "F8VYugugOYax", 250 | "colab_type": "text" 251 | }, 252 | "source": [ 253 | "### $$ MLP\\ for\\ Xor $$\n", 254 | "\n", 255 | "
\n", 256 | "
\n", 257 | "\n", 258 | "
\n", 259 | "
" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "metadata": { 265 | "id": "9hPHkkZGodsG", 266 | "colab_type": "code", 267 | "outputId": "918b8857-c9d0-4dfd-d316-f4b590661a82", 268 | "colab": { 269 | "base_uri": "https://localhost:8080/", 270 | "height": 88 271 | } 272 | }, 273 | "source": [ 274 | "# What about XOR?\n", 275 | "# We need more than one perceptron\n", 276 | "# Therefore, we use a Multi-Layer Perceptron (MLP)\n", 277 | "\n", 278 | "xor_tron = (lambda x : and_tron([or_tron(x), or_tron([not_tron(x[0]), not_tron(x[1])])]))\n", 279 | "print(\"XOR(-1, -1):\", xor_tron([-1, -1]))\n", 280 | "print(\"XOR(-1, 1):\", xor_tron([-1, 1]))\n", 281 | "print(\"XOR(1, -1):\", xor_tron([1, -1]))\n", 282 | "print(\"XOR(1, 1):\", xor_tron([1, 1]))" 283 | ], 284 | "execution_count": 6, 285 | "outputs": [ 286 | { 287 | "output_type": "stream", 288 | "text": [ 289 | "XOR(-1, -1): -1\n", 290 | "XOR(-1, 1): 1\n", 291 | "XOR(1, -1): 1\n", 292 | "XOR(1, 1): -1\n" 293 | ], 294 | "name": "stdout" 295 | } 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": { 301 | "id": "payCDtwM8GRf", 302 | "colab_type": "text" 303 | }, 304 | "source": [ 305 | "### $ >\\ Try\\ defining\\ other\\ functions\\ using\\ perceptrons.$\n", 306 | "\n", 307 | "---" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": { 313 | "id": "G7upVhPLcxuR", 314 | "colab_type": "text" 315 | }, 316 | "source": [ 317 | "### $$ Generic\\ MLP\\ without\\ Bias\\ Terms $$\n", 318 | "\n", 319 | "
\n", 320 | "
\n", 321 | "\n", 322 | "
\n", 323 | "
" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": { 329 | "id": "1YbECi5yOh1x", 330 | "colab_type": "text" 331 | }, 332 | "source": [ 333 | "$$\n", 334 | "X_{3 \\times 1} = \\begin{pmatrix}\n", 335 | "x_1 \\\\\n", 336 | "x_2 \\\\\n", 337 | "x_3 \\\\\n", 338 | "\\end{pmatrix} $$\n", 339 | "\n", 340 | "
\n", 341 | "\n", 342 | "$$\n", 343 | "^1W_{3 \\times 4} = \\begin{pmatrix}\n", 344 | "^1w_{11} && ^1w_{12} && ^1w_{13} && ^1w_{14} \\\\\n", 345 | "^1w_{21} && ^1w_{22} && ^1w_{23} && ^1w_{24} \\\\\n", 346 | "^1w_{31} && ^1w_{32} && ^1w_{33} && ^1w_{34} \\\\\n", 347 | "\\end{pmatrix}\n", 348 | "$$\n", 349 | "\n", 350 | "
\n", 351 | "\n", 352 | "$$\n", 353 | "^2W_{4 \\times 4} = \\begin{pmatrix}\n", 354 | "^2w_{11} && ^2w_{12} && ^2w_{13} && ^2w_{14} \\\\\n", 355 | "^2w_{21} && ^2w_{22} && ^2w_{23} && ^2w_{24} \\\\\n", 356 | "^2w_{31} && ^2w_{32} && ^2w_{33} && ^2w_{34} \\\\\n", 357 | "^2w_{41} && ^2w_{42} && ^2w_{43} && ^2w_{44} \\\\\n", 358 | "\\end{pmatrix} $$\n", 359 | "\n", 360 | "
\n", 361 | "\n", 362 | "$$\n", 363 | "^3W_{4 \\times 1} = \\begin{pmatrix}\n", 364 | "^3w_{11} \\\\\n", 365 | "^3w_{21} \\\\\n", 366 | "^3w_{31} \\\\\n", 367 | "^3w_{41} \\\\\n", 368 | "\\end{pmatrix} $$\n", 369 | "\n", 370 | "
\n", 371 | "\n", 372 | "$$ What\\ about\\ ^3W^T.^2W^T.^1W^T.X? $$\n", 373 | "\n", 374 | "
\n", 375 | "\n", 376 | "$$ Reduces\\ to\\ W.X\\ where\\ W\\ is\\ a\\ 1 \\times 3\\ matrix $$\n", 377 | "\n", 378 | "
\n", 379 | "\n", 380 | "$$ Therefore,\\ we\\ need\\ to\\ introduce\\ non-linearities: $$\n", 381 | "\n", 382 | "$$ H_1 = f_1(^1W^T.X) $$\n", 383 | "$$ H_2 = f_2(^2W^T.H_1) $$\n", 384 | "$$ o\\ =\\ ^3W^T.H_2 $$\n", 385 | "\n", 386 | "$$ where\\ f_1,\\ and\\ f_2,\\ are\\ non-linear\\ activation\\ functions $$\n", 387 | "\n", 388 | "$$ \\therefore o\\ =\\ ^3W^T.(f_2(^2W^T.(f_1(^1W^T.X)))) $$" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "metadata": { 394 | "id": "TSki8cprWfre", 395 | "colab_type": "code", 396 | "outputId": "77446ec1-7f4f-485e-d656-2b02c3cb7678", 397 | "colab": { 398 | "base_uri": "https://localhost:8080/", 399 | "height": 88 400 | } 401 | }, 402 | "source": [ 403 | "# Building the Neural Network\n", 404 | "X = np.random.uniform(0, 1, 3)\n", 405 | "\n", 406 | "W_1 = np.random.uniform(0, 1, (3, 4))\n", 407 | "W_2 = np.random.uniform(0, 1, (4, 4))\n", 408 | "W_3 = np.random.uniform(0, 1, (4, 1))\n", 409 | "\n", 410 | "H_1 = np.tanh(W_1.T @ X)\n", 411 | "H_2 = np.tanh(W_2.T @ H_1)\n", 412 | "o = W_3.T @ H_2\n", 413 | "\n", 414 | "print(\"input layer:\\t\", X)\n", 415 | "print(\"hidden layer 1:\\t\", H_1)\n", 416 | "print(\"hidden layer 2:\\t\", H_2)\n", 417 | "print(\"output layer:\\t\", o)" 418 | ], 419 | "execution_count": 7, 420 | "outputs": [ 421 | { 422 | "output_type": "stream", 423 | "text": [ 424 | "input layer:\t [0.86400536 0.96209315 0.52361253]\n", 425 | "hidden layer 1:\t [0.88276624 0.90933301 0.67527933 0.7899755 ]\n", 426 | "hidden layer 2:\t [0.99211059 0.84954129 0.94098012 0.57407657]\n", 427 | "output layer:\t [2.92327043]\n" 428 | ], 429 | "name": "stdout" 430 | } 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": { 436 | "id": "6f3VCynw8XW2", 437 | "colab_type": "text" 438 | }, 439 | "source": [ 440 | "### $ >\\ Define\\ and\\ use\\ other\\ activation\\ functions.$\n", 441 | "\n", 442 | "---" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "metadata": { 448 | "id": "2rUnEfoCX92W", 449 | "colab_type": "code", 450 | "outputId": "442d2f35-eccd-4a60-fa65-3abc6d8e268e", 451 | "colab": { 452 | "base_uri": "https://localhost:8080/", 453 | "height": 35 454 | } 455 | }, 456 | "source": [ 457 | "# using scikit learn to define multi-layer perceptrons\n", 458 | "from sklearn.neural_network import MLPRegressor\n", 459 | "\n", 460 | "reg = MLPRegressor(hidden_layer_sizes = (4, 4),\n", 461 | " random_state = 2020,\n", 462 | " activation = 'tanh',\n", 463 | " max_iter = int(1e7))\n", 464 | "\n", 465 | "# Trying to fit weights to input X and output o\n", 466 | "reg.fit(np.asmatrix(X), np.asarray(o))\n", 467 | "\n", 468 | "# reg is our MLP model\n", 469 | "\n", 470 | "# Passing X through the MLP\n", 471 | "print(\"Predicted Output:\", reg.predict(np.asmatrix(X)))" 472 | ], 473 | "execution_count": 8, 474 | "outputs": [ 475 | { 476 | "output_type": "stream", 477 | "text": [ 478 | "Predicted Output: [2.81988345]\n" 479 | ], 480 | "name": "stdout" 481 | } 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "metadata": { 487 | "id": "aMK8DfC9r0uJ", 488 | "colab_type": "code", 489 | "colab": {} 490 | }, 491 | "source": [ 492 | "# Thank You" 493 | ], 494 | "execution_count": 0, 495 | "outputs": [] 496 | } 497 | ] 498 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Introduction to Deep Learning Tutorials, Summer '20, '21 2 | 3 | - [Recorded sessions from summer 
2021](https://web.microsoftstream.com/channel/6fd9affc-b350-47e8-bb5a-b4dd209f24ba) (Use your IIIT-H credentials for access) 4 | 5 | ### Intro to Deep Learning and Word Vectors 6 | 7 | > [Slides](https://docs.google.com/presentation/d/1KLXUDtq3OwYkcSmLKsl3ZirIj8eXJODS-razdfQVGeY/edit?usp=sharing) 8 | 9 | > [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sayarghoshroy/Intro_to_DL_tutorial/blob/master/Intro_to_Deep_Neural_Networks.ipynb) Intro to Deep NN Playground 10 | 11 | > [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sayarghoshroy/Intro_to_DL_tutorial/blob/master/Word_Embeddings.ipynb) Word Vectors Playground 12 | 13 | --- 14 | 15 | ### Preliminary Mathematics and Machine Learning 16 | 17 | > [Slides](https://docs.google.com/presentation/d/1OqxR1O218ZUkUeSNgo7c46HMHms2PyiRWw8SWFgqLF4/edit?usp=sharing) 18 | 19 | > [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sayarghoshroy/Intro_to_DL_tutorial/blob/master/regression_%26_minimization.ipynb) Visualizing Gradient Descent — Regression and Function Minimization 20 | 21 | > [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sayarghoshroy/Intro_to_DL_tutorial/blob/master/classification.ipynb) Example on Classification 22 | 23 | > [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sayarghoshroy/Intro_to_DL_tutorial/blob/master/training_a_neural_network.ipynb) Training a Neural Network on a Real World Dataset 24 | 25 | The dataset for the fourth example can be found [here](https://archive.ics.uci.edu/ml/datasets/Concrete+Compressive+Strength). 26 | 27 | If this is way too basic for you, and you're really interested in Optimization Algorithms, go [here](https://github.com/sayarghoshroy/Optimization_and_Learning). 28 | 29 | > [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sayarghoshroy/Intro_to_DL_tutorial/blob/master/PCA_and_LDA.ipynb) PCA and LDA ~ Illustration 30 | > 31 | > [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sayarghoshroy/Intro_to_DL_tutorial/blob/master/sentiment_analysis.ipynb) Sentiment Analysis of Movie Reviews 32 | 33 | > [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sayarghoshroy/Summarization/blob/master/summarization_model.ipynb) Deep NN for Sentence Salience Prediction & how it can lead to Extractive Summarization 34 | 35 | The dataset of sentences with labelled scores can be downloaded from [here](https://drive.google.com/file/d/1WpydthiqqUDYsWx7KqfA0-nlQ6XNZWz6/view?usp=sharing). 
36 | 37 | --- 38 | 39 | ### Convolutional Neural Networks 40 | 41 | > [Slides](https://docs.google.com/presentation/d/19e5bHn3zfcfQB8UyiAw9LFDGniOA5oskLBFZ1EcnIWQ/edit?usp=sharing) 42 | 43 | > [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sayarghoshroy/Intro_to_DL_tutorial/blob/master/CNN_Text_classification.ipynb) CNN-based Text Encoder 44 | 45 | > [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sayarghoshroy/Intro_to_DL_tutorial/blob/master/CNN_image_Classification.ipynb) CNNs for Image Classification 46 | 47 | --- 48 | 49 | ### Recurrent Neural Networks 50 | 51 | > [Slides](https://docs.google.com/presentation/d/10QvPP7UDaX24Spypa_KligcsCUn2ME6bHKYXbTFdiko/edit?usp=sharing) 52 | 53 | > [Neural Machine Translation with Encoder-Decoder and Attention](https://github.com/sayarghoshroy/Neural_Machine_Translation) 54 | 55 | > [Time Series Prediction using Recurrent Neural Networks](https://github.com/sayarghoshroy/Recurrent_NN_Modelling) 56 | 57 | My write-up on the use of attention schemes in Neural QA can be found [here](https://docs.google.com/document/d/1K7lPsVtBF60O-dfFzQK-1oyB-uynuEokgqMQkyr5YC0/edit?usp=sharing). 58 | 59 | --- 60 | 61 | ### Transformer Models 62 | 63 | 64 | > [Slides](https://iiitaphyd-my.sharepoint.com/:p:/g/personal/tanmay_sachan_research_iiit_ac_in/EeF2R0GIwchFshS4cpAbreYBvsNJIQ9Iw5jYcntY1LRfAg?e=3gvRIT) 65 | 66 | > [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sayarghoshroy/Intro_to_DL_tutorial/blob/master/Attention_is_All_You_Need.ipynb) Building a Transformer Neural Network 67 | 68 | > [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sayarghoshroy/Intro_to_DL_tutorial/blob/master/classification_test_bench.ipynb) Simple Transformers for basic tasks using Transformer neural networks 69 | 70 | --- 71 | 72 | -------------------------------------------------------------------------------- /Word_Embeddings.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.7.3" 21 | }, 22 | "colab": { 23 | "name": "Word_Embeddings.ipynb", 24 | "provenance": [], 25 | "collapsed_sections": [], 26 | "include_colab_link": true 27 | } 28 | }, 29 | "cells": [ 30 | { 31 | "cell_type": "markdown", 32 | "metadata": { 33 | "id": "view-in-github", 34 | "colab_type": "text" 35 | }, 36 | "source": [ 37 | "\"Open" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "metadata": { 43 | "id": "bJrH9l8GhByQ", 44 | "colab_type": "code", 45 | "colab": {} 46 | }, 47 | "source": [ 48 | "import spacy\n", 49 | "import numpy as np" 50 | ], 51 | "execution_count": 0, 52 | "outputs": [] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "metadata": { 57 | "id": "LpG4AaPeiVRe", 58 | "colab_type": "code", 59 | "colab": {} 60 | }, 61 | "source": [ 62 | "# Running on local machine?\n", 63 | "# in case of errors with conda, try this:\n", 64 | "# conda install -c 
conda-forge spacy" 65 | ], 66 | "execution_count": 0, 67 | "outputs": [] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "metadata": { 72 | "id": "J5I-oFwMiLOr", 73 | "colab_type": "code", 74 | "colab": {} 75 | }, 76 | "source": [ 77 | "# for the default model in Spacy\n", 78 | "# nlp = spacy.load(\"en_core_web_sm\")" 79 | ], 80 | "execution_count": 0, 81 | "outputs": [] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "metadata": { 86 | "id": "rZ3YXN28h7ww", 87 | "colab_type": "code", 88 | "outputId": "f9c8bf26-c188-4f42-c200-c4a58e92b642", 89 | "colab": { 90 | "base_uri": "https://localhost:8080/", 91 | "height": 88 92 | } 93 | }, 94 | "source": [ 95 | "# for the large language model for English\n", 96 | "# might take some time to download\n", 97 | "\n", 98 | "# uncomment the next line if default English model data cannot be located\n", 99 | "# !python -m spacy download en\n", 100 | "\n", 101 | "# uncomment the next line if the large model for English cannot be located\n", 102 | "# !python -m spacy download en_core_web_lg\n", 103 | "\n", 104 | "!python -m spacy link en_core_web_lg en --force\n", 105 | "# use the large model as the default model for English textual data\n", 106 | "\n", 107 | "nlp = spacy.load(\"en\")" 108 | ], 109 | "execution_count": 0, 110 | "outputs": [ 111 | { 112 | "output_type": "stream", 113 | "text": [ 114 | "\u001b[38;5;2m✔ Linking successful\u001b[0m\n", 115 | "/usr/local/lib/python3.6/dist-packages/en_core_web_lg -->\n", 116 | "/usr/local/lib/python3.6/dist-packages/spacy/data/en\n", 117 | "You can now load the model via spacy.load('en')\n" 118 | ], 119 | "name": "stdout" 120 | } 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "metadata": { 126 | "id": "JaYkgReRn31q", 127 | "colab_type": "code", 128 | "colab": {} 129 | }, 130 | "source": [ 131 | "def cosine_similarity(vec_A, vec_B):\n", 132 | " return np.dot(np.asarray(vec_A), np.asarray(vec_B)) / (np.linalg.norm(np.asarray(vec_A)) * np.linalg.norm(np.asarray(vec_B)))" 133 | ], 134 | "execution_count": 0, 135 | "outputs": [] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": { 140 | "id": "f7tT9BzqpJHU", 141 | "colab_type": "text" 142 | }, 143 | "source": [ 144 | "### Will try out various 'Similarity' Tasks for Word Embeddings" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "metadata": { 150 | "id": "hdqBM-vchBym", 151 | "colab_type": "code", 152 | "outputId": "38038471-1f23-4f1d-931c-0d489c13e200", 153 | "colab": { 154 | "base_uri": "https://localhost:8080/", 155 | "height": 35 156 | } 157 | }, 158 | "source": [ 159 | "# Gender\n", 160 | "\n", 161 | "tokens = nlp(\"king man woman queen\")\n", 162 | "\n", 163 | "for token in tokens:\n", 164 | " if(token.text == 'king'):\n", 165 | " vec_king = token.vector\n", 166 | " if(token.text == 'man'):\n", 167 | " vec_man = token.vector\n", 168 | " if(token.text == 'woman'):\n", 169 | " vec_woman = token.vector\n", 170 | " if(token.text == 'queen'):\n", 171 | " vec_queen = token.vector\n", 172 | "\n", 173 | "new_vec = vec_king - vec_man + vec_woman\n", 174 | "print(cosine_similarity(new_vec, vec_queen))" 175 | ], 176 | "execution_count": 0, 177 | "outputs": [ 178 | { 179 | "output_type": "stream", 180 | "text": [ 181 | "0.78808445\n" 182 | ], 183 | "name": "stdout" 184 | } 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "metadata": { 190 | "id": "WOPh6nSfhBzl", 191 | "colab_type": "code", 192 | "outputId": "cf8702e9-f372-4dbb-81a3-90d47d0b8544", 193 | "colab": { 194 | "base_uri": "https://localhost:8080/", 195 | "height": 35 196 | 
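A natural companion to these probes is retrieving the nearest vocabulary words to a composed vector. Below is a brute-force sketch; `most_similar` is a helper defined here for illustration, not a spaCy API, and it assumes (as with the large English model loaded above) that the vocabulary's lexemes carry vectors:

```python
def most_similar(vec, nlp, top_k=5):
    # brute-force nearest neighbours over lexemes that have a vector
    words = [w for w in nlp.vocab if w.has_vector and w.is_alpha and w.is_lower]
    words.sort(key=lambda w: cosine_similarity(vec, w.vector), reverse=True)
    return [w.text for w in words[:top_k]]

# e.g. most_similar(vec_king - vec_man + vec_woman, nlp)
# should rank 'queen' near the top (often just after 'king' itself).
```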
} 197 | }, 198 | "source": [ 199 | "# Capital Cities\n", 200 | "\n", 201 | "tokens = nlp(\"paris france tokyo japan\")\n", 202 | "\n", 203 | "for token in tokens:\n", 204 | "    if(token.text == 'paris'):\n", 205 | "        vec_paris = token.vector\n", 206 | "    if(token.text == 'france'):\n", 207 | "        vec_france = token.vector\n", 208 | "    if(token.text == 'tokyo'):\n", 209 | "        vec_tokyo = token.vector\n", 210 | "    if(token.text == 'japan'):\n", 211 | "        vec_japan = token.vector\n", 212 | "    \n", 213 | "new_vec = vec_paris - vec_france + vec_japan\n", 214 | "print(cosine_similarity(new_vec, vec_tokyo))" 215 | ], 216 | "execution_count": 0, 217 | "outputs": [ 218 | { 219 | "output_type": "stream", 220 | "text": [ 221 | "0.79177994\n" 222 | ], 223 | "name": "stdout" 224 | } 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "metadata": { 230 | "id": "Ns_lz8hShBzx", 231 | "colab_type": "code", 232 | "outputId": "f334e981-c7a3-41ab-c96a-9f44e369b64a", 233 | "colab": { 234 | "base_uri": "https://localhost:8080/", 235 | "height": 35 236 | } 237 | }, 238 | "source": [ 239 | "# Pluralization\n", 240 | "\n", 241 | "tokens = nlp(\"mouse mice chair chairs\")\n", 242 | "\n", 243 | "for token in tokens:\n", 244 | "    if(token.text == 'mouse'):\n", 245 | "        vec_mouse = token.vector\n", 246 | "    if(token.text == 'mice'):\n", 247 | "        vec_mice = token.vector\n", 248 | "    if(token.text == 'chair'):\n", 249 | "        vec_chair = token.vector\n", 250 | "    if(token.text == 'chairs'):\n", 251 | "        vec_chairs = token.vector\n", 252 | "    \n", 253 | "new_vec = vec_mice - vec_mouse + vec_chair\n", 254 | "print(cosine_similarity(new_vec, vec_chairs))" 255 | ], 256 | "execution_count": 0, 257 | "outputs": [ 258 | { 259 | "output_type": "stream", 260 | "text": [ 261 | "0.6925059\n" 262 | ], 263 | "name": "stdout" 264 | } 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "metadata": { 270 | "id": "bvhdv_2FhBz8", 271 | "colab_type": "code", 272 | "outputId": "b48d5b2c-377a-44f5-ce89-a6876862344e", 273 | "colab": { 274 | "base_uri": "https://localhost:8080/", 275 | "height": 35 276 | } 277 | }, 278 | "source": [ 279 | "# Comparative and Superlative Degrees\n", "# note: 'cold' -> 'colder' is comparative while 'good' -> 'best' is superlative;\n", "# the mismatch in degree may partly explain the weaker similarity below\n", 280 | "\n", 281 | "tokens = nlp(\"good best cold colder\")\n", 282 | "\n", 283 | "for token in tokens:\n", 284 | "    if(token.text == 'cold'):\n", 285 | "        vec_cold = token.vector\n", 286 | "    if(token.text == 'colder'):\n", 287 | "        vec_colder = token.vector\n", 288 | "    if(token.text == 'best'):\n", 289 | "        vec_best = token.vector\n", 290 | "    if(token.text == 'good'):\n", 291 | "        vec_good = token.vector\n", 292 | "    \n", 293 | "new_vec = vec_colder - vec_cold + vec_good\n", 294 | "print(cosine_similarity(new_vec, vec_best))" 295 | ], 296 | "execution_count": 0, 297 | "outputs": [ 298 | { 299 | "output_type": "stream", 300 | "text": [ 301 | "0.4129227\n" 302 | ], 303 | "name": "stdout" 304 | } 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "metadata": { 310 | "id": "SihapaMohB0G", 311 | "colab_type": "code", 312 | "outputId": "1646f44d-5f79-4881-c98a-c5bdc87e743a", 313 | "colab": { 314 | "base_uri": "https://localhost:8080/", 315 | "height": 35 316 | } 317 | }, 318 | "source": [ 319 | "# Present Participle Forms\n", 320 | "\n", 321 | "tokens = nlp(\"think thinking read reading\")\n", 322 | "\n", 323 | "for token in tokens:\n", 324 | "    if(token.text == 'think'):\n", 325 | "        vec_think = token.vector\n", 326 | "    if(token.text == 'thinking'):\n", 327 | "        vec_thinking = token.vector\n", 328 | "    if(token.text == 'read'):\n", 329 | "        vec_read = token.vector\n", 330 | "    if(token.text == 
'reading'):\n", 331 | " vec_reading = token.vector\n", 332 | " \n", 333 | "new_vec = vec_thinking - vec_think + vec_read\n", 334 | "print(cosine_similarity(new_vec, vec_reading))" 335 | ], 336 | "execution_count": 0, 337 | "outputs": [ 338 | { 339 | "output_type": "stream", 340 | "text": [ 341 | "0.78735167\n" 342 | ], 343 | "name": "stdout" 344 | } 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "metadata": { 350 | "id": "4oMf6JRShB0S", 351 | "colab_type": "code", 352 | "outputId": "8cfd77b8-c786-42b8-bd87-6efd7db8f0c4", 353 | "colab": { 354 | "base_uri": "https://localhost:8080/", 355 | "height": 35 356 | } 357 | }, 358 | "source": [ 359 | "# Opposites\n", 360 | "\n", 361 | "tokens = nlp(\"possible impossible ethical unethical\")\n", 362 | "\n", 363 | "for token in tokens:\n", 364 | " if(token.text == 'possible'):\n", 365 | " vec_possible = token.vector\n", 366 | " if(token.text == 'impossible'):\n", 367 | " vec_impossible = token.vector\n", 368 | " if(token.text == 'ethical'):\n", 369 | " vec_ethical = token.vector\n", 370 | " if(token.text == 'unethical'):\n", 371 | " vec_unethical = token.vector\n", 372 | " \n", 373 | "new_vec = vec_impossible - vec_possible + vec_ethical\n", 374 | "print(cosine_similarity(new_vec, vec_unethical))" 375 | ], 376 | "execution_count": 0, 377 | "outputs": [ 378 | { 379 | "output_type": "stream", 380 | "text": [ 381 | "0.54883933\n" 382 | ], 383 | "name": "stdout" 384 | } 385 | ] 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "metadata": { 390 | "id": "Otcw5W-L8pJn", 391 | "colab_type": "text" 392 | }, 393 | "source": [ 394 | "#### $> Differenece\\ between\\ the\\ small\\ and\\ large\\ models\\ for \\ English. $\n", 395 | "##### - $ Check\\ out\\ the\\ medium-sized\\ model. $\n", 396 | "#### $> Try\\ out\\ other\\ operations\\ using\\ the\\ vectors.$\n", 397 | "##### - $ Does\\ Vector\\ Difference\\ really\\ capture\\ co-occurence\\ probability?$\n", 398 | "##### - $ Do\\ distance\\ based\\ metrics\\ add\\ anything\\ to\\ the\\ picture? 
$\n", 399 | "##### - $ Can\\ you\\ define\\ your\\ own\\ weighted\\ similarity\\ metric?\\ What\\ motivates\\ such\\ definitions?$\n" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "metadata": { 405 | "id": "1c9bN6nihB0d", 406 | "colab_type": "code", 407 | "colab": {} 408 | }, 409 | "source": [ 410 | "# ^_^ Thank You" 411 | ], 412 | "execution_count": 0, 413 | "outputs": [] 414 | } 415 | ] 416 | } -------------------------------------------------------------------------------- /classification_test_bench.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "classification_test_bench.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [], 9 | "mount_file_id": "1Jxc6F2Ne3EssCIG3vjxLuwUE_xjH33Rk", 10 | "authorship_tag": "ABX9TyMmzRDWsQ54Iy1OiKWaOFXM", 11 | "include_colab_link": true 12 | }, 13 | "kernelspec": { 14 | "name": "python3", 15 | "display_name": "Python 3" 16 | }, 17 | "accelerator": "GPU" 18 | }, 19 | "cells": [ 20 | { 21 | "cell_type": "markdown", 22 | "metadata": { 23 | "id": "view-in-github", 24 | "colab_type": "text" 25 | }, 26 | "source": [ 27 | "\"Open" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "metadata": { 33 | "id": "GoNXzxiuJh_T" 34 | }, 35 | "source": [ 36 | "import warnings\n", 37 | "warnings.simplefilter('ignore')" 38 | ], 39 | "execution_count": null, 40 | "outputs": [] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "metadata": { 45 | "id": "KdARbO3dJwvL" 46 | }, 47 | "source": [ 48 | "%%capture\n", 49 | "# Suppressing cell output\n", 50 | "!pip install torch\n", 51 | "import torch" 52 | ], 53 | "execution_count": null, 54 | "outputs": [] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "metadata": { 59 | "colab": { 60 | "base_uri": "https://localhost:8080/" 61 | }, 62 | "id": "NYVzGiguKGBs", 63 | "outputId": "470a0681-60c5-4772-9d92-9225ea2dc0f6" 64 | }, 65 | "source": [ 66 | "cuda_available = torch.cuda.is_available()\n", 67 | "print('CUDA is available: ' + str(cuda_available))\n", 68 | "print('PyTorch version: ' + str(torch.__version__))\n", 69 | "if cuda_available:\n", 70 | " torch.device('cuda')" 71 | ], 72 | "execution_count": null, 73 | "outputs": [ 74 | { 75 | "output_type": "stream", 76 | "text": [ 77 | "CUDA is available: True\n", 78 | "PyTorch version: 1.8.1+cu101\n" 79 | ], 80 | "name": "stdout" 81 | } 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "metadata": { 87 | "colab": { 88 | "base_uri": "https://localhost:8080/" 89 | }, 90 | "id": "cI4n9guqKXLT", 91 | "outputId": "800666c3-da4f-4788-9a49-3fea142198d2" 92 | }, 93 | "source": [ 94 | "!nvidia-smi" 95 | ], 96 | "execution_count": null, 97 | "outputs": [ 98 | { 99 | "output_type": "stream", 100 | "text": [ 101 | "Sun Apr 11 20:24:08 2021 \n", 102 | "+-----------------------------------------------------------------------------+\n", 103 | "| NVIDIA-SMI 460.67 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", 104 | "|-------------------------------+----------------------+----------------------+\n", 105 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", 106 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", 107 | "| | | MIG M. 
|\n", 108 | "|===============================+======================+======================|\n", 109 | "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", 110 | "| N/A 36C P8 9W / 70W | 3MiB / 15109MiB | 0% Default |\n", 111 | "| | | N/A |\n", 112 | "+-------------------------------+----------------------+----------------------+\n", 113 | " \n", 114 | "+-----------------------------------------------------------------------------+\n", 115 | "| Processes: |\n", 116 | "| GPU GI CI PID Type Process name GPU Memory |\n", 117 | "| ID ID Usage |\n", 118 | "|=============================================================================|\n", 119 | "| No running processes found |\n", 120 | "+-----------------------------------------------------------------------------+\n" 121 | ], 122 | "name": "stdout" 123 | } 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "metadata": { 129 | "id": "EdZOx5TlKYsJ" 130 | }, 131 | "source": [ 132 | "import os\n", 133 | "import time\n", 134 | "import sys\n", 135 | "import json\n", 136 | "import numpy as np\n", 137 | "import pickle\n", 138 | "import shutil" 139 | ], 140 | "execution_count": null, 141 | "outputs": [] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "metadata": { 146 | "id": "Ck9nPw5eKjDE" 147 | }, 148 | "source": [ 149 | "%%capture\n", 150 | "# Suppressing cell output\n", 151 | "!pip install datasets\n", 152 | "!pip install protobuf\n", 153 | "!pip install simpletransformers\n", 154 | "\n", 155 | "# Note: If you're facing issues on Colab\n", 156 | "# Restart and rerun from this cell" 157 | ], 158 | "execution_count": null, 159 | "outputs": [] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "metadata": { 164 | "id": "iy67hFK-Ksz9" 165 | }, 166 | "source": [ 167 | "from simpletransformers.classification import ClassificationModel, ClassificationArgs\n", 168 | "from sklearn.metrics import classification_report\n", 169 | "import simpletransformers\n", 170 | "import logging\n", 171 | "import pandas as pd" 172 | ], 173 | "execution_count": null, 174 | "outputs": [] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "metadata": { 179 | "id": "jV2twhfUK3da" 180 | }, 181 | "source": [ 182 | "# Set data name and path\n", 183 | "data_name = 'data'\n", 184 | "data_path = 'drive/My Drive/data'\n", 185 | "# For referencing data stored on Google drive like the above\n", 186 | "# Mount drive first\n", 187 | "\n", 188 | "# Expectation:\n", 189 | "# data_path directory should contain train, val, test jsons\n", 190 | "# data-points should be present as a list of dicts\n", 191 | "# with each dict having a 'source', and a 'target'\n", 192 | "\n", 193 | "with open(data_path + '/' + 'train.json', 'r+') as f:\n", 194 | " raw_train = json.load(f)\n", 195 | "\n", 196 | "with open(data_path + '/' + 'val.json', 'r+') as f:\n", 197 | " raw_val = json.load(f)\n", 198 | "\n", 199 | "with open(data_path + '/' + 'test.json', 'r+') as f:\n", 200 | " raw_test = json.load(f)" 201 | ], 202 | "execution_count": null, 203 | "outputs": [] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "metadata": { 208 | "colab": { 209 | "base_uri": "https://localhost:8080/" 210 | }, 211 | "id": "Bqkc33YyLtvW", 212 | "outputId": "1cb70f15-73e8-42f2-a6e1-21383f42408d" 213 | }, 214 | "source": [ 215 | "# Verifying loaded data\n", 216 | "assert type(raw_train) == type(raw_val)\n", 217 | "assert type(raw_train) == type(raw_test)\n", 218 | "print('Raw data object type: ' + str(type(raw_train)))\n", 219 | "print()\n", 220 | "\n", 221 | "print('Fields in the raw data: ')\n", 222 | "unit = 
raw_train[0]\n", 223 | "\n", 224 | "for key in unit:\n", 225 | " print('• ' + str(key))" 226 | ], 227 | "execution_count": null, 228 | "outputs": [ 229 | { 230 | "output_type": "stream", 231 | "text": [ 232 | "Raw data object type: \n", 233 | "\n", 234 | "Fields in the raw data: \n", 235 | "• source\n", 236 | "• target\n" 237 | ], 238 | "name": "stdout" 239 | } 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "metadata": { 245 | "id": "TYWID8eWNUuW" 246 | }, 247 | "source": [ 248 | "# To test out the procedure with small amounts of data\n", 249 | "global_testing_mode = 0\n", 250 | "global_testing_unit_count = 512" 251 | ], 252 | "execution_count": null, 253 | "outputs": [] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "metadata": { 258 | "colab": { 259 | "base_uri": "https://localhost:8080/" 260 | }, 261 | "id": "9wTFOtjYNdqW", 262 | "outputId": "9ce2a872-e4de-4289-ff84-18dcedda52d6" 263 | }, 264 | "source": [ 265 | "print('Number of Samples in: ')\n", 266 | "print('• train: ' + str(len(raw_train)))\n", 267 | "print('• val: ' + str(len(raw_val)))\n", 268 | "print('• test: ' + str(len(raw_test)))\n", 269 | "\n", 270 | "# Defining mappings for training\n", 271 | "def create_set(set_name = 'train'):\n", 272 | " global raw_train, raw_val, raw_test\n", 273 | " global global_testing_mode, global_testing_unit_count\n", 274 | " work_on = None\n", 275 | "\n", 276 | " if set_name == 'train':\n", 277 | " work_on = raw_train\n", 278 | " elif set_name == 'val':\n", 279 | " work_on = raw_val\n", 280 | " elif set_name == 'test':\n", 281 | " work_on = raw_test\n", 282 | " else:\n", 283 | " print('Invalid Data Split.')\n", 284 | " return -1\n", 285 | " \n", 286 | " data_size = len(work_on)\n", 287 | " if global_testing_mode:\n", 288 | " data_size = global_testing_unit_count\n", 289 | "\n", 290 | " data = []\n", 291 | " for index in range(data_size):\n", 292 | " unit = [work_on[index]['source'], work_on[index]['target']]\n", 293 | " data.append(unit)\n", 294 | "\n", 295 | " return data\n", 296 | "\n", 297 | "train = create_set('train')\n", 298 | "val = create_set('val')\n", 299 | "test = create_set('test')" 300 | ], 301 | "execution_count": null, 302 | "outputs": [ 303 | { 304 | "output_type": "stream", 305 | "text": [ 306 | "Number of Samples in: \n", 307 | "• train: 17348\n", 308 | "• val: 2478\n", 309 | "• test: 4957\n" 310 | ], 311 | "name": "stdout" 312 | } 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "metadata": { 318 | "colab": { 319 | "base_uri": "https://localhost:8080/" 320 | }, 321 | "id": "tz3Gzl6lNvsE", 322 | "outputId": "00bdf3d0-077f-4f07-dd56-d8d253d4dd59" 323 | }, 324 | "source": [ 325 | "# Getting number of positive and negative samples in train split\n", 326 | "total_in_train = len(train)\n", 327 | "positive_in_train = 0\n", 328 | "correct_imbalance = True\n", 329 | "\n", 330 | "for unit in train:\n", 331 | " positive_in_train += unit[1]\n", 332 | "\n", 333 | "print('Number of positive samples: ' + str(positive_in_train))\n", 334 | "print('Number of negative samples: ' + str(total_in_train - positive_in_train))\n", 335 | "\n", 336 | "# Weights to correct the class imbalance\n", 337 | "greater_class_count = max((total_in_train - positive_in_train), positive_in_train)\n", 338 | "class_weights = [greater_class_count / (total_in_train - positive_in_train),\n", 339 | " greater_class_count / positive_in_train]\n", 340 | "\n", 341 | "if correct_imbalance == False:\n", 342 | " # Disabling weighing of classes\n", 343 | " class_weights = [1, 1]" 344 | ], 345 | 
"execution_count": null, 346 | "outputs": [ 347 | { 348 | "output_type": "stream", 349 | "text": [ 350 | "Number of positive samples: 14450\n", 351 | "Number of negative samples: 2898\n" 352 | ], 353 | "name": "stdout" 354 | } 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "metadata": { 360 | "colab": { 361 | "base_uri": "https://localhost:8080/", 362 | "height": 424 363 | }, 364 | "id": "4k-jbdscSEPQ", 365 | "outputId": "a54009f2-59f5-44d7-a3e3-a3fe68c05e89" 366 | }, 367 | "source": [ 368 | "# Defining dataframes\n", 369 | "train_df = pd.DataFrame(train)\n", 370 | "train_df.columns = ['source', 'label']\n", 371 | "\n", 372 | "val_df = pd.DataFrame(val)\n", 373 | "val_df.columns = ['source', 'label']\n", 374 | "\n", 375 | "# Verifying correctness\n", 376 | "train_df" 377 | ], 378 | "execution_count": null, 379 | "outputs": [ 380 | { 381 | "output_type": "execute_result", 382 | "data": { 383 | "text/html": [ 384 | "
\n", 385 | "\n", 398 | "\n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | "
sourcelabel
0Early bird get the worm.0
1if youre not cheering for mexico youre a littl...1
2RT : These hoes ain't loyal1
3\" lames crying over hoes thats tears of a clown \"1
4: I feel sorry for da females that dont get wh...1
.........
17343When a hoe tryna get at you while ya girl is l...1
17344Fuk wit a real nicca1
17345naw nigga what's up with em hands, what happen...1
17346Yous a bitch ass nigga &;: Rt i Push the tape ...1
17347: Where da hoes at... C'Mon thru at Hatchy's i...1
\n", 464 | "

17348 rows × 2 columns

\n", 465 | "
" 466 | ], 467 | "text/plain": [ 468 | " source label\n", 469 | "0 Early bird get the worm. 0\n", 470 | "1 if youre not cheering for mexico youre a littl... 1\n", 471 | "2 RT : These hoes ain't loyal 1\n", 472 | "3 \" lames crying over hoes thats tears of a clown \" 1\n", 473 | "4 : I feel sorry for da females that dont get wh... 1\n", 474 | "... ... ...\n", 475 | "17343 When a hoe tryna get at you while ya girl is l... 1\n", 476 | "17344 Fuk wit a real nicca 1\n", 477 | "17345 naw nigga what's up with em hands, what happen... 1\n", 478 | "17346 Yous a bitch ass nigga &;: Rt i Push the tape ... 1\n", 479 | "17347 : Where da hoes at... C'Mon thru at Hatchy's i... 1\n", 480 | "\n", 481 | "[17348 rows x 2 columns]" 482 | ] 483 | }, 484 | "metadata": { 485 | "tags": [] 486 | }, 487 | "execution_count": 13 488 | } 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "metadata": { 494 | "colab": { 495 | "base_uri": "https://localhost:8080/" 496 | }, 497 | "id": "pgvVg6FGSXFB", 498 | "outputId": "48ecec3d-cdc1-4cec-9d4e-aed4b4976ceb" 499 | }, 500 | "source": [ 501 | "%%capture\n", 502 | "# Leveraging a pre-trained Transformer Model\n", 503 | "\n", 504 | "model_index = 1\n", 505 | "# Set 0 for bert-base, 1 for roberta-base\n", 506 | "\n", 507 | "model_loc = ['bert-base-uncased', 'roberta-base'][model_index]\n", 508 | "model_type = ['bert', 'roberta'][model_index]\n", 509 | "\n", 510 | "is_lower = False\n", 511 | "if model_index == 0:\n", 512 | " is_lower = True\n", 513 | "\n", 514 | "length_setting = 256\n", 515 | "model_name = model_loc + '_' + data_name + '_' + str(length_setting)\n", 516 | "cache_name = model_name + '_cache_dir'\n", 517 | "\n", 518 | "batch_size = 32\n", 519 | "num_epochs = 4\n", 520 | "num_gpus = 4\n", 521 | "\n", 522 | "if global_testing_mode == 1:\n", 523 | " model_name += '_testing'\n", 524 | " num_epochs = 2\n", 525 | " length_setting = 64\n", 526 | "\n", 527 | "model_args = ClassificationArgs(train_batch_size = batch_size,\n", 528 | " max_seq_length = length_setting,\n", 529 | " save_steps = -1,\n", 530 | " n_gpu = num_gpus,\n", 531 | " num_train_epochs = num_epochs,\n", 532 | " evaluate_during_training = True,\n", 533 | " overwrite_output_dir = True,\n", 534 | " save_eval_checkpoints = False,\n", 535 | " save_model_every_epoch = False,\n", 536 | " cache_dir = cache_name,\n", 537 | " fp16 = True,\n", 538 | " manual_seed = 42,\n", 539 | " do_lower_case = is_lower,\n", 540 | " best_model_dir = model_name)\n", 541 | "\n", 542 | "model = ClassificationModel(model_type,\n", 543 | " model_loc,\n", 544 | " use_cuda = cuda_available,\n", 545 | " args = model_args,\n", 546 | " num_labels = 2,\n", 547 | " weight = class_weights)" 548 | ], 549 | "execution_count": null, 550 | "outputs": [ 551 | { 552 | "output_type": "stream", 553 | "text": [ 554 | "Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']\n", 555 | "- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", 556 | "- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", 557 | "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']\n", 558 | "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" 559 | ], 560 | "name": "stderr" 561 | } 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "metadata": { 567 | "id": "foj_fq0yevcV" 568 | }, 569 | "source": [ 570 | "# Training\n", 571 | "\n", 572 | "start = time.time()\n", 573 | "model.train_model(train_df, eval_df = val_df)\n", 574 | "end = time.time()\n", 575 | "time_to_train = int(round(end - start))\n", 576 | "\n", 577 | "hours = int(time_to_train / 3600)\n", 578 | "minutes = int(int(time_to_train % 3600) / 60)\n", 579 | "seconds = int(time_to_train % 60)\n", 580 | "print()\n", 581 | "print('Number of Epochs: ' + str(num_epochs))\n", 582 | "print('Maximum Sequence Length: ' + str(length_setting))\n", 583 | "print('Batch size: ' + str(batch_size))\n", 584 | "print('Time taken for training: ' + str(hours).zfill(2) + ':' + str(minutes).zfill(2) + ':' + str(seconds).zfill(2))" 585 | ], 586 | "execution_count": null, 587 | "outputs": [] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "metadata": { 592 | "id": "iUR1XguvfbIg" 593 | }, 594 | "source": [ 595 | "# Inference\n", 596 | "infer_now = True\n", 597 | "\n", 598 | "if infer_now == True:\n", 599 | " model = ClassificationModel(model_type, model_name)\n", 600 | " print('Using Model: ' + str(model_name))\n", 601 | " print()\n", 602 | " \n", 603 | " val_sources = [unit[0] for unit in val]\n", 604 | " test_sources = [unit[0] for unit in test]\n", 605 | "\n", 606 | " val_targets = [unit[1] for unit in val]\n", 607 | " test_targets = [unit[1] for unit in test]\n", 608 | "\n", 609 | " # Evaluation on val data\n", 610 | " print('Results on the validation split: ')\n", 611 | " val_predictions, val_outputs = model.predict(val_sources)\n", 612 | " print(classification_report(val_targets, val_predictions, digits = 6))\n", 613 | " print()\n", 614 | "\n", 615 | " # Evaluation on test data\n", 616 | " print('Results on the test split: ')\n", 617 | " test_predictions, test_outputs = model.predict(test_sources)\n", 618 | " print(classification_report(test_targets, test_predictions, digits = 6))" 619 | ], 620 | "execution_count": null, 621 | "outputs": [] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "metadata": { 626 | "id": "oBm-JNGhZVl6" 627 | }, 628 | "source": [ 629 | "compress_model = False\n", 630 | "if compress_model == True:\n", 631 | " shutil.make_archive(model_name, 'zip', model_name)\n", 632 | " shutil.make_archive(cache_name, 'zip', cache_name)" 633 | ], 634 | "execution_count": null, 635 | "outputs": [] 636 | }, 637 | { 638 | "cell_type": "code", 639 | "metadata": { 640 | "id": "Ryb6DigKZ9gU" 641 | }, 642 | "source": [ 643 | "# ^_^ Thank You" 644 | ], 645 | "execution_count": null, 646 | "outputs": [] 647 | } 648 | ] 649 | } -------------------------------------------------------------------------------- /sentiment_analysis.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.8.3" 21 | }, 22 | "colab": { 23 | "name": "sentiment_analysis.ipynb", 24 | "provenance": [], 25 | "collapsed_sections": [], 26 | "include_colab_link": true 27 | } 28 | }, 29 | "cells": [ 30 | { 31 | "cell_type": "markdown", 32 | "metadata": { 33 | "id": "view-in-github", 34 | "colab_type": "text" 35 | }, 36 | "source": [ 37 | "\"Open" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "metadata": { 43 | "id": "KhGNEf8GxBVD" 44 | }, 45 | "source": [ 46 | "import json\n", 47 | "import csv\n", 48 | "import numpy as np\n", 49 | "import re\n", 50 | "import nltk\n", 51 | "import pandas as pd\n", 52 | "import matplotlib.pyplot as plt\n", 53 | "import tqdm as tqdm\n", 54 | "import random\n", 55 | "import joblib\n", 56 | "import time\n", 57 | "from sklearn.neural_network import MLPClassifier\n", 58 | "from sklearn.datasets import make_classification\n", 59 | "from sklearn.feature_extraction.text import TfidfTransformer\n", 60 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 61 | "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n", 62 | "from sklearn.model_selection import train_test_split\n", 63 | "from sklearn.metrics import classification_report\n", 64 | "from sklearn.pipeline import Pipeline\n", 65 | "from sklearn.model_selection import KFold" 66 | ], 67 | "execution_count": null, 68 | "outputs": [] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "metadata": { 73 | "id": "UEtFLTZ7xBVJ" 74 | }, 75 | "source": [ 76 | "%%capture .logs\n", 77 | "# Getting Text Processing Tools\n", 78 | "\n", 79 | "nltk.download('all')" 80 | ], 81 | "execution_count": null, 82 | "outputs": [] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "metadata": { 87 | "id": "lcGR8siZxBVL" 88 | }, 89 | "source": [ 90 | "# Importing Tools\n", 91 | "from nltk.tokenize import sent_tokenize, word_tokenize\n", 92 | "from nltk.tokenize import RegexpTokenizer\n", 93 | "from string import punctuation\n", 94 | "from nltk.corpus import stopwords\n", 95 | "\n", 96 | "stopword_set = set(stopwords.words('english'))" 97 | ], 98 | "execution_count": null, 99 | "outputs": [] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "metadata": { 104 | "id": "xKnOwp8fxBVN" 105 | }, 106 | "source": [ 107 | "with open('train.json', 'r+') as f:\n", 108 | " records = json.load(f)\n", 109 | "\n", 110 | "with open('test.json', 'r') as f:\n", 111 | " gold_test_list = json.load(f)" 112 | ], 113 | "execution_count": null, 114 | "outputs": [] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "metadata": { 119 | "id": "WRHw7y0T-PX5" 120 | }, 121 | "source": [ 122 | "X_train = []\n", 123 | "Y_train = []\n", 124 | "\n", 125 | "X_test = []\n", 126 | "Y_test = []\n", 127 | "\n", 128 | "for item in records:\n", 129 | " X_train.append(item['content'])\n", 130 | " Y_train.append(item['label'])\n", 131 | "\n", 132 | "for item in gold_test_list:\n", 133 | " X_test.append(item['content'])\n", 134 | " Y_test.append(item['label'])" 135 | ], 136 | "execution_count": null, 137 | "outputs": [] 138 | }, 139 | { 140 | 
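For reference, the record layout these loading cells assume -- a hypothetical illustration, since the actual train.json / test.json contents are not shown in this dump: each file holds a list of dicts with a 'content' string and an integer 'label'.

# Hypothetical sample records, for illustration only:
example_records = [
    {"content": "A heartfelt film with a terrific cast.", "label": 1},
    {"content": "Dull plot, wooden acting, skip it.", "label": 0},
]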
"cell_type": "code", 141 | "metadata": { 142 | "id": "ZlmjqtLixBVO" 143 | }, 144 | "source": [ 145 | "def clean(s):\n", 146 | " # takes an input string\n", 147 | " # preprocesses it for the tf-idf vectorizer\n", 148 | " s.replace(\"\\n\", \" \")\n", 149 | " tokens = word_tokenize(s)\n", 150 | " output = \"\"\n", 151 | " \n", 152 | " for token in tokens:\n", 153 | " unit = token.strip().lower()\n", 154 | " if unit in stopword_set or unit in punctuation:\n", 155 | " continue\n", 156 | " output = output + \" \" + unit\n", 157 | " \n", 158 | " return output.strip()" 159 | ], 160 | "execution_count": null, 161 | "outputs": [] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "metadata": { 166 | "id": "RF5MOGLYxBVR" 167 | }, 168 | "source": [ 169 | "vectorizer = TfidfVectorizer(\n", 170 | " sublinear_tf = True,\n", 171 | " norm = \"l2\",\n", 172 | " encoding = 'utf-8',\n", 173 | " max_features = 512,\n", 174 | " stop_words = 'english',\n", 175 | " ngram_range = (1, 3),\n", 176 | " strip_accents = 'unicode',\n", 177 | " smooth_idf = True)" 178 | ], 179 | "execution_count": null, 180 | "outputs": [] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "metadata": { 185 | "colab": { 186 | "base_uri": "https://localhost:8080/" 187 | }, 188 | "id": "wJ6h7HVrxBVS", 189 | "outputId": "60cf17da-35be-4d0b-bd02-3543db265b1a" 190 | }, 191 | "source": [ 192 | "# To verify correctness of Vectorizer\n", 193 | "X_train_vec = vectorizer.fit_transform(X_train)\n", 194 | "print(np.shape(X_train_vec))" 195 | ], 196 | "execution_count": null, 197 | "outputs": [ 198 | { 199 | "output_type": "stream", 200 | "text": [ 201 | "(25000, 512)\n" 202 | ], 203 | "name": "stdout" 204 | } 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "metadata": { 210 | "scrolled": true, 211 | "colab": { 212 | "base_uri": "https://localhost:8080/" 213 | }, 214 | "id": "jV5U_ZclxBVT", 215 | "outputId": "3328ca6a-6fa0-412c-e26d-297a585b392d" 216 | }, 217 | "source": [ 218 | "print(\"Size of Train: \" + str(len(X_train)))\n", 219 | "print(\"Size of Test: \" + str(len(X_test)))\n", 220 | "max_feature_size = 10000" 221 | ], 222 | "execution_count": null, 223 | "outputs": [ 224 | { 225 | "output_type": "stream", 226 | "text": [ 227 | "Size of Train: 25000\n", 228 | "Size of Test: 25000\n" 229 | ], 230 | "name": "stdout" 231 | } 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "metadata": { 237 | "id": "5CXY6994xBVT" 238 | }, 239 | "source": [ 240 | "def train(X, y, active = 'identity', solve = 'sgd', approach = 'mlp'):\n", 241 | " start = time.time()\n", 242 | " vec = vectorizer.fit(X)\n", 243 | " X_train_vec = vec.transform(X)\n", 244 | " \n", 245 | " if approach == 'lda':\n", 246 | " model = LinearDiscriminantAnalysis()\n", 247 | " model.fit(X_train_vec.toarray(), y)\n", 248 | " \n", 249 | " elif approach == 'mlp':\n", 250 | " model = MLPClassifier(alpha = 0,\n", 251 | " hidden_layer_sizes = (512, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 1),\n", 252 | " random_state = 2020,\n", 253 | " activation = active,\n", 254 | " max_iter = int(1e3),\n", 255 | " solver = solve,\n", 256 | " learning_rate = 'adaptive',\n", 257 | " early_stopping = True,\n", 258 | " momentum = 0.9,\n", 259 | " batch_size = 512)\n", 260 | " \n", 261 | " model.fit(X_train_vec, y)\n", 262 | " \n", 263 | " end = time.time()\n", 264 | " time_to_train = int(round(end - start))\n", 265 | "\n", 266 | " hours = int(time_to_train / 3600)\n", 267 | " minutes = int(int(time_to_train % 3600) / 60)\n", 268 | " seconds = int(time_to_train % 60)\n", 269 | "\n", 270 | " 
print()\n", 271 | " print('Time taken for training: ' + str(hours).zfill(2) + ':' +\n", 272 | " str(minutes).zfill(2) + ':' + str(seconds).zfill(2))\n", 273 | " return vec, model" 274 | ], 275 | "execution_count": null, 276 | "outputs": [] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "metadata": { 281 | "id": "GlUIOFTbxBVW" 282 | }, 283 | "source": [ 284 | "def get_res(vec, clf):\n", 285 | " X_test_vec = vec.transform(X_test)\n", 286 | " pred_Y_test = clf.predict(X_test_vec)\n", 287 | " print(\"Number of Features: \" + str(np.shape(X_test_vec)[1]))\n", 288 | " print(classification_report(Y_test, pred_Y_test, digits = 6))\n", 289 | " return" 290 | ], 291 | "execution_count": null, 292 | "outputs": [] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "metadata": { 297 | "colab": { 298 | "base_uri": "https://localhost:8080/" 299 | }, 300 | "id": "U6FJ0MyqxBVX", 301 | "outputId": "ae913549-29c3-4478-bbce-bec3a403a221" 302 | }, 303 | "source": [ 304 | "# To Try out all possibilities\n", 305 | "try_all = False\n", 306 | "\n", 307 | "if try_all == True:\n", 308 | " activations = ['identity', 'tanh', 'relu']\n", 309 | " solvers = ['adam', 'sgd', 'lbfgs']\n", 310 | "else:\n", 311 | " activations = ['tanh']\n", 312 | " solvers = ['sgd']\n", 313 | "\n", 314 | "for active in activations:\n", 315 | " for solver in solvers:\n", 316 | " if active == 'tanh' and solver == 'lbfgs':\n", 317 | " continue\n", 318 | " vec, model = train(X_train, Y_train, active, solver)\n", 319 | " print(\"Hidden Layer Activation = \" + str(active) + \", Solver = \" + str(solver))\n", 320 | " get_res(vec, model)" 321 | ], 322 | "execution_count": null, 323 | "outputs": [ 324 | { 325 | "output_type": "stream", 326 | "text": [ 327 | "\n", 328 | "Time taken for training: 00:29:42\n", 329 | "Hidden Layer Activation = tanh, Solver = sgd\n", 330 | "Number of Features: 512\n", 331 | " precision recall f1-score support\n", 332 | "\n", 333 | " 0 0.845930 0.814800 0.830073 12500\n", 334 | " 1 0.821373 0.851600 0.836214 12500\n", 335 | "\n", 336 | " accuracy 0.833200 25000\n", 337 | " macro avg 0.833652 0.833200 0.833144 25000\n", 338 | "weighted avg 0.833652 0.833200 0.833144 25000\n", 339 | "\n" 340 | ], 341 | "name": "stdout" 342 | } 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "metadata": { 348 | "colab": { 349 | "base_uri": "https://localhost:8080/" 350 | }, 351 | "id": "i3Cr4A2GxBVX", 352 | "outputId": "4bd572a8-0b1e-4104-9be8-443398c7a28d" 353 | }, 354 | "source": [ 355 | "# Testing out a basic pipeline\n", 356 | "pipe = Pipeline([('Feature Builder', vec), ('Classifier', model)])\n", 357 | "pred_Y_test = pipe.predict(X_test)\n", 358 | "print(classification_report(Y_test, pred_Y_test, digits = 6))" 359 | ], 360 | "execution_count": null, 361 | "outputs": [ 362 | { 363 | "output_type": "stream", 364 | "text": [ 365 | " precision recall f1-score support\n", 366 | "\n", 367 | " 0 0.845930 0.814800 0.830073 12500\n", 368 | " 1 0.821373 0.851600 0.836214 12500\n", 369 | "\n", 370 | " accuracy 0.833200 25000\n", 371 | " macro avg 0.833652 0.833200 0.833144 25000\n", 372 | "weighted avg 0.833652 0.833200 0.833144 25000\n", 373 | "\n" 374 | ], 375 | "name": "stdout" 376 | } 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "metadata": { 382 | "id": "kFgTyEV3xBVY" 383 | }, 384 | "source": [ 385 | "# K-fold Cross Validation\n", 386 | "\n", 387 | "X = X_train\n", 388 | "Y = Y_train\n", 389 | "\n", 390 | "def cross_val(algo = 'mlp', splits = 5):\n", 391 | " global X, Y\n", 392 | " splits = int(splits)\n", 393 | " if 
splits > 9 or splits < 3:\n", 394 | " splits = 5\n", 395 | " print(\"Classification Technique: \" + str(algo))\n", 396 | " kf = KFold(n_splits = splits, shuffle = True, random_state = 2020)\n", 397 | " index = 1 \n", 398 | "\n", 399 | " for train_index, test_index in kf.split(X):\n", 400 | " X_train = []\n", 401 | " X_test = []\n", 402 | " Y_train = []\n", 403 | " Y_test = []\n", 404 | "\n", 405 | " for idx in train_index:\n", 406 | " X_train.append(X[idx])\n", 407 | " Y_train.append(Y[idx])\n", 408 | "\n", 409 | " for idx in test_index:\n", 410 | " X_test.append(X[idx])\n", 411 | " Y_test.append(Y[idx])\n", 412 | "\n", 413 | " if algo == 'lda':\n", 414 | " vec, model = train(X_train, Y_train, '', '', 'lda')\n", 415 | " else:\n", 416 | " vec, model = train(X_train, Y_train, 'tanh', 'sgd', 'mlp')\n", 417 | "\n", 418 | " pipe = Pipeline([('Feature Builder', vec), ('Classifier', model)])\n", 419 | " pred_Y_test = pipe.predict(X_test)\n", 420 | "\n", 421 | " print(\"Fold Index: \" + str(index))\n", 422 | " index += 1\n", 423 | " print(classification_report(Y_test, pred_Y_test, digits = 6))\n", 424 | " \n", 425 | " return" 426 | ], 427 | "execution_count": null, 428 | "outputs": [] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "metadata": { 433 | "colab": { 434 | "base_uri": "https://localhost:8080/" 435 | }, 436 | "id": "ILBsC3-pxBVZ", 437 | "outputId": "319ce6e6-b5ee-4206-d3ad-ef1d1aaee6e3" 438 | }, 439 | "source": [ 440 | "# Performing K-Fold Cross Validation using LDA\n", 441 | "cross_val('lda', splits = 3)" 442 | ], 443 | "execution_count": null, 444 | "outputs": [ 445 | { 446 | "output_type": "stream", 447 | "text": [ 448 | "Classification Technique: lda\n", 449 | "\n", 450 | "Time taken for training: 00:00:39\n", 451 | "Fold Index: 1\n", 452 | " precision recall f1-score support\n", 453 | "\n", 454 | " 0 0.838678 0.819029 0.828737 4183\n", 455 | " 1 0.821840 0.841243 0.831429 4151\n", 456 | "\n", 457 | " accuracy 0.830094 8334\n", 458 | " macro avg 0.830259 0.830136 0.830083 8334\n", 459 | "weighted avg 0.830292 0.830094 0.830078 8334\n", 460 | "\n", 461 | "\n", 462 | "Time taken for training: 00:00:38\n", 463 | "Fold Index: 2\n", 464 | " precision recall f1-score support\n", 465 | "\n", 466 | " 0 0.840903 0.813250 0.826846 4166\n", 467 | " 1 0.819238 0.846172 0.832487 4167\n", 468 | "\n", 469 | " accuracy 0.829713 8333\n", 470 | " macro avg 0.830071 0.829711 0.829666 8333\n", 471 | "weighted avg 0.830069 0.829713 0.829667 8333\n", 472 | "\n", 473 | "\n", 474 | "Time taken for training: 00:00:39\n", 475 | "Fold Index: 3\n", 476 | " precision recall f1-score support\n", 477 | "\n", 478 | " 0 0.848914 0.809444 0.828709 4151\n", 479 | " 1 0.819200 0.857006 0.837677 4182\n", 480 | "\n", 481 | " accuracy 0.833313 8333\n", 482 | " macro avg 0.834057 0.833225 0.833193 8333\n", 483 | "weighted avg 0.834002 0.833313 0.833209 8333\n", 484 | "\n" 485 | ], 486 | "name": "stdout" 487 | } 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "metadata": { 493 | "colab": { 494 | "base_uri": "https://localhost:8080/" 495 | }, 496 | "id": "LovufthcxBVZ", 497 | "outputId": "b903bb41-41b4-4a1c-d520-485eab216faf" 498 | }, 499 | "source": [ 500 | "# Performing K-Fold Cross Validation using MLP\n", 501 | "cross_val('mlp', splits = 3)" 502 | ], 503 | "execution_count": null, 504 | "outputs": [ 505 | { 506 | "output_type": "stream", 507 | "text": [ 508 | "Classification Technique: mlp\n", 509 | "\n", 510 | "Time taken for training: 00:44:47\n", 511 | "Fold Index: 1\n", 512 | " precision recall f1-score 
support\n", 513 | "\n", 514 | " 0 0.797916 0.787234 0.792539 4183\n", 515 | " 1 0.788448 0.799085 0.793731 4151\n", 516 | "\n", 517 | " accuracy 0.793137 8334\n", 518 | " macro avg 0.793182 0.793159 0.793135 8334\n", 519 | "weighted avg 0.793200 0.793137 0.793133 8334\n", 520 | "\n", 521 | "\n", 522 | "Time taken for training: 00:12:53\n", 523 | "Fold Index: 2\n", 524 | " precision recall f1-score support\n", 525 | "\n", 526 | " 0 0.740989 0.345415 0.471185 4166\n", 527 | " 1 0.573306 0.879290 0.694071 4167\n", 528 | "\n", 529 | " accuracy 0.612384 8333\n", 530 | " macro avg 0.657147 0.612352 0.582628 8333\n", 531 | "weighted avg 0.657137 0.612384 0.582641 8333\n", 532 | "\n", 533 | "\n", 534 | "Time taken for training: 00:11:12\n", 535 | "Fold Index: 3\n", 536 | " precision recall f1-score support\n", 537 | "\n", 538 | " 0 0.623016 0.529511 0.572470 4151\n", 539 | " 1 0.593548 0.681970 0.634695 4182\n", 540 | "\n", 541 | " accuracy 0.606024 8333\n", 542 | " macro avg 0.608282 0.605741 0.603582 8333\n", 543 | "weighted avg 0.608227 0.606024 0.603698 8333\n", 544 | "\n" 545 | ], 546 | "name": "stdout" 547 | } 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "metadata": { 553 | "colab": { 554 | "base_uri": "https://localhost:8080/" 555 | }, 556 | "id": "HEQxvbr3xBVa", 557 | "outputId": "6f75d8c3-4c19-46ec-8e5a-87b2744a2fe5" 558 | }, 559 | "source": [ 560 | "# Training a LDA Classifier on the complete dataset\n", 561 | "# And saving the full pipeline into a Model\n", 562 | "\n", 563 | "vec, model = train(X, Y, '', '', 'lda')\n", 564 | "\n", 565 | "pipe = Pipeline([('Feature Builder', vec), ('Classifier', model)])\n", 566 | "joblib.dump(pipe, \"tf-idf_lda_model.pkl\")" 567 | ], 568 | "execution_count": null, 569 | "outputs": [ 570 | { 571 | "output_type": "stream", 572 | "text": [ 573 | "\n", 574 | "Time taken for training: 00:00:57\n" 575 | ], 576 | "name": "stdout" 577 | }, 578 | { 579 | "output_type": "execute_result", 580 | "data": { 581 | "text/plain": [ 582 | "['tf-idf_lda_model.pkl']" 583 | ] 584 | }, 585 | "metadata": { 586 | "tags": [] 587 | }, 588 | "execution_count": 18 589 | } 590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "metadata": { 595 | "colab": { 596 | "base_uri": "https://localhost:8080/" 597 | }, 598 | "id": "00z5qJsExBVa", 599 | "outputId": "8ddae691-41fa-4bee-bf38-deaf93d74f44" 600 | }, 601 | "source": [ 602 | "# Training a MLP Classifier on the complete dataset\n", 603 | "# And saving the full pipeline into a Model\n", 604 | "\n", 605 | "vec, model = train(X, Y, 'tanh', 'sgd', 'mlp')\n", 606 | "\n", 607 | "pipe = Pipeline([('Feature Builder', vec), ('Classifier', model)])\n", 608 | "joblib.dump(pipe, \"tf-idf_mlp_model.pkl\")" 609 | ], 610 | "execution_count": null, 611 | "outputs": [ 612 | { 613 | "output_type": "stream", 614 | "text": [ 615 | "\n", 616 | "Time taken for training: 00:27:22\n" 617 | ], 618 | "name": "stdout" 619 | }, 620 | { 621 | "output_type": "execute_result", 622 | "data": { 623 | "text/plain": [ 624 | "['tf-idf_mlp_model.pkl']" 625 | ] 626 | }, 627 | "metadata": { 628 | "tags": [] 629 | }, 630 | "execution_count": 19 631 | } 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "metadata": { 637 | "colab": { 638 | "base_uri": "https://localhost:8080/" 639 | }, 640 | "id": "e8o40jQYxBVa", 641 | "outputId": "c9a380c4-7bcc-4355-ae8b-ecf924ca2d97" 642 | }, 643 | "source": [ 644 | "# Testing out the saved pipeline on all train samples\n", 645 | "saved_pipe = joblib.load(\"tf-idf_lda_model.pkl\")\n", 646 | "\n", 647 | "pred_Y_all 
= saved_pipe.predict(X)\n", 648 | "print(classification_report(Y, pred_Y_all, digits = 6))" 649 | ], 650 | "execution_count": null, 651 | "outputs": [ 652 | { 653 | "output_type": "stream", 654 | "text": [ 655 | " precision recall f1-score support\n", 656 | "\n", 657 | " 0 0.854967 0.826240 0.840358 12500\n", 658 | " 1 0.831889 0.859840 0.845633 12500\n", 659 | "\n", 660 | " accuracy 0.843040 25000\n", 661 | " macro avg 0.843428 0.843040 0.842996 25000\n", 662 | "weighted avg 0.843428 0.843040 0.842996 25000\n", 663 | "\n" 664 | ], 665 | "name": "stdout" 666 | } 667 | ] 668 | }, 669 | { 670 | "cell_type": "code", 671 | "metadata": { 672 | "colab": { 673 | "base_uri": "https://localhost:8080/" 674 | }, 675 | "id": "4WzlKu7gxBVa", 676 | "outputId": "33bfdd30-cadb-4f1b-c7f0-4c1cce82b05c" 677 | }, 678 | "source": [ 679 | "# Testing out Saved LDA Model on Test Data\n", 680 | "\n", 681 | "saved_pipe = joblib.load(\"tf-idf_lda_model.pkl\")\n", 682 | "\n", 683 | "X_gold_test = []\n", 684 | "Y_gold_test = []\n", 685 | "\n", 686 | "for unit in gold_test_list:\n", 687 | " X_gold_test.append(unit['content'])\n", 688 | " Y_gold_test.append(unit['label'])\n", 689 | " \n", 690 | "pred_Y_gold_test = saved_pipe.predict(X_gold_test)\n", 691 | "print(classification_report(Y_gold_test, pred_Y_gold_test, digits = 6))" 692 | ], 693 | "execution_count": null, 694 | "outputs": [ 695 | { 696 | "output_type": "stream", 697 | "text": [ 698 | " precision recall f1-score support\n", 699 | "\n", 700 | " 0 0.843693 0.816560 0.829905 12500\n", 701 | " 1 0.822276 0.848720 0.835289 12500\n", 702 | "\n", 703 | " accuracy 0.832640 25000\n", 704 | " macro avg 0.832984 0.832640 0.832597 25000\n", 705 | "weighted avg 0.832984 0.832640 0.832597 25000\n", 706 | "\n" 707 | ], 708 | "name": "stdout" 709 | } 710 | ] 711 | }, 712 | { 713 | "cell_type": "code", 714 | "metadata": { 715 | "colab": { 716 | "base_uri": "https://localhost:8080/" 717 | }, 718 | "id": "YdLrWo0hxBVa", 719 | "outputId": "718d7644-d215-4efb-9f02-5c595a2eea79" 720 | }, 721 | "source": [ 722 | "# Testing out Saved MLP Model on Test Data\n", 723 | "\n", 724 | "saved_pipe = joblib.load(\"tf-idf_mlp_model.pkl\")\n", 725 | "\n", 726 | "X_gold_test = []\n", 727 | "Y_gold_test = []\n", 728 | "\n", 729 | "for unit in gold_test_list:\n", 730 | " X_gold_test.append(unit['content'])\n", 731 | " Y_gold_test.append(unit['label'])\n", 732 | " \n", 733 | "pred_Y_gold_test = saved_pipe.predict(X_gold_test)\n", 734 | "print(classification_report(Y_gold_test, pred_Y_gold_test, digits = 6))" 735 | ], 736 | "execution_count": null, 737 | "outputs": [ 738 | { 739 | "output_type": "stream", 740 | "text": [ 741 | " precision recall f1-score support\n", 742 | "\n", 743 | " 0 0.845930 0.814800 0.830073 12500\n", 744 | " 1 0.821373 0.851600 0.836214 12500\n", 745 | "\n", 746 | " accuracy 0.833200 25000\n", 747 | " macro avg 0.833652 0.833200 0.833144 25000\n", 748 | "weighted avg 0.833652 0.833200 0.833144 25000\n", 749 | "\n" 750 | ], 751 | "name": "stdout" 752 | } 753 | ] 754 | }, 755 | { 756 | "cell_type": "markdown", 757 | "metadata": { 758 | "id": "adDYwu0OM94m" 759 | }, 760 | "source": [ 761 | "### $Exercise$:\n", 762 | "\n", 763 | "#### $Replace\\ TF-IDF\\ with\\ GloVe\\ Vectors,\\ re-run\\ experiments.$" 764 | ] 765 | }, 766 | { 767 | "cell_type": "code", 768 | "metadata": { 769 | "id": "PEt9QxQDxBVf" 770 | }, 771 | "source": [ 772 | "# ^_^ Thank You" 773 | ], 774 | "execution_count": null, 775 | "outputs": [] 776 | } 777 | ] 778 | } 
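One way to start on the GloVe exercise at the end of the notebook above -- a sketch, not the notebook's own method: swap the TF-IDF features for averaged word vectors (spaCy's en_core_web_lg ships GloVe-style vectors) and reuse the same LDA / MLP pipelines. The embed helper below is illustrative and assumes the large English model is installed:

import numpy as np
import spacy

nlp = spacy.load("en_core_web_lg")

def embed(texts):
    # doc.vector is the average of the token vectors (300 dimensions here);
    # nlp.pipe streams the documents, which is faster than one-off nlp(...) calls
    return np.vstack([doc.vector for doc in nlp.pipe(texts)])

# Drop-in replacement for the TF-IDF features used above:
# X_train_vec = embed(X_train)   # instead of vectorizer.fit_transform(X_train)
# X_test_vec  = embed(X_test)
# then fit LinearDiscriminantAnalysis() or MLPClassifier on X_train_vec as before.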
--------------------------------------------------------------------------------