├── house.jpg ├── imgs ├── seq2seq.png └── image_captioning.png ├── README.md ├── labs └── lab1-template.ipynb ├── lesson10-bert_classification.ipynb ├── lesson4-rnn-name2lang.ipynb ├── lesson3-cnn-text.ipynb ├── lesson10-seq2seq.ipynb └── lesson1-cbow.ipynb /house.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanneta/deep-learning-with-pytorch/HEAD/house.jpg -------------------------------------------------------------------------------- /imgs/seq2seq.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanneta/deep-learning-with-pytorch/HEAD/imgs/seq2seq.png -------------------------------------------------------------------------------- /imgs/image_captioning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanneta/deep-learning-with-pytorch/HEAD/imgs/image_captioning.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # deep-learning-with-pytorch 2 | 3 | 4 | Follow [these instructions](https://aws.amazon.com/blogs/machine-learning/get-started-with-deep-learning-using-the-aws-deep-learning-ami/) 5 | to get an AWS instance with a GPU. Pick the Ubuntu version of the deep learning Amazon Machine Images (AMI). 6 | For the instance type use a p2.xlarge. 7 | 8 | Follow [these instructions](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstances.html) to learn how to connect to the Linux instances that you launched and transfer files between your local computer and your instance. 9 | 10 | Follow [these instructions](https://docs.aws.amazon.com/dlami/latest/devguide/setup-jupyter-configure-client.html) to configure the Client to Connect to the Jupyter Server. 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /labs/lab1-template.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 9, 6 | "metadata": { 7 | "ExecuteTime": { 8 | "end_time": "2019-05-23T04:51:18.088250Z", 9 | "start_time": "2019-05-23T04:51:17.426066Z" 10 | } 11 | }, 12 | "outputs": [], 13 | "source": [ 14 | "import torch \n", 15 | "import torch.autograd as autograd \n", 16 | "import torch.nn as nn \n", 17 | "import torch.nn.functional as F\n", 18 | "import torch.optim as optim\n", 19 | "from torch.utils.data import Dataset, DataLoader\n", 20 | "import numpy as np" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "## Lab1 \n", 28 | "Create a bag of word model for a text classification problem. Note that this is not the same as the continous bag of word problem that we solved here but you can reuse the tokenization part.\n", 29 | "\n", 30 | "https://github.com/yanneta/ML-notebooks/blob/master/cbow.ipynb" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "### Download data" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 10, 43 | "metadata": { 44 | "ExecuteTime": { 45 | "end_time": "2019-05-23T04:51:21.661953Z", 46 | "start_time": "2019-05-23T04:51:21.658143Z" 47 | } 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "def get_data():\n", 52 | " ! 
wget http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz\n", 53 | " ! mkdir data\n", 54 | " ! tar -xvf rotten_imdb.tar.gz -C data" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": { 61 | "ExecuteTime": { 62 | "end_time": "2019-05-23T04:51:22.437635Z", 63 | "start_time": "2019-05-23T04:51:22.287902Z" 64 | } 65 | }, 66 | "outputs": [ 67 | { 68 | "name": "stdout", 69 | "output_type": "stream", 70 | "text": [ 71 | "--2020-05-14 11:42:27-- http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz\n", 72 | "Resolving www.cs.cornell.edu (www.cs.cornell.edu)... 132.236.207.20\n", 73 | "Connecting to www.cs.cornell.edu (www.cs.cornell.edu)|132.236.207.20|:80... connected.\n", 74 | "HTTP request sent, awaiting response... 200 OK\n", 75 | "Length: 519599 (507K) [application/x-gzip]\n", 76 | "Saving to: ‘rotten_imdb.tar.gz.1’\n", 77 | "\n", 78 | "rotten_imdb.tar.gz. 100%[===================>] 507.42K 557KB/s in 0.9s \n", 79 | "\n", 80 | "2020-05-14 11:42:29 (557 KB/s) - ‘rotten_imdb.tar.gz.1’ saved [519599/519599]\n", 81 | "\n", 82 | "mkdir: data: File exists\n", 83 | "x quote.tok.gt9.5000\n", 84 | "x plot.tok.gt9.5000\n", 85 | "x subjdata.README.1.0\n", 86 | "plot.tok.gt9.5000 quote.tok.gt9.5000 subjdata.README.1.0\n" 87 | ] 88 | } 89 | ], 90 | "source": [ 91 | "get_data()\n", 92 | "! ls data" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "### Split data" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 11, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "def read_file(path):\n", 109 | " \"\"\" Read file returns a list of lines.\n", 110 | " \"\"\"\n", 111 | " with open(path, encoding = \"ISO-8859-1\") as f:\n", 112 | " content = f.readlines()\n", 113 | " return content" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 12, 119 | "metadata": { 120 | "ExecuteTime": { 121 | "end_time": "2019-05-23T04:51:31.979291Z", 122 | "start_time": "2019-05-23T04:51:30.952129Z" 123 | } 124 | }, 125 | "outputs": [], 126 | "source": [ 127 | "from sklearn.model_selection import train_test_split" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 13, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "sub_content = read_file(\"data/quote.tok.gt9.5000\")\n", 137 | "obj_content = read_file(\"data/plot.tok.gt9.5000\")\n", 138 | "sub_content = np.array([line.strip().lower() for line in sub_content])\n", 139 | "obj_content = np.array([line.strip().lower() for line in obj_content])\n", 140 | "sub_y = np.zeros(len(sub_content))\n", 141 | "obj_y = np.ones(len(obj_content))\n", 142 | "X = np.append(sub_content, obj_content)\n", 143 | "y = np.append(sub_y, obj_y)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 14, 149 | "metadata": { 150 | "ExecuteTime": { 151 | "end_time": "2019-05-23T04:51:31.992431Z", 152 | "start_time": "2019-05-23T04:51:31.982777Z" 153 | } 154 | }, 155 | "outputs": [], 156 | "source": [ 157 | "x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 15, 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "data": { 167 | "text/plain": [ 168 | "((8000,), (8000,))" 169 | ] 170 | }, 171 | "execution_count": 15, 172 | "metadata": {}, 173 | "output_type": "execute_result" 174 | } 175 | ], 176 | "source": [ 177 | "x_train.shape, 
y_train.shape" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 17, 183 | "metadata": {}, 184 | "outputs": [ 185 | { 186 | "data": { 187 | "text/plain": [ 188 | "\"both lead performances are oscar-size . quaid is utterly fearless as the tortured husband living a painful lie , and moore wonderfully underplays the long-suffering heroine with an unflappable '50s dignity somewhere between jane wyman and june cleaver .\"" 189 | ] 190 | }, 191 | "execution_count": 17, 192 | "metadata": {}, 193 | "output_type": "execute_result" 194 | } 195 | ], 196 | "source": [ 197 | "x_train[0]" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "### Compute a vocabulary\n", 205 | "* Split your sentences into tokens by splitting on spaces.\n", 206 | "* Compute the frequency of every word.\n", 207 | "* Pick the most frequent words (4000 or so) to be part of your vocabulary.\n", 208 | "* Create a map from each word to an index. Keep 0 for out-of-vocabulary words ()." 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "### Bag-of-words representation\n", 216 | "\n", 217 | "* Given a piece of text, compute the feature vector $x$:\n", 218 | "$x_i = 1$ if the word with index $i$ appears in the text; otherwise $x_i = 0$. Note that the length of $x$ is the size of the vocabulary. " 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "### Dataset and dataloaders\n", 226 | "Write a Dataset for this problem." 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 20, 232 | "metadata": { 233 | "ExecuteTime": { 234 | "end_time": "2019-05-23T04:52:54.922573Z", 235 | "start_time": "2019-05-23T04:52:54.916113Z" 236 | } 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "class BOW(Dataset):\n", 241 | " def __init__(self, ):\n", 242 | " self.x = None\n", 243 | " self.y = None\n", 244 | " \n", 245 | " def __len__(self):\n", 246 | " return None\n", 247 | " \n", 248 | " def __getitem__(self, idx):\n", 249 | " \n", 250 | " return None" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "### Model\n", 258 | "\n", 259 | "Define a simple linear model or a two-layer neural network." 
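The template leaves the vocabulary, the bag-of-words features, the Dataset and the model as exercises, while later cells refer to `vocab2index`, `words`, `train_dl`, `valid_dl` and `BOWModel`. A minimal sketch of one way to fill these in, reusing the notebook's earlier imports (`np`, `nn`, `Dataset`, `DataLoader`) and the `x_train`/`x_val` split above; the `<UNK>` placeholder, the 4000-word cutoff and the batch size are arbitrary choices here, and the recorded weight shape of 4009 further down shows the author's own vocabulary came out slightly larger:

```python
from collections import Counter

# Vocabulary: most frequent training words, with index 0 reserved for out-of-vocabulary words.
counts = Counter(w for line in x_train for w in line.split())
vocab2index = {"<UNK>": 0}
for w, _ in counts.most_common(4000):
    vocab2index[w] = len(vocab2index)
words = list(vocab2index.keys())          # index -> word, used by the word-importance cells

def bow_features(text):
    """Binary bag-of-words vector: x_i = 1 iff the word with index i occurs in the text."""
    x = np.zeros(len(vocab2index), dtype=np.float32)
    for w in text.split():
        x[vocab2index.get(w, 0)] = 1.0
    return x

class BOW(Dataset):
    def __init__(self, X, y):
        self.x = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return bow_features(self.x[idx]), self.y[idx]

class BOWModel(nn.Module):
    """A single linear layer on top of the bag-of-words features."""
    def __init__(self, vocab_size):
        super().__init__()
        self.linear = nn.Linear(vocab_size, 1)

    def forward(self, x):
        return self.linear(x).squeeze(1)   # one logit per example

train_ds = BOW(x_train, y_train)
valid_ds = BOW(x_val, y_val)
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=64)
```

With these definitions the `val_metrics` and `train_epocs` cells below run as written; the `y_hat > 0` threshold on the logits corresponds to a predicted probability above 0.5.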
260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "## Training and valid functions" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 33, 272 | "metadata": { 273 | "ExecuteTime": { 274 | "end_time": "2019-05-23T04:56:29.431871Z", 275 | "start_time": "2019-05-23T04:56:29.426088Z" 276 | } 277 | }, 278 | "outputs": [], 279 | "source": [ 280 | "def val_metrics(model):\n", 281 | " model.eval()\n", 282 | " correct = 0\n", 283 | " total = 0\n", 284 | " loss_sum = 0\n", 285 | " for x, y in valid_dl:\n", 286 | " y_hat = model(x.float())\n", 287 | " loss = F.binary_cross_entropy_with_logits(y_hat, y.float())\n", 288 | " y_pred = y_hat > 0\n", 289 | " correct += (y_pred.float() == y.float()).float().sum()\n", 290 | " total += x.size(0)\n", 291 | " loss_sum += loss.item()*x.size(0)\n", 292 | " accuracy = correct.item()/total\n", 293 | " return loss_sum/total, accuracy" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "### Training loop" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 35, 306 | "metadata": { 307 | "ExecuteTime": { 308 | "end_time": "2019-05-22T09:04:48.131508Z", 309 | "start_time": "2019-05-22T09:04:48.124575Z" 310 | } 311 | }, 312 | "outputs": [], 313 | "source": [ 314 | "def train_epocs(model, epochs=10, lr=0.001):\n", 315 | " optimizer = torch.optim.Adam(model.parameters(), lr=lr)\n", 316 | " for i in range(epochs):\n", 317 | " model.train()\n", 318 | " total = 0\n", 319 | " loss_sum = 0\n", 320 | " for x, y in train_dl:\n", 321 | " y_hat = model(x.float())\n", 322 | " loss = F.binary_cross_entropy_with_logits(y_hat, y.float())\n", 323 | " optimizer.zero_grad()\n", 324 | " loss.backward()\n", 325 | " optimizer.step()\n", 326 | " total += x.size(0)\n", 327 | " loss_sum += loss.item()*x.size(0)\n", 328 | " val_loss, val_acc = val_metrics(model)\n", 329 | " print(\"train loss %.3f val loss %.3f and accuracy %.3f\" % (loss_sum/total, val_loss, val_acc))" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 36, 335 | "metadata": { 336 | "ExecuteTime": { 337 | "end_time": "2019-05-22T09:05:03.287290Z", 338 | "start_time": "2019-05-22T09:04:54.494282Z" 339 | } 340 | }, 341 | "outputs": [ 342 | { 343 | "name": "stdout", 344 | "output_type": "stream", 345 | "text": [ 346 | "train loss 0.656 val loss 0.615 and accuracy 0.875\n", 347 | "train loss 0.576 val loss 0.551 and accuracy 0.889\n", 348 | "train loss 0.512 val loss 0.501 and accuracy 0.893\n", 349 | "train loss 0.461 val loss 0.461 and accuracy 0.897\n", 350 | "train loss 0.422 val loss 0.429 and accuracy 0.898\n", 351 | "train loss 0.390 val loss 0.404 and accuracy 0.899\n", 352 | "train loss 0.364 val loss 0.383 and accuracy 0.898\n", 353 | "train loss 0.342 val loss 0.366 and accuracy 0.901\n", 354 | "train loss 0.324 val loss 0.352 and accuracy 0.902\n", 355 | "train loss 0.308 val loss 0.340 and accuracy 0.903\n", 356 | "train loss 0.294 val loss 0.329 and accuracy 0.904\n", 357 | "train loss 0.282 val loss 0.320 and accuracy 0.906\n", 358 | "train loss 0.271 val loss 0.312 and accuracy 0.907\n", 359 | "train loss 0.261 val loss 0.305 and accuracy 0.908\n", 360 | "train loss 0.253 val loss 0.299 and accuracy 0.910\n" 361 | ] 362 | } 363 | ], 364 | "source": [ 365 | "model = BOWModel(vocab_size=len(vocab2index.keys()))\n", 366 | "train_epocs(model, 15, 0.005)" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": {}, 372 | 
"source": [ 373 | "### Word importance\n", 374 | "To get the words that affect the most the positive label we find the words with higher weights. Similarly to get the words that affect the most the 0 label we find the words with lower weights." 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 37, 380 | "metadata": {}, 381 | "outputs": [ 382 | { 383 | "data": { 384 | "text/plain": [ 385 | "[Parameter containing:\n", 386 | " tensor([[-0.1818, -0.1209, -0.0691, ..., -0.1578, 0.2273, 0.2485]],\n", 387 | " requires_grad=True), Parameter containing:\n", 388 | " tensor([-0.0347], requires_grad=True)]" 389 | ] 390 | }, 391 | "execution_count": 37, 392 | "metadata": {}, 393 | "output_type": "execute_result" 394 | } 395 | ], 396 | "source": [ 397 | "parms = [p for p in model.parameters()]\n", 398 | "parms" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 38, 404 | "metadata": {}, 405 | "outputs": [ 406 | { 407 | "data": { 408 | "text/plain": [ 409 | "array([[-0.18177307, -0.12088173, -0.06908546, ..., -0.15775996,\n", 410 | " 0.22728996, 0.24849562]], dtype=float32)" 411 | ] 412 | }, 413 | "execution_count": 38, 414 | "metadata": {}, 415 | "output_type": "execute_result" 416 | } 417 | ], 418 | "source": [ 419 | "weights = parms[0].detach().numpy()\n", 420 | "weights" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 39, 426 | "metadata": {}, 427 | "outputs": [ 428 | { 429 | "data": { 430 | "text/plain": [ 431 | "(4009,)" 432 | ] 433 | }, 434 | "execution_count": 39, 435 | "metadata": {}, 436 | "output_type": "execute_result" 437 | } 438 | ], 439 | "source": [ 440 | "weights[0].shape" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 40, 446 | "metadata": {}, 447 | "outputs": [], 448 | "source": [ 449 | "sorted_indeces = np.argsort(weights[0])" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": 41, 455 | "metadata": {}, 456 | "outputs": [ 457 | { 458 | "data": { 459 | "text/plain": [ 460 | "(-0.4535758, 0.44974813)" 461 | ] 462 | }, 463 | "execution_count": 41, 464 | "metadata": {}, 465 | "output_type": "execute_result" 466 | } 467 | ], 468 | "source": [ 469 | "weights[0, sorted_indeces[0]], weights[0, sorted_indeces[-1]]," 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 42, 475 | "metadata": {}, 476 | "outputs": [ 477 | { 478 | "data": { 479 | "text/plain": [ 480 | "['material',\n", 481 | " 'performance',\n", 482 | " 'actors',\n", 483 | " 'movie',\n", 484 | " 'its',\n", 485 | " 'interesting',\n", 486 | " 'script',\n", 487 | " 'beautifully',\n", 488 | " 'movies',\n", 489 | " \"film's\"]" 490 | ] 491 | }, 492 | "execution_count": 42, 493 | "metadata": {}, 494 | "output_type": "execute_result" 495 | } 496 | ], 497 | "source": [ 498 | "[words[i] for i in sorted_indeces[:10]]" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": 43, 504 | "metadata": {}, 505 | "outputs": [ 506 | { 507 | "data": { 508 | "text/plain": [ 509 | "['obsessed',\n", 510 | " 'kill',\n", 511 | " 'secret',\n", 512 | " 'school',\n", 513 | " 'patricia',\n", 514 | " 'sam',\n", 515 | " 'however',\n", 516 | " 'she',\n", 517 | " 'they',\n", 518 | " '-',\n", 519 | " 'discover']" 520 | ] 521 | }, 522 | "execution_count": 43, 523 | "metadata": {}, 524 | "output_type": "execute_result" 525 | } 526 | ], 527 | "source": [ 528 | "[words[i] for i in sorted_indeces[3998:]]" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": null, 534 | 
"metadata": {}, 535 | "outputs": [], 536 | "source": [] 537 | } 538 | ], 539 | "metadata": { 540 | "kernelspec": { 541 | "display_name": "Python 3", 542 | "language": "python", 543 | "name": "python3" 544 | }, 545 | "language_info": { 546 | "codemirror_mode": { 547 | "name": "ipython", 548 | "version": 3 549 | }, 550 | "file_extension": ".py", 551 | "mimetype": "text/x-python", 552 | "name": "python", 553 | "nbconvert_exporter": "python", 554 | "pygments_lexer": "ipython3", 555 | "version": "3.7.4" 556 | }, 557 | "toc": { 558 | "nav_menu": {}, 559 | "number_sections": true, 560 | "sideBar": true, 561 | "skip_h1_title": false, 562 | "toc_cell": false, 563 | "toc_position": {}, 564 | "toc_section_display": "block", 565 | "toc_window_display": false 566 | } 567 | }, 568 | "nbformat": 4, 569 | "nbformat_minor": 2 570 | } 571 | -------------------------------------------------------------------------------- /lesson10-bert_classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%reload_ext autoreload\n", 10 | "%autoreload 2\n", 11 | "%matplotlib inline\n", 12 | "import matplotlib.pyplot as plt" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "pip install pytorch-transformers" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "import torch\n", 29 | "import torch.nn as nn\n", 30 | "import pickle\n", 31 | "from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,\n", 32 | " TensorDataset)\n", 33 | "from tqdm import tqdm_notebook, trange\n", 34 | "import os\n", 35 | "from pytorch_transformers import BertConfig, BertTokenizer, BertModel\n", 36 | "from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule\n", 37 | "\n", 38 | "from torch.utils.data import Dataset, DataLoader" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "import numpy as np\n", 48 | "import torch.optim as optim\n", 49 | "from torch.optim import lr_scheduler\n", 50 | "import time\n", 51 | "import copy\n", 52 | "import torch.nn.functional as F" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 4, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "class BertForSequenceClassification(nn.Module):\n", 62 | " \"\"\"BERT model for classification.\n", 63 | " This module is composed of the BERT model with a linear layer on top of\n", 64 | " the pooled output.\n", 65 | " \"\"\"\n", 66 | " def __init__(self, num_labels=1):\n", 67 | " super(BertForSequenceClassification, self).__init__()\n", 68 | " self.num_labels = num_labels\n", 69 | " self.bert = BertModel.from_pretrained('bert-base-uncased')\n", 70 | " self.dropout = nn.Dropout(config.hidden_dropout_prob)\n", 71 | " self.classifier = nn.Linear(config.hidden_size, num_labels)\n", 72 | " nn.init.xavier_normal_(self.classifier.weight)\n", 73 | " \n", 74 | " def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):\n", 75 | " outputs = self.bert(input_ids, token_type_ids, attention_mask)\n", 76 | " pooled_output = outputs[1]\n", 77 | " pooled_output = self.dropout(pooled_output)\n", 78 | " logits = self.classifier(pooled_output)\n", 79 | " return logits\n", 80 | " \n", 81 | " def 
freeze_bert_encoder(self):\n", 82 | " for param in self.bert.parameters():\n", 83 | " param.requires_grad = False\n", 84 | " \n", 85 | " def unfreeze_bert_encoder(self):\n", 86 | " for param in self.bert.parameters():\n", 87 | " param.requires_grad = True" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 5, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,\n", 97 | " num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 6, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "num_labels = 1\n", 107 | "model = BertForSequenceClassification(num_labels)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 7, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "from pathlib import Path\n", 117 | "PATH = Path(\"/data2/yinterian/aclImdb/\")" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 8, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 9, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "data": { 136 | "text/plain": [ 137 | "['bro', '##m', '##well', 'high', 'is', 'a', 'cartoon', 'comedy', '.', 'it']" 138 | ] 139 | }, 140 | "execution_count": 9, 141 | "metadata": {}, 142 | "output_type": "execute_result" 143 | } 144 | ], 145 | "source": [ 146 | "path = PATH/\"train/pos/0_9.txt\"\n", 147 | "z = tokenizer.tokenize(path.read_text())\n", 148 | "z[:10]" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 10, 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "[22953, 2213, 4381, 2152, 2003, 1037, 9476, 4038, 1012, 2009]" 160 | ] 161 | }, 162 | "execution_count": 10, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "ids = tokenizer.convert_tokens_to_ids(z)\n", 169 | "ids[:10]" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 11, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "tokens_tensor = torch.tensor([ids])" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 12, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "logits = model(tokens_tensor)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 13, 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "data": { 197 | "text/plain": [ 198 | "tensor([[-0.6475]], grad_fn=)" 199 | ] 200 | }, 201 | "execution_count": 13, 202 | "metadata": {}, 203 | "output_type": "execute_result" 204 | } 205 | ], 206 | "source": [ 207 | "logits " 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "Based on these tutorials\n", 215 | "* https://pytorch.org/hub/huggingface_pytorch-pretrained-bert_bert/\n", 216 | "* https://github.com/huggingface/pytorch-transformers/blob/master/README.md\n", 217 | "* https://medium.com/huggingface/multi-label-text-classification-using-bert-the-mighty-transformer-69714fa3fb3d\n", 218 | "* https://towardsdatascience.com/bert-classifier-just-another-pytorch-model-881b3cf05784" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 14, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": 
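At this point only the encoder carries pretrained weights; the classifier head was just initialized, so the logit shown above is essentially arbitrary. To read such a logit as a probability of the positive class you can pass it through a sigmoid (a quick check, not part of the original notebook):

```python
prob = torch.sigmoid(logits)   # sigmoid(-0.6475) is ~0.34
prob
```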
[ 227 | "def text2ids(text, max_seq_length=300):\n", 228 | " tok_text = tokenizer.tokenize(text)\n", 229 | " if len(tok_text) > max_seq_length:\n", 230 | " tok_text = tok_text[:max_seq_length]\n", 231 | " ids_text = tokenizer.convert_tokens_to_ids(tok_text)\n", 232 | " padding = [0] * (max_seq_length - len(ids_text))\n", 233 | " ids_text += padding\n", 234 | " return np.array(ids_text)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 15, 240 | "metadata": {}, 241 | "outputs": [ 242 | { 243 | "data": { 244 | "text/plain": [ 245 | "array([22953, 2213, 4381, 2152, 2003, 1037, 9476, 4038, 1012,\n", 246 | " 2009, 2743, 2012, 1996, 2168, 2051, 2004, 2070, 2060,\n", 247 | " 3454, 2055, 2082, 2166, 1010, 2107, 2004, 1000, 5089,\n", 248 | " 1000, 1012, 2026, 3486, 2086, 1999, 1996, 4252, 9518,\n", 249 | " 2599, 2033, 2000, 2903, 2008, 22953, 2213, 4381, 2152,\n", 250 | " 1005, 1055, 18312, 2003, 2172, 3553, 2000, 4507, 2084,\n", 251 | " 2003, 1000, 5089, 1000, 1012, 1996, 25740, 2000, 5788,\n", 252 | " 13732, 1010, 1996, 12369, 3993, 2493, 2040, 2064, 2156,\n", 253 | " 2157, 2083, 2037, 17203, 5089, 1005, 13433, 8737, 1010,\n", 254 | " 1996, 9004, 10196, 4757, 1997, 1996, 2878, 3663, 1010,\n", 255 | " 2035, 10825, 2033, 1997, 1996, 2816, 1045, 2354, 1998,\n", 256 | " 2037, 2493, 1012, 2043, 1045, 2387, 1996, 2792, 1999,\n", 257 | " 2029, 1037, 3076, 8385, 2699, 2000, 6402, 2091, 1996,\n", 258 | " 2082, 1010, 1045, 3202, 7383, 1012, 1012, 1012, 1012,\n", 259 | " 1012, 1012, 1012, 1012, 1012, 2012, 1012, 1012, 1012,\n", 260 | " 1012, 1012, 1012, 1012, 1012, 1012, 1012, 2152, 1012,\n", 261 | " 1037, 4438, 2240, 1024, 7742, 1024, 1045, 1005, 1049,\n", 262 | " 2182, 2000, 12803, 2028, 1997, 2115, 5089, 1012, 3076,\n", 263 | " 1024, 6160, 2000, 22953, 2213, 4381, 2152, 1012, 1045,\n", 264 | " 5987, 2008, 2116, 6001, 1997, 2026, 2287, 2228, 2008,\n", 265 | " 22953, 2213, 4381, 2152, 2003, 2521, 18584, 2098, 1012,\n", 266 | " 2054, 1037, 12063, 2008, 2009, 3475, 1005, 1056, 999,\n", 267 | " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 268 | " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 269 | " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 270 | " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 271 | " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 272 | " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 273 | " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 274 | " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 275 | " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 276 | " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 277 | " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 278 | " 0, 0, 0])" 279 | ] 280 | }, 281 | "execution_count": 15, 282 | "metadata": {}, 283 | "output_type": "execute_result" 284 | } 285 | ], 286 | "source": [ 287 | "text2ids(path.read_text())" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 16, 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "class ImdbDataset(Dataset):\n", 297 | " def __init__(self, PATH, train=\"train\"):\n", 298 | " self.path_to_images = PATH/train\n", 299 | " self.pos_files = list((self.path_to_images/\"pos\").iterdir())\n", 300 | " self.neg_files = list((self.path_to_images/\"neg\").iterdir()) \n", 301 | " self.files = self.pos_files + self.neg_files\n", 302 | " self.y = np.concatenate((np.ones(len(self.pos_files), dtype=int),\n", 303 | " np.zeros(len(self.neg_files), dtype=int)), axis=0)\n", 304 | " \n", 305 | " def __getitem__(self, index):\n", 306 | " path = self.files[index]\n", 307 | " x = text2ids(path.read_text())\n", 308 | " return x, self.y[index]\n", 309 | " \n", 310 | " def __len__(self):\n", 311 | " return len(self.y)" 312 | ] 313 | }, 314 | { 
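`text2ids` pads every review to 300 ids with zeros and `ImdbDataset` feeds those ids straight to the model, so BERT also attends to the padding; the `attention_mask` argument of `forward` is never used. One way to build a mask alongside the ids (a sketch with a made-up helper name, not from the original notebook):

```python
def text2ids_and_mask(text, max_seq_length=300):
    # Same tokenization and truncation as text2ids, plus a 0/1 mask
    # marking which positions hold real tokens rather than padding.
    tok_text = tokenizer.tokenize(text)[:max_seq_length]
    ids_text = tokenizer.convert_tokens_to_ids(tok_text)
    n_pad = max_seq_length - len(ids_text)
    ids = np.array(ids_text + [0] * n_pad)
    mask = np.array([1] * len(ids_text) + [0] * n_pad)
    return ids, mask
```

The dataset would then return `(ids, mask, label)` and the mask would be passed on through the model's `attention_mask` parameter; if you do this, check the argument order your installed `pytorch_transformers` version expects where the class forwards it to `self.bert`.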
315 | "cell_type": "code", 316 | "execution_count": 17, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "train_ds = ImdbDataset(PATH)\n", 321 | "valid_ds = ImdbDataset(PATH, \"test\")" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 18, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "batch_size = 10\n", 331 | "train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)\n", 332 | "valid_dl = DataLoader(valid_ds, batch_size=batch_size)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 19, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [ 341 | "x, y = train_ds[0]" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 20, 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [ 350 | "x, y = next(iter(train_dl))" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 21, 356 | "metadata": {}, 357 | "outputs": [ 358 | { 359 | "data": { 360 | "text/plain": [ 361 | "tensor([ 1045, 1005, 1049, 4089, 21474, 1012, 1045, 5632, 1000, 2980,\n", 362 | " 7171, 1000, 1998, 1000, 1996, 6248, 3282, 1000, 1998, 2037,\n", 363 | " 2116, 25815, 1010, 2130, 2043, 2087, 2111, 2179, 2068, 24257,\n", 364 | " 1012, 1045, 1005, 2310, 2130, 3266, 2000, 5959, 2087, 2703,\n", 365 | " 2100, 5370, 5691, 1012, 2045, 2003, 2069, 2028, 3185, 2008,\n", 366 | " 1045, 1005, 2310, 2464, 2008, 1045, 2064, 9826, 2360, 2001,\n", 367 | " 2919, 1012, 1012, 1012, 1998, 2023, 2001, 2009, 1012, 2009,\n", 368 | " 1005, 1055, 2042, 1037, 2096, 2144, 1045, 1005, 2310, 2464,\n", 369 | " 2009, 1010, 2021, 1045, 2079, 3342, 3564, 1999, 1996, 4258,\n", 370 | " 3241, 1010, 1000, 2023, 2003, 1037, 12873, 3185, 1012, 2339,\n", 371 | " 2106, 1045, 2156, 2023, 1029, 1000, 2009, 1005, 1055, 9826,\n", 372 | " 1996, 2069, 3185, 2008, 1045, 3685, 16755, 1012, 0, 0,\n", 373 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 374 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 375 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 376 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 377 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 378 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 379 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 380 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 381 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 382 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 383 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 384 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 385 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 386 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 387 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 388 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 389 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 390 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])" 391 | ] 392 | }, 393 | "execution_count": 21, 394 | "metadata": {}, 395 | "output_type": "execute_result" 396 | } 397 | ], 398 | "source": [ 399 | "x[3]" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": 22, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "def train_model(model, optimizer, num_epochs=25):\n", 409 | " for epoch in range(num_epochs):\n", 410 | " model.train()\n", 411 | " running_loss = 0.0\n", 412 | " for x, y in train_dl:\n", 413 | " x = x.cuda()\n", 414 | " y = y.unsqueeze(1).float().cuda()\n", 415 | " optimizer.zero_grad()\n", 416 | " logits = model(x)\n", 417 | " loss = F.binary_cross_entropy_with_logits(logits, y) \n", 418 | " loss.backward()\n", 419 | " optimizer.step()\n", 420 | " \n", 421 | " running_loss += loss.item() * x.size(0)\n", 422 | " epoch_loss = running_loss / len(train_ds)\n", 423 | " val_loss, accuracy = 
eval_model(model)\n", 424 | " print('train loss: {:.3f}, valid loss {:.3f} accuracy {:.3f}'.format(\n", 425 | " epoch_loss, val_loss, accuracy))" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 23, 431 | "metadata": {}, 432 | "outputs": [], 433 | "source": [ 434 | "def eval_model(model):\n", 435 | " model.eval()\n", 436 | " running_loss = 0.0\n", 437 | " correct = 0\n", 438 | " for x, y in valid_dl:\n", 439 | " x = x.cuda()\n", 440 | " y = y.unsqueeze(1).float().cuda()\n", 441 | " logits = model(x)\n", 442 | " loss = F.binary_cross_entropy_with_logits(logits, y) \n", 443 | " y_pred = logits > 0\n", 444 | " correct += (y_pred.float() == y).float().sum()\n", 445 | " running_loss += loss.item() * x.size(0)\n", 446 | " accuracy = correct / len(valid_ds)\n", 447 | " epoch_loss = running_loss / len(valid_ds)\n", 448 | " return epoch_loss, accuracy.item() " 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": 24, 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "model = model.cuda()" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": 25, 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [ 466 | "lrlast = .0001\n", 467 | "lrmain = .00001\n", 468 | "optimizer = optim.Adam(\n", 469 | " [\n", 470 | " {\"params\":model.bert.parameters(),\"lr\": lrmain},\n", 471 | " {\"params\":model.classifier.parameters(), \"lr\": lrlast},\n", 472 | " \n", 473 | " ])" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": 26, 479 | "metadata": {}, 480 | "outputs": [ 481 | { 482 | "name": "stdout", 483 | "output_type": "stream", 484 | "text": [ 485 | "train loss: 0.291, valid loss 0.201 accuracy 0.920\n", 486 | "train loss: 0.167, valid loss 0.197 accuracy 0.923\n" 487 | ] 488 | } 489 | ], 490 | "source": [ 491 | "train_model(model, optimizer, num_epochs=2)" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": null, 497 | "metadata": {}, 498 | "outputs": [], 499 | "source": [] 500 | } 501 | ], 502 | "metadata": { 503 | "kernelspec": { 504 | "display_name": "Python 3", 505 | "language": "python", 506 | "name": "python3" 507 | }, 508 | "language_info": { 509 | "codemirror_mode": { 510 | "name": "ipython", 511 | "version": 3 512 | }, 513 | "file_extension": ".py", 514 | "mimetype": "text/x-python", 515 | "name": "python", 516 | "nbconvert_exporter": "python", 517 | "pygments_lexer": "ipython3", 518 | "version": "3.7.3" 519 | }, 520 | "toc": { 521 | "nav_menu": {}, 522 | "number_sections": true, 523 | "sideBar": true, 524 | "skip_h1_title": false, 525 | "toc_cell": false, 526 | "toc_position": {}, 527 | "toc_section_display": "block", 528 | "toc_window_display": false 529 | } 530 | }, 531 | "nbformat": 4, 532 | "nbformat_minor": 2 533 | } 534 | -------------------------------------------------------------------------------- /lesson4-rnn-name2lang.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Classifing last names with character-level RNN" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "%reload_ext autoreload\n", 17 | "%autoreload 2\n", 18 | "%matplotlib inline\n", 19 | "import pandas as pd\n", 20 | "import numpy as np\n", 21 | "from pathlib import Path\n", 22 | "import torch\n", 23 | "from torch.utils.data import Dataset, DataLoader\n", 24 | 
"import torch.optim as optim\n", 25 | "import torch.nn as nn\n", 26 | "import torch.nn.functional as F\n", 27 | "import random" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Dataset\n", 35 | "`https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_train.csv.gz`\n", 36 | "\n", 37 | "`https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_test.csv.gz`" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 10, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "def unpack_dataset():\n", 47 | " ! wget https://raw.githubusercontent.com/hunkim/PyTorchZeroToAll/master/data/names_train.csv.gz \n", 48 | " ! wget https://raw.githubusercontent.com/hunkim/PyTorchZeroToAll/master/data/names_test.csv.gz \n", 49 | " ! mkdir -p data\n", 50 | " ! gunzip names_train.csv.gz \n", 51 | " ! gunzip names_test.csv.gz\n", 52 | " ! mv names*.csv data" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 11, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | "--2020-05-27 15:13:54-- https://raw.githubusercontent.com/hunkim/PyTorchZeroToAll/master/data/names_train.csv.gz\n", 65 | "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.188.133\n", 66 | "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.188.133|:443... connected.\n", 67 | "HTTP request sent, awaiting response... 200 OK\n", 68 | "Length: 50237 (49K) [application/octet-stream]\n", 69 | "Saving to: ‘names_train.csv.gz’\n", 70 | "\n", 71 | "names_train.csv.gz 100%[===================>] 49.06K --.-KB/s in 0.04s \n", 72 | "\n", 73 | "2020-05-27 15:13:54 (1.26 MB/s) - ‘names_train.csv.gz’ saved [50237/50237]\n", 74 | "\n", 75 | "--2020-05-27 15:13:54-- https://raw.githubusercontent.com/hunkim/PyTorchZeroToAll/master/data/names_test.csv.gz\n", 76 | "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.188.133\n", 77 | "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.188.133|:443... connected.\n", 78 | "HTTP request sent, awaiting response... 
200 OK\n", 79 | "Length: 27541 (27K) [application/octet-stream]\n", 80 | "Saving to: ‘names_test.csv.gz’\n", 81 | "\n", 82 | "names_test.csv.gz 100%[===================>] 26.90K --.-KB/s in 0.02s \n", 83 | "\n", 84 | "2020-05-27 15:13:55 (1.53 MB/s) - ‘names_test.csv.gz’ saved [27541/27541]\n", 85 | "\n" 86 | ] 87 | } 88 | ], 89 | "source": [ 90 | "unpack_dataset()" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 12, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "data": { 100 | "text/plain": [ 101 | "[PosixPath('data/glove.6B.300d.txt'),\n", 102 | " PosixPath('data/glove.6B.100d.txt'),\n", 103 | " PosixPath('data/names_train.csv'),\n", 104 | " PosixPath('data/names_test.csv'),\n", 105 | " PosixPath('data/glove.6B.50d.txt'),\n", 106 | " PosixPath('data/plot.tok.gt9.5000'),\n", 107 | " PosixPath('data/subjdata.README.1.0'),\n", 108 | " PosixPath('data/pmlb'),\n", 109 | " PosixPath('data/quote.tok.gt9.5000'),\n", 110 | " PosixPath('data/glove.6B.200d.txt'),\n", 111 | " PosixPath('data/glove.6B.zip')]" 112 | ] 113 | }, 114 | "execution_count": 12, 115 | "metadata": {}, 116 | "output_type": "execute_result" 117 | } 118 | ], 119 | "source": [ 120 | "PATH = Path(\"data\")\n", 121 | "list(PATH.iterdir())" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 13, 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "name": "stdout", 131 | "output_type": "stream", 132 | "text": [ 133 | "\"Adsit\",\"Czech\"\r", 134 | "\r\n", 135 | "\"Ajdrna\",\"Czech\"\r", 136 | "\r\n", 137 | "\"Antonowitsch\",\"Czech\"\r", 138 | "\r\n", 139 | "\"Antonowitz\",\"Czech\"\r", 140 | "\r\n", 141 | "\"Ballalatak\",\"Czech\"\r", 142 | "\r\n", 143 | "\"Ballaltick\",\"Czech\"\r", 144 | "\r\n", 145 | "\"Bastl\",\"Czech\"\r", 146 | "\r\n", 147 | "\"Baroch\",\"Czech\"\r", 148 | "\r\n", 149 | "\"Betlach\",\"Czech\"\r", 150 | "\r\n", 151 | "\"Biganska\",\"Czech\"\r", 152 | "\r\n" 153 | ] 154 | } 155 | ], 156 | "source": [ 157 | "! 
head data/names_train.csv" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "### Processing data" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 14, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "df = pd.read_csv(PATH/\"names_train.csv\", header=None)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 15, 179 | "metadata": {}, 180 | "outputs": [ 181 | { 182 | "data": { 183 | "text/plain": [ 184 | "[' ', \"'\", ',', 'A', 'B', 'C', 'D', 'E', 'F', 'G']" 185 | ] 186 | }, 187 | "execution_count": 15, 188 | "metadata": {}, 189 | "output_type": "execute_result" 190 | } 191 | ], 192 | "source": [ 193 | "# getting a vocabulary of characters\n", 194 | "letters = [list(l) for l in df[0].values]\n", 195 | "vocab = sorted(list(set(np.concatenate(np.array(letters)))))\n", 196 | "vocab[:10]" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 16, 202 | "metadata": {}, 203 | "outputs": [ 204 | { 205 | "data": { 206 | "text/plain": [ 207 | "0" 208 | ] 209 | }, 210 | "execution_count": 16, 211 | "metadata": {}, 212 | "output_type": "execute_result" 213 | } 214 | ], 215 | "source": [ 216 | "vocab2id = {key:i for i, key in enumerate(vocab)}\n", 217 | "vocab2id[\" \"] # I am going to use 0 to pad sequences" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 17, 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "data": { 227 | "text/plain": [ 228 | "{'Arabic': 0,\n", 229 | " 'Chinese': 1,\n", 230 | " 'Czech': 2,\n", 231 | " 'Dutch': 3,\n", 232 | " 'English': 4,\n", 233 | " 'French': 5,\n", 234 | " 'German': 6,\n", 235 | " 'Greek': 7,\n", 236 | " 'Irish': 8,\n", 237 | " 'Italian': 9,\n", 238 | " 'Japanese': 10,\n", 239 | " 'Korean': 11,\n", 240 | " 'Polish': 12,\n", 241 | " 'Portuguese': 13,\n", 242 | " 'Russian': 14,\n", 243 | " 'Scottish': 15,\n", 244 | " 'Spanish': 16,\n", 245 | " 'Vietnamese': 17}" 246 | ] 247 | }, 248 | "execution_count": 17, 249 | "metadata": {}, 250 | "output_type": "execute_result" 251 | } 252 | ], 253 | "source": [ 254 | "labels = sorted(df[1].unique())\n", 255 | "label2id = {key:i for i, key in enumerate(labels)}\n", 256 | "label2id" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 18, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "def pad_seq(x, seq_len=15, vocab2id=vocab2id):\n", 266 | " x = list(x)\n", 267 | " x = np.array([vocab2id[k] for k in x])\n", 268 | " z = np.zeros(seq_len, dtype=np.int32)\n", 269 | " n = min(seq_len, x.shape[0])\n", 270 | " z[seq_len - n:] = x[0:n]\n", 271 | " return z" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 19, 277 | "metadata": {}, 278 | "outputs": [ 279 | { 280 | "data": { 281 | "text/plain": [ 282 | "array([ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 29, 29, 30, 30, 30],\n", 283 | " dtype=int32)" 284 | ] 285 | }, 286 | "execution_count": 19, 287 | "metadata": {}, 288 | "output_type": "execute_result" 289 | } 290 | ], 291 | "source": [ 292 | "x = pad_seq(\"aabbb\")\n", 293 | "x" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 20, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "# one hot encoding\n", 303 | "def seq2matrix(x, vocab_len=55):\n", 304 | " z = np.zeros((x.shape[0], vocab_len))\n", 305 | " z[np.arange(len(x)), x] = 1\n", 306 | " return z" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 21, 312 | "metadata": {}, 
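`pad_seq` pads on the left with index 0 (the space character, as the cell above notes), so the characters of the name sit at the end of the window and are the last thing the RNN reads before the prediction; `seq2matrix` then turns each position into a one-hot row. A quick shape check (a sketch; "Adsit" is the first name in the training file):

```python
x = seq2matrix(pad_seq("Adsit"))
x.shape          # (15, 55): seq_len positions, one 55-dimensional one-hot row each
x.sum(axis=1)    # all ones: exactly one active character id per position
```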
313 | "outputs": [], 314 | "source": [ 315 | "class NameDataset(Dataset):\n", 316 | " def __init__(self, path, vocab2id, label2id, seq_len=15, vocab_len=55):\n", 317 | " self.df = pd.read_csv(path, header=None)\n", 318 | " self.label2id = label2id\n", 319 | " self.vocab2id = vocab2id\n", 320 | " self.seq_len = seq_len\n", 321 | " self.vocab_len = vocab_len \n", 322 | " self.x = df[0].values\n", 323 | " self.y = [self.label2id[l] for l in df[1].values]\n", 324 | " self.vocab2id = vocab2id\n", 325 | " \n", 326 | " def __len__(self):\n", 327 | " return len(self.y)\n", 328 | " \n", 329 | " def __getitem__(self, idx):\n", 330 | " x = pad_seq(self.x[idx], self.seq_len, self.vocab2id)\n", 331 | " x = seq2matrix(x, self.vocab_len)\n", 332 | " return x, self.y[idx]" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 22, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [ 341 | "train = NameDataset(PATH/\"names_train.csv\", vocab2id, label2id)\n", 342 | "val = NameDataset(PATH/\"names_test.csv\", vocab2id, label2id)" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 23, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "batch_size = 2000\n", 352 | "n=len(val)\n", 353 | "train_dl = DataLoader(train, batch_size=batch_size)\n", 354 | "val_dl = DataLoader(val, batch_size=n)" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": 24, 360 | "metadata": {}, 361 | "outputs": [ 362 | { 363 | "data": { 364 | "text/plain": [ 365 | "(13374, 13374)" 366 | ] 367 | }, 368 | "execution_count": 24, 369 | "metadata": {}, 370 | "output_type": "execute_result" 371 | } 372 | ], 373 | "source": [ 374 | "len(train), len(val)" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 25, 380 | "metadata": {}, 381 | "outputs": [ 382 | { 383 | "name": "stdout", 384 | "output_type": "stream", 385 | "text": [ 386 | "(15, 55) 2\n" 387 | ] 388 | } 389 | ], 390 | "source": [ 391 | "x,y = train[0]\n", 392 | "print(x.shape,y)" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": {}, 398 | "source": [ 399 | "## Model" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": 26, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "class CharRNN(nn.Module):\n", 409 | " def __init__(self, input_size, hidden_size, output_size):\n", 410 | " super(CharRNN, self).__init__()\n", 411 | "\n", 412 | " self.hidden_size = hidden_size\n", 413 | " self.linear_i2h = nn.Linear(input_size + hidden_size, hidden_size)\n", 414 | " self.linear_h2o = nn.Linear(hidden_size, output_size)\n", 415 | "\n", 416 | " def forward(self, x, hidden):\n", 417 | " combined = torch.cat((x, hidden), 1)\n", 418 | " hidden = torch.tanh(self.linear_i2h(combined))\n", 419 | " output = self.linear_h2o(hidden)\n", 420 | " return output, hidden\n", 421 | "\n", 422 | " def initHidden(self, bash_size):\n", 423 | " return torch.zeros(bash_size, self.hidden_size)" 424 | ] 425 | }, 426 | { 427 | "cell_type": "markdown", 428 | "metadata": {}, 429 | "source": [ 430 | "## Debugging model" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 29, 436 | "metadata": {}, 437 | "outputs": [], 438 | "source": [ 439 | "vocab_size = 55\n", 440 | "hidden_size = 100\n", 441 | "n_classes = 18\n", 442 | "model = CharRNN(vocab_size, hidden_size, n_classes)" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": 30, 448 | "metadata": {}, 449 | "outputs": [], 450 | "source": [ 451 | 
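One thing to watch in `NameDataset` (and in `NameDatasetEmb` further down): the constructor reads the CSV into `self.df` but then builds `self.x` and `self.y` from the global `df`, which holds the training file, so the "validation" set is silently the training set again; that is why `len(train)` and `len(val)` are both 13374 above. A corrected constructor (a sketch) reads from `self.df`; note that you may then also need to handle test-set characters that never occur in the training names, since `pad_seq` indexes `vocab2id` directly:

```python
class NameDataset(Dataset):
    def __init__(self, path, vocab2id, label2id, seq_len=15, vocab_len=55):
        self.df = pd.read_csv(path, header=None)
        self.label2id = label2id
        self.vocab2id = vocab2id
        self.seq_len = seq_len
        self.vocab_len = vocab_len
        # Use the frame read from `path`, not the global training frame `df`.
        self.x = self.df[0].values
        self.y = [self.label2id[l] for l in self.df[1].values]

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        x = pad_seq(self.x[idx], self.seq_len, self.vocab2id)
        x = seq2matrix(x, self.vocab_len)
        return x, self.y[idx]
```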
"x, y = next(iter(train_dl))" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": 31, 457 | "metadata": {}, 458 | "outputs": [ 459 | { 460 | "data": { 461 | "text/plain": [ 462 | "(torch.Size([2000, 15, 55]), torch.Size([2000]))" 463 | ] 464 | }, 465 | "execution_count": 31, 466 | "metadata": {}, 467 | "output_type": "execute_result" 468 | } 469 | ], 470 | "source": [ 471 | "x.shape, y.shape" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": 32, 477 | "metadata": {}, 478 | "outputs": [], 479 | "source": [ 480 | "batch = x.shape[0]\n", 481 | "h = model.initHidden(batch)\n", 482 | "x = x.float()\n", 483 | "y = y.long()" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": 33, 489 | "metadata": {}, 490 | "outputs": [ 491 | { 492 | "data": { 493 | "text/plain": [ 494 | "torch.Size([2000, 155])" 495 | ] 496 | }, 497 | "execution_count": 33, 498 | "metadata": {}, 499 | "output_type": "execute_result" 500 | } 501 | ], 502 | "source": [ 503 | "torch.cat((x[:,0], h), 1).size()" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": 34, 509 | "metadata": {}, 510 | "outputs": [], 511 | "source": [ 512 | "for ei in range(x.shape[1]):\n", 513 | " y_t, h = model(x[:,ei], h)" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": 35, 519 | "metadata": {}, 520 | "outputs": [ 521 | { 522 | "data": { 523 | "text/plain": [ 524 | "2.8705217838287354" 525 | ] 526 | }, 527 | "execution_count": 35, 528 | "metadata": {}, 529 | "output_type": "execute_result" 530 | } 531 | ], 532 | "source": [ 533 | "# note that just the last x_t is used in the loss\n", 534 | "# update\n", 535 | "loss = F.cross_entropy(y_t, y)\n", 536 | "loss.item()" 537 | ] 538 | }, 539 | { 540 | "cell_type": "markdown", 541 | "metadata": {}, 542 | "source": [ 543 | "## Training" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": 36, 549 | "metadata": {}, 550 | "outputs": [], 551 | "source": [ 552 | "vocab_size = 55\n", 553 | "hidden_size = 100\n", 554 | "n_classes = 18\n", 555 | "model = CharRNN(vocab_size, hidden_size, n_classes)" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": 37, 561 | "metadata": {}, 562 | "outputs": [], 563 | "source": [ 564 | "def get_optimizer(model, lr = 0.01, wd = 0.00001):\n", 565 | " parameters = filter(lambda p: p.requires_grad, model.parameters())\n", 566 | " optim = torch.optim.Adam(parameters, lr=lr, weight_decay=wd)\n", 567 | " return optim" 568 | ] 569 | }, 570 | { 571 | "cell_type": "code", 572 | "execution_count": 45, 573 | "metadata": {}, 574 | "outputs": [], 575 | "source": [ 576 | "def train(model, optim, train_dl):\n", 577 | " model.train()\n", 578 | " total = 0\n", 579 | " sum_loss = 0\n", 580 | " for x, y in train_dl:\n", 581 | " batch = x.shape[0]\n", 582 | " h = model.initHidden(batch)\n", 583 | " loss = 0\n", 584 | " x = x.float()\n", 585 | " y = y.long()\n", 586 | " \n", 587 | " for t in range(x.shape[1]):\n", 588 | " out, h = model(x[:,t], h)\n", 589 | " \n", 590 | " loss = F.cross_entropy(out, y)\n", 591 | " optim.zero_grad()\n", 592 | " loss.backward()\n", 593 | " optim.step()\n", 594 | " total += batch\n", 595 | " sum_loss += batch*(loss.item())\n", 596 | " return sum_loss/total" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": 46, 602 | "metadata": {}, 603 | "outputs": [], 604 | "source": [ 605 | "def val_metric(model, val_dl):\n", 606 | " model.eval()\n", 607 | " x, y = next(iter(val_dl))\n", 
608 | " x = x.float()\n", 609 | " y = y.long()\n", 610 | " N = x.shape[0]\n", 611 | " h = model.initHidden(N)\n", 612 | " for t in range(x.shape[1]):\n", 613 | " out, h = model(x[:,t], h)\n", 614 | " loss = F.cross_entropy(out, y)\n", 615 | " _, pred = torch.max(out, 1)\n", 616 | " acc = pred.eq(y).sum().float()/N\n", 617 | " return loss.item(), acc.item()" 618 | ] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": 47, 623 | "metadata": {}, 624 | "outputs": [], 625 | "source": [ 626 | "vocab_size = 55\n", 627 | "hidden_size = 80\n", 628 | "n_classes = 18" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": 48, 634 | "metadata": {}, 635 | "outputs": [], 636 | "source": [ 637 | "def train_loop(model, lr, train_dl, val_dl, epochs=20):\n", 638 | " optim = get_optimizer(model, lr =lr, wd = 0.0)\n", 639 | " for i in range(epochs):\n", 640 | " loss = train(model, optim, train_dl)\n", 641 | " val_loss, val_acc = val_metric(model, val_dl)\n", 642 | " if i%5 == 1: print(\"train loss %.3f val loss %.3f and val accuracy %.3f\" % (loss, val_loss, val_acc))" 643 | ] 644 | }, 645 | { 646 | "cell_type": "code", 647 | "execution_count": 49, 648 | "metadata": {}, 649 | "outputs": [], 650 | "source": [ 651 | "model = CharRNN(vocab_size, hidden_size, n_classes)" 652 | ] 653 | }, 654 | { 655 | "cell_type": "code", 656 | "execution_count": 50, 657 | "metadata": {}, 658 | "outputs": [ 659 | { 660 | "name": "stdout", 661 | "output_type": "stream", 662 | "text": [ 663 | "train loss 2.093 val loss 1.836 and val accuracy 0.469\n", 664 | "train loss 1.749 val loss 1.630 and val accuracy 0.488\n", 665 | "train loss 1.497 val loss 1.390 and val accuracy 0.570\n", 666 | "train loss 1.332 val loss 1.231 and val accuracy 0.622\n" 667 | ] 668 | } 669 | ], 670 | "source": [ 671 | "train_loop(model, 0.01, train_dl, val_dl, epochs=20)" 672 | ] 673 | }, 674 | { 675 | "cell_type": "code", 676 | "execution_count": 51, 677 | "metadata": {}, 678 | "outputs": [ 679 | { 680 | "name": "stdout", 681 | "output_type": "stream", 682 | "text": [ 683 | "train loss 1.122 val loss 1.108 and val accuracy 0.665\n", 684 | "train loss 1.086 val loss 1.075 and val accuracy 0.681\n", 685 | "train loss 1.064 val loss 1.049 and val accuracy 0.687\n", 686 | "train loss 1.042 val loss 1.026 and val accuracy 0.692\n" 687 | ] 688 | } 689 | ], 690 | "source": [ 691 | "train_loop(model, 0.001, train_dl, val_dl, epochs=20)" 692 | ] 693 | }, 694 | { 695 | "cell_type": "code", 696 | "execution_count": 52, 697 | "metadata": {}, 698 | "outputs": [ 699 | { 700 | "name": "stdout", 701 | "output_type": "stream", 702 | "text": [ 703 | "train loss 1.041 val loss 1.023 and val accuracy 0.690\n", 704 | "train loss 1.010 val loss 0.998 and val accuracy 0.702\n", 705 | "train loss 0.992 val loss 0.977 and val accuracy 0.711\n", 706 | "train loss 0.977 val loss 0.961 and val accuracy 0.716\n" 707 | ] 708 | } 709 | ], 710 | "source": [ 711 | "train_loop(model, 0.001, train_dl, val_dl, epochs=20)" 712 | ] 713 | }, 714 | { 715 | "cell_type": "markdown", 716 | "metadata": {}, 717 | "source": [ 718 | "# Model with character embeddings " 719 | ] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "execution_count": 53, 724 | "metadata": {}, 725 | "outputs": [], 726 | "source": [ 727 | "class NameDatasetEmb(Dataset):\n", 728 | " def __init__(self, path, vocab2id, label2id, seq_len=15, vocab_len=55):\n", 729 | " self.df = pd.read_csv(path, header=None)\n", 730 | " self.label2id = label2id\n", 731 | " self.vocab2id = vocab2id\n", 732 | " 
self.seq_len = seq_len\n", 733 | " self.vocab_len = vocab_len \n", 734 | " self.x = df[0].values\n", 735 | " self.y = [self.label2id[l] for l in df[1].values]\n", 736 | " self.vocab2id = vocab2id\n", 737 | " \n", 738 | " def __len__(self):\n", 739 | " return len(self.y)\n", 740 | " \n", 741 | " def __getitem__(self, idx):\n", 742 | " x = pad_seq(self.x[idx], self.seq_len, self.vocab2id)\n", 743 | " return x, self.y[idx]" 744 | ] 745 | }, 746 | { 747 | "cell_type": "code", 748 | "execution_count": 54, 749 | "metadata": {}, 750 | "outputs": [], 751 | "source": [ 752 | "train_2 = NameDatasetEmb(PATH/\"names_train.csv\", vocab2id, label2id)\n", 753 | "val_2 = NameDatasetEmb(PATH/\"names_test.csv\", vocab2id, label2id)" 754 | ] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "execution_count": 55, 759 | "metadata": {}, 760 | "outputs": [], 761 | "source": [ 762 | "batch_size = 2000\n", 763 | "n = len(val_2)\n", 764 | "train_dl_2 = DataLoader(train_2, batch_size=batch_size)\n", 765 | "val_dl_2 = DataLoader(val_2, batch_size=n)" 766 | ] 767 | }, 768 | { 769 | "cell_type": "code", 770 | "execution_count": 56, 771 | "metadata": {}, 772 | "outputs": [ 773 | { 774 | "data": { 775 | "text/plain": [ 776 | "(array([ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 32, 47, 37, 48],\n", 777 | " dtype=int32), 2)" 778 | ] 779 | }, 780 | "execution_count": 56, 781 | "metadata": {}, 782 | "output_type": "execute_result" 783 | } 784 | ], 785 | "source": [ 786 | "train_2[0]" 787 | ] 788 | }, 789 | { 790 | "cell_type": "code", 791 | "execution_count": 57, 792 | "metadata": {}, 793 | "outputs": [], 794 | "source": [ 795 | "class CharEmbRNN(nn.Module):\n", 796 | " def __init__(self, vocab_size, emb_size, hidden_size, output_size):\n", 797 | " super(CharEmbRNN, self).__init__()\n", 798 | " self.emb = nn.Embedding(vocab_size, emb_size)\n", 799 | " self.hidden_size = hidden_size\n", 800 | " self.linear_i2h = nn.Linear(emb_size + hidden_size, hidden_size)\n", 801 | " self.linear_h2o = nn.Linear(hidden_size, output_size)\n", 802 | "\n", 803 | " def forward(self, x, hidden):\n", 804 | " x = x.long()\n", 805 | " x = self.emb(x)\n", 806 | " combined = torch.cat((x, hidden), 1)\n", 807 | " hidden = torch.tanh(self.linear_i2h(combined))\n", 808 | " output = self.linear_h2o(hidden)\n", 809 | " return output, hidden\n", 810 | "\n", 811 | " def initHidden(self, bash_size):\n", 812 | " return torch.zeros(bash_size, self.hidden_size)" 813 | ] 814 | }, 815 | { 816 | "cell_type": "markdown", 817 | "metadata": {}, 818 | "source": [ 819 | "## Train " 820 | ] 821 | }, 822 | { 823 | "cell_type": "code", 824 | "execution_count": 58, 825 | "metadata": {}, 826 | "outputs": [ 827 | { 828 | "ename": "AssertionError", 829 | "evalue": "Torch not compiled with CUDA enabled", 830 | "output_type": "error", 831 | "traceback": [ 832 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 833 | "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", 834 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mhidden_size\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m80\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mn_classes\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m18\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mCharEmbRNN\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvocab_size\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0memb_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhidden_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_classes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcuda\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 835 | "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36mcuda\u001b[0;34m(self, device)\u001b[0m\n\u001b[1;32m 309\u001b[0m \u001b[0mModule\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 310\u001b[0m \"\"\"\n\u001b[0;32m--> 311\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcuda\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 312\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 313\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcpu\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 836 | "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_apply\u001b[0;34m(self, fn)\u001b[0m\n\u001b[1;32m 206\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 207\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mmodule\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchildren\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 208\u001b[0;31m \u001b[0mmodule\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 209\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcompute_should_use_set_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtensor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtensor_applied\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 837 | "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_apply\u001b[0;34m(self, fn)\u001b[0m\n\u001b[1;32m 228\u001b[0m \u001b[0;31m# `with torch.no_grad():`\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 229\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mno_grad\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 230\u001b[0;31m \u001b[0mparam_applied\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparam\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 231\u001b[0m \u001b[0mshould_use_set_data\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mcompute_should_use_set_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparam\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparam_applied\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 232\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mshould_use_set_data\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 838 | "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m\u001b[0;34m(t)\u001b[0m\n\u001b[1;32m 309\u001b[0m \u001b[0mModule\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 310\u001b[0m \"\"\"\n\u001b[0;32m--> 311\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcuda\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 312\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 313\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcpu\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 839 | "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/torch/cuda/__init__.py\u001b[0m in \u001b[0;36m_lazy_init\u001b[0;34m()\u001b[0m\n\u001b[1;32m 176\u001b[0m raise RuntimeError(\n\u001b[1;32m 177\u001b[0m \"Cannot re-initialize CUDA in forked subprocess. \" + msg)\n\u001b[0;32m--> 178\u001b[0;31m \u001b[0m_check_driver\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 179\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_C\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_cuda_init\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 180\u001b[0m \u001b[0m_cudart\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_load_cudart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 840 | "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/torch/cuda/__init__.py\u001b[0m in \u001b[0;36m_check_driver\u001b[0;34m()\u001b[0m\n\u001b[1;32m 90\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_check_driver\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 91\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_C\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'_cuda_isDriverSufficient'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 92\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mAssertionError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Torch not compiled with CUDA enabled\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 93\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m 
\u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_C\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_cuda_isDriverSufficient\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 94\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_C\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_cuda_getDriverVersion\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 841 | "\u001b[0;31mAssertionError\u001b[0m: Torch not compiled with CUDA enabled" 842 | ] 843 | } 844 | ], 845 | "source": [ 846 | "vocab_size = 55\n", 847 | "emb_size = 30\n", 848 | "hidden_size = 80\n", 849 | "n_classes = 18\n", 850 | "model = CharEmbRNN(vocab_size, emb_size, hidden_size, n_classes).cuda()" 851 | ] 852 | }, 853 | { 854 | "cell_type": "code", 855 | "execution_count": null, 856 | "metadata": {}, 857 | "outputs": [], 858 | "source": [ 859 | "train_loop(model, 0.01, train_dl_2, val_dl_2, epochs=50)" 860 | ] 861 | }, 862 | { 863 | "cell_type": "code", 864 | "execution_count": null, 865 | "metadata": {}, 866 | "outputs": [], 867 | "source": [ 868 | "train_loop(model, 0.01, train_dl_2, val_dl_2, epochs=50)" 869 | ] 870 | }, 871 | { 872 | "cell_type": "code", 873 | "execution_count": null, 874 | "metadata": {}, 875 | "outputs": [], 876 | "source": [ 877 | "train_loop(model, 0.01, train_dl_2, val_dl_2, epochs=20)" 878 | ] 879 | }, 880 | { 881 | "cell_type": "code", 882 | "execution_count": null, 883 | "metadata": {}, 884 | "outputs": [], 885 | "source": [ 886 | "train_loop(model, 0.01, train_dl_2, val_dl_2, epochs=20)" 887 | ] 888 | }, 889 | { 890 | "cell_type": "code", 891 | "execution_count": null, 892 | "metadata": {}, 893 | "outputs": [], 894 | "source": [ 895 | "train_loop(model, 0.001, train_dl_2, val_dl_2, epochs=50)" 896 | ] 897 | }, 898 | { 899 | "cell_type": "markdown", 900 | "metadata": {}, 901 | "source": [ 902 | "# References\n", 903 | "This notebook is a modified version of this tutorial\n", 904 | "http://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html. Here I implement vanilla RNNs." 
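A note on the `AssertionError` in the Train section above: it comes from calling `.cuda()` on a PyTorch build without GPU support. A device-agnostic setup avoids this; the sketch below also shows how a batch from `train_dl_2` would be unrolled one character at a time through `CharEmbRNN`, which mirrors what the `train_loop` helper used above (defined earlier in the notebook) is assumed to do. This is illustrative only, not the notebook's exact training code:

```python
import torch
import torch.nn.functional as F

# Fall back to CPU when CUDA is unavailable instead of raising an AssertionError.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CharEmbRNN(vocab_size=55, emb_size=30, hidden_size=80, output_size=18).to(device)

x, y = next(iter(train_dl_2))            # x: (batch, seq_len) of character ids
x, y = x.long().to(device), y.long().to(device)

hidden = model.initHidden(x.size(0)).to(device)
for t in range(x.size(1)):               # unroll the vanilla RNN over the characters
    output, hidden = model(x[:, t], hidden)

loss = F.cross_entropy(output, y)        # classify the language from the last step's output
```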
905 | ] 906 | }, 907 | { 908 | "cell_type": "code", 909 | "execution_count": null, 910 | "metadata": {}, 911 | "outputs": [], 912 | "source": [] 913 | } 914 | ], 915 | "metadata": { 916 | "kernelspec": { 917 | "display_name": "Python 3", 918 | "language": "python", 919 | "name": "python3" 920 | }, 921 | "language_info": { 922 | "codemirror_mode": { 923 | "name": "ipython", 924 | "version": 3 925 | }, 926 | "file_extension": ".py", 927 | "mimetype": "text/x-python", 928 | "name": "python", 929 | "nbconvert_exporter": "python", 930 | "pygments_lexer": "ipython3", 931 | "version": "3.7.3" 932 | }, 933 | "toc": { 934 | "nav_menu": {}, 935 | "number_sections": true, 936 | "sideBar": true, 937 | "skip_h1_title": false, 938 | "toc_cell": false, 939 | "toc_position": {}, 940 | "toc_section_display": "block", 941 | "toc_window_display": false 942 | } 943 | }, 944 | "nbformat": 4, 945 | "nbformat_minor": 2 946 | } 947 | -------------------------------------------------------------------------------- /lesson3-cnn-text.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Subjectivity classification with CNNs" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In this notebook we implement the approched described in this [paper](https://arxiv.org/pdf/1408.5882.pdf) for classifiying sentences using Convolutional Neural Networks. In particular, we will classify sentences into \"subjective\" or \"objective\". " 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import torch\n", 24 | "import torch.nn as nn\n", 25 | "import torch.nn.functional as F\n", 26 | "\n", 27 | "from torch.utils.data import Dataset, DataLoader" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "from sklearn.model_selection import train_test_split" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "## Subjectivity Dataset" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "The subjectivity dataset has 5000 subjective and 5000 objective processed sentences. 
To get the data:\n", 51 | "```\n", 52 | "wget http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz\n", 53 | "```" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 3, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "data": { 63 | "text/plain": [ 64 | "[PosixPath('data/glove.6B.300d.txt'),\n", 65 | " PosixPath('data/glove.6B.100d.txt'),\n", 66 | " PosixPath('data/names_train.csv'),\n", 67 | " PosixPath('data/names_test.csv'),\n", 68 | " PosixPath('data/glove.6B.50d.txt'),\n", 69 | " PosixPath('data/plot.tok.gt9.5000'),\n", 70 | " PosixPath('data/subjdata.README.1.0'),\n", 71 | " PosixPath('data/pmlb'),\n", 72 | " PosixPath('data/quote.tok.gt9.5000'),\n", 73 | " PosixPath('data/glove.6B.200d.txt'),\n", 74 | " PosixPath('data/glove.6B.zip')]" 75 | ] 76 | }, 77 | "execution_count": 3, 78 | "metadata": {}, 79 | "output_type": "execute_result" 80 | } 81 | ], 82 | "source": [ 83 | "from pathlib import Path\n", 84 | "PATH = Path(\"data\")\n", 85 | "list(PATH.iterdir())" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "From the readme file:\n", 93 | "- quote.tok.gt9.5000 contains 5000 subjective sentences (or snippets)\n", 94 | "- plot.tok.gt9.5000 contains 5000 objective sentences" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 4, 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | "the movie begins in the past where a young boy named sam attempts to save celebi from a hunter . \r\n", 107 | "emerging from the human psyche and showing characteristics of abstract expressionism , minimalism and russian constructivism , graffiti removal has secured its place in the history of modern art while being created by artists who are unconscious of their artistic achievements . \r\n", 108 | "spurning her mother's insistence that she get on with her life , mary is thrown out of the house , rejected by joe , and expelled from school as she grows larger with child . \r\n", 109 | "amitabh can't believe the board of directors and his mind is filled with revenge and what better revenge than robbing the bank himself , ironic as it may sound . \r\n", 110 | "she , among others excentricities , talks to a small rock , gertrude , like if she was alive . \r\n", 111 | "this gives the girls a fair chance of pulling the wool over their eyes using their sexiness to poach any last vestige of common sense the dons might have had . \r\n", 112 | "styled after vh1's \" behind the music , \" this mockumentary profiles the rise and fall of an internet startup , called icevan . com . \r\n", 113 | "being blue is not his only predicament ; he also lacks the ability to outwardly express his emotions . \r\n", 114 | "the killer's clues are a perversion of biblical punishments for sins : stoning , burning , decapitation . \r\n", 115 | "david is a painter with painter's block who takes a job as a waiter to get some inspiration . \r\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "! 
head data/plot.tok.gt9.5000" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "## String cleaning functions" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 5, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "import numpy as np\n", 137 | "from collections import defaultdict\n", 138 | "import re" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 6, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "# this is from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py\n", 148 | "def clean_str(string):\n", 149 | " \"\"\"\n", 150 | " Tokenization/string cleaning for all datasets except for SST.\n", 151 | " Every dataset is lower cased except for TREC\n", 152 | " \"\"\"\n", 153 | " string = re.sub(r\"[^A-Za-z0-9(),!?\\'\\`]\", \" \", string) \n", 154 | " string = re.sub(r\"\\'s\", \" \\'s\", string) \n", 155 | " string = re.sub(r\"\\'ve\", \" \\'ve\", string) \n", 156 | " string = re.sub(r\"n\\'t\", \" n\\'t\", string) \n", 157 | " string = re.sub(r\"\\'re\", \" \\'re\", string) \n", 158 | " string = re.sub(r\"\\'d\", \" \\'d\", string) \n", 159 | " string = re.sub(r\"\\'ll\", \" \\'ll\", string) \n", 160 | " string = re.sub(r\",\", \" , \", string) \n", 161 | " string = re.sub(r\"!\", \" ! \", string) \n", 162 | " string = re.sub(r\"\\(\", \" \\( \", string) \n", 163 | " string = re.sub(r\"\\)\", \" \\) \", string) \n", 164 | " string = re.sub(r\"\\?\", \" \\? \", string) \n", 165 | " string = re.sub(r\"\\s{2,}\", \" \", string) \n", 166 | " return string.strip().lower()" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 7, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "def read_file(path):\n", 176 | " \"\"\" Read file returns a shuttled list.\n", 177 | " \"\"\"\n", 178 | " with open(path, encoding = \"ISO-8859-1\") as f:\n", 179 | " content = np.array(f.readlines())\n", 180 | " return content" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 8, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "def get_vocab(list_of_content):\n", 190 | " \"\"\"Computes Dict of counts of words.\n", 191 | " \n", 192 | " Computes the number of times a word is on a document.\n", 193 | " \"\"\"\n", 194 | " vocab = defaultdict(float)\n", 195 | " for content in list_of_content:\n", 196 | " for line in content:\n", 197 | " line = clean_str(line.strip())\n", 198 | " words = set(line.split())\n", 199 | " for word in words:\n", 200 | " vocab[word] += 1\n", 201 | " return vocab " 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "## Split train and test" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 9, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "sub_content = read_file(PATH/\"quote.tok.gt9.5000\")\n", 218 | "obj_content = read_file(PATH/\"plot.tok.gt9.5000\")\n", 219 | "sub_content = np.array([clean_str(line.strip()) for line in sub_content])\n", 220 | "obj_content = np.array([clean_str(line.strip()) for line in obj_content])\n", 221 | "sub_y = np.zeros(len(sub_content))\n", 222 | "obj_y = np.ones(len(obj_content))\n", 223 | "X = np.append(sub_content, obj_content)\n", 224 | "y = np.append(sub_y, obj_y)" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 10, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "X_train, X_val, 
y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 11, 239 | "metadata": {}, 240 | "outputs": [ 241 | { 242 | "data": { 243 | "text/plain": [ 244 | "(array(['will god let her fall or give her a new path \\\\?',\n", 245 | " \"the director 's twitchy sketchbook style and adroit perspective shifts grow wearisome amid leaden pacing and indifferent craftsmanship \\\\( most notably wretched sound design \\\\)\",\n", 246 | " \"welles groupie scholar peter bogdanovich took a long time to do it , but he 's finally provided his own broadside at publishing giant william randolph hearst\",\n", 247 | " 'based on the 1997 john king novel of the same name with a rather odd synopsis a first novel about a seasoned chelsea football club hooligan who represents a disaffected society operating by brutal rules',\n", 248 | " 'yet , beneath an upbeat appearance , she is struggling desperately with the emotional and physical scars left by the attack'],\n", 249 | " dtype='\":0, \"UNK\":1} # init with padding and unknown\n", 332 | "words = [\"\", \"UNK\"]\n", 333 | "for word in word_count:\n", 334 | " vocab2index[word] = len(words)\n", 335 | " words.append(word)" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "## Embedding Layer" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 17, 348 | "metadata": {}, 349 | "outputs": [ 350 | { 351 | "data": { 352 | "text/plain": [ 353 | "tensor([[[-0.6895, 0.5455, 0.9540],\n", 354 | " [-0.7710, 0.6922, -1.5952],\n", 355 | " [-0.3402, -1.1721, -1.0863],\n", 356 | " [ 0.8806, -0.4155, 1.0337],\n", 357 | " [-0.6895, 0.5455, 0.9540]]], grad_fn=)" 358 | ] 359 | }, 360 | "execution_count": 17, 361 | "metadata": {}, 362 | "output_type": "execute_result" 363 | } 364 | ], 365 | "source": [ 366 | "# an Embedding module containing 10 (words) tensors of size 3\n", 367 | "embed = nn.Embedding(10, 3)\n", 368 | "a = torch.LongTensor([[1,2,4,5,1]])\n", 369 | "embed(a)" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 18, 375 | "metadata": {}, 376 | "outputs": [ 377 | { 378 | "data": { 379 | "text/plain": [ 380 | "tensor([[-0.3417, 0.4184, 0.5396],\n", 381 | " [-0.6895, 0.5455, 0.9540],\n", 382 | " [-0.7710, 0.6922, -1.5952],\n", 383 | " [-1.0999, -0.3351, 0.2493],\n", 384 | " [-0.3402, -1.1721, -1.0863],\n", 385 | " [ 0.8806, -0.4155, 1.0337],\n", 386 | " [-0.1286, 1.2243, 0.7281],\n", 387 | " [-0.2313, 1.6461, -1.4697],\n", 388 | " [-0.6915, 1.0947, -0.2442],\n", 389 | " [-0.2410, 0.2314, -0.6789]])" 390 | ] 391 | }, 392 | "execution_count": 18, 393 | "metadata": {}, 394 | "output_type": "execute_result" 395 | } 396 | ], 397 | "source": [ 398 | "## here is the randomly initialized embeddings\n", 399 | "embed.weight.data" 400 | ] 401 | }, 402 | { 403 | "cell_type": "markdown", 404 | "metadata": {}, 405 | "source": [ 406 | "Question: How many parameters do we have in this embedding matrix?" 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "metadata": {}, 412 | "source": [ 413 | "## Encoding training and validation sets" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": {}, 419 | "source": [ 420 | "We will be using 1D Convolutional neural networks as our model. CNNs assume a fixed input size so we need to assume a fixed size and truncate or pad the sentences as needed. Let's find a good value to set our sequence length to." 
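As an aside, the answer to the embedding question above: `nn.Embedding(10, 3)` stores one 3-dimensional vector per vocabulary entry, so the layer holds a 10 x 3 weight matrix, i.e. 30 trainable parameters. A quick check:

```python
embed = nn.Embedding(10, 3)
print(embed.weight.numel())   # 30 = 10 words * 3 dimensions
```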
421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 19, 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "x_len = np.array([len(x.split()) for x in X_train])" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 20, 435 | "metadata": {}, 436 | "outputs": [ 437 | { 438 | "data": { 439 | "text/plain": [ 440 | "42.0" 441 | ] 442 | }, 443 | "execution_count": 20, 444 | "metadata": {}, 445 | "output_type": "execute_result" 446 | } 447 | ], 448 | "source": [ 449 | "np.percentile(x_len, 95) # let set the max sequence len to N=40" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": 21, 455 | "metadata": {}, 456 | "outputs": [ 457 | { 458 | "data": { 459 | "text/plain": [ 460 | "'will god let her fall or give her a new path \\\\?'" 461 | ] 462 | }, 463 | "execution_count": 21, 464 | "metadata": {}, 465 | "output_type": "execute_result" 466 | } 467 | ], 468 | "source": [ 469 | "X_train[0]" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 22, 475 | "metadata": {}, 476 | "outputs": [ 477 | { 478 | "data": { 479 | "text/plain": [ 480 | "2" 481 | ] 482 | }, 483 | "execution_count": 22, 484 | "metadata": {}, 485 | "output_type": "execute_result" 486 | } 487 | ], 488 | "source": [ 489 | "# returns the index of the word or the index of \"UNK\" otherwise\n", 490 | "vocab2index.get(\"will\", vocab2index[\"UNK\"])" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 23, 496 | "metadata": {}, 497 | "outputs": [ 498 | { 499 | "data": { 500 | "text/plain": [ 501 | "array([ 2, 11, 10, 4, 12, 5, 6, 4, 7, 3, 8, 9])" 502 | ] 503 | }, 504 | "execution_count": 23, 505 | "metadata": {}, 506 | "output_type": "execute_result" 507 | } 508 | ], 509 | "source": [ 510 | "np.array([vocab2index.get(w, vocab2index[\"UNK\"]) for w in X_train[0].split()])" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": 24, 516 | "metadata": {}, 517 | "outputs": [], 518 | "source": [ 519 | "def encode_sentence(s, N=40):\n", 520 | " enc = np.zeros(N, dtype=np.int32)\n", 521 | " enc1 = np.array([vocab2index.get(w, vocab2index[\"UNK\"]) for w in s.split()])\n", 522 | " l = min(N, len(enc1))\n", 523 | " enc[:l] = enc1[:l]\n", 524 | " return enc" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": 25, 530 | "metadata": {}, 531 | "outputs": [], 532 | "source": [ 533 | "class SubjectivityDataset(Dataset):\n", 534 | " def __init__(self, X, y):\n", 535 | " self.x = X\n", 536 | " self.y = y\n", 537 | " \n", 538 | " def __len__(self):\n", 539 | " return len(self.y)\n", 540 | " \n", 541 | " def __getitem__(self, idx):\n", 542 | " x = self.x[idx]\n", 543 | " x = encode_sentence(x)\n", 544 | " return x, self.y[idx]\n", 545 | " \n", 546 | "train_ds = SubjectivityDataset(X_train, y_train)\n", 547 | "valid_ds = SubjectivityDataset(X_val, y_val)" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": 26, 553 | "metadata": {}, 554 | "outputs": [ 555 | { 556 | "data": { 557 | "text/plain": [ 558 | "(array([ 1, 498, 2405, 63, 94, 61, 3622, 19, 1331, 498, 2151,\n", 559 | " 315, 94, 61, 1, 1, 0, 0, 0, 0, 0, 0,\n", 560 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 561 | " 0, 0, 0, 0, 0, 0, 0], dtype=int32), 1.0)" 562 | ] 563 | }, 564 | "execution_count": 26, 565 | "metadata": {}, 566 | "output_type": "execute_result" 567 | } 568 | ], 569 | "source": [ 570 | "valid_ds[0]" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": 27, 576 | 
"metadata": {}, 577 | "outputs": [], 578 | "source": [ 579 | "train_dl = DataLoader(train_ds, batch_size=500, shuffle=True)\n", 580 | "valid_dl = DataLoader(valid_ds, batch_size=500)" 581 | ] 582 | }, 583 | { 584 | "cell_type": "markdown", 585 | "metadata": {}, 586 | "source": [ 587 | "## Playing and debugging CNN layers" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": 28, 593 | "metadata": {}, 594 | "outputs": [], 595 | "source": [ 596 | "tr_dl = DataLoader(train_ds, batch_size=3, shuffle=True)" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": 41, 602 | "metadata": {}, 603 | "outputs": [], 604 | "source": [ 605 | "V = len(words)\n", 606 | "D = 7\n", 607 | "N = 40" 608 | ] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "execution_count": 42, 613 | "metadata": {}, 614 | "outputs": [], 615 | "source": [ 616 | "emb = nn.Embedding(V, D)" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": 43, 622 | "metadata": {}, 623 | "outputs": [ 624 | { 625 | "data": { 626 | "text/plain": [ 627 | "(torch.Size([3, 40]), tensor([0., 1., 1.]))" 628 | ] 629 | }, 630 | "execution_count": 43, 631 | "metadata": {}, 632 | "output_type": "execute_result" 633 | } 634 | ], 635 | "source": [ 636 | "x, y = next(iter(tr_dl))\n", 637 | "x.shape, y" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": 44, 643 | "metadata": {}, 644 | "outputs": [ 645 | { 646 | "data": { 647 | "text/plain": [ 648 | "tensor([[ 151, 69, 180, 1, 172, 26, 797, 7, 1, 92, 7, 251,\n", 649 | " 273, 122, 1270, 587, 7, 158, 63, 1526, 1, 55, 344, 7,\n", 650 | " 1, 37, 153, 3492, 3690, 391, 1, 1896, 7, 273, 220, 46,\n", 651 | " 395, 1, 0, 0],\n", 652 | " [ 151, 2283, 981, 1547, 59, 1, 1, 19, 1, 0, 0, 0,\n", 653 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 654 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 655 | " 0, 0, 0, 0],\n", 656 | " [ 77, 7, 3784, 19, 148, 98, 790, 37, 1538, 1, 1, 3783,\n", 657 | " 391, 1, 363, 619, 19, 1, 0, 0, 0, 0, 0, 0,\n", 658 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 659 | " 0, 0, 0, 0]], dtype=torch.int32)" 660 | ] 661 | }, 662 | "execution_count": 44, 663 | "metadata": {}, 664 | "output_type": "execute_result" 665 | } 666 | ], 667 | "source": [ 668 | "x" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": 45, 674 | "metadata": {}, 675 | "outputs": [], 676 | "source": [ 677 | "x1 = emb(x.long())" 678 | ] 679 | }, 680 | { 681 | "cell_type": "code", 682 | "execution_count": 46, 683 | "metadata": {}, 684 | "outputs": [ 685 | { 686 | "data": { 687 | "text/plain": [ 688 | "torch.Size([3, 40, 7])" 689 | ] 690 | }, 691 | "execution_count": 46, 692 | "metadata": {}, 693 | "output_type": "execute_result" 694 | } 695 | ], 696 | "source": [ 697 | "x1.size()" 698 | ] 699 | }, 700 | { 701 | "cell_type": "code", 702 | "execution_count": 47, 703 | "metadata": {}, 704 | "outputs": [ 705 | { 706 | "data": { 707 | "text/plain": [ 708 | "torch.Size([3, 7, 40])" 709 | ] 710 | }, 711 | "execution_count": 47, 712 | "metadata": {}, 713 | "output_type": "execute_result" 714 | } 715 | ], 716 | "source": [ 717 | "x1 = x1.transpose(1,2) # needs to convert x to (batch, embedding_dim, sentence_len)\n", 718 | "x1.size()" 719 | ] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "execution_count": 48, 724 | "metadata": {}, 725 | "outputs": [], 726 | "source": [ 727 | "conv_3 = nn.Conv1d(in_channels=D, out_channels=100, kernel_size=3)" 728 | ] 729 | }, 730 | { 731 | "cell_type": "code", 732 | "execution_count": 49, 733 | 
"metadata": {}, 734 | "outputs": [], 735 | "source": [ 736 | "x3 = conv_3(x1)" 737 | ] 738 | }, 739 | { 740 | "cell_type": "code", 741 | "execution_count": 50, 742 | "metadata": {}, 743 | "outputs": [ 744 | { 745 | "data": { 746 | "text/plain": [ 747 | "torch.Size([3, 100, 38])" 748 | ] 749 | }, 750 | "execution_count": 50, 751 | "metadata": {}, 752 | "output_type": "execute_result" 753 | } 754 | ], 755 | "source": [ 756 | "x3.size()" 757 | ] 758 | }, 759 | { 760 | "cell_type": "code", 761 | "execution_count": 51, 762 | "metadata": {}, 763 | "outputs": [], 764 | "source": [ 765 | "conv_4 = nn.Conv1d(in_channels=D, out_channels=100, kernel_size=4)\n", 766 | "conv_5 = nn.Conv1d(in_channels=D, out_channels=100, kernel_size=5)" 767 | ] 768 | }, 769 | { 770 | "cell_type": "code", 771 | "execution_count": 52, 772 | "metadata": {}, 773 | "outputs": [ 774 | { 775 | "name": "stdout", 776 | "output_type": "stream", 777 | "text": [ 778 | "torch.Size([3, 100, 37]) torch.Size([3, 100, 36])\n" 779 | ] 780 | } 781 | ], 782 | "source": [ 783 | "x4 = conv_4(x1)\n", 784 | "x5 = conv_5(x1)\n", 785 | "print(x4.size(), x5.size())" 786 | ] 787 | }, 788 | { 789 | "cell_type": "markdown", 790 | "metadata": {}, 791 | "source": [ 792 | "Note that the convolution all apply to the same `x1`. How do we combine now the results of the convolutions? " 793 | ] 794 | }, 795 | { 796 | "cell_type": "code", 797 | "execution_count": 53, 798 | "metadata": {}, 799 | "outputs": [ 800 | { 801 | "data": { 802 | "text/plain": [ 803 | "torch.Size([3, 100, 1])" 804 | ] 805 | }, 806 | "execution_count": 53, 807 | "metadata": {}, 808 | "output_type": "execute_result" 809 | } 810 | ], 811 | "source": [ 812 | "# 100 3-gram detectors\n", 813 | "x3 = nn.ReLU()(x3)\n", 814 | "x3 = nn.MaxPool1d(kernel_size = 38)(x3)\n", 815 | "x3.size()" 816 | ] 817 | }, 818 | { 819 | "cell_type": "code", 820 | "execution_count": 54, 821 | "metadata": {}, 822 | "outputs": [ 823 | { 824 | "data": { 825 | "text/plain": [ 826 | "torch.Size([3, 100, 1])" 827 | ] 828 | }, 829 | "execution_count": 54, 830 | "metadata": {}, 831 | "output_type": "execute_result" 832 | } 833 | ], 834 | "source": [ 835 | "# 100 4-gram detectors\n", 836 | "x4 = nn.ReLU()(x4)\n", 837 | "x4 = nn.MaxPool1d(kernel_size = 37)(x4)\n", 838 | "x4.size()" 839 | ] 840 | }, 841 | { 842 | "cell_type": "code", 843 | "execution_count": 55, 844 | "metadata": {}, 845 | "outputs": [ 846 | { 847 | "data": { 848 | "text/plain": [ 849 | "torch.Size([3, 100, 1])" 850 | ] 851 | }, 852 | "execution_count": 55, 853 | "metadata": {}, 854 | "output_type": "execute_result" 855 | } 856 | ], 857 | "source": [ 858 | "# 100 5-gram detectors\n", 859 | "x5 = nn.ReLU()(x5)\n", 860 | "x5 = nn.MaxPool1d(kernel_size = 36)(x5)\n", 861 | "x5.size()" 862 | ] 863 | }, 864 | { 865 | "cell_type": "code", 866 | "execution_count": 56, 867 | "metadata": {}, 868 | "outputs": [ 869 | { 870 | "data": { 871 | "text/plain": [ 872 | "torch.Size([3, 100, 3])" 873 | ] 874 | }, 875 | "execution_count": 56, 876 | "metadata": {}, 877 | "output_type": "execute_result" 878 | } 879 | ], 880 | "source": [ 881 | "# concatenate x3, x4, x5\n", 882 | "out = torch.cat([x3, x4, x5], 2)\n", 883 | "out.size()" 884 | ] 885 | }, 886 | { 887 | "cell_type": "code", 888 | "execution_count": 57, 889 | "metadata": {}, 890 | "outputs": [ 891 | { 892 | "data": { 893 | "text/plain": [ 894 | "torch.Size([3, 300])" 895 | ] 896 | }, 897 | "execution_count": 57, 898 | "metadata": {}, 899 | "output_type": "execute_result" 900 | } 901 | ], 902 | "source": [ 903 | "out = 
out.view(out.size(0), -1)\n", 904 | "out.size()" 905 | ] 906 | }, 907 | { 908 | "cell_type": "markdown", 909 | "metadata": {}, 910 | "source": [ 911 | "After this we have a fully connected network. Let's write a network that implements this." 912 | ] 913 | }, 914 | { 915 | "cell_type": "markdown", 916 | "metadata": {}, 917 | "source": [ 918 | "## 1D CNN model for sentence classification" 919 | ] 920 | }, 921 | { 922 | "cell_type": "markdown", 923 | "metadata": {}, 924 | "source": [ 925 | "Notation:\n", 926 | "* V -- vocabulary size\n", 927 | "* D -- embedding size\n", 928 | "* N -- MAX Sentence length" 929 | ] 930 | }, 931 | { 932 | "cell_type": "code", 933 | "execution_count": 101, 934 | "metadata": {}, 935 | "outputs": [], 936 | "source": [ 937 | "class SentenceCNN(nn.Module):\n", 938 | " \n", 939 | " def __init__(self, V, D):\n", 940 | " super(SentenceCNN, self).__init__()\n", 941 | " self.embedding = nn.Embedding(V, D, padding_idx=0)\n", 942 | "\n", 943 | " self.conv_3 = nn.Conv1d(in_channels=D, out_channels=100, kernel_size=3)\n", 944 | " self.conv_4 = nn.Conv1d(in_channels=D, out_channels=100, kernel_size=4)\n", 945 | " self.conv_5 = nn.Conv1d(in_channels=D, out_channels=100, kernel_size=5)\n", 946 | " \n", 947 | " self.bn= nn.BatchNorm1d(300)\n", 948 | " \n", 949 | " self.dropout = nn.Dropout(p=0.5)\n", 950 | " self.fc = nn.Linear(300, 1)\n", 951 | " \n", 952 | " def forward(self, x):\n", 953 | " x = self.embedding(x)\n", 954 | " x = self.dropout(x)\n", 955 | " x = x.transpose(1,2)\n", 956 | " x3 = F.relu(self.conv_3(x))\n", 957 | " x4 = F.relu(self.conv_4(x))\n", 958 | " x5 = F.relu(self.conv_5(x))\n", 959 | " x3 = nn.MaxPool1d(kernel_size = 38)(x3)\n", 960 | " x4 = nn.MaxPool1d(kernel_size = 37)(x4)\n", 961 | " x5 = nn.MaxPool1d(kernel_size = 36)(x5)\n", 962 | " out = torch.cat([x3, x4, x5], 2)\n", 963 | " out = out.view(out.size(0), -1)\n", 964 | " out = self.dropout(self.bn(out))\n", 965 | " return self.fc(out) " 966 | ] 967 | }, 968 | { 969 | "cell_type": "code", 970 | "execution_count": 91, 971 | "metadata": {}, 972 | "outputs": [], 973 | "source": [ 974 | "V = len(words)\n", 975 | "D = 50\n", 976 | "N = 40\n", 977 | "model = SentenceCNN(V, D)" 978 | ] 979 | }, 980 | { 981 | "cell_type": "code", 982 | "execution_count": 92, 983 | "metadata": {}, 984 | "outputs": [], 985 | "source": [ 986 | "x, y = next(iter(train_dl))" 987 | ] 988 | }, 989 | { 990 | "cell_type": "code", 991 | "execution_count": 93, 992 | "metadata": {}, 993 | "outputs": [ 994 | { 995 | "data": { 996 | "text/plain": [ 997 | "torch.Size([500, 1])" 998 | ] 999 | }, 1000 | "execution_count": 93, 1001 | "metadata": {}, 1002 | "output_type": "execute_result" 1003 | } 1004 | ], 1005 | "source": [ 1006 | "y_hat = model(x.long())\n", 1007 | "y_hat.size()" 1008 | ] 1009 | }, 1010 | { 1011 | "cell_type": "code", 1012 | "execution_count": 94, 1013 | "metadata": {}, 1014 | "outputs": [ 1015 | { 1016 | "data": { 1017 | "text/plain": [ 1018 | "tensor(0.7426, grad_fn=)" 1019 | ] 1020 | }, 1021 | "execution_count": 94, 1022 | "metadata": {}, 1023 | "output_type": "execute_result" 1024 | } 1025 | ], 1026 | "source": [ 1027 | "F.binary_cross_entropy_with_logits(y_hat, y.unsqueeze(1))" 1028 | ] 1029 | }, 1030 | { 1031 | "cell_type": "markdown", 1032 | "metadata": {}, 1033 | "source": [ 1034 | "## Training" 1035 | ] 1036 | }, 1037 | { 1038 | "cell_type": "code", 1039 | "execution_count": 104, 1040 | "metadata": {}, 1041 | "outputs": [], 1042 | "source": [ 1043 | "def update_optimizer(optimizer, lr):\n", 1044 | " for i, param_group 
in enumerate(optimizer.param_groups):\n", 1045 | " param_group[\"lr\"] = lr" 1046 | ] 1047 | }, 1048 | { 1049 | "cell_type": "code", 1050 | "execution_count": 95, 1051 | "metadata": {}, 1052 | "outputs": [], 1053 | "source": [ 1054 | "def valid_metrics(model):\n", 1055 | " model.eval()\n", 1056 | " total = 0\n", 1057 | " sum_loss = 0\n", 1058 | " correct = 0\n", 1059 | " for x, y in valid_dl:\n", 1060 | " x = x.long() #.cuda()\n", 1061 | " y = y.float().unsqueeze(1)\n", 1062 | " batch = y.shape[0]\n", 1063 | " out = model(x)\n", 1064 | " loss = F.binary_cross_entropy_with_logits(out, y)\n", 1065 | " sum_loss += batch*(loss.item())\n", 1066 | " total += batch\n", 1067 | " pred = (out > 0).float()\n", 1068 | " correct += (pred == y).float().sum().item()\n", 1069 | " val_loss = sum_loss/total\n", 1070 | " val_acc = correct/total\n", 1071 | " return val_loss, val_acc" 1072 | ] 1073 | }, 1074 | { 1075 | "cell_type": "code", 1076 | "execution_count": 96, 1077 | "metadata": {}, 1078 | "outputs": [], 1079 | "source": [ 1080 | "def train_epocs(model, optimizer, epochs=10):\n", 1081 | " for i in range(epochs):\n", 1082 | " model.train()\n", 1083 | " total_loss = 0\n", 1084 | " total = 0\n", 1085 | " for x, y in train_dl:\n", 1086 | " x = x.long()\n", 1087 | " y = y.float().unsqueeze(1)\n", 1088 | " out = model(x)\n", 1089 | " loss = F.binary_cross_entropy_with_logits(out, y)\n", 1090 | " optimizer.zero_grad()\n", 1091 | " loss.backward()\n", 1092 | " optimizer.step()\n", 1093 | " total_loss += x.size(0)*loss.item()\n", 1094 | " total += x.size(0)\n", 1095 | " train_loss = total_loss/total\n", 1096 | " val_loss, val_accuracy = valid_metrics(model)\n", 1097 | " \n", 1098 | " print(\"train_loss %.3f val_loss %.3f val_accuracy %.3f\" % (\n", 1099 | " train_loss, val_loss, val_accuracy))" 1100 | ] 1101 | }, 1102 | { 1103 | "cell_type": "code", 1104 | "execution_count": 105, 1105 | "metadata": {}, 1106 | "outputs": [], 1107 | "source": [ 1108 | "V = len(words)\n", 1109 | "D = 100\n", 1110 | "model = SentenceCNN(V, D)" 1111 | ] 1112 | }, 1113 | { 1114 | "cell_type": "code", 1115 | "execution_count": 106, 1116 | "metadata": {}, 1117 | "outputs": [ 1118 | { 1119 | "name": "stdout", 1120 | "output_type": "stream", 1121 | "text": [ 1122 | "train_loss 0.568 val_loss 0.405 val_accuracy 0.815\n", 1123 | "train_loss 0.425 val_loss 0.358 val_accuracy 0.838\n", 1124 | "train_loss 0.361 val_loss 0.336 val_accuracy 0.854\n", 1125 | "train_loss 0.302 val_loss 0.300 val_accuracy 0.872\n", 1126 | "train_loss 0.251 val_loss 0.280 val_accuracy 0.884\n", 1127 | "train_loss 0.222 val_loss 0.284 val_accuracy 0.881\n", 1128 | "train_loss 0.194 val_loss 0.294 val_accuracy 0.885\n", 1129 | "train_loss 0.174 val_loss 0.276 val_accuracy 0.889\n", 1130 | "train_loss 0.158 val_loss 0.295 val_accuracy 0.887\n", 1131 | "train_loss 0.138 val_loss 0.336 val_accuracy 0.883\n" 1132 | ] 1133 | } 1134 | ], 1135 | "source": [ 1136 | "optimizer = torch.optim.Adam(model.parameters(), lr=0.01)\n", 1137 | "train_epocs(model, optimizer, epochs=10)" 1138 | ] 1139 | }, 1140 | { 1141 | "cell_type": "code", 1142 | "execution_count": 107, 1143 | "metadata": {}, 1144 | "outputs": [ 1145 | { 1146 | "name": "stdout", 1147 | "output_type": "stream", 1148 | "text": [ 1149 | "train_loss 0.129 val_loss 0.296 val_accuracy 0.888\n", 1150 | "train_loss 0.122 val_loss 0.291 val_accuracy 0.889\n", 1151 | "train_loss 0.114 val_loss 0.293 val_accuracy 0.893\n", 1152 | "train_loss 0.118 val_loss 0.290 val_accuracy 0.893\n", 1153 | "train_loss 0.124 val_loss 0.286 
val_accuracy 0.893\n", 1154 | "train_loss 0.111 val_loss 0.290 val_accuracy 0.894\n", 1155 | "train_loss 0.112 val_loss 0.300 val_accuracy 0.895\n", 1156 | "train_loss 0.112 val_loss 0.296 val_accuracy 0.896\n", 1157 | "train_loss 0.112 val_loss 0.294 val_accuracy 0.894\n", 1158 | "train_loss 0.108 val_loss 0.291 val_accuracy 0.899\n" 1159 | ] 1160 | } 1161 | ], 1162 | "source": [ 1163 | "update_optimizer(optimizer, lr=0.001)\n", 1164 | "train_epocs(model, optimizer, epochs=10)" 1165 | ] 1166 | }, 1167 | { 1168 | "cell_type": "code", 1169 | "execution_count": null, 1170 | "metadata": {}, 1171 | "outputs": [ 1172 | { 1173 | "name": "stdout", 1174 | "output_type": "stream", 1175 | "text": [ 1176 | "train_loss 0.097 val_loss 0.289 val_accuracy 0.900\n", 1177 | "train_loss 0.100 val_loss 0.294 val_accuracy 0.899\n", 1178 | "train_loss 0.100 val_loss 0.290 val_accuracy 0.901\n", 1179 | "train_loss 0.098 val_loss 0.294 val_accuracy 0.900\n", 1180 | "train_loss 0.100 val_loss 0.290 val_accuracy 0.899\n" 1181 | ] 1182 | } 1183 | ], 1184 | "source": [ 1185 | "train_epocs(model, optimizer, epochs=10)" 1186 | ] 1187 | }, 1188 | { 1189 | "cell_type": "markdown", 1190 | "metadata": {}, 1191 | "source": [ 1192 | "## References" 1193 | ] 1194 | }, 1195 | { 1196 | "cell_type": "markdown", 1197 | "metadata": {}, 1198 | "source": [ 1199 | "The CNN is adapted from here https://github.com/junwang4/CNN-sentence-classification-pytorch-2017/blob/master/cnn_pytorch.py.\n", 1200 | "Code for the original paper can be found here https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py." 1201 | ] 1202 | } 1203 | ], 1204 | "metadata": { 1205 | "kernelspec": { 1206 | "display_name": "Python 3", 1207 | "language": "python", 1208 | "name": "python3" 1209 | }, 1210 | "language_info": { 1211 | "codemirror_mode": { 1212 | "name": "ipython", 1213 | "version": 3 1214 | }, 1215 | "file_extension": ".py", 1216 | "mimetype": "text/x-python", 1217 | "name": "python", 1218 | "nbconvert_exporter": "python", 1219 | "pygments_lexer": "ipython3", 1220 | "version": "3.7.3" 1221 | }, 1222 | "toc": { 1223 | "nav_menu": {}, 1224 | "number_sections": true, 1225 | "sideBar": true, 1226 | "skip_h1_title": false, 1227 | "toc_cell": false, 1228 | "toc_position": {}, 1229 | "toc_section_display": "block", 1230 | "toc_window_display": false 1231 | } 1232 | }, 1233 | "nbformat": 4, 1234 | "nbformat_minor": 2 1235 | } 1236 | -------------------------------------------------------------------------------- /lesson10-seq2seq.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Sequence to Sequence\n", 8 | "In this notebook we will be teaching a neural network to translate from French to English." 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "This is made possible by the simple but powerful idea of the [sequence\n", 16 | "to sequence network](https://arxiv.org/abs/1409.3215>), in which two\n", 17 | "recurrent neural networks work together to transform one sequence to\n", 18 | "another. 
An **encoder** network condenses an input sequence into a vector,\n", 19 | "and a **decoder** network unfolds that vector into a new sequence.\n", 20 | "\n", 21 | "![](imgs/seq2seq.png)" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "from __future__ import unicode_literals\n", 31 | "from io import open\n", 32 | "import unicodedata\n", 33 | "import string\n", 34 | "import re\n", 35 | "import random\n", 36 | "\n", 37 | "import torch\n", 38 | "import torch.nn as nn\n", 39 | "from torch import optim\n", 40 | "import torch.nn.functional as F\n", 41 | "from torch.utils.data import Dataset, DataLoader\n", 42 | "import numpy as np\n", 43 | "%matplotlib inline" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## Pre-processing data\n", 51 | "The data for this project is a set of many thousands of English to\n", 52 | "French translation pairs." 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 2, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "def download_dataset():\n", 62 | " ! wget https://download.pytorch.org/tutorial/data.zip\n", 63 | " ! unzip data.zip" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "# to download the dataset\n", 73 | "#download_dataset()" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "We'll need a unique index per word to use as the inputs and targets of\n", 81 | "the networks later. To keep track of all this we will use a helper class\n", 82 | "called ``Lang`` which has word → index (``word2index``) and index → word\n", 83 | "(``index2word``) dictionaries, as well as a count of each word\n", 84 | "``word2count`` to use to later replace rare words." 
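As a quick illustration of how `Lang` (defined in the next cell) accumulates these dictionaries, here is a small usage sketch; the indices assume the four special tokens PAD/SOS/EOS/UNK occupy slots 0-3:

```python
lang = Lang("english")
lang.addSentence("you are here")
lang.addSentence("you are late")

lang.word2index["you"]   # 4  (first word added after the 4 special tokens)
lang.word2count["you"]   # 2  (appears in both sentences)
lang.index2word[6]       # 'here'
lang.n_words             # 8  (4 specials + you, are, here, late)
```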
85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 4, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "SOS_token = 1\n", 94 | "EOS_token = 2\n", 95 | "class Lang:\n", 96 | " def __init__(self, name):\n", 97 | " self.name = name\n", 98 | " self.word2index = {\"PAD\": 0, \"SOS\": 1, \"EOS\": 2, \"UNK\": 3}\n", 99 | " self.word2count = {}\n", 100 | " self.index2word = {0: \"PAD\", 1: \"SOS\", 2: \"EOS\", 3: \"UNK\"}\n", 101 | " self.n_words = 4 # Count SOS and EOS\n", 102 | "\n", 103 | " def addSentence(self, sentence):\n", 104 | " for word in sentence.split(' '):\n", 105 | " self.addWord(word)\n", 106 | "\n", 107 | " def addWord(self, word):\n", 108 | " if word not in self.word2index:\n", 109 | " self.word2index[word] = self.n_words\n", 110 | " self.word2count[word] = 1\n", 111 | " self.index2word[self.n_words] = word\n", 112 | " self.n_words += 1\n", 113 | " else:\n", 114 | " self.word2count[word] += 1" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "The files are all in Unicode, to simplify we will turn Unicode\n", 122 | "characters to ASCII, make everything lowercase, and trim most\n", 123 | "punctuation.\n", 124 | "\n", 125 | "\n" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 5, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "def unicodeToAscii(s):\n", 135 | " \"\"\"Turn a Unicode string to plain ASCII\n", 136 | " \n", 137 | " https://stackoverflow.com/a/518232/2809427\n", 138 | " \"\"\"\n", 139 | " return ''.join(c for c in unicodedata.normalize('NFD', s)\n", 140 | " if unicodedata.category(c) != 'Mn'\n", 141 | " )\n", 142 | "\n", 143 | "def normalizeString(s):\n", 144 | " \"\"\"Lowercase, trim, and remove non-letter characters\"\"\"\n", 145 | " s = unicodeToAscii(s.lower().strip())\n", 146 | " s = re.sub(r\"([.!?])\", r\" \\1\", s)\n", 147 | " s = re.sub(r\"[^a-zA-Z.!?]+\", r\" \", s)\n", 148 | " return s" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 6, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "def readLangs(filename):\n", 158 | " # Read the file and split into lines\n", 159 | " lines = open(filename).read().strip().split('\\n')\n", 160 | "\n", 161 | " # Split every line into pairs and normalize\n", 162 | " pairs = [[normalizeString(s) for s in l.split('\\t')] for l in lines]\n", 163 | "\n", 164 | " return pairs" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 7, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "# filtering some of the data\n", 174 | "MAX_LENGTH = 15\n", 175 | "\n", 176 | "eng_prefixes = (\n", 177 | " \"i am \", \"i m \",\n", 178 | " \"he is\", \"he s \",\n", 179 | " \"she is\", \"she s \",\n", 180 | " \"you are\", \"you re \",\n", 181 | " \"we are\", \"we re \",\n", 182 | " \"they are\", \"they re \"\n", 183 | ")\n", 184 | "\n", 185 | "def filterPair(p):\n", 186 | " return len(p[0].split(' ')) <= MAX_LENGTH and \\\n", 187 | " len(p[1].split(' ')) <= MAX_LENGTH and \\\n", 188 | " p[0].startswith(eng_prefixes)\n", 189 | "\n", 190 | "\n", 191 | "def filterPairs(pairs):\n", 192 | " return [pair for pair in pairs if filterPair(pair)]" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "The full process for preparing the data is:\n", 200 | "\n", 201 | "- Read text file and split into lines, split lines into pairs\n", 202 | "- Normalize text, filter by length and content\n", 203 | 
"- Make word lists from sentences in pairs\n", 204 | "\n", 205 | "\n" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 8, 211 | "metadata": {}, 212 | "outputs": [ 213 | { 214 | "name": "stdout", 215 | "output_type": "stream", 216 | "text": [ 217 | "Read 135842 sentence pairs\n", 218 | "Trimmed to 12898 sentence pairs\n" 219 | ] 220 | } 221 | ], 222 | "source": [ 223 | "pairs = readLangs(\"data/eng-fra.txt\")\n", 224 | "print(\"Read %s sentence pairs\" % len(pairs))\n", 225 | "pairs = filterPairs(pairs)\n", 226 | "print(\"Trimmed to %s sentence pairs\" % len(pairs))" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 9, 232 | "metadata": {}, 233 | "outputs": [ 234 | { 235 | "name": "stdout", 236 | "output_type": "stream", 237 | "text": [ 238 | "Read 135842 sentence pairs\n", 239 | "Trimmed to 12898 sentence pairs\n", 240 | "number of test pairs: 300\n", 241 | "number of train pairs: 12598\n", 242 | "Counting words...\n", 243 | "Counted words:\n", 244 | "english 5070\n", 245 | "french 3331\n", 246 | "['he is too drunk to drive home .', 'il est trop saoul pour conduire jusque chez lui .']\n" 247 | ] 248 | } 249 | ], 250 | "source": [ 251 | "def prepareData(data_filename):\n", 252 | " pairs = readLangs(data_filename)\n", 253 | " print(\"Read %s sentence pairs\" % len(pairs))\n", 254 | " pairs = filterPairs(pairs)\n", 255 | " print(\"Trimmed to %s sentence pairs\" % len(pairs))\n", 256 | " \n", 257 | " \n", 258 | " #randomize the data with a fixed seed for repeatability\n", 259 | " random.seed(4)\n", 260 | " random.shuffle(pairs)\n", 261 | " #choose the first 10 pairs for testing and the rest for training\n", 262 | " valid_pairs = pairs[0:300]\n", 263 | " train_pairs = pairs[300:len(pairs)]\n", 264 | " \n", 265 | " print(\"number of test pairs: %s\" % len(valid_pairs))\n", 266 | " print(\"number of train pairs: %s\" % len(train_pairs))\n", 267 | " \n", 268 | " input_lang = Lang(\"english\")\n", 269 | " output_lang = Lang(\"french\")\n", 270 | " \n", 271 | " print(\"Counting words...\")\n", 272 | " cnt = 0\n", 273 | " for pair in pairs:\n", 274 | " input_lang.addSentence(pair[1])\n", 275 | " output_lang.addSentence(pair[0])\n", 276 | " \n", 277 | " print(\"Counted words:\")\n", 278 | " print(input_lang.name, input_lang.n_words)\n", 279 | " print(output_lang.name, output_lang.n_words)\n", 280 | " return input_lang, output_lang, pairs, train_pairs, valid_pairs\n", 281 | "\n", 282 | "input_lang, output_lang, pairs, train_pairs, valid_pairs = prepareData(\"data/eng-fra.txt\")\n", 283 | "random.seed(4)\n", 284 | "print(random.choice(pairs))" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 10, 290 | "metadata": {}, 291 | "outputs": [ 292 | { 293 | "data": { 294 | "text/plain": [ 295 | "['he is a tennis player .', 'c est un joueur de tennis .']" 296 | ] 297 | }, 298 | "execution_count": 10, 299 | "metadata": {}, 300 | "output_type": "execute_result" 301 | } 302 | ], 303 | "source": [ 304 | "train_pairs[0]" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": {}, 310 | "source": [ 311 | "# Dataset" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 11, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "def encode_sentence(s, vocab2index, N=MAX_LENGTH + 2, padding_start=True):\n", 321 | " enc = np.zeros(N, dtype=np.int32)\n", 322 | " enc1 = np.array([SOS_token] + [vocab2index.get(w, vocab2index[\"UNK\"]) for w in s.split()] + [EOS_token])\n", 323 | " l = 
min(N, len(enc1))\n", 324 | " if padding_start:\n", 325 | " enc[:l] = enc1[:l]\n", 326 | " else:\n", 327 | " enc[N-l:] = enc1[:l]\n", 328 | " return enc, l" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": 12, 334 | "metadata": {}, 335 | "outputs": [ 336 | { 337 | "data": { 338 | "text/plain": [ 339 | "['he is a tennis player .', 'c est un joueur de tennis .']" 340 | ] 341 | }, 342 | "execution_count": 12, 343 | "metadata": {}, 344 | "output_type": "execute_result" 345 | } 346 | ], 347 | "source": [ 348 | "train_pairs[0]" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": 13, 354 | "metadata": {}, 355 | "outputs": [ 356 | { 357 | "data": { 358 | "text/plain": [ 359 | "(array([ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 3, 79,\n", 360 | " 554, 3, 11, 2], dtype=int32), 8)" 361 | ] 362 | }, 363 | "execution_count": 13, 364 | "metadata": {}, 365 | "output_type": "execute_result" 366 | } 367 | ], 368 | "source": [ 369 | "encode_sentence(train_pairs[0][0], input_lang.word2index, padding_start=False)" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 14, 375 | "metadata": {}, 376 | "outputs": [ 377 | { 378 | "data": { 379 | "text/plain": [ 380 | "(array([ 1, 3, 3, 3, 3, 3, 499, 11, 2, 0, 0, 0, 0,\n", 381 | " 0, 0, 0, 0], dtype=int32), 9)" 382 | ] 383 | }, 384 | "execution_count": 14, 385 | "metadata": {}, 386 | "output_type": "execute_result" 387 | } 388 | ], 389 | "source": [ 390 | "encode_sentence(train_pairs[0][1], output_lang.word2index)" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 15, 396 | "metadata": {}, 397 | "outputs": [], 398 | "source": [ 399 | "class PairDataset(Dataset):\n", 400 | " def __init__(self, pairs, input_lang, output_lang):\n", 401 | " self.pairs = pairs\n", 402 | " self.input_word2index = input_lang.word2index\n", 403 | " self.output_word2index = output_lang.word2index\n", 404 | " \n", 405 | " def __len__(self):\n", 406 | " return len(self.pairs)\n", 407 | " \n", 408 | " def __getitem__(self, idx):\n", 409 | " x, n_x = encode_sentence(self.pairs[idx][1], self.input_word2index, padding_start=False)\n", 410 | " y, n_y = encode_sentence(self.pairs[idx][0], self.output_word2index)\n", 411 | " return x, y\n", 412 | " \n", 413 | "train_ds = PairDataset(train_pairs, input_lang, output_lang)\n", 414 | "valid_ds = PairDataset(valid_pairs, input_lang, output_lang)" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": 16, 420 | "metadata": {}, 421 | "outputs": [ 422 | { 423 | "data": { 424 | "text/plain": [ 425 | "(array([ 0, 0, 0, 0, 0, 0, 0, 0, 1, 44, 45, 97, 553,\n", 426 | " 16, 554, 11, 2], dtype=int32),\n", 427 | " array([ 1, 90, 38, 39, 499, 500, 11, 2, 0, 0, 0, 0, 0,\n", 428 | " 0, 0, 0, 0], dtype=int32))" 429 | ] 430 | }, 431 | "execution_count": 16, 432 | "metadata": {}, 433 | "output_type": "execute_result" 434 | } 435 | ], 436 | "source": [ 437 | "train_ds[0]" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 17, 443 | "metadata": {}, 444 | "outputs": [], 445 | "source": [ 446 | "batch_size=5\n", 447 | "train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)\n", 448 | "valid_dl = DataLoader(valid_ds, batch_size=batch_size)" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": {}, 454 | "source": [ 455 | "## The Seq2Seq Model\n", 456 | "\n", 457 | "A Recurrent Neural Network, or RNN, is a network that operates on a\n", 458 | "sequence and uses its own output as input for subsequent 
steps.\n", 459 | "\n", 460 | "A `Sequence to Sequence network `__, or\n", 461 | "seq2seq network, or `Encoder Decoder\n", 462 | "network `__, is a model\n", 463 | "consisting of two RNNs called the encoder and decoder. The encoder reads\n", 464 | "an input sequence and outputs a single vector, and the decoder reads\n", 465 | "that vector to produce an output sequence." 466 | ] 467 | }, 468 | { 469 | "cell_type": "markdown", 470 | "metadata": {}, 471 | "source": [ 472 | "### The Encoder\n", 473 | "\n", 474 | "The encoder of a seq2seq network is a RNN that outputs some value for\n", 475 | "every word from the input sentence. For every input word the encoder\n", 476 | "outputs a vector and a hidden state, and uses the hidden state for the\n", 477 | "next input word.\n", 478 | "\n", 479 | "![](imgs/encoder-network.png)" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 18, 485 | "metadata": {}, 486 | "outputs": [], 487 | "source": [ 488 | "class EncoderRNN(nn.Module):\n", 489 | " def __init__(self, input_size, hidden_size):\n", 490 | " super(EncoderRNN, self).__init__()\n", 491 | " self.hidden_size = hidden_size\n", 492 | "\n", 493 | " self.embedding = nn.Embedding(input_size, hidden_size, padding_idx=0)\n", 494 | " self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)\n", 495 | " self.dropout = nn.Dropout(0.3)\n", 496 | "\n", 497 | " def forward(self, x):\n", 498 | " x = self.embedding(x)\n", 499 | " x = self.dropout(x)\n", 500 | " output, hidden = self.gru(x)\n", 501 | " return output, hidden" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": 19, 507 | "metadata": {}, 508 | "outputs": [], 509 | "source": [ 510 | "x, y = next(iter(train_dl))" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": 20, 516 | "metadata": {}, 517 | "outputs": [ 518 | { 519 | "data": { 520 | "text/plain": [ 521 | "(tensor([[ 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 645, 46,\n", 522 | " 2969, 16, 2970, 11, 2],\n", 523 | " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 30,\n", 524 | " 31, 95, 716, 11, 2],\n", 525 | " [ 0, 0, 0, 0, 0, 0, 1, 44, 45, 127, 195, 45,\n", 526 | " 79, 99, 3434, 11, 2],\n", 527 | " [ 0, 0, 0, 0, 0, 1, 22, 178, 37, 25, 201, 1557,\n", 528 | " 40, 82, 83, 11, 2],\n", 529 | " [ 0, 0, 0, 1, 22, 24, 14, 91, 136, 50, 79, 182,\n", 530 | " 91, 22, 1835, 11, 2]], dtype=torch.int32),\n", 531 | " tensor([[ 1, 4, 38, 567, 39, 1576, 1923, 11, 2, 0, 0, 0,\n", 532 | " 0, 0, 0, 0, 0],\n", 533 | " [ 1, 28, 13, 41, 303, 11, 2, 0, 0, 0, 0, 0,\n", 534 | " 0, 0, 0, 0, 0],\n", 535 | " [ 1, 90, 5, 7, 72, 228, 5, 2449, 44, 11, 2, 0,\n", 536 | " 0, 0, 0, 0, 0],\n", 537 | " [ 1, 17, 18, 23, 41, 992, 15, 66, 11, 2, 0, 0,\n", 538 | " 0, 0, 0, 0, 0],\n", 539 | " [ 1, 17, 18, 96, 111, 678, 28, 17, 73, 650, 11, 2,\n", 540 | " 0, 0, 0, 0, 0]], dtype=torch.int32))" 541 | ] 542 | }, 543 | "execution_count": 20, 544 | "metadata": {}, 545 | "output_type": "execute_result" 546 | } 547 | ], 548 | "source": [ 549 | "x, y" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 33, 555 | "metadata": {}, 556 | "outputs": [], 557 | "source": [ 558 | "input_size = input_lang.n_words\n", 559 | "hidden_size = 300\n", 560 | "encoder = EncoderRNN(input_size, hidden_size)" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 34, 566 | "metadata": {}, 567 | "outputs": [], 568 | "source": [ 569 | "enc_outputs, enc_hidden = encoder(x.long())" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": 35, 575 | 
"metadata": {}, 576 | "outputs": [ 577 | { 578 | "data": { 579 | "text/plain": [ 580 | "(torch.Size([5, 17, 300]), torch.Size([1, 5, 300]))" 581 | ] 582 | }, 583 | "execution_count": 35, 584 | "metadata": {}, 585 | "output_type": "execute_result" 586 | } 587 | ], 588 | "source": [ 589 | "enc_outputs.shape, enc_hidden.shape" 590 | ] 591 | }, 592 | { 593 | "cell_type": "markdown", 594 | "metadata": {}, 595 | "source": [ 596 | "The Decoder\n", 597 | "-----------" 598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "execution_count": 36, 603 | "metadata": {}, 604 | "outputs": [], 605 | "source": [ 606 | "class DecoderRNN(nn.Module):\n", 607 | " def __init__(self, output_size, hidden_size):\n", 608 | " super(DecoderRNN, self).__init__()\n", 609 | "\n", 610 | " self.embedding = nn.Embedding(output_size, hidden_size, padding_idx=0)\n", 611 | " self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)\n", 612 | " self.out = nn.Linear(hidden_size, output_size)\n", 613 | " self.dropout = nn.Dropout(0.3)\n", 614 | "\n", 615 | " def forward(self, x, hidden):\n", 616 | " embedded = self.embedding(x)\n", 617 | " embedded = self.dropout(embedded)\n", 618 | " output, hidden = self.gru(embedded, hidden)\n", 619 | " output = self.out(hidden[-1])\n", 620 | " return output, hidden" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": 37, 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [ 629 | "output_size = output_lang.n_words\n", 630 | "hidden_size = 300" 631 | ] 632 | }, 633 | { 634 | "cell_type": "code", 635 | "execution_count": 38, 636 | "metadata": {}, 637 | "outputs": [ 638 | { 639 | "data": { 640 | "text/plain": [ 641 | "torch.Size([5, 1])" 642 | ] 643 | }, 644 | "execution_count": 38, 645 | "metadata": {}, 646 | "output_type": "execute_result" 647 | } 648 | ], 649 | "source": [ 650 | "batch_size = y.size(0)\n", 651 | "decoder_input = SOS_token*torch.ones(batch_size,1).long()\n", 652 | "decoder_input.shape" 653 | ] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": 39, 658 | "metadata": {}, 659 | "outputs": [], 660 | "source": [ 661 | "decoder = DecoderRNN(output_size, hidden_size)" 662 | ] 663 | }, 664 | { 665 | "cell_type": "code", 666 | "execution_count": 40, 667 | "metadata": {}, 668 | "outputs": [], 669 | "source": [ 670 | "output, hidden = decoder(decoder_input, enc_hidden)" 671 | ] 672 | }, 673 | { 674 | "cell_type": "code", 675 | "execution_count": 41, 676 | "metadata": {}, 677 | "outputs": [ 678 | { 679 | "data": { 680 | "text/plain": [ 681 | "(torch.Size([1, 5, 300]), torch.Size([5, 3331]))" 682 | ] 683 | }, 684 | "execution_count": 41, 685 | "metadata": {}, 686 | "output_type": "execute_result" 687 | } 688 | ], 689 | "source": [ 690 | "hidden.shape, output.shape" 691 | ] 692 | }, 693 | { 694 | "cell_type": "markdown", 695 | "metadata": {}, 696 | "source": [ 697 | "Training\n", 698 | "========" 699 | ] 700 | }, 701 | { 702 | "cell_type": "code", 703 | "execution_count": 42, 704 | "metadata": {}, 705 | "outputs": [], 706 | "source": [ 707 | "def train_batch(x, y, encoder, decoder, encoder_optimizer, decoder_optimizer,\n", 708 | " teacher_forcing_ratio=0.5):\n", 709 | "\n", 710 | " encoder_optimizer.zero_grad()\n", 711 | " decoder_optimizer.zero_grad()\n", 712 | " \n", 713 | " batch_size = y.size(0)\n", 714 | " target_length = y.size(1)\n", 715 | "\n", 716 | " enc_outputs, enc_hidden = encoder(x)\n", 717 | "\n", 718 | " loss = 0\n", 719 | " dec_input = y[:,0].unsqueeze(1) # allways SOS\n", 720 | " hidden = enc_hidden\n", 721 | "\n", 
722 | " use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False\n", 723 | "\n", 724 | " for di in range(1, target_length):\n", 725 | " output, hidden = decoder(dec_input, hidden)\n", 726 | " yi = y[:, di]\n", 727 | " if (yi>0).sum() > 0:\n", 728 | " # ignoring padding\n", 729 | " loss += F.cross_entropy(output, yi, ignore_index = 0, reduction=\"sum\")/(yi>0).sum()\n", 730 | " if use_teacher_forcing:\n", 731 | " dec_input = y[:, di].unsqueeze(1) # Teacher forcing: Feed the target as the next input\n", 732 | " else: \n", 733 | " dec_input = output.argmax(dim=1).unsqueeze(1).detach()\n", 734 | "\n", 735 | " loss.backward()\n", 736 | "\n", 737 | " encoder_optimizer.step()\n", 738 | " decoder_optimizer.step()\n", 739 | "\n", 740 | " return loss.item()" 741 | ] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": 43, 746 | "metadata": {}, 747 | "outputs": [], 748 | "source": [ 749 | "def train(encoder, decoder, enc_optimizer, dec_optimizer, epochs = 10,\n", 750 | " teacher_forcing_ratio=0.5):\n", 751 | " for i in range(epochs):\n", 752 | " total_loss = 0\n", 753 | " total = 0\n", 754 | " encoder.train()\n", 755 | " decoder.train()\n", 756 | " for x, y in train_dl:\n", 757 | " x = x.long().cuda()\n", 758 | " y = y.long().cuda()\n", 759 | " loss = train_batch(x, y, encoder, decoder, enc_optimizer, dec_optimizer,\n", 760 | " teacher_forcing_ratio)\n", 761 | " total_loss = loss*x.size(0)\n", 762 | " total += x.size(0)\n", 763 | " if i%10 == 0:\n", 764 | " print(\"train loss %.3f\" % (total_loss / total)) " 765 | ] 766 | }, 767 | { 768 | "cell_type": "code", 769 | "execution_count": 44, 770 | "metadata": {}, 771 | "outputs": [], 772 | "source": [ 773 | "input_size = input_lang.n_words\n", 774 | "output_size = output_lang.n_words\n", 775 | "hidden_size = 300\n", 776 | "encoder = EncoderRNN(input_size, hidden_size).cuda()\n", 777 | "decoder = DecoderRNN(output_size, hidden_size).cuda()\n", 778 | "enc_optimizer = optim.Adam(encoder.parameters(), lr=0.01)\n", 779 | "dec_optimizer = optim.Adam(decoder.parameters(), lr=0.01) " 780 | ] 781 | }, 782 | { 783 | "cell_type": "code", 784 | "execution_count": 45, 785 | "metadata": {}, 786 | "outputs": [], 787 | "source": [ 788 | "batch_size= 1000\n", 789 | "train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)\n", 790 | "valid_dl = DataLoader(valid_ds, batch_size=batch_size)" 791 | ] 792 | }, 793 | { 794 | "cell_type": "code", 795 | "execution_count": 46, 796 | "metadata": {}, 797 | "outputs": [ 798 | { 799 | "name": "stdout", 800 | "output_type": "stream", 801 | "text": [ 802 | "train loss 2.365\n", 803 | "train loss 1.618\n" 804 | ] 805 | } 806 | ], 807 | "source": [ 808 | "train(encoder, decoder, enc_optimizer, dec_optimizer, epochs = 20)" 809 | ] 810 | }, 811 | { 812 | "cell_type": "code", 813 | "execution_count": 47, 814 | "metadata": {}, 815 | "outputs": [ 816 | { 817 | "name": "stdout", 818 | "output_type": "stream", 819 | "text": [ 820 | "train loss 1.128\n", 821 | "train loss 1.058\n", 822 | "train loss 0.338\n", 823 | "train loss 0.299\n" 824 | ] 825 | } 826 | ], 827 | "source": [ 828 | "enc_optimizer = optim.Adam(encoder.parameters(), lr=0.001)\n", 829 | "dec_optimizer = optim.Adam(decoder.parameters(), lr=0.001) \n", 830 | "train(encoder, decoder, enc_optimizer, dec_optimizer, epochs = 40)" 831 | ] 832 | }, 833 | { 834 | "cell_type": "code", 835 | "execution_count": 48, 836 | "metadata": {}, 837 | "outputs": [ 838 | { 839 | "name": "stdout", 840 | "output_type": "stream", 841 | "text": [ 842 | 
"train loss 0.818\n", 843 | "train loss 0.836\n", 844 | "train loss 0.806\n", 845 | "train loss 0.738\n", 846 | "train loss 0.808\n", 847 | "train loss 0.657\n", 848 | "train loss 0.688\n", 849 | "train loss 0.580\n", 850 | "train loss 0.561\n", 851 | "train loss 0.540\n", 852 | "train loss 0.636\n", 853 | "train loss 0.624\n", 854 | "train loss 0.588\n", 855 | "train loss 0.484\n", 856 | "train loss 0.490\n", 857 | "train loss 0.501\n", 858 | "train loss 0.489\n", 859 | "train loss 0.463\n", 860 | "train loss 0.445\n", 861 | "train loss 0.478\n", 862 | "train loss 0.389\n", 863 | "train loss 0.412\n", 864 | "train loss 0.348\n", 865 | "train loss 0.462\n", 866 | "train loss 0.416\n", 867 | "train loss 0.544\n", 868 | "train loss 0.332\n", 869 | "train loss 0.380\n", 870 | "train loss 0.500\n", 871 | "train loss 0.417\n" 872 | ] 873 | } 874 | ], 875 | "source": [ 876 | "train(encoder, decoder, enc_optimizer, dec_optimizer, epochs = 300, teacher_forcing_ratio=0.0)" 877 | ] 878 | }, 879 | { 880 | "cell_type": "code", 881 | "execution_count": 49, 882 | "metadata": {}, 883 | "outputs": [ 884 | { 885 | "name": "stdout", 886 | "output_type": "stream", 887 | "text": [ 888 | "train loss 0.335\n", 889 | "train loss 0.292\n", 890 | "train loss 0.306\n", 891 | "train loss 0.311\n", 892 | "train loss 0.280\n", 893 | "train loss 0.345\n", 894 | "train loss 0.372\n", 895 | "train loss 0.290\n", 896 | "train loss 0.262\n", 897 | "train loss 0.355\n", 898 | "train loss 0.258\n", 899 | "train loss 0.352\n", 900 | "train loss 0.252\n", 901 | "train loss 0.444\n", 902 | "train loss 0.236\n", 903 | "train loss 0.238\n", 904 | "train loss 0.257\n", 905 | "train loss 0.266\n", 906 | "train loss 0.240\n", 907 | "train loss 0.237\n", 908 | "train loss 0.248\n", 909 | "train loss 0.321\n", 910 | "train loss 0.247\n", 911 | "train loss 0.169\n", 912 | "train loss 0.208\n", 913 | "train loss 0.207\n", 914 | "train loss 0.206\n", 915 | "train loss 0.244\n", 916 | "train loss 0.198\n", 917 | "train loss 0.172\n" 918 | ] 919 | } 920 | ], 921 | "source": [ 922 | "train(encoder, decoder, enc_optimizer, dec_optimizer, epochs = 300, teacher_forcing_ratio=0.0)" 923 | ] 924 | }, 925 | { 926 | "cell_type": "markdown", 927 | "metadata": {}, 928 | "source": [ 929 | "Evaluation\n", 930 | "==========\n", 931 | "\n", 932 | "Evaluation is mostly the same as training, but there are no targets so\n", 933 | "we simply feed the decoder's predictions back to itself for each step.\n", 934 | "Every time it predicts a word we add it to the output string, and if it\n", 935 | "predicts the EOS token we stop there. We also store the decoder's\n", 936 | "attention outputs for display later.\n", 937 | "\n", 938 | "\n" 939 | ] 940 | }, 941 | { 942 | "cell_type": "markdown", 943 | "metadata": {}, 944 | "source": [ 945 | "* `model.eval()` will notify all your layers that you are in eval mode, that way, batchnorm or dropout layers will work in eval mode instead of training mode.\n", 946 | "* `torch.no_grad()` impacts the autograd engine and deactivate it. It will reduce memory usage and speed up computations but you won’t be able to backprop (which you don’t want in an eval script)." 
947 | ] 948 | }, 949 | { 950 | "cell_type": "code", 951 | "execution_count": 50, 952 | "metadata": {}, 953 | "outputs": [], 954 | "source": [ 955 | "def decoding(x, y, encoder, decoder, max_length=MAX_LENGTH+2):\n", 956 | " decoder = decoder.eval()\n", 957 | " loss = 0\n", 958 | " with torch.no_grad(): \n", 959 | " batch_size = x.size(0)\n", 960 | " enc_outputs, hidden = encoder(x)\n", 961 | " dec_input = SOS_token*torch.ones(batch_size, 1).long().cuda() # SOS\n", 962 | " decoded_words = []\n", 963 | " for di in range(1, max_length):\n", 964 | " output, hidden = decoder(dec_input, hidden)\n", 965 | " pred = output.argmax(dim=1)\n", 966 | " decoded_words.append(pred.cpu().numpy())\n", 967 | " dec_input = output.argmax(dim=1).unsqueeze(1).detach()\n", 968 | " yi = y[:, di]\n", 969 | " if (yi>0).sum() > 0:\n", 970 | " # ignoring padding\n", 971 | " loss += F.cross_entropy(\n", 972 | " output, yi, ignore_index = 0, reduction=\"sum\")/(yi>0).sum()\n", 973 | " return loss.item()/batch_size, np.transpose(decoded_words)" 974 | ] 975 | }, 976 | { 977 | "cell_type": "code", 978 | "execution_count": 51, 979 | "metadata": {}, 980 | "outputs": [ 981 | { 982 | "data": { 983 | "text/plain": [ 984 | "0.14845184326171876" 985 | ] 986 | }, 987 | "execution_count": 51, 988 | "metadata": {}, 989 | "output_type": "execute_result" 990 | } 991 | ], 992 | "source": [ 993 | "batch_size=300\n", 994 | "valid_dl_2 = DataLoader(valid_ds, batch_size=batch_size, shuffle=True)\n", 995 | "\n", 996 | "x, y = next(iter(valid_dl_2)) \n", 997 | "x = x.long().cuda()\n", 998 | "y = y.long().cuda()\n", 999 | "\n", 1000 | "loss, _ = decoding(x, y, encoder, decoder)\n", 1001 | "loss" 1002 | ] 1003 | }, 1004 | { 1005 | "cell_type": "code", 1006 | "execution_count": 52, 1007 | "metadata": {}, 1008 | "outputs": [], 1009 | "source": [ 1010 | "batch_size=5\n", 1011 | "train_dl_2 = DataLoader(train_ds, batch_size=batch_size, shuffle=True)\n", 1012 | "\n", 1013 | "x, y = next(iter(train_dl_2)) \n", 1014 | "x = x.long().cuda()\n", 1015 | "y = y.long().cuda()" 1016 | ] 1017 | }, 1018 | { 1019 | "cell_type": "markdown", 1020 | "metadata": {}, 1021 | "source": [ 1022 | "We can evaluate random sentences from the training set and print out the\n", 1023 | "input, target, and output to make some subjective quality judgements:\n", 1024 | "\n", 1025 | "\n" 1026 | ] 1027 | }, 1028 | { 1029 | "cell_type": "code", 1030 | "execution_count": 53, 1031 | "metadata": {}, 1032 | "outputs": [], 1033 | "source": [ 1034 | "def print_results(x, y, encoder, decoder):\n", 1035 | " _, decoded_words = decoding(x, y, encoder, decoder)\n", 1036 | " for i in range(x.shape[0]):\n", 1037 | " xi = x[i].cpu().numpy()\n", 1038 | " yi = y[i].cpu().numpy()\n", 1039 | " y_hat = decoded_words[i]\n", 1040 | " x_sent = ' '.join([input_lang.index2word[t] for t in xi if t > 3])\n", 1041 | " y_sent = ' '.join([output_lang.index2word[t] for t in yi if t > 3])\n", 1042 | " y_hat_sent = ' '.join([output_lang.index2word[t] for t in y_hat if t > 3])\n", 1043 | " print('>', x_sent)\n", 1044 | " print('=', y_sent)\n", 1045 | " print('<', y_hat_sent)\n", 1046 | " print('')" 1047 | ] 1048 | }, 1049 | { 1050 | "cell_type": "code", 1051 | "execution_count": 54, 1052 | "metadata": { 1053 | "scrolled": true 1054 | }, 1055 | "outputs": [ 1056 | { 1057 | "name": "stdout", 1058 | "output_type": "stream", 1059 | "text": [ 1060 | "> je suis quelqu un de bien .\n", 1061 | "= i m a nice guy .\n", 1062 | "< i m a nice guy .\n", 1063 | "\n", 1064 | "> vous etes rusee .\n", 1065 | "= you re crafty .\n", 
1066 | "< you re crafty .\n", 1067 | "\n", 1068 | "> je suis un peu desoriente .\n", 1069 | "= i m a little confused .\n", 1070 | "< i m a little confused .\n", 1071 | "\n", 1072 | "> actuellement je me trouve a l aeroport de narita .\n", 1073 | "= i m at narita airport right now .\n", 1074 | "< i m at narita airport right now .\n", 1075 | "\n", 1076 | "> je suis juste ici .\n", 1077 | "= i m right here .\n", 1078 | "< i m just here .\n", 1079 | "\n" 1080 | ] 1081 | } 1082 | ], 1083 | "source": [ 1084 | "print_results(x, y, encoder, decoder)" 1085 | ] 1086 | }, 1087 | { 1088 | "cell_type": "code", 1089 | "execution_count": 55, 1090 | "metadata": {}, 1091 | "outputs": [], 1092 | "source": [ 1093 | "batch_size=10\n", 1094 | "valid_dl_2 = DataLoader(valid_ds, batch_size=batch_size, shuffle=True)\n", 1095 | "\n", 1096 | "x, y = next(iter(valid_dl_2)) \n", 1097 | "x = x.long().cuda()\n", 1098 | "y = y.long().cuda()" 1099 | ] 1100 | }, 1101 | { 1102 | "cell_type": "code", 1103 | "execution_count": 56, 1104 | "metadata": {}, 1105 | "outputs": [ 1106 | { 1107 | "name": "stdout", 1108 | "output_type": "stream", 1109 | "text": [ 1110 | "> je ne suis pas cette sorte de fille .\n", 1111 | "= i m not that kind of girl .\n", 1112 | "< i m not in a of girl .\n", 1113 | "\n", 1114 | "> nous sommes en securite ici .\n", 1115 | "= we re safe here .\n", 1116 | "< we re here here . couple .\n", 1117 | "\n", 1118 | "> ils le font correctement .\n", 1119 | "= they re doing it right .\n", 1120 | "< they re doing it right .\n", 1121 | "\n", 1122 | "> j ai raison .\n", 1123 | "= i m right .\n", 1124 | "< i m correct .\n", 1125 | "\n", 1126 | "> vous etes tres avises .\n", 1127 | "= you re very wise .\n", 1128 | "< you re very wise .\n", 1129 | "\n", 1130 | "> nous nous marions .\n", 1131 | "= we re getting married .\n", 1132 | "< we re undressing .\n", 1133 | "\n", 1134 | "> j ai une mauvaise impression .\n", 1135 | "= i m getting a bad feeling .\n", 1136 | "< i m a of . .\n", 1137 | "\n", 1138 | "> ce n est pas le genre de type a abandonner facilement .\n", 1139 | "= he is not the sort of guy who gives in easily .\n", 1140 | "< he s not very well off at at s . .\n", 1141 | "\n", 1142 | "> c est un homme cruel .\n", 1143 | "= he is a cruel person .\n", 1144 | "< he s a man of .\n", 1145 | "\n", 1146 | "> ils ne sont pas plus semblables qu une vache a un canari .\n", 1147 | "= they are no more alike than a cow and a canary .\n", 1148 | "< they are as a rock rock band in . .\n", 1149 | "\n" 1150 | ] 1151 | } 1152 | ], 1153 | "source": [ 1154 | "print_results(x, y, encoder, decoder)" 1155 | ] 1156 | }, 1157 | { 1158 | "cell_type": "markdown", 1159 | "metadata": {}, 1160 | "source": [ 1161 | "## Exercise\n", 1162 | "- Replace the embeddings with pre-trained word embeddings. 
Here are word embeddings for various languages.\n", 1163 | "\n", 1164 | "https://fasttext.cc/docs/en/crawl-vectors.html " 1165 | ] 1166 | }, 1167 | { 1168 | "cell_type": "markdown", 1169 | "metadata": {}, 1170 | "source": [ 1171 | "# Credits\n", 1172 | "The original notebook was written by Sean Robertson _" 1173 | ] 1174 | }, 1175 | { 1176 | "cell_type": "code", 1177 | "execution_count": null, 1178 | "metadata": {}, 1179 | "outputs": [], 1180 | "source": [] 1181 | } 1182 | ], 1183 | "metadata": { 1184 | "kernelspec": { 1185 | "display_name": "Python 3", 1186 | "language": "python", 1187 | "name": "python3" 1188 | }, 1189 | "language_info": { 1190 | "codemirror_mode": { 1191 | "name": "ipython", 1192 | "version": 3 1193 | }, 1194 | "file_extension": ".py", 1195 | "mimetype": "text/x-python", 1196 | "name": "python", 1197 | "nbconvert_exporter": "python", 1198 | "pygments_lexer": "ipython3", 1199 | "version": "3.6.8" 1200 | }, 1201 | "toc": { 1202 | "nav_menu": {}, 1203 | "number_sections": true, 1204 | "sideBar": true, 1205 | "skip_h1_title": false, 1206 | "toc_cell": false, 1207 | "toc_position": {}, 1208 | "toc_section_display": "block", 1209 | "toc_window_display": false 1210 | } 1211 | }, 1212 | "nbformat": 4, 1213 | "nbformat_minor": 1 1214 | } 1215 | -------------------------------------------------------------------------------- /lesson1-cbow.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# import pytorch libraries\n", 10 | "%matplotlib inline\n", 11 | "import torch \n", 12 | "import torch.autograd as autograd \n", 13 | "import torch.nn as nn \n", 14 | "import torch.nn.functional as F\n", 15 | "import torch.optim as optim\n", 16 | "import numpy as np" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "# Continuous bag of words (CBOW) model for text classification\n", 24 | "This notebook shows how to use a continuous bag of words (CBOW) model with Pytorch. The task is a text classification problem described [here]( https://people.cs.umass.edu/~miyyer/pubs/2015_acl_dan.pdf). The CBOW model was first described [here](https://arxiv.org/pdf/1301.3781.pdf)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Subjectivity Dataset\n", 32 | "The subjectivity dataset has 5000 subjective and 5000 objective processed sentences. To get the data:\n", 33 | "```\n", 34 | "wget http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz\n", 35 | "```" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "def unpack_dataset():\n", 45 | " ! wget http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz\n", 46 | " ! mkdir data\n", 47 | " ! tar -xvf rotten_imdb.tar.gz -C data" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "--2020-02-19 11:49:47-- http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz\n", 60 | "Resolving www.cs.cornell.edu (www.cs.cornell.edu)... 132.236.207.20\n", 61 | "Connecting to www.cs.cornell.edu (www.cs.cornell.edu)|132.236.207.20|:80... connected.\n", 62 | "HTTP request sent, awaiting response... 
200 OK\n", 63 | "Length: 519599 (507K) [application/x-gzip]\n", 64 | "Saving to: ‘rotten_imdb.tar.gz’\n", 65 | "\n", 66 | "rotten_imdb.tar.gz 100%[===================>] 507.42K 558KB/s in 0.9s \n", 67 | "\n", 68 | "2020-02-19 11:49:49 (558 KB/s) - ‘rotten_imdb.tar.gz’ saved [519599/519599]\n", 69 | "\n", 70 | "x quote.tok.gt9.5000\n", 71 | "x plot.tok.gt9.5000\n", 72 | "x subjdata.README.1.0\n" 73 | ] 74 | } 75 | ], 76 | "source": [ 77 | "unpack_dataset()" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 72, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "name": "stdout", 87 | "output_type": "stream", 88 | "text": [ 89 | "plot.tok.gt9.5000 quote.tok.gt9.5000 subjdata.README.1.0\r\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "!ls data" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 73, 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | "the movie begins in the past where a young boy named sam attempts to save celebi from a hunter . \r\n", 107 | "emerging from the human psyche and showing characteristics of abstract expressionism , minimalism and russian constructivism , graffiti removal has secured its place in the history of modern art while being created by artists who are unconscious of their artistic achievements . \r\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "! head -2 data/plot.tok.gt9.5000" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 74, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "[PosixPath('data/plot.tok.gt9.5000'),\n", 124 | " PosixPath('data/subjdata.README.1.0'),\n", 125 | " PosixPath('data/quote.tok.gt9.5000')]" 126 | ] 127 | }, 128 | "execution_count": 74, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | "source": [ 134 | "from pathlib import Path\n", 135 | "PATH = Path(\"data\")\n", 136 | "list(PATH.iterdir())" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "## Tokenization\n", 144 | "Tokenization is the task of chopping up text into pieces, called tokens.\n", 145 | "\n", 146 | "spaCy is an open-source software library for advanced Natural Language Processing. Here we will use it for tokenization. " 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "### Simple Tokenization" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 75, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "# We need each line in the file \n", 163 | "def read_file(path):\n", 164 | " \"\"\" Read file returns a list of lines.\n", 165 | " \"\"\"\n", 166 | " with open(path, encoding = \"ISO-8859-1\") as f:\n", 167 | " content = f.readlines()\n", 168 | " return content" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 76, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "obj_lines = read_file(PATH/\"plot.tok.gt9.5000\")" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 77, 183 | "metadata": {}, 184 | "outputs": [ 185 | { 186 | "data": { 187 | "text/plain": [ 188 | "'the movie begins in the past where a young boy named sam attempts to save celebi from a hunter . 
\\n'" 189 | ] 190 | }, 191 | "execution_count": 77, 192 | "metadata": {}, 193 | "output_type": "execute_result" 194 | } 195 | ], 196 | "source": [ 197 | "obj_lines[0]" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 78, 203 | "metadata": {}, 204 | "outputs": [ 205 | { 206 | "data": { 207 | "text/plain": [ 208 | "array(['the', 'movie', 'begins', 'in', 'the', 'past', 'where', 'a',\n", 209 | " 'young', 'boy', 'named', 'sam', 'attempts', 'to', 'save', 'celebi',\n", 210 | " 'from', 'a', 'hunter', '.'], dtype='\":0, \"UNK\":1} # init with padding and unknown\n", 548 | "words = [\"\", \"UNK\"]\n", 549 | "for word in word_count:\n", 550 | " vocab2index[word] = len(words)\n", 551 | " words.append(word)" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": 101, 557 | "metadata": {}, 558 | "outputs": [], 559 | "source": [ 560 | "#vocab2index" 561 | ] 562 | }, 563 | { 564 | "cell_type": "markdown", 565 | "metadata": {}, 566 | "source": [ 567 | "## Sentence encoding\n", 568 | "Here we encode each sentence as a sequence of indices corresponding to each word." 569 | ] 570 | }, 571 | { 572 | "cell_type": "code", 573 | "execution_count": 102, 574 | "metadata": {}, 575 | "outputs": [], 576 | "source": [ 577 | "x_train_len = np.array([len(x.split()) for x in X_train])\n", 578 | "x_val_len = np.array([len(x.split()) for x in X_val])" 579 | ] 580 | }, 581 | { 582 | "cell_type": "code", 583 | "execution_count": 103, 584 | "metadata": {}, 585 | "outputs": [ 586 | { 587 | "data": { 588 | "text/plain": [ 589 | "43.0" 590 | ] 591 | }, 592 | "execution_count": 103, 593 | "metadata": {}, 594 | "output_type": "execute_result" 595 | } 596 | ], 597 | "source": [ 598 | "np.percentile(x_train_len, 95) # let set the max sequence len to N=40" 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": 104, 604 | "metadata": {}, 605 | "outputs": [ 606 | { 607 | "data": { 608 | "text/plain": [ 609 | "'will god let her fall or give her a new path ?'" 610 | ] 611 | }, 612 | "execution_count": 104, 613 | "metadata": {}, 614 | "output_type": "execute_result" 615 | } 616 | ], 617 | "source": [ 618 | "X_train[0]" 619 | ] 620 | }, 621 | { 622 | "cell_type": "code", 623 | "execution_count": 105, 624 | "metadata": {}, 625 | "outputs": [ 626 | { 627 | "data": { 628 | "text/plain": [ 629 | "7" 630 | ] 631 | }, 632 | "execution_count": 105, 633 | "metadata": {}, 634 | "output_type": "execute_result" 635 | } 636 | ], 637 | "source": [ 638 | "# returns the index of the word or the index of \"UNK\" otherwise\n", 639 | "vocab2index.get(\"?\", vocab2index[\"UNK\"])" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": 106, 645 | "metadata": {}, 646 | "outputs": [ 647 | { 648 | "data": { 649 | "text/plain": [ 650 | "array([ 4, 3, 5, 11, 12, 8, 9, 11, 10, 2, 6, 7])" 651 | ] 652 | }, 653 | "execution_count": 106, 654 | "metadata": {}, 655 | "output_type": "execute_result" 656 | } 657 | ], 658 | "source": [ 659 | "np.array([vocab2index.get(w, vocab2index[\"UNK\"]) for w in X_train[0].split()])" 660 | ] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "execution_count": 107, 665 | "metadata": {}, 666 | "outputs": [], 667 | "source": [ 668 | "def encode_sentence(s, N=40):\n", 669 | " enc = np.zeros(N, dtype=np.int32)\n", 670 | " enc1 = np.array([vocab2index.get(w, vocab2index[\"UNK\"]) for w in s.split()])\n", 671 | " l = min(N, len(enc1))\n", 672 | " enc[:l] = enc1[:l]\n", 673 | " return enc" 674 | ] 675 | }, 676 | { 677 | "cell_type": "code", 678 | 
"execution_count": 108, 679 | "metadata": {}, 680 | "outputs": [ 681 | { 682 | "data": { 683 | "text/plain": [ 684 | "array([ 4, 3, 5, 11, 12, 8, 9, 11, 10, 2, 6, 7, 0, 0, 0, 0, 0,\n", 685 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 686 | " 0, 0, 0, 0, 0, 0], dtype=int32)" 687 | ] 688 | }, 689 | "execution_count": 108, 690 | "metadata": {}, 691 | "output_type": "execute_result" 692 | } 693 | ], 694 | "source": [ 695 | "encode_sentence(X_train[0])" 696 | ] 697 | }, 698 | { 699 | "cell_type": "code", 700 | "execution_count": 109, 701 | "metadata": {}, 702 | "outputs": [], 703 | "source": [ 704 | "x_train_len = np.minimum(x_train_len, 40)\n", 705 | "x_val_len = np.minimum(x_val_len, 40)" 706 | ] 707 | }, 708 | { 709 | "cell_type": "code", 710 | "execution_count": 110, 711 | "metadata": {}, 712 | "outputs": [ 713 | { 714 | "data": { 715 | "text/plain": [ 716 | "(8000, 40)" 717 | ] 718 | }, 719 | "execution_count": 110, 720 | "metadata": {}, 721 | "output_type": "execute_result" 722 | } 723 | ], 724 | "source": [ 725 | "x_train = np.vstack([encode_sentence(x) for x in X_train])\n", 726 | "x_train.shape" 727 | ] 728 | }, 729 | { 730 | "cell_type": "code", 731 | "execution_count": 111, 732 | "metadata": {}, 733 | "outputs": [ 734 | { 735 | "data": { 736 | "text/plain": [ 737 | "(2000, 40)" 738 | ] 739 | }, 740 | "execution_count": 111, 741 | "metadata": {}, 742 | "output_type": "execute_result" 743 | } 744 | ], 745 | "source": [ 746 | "x_val = np.vstack([encode_sentence(x) for x in X_val])\n", 747 | "x_val.shape" 748 | ] 749 | }, 750 | { 751 | "cell_type": "markdown", 752 | "metadata": {}, 753 | "source": [ 754 | "## Embedding layer\n", 755 | "Most deep learning models use a dense vectors of real numbers as representation of words (word embeddings), as opposed to a one-hot encoding representations. The module torch.nn.Embedding is used to represent word embeddings. It takes two arguments: the vocabulary size, and the dimensionality of the embeddings. The embeddings are initialized with random vectors. " 756 | ] 757 | }, 758 | { 759 | "cell_type": "code", 760 | "execution_count": 112, 761 | "metadata": {}, 762 | "outputs": [ 763 | { 764 | "data": { 765 | "text/plain": [ 766 | "Parameter containing:\n", 767 | "tensor([[ 0.0000, 0.0000, 0.0000, 0.0000],\n", 768 | " [-0.6142, 0.2136, -0.6799, 0.6064],\n", 769 | " [ 1.3913, 0.3764, -0.9674, -0.9030],\n", 770 | " [-0.1504, -0.0164, 2.1774, 0.8860],\n", 771 | " [ 1.4216, -0.4580, 1.3691, 0.8674],\n", 772 | " [-0.7108, 1.2483, 0.5496, -1.5263],\n", 773 | " [-1.2288, -0.6853, 0.2598, -0.9845],\n", 774 | " [ 0.4001, 0.3452, 0.5711, 0.5329],\n", 775 | " [ 0.5904, -0.7116, -0.1716, -0.2356],\n", 776 | " [ 1.4376, 1.5275, -0.4301, 0.3887]], requires_grad=True)" 777 | ] 778 | }, 779 | "execution_count": 112, 780 | "metadata": {}, 781 | "output_type": "execute_result" 782 | } 783 | ], 784 | "source": [ 785 | "# an Embedding module containing 10 words with embedding size 4\n", 786 | "# embedding will be initialized at random\n", 787 | "embed = nn.Embedding(10, 4, padding_idx=0)\n", 788 | "embed.weight" 789 | ] 790 | }, 791 | { 792 | "cell_type": "markdown", 793 | "metadata": {}, 794 | "source": [ 795 | "Note that the `padding_idx` has embedding vector 0." 
796 | ] 797 | }, 798 | { 799 | "cell_type": "code", 800 | "execution_count": 113, 801 | "metadata": {}, 802 | "outputs": [ 803 | { 804 | "data": { 805 | "text/plain": [ 806 | "tensor([[[-0.6142, 0.2136, -0.6799, 0.6064],\n", 807 | " [ 1.4216, -0.4580, 1.3691, 0.8674],\n", 808 | " [-0.6142, 0.2136, -0.6799, 0.6064],\n", 809 | " [-0.7108, 1.2483, 0.5496, -1.5263],\n", 810 | " [-0.6142, 0.2136, -0.6799, 0.6064],\n", 811 | " [ 0.0000, 0.0000, 0.0000, 0.0000]]], grad_fn=)" 812 | ] 813 | }, 814 | "execution_count": 113, 815 | "metadata": {}, 816 | "output_type": "execute_result" 817 | } 818 | ], 819 | "source": [ 820 | "# given a list of ids we can \"look up\" the embedding corresponing to each id\n", 821 | "# can you see that some vectors are the same?\n", 822 | "a = torch.LongTensor([[1,4,1,5,1,0]])\n", 823 | "embed(a)" 824 | ] 825 | }, 826 | { 827 | "cell_type": "markdown", 828 | "metadata": {}, 829 | "source": [ 830 | "This would be the representation of a sentence with words with indices [1,4,1,5,1] and a padding at the end. Bellow we have an example in which we have two sentences. the first sentence has length 3 and the last sentence has length 2. In order to use a tensor we use padding at the end of the second sentence. " 831 | ] 832 | }, 833 | { 834 | "cell_type": "code", 835 | "execution_count": 114, 836 | "metadata": {}, 837 | "outputs": [], 838 | "source": [ 839 | "a = torch.LongTensor([[1,4,1], [1,3,0]])" 840 | ] 841 | }, 842 | { 843 | "cell_type": "markdown", 844 | "metadata": {}, 845 | "source": [ 846 | "Our model takes an average of the word embedding of each word. Here is how we do it." 847 | ] 848 | }, 849 | { 850 | "cell_type": "code", 851 | "execution_count": 115, 852 | "metadata": {}, 853 | "outputs": [], 854 | "source": [ 855 | "s = torch.FloatTensor([3, 2]) # here is the size of the vector" 856 | ] 857 | }, 858 | { 859 | "cell_type": "code", 860 | "execution_count": 116, 861 | "metadata": {}, 862 | "outputs": [ 863 | { 864 | "data": { 865 | "text/plain": [ 866 | "tensor([[[-0.6142, 0.2136, -0.6799, 0.6064],\n", 867 | " [ 1.4216, -0.4580, 1.3691, 0.8674],\n", 868 | " [-0.6142, 0.2136, -0.6799, 0.6064]],\n", 869 | "\n", 870 | " [[-0.6142, 0.2136, -0.6799, 0.6064],\n", 871 | " [-0.1504, -0.0164, 2.1774, 0.8860],\n", 872 | " [ 0.0000, 0.0000, 0.0000, 0.0000]]], grad_fn=)" 873 | ] 874 | }, 875 | "execution_count": 116, 876 | "metadata": {}, 877 | "output_type": "execute_result" 878 | } 879 | ], 880 | "source": [ 881 | "embed(a)" 882 | ] 883 | }, 884 | { 885 | "cell_type": "code", 886 | "execution_count": 117, 887 | "metadata": {}, 888 | "outputs": [ 889 | { 890 | "data": { 891 | "text/plain": [ 892 | "tensor([[ 0.1932, -0.0307, 0.0092, 2.0803],\n", 893 | " [-0.7646, 0.1972, 1.4974, 1.4924]], grad_fn=)" 894 | ] 895 | }, 896 | "execution_count": 117, 897 | "metadata": {}, 898 | "output_type": "execute_result" 899 | } 900 | ], 901 | "source": [ 902 | "embed(a).sum(dim=1)" 903 | ] 904 | }, 905 | { 906 | "cell_type": "code", 907 | "execution_count": 118, 908 | "metadata": {}, 909 | "outputs": [ 910 | { 911 | "data": { 912 | "text/plain": [ 913 | "tensor([[ 0.0644, -0.0102, 0.0031, 0.6934],\n", 914 | " [-0.3823, 0.0986, 0.7487, 0.7462]], grad_fn=)" 915 | ] 916 | }, 917 | "execution_count": 118, 918 | "metadata": {}, 919 | "output_type": "execute_result" 920 | } 921 | ], 922 | "source": [ 923 | "sum_embs = embed(a).sum(dim=1) \n", 924 | "sum_embs/ s.view(s.shape[0], 1)" 925 | ] 926 | }, 927 | { 928 | "cell_type": "markdown", 929 | "metadata": {}, 930 | "source": [ 931 | "## Continuous 
Bag of Words Model" 932 | ] 933 | }, 934 | { 935 | "cell_type": "code", 936 | "execution_count": 119, 937 | "metadata": {}, 938 | "outputs": [], 939 | "source": [ 940 | "class CBOW(nn.Module):\n", 941 | " def __init__(self, vocab_size, emb_size=100):\n", 942 | " super(CBOW, self).__init__()\n", 943 | " self.word_emb = nn.Embedding(vocab_size, emb_size, padding_idx=0)\n", 944 | " self.linear = nn.Linear(emb_size, 1)\n", 945 | " \n", 946 | " def forward(self, x, s):\n", 947 | " x = self.word_emb(x)\n", 948 | " x = x.sum(dim=1)/ s\n", 949 | " x = self.linear(x)\n", 950 | " return x" 951 | ] 952 | }, 953 | { 954 | "cell_type": "code", 955 | "execution_count": 120, 956 | "metadata": {}, 957 | "outputs": [], 958 | "source": [ 959 | "model = CBOW(vocab_size=5, emb_size=3)" 960 | ] 961 | }, 962 | { 963 | "cell_type": "code", 964 | "execution_count": 121, 965 | "metadata": {}, 966 | "outputs": [ 967 | { 968 | "data": { 969 | "text/plain": [ 970 | "Parameter containing:\n", 971 | "tensor([[ 0.0000, 0.0000, 0.0000],\n", 972 | " [ 0.4837, 0.0118, -1.5768],\n", 973 | " [ 0.5992, 0.5553, 0.8514],\n", 974 | " [ 0.8974, -0.0957, 0.5334],\n", 975 | " [-1.9593, -0.5764, -0.4522]], requires_grad=True)" 976 | ] 977 | }, 978 | "execution_count": 121, 979 | "metadata": {}, 980 | "output_type": "execute_result" 981 | } 982 | ], 983 | "source": [ 984 | "model.word_emb.weight" 985 | ] 986 | }, 987 | { 988 | "cell_type": "code", 989 | "execution_count": 122, 990 | "metadata": {}, 991 | "outputs": [ 992 | { 993 | "data": { 994 | "text/plain": [ 995 | "tensor([[-0.9028],\n", 996 | " [-0.7842]], grad_fn=)" 997 | ] 998 | }, 999 | "execution_count": 122, 1000 | "metadata": {}, 1001 | "output_type": "execute_result" 1002 | } 1003 | ], 1004 | "source": [ 1005 | "s = s.view(s.shape[0], 1)\n", 1006 | "model(a, s)" 1007 | ] 1008 | }, 1009 | { 1010 | "cell_type": "markdown", 1011 | "metadata": {}, 1012 | "source": [ 1013 | "# Training the CBOW model " 1014 | ] 1015 | }, 1016 | { 1017 | "cell_type": "code", 1018 | "execution_count": 123, 1019 | "metadata": {}, 1020 | "outputs": [ 1021 | { 1022 | "name": "stdout", 1023 | "output_type": "stream", 1024 | "text": [ 1025 | "4067\n" 1026 | ] 1027 | } 1028 | ], 1029 | "source": [ 1030 | "V = len(words)\n", 1031 | "model = CBOW(vocab_size=V, emb_size=50)\n", 1032 | "print(V)" 1033 | ] 1034 | }, 1035 | { 1036 | "cell_type": "code", 1037 | "execution_count": 124, 1038 | "metadata": {}, 1039 | "outputs": [], 1040 | "source": [ 1041 | "def val_metrics(model):\n", 1042 | " model.eval()\n", 1043 | " x = torch.LongTensor(x_val) #.cuda()\n", 1044 | " y = torch.Tensor(y_val).unsqueeze(1) #).cuda()\n", 1045 | " s = torch.Tensor(x_val_len).view(x_val_len.shape[0], 1)\n", 1046 | " y_hat = model(x, s)\n", 1047 | " loss = F.binary_cross_entropy_with_logits(y_hat, y)\n", 1048 | " y_pred = y_hat > 0\n", 1049 | " correct = (y_pred.float() == y).float().sum()\n", 1050 | " accuracy = correct/y_pred.shape[0]\n", 1051 | " return loss.item(), accuracy.item()" 1052 | ] 1053 | }, 1054 | { 1055 | "cell_type": "code", 1056 | "execution_count": 125, 1057 | "metadata": {}, 1058 | "outputs": [ 1059 | { 1060 | "data": { 1061 | "text/plain": [ 1062 | "(0.7011229991912842, 0.5065000057220459)" 1063 | ] 1064 | }, 1065 | "execution_count": 125, 1066 | "metadata": {}, 1067 | "output_type": "execute_result" 1068 | } 1069 | ], 1070 | "source": [ 1071 | "# accuracy of a random model should be around 0.5\n", 1072 | "val_metrics(model)" 1073 | ] 1074 | }, 1075 | { 1076 | "cell_type": "code", 1077 | "execution_count": 126, 
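In `val_metrics` above, predictions are obtained by thresholding the raw logit at 0 rather than applying a sigmoid and thresholding at 0.5. The two are equivalent because sigmoid(0) = 0.5, as this tiny check with made-up logits shows:

```python
import torch

logits = torch.tensor([[-1.2], [0.3], [2.0]])           # made-up logits
pred_from_logits = logits > 0
pred_from_probs = torch.sigmoid(logits) > 0.5
print(torch.equal(pred_from_logits, pred_from_probs))   # True
```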
1078 | "metadata": {}, 1079 | "outputs": [], 1080 | "source": [ 1081 | "def train_epocs(model, epochs=10, lr=0.01):\n", 1082 | " optimizer = torch.optim.Adam(model.parameters(), lr=lr)\n", 1083 | " for i in range(epochs):\n", 1084 | " model.train()\n", 1085 | " x = torch.LongTensor(x_train) #.cuda()\n", 1086 | " y = torch.Tensor(y_train).unsqueeze(1)\n", 1087 | " s = torch.Tensor(x_train_len).view(x_train_len.shape[0], 1)\n", 1088 | " y_hat = model(x, s)\n", 1089 | " loss = F.binary_cross_entropy_with_logits(y_hat, y)\n", 1090 | " optimizer.zero_grad()\n", 1091 | " loss.backward()\n", 1092 | " optimizer.step()\n", 1093 | " val_loss, val_accuracy = val_metrics(model)\n", 1094 | " print(\"train_loss %.3f val_loss %.3f val_accuracy %.3f\" % (loss.item(), val_loss, val_accuracy))" 1095 | ] 1096 | }, 1097 | { 1098 | "cell_type": "code", 1099 | "execution_count": 127, 1100 | "metadata": {}, 1101 | "outputs": [ 1102 | { 1103 | "name": "stdout", 1104 | "output_type": "stream", 1105 | "text": [ 1106 | "train_loss 0.705 val_loss 0.712 val_accuracy 0.500\n", 1107 | "train_loss 0.701 val_loss 0.619 val_accuracy 0.728\n", 1108 | "train_loss 0.613 val_loss 0.552 val_accuracy 0.795\n", 1109 | "train_loss 0.542 val_loss 0.467 val_accuracy 0.854\n", 1110 | "train_loss 0.448 val_loss 0.381 val_accuracy 0.882\n", 1111 | "train_loss 0.351 val_loss 0.326 val_accuracy 0.872\n", 1112 | "train_loss 0.283 val_loss 0.277 val_accuracy 0.892\n", 1113 | "train_loss 0.225 val_loss 0.251 val_accuracy 0.904\n", 1114 | "train_loss 0.189 val_loss 0.247 val_accuracy 0.903\n", 1115 | "train_loss 0.167 val_loss 0.243 val_accuracy 0.910\n" 1116 | ] 1117 | } 1118 | ], 1119 | "source": [ 1120 | "train_epocs(model, epochs=10, lr=0.1)" 1121 | ] 1122 | }, 1123 | { 1124 | "cell_type": "code", 1125 | "execution_count": 128, 1126 | "metadata": {}, 1127 | "outputs": [ 1128 | { 1129 | "name": "stdout", 1130 | "output_type": "stream", 1131 | "text": [ 1132 | "train_loss 0.144 val_loss 0.241 val_accuracy 0.913\n", 1133 | "train_loss 0.139 val_loss 0.240 val_accuracy 0.913\n", 1134 | "train_loss 0.136 val_loss 0.240 val_accuracy 0.910\n", 1135 | "train_loss 0.133 val_loss 0.240 val_accuracy 0.909\n", 1136 | "train_loss 0.130 val_loss 0.239 val_accuracy 0.910\n", 1137 | "train_loss 0.126 val_loss 0.238 val_accuracy 0.910\n", 1138 | "train_loss 0.123 val_loss 0.238 val_accuracy 0.910\n", 1139 | "train_loss 0.119 val_loss 0.238 val_accuracy 0.911\n", 1140 | "train_loss 0.116 val_loss 0.238 val_accuracy 0.910\n", 1141 | "train_loss 0.113 val_loss 0.238 val_accuracy 0.909\n" 1142 | ] 1143 | } 1144 | ], 1145 | "source": [ 1146 | "train_epocs(model, epochs=10, lr=0.01)" 1147 | ] 1148 | }, 1149 | { 1150 | "cell_type": "markdown", 1151 | "metadata": {}, 1152 | "source": [ 1153 | "# Data loaders for SGD" 1154 | ] 1155 | }, 1156 | { 1157 | "cell_type": "markdown", 1158 | "metadata": {}, 1159 | "source": [ 1160 | "Nearly all of deep learning is powered by one very important algorithm: **stochastic gradient descent (SGD)**. SGD can be seeing as an approximation of **gradient descent** (GD). In GD you have to run through *all* the samples in your training set to do a single itaration. In SGD you use *only one* or *a subset* of training samples to do the update for a parameter in a particular iteration. The subset use in every iteration is called a **batch** or **minibatch**." 
1161 | ] 1162 | }, 1163 | { 1164 | "cell_type": "code", 1165 | "execution_count": 129, 1166 | "metadata": {}, 1167 | "outputs": [], 1168 | "source": [ 1169 | "from torch.utils.data import Dataset, DataLoader" 1170 | ] 1171 | }, 1172 | { 1173 | "cell_type": "markdown", 1174 | "metadata": {}, 1175 | "source": [ 1176 | "Next we are going to create a data loader. The data loader provides the following features:\n", 1177 | "* Batching the data\n", 1178 | "* Shuffling the data\n", 1179 | "* Load the data in parallel using multiprocessing workers." 1180 | ] 1181 | }, 1182 | { 1183 | "cell_type": "code", 1184 | "execution_count": 130, 1185 | "metadata": {}, 1186 | "outputs": [], 1187 | "source": [ 1188 | "def encode_sentence2(s, N=40):\n", 1189 | " enc = np.zeros(N, dtype=np.int32)\n", 1190 | " enc1 = np.array([vocab2index.get(w, vocab2index[\"UNK\"]) for w in s.split()])\n", 1191 | " l = min(N, len(enc1))\n", 1192 | " enc[:l] = enc1[:l]\n", 1193 | " return enc, l" 1194 | ] 1195 | }, 1196 | { 1197 | "cell_type": "code", 1198 | "execution_count": 131, 1199 | "metadata": {}, 1200 | "outputs": [ 1201 | { 1202 | "data": { 1203 | "text/plain": [ 1204 | "(array([ 4, 3, 5, 11, 12, 8, 9, 11, 10, 2, 6, 7, 0, 0, 0, 0, 0,\n", 1205 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 1206 | " 0, 0, 0, 0, 0, 0], dtype=int32), 12)" 1207 | ] 1208 | }, 1209 | "execution_count": 131, 1210 | "metadata": {}, 1211 | "output_type": "execute_result" 1212 | } 1213 | ], 1214 | "source": [ 1215 | "encode_sentence2(X_train[0])" 1216 | ] 1217 | }, 1218 | { 1219 | "cell_type": "code", 1220 | "execution_count": 132, 1221 | "metadata": {}, 1222 | "outputs": [], 1223 | "source": [ 1224 | "class SubjectivityDataset(Dataset):\n", 1225 | " def __init__(self, X, y):\n", 1226 | " self.x = X\n", 1227 | " self.y = y\n", 1228 | " \n", 1229 | " def __len__(self):\n", 1230 | " return len(self.y)\n", 1231 | " \n", 1232 | " def __getitem__(self, idx):\n", 1233 | " x = self.x[idx]\n", 1234 | " x, s = encode_sentence2(x)\n", 1235 | " return x, self.y[idx], s\n", 1236 | " \n", 1237 | "sub_dataset_train = SubjectivityDataset(X_train, y_train)" 1238 | ] 1239 | }, 1240 | { 1241 | "cell_type": "code", 1242 | "execution_count": 133, 1243 | "metadata": {}, 1244 | "outputs": [], 1245 | "source": [ 1246 | "train_loader = DataLoader(sub_dataset_train, batch_size=5, shuffle=True)\n", 1247 | "x, y, s = next(iter(train_loader))" 1248 | ] 1249 | }, 1250 | { 1251 | "cell_type": "code", 1252 | "execution_count": 134, 1253 | "metadata": {}, 1254 | "outputs": [ 1255 | { 1256 | "data": { 1257 | "text/plain": [ 1258 | "(tensor([[1098, 171, 10, 594, 118, 28, 13, 39, 1386, 417, 51, 130,\n", 1259 | " 1891, 1, 51, 424, 2638, 959, 51, 1, 1895, 28, 29, 929,\n", 1260 | " 90, 3126, 696, 10, 1, 51, 1, 1, 171, 14, 113, 20,\n", 1261 | " 0, 0, 0, 0],\n", 1262 | " [ 14, 441, 196, 42, 90, 1, 74, 188, 187, 1817, 14, 657,\n", 1263 | " 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 1264 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 1265 | " 0, 0, 0, 0],\n", 1266 | " [ 14, 505, 51, 617, 3557, 96, 433, 1077, 796, 1740, 264, 13,\n", 1267 | " 264, 28, 212, 1432, 14, 2930, 20, 0, 0, 0, 0, 0,\n", 1268 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 1269 | " 0, 0, 0, 0],\n", 1270 | " [ 14, 1198, 51, 438, 1, 1, 2929, 2101, 81, 240, 1, 2930,\n", 1271 | " 74, 181, 1939, 414, 42, 90, 2262, 20, 0, 0, 0, 0,\n", 1272 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 1273 | " 0, 0, 0, 0],\n", 1274 | " [ 1, 2434, 10, 1, 28, 3068, 2994, 28, 57, 1, 267, 65,\n", 1275 | " 1, 1, 1, 13, 14, 1092, 134, 706, 1, 
1, 20, 0,\n", 1276 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 1277 | " 0, 0, 0, 0]], dtype=torch.int32),\n", 1278 | " tensor([0., 1., 0., 0., 0.]),\n", 1279 | " tensor([36, 13, 19, 20, 23]))" 1280 | ] 1281 | }, 1282 | "execution_count": 134, 1283 | "metadata": {}, 1284 | "output_type": "execute_result" 1285 | } 1286 | ], 1287 | "source": [ 1288 | "x, y, s" 1289 | ] 1290 | }, 1291 | { 1292 | "cell_type": "code", 1293 | "execution_count": 135, 1294 | "metadata": {}, 1295 | "outputs": [], 1296 | "source": [ 1297 | "model = CBOW(vocab_size=V, emb_size=50)" 1298 | ] 1299 | }, 1300 | { 1301 | "cell_type": "code", 1302 | "execution_count": 136, 1303 | "metadata": {}, 1304 | "outputs": [], 1305 | "source": [ 1306 | "train_loader = DataLoader(sub_dataset_train, batch_size=500, shuffle=True)" 1307 | ] 1308 | }, 1309 | { 1310 | "cell_type": "code", 1311 | "execution_count": 137, 1312 | "metadata": {}, 1313 | "outputs": [], 1314 | "source": [ 1315 | "def train_epocs(model, epochs=10, lr=0.01):\n", 1316 | " optimizer = torch.optim.Adam(model.parameters(), lr=lr)\n", 1317 | " for i in range(epochs):\n", 1318 | " total_loss = 0\n", 1319 | " total = 0\n", 1320 | " model.train()\n", 1321 | " for x, y, s in train_loader:\n", 1322 | " x = x.type(torch.LongTensor) #.cuda()\n", 1323 | " y = y.type(torch.FloatTensor).unsqueeze(1)\n", 1324 | " s = s.type(torch.Tensor).view(s.shape[0], 1)\n", 1325 | " y_hat = model(x, s)\n", 1326 | " loss = F.binary_cross_entropy_with_logits(y_hat, y)\n", 1327 | " optimizer.zero_grad()\n", 1328 | " loss.backward()\n", 1329 | " optimizer.step()\n", 1330 | " total_loss += x.size(0)*loss.item()\n", 1331 | " total += x.size(0)\n", 1332 | " train_loss = total_loss/total\n", 1333 | " val_loss, val_accuracy = val_metrics(model)\n", 1334 | " \n", 1335 | " print(\"train_loss %.3f val_loss %.3f val_accuracy %.3f\" % (train_loss, val_loss, val_accuracy))" 1336 | ] 1337 | }, 1338 | { 1339 | "cell_type": "code", 1340 | "execution_count": 138, 1341 | "metadata": {}, 1342 | "outputs": [ 1343 | { 1344 | "name": "stdout", 1345 | "output_type": "stream", 1346 | "text": [ 1347 | "train_loss 0.644 val_loss 0.579 val_accuracy 0.775\n", 1348 | "train_loss 0.493 val_loss 0.416 val_accuracy 0.850\n", 1349 | "train_loss 0.332 val_loss 0.307 val_accuracy 0.887\n", 1350 | "train_loss 0.235 val_loss 0.258 val_accuracy 0.898\n", 1351 | "train_loss 0.182 val_loss 0.238 val_accuracy 0.906\n", 1352 | "train_loss 0.148 val_loss 0.231 val_accuracy 0.910\n", 1353 | "train_loss 0.125 val_loss 0.230 val_accuracy 0.910\n", 1354 | "train_loss 0.107 val_loss 0.233 val_accuracy 0.910\n", 1355 | "train_loss 0.092 val_loss 0.240 val_accuracy 0.908\n", 1356 | "train_loss 0.080 val_loss 0.246 val_accuracy 0.909\n" 1357 | ] 1358 | } 1359 | ], 1360 | "source": [ 1361 | "train_epocs(model, epochs=10)" 1362 | ] 1363 | }, 1364 | { 1365 | "cell_type": "code", 1366 | "execution_count": null, 1367 | "metadata": {}, 1368 | "outputs": [], 1369 | "source": [] 1370 | } 1371 | ], 1372 | "metadata": { 1373 | "kernelspec": { 1374 | "display_name": "Python 3", 1375 | "language": "python", 1376 | "name": "python3" 1377 | }, 1378 | "language_info": { 1379 | "codemirror_mode": { 1380 | "name": "ipython", 1381 | "version": 3 1382 | }, 1383 | "file_extension": ".py", 1384 | "mimetype": "text/x-python", 1385 | "name": "python", 1386 | "nbconvert_exporter": "python", 1387 | "pygments_lexer": "ipython3", 1388 | "version": "3.7.4" 1389 | }, 1390 | "nav_menu": {}, 1391 | "toc": { 1392 | "nav_menu": { 1393 | "height": "116px", 1394 | "width": 
"251px" 1395 | }, 1396 | "number_sections": true, 1397 | "sideBar": true, 1398 | "skip_h1_title": false, 1399 | "toc_cell": true, 1400 | "toc_position": {}, 1401 | "toc_section_display": "block", 1402 | "toc_window_display": false 1403 | }, 1404 | "widgets": { 1405 | "state": {}, 1406 | "version": "1.1.2" 1407 | } 1408 | }, 1409 | "nbformat": 4, 1410 | "nbformat_minor": 1 1411 | } 1412 | --------------------------------------------------------------------------------