├── test_data ├── tiny_val2.csv ├── edge.jpg └── tiny_training2.csv ├── images └── model.png ├── imgs └── image_captioning.png ├── README.md ├── lesson7-bert_classification.ipynb ├── lesson4-rnn-name2lang.ipynb ├── lesson3-cbow.ipynb ├── lesson4-seq2seq.ipynb └── 5_lab.ipynb /test_data/tiny_val2.csv: -------------------------------------------------------------------------------- 1 | userId,movieId,rating 2 | 2,1,5 3 | 4,23,5 4 | 4,2,3 5 | -------------------------------------------------------------------------------- /images/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanneta/deep-learning-data-institute/HEAD/images/model.png -------------------------------------------------------------------------------- /test_data/edge.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanneta/deep-learning-data-institute/HEAD/test_data/edge.jpg -------------------------------------------------------------------------------- /imgs/image_captioning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanneta/deep-learning-data-institute/HEAD/imgs/image_captioning.png -------------------------------------------------------------------------------- /test_data/tiny_training2.csv: -------------------------------------------------------------------------------- 1 | userId,movieId,rating 2 | 11,1,4 3 | 11,23,5 4 | 2,23,5 5 | 2,4,3 6 | 31,1,4 7 | 31,23,4 8 | 4,1,5 9 | 4,3,2 10 | 52,1,1 11 | 52,3,4 12 | 61,3,5 13 | 7,23,1 14 | 7,3,3 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # deep-learning-data-institute 2 | Repo for the course "Fundamentals of Deep Learning with Pytorch" 3 | 4 | Syllabus (subject to change) 5 | * Lesson 1: review of 
machine learning, intro to pytorch 6 | * Lesson 2: tabular data, neural networks, effective training of neural networks 7 | * Lesson 3-4: text classification, word embeddings, convolutional neural networks for text, recurrent neural networks, seq2seq networks, seq2seq with attention 8 | * Lesson 5-6: image classification, convolutional neural networks, data augmentation, transfer learning 9 | * Lesson 7: image captioning, BERT / transformers 10 | -------------------------------------------------------------------------------- /lesson7-bert_classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%reload_ext autoreload\n", 10 | "%autoreload 2\n", 11 | "%matplotlib inline\n", 12 | "import matplotlib.pyplot as plt" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "pip install pytorch-transformers" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "import torch\n", 29 | "import torch.nn as nn\n", 30 | "import pickle\n", 31 | "from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,\n", 32 | " TensorDataset)\n", 33 | "from tqdm import tqdm_notebook, trange\n", 34 | "import os\n", 35 | "from pytorch_transformers import BertConfig, BertTokenizer, BertModel\n", 36 | "from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule\n", 37 | "\n", 38 | "from torch.utils.data import Dataset, DataLoader" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "import numpy as np\n", 48 | "import torch.optim as optim\n", 49 | "from torch.optim import lr_scheduler\n", 50 | "import time\n", 51 | "import copy\n", 52 | "import torch.nn.functional as F" 53 | ] 54 
| }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 4, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "class BertForSequenceClassification(nn.Module):\n", 62 | " \"\"\"BERT model for classification.\n", 63 | " This module is composed of the BERT model with a linear layer on top of\n", 64 | " the pooled output.\n", 65 | " \"\"\"\n", 66 | " def __init__(self, num_labels=1):\n", 67 | " super(BertForSequenceClassification, self).__init__()\n", 68 | " self.num_labels = num_labels\n", 69 | " self.bert = BertModel.from_pretrained('bert-base-uncased')\n", 70 | " self.dropout = nn.Dropout(config.hidden_dropout_prob)\n", 71 | " self.classifier = nn.Linear(config.hidden_size, num_labels)\n", 72 | " nn.init.xavier_normal_(self.classifier.weight)\n", 73 | " \n", 74 | " def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):\n", 75 | " outputs = self.bert(input_ids, token_type_ids, attention_mask)\n", 76 | " pooled_output = outputs[1]\n", 77 | " pooled_output = self.dropout(pooled_output)\n", 78 | " logits = self.classifier(pooled_output)\n", 79 | " return logits\n", 80 | " \n", 81 | " def freeze_bert_encoder(self):\n", 82 | " for param in self.bert.parameters():\n", 83 | " param.requires_grad = False\n", 84 | " \n", 85 | " def unfreeze_bert_encoder(self):\n", 86 | " for param in self.bert.parameters():\n", 87 | " param.requires_grad = True" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 5, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,\n", 97 | " num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 6, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "num_labels = 1\n", 107 | "model = BertForSequenceClassification(num_labels)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | 
"execution_count": 7, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "from pathlib import Path\n", 117 | "PATH = Path(\"/home/yinterian/data/aclImdb/\")" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 8, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 10, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "data": { 136 | "text/plain": [ 137 | "['bro', '##m', '##well', 'high', 'is', 'a', 'cartoon', 'comedy', '.', 'it']" 138 | ] 139 | }, 140 | "execution_count": 10, 141 | "metadata": {}, 142 | "output_type": "execute_result" 143 | } 144 | ], 145 | "source": [ 146 | "path = PATH/\"train/pos/0_9.txt\"\n", 147 | "z = tokenizer.tokenize(path.read_text())\n", 148 | "z[:10]" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 11, 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "[22953, 2213, 4381, 2152, 2003, 1037, 9476, 4038, 1012, 2009]" 160 | ] 161 | }, 162 | "execution_count": 11, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "ids = tokenizer.convert_tokens_to_ids(z)\n", 169 | "ids[:10]" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 12, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "tokens_tensor = torch.tensor([ids])" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 13, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "logits = model(tokens_tensor)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 14, 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "data": { 197 | "text/plain": [ 198 | "tensor([[-0.5909]], grad_fn=)" 199 | ] 200 | }, 201 | "execution_count": 14, 202 | "metadata": {}, 203 | "output_type": 
"execute_result" 204 | } 205 | ], 206 | "source": [ 207 | "logits " 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "Based on these tutorials\n", 215 | "* https://pytorch.org/hub/huggingface_pytorch-pretrained-bert_bert/\n", 216 | "* https://github.com/huggingface/pytorch-transformers/blob/master/README.md\n", 217 | "* https://medium.com/huggingface/multi-label-text-classification-using-bert-the-mighty-transformer-69714fa3fb3d\n", 218 | "* https://towardsdatascience.com/bert-classifier-just-another-pytorch-model-881b3cf05784" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 15, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "def text2ids(text, max_seq_length=300):\n", 228 | " tok_text = tokenizer.tokenize(text)\n", 229 | " if len(tok_text) > max_seq_length:\n", 230 | " tok_text = tok_text[:max_seq_length]\n", 231 | " ids_text = tokenizer.convert_tokens_to_ids(tok_text)\n", 232 | " padding = [0] * (max_seq_length - len(ids_text))\n", 233 | " ids_text += padding\n", 234 | " return np.array(ids_text)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 16, 240 | "metadata": {}, 241 | "outputs": [ 242 | { 243 | "data": { 244 | "text/plain": [ 245 | "array([22953, 2213, 4381, 2152, 2003, 1037, 9476, 4038, 1012,\n", 246 | " 2009, 2743, 2012, 1996, 2168, 2051, 2004, 2070, 2060,\n", 247 | " 3454, 2055, 2082, 2166, 1010, 2107, 2004, 1000, 5089,\n", 248 | " 1000, 1012, 2026, 3486, 2086, 1999, 1996, 4252, 9518,\n", 249 | " 2599, 2033, 2000, 2903, 2008, 22953, 2213, 4381, 2152,\n", 250 | " 1005, 1055, 18312, 2003, 2172, 3553, 2000, 4507, 2084,\n", 251 | " 2003, 1000, 5089, 1000, 1012, 1996, 25740, 2000, 5788,\n", 252 | " 13732, 1010, 1996, 12369, 3993, 2493, 2040, 2064, 2156,\n", 253 | " 2157, 2083, 2037, 17203, 5089, 1005, 13433, 8737, 1010,\n", 254 | " 1996, 9004, 10196, 4757, 1997, 1996, 2878, 3663, 1010,\n", 255 | " 2035, 10825, 2033, 1997, 1996, 
2816, 1045, 2354, 1998,\n", 256 | " 2037, 2493, 1012, 2043, 1045, 2387, 1996, 2792, 1999,\n", 257 | " 2029, 1037, 3076, 8385, 2699, 2000, 6402, 2091, 1996,\n", 258 | " 2082, 1010, 1045, 3202, 7383, 1012, 1012, 1012, 1012,\n", 259 | " 1012, 1012, 1012, 1012, 1012, 2012, 1012, 1012, 1012,\n", 260 | " 1012, 1012, 1012, 1012, 1012, 1012, 1012, 2152, 1012,\n", 261 | " 1037, 4438, 2240, 1024, 7742, 1024, 1045, 1005, 1049,\n", 262 | " 2182, 2000, 12803, 2028, 1997, 2115, 5089, 1012, 3076,\n", 263 | " 1024, 6160, 2000, 22953, 2213, 4381, 2152, 1012, 1045,\n", 264 | " 5987, 2008, 2116, 6001, 1997, 2026, 2287, 2228, 2008,\n", 265 | " 22953, 2213, 4381, 2152, 2003, 2521, 18584, 2098, 1012,\n", 266 | " 2054, 1037, 12063, 2008, 2009, 3475, 1005, 1056, 999,\n", 267 | " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 268 | " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 269 | " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 270 | " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 271 | " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 272 | " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 273 | " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 274 | " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 275 | " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 276 | " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 277 | " 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 278 | " 0, 0, 0])" 279 | ] 280 | }, 281 | "execution_count": 16, 282 | "metadata": {}, 283 | "output_type": "execute_result" 284 | } 285 | ], 286 | "source": [ 287 | "text2ids(path.read_text())" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 17, 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "class ImdbDataset(Dataset):\n", 297 | " def __init__(self, PATH, train=\"train\"):\n", 298 | " self.path_to_images = PATH/train\n", 299 | " self.pos_files = list((self.path_to_images/\"pos\").iterdir())\n", 300 | " self.neg_files = list((self.path_to_images/\"neg\").iterdir()) \n", 301 | " self.files = self.pos_files + self.neg_files\n", 302 | " self.y = np.concatenate((np.ones(len(self.pos_files), dtype=int),\n", 303 | " np.zeros(len(self.neg_files), dtype=int)), 
axis=0)\n", 304 | " \n", 305 | " def __getitem__(self, index):\n", 306 | " path = self.files[index]\n", 307 | " x = text2ids(path.read_text())\n", 308 | " return x, self.y[index]\n", 309 | " \n", 310 | " def __len__(self):\n", 311 | " return len(self.y)" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 18, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "train_ds = ImdbDataset(PATH)\n", 321 | "valid_ds = ImdbDataset(PATH, \"test\")" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 19, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "batch_size = 10\n", 331 | "train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)\n", 332 | "valid_dl = DataLoader(valid_ds, batch_size=batch_size)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 20, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [ 341 | "x, y = train_ds[0]" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 21, 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [ 350 | "x, y = next(iter(train_dl))" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 22, 356 | "metadata": {}, 357 | "outputs": [ 358 | { 359 | "data": { 360 | "text/plain": [ 361 | "tensor([ 2004, 15444, 2890, 1998, 7369, 2012, 1996, 4578, 1997, 2037,\n", 362 | " 6217, 1012, 1999, 4266, 4841, 2245, 1997, 1996, 3212, 2004,\n", 363 | " 1037, 2173, 2005, 2299, 1998, 3153, 1012, 25755, 2001, 2145,\n", 364 | " 1037, 2261, 2086, 2185, 1012, 5965, 1998, 14580, 3153, 2039,\n", 365 | " 1996, 2237, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013,\n", 366 | " 1028, 1996, 5436, 2003, 11519, 1010, 2021, 2040, 14977, 1012,\n", 367 | " 1012, 1012, 2011, 1996, 2126, 1010, 5060, 1996, 12081, 4395,\n", 368 | " 2005, 9306, 6723, 2571, 1998, 1037, 1043, 10278, 25373, 12776,\n", 369 | " 9463, 3608, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013,\n", 370 | " 1028, 1037, 
7170, 1997, 12415, 4068, 2774, 1010, 2164, 1996,\n", 371 | " 3297, 1000, 2292, 1005, 1055, 2227, 1996, 2189, 1998, 3153,\n", 372 | " 1000, 1012, 1999, 2008, 3496, 1010, 14580, 1005, 1055, 3082,\n", 373 | " 25430, 29046, 4377, 21526, 2015, 5965, 1999, 1996, 2227, 2076,\n", 374 | " 2028, 1997, 2014, 23371, 1998, 2471, 21145, 2032, 9787, 1012,\n", 375 | " 5965, 7278, 2006, 4363, 1996, 2202, 2004, 1996, 5613, 2001,\n", 376 | " 21688, 9690, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013,\n", 377 | " 1028, 14580, 2320, 7034, 2008, 2016, 2001, 1037, 2488, 8033,\n", 378 | " 2084, 5965, 1010, 2144, 2016, 2018, 2000, 2079, 2035, 1996,\n", 379 | " 2168, 5829, 1010, 1999, 3357, 1010, 1998, 11043, 1012, 1012,\n", 380 | " 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 2272,\n", 381 | " 2000, 2228, 1997, 2009, 1010, 5965, 1005, 1055, 2376, 2001,\n", 382 | " 3835, 2205, 1012, 1996, 2158, 2001, 3947, 3238, 1999, 4367,\n", 383 | " 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 2182,\n", 384 | " 1005, 1055, 1037, 3185, 2000, 26931, 2039, 2006, 1996, 6411,\n", 385 | " 2007, 1037, 3866, 1011, 2028, 1010, 5926, 2125, 1996, 6007,\n", 386 | " 1010, 1998, 5959, 1996, 4024, 1012, 0, 0, 0, 0,\n", 387 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 388 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 389 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 390 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])" 391 | ] 392 | }, 393 | "execution_count": 22, 394 | "metadata": {}, 395 | "output_type": "execute_result" 396 | } 397 | ], 398 | "source": [ 399 | "x[3]" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": 23, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "def train_model(model, optimizer, num_epochs=25):\n", 409 | " for epoch in range(num_epochs):\n", 410 | " model.train()\n", 411 | " running_loss = 0.0\n", 412 | " for x, y in train_dl:\n", 413 | " x = x.cuda()\n", 414 | " y = y.unsqueeze(1).float().cuda()\n", 415 | " optimizer.zero_grad()\n", 416 | " logits = model(x)\n", 417 
| " loss = F.binary_cross_entropy_with_logits(logits, y) \n", 418 | " loss.backward()\n", 419 | " optimizer.step()\n", 420 | " \n", 421 | " running_loss += loss.item() * x.size(0)\n", 422 | " epoch_loss = running_loss / len(train_ds)\n", 423 | " val_loss, accuracy = eval_model(model)\n", 424 | " print('train loss: {:.3f}, valid loss {:.3f} accuracy {:.3f}'.format(\n", 425 | " epoch_loss, val_loss, accuracy))" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 24, 431 | "metadata": {}, 432 | "outputs": [], 433 | "source": [ 434 | "def eval_model(model):\n", 435 | " model.eval()\n", 436 | " running_loss = 0.0\n", 437 | " correct = 0\n", 438 | " for x, y in valid_dl:\n", 439 | " x = x.cuda()\n", 440 | " y = y.unsqueeze(1).float().cuda()\n", 441 | " logits = model(x)\n", 442 | " loss = F.binary_cross_entropy_with_logits(logits, y) \n", 443 | " y_pred = logits > 0\n", 444 | " correct += (y_pred.float() == y).float().sum()\n", 445 | " running_loss += loss.item() * x.size(0)\n", 446 | " accuracy = correct / len(valid_ds)\n", 447 | " epoch_loss = running_loss / len(valid_ds)\n", 448 | " return epoch_loss, accuracy.item() " 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": 25, 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "model = model.cuda()" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": 26, 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [ 466 | "lrlast = .0001\n", 467 | "lrmain = .00001\n", 468 | "optimizer = optim.Adam(\n", 469 | " [\n", 470 | " {\"params\":model.bert.parameters(),\"lr\": lrmain},\n", 471 | " {\"params\":model.classifier.parameters(), \"lr\": lrlast},\n", 472 | " \n", 473 | " ])" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": 27, 479 | "metadata": {}, 480 | "outputs": [ 481 | { 482 | "name": "stdout", 483 | "output_type": "stream", 484 | "text": [ 485 | "train loss: 0.286, valid loss 0.201 accuracy 0.922\n", 
486 | "train loss: 0.166, valid loss 0.210 accuracy 0.922\n" 487 | ] 488 | } 489 | ], 490 | "source": [ 491 | "train_model(model, optimizer, num_epochs=2)" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": null, 497 | "metadata": {}, 498 | "outputs": [], 499 | "source": [] 500 | } 501 | ], 502 | "metadata": { 503 | "kernelspec": { 504 | "display_name": "Python 3 (ipykernel)", 505 | "language": "python", 506 | "name": "python3" 507 | }, 508 | "language_info": { 509 | "codemirror_mode": { 510 | "name": "ipython", 511 | "version": 3 512 | }, 513 | "file_extension": ".py", 514 | "mimetype": "text/x-python", 515 | "name": "python", 516 | "nbconvert_exporter": "python", 517 | "pygments_lexer": "ipython3", 518 | "version": "3.8.11" 519 | }, 520 | "toc": { 521 | "base_numbering": 1, 522 | "nav_menu": {}, 523 | "number_sections": true, 524 | "sideBar": true, 525 | "skip_h1_title": false, 526 | "title_cell": "Table of Contents", 527 | "title_sidebar": "Contents", 528 | "toc_cell": false, 529 | "toc_position": {}, 530 | "toc_section_display": "block", 531 | "toc_window_display": false 532 | } 533 | }, 534 | "nbformat": 4, 535 | "nbformat_minor": 2 536 | } 537 | -------------------------------------------------------------------------------- /lesson4-rnn-name2lang.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Classifying last names with character-level RNN" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "%reload_ext autoreload\n", 17 | "%autoreload 2\n", 18 | "%matplotlib inline\n", 19 | "import pandas as pd\n", 20 | "import numpy as np\n", 21 | "from pathlib import Path\n", 22 | "import torch\n", 23 | "from torch.utils.data import Dataset, DataLoader\n", 24 | "import torch.optim as optim\n", 25 | "import torch.nn as nn\n", 
26 | "import torch.nn.functional as F\n", 27 | "import random" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Dataset\n", 35 | "`wget -O names_train.csv.gz https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_train.csv.gz?raw=true`\n", 36 | "\n", 37 | "`wget -O names_test.csv.gz https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_test.csv.gz?raw=true`" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "def unpack_dataset():\n", 47 | " ! mkdir -p data\n", 48 | " ! wget -O names_test.csv.gz https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_test.csv.gz?raw=true\n", 49 | " ! wget -O names_train.csv.gz https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_train.csv.gz?raw=true\n", 50 | " ! gunzip *.gz\n", 51 | " ! mv names_test.csv names_train.csv data/" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "--2019-09-25 10:30:52-- https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_test.csv.gz?raw=true\n", 64 | "Resolving github.com (github.com)... 192.30.255.112\n", 65 | "Connecting to github.com (github.com)|192.30.255.112|:443... connected.\n", 66 | "HTTP request sent, awaiting response... 302 Found\n", 67 | "Location: https://github.com/hunkim/PyTorchZeroToAll/raw/master/data/names_test.csv.gz [following]\n", 68 | "--2019-09-25 10:30:52-- https://github.com/hunkim/PyTorchZeroToAll/raw/master/data/names_test.csv.gz\n", 69 | "Reusing existing connection to github.com:443.\n", 70 | "HTTP request sent, awaiting response... 
302 Found\n", 71 | "Location: https://raw.githubusercontent.com/hunkim/PyTorchZeroToAll/master/data/names_test.csv.gz [following]\n", 72 | "--2019-09-25 10:30:52-- https://raw.githubusercontent.com/hunkim/PyTorchZeroToAll/master/data/names_test.csv.gz\n", 73 | "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.40.133\n", 74 | "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.40.133|:443... connected.\n", 75 | "HTTP request sent, awaiting response... 200 OK\n", 76 | "Length: 27541 (27K) [application/octet-stream]\n", 77 | "Saving to: ‘names_test.csv.gz’\n", 78 | "\n", 79 | "names_test.csv.gz 100%[===================>] 26.90K --.-KB/s in 0.004s \n", 80 | "\n", 81 | "2019-09-25 10:30:53 (6.04 MB/s) - ‘names_test.csv.gz’ saved [27541/27541]\n", 82 | "\n", 83 | "--2019-09-25 10:30:53-- https://github.com/hunkim/PyTorchZeroToAll/blob/master/data/names_train.csv.gz?raw=true\n", 84 | "Resolving github.com (github.com)... 192.30.255.112\n", 85 | "Connecting to github.com (github.com)|192.30.255.112|:443... connected.\n", 86 | "HTTP request sent, awaiting response... 302 Found\n", 87 | "Location: https://github.com/hunkim/PyTorchZeroToAll/raw/master/data/names_train.csv.gz [following]\n", 88 | "--2019-09-25 10:30:53-- https://github.com/hunkim/PyTorchZeroToAll/raw/master/data/names_train.csv.gz\n", 89 | "Reusing existing connection to github.com:443.\n", 90 | "HTTP request sent, awaiting response... 302 Found\n", 91 | "Location: https://raw.githubusercontent.com/hunkim/PyTorchZeroToAll/master/data/names_train.csv.gz [following]\n", 92 | "--2019-09-25 10:30:53-- https://raw.githubusercontent.com/hunkim/PyTorchZeroToAll/master/data/names_train.csv.gz\n", 93 | "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.40.133\n", 94 | "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.40.133|:443... connected.\n", 95 | "HTTP request sent, awaiting response... 
200 OK\n", 96 | "Length: 50237 (49K) [application/octet-stream]\n", 97 | "Saving to: ‘names_train.csv.gz’\n", 98 | "\n", 99 | "names_train.csv.gz 100%[===================>] 49.06K --.-KB/s in 0.009s \n", 100 | "\n", 101 | "2019-09-25 10:30:54 (5.40 MB/s) - ‘names_train.csv.gz’ saved [50237/50237]\n", 102 | "\n" 103 | ] 104 | } 105 | ], 106 | "source": [ 107 | "unpack_dataset()" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 4, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "[PosixPath('data/names_test.csv'),\n", 119 | " PosixPath('data/aclImdb'),\n", 120 | " PosixPath('data/names_train.csv')]" 121 | ] 122 | }, 123 | "execution_count": 4, 124 | "metadata": {}, 125 | "output_type": "execute_result" 126 | } 127 | ], 128 | "source": [ 129 | "PATH = Path(\"data\")\n", 130 | "list(PATH.iterdir())" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 5, 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "name": "stdout", 140 | "output_type": "stream", 141 | "text": [ 142 | "\"Adsit\",\"Czech\"\r", 143 | "\r\n", 144 | "\"Ajdrna\",\"Czech\"\r", 145 | "\r\n", 146 | "\"Antonowitsch\",\"Czech\"\r", 147 | "\r\n", 148 | "\"Antonowitz\",\"Czech\"\r", 149 | "\r\n", 150 | "\"Ballalatak\",\"Czech\"\r", 151 | "\r\n", 152 | "\"Ballaltick\",\"Czech\"\r", 153 | "\r\n", 154 | "\"Bastl\",\"Czech\"\r", 155 | "\r\n", 156 | "\"Baroch\",\"Czech\"\r", 157 | "\r\n", 158 | "\"Betlach\",\"Czech\"\r", 159 | "\r\n", 160 | "\"Biganska\",\"Czech\"\r", 161 | "\r\n" 162 | ] 163 | } 164 | ], 165 | "source": [ 166 | "! 
head data/names_train.csv" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "### Processing data" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 6, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "df = pd.read_csv(PATH/\"names_train.csv\", header=None)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 7, 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "data": { 192 | "text/plain": [ 193 | "[' ', \"'\", ',', 'A', 'B', 'C', 'D', 'E', 'F', 'G']" 194 | ] 195 | }, 196 | "execution_count": 7, 197 | "metadata": {}, 198 | "output_type": "execute_result" 199 | } 200 | ], 201 | "source": [ 202 | "# getting a vocabulary of characters\n", 203 | "letters = [list(l) for l in df[0].values]\n", 204 | "vocab = sorted(list(set(np.concatenate(np.array(letters)))))\n", 205 | "vocab[:10]" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 8, 211 | "metadata": {}, 212 | "outputs": [ 213 | { 214 | "data": { 215 | "text/plain": [ 216 | "0" 217 | ] 218 | }, 219 | "execution_count": 8, 220 | "metadata": {}, 221 | "output_type": "execute_result" 222 | } 223 | ], 224 | "source": [ 225 | "vocab2id = {key:i for i, key in enumerate(vocab)}\n", 226 | "vocab2id[\" \"] # I am going to use 0 to pad sequences" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 9, 232 | "metadata": {}, 233 | "outputs": [ 234 | { 235 | "data": { 236 | "text/plain": [ 237 | "{'Arabic': 0,\n", 238 | " 'Chinese': 1,\n", 239 | " 'Czech': 2,\n", 240 | " 'Dutch': 3,\n", 241 | " 'English': 4,\n", 242 | " 'French': 5,\n", 243 | " 'German': 6,\n", 244 | " 'Greek': 7,\n", 245 | " 'Irish': 8,\n", 246 | " 'Italian': 9,\n", 247 | " 'Japanese': 10,\n", 248 | " 'Korean': 11,\n", 249 | " 'Polish': 12,\n", 250 | " 'Portuguese': 13,\n", 251 | " 'Russian': 14,\n", 252 | " 'Scottish': 15,\n", 253 | " 'Spanish': 16,\n", 254 | " 'Vietnamese': 17}" 255 | ] 
256 | }, 257 | "execution_count": 9, 258 | "metadata": {}, 259 | "output_type": "execute_result" 260 | } 261 | ], 262 | "source": [ 263 | "labels = sorted(df[1].unique())\n", 264 | "label2id = {key:i for i, key in enumerate(labels)}\n", 265 | "label2id" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 10, 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "def pad_seq(x, seq_len=15, vocab2id=vocab2id):\n", 275 | " x = list(x)\n", 276 | " x = np.array([vocab2id[k] for k in x])\n", 277 | " z = np.zeros(seq_len, dtype=np.int32)\n", 278 | " n = min(seq_len, x.shape[0])\n", 279 | " z[seq_len - n:] = x[0:n]\n", 280 | " return z" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 11, 286 | "metadata": {}, 287 | "outputs": [ 288 | { 289 | "data": { 290 | "text/plain": [ 291 | "array([ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 29, 29, 30, 30, 30],\n", 292 | " dtype=int32)" 293 | ] 294 | }, 295 | "execution_count": 11, 296 | "metadata": {}, 297 | "output_type": "execute_result" 298 | } 299 | ], 300 | "source": [ 301 | "x = pad_seq(\"aabbb\")\n", 302 | "x" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 12, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "# one hot encoding\n", 312 | "def seq2matrix(x, vocab_len=55):\n", 313 | " z = np.zeros((x.shape[0], vocab_len))\n", 314 | " z[np.arange(len(x)), x] = 1\n", 315 | " return z" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 13, 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [ 324 | "class NameDataset(Dataset):\n", 325 | " def __init__(self, path, vocab2id, label2id, seq_len=15, vocab_len=55):\n", 326 | " self.df = pd.read_csv(path, header=None)\n", 327 | " self.label2id = label2id\n", 328 | " self.vocab2id = vocab2id\n", 329 | " self.seq_len = seq_len\n", 330 | " self.vocab_len = vocab_len \n", 331 | " self.x = df[0].values\n", 332 | " self.y = [self.label2id[l] for l in 
df[1].values]\n", 333 | " self.vocab2id = vocab2id\n", 334 | " \n", 335 | " def __len__(self):\n", 336 | " return len(self.y)\n", 337 | " \n", 338 | " def __getitem__(self, idx):\n", 339 | " x = pad_seq(self.x[idx], self.seq_len, self.vocab2id)\n", 340 | " x = seq2matrix(x, self.vocab_len)\n", 341 | " return x, self.y[idx]" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 14, 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [ 350 | "train_ds = NameDataset(PATH/\"names_train.csv\", vocab2id, label2id)\n", 351 | "valid_ds = NameDataset(PATH/\"names_test.csv\", vocab2id, label2id)" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 15, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "batch_size = 2000\n", 361 | "train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)\n", 362 | "valid_dl = DataLoader(valid_ds, batch_size=len(valid_ds))" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 16, 368 | "metadata": {}, 369 | "outputs": [ 370 | { 371 | "data": { 372 | "text/plain": [ 373 | "(13374, 13374)" 374 | ] 375 | }, 376 | "execution_count": 16, 377 | "metadata": {}, 378 | "output_type": "execute_result" 379 | } 380 | ], 381 | "source": [ 382 | "len(train_ds), len(valid_ds)" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 17, 388 | "metadata": {}, 389 | "outputs": [ 390 | { 391 | "name": "stdout", 392 | "output_type": "stream", 393 | "text": [ 394 | "(15, 55) 2\n" 395 | ] 396 | } 397 | ], 398 | "source": [ 399 | "x, y = train_ds[0]\n", 400 | "print(x.shape, y)" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 18, 406 | "metadata": {}, 407 | "outputs": [ 408 | { 409 | "data": { 410 | "text/plain": [ 411 | "array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 412 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 413 | " 0., 0., 0., 0., 0., 0., 0., 0., 
0., 0., 0., 0., 0., 0., 0., 0.,\n", 414 | " 0., 0., 0., 0., 0., 0., 0.],\n", 415 | " [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 416 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 417 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 418 | " 0., 0., 0., 0., 0., 0., 0.],\n", 419 | " [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 420 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 421 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 422 | " 0., 0., 0., 0., 0., 0., 0.],\n", 423 | " [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 424 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 425 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 426 | " 0., 0., 0., 0., 0., 0., 0.],\n", 427 | " [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 428 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 429 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 430 | " 0., 0., 0., 0., 0., 0., 0.],\n", 431 | " [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 432 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 433 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 434 | " 0., 0., 0., 0., 0., 0., 0.],\n", 435 | " [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 436 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 437 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 438 | " 0., 0., 0., 0., 0., 0., 0.],\n", 439 | " [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 440 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 441 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 442 | " 0., 0., 0., 0., 0., 0., 0.],\n", 443 | " [1., 0., 0., 0., 0., 0., 
0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 444 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 445 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 446 | " 0., 0., 0., 0., 0., 0., 0.],\n", 447 | " [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 448 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 449 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 450 | " 0., 0., 0., 0., 0., 0., 0.],\n", 451 | " [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 452 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 453 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 454 | " 0., 0., 0., 0., 0., 0., 0.],\n", 455 | " [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 456 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 457 | " 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 458 | " 0., 0., 0., 0., 0., 0., 0.],\n", 459 | " [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 460 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 461 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,\n", 462 | " 0., 0., 0., 0., 0., 0., 0.],\n", 463 | " [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 464 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 465 | " 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 466 | " 0., 0., 0., 0., 0., 0., 0.],\n", 467 | " [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 468 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 469 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 470 | " 1., 0., 0., 0., 0., 0., 0.]])" 471 | ] 472 | }, 473 | "execution_count": 18, 474 | "metadata": {}, 475 | "output_type": "execute_result" 476 | } 477 | ], 478 | "source": [ 479 
class CharRNN(nn.Module):
    """Single-step vanilla RNN cell over one-hot character vectors.

    One call to ``forward`` consumes one time step; the caller loops over
    the sequence and threads the hidden state through successive calls.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the two linear maps of the cell.

        Args:
            input_size: width of the per-step input vector.
            hidden_size: width of the recurrent hidden state.
            output_size: number of output scores produced at each step.
        """
        super().__init__()

        self.hidden_size = hidden_size
        # The recurrent layer sees the input and the previous hidden state
        # concatenated side by side, hence the summed input width.
        self.linear_i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.linear_h2o = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden):
        """Advance the cell by one time step.

        Args:
            x: input for this step, concatenated with ``hidden`` along dim 1.
            hidden: hidden state from the previous step.

        Returns:
            ``(output, hidden)``: the output scores for this step and the
            updated hidden state.
        """
        next_hidden = torch.tanh(self.linear_i2h(torch.cat([x, hidden], dim=1)))
        return self.linear_h2o(next_hidden), next_hidden

    def initHidden(self, bash_size):
        """Return an all-zero hidden state with ``bash_size`` rows."""
        return torch.zeros((bash_size, self.hidden_size))
def get_optimizer(model, lr = 0.01, wd = 0.00001):
    """Build an Adam optimizer over the trainable parameters of ``model``.

    Args:
        model: module whose parameters with ``requires_grad`` set are optimized.
        lr: learning rate passed to Adam.
        wd: weight decay passed to Adam.

    Returns:
        A ``torch.optim.Adam`` instance.
    """
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optim = torch.optim.Adam(parameters, lr=lr, weight_decay=wd)
    return optim


def train(model, optim, train_dl):
    """Run one pass over ``train_dl`` and return the mean training loss.

    The model is stepped through every position of each sequence, and only
    the output of the last step is fed to the cross-entropy loss.

    Args:
        model: recurrent model exposing ``initHidden(batch)`` and
            ``model(x_t, h) -> (out, h)``.
        optim: optimizer updating ``model``'s parameters.
        train_dl: iterable of ``(x, y)`` batches; ``x`` is indexed as
            ``x[:, t]`` per time step.

    Returns:
        The loss averaged over all samples seen (each batch weighted by its size).
    """
    model.train()
    total = 0
    sum_loss = 0
    for x, y in train_dl:
        batch = x.shape[0]
        h = model.initHidden(batch) #.cuda()
        x = x.float() #.cuda()
        y = y.long() #.cuda()

        for t in range(x.shape[1]):
            out, h = model(x[:,t], h)

        loss = F.cross_entropy(out, y)
        optim.zero_grad()
        loss.backward()
        optim.step()
        total += batch
        sum_loss += batch*(loss.item())
    return sum_loss/total


def val_metric(model, valid_dl):
    """Return ``(loss, accuracy)`` on the first batch of ``valid_dl``.

    Only one batch is evaluated, so callers that want metrics over the whole
    validation set must use a loader whose batch covers it.

    Args:
        model: recurrent model exposing ``initHidden(batch)`` and
            ``model(x_t, h) -> (out, h)``.
        valid_dl: iterable of ``(x, y)`` batches.

    Returns:
        Tuple of Python floats: cross-entropy loss and fraction of correct
        predictions, both computed from the last time step's output.
    """
    model.eval()
    x, y = next(iter(valid_dl)) # just one batch
    x = x.float() #x.cuda()
    y = y.long() # y.cuda()
    N = x.shape[0]
    h = model.initHidden(N) # .cuda()
    # Evaluation never calls backward(); disabling autograd avoids building
    # a graph through every time step for the whole validation batch.
    with torch.no_grad():
        for t in range(x.shape[1]):
            out, h = model(x[:,t], h)
        loss = F.cross_entropy(out, y)
        _, pred = torch.max(out, 1)
        acc = pred.eq(y).sum().float()/N
    return loss.item(), acc.item()


def train_loop(model, lr, train_dl, valid_dl, epochs=20):
    """Train ``model`` for ``epochs`` epochs, printing metrics periodically.

    A fresh Adam optimizer (no weight decay) is created on every call, so
    calling this repeatedly restarts the optimizer state while keeping the
    model weights.

    Args:
        model: model to train in place.
        lr: learning rate for the new optimizer.
        train_dl: training batches, passed to ``train``.
        valid_dl: validation batches, passed to ``val_metric``.
        epochs: number of passes over ``train_dl``.
    """
    optim = get_optimizer(model, lr =lr, wd = 0.0)
    for i in range(epochs):
        loss = train(model, optim, train_dl)
        val_loss, val_acc = val_metric(model, valid_dl)
        # Report on epochs 1, 6, 11, ... to keep the log short.
        if i%5 == 1: print("train loss %.3f val loss %.3f and val accuracy %.3f" % (loss, val_loss, val_acc))
class NameDatasetEmb(Dataset):
    """Dataset of names and their labels read from a header-less CSV file.

    Column 0 of the file supplies the raw items and column 1 the label
    strings, which are mapped to integer ids through ``label2id``. Items are
    encoded lazily in ``__getitem__`` by the notebook's ``pad_seq`` helper
    (presumably a fixed-length array of character ids; confirm against
    ``pad_seq``).
    """

    def __init__(self, path, vocab2id, label2id, seq_len=15, vocab_len=55):
        """Load the CSV at ``path`` and pre-compute the label ids.

        Args:
            path: location of a CSV file without a header row.
            vocab2id: mapping handed to ``pad_seq`` when encoding an item.
            label2id: mapping from label string (column 1) to integer id.
            seq_len: sequence length handed to ``pad_seq``.
            vocab_len: stored on the instance; not used by this class.

        Raises:
            KeyError: if a label in the file is missing from ``label2id``.
        """
        self.df = pd.read_csv(path, header=None)
        self.label2id = label2id
        self.vocab2id = vocab2id
        self.seq_len = seq_len
        self.vocab_len = vocab_len
        # Read from the frame loaded above. The previous code referenced a
        # bare `df`, which only worked when a notebook global of that name
        # happened to exist (and then silently ignored `path`).
        self.x = self.df[0].values
        self.y = [self.label2id[l] for l in self.df[1].values]

    def __len__(self):
        """Return the number of rows in the file."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(encoded_item, label_id)`` for row ``idx``."""
        x = pad_seq(self.x[idx], self.seq_len, self.vocab2id)
        return x, self.y[idx]
class CharEmbRNN(nn.Module):
    """Single-step vanilla RNN cell that embeds character ids before recurring.

    Same recurrence as a plain RNN cell, except each step's input is a
    character id looked up in a learned embedding table.
    """

    def __init__(self, vocab_size, emb_size, hidden_size, output_size):
        """Create the embedding table and the two linear maps.

        Args:
            vocab_size: number of rows in the embedding table.
            emb_size: width of each embedding vector.
            hidden_size: width of the recurrent hidden state.
            output_size: number of output scores produced at each step.
        """
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_size)
        self.hidden_size = hidden_size
        # The recurrent layer consumes [embedding, previous hidden] side by side.
        self.linear_i2h = nn.Linear(emb_size + hidden_size, hidden_size)
        self.linear_h2o = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden):
        """Advance the cell by one time step.

        Args:
            x: ids for this step; cast to ``long`` here so callers may pass
                another numeric dtype.
            hidden: hidden state from the previous step.

        Returns:
            ``(output, hidden)``: the output scores for this step and the
            updated hidden state.
        """
        char_vecs = self.emb(x.long())  # the cast could live in the training loop instead
        next_hidden = torch.tanh(
            self.linear_i2h(torch.cat([char_vecs, hidden], dim=1))
        )
        return self.linear_h2o(next_hidden), next_hidden

    def initHidden(self, bash_size):
        """Return an all-zero hidden state with ``bash_size`` rows."""
        return torch.zeros((bash_size, self.hidden_size))
loss 1.270 val loss 1.182 and val accuracy 0.635\n", 932 | "train loss 1.080 val loss 1.007 and val accuracy 0.697\n", 933 | "train loss 0.954 val loss 0.884 and val accuracy 0.736\n", 934 | "train loss 0.866 val loss 0.792 and val accuracy 0.764\n", 935 | "train loss 0.834 val loss 0.775 and val accuracy 0.764\n", 936 | "train loss 0.735 val loss 0.686 and val accuracy 0.790\n", 937 | "train loss 0.676 val loss 0.630 and val accuracy 0.808\n", 938 | "train loss 0.635 val loss 0.583 and val accuracy 0.821\n", 939 | "train loss 0.602 val loss 0.557 and val accuracy 0.828\n" 940 | ] 941 | } 942 | ], 943 | "source": [ 944 | "train_loop(model, 0.01, train_dl_2, valid_dl_2, epochs=50)" 945 | ] 946 | }, 947 | { 948 | "cell_type": "code", 949 | "execution_count": 44, 950 | "metadata": {}, 951 | "outputs": [ 952 | { 953 | "name": "stdout", 954 | "output_type": "stream", 955 | "text": [ 956 | "train loss 0.723 val loss 0.685 and val accuracy 0.786\n", 957 | "train loss 0.560 val loss 0.521 and val accuracy 0.836\n", 958 | "train loss 0.504 val loss 0.465 and val accuracy 0.859\n", 959 | "train loss 0.467 val loss 0.424 and val accuracy 0.872\n", 960 | "train loss 0.431 val loss 0.388 and val accuracy 0.883\n", 961 | "train loss 0.398 val loss 0.358 and val accuracy 0.893\n", 962 | "train loss 0.420 val loss 0.393 and val accuracy 0.879\n", 963 | "train loss 0.366 val loss 0.326 and val accuracy 0.899\n", 964 | "train loss 0.338 val loss 0.290 and val accuracy 0.916\n", 965 | "train loss 0.291 val loss 0.265 and val accuracy 0.921\n" 966 | ] 967 | } 968 | ], 969 | "source": [ 970 | "train_loop(model, 0.01, train_dl_2, valid_dl_2, epochs=50)" 971 | ] 972 | }, 973 | { 974 | "cell_type": "code", 975 | "execution_count": 45, 976 | "metadata": {}, 977 | "outputs": [ 978 | { 979 | "name": "stdout", 980 | "output_type": "stream", 981 | "text": [ 982 | "train loss 0.435 val loss 0.456 and val accuracy 0.849\n", 983 | "train loss 0.275 val loss 0.250 and val accuracy 0.928\n", 984 | 
"train loss 0.258 val loss 0.224 and val accuracy 0.935\n", 985 | "train loss 0.250 val loss 0.217 and val accuracy 0.938\n", 986 | "train loss 0.205 val loss 0.183 and val accuracy 0.949\n", 987 | "train loss 0.286 val loss 0.246 and val accuracy 0.921\n", 988 | "train loss 0.212 val loss 0.184 and val accuracy 0.948\n", 989 | "train loss 0.178 val loss 0.148 and val accuracy 0.958\n" 990 | ] 991 | } 992 | ], 993 | "source": [ 994 | "train_loop(model, 0.01, train_dl_2, valid_dl_2, epochs=40)" 995 | ] 996 | }, 997 | { 998 | "cell_type": "code", 999 | "execution_count": 46, 1000 | "metadata": {}, 1001 | "outputs": [ 1002 | { 1003 | "name": "stdout", 1004 | "output_type": "stream", 1005 | "text": [ 1006 | "train loss 0.144 val loss 0.139 and val accuracy 0.962\n", 1007 | "train loss 0.133 val loss 0.131 and val accuracy 0.964\n", 1008 | "train loss 0.129 val loss 0.127 and val accuracy 0.966\n", 1009 | "train loss 0.126 val loss 0.124 and val accuracy 0.966\n", 1010 | "train loss 0.123 val loss 0.121 and val accuracy 0.967\n", 1011 | "train loss 0.120 val loss 0.118 and val accuracy 0.968\n", 1012 | "train loss 0.117 val loss 0.115 and val accuracy 0.969\n", 1013 | "train loss 0.115 val loss 0.113 and val accuracy 0.969\n", 1014 | "train loss 0.112 val loss 0.110 and val accuracy 0.970\n", 1015 | "train loss 0.111 val loss 0.108 and val accuracy 0.971\n" 1016 | ] 1017 | } 1018 | ], 1019 | "source": [ 1020 | "train_loop(model, 0.001, train_dl_2, valid_dl_2, epochs=50)" 1021 | ] 1022 | }, 1023 | { 1024 | "cell_type": "markdown", 1025 | "metadata": {}, 1026 | "source": [ 1027 | "## Lab\n", 1028 | "* Add dropout to the latest model. \n", 1029 | "* Change some of the hyper-parameters.\n", 1030 | "* Play with different learning rates." 
1031 | ] 1032 | }, 1033 | { 1034 | "cell_type": "markdown", 1035 | "metadata": {}, 1036 | "source": [ 1037 | "# References\n", 1038 | "This notebook is a modified version of this tutorial\n", 1039 | "http://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html. Here I implement vanilla RNNs." 1040 | ] 1041 | }, 1042 | { 1043 | "cell_type": "code", 1044 | "execution_count": null, 1045 | "metadata": {}, 1046 | "outputs": [], 1047 | "source": [] 1048 | } 1049 | ], 1050 | "metadata": { 1051 | "kernelspec": { 1052 | "display_name": "Python 3", 1053 | "language": "python", 1054 | "name": "python3" 1055 | }, 1056 | "language_info": { 1057 | "codemirror_mode": { 1058 | "name": "ipython", 1059 | "version": 3 1060 | }, 1061 | "file_extension": ".py", 1062 | "mimetype": "text/x-python", 1063 | "name": "python", 1064 | "nbconvert_exporter": "python", 1065 | "pygments_lexer": "ipython3", 1066 | "version": "3.7.3" 1067 | }, 1068 | "toc": { 1069 | "nav_menu": {}, 1070 | "number_sections": true, 1071 | "sideBar": true, 1072 | "skip_h1_title": false, 1073 | "toc_cell": false, 1074 | "toc_position": {}, 1075 | "toc_section_display": "block", 1076 | "toc_window_display": false 1077 | } 1078 | }, 1079 | "nbformat": 4, 1080 | "nbformat_minor": 2 1081 | } 1082 | -------------------------------------------------------------------------------- /lesson3-cbow.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "toc": true 7 | }, 8 | "source": [ 9 | "

Table of Contents

\n", 10 | "
" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# import pytorch libraries\n", 20 | "%matplotlib inline\n", 21 | "import torch \n", 22 | "import torch.autograd as autograd \n", 23 | "import torch.nn as nn \n", 24 | "import torch.nn.functional as F\n", 25 | "import torch.optim as optim\n", 26 | "from torch.utils.data import Dataset, DataLoader\n", 27 | "import numpy as np" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "from sklearn.model_selection import train_test_split" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "# CBOW model for text classification\n", 44 | "In this part of the tutorial we develop a continuous bag of words (CBOW) model for a text classification task described [here]( https://people.cs.umass.edu/~miyyer/pubs/2015_acl_dan.pdf). The CBOW model was first described [here](https://arxiv.org/pdf/1301.3781.pdf)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## Subjectivity Dataset\n", 52 | "The subjectivity dataset has 5000 subjective and 5000 objective processed sentences. To get the data:\n", 53 | "```\n", 54 | "wget http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz\n", 55 | "```" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 3, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "def unpack_dataset():\n", 65 | " ! wget http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz\n", 66 | " ! mkdir data\n", 67 | " ! 
tar -xvf rotten_imdb.tar.gz -C data" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 4, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "name": "stdout", 77 | "output_type": "stream", 78 | "text": [ 79 | "--2021-11-08 17:30:29-- http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz\n", 80 | "Resolving www.cs.cornell.edu (www.cs.cornell.edu)... 132.236.207.36\n", 81 | "Connecting to www.cs.cornell.edu (www.cs.cornell.edu)|132.236.207.36|:80... connected.\n", 82 | "HTTP request sent, awaiting response... 200 OK\n", 83 | "Length: 519599 (507K) [application/x-gzip]\n", 84 | "Saving to: ‘rotten_imdb.tar.gz.1’\n", 85 | "\n", 86 | "rotten_imdb.tar.gz. 100%[===================>] 507.42K 1008KB/s in 0.5s \n", 87 | "\n", 88 | "2021-11-08 17:30:30 (1008 KB/s) - ‘rotten_imdb.tar.gz.1’ saved [519599/519599]\n", 89 | "\n", 90 | "quote.tok.gt9.5000\n", 91 | "plot.tok.gt9.5000\n", 92 | "subjdata.README.1.0\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "unpack_dataset()" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 5, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "plot.tok.gt9.5000 quote.tok.gt9.5000 subjdata.README.1.0\r\n" 110 | ] 111 | } 112 | ], 113 | "source": [ 114 | "!ls data" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 6, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "the movie begins in the past where a young boy named sam attempts to save celebi from a hunter . \r\n", 127 | "emerging from the human psyche and showing characteristics of abstract expressionism , minimalism and russian constructivism , graffiti removal has secured its place in the history of modern art while being created by artists who are unconscious of their artistic achievements . \r\n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "! 
# We need each line in the file
def read_file(path):
    """Return the lines of the text file at `path` as a list.

    The file is decoded as ISO-8859-1 and each line keeps its trailing
    newline, if it has one.
    """
    with open(path, encoding="ISO-8859-1") as handle:
        return list(handle)
\\n'" 209 | ] 210 | }, 211 | "execution_count": 10, 212 | "metadata": {}, 213 | "output_type": "execute_result" 214 | } 215 | ], 216 | "source": [ 217 | "obj_lines[0]" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 11, 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "data": { 227 | "text/plain": [ 228 | "array(['the', 'movie', 'begins', 'in', 'the', 'past', 'where', 'a',\n", 229 | " 'young', 'boy', 'named', 'sam', 'attempts', 'to', 'save', 'celebi',\n", 230 | " 'from', 'a', 'hunter', '.'], dtype='\":0, \"UNK\":1} # init with padding and unknown\n", 569 | "words = [\"\", \"UNK\"]\n", 570 | "for word in word_count:\n", 571 | " vocab2index[word] = len(words)\n", 572 | " words.append(word)" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": 34, 578 | "metadata": {}, 579 | "outputs": [], 580 | "source": [ 581 | "#vocab2index" 582 | ] 583 | }, 584 | { 585 | "cell_type": "markdown", 586 | "metadata": {}, 587 | "source": [ 588 | "## Sentence encoding\n", 589 | "Here we encode each sentence as a sequence of indices corresponding to each word." 
590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": 35, 595 | "metadata": {}, 596 | "outputs": [], 597 | "source": [ 598 | "x_train_len = np.array([len(x.split()) for x in X_train])\n", 599 | "x_valid_len = np.array([len(x.split()) for x in X_valid])" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": 36, 605 | "metadata": {}, 606 | "outputs": [ 607 | { 608 | "data": { 609 | "text/plain": [ 610 | "55.0" 611 | ] 612 | }, 613 | "execution_count": 36, 614 | "metadata": {}, 615 | "output_type": "execute_result" 616 | } 617 | ], 618 | "source": [ 619 | "np.percentile(x_train_len, 99) # let's set the max sequence len to N=40" 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": 37, 625 | "metadata": {}, 626 | "outputs": [ 627 | { 628 | "data": { 629 | "text/plain": [ 630 | "'will god let her fall or give her a new path ?'" 631 | ] 632 | }, 633 | "execution_count": 37, 634 | "metadata": {}, 635 | "output_type": "execute_result" 636 | } 637 | ], 638 | "source": [ 639 | "X_train[0]" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": 38, 645 | "metadata": {}, 646 | "outputs": [ 647 | { 648 | "data": { 649 | "text/plain": [ 650 | "5" 651 | ] 652 | }, 653 | "execution_count": 38, 654 | "metadata": {}, 655 | "output_type": "execute_result" 656 | } 657 | ], 658 | "source": [ 659 | "# returns the index of the word or the index of \"UNK\" otherwise\n", 660 | "vocab2index.get(\"?\", vocab2index[\"UNK\"])" 661 | ] 662 | }, 663 | { 664 | "cell_type": "code", 665 | "execution_count": 39, 666 | "metadata": {}, 667 | "outputs": [ 668 | { 669 | "data": { 670 | "text/plain": [ 671 | "array([ 3, 2, 10, 9, 4, 8, 11, 9, 12, 6, 7, 5])" 672 | ] 673 | }, 674 | "execution_count": 39, 675 | "metadata": {}, 676 | "output_type": "execute_result" 677 | } 678 | ], 679 | "source": [ 680 | "np.array([vocab2index.get(w, vocab2index[\"UNK\"]) for w in X_train[0].split()])" 681 | ] 682 | }, 683 | { 684 | 
def encode_sentence(s, N=40, vocab=None):
    """Encode a whitespace-separated sentence as a fixed-length array of word ids.

    Args:
        s: sentence; tokens are obtained with ``s.split()``.
        N: length of the returned array. Longer sentences are truncated,
            shorter ones are right-padded with 0.
        vocab: mapping from word to integer id that contains the key
            ``"UNK"``. Defaults to the module-level ``vocab2index``, which
            keeps existing calls working unchanged.

    Returns:
        ``(enc, l)``: ``enc`` is an ``int32`` array of length ``N`` and ``l``
        is the number of real (non-padding) positions, at most ``N``.
    """
    if vocab is None:
        # Fall back to the notebook-wide vocabulary, as the original did.
        vocab = vocab2index
    enc = np.zeros(N, dtype=np.int32)
    # Words missing from the vocabulary share the "UNK" id.
    enc1 = np.array([vocab.get(w, vocab["UNK"]) for w in s.split()])
    l = min(N, len(enc1))
    enc[:l] = enc1[:l]
    return enc, l


class SubjectivityDataset(Dataset):
    """Dataset of sentences and labels, encoded on access.

    Each item is ``(encoded_sentence, label, length)`` where the first and
    last elements come from ``encode_sentence``.
    """

    def __init__(self, X, y, vocab=None):
        """Store the raw data.

        Args:
            X: indexable collection of sentences.
            y: indexable collection of labels, same length as ``X``.
            vocab: optional word-to-id mapping forwarded to
                ``encode_sentence``; ``None`` uses its default vocabulary.
        """
        self.x = X
        self.y = y
        self.vocab = vocab

    def __len__(self):
        """Return the number of labelled sentences."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(encoded_sentence, label, length)`` for item ``idx``."""
        x = self.x[idx]
        x, s = encode_sentence(x, vocab=self.vocab)
        return x, self.y[idx], s
762 | "cell_type": "code", 763 | "execution_count": 44, 764 | "metadata": {}, 765 | "outputs": [], 766 | "source": [ 767 | "x, y, s = next(iter(train_dl))" 768 | ] 769 | }, 770 | { 771 | "cell_type": "code", 772 | "execution_count": 45, 773 | "metadata": {}, 774 | "outputs": [ 775 | { 776 | "data": { 777 | "text/plain": [ 778 | "tensor([[ 702, 1083, 3740, 21, 1336, 36, 213, 214, 97, 2794, 19, 1638,\n", 779 | " 205, 12, 6, 1657, 2585, 59, 102, 698, 203, 702, 1, 22,\n", 780 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 781 | " 0, 0, 0, 0],\n", 782 | " [ 243, 3817, 1853, 304, 3279, 18, 2664, 203, 71, 1, 29, 275,\n", 783 | " 233, 1359, 28, 18, 1296, 79, 29, 153, 21, 163, 172, 3636,\n", 784 | " 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 785 | " 0, 0, 0, 0],\n", 786 | " [ 34, 3480, 498, 21, 1, 145, 1157, 35, 529, 36, 27, 71,\n", 787 | " 198, 162, 2092, 28, 1027, 8, 92, 1157, 35, 1281, 22, 0,\n", 788 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 789 | " 0, 0, 0, 0]], dtype=torch.int32)" 790 | ] 791 | }, 792 | "execution_count": 45, 793 | "metadata": {}, 794 | "output_type": "execute_result" 795 | } 796 | ], 797 | "source": [ 798 | "x" 799 | ] 800 | }, 801 | { 802 | "cell_type": "code", 803 | "execution_count": 46, 804 | "metadata": {}, 805 | "outputs": [ 806 | { 807 | "data": { 808 | "text/plain": [ 809 | "tensor([0., 1., 0.], dtype=torch.float64)" 810 | ] 811 | }, 812 | "execution_count": 46, 813 | "metadata": {}, 814 | "output_type": "execute_result" 815 | } 816 | ], 817 | "source": [ 818 | "y" 819 | ] 820 | }, 821 | { 822 | "cell_type": "code", 823 | "execution_count": 47, 824 | "metadata": {}, 825 | "outputs": [ 826 | { 827 | "data": { 828 | "text/plain": [ 829 | "tensor([24, 25, 23])" 830 | ] 831 | }, 832 | "execution_count": 47, 833 | "metadata": {}, 834 | "output_type": "execute_result" 835 | } 836 | ], 837 | "source": [ 838 | "# length of each vector\n", 839 | "s" 840 | ] 841 | }, 842 | { 843 | "cell_type": "markdown", 844 | "metadata": {}, 845 | "source": [ 846 | "## 
Embedding layer\n", 847 | "Most deep learning models use dense vectors of real numbers as the representation of words (word embeddings), as opposed to one-hot encoded representations. The module torch.nn.Embedding is used to represent word embeddings. It takes two arguments: the vocabulary size, and the dimensionality of the embeddings. The embeddings are initialized with random vectors. " 848 | ] 849 | }, 850 | { 851 | "cell_type": "code", 852 | "execution_count": 48, 853 | "metadata": {}, 854 | "outputs": [ 855 | { 856 | "data": { 857 | "text/plain": [ 858 | "Parameter containing:\n", 859 | "tensor([[ 0.0000, 0.0000, 0.0000, 0.0000],\n", 860 | " [ 0.1624, 0.2922, 0.1153, -0.1103],\n", 861 | " [-0.3021, -1.5938, -0.2059, -0.2419],\n", 862 | " [ 2.3811, 0.6239, -0.6555, 0.6966],\n", 863 | " [ 1.6653, -0.6109, 0.0929, 0.9278],\n", 864 | " [ 0.0951, 0.9867, 0.9146, 1.2108],\n", 865 | " [-1.4875, -0.7667, 0.9095, 0.6300],\n", 866 | " [ 0.7030, 0.0452, -0.5968, -0.0531],\n", 867 | " [-1.2330, -0.4856, 1.0943, 0.6714],\n", 868 | " [-0.5795, -1.9490, 1.2225, 0.3357]], requires_grad=True)" 869 | ] 870 | }, 871 | "execution_count": 48, 872 | "metadata": {}, 873 | "output_type": "execute_result" 874 | } 875 | ], 876 | "source": [ 877 | "# an Embedding module containing 10 words with embedding size 4\n", 878 | "# embedding will be initialized at random\n", 879 | "embed = nn.Embedding(10, 4, padding_idx=0)\n", 880 | "embed.weight" 881 | ] 882 | }, 883 | { 884 | "cell_type": "markdown", 885 | "metadata": {}, 886 | "source": [ 887 | "Note that the `padding_idx` has embedding vector 0." 
888 | ] 889 | }, 890 | { 891 | "cell_type": "code", 892 | "execution_count": 49, 893 | "metadata": {}, 894 | "outputs": [ 895 | { 896 | "data": { 897 | "text/plain": [ 898 | "tensor([[[ 0.1624, 0.2922, 0.1153, -0.1103],\n", 899 | " [ 1.6653, -0.6109, 0.0929, 0.9278],\n", 900 | " [ 0.1624, 0.2922, 0.1153, -0.1103],\n", 901 | " [ 0.0951, 0.9867, 0.9146, 1.2108],\n", 902 | " [ 0.1624, 0.2922, 0.1153, -0.1103],\n", 903 | " [ 0.0000, 0.0000, 0.0000, 0.0000]]], grad_fn=)" 904 | ] 905 | }, 906 | "execution_count": 49, 907 | "metadata": {}, 908 | "output_type": "execute_result" 909 | } 910 | ], 911 | "source": [ 912 | "# given a list of ids we can \"look up\" the embedding corresponding to each id\n", 913 | "# can you see that some vectors are the same?\n", 914 | "a = torch.LongTensor([[1,4,1,5,1,0]])\n", 915 | "embed(a)" 916 | ] 917 | }, 918 | { 919 | "cell_type": "markdown", 920 | "metadata": {}, 921 | "source": [ 922 | "This is the representation of a sentence whose words have indices [1,4,1,5,1], with one padding token at the end. Below is an example with two sentences: the first sentence has length 3 and the second sentence has length 2. To store both in a single tensor, we pad the end of the second sentence. " 923 | ] 924 | }, 925 | { 926 | "cell_type": "code", 927 | "execution_count": 50, 928 | "metadata": {}, 929 | "outputs": [], 930 | "source": [ 931 | "a = torch.LongTensor([[1,4,1], [1,3,0]])" 932 | ] 933 | }, 934 | { 935 | "cell_type": "markdown", 936 | "metadata": {}, 937 | "source": [ 938 | "Our model takes an average of the word embedding of each word. Here is how we do it." 
939 | ] 940 | }, 941 | { 942 | "cell_type": "code", 943 | "execution_count": 51, 944 | "metadata": {}, 945 | "outputs": [], 946 | "source": [ 947 | "s = torch.FloatTensor([3, 2]) # here is the size of the vector" 948 | ] 949 | }, 950 | { 951 | "cell_type": "code", 952 | "execution_count": 52, 953 | "metadata": {}, 954 | "outputs": [ 955 | { 956 | "data": { 957 | "text/plain": [ 958 | "tensor([[[ 0.1624, 0.2922, 0.1153, -0.1103],\n", 959 | " [ 1.6653, -0.6109, 0.0929, 0.9278],\n", 960 | " [ 0.1624, 0.2922, 0.1153, -0.1103]],\n", 961 | "\n", 962 | " [[ 0.1624, 0.2922, 0.1153, -0.1103],\n", 963 | " [ 2.3811, 0.6239, -0.6555, 0.6966],\n", 964 | " [ 0.0000, 0.0000, 0.0000, 0.0000]]], grad_fn=)" 965 | ] 966 | }, 967 | "execution_count": 52, 968 | "metadata": {}, 969 | "output_type": "execute_result" 970 | } 971 | ], 972 | "source": [ 973 | "embed(a)" 974 | ] 975 | }, 976 | { 977 | "cell_type": "code", 978 | "execution_count": 53, 979 | "metadata": {}, 980 | "outputs": [ 981 | { 982 | "data": { 983 | "text/plain": [ 984 | "tensor([[ 1.9900, -0.0265, 0.3234, 0.7071],\n", 985 | " [ 2.5435, 0.9161, -0.5402, 0.5863]], grad_fn=)" 986 | ] 987 | }, 988 | "execution_count": 53, 989 | "metadata": {}, 990 | "output_type": "execute_result" 991 | } 992 | ], 993 | "source": [ 994 | "embed(a).sum(dim=1)" 995 | ] 996 | }, 997 | { 998 | "cell_type": "code", 999 | "execution_count": 54, 1000 | "metadata": {}, 1001 | "outputs": [ 1002 | { 1003 | "data": { 1004 | "text/plain": [ 1005 | "tensor([[ 0.6633, -0.0088, 0.1078, 0.2357],\n", 1006 | " [ 1.2717, 0.4581, -0.2701, 0.2931]], grad_fn=)" 1007 | ] 1008 | }, 1009 | "execution_count": 54, 1010 | "metadata": {}, 1011 | "output_type": "execute_result" 1012 | } 1013 | ], 1014 | "source": [ 1015 | "sum_embs = embed(a).sum(dim=1) \n", 1016 | "sum_embs/ s.view(s.shape[0], 1)" 1017 | ] 1018 | }, 1019 | { 1020 | "cell_type": "markdown", 1021 | "metadata": {}, 1022 | "source": [ 1023 | "## Continuous Bag of Words Model" 1024 | ] 1025 | }, 1026 | { 
1027 | "cell_type": "code", 1028 | "execution_count": 55, 1029 | "metadata": {}, 1030 | "outputs": [], 1031 | "source": [ 1032 | "class CBOW(nn.Module):\n", 1033 | " def __init__(self, vocab_size, emb_size=100):\n", 1034 | " super(CBOW, self).__init__()\n", 1035 | " self.word_emb = nn.Embedding(vocab_size, emb_size, padding_idx=0)\n", 1036 | " self.linear1 = nn.Linear(emb_size, 30)\n", 1037 | " self.linear2 = nn.Linear(30, 1)\n", 1038 | " \n", 1039 | " def forward(self, x, s):\n", 1040 | " x = self.word_emb(x)\n", 1041 | " x = x.sum(dim=1)/ s.view(s.shape[0], 1)\n", 1042 | " x = self.linear1(x)\n", 1043 | " x = F.relu(x)\n", 1044 | " x = self.linear2(x)\n", 1045 | " return x" 1046 | ] 1047 | }, 1048 | { 1049 | "cell_type": "code", 1050 | "execution_count": 56, 1051 | "metadata": {}, 1052 | "outputs": [], 1053 | "source": [ 1054 | "model = CBOW(vocab_size=5, emb_size=3)" 1055 | ] 1056 | }, 1057 | { 1058 | "cell_type": "code", 1059 | "execution_count": 57, 1060 | "metadata": {}, 1061 | "outputs": [ 1062 | { 1063 | "data": { 1064 | "text/plain": [ 1065 | "Parameter containing:\n", 1066 | "tensor([[ 0.0000, 0.0000, 0.0000],\n", 1067 | " [-1.1687, 0.4324, -1.0500],\n", 1068 | " [-0.2847, 0.1846, -0.8403],\n", 1069 | " [-1.9271, -1.1778, -1.2009],\n", 1070 | " [ 0.4731, 0.9903, -1.2505]], requires_grad=True)" 1071 | ] 1072 | }, 1073 | "execution_count": 57, 1074 | "metadata": {}, 1075 | "output_type": "execute_result" 1076 | } 1077 | ], 1078 | "source": [ 1079 | "model.word_emb.weight" 1080 | ] 1081 | }, 1082 | { 1083 | "cell_type": "code", 1084 | "execution_count": 58, 1085 | "metadata": {}, 1086 | "outputs": [ 1087 | { 1088 | "data": { 1089 | "text/plain": [ 1090 | "tensor([[-0.0758],\n", 1091 | " [ 0.0683]], grad_fn=)" 1092 | ] 1093 | }, 1094 | "execution_count": 58, 1095 | "metadata": {}, 1096 | "output_type": "execute_result" 1097 | } 1098 | ], 1099 | "source": [ 1100 | "model(a, s)" 1101 | ] 1102 | }, 1103 | { 1104 | "cell_type": "markdown", 1105 | "metadata": {}, 
1106 | "source": [ 1107 | "# Training the CBOW model " 1108 | ] 1109 | }, 1110 | { 1111 | "cell_type": "code", 1112 | "execution_count": 59, 1113 | "metadata": {}, 1114 | "outputs": [ 1115 | { 1116 | "name": "stdout", 1117 | "output_type": "stream", 1118 | "text": [ 1119 | "4067\n" 1120 | ] 1121 | } 1122 | ], 1123 | "source": [ 1124 | "V = len(words)\n", 1125 | "model = CBOW(vocab_size=V, emb_size=50)\n", 1126 | "print(V)" 1127 | ] 1128 | }, 1129 | { 1130 | "cell_type": "code", 1131 | "execution_count": 60, 1132 | "metadata": {}, 1133 | "outputs": [], 1134 | "source": [ 1135 | "batch_size=500\n", 1136 | "train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)\n", 1137 | "valid_dl = DataLoader(valid_ds, batch_size=2000)" 1138 | ] 1139 | }, 1140 | { 1141 | "cell_type": "code", 1142 | "execution_count": 61, 1143 | "metadata": {}, 1144 | "outputs": [ 1145 | { 1146 | "data": { 1147 | "text/plain": [ 1148 | "2000" 1149 | ] 1150 | }, 1151 | "execution_count": 61, 1152 | "metadata": {}, 1153 | "output_type": "execute_result" 1154 | } 1155 | ], 1156 | "source": [ 1157 | "len(valid_ds)" 1158 | ] 1159 | }, 1160 | { 1161 | "cell_type": "code", 1162 | "execution_count": 62, 1163 | "metadata": {}, 1164 | "outputs": [], 1165 | "source": [ 1166 | "def test_metrics(model):\n", 1167 | " model.eval()\n", 1168 | " for x, y, s in valid_dl:\n", 1169 | " s = torch.FloatTensor(s.float()).view(s.shape[0], 1)\n", 1170 | " y = y.unsqueeze(1)\n", 1171 | " y_hat = model(x.long(), s)\n", 1172 | " loss = F.binary_cross_entropy_with_logits(y_hat, y)\n", 1173 | " y_pred = y_hat > 0\n", 1174 | " correct = (y_pred.float() == y).float().sum()\n", 1175 | " accuracy = correct/y_pred.shape[0]\n", 1176 | " return loss.item(), accuracy.item()" 1177 | ] 1178 | }, 1179 | { 1180 | "cell_type": "code", 1181 | "execution_count": 63, 1182 | "metadata": {}, 1183 | "outputs": [ 1184 | { 1185 | "data": { 1186 | "text/plain": [ 1187 | "(0.6949562772251665, 0.49399998784065247)" 1188 | ] 1189 | }, 1190 
| "execution_count": 63, 1191 | "metadata": {}, 1192 | "output_type": "execute_result" 1193 | } 1194 | ], 1195 | "source": [ 1196 | "# accuracy of a random model should be around 0.5\n", 1197 | "test_metrics(model)" 1198 | ] 1199 | }, 1200 | { 1201 | "cell_type": "code", 1202 | "execution_count": 64, 1203 | "metadata": {}, 1204 | "outputs": [], 1205 | "source": [ 1206 | "def train_epocs(model, epochs=10, lr=0.01, weight_decay=1e-5):\n", 1207 | " optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)\n", 1208 | " for i in range(epochs):\n", 1209 | " model.train()\n", 1210 | " for x, y, s in train_dl:\n", 1211 | " y = y.unsqueeze(1)\n", 1212 | " s = s.type(torch.Tensor).view(s.shape[0], 1)\n", 1213 | " y_hat = model(x.long(), s)\n", 1214 | " loss = F.binary_cross_entropy_with_logits(y_hat, y)\n", 1215 | " optimizer.zero_grad()\n", 1216 | " loss.backward()\n", 1217 | " optimizer.step()\n", 1218 | " val_loss, val_acc = test_metrics(model)\n", 1219 | " print(\"train loss %.3f val loss %.3f and val accuracy %.3f\" % (loss.item(), val_loss, val_acc))" 1220 | ] 1221 | }, 1222 | { 1223 | "cell_type": "code", 1224 | "execution_count": 65, 1225 | "metadata": {}, 1226 | "outputs": [ 1227 | { 1228 | "name": "stdout", 1229 | "output_type": "stream", 1230 | "text": [ 1231 | "train loss 0.505 val loss 0.483 and val accuracy 0.786\n", 1232 | "train loss 0.316 val loss 0.328 and val accuracy 0.868\n", 1233 | "train loss 0.223 val loss 0.284 and val accuracy 0.883\n", 1234 | "train loss 0.176 val loss 0.267 and val accuracy 0.894\n", 1235 | "train loss 0.123 val loss 0.283 and val accuracy 0.891\n", 1236 | "train loss 0.095 val loss 0.306 and val accuracy 0.888\n", 1237 | "train loss 0.052 val loss 0.326 and val accuracy 0.884\n", 1238 | "train loss 0.066 val loss 0.367 and val accuracy 0.879\n", 1239 | "train loss 0.033 val loss 0.401 and val accuracy 0.879\n", 1240 | "train loss 0.033 val loss 0.440 and val accuracy 0.878\n", 1241 | "train loss 0.021 val 
loss 0.487 and val accuracy 0.873\n", 1242 | "train loss 0.021 val loss 0.507 and val accuracy 0.871\n", 1243 | "train loss 0.034 val loss 0.543 and val accuracy 0.870\n", 1244 | "train loss 0.014 val loss 0.572 and val accuracy 0.870\n", 1245 | "train loss 0.008 val loss 0.590 and val accuracy 0.870\n" 1246 | ] 1247 | } 1248 | ], 1249 | "source": [ 1250 | "V = len(words)\n", 1251 | "model = CBOW(vocab_size=V, emb_size=50)\n", 1252 | "train_epocs(model, epochs=15)" 1253 | ] 1254 | }, 1255 | { 1256 | "cell_type": "markdown", 1257 | "metadata": {}, 1258 | "source": [ 1259 | "## Lab\n", 1260 | "* Apply this model to any text classification problem. Here are a couple of problems:\n", 1261 | " * https://ai.stanford.edu/~amaas/data/sentiment/ (sentiment classification)\n", 1262 | " * https://www.kaggle.com/yelp-dataset/yelp-dataset\n", 1263 | "* More challenging, modify the cbow model to decide if two sentences have the same intent.\n", 1264 | " * https://www.kaggle.com/c/quora-question-pairs" 1265 | ] 1266 | }, 1267 | { 1268 | "cell_type": "markdown", 1269 | "metadata": { 1270 | "collapsed": true 1271 | }, 1272 | "source": [ 1273 | "# References\n", 1274 | "* https://pytorch.org/docs/stable/index.html\n", 1275 | "* http://pytorch.org/tutorials/beginner/pytorch_with_examples.html\n", 1276 | "* https://hsaghir.github.io/data_science/pytorch_starter/" 1277 | ] 1278 | } 1279 | ], 1280 | "metadata": { 1281 | "kernelspec": { 1282 | "display_name": "Python 3 (ipykernel)", 1283 | "language": "python", 1284 | "name": "python3" 1285 | }, 1286 | "language_info": { 1287 | "codemirror_mode": { 1288 | "name": "ipython", 1289 | "version": 3 1290 | }, 1291 | "file_extension": ".py", 1292 | "mimetype": "text/x-python", 1293 | "name": "python", 1294 | "nbconvert_exporter": "python", 1295 | "pygments_lexer": "ipython3", 1296 | "version": "3.8.11" 1297 | }, 1298 | "nav_menu": {}, 1299 | "toc": { 1300 | "base_numbering": 1, 1301 | "nav_menu": { 1302 | "height": "116px", 1303 | "width": 
"251px" 1304 | }, 1305 | "number_sections": true, 1306 | "sideBar": true, 1307 | "skip_h1_title": false, 1308 | "title_cell": "Table of Contents", 1309 | "title_sidebar": "Contents", 1310 | "toc_cell": true, 1311 | "toc_position": {}, 1312 | "toc_section_display": "block", 1313 | "toc_window_display": false 1314 | }, 1315 | "widgets": { 1316 | "state": {}, 1317 | "version": "1.1.2" 1318 | } 1319 | }, 1320 | "nbformat": 4, 1321 | "nbformat_minor": 1 1322 | } 1323 | -------------------------------------------------------------------------------- /lesson4-seq2seq.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Sequence to Sequence\n", 8 | "In this notebook we will be teaching a neural network to translate from French to English." 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "This is made possible by the simple but powerful idea of the [sequence\n", 16 | "to sequence network](https://arxiv.org/abs/1409.3215), in which two\n", 17 | "recurrent neural networks work together to transform one sequence to\n", 18 | "another. 
An **encoder** network condenses an input sequence into a vector,\n", 19 | "and a **decoder** network unfolds that vector into a new sequence.\n", 20 | "\n", 21 | "![](imgs/seq2seq.png)" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "from __future__ import unicode_literals\n", 31 | "from io import open\n", 32 | "import unicodedata\n", 33 | "import string\n", 34 | "import re\n", 35 | "import random\n", 36 | "\n", 37 | "import torch\n", 38 | "import torch.nn as nn\n", 39 | "from torch import optim\n", 40 | "import torch.nn.functional as F\n", 41 | "from torch.utils.data import Dataset, DataLoader\n", 42 | "import numpy as np\n", 43 | "%matplotlib inline" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## Pre-processing data\n", 51 | "The data for this project is a set of many thousands of English to\n", 52 | "French translation pairs." 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 2, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "def download_dataset():\n", 62 | " ! wget https://download.pytorch.org/tutorial/data.zip\n", 63 | " ! unzip data.zip" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "name": "stdout", 73 | "output_type": "stream", 74 | "text": [ 75 | "--2019-09-25 13:59:39-- https://download.pytorch.org/tutorial/data.zip\n", 76 | "Resolving download.pytorch.org (download.pytorch.org)... 99.84.224.54, 99.84.224.36, 99.84.224.48, ...\n", 77 | "Connecting to download.pytorch.org (download.pytorch.org)|99.84.224.54|:443... connected.\n", 78 | "HTTP request sent, awaiting response... 
200 OK\n", 79 | "Length: 2882130 (2.7M) [application/zip]\n", 80 | "Saving to: ‘data.zip’\n", 81 | "\n", 82 | "data.zip 100%[===================>] 2.75M 5.90MB/s in 0.5s \n", 83 | "\n", 84 | "2019-09-25 13:59:40 (5.90 MB/s) - ‘data.zip’ saved [2882130/2882130]\n", 85 | "\n", 86 | "Archive: data.zip\n", 87 | " inflating: data/eng-fra.txt \n", 88 | " creating: data/names/\n", 89 | " inflating: data/names/Arabic.txt \n", 90 | " inflating: data/names/Chinese.txt \n", 91 | " inflating: data/names/Czech.txt \n", 92 | " inflating: data/names/Dutch.txt \n", 93 | " inflating: data/names/English.txt \n", 94 | " inflating: data/names/French.txt \n", 95 | " inflating: data/names/German.txt \n", 96 | " inflating: data/names/Greek.txt \n", 97 | " inflating: data/names/Irish.txt \n", 98 | " inflating: data/names/Italian.txt \n", 99 | " inflating: data/names/Japanese.txt \n", 100 | " inflating: data/names/Korean.txt \n", 101 | " inflating: data/names/Polish.txt \n", 102 | " inflating: data/names/Portuguese.txt \n", 103 | " inflating: data/names/Russian.txt \n", 104 | " inflating: data/names/Scottish.txt \n", 105 | " inflating: data/names/Spanish.txt \n", 106 | " inflating: data/names/Vietnamese.txt \n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "# to download the dataset\n", 112 | "download_dataset()" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "We'll need a unique index per word to use as the inputs and targets of\n", 120 | "the networks later. To keep track of all this we will use a helper class\n", 121 | "called ``Lang`` which has word → index (``word2index``) and index → word\n", 122 | "(``index2word``) dictionaries, as well as a count of each word\n", 123 | "``word2count`` to use to later replace rare words." 
124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 4, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "SOS_token = 1\n", 133 | "EOS_token = 2\n", 134 | "class Lang:\n", 135 | " def __init__(self, name):\n", 136 | " self.name = name\n", 137 | " self.word2index = {\"PAD\": 0, \"SOS\": 1, \"EOS\": 2, \"UNK\": 3}\n", 138 | " self.word2count = {}\n", 139 | " self.index2word = {0: \"PAD\", 1: \"SOS\", 2: \"EOS\", 3: \"UNK\"}\n", 140 | " self.n_words = 4 # Count SOS and EOS\n", 141 | "\n", 142 | " def addSentence(self, sentence):\n", 143 | " for word in sentence.split(' '):\n", 144 | " self.addWord(word)\n", 145 | "\n", 146 | " def addWord(self, word):\n", 147 | " if word not in self.word2index:\n", 148 | " self.word2index[word] = self.n_words\n", 149 | " self.word2count[word] = 1\n", 150 | " self.index2word[self.n_words] = word\n", 151 | " self.n_words += 1\n", 152 | " else:\n", 153 | " self.word2count[word] += 1" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "The files are all in Unicode, to simplify we will turn Unicode\n", 161 | "characters to ASCII, make everything lowercase, and trim most\n", 162 | "punctuation.\n", 163 | "\n", 164 | "\n" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 5, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "def unicodeToAscii(s):\n", 174 | " \"\"\"Turn a Unicode string to plain ASCII\n", 175 | " \n", 176 | " https://stackoverflow.com/a/518232/2809427\n", 177 | " \"\"\"\n", 178 | " return ''.join(c for c in unicodedata.normalize('NFD', s)\n", 179 | " if unicodedata.category(c) != 'Mn'\n", 180 | " )\n", 181 | "\n", 182 | "def normalizeString(s):\n", 183 | " \"\"\"Lowercase, trim, and remove non-letter characters\"\"\"\n", 184 | " s = unicodeToAscii(s.lower().strip())\n", 185 | " s = re.sub(r\"([.!?])\", r\" \\1\", s)\n", 186 | " s = re.sub(r\"[^a-zA-Z.!?]+\", r\" \", s)\n", 187 | " return s" 
188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 6, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "def readLangs(filename):\n", 197 | " # Read the file and split into lines\n", 198 | " lines = open(filename).read().strip().split('\\n')\n", 199 | "\n", 200 | " # Split every line into pairs and normalize\n", 201 | " pairs = [[normalizeString(s) for s in l.split('\\t')] for l in lines]\n", 202 | "\n", 203 | " return pairs" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 7, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "# filtering some of the data\n", 213 | "MAX_LENGTH = 15\n", 214 | "\n", 215 | "eng_prefixes = (\n", 216 | " \"i am \", \"i m \",\n", 217 | " \"he is\", \"he s \",\n", 218 | " \"she is\", \"she s \",\n", 219 | " \"you are\", \"you re \",\n", 220 | " \"we are\", \"we re \",\n", 221 | " \"they are\", \"they re \"\n", 222 | ")\n", 223 | "\n", 224 | "def filterPair(p):\n", 225 | " return len(p[0].split(' ')) <= MAX_LENGTH and \\\n", 226 | " len(p[1].split(' ')) <= MAX_LENGTH and \\\n", 227 | " p[0].startswith(eng_prefixes)\n", 228 | "\n", 229 | "\n", 230 | "def filterPairs(pairs):\n", 231 | " return [pair for pair in pairs if filterPair(pair)]" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "The full process for preparing the data is:\n", 239 | "\n", 240 | "- Read text file and split into lines, split lines into pairs\n", 241 | "- Normalize text, filter by length and content\n", 242 | "- Make word lists from sentences in pairs\n", 243 | "\n", 244 | "\n" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 8, 250 | "metadata": {}, 251 | "outputs": [ 252 | { 253 | "name": "stdout", 254 | "output_type": "stream", 255 | "text": [ 256 | "Read 135842 sentence pairs\n", 257 | "Trimmed to 12898 sentence pairs\n" 258 | ] 259 | } 260 | ], 261 | "source": [ 262 | "pairs = 
readLangs(\"data/eng-fra.txt\")\n", 263 | "print(\"Read %s sentence pairs\" % len(pairs))\n", 264 | "pairs = filterPairs(pairs)\n", 265 | "print(\"Trimmed to %s sentence pairs\" % len(pairs))" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 9, 271 | "metadata": {}, 272 | "outputs": [ 273 | { 274 | "name": "stdout", 275 | "output_type": "stream", 276 | "text": [ 277 | "Read 135842 sentence pairs\n", 278 | "Trimmed to 12898 sentence pairs\n", 279 | "number of test pairs: 300\n", 280 | "number of train pairs: 12598\n", 281 | "Counting words...\n", 282 | "Counted words:\n", 283 | "english 5070\n", 284 | "french 3331\n", 285 | "['he is too drunk to drive home .', 'il est trop saoul pour conduire jusque chez lui .']\n" 286 | ] 287 | } 288 | ], 289 | "source": [ 290 | "def prepareData(data_filename):\n", 291 | " pairs = readLangs(data_filename)\n", 292 | " print(\"Read %s sentence pairs\" % len(pairs))\n", 293 | " pairs = filterPairs(pairs)\n", 294 | " print(\"Trimmed to %s sentence pairs\" % len(pairs))\n", 295 | " \n", 296 | " \n", 297 | " #randomize the data with a fixed seed for repeatability\n", 298 | " random.seed(4)\n", 299 | " random.shuffle(pairs)\n", 300 | " #choose the first 10 pairs for testing and the rest for training\n", 301 | " valid_pairs = pairs[0:300]\n", 302 | " train_pairs = pairs[300:len(pairs)]\n", 303 | " \n", 304 | " print(\"number of test pairs: %s\" % len(valid_pairs))\n", 305 | " print(\"number of train pairs: %s\" % len(train_pairs))\n", 306 | " \n", 307 | " input_lang = Lang(\"english\")\n", 308 | " output_lang = Lang(\"french\")\n", 309 | " \n", 310 | " print(\"Counting words...\")\n", 311 | " cnt = 0\n", 312 | " for pair in pairs:\n", 313 | " input_lang.addSentence(pair[1])\n", 314 | " output_lang.addSentence(pair[0])\n", 315 | " \n", 316 | " print(\"Counted words:\")\n", 317 | " print(input_lang.name, input_lang.n_words)\n", 318 | " print(output_lang.name, output_lang.n_words)\n", 319 | " return input_lang, 
output_lang, pairs, train_pairs, valid_pairs\n", 320 | "\n", 321 | "input_lang, output_lang, pairs, train_pairs, valid_pairs = prepareData(\"data/eng-fra.txt\")\n", 322 | "random.seed(4)\n", 323 | "print(random.choice(pairs))" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 10, 329 | "metadata": {}, 330 | "outputs": [ 331 | { 332 | "data": { 333 | "text/plain": [ 334 | "['he is a tennis player .', 'c est un joueur de tennis .']" 335 | ] 336 | }, 337 | "execution_count": 10, 338 | "metadata": {}, 339 | "output_type": "execute_result" 340 | } 341 | ], 342 | "source": [ 343 | "train_pairs[0]" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": {}, 349 | "source": [ 350 | "# Dataset" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 11, 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "def encode_sentence(s, vocab2index, N=MAX_LENGTH + 2, padding_start=True):\n", 360 | " enc = np.zeros(N, dtype=np.int32)\n", 361 | " enc1 = np.array([SOS_token] + [vocab2index.get(w, vocab2index[\"UNK\"]) for w in s.split()] + [EOS_token])\n", 362 | " l = min(N, len(enc1))\n", 363 | " if padding_start:\n", 364 | " enc[:l] = enc1[:l]\n", 365 | " else:\n", 366 | " enc[N-l:] = enc1[:l]\n", 367 | " return enc, l" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 12, 373 | "metadata": {}, 374 | "outputs": [ 375 | { 376 | "data": { 377 | "text/plain": [ 378 | "['he is a tennis player .', 'c est un joueur de tennis .']" 379 | ] 380 | }, 381 | "execution_count": 12, 382 | "metadata": {}, 383 | "output_type": "execute_result" 384 | } 385 | ], 386 | "source": [ 387 | "train_pairs[0]" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": 13, 393 | "metadata": {}, 394 | "outputs": [ 395 | { 396 | "data": { 397 | "text/plain": [ 398 | "(array([ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 3, 79,\n", 399 | " 554, 3, 11, 2], dtype=int32), 8)" 400 | ] 401 | }, 402 | 
"execution_count": 13, 403 | "metadata": {}, 404 | "output_type": "execute_result" 405 | } 406 | ], 407 | "source": [ 408 | "encode_sentence(train_pairs[0][0], input_lang.word2index, padding_start=False)" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 14, 414 | "metadata": {}, 415 | "outputs": [ 416 | { 417 | "data": { 418 | "text/plain": [ 419 | "(array([ 1, 3, 3, 3, 3, 3, 499, 11, 2, 0, 0, 0, 0,\n", 420 | " 0, 0, 0, 0], dtype=int32), 9)" 421 | ] 422 | }, 423 | "execution_count": 14, 424 | "metadata": {}, 425 | "output_type": "execute_result" 426 | } 427 | ], 428 | "source": [ 429 | "encode_sentence(train_pairs[0][1], output_lang.word2index)" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 15, 435 | "metadata": {}, 436 | "outputs": [], 437 | "source": [ 438 | "class PairDataset(Dataset):\n", 439 | " def __init__(self, pairs, input_lang, output_lang):\n", 440 | " self.pairs = pairs\n", 441 | " self.input_word2index = input_lang.word2index\n", 442 | " self.output_word2index = output_lang.word2index\n", 443 | " \n", 444 | " def __len__(self):\n", 445 | " return len(self.pairs)\n", 446 | " \n", 447 | " def __getitem__(self, idx):\n", 448 | " x, n_x = encode_sentence(self.pairs[idx][1], self.input_word2index, padding_start=False)\n", 449 | " y, n_y = encode_sentence(self.pairs[idx][0], self.output_word2index)\n", 450 | " return x, y\n", 451 | " \n", 452 | "train_ds = PairDataset(train_pairs, input_lang, output_lang)\n", 453 | "valid_ds = PairDataset(valid_pairs, input_lang, output_lang)" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": 16, 459 | "metadata": {}, 460 | "outputs": [ 461 | { 462 | "data": { 463 | "text/plain": [ 464 | "(array([ 0, 0, 0, 0, 0, 0, 0, 0, 1, 44, 45, 97, 553,\n", 465 | " 16, 554, 11, 2], dtype=int32),\n", 466 | " array([ 1, 90, 38, 39, 499, 500, 11, 2, 0, 0, 0, 0, 0,\n", 467 | " 0, 0, 0, 0], dtype=int32))" 468 | ] 469 | }, 470 | "execution_count": 16, 471 | 
"metadata": {}, 472 | "output_type": "execute_result" 473 | } 474 | ], 475 | "source": [ 476 | "train_ds[0]" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 17, 482 | "metadata": {}, 483 | "outputs": [], 484 | "source": [ 485 | "batch_size=5\n", 486 | "train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)\n", 487 | "valid_dl = DataLoader(valid_ds, batch_size=batch_size)" 488 | ] 489 | }, 490 | { 491 | "cell_type": "markdown", 492 | "metadata": {}, 493 | "source": [ 494 | "## The Seq2Seq Model\n", 495 | "\n", 496 | "A Recurrent Neural Network, or RNN, is a network that operates on a\n", 497 | "sequence and uses its own output as input for subsequent steps.\n", 498 | "\n", 499 | "A [Sequence to Sequence network](https://arxiv.org/abs/1409.3215), or\n", 500 | "seq2seq network, or Encoder Decoder\n", 501 | "network, is a model\n", 502 | "consisting of two RNNs called the encoder and decoder. The encoder reads\n", 503 | "an input sequence and outputs a single vector, and the decoder reads\n", 504 | "that vector to produce an output sequence." 505 | ] 506 | }, 507 | { 508 | "cell_type": "markdown", 509 | "metadata": {}, 510 | "source": [ 511 | "### The Encoder\n", 512 | "\n", 513 | "The encoder of a seq2seq network is a RNN that outputs some value for\n", 514 | "every word from the input sentence. 
For every input word the encoder\n", 515 | "outputs a vector and a hidden state, and uses the hidden state for the\n", 516 | "next input word.\n", 517 | "\n", 518 | "![](imgs/encoder-network.png)" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": 19, 524 | "metadata": {}, 525 | "outputs": [], 526 | "source": [ 527 | "class EncoderRNN(nn.Module):\n", 528 | " def __init__(self, input_size, hidden_size):\n", 529 | " super(EncoderRNN, self).__init__()\n", 530 | " self.hidden_size = hidden_size\n", 531 | "\n", 532 | " self.embedding = nn.Embedding(input_size, hidden_size, padding_idx=0)\n", 533 | " self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)\n", 534 | " self.dropout = nn.Dropout(0.3)\n", 535 | "\n", 536 | " def forward(self, x):\n", 537 | " x = self.embedding(x)\n", 538 | " x = self.dropout(x)\n", 539 | " output, hidden = self.gru(x)\n", 540 | " return output, hidden" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": 20, 546 | "metadata": {}, 547 | "outputs": [], 548 | "source": [ 549 | "x, y = next(iter(train_dl))" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 21, 555 | "metadata": {}, 556 | "outputs": [ 557 | { 558 | "data": { 559 | "text/plain": [ 560 | "(tensor([[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 184,\n", 561 | " 524, 272, 904, 11, 2],\n", 562 | " [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 22, 23,\n", 563 | " 24, 25, 612, 11, 2],\n", 564 | " [ 0, 0, 0, 0, 0, 0, 0, 0, 1, 22, 24, 1691,\n", 565 | " 530, 332, 1738, 11, 2],\n", 566 | " [ 0, 0, 0, 0, 0, 0, 0, 0, 1, 22, 178, 42,\n", 567 | " 24, 25, 3498, 11, 2],\n", 568 | " [ 0, 0, 0, 0, 0, 0, 1, 22, 24, 39, 16, 57,\n", 569 | " 72, 1774, 1175, 11, 2]], dtype=torch.int32),\n", 570 | " tensor([[ 1, 4, 38, 33, 473, 86, 773, 11, 2, 0, 0, 0,\n", 571 | " 0, 0, 0, 0, 0],\n", 572 | " [ 1, 17, 18, 23, 2317, 11, 2, 0, 0, 0, 0, 0,\n", 573 | " 0, 0, 0, 0, 0],\n", 574 | " [ 1, 17, 64, 472, 15, 1373, 1408, 11, 2, 0, 0, 0,\n", 575 | " 0, 0, 0, 0, 
0],\n", 576 | " [ 1, 17, 18, 23, 2571, 55, 11, 2, 0, 0, 0, 0,\n", 577 | " 0, 0, 0, 0, 0],\n", 578 | " [ 1, 17, 18, 36, 17, 439, 207, 28, 11, 2, 0, 0,\n", 579 | " 0, 0, 0, 0, 0]], dtype=torch.int32))" 580 | ] 581 | }, 582 | "execution_count": 21, 583 | "metadata": {}, 584 | "output_type": "execute_result" 585 | } 586 | ], 587 | "source": [ 588 | "x, y" 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": 22, 594 | "metadata": {}, 595 | "outputs": [], 596 | "source": [ 597 | "input_size = input_lang.n_words\n", 598 | "hidden_size = 300\n", 599 | "encoder = EncoderRNN(input_size, hidden_size)" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": 23, 605 | "metadata": {}, 606 | "outputs": [], 607 | "source": [ 608 | "enc_outputs, enc_hidden = encoder(x.long())" 609 | ] 610 | }, 611 | { 612 | "cell_type": "code", 613 | "execution_count": 24, 614 | "metadata": {}, 615 | "outputs": [ 616 | { 617 | "data": { 618 | "text/plain": [ 619 | "(torch.Size([5, 17, 300]), torch.Size([1, 5, 300]))" 620 | ] 621 | }, 622 | "execution_count": 24, 623 | "metadata": {}, 624 | "output_type": "execute_result" 625 | } 626 | ], 627 | "source": [ 628 | "enc_outputs.shape, enc_hidden.shape" 629 | ] 630 | }, 631 | { 632 | "cell_type": "markdown", 633 | "metadata": {}, 634 | "source": [ 635 | "The Decoder\n", 636 | "-----------" 637 | ] 638 | }, 639 | { 640 | "cell_type": "code", 641 | "execution_count": 25, 642 | "metadata": {}, 643 | "outputs": [], 644 | "source": [ 645 | "class DecoderRNN(nn.Module):\n", 646 | " def __init__(self, output_size, hidden_size):\n", 647 | " super(DecoderRNN, self).__init__()\n", 648 | "\n", 649 | " self.embedding = nn.Embedding(output_size, hidden_size, padding_idx=0)\n", 650 | " self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)\n", 651 | " self.out = nn.Linear(hidden_size, output_size)\n", 652 | " self.dropout = nn.Dropout(0.3)\n", 653 | "\n", 654 | " def forward(self, x, hidden):\n", 655 | " embedded = 
self.embedding(x)\n", 656 | " embedded = self.dropout(embedded)\n", 657 | " output, hidden = self.gru(embedded, hidden)\n", 658 | " output = self.out(hidden[-1])\n", 659 | " return output, hidden" 660 | ] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "execution_count": 26, 665 | "metadata": {}, 666 | "outputs": [], 667 | "source": [ 668 | "output_size = output_lang.n_words\n", 669 | "hidden_size = 300" 670 | ] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "execution_count": 27, 675 | "metadata": {}, 676 | "outputs": [ 677 | { 678 | "data": { 679 | "text/plain": [ 680 | "torch.Size([5, 1])" 681 | ] 682 | }, 683 | "execution_count": 27, 684 | "metadata": {}, 685 | "output_type": "execute_result" 686 | } 687 | ], 688 | "source": [ 689 | "batch_size = y.size(0)\n", 690 | "decoder_input = SOS_token*torch.ones(batch_size,1).long()\n", 691 | "decoder_input.shape" 692 | ] 693 | }, 694 | { 695 | "cell_type": "code", 696 | "execution_count": 28, 697 | "metadata": {}, 698 | "outputs": [], 699 | "source": [ 700 | "decoder = DecoderRNN(output_size, hidden_size)" 701 | ] 702 | }, 703 | { 704 | "cell_type": "code", 705 | "execution_count": 29, 706 | "metadata": {}, 707 | "outputs": [], 708 | "source": [ 709 | "output, hidden = decoder(decoder_input, enc_hidden)" 710 | ] 711 | }, 712 | { 713 | "cell_type": "code", 714 | "execution_count": 30, 715 | "metadata": {}, 716 | "outputs": [ 717 | { 718 | "data": { 719 | "text/plain": [ 720 | "(torch.Size([1, 5, 300]), torch.Size([5, 3331]))" 721 | ] 722 | }, 723 | "execution_count": 30, 724 | "metadata": {}, 725 | "output_type": "execute_result" 726 | } 727 | ], 728 | "source": [ 729 | "hidden.shape, output.shape" 730 | ] 731 | }, 732 | { 733 | "cell_type": "markdown", 734 | "metadata": {}, 735 | "source": [ 736 | "Training\n", 737 | "========" 738 | ] 739 | }, 740 | { 741 | "cell_type": "code", 742 | "execution_count": 31, 743 | "metadata": {}, 744 | "outputs": [], 745 | "source": [ 746 | "def train_batch(x, y, encoder, decoder, 
encoder_optimizer, decoder_optimizer,\n", 747 | " teacher_forcing_ratio=0.5):\n", 748 | "\n", 749 | " encoder_optimizer.zero_grad()\n", 750 | " decoder_optimizer.zero_grad()\n", 751 | " \n", 752 | " batch_size = y.size(0)\n", 753 | " target_length = y.size(1)\n", 754 | "\n", 755 | " enc_outputs, enc_hidden = encoder(x)\n", 756 | "\n", 757 | " loss = 0\n", 758 | " dec_input = y[:,0].unsqueeze(1) # allways SOS\n", 759 | " hidden = enc_hidden\n", 760 | "\n", 761 | " use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False\n", 762 | "\n", 763 | " for di in range(1, target_length):\n", 764 | " output, hidden = decoder(dec_input, hidden)\n", 765 | " yi = y[:, di]\n", 766 | " if (yi>0).sum() > 0:\n", 767 | " # ignoring padding\n", 768 | " loss += F.cross_entropy(output, yi, ignore_index = 0, reduction=\"sum\")/(yi>0).sum()\n", 769 | " if use_teacher_forcing:\n", 770 | " dec_input = y[:, di].unsqueeze(1) # Teacher forcing: Feed the target as the next input\n", 771 | " else: \n", 772 | " dec_input = output.argmax(dim=1).unsqueeze(1).detach()\n", 773 | "\n", 774 | " loss.backward()\n", 775 | "\n", 776 | " encoder_optimizer.step()\n", 777 | " decoder_optimizer.step()\n", 778 | "\n", 779 | " return loss.item()" 780 | ] 781 | }, 782 | { 783 | "cell_type": "code", 784 | "execution_count": 32, 785 | "metadata": {}, 786 | "outputs": [], 787 | "source": [ 788 | "def train(encoder, decoder, enc_optimizer, dec_optimizer, epochs = 10,\n", 789 | " teacher_forcing_ratio=0.5):\n", 790 | " for i in range(epochs):\n", 791 | " total_loss = 0\n", 792 | " total = 0\n", 793 | " encoder.train()\n", 794 | " decoder.train()\n", 795 | " for x, y in train_dl:\n", 796 | " x = x.long().cuda()\n", 797 | " y = y.long().cuda()\n", 798 | " loss = train_batch(x, y, encoder, decoder, enc_optimizer, dec_optimizer,\n", 799 | " teacher_forcing_ratio)\n", 800 | " total_loss = loss*x.size(0)\n", 801 | " total += x.size(0)\n", 802 | " if i%10 == 0:\n", 803 | " print(\"train loss %.3f\" % 
(total_loss / total)) " 804 | ] 805 | }, 806 | { 807 | "cell_type": "code", 808 | "execution_count": 33, 809 | "metadata": {}, 810 | "outputs": [], 811 | "source": [ 812 | "input_size = input_lang.n_words\n", 813 | "output_size = output_lang.n_words\n", 814 | "hidden_size = 300\n", 815 | "encoder = EncoderRNN(input_size, hidden_size).cuda()\n", 816 | "decoder = DecoderRNN(output_size, hidden_size).cuda()\n", 817 | "enc_optimizer = optim.Adam(encoder.parameters(), lr=0.01)\n", 818 | "dec_optimizer = optim.Adam(decoder.parameters(), lr=0.01) " 819 | ] 820 | }, 821 | { 822 | "cell_type": "code", 823 | "execution_count": 34, 824 | "metadata": {}, 825 | "outputs": [], 826 | "source": [ 827 | "batch_size= 1000\n", 828 | "train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)\n", 829 | "valid_dl = DataLoader(valid_ds, batch_size=batch_size)" 830 | ] 831 | }, 832 | { 833 | "cell_type": "code", 834 | "execution_count": 35, 835 | "metadata": {}, 836 | "outputs": [ 837 | { 838 | "name": "stdout", 839 | "output_type": "stream", 840 | "text": [ 841 | "train loss 2.503\n", 842 | "train loss 1.864\n" 843 | ] 844 | } 845 | ], 846 | "source": [ 847 | "train(encoder, decoder, enc_optimizer, dec_optimizer, epochs = 20)" 848 | ] 849 | }, 850 | { 851 | "cell_type": "code", 852 | "execution_count": 36, 853 | "metadata": {}, 854 | "outputs": [ 855 | { 856 | "name": "stdout", 857 | "output_type": "stream", 858 | "text": [ 859 | "train loss 1.100\n", 860 | "train loss 0.926\n", 861 | "train loss 0.326\n", 862 | "train loss 0.304\n" 863 | ] 864 | } 865 | ], 866 | "source": [ 867 | "enc_optimizer = optim.Adam(encoder.parameters(), lr=0.001)\n", 868 | "dec_optimizer = optim.Adam(decoder.parameters(), lr=0.001) \n", 869 | "train(encoder, decoder, enc_optimizer, dec_optimizer, epochs = 40)" 870 | ] 871 | }, 872 | { 873 | "cell_type": "code", 874 | "execution_count": 37, 875 | "metadata": {}, 876 | "outputs": [ 877 | { 878 | "name": "stdout", 879 | "output_type": "stream", 880 | 
"text": [ 881 | "train loss 0.776\n", 882 | "train loss 0.932\n", 883 | "train loss 0.635\n", 884 | "train loss 0.745\n", 885 | "train loss 0.689\n", 886 | "train loss 0.714\n", 887 | "train loss 0.688\n", 888 | "train loss 0.653\n", 889 | "train loss 0.633\n", 890 | "train loss 0.610\n", 891 | "train loss 0.473\n", 892 | "train loss 0.599\n", 893 | "train loss 0.636\n", 894 | "train loss 0.469\n", 895 | "train loss 0.506\n", 896 | "train loss 0.443\n", 897 | "train loss 0.463\n", 898 | "train loss 0.452\n", 899 | "train loss 0.502\n", 900 | "train loss 0.471\n", 901 | "train loss 0.489\n", 902 | "train loss 0.489\n", 903 | "train loss 0.370\n", 904 | "train loss 0.489\n", 905 | "train loss 0.451\n", 906 | "train loss 0.365\n", 907 | "train loss 0.435\n", 908 | "train loss 0.461\n", 909 | "train loss 0.472\n", 910 | "train loss 0.450\n" 911 | ] 912 | } 913 | ], 914 | "source": [ 915 | "train(encoder, decoder, enc_optimizer, dec_optimizer, epochs = 300, teacher_forcing_ratio=0.0)" 916 | ] 917 | }, 918 | { 919 | "cell_type": "code", 920 | "execution_count": 38, 921 | "metadata": {}, 922 | "outputs": [ 923 | { 924 | "name": "stdout", 925 | "output_type": "stream", 926 | "text": [ 927 | "train loss 0.349\n", 928 | "train loss 0.350\n", 929 | "train loss 0.283\n", 930 | "train loss 0.422\n", 931 | "train loss 0.353\n", 932 | "train loss 0.360\n", 933 | "train loss 0.236\n", 934 | "train loss 0.273\n", 935 | "train loss 0.309\n", 936 | "train loss 0.217\n", 937 | "train loss 0.269\n", 938 | "train loss 0.384\n", 939 | "train loss 0.345\n", 940 | "train loss 0.428\n", 941 | "train loss 0.254\n", 942 | "train loss 0.381\n", 943 | "train loss 0.256\n", 944 | "train loss 0.276\n", 945 | "train loss 0.202\n", 946 | "train loss 0.367\n", 947 | "train loss 0.254\n", 948 | "train loss 0.225\n", 949 | "train loss 0.228\n", 950 | "train loss 0.242\n", 951 | "train loss 0.203\n", 952 | "train loss 0.259\n", 953 | "train loss 0.211\n", 954 | "train loss 0.213\n", 955 | "train loss 
0.244\n", 956 | "train loss 0.189\n" 957 | ] 958 | } 959 | ], 960 | "source": [ 961 | "train(encoder, decoder, enc_optimizer, dec_optimizer, epochs = 300, teacher_forcing_ratio=0.0)" 962 | ] 963 | }, 964 | { 965 | "cell_type": "markdown", 966 | "metadata": {}, 967 | "source": [ 968 | "Evaluation\n", 969 | "==========\n", 970 | "\n", 971 | "Evaluation is mostly the same as training, but the targets are never fed to the\n", 972 | "decoder: we simply feed the decoder's predictions back to itself for each step.\n", 973 | "Every predicted word is collected; decoding always runs until `max_length`\n", 974 | "(there is no early stop at EOS), and tokens with index 3 or below are dropped when printing.\n", 975 | "This decoder has no attention, so there are no attention outputs to store.\n", 976 | "\n", 977 | "\n" 978 | ] 979 | }, 980 | { 981 | "cell_type": "markdown", 982 | "metadata": {}, 983 | "source": [ 984 | "* `model.eval()` notifies all your layers that you are in eval mode, so that batchnorm or dropout layers work in eval mode instead of training mode.\n", 985 | "* `torch.no_grad()` deactivates the autograd engine. It reduces memory usage and speeds up computations, but you won’t be able to backprop (which you don’t want in an eval script)." 
986 | ] 987 | }, 988 | { 989 | "cell_type": "code", 990 | "execution_count": 39, 991 | "metadata": {}, 992 | "outputs": [], 993 | "source": [ 994 | "def decoding(x, y, encoder, decoder, max_length=MAX_LENGTH+2):\n", 995 | " decoder.eval()\n", 996 | " loss = 0\n", 997 | " with torch.no_grad(): \n", 998 | " batch_size = x.size(0)\n", 999 | " enc_outputs, hidden = encoder(x)\n", 1000 | " dec_input = SOS_token*torch.ones(batch_size, 1).long().cuda() # SOS\n", 1001 | " decoded_words = []\n", 1002 | " for di in range(1, max_length):\n", 1003 | " output, hidden = decoder(dec_input, hidden)\n", 1004 | " pred = output.argmax(dim=1)\n", 1005 | " decoded_words.append(pred.cpu().numpy())\n", 1006 | " dec_input = output.argmax(dim=1).unsqueeze(1).detach()\n", 1007 | " yi = y[:, di]\n", 1008 | " if (yi>0).sum() > 0:\n", 1009 | " # ignoring padding\n", 1010 | " loss += F.cross_entropy(\n", 1011 | " output, yi, ignore_index = 0, reduction=\"sum\")/(yi>0).sum()\n", 1012 | " return loss.item()/batch_size, np.transpose(decoded_words)" 1013 | ] 1014 | }, 1015 | { 1016 | "cell_type": "code", 1017 | "execution_count": 40, 1018 | "metadata": {}, 1019 | "outputs": [ 1020 | { 1021 | "data": { 1022 | "text/plain": [ 1023 | "0.13901766459147136" 1024 | ] 1025 | }, 1026 | "execution_count": 40, 1027 | "metadata": {}, 1028 | "output_type": "execute_result" 1029 | } 1030 | ], 1031 | "source": [ 1032 | "batch_size=300\n", 1033 | "valid_dl_2 = DataLoader(valid_ds, batch_size=batch_size, shuffle=True)\n", 1034 | "\n", 1035 | "x, y = next(iter(valid_dl_2)) \n", 1036 | "x = x.long().cuda()\n", 1037 | "y = y.long().cuda()\n", 1038 | "\n", 1039 | "loss, _ = decoding(x, y, encoder, decoder)\n", 1040 | "loss" 1041 | ] 1042 | }, 1043 | { 1044 | "cell_type": "code", 1045 | "execution_count": 41, 1046 | "metadata": {}, 1047 | "outputs": [], 1048 | "source": [ 1049 | "batch_size=10\n", 1050 | "train_dl_2 = DataLoader(train_ds, batch_size=batch_size, shuffle=True)\n", 1051 | "\n", 1052 | "x, y = 
next(iter(train_dl_2)) \n", 1053 | "x = x.long().cuda()\n", 1054 | "y = y.long().cuda()" 1055 | ] 1056 | }, 1057 | { 1058 | "cell_type": "markdown", 1059 | "metadata": {}, 1060 | "source": [ 1061 | "We can evaluate random sentences from the training set and print out the\n", 1062 | "input, target, and output to make some subjective quality judgements:\n", 1063 | "\n", 1064 | "\n" 1065 | ] 1066 | }, 1067 | { 1068 | "cell_type": "code", 1069 | "execution_count": 42, 1070 | "metadata": {}, 1071 | "outputs": [], 1072 | "source": [ 1073 | "def print_results(x, y, encoder, decoder):\n", 1074 | " _, decoded_words = decoding(x, y, encoder, decoder)\n", 1075 | " for i in range(x.shape[0]):\n", 1076 | " xi = x[i].cpu().numpy()\n", 1077 | " yi = y[i].cpu().numpy()\n", 1078 | " y_hat = decoded_words[i]\n", 1079 | " x_sent = ' '.join([input_lang.index2word[t] for t in xi if t > 3])\n", 1080 | " y_sent = ' '.join([output_lang.index2word[t] for t in yi if t > 3])\n", 1081 | " y_hat_sent = ' '.join([output_lang.index2word[t] for t in y_hat if t > 3])\n", 1082 | " print('>', x_sent)\n", 1083 | " print('=', y_sent)\n", 1084 | " print('<', y_hat_sent)\n", 1085 | " print('')" 1086 | ] 1087 | }, 1088 | { 1089 | "cell_type": "code", 1090 | "execution_count": 43, 1091 | "metadata": { 1092 | "scrolled": true 1093 | }, 1094 | "outputs": [ 1095 | { 1096 | "name": "stdout", 1097 | "output_type": "stream", 1098 | "text": [ 1099 | "> je suis prete a tout faire pour toi .\n", 1100 | "= i am ready to do anything for you .\n", 1101 | "< i m ready to do anything for you .\n", 1102 | "\n", 1103 | "> ils attendent tous .\n", 1104 | "= they re all waiting .\n", 1105 | "< they re all waiting .\n", 1106 | "\n", 1107 | "> ils sont semblables .\n", 1108 | "= they re similar .\n", 1109 | "< they re similar .\n", 1110 | "\n", 1111 | "> je suis vraiment desolee .\n", 1112 | "= i am truly sorry .\n", 1113 | "< i m truly sorry .\n", 1114 | "\n", 1115 | "> ce n est pas un saint .\n", 1116 | "= he s no saint 
.\n", 1117 | "< he s no saint .\n", 1118 | "\n", 1119 | "> il est ce qu on appelle un homme d action .\n", 1120 | "= he is what is called a man of action .\n", 1121 | "< he is what clever called a man of action .\n", 1122 | "\n", 1123 | "> nous voyageons a petit budget .\n", 1124 | "= we are traveling on a tight budget .\n", 1125 | "< we are traveling on a tight budget .\n", 1126 | "\n", 1127 | "> vous etes sans pitie .\n", 1128 | "= you re ruthless .\n", 1129 | "< you re ruthless .\n", 1130 | "\n", 1131 | "> tu es celle qui m a formee .\n", 1132 | "= you re the one who trained me .\n", 1133 | "< you re the one who trained me .\n", 1134 | "\n", 1135 | "> nous sommes au milieu d amis .\n", 1136 | "= we re among friends .\n", 1137 | "< we re among friends .\n", 1138 | "\n" 1139 | ] 1140 | } 1141 | ], 1142 | "source": [ 1143 | "print_results(x, y, encoder, decoder)" 1144 | ] 1145 | }, 1146 | { 1147 | "cell_type": "code", 1148 | "execution_count": 48, 1149 | "metadata": {}, 1150 | "outputs": [], 1151 | "source": [ 1152 | "batch_size=10\n", 1153 | "valid_dl_2 = DataLoader(valid_ds, batch_size=batch_size, shuffle=True)\n", 1154 | "\n", 1155 | "x, y = next(iter(valid_dl_2)) \n", 1156 | "x = x.long().cuda()\n", 1157 | "y = y.long().cuda()" 1158 | ] 1159 | }, 1160 | { 1161 | "cell_type": "code", 1162 | "execution_count": 49, 1163 | "metadata": {}, 1164 | "outputs": [ 1165 | { 1166 | "name": "stdout", 1167 | "output_type": "stream", 1168 | "text": [ 1169 | "> je suis pret quand tu l es .\n", 1170 | "= i m ready when you are .\n", 1171 | "< i m ready to leave .\n", 1172 | "\n", 1173 | "> je compte devenir ingenieur .\n", 1174 | "= i am going to be an engineer .\n", 1175 | "< i m able on .\n", 1176 | "\n", 1177 | "> il est mon parent par alliance .\n", 1178 | "= he is related to me by marriage .\n", 1179 | "< he s my new .\n", 1180 | "\n", 1181 | "> il est riche mais il n est pas heureux .\n", 1182 | "= he s rich but he s not happy .\n", 1183 | "< he is not rich but he s happy 
.\n", 1184 | "\n", 1185 | "> nous sommes pieges .\n", 1186 | "= we re trapped .\n", 1187 | "< we re comedians .\n", 1188 | "\n", 1189 | "> je suis ponctuel .\n", 1190 | "= i m punctual .\n", 1191 | "< i m a .\n", 1192 | "\n", 1193 | "> vous n etes pas tres ordonnees .\n", 1194 | "= you re not very tidy .\n", 1195 | "< you re not very funny .\n", 1196 | "\n", 1197 | "> vous etes toutes a moi .\n", 1198 | "= you re all mine .\n", 1199 | "< you re all mine .\n", 1200 | "\n", 1201 | "> c est toi celle que je cherchais .\n", 1202 | "= you are the one that i was looking for .\n", 1203 | "< you re the one i i want to .\n", 1204 | "\n", 1205 | "> tu n es pas assez rapide .\n", 1206 | "= you re not fast enough .\n", 1207 | "< you re not fast enough .\n", 1208 | "\n" 1209 | ] 1210 | } 1211 | ], 1212 | "source": [ 1213 | "print_results(x, y, encoder, decoder)" 1214 | ] 1215 | }, 1216 | { 1217 | "cell_type": "markdown", 1218 | "metadata": {}, 1219 | "source": [ 1220 | "## Exercise\n", 1221 | "- Replace the embeddings with pre-trained word embeddings. 
Here are word embeddings for various languages.\n", 1222 | "\n", 1223 | "https://fasttext.cc/docs/en/crawl-vectors.html " 1224 | ] 1225 | }, 1226 | { 1227 | "cell_type": "markdown", 1228 | "metadata": {}, 1229 | "source": [ 1230 | "# Credits\n", 1231 | "The original notebook was written by Sean Robertson _" 1232 | ] 1233 | }, 1234 | { 1235 | "cell_type": "code", 1236 | "execution_count": null, 1237 | "metadata": {}, 1238 | "outputs": [], 1239 | "source": [] 1240 | } 1241 | ], 1242 | "metadata": { 1243 | "kernelspec": { 1244 | "display_name": "Python 3", 1245 | "language": "python", 1246 | "name": "python3" 1247 | }, 1248 | "language_info": { 1249 | "codemirror_mode": { 1250 | "name": "ipython", 1251 | "version": 3 1252 | }, 1253 | "file_extension": ".py", 1254 | "mimetype": "text/x-python", 1255 | "name": "python", 1256 | "nbconvert_exporter": "python", 1257 | "pygments_lexer": "ipython3", 1258 | "version": "3.7.3" 1259 | }, 1260 | "toc": { 1261 | "nav_menu": {}, 1262 | "number_sections": true, 1263 | "sideBar": true, 1264 | "skip_h1_title": false, 1265 | "toc_cell": false, 1266 | "toc_position": {}, 1267 | "toc_section_display": "block", 1268 | "toc_window_display": false 1269 | } 1270 | }, 1271 | "nbformat": 4, 1272 | "nbformat_minor": 1 1273 | } 1274 | -------------------------------------------------------------------------------- /5_lab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "5_lab.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | }, 17 | "widgets": { 18 | "application/vnd.jupyter.widget-state+json": { 19 | "7e7b3dceeb7040349b16f5b1fa75c099": { 20 | "model_module": "@jupyter-widgets/controls", 21 | "model_name": "HBoxModel", 22 | "model_module_version": "1.5.0", 23 | 
"state": { 24 | "_view_name": "HBoxView", 25 | "_dom_classes": [], 26 | "_model_name": "HBoxModel", 27 | "_view_module": "@jupyter-widgets/controls", 28 | "_model_module_version": "1.5.0", 29 | "_view_count": null, 30 | "_view_module_version": "1.5.0", 31 | "box_style": "", 32 | "layout": "IPY_MODEL_939464683b4d4bb689b8e62cb1418128", 33 | "_model_module": "@jupyter-widgets/controls", 34 | "children": [ 35 | "IPY_MODEL_98b4d408b9a74f1e8b079813d92091d5", 36 | "IPY_MODEL_a85754e74c2a459f95c50e097a7e20e6", 37 | "IPY_MODEL_6a244a9b581b4ca096157ab39ff2bfe8" 38 | ] 39 | } 40 | }, 41 | "939464683b4d4bb689b8e62cb1418128": { 42 | "model_module": "@jupyter-widgets/base", 43 | "model_name": "LayoutModel", 44 | "model_module_version": "1.2.0", 45 | "state": { 46 | "_view_name": "LayoutView", 47 | "grid_template_rows": null, 48 | "right": null, 49 | "justify_content": null, 50 | "_view_module": "@jupyter-widgets/base", 51 | "overflow": null, 52 | "_model_module_version": "1.2.0", 53 | "_view_count": null, 54 | "flex_flow": null, 55 | "width": null, 56 | "min_width": null, 57 | "border": null, 58 | "align_items": null, 59 | "bottom": null, 60 | "_model_module": "@jupyter-widgets/base", 61 | "top": null, 62 | "grid_column": null, 63 | "overflow_y": null, 64 | "overflow_x": null, 65 | "grid_auto_flow": null, 66 | "grid_area": null, 67 | "grid_template_columns": null, 68 | "flex": null, 69 | "_model_name": "LayoutModel", 70 | "justify_items": null, 71 | "grid_row": null, 72 | "max_height": null, 73 | "align_content": null, 74 | "visibility": null, 75 | "align_self": null, 76 | "height": null, 77 | "min_height": null, 78 | "padding": null, 79 | "grid_auto_rows": null, 80 | "grid_gap": null, 81 | "max_width": null, 82 | "order": null, 83 | "_view_module_version": "1.2.0", 84 | "grid_template_areas": null, 85 | "object_position": null, 86 | "object_fit": null, 87 | "grid_auto_columns": null, 88 | "margin": null, 89 | "display": null, 90 | "left": null 91 | } 92 | }, 93 | 
"98b4d408b9a74f1e8b079813d92091d5": { 94 | "model_module": "@jupyter-widgets/controls", 95 | "model_name": "HTMLModel", 96 | "model_module_version": "1.5.0", 97 | "state": { 98 | "_view_name": "HTMLView", 99 | "style": "IPY_MODEL_2d4a0b71baca41ad854c0a5d8232d267", 100 | "_dom_classes": [], 101 | "description": "", 102 | "_model_name": "HTMLModel", 103 | "placeholder": "​", 104 | "_view_module": "@jupyter-widgets/controls", 105 | "_model_module_version": "1.5.0", 106 | "value": "", 107 | "_view_count": null, 108 | "_view_module_version": "1.5.0", 109 | "description_tooltip": null, 110 | "_model_module": "@jupyter-widgets/controls", 111 | "layout": "IPY_MODEL_6d9ec2a9b91b48e2965e65b36a6331db" 112 | } 113 | }, 114 | "a85754e74c2a459f95c50e097a7e20e6": { 115 | "model_module": "@jupyter-widgets/controls", 116 | "model_name": "FloatProgressModel", 117 | "model_module_version": "1.5.0", 118 | "state": { 119 | "_view_name": "ProgressView", 120 | "style": "IPY_MODEL_fd2e1e67e6eb40adab86f4c6546e75e2", 121 | "_dom_classes": [], 122 | "description": "", 123 | "_model_name": "FloatProgressModel", 124 | "bar_style": "success", 125 | "max": 170498071, 126 | "_view_module": "@jupyter-widgets/controls", 127 | "_model_module_version": "1.5.0", 128 | "value": 170498071, 129 | "_view_count": null, 130 | "_view_module_version": "1.5.0", 131 | "orientation": "horizontal", 132 | "min": 0, 133 | "description_tooltip": null, 134 | "_model_module": "@jupyter-widgets/controls", 135 | "layout": "IPY_MODEL_911d55b5c31b4ccbac97c78e0d427b47" 136 | } 137 | }, 138 | "6a244a9b581b4ca096157ab39ff2bfe8": { 139 | "model_module": "@jupyter-widgets/controls", 140 | "model_name": "HTMLModel", 141 | "model_module_version": "1.5.0", 142 | "state": { 143 | "_view_name": "HTMLView", 144 | "style": "IPY_MODEL_3c3ac877a7fb41cda825447361fb1c40", 145 | "_dom_classes": [], 146 | "description": "", 147 | "_model_name": "HTMLModel", 148 | "placeholder": "​", 149 | "_view_module": "@jupyter-widgets/controls", 150 | 
"_model_module_version": "1.5.0", 151 | "value": " 170499072/? [00:04<00:00, 47020266.07it/s]", 152 | "_view_count": null, 153 | "_view_module_version": "1.5.0", 154 | "description_tooltip": null, 155 | "_model_module": "@jupyter-widgets/controls", 156 | "layout": "IPY_MODEL_0fb40ac99d914282945f7683c8a3597b" 157 | } 158 | }, 159 | "2d4a0b71baca41ad854c0a5d8232d267": { 160 | "model_module": "@jupyter-widgets/controls", 161 | "model_name": "DescriptionStyleModel", 162 | "model_module_version": "1.5.0", 163 | "state": { 164 | "_view_name": "StyleView", 165 | "_model_name": "DescriptionStyleModel", 166 | "description_width": "", 167 | "_view_module": "@jupyter-widgets/base", 168 | "_model_module_version": "1.5.0", 169 | "_view_count": null, 170 | "_view_module_version": "1.2.0", 171 | "_model_module": "@jupyter-widgets/controls" 172 | } 173 | }, 174 | "6d9ec2a9b91b48e2965e65b36a6331db": { 175 | "model_module": "@jupyter-widgets/base", 176 | "model_name": "LayoutModel", 177 | "model_module_version": "1.2.0", 178 | "state": { 179 | "_view_name": "LayoutView", 180 | "grid_template_rows": null, 181 | "right": null, 182 | "justify_content": null, 183 | "_view_module": "@jupyter-widgets/base", 184 | "overflow": null, 185 | "_model_module_version": "1.2.0", 186 | "_view_count": null, 187 | "flex_flow": null, 188 | "width": null, 189 | "min_width": null, 190 | "border": null, 191 | "align_items": null, 192 | "bottom": null, 193 | "_model_module": "@jupyter-widgets/base", 194 | "top": null, 195 | "grid_column": null, 196 | "overflow_y": null, 197 | "overflow_x": null, 198 | "grid_auto_flow": null, 199 | "grid_area": null, 200 | "grid_template_columns": null, 201 | "flex": null, 202 | "_model_name": "LayoutModel", 203 | "justify_items": null, 204 | "grid_row": null, 205 | "max_height": null, 206 | "align_content": null, 207 | "visibility": null, 208 | "align_self": null, 209 | "height": null, 210 | "min_height": null, 211 | "padding": null, 212 | "grid_auto_rows": null, 213 | 
"grid_gap": null, 214 | "max_width": null, 215 | "order": null, 216 | "_view_module_version": "1.2.0", 217 | "grid_template_areas": null, 218 | "object_position": null, 219 | "object_fit": null, 220 | "grid_auto_columns": null, 221 | "margin": null, 222 | "display": null, 223 | "left": null 224 | } 225 | }, 226 | "fd2e1e67e6eb40adab86f4c6546e75e2": { 227 | "model_module": "@jupyter-widgets/controls", 228 | "model_name": "ProgressStyleModel", 229 | "model_module_version": "1.5.0", 230 | "state": { 231 | "_view_name": "StyleView", 232 | "_model_name": "ProgressStyleModel", 233 | "description_width": "", 234 | "_view_module": "@jupyter-widgets/base", 235 | "_model_module_version": "1.5.0", 236 | "_view_count": null, 237 | "_view_module_version": "1.2.0", 238 | "bar_color": null, 239 | "_model_module": "@jupyter-widgets/controls" 240 | } 241 | }, 242 | "911d55b5c31b4ccbac97c78e0d427b47": { 243 | "model_module": "@jupyter-widgets/base", 244 | "model_name": "LayoutModel", 245 | "model_module_version": "1.2.0", 246 | "state": { 247 | "_view_name": "LayoutView", 248 | "grid_template_rows": null, 249 | "right": null, 250 | "justify_content": null, 251 | "_view_module": "@jupyter-widgets/base", 252 | "overflow": null, 253 | "_model_module_version": "1.2.0", 254 | "_view_count": null, 255 | "flex_flow": null, 256 | "width": null, 257 | "min_width": null, 258 | "border": null, 259 | "align_items": null, 260 | "bottom": null, 261 | "_model_module": "@jupyter-widgets/base", 262 | "top": null, 263 | "grid_column": null, 264 | "overflow_y": null, 265 | "overflow_x": null, 266 | "grid_auto_flow": null, 267 | "grid_area": null, 268 | "grid_template_columns": null, 269 | "flex": null, 270 | "_model_name": "LayoutModel", 271 | "justify_items": null, 272 | "grid_row": null, 273 | "max_height": null, 274 | "align_content": null, 275 | "visibility": null, 276 | "align_self": null, 277 | "height": null, 278 | "min_height": null, 279 | "padding": null, 280 | "grid_auto_rows": null, 281 | 
"grid_gap": null, 282 | "max_width": null, 283 | "order": null, 284 | "_view_module_version": "1.2.0", 285 | "grid_template_areas": null, 286 | "object_position": null, 287 | "object_fit": null, 288 | "grid_auto_columns": null, 289 | "margin": null, 290 | "display": null, 291 | "left": null 292 | } 293 | }, 294 | "3c3ac877a7fb41cda825447361fb1c40": { 295 | "model_module": "@jupyter-widgets/controls", 296 | "model_name": "DescriptionStyleModel", 297 | "model_module_version": "1.5.0", 298 | "state": { 299 | "_view_name": "StyleView", 300 | "_model_name": "DescriptionStyleModel", 301 | "description_width": "", 302 | "_view_module": "@jupyter-widgets/base", 303 | "_model_module_version": "1.5.0", 304 | "_view_count": null, 305 | "_view_module_version": "1.2.0", 306 | "_model_module": "@jupyter-widgets/controls" 307 | } 308 | }, 309 | "0fb40ac99d914282945f7683c8a3597b": { 310 | "model_module": "@jupyter-widgets/base", 311 | "model_name": "LayoutModel", 312 | "model_module_version": "1.2.0", 313 | "state": { 314 | "_view_name": "LayoutView", 315 | "grid_template_rows": null, 316 | "right": null, 317 | "justify_content": null, 318 | "_view_module": "@jupyter-widgets/base", 319 | "overflow": null, 320 | "_model_module_version": "1.2.0", 321 | "_view_count": null, 322 | "flex_flow": null, 323 | "width": null, 324 | "min_width": null, 325 | "border": null, 326 | "align_items": null, 327 | "bottom": null, 328 | "_model_module": "@jupyter-widgets/base", 329 | "top": null, 330 | "grid_column": null, 331 | "overflow_y": null, 332 | "overflow_x": null, 333 | "grid_auto_flow": null, 334 | "grid_area": null, 335 | "grid_template_columns": null, 336 | "flex": null, 337 | "_model_name": "LayoutModel", 338 | "justify_items": null, 339 | "grid_row": null, 340 | "max_height": null, 341 | "align_content": null, 342 | "visibility": null, 343 | "align_self": null, 344 | "height": null, 345 | "min_height": null, 346 | "padding": null, 347 | "grid_auto_rows": null, 348 | "grid_gap": null, 
349 | "max_width": null, 350 | "order": null, 351 | "_view_module_version": "1.2.0", 352 | "grid_template_areas": null, 353 | "object_position": null, 354 | "object_fit": null, 355 | "grid_auto_columns": null, 356 | "margin": null, 357 | "display": null, 358 | "left": null 359 | } 360 | } 361 | } 362 | } 363 | }, 364 | "cells": [ 365 | { 366 | "cell_type": "code", 367 | "metadata": { 368 | "id": "rR0hJXY56W2K" 369 | }, 370 | "source": [ 371 | "%reload_ext autoreload\n", 372 | "%autoreload 2\n", 373 | "%matplotlib inline\n", 374 | "import matplotlib.pyplot as plt\n", 375 | "\n", 376 | "import numpy as np\n", 377 | "import pandas as pd\n", 378 | "\n", 379 | "from pathlib import Path\n", 380 | "from sklearn import metrics\n", 381 | "import random\n", 382 | "\n", 383 | "import torch\n", 384 | "from torch.utils.data import Dataset, DataLoader\n", 385 | "import torch.optim as optim\n", 386 | "import torch.nn as nn\n", 387 | "import torch.nn.functional as F\n", 388 | "from torchvision import datasets, models\n", 389 | "\n", 390 | "from datetime import datetime\n", 391 | "import cv2" 392 | ], 393 | "execution_count": 1, 394 | "outputs": [] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "metadata": { 399 | "colab": { 400 | "base_uri": "https://localhost:8080/", 401 | "height": 100, 402 | "referenced_widgets": [ 403 | "7e7b3dceeb7040349b16f5b1fa75c099", 404 | "939464683b4d4bb689b8e62cb1418128", 405 | "98b4d408b9a74f1e8b079813d92091d5", 406 | "a85754e74c2a459f95c50e097a7e20e6", 407 | "6a244a9b581b4ca096157ab39ff2bfe8", 408 | "2d4a0b71baca41ad854c0a5d8232d267", 409 | "6d9ec2a9b91b48e2965e65b36a6331db", 410 | "fd2e1e67e6eb40adab86f4c6546e75e2", 411 | "911d55b5c31b4ccbac97c78e0d427b47", 412 | "3c3ac877a7fb41cda825447361fb1c40", 413 | "0fb40ac99d914282945f7683c8a3597b" 414 | ] 415 | }, 416 | "id": "_dlg1BnJ6rQn", 417 | "outputId": "7e2c47da-1eb0-4086-d90f-31f138bf3ff6" 418 | }, 419 | "source": [ 420 | "train_set = datasets.CIFAR10(root='cifardata', train=True, 
download=True)\n", 421 | "valid_set = datasets.CIFAR10(root='cifardata', train=False, download=True)" 422 | ], 423 | "execution_count": 2, 424 | "outputs": [ 425 | { 426 | "output_type": "stream", 427 | "name": "stdout", 428 | "text": [ 429 | "Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to cifardata/cifar-10-python.tar.gz\n" 430 | ] 431 | }, 432 | { 433 | "output_type": "display_data", 434 | "data": { 435 | "application/vnd.jupyter.widget-view+json": { 436 | "model_id": "7e7b3dceeb7040349b16f5b1fa75c099", 437 | "version_minor": 0, 438 | "version_major": 2 439 | }, 440 | "text/plain": [ 441 | " 0%| | 0/170498071 [00:00" 503 | ] 504 | }, 505 | "metadata": {}, 506 | "execution_count": 4 507 | }, 508 | { 509 | "output_type": "display_data", 510 | "data": { 511 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAPsAAAD5CAYAAADhukOtAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO2daXBc15Xf/6c3NJbGTgIgABIACZKiKGqDKUqWLEeaUWTHM7IzUy47KUcfVKOp1LgqTk0+qJzFTlU+eFKxXf6QcoqOVaOZOF7Gq8ajjEeSJUu2Y1LgIu4UQRIgCGIjgAYaS+8nH7qZouT7b0AE0KDnnl8Vi8A9OO/dvu+dft3v/845oqowDOMfP4GNnoBhGOXBgt0wPMGC3TA8wYLdMDzBgt0wPMGC3TA8IbQaZxF5EsDXAAQB/E9V/VKpv6+M1WusaYvTVkoBDCDv3r8mqU8ut0Rt88kUtUmYL0lFdcQ5HgpSF4iUeD9VoaZIkG80CL5Yc7MZ53g+w19XpsR6ROuoCdFqvs1s2r1NTaepTyhUwbdXYhmDoSi3kXXMZ3PUJzE1RW2a4+dcsLLEMQvxFxCJVDvHoxVVJbbnPncmR6aRmFlwGm852EUkCOC/A/h9AFcBvCUiL6rqGeYTa9qCP/4Pf+20pTN8MWp0wTleoWepz9T0KWo7dHaQ2oJtDdS2fX+Xc7ypngdtRaiS2rIZfnJ0N/Eoq8nzE/Xln445x1NjTdRnZOAdatvzBD9Feg/wtZoaGnSOp4evUJ+mzT3Udp2/D6CueRe3NdQ6xxcnZ6nPmy+8QG3p2QvUFrubH7P6TfwNqav9gHO8t/du6lPb5N7ef/yjr1Cf1XyM3w9gQFUvqWoawHcAPLWK7RmGsY6sJtjbAQzf9PvV4phhGLch636DTkSeFZF+EelfSsys9+4MwyCsJthHAHTe9HtHcexdqOpBVe1T1b7KGP+OZxjG+rKaYH8LQK+IdItIBMCnALy4NtMyDGOtueW78aqaFZHPAvgZCtLb86p6upSPBAXRWrd8df3Cb30o+P8MD/U7x6vSA9Tn6vg4tSVawtTWtrOe+yHuHG+JNFKfpLqlMACYzXN5MBNPUFs04ZYiAWDhuluuaaptpT7ZHi4njYzyNY6e5JLdFnW/tt6ODurT0cltZyYvU1s8xc+d3IJb8
ZhddJ+HAJCs7KK2UEWM+6X4cbk2yeW8peCwczxdz+/ud1e5lYtMnsuyq9LZVfUlAC+tZhuGYZQHe4LOMDzBgt0wPMGC3TA8wYLdMDzBgt0wPGFVd+PfL6pAKuuWBsLgUsj4FbeMNnX5HPVp6drOJ9LAJZK5iDvpBgA21bilEOWbQ3xpntoWSmSvoYSMEy6RAtba5JbD6hsuUZ/ufc3U9uab/EGo/h++RW3/6nG33yP376U+dbVcaqrCdWr71dA1bjvszrJLBj9MfSTyaWpDcpGa8temqW0uf5zamhvc50FVJU+iuj436hzP5rnUa1d2w/AEC3bD8AQLdsPwBAt2w/AEC3bD8ISy3o0PBAKornInatR1dzrHASAxtt85XlnB794ugN81jdTyO+6REL9D3lq32TkeTPKyVHnlNdeWErw0Ukz5oQlM88QVSbiVi4kEv2M9H7+D2rpqdlNbtoXPo7vRXTorl+DJP3NJvlbpWb5W1Ut8Hg1Z97GZD/Pt7foQr8ESC5eod1eizNhSxn0OA8DE4iHn+ODJw9Snvv1O53g2zVUcu7IbhidYsBuGJ1iwG4YnWLAbhidYsBuGJ1iwG4YnlFV6A7LQvLuc9PkhnqiRqXInVfQ+xHtSnL/yGz6NKN9Xex2XVhrC7nnMzXEpb3aaS02DZ85T2/Q4T6DZtsRr6C1dn3OPh3lLlVgjlymbwry+277uEvX64u5EjVPnJqnPQ48+Qm3BNE8y2VLHX9t9O9xS70jwberT2sZf88glflwmpnjXndkpXrsulXfXB5ye53X3xsbcEltykUvHdmU3DE+wYDcMT7BgNwxPsGA3DE+wYDcMT7BgNwxPWJX0JiKDABIAcgCyqtpX6u/zuRQW59xy0/WZEq2E2txZWQuhCeqTaeYdYxsbeb07BHjm0qUZdwufhTEuXU2eHaK29AkuAc5PlsgOq95EbdVh9/xDJQ71fTu3UtvVS+7WRABw7O2L1PbgXe4sxppKvvapLF9H8MRCIMDl0rmEO1Px6JFB6vPYg+6MPQAIxnn24ORkLbUtBLZQW2XYLef19vI6iqmI+zodDvOFWgud/Z+oKq8GaBjGbYF9jDcMT1htsCuAfxCRIyLy7FpMyDCM9WG1H+MfVtUREdkM4GUROaeqb9z8B8U3gWcBoKaJP05oGMb6sqoru6qOFP+fAPAjAL9Ve0dVD6pqn6r2Vcb4DQzDMNaXWw52EakWkdiNnwE8AeDUWk3MMIy1ZTUf41sA/EhEbmznf6vq35f00DzyObekVBNzF9ADgFzQneFzefJN6jOd4+87WzfxAovxLJcAL1x1Z3LNHubyVMU7PIPq4XZeZLOxhX8KOj3A97dlW5dzXEu8r1cFuRw2xZPNMBvn2WazOXfrot2NbikMAMYnE9S2uMRP1bxkqS2Zq3aO9935MPWJLfG2VgMnX6O23nv4uVN/jzuzDQCW5tzS21yCH7NslhTMFC4b3nKwq+olAHffqr9hGOXFpDfD8AQLdsPwBAt2w/AEC3bD8AQLdsPwhLIWnFxKp3CaFJYMCJeaRi67JY35Rd7XquduLnVE6tyyEABUznIZJ3XancGWf5tnr1XM8j5ku/s+QG3BEtl3rR3d1NbS0uYcP/r2CeqTVi4ZVdfzU6SilhdR/PXpY87xxua7qM/2PY9Rm85xSalizl1kEwD27XIX55yKX6A+Z37+BrVhIUNNc0e4LFeR4BJs8w53RlzVFi7NLky6pTcJ8Ou3XdkNwxMs2A3DEyzYDcMTLNgNwxMs2A3DE8p6Nz6dSeLq+DtOWzDL7+wGAu3O8Z6tHdRndw+/4z6U5i18ps/zmnGLR9011x7q6qU+wXm+r2yW39kN1fK6avceuJ/aQMSE8evu+nkAMDzK1YS5BV5xbHSUKw2ZeXcGzeDFOuqTfoivR1WUnx+hKK/Jd6jffdf9Z3/3C+pTl+fHbH/fDmoLpHiC0tzAGLUNnHW3bErv47UBQ9tanOOivDWYXdkNwxMs2
A3DEyzYDcMTLNgNwxMs2A3DEyzYDcMTyiq9VVWGce8e90P/85PuBA4A0KDbp711gPo0VLrlDAA4eZHLINPHeYJE45x7m3/wLx+lPuk0b2l0efgKtW3t5kkQR44dpjZJuRNGlpI8kWRxjtQzAxCt4OsYVp6I9MC9DzjH66K8rdXZU/3Ulg7zOnmjQZ68NB3Y5xwfn+KJV6kIr184nOJJN3e2uyViANgS4Ws1fcq9xvEhLqNNxt3zSC7x42xXdsPwBAt2w/AEC3bD8AQLdsPwBAt2w/AEC3bD8IRlpTcReR7AxwBMqOre4lgjgO8C6AIwCOCTqjqz3LbqaqrwkQ+6m8jMzlRRv/PD7oytYHCBzzvDM5fGz3F5rWKMSyv/4p/9oXP8/vv2Up/jJ49T29jkJLU9+giX866eOkttAxfctvg8l34iMf6ePzkbp7adO+6ltvs/4F6TUJ5n0Y0leC28XxzjcthF5XN84g+6nONVtVxu1EXehiqV43KYVPJzONSQprZETY17Ho28JVow7H7NEuDZkiu5sv8lgCffM/YcgFdVtRfAq8XfDcO4jVk22Iv91t+bnPwUgBeKP78A4ONrPC/DMNaYW/3O3qKqN1qajqHQ0dUwjNuYVd+gU1UFQJ+pFJFnRaRfRPoTM/y7kGEY68utBvu4iLQBQPH/CfaHqnpQVftUtS/WwEsLGYaxvtxqsL8I4Oniz08D+MnaTMcwjPViJdLbtwF8GECziFwF8AUAXwLwPRF5BsAQgE+uaG85BRLuDKVNdTyrKRN1Fy+cHOdFFLOzXFqpmuKZaDLLix5OjLs/wBw/wVsrvfjjv6W2xAKXmk6+xSW76mAFtd11p7u9UijIC3BmsjwTLZkpkVFWQio7dvRN53h9jEtXC2lum18Uatu+fSe1nf6Nu0jo9HX+mu9obqC2+zp3UduuDt7O68IJXpxzoabROV65vYf6DA3/xjme49+olw92Vf00MT2+nK9hGLcP9gSdYXiCBbtheIIFu2F4ggW7YXiCBbtheEJZC04uJVM4ddYthex/YBv1a6l1y0YVWS5BZeJcyuutaaa2LQ9xaWV83J3Y98vDb1MfBKupKZvnstbLb/yS2qav8wTDyog762nXVnfRTgDY0lRPbbG6JmoLR/m1YnHRLW/GAvyUq43xY6bN3C8+MU5t7V3dzvH8Ht6fb3ZgkNqqUlyWC8O9LwDIt/ECl3u6W53j01VcHqxOurMYAyEuvdmV3TA8wYLdMDzBgt0wPMGC3TA8wYLdMDzBgt0wPKGs0lsun8PMorugYzDIs9Qa4S7WFwjz96oBcWfKAcBgYoTalpK8MODYFbfEo1f4PCrCQWqbmODFFyMR7lffyOWwWMwtDdW08F56R8+dobazF16jtq4tm6ittdEtOco1/rqCES6lVtXwQopbm93SFQAE0+7MyF2b+TwG4+4sNAB45bBbOgaAqZpH+Dw6+Po3tLnDcGmJnx81Ve7ilsEAPxftym4YnmDBbhieYMFuGJ5gwW4YnmDBbhieUNa78cGgoLHefRdUMUj9CtWqf5vJGe7z1uUj1JZs5Xdij//fAWpbGnYrBlVhfqc4n+c17VpaeLn9YJC/D8dqeJXeBpK4MhXnrbKWtMQdXHLXFwASWd5SqjrjPmbzczy549roZWrb1MQTSXbu4Hfx993hvlM/cukc9ens3U1t41ley+9SnicUdcbcLZ4AYH7R3cppbIwrSvEx93iWn252ZTcMX7BgNwxPsGA3DE+wYDcMT7BgNwxPsGA3DE9YSfun5wF8DMCEqu4tjn0RwJ8AmCz+2edV9aXlthWNBLGrs85puzbBpZBZUs9svESLp2iJ2mn1yqWahShvaVTR6JbsKgO8bdHSUgnJa5G3BILw9+FzZw5T28P3P0A2V0LKi3Apb98OLhmlkrwr70LK/doCVXxfW3u4zFdTIunpwhCvQZfLk2MW4RLabKpEElLvPmobTvP6b+kxvlbBIEmEmeLncCjnbocl4
HX8VnJl/0sATzrGv6qq9xT/LRvohmFsLMsGu6q+AYCr+4Zh/E6wmu/snxWREyLyvIjw+rqGYdwW3Gqwfx3AdgD3ABgF8GX2hyLyrIj0i0j//CxvlWwYxvpyS8GuquOqmlPVPIBvANhf4m8PqmqfqvbV1PEbMIZhrC+3FOwicnONnU8AOLU20zEMY71YifT2bQAfBtAsIlcBfAHAh0XkHgAKYBDAn65kZ/l8HotEkpkpoULFmjqc41vreLuducEr1JYqkXm1NcbbJJ1PuN/T0iVaGoVKSGiBCu5XV88zqBJxLuPU1rmlrWgVl9BOnDhBbds7eGZerJa3tjo96K79NnTxGvW5a/cOatMsb5VVG+OfGDdt3uwcDyNHfVDF5auJ2Ulqyze2U5soP54tte41bm3gEvGFCrfsfK5EHb9lg11VP+0Y/uZyfoZh3F7YE3SG4QkW7IbhCRbshuEJFuyG4QkW7IbhCWUtOCnhCEKbtjpte2L0uRxUBd0FHafnedbbawM8iy510d2CCgB6q7jUNFo76J5HnM8jwLtJYX6eZ8SlSrShamnlbZdODRB5kGRJAcDwlWFqq+XJYagM84KTUZII2N5aonVVFS/cGUrxY9bR6s6kBIBgzi2zZpJc691cyWXKdJY/BSrNbokYADbVcVmurtIt9QWifI4VVe7sTAnwY2JXdsPwBAt2w/AEC3bD8AQLdsPwBAt2w/AEC3bD8ISySm9ZBWbybgmoJcyzdVIpdwOr+Jy7RxYA1FTwzKWORncmFADUgWdyNW919w1r2tJIfWJ5/rriU1xOyqZ54cv6CH+PFrKO7T1d1Kfrj/4pteXn+BxnJnkGWGWdex1r6/n6TpTYXl65HLZtWw+1vfGL3zjHF5JcomrP3UFtvY8+Sm317dyvJsRDLahuCVZD/DiHKtzzF66w2pXdMHzBgt0wPMGC3TA8wYLdMDzBgt0wPKGsd+PTyQwGz7iTLup7eBJEY6PbNpvkCShdPfwO7dTIALWNj49Q2/Z2d326gYvvUJ9FoiQAQEcnr3fX1ckTJ5KL/HWzWm219TxZZDHO77gnSiT5TM1zxWBu3H1nvTrEWyRB+fYmpuap7dT5q9TW3L7NOd7Ytpv6xHrcLbQAILT9ALUFoyWuncoTm9I59zmSzvA6eSmyuXyJ5bUru2F4ggW7YXiCBbtheIIFu2F4ggW7YXiCBbtheMJK2j91AvgrAC0otHs6qKpfE5FGAN8F0IVCC6hPqupMqW0tziVx/OfnnbYrR7nE88Dj9zjHU0F3qyMAmEvy+m5Hz5ymtgc7dlHbP3/8Cef4+Z28bdF3v/9jPo8Tx6ktnnC3TwKA+QXe/imZdmsvgRLdjkq94wdIfTQAWMjxlky5rDtRY2Gaz30zadUEAJu3us8BAGjYytuAxdrcdeGWKvi+Jqu5LVIi0aS1gp9zmuYHIJly17VLg8t1kTA5zqtMhMkC+HNV3QPgAIA/E5E9AJ4D8Kqq9gJ4tfi7YRi3KcsGu6qOqurR4s8JAGcBtAN4CsALxT97AcDH12uShmGsnvf1nV1EugDcC+AQgBZVHS2axlD4mG8Yxm3KioNdRGoA/ADA51T1Xc9Xqqqi8H3e5fesiPSLSH+WPeNnGMa6s6JgF5EwCoH+LVX9YXF4XETaivY2ABMuX1U9qKp9qtoXKlE9xjCM9WXZYBcRQaEf+1lV/cpNphcBPF38+WkAP1n76RmGsVasJOvtgwA+A+CkiNzQij4P4EsAvicizwAYAvDJ5TaUz1ciMe+WUOYDvK3OtVd+7hzfttPdSgoA9u3cTm2dd/FaYSePuKVBAPj4kjs76aknP0Z93vz1IWoLBPh77b777qK2V1//BbWlA+6+SzX1tdQnWaIVkob5KRIOctvurW458sjxcepTu/dxamu6i0tvpXpUzYfd6xGIcJ9woMRrDvGvohUV3C+Z59JbJOrWy6IlCsplG9yZiqEg38+ywa6qvwTA9sqPjmEYtxX2BJ1heIIFu
2F4ggW7YXiCBbtheIIFu2F4QlkLTkYq67B170ectsXg31O/eOKcc3w2zosQVsX2Udv2B7jtxyWksv/14t86x+++cJH6PHJgP7WVUHjQsoUXowyG3HISAFybc2cPnp3gRRmHro5RW5grouhubqK2Tc3NzvGqGn59WcjzopgLEff2AGA2ySWqYM69VhUlijk2VZZ40nOeJ3ZOpPlDY+kkLzwa0SXneEsdb1EVibmzIkNBnoloV3bD8AQLdsPwBAt2w/AEC3bD8AQLdsPwBAt2w/CE8kpv0TS27rrstM3OX6F+1Xn3e9LiOJczEvO8f1mujktX9Xd0UdurvzrmHB8dHnWOA8C//dwz1Da7yGWc1155hdp29t5JbdvI/BdG+GuOt3DpqnLGLQsBQHqU2w4dchfTzM5zuW5nI5eudrfyOb4zzeWmeXFnt1XHuMxXX0IeDIX4PHIZ7hcJ8te2hWTt3cHVV8zlpp3j0RCfg13ZDcMTLNgNwxMs2A3DEyzYDcMTLNgNwxPKejc+k72OiZkXnLb5+DD1C2Xd9dNqYm3U59qks9gtAKCimddja7mL164bOz3kHE+XqPs1NOT2AYBAlC//fILXhdvd0Ultc1G3QvHg/Twhp613N7Wd+tmb1BbPcBVi5Lo7UWN7Fz9mj97J79R3NfI77k0l6uSNBN3bXAhHqU+0gu+rKhqktunxa9RWQ+rMAUBzrbvdVEbcLZ4AYHbGra7kcnw/dmU3DE+wYDcMT7BgNwxPsGA3DE+wYDcMT7BgNwxPWFZ6E5FOAH+FQktmBXBQVb8mIl8E8CcAJot/+nlVfanUtnLpFKaH3fXaQsolg7buvc7xnjseoj5XZnhdtazyl93Y3UptWx/e4xw//9Jh6nPoraN8Xy28y/Vimif5XL7MZcrpdMo5vrnvfuqzr9P9ugAg3sOTXRavu5NdAKCjxS0NPfTYI9Snq3sTtcVKyFC7NnMpdWHBXcctowt8X2Fegy656JYUAWBy4hK1pcHnGMm553Jo2l1PEACuDpx0jsdn+fFaic6eBfDnqnpURGIAjojIy0XbV1X1v61gG4ZhbDAr6fU2CmC0+HNCRM4CaF/viRmGsba8r+/sItIF4F4AN+otf1ZETojI8yLSsMZzMwxjDVlxsItIDYAfAPicqs4B+DqA7QDuQeHK/2Xi96yI9ItIf2aJP1ZqGMb6sqJgF5EwCoH+LVX9IQCo6riq5lQ1D+AbAJwPX6vqQVXtU9W+cCV/rtgwjPVl2WAXEQHwTQBnVfUrN43fnNHwCQCn1n56hmGsFSu5G/9BAJ8BcFJEbmgtnwfwaRG5BwU5bhDAny63oVh1NX6v7wNO28WTF6hfetwtd1Tt5XJddycv4DV2ndenS0d4rbZ0tzuDKrmlnvocOc1fV/Ssux4fAHTt3UFtrx/jkldgyV3P7N523vJKYlzW2rzvLu5Xx+Wkhlp3jbftPfx1VUT5OmYXeBbgYjrBtxl0S2/b6kqc+tkRajp09lfUFq6roLbWFl43MBp0n8fBOD9PEWAZh1yyXcnd+F8CcM2mpKZuGMbthT1BZxieYMFuGJ5gwW4YnmDBbhieYMFuGJ5Q1oKTS4k0Tr7uztiqynHJAJWT7uFAnLqEw/zp3bYYl3iuXuFtqIZT7v1ldvDstUtXz1FbB2lNBADJBXf2GgCMx3kxzWjInTkWLFGUsbJEy6BN1VyKvJbh7asQdctQ9SVaPIXTvNAjIvzpy8wil+WqK9yyoua5rNV/5KfUNjTGj2e2mUvB8zl+zO7sfMA5Hojw8zsbYLKntX8yDO+xYDcMT7BgNwxPsGA3DE+wYDcMT7BgNwxPKKv0llqM4J1j7opWMvkW9WvsdstyMwfGqc/OA1v5RGJclptOcTlpJOPOrhpc5PJUtpdLPGOnucyXv8Sz5SqceUkFeu7tcY5HGvihjuQXqW1TgEterTH+ulNElssmp6lPMsWzxvJpPo9AlM8jGnUXjxy+dpr6XBkpka0d4z3ia
lt7qS0b5YVMWRbj4nl+XOIzMef4Elch7cpuGL5gwW4YnmDBbhieYMFuGJ5gwW4YnmDBbhieUFbpLRiKom6zu/De+HneE21c3XLNG//n59Rnx507qa1pE5fl9u/iBRa3d7v9xnbPUx/t45lLw6e5/DM9wYse1iuXmg48/ofO8c3dndTn+gLvKRaP89eWAC9UWVvX6BzXSDX1kVANtSlPEERDPZfDckn3a5uc5r0Aw/XuYpkAEKrj505l8x3UNnDtbWq7cM49l66F3dTnkQ8+5hyf+PUPqI9d2Q3DEyzYDcMTLNgNwxMs2A3DEyzYDcMTlr0bLyJRAG8AqCj+/fdV9Qsi0g3gOwCaABwB8BlVdWcdFMlHFpHu7HfaGveWaOHT5J5mRTPf3TsXeK2wbTn3nWIAiET5knSTBJq9vbylUXA3T1q5um8Xtb1+6HVqy80sUFuoyd2ianyB1/i7vsRtueoqauvc6+zlCQDQnLue3DtXeC22DC8NiJoqniSTus7vrB876z7ffvXa31GfXG2JW/8Zfuc/WcMbl84s8te9dadbodhTyRWl2s3uxQqG+RxWcmVPAXhMVe9GoT3zkyJyAMBfAPiqqu4AMAPgmRVsyzCMDWLZYNcCN8TWcPGfAngMwPeL4y8A+Pi6zNAwjDVhpf3Zg8UOrhMAXgZwEUBcVW98VrsKwJ2obhjGbcGKgl1Vc6p6D4AOAPsB8Ed73oOIPCsi/SLSn02W/EpvGMY68r7uxqtqHMBrAB4EUC8iN+5mdQBwPt+pqgdVtU9V+0JR3iDAMIz1ZdlgF5FNIlJf/LkSwO8DOItC0P9x8c+eBvCT9ZqkYRirZyWJMG0AXhCRIApvDt9T1Z+KyBkA3xGR/wLgGIBvLrehYGUG1XeOOm3dPbxGV6DJLZVlAjxxYjzJa781klpyAFAd4BJPNuOWoZLC2xYl0ryO2HSCFwwLRrkOFWjlNfSkzt0WqEr49mp0ie8rzGW+yspmakvNudc/VM9lraGla9QWvzhAbXWNXA5LBK47x/d18ISckdEpPo/TXNLNlEgo6rqDJ1h1d7pr12Vm3TIqAFyed38lTud5ctKywa6qJwDc6xi/hML3d8MwfgewJ+gMwxMs2A3DEyzYDcMTLNgNwxMs2A3DE0SV36pf852JTAIYKv7aDMCti5QXm8e7sXm8m9+1eWxT1U0uQ1mD/V07FulX1b4N2bnNw+bh4TzsY7xheIIFu2F4wkYG+8EN3PfN2Dzejc3j3fyjmceGfWc3DKO82Md4w/CEDQl2EXlSRM6LyICIPLcRcyjOY1BETorIcRFxVyZcn/0+LyITInLqprFGEXlZRC4U/+epbes7jy+KyEhxTY6LyEfLMI9OEXlNRM6IyGkR+TfF8bKuSYl5lHVNRCQqIodF5O3iPP5zcbxbRA4V4+a7IvL+CkSoaln/AQiiUNaqB0AEwNsA9pR7HsW5DAJo3oD9fgjAfQBO3TT2XwE8V/z5OQB/sUHz+CKAf1fm9WgDcF/x5xiAdwDsKfealJhHWdcEgACoKf4cBnAIwAEA3wPwqeL4/wDwr9/Pdjfiyr4fwICqXtJC6envAHhqA+axYajqGwDe263yKRQKdwJlKuBJ5lF2VHVUVY8Wf06gUBylHWVekxLzKCtaYM2LvG5EsLcDGL7p940sVqkA/kFEjojIsxs0hxu0qOqNyh5jAFo2cC6fFZETxY/56/514mZEpAuF+gmHsIFr8p55AGVek/Uo8ur7DbqHVfU+AB8B8Gci8qGNnhBQeGcHSvRDXl++DmA7Cj0CRgF8uVw7FpEaAD8A8DlVfVepm3KuiWMeZV8TXUWRV8ZGBPsIgJubhdNileuNqo4U/58A8CNsbOWdcRFpA4Di/7yFyDqiquPFEy0P4Bso05qISBiFAPuWqv6wOFz2NXHNY6PWpHYWm2gAAAD2SURBVLjv913klbERwf4WgN7incUIgE8BeLHckxCRahGJ3fgZwBMATpX2WldeRKFwJ
7CBBTxvBFeRT6AMayIigkINw7Oq+pWbTGVdEzaPcq/JuhV5LdcdxvfcbfwoCnc6LwL49xs0hx4UlIC3AZwu5zwAfBuFj4MZFL57PYNCz7xXAVwA8AqAxg2ax18DOAngBArB1laGeTyMwkf0EwCOF/99tNxrUmIeZV0TAPtQKOJ6AoU3lv900zl7GMAAgL8BUPF+tmtP0BmGJ/h+g84wvMGC3TA8wYLdMDzBgt0wPMGC3TA8wYLdMDzBgt0wPMGC3TA84f8Bax53MjGO65EAAAAASUVORK5CYII=\n", 512 | "text/plain": [ 513 | "
" 514 | ] 515 | }, 516 | "metadata": { 517 | "needs_background": "light" 518 | } 519 | } 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "metadata": { 525 | "id": "ov2mANm47I7j" 526 | }, 527 | "source": [ 528 | "x = np.array(x)\n", 529 | "pad = 4\n", 530 | "x = cv2.copyMakeBorder(x, pad, pad, pad, pad, cv2.BORDER_REFLECT)" 531 | ], 532 | "execution_count": 6, 533 | "outputs": [] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "metadata": { 538 | "colab": { 539 | "base_uri": "https://localhost:8080/", 540 | "height": 284 541 | }, 542 | "id": "mIurrX5R7Xwx", 543 | "outputId": "1519c8f8-7bbf-49fc-e16f-d0de2ebc2e75" 544 | }, 545 | "source": [ 546 | "plt.imshow(x)\n", 547 | "x.shape" 548 | ], 549 | "execution_count": 7, 550 | "outputs": [ 551 | { 552 | "output_type": "execute_result", 553 | "data": { 554 | "text/plain": [ 555 | "(40, 40, 3)" 556 | ] 557 | }, 558 | "metadata": {}, 559 | "execution_count": 7 560 | }, 561 | { 562 | "output_type": "display_data", 563 | "data": { 564 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAPsAAAD6CAYAAABnLjEDAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO2daXBc53Wm39MruoFuAA2AAAkQBECCm0iK2ilZsjRS7CjOIjtJOXFm8VSpypmqcVU8k5qKJn+STE2mPFVxPD8mlamkorFck4ntchxbSRw7Wi1ZtkhRFMV9J8ANaKwNdAPo/ZsfaHjYeE9LLYIEl3ueKhaAw3Pv/e5y7u1+7/nOEeccDMO48/Hd7AEYhrE6WLAbhkewYDcMj2DBbhgewYLdMDyCBbtheIQVBbuIPC0iJ0XkjIg8d70GZRjG9Ueu9T27iPgBnALwCQCXALwD4HPOuWO1lgmGoi4cba6yBUINqm+kqYls7iPcmqSs2xcyGbIV81nVtwxeSTnKNl9Y31ZjmP+j1rgKrsTjKrENAIIQsgWK+nks5wpsq3GP9/n5XLhSiG2FeXX5Vj5liIT9qq/wLiBfyqu+2QIfh7k8jwsAyhIhWzDMvr4aJ0KU81BWbABQLM+xL3Kqrz/I46pFqbBANh/0iyzga6z6e2ZyCguZjHJ0gUDdI2AeBHDGOXcOAETkGwCeAVAz2MPRZux8/NkqW0fvFtV3x8N7yFaMqvugEpjXL/4jP32bbOMXTqq+c+CLOrubT3DjJn1bD/RvIlsoq+/DyMIM2abSbAOANY5PW/uUfsPKnEmSLYdGxROIxreRrZDuJltx5KC6/K99nINi10Cz4gkEgnwTuDAzpPoeG+Hj8M759apvJrKbbOsGeB9iQf2G5S/wthYUGwCMze8l23x5SPVt6b5LtWukLh8lW9TXp/quiT5U9ffX/9tXaq53JR/juwFcvOrvSxWbYRi3IDdcoBORL4jIfhHZX8jrd1PDMG48Kwn2ywCu/izVU7FV4Zz7C+fc/c65+4Oh6Ao2ZxjGSljJd/Z3AAyKSD8Wg/w3AfzWBy2Qz6Zx6cTLVbaFnP59KNbF3wha+tvrHlzq/IRqv3B6H9kmz7+j+nb2bSTb7CwLO8GiokwBGJ0ZI1t3Q1z19Ql/343E9e/WThMOg7qA45K8vTWtbapvUzsf3zffPEG2VPKwuvy9U61ku+fuLtU3Hufv8iFMq75zKRa9pgO6TpIt83rfemOKbP6s/ilTsuw7W9Y1ioH72bZ9z4Oqr6J91qQlwes49jZfSwCw77Xq63lmhjWlJa452J1zRRH5IoAfAvADeN45x8qCYRi3BCt5ssM5930A379OYzEM4wZiGXSG4REs2A3DI1iwG4ZHWNF39o9KY1MD7n94a5VtPtSp+hbAqZPhQP0ZdNryANDZy9vr79qqeAKXkpx9hukgmeJ5fR+ymSLZpMbbx5YoK/qS5eUBIB5kJdqf19M/Ryd5g4WFAdX3xLEhHkMjK+T3/6qe9Zhy7Pvm6dOqb896HsOx8bTqu9Cyjmx3P9Wr+o5NXCHb+NBLiuekurxvHR/HeFj3LYU7yDa/EFN9+9fox1zj/PQ5ZVvDqm9843jV3/69tdV4e7IbhkewYDcMj2DBbhgewYLdMDzCqgp04WgTNj/wsSpbTnhaJQBkhFNFs7O66KYRT+ii2brWJ3hcTvdtmzpCtr3Hh8i2cCqlb+vBPrIV83qaZ0OAhb+AT8+x7G/hlNCmuD7n+nL7KNlmRtkGAMlzF8i2/ZMs8K3fyWmxADA5zNNs37rI6wSANmU++kSYjwEANMc4dbq5UU8lbp7n9OuGhSGy5Wd04bBxLR/blg79PPR18TTbwY4+1Tce0effa0SVdYRSelr5UKl6OsrJGmnEgD3ZDcMzWLAbhkewYDcMj2DBbhgewYLdMDyCBbtheAQLdsPwCBbshuERLNgNwyOsKINORIYApAGUABSdc0oJPsM
wbgWuR7rsv3DO6aVcl5Gbz+DUO29V2eZDypxxAM0b+L7RM1h/D4qJ01TVGgBwcXg/2aL5M6qvNp893ckpnWs3t6jLp8FptJ2hhOqbddymaabMbYAAoJDied8NaX0++9wEp7u2xfWKr8UBTne9PMLHpuGw3uJonePxDvb0qL4969l+bPy86pvK8bkszeltpWbmOS01G+kjWyCszzvP5vg4XhnX57Mv+C+SLa+kMgNAf/QjzGcfHyLb2VHeFgBMj1fvb6FYu+aDfYw3DI+w0mB3AP5ZRN4VkS9cjwEZhnFjWOnH+Eedc5dFZA2Al0TkhHPujasdKjeBLwBApMZMJcMwbjwrerI75y5Xfo4B+DssdnZd7vOz9k/hho/QFsMwjOvKNQe7iDSKSGzpdwCfBMATwA3DuCVYycf4TgB/JyJL6/m/zrkffNACc5ks9v+0undYW7+uirZvYFU1V6w9MX85QejFApIXWGGfPM/9zAC91xtaWa2dDekVPTuaWJl1umiO1EKGbHOosb+KYhws6vftrjZWzltauXopAPTv0nq9caGK/d/Re+P9m6fY97H7dqi+zUqvtyj0lzpvDXPF2Lf26YVMsv4nyCahzymOeq+38pX6e721t/L5iUYiqu/E7Ihq19DW4c9tUH1nz/ZV/V3K6W8OgJX1ejsH4O5rXd4wjNXFXr0ZhkewYDcMj2DBbhgeYVWry4YaYujZ+nNVto5evZXQhn6u3FmM1t/+qVlZHgDSo9zoPhLWUxznwCJOKM5iXKhGRc+u5jVk82f1fSg7FpwW0npF0Zjj0+ab4lRXAJA0C5JjaRa8ACCT4kq/fU3cGqvYqW+rP8EVbktpPeV3Nsv7m5/R97dxgbfXWiMtNBPkdWz5OKdZx4L6a2B/gdNwFwp8zQDA2Pxesg0d3qf6tnTfpdo1UpePkq091qf6bv/F6rF9/f03a67XnuyG4REs2A3DI1iwG4ZHsGA3DI9gwW4YHmF1e701tWDwkWeqbONTevGKk8MHyDawpf4CACeH9ZTQQpRTOpeP6WfruPA2Gxt4vd3NurLbGuRtzc7qqbUzU6xaDx07qfpOJTm1dsOC3idtYWKWbcGw6htL8NuHtiAXjtjVX6NYR4pTQo+cGFd9H3n8MbL585yqCgDrmnm8927iohwAcNn/Ptm61vI+XD7HxxAAxibbyDYzqad058pcBGQqoxfgGB2tkSetrXeC3+4kmvSCIylfdVp4tsbbHsCe7IbhGSzYDcMjWLAbhkewYDcMj7CqAl3ZV8JctDqdsSXUq/peGeE5012zumClMTFdowLqWk4JnQuMqb6F9mmyJRLKPHmfXun03DTPLZ4b1edRjx8fJlv+kC4yZsZZzJtt7FB9G4M8tkCN037vZj4Xl85xVdP33j+rLv/wTk5RborodQVyReU41NKWfCyAzqY5FRkADrw7RLYnH+Y0Xn9KTxkeH4+Tbc63TvWNBFnkGxxUaiAAyIXqf66GW3kdExNrVd/pQvXYSk4XagF7shuGZ7BgNwyPYMFuGB7Bgt0wPMKHCnQi8jyAXwIw5pzbUbElAHwTQB+AIQCfdc6xmrWMXDGD8+PV8203tX9G9W2KsZBWLvE831o0xfT5wyU/ZzItH9MSUyUultvbweNKFXUx8PQlziib2ae38Qmf4iyvR7v1OfmJThaRjp7R17tuQx/ZXI17fNTPYtqkktQ2k9Iz8GZKXChxa0IX0pLj3MJqfkG/HMtSJFu2pPcguP+uR8kWW2Cx98zh19TlB3crBTp369lrC7Ms0M2m9WNbLOpz9TWamliQXLdBz/iLxKuLdJ7+AR+rJep5sn8NwNPLbM8BeMU5NwjglcrfhmHcwnxosFc6vCy/vz8D4IXK7y8A+PR1HpdhGNeZa/3O3umcW/qMOorFGvIqIvIFEdkvIvuL8/o7ZsMwbjwrFuiccw6o1c2guv1TIKrPVDIM48ZzrcGeFJG1AFD5qaegGYZxy3Ct6bIvAvg8gC9Xfn6
vnoXKBWA2Wa2Gn0ofUn1b4qz4Hq0xR13DJ6xYA8Dl86y2Zub1ucYDd7MKG2pmxTkyoyuguaOcAlt+X9+H8AxXUN16/wOqr19Jz+3q6Vd9Ozs5zfLA+/oxzzs+No0tfImE4/r87p8cfY9sifadqu/G7U+Szc1yWisAhGd5Tv6uLXpa6GTqNNmOvfoGO84V1OVn32WVPpzmNyUA0L6J02ij6/Q3KHPj9avxXR1Ka6zRY6rvxPsvV/1dTusttIA6nuwi8jcAfgpgi4hcEpFnsRjknxCR0wB+rvK3YRi3MB/6ZHfOKV3xAABPXeexGIZxA7EMOsPwCBbshuERVrf9UyiIgZ6eKtvwBf2t3dwsz5kuBU7VvS1/UReRfD5uBTTQ26N4AlsHWIwbznPa4tRJFuIAYP4A78MjfYOqrz/D6y0WdREpEOd0ynv23Kf6QtEOkxN6D++LIywezs6x4DMyord/KmQ4t3borN5aK/8I71u0QT9ngQaeq793PwtxAPDDf/wR2ZrLfGwfvH+Turwvx2nHs2dGVd8zx/naze/S6zMENtRMRSGG9rJQGjqki4QdweqCrZLTrxnAnuyG4Rks2A3DI1iwG4ZHsGA3DI9gwW4YHmF11fgg0N21LDW1qFfulBKnsDZ16L4amXG9Gqfz8zq6u86ovq0RVlsPn2VlduqgrgwnZnn5X/6Xj6u++TzPCDx/8YLq29vPKZnvvrdP9ZUcp6AuZPW01PlZTulsCPM+BJ2eXvzQPQ+RrbmBK+ECwPEj+8mWD+qVaEf8/EphyrdL9U1Ocpp0LsRFSC7mOAUXAO7q5rc160L6/k4d4WOTGtbTeMdT+vY0OmaayLa2pL+12rqtul1Vw0/0NweAPdkNwzNYsBuGR7BgNwyPYMFuGB5hVQU6n5TREKruT963QU/d3LKeq9o0t95d97ZmpvWqOCcv8vb8fr1nuhQ4zTJ5gsW48KguvvzWL/4K2e67d4fqe/DwQbKNjuu9zR9/jEW+S0eOq75nTrM9ldEFp1CM7/3jMymybd50j7r8fQ/wvgXK+vzq0TSnhP7oPRbSAOCs4zF88pf7VN9onEVGN8+VbHMlXUiTCF83gda86ptuYiHNJfSqxv4g70MtnL+Ft5Xn62NxbNXXqASsP7theB4LdsPwCBbshuERLNgNwyPUU4PueREZE5EjV9n+UEQui8jByr9P3dhhGoaxUupR478G4H8C+Poy+1edc3/yUTbm8xURbaxWwzs6W1XfjnCCbPl07T5WtHyznnpZaOACC+NJ/Y1AcYaV3egkp7XKjF4wYCzJFbYPHtIru7743b8nW3pO7yF3+B1WZhv9ev+1nXdxddeAn4tyAEChyKmt2YKSqqoo6QDw3gHumdcS01XvuTzbM/O6krxx42ayHX2bC4MAwNQE78O2dr7G7l2/RV1+Sw9X9D19SC/WMdfE12hk44DqO3zxbdWusU5Zx1yGe9gBwEyh+nouOT11G7j29k+GYdxmrOQ7+xdF5FDlY77+eDYM45bhWoP9zwFsBLAbwAiAr9RyvLrX2/ys9XozjJvFNQW7cy7pnCs558oA/hLAgx/g+7Neb9G49XozjJvFNaXLisjaq7q4fgaAnue4jGBA0NNWLSS1xHWxKFDkCqoHjuuijMaDD21Q7Z3K9sJFXdwqpFjkG2xqJ9u6R3SxJ5mcJtuP972v+sLfSKZiWRckX3rjx2SbmuBtAUAkxMdxS69eF2BdG6dpxprbyBZs0J8R80qX3phPv8TiMT62rl33TY0lydbdp7e7Km/n6r0zZ4bIFs3p3zyD4PWW1+qtxLb3c3uwqag+f78xq6coa7T28DoSicdU33J22fz1YO1ObB8a7JX2T08AaBeRSwD+AMATIrIbi91bhwD89oetxzCMm8u1tn/6qxswFsMwbiCWQWcYHsGC3TA8ggW7YXiEVS1eERBBIlh9f2mBXhgg6+dU1en5+it0+pXlASChbM8X1O95Z0TpXZbmnlsLWX0fRi+
wiuwu6NsKB/1kGxvTCz+EQuzbkmDVHABiMVadmzr1yrsHThwj2/HTr5Gtbx33XgOArgS/UZArPFYA8If4DUi0id8cAEBvO6ve/nyNoidreHtDKU5rfXmf/mZnsolVb3+Pfrxa13L4LCzo56wpWv9r54hSgKO1k48BAORGqo952V/7+W1PdsPwCBbshuERLNgNwyNYsBuGR1hVga5YymF8eqjK1tykz3d24LnNiRZd7NGXH9LtjtvoLB/TEu+cf5ds2S4ew8Gf6u2jFi4q8+GDughVLvOc+M7OTtXXr4gwsaaY6tuqpLtOpvRquguO1+tThKV0UU/9bCzwsc3M6umjV0bOk62jTU9L3byJxbxd23TB6vK5E2RbP7iVbMminqZ9rswpw+tjXEUWADLzXDF2dFSfDZ6q3ZWJ19HM64h06ddN07KxORPoDMOwYDcMj2DBbhgewYLdMDyCBbtheIRVVeOzxQLOTFWnm6ZL3E8NAJoVFXjLej1tUePKGKuyADCjFFhIKlVkAaBBUbJbHCvDcw16tdVwgpX7iE9/+7CwwAr5wrxe1RTC9+gTx/apro/e95CyuH6Pj4VY0d+1iZXoXJZ7pwHAXI7H64vqbwl6B/j8NtVIWz49zGnHpbL+ZiYSYpV9JqekFw/uUpe/mOc3CvlRfX/9fiVddlJPWw6Uavdg43XwGCaUcw4AyVL12HKF2kUy7MluGB7Bgt0wPIIFu2F4hHraP60XkddE5JiIHBWR36nYEyLykoicrvy02vGGcQtTj0BXBPC7zrkDIhID8K6IvATg3wJ4xTn3ZRF5DsBzAH7vg1YUCkXQ27u7egBKmigATE8qaYcNNQQrbfkarrG2HrL1NuuVSmeHLpAtp6R/9sb0aq0n01x0N1+j2mpAEWB8Yd23uYVTOtMpXUSKN7NA1hDV0z8PKa2pNvZwym4szvPWAeDoEM8xHz57RfXduXUT2VxRr6Ybj7GY17FmjeobRImNUa5kOzYzri5fTnSTTRwfbwDojPOx6WrVKxWfDusisMZgbzPZXEkXgUemlomXrnZKeT3tn0accwcqv6cBHAfQDeAZAC9U3F4A8OkPW5dhGDePj/SdXUT6ANwDYC+Azqtqx48C0GdtGIZxS1B3sItIE4C/BfAl51xVfSi3OJWMXw6iuv3T3Iw+A8owjBtPXcEuIkEsBvpfO+e+UzEnRWRt5f/XAuD+xKhu/9TYrE8rNAzjxlNPRxjBYlOI4865P73qv14E8HkAX678rN13pkLQF8Hapm1VtkQTixEAMN/JCtt4uv5Jwdtjevu5qJ/nBU9ldPHktTOchZc7y0UvB6P6N5iR+BBvK6Vvy6fUrMxk9HnnOaXAZWeXXgTyyBlFJKyRzXXxwkWyad25IkE9S6tBSQ7s7qpRCDPK5yGQ0wuK9nTxNeIv6Z8SC1m+btZEWJDMF/Umo9LOAm5HM4t2ANAcYeHPV0NEDkd1gU2jJc5CZTmri6LFlurxBgJ6hiZQnxr/MQD/GsBhETlYsf0+FoP8WyLyLIBhAJ+tY12GYdwk6mn/9GNAKRuzyFPXdziGYdwoLIPOMDyCBbtheAQLdsPwCKs6n71ULiI1W12RszGst8XxhTntcLpc/5zgzqCetpjLcXru8jEt0RRmtbUnwWmazdCV0vZeroDato5bEQFArMzjTU3q6nQxz8puS0i/b4tyHLoH+lTfvl/7ebKVZ3kM0+N6qmmkmY9DvEU/NmPKOspOT+PdsGGAbG/86G3Vdy7Lbwq6S9vINvj44+ryLd3s2xTQw8Tv+G2JC+jnIRCuPc+cnXkd/oB+7be1VF9PAWWO/RL2ZDcMj2DBbhgewYLdMDyCBbtheIRVFuhKmMlWp4u2l/S5wlOXOO1w6Bync9aiZUBvl5NIsH35mJboG2BhaPIyt3pKJrlnOwBs7OZ57mfOnlJ95xXhsGe9Pk++bz2nb2bn9X3Q5ojHW2qkKKdYjEsr6b2TGT31czbJoltjQJ0fBThex9i
kXnz0yMlLZGvv3qD6JtZyq6fYABfdDGzcoy7vb1Cef07JZQaQL/E5yxeU+fQAcvoqVOaUdYREr/vg9y9bsdQ43rAnu2F4Bgt2w/AIFuyG4REs2A3DI1iwG4ZHWFU1vuz8SOerq50eP6srsHtfOUi2qY/Q0f7CAV2dfuip3WTL+fUWRbNZToc8cOwo2R7u2aIu/6tPfZJsJzdzVVUA+Oa3v8vbOsTHAABSaa7impnTq8tmlXZGPl0wVu/8PqVAw1xJrwJbKnJK6NyUPq41SnXYNb18bgCgtZer/8bWcpEJAFgI83rHG9kWqpF53RVWUmDz+gHL5rgARh667B4K1lbJab15HkMZfB4AIEq1Kqz9k2F4Hgt2w/AIFuyG4RFW0v7pD0XksogcrPz71I0frmEY18pK2j8BwFedc39S78YWcnkcPV/dUmn4FLdYAoBiuo9sLsPzw2uR8enVQ6+8/CrZNmzuVX13bd5ItvU7eb7z4XdPqst/eoFTHJ95+pdU3zd/spdsvhp91Hfdu5Nsr7z+I9U3r/SDb2qJq75ZpTKrC/IlEqwxZ3prL4uP7x7k3uoAEN/B5QvbduoCnVbiNhPUq6j6lP7sQaXlVjCgC2lhpeVWtqwLdKEGVvkaRFf+iq16bQKNeIRrPJRdDUUxsGxsH5AuW0/ByREAI5Xf0yKy1P7JMIzbiJW0fwKAL4rIIRF53rq4GsatzUraP/05gI0AdmPxyf+VGsv9rP1Tfs7aPxnGzeKa2z8555LOuZJzrgzgLwGoLViubv8UarT2T4Zxs6hHjVfbPy31eavwGQDcZ8gwjFuGlbR/+pyI7MZi99YhAL/9YSsqlRYwkzpWZXNhLkwAAG3td5EtWnq6juEuMu//gWpPpbl/20xKT9mNxnaRbeNDbPuuoqQDwP958e/Jdvfps6rvY3v4g5EiIgMAOtdxUQt/jR5fV2Y5bfj4mH7Mhy9xOnJQeanR3673b+tobydbtEl/nsyVuYDGXIiXB4CZLCvR/pK+v2Gl8ENbRGukN60uP5bntNR8Vi8cEXL8tbSzWU9XDcU4xbkWwSKr8ckZ/TjmpfrTcrFG8QxgZe2fvv9hyxqGcetgGXSG4REs2A3DI1iwG4ZHWNX57Cg4IFktdnS36febeBen0TY3na97UzMZPQ23sczbm0/qAkw6wymOpWYWhlq29anLv/LWe2QbuTii+v6HLz1Ltpl5XUR67eWXybZ5kAVNANigjG3usi5upTpZmolMswiVH9HzJfbu5fn3xYwu5m1OsBC2tUtPCT01xfPnM6K/xm2MsfDXooiEgRrtlEoF9g359bnk65Q03m16QWDMlqb0/1CI+1mo9Pt13yuz1eMVqf38tie7YXgEC3bD8AgW7IbhESzYDcMjWLAbhkdYVTXe5w+hKba2ylYqciVNAJgc5xTUXOFK3dvKpPS+cIEiF25YPqYlroyPkS3czst37uQiFwAwenSYbHm/ns44PMy+vgb99GTSXGRia8961Xe2gd80PHyfOmcJawe5T9qRH75JtlRBf6NweYJTQjf26cf28btYpe9L6FVr25QCGpf9uso/F+Refg1hXm+0QZe3p5J8jTUpRSoAoD3OVWsLNYpHzEzrb0A0Ih2Nyrb0MUynq69Rn1WXNQzDgt0wPIIFu2F4BAt2w/AIqyrQNcVj2PPzT1bZzh1nQQUAhs9ztdSptD4XXCNQoxrn2v4dZBvY9ojqe2Ga53cXHR+yRL9e9bb30e1kO/n9farv3ncO8Ho7O1Xf+TyLbufP64LkVD5HtjX336f67lrP400NcGrs/ITelqqnk0WoR558TPXt6+8gW6yGuLVlDYuic3O6EFVwLPjGgjyfPTuvzy8fHztHtjz0aryhEm9r75TeduzSmcOqXaNnE4u4XQlOAwaA5OXq/Sjk9aq5gD3ZDcMzWLAbhkewYDcMj1BPwckGEdknIu9X2j/9UcXeLyJ7ReSMiHxTRPR5gIZh3BLUI9DlADzpnMtUSkr
/WET+CcB/xGL7p2+IyP8C8CwWa8nXxCdA1F8tnOWTulCyvoEFiY07B+sY7iJnD59W7dr2ojt0Ma9/PU9OHp3gOe75kJ4dle/nLK/suhbV992jPN6G4/r8/b4d3Gbp9fd00cy3wHOu7+nmopkAIDEWyNbs4lZT0qwLVq1x5ZwN6P3oww18HIpznBkIAPN57vEe9usC3YZm5ZIuXibT3uNvqcsHm8Nk6+rUawU0+JVCmKkabZ58etahht+3gbcV16+brmXJdkoC4f8fwodt2C2yVH41WPnnADwJ4NsV+wsAPv1h6zIM4+ZRb5MIf6WM9BiAlwCcBZByzi0lHV+C9X8zjFuauoK90vllN4AeLHZ+4RkTNbi6/dNcij+OGYaxOnwkNd45lwLwGoCHAbSIyNIXpB4A/MUI1e2fGltiKxqsYRjXTj1qfIeItFR+jwD4BIDjWAz6X6+4fR7A927UIA3DWDn1qPFrAbwgIn4s3hy+5Zz7BxE5BuAbIvJfAbyHxX5wH4ighIgvVW2cGdedFzgl9PDrekqoRrSkV4xFhLdHY6oQDHIX6rUxVkUvXdAr2V7M8XoLm/QU2HOXuC1VT40Kqtk5ToFNpnjuPQA0BDgt1a/MDweASIDv/R2N/KbhSkGveosGVrJblCqyABDMK3PXQ/pc/8I8q/SNYT211pVZDd//7j+QbXiUjzcAFNtZYc+U9GN71/qHyOYL6ddS0ae/wdDQ1qGvFTh6pbruw0Kh9lfleto/HcJiT/bl9nOo0bnVMIxbD8ugMwyPYMFuGB7Bgt0wPMKqzmcvFQqYvpSsso0c1dviTJ1nUcZ11P16HzL+jmpP9LNwN70nqXgCm/f0sjHGot1UThesLitiydC8nlpbHGRhafSoLvyVz3FqbVjtqg0M3DNAtlCrftpDZW7G3uFT5lbH9H3IKcJdMauf32yOxbxyXhfofA28vYYGfd72xStHyXbh8hF2jOl5pfEuTskuNuj1CrQU5fmTSkN7AKnp+l87Xxk9RrboFu7ZDgCtfdVjc77ahS3tyW4YHsGC3TA8ggW7YXgEC3bD8AgW7IbhEVZVjU/PpPHGP71aZUue09XaoqJedt6lFxHQSJ7kaq0AkHS8vWyPmccAAAhNSURBVOVjWmLTXZvJ1tbBCv2DW7jAAwBs7Gff0a0ZxRNw93NC5MWjrCwDwNQYzzlqcboKu+epXyHbmn69VdTEHFdGTaV4vGnoqarx5gTZXIhbGQGABJrYV88ORmsLK+elrF7FdXyKKwIHW7ioRqBZedMCINK+jWxnrryv+p4+wdvqm9PfGD32sSdVu8Zb+/6ZbIdOHFd9BxPVany2xKnUS9iT3TA8ggW7YXgEC3bD8AgW7IbhEVa3P3vAIdxenebYsUsfQm6SU03z6/fXva3EDn1eb7iNt7d8TEucOs1znjeUWIQK1eij3q+k1u4Y1Kut+rdyuuulXVtU39f3vk620rTe5z7QxhVuk3P6XP8JpYZAqZHTNNfv0Gc2uxLPUT91QZ8LXlCKpTZFOYUWAHITLIS9d1y/Ft567R/JVooryl9BT5fNNnHf9ul5fR96N7P4uD3Coi4AxNfo1WE1tj/C6ygv6LUclo+tVK5RxwH2ZDcMz2DBbhgewYLdMDzCSto/fU1EzovIwcq/3Td+uIZhXCsraf8EAP/JOfftD1jWMIxbhHoKTjoAWvunj76xSAhtu6vTFLt2sboNAOVJTms9H6m/X1b/gF5wwNfG2yv4OHUTAJJZLiiRUApSNPp0FblYYCU7K0pVVQDpPBc9mErrvc/8Sp80Xxcr/4Dely0qujLc5BZ4vUFW+SORdnX53Cwfr0CLngM7vHCFbKmzZ1Tf5gQr52nfhOq7q4cV8ssj3N8vdVSvLltQUob7tunp0P3rudBFYYbffgDA+Yz+xkejoZnX8VDvE/p6J6sLmUz49qp+wDW2f3LOLa3xj0XkkIh8VUT0K94wjFuCa2r/JCI7APxnLLaBegBAAsDvact
e3f4p+xHuboZhXF+utf3T0865kUqH1xyA/40aNeSvbv/U0GQt3A3jZnGt7Z9OiMjaik2w2K5ZqepnGMatwkraP70qIh0ABMBBAP/uBo7TMIwVspL2T/XPxjcM46ZjGXSG4REs2A3DI1iwG4ZHsGA3DI+wqsUrigt5TB6s7l82e05Pe8xNclqp21V/O/ijh/ap9nAbby8+oKd/DnyMK4XGglz1NhTSD2MgyPfShhqFLhobuQJqEXpBilKWK9HWKl7h1vA+zPv1bOdMlpOeSkUu5jBX1s+Zg3LOUpyCCwD9LevIdtfOftU3V+R018yU3lPtrUt8HEpxJZV4QC8iEu1bS7ahjJ7GOzbBVX53RJ5SffvX6BV9NYbHeH8PjL6u+s5Hq49vvqynWAP2ZDcMz2DBbhgewYLdMDyCBbtheIRVFejKRUFuonoyzPghfX632v6p9f66t5U8orfLCbTyfPRwXJ+gs3mQxa22jm6yTRf1Srbnc9NkG72kVwl1yRvT/qnYzWJPZ79eQ8APXkcqpdQVOK1Pg4h3dZDtvr7tqu+GMtcQKJZVV4Ta+8gWe0Cfvx8oc/ujM8lTZMs0r1GXb+7iyq7pK7rIePogV731zfG2AKD1wT7VrnFsH6/jQqMuwA4+Ul23IenTrwPAnuyG4Rks2A3DI1iwG4ZHsGA3DI9gwW4YHmFV1fhYcwwf/4XqafCvXHlR9Z06z2l/M2O6Oq0RSZRUe0JRopePaQlfjFNYp+dYed83dFhd/o1Lx8g2dJYVXAAo7j1NttLRC4onsCbGbyrC4F5xADA9xqmxT//Gb6i+kQQr5F1Rtk3U2FZ2hpV7yddI483xWxjJ6+dseo4V5myD/galI8FVhY+ffIdsxRIfbwBYUDKBN8UHVN+2rfxGYP6knsb75luvqnaNllZW//ds2ab6ti5LBR7y1677ak92w/AIFuyG4REs2A3DI1iwG4ZHkMXuTqu0MZFxAMOVP9sB6BOjb29sv24/7qR92+Cc47xlrHKwV21YZL9zrv5k99sE26/bjzt5367GPsYbhkewYDcMj3Azg/0vbuK2byS2X7cfd/K+/Yyb9p3dMIzVxT7GG4ZHWPVgF5GnReSkiJwRkedWe/vXExF5XkTGROTIVbaEiLwkIqcrP/WSKrcwIrJeRF4TkWMiclREfqdiv633TUQaRGSfiLxf2a8/qtj7RWRv5Zr8pojckb3FVzXYK51g/wzALwDYDuBzIqLXLbo9+BqAp5fZngPwinNuEMArlb9vN4oAftc5tx3AHgD/vnKebvd9ywF40jl3N4DdAJ4WkT0A/juArzrnNgGYBvDsTRzjDWO1n+wPAjjjnDvnnMsD+AaAZ1Z5DNcN59wbAJZP9XoGwAuV31/AYu/62wrn3Ihz7kDl9zSA4wC6cZvvm1skU/kzWPnnADwJ4NsV+223X/Wy2sHeDeDqiouXKrY7iU7n3Ejl91EAnTdzMCtFRPqw2LJ7L+6AfRMRv4gcBDAG4CUAZwGknHNLc27vxGsSgAl0NxS3+Krjtn3dISJNAP4WwJecc7NX/9/tum/OuZJzbjeAHix+0uQSwncoqx3slwFc3fSqp2K7k0iKyFoAqPwcu8njuSZEJIjFQP9r59x3KuY7Yt8AwDmXAvAagIcBtIjIUiGXO/GaBLD6wf4OgMGK+hkC8JsA9FI1ty8vAvh85ffPA/jeTRzLNSEiAuCvABx3zv3pVf91W++biHSISEvl9wiAT2BRj3gNwK9X3G67/aqXVU+qEZFPAfgfAPwAnnfO/fGqDuA6IiJ/A+AJLM6aSgL4AwDfBfAtAL1YnOH3Wecc12u6hRGRRwG8CeAwgKXWDb+Pxe/tt+2+icguLApwfiw+6L7lnPsvIjKARbE4AeA9AP/KOcfdJm5zLIPOMDyCCXSG4REs2A3DI1iwG4ZHsGA3DI9gwW4YHsGC3TA8ggW7YXgEC3bD8Aj/Dw87RVmqnGCMAAAAAElFTkSuQmCC\n", 565 | 
"text/plain": [ 566 | "
" 567 | ] 568 | }, 569 | "metadata": { 570 | "needs_background": "light" 571 | } 572 | } 573 | ] 574 | } 575 | ] 576 | } --------------------------------------------------------------------------------