├── 1_Background_of_NLP.ipynb ├── 2_Representation_Vector.ipynb ├── 3_Tagging_RNN.ipynb ├── 4_NMT.ipynb └── README.md /1_Background_of_NLP.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "eng_data = pd.read_csv(\"../data/IMDB Dataset.csv\")" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "data": { 20 | "text/plain": [ 21 | "(50000, 2)" 22 | ] 23 | }, 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "output_type": "execute_result" 27 | } 28 | ], 29 | "source": [ 30 | "eng_data.shape" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/html": [ 41 | "
\n", 42 | "\n", 55 | "\n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | "
reviewsentiment
0One of the other reviewers has mentioned that ...positive
1A wonderful little production. <br /><br />The...positive
2I thought this was a wonderful way to spend ti...positive
3Basically there's a family where a little boy ...negative
4Petter Mattei's \"Love in the Time of Money\" is...positive
\n", 91 | "
" 92 | ], 93 | "text/plain": [ 94 | " review sentiment\n", 95 | "0 One of the other reviewers has mentioned that ... positive\n", 96 | "1 A wonderful little production.

The... positive\n", 97 | "2 I thought this was a wonderful way to spend ti... positive\n", 98 | "3 Basically there's a family where a little boy ... negative\n", 99 | "4 Petter Mattei's \"Love in the Time of Money\" is... positive" 100 | ] 101 | }, 102 | "execution_count": 3, 103 | "metadata": {}, 104 | "output_type": "execute_result" 105 | } 106 | ], 107 | "source": [ 108 | "eng_data.head(5)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 4, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "from bs4 import BeautifulSoup\n", 118 | "from nltk.tokenize.toktok import ToktokTokenizer\n", 119 | "import re\n", 120 | "import nltk\n", 121 | "\n", 122 | "def strip_html(text):\n", 123 | " soup = BeautifulSoup(text, \"html.parser\")\n", 124 | " return soup.get_text()\n", 125 | "\n", 126 | "def remove_between_square_brackets(text):\n", 127 | " return re.sub('\\[[^]]*\\]', '', text)\n", 128 | "\n", 129 | "def remove_special_characters(text, remove_digits = True):\n", 130 | " pattern=r'[^a-zA-z0-9\\s]'\n", 131 | " text=re.sub(pattern,'',text)\n", 132 | " return text\n", 133 | "\n", 134 | "def remove_stopwords(text, is_lower_case = False):\n", 135 | " tokenizer = ToktokTokenizer()\n", 136 | " stopword_list = nltk.corpus.stopwords.words('english')\n", 137 | " tokens = tokenizer.tokenize(text)\n", 138 | " tokens = [token.strip() for token in tokens]\n", 139 | " if is_lower_case:\n", 140 | " filtered_tokens = [token for token in tokens if token not in stopword_list]\n", 141 | " else:\n", 142 | " filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]\n", 143 | " filtered_text = ' '.join(filtered_tokens) \n", 144 | " return filtered_text\n", 145 | "\n", 146 | "def text_cleaning(text):\n", 147 | " text = strip_html(text)\n", 148 | " text = remove_between_square_brackets(text)\n", 149 | " text = remove_special_characters(text, remove_digits = True)\n", 150 | " text = remove_stopwords(text, is_lower_case = False)\n", 151 | " return text" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 5, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "data": { 161 | "text/plain": [ 162 | "'I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer). While some may be disappointed when they realize this is not Match Point 2: Risk Addiction, I thought it was proof that Woody Allen is still fully in control of the style many of us have grown to love.

This was the most I\\'d laughed at one of Woody\\'s comedies in years (dare I say a decade?). While I\\'ve never been impressed with Scarlet Johanson, in this she managed to tone down her \"sexy\" image and jumped right into a average, but spirited young woman.

This may not be the crown jewel of his career, but it was wittier than \"Devil Wears Prada\" and more interesting than \"Superman\" a great comedy to go see with friends.'" 163 | ] 164 | }, 165 | "execution_count": 5, 166 | "metadata": {}, 167 | "output_type": "execute_result" 168 | } 169 | ], 170 | "source": [ 171 | "eng_data[\"review\"][2]" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 6, 177 | "metadata": {}, 178 | "outputs": [ 179 | { 180 | "data": { 181 | "text/plain": [ 182 | "'I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer). While some may be disappointed when they realize this is not Match Point 2: Risk Addiction, I thought it was proof that Woody Allen is still fully in control of the style many of us have grown to love.This was the most I\\'d laughed at one of Woody\\'s comedies in years (dare I say a decade?). While I\\'ve never been impressed with Scarlet Johanson, in this she managed to tone down her \"sexy\" image and jumped right into a average, but spirited young woman.This may not be the crown jewel of his career, but it was wittier than \"Devil Wears Prada\" and more interesting than \"Superman\" a great comedy to go see with friends.'" 183 | ] 184 | }, 185 | "execution_count": 6, 186 | "metadata": {}, 187 | "output_type": "execute_result" 188 | } 189 | ], 190 | "source": [ 191 | "strip_html(eng_data[\"review\"][2])" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 7, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "eng_data[\"review\"] = eng_data[\"review\"].apply(text_cleaning)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 8, 206 | "metadata": {}, 207 | "outputs": [ 208 | { 209 | "data": { 210 | "text/plain": [ 211 | "'wonderful little production filming technique unassuming oldtimeBBC fashion gives comforting sometimes discomforting sense realism entire piece actors extremely well chosen Michael Sheen got polari voices pat truly see seamless editing guided references Williams diary entries well worth watching terrificly written performed piece masterful production one great masters comedy life realism really comes home little things fantasy guard rather use traditional dream techniques remains solid disappears plays knowledge senses particularly scenes concerning Orton Halliwell sets particularly flat Halliwells murals decorating every surface terribly well done'" 212 | ] 213 | }, 214 | "execution_count": 8, 215 | "metadata": {}, 216 | "output_type": "execute_result" 217 | } 218 | ], 219 | "source": [ 220 | "eng_data[\"review\"][1]" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 9, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "from nltk.tokenize import word_tokenize\n", 230 | "vocab_lst = [word_tokenize(x) for x in eng_data[\"review\"]]" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 10, 236 | "metadata": {}, 237 | "outputs": [ 238 | { 239 | "data": { 240 | "text/plain": [ 241 | "[('movie', 82310),\n", 242 | " ('film', 73514),\n", 243 | " ('one', 46301),\n", 244 | " ('like', 37483),\n", 245 | " ('good', 27403),\n", 246 | " ('would', 23751),\n", 247 | " ('time', 22741),\n", 248 | " ('really', 22207),\n", 249 | " ('see', 21765),\n", 250 | " ('even', 21494)]" 251 | ] 252 | }, 253 | "execution_count": 10, 254 | "metadata": {}, 255 | "output_type": "execute_result" 256 | } 257 | ], 258 | "source": [ 259 | "from collections import Counter\n", 260 | "vocab_lst2 = [y for x in vocab_lst for y in x]\n", 261 | "Counter(vocab_lst2).most_common(10)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 11, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "vocab_lst3 = list(Counter(vocab_lst2).keys())\n", 271 | "vocab_to_index = {word: index for index, word in enumerate(vocab_lst3)}\n", 272 | "index_to_vocab = {index: word for index, word in enumerate(vocab_lst3)}" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 12, 278 | "metadata": { 279 | "scrolled": true 280 | }, 281 | "outputs": [ 282 | { 283 | "data": { 284 | "text/plain": [ 285 | "{0: 'One',\n", 286 | " 1: 'reviewers',\n", 287 | " 2: 'mentioned',\n", 288 | " 3: 'watching',\n", 289 | " 4: '1',\n", 290 | " 5: 'Oz',\n", 291 | " 6: 'episode',\n", 292 | " 7: 'youll',\n", 293 | " 8: 'hooked',\n", 294 | " 9: 'right',\n", 295 | " 10: 'exactly',\n", 296 | " 11: 'happened',\n", 297 | " 12: 'meThe',\n", 298 | " 13: 'first',\n", 299 | " 14: 'thing',\n", 300 | " 15: 'struck',\n", 301 | " 16: 'brutality',\n", 302 | " 17: 'unflinching',\n", 303 | " 18: 'scenes',\n", 304 | " 19: 'violence',\n", 305 | " 20: 'set',\n", 306 | " 21: 'word',\n", 307 | " 22: 'GO',\n", 308 | " 23: 'Trust',\n", 309 | " 24: 'show',\n", 310 | " 25: 'faint',\n", 311 | " 26: 'hearted',\n", 312 | " 27: 'timid',\n", 313 | " 28: 'pulls',\n", 314 | " 29: 'punches',\n", 315 | " 30: 'regards',\n", 316 | " 31: 'drugs',\n", 317 | " 32: 'sex',\n", 318 | " 33: 'hardcore',\n", 319 | " 34: 'classic',\n", 320 | " 35: 'use',\n", 321 | " 36: 'wordIt',\n", 322 | " 37: 'called',\n", 323 | " 38: 'OZ',\n", 324 | " 39: 'nickname',\n", 325 | " 40: 'given',\n", 326 | " 41: 'Oswald',\n", 327 | " 42: 'Maximum',\n", 328 | " 43: 'Security',\n", 329 | " 44: 'State',\n", 330 | " 45: 'Penitentary',\n", 331 | " 46: 'focuses',\n", 332 | " 47: 'mainly',\n", 333 | " 48: 'Emerald',\n", 334 | " 49: 'City',\n", 335 | " 50: 'experimental',\n", 336 | " 51: 'section',\n", 337 | " 52: 'prison',\n", 338 | " 53: 'cells',\n", 339 | " 54: 'glass',\n", 340 | " 55: 'fronts',\n", 341 | " 56: 'face',\n", 342 | " 57: 'inwards',\n", 343 | " 58: 'privacy',\n", 344 | " 59: 'high',\n", 345 | " 60: 'agenda',\n", 346 | " 61: 'Em',\n", 347 | " 62: 'home',\n", 348 | " 63: 'manyAryans',\n", 349 | " 64: 'Muslims',\n", 350 | " 65: 'gangstas',\n", 351 | " 66: 'Latinos',\n", 352 | " 67: 'Christians',\n", 353 | " 68: 'Italians',\n", 354 | " 69: 'Irish',\n", 355 | " 70: 'moreso',\n", 356 | " 71: 'scuffles',\n", 357 | " 72: 'death',\n", 358 | " 73: 'stares',\n", 359 | " 74: 'dodgy',\n", 360 | " 75: 'dealings',\n", 361 | " 76: 'shady',\n", 362 | " 77: 'agreements',\n", 363 | " 78: 'never',\n", 364 | " 79: 'far',\n", 365 | " 80: 'awayI',\n", 366 | " 81: 'would',\n", 367 | " 82: 'say',\n", 368 | " 83: 'main',\n", 369 | " 84: 'appeal',\n", 370 | " 85: 'due',\n", 371 | " 86: 'fact',\n", 372 | " 87: 'goes',\n", 373 | " 88: 'shows',\n", 374 | " 89: 'wouldnt',\n", 375 | " 90: 'dare',\n", 376 | " 91: 'Forget',\n", 377 | " 92: 'pretty',\n", 378 | " 93: 'pictures',\n", 379 | " 94: 'painted',\n", 380 | " 95: 'mainstream',\n", 381 | " 96: 'audiences',\n", 382 | " 97: 'forget',\n", 383 | " 98: 'charm',\n", 384 | " 99: 'romanceOZ',\n", 385 | " 100: 'doesnt',\n", 386 | " 101: 'mess',\n", 387 | " 102: 'around',\n", 388 | " 103: 'ever',\n", 389 | " 104: 'saw',\n", 390 | " 105: 'nasty',\n", 391 | " 106: 'surreal',\n", 392 | " 107: 'couldnt',\n", 393 | " 108: 'ready',\n", 394 | " 109: 'watched',\n", 395 | " 110: 'developed',\n", 396 | " 111: 'taste',\n", 397 | " 112: 'got',\n", 398 | " 113: 'accustomed',\n", 399 | " 114: 'levels',\n", 400 | " 115: 'graphic',\n", 401 | " 116: 'injustice',\n", 402 | " 117: 'crooked',\n", 403 | " 118: 'guards',\n", 404 | " 119: 'wholl',\n", 405 | " 120: 'sold',\n", 406 | " 121: 'nickel',\n", 407 | " 122: 'inmates',\n", 408 | " 123: 'kill',\n", 409 | " 124: 'order',\n", 410 | " 125: 'get',\n", 411 | " 126: 'away',\n", 412 | " 127: 'well',\n", 413 | " 128: 'mannered',\n", 414 | " 129: 'middle',\n", 415 | " 130: 'class',\n", 416 | " 131: 'turned',\n", 417 | " 132: 'bitches',\n", 418 | " 133: 'lack',\n", 419 | " 134: 'street',\n", 420 | " 135: 'skills',\n", 421 | " 136: 'experience',\n", 422 | " 137: 'Watching',\n", 423 | " 138: 'may',\n", 424 | " 139: 'become',\n", 425 | " 140: 'comfortable',\n", 426 | " 141: 'uncomfortable',\n", 427 | " 142: 'viewingthats',\n", 428 | " 143: 'touch',\n", 429 | " 144: 'darker',\n", 430 | " 145: 'side',\n", 431 | " 146: 'wonderful',\n", 432 | " 147: 'little',\n", 433 | " 148: 'production',\n", 434 | " 149: 'filming',\n", 435 | " 150: 'technique',\n", 436 | " 151: 'unassuming',\n", 437 | " 152: 'oldtimeBBC',\n", 438 | " 153: 'fashion',\n", 439 | " 154: 'gives',\n", 440 | " 155: 'comforting',\n", 441 | " 156: 'sometimes',\n", 442 | " 157: 'discomforting',\n", 443 | " 158: 'sense',\n", 444 | " 159: 'realism',\n", 445 | " 160: 'entire',\n", 446 | " 161: 'piece',\n", 447 | " 162: 'actors',\n", 448 | " 163: 'extremely',\n", 449 | " 164: 'chosen',\n", 450 | " 165: 'Michael',\n", 451 | " 166: 'Sheen',\n", 452 | " 167: 'polari',\n", 453 | " 168: 'voices',\n", 454 | " 169: 'pat',\n", 455 | " 170: 'truly',\n", 456 | " 171: 'see',\n", 457 | " 172: 'seamless',\n", 458 | " 173: 'editing',\n", 459 | " 174: 'guided',\n", 460 | " 175: 'references',\n", 461 | " 176: 'Williams',\n", 462 | " 177: 'diary',\n", 463 | " 178: 'entries',\n", 464 | " 179: 'worth',\n", 465 | " 180: 'terrificly',\n", 466 | " 181: 'written',\n", 467 | " 182: 'performed',\n", 468 | " 183: 'masterful',\n", 469 | " 184: 'one',\n", 470 | " 185: 'great',\n", 471 | " 186: 'masters',\n", 472 | " 187: 'comedy',\n", 473 | " 188: 'life',\n", 474 | " 189: 'really',\n", 475 | " 190: 'comes',\n", 476 | " 191: 'things',\n", 477 | " 192: 'fantasy',\n", 478 | " 193: 'guard',\n", 479 | " 194: 'rather',\n", 480 | " 195: 'traditional',\n", 481 | " 196: 'dream',\n", 482 | " 197: 'techniques',\n", 483 | " 198: 'remains',\n", 484 | " 199: 'solid',\n", 485 | " 200: 'disappears',\n", 486 | " 201: 'plays',\n", 487 | " 202: 'knowledge',\n", 488 | " 203: 'senses',\n", 489 | " 204: 'particularly',\n", 490 | " 205: 'concerning',\n", 491 | " 206: 'Orton',\n", 492 | " 207: 'Halliwell',\n", 493 | " 208: 'sets',\n", 494 | " 209: 'flat',\n", 495 | " 210: 'Halliwells',\n", 496 | " 211: 'murals',\n", 497 | " 212: 'decorating',\n", 498 | " 213: 'every',\n", 499 | " 214: 'surface',\n", 500 | " 215: 'terribly',\n", 501 | " 216: 'done',\n", 502 | " 217: 'thought',\n", 503 | " 218: 'way',\n", 504 | " 219: 'spend',\n", 505 | " 220: 'time',\n", 506 | " 221: 'hot',\n", 507 | " 222: 'summer',\n", 508 | " 223: 'weekend',\n", 509 | " 224: 'sitting',\n", 510 | " 225: 'air',\n", 511 | " 226: 'conditioned',\n", 512 | " 227: 'theater',\n", 513 | " 228: 'lighthearted',\n", 514 | " 229: 'plot',\n", 515 | " 230: 'simplistic',\n", 516 | " 231: 'dialogue',\n", 517 | " 232: 'witty',\n", 518 | " 233: 'characters',\n", 519 | " 234: 'likable',\n", 520 | " 235: 'even',\n", 521 | " 236: 'bread',\n", 522 | " 237: 'suspected',\n", 523 | " 238: 'serial',\n", 524 | " 239: 'killer',\n", 525 | " 240: 'disappointed',\n", 526 | " 241: 'realize',\n", 527 | " 242: 'Match',\n", 528 | " 243: 'Point',\n", 529 | " 244: '2',\n", 530 | " 245: 'Risk',\n", 531 | " 246: 'Addiction',\n", 532 | " 247: 'proof',\n", 533 | " 248: 'Woody',\n", 534 | " 249: 'Allen',\n", 535 | " 250: 'still',\n", 536 | " 251: 'fully',\n", 537 | " 252: 'control',\n", 538 | " 253: 'style',\n", 539 | " 254: 'many',\n", 540 | " 255: 'us',\n", 541 | " 256: 'grown',\n", 542 | " 257: 'loveThis',\n", 543 | " 258: 'Id',\n", 544 | " 259: 'laughed',\n", 545 | " 260: 'Woodys',\n", 546 | " 261: 'comedies',\n", 547 | " 262: 'years',\n", 548 | " 263: 'decade',\n", 549 | " 264: 'Ive',\n", 550 | " 265: 'impressed',\n", 551 | " 266: 'Scarlet',\n", 552 | " 267: 'Johanson',\n", 553 | " 268: 'managed',\n", 554 | " 269: 'tone',\n", 555 | " 270: 'sexy',\n", 556 | " 271: 'image',\n", 557 | " 272: 'jumped',\n", 558 | " 273: 'average',\n", 559 | " 274: 'spirited',\n", 560 | " 275: 'young',\n", 561 | " 276: 'womanThis',\n", 562 | " 277: 'crown',\n", 563 | " 278: 'jewel',\n", 564 | " 279: 'career',\n", 565 | " 280: 'wittier',\n", 566 | " 281: 'Devil',\n", 567 | " 282: 'Wears',\n", 568 | " 283: 'Prada',\n", 569 | " 284: 'interesting',\n", 570 | " 285: 'Superman',\n", 571 | " 286: 'go',\n", 572 | " 287: 'friends',\n", 573 | " 288: 'Basically',\n", 574 | " 289: 'theres',\n", 575 | " 290: 'family',\n", 576 | " 291: 'boy',\n", 577 | " 292: 'Jake',\n", 578 | " 293: 'thinks',\n", 579 | " 294: 'zombie',\n", 580 | " 295: 'closet',\n", 581 | " 296: 'parents',\n", 582 | " 297: 'fighting',\n", 583 | " 298: 'timeThis',\n", 584 | " 299: 'movie',\n", 585 | " 300: 'slower',\n", 586 | " 301: 'soap',\n", 587 | " 302: 'opera',\n", 588 | " 303: 'suddenly',\n", 589 | " 304: 'decides',\n", 590 | " 305: 'Rambo',\n", 591 | " 306: 'zombieOK',\n", 592 | " 307: 'youre',\n", 593 | " 308: 'going',\n", 594 | " 309: 'make',\n", 595 | " 310: 'film',\n", 596 | " 311: 'must',\n", 597 | " 312: 'Decide',\n", 598 | " 313: 'thriller',\n", 599 | " 314: 'drama',\n", 600 | " 315: 'watchable',\n", 601 | " 316: 'Parents',\n", 602 | " 317: 'divorcing',\n", 603 | " 318: 'arguing',\n", 604 | " 319: 'like',\n", 605 | " 320: 'real',\n", 606 | " 321: 'totally',\n", 607 | " 322: 'ruins',\n", 608 | " 323: 'expected',\n", 609 | " 324: 'BOOGEYMAN',\n", 610 | " 325: 'similar',\n", 611 | " 326: 'instead',\n", 612 | " 327: 'meaningless',\n", 613 | " 328: 'spots3',\n", 614 | " 329: '10',\n", 615 | " 330: 'playing',\n", 616 | " 331: 'descent',\n", 617 | " 332: 'dialogs',\n", 618 | " 333: 'shots',\n", 619 | " 334: 'ignore',\n", 620 | " 335: 'Petter',\n", 621 | " 336: 'Matteis',\n", 622 | " 337: 'Love',\n", 623 | " 338: 'Time',\n", 624 | " 339: 'Money',\n", 625 | " 340: 'visually',\n", 626 | " 341: 'stunning',\n", 627 | " 342: 'watch',\n", 628 | " 343: 'Mr',\n", 629 | " 344: 'Mattei',\n", 630 | " 345: 'offers',\n", 631 | " 346: 'vivid',\n", 632 | " 347: 'portrait',\n", 633 | " 348: 'human',\n", 634 | " 349: 'relations',\n", 635 | " 350: 'seems',\n", 636 | " 351: 'telling',\n", 637 | " 352: 'money',\n", 638 | " 353: 'power',\n", 639 | " 354: 'success',\n", 640 | " 355: 'people',\n", 641 | " 356: 'different',\n", 642 | " 357: 'situations',\n", 643 | " 358: 'encounter',\n", 644 | " 359: 'variation',\n", 645 | " 360: 'Arthur',\n", 646 | " 361: 'Schnitzlers',\n", 647 | " 362: 'play',\n", 648 | " 363: 'theme',\n", 649 | " 364: 'director',\n", 650 | " 365: 'transfers',\n", 651 | " 366: 'action',\n", 652 | " 367: 'present',\n", 653 | " 368: 'New',\n", 654 | " 369: 'York',\n", 655 | " 370: 'meet',\n", 656 | " 371: 'connect',\n", 657 | " 372: 'connected',\n", 658 | " 373: 'another',\n", 659 | " 374: 'next',\n", 660 | " 375: 'person',\n", 661 | " 376: 'know',\n", 662 | " 377: 'previous',\n", 663 | " 378: 'point',\n", 664 | " 379: 'contact',\n", 665 | " 380: 'Stylishly',\n", 666 | " 381: 'sophisticated',\n", 667 | " 382: 'luxurious',\n", 668 | " 383: 'look',\n", 669 | " 384: 'taken',\n", 670 | " 385: 'live',\n", 671 | " 386: 'world',\n", 672 | " 387: 'habitatThe',\n", 673 | " 388: 'gets',\n", 674 | " 389: 'souls',\n", 675 | " 390: 'picture',\n", 676 | " 391: 'stages',\n", 677 | " 392: 'loneliness',\n", 678 | " 393: 'inhabits',\n", 679 | " 394: 'big',\n", 680 | " 395: 'city',\n", 681 | " 396: 'best',\n", 682 | " 397: 'place',\n", 683 | " 398: 'find',\n", 684 | " 399: 'sincere',\n", 685 | " 400: 'fulfillment',\n", 686 | " 401: 'discerns',\n", 687 | " 402: 'case',\n", 688 | " 403: 'encounterThe',\n", 689 | " 404: 'acting',\n", 690 | " 405: 'good',\n", 691 | " 406: 'direction',\n", 692 | " 407: 'Steve',\n", 693 | " 408: 'Buscemi',\n", 694 | " 409: 'Rosario',\n", 695 | " 410: 'Dawson',\n", 696 | " 411: 'Carol',\n", 697 | " 412: 'Kane',\n", 698 | " 413: 'Imperioli',\n", 699 | " 414: 'Adrian',\n", 700 | " 415: 'Grenier',\n", 701 | " 416: 'rest',\n", 702 | " 417: 'talented',\n", 703 | " 418: 'cast',\n", 704 | " 419: 'come',\n", 705 | " 420: 'aliveWe',\n", 706 | " 421: 'wish',\n", 707 | " 422: 'luck',\n", 708 | " 423: 'await',\n", 709 | " 424: 'anxiously',\n", 710 | " 425: 'work',\n", 711 | " 426: 'Probably',\n", 712 | " 427: 'alltime',\n", 713 | " 428: 'favorite',\n", 714 | " 429: 'story',\n", 715 | " 430: 'selflessness',\n", 716 | " 431: 'sacrifice',\n", 717 | " 432: 'dedication',\n", 718 | " 433: 'noble',\n", 719 | " 434: 'cause',\n", 720 | " 435: 'preachy',\n", 721 | " 436: 'boring',\n", 722 | " 437: 'old',\n", 723 | " 438: 'despite',\n", 724 | " 439: 'seen',\n", 725 | " 440: '15',\n", 726 | " 441: 'times',\n", 727 | " 442: 'last',\n", 728 | " 443: '25',\n", 729 | " 444: 'Paul',\n", 730 | " 445: 'Lukas',\n", 731 | " 446: 'performance',\n", 732 | " 447: 'brings',\n", 733 | " 448: 'tears',\n", 734 | " 449: 'eyes',\n", 735 | " 450: 'Bette',\n", 736 | " 451: 'Davis',\n", 737 | " 452: 'sympathetic',\n", 738 | " 453: 'roles',\n", 739 | " 454: 'delight',\n", 740 | " 455: 'kids',\n", 741 | " 456: 'grandma',\n", 742 | " 457: 'says',\n", 743 | " 458: 'dressedup',\n", 744 | " 459: 'midgets',\n", 745 | " 460: 'children',\n", 746 | " 461: 'makes',\n", 747 | " 462: 'fun',\n", 748 | " 463: 'mothers',\n", 749 | " 464: 'slow',\n", 750 | " 465: 'awakening',\n", 751 | " 466: 'whats',\n", 752 | " 467: 'happening',\n", 753 | " 468: 'roof',\n", 754 | " 469: 'believable',\n", 755 | " 470: 'startling',\n", 756 | " 471: 'dozen',\n", 757 | " 472: 'thumbs',\n", 758 | " 473: 'theyd',\n", 759 | " 474: 'sure',\n", 760 | " 475: 'resurrection',\n", 761 | " 476: 'dated',\n", 762 | " 477: 'Seahunt',\n", 763 | " 478: 'series',\n", 764 | " 479: 'tech',\n", 765 | " 480: 'today',\n", 766 | " 481: 'bring',\n", 767 | " 482: 'back',\n", 768 | " 483: 'kid',\n", 769 | " 484: 'excitement',\n", 770 | " 485: 'meI',\n", 771 | " 486: 'grew',\n", 772 | " 487: 'black',\n", 773 | " 488: 'white',\n", 774 | " 489: 'TV',\n", 775 | " 490: 'Gunsmoke',\n", 776 | " 491: 'heros',\n", 777 | " 492: 'weekYou',\n", 778 | " 493: 'vote',\n", 779 | " 494: 'comeback',\n", 780 | " 495: 'new',\n", 781 | " 496: 'sea',\n", 782 | " 497: 'huntWe',\n", 783 | " 498: 'need',\n", 784 | " 499: 'change',\n", 785 | " 500: 'pace',\n", 786 | " 501: 'water',\n", 787 | " 502: 'adventureOh',\n", 788 | " 503: 'thank',\n", 789 | " 504: 'outlet',\n", 790 | " 505: 'view',\n", 791 | " 506: 'viewpoints',\n", 792 | " 507: 'moviesSo',\n", 793 | " 508: 'ole',\n", 794 | " 509: 'believe',\n", 795 | " 510: 'wan',\n", 796 | " 511: 'na',\n", 797 | " 512: 'sayWould',\n", 798 | " 513: 'nice',\n", 799 | " 514: 'read',\n", 800 | " 515: 'plus',\n", 801 | " 516: 'points',\n", 802 | " 517: 'huntIf',\n", 803 | " 518: 'rhymes',\n", 804 | " 519: 'lines',\n", 805 | " 520: 'let',\n", 806 | " 521: 'submitor',\n", 807 | " 522: 'leave',\n", 808 | " 523: 'doubt',\n", 809 | " 524: 'quitIf',\n", 810 | " 525: 'lets',\n", 811 | " 526: 'amazing',\n", 812 | " 527: 'fresh',\n", 813 | " 528: 'innovative',\n", 814 | " 529: 'idea',\n", 815 | " 530: '70s',\n", 816 | " 531: 'aired',\n", 817 | " 532: '7',\n", 818 | " 533: '8',\n", 819 | " 534: 'brilliant',\n", 820 | " 535: 'dropped',\n", 821 | " 536: '1990',\n", 822 | " 537: 'funny',\n", 823 | " 538: 'anymore',\n", 824 | " 539: 'continued',\n", 825 | " 540: 'decline',\n", 826 | " 541: 'complete',\n", 827 | " 542: 'waste',\n", 828 | " 543: 'todayIts',\n", 829 | " 544: 'disgraceful',\n", 830 | " 545: 'fallen',\n", 831 | " 546: 'writing',\n", 832 | " 547: 'painfully',\n", 833 | " 548: 'bad',\n", 834 | " 549: 'performances',\n", 835 | " 550: 'almost',\n", 836 | " 551: 'mildly',\n", 837 | " 552: 'entertaining',\n", 838 | " 553: 'respite',\n", 839 | " 554: 'guesthosts',\n", 840 | " 555: 'probably',\n", 841 | " 556: 'hard',\n", 842 | " 557: 'creator',\n", 843 | " 558: 'handselected',\n", 844 | " 559: 'original',\n", 845 | " 560: 'also',\n", 846 | " 561: 'chose',\n", 847 | " 562: 'band',\n", 848 | " 563: 'hacks',\n", 849 | " 564: 'followed',\n", 850 | " 565: 'recognize',\n", 851 | " 566: 'brilliance',\n", 852 | " 567: 'fit',\n", 853 | " 568: 'replace',\n", 854 | " 569: 'mediocrity',\n", 855 | " 570: 'felt',\n", 856 | " 571: 'give',\n", 857 | " 572: 'stars',\n", 858 | " 573: 'respect',\n", 859 | " 574: 'made',\n", 860 | " 575: 'huge',\n", 861 | " 576: 'awful',\n", 862 | " 577: 'cant',\n", 863 | " 578: 'Encouraged',\n", 864 | " 579: 'positive',\n", 865 | " 580: 'comments',\n", 866 | " 581: 'looking',\n", 867 | " 582: 'forward',\n", 868 | " 583: 'Bad',\n", 869 | " 584: 'mistake',\n", 870 | " 585: '950',\n", 871 | " 586: 'films',\n", 872 | " 587: 'worst',\n", 873 | " 588: 'pacing',\n", 874 | " 589: 'storyline',\n", 875 | " 590: 'soundtrack',\n", 876 | " 591: 'song',\n", 877 | " 592: 'lame',\n", 878 | " 593: 'country',\n", 879 | " 594: 'tune',\n", 880 | " 595: 'played',\n", 881 | " 596: 'less',\n", 882 | " 597: 'four',\n", 883 | " 598: 'looks',\n", 884 | " 599: 'cheap',\n", 885 | " 600: 'extreme',\n", 886 | " 601: 'Rarely',\n", 887 | " 602: 'happy',\n", 888 | " 603: 'end',\n", 889 | " 604: 'credits',\n", 890 | " 605: 'prevents',\n", 891 | " 606: 'giving',\n", 892 | " 607: '1score',\n", 893 | " 608: 'Harvey',\n", 894 | " 609: 'Keitel',\n", 895 | " 610: 'least',\n", 896 | " 611: 'making',\n", 897 | " 612: 'bit',\n", 898 | " 613: 'effort',\n", 899 | " 614: 'obsessives',\n", 900 | " 615: 'gut',\n", 901 | " 616: 'wrenching',\n", 902 | " 617: 'laughter',\n", 903 | " 618: 'love',\n", 904 | " 619: 'hell',\n", 905 | " 620: 'mom',\n", 906 | " 621: 'liked',\n", 907 | " 622: 'itGreat',\n", 908 | " 623: 'Camp',\n", 909 | " 624: 'Phil',\n", 910 | " 625: 'Alien',\n", 911 | " 626: 'quirky',\n", 912 | " 627: 'humour',\n", 913 | " 628: 'based',\n", 914 | " 629: 'oddness',\n", 915 | " 630: 'everything',\n", 916 | " 631: 'actual',\n", 917 | " 632: 'punchlinesAt',\n", 918 | " 633: 'odd',\n", 919 | " 634: 'progressed',\n", 920 | " 635: 'didnt',\n", 921 | " 636: 'jokes',\n", 922 | " 637: 'anymoreIts',\n", 923 | " 638: 'low',\n", 924 | " 639: 'budget',\n", 925 | " 640: 'thats',\n", 926 | " 641: 'problem',\n", 927 | " 642: 'eventually',\n", 928 | " 643: 'lost',\n", 929 | " 644: 'interestI',\n", 930 | " 645: 'imagine',\n", 931 | " 646: 'stoner',\n", 932 | " 647: 'currently',\n", 933 | " 648: 'partakingFor',\n", 934 | " 649: 'something',\n", 935 | " 650: 'better',\n", 936 | " 651: 'try',\n", 937 | " 652: 'Brother',\n", 938 | " 653: 'planet',\n", 939 | " 654: '12',\n", 940 | " 655: 'came',\n", 941 | " 656: 'recall',\n", 942 | " 657: 'scariest',\n", 943 | " 658: 'scene',\n", 944 | " 659: 'bird',\n", 945 | " 660: 'eating',\n", 946 | " 661: 'men',\n", 947 | " 662: 'dangling',\n", 948 | " 663: 'helplessly',\n", 949 | " 664: 'parachutes',\n", 950 | " 665: 'horror',\n", 951 | " 666: 'horrorAs',\n", 952 | " 667: 'cheesy',\n", 953 | " 668: 'B',\n", 954 | " 669: 'Saturday',\n", 955 | " 670: 'afternoons',\n", 956 | " 671: 'tired',\n", 957 | " 672: 'formula',\n", 958 | " 673: 'monster',\n", 959 | " 674: 'type',\n", 960 | " 675: 'movies',\n", 961 | " 676: 'usually',\n", 962 | " 677: 'included',\n", 963 | " 678: 'hero',\n", 964 | " 679: 'beautiful',\n", 965 | " 680: 'woman',\n", 966 | " 681: 'might',\n", 967 | " 682: 'daughter',\n", 968 | " 683: 'professor',\n", 969 | " 684: 'resolution',\n", 970 | " 685: 'died',\n", 971 | " 686: 'care',\n", 972 | " 687: 'much',\n", 973 | " 688: 'romantic',\n", 974 | " 689: 'angle',\n", 975 | " 690: 'year',\n", 976 | " 691: 'predictable',\n", 977 | " 692: 'plots',\n", 978 | " 693: 'unintentional',\n", 979 | " 694: 'humorBut',\n", 980 | " 695: 'later',\n", 981 | " 696: 'Psycho',\n", 982 | " 697: 'loved',\n", 983 | " 698: 'star',\n", 984 | " 699: 'Janet',\n", 985 | " 700: 'Leigh',\n", 986 | " 701: 'bumped',\n", 987 | " 702: 'early',\n", 988 | " 703: 'sat',\n", 989 | " 704: 'took',\n", 990 | " 705: 'notice',\n", 991 | " 706: 'Since',\n", 992 | " 707: 'screenwriters',\n", 993 | " 708: 'scary',\n", 994 | " 709: 'possible',\n", 995 | " 710: 'wellworn',\n", 996 | " 711: 'rules',\n", 997 | " 712: 'im',\n", 998 | " 713: 'fan',\n", 999 | " 714: 'Bolls',\n", 1000 | " 715: 'enjoyed',\n", 1001 | " 716: 'Postal',\n", 1002 | " 717: 'maybe',\n", 1003 | " 718: 'Boll',\n", 1004 | " 719: 'apparently',\n", 1005 | " 720: 'bought',\n", 1006 | " 721: 'rights',\n", 1007 | " 722: 'Far',\n", 1008 | " 723: 'Cry',\n", 1009 | " 724: 'long',\n", 1010 | " 725: 'ago',\n", 1011 | " 726: 'game',\n", 1012 | " 727: 'finsished',\n", 1013 | " 728: 'People',\n", 1014 | " 729: 'killing',\n", 1015 | " 730: 'mercs',\n", 1016 | " 731: 'infiltrating',\n", 1017 | " 732: 'secret',\n", 1018 | " 733: 'research',\n", 1019 | " 734: 'labs',\n", 1020 | " 735: 'located',\n", 1021 | " 736: 'tropical',\n", 1022 | " 737: 'island',\n", 1023 | " 738: 'warned',\n", 1024 | " 739: 'schemed',\n", 1025 | " 740: 'together',\n", 1026 | " 741: 'along',\n", 1027 | " 742: 'legion',\n", 1028 | " 743: 'schmucks',\n", 1029 | " 744: 'Feeling',\n", 1030 | " 745: 'loneley',\n", 1031 | " 746: 'invites',\n", 1032 | " 747: 'three',\n", 1033 | " 748: 'countrymen',\n", 1034 | " 749: 'players',\n", 1035 | " 750: 'names',\n", 1036 | " 751: 'Til',\n", 1037 | " 752: 'Schweiger',\n", 1038 | " 753: 'Udo',\n", 1039 | " 754: 'Kier',\n", 1040 | " 755: 'Ralf',\n", 1041 | " 756: 'MoellerThree',\n", 1042 | " 757: 'actually',\n", 1043 | " 758: 'selfs',\n", 1044 | " 759: 'biz',\n", 1045 | " 760: 'tale',\n", 1046 | " 761: 'Jack',\n", 1047 | " 762: 'Carver',\n", 1048 | " 763: 'yes',\n", 1049 | " 764: 'German',\n", 1050 | " 765: 'hail',\n", 1051 | " 766: 'bratwurst',\n", 1052 | " 767: 'dudes',\n", 1053 | " 768: 'However',\n", 1054 | " 769: 'Tils',\n", 1055 | " 770: 'badass',\n", 1056 | " 771: 'complained',\n", 1057 | " 772: 'hes',\n", 1058 | " 773: 'staying',\n", 1059 | " 774: 'true',\n", 1060 | " 775: 'whole',\n", 1061 | " 776: 'carver',\n", 1062 | " 777: 'perspective',\n", 1063 | " 778: 'dont',\n", 1064 | " 779: 'looked',\n", 1065 | " 780: 'kicking',\n", 1066 | " 781: 'beyond',\n", 1067 | " 782: 'demented',\n", 1068 | " 783: 'evil',\n", 1069 | " 784: 'mad',\n", 1070 | " 785: 'scientist',\n", 1071 | " 786: 'Dr',\n", 1072 | " 787: 'Krieger',\n", 1073 | " 788: 'GeneticallyMutatedsoldiers',\n", 1074 | " 789: 'GMS',\n", 1075 | " 790: 'Performing',\n", 1076 | " 791: 'topsecret',\n", 1077 | " 792: 'reminds',\n", 1078 | " 793: 'SPOILER',\n", 1079 | " 794: 'Vancouver',\n", 1080 | " 795: 'reason',\n", 1081 | " 796: 'Thats',\n", 1082 | " 797: 'palm',\n", 1083 | " 798: 'trees',\n", 1084 | " 799: 'Instead',\n", 1085 | " 800: 'rich',\n", 1086 | " 801: 'lumberjackwoods',\n", 1087 | " 802: 'havent',\n", 1088 | " 803: 'gone',\n", 1089 | " 804: 'FAR',\n", 1090 | " 805: 'started',\n", 1091 | " 806: 'CRY',\n", 1092 | " 807: 'mehehe',\n", 1093 | " 808: 'can',\n", 1094 | " 809: 'not',\n", 1095 | " 810: 'stay',\n", 1096 | " 811: 'shenanigans',\n", 1097 | " 812: 'delivers',\n", 1098 | " 813: 'meaning',\n", 1099 | " 814: 'suckThere',\n", 1100 | " 815: 'mentioning',\n", 1101 | " 816: 'imply',\n", 1102 | " 817: 'areas',\n", 1103 | " 818: 'boat',\n", 1104 | " 819: 'cromedalbino',\n", 1105 | " 820: 'squad',\n", 1106 | " 821: 'enters',\n", 1107 | " 822: 'laugh',\n", 1108 | " 823: 'reeks',\n", 1109 | " 824: 'scheisse',\n", 1110 | " 825: 'poop',\n", 1111 | " 826: 'simpletons',\n", 1112 | " 827: 'take',\n", 1113 | " 828: 'wiff',\n", 1114 | " 829: 'ahead',\n", 1115 | " 830: 'BTW',\n", 1116 | " 831: 'annoying',\n", 1117 | " 832: 'sidekick',\n", 1118 | " 833: 'shoot',\n", 1119 | " 834: 'minutes',\n", 1120 | " 835: 'screen',\n", 1121 | " 836: 'ShakespeareShakespeare',\n", 1122 | " 837: 'lostI',\n", 1123 | " 838: 'appreciate',\n", 1124 | " 839: 'trying',\n", 1125 | " 840: 'Shakespeare',\n", 1126 | " 841: 'masses',\n", 1127 | " 842: 'ruin',\n", 1128 | " 843: 'goodIs',\n", 1129 | " 844: 'Scottish',\n", 1130 | " 845: 'Play',\n", 1131 | " 846: 'certain',\n", 1132 | " 847: 'Rev',\n", 1133 | " 848: 'Bowdler',\n", 1134 | " 849: 'hence',\n", 1135 | " 850: 'bowdlerization',\n", 1136 | " 851: 'tried',\n", 1137 | " 852: 'Victorian',\n", 1138 | " 853: 'eraIn',\n", 1139 | " 854: 'words',\n", 1140 | " 855: 'improve',\n", 1141 | " 856: 'perfectionI',\n", 1142 | " 857: 'write',\n", 1143 | " 858: 'ten',\n", 1144 | " 859: 'text',\n", 1145 | " 860: 'English',\n", 1146 | " 861: 'composition',\n", 1147 | " 862: 'forte',\n", 1148 | " 863: 'keep',\n", 1149 | " 864: 'saying',\n", 1150 | " 865: 'cut',\n", 1151 | " 866: 'fantastic',\n", 1152 | " 867: 'prisoners',\n", 1153 | " 868: 'famous',\n", 1154 | " 869: 'george',\n", 1155 | " 870: 'clooney',\n", 1156 | " 871: 'Im',\n", 1157 | " 872: 'roll',\n", 1158 | " 873: 'Another',\n", 1159 | " 874: 'man',\n", 1160 | " 875: 'constant',\n", 1161 | " 876: 'sorrow',\n", 1162 | " 877: 'recommand',\n", 1163 | " 878: 'everybody',\n", 1164 | " 879: 'Greetings',\n", 1165 | " 880: 'Bart',\n", 1166 | " 881: 'Kind',\n", 1167 | " 882: 'drawn',\n", 1168 | " 883: 'erotic',\n", 1169 | " 884: 'amateurish',\n", 1170 | " 885: 'unbelievable',\n", 1171 | " 886: 'bits',\n", 1172 | " 887: 'Sort',\n", 1173 | " 888: 'school',\n", 1174 | " 889: 'project',\n", 1175 | " 890: 'Rosanna',\n", 1176 | " 891: 'Arquette',\n", 1177 | " 892: 'thinking',\n", 1178 | " 893: 'stock',\n", 1179 | " 894: 'bizarre',\n", 1180 | " 895: 'supposed',\n", 1181 | " 896: 'Midwest',\n", 1182 | " 897: 'town',\n", 1183 | " 898: 'Pretty',\n", 1184 | " 899: 'involved',\n", 1185 | " 900: 'lessons',\n", 1186 | " 901: 'learned',\n", 1187 | " 902: 'insights',\n", 1188 | " 903: 'stilted',\n", 1189 | " 904: 'quite',\n", 1190 | " 905: 'ridiculous',\n", 1191 | " 906: 'lots',\n", 1192 | " 907: 'skin',\n", 1193 | " 908: 'intrigues',\n", 1194 | " 909: 'videotaped',\n", 1195 | " 910: 'nonsenseWhat',\n", 1196 | " 911: 'bisexual',\n", 1197 | " 912: 'relationship',\n", 1198 | " 913: 'nowhere',\n", 1199 | " 914: 'heterosexual',\n", 1200 | " 915: 'encounters',\n", 1201 | " 916: 'absurd',\n", 1202 | " 917: 'dance',\n", 1203 | " 918: 'stereotyped',\n", 1204 | " 919: 'Give',\n", 1205 | " 920: 'pass',\n", 1206 | " 921: 'million',\n", 1207 | " 922: 'miles',\n", 1208 | " 923: 'wasted',\n", 1209 | " 924: 'could',\n", 1210 | " 925: 'spent',\n", 1211 | " 926: 'starving',\n", 1212 | " 927: 'Aids',\n", 1213 | " 928: 'Africa',\n", 1214 | " 929: 'simply',\n", 1215 | " 930: 'remade',\n", 1216 | " 931: 'fails',\n", 1217 | " 932: 'capture',\n", 1218 | " 933: 'flavor',\n", 1219 | " 934: 'terror',\n", 1220 | " 935: '1963',\n", 1221 | " 936: 'title',\n", 1222 | " 937: 'Liam',\n", 1223 | " 938: 'Neeson',\n", 1224 | " 939: 'excellent',\n", 1225 | " 940: 'always',\n", 1226 | " 941: 'holds',\n", 1227 | " 942: 'exception',\n", 1228 | " 943: 'Owen',\n", 1229 | " 944: 'Wilson',\n", 1230 | " 945: 'feel',\n", 1231 | " 946: 'character',\n", 1232 | " 947: 'Luke',\n", 1233 | " 948: 'major',\n", 1234 | " 949: 'fault',\n", 1235 | " 950: 'version',\n", 1236 | " 951: 'strayed',\n", 1237 | " 952: 'Shirley',\n", 1238 | " 953: 'Jackson',\n", 1239 | " 954: 'attempts',\n", 1240 | " 955: 'grandiose',\n", 1241 | " 956: 'thrill',\n", 1242 | " 957: 'earlier',\n", 1243 | " 958: 'trade',\n", 1244 | " 959: 'snazzier',\n", 1245 | " 960: 'special',\n", 1246 | " 961: 'effects',\n", 1247 | " 962: 'enjoy',\n", 1248 | " 963: 'friction',\n", 1249 | " 964: 'older',\n", 1250 | " 965: 'top',\n", 1251 | " 966: 'Horrible',\n", 1252 | " 967: 'wasnt',\n", 1253 | " 968: 'continuous',\n", 1254 | " 969: 'minute',\n", 1255 | " 970: 'fight',\n", 1256 | " 971: 'chance',\n", 1257 | " 972: 'development',\n", 1258 | " 973: 'busy',\n", 1259 | " 974: 'running',\n", 1260 | " 975: 'sword',\n", 1261 | " 976: 'emotional',\n", 1262 | " 977: 'attachment',\n", 1263 | " 978: 'except',\n", 1264 | " 979: 'machine',\n", 1265 | " 980: 'wanted',\n", 1266 | " 981: 'destroy',\n", 1267 | " 982: 'Scenes',\n", 1268 | " 983: 'blatantly',\n", 1269 | " 984: 'stolen',\n", 1270 | " 985: 'LOTR',\n", 1271 | " 986: 'Star',\n", 1272 | " 987: 'Wars',\n", 1273 | " 988: 'Matrix',\n", 1274 | " 989: 'ExamplesThe',\n", 1275 | " 990: 'ghost',\n", 1276 | " 991: 'final',\n", 1277 | " 992: 'Yoda',\n", 1278 | " 993: 'Obee',\n", 1279 | " 994: 'Vader',\n", 1280 | " 995: 'spider',\n", 1281 | " 996: 'beginning',\n", 1282 | " 997: 'Frodo',\n", 1283 | " 998: 'attacked',\n", 1284 | " 999: 'Return',\n", 1285 | " ...}" 1286 | ] 1287 | }, 1288 | "execution_count": 12, 1289 | "metadata": {}, 1290 | "output_type": "execute_result" 1291 | } 1292 | ], 1293 | "source": [ 1294 | "index_to_vocab" 1295 | ] 1296 | }, 1297 | { 1298 | "cell_type": "code", 1299 | "execution_count": 13, 1300 | "metadata": { 1301 | "scrolled": true 1302 | }, 1303 | "outputs": [ 1304 | { 1305 | "data": { 1306 | "text/plain": [ 1307 | "{'One': 0,\n", 1308 | " 'reviewers': 1,\n", 1309 | " 'mentioned': 2,\n", 1310 | " 'watching': 3,\n", 1311 | " '1': 4,\n", 1312 | " 'Oz': 5,\n", 1313 | " 'episode': 6,\n", 1314 | " 'youll': 7,\n", 1315 | " 'hooked': 8,\n", 1316 | " 'right': 9,\n", 1317 | " 'exactly': 10,\n", 1318 | " 'happened': 11,\n", 1319 | " 'meThe': 12,\n", 1320 | " 'first': 13,\n", 1321 | " 'thing': 14,\n", 1322 | " 'struck': 15,\n", 1323 | " 'brutality': 16,\n", 1324 | " 'unflinching': 17,\n", 1325 | " 'scenes': 18,\n", 1326 | " 'violence': 19,\n", 1327 | " 'set': 20,\n", 1328 | " 'word': 21,\n", 1329 | " 'GO': 22,\n", 1330 | " 'Trust': 23,\n", 1331 | " 'show': 24,\n", 1332 | " 'faint': 25,\n", 1333 | " 'hearted': 26,\n", 1334 | " 'timid': 27,\n", 1335 | " 'pulls': 28,\n", 1336 | " 'punches': 29,\n", 1337 | " 'regards': 30,\n", 1338 | " 'drugs': 31,\n", 1339 | " 'sex': 32,\n", 1340 | " 'hardcore': 33,\n", 1341 | " 'classic': 34,\n", 1342 | " 'use': 35,\n", 1343 | " 'wordIt': 36,\n", 1344 | " 'called': 37,\n", 1345 | " 'OZ': 38,\n", 1346 | " 'nickname': 39,\n", 1347 | " 'given': 40,\n", 1348 | " 'Oswald': 41,\n", 1349 | " 'Maximum': 42,\n", 1350 | " 'Security': 43,\n", 1351 | " 'State': 44,\n", 1352 | " 'Penitentary': 45,\n", 1353 | " 'focuses': 46,\n", 1354 | " 'mainly': 47,\n", 1355 | " 'Emerald': 48,\n", 1356 | " 'City': 49,\n", 1357 | " 'experimental': 50,\n", 1358 | " 'section': 51,\n", 1359 | " 'prison': 52,\n", 1360 | " 'cells': 53,\n", 1361 | " 'glass': 54,\n", 1362 | " 'fronts': 55,\n", 1363 | " 'face': 56,\n", 1364 | " 'inwards': 57,\n", 1365 | " 'privacy': 58,\n", 1366 | " 'high': 59,\n", 1367 | " 'agenda': 60,\n", 1368 | " 'Em': 61,\n", 1369 | " 'home': 62,\n", 1370 | " 'manyAryans': 63,\n", 1371 | " 'Muslims': 64,\n", 1372 | " 'gangstas': 65,\n", 1373 | " 'Latinos': 66,\n", 1374 | " 'Christians': 67,\n", 1375 | " 'Italians': 68,\n", 1376 | " 'Irish': 69,\n", 1377 | " 'moreso': 70,\n", 1378 | " 'scuffles': 71,\n", 1379 | " 'death': 72,\n", 1380 | " 'stares': 73,\n", 1381 | " 'dodgy': 74,\n", 1382 | " 'dealings': 75,\n", 1383 | " 'shady': 76,\n", 1384 | " 'agreements': 77,\n", 1385 | " 'never': 78,\n", 1386 | " 'far': 79,\n", 1387 | " 'awayI': 80,\n", 1388 | " 'would': 81,\n", 1389 | " 'say': 82,\n", 1390 | " 'main': 83,\n", 1391 | " 'appeal': 84,\n", 1392 | " 'due': 85,\n", 1393 | " 'fact': 86,\n", 1394 | " 'goes': 87,\n", 1395 | " 'shows': 88,\n", 1396 | " 'wouldnt': 89,\n", 1397 | " 'dare': 90,\n", 1398 | " 'Forget': 91,\n", 1399 | " 'pretty': 92,\n", 1400 | " 'pictures': 93,\n", 1401 | " 'painted': 94,\n", 1402 | " 'mainstream': 95,\n", 1403 | " 'audiences': 96,\n", 1404 | " 'forget': 97,\n", 1405 | " 'charm': 98,\n", 1406 | " 'romanceOZ': 99,\n", 1407 | " 'doesnt': 100,\n", 1408 | " 'mess': 101,\n", 1409 | " 'around': 102,\n", 1410 | " 'ever': 103,\n", 1411 | " 'saw': 104,\n", 1412 | " 'nasty': 105,\n", 1413 | " 'surreal': 106,\n", 1414 | " 'couldnt': 107,\n", 1415 | " 'ready': 108,\n", 1416 | " 'watched': 109,\n", 1417 | " 'developed': 110,\n", 1418 | " 'taste': 111,\n", 1419 | " 'got': 112,\n", 1420 | " 'accustomed': 113,\n", 1421 | " 'levels': 114,\n", 1422 | " 'graphic': 115,\n", 1423 | " 'injustice': 116,\n", 1424 | " 'crooked': 117,\n", 1425 | " 'guards': 118,\n", 1426 | " 'wholl': 119,\n", 1427 | " 'sold': 120,\n", 1428 | " 'nickel': 121,\n", 1429 | " 'inmates': 122,\n", 1430 | " 'kill': 123,\n", 1431 | " 'order': 124,\n", 1432 | " 'get': 125,\n", 1433 | " 'away': 126,\n", 1434 | " 'well': 127,\n", 1435 | " 'mannered': 128,\n", 1436 | " 'middle': 129,\n", 1437 | " 'class': 130,\n", 1438 | " 'turned': 131,\n", 1439 | " 'bitches': 132,\n", 1440 | " 'lack': 133,\n", 1441 | " 'street': 134,\n", 1442 | " 'skills': 135,\n", 1443 | " 'experience': 136,\n", 1444 | " 'Watching': 137,\n", 1445 | " 'may': 138,\n", 1446 | " 'become': 139,\n", 1447 | " 'comfortable': 140,\n", 1448 | " 'uncomfortable': 141,\n", 1449 | " 'viewingthats': 142,\n", 1450 | " 'touch': 143,\n", 1451 | " 'darker': 144,\n", 1452 | " 'side': 145,\n", 1453 | " 'wonderful': 146,\n", 1454 | " 'little': 147,\n", 1455 | " 'production': 148,\n", 1456 | " 'filming': 149,\n", 1457 | " 'technique': 150,\n", 1458 | " 'unassuming': 151,\n", 1459 | " 'oldtimeBBC': 152,\n", 1460 | " 'fashion': 153,\n", 1461 | " 'gives': 154,\n", 1462 | " 'comforting': 155,\n", 1463 | " 'sometimes': 156,\n", 1464 | " 'discomforting': 157,\n", 1465 | " 'sense': 158,\n", 1466 | " 'realism': 159,\n", 1467 | " 'entire': 160,\n", 1468 | " 'piece': 161,\n", 1469 | " 'actors': 162,\n", 1470 | " 'extremely': 163,\n", 1471 | " 'chosen': 164,\n", 1472 | " 'Michael': 165,\n", 1473 | " 'Sheen': 166,\n", 1474 | " 'polari': 167,\n", 1475 | " 'voices': 168,\n", 1476 | " 'pat': 169,\n", 1477 | " 'truly': 170,\n", 1478 | " 'see': 171,\n", 1479 | " 'seamless': 172,\n", 1480 | " 'editing': 173,\n", 1481 | " 'guided': 174,\n", 1482 | " 'references': 175,\n", 1483 | " 'Williams': 176,\n", 1484 | " 'diary': 177,\n", 1485 | " 'entries': 178,\n", 1486 | " 'worth': 179,\n", 1487 | " 'terrificly': 180,\n", 1488 | " 'written': 181,\n", 1489 | " 'performed': 182,\n", 1490 | " 'masterful': 183,\n", 1491 | " 'one': 184,\n", 1492 | " 'great': 185,\n", 1493 | " 'masters': 186,\n", 1494 | " 'comedy': 187,\n", 1495 | " 'life': 188,\n", 1496 | " 'really': 189,\n", 1497 | " 'comes': 190,\n", 1498 | " 'things': 191,\n", 1499 | " 'fantasy': 192,\n", 1500 | " 'guard': 193,\n", 1501 | " 'rather': 194,\n", 1502 | " 'traditional': 195,\n", 1503 | " 'dream': 196,\n", 1504 | " 'techniques': 197,\n", 1505 | " 'remains': 198,\n", 1506 | " 'solid': 199,\n", 1507 | " 'disappears': 200,\n", 1508 | " 'plays': 201,\n", 1509 | " 'knowledge': 202,\n", 1510 | " 'senses': 203,\n", 1511 | " 'particularly': 204,\n", 1512 | " 'concerning': 205,\n", 1513 | " 'Orton': 206,\n", 1514 | " 'Halliwell': 207,\n", 1515 | " 'sets': 208,\n", 1516 | " 'flat': 209,\n", 1517 | " 'Halliwells': 210,\n", 1518 | " 'murals': 211,\n", 1519 | " 'decorating': 212,\n", 1520 | " 'every': 213,\n", 1521 | " 'surface': 214,\n", 1522 | " 'terribly': 215,\n", 1523 | " 'done': 216,\n", 1524 | " 'thought': 217,\n", 1525 | " 'way': 218,\n", 1526 | " 'spend': 219,\n", 1527 | " 'time': 220,\n", 1528 | " 'hot': 221,\n", 1529 | " 'summer': 222,\n", 1530 | " 'weekend': 223,\n", 1531 | " 'sitting': 224,\n", 1532 | " 'air': 225,\n", 1533 | " 'conditioned': 226,\n", 1534 | " 'theater': 227,\n", 1535 | " 'lighthearted': 228,\n", 1536 | " 'plot': 229,\n", 1537 | " 'simplistic': 230,\n", 1538 | " 'dialogue': 231,\n", 1539 | " 'witty': 232,\n", 1540 | " 'characters': 233,\n", 1541 | " 'likable': 234,\n", 1542 | " 'even': 235,\n", 1543 | " 'bread': 236,\n", 1544 | " 'suspected': 237,\n", 1545 | " 'serial': 238,\n", 1546 | " 'killer': 239,\n", 1547 | " 'disappointed': 240,\n", 1548 | " 'realize': 241,\n", 1549 | " 'Match': 242,\n", 1550 | " 'Point': 243,\n", 1551 | " '2': 244,\n", 1552 | " 'Risk': 245,\n", 1553 | " 'Addiction': 246,\n", 1554 | " 'proof': 247,\n", 1555 | " 'Woody': 248,\n", 1556 | " 'Allen': 249,\n", 1557 | " 'still': 250,\n", 1558 | " 'fully': 251,\n", 1559 | " 'control': 252,\n", 1560 | " 'style': 253,\n", 1561 | " 'many': 254,\n", 1562 | " 'us': 255,\n", 1563 | " 'grown': 256,\n", 1564 | " 'loveThis': 257,\n", 1565 | " 'Id': 258,\n", 1566 | " 'laughed': 259,\n", 1567 | " 'Woodys': 260,\n", 1568 | " 'comedies': 261,\n", 1569 | " 'years': 262,\n", 1570 | " 'decade': 263,\n", 1571 | " 'Ive': 264,\n", 1572 | " 'impressed': 265,\n", 1573 | " 'Scarlet': 266,\n", 1574 | " 'Johanson': 267,\n", 1575 | " 'managed': 268,\n", 1576 | " 'tone': 269,\n", 1577 | " 'sexy': 270,\n", 1578 | " 'image': 271,\n", 1579 | " 'jumped': 272,\n", 1580 | " 'average': 273,\n", 1581 | " 'spirited': 274,\n", 1582 | " 'young': 275,\n", 1583 | " 'womanThis': 276,\n", 1584 | " 'crown': 277,\n", 1585 | " 'jewel': 278,\n", 1586 | " 'career': 279,\n", 1587 | " 'wittier': 280,\n", 1588 | " 'Devil': 281,\n", 1589 | " 'Wears': 282,\n", 1590 | " 'Prada': 283,\n", 1591 | " 'interesting': 284,\n", 1592 | " 'Superman': 285,\n", 1593 | " 'go': 286,\n", 1594 | " 'friends': 287,\n", 1595 | " 'Basically': 288,\n", 1596 | " 'theres': 289,\n", 1597 | " 'family': 290,\n", 1598 | " 'boy': 291,\n", 1599 | " 'Jake': 292,\n", 1600 | " 'thinks': 293,\n", 1601 | " 'zombie': 294,\n", 1602 | " 'closet': 295,\n", 1603 | " 'parents': 296,\n", 1604 | " 'fighting': 297,\n", 1605 | " 'timeThis': 298,\n", 1606 | " 'movie': 299,\n", 1607 | " 'slower': 300,\n", 1608 | " 'soap': 301,\n", 1609 | " 'opera': 302,\n", 1610 | " 'suddenly': 303,\n", 1611 | " 'decides': 304,\n", 1612 | " 'Rambo': 305,\n", 1613 | " 'zombieOK': 306,\n", 1614 | " 'youre': 307,\n", 1615 | " 'going': 308,\n", 1616 | " 'make': 309,\n", 1617 | " 'film': 310,\n", 1618 | " 'must': 311,\n", 1619 | " 'Decide': 312,\n", 1620 | " 'thriller': 313,\n", 1621 | " 'drama': 314,\n", 1622 | " 'watchable': 315,\n", 1623 | " 'Parents': 316,\n", 1624 | " 'divorcing': 317,\n", 1625 | " 'arguing': 318,\n", 1626 | " 'like': 319,\n", 1627 | " 'real': 320,\n", 1628 | " 'totally': 321,\n", 1629 | " 'ruins': 322,\n", 1630 | " 'expected': 323,\n", 1631 | " 'BOOGEYMAN': 324,\n", 1632 | " 'similar': 325,\n", 1633 | " 'instead': 326,\n", 1634 | " 'meaningless': 327,\n", 1635 | " 'spots3': 328,\n", 1636 | " '10': 329,\n", 1637 | " 'playing': 330,\n", 1638 | " 'descent': 331,\n", 1639 | " 'dialogs': 332,\n", 1640 | " 'shots': 333,\n", 1641 | " 'ignore': 334,\n", 1642 | " 'Petter': 335,\n", 1643 | " 'Matteis': 336,\n", 1644 | " 'Love': 337,\n", 1645 | " 'Time': 338,\n", 1646 | " 'Money': 339,\n", 1647 | " 'visually': 340,\n", 1648 | " 'stunning': 341,\n", 1649 | " 'watch': 342,\n", 1650 | " 'Mr': 343,\n", 1651 | " 'Mattei': 344,\n", 1652 | " 'offers': 345,\n", 1653 | " 'vivid': 346,\n", 1654 | " 'portrait': 347,\n", 1655 | " 'human': 348,\n", 1656 | " 'relations': 349,\n", 1657 | " 'seems': 350,\n", 1658 | " 'telling': 351,\n", 1659 | " 'money': 352,\n", 1660 | " 'power': 353,\n", 1661 | " 'success': 354,\n", 1662 | " 'people': 355,\n", 1663 | " 'different': 356,\n", 1664 | " 'situations': 357,\n", 1665 | " 'encounter': 358,\n", 1666 | " 'variation': 359,\n", 1667 | " 'Arthur': 360,\n", 1668 | " 'Schnitzlers': 361,\n", 1669 | " 'play': 362,\n", 1670 | " 'theme': 363,\n", 1671 | " 'director': 364,\n", 1672 | " 'transfers': 365,\n", 1673 | " 'action': 366,\n", 1674 | " 'present': 367,\n", 1675 | " 'New': 368,\n", 1676 | " 'York': 369,\n", 1677 | " 'meet': 370,\n", 1678 | " 'connect': 371,\n", 1679 | " 'connected': 372,\n", 1680 | " 'another': 373,\n", 1681 | " 'next': 374,\n", 1682 | " 'person': 375,\n", 1683 | " 'know': 376,\n", 1684 | " 'previous': 377,\n", 1685 | " 'point': 378,\n", 1686 | " 'contact': 379,\n", 1687 | " 'Stylishly': 380,\n", 1688 | " 'sophisticated': 381,\n", 1689 | " 'luxurious': 382,\n", 1690 | " 'look': 383,\n", 1691 | " 'taken': 384,\n", 1692 | " 'live': 385,\n", 1693 | " 'world': 386,\n", 1694 | " 'habitatThe': 387,\n", 1695 | " 'gets': 388,\n", 1696 | " 'souls': 389,\n", 1697 | " 'picture': 390,\n", 1698 | " 'stages': 391,\n", 1699 | " 'loneliness': 392,\n", 1700 | " 'inhabits': 393,\n", 1701 | " 'big': 394,\n", 1702 | " 'city': 395,\n", 1703 | " 'best': 396,\n", 1704 | " 'place': 397,\n", 1705 | " 'find': 398,\n", 1706 | " 'sincere': 399,\n", 1707 | " 'fulfillment': 400,\n", 1708 | " 'discerns': 401,\n", 1709 | " 'case': 402,\n", 1710 | " 'encounterThe': 403,\n", 1711 | " 'acting': 404,\n", 1712 | " 'good': 405,\n", 1713 | " 'direction': 406,\n", 1714 | " 'Steve': 407,\n", 1715 | " 'Buscemi': 408,\n", 1716 | " 'Rosario': 409,\n", 1717 | " 'Dawson': 410,\n", 1718 | " 'Carol': 411,\n", 1719 | " 'Kane': 412,\n", 1720 | " 'Imperioli': 413,\n", 1721 | " 'Adrian': 414,\n", 1722 | " 'Grenier': 415,\n", 1723 | " 'rest': 416,\n", 1724 | " 'talented': 417,\n", 1725 | " 'cast': 418,\n", 1726 | " 'come': 419,\n", 1727 | " 'aliveWe': 420,\n", 1728 | " 'wish': 421,\n", 1729 | " 'luck': 422,\n", 1730 | " 'await': 423,\n", 1731 | " 'anxiously': 424,\n", 1732 | " 'work': 425,\n", 1733 | " 'Probably': 426,\n", 1734 | " 'alltime': 427,\n", 1735 | " 'favorite': 428,\n", 1736 | " 'story': 429,\n", 1737 | " 'selflessness': 430,\n", 1738 | " 'sacrifice': 431,\n", 1739 | " 'dedication': 432,\n", 1740 | " 'noble': 433,\n", 1741 | " 'cause': 434,\n", 1742 | " 'preachy': 435,\n", 1743 | " 'boring': 436,\n", 1744 | " 'old': 437,\n", 1745 | " 'despite': 438,\n", 1746 | " 'seen': 439,\n", 1747 | " '15': 440,\n", 1748 | " 'times': 441,\n", 1749 | " 'last': 442,\n", 1750 | " '25': 443,\n", 1751 | " 'Paul': 444,\n", 1752 | " 'Lukas': 445,\n", 1753 | " 'performance': 446,\n", 1754 | " 'brings': 447,\n", 1755 | " 'tears': 448,\n", 1756 | " 'eyes': 449,\n", 1757 | " 'Bette': 450,\n", 1758 | " 'Davis': 451,\n", 1759 | " 'sympathetic': 452,\n", 1760 | " 'roles': 453,\n", 1761 | " 'delight': 454,\n", 1762 | " 'kids': 455,\n", 1763 | " 'grandma': 456,\n", 1764 | " 'says': 457,\n", 1765 | " 'dressedup': 458,\n", 1766 | " 'midgets': 459,\n", 1767 | " 'children': 460,\n", 1768 | " 'makes': 461,\n", 1769 | " 'fun': 462,\n", 1770 | " 'mothers': 463,\n", 1771 | " 'slow': 464,\n", 1772 | " 'awakening': 465,\n", 1773 | " 'whats': 466,\n", 1774 | " 'happening': 467,\n", 1775 | " 'roof': 468,\n", 1776 | " 'believable': 469,\n", 1777 | " 'startling': 470,\n", 1778 | " 'dozen': 471,\n", 1779 | " 'thumbs': 472,\n", 1780 | " 'theyd': 473,\n", 1781 | " 'sure': 474,\n", 1782 | " 'resurrection': 475,\n", 1783 | " 'dated': 476,\n", 1784 | " 'Seahunt': 477,\n", 1785 | " 'series': 478,\n", 1786 | " 'tech': 479,\n", 1787 | " 'today': 480,\n", 1788 | " 'bring': 481,\n", 1789 | " 'back': 482,\n", 1790 | " 'kid': 483,\n", 1791 | " 'excitement': 484,\n", 1792 | " 'meI': 485,\n", 1793 | " 'grew': 486,\n", 1794 | " 'black': 487,\n", 1795 | " 'white': 488,\n", 1796 | " 'TV': 489,\n", 1797 | " 'Gunsmoke': 490,\n", 1798 | " 'heros': 491,\n", 1799 | " 'weekYou': 492,\n", 1800 | " 'vote': 493,\n", 1801 | " 'comeback': 494,\n", 1802 | " 'new': 495,\n", 1803 | " 'sea': 496,\n", 1804 | " 'huntWe': 497,\n", 1805 | " 'need': 498,\n", 1806 | " 'change': 499,\n", 1807 | " 'pace': 500,\n", 1808 | " 'water': 501,\n", 1809 | " 'adventureOh': 502,\n", 1810 | " 'thank': 503,\n", 1811 | " 'outlet': 504,\n", 1812 | " 'view': 505,\n", 1813 | " 'viewpoints': 506,\n", 1814 | " 'moviesSo': 507,\n", 1815 | " 'ole': 508,\n", 1816 | " 'believe': 509,\n", 1817 | " 'wan': 510,\n", 1818 | " 'na': 511,\n", 1819 | " 'sayWould': 512,\n", 1820 | " 'nice': 513,\n", 1821 | " 'read': 514,\n", 1822 | " 'plus': 515,\n", 1823 | " 'points': 516,\n", 1824 | " 'huntIf': 517,\n", 1825 | " 'rhymes': 518,\n", 1826 | " 'lines': 519,\n", 1827 | " 'let': 520,\n", 1828 | " 'submitor': 521,\n", 1829 | " 'leave': 522,\n", 1830 | " 'doubt': 523,\n", 1831 | " 'quitIf': 524,\n", 1832 | " 'lets': 525,\n", 1833 | " 'amazing': 526,\n", 1834 | " 'fresh': 527,\n", 1835 | " 'innovative': 528,\n", 1836 | " 'idea': 529,\n", 1837 | " '70s': 530,\n", 1838 | " 'aired': 531,\n", 1839 | " '7': 532,\n", 1840 | " '8': 533,\n", 1841 | " 'brilliant': 534,\n", 1842 | " 'dropped': 535,\n", 1843 | " '1990': 536,\n", 1844 | " 'funny': 537,\n", 1845 | " 'anymore': 538,\n", 1846 | " 'continued': 539,\n", 1847 | " 'decline': 540,\n", 1848 | " 'complete': 541,\n", 1849 | " 'waste': 542,\n", 1850 | " 'todayIts': 543,\n", 1851 | " 'disgraceful': 544,\n", 1852 | " 'fallen': 545,\n", 1853 | " 'writing': 546,\n", 1854 | " 'painfully': 547,\n", 1855 | " 'bad': 548,\n", 1856 | " 'performances': 549,\n", 1857 | " 'almost': 550,\n", 1858 | " 'mildly': 551,\n", 1859 | " 'entertaining': 552,\n", 1860 | " 'respite': 553,\n", 1861 | " 'guesthosts': 554,\n", 1862 | " 'probably': 555,\n", 1863 | " 'hard': 556,\n", 1864 | " 'creator': 557,\n", 1865 | " 'handselected': 558,\n", 1866 | " 'original': 559,\n", 1867 | " 'also': 560,\n", 1868 | " 'chose': 561,\n", 1869 | " 'band': 562,\n", 1870 | " 'hacks': 563,\n", 1871 | " 'followed': 564,\n", 1872 | " 'recognize': 565,\n", 1873 | " 'brilliance': 566,\n", 1874 | " 'fit': 567,\n", 1875 | " 'replace': 568,\n", 1876 | " 'mediocrity': 569,\n", 1877 | " 'felt': 570,\n", 1878 | " 'give': 571,\n", 1879 | " 'stars': 572,\n", 1880 | " 'respect': 573,\n", 1881 | " 'made': 574,\n", 1882 | " 'huge': 575,\n", 1883 | " 'awful': 576,\n", 1884 | " 'cant': 577,\n", 1885 | " 'Encouraged': 578,\n", 1886 | " 'positive': 579,\n", 1887 | " 'comments': 580,\n", 1888 | " 'looking': 581,\n", 1889 | " 'forward': 582,\n", 1890 | " 'Bad': 583,\n", 1891 | " 'mistake': 584,\n", 1892 | " '950': 585,\n", 1893 | " 'films': 586,\n", 1894 | " 'worst': 587,\n", 1895 | " 'pacing': 588,\n", 1896 | " 'storyline': 589,\n", 1897 | " 'soundtrack': 590,\n", 1898 | " 'song': 591,\n", 1899 | " 'lame': 592,\n", 1900 | " 'country': 593,\n", 1901 | " 'tune': 594,\n", 1902 | " 'played': 595,\n", 1903 | " 'less': 596,\n", 1904 | " 'four': 597,\n", 1905 | " 'looks': 598,\n", 1906 | " 'cheap': 599,\n", 1907 | " 'extreme': 600,\n", 1908 | " 'Rarely': 601,\n", 1909 | " 'happy': 602,\n", 1910 | " 'end': 603,\n", 1911 | " 'credits': 604,\n", 1912 | " 'prevents': 605,\n", 1913 | " 'giving': 606,\n", 1914 | " '1score': 607,\n", 1915 | " 'Harvey': 608,\n", 1916 | " 'Keitel': 609,\n", 1917 | " 'least': 610,\n", 1918 | " 'making': 611,\n", 1919 | " 'bit': 612,\n", 1920 | " 'effort': 613,\n", 1921 | " 'obsessives': 614,\n", 1922 | " 'gut': 615,\n", 1923 | " 'wrenching': 616,\n", 1924 | " 'laughter': 617,\n", 1925 | " 'love': 618,\n", 1926 | " 'hell': 619,\n", 1927 | " 'mom': 620,\n", 1928 | " 'liked': 621,\n", 1929 | " 'itGreat': 622,\n", 1930 | " 'Camp': 623,\n", 1931 | " 'Phil': 624,\n", 1932 | " 'Alien': 625,\n", 1933 | " 'quirky': 626,\n", 1934 | " 'humour': 627,\n", 1935 | " 'based': 628,\n", 1936 | " 'oddness': 629,\n", 1937 | " 'everything': 630,\n", 1938 | " 'actual': 631,\n", 1939 | " 'punchlinesAt': 632,\n", 1940 | " 'odd': 633,\n", 1941 | " 'progressed': 634,\n", 1942 | " 'didnt': 635,\n", 1943 | " 'jokes': 636,\n", 1944 | " 'anymoreIts': 637,\n", 1945 | " 'low': 638,\n", 1946 | " 'budget': 639,\n", 1947 | " 'thats': 640,\n", 1948 | " 'problem': 641,\n", 1949 | " 'eventually': 642,\n", 1950 | " 'lost': 643,\n", 1951 | " 'interestI': 644,\n", 1952 | " 'imagine': 645,\n", 1953 | " 'stoner': 646,\n", 1954 | " 'currently': 647,\n", 1955 | " 'partakingFor': 648,\n", 1956 | " 'something': 649,\n", 1957 | " 'better': 650,\n", 1958 | " 'try': 651,\n", 1959 | " 'Brother': 652,\n", 1960 | " 'planet': 653,\n", 1961 | " '12': 654,\n", 1962 | " 'came': 655,\n", 1963 | " 'recall': 656,\n", 1964 | " 'scariest': 657,\n", 1965 | " 'scene': 658,\n", 1966 | " 'bird': 659,\n", 1967 | " 'eating': 660,\n", 1968 | " 'men': 661,\n", 1969 | " 'dangling': 662,\n", 1970 | " 'helplessly': 663,\n", 1971 | " 'parachutes': 664,\n", 1972 | " 'horror': 665,\n", 1973 | " 'horrorAs': 666,\n", 1974 | " 'cheesy': 667,\n", 1975 | " 'B': 668,\n", 1976 | " 'Saturday': 669,\n", 1977 | " 'afternoons': 670,\n", 1978 | " 'tired': 671,\n", 1979 | " 'formula': 672,\n", 1980 | " 'monster': 673,\n", 1981 | " 'type': 674,\n", 1982 | " 'movies': 675,\n", 1983 | " 'usually': 676,\n", 1984 | " 'included': 677,\n", 1985 | " 'hero': 678,\n", 1986 | " 'beautiful': 679,\n", 1987 | " 'woman': 680,\n", 1988 | " 'might': 681,\n", 1989 | " 'daughter': 682,\n", 1990 | " 'professor': 683,\n", 1991 | " 'resolution': 684,\n", 1992 | " 'died': 685,\n", 1993 | " 'care': 686,\n", 1994 | " 'much': 687,\n", 1995 | " 'romantic': 688,\n", 1996 | " 'angle': 689,\n", 1997 | " 'year': 690,\n", 1998 | " 'predictable': 691,\n", 1999 | " 'plots': 692,\n", 2000 | " 'unintentional': 693,\n", 2001 | " 'humorBut': 694,\n", 2002 | " 'later': 695,\n", 2003 | " 'Psycho': 696,\n", 2004 | " 'loved': 697,\n", 2005 | " 'star': 698,\n", 2006 | " 'Janet': 699,\n", 2007 | " 'Leigh': 700,\n", 2008 | " 'bumped': 701,\n", 2009 | " 'early': 702,\n", 2010 | " 'sat': 703,\n", 2011 | " 'took': 704,\n", 2012 | " 'notice': 705,\n", 2013 | " 'Since': 706,\n", 2014 | " 'screenwriters': 707,\n", 2015 | " 'scary': 708,\n", 2016 | " 'possible': 709,\n", 2017 | " 'wellworn': 710,\n", 2018 | " 'rules': 711,\n", 2019 | " 'im': 712,\n", 2020 | " 'fan': 713,\n", 2021 | " 'Bolls': 714,\n", 2022 | " 'enjoyed': 715,\n", 2023 | " 'Postal': 716,\n", 2024 | " 'maybe': 717,\n", 2025 | " 'Boll': 718,\n", 2026 | " 'apparently': 719,\n", 2027 | " 'bought': 720,\n", 2028 | " 'rights': 721,\n", 2029 | " 'Far': 722,\n", 2030 | " 'Cry': 723,\n", 2031 | " 'long': 724,\n", 2032 | " 'ago': 725,\n", 2033 | " 'game': 726,\n", 2034 | " 'finsished': 727,\n", 2035 | " 'People': 728,\n", 2036 | " 'killing': 729,\n", 2037 | " 'mercs': 730,\n", 2038 | " 'infiltrating': 731,\n", 2039 | " 'secret': 732,\n", 2040 | " 'research': 733,\n", 2041 | " 'labs': 734,\n", 2042 | " 'located': 735,\n", 2043 | " 'tropical': 736,\n", 2044 | " 'island': 737,\n", 2045 | " 'warned': 738,\n", 2046 | " 'schemed': 739,\n", 2047 | " 'together': 740,\n", 2048 | " 'along': 741,\n", 2049 | " 'legion': 742,\n", 2050 | " 'schmucks': 743,\n", 2051 | " 'Feeling': 744,\n", 2052 | " 'loneley': 745,\n", 2053 | " 'invites': 746,\n", 2054 | " 'three': 747,\n", 2055 | " 'countrymen': 748,\n", 2056 | " 'players': 749,\n", 2057 | " 'names': 750,\n", 2058 | " 'Til': 751,\n", 2059 | " 'Schweiger': 752,\n", 2060 | " 'Udo': 753,\n", 2061 | " 'Kier': 754,\n", 2062 | " 'Ralf': 755,\n", 2063 | " 'MoellerThree': 756,\n", 2064 | " 'actually': 757,\n", 2065 | " 'selfs': 758,\n", 2066 | " 'biz': 759,\n", 2067 | " 'tale': 760,\n", 2068 | " 'Jack': 761,\n", 2069 | " 'Carver': 762,\n", 2070 | " 'yes': 763,\n", 2071 | " 'German': 764,\n", 2072 | " 'hail': 765,\n", 2073 | " 'bratwurst': 766,\n", 2074 | " 'dudes': 767,\n", 2075 | " 'However': 768,\n", 2076 | " 'Tils': 769,\n", 2077 | " 'badass': 770,\n", 2078 | " 'complained': 771,\n", 2079 | " 'hes': 772,\n", 2080 | " 'staying': 773,\n", 2081 | " 'true': 774,\n", 2082 | " 'whole': 775,\n", 2083 | " 'carver': 776,\n", 2084 | " 'perspective': 777,\n", 2085 | " 'dont': 778,\n", 2086 | " 'looked': 779,\n", 2087 | " 'kicking': 780,\n", 2088 | " 'beyond': 781,\n", 2089 | " 'demented': 782,\n", 2090 | " 'evil': 783,\n", 2091 | " 'mad': 784,\n", 2092 | " 'scientist': 785,\n", 2093 | " 'Dr': 786,\n", 2094 | " 'Krieger': 787,\n", 2095 | " 'GeneticallyMutatedsoldiers': 788,\n", 2096 | " 'GMS': 789,\n", 2097 | " 'Performing': 790,\n", 2098 | " 'topsecret': 791,\n", 2099 | " 'reminds': 792,\n", 2100 | " 'SPOILER': 793,\n", 2101 | " 'Vancouver': 794,\n", 2102 | " 'reason': 795,\n", 2103 | " 'Thats': 796,\n", 2104 | " 'palm': 797,\n", 2105 | " 'trees': 798,\n", 2106 | " 'Instead': 799,\n", 2107 | " 'rich': 800,\n", 2108 | " 'lumberjackwoods': 801,\n", 2109 | " 'havent': 802,\n", 2110 | " 'gone': 803,\n", 2111 | " 'FAR': 804,\n", 2112 | " 'started': 805,\n", 2113 | " 'CRY': 806,\n", 2114 | " 'mehehe': 807,\n", 2115 | " 'can': 808,\n", 2116 | " 'not': 809,\n", 2117 | " 'stay': 810,\n", 2118 | " 'shenanigans': 811,\n", 2119 | " 'delivers': 812,\n", 2120 | " 'meaning': 813,\n", 2121 | " 'suckThere': 814,\n", 2122 | " 'mentioning': 815,\n", 2123 | " 'imply': 816,\n", 2124 | " 'areas': 817,\n", 2125 | " 'boat': 818,\n", 2126 | " 'cromedalbino': 819,\n", 2127 | " 'squad': 820,\n", 2128 | " 'enters': 821,\n", 2129 | " 'laugh': 822,\n", 2130 | " 'reeks': 823,\n", 2131 | " 'scheisse': 824,\n", 2132 | " 'poop': 825,\n", 2133 | " 'simpletons': 826,\n", 2134 | " 'take': 827,\n", 2135 | " 'wiff': 828,\n", 2136 | " 'ahead': 829,\n", 2137 | " 'BTW': 830,\n", 2138 | " 'annoying': 831,\n", 2139 | " 'sidekick': 832,\n", 2140 | " 'shoot': 833,\n", 2141 | " 'minutes': 834,\n", 2142 | " 'screen': 835,\n", 2143 | " 'ShakespeareShakespeare': 836,\n", 2144 | " 'lostI': 837,\n", 2145 | " 'appreciate': 838,\n", 2146 | " 'trying': 839,\n", 2147 | " 'Shakespeare': 840,\n", 2148 | " 'masses': 841,\n", 2149 | " 'ruin': 842,\n", 2150 | " 'goodIs': 843,\n", 2151 | " 'Scottish': 844,\n", 2152 | " 'Play': 845,\n", 2153 | " 'certain': 846,\n", 2154 | " 'Rev': 847,\n", 2155 | " 'Bowdler': 848,\n", 2156 | " 'hence': 849,\n", 2157 | " 'bowdlerization': 850,\n", 2158 | " 'tried': 851,\n", 2159 | " 'Victorian': 852,\n", 2160 | " 'eraIn': 853,\n", 2161 | " 'words': 854,\n", 2162 | " 'improve': 855,\n", 2163 | " 'perfectionI': 856,\n", 2164 | " 'write': 857,\n", 2165 | " 'ten': 858,\n", 2166 | " 'text': 859,\n", 2167 | " 'English': 860,\n", 2168 | " 'composition': 861,\n", 2169 | " 'forte': 862,\n", 2170 | " 'keep': 863,\n", 2171 | " 'saying': 864,\n", 2172 | " 'cut': 865,\n", 2173 | " 'fantastic': 866,\n", 2174 | " 'prisoners': 867,\n", 2175 | " 'famous': 868,\n", 2176 | " 'george': 869,\n", 2177 | " 'clooney': 870,\n", 2178 | " 'Im': 871,\n", 2179 | " 'roll': 872,\n", 2180 | " 'Another': 873,\n", 2181 | " 'man': 874,\n", 2182 | " 'constant': 875,\n", 2183 | " 'sorrow': 876,\n", 2184 | " 'recommand': 877,\n", 2185 | " 'everybody': 878,\n", 2186 | " 'Greetings': 879,\n", 2187 | " 'Bart': 880,\n", 2188 | " 'Kind': 881,\n", 2189 | " 'drawn': 882,\n", 2190 | " 'erotic': 883,\n", 2191 | " 'amateurish': 884,\n", 2192 | " 'unbelievable': 885,\n", 2193 | " 'bits': 886,\n", 2194 | " 'Sort': 887,\n", 2195 | " 'school': 888,\n", 2196 | " 'project': 889,\n", 2197 | " 'Rosanna': 890,\n", 2198 | " 'Arquette': 891,\n", 2199 | " 'thinking': 892,\n", 2200 | " 'stock': 893,\n", 2201 | " 'bizarre': 894,\n", 2202 | " 'supposed': 895,\n", 2203 | " 'Midwest': 896,\n", 2204 | " 'town': 897,\n", 2205 | " 'Pretty': 898,\n", 2206 | " 'involved': 899,\n", 2207 | " 'lessons': 900,\n", 2208 | " 'learned': 901,\n", 2209 | " 'insights': 902,\n", 2210 | " 'stilted': 903,\n", 2211 | " 'quite': 904,\n", 2212 | " 'ridiculous': 905,\n", 2213 | " 'lots': 906,\n", 2214 | " 'skin': 907,\n", 2215 | " 'intrigues': 908,\n", 2216 | " 'videotaped': 909,\n", 2217 | " 'nonsenseWhat': 910,\n", 2218 | " 'bisexual': 911,\n", 2219 | " 'relationship': 912,\n", 2220 | " 'nowhere': 913,\n", 2221 | " 'heterosexual': 914,\n", 2222 | " 'encounters': 915,\n", 2223 | " 'absurd': 916,\n", 2224 | " 'dance': 917,\n", 2225 | " 'stereotyped': 918,\n", 2226 | " 'Give': 919,\n", 2227 | " 'pass': 920,\n", 2228 | " 'million': 921,\n", 2229 | " 'miles': 922,\n", 2230 | " 'wasted': 923,\n", 2231 | " 'could': 924,\n", 2232 | " 'spent': 925,\n", 2233 | " 'starving': 926,\n", 2234 | " 'Aids': 927,\n", 2235 | " 'Africa': 928,\n", 2236 | " 'simply': 929,\n", 2237 | " 'remade': 930,\n", 2238 | " 'fails': 931,\n", 2239 | " 'capture': 932,\n", 2240 | " 'flavor': 933,\n", 2241 | " 'terror': 934,\n", 2242 | " '1963': 935,\n", 2243 | " 'title': 936,\n", 2244 | " 'Liam': 937,\n", 2245 | " 'Neeson': 938,\n", 2246 | " 'excellent': 939,\n", 2247 | " 'always': 940,\n", 2248 | " 'holds': 941,\n", 2249 | " 'exception': 942,\n", 2250 | " 'Owen': 943,\n", 2251 | " 'Wilson': 944,\n", 2252 | " 'feel': 945,\n", 2253 | " 'character': 946,\n", 2254 | " 'Luke': 947,\n", 2255 | " 'major': 948,\n", 2256 | " 'fault': 949,\n", 2257 | " 'version': 950,\n", 2258 | " 'strayed': 951,\n", 2259 | " 'Shirley': 952,\n", 2260 | " 'Jackson': 953,\n", 2261 | " 'attempts': 954,\n", 2262 | " 'grandiose': 955,\n", 2263 | " 'thrill': 956,\n", 2264 | " 'earlier': 957,\n", 2265 | " 'trade': 958,\n", 2266 | " 'snazzier': 959,\n", 2267 | " 'special': 960,\n", 2268 | " 'effects': 961,\n", 2269 | " 'enjoy': 962,\n", 2270 | " 'friction': 963,\n", 2271 | " 'older': 964,\n", 2272 | " 'top': 965,\n", 2273 | " 'Horrible': 966,\n", 2274 | " 'wasnt': 967,\n", 2275 | " 'continuous': 968,\n", 2276 | " 'minute': 969,\n", 2277 | " 'fight': 970,\n", 2278 | " 'chance': 971,\n", 2279 | " 'development': 972,\n", 2280 | " 'busy': 973,\n", 2281 | " 'running': 974,\n", 2282 | " 'sword': 975,\n", 2283 | " 'emotional': 976,\n", 2284 | " 'attachment': 977,\n", 2285 | " 'except': 978,\n", 2286 | " 'machine': 979,\n", 2287 | " 'wanted': 980,\n", 2288 | " 'destroy': 981,\n", 2289 | " 'Scenes': 982,\n", 2290 | " 'blatantly': 983,\n", 2291 | " 'stolen': 984,\n", 2292 | " 'LOTR': 985,\n", 2293 | " 'Star': 986,\n", 2294 | " 'Wars': 987,\n", 2295 | " 'Matrix': 988,\n", 2296 | " 'ExamplesThe': 989,\n", 2297 | " 'ghost': 990,\n", 2298 | " 'final': 991,\n", 2299 | " 'Yoda': 992,\n", 2300 | " 'Obee': 993,\n", 2301 | " 'Vader': 994,\n", 2302 | " 'spider': 995,\n", 2303 | " 'beginning': 996,\n", 2304 | " 'Frodo': 997,\n", 2305 | " 'attacked': 998,\n", 2306 | " 'Return': 999,\n", 2307 | " ...}" 2308 | ] 2309 | }, 2310 | "execution_count": 13, 2311 | "metadata": {}, 2312 | "output_type": "execute_result" 2313 | } 2314 | ], 2315 | "source": [ 2316 | "vocab_to_index" 2317 | ] 2318 | }, 2319 | { 2320 | "cell_type": "code", 2321 | "execution_count": 14, 2322 | "metadata": {}, 2323 | "outputs": [ 2324 | { 2325 | "data": { 2326 | "text/plain": [ 2327 | "256140" 2328 | ] 2329 | }, 2330 | "execution_count": 14, 2331 | "metadata": {}, 2332 | "output_type": "execute_result" 2333 | } 2334 | ], 2335 | "source": [ 2336 | "len(vocab_to_index)" 2337 | ] 2338 | }, 2339 | { 2340 | "cell_type": "code", 2341 | "execution_count": 16, 2342 | "metadata": { 2343 | "scrolled": true 2344 | }, 2345 | "outputs": [], 2346 | "source": [ 2347 | "result = [[vocab_to_index[word] for word in y] for y in vocab_lst]" 2348 | ] 2349 | }, 2350 | { 2351 | "cell_type": "code", 2352 | "execution_count": 17, 2353 | "metadata": { 2354 | "scrolled": true 2355 | }, 2356 | "outputs": [ 2357 | { 2358 | "data": { 2359 | "text/plain": [ 2360 | "[624,\n", 2361 | " 625,\n", 2362 | " 184,\n", 2363 | " 626,\n", 2364 | " 586,\n", 2365 | " 627,\n", 2366 | " 628,\n", 2367 | " 102,\n", 2368 | " 629,\n", 2369 | " 630,\n", 2370 | " 194,\n", 2371 | " 631,\n", 2372 | " 632,\n", 2373 | " 13,\n", 2374 | " 633,\n", 2375 | " 92,\n", 2376 | " 537,\n", 2377 | " 299,\n", 2378 | " 634,\n", 2379 | " 635,\n", 2380 | " 398,\n", 2381 | " 636,\n", 2382 | " 629,\n", 2383 | " 537,\n", 2384 | " 637,\n", 2385 | " 638,\n", 2386 | " 639,\n", 2387 | " 310,\n", 2388 | " 640,\n", 2389 | " 78,\n", 2390 | " 641,\n", 2391 | " 92,\n", 2392 | " 284,\n", 2393 | " 233,\n", 2394 | " 642,\n", 2395 | " 643,\n", 2396 | " 644,\n", 2397 | " 645,\n", 2398 | " 310,\n", 2399 | " 81,\n", 2400 | " 84,\n", 2401 | " 646,\n", 2402 | " 647,\n", 2403 | " 648,\n", 2404 | " 649,\n", 2405 | " 325,\n", 2406 | " 650,\n", 2407 | " 651,\n", 2408 | " 652,\n", 2409 | " 373,\n", 2410 | " 653]" 2411 | ] 2412 | }, 2413 | "execution_count": 17, 2414 | "metadata": {}, 2415 | "output_type": "execute_result" 2416 | } 2417 | ], 2418 | "source": [ 2419 | "result[10]" 2420 | ] 2421 | }, 2422 | { 2423 | "cell_type": "code", 2424 | "execution_count": 18, 2425 | "metadata": {}, 2426 | "outputs": [], 2427 | "source": [ 2428 | "# KOREAN : https://github.com/e9t/nsmc\n", 2429 | "# ENGLISH : https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/kernels" 2430 | ] 2431 | } 2432 | ], 2433 | "metadata": { 2434 | "kernelspec": { 2435 | "display_name": "Python 3", 2436 | "language": "python", 2437 | "name": "python3" 2438 | }, 2439 | "language_info": { 2440 | "codemirror_mode": { 2441 | "name": "ipython", 2442 | "version": 3 2443 | }, 2444 | "file_extension": ".py", 2445 | "mimetype": "text/x-python", 2446 | "name": "python", 2447 | "nbconvert_exporter": "python", 2448 | "pygments_lexer": "ipython3", 2449 | "version": "3.6.8" 2450 | } 2451 | }, 2452 | "nbformat": 4, 2453 | "nbformat_minor": 4 2454 | } 2455 | -------------------------------------------------------------------------------- /2_Representation_Vector.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import torch\n", 10 | "from torch.autograd import Variable\n", 11 | "import torch.nn as nn\n", 12 | "import torch.nn.functional as F\n", 13 | "import torch.optim as optim" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "vocab_size: 49\n" 26 | ] 27 | } 28 | ], 29 | "source": [ 30 | "CONTEXT_SIZE = 2\n", 31 | "\n", 32 | "text = \"\"\"We are about to study the idea of a computational process.\n", 33 | "Computational processes are abstract beings that inhabit computers.\n", 34 | "As they evolve, processes manipulate other abstract things called data.\n", 35 | "The evolution of a process is directed by a pattern of rules\n", 36 | "called a program. People create programs to direct processes. In effect,\n", 37 | "we conjure the spirits of the computer with our spells.\"\"\".split()\n", 38 | "\n", 39 | "vocab = set(text)\n", 40 | "vocab_size = len(vocab)\n", 41 | "print('vocab_size:', vocab_size)\n", 42 | "\n", 43 | "w2i = {w: i for i, w in enumerate(vocab)}\n", 44 | "i2w = {i: w for i, w in enumerate(vocab)}" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "cbow sample (['We', 'are', 'to', 'study'], 'about')\n", 57 | "skipgram sample ('about', 'We', 1)\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "def create_cbow_dataset(text):\n", 63 | " data = []\n", 64 | " for i in range(2, len(text) - 2):\n", 65 | " context = [text[i - 2], text[i - 1],\n", 66 | " text[i + 1], text[i + 2]]\n", 67 | " target = text[i]\n", 68 | " data.append((context, target))\n", 69 | " return data\n", 70 | "\n", 71 | "def create_skipgram_dataset(text):\n", 72 | " import random\n", 73 | " data = []\n", 74 | " for i in range(2, len(text) - 2):\n", 75 | " data.append((text[i], text[i-2], 1))\n", 76 | " data.append((text[i], text[i-1], 1))\n", 77 | " data.append((text[i], text[i+1], 1))\n", 78 | " data.append((text[i], text[i+2], 1))\n", 79 | " for _ in range(4):\n", 80 | " if random.random() < 0.5 or i >= len(text) - 3:\n", 81 | " rand_id = random.randint(0, i-1)\n", 82 | " else:\n", 83 | " rand_id = random.randint(i+3, len(text)-1)\n", 84 | " data.append((text[i], text[rand_id], 0))\n", 85 | " return data\n", 86 | "\n", 87 | "cbow_train = create_cbow_dataset(text)\n", 88 | "skipgram_train = create_skipgram_dataset(text)\n", 89 | "print('cbow sample', cbow_train[0])\n", 90 | "print('skipgram sample', skipgram_train[0])" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 4, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "class CBOW(nn.Module):\n", 100 | " def __init__(self, vocab_size, embd_size, context_size, hidden_size):\n", 101 | " super(CBOW, self).__init__()\n", 102 | " self.embeddings = nn.Embedding(vocab_size, embd_size)\n", 103 | " self.linear1 = nn.Linear(2*context_size*embd_size, hidden_size)\n", 104 | " self.linear2 = nn.Linear(hidden_size, vocab_size)\n", 105 | " \n", 106 | " def forward(self, inputs):\n", 107 | " embedded = self.embeddings(inputs).view((1, -1))\n", 108 | " hid = F.relu(self.linear1(embedded))\n", 109 | " out = self.linear2(hid)\n", 110 | " log_probs = F.log_softmax(out)\n", 111 | " return log_probs\n", 112 | "\n", 113 | "class SkipGram(nn.Module):\n", 114 | " def __init__(self, vocab_size, embd_size):\n", 115 | " super(SkipGram, self).__init__()\n", 116 | " self.embeddings = nn.Embedding(vocab_size, embd_size)\n", 117 | " \n", 118 | " def forward(self, focus, context):\n", 119 | " embed_focus = self.embeddings(focus).view((1, -1))\n", 120 | " embed_ctx = self.embeddings(context).view((1, -1))\n", 121 | " score = torch.mm(embed_focus, torch.t(embed_ctx))\n", 122 | " log_probs = F.logsigmoid(score)\n", 123 | " \n", 124 | " return log_probs" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 5, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "embd_size = 100\n", 134 | "learning_rate = 0.001\n", 135 | "n_epoch = 30\n", 136 | "\n", 137 | "def train_cbow():\n", 138 | " hidden_size = 64\n", 139 | " losses = []\n", 140 | " loss_fn = nn.NLLLoss()\n", 141 | " model = CBOW(vocab_size, embd_size, CONTEXT_SIZE, hidden_size)\n", 142 | " print(model)\n", 143 | " optimizer = optim.SGD(model.parameters(), lr=learning_rate)\n", 144 | "\n", 145 | " for epoch in range(n_epoch):\n", 146 | " total_loss = .0\n", 147 | " for context, target in cbow_train:\n", 148 | " ctx_idxs = [w2i[w] for w in context]\n", 149 | " ctx_var = Variable(torch.LongTensor(ctx_idxs))\n", 150 | "\n", 151 | " model.zero_grad()\n", 152 | " log_probs = model(ctx_var)\n", 153 | "\n", 154 | " loss = loss_fn(log_probs, Variable(torch.LongTensor([w2i[target]])))\n", 155 | "\n", 156 | " loss.backward()\n", 157 | " optimizer.step()\n", 158 | " total_loss += loss.data\n", 159 | " losses.append(total_loss)\n", 160 | " return model, losses\n", 161 | "\n", 162 | "def train_skipgram():\n", 163 | " losses = []\n", 164 | " loss_fn = nn.MSELoss()\n", 165 | " model = SkipGram(vocab_size, embd_size)\n", 166 | " print(model)\n", 167 | " optimizer = optim.SGD(model.parameters(), lr=learning_rate)\n", 168 | " \n", 169 | " for epoch in range(n_epoch):\n", 170 | " total_loss = .0\n", 171 | " for in_w, out_w, target in skipgram_train:\n", 172 | " in_w_var = Variable(torch.LongTensor([w2i[in_w]]))\n", 173 | " out_w_var = Variable(torch.LongTensor([w2i[out_w]]))\n", 174 | " \n", 175 | " model.zero_grad()\n", 176 | " log_probs = model(in_w_var, out_w_var)\n", 177 | " loss = loss_fn(log_probs[0], Variable(torch.Tensor([target])))\n", 178 | " \n", 179 | " loss.backward()\n", 180 | " optimizer.step()\n", 181 | "\n", 182 | " total_loss += loss.data\n", 183 | " losses.append(total_loss)\n", 184 | " return model, losses" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 6, 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "name": "stdout", 194 | "output_type": "stream", 195 | "text": [ 196 | "CBOW(\n", 197 | " (embeddings): Embedding(49, 100)\n", 198 | " (linear1): Linear(in_features=400, out_features=64, bias=True)\n", 199 | " (linear2): Linear(in_features=64, out_features=49, bias=True)\n", 200 | ")\n" 201 | ] 202 | }, 203 | { 204 | "name": "stderr", 205 | "output_type": "stream", 206 | "text": [ 207 | "c:\\users\\justin\\venv\\lib\\site-packages\\ipykernel_launcher.py:12: UserWarning: Implicit dimension choice for log_softmax has been deprecated. Change the call to include dim=X as an argument.\n", 208 | " if sys.path[0] == '':\n" 209 | ] 210 | }, 211 | { 212 | "name": "stdout", 213 | "output_type": "stream", 214 | "text": [ 215 | "SkipGram(\n", 216 | " (embeddings): Embedding(49, 100)\n", 217 | ")\n" 218 | ] 219 | } 220 | ], 221 | "source": [ 222 | "cbow_model, cbow_losses = train_cbow()\n", 223 | "sg_model, sg_losses = train_skipgram()" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 7, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "def test_cbow(test_data, model):\n", 233 | " print('====Test CBOW===')\n", 234 | " correct_ct = 0\n", 235 | " for ctx, target in test_data:\n", 236 | " ctx_idxs = [w2i[w] for w in ctx]\n", 237 | " ctx_var = Variable(torch.LongTensor(ctx_idxs))\n", 238 | "\n", 239 | " model.zero_grad()\n", 240 | " log_probs = model(ctx_var)\n", 241 | " _, predicted = torch.max(log_probs.data, 1)\n", 242 | " predicted_word = i2w[predicted.item()]\n", 243 | " print('predicted:', predicted_word)\n", 244 | " print('label :', target)\n", 245 | " if predicted_word == target:\n", 246 | " correct_ct += 1\n", 247 | " \n", 248 | " print('Accuracy: {:.1f}% ({:d}/{:d})'.format(correct_ct/len(test_data)*100, correct_ct, len(test_data)))\n", 249 | "\n", 250 | "def test_skipgram(test_data, model):\n", 251 | " print('====Test SkipGram===')\n", 252 | " correct_ct = 0\n", 253 | " for in_w, out_w, target in test_data:\n", 254 | " in_w_var = Variable(torch.LongTensor([w2i[in_w]]))\n", 255 | " out_w_var = Variable(torch.LongTensor([w2i[out_w]]))\n", 256 | "\n", 257 | " model.zero_grad()\n", 258 | " log_probs = model(in_w_var, out_w_var)\n", 259 | " _, predicted = torch.max(log_probs.data, 1)\n", 260 | " predicted = predicted[0]\n", 261 | " if predicted == target:\n", 262 | " correct_ct += 1\n", 263 | "\n", 264 | " print('Accuracy: {:.1f}% ({:d}/{:d})'.format(correct_ct/len(test_data)*100, correct_ct, len(test_data)))" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 8, 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "name": "stdout", 274 | "output_type": "stream", 275 | "text": [ 276 | "====Test CBOW===\n", 277 | "predicted: about\n", 278 | "label : about\n", 279 | "predicted: to\n", 280 | "label : to\n", 281 | "predicted: study\n", 282 | "label : study\n", 283 | "predicted: the\n", 284 | "label : the\n", 285 | "predicted: idea\n", 286 | "label : idea\n", 287 | "predicted: of\n", 288 | "label : of\n", 289 | "predicted: a\n", 290 | "label : a\n", 291 | "predicted: computational\n", 292 | "label : computational\n", 293 | "predicted: process.\n", 294 | "label : process.\n", 295 | "predicted: Computational\n", 296 | "label : Computational\n", 297 | "predicted: processes\n", 298 | "label : processes\n", 299 | "predicted: are\n", 300 | "label : are\n", 301 | "predicted: abstract\n", 302 | "label : abstract\n", 303 | "predicted: beings\n", 304 | "label : beings\n", 305 | "predicted: that\n", 306 | "label : that\n", 307 | "predicted: inhabit\n", 308 | "label : inhabit\n", 309 | "predicted: computers.\n", 310 | "label : computers.\n", 311 | "predicted: As\n", 312 | "label : As\n", 313 | "predicted: they\n", 314 | "label : they\n", 315 | "predicted: evolve,\n", 316 | "label : evolve,\n", 317 | "predicted: processes\n", 318 | "label : processes\n", 319 | "predicted: manipulate\n", 320 | "label : manipulate\n", 321 | "predicted: other\n", 322 | "label : other\n", 323 | "predicted: abstract\n", 324 | "label : abstract\n", 325 | "predicted: things\n", 326 | "label : things\n", 327 | "predicted: called\n", 328 | "label : called\n", 329 | "predicted: data.\n", 330 | "label : data.\n", 331 | "predicted: The\n", 332 | "label : The\n", 333 | "predicted: evolution\n", 334 | "label : evolution\n", 335 | "predicted: of\n", 336 | "label : of\n", 337 | "predicted: a\n", 338 | "label : a\n", 339 | "predicted: process\n", 340 | "label : process\n", 341 | "predicted: is\n", 342 | "label : is\n", 343 | "predicted: directed\n", 344 | "label : directed\n", 345 | "predicted: by\n", 346 | "label : by\n", 347 | "predicted: a\n", 348 | "label : a\n", 349 | "predicted: pattern\n", 350 | "label : pattern\n", 351 | "predicted: of\n", 352 | "label : of\n", 353 | "predicted: rules\n", 354 | "label : rules\n", 355 | "predicted: called\n", 356 | "label : called\n", 357 | "predicted: a\n", 358 | "label : a\n", 359 | "predicted: program.\n", 360 | "label : program.\n", 361 | "predicted: People\n", 362 | "label : People\n", 363 | "predicted: create\n", 364 | "label : create\n", 365 | "predicted: programs\n", 366 | "label : programs\n", 367 | "predicted: to\n", 368 | "label : to\n", 369 | "predicted: direct\n", 370 | "label : direct\n", 371 | "predicted: processes.\n", 372 | "label : processes.\n", 373 | "predicted: In\n", 374 | "label : In\n", 375 | "predicted: effect,\n", 376 | "label : effect,\n", 377 | "predicted: we\n", 378 | "label : we\n", 379 | "predicted: conjure\n", 380 | "label : conjure\n", 381 | "predicted: the\n", 382 | "label : the\n", 383 | "predicted: spirits\n", 384 | "label : spirits\n", 385 | "predicted: of\n", 386 | "label : of\n", 387 | "predicted: the\n", 388 | "label : the\n", 389 | "predicted: computer\n", 390 | "label : computer\n", 391 | "predicted: of\n", 392 | "label : with\n", 393 | "Accuracy: 98.3% (57/58)\n", 394 | "------\n", 395 | "====Test SkipGram===\n", 396 | "Accuracy: 50.0% (232/464)\n" 397 | ] 398 | }, 399 | { 400 | "name": "stderr", 401 | "output_type": "stream", 402 | "text": [ 403 | "c:\\users\\justin\\venv\\lib\\site-packages\\ipykernel_launcher.py:12: UserWarning: Implicit dimension choice for log_softmax has been deprecated. Change the call to include dim=X as an argument.\n", 404 | " if sys.path[0] == '':\n" 405 | ] 406 | } 407 | ], 408 | "source": [ 409 | "test_cbow(cbow_train, cbow_model)\n", 410 | "print('------')\n", 411 | "test_skipgram(skipgram_train, sg_model)" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": 9, 417 | "metadata": {}, 418 | "outputs": [ 419 | { 420 | "data": { 421 | "text/plain": [ 422 | "
" 423 | ] 424 | }, 425 | "metadata": {}, 426 | "output_type": "display_data" 427 | }, 428 | { 429 | "data": { 430 | "image/png": "\n", 431 | "text/plain": [ 432 | "
" 433 | ] 434 | }, 435 | "metadata": { 436 | "needs_background": "light" 437 | }, 438 | "output_type": "display_data" 439 | }, 440 | { 441 | "data": { 442 | "text/plain": [ 443 | "
" 444 | ] 445 | }, 446 | "metadata": {}, 447 | "output_type": "display_data" 448 | }, 449 | { 450 | "data": { 451 | "image/png": "\n", 452 | "text/plain": [ 453 | "
" 454 | ] 455 | }, 456 | "metadata": { 457 | "needs_background": "light" 458 | }, 459 | "output_type": "display_data" 460 | } 461 | ], 462 | "source": [ 463 | "%matplotlib inline\n", 464 | "import matplotlib.pyplot as plt\n", 465 | "import numpy as np\n", 466 | "\n", 467 | "def showPlot(points, title):\n", 468 | " plt.figure()\n", 469 | " fig, ax = plt.subplots()\n", 470 | " plt.plot(points)\n", 471 | "\n", 472 | "showPlot(cbow_losses, 'CBOW Losses')\n", 473 | "showPlot(sg_losses, 'SkipGram Losses')" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": null, 479 | "metadata": {}, 480 | "outputs": [], 481 | "source": [] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": 10, 486 | "metadata": {}, 487 | "outputs": [ 488 | { 489 | "name": "stdout", 490 | "output_type": "stream", 491 | "text": [ 492 | "Loaded 400000 words\n" 493 | ] 494 | } 495 | ], 496 | "source": [ 497 | "import torch\n", 498 | "import torchtext.vocab as vocab\n", 499 | "glove = vocab.GloVe(name = \"6B\", dim = 100)\n", 500 | "print(\"Loaded {} words\".format(len(glove.itos)))" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": 11, 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [ 509 | "def get_word(word):\n", 510 | " return glove.vectors[glove.stoi[word]]" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": 12, 516 | "metadata": {}, 517 | "outputs": [], 518 | "source": [ 519 | "def closest(vec, n = 10):\n", 520 | " all_dists = [(w, torch.dist(vec, get_word(w))) for w in glove.itos]\n", 521 | " return sorted(all_dists, key = lambda t: t[1])[:n]" 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": 13, 527 | "metadata": {}, 528 | "outputs": [], 529 | "source": [ 530 | "def print_tuples(tuples):\n", 531 | " for tuple in tuples:\n", 532 | " print(\"(%.4f) %s\" % (tuple[1], tuple[0]))" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": 14, 538 | "metadata": {}, 539 | "outputs": [ 540 | { 541 | "name": "stdout", 542 | "output_type": "stream", 543 | "text": [ 544 | "(0.0000) google\n", 545 | "(3.0772) yahoo\n", 546 | "(3.8836) microsoft\n", 547 | "(4.1048) web\n", 548 | "(4.1082) aol\n", 549 | "(4.1165) facebook\n", 550 | "(4.3917) ebay\n", 551 | "(4.4122) msn\n", 552 | "(4.4540) internet\n", 553 | "(4.4651) netscape\n" 554 | ] 555 | } 556 | ], 557 | "source": [ 558 | "print_tuples(closest(get_word(\"google\")))" 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": 15, 564 | "metadata": {}, 565 | "outputs": [], 566 | "source": [ 567 | "def analogy(w1, w2, w3, n=5, filter_given=True):\n", 568 | " print('\\n[%s : %s :: %s : ?]' % (w1, w2, w3))\n", 569 | " closest_words = closest(get_word(w2) - get_word(w1) + get_word(w3)) \n", 570 | " if filter_given:\n", 571 | " closest_words = [t for t in closest_words if t[0] not in [w1, w2, w3]]\n", 572 | " \n", 573 | " print_tuples(closest_words[:n])" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": 16, 579 | "metadata": {}, 580 | "outputs": [ 581 | { 582 | "name": "stdout", 583 | "output_type": "stream", 584 | "text": [ 585 | "\n", 586 | "[king : man :: queen : ?]\n", 587 | "(4.0811) woman\n", 588 | "(4.6916) girl\n", 589 | "(5.2703) she\n", 590 | "(5.2788) teenager\n", 591 | "(5.3084) boy\n" 592 | ] 593 | } 594 | ], 595 | "source": [ 596 | "analogy(\"king\", \"man\", \"queen\")" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": 17, 602 | "metadata": {}, 603 | "outputs": [ 604 | { 605 | "name": "stdout", 606 | "output_type": "stream", 607 | "text": [ 608 | "\n", 609 | "[man : actor :: woman : ?]\n", 610 | "(2.8133) actress\n", 611 | "(5.0039) comedian\n", 612 | "(5.1399) actresses\n", 613 | "(5.2773) starred\n", 614 | "(5.3085) screenwriter\n", 615 | "\n", 616 | "[cat : kitten :: dog : ?]\n", 617 | "(3.8146) puppy\n", 618 | "(4.2944) rottweiler\n", 619 | "(4.5888) puppies\n", 620 | "(4.6086) pooch\n", 621 | "(4.6520) pug\n", 622 | "\n", 623 | "[dog : puppy :: cat : ?]\n", 624 | "(3.8146) kitten\n", 625 | "(4.0255) puppies\n", 626 | "(4.1575) kittens\n", 627 | "(4.1882) pterodactyl\n", 628 | "(4.1945) scaredy\n", 629 | "\n", 630 | "[russia : moscow :: france : ?]\n", 631 | "(3.2697) paris\n", 632 | "(4.6857) french\n", 633 | "(4.7085) lyon\n", 634 | "(4.9087) strasbourg\n", 635 | "(5.0362) marseille\n", 636 | "\n", 637 | "[obama : president :: trump : ?]\n", 638 | "(6.4302) executive\n", 639 | "(6.5149) founder\n", 640 | "(6.6997) ceo\n", 641 | "(6.7524) hilton\n", 642 | "(6.7729) walt\n", 643 | "\n", 644 | "[rich : mansion :: poor : ?]\n", 645 | "(5.8262) residence\n", 646 | "(5.9444) riverside\n", 647 | "(6.0283) hillside\n", 648 | "(6.0328) abandoned\n", 649 | "(6.0681) bungalow\n", 650 | "\n", 651 | "[elvis : rock :: eminem : ?]\n", 652 | "(5.6597) rap\n", 653 | "(6.2057) rappers\n", 654 | "(6.2161) rapper\n", 655 | "(6.2444) punk\n", 656 | "(6.2690) hop\n", 657 | "\n", 658 | "[paper : newspaper :: screen : ?]\n", 659 | "(4.7810) tv\n", 660 | "(5.1049) television\n", 661 | "(5.3818) cinema\n", 662 | "(5.5524) feature\n", 663 | "(5.5646) shows\n", 664 | "\n", 665 | "[monet : paint :: michelangelo : ?]\n", 666 | "(6.0782) plaster\n", 667 | "(6.3768) mold\n", 668 | "(6.3922) tile\n", 669 | "(6.5819) marble\n", 670 | "(6.6524) image\n", 671 | "\n", 672 | "[beer : barley :: wine : ?]\n", 673 | "(5.6021) grape\n", 674 | "(5.6760) beans\n", 675 | "(5.8174) grapes\n", 676 | "(5.9035) lentils\n", 677 | "(5.9454) figs\n", 678 | "\n", 679 | "[earth : moon :: sun : ?]\n", 680 | "(6.2294) lee\n", 681 | "(6.4125) kang\n", 682 | "(6.4644) tan\n", 683 | "(6.4757) yang\n", 684 | "(6.4853) lin\n", 685 | "\n", 686 | "[house : roof :: castle : ?]\n", 687 | "(6.2919) stonework\n", 688 | "(6.3779) masonry\n", 689 | "(6.4773) canopy\n", 690 | "(6.4954) fortress\n", 691 | "(6.5259) battlements\n", 692 | "\n", 693 | "[building : architect :: software : ?]\n", 694 | "(5.8369) programmer\n", 695 | "(6.8881) entrepreneur\n", 696 | "(6.9240) inventor\n", 697 | "(6.9730) developer\n", 698 | "(6.9949) innovator\n", 699 | "\n", 700 | "[boston : bruins :: phoenix : ?]\n", 701 | "(3.8546) suns\n", 702 | "(4.1968) mavericks\n", 703 | "(4.6126) coyotes\n", 704 | "(4.6894) mavs\n", 705 | "(4.6971) knicks\n", 706 | "\n", 707 | "[good : heaven :: bad : ?]\n", 708 | "(4.3959) hell\n", 709 | "(5.2864) ghosts\n", 710 | "(5.2898) hades\n", 711 | "(5.3414) madness\n", 712 | "(5.3520) purgatory\n", 713 | "\n", 714 | "[jordan : basketball :: woods : ?]\n", 715 | "(5.8607) golf\n", 716 | "(6.4110) golfers\n", 717 | "(6.4418) tournament\n", 718 | "(6.4592) tennis\n", 719 | "(6.6560) collegiate\n" 720 | ] 721 | } 722 | ], 723 | "source": [ 724 | "analogy('man', 'actor', 'woman')\n", 725 | "analogy('cat', 'kitten', 'dog')\n", 726 | "analogy('dog', 'puppy', 'cat')\n", 727 | "analogy('russia', 'moscow', 'france')\n", 728 | "analogy('obama', 'president', 'trump')\n", 729 | "analogy('rich', 'mansion', 'poor')\n", 730 | "analogy('elvis', 'rock', 'eminem')\n", 731 | "analogy('paper', 'newspaper', 'screen')\n", 732 | "analogy('monet', 'paint', 'michelangelo')\n", 733 | "analogy('beer', 'barley', 'wine')\n", 734 | "analogy('earth', 'moon', 'sun')\n", 735 | "analogy('house', 'roof', 'castle')\n", 736 | "analogy('building', 'architect', 'software')\n", 737 | "analogy('boston', 'bruins', 'phoenix')\n", 738 | "analogy('good', 'heaven', 'bad')\n", 739 | "analogy('jordan', 'basketball', 'woods')" 740 | ] 741 | }, 742 | { 743 | "cell_type": "code", 744 | "execution_count": null, 745 | "metadata": {}, 746 | "outputs": [], 747 | "source": [] 748 | } 749 | ], 750 | "metadata": { 751 | "kernelspec": { 752 | "display_name": "Python 3", 753 | "language": "python", 754 | "name": "python3" 755 | }, 756 | "language_info": { 757 | "codemirror_mode": { 758 | "name": "ipython", 759 | "version": 3 760 | }, 761 | "file_extension": ".py", 762 | "mimetype": "text/x-python", 763 | "name": "python", 764 | "nbconvert_exporter": "python", 765 | "pygments_lexer": "ipython3", 766 | "version": "3.6.8" 767 | } 768 | }, 769 | "nbformat": 4, 770 | "nbformat_minor": 4 771 | } 772 | -------------------------------------------------------------------------------- /3_Tagging_RNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import sys\n", 11 | "import time\n", 12 | "import torch\n", 13 | "import numpy as np\n", 14 | "import torch.nn as nn\n", 15 | "import torch.optim as optim\n", 16 | "\n", 17 | "from torch.nn import functional as F\n", 18 | "from torch.autograd import Variable\n", 19 | "from torchtext import data\n", 20 | "from torchtext import datasets\n", 21 | "from torchtext.vocab import Vectors, GloVe\n", 22 | "\n", 23 | "def load_dataset(test_sen=None): \n", 24 | " tokenize = lambda x: x.split()\n", 25 | " TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True, batch_first=True, fix_length=200)\n", 26 | " LABEL = data.LabelField()\n", 27 | " train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)\n", 28 | " TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))\n", 29 | " LABEL.build_vocab(train_data)\n", 30 | "\n", 31 | " word_embeddings = TEXT.vocab.vectors\n", 32 | " print (\"Length of Text Vocabulary: \" + str(len(TEXT.vocab)))\n", 33 | " print (\"Vector size of Text Vocabulary: \", TEXT.vocab.vectors.size())\n", 34 | " print (\"Label Length: \" + str(len(LABEL.vocab)))\n", 35 | "\n", 36 | " train_data, valid_data = train_data.split()\n", 37 | " train_iter, valid_iter, test_iter = data.BucketIterator.splits((train_data, valid_data, test_data), batch_size=32, sort_key=lambda x: len(x.text), repeat=False, shuffle=True)\n", 38 | "\n", 39 | " vocab_size = len(TEXT.vocab)\n", 40 | "\n", 41 | " return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "class RNN(nn.Module):\n", 51 | " def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights):\n", 52 | " super(RNN, self).__init__()\n", 53 | " \n", 54 | " self.batch_size = batch_size\n", 55 | " self.output_size = output_size\n", 56 | " self.hidden_size = hidden_size\n", 57 | " self.vocab_size = vocab_size\n", 58 | " self.embedding_length = embedding_length\n", 59 | " \n", 60 | " self.word_embeddings = nn.Embedding(vocab_size, embedding_length)\n", 61 | " self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False)\n", 62 | " self.rnn = nn.RNN(embedding_length, hidden_size, num_layers=2, bidirectional=True)\n", 63 | " self.label = nn.Linear(4*hidden_size, output_size)\n", 64 | " \n", 65 | " def forward(self, input_sentences, batch_size=None):\n", 66 | " input = self.word_embeddings(input_sentences)\n", 67 | " input = input.permute(1, 0, 2)\n", 68 | " \n", 69 | " if batch_size is None:\n", 70 | " h_0 = Variable(torch.zeros(4, self.batch_size, self.hidden_size).cuda())\n", 71 | " else:\n", 72 | " h_0 = Variable(torch.zeros(4, batch_size, self.hidden_size).cuda())\n", 73 | " \n", 74 | " output, h_n = self.rnn(input, h_0)\n", 75 | " h_n = h_n.permute(1, 0, 2)\n", 76 | " h_n = h_n.contiguous().view(h_n.size()[0], h_n.size()[1]*h_n.size()[2])\n", 77 | " logits = self.label(h_n)\n", 78 | " \n", 79 | " return logits" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 3, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "Length of Text Vocabulary: 251639\n", 92 | "Vector size of Text Vocabulary: torch.Size([251639, 300])\n", 93 | "Label Length: 2\n", 94 | "Epoch: 1, Idx: 100, Training Loss: 0.6538, Training Accuracy: 59.38%\n", 95 | "Epoch: 1, Idx: 200, Training Loss: 0.7323, Training Accuracy: 28.12%\n", 96 | "Epoch: 1, Idx: 300, Training Loss: 0.6863, Training Accuracy: 62.50%\n", 97 | "Epoch: 1, Idx: 400, Training Loss: 0.6000, Training Accuracy: 68.75%\n", 98 | "Epoch: 1, Idx: 500, Training Loss: 0.6716, Training Accuracy: 65.62%\n", 99 | "Epoch: 01, Train Loss: 0.701, Train Acc: 56.00%, Val. Loss: 0.684682, Val. Acc: 53.44%\n", 100 | "Epoch: 2, Idx: 100, Training Loss: 0.6071, Training Accuracy: 62.50%\n", 101 | "Epoch: 2, Idx: 200, Training Loss: 0.6640, Training Accuracy: 53.12%\n", 102 | "Epoch: 2, Idx: 300, Training Loss: 0.7665, Training Accuracy: 43.75%\n", 103 | "Epoch: 2, Idx: 400, Training Loss: 0.7446, Training Accuracy: 65.62%\n", 104 | "Epoch: 2, Idx: 500, Training Loss: 0.7483, Training Accuracy: 40.62%\n", 105 | "Epoch: 02, Train Loss: 0.702, Train Acc: 55.19%, Val. Loss: 0.726367, Val. Acc: 53.12%\n", 106 | "Epoch: 3, Idx: 100, Training Loss: 0.6248, Training Accuracy: 68.75%\n", 107 | "Epoch: 3, Idx: 200, Training Loss: 0.7327, Training Accuracy: 50.00%\n", 108 | "Epoch: 3, Idx: 300, Training Loss: 0.6317, Training Accuracy: 65.62%\n", 109 | "Epoch: 3, Idx: 400, Training Loss: 0.5729, Training Accuracy: 71.88%\n", 110 | "Epoch: 3, Idx: 500, Training Loss: 0.6937, Training Accuracy: 56.25%\n", 111 | "Epoch: 03, Train Loss: 0.685, Train Acc: 60.74%, Val. Loss: 0.728285, Val. Acc: 54.92%\n", 112 | "Epoch: 4, Idx: 100, Training Loss: 0.7239, Training Accuracy: 59.38%\n", 113 | "Epoch: 4, Idx: 200, Training Loss: 0.5938, Training Accuracy: 75.00%\n", 114 | "Epoch: 4, Idx: 300, Training Loss: 0.9016, Training Accuracy: 53.12%\n", 115 | "Epoch: 4, Idx: 400, Training Loss: 0.5571, Training Accuracy: 78.12%\n", 116 | "Epoch: 4, Idx: 500, Training Loss: 0.5879, Training Accuracy: 65.62%\n", 117 | "Epoch: 04, Train Loss: 0.667, Train Acc: 61.56%, Val. Loss: 0.678201, Val. Acc: 57.76%\n", 118 | "Epoch: 5, Idx: 100, Training Loss: 0.6851, Training Accuracy: 56.25%\n", 119 | "Epoch: 5, Idx: 200, Training Loss: 0.6559, Training Accuracy: 62.50%\n", 120 | "Epoch: 5, Idx: 300, Training Loss: 0.6128, Training Accuracy: 62.50%\n", 121 | "Epoch: 5, Idx: 400, Training Loss: 0.6151, Training Accuracy: 59.38%\n", 122 | "Epoch: 5, Idx: 500, Training Loss: 0.6839, Training Accuracy: 68.75%\n", 123 | "Epoch: 05, Train Loss: 0.669, Train Acc: 61.76%, Val. Loss: 0.631271, Val. Acc: 66.40%\n", 124 | "Epoch: 6, Idx: 100, Training Loss: 0.4968, Training Accuracy: 78.12%\n", 125 | "Epoch: 6, Idx: 200, Training Loss: 0.7118, Training Accuracy: 62.50%\n", 126 | "Epoch: 6, Idx: 300, Training Loss: 0.5181, Training Accuracy: 81.25%\n", 127 | "Epoch: 6, Idx: 400, Training Loss: 0.5818, Training Accuracy: 75.00%\n", 128 | "Epoch: 6, Idx: 500, Training Loss: 0.5787, Training Accuracy: 68.75%\n", 129 | "Epoch: 06, Train Loss: 0.664, Train Acc: 62.54%, Val. Loss: 0.714283, Val. Acc: 53.20%\n", 130 | "Epoch: 7, Idx: 100, Training Loss: 0.7741, Training Accuracy: 65.62%\n", 131 | "Epoch: 7, Idx: 200, Training Loss: 0.6719, Training Accuracy: 62.50%\n", 132 | "Epoch: 7, Idx: 300, Training Loss: 0.5993, Training Accuracy: 68.75%\n", 133 | "Epoch: 7, Idx: 400, Training Loss: 0.7759, Training Accuracy: 46.88%\n", 134 | "Epoch: 7, Idx: 500, Training Loss: 0.6450, Training Accuracy: 59.38%\n", 135 | "Epoch: 07, Train Loss: 0.659, Train Acc: 62.85%, Val. Loss: 0.643714, Val. Acc: 61.96%\n", 136 | "Epoch: 8, Idx: 100, Training Loss: 0.6427, Training Accuracy: 75.00%\n", 137 | "Epoch: 8, Idx: 200, Training Loss: 0.7509, Training Accuracy: 46.88%\n", 138 | "Epoch: 8, Idx: 300, Training Loss: 0.7016, Training Accuracy: 53.12%\n", 139 | "Epoch: 8, Idx: 400, Training Loss: 0.6085, Training Accuracy: 71.88%\n", 140 | "Epoch: 8, Idx: 500, Training Loss: 0.5723, Training Accuracy: 71.88%\n", 141 | "Epoch: 08, Train Loss: 0.661, Train Acc: 63.19%, Val. Loss: 0.631669, Val. Acc: 66.58%\n", 142 | "Epoch: 9, Idx: 100, Training Loss: 0.6699, Training Accuracy: 56.25%\n", 143 | "Epoch: 9, Idx: 200, Training Loss: 0.7980, Training Accuracy: 56.25%\n", 144 | "Epoch: 9, Idx: 300, Training Loss: 0.7833, Training Accuracy: 56.25%\n", 145 | "Epoch: 9, Idx: 400, Training Loss: 0.6437, Training Accuracy: 56.25%\n", 146 | "Epoch: 9, Idx: 500, Training Loss: 0.6734, Training Accuracy: 65.62%\n", 147 | "Epoch: 09, Train Loss: 0.658, Train Acc: 63.68%, Val. Loss: 0.675087, Val. Acc: 61.79%\n", 148 | "Epoch: 10, Idx: 100, Training Loss: 0.5863, Training Accuracy: 75.00%\n", 149 | "Epoch: 10, Idx: 200, Training Loss: 0.6492, Training Accuracy: 71.88%\n", 150 | "Epoch: 10, Idx: 300, Training Loss: 0.5064, Training Accuracy: 84.38%\n", 151 | "Epoch: 10, Idx: 400, Training Loss: 0.8142, Training Accuracy: 53.12%\n", 152 | "Epoch: 10, Idx: 500, Training Loss: 0.7308, Training Accuracy: 46.88%\n", 153 | "Epoch: 10, Train Loss: 0.651, Train Acc: 64.40%, Val. Loss: 0.645745, Val. Acc: 65.99%\n", 154 | "Test Loss: 0.647, Test Acc: 65.61%\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter = load_dataset()\n", 160 | "\n", 161 | "def clip_gradient(model, clip_value):\n", 162 | " params = list(filter(lambda p: p.grad is not None, model.parameters()))\n", 163 | " for p in params:\n", 164 | " p.grad.data.clamp_(-clip_value, clip_value)\n", 165 | " \n", 166 | "def train_model(model, train_iter, epoch):\n", 167 | " total_epoch_loss = 0\n", 168 | " total_epoch_acc = 0\n", 169 | " model.cuda()\n", 170 | " optim = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))\n", 171 | " steps = 0\n", 172 | " model.train()\n", 173 | " for idx, batch in enumerate(train_iter):\n", 174 | " text = batch.text[0]\n", 175 | " target = batch.label\n", 176 | " target = torch.autograd.Variable(target).long()\n", 177 | " if torch.cuda.is_available():\n", 178 | " text = text.cuda()\n", 179 | " target = target.cuda()\n", 180 | " if (text.size()[0] is not 32):# One of the batch returned by BucketIterator has length different than 32.\n", 181 | " continue\n", 182 | " optim.zero_grad()\n", 183 | " prediction = model(text)\n", 184 | " loss = loss_fn(prediction, target)\n", 185 | " num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).float().sum()\n", 186 | " acc = 100.0 * num_corrects/len(batch)\n", 187 | " loss.backward()\n", 188 | " clip_gradient(model, 1e-1)\n", 189 | " optim.step()\n", 190 | " steps += 1\n", 191 | " \n", 192 | " if steps % 100 == 0:\n", 193 | " print (f'Epoch: {epoch+1}, Idx: {idx+1}, Training Loss: {loss.item():.4f}, Training Accuracy: {acc.item(): .2f}%')\n", 194 | " \n", 195 | " total_epoch_loss += loss.item()\n", 196 | " total_epoch_acc += acc.item()\n", 197 | " \n", 198 | " return total_epoch_loss/len(train_iter), total_epoch_acc/len(train_iter)\n", 199 | "\n", 200 | "def eval_model(model, val_iter):\n", 201 | " total_epoch_loss = 0\n", 202 | " total_epoch_acc = 0\n", 203 | " model.eval()\n", 204 | " with torch.no_grad():\n", 205 | " for idx, batch in enumerate(val_iter):\n", 206 | " text = batch.text[0]\n", 207 | " if (text.size()[0] is not 32):\n", 208 | " continue\n", 209 | " target = batch.label\n", 210 | " target = torch.autograd.Variable(target).long()\n", 211 | " if torch.cuda.is_available():\n", 212 | " text = text.cuda()\n", 213 | " target = target.cuda()\n", 214 | " prediction = model(text)\n", 215 | " loss = loss_fn(prediction, target)\n", 216 | " num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).sum()\n", 217 | " acc = 100.0 * num_corrects/len(batch)\n", 218 | " total_epoch_loss += loss.item()\n", 219 | " total_epoch_acc += acc.item()\n", 220 | "\n", 221 | " return total_epoch_loss/len(val_iter), total_epoch_acc/len(val_iter)\n", 222 | "\n", 223 | "learning_rate = 1e-5\n", 224 | "batch_size = 32\n", 225 | "output_size = 2\n", 226 | "hidden_size = 256\n", 227 | "embedding_length = 300\n", 228 | "\n", 229 | "model = RNN(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)\n", 230 | "loss_fn = F.cross_entropy\n", 231 | "\n", 232 | "for epoch in range(10):\n", 233 | " train_loss, train_acc = train_model(model, train_iter, epoch)\n", 234 | " val_loss, val_acc = eval_model(model, valid_iter)\n", 235 | " \n", 236 | " print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%')\n", 237 | " \n", 238 | "test_loss, test_acc = eval_model(model, test_iter)\n", 239 | "print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 4, 245 | "metadata": {}, 246 | "outputs": [ 247 | { 248 | "name": "stdout", 249 | "output_type": "stream", 250 | "text": [ 251 | "tensor([[0.1081, 0.8919]], device='cuda:0', grad_fn=)\n", 252 | "Sentiment: Positive\n" 253 | ] 254 | }, 255 | { 256 | "name": "stderr", 257 | "output_type": "stream", 258 | "text": [ 259 | "c:\\users\\justin\\venv\\lib\\site-packages\\ipykernel_launcher.py:8: UserWarning: volatile was removed and now has no effect. Use `with torch.no_grad():` instead.\n", 260 | " \n" 261 | ] 262 | } 263 | ], 264 | "source": [ 265 | "test_sen1 = \"This is one of the best creation of Nolan. I can say, it's his magnum opus. Loved the soundtrack and especially those creative dialogues.\"\n", 266 | "\n", 267 | "test_sen1 = TEXT.preprocess(test_sen1)\n", 268 | "test_sen1 = [[TEXT.vocab.stoi[x] for x in test_sen1]]\n", 269 | "\n", 270 | "test_sen = np.asarray(test_sen1)\n", 271 | "test_sen = torch.LongTensor(test_sen)\n", 272 | "test_tensor = Variable(test_sen, volatile=True)\n", 273 | "test_tensor = test_tensor.cuda()\n", 274 | "model.eval()\n", 275 | "output = model(test_tensor, 1)\n", 276 | "out = F.softmax(output, 1)\n", 277 | "print(out)\n", 278 | "if (torch.argmax(out[0]) == 1):\n", 279 | " print (\"Sentiment: Positive\")\n", 280 | "else:\n", 281 | " print (\"Sentiment: Negative\")" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 5, 287 | "metadata": {}, 288 | "outputs": [ 289 | { 290 | "name": "stdout", 291 | "output_type": "stream", 292 | "text": [ 293 | "tensor([[0.6741, 0.3259]], device='cuda:0', grad_fn=)\n", 294 | "Sentiment: Negative\n" 295 | ] 296 | }, 297 | { 298 | "name": "stderr", 299 | "output_type": "stream", 300 | "text": [ 301 | "c:\\users\\justin\\venv\\lib\\site-packages\\ipykernel_launcher.py:7: UserWarning: volatile was removed and now has no effect. Use `with torch.no_grad():` instead.\n", 302 | " import sys\n" 303 | ] 304 | } 305 | ], 306 | "source": [ 307 | "test_sen2 = \"Ohh, such a ridiculous movie. Not gonna recommend it to anyone. Complete waste of time and money.\"\n", 308 | "test_sen2 = TEXT.preprocess(test_sen2)\n", 309 | "test_sen2 = [[TEXT.vocab.stoi[x] for x in test_sen2]]\n", 310 | "\n", 311 | "test_sen = np.asarray(test_sen2)\n", 312 | "test_sen = torch.LongTensor(test_sen)\n", 313 | "test_tensor = Variable(test_sen, volatile=True)\n", 314 | "test_tensor = test_tensor.cuda()\n", 315 | "model.eval()\n", 316 | "output = model(test_tensor, 1)\n", 317 | "out = F.softmax(output, 1)\n", 318 | "print(out)\n", 319 | "if (torch.argmax(out[0]) == 1):\n", 320 | " print (\"Sentiment: Positive\")\n", 321 | "else:\n", 322 | " print (\"Sentiment: Negative\")" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 6, 328 | "metadata": {}, 329 | "outputs": [ 330 | { 331 | "name": "stdout", 332 | "output_type": "stream", 333 | "text": [ 334 | "Epoch: 1, Idx: 100, Training Loss: 0.6934, Training Accuracy: 50.00%\n", 335 | "Epoch: 1, Idx: 200, Training Loss: 0.7078, Training Accuracy: 46.88%\n", 336 | "Epoch: 1, Idx: 300, Training Loss: 0.6930, Training Accuracy: 56.25%\n", 337 | "Epoch: 1, Idx: 400, Training Loss: 0.6874, Training Accuracy: 56.25%\n", 338 | "Epoch: 1, Idx: 500, Training Loss: 0.6718, Training Accuracy: 78.12%\n", 339 | "Epoch: 01, Train Loss: 0.690, Train Acc: 51.97%, Val. Loss: 0.688056, Val. Acc: 51.27%\n", 340 | "Epoch: 2, Idx: 100, Training Loss: 0.6545, Training Accuracy: 62.50%\n", 341 | "Epoch: 2, Idx: 200, Training Loss: 0.6695, Training Accuracy: 62.50%\n", 342 | "Epoch: 2, Idx: 300, Training Loss: 0.6926, Training Accuracy: 53.12%\n", 343 | "Epoch: 2, Idx: 400, Training Loss: 0.6827, Training Accuracy: 56.25%\n", 344 | "Epoch: 2, Idx: 500, Training Loss: 0.6129, Training Accuracy: 68.75%\n", 345 | "Epoch: 02, Train Loss: 0.667, Train Acc: 58.72%, Val. Loss: 0.682012, Val. Acc: 54.80%\n", 346 | "Epoch: 3, Idx: 100, Training Loss: 0.6308, Training Accuracy: 68.75%\n", 347 | "Epoch: 3, Idx: 200, Training Loss: 0.4342, Training Accuracy: 84.38%\n", 348 | "Epoch: 3, Idx: 300, Training Loss: 0.6503, Training Accuracy: 62.50%\n", 349 | "Epoch: 3, Idx: 400, Training Loss: 0.6636, Training Accuracy: 68.75%\n", 350 | "Epoch: 3, Idx: 500, Training Loss: 0.5156, Training Accuracy: 75.00%\n", 351 | "Epoch: 03, Train Loss: 0.590, Train Acc: 69.50%, Val. Loss: 0.494978, Val. Acc: 77.09%\n", 352 | "Epoch: 4, Idx: 100, Training Loss: 0.5605, Training Accuracy: 71.88%\n", 353 | "Epoch: 4, Idx: 200, Training Loss: 0.8281, Training Accuracy: 65.62%\n", 354 | "Epoch: 4, Idx: 300, Training Loss: 0.6036, Training Accuracy: 65.62%\n", 355 | "Epoch: 4, Idx: 400, Training Loss: 0.4735, Training Accuracy: 71.88%\n", 356 | "Epoch: 4, Idx: 500, Training Loss: 0.4546, Training Accuracy: 78.12%\n", 357 | "Epoch: 04, Train Loss: 0.428, Train Acc: 80.67%, Val. Loss: 0.386465, Val. Acc: 82.08%\n", 358 | "Epoch: 5, Idx: 100, Training Loss: 0.3328, Training Accuracy: 87.50%\n", 359 | "Epoch: 5, Idx: 200, Training Loss: 0.3596, Training Accuracy: 78.12%\n", 360 | "Epoch: 5, Idx: 300, Training Loss: 0.3249, Training Accuracy: 87.50%\n", 361 | "Epoch: 5, Idx: 400, Training Loss: 0.6565, Training Accuracy: 68.75%\n", 362 | "Epoch: 5, Idx: 500, Training Loss: 0.4050, Training Accuracy: 78.12%\n", 363 | "Epoch: 05, Train Loss: 0.367, Train Acc: 83.72%, Val. Loss: 0.369900, Val. Acc: 82.79%\n", 364 | "Epoch: 6, Idx: 100, Training Loss: 0.4549, Training Accuracy: 84.38%\n", 365 | "Epoch: 6, Idx: 200, Training Loss: 0.3892, Training Accuracy: 81.25%\n", 366 | "Epoch: 6, Idx: 300, Training Loss: 0.1442, Training Accuracy: 96.88%\n", 367 | "Epoch: 6, Idx: 400, Training Loss: 0.3001, Training Accuracy: 87.50%\n", 368 | "Epoch: 6, Idx: 500, Training Loss: 0.4553, Training Accuracy: 75.00%\n", 369 | "Epoch: 06, Train Loss: 0.324, Train Acc: 85.52%, Val. Loss: 0.367029, Val. Acc: 83.53%\n", 370 | "Epoch: 7, Idx: 100, Training Loss: 0.2308, Training Accuracy: 90.62%\n", 371 | "Epoch: 7, Idx: 200, Training Loss: 0.3394, Training Accuracy: 81.25%\n", 372 | "Epoch: 7, Idx: 300, Training Loss: 0.4261, Training Accuracy: 87.50%\n", 373 | "Epoch: 7, Idx: 400, Training Loss: 0.3106, Training Accuracy: 90.62%\n", 374 | "Epoch: 7, Idx: 500, Training Loss: 0.1421, Training Accuracy: 96.88%\n", 375 | "Epoch: 07, Train Loss: 0.282, Train Acc: 87.91%, Val. Loss: 0.378974, Val. Acc: 83.80%\n", 376 | "Epoch: 8, Idx: 100, Training Loss: 0.1280, Training Accuracy: 96.88%\n", 377 | "Epoch: 8, Idx: 200, Training Loss: 0.4244, Training Accuracy: 84.38%\n", 378 | "Epoch: 8, Idx: 300, Training Loss: 0.3225, Training Accuracy: 90.62%\n", 379 | "Epoch: 8, Idx: 400, Training Loss: 0.3618, Training Accuracy: 84.38%\n", 380 | "Epoch: 8, Idx: 500, Training Loss: 0.2334, Training Accuracy: 87.50%\n", 381 | "Epoch: 08, Train Loss: 0.232, Train Acc: 90.26%, Val. Loss: 0.395538, Val. Acc: 83.34%\n", 382 | "Epoch: 9, Idx: 100, Training Loss: 0.1379, Training Accuracy: 96.88%\n", 383 | "Epoch: 9, Idx: 200, Training Loss: 0.2220, Training Accuracy: 87.50%\n", 384 | "Epoch: 9, Idx: 300, Training Loss: 0.2743, Training Accuracy: 87.50%\n", 385 | "Epoch: 9, Idx: 400, Training Loss: 0.3071, Training Accuracy: 84.38%\n", 386 | "Epoch: 9, Idx: 500, Training Loss: 0.1465, Training Accuracy: 93.75%\n", 387 | "Epoch: 09, Train Loss: 0.181, Train Acc: 92.97%, Val. Loss: 0.433728, Val. Acc: 83.30%\n", 388 | "Epoch: 10, Idx: 100, Training Loss: 0.0535, Training Accuracy: 96.88%\n", 389 | "Epoch: 10, Idx: 200, Training Loss: 0.1756, Training Accuracy: 90.62%\n", 390 | "Epoch: 10, Idx: 300, Training Loss: 0.0970, Training Accuracy: 100.00%\n", 391 | "Epoch: 10, Idx: 400, Training Loss: 0.1835, Training Accuracy: 93.75%\n", 392 | "Epoch: 10, Idx: 500, Training Loss: 0.0648, Training Accuracy: 96.88%\n", 393 | "Epoch: 10, Train Loss: 0.133, Train Acc: 94.96%, Val. Loss: 0.458091, Val. Acc: 82.26%\n", 394 | "Test Loss: 0.456, Test Acc: 82.61%\n" 395 | ] 396 | } 397 | ], 398 | "source": [ 399 | "class LSTMClassifier(nn.Module):\n", 400 | " def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights):\n", 401 | " super(LSTMClassifier, self).__init__()\n", 402 | " self.batch_size = batch_size\n", 403 | " self.output_size = output_size\n", 404 | " self.hidden_size = hidden_size\n", 405 | " self.vocab_size = vocab_size\n", 406 | " self.embedding_length = embedding_length\n", 407 | "\n", 408 | " self.word_embeddings = nn.Embedding(vocab_size, embedding_length)# Initializing the look-up table.\n", 409 | " self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False) # Assigning the look-up table to the pre-trained GloVe word embedding.\n", 410 | " self.lstm = nn.LSTM(embedding_length, hidden_size)\n", 411 | " self.label = nn.Linear(hidden_size, output_size)\n", 412 | " \n", 413 | " def forward(self, input_sentence, batch_size=None):\n", 414 | " input = self.word_embeddings(input_sentence) # embedded input of shape = (batch_size, num_sequences, embedding_length)\n", 415 | " input = input.permute(1, 0, 2) # input.size() = (num_sequences, batch_size, embedding_length)\n", 416 | " if batch_size is None:\n", 417 | " h_0 = Variable(torch.zeros(1, self.batch_size, self.hidden_size).cuda()) # Initial hidden state of the LSTM\n", 418 | " c_0 = Variable(torch.zeros(1, self.batch_size, self.hidden_size).cuda()) # Initial cell state of the LSTM\n", 419 | " else:\n", 420 | " h_0 = Variable(torch.zeros(1, batch_size, self.hidden_size).cuda())\n", 421 | " c_0 = Variable(torch.zeros(1, batch_size, self.hidden_size).cuda())\n", 422 | " output, (final_hidden_state, final_cell_state) = self.lstm(input, (h_0, c_0))\n", 423 | " final_output = self.label(final_hidden_state[-1]) # final_hidden_state.size() = (1, batch_size, hidden_size) & final_output.size() = (batch_size, output_size)\n", 424 | " return final_output\n", 425 | " \n", 426 | "model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)\n", 427 | "loss_fn = F.cross_entropy\n", 428 | "\n", 429 | "for epoch in range(10):\n", 430 | " train_loss, train_acc = train_model(model, train_iter, epoch)\n", 431 | " val_loss, val_acc = eval_model(model, valid_iter)\n", 432 | " \n", 433 | " print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%')\n", 434 | " \n", 435 | "test_loss, test_acc = eval_model(model, test_iter)\n", 436 | "print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 7, 442 | "metadata": {}, 443 | "outputs": [ 444 | { 445 | "name": "stdout", 446 | "output_type": "stream", 447 | "text": [ 448 | "tensor([[5.6929e-06, 9.9999e-01]], device='cuda:0', grad_fn=)\n", 449 | "Sentiment: Positive\n" 450 | ] 451 | }, 452 | { 453 | "name": "stderr", 454 | "output_type": "stream", 455 | "text": [ 456 | "c:\\users\\justin\\venv\\lib\\site-packages\\ipykernel_launcher.py:8: UserWarning: volatile was removed and now has no effect. Use `with torch.no_grad():` instead.\n", 457 | " \n" 458 | ] 459 | } 460 | ], 461 | "source": [ 462 | "test_sen1 = \"This is one of the best creation of Nolan. I can say, it's his magnum opus. Loved the soundtrack and especially those creative dialogues.\"\n", 463 | "\n", 464 | "test_sen1 = TEXT.preprocess(test_sen1)\n", 465 | "test_sen1 = [[TEXT.vocab.stoi[x] for x in test_sen1]]\n", 466 | "\n", 467 | "test_sen = np.asarray(test_sen1)\n", 468 | "test_sen = torch.LongTensor(test_sen)\n", 469 | "test_tensor = Variable(test_sen, volatile=True)\n", 470 | "test_tensor = test_tensor.cuda()\n", 471 | "model.eval()\n", 472 | "output = model(test_tensor, 1)\n", 473 | "out = F.softmax(output, 1)\n", 474 | "print(out)\n", 475 | "if (torch.argmax(out[0]) == 1):\n", 476 | " print (\"Sentiment: Positive\")\n", 477 | "else:\n", 478 | " print (\"Sentiment: Negative\")" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 8, 484 | "metadata": {}, 485 | "outputs": [ 486 | { 487 | "name": "stdout", 488 | "output_type": "stream", 489 | "text": [ 490 | "tensor([[0.9989, 0.0011]], device='cuda:0', grad_fn=)\n", 491 | "Sentiment: Negative\n" 492 | ] 493 | }, 494 | { 495 | "name": "stderr", 496 | "output_type": "stream", 497 | "text": [ 498 | "c:\\users\\justin\\venv\\lib\\site-packages\\ipykernel_launcher.py:8: UserWarning: volatile was removed and now has no effect. Use `with torch.no_grad():` instead.\n", 499 | " \n" 500 | ] 501 | } 502 | ], 503 | "source": [ 504 | "test_sen2 = \"Ohh, such a ridiculous movie. Not gonna recommend it to anyone. Complete waste of time and money.\"\n", 505 | "test_sen2 = TEXT.preprocess(test_sen2)\n", 506 | "test_sen2 = [[TEXT.vocab.stoi[x] for x in test_sen2]]\n", 507 | "\n", 508 | "\n", 509 | "test_sen = np.asarray(test_sen2)\n", 510 | "test_sen = torch.LongTensor(test_sen)\n", 511 | "test_tensor = Variable(test_sen, volatile=True)\n", 512 | "test_tensor = test_tensor.cuda()\n", 513 | "model.eval()\n", 514 | "output = model(test_tensor, 1)\n", 515 | "out = F.softmax(output, 1)\n", 516 | "print(out)\n", 517 | "if (torch.argmax(out[0]) == 1):\n", 518 | " print (\"Sentiment: Positive\")\n", 519 | "else:\n", 520 | " print (\"Sentiment: Negative\")" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": 9, 526 | "metadata": {}, 527 | "outputs": [], 528 | "source": [ 529 | "class AttentionModel(torch.nn.Module):\n", 530 | " def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights):\n", 531 | " super(AttentionModel, self).__init__()\n", 532 | " self.batch_size = batch_size\n", 533 | " self.output_size = output_size\n", 534 | " self.hidden_size = hidden_size\n", 535 | "\n", 536 | " self.vocab_size = vocab_size\n", 537 | " self.embedding_length = embedding_length\n", 538 | " \n", 539 | " self.word_embeddings = nn.Embedding(vocab_size, embedding_length)\n", 540 | " self.word_embeddings.weights = nn.Parameter(weights, requires_grad=False)\n", 541 | " self.lstm = nn.LSTM(embedding_length, hidden_size)\n", 542 | " self.label = nn.Linear(hidden_size, output_size)\n", 543 | " \n", 544 | " def attention_net(self, lstm_output, final_state):\n", 545 | " hidden = final_state.squeeze(0)\n", 546 | "\n", 547 | " attn_weights = torch.bmm(lstm_output, hidden.unsqueeze(2)).squeeze(2)\n", 548 | " soft_attn_weights = F.softmax(attn_weights, 1)\n", 549 | " new_hidden_state = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)\n", 550 | " \n", 551 | " return new_hidden_state\n", 552 | " \n", 553 | " def forward(self, input_sentences, batch_size=None):\n", 554 | " input = self.word_embeddings(input_sentences)\n", 555 | "\n", 556 | " input = input.permute(1, 0, 2)\n", 557 | " if batch_size is None:\n", 558 | " h_0 = Variable(torch.zeros(1, self.batch_size, self.hidden_size).cuda())\n", 559 | " c_0 = Variable(torch.zeros(1, self.batch_size, self.hidden_size).cuda())\n", 560 | " else:\n", 561 | " h_0 = Variable(torch.zeros(1, batch_size, self.hidden_size).cuda())\n", 562 | " c_0 = Variable(torch.zeros(1, batch_size, self.hidden_size).cuda())\n", 563 | "\n", 564 | " output, (final_hidden_state, final_cell_state) = self.lstm(input, (h_0, c_0))\n", 565 | " output = output.permute(1, 0, 2)\n", 566 | " \n", 567 | " attn_output = self.attention_net(output, final_hidden_state)\n", 568 | "\n", 569 | " logits = self.label(attn_output)\n", 570 | " \n", 571 | " return logits" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": 10, 577 | "metadata": {}, 578 | "outputs": [ 579 | { 580 | "name": "stdout", 581 | "output_type": "stream", 582 | "text": [ 583 | "Epoch: 1, Idx: 100, Training Loss: 0.7107, Training Accuracy: 46.88%\n", 584 | "Epoch: 1, Idx: 200, Training Loss: 0.6477, Training Accuracy: 65.62%\n", 585 | "Epoch: 1, Idx: 300, Training Loss: 0.6709, Training Accuracy: 59.38%\n", 586 | "Epoch: 1, Idx: 400, Training Loss: 0.6348, Training Accuracy: 71.88%\n", 587 | "Epoch: 1, Idx: 500, Training Loss: 0.6457, Training Accuracy: 71.88%\n", 588 | "Epoch: 01, Train Loss: 0.670, Train Acc: 61.15%, Val. Loss: 0.622058, Val. Acc: 69.28%\n", 589 | "Epoch: 2, Idx: 100, Training Loss: 0.5689, Training Accuracy: 81.25%\n", 590 | "Epoch: 2, Idx: 200, Training Loss: 0.7160, Training Accuracy: 59.38%\n", 591 | "Epoch: 2, Idx: 300, Training Loss: 0.5420, Training Accuracy: 78.12%\n", 592 | "Epoch: 2, Idx: 400, Training Loss: 0.4943, Training Accuracy: 78.12%\n", 593 | "Epoch: 2, Idx: 500, Training Loss: 0.4309, Training Accuracy: 87.50%\n", 594 | "Epoch: 02, Train Loss: 0.517, Train Acc: 76.50%, Val. Loss: 0.500942, Val. Acc: 76.73%\n", 595 | "Epoch: 3, Idx: 100, Training Loss: 0.4180, Training Accuracy: 78.12%\n", 596 | "Epoch: 3, Idx: 200, Training Loss: 0.1869, Training Accuracy: 93.75%\n", 597 | "Epoch: 3, Idx: 300, Training Loss: 0.4827, Training Accuracy: 75.00%\n", 598 | "Epoch: 3, Idx: 400, Training Loss: 0.3836, Training Accuracy: 84.38%\n", 599 | "Epoch: 3, Idx: 500, Training Loss: 0.6573, Training Accuracy: 68.75%\n", 600 | "Epoch: 03, Train Loss: 0.315, Train Acc: 87.20%, Val. Loss: 0.440898, Val. Acc: 79.07%\n", 601 | "Epoch: 4, Idx: 100, Training Loss: 0.1041, Training Accuracy: 96.88%\n", 602 | "Epoch: 4, Idx: 200, Training Loss: 0.0691, Training Accuracy: 100.00%\n", 603 | "Epoch: 4, Idx: 300, Training Loss: 0.0704, Training Accuracy: 96.88%\n", 604 | "Epoch: 4, Idx: 400, Training Loss: 0.1435, Training Accuracy: 93.75%\n", 605 | "Epoch: 4, Idx: 500, Training Loss: 0.1228, Training Accuracy: 96.88%\n", 606 | "Epoch: 04, Train Loss: 0.163, Train Acc: 93.80%, Val. Loss: 0.498487, Val. Acc: 81.04%\n", 607 | "Epoch: 5, Idx: 100, Training Loss: 0.0418, Training Accuracy: 96.88%\n", 608 | "Epoch: 5, Idx: 200, Training Loss: 0.0405, Training Accuracy: 96.88%\n", 609 | "Epoch: 5, Idx: 300, Training Loss: 0.1384, Training Accuracy: 90.62%\n", 610 | "Epoch: 5, Idx: 400, Training Loss: 0.2633, Training Accuracy: 90.62%\n", 611 | "Epoch: 5, Idx: 500, Training Loss: 0.0360, Training Accuracy: 100.00%\n", 612 | "Epoch: 05, Train Loss: 0.079, Train Acc: 97.15%, Val. Loss: 0.572422, Val. Acc: 81.11%\n", 613 | "Epoch: 6, Idx: 100, Training Loss: 0.0018, Training Accuracy: 100.00%\n", 614 | "Epoch: 6, Idx: 200, Training Loss: 0.0444, Training Accuracy: 96.88%\n", 615 | "Epoch: 6, Idx: 300, Training Loss: 0.1177, Training Accuracy: 96.88%\n", 616 | "Epoch: 6, Idx: 400, Training Loss: 0.1992, Training Accuracy: 96.88%\n", 617 | "Epoch: 6, Idx: 500, Training Loss: 0.0245, Training Accuracy: 100.00%\n", 618 | "Epoch: 06, Train Loss: 0.034, Train Acc: 98.77%, Val. Loss: 0.777436, Val. Acc: 81.54%\n", 619 | "Epoch: 7, Idx: 100, Training Loss: 0.0051, Training Accuracy: 100.00%\n", 620 | "Epoch: 7, Idx: 200, Training Loss: 0.0011, Training Accuracy: 100.00%\n", 621 | "Epoch: 7, Idx: 300, Training Loss: 0.0009, Training Accuracy: 100.00%\n", 622 | "Epoch: 7, Idx: 400, Training Loss: 0.0038, Training Accuracy: 100.00%\n", 623 | "Epoch: 7, Idx: 500, Training Loss: 0.0092, Training Accuracy: 100.00%\n", 624 | "Epoch: 07, Train Loss: 0.021, Train Acc: 99.27%, Val. Loss: 0.816149, Val. Acc: 81.73%\n", 625 | "Epoch: 8, Idx: 100, Training Loss: 0.0013, Training Accuracy: 100.00%\n", 626 | "Epoch: 8, Idx: 200, Training Loss: 0.0011, Training Accuracy: 100.00%\n", 627 | "Epoch: 8, Idx: 300, Training Loss: 0.0069, Training Accuracy: 100.00%\n", 628 | "Epoch: 8, Idx: 400, Training Loss: 0.0105, Training Accuracy: 100.00%\n", 629 | "Epoch: 8, Idx: 500, Training Loss: 0.0003, Training Accuracy: 100.00%\n", 630 | "Epoch: 08, Train Loss: 0.016, Train Acc: 99.34%, Val. Loss: 0.735093, Val. Acc: 80.49%\n", 631 | "Epoch: 9, Idx: 100, Training Loss: 0.0003, Training Accuracy: 100.00%\n", 632 | "Epoch: 9, Idx: 200, Training Loss: 0.0004, Training Accuracy: 100.00%\n", 633 | "Epoch: 9, Idx: 300, Training Loss: 0.0004, Training Accuracy: 100.00%\n", 634 | "Epoch: 9, Idx: 400, Training Loss: 0.0025, Training Accuracy: 100.00%\n", 635 | "Epoch: 9, Idx: 500, Training Loss: 0.0021, Training Accuracy: 100.00%\n", 636 | "Epoch: 09, Train Loss: 0.008, Train Acc: 99.58%, Val. Loss: 1.018735, Val. Acc: 81.88%\n", 637 | "Epoch: 10, Idx: 100, Training Loss: 0.0025, Training Accuracy: 100.00%\n", 638 | "Epoch: 10, Idx: 200, Training Loss: 0.0192, Training Accuracy: 100.00%\n", 639 | "Epoch: 10, Idx: 300, Training Loss: 0.0002, Training Accuracy: 100.00%\n", 640 | "Epoch: 10, Idx: 400, Training Loss: 0.0000, Training Accuracy: 100.00%\n", 641 | "Epoch: 10, Idx: 500, Training Loss: 0.0082, Training Accuracy: 100.00%\n", 642 | "Epoch: 10, Train Loss: 0.011, Train Acc: 99.55%, Val. Loss: 0.810989, Val. Acc: 82.54%\n", 643 | "Test Loss: 0.940, Test Acc: 80.09%\n" 644 | ] 645 | } 646 | ], 647 | "source": [ 648 | "model = AttentionModel(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)\n", 649 | "loss_fn = F.cross_entropy\n", 650 | "\n", 651 | "for epoch in range(10):\n", 652 | " train_loss, train_acc = train_model(model, train_iter, epoch)\n", 653 | " val_loss, val_acc = eval_model(model, valid_iter)\n", 654 | " \n", 655 | " print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%')\n", 656 | " \n", 657 | "test_loss, test_acc = eval_model(model, test_iter)\n", 658 | "print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')" 659 | ] 660 | }, 661 | { 662 | "cell_type": "code", 663 | "execution_count": 11, 664 | "metadata": {}, 665 | "outputs": [ 666 | { 667 | "name": "stdout", 668 | "output_type": "stream", 669 | "text": [ 670 | "tensor([[3.2883e-07, 1.0000e+00]], device='cuda:0', grad_fn=)\n", 671 | "Sentiment: Positive\n" 672 | ] 673 | }, 674 | { 675 | "name": "stderr", 676 | "output_type": "stream", 677 | "text": [ 678 | "c:\\users\\justin\\venv\\lib\\site-packages\\ipykernel_launcher.py:8: UserWarning: volatile was removed and now has no effect. Use `with torch.no_grad():` instead.\n", 679 | " \n" 680 | ] 681 | } 682 | ], 683 | "source": [ 684 | "test_sen1 = \"This is one of the best creation of Nolan. I can say, it's his magnum opus. Loved the soundtrack and especially those creative dialogues.\"\n", 685 | "\n", 686 | "test_sen1 = TEXT.preprocess(test_sen1)\n", 687 | "test_sen1 = [[TEXT.vocab.stoi[x] for x in test_sen1]]\n", 688 | "\n", 689 | "test_sen = np.asarray(test_sen1)\n", 690 | "test_sen = torch.LongTensor(test_sen)\n", 691 | "test_tensor = Variable(test_sen, volatile=True)\n", 692 | "test_tensor = test_tensor.cuda()\n", 693 | "model.eval()\n", 694 | "output = model(test_tensor, 1)\n", 695 | "out = F.softmax(output, 1)\n", 696 | "print(out)\n", 697 | "if (torch.argmax(out[0]) == 1):\n", 698 | " print (\"Sentiment: Positive\")\n", 699 | "else:\n", 700 | " print (\"Sentiment: Negative\")" 701 | ] 702 | }, 703 | { 704 | "cell_type": "code", 705 | "execution_count": 12, 706 | "metadata": {}, 707 | "outputs": [ 708 | { 709 | "name": "stdout", 710 | "output_type": "stream", 711 | "text": [ 712 | "tensor([[1.0000e+00, 1.0964e-06]], device='cuda:0', grad_fn=)\n", 713 | "Sentiment: Negative\n" 714 | ] 715 | }, 716 | { 717 | "name": "stderr", 718 | "output_type": "stream", 719 | "text": [ 720 | "c:\\users\\justin\\venv\\lib\\site-packages\\ipykernel_launcher.py:8: UserWarning: volatile was removed and now has no effect. Use `with torch.no_grad():` instead.\n", 721 | " \n" 722 | ] 723 | } 724 | ], 725 | "source": [ 726 | "test_sen2 = \"Ohh, such a ridiculous movie. Not gonna recommend it to anyone. Complete waste of time and money.\"\n", 727 | "test_sen2 = TEXT.preprocess(test_sen2)\n", 728 | "test_sen2 = [[TEXT.vocab.stoi[x] for x in test_sen2]]\n", 729 | "\n", 730 | "\n", 731 | "test_sen = np.asarray(test_sen2)\n", 732 | "test_sen = torch.LongTensor(test_sen)\n", 733 | "test_tensor = Variable(test_sen, volatile=True)\n", 734 | "test_tensor = test_tensor.cuda()\n", 735 | "model.eval()\n", 736 | "output = model(test_tensor, 1)\n", 737 | "out = F.softmax(output, 1)\n", 738 | "print(out)\n", 739 | "if (torch.argmax(out[0]) == 1):\n", 740 | " print (\"Sentiment: Positive\")\n", 741 | "else:\n", 742 | " print (\"Sentiment: Negative\")" 743 | ] 744 | } 745 | ], 746 | "metadata": { 747 | "kernelspec": { 748 | "display_name": "Python 3", 749 | "language": "python", 750 | "name": "python3" 751 | }, 752 | "language_info": { 753 | "codemirror_mode": { 754 | "name": "ipython", 755 | "version": 3 756 | }, 757 | "file_extension": ".py", 758 | "mimetype": "text/x-python", 759 | "name": "python", 760 | "nbconvert_exporter": "python", 761 | "pygments_lexer": "ipython3", 762 | "version": "3.6.8" 763 | } 764 | }, 765 | "nbformat": 4, 766 | "nbformat_minor": 4 767 | } 768 | -------------------------------------------------------------------------------- /4_NMT.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import torch\n", 10 | "import torch.nn as nn\n", 11 | "import torch.optim as optim\n", 12 | "\n", 13 | "from torchtext.datasets import TranslationDataset, Multi30k\n", 14 | "from torchtext.data import Field, BucketIterator\n", 15 | "\n", 16 | "import spacy\n", 17 | "import numpy as np\n", 18 | "\n", 19 | "import random\n", 20 | "import math\n", 21 | "import time" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "SEED = 1234\n", 31 | "\n", 32 | "random.seed(SEED)\n", 33 | "np.random.seed(SEED)\n", 34 | "torch.manual_seed(SEED)\n", 35 | "torch.cuda.manual_seed(SEED)\n", 36 | "torch.backends.cudnn.deterministic = True" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "spacy_en = spacy.load('en_core_web_sm')\n", 46 | "spacy_de = spacy.load('de_core_news_sm')" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 4, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "def tokenize_de(text):\n", 56 | " return [tok.text for tok in spacy_de.tokenizer(text)][::-1]\n", 57 | "\n", 58 | "def tokenize_en(text):\n", 59 | " return [tok.text for tok in spacy_en.tokenizer(text)]" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 5, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "SRC = Field(tokenize = tokenize_en, \n", 69 | " init_token = '', \n", 70 | " eos_token = '', \n", 71 | " lower = True)\n", 72 | "\n", 73 | "TRG = Field(tokenize = tokenize_de, \n", 74 | " init_token = '', \n", 75 | " eos_token = '', \n", 76 | " lower = True)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 6, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), \n", 86 | " fields = (SRC, TRG))" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 7, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "name": "stdout", 96 | "output_type": "stream", 97 | "text": [ 98 | "Number of training examples: 29000\n", 99 | "Number of validation examples: 1014\n", 100 | "Number of testing examples: 1000\n" 101 | ] 102 | } 103 | ], 104 | "source": [ 105 | "print(f\"Number of training examples: {len(train_data.examples)}\")\n", 106 | "print(f\"Number of validation examples: {len(valid_data.examples)}\")\n", 107 | "print(f\"Number of testing examples: {len(test_data.examples)}\")" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 8, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "{'src': ['zwei', 'junge', 'weiße', 'männer', 'sind', 'i', 'm', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.'], 'trg': ['.', 'bushes', 'many', 'near', 'outside', 'are', 'males', 'white', ',', 'young', 'two']}\n" 120 | ] 121 | } 122 | ], 123 | "source": [ 124 | "print(vars(train_data.examples[0]))" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 9, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "SRC.build_vocab(train_data, min_freq = 2)\n", 134 | "TRG.build_vocab(train_data, min_freq = 2)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 10, 140 | "metadata": {}, 141 | "outputs": [ 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | "Unique tokens in source (de) vocabulary: 7873\n", 147 | "Unique tokens in target (en) vocabulary: 5923\n" 148 | ] 149 | } 150 | ], 151 | "source": [ 152 | "print(f\"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}\")\n", 153 | "print(f\"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}\")" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 11, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 12, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "BATCH_SIZE = 128\n", 172 | "\n", 173 | "train_iterator, valid_iterator, test_iterator = BucketIterator.splits(\n", 174 | " (train_data, valid_data, test_data), \n", 175 | " batch_size = BATCH_SIZE, \n", 176 | " device = device)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 13, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "class Encoder(nn.Module):\n", 186 | " def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):\n", 187 | " super().__init__()\n", 188 | " \n", 189 | " self.hid_dim = hid_dim\n", 190 | " self.n_layers = n_layers\n", 191 | " self.embedding = nn.Embedding(input_dim, emb_dim)\n", 192 | " self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)\n", 193 | " self.dropout = nn.Dropout(dropout)\n", 194 | " \n", 195 | " def forward(self, src):\n", 196 | " embedded = self.dropout(self.embedding(src))\n", 197 | " outputs, (hidden, cell) = self.rnn(embedded)\n", 198 | " return hidden, cell" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 14, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "class Decoder(nn.Module):\n", 208 | " def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):\n", 209 | " super().__init__()\n", 210 | " \n", 211 | " self.output_dim = output_dim\n", 212 | " self.hid_dim = hid_dim\n", 213 | " self.n_layers = n_layers\n", 214 | " self.embedding = nn.Embedding(output_dim, emb_dim)\n", 215 | " self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)\n", 216 | " self.fc_out = nn.Linear(hid_dim, output_dim)\n", 217 | " self.dropout = nn.Dropout(dropout)\n", 218 | " \n", 219 | " def forward(self, input, hidden, cell): \n", 220 | " input = input.unsqueeze(0)\n", 221 | " embedded = self.dropout(self.embedding(input))\n", 222 | " output, (hidden, cell) = self.rnn(embedded, (hidden, cell))\n", 223 | " prediction = self.fc_out(output.squeeze(0))\n", 224 | " return prediction, hidden, cell" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 15, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "class Seq2Seq(nn.Module):\n", 234 | " def __init__(self, encoder, decoder, device):\n", 235 | " super().__init__()\n", 236 | " \n", 237 | " self.encoder = encoder\n", 238 | " self.decoder = decoder\n", 239 | " self.device = device\n", 240 | " \n", 241 | " assert encoder.hid_dim == decoder.hid_dim, \\\n", 242 | " \"Hidden dimensions of encoder and decoder must be equal!\"\n", 243 | " assert encoder.n_layers == decoder.n_layers, \\\n", 244 | " \"Encoder and decoder must have equal number of layers!\"\n", 245 | " \n", 246 | " def forward(self, src, trg, teacher_forcing_ratio = 0.5):\n", 247 | " \n", 248 | " batch_size = trg.shape[1]\n", 249 | " trg_len = trg.shape[0]\n", 250 | " trg_vocab_size = self.decoder.output_dim\n", 251 | " outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)\n", 252 | " hidden, cell = self.encoder(src)\n", 253 | " input = trg[0,:]\n", 254 | " \n", 255 | " for t in range(1, trg_len):\n", 256 | " output, hidden, cell = self.decoder(input, hidden, cell)\n", 257 | " outputs[t] = output\n", 258 | " teacher_force = random.random() < teacher_forcing_ratio\n", 259 | " top1 = output.argmax(1) \n", 260 | " input = trg[t] if teacher_force else top1\n", 261 | " return outputs" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 16, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "\n", 271 | "INPUT_DIM = len(SRC.vocab)\n", 272 | "OUTPUT_DIM = len(TRG.vocab)\n", 273 | "ENC_EMB_DIM = 256\n", 274 | "DEC_EMB_DIM = 256\n", 275 | "HID_DIM = 512\n", 276 | "N_LAYERS = 2\n", 277 | "ENC_DROPOUT = 0.5\n", 278 | "DEC_DROPOUT = 0.5\n", 279 | "\n", 280 | "enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)\n", 281 | "dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)\n", 282 | "\n", 283 | "model = Seq2Seq(enc, dec, device).to(device)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 17, 289 | "metadata": {}, 290 | "outputs": [ 291 | { 292 | "data": { 293 | "text/plain": [ 294 | "Seq2Seq(\n", 295 | " (encoder): Encoder(\n", 296 | " (embedding): Embedding(7873, 256)\n", 297 | " (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)\n", 298 | " (dropout): Dropout(p=0.5, inplace=False)\n", 299 | " )\n", 300 | " (decoder): Decoder(\n", 301 | " (embedding): Embedding(5923, 256)\n", 302 | " (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)\n", 303 | " (fc_out): Linear(in_features=512, out_features=5923, bias=True)\n", 304 | " (dropout): Dropout(p=0.5, inplace=False)\n", 305 | " )\n", 306 | ")" 307 | ] 308 | }, 309 | "execution_count": 17, 310 | "metadata": {}, 311 | "output_type": "execute_result" 312 | } 313 | ], 314 | "source": [ 315 | "def init_weights(m):\n", 316 | " for name, param in m.named_parameters():\n", 317 | " nn.init.uniform_(param.data, -0.08, 0.08)\n", 318 | " \n", 319 | "model.apply(init_weights)" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 18, 325 | "metadata": {}, 326 | "outputs": [ 327 | { 328 | "name": "stdout", 329 | "output_type": "stream", 330 | "text": [ 331 | "The model has 13,926,691 trainable parameters\n" 332 | ] 333 | } 334 | ], 335 | "source": [ 336 | "def count_parameters(model):\n", 337 | " return sum(p.numel() for p in model.parameters() if p.requires_grad)\n", 338 | "\n", 339 | "print(f'The model has {count_parameters(model):,} trainable parameters')" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 19, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "optimizer = optim.Adam(model.parameters())" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": 20, 354 | "metadata": {}, 355 | "outputs": [], 356 | "source": [ 357 | "TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]\n", 358 | "criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 21, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "def train(model, iterator, optimizer, criterion, clip):\n", 368 | " \n", 369 | " model.train()\n", 370 | " epoch_loss = 0\n", 371 | " for i, batch in enumerate(iterator):\n", 372 | " src = batch.src\n", 373 | " trg = batch.trg\n", 374 | " optimizer.zero_grad()\n", 375 | " output = model(src, trg)\n", 376 | " output_dim = output.shape[-1]\n", 377 | " output = output[1:].view(-1, output_dim)\n", 378 | " trg = trg[1:].view(-1)\n", 379 | " loss = criterion(output, trg)\n", 380 | " loss.backward()\n", 381 | " torch.nn.utils.clip_grad_norm_(model.parameters(), clip)\n", 382 | " optimizer.step()\n", 383 | " epoch_loss += loss.item()\n", 384 | " return epoch_loss / len(iterator)" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 22, 390 | "metadata": {}, 391 | "outputs": [], 392 | "source": [ 393 | "def evaluate(model, iterator, criterion):\n", 394 | " \n", 395 | " model.eval()\n", 396 | " epoch_loss = 0\n", 397 | " with torch.no_grad():\n", 398 | " for i, batch in enumerate(iterator):\n", 399 | " src = batch.src\n", 400 | " trg = batch.trg\n", 401 | " output = model(src, trg, 0)\n", 402 | " output_dim = output.shape[-1]\n", 403 | " output = output[1:].view(-1, output_dim)\n", 404 | " trg = trg[1:].view(-1)\n", 405 | " loss = criterion(output, trg)\n", 406 | " epoch_loss += loss.item()\n", 407 | " return epoch_loss / len(iterator)" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 23, 413 | "metadata": {}, 414 | "outputs": [], 415 | "source": [ 416 | "def epoch_time(start_time, end_time):\n", 417 | " elapsed_time = end_time - start_time\n", 418 | " elapsed_mins = int(elapsed_time / 60)\n", 419 | " elapsed_secs = int(elapsed_time - (elapsed_mins * 60))\n", 420 | " return elapsed_mins, elapsed_secs" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 24, 426 | "metadata": {}, 427 | "outputs": [ 428 | { 429 | "name": "stdout", 430 | "output_type": "stream", 431 | "text": [ 432 | "Epoch: 01 | Time: 0m 34s\n", 433 | "\tTrain Loss: 4.985 | Train PPL: 146.191\n", 434 | "\t Val. Loss: 4.928 | Val. PPL: 138.108\n", 435 | "Epoch: 02 | Time: 0m 34s\n", 436 | "\tTrain Loss: 4.462 | Train PPL: 86.666\n", 437 | "\t Val. Loss: 4.883 | Val. PPL: 131.987\n", 438 | "Epoch: 03 | Time: 0m 34s\n", 439 | "\tTrain Loss: 4.200 | Train PPL: 66.677\n", 440 | "\t Val. Loss: 4.602 | Val. PPL: 99.726\n", 441 | "Epoch: 04 | Time: 0m 34s\n", 442 | "\tTrain Loss: 3.999 | Train PPL: 54.560\n", 443 | "\t Val. Loss: 4.467 | Val. PPL: 87.056\n", 444 | "Epoch: 05 | Time: 0m 34s\n", 445 | "\tTrain Loss: 3.828 | Train PPL: 45.983\n", 446 | "\t Val. Loss: 4.386 | Val. PPL: 80.279\n", 447 | "Epoch: 06 | Time: 0m 34s\n", 448 | "\tTrain Loss: 3.653 | Train PPL: 38.600\n", 449 | "\t Val. Loss: 4.248 | Val. PPL: 69.934\n", 450 | "Epoch: 07 | Time: 0m 34s\n", 451 | "\tTrain Loss: 3.489 | Train PPL: 32.764\n", 452 | "\t Val. Loss: 4.083 | Val. PPL: 59.326\n", 453 | "Epoch: 08 | Time: 0m 34s\n", 454 | "\tTrain Loss: 3.339 | Train PPL: 28.182\n", 455 | "\t Val. Loss: 4.000 | Val. PPL: 54.601\n", 456 | "Epoch: 09 | Time: 0m 34s\n", 457 | "\tTrain Loss: 3.189 | Train PPL: 24.269\n", 458 | "\t Val. Loss: 3.956 | Val. PPL: 52.262\n", 459 | "Epoch: 10 | Time: 0m 34s\n", 460 | "\tTrain Loss: 3.056 | Train PPL: 21.245\n", 461 | "\t Val. Loss: 3.917 | Val. PPL: 50.249\n" 462 | ] 463 | } 464 | ], 465 | "source": [ 466 | "N_EPOCHS = 10\n", 467 | "CLIP = 1\n", 468 | "\n", 469 | "best_valid_loss = float('inf')\n", 470 | "\n", 471 | "for epoch in range(N_EPOCHS):\n", 472 | " \n", 473 | " start_time = time.time()\n", 474 | " \n", 475 | " train_loss = train(model, train_iterator, optimizer, criterion, CLIP)\n", 476 | " valid_loss = evaluate(model, valid_iterator, criterion)\n", 477 | " \n", 478 | " end_time = time.time()\n", 479 | " \n", 480 | " epoch_mins, epoch_secs = epoch_time(start_time, end_time)\n", 481 | " \n", 482 | " if valid_loss < best_valid_loss:\n", 483 | " best_valid_loss = valid_loss\n", 484 | " torch.save(model.state_dict(), 'tut1-model.pt')\n", 485 | " \n", 486 | " print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')\n", 487 | " print(f'\\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')\n", 488 | " print(f'\\t Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f}')" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": 25, 494 | "metadata": {}, 495 | "outputs": [ 496 | { 497 | "name": "stdout", 498 | "output_type": "stream", 499 | "text": [ 500 | "| Test Loss: 4.011 | Test PPL: 55.177 |\n" 501 | ] 502 | } 503 | ], 504 | "source": [ 505 | "model.load_state_dict(torch.load('tut1-model.pt'))\n", 506 | "\n", 507 | "test_loss = evaluate(model, test_iterator, criterion)\n", 508 | "\n", 509 | "print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": null, 515 | "metadata": {}, 516 | "outputs": [], 517 | "source": [] 518 | } 519 | ], 520 | "metadata": { 521 | "kernelspec": { 522 | "display_name": "Python 3", 523 | "language": "python", 524 | "name": "python3" 525 | }, 526 | "language_info": { 527 | "codemirror_mode": { 528 | "name": "ipython", 529 | "version": 3 530 | }, 531 | "file_extension": ".py", 532 | "mimetype": "text/x-python", 533 | "name": "python", 534 | "nbconvert_exporter": "python", 535 | "pygments_lexer": "ipython3", 536 | "version": "3.6.8" 537 | } 538 | }, 539 | "nbformat": 4, 540 | "nbformat_minor": 4 541 | } 542 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # torch_nlp_basic 2 | - Basic Concept to understand Natural Language Process 3 | - Please contact to me by e-mail 4 | --------------------------------------------------------------------------------