├── 1_Background_of_NLP.ipynb
├── 2_Representation_Vector.ipynb
├── 3_Tagging_RNN.ipynb
├── 4_NMT.ipynb
└── README.md
/1_Background_of_NLP.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "eng_data = pd.read_csv(\"../data/IMDB Dataset.csv\")"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 2,
16 | "metadata": {},
17 | "outputs": [
18 | {
19 | "data": {
20 | "text/plain": [
21 | "(50000, 2)"
22 | ]
23 | },
24 | "execution_count": 2,
25 | "metadata": {},
26 | "output_type": "execute_result"
27 | }
28 | ],
29 | "source": [
30 | "eng_data.shape"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 3,
36 | "metadata": {},
37 | "outputs": [
38 | {
39 | "data": {
40 | "text/html": [
41 | "
\n",
42 | "\n",
55 | "
\n",
56 | " \n",
57 | " \n",
58 | " \n",
59 | " review \n",
60 | " sentiment \n",
61 | " \n",
62 | " \n",
63 | " \n",
64 | " \n",
65 | " 0 \n",
66 | " One of the other reviewers has mentioned that ... \n",
67 | " positive \n",
68 | " \n",
69 | " \n",
70 | " 1 \n",
71 | " A wonderful little production. <br /><br />The... \n",
72 | " positive \n",
73 | " \n",
74 | " \n",
75 | " 2 \n",
76 | " I thought this was a wonderful way to spend ti... \n",
77 | " positive \n",
78 | " \n",
79 | " \n",
80 | " 3 \n",
81 | " Basically there's a family where a little boy ... \n",
82 | " negative \n",
83 | " \n",
84 | " \n",
85 | " 4 \n",
86 | " Petter Mattei's \"Love in the Time of Money\" is... \n",
87 | " positive \n",
88 | " \n",
89 | " \n",
90 | "
\n",
91 | "
"
92 | ],
93 | "text/plain": [
94 | " review sentiment\n",
95 | "0 One of the other reviewers has mentioned that ... positive\n",
96 | "1 A wonderful little production. The... positive\n",
97 | "2 I thought this was a wonderful way to spend ti... positive\n",
98 | "3 Basically there's a family where a little boy ... negative\n",
99 | "4 Petter Mattei's \"Love in the Time of Money\" is... positive"
100 | ]
101 | },
102 | "execution_count": 3,
103 | "metadata": {},
104 | "output_type": "execute_result"
105 | }
106 | ],
107 | "source": [
108 | "eng_data.head(5)"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 4,
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "from bs4 import BeautifulSoup\n",
118 | "from nltk.tokenize.toktok import ToktokTokenizer\n",
119 | "import re\n",
120 | "import nltk\n",
121 | "\n",
122 | "def strip_html(text):\n",
123 | " soup = BeautifulSoup(text, \"html.parser\")\n",
124 | " return soup.get_text()\n",
125 | "\n",
126 | "def remove_between_square_brackets(text):\n",
127 | " return re.sub('\\[[^]]*\\]', '', text)\n",
128 | "\n",
129 | "def remove_special_characters(text, remove_digits = True):\n",
130 | " pattern=r'[^a-zA-z0-9\\s]'\n",
131 | " text=re.sub(pattern,'',text)\n",
132 | " return text\n",
133 | "\n",
134 | "def remove_stopwords(text, is_lower_case = False):\n",
135 | " tokenizer = ToktokTokenizer()\n",
136 | " stopword_list = nltk.corpus.stopwords.words('english')\n",
137 | " tokens = tokenizer.tokenize(text)\n",
138 | " tokens = [token.strip() for token in tokens]\n",
139 | " if is_lower_case:\n",
140 | " filtered_tokens = [token for token in tokens if token not in stopword_list]\n",
141 | " else:\n",
142 | " filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]\n",
143 | " filtered_text = ' '.join(filtered_tokens) \n",
144 | " return filtered_text\n",
145 | "\n",
146 | "def text_cleaning(text):\n",
147 | " text = strip_html(text)\n",
148 | " text = remove_between_square_brackets(text)\n",
149 | " text = remove_special_characters(text, remove_digits = True)\n",
150 | " text = remove_stopwords(text, is_lower_case = False)\n",
151 | " return text"
152 | ]
153 | },
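{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Note (added sketch):* `remove_stopwords` above and `word_tokenize` further below rely on NLTK corpora that are not bundled with the library itself. On a fresh environment the one-time downloads in the next cell are needed first (depending on the NLTK version, `punkt_tab` may be required instead of `punkt`). The toy string is a hypothetical example, not taken from the dataset."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# One-time NLTK resources used by this notebook:\n",
"# 'stopwords' backs remove_stopwords, 'punkt' backs word_tokenize later on.\n",
"nltk.download('stopwords')\n",
"nltk.download('punkt')\n",
"\n",
"# Quick sanity check of the full pipeline on a toy string (illustration only)\n",
"text_cleaning(\"<b>An example</b> [aside] with some stopwords and HTML!\")"
]
},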
154 | {
155 | "cell_type": "code",
156 | "execution_count": 5,
157 | "metadata": {},
158 | "outputs": [
159 | {
160 | "data": {
161 | "text/plain": [
162 | "'I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer). While some may be disappointed when they realize this is not Match Point 2: Risk Addiction, I thought it was proof that Woody Allen is still fully in control of the style many of us have grown to love. This was the most I\\'d laughed at one of Woody\\'s comedies in years (dare I say a decade?). While I\\'ve never been impressed with Scarlet Johanson, in this she managed to tone down her \"sexy\" image and jumped right into a average, but spirited young woman. This may not be the crown jewel of his career, but it was wittier than \"Devil Wears Prada\" and more interesting than \"Superman\" a great comedy to go see with friends.'"
163 | ]
164 | },
165 | "execution_count": 5,
166 | "metadata": {},
167 | "output_type": "execute_result"
168 | }
169 | ],
170 | "source": [
171 | "eng_data[\"review\"][2]"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": 6,
177 | "metadata": {},
178 | "outputs": [
179 | {
180 | "data": {
181 | "text/plain": [
182 | "'I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer). While some may be disappointed when they realize this is not Match Point 2: Risk Addiction, I thought it was proof that Woody Allen is still fully in control of the style many of us have grown to love.This was the most I\\'d laughed at one of Woody\\'s comedies in years (dare I say a decade?). While I\\'ve never been impressed with Scarlet Johanson, in this she managed to tone down her \"sexy\" image and jumped right into a average, but spirited young woman.This may not be the crown jewel of his career, but it was wittier than \"Devil Wears Prada\" and more interesting than \"Superman\" a great comedy to go see with friends.'"
183 | ]
184 | },
185 | "execution_count": 6,
186 | "metadata": {},
187 | "output_type": "execute_result"
188 | }
189 | ],
190 | "source": [
191 | "strip_html(eng_data[\"review\"][2])"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": 7,
197 | "metadata": {},
198 | "outputs": [],
199 | "source": [
200 | "eng_data[\"review\"] = eng_data[\"review\"].apply(text_cleaning)"
201 | ]
202 | },
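{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Note (added sketch):* the `apply` above calls `remove_stopwords` once per review, and that function rebuilds the `ToktokTokenizer` and re-reads the stopword list on every call. Over 50,000 reviews this is noticeably slow; the cell below is a minimal variant (not part of the original pipeline) that hoists both out and uses a `set` for O(1) membership tests."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: same behaviour as remove_stopwords(is_lower_case=False), built once\n",
"_tokenizer = ToktokTokenizer()\n",
"_stopwords = set(nltk.corpus.stopwords.words('english'))\n",
"\n",
"def remove_stopwords_fast(text):\n",
"    tokens = [t.strip() for t in _tokenizer.tokenize(text)]\n",
"    return ' '.join(t for t in tokens if t.lower() not in _stopwords)"
]
},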
203 | {
204 | "cell_type": "code",
205 | "execution_count": 8,
206 | "metadata": {},
207 | "outputs": [
208 | {
209 | "data": {
210 | "text/plain": [
211 | "'wonderful little production filming technique unassuming oldtimeBBC fashion gives comforting sometimes discomforting sense realism entire piece actors extremely well chosen Michael Sheen got polari voices pat truly see seamless editing guided references Williams diary entries well worth watching terrificly written performed piece masterful production one great masters comedy life realism really comes home little things fantasy guard rather use traditional dream techniques remains solid disappears plays knowledge senses particularly scenes concerning Orton Halliwell sets particularly flat Halliwells murals decorating every surface terribly well done'"
212 | ]
213 | },
214 | "execution_count": 8,
215 | "metadata": {},
216 | "output_type": "execute_result"
217 | }
218 | ],
219 | "source": [
220 | "eng_data[\"review\"][1]"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": 9,
226 | "metadata": {},
227 | "outputs": [],
228 | "source": [
229 | "from nltk.tokenize import word_tokenize\n",
230 | "vocab_lst = [word_tokenize(x) for x in eng_data[\"review\"]]"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": 10,
236 | "metadata": {},
237 | "outputs": [
238 | {
239 | "data": {
240 | "text/plain": [
241 | "[('movie', 82310),\n",
242 | " ('film', 73514),\n",
243 | " ('one', 46301),\n",
244 | " ('like', 37483),\n",
245 | " ('good', 27403),\n",
246 | " ('would', 23751),\n",
247 | " ('time', 22741),\n",
248 | " ('really', 22207),\n",
249 | " ('see', 21765),\n",
250 | " ('even', 21494)]"
251 | ]
252 | },
253 | "execution_count": 10,
254 | "metadata": {},
255 | "output_type": "execute_result"
256 | }
257 | ],
258 | "source": [
259 | "from collections import Counter\n",
260 | "vocab_lst2 = [y for x in vocab_lst for y in x]\n",
261 | "Counter(vocab_lst2).most_common(10)"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": 11,
267 | "metadata": {},
268 | "outputs": [],
269 | "source": [
270 | "vocab_lst3 = list(Counter(vocab_lst2).keys())\n",
271 | "vocab_to_index = {word: index for index, word in enumerate(vocab_lst3)}\n",
272 | "index_to_vocab = {index: word for index, word in enumerate(vocab_lst3)}"
273 | ]
274 | },
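{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Note (added sketch):* the two dicts above assign ids in order of first occurrence in the corpus (not by frequency) and are exact inverses of each other. The round-trip below is a minimal illustration of how they would be used to encode a review as integer ids and decode it back; it assumes the objects built in the preceding cells."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Encode the first cleaned review into ids, then decode it back.\n",
"sample_tokens = word_tokenize(eng_data[\"review\"][0])\n",
"encoded = [vocab_to_index[tok] for tok in sample_tokens]\n",
"decoded = [index_to_vocab[i] for i in encoded]\n",
"print(encoded[:10])\n",
"print(decoded[:10])\n",
"assert decoded == sample_tokens  # lossless: every corpus token has a unique id"
]
},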
275 | {
276 | "cell_type": "code",
277 | "execution_count": 12,
278 | "metadata": {
279 | "scrolled": true
280 | },
281 | "outputs": [
282 | {
283 | "data": {
284 | "text/plain": [
285 | "{0: 'One',\n",
286 | " 1: 'reviewers',\n",
287 | " 2: 'mentioned',\n",
288 | " 3: 'watching',\n",
289 | " 4: '1',\n",
290 | " 5: 'Oz',\n",
291 | " 6: 'episode',\n",
292 | " 7: 'youll',\n",
293 | " 8: 'hooked',\n",
294 | " 9: 'right',\n",
295 | " 10: 'exactly',\n",
296 | " 11: 'happened',\n",
297 | " 12: 'meThe',\n",
298 | " 13: 'first',\n",
299 | " 14: 'thing',\n",
300 | " 15: 'struck',\n",
301 | " 16: 'brutality',\n",
302 | " 17: 'unflinching',\n",
303 | " 18: 'scenes',\n",
304 | " 19: 'violence',\n",
305 | " 20: 'set',\n",
306 | " 21: 'word',\n",
307 | " 22: 'GO',\n",
308 | " 23: 'Trust',\n",
309 | " 24: 'show',\n",
310 | " 25: 'faint',\n",
311 | " 26: 'hearted',\n",
312 | " 27: 'timid',\n",
313 | " 28: 'pulls',\n",
314 | " 29: 'punches',\n",
315 | " 30: 'regards',\n",
316 | " 31: 'drugs',\n",
317 | " 32: 'sex',\n",
318 | " 33: 'hardcore',\n",
319 | " 34: 'classic',\n",
320 | " 35: 'use',\n",
321 | " 36: 'wordIt',\n",
322 | " 37: 'called',\n",
323 | " 38: 'OZ',\n",
324 | " 39: 'nickname',\n",
325 | " 40: 'given',\n",
326 | " 41: 'Oswald',\n",
327 | " 42: 'Maximum',\n",
328 | " 43: 'Security',\n",
329 | " 44: 'State',\n",
330 | " 45: 'Penitentary',\n",
331 | " 46: 'focuses',\n",
332 | " 47: 'mainly',\n",
333 | " 48: 'Emerald',\n",
334 | " 49: 'City',\n",
335 | " 50: 'experimental',\n",
336 | " 51: 'section',\n",
337 | " 52: 'prison',\n",
338 | " 53: 'cells',\n",
339 | " 54: 'glass',\n",
340 | " 55: 'fronts',\n",
341 | " 56: 'face',\n",
342 | " 57: 'inwards',\n",
343 | " 58: 'privacy',\n",
344 | " 59: 'high',\n",
345 | " 60: 'agenda',\n",
346 | " 61: 'Em',\n",
347 | " 62: 'home',\n",
348 | " 63: 'manyAryans',\n",
349 | " 64: 'Muslims',\n",
350 | " 65: 'gangstas',\n",
351 | " 66: 'Latinos',\n",
352 | " 67: 'Christians',\n",
353 | " 68: 'Italians',\n",
354 | " 69: 'Irish',\n",
355 | " 70: 'moreso',\n",
356 | " 71: 'scuffles',\n",
357 | " 72: 'death',\n",
358 | " 73: 'stares',\n",
359 | " 74: 'dodgy',\n",
360 | " 75: 'dealings',\n",
361 | " 76: 'shady',\n",
362 | " 77: 'agreements',\n",
363 | " 78: 'never',\n",
364 | " 79: 'far',\n",
365 | " 80: 'awayI',\n",
366 | " 81: 'would',\n",
367 | " 82: 'say',\n",
368 | " 83: 'main',\n",
369 | " 84: 'appeal',\n",
370 | " 85: 'due',\n",
371 | " 86: 'fact',\n",
372 | " 87: 'goes',\n",
373 | " 88: 'shows',\n",
374 | " 89: 'wouldnt',\n",
375 | " 90: 'dare',\n",
376 | " 91: 'Forget',\n",
377 | " 92: 'pretty',\n",
378 | " 93: 'pictures',\n",
379 | " 94: 'painted',\n",
380 | " 95: 'mainstream',\n",
381 | " 96: 'audiences',\n",
382 | " 97: 'forget',\n",
383 | " 98: 'charm',\n",
384 | " 99: 'romanceOZ',\n",
385 | " 100: 'doesnt',\n",
386 | " 101: 'mess',\n",
387 | " 102: 'around',\n",
388 | " 103: 'ever',\n",
389 | " 104: 'saw',\n",
390 | " 105: 'nasty',\n",
391 | " 106: 'surreal',\n",
392 | " 107: 'couldnt',\n",
393 | " 108: 'ready',\n",
394 | " 109: 'watched',\n",
395 | " 110: 'developed',\n",
396 | " 111: 'taste',\n",
397 | " 112: 'got',\n",
398 | " 113: 'accustomed',\n",
399 | " 114: 'levels',\n",
400 | " 115: 'graphic',\n",
401 | " 116: 'injustice',\n",
402 | " 117: 'crooked',\n",
403 | " 118: 'guards',\n",
404 | " 119: 'wholl',\n",
405 | " 120: 'sold',\n",
406 | " 121: 'nickel',\n",
407 | " 122: 'inmates',\n",
408 | " 123: 'kill',\n",
409 | " 124: 'order',\n",
410 | " 125: 'get',\n",
411 | " 126: 'away',\n",
412 | " 127: 'well',\n",
413 | " 128: 'mannered',\n",
414 | " 129: 'middle',\n",
415 | " 130: 'class',\n",
416 | " 131: 'turned',\n",
417 | " 132: 'bitches',\n",
418 | " 133: 'lack',\n",
419 | " 134: 'street',\n",
420 | " 135: 'skills',\n",
421 | " 136: 'experience',\n",
422 | " 137: 'Watching',\n",
423 | " 138: 'may',\n",
424 | " 139: 'become',\n",
425 | " 140: 'comfortable',\n",
426 | " 141: 'uncomfortable',\n",
427 | " 142: 'viewingthats',\n",
428 | " 143: 'touch',\n",
429 | " 144: 'darker',\n",
430 | " 145: 'side',\n",
431 | " 146: 'wonderful',\n",
432 | " 147: 'little',\n",
433 | " 148: 'production',\n",
434 | " 149: 'filming',\n",
435 | " 150: 'technique',\n",
436 | " 151: 'unassuming',\n",
437 | " 152: 'oldtimeBBC',\n",
438 | " 153: 'fashion',\n",
439 | " 154: 'gives',\n",
440 | " 155: 'comforting',\n",
441 | " 156: 'sometimes',\n",
442 | " 157: 'discomforting',\n",
443 | " 158: 'sense',\n",
444 | " 159: 'realism',\n",
445 | " 160: 'entire',\n",
446 | " 161: 'piece',\n",
447 | " 162: 'actors',\n",
448 | " 163: 'extremely',\n",
449 | " 164: 'chosen',\n",
450 | " 165: 'Michael',\n",
451 | " 166: 'Sheen',\n",
452 | " 167: 'polari',\n",
453 | " 168: 'voices',\n",
454 | " 169: 'pat',\n",
455 | " 170: 'truly',\n",
456 | " 171: 'see',\n",
457 | " 172: 'seamless',\n",
458 | " 173: 'editing',\n",
459 | " 174: 'guided',\n",
460 | " 175: 'references',\n",
461 | " 176: 'Williams',\n",
462 | " 177: 'diary',\n",
463 | " 178: 'entries',\n",
464 | " 179: 'worth',\n",
465 | " 180: 'terrificly',\n",
466 | " 181: 'written',\n",
467 | " 182: 'performed',\n",
468 | " 183: 'masterful',\n",
469 | " 184: 'one',\n",
470 | " 185: 'great',\n",
471 | " 186: 'masters',\n",
472 | " 187: 'comedy',\n",
473 | " 188: 'life',\n",
474 | " 189: 'really',\n",
475 | " 190: 'comes',\n",
476 | " 191: 'things',\n",
477 | " 192: 'fantasy',\n",
478 | " 193: 'guard',\n",
479 | " 194: 'rather',\n",
480 | " 195: 'traditional',\n",
481 | " 196: 'dream',\n",
482 | " 197: 'techniques',\n",
483 | " 198: 'remains',\n",
484 | " 199: 'solid',\n",
485 | " 200: 'disappears',\n",
486 | " 201: 'plays',\n",
487 | " 202: 'knowledge',\n",
488 | " 203: 'senses',\n",
489 | " 204: 'particularly',\n",
490 | " 205: 'concerning',\n",
491 | " 206: 'Orton',\n",
492 | " 207: 'Halliwell',\n",
493 | " 208: 'sets',\n",
494 | " 209: 'flat',\n",
495 | " 210: 'Halliwells',\n",
496 | " 211: 'murals',\n",
497 | " 212: 'decorating',\n",
498 | " 213: 'every',\n",
499 | " 214: 'surface',\n",
500 | " 215: 'terribly',\n",
501 | " 216: 'done',\n",
502 | " 217: 'thought',\n",
503 | " 218: 'way',\n",
504 | " 219: 'spend',\n",
505 | " 220: 'time',\n",
506 | " 221: 'hot',\n",
507 | " 222: 'summer',\n",
508 | " 223: 'weekend',\n",
509 | " 224: 'sitting',\n",
510 | " 225: 'air',\n",
511 | " 226: 'conditioned',\n",
512 | " 227: 'theater',\n",
513 | " 228: 'lighthearted',\n",
514 | " 229: 'plot',\n",
515 | " 230: 'simplistic',\n",
516 | " 231: 'dialogue',\n",
517 | " 232: 'witty',\n",
518 | " 233: 'characters',\n",
519 | " 234: 'likable',\n",
520 | " 235: 'even',\n",
521 | " 236: 'bread',\n",
522 | " 237: 'suspected',\n",
523 | " 238: 'serial',\n",
524 | " 239: 'killer',\n",
525 | " 240: 'disappointed',\n",
526 | " 241: 'realize',\n",
527 | " 242: 'Match',\n",
528 | " 243: 'Point',\n",
529 | " 244: '2',\n",
530 | " 245: 'Risk',\n",
531 | " 246: 'Addiction',\n",
532 | " 247: 'proof',\n",
533 | " 248: 'Woody',\n",
534 | " 249: 'Allen',\n",
535 | " 250: 'still',\n",
536 | " 251: 'fully',\n",
537 | " 252: 'control',\n",
538 | " 253: 'style',\n",
539 | " 254: 'many',\n",
540 | " 255: 'us',\n",
541 | " 256: 'grown',\n",
542 | " 257: 'loveThis',\n",
543 | " 258: 'Id',\n",
544 | " 259: 'laughed',\n",
545 | " 260: 'Woodys',\n",
546 | " 261: 'comedies',\n",
547 | " 262: 'years',\n",
548 | " 263: 'decade',\n",
549 | " 264: 'Ive',\n",
550 | " 265: 'impressed',\n",
551 | " 266: 'Scarlet',\n",
552 | " 267: 'Johanson',\n",
553 | " 268: 'managed',\n",
554 | " 269: 'tone',\n",
555 | " 270: 'sexy',\n",
556 | " 271: 'image',\n",
557 | " 272: 'jumped',\n",
558 | " 273: 'average',\n",
559 | " 274: 'spirited',\n",
560 | " 275: 'young',\n",
561 | " 276: 'womanThis',\n",
562 | " 277: 'crown',\n",
563 | " 278: 'jewel',\n",
564 | " 279: 'career',\n",
565 | " 280: 'wittier',\n",
566 | " 281: 'Devil',\n",
567 | " 282: 'Wears',\n",
568 | " 283: 'Prada',\n",
569 | " 284: 'interesting',\n",
570 | " 285: 'Superman',\n",
571 | " 286: 'go',\n",
572 | " 287: 'friends',\n",
573 | " 288: 'Basically',\n",
574 | " 289: 'theres',\n",
575 | " 290: 'family',\n",
576 | " 291: 'boy',\n",
577 | " 292: 'Jake',\n",
578 | " 293: 'thinks',\n",
579 | " 294: 'zombie',\n",
580 | " 295: 'closet',\n",
581 | " 296: 'parents',\n",
582 | " 297: 'fighting',\n",
583 | " 298: 'timeThis',\n",
584 | " 299: 'movie',\n",
585 | " 300: 'slower',\n",
586 | " 301: 'soap',\n",
587 | " 302: 'opera',\n",
588 | " 303: 'suddenly',\n",
589 | " 304: 'decides',\n",
590 | " 305: 'Rambo',\n",
591 | " 306: 'zombieOK',\n",
592 | " 307: 'youre',\n",
593 | " 308: 'going',\n",
594 | " 309: 'make',\n",
595 | " 310: 'film',\n",
596 | " 311: 'must',\n",
597 | " 312: 'Decide',\n",
598 | " 313: 'thriller',\n",
599 | " 314: 'drama',\n",
600 | " 315: 'watchable',\n",
601 | " 316: 'Parents',\n",
602 | " 317: 'divorcing',\n",
603 | " 318: 'arguing',\n",
604 | " 319: 'like',\n",
605 | " 320: 'real',\n",
606 | " 321: 'totally',\n",
607 | " 322: 'ruins',\n",
608 | " 323: 'expected',\n",
609 | " 324: 'BOOGEYMAN',\n",
610 | " 325: 'similar',\n",
611 | " 326: 'instead',\n",
612 | " 327: 'meaningless',\n",
613 | " 328: 'spots3',\n",
614 | " 329: '10',\n",
615 | " 330: 'playing',\n",
616 | " 331: 'descent',\n",
617 | " 332: 'dialogs',\n",
618 | " 333: 'shots',\n",
619 | " 334: 'ignore',\n",
620 | " 335: 'Petter',\n",
621 | " 336: 'Matteis',\n",
622 | " 337: 'Love',\n",
623 | " 338: 'Time',\n",
624 | " 339: 'Money',\n",
625 | " 340: 'visually',\n",
626 | " 341: 'stunning',\n",
627 | " 342: 'watch',\n",
628 | " 343: 'Mr',\n",
629 | " 344: 'Mattei',\n",
630 | " 345: 'offers',\n",
631 | " 346: 'vivid',\n",
632 | " 347: 'portrait',\n",
633 | " 348: 'human',\n",
634 | " 349: 'relations',\n",
635 | " 350: 'seems',\n",
636 | " 351: 'telling',\n",
637 | " 352: 'money',\n",
638 | " 353: 'power',\n",
639 | " 354: 'success',\n",
640 | " 355: 'people',\n",
641 | " 356: 'different',\n",
642 | " 357: 'situations',\n",
643 | " 358: 'encounter',\n",
644 | " 359: 'variation',\n",
645 | " 360: 'Arthur',\n",
646 | " 361: 'Schnitzlers',\n",
647 | " 362: 'play',\n",
648 | " 363: 'theme',\n",
649 | " 364: 'director',\n",
650 | " 365: 'transfers',\n",
651 | " 366: 'action',\n",
652 | " 367: 'present',\n",
653 | " 368: 'New',\n",
654 | " 369: 'York',\n",
655 | " 370: 'meet',\n",
656 | " 371: 'connect',\n",
657 | " 372: 'connected',\n",
658 | " 373: 'another',\n",
659 | " 374: 'next',\n",
660 | " 375: 'person',\n",
661 | " 376: 'know',\n",
662 | " 377: 'previous',\n",
663 | " 378: 'point',\n",
664 | " 379: 'contact',\n",
665 | " 380: 'Stylishly',\n",
666 | " 381: 'sophisticated',\n",
667 | " 382: 'luxurious',\n",
668 | " 383: 'look',\n",
669 | " 384: 'taken',\n",
670 | " 385: 'live',\n",
671 | " 386: 'world',\n",
672 | " 387: 'habitatThe',\n",
673 | " 388: 'gets',\n",
674 | " 389: 'souls',\n",
675 | " 390: 'picture',\n",
676 | " 391: 'stages',\n",
677 | " 392: 'loneliness',\n",
678 | " 393: 'inhabits',\n",
679 | " 394: 'big',\n",
680 | " 395: 'city',\n",
681 | " 396: 'best',\n",
682 | " 397: 'place',\n",
683 | " 398: 'find',\n",
684 | " 399: 'sincere',\n",
685 | " 400: 'fulfillment',\n",
686 | " 401: 'discerns',\n",
687 | " 402: 'case',\n",
688 | " 403: 'encounterThe',\n",
689 | " 404: 'acting',\n",
690 | " 405: 'good',\n",
691 | " 406: 'direction',\n",
692 | " 407: 'Steve',\n",
693 | " 408: 'Buscemi',\n",
694 | " 409: 'Rosario',\n",
695 | " 410: 'Dawson',\n",
696 | " 411: 'Carol',\n",
697 | " 412: 'Kane',\n",
698 | " 413: 'Imperioli',\n",
699 | " 414: 'Adrian',\n",
700 | " 415: 'Grenier',\n",
701 | " 416: 'rest',\n",
702 | " 417: 'talented',\n",
703 | " 418: 'cast',\n",
704 | " 419: 'come',\n",
705 | " 420: 'aliveWe',\n",
706 | " 421: 'wish',\n",
707 | " 422: 'luck',\n",
708 | " 423: 'await',\n",
709 | " 424: 'anxiously',\n",
710 | " 425: 'work',\n",
711 | " 426: 'Probably',\n",
712 | " 427: 'alltime',\n",
713 | " 428: 'favorite',\n",
714 | " 429: 'story',\n",
715 | " 430: 'selflessness',\n",
716 | " 431: 'sacrifice',\n",
717 | " 432: 'dedication',\n",
718 | " 433: 'noble',\n",
719 | " 434: 'cause',\n",
720 | " 435: 'preachy',\n",
721 | " 436: 'boring',\n",
722 | " 437: 'old',\n",
723 | " 438: 'despite',\n",
724 | " 439: 'seen',\n",
725 | " 440: '15',\n",
726 | " 441: 'times',\n",
727 | " 442: 'last',\n",
728 | " 443: '25',\n",
729 | " 444: 'Paul',\n",
730 | " 445: 'Lukas',\n",
731 | " 446: 'performance',\n",
732 | " 447: 'brings',\n",
733 | " 448: 'tears',\n",
734 | " 449: 'eyes',\n",
735 | " 450: 'Bette',\n",
736 | " 451: 'Davis',\n",
737 | " 452: 'sympathetic',\n",
738 | " 453: 'roles',\n",
739 | " 454: 'delight',\n",
740 | " 455: 'kids',\n",
741 | " 456: 'grandma',\n",
742 | " 457: 'says',\n",
743 | " 458: 'dressedup',\n",
744 | " 459: 'midgets',\n",
745 | " 460: 'children',\n",
746 | " 461: 'makes',\n",
747 | " 462: 'fun',\n",
748 | " 463: 'mothers',\n",
749 | " 464: 'slow',\n",
750 | " 465: 'awakening',\n",
751 | " 466: 'whats',\n",
752 | " 467: 'happening',\n",
753 | " 468: 'roof',\n",
754 | " 469: 'believable',\n",
755 | " 470: 'startling',\n",
756 | " 471: 'dozen',\n",
757 | " 472: 'thumbs',\n",
758 | " 473: 'theyd',\n",
759 | " 474: 'sure',\n",
760 | " 475: 'resurrection',\n",
761 | " 476: 'dated',\n",
762 | " 477: 'Seahunt',\n",
763 | " 478: 'series',\n",
764 | " 479: 'tech',\n",
765 | " 480: 'today',\n",
766 | " 481: 'bring',\n",
767 | " 482: 'back',\n",
768 | " 483: 'kid',\n",
769 | " 484: 'excitement',\n",
770 | " 485: 'meI',\n",
771 | " 486: 'grew',\n",
772 | " 487: 'black',\n",
773 | " 488: 'white',\n",
774 | " 489: 'TV',\n",
775 | " 490: 'Gunsmoke',\n",
776 | " 491: 'heros',\n",
777 | " 492: 'weekYou',\n",
778 | " 493: 'vote',\n",
779 | " 494: 'comeback',\n",
780 | " 495: 'new',\n",
781 | " 496: 'sea',\n",
782 | " 497: 'huntWe',\n",
783 | " 498: 'need',\n",
784 | " 499: 'change',\n",
785 | " 500: 'pace',\n",
786 | " 501: 'water',\n",
787 | " 502: 'adventureOh',\n",
788 | " 503: 'thank',\n",
789 | " 504: 'outlet',\n",
790 | " 505: 'view',\n",
791 | " 506: 'viewpoints',\n",
792 | " 507: 'moviesSo',\n",
793 | " 508: 'ole',\n",
794 | " 509: 'believe',\n",
795 | " 510: 'wan',\n",
796 | " 511: 'na',\n",
797 | " 512: 'sayWould',\n",
798 | " 513: 'nice',\n",
799 | " 514: 'read',\n",
800 | " 515: 'plus',\n",
801 | " 516: 'points',\n",
802 | " 517: 'huntIf',\n",
803 | " 518: 'rhymes',\n",
804 | " 519: 'lines',\n",
805 | " 520: 'let',\n",
806 | " 521: 'submitor',\n",
807 | " 522: 'leave',\n",
808 | " 523: 'doubt',\n",
809 | " 524: 'quitIf',\n",
810 | " 525: 'lets',\n",
811 | " 526: 'amazing',\n",
812 | " 527: 'fresh',\n",
813 | " 528: 'innovative',\n",
814 | " 529: 'idea',\n",
815 | " 530: '70s',\n",
816 | " 531: 'aired',\n",
817 | " 532: '7',\n",
818 | " 533: '8',\n",
819 | " 534: 'brilliant',\n",
820 | " 535: 'dropped',\n",
821 | " 536: '1990',\n",
822 | " 537: 'funny',\n",
823 | " 538: 'anymore',\n",
824 | " 539: 'continued',\n",
825 | " 540: 'decline',\n",
826 | " 541: 'complete',\n",
827 | " 542: 'waste',\n",
828 | " 543: 'todayIts',\n",
829 | " 544: 'disgraceful',\n",
830 | " 545: 'fallen',\n",
831 | " 546: 'writing',\n",
832 | " 547: 'painfully',\n",
833 | " 548: 'bad',\n",
834 | " 549: 'performances',\n",
835 | " 550: 'almost',\n",
836 | " 551: 'mildly',\n",
837 | " 552: 'entertaining',\n",
838 | " 553: 'respite',\n",
839 | " 554: 'guesthosts',\n",
840 | " 555: 'probably',\n",
841 | " 556: 'hard',\n",
842 | " 557: 'creator',\n",
843 | " 558: 'handselected',\n",
844 | " 559: 'original',\n",
845 | " 560: 'also',\n",
846 | " 561: 'chose',\n",
847 | " 562: 'band',\n",
848 | " 563: 'hacks',\n",
849 | " 564: 'followed',\n",
850 | " 565: 'recognize',\n",
851 | " 566: 'brilliance',\n",
852 | " 567: 'fit',\n",
853 | " 568: 'replace',\n",
854 | " 569: 'mediocrity',\n",
855 | " 570: 'felt',\n",
856 | " 571: 'give',\n",
857 | " 572: 'stars',\n",
858 | " 573: 'respect',\n",
859 | " 574: 'made',\n",
860 | " 575: 'huge',\n",
861 | " 576: 'awful',\n",
862 | " 577: 'cant',\n",
863 | " 578: 'Encouraged',\n",
864 | " 579: 'positive',\n",
865 | " 580: 'comments',\n",
866 | " 581: 'looking',\n",
867 | " 582: 'forward',\n",
868 | " 583: 'Bad',\n",
869 | " 584: 'mistake',\n",
870 | " 585: '950',\n",
871 | " 586: 'films',\n",
872 | " 587: 'worst',\n",
873 | " 588: 'pacing',\n",
874 | " 589: 'storyline',\n",
875 | " 590: 'soundtrack',\n",
876 | " 591: 'song',\n",
877 | " 592: 'lame',\n",
878 | " 593: 'country',\n",
879 | " 594: 'tune',\n",
880 | " 595: 'played',\n",
881 | " 596: 'less',\n",
882 | " 597: 'four',\n",
883 | " 598: 'looks',\n",
884 | " 599: 'cheap',\n",
885 | " 600: 'extreme',\n",
886 | " 601: 'Rarely',\n",
887 | " 602: 'happy',\n",
888 | " 603: 'end',\n",
889 | " 604: 'credits',\n",
890 | " 605: 'prevents',\n",
891 | " 606: 'giving',\n",
892 | " 607: '1score',\n",
893 | " 608: 'Harvey',\n",
894 | " 609: 'Keitel',\n",
895 | " 610: 'least',\n",
896 | " 611: 'making',\n",
897 | " 612: 'bit',\n",
898 | " 613: 'effort',\n",
899 | " 614: 'obsessives',\n",
900 | " 615: 'gut',\n",
901 | " 616: 'wrenching',\n",
902 | " 617: 'laughter',\n",
903 | " 618: 'love',\n",
904 | " 619: 'hell',\n",
905 | " 620: 'mom',\n",
906 | " 621: 'liked',\n",
907 | " 622: 'itGreat',\n",
908 | " 623: 'Camp',\n",
909 | " 624: 'Phil',\n",
910 | " 625: 'Alien',\n",
911 | " 626: 'quirky',\n",
912 | " 627: 'humour',\n",
913 | " 628: 'based',\n",
914 | " 629: 'oddness',\n",
915 | " 630: 'everything',\n",
916 | " 631: 'actual',\n",
917 | " 632: 'punchlinesAt',\n",
918 | " 633: 'odd',\n",
919 | " 634: 'progressed',\n",
920 | " 635: 'didnt',\n",
921 | " 636: 'jokes',\n",
922 | " 637: 'anymoreIts',\n",
923 | " 638: 'low',\n",
924 | " 639: 'budget',\n",
925 | " 640: 'thats',\n",
926 | " 641: 'problem',\n",
927 | " 642: 'eventually',\n",
928 | " 643: 'lost',\n",
929 | " 644: 'interestI',\n",
930 | " 645: 'imagine',\n",
931 | " 646: 'stoner',\n",
932 | " 647: 'currently',\n",
933 | " 648: 'partakingFor',\n",
934 | " 649: 'something',\n",
935 | " 650: 'better',\n",
936 | " 651: 'try',\n",
937 | " 652: 'Brother',\n",
938 | " 653: 'planet',\n",
939 | " 654: '12',\n",
940 | " 655: 'came',\n",
941 | " 656: 'recall',\n",
942 | " 657: 'scariest',\n",
943 | " 658: 'scene',\n",
944 | " 659: 'bird',\n",
945 | " 660: 'eating',\n",
946 | " 661: 'men',\n",
947 | " 662: 'dangling',\n",
948 | " 663: 'helplessly',\n",
949 | " 664: 'parachutes',\n",
950 | " 665: 'horror',\n",
951 | " 666: 'horrorAs',\n",
952 | " 667: 'cheesy',\n",
953 | " 668: 'B',\n",
954 | " 669: 'Saturday',\n",
955 | " 670: 'afternoons',\n",
956 | " 671: 'tired',\n",
957 | " 672: 'formula',\n",
958 | " 673: 'monster',\n",
959 | " 674: 'type',\n",
960 | " 675: 'movies',\n",
961 | " 676: 'usually',\n",
962 | " 677: 'included',\n",
963 | " 678: 'hero',\n",
964 | " 679: 'beautiful',\n",
965 | " 680: 'woman',\n",
966 | " 681: 'might',\n",
967 | " 682: 'daughter',\n",
968 | " 683: 'professor',\n",
969 | " 684: 'resolution',\n",
970 | " 685: 'died',\n",
971 | " 686: 'care',\n",
972 | " 687: 'much',\n",
973 | " 688: 'romantic',\n",
974 | " 689: 'angle',\n",
975 | " 690: 'year',\n",
976 | " 691: 'predictable',\n",
977 | " 692: 'plots',\n",
978 | " 693: 'unintentional',\n",
979 | " 694: 'humorBut',\n",
980 | " 695: 'later',\n",
981 | " 696: 'Psycho',\n",
982 | " 697: 'loved',\n",
983 | " 698: 'star',\n",
984 | " 699: 'Janet',\n",
985 | " 700: 'Leigh',\n",
986 | " 701: 'bumped',\n",
987 | " 702: 'early',\n",
988 | " 703: 'sat',\n",
989 | " 704: 'took',\n",
990 | " 705: 'notice',\n",
991 | " 706: 'Since',\n",
992 | " 707: 'screenwriters',\n",
993 | " 708: 'scary',\n",
994 | " 709: 'possible',\n",
995 | " 710: 'wellworn',\n",
996 | " 711: 'rules',\n",
997 | " 712: 'im',\n",
998 | " 713: 'fan',\n",
999 | " 714: 'Bolls',\n",
1000 | " 715: 'enjoyed',\n",
1001 | " 716: 'Postal',\n",
1002 | " 717: 'maybe',\n",
1003 | " 718: 'Boll',\n",
1004 | " 719: 'apparently',\n",
1005 | " 720: 'bought',\n",
1006 | " 721: 'rights',\n",
1007 | " 722: 'Far',\n",
1008 | " 723: 'Cry',\n",
1009 | " 724: 'long',\n",
1010 | " 725: 'ago',\n",
1011 | " 726: 'game',\n",
1012 | " 727: 'finsished',\n",
1013 | " 728: 'People',\n",
1014 | " 729: 'killing',\n",
1015 | " 730: 'mercs',\n",
1016 | " 731: 'infiltrating',\n",
1017 | " 732: 'secret',\n",
1018 | " 733: 'research',\n",
1019 | " 734: 'labs',\n",
1020 | " 735: 'located',\n",
1021 | " 736: 'tropical',\n",
1022 | " 737: 'island',\n",
1023 | " 738: 'warned',\n",
1024 | " 739: 'schemed',\n",
1025 | " 740: 'together',\n",
1026 | " 741: 'along',\n",
1027 | " 742: 'legion',\n",
1028 | " 743: 'schmucks',\n",
1029 | " 744: 'Feeling',\n",
1030 | " 745: 'loneley',\n",
1031 | " 746: 'invites',\n",
1032 | " 747: 'three',\n",
1033 | " 748: 'countrymen',\n",
1034 | " 749: 'players',\n",
1035 | " 750: 'names',\n",
1036 | " 751: 'Til',\n",
1037 | " 752: 'Schweiger',\n",
1038 | " 753: 'Udo',\n",
1039 | " 754: 'Kier',\n",
1040 | " 755: 'Ralf',\n",
1041 | " 756: 'MoellerThree',\n",
1042 | " 757: 'actually',\n",
1043 | " 758: 'selfs',\n",
1044 | " 759: 'biz',\n",
1045 | " 760: 'tale',\n",
1046 | " 761: 'Jack',\n",
1047 | " 762: 'Carver',\n",
1048 | " 763: 'yes',\n",
1049 | " 764: 'German',\n",
1050 | " 765: 'hail',\n",
1051 | " 766: 'bratwurst',\n",
1052 | " 767: 'dudes',\n",
1053 | " 768: 'However',\n",
1054 | " 769: 'Tils',\n",
1055 | " 770: 'badass',\n",
1056 | " 771: 'complained',\n",
1057 | " 772: 'hes',\n",
1058 | " 773: 'staying',\n",
1059 | " 774: 'true',\n",
1060 | " 775: 'whole',\n",
1061 | " 776: 'carver',\n",
1062 | " 777: 'perspective',\n",
1063 | " 778: 'dont',\n",
1064 | " 779: 'looked',\n",
1065 | " 780: 'kicking',\n",
1066 | " 781: 'beyond',\n",
1067 | " 782: 'demented',\n",
1068 | " 783: 'evil',\n",
1069 | " 784: 'mad',\n",
1070 | " 785: 'scientist',\n",
1071 | " 786: 'Dr',\n",
1072 | " 787: 'Krieger',\n",
1073 | " 788: 'GeneticallyMutatedsoldiers',\n",
1074 | " 789: 'GMS',\n",
1075 | " 790: 'Performing',\n",
1076 | " 791: 'topsecret',\n",
1077 | " 792: 'reminds',\n",
1078 | " 793: 'SPOILER',\n",
1079 | " 794: 'Vancouver',\n",
1080 | " 795: 'reason',\n",
1081 | " 796: 'Thats',\n",
1082 | " 797: 'palm',\n",
1083 | " 798: 'trees',\n",
1084 | " 799: 'Instead',\n",
1085 | " 800: 'rich',\n",
1086 | " 801: 'lumberjackwoods',\n",
1087 | " 802: 'havent',\n",
1088 | " 803: 'gone',\n",
1089 | " 804: 'FAR',\n",
1090 | " 805: 'started',\n",
1091 | " 806: 'CRY',\n",
1092 | " 807: 'mehehe',\n",
1093 | " 808: 'can',\n",
1094 | " 809: 'not',\n",
1095 | " 810: 'stay',\n",
1096 | " 811: 'shenanigans',\n",
1097 | " 812: 'delivers',\n",
1098 | " 813: 'meaning',\n",
1099 | " 814: 'suckThere',\n",
1100 | " 815: 'mentioning',\n",
1101 | " 816: 'imply',\n",
1102 | " 817: 'areas',\n",
1103 | " 818: 'boat',\n",
1104 | " 819: 'cromedalbino',\n",
1105 | " 820: 'squad',\n",
1106 | " 821: 'enters',\n",
1107 | " 822: 'laugh',\n",
1108 | " 823: 'reeks',\n",
1109 | " 824: 'scheisse',\n",
1110 | " 825: 'poop',\n",
1111 | " 826: 'simpletons',\n",
1112 | " 827: 'take',\n",
1113 | " 828: 'wiff',\n",
1114 | " 829: 'ahead',\n",
1115 | " 830: 'BTW',\n",
1116 | " 831: 'annoying',\n",
1117 | " 832: 'sidekick',\n",
1118 | " 833: 'shoot',\n",
1119 | " 834: 'minutes',\n",
1120 | " 835: 'screen',\n",
1121 | " 836: 'ShakespeareShakespeare',\n",
1122 | " 837: 'lostI',\n",
1123 | " 838: 'appreciate',\n",
1124 | " 839: 'trying',\n",
1125 | " 840: 'Shakespeare',\n",
1126 | " 841: 'masses',\n",
1127 | " 842: 'ruin',\n",
1128 | " 843: 'goodIs',\n",
1129 | " 844: 'Scottish',\n",
1130 | " 845: 'Play',\n",
1131 | " 846: 'certain',\n",
1132 | " 847: 'Rev',\n",
1133 | " 848: 'Bowdler',\n",
1134 | " 849: 'hence',\n",
1135 | " 850: 'bowdlerization',\n",
1136 | " 851: 'tried',\n",
1137 | " 852: 'Victorian',\n",
1138 | " 853: 'eraIn',\n",
1139 | " 854: 'words',\n",
1140 | " 855: 'improve',\n",
1141 | " 856: 'perfectionI',\n",
1142 | " 857: 'write',\n",
1143 | " 858: 'ten',\n",
1144 | " 859: 'text',\n",
1145 | " 860: 'English',\n",
1146 | " 861: 'composition',\n",
1147 | " 862: 'forte',\n",
1148 | " 863: 'keep',\n",
1149 | " 864: 'saying',\n",
1150 | " 865: 'cut',\n",
1151 | " 866: 'fantastic',\n",
1152 | " 867: 'prisoners',\n",
1153 | " 868: 'famous',\n",
1154 | " 869: 'george',\n",
1155 | " 870: 'clooney',\n",
1156 | " 871: 'Im',\n",
1157 | " 872: 'roll',\n",
1158 | " 873: 'Another',\n",
1159 | " 874: 'man',\n",
1160 | " 875: 'constant',\n",
1161 | " 876: 'sorrow',\n",
1162 | " 877: 'recommand',\n",
1163 | " 878: 'everybody',\n",
1164 | " 879: 'Greetings',\n",
1165 | " 880: 'Bart',\n",
1166 | " 881: 'Kind',\n",
1167 | " 882: 'drawn',\n",
1168 | " 883: 'erotic',\n",
1169 | " 884: 'amateurish',\n",
1170 | " 885: 'unbelievable',\n",
1171 | " 886: 'bits',\n",
1172 | " 887: 'Sort',\n",
1173 | " 888: 'school',\n",
1174 | " 889: 'project',\n",
1175 | " 890: 'Rosanna',\n",
1176 | " 891: 'Arquette',\n",
1177 | " 892: 'thinking',\n",
1178 | " 893: 'stock',\n",
1179 | " 894: 'bizarre',\n",
1180 | " 895: 'supposed',\n",
1181 | " 896: 'Midwest',\n",
1182 | " 897: 'town',\n",
1183 | " 898: 'Pretty',\n",
1184 | " 899: 'involved',\n",
1185 | " 900: 'lessons',\n",
1186 | " 901: 'learned',\n",
1187 | " 902: 'insights',\n",
1188 | " 903: 'stilted',\n",
1189 | " 904: 'quite',\n",
1190 | " 905: 'ridiculous',\n",
1191 | " 906: 'lots',\n",
1192 | " 907: 'skin',\n",
1193 | " 908: 'intrigues',\n",
1194 | " 909: 'videotaped',\n",
1195 | " 910: 'nonsenseWhat',\n",
1196 | " 911: 'bisexual',\n",
1197 | " 912: 'relationship',\n",
1198 | " 913: 'nowhere',\n",
1199 | " 914: 'heterosexual',\n",
1200 | " 915: 'encounters',\n",
1201 | " 916: 'absurd',\n",
1202 | " 917: 'dance',\n",
1203 | " 918: 'stereotyped',\n",
1204 | " 919: 'Give',\n",
1205 | " 920: 'pass',\n",
1206 | " 921: 'million',\n",
1207 | " 922: 'miles',\n",
1208 | " 923: 'wasted',\n",
1209 | " 924: 'could',\n",
1210 | " 925: 'spent',\n",
1211 | " 926: 'starving',\n",
1212 | " 927: 'Aids',\n",
1213 | " 928: 'Africa',\n",
1214 | " 929: 'simply',\n",
1215 | " 930: 'remade',\n",
1216 | " 931: 'fails',\n",
1217 | " 932: 'capture',\n",
1218 | " 933: 'flavor',\n",
1219 | " 934: 'terror',\n",
1220 | " 935: '1963',\n",
1221 | " 936: 'title',\n",
1222 | " 937: 'Liam',\n",
1223 | " 938: 'Neeson',\n",
1224 | " 939: 'excellent',\n",
1225 | " 940: 'always',\n",
1226 | " 941: 'holds',\n",
1227 | " 942: 'exception',\n",
1228 | " 943: 'Owen',\n",
1229 | " 944: 'Wilson',\n",
1230 | " 945: 'feel',\n",
1231 | " 946: 'character',\n",
1232 | " 947: 'Luke',\n",
1233 | " 948: 'major',\n",
1234 | " 949: 'fault',\n",
1235 | " 950: 'version',\n",
1236 | " 951: 'strayed',\n",
1237 | " 952: 'Shirley',\n",
1238 | " 953: 'Jackson',\n",
1239 | " 954: 'attempts',\n",
1240 | " 955: 'grandiose',\n",
1241 | " 956: 'thrill',\n",
1242 | " 957: 'earlier',\n",
1243 | " 958: 'trade',\n",
1244 | " 959: 'snazzier',\n",
1245 | " 960: 'special',\n",
1246 | " 961: 'effects',\n",
1247 | " 962: 'enjoy',\n",
1248 | " 963: 'friction',\n",
1249 | " 964: 'older',\n",
1250 | " 965: 'top',\n",
1251 | " 966: 'Horrible',\n",
1252 | " 967: 'wasnt',\n",
1253 | " 968: 'continuous',\n",
1254 | " 969: 'minute',\n",
1255 | " 970: 'fight',\n",
1256 | " 971: 'chance',\n",
1257 | " 972: 'development',\n",
1258 | " 973: 'busy',\n",
1259 | " 974: 'running',\n",
1260 | " 975: 'sword',\n",
1261 | " 976: 'emotional',\n",
1262 | " 977: 'attachment',\n",
1263 | " 978: 'except',\n",
1264 | " 979: 'machine',\n",
1265 | " 980: 'wanted',\n",
1266 | " 981: 'destroy',\n",
1267 | " 982: 'Scenes',\n",
1268 | " 983: 'blatantly',\n",
1269 | " 984: 'stolen',\n",
1270 | " 985: 'LOTR',\n",
1271 | " 986: 'Star',\n",
1272 | " 987: 'Wars',\n",
1273 | " 988: 'Matrix',\n",
1274 | " 989: 'ExamplesThe',\n",
1275 | " 990: 'ghost',\n",
1276 | " 991: 'final',\n",
1277 | " 992: 'Yoda',\n",
1278 | " 993: 'Obee',\n",
1279 | " 994: 'Vader',\n",
1280 | " 995: 'spider',\n",
1281 | " 996: 'beginning',\n",
1282 | " 997: 'Frodo',\n",
1283 | " 998: 'attacked',\n",
1284 | " 999: 'Return',\n",
1285 | " ...}"
1286 | ]
1287 | },
1288 | "execution_count": 12,
1289 | "metadata": {},
1290 | "output_type": "execute_result"
1291 | }
1292 | ],
1293 | "source": [
1294 | "index_to_vocab"
1295 | ]
1296 | },
1297 | {
1298 | "cell_type": "code",
1299 | "execution_count": 13,
1300 | "metadata": {
1301 | "scrolled": true
1302 | },
1303 | "outputs": [
1304 | {
1305 | "data": {
1306 | "text/plain": [
1307 | "{'One': 0,\n",
1308 | " 'reviewers': 1,\n",
1309 | " 'mentioned': 2,\n",
1310 | " 'watching': 3,\n",
1311 | " '1': 4,\n",
1312 | " 'Oz': 5,\n",
1313 | " 'episode': 6,\n",
1314 | " 'youll': 7,\n",
1315 | " 'hooked': 8,\n",
1316 | " 'right': 9,\n",
1317 | " 'exactly': 10,\n",
1318 | " 'happened': 11,\n",
1319 | " 'meThe': 12,\n",
1320 | " 'first': 13,\n",
1321 | " 'thing': 14,\n",
1322 | " 'struck': 15,\n",
1323 | " 'brutality': 16,\n",
1324 | " 'unflinching': 17,\n",
1325 | " 'scenes': 18,\n",
1326 | " 'violence': 19,\n",
1327 | " 'set': 20,\n",
1328 | " 'word': 21,\n",
1329 | " 'GO': 22,\n",
1330 | " 'Trust': 23,\n",
1331 | " 'show': 24,\n",
1332 | " 'faint': 25,\n",
1333 | " 'hearted': 26,\n",
1334 | " 'timid': 27,\n",
1335 | " 'pulls': 28,\n",
1336 | " 'punches': 29,\n",
1337 | " 'regards': 30,\n",
1338 | " 'drugs': 31,\n",
1339 | " 'sex': 32,\n",
1340 | " 'hardcore': 33,\n",
1341 | " 'classic': 34,\n",
1342 | " 'use': 35,\n",
1343 | " 'wordIt': 36,\n",
1344 | " 'called': 37,\n",
1345 | " 'OZ': 38,\n",
1346 | " 'nickname': 39,\n",
1347 | " 'given': 40,\n",
1348 | " 'Oswald': 41,\n",
1349 | " 'Maximum': 42,\n",
1350 | " 'Security': 43,\n",
1351 | " 'State': 44,\n",
1352 | " 'Penitentary': 45,\n",
1353 | " 'focuses': 46,\n",
1354 | " 'mainly': 47,\n",
1355 | " 'Emerald': 48,\n",
1356 | " 'City': 49,\n",
1357 | " 'experimental': 50,\n",
1358 | " 'section': 51,\n",
1359 | " 'prison': 52,\n",
1360 | " 'cells': 53,\n",
1361 | " 'glass': 54,\n",
1362 | " 'fronts': 55,\n",
1363 | " 'face': 56,\n",
1364 | " 'inwards': 57,\n",
1365 | " 'privacy': 58,\n",
1366 | " 'high': 59,\n",
1367 | " 'agenda': 60,\n",
1368 | " 'Em': 61,\n",
1369 | " 'home': 62,\n",
1370 | " 'manyAryans': 63,\n",
1371 | " 'Muslims': 64,\n",
1372 | " 'gangstas': 65,\n",
1373 | " 'Latinos': 66,\n",
1374 | " 'Christians': 67,\n",
1375 | " 'Italians': 68,\n",
1376 | " 'Irish': 69,\n",
1377 | " 'moreso': 70,\n",
1378 | " 'scuffles': 71,\n",
1379 | " 'death': 72,\n",
1380 | " 'stares': 73,\n",
1381 | " 'dodgy': 74,\n",
1382 | " 'dealings': 75,\n",
1383 | " 'shady': 76,\n",
1384 | " 'agreements': 77,\n",
1385 | " 'never': 78,\n",
1386 | " 'far': 79,\n",
1387 | " 'awayI': 80,\n",
1388 | " 'would': 81,\n",
1389 | " 'say': 82,\n",
1390 | " 'main': 83,\n",
1391 | " 'appeal': 84,\n",
1392 | " 'due': 85,\n",
1393 | " 'fact': 86,\n",
1394 | " 'goes': 87,\n",
1395 | " 'shows': 88,\n",
1396 | " 'wouldnt': 89,\n",
1397 | " 'dare': 90,\n",
1398 | " 'Forget': 91,\n",
1399 | " 'pretty': 92,\n",
1400 | " 'pictures': 93,\n",
1401 | " 'painted': 94,\n",
1402 | " 'mainstream': 95,\n",
1403 | " 'audiences': 96,\n",
1404 | " 'forget': 97,\n",
1405 | " 'charm': 98,\n",
1406 | " 'romanceOZ': 99,\n",
1407 | " 'doesnt': 100,\n",
1408 | " 'mess': 101,\n",
1409 | " 'around': 102,\n",
1410 | " 'ever': 103,\n",
1411 | " 'saw': 104,\n",
1412 | " 'nasty': 105,\n",
1413 | " 'surreal': 106,\n",
1414 | " 'couldnt': 107,\n",
1415 | " 'ready': 108,\n",
1416 | " 'watched': 109,\n",
1417 | " 'developed': 110,\n",
1418 | " 'taste': 111,\n",
1419 | " 'got': 112,\n",
1420 | " 'accustomed': 113,\n",
1421 | " 'levels': 114,\n",
1422 | " 'graphic': 115,\n",
1423 | " 'injustice': 116,\n",
1424 | " 'crooked': 117,\n",
1425 | " 'guards': 118,\n",
1426 | " 'wholl': 119,\n",
1427 | " 'sold': 120,\n",
1428 | " 'nickel': 121,\n",
1429 | " 'inmates': 122,\n",
1430 | " 'kill': 123,\n",
1431 | " 'order': 124,\n",
1432 | " 'get': 125,\n",
1433 | " 'away': 126,\n",
1434 | " 'well': 127,\n",
1435 | " 'mannered': 128,\n",
1436 | " 'middle': 129,\n",
1437 | " 'class': 130,\n",
1438 | " 'turned': 131,\n",
1439 | " 'bitches': 132,\n",
1440 | " 'lack': 133,\n",
1441 | " 'street': 134,\n",
1442 | " 'skills': 135,\n",
1443 | " 'experience': 136,\n",
1444 | " 'Watching': 137,\n",
1445 | " 'may': 138,\n",
1446 | " 'become': 139,\n",
1447 | " 'comfortable': 140,\n",
1448 | " 'uncomfortable': 141,\n",
1449 | " 'viewingthats': 142,\n",
1450 | " 'touch': 143,\n",
1451 | " 'darker': 144,\n",
1452 | " 'side': 145,\n",
1453 | " 'wonderful': 146,\n",
1454 | " 'little': 147,\n",
1455 | " 'production': 148,\n",
1456 | " 'filming': 149,\n",
1457 | " 'technique': 150,\n",
1458 | " 'unassuming': 151,\n",
1459 | " 'oldtimeBBC': 152,\n",
1460 | " 'fashion': 153,\n",
1461 | " 'gives': 154,\n",
1462 | " 'comforting': 155,\n",
1463 | " 'sometimes': 156,\n",
1464 | " 'discomforting': 157,\n",
1465 | " 'sense': 158,\n",
1466 | " 'realism': 159,\n",
1467 | " 'entire': 160,\n",
1468 | " 'piece': 161,\n",
1469 | " 'actors': 162,\n",
1470 | " 'extremely': 163,\n",
1471 | " 'chosen': 164,\n",
1472 | " 'Michael': 165,\n",
1473 | " 'Sheen': 166,\n",
1474 | " 'polari': 167,\n",
1475 | " 'voices': 168,\n",
1476 | " 'pat': 169,\n",
1477 | " 'truly': 170,\n",
1478 | " 'see': 171,\n",
1479 | " 'seamless': 172,\n",
1480 | " 'editing': 173,\n",
1481 | " 'guided': 174,\n",
1482 | " 'references': 175,\n",
1483 | " 'Williams': 176,\n",
1484 | " 'diary': 177,\n",
1485 | " 'entries': 178,\n",
1486 | " 'worth': 179,\n",
1487 | " 'terrificly': 180,\n",
1488 | " 'written': 181,\n",
1489 | " 'performed': 182,\n",
1490 | " 'masterful': 183,\n",
1491 | " 'one': 184,\n",
1492 | " 'great': 185,\n",
1493 | " 'masters': 186,\n",
1494 | " 'comedy': 187,\n",
1495 | " 'life': 188,\n",
1496 | " 'really': 189,\n",
1497 | " 'comes': 190,\n",
1498 | " 'things': 191,\n",
1499 | " 'fantasy': 192,\n",
1500 | " 'guard': 193,\n",
1501 | " 'rather': 194,\n",
1502 | " 'traditional': 195,\n",
1503 | " 'dream': 196,\n",
1504 | " 'techniques': 197,\n",
1505 | " 'remains': 198,\n",
1506 | " 'solid': 199,\n",
1507 | " 'disappears': 200,\n",
1508 | " 'plays': 201,\n",
1509 | " 'knowledge': 202,\n",
1510 | " 'senses': 203,\n",
1511 | " 'particularly': 204,\n",
1512 | " 'concerning': 205,\n",
1513 | " 'Orton': 206,\n",
1514 | " 'Halliwell': 207,\n",
1515 | " 'sets': 208,\n",
1516 | " 'flat': 209,\n",
1517 | " 'Halliwells': 210,\n",
1518 | " 'murals': 211,\n",
1519 | " 'decorating': 212,\n",
1520 | " 'every': 213,\n",
1521 | " 'surface': 214,\n",
1522 | " 'terribly': 215,\n",
1523 | " 'done': 216,\n",
1524 | " 'thought': 217,\n",
1525 | " 'way': 218,\n",
1526 | " 'spend': 219,\n",
1527 | " 'time': 220,\n",
1528 | " 'hot': 221,\n",
1529 | " 'summer': 222,\n",
1530 | " 'weekend': 223,\n",
1531 | " 'sitting': 224,\n",
1532 | " 'air': 225,\n",
1533 | " 'conditioned': 226,\n",
1534 | " 'theater': 227,\n",
1535 | " 'lighthearted': 228,\n",
1536 | " 'plot': 229,\n",
1537 | " 'simplistic': 230,\n",
1538 | " 'dialogue': 231,\n",
1539 | " 'witty': 232,\n",
1540 | " 'characters': 233,\n",
1541 | " 'likable': 234,\n",
1542 | " 'even': 235,\n",
1543 | " 'bread': 236,\n",
1544 | " 'suspected': 237,\n",
1545 | " 'serial': 238,\n",
1546 | " 'killer': 239,\n",
1547 | " 'disappointed': 240,\n",
1548 | " 'realize': 241,\n",
1549 | " 'Match': 242,\n",
1550 | " 'Point': 243,\n",
1551 | " '2': 244,\n",
1552 | " 'Risk': 245,\n",
1553 | " 'Addiction': 246,\n",
1554 | " 'proof': 247,\n",
1555 | " 'Woody': 248,\n",
1556 | " 'Allen': 249,\n",
1557 | " 'still': 250,\n",
1558 | " 'fully': 251,\n",
1559 | " 'control': 252,\n",
1560 | " 'style': 253,\n",
1561 | " 'many': 254,\n",
1562 | " 'us': 255,\n",
1563 | " 'grown': 256,\n",
1564 | " 'loveThis': 257,\n",
1565 | " 'Id': 258,\n",
1566 | " 'laughed': 259,\n",
1567 | " 'Woodys': 260,\n",
1568 | " 'comedies': 261,\n",
1569 | " 'years': 262,\n",
1570 | " 'decade': 263,\n",
1571 | " 'Ive': 264,\n",
1572 | " 'impressed': 265,\n",
1573 | " 'Scarlet': 266,\n",
1574 | " 'Johanson': 267,\n",
1575 | " 'managed': 268,\n",
1576 | " 'tone': 269,\n",
1577 | " 'sexy': 270,\n",
1578 | " 'image': 271,\n",
1579 | " 'jumped': 272,\n",
1580 | " 'average': 273,\n",
1581 | " 'spirited': 274,\n",
1582 | " 'young': 275,\n",
1583 | " 'womanThis': 276,\n",
1584 | " 'crown': 277,\n",
1585 | " 'jewel': 278,\n",
1586 | " 'career': 279,\n",
1587 | " 'wittier': 280,\n",
1588 | " 'Devil': 281,\n",
1589 | " 'Wears': 282,\n",
1590 | " 'Prada': 283,\n",
1591 | " 'interesting': 284,\n",
1592 | " 'Superman': 285,\n",
1593 | " 'go': 286,\n",
1594 | " 'friends': 287,\n",
1595 | " 'Basically': 288,\n",
1596 | " 'theres': 289,\n",
1597 | " 'family': 290,\n",
1598 | " 'boy': 291,\n",
1599 | " 'Jake': 292,\n",
1600 | " 'thinks': 293,\n",
1601 | " 'zombie': 294,\n",
1602 | " 'closet': 295,\n",
1603 | " 'parents': 296,\n",
1604 | " 'fighting': 297,\n",
1605 | " 'timeThis': 298,\n",
1606 | " 'movie': 299,\n",
1607 | " 'slower': 300,\n",
1608 | " 'soap': 301,\n",
1609 | " 'opera': 302,\n",
1610 | " 'suddenly': 303,\n",
1611 | " 'decides': 304,\n",
1612 | " 'Rambo': 305,\n",
1613 | " 'zombieOK': 306,\n",
1614 | " 'youre': 307,\n",
1615 | " 'going': 308,\n",
1616 | " 'make': 309,\n",
1617 | " 'film': 310,\n",
1618 | " 'must': 311,\n",
1619 | " 'Decide': 312,\n",
1620 | " 'thriller': 313,\n",
1621 | " 'drama': 314,\n",
1622 | " 'watchable': 315,\n",
1623 | " 'Parents': 316,\n",
1624 | " 'divorcing': 317,\n",
1625 | " 'arguing': 318,\n",
1626 | " 'like': 319,\n",
1627 | " 'real': 320,\n",
1628 | " 'totally': 321,\n",
1629 | " 'ruins': 322,\n",
1630 | " 'expected': 323,\n",
1631 | " 'BOOGEYMAN': 324,\n",
1632 | " 'similar': 325,\n",
1633 | " 'instead': 326,\n",
1634 | " 'meaningless': 327,\n",
1635 | " 'spots3': 328,\n",
1636 | " '10': 329,\n",
1637 | " 'playing': 330,\n",
1638 | " 'descent': 331,\n",
1639 | " 'dialogs': 332,\n",
1640 | " 'shots': 333,\n",
1641 | " 'ignore': 334,\n",
1642 | " 'Petter': 335,\n",
1643 | " 'Matteis': 336,\n",
1644 | " 'Love': 337,\n",
1645 | " 'Time': 338,\n",
1646 | " 'Money': 339,\n",
1647 | " 'visually': 340,\n",
1648 | " 'stunning': 341,\n",
1649 | " 'watch': 342,\n",
1650 | " 'Mr': 343,\n",
1651 | " 'Mattei': 344,\n",
1652 | " 'offers': 345,\n",
1653 | " 'vivid': 346,\n",
1654 | " 'portrait': 347,\n",
1655 | " 'human': 348,\n",
1656 | " 'relations': 349,\n",
1657 | " 'seems': 350,\n",
1658 | " 'telling': 351,\n",
1659 | " 'money': 352,\n",
1660 | " 'power': 353,\n",
1661 | " 'success': 354,\n",
1662 | " 'people': 355,\n",
1663 | " 'different': 356,\n",
1664 | " 'situations': 357,\n",
1665 | " 'encounter': 358,\n",
1666 | " 'variation': 359,\n",
1667 | " 'Arthur': 360,\n",
1668 | " 'Schnitzlers': 361,\n",
1669 | " 'play': 362,\n",
1670 | " 'theme': 363,\n",
1671 | " 'director': 364,\n",
1672 | " 'transfers': 365,\n",
1673 | " 'action': 366,\n",
1674 | " 'present': 367,\n",
1675 | " 'New': 368,\n",
1676 | " 'York': 369,\n",
1677 | " 'meet': 370,\n",
1678 | " 'connect': 371,\n",
1679 | " 'connected': 372,\n",
1680 | " 'another': 373,\n",
1681 | " 'next': 374,\n",
1682 | " 'person': 375,\n",
1683 | " 'know': 376,\n",
1684 | " 'previous': 377,\n",
1685 | " 'point': 378,\n",
1686 | " 'contact': 379,\n",
1687 | " 'Stylishly': 380,\n",
1688 | " 'sophisticated': 381,\n",
1689 | " 'luxurious': 382,\n",
1690 | " 'look': 383,\n",
1691 | " 'taken': 384,\n",
1692 | " 'live': 385,\n",
1693 | " 'world': 386,\n",
1694 | " 'habitatThe': 387,\n",
1695 | " 'gets': 388,\n",
1696 | " 'souls': 389,\n",
1697 | " 'picture': 390,\n",
1698 | " 'stages': 391,\n",
1699 | " 'loneliness': 392,\n",
1700 | " 'inhabits': 393,\n",
1701 | " 'big': 394,\n",
1702 | " 'city': 395,\n",
1703 | " 'best': 396,\n",
1704 | " 'place': 397,\n",
1705 | " 'find': 398,\n",
1706 | " 'sincere': 399,\n",
1707 | " 'fulfillment': 400,\n",
1708 | " 'discerns': 401,\n",
1709 | " 'case': 402,\n",
1710 | " 'encounterThe': 403,\n",
1711 | " 'acting': 404,\n",
1712 | " 'good': 405,\n",
1713 | " 'direction': 406,\n",
1714 | " 'Steve': 407,\n",
1715 | " 'Buscemi': 408,\n",
1716 | " 'Rosario': 409,\n",
1717 | " 'Dawson': 410,\n",
1718 | " 'Carol': 411,\n",
1719 | " 'Kane': 412,\n",
1720 | " 'Imperioli': 413,\n",
1721 | " 'Adrian': 414,\n",
1722 | " 'Grenier': 415,\n",
1723 | " 'rest': 416,\n",
1724 | " 'talented': 417,\n",
1725 | " 'cast': 418,\n",
1726 | " 'come': 419,\n",
1727 | " 'aliveWe': 420,\n",
1728 | " 'wish': 421,\n",
1729 | " 'luck': 422,\n",
1730 | " 'await': 423,\n",
1731 | " 'anxiously': 424,\n",
1732 | " 'work': 425,\n",
1733 | " 'Probably': 426,\n",
1734 | " 'alltime': 427,\n",
1735 | " 'favorite': 428,\n",
1736 | " 'story': 429,\n",
1737 | " 'selflessness': 430,\n",
1738 | " 'sacrifice': 431,\n",
1739 | " 'dedication': 432,\n",
1740 | " 'noble': 433,\n",
1741 | " 'cause': 434,\n",
1742 | " 'preachy': 435,\n",
1743 | " 'boring': 436,\n",
1744 | " 'old': 437,\n",
1745 | " 'despite': 438,\n",
1746 | " 'seen': 439,\n",
1747 | " '15': 440,\n",
1748 | " 'times': 441,\n",
1749 | " 'last': 442,\n",
1750 | " '25': 443,\n",
1751 | " 'Paul': 444,\n",
1752 | " 'Lukas': 445,\n",
1753 | " 'performance': 446,\n",
1754 | " 'brings': 447,\n",
1755 | " 'tears': 448,\n",
1756 | " 'eyes': 449,\n",
1757 | " 'Bette': 450,\n",
1758 | " 'Davis': 451,\n",
1759 | " 'sympathetic': 452,\n",
1760 | " 'roles': 453,\n",
1761 | " 'delight': 454,\n",
1762 | " 'kids': 455,\n",
1763 | " 'grandma': 456,\n",
1764 | " 'says': 457,\n",
1765 | " 'dressedup': 458,\n",
1766 | " 'midgets': 459,\n",
1767 | " 'children': 460,\n",
1768 | " 'makes': 461,\n",
1769 | " 'fun': 462,\n",
1770 | " 'mothers': 463,\n",
1771 | " 'slow': 464,\n",
1772 | " 'awakening': 465,\n",
1773 | " 'whats': 466,\n",
1774 | " 'happening': 467,\n",
1775 | " 'roof': 468,\n",
1776 | " 'believable': 469,\n",
1777 | " 'startling': 470,\n",
1778 | " 'dozen': 471,\n",
1779 | " 'thumbs': 472,\n",
1780 | " 'theyd': 473,\n",
1781 | " 'sure': 474,\n",
1782 | " 'resurrection': 475,\n",
1783 | " 'dated': 476,\n",
1784 | " 'Seahunt': 477,\n",
1785 | " 'series': 478,\n",
1786 | " 'tech': 479,\n",
1787 | " 'today': 480,\n",
1788 | " 'bring': 481,\n",
1789 | " 'back': 482,\n",
1790 | " 'kid': 483,\n",
1791 | " 'excitement': 484,\n",
1792 | " 'meI': 485,\n",
1793 | " 'grew': 486,\n",
1794 | " 'black': 487,\n",
1795 | " 'white': 488,\n",
1796 | " 'TV': 489,\n",
1797 | " 'Gunsmoke': 490,\n",
1798 | " 'heros': 491,\n",
1799 | " 'weekYou': 492,\n",
1800 | " 'vote': 493,\n",
1801 | " 'comeback': 494,\n",
1802 | " 'new': 495,\n",
1803 | " 'sea': 496,\n",
1804 | " 'huntWe': 497,\n",
1805 | " 'need': 498,\n",
1806 | " 'change': 499,\n",
1807 | " 'pace': 500,\n",
1808 | " 'water': 501,\n",
1809 | " 'adventureOh': 502,\n",
1810 | " 'thank': 503,\n",
1811 | " 'outlet': 504,\n",
1812 | " 'view': 505,\n",
1813 | " 'viewpoints': 506,\n",
1814 | " 'moviesSo': 507,\n",
1815 | " 'ole': 508,\n",
1816 | " 'believe': 509,\n",
1817 | " 'wan': 510,\n",
1818 | " 'na': 511,\n",
1819 | " 'sayWould': 512,\n",
1820 | " 'nice': 513,\n",
1821 | " 'read': 514,\n",
1822 | " 'plus': 515,\n",
1823 | " 'points': 516,\n",
1824 | " 'huntIf': 517,\n",
1825 | " 'rhymes': 518,\n",
1826 | " 'lines': 519,\n",
1827 | " 'let': 520,\n",
1828 | " 'submitor': 521,\n",
1829 | " 'leave': 522,\n",
1830 | " 'doubt': 523,\n",
1831 | " 'quitIf': 524,\n",
1832 | " 'lets': 525,\n",
1833 | " 'amazing': 526,\n",
1834 | " 'fresh': 527,\n",
1835 | " 'innovative': 528,\n",
1836 | " 'idea': 529,\n",
1837 | " '70s': 530,\n",
1838 | " 'aired': 531,\n",
1839 | " '7': 532,\n",
1840 | " '8': 533,\n",
1841 | " 'brilliant': 534,\n",
1842 | " 'dropped': 535,\n",
1843 | " '1990': 536,\n",
1844 | " 'funny': 537,\n",
1845 | " 'anymore': 538,\n",
1846 | " 'continued': 539,\n",
1847 | " 'decline': 540,\n",
1848 | " 'complete': 541,\n",
1849 | " 'waste': 542,\n",
1850 | " 'todayIts': 543,\n",
1851 | " 'disgraceful': 544,\n",
1852 | " 'fallen': 545,\n",
1853 | " 'writing': 546,\n",
1854 | " 'painfully': 547,\n",
1855 | " 'bad': 548,\n",
1856 | " 'performances': 549,\n",
1857 | " 'almost': 550,\n",
1858 | " 'mildly': 551,\n",
1859 | " 'entertaining': 552,\n",
1860 | " 'respite': 553,\n",
1861 | " 'guesthosts': 554,\n",
1862 | " 'probably': 555,\n",
1863 | " 'hard': 556,\n",
1864 | " 'creator': 557,\n",
1865 | " 'handselected': 558,\n",
1866 | " 'original': 559,\n",
1867 | " 'also': 560,\n",
1868 | " 'chose': 561,\n",
1869 | " 'band': 562,\n",
1870 | " 'hacks': 563,\n",
1871 | " 'followed': 564,\n",
1872 | " 'recognize': 565,\n",
1873 | " 'brilliance': 566,\n",
1874 | " 'fit': 567,\n",
1875 | " 'replace': 568,\n",
1876 | " 'mediocrity': 569,\n",
1877 | " 'felt': 570,\n",
1878 | " 'give': 571,\n",
1879 | " 'stars': 572,\n",
1880 | " 'respect': 573,\n",
1881 | " 'made': 574,\n",
1882 | " 'huge': 575,\n",
1883 | " 'awful': 576,\n",
1884 | " 'cant': 577,\n",
1885 | " 'Encouraged': 578,\n",
1886 | " 'positive': 579,\n",
1887 | " 'comments': 580,\n",
1888 | " 'looking': 581,\n",
1889 | " 'forward': 582,\n",
1890 | " 'Bad': 583,\n",
1891 | " 'mistake': 584,\n",
1892 | " '950': 585,\n",
1893 | " 'films': 586,\n",
1894 | " 'worst': 587,\n",
1895 | " 'pacing': 588,\n",
1896 | " 'storyline': 589,\n",
1897 | " 'soundtrack': 590,\n",
1898 | " 'song': 591,\n",
1899 | " 'lame': 592,\n",
1900 | " 'country': 593,\n",
1901 | " 'tune': 594,\n",
1902 | " 'played': 595,\n",
1903 | " 'less': 596,\n",
1904 | " 'four': 597,\n",
1905 | " 'looks': 598,\n",
1906 | " 'cheap': 599,\n",
1907 | " 'extreme': 600,\n",
1908 | " 'Rarely': 601,\n",
1909 | " 'happy': 602,\n",
1910 | " 'end': 603,\n",
1911 | " 'credits': 604,\n",
1912 | " 'prevents': 605,\n",
1913 | " 'giving': 606,\n",
1914 | " '1score': 607,\n",
1915 | " 'Harvey': 608,\n",
1916 | " 'Keitel': 609,\n",
1917 | " 'least': 610,\n",
1918 | " 'making': 611,\n",
1919 | " 'bit': 612,\n",
1920 | " 'effort': 613,\n",
1921 | " 'obsessives': 614,\n",
1922 | " 'gut': 615,\n",
1923 | " 'wrenching': 616,\n",
1924 | " 'laughter': 617,\n",
1925 | " 'love': 618,\n",
1926 | " 'hell': 619,\n",
1927 | " 'mom': 620,\n",
1928 | " 'liked': 621,\n",
1929 | " 'itGreat': 622,\n",
1930 | " 'Camp': 623,\n",
1931 | " 'Phil': 624,\n",
1932 | " 'Alien': 625,\n",
1933 | " 'quirky': 626,\n",
1934 | " 'humour': 627,\n",
1935 | " 'based': 628,\n",
1936 | " 'oddness': 629,\n",
1937 | " 'everything': 630,\n",
1938 | " 'actual': 631,\n",
1939 | " 'punchlinesAt': 632,\n",
1940 | " 'odd': 633,\n",
1941 | " 'progressed': 634,\n",
1942 | " 'didnt': 635,\n",
1943 | " 'jokes': 636,\n",
1944 | " 'anymoreIts': 637,\n",
1945 | " 'low': 638,\n",
1946 | " 'budget': 639,\n",
1947 | " 'thats': 640,\n",
1948 | " 'problem': 641,\n",
1949 | " 'eventually': 642,\n",
1950 | " 'lost': 643,\n",
1951 | " 'interestI': 644,\n",
1952 | " 'imagine': 645,\n",
1953 | " 'stoner': 646,\n",
1954 | " 'currently': 647,\n",
1955 | " 'partakingFor': 648,\n",
1956 | " 'something': 649,\n",
1957 | " 'better': 650,\n",
1958 | " 'try': 651,\n",
1959 | " 'Brother': 652,\n",
1960 | " 'planet': 653,\n",
1961 | " '12': 654,\n",
1962 | " 'came': 655,\n",
1963 | " 'recall': 656,\n",
1964 | " 'scariest': 657,\n",
1965 | " 'scene': 658,\n",
1966 | " 'bird': 659,\n",
1967 | " 'eating': 660,\n",
1968 | " 'men': 661,\n",
1969 | " 'dangling': 662,\n",
1970 | " 'helplessly': 663,\n",
1971 | " 'parachutes': 664,\n",
1972 | " 'horror': 665,\n",
1973 | " 'horrorAs': 666,\n",
1974 | " 'cheesy': 667,\n",
1975 | " 'B': 668,\n",
1976 | " 'Saturday': 669,\n",
1977 | " 'afternoons': 670,\n",
1978 | " 'tired': 671,\n",
1979 | " 'formula': 672,\n",
1980 | " 'monster': 673,\n",
1981 | " 'type': 674,\n",
1982 | " 'movies': 675,\n",
1983 | " 'usually': 676,\n",
1984 | " 'included': 677,\n",
1985 | " 'hero': 678,\n",
1986 | " 'beautiful': 679,\n",
1987 | " 'woman': 680,\n",
1988 | " 'might': 681,\n",
1989 | " 'daughter': 682,\n",
1990 | " 'professor': 683,\n",
1991 | " 'resolution': 684,\n",
1992 | " 'died': 685,\n",
1993 | " 'care': 686,\n",
1994 | " 'much': 687,\n",
1995 | " 'romantic': 688,\n",
1996 | " 'angle': 689,\n",
1997 | " 'year': 690,\n",
1998 | " 'predictable': 691,\n",
1999 | " 'plots': 692,\n",
2000 | " 'unintentional': 693,\n",
2001 | " 'humorBut': 694,\n",
2002 | " 'later': 695,\n",
2003 | " 'Psycho': 696,\n",
2004 | " 'loved': 697,\n",
2005 | " 'star': 698,\n",
2006 | " 'Janet': 699,\n",
2007 | " 'Leigh': 700,\n",
2008 | " 'bumped': 701,\n",
2009 | " 'early': 702,\n",
2010 | " 'sat': 703,\n",
2011 | " 'took': 704,\n",
2012 | " 'notice': 705,\n",
2013 | " 'Since': 706,\n",
2014 | " 'screenwriters': 707,\n",
2015 | " 'scary': 708,\n",
2016 | " 'possible': 709,\n",
2017 | " 'wellworn': 710,\n",
2018 | " 'rules': 711,\n",
2019 | " 'im': 712,\n",
2020 | " 'fan': 713,\n",
2021 | " 'Bolls': 714,\n",
2022 | " 'enjoyed': 715,\n",
2023 | " 'Postal': 716,\n",
2024 | " 'maybe': 717,\n",
2025 | " 'Boll': 718,\n",
2026 | " 'apparently': 719,\n",
2027 | " 'bought': 720,\n",
2028 | " 'rights': 721,\n",
2029 | " 'Far': 722,\n",
2030 | " 'Cry': 723,\n",
2031 | " 'long': 724,\n",
2032 | " 'ago': 725,\n",
2033 | " 'game': 726,\n",
2034 | " 'finsished': 727,\n",
2035 | " 'People': 728,\n",
2036 | " 'killing': 729,\n",
2037 | " 'mercs': 730,\n",
2038 | " 'infiltrating': 731,\n",
2039 | " 'secret': 732,\n",
2040 | " 'research': 733,\n",
2041 | " 'labs': 734,\n",
2042 | " 'located': 735,\n",
2043 | " 'tropical': 736,\n",
2044 | " 'island': 737,\n",
2045 | " 'warned': 738,\n",
2046 | " 'schemed': 739,\n",
2047 | " 'together': 740,\n",
2048 | " 'along': 741,\n",
2049 | " 'legion': 742,\n",
2050 | " 'schmucks': 743,\n",
2051 | " 'Feeling': 744,\n",
2052 | " 'loneley': 745,\n",
2053 | " 'invites': 746,\n",
2054 | " 'three': 747,\n",
2055 | " 'countrymen': 748,\n",
2056 | " 'players': 749,\n",
2057 | " 'names': 750,\n",
2058 | " 'Til': 751,\n",
2059 | " 'Schweiger': 752,\n",
2060 | " 'Udo': 753,\n",
2061 | " 'Kier': 754,\n",
2062 | " 'Ralf': 755,\n",
2063 | " 'MoellerThree': 756,\n",
2064 | " 'actually': 757,\n",
2065 | " 'selfs': 758,\n",
2066 | " 'biz': 759,\n",
2067 | " 'tale': 760,\n",
2068 | " 'Jack': 761,\n",
2069 | " 'Carver': 762,\n",
2070 | " 'yes': 763,\n",
2071 | " 'German': 764,\n",
2072 | " 'hail': 765,\n",
2073 | " 'bratwurst': 766,\n",
2074 | " 'dudes': 767,\n",
2075 | " 'However': 768,\n",
2076 | " 'Tils': 769,\n",
2077 | " 'badass': 770,\n",
2078 | " 'complained': 771,\n",
2079 | " 'hes': 772,\n",
2080 | " 'staying': 773,\n",
2081 | " 'true': 774,\n",
2082 | " 'whole': 775,\n",
2083 | " 'carver': 776,\n",
2084 | " 'perspective': 777,\n",
2085 | " 'dont': 778,\n",
2086 | " 'looked': 779,\n",
2087 | " 'kicking': 780,\n",
2088 | " 'beyond': 781,\n",
2089 | " 'demented': 782,\n",
2090 | " 'evil': 783,\n",
2091 | " 'mad': 784,\n",
2092 | " 'scientist': 785,\n",
2093 | " 'Dr': 786,\n",
2094 | " 'Krieger': 787,\n",
2095 | " 'GeneticallyMutatedsoldiers': 788,\n",
2096 | " 'GMS': 789,\n",
2097 | " 'Performing': 790,\n",
2098 | " 'topsecret': 791,\n",
2099 | " 'reminds': 792,\n",
2100 | " 'SPOILER': 793,\n",
2101 | " 'Vancouver': 794,\n",
2102 | " 'reason': 795,\n",
2103 | " 'Thats': 796,\n",
2104 | " 'palm': 797,\n",
2105 | " 'trees': 798,\n",
2106 | " 'Instead': 799,\n",
2107 | " 'rich': 800,\n",
2108 | " 'lumberjackwoods': 801,\n",
2109 | " 'havent': 802,\n",
2110 | " 'gone': 803,\n",
2111 | " 'FAR': 804,\n",
2112 | " 'started': 805,\n",
2113 | " 'CRY': 806,\n",
2114 | " 'mehehe': 807,\n",
2115 | " 'can': 808,\n",
2116 | " 'not': 809,\n",
2117 | " 'stay': 810,\n",
2118 | " 'shenanigans': 811,\n",
2119 | " 'delivers': 812,\n",
2120 | " 'meaning': 813,\n",
2121 | " 'suckThere': 814,\n",
2122 | " 'mentioning': 815,\n",
2123 | " 'imply': 816,\n",
2124 | " 'areas': 817,\n",
2125 | " 'boat': 818,\n",
2126 | " 'cromedalbino': 819,\n",
2127 | " 'squad': 820,\n",
2128 | " 'enters': 821,\n",
2129 | " 'laugh': 822,\n",
2130 | " 'reeks': 823,\n",
2131 | " 'scheisse': 824,\n",
2132 | " 'poop': 825,\n",
2133 | " 'simpletons': 826,\n",
2134 | " 'take': 827,\n",
2135 | " 'wiff': 828,\n",
2136 | " 'ahead': 829,\n",
2137 | " 'BTW': 830,\n",
2138 | " 'annoying': 831,\n",
2139 | " 'sidekick': 832,\n",
2140 | " 'shoot': 833,\n",
2141 | " 'minutes': 834,\n",
2142 | " 'screen': 835,\n",
2143 | " 'ShakespeareShakespeare': 836,\n",
2144 | " 'lostI': 837,\n",
2145 | " 'appreciate': 838,\n",
2146 | " 'trying': 839,\n",
2147 | " 'Shakespeare': 840,\n",
2148 | " 'masses': 841,\n",
2149 | " 'ruin': 842,\n",
2150 | " 'goodIs': 843,\n",
2151 | " 'Scottish': 844,\n",
2152 | " 'Play': 845,\n",
2153 | " 'certain': 846,\n",
2154 | " 'Rev': 847,\n",
2155 | " 'Bowdler': 848,\n",
2156 | " 'hence': 849,\n",
2157 | " 'bowdlerization': 850,\n",
2158 | " 'tried': 851,\n",
2159 | " 'Victorian': 852,\n",
2160 | " 'eraIn': 853,\n",
2161 | " 'words': 854,\n",
2162 | " 'improve': 855,\n",
2163 | " 'perfectionI': 856,\n",
2164 | " 'write': 857,\n",
2165 | " 'ten': 858,\n",
2166 | " 'text': 859,\n",
2167 | " 'English': 860,\n",
2168 | " 'composition': 861,\n",
2169 | " 'forte': 862,\n",
2170 | " 'keep': 863,\n",
2171 | " 'saying': 864,\n",
2172 | " 'cut': 865,\n",
2173 | " 'fantastic': 866,\n",
2174 | " 'prisoners': 867,\n",
2175 | " 'famous': 868,\n",
2176 | " 'george': 869,\n",
2177 | " 'clooney': 870,\n",
2178 | " 'Im': 871,\n",
2179 | " 'roll': 872,\n",
2180 | " 'Another': 873,\n",
2181 | " 'man': 874,\n",
2182 | " 'constant': 875,\n",
2183 | " 'sorrow': 876,\n",
2184 | " 'recommand': 877,\n",
2185 | " 'everybody': 878,\n",
2186 | " 'Greetings': 879,\n",
2187 | " 'Bart': 880,\n",
2188 | " 'Kind': 881,\n",
2189 | " 'drawn': 882,\n",
2190 | " 'erotic': 883,\n",
2191 | " 'amateurish': 884,\n",
2192 | " 'unbelievable': 885,\n",
2193 | " 'bits': 886,\n",
2194 | " 'Sort': 887,\n",
2195 | " 'school': 888,\n",
2196 | " 'project': 889,\n",
2197 | " 'Rosanna': 890,\n",
2198 | " 'Arquette': 891,\n",
2199 | " 'thinking': 892,\n",
2200 | " 'stock': 893,\n",
2201 | " 'bizarre': 894,\n",
2202 | " 'supposed': 895,\n",
2203 | " 'Midwest': 896,\n",
2204 | " 'town': 897,\n",
2205 | " 'Pretty': 898,\n",
2206 | " 'involved': 899,\n",
2207 | " 'lessons': 900,\n",
2208 | " 'learned': 901,\n",
2209 | " 'insights': 902,\n",
2210 | " 'stilted': 903,\n",
2211 | " 'quite': 904,\n",
2212 | " 'ridiculous': 905,\n",
2213 | " 'lots': 906,\n",
2214 | " 'skin': 907,\n",
2215 | " 'intrigues': 908,\n",
2216 | " 'videotaped': 909,\n",
2217 | " 'nonsenseWhat': 910,\n",
2218 | " 'bisexual': 911,\n",
2219 | " 'relationship': 912,\n",
2220 | " 'nowhere': 913,\n",
2221 | " 'heterosexual': 914,\n",
2222 | " 'encounters': 915,\n",
2223 | " 'absurd': 916,\n",
2224 | " 'dance': 917,\n",
2225 | " 'stereotyped': 918,\n",
2226 | " 'Give': 919,\n",
2227 | " 'pass': 920,\n",
2228 | " 'million': 921,\n",
2229 | " 'miles': 922,\n",
2230 | " 'wasted': 923,\n",
2231 | " 'could': 924,\n",
2232 | " 'spent': 925,\n",
2233 | " 'starving': 926,\n",
2234 | " 'Aids': 927,\n",
2235 | " 'Africa': 928,\n",
2236 | " 'simply': 929,\n",
2237 | " 'remade': 930,\n",
2238 | " 'fails': 931,\n",
2239 | " 'capture': 932,\n",
2240 | " 'flavor': 933,\n",
2241 | " 'terror': 934,\n",
2242 | " '1963': 935,\n",
2243 | " 'title': 936,\n",
2244 | " 'Liam': 937,\n",
2245 | " 'Neeson': 938,\n",
2246 | " 'excellent': 939,\n",
2247 | " 'always': 940,\n",
2248 | " 'holds': 941,\n",
2249 | " 'exception': 942,\n",
2250 | " 'Owen': 943,\n",
2251 | " 'Wilson': 944,\n",
2252 | " 'feel': 945,\n",
2253 | " 'character': 946,\n",
2254 | " 'Luke': 947,\n",
2255 | " 'major': 948,\n",
2256 | " 'fault': 949,\n",
2257 | " 'version': 950,\n",
2258 | " 'strayed': 951,\n",
2259 | " 'Shirley': 952,\n",
2260 | " 'Jackson': 953,\n",
2261 | " 'attempts': 954,\n",
2262 | " 'grandiose': 955,\n",
2263 | " 'thrill': 956,\n",
2264 | " 'earlier': 957,\n",
2265 | " 'trade': 958,\n",
2266 | " 'snazzier': 959,\n",
2267 | " 'special': 960,\n",
2268 | " 'effects': 961,\n",
2269 | " 'enjoy': 962,\n",
2270 | " 'friction': 963,\n",
2271 | " 'older': 964,\n",
2272 | " 'top': 965,\n",
2273 | " 'Horrible': 966,\n",
2274 | " 'wasnt': 967,\n",
2275 | " 'continuous': 968,\n",
2276 | " 'minute': 969,\n",
2277 | " 'fight': 970,\n",
2278 | " 'chance': 971,\n",
2279 | " 'development': 972,\n",
2280 | " 'busy': 973,\n",
2281 | " 'running': 974,\n",
2282 | " 'sword': 975,\n",
2283 | " 'emotional': 976,\n",
2284 | " 'attachment': 977,\n",
2285 | " 'except': 978,\n",
2286 | " 'machine': 979,\n",
2287 | " 'wanted': 980,\n",
2288 | " 'destroy': 981,\n",
2289 | " 'Scenes': 982,\n",
2290 | " 'blatantly': 983,\n",
2291 | " 'stolen': 984,\n",
2292 | " 'LOTR': 985,\n",
2293 | " 'Star': 986,\n",
2294 | " 'Wars': 987,\n",
2295 | " 'Matrix': 988,\n",
2296 | " 'ExamplesThe': 989,\n",
2297 | " 'ghost': 990,\n",
2298 | " 'final': 991,\n",
2299 | " 'Yoda': 992,\n",
2300 | " 'Obee': 993,\n",
2301 | " 'Vader': 994,\n",
2302 | " 'spider': 995,\n",
2303 | " 'beginning': 996,\n",
2304 | " 'Frodo': 997,\n",
2305 | " 'attacked': 998,\n",
2306 | " 'Return': 999,\n",
2307 | " ...}"
2308 | ]
2309 | },
2310 | "execution_count": 13,
2311 | "metadata": {},
2312 | "output_type": "execute_result"
2313 | }
2314 | ],
2315 | "source": [
2316 | "vocab_to_index"
2317 | ]
2318 | },
2319 | {
2320 | "cell_type": "code",
2321 | "execution_count": 14,
2322 | "metadata": {},
2323 | "outputs": [
2324 | {
2325 | "data": {
2326 | "text/plain": [
2327 | "256140"
2328 | ]
2329 | },
2330 | "execution_count": 14,
2331 | "metadata": {},
2332 | "output_type": "execute_result"
2333 | }
2334 | ],
2335 | "source": [
2336 | "len(vocab_to_index)"
2337 | ]
2338 | },
2339 | {
2340 | "cell_type": "code",
2341 | "execution_count": 16,
2342 | "metadata": {
2343 | "scrolled": true
2344 | },
2345 | "outputs": [],
2346 | "source": [
2347 | "result = [[vocab_to_index[word] for word in y] for y in vocab_lst]"
2348 | ]
2349 | },
2350 | {
2351 | "cell_type": "code",
2352 | "execution_count": 17,
2353 | "metadata": {
2354 | "scrolled": true
2355 | },
2356 | "outputs": [
2357 | {
2358 | "data": {
2359 | "text/plain": [
2360 | "[624,\n",
2361 | " 625,\n",
2362 | " 184,\n",
2363 | " 626,\n",
2364 | " 586,\n",
2365 | " 627,\n",
2366 | " 628,\n",
2367 | " 102,\n",
2368 | " 629,\n",
2369 | " 630,\n",
2370 | " 194,\n",
2371 | " 631,\n",
2372 | " 632,\n",
2373 | " 13,\n",
2374 | " 633,\n",
2375 | " 92,\n",
2376 | " 537,\n",
2377 | " 299,\n",
2378 | " 634,\n",
2379 | " 635,\n",
2380 | " 398,\n",
2381 | " 636,\n",
2382 | " 629,\n",
2383 | " 537,\n",
2384 | " 637,\n",
2385 | " 638,\n",
2386 | " 639,\n",
2387 | " 310,\n",
2388 | " 640,\n",
2389 | " 78,\n",
2390 | " 641,\n",
2391 | " 92,\n",
2392 | " 284,\n",
2393 | " 233,\n",
2394 | " 642,\n",
2395 | " 643,\n",
2396 | " 644,\n",
2397 | " 645,\n",
2398 | " 310,\n",
2399 | " 81,\n",
2400 | " 84,\n",
2401 | " 646,\n",
2402 | " 647,\n",
2403 | " 648,\n",
2404 | " 649,\n",
2405 | " 325,\n",
2406 | " 650,\n",
2407 | " 651,\n",
2408 | " 652,\n",
2409 | " 373,\n",
2410 | " 653]"
2411 | ]
2412 | },
2413 | "execution_count": 17,
2414 | "metadata": {},
2415 | "output_type": "execute_result"
2416 | }
2417 | ],
2418 | "source": [
2419 | "result[10]"
2420 | ]
2421 | },
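2422 | {
2423 | "cell_type": "markdown",
2424 | "metadata": {},
2425 | "source": [
2426 | "The index sequences above vary in length, so before batching they are usually padded or truncated to a fixed length. A minimal sketch, assuming index 0 can be reserved for padding (if the vocabulary already assigns 0 to a word, shift the indices first; `pad_sequence` and `max_len` are illustrative names, not part of the original notebook):"
2427 | ]
2428 | },
2429 | {
2430 | "cell_type": "code",
2431 | "execution_count": null,
2432 | "metadata": {},
2433 | "outputs": [],
2434 | "source": [
2435 | "def pad_sequence(seq, max_len=100, pad_idx=0):\n",
2436 | "    # keep the first max_len indices, then right-pad with pad_idx up to max_len\n",
2437 | "    return seq[:max_len] + [pad_idx] * (max_len - len(seq[:max_len]))\n",
2438 | "\n",
2439 | "padded = [pad_sequence(seq) for seq in result]\n",
2440 | "len(padded[10])"
2441 | ]
2442 | },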
2422 | {
2423 | "cell_type": "code",
2424 | "execution_count": 18,
2425 | "metadata": {},
2426 | "outputs": [],
2427 | "source": [
2428 | "# KOREAN : https://github.com/e9t/nsmc\n",
2429 | "# ENGLISH : https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/kernels"
2430 | ]
2431 | }
2432 | ],
2433 | "metadata": {
2434 | "kernelspec": {
2435 | "display_name": "Python 3",
2436 | "language": "python",
2437 | "name": "python3"
2438 | },
2439 | "language_info": {
2440 | "codemirror_mode": {
2441 | "name": "ipython",
2442 | "version": 3
2443 | },
2444 | "file_extension": ".py",
2445 | "mimetype": "text/x-python",
2446 | "name": "python",
2447 | "nbconvert_exporter": "python",
2448 | "pygments_lexer": "ipython3",
2449 | "version": "3.6.8"
2450 | }
2451 | },
2452 | "nbformat": 4,
2453 | "nbformat_minor": 4
2454 | }
2455 |
--------------------------------------------------------------------------------
/2_Representation_Vector.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import torch\n",
10 | "from torch.autograd import Variable\n",
11 | "import torch.nn as nn\n",
12 | "import torch.nn.functional as F\n",
13 | "import torch.optim as optim"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 2,
19 | "metadata": {},
20 | "outputs": [
21 | {
22 | "name": "stdout",
23 | "output_type": "stream",
24 | "text": [
25 | "vocab_size: 49\n"
26 | ]
27 | }
28 | ],
29 | "source": [
30 | "CONTEXT_SIZE = 2\n",
31 | "\n",
32 | "text = \"\"\"We are about to study the idea of a computational process.\n",
33 | "Computational processes are abstract beings that inhabit computers.\n",
34 | "As they evolve, processes manipulate other abstract things called data.\n",
35 | "The evolution of a process is directed by a pattern of rules\n",
36 | "called a program. People create programs to direct processes. In effect,\n",
37 | "we conjure the spirits of the computer with our spells.\"\"\".split()\n",
38 | "\n",
39 | "vocab = set(text)\n",
40 | "vocab_size = len(vocab)\n",
41 | "print('vocab_size:', vocab_size)\n",
42 | "\n",
43 | "w2i = {w: i for i, w in enumerate(vocab)}\n",
44 | "i2w = {i: w for i, w in enumerate(vocab)}"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 3,
50 | "metadata": {},
51 | "outputs": [
52 | {
53 | "name": "stdout",
54 | "output_type": "stream",
55 | "text": [
56 | "cbow sample (['We', 'are', 'to', 'study'], 'about')\n",
57 | "skipgram sample ('about', 'We', 1)\n"
58 | ]
59 | }
60 | ],
61 | "source": [
62 | "def create_cbow_dataset(text):\n",
63 | " data = []\n",
64 | " for i in range(2, len(text) - 2):\n",
65 | " context = [text[i - 2], text[i - 1],\n",
66 | " text[i + 1], text[i + 2]]\n",
67 | " target = text[i]\n",
68 | " data.append((context, target))\n",
69 | " return data\n",
70 | "\n",
71 | "def create_skipgram_dataset(text):\n",
72 | " import random\n",
73 | " data = []\n",
74 | " for i in range(2, len(text) - 2):\n",
75 | " data.append((text[i], text[i-2], 1))\n",
76 | " data.append((text[i], text[i-1], 1))\n",
77 | " data.append((text[i], text[i+1], 1))\n",
78 | " data.append((text[i], text[i+2], 1))\n",
79 | " for _ in range(4):\n",
80 | " if random.random() < 0.5 or i >= len(text) - 3:\n",
81 | " rand_id = random.randint(0, i-1)\n",
82 | " else:\n",
83 | " rand_id = random.randint(i+3, len(text)-1)\n",
84 | " data.append((text[i], text[rand_id], 0))\n",
85 | " return data\n",
86 | "\n",
87 | "cbow_train = create_cbow_dataset(text)\n",
88 | "skipgram_train = create_skipgram_dataset(text)\n",
89 | "print('cbow sample', cbow_train[0])\n",
90 | "print('skipgram sample', skipgram_train[0])"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": 4,
96 | "metadata": {},
97 | "outputs": [],
98 | "source": [
99 | "class CBOW(nn.Module):\n",
100 | " def __init__(self, vocab_size, embd_size, context_size, hidden_size):\n",
101 | " super(CBOW, self).__init__()\n",
102 | " self.embeddings = nn.Embedding(vocab_size, embd_size)\n",
103 | " self.linear1 = nn.Linear(2*context_size*embd_size, hidden_size)\n",
104 | " self.linear2 = nn.Linear(hidden_size, vocab_size)\n",
105 | " \n",
106 | " def forward(self, inputs):\n",
107 | " embedded = self.embeddings(inputs).view((1, -1))\n",
108 | " hid = F.relu(self.linear1(embedded))\n",
109 | " out = self.linear2(hid)\n",
110 | " log_probs = F.log_softmax(out)\n",
111 | " return log_probs\n",
112 | "\n",
113 | "class SkipGram(nn.Module):\n",
114 | " def __init__(self, vocab_size, embd_size):\n",
115 | " super(SkipGram, self).__init__()\n",
116 | " self.embeddings = nn.Embedding(vocab_size, embd_size)\n",
117 | " \n",
118 | " def forward(self, focus, context):\n",
119 | " embed_focus = self.embeddings(focus).view((1, -1))\n",
120 | " embed_ctx = self.embeddings(context).view((1, -1))\n",
121 | " score = torch.mm(embed_focus, torch.t(embed_ctx))\n",
122 | " log_probs = F.logsigmoid(score)\n",
123 | " \n",
124 | " return log_probs"
125 | ]
126 | },
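127 | {
128 | "cell_type": "markdown",
129 | "metadata": {},
130 | "source": [
131 | "A quick shape check of the CBOW forward pass (an added sketch, not part of the original run; `_check` and `_ctx` are illustrative names, and the literal sizes 100 and 64 match the hyperparameters used below): the four context ids are embedded, flattened to a (1, 400) row, and mapped to (1, vocab_size) log-probabilities."
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": null,
137 | "metadata": {},
138 | "outputs": [],
139 | "source": [
140 | "_check = CBOW(vocab_size, 100, CONTEXT_SIZE, 64)\n",
141 | "_ctx = torch.LongTensor([0, 1, 2, 3])  # any 2*CONTEXT_SIZE = 4 token ids\n",
142 | "print(_check(_ctx).shape)  # expected: torch.Size([1, 49])"
143 | ]
144 | },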
127 | {
128 | "cell_type": "code",
129 | "execution_count": 5,
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "embd_size = 100\n",
134 | "learning_rate = 0.001\n",
135 | "n_epoch = 30\n",
136 | "\n",
137 | "def train_cbow():\n",
138 | " hidden_size = 64\n",
139 | " losses = []\n",
140 | " loss_fn = nn.NLLLoss()\n",
141 | " model = CBOW(vocab_size, embd_size, CONTEXT_SIZE, hidden_size)\n",
142 | " print(model)\n",
143 | " optimizer = optim.SGD(model.parameters(), lr=learning_rate)\n",
144 | "\n",
145 | " for epoch in range(n_epoch):\n",
146 | " total_loss = .0\n",
147 | " for context, target in cbow_train:\n",
148 | " ctx_idxs = [w2i[w] for w in context]\n",
149 | " ctx_var = Variable(torch.LongTensor(ctx_idxs))\n",
150 | "\n",
151 | " model.zero_grad()\n",
152 | " log_probs = model(ctx_var)\n",
153 | "\n",
154 | " loss = loss_fn(log_probs, Variable(torch.LongTensor([w2i[target]])))\n",
155 | "\n",
156 | " loss.backward()\n",
157 | " optimizer.step()\n",
158 | " total_loss += loss.data\n",
159 | " losses.append(total_loss)\n",
160 | " return model, losses\n",
161 | "\n",
162 | "def train_skipgram():\n",
163 | " losses = []\n",
164 | " loss_fn = nn.MSELoss()\n",
165 | " model = SkipGram(vocab_size, embd_size)\n",
166 | " print(model)\n",
167 | " optimizer = optim.SGD(model.parameters(), lr=learning_rate)\n",
168 | " \n",
169 | " for epoch in range(n_epoch):\n",
170 | " total_loss = .0\n",
171 | " for in_w, out_w, target in skipgram_train:\n",
172 | " in_w_var = Variable(torch.LongTensor([w2i[in_w]]))\n",
173 | " out_w_var = Variable(torch.LongTensor([w2i[out_w]]))\n",
174 | " \n",
175 | " model.zero_grad()\n",
176 | " log_probs = model(in_w_var, out_w_var)\n",
177 | " loss = loss_fn(log_probs[0], Variable(torch.Tensor([target])))\n",
178 | " \n",
179 | " loss.backward()\n",
180 | " optimizer.step()\n",
181 | "\n",
182 | " total_loss += loss.data\n",
183 | " losses.append(total_loss)\n",
184 | " return model, losses"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 6,
190 | "metadata": {},
191 | "outputs": [
192 | {
193 | "name": "stdout",
194 | "output_type": "stream",
195 | "text": [
196 | "CBOW(\n",
197 | " (embeddings): Embedding(49, 100)\n",
198 | " (linear1): Linear(in_features=400, out_features=64, bias=True)\n",
199 | " (linear2): Linear(in_features=64, out_features=49, bias=True)\n",
200 | ")\n"
201 | ]
202 | },
203 | {
204 | "name": "stderr",
205 | "output_type": "stream",
206 | "text": [
207 | "c:\\users\\justin\\venv\\lib\\site-packages\\ipykernel_launcher.py:12: UserWarning: Implicit dimension choice for log_softmax has been deprecated. Change the call to include dim=X as an argument.\n",
208 | " if sys.path[0] == '':\n"
209 | ]
210 | },
211 | {
212 | "name": "stdout",
213 | "output_type": "stream",
214 | "text": [
215 | "SkipGram(\n",
216 | " (embeddings): Embedding(49, 100)\n",
217 | ")\n"
218 | ]
219 | }
220 | ],
221 | "source": [
222 | "cbow_model, cbow_losses = train_cbow()\n",
223 | "sg_model, sg_losses = train_skipgram()"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": 7,
229 | "metadata": {},
230 | "outputs": [],
231 | "source": [
232 | "def test_cbow(test_data, model):\n",
233 | " print('====Test CBOW===')\n",
234 | " correct_ct = 0\n",
235 | " for ctx, target in test_data:\n",
236 | " ctx_idxs = [w2i[w] for w in ctx]\n",
237 | " ctx_var = Variable(torch.LongTensor(ctx_idxs))\n",
238 | "\n",
239 | " model.zero_grad()\n",
240 | " log_probs = model(ctx_var)\n",
241 | " _, predicted = torch.max(log_probs.data, 1)\n",
242 | " predicted_word = i2w[predicted.item()]\n",
243 | " print('predicted:', predicted_word)\n",
244 | " print('label :', target)\n",
245 | " if predicted_word == target:\n",
246 | " correct_ct += 1\n",
247 | " \n",
248 | " print('Accuracy: {:.1f}% ({:d}/{:d})'.format(correct_ct/len(test_data)*100, correct_ct, len(test_data)))\n",
249 | "\n",
250 | "def test_skipgram(test_data, model):\n",
251 | " print('====Test SkipGram===')\n",
252 | " correct_ct = 0\n",
253 | " for in_w, out_w, target in test_data:\n",
254 | " in_w_var = Variable(torch.LongTensor([w2i[in_w]]))\n",
255 | " out_w_var = Variable(torch.LongTensor([w2i[out_w]]))\n",
256 | "\n",
257 | " model.zero_grad()\n",
258 | " log_probs = model(in_w_var, out_w_var)\n",
259 | " _, predicted = torch.max(log_probs.data, 1)\n",
260 | " predicted = predicted[0]\n",
261 | " if predicted == target:\n",
262 | " correct_ct += 1\n",
263 | "\n",
264 | " print('Accuracy: {:.1f}% ({:d}/{:d})'.format(correct_ct/len(test_data)*100, correct_ct, len(test_data)))"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": 8,
270 | "metadata": {},
271 | "outputs": [
272 | {
273 | "name": "stdout",
274 | "output_type": "stream",
275 | "text": [
276 | "====Test CBOW===\n",
277 | "predicted: about\n",
278 | "label : about\n",
279 | "predicted: to\n",
280 | "label : to\n",
281 | "predicted: study\n",
282 | "label : study\n",
283 | "predicted: the\n",
284 | "label : the\n",
285 | "predicted: idea\n",
286 | "label : idea\n",
287 | "predicted: of\n",
288 | "label : of\n",
289 | "predicted: a\n",
290 | "label : a\n",
291 | "predicted: computational\n",
292 | "label : computational\n",
293 | "predicted: process.\n",
294 | "label : process.\n",
295 | "predicted: Computational\n",
296 | "label : Computational\n",
297 | "predicted: processes\n",
298 | "label : processes\n",
299 | "predicted: are\n",
300 | "label : are\n",
301 | "predicted: abstract\n",
302 | "label : abstract\n",
303 | "predicted: beings\n",
304 | "label : beings\n",
305 | "predicted: that\n",
306 | "label : that\n",
307 | "predicted: inhabit\n",
308 | "label : inhabit\n",
309 | "predicted: computers.\n",
310 | "label : computers.\n",
311 | "predicted: As\n",
312 | "label : As\n",
313 | "predicted: they\n",
314 | "label : they\n",
315 | "predicted: evolve,\n",
316 | "label : evolve,\n",
317 | "predicted: processes\n",
318 | "label : processes\n",
319 | "predicted: manipulate\n",
320 | "label : manipulate\n",
321 | "predicted: other\n",
322 | "label : other\n",
323 | "predicted: abstract\n",
324 | "label : abstract\n",
325 | "predicted: things\n",
326 | "label : things\n",
327 | "predicted: called\n",
328 | "label : called\n",
329 | "predicted: data.\n",
330 | "label : data.\n",
331 | "predicted: The\n",
332 | "label : The\n",
333 | "predicted: evolution\n",
334 | "label : evolution\n",
335 | "predicted: of\n",
336 | "label : of\n",
337 | "predicted: a\n",
338 | "label : a\n",
339 | "predicted: process\n",
340 | "label : process\n",
341 | "predicted: is\n",
342 | "label : is\n",
343 | "predicted: directed\n",
344 | "label : directed\n",
345 | "predicted: by\n",
346 | "label : by\n",
347 | "predicted: a\n",
348 | "label : a\n",
349 | "predicted: pattern\n",
350 | "label : pattern\n",
351 | "predicted: of\n",
352 | "label : of\n",
353 | "predicted: rules\n",
354 | "label : rules\n",
355 | "predicted: called\n",
356 | "label : called\n",
357 | "predicted: a\n",
358 | "label : a\n",
359 | "predicted: program.\n",
360 | "label : program.\n",
361 | "predicted: People\n",
362 | "label : People\n",
363 | "predicted: create\n",
364 | "label : create\n",
365 | "predicted: programs\n",
366 | "label : programs\n",
367 | "predicted: to\n",
368 | "label : to\n",
369 | "predicted: direct\n",
370 | "label : direct\n",
371 | "predicted: processes.\n",
372 | "label : processes.\n",
373 | "predicted: In\n",
374 | "label : In\n",
375 | "predicted: effect,\n",
376 | "label : effect,\n",
377 | "predicted: we\n",
378 | "label : we\n",
379 | "predicted: conjure\n",
380 | "label : conjure\n",
381 | "predicted: the\n",
382 | "label : the\n",
383 | "predicted: spirits\n",
384 | "label : spirits\n",
385 | "predicted: of\n",
386 | "label : of\n",
387 | "predicted: the\n",
388 | "label : the\n",
389 | "predicted: computer\n",
390 | "label : computer\n",
391 | "predicted: of\n",
392 | "label : with\n",
393 | "Accuracy: 98.3% (57/58)\n",
394 | "------\n",
395 | "====Test SkipGram===\n",
396 | "Accuracy: 50.0% (232/464)\n"
397 | ]
398 | },
399 | {
400 | "name": "stderr",
401 | "output_type": "stream",
402 | "text": [
403 | "c:\\users\\justin\\venv\\lib\\site-packages\\ipykernel_launcher.py:12: UserWarning: Implicit dimension choice for log_softmax has been deprecated. Change the call to include dim=X as an argument.\n",
404 | " if sys.path[0] == '':\n"
405 | ]
406 | }
407 | ],
408 | "source": [
409 | "test_cbow(cbow_train, cbow_model)\n",
410 | "print('------')\n",
411 | "test_skipgram(skipgram_train, sg_model)"
412 | ]
413 | },
414 | {
415 | "cell_type": "code",
416 | "execution_count": 9,
417 | "metadata": {},
418 | "outputs": [
428 | {
429 | "data": {
430 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXgAAAD4CAYAAADmWv3KAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nO3dd3gWVfrG8e+TApHeQguhCSJVwAgokFhAEAQUG6KCIEVFpbiu+tPVdd3irhKqgqAIuBQbAhZKZDUJ3VCklwACAYQgvbfz+yMvbhYpISRM3jf357pyMTkzb/LMNXIznjlzjjnnEBGRwBPkdQEiIpI9FPAiIgFKAS8iEqAU8CIiAUoBLyISoEK8LgCgRIkSrmLFil6XISLiVxYtWrTbORd+of05IuArVqxIUlKS12WIiPgVM9t8sf3qohERCVAKeBGRAKWAFxEJUAp4EZEApYAXEQlQCngRkQClgBcRCVB+HfB7Dp/gja9Wcvj4Ka9LERHJcfw64Gcn72b03J9p9+4c1u886HU5IiI5il8HfNsbyvLvJxqy78gJ2g6dw+Ql27wuSUQkx/DrgAdoXKUE3zzXlNoRhenzyVL+78vlHDt52uuyREQ85/cBD1CqUBjjuzekZ0xlxi/Ywv3D57Ll1yNelyUi4qmACHiAkOAgXr6rOiM7RbHl1yO0HpLIjJW/eF2WiIhnAibgz2peoxTfPNeUisXz0/PjRfz929WcPH3G67JERK66gAt4gMhi+fj8qZt5rFEFRiRspOPI+fyy/5jXZYmIXFUBGfAAeUOCefOeWgzqUJeV2w/QenAiietTvS5LROSqCdiAP6td3QimPtOE4gXy0GnUQv45fY26bEQkV7hkwJtZpJl9b2arzWylmfX2tb9tZmvMbJmZfWlmRdJ95mUzSzaztWbWIjtPICOqlCzAlF5N6HBTJMN+2MCD789j6x6NshGRwJaRO/hTwPPOuepAI6CXmdUA4oBazrk6wDrgZQDfvg5ATaAl8J6ZBWdH8ZfjmjzB/KN9HYZ2rEfyzkO0GpzItOU7vC5LRCTbXDLgnXM7nHOLfdsHgdVAhHNupnPu7CQw84Fyvu12wETn3HHn3CYgGWiQ9aVnzt11yvJt76ZUDi/AU+MW84pejBKRAHVZffBmVhGoByw4Z1dXYJpvOwLYmm5fiq/t3J/Vw8ySzCwpNfXqPvyMLJaPz3reTM/oyoxbsIV2QzWXjYgEngwHvJkVAL4A+jjnDqRrf4W0bpxxZ5vO83H3uwbnRjjnopxzUeHh4ZdXdRbIExLEy62qM6ZrA3YfOk6bobOZuHALzv2uVBERv5ShgDezUNLCfZxzblK69s7A3cAj7r/JmAJEpvt4OWB71pSb9WKuC2da76bcWKEoL01azrMTlnDg2EmvyxIRuWIZGUVjwIfAaudcbLr2lsCLQFvnXPohKVOBDmaW18wqAVWBhVlbdtYqWSiMsV0b8kKLakxb8QutByeyaPNer8sSEbkiGbmDbww8BtxuZkt9X62AoUBBIM7XNhzAObcS+BRYBUwHejnncvxTzOAgo9dtVfi0ZyOcgwffn8eAuHWc0ph5EfFTlhP6nKOiolxSUpLXZfzmwLGT/HnqSiYt3kbdyCIMfKguFUvk97osEZH/YWaLnHNRF9of8G+yZkahsFBiH6zL0I712JiaNmZeD2BFxN8o4C/i7jplmdE3mrqRRXhp0nJ6fryIPYdPeF2WiEiGKOAvoUzha/j3Ew15pVV1flibSouBCcSv06RlIpLzKeAzICjI6B5dmcm9GlM0XyidRy3kz1NX6g1YEcnRFPCXoUbZQkx9pgldGldk9NyfaTNkNiu37/e6LBGR81LAX6aw0GBeb1OTMV0bsP/oSe55dw5D/7NewylFJMdRwGdSzHXhzOgTTYuapXln5jruGz6PDamHvC5LROQ3CvgrUDR/HoZ2rM+Qh+ux+dfDtBqUyEdzNnHmjIZTioj3FPBZoM0NZZnZJ5pbri3OG1+t4pEPFpCyVwuKiIi3FPBZpGShMEY9fhNvta/NspR9tByYyKc/btXLUSLiGQV8FjIzOjQoz/Q+0dQsW4g/frGMbmOS2HXwmNeliUgupIDPBpHF8jGheyNebV2dxOTdtBiQwDfLtDygiFxdCvhsEhRkdGtamW+fa0L5YvnoNX4xvcYtZveh416XJiK5hAI+m1UpWZDPn7qFF1pUI27VTu4ckMDUn7arb15Esp0C/ioIDQ6i121V+Pq5JkQWy8dzE5bQ8+NF6psXkWylgL+KritVkC+evJmX77qeH9al0jw2gUmLU3Q3LyLZQgF/lYUEB9Ez5lqm9W5KlZIF6PfpTzwxJolf9utuXkSylgLeI9eGF+DTnjfz2t01mLthN81j4/nkRy0qIiJZRwHvoeAgo2uTSszoE02NsoV48YvldBq1UG/BikiWUMDnABWK52dC90a82a4mizbvpcWABMbM/Vlz2ojIFVHA5xBBQcZjN1dkZt9obqxYjNenruSB9+eRvOug16WJiJ9SwOcw5YrmY0yXm4h98AY2pB6i1aDZDJ61nhOnNN+8iFweBXwOZGa0r1+O7/rFcGfNUsTGraPt0Nks3brP69JExI8o4HOwEgXyMrRjfT7oFMW+Iydp/94c/vr1Ko6cOOV1aSLiBxTwfqBZjVLM7BfNww3K88HsTbQYmMCc5N1elyUiOZwC3k8UCgvlb/fWZmKPRoQEBfHIBwt44bOf2HfkhNeliUgOpYD3M40qF2da76Y8deu1TFqyjWax8Xy9TJOXicjvKeD9UFhoMC+2vJ6pzzSmTOFreGb8ErqPTWL7vqNelyYiOcglA97MIs3sezNbbWYrzay3r72YmcWZ2Xrfn0V97WZmg80s2cyWmVn97D6J3Kpm2cJ8+fQtvNKqOrOT06Y7GDtPL0iJSJqM3MGfAp53zlUHGgG9zKwG8BIwyzlXFZjl+x7gLqCq76sHMCzLq5bfhAQH0T26MjP7xFC/QlFem7KS+4fPZd1OvSAlkttdMuCdczucc4t92weB1UAE0A4Y4ztsDHCPb7sdMNalmQ8UMbMyWV65/I/yxfMxtmsDYh+8gU27D9N6cCKxces4fuq016WJiEcuqw/ezCoC9YAFQCnn3A5I+0cAKOk7LALYmu5jKb62c39WDzNLMrOk1NTUy69cfif9C1Kta5dh8Kz1tBqUSNLPe7wuTUQ8kOGAN7MCwBdAH+fcgYsdep6233UKO+dGOOeinHNR4eHhGS1DMqB4gbwM7FCP0V1u4tjJM9w/fB6vfLmc/UdPel2aiFxFGQp4MwslLdzHOecm+Zp3nu168f25y9eeAkSm+3g5YHvWlCuX49ZqJZnZN5qujSsxYeEWDakUyWUyMorGgA+B1c652HS7pgKdfdudgSnp2jv5RtM0Avaf7cqRqy9/3hBea1ODKb2aUKpQXp4Zv4Suo39k6x7NOS8S6OxSd3Nm1gRIBJYDZ6c0/D/S+uE/BcoDW4AHnHN7fP8gDAVaAkeALs65pIv9jqi
oKJeUdNFDJAucOn2GsfM203/mWk47R99m19G1SSVCg/U6hIg/MrNFzrmoC+7PCf+7roC/urbvO8prU1by3eqdVC9TiL/fW4t65Yt6XZaIXKZLBbxu3XKhskWu4YPOUbz/2I3sPXyC9sPm8tqUFRw4poewIoFEAZ+LtahZmrh+0XS+uSIfz99M89h4pi3foYewIgFCAZ/LFQwL5c9tazL56caUKJCXp8Yt5okxSXoIKxIAFPACwA2RRZjSqzGvtq7O/I2/cueABN6P38DJ01oqUMRfKeDlNyHBQXRrWpm4fjE0qVqCf0xbQ5shs1m8Za/XpYlIJijg5XciilzDyE5pD2H3Hz3JfcPm8upkvQkr4m8U8HJBaQ9hY+hySyXGL0h7E/arn/QmrIi/UMDLRRXwvQk79ZkmlC4UxrMTltD5ox/Z8qseworkdAp4yZBaEYWZ3Ksxf25Tg8Wb99J8QDzvfp/MiVN6CCuSUyngJcOCg4zHG1fiu34x3H59Sd6esZbWgxNZuEnTEYvkRAp4uWylC4cx7NEbGfV4FEdOnObB9+fxwmc/sefwCa9LE5F0FPCSabdfX4q4ftE8GXMtXy7Zxh39f+CzpK16CCuSQyjg5YrkyxPCS3ddz9fPNaFyeAFe+HwZD42Yz3qtCSviOQW8ZInrSxfis54381b72qz95SCtBify9ow1HDupNWFFvKKAlywTFGR0aFCeWc/H0OaGsrz7/QbuHJDAD2t3XfrDIpLlFPCS5UoUyEvsg3UZ370hIcHG4x/9SK9xi/ll/zGvSxPJVRTwkm1uubYE03o35fnm1xG3eifNYuMZNXsTpzSBmchVoYCXbJU3JJhn76hKXN9obqxQlL98vYq2Q+ewRBOYiWQ7BbxcFRWK52d0l5t4t2N9fj18nPbD5vLKl8vZf0QTmIlkFwW8XDVmRus6ZfjON4HZhIVbuCP2B75ckqKx8yLZQAEvV13BsNDfJjArVzQffT/5iY4jF5C865DXpYkEFAW8eKZWRGEmPXULf7u3Fiu37+euQQm8PWMNR09o7LxIVlDAi6eCgoxHGlZg1vO30qZO2tj55gPimbV6p9elifg9BbzkCOEF8xL7UF0m9mhEWGgwT4xJosfYJLbtO+p1aSJ+SwEvOUqjysX59rmmvHTX9SSu302z/vEM+2GD5p0XyQQFvOQ4eUKCeDLmWuL6RdO0agn+OX0NrQcnMn/jr16XJuJXFPCSY5Urmo8RnaL4sHMUR0+epsOI+fT7ZCmpB497XZqIX1DAS453R/VSxPWN4ZnbqvDVsu3c0f8HPp6/mdNnNHZe5GIuGfBmNsrMdpnZinRtdc1svpktNbMkM2vgazczG2xmyWa2zMzqZ2fxkntckyeYP7SoxrTe0dSKKMyfJq+g/XtzWJ6y3+vSRHKsjNzBjwZantP2L+AN51xd4DXf9wB3AVV9Xz2AYVlTpkiaKiULMK5bQwZ1qMu2fcdo9+5sXp+yggPHNOWByLkuGfDOuQTg3FWVHVDIt10Y2O7bbgeMdWnmA0XMrExWFSsCaVMetKsbwaznY3isUQXGzt/MHf3jmbJ0m6Y8EEkns33wfYC3zWwr8A7wsq89Atia7rgUX9vvmFkPX/dOUmpqaibLkNys8DWhvNGuFlN7NaFM4TB6T1zKYx8uZGOqpjwQgcwH/FNAX+dcJNAX+NDXbuc59ry3VM65Ec65KOdcVHh4eCbLEIHa5Qrz5dONebNdTX5K2UfLgYnEzlyr5QIl18tswHcGJvm2PwMa+LZTgMh0x5Xjv903ItkmOMh47OaKzHo+hla1SzP4P8laLlByvcwG/HYgxrd9O7Detz0V6OQbTdMI2O+c23GFNYpkWMmCYQzsUI/x3f67XODT4xZpuUDJlUIudYCZTQBuBUqYWQrwOtAdGGRmIcAx0kbMAHwLtAKSgSNAl2yoWeSSbqmStlzgyISNDPlPMvFrU3n+zmp0urkCIcF6/UNyB8sJow6ioqJcUlKS12VIgNry6xH+NGUF8etSqVm2EH+7tzZ1I4t4XZbIFTOzRc65qAvt162MBLzyxfMxustNvPdIfXYfOs69783hT5NXsP+oxs5LYFPAS65gZrSqnbZc4OO3VGTcAo2dl8CngJdcpWBYKK+3qcnUZ5oQUeS/Y+c37T7sdWkiWU4BL7lSrYjCTDo7dn7rPloMTGDgd+s0dl4CigJecq3fxs7/IYaWNUsz8Lv1tBqUyNzk3V6XJpIlFPCS65UsGMbgh+sxtmsDTjtHxw8W0O+Tpfx6SPPOi39TwIv4RF8Xzow+0Tx7e9q887f3j2fiwi2c0bzz4qcU8CLphIUG8/yd1ZjWuynVShXkpUnLeWjEPNbtPOh1aSKXTQEvch5VShZkYo9G/Ou+OqzfdYhWgxL51/Q1HD2hh7DiPxTwIhcQFGQ8eFMks/rF0K5uBO/9sIE7B8ZrAjPxGwp4kUsoXiAv/R+8gfHdGxIaFMTjH/3IM+MXs+ugJjCTnE0BL5JBt1xbgml9mtKnWVVmrtxJs/7xjF+gh7CScyngRS5D3pBg+jS7jml9mlKjbCH+78vlPPi+HsJKzqSAF8mEa8MLMKF7I96+vw7JqYdoPTiRd2ZoFSnJWRTwIplkZjwQlfYQtk2dsgz9PpmWAxOYozdhJYdQwItcoeIF8hL7UF3GdWsIwCN6E1ZyCAW8SBZpXKUE0/tE88xtaW/C3hEbz6dJWzUdsXhGAS+ShcJCg/lDi2p881xTqoQX4I+fL6PjyAVsTD3kdWmSCyngRbLBdaUK8mnPm/lH+9qs2L6floMSGTJrPSdOnfG6NMlFFPAi2SQoyHi4QXlm9YuheY1S9I9bR+vBiST9vMfr0iSXUMCLZLOShcJ4t2N9Rj0exZETp7l/+Dxe+XK51oSVbKeAF7lKbr++FDP7RtOtSSUmLNxCs9h4vlm2Qw9hJdso4EWuovx5Q3j17hpM6dWEkgXz0mv8YrqNSWLbvqNelyYBSAEv4oHa5QozpVdjXm1dnbkbfqV5bDwfzt7Eac1rI1lIAS/ikZDgILo1rczMvtE0qFSMN79eRfv35rB6xwGvS5MAoYAX8VhksXx89PhNDOpQl5S9R2kzZDb/mr5G89rIFVPAi+QAZka7uhF81y+Ge+qlLS5y16BE5m341evSxI9dMuDNbJSZ7TKzFee0P2tma81spZn9K137y2aW7NvXIjuKFglURfPn4Z0HbuDfTzTk9BnHwyPn8+Lny9h/REMq5fJl5A5+NNAyfYOZ3Qa0A+o452oC7/jaawAdgJq+z7xnZsFZWbBIbtCkaglm9ImmZ3RlPl+cwh0aUimZcMmAd84lAOe+evcU8JZz7rjvmLOLVLYDJjrnjjvnNgHJQIMsrFck17gmTzAvt6rOlF6NKV04bUhl97FJ7NivIZWSMZntg78OaGpmC8ws3sxu8rVHAFvTHZfiaxORTKoVUZjJTzfmlVbVmZ28m2b94xk772ctFSiXlNmADwGKAo2AF4BPzcwAO8+x5/2v0Mx6mFmSmSWlpqZmsgyR3CEkOIju0ZWZ2SeG+hWK8tqUlT
zw/jzWa6lAuYjMBnwKMMmlWQicAUr42iPTHVcO2H6+H+CcG+Gci3LORYWHh2eyDJHcpXzxfIzt2oD+D9zAhtRDtBqcyIC4dRw/pSGV8nuZDfjJwO0AZnYdkAfYDUwFOphZXjOrBFQFFmZFoSKSxsy478ZyfNcvhla1yzBo1npaD56tWSrldzIyTHICMA+oZmYpZvYEMAqo7Bs6ORHo7LubXwl8CqwCpgO9nHO6tRDJBiUK5GVQh3p81OUmjvpmqfzT5BUcPKYhlZLGcsKwq6ioKJeUlOR1GSJ+6/DxU7wzcy2j5/5MqYJhvHlPLZrXKOV1WZLNzGyRcy7qQvv1JqtIAMifN4TX29Rk0lO3UCRfKN3HJvH0uEXsOnjM69LEQwp4kQBSr3xRvnq2CS+0qMZ3q3fRrH88n2nh71xLAS8SYEKDg+h1WxWm9W5KtdIFeeHzZXQatZCte454XZpcZQp4kQB1bXgBPulxM2+2q8nizXtpMTCBj+ZozvncRAEvEsCCgozHbq7IzH4xNKhUjDe+WsUDw+fqBalcQgEvkgtEFLmGjx6/iYEP1WXT7sO0HjybIbPWc+LUGa9Lk2ykgBfJJcyMe+pFENcvhha1StM/bh1th85mWco+r0uTbKKAF8llShTIy5CH6zGyUxR7j5zgnnfn8PdvV3P0hN5JDDQKeJFcqnmNUsT1i+Ghm8ozImEjLQclaAWpAKOAF8nFCoWF8o/2tZnQvREAD4+cz8uTlnNA0x0EBAW8iHDztcWZ3jttBalPftxC89h4vlu10+uy5Aop4EUE+O8KUpN7NaZovjx0G5vEsxOWsPvQca9Lk0xSwIvI/6hTrghTn2nC882vY8aKX2geG8/kJds03YEfUsCLyO/kCQni2Tuq8s1zTahUIj99PllK19E/sn2f1oP1Jwp4EbmgqqUK8tmTt/B6mxrM37iH5rHxfDxP68H6CwW8iFxUcJDRpXElZvaNpn6Fovxpyko6jJjPxtRDXpcml6CAF5EMiSyWth7s2/fXYc0vB7hrUCLD4zdw6rSmO8ipFPAikmFmxgNRkXzXL4Zbq4Xz1rQ13PveXFZtP+B1aXIeCngRuWwlC4Ux/NEbee+R+uzYf5S2Q2fTf+Zajp/SdAc5iQJeRDLFzGhVuwxxfWNoW7csQ/6TzN2DZ7N4y16vSxMfBbyIXJGi+fMQ+2BdPupyE4ePn+K+YXP5y1erOHLilNel5XoKeBHJErdVK8nMfjE82rACo+ZsosXABOYm7/a6rFxNAS8iWaZA3hDevKcWn/RoREhQEB0/WKDJyzykgBeRLNewcnGm9W762+RlLQYk8P2aXV6Xleso4EUkW4SFpk1eNunpxhQMC6HL6B/p9+lS9h054XVpuYYCXkSyVd3IInz1bBOeu6MqU5dup1lsAtNX7PC6rFxBAS8i2S5vSDD9ml/H1GeaUKpQXp7892J6jVtM6kFNRZydFPAictXUKFuIyb0a80KLasSt2smdA+KZslRTEWeXSwa8mY0ys11mtuI8+/5gZs7MSvi+NzMbbGbJZrbMzOpnR9Ei4r9Cg4PodVsVvu2dNhVx74lL6TYmiZ0HjnldWsDJyB38aKDluY1mFgk0B7aka74LqOr76gEMu/ISRSQQVSmZNhXxq62rM2fDbprHxvNZ0lbdzWehSwa8cy4B2HOeXQOAPwLpr0Y7YKxLMx8oYmZlsqRSEQk4wUFGt6aVmd47mutLF+KFz5fRRQuLZJlM9cGbWVtgm3Pup3N2RQBb032f4ms738/oYWZJZpaUmpqamTJEJEBULJGfiT0a8UbbmizYuIcWAxKYuHCL7uav0GUHvJnlA14BXjvf7vO0nfcKOedGOOeinHNR4eHhl1uGiASYoCCj8y0VmdEnmloRhXlp0nI6jVpIyt4jXpfmtzJzB38tUAn4ycx+BsoBi82sNGl37JHpji0HbL/SIkUk9yhfPB/jujXkr/fUYvHmvbQYkMC/52/WMoGZcNkB75xb7pwr6Zyr6JyrSFqo13fO/QJMBTr5RtM0AvY75/RGg4hclqAg49FGFZjRN5p65Yvy6uQVPPLBArbu0d385cjIMMkJwDygmpmlmNkTFzn8W2AjkAyMBJ7OkipFJFcqVzQfHz/RgLfa12b5tv3cOSCB0XM26W4+gywnPMSIiopySUlJXpchIjnY9n1HeXnScuLXpdKgYjH+eX8dKpXI73VZnjKzRc65qAvt15usIuIXyha5htFdbuLt++uw+pcD3DUogQ8SN3Jad/MXpIAXEb+RftHvJlVK8NdvVvPA8Lkk7zrkdWk5kgJeRPxOqUJhjOwUxcCH6rJx92FaDU5k2A8bOHX6jNel5SgKeBHxS2bGPfUimNk3mtuqhfPP6Wu4b9hc1v5y0OvScgwFvIj4tZIFwxj+6I0M7ViPrXuPcveQRIbMWs9J3c0r4EXE/5kZd9cpS1zfaFrULE3/uHW0GzqHldv3e12apxTwIhIwihfIy9CO9Rn+6I3sOnicdkPnEDtzLcdPnfa6NE8o4EUk4LSsVZq4vtG0vaEsg/+TTJshs/lp6z6vy7rqFPAiEpCK5s9D7EN1GfV4FAeOnuLe9+bwj2mrOXYy99zNK+BFJKDdfn0pZvaL5sGoSN6P30irwYks2ny+JS4CjwJeRAJeobBQ3rqvDh8/0YDjJ89w//B5vPHVSo6cOOV1adlKAS8iuUbTquHM6BvNY40q8NGcn2k5MJF5G371uqxso4AXkVylQN4Q/tKuFp/0aIQZPDxyPq98uZyDx056XVqWU8CLSK7UsHJxpveOpnvTSkxYuIU7ByTwnzU7vS4rSyngRSTXuiZPMK+0rsGkpxtTKCyUrqOT6DNxCXsOn/C6tCyhgBeRXK9uZBG+erYJfZpV5ZvlO2gWG8/Un7b7/aLfCngRESBPSBB9ml3H1882JbJYPp6bsITuYxfxy/5jXpeWaQp4EZF0qpUuyKSnbuHV1tWZnZxK89h4Jizc4pd38wp4EZFzBAcZ3ZpWZkafaGpFFOblScvpOHIBm3897HVpl0UBLyJyARWK52d894b8o31tVmzbT4uB/rVMoAJeROQizIyHG5QnLt0yge39ZGERBbyISAaULpy2TODgh+uxdc8R7h6SyIC4dTl6KmIFvIhIBpkZbW8oy3f9YmhduwyDZq2nzZDZLNmy1+vSzksBLyJymYrlz8PADvUY9XgUB4+dov2wubz59aocN3mZAl5EJJNuv74UM/tG80jD8nw4exMtBiYwJ3m312X9RgEvInIFCoaF8td7avNJj0aEBAXxyAcLePHzZew/6v3kZQp4EZEs0LBycab1bkrPmMp8vjiFZrHxTF+xw9OaFPAiIlkkLDSYl++qzpRejQkvkJcn/72YJz9exK4D3kx3cMmAN7NRZrbLzFaka3vbzNaY2TIz+9LMiqTb97KZJZvZWjNrkV2Fi4jkVLUiCjPlmcb8sWU1/rN2F81i4/nkx6s/3UFG7uBHAy3PaYsDajnn6gDrgJcBzKwG0AGo6fvMe2YWnGXVioj4idDgIJ6+tQrTezfl+jKFePGL5
TzywdWd7uCSAe+cSwD2nNM20zl3djzQfKCcb7sdMNE5d9w5twlIBhpkYb0iIn6lcngBJnZvxN/urcWylLTpDkYkbODU6TPZ/ruzog++KzDNtx0BbE23L8XX9jtm1sPMkswsKTU1NQvKEBHJmYKCjEcaViCuXzRNqpTg79+uof2wuazafiB7f++VfNjMXgFOAePONp3nsPN2OjnnRjjnopxzUeHh4VdShoiIXyhT+BpGdopiyMP12Lb3KG2HzuaDxI3Z9vtCMvtBM+sM3A3c4f775CAFiEx3WDlge+bLExEJLGZGmxvK0qRKCd78ZhWVSuTPtt+VqYA3s5bAi0CMc+5Iul1TgfFmFguUBaoCC6+4ShGRAFM0fx5iH6ybrb/jkgFvZhOAW4ESZpYCvE7aqJm8QJyZAcx3zj3pnFtpZp8Cq0jruunlnMu5U62JiAQwywnLUEVFRbmkpCSvyxAR8Stmtgf1HYQAAAQPSURBVMg5F3Wh/XqTVUQkQCngRUQClAJeRCRAKeBFRAKUAl5EJEAp4EVEAlSOGCZpZqnA5kx+vASQc9bIyhqBdk6Bdj4QeOcUaOcDgXdO5zufCs65C871kiMC/kqYWdLFxoH6o0A7p0A7Hwi8cwq084HAO6fMnI+6aEREApQCXkQkQAVCwI/wuoBsEGjnFGjnA4F3ToF2PhB453TZ5+P3ffAiInJ+gXAHLyIi56GAFxEJUH4d8GbW0szWmlmymb3kdT1Zwcx+NrPlZrbUzPxuDmUzG2Vmu8xsRbq2YmYWZ2brfX8W9bLGy3WBc/qzmW3zXaelZtbKyxovh5lFmtn3ZrbazFaaWW9fu19ep4ucjz9fozAzW2hmP/nO6Q1feyUzW+C7Rp+YWZ6L/hx/7YM3s2BgHdCctKUCfwQeds6t8rSwK2RmPwNRzjm/fEHDzKKBQ8BY51wtX9u/gD3Oubd8/xAXdc696GWdl+MC5/Rn4JBz7h0va8sMMysDlHHOLTazgsAi4B7gcfzwOl3kfB7Ef6+RAfmdc4fMLBSYDfQG+gGTnHMTzWw48JNzbtiFfo4/38E3AJKdcxudcyeAiUA7j2vK9ZxzCcCec5rbAWN822NI+8vnNy5wTn7LObfDObfYt30QWA1E4KfX6SLn47dcmkO+b0N9Xw64Hfjc137Ja+TPAR8BbE33fQp+flF9HDDTzBaZWQ+vi8kipZxzOyDtLyNQ0uN6ssozZrbM14XjF90Z5zKzikA9YAEBcJ3OOR/w42tkZsFmthTYBcQBG4B9zrlTvkMumXn+HPB2njb/7G/6X42dc/WBu4Bevu4ByXmGAdcCdYEdQH9vy7l8ZlYA+ALo45w74HU9V+o85+PX18g5d9o5VxcoR1qPRfXzHXaxn+HPAZ8CRKb7vhyw3aNasoxzbrvvz13Al6RdWH+309dPera/dJfH9Vwx59xO31/AM8BI/Ow6+fp1vwDGOecm+Zr99jqd73z8/Rqd5ZzbB/wANAKKmFmIb9clM8+fA/5HoKrvqXIeoAMw1eOaroiZ5fc9JMLM8gN3Aisu/im/MBXo7NvuDEzxsJYscTYIfe7Fj66T7wHeh8Bq51xsul1+eZ0udD5+fo3CzayIb/saoBlpzxa+B+73HXbJa+S3o2gAfMOeBgLBwCjn3N88LumKmFll0u7aAUKA8f52TmY2AbiVtKlNdwKvA5OBT4HywBbgAeec3zy0vMA53Ura//o74Geg59n+65zOzJoAicBy4Iyv+f9I67f2u+t0kfN5GP+9RnVIe4gaTNqN+KfOub/4MmIiUAxYAjzqnDt+wZ/jzwEvIiIX5s9dNCIichEKeBGRAKWAFxEJUAp4EZEApYAXEQlQCngRkQClgBcRCVD/D3iHVaXR1UsQAAAAAElFTkSuQmCC\n",
431 | "text/plain": [
432 | ""
433 | ]
434 | },
435 | "metadata": {
436 | "needs_background": "light"
437 | },
438 | "output_type": "display_data"
439 | },
449 | {
450 | "data": {
451 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAD4CAYAAAAD6PrjAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAe50lEQVR4nO3de3Rd5X3m8e9zdLElQJINgjGWg53gZriUJlgFJ2G6MpCAYZLYnUAXrMngyTDLnZSk6WRmGpj+4Q4pa5E2U1pWCFkkeGLSFOMALW7ihLqElEnKTQbCzSFWDMECg0XkC+CLbr/547ySj6VzJPkc2UfSfj5raZ19fvvd+7zbB/Ro396tiMDMzAwgV+0OmJnZ1OFQMDOzYQ4FMzMb5lAwM7NhDgUzMxtWW+0OlOukk06KhQsXVrsbZmbTyubNm9+MiNZS86dtKCxcuJCOjo5qd8PMbFqR9Kux5vvwkZmZDXMomJnZMIeCmZkNGzcUJK2RtFPScyPqn5P0oqTnJf15Qf16SZ1p3iUF9WWp1inpuoL6IkmPSdoq6W5J9ZO1cWZmdmQmsqfwLWBZYUHSvwWWA+dExFnAV1L9TOBK4Ky0zNck1UiqAW4FLgXOBK5KbQG+DNwcEYuBXcA1lW6UmZmVZ9xQiIiHgZ4R5c8AN0XEwdRmZ6ovB9ZFxMGIeAnoBM5LP50RsS0ieoF1wHJJAi4E7knLrwVWVLhNZmZWpnLPKfwG8G/SYZ9/lvTbqT4f2F7QrivVStVPBHZHRP+IelGSVknqkNTR3d1dZtfNzKyUckOhFpgDLAX+J7A+/dWvIm2jjHpREXF7RLRHRHtra8l7L8a09l9eZsPPXitrWTOzma7cUOgC7ou8x4FB4KRUX1DQrg14bYz6m0CLpNoR9aPmrsdf4R8cCmZmRZUbCn9P/lwAkn4DqCf/C34DcKWkWZIWAYuBx4EngMXpSqN68iejN0T+CT8PAZen9a4E7i93YyaiuaGOPfv7juZHmJlNWxO5JPUu4BHgvZK6JF0DrAHenS5TXQesTHsNzwPrgReAHwLXRsRAOmfwWeABYAuwPrUF+CLwBUmd5M8x3DG5m3i45oY69uxzKJiZFTPu2EcRcVWJWZ8q0f5G4MYi9Y3AxiL1beSvTjomWhrreKbLoWBmVkzm7mj24SMzs9IyGQr7+wY42D9Q7a6YmU052QuFxvwoGt5bMDMbLXuh0FAHwF6HgpnZKJkNhd2+AsnMbJTMhUJLCgUfPjIzGy1zoeA9BTOz0jIbCt5TMDMbLXOh0ORQMDMrKXOhUJMTJ8yudSiYmRWRuVAA39VsZlZKJkOhpdGhYGZWTCZDobmhjt37eqvdDTOzKSezoeA9BTOz0TIaCvXs2d8/fkMzs4zJaCjUsWd/L/kHv5mZ2ZCJPHltjaSd6SlrI+f9D0kh6aT0XpJukdQp6RlJ5xa0XSlpa/pZWVBfIunZtMwtkjRZG1dKc0MdfQPB/j4Pn21mVmgiewrfApaNLEpaAHwUeKWgfCn55zIvBlYBt6W2c4HVwPnkn7K2WtKctMxtqe3QcqM+a7K1NPoGNjOzYsYNhYh4GOgpMutm4I+BwmMwy4E70/OaHwVaJM0DLgE2RURPROwCNgHL0rymiHgk8sdy7gRWVLZJ4/P4R2ZmxZV1TkHSJ4BXI+JnI2bNB7YXvO9KtbHqXUXqR5VHSjUzK672SBeQ1Aj8CXBxsdlFalFGvdRnryJ/qIl3vetd4/a1FI9/ZGZWXDl7Cu8BFgE/k/Qy0AY8Kelfkf9Lf0FB2zbgtXHqbUXqRUXE7RHRHhHtra2tZXQ9b3ikVB8+MjM7zBGHQkQ8GxEnR8TCiFhI/hf7uRHxOrABuDpdhbQU2BMRO4AHgIslzUknmC8GHkjz3pK0NF11dDVw/yRtW0k+0WxmVtxELkm9C3gEeK+kLknXjNF8I7AN6AS+AfwBQET0AF8Cnkg/N6QawGeAb6Zlfgn8oLxNmbjjZ9VSk5NDwcxshHHPKUTEVePMX1gwHcC1JdqtAdYUqXcAZ4/Xj8kkiabZteze7/GPzMwKZfKOZoCWRg91YWY2UmZDocmD4pmZjZLZUGhuqGOPh882MztMZkOhxXsKZmajZDYU/EwFM7PRMh8Kg4MePtvMbEhmQ6GlsY7BgLd7fQWSmdmQzIZCk4e6MDMbJbOh0OxB8czMRslsKHj4bDOz0TIbCs0eFM/MbJTshoKfvmZmNkpmQ6GloR7wnoKZWaHMhsLsuhz1NTmHgplZgcyGgqQ0KJ7HPzIzG5LZUID8DWzeUzAzOyTToeDxj8zMDjeRx3GukbRT0nMFtb+Q9HNJz0j6O0ktBfOul9Qp6UVJlxTUl6Vap6TrCuqLJD0maaukuyXVT+YGjqW5oc5XH5mZFZjInsK3gGUjapuAsyPiHOAXwPUAks4ErgTOSst8TVKNpBrgVuBS4EzgqtQW4MvAzRGxGNgFjPUM6Enl4bPNzA43bihExMNAz4jaP0bE0EhyjwJtaXo5sC4iDkbES0AncF766YyIbRHRC6wDlksScCFwT1p+LbCiwm2aMD99zczscJNxTuE/Az9I0/OB7QXzulKtVP1EYHdBwAzVi5K0SlKHpI7u7u6KO97cUMdbB/oZ8PDZZmZAhaEg6U+AfuA7Q6UizaKMelERcXtEtEdEe2tr65F2d5SWNNTFXu8tmJkBUFvugpJWAh8DLoqIoV/kXcCCgmZtwGtpulj9TaBFUm3aWyhsf9QVjpQ657hjdn7bzGzKKmtPQdIy4IvAJyJiX8GsDcCVkmZJWgQsBh4HngAWpyuN6smfjN6QwuQh4PK0/Erg/vI25cgNj3/kPQUzM2Bil6TeBTwCvFdSl6RrgK8CJwCbJD0t6esAEfE8sB54AfghcG1EDKS9gM8CDwBbgPWpLeTD5QuSOsmfY7hjUrdwDC0eKdXM7DDjHj6KiKuKlEv+4o6IG4Ebi9Q3AhuL1LeRvzrpmPODdszMDpfpO5oPPZLT4x+ZmUHGQ8F7CmZmh8t0KMyqraGhrsahYGaWZDoUwOMfmZkVynwoePhsM7NDMh8KHv/IzOyQzIeCn6lgZnZI5kPBw2ebmR2S+VDwnoKZ2SEOhYY69vUO0Ns/WO2umJlVXeZDweMfmZkdkvlQGB7qYr+HujAzy3woeKgLM7NDMh8KLY35h+s4FMzMHAqHHrTjoS7MzBwKPnxkZnbIRJ68tkbSTknPFdTmStokaWt6nZPqknSLpE5Jz0g6t2CZlan91vR856H6EknPpmVukaTJ3sixNM3OP2fIoWBmNrE9hW8By0bUrgMejIjFwIPpPcCl5J/LvBhYBdwG+RABVgPnk3/K2uqhIEltVhUsN/KzjqramhwnzKr14SMzMyYQChHxMNAzorwcWJum1wIrCup3Rt6jQIukecAlwKaI6ImIXcAmYFma1xQRj0REAHcWrOuYaW6sY6/3FMzMyj6ncEpE7ABIryen+nxge0G7rlQbq95VpF6UpFWSOiR1dHd3l9n10TzUhZlZ3mS
faC52PiDKqBcVEbdHRHtEtLe2tpbZxdGaG+rY7VAwMys7FN5Ih35IrztTvQtYUNCuDXhtnHpbkfox5QftmJnllRsKG4ChK4hWAvcX1K9OVyEtBfakw0sPABdLmpNOMF8MPJDmvSVpabrq6OqCdR0zPnxkZpZXO14DSXcBHwZOktRF/iqim4D1kq4BXgGuSM03ApcBncA+4NMAEdEj6UvAE6ndDRExdPL6M+SvcGoAfpB+jqmmhjr27OsjIjjGV8SamU0p44ZCRFxVYtZFRdoGcG2J9awB1hSpdwBnj9ePo6mloZ7egUEO9A3SUF9Tza6YmVVV5u9oBt/VbGY2xKFAwfhHHj7bzDLOoUDBg3Z8V7OZZZxDAR8+MjMb4lCg8PCRQ8HMss2hQH7sI8DjH5lZ5jkUgOPra8nJh4/MzBwKQC4nmhrqPHy2mWWeQyFp8VAXZmYOhSEe/8jMzKEwrMnDZ5uZORSGtDTW++ojM8s8h0LS3FDrw0dmlnkOhWTonEJ+oFczs2xyKCQtDfUMDAZvH+yvdlfMzKrGoZB4/CMzM4fCsKah8Y98A5uZZVhFoSDpv0l6XtJzku6SNFvSIkmPSdoq6W5J9antrPS+M81fWLCe61P9RUmXVLZJ5Wnx+EdmZuWHgqT5wB8C7RFxNlADXAl8Gbg5IhYDu4Br0iLXALsi4nTg5tQOSWem5c4ClgFfk3TMn4npw0dmZpUfPqoFGiTVAo3ADuBC4J40fy2wIk0vT+9J8y+SpFRfFxEHI+IloBM4r8J+HTEPn21mVkEoRMSrwFeAV8iHwR5gM7A7IoYu4ekC5qfp+cD2tGx/an9iYb3IMoeRtEpSh6SO7u7ucrte1PDT1xwKZpZhlRw+mkP+r/xFwKnAccClRZoOXfivEvNK1UcXI26PiPaIaG9tbT3yTo+hoa6Guho5FMws0yo5fPQR4KWI6I6IPuA+4INASzqcBNAGvJamu4AFAGl+M9BTWC+yzDEjiWYPn21mGVdJKLwCLJXUmM4NXAS8ADwEXJ7arATuT9Mb0nvS/B9F/vbhDcCV6eqkRcBi4PEK+lW25oY6X31kZplWO36T4iLiMUn3AE8C/cBTwO3A94F1kv4s1e5Ii9wBfFtSJ/k9hCvTep6XtJ58oPQD10bEQLn9qoSHzzazrCs7FAAiYjWwekR5G0WuHoqIA8AVJdZzI3BjJX2ZDM0NdXS/fbDa3TAzqxrf0VygpbHeewpmlmkOhQLNDXXs8YlmM8swh0KBpoY69h7oZ2DQw2ebWTY5FAq0pLua3zrgvQUzyyaHQgGPf2RmWedQKNDs4bPNLOMcCgU8/pGZZZ1DoYAPH5lZ1jkUCnj4bDPLOodCgaFHcnr8IzPLKodCgdl1Ncyuy/nwkZlllkNhhPzw2b3V7oaZWVU4FEZoafD4R2aWXQ6FETx8tpllmUNhhCY/fc3MMsyhMEJLo5++ZmbZVVEoSGqRdI+kn0vaIukDkuZK2iRpa3qdk9pK0i2SOiU9I+ncgvWsTO23SlpZ+hOPvuaGOt+nYGaZVemewl8DP4yIfw38FrAFuA54MCIWAw+m9wCXkn/+8mJgFXAbgKS55J/edj75J7atHgqSamhpqGNf7wB9A4PV6oKZWdWUHQqSmoDfIT2DOSJ6I2I3sBxYm5qtBVak6eXAnZH3KNAiaR5wCbApInoiYhewCVhWbr8q1ezxj8wswyrZU3g30A38X0lPSfqmpOOAUyJiB0B6PTm1nw9sL1i+K9VK1UeRtEpSh6SO7u7uCrpemkdKNbMsqyQUaoFzgdsi4v3AOxw6VFSMitRijProYsTtEdEeEe2tra1H2t8J8aB4ZpZllYRCF9AVEY+l9/eQD4k30mEh0uvOgvYLCpZvA14bo14VzR7/yMwyrOxQiIjXge2S3ptKFwEvABuAoSuIVgL3p+kNwNXpKqSlwJ50eOkB4GJJc9IJ5otTrSoOjZTqoS7MLHtqK1z+c8B3JNUD24BPkw+a9ZKuAV4BrkhtNwKXAZ3AvtSWiOiR9CXgidTuhojoqbBfZWtprAdgj88pmFkGVRQKEfE00F5k1kVF2gZwbYn1rAHWVNKXydI0O/9Psmd/f5V7YmZ27PmO5hFqa3IcP6vWh4/MLJMcCkV4UDwzyyqHQhHNDR7/yMyyyaFQRLNHSjWzjHIoFNHS6MNHZpZNDoUifE7BzLLKoVCEh882s6xyKBTR3FhHb/8gB/oGqt0VM7NjyqFQhAfFM7OscigU4eGzzSyrHApFtDSk8Y+8p2BmGeNQKMKHj8wsqxwKRRw6fOTxj8wsWxwKRfg5zWaWVQ6FIk6YVYvkp6+ZWfY4FIrI5UTTbN/AZmbZU3EoSKqR9JSk76X3iyQ9JmmrpLvTU9mQNCu970zzFxas4/pUf1HSJZX2aTJ4/CMzy6LJ2FP4PLCl4P2XgZsjYjGwC7gm1a8BdkXE6cDNqR2SzgSuBM4ClgFfk1QzCf2qiMc/MrMsqigUJLUB/w74Znov4ELgntRkLbAiTS9P70nzL0rtlwPrIuJgRLxE/hnO51XSr8ng4bPNLIsq3VP4K+CPgcH0/kRgd0QMPeC4C5ifpucD2wHS/D2p/XC9yDKHkbRKUoekju7u7gq7PjY/aMfMsqjsUJD0MWBnRGwuLBdpGuPMG2uZw4sRt0dEe0S0t7a2HlF/j5QPH5lZFtVWsOyHgE9IugyYDTSR33NokVSb9gbagNdS+y5gAdAlqRZoBnoK6kMKl6maoeGzI4L8US4zs5mv7D2FiLg+ItoiYiH5E8U/ioj/ADwEXJ6arQTuT9Mb0nvS/B9FRKT6lenqpEXAYuDxcvs1WVoa6xgYDN7p9fDZZpYdlewplPJFYJ2kPwOeAu5I9TuAb0vqJL+HcCVARDwvaT3wAtAPXBsRVf9NXDjUxfGzjsY/k5nZ1DMpv+0i4sfAj9P0NopcPRQRB4ArSix/I3DjZPRlsiw88TgAnnxlN21zGqvcGzOzY8N3NJfw2wvnMr+lge92bB+/sZnZDOFQKCGXE59c0sZPOt9kx5791e6Omdkx4VAYwyfPnU8E3Pfkq9XuipnZMeFQGMNpJx7HeYvmcs/mLvIXSpmZzWwOhXFcsaSNl958h82/2lXtrpiZHXUOhXFc9pvzaKyv4Z7NXdXuipnZUedQGMdxs2q57Dfn8b1ndrCvt3/8BczMpjGHwgRcvqSNtw/288Dzr1e7K2ZmR5VDYQLOWziXd81t5LsdPoRkZjObQ2ECcjnxyXPb+Jdf/pquXfuq3R0zs6PGoTBBn1ySf8TDvZt9z4KZzVwOhQlqm9PIB99zIvc8uZ3BQd+zYGYzk0PhCFzR3sb2nv08/nJPtbtiZnZUOBSOwLKz5nH8rFrfs2BmM5ZD4Qg01NfwsXPmsfHZHbxz0PcsmNnM41A4QpcvaWNf7wAbn91R7a6YmU06h8IRWnLaHBaddBzf9SEkM5uByg4FSQskPSRpi6TnJX0+1edK2iRpa3qdk+qSdIukTknPSDq3YF0rU/utklaW+sypQBKXL2nj8Zd6+NWv36l2d8zMJlUlewr9wH+PiD
OApcC1ks4ErgMejIjFwIPpPcClwOL0swq4DfIhAqwGzif/GM/VQ0EyVf37c+cjwb3eWzCzGabsUIiIHRHxZJp+C9gCzAeWA2tTs7XAijS9HLgz8h4FWiTNAy4BNkVET0TsAjYBy8rt17Ewr7mBC04/iXuffNX3LJjZjDIp5xQkLQTeDzwGnBIROyAfHMDJqdl8oPCBx12pVqpe7HNWSeqQ1NHd3T0ZXS/bFe0LeHX3fh7Z9uuq9sPMbDJVHAqSjgfuBf4oIvaO1bRILcaojy5G3B4R7RHR3traeuSdnUQXn3kKJ8z2PQtmNrNUFAqS6sgHwnci4r5UfiMdFiK97kz1LmBBweJtwGtj1Ke02XU1fPy3TuUHz+1g74G+anfHzGxSVHL1kYA7gC0R8ZcFszYAQ1cQrQTuL6hfna5CWgrsSYeXHgAuljQnnWC+ONWmvCuWtHGgb5CNz/ieBTObGSrZU/gQ8B+BCyU9nX4uA24CPippK/DR9B5gI7AN6AS+AfwBQET0AF8Cnkg/N6TalPe+BS28p9X3LJjZzFFb7oIR8ROKnw8AuKhI+wCuLbGuNcCacvtSLZK4on0BN/3g52zrfpt3tx5f7S6ZmVXEdzRX6HffP5+c4Ov//Etfnmpm055DoUKnNM3m6g8sZH1HF6u+vdknnc1sWnMoTILVHz+T1R8/k4de3MmKW39K5863qt0lM7OyOBQmgSQ+/aFFfOe/nM+efX0s/+pP+eFzr1e7W2ZmR8yhMImWvvtE/uFzF3D6ycfzX/9mM1954EUGfJ7BzKYRh8IkO7Wlgbt//wP8XnsbX32ok2vWPsGefT7PYGbTg0PhKJhdV8OXP3kOf7bibH7a+SafuPUnvPi6zzOY2dTnUDhKJPGppaexbtVS9vUOsOLWn/K9Z6b86B1mlnEOhaNsyWlz+f7nLuDMU5v47N8+xZe+9wKv7d5f7W6ZmRWl/I3G0097e3t0dHRUuxsT1ts/yA3fe56/efQVAM6Y18RHzjiZi844hXPmN5PLlbo53Mxs8kjaHBHtJec7FI6tzp1v8+CWN3hwy046ftXDYEDrCbO48L0nc9EZJ3PB4pNorC979BEzszE5FKawXe/08uNf7OSftuzk4Re7eetgP7Nqc3zwPSdy0RmnsOS0OSyY28jxsxwSZjY5HArTRG//IE+83MM/pb2IV3r2Dc+b01jHgrmNLJjTSNvcBhbMaUzvG5g/p4FZtTVV7LmZTScOhWkoIvhl99v8/PW32N6zn+279rG9Zx9du/bz6q799A4MDreVoPX4WbQ01tE0u46mhjqaG+poml1LU8NQrZam2fl6Q30Ns+uGfnLMrs1Pz6rN+byGWQaMFwo+LjEFSeL0k0/g9JNPGDVvYDB4Y+8BtvfsY/uu/Wzv2ceOPfvZu7+fvQf6eGPvAbbufIu9+/t560AfR3JDdX1tjtm1ueHQqK0R9TU56mpy1NaIupoc9QXTdem1NpejNidyOVGbEzVDrzWiRkO1HDU5yOXytZqckESNoCYtWyORU5rOQU75NjlBTcF0vg1peaFUU/q3y6nwdWh6aH6+LQXTEohD7WHkOvPz02Kjlsu/Dn15h9c0Yh0Mf8bh8ynyOUO11KNRyw1NDy9buKBZmRwK00xNTpza0sCpLQ2cP07bwcHgnd5+9h7oZ+/+Pvbs72N/7wAH+gY40D/Agb7B/PTQa/8AB4drA/QNBn39g/QPBn0Dg/T2D7K/b4C+A4P0DeRrfQOD9A8EA4NB/2AwGEH/wODw+6FXO/bGC5H8+0OJVnTeiPVQZF0j34xsV2wdo+aPaDeydfHlC1seHogq0p9inztSsUAu9hmlVlGsXOzzii4+6vNKr+f7f3jBUTtsPGVCQdIy4K+BGuCbEXHTOIvYOHI5ccLsOk6YXcf8loaq9mWwIDQGBoOBCGIQBtL7wTg0b3CQ4feDUTCd6hH55fK1IICIQ/MiLRek19Q2Pw+CQ20ZbnN4e0auk3w9LVLQNj89dBQ20oJDfYrCaQ6tGw4ty2Hzh9ZzaJ1weJ+iYPmRbYPDZ45c5+HLjZ434uWwzz28XtA3Dt+mkaKgL8XajexX6bbFP2fkZ47Vn1J9GLkcYy5X/I+cYtWi/x5F28XYbUYURobgZJoSoSCpBriV/OM7u4AnJG2IiBeq2zObLLmcqPc5C7Mpb6rc0Xwe0BkR2yKiF1gHLK9yn8zMMmeqhMJ8YHvB+65UO4ykVZI6JHV0d3cfs86ZmWXFVAmFYscVRh9Wi7g9Itojor21tfUYdMvMLFumSih0AQsK3rcBHlLUzOwYmyqh8ASwWNIiSfXAlcCGKvfJzCxzpsTVRxHRL+mzwAPkL0ldExHPV7lbZmaZMyVCASAiNgIbq90PM7MsmyqHj8zMbAqYtgPiSeoGflXm4icBb05id6ptpm0PzLxtmmnbAzNvm2ba9kDxbTotIkpevjltQ6ESkjrGGiVwuplp2wMzb5tm2vbAzNummbY9UN42+fCRmZkNcyiYmdmwrIbC7dXuwCSbadsDM2+bZtr2wMzbppm2PVDGNmXynIKZmRWX1T0FMzMrwqFgZmbDMhUKkpZJelFSp6Trqt2fySDpZUnPSnpaUke1+1MOSWsk7ZT0XEFtrqRNkram1znV7OORKLE9fyrp1fQ9PS3psmr28UhIWiDpIUlbJD0v6fOpPp2/o1LbNC2/J0mzJT0u6Wdpe/53qi+S9Fj6ju5OY8uNva6snFNIT3f7BQVPdwOumu5Pd5P0MtAeEdP2phtJvwO8DdwZEWen2p8DPRFxUwrwORHxxWr2c6JKbM+fAm9HxFeq2bdySJoHzIuIJyWdAGwGVgD/ien7HZXapt9jGn5Pyj/A+biIeFtSHfAT4PPAF4D7ImKdpK8DP4uI28ZaV5b2FPx0tykqIh4GekaUlwNr0/Ra8v/DTgsltmfaiogdEfFkmn4L2EL+IVjT+TsqtU3TUuS9nd7WpZ8ALgTuSfUJfUdZCoUJPd1tGgrgHyVtlrSq2p2ZRKdExA7I/w8MnFzl/kyGz0p6Jh1emjaHWgpJWgi8H3iMGfIdjdgmmKbfk6QaSU8DO4FNwC+B3RHRn5pM6HdelkJhQk93m4Y+FBHnApcC16ZDFzb13Aa8B3gfsAP4P9XtzpGTdDxwL/BHEbG32v2ZDEW2adp+TxExEBHvI/+QsvOAM4o1G289WQqFGfl0t4h4Lb3uBP6O/H8MM8Eb6bjv0PHfnVXuT0Ui4o30P+0g8A2m2feUjlPfC3wnIu5L5Wn9HRXbpun+PQFExG7gx8BSoEXS0CMSJvQ7L0uhMOOe7ibpuHSSDEnHARcDz4291LSxAViZplcC91exLxUb+uWZ/C7T6HtKJzHvALZExF8WzJq231GpbZqu35OkVkktaboB+Aj58yQPAZenZhP6jjJz9RFAurzsrzj0dLcbq9ylikh6N/m9A8g/MOlvp+M2SboL+DD5YX7fAFYDfw+sB94FvAJcERHT4uRtie35MPlDEgG8DPz+0PH4qU7SBcD/A54FBlP5f5E/Bj9dv6NS23QV0/B7knQO+RPJNeT/2
F8fETek3xHrgLnAU8CnIuLgmOvKUiiYmdnYsnT4yMzMxuFQMDOzYQ4FMzMb5lAwM7NhDgUzMxvmUDAzs2EOBTMzG/b/AUv5vMyt2x8UAAAAAElFTkSuQmCC\n",
452 | "text/plain": [
453 | ""
454 | ]
455 | },
456 | "metadata": {
457 | "needs_background": "light"
458 | },
459 | "output_type": "display_data"
460 | }
461 | ],
462 | "source": [
463 | "%matplotlib inline\n",
464 | "import matplotlib.pyplot as plt\n",
465 | "import numpy as np\n",
466 | "\n",
467 | "def showPlot(points, title):\n",
468 | " plt.figure()\n",
469 | " fig, ax = plt.subplots()\n",
470 | " plt.plot(points)\n",
471 | "\n",
472 | "showPlot(cbow_losses, 'CBOW Losses')\n",
473 | "showPlot(sg_losses, 'SkipGram Losses')"
474 | ]
475 | },
483 | {
484 | "cell_type": "code",
485 | "execution_count": 10,
486 | "metadata": {},
487 | "outputs": [
488 | {
489 | "name": "stdout",
490 | "output_type": "stream",
491 | "text": [
492 | "Loaded 400000 words\n"
493 | ]
494 | }
495 | ],
496 | "source": [
497 | "import torch\n",
498 | "import torchtext.vocab as vocab\n",
499 | "glove = vocab.GloVe(name = \"6B\", dim = 100)\n",
500 | "print(\"Loaded {} words\".format(len(glove.itos)))"
501 | ]
502 | },
503 | {
504 | "cell_type": "code",
505 | "execution_count": 11,
506 | "metadata": {},
507 | "outputs": [],
508 | "source": [
509 | "def get_word(word):\n",
510 | " return glove.vectors[glove.stoi[word]]"
511 | ]
512 | },
513 | {
514 | "cell_type": "code",
515 | "execution_count": 12,
516 | "metadata": {},
517 | "outputs": [],
518 | "source": [
519 | "def closest(vec, n = 10):\n",
520 | " all_dists = [(w, torch.dist(vec, get_word(w))) for w in glove.itos]\n",
521 | " return sorted(all_dists, key = lambda t: t[1])[:n]"
522 | ]
523 | },
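524 | {
525 | "cell_type": "markdown",
526 | "metadata": {},
527 | "source": [
528 | "`closest` computes `torch.dist` in a Python loop over all 400,000 vectors; the same top-n search can be done in one broadcasted call. A sketch using the same GloVe objects (`closest_fast` is an added name, not part of the original notebook):"
529 | ]
530 | },
531 | {
532 | "cell_type": "code",
533 | "execution_count": null,
534 | "metadata": {},
535 | "outputs": [],
536 | "source": [
537 | "def closest_fast(vec, n=10):\n",
538 | "    # Euclidean distance from vec to every row of the embedding matrix at once\n",
539 | "    dists = torch.norm(glove.vectors - vec, dim=1)\n",
540 | "    vals, idxs = torch.topk(dists, n, largest=False)\n",
541 | "    return [(glove.itos[i.item()], v.item()) for v, i in zip(vals, idxs)]"
542 | ]
543 | },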
524 | {
525 | "cell_type": "code",
526 | "execution_count": 13,
527 | "metadata": {},
528 | "outputs": [],
529 | "source": [
530 | "def print_tuples(tuples):\n",
531 | " for tuple in tuples:\n",
532 | " print(\"(%.4f) %s\" % (tuple[1], tuple[0]))"
533 | ]
534 | },
535 | {
536 | "cell_type": "code",
537 | "execution_count": 14,
538 | "metadata": {},
539 | "outputs": [
540 | {
541 | "name": "stdout",
542 | "output_type": "stream",
543 | "text": [
544 | "(0.0000) google\n",
545 | "(3.0772) yahoo\n",
546 | "(3.8836) microsoft\n",
547 | "(4.1048) web\n",
548 | "(4.1082) aol\n",
549 | "(4.1165) facebook\n",
550 | "(4.3917) ebay\n",
551 | "(4.4122) msn\n",
552 | "(4.4540) internet\n",
553 | "(4.4651) netscape\n"
554 | ]
555 | }
556 | ],
557 | "source": [
558 | "print_tuples(closest(get_word(\"google\")))"
559 | ]
560 | },
561 | {
562 | "cell_type": "code",
563 | "execution_count": 15,
564 | "metadata": {},
565 | "outputs": [],
566 | "source": [
567 | "def analogy(w1, w2, w3, n=5, filter_given=True):\n",
568 | " print('\\n[%s : %s :: %s : ?]' % (w1, w2, w3))\n",
569 | " closest_words = closest(get_word(w2) - get_word(w1) + get_word(w3)) \n",
570 | " if filter_given:\n",
571 | " closest_words = [t for t in closest_words if t[0] not in [w1, w2, w3]]\n",
572 | " \n",
573 | " print_tuples(closest_words[:n])"
574 | ]
575 | },
576 | {
577 | "cell_type": "code",
578 | "execution_count": 16,
579 | "metadata": {},
580 | "outputs": [
581 | {
582 | "name": "stdout",
583 | "output_type": "stream",
584 | "text": [
585 | "\n",
586 | "[king : man :: queen : ?]\n",
587 | "(4.0811) woman\n",
588 | "(4.6916) girl\n",
589 | "(5.2703) she\n",
590 | "(5.2788) teenager\n",
591 | "(5.3084) boy\n"
592 | ]
593 | }
594 | ],
595 | "source": [
596 | "analogy(\"king\", \"man\", \"queen\")"
597 | ]
598 | },
599 | {
600 | "cell_type": "code",
601 | "execution_count": 17,
602 | "metadata": {},
603 | "outputs": [
604 | {
605 | "name": "stdout",
606 | "output_type": "stream",
607 | "text": [
608 | "\n",
609 | "[man : actor :: woman : ?]\n",
610 | "(2.8133) actress\n",
611 | "(5.0039) comedian\n",
612 | "(5.1399) actresses\n",
613 | "(5.2773) starred\n",
614 | "(5.3085) screenwriter\n",
615 | "\n",
616 | "[cat : kitten :: dog : ?]\n",
617 | "(3.8146) puppy\n",
618 | "(4.2944) rottweiler\n",
619 | "(4.5888) puppies\n",
620 | "(4.6086) pooch\n",
621 | "(4.6520) pug\n",
622 | "\n",
623 | "[dog : puppy :: cat : ?]\n",
624 | "(3.8146) kitten\n",
625 | "(4.0255) puppies\n",
626 | "(4.1575) kittens\n",
627 | "(4.1882) pterodactyl\n",
628 | "(4.1945) scaredy\n",
629 | "\n",
630 | "[russia : moscow :: france : ?]\n",
631 | "(3.2697) paris\n",
632 | "(4.6857) french\n",
633 | "(4.7085) lyon\n",
634 | "(4.9087) strasbourg\n",
635 | "(5.0362) marseille\n",
636 | "\n",
637 | "[obama : president :: trump : ?]\n",
638 | "(6.4302) executive\n",
639 | "(6.5149) founder\n",
640 | "(6.6997) ceo\n",
641 | "(6.7524) hilton\n",
642 | "(6.7729) walt\n",
643 | "\n",
644 | "[rich : mansion :: poor : ?]\n",
645 | "(5.8262) residence\n",
646 | "(5.9444) riverside\n",
647 | "(6.0283) hillside\n",
648 | "(6.0328) abandoned\n",
649 | "(6.0681) bungalow\n",
650 | "\n",
651 | "[elvis : rock :: eminem : ?]\n",
652 | "(5.6597) rap\n",
653 | "(6.2057) rappers\n",
654 | "(6.2161) rapper\n",
655 | "(6.2444) punk\n",
656 | "(6.2690) hop\n",
657 | "\n",
658 | "[paper : newspaper :: screen : ?]\n",
659 | "(4.7810) tv\n",
660 | "(5.1049) television\n",
661 | "(5.3818) cinema\n",
662 | "(5.5524) feature\n",
663 | "(5.5646) shows\n",
664 | "\n",
665 | "[monet : paint :: michelangelo : ?]\n",
666 | "(6.0782) plaster\n",
667 | "(6.3768) mold\n",
668 | "(6.3922) tile\n",
669 | "(6.5819) marble\n",
670 | "(6.6524) image\n",
671 | "\n",
672 | "[beer : barley :: wine : ?]\n",
673 | "(5.6021) grape\n",
674 | "(5.6760) beans\n",
675 | "(5.8174) grapes\n",
676 | "(5.9035) lentils\n",
677 | "(5.9454) figs\n",
678 | "\n",
679 | "[earth : moon :: sun : ?]\n",
680 | "(6.2294) lee\n",
681 | "(6.4125) kang\n",
682 | "(6.4644) tan\n",
683 | "(6.4757) yang\n",
684 | "(6.4853) lin\n",
685 | "\n",
686 | "[house : roof :: castle : ?]\n",
687 | "(6.2919) stonework\n",
688 | "(6.3779) masonry\n",
689 | "(6.4773) canopy\n",
690 | "(6.4954) fortress\n",
691 | "(6.5259) battlements\n",
692 | "\n",
693 | "[building : architect :: software : ?]\n",
694 | "(5.8369) programmer\n",
695 | "(6.8881) entrepreneur\n",
696 | "(6.9240) inventor\n",
697 | "(6.9730) developer\n",
698 | "(6.9949) innovator\n",
699 | "\n",
700 | "[boston : bruins :: phoenix : ?]\n",
701 | "(3.8546) suns\n",
702 | "(4.1968) mavericks\n",
703 | "(4.6126) coyotes\n",
704 | "(4.6894) mavs\n",
705 | "(4.6971) knicks\n",
706 | "\n",
707 | "[good : heaven :: bad : ?]\n",
708 | "(4.3959) hell\n",
709 | "(5.2864) ghosts\n",
710 | "(5.2898) hades\n",
711 | "(5.3414) madness\n",
712 | "(5.3520) purgatory\n",
713 | "\n",
714 | "[jordan : basketball :: woods : ?]\n",
715 | "(5.8607) golf\n",
716 | "(6.4110) golfers\n",
717 | "(6.4418) tournament\n",
718 | "(6.4592) tennis\n",
719 | "(6.6560) collegiate\n"
720 | ]
721 | }
722 | ],
723 | "source": [
724 | "analogy('man', 'actor', 'woman')\n",
725 | "analogy('cat', 'kitten', 'dog')\n",
726 | "analogy('dog', 'puppy', 'cat')\n",
727 | "analogy('russia', 'moscow', 'france')\n",
728 | "analogy('obama', 'president', 'trump')\n",
729 | "analogy('rich', 'mansion', 'poor')\n",
730 | "analogy('elvis', 'rock', 'eminem')\n",
731 | "analogy('paper', 'newspaper', 'screen')\n",
732 | "analogy('monet', 'paint', 'michelangelo')\n",
733 | "analogy('beer', 'barley', 'wine')\n",
734 | "analogy('earth', 'moon', 'sun')\n",
735 | "analogy('house', 'roof', 'castle')\n",
736 | "analogy('building', 'architect', 'software')\n",
737 | "analogy('boston', 'bruins', 'phoenix')\n",
738 | "analogy('good', 'heaven', 'bad')\n",
739 | "analogy('jordan', 'basketball', 'woods')"
740 | ]
741 | }
749 | ],
750 | "metadata": {
751 | "kernelspec": {
752 | "display_name": "Python 3",
753 | "language": "python",
754 | "name": "python3"
755 | },
756 | "language_info": {
757 | "codemirror_mode": {
758 | "name": "ipython",
759 | "version": 3
760 | },
761 | "file_extension": ".py",
762 | "mimetype": "text/x-python",
763 | "name": "python",
764 | "nbconvert_exporter": "python",
765 | "pygments_lexer": "ipython3",
766 | "version": "3.6.8"
767 | }
768 | },
769 | "nbformat": 4,
770 | "nbformat_minor": 4
771 | }
772 |
--------------------------------------------------------------------------------
/3_Tagging_RNN.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import os\n",
10 | "import sys\n",
11 | "import time\n",
12 | "import torch\n",
13 | "import numpy as np\n",
14 | "import torch.nn as nn\n",
15 | "import torch.optim as optim\n",
16 | "\n",
17 | "from torch.nn import functional as F\n",
18 | "from torch.autograd import Variable\n",
19 | "from torchtext import data\n",
20 | "from torchtext import datasets\n",
21 | "from torchtext.vocab import Vectors, GloVe\n",
22 | "\n",
23 | "def load_dataset(test_sen=None): \n",
24 | " tokenize = lambda x: x.split()\n",
25 | " TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True, batch_first=True, fix_length=200)\n",
26 | " LABEL = data.LabelField()\n",
27 | " train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)\n",
28 | " TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))\n",
29 | " LABEL.build_vocab(train_data)\n",
30 | "\n",
31 | " word_embeddings = TEXT.vocab.vectors\n",
32 | " print (\"Length of Text Vocabulary: \" + str(len(TEXT.vocab)))\n",
33 | " print (\"Vector size of Text Vocabulary: \", TEXT.vocab.vectors.size())\n",
34 | " print (\"Label Length: \" + str(len(LABEL.vocab)))\n",
35 | "\n",
36 | " train_data, valid_data = train_data.split()\n",
37 | " train_iter, valid_iter, test_iter = data.BucketIterator.splits((train_data, valid_data, test_data), batch_size=32, sort_key=lambda x: len(x.text), repeat=False, shuffle=True)\n",
38 | "\n",
39 | " vocab_size = len(TEXT.vocab)\n",
40 | "\n",
41 | " return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 2,
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "class RNN(nn.Module):\n",
51 | " def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights):\n",
52 | " super(RNN, self).__init__()\n",
53 | " \n",
54 | " self.batch_size = batch_size\n",
55 | " self.output_size = output_size\n",
56 | " self.hidden_size = hidden_size\n",
57 | " self.vocab_size = vocab_size\n",
58 | " self.embedding_length = embedding_length\n",
59 | " \n",
60 | " self.word_embeddings = nn.Embedding(vocab_size, embedding_length)\n",
61 | " self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False)\n",
62 | " self.rnn = nn.RNN(embedding_length, hidden_size, num_layers=2, bidirectional=True)\n",
63 | " self.label = nn.Linear(4*hidden_size, output_size)\n",
64 | " \n",
65 | " def forward(self, input_sentences, batch_size=None):\n",
66 | " input = self.word_embeddings(input_sentences)\n",
67 | " input = input.permute(1, 0, 2)\n",
68 | " \n",
69 | " if batch_size is None:\n",
70 | " h_0 = Variable(torch.zeros(4, self.batch_size, self.hidden_size).cuda())\n",
71 | " else:\n",
72 | " h_0 = Variable(torch.zeros(4, batch_size, self.hidden_size).cuda())\n",
73 | " \n",
74 | " output, h_n = self.rnn(input, h_0)\n",
75 | " h_n = h_n.permute(1, 0, 2)\n",
76 | " h_n = h_n.contiguous().view(h_n.size()[0], h_n.size()[1]*h_n.size()[2])\n",
77 | " logits = self.label(h_n)\n",
78 | " \n",
79 | " return logits"
80 | ]
81 | },
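{
"cell_type": "markdown",
"metadata": {},
"source": [
"A standalone shape check for the reshape in `RNN.forward` above: with `num_layers=2` and `bidirectional=True`, `h_n` has shape `(num_layers * num_directions, batch, hidden) = (4, batch, hidden)`, so flattening it per example yields `4 * hidden_size` features, which is exactly what `self.label = nn.Linear(4*hidden_size, output_size)` expects."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import torch.nn as nn\n",
"\n",
"rnn = nn.RNN(300, 256, num_layers=2, bidirectional=True)\n",
"x = torch.randn(200, 32, 300)   # (seq_len, batch, embedding_length)\n",
"h0 = torch.zeros(4, 32, 256)    # (num_layers * num_directions, batch, hidden_size)\n",
"out, h_n = rnn(x, h0)\n",
"print(h_n.shape)                # torch.Size([4, 32, 256])\n",
"h_n = h_n.permute(1, 0, 2).contiguous().view(32, -1)\n",
"print(h_n.shape)                # torch.Size([32, 1024]) == (batch, 4 * hidden_size)"
]
},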
82 | {
83 | "cell_type": "code",
84 | "execution_count": 3,
85 | "metadata": {},
86 | "outputs": [
87 | {
88 | "name": "stdout",
89 | "output_type": "stream",
90 | "text": [
91 | "Length of Text Vocabulary: 251639\n",
92 | "Vector size of Text Vocabulary: torch.Size([251639, 300])\n",
93 | "Label Length: 2\n",
94 | "Epoch: 1, Idx: 100, Training Loss: 0.6538, Training Accuracy: 59.38%\n",
95 | "Epoch: 1, Idx: 200, Training Loss: 0.7323, Training Accuracy: 28.12%\n",
96 | "Epoch: 1, Idx: 300, Training Loss: 0.6863, Training Accuracy: 62.50%\n",
97 | "Epoch: 1, Idx: 400, Training Loss: 0.6000, Training Accuracy: 68.75%\n",
98 | "Epoch: 1, Idx: 500, Training Loss: 0.6716, Training Accuracy: 65.62%\n",
99 | "Epoch: 01, Train Loss: 0.701, Train Acc: 56.00%, Val. Loss: 0.684682, Val. Acc: 53.44%\n",
100 | "Epoch: 2, Idx: 100, Training Loss: 0.6071, Training Accuracy: 62.50%\n",
101 | "Epoch: 2, Idx: 200, Training Loss: 0.6640, Training Accuracy: 53.12%\n",
102 | "Epoch: 2, Idx: 300, Training Loss: 0.7665, Training Accuracy: 43.75%\n",
103 | "Epoch: 2, Idx: 400, Training Loss: 0.7446, Training Accuracy: 65.62%\n",
104 | "Epoch: 2, Idx: 500, Training Loss: 0.7483, Training Accuracy: 40.62%\n",
105 | "Epoch: 02, Train Loss: 0.702, Train Acc: 55.19%, Val. Loss: 0.726367, Val. Acc: 53.12%\n",
106 | "Epoch: 3, Idx: 100, Training Loss: 0.6248, Training Accuracy: 68.75%\n",
107 | "Epoch: 3, Idx: 200, Training Loss: 0.7327, Training Accuracy: 50.00%\n",
108 | "Epoch: 3, Idx: 300, Training Loss: 0.6317, Training Accuracy: 65.62%\n",
109 | "Epoch: 3, Idx: 400, Training Loss: 0.5729, Training Accuracy: 71.88%\n",
110 | "Epoch: 3, Idx: 500, Training Loss: 0.6937, Training Accuracy: 56.25%\n",
111 | "Epoch: 03, Train Loss: 0.685, Train Acc: 60.74%, Val. Loss: 0.728285, Val. Acc: 54.92%\n",
112 | "Epoch: 4, Idx: 100, Training Loss: 0.7239, Training Accuracy: 59.38%\n",
113 | "Epoch: 4, Idx: 200, Training Loss: 0.5938, Training Accuracy: 75.00%\n",
114 | "Epoch: 4, Idx: 300, Training Loss: 0.9016, Training Accuracy: 53.12%\n",
115 | "Epoch: 4, Idx: 400, Training Loss: 0.5571, Training Accuracy: 78.12%\n",
116 | "Epoch: 4, Idx: 500, Training Loss: 0.5879, Training Accuracy: 65.62%\n",
117 | "Epoch: 04, Train Loss: 0.667, Train Acc: 61.56%, Val. Loss: 0.678201, Val. Acc: 57.76%\n",
118 | "Epoch: 5, Idx: 100, Training Loss: 0.6851, Training Accuracy: 56.25%\n",
119 | "Epoch: 5, Idx: 200, Training Loss: 0.6559, Training Accuracy: 62.50%\n",
120 | "Epoch: 5, Idx: 300, Training Loss: 0.6128, Training Accuracy: 62.50%\n",
121 | "Epoch: 5, Idx: 400, Training Loss: 0.6151, Training Accuracy: 59.38%\n",
122 | "Epoch: 5, Idx: 500, Training Loss: 0.6839, Training Accuracy: 68.75%\n",
123 | "Epoch: 05, Train Loss: 0.669, Train Acc: 61.76%, Val. Loss: 0.631271, Val. Acc: 66.40%\n",
124 | "Epoch: 6, Idx: 100, Training Loss: 0.4968, Training Accuracy: 78.12%\n",
125 | "Epoch: 6, Idx: 200, Training Loss: 0.7118, Training Accuracy: 62.50%\n",
126 | "Epoch: 6, Idx: 300, Training Loss: 0.5181, Training Accuracy: 81.25%\n",
127 | "Epoch: 6, Idx: 400, Training Loss: 0.5818, Training Accuracy: 75.00%\n",
128 | "Epoch: 6, Idx: 500, Training Loss: 0.5787, Training Accuracy: 68.75%\n",
129 | "Epoch: 06, Train Loss: 0.664, Train Acc: 62.54%, Val. Loss: 0.714283, Val. Acc: 53.20%\n",
130 | "Epoch: 7, Idx: 100, Training Loss: 0.7741, Training Accuracy: 65.62%\n",
131 | "Epoch: 7, Idx: 200, Training Loss: 0.6719, Training Accuracy: 62.50%\n",
132 | "Epoch: 7, Idx: 300, Training Loss: 0.5993, Training Accuracy: 68.75%\n",
133 | "Epoch: 7, Idx: 400, Training Loss: 0.7759, Training Accuracy: 46.88%\n",
134 | "Epoch: 7, Idx: 500, Training Loss: 0.6450, Training Accuracy: 59.38%\n",
135 | "Epoch: 07, Train Loss: 0.659, Train Acc: 62.85%, Val. Loss: 0.643714, Val. Acc: 61.96%\n",
136 | "Epoch: 8, Idx: 100, Training Loss: 0.6427, Training Accuracy: 75.00%\n",
137 | "Epoch: 8, Idx: 200, Training Loss: 0.7509, Training Accuracy: 46.88%\n",
138 | "Epoch: 8, Idx: 300, Training Loss: 0.7016, Training Accuracy: 53.12%\n",
139 | "Epoch: 8, Idx: 400, Training Loss: 0.6085, Training Accuracy: 71.88%\n",
140 | "Epoch: 8, Idx: 500, Training Loss: 0.5723, Training Accuracy: 71.88%\n",
141 | "Epoch: 08, Train Loss: 0.661, Train Acc: 63.19%, Val. Loss: 0.631669, Val. Acc: 66.58%\n",
142 | "Epoch: 9, Idx: 100, Training Loss: 0.6699, Training Accuracy: 56.25%\n",
143 | "Epoch: 9, Idx: 200, Training Loss: 0.7980, Training Accuracy: 56.25%\n",
144 | "Epoch: 9, Idx: 300, Training Loss: 0.7833, Training Accuracy: 56.25%\n",
145 | "Epoch: 9, Idx: 400, Training Loss: 0.6437, Training Accuracy: 56.25%\n",
146 | "Epoch: 9, Idx: 500, Training Loss: 0.6734, Training Accuracy: 65.62%\n",
147 | "Epoch: 09, Train Loss: 0.658, Train Acc: 63.68%, Val. Loss: 0.675087, Val. Acc: 61.79%\n",
148 | "Epoch: 10, Idx: 100, Training Loss: 0.5863, Training Accuracy: 75.00%\n",
149 | "Epoch: 10, Idx: 200, Training Loss: 0.6492, Training Accuracy: 71.88%\n",
150 | "Epoch: 10, Idx: 300, Training Loss: 0.5064, Training Accuracy: 84.38%\n",
151 | "Epoch: 10, Idx: 400, Training Loss: 0.8142, Training Accuracy: 53.12%\n",
152 | "Epoch: 10, Idx: 500, Training Loss: 0.7308, Training Accuracy: 46.88%\n",
153 | "Epoch: 10, Train Loss: 0.651, Train Acc: 64.40%, Val. Loss: 0.645745, Val. Acc: 65.99%\n",
154 | "Test Loss: 0.647, Test Acc: 65.61%\n"
155 | ]
156 | }
157 | ],
158 | "source": [
159 | "TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter = load_dataset()\n",
160 | "\n",
161 | "def clip_gradient(model, clip_value):\n",
162 | " params = list(filter(lambda p: p.grad is not None, model.parameters()))\n",
163 | " for p in params:\n",
164 | " p.grad.data.clamp_(-clip_value, clip_value)\n",
165 | " \n",
166 | "def train_model(model, train_iter, epoch):\n",
167 | " total_epoch_loss = 0\n",
168 | " total_epoch_acc = 0\n",
169 | " model.cuda()\n",
170 | " optim = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))\n",
171 | " steps = 0\n",
172 | " model.train()\n",
173 | " for idx, batch in enumerate(train_iter):\n",
174 | " text = batch.text[0]\n",
175 | " target = batch.label\n",
176 | " target = torch.autograd.Variable(target).long()\n",
177 | " if torch.cuda.is_available():\n",
178 | " text = text.cuda()\n",
179 | " target = target.cuda()\n",
180 | " if (text.size()[0] is not 32):# One of the batch returned by BucketIterator has length different than 32.\n",
181 | " continue\n",
182 | " optim.zero_grad()\n",
183 | " prediction = model(text)\n",
184 | " loss = loss_fn(prediction, target)\n",
185 | " num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).float().sum()\n",
186 | " acc = 100.0 * num_corrects/len(batch)\n",
187 | " loss.backward()\n",
188 | " clip_gradient(model, 1e-1)\n",
189 | " optim.step()\n",
190 | " steps += 1\n",
191 | " \n",
192 | " if steps % 100 == 0:\n",
193 | " print (f'Epoch: {epoch+1}, Idx: {idx+1}, Training Loss: {loss.item():.4f}, Training Accuracy: {acc.item(): .2f}%')\n",
194 | " \n",
195 | " total_epoch_loss += loss.item()\n",
196 | " total_epoch_acc += acc.item()\n",
197 | " \n",
198 | " return total_epoch_loss/len(train_iter), total_epoch_acc/len(train_iter)\n",
199 | "\n",
200 | "def eval_model(model, val_iter):\n",
201 | " total_epoch_loss = 0\n",
202 | " total_epoch_acc = 0\n",
203 | " model.eval()\n",
204 | " with torch.no_grad():\n",
205 | " for idx, batch in enumerate(val_iter):\n",
206 | " text = batch.text[0]\n",
207 | " if (text.size()[0] is not 32):\n",
208 | " continue\n",
209 | " target = batch.label\n",
210 | " target = torch.autograd.Variable(target).long()\n",
211 | " if torch.cuda.is_available():\n",
212 | " text = text.cuda()\n",
213 | " target = target.cuda()\n",
214 | " prediction = model(text)\n",
215 | " loss = loss_fn(prediction, target)\n",
216 | " num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).sum()\n",
217 | " acc = 100.0 * num_corrects/len(batch)\n",
218 | " total_epoch_loss += loss.item()\n",
219 | " total_epoch_acc += acc.item()\n",
220 | "\n",
221 | " return total_epoch_loss/len(val_iter), total_epoch_acc/len(val_iter)\n",
222 | "\n",
223 | "learning_rate = 1e-5\n",
224 | "batch_size = 32\n",
225 | "output_size = 2\n",
226 | "hidden_size = 256\n",
227 | "embedding_length = 300\n",
228 | "\n",
229 | "model = RNN(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)\n",
230 | "loss_fn = F.cross_entropy\n",
231 | "\n",
232 | "for epoch in range(10):\n",
233 | " train_loss, train_acc = train_model(model, train_iter, epoch)\n",
234 | " val_loss, val_acc = eval_model(model, valid_iter)\n",
235 | " \n",
236 | " print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%')\n",
237 | " \n",
238 | "test_loss, test_acc = eval_model(model, test_iter)\n",
239 | "print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')"
240 | ]
241 | },
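{
"cell_type": "markdown",
"metadata": {},
"source": [
"`clip_gradient` above clamps each gradient element into `[-clip_value, clip_value]`. PyTorch ships the same element-wise operation as a built-in, so inside `train_model` the hand-rolled loop could be swapped for the one-liner below (a drop-in alternative, not what the run above actually used)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Equivalent to clip_gradient(model, 1e-1):\n",
"# torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=1e-1)\n",
"#\n",
"# This clips element-wise values; clipping the total gradient *norm*\n",
"# instead would be torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)."
]
},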
242 | {
243 | "cell_type": "code",
244 | "execution_count": 4,
245 | "metadata": {},
246 | "outputs": [
247 | {
248 | "name": "stdout",
249 | "output_type": "stream",
250 | "text": [
251 | "tensor([[0.1081, 0.8919]], device='cuda:0', grad_fn=)\n",
252 | "Sentiment: Positive\n"
253 | ]
254 | },
255 | {
256 | "name": "stderr",
257 | "output_type": "stream",
258 | "text": [
259 | "c:\\users\\justin\\venv\\lib\\site-packages\\ipykernel_launcher.py:8: UserWarning: volatile was removed and now has no effect. Use `with torch.no_grad():` instead.\n",
260 | " \n"
261 | ]
262 | }
263 | ],
264 | "source": [
265 | "test_sen1 = \"This is one of the best creation of Nolan. I can say, it's his magnum opus. Loved the soundtrack and especially those creative dialogues.\"\n",
266 | "\n",
267 | "test_sen1 = TEXT.preprocess(test_sen1)\n",
268 | "test_sen1 = [[TEXT.vocab.stoi[x] for x in test_sen1]]\n",
269 | "\n",
270 | "test_sen = np.asarray(test_sen1)\n",
271 | "test_sen = torch.LongTensor(test_sen)\n",
272 | "test_tensor = Variable(test_sen, volatile=True)\n",
273 | "test_tensor = test_tensor.cuda()\n",
274 | "model.eval()\n",
275 | "output = model(test_tensor, 1)\n",
276 | "out = F.softmax(output, 1)\n",
277 | "print(out)\n",
278 | "if (torch.argmax(out[0]) == 1):\n",
279 | " print (\"Sentiment: Positive\")\n",
280 | "else:\n",
281 | " print (\"Sentiment: Negative\")"
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": 5,
287 | "metadata": {},
288 | "outputs": [
289 | {
290 | "name": "stdout",
291 | "output_type": "stream",
292 | "text": [
293 | "tensor([[0.6741, 0.3259]], device='cuda:0', grad_fn=)\n",
294 | "Sentiment: Negative\n"
295 | ]
296 | },
297 | {
298 | "name": "stderr",
299 | "output_type": "stream",
300 | "text": [
301 | "c:\\users\\justin\\venv\\lib\\site-packages\\ipykernel_launcher.py:7: UserWarning: volatile was removed and now has no effect. Use `with torch.no_grad():` instead.\n",
302 | " import sys\n"
303 | ]
304 | }
305 | ],
306 | "source": [
307 | "test_sen2 = \"Ohh, such a ridiculous movie. Not gonna recommend it to anyone. Complete waste of time and money.\"\n",
308 | "test_sen2 = TEXT.preprocess(test_sen2)\n",
309 | "test_sen2 = [[TEXT.vocab.stoi[x] for x in test_sen2]]\n",
310 | "\n",
311 | "test_sen = np.asarray(test_sen2)\n",
312 | "test_sen = torch.LongTensor(test_sen)\n",
313 | "test_tensor = Variable(test_sen, volatile=True)\n",
314 | "test_tensor = test_tensor.cuda()\n",
315 | "model.eval()\n",
316 | "output = model(test_tensor, 1)\n",
317 | "out = F.softmax(output, 1)\n",
318 | "print(out)\n",
319 | "if (torch.argmax(out[0]) == 1):\n",
320 | " print (\"Sentiment: Positive\")\n",
321 | "else:\n",
322 | " print (\"Sentiment: Negative\")"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": 6,
328 | "metadata": {},
329 | "outputs": [
330 | {
331 | "name": "stdout",
332 | "output_type": "stream",
333 | "text": [
334 | "Epoch: 1, Idx: 100, Training Loss: 0.6934, Training Accuracy: 50.00%\n",
335 | "Epoch: 1, Idx: 200, Training Loss: 0.7078, Training Accuracy: 46.88%\n",
336 | "Epoch: 1, Idx: 300, Training Loss: 0.6930, Training Accuracy: 56.25%\n",
337 | "Epoch: 1, Idx: 400, Training Loss: 0.6874, Training Accuracy: 56.25%\n",
338 | "Epoch: 1, Idx: 500, Training Loss: 0.6718, Training Accuracy: 78.12%\n",
339 | "Epoch: 01, Train Loss: 0.690, Train Acc: 51.97%, Val. Loss: 0.688056, Val. Acc: 51.27%\n",
340 | "Epoch: 2, Idx: 100, Training Loss: 0.6545, Training Accuracy: 62.50%\n",
341 | "Epoch: 2, Idx: 200, Training Loss: 0.6695, Training Accuracy: 62.50%\n",
342 | "Epoch: 2, Idx: 300, Training Loss: 0.6926, Training Accuracy: 53.12%\n",
343 | "Epoch: 2, Idx: 400, Training Loss: 0.6827, Training Accuracy: 56.25%\n",
344 | "Epoch: 2, Idx: 500, Training Loss: 0.6129, Training Accuracy: 68.75%\n",
345 | "Epoch: 02, Train Loss: 0.667, Train Acc: 58.72%, Val. Loss: 0.682012, Val. Acc: 54.80%\n",
346 | "Epoch: 3, Idx: 100, Training Loss: 0.6308, Training Accuracy: 68.75%\n",
347 | "Epoch: 3, Idx: 200, Training Loss: 0.4342, Training Accuracy: 84.38%\n",
348 | "Epoch: 3, Idx: 300, Training Loss: 0.6503, Training Accuracy: 62.50%\n",
349 | "Epoch: 3, Idx: 400, Training Loss: 0.6636, Training Accuracy: 68.75%\n",
350 | "Epoch: 3, Idx: 500, Training Loss: 0.5156, Training Accuracy: 75.00%\n",
351 | "Epoch: 03, Train Loss: 0.590, Train Acc: 69.50%, Val. Loss: 0.494978, Val. Acc: 77.09%\n",
352 | "Epoch: 4, Idx: 100, Training Loss: 0.5605, Training Accuracy: 71.88%\n",
353 | "Epoch: 4, Idx: 200, Training Loss: 0.8281, Training Accuracy: 65.62%\n",
354 | "Epoch: 4, Idx: 300, Training Loss: 0.6036, Training Accuracy: 65.62%\n",
355 | "Epoch: 4, Idx: 400, Training Loss: 0.4735, Training Accuracy: 71.88%\n",
356 | "Epoch: 4, Idx: 500, Training Loss: 0.4546, Training Accuracy: 78.12%\n",
357 | "Epoch: 04, Train Loss: 0.428, Train Acc: 80.67%, Val. Loss: 0.386465, Val. Acc: 82.08%\n",
358 | "Epoch: 5, Idx: 100, Training Loss: 0.3328, Training Accuracy: 87.50%\n",
359 | "Epoch: 5, Idx: 200, Training Loss: 0.3596, Training Accuracy: 78.12%\n",
360 | "Epoch: 5, Idx: 300, Training Loss: 0.3249, Training Accuracy: 87.50%\n",
361 | "Epoch: 5, Idx: 400, Training Loss: 0.6565, Training Accuracy: 68.75%\n",
362 | "Epoch: 5, Idx: 500, Training Loss: 0.4050, Training Accuracy: 78.12%\n",
363 | "Epoch: 05, Train Loss: 0.367, Train Acc: 83.72%, Val. Loss: 0.369900, Val. Acc: 82.79%\n",
364 | "Epoch: 6, Idx: 100, Training Loss: 0.4549, Training Accuracy: 84.38%\n",
365 | "Epoch: 6, Idx: 200, Training Loss: 0.3892, Training Accuracy: 81.25%\n",
366 | "Epoch: 6, Idx: 300, Training Loss: 0.1442, Training Accuracy: 96.88%\n",
367 | "Epoch: 6, Idx: 400, Training Loss: 0.3001, Training Accuracy: 87.50%\n",
368 | "Epoch: 6, Idx: 500, Training Loss: 0.4553, Training Accuracy: 75.00%\n",
369 | "Epoch: 06, Train Loss: 0.324, Train Acc: 85.52%, Val. Loss: 0.367029, Val. Acc: 83.53%\n",
370 | "Epoch: 7, Idx: 100, Training Loss: 0.2308, Training Accuracy: 90.62%\n",
371 | "Epoch: 7, Idx: 200, Training Loss: 0.3394, Training Accuracy: 81.25%\n",
372 | "Epoch: 7, Idx: 300, Training Loss: 0.4261, Training Accuracy: 87.50%\n",
373 | "Epoch: 7, Idx: 400, Training Loss: 0.3106, Training Accuracy: 90.62%\n",
374 | "Epoch: 7, Idx: 500, Training Loss: 0.1421, Training Accuracy: 96.88%\n",
375 | "Epoch: 07, Train Loss: 0.282, Train Acc: 87.91%, Val. Loss: 0.378974, Val. Acc: 83.80%\n",
376 | "Epoch: 8, Idx: 100, Training Loss: 0.1280, Training Accuracy: 96.88%\n",
377 | "Epoch: 8, Idx: 200, Training Loss: 0.4244, Training Accuracy: 84.38%\n",
378 | "Epoch: 8, Idx: 300, Training Loss: 0.3225, Training Accuracy: 90.62%\n",
379 | "Epoch: 8, Idx: 400, Training Loss: 0.3618, Training Accuracy: 84.38%\n",
380 | "Epoch: 8, Idx: 500, Training Loss: 0.2334, Training Accuracy: 87.50%\n",
381 | "Epoch: 08, Train Loss: 0.232, Train Acc: 90.26%, Val. Loss: 0.395538, Val. Acc: 83.34%\n",
382 | "Epoch: 9, Idx: 100, Training Loss: 0.1379, Training Accuracy: 96.88%\n",
383 | "Epoch: 9, Idx: 200, Training Loss: 0.2220, Training Accuracy: 87.50%\n",
384 | "Epoch: 9, Idx: 300, Training Loss: 0.2743, Training Accuracy: 87.50%\n",
385 | "Epoch: 9, Idx: 400, Training Loss: 0.3071, Training Accuracy: 84.38%\n",
386 | "Epoch: 9, Idx: 500, Training Loss: 0.1465, Training Accuracy: 93.75%\n",
387 | "Epoch: 09, Train Loss: 0.181, Train Acc: 92.97%, Val. Loss: 0.433728, Val. Acc: 83.30%\n",
388 | "Epoch: 10, Idx: 100, Training Loss: 0.0535, Training Accuracy: 96.88%\n",
389 | "Epoch: 10, Idx: 200, Training Loss: 0.1756, Training Accuracy: 90.62%\n",
390 | "Epoch: 10, Idx: 300, Training Loss: 0.0970, Training Accuracy: 100.00%\n",
391 | "Epoch: 10, Idx: 400, Training Loss: 0.1835, Training Accuracy: 93.75%\n",
392 | "Epoch: 10, Idx: 500, Training Loss: 0.0648, Training Accuracy: 96.88%\n",
393 | "Epoch: 10, Train Loss: 0.133, Train Acc: 94.96%, Val. Loss: 0.458091, Val. Acc: 82.26%\n",
394 | "Test Loss: 0.456, Test Acc: 82.61%\n"
395 | ]
396 | }
397 | ],
398 | "source": [
399 | "class LSTMClassifier(nn.Module):\n",
400 | " def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights):\n",
401 | " super(LSTMClassifier, self).__init__()\n",
402 | " self.batch_size = batch_size\n",
403 | " self.output_size = output_size\n",
404 | " self.hidden_size = hidden_size\n",
405 | " self.vocab_size = vocab_size\n",
406 | " self.embedding_length = embedding_length\n",
407 | "\n",
408 | " self.word_embeddings = nn.Embedding(vocab_size, embedding_length)# Initializing the look-up table.\n",
409 | " self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False) # Assigning the look-up table to the pre-trained GloVe word embedding.\n",
410 | " self.lstm = nn.LSTM(embedding_length, hidden_size)\n",
411 | " self.label = nn.Linear(hidden_size, output_size)\n",
412 | " \n",
413 | " def forward(self, input_sentence, batch_size=None):\n",
414 | " input = self.word_embeddings(input_sentence) # embedded input of shape = (batch_size, num_sequences, embedding_length)\n",
415 | " input = input.permute(1, 0, 2) # input.size() = (num_sequences, batch_size, embedding_length)\n",
416 | " if batch_size is None:\n",
417 | " h_0 = Variable(torch.zeros(1, self.batch_size, self.hidden_size).cuda()) # Initial hidden state of the LSTM\n",
418 | " c_0 = Variable(torch.zeros(1, self.batch_size, self.hidden_size).cuda()) # Initial cell state of the LSTM\n",
419 | " else:\n",
420 | " h_0 = Variable(torch.zeros(1, batch_size, self.hidden_size).cuda())\n",
421 | " c_0 = Variable(torch.zeros(1, batch_size, self.hidden_size).cuda())\n",
422 | " output, (final_hidden_state, final_cell_state) = self.lstm(input, (h_0, c_0))\n",
423 | " final_output = self.label(final_hidden_state[-1]) # final_hidden_state.size() = (1, batch_size, hidden_size) & final_output.size() = (batch_size, output_size)\n",
424 | " return final_output\n",
425 | " \n",
426 | "model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)\n",
427 | "loss_fn = F.cross_entropy\n",
428 | "\n",
429 | "for epoch in range(10):\n",
430 | " train_loss, train_acc = train_model(model, train_iter, epoch)\n",
431 | " val_loss, val_acc = eval_model(model, valid_iter)\n",
432 | " \n",
433 | " print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%')\n",
434 | " \n",
435 | "test_loss, test_acc = eval_model(model, test_iter)\n",
436 | "print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')"
437 | ]
438 | },
439 | {
440 | "cell_type": "code",
441 | "execution_count": 7,
442 | "metadata": {},
443 | "outputs": [
444 | {
445 | "name": "stdout",
446 | "output_type": "stream",
447 | "text": [
448 | "tensor([[5.6929e-06, 9.9999e-01]], device='cuda:0', grad_fn=)\n",
449 | "Sentiment: Positive\n"
450 | ]
451 | },
452 | {
453 | "name": "stderr",
454 | "output_type": "stream",
455 | "text": [
456 | "c:\\users\\justin\\venv\\lib\\site-packages\\ipykernel_launcher.py:8: UserWarning: volatile was removed and now has no effect. Use `with torch.no_grad():` instead.\n",
457 | " \n"
458 | ]
459 | }
460 | ],
461 | "source": [
462 | "test_sen1 = \"This is one of the best creation of Nolan. I can say, it's his magnum opus. Loved the soundtrack and especially those creative dialogues.\"\n",
463 | "\n",
464 | "test_sen1 = TEXT.preprocess(test_sen1)\n",
465 | "test_sen1 = [[TEXT.vocab.stoi[x] for x in test_sen1]]\n",
466 | "\n",
467 | "test_sen = np.asarray(test_sen1)\n",
468 | "test_sen = torch.LongTensor(test_sen)\n",
469 | "test_tensor = Variable(test_sen, volatile=True)\n",
470 | "test_tensor = test_tensor.cuda()\n",
471 | "model.eval()\n",
472 | "output = model(test_tensor, 1)\n",
473 | "out = F.softmax(output, 1)\n",
474 | "print(out)\n",
475 | "if (torch.argmax(out[0]) == 1):\n",
476 | " print (\"Sentiment: Positive\")\n",
477 | "else:\n",
478 | " print (\"Sentiment: Negative\")"
479 | ]
480 | },
481 | {
482 | "cell_type": "code",
483 | "execution_count": 8,
484 | "metadata": {},
485 | "outputs": [
486 | {
487 | "name": "stdout",
488 | "output_type": "stream",
489 | "text": [
490 | "tensor([[0.9989, 0.0011]], device='cuda:0', grad_fn=)\n",
491 | "Sentiment: Negative\n"
492 | ]
493 | },
494 | {
495 | "name": "stderr",
496 | "output_type": "stream",
497 | "text": [
498 | "c:\\users\\justin\\venv\\lib\\site-packages\\ipykernel_launcher.py:8: UserWarning: volatile was removed and now has no effect. Use `with torch.no_grad():` instead.\n",
499 | " \n"
500 | ]
501 | }
502 | ],
503 | "source": [
504 | "test_sen2 = \"Ohh, such a ridiculous movie. Not gonna recommend it to anyone. Complete waste of time and money.\"\n",
505 | "test_sen2 = TEXT.preprocess(test_sen2)\n",
506 | "test_sen2 = [[TEXT.vocab.stoi[x] for x in test_sen2]]\n",
507 | "\n",
508 | "\n",
509 | "test_sen = np.asarray(test_sen2)\n",
510 | "test_sen = torch.LongTensor(test_sen)\n",
511 | "test_tensor = Variable(test_sen, volatile=True)\n",
512 | "test_tensor = test_tensor.cuda()\n",
513 | "model.eval()\n",
514 | "output = model(test_tensor, 1)\n",
515 | "out = F.softmax(output, 1)\n",
516 | "print(out)\n",
517 | "if (torch.argmax(out[0]) == 1):\n",
518 | " print (\"Sentiment: Positive\")\n",
519 | "else:\n",
520 | " print (\"Sentiment: Negative\")"
521 | ]
522 | },
523 | {
524 | "cell_type": "code",
525 | "execution_count": 9,
526 | "metadata": {},
527 | "outputs": [],
528 | "source": [
529 | "class AttentionModel(torch.nn.Module):\n",
530 | " def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights):\n",
531 | " super(AttentionModel, self).__init__()\n",
532 | " self.batch_size = batch_size\n",
533 | " self.output_size = output_size\n",
534 | " self.hidden_size = hidden_size\n",
535 | "\n",
536 | " self.vocab_size = vocab_size\n",
537 | " self.embedding_length = embedding_length\n",
538 | " \n",
539 | " self.word_embeddings = nn.Embedding(vocab_size, embedding_length)\n",
540 | " self.word_embeddings.weights = nn.Parameter(weights, requires_grad=False)\n",
541 | " self.lstm = nn.LSTM(embedding_length, hidden_size)\n",
542 | " self.label = nn.Linear(hidden_size, output_size)\n",
543 | " \n",
544 | " def attention_net(self, lstm_output, final_state):\n",
545 | " hidden = final_state.squeeze(0)\n",
546 | "\n",
547 | " attn_weights = torch.bmm(lstm_output, hidden.unsqueeze(2)).squeeze(2)\n",
548 | " soft_attn_weights = F.softmax(attn_weights, 1)\n",
549 | " new_hidden_state = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)\n",
550 | " \n",
551 | " return new_hidden_state\n",
552 | " \n",
553 | " def forward(self, input_sentences, batch_size=None):\n",
554 | " input = self.word_embeddings(input_sentences)\n",
555 | "\n",
556 | " input = input.permute(1, 0, 2)\n",
557 | " if batch_size is None:\n",
558 | " h_0 = Variable(torch.zeros(1, self.batch_size, self.hidden_size).cuda())\n",
559 | " c_0 = Variable(torch.zeros(1, self.batch_size, self.hidden_size).cuda())\n",
560 | " else:\n",
561 | " h_0 = Variable(torch.zeros(1, batch_size, self.hidden_size).cuda())\n",
562 | " c_0 = Variable(torch.zeros(1, batch_size, self.hidden_size).cuda())\n",
563 | "\n",
564 | " output, (final_hidden_state, final_cell_state) = self.lstm(input, (h_0, c_0))\n",
565 | " output = output.permute(1, 0, 2)\n",
566 | " \n",
567 | " attn_output = self.attention_net(output, final_hidden_state)\n",
568 | "\n",
569 | " logits = self.label(attn_output)\n",
570 | " \n",
571 | " return logits"
572 | ]
573 | },
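{
"cell_type": "markdown",
"metadata": {},
"source": [
"A shape walk-through of `attention_net` above with toy tensors: the batched dot product between every LSTM output step and the final hidden state gives one score per time step, and the softmax-weighted sum of the outputs is the context vector handed to the classifier."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import torch.nn.functional as F\n",
"\n",
"batch, seq_len, hid = 32, 200, 256\n",
"lstm_output = torch.randn(batch, seq_len, hid)  # output after permute(1, 0, 2)\n",
"hidden = torch.randn(batch, hid)                # final_hidden_state.squeeze(0)\n",
"\n",
"attn_weights = torch.bmm(lstm_output, hidden.unsqueeze(2)).squeeze(2)\n",
"print(attn_weights.shape)   # torch.Size([32, 200]): one score per time step\n",
"soft = F.softmax(attn_weights, 1)\n",
"context = torch.bmm(lstm_output.transpose(1, 2), soft.unsqueeze(2)).squeeze(2)\n",
"print(context.shape)        # torch.Size([32, 256]): the new_hidden_state fed to the classifier"
]
},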
574 | {
575 | "cell_type": "code",
576 | "execution_count": 10,
577 | "metadata": {},
578 | "outputs": [
579 | {
580 | "name": "stdout",
581 | "output_type": "stream",
582 | "text": [
583 | "Epoch: 1, Idx: 100, Training Loss: 0.7107, Training Accuracy: 46.88%\n",
584 | "Epoch: 1, Idx: 200, Training Loss: 0.6477, Training Accuracy: 65.62%\n",
585 | "Epoch: 1, Idx: 300, Training Loss: 0.6709, Training Accuracy: 59.38%\n",
586 | "Epoch: 1, Idx: 400, Training Loss: 0.6348, Training Accuracy: 71.88%\n",
587 | "Epoch: 1, Idx: 500, Training Loss: 0.6457, Training Accuracy: 71.88%\n",
588 | "Epoch: 01, Train Loss: 0.670, Train Acc: 61.15%, Val. Loss: 0.622058, Val. Acc: 69.28%\n",
589 | "Epoch: 2, Idx: 100, Training Loss: 0.5689, Training Accuracy: 81.25%\n",
590 | "Epoch: 2, Idx: 200, Training Loss: 0.7160, Training Accuracy: 59.38%\n",
591 | "Epoch: 2, Idx: 300, Training Loss: 0.5420, Training Accuracy: 78.12%\n",
592 | "Epoch: 2, Idx: 400, Training Loss: 0.4943, Training Accuracy: 78.12%\n",
593 | "Epoch: 2, Idx: 500, Training Loss: 0.4309, Training Accuracy: 87.50%\n",
594 | "Epoch: 02, Train Loss: 0.517, Train Acc: 76.50%, Val. Loss: 0.500942, Val. Acc: 76.73%\n",
595 | "Epoch: 3, Idx: 100, Training Loss: 0.4180, Training Accuracy: 78.12%\n",
596 | "Epoch: 3, Idx: 200, Training Loss: 0.1869, Training Accuracy: 93.75%\n",
597 | "Epoch: 3, Idx: 300, Training Loss: 0.4827, Training Accuracy: 75.00%\n",
598 | "Epoch: 3, Idx: 400, Training Loss: 0.3836, Training Accuracy: 84.38%\n",
599 | "Epoch: 3, Idx: 500, Training Loss: 0.6573, Training Accuracy: 68.75%\n",
600 | "Epoch: 03, Train Loss: 0.315, Train Acc: 87.20%, Val. Loss: 0.440898, Val. Acc: 79.07%\n",
601 | "Epoch: 4, Idx: 100, Training Loss: 0.1041, Training Accuracy: 96.88%\n",
602 | "Epoch: 4, Idx: 200, Training Loss: 0.0691, Training Accuracy: 100.00%\n",
603 | "Epoch: 4, Idx: 300, Training Loss: 0.0704, Training Accuracy: 96.88%\n",
604 | "Epoch: 4, Idx: 400, Training Loss: 0.1435, Training Accuracy: 93.75%\n",
605 | "Epoch: 4, Idx: 500, Training Loss: 0.1228, Training Accuracy: 96.88%\n",
606 | "Epoch: 04, Train Loss: 0.163, Train Acc: 93.80%, Val. Loss: 0.498487, Val. Acc: 81.04%\n",
607 | "Epoch: 5, Idx: 100, Training Loss: 0.0418, Training Accuracy: 96.88%\n",
608 | "Epoch: 5, Idx: 200, Training Loss: 0.0405, Training Accuracy: 96.88%\n",
609 | "Epoch: 5, Idx: 300, Training Loss: 0.1384, Training Accuracy: 90.62%\n",
610 | "Epoch: 5, Idx: 400, Training Loss: 0.2633, Training Accuracy: 90.62%\n",
611 | "Epoch: 5, Idx: 500, Training Loss: 0.0360, Training Accuracy: 100.00%\n",
612 | "Epoch: 05, Train Loss: 0.079, Train Acc: 97.15%, Val. Loss: 0.572422, Val. Acc: 81.11%\n",
613 | "Epoch: 6, Idx: 100, Training Loss: 0.0018, Training Accuracy: 100.00%\n",
614 | "Epoch: 6, Idx: 200, Training Loss: 0.0444, Training Accuracy: 96.88%\n",
615 | "Epoch: 6, Idx: 300, Training Loss: 0.1177, Training Accuracy: 96.88%\n",
616 | "Epoch: 6, Idx: 400, Training Loss: 0.1992, Training Accuracy: 96.88%\n",
617 | "Epoch: 6, Idx: 500, Training Loss: 0.0245, Training Accuracy: 100.00%\n",
618 | "Epoch: 06, Train Loss: 0.034, Train Acc: 98.77%, Val. Loss: 0.777436, Val. Acc: 81.54%\n",
619 | "Epoch: 7, Idx: 100, Training Loss: 0.0051, Training Accuracy: 100.00%\n",
620 | "Epoch: 7, Idx: 200, Training Loss: 0.0011, Training Accuracy: 100.00%\n",
621 | "Epoch: 7, Idx: 300, Training Loss: 0.0009, Training Accuracy: 100.00%\n",
622 | "Epoch: 7, Idx: 400, Training Loss: 0.0038, Training Accuracy: 100.00%\n",
623 | "Epoch: 7, Idx: 500, Training Loss: 0.0092, Training Accuracy: 100.00%\n",
624 | "Epoch: 07, Train Loss: 0.021, Train Acc: 99.27%, Val. Loss: 0.816149, Val. Acc: 81.73%\n",
625 | "Epoch: 8, Idx: 100, Training Loss: 0.0013, Training Accuracy: 100.00%\n",
626 | "Epoch: 8, Idx: 200, Training Loss: 0.0011, Training Accuracy: 100.00%\n",
627 | "Epoch: 8, Idx: 300, Training Loss: 0.0069, Training Accuracy: 100.00%\n",
628 | "Epoch: 8, Idx: 400, Training Loss: 0.0105, Training Accuracy: 100.00%\n",
629 | "Epoch: 8, Idx: 500, Training Loss: 0.0003, Training Accuracy: 100.00%\n",
630 | "Epoch: 08, Train Loss: 0.016, Train Acc: 99.34%, Val. Loss: 0.735093, Val. Acc: 80.49%\n",
631 | "Epoch: 9, Idx: 100, Training Loss: 0.0003, Training Accuracy: 100.00%\n",
632 | "Epoch: 9, Idx: 200, Training Loss: 0.0004, Training Accuracy: 100.00%\n",
633 | "Epoch: 9, Idx: 300, Training Loss: 0.0004, Training Accuracy: 100.00%\n",
634 | "Epoch: 9, Idx: 400, Training Loss: 0.0025, Training Accuracy: 100.00%\n",
635 | "Epoch: 9, Idx: 500, Training Loss: 0.0021, Training Accuracy: 100.00%\n",
636 | "Epoch: 09, Train Loss: 0.008, Train Acc: 99.58%, Val. Loss: 1.018735, Val. Acc: 81.88%\n",
637 | "Epoch: 10, Idx: 100, Training Loss: 0.0025, Training Accuracy: 100.00%\n",
638 | "Epoch: 10, Idx: 200, Training Loss: 0.0192, Training Accuracy: 100.00%\n",
639 | "Epoch: 10, Idx: 300, Training Loss: 0.0002, Training Accuracy: 100.00%\n",
640 | "Epoch: 10, Idx: 400, Training Loss: 0.0000, Training Accuracy: 100.00%\n",
641 | "Epoch: 10, Idx: 500, Training Loss: 0.0082, Training Accuracy: 100.00%\n",
642 | "Epoch: 10, Train Loss: 0.011, Train Acc: 99.55%, Val. Loss: 0.810989, Val. Acc: 82.54%\n",
643 | "Test Loss: 0.940, Test Acc: 80.09%\n"
644 | ]
645 | }
646 | ],
647 | "source": [
648 | "model = AttentionModel(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)\n",
649 | "loss_fn = F.cross_entropy\n",
650 | "\n",
651 | "for epoch in range(10):\n",
652 | " train_loss, train_acc = train_model(model, train_iter, epoch)\n",
653 | " val_loss, val_acc = eval_model(model, valid_iter)\n",
654 | " \n",
655 | " print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%')\n",
656 | " \n",
657 | "test_loss, test_acc = eval_model(model, test_iter)\n",
658 | "print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')"
659 | ]
660 | },
661 | {
662 | "cell_type": "code",
663 | "execution_count": 11,
664 | "metadata": {},
665 | "outputs": [
666 | {
667 | "name": "stdout",
668 | "output_type": "stream",
669 | "text": [
670 | "tensor([[3.2883e-07, 1.0000e+00]], device='cuda:0', grad_fn=)\n",
671 | "Sentiment: Positive\n"
672 | ]
673 | },
674 | {
675 | "name": "stderr",
676 | "output_type": "stream",
677 | "text": [
678 | "c:\\users\\justin\\venv\\lib\\site-packages\\ipykernel_launcher.py:8: UserWarning: volatile was removed and now has no effect. Use `with torch.no_grad():` instead.\n",
679 | " \n"
680 | ]
681 | }
682 | ],
683 | "source": [
684 | "test_sen1 = \"This is one of the best creation of Nolan. I can say, it's his magnum opus. Loved the soundtrack and especially those creative dialogues.\"\n",
685 | "\n",
686 | "test_sen1 = TEXT.preprocess(test_sen1)\n",
687 | "test_sen1 = [[TEXT.vocab.stoi[x] for x in test_sen1]]\n",
688 | "\n",
689 | "test_sen = np.asarray(test_sen1)\n",
690 | "test_sen = torch.LongTensor(test_sen)\n",
691 | "test_tensor = Variable(test_sen, volatile=True)\n",
692 | "test_tensor = test_tensor.cuda()\n",
693 | "model.eval()\n",
694 | "output = model(test_tensor, 1)\n",
695 | "out = F.softmax(output, 1)\n",
696 | "print(out)\n",
697 | "if (torch.argmax(out[0]) == 1):\n",
698 | " print (\"Sentiment: Positive\")\n",
699 | "else:\n",
700 | " print (\"Sentiment: Negative\")"
701 | ]
702 | },
703 | {
704 | "cell_type": "code",
705 | "execution_count": 12,
706 | "metadata": {},
707 | "outputs": [
708 | {
709 | "name": "stdout",
710 | "output_type": "stream",
711 | "text": [
712 | "tensor([[1.0000e+00, 1.0964e-06]], device='cuda:0', grad_fn=)\n",
713 | "Sentiment: Negative\n"
714 | ]
715 | },
716 | {
717 | "name": "stderr",
718 | "output_type": "stream",
719 | "text": [
720 | "c:\\users\\justin\\venv\\lib\\site-packages\\ipykernel_launcher.py:8: UserWarning: volatile was removed and now has no effect. Use `with torch.no_grad():` instead.\n",
721 | " \n"
722 | ]
723 | }
724 | ],
725 | "source": [
726 | "test_sen2 = \"Ohh, such a ridiculous movie. Not gonna recommend it to anyone. Complete waste of time and money.\"\n",
727 | "test_sen2 = TEXT.preprocess(test_sen2)\n",
728 | "test_sen2 = [[TEXT.vocab.stoi[x] for x in test_sen2]]\n",
729 | "\n",
730 | "\n",
731 | "test_sen = np.asarray(test_sen2)\n",
732 | "test_sen = torch.LongTensor(test_sen)\n",
733 | "test_tensor = Variable(test_sen, volatile=True)\n",
734 | "test_tensor = test_tensor.cuda()\n",
735 | "model.eval()\n",
736 | "output = model(test_tensor, 1)\n",
737 | "out = F.softmax(output, 1)\n",
738 | "print(out)\n",
739 | "if (torch.argmax(out[0]) == 1):\n",
740 | " print (\"Sentiment: Positive\")\n",
741 | "else:\n",
742 | " print (\"Sentiment: Negative\")"
743 | ]
744 | }
745 | ],
746 | "metadata": {
747 | "kernelspec": {
748 | "display_name": "Python 3",
749 | "language": "python",
750 | "name": "python3"
751 | },
752 | "language_info": {
753 | "codemirror_mode": {
754 | "name": "ipython",
755 | "version": 3
756 | },
757 | "file_extension": ".py",
758 | "mimetype": "text/x-python",
759 | "name": "python",
760 | "nbconvert_exporter": "python",
761 | "pygments_lexer": "ipython3",
762 | "version": "3.6.8"
763 | }
764 | },
765 | "nbformat": 4,
766 | "nbformat_minor": 4
767 | }
768 |
--------------------------------------------------------------------------------
/4_NMT.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import torch\n",
10 | "import torch.nn as nn\n",
11 | "import torch.optim as optim\n",
12 | "\n",
13 | "from torchtext.datasets import TranslationDataset, Multi30k\n",
14 | "from torchtext.data import Field, BucketIterator\n",
15 | "\n",
16 | "import spacy\n",
17 | "import numpy as np\n",
18 | "\n",
19 | "import random\n",
20 | "import math\n",
21 | "import time"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 2,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "SEED = 1234\n",
31 | "\n",
32 | "random.seed(SEED)\n",
33 | "np.random.seed(SEED)\n",
34 | "torch.manual_seed(SEED)\n",
35 | "torch.cuda.manual_seed(SEED)\n",
36 | "torch.backends.cudnn.deterministic = True"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 3,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "spacy_en = spacy.load('en_core_web_sm')\n",
46 | "spacy_de = spacy.load('de_core_news_sm')"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 4,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "def tokenize_de(text):\n",
56 | " return [tok.text for tok in spacy_de.tokenizer(text)][::-1]\n",
57 | "\n",
58 | "def tokenize_en(text):\n",
59 | " return [tok.text for tok in spacy_en.tokenizer(text)]"
60 | ]
61 | },
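{
"cell_type": "markdown",
"metadata": {},
"source": [
"`tokenize_de` reverses the source tokens. Feeding the source sentence in reverse is the trick from Sutskever et al. (2014): it shortens the distance between the beginning of the source and the beginning of the target, which makes optimization easier for a plain seq2seq LSTM. A quick sanity check (assumes the spacy models loaded above):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(tokenize_de('zwei junge männer'))  # expected: ['männer', 'junge', 'zwei']\n",
"print(tokenize_en('two young men'))      # expected: ['two', 'young', 'men']"
]
},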
62 | {
63 | "cell_type": "code",
64 | "execution_count": 5,
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "SRC = Field(tokenize = tokenize_en, \n",
69 | " init_token = '', \n",
70 | " eos_token = '', \n",
71 | " lower = True)\n",
72 | "\n",
73 | "TRG = Field(tokenize = tokenize_de, \n",
74 | " init_token = '', \n",
75 | " eos_token = '', \n",
76 | " lower = True)"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 6,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), \n",
86 | " fields = (SRC, TRG))"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 7,
92 | "metadata": {},
93 | "outputs": [
94 | {
95 | "name": "stdout",
96 | "output_type": "stream",
97 | "text": [
98 | "Number of training examples: 29000\n",
99 | "Number of validation examples: 1014\n",
100 | "Number of testing examples: 1000\n"
101 | ]
102 | }
103 | ],
104 | "source": [
105 | "print(f\"Number of training examples: {len(train_data.examples)}\")\n",
106 | "print(f\"Number of validation examples: {len(valid_data.examples)}\")\n",
107 | "print(f\"Number of testing examples: {len(test_data.examples)}\")"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": 8,
113 | "metadata": {},
114 | "outputs": [
115 | {
116 | "name": "stdout",
117 | "output_type": "stream",
118 | "text": [
119 | "{'src': ['zwei', 'junge', 'weiße', 'männer', 'sind', 'i', 'm', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.'], 'trg': ['.', 'bushes', 'many', 'near', 'outside', 'are', 'males', 'white', ',', 'young', 'two']}\n"
120 | ]
121 | }
122 | ],
123 | "source": [
124 | "print(vars(train_data.examples[0]))"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": 9,
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "SRC.build_vocab(train_data, min_freq = 2)\n",
134 | "TRG.build_vocab(train_data, min_freq = 2)"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": 10,
140 | "metadata": {},
141 | "outputs": [
142 | {
143 | "name": "stdout",
144 | "output_type": "stream",
145 | "text": [
146 | "Unique tokens in source (de) vocabulary: 7873\n",
147 | "Unique tokens in target (en) vocabulary: 5923\n"
148 | ]
149 | }
150 | ],
151 | "source": [
152 | "print(f\"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}\")\n",
153 | "print(f\"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}\")"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": 11,
159 | "metadata": {},
160 | "outputs": [],
161 | "source": [
162 | "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": 12,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": [
171 | "BATCH_SIZE = 128\n",
172 | "\n",
173 | "train_iterator, valid_iterator, test_iterator = BucketIterator.splits(\n",
174 | " (train_data, valid_data, test_data), \n",
175 | " batch_size = BATCH_SIZE, \n",
176 | " device = device)"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 13,
182 | "metadata": {},
183 | "outputs": [],
184 | "source": [
185 | "class Encoder(nn.Module):\n",
186 | " def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):\n",
187 | " super().__init__()\n",
188 | " \n",
189 | " self.hid_dim = hid_dim\n",
190 | " self.n_layers = n_layers\n",
191 | " self.embedding = nn.Embedding(input_dim, emb_dim)\n",
192 | " self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)\n",
193 | " self.dropout = nn.Dropout(dropout)\n",
194 | " \n",
195 | " def forward(self, src):\n",
196 | " embedded = self.dropout(self.embedding(src))\n",
197 | " outputs, (hidden, cell) = self.rnn(embedded)\n",
198 | " return hidden, cell"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": 14,
204 | "metadata": {},
205 | "outputs": [],
206 | "source": [
207 | "class Decoder(nn.Module):\n",
208 | " def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):\n",
209 | " super().__init__()\n",
210 | " \n",
211 | " self.output_dim = output_dim\n",
212 | " self.hid_dim = hid_dim\n",
213 | " self.n_layers = n_layers\n",
214 | " self.embedding = nn.Embedding(output_dim, emb_dim)\n",
215 | " self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)\n",
216 | " self.fc_out = nn.Linear(hid_dim, output_dim)\n",
217 | " self.dropout = nn.Dropout(dropout)\n",
218 | " \n",
219 | " def forward(self, input, hidden, cell): \n",
220 | " input = input.unsqueeze(0)\n",
221 | " embedded = self.dropout(self.embedding(input))\n",
222 | " output, (hidden, cell) = self.rnn(embedded, (hidden, cell))\n",
223 | " prediction = self.fc_out(output.squeeze(0))\n",
224 | " return prediction, hidden, cell"
225 | ]
226 | },
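{
"cell_type": "markdown",
"metadata": {},
"source": [
"A standalone shape check for the `Encoder`/`Decoder` pair above, using toy sizes: the encoder compresses the whole source into `(n_layers, batch, hid_dim)` hidden and cell states, and the decoder consumes one target token per call, returning a distribution over the target vocabulary plus updated states."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"\n",
"enc = Encoder(input_dim=100, emb_dim=32, hid_dim=64, n_layers=2, dropout=0.5)\n",
"dec = Decoder(output_dim=120, emb_dim=32, hid_dim=64, n_layers=2, dropout=0.5)\n",
"\n",
"src = torch.randint(0, 100, (7, 4))   # (src_len, batch)\n",
"hidden, cell = enc(src)\n",
"print(hidden.shape, cell.shape)       # torch.Size([2, 4, 64]) each: (n_layers, batch, hid_dim)\n",
"\n",
"tok = torch.randint(0, 120, (4,))     # one target token per example\n",
"pred, hidden, cell = dec(tok, hidden, cell)\n",
"print(pred.shape)                     # torch.Size([4, 120]): (batch, output_dim)"
]
},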
227 | {
228 | "cell_type": "code",
229 | "execution_count": 15,
230 | "metadata": {},
231 | "outputs": [],
232 | "source": [
233 | "class Seq2Seq(nn.Module):\n",
234 | " def __init__(self, encoder, decoder, device):\n",
235 | " super().__init__()\n",
236 | " \n",
237 | " self.encoder = encoder\n",
238 | " self.decoder = decoder\n",
239 | " self.device = device\n",
240 | " \n",
241 | " assert encoder.hid_dim == decoder.hid_dim, \\\n",
242 | " \"Hidden dimensions of encoder and decoder must be equal!\"\n",
243 | " assert encoder.n_layers == decoder.n_layers, \\\n",
244 | " \"Encoder and decoder must have equal number of layers!\"\n",
245 | " \n",
246 | " def forward(self, src, trg, teacher_forcing_ratio = 0.5):\n",
247 | " \n",
248 | " batch_size = trg.shape[1]\n",
249 | " trg_len = trg.shape[0]\n",
250 | " trg_vocab_size = self.decoder.output_dim\n",
251 | " outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)\n",
252 | " hidden, cell = self.encoder(src)\n",
253 | " input = trg[0,:]\n",
254 | " \n",
255 | " for t in range(1, trg_len):\n",
256 | " output, hidden, cell = self.decoder(input, hidden, cell)\n",
257 | " outputs[t] = output\n",
258 | " teacher_force = random.random() < teacher_forcing_ratio\n",
259 | " top1 = output.argmax(1) \n",
260 | " input = trg[t] if teacher_force else top1\n",
261 | " return outputs"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": 16,
267 | "metadata": {},
268 | "outputs": [],
269 | "source": [
270 | "\n",
271 | "INPUT_DIM = len(SRC.vocab)\n",
272 | "OUTPUT_DIM = len(TRG.vocab)\n",
273 | "ENC_EMB_DIM = 256\n",
274 | "DEC_EMB_DIM = 256\n",
275 | "HID_DIM = 512\n",
276 | "N_LAYERS = 2\n",
277 | "ENC_DROPOUT = 0.5\n",
278 | "DEC_DROPOUT = 0.5\n",
279 | "\n",
280 | "enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)\n",
281 | "dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)\n",
282 | "\n",
283 | "model = Seq2Seq(enc, dec, device).to(device)"
284 | ]
285 | },
286 | {
287 | "cell_type": "code",
288 | "execution_count": 17,
289 | "metadata": {},
290 | "outputs": [
291 | {
292 | "data": {
293 | "text/plain": [
294 | "Seq2Seq(\n",
295 | " (encoder): Encoder(\n",
296 | " (embedding): Embedding(7873, 256)\n",
297 | " (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)\n",
298 | " (dropout): Dropout(p=0.5, inplace=False)\n",
299 | " )\n",
300 | " (decoder): Decoder(\n",
301 | " (embedding): Embedding(5923, 256)\n",
302 | " (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)\n",
303 | " (fc_out): Linear(in_features=512, out_features=5923, bias=True)\n",
304 | " (dropout): Dropout(p=0.5, inplace=False)\n",
305 | " )\n",
306 | ")"
307 | ]
308 | },
309 | "execution_count": 17,
310 | "metadata": {},
311 | "output_type": "execute_result"
312 | }
313 | ],
314 | "source": [
315 | "def init_weights(m):\n",
316 | " for name, param in m.named_parameters():\n",
317 | " nn.init.uniform_(param.data, -0.08, 0.08)\n",
318 | " \n",
319 | "model.apply(init_weights)"
320 | ]
321 | },
322 | {
323 | "cell_type": "code",
324 | "execution_count": 18,
325 | "metadata": {},
326 | "outputs": [
327 | {
328 | "name": "stdout",
329 | "output_type": "stream",
330 | "text": [
331 | "The model has 13,926,691 trainable parameters\n"
332 | ]
333 | }
334 | ],
335 | "source": [
336 | "def count_parameters(model):\n",
337 | " return sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
338 | "\n",
339 | "print(f'The model has {count_parameters(model):,} trainable parameters')"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": 19,
345 | "metadata": {},
346 | "outputs": [],
347 | "source": [
348 | "optimizer = optim.Adam(model.parameters())"
349 | ]
350 | },
351 | {
352 | "cell_type": "code",
353 | "execution_count": 20,
354 | "metadata": {},
355 | "outputs": [],
356 | "source": [
357 | "TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]\n",
358 | "criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)"
359 | ]
360 | },
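{
"cell_type": "markdown",
"metadata": {},
"source": [
"`ignore_index=TRG_PAD_IDX` makes the loss skip every position whose target is the padding token, so padding sentences to a common length inside a batch does not dilute the loss. A toy illustration (index 0 stands in for `<pad>` here):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import torch.nn as nn\n",
"\n",
"logits = torch.randn(5, 10)             # 5 positions, vocabulary of 10\n",
"target = torch.tensor([3, 1, 0, 0, 7])  # pretend index 0 is <pad>\n",
"\n",
"plain = nn.CrossEntropyLoss()\n",
"masked = nn.CrossEntropyLoss(ignore_index=0)\n",
"print(plain(logits, target))   # averages over all 5 positions\n",
"print(masked(logits, target))  # averages over the 3 non-pad positions only"
]
},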
361 | {
362 | "cell_type": "code",
363 | "execution_count": 21,
364 | "metadata": {},
365 | "outputs": [],
366 | "source": [
367 | "def train(model, iterator, optimizer, criterion, clip):\n",
368 | " \n",
369 | " model.train()\n",
370 | " epoch_loss = 0\n",
371 | " for i, batch in enumerate(iterator):\n",
372 | " src = batch.src\n",
373 | " trg = batch.trg\n",
374 | " optimizer.zero_grad()\n",
375 | " output = model(src, trg)\n",
376 | " output_dim = output.shape[-1]\n",
377 | " output = output[1:].view(-1, output_dim)\n",
378 | " trg = trg[1:].view(-1)\n",
379 | " loss = criterion(output, trg)\n",
380 | " loss.backward()\n",
381 | " torch.nn.utils.clip_grad_norm_(model.parameters(), clip)\n",
382 | " optimizer.step()\n",
383 | " epoch_loss += loss.item()\n",
384 | " return epoch_loss / len(iterator)"
385 | ]
386 | },
387 | {
388 | "cell_type": "code",
389 | "execution_count": 22,
390 | "metadata": {},
391 | "outputs": [],
392 | "source": [
393 | "def evaluate(model, iterator, criterion):\n",
394 | " \n",
395 | " model.eval()\n",
396 | " epoch_loss = 0\n",
397 | " with torch.no_grad():\n",
398 | " for i, batch in enumerate(iterator):\n",
399 | " src = batch.src\n",
400 | " trg = batch.trg\n",
401 | " output = model(src, trg, 0)\n",
402 | " output_dim = output.shape[-1]\n",
403 | " output = output[1:].view(-1, output_dim)\n",
404 | " trg = trg[1:].view(-1)\n",
405 | " loss = criterion(output, trg)\n",
406 | " epoch_loss += loss.item()\n",
407 | " return epoch_loss / len(iterator)"
408 | ]
409 | },
410 | {
411 | "cell_type": "code",
412 | "execution_count": 23,
413 | "metadata": {},
414 | "outputs": [],
415 | "source": [
416 | "def epoch_time(start_time, end_time):\n",
417 | " elapsed_time = end_time - start_time\n",
418 | " elapsed_mins = int(elapsed_time / 60)\n",
419 | " elapsed_secs = int(elapsed_time - (elapsed_mins * 60))\n",
420 | " return elapsed_mins, elapsed_secs"
421 | ]
422 | },
423 | {
424 | "cell_type": "code",
425 | "execution_count": 24,
426 | "metadata": {},
427 | "outputs": [
428 | {
429 | "name": "stdout",
430 | "output_type": "stream",
431 | "text": [
432 | "Epoch: 01 | Time: 0m 34s\n",
433 | "\tTrain Loss: 4.985 | Train PPL: 146.191\n",
434 | "\t Val. Loss: 4.928 | Val. PPL: 138.108\n",
435 | "Epoch: 02 | Time: 0m 34s\n",
436 | "\tTrain Loss: 4.462 | Train PPL: 86.666\n",
437 | "\t Val. Loss: 4.883 | Val. PPL: 131.987\n",
438 | "Epoch: 03 | Time: 0m 34s\n",
439 | "\tTrain Loss: 4.200 | Train PPL: 66.677\n",
440 | "\t Val. Loss: 4.602 | Val. PPL: 99.726\n",
441 | "Epoch: 04 | Time: 0m 34s\n",
442 | "\tTrain Loss: 3.999 | Train PPL: 54.560\n",
443 | "\t Val. Loss: 4.467 | Val. PPL: 87.056\n",
444 | "Epoch: 05 | Time: 0m 34s\n",
445 | "\tTrain Loss: 3.828 | Train PPL: 45.983\n",
446 | "\t Val. Loss: 4.386 | Val. PPL: 80.279\n",
447 | "Epoch: 06 | Time: 0m 34s\n",
448 | "\tTrain Loss: 3.653 | Train PPL: 38.600\n",
449 | "\t Val. Loss: 4.248 | Val. PPL: 69.934\n",
450 | "Epoch: 07 | Time: 0m 34s\n",
451 | "\tTrain Loss: 3.489 | Train PPL: 32.764\n",
452 | "\t Val. Loss: 4.083 | Val. PPL: 59.326\n",
453 | "Epoch: 08 | Time: 0m 34s\n",
454 | "\tTrain Loss: 3.339 | Train PPL: 28.182\n",
455 | "\t Val. Loss: 4.000 | Val. PPL: 54.601\n",
456 | "Epoch: 09 | Time: 0m 34s\n",
457 | "\tTrain Loss: 3.189 | Train PPL: 24.269\n",
458 | "\t Val. Loss: 3.956 | Val. PPL: 52.262\n",
459 | "Epoch: 10 | Time: 0m 34s\n",
460 | "\tTrain Loss: 3.056 | Train PPL: 21.245\n",
461 | "\t Val. Loss: 3.917 | Val. PPL: 50.249\n"
462 | ]
463 | }
464 | ],
465 | "source": [
466 | "N_EPOCHS = 10\n",
467 | "CLIP = 1\n",
468 | "\n",
469 | "best_valid_loss = float('inf')\n",
470 | "\n",
471 | "for epoch in range(N_EPOCHS):\n",
472 | " \n",
473 | " start_time = time.time()\n",
474 | " \n",
475 | " train_loss = train(model, train_iterator, optimizer, criterion, CLIP)\n",
476 | " valid_loss = evaluate(model, valid_iterator, criterion)\n",
477 | " \n",
478 | " end_time = time.time()\n",
479 | " \n",
480 | " epoch_mins, epoch_secs = epoch_time(start_time, end_time)\n",
481 | " \n",
482 | " if valid_loss < best_valid_loss:\n",
483 | " best_valid_loss = valid_loss\n",
484 | " torch.save(model.state_dict(), 'tut1-model.pt')\n",
485 | " \n",
486 | " print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')\n",
487 | " print(f'\\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')\n",
488 | " print(f'\\t Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f}')"
489 | ]
490 | },
491 | {
492 | "cell_type": "code",
493 | "execution_count": 25,
494 | "metadata": {},
495 | "outputs": [
496 | {
497 | "name": "stdout",
498 | "output_type": "stream",
499 | "text": [
500 | "| Test Loss: 4.011 | Test PPL: 55.177 |\n"
501 | ]
502 | }
503 | ],
504 | "source": [
505 | "model.load_state_dict(torch.load('tut1-model.pt'))\n",
506 | "\n",
507 | "test_loss = evaluate(model, test_iterator, criterion)\n",
508 | "\n",
509 | "print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')"
510 | ]
511 | },
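{
"cell_type": "markdown",
"metadata": {},
"source": [
"The test loss above scores teacher-forced predictions only; to actually translate, the decoder has to be run greedily from `<sos>` until it emits `<eos>`. A hedged sketch of such a loop (an illustrative helper, not part of the runs above):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def translate_sentence(sentence, src_field, trg_field, model, device, max_len=50):\n",
"    model.eval()\n",
"    tokens = [t.lower() for t in tokenize_de(sentence)]  # reversed German, as in training\n",
"    tokens = [src_field.init_token] + tokens + [src_field.eos_token]\n",
"    src = torch.LongTensor([src_field.vocab.stoi[t] for t in tokens]).unsqueeze(1).to(device)\n",
"    with torch.no_grad():\n",
"        hidden, cell = model.encoder(src)\n",
"    trg_idx = [trg_field.vocab.stoi[trg_field.init_token]]\n",
"    for _ in range(max_len):\n",
"        tok = torch.LongTensor([trg_idx[-1]]).to(device)\n",
"        with torch.no_grad():\n",
"            output, hidden, cell = model.decoder(tok, hidden, cell)\n",
"        trg_idx.append(output.argmax(1).item())\n",
"        if trg_idx[-1] == trg_field.vocab.stoi[trg_field.eos_token]:\n",
"            break\n",
"    return [trg_field.vocab.itos[i] for i in trg_idx[1:]]\n",
"\n",
"# print(translate_sentence('zwei junge männer sind im freien.', SRC, TRG, model, device))"
]
},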
512 | {
513 | "cell_type": "code",
514 | "execution_count": null,
515 | "metadata": {},
516 | "outputs": [],
517 | "source": []
518 | }
519 | ],
520 | "metadata": {
521 | "kernelspec": {
522 | "display_name": "Python 3",
523 | "language": "python",
524 | "name": "python3"
525 | },
526 | "language_info": {
527 | "codemirror_mode": {
528 | "name": "ipython",
529 | "version": 3
530 | },
531 | "file_extension": ".py",
532 | "mimetype": "text/x-python",
533 | "name": "python",
534 | "nbconvert_exporter": "python",
535 | "pygments_lexer": "ipython3",
536 | "version": "3.6.8"
537 | }
538 | },
539 | "nbformat": 4,
540 | "nbformat_minor": 4
541 | }
542 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # torch_nlp_basic
2 | - Basic concepts for understanding Natural Language Processing with PyTorch
3 | - Please contact me by e-mail
4 |
--------------------------------------------------------------------------------