"
85 | ]
86 | },
87 | "metadata": {
88 | "needs_background": "light"
89 | },
90 | "output_type": "display_data"
91 | }
92 | ],
93 | "source": [
94 | "#data = data.head(3)\n",
95 | "data['Log1pSalary'] = np.log1p(data['SalaryNormalized']).astype('float32')\n",
96 | "\n",
97 | "plt.figure(figsize=[8, 4])\n",
98 | "plt.subplot(1, 2, 1)\n",
99 | "\n",
100 | "plt.hist(data[\"SalaryNormalized\"], bins=20);\n",
101 | "\n",
102 | "plt.subplot(1, 2, 2)\n",
103 | "plt.hist(data['Log1pSalary'], bins=20);"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": 4,
109 | "metadata": {},
110 | "outputs": [
111 | {
112 | "data": {
113 | "text/plain": [
114 | "(244768,)"
115 | ]
116 | },
117 | "execution_count": 4,
118 | "metadata": {},
119 | "output_type": "execute_result"
120 | }
121 | ],
122 | "source": [
123 | "data['Log1pSalary'].shape"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": 5,
129 | "metadata": {},
130 | "outputs": [
131 | {
132 | "data": {
133 | "text/plain": [
134 | "200000"
135 | ]
136 | },
137 | "execution_count": 5,
138 | "metadata": {},
139 | "output_type": "execute_result"
140 | }
141 | ],
142 | "source": [
143 | "np.amax(data[\"SalaryNormalized\"])"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 6,
149 | "metadata": {},
150 | "outputs": [
151 | {
152 | "data": {
153 | "text/plain": [
154 | "5000"
155 | ]
156 | },
157 | "execution_count": 6,
158 | "metadata": {},
159 | "output_type": "execute_result"
160 | }
161 | ],
162 | "source": [
163 | "np.amin(data[\"SalaryNormalized\"])"
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": 7,
169 | "metadata": {},
170 | "outputs": [
171 | {
172 | "data": {
173 | "text/plain": [
174 | "12.206078"
175 | ]
176 | },
177 | "execution_count": 7,
178 | "metadata": {},
179 | "output_type": "execute_result"
180 | }
181 | ],
182 | "source": [
183 | "np.amax(data[\"Log1pSalary\"])"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": 8,
189 | "metadata": {},
190 | "outputs": [
191 | {
192 | "data": {
193 | "text/plain": [
194 | "8.517393"
195 | ]
196 | },
197 | "execution_count": 8,
198 | "metadata": {},
199 | "output_type": "execute_result"
200 | }
201 | ],
202 | "source": [
203 | "np.amin(data[\"Log1pSalary\"])"
204 | ]
205 | },
206 | {
207 | "cell_type": "markdown",
208 | "metadata": {},
209 | "source": [
210 | "Our task is to predict one number, __Log1pSalary__. (log(1 + 200000))\n",
211 | "\n",
212 | "Log1pSalary 예측이 임무\n",
213 | "\n",
214 | "Title : Job position \n",
215 | "FullDescription : 실제 할일 \n",
216 | "LocationRaw : 위치 (Detail) \n",
217 | "LocationNormalized : 위치 \n",
218 | "Contract Type : 계약 유형 \n",
219 | "Contract Time : 계약 시간 \n",
220 | "Company : 회사 \n",
221 | "Category : 범주 \n",
222 | "SalaryRaw : 급여 범위 및 추가 속성 \n",
223 | "SalaryNormalized : 급여 평균 \n",
224 | "SourceName : 출처 \n",
225 | "\n",
226 | "To do so, our model can access a number of features:\n",
227 | "* Free text: __`Title`__ and __`FullDescription`__\n",
228 | "* Categorical: __`Category`__, __`Company`__, __`LocationNormalized`__, __`ContractType`__, and __`ContractTime`__.\n",
229 | "\n",
230 | "dropna함수는 column내에 NaN값이 있으면 해당 내용은 필요없다 간주하고 삭제해버린다. \n",
231 | "\n",
232 | "fillna함수도 굉장히 유용한다 NaN을 특정 값으로 대체하는 기능을 한다. \n",
233 | "\n",
234 | "\n"
235 | ]
236 | },
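{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of the two pandas calls mentioned above, on a toy frame (the column name `x` is made up for illustration):\n",
"\n",
"```python\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"toy = pd.DataFrame({'x': [1.0, np.nan, 3.0]})\n",
"toy.dropna()       # drops row 1, the only row containing a NaN\n",
"toy.fillna('NaN')  # replaces the missing value with the string 'NaN', as the next code cell does\n",
"```"
]
},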
237 | {
238 | "cell_type": "code",
239 | "execution_count": 9,
240 | "metadata": {},
241 | "outputs": [
242 | {
243 | "data": {
244 | "text/html": [
245 | "\n",
246 | "\n",
259 | "
\n",
260 | " \n",
261 | " \n",
262 | " | \n",
263 | " Id | \n",
264 | " Title | \n",
265 | " FullDescription | \n",
266 | " LocationRaw | \n",
267 | " LocationNormalized | \n",
268 | " ContractType | \n",
269 | " ContractTime | \n",
270 | " Company | \n",
271 | " Category | \n",
272 | " SalaryRaw | \n",
273 | " SalaryNormalized | \n",
274 | " SourceName | \n",
275 | " Log1pSalary | \n",
276 | "
\n",
277 | " \n",
278 | " \n",
279 | " \n",
280 | " 182771 | \n",
281 | " 71614858 | \n",
282 | " Registered General Nurse (RGN) East London C... | \n",
283 | " Registered General Nurse/RN/RGNLocation: East ... | \n",
284 | " Chingford | \n",
285 | " Chingford | \n",
286 | " full_time | \n",
287 | " NaN | \n",
288 | " HC Recruitment Services | \n",
289 | " Healthcare & Nursing Jobs | \n",
290 | " 13.00 - 13.15/Hour | \n",
291 | " 25104 | \n",
292 | " staffnurse.com | \n",
293 | " 10.130822 | \n",
294 | "
\n",
295 | " \n",
296 | " 44035 | \n",
297 | " 68506863 | \n",
298 | " Buyer Menswear | \n",
299 | " Buyer Menswear : The Client This design led c... | \n",
300 | " North London London South East | \n",
301 | " North Lambeth | \n",
302 | " NaN | \n",
303 | " permanent | \n",
304 | " FASHION & RETAIL PERSONNEL LIMITED | \n",
305 | " Retail Jobs | \n",
306 | " 40000 - 45000 per annum | \n",
307 | " 42500 | \n",
308 | " retailchoice.com | \n",
309 | " 10.657283 | \n",
310 | "
\n",
311 | " \n",
312 | " 101855 | \n",
313 | " 69547809 | \n",
314 | " HGV 2 Moffett Driver | \n",
315 | " We have a breathtaking opportunity for a HGV C... | \n",
316 | " Southall | \n",
317 | " Southall | \n",
318 | " full_time | \n",
319 | " NaN | \n",
320 | " HR Go Recruitment | \n",
321 | " Logistics & Warehouse Jobs | \n",
322 | " 8.50 - 12.75 per hour | \n",
323 | " 20400 | \n",
324 | " Jobcentre Plus | \n",
325 | " 9.923339 | \n",
326 | "
\n",
327 | " \n",
328 | "
\n",
329 | "
"
330 | ],
331 | "text/plain": [
332 | " Id Title \\\n",
333 | "182771 71614858 Registered General Nurse (RGN) East London C... \n",
334 | "44035 68506863 Buyer Menswear \n",
335 | "101855 69547809 HGV 2 Moffett Driver \n",
336 | "\n",
337 | " FullDescription \\\n",
338 | "182771 Registered General Nurse/RN/RGNLocation: East ... \n",
339 | "44035 Buyer Menswear : The Client This design led c... \n",
340 | "101855 We have a breathtaking opportunity for a HGV C... \n",
341 | "\n",
342 | " LocationRaw LocationNormalized ContractType \\\n",
343 | "182771 Chingford Chingford full_time \n",
344 | "44035 North London London South East North Lambeth NaN \n",
345 | "101855 Southall Southall full_time \n",
346 | "\n",
347 | " ContractTime Company \\\n",
348 | "182771 NaN HC Recruitment Services \n",
349 | "44035 permanent FASHION & RETAIL PERSONNEL LIMITED \n",
350 | "101855 NaN HR Go Recruitment \n",
351 | "\n",
352 | " Category SalaryRaw SalaryNormalized \\\n",
353 | "182771 Healthcare & Nursing Jobs 13.00 - 13.15/Hour 25104 \n",
354 | "44035 Retail Jobs 40000 - 45000 per annum 42500 \n",
355 | "101855 Logistics & Warehouse Jobs 8.50 - 12.75 per hour 20400 \n",
356 | "\n",
357 | " SourceName Log1pSalary \n",
358 | "182771 staffnurse.com 10.130822 \n",
359 | "44035 retailchoice.com 10.657283 \n",
360 | "101855 Jobcentre Plus 9.923339 "
361 | ]
362 | },
363 | "execution_count": 9,
364 | "metadata": {},
365 | "output_type": "execute_result"
366 | }
367 | ],
368 | "source": [
369 | "text_columns = [\"Title\", \"FullDescription\"]\n",
370 | "categorical_columns = [\"Category\", \"Company\", \"LocationNormalized\", \"ContractType\", \"ContractTime\"]\n",
371 | "target_column = \"Log1pSalary\"\n",
372 | "\n",
373 | "data[categorical_columns] = data[categorical_columns].fillna('NaN') # cast missing values to string \"NaN\"\n",
374 | "\n",
375 | "data.sample(3)"
376 | ]
377 | },
378 | {
379 | "cell_type": "markdown",
380 | "metadata": {},
381 | "source": [
382 | "### Preprocessing text data\n",
383 | "\n",
384 | "Just like last week, applying NLP to a problem begins from tokenization: splitting raw text into sequences of tokens (words, punctuation(구두법), etc).\n",
385 | "\n",
386 | "__Your task__ is to lowercase and tokenize all texts under `Title` and `FullDescription` columns. Store the tokenized data as a __space-separated__ string of tokens for performance reasons.\n",
387 | "\n",
388 | "It's okay to use nltk tokenizers. Assertions were designed for WordPunctTokenizer, slight deviations are okay.\n",
389 | "\n",
390 | "\n",
391 | "regexp를 사용하여 텍스트를 영문자 및 비영 문자의 순서로 토큰화 \n",
392 | "\n",
393 | "\\w+|[^\\w\\s]+. \n",
394 | "\n",
395 | " from nltk.tokenize import WordPunctTokenizer \n",
396 | "
s = \"Good muffins cost $3.88\\nin New York. Please buy me\\ntwo of them.\\n\\nThanks.\" \n",
397 | "\n",
398 | "
WordPunctTokenizer().tokenize(s) \n",
399 | "
['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', \n",
400 | "
'.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] \n",
401 | "\n",
402 | "\n",
403 | "\n",
404 | "http://excelsior-cjh.tistory.com/63 "
405 | ]
406 | },
407 | {
408 | "cell_type": "code",
409 | "execution_count": 10,
410 | "metadata": {},
411 | "outputs": [
412 | {
413 | "name": "stdout",
414 | "output_type": "stream",
415 | "text": [
416 | "Raw text:\n",
417 | "2 Mathematical Modeller / Simulation Analyst / O...\n",
418 | "100002 A successful and high achieving specialist sch...\n",
419 | "200002 Web Designer HTML, CSS, JavaScript, Photoshop...\n",
420 | "Name: FullDescription, dtype: object\n"
421 | ]
422 | }
423 | ],
424 | "source": [
425 | "print(\"Raw text:\")\n",
426 | "print(data[\"FullDescription\"][2::100000])"
427 | ]
428 | },
429 | {
430 | "cell_type": "code",
431 | "execution_count": null,
432 | "metadata": {},
433 | "outputs": [
434 | {
435 | "name": "stderr",
436 | "output_type": "stream",
437 | "text": [
438 | "/Users/JunChangWook/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:7: SettingWithCopyWarning: \n",
439 | "A value is trying to be set on a copy of a slice from a DataFrame\n",
440 | "\n",
441 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
442 | " import sys\n"
443 | ]
444 | }
445 | ],
446 | "source": [
447 | "import nltk\n",
448 | "tokenizer = nltk.tokenize.WordPunctTokenizer()\n",
449 | "\n",
450 | "\n",
451 | "index = 0\n",
452 | "for item in data[\"FullDescription\"]:\n",
453 | " data[\"FullDescription\"][index] = tokenizer.tokenize(item)\n",
454 | " index = index + 1\n",
455 | " \n",
456 | "index = 0\n",
457 | "for item in data[\"Title\"]:\n",
458 | " data[\"Title\"][index] = tokenizer.tokenize(item)\n",
459 | " index = index + 1\n",
460 | "# see task above\n",
461 | "#"
462 | ]
463 | },
464 | {
465 | "cell_type": "markdown",
466 | "metadata": {},
467 | "source": [
468 | "Now we can assume that our text is a space-separated list of tokens:"
469 | ]
470 | },
471 | {
472 | "cell_type": "code",
473 | "execution_count": null,
474 | "metadata": {},
475 | "outputs": [],
476 | "source": [
477 | "print(\"Tokenized:\")\n",
478 | "print(data[\"FullDescription\"][2::100000])\n",
479 | "assert data[\"FullDescription\"][2][:50] == 'mathematical modeller / simulation analyst / opera'\n",
480 | "assert data[\"Title\"][54321] == 'international digital account manager ( german )'"
481 | ]
482 | },
483 | {
484 | "cell_type": "markdown",
485 | "metadata": {},
486 | "source": [
487 | "Not all words are equally useful. Some of them are typos or rare words that are only present a few times. \n",
488 | "모든 단어가 똑같이 유용하지 않다. 몇몇 단어는 오타 또는 희귀 단어 이다.\n",
489 | "\n",
490 | "Let's count how many times is each word present in the data so that we can build a \"white list\" of known words. \n",
491 | "단어 카운트를 기반으로 유용한 단어 리스트를 만든다. (white lists)"
492 | ]
493 | },
494 | {
495 | "cell_type": "code",
496 | "execution_count": null,
497 | "metadata": {},
498 | "outputs": [],
499 | "source": [
500 | "# Count how many times does each token occur in both \"Title\" and \"FullDescription\" in total\n",
501 | "# build a dictionary { token -> it's count }\n",
502 | "import collections\n",
503 | "\n",
504 | "dictionary = []\n",
505 | "\n",
506 | "for item in data[\"FullDescription\"]:\n",
507 | " dictionary.extend(item)\n",
508 | "\n",
509 | "for item in data[\"Title\"]:\n",
510 | " dictionary.extend(item)\n",
511 | "\n",
512 | "token_counts = collections.Counter(dictionary)\n",
513 | "#token_counts = \n",
514 | "\n",
515 | "# hint: you may or may not want to use collections.Counter"
516 | ]
517 | },
518 | {
519 | "cell_type": "code",
520 | "execution_count": null,
521 | "metadata": {},
522 | "outputs": [],
523 | "source": [
524 | "print(\"Total unique tokens :\", len(token_counts))\n",
525 | "print('\\n'.join(map(str, token_counts.most_common(n=5))))\n",
526 | "print('...')\n",
527 | "print('\\n'.join(map(str, token_counts.most_common()[-3:])))\n",
528 | "\n",
529 | "#assert token_counts.most_common(1)[0][1] in range(2600000, 2700000)\n",
530 | "#assert len(token_counts) in range(200000, 210000)\n",
531 | "print('Correct!')"
532 | ]
533 | },
534 | {
535 | "cell_type": "code",
536 | "execution_count": null,
537 | "metadata": {},
538 | "outputs": [],
539 | "source": [
540 | "# Let's see how many words are there for each count\n",
541 | "plt.hist(list(token_counts.values()), range=[0, 10**4], bins=50, log=True)\n",
542 | "plt.xlabel(\"Word counts\");"
543 | ]
544 | },
545 | {
546 | "cell_type": "markdown",
547 | "metadata": {},
548 | "source": [
549 | "Now filter tokens a list of all tokens that occur at least 10 times."
550 | ]
551 | },
552 | {
553 | "cell_type": "code",
554 | "execution_count": null,
555 | "metadata": {},
556 | "outputs": [],
557 | "source": [
558 | "min_count = 10\n",
559 | "temp_tokens = []\n",
560 | "\n",
561 | "#for k,v in token_counts.items():\n",
562 | " #if v > min_count:\n",
563 | " #temp_tokens.append(k)\n",
564 | "\n",
565 | "for k,v in token_counts.items():\n",
566 | " if v > min_count:\n",
567 | " temp_tokens.append(k)\n",
568 | "\n",
569 | "tokens = temp_tokens\n",
570 | "# tokens from token_counts keys that had at least min_count occurrences throughout the dataset\n",
571 | "# tokens = "
572 | ]
573 | },
574 | {
575 | "cell_type": "code",
576 | "execution_count": null,
577 | "metadata": {},
578 | "outputs": [],
579 | "source": [
580 | "# Add a special tokens for unknown and empty words\n",
581 | "UNK, PAD = \"UNK\", \"PAD\"\n",
582 | "tokens = [UNK, PAD] + sorted(tokens)\n",
583 | "print(\"Vocabulary size:\", len(tokens))\n",
584 | "\n",
585 | "assert type(tokens) == list\n",
586 | "assert len(tokens) in range(32000, 35000)\n",
587 | "assert 'me' in tokens\n",
588 | "assert UNK in tokens\n",
589 | "print(\"Correct!\")"
590 | ]
591 | },
592 | {
593 | "cell_type": "markdown",
594 | "metadata": {},
595 | "source": [
596 | "Build an inverse token index: a dictionary from token(string) to it's index in `tokens` (int)"
597 | ]
598 | },
599 | {
600 | "cell_type": "code",
601 | "execution_count": null,
602 | "metadata": {},
603 | "outputs": [],
604 | "source": [
605 | "#token_to_id = \n",
606 | "token_to_id = {word: idx for idx, word in enumerate(tokens)}"
607 | ]
608 | },
609 | {
610 | "cell_type": "code",
611 | "execution_count": null,
612 | "metadata": {},
613 | "outputs": [],
614 | "source": [
615 | "assert isinstance(token_to_id, dict)\n",
616 | "assert len(token_to_id) == len(tokens)\n",
617 | "for tok in tokens:\n",
618 | " assert tokens[token_to_id[tok]] == tok\n",
619 | "\n",
620 | "print(\"Correct!\")"
621 | ]
622 | },
623 | {
624 | "cell_type": "markdown",
625 | "metadata": {},
626 | "source": [
627 | "And finally, let's use the vocabulary you've built to map text lines into neural network-digestible matrices.\n",
628 | "행렬로 매핑하기 \n",
629 | "\n",
630 | ">>> a = list(map(str, range(10))) \n",
631 | ">>> a \n",
632 | "['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'] \n"
633 | ]
634 | },
635 | {
636 | "cell_type": "code",
637 | "execution_count": null,
638 | "metadata": {},
639 | "outputs": [],
640 | "source": [
641 | "UNK_IX, PAD_IX = map(token_to_id.get, [UNK, PAD])\n",
642 | "# 매트릭스 구조를 만들고 있다.\n",
643 | "def as_matrix(sequences, max_len=None):\n",
644 | " \"\"\" Convert a list of tokens into a matrix with padding \"\"\"\n",
645 | " # object , classinfo 같으면 참 아니면 거짓\n",
646 | " if isinstance(sequences[0], str):\n",
647 | " sequences = list(map(str.split, sequences))\n",
648 | " # 처음 한번은 양의 무한대와 비교하고 나머지는 시컨스 max_len와 비교 한다. \n",
649 | " max_len = min(max(map(len, sequences)), max_len or float('inf'))\n",
650 | " \n",
651 | " # 전체를 패드로 만들고\n",
652 | " matrix = np.full((len(sequences), max_len), np.int32(PAD_IX))\n",
653 | " # 사전에 있으면 그 인텍스를 아니면 UNK_IX를 넣어서 매트릭스를 구성하고 있다.\n",
654 | " for i,seq in enumerate(sequences):\n",
655 | " row_ix = [token_to_id.get(word, UNK_IX) for word in seq[:max_len]]\n",
656 | " matrix[i, :len(row_ix)] = row_ix\n",
657 | " print(matrix)\n",
658 | " return matrix"
659 | ]
660 | },
661 | {
662 | "cell_type": "code",
663 | "execution_count": null,
664 | "metadata": {},
665 | "outputs": [],
666 | "source": [
667 | "print(\"Lines:\")\n",
668 | "print('\\n'.join(data[\"Title\"][::100000].values), end='\\n\\n')\n",
669 | "print(\"Matrix:\")\n",
670 | "print(as_matrix(data[\"Title\"][::100000]))"
671 | ]
672 | },
673 | {
674 | "cell_type": "markdown",
675 | "metadata": {},
676 | "source": [
677 | "Now let's encode the categirical data we have.\n",
678 | "\n",
679 | "As usual, we shall use one-hot encoding for simplicity. Kudos if you implement more advanced encodings: tf-idf, pseudo-time-series, etc.\n",
680 | "\n",
681 | "one-hot 인코딩을 사용 Advanced encoding : tf-idf \n",
682 | "\n",
683 | ">>> list(zip([1, 2, 3], [4, 5, 6])) \n",
684 | "[(1, 4), (2, 5), (3, 6)] \n",
685 | "\n",
686 | "set 순서가 없는 딕셔너리 만들기 \n",
687 | " \n",
688 | "\n",
689 | "\n",
690 | "from sklearn.feature_extraction.text import CountVectorizer \n",
691 | "corpus = [ \n",
692 | " 'This is the first document.', \n",
693 | " 'This is the second second document.', \n",
694 | " 'And the third one.',\n",
695 | " 'Is this the first document?', \n",
696 | " 'The last document?', \n",
697 | "]\n",
698 | "vect = CountVectorizer() \n",
699 | "vect.fit(corpus) \n",
700 | "vect.vocabulary_ \n",
701 | " \n",
702 | "\n",
703 | "{'this': 9, \n",
704 | " 'is': 3, \n",
705 | " 'the': 7, \n",
706 | " 'first': 2, \n",
707 | " 'document': 1, \n",
708 | " 'second': 6, \n",
709 | " 'and': 0, \n",
710 | " 'third': 8, \n",
711 | " 'one': 5, \n",
712 | " 'last': 4} \n",
713 | " \n",
714 | " \n",
715 | " vect.transform(['This is the second document.']).toarray() \n",
716 | " \n",
717 | " array([[0, 1, 0, 1, 0, 0, 1, 1, 0, 1]]) \n",
718 | " \n",
719 | " vect.transform(['Something completely new.']).toarray() \n",
720 | " \n",
721 | " array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]) \n",
722 | " \n",
723 | " vect.transform(corpus).toarray() \n",
724 | " \n",
725 | " array([[0, 1, 1, 1, 0, 0, 0, 1, 0, 1], \n",
726 | " [0, 1, 0, 1, 0, 0, 2, 1, 0, 1], \n",
727 | " [1, 0, 0, 0, 0, 1, 0, 1, 1, 0], \n",
728 | " [0, 1, 1, 1, 0, 0, 0, 1, 0, 1], \n",
729 | " [0, 1, 0, 0, 1, 0, 0, 1, 0, 0]]) \n",
730 | " \n",
731 | " "
732 | ]
733 | },
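{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you want to try the tf-idf option mentioned above, here is a minimal sketch using sklearn's `TfidfVectorizer` on the `Title` column (the names `tfidf` and `title_tfidf` are just illustrative, and the parameters are up to you):\n",
"\n",
"```python\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"# tokens are already lowercased and space-separated, so split on whitespace\n",
"tfidf = TfidfVectorizer(max_features=10000, token_pattern=r'\\S+')\n",
"title_tfidf = tfidf.fit_transform(data['Title'])  # sparse [n_samples, n_features] matrix\n",
"```"
]
},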
734 | {
735 | "cell_type": "code",
736 | "execution_count": null,
737 | "metadata": {},
738 | "outputs": [],
739 | "source": [
740 | "from sklearn.feature_extraction import DictVectorizer\n",
741 | "\n",
742 | "# we only consider top-1k most frequent companies to minimize memory usage\n",
743 | "top_companies, top_counts = zip(*collections.Counter(data['Company']).most_common(1000)) # 동일한 위치 묶어준다.\n",
744 | "print(top_companies)\n",
745 | "recognized_companies = set(top_companies)\n",
746 | "print(recognized_companies)\n",
747 | "# top 1000개 이상은 Company를 표현하고 아닌 모든 것들은 Other로 처리 한다. 여기에 pandas apply 함수를 통해 수행한다.\n",
748 | "data[\"Company\"] = data[\"Company\"].apply(lambda comp: comp if comp in recognized_companies else \"Other\")\n",
749 | "\n",
750 | "categorical_vectorizer = DictVectorizer(dtype=np.float32, sparse=False)\n",
751 | "# dict를 y 축으로 묶는다. pandas apply 함수를 통해서 수행한다. \n",
752 | "categorical_vectorizer.fit(data[categorical_columns].apply(dict, axis=1))"
753 | ]
754 | },
755 | {
756 | "cell_type": "markdown",
757 | "metadata": {},
758 | "source": [
759 | "### The deep learning part\n",
760 | "\n",
761 | "Once we've learned to tokenize the data, let's design a machine learning experiment. (토큰을 배웠고 이제 기계학습 실험)\n",
762 | "\n",
763 | "As before, we won't focus too much on validation, opting for a simple train-test split. (학습 훈련 검증 셋 분할)\n",
764 | "\n",
765 | "__To be completely rigorous,__ we've comitted a small crime here: we used the whole data for tokenization and vocabulary building. A more strict way would be to do that part on training set only. You may want to do that and measure the magnitude of changes."
766 | ]
767 | },
768 | {
769 | "cell_type": "code",
770 | "execution_count": null,
771 | "metadata": {},
772 | "outputs": [],
773 | "source": [
774 | "from sklearn.model_selection import train_test_split\n",
775 | "\n",
776 | "data_train, data_val = train_test_split(data, test_size=0.2, random_state=42)\n",
777 | "data_train.index = range(len(data_train))\n",
778 | "data_val.index = range(len(data_val))\n",
779 | "\n",
780 | "print(\"Train size = \", len(data_train))\n",
781 | "print(\"Validation size = \", len(data_val))"
782 | ]
783 | },
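{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small sketch of the stricter variant mentioned above: rebuild the token counts and vocabulary from `data_train` only (the names `train_token_counts`, `train_tokens` and `train_token_to_id` are illustrative), then retrain and compare the metrics.\n",
"\n",
"```python\n",
"train_token_counts = collections.Counter()\n",
"for col in text_columns:\n",
"    for text in data_train[col]:\n",
"        train_token_counts.update(text.split())\n",
"\n",
"train_tokens = [UNK, PAD] + sorted(tok for tok, cnt in train_token_counts.items() if cnt >= min_count)\n",
"train_token_to_id = {tok: i for i, tok in enumerate(train_tokens)}\n",
"```"
]
},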
784 | {
785 | "cell_type": "code",
786 | "execution_count": null,
787 | "metadata": {},
788 | "outputs": [],
789 | "source": [
790 | "# 배치 구성하는 함수\n",
791 | "def make_batch(data, max_len=None, word_dropout=0):\n",
792 | " \"\"\"\n",
793 | " Creates a keras-friendly dict from the batch data. (케라스 친화적으로 만든다)\n",
794 | " :param word_dropout: replaces token index with UNK_IX with this probability (word_dropout 확률로 UNK_IX로 대체)\n",
795 | " :returns: a dict with {'title' : int64[batch, title_max_len] (배치 사이즈, 타이틀 최대 크기) 매트릭스 구성\n",
796 | " \"\"\"\n",
797 | " batch = {}\n",
798 | " batch[\"Title\"] = as_matrix(data[\"Title\"].values, max_len)\n",
799 | " batch[\"FullDescription\"] = as_matrix(data[\"FullDescription\"].values, max_len)\n",
800 | " batch['Categorical'] = categorical_vectorizer.transform(data[categorical_columns].apply(dict, axis=1))\n",
801 | " \n",
802 | " if word_dropout != 0:\n",
803 | " batch[\"FullDescription\"] = apply_word_dropout(batch[\"FullDescription\"], 1. - word_dropout)\n",
804 | " # target_column = \"Log1pSalary\"\n",
805 | " if target_column in data.columns:\n",
806 | " batch[target_column] = data[target_column].values\n",
807 | " \n",
808 | " return batch\n",
809 | "\n",
810 | "\n",
811 | "def apply_word_dropout(matrix, keep_prop, replace_with=UNK_IX, pad_ix=PAD_IX,):\n",
812 | " dropout_mask = np.random.choice(2, np.shape(matrix), p=[keep_prop, 1 - keep_prop])\n",
813 | " dropout_mask &= matrix != pad_ix\n",
814 | " # 변환 해준다. 모든 부분의 full_like \n",
815 | " return np.choose(dropout_mask, [matrix, np.full_like(matrix, replace_with)]) # matrix를 replace_with로 변경한다."
816 | ]
817 | },
818 | {
819 | "cell_type": "code",
820 | "execution_count": null,
821 | "metadata": {},
822 | "outputs": [],
823 | "source": [
824 | "make_batch(data_train[:3], max_len=10)"
825 | ]
826 | },
827 | {
828 | "cell_type": "markdown",
829 | "metadata": {},
830 | "source": [
831 | "#### Architecture\n",
832 | "\n",
833 | "Our basic model consists of three branches:\n",
834 | "* Title encoder\n",
835 | "* Description encoder\n",
836 | "* Categorical features encoder\n",
837 | "\n",
838 | "We will then feed all 3 branches into one common network that predicts salary. (급여 예측에 3개의 특성을 쓴다.)\n",
839 | "\n",
840 | "
"
841 | ]
842 | },
843 | {
844 | "cell_type": "markdown",
845 | "metadata": {},
846 | "source": [
847 | "This clearly doesn't fit into keras' __Sequential__ interface. To build such a network, one will have to use __[Keras Functional API](https://keras.io/models/model/)__.\n",
848 | "\n",
849 | "https://keras.io/layers/merge/"
850 | ]
851 | },
852 | {
853 | "cell_type": "code",
854 | "execution_count": null,
855 | "metadata": {},
856 | "outputs": [],
857 | "source": [
858 | "import keras\n",
859 | "#from keras.models import Sequential\n",
860 | "import keras.layers as L"
861 | ]
862 | },
863 | {
864 | "cell_type": "code",
865 | "execution_count": null,
866 | "metadata": {},
867 | "outputs": [],
868 | "source": [
869 | "def build_model(n_tokens=len(tokens), n_cat_features=len(categorical_vectorizer.vocabulary_), hid_size=64):\n",
870 | " \"\"\" Build a model that maps three data sources to a single linear output: predicted log1p(salary) \"\"\"\n",
871 | " \n",
872 | " l_title = L.Input(shape=[None], name=\"Title\")\n",
873 | " l_descr = L.Input(shape=[None], name=\"FullDescription\")\n",
874 | " l_categ = L.Input(shape=[n_cat_features], name=\"Categorical\")\n",
875 | " \n",
876 | " # Build your monster!\n",
877 | " \n",
878 | " x1 = keras.layers.Dense(8, activation='relu')(l_title)\n",
879 | " x2 = keras.layers.Dense(8, activation='relu')(l_descr)\n",
880 | " x3 = keras.layers.Dense(8, activation='relu')(l_categ)\n",
881 | " added = keras.layers.add([x1, x2, x3])\n",
882 | "\n",
883 | " # \n",
884 | " output_layer = keras.layers.Dense(1)(added)\n",
885 | " #output_layer = <...>\n",
886 | " # end of your code\n",
887 | " \n",
888 | " model = keras.models.Model(inputs=[l_title, l_descr, l_categ], outputs=[output_layer])\n",
889 | " model.compile('adam', 'mean_squared_error', metrics=['mean_absolute_error'])\n",
890 | " return model"
891 | ]
892 | },
893 | {
894 | "cell_type": "code",
895 | "execution_count": null,
896 | "metadata": {},
897 | "outputs": [],
898 | "source": [
899 | "model = build_model()\n",
900 | "model.summary() # 모델 요약\n",
901 | "\n",
902 | "dummy_pred = model.predict(make_batch(data_train[:100]))\n",
903 | "dummy_loss = model.train_on_batch(make_batch(data_train[:100]), data_train['Log1pSalary'][:100])[0]\n",
904 | "assert dummy_pred.shape == (100, 1)\n",
905 | "assert len(np.unique(dummy_pred)) > 20, \"model returns suspiciously few unique outputs. Check your initialization\"\n",
906 | "assert np.ndim(dummy_loss) == 0 and 0. <= dummy_loss <= 250., \"make sure you minimize MSE\""
907 | ]
908 | },
909 | {
910 | "cell_type": "markdown",
911 | "metadata": {},
912 | "source": [
913 | "#### Training and evaluation\n",
914 | "\n",
915 | "As usual, we gonna feed our monster with random minibatches of data. \n",
916 | "미니 배치 사용 \n",
917 | "\n",
918 | "As we train, we want to monitor not only loss function, which is computed in log-space, but also the actual error measured in dollars.\n",
919 | "\n",
920 | "로그 공간에서 계산 된 손실 함수뿐만 아니라 달러로 측정 한 실제 오차를 모니터링하려고 합니다. \n"
921 | ]
922 | },
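{
"cell_type": "markdown",
"metadata": {},
"source": [
"Since the model predicts `Log1pSalary`, the error in actual currency can be obtained by inverting the transform with `np.expm1`. A small sketch, to be run after training (`batch` and `target` are illustrative names for one validation minibatch):\n",
"\n",
"```python\n",
"batch, target = next(iterate_minibatches(data_val, batch_size=256, shuffle=False))\n",
"pred_log = model.predict(batch)[:, 0]\n",
"print(\"MAE on one batch:\", np.mean(np.abs(np.expm1(pred_log) - np.expm1(target))))\n",
"```"
]
},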
923 | {
924 | "cell_type": "code",
925 | "execution_count": null,
926 | "metadata": {},
927 | "outputs": [],
928 | "source": [
929 | "def iterate_minibatches(data, batch_size=256, shuffle=True, cycle=False, **kwargs):\n",
930 | " \"\"\" iterates minibatches of data in random order \"\"\"\n",
931 | " while True:\n",
932 | " indices = np.arange(len(data))\n",
933 | " if shuffle:\n",
934 | " indices = np.random.permutation(indices)\n",
935 | "\n",
936 | " for start in range(0, len(indices), batch_size):\n",
937 | " batch = make_batch(data.iloc[indices[start : start + batch_size]], **kwargs)\n",
938 | " target = batch.pop(target_column)\n",
939 | " yield batch, target\n",
940 | " \n",
941 | " if not cycle: break"
942 | ]
943 | },
944 | {
945 | "cell_type": "markdown",
946 | "metadata": {},
947 | "source": [
948 | "### Model training\n",
949 | "\n",
950 | "We can now fit our model the usual minibatch way. The interesting part is that we train on an infinite stream of minibatches, produced by `iterate_minibatches` function. (iterate_minibatches 함수는 무한 미니 버스 스트림 구성.)"
951 | ]
952 | },
953 | {
954 | "cell_type": "code",
955 | "execution_count": null,
956 | "metadata": {},
957 | "outputs": [],
958 | "source": [
959 | "batch_size = 256\n",
960 | "epochs = 10 # definitely too small\n",
961 | "steps_per_epoch = 100 # for full pass over data: (len(data_train) - 1) // batch_size + 1\n",
962 | "\n",
963 | "model = build_model()\n",
964 | "#배치 별로 모델 트레이닝\n",
965 | "model.fit_generator(iterate_minibatches(data_train, batch_size, cycle=True, word_dropout=0.05), \n",
966 | " epochs=epochs, steps_per_epoch=steps_per_epoch,\n",
967 | " \n",
968 | " validation_data=iterate_minibatches(data_val, batch_size, cycle=True),\n",
969 | " validation_steps=data_val.shape[0] // batch_size\n",
970 | " )"
971 | ]
972 | },
973 | {
974 | "cell_type": "code",
975 | "execution_count": null,
976 | "metadata": {},
977 | "outputs": [],
978 | "source": [
979 | "def print_metrics(model, data, batch_size=batch_size, name=\"\", **kw):\n",
980 | " squared_error = abs_error = num_samples = 0.0\n",
981 | " for batch_x, batch_y in iterate_minibatches(data, batch_size=batch_size, shuffle=False, **kw):\n",
982 | " batch_pred = model.predict(batch_x)[:, 0]\n",
983 | " squared_error += np.sum(np.square(batch_pred - batch_y))\n",
984 | " abs_error += np.sum(np.abs(batch_pred - batch_y))\n",
985 | " num_samples += len(batch_y)\n",
986 | " print(\"%s results:\" % (name or \"\"))\n",
987 | " print(\"Mean square error: %.5f\" % (squared_error / num_samples))\n",
988 | " print(\"Mean absolute error: %.5f\" % (abs_error / num_samples))\n",
989 | " return squared_error, abs_error\n",
990 | " \n",
991 | "print_metrics(model, data_train, name='Train')\n",
992 | "print_metrics(model, data_val, name='Val');"
993 | ]
994 | },
995 | {
996 | "cell_type": "markdown",
997 | "metadata": {},
998 | "source": [
999 | "### Bonus part: explaining model predictions\n",
1000 | "\n",
1001 | "It's usually a good idea to understand how your model works before you let it make actual decisions. It's simple for linear models: just see which words learned positive or negative weights. However, its much harder for neural networks that learn complex nonlinear dependencies.\n",
1002 | "선형 모델은 비선형 모델 보다 쉽다고 이야기 하고 있음\n",
1003 | "\n",
1004 | "There are, however, some ways to look inside the black box:\n",
1005 | "블랙 박스 들여다 보는 방법\n",
1006 | "* Seeing how model responds to input perturbations \n",
1007 | "입력에 대해서 모델이 어떻게 응답하는지 본다. \n",
1008 | "* Finding inputs that maximize/minimize activation of some chosen neurons (_read more [on distill.pub](https://distill.pub/2018/building-blocks/)_) \n",
1009 | "활성화된 뉴럴의 선택해 최대/최소 찾기\n",
1010 | "* Building local linear approximations to your neural network: [article](https://arxiv.org/abs/1602.04938), [eli5 library](https://github.com/TeamHG-Memex/eli5/tree/master/eli5/formatters) \n",
1011 | "신경망에 대한 로컬 선형 근사법 작성 \n",
1012 | "Today we gonna try the first method just because it's the simplest one."
1013 | ]
1014 | },
1015 | {
1016 | "cell_type": "code",
1017 | "execution_count": null,
1018 | "metadata": {},
1019 | "outputs": [],
1020 | "source": [
1021 | "def explain(model, sample, col_name='Title'):\n",
1022 | " \"\"\" Computes the effect each word had on model predictions \"\"\"\n",
1023 | " sample = dict(sample)\n",
1024 | " sample_col_tokens = [tokens[token_to_id.get(tok, 0)] for tok in sample[col_name].split()]\n",
1025 | " data_drop_one_token = pd.DataFrame([sample] * (len(sample_col_tokens) + 1))\n",
1026 | "\n",
1027 | " for drop_i in range(len(sample_col_tokens)):\n",
1028 | " data_drop_one_token.loc[drop_i, col_name] = ' '.join(UNK if i == drop_i else tok\n",
1029 | " for i, tok in enumerate(sample_col_tokens)) \n",
1030 | "\n",
1031 | " *predictions_drop_one_token, baseline_pred = model.predict(make_batch(data_drop_one_token))[:, 0]\n",
1032 | " diffs = baseline_pred - predictions_drop_one_token\n",
1033 | " return list(zip(sample_col_tokens, diffs))"
1034 | ]
1035 | },
1036 | {
1037 | "cell_type": "code",
1038 | "execution_count": null,
1039 | "metadata": {},
1040 | "outputs": [],
1041 | "source": [
1042 | "from IPython.display import HTML, display_html\n",
1043 | "\n",
1044 | "def draw_html(tokens_and_weights, cmap=plt.get_cmap(\"bwr\"), display=True,\n",
1045 | " token_template=\"\"\"{token}\"\"\",\n",
1046 | " font_style=\"font-size:14px;\"\n",
1047 | " ):\n",
1048 | " \n",
1049 | " def get_color_hex(weight):\n",
1050 | " rgba = cmap(1. / (1 + np.exp(weight)), bytes=True)\n",
1051 | " return '#%02X%02X%02X' % rgba[:3]\n",
1052 | " \n",
1053 | " tokens_html = [\n",
1054 | " token_template.format(token=token, color_hex=get_color_hex(weight))\n",
1055 | " for token, weight in tokens_and_weights\n",
1056 | " ]\n",
1057 | " \n",
1058 | " \n",
1059 | " raw_html = \"\"\"{}
\"\"\".format(font_style, ' '.join(tokens_html))\n",
1060 | " if display:\n",
1061 | " display_html(HTML(raw_html))\n",
1062 | " \n",
1063 | " return raw_html\n",
1064 | " "
1065 | ]
1066 | },
1067 | {
1068 | "cell_type": "code",
1069 | "execution_count": null,
1070 | "metadata": {},
1071 | "outputs": [],
1072 | "source": [
1073 | "i = 36605\n",
1074 | "tokens_and_weights = explain(model, data.loc[i], \"Title\")\n",
1075 | "draw_html([(tok, weight * 5) for tok, weight in tokens_and_weights], font_style='font-size:20px;');\n",
1076 | "\n",
1077 | "tokens_and_weights = explain(model, data.loc[i], \"FullDescription\")\n",
1078 | "draw_html([(tok, weight * 10) for tok, weight in tokens_and_weights]);"
1079 | ]
1080 | },
1081 | {
1082 | "cell_type": "code",
1083 | "execution_count": null,
1084 | "metadata": {},
1085 | "outputs": [],
1086 | "source": [
1087 | "i = 12077\n",
1088 | "tokens_and_weights = explain(model, data.loc[i], \"Title\")\n",
1089 | "draw_html([(tok, weight * 5) for tok, weight in tokens_and_weights], font_style='font-size:20px;');\n",
1090 | "\n",
1091 | "tokens_and_weights = explain(model, data.loc[i], \"FullDescription\")\n",
1092 | "draw_html([(tok, weight * 10) for tok, weight in tokens_and_weights]);"
1093 | ]
1094 | },
1095 | {
1096 | "cell_type": "code",
1097 | "execution_count": null,
1098 | "metadata": {},
1099 | "outputs": [],
1100 | "source": [
1101 | "i = np.random.randint(len(data))\n",
1102 | "print(\"Index:\", i)\n",
1103 | "print(\"Salary (gbp):\", np.expm1(model.predict(make_batch(data.iloc[i: i+1]))[0, 0]))\n",
1104 | "\n",
1105 | "tokens_and_weights = explain(model, data.loc[i], \"Title\")\n",
1106 | "draw_html([(tok, weight * 5) for tok, weight in tokens_and_weights], font_style='font-size:20px;');\n",
1107 | "\n",
1108 | "tokens_and_weights = explain(model, data.loc[i], \"FullDescription\")\n",
1109 | "draw_html([(tok, weight * 10) for tok, weight in tokens_and_weights]);"
1110 | ]
1111 | },
1112 | {
1113 | "cell_type": "markdown",
1114 | "metadata": {},
1115 | "source": [
1116 | "__Terrible start-up idea #1962:__ make a tool that automaticaly rephrases your job description (or CV) to meet salary expectations :)"
1117 | ]
1118 | },
1119 | {
1120 | "cell_type": "code",
1121 | "execution_count": null,
1122 | "metadata": {},
1123 | "outputs": [],
1124 | "source": []
1125 | }
1126 | ],
1127 | "metadata": {
1128 | "kernelspec": {
1129 | "display_name": "Python 3",
1130 | "language": "python",
1131 | "name": "python3"
1132 | },
1133 | "language_info": {
1134 | "codemirror_mode": {
1135 | "name": "ipython",
1136 | "version": 3
1137 | },
1138 | "file_extension": ".py",
1139 | "mimetype": "text/x-python",
1140 | "name": "python",
1141 | "nbconvert_exporter": "python",
1142 | "pygments_lexer": "ipython3",
1143 | "version": "3.6.6"
1144 | }
1145 | },
1146 | "nbformat": 4,
1147 | "nbformat_minor": 2
1148 | }
1149 |
--------------------------------------------------------------------------------
/resource/material/README.md:
--------------------------------------------------------------------------------
1 | # Material
2 |
--------------------------------------------------------------------------------
/resource/slides/MIT-data-science/.gitignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/MIT-data-science/.gitignore
--------------------------------------------------------------------------------
/resource/slides/MIT-data-science/Chapter 11. Introduction to Machine Learning.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/MIT-data-science/Chapter 11. Introduction to Machine Learning.pdf
--------------------------------------------------------------------------------
/resource/slides/MIT-data-science/Chapter 12. Clustering.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/MIT-data-science/Chapter 12. Clustering.pdf
--------------------------------------------------------------------------------
/resource/slides/MIT-data-science/Chapter13,14,15_MJLEE.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/MIT-data-science/Chapter13,14,15_MJLEE.pptx
--------------------------------------------------------------------------------
/resource/slides/MIT-data-science/MIT6_0002F16_lec1_cwjun.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/MIT-data-science/MIT6_0002F16_lec1_cwjun.pdf
--------------------------------------------------------------------------------
/resource/slides/MIT-data-science/MIT6_0002F16_lec2_cwjun.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/MIT-data-science/MIT6_0002F16_lec2_cwjun.pdf
--------------------------------------------------------------------------------
/resource/slides/MIT-data-science/MIT6_0002F16_lec5_lec6_ssg.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/MIT-data-science/MIT6_0002F16_lec5_lec6_ssg.pdf
--------------------------------------------------------------------------------
/resource/slides/MIT-data-science/MIT6_0002F16_lec9_Eon.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/MIT-data-science/MIT6_0002F16_lec9_Eon.pdf
--------------------------------------------------------------------------------
/resource/slides/README.md:
--------------------------------------------------------------------------------
1 | # Slides
2 |
--------------------------------------------------------------------------------
/resource/slides/deeppavlov/.gitignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/deeppavlov/.gitignore
--------------------------------------------------------------------------------
/resource/slides/deeppavlov/deeppavlov_Automatic spelling correction.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/deeppavlov/deeppavlov_Automatic spelling correction.pdf
--------------------------------------------------------------------------------
/resource/slides/linear-algebra/Chapter_3_Least_square.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/linear-algebra/Chapter_3_Least_square.pptx
--------------------------------------------------------------------------------
/resource/slides/linear-algebra/README.md:
--------------------------------------------------------------------------------
1 | # Linear_algebra
2 |
--------------------------------------------------------------------------------
/resource/slides/paper-review/.gitignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/paper-review/.gitignore
--------------------------------------------------------------------------------
/resource/slides/paper-review/Character-Aware Neural Language Models.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/paper-review/Character-Aware Neural Language Models.pdf
--------------------------------------------------------------------------------
/resource/slides/paper-review/Character-level CNN for text classification.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/paper-review/Character-level CNN for text classification.pptx
--------------------------------------------------------------------------------
/resource/slides/paper-review/Efficient Character-level Document Classification by Combining Convolution and Recurrent Layers.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/paper-review/Efficient Character-level Document Classification by Combining Convolution and Recurrent Layers.pptx
--------------------------------------------------------------------------------
/resource/slides/paper-review/Learning phrase representation using RNN Encoder-Decoder for SMT.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/paper-review/Learning phrase representation using RNN Encoder-Decoder for SMT.pdf
--------------------------------------------------------------------------------
/resource/slides/paper-review/MASS.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/paper-review/MASS.pdf
--------------------------------------------------------------------------------
/resource/slides/paper-review/Robustly optimized BERT Pretraining Approaches.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/paper-review/Robustly optimized BERT Pretraining Approaches.pptx
--------------------------------------------------------------------------------
/resource/slides/paper-review/TransformerXL.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/paper-review/TransformerXL.pdf
--------------------------------------------------------------------------------
/resource/slides/paper-review/VDCNN.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/paper-review/VDCNN.pdf
--------------------------------------------------------------------------------
/resource/slides/paper-review/seqtoseq_attention_20190417.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/paper-review/seqtoseq_attention_20190417.pdf
--------------------------------------------------------------------------------
/resource/slides/soynlp/Soynlp 2일차.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/soynlp/Soynlp 2일차.pptx
--------------------------------------------------------------------------------
/resource/slides/soynlp/empty:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/soynlp/empty
--------------------------------------------------------------------------------
/resource/slides/soynlp/fastcampus_1일차.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/soynlp/fastcampus_1일차.pptx
--------------------------------------------------------------------------------
/resource/slides/soynlp/fastcampus_day3/From frequency to meaning, Vector space models of semantics.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/soynlp/fastcampus_day3/From frequency to meaning, Vector space models of semantics.pdf
--------------------------------------------------------------------------------
/resource/slides/soynlp/fastcampus_day3/Korean conjugation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/soynlp/fastcampus_day3/Korean conjugation.pdf
--------------------------------------------------------------------------------
/resource/slides/soynlp/fastcampus_day3/Korean lemmatization.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/soynlp/fastcampus_day3/Korean lemmatization.pdf
--------------------------------------------------------------------------------
/resource/slides/soynlp/fastcampus_day3/L2_L1 regularization.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/soynlp/fastcampus_day3/L2_L1 regularization.pdf
--------------------------------------------------------------------------------
/resource/slides/soynlp/fastcampus_day3/LSA.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/soynlp/fastcampus_day3/LSA.pdf
--------------------------------------------------------------------------------
/resource/slides/soynlp/fastcampus_day3/Logistic regression with L1, L2 regularization and keyword extraction.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/soynlp/fastcampus_day3/Logistic regression with L1, L2 regularization and keyword extraction.pdf
--------------------------------------------------------------------------------
/resource/slides/soynlp/fastcampus_day3/Neural Word Embedding as Implicit Matrix Factorization.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/soynlp/fastcampus_day3/Neural Word Embedding as Implicit Matrix Factorization.pdf
--------------------------------------------------------------------------------
/resource/slides/yandex/2월 2째주-yandex- week04_seq2seq_seminar.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/yandex/2월 2째주-yandex- week04_seq2seq_seminar.pptx
--------------------------------------------------------------------------------
/resource/slides/yandex/2월 3째주-yandex-week04_seq2seq_seminar layer normalization.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/yandex/2월 3째주-yandex-week04_seq2seq_seminar layer normalization.pptx
--------------------------------------------------------------------------------
/resource/slides/yandex/yandex-week-07-mt-02.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/modudeepnlp/DeepNLP2019/e303cca4812f85c551968babc379a2f5e140868d/resource/slides/yandex/yandex-week-07-mt-02.pptx
--------------------------------------------------------------------------------