├── .dockerignore
├── .gitignore
├── Dockerfile
├── Document-Embeddings_Demo.ipynb
├── Gigaword_pruned_vectors.txt.gz
├── README.md
├── Word-Embeddings_Demo.ipynb
├── images
│   ├── NLP.png
│   ├── architecture.png
│   ├── architecture_2.png
│   ├── context.png
│   ├── cos_sim.png
│   ├── cos_sim_compare.png
│   ├── country_capital.png
│   ├── distance_measures.png
│   ├── eval_1.png
│   ├── eval_2.png
│   ├── gender_bias.png
│   ├── king_queen.png
│   ├── king_queen_2.png
│   ├── king_queen_vis.png
│   ├── normalize.jpg
│   ├── one_hot.png
│   ├── programmer_homemaker.png
│   ├── unit_circle.png
│   └── vectorize.png
├── movie_reviews.tsv
├── requirements.txt
├── tfidf_cos_matrix.csv
└── utils.py
/.dockerignore:
--------------------------------------------------------------------------------
1 | .git
2 | # Byte-compiled / optimized / DLL files
3 | __pycache__/
4 | *.py[cod]
5 | *$py.class
6 |
7 | # C extensions
8 | *.so
9 |
10 | # Distribution / packaging
11 | .Python
12 | env/
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # IPython Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # dotenv
80 | .env
81 |
82 | # virtualenv
83 | venv/
84 | ENV/
85 |
86 | # Spyder project settings
87 | .spyderproject
88 |
89 | # Rope project settings
90 | .ropeproject
91 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # backups
2 | *~
3 | *.swp
4 | .DS_Store
5 | # Byte-compiled / optimized / DLL files
6 | __pycache__/
7 | *.py[cod]
8 | *$py.class
9 |
10 | # C extensions
11 | *.so
12 |
13 | # Distribution / packaging
14 | .Python
15 | env/
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib/
23 | lib64/
24 | parts/
25 | sdist/
26 | var/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | .hypothesis/
51 |
52 | # Translations
53 | *.mo
54 | *.pot
55 |
56 | # Django stuff:
57 | *.log
58 | local_settings.py
59 |
60 | # Flask stuff:
61 | instance/
62 | .webassets-cache
63 |
64 | # Scrapy stuff:
65 | .scrapy
66 |
67 | # Sphinx documentation
68 | docs/_build/
69 |
70 | # PyBuilder
71 | target/
72 |
73 | # IPython Notebook
74 | .ipynb_checkpoints
75 |
76 | # pyenv
77 | .python-version
78 |
79 | # celery beat schedule file
80 | celerybeat-schedule
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | venv/
87 | ENV/
88 |
89 | # Spyder project settings
90 | .spyderproject
91 |
92 | # Rope project settings
93 | .ropeproject
94 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM jupyter/minimal-notebook
2 |
3 | WORKDIR /code
4 | COPY requirements.txt /code
5 | RUN pip install -r /code/requirements.txt
6 | COPY . /code
7 |
--------------------------------------------------------------------------------
/Document-Embeddings_Demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": false,
8 | "slideshow": {
9 | "slide_type": "-"
10 | }
11 | },
12 | "outputs": [
13 | {
14 | "data": {
15 | "text/plain": [
16 | "4"
17 | ]
18 | },
19 | "execution_count": 1,
20 | "metadata": {},
21 | "output_type": "execute_result"
22 | }
23 | ],
24 | "source": [
25 | "# using Jupyter notebooks\n",
26 | "# pressing SHIFT-ENTER will run the code in a cell\n",
27 | "2 + 2"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {
33 | "slideshow": {
34 | "slide_type": "-"
35 | }
36 | },
37 | "source": [
38 | "# Gentle Introduction to NLP through Document Embeddings"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {
44 | "slideshow": {
45 | "slide_type": "-"
46 | }
47 | },
48 | "source": [
49 | "### Quick Review of Last Time\n",
50 | "* Cosine Similarity\n",
51 | "\n",
52 | "### Two Approaches to Embedding Documents\n",
53 | "* Sparse, bag-of-words embeddings\n",
54 | " - Count embeddings\n",
55 | " - TFIDF embeddings\n",
56 | "* Dense embeddings"
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "metadata": {
62 | "slideshow": {
63 | "slide_type": "-"
64 | }
65 | },
66 | "source": [
67 | ""
68 | ]
69 | },
70 | {
71 | "cell_type": "markdown",
72 | "metadata": {
73 | "slideshow": {
74 | "slide_type": "-"
75 | }
76 | },
77 | "source": [
78 | "## From Last Time"
79 | ]
80 | },
81 | {
82 | "cell_type": "markdown",
83 | "metadata": {
84 | "slideshow": {
85 | "slide_type": "-"
86 | }
87 | },
88 | "source": [
89 | "\n",
90 | "http://dataaspirant.com/2015/04/11/five-most-popular-similarity-measures-implementation-in-python/"
91 | ]
92 | },
93 | {
94 | "cell_type": "markdown",
95 | "metadata": {
96 | "slideshow": {
97 | "slide_type": "-"
98 | }
99 | },
100 | "source": [
101 | "\n",
102 | "http://dataaspirant.com/2015/04/11/five-most-popular-similarity-measures-implementation-in-python/"
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "metadata": {
108 | "slideshow": {
109 | "slide_type": "-"
110 | }
111 | },
112 | "source": [
113 | "### calculating dot product\n",
114 | "$vector_a = [1,2,3]$\n",
115 | "$vector_b = [4,5,6]$\n",
116 | "$vector_a \\cdot vector_b = (1*4) + (2*5) + (3*6) = 4 + 10 + 18 = 32$ "
117 | ]
118 | },
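{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "As a quick sanity check of the arithmetic above, the same dot product in `numpy` (a minimal, self-contained sketch):\n",
  "\n",
  "```python\n",
  "import numpy as np\n",
  "\n",
  "vector_a = np.array([1, 2, 3])\n",
  "vector_b = np.array([4, 5, 6])\n",
  "vector_a.dot(vector_b)  # 32\n",
  "```"
 ]
},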
119 | {
120 | "cell_type": "markdown",
121 | "metadata": {
122 | "slideshow": {
123 | "slide_type": "-"
124 | }
125 | },
126 | "source": [
127 | "### normalizing a vector\n",
128 | "To normalize a vector, we scale its values so that the vector has a length (norm) of $1$.\n",
129 | "\n",
130 | "\n",
131 | "http://www.wikihow.com/Normalize-a-Vector"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 2,
137 | "metadata": {
138 | "collapsed": true,
139 | "slideshow": {
140 | "slide_type": "-"
141 | }
142 | },
143 | "outputs": [],
144 | "source": [
145 | "import numpy as np\n",
146 | "import utils\n",
147 | "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": 3,
153 | "metadata": {
154 | "collapsed": true,
155 | "slideshow": {
156 | "slide_type": "-"
157 | }
158 | },
159 | "outputs": [],
160 | "source": [
161 | "def normalize_vector(vector):\n",
162 | " \"\"\"\n",
163 | " Normalizes a vector so that it has unit length (a norm of 1)\n",
164 | " :param vector: a `numpy` vector\n",
165 | " :return: a normalized `numpy` vector\n",
166 | " \"\"\"\n",
167 | " # norm = np.sqrt(vector.dot(vector))\n",
168 | " # numpy has a built in function\n",
169 | " norm = np.linalg.norm(vector)\n",
170 | " if norm:\n",
171 | " return vector / norm\n",
172 | " else:\n",
173 | " # if norm == 0, then original vector was all 0s\n",
174 | " return vector"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": 4,
180 | "metadata": {
181 | "collapsed": false,
182 | "slideshow": {
183 | "slide_type": "-"
184 | }
185 | },
186 | "outputs": [
187 | {
188 | "name": "stdout",
189 | "output_type": "stream",
190 | "text": [
191 | "original vector [1 2 4]\n",
192 | "normalized vector [ 0.21821789 0.43643578 0.87287156]\n"
193 | ]
194 | }
195 | ],
196 | "source": [
197 | "vector_3d = np.array([1,2,4])\n",
198 | "print(\"original vector\", vector_3d)\n",
199 | "print(\"normalized vector\", normalize_vector(vector_3d))\n",
200 | "#0.218 is 1/4th of .873 just like 1 is 1/4th of 4"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": 5,
206 | "metadata": {
207 | "collapsed": true,
208 | "slideshow": {
209 | "slide_type": "-"
210 | }
211 | },
212 | "outputs": [],
213 | "source": [
214 | "def cos_sim(vector_one, vector_two):\n",
215 | " \"\"\"\n",
216 | " Calculate the cosine similarity of two `numpy` vectors\n",
217 | " :param vector_one: a `numpy` vector\n",
218 | " :param vector_two: a `numpy` vector\n",
219 | " :return: A score between -1 and 1 (0 to 1 for non-negative vectors)\n",
220 | " \"\"\"\n",
221 | " # ensure that both vectors are already normalized\n",
222 | " vector_one_norm = normalize_vector(vector_one)\n",
223 | " vector_two_norm = normalize_vector(vector_two)\n",
224 | " \n",
225 | " # calculate the dot product between the two normalized vectors\n",
226 | " return vector_one_norm.dot(vector_two_norm)"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": 6,
232 | "metadata": {
233 | "collapsed": false,
234 | "slideshow": {
235 | "slide_type": "-"
236 | }
237 | },
238 | "outputs": [
239 | {
240 | "name": "stdout",
241 | "output_type": "stream",
242 | "text": [
243 | "cosine similarity of vector_one and vector_two 0.948683298051\n",
244 | "cosine similarity of vector_one and vector_three 0.904534033733\n",
245 | "cosine similarity of vector_one and vector_four 0.904534033733\n"
246 | ]
247 | }
248 | ],
249 | "source": [
250 | "vector_one = np.array([1,1,1,1,1])\n",
251 | "vector_two = np.array([1,1,1,1,2])\n",
252 | "vector_three = np.array([1,2,3,4,5])\n",
253 | "vector_four = np.array([10,20,30,40,50])\n",
254 | "\n",
255 | "print(\"cosine similarity of vector_one and vector_two\", cos_sim(vector_one, vector_two))\n",
256 | "print(\"cosine similarity of vector_one and vector_three\", cos_sim(vector_one, vector_three))\n",
257 | "print(\"cosine similarity of vector_one and vector_four\", cos_sim(vector_one, vector_four))"
258 | ]
259 | },
260 | {
261 | "cell_type": "markdown",
262 | "metadata": {
263 | "slideshow": {
264 | "slide_type": "-"
265 | }
266 | },
267 | "source": [
268 | "### Interpreting \"Similarity\"\n",
269 | "\n",
270 | "https://medium.com/@camrongodbout/creating-a-search-engine-f2f429cab33c#.z7i9w8y5t"
271 | ]
272 | },
273 | {
274 | "cell_type": "markdown",
275 | "metadata": {
276 | "slideshow": {
277 | "slide_type": "-"
278 | }
279 | },
280 | "source": [
281 | ""
282 | ]
283 | },
284 | {
285 | "cell_type": "markdown",
286 | "metadata": {
287 | "slideshow": {
288 | "slide_type": "-"
289 | }
290 | },
291 | "source": [
292 | "## Embedding a Document \n",
293 | "### Bag of Words\n",
294 | "#### Count Vectorizing"
295 | ]
296 | },
297 | {
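{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "Before applying it to the movie reviews below, here is a minimal sketch of what count vectorizing produces on two toy sentences (the sentences are made up purely for illustration):\n",
  "\n",
  "```python\n",
  "from sklearn.feature_extraction.text import CountVectorizer\n",
  "\n",
  "toy_docs = [\"the cat sat on the mat\", \"the dog sat\"]\n",
  "toy_cv = CountVectorizer()\n",
  "toy_X = toy_cv.fit_transform(toy_docs)\n",
  "print(toy_cv.get_feature_names())  # vocabulary: one column per word\n",
  "print(toy_X.toarray())             # one row of raw counts per document\n",
  "```"
 ]
},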
298 | "cell_type": "markdown",
299 | "metadata": {
300 | "slideshow": {
301 | "slide_type": "-"
302 | }
303 | },
304 | "source": [
305 | ""
306 | ]
307 | },
308 | {
309 | "cell_type": "markdown",
310 | "metadata": {
311 | "slideshow": {
312 | "slide_type": "-"
313 | }
314 | },
315 | "source": [
316 | ""
317 | ]
318 | },
319 | {
320 | "cell_type": "markdown",
321 | "metadata": {
322 | "slideshow": {
323 | "slide_type": "-"
324 | }
325 | },
326 | "source": [
327 | "## Embedding a Document \n",
328 | "### Bag of Words\n",
329 | "#### TFIDF Vectorizing\n",
330 | "`TFIDF` = `term frequency, inverse document frequency`"
331 | ]
332 | },
333 | {
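{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "A small hand-rolled sketch of the raw TFIDF idea (illustrative only; the `TfidfVectorizer` used below additionally applies smoothing and length normalization):\n",
  "\n",
  "```python\n",
  "import math\n",
  "\n",
  "toy_docs = [\"the cat sat\", \"the dog sat\", \"the cat ran\"]\n",
  "term = \"cat\"\n",
  "doc = toy_docs[0].split()\n",
  "\n",
  "tf = doc.count(term) / len(doc)                # term frequency within one document\n",
  "df = sum(term in d.split() for d in toy_docs)  # number of documents containing the term\n",
  "idf = math.log(len(toy_docs) / df)             # inverse document frequency\n",
  "print(tf * idf)                                # TFIDF weight of \"cat\" in the first document\n",
  "```"
 ]
},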
334 | "cell_type": "markdown",
335 | "metadata": {
336 | "slideshow": {
337 | "slide_type": "-"
338 | }
339 | },
340 | "source": [
341 | ""
342 | ]
343 | },
344 | {
345 | "cell_type": "markdown",
346 | "metadata": {
347 | "slideshow": {
348 | "slide_type": "-"
349 | }
350 | },
351 | "source": [
352 | ""
353 | ]
354 | },
355 | {
356 | "cell_type": "markdown",
357 | "metadata": {
358 | "slideshow": {
359 | "slide_type": "-"
360 | }
361 | },
362 | "source": [
363 | ""
364 | ]
365 | },
366 | {
367 | "cell_type": "markdown",
368 | "metadata": {
369 | "slideshow": {
370 | "slide_type": "-"
371 | }
372 | },
373 | "source": [
374 | ""
375 | ]
376 | },
377 | {
378 | "cell_type": "markdown",
379 | "metadata": {
380 | "slideshow": {
381 | "slide_type": "-"
382 | }
383 | },
384 | "source": [
385 | ""
386 | ]
387 | },
388 | {
389 | "cell_type": "code",
390 | "execution_count": 7,
391 | "metadata": {
392 | "collapsed": false,
393 | "slideshow": {
394 | "slide_type": "-"
395 | }
396 | },
397 | "outputs": [],
398 | "source": [
399 | "# load reviews\n",
400 | "reviews_dict = utils.load_data(\"movie_reviews.tsv\")\n",
401 | "all_docs, lookup = utils.get_all_docs(reviews_dict)"
402 | ]
403 | },
404 | {
405 | "cell_type": "code",
406 | "execution_count": 8,
407 | "metadata": {
408 | "collapsed": false,
409 | "slideshow": {
410 | "slide_type": "-"
411 | }
412 | },
413 | "outputs": [
414 | {
415 | "data": {
416 | "text/plain": [
417 | "'\"With all this stuff going down at the moment with MJ i\\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\\'s feeling towards the press and also the obvious message of drugs are bad m\\'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.The actual feature film bit when it finally starts is only on for 20 minutes or so excluding the Smooth Criminal sequence and Joe Pesci is convincing as a psychopathic all powerful drug lord. Why he wants MJ dead so bad is beyond me. Because MJ overheard his plans? Nah, Joe Pesci\\'s character ranted that he wanted people to know it is he who is supplying drugs etc so i dunno, maybe he just hates MJ\\'s music.Lots of cool things in this like MJ turning into a car and a robot and the whole Speed Demon sequence. Also, the director must have had the patience of a saint when it came to filming the kiddy Bad sequence as usually directors hate working with one kid let alone a whole bunch of them performing a complex dance scene.Bottom line, this movie is for people who like MJ on one level or another (which i think is most people). If not, then stay away. It does try and give off a wholesome message and ironically MJ\\'s bestest buddy in this movie is a girl! Michael Jackson is truly one of the most talented people ever to grace this planet but is he guilty? Well, with all the attention i\\'ve gave this subject....hmmm well i don\\'t know because people can be different behind closed doors, i know this for a fact. He is either an extremely nice but stupid guy or one of the most sickest liars. I hope he is not the latter.\"'"
418 | ]
419 | },
420 | "execution_count": 8,
421 | "metadata": {},
422 | "output_type": "execute_result"
423 | }
424 | ],
425 | "source": [
426 | "# `all_docs` is a list of all documents\n",
427 | "all_docs[0]"
428 | ]
429 | },
430 | {
431 | "cell_type": "code",
432 | "execution_count": 9,
433 | "metadata": {
434 | "collapsed": false,
435 | "slideshow": {
436 | "slide_type": "-"
437 | }
438 | },
439 | "outputs": [
440 | {
441 | "data": {
442 | "text/plain": [
443 | "'\"With all this stuff going down at the moment with MJ i\\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\\'s feeling towards the press and also the obvious message of drugs are bad m\\'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.The actual feature film bit when it finally starts is only on for 20 minutes or so excluding the Smooth Criminal sequence and Joe Pesci is convincing as a psychopathic all powerful drug lord. Why he wants MJ dead so bad is beyond me. Because MJ overheard his plans? Nah, Joe Pesci\\'s character ranted that he wanted people to know it is he who is supplying drugs etc so i dunno, maybe he just hates MJ\\'s music.Lots of cool things in this like MJ turning into a car and a robot and the whole Speed Demon sequence. Also, the director must have had the patience of a saint when it came to filming the kiddy Bad sequence as usually directors hate working with one kid let alone a whole bunch of them performing a complex dance scene.Bottom line, this movie is for people who like MJ on one level or another (which i think is most people). If not, then stay away. It does try and give off a wholesome message and ironically MJ\\'s bestest buddy in this movie is a girl! Michael Jackson is truly one of the most talented people ever to grace this planet but is he guilty? Well, with all the attention i\\'ve gave this subject....hmmm well i don\\'t know because people can be different behind closed doors, i know this for a fact. He is either an extremely nice but stupid guy or one of the most sickest liars. I hope he is not the latter.\"'"
444 | ]
445 | },
446 | "execution_count": 9,
447 | "metadata": {},
448 | "output_type": "execute_result"
449 | }
450 | ],
451 | "source": [
452 | "# `lookup` is a lookup dict with {idx: text}\n",
453 | "lookup[0]"
454 | ]
455 | },
456 | {
457 | "cell_type": "markdown",
458 | "metadata": {
459 | "slideshow": {
460 | "slide_type": "-"
461 | }
462 | },
463 | "source": [
464 | "### Using `scikit-learn`\n",
465 | "\n",
466 | "[CountVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)\n",
467 | "[TfidfVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)"
468 | ]
469 | },
470 | {
471 | "cell_type": "code",
472 | "execution_count": 10,
473 | "metadata": {
474 | "collapsed": false,
475 | "slideshow": {
476 | "slide_type": "-"
477 | }
478 | },
479 | "outputs": [
480 | {
481 | "data": {
482 | "text/plain": [
483 | "<999x18373 sparse matrix of type ''\n",
484 | "\twith 137082 stored elements in Compressed Sparse Row format>"
485 | ]
486 | },
487 | "execution_count": 10,
488 | "metadata": {},
489 | "output_type": "execute_result"
490 | }
491 | ],
492 | "source": [
493 | "# Count\n",
494 | "# call the vectorizer\n",
495 | "cv = CountVectorizer(\n",
496 | " analyzer='word', # 'char', 'char_wb'\n",
497 | " ngram_range=(1,1), # unigrams and bigrams ==> (1, 2)\n",
498 | " stop_words=None, # 'english' \n",
499 | " max_df=1.0, # float ==> percentage, int ==> raw count\n",
500 | " min_df=1, # float ==> percentage, int ==> raw count\n",
501 | " binary=False # True\n",
502 | ") \n",
503 | "# run fit_transform on the list of documents\n",
504 | "X_cv = cv.fit_transform(all_docs)\n",
505 | "X_cv"
506 | ]
507 | },
508 | {
509 | "cell_type": "code",
510 | "execution_count": 11,
511 | "metadata": {
512 | "collapsed": false
513 | },
514 | "outputs": [
515 | {
516 | "data": {
517 | "text/plain": [
518 | "<999x18373 sparse matrix of type ''\n",
519 | "\twith 137082 stored elements in Compressed Sparse Row format>"
520 | ]
521 | },
522 | "execution_count": 11,
523 | "metadata": {},
524 | "output_type": "execute_result"
525 | }
526 | ],
527 | "source": [
528 | "# TFIDF\n",
529 | "# call the vectorizer\n",
530 | "tv = TfidfVectorizer(\n",
531 | " analyzer='word', # 'char'\n",
532 | " ngram_range=(1,1), # unigrams and bigrams ==> (1, 2)\n",
533 | " stop_words=None, # 'english' \n",
534 | " max_df=1.0, # float ==> percentage, int ==> raw count\n",
535 | " min_df=1, # float ==> percentage, int ==> raw count\n",
536 | ")\n",
537 | "# run fit_transform on the list of documents\n",
538 | "X_tv = tv.fit_transform(all_docs)\n",
539 | "X_tv"
540 | ]
541 | },
542 | {
543 | "cell_type": "markdown",
544 | "metadata": {},
545 | "source": [
546 | "The first dimension (`999` rows) indicates the number of documents we're processing.\n",
547 | "\n",
548 | "The second dimension (columns) indicates the number of features we're processing. This will increase/decrease depending on the `n-gram` parameter."
549 | ]
550 | },
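{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "For example (a sketch, not executed here): re-fitting with `ngram_range=(1, 2)` keeps the same 999 rows but adds a column for every distinct bigram, so the second dimension grows substantially.\n",
  "\n",
  "```python\n",
  "cv_bigram = CountVectorizer(analyzer='word', ngram_range=(1, 2))\n",
  "X_bigram = cv_bigram.fit_transform(all_docs)\n",
  "print(X_cv.shape)      # (999, 18373) with unigrams only\n",
  "print(X_bigram.shape)  # same number of rows, many more columns\n",
  "```"
 ]
},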
551 | {
552 | "cell_type": "markdown",
553 | "metadata": {},
554 | "source": [
555 | ""
556 | ]
557 | },
558 | {
559 | "cell_type": "code",
560 | "execution_count": 31,
561 | "metadata": {
562 | "collapsed": false,
563 | "slideshow": {
564 | "slide_type": "-"
565 | }
566 | },
567 | "outputs": [],
568 | "source": [
569 | "# see the vocabulary\n",
570 | "cv_vocab = cv.get_feature_names()\n",
571 | "# see the nonzero features (e.g. words, bigrams, character-grams) \n",
572 | "# for each row of data\n",
573 | "cv_words_per_doc = cv.inverse_transform(X_cv)\n",
574 | "tv_words_per_doc = tv.inverse_transform(X_tv)"
575 | ]
576 | },
577 | {
578 | "cell_type": "code",
579 | "execution_count": 32,
580 | "metadata": {
581 | "collapsed": false
582 | },
583 | "outputs": [
584 | {
585 | "data": {
586 | "text/plain": [
587 | "array(['latter', 'hope', 'liars', 'sickest', 'stupid', 'extremely',\n",
588 | " 'either', 'fact', 'doors', 'closed', 'behind', 'different', 'be',\n",
589 | " 'can', 'don'], \n",
590 | " dtype=''\n",
1165 | "\twith 137082 stored elements in Compressed Sparse Row format>"
1166 | ]
1167 | },
1168 | "execution_count": 29,
1169 | "metadata": {},
1170 | "output_type": "execute_result"
1171 | }
1172 | ],
1173 | "source": [
1174 | "X_cv"
1175 | ]
1176 | },
1177 | {
1178 | "cell_type": "markdown",
1179 | "metadata": {},
1180 | "source": [
1181 | ""
1182 | ]
1183 | },
1184 | {
1185 | "cell_type": "markdown",
1186 | "metadata": {
1187 | "slideshow": {
1188 | "slide_type": "-"
1189 | }
1190 | },
1191 | "source": [
1192 | "#### Problems with Bag-of-words:\n",
1193 | "\n",
1194 | " - same concepts, different words don't appear similar\n",
1195 | " - sparse matrix the size of *vocabulary*\n",
1196 | " - two different sentences, same embedding (see the sketch below)\n",
1197 | " \n",
1198 | "### So can we do better?"
1199 | ]
1200 | },
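{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "A minimal sketch of the last point: bag-of-words throws word order away, so two different sentences can receive the identical embedding (toy sentences for illustration):\n",
  "\n",
  "```python\n",
  "from sklearn.feature_extraction.text import CountVectorizer\n",
  "\n",
  "pair = [\"the dog bit the man\", \"the man bit the dog\"]\n",
  "X_pair = CountVectorizer().fit_transform(pair).toarray()\n",
  "print((X_pair[0] == X_pair[1]).all())  # True: same counts, different sentences\n",
  "```"
 ]
},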
1201 | {
1202 | "cell_type": "markdown",
1203 | "metadata": {
1204 | "slideshow": {
1205 | "slide_type": "-"
1206 | }
1207 | },
1208 | "source": [
1209 | "## Embedding a Document\n",
1210 | "### Neural Networks"
1211 | ]
1212 | },
1213 | {
1214 | "cell_type": "markdown",
1215 | "metadata": {
1216 | "collapsed": true,
1217 | "slideshow": {
1218 | "slide_type": "-"
1219 | }
1220 | },
1221 | "source": [
1222 | "\n",
1223 | "http://colah.github.io/posts/2015-08-Understanding-LSTMs/"
1224 | ]
1225 | },
1226 | {
1227 | "cell_type": "markdown",
1228 | "metadata": {
1229 | "collapsed": true,
1230 | "slideshow": {
1231 | "slide_type": "-"
1232 | }
1233 | },
1234 | "source": [
1235 | "\n",
1236 | "http://www.wildml.com/2015/11/understanding-convolutional-neural-networks-for-nlp/"
1237 | ]
1238 | },
1239 | {
1240 | "cell_type": "markdown",
1241 | "metadata": {
1242 | "collapsed": true,
1243 | "slideshow": {
1244 | "slide_type": "-"
1245 | }
1246 | },
1247 | "source": [
1248 | "\n",
1249 | "https://cs.umd.edu/~miyyer/pubs/2015_acl_dan.pdf"
1250 | ]
1251 | },
1252 | {
1253 | "cell_type": "markdown",
1254 | "metadata": {
1255 | "slideshow": {
1256 | "slide_type": "-"
1257 | }
1258 | },
1259 | "source": [
1260 | ""
1261 | ]
1262 | },
1263 | {
1264 | "cell_type": "markdown",
1265 | "metadata": {},
1266 | "source": [
1267 | "\n",
1268 | "http://www.wildml.com/2016/04/deep-learning-for-chatbots-part-1-introduction/"
1269 | ]
1270 | },
1271 | {
1272 | "cell_type": "markdown",
1273 | "metadata": {},
1274 | "source": [
1275 | "\n",
1276 | "https://www.researchgate.net/profile/Y_Bengio/publication/277411157_Deep_Learning/links/55e0cdf908ae2fac471ccf0f/Deep-Learning.pdf"
1277 | ]
1278 | },
1279 | {
1280 | "cell_type": "markdown",
1281 | "metadata": {
1282 | "slideshow": {
1283 | "slide_type": "-"
1284 | }
1285 | },
1286 | "source": [
1287 | "## Resources\n",
1288 | "[Stanford IR book, online](http://nlp.stanford.edu/IR-book/html/htmledition/)\n",
1289 | "[Bag of Words Meets Bags of Popcorn (Kaggle)](https://www.kaggle.com/c/word2vec-nlp-tutorial)\n",
1290 | "[Neural Networks for NLP](https://arxiv.org/pdf/1510.00726.pdf)\n",
1291 | "[Blog about LSTM's](http://colah.github.io/posts/2015-08-Understanding-LSTMs/)\n",
1292 | "[Blog about CNN's](http://www.wildml.com/2015/11/understanding-convolutional-neural-networks-for-nlp/)\n",
1293 | "[Examples of Recurrent Neural Networks](http://karpathy.github.io/2015/05/21/rnn-effectiveness/) \n",
1294 | "[Recent Talk by C. Manning about Embedding Words and Documents](https://simons.berkeley.edu/talks/christopher-manning-2017-3-27)"
1295 | ]
1296 | }
1297 | ],
1298 | "metadata": {
1299 | "kernelspec": {
1300 | "display_name": "Python 3",
1301 | "language": "python",
1302 | "name": "python3"
1303 | },
1304 | "language_info": {
1305 | "codemirror_mode": {
1306 | "name": "ipython",
1307 | "version": 3
1308 | },
1309 | "file_extension": ".py",
1310 | "mimetype": "text/x-python",
1311 | "name": "python",
1312 | "nbconvert_exporter": "python",
1313 | "pygments_lexer": "ipython3",
1314 | "version": "3.5.2"
1315 | }
1316 | },
1317 | "nbformat": 4,
1318 | "nbformat_minor": 0
1319 | }
1320 |
--------------------------------------------------------------------------------
/Gigaword_pruned_vectors.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/Gigaword_pruned_vectors.txt.gz
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # NLP Basics
2 |
3 | ## Preparation
4 |
5 | You can clone this repository:
6 |
7 | ```
8 | git clone git@github.com:michaelcapizzi/nlp-basics.git
9 | ```
10 |
11 | ## Docker install
12 |
13 | You can run the Jupyter notebooks in this repository with [Docker](http://docs.docker.com/installation) by running
14 |
15 | ```
16 | % docker build -t michaelcapizzi/nlp-basics .
17 | % docker run -p 8888:8888 --rm -it michaelcapizzi/nlp-basics # to start a Jupyter notebook server
18 | [I 19:41:11.459 NotebookApp] Writing notebook server cookie secret to /home/jovyan/.local/share/jupyter/runtime/notebook_cookie_secret
19 | [W 19:41:11.591 NotebookApp] Widgets are unavailable. Please install widgetsnbextension or ipywidgets 4.0
20 | [W 19:41:11.598 NotebookApp] WARNING: The notebook server is listening on all IP addresses and not using encryption. This is not recommended.
21 | [I 19:41:11.742 NotebookApp] JupyterLab alpha preview extension loaded from /opt/conda/lib/python3.5/site-packages/jupyterlab
22 | [I 19:41:11.802 NotebookApp] Serving notebooks from local directory: /code
23 | [I 19:41:11.802 NotebookApp] 0 active kernels
24 | [I 19:41:11.802 NotebookApp] The Jupyter Notebook is running at: http://[all ip addresses on your system]:8888/?token=f6925975b83f14758e79c55f81f1bec1267300747d5d6b08
25 | [I 19:41:11.802 NotebookApp] Use Control-C to stop this server and shut down all kernels (twice to skip confirmation).
26 | [C 19:41:11.803 NotebookApp]
27 |
28 | Copy/paste this URL into your browser when you connect for the first time,
29 | to login with a token:
30 | http://localhost:8888/?token=f6925975b83f14758e79c55f81f1bec1267300747d5d6b08
31 | ```
32 |
33 | Your specific token will be different.
34 |
35 | ## Manual pip / virtualenv install
36 |
37 | To run the `jupyter` notebooks you'll need a `python 3` environment with the following requirements:
38 |
39 | - jupyter
40 | - gensim
41 | - sklearn
42 | - numpy
43 | - beautifulsoup4
44 |
45 | All of these can be installed via `pip` or using the `requirements.txt` file:
46 |
47 | ```
48 | pip install -r requirements.txt
49 | ```
50 |
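If you prefer an isolated environment, one possible setup (assuming `python3` is on your `PATH`) is to create and activate a virtualenv before running the `pip install` command above:

```
python3 -m venv venv
source venv/bin/activate
```
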
51 | Then to open the notebook, simply run the following in the root folder of the cloned project:
52 |
53 | ```
54 | jupyter notebook
55 | ```
56 |
57 | This will open a new window in your default browser. You can then open the notebook file of choice (ending in `.ipynb`) by clicking on it.
58 |
59 | It will open in a new window.
60 |
61 | You can edit a given cell by clicking on it. To run the cell, press `SHIFT-ENTER`.
62 |
63 |
64 |
65 |
66 |
--------------------------------------------------------------------------------
/Word-Embeddings_Demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": false,
8 | "slideshow": {
9 | "slide_type": "-"
10 | }
11 | },
12 | "outputs": [
13 | {
14 | "data": {
15 | "text/plain": [
16 | "4"
17 | ]
18 | },
19 | "execution_count": 1,
20 | "metadata": {},
21 | "output_type": "execute_result"
22 | }
23 | ],
24 | "source": [
25 | "# using Jupyter notebooks\n",
26 | "# pressing SHIFT-ENTER will run the code in a cell\n",
27 | "2 + 2"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {
33 | "slideshow": {
34 | "slide_type": "-"
35 | }
36 | },
37 | "source": [
38 | "# Gentle Introduction to NLP through Word Embeddings"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {
44 | "slideshow": {
45 | "slide_type": "-"
46 | }
47 | },
48 | "source": [
49 | ""
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "metadata": {
55 | "slideshow": {
56 | "slide_type": "-"
57 | }
58 | },
59 | "source": [
60 | "# How To Tell If Two Words Are \"Similar\"?"
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "metadata": {
66 | "slideshow": {
67 | "slide_type": "-"
68 | }
69 | },
70 | "source": [
71 | "\n",
72 | "http://dataaspirant.com/2015/04/11/five-most-popular-similarity-measures-implementation-in-python/"
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "metadata": {
78 | "slideshow": {
79 | "slide_type": "-"
80 | }
81 | },
82 | "source": [
83 | "# Cosine Similarity"
84 | ]
85 | },
86 | {
87 | "cell_type": "markdown",
88 | "metadata": {
89 | "collapsed": true,
90 | "slideshow": {
91 | "slide_type": "-"
92 | }
93 | },
94 | "source": [
95 | "\n",
96 | "http://dataaspirant.com/2015/04/11/five-most-popular-similarity-measures-implementation-in-python/"
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "metadata": {
102 | "slideshow": {
103 | "slide_type": "-"
104 | }
105 | },
106 | "source": [
107 | "## calculating dot product\n",
108 | "$vector_a = [1,2,3]$\n",
109 | "$vector_b = [4,5,6]$\n",
110 | "$vector_a \\cdot vector_b = (1*4) + (2*5) + (3*6) = 4 + 10 + 18 = 32$ "
111 | ]
112 | },
113 | {
114 | "cell_type": "markdown",
115 | "metadata": {
116 | "slideshow": {
117 | "slide_type": "-"
118 | }
119 | },
120 | "source": [
121 | "## Normalizing a Vector"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "metadata": {
127 | "slideshow": {
128 | "slide_type": "-"
129 | }
130 | },
131 | "source": [
132 | "To normalize a vector, we scale its values so that the vector has a length (norm) of $1$.\n",
133 | "\n",
134 | "$vector_{normalized} = \\frac{vector}{\\sqrt{vector \\cdot vector}}$ \n",
135 | "\n",
136 | "\n",
137 | "http://www.wikihow.com/Normalize-a-Vector"
138 | ]
139 | },
140 | {
141 | "cell_type": "markdown",
142 | "metadata": {
143 | "slideshow": {
144 | "slide_type": "-"
145 | }
146 | },
147 | "source": [
148 | "\n",
149 | "https://en.wikipedia.org/wiki/Unit_vector"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 2,
155 | "metadata": {
156 | "collapsed": true,
157 | "slideshow": {
158 | "slide_type": "-"
159 | }
160 | },
161 | "outputs": [],
162 | "source": [
163 | "import numpy as np\n",
164 | "from nltk.corpus import wordnet\n",
165 | "from collections import OrderedDict\n",
166 | "from itertools import combinations\n",
167 | "import string\n",
168 | "from gensim import models"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": 3,
174 | "metadata": {
175 | "collapsed": true,
176 | "slideshow": {
177 | "slide_type": "-"
178 | }
179 | },
180 | "outputs": [],
181 | "source": [
182 | "def normalize_vector(vector):\n",
183 | " \"\"\"\n",
184 | " Normalizes a vector so that it has unit length (a norm of 1)\n",
185 | " :param vector: a `numpy` vector\n",
186 | " :return: a normalized `numpy` vector\n",
187 | " \"\"\"\n",
188 | " # norm = np.sqrt(vector.dot(vector))\n",
189 | " # numpy has a built in function\n",
190 | " norm = np.linalg.norm(vector)\n",
191 | " if norm:\n",
192 | " return vector / norm\n",
193 | " else:\n",
194 | " # if norm == 0, then original vector was all 0s\n",
195 | " return vector"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": 4,
201 | "metadata": {
202 | "collapsed": false,
203 | "slideshow": {
204 | "slide_type": "-"
205 | }
206 | },
207 | "outputs": [
208 | {
209 | "name": "stdout",
210 | "output_type": "stream",
211 | "text": [
212 | "original vector [1 2 4]\n",
213 | "normalized vector [ 0.21821789 0.43643578 0.87287156]\n"
214 | ]
215 | }
216 | ],
217 | "source": [
218 | "vector_3d = np.array([1,2,4])\n",
219 | "print(\"original vector\", vector_3d)\n",
220 | "print(\"normalized vector\", normalize_vector(vector_3d))\n",
221 | "#0.218 is 1/4th of .873 just like 1 is 1/4th of 4"
222 | ]
223 | },
224 | {
225 | "cell_type": "markdown",
226 | "metadata": {
227 | "slideshow": {
228 | "slide_type": "-"
229 | }
230 | },
231 | "source": [
232 | "## Calculating Cosine Similarity"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": 5,
238 | "metadata": {
239 | "collapsed": true,
240 | "slideshow": {
241 | "slide_type": "-"
242 | }
243 | },
244 | "outputs": [],
245 | "source": [
246 | "def cos_sim(vector_one, vector_two):\n",
247 | " \"\"\"\n",
248 | " Calculate the cosine similarity of two `numpy` vectors\n",
249 | " :param vector_one: a `numpy` vector\n",
250 | " :param vector_two: a `numpy` vector\n",
251 | " :return: A score between -1 and 1 (0 to 1 for non-negative vectors)\n",
252 | " \"\"\"\n",
253 | " # ensure that both vectors are already normalized\n",
254 | " vector_one_norm = normalize_vector(vector_one)\n",
255 | " vector_two_norm = normalize_vector(vector_two)\n",
256 | " \n",
257 | " # calculate the dot product between the two normalized vectors\n",
258 | " return vector_one_norm.dot(vector_two_norm)"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": 6,
264 | "metadata": {
265 | "collapsed": false,
266 | "slideshow": {
267 | "slide_type": "-"
268 | }
269 | },
270 | "outputs": [
271 | {
272 | "name": "stdout",
273 | "output_type": "stream",
274 | "text": [
275 | "cosine similarity of vector_one and vector_two 0.948683298051\n",
276 | "cosine similarity of vector_one and vector_three 0.904534033733\n",
277 | "cosine similarity of vector_one and vector_four 0.904534033733\n"
278 | ]
279 | }
280 | ],
281 | "source": [
282 | "vector_one = np.array([1,1,1,1,1])\n",
283 | "vector_two = np.array([1,1,1,1,2])\n",
284 | "vector_three = np.array([1,2,3,4,5])\n",
285 | "vector_four = np.array([10,20,30,40,50])\n",
286 | "\n",
287 | "print(\"cosine similarity of vector_one and vector_two\", cos_sim(vector_one, vector_two))\n",
288 | "print(\"cosine similarity of vector_one and vector_three\", cos_sim(vector_one, vector_three))\n",
289 | "print(\"cosine similarity of vector_one and vector_four\", cos_sim(vector_one, vector_four))"
290 | ]
291 | },
292 | {
293 | "cell_type": "markdown",
294 | "metadata": {
295 | "slideshow": {
296 | "slide_type": "-"
297 | }
298 | },
299 | "source": [
300 | "## Measuring the \"Similarity\" of Words"
301 | ]
302 | },
303 | {
304 | "cell_type": "markdown",
305 | "metadata": {
306 | "slideshow": {
307 | "slide_type": "-"
308 | }
309 | },
310 | "source": [
311 | "\n",
312 | "https://medium.com/@camrongodbout/creating-a-search-engine-f2f429cab33c#.z7i9w8y5t"
313 | ]
314 | },
315 | {
316 | "cell_type": "markdown",
317 | "metadata": {
318 | "slideshow": {
319 | "slide_type": "-"
320 | }
321 | },
322 | "source": [
323 | ""
324 | ]
325 | },
326 | {
327 | "cell_type": "markdown",
328 | "metadata": {
329 | "slideshow": {
330 | "slide_type": "-"
331 | }
332 | },
333 | "source": [
334 | "### Option 1: One-hot vectors\n",
335 | "\n",
336 | "\n",
337 | "https://blog.acolyer.org/2016/04/21/the-amazing-power-of-word-vectors/"
338 | ]
339 | },
340 | {
341 | "cell_type": "code",
342 | "execution_count": 7,
343 | "metadata": {
344 | "collapsed": true,
345 | "slideshow": {
346 | "slide_type": "-"
347 | }
348 | },
349 | "outputs": [],
350 | "source": [
351 | "vocabulary = ['apple', 'banana', 'orange', 'cantaloupe', 'peach']"
352 | ]
353 | },
354 | {
355 | "cell_type": "code",
356 | "execution_count": 8,
357 | "metadata": {
358 | "collapsed": false,
359 | "slideshow": {
360 | "slide_type": "-"
361 | }
362 | },
363 | "outputs": [],
364 | "source": [
365 | "# generate vocabulary lookup\n",
366 | "def build_voc_lookup(list_of_voc):\n",
367 | " \"\"\"\n",
368 | " Generates a dictionary where the key is the word and the value is its index\n",
369 | " :param list_of_voc: list of vocabulary words\n",
370 | " :return: Dictionary of vocabulary\n",
371 | " \"\"\"\n",
372 | " lookup_dict = OrderedDict()\n",
373 | " counter = 0\n",
374 | " for word in list_of_voc:\n",
375 | " lookup_dict[word] = counter\n",
376 | " counter+=1\n",
377 | " return lookup_dict"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": 9,
383 | "metadata": {
384 | "collapsed": true,
385 | "slideshow": {
386 | "slide_type": "-"
387 | }
388 | },
389 | "outputs": [],
390 | "source": [
391 | "# lookup word\n",
392 | "def lookup_word(lookup_dict, word):\n",
393 | " \"\"\" \n",
394 | " Looks up a given word in the vocabulary dictionary, and returns None if word not in vocabulary\n",
395 | " :param lookup_dict: lookup-dictionary built with build_voc_lookup()\n",
396 | " :param word: the word to index\n",
397 | " :return: index of word in vocabulary or None\n",
398 | " \"\"\"\n",
399 | " if word in lookup_dict:\n",
400 | " return lookup_dict[word]\n",
401 | " else:\n",
402 | " return None"
403 | ]
404 | },
405 | {
406 | "cell_type": "code",
407 | "execution_count": 10,
408 | "metadata": {
409 | "collapsed": false,
410 | "slideshow": {
411 | "slide_type": "-"
412 | }
413 | },
414 | "outputs": [
415 | {
416 | "name": "stdout",
417 | "output_type": "stream",
418 | "text": [
419 | "4\n",
420 | "None\n"
421 | ]
422 | }
423 | ],
424 | "source": [
425 | "lookup_dict = build_voc_lookup(vocabulary)\n",
426 | "print(lookup_word(lookup_dict, 'peach'))\n",
427 | "print(lookup_word(lookup_dict, 'hashbrown'))"
428 | ]
429 | },
430 | {
431 | "cell_type": "code",
432 | "execution_count": 11,
433 | "metadata": {
434 | "collapsed": false,
435 | "slideshow": {
436 | "slide_type": "-"
437 | }
438 | },
439 | "outputs": [],
440 | "source": [
441 | "# build one-hot vector for word\n",
442 | "def make_one_hot(lookup_dict, word):\n",
443 | " \"\"\"\n",
444 | " Builds a one-hot numpy vector for a word\n",
445 | " :param lookup_dict: lookup-dictionary built with build_voc_lookup()\n",
446 | " :param word: word to convert to one-hot\n",
447 | " :return numpy vector with dimension equal to size of vocabulary\n",
448 | " \"\"\"\n",
449 | " # get size of vocabulary\n",
450 | " voc_size = len(lookup_dict.items())\n",
451 | " # initialize empty vector of zeros with the size of the vocabulary\n",
452 | " one_hot = np.zeros((voc_size))\n",
453 | " # get index of word (or None if not in vocabulary)\n",
454 | " word_index = lookup_word(lookup_dict, word)\n",
455 | " # set the dimension of the one-hot at the word's index in the vocabulary to 1\n",
456 | " if word_index or word_index == 0:\n",
457 | " one_hot[word_index] = 1\n",
458 | " # if word not in vocabulary, the one-hot will remain zeros\n",
459 | " return one_hot"
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": 12,
465 | "metadata": {
466 | "collapsed": false,
467 | "slideshow": {
468 | "slide_type": "-"
469 | }
470 | },
471 | "outputs": [
472 | {
473 | "name": "stdout",
474 | "output_type": "stream",
475 | "text": [
476 | "one-hot vector for ' apple' [ 1. 0. 0. 0. 0.]\n",
477 | "one-hot vector for ' banana' [ 0. 1. 0. 0. 0.]\n",
478 | "one-hot vector for ' orange' [ 0. 0. 1. 0. 0.]\n",
479 | "one-hot vector for ' cantaloupe' [ 0. 0. 0. 1. 0.]\n",
480 | "one-hot vector for ' peach' [ 0. 0. 0. 0. 1.]\n",
481 | "one-hot vector for ' hashbrown' [ 0. 0. 0. 0. 0.]\n",
482 | "one-hot vector for ' Capizzi' [ 0. 0. 0. 0. 0.]\n"
483 | ]
484 | }
485 | ],
486 | "source": [
487 | "for word in vocabulary + ['hashbrown', 'Capizzi']:\n",
488 | " print(\"one-hot vector for '{:>11}'\".format(word), make_one_hot(lookup_dict, word))"
489 | ]
490 | },
491 | {
492 | "cell_type": "markdown",
493 | "metadata": {
494 | "slideshow": {
495 | "slide_type": "-"
496 | }
497 | },
498 | "source": [
499 | "#### The problem with one-hot vectors"
500 | ]
501 | },
502 | {
503 | "cell_type": "code",
504 | "execution_count": 13,
505 | "metadata": {
506 | "collapsed": false,
507 | "slideshow": {
508 | "slide_type": "-"
509 | }
510 | },
511 | "outputs": [
512 | {
513 | "name": "stdout",
514 | "output_type": "stream",
515 | "text": [
516 | "cosine similarity between apple and banana 0.0\n",
517 | "cosine similarity between apple and orange 0.0\n",
518 | "cosine similarity between apple and cantaloupe 0.0\n",
519 | "cosine similarity between apple and peach 0.0\n",
520 | "cosine similarity between apple and Phoenix 0.0\n",
521 | "cosine similarity between banana and orange 0.0\n",
522 | "cosine similarity between banana and cantaloupe 0.0\n",
523 | "cosine similarity between banana and peach 0.0\n",
524 | "cosine similarity between banana and Phoenix 0.0\n",
525 | "cosine similarity between orange and cantaloupe 0.0\n",
526 | "cosine similarity between orange and peach 0.0\n",
527 | "cosine similarity between orange and Phoenix 0.0\n",
528 | "cosine similarity between cantaloupe and peach 0.0\n",
529 | "cosine similarity between cantaloupe and Phoenix 0.0\n",
530 | "cosine similarity between peach and Phoenix 0.0\n"
531 | ]
532 | }
533 | ],
534 | "source": [
535 | "# add an OOV word to vocabulary\n",
536 | "vocabulary_plus_oov = vocabulary + [\"Phoenix\"]\n",
537 | "# get all combinations\n",
538 | "all_combinations = combinations(vocabulary_plus_oov, 2)\n",
539 | "# iterate through all combinations and calculate cosine similarity\n",
540 | "for (word1, word2) in all_combinations:\n",
541 | " one_hot_word_1 = make_one_hot(lookup_dict, word1)\n",
542 | " one_hot_word_2 = make_one_hot(lookup_dict, word2)\n",
543 | " print(\"cosine similarity between {} and {}\".format(word1, word2), cos_sim(one_hot_word_1, one_hot_word_2))"
544 | ]
545 | },
546 | {
547 | "cell_type": "markdown",
548 | "metadata": {
549 | "slideshow": {
550 | "slide_type": "-"
551 | }
552 | },
553 | "source": [
554 | "### Option 2: Encode spelling\n",
555 | "Following a pattern similar to the one-hot vector of a word over a vocabulary, let's build word vectors that record the frequency of each letter present"
556 | ]
557 | },
558 | {
559 | "cell_type": "code",
560 | "execution_count": 14,
561 | "metadata": {
562 | "collapsed": true,
563 | "slideshow": {
564 | "slide_type": "-"
565 | }
566 | },
567 | "outputs": [],
568 | "source": [
569 | "alphabet = list(string.ascii_lowercase)"
570 | ]
571 | },
572 | {
573 | "cell_type": "code",
574 | "execution_count": 15,
575 | "metadata": {
576 | "collapsed": false,
577 | "slideshow": {
578 | "slide_type": "-"
579 | }
580 | },
581 | "outputs": [],
582 | "source": [
583 | "# since we don't need to worry about \"out-of-vocabulary\" now, we can just use alphabet.index([letter])\n",
584 | "def lookup_letter(letter):\n",
585 | " return alphabet.index(letter.lower())"
586 | ]
587 | },
588 | {
589 | "cell_type": "code",
590 | "execution_count": 16,
591 | "metadata": {
592 | "collapsed": false,
593 | "slideshow": {
594 | "slide_type": "-"
595 | }
596 | },
597 | "outputs": [
598 | {
599 | "name": "stdout",
600 | "output_type": "stream",
601 | "text": [
602 | "a 0\n",
603 | "A 0\n"
604 | ]
605 | }
606 | ],
607 | "source": [
608 | "print(\"a\", lookup_letter('a'))\n",
609 | "print(\"A\", lookup_letter('A'))"
610 | ]
611 | },
612 | {
613 | "cell_type": "code",
614 | "execution_count": 17,
615 | "metadata": {
616 | "collapsed": false,
617 | "slideshow": {
618 | "slide_type": "-"
619 | }
620 | },
621 | "outputs": [],
622 | "source": [
623 | "def make_spelling_vector(word):\n",
624 | " \"\"\"\n",
625 | " Converts a word into a vector of dimension 26 where each cell contains the count for that letter\n",
626 | " :param word: word to vectorize\n",
627 | " :return: numpy vector of 26 dimensions\n",
628 | " \"\"\"\n",
629 | " # initialize vector with zeros\n",
630 | " spelling_vector = np.zeros((26))\n",
631 | " # iterate through each letter and update count\n",
632 | " for letter in word:\n",
633 | " if letter in string.ascii_letters:\n",
634 | " letter_index = lookup_letter(letter)\n",
635 | " spelling_vector[letter_index] = spelling_vector[letter_index] + 1\n",
636 | " return spelling_vector"
637 | ]
638 | },
639 | {
640 | "cell_type": "code",
641 | "execution_count": 18,
642 | "metadata": {
643 | "collapsed": false,
644 | "slideshow": {
645 | "slide_type": "-"
646 | }
647 | },
648 | "outputs": [
649 | {
650 | "data": {
651 | "text/plain": [
652 | "array([ 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0.,\n",
653 | " 0., 0., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])"
654 | ]
655 | },
656 | "execution_count": 18,
657 | "metadata": {},
658 | "output_type": "execute_result"
659 | }
660 | ],
661 | "source": [
662 | "make_spelling_vector(\"apple\")"
663 | ]
664 | },
665 | {
666 | "cell_type": "code",
667 | "execution_count": 19,
668 | "metadata": {
669 | "collapsed": true,
670 | "slideshow": {
671 | "slide_type": "-"
672 | }
673 | },
674 | "outputs": [],
675 | "source": [
676 | "vocabulary = [\"apple\", \"banana\", \"orange\", \"cantaloupe\", \"peach\", \"Phoenix\"]"
677 | ]
678 | },
679 | {
680 | "cell_type": "code",
681 | "execution_count": 20,
682 | "metadata": {
683 | "collapsed": false,
684 | "slideshow": {
685 | "slide_type": "-"
686 | }
687 | },
688 | "outputs": [
689 | {
690 | "name": "stdout",
691 | "output_type": "stream",
692 | "text": [
693 | "cosine similarity between apple and banana 0.303045763366\n",
694 | "cosine similarity between apple and orange 0.308606699924\n",
695 | "cosine similarity between apple and cantaloupe 0.654653670708\n",
696 | "cosine similarity between apple and peach 0.676123403783\n",
697 | "cosine similarity between apple and Phoenix 0.428571428571\n",
698 | "cosine similarity between banana and orange 0.54554472559\n",
699 | "cosine similarity between banana and cantaloupe 0.617213399848\n",
700 | "cosine similarity between banana and peach 0.3585685828\n",
701 | "cosine similarity between banana and Phoenix 0.20203050891\n",
702 | "cosine similarity between orange and cantaloupe 0.589255650989\n",
703 | "cosine similarity between orange and peach 0.36514837167\n",
704 | "cosine similarity between orange and Phoenix 0.462910049886\n",
705 | "cosine similarity between cantaloupe and peach 0.645497224368\n",
706 | "cosine similarity between cantaloupe and Phoenix 0.436435780472\n",
707 | "cosine similarity between peach and Phoenix 0.507092552837\n"
708 | ]
709 | }
710 | ],
711 | "source": [
712 | "# reset the generator\n",
713 | "all_combinations = combinations(vocabulary, 2)\n",
714 | "# iterate through all words\n",
715 | "for (word1, word2) in all_combinations:\n",
716 | " spelling_vector_1 = make_spelling_vector(word1)\n",
717 | " spelling_vector_2 = make_spelling_vector(word2)\n",
718 | " print(\"cosine similarity between {} and {}\".format(word1, word2), cos_sim(spelling_vector_1, spelling_vector_2))"
719 | ]
720 | },
721 | {
722 | "cell_type": "code",
723 | "execution_count": 21,
724 | "metadata": {
725 | "collapsed": false,
726 | "slideshow": {
727 | "slide_type": "-"
728 | }
729 | },
730 | "outputs": [
731 | {
732 | "data": {
733 | "text/plain": [
734 | "True"
735 | ]
736 | },
737 | "execution_count": 21,
738 | "metadata": {},
739 | "output_type": "execute_result"
740 | }
741 | ],
742 | "source": [
743 | "# what if two words share the same letters?\n",
744 | "dog = make_spelling_vector(\"dog\")\n",
745 | "god = make_spelling_vector(\"God\")\n",
746 | "# god == dog\n",
747 | "np.all(god == dog)"
748 | ]
749 | },
750 | {
751 | "cell_type": "markdown",
752 | "metadata": {
753 | "slideshow": {
754 | "slide_type": "-"
755 | }
756 | },
757 | "source": [
758 | "#### We've successfully generated similarity scores! But...\n",
759 | "\n",
760 | "Do they really reflect anything semantic? \n",
761 | "\n",
762 | "In other words, does it make sense that **\"peach\"** and **\"Phoenix\"**\n",
763 | "(`cosine similarity = 0.507`)\n",
764 | "are **more** similar than **\"peach\"** and **\"orange\"**\n",
765 | "(`cosine similarity = .365`)?"
766 | ]
767 | },
768 | {
769 | "cell_type": "markdown",
770 | "metadata": {
771 | "slideshow": {
772 | "slide_type": "-"
773 | }
774 | },
775 | "source": [
776 | "### Option 3: Word Embeddings\n",
777 | "Create a \"dense\" representation of each word where proximity in vector space represents \"similarity\"."
778 | ]
779 | },
780 | {
781 | "cell_type": "markdown",
782 | "metadata": {
783 | "slideshow": {
784 | "slide_type": "-"
785 | }
786 | },
787 | "source": [
788 | "\n",
789 | "https://blog.acolyer.org/2016/04/21/the-amazing-power-of-word-vectors/"
790 | ]
791 | },
792 | {
793 | "cell_type": "markdown",
794 | "metadata": {
795 | "slideshow": {
796 | "slide_type": "-"
797 | }
798 | },
799 | "source": [
800 | "\n",
801 | "https://arxiv.org/pdf/1301.3781v3.pdf"
802 | ]
803 | },
804 | {
805 | "cell_type": "markdown",
806 | "metadata": {
807 | "slideshow": {
808 | "slide_type": "-"
809 | }
810 | },
811 | "source": [
812 | "\n",
813 | "https://blog.acolyer.org/2016/04/21/the-amazing-power-of-word-vectors/"
814 | ]
815 | },
816 | {
817 | "cell_type": "markdown",
818 | "metadata": {
819 | "slideshow": {
820 | "slide_type": "-"
821 | }
822 | },
823 | "source": [
824 | "#### Using the `gensim` package in `python`\n",
825 | "https://radimrehurek.com/gensim/models/word2vec.html"
826 | ]
827 | },
828 | {
829 | "cell_type": "code",
830 | "execution_count": 22,
831 | "metadata": {
832 | "collapsed": false,
833 | "slideshow": {
834 | "slide_type": "-"
835 | }
836 | },
837 | "outputs": [],
838 | "source": [
839 | "# load existing word2vec vectors into gensim\n",
840 | "\n",
841 | "# most frequent 125k words in Gigaword corpus\n",
842 | "w2v = models.Word2Vec.load_word2vec_format(fname=\"Gigaword_pruned_vectors.txt.gz\", binary=False)\n",
843 | "\n",
844 | "# original `word2vec` embeddings can be downloaded here:\n",
845 | "# https://code.google.com/archive/p/word2vec/"
846 | ]
847 | },
848 | {
849 | "cell_type": "markdown",
850 | "metadata": {
851 | "slideshow": {
852 | "slide_type": "-"
853 | }
854 | },
855 | "source": [
856 | "Pre-trained word embeddings can be loaded into `gensim` in `.txt` or `.txt.gz` format *as long as* the first line identifies (1) the number of words in the file and (2) the dimensionality of the vectors\n",
857 | " \n",
858 | "```\n",
859 | "199999 200\n",
860 | "and -0.065843 -0.133472 0.020263 0.102796 0.003295 0.025878 -0.071714 0.054211 -0.026698 -0.036176 -0.024954 0.042049 -0.165819 -0.067038 0.117293 0.046338 0.012154 0.026929 -0.020248 0.120186 0.081922 0.062471 -0.063391 -0.048321 -0.108106 -0.067974 0.092109 -0.034439 -0.024319 0.008799 -0.099953\n",
861 | "...\n",
862 | "```"
863 | ]
864 | },
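{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "As a sketch of that format, the snippet below writes a tiny vector file (made-up numbers, a hypothetical `tiny_vectors.txt`) and loads it the same way as above; note that in newer `gensim` releases the loader lives on `models.KeyedVectors` rather than `models.Word2Vec`:\n",
  "\n",
  "```python\n",
  "lines = [\"3 4\",                         # 3 words, 4 dimensions per vector\n",
  "         \"apple 0.1 0.2 0.3 0.4\",       # made-up numbers for illustration\n",
  "         \"banana 0.2 0.1 0.4 0.3\",\n",
  "         \"orange 0.4 0.3 0.2 0.1\"]\n",
  "with open(\"tiny_vectors.txt\", \"w\") as f:\n",
  "    f.write(\"\\n\".join(lines) + \"\\n\")\n",
  "\n",
  "tiny_w2v = models.Word2Vec.load_word2vec_format(\"tiny_vectors.txt\", binary=False)\n",
  "tiny_w2v.similarity(\"apple\", \"banana\")\n",
  "```"
 ]
},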
865 | {
866 | "cell_type": "code",
867 | "execution_count": 23,
868 | "metadata": {
869 | "collapsed": false,
870 | "slideshow": {
871 | "slide_type": "-"
872 | }
873 | },
874 | "outputs": [
875 | {
876 | "data": {
877 | "text/plain": [
878 | "array([ 0.06338 , -0.146809 , 0.110004 , -0.01205 , -0.045637 ,\n",
879 | " -0.02224 , -0.045153 , 0.079144 , -0.027216 , -0.027647 ,\n",
880 | " -0.000434 , 0.108648 , -0.060456 , -0.129502 , 0.010897 ,\n",
881 | " 0.055499 , 0.086099 , 0.055282 , 0.007365 , 0.167188 ,\n",
882 | " 0.016705 , 0.0744 , -0.07096 , -0.105974 , -0.095631 ,\n",
883 | " 0.006107 , 0.12862299, -0.033055 , -0.020641 , 0.024765 ,\n",
884 | " -0.048181 , -0.090195 , 0.007408 , 0.073138 , 0.031994 ,\n",
885 | " -0.014252 , 0.102764 , -0.081244 , 0.10513 , 0.039809 ,\n",
886 | " -0.050727 , 0.002429 , -0.01506 , -0.085081 , -0.02245 ,\n",
887 | " 0.102064 , -0.009099 , -0.092295 , -0.040276 , 0.148752 ], dtype=float32)"
888 | ]
889 | },
890 | "execution_count": 23,
891 | "metadata": {},
892 | "output_type": "execute_result"
893 | }
894 | ],
895 | "source": [
896 | "# the first 50 dimensions of the vector for \"the\"\n",
897 | "w2v[\"the\"][0:50]"
898 | ]
899 | },
900 | {
901 | "cell_type": "code",
902 | "execution_count": 24,
903 | "metadata": {
904 | "collapsed": false,
905 | "slideshow": {
906 | "slide_type": "subslide"
907 | }
908 | },
909 | "outputs": [
910 | {
911 | "ename": "KeyError",
912 | "evalue": "'abcdef'",
913 | "output_type": "error",
914 | "traceback": [
915 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
916 | "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
917 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mw2v\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"abcdef\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
918 | "\u001b[0;32m/Users/mcapizzi/miniconda3/envs/word-embedding/lib/python3.5/site-packages/gensim/models/word2vec.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, words)\u001b[0m\n\u001b[1;32m 1502\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwords\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstring_types\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1503\u001b[0m \u001b[0;31m# allow calls like trained_model['office'], as a shorthand for trained_model[['office']]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1504\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msyn0\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvocab\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mwords\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1505\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1506\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mvstack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msyn0\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvocab\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mword\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mword\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mwords\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
919 | "\u001b[0;31mKeyError\u001b[0m: 'abcdef'"
920 | ]
921 | }
922 | ],
923 | "source": [
924 | "w2v[\"abcdef\"]"
925 | ]
926 | },
927 | {
928 | "cell_type": "code",
929 | "execution_count": 25,
930 | "metadata": {
931 | "collapsed": false,
932 | "slideshow": {
933 | "slide_type": "subslide"
934 | }
935 | },
936 | "outputs": [],
937 | "source": [
938 | "def get_vector(word):\n",
939 | " \"\"\"\n",
940 | " Returns the word vector for that word or a vector of 0s for out-of-vocabulary\n",
941 | "    :param word: word to look up in the vectors\n",
942 | " :return: vector or vector of zeros\n",
943 | " \"\"\"\n",
944 | " # determine vector length\n",
945 | " w2v_length = len(w2v[\"the\"])\n",
946 | " # get vector\n",
947 | " if word in w2v:\n",
948 | " return w2v[word]\n",
949 | " else:\n",
950 | " return np.zeros((w2v_length))"
951 | ]
952 | },
953 | {
954 | "cell_type": "code",
955 | "execution_count": 26,
956 | "metadata": {
957 | "collapsed": false,
958 | "slideshow": {
959 | "slide_type": "-"
960 | }
961 | },
962 | "outputs": [
963 | {
964 | "data": {
965 | "text/plain": [
966 | "array([ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
967 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
968 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
969 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])"
970 | ]
971 | },
972 | "execution_count": 26,
973 | "metadata": {},
974 | "output_type": "execute_result"
975 | }
976 | ],
977 | "source": [
978 | "get_vector(\"abcdef\")[0:50]"
979 | ]
980 | },
981 | {
982 | "cell_type": "code",
983 | "execution_count": 27,
984 | "metadata": {
985 | "collapsed": false,
986 | "slideshow": {
987 | "slide_type": "-"
988 | }
989 | },
990 | "outputs": [
991 | {
992 | "data": {
993 | "text/plain": [
994 | "[('monarch', 0.7166919708251953),\n",
995 | " ('princess', 0.7164901494979858),\n",
996 | " ('margrethe', 0.6889792680740356),\n",
997 | " ('beatrix', 0.6878944039344788),\n",
998 | " ('coronation', 0.6789792776107788),\n",
999 | " ('prince', 0.6730599403381348),\n",
1000 | " ('wilhelmina', 0.6619384288787842),\n",
1001 | " ('mettemarit', 0.6575925946235657),\n",
1002 | " ('consort', 0.6492267847061157),\n",
1003 | " ('duchess', 0.6444146633148193)]"
1004 | ]
1005 | },
1006 | "execution_count": 27,
1007 | "metadata": {},
1008 | "output_type": "execute_result"
1009 | }
1010 | ],
1011 | "source": [
1012 | "# find most similar n words to a given word\n",
1013 | "similar = w2v.similar_by_word(\"queen\", topn=10)\n",
1014 | "similar"
1015 | ]
1016 | },
1017 | {
1018 | "cell_type": "code",
1019 | "execution_count": 28,
1020 | "metadata": {
1021 | "collapsed": false,
1022 | "slideshow": {
1023 | "slide_type": "-"
1024 | }
1025 | },
1026 | "outputs": [
1027 | {
1028 | "data": {
1029 | "text/plain": [
1030 | "[('cat', 1.0),\n",
1031 | " ('dog', 0.8524122834205627),\n",
1032 | " ('puppy', 0.7896589040756226),\n",
1033 | " ('pug', 0.783139169216156),\n",
1034 | " ('critter', 0.7650502324104309),\n",
1035 | " ('squirrel', 0.7516598701477051),\n",
1036 | " ('feline', 0.7436362504959106),\n",
1037 | " ('gerbil', 0.7435644865036011),\n",
1038 | " ('monkey', 0.7434572577476501),\n",
1039 | " ('hamster', 0.7323285341262817)]"
1040 | ]
1041 | },
1042 | "execution_count": 28,
1043 | "metadata": {},
1044 | "output_type": "execute_result"
1045 | }
1046 | ],
1047 | "source": [
1048 | "# find most similar n words to a given vector\n",
1049 | "cat_vector = get_vector(\"cat\")\n",
1050 | "cat_sim = w2v.similar_by_vector(cat_vector, topn=10)\n",
1051 | "cat_sim"
1052 | ]
1053 | },
1054 | {
1055 | "cell_type": "markdown",
1056 | "metadata": {
1057 | "slideshow": {
1058 | "slide_type": "-"
1059 | }
1060 | },
1061 | "source": [
1062 | "#### Evaluation of word embeddings"
1063 | ]
1064 | },
1065 | {
1066 | "cell_type": "markdown",
1067 | "metadata": {
1068 | "slideshow": {
1069 | "slide_type": "-"
1070 | }
1071 | },
1072 | "source": [
1073 | "\n",
1074 | "https://arxiv.org/pdf/1301.3781v3.pdf"
1075 | ]
1076 | },
1077 | {
1078 | "cell_type": "markdown",
1079 | "metadata": {
1080 | "slideshow": {
1081 | "slide_type": "-"
1082 | }
1083 | },
1084 | "source": [
1085 | "\n",
1086 | "https://www.aclweb.org/anthology/N/N13/N13-1090.pdf"
1087 | ]
1088 | },
1089 | {
1090 | "cell_type": "markdown",
1091 | "metadata": {
1092 | "slideshow": {
1093 | "slide_type": "-"
1094 | }
1095 | },
1096 | "source": [
1097 | "\n",
1098 | "https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf"
1099 | ]
1100 | },
1101 | {
1102 | "cell_type": "markdown",
1103 | "metadata": {
1104 | "slideshow": {
1105 | "slide_type": "-"
1106 | }
1107 | },
1108 | "source": [
1109 | "\n",
1110 | "https://arxiv.org/pdf/1301.3781v3.pdf"
1111 | ]
1112 | },
1113 | {
1114 | "cell_type": "markdown",
1115 | "metadata": {
1116 | "slideshow": {
1117 | "slide_type": "-"
1118 | }
1119 | },
1120 | "source": [
1121 | "##### Analogies\n",
1122 | "\n",
1123 | "Built-in function in `gensim`: `most_similar(positive, negative, topn)`\n",
1124 | "\n",
1125 | "`A:B::C:??` --> `most_similar(positive=[B,C], negative=[A])`"
1126 | ]
1127 | },
1128 | {
1129 | "cell_type": "code",
1130 | "execution_count": 29,
1131 | "metadata": {
1132 | "collapsed": false,
1133 | "slideshow": {
1134 | "slide_type": "-"
1135 | }
1136 | },
1137 | "outputs": [],
1138 | "source": [
1139 | "def analogy_solver(A, B, C, topn=5):\n",
1140 | " \"\"\"\n",
1141 | " A:B::C:?\n",
1142 | " man:woman::king:???\n",
1143 | " most_similar(positive=[B,C], negative=[A])\n",
1144 | " \"\"\"\n",
1145 | " return w2v.most_similar(\n",
1146 | " positive=[B,C],\n",
1147 | " negative=[A],\n",
1148 | " topn=topn\n",
1149 | " )"
1150 | ]
1151 | },
1152 | {
1153 | "cell_type": "code",
1154 | "execution_count": 30,
1155 | "metadata": {
1156 | "collapsed": false,
1157 | "slideshow": {
1158 | "slide_type": "-"
1159 | }
1160 | },
1161 | "outputs": [
1162 | {
1163 | "name": "stdout",
1164 | "output_type": "stream",
1165 | "text": [
1166 | "[('queen', 0.6834795475006104), ('monarch', 0.6421915292739868), ('princess', 0.5896612405776978), ('beatrix', 0.5811704993247986), ('prince', 0.5663138031959534)]\n",
1167 | "\n",
1168 | "[('queen', 0.6834795475006104), ('monarch', 0.6421915292739868), ('princess', 0.5896612405776978), ('beatrix', 0.5811704993247986), ('prince', 0.5663138031959534)]\n",
1169 | "\n",
1170 | "[('sister', 0.8335152268409729), ('daughter', 0.8259485960006714), ('mother', 0.7856060266494751), ('grandmother', 0.7708373069763184), ('sisterinlaw', 0.7601062655448914)]\n",
1171 | "\n",
1172 | "[('sister', 0.8335152268409729), ('daughter', 0.8259485960006714), ('mother', 0.7856060266494751), ('grandmother', 0.7708373069763184), ('sisterinlaw', 0.7601062655448914)]\n"
1173 | ]
1174 | }
1175 | ],
1176 | "source": [
1177 | "# man:woman::king:???\n",
1178 | "# king - man + woman = ???\n",
1179 | "sol_1 = analogy_solver(\"man\", \"woman\", \"king\")\n",
1180 | "print(sol_1)\n",
1181 | "print()\n",
1182 | "\n",
1183 | "# man:king::woman:???\n",
1184 | "# woman - man + king = ???\n",
1185 | "sol_2 = analogy_solver(\"man\", \"king\", \"woman\")\n",
1186 | "print(sol_2)\n",
1187 | "print()\n",
1188 | "\n",
1189 | "# uncle:brother::aunt:???\n",
1190 | "# brother - uncle + aunt = ???\n",
1191 | "sol_3 = analogy_solver(\"uncle\", \"brother\", \"aunt\")\n",
1192 | "print(sol_3)\n",
1193 | "print()\n",
1194 | "\n",
1195 | "# uncle:aunt::brother:???\n",
1196 | "# aunt - uncle + brother = ???\n",
1197 | "sol_4 = analogy_solver(\"uncle\", \"aunt\", \"brother\")\n",
1198 | "print(sol_4)"
1199 | ]
1200 | },
1201 | {
1202 | "cell_type": "markdown",
1203 | "metadata": {
1204 | "slideshow": {
1205 | "slide_type": "-"
1206 | }
1207 | },
1208 | "source": [
1209 | "##### \"One of these words is not like the other\"\n",
1210 | "\n",
1211 | "`breakfast cereal dinner lunch`\n",
1212 | "\n",
1213 | "Built-in function in `gensim`: `doesnt_match(list_of_words)`"
1214 | ]
1215 | },
1216 | {
1217 | "cell_type": "code",
1218 | "execution_count": 31,
1219 | "metadata": {
1220 | "collapsed": false,
1221 | "slideshow": {
1222 | "slide_type": "-"
1223 | }
1224 | },
1225 | "outputs": [
1226 | {
1227 | "name": "stdout",
1228 | "output_type": "stream",
1229 | "text": [
1230 | "cereal\n",
1231 | "house\n"
1232 | ]
1233 | }
1234 | ],
1235 | "source": [
1236 | "# find which word doesn't match\n",
1237 | "list_of_words = \"breakfast cereal dinner lunch\"\n",
1238 | "doesnt_match = w2v.doesnt_match(list_of_words.split())\n",
1239 | "print(doesnt_match)\n",
1240 | "\n",
1241 | "list_of_words_2 = \"house dog pencil electrocute\"\n",
1242 | "doesnt_match_2 = w2v.doesnt_match(list_of_words_2.split())\n",
1243 | "print(doesnt_match_2)"
1244 | ]
1245 | },
1246 | {
1247 | "cell_type": "markdown",
1248 | "metadata": {
1249 | "slideshow": {
1250 | "slide_type": "-"
1251 | }
1252 | },
1253 | "source": [
1254 | "#### Word Embeddings and Antonyms"
1255 | ]
1256 | },
1257 | {
1258 | "cell_type": "code",
1259 | "execution_count": 32,
1260 | "metadata": {
1261 | "collapsed": false,
1262 | "slideshow": {
1263 | "slide_type": "-"
1264 | }
1265 | },
1266 | "outputs": [
1267 | {
1268 | "data": {
1269 | "text/plain": [
1270 | "[('bad', 0.7170573472976685),\n",
1271 | " ('terrific', 0.7161434888839722),\n",
1272 | " ('decent', 0.7018914222717285),\n",
1273 | " ('lousy', 0.6984266042709351),\n",
1274 | " ('wonderful', 0.6819486618041992),\n",
1275 | " ('perfect', 0.6481753587722778),\n",
1276 | " ('great', 0.6480209827423096),\n",
1277 | " ('nice', 0.6281204223632812),\n",
1278 | " ('darn', 0.623289942741394),\n",
1279 | " ('fun', 0.6176395416259766)]"
1280 | ]
1281 | },
1282 | "execution_count": 32,
1283 | "metadata": {},
1284 | "output_type": "execute_result"
1285 | }
1286 | ],
1287 | "source": [
1288 | "# this approach doesn't handle antonyms well\n",
1289 | "# \"That movie was _______.\"\n",
1290 | "\n",
1291 | "w2v.similar_by_word(\"good\", topn=10)"
1292 | ]
1293 | },
1294 | {
1295 | "cell_type": "markdown",
1296 | "metadata": {
1297 | "slideshow": {
1298 | "slide_type": "-"
1299 | }
1300 | },
1301 | "source": [
1302 | "#### Bias in Word Embeddings"
1303 | ]
1304 | },
1305 | {
1306 | "cell_type": "markdown",
1307 | "metadata": {
1308 | "slideshow": {
1309 | "slide_type": "-"
1310 | }
1311 | },
1312 | "source": [
1313 | "\n",
1314 | "\n",
1315 | "https://arxiv.org/pdf/1607.06520v1.pdf"
1316 | ]
1317 | },
1318 | {
1319 | "cell_type": "markdown",
1320 | "metadata": {
1321 | "slideshow": {
1322 | "slide_type": "subslide"
1323 | }
1324 | },
1325 | "source": [
1326 | "\n",
1327 | "https://arxiv.org/pdf/1607.06520v1.pdf"
1328 | ]
1329 | },
1330 | {
1331 | "cell_type": "code",
1332 | "execution_count": 33,
1333 | "metadata": {
1334 | "collapsed": false,
1335 | "slideshow": {
1336 | "slide_type": "-"
1337 | }
1338 | },
1339 | "outputs": [
1340 | {
1341 | "name": "stdout",
1342 | "output_type": "stream",
1343 | "text": [
1344 | "[('lathes', 0.581120491027832), ('typewriters', 0.5445051193237305), ('washing', 0.5365341305732727), ('machine', 0.5345758199691772), ('shoe', 0.5307853817939758)]\n",
1345 | "\n"
1346 | ]
1347 | }
1348 | ],
1349 | "source": [
1350 | "# she:sewing::he:???\n",
1351 | "bias_1 = analogy_solver(\"she\", \"sewing\", \"he\")\n",
1352 | "print(bias_1)\n",
1353 | "print()"
1354 | ]
1355 | },
1356 | {
1357 | "cell_type": "markdown",
1358 | "metadata": {
1359 | "slideshow": {
1360 | "slide_type": "-"
1361 | }
1362 | },
1363 | "source": [
1364 | "#### Links to available word embeddings\n",
1365 | "\n",
1366 | "[The \"original\" code for `word2vec`, and pre-trained vectors](https://code.google.com/archive/p/word2vec/)\n",
1367 | "\n",
1368 | "[Stanford's approach to word embeddings, and pre-trained vectors](http://nlp.stanford.edu/projects/glove/)\n",
1369 | "\n",
1370 | "[A modified approach to word embeddings (feeding dependency tuples to the neural network instead of words), and pre-trained vectors](https://levyomer.wordpress.com/2014/04/25/dependency-based-word-embeddings/)\n",
1371 | "\n",
1372 | "[Word embeddings from a particular historical period](http://nlp.stanford.edu/projects/histwords/)"
1373 | ]
1374 | },
1375 | {
1376 | "cell_type": "markdown",
1377 | "metadata": {
1378 | "slideshow": {
1379 | "slide_type": "-"
1380 | }
1381 | },
1382 | "source": [
1383 | "## Links to papers\n",
1384 | "\n",
1385 | "The \"original\" three papers on `word2vec` by Mikolov:\n",
1386 | "\n",
1387 | " - [Efficient Estimation of Word Representations in Vector Space](http://arxiv.org/pdf/1301.3781v3.pdf)\n",
1388 | "\n",
1389 | " - [Distributed Representations of Words and Phrases and their Compositionality](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf)\n",
1390 | "\n",
1391 | " - [Linguistic Regularities in Continuous Space Word Representations](https://www.aclweb.org/anthology/N/N13/N13-1090.pdf)\n",
1392 | "\n",
1393 | "\n",
1394 | "[Further analysis of approaches to word embeddings and their hyperparameters](https://transacl.org/ojs/index.php/tacl/article/viewFile/570/124)\n",
1395 | "\n",
1396 | "[Detailed evaluation of word embeddings](https://arxiv.org/pdf/1608.04207v1.pdf)\n",
1397 | "\n",
1398 | "[Website for evaluating word embeddings](http://veceval.com/)\n",
1399 | "\n"
1400 | ]
1401 | },
1402 | {
1403 | "cell_type": "markdown",
1404 | "metadata": {
1405 | "slideshow": {
1406 | "slide_type": "-"
1407 | }
1408 | },
1409 | "source": [
1410 | "## Links to blogs\n",
1411 | "\n",
1412 | "[A good overview of NLP](https://blog.monkeylearn.com/the-definitive-guide-to-natural-language-processing/)\n",
1413 | "\n",
1414 | "[Blog post summary of the three \"original\" papers by Mikolov](https://blog.acolyer.org/2016/04/21/the-amazing-power-of-word-vectors/)\n",
1415 | "\n",
1416 | "[Detailed blog post on the application of word embeddings to analogies](https://quomodocumque.wordpress.com/2016/01/15/messing-around-with-word2vec/)\n",
1417 | "\n",
1418 | "[Applying word embeddings to computer logs](https://gab41.lab41.org/three-things-we-learned-about-applying-word-vectors-to-computer-logs-c199070f390b#.k2mirf2oa)"
1419 | ]
1420 | }
1421 | ],
1422 | "metadata": {
1423 | "celltoolbar": "Slideshow",
1424 | "kernelspec": {
1425 | "display_name": "Python 3",
1426 | "language": "python",
1427 | "name": "python3"
1428 | },
1429 | "language_info": {
1430 | "codemirror_mode": {
1431 | "name": "ipython",
1432 | "version": 3
1433 | },
1434 | "file_extension": ".py",
1435 | "mimetype": "text/x-python",
1436 | "name": "python",
1437 | "nbconvert_exporter": "python",
1438 | "pygments_lexer": "ipython3",
1439 | "version": "3.5.2"
1440 | }
1441 | },
1442 | "nbformat": 4,
1443 | "nbformat_minor": 0
1444 | }
1445 |
--------------------------------------------------------------------------------
/images/NLP.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/NLP.png
--------------------------------------------------------------------------------
/images/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/architecture.png
--------------------------------------------------------------------------------
/images/architecture_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/architecture_2.png
--------------------------------------------------------------------------------
/images/context.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/context.png
--------------------------------------------------------------------------------
/images/cos_sim.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/cos_sim.png
--------------------------------------------------------------------------------
/images/cos_sim_compare.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/cos_sim_compare.png
--------------------------------------------------------------------------------
/images/country_capital.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/country_capital.png
--------------------------------------------------------------------------------
/images/distance_measures.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/distance_measures.png
--------------------------------------------------------------------------------
/images/eval_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/eval_1.png
--------------------------------------------------------------------------------
/images/eval_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/eval_2.png
--------------------------------------------------------------------------------
/images/gender_bias.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/gender_bias.png
--------------------------------------------------------------------------------
/images/king_queen.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/king_queen.png
--------------------------------------------------------------------------------
/images/king_queen_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/king_queen_2.png
--------------------------------------------------------------------------------
/images/king_queen_vis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/king_queen_vis.png
--------------------------------------------------------------------------------
/images/normalize.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/normalize.jpg
--------------------------------------------------------------------------------
/images/one_hot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/one_hot.png
--------------------------------------------------------------------------------
/images/programmer_homemaker.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/programmer_homemaker.png
--------------------------------------------------------------------------------
/images/unit_circle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/unit_circle.png
--------------------------------------------------------------------------------
/images/vectorize.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/vectorize.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4
2 | gensim==1.0.1
3 | nltk==3.4.5
4 | scikit-learn==0.23.2
5 | numpy
6 | jupyter
7 |
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | from nltk import word_tokenize
3 | import numpy as np
4 |
5 | # loading data
6 |
7 |
8 | def load_data(path_to_data):
9 | """
10 |     Loads a `.tsv` of data into a list of tuples
11 |     Ensures that `.html` has been removed
12 |     :param path_to_data: full/path/to/data
13 |     :return: list of ([id], [label], [text]) tuples
14 | """
15 | out_ = []
16 | with open(path_to_data, "r") as f:
17 | for line in f:
18 | # parse line
19 | line_split = line.rstrip().split("\t")
20 | if len(line_split) != 3:
21 | continue
22 | id = line_split[0]
23 | label = line_split[1]
24 | raw_text = line_split[2]
25 | # ensure html is removed
26 | text = BeautifulSoup(raw_text, "html.parser").get_text()
27 | out_.append((id, label, text))
28 | return out_
29 |
30 |
31 | def get_all_docs(list_of_tuples):
32 | """
33 |     Given a list of (id, label, text) tuples, this will collect all the text into one list
34 |     :param list_of_tuples: Output of load_data()
35 |     :return: list of documents, lookup_dict
36 | """
37 | all_docs = []
38 | lookup = {}
39 | for i in range(len(list_of_tuples)):
40 | current = list_of_tuples[i]
41 | all_docs.append(current[2])
42 | lookup[i] = current[2]
43 | return all_docs, lookup
44 |
45 |
46 | # calculations
47 |
48 | def normalize_vector(vector):
49 | """
50 |     Normalizes a vector to unit length (an L2 norm of 1)
51 | :param vector: a `numpy` vector
52 | :return: a normalized `numpy` vector
53 | """
54 | # norm = np.sqrt(vector.dot(vector))
55 | # numpy has a built in function
56 | norm = np.linalg.norm(vector)
57 | if norm:
58 | return vector / norm
59 | else:
60 | # if norm == 0, then original vector was all 0s
61 | return vector
62 |
63 |
64 | def cos_sim(vector_one, vector_two):
65 | """
66 | Calculate the cosine similarity of two `numpy` vectors
67 | :param vector_one: a `numpy` vector
68 | :param vector_two: a `numpy` vector
69 |     :return: a score between -1 and 1 (between 0 and 1 for non-negative vectors such as tf-idf)
70 | """
71 | # ensure that both vectors are already normalized
72 | vector_one_norm = normalize_vector(vector_one)
73 | vector_two_norm = normalize_vector(vector_two)
74 |
75 | # calculate the dot product between the two normalized vectors
76 | return vector_one_norm.dot(vector_two_norm)
77 |
78 |
79 | def generate_all_cos_sim(X_matrix):
80 | """
81 | Generates a matrix of cosine similarities for a set of documents
82 | WARNING: this is too computationally expensive for a python notebook. Run in console.
83 | :param X_matrix: dense `numpy` matrix: num_documents (d) x words_in_vocabulary (v)
84 | :return: dense `numpy` matrix d x d
85 | """
86 | # ensure matrix is dense
87 | if "sparse" in str(type(X_matrix)):
88 | X_matrix = X_matrix.toarray()
89 | # get shape
90 | X_shape = X_matrix.shape
91 | size = X_shape[0]
92 | # build empty matrix
93 | cos_matrix = np.zeros((size, size))
94 | # iterate through rows
95 | for i in range(size):
96 | for j in range(size):
97 | if i != j:
98 | print(i,j)
99 | # calculate cosine similarity
100 | cos_matrix[i][j] = cos_sim(X_matrix[i], X_matrix[j])
101 | else:
102 |                 # set diagonal to NaN (numpy stores None as nan in a float array)
103 | cos_matrix[i][j] = None
104 | return cos_matrix
105 |
106 |
107 | def get_similar(cos_sim_matrix, idx, n, direction="most"):
108 | """
109 |     Finds the n most (or least) similar documents to a given document
110 |     :param cos_sim_matrix: `numpy` dense array of num_documents x num_documents with values as cosine similarity
111 |     :param idx: index of the document to find similar documents for
112 |     :param n: number of most similar indices to return
113 |     :param direction: "most" or "least" for top or bottom of ranked list
114 |     :return: list of (idx, cos_sim) tuples
115 | """
116 | if direction != "most" and direction != "least":
117 |         raise Exception("choose `most` or `least` for `direction`")
118 |     # enumerate *before* filtering so indices still refer to the original documents
119 |     values = [(i, v) for i, v in enumerate(cos_sim_matrix[idx]) if not np.isnan(v)]
120 |     # sort by cosine similarity: descending for "most", ascending for "least"
121 |     all_values = sorted(values, key=lambda x: x[1], reverse=(direction == "most"))
122 |     # return the top n (index, cosine_similarity) pairs
123 |     return all_values[:n]
124 |
125 |
126 | # I/O
127 |
128 | def save_matrix_to_csv(X_matrix, save_location):
129 | """
130 | Saves a matrix to csv
131 | :param X_matrix: dense `numpy` array
132 | :param save_location: full/path/to/desired/location.csv
133 | """
134 | # ensure matrix is dense
135 | if "sparse" in str(type(X_matrix)):
136 | X_matrix = X_matrix.toarray()
137 | np.savetxt(save_location, X_matrix, delimiter=",")
138 |
139 |
140 | def load_matrix_from_csv(location):
141 | """
142 | Loads a matrix from csv
143 | :param location: full/path/to/location.csv
144 | :return: dense `numpy` array
145 | """
146 | return np.loadtxt(location, delimiter=",")
147 |
148 |
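149 | # Minimal usage sketch of the helpers above (illustrative only; it is guarded by
150 | # __main__ so it never runs on import, and the toy vectors/matrix below are made up
151 | # purely for demonstration)
152 | if __name__ == "__main__":
153 |     a = np.array([1.0, 2.0, 3.0])
154 |     b = np.array([2.0, 4.0, 6.0])  # a scaled copy of `a`
155 |     # a normalized vector has an L2 norm of 1
156 |     print(np.linalg.norm(normalize_vector(a)))  # -> 1.0
157 |     # vectors pointing in the same direction have cosine similarity ~1
158 |     print(cos_sim(a, b))  # -> ~1.0
159 |     # orthogonal vectors have cosine similarity 0
160 |     print(cos_sim(np.array([1.0, 0.0]), np.array([0.0, 1.0])))  # -> 0.0
161 |     # a tiny 3-document count matrix to exercise the document-similarity helpers
162 |     X = np.array([[1.0, 0.0, 2.0],
163 |                   [1.0, 1.0, 2.0],
164 |                   [0.0, 3.0, 0.0]])
165 |     cos_matrix = generate_all_cos_sim(X)
166 |     # the two most similar documents to document 0, as (index, cosine_similarity)
167 |     print(get_similar(cos_matrix, idx=0, n=2, direction="most"))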
--------------------------------------------------------------------------------