├── .dockerignore ├── .gitignore ├── Dockerfile ├── Document-Embeddings_Demo.ipynb ├── Gigaword_pruned_vectors.txt.gz ├── README.md ├── Word-Embeddings_Demo.ipynb ├── images ├── NLP.png ├── architecture.png ├── architecture_2.png ├── context.png ├── cos_sim.png ├── cos_sim_compare.png ├── country_capital.png ├── distance_measures.png ├── eval_1.png ├── eval_2.png ├── gender_bias.png ├── king_queen.png ├── king_queen_2.png ├── king_queen_vis.png ├── normalize.jpg ├── one_hot.png ├── programmer_homemaker.png ├── unit_circle.png └── vectorize.png ├── movie_reviews.tsv ├── requirements.txt ├── tfidf_cos_matrix.csv └── utils.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # IPython Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # dotenv 80 | .env 81 | 82 | # virtualenv 83 | venv/ 84 | ENV/ 85 | 86 | # Spyder project settings 87 | .spyderproject 88 | 89 | # Rope project settings 90 | .ropeproject 91 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # backups 2 | *~ 3 | *.swp 4 | .DS_Store 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | env/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *,cover 50 | .hypothesis/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # IPython Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | venv/ 87 | ENV/ 88 | 89 | # Spyder project settings 90 | .spyderproject 91 | 92 | # Rope project settings 93 | .ropeproject 94 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM jupyter/minimal-notebook 2 | 3 | WORKDIR /code 4 | COPY requirements.txt /code 5 | RUN pip install -r /code/requirements.txt 6 | COPY . /code 7 | -------------------------------------------------------------------------------- /Document-Embeddings_Demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false, 8 | "slideshow": { 9 | "slide_type": "-" 10 | } 11 | }, 12 | "outputs": [ 13 | { 14 | "data": { 15 | "text/plain": [ 16 | "4" 17 | ] 18 | }, 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "output_type": "execute_result" 22 | } 23 | ], 24 | "source": [ 25 | "# using Jupyter notebooks\n", 26 | "# pushing CTRL-c will run the code in a cell\n", 27 | "2 + 2" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": { 33 | "slideshow": { 34 | "slide_type": "-" 35 | } 36 | }, 37 | "source": [ 38 | "# Gentle Introduction to NLP through Document Embeddings" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": { 44 | "slideshow": { 45 | "slide_type": "-" 46 | } 47 | }, 48 | "source": [ 49 | "### Quick Review of Last Time\n", 50 | "* Cosine Similarity\n", 51 | "\n", 52 | "### Two Approaches to Embedding Documents\n", 53 | "* Sparse, bag-of-words embeddings\n", 54 | " - Count embeddings\n", 55 | " - TFIDF embeddings\n", 56 | "* Dense embeddings" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": { 62 | "slideshow": { 63 | "slide_type": "-" 64 | } 65 | }, 66 | "source": [ 67 | "![NLP](images/NLP.png)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": { 73 | "slideshow": { 74 | "slide_type": "-" 75 | } 76 | }, 77 | "source": [ 78 | "## From Last Time" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": { 84 | "slideshow": { 85 | "slide_type": "-" 86 | } 87 | }, 88 | "source": [ 89 | "![distance](images/distance_measures.png)\n", 90 | "http://dataaspirant.com/2015/04/11/five-most-popular-similarity-measures-implementation-in-python/" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": { 96 | "slideshow": { 97 | "slide_type": "-" 98 | } 99 | }, 100 | "source": [ 101 | "![cos_sim](images/cos_sim.png)\n", 102 | "http://dataaspirant.com/2015/04/11/five-most-popular-similarity-measures-implementation-in-python/" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | 
"metadata": { 108 | "slideshow": { 109 | "slide_type": "-" 110 | } 111 | }, 112 | "source": [ 113 | "### calculating dot product\n", 114 | "$vector_a = [1,2,3]$
\n", 115 | "$vector_b = [4,5,6]$
\n", 116 | "$vector_a \\cdot vector_b = (1*4) + (2*5) + (3*6) = 4 + 10 + 18 = 32$ " 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": { 122 | "slideshow": { 123 | "slide_type": "-" 124 | } 125 | }, 126 | "source": [ 127 | "### normalizing a vector\n", 128 | "To normalize a vector, we shrink all values so they fall between $0$ and $1$.\n", 129 | "\n", 130 | "![normalize](images/normalize.jpg)\n", 131 | "http://www.wikihow.com/Normalize-a-Vector" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 2, 137 | "metadata": { 138 | "collapsed": true, 139 | "slideshow": { 140 | "slide_type": "-" 141 | } 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "import numpy as np\n", 146 | "import utils\n", 147 | "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 3, 153 | "metadata": { 154 | "collapsed": true, 155 | "slideshow": { 156 | "slide_type": "-" 157 | } 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "def normalize_vector(vector):\n", 162 | " \"\"\"\n", 163 | " Normalizes a vector so that all its values are between 0 and 1\n", 164 | " :param vector: a `numpy` vector\n", 165 | " :return: a normalized `numpy` vector\n", 166 | " \"\"\"\n", 167 | " # norm = np.sqrt(vector.dot(vector))\n", 168 | " # numpy has a built in function\n", 169 | " norm = np.linalg.norm(vector)\n", 170 | " if norm:\n", 171 | " return vector / norm\n", 172 | " else:\n", 173 | " # if norm == 0, then original vector was all 0s\n", 174 | " return vector" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 4, 180 | "metadata": { 181 | "collapsed": false, 182 | "slideshow": { 183 | "slide_type": "-" 184 | } 185 | }, 186 | "outputs": [ 187 | { 188 | "name": "stdout", 189 | "output_type": "stream", 190 | "text": [ 191 | "original vector [1 2 4]\n", 192 | "normalized vector [ 0.21821789 0.43643578 0.87287156]\n" 193 | ] 194 | } 195 | ], 196 | "source": [ 197 | "vector_3d = np.array([1,2,4])\n", 198 | "print(\"original vector\", vector_3d)\n", 199 | "print(\"normalized vector\", normalize_vector(vector_3d))\n", 200 | "#0.218 is 1/4th of .873 just like 1 is 1/4th of 4" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 5, 206 | "metadata": { 207 | "collapsed": true, 208 | "slideshow": { 209 | "slide_type": "-" 210 | } 211 | }, 212 | "outputs": [], 213 | "source": [ 214 | "def cos_sim(vector_one, vector_two):\n", 215 | " \"\"\"\n", 216 | " Calculate the cosine similarity of two `numpy` vectors\n", 217 | " :param vector_one: a `numpy` vector\n", 218 | " :param vector_two: a `numpy` vector\n", 219 | " :return: A score between 0 and 1\n", 220 | " \"\"\"\n", 221 | " # ensure that both vectors are already normalized\n", 222 | " vector_one_norm = normalize_vector(vector_one)\n", 223 | " vector_two_norm = normalize_vector(vector_two)\n", 224 | " \n", 225 | " # calculate the dot product between the two normalized vectors\n", 226 | " return vector_one_norm.dot(vector_two_norm)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 6, 232 | "metadata": { 233 | "collapsed": false, 234 | "slideshow": { 235 | "slide_type": "-" 236 | } 237 | }, 238 | "outputs": [ 239 | { 240 | "name": "stdout", 241 | "output_type": "stream", 242 | "text": [ 243 | "cosine similarity of vector_one and vector_two 0.948683298051\n", 244 | "cosine similarity of vector_one and vector_three 0.904534033733\n", 245 | "cosine similarity of 
vector_one and vector_four 0.904534033733\n" 246 | ] 247 | } 248 | ], 249 | "source": [ 250 | "vector_one = np.array([1,1,1,1,1])\n", 251 | "vector_two = np.array([1,1,1,1,2])\n", 252 | "vector_three = np.array([1,2,3,4,5])\n", 253 | "vector_four = np.array([10,20,30,40,50])\n", 254 | "\n", 255 | "print(\"cosine similarity of vector_one and vector_two\", cos_sim(vector_one, vector_two))\n", 256 | "print(\"cosine similarity of vector_one and vector_three\", cos_sim(vector_one, vector_three))\n", 257 | "print(\"cosine similarity of vector_one and vector_four\", cos_sim(vector_one, vector_four))" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": { 263 | "slideshow": { 264 | "slide_type": "-" 265 | } 266 | }, 267 | "source": [ 268 | "### Interpreting \"Similarity\"\n", 269 | "![cos_sim_compare](images/cos_sim_compare.png)\n", 270 | "https://medium.com/@camrongodbout/creating-a-search-engine-f2f429cab33c#.z7i9w8y5t" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": { 276 | "slideshow": { 277 | "slide_type": "-" 278 | } 279 | }, 280 | "source": [ 281 | "![vectorize](images/vectorize.png)" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": { 287 | "slideshow": { 288 | "slide_type": "-" 289 | } 290 | }, 291 | "source": [ 292 | "## Embedding a Document \n", 293 | "### Bag of Words\n", 294 | "#### Count Vectorizing" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": { 300 | "slideshow": { 301 | "slide_type": "-" 302 | } 303 | }, 304 | "source": [ 305 | "![bag_of_words](images/bag_of_words_vis.png)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": { 311 | "slideshow": { 312 | "slide_type": "-" 313 | } 314 | }, 315 | "source": [ 316 | "![bag_of_words_count](images/bag_of_words_count_matrix.png)" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": { 322 | "slideshow": { 323 | "slide_type": "-" 324 | } 325 | }, 326 | "source": [ 327 | "## Embedding a Document \n", 328 | "### Bag of Words\n", 329 | "#### TFIDF Vectorizing\n", 330 | "`TFIDF` = `term frequency, inverse document frequency`" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": { 336 | "slideshow": { 337 | "slide_type": "-" 338 | } 339 | }, 340 | "source": [ 341 | "![tfidf_rationale](images/tfidf_rationale.png)" 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "metadata": { 347 | "slideshow": { 348 | "slide_type": "-" 349 | } 350 | }, 351 | "source": [ 352 | "![doc_freq_vis](images/document_frequency_vis.png)" 353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "metadata": { 358 | "slideshow": { 359 | "slide_type": "-" 360 | } 361 | }, 362 | "source": [ 363 | "![tfidf_matrix](images/tfidf_matrix.png)" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": { 369 | "slideshow": { 370 | "slide_type": "-" 371 | } 372 | }, 373 | "source": [ 374 | "![tfidf_matrix_decimal](images/tfidf_matrix_decimal.png)" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": { 380 | "slideshow": { 381 | "slide_type": "-" 382 | } 383 | }, 384 | "source": [ 385 | "![bop](images/bags_of_popcorn.png)" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 7, 391 | "metadata": { 392 | "collapsed": false, 393 | "slideshow": { 394 | "slide_type": "-" 395 | } 396 | }, 397 | "outputs": [], 398 | "source": [ 399 | "# load reviews\n", 400 | "reviews_dict = utils.load_data(\"movie_reviews.tsv\")\n", 401 | "all_docs, lookup = 
utils.get_all_docs(reviews_dict)" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": 8, 407 | "metadata": { 408 | "collapsed": false, 409 | "slideshow": { 410 | "slide_type": "-" 411 | } 412 | }, 413 | "outputs": [ 414 | { 415 | "data": { 416 | "text/plain": [ 417 | "'\"With all this stuff going down at the moment with MJ i\\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\\'s feeling towards the press and also the obvious message of drugs are bad m\\'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.The actual feature film bit when it finally starts is only on for 20 minutes or so excluding the Smooth Criminal sequence and Joe Pesci is convincing as a psychopathic all powerful drug lord. Why he wants MJ dead so bad is beyond me. Because MJ overheard his plans? Nah, Joe Pesci\\'s character ranted that he wanted people to know it is he who is supplying drugs etc so i dunno, maybe he just hates MJ\\'s music.Lots of cool things in this like MJ turning into a car and a robot and the whole Speed Demon sequence. Also, the director must have had the patience of a saint when it came to filming the kiddy Bad sequence as usually directors hate working with one kid let alone a whole bunch of them performing a complex dance scene.Bottom line, this movie is for people who like MJ on one level or another (which i think is most people). If not, then stay away. It does try and give off a wholesome message and ironically MJ\\'s bestest buddy in this movie is a girl! Michael Jackson is truly one of the most talented people ever to grace this planet but is he guilty? Well, with all the attention i\\'ve gave this subject....hmmm well i don\\'t know because people can be different behind closed doors, i know this for a fact. He is either an extremely nice but stupid guy or one of the most sickest liars. I hope he is not the latter.\"'" 418 | ] 419 | }, 420 | "execution_count": 8, 421 | "metadata": {}, 422 | "output_type": "execute_result" 423 | } 424 | ], 425 | "source": [ 426 | "# `all docs` is a list of all documents\n", 427 | "all_docs[0]" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": 9, 433 | "metadata": { 434 | "collapsed": false, 435 | "slideshow": { 436 | "slide_type": "-" 437 | } 438 | }, 439 | "outputs": [ 440 | { 441 | "data": { 442 | "text/plain": [ 443 | "'\"With all this stuff going down at the moment with MJ i\\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. 
Some of it has subtle messages about MJ\\'s feeling towards the press and also the obvious message of drugs are bad m\\'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.The actual feature film bit when it finally starts is only on for 20 minutes or so excluding the Smooth Criminal sequence and Joe Pesci is convincing as a psychopathic all powerful drug lord. Why he wants MJ dead so bad is beyond me. Because MJ overheard his plans? Nah, Joe Pesci\\'s character ranted that he wanted people to know it is he who is supplying drugs etc so i dunno, maybe he just hates MJ\\'s music.Lots of cool things in this like MJ turning into a car and a robot and the whole Speed Demon sequence. Also, the director must have had the patience of a saint when it came to filming the kiddy Bad sequence as usually directors hate working with one kid let alone a whole bunch of them performing a complex dance scene.Bottom line, this movie is for people who like MJ on one level or another (which i think is most people). If not, then stay away. It does try and give off a wholesome message and ironically MJ\\'s bestest buddy in this movie is a girl! Michael Jackson is truly one of the most talented people ever to grace this planet but is he guilty? Well, with all the attention i\\'ve gave this subject....hmmm well i don\\'t know because people can be different behind closed doors, i know this for a fact. He is either an extremely nice but stupid guy or one of the most sickest liars. I hope he is not the latter.\"'" 444 | ] 445 | }, 446 | "execution_count": 9, 447 | "metadata": {}, 448 | "output_type": "execute_result" 449 | } 450 | ], 451 | "source": [ 452 | "# `lookup` is a lookup dict with {idx: text}\n", 453 | "lookup[0]" 454 | ] 455 | }, 456 | { 457 | "cell_type": "markdown", 458 | "metadata": { 459 | "slideshow": { 460 | "slide_type": "-" 461 | } 462 | }, 463 | "source": [ 464 | "### Using `scikit-learn`\n", 465 | "\n", 466 | "[CountVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)
\n", 467 | "[TfidfVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 10, 473 | "metadata": { 474 | "collapsed": false, 475 | "slideshow": { 476 | "slide_type": "-" 477 | } 478 | }, 479 | "outputs": [ 480 | { 481 | "data": { 482 | "text/plain": [ 483 | "<999x18373 sparse matrix of type ''\n", 484 | "\twith 137082 stored elements in Compressed Sparse Row format>" 485 | ] 486 | }, 487 | "execution_count": 10, 488 | "metadata": {}, 489 | "output_type": "execute_result" 490 | } 491 | ], 492 | "source": [ 493 | "# Count\n", 494 | "# call the vectorizer\n", 495 | "cv = CountVectorizer(\n", 496 | " analyzer='word', # 'char', 'char_wb'\n", 497 | " ngram_range=(1,1), # unigrams and bigrams ==> (1, 2)\n", 498 | " stop_words=None, # 'english' \n", 499 | " max_df=1.0, # float ==> percentage, int ==> raw count\n", 500 | " min_df=1, # float ==> percentage, int ==> raw count\n", 501 | " binary=False # True\n", 502 | ") \n", 503 | "# run fit_transform on the of documents\n", 504 | "X_cv = cv.fit_transform(all_docs)\n", 505 | "X_cv" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": 11, 511 | "metadata": { 512 | "collapsed": false 513 | }, 514 | "outputs": [ 515 | { 516 | "data": { 517 | "text/plain": [ 518 | "<999x18373 sparse matrix of type ''\n", 519 | "\twith 137082 stored elements in Compressed Sparse Row format>" 520 | ] 521 | }, 522 | "execution_count": 11, 523 | "metadata": {}, 524 | "output_type": "execute_result" 525 | } 526 | ], 527 | "source": [ 528 | "# TFIDF\n", 529 | "# call the vectorizer\n", 530 | "tv = TfidfVectorizer(\n", 531 | " analyzer='word', # 'char'\n", 532 | " ngram_range=(1,1), # unigrams and bigrams ==> (1, 2)\n", 533 | " stop_words=None, # 'english' \n", 534 | " max_df=1.0, # float ==> percentage, int ==> raw count\n", 535 | " min_df=1, # float ==> percentage, int ==> raw count\n", 536 | ")\n", 537 | "# run fit_transform on the of documents\n", 538 | "X_tv = tv.fit_transform(all_docs)\n", 539 | "X_tv" 540 | ] 541 | }, 542 | { 543 | "cell_type": "markdown", 544 | "metadata": {}, 545 | "source": [ 546 | "The first dimension (`999` rows) indicates the number of documents we're processing.\n", 547 | "\n", 548 | "The second dimension (columns) indicates the number of features we're processing. This will increase/decrease depending on the `n-gram` parameter." 549 | ] 550 | }, 551 | { 552 | "cell_type": "markdown", 553 | "metadata": {}, 554 | "source": [ 555 | "![bag_of_words_count](images/bag_of_words_count_matrix.png)" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": 31, 561 | "metadata": { 562 | "collapsed": false, 563 | "slideshow": { 564 | "slide_type": "-" 565 | } 566 | }, 567 | "outputs": [], 568 | "source": [ 569 | "# see the vocabulary\n", 570 | "cv_vocab = cv.get_feature_names()\n", 571 | "# see the nonzero features (e.g. 
words, bigrams, character-grams) \n", 572 | "# for each row of data\n", 573 | "cv_words_per_doc = cv.inverse_transform(X_cv)\n", 574 | "tv_words_per_doc = cv.inverse_transform(X_tv)" 575 | ] 576 | }, 577 | { 578 | "cell_type": "code", 579 | "execution_count": 32, 580 | "metadata": { 581 | "collapsed": false 582 | }, 583 | "outputs": [ 584 | { 585 | "data": { 586 | "text/plain": [ 587 | "array(['latter', 'hope', 'liars', 'sickest', 'stupid', 'extremely',\n", 588 | " 'either', 'fact', 'doors', 'closed', 'behind', 'different', 'be',\n", 589 | " 'can', 'don'], \n", 590 | " dtype=''\n", 1165 | "\twith 137082 stored elements in Compressed Sparse Row format>" 1166 | ] 1167 | }, 1168 | "execution_count": 29, 1169 | "metadata": {}, 1170 | "output_type": "execute_result" 1171 | } 1172 | ], 1173 | "source": [ 1174 | "X_cv" 1175 | ] 1176 | }, 1177 | { 1178 | "cell_type": "markdown", 1179 | "metadata": {}, 1180 | "source": [ 1181 | "![bag_of_words_problem](images/bag_of_words_problem.png)" 1182 | ] 1183 | }, 1184 | { 1185 | "cell_type": "markdown", 1186 | "metadata": { 1187 | "slideshow": { 1188 | "slide_type": "-" 1189 | } 1190 | }, 1191 | "source": [ 1192 | "#### Problems with Bag-of-words:\n", 1193 | "\n", 1194 | " - same concepts, different words don't appear similar\n", 1195 | " - sparse matrix the size of *vocabulary*\n", 1196 | " - two different sentences, same embedding\n", 1197 | " \n", 1198 | "### So can we do better?" 1199 | ] 1200 | }, 1201 | { 1202 | "cell_type": "markdown", 1203 | "metadata": { 1204 | "slideshow": { 1205 | "slide_type": "-" 1206 | } 1207 | }, 1208 | "source": [ 1209 | "## Embedding a Document\n", 1210 | "### Neural Networks" 1211 | ] 1212 | }, 1213 | { 1214 | "cell_type": "markdown", 1215 | "metadata": { 1216 | "collapsed": true, 1217 | "slideshow": { 1218 | "slide_type": "-" 1219 | } 1220 | }, 1221 | "source": [ 1222 | "![recurrent](images/recurrent.png)\n", 1223 | "http://colah.github.io/posts/2015-08-Understanding-LSTMs/" 1224 | ] 1225 | }, 1226 | { 1227 | "cell_type": "markdown", 1228 | "metadata": { 1229 | "collapsed": true, 1230 | "slideshow": { 1231 | "slide_type": "-" 1232 | } 1233 | }, 1234 | "source": [ 1235 | "![cnn](images/cnn.png)\n", 1236 | "http://www.wildml.com/2015/11/understanding-convolutional-neural-networks-for-nlp/" 1237 | ] 1238 | }, 1239 | { 1240 | "cell_type": "markdown", 1241 | "metadata": { 1242 | "collapsed": true, 1243 | "slideshow": { 1244 | "slide_type": "-" 1245 | } 1246 | }, 1247 | "source": [ 1248 | "![dan](images/dan.png)\n", 1249 | "https://cs.umd.edu/~miyyer/pubs/2015_acl_dan.pdf" 1250 | ] 1251 | }, 1252 | { 1253 | "cell_type": "markdown", 1254 | "metadata": { 1255 | "slideshow": { 1256 | "slide_type": "-" 1257 | } 1258 | }, 1259 | "source": [ 1260 | "![bow](images/bag_of_words_performance.png)" 1261 | ] 1262 | }, 1263 | { 1264 | "cell_type": "markdown", 1265 | "metadata": {}, 1266 | "source": [ 1267 | "![seq2seq](images/seq2seq.png)\n", 1268 | "http://www.wildml.com/2016/04/deep-learning-for-chatbots-part-1-introduction/" 1269 | ] 1270 | }, 1271 | { 1272 | "cell_type": "markdown", 1273 | "metadata": {}, 1274 | "source": [ 1275 | "![image_captioning](images/image_captioning.png)\n", 1276 | "https://www.researchgate.net/profile/Y_Bengio/publication/277411157_Deep_Learning/links/55e0cdf908ae2fac471ccf0f/Deep-Learning.pdf" 1277 | ] 1278 | }, 1279 | { 1280 | "cell_type": "markdown", 1281 | "metadata": { 1282 | "slideshow": { 1283 | "slide_type": "-" 1284 | } 1285 | }, 1286 | "source": [ 1287 | "## Resources\n", 1288 | "[Stanford IR book, 
online](http://nlp.stanford.edu/IR-book/html/htmledition/)
\n", 1289 | "[Bag of Words Meets Bags of Popcorn (Kaggle)](https://www.kaggle.com/c/word2vec-nlp-tutorial)
\n", 1290 | "[Neural Networks for NLP](https://arxiv.org/pdf/1510.00726.pdf)
\n", 1291 | "[Blog about LSTM's](http://colah.github.io/posts/2015-08-Understanding-LSTMs/)
\n", 1292 | "[Blog about CNN's](http://www.wildml.com/2015/11/understanding-convolutional-neural-networks-for-nlp/)
\n", 1293 | "[Examples of Recurrent Neural Networks](http://karpathy.github.io/2015/05/21/rnn-effectiveness/) \n", 1294 | "[Recent Talk by C. Manning about Embedding Words and Documents](https://simons.berkeley.edu/talks/christopher-manning-2017-3-27)" 1295 | ] 1296 | } 1297 | ], 1298 | "metadata": { 1299 | "kernelspec": { 1300 | "display_name": "Python 3", 1301 | "language": "python", 1302 | "name": "python3" 1303 | }, 1304 | "language_info": { 1305 | "codemirror_mode": { 1306 | "name": "ipython", 1307 | "version": 3 1308 | }, 1309 | "file_extension": ".py", 1310 | "mimetype": "text/x-python", 1311 | "name": "python", 1312 | "nbconvert_exporter": "python", 1313 | "pygments_lexer": "ipython3", 1314 | "version": "3.5.2" 1315 | } 1316 | }, 1317 | "nbformat": 4, 1318 | "nbformat_minor": 0 1319 | } 1320 | -------------------------------------------------------------------------------- /Gigaword_pruned_vectors.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/Gigaword_pruned_vectors.txt.gz -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NLP Basics 2 | 3 | ## Preparation 4 | 5 | You can clone this repository: 6 | 7 | ``` 8 | git@github.com:michaelcapizzi/nlp-basics.git 9 | ``` 10 | 11 | ## Docker install 12 | 13 | You can run the Jupyter notebooks in this repository with [Docker](http://docs.docker.com/installation) by running 14 | 15 | ``` 16 | % docker build -t michaelcapizzi/nlp-basics . 17 | % docker run -p 8888:8888 --rm -it michaelcapizzi/nlp-basics # to start a Jupyter notebook server 18 | [I 19:41:11.459 NotebookApp] Writing notebook server cookie secret to /home/jovyan/.local/share/jupyter/runtime/notebook_cookie_secret 19 | [W 19:41:11.591 NotebookApp] Widgets are unavailable. Please install widgetsnbextension or ipywidgets 4.0 20 | [W 19:41:11.598 NotebookApp] WARNING: The notebook server is listening on all IP addresses and not using encryption. This is not recommended. 21 | [I 19:41:11.742 NotebookApp] JupyterLab alpha preview extension loaded from /opt/conda/lib/python3.5/site-packages/jupyterlab 22 | [I 19:41:11.802 NotebookApp] Serving notebooks from local directory: /code 23 | [I 19:41:11.802 NotebookApp] 0 active kernels 24 | [I 19:41:11.802 NotebookApp] The Jupyter Notebook is running at: http://[all ip addresses on your system]:8888/?token=f6925975b83f14758e79c55f81f1bec1267300747d5d6b08 25 | [I 19:41:11.802 NotebookApp] Use Control-C to stop this server and shut down all kernels (twice to skip confirmation). 26 | [C 19:41:11.803 NotebookApp] 27 | 28 | Copy/paste this URL into your browser when you connect for the first time, 29 | to login with a token: 30 | http://localhost:8888/?token=f6925975b83f14758e79c55f81f1bec1267300747d5d6b08 31 | ``` 32 | 33 | Your specific token will be different. 
34 | 35 | ## Manual pip / virtualenv install 36 | 37 | To run the `jupyter` notebook you'll need a `python` environment for `python 3` with the following requirements: 38 | 39 | - jupyter 40 | - gensim 41 | - sklearn 42 | - numpy 43 | - beautifulsoup4 44 | 45 | All of these can be installed via `pip` or using the `requirements.txt` file: 46 | 47 | ``` 48 | pip install -r requirements.txt 49 | ``` 50 | 51 | Then to open the notebook, simply run the following in the root folder of the cloned project: 52 | 53 | ``` 54 | jupyter notebook 55 | ``` 56 | 57 | This will open a new window in your default browser. You can then open the notebook file of choice (ending in `.ipynb`) by clicking on it. 58 | 59 | It will open in a new window. 60 | 61 | You can edit a given cell by clicking on it. To run the cell, push `CTRL-c`. 62 | 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /Word-Embeddings_Demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false, 8 | "slideshow": { 9 | "slide_type": "-" 10 | } 11 | }, 12 | "outputs": [ 13 | { 14 | "data": { 15 | "text/plain": [ 16 | "4" 17 | ] 18 | }, 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "output_type": "execute_result" 22 | } 23 | ], 24 | "source": [ 25 | "# using Jupyter notebooks\n", 26 | "# pushing CTRL-c will run the code in a cell\n", 27 | "2 + 2" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": { 33 | "slideshow": { 34 | "slide_type": "-" 35 | } 36 | }, 37 | "source": [ 38 | "# Gentle Introduction to NLP through Word Embeddings" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": { 44 | "slideshow": { 45 | "slide_type": "-" 46 | } 47 | }, 48 | "source": [ 49 | "![NLP](images/NLP.png)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": { 55 | "slideshow": { 56 | "slide_type": "-" 57 | } 58 | }, 59 | "source": [ 60 | "# How To Tell If Two Words Are \"Similar\"?" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": { 66 | "slideshow": { 67 | "slide_type": "-" 68 | } 69 | }, 70 | "source": [ 71 | "![distance](images/distance_measures.png)\n", 72 | "http://dataaspirant.com/2015/04/11/five-most-popular-similarity-measures-implementation-in-python/" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": { 78 | "slideshow": { 79 | "slide_type": "-" 80 | } 81 | }, 82 | "source": [ 83 | "# Cosine Similarity" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": { 89 | "collapsed": true, 90 | "slideshow": { 91 | "slide_type": "-" 92 | } 93 | }, 94 | "source": [ 95 | "![cos_sim](images/cos_sim.png)\n", 96 | "http://dataaspirant.com/2015/04/11/five-most-popular-similarity-measures-implementation-in-python/" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": { 102 | "slideshow": { 103 | "slide_type": "-" 104 | } 105 | }, 106 | "source": [ 107 | "## calculating dot product\n", 108 | "$vector_a = [1,2,3]$
\n", 109 | "$vector_b = [4,5,6]$
\n", 110 | "$vector_a \\cdot vector_b = (1*4) + (2*5) + (3*6) = 4 + 10 + 18 = 32$ " 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": { 116 | "slideshow": { 117 | "slide_type": "-" 118 | } 119 | }, 120 | "source": [ 121 | "## Normalizing a Vector" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": { 127 | "slideshow": { 128 | "slide_type": "-" 129 | } 130 | }, 131 | "source": [ 132 | "To normalize a vector, we shrink all values so they fall between $0$ and $1$.\n", 133 | "\n", 134 | "$vector_{normalized} = \\frac{vector}{\\sqrt{vector \\cdot vector}}$ \n", 135 | "![normalize](images/normalize.jpg)\n", 136 | "\n", 137 | "http://www.wikihow.com/Normalize-a-Vector" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": { 143 | "slideshow": { 144 | "slide_type": "-" 145 | } 146 | }, 147 | "source": [ 148 | "![unit_circle](images/unit_circle.png)\n", 149 | "https://en.wikipedia.org/wiki/Unit_vector" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 2, 155 | "metadata": { 156 | "collapsed": true, 157 | "slideshow": { 158 | "slide_type": "-" 159 | } 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "import numpy as np\n", 164 | "from nltk.corpus import wordnet\n", 165 | "from collections import OrderedDict\n", 166 | "from itertools import combinations\n", 167 | "import string\n", 168 | "from gensim import models" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 3, 174 | "metadata": { 175 | "collapsed": true, 176 | "slideshow": { 177 | "slide_type": "-" 178 | } 179 | }, 180 | "outputs": [], 181 | "source": [ 182 | "def normalize_vector(vector):\n", 183 | " \"\"\"\n", 184 | " Normalizes a vector so that all its values are between 0 and 1\n", 185 | " :param vector: a `numpy` vector\n", 186 | " :return: a normalized `numpy` vector\n", 187 | " \"\"\"\n", 188 | " # norm = np.sqrt(vector.dot(vector))\n", 189 | " # numpy has a built in function\n", 190 | " norm = np.linalg.norm(vector)\n", 191 | " if norm:\n", 192 | " return vector / norm\n", 193 | " else:\n", 194 | " # if norm == 0, then original vector was all 0s\n", 195 | " return vector" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 4, 201 | "metadata": { 202 | "collapsed": false, 203 | "slideshow": { 204 | "slide_type": "-" 205 | } 206 | }, 207 | "outputs": [ 208 | { 209 | "name": "stdout", 210 | "output_type": "stream", 211 | "text": [ 212 | "original vector [1 2 4]\n", 213 | "normalized vector [ 0.21821789 0.43643578 0.87287156]\n" 214 | ] 215 | } 216 | ], 217 | "source": [ 218 | "vector_3d = np.array([1,2,4])\n", 219 | "print(\"original vector\", vector_3d)\n", 220 | "print(\"normalized vector\", normalize_vector(vector_3d))\n", 221 | "#0.218 is 1/4th of .873 just like 1 is 1/4th of 4" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": { 227 | "slideshow": { 228 | "slide_type": "-" 229 | } 230 | }, 231 | "source": [ 232 | "## Calculating Cosine Similarity" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 5, 238 | "metadata": { 239 | "collapsed": true, 240 | "slideshow": { 241 | "slide_type": "-" 242 | } 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "def cos_sim(vector_one, vector_two):\n", 247 | " \"\"\"\n", 248 | " Calculate the cosine similarity of two `numpy` vectors\n", 249 | " :param vector_one: a `numpy` vector\n", 250 | " :param vector_two: a `numpy` vector\n", 251 | " :return: A score between 0 and 1\n", 252 | " \"\"\"\n", 253 
| " # ensure that both vectors are already normalized\n", 254 | " vector_one_norm = normalize_vector(vector_one)\n", 255 | " vector_two_norm = normalize_vector(vector_two)\n", 256 | " \n", 257 | " # calculate the dot product between the two normalized vectors\n", 258 | " return vector_one_norm.dot(vector_two_norm)" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 6, 264 | "metadata": { 265 | "collapsed": false, 266 | "slideshow": { 267 | "slide_type": "-" 268 | } 269 | }, 270 | "outputs": [ 271 | { 272 | "name": "stdout", 273 | "output_type": "stream", 274 | "text": [ 275 | "cosine similarity of vector_one and vector_two 0.948683298051\n", 276 | "cosine similarity of vector_one and vector_three 0.904534033733\n", 277 | "cosine similarity of vector_one and vector_four 0.904534033733\n" 278 | ] 279 | } 280 | ], 281 | "source": [ 282 | "vector_one = np.array([1,1,1,1,1])\n", 283 | "vector_two = np.array([1,1,1,1,2])\n", 284 | "vector_three = np.array([1,2,3,4,5])\n", 285 | "vector_four = np.array([10,20,30,40,50])\n", 286 | "\n", 287 | "print(\"cosine similarity of vector_one and vector_two\", cos_sim(vector_one, vector_two))\n", 288 | "print(\"cosine similarity of vector_one and vector_three\", cos_sim(vector_one, vector_three))\n", 289 | "print(\"cosine similarity of vector_one and vector_four\", cos_sim(vector_one, vector_four))" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": { 295 | "slideshow": { 296 | "slide_type": "-" 297 | } 298 | }, 299 | "source": [ 300 | "## Measuring the \"Similarity\" of Words" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": { 306 | "slideshow": { 307 | "slide_type": "-" 308 | } 309 | }, 310 | "source": [ 311 | "![cos_sim_compare](images/cos_sim_compare.png)\n", 312 | "https://medium.com/@camrongodbout/creating-a-search-engine-f2f429cab33c#.z7i9w8y5t" 313 | ] 314 | }, 315 | { 316 | "cell_type": "markdown", 317 | "metadata": { 318 | "slideshow": { 319 | "slide_type": "-" 320 | } 321 | }, 322 | "source": [ 323 | "![vectorize](images/vectorize.png)" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": { 329 | "slideshow": { 330 | "slide_type": "-" 331 | } 332 | }, 333 | "source": [ 334 | "### Option 1: One-hot vectors\n", 335 | "\n", 336 | "![one_hot](images/one_hot.png)\n", 337 | "https://blog.acolyer.org/2016/04/21/the-amazing-power-of-word-vectors/" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 7, 343 | "metadata": { 344 | "collapsed": true, 345 | "slideshow": { 346 | "slide_type": "-" 347 | } 348 | }, 349 | "outputs": [], 350 | "source": [ 351 | "vocabulary = ['apple', 'banana', 'orange', 'cantaloupe', 'peach']" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 8, 357 | "metadata": { 358 | "collapsed": false, 359 | "slideshow": { 360 | "slide_type": "-" 361 | } 362 | }, 363 | "outputs": [], 364 | "source": [ 365 | "# generate vocabulary lookup\n", 366 | "def build_voc_lookup(list_of_voc):\n", 367 | " \"\"\"\n", 368 | " Generates a dictionary where the key is the word and the value is its index\n", 369 | " :param list_of_voc: list of vocabulary words\n", 370 | " :return: Dictionary of vocabulary\n", 371 | " \"\"\"\n", 372 | " lookup_dict = OrderedDict()\n", 373 | " counter = 0\n", 374 | " for word in list_of_voc:\n", 375 | " lookup_dict[word] = counter\n", 376 | " counter+=1\n", 377 | " return lookup_dict" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 9, 383 | "metadata": { 
384 | "collapsed": true, 385 | "slideshow": { 386 | "slide_type": "-" 387 | } 388 | }, 389 | "outputs": [], 390 | "source": [ 391 | "# lookup word\n", 392 | "def lookup_word(lookup_dict, word):\n", 393 | " \"\"\" \n", 394 | " Looks up a given word in the vocabulary dictionary, and returns None if word not in vocabulary\n", 395 | " :param lookup_dict: lookup-dictionary built with build_voc_lookup()\n", 396 | " :param word to index\n", 397 | " :return: index of word in vocabulary or None\n", 398 | " \"\"\"\n", 399 | " if word in lookup_dict:\n", 400 | " return lookup_dict[word]\n", 401 | " else:\n", 402 | " return None" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 10, 408 | "metadata": { 409 | "collapsed": false, 410 | "slideshow": { 411 | "slide_type": "-" 412 | } 413 | }, 414 | "outputs": [ 415 | { 416 | "name": "stdout", 417 | "output_type": "stream", 418 | "text": [ 419 | "4\n", 420 | "None\n" 421 | ] 422 | } 423 | ], 424 | "source": [ 425 | "lookup_dict = build_voc_lookup(vocabulary)\n", 426 | "print(lookup_word(lookup_dict, 'peach'))\n", 427 | "print(lookup_word(lookup_dict, 'hashbrown'))" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": 11, 433 | "metadata": { 434 | "collapsed": false, 435 | "slideshow": { 436 | "slide_type": "-" 437 | } 438 | }, 439 | "outputs": [], 440 | "source": [ 441 | "# build one-hot vector for word\n", 442 | "def make_one_hot(lookup_dict, word):\n", 443 | " \"\"\"\n", 444 | " Builds a one-hot numpy vector for a word\n", 445 | " :param lookup_dict: lookup-dictionary built with build_voc_lookup()\n", 446 | " :param word: word to convert to one-hot\n", 447 | " :return numpy vector with dimension equal to size of vocabulary\n", 448 | " \"\"\"\n", 449 | " # get size of vocabulary\n", 450 | " voc_size = len(lookup_dict.items())\n", 451 | " # initialize empty vector of zeros with the size of the vocabulary\n", 452 | " one_hot = np.zeros((voc_size))\n", 453 | " # get index of word (or None if not in vocabulary)\n", 454 | " word_index = lookup_word(lookup_dict, word)\n", 455 | " # make the nth dimension of one-hot (representing the index of word in vocabulary) to 1\n", 456 | " if word_index or word_index == 0:\n", 457 | " one_hot[word_index] = 1\n", 458 | " # if word not in vocabulary, the one-hot will remain zeros\n", 459 | " return one_hot" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": 12, 465 | "metadata": { 466 | "collapsed": false, 467 | "slideshow": { 468 | "slide_type": "-" 469 | } 470 | }, 471 | "outputs": [ 472 | { 473 | "name": "stdout", 474 | "output_type": "stream", 475 | "text": [ 476 | "one-hot vector for ' apple' [ 1. 0. 0. 0. 0.]\n", 477 | "one-hot vector for ' banana' [ 0. 1. 0. 0. 0.]\n", 478 | "one-hot vector for ' orange' [ 0. 0. 1. 0. 0.]\n", 479 | "one-hot vector for ' cantaloupe' [ 0. 0. 0. 1. 0.]\n", 480 | "one-hot vector for ' peach' [ 0. 0. 0. 0. 1.]\n", 481 | "one-hot vector for ' hashbrown' [ 0. 0. 0. 0. 0.]\n", 482 | "one-hot vector for ' Capizzi' [ 0. 0. 0. 0. 
0.]\n" 483 | ] 484 | } 485 | ], 486 | "source": [ 487 | "for word in vocabulary + ['hashbrown', 'Capizzi']:\n", 488 | " print(\"one-hot vector for '{:>11}'\".format(word), make_one_hot(lookup_dict, word))" 489 | ] 490 | }, 491 | { 492 | "cell_type": "markdown", 493 | "metadata": { 494 | "slideshow": { 495 | "slide_type": "-" 496 | } 497 | }, 498 | "source": [ 499 | "#### The problem with one-hot vectors" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": 13, 505 | "metadata": { 506 | "collapsed": false, 507 | "slideshow": { 508 | "slide_type": "-" 509 | } 510 | }, 511 | "outputs": [ 512 | { 513 | "name": "stdout", 514 | "output_type": "stream", 515 | "text": [ 516 | "cosine similarity between apple and banana 0.0\n", 517 | "cosine similarity between apple and orange 0.0\n", 518 | "cosine similarity between apple and cantaloupe 0.0\n", 519 | "cosine similarity between apple and peach 0.0\n", 520 | "cosine similarity between apple and Phoenix 0.0\n", 521 | "cosine similarity between banana and orange 0.0\n", 522 | "cosine similarity between banana and cantaloupe 0.0\n", 523 | "cosine similarity between banana and peach 0.0\n", 524 | "cosine similarity between banana and Phoenix 0.0\n", 525 | "cosine similarity between orange and cantaloupe 0.0\n", 526 | "cosine similarity between orange and peach 0.0\n", 527 | "cosine similarity between orange and Phoenix 0.0\n", 528 | "cosine similarity between cantaloupe and peach 0.0\n", 529 | "cosine similarity between cantaloupe and Phoenix 0.0\n", 530 | "cosine similarity between peach and Phoenix 0.0\n" 531 | ] 532 | } 533 | ], 534 | "source": [ 535 | "# add an OOV word to vocabulary\n", 536 | "vocabulary_plus_oov = vocabulary + [\"Phoenix\"]\n", 537 | "# get all combinations\n", 538 | "all_combinations = combinations(vocabulary_plus_oov, 2)\n", 539 | "# iterate through all combinations and calculate cosine similarity\n", 540 | "for (word1, word2) in all_combinations:\n", 541 | " one_hot_word_1 = make_one_hot(lookup_dict, word1)\n", 542 | " one_hot_word_2 = make_one_hot(lookup_dict, word2)\n", 543 | " print(\"cosine similarity between {} and {}\".format(word1, word2), cos_sim(one_hot_word_1, one_hot_word_2))" 544 | ] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "metadata": { 549 | "slideshow": { 550 | "slide_type": "-" 551 | } 552 | }, 553 | "source": [ 554 | "### Option 2: Encode spelling\n", 555 | "Following a similar pattern as the one-hot of a word over a vocabulary, let's build word vectors represented by the frequency of the letters present" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": 14, 561 | "metadata": { 562 | "collapsed": true, 563 | "slideshow": { 564 | "slide_type": "-" 565 | } 566 | }, 567 | "outputs": [], 568 | "source": [ 569 | "alphabet = list(string.ascii_lowercase)" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": 15, 575 | "metadata": { 576 | "collapsed": false, 577 | "slideshow": { 578 | "slide_type": "-" 579 | } 580 | }, 581 | "outputs": [], 582 | "source": [ 583 | "# since we don't need to worry about \"out-of-vocabulary\" now, we can just use alphabet.index([letter])\n", 584 | "def lookup_letter(letter):\n", 585 | " return alphabet.index(letter.lower())" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": 16, 591 | "metadata": { 592 | "collapsed": false, 593 | "slideshow": { 594 | "slide_type": "-" 595 | } 596 | }, 597 | "outputs": [ 598 | { 599 | "name": "stdout", 600 | "output_type": "stream", 601 | "text": 
[ 602 | "a 0\n", 603 | "A 0\n" 604 | ] 605 | } 606 | ], 607 | "source": [ 608 | "print(\"a\", lookup_letter('a'))\n", 609 | "print(\"A\", lookup_letter('A'))" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": 17, 615 | "metadata": { 616 | "collapsed": false, 617 | "slideshow": { 618 | "slide_type": "-" 619 | } 620 | }, 621 | "outputs": [], 622 | "source": [ 623 | "def make_spelling_vector(word):\n", 624 | " \"\"\"\n", 625 | " Converts a word into a vector of dimension 26 where each cell contains the count for that letter\n", 626 | " :param word: word to vectorize\n", 627 | " :return: numpy vector of 26 dimensions\n", 628 | " \"\"\"\n", 629 | " # initialize vector with zeros\n", 630 | " spelling_vector = np.zeros((26))\n", 631 | " # iterate through each letter and update count\n", 632 | " for letter in word:\n", 633 | " if letter in string.ascii_letters:\n", 634 | " letter_index = lookup_letter(letter)\n", 635 | " spelling_vector[letter_index] = spelling_vector[letter_index] + 1\n", 636 | " return spelling_vector" 637 | ] 638 | }, 639 | { 640 | "cell_type": "code", 641 | "execution_count": 18, 642 | "metadata": { 643 | "collapsed": false, 644 | "slideshow": { 645 | "slide_type": "-" 646 | } 647 | }, 648 | "outputs": [ 649 | { 650 | "data": { 651 | "text/plain": [ 652 | "array([ 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0.,\n", 653 | " 0., 0., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])" 654 | ] 655 | }, 656 | "execution_count": 18, 657 | "metadata": {}, 658 | "output_type": "execute_result" 659 | } 660 | ], 661 | "source": [ 662 | "make_spelling_vector(\"apple\")" 663 | ] 664 | }, 665 | { 666 | "cell_type": "code", 667 | "execution_count": 19, 668 | "metadata": { 669 | "collapsed": true, 670 | "slideshow": { 671 | "slide_type": "-" 672 | } 673 | }, 674 | "outputs": [], 675 | "source": [ 676 | "vocabulary = [\"apple\", \"banana\", \"orange\", \"cantaloupe\", \"peach\", \"Phoenix\"]" 677 | ] 678 | }, 679 | { 680 | "cell_type": "code", 681 | "execution_count": 20, 682 | "metadata": { 683 | "collapsed": false, 684 | "slideshow": { 685 | "slide_type": "-" 686 | } 687 | }, 688 | "outputs": [ 689 | { 690 | "name": "stdout", 691 | "output_type": "stream", 692 | "text": [ 693 | "cosine similarity between apple and banana 0.303045763366\n", 694 | "cosine similarity between apple and orange 0.308606699924\n", 695 | "cosine similarity between apple and cantaloupe 0.654653670708\n", 696 | "cosine similarity between apple and peach 0.676123403783\n", 697 | "cosine similarity between apple and Phoenix 0.428571428571\n", 698 | "cosine similarity between banana and orange 0.54554472559\n", 699 | "cosine similarity between banana and cantaloupe 0.617213399848\n", 700 | "cosine similarity between banana and peach 0.3585685828\n", 701 | "cosine similarity between banana and Phoenix 0.20203050891\n", 702 | "cosine similarity between orange and cantaloupe 0.589255650989\n", 703 | "cosine similarity between orange and peach 0.36514837167\n", 704 | "cosine similarity between orange and Phoenix 0.462910049886\n", 705 | "cosine similarity between cantaloupe and peach 0.645497224368\n", 706 | "cosine similarity between cantaloupe and Phoenix 0.436435780472\n", 707 | "cosine similarity between peach and Phoenix 0.507092552837\n" 708 | ] 709 | } 710 | ], 711 | "source": [ 712 | "# reset the generator\n", 713 | "all_combinations = combinations(vocabulary, 2)\n", 714 | "# iterate through all words\n", 715 | "for (word1, word2) in all_combinations:\n", 716 | " spelling_vector_1 = 
make_spelling_vector(word1)\n", 717 | " spelling_vector_2 = make_spelling_vector(word2)\n", 718 | " print(\"cosine similarity between {} and {}\".format(word1, word2), cos_sim(spelling_vector_1, spelling_vector_2))" 719 | ] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "execution_count": 21, 724 | "metadata": { 725 | "collapsed": false, 726 | "slideshow": { 727 | "slide_type": "-" 728 | } 729 | }, 730 | "outputs": [ 731 | { 732 | "data": { 733 | "text/plain": [ 734 | "True" 735 | ] 736 | }, 737 | "execution_count": 21, 738 | "metadata": {}, 739 | "output_type": "execute_result" 740 | } 741 | ], 742 | "source": [ 743 | "# what if two words share the same letters?\n", 744 | "dog = make_spelling_vector(\"dog\")\n", 745 | "god = make_spelling_vector(\"God\")\n", 746 | "# god == dog\n", 747 | "np.all(god == dog)" 748 | ] 749 | }, 750 | { 751 | "cell_type": "markdown", 752 | "metadata": { 753 | "slideshow": { 754 | "slide_type": "-" 755 | } 756 | }, 757 | "source": [ 758 | "#### We've successfully generated similarity scores! But...\n", 759 | "\n", 760 | "Do they really reflect anything semantic? \n", 761 | "\n", 762 | "In other words, does it make sense that **\"peach\"** and **\"Phoenix\"**
\n", 763 | "(`cosine similarity = 0.507`)
\n", 764 | "are **more** similar than **\"peach\"** and **\"orange\"**
\n", 765 | "(`cosine similarity = .365`)?" 766 | ] 767 | }, 768 | { 769 | "cell_type": "markdown", 770 | "metadata": { 771 | "slideshow": { 772 | "slide_type": "-" 773 | } 774 | }, 775 | "source": [ 776 | "### Option 3: Word Embeddings\n", 777 | "Create a \"dense\" representation of each word where proximity in vector space represents \"similarity\"." 778 | ] 779 | }, 780 | { 781 | "cell_type": "markdown", 782 | "metadata": { 783 | "slideshow": { 784 | "slide_type": "-" 785 | } 786 | }, 787 | "source": [ 788 | "![context](images/context.png)\n", 789 | "https://blog.acolyer.org/2016/04/21/the-amazing-power-of-word-vectors/" 790 | ] 791 | }, 792 | { 793 | "cell_type": "markdown", 794 | "metadata": { 795 | "slideshow": { 796 | "slide_type": "-" 797 | } 798 | }, 799 | "source": [ 800 | "![architecture](images/architecture.png)\n", 801 | "https://arxiv.org/pdf/1301.3781v3.pdf" 802 | ] 803 | }, 804 | { 805 | "cell_type": "markdown", 806 | "metadata": { 807 | "slideshow": { 808 | "slide_type": "-" 809 | } 810 | }, 811 | "source": [ 812 | "![cbow](images/cbow.png)\n", 813 | "https://blog.acolyer.org/2016/04/21/the-amazing-power-of-word-vectors/" 814 | ] 815 | }, 816 | { 817 | "cell_type": "markdown", 818 | "metadata": { 819 | "slideshow": { 820 | "slide_type": "-" 821 | } 822 | }, 823 | "source": [ 824 | "#### Using the `gensim` package in `python`\n", 825 | "https://radimrehurek.com/gensim/models/word2vec.html" 826 | ] 827 | }, 828 | { 829 | "cell_type": "code", 830 | "execution_count": 22, 831 | "metadata": { 832 | "collapsed": false, 833 | "slideshow": { 834 | "slide_type": "-" 835 | } 836 | }, 837 | "outputs": [], 838 | "source": [ 839 | "# load existing word2vec vectors into gensim\n", 840 | "\n", 841 | "# most frequent 125k words in Gigaword corpus\n", 842 | "w2v = models.Word2Vec.load_word2vec_format(fname=\"Gigaword_pruned_vectors.txt.gz\", binary=False)\n", 843 | "\n", 844 | "# original `word2vec` embeddings can be downloaded here:\n", 845 | "# https://code.google.com/archive/p/word2vec/" 846 | ] 847 | }, 848 | { 849 | "cell_type": "markdown", 850 | "metadata": { 851 | "slideshow": { 852 | "slide_type": "-" 853 | } 854 | }, 855 | "source": [ 856 | "Pre-trained word embeddings can be loaded into `gensim` in `.txt` or `.txt.gz` format *as long as* the first line identifies (1) the number of words in file and (2) the dimensions of the vector\n", 857 | " \n", 858 | "```\n", 859 | "199999 200\n", 860 | "and -0.065843 -0.133472 0.020263 0.102796 0.003295 0.025878 -0.071714 0.054211 -0.026698 -0.036176 -0.024954 0.042049 -0.165819 -0.067038 0.117293 0.046338 0.012154 0.026929 -0.020248 0.120186 0.081922 0.062471 -0.063391 -0.048321 -0.108106 -0.067974 0.092109 -0.034439 -0.024319 0.008799 -0.099953\n", 861 | "...\n", 862 | "```" 863 | ] 864 | }, 865 | { 866 | "cell_type": "code", 867 | "execution_count": 23, 868 | "metadata": { 869 | "collapsed": false, 870 | "slideshow": { 871 | "slide_type": "-" 872 | } 873 | }, 874 | "outputs": [ 875 | { 876 | "data": { 877 | "text/plain": [ 878 | "array([ 0.06338 , -0.146809 , 0.110004 , -0.01205 , -0.045637 ,\n", 879 | " -0.02224 , -0.045153 , 0.079144 , -0.027216 , -0.027647 ,\n", 880 | " -0.000434 , 0.108648 , -0.060456 , -0.129502 , 0.010897 ,\n", 881 | " 0.055499 , 0.086099 , 0.055282 , 0.007365 , 0.167188 ,\n", 882 | " 0.016705 , 0.0744 , -0.07096 , -0.105974 , -0.095631 ,\n", 883 | " 0.006107 , 0.12862299, -0.033055 , -0.020641 , 0.024765 ,\n", 884 | " -0.048181 , -0.090195 , 0.007408 , 0.073138 , 0.031994 ,\n", 885 | " -0.014252 , 0.102764 , 
-0.081244 , 0.10513 , 0.039809 ,\n", 886 | " -0.050727 , 0.002429 , -0.01506 , -0.085081 , -0.02245 ,\n", 887 | " 0.102064 , -0.009099 , -0.092295 , -0.040276 , 0.148752 ], dtype=float32)" 888 | ] 889 | }, 890 | "execution_count": 23, 891 | "metadata": {}, 892 | "output_type": "execute_result" 893 | } 894 | ], 895 | "source": [ 896 | "# the first 50 dimensions of the vector for \"the\"\n", 897 | "w2v[\"the\"][0:50]" 898 | ] 899 | }, 900 | { 901 | "cell_type": "code", 902 | "execution_count": 24, 903 | "metadata": { 904 | "collapsed": false, 905 | "slideshow": { 906 | "slide_type": "subslide" 907 | } 908 | }, 909 | "outputs": [ 910 | { 911 | "ename": "KeyError", 912 | "evalue": "'abcdef'", 913 | "output_type": "error", 914 | "traceback": [ 915 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 916 | "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", 917 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mw2v\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"abcdef\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 918 | "\u001b[0;32m/Users/mcapizzi/miniconda3/envs/word-embedding/lib/python3.5/site-packages/gensim/models/word2vec.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, words)\u001b[0m\n\u001b[1;32m 1502\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwords\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstring_types\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1503\u001b[0m \u001b[0;31m# allow calls like trained_model['office'], as a shorthand for trained_model[['office']]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1504\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msyn0\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvocab\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mwords\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1505\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1506\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mvstack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msyn0\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvocab\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mword\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mword\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mwords\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 919 | "\u001b[0;31mKeyError\u001b[0m: 'abcdef'" 920 | ] 921 | } 922 | ], 923 | "source": [ 924 | "w2v[\"abcdef\"]" 925 | ] 926 | }, 927 | { 928 | "cell_type": "code", 929 | "execution_count": 25, 930 | "metadata": { 931 | "collapsed": false, 932 | "slideshow": { 933 | "slide_type": "subslide" 934 | } 935 | }, 936 | "outputs": [], 937 | "source": [ 938 | "def get_vector(word):\n", 939 | " \"\"\"\n", 940 | " Returns the word vector for that word or a vector of 0s for out-of-vocabulary\n", 941 | " :param: word: word to lookup in vectors\n", 942 | " :return: vector or vector of zeros\n", 943 | " \"\"\"\n", 944 | " # determine vector length\n", 945 | " w2v_length = len(w2v[\"the\"])\n", 946 | " # get vector\n", 
947 | " if word in w2v:\n", 948 | " return w2v[word]\n", 949 | " else:\n", 950 | " return np.zeros((w2v_length))" 951 | ] 952 | }, 953 | { 954 | "cell_type": "code", 955 | "execution_count": 26, 956 | "metadata": { 957 | "collapsed": false, 958 | "slideshow": { 959 | "slide_type": "-" 960 | } 961 | }, 962 | "outputs": [ 963 | { 964 | "data": { 965 | "text/plain": [ 966 | "array([ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 967 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 968 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 969 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])" 970 | ] 971 | }, 972 | "execution_count": 26, 973 | "metadata": {}, 974 | "output_type": "execute_result" 975 | } 976 | ], 977 | "source": [ 978 | "get_vector(\"abcdef\")[0:50]" 979 | ] 980 | }, 981 | { 982 | "cell_type": "code", 983 | "execution_count": 27, 984 | "metadata": { 985 | "collapsed": false, 986 | "slideshow": { 987 | "slide_type": "-" 988 | } 989 | }, 990 | "outputs": [ 991 | { 992 | "data": { 993 | "text/plain": [ 994 | "[('monarch', 0.7166919708251953),\n", 995 | " ('princess', 0.7164901494979858),\n", 996 | " ('margrethe', 0.6889792680740356),\n", 997 | " ('beatrix', 0.6878944039344788),\n", 998 | " ('coronation', 0.6789792776107788),\n", 999 | " ('prince', 0.6730599403381348),\n", 1000 | " ('wilhelmina', 0.6619384288787842),\n", 1001 | " ('mettemarit', 0.6575925946235657),\n", 1002 | " ('consort', 0.6492267847061157),\n", 1003 | " ('duchess', 0.6444146633148193)]" 1004 | ] 1005 | }, 1006 | "execution_count": 27, 1007 | "metadata": {}, 1008 | "output_type": "execute_result" 1009 | } 1010 | ], 1011 | "source": [ 1012 | "# find most similar n words to a given word\n", 1013 | "similar = w2v.similar_by_word(\"queen\", topn=10)\n", 1014 | "similar" 1015 | ] 1016 | }, 1017 | { 1018 | "cell_type": "code", 1019 | "execution_count": 28, 1020 | "metadata": { 1021 | "collapsed": false, 1022 | "slideshow": { 1023 | "slide_type": "-" 1024 | } 1025 | }, 1026 | "outputs": [ 1027 | { 1028 | "data": { 1029 | "text/plain": [ 1030 | "[('cat', 1.0),\n", 1031 | " ('dog', 0.8524122834205627),\n", 1032 | " ('puppy', 0.7896589040756226),\n", 1033 | " ('pug', 0.783139169216156),\n", 1034 | " ('critter', 0.7650502324104309),\n", 1035 | " ('squirrel', 0.7516598701477051),\n", 1036 | " ('feline', 0.7436362504959106),\n", 1037 | " ('gerbil', 0.7435644865036011),\n", 1038 | " ('monkey', 0.7434572577476501),\n", 1039 | " ('hamster', 0.7323285341262817)]" 1040 | ] 1041 | }, 1042 | "execution_count": 28, 1043 | "metadata": {}, 1044 | "output_type": "execute_result" 1045 | } 1046 | ], 1047 | "source": [ 1048 | "# find most similar n words to a given vector\n", 1049 | "cat_vector = get_vector(\"cat\")\n", 1050 | "cat_sim = w2v.similar_by_vector(cat_vector, topn=10)\n", 1051 | "cat_sim" 1052 | ] 1053 | }, 1054 | { 1055 | "cell_type": "markdown", 1056 | "metadata": { 1057 | "slideshow": { 1058 | "slide_type": "-" 1059 | } 1060 | }, 1061 | "source": [ 1062 | "#### Evaluation of word embeddings" 1063 | ] 1064 | }, 1065 | { 1066 | "cell_type": "markdown", 1067 | "metadata": { 1068 | "slideshow": { 1069 | "slide_type": "-" 1070 | } 1071 | }, 1072 | "source": [ 1073 | "![king_queen](images/king_queen.png)\n", 1074 | "https://arxiv.org/pdf/1301.3781v3.pdf" 1075 | ] 1076 | }, 1077 | { 1078 | "cell_type": "markdown", 1079 | "metadata": { 1080 | "slideshow": { 1081 | "slide_type": "-" 1082 | } 1083 | }, 1084 | "source": [ 1085 | "![king_queen_vis](images/king_queen_vis.png)\n", 1086 | 
"https://www.aclweb.org/anthology/N/N13/N13-1090.pdf" 1087 | ] 1088 | }, 1089 | { 1090 | "cell_type": "markdown", 1091 | "metadata": { 1092 | "slideshow": { 1093 | "slide_type": "-" 1094 | } 1095 | }, 1096 | "source": [ 1097 | "![country_capital](images/country_capital.png)\n", 1098 | "https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf" 1099 | ] 1100 | }, 1101 | { 1102 | "cell_type": "markdown", 1103 | "metadata": { 1104 | "slideshow": { 1105 | "slide_type": "-" 1106 | } 1107 | }, 1108 | "source": [ 1109 | "![eval_1](images/eval_1.png)\n", 1110 | "https://arxiv.org/pdf/1301.3781v3.pdf" 1111 | ] 1112 | }, 1113 | { 1114 | "cell_type": "markdown", 1115 | "metadata": { 1116 | "slideshow": { 1117 | "slide_type": "-" 1118 | } 1119 | }, 1120 | "source": [ 1121 | "##### Analogies\n", 1122 | "\n", 1123 | "Built-in function in `gensim`: `most_similar(positive, negative, topn)`\n", 1124 | "\n", 1125 | "`A:B::C:??` --> `most_similar(positive=[B,C], negative=A)`" 1126 | ] 1127 | }, 1128 | { 1129 | "cell_type": "code", 1130 | "execution_count": 29, 1131 | "metadata": { 1132 | "collapsed": false, 1133 | "slideshow": { 1134 | "slide_type": "-" 1135 | } 1136 | }, 1137 | "outputs": [], 1138 | "source": [ 1139 | "def analogy_solver(A, B, C, topn=5):\n", 1140 | " \"\"\"\n", 1141 | " A:B::C:?\n", 1142 | " man:woman::king:???\n", 1143 | " most_similar(positive=[B,C], negative=[A])\n", 1144 | " \"\"\"\n", 1145 | " return w2v.most_similar(\n", 1146 | " positive=[B,C],\n", 1147 | " negative=[A],\n", 1148 | " topn=topn\n", 1149 | " )" 1150 | ] 1151 | }, 1152 | { 1153 | "cell_type": "code", 1154 | "execution_count": 30, 1155 | "metadata": { 1156 | "collapsed": false, 1157 | "slideshow": { 1158 | "slide_type": "-" 1159 | } 1160 | }, 1161 | "outputs": [ 1162 | { 1163 | "name": "stdout", 1164 | "output_type": "stream", 1165 | "text": [ 1166 | "[('queen', 0.6834795475006104), ('monarch', 0.6421915292739868), ('princess', 0.5896612405776978), ('beatrix', 0.5811704993247986), ('prince', 0.5663138031959534)]\n", 1167 | "\n", 1168 | "[('queen', 0.6834795475006104), ('monarch', 0.6421915292739868), ('princess', 0.5896612405776978), ('beatrix', 0.5811704993247986), ('prince', 0.5663138031959534)]\n", 1169 | "\n", 1170 | "[('sister', 0.8335152268409729), ('daughter', 0.8259485960006714), ('mother', 0.7856060266494751), ('grandmother', 0.7708373069763184), ('sisterinlaw', 0.7601062655448914)]\n", 1171 | "\n", 1172 | "[('sister', 0.8335152268409729), ('daughter', 0.8259485960006714), ('mother', 0.7856060266494751), ('grandmother', 0.7708373069763184), ('sisterinlaw', 0.7601062655448914)]\n" 1173 | ] 1174 | } 1175 | ], 1176 | "source": [ 1177 | "# man:woman::king:???\n", 1178 | "# king - man + woman = ???\n", 1179 | "sol_1 = analogy_solver(\"man\", \"woman\", \"king\")\n", 1180 | "print(sol_1)\n", 1181 | "print()\n", 1182 | "\n", 1183 | "# man:king::woman:???\n", 1184 | "# woman - man + king = ???\n", 1185 | "sol_2 = analogy_solver(\"man\", \"king\", \"woman\")\n", 1186 | "print(sol_2)\n", 1187 | "print()\n", 1188 | "\n", 1189 | "# uncle:brother::aunt:???\n", 1190 | "# brother - uncle + aunt = ???\n", 1191 | "sol_3 = analogy_solver(\"uncle\", \"brother\", \"aunt\")\n", 1192 | "print(sol_3)\n", 1193 | "print()\n", 1194 | "\n", 1195 | "# uncle:aunt::brother:???\n", 1196 | "# aunt - uncle + brother = ???\n", 1197 | "sol_4 = analogy_solver(\"uncle\", \"aunt\", \"brother\")\n", 1198 | "print(sol_4)" 1199 | ] 1200 | }, 1201 | { 1202 | "cell_type": "markdown", 1203 | 
"metadata": { 1204 | "slideshow": { 1205 | "slide_type": "-" 1206 | } 1207 | }, 1208 | "source": [ 1209 | "##### \"One of these words is not like the other\"\n", 1210 | "\n", 1211 | "`breakfast cereal dinner lunch`\n", 1212 | "\n", 1213 | "Built-in function in `gensim`: `doesnt_match(list_of_words)`" 1214 | ] 1215 | }, 1216 | { 1217 | "cell_type": "code", 1218 | "execution_count": 31, 1219 | "metadata": { 1220 | "collapsed": false, 1221 | "slideshow": { 1222 | "slide_type": "-" 1223 | } 1224 | }, 1225 | "outputs": [ 1226 | { 1227 | "name": "stdout", 1228 | "output_type": "stream", 1229 | "text": [ 1230 | "cereal\n", 1231 | "house\n" 1232 | ] 1233 | } 1234 | ], 1235 | "source": [ 1236 | "# find which word doesn't match\n", 1237 | "list_of_words = \"breakfast cereal dinner lunch\"\n", 1238 | "doesnt_match = w2v.doesnt_match(list_of_words.split())\n", 1239 | "print(doesnt_match)\n", 1240 | "\n", 1241 | "list_of_words_2 = \"house dog pencil electrocute\"\n", 1242 | "doesnt_match_2 = w2v.doesnt_match(list_of_words_2.split())\n", 1243 | "print(doesnt_match_2)" 1244 | ] 1245 | }, 1246 | { 1247 | "cell_type": "markdown", 1248 | "metadata": { 1249 | "slideshow": { 1250 | "slide_type": "-" 1251 | } 1252 | }, 1253 | "source": [ 1254 | "#### Word Embeddings and Antonyms" 1255 | ] 1256 | }, 1257 | { 1258 | "cell_type": "code", 1259 | "execution_count": 32, 1260 | "metadata": { 1261 | "collapsed": false, 1262 | "slideshow": { 1263 | "slide_type": "-" 1264 | } 1265 | }, 1266 | "outputs": [ 1267 | { 1268 | "data": { 1269 | "text/plain": [ 1270 | "[('bad', 0.7170573472976685),\n", 1271 | " ('terrific', 0.7161434888839722),\n", 1272 | " ('decent', 0.7018914222717285),\n", 1273 | " ('lousy', 0.6984266042709351),\n", 1274 | " ('wonderful', 0.6819486618041992),\n", 1275 | " ('perfect', 0.6481753587722778),\n", 1276 | " ('great', 0.6480209827423096),\n", 1277 | " ('nice', 0.6281204223632812),\n", 1278 | " ('darn', 0.623289942741394),\n", 1279 | " ('fun', 0.6176395416259766)]" 1280 | ] 1281 | }, 1282 | "execution_count": 32, 1283 | "metadata": {}, 1284 | "output_type": "execute_result" 1285 | } 1286 | ], 1287 | "source": [ 1288 | "# this approach doesn't handle antonyms well\n", 1289 | "# \"That movie was _______.\"\n", 1290 | "\n", 1291 | "w2v.similar_by_word(\"good\", topn=10)" 1292 | ] 1293 | }, 1294 | { 1295 | "cell_type": "markdown", 1296 | "metadata": { 1297 | "slideshow": { 1298 | "slide_type": "-" 1299 | } 1300 | }, 1301 | "source": [ 1302 | "#### Bias in Word Embeddings" 1303 | ] 1304 | }, 1305 | { 1306 | "cell_type": "markdown", 1307 | "metadata": { 1308 | "slideshow": { 1309 | "slide_type": "-" 1310 | } 1311 | }, 1312 | "source": [ 1313 | "![king_queen](images/king_queen_2.png)\n", 1314 | "![programmer_homemaker](images/programmer_homemaker.png)\n", 1315 | "https://arxiv.org/pdf/1607.06520v1.pdf" 1316 | ] 1317 | }, 1318 | { 1319 | "cell_type": "markdown", 1320 | "metadata": { 1321 | "slideshow": { 1322 | "slide_type": "subslide" 1323 | } 1324 | }, 1325 | "source": [ 1326 | "![gender_bias](images/gender_bias.png)\n", 1327 | "https://arxiv.org/pdf/1607.06520v1.pdf" 1328 | ] 1329 | }, 1330 | { 1331 | "cell_type": "code", 1332 | "execution_count": 33, 1333 | "metadata": { 1334 | "collapsed": false, 1335 | "slideshow": { 1336 | "slide_type": "-" 1337 | } 1338 | }, 1339 | "outputs": [ 1340 | { 1341 | "name": "stdout", 1342 | "output_type": "stream", 1343 | "text": [ 1344 | "[('lathes', 0.581120491027832), ('typewriters', 0.5445051193237305), ('washing', 0.5365341305732727), ('machine', 0.5345758199691772), 
('shoe', 0.5307853817939758)]\n", 1345 | "\n" 1346 | ] 1347 | } 1348 | ], 1349 | "source": [ 1350 | "# she:sewing::he:???\n", 1351 | "bias_1 = analogy_solver(\"she\", \"sewing\", \"he\")\n", 1352 | "print(bias_1)\n", 1353 | "print()" 1354 | ] 1355 | }, 1356 | { 1357 | "cell_type": "markdown", 1358 | "metadata": { 1359 | "slideshow": { 1360 | "slide_type": "-" 1361 | } 1362 | }, 1363 | "source": [ 1364 | "#### Links to available word embeddings\n", 1365 | "\n", 1366 | "[The \"original\" code for `word2vec`, and pre-trained vectors](https://code.google.com/archive/p/word2vec/)\n", 1367 | "\n", 1368 | "[Stanford's approach to word embeddings, and pre-trained vectors](http://nlp.stanford.edu/projects/glove/)\n", 1369 | "\n", 1370 | "[A modified approach to word embeddings (feeding dependency tuples to the neural network instead of words), and pre-trained vectors](https://levyomer.wordpress.com/2014/04/25/dependency-based-word-embeddings/)\n", 1371 | "\n", 1372 | "[Word embeddings from a particular historical period](http://nlp.stanford.edu/projects/histwords/)" 1373 | ] 1374 | }, 1375 | { 1376 | "cell_type": "markdown", 1377 | "metadata": { 1378 | "slideshow": { 1379 | "slide_type": "-" 1380 | } 1381 | }, 1382 | "source": [ 1383 | "## Links to papers\n", 1384 | "\n", 1385 | "The \"original\" three papers on `word2vec` by Mikolov:\n", 1386 | "\n", 1387 | " - [Efficient Estimation of Word Representations in Vector Space](http://arxiv.org/pdf/1301.3781v3.pdf)\n", 1388 | "\n", 1389 | " - [Distributed Representations of Words and Phrases and their Compositionality](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf)\n", 1390 | "\n", 1391 | " - [Linguistic Regularities in Continuous Space Word Representations](https://www.aclweb.org/anthology/N/N13/N13-1090.pdf)\n", 1392 | "\n", 1393 | "\n", 1394 | "[Further analysis of approaches to word embeddings and their hyperparameters](https://transacl.org/ojs/index.php/tacl/article/viewFile/570/124)\n", 1395 | "\n", 1396 | "[Detailed evaluation of word embeddings](https://arxiv.org/pdf/1608.04207v1.pdf)\n", 1397 | "\n", 1398 | "[Website for evaluating word embeddings](http://veceval.com/)\n", 1399 | "\n" 1400 | ] 1401 | }, 1402 | { 1403 | "cell_type": "markdown", 1404 | "metadata": { 1405 | "slideshow": { 1406 | "slide_type": "-" 1407 | } 1408 | }, 1409 | "source": [ 1410 | "## Links to blogs\n", 1411 | "\n", 1412 | "[A good overview of NLP](https://blog.monkeylearn.com/the-definitive-guide-to-natural-language-processing/)\n", 1413 | "\n", 1414 | "[Blog post summary of the three \"original\" papers by Mikolov](https://blog.acolyer.org/2016/04/21/the-amazing-power-of-word-vectors/)\n", 1415 | "\n", 1416 | "[Detailed blog post on the application of word embeddings to analogies](https://quomodocumque.wordpress.com/2016/01/15/messing-around-with-word2vec/)\n", 1417 | "\n", 1418 | "[Appyling word embeddings to computer logs](https://gab41.lab41.org/three-things-we-learned-about-applying-word-vectors-to-computer-logs-c199070f390b#.k2mirf2oa)" 1419 | ] 1420 | } 1421 | ], 1422 | "metadata": { 1423 | "celltoolbar": "Slideshow", 1424 | "kernelspec": { 1425 | "display_name": "Python 3", 1426 | "language": "python", 1427 | "name": "python3" 1428 | }, 1429 | "language_info": { 1430 | "codemirror_mode": { 1431 | "name": "ipython", 1432 | "version": 3 1433 | }, 1434 | "file_extension": ".py", 1435 | "mimetype": "text/x-python", 1436 | "name": "python", 1437 | "nbconvert_exporter": "python", 1438 | 
"pygments_lexer": "ipython3", 1439 | "version": "3.5.2" 1440 | } 1441 | }, 1442 | "nbformat": 4, 1443 | "nbformat_minor": 0 1444 | } 1445 | -------------------------------------------------------------------------------- /images/NLP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/NLP.png -------------------------------------------------------------------------------- /images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/architecture.png -------------------------------------------------------------------------------- /images/architecture_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/architecture_2.png -------------------------------------------------------------------------------- /images/context.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/context.png -------------------------------------------------------------------------------- /images/cos_sim.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/cos_sim.png -------------------------------------------------------------------------------- /images/cos_sim_compare.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/cos_sim_compare.png -------------------------------------------------------------------------------- /images/country_capital.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/country_capital.png -------------------------------------------------------------------------------- /images/distance_measures.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/distance_measures.png -------------------------------------------------------------------------------- /images/eval_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/eval_1.png -------------------------------------------------------------------------------- /images/eval_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/eval_2.png -------------------------------------------------------------------------------- /images/gender_bias.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/gender_bias.png 
-------------------------------------------------------------------------------- /images/king_queen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/king_queen.png -------------------------------------------------------------------------------- /images/king_queen_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/king_queen_2.png -------------------------------------------------------------------------------- /images/king_queen_vis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/king_queen_vis.png -------------------------------------------------------------------------------- /images/normalize.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/normalize.jpg -------------------------------------------------------------------------------- /images/one_hot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/one_hot.png -------------------------------------------------------------------------------- /images/programmer_homemaker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/programmer_homemaker.png -------------------------------------------------------------------------------- /images/unit_circle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/unit_circle.png -------------------------------------------------------------------------------- /images/vectorize.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/vectorize.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4 2 | gensim==1.0.1 3 | nltk==3.4.5 4 | sklearn==0.23.2 5 | numpy 6 | beautifulsoup4 7 | jupyter 8 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | from nltk import word_tokenize 3 | import numpy as np 4 | 5 | # loading data 6 | 7 | 8 | def load_data(path_to_data): 9 | """ 10 | Loads `.tsv` of data into a 11 | Ensures that `.html` has been removed 12 | :param path_to_data: full/path/to/data 13 | :return: of ([id], [label], [text]) 14 | """ 15 | out_ = [] 16 | with open(path_to_data, "r") as f: 17 | for line in f: 18 | # parse line 19 | line_split = line.rstrip().split("\t") 20 | if len(line_split) != 3: 21 | continue 22 | id = line_split[0] 23 | label = line_split[1] 24 | raw_text = 
line_split[2] 25 | # ensure html is removed 26 | text = BeautifulSoup(raw_text, "html.parser").get_text() 27 | out_.append((id, label, text)) 28 | return out_ 29 | 30 | 31 | def get_all_docs(list_of_tuples): 32 | """ 33 | Given a dictionary of data, this will collect all the text into one list 34 | :param list_of_tuples: Output of load_data() 35 | :return of documents, lookup_dict 36 | """ 37 | all_docs = [] 38 | lookup = {} 39 | for i in range(len(list_of_tuples)): 40 | current = list_of_tuples[i] 41 | all_docs.append(current[2]) 42 | lookup[i] = current[2] 43 | return all_docs, lookup 44 | 45 | 46 | # calculations 47 | 48 | def normalize_vector(vector): 49 | """ 50 | Normalizes a vector so that all its values are between 0 and 1 51 | :param vector: a `numpy` vector 52 | :return: a normalized `numpy` vector 53 | """ 54 | # norm = np.sqrt(vector.dot(vector)) 55 | # numpy has a built in function 56 | norm = np.linalg.norm(vector) 57 | if norm: 58 | return vector / norm 59 | else: 60 | # if norm == 0, then original vector was all 0s 61 | return vector 62 | 63 | 64 | def cos_sim(vector_one, vector_two): 65 | """ 66 | Calculate the cosine similarity of two `numpy` vectors 67 | :param vector_one: a `numpy` vector 68 | :param vector_two: a `numpy` vector 69 | :return: A score between 0 and 1 70 | """ 71 | # ensure that both vectors are already normalized 72 | vector_one_norm = normalize_vector(vector_one) 73 | vector_two_norm = normalize_vector(vector_two) 74 | 75 | # calculate the dot product between the two normalized vectors 76 | return vector_one_norm.dot(vector_two_norm) 77 | 78 | 79 | def generate_all_cos_sim(X_matrix): 80 | """ 81 | Generates a matrix of cosine similarities for a set of documents 82 | WARNING: this is too computationally expensive for a python notebook. Run in console. 
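    (cost is quadratic in the number of documents: the nested loops below compute a cosine similarity for every ordered pair)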
83 | :param X_matrix: dense `numpy` matrix: num_documents (d) x words_in_vocabulary (v) 84 | :return: dense `numpy` matrix d x d 85 | """ 86 | # ensure matrix is dense 87 | if "sparse" in str(type(X_matrix)): 88 | X_matrix = X_matrix.toarray() 89 | # get shape 90 | X_shape = X_matrix.shape 91 | size = X_shape[0] 92 | # build empty matrix 93 | cos_matrix = np.zeros((size, size)) 94 | # iterate through rows 95 | for i in range(size): 96 | for j in range(size): 97 | if i != j: 98 | print(i,j) 99 | # calculate cosine similarity 100 | cos_matrix[i][j] = cos_sim(X_matrix[i], X_matrix[j]) 101 | else: 102 | # set diagonal to None 103 | cos_matrix[i][j] = None 104 | return cos_matrix 105 | 106 | 107 | def get_similar(cos_sim_matrix, idx, n, direction="most"): 108 | """ 109 | Determines similarity of n documents 110 | :param cos_sim_matrix: `numpy` dense array of num_documents x num_documents with values as cosine similarity 111 | :param idx: index of document to calculate most similar 112 | :param n: number of most similar indices to return 113 | :param direction: "most" or "least" for top or bottom of ranked list 114 | :return: of (idx, cos_sim) 115 | """ 116 | if direction != "most" and direction != "least": 117 | raise Exception("chooose `most` or `least` for `direction`") 118 | # get all values 119 | if direction == "most": 120 | all_values = sorted(enumerate(filter(lambda x: not np.isnan(x), cos_sim_matrix[idx])), key=lambda x: x[1], reverse=True) 121 | else: 122 | all_values = sorted(enumerate(filter(lambda x: not np.isnan(x), cos_sim_matrix[idx])), key=lambda x: x[1]) 123 | return [(x[0] + 1, x[1]) for x in all_values[:n]] 124 | 125 | 126 | # I/O 127 | 128 | def save_matrix_to_csv(X_matrix, save_location): 129 | """ 130 | Saves a matrix to csv 131 | :param X_matrix: dense `numpy` array 132 | :param save_location: full/path/to/desired/location.csv 133 | """ 134 | # ensure matrix is dense 135 | if "sparse" in str(type(X_matrix)): 136 | X_matrix = X_matrix.toarray() 137 | np.savetxt(save_location, X_matrix, delimiter=",") 138 | 139 | 140 | def load_matrix_from_csv(location): 141 | """ 142 | Loads a matrix from csv 143 | :param location: full/path/to/location.csv 144 | :return: dense `numpy` array 145 | """ 146 | return np.loadtxt(location, delimiter=",") 147 | 148 | --------------------------------------------------------------------------------
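A minimal usage sketch, assuming `w2v` and `get_vector` are defined as in the notebook cells above and that `utils.py` is importable: average each token's word vector into a crude dense document vector, then score a pair of documents with `cos_sim`. Averaging is only one simple option for a dense document embedding, not necessarily the notebook's approach.

import numpy as np
from utils import cos_sim

def average_embedding(text):
    # crude dense document vector: the element-wise mean of the word vectors of all tokens
    # (out-of-vocabulary tokens contribute zero vectors via get_vector)
    tokens = text.lower().split()
    return np.mean([get_vector(tok) for tok in tokens], axis=0)

doc_a = "the queen greeted the king"
doc_b = "a cat chased a squirrel"
print(cos_sim(average_embedding(doc_a), average_embedding(doc_b)))

Because both vectors are normalized inside `cos_sim`, the score stays comparable across documents of different lengths.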