├── .dockerignore
├── .gitignore
├── Dockerfile
├── Document-Embeddings_Demo.ipynb
├── Gigaword_pruned_vectors.txt.gz
├── README.md
├── Word-Embeddings_Demo.ipynb
├── images
    ├── NLP.png
    ├── architecture.png
    ├── architecture_2.png
    ├── context.png
    ├── cos_sim.png
    ├── cos_sim_compare.png
    ├── country_capital.png
    ├── distance_measures.png
    ├── eval_1.png
    ├── eval_2.png
    ├── gender_bias.png
    ├── king_queen.png
    ├── king_queen_2.png
    ├── king_queen_vis.png
    ├── normalize.jpg
    ├── one_hot.png
    ├── programmer_homemaker.png
    ├── unit_circle.png
    └── vectorize.png
├── movie_reviews.tsv
├── requirements.txt
├── tfidf_cos_matrix.csv
└── utils.py


/.dockerignore:
--------------------------------------------------------------------------------
 1 | .git
 2 | # Byte-compiled / optimized / DLL files
 3 | __pycache__/
 4 | *.py[cod]
 5 | *$py.class
 6 | 
 7 | # C extensions
 8 | *.so
 9 | 
10 | # Distribution / packaging
11 | .Python
12 | env/
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | 
28 | # PyInstaller
29 | #  Usually these files are written by a python script from a template
30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 | 
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 | 
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *,cover
47 | .hypothesis/
48 | 
49 | # Translations
50 | *.mo
51 | *.pot
52 | 
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 | 
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 | 
61 | # Scrapy stuff:
62 | .scrapy
63 | 
64 | # Sphinx documentation
65 | docs/_build/
66 | 
67 | # PyBuilder
68 | target/
69 | 
70 | # IPython Notebook
71 | .ipynb_checkpoints
72 | 
73 | # pyenv
74 | .python-version
75 | 
76 | # celery beat schedule file
77 | celerybeat-schedule
78 | 
79 | # dotenv
80 | .env
81 | 
82 | # virtualenv
83 | venv/
84 | ENV/
85 | 
86 | # Spyder project settings
87 | .spyderproject
88 | 
89 | # Rope project settings
90 | .ropeproject
91 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # backups
 2 | *~
 3 | *.swp
 4 | .DS_Store
 5 | # Byte-compiled / optimized / DLL files
 6 | __pycache__/
 7 | *.py[cod]
 8 | *$py.class
 9 | 
10 | # C extensions
11 | *.so
12 | 
13 | # Distribution / packaging
14 | .Python
15 | env/
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib/
23 | lib64/
24 | parts/
25 | sdist/
26 | var/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | 
31 | # PyInstaller
32 | #  Usually these files are written by a python script from a template
33 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 | 
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 | 
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *,cover
50 | .hypothesis/
51 | 
52 | # Translations
53 | *.mo
54 | *.pot
55 | 
56 | # Django stuff:
57 | *.log
58 | local_settings.py
59 | 
60 | # Flask stuff:
61 | instance/
62 | .webassets-cache
63 | 
64 | # Scrapy stuff:
65 | .scrapy
66 | 
67 | # Sphinx documentation
68 | docs/_build/
69 | 
70 | # PyBuilder
71 | target/
72 | 
73 | # IPython Notebook
74 | .ipynb_checkpoints
75 | 
76 | # pyenv
77 | .python-version
78 | 
79 | # celery beat schedule file
80 | celerybeat-schedule
81 | 
82 | # dotenv
83 | .env
84 | 
85 | # virtualenv
86 | venv/
87 | ENV/
88 | 
89 | # Spyder project settings
90 | .spyderproject
91 | 
92 | # Rope project settings
93 | .ropeproject
94 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM jupyter/minimal-notebook
2 | 
3 | WORKDIR /code
4 | COPY requirements.txt /code
5 | RUN pip install -r /code/requirements.txt
6 | COPY . /code
7 | 


--------------------------------------------------------------------------------
/Document-Embeddings_Demo.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "code",
   5 |    "execution_count": 1,
   6 |    "metadata": {
   7 |     "collapsed": false,
   8 |     "slideshow": {
   9 |      "slide_type": "-"
  10 |     }
  11 |    },
  12 |    "outputs": [
  13 |     {
  14 |      "data": {
  15 |       "text/plain": [
  16 |        "4"
  17 |       ]
  18 |      },
  19 |      "execution_count": 1,
  20 |      "metadata": {},
  21 |      "output_type": "execute_result"
  22 |     }
  23 |    ],
  24 |    "source": [
  25 |     "# using Jupyter notebooks\n",
  26 |     "# pushing CTRL-c will run the code in a cell\n",
  27 |     "2 + 2"
  28 |    ]
  29 |   },
  30 |   {
  31 |    "cell_type": "markdown",
  32 |    "metadata": {
  33 |     "slideshow": {
  34 |      "slide_type": "-"
  35 |     }
  36 |    },
  37 |    "source": [
  38 |     "# Gentle Introduction to NLP through Document Embeddings"
  39 |    ]
  40 |   },
  41 |   {
  42 |    "cell_type": "markdown",
  43 |    "metadata": {
  44 |     "slideshow": {
  45 |      "slide_type": "-"
  46 |     }
  47 |    },
  48 |    "source": [
  49 |     "### Quick Review of Last Time\n",
  50 |     "* Cosine Similarity\n",
  51 |     "\n",
  52 |     "### Two Approaches to Embedding Documents\n",
  53 |     "* Sparse, bag-of-words embeddings\n",
  54 |     " - Count embeddings\n",
  55 |     " - TFIDF embeddings\n",
  56 |     "* Dense embeddings"
  57 |    ]
  58 |   },
  59 |   {
  60 |    "cell_type": "markdown",
  61 |    "metadata": {
  62 |     "slideshow": {
  63 |      "slide_type": "-"
  64 |     }
  65 |    },
  66 |    "source": [
  67 |     "![NLP](images/NLP.png)"
  68 |    ]
  69 |   },
  70 |   {
  71 |    "cell_type": "markdown",
  72 |    "metadata": {
  73 |     "slideshow": {
  74 |      "slide_type": "-"
  75 |     }
  76 |    },
  77 |    "source": [
  78 |     "## From Last Time"
  79 |    ]
  80 |   },
  81 |   {
  82 |    "cell_type": "markdown",
  83 |    "metadata": {
  84 |     "slideshow": {
  85 |      "slide_type": "-"
  86 |     }
  87 |    },
  88 |    "source": [
  89 |     "![distance](images/distance_measures.png)\n",
  90 |     "http://dataaspirant.com/2015/04/11/five-most-popular-similarity-measures-implementation-in-python/"
  91 |    ]
  92 |   },
  93 |   {
  94 |    "cell_type": "markdown",
  95 |    "metadata": {
  96 |     "slideshow": {
  97 |      "slide_type": "-"
  98 |     }
  99 |    },
 100 |    "source": [
 101 |     "![cos_sim](images/cos_sim.png)\n",
 102 |     "http://dataaspirant.com/2015/04/11/five-most-popular-similarity-measures-implementation-in-python/"
 103 |    ]
 104 |   },
 105 |   {
 106 |    "cell_type": "markdown",
 107 |    "metadata": {
 108 |     "slideshow": {
 109 |      "slide_type": "-"
 110 |     }
 111 |    },
 112 |    "source": [
 113 |     "### calculating dot product\n",
 114 |     "$vector_a = [1,2,3]$ <br>\n",
 115 |     "$vector_b = [4,5,6]$ <br>\n",
 116 |     "$vector_a \\cdot vector_b = (1*4) + (2*5) + (3*6) = 4 + 10 + 18 = 32$ "
 117 |    ]
 118 |   },
 119 |   {
 120 |    "cell_type": "markdown",
 121 |    "metadata": {
 122 |     "slideshow": {
 123 |      "slide_type": "-"
 124 |     }
 125 |    },
 126 |    "source": [
 127 |     "### normalizing a vector\n",
 128 |     "To normalize a vector, we shrink all values so they fall between $0$ and $1$.\n",
 129 |     "\n",
 130 |     "![normalize](images/normalize.jpg)\n",
 131 |     "http://www.wikihow.com/Normalize-a-Vector"
 132 |    ]
 133 |   },
 134 |   {
 135 |    "cell_type": "code",
 136 |    "execution_count": 2,
 137 |    "metadata": {
 138 |     "collapsed": true,
 139 |     "slideshow": {
 140 |      "slide_type": "-"
 141 |     }
 142 |    },
 143 |    "outputs": [],
 144 |    "source": [
 145 |     "import numpy as np\n",
 146 |     "import utils\n",
 147 |     "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer"
 148 |    ]
 149 |   },
 150 |   {
 151 |    "cell_type": "code",
 152 |    "execution_count": 3,
 153 |    "metadata": {
 154 |     "collapsed": true,
 155 |     "slideshow": {
 156 |      "slide_type": "-"
 157 |     }
 158 |    },
 159 |    "outputs": [],
 160 |    "source": [
 161 |     "def normalize_vector(vector):\n",
 162 |     "    \"\"\"\n",
 163 |     "    Normalizes a vector so that all its values are between 0 and 1\n",
 164 |     "    :param vector: a `numpy` vector\n",
 165 |     "    :return: a normalized `numpy` vector\n",
 166 |     "    \"\"\"\n",
 167 |     "    # norm = np.sqrt(vector.dot(vector))\n",
 168 |     "    # numpy has a built in function\n",
 169 |     "    norm = np.linalg.norm(vector)\n",
 170 |     "    if norm:\n",
 171 |     "        return vector / norm\n",
 172 |     "    else:\n",
 173 |     "        # if norm == 0, then original vector was all 0s\n",
 174 |     "        return vector"
 175 |    ]
 176 |   },
 177 |   {
 178 |    "cell_type": "code",
 179 |    "execution_count": 4,
 180 |    "metadata": {
 181 |     "collapsed": false,
 182 |     "slideshow": {
 183 |      "slide_type": "-"
 184 |     }
 185 |    },
 186 |    "outputs": [
 187 |     {
 188 |      "name": "stdout",
 189 |      "output_type": "stream",
 190 |      "text": [
 191 |       "original vector [1 2 4]\n",
 192 |       "normalized vector [ 0.21821789  0.43643578  0.87287156]\n"
 193 |      ]
 194 |     }
 195 |    ],
 196 |    "source": [
 197 |     "vector_3d = np.array([1,2,4])\n",
 198 |     "print(\"original vector\", vector_3d)\n",
 199 |     "print(\"normalized vector\", normalize_vector(vector_3d))\n",
 200 |     "#0.218 is 1/4th of .873 just like 1 is 1/4th of 4"
 201 |    ]
 202 |   },
 203 |   {
 204 |    "cell_type": "code",
 205 |    "execution_count": 5,
 206 |    "metadata": {
 207 |     "collapsed": true,
 208 |     "slideshow": {
 209 |      "slide_type": "-"
 210 |     }
 211 |    },
 212 |    "outputs": [],
 213 |    "source": [
 214 |     "def cos_sim(vector_one, vector_two):\n",
 215 |     "    \"\"\"\n",
 216 |     "    Calculate the cosine similarity of two `numpy` vectors\n",
 217 |     "    :param vector_one: a `numpy` vector\n",
 218 |     "    :param vector_two: a `numpy` vector\n",
 219 |     "    :return: A score between 0 and 1\n",
 220 |     "    \"\"\"\n",
 221 |     "    # ensure that both vectors are already normalized\n",
 222 |     "    vector_one_norm = normalize_vector(vector_one)\n",
 223 |     "    vector_two_norm = normalize_vector(vector_two)\n",
 224 |     "    \n",
 225 |     "    # calculate the dot product between the two normalized vectors\n",
 226 |     "    return vector_one_norm.dot(vector_two_norm)"
 227 |    ]
 228 |   },
 229 |   {
 230 |    "cell_type": "code",
 231 |    "execution_count": 6,
 232 |    "metadata": {
 233 |     "collapsed": false,
 234 |     "slideshow": {
 235 |      "slide_type": "-"
 236 |     }
 237 |    },
 238 |    "outputs": [
 239 |     {
 240 |      "name": "stdout",
 241 |      "output_type": "stream",
 242 |      "text": [
 243 |       "cosine similarity of vector_one and vector_two 0.948683298051\n",
 244 |       "cosine similarity of vector_one and vector_three 0.904534033733\n",
 245 |       "cosine similarity of vector_one and vector_four 0.904534033733\n"
 246 |      ]
 247 |     }
 248 |    ],
 249 |    "source": [
 250 |     "vector_one = np.array([1,1,1,1,1])\n",
 251 |     "vector_two = np.array([1,1,1,1,2])\n",
 252 |     "vector_three = np.array([1,2,3,4,5])\n",
 253 |     "vector_four = np.array([10,20,30,40,50])\n",
 254 |     "\n",
 255 |     "print(\"cosine similarity of vector_one and vector_two\", cos_sim(vector_one, vector_two))\n",
 256 |     "print(\"cosine similarity of vector_one and vector_three\", cos_sim(vector_one, vector_three))\n",
 257 |     "print(\"cosine similarity of vector_one and vector_four\", cos_sim(vector_one, vector_four))"
 258 |    ]
 259 |   },
 260 |   {
 261 |    "cell_type": "markdown",
 262 |    "metadata": {
 263 |     "slideshow": {
 264 |      "slide_type": "-"
 265 |     }
 266 |    },
 267 |    "source": [
 268 |     "### Interpreting \"Similarity\"\n",
 269 |     "![cos_sim_compare](images/cos_sim_compare.png)\n",
 270 |     "https://medium.com/@camrongodbout/creating-a-search-engine-f2f429cab33c#.z7i9w8y5t"
 271 |    ]
 272 |   },
 273 |   {
 274 |    "cell_type": "markdown",
 275 |    "metadata": {
 276 |     "slideshow": {
 277 |      "slide_type": "-"
 278 |     }
 279 |    },
 280 |    "source": [
 281 |     "![vectorize](images/vectorize.png)"
 282 |    ]
 283 |   },
 284 |   {
 285 |    "cell_type": "markdown",
 286 |    "metadata": {
 287 |     "slideshow": {
 288 |      "slide_type": "-"
 289 |     }
 290 |    },
 291 |    "source": [
 292 |     "## Embedding a Document \n",
 293 |     "### Bag of Words\n",
 294 |     "#### Count Vectorizing"
 295 |    ]
 296 |   },
 297 |   {
 298 |    "cell_type": "markdown",
 299 |    "metadata": {
 300 |     "slideshow": {
 301 |      "slide_type": "-"
 302 |     }
 303 |    },
 304 |    "source": [
 305 |     "![bag_of_words](images/bag_of_words_vis.png)"
 306 |    ]
 307 |   },
 308 |   {
 309 |    "cell_type": "markdown",
 310 |    "metadata": {
 311 |     "slideshow": {
 312 |      "slide_type": "-"
 313 |     }
 314 |    },
 315 |    "source": [
 316 |     "![bag_of_words_count](images/bag_of_words_count_matrix.png)"
 317 |    ]
 318 |   },
 319 |   {
 320 |    "cell_type": "markdown",
 321 |    "metadata": {
 322 |     "slideshow": {
 323 |      "slide_type": "-"
 324 |     }
 325 |    },
 326 |    "source": [
 327 |     "## Embedding a Document \n",
 328 |     "### Bag of Words\n",
 329 |     "#### TFIDF Vectorizing\n",
 330 |     "`TFIDF` = `term frequency, inverse document frequency`"
 331 |    ]
 332 |   },
 333 |   {
 334 |    "cell_type": "markdown",
 335 |    "metadata": {
 336 |     "slideshow": {
 337 |      "slide_type": "-"
 338 |     }
 339 |    },
 340 |    "source": [
 341 |     "![tfidf_rationale](images/tfidf_rationale.png)"
 342 |    ]
 343 |   },
 344 |   {
 345 |    "cell_type": "markdown",
 346 |    "metadata": {
 347 |     "slideshow": {
 348 |      "slide_type": "-"
 349 |     }
 350 |    },
 351 |    "source": [
 352 |     "![doc_freq_vis](images/document_frequency_vis.png)"
 353 |    ]
 354 |   },
 355 |   {
 356 |    "cell_type": "markdown",
 357 |    "metadata": {
 358 |     "slideshow": {
 359 |      "slide_type": "-"
 360 |     }
 361 |    },
 362 |    "source": [
 363 |     "![tfidf_matrix](images/tfidf_matrix.png)"
 364 |    ]
 365 |   },
 366 |   {
 367 |    "cell_type": "markdown",
 368 |    "metadata": {
 369 |     "slideshow": {
 370 |      "slide_type": "-"
 371 |     }
 372 |    },
 373 |    "source": [
 374 |     "![tfidf_matrix_decimal](images/tfidf_matrix_decimal.png)"
 375 |    ]
 376 |   },
 377 |   {
 378 |    "cell_type": "markdown",
 379 |    "metadata": {
 380 |     "slideshow": {
 381 |      "slide_type": "-"
 382 |     }
 383 |    },
 384 |    "source": [
 385 |     "![bop](images/bags_of_popcorn.png)"
 386 |    ]
 387 |   },
 388 |   {
 389 |    "cell_type": "code",
 390 |    "execution_count": 7,
 391 |    "metadata": {
 392 |     "collapsed": false,
 393 |     "slideshow": {
 394 |      "slide_type": "-"
 395 |     }
 396 |    },
 397 |    "outputs": [],
 398 |    "source": [
 399 |     "# load reviews\n",
 400 |     "reviews_dict = utils.load_data(\"movie_reviews.tsv\")\n",
 401 |     "all_docs, lookup = utils.get_all_docs(reviews_dict)"
 402 |    ]
 403 |   },
 404 |   {
 405 |    "cell_type": "code",
 406 |    "execution_count": 8,
 407 |    "metadata": {
 408 |     "collapsed": false,
 409 |     "slideshow": {
 410 |      "slide_type": "-"
 411 |     }
 412 |    },
 413 |    "outputs": [
 414 |     {
 415 |      "data": {
 416 |       "text/plain": [
 417 |        "'\"With all this stuff going down at the moment with MJ i\\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\\'s feeling towards the press and also the obvious message of drugs are bad m\\'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.The actual feature film bit when it finally starts is only on for 20 minutes or so excluding the Smooth Criminal sequence and Joe Pesci is convincing as a psychopathic all powerful drug lord. Why he wants MJ dead so bad is beyond me. Because MJ overheard his plans? Nah, Joe Pesci\\'s character ranted that he wanted people to know it is he who is supplying drugs etc so i dunno, maybe he just hates MJ\\'s music.Lots of cool things in this like MJ turning into a car and a robot and the whole Speed Demon sequence. Also, the director must have had the patience of a saint when it came to filming the kiddy Bad sequence as usually directors hate working with one kid let alone a whole bunch of them performing a complex dance scene.Bottom line, this movie is for people who like MJ on one level or another (which i think is most people). If not, then stay away. It does try and give off a wholesome message and ironically MJ\\'s bestest buddy in this movie is a girl! Michael Jackson is truly one of the most talented people ever to grace this planet but is he guilty? Well, with all the attention i\\'ve gave this subject....hmmm well i don\\'t know because people can be different behind closed doors, i know this for a fact. He is either an extremely nice but stupid guy or one of the most sickest liars. I hope he is not the latter.\"'"
 418 |       ]
 419 |      },
 420 |      "execution_count": 8,
 421 |      "metadata": {},
 422 |      "output_type": "execute_result"
 423 |     }
 424 |    ],
 425 |    "source": [
 426 |     "# `all docs` is a list of all documents\n",
 427 |     "all_docs[0]"
 428 |    ]
 429 |   },
 430 |   {
 431 |    "cell_type": "code",
 432 |    "execution_count": 9,
 433 |    "metadata": {
 434 |     "collapsed": false,
 435 |     "slideshow": {
 436 |      "slide_type": "-"
 437 |     }
 438 |    },
 439 |    "outputs": [
 440 |     {
 441 |      "data": {
 442 |       "text/plain": [
 443 |        "'\"With all this stuff going down at the moment with MJ i\\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\\'s feeling towards the press and also the obvious message of drugs are bad m\\'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.The actual feature film bit when it finally starts is only on for 20 minutes or so excluding the Smooth Criminal sequence and Joe Pesci is convincing as a psychopathic all powerful drug lord. Why he wants MJ dead so bad is beyond me. Because MJ overheard his plans? Nah, Joe Pesci\\'s character ranted that he wanted people to know it is he who is supplying drugs etc so i dunno, maybe he just hates MJ\\'s music.Lots of cool things in this like MJ turning into a car and a robot and the whole Speed Demon sequence. Also, the director must have had the patience of a saint when it came to filming the kiddy Bad sequence as usually directors hate working with one kid let alone a whole bunch of them performing a complex dance scene.Bottom line, this movie is for people who like MJ on one level or another (which i think is most people). If not, then stay away. It does try and give off a wholesome message and ironically MJ\\'s bestest buddy in this movie is a girl! Michael Jackson is truly one of the most talented people ever to grace this planet but is he guilty? Well, with all the attention i\\'ve gave this subject....hmmm well i don\\'t know because people can be different behind closed doors, i know this for a fact. He is either an extremely nice but stupid guy or one of the most sickest liars. I hope he is not the latter.\"'"
 444 |       ]
 445 |      },
 446 |      "execution_count": 9,
 447 |      "metadata": {},
 448 |      "output_type": "execute_result"
 449 |     }
 450 |    ],
 451 |    "source": [
 452 |     "# `lookup` is a lookup dict with {idx: text}\n",
 453 |     "lookup[0]"
 454 |    ]
 455 |   },
 456 |   {
 457 |    "cell_type": "markdown",
 458 |    "metadata": {
 459 |     "slideshow": {
 460 |      "slide_type": "-"
 461 |     }
 462 |    },
 463 |    "source": [
 464 |     "### Using `scikit-learn`\n",
 465 |     "\n",
 466 |     "[CountVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) <br>\n",
 467 |     "[TfidfVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)"
 468 |    ]
 469 |   },
 470 |   {
 471 |    "cell_type": "code",
 472 |    "execution_count": 10,
 473 |    "metadata": {
 474 |     "collapsed": false,
 475 |     "slideshow": {
 476 |      "slide_type": "-"
 477 |     }
 478 |    },
 479 |    "outputs": [
 480 |     {
 481 |      "data": {
 482 |       "text/plain": [
 483 |        "<999x18373 sparse matrix of type '<class 'numpy.int64'>'\n",
 484 |        "\twith 137082 stored elements in Compressed Sparse Row format>"
 485 |       ]
 486 |      },
 487 |      "execution_count": 10,
 488 |      "metadata": {},
 489 |      "output_type": "execute_result"
 490 |     }
 491 |    ],
 492 |    "source": [
 493 |     "# Count\n",
 494 |     "# call the vectorizer\n",
 495 |     "cv = CountVectorizer(\n",
 496 |     "    analyzer='word',    # 'char', 'char_wb'\n",
 497 |     "    ngram_range=(1,1),  # unigrams and bigrams ==> (1, 2)\n",
 498 |     "    stop_words=None,    # 'english' \n",
 499 |     "    max_df=1.0,         # float ==> percentage, int ==> raw count\n",
 500 |     "    min_df=1,           # float ==> percentage, int ==> raw count\n",
 501 |     "    binary=False        # True\n",
 502 |     ") \n",
 503 |     "# run fit_transform on the <list> of documents\n",
 504 |     "X_cv = cv.fit_transform(all_docs)\n",
 505 |     "X_cv"
 506 |    ]
 507 |   },
 508 |   {
 509 |    "cell_type": "code",
 510 |    "execution_count": 11,
 511 |    "metadata": {
 512 |     "collapsed": false
 513 |    },
 514 |    "outputs": [
 515 |     {
 516 |      "data": {
 517 |       "text/plain": [
 518 |        "<999x18373 sparse matrix of type '<class 'numpy.float64'>'\n",
 519 |        "\twith 137082 stored elements in Compressed Sparse Row format>"
 520 |       ]
 521 |      },
 522 |      "execution_count": 11,
 523 |      "metadata": {},
 524 |      "output_type": "execute_result"
 525 |     }
 526 |    ],
 527 |    "source": [
 528 |     "# TFIDF\n",
 529 |     "# call the vectorizer\n",
 530 |     "tv = TfidfVectorizer(\n",
 531 |     "    analyzer='word',    # 'char'\n",
 532 |     "    ngram_range=(1,1),  # unigrams and bigrams ==> (1, 2)\n",
 533 |     "    stop_words=None,    # 'english' \n",
 534 |     "    max_df=1.0,         # float ==> percentage, int ==> raw count\n",
 535 |     "    min_df=1,           # float ==> percentage, int ==> raw count\n",
 536 |     ")\n",
 537 |     "# run fit_transform on the <list> of documents\n",
 538 |     "X_tv = tv.fit_transform(all_docs)\n",
 539 |     "X_tv"
 540 |    ]
 541 |   },
 542 |   {
 543 |    "cell_type": "markdown",
 544 |    "metadata": {},
 545 |    "source": [
 546 |     "The first dimension (`999` rows) indicates the number of documents we're processing.\n",
 547 |     "\n",
 548 |     "The second dimension (columns) indicates the number of features we're processing.  This will increase/decrease depending on the `n-gram` parameter."
 549 |    ]
 550 |   },
 551 |   {
 552 |    "cell_type": "markdown",
 553 |    "metadata": {},
 554 |    "source": [
 555 |     "![bag_of_words_count](images/bag_of_words_count_matrix.png)"
 556 |    ]
 557 |   },
 558 |   {
 559 |    "cell_type": "code",
 560 |    "execution_count": 31,
 561 |    "metadata": {
 562 |     "collapsed": false,
 563 |     "slideshow": {
 564 |      "slide_type": "-"
 565 |     }
 566 |    },
 567 |    "outputs": [],
 568 |    "source": [
 569 |     "# see the vocabulary\n",
 570 |     "cv_vocab = cv.get_feature_names()\n",
 571 |     "# see the nonzero features (e.g. words, bigrams, character-grams) \n",
 572 |     "# for each row of data\n",
 573 |     "cv_words_per_doc = cv.inverse_transform(X_cv)\n",
 574 |     "tv_words_per_doc = cv.inverse_transform(X_tv)"
 575 |    ]
 576 |   },
 577 |   {
 578 |    "cell_type": "code",
 579 |    "execution_count": 32,
 580 |    "metadata": {
 581 |     "collapsed": false
 582 |    },
 583 |    "outputs": [
 584 |     {
 585 |      "data": {
 586 |       "text/plain": [
 587 |        "array(['latter', 'hope', 'liars', 'sickest', 'stupid', 'extremely',\n",
 588 |        "       'either', 'fact', 'doors', 'closed', 'behind', 'different', 'be',\n",
 589 |        "       'can', 'don'], \n",
 590 |        "      dtype='<U44')"
 591 |       ]
 592 |      },
 593 |      "execution_count": 32,
 594 |      "metadata": {},
 595 |      "output_type": "execute_result"
 596 |     }
 597 |    ],
 598 |    "source": [
 599 |     "# first index is row (document number)\n",
 600 |     "# second index is how many words to show\n",
 601 |     "cv_words_per_doc[0][0:15]"
 602 |    ]
 603 |   },
 604 |   {
 605 |    "cell_type": "code",
 606 |    "execution_count": 33,
 607 |    "metadata": {
 608 |     "collapsed": false
 609 |    },
 610 |    "outputs": [
 611 |     {
 612 |      "data": {
 613 |       "text/plain": [
 614 |        "array(['don', 'can', 'be', 'different', 'behind', 'closed', 'doors',\n",
 615 |        "       'fact', 'either', 'extremely', 'stupid', 'sickest', 'liars', 'hope',\n",
 616 |        "       'latter'], \n",
 617 |        "      dtype='<U44')"
 618 |       ]
 619 |      },
 620 |      "execution_count": 33,
 621 |      "metadata": {},
 622 |      "output_type": "execute_result"
 623 |     }
 624 |    ],
 625 |    "source": [
 626 |     "tv_words_per_doc[0][-15:]"
 627 |    ]
 628 |   },
 629 |   {
 630 |    "cell_type": "code",
 631 |    "execution_count": 13,
 632 |    "metadata": {
 633 |     "collapsed": false,
 634 |     "slideshow": {
 635 |      "slide_type": "-"
 636 |     }
 637 |    },
 638 |    "outputs": [],
 639 |    "source": [
 640 |     "def get_one_row(X, idx):\n",
 641 |     "    \"\"\"\n",
 642 |     "    Gets one row (representing a document) from sparse matrix, converts to dense, and reshapes\n",
 643 |     "    :param X: the sparse matrix\n",
 644 |     "    :param idx: the index of desired row\n",
 645 |     "    :return: `numpy` dense vector\n",
 646 |     "    \"\"\"\n",
 647 |     "    row_ = X[idx].toarray()\n",
 648 |     "    size = row_.shape[1]\n",
 649 |     "    return row_.reshape(size)"
 650 |    ]
 651 |   },
 652 |   {
 653 |    "cell_type": "code",
 654 |    "execution_count": 39,
 655 |    "metadata": {
 656 |     "collapsed": false
 657 |    },
 658 |    "outputs": [
 659 |     {
 660 |      "data": {
 661 |       "text/plain": [
 662 |        "(18373,)"
 663 |       ]
 664 |      },
 665 |      "execution_count": 39,
 666 |      "metadata": {},
 667 |      "output_type": "execute_result"
 668 |     }
 669 |    ],
 670 |    "source": [
 671 |     "x_0_count = get_one_row(X_cv, 0)\n",
 672 |     "x_0_tfidf = get_one_row(X_tv, 0)\n",
 673 |     "x_0_count.shape"
 674 |    ]
 675 |   },
 676 |   {
 677 |    "cell_type": "markdown",
 678 |    "metadata": {
 679 |     "slideshow": {
 680 |      "slide_type": "-"
 681 |     }
 682 |    },
 683 |    "source": [
 684 |     "### How much does TFIDF affect things?"
 685 |    ]
 686 |   },
 687 |   {
 688 |    "cell_type": "code",
 689 |    "execution_count": 38,
 690 |    "metadata": {
 691 |     "collapsed": false,
 692 |     "slideshow": {
 693 |      "slide_type": "-"
 694 |     }
 695 |    },
 696 |    "outputs": [
 697 |     {
 698 |      "name": "stdout",
 699 |      "output_type": "stream",
 700 |      "text": [
 701 |       "cosine similarity = 0.751,\n",
 702 |       "equal to a 41.3 degree angle\n"
 703 |      ]
 704 |     }
 705 |    ],
 706 |    "source": [
 707 |     "sim = cos_sim(x_0_count, x_0_tfidf)\n",
 708 |     "sim_in_degrees = np.rad2deg(np.arccos(sim))\n",
 709 |     "print(\"cosine similarity = {:.3f},\\nequal to a {:.1f} degree angle\".format(sim, sim_in_degrees))"
 710 |    ]
 711 |   },
 712 |   {
 713 |    "cell_type": "markdown",
 714 |    "metadata": {
 715 |     "slideshow": {
 716 |      "slide_type": "-"
 717 |     }
 718 |    },
 719 |    "source": [
 720 |     "![cos_sim_compare](images/cos_sim_compare.png)\n",
 721 |     "https://medium.com/@camrongodbout/creating-a-search-engine-f2f429cab33c#.z7i9w8y5t"
 722 |    ]
 723 |   },
 724 |   {
 725 |    "cell_type": "code",
 726 |    "execution_count": 34,
 727 |    "metadata": {
 728 |     "collapsed": false
 729 |    },
 730 |    "outputs": [
 731 |     {
 732 |      "name": "stdout",
 733 |      "output_type": "stream",
 734 |      "text": [
 735 |       "cosine similarity: 0.00, angle: 90.00\n",
 736 |       "cosine similarity: 0.05, angle: 86.98\n",
 737 |       "cosine similarity: 0.11, angle: 83.96\n",
 738 |       "cosine similarity: 0.16, angle: 80.92\n",
 739 |       "cosine similarity: 0.21, angle: 77.85\n",
 740 |       "cosine similarity: 0.26, angle: 74.74\n",
 741 |       "cosine similarity: 0.32, angle: 71.59\n",
 742 |       "cosine similarity: 0.37, angle: 68.38\n",
 743 |       "cosine similarity: 0.42, angle: 65.10\n",
 744 |       "cosine similarity: 0.47, angle: 61.73\n",
 745 |       "cosine similarity: 0.53, angle: 58.24\n",
 746 |       "cosine similarity: 0.58, angle: 54.62\n",
 747 |       "cosine similarity: 0.63, angle: 50.83\n",
 748 |       "cosine similarity: 0.68, angle: 46.83\n",
 749 |       "cosine similarity: 0.74, angle: 42.54\n",
 750 |       "cosine similarity: 0.79, angle: 37.86\n",
 751 |       "cosine similarity: 0.84, angle: 32.64\n",
 752 |       "cosine similarity: 0.89, angle: 26.53\n",
 753 |       "cosine similarity: 0.95, angle: 18.67\n",
 754 |       "cosine similarity: 1.00, angle: 0.00\n"
 755 |      ]
 756 |     }
 757 |    ],
 758 |    "source": [
 759 |     "for sim in np.linspace(0, 1, num=20):\n",
 760 |     "    angle = np.rad2deg(np.arccos(sim))\n",
 761 |     "    print(\n",
 762 |     "        \"cosine similarity: {:.2f}, angle: {:.2f}\".format(\n",
 763 |     "            sim, angle)\n",
 764 |     "    )"
 765 |    ]
 766 |   },
 767 |   {
 768 |    "cell_type": "code",
 769 |    "execution_count": 35,
 770 |    "metadata": {
 771 |     "collapsed": true
 772 |    },
 773 |    "outputs": [],
 774 |    "source": [
 775 |     "# reset vectorizers to default values\n",
 776 |     "cv = CountVectorizer() \n",
 777 |     "X_cv = cv.fit_transform(all_docs)\n",
 778 |     "tv = TfidfVectorizer()\n",
 779 |     "X_tv = tv.fit_transform(all_docs)"
 780 |    ]
 781 |   },
 782 |   {
 783 |    "cell_type": "code",
 784 |    "execution_count": 36,
 785 |    "metadata": {
 786 |     "collapsed": false,
 787 |     "slideshow": {
 788 |      "slide_type": "-"
 789 |     }
 790 |    },
 791 |    "outputs": [
 792 |     {
 793 |      "data": {
 794 |       "text/plain": [
 795 |        "'\"With all this stuff going down at the moment with MJ i\\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\\'s feeling towards the press and also the obvious message of drugs are bad m\\'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.The actual feature film bit when it finally starts is only on for 20 minutes or so excluding the Smooth Criminal sequence and Joe Pesci is convincing as a psychopathic all powerful drug lord. Why he wants MJ dead so bad is beyond me. Because MJ overheard his plans? Nah, Joe Pesci\\'s character ranted that he wanted people to know it is he who is supplying drugs etc so i dunno, maybe he just hates MJ\\'s music.Lots of cool things in this like MJ turning into a car and a robot and the whole Speed Demon sequence. Also, the director must have had the patience of a saint when it came to filming the kiddy Bad sequence as usually directors hate working with one kid let alone a whole bunch of them performing a complex dance scene.Bottom line, this movie is for people who like MJ on one level or another (which i think is most people). If not, then stay away. It does try and give off a wholesome message and ironically MJ\\'s bestest buddy in this movie is a girl! Michael Jackson is truly one of the most talented people ever to grace this planet but is he guilty? Well, with all the attention i\\'ve gave this subject....hmmm well i don\\'t know because people can be different behind closed doors, i know this for a fact. He is either an extremely nice but stupid guy or one of the most sickest liars. I hope he is not the latter.\"'"
 796 |       ]
 797 |      },
 798 |      "execution_count": 36,
 799 |      "metadata": {},
 800 |      "output_type": "execute_result"
 801 |     }
 802 |    ],
 803 |    "source": [
 804 |     "lookup[0]"
 805 |    ]
 806 |   },
 807 |   {
 808 |    "cell_type": "code",
 809 |    "execution_count": 40,
 810 |    "metadata": {
 811 |     "collapsed": false,
 812 |     "slideshow": {
 813 |      "slide_type": "-"
 814 |     }
 815 |    },
 816 |    "outputs": [
 817 |     {
 818 |      "name": "stdout",
 819 |      "output_type": "stream",
 820 |      "text": [
 821 |       "and: count value = 10 versus tfidf value = 0.0869\n"
 822 |      ]
 823 |     }
 824 |    ],
 825 |    "source": [
 826 |     "word = \"and\"\n",
 827 |     "doc_idx_to_review = 0\n",
 828 |     "c_vector = get_one_row(X_cv, doc_idx_to_review)\n",
 829 |     "t_vector = get_one_row(X_tv, doc_idx_to_review) \n",
 830 |     "idx = cv_vocab.index(word)\n",
 831 |     "c = c_vector[idx]\n",
 832 |     "t = t_vector[idx]\n",
 833 |     "print(\"{}: count value = {} versus tfidf value = {:.4f}\".format(word, c, t))"
 834 |    ]
 835 |   },
 836 |   {
 837 |    "cell_type": "code",
 838 |    "execution_count": 41,
 839 |    "metadata": {
 840 |     "collapsed": false,
 841 |     "slideshow": {
 842 |      "slide_type": "-"
 843 |     }
 844 |    },
 845 |    "outputs": [
 846 |     {
 847 |      "name": "stdout",
 848 |      "output_type": "stream",
 849 |      "text": [
 850 |       "pesci: count value = 2 versus tfidf value = 0.1203\n"
 851 |      ]
 852 |     }
 853 |    ],
 854 |    "source": [
 855 |     "word = \"pesci\"\n",
 856 |     "doc_idx_to_review = 0\n",
 857 |     "c_vector = get_one_row(X_cv, doc_idx_to_review)\n",
 858 |     "t_vector = get_one_row(X_tv, doc_idx_to_review) \n",
 859 |     "idx = cv_vocab.index(word)\n",
 860 |     "c = c_vector[idx]\n",
 861 |     "t = t_vector[idx]\n",
 862 |     "print(\"{}: count value = {} versus tfidf value = {:.4f}\".format(word, c, t))"
 863 |    ]
 864 |   },
 865 |   {
 866 |    "cell_type": "markdown",
 867 |    "metadata": {
 868 |     "slideshow": {
 869 |      "slide_type": "-"
 870 |     }
 871 |    },
 872 |    "source": [
 873 |     "### Most/Least similar documents"
 874 |    ]
 875 |   },
 876 |   {
 877 |    "cell_type": "code",
 878 |    "execution_count": 18,
 879 |    "metadata": {
 880 |     "collapsed": true,
 881 |     "slideshow": {
 882 |      "slide_type": "-"
 883 |     }
 884 |    },
 885 |    "outputs": [],
 886 |    "source": [
 887 |     "cos_sim_matrix = utils.load_matrix_from_csv(\"tfidf_cos_matrix.csv\")"
 888 |    ]
 889 |   },
 890 |   {
 891 |    "cell_type": "code",
 892 |    "execution_count": 19,
 893 |    "metadata": {
 894 |     "collapsed": false,
 895 |     "slideshow": {
 896 |      "slide_type": "-"
 897 |     }
 898 |    },
 899 |    "outputs": [
 900 |     {
 901 |      "data": {
 902 |       "text/plain": [
 903 |        "(array([125, 166]), array([166, 125]))"
 904 |       ]
 905 |      },
 906 |      "execution_count": 19,
 907 |      "metadata": {},
 908 |      "output_type": "execute_result"
 909 |     }
 910 |    ],
 911 |    "source": [
 912 |     "# index of highest similarity\n",
 913 |     "np.where(cos_sim_matrix == np.nanmax(cos_sim_matrix))"
 914 |    ]
 915 |   },
 916 |   {
 917 |    "cell_type": "code",
 918 |    "execution_count": 20,
 919 |    "metadata": {
 920 |     "collapsed": false,
 921 |     "slideshow": {
 922 |      "slide_type": "-"
 923 |     }
 924 |    },
 925 |    "outputs": [
 926 |     {
 927 |      "data": {
 928 |       "text/plain": [
 929 |        "'\"Zombie Review #3**Spoilers**Few films are actually \\\\\"so bad they\\'re good\\\\\", and Zombi 3 is not just bad, it\\'s wretchedly, unforgivably bad in so many ways that a whole new language may be needed just to describe them allMore than that, it\\'s a film credited to Lucio Fulci that even by his standards has absolutely no coherency, sense or reason. However we can\\'t blame Fulci as it wasn\\'t really directed by him but by Bruno Mattei, who doesn\\'t even have Fulci\\'s sense of style to help carry the film. Mattei seems to have brought little to the film but staggering ineptitude.So, I\\'m ashamed to say how much I enjoyed every worthless minute of Zombi 3. It has no redeeming features - in a genre known for thin characters, weak story, and lack of film making skill, Zombi 3 pushes the boat out but in doing so it\\'s even funnier than Nightmare City.The \\\\\"action\\\\\" starts when the \\\\\"Death 1\\\\\" gas is stolen from a military base, and damaged in the escape. Who is the thief, why did he steal it, and why did the US military think that creating cannibalistic legions of the living dead would be a good idea? All these questions and more will fail to be answered in Zombi 3....After hiding out at a hotel, the infected thief goes mad from all the green plastecine growing on his face before being tracked down by the army who somewhat foolishly decide the best way to dispose of his corpse will be to burn it, sending \\\\\"Death 1\\\\\" up into the atmosphere resulting in... zombie birds! Who then attack people and turn them into zombie people!!! (if zombies are cannibals, why don\\'t the zombie birds just attack other birds?)Then we meet our \\\\\"heroes\\\\\", a trio of horny GIs and a coachload of girls. There\\'s a couple of other guys with them too, but they\\'re not important - NO ONE is important here. You\\'ll be hard pressed to remember anyone\\'s face, let alone their name or find a reason to care about them. They end up hiding out at the same hotel as the thief (\\\\\"a week ago this place was buzzing with life, now it\\'s buzzing with flies!\\\\\") but there\\'s no escape from the undead.By this point you\\'ll either be completely sucked in or you\\'ll have turned the damned thing off. The script is so appalling even the greatest acting in the world couldn\\'t save it, so it\\'s just as well they have some of the worst - and not just the human characters, the zombie acting here is an all time low. There\\'s no consistancy in how the zombies behave - some shamble about in the time honored style, others engage in full on fist fights or charge around with machettes, not to mention the zombies who are still able to talk (a gimmick that gives the film it\\'s HORRIFYING TWIST ENDING). They die from gunshots to the chest (rather than the head) and even get knocked out by a good left-hook. How can you punch out a zombie???!!!!! In fact the emphasis on badly done 80s action often makes it resemble an episode of V...The zombies also spend a lot of time hiding, seemingly waiting for hours in ridiculous places on the chance some poor sap will pass by and get the fright of their life. They hide in bushes, in garages, in huts, on roofs, in the water, and even underneath pregnant women. At one point a zombie follows a woman up the stairs. To kill and eat her? No! To push her into the water, those zombies and their wacky sense of humour!There is plenty of gore though. Limbs are hacked, wounds ooze green pus, and there\\'s much in the way of flesh eating and people getting their faces mushed in. There\\'s nothing to match the originals eyeball piercing, but if bad make up effects are your bag you won\\'t be let down.All this and I\\'ve not even mentioned the awful music, the inexplicable flying zombie head, the scientist whose acting actually manages to stand out as REALLY bad, or the final chilling punchline.... in an ingenious twist on the originals radio station being overrun by zombies, Zombi 3 gives us an actual zombie DJ!! \\\\\"He\\'s gone over to their side!\\\\\" our escaping hero\\'s cry, before vowing to continue fighting against the undead in a sequel that sadly never came.Zombi 3 is rubbish - it would be no loss to the world if every single print was destroyed and all records of it\\'s existence erased, yet somehow I feel my life is richer for having seen it.Did I say richer? I meant 88 minutes shorter...\"'"
 930 |       ]
 931 |      },
 932 |      "execution_count": 20,
 933 |      "metadata": {},
 934 |      "output_type": "execute_result"
 935 |     }
 936 |    ],
 937 |    "source": [
 938 |     "lookup[125]"
 939 |    ]
 940 |   },
 941 |   {
 942 |    "cell_type": "code",
 943 |    "execution_count": 21,
 944 |    "metadata": {
 945 |     "collapsed": false
 946 |    },
 947 |    "outputs": [
 948 |     {
 949 |      "data": {
 950 |       "text/plain": [
 951 |        "'\"Title: Zombie 3 (1988) Directors: Mostly Lucio Fulci, but also Claudio Fragasso and Bruno Mattei Cast: Ottaviano DellAcqua, Massimo Vani, Beatrice Ring, Deran Serafin Review: To review this flick and get some good background of it, I gotta start by the beginning. And the beginning of this is really George Romeros Dawn of the Dead. When Dawn came out in 79, Lucio Fulci decided to make an indirect sequel to it and call it Zombie 2. That film is the one we know as plain ole Zombie. You know the one in which the zombie fights with the shark! OK so, after that flick (named Zombie 2 in Italy) came out and made a huge chunk of cash, the Italians decided, heck. Lets make some more zombie flicks! These things are raking in the dough! So Zombie 3 was born. Confused yet? The story on this one is really just a rehash of stories we\\'ve seen in a lot of American zombie flicks that we have seen before this one, the best comparison that comes to mind is Return of the Living Dead. Lets see...there\\'s the government making experiments with a certain toxic gas that will turn people into zombies. Canister gets released into the general population and shebang! We get loads of zombies yearning for human flesh. A bunch of people start running away from the zombies and end up in an old abandoned hotel. They gotta fight the zombies to survive.There was a lot of trouble during the filming of this movie. First and foremost, Lucio Fulci the beloved godfather of gore from Italy was sick. So he couldn\\'t really finish this film the way that he wanted to. The film was then handed down to two lesser directors Bruno Mattei (Hell of the Living Dead) and Claudio Fragasso (Zombie 4). They did their best to spice up a film that was already not so good. You see Fulci himself didn\\'t really have his heart and soul on this flick. He was disenchanted with it. He gave the flick over to the producers and basically said: \\\\\"Do whatever the hell you want with it!\\\\\" And god love them, they did.And that is why ladies and gents we have such a crappy zombie flick with the great Fulci credited as its \\\\\"director\\\\\". The main problem in my opinion is that its just such a pointless bore! There\\'s no substance to it whatsoever! After the first few minutes in which some terrorists steal the toxic gas and accidentally release it, the rest of the flick is just a bunch of empty soulless characters with no personality whatsoever running from the zombies. Now in some cases this can prove to be fun, if #1 the zombie make up and zombie action is actually good and fun and #2 there\\'s a lot of gore and guts involved.Here we get neither! Well there\\'s some inspired moments in there, like for example when some eagles get infected by the gas and they start attacking people. That was cool. There\\'s also a scene involving a flying zombie head (wich by the way defies all logic and explanation) and a scene with zombies coming out of the pool of the abandoned hotel and munching off a poor girls legs. But aside from that...the rest of the flick just falls flat on its ass.Endless upon endless scenes that don\\'t do jack to move the already non existent plot along. That was my main gripe with this flick. The sets look unfinished and the art direction is practically non-existent. I hate it when everything looks so damn unfinished! I like my b-movies, but this one just really went even below that! Its closer to a z-level flick, if you ask me.The zombie make up? Pure crap. The zombies are all Asian actors (the movie was filmed in the Philippines) so you get a bunch of Asian looking zombies. But thats not a big problem since they movie was set in the phillipine islands anyway. Its the look of the zombies that really sucks! They all died with the same clothes on for some reason. And what passes for zombie make up here is a bunch of black make up (more like smudges) on their faces. One or two zombies had slightly more complex make up, but it still wasn\\'t good enough to impress. Its just a bunch of goo pointlessly splattered on the actors faces. So not only is this flick slowly paced but the zombies look like crap. These are supposed to be dead folks! Anyhows, for those expecting the usual coolness in a Fulci flick don\\'t come expecting it here cause this is mostly somebody else\\'s flick. And those two involved (Mattei and Fragasso) didn\\'t really put there heart and souls into it. In fact, when you see the extras on the DVD you will see that when Fragasso is asked about his recollections and his feelings on this here flick, he doesn\\'t even take it to seriously. You can tell he is ashamed of it and in many occasions he says they \\\\\"just had a job to do and they did it\\\\\". And that my friends, is the last nail on this flick. There\\'s no love, and no heart put into making this film. Therefore you get a half assed, crappy zombie flick.Only for completest or people who want to have or see every zombie flick ever made. Everybody else, don\\'t even bother! Rating: 1 out of 5\"'"
 952 |       ]
 953 |      },
 954 |      "execution_count": 21,
 955 |      "metadata": {},
 956 |      "output_type": "execute_result"
 957 |     }
 958 |    ],
 959 |    "source": [
 960 |     "lookup[166]"
 961 |    ]
 962 |   },
 963 |   {
 964 |    "cell_type": "code",
 965 |    "execution_count": 22,
 966 |    "metadata": {
 967 |     "collapsed": false,
 968 |     "slideshow": {
 969 |      "slide_type": "-"
 970 |     }
 971 |    },
 972 |    "outputs": [
 973 |     {
 974 |      "name": "stdout",
 975 |      "output_type": "stream",
 976 |      "text": [
 977 |       "{'more', 'actually', 'that', 'just', 'also', 'don', 'but', 'they', 'about', 've', 'fact', 'human', 'be', 'my', 'there', 'up', 'sequel', 'who', 'are', 'yet', 'with', 'doesn', 'their', 'dead', 'credited', 'of', 'as', 'was', 'this', 'best', 'an', 'all', 'reason', 'head', 'out', 'seen', 'really', 'why', 'came', 'lot', 'many', 'the', 'no', 'here', 'hotel', 'story', 'flying', 'we', 'making', 'can', 'his', 'on', 'make', 'lucio', 'few', 'get', 'people', 'he', 'action', 'have', 'way', 'ashamed', 'for', 'review', 'not', 'wasn', 'some', 'or', 'minutes', 'steal', 'poor', 'turn', 'when', 'fulci', 'down', 'you', 'and', 'fights', 'mattei', 'to', 'after', 'those', 'every', 'well', 'from', 'faces', 'is', 'off', 'did', 'still', 'even', 'bruno', 'before', 'girls', 'in', 'if', 'zombies', 'by', 'same', 'living', 'flesh', 'then', 'over', 'zombie', 'end', 'them', 'now', 'gore', 'characters', 'will', 'good', 'film', 'infected', 'one', 'gas', 'it', 'into', 'couldn', 'these', 'so'}\n"
 978 |      ]
 979 |     }
 980 |    ],
 981 |    "source": [
 982 |     "# words present in both reviews\n",
 983 |     "print(set(tv_words_per_doc[166]).intersection(set(tv_words_per_doc[125])))"
 984 |    ]
 985 |   },
 986 |   {
 987 |    "cell_type": "code",
 988 |    "execution_count": 23,
 989 |    "metadata": {
 990 |     "collapsed": false,
 991 |     "slideshow": {
 992 |      "slide_type": "-"
 993 |     }
 994 |    },
 995 |    "outputs": [
 996 |     {
 997 |      "name": "stdout",
 998 |      "output_type": "stream",
 999 |      "text": [
1000 |       "least similar documents have cosine similarity of 0.0\n"
1001 |      ]
1002 |     },
1003 |     {
1004 |      "data": {
1005 |       "text/plain": [
1006 |        "(array([ 10,  10, 242, 319, 401, 404, 456, 456, 512, 512, 512, 512, 512,\n",
1007 |        "        512, 512, 621, 692, 713]),\n",
1008 |        " array([456, 512, 512, 456, 512, 512,  10, 319,  10, 242, 401, 404, 621,\n",
1009 |        "        692, 713, 512, 512, 512]))"
1010 |       ]
1011 |      },
1012 |      "execution_count": 23,
1013 |      "metadata": {},
1014 |      "output_type": "execute_result"
1015 |     }
1016 |    ],
1017 |    "source": [
1018 |     "# index of lowest similarity\n",
1019 |     "lowest_similarity = np.nanmin(cos_sim_matrix)\n",
1020 |     "print(\"least similar documents have cosine similarity of {}\".format(lowest_similarity))\n",
1021 |     "np.where(cos_sim_matrix == lowest_similarity)"
1022 |    ]
1023 |   },
1024 |   {
1025 |    "cell_type": "code",
1026 |    "execution_count": 24,
1027 |    "metadata": {
1028 |     "collapsed": false,
1029 |     "slideshow": {
1030 |      "slide_type": "-"
1031 |     }
1032 |    },
1033 |    "outputs": [
1034 |     {
1035 |      "name": "stdout",
1036 |      "output_type": "stream",
1037 |      "text": [
1038 |       "set()\n"
1039 |      ]
1040 |     }
1041 |    ],
1042 |    "source": [
1043 |     "# words present in both reviews\n",
1044 |     "print(set(tv_words_per_doc[10]).intersection(set(tv_words_per_doc[512])))"
1045 |    ]
1046 |   },
1047 |   {
1048 |    "cell_type": "markdown",
1049 |    "metadata": {},
1050 |    "source": [
1051 |     "### Problems with Bag-of-words"
1052 |    ]
1053 |   },
1054 |   {
1055 |    "cell_type": "code",
1056 |    "execution_count": 25,
1057 |    "metadata": {
1058 |     "collapsed": false,
1059 |     "slideshow": {
1060 |      "slide_type": "-"
1061 |     }
1062 |    },
1063 |    "outputs": [
1064 |     {
1065 |      "data": {
1066 |       "text/plain": [
1067 |        "'\"I loved the episode but seems to me there should have been some quick reference to the secretary getting punished for effectively being an accomplice after the fact. While I like when a episode of Columbo has an unpredictable twist like this one, its resolution should be part of the conclusion of the episode, along with the uncovering of the murderer.The interplay between Peter Falk and Ruth Gordon is priceless. At one point, Gordon, playing a famous writer, makes some comment about being flattered by the famous Lt. Columbo, making a tongue-in-cheek allusion to the detective\\'s real life fame as a crime-solver. This is one of the best of many great Columbo installments.\"'"
1068 |       ]
1069 |      },
1070 |      "execution_count": 25,
1071 |      "metadata": {},
1072 |      "output_type": "execute_result"
1073 |     }
1074 |    ],
1075 |    "source": [
1076 |     "lookup[45]"
1077 |    ]
1078 |   },
1079 |   {
1080 |    "cell_type": "code",
1081 |    "execution_count": 26,
1082 |    "metadata": {
1083 |     "collapsed": false,
1084 |     "slideshow": {
1085 |      "slide_type": "-"
1086 |     }
1087 |    },
1088 |    "outputs": [
1089 |     {
1090 |      "data": {
1091 |       "text/plain": [
1092 |        "'\"It is always satisfying when a detective wraps up a case and the criminal is brought to book. In this case the climax gives me even greater pleasure. To see the smug grin wiped off the face of Abigail Mitchell when she realises her victim has left \\\\\"deathbed testimony\\\\\" which leaves no doubt about her guilt is very satisfying.Please understand: while I admire Ruth Gordon\\'s performance, her character really, *really* irritates me. She is selfish and demanding. She gets her own way by putting on a simpering \\'little girl\\' act which is embarrassing in a woman of her age. Worse, she has now set herself up as judge, jury and executioner against her dead niece\\'s husband.When Columbo is getting too close she tries to unnerve him by manipulating him into making an off-the-cuff speech to an audience of high-class ladies. He turns the tables perfectly by delivering a very warm and humane speech about the realities of police work.Nothing can distract Columbo from the pursuit of justice. Abby\\'s final appeal to his good nature is rejected because he has too much self-respect not to do his job well. Here is one situation you can\\'t squirm out of Ms Mitchell!\"'"
1093 |       ]
1094 |      },
1095 |      "execution_count": 26,
1096 |      "metadata": {},
1097 |      "output_type": "execute_result"
1098 |     }
1099 |    ],
1100 |    "source": [
1101 |     "lookup[433]"
1102 |    ]
1103 |   },
1104 |   {
1105 |    "cell_type": "code",
1106 |    "execution_count": 27,
1107 |    "metadata": {
1108 |     "collapsed": false,
1109 |     "slideshow": {
1110 |      "slide_type": "-"
1111 |     }
1112 |    },
1113 |    "outputs": [
1114 |     {
1115 |      "name": "stdout",
1116 |      "output_type": "stream",
1117 |      "text": [
1118 |       "{'while', 'about', 'the', 'getting', 'ruth', 'in', 'by', 'gordon', 'when', 'and', 'me', 'columbo', 'making', 'detective', 'to', 'one', 'of', 'has', 'as', 'is', 'this', 'an'}\n"
1119 |      ]
1120 |     }
1121 |    ],
1122 |    "source": [
1123 |     "print(set(tv_words_per_doc[45]).intersection(set(tv_words_per_doc[433])))"
1124 |    ]
1125 |   },
1126 |   {
1127 |    "cell_type": "code",
1128 |    "execution_count": 28,
1129 |    "metadata": {
1130 |     "collapsed": false,
1131 |     "slideshow": {
1132 |      "slide_type": "-"
1133 |     }
1134 |    },
1135 |    "outputs": [
1136 |     {
1137 |      "data": {
1138 |       "text/plain": [
1139 |        "0.20648822212468901"
1140 |       ]
1141 |      },
1142 |      "execution_count": 28,
1143 |      "metadata": {},
1144 |      "output_type": "execute_result"
1145 |     }
1146 |    ],
1147 |    "source": [
1148 |     "cos_sim(get_one_row(X_tv, 45), get_one_row(X_tv, 433))"
1149 |    ]
1150 |   },
1151 |   {
1152 |    "cell_type": "code",
1153 |    "execution_count": 29,
1154 |    "metadata": {
1155 |     "collapsed": false,
1156 |     "slideshow": {
1157 |      "slide_type": "-"
1158 |     }
1159 |    },
1160 |    "outputs": [
1161 |     {
1162 |      "data": {
1163 |       "text/plain": [
1164 |        "<999x18373 sparse matrix of type '<class 'numpy.int64'>'\n",
1165 |        "\twith 137082 stored elements in Compressed Sparse Row format>"
1166 |       ]
1167 |      },
1168 |      "execution_count": 29,
1169 |      "metadata": {},
1170 |      "output_type": "execute_result"
1171 |     }
1172 |    ],
1173 |    "source": [
1174 |     "X_cv"
1175 |    ]
1176 |   },
1177 |   {
1178 |    "cell_type": "markdown",
1179 |    "metadata": {},
1180 |    "source": [
1181 |     "![bag_of_words_problem](images/bag_of_words_problem.png)"
1182 |    ]
1183 |   },
1184 |   {
1185 |    "cell_type": "markdown",
1186 |    "metadata": {
1187 |     "slideshow": {
1188 |      "slide_type": "-"
1189 |     }
1190 |    },
1191 |    "source": [
1192 |     "#### Problems with Bag-of-words:\n",
1193 |     "\n",
1194 |     " - same concepts, different words don't appear similar\n",
1195 |     " - sparse matrix the size of *vocabulary*\n",
1196 |     " - two different sentences, same embedding\n",
1197 |     " \n",
1198 |     "### So can we do better?"
1199 |    ]
1200 |   },
1201 |   {
1202 |    "cell_type": "markdown",
1203 |    "metadata": {
1204 |     "slideshow": {
1205 |      "slide_type": "-"
1206 |     }
1207 |    },
1208 |    "source": [
1209 |     "## Embedding a Document\n",
1210 |     "### Neural Networks"
1211 |    ]
1212 |   },
1213 |   {
1214 |    "cell_type": "markdown",
1215 |    "metadata": {
1216 |     "collapsed": true,
1217 |     "slideshow": {
1218 |      "slide_type": "-"
1219 |     }
1220 |    },
1221 |    "source": [
1222 |     "![recurrent](images/recurrent.png)\n",
1223 |     "http://colah.github.io/posts/2015-08-Understanding-LSTMs/"
1224 |    ]
1225 |   },
1226 |   {
1227 |    "cell_type": "markdown",
1228 |    "metadata": {
1229 |     "collapsed": true,
1230 |     "slideshow": {
1231 |      "slide_type": "-"
1232 |     }
1233 |    },
1234 |    "source": [
1235 |     "![cnn](images/cnn.png)\n",
1236 |     "http://www.wildml.com/2015/11/understanding-convolutional-neural-networks-for-nlp/"
1237 |    ]
1238 |   },
1239 |   {
1240 |    "cell_type": "markdown",
1241 |    "metadata": {
1242 |     "collapsed": true,
1243 |     "slideshow": {
1244 |      "slide_type": "-"
1245 |     }
1246 |    },
1247 |    "source": [
1248 |     "![dan](images/dan.png)\n",
1249 |     "https://cs.umd.edu/~miyyer/pubs/2015_acl_dan.pdf"
1250 |    ]
1251 |   },
1252 |   {
1253 |    "cell_type": "markdown",
1254 |    "metadata": {
1255 |     "slideshow": {
1256 |      "slide_type": "-"
1257 |     }
1258 |    },
1259 |    "source": [
1260 |     "![bow](images/bag_of_words_performance.png)"
1261 |    ]
1262 |   },
1263 |   {
1264 |    "cell_type": "markdown",
1265 |    "metadata": {},
1266 |    "source": [
1267 |     "![seq2seq](images/seq2seq.png)\n",
1268 |     "http://www.wildml.com/2016/04/deep-learning-for-chatbots-part-1-introduction/"
1269 |    ]
1270 |   },
1271 |   {
1272 |    "cell_type": "markdown",
1273 |    "metadata": {},
1274 |    "source": [
1275 |     "![image_captioning](images/image_captioning.png)\n",
1276 |     "https://www.researchgate.net/profile/Y_Bengio/publication/277411157_Deep_Learning/links/55e0cdf908ae2fac471ccf0f/Deep-Learning.pdf"
1277 |    ]
1278 |   },
1279 |   {
1280 |    "cell_type": "markdown",
1281 |    "metadata": {
1282 |     "slideshow": {
1283 |      "slide_type": "-"
1284 |     }
1285 |    },
1286 |    "source": [
1287 |     "## Resources\n",
1288 |     "[Stanford IR book, online](http://nlp.stanford.edu/IR-book/html/htmledition/) <br>\n",
1289 |     "[Bag of Words Meets Bags of Popcorn (Kaggle)](https://www.kaggle.com/c/word2vec-nlp-tutorial) <br>\n",
1290 |     "[Neural Networks for NLP](https://arxiv.org/pdf/1510.00726.pdf) <br>\n",
1291 |     "[Blog about LSTM's](http://colah.github.io/posts/2015-08-Understanding-LSTMs/) <br>\n",
1292 |     "[Blog about CNN's](http://www.wildml.com/2015/11/understanding-convolutional-neural-networks-for-nlp/) <br>\n",
1293 |     "[Examples of Recurrent Neural Networks](http://karpathy.github.io/2015/05/21/rnn-effectiveness/) \n",
1294 |     "[Recent Talk by C. Manning about Embedding Words and Documents](https://simons.berkeley.edu/talks/christopher-manning-2017-3-27)"
1295 |    ]
1296 |   }
1297 |  ],
1298 |  "metadata": {
1299 |   "kernelspec": {
1300 |    "display_name": "Python 3",
1301 |    "language": "python",
1302 |    "name": "python3"
1303 |   },
1304 |   "language_info": {
1305 |    "codemirror_mode": {
1306 |     "name": "ipython",
1307 |     "version": 3
1308 |    },
1309 |    "file_extension": ".py",
1310 |    "mimetype": "text/x-python",
1311 |    "name": "python",
1312 |    "nbconvert_exporter": "python",
1313 |    "pygments_lexer": "ipython3",
1314 |    "version": "3.5.2"
1315 |   }
1316 |  },
1317 |  "nbformat": 4,
1318 |  "nbformat_minor": 0
1319 | }
1320 | 


--------------------------------------------------------------------------------
/Gigaword_pruned_vectors.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/Gigaword_pruned_vectors.txt.gz


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # NLP Basics
 2 | 
 3 | ## Preparation
 4 | 
 5 | You can clone this repository:
 6 | 
 7 | ```
 8 | git@github.com:michaelcapizzi/nlp-basics.git
 9 | ```
10 | 
11 | ## Docker install
12 | 
13 | You can run the Jupyter notebooks in this repository with [Docker](http://docs.docker.com/installation) by running
14 | 
15 | ```
16 | % docker build -t michaelcapizzi/nlp-basics .
17 | % docker run -p 8888:8888 --rm -it michaelcapizzi/nlp-basics # to start a Jupyter notebook server
18 | [I 19:41:11.459 NotebookApp] Writing notebook server cookie secret to /home/jovyan/.local/share/jupyter/runtime/notebook_cookie_secret
19 | [W 19:41:11.591 NotebookApp] Widgets are unavailable. Please install widgetsnbextension or ipywidgets 4.0
20 | [W 19:41:11.598 NotebookApp] WARNING: The notebook server is listening on all IP addresses and not using encryption. This is not recommended.
21 | [I 19:41:11.742 NotebookApp] JupyterLab alpha preview extension loaded from /opt/conda/lib/python3.5/site-packages/jupyterlab
22 | [I 19:41:11.802 NotebookApp] Serving notebooks from local directory: /code
23 | [I 19:41:11.802 NotebookApp] 0 active kernels 
24 | [I 19:41:11.802 NotebookApp] The Jupyter Notebook is running at: http://[all ip addresses on your system]:8888/?token=f6925975b83f14758e79c55f81f1bec1267300747d5d6b08
25 | [I 19:41:11.802 NotebookApp] Use Control-C to stop this server and shut down all kernels (twice to skip confirmation).
26 | [C 19:41:11.803 NotebookApp] 
27 |     
28 |     Copy/paste this URL into your browser when you connect for the first time,
29 |     to login with a token:
30 |         http://localhost:8888/?token=f6925975b83f14758e79c55f81f1bec1267300747d5d6b08
31 | ```
32 | 
33 | Your specific token will be different.
34 | 
35 | ## Manual pip / virtualenv install
36 | 
37 | To run the `jupyter` notebook you'll need a `python` environment for `python 3` with the following requirements:
38 | 
39 |  - jupyter
40 |  - gensim
41 |  - sklearn
42 |  - numpy
43 |  - beautifulsoup4
44 | 
45 | All of these can be installed via `pip` or using the `requirements.txt` file:
46 | 
47 | ```
48 | pip install -r requirements.txt
49 | ```
50 | 
51 | Then to open the notebook, simply run the following in the root folder of the cloned project:
52 | 
53 | ```
54 | jupyter notebook
55 | ```
56 | 
57 | This will open a new window in your default browser.  You can then open the notebook file of choice (ending in `.ipynb`) by clicking on it.
58 | 
59 | It will open in a new window.
60 | 
61 | You can edit a given cell by clicking on it.  To run the cell, push `CTRL-c`.
62 | 
63 | 
64 | 
65 | 
66 | 


--------------------------------------------------------------------------------
/Word-Embeddings_Demo.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "code",
   5 |    "execution_count": 1,
   6 |    "metadata": {
   7 |     "collapsed": false,
   8 |     "slideshow": {
   9 |      "slide_type": "-"
  10 |     }
  11 |    },
  12 |    "outputs": [
  13 |     {
  14 |      "data": {
  15 |       "text/plain": [
  16 |        "4"
  17 |       ]
  18 |      },
  19 |      "execution_count": 1,
  20 |      "metadata": {},
  21 |      "output_type": "execute_result"
  22 |     }
  23 |    ],
  24 |    "source": [
  25 |     "# using Jupyter notebooks\n",
  26 |     "# pushing CTRL-c will run the code in a cell\n",
  27 |     "2 + 2"
  28 |    ]
  29 |   },
  30 |   {
  31 |    "cell_type": "markdown",
  32 |    "metadata": {
  33 |     "slideshow": {
  34 |      "slide_type": "-"
  35 |     }
  36 |    },
  37 |    "source": [
  38 |     "# Gentle Introduction to NLP through Word Embeddings"
  39 |    ]
  40 |   },
  41 |   {
  42 |    "cell_type": "markdown",
  43 |    "metadata": {
  44 |     "slideshow": {
  45 |      "slide_type": "-"
  46 |     }
  47 |    },
  48 |    "source": [
  49 |     "![NLP](images/NLP.png)"
  50 |    ]
  51 |   },
  52 |   {
  53 |    "cell_type": "markdown",
  54 |    "metadata": {
  55 |     "slideshow": {
  56 |      "slide_type": "-"
  57 |     }
  58 |    },
  59 |    "source": [
  60 |     "# How To Tell If Two Words Are \"Similar\"?"
  61 |    ]
  62 |   },
  63 |   {
  64 |    "cell_type": "markdown",
  65 |    "metadata": {
  66 |     "slideshow": {
  67 |      "slide_type": "-"
  68 |     }
  69 |    },
  70 |    "source": [
  71 |     "![distance](images/distance_measures.png)\n",
  72 |     "http://dataaspirant.com/2015/04/11/five-most-popular-similarity-measures-implementation-in-python/"
  73 |    ]
  74 |   },
  75 |   {
  76 |    "cell_type": "markdown",
  77 |    "metadata": {
  78 |     "slideshow": {
  79 |      "slide_type": "-"
  80 |     }
  81 |    },
  82 |    "source": [
  83 |     "# Cosine Similarity"
  84 |    ]
  85 |   },
  86 |   {
  87 |    "cell_type": "markdown",
  88 |    "metadata": {
  89 |     "collapsed": true,
  90 |     "slideshow": {
  91 |      "slide_type": "-"
  92 |     }
  93 |    },
  94 |    "source": [
  95 |     "![cos_sim](images/cos_sim.png)\n",
  96 |     "http://dataaspirant.com/2015/04/11/five-most-popular-similarity-measures-implementation-in-python/"
  97 |    ]
  98 |   },
  99 |   {
 100 |    "cell_type": "markdown",
 101 |    "metadata": {
 102 |     "slideshow": {
 103 |      "slide_type": "-"
 104 |     }
 105 |    },
 106 |    "source": [
 107 |     "## calculating dot product\n",
 108 |     "$vector_a = [1,2,3]$ <br>\n",
 109 |     "$vector_b = [4,5,6]$ <br>\n",
 110 |     "$vector_a \\cdot vector_b = (1*4) + (2*5) + (3*6) = 4 + 10 + 18 = 32$ "
 111 |    ]
 112 |   },
 113 |   {
 114 |    "cell_type": "markdown",
 115 |    "metadata": {
 116 |     "slideshow": {
 117 |      "slide_type": "-"
 118 |     }
 119 |    },
 120 |    "source": [
 121 |     "## Normalizing a Vector"
 122 |    ]
 123 |   },
 124 |   {
 125 |    "cell_type": "markdown",
 126 |    "metadata": {
 127 |     "slideshow": {
 128 |      "slide_type": "-"
 129 |     }
 130 |    },
 131 |    "source": [
 132 |     "To normalize a vector, we shrink all values so they fall between $0$ and $1$.\n",
 133 |     "\n",
 134 |     "$vector_{normalized} = \\frac{vector}{\\sqrt{vector \\cdot vector}}$ \n",
 135 |     "![normalize](images/normalize.jpg)\n",
 136 |     "\n",
 137 |     "http://www.wikihow.com/Normalize-a-Vector"
 138 |    ]
 139 |   },
 140 |   {
 141 |    "cell_type": "markdown",
 142 |    "metadata": {
 143 |     "slideshow": {
 144 |      "slide_type": "-"
 145 |     }
 146 |    },
 147 |    "source": [
 148 |     "![unit_circle](images/unit_circle.png)\n",
 149 |     "https://en.wikipedia.org/wiki/Unit_vector"
 150 |    ]
 151 |   },
 152 |   {
 153 |    "cell_type": "code",
 154 |    "execution_count": 2,
 155 |    "metadata": {
 156 |     "collapsed": true,
 157 |     "slideshow": {
 158 |      "slide_type": "-"
 159 |     }
 160 |    },
 161 |    "outputs": [],
 162 |    "source": [
 163 |     "import numpy as np\n",
 164 |     "from nltk.corpus import wordnet\n",
 165 |     "from collections import OrderedDict\n",
 166 |     "from itertools import combinations\n",
 167 |     "import string\n",
 168 |     "from gensim import models"
 169 |    ]
 170 |   },
 171 |   {
 172 |    "cell_type": "code",
 173 |    "execution_count": 3,
 174 |    "metadata": {
 175 |     "collapsed": true,
 176 |     "slideshow": {
 177 |      "slide_type": "-"
 178 |     }
 179 |    },
 180 |    "outputs": [],
 181 |    "source": [
 182 |     "def normalize_vector(vector):\n",
 183 |     "    \"\"\"\n",
 184 |     "    Normalizes a vector so that all its values are between 0 and 1\n",
 185 |     "    :param vector: a `numpy` vector\n",
 186 |     "    :return: a normalized `numpy` vector\n",
 187 |     "    \"\"\"\n",
 188 |     "    # norm = np.sqrt(vector.dot(vector))\n",
 189 |     "    # numpy has a built in function\n",
 190 |     "    norm = np.linalg.norm(vector)\n",
 191 |     "    if norm:\n",
 192 |     "        return vector / norm\n",
 193 |     "    else:\n",
 194 |     "        # if norm == 0, then original vector was all 0s\n",
 195 |     "        return vector"
 196 |    ]
 197 |   },
 198 |   {
 199 |    "cell_type": "code",
 200 |    "execution_count": 4,
 201 |    "metadata": {
 202 |     "collapsed": false,
 203 |     "slideshow": {
 204 |      "slide_type": "-"
 205 |     }
 206 |    },
 207 |    "outputs": [
 208 |     {
 209 |      "name": "stdout",
 210 |      "output_type": "stream",
 211 |      "text": [
 212 |       "original vector [1 2 4]\n",
 213 |       "normalized vector [ 0.21821789  0.43643578  0.87287156]\n"
 214 |      ]
 215 |     }
 216 |    ],
 217 |    "source": [
 218 |     "vector_3d = np.array([1,2,4])\n",
 219 |     "print(\"original vector\", vector_3d)\n",
 220 |     "print(\"normalized vector\", normalize_vector(vector_3d))\n",
 221 |     "#0.218 is 1/4th of .873 just like 1 is 1/4th of 4"
 222 |    ]
 223 |   },
 224 |   {
 225 |    "cell_type": "markdown",
 226 |    "metadata": {
 227 |     "slideshow": {
 228 |      "slide_type": "-"
 229 |     }
 230 |    },
 231 |    "source": [
 232 |     "## Calculating Cosine Similarity"
 233 |    ]
 234 |   },
 235 |   {
 236 |    "cell_type": "code",
 237 |    "execution_count": 5,
 238 |    "metadata": {
 239 |     "collapsed": true,
 240 |     "slideshow": {
 241 |      "slide_type": "-"
 242 |     }
 243 |    },
 244 |    "outputs": [],
 245 |    "source": [
 246 |     "def cos_sim(vector_one, vector_two):\n",
 247 |     "    \"\"\"\n",
 248 |     "    Calculate the cosine similarity of two `numpy` vectors\n",
 249 |     "    :param vector_one: a `numpy` vector\n",
 250 |     "    :param vector_two: a `numpy` vector\n",
 251 |     "    :return: A score between 0 and 1\n",
 252 |     "    \"\"\"\n",
 253 |     "    # ensure that both vectors are already normalized\n",
 254 |     "    vector_one_norm = normalize_vector(vector_one)\n",
 255 |     "    vector_two_norm = normalize_vector(vector_two)\n",
 256 |     "    \n",
 257 |     "    # calculate the dot product between the two normalized vectors\n",
 258 |     "    return vector_one_norm.dot(vector_two_norm)"
 259 |    ]
 260 |   },
 261 |   {
 262 |    "cell_type": "code",
 263 |    "execution_count": 6,
 264 |    "metadata": {
 265 |     "collapsed": false,
 266 |     "slideshow": {
 267 |      "slide_type": "-"
 268 |     }
 269 |    },
 270 |    "outputs": [
 271 |     {
 272 |      "name": "stdout",
 273 |      "output_type": "stream",
 274 |      "text": [
 275 |       "cosine similarity of vector_one and vector_two 0.948683298051\n",
 276 |       "cosine similarity of vector_one and vector_three 0.904534033733\n",
 277 |       "cosine similarity of vector_one and vector_four 0.904534033733\n"
 278 |      ]
 279 |     }
 280 |    ],
 281 |    "source": [
 282 |     "vector_one = np.array([1,1,1,1,1])\n",
 283 |     "vector_two = np.array([1,1,1,1,2])\n",
 284 |     "vector_three = np.array([1,2,3,4,5])\n",
 285 |     "vector_four = np.array([10,20,30,40,50])\n",
 286 |     "\n",
 287 |     "print(\"cosine similarity of vector_one and vector_two\", cos_sim(vector_one, vector_two))\n",
 288 |     "print(\"cosine similarity of vector_one and vector_three\", cos_sim(vector_one, vector_three))\n",
 289 |     "print(\"cosine similarity of vector_one and vector_four\", cos_sim(vector_one, vector_four))"
 290 |    ]
 291 |   },
 292 |   {
 293 |    "cell_type": "markdown",
 294 |    "metadata": {
 295 |     "slideshow": {
 296 |      "slide_type": "-"
 297 |     }
 298 |    },
 299 |    "source": [
 300 |     "## Measuring the \"Similarity\" of Words"
 301 |    ]
 302 |   },
 303 |   {
 304 |    "cell_type": "markdown",
 305 |    "metadata": {
 306 |     "slideshow": {
 307 |      "slide_type": "-"
 308 |     }
 309 |    },
 310 |    "source": [
 311 |     "![cos_sim_compare](images/cos_sim_compare.png)\n",
 312 |     "https://medium.com/@camrongodbout/creating-a-search-engine-f2f429cab33c#.z7i9w8y5t"
 313 |    ]
 314 |   },
 315 |   {
 316 |    "cell_type": "markdown",
 317 |    "metadata": {
 318 |     "slideshow": {
 319 |      "slide_type": "-"
 320 |     }
 321 |    },
 322 |    "source": [
 323 |     "![vectorize](images/vectorize.png)"
 324 |    ]
 325 |   },
 326 |   {
 327 |    "cell_type": "markdown",
 328 |    "metadata": {
 329 |     "slideshow": {
 330 |      "slide_type": "-"
 331 |     }
 332 |    },
 333 |    "source": [
 334 |     "### Option 1: One-hot vectors\n",
 335 |     "\n",
 336 |     "![one_hot](images/one_hot.png)\n",
 337 |     "https://blog.acolyer.org/2016/04/21/the-amazing-power-of-word-vectors/"
 338 |    ]
 339 |   },
 340 |   {
 341 |    "cell_type": "code",
 342 |    "execution_count": 7,
 343 |    "metadata": {
 344 |     "collapsed": true,
 345 |     "slideshow": {
 346 |      "slide_type": "-"
 347 |     }
 348 |    },
 349 |    "outputs": [],
 350 |    "source": [
 351 |     "vocabulary = ['apple', 'banana', 'orange', 'cantaloupe', 'peach']"
 352 |    ]
 353 |   },
 354 |   {
 355 |    "cell_type": "code",
 356 |    "execution_count": 8,
 357 |    "metadata": {
 358 |     "collapsed": false,
 359 |     "slideshow": {
 360 |      "slide_type": "-"
 361 |     }
 362 |    },
 363 |    "outputs": [],
 364 |    "source": [
 365 |     "# generate vocabulary lookup\n",
 366 |     "def build_voc_lookup(list_of_voc):\n",
 367 |     "    \"\"\"\n",
 368 |     "    Generates a dictionary where the key is the word and the value is its index\n",
 369 |     "    :param list_of_voc: list of vocabulary words\n",
 370 |     "    :return: Dictionary of vocabulary\n",
 371 |     "    \"\"\"\n",
 372 |     "    lookup_dict = OrderedDict()\n",
 373 |     "    counter = 0\n",
 374 |     "    for word in list_of_voc:\n",
 375 |     "        lookup_dict[word] = counter\n",
 376 |     "        counter+=1\n",
 377 |     "    return lookup_dict"
 378 |    ]
 379 |   },
 380 |   {
 381 |    "cell_type": "code",
 382 |    "execution_count": 9,
 383 |    "metadata": {
 384 |     "collapsed": true,
 385 |     "slideshow": {
 386 |      "slide_type": "-"
 387 |     }
 388 |    },
 389 |    "outputs": [],
 390 |    "source": [
 391 |     "# lookup word\n",
 392 |     "def lookup_word(lookup_dict, word):\n",
 393 |     "    \"\"\" \n",
 394 |     "    Looks up a given word in the vocabulary dictionary, and returns None if word not in vocabulary\n",
 395 |     "    :param lookup_dict: lookup-dictionary built with build_voc_lookup()\n",
 396 |     "    :param word to index\n",
 397 |     "    :return: index of word in vocabulary or None\n",
 398 |     "    \"\"\"\n",
 399 |     "    if word in lookup_dict:\n",
 400 |     "        return lookup_dict[word]\n",
 401 |     "    else:\n",
 402 |     "        return None"
 403 |    ]
 404 |   },
 405 |   {
 406 |    "cell_type": "code",
 407 |    "execution_count": 10,
 408 |    "metadata": {
 409 |     "collapsed": false,
 410 |     "slideshow": {
 411 |      "slide_type": "-"
 412 |     }
 413 |    },
 414 |    "outputs": [
 415 |     {
 416 |      "name": "stdout",
 417 |      "output_type": "stream",
 418 |      "text": [
 419 |       "4\n",
 420 |       "None\n"
 421 |      ]
 422 |     }
 423 |    ],
 424 |    "source": [
 425 |     "lookup_dict = build_voc_lookup(vocabulary)\n",
 426 |     "print(lookup_word(lookup_dict, 'peach'))\n",
 427 |     "print(lookup_word(lookup_dict, 'hashbrown'))"
 428 |    ]
 429 |   },
 430 |   {
 431 |    "cell_type": "code",
 432 |    "execution_count": 11,
 433 |    "metadata": {
 434 |     "collapsed": false,
 435 |     "slideshow": {
 436 |      "slide_type": "-"
 437 |     }
 438 |    },
 439 |    "outputs": [],
 440 |    "source": [
 441 |     "# build one-hot vector for word\n",
 442 |     "def make_one_hot(lookup_dict, word):\n",
 443 |     "    \"\"\"\n",
 444 |     "    Builds a one-hot numpy vector for a word\n",
 445 |     "    :param lookup_dict: lookup-dictionary built with build_voc_lookup()\n",
 446 |     "    :param word: word to convert to one-hot\n",
 447 |     "    :return numpy vector with dimension equal to size of vocabulary\n",
 448 |     "    \"\"\"\n",
 449 |     "    # get size of vocabulary\n",
 450 |     "    voc_size = len(lookup_dict.items())\n",
 451 |     "    # initialize empty vector of zeros with the size of the vocabulary\n",
 452 |     "    one_hot = np.zeros((voc_size))\n",
 453 |     "    # get index of word (or None if not in vocabulary)\n",
 454 |     "    word_index = lookup_word(lookup_dict, word)\n",
 455 |     "    # make the nth dimension of one-hot (representing the index of word in vocabulary) to 1\n",
 456 |     "    if word_index or word_index == 0:\n",
 457 |     "        one_hot[word_index] = 1\n",
 458 |     "    # if word not in vocabulary, the one-hot will remain zeros\n",
 459 |     "    return one_hot"
 460 |    ]
 461 |   },
 462 |   {
 463 |    "cell_type": "code",
 464 |    "execution_count": 12,
 465 |    "metadata": {
 466 |     "collapsed": false,
 467 |     "slideshow": {
 468 |      "slide_type": "-"
 469 |     }
 470 |    },
 471 |    "outputs": [
 472 |     {
 473 |      "name": "stdout",
 474 |      "output_type": "stream",
 475 |      "text": [
 476 |       "one-hot vector for '      apple' [ 1.  0.  0.  0.  0.]\n",
 477 |       "one-hot vector for '     banana' [ 0.  1.  0.  0.  0.]\n",
 478 |       "one-hot vector for '     orange' [ 0.  0.  1.  0.  0.]\n",
 479 |       "one-hot vector for ' cantaloupe' [ 0.  0.  0.  1.  0.]\n",
 480 |       "one-hot vector for '      peach' [ 0.  0.  0.  0.  1.]\n",
 481 |       "one-hot vector for '  hashbrown' [ 0.  0.  0.  0.  0.]\n",
 482 |       "one-hot vector for '    Capizzi' [ 0.  0.  0.  0.  0.]\n"
 483 |      ]
 484 |     }
 485 |    ],
 486 |    "source": [
 487 |     "for word in vocabulary + ['hashbrown', 'Capizzi']:\n",
 488 |     "    print(\"one-hot vector for '{:>11}'\".format(word), make_one_hot(lookup_dict, word))"
 489 |    ]
 490 |   },
 491 |   {
 492 |    "cell_type": "markdown",
 493 |    "metadata": {
 494 |     "slideshow": {
 495 |      "slide_type": "-"
 496 |     }
 497 |    },
 498 |    "source": [
 499 |     "#### The problem with one-hot vectors"
 500 |    ]
 501 |   },
 502 |   {
 503 |    "cell_type": "code",
 504 |    "execution_count": 13,
 505 |    "metadata": {
 506 |     "collapsed": false,
 507 |     "slideshow": {
 508 |      "slide_type": "-"
 509 |     }
 510 |    },
 511 |    "outputs": [
 512 |     {
 513 |      "name": "stdout",
 514 |      "output_type": "stream",
 515 |      "text": [
 516 |       "cosine similarity between apple and banana 0.0\n",
 517 |       "cosine similarity between apple and orange 0.0\n",
 518 |       "cosine similarity between apple and cantaloupe 0.0\n",
 519 |       "cosine similarity between apple and peach 0.0\n",
 520 |       "cosine similarity between apple and Phoenix 0.0\n",
 521 |       "cosine similarity between banana and orange 0.0\n",
 522 |       "cosine similarity between banana and cantaloupe 0.0\n",
 523 |       "cosine similarity between banana and peach 0.0\n",
 524 |       "cosine similarity between banana and Phoenix 0.0\n",
 525 |       "cosine similarity between orange and cantaloupe 0.0\n",
 526 |       "cosine similarity between orange and peach 0.0\n",
 527 |       "cosine similarity between orange and Phoenix 0.0\n",
 528 |       "cosine similarity between cantaloupe and peach 0.0\n",
 529 |       "cosine similarity between cantaloupe and Phoenix 0.0\n",
 530 |       "cosine similarity between peach and Phoenix 0.0\n"
 531 |      ]
 532 |     }
 533 |    ],
 534 |    "source": [
 535 |     "# add an OOV word to vocabulary\n",
 536 |     "vocabulary_plus_oov = vocabulary + [\"Phoenix\"]\n",
 537 |     "# get all combinations\n",
 538 |     "all_combinations = combinations(vocabulary_plus_oov, 2)\n",
 539 |     "# iterate through all combinations and calculate cosine similarity\n",
 540 |     "for (word1, word2) in all_combinations:\n",
 541 |     "    one_hot_word_1 = make_one_hot(lookup_dict, word1)\n",
 542 |     "    one_hot_word_2 = make_one_hot(lookup_dict, word2)\n",
 543 |     "    print(\"cosine similarity between {} and {}\".format(word1, word2), cos_sim(one_hot_word_1, one_hot_word_2))"
 544 |    ]
 545 |   },
 546 |   {
 547 |    "cell_type": "markdown",
 548 |    "metadata": {
 549 |     "slideshow": {
 550 |      "slide_type": "-"
 551 |     }
 552 |    },
 553 |    "source": [
 554 |     "### Option 2: Encode spelling\n",
 555 |     "Following a similar pattern as the one-hot of a word over a vocabulary, let's build word vectors represented by the frequency of the letters present"
 556 |    ]
 557 |   },
 558 |   {
 559 |    "cell_type": "code",
 560 |    "execution_count": 14,
 561 |    "metadata": {
 562 |     "collapsed": true,
 563 |     "slideshow": {
 564 |      "slide_type": "-"
 565 |     }
 566 |    },
 567 |    "outputs": [],
 568 |    "source": [
 569 |     "alphabet = list(string.ascii_lowercase)"
 570 |    ]
 571 |   },
 572 |   {
 573 |    "cell_type": "code",
 574 |    "execution_count": 15,
 575 |    "metadata": {
 576 |     "collapsed": false,
 577 |     "slideshow": {
 578 |      "slide_type": "-"
 579 |     }
 580 |    },
 581 |    "outputs": [],
 582 |    "source": [
 583 |     "# since we don't need to worry about \"out-of-vocabulary\" now, we can just use alphabet.index([letter])\n",
 584 |     "def lookup_letter(letter):\n",
 585 |     "    return alphabet.index(letter.lower())"
 586 |    ]
 587 |   },
 588 |   {
 589 |    "cell_type": "code",
 590 |    "execution_count": 16,
 591 |    "metadata": {
 592 |     "collapsed": false,
 593 |     "slideshow": {
 594 |      "slide_type": "-"
 595 |     }
 596 |    },
 597 |    "outputs": [
 598 |     {
 599 |      "name": "stdout",
 600 |      "output_type": "stream",
 601 |      "text": [
 602 |       "a 0\n",
 603 |       "A 0\n"
 604 |      ]
 605 |     }
 606 |    ],
 607 |    "source": [
 608 |     "print(\"a\", lookup_letter('a'))\n",
 609 |     "print(\"A\", lookup_letter('A'))"
 610 |    ]
 611 |   },
 612 |   {
 613 |    "cell_type": "code",
 614 |    "execution_count": 17,
 615 |    "metadata": {
 616 |     "collapsed": false,
 617 |     "slideshow": {
 618 |      "slide_type": "-"
 619 |     }
 620 |    },
 621 |    "outputs": [],
 622 |    "source": [
 623 |     "def make_spelling_vector(word):\n",
 624 |     "    \"\"\"\n",
 625 |     "    Converts a word into a vector of dimension 26 where each cell contains the count for that letter\n",
 626 |     "    :param word: word to vectorize\n",
 627 |     "    :return: numpy vector of 26 dimensions\n",
 628 |     "    \"\"\"\n",
 629 |     "    # initialize vector with zeros\n",
 630 |     "    spelling_vector = np.zeros((26))\n",
 631 |     "    # iterate through each letter and update count\n",
 632 |     "    for letter in word:\n",
 633 |     "        if letter in string.ascii_letters:\n",
 634 |     "            letter_index = lookup_letter(letter)\n",
 635 |     "            spelling_vector[letter_index] = spelling_vector[letter_index] + 1\n",
 636 |     "    return spelling_vector"
 637 |    ]
 638 |   },
 639 |   {
 640 |    "cell_type": "code",
 641 |    "execution_count": 18,
 642 |    "metadata": {
 643 |     "collapsed": false,
 644 |     "slideshow": {
 645 |      "slide_type": "-"
 646 |     }
 647 |    },
 648 |    "outputs": [
 649 |     {
 650 |      "data": {
 651 |       "text/plain": [
 652 |        "array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,\n",
 653 |        "        0.,  0.,  2.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])"
 654 |       ]
 655 |      },
 656 |      "execution_count": 18,
 657 |      "metadata": {},
 658 |      "output_type": "execute_result"
 659 |     }
 660 |    ],
 661 |    "source": [
 662 |     "make_spelling_vector(\"apple\")"
 663 |    ]
 664 |   },
 665 |   {
 666 |    "cell_type": "code",
 667 |    "execution_count": 19,
 668 |    "metadata": {
 669 |     "collapsed": true,
 670 |     "slideshow": {
 671 |      "slide_type": "-"
 672 |     }
 673 |    },
 674 |    "outputs": [],
 675 |    "source": [
 676 |     "vocabulary = [\"apple\", \"banana\", \"orange\", \"cantaloupe\", \"peach\", \"Phoenix\"]"
 677 |    ]
 678 |   },
 679 |   {
 680 |    "cell_type": "code",
 681 |    "execution_count": 20,
 682 |    "metadata": {
 683 |     "collapsed": false,
 684 |     "slideshow": {
 685 |      "slide_type": "-"
 686 |     }
 687 |    },
 688 |    "outputs": [
 689 |     {
 690 |      "name": "stdout",
 691 |      "output_type": "stream",
 692 |      "text": [
 693 |       "cosine similarity between apple and banana 0.303045763366\n",
 694 |       "cosine similarity between apple and orange 0.308606699924\n",
 695 |       "cosine similarity between apple and cantaloupe 0.654653670708\n",
 696 |       "cosine similarity between apple and peach 0.676123403783\n",
 697 |       "cosine similarity between apple and Phoenix 0.428571428571\n",
 698 |       "cosine similarity between banana and orange 0.54554472559\n",
 699 |       "cosine similarity between banana and cantaloupe 0.617213399848\n",
 700 |       "cosine similarity between banana and peach 0.3585685828\n",
 701 |       "cosine similarity between banana and Phoenix 0.20203050891\n",
 702 |       "cosine similarity between orange and cantaloupe 0.589255650989\n",
 703 |       "cosine similarity between orange and peach 0.36514837167\n",
 704 |       "cosine similarity between orange and Phoenix 0.462910049886\n",
 705 |       "cosine similarity between cantaloupe and peach 0.645497224368\n",
 706 |       "cosine similarity between cantaloupe and Phoenix 0.436435780472\n",
 707 |       "cosine similarity between peach and Phoenix 0.507092552837\n"
 708 |      ]
 709 |     }
 710 |    ],
 711 |    "source": [
 712 |     "# reset the generator\n",
 713 |     "all_combinations = combinations(vocabulary, 2)\n",
 714 |     "# iterate through all words\n",
 715 |     "for (word1, word2) in all_combinations:\n",
 716 |     "    spelling_vector_1 = make_spelling_vector(word1)\n",
 717 |     "    spelling_vector_2 = make_spelling_vector(word2)\n",
 718 |     "    print(\"cosine similarity between {} and {}\".format(word1, word2), cos_sim(spelling_vector_1, spelling_vector_2))"
 719 |    ]
 720 |   },
 721 |   {
 722 |    "cell_type": "code",
 723 |    "execution_count": 21,
 724 |    "metadata": {
 725 |     "collapsed": false,
 726 |     "slideshow": {
 727 |      "slide_type": "-"
 728 |     }
 729 |    },
 730 |    "outputs": [
 731 |     {
 732 |      "data": {
 733 |       "text/plain": [
 734 |        "True"
 735 |       ]
 736 |      },
 737 |      "execution_count": 21,
 738 |      "metadata": {},
 739 |      "output_type": "execute_result"
 740 |     }
 741 |    ],
 742 |    "source": [
 743 |     "# what if two words share the same letters?\n",
 744 |     "dog = make_spelling_vector(\"dog\")\n",
 745 |     "god = make_spelling_vector(\"God\")\n",
 746 |     "# god == dog\n",
 747 |     "np.all(god == dog)"
 748 |    ]
 749 |   },
 750 |   {
 751 |    "cell_type": "markdown",
 752 |    "metadata": {
 753 |     "slideshow": {
 754 |      "slide_type": "-"
 755 |     }
 756 |    },
 757 |    "source": [
 758 |     "#### We've successfully generated similarity scores!  But...\n",
 759 |     "\n",
 760 |     "Do they really reflect anything semantic?  \n",
 761 |     "\n",
 762 |     "In other words, does it make sense that **\"peach\"** and **\"Phoenix\"** <br> \n",
 763 |     "(`cosine similarity = 0.507`) <br>\n",
 764 |     "are **more** similar than **\"peach\"** and **\"orange\"** <br>\n",
 765 |     "(`cosine similarity = .365`)?"
 766 |    ]
 767 |   },
 768 |   {
 769 |    "cell_type": "markdown",
 770 |    "metadata": {
 771 |     "slideshow": {
 772 |      "slide_type": "-"
 773 |     }
 774 |    },
 775 |    "source": [
 776 |     "### Option 3: Word Embeddings\n",
 777 |     "Create a \"dense\" representation of each word where proximity in vector space represents \"similarity\"."
 778 |    ]
 779 |   },
 780 |   {
 781 |    "cell_type": "markdown",
 782 |    "metadata": {
 783 |     "slideshow": {
 784 |      "slide_type": "-"
 785 |     }
 786 |    },
 787 |    "source": [
 788 |     "![context](images/context.png)\n",
 789 |     "https://blog.acolyer.org/2016/04/21/the-amazing-power-of-word-vectors/"
 790 |    ]
 791 |   },
 792 |   {
 793 |    "cell_type": "markdown",
 794 |    "metadata": {
 795 |     "slideshow": {
 796 |      "slide_type": "-"
 797 |     }
 798 |    },
 799 |    "source": [
 800 |     "![architecture](images/architecture.png)\n",
 801 |     "https://arxiv.org/pdf/1301.3781v3.pdf"
 802 |    ]
 803 |   },
 804 |   {
 805 |    "cell_type": "markdown",
 806 |    "metadata": {
 807 |     "slideshow": {
 808 |      "slide_type": "-"
 809 |     }
 810 |    },
 811 |    "source": [
 812 |     "![cbow](images/cbow.png)\n",
 813 |     "https://blog.acolyer.org/2016/04/21/the-amazing-power-of-word-vectors/"
 814 |    ]
 815 |   },
 816 |   {
 817 |    "cell_type": "markdown",
 818 |    "metadata": {
 819 |     "slideshow": {
 820 |      "slide_type": "-"
 821 |     }
 822 |    },
 823 |    "source": [
 824 |     "#### Using the `gensim` package in `python`\n",
 825 |     "https://radimrehurek.com/gensim/models/word2vec.html"
 826 |    ]
 827 |   },
 828 |   {
 829 |    "cell_type": "code",
 830 |    "execution_count": 22,
 831 |    "metadata": {
 832 |     "collapsed": false,
 833 |     "slideshow": {
 834 |      "slide_type": "-"
 835 |     }
 836 |    },
 837 |    "outputs": [],
 838 |    "source": [
 839 |     "# load existing word2vec vectors into gensim\n",
 840 |     "\n",
 841 |     "# most frequent 125k words in Gigaword corpus\n",
 842 |     "w2v = models.Word2Vec.load_word2vec_format(fname=\"Gigaword_pruned_vectors.txt.gz\", binary=False)\n",
 843 |     "\n",
 844 |     "# original `word2vec` embeddings can be downloaded here:\n",
 845 |     "# https://code.google.com/archive/p/word2vec/"
 846 |    ]
 847 |   },
 848 |   {
 849 |    "cell_type": "markdown",
 850 |    "metadata": {
 851 |     "slideshow": {
 852 |      "slide_type": "-"
 853 |     }
 854 |    },
 855 |    "source": [
 856 |     "Pre-trained word embeddings can be loaded into `gensim` in `.txt` or `.txt.gz` format *as long as* the first line identifies (1) the number of words in file and (2) the dimensions of the vector\n",
 857 |     " \n",
 858 |     "```\n",
 859 |     "199999 200\n",
 860 |     "and -0.065843 -0.133472 0.020263 0.102796 0.003295 0.025878 -0.071714 0.054211 -0.026698 -0.036176 -0.024954 0.042049 -0.165819 -0.067038 0.117293 0.046338 0.012154 0.026929 -0.020248 0.120186 0.081922 0.062471 -0.063391 -0.048321 -0.108106 -0.067974 0.092109 -0.034439 -0.024319 0.008799 -0.099953\n",
 861 |     "...\n",
 862 |     "```"
 863 |    ]
 864 |   },
 865 |   {
 866 |    "cell_type": "code",
 867 |    "execution_count": 23,
 868 |    "metadata": {
 869 |     "collapsed": false,
 870 |     "slideshow": {
 871 |      "slide_type": "-"
 872 |     }
 873 |    },
 874 |    "outputs": [
 875 |     {
 876 |      "data": {
 877 |       "text/plain": [
 878 |        "array([ 0.06338   , -0.146809  ,  0.110004  , -0.01205   , -0.045637  ,\n",
 879 |        "       -0.02224   , -0.045153  ,  0.079144  , -0.027216  , -0.027647  ,\n",
 880 |        "       -0.000434  ,  0.108648  , -0.060456  , -0.129502  ,  0.010897  ,\n",
 881 |        "        0.055499  ,  0.086099  ,  0.055282  ,  0.007365  ,  0.167188  ,\n",
 882 |        "        0.016705  ,  0.0744    , -0.07096   , -0.105974  , -0.095631  ,\n",
 883 |        "        0.006107  ,  0.12862299, -0.033055  , -0.020641  ,  0.024765  ,\n",
 884 |        "       -0.048181  , -0.090195  ,  0.007408  ,  0.073138  ,  0.031994  ,\n",
 885 |        "       -0.014252  ,  0.102764  , -0.081244  ,  0.10513   ,  0.039809  ,\n",
 886 |        "       -0.050727  ,  0.002429  , -0.01506   , -0.085081  , -0.02245   ,\n",
 887 |        "        0.102064  , -0.009099  , -0.092295  , -0.040276  ,  0.148752  ], dtype=float32)"
 888 |       ]
 889 |      },
 890 |      "execution_count": 23,
 891 |      "metadata": {},
 892 |      "output_type": "execute_result"
 893 |     }
 894 |    ],
 895 |    "source": [
 896 |     "# the first 50 dimensions of the vector for \"the\"\n",
 897 |     "w2v[\"the\"][0:50]"
 898 |    ]
 899 |   },
 900 |   {
 901 |    "cell_type": "code",
 902 |    "execution_count": 24,
 903 |    "metadata": {
 904 |     "collapsed": false,
 905 |     "slideshow": {
 906 |      "slide_type": "subslide"
 907 |     }
 908 |    },
 909 |    "outputs": [
 910 |     {
 911 |      "ename": "KeyError",
 912 |      "evalue": "'abcdef'",
 913 |      "output_type": "error",
 914 |      "traceback": [
 915 |       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
 916 |       "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
 917 |       "\u001b[0;32m<ipython-input-24-c371a7157d6c>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mw2v\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"abcdef\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
 918 |       "\u001b[0;32m/Users/mcapizzi/miniconda3/envs/word-embedding/lib/python3.5/site-packages/gensim/models/word2vec.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, words)\u001b[0m\n\u001b[1;32m   1502\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwords\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstring_types\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1503\u001b[0m             \u001b[0;31m# allow calls like trained_model['office'], as a shorthand for trained_model[['office']]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1504\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msyn0\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvocab\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mwords\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1505\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1506\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mvstack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msyn0\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvocab\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mword\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mword\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mwords\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 919 |       "\u001b[0;31mKeyError\u001b[0m: 'abcdef'"
 920 |      ]
 921 |     }
 922 |    ],
 923 |    "source": [
 924 |     "w2v[\"abcdef\"]"
 925 |    ]
 926 |   },
 927 |   {
 928 |    "cell_type": "code",
 929 |    "execution_count": 25,
 930 |    "metadata": {
 931 |     "collapsed": false,
 932 |     "slideshow": {
 933 |      "slide_type": "subslide"
 934 |     }
 935 |    },
 936 |    "outputs": [],
 937 |    "source": [
 938 |     "def get_vector(word):\n",
 939 |     "    \"\"\"\n",
 940 |     "    Returns the word vector for that word or a vector of 0s for out-of-vocabulary\n",
 941 |     "    :param: word: word to lookup in vectors\n",
 942 |     "    :return: vector or vector of zeros\n",
 943 |     "    \"\"\"\n",
 944 |     "    # determine vector length\n",
 945 |     "    w2v_length = len(w2v[\"the\"])\n",
 946 |     "    # get vector\n",
 947 |     "    if word in w2v:\n",
 948 |     "            return w2v[word]\n",
 949 |     "    else:\n",
 950 |     "        return np.zeros((w2v_length))"
 951 |    ]
 952 |   },
 953 |   {
 954 |    "cell_type": "code",
 955 |    "execution_count": 26,
 956 |    "metadata": {
 957 |     "collapsed": false,
 958 |     "slideshow": {
 959 |      "slide_type": "-"
 960 |     }
 961 |    },
 962 |    "outputs": [
 963 |     {
 964 |      "data": {
 965 |       "text/plain": [
 966 |        "array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,\n",
 967 |        "        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,\n",
 968 |        "        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,\n",
 969 |        "        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])"
 970 |       ]
 971 |      },
 972 |      "execution_count": 26,
 973 |      "metadata": {},
 974 |      "output_type": "execute_result"
 975 |     }
 976 |    ],
 977 |    "source": [
 978 |     "get_vector(\"abcdef\")[0:50]"
 979 |    ]
 980 |   },
 981 |   {
 982 |    "cell_type": "code",
 983 |    "execution_count": 27,
 984 |    "metadata": {
 985 |     "collapsed": false,
 986 |     "slideshow": {
 987 |      "slide_type": "-"
 988 |     }
 989 |    },
 990 |    "outputs": [
 991 |     {
 992 |      "data": {
 993 |       "text/plain": [
 994 |        "[('monarch', 0.7166919708251953),\n",
 995 |        " ('princess', 0.7164901494979858),\n",
 996 |        " ('margrethe', 0.6889792680740356),\n",
 997 |        " ('beatrix', 0.6878944039344788),\n",
 998 |        " ('coronation', 0.6789792776107788),\n",
 999 |        " ('prince', 0.6730599403381348),\n",
1000 |        " ('wilhelmina', 0.6619384288787842),\n",
1001 |        " ('mettemarit', 0.6575925946235657),\n",
1002 |        " ('consort', 0.6492267847061157),\n",
1003 |        " ('duchess', 0.6444146633148193)]"
1004 |       ]
1005 |      },
1006 |      "execution_count": 27,
1007 |      "metadata": {},
1008 |      "output_type": "execute_result"
1009 |     }
1010 |    ],
1011 |    "source": [
1012 |     "# find most similar n words to a given word\n",
1013 |     "similar = w2v.similar_by_word(\"queen\", topn=10)\n",
1014 |     "similar"
1015 |    ]
1016 |   },
1017 |   {
1018 |    "cell_type": "code",
1019 |    "execution_count": 28,
1020 |    "metadata": {
1021 |     "collapsed": false,
1022 |     "slideshow": {
1023 |      "slide_type": "-"
1024 |     }
1025 |    },
1026 |    "outputs": [
1027 |     {
1028 |      "data": {
1029 |       "text/plain": [
1030 |        "[('cat', 1.0),\n",
1031 |        " ('dog', 0.8524122834205627),\n",
1032 |        " ('puppy', 0.7896589040756226),\n",
1033 |        " ('pug', 0.783139169216156),\n",
1034 |        " ('critter', 0.7650502324104309),\n",
1035 |        " ('squirrel', 0.7516598701477051),\n",
1036 |        " ('feline', 0.7436362504959106),\n",
1037 |        " ('gerbil', 0.7435644865036011),\n",
1038 |        " ('monkey', 0.7434572577476501),\n",
1039 |        " ('hamster', 0.7323285341262817)]"
1040 |       ]
1041 |      },
1042 |      "execution_count": 28,
1043 |      "metadata": {},
1044 |      "output_type": "execute_result"
1045 |     }
1046 |    ],
1047 |    "source": [
1048 |     "# find  most similar n words to a given vector\n",
1049 |     "cat_vector = get_vector(\"cat\")\n",
1050 |     "cat_sim = w2v.similar_by_vector(cat_vector, topn=10)\n",
1051 |     "cat_sim"
1052 |    ]
1053 |   },
1054 |   {
1055 |    "cell_type": "markdown",
1056 |    "metadata": {
1057 |     "slideshow": {
1058 |      "slide_type": "-"
1059 |     }
1060 |    },
1061 |    "source": [
1062 |     "#### Evaluation of word embeddings"
1063 |    ]
1064 |   },
1065 |   {
1066 |    "cell_type": "markdown",
1067 |    "metadata": {
1068 |     "slideshow": {
1069 |      "slide_type": "-"
1070 |     }
1071 |    },
1072 |    "source": [
1073 |     "![king_queen](images/king_queen.png)\n",
1074 |     "https://arxiv.org/pdf/1301.3781v3.pdf"
1075 |    ]
1076 |   },
1077 |   {
1078 |    "cell_type": "markdown",
1079 |    "metadata": {
1080 |     "slideshow": {
1081 |      "slide_type": "-"
1082 |     }
1083 |    },
1084 |    "source": [
1085 |     "![king_queen_vis](images/king_queen_vis.png)\n",
1086 |     "https://www.aclweb.org/anthology/N/N13/N13-1090.pdf"
1087 |    ]
1088 |   },
1089 |   {
1090 |    "cell_type": "markdown",
1091 |    "metadata": {
1092 |     "slideshow": {
1093 |      "slide_type": "-"
1094 |     }
1095 |    },
1096 |    "source": [
1097 |     "![country_capital](images/country_capital.png)\n",
1098 |     "https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf"
1099 |    ]
1100 |   },
1101 |   {
1102 |    "cell_type": "markdown",
1103 |    "metadata": {
1104 |     "slideshow": {
1105 |      "slide_type": "-"
1106 |     }
1107 |    },
1108 |    "source": [
1109 |     "![eval_1](images/eval_1.png)\n",
1110 |     "https://arxiv.org/pdf/1301.3781v3.pdf"
1111 |    ]
1112 |   },
1113 |   {
1114 |    "cell_type": "markdown",
1115 |    "metadata": {
1116 |     "slideshow": {
1117 |      "slide_type": "-"
1118 |     }
1119 |    },
1120 |    "source": [
1121 |     "##### Analogies\n",
1122 |     "\n",
1123 |     "Built-in function in `gensim`: `most_similar(positive, negative, topn)`\n",
1124 |     "\n",
1125 |     "`A:B::C:??` --> `most_similar(positive=[B,C], negative=A)`"
1126 |    ]
1127 |   },
1128 |   {
1129 |    "cell_type": "code",
1130 |    "execution_count": 29,
1131 |    "metadata": {
1132 |     "collapsed": false,
1133 |     "slideshow": {
1134 |      "slide_type": "-"
1135 |     }
1136 |    },
1137 |    "outputs": [],
1138 |    "source": [
1139 |     "def analogy_solver(A, B, C, topn=5):\n",
1140 |     "    \"\"\"\n",
1141 |     "    A:B::C:?\n",
1142 |     "    man:woman::king:???\n",
1143 |     "    most_similar(positive=[B,C], negative=[A])\n",
1144 |     "    \"\"\"\n",
1145 |     "    return w2v.most_similar(\n",
1146 |     "        positive=[B,C],\n",
1147 |     "        negative=[A],\n",
1148 |     "        topn=topn\n",
1149 |     "    )"
1150 |    ]
1151 |   },
1152 |   {
1153 |    "cell_type": "code",
1154 |    "execution_count": 30,
1155 |    "metadata": {
1156 |     "collapsed": false,
1157 |     "slideshow": {
1158 |      "slide_type": "-"
1159 |     }
1160 |    },
1161 |    "outputs": [
1162 |     {
1163 |      "name": "stdout",
1164 |      "output_type": "stream",
1165 |      "text": [
1166 |       "[('queen', 0.6834795475006104), ('monarch', 0.6421915292739868), ('princess', 0.5896612405776978), ('beatrix', 0.5811704993247986), ('prince', 0.5663138031959534)]\n",
1167 |       "\n",
1168 |       "[('queen', 0.6834795475006104), ('monarch', 0.6421915292739868), ('princess', 0.5896612405776978), ('beatrix', 0.5811704993247986), ('prince', 0.5663138031959534)]\n",
1169 |       "\n",
1170 |       "[('sister', 0.8335152268409729), ('daughter', 0.8259485960006714), ('mother', 0.7856060266494751), ('grandmother', 0.7708373069763184), ('sisterinlaw', 0.7601062655448914)]\n",
1171 |       "\n",
1172 |       "[('sister', 0.8335152268409729), ('daughter', 0.8259485960006714), ('mother', 0.7856060266494751), ('grandmother', 0.7708373069763184), ('sisterinlaw', 0.7601062655448914)]\n"
1173 |      ]
1174 |     }
1175 |    ],
1176 |    "source": [
1177 |     "# man:woman::king:???\n",
1178 |     "# king - man + woman = ???\n",
1179 |     "sol_1 = analogy_solver(\"man\", \"woman\", \"king\")\n",
1180 |     "print(sol_1)\n",
1181 |     "print()\n",
1182 |     "\n",
1183 |     "# man:king::woman:???\n",
1184 |     "# woman - man + king = ???\n",
1185 |     "sol_2 = analogy_solver(\"man\", \"king\", \"woman\")\n",
1186 |     "print(sol_2)\n",
1187 |     "print()\n",
1188 |     "\n",
1189 |     "# uncle:brother::aunt:???\n",
1190 |     "# brother - uncle + aunt = ???\n",
1191 |     "sol_3 = analogy_solver(\"uncle\", \"brother\", \"aunt\")\n",
1192 |     "print(sol_3)\n",
1193 |     "print()\n",
1194 |     "\n",
1195 |     "# uncle:aunt::brother:???\n",
1196 |     "# aunt - uncle + brother = ???\n",
1197 |     "sol_4 = analogy_solver(\"uncle\", \"aunt\", \"brother\")\n",
1198 |     "print(sol_4)"
1199 |    ]
1200 |   },
1201 |   {
1202 |    "cell_type": "markdown",
1203 |    "metadata": {
1204 |     "slideshow": {
1205 |      "slide_type": "-"
1206 |     }
1207 |    },
1208 |    "source": [
1209 |     "##### \"One of these words is not like the other\"\n",
1210 |     "\n",
1211 |     "`breakfast cereal dinner lunch`\n",
1212 |     "\n",
1213 |     "Built-in function in `gensim`: `doesnt_match(list_of_words)`"
1214 |    ]
1215 |   },
1216 |   {
1217 |    "cell_type": "code",
1218 |    "execution_count": 31,
1219 |    "metadata": {
1220 |     "collapsed": false,
1221 |     "slideshow": {
1222 |      "slide_type": "-"
1223 |     }
1224 |    },
1225 |    "outputs": [
1226 |     {
1227 |      "name": "stdout",
1228 |      "output_type": "stream",
1229 |      "text": [
1230 |       "cereal\n",
1231 |       "house\n"
1232 |      ]
1233 |     }
1234 |    ],
1235 |    "source": [
1236 |     "# find which word doesn't match\n",
1237 |     "list_of_words = \"breakfast cereal dinner lunch\"\n",
1238 |     "doesnt_match = w2v.doesnt_match(list_of_words.split())\n",
1239 |     "print(doesnt_match)\n",
1240 |     "\n",
1241 |     "list_of_words_2 = \"house dog pencil electrocute\"\n",
1242 |     "doesnt_match_2 = w2v.doesnt_match(list_of_words_2.split())\n",
1243 |     "print(doesnt_match_2)"
1244 |    ]
1245 |   },
1246 |   {
1247 |    "cell_type": "markdown",
1248 |    "metadata": {
1249 |     "slideshow": {
1250 |      "slide_type": "-"
1251 |     }
1252 |    },
1253 |    "source": [
1254 |     "#### Word Embeddings and Antonyms"
1255 |    ]
1256 |   },
1257 |   {
1258 |    "cell_type": "code",
1259 |    "execution_count": 32,
1260 |    "metadata": {
1261 |     "collapsed": false,
1262 |     "slideshow": {
1263 |      "slide_type": "-"
1264 |     }
1265 |    },
1266 |    "outputs": [
1267 |     {
1268 |      "data": {
1269 |       "text/plain": [
1270 |        "[('bad', 0.7170573472976685),\n",
1271 |        " ('terrific', 0.7161434888839722),\n",
1272 |        " ('decent', 0.7018914222717285),\n",
1273 |        " ('lousy', 0.6984266042709351),\n",
1274 |        " ('wonderful', 0.6819486618041992),\n",
1275 |        " ('perfect', 0.6481753587722778),\n",
1276 |        " ('great', 0.6480209827423096),\n",
1277 |        " ('nice', 0.6281204223632812),\n",
1278 |        " ('darn', 0.623289942741394),\n",
1279 |        " ('fun', 0.6176395416259766)]"
1280 |       ]
1281 |      },
1282 |      "execution_count": 32,
1283 |      "metadata": {},
1284 |      "output_type": "execute_result"
1285 |     }
1286 |    ],
1287 |    "source": [
1288 |     "# this approach doesn't handle antonyms well\n",
1289 |     "# \"That movie was _______.\"\n",
1290 |     "\n",
1291 |     "w2v.similar_by_word(\"good\", topn=10)"
1292 |    ]
1293 |   },
1294 |   {
1295 |    "cell_type": "markdown",
1296 |    "metadata": {
1297 |     "slideshow": {
1298 |      "slide_type": "-"
1299 |     }
1300 |    },
1301 |    "source": [
1302 |     "#### Bias in Word Embeddings"
1303 |    ]
1304 |   },
1305 |   {
1306 |    "cell_type": "markdown",
1307 |    "metadata": {
1308 |     "slideshow": {
1309 |      "slide_type": "-"
1310 |     }
1311 |    },
1312 |    "source": [
1313 |     "![king_queen](images/king_queen_2.png)\n",
1314 |     "![programmer_homemaker](images/programmer_homemaker.png)\n",
1315 |     "https://arxiv.org/pdf/1607.06520v1.pdf"
1316 |    ]
1317 |   },
1318 |   {
1319 |    "cell_type": "markdown",
1320 |    "metadata": {
1321 |     "slideshow": {
1322 |      "slide_type": "subslide"
1323 |     }
1324 |    },
1325 |    "source": [
1326 |     "![gender_bias](images/gender_bias.png)\n",
1327 |     "https://arxiv.org/pdf/1607.06520v1.pdf"
1328 |    ]
1329 |   },
1330 |   {
1331 |    "cell_type": "code",
1332 |    "execution_count": 33,
1333 |    "metadata": {
1334 |     "collapsed": false,
1335 |     "slideshow": {
1336 |      "slide_type": "-"
1337 |     }
1338 |    },
1339 |    "outputs": [
1340 |     {
1341 |      "name": "stdout",
1342 |      "output_type": "stream",
1343 |      "text": [
1344 |       "[('lathes', 0.581120491027832), ('typewriters', 0.5445051193237305), ('washing', 0.5365341305732727), ('machine', 0.5345758199691772), ('shoe', 0.5307853817939758)]\n",
1345 |       "\n"
1346 |      ]
1347 |     }
1348 |    ],
1349 |    "source": [
1350 |     "# she:sewing::he:???\n",
1351 |     "bias_1 = analogy_solver(\"she\", \"sewing\", \"he\")\n",
1352 |     "print(bias_1)\n",
1353 |     "print()"
1354 |    ]
1355 |   },
1356 |   {
1357 |    "cell_type": "markdown",
1358 |    "metadata": {
1359 |     "slideshow": {
1360 |      "slide_type": "-"
1361 |     }
1362 |    },
1363 |    "source": [
1364 |     "#### Links to available word embeddings\n",
1365 |     "\n",
1366 |     "[The \"original\" code for `word2vec`, and pre-trained vectors](https://code.google.com/archive/p/word2vec/)\n",
1367 |     "\n",
1368 |     "[Stanford's approach to word embeddings, and pre-trained vectors](http://nlp.stanford.edu/projects/glove/)\n",
1369 |     "\n",
1370 |     "[A modified approach to word embeddings (feeding dependency tuples to the neural network instead of words), and pre-trained vectors](https://levyomer.wordpress.com/2014/04/25/dependency-based-word-embeddings/)\n",
1371 |     "\n",
1372 |     "[Word embeddings from a particular historical period](http://nlp.stanford.edu/projects/histwords/)"
1373 |    ]
1374 |   },
1375 |   {
1376 |    "cell_type": "markdown",
1377 |    "metadata": {
1378 |     "slideshow": {
1379 |      "slide_type": "-"
1380 |     }
1381 |    },
1382 |    "source": [
1383 |     "## Links to papers\n",
1384 |     "\n",
1385 |     "The \"original\" three papers on `word2vec` by Mikolov:\n",
1386 |     "\n",
1387 |     " - [Efficient Estimation of Word Representations in Vector Space](http://arxiv.org/pdf/1301.3781v3.pdf)\n",
1388 |     "\n",
1389 |     " - [Distributed Representations of Words and Phrases and their Compositionality](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf)\n",
1390 |     "\n",
1391 |     " - [Linguistic Regularities in Continuous Space Word Representations](https://www.aclweb.org/anthology/N/N13/N13-1090.pdf)\n",
1392 |     "\n",
1393 |     "\n",
1394 |     "[Further analysis of approaches to word embeddings and their hyperparameters](https://transacl.org/ojs/index.php/tacl/article/viewFile/570/124)\n",
1395 |     "\n",
1396 |     "[Detailed evaluation of word embeddings](https://arxiv.org/pdf/1608.04207v1.pdf)\n",
1397 |     "\n",
1398 |     "[Website for evaluating word embeddings](http://veceval.com/)\n",
1399 |     "\n"
1400 |    ]
1401 |   },
1402 |   {
1403 |    "cell_type": "markdown",
1404 |    "metadata": {
1405 |     "slideshow": {
1406 |      "slide_type": "-"
1407 |     }
1408 |    },
1409 |    "source": [
1410 |     "## Links to blogs\n",
1411 |     "\n",
1412 |     "[A good overview of NLP](https://blog.monkeylearn.com/the-definitive-guide-to-natural-language-processing/)\n",
1413 |     "\n",
1414 |     "[Blog post summary of the three \"original\" papers by Mikolov](https://blog.acolyer.org/2016/04/21/the-amazing-power-of-word-vectors/)\n",
1415 |     "\n",
1416 |     "[Detailed blog post on the application of word embeddings to analogies](https://quomodocumque.wordpress.com/2016/01/15/messing-around-with-word2vec/)\n",
1417 |     "\n",
1418 |     "[Appyling word embeddings to computer logs](https://gab41.lab41.org/three-things-we-learned-about-applying-word-vectors-to-computer-logs-c199070f390b#.k2mirf2oa)"
1419 |    ]
1420 |   }
1421 |  ],
1422 |  "metadata": {
1423 |   "celltoolbar": "Slideshow",
1424 |   "kernelspec": {
1425 |    "display_name": "Python 3",
1426 |    "language": "python",
1427 |    "name": "python3"
1428 |   },
1429 |   "language_info": {
1430 |    "codemirror_mode": {
1431 |     "name": "ipython",
1432 |     "version": 3
1433 |    },
1434 |    "file_extension": ".py",
1435 |    "mimetype": "text/x-python",
1436 |    "name": "python",
1437 |    "nbconvert_exporter": "python",
1438 |    "pygments_lexer": "ipython3",
1439 |    "version": "3.5.2"
1440 |   }
1441 |  },
1442 |  "nbformat": 4,
1443 |  "nbformat_minor": 0
1444 | }
1445 | 


--------------------------------------------------------------------------------
/images/NLP.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/NLP.png


--------------------------------------------------------------------------------
/images/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/architecture.png


--------------------------------------------------------------------------------
/images/architecture_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/architecture_2.png


--------------------------------------------------------------------------------
/images/context.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/context.png


--------------------------------------------------------------------------------
/images/cos_sim.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/cos_sim.png


--------------------------------------------------------------------------------
/images/cos_sim_compare.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/cos_sim_compare.png


--------------------------------------------------------------------------------
/images/country_capital.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/country_capital.png


--------------------------------------------------------------------------------
/images/distance_measures.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/distance_measures.png


--------------------------------------------------------------------------------
/images/eval_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/eval_1.png


--------------------------------------------------------------------------------
/images/eval_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/eval_2.png


--------------------------------------------------------------------------------
/images/gender_bias.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/gender_bias.png


--------------------------------------------------------------------------------
/images/king_queen.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/king_queen.png


--------------------------------------------------------------------------------
/images/king_queen_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/king_queen_2.png


--------------------------------------------------------------------------------
/images/king_queen_vis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/king_queen_vis.png


--------------------------------------------------------------------------------
/images/normalize.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/normalize.jpg


--------------------------------------------------------------------------------
/images/one_hot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/one_hot.png


--------------------------------------------------------------------------------
/images/programmer_homemaker.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/programmer_homemaker.png


--------------------------------------------------------------------------------
/images/unit_circle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/unit_circle.png


--------------------------------------------------------------------------------
/images/vectorize.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelcapizzi/nlp-basics/ae6fec3bc38f1229d08db00451837ca53b5233c9/images/vectorize.png


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4
2 | gensim==1.0.1
3 | nltk==3.4.5
4 | sklearn==0.23.2
5 | numpy
6 | beautifulsoup4
7 | jupyter
8 | 


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
  1 | from bs4 import BeautifulSoup
  2 | from nltk import word_tokenize
  3 | import numpy as np
  4 | 
  5 | # loading data
  6 | 
  7 | 
  8 | def load_data(path_to_data):
  9 |     """
 10 |     Loads `.tsv` of data into a <dict>
 11 |     Ensures that `.html` has been removed
 12 |     :param path_to_data: full/path/to/data
 13 |     :return: <list> of <tuple> ([id], [label], [text])
 14 |     """
 15 |     out_ = []
 16 |     with open(path_to_data, "r") as f:
 17 |         for line in f:
 18 |             # parse line
 19 |             line_split = line.rstrip().split("\t")
 20 |             if len(line_split) != 3:
 21 |                 continue
 22 |             id = line_split[0]
 23 |             label = line_split[1]
 24 |             raw_text = line_split[2]
 25 |             # ensure html is removed
 26 |             text = BeautifulSoup(raw_text, "html.parser").get_text()
 27 |             out_.append((id, label, text))
 28 |     return out_
 29 | 
 30 | 
 31 | def get_all_docs(list_of_tuples):
 32 |     """
 33 |     Given a dictionary of data, this will collect all the text into one list
 34 |     :param list_of_tuples: Output of load_data()
 35 |     :return <list> of documents, lookup_dict
 36 |     """
 37 |     all_docs = []
 38 |     lookup = {}
 39 |     for i in range(len(list_of_tuples)):
 40 |         current = list_of_tuples[i]
 41 |         all_docs.append(current[2])
 42 |         lookup[i] = current[2]
 43 |     return all_docs, lookup
 44 | 
 45 | 
 46 | # calculations
 47 | 
 48 | def normalize_vector(vector):
 49 |     """
 50 |     Normalizes a vector so that all its values are between 0 and 1
 51 |     :param vector: a `numpy` vector
 52 |     :return: a normalized `numpy` vector
 53 |     """
 54 |     # norm = np.sqrt(vector.dot(vector))
 55 |     # numpy has a built in function
 56 |     norm = np.linalg.norm(vector)
 57 |     if norm:
 58 |         return vector / norm
 59 |     else:
 60 |         # if norm == 0, then original vector was all 0s
 61 |         return vector
 62 | 
 63 | 
 64 | def cos_sim(vector_one, vector_two):
 65 |     """
 66 |     Calculate the cosine similarity of two `numpy` vectors
 67 |     :param vector_one: a `numpy` vector
 68 |     :param vector_two: a `numpy` vector
 69 |     :return: A score between 0 and 1
 70 |     """
 71 |     # ensure that both vectors are already normalized
 72 |     vector_one_norm = normalize_vector(vector_one)
 73 |     vector_two_norm = normalize_vector(vector_two)
 74 | 
 75 |     # calculate the dot product between the two normalized vectors
 76 |     return vector_one_norm.dot(vector_two_norm)
 77 | 
 78 | 
 79 | def generate_all_cos_sim(X_matrix):
 80 |     """
 81 |     Generates a matrix of cosine similarities for a set of documents
 82 |     WARNING: this is too computationally expensive for a python notebook.  Run in console.
 83 |     :param X_matrix: dense `numpy` matrix: num_documents (d) x words_in_vocabulary (v)
 84 |     :return: dense `numpy` matrix d x d
 85 |     """
 86 |     # ensure matrix is dense
 87 |     if "sparse" in str(type(X_matrix)):
 88 |         X_matrix = X_matrix.toarray()
 89 |     # get shape
 90 |     X_shape = X_matrix.shape
 91 |     size = X_shape[0]
 92 |     # build empty matrix
 93 |     cos_matrix = np.zeros((size, size))
 94 |     # iterate through rows
 95 |     for i in range(size):
 96 |         for j in range(size):
 97 |             if i != j:
 98 |                 print(i,j)
 99 |                 # calculate cosine similarity
100 |                 cos_matrix[i][j] = cos_sim(X_matrix[i], X_matrix[j])
101 |             else:
102 |                 # set diagonal to None
103 |                 cos_matrix[i][j] = None
104 |     return cos_matrix
105 | 
106 | 
107 | def get_similar(cos_sim_matrix, idx, n, direction="most"):
108 |     """
109 |     Determines similarity of n documents
110 |     :param cos_sim_matrix: `numpy` dense array of num_documents x num_documents with values as cosine similarity
111 |     :param idx: index of document to calculate most similar
112 |     :param n: number of most similar indices to return
113 |     :param direction: "most" or "least" for top or bottom of ranked list
114 |     :return: <list> of (idx, cos_sim)
115 |     """
116 |     if direction != "most" and direction != "least":
117 |         raise Exception("chooose `most` or `least` for `direction`")
118 |     # get all values
119 |     if direction == "most":
120 |         all_values = sorted(enumerate(filter(lambda x: not np.isnan(x), cos_sim_matrix[idx])), key=lambda x: x[1], reverse=True)
121 |     else:
122 |         all_values = sorted(enumerate(filter(lambda x: not np.isnan(x), cos_sim_matrix[idx])), key=lambda x: x[1])
123 |     return [(x[0] + 1, x[1]) for x in all_values[:n]]
124 | 
125 | 
126 | # I/O
127 | 
128 | def save_matrix_to_csv(X_matrix, save_location):
129 |     """
130 |     Saves a matrix to csv
131 |     :param X_matrix: dense `numpy` array
132 |     :param save_location: full/path/to/desired/location.csv
133 |     """
134 |     # ensure matrix is dense
135 |     if "sparse" in str(type(X_matrix)):
136 |         X_matrix = X_matrix.toarray()
137 |     np.savetxt(save_location, X_matrix, delimiter=",")
138 | 
139 | 
140 | def load_matrix_from_csv(location):
141 |     """
142 |     Loads a matrix from csv
143 |     :param location: full/path/to/location.csv
144 |     :return: dense `numpy` array
145 |     """
146 |     return np.loadtxt(location, delimiter=",")
147 | 
148 | 


--------------------------------------------------------------------------------