├── GloVe
│   ├── papers
│   │   └── pennington_et_al_14 (GloVe - Global vectors for word representation).pdf
│   └── source code
│       └── GloVe.ipynb
├── README.md
├── Sentence modeling
│   ├── papers
│   │   └── Representing sentences as low-rank subspaces.pdf
│   └── source code
│       ├── Sentence modeling.ipynb
│       └── sentence-modeling.py
├── doc2vec
│   ├── papers
│   │   └── le_mikolov_14 (Distributed representations of sentences and documents).pdf
│   └── source code
│       └── doc2vec.ipynb
└── word2vec
    ├── papers
    │   ├── mikolov_et_al_13 (Distributed representations of words and phrases and their compositionality).pdf
    │   └── mikolov_et_al_13 (Efficient estimation of word representations in vector space).pdf
    └── source code
        └── word2vec.ipynb
/GloVe/papers/pennington_et_al_14 (GloVe - Global vectors for word representation).pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buomsoo-kim/Word-embedding-with-Python/855bb6eaed8f846343a106e552f0fbb4950d1fc3/GloVe/papers/pennington_et_al_14 (GloVe - Global vectors for word representation).pdf
--------------------------------------------------------------------------------
/GloVe/source code/GloVe.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## GloVe implementation with Python (+glove-python)\n",
8 | "- Note: This code is written in Python 3.6.1 (+Glove)\n",
9 | "- glove-python: https://github.com/maciejkula/glove-python"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "### How to install glove-python(https://github.com/maciejkula/glove-python/issues/42)\n",
17 | "- git clone https://github.com/maciejkula/glove-python.git\n",
18 | "\n",
19 | "- go to cloned directory location and open setup.py and remove 'stdc++' from libraries=[] paramerter after removing it will look like below\n",
20 | "\n",
21 | "
Extension(\"glove.corpus_cython\", [glove_corpus],\n",
22 | "
language='C++',\n",
23 | "
libraries=[],\n",
24 | "
extra_link_args=compile_args,\n",
25 | "
extra_compile_args=compile_args)]\n",
26 | "\n",
27 | "- conda install cython\n",
28 | "\n",
29 | "- open cmd from that location where setup.py is stored and run below command\n",
30 | "
python setup.py install"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 50,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "import re\n",
40 | "import numpy as np\n",
41 | "\n",
42 | "from glove import Corpus, Glove\n",
43 | "from nltk.corpus import gutenberg\n",
44 | "from multiprocessing import Pool\n",
45 | "from scipy import spatial"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
52 | "### Import training dataset\n",
53 | "- Import Shakespeare's Hamlet corpus from nltk library"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 11,
59 | "metadata": {
60 | "collapsed": true
61 | },
62 | "outputs": [],
63 | "source": [
64 | "sentences = list(gutenberg.sents('shakespeare-hamlet.txt')) # import the corpus and convert into a list"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 12,
70 | "metadata": {},
71 | "outputs": [
72 | {
73 | "name": "stdout",
74 | "output_type": "stream",
75 | "text": [
76 | "['[', 'The', 'Tragedie', 'of', 'Hamlet', 'by', 'William', 'Shakespeare', '1599', ']']\n",
77 | "['Actus', 'Primus', '.']\n",
78 | "['Fran', '.']\n"
79 | ]
80 | }
81 | ],
82 | "source": [
83 | "print(sentences[0]) # title, author, and year\n",
84 | "print(sentences[1])\n",
85 | "print(sentences[10])"
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "metadata": {},
91 | "source": [
92 | "### Preprocess data\n",
93 | "- Use re module to preprocess data\n",
94 | "- Convert all letters into lowercase\n",
95 | "- Remove punctuations, numbers, etc."
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 13,
101 | "metadata": {},
102 | "outputs": [],
103 | "source": [
104 | "for i in range(len(sentences)):\n",
105 | " sentences[i] = [word.lower() for word in sentences[i] if re.match('^[a-zA-Z]+', word)] "
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 14,
111 | "metadata": {},
112 | "outputs": [
113 | {
114 | "name": "stdout",
115 | "output_type": "stream",
116 | "text": [
117 | "['the', 'tragedie', 'of', 'hamlet', 'by', 'william', 'shakespeare']\n",
118 | "['actus', 'primus']\n",
119 | "['fran']\n"
120 | ]
121 | }
122 | ],
123 | "source": [
124 | "print(sentences[0]) # title, author, and year\n",
125 | "print(sentences[1])\n",
126 | "print(sentences[10])"
127 | ]
128 | },
129 | {
130 | "cell_type": "markdown",
131 | "metadata": {},
132 | "source": [
133 | "### Create Corpus instance\n",
134 | "- Sentences should be fitted into the Corpus instance\n",
135 | "- Recall that GloVe takes advantage of both count-based matrix factorization and local context-based window methods"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": 15,
141 | "metadata": {
142 | "collapsed": true
143 | },
144 | "outputs": [],
145 | "source": [
146 | "corpus = Corpus()"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": 16,
152 | "metadata": {
153 | "collapsed": true
154 | },
155 | "outputs": [],
156 | "source": [
157 | "corpus.fit(sentences, window = 3) # window parameter denotes the distance of context"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": 17,
163 | "metadata": {
164 | "collapsed": true
165 | },
166 | "outputs": [],
167 | "source": [
168 | "glove = Glove(no_components = 100, learning_rate = 0.05)"
169 | ]
170 | },
171 | {
172 | "cell_type": "markdown",
173 | "metadata": {},
174 | "source": [
175 | "### Train model\n",
176 | "- GloVe model is trained with corpus matrix (global statistics of words)\n",
177 | "- Key parameter description\n",
178 | " - **matrix**: co-occurence matrix of the corpus\n",
179 | " - **epochs**: number of epochs (i.e., training iterations)\n",
180 | " - **no_threads**: number of training threads\n",
181 | " - **verbose**: whether to print out the progress messages"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 27,
187 | "metadata": {
188 | "scrolled": true
189 | },
190 | "outputs": [
191 | {
192 | "name": "stdout",
193 | "output_type": "stream",
194 | "text": [
195 | "Performing 30 training epochs with 4 threads\n",
196 | "Epoch 0\n",
197 | "Epoch 1\n",
198 | "Epoch 2\n",
199 | "Epoch 3\n",
200 | "Epoch 4\n",
201 | "Epoch 5\n",
202 | "Epoch 6\n",
203 | "Epoch 7\n",
204 | "Epoch 8\n",
205 | "Epoch 9\n",
206 | "Epoch 10\n",
207 | "Epoch 11\n",
208 | "Epoch 12\n",
209 | "Epoch 13\n",
210 | "Epoch 14\n",
211 | "Epoch 15\n",
212 | "Epoch 16\n",
213 | "Epoch 17\n",
214 | "Epoch 18\n",
215 | "Epoch 19\n",
216 | "Epoch 20\n",
217 | "Epoch 21\n",
218 | "Epoch 22\n",
219 | "Epoch 23\n",
220 | "Epoch 24\n",
221 | "Epoch 25\n",
222 | "Epoch 26\n",
223 | "Epoch 27\n",
224 | "Epoch 28\n",
225 | "Epoch 29\n"
226 | ]
227 | }
228 | ],
229 | "source": [
230 | "glove.fit(matrix = corpus.matrix, epochs = 30, no_threads = Pool()._processes, verbose = True)"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": 24,
236 | "metadata": {
237 | "collapsed": true
238 | },
239 | "outputs": [],
240 | "source": [
241 | "glove.add_dictionary(corpus.dictionary) # supply a word-id dictionary to allow similarity queries"
242 | ]
243 | },
244 | {
245 | "cell_type": "markdown",
246 | "metadata": {},
247 | "source": [
248 | "### Save and load model\n",
249 | "- word2vec model can be saved and loaded locally\n",
250 | "- Doing so can reduce time to train model again"
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "execution_count": 32,
256 | "metadata": {
257 | "collapsed": true
258 | },
259 | "outputs": [],
260 | "source": [
261 | "glove.save('glove_model')"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": 40,
267 | "metadata": {
268 | "collapsed": true
269 | },
270 | "outputs": [],
271 | "source": [
272 | "glove.load('glove_model')"
273 | ]
274 | },
275 | {
276 | "cell_type": "markdown",
277 | "metadata": {},
278 | "source": [
279 | "### Similarity calculation\n",
280 | "- Similarity between embedded words (i.e., vectors) can be computed using metrics such as cosine similarity\n",
281 | "- For other metrics and comparisons between them, refer to: https://github.com/taki0112/Vector_Similarity"
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": 26,
287 | "metadata": {},
288 | "outputs": [
289 | {
290 | "data": {
291 | "text/plain": [
292 | "[('queene', 0.99407553073822963),\n",
293 | " ('matter', 0.99349224230824584),\n",
294 | " ('players', 0.98878981880933492),\n",
295 | " ('the', 0.98819079663149711),\n",
296 | " ('world', 0.98768057684646038),\n",
297 | " ('against', 0.98706467981587631),\n",
298 | " ('winde', 0.98687851286064199),\n",
299 | " ('drinke', 0.98627319315331974),\n",
300 | " ('very', 0.98547774026678192)]"
301 | ]
302 | },
303 | "execution_count": 26,
304 | "metadata": {},
305 | "output_type": "execute_result"
306 | }
307 | ],
308 | "source": [
309 | "glove.most_similar('king', number = 10)"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": 47,
315 | "metadata": {
316 | "collapsed": true
317 | },
318 | "outputs": [],
319 | "source": [
320 | "# define a function that converts word into embedded vector\n",
321 | "def vector_converter(word):\n",
322 | " idx = glove.dictionary[word]\n",
323 | " return glove.word_vectors[idx]"
324 | ]
325 | },
326 | {
327 | "cell_type": "code",
328 | "execution_count": 49,
329 | "metadata": {
330 | "collapsed": true
331 | },
332 | "outputs": [],
333 | "source": [
334 | "# define a function that computes cosine similarity between two words\n",
335 | "def cosine_similarity(v1, v2):\n",
336 | " return 1 - spatial.distance.cosine(v1, v2)"
337 | ]
338 | },
339 | {
340 | "cell_type": "code",
341 | "execution_count": 52,
342 | "metadata": {},
343 | "outputs": [],
344 | "source": [
345 | "v1 = vector_converter('king')\n",
346 | "v2 = vector_converter('queen')"
347 | ]
348 | },
349 | {
350 | "cell_type": "code",
351 | "execution_count": 53,
352 | "metadata": {
353 | "scrolled": true
354 | },
355 | "outputs": [
356 | {
357 | "data": {
358 | "text/plain": [
359 | "0.30658440396162456"
360 | ]
361 | },
362 | "execution_count": 53,
363 | "metadata": {},
364 | "output_type": "execute_result"
365 | }
366 | ],
367 | "source": [
368 | "cosine_similarity(v1, v2)"
369 | ]
370 | }
371 | ],
372 | "metadata": {
373 | "kernelspec": {
374 | "display_name": "Python 3",
375 | "language": "python",
376 | "name": "python3"
377 | },
378 | "language_info": {
379 | "codemirror_mode": {
380 | "name": "ipython",
381 | "version": 3
382 | },
383 | "file_extension": ".py",
384 | "mimetype": "text/x-python",
385 | "name": "python",
386 | "nbconvert_exporter": "python",
387 | "pygments_lexer": "ipython3",
388 | "version": "3.6.1"
389 | }
390 | },
391 | "nbformat": 4,
392 | "nbformat_minor": 2
393 | }
394 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Word-embedding-with-Python
2 | word2vec, doc2vec, and GloVe implementations with Python
3 |
4 | ## word2vec
5 |
6 | - Python implementation and application of word2vec with Gensim
7 | - Original paper: Mikolov, T., Chen, K., Corrado, G., & Dean, J. (2013). Efficient estimation of word representations in vector space. arXiv preprint arXiv:1301.3781.
8 |
9 |
10 | ## doc2vec
11 |
12 | - Python implementation and application of doc2vec with Gensim
13 | - Original paper: Le, Q., & Mikolov, T. (2014). Distributed representations of sentences and documents. In Proceedings of the 31st International Conference on Machine Learning (ICML-14) (pp. 1188-1196).
14 |
15 |
16 | ## GloVe
17 |
18 | - Python implementation and application of GloVe with glove-python
19 | - Original paper: Pennington, J., Socher, R., & Manning, C. (2014). GloVe: Global vectors for word representation. In Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP) (pp. 1532-1543).
20 |
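21 | ## Quick usage sketch
22 | 
23 | A minimal sketch of the common pattern in these notebooks, shown for word2vec (it assumes the nltk Gutenberg corpus has been downloaded and the Gensim 2.x-era API used throughout this repository):
24 | 
25 | ```python
26 | import re
27 | from multiprocessing import Pool
28 | 
29 | from gensim.models import Word2Vec
30 | from nltk.corpus import gutenberg
31 | 
32 | # load Hamlet and keep lowercase alphabetic tokens only
33 | sentences = list(gutenberg.sents('shakespeare-hamlet.txt'))
34 | sentences = [[w.lower() for w in sent if re.match('^[a-zA-Z]+', w)] for sent in sentences]
35 | 
36 | # skip-gram embeddings with 100 dimensions and a context window of 3
37 | model = Word2Vec(sentences = sentences, size = 100, sg = 1, window = 3,
38 |                  min_count = 1, iter = 10, workers = Pool()._processes)
39 | 
40 | print(model.most_similar('hamlet'))  # nearest neighbours of 'hamlet' in the embedding space
41 | ```
42 | 
43 | The doc2vec and GloVe notebooks follow the same load-preprocess-train-query flow with `Doc2Vec` (Gensim) and `Corpus`/`Glove` (glove-python), respectively.
44 | 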
--------------------------------------------------------------------------------
/Sentence modeling/papers/Representing sentences as low-rank subspaces.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buomsoo-kim/Word-embedding-with-Python/855bb6eaed8f846343a106e552f0fbb4950d1fc3/Sentence modeling/papers/Representing sentences as low-rank subspaces.pdf
--------------------------------------------------------------------------------
/Sentence modeling/source code/Sentence modeling.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Sentence modeling\n",
8 | "- One of the methods to represent sentences as vectors (Mu et al 2017)\n",
9 | "- Computing vector representations of each embedded word, and weight average them using PCA\n",
10 | " - If there are **n** words in a sentence, select **N** words with high explained variance (n>N)\n",
11 | " - Most of \"energy\" (around 80%) can be containted using only 4 words (N=4) in the original paper (Mu et al 2017)"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 13,
17 | "metadata": {
18 | "collapsed": true
19 | },
20 | "outputs": [],
21 | "source": [
22 | "import re\n",
23 | "import numpy as np\n",
24 | "\n",
25 | "from gensim.models import Word2Vec\n",
26 | "from nltk.corpus import gutenberg\n",
27 | "from multiprocessing import Pool\n",
28 | "from scipy import spatial\n",
29 | "from sklearn.decomposition import PCA"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 2,
35 | "metadata": {
36 | "collapsed": true
37 | },
38 | "outputs": [],
39 | "source": [
40 | "sentences = list(gutenberg.sents('shakespeare-hamlet.txt')) # import the corpus and convert into a list"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 3,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "name": "stdout",
50 | "output_type": "stream",
51 | "text": [
52 | "Type of corpus: \n",
53 | "Length of corpus: 3106\n"
54 | ]
55 | }
56 | ],
57 | "source": [
58 | "print('Type of corpus: ', type(sentences))\n",
59 | "print('Length of corpus: ', len(sentences))"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 5,
65 | "metadata": {},
66 | "outputs": [
67 | {
68 | "name": "stdout",
69 | "output_type": "stream",
70 | "text": [
71 | "['the', 'tragedie', 'of', 'hamlet', 'by', 'william', 'shakespeare']\n",
72 | "['actus', 'primus']\n",
73 | "['fran']\n"
74 | ]
75 | }
76 | ],
77 | "source": [
78 | "print(sentences[0]) # title, author, and year\n",
79 | "print(sentences[1])\n",
80 | "print(sentences[10])"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 4,
86 | "metadata": {
87 | "collapsed": true
88 | },
89 | "outputs": [],
90 | "source": [
91 | "for i in range(len(sentences)):\n",
92 | " sentences[i] = [word.lower() for word in sentences[i] if re.match('^[a-zA-Z]+', word)]"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 6,
98 | "metadata": {},
99 | "outputs": [
100 | {
101 | "name": "stdout",
102 | "output_type": "stream",
103 | "text": [
104 | "['the', 'tragedie', 'of', 'hamlet', 'by', 'william', 'shakespeare']\n",
105 | "['actus', 'primus']\n",
106 | "['fran']\n"
107 | ]
108 | }
109 | ],
110 | "source": [
111 | "print(sentences[0]) # title, author, and year\n",
112 | "print(sentences[1])\n",
113 | "print(sentences[10])"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": 9,
119 | "metadata": {
120 | "collapsed": true
121 | },
122 | "outputs": [],
123 | "source": [
124 | "# set threshold to consider only sentences longer than certain integer\n",
125 | "threshold = 5"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": 10,
131 | "metadata": {
132 | "collapsed": true
133 | },
134 | "outputs": [],
135 | "source": [
136 | "for i in range(len(sentences)):\n",
137 | " if len(sentences[i]) < 5:\n",
138 | " sentences[i] = None"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 11,
144 | "metadata": {
145 | "collapsed": true
146 | },
147 | "outputs": [],
148 | "source": [
149 | "sentences = [sentence for sentence in sentences if sentence is not None] "
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 12,
155 | "metadata": {},
156 | "outputs": [
157 | {
158 | "name": "stdout",
159 | "output_type": "stream",
160 | "text": [
161 | "Length of corpus: 1442\n"
162 | ]
163 | }
164 | ],
165 | "source": [
166 | "print('Length of corpus: ', len(sentences))"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": 7,
172 | "metadata": {
173 | "collapsed": true
174 | },
175 | "outputs": [],
176 | "source": [
177 | "model = Word2Vec(sentences = sentences, size = 100, sg = 1, window = 3, min_count = 1, iter = 10, workers = Pool()._processes)"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": 8,
183 | "metadata": {
184 | "collapsed": true
185 | },
186 | "outputs": [],
187 | "source": [
188 | "model.init_sims(replace = True)"
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": 19,
194 | "metadata": {
195 | "collapsed": true
196 | },
197 | "outputs": [],
198 | "source": [
199 | "# converting each word into its vector representation\n",
200 | "for i in range(len(sentences)):\n",
201 | " sentences[i] = [model[word] for word in sentences[i]]"
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": null,
207 | "metadata": {
208 | "collapsed": true
209 | },
210 | "outputs": [],
211 | "source": [
212 | "print(sentences[0]) # vector representation of first sentence"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": 78,
218 | "metadata": {
219 | "collapsed": true
220 | },
221 | "outputs": [],
222 | "source": [
223 | "# define function to compute weighted vector representation of sentence\n",
224 | "# parameter 'n' means number of words to be accounted when computing weighted average\n",
225 | "def sent_PCA(sentence, n = 2):\n",
226 | " pca = PCA(n_components = n)\n",
227 | " pca.fit(np.array(sentence).transpose())\n",
228 | " variance = np.array(pca.explained_variance_ratio_)\n",
229 | " words = []\n",
230 | " for _ in range(n):\n",
231 | " idx = np.argmax(variance)\n",
232 | " words.append(np.amax(variance) * sentence[idx])\n",
233 | " variance[idx] = 0\n",
234 | " return np.sum(words, axis = 0)"
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": 80,
240 | "metadata": {
241 | "collapsed": true
242 | },
243 | "outputs": [],
244 | "source": [
245 | "sent_vectorized = []"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": 81,
251 | "metadata": {
252 | "collapsed": true
253 | },
254 | "outputs": [],
255 | "source": [
256 | "# computing vector representation of each sentence\n",
257 | "for sentence in sentences:\n",
258 | " sent_vectorized.append(sent_PCA(sentence))"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": 88,
264 | "metadata": {
265 | "scrolled": true
266 | },
267 | "outputs": [
268 | {
269 | "data": {
270 | "text/plain": [
271 | "True"
272 | ]
273 | },
274 | "execution_count": 88,
275 | "metadata": {},
276 | "output_type": "execute_result"
277 | }
278 | ],
279 | "source": [
280 | "# vector representation of first sentence\n",
281 | "list(sent_PCA(sentences[0])) == list(sent_vectorized[0])"
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": 83,
287 | "metadata": {
288 | "collapsed": true
289 | },
290 | "outputs": [],
291 | "source": [
292 | "# define a function that computes cosine similarity between two words\n",
293 | "def cosine_similarity(v1, v2):\n",
294 | " return 1 - spatial.distance.cosine(v1, v2)"
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": 86,
300 | "metadata": {},
301 | "outputs": [
302 | {
303 | "name": "stdout",
304 | "output_type": "stream",
305 | "text": [
306 | "0.980275605104\n"
307 | ]
308 | }
309 | ],
310 | "source": [
311 | "# similarity between 11th and 101th sentence in the corpus\n",
312 | "print(cosine_similarity(sent_vectorized[10], sent_vectorized[100]))"
313 | ]
314 | }
315 | ],
316 | "metadata": {
317 | "kernelspec": {
318 | "display_name": "Python 3",
319 | "language": "python",
320 | "name": "python3"
321 | },
322 | "language_info": {
323 | "codemirror_mode": {
324 | "name": "ipython",
325 | "version": 3
326 | },
327 | "file_extension": ".py",
328 | "mimetype": "text/x-python",
329 | "name": "python",
330 | "nbconvert_exporter": "python",
331 | "pygments_lexer": "ipython3",
332 | "version": "3.6.1"
333 | }
334 | },
335 | "nbformat": 4,
336 | "nbformat_minor": 2
337 | }
338 |
--------------------------------------------------------------------------------
/Sentence modeling/source code/sentence-modeling.py:
--------------------------------------------------------------------------------
1 | import re
2 | import numpy as np
3 |
4 | from gensim.models import Word2Vec
5 | from nltk.corpus import gutenberg
6 | from multiprocessing import Pool
7 | from scipy import spatial
8 | from sklearn.decomposition import PCA
9 |
10 | sentences = list(gutenberg.sents('shakespeare-hamlet.txt')) # import the corpus and convert into a list
11 |
12 | for i in range(len(sentences)):
13 | sentences[i] = [word.lower() for word in sentences[i] if re.match('^[a-zA-Z]+', word)]
14 |
15 | # set a threshold to keep only sentences with at least this many words
16 | threshold = 5
17 |
18 | for i in range(len(sentences)):
19 |     if len(sentences[i]) < threshold:
20 | sentences[i] = None
21 |
22 | sentences = [sentence for sentence in sentences if sentence is not None]
23 |
24 | model = Word2Vec(sentences = sentences, size = 100, sg = 1, window = 3, min_count = 1, iter = 10, workers = Pool()._processes)
25 | model.init_sims(replace = True)
26 |
27 | # converting each word into its vector representation
28 | for i in range(len(sentences)):
29 | sentences[i] = [model[word] for word in sentences[i]]
30 |
31 |
32 | # define a function to compute the weighted vector representation of a sentence
33 | # parameter 'n' is the number of words taken into account when computing the weighted average
34 | def sent_PCA(sentence, n = 2):
35 | pca = PCA(n_components = n)
36 | pca.fit(np.array(sentence).transpose())
37 | variance = np.array(pca.explained_variance_ratio_)
38 | words = []
39 | for _ in range(n):
40 | idx = np.argmax(variance)
41 | words.append(np.amax(variance) * sentence[idx])
42 | variance[idx] = 0
43 | return np.sum(words, axis = 0)
44 |
45 | sent_vectorized = []
46 |
47 | # computing vector representation of each sentence
48 | for sentence in sentences:
49 | sent_vectorized.append(sent_PCA(sentence))
50 |
51 | # define a function that computes cosine similarity between two vectors
52 | def cosine_similarity(v1, v2):
53 | return 1 - spatial.distance.cosine(v1, v2)
54 |
55 | # similarity between the 11th and 101st sentences in the corpus
56 | print(cosine_similarity(sent_vectorized[10], sent_vectorized[100]))
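57 | 
58 | # usage sketch: embed a new sentence with the same pipeline and compare it with the
59 | # first corpus sentence; the words below all occur in the preprocessed Hamlet corpus
60 | # (words outside the trained vocabulary would raise a KeyError with this minimal approach)
61 | new_sentence = ['the', 'king', 'and', 'the', 'queene']
62 | new_vector = sent_PCA([model[word] for word in new_sentence])
63 | print(cosine_similarity(new_vector, sent_vectorized[0]))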
--------------------------------------------------------------------------------
/doc2vec/papers/le_mikolov_14 (Distributed representations of sentences and documents).pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buomsoo-kim/Word-embedding-with-Python/855bb6eaed8f846343a106e552f0fbb4950d1fc3/doc2vec/papers/le_mikolov_14 (Distributed representations of sentences and documents).pdf
--------------------------------------------------------------------------------
/doc2vec/source code/doc2vec.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## doc2vec implementation with Python (& Gensim)\n",
8 | "- Note: This code is written in Python 3.6.1 (+Gensim 2.3.0)"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 75,
14 | "metadata": {
15 | "collapsed": true
16 | },
17 | "outputs": [],
18 | "source": [
19 | "import re\n",
20 | "import numpy as np\n",
21 | "\n",
22 | "from gensim.models import Doc2Vec\n",
23 | "from gensim.models.doc2vec import TaggedDocument\n",
24 | "from nltk.corpus import gutenberg\n",
25 | "from multiprocessing import Pool\n",
26 | "from scipy import spatial"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "### Import training dataset\n",
34 | "- Import Shakespeare's Hamlet corpus from nltk library"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 45,
40 | "metadata": {
41 | "collapsed": true
42 | },
43 | "outputs": [],
44 | "source": [
45 | "sentences = list(gutenberg.sents('shakespeare-hamlet.txt')) # import the corpus and convert into a list"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 46,
51 | "metadata": {},
52 | "outputs": [
53 | {
54 | "name": "stdout",
55 | "output_type": "stream",
56 | "text": [
57 | "Type of corpus: \n",
58 | "Length of corpus: 3106\n"
59 | ]
60 | }
61 | ],
62 | "source": [
63 | "print('Type of corpus: ', type(sentences))\n",
64 | "print('Length of corpus: ', len(sentences))"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 47,
70 | "metadata": {},
71 | "outputs": [
72 | {
73 | "name": "stdout",
74 | "output_type": "stream",
75 | "text": [
76 | "['[', 'The', 'Tragedie', 'of', 'Hamlet', 'by', 'William', 'Shakespeare', '1599', ']']\n",
77 | "['Actus', 'Primus', '.']\n",
78 | "['Fran', '.']\n"
79 | ]
80 | }
81 | ],
82 | "source": [
83 | "print(sentences[0]) # title, author, and year\n",
84 | "print(sentences[1])\n",
85 | "print(sentences[10])"
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "metadata": {},
91 | "source": [
92 | "### Preprocess data\n",
93 | "- Use re module to preprocess data\n",
94 | "- Convert all letters into lowercase\n",
95 | "- Remove punctuations, numbers, etc.\n",
96 | "- For the doc2vec model, input data should be in format of **iterable TaggedDocuments\"**\n",
97 | " - Each TaggedDocument instance comprises **words** and **tags**\n",
98 | " - Hence, each document (i.e., a sentence or paragraph) should have a unique tag which is identifiable"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 48,
104 | "metadata": {
105 | "collapsed": true
106 | },
107 | "outputs": [],
108 | "source": [
109 | "for i in range(len(sentences)):\n",
110 | " sentences[i] = [word.lower() for word in sentences[i] if re.match('^[a-zA-Z]+', word)] "
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": 49,
116 | "metadata": {},
117 | "outputs": [
118 | {
119 | "name": "stdout",
120 | "output_type": "stream",
121 | "text": [
122 | "['the', 'tragedie', 'of', 'hamlet', 'by', 'william', 'shakespeare']\n",
123 | "['actus', 'primus']\n",
124 | "['fran']\n"
125 | ]
126 | }
127 | ],
128 | "source": [
129 | "print(sentences[0]) # title, author, and year\n",
130 | "print(sentences[1])\n",
131 | "print(sentences[10])"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 81,
137 | "metadata": {
138 | "collapsed": true
139 | },
140 | "outputs": [],
141 | "source": [
142 | "for i in range(len(sentences)):\n",
143 | " sentences[i] = TaggedDocument(words = sentences[i], tags = ['sent{}'.format(i)]) # converting each sentence into a TaggedDocument"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 82,
149 | "metadata": {},
150 | "outputs": [
151 | {
152 | "data": {
153 | "text/plain": [
154 | "TaggedDocument(words=['the', 'tragedie', 'of', 'hamlet', 'by', 'william', 'shakespeare'], tags=['sent0'])"
155 | ]
156 | },
157 | "execution_count": 82,
158 | "metadata": {},
159 | "output_type": "execute_result"
160 | }
161 | ],
162 | "source": [
163 | "sentences[0]"
164 | ]
165 | },
166 | {
167 | "cell_type": "markdown",
168 | "metadata": {},
169 | "source": [
170 | "### Create and train model\n",
171 | "- Create a doc2vec model and train it with Hamlet corpus\n",
172 | "- Key parameter description (https://radimrehurek.com/gensim/models/doc2vec.html)\n",
173 | " - **documents**: training data (has to be iterable TaggedDocument instances)\n",
174 | " - **size**: dimension of embedding space\n",
175 | " - **dm**: DBOW if 0, distributed-memory if 1\n",
176 | " - **window**: number of words accounted for each context (if the window size is 3, 3 word in the left neighorhood and 3 word in the right neighborhood are considered)\n",
177 | " - **min_count**: minimum count of words to be included in the vocabulary\n",
178 | " - **iter**: number of training iterations\n",
179 | " - **workers**: number of worker threads to train"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 83,
185 | "metadata": {
186 | "collapsed": true
187 | },
188 | "outputs": [],
189 | "source": [
190 | "model = Doc2Vec(documents = sentences, dm = 1, size = 100, window = 3, min_count = 1, iter = 10, workers = Pool()._processes)"
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "execution_count": 96,
196 | "metadata": {
197 | "collapsed": true
198 | },
199 | "outputs": [],
200 | "source": [
201 | "model.init_sims(replace = True)"
202 | ]
203 | },
204 | {
205 | "cell_type": "markdown",
206 | "metadata": {},
207 | "source": [
208 | "### Save and load model\n",
209 | "- doc2vec model can be saved and loaded locally\n",
210 | "- Doing so can reduce time to train model again"
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": 99,
216 | "metadata": {
217 | "collapsed": true
218 | },
219 | "outputs": [],
220 | "source": [
221 | "model.save('doc2vec_model')"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": 100,
227 | "metadata": {
228 | "collapsed": true
229 | },
230 | "outputs": [],
231 | "source": [
232 | "model = Doc2Vec.load('doc2vec_model')"
233 | ]
234 | },
235 | {
236 | "cell_type": "markdown",
237 | "metadata": {},
238 | "source": [
239 | "### Similarity calculation\n",
240 | "- Similarity between embedded words (i.e., vectors) can be computed using metrics such as cosine similarity\n",
241 | "- For other metrics and comparisons between them, refer to: https://github.com/taki0112/Vector_Similarity"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": 94,
247 | "metadata": {
248 | "collapsed": true
249 | },
250 | "outputs": [],
251 | "source": [
252 | "v1 = model.infer_vector('sent2') # in doc2vec, infer_vector() function is used to infer the vector embedding of a document\n",
253 | "v2 = model.infer_vector('sent3')"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": 95,
259 | "metadata": {},
260 | "outputs": [
261 | {
262 | "data": {
263 | "text/plain": [
264 | "[('seeke', 0.9795917272567749),\n",
265 | " ('hither', 0.9794537425041199),\n",
266 | " ('touching', 0.9791266918182373),\n",
267 | " ('spade', 0.9790579080581665),\n",
268 | " ('goes', 0.9789791107177734),\n",
269 | " ('hit', 0.9789602756500244),\n",
270 | " ('lose', 0.9786853790283203),\n",
271 | " ('countries', 0.9786409139633179),\n",
272 | " ('rash', 0.9785533547401428),\n",
273 | " ('honor', 0.978546142578125)]"
274 | ]
275 | },
276 | "execution_count": 95,
277 | "metadata": {},
278 | "output_type": "execute_result"
279 | }
280 | ],
281 | "source": [
282 | "model.most_similar([v1])"
283 | ]
284 | },
285 | {
286 | "cell_type": "code",
287 | "execution_count": 97,
288 | "metadata": {
289 | "collapsed": true
290 | },
291 | "outputs": [],
292 | "source": [
293 | "# define a function that computes cosine similarity between two words\n",
294 | "def cosine_similarity(v1, v2):\n",
295 | " return 1 - spatial.distance.cosine(v1, v2)"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": 98,
301 | "metadata": {},
302 | "outputs": [
303 | {
304 | "data": {
305 | "text/plain": [
306 | "0.95258642555608464"
307 | ]
308 | },
309 | "execution_count": 98,
310 | "metadata": {},
311 | "output_type": "execute_result"
312 | }
313 | ],
314 | "source": [
315 | "cosine_similarity(v1, v2)"
316 | ]
317 | }
318 | ],
319 | "metadata": {
320 | "kernelspec": {
321 | "display_name": "Python 3",
322 | "language": "python",
323 | "name": "python3"
324 | },
325 | "language_info": {
326 | "codemirror_mode": {
327 | "name": "ipython",
328 | "version": 3
329 | },
330 | "file_extension": ".py",
331 | "mimetype": "text/x-python",
332 | "name": "python",
333 | "nbconvert_exporter": "python",
334 | "pygments_lexer": "ipython3",
335 | "version": "3.6.1"
336 | }
337 | },
338 | "nbformat": 4,
339 | "nbformat_minor": 2
340 | }
341 |
--------------------------------------------------------------------------------
/word2vec/papers/mikolov_et_al_13 (Distributed representations of words and phrases and their compositionality).pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buomsoo-kim/Word-embedding-with-Python/855bb6eaed8f846343a106e552f0fbb4950d1fc3/word2vec/papers/mikolov_et_al_13 (Distributed representations of words and phrases and their compositionality).pdf
--------------------------------------------------------------------------------
/word2vec/papers/mikolov_et_al_13 (Efficient estimation of word representations in vector space).pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buomsoo-kim/Word-embedding-with-Python/855bb6eaed8f846343a106e552f0fbb4950d1fc3/word2vec/papers/mikolov_et_al_13 (Efficient estimation of word representations in vector space).pdf
--------------------------------------------------------------------------------
/word2vec/source code/word2vec.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## word2vec implementation with Python (& Gensim)\n",
8 | "- Note: This code is written in Python 3.6.1 (+Gensim 2.3.0)"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 59,
14 | "metadata": {
15 | "collapsed": true
16 | },
17 | "outputs": [],
18 | "source": [
19 | "import re\n",
20 | "import numpy as np\n",
21 | "\n",
22 | "from gensim.models import Word2Vec\n",
23 | "from nltk.corpus import gutenberg\n",
24 | "from multiprocessing import Pool\n",
25 | "from scipy import spatial"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "### Import training dataset\n",
33 | "- Import Shakespeare's Hamlet corpus from nltk library"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 74,
39 | "metadata": {
40 | "collapsed": true
41 | },
42 | "outputs": [],
43 | "source": [
44 | "sentences = list(gutenberg.sents('shakespeare-hamlet.txt')) # import the corpus and convert into a list"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 75,
50 | "metadata": {},
51 | "outputs": [
52 | {
53 | "name": "stdout",
54 | "output_type": "stream",
55 | "text": [
56 | "Type of corpus: \n",
57 | "Length of corpus: 3106\n"
58 | ]
59 | }
60 | ],
61 | "source": [
62 | "print('Type of corpus: ', type(sentences))\n",
63 | "print('Length of corpus: ', len(sentences))"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 76,
69 | "metadata": {},
70 | "outputs": [
71 | {
72 | "name": "stdout",
73 | "output_type": "stream",
74 | "text": [
75 | "['[', 'The', 'Tragedie', 'of', 'Hamlet', 'by', 'William', 'Shakespeare', '1599', ']']\n",
76 | "['Actus', 'Primus', '.']\n",
77 | "['Fran', '.']\n"
78 | ]
79 | }
80 | ],
81 | "source": [
82 | "print(sentences[0]) # title, author, and year\n",
83 | "print(sentences[1])\n",
84 | "print(sentences[10])"
85 | ]
86 | },
87 | {
88 | "cell_type": "markdown",
89 | "metadata": {},
90 | "source": [
91 | "### Preprocess data\n",
92 | "- Use re module to preprocess data\n",
93 | "- Convert all letters into lowercase\n",
94 | "- Remove punctuations, numbers, etc."
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 77,
100 | "metadata": {
101 | "collapsed": true
102 | },
103 | "outputs": [],
104 | "source": [
105 | "for i in range(len(sentences)):\n",
106 | " sentences[i] = [word.lower() for word in sentences[i] if re.match('^[a-zA-Z]+', word)] "
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": 78,
112 | "metadata": {},
113 | "outputs": [
114 | {
115 | "name": "stdout",
116 | "output_type": "stream",
117 | "text": [
118 | "['the', 'tragedie', 'of', 'hamlet', 'by', 'william', 'shakespeare']\n",
119 | "['actus', 'primus']\n",
120 | "['fran']\n"
121 | ]
122 | }
123 | ],
124 | "source": [
125 | "print(sentences[0]) # title, author, and year\n",
126 | "print(sentences[1])\n",
127 | "print(sentences[10])"
128 | ]
129 | },
130 | {
131 | "cell_type": "markdown",
132 | "metadata": {},
133 | "source": [
134 | "### Create and train model\n",
135 | "- Create a word2vec model and train it with Hamlet corpus\n",
136 | "- Key parameter description (https://radimrehurek.com/gensim/models/word2vec.html)\n",
137 | " - **sentences**: training data (has to be a list with tokenized sentences)\n",
138 | " - **size**: dimension of embedding space\n",
139 | " - **sg**: CBOW if 0, skip-gram if 1\n",
140 | " - **window**: number of words accounted for each context (if the window size is 3, 3 word in the left neighorhood and 3 word in the right neighborhood are considered)\n",
141 | " - **min_count**: minimum count of words to be included in the vocabulary\n",
142 | " - **iter**: number of training iterations\n",
143 | " - **workers**: number of worker threads to train"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 79,
149 | "metadata": {
150 | "collapsed": true
151 | },
152 | "outputs": [],
153 | "source": [
154 | "model = Word2Vec(sentences = sentences, size = 100, sg = 1, window = 3, min_count = 1, iter = 10, workers = Pool()._processes)"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": 80,
160 | "metadata": {
161 | "collapsed": true
162 | },
163 | "outputs": [],
164 | "source": [
165 | "model.init_sims(replace = True)"
166 | ]
167 | },
168 | {
169 | "cell_type": "markdown",
170 | "metadata": {},
171 | "source": [
172 | "### Save and load model\n",
173 | "- word2vec model can be saved and loaded locally\n",
174 | "- Doing so can reduce time to train model again"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": 91,
180 | "metadata": {
181 | "collapsed": true
182 | },
183 | "outputs": [],
184 | "source": [
185 | "model.save('word2vec_model')"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": 92,
191 | "metadata": {
192 | "collapsed": true
193 | },
194 | "outputs": [],
195 | "source": [
196 | "model = Word2Vec.load('word2vec_model')"
197 | ]
198 | },
199 | {
200 | "cell_type": "markdown",
201 | "metadata": {},
202 | "source": [
203 | "### Similarity calculation\n",
204 | "- Similarity between embedded words (i.e., vectors) can be computed using metrics such as cosine similarity\n",
205 | "- For other metrics and comparisons between them, refer to: https://github.com/taki0112/Vector_Similarity"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": 86,
211 | "metadata": {},
212 | "outputs": [
213 | {
214 | "data": {
215 | "text/plain": [
216 | "[('horatio', 0.9978846311569214),\n",
217 | " ('queene', 0.9971947073936462),\n",
218 | " ('laertes', 0.9971820116043091),\n",
219 | " ('king', 0.9968599081039429),\n",
220 | " ('mother', 0.9966716170310974),\n",
221 | " ('where', 0.9966292381286621),\n",
222 | " ('deere', 0.9965540170669556),\n",
223 | " ('ophelia', 0.9964221715927124),\n",
224 | " ('very', 0.9963752627372742),\n",
225 | " ('oh', 0.9963476657867432)]"
226 | ]
227 | },
228 | "execution_count": 86,
229 | "metadata": {},
230 | "output_type": "execute_result"
231 | }
232 | ],
233 | "source": [
234 | "model.most_similar('hamlet')"
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": 87,
240 | "metadata": {
241 | "collapsed": true,
242 | "scrolled": true
243 | },
244 | "outputs": [],
245 | "source": [
246 | "v1 = model['king']\n",
247 | "v2 = model['queen']"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": 90,
253 | "metadata": {
254 | "collapsed": true
255 | },
256 | "outputs": [],
257 | "source": [
258 | "# define a function that computes cosine similarity between two words\n",
259 | "def cosine_similarity(v1, v2):\n",
260 | " return 1 - spatial.distance.cosine(v1, v2)"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": 89,
266 | "metadata": {},
267 | "outputs": [
268 | {
269 | "data": {
270 | "text/plain": [
271 | "0.99437165260314941"
272 | ]
273 | },
274 | "execution_count": 89,
275 | "metadata": {},
276 | "output_type": "execute_result"
277 | }
278 | ],
279 | "source": [
280 | "cosine_similarity(v1, v2)"
281 | ]
282 | }
283 | ],
284 | "metadata": {
285 | "kernelspec": {
286 | "display_name": "Python 3",
287 | "language": "python",
288 | "name": "python3"
289 | },
290 | "language_info": {
291 | "codemirror_mode": {
292 | "name": "ipython",
293 | "version": 3
294 | },
295 | "file_extension": ".py",
296 | "mimetype": "text/x-python",
297 | "name": "python",
298 | "nbconvert_exporter": "python",
299 | "pygments_lexer": "ipython3",
300 | "version": "3.6.1"
301 | }
302 | },
303 | "nbformat": 4,
304 | "nbformat_minor": 2
305 | }
306 |
--------------------------------------------------------------------------------