├── Pre-trained GloVe.ipynb ├── Pre-trained word-vectors from gensim-data.ipynb ├── README.md └── data ├── r8-test-all-terms.txt ├── r8-train-all-terms.txt └── readme /Pre-trained GloVe.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Loading word vectors...\n", 13 | "Found 400000 word vectors.\n", 14 | "king - man = queen - woman\n", 15 | "france - paris = britain - london\n", 16 | "france - paris = italy - rome\n", 17 | "paris - france = rome - italy\n", 18 | "france - french = england - english\n", 19 | "japan - japanese = china - chinese\n", 20 | "japan - japanese = italy - italian\n", 21 | "japan - japanese = australia - australian\n", 22 | "december - november = july - june\n", 23 | "miami - florida = houston - texas\n", 24 | "einstein - scientist = matisse - painter\n", 25 | "china - rice = chinese - bread\n", 26 | "man - woman = he - she\n", 27 | "man - woman = uncle - aunt\n", 28 | "man - woman = brother - sister\n", 29 | "man - woman = friend - wife\n", 30 | "man - woman = actor - actress\n", 31 | "man - woman = father - mother\n", 32 | "heir - heiress = queen - princess\n", 33 | "nephew - niece = uncle - aunt\n", 34 | "france - paris = japan - tokyo\n", 35 | "france - paris = china - beijing\n", 36 | "february - january = october - november\n", 37 | "france - paris = italy - rome\n", 38 | "paris - france = rome - italy\n" 39 | ] 40 | } 41 | ], 42 | "source": [ 43 | "from __future__ import print_function, division\n", 44 | "from future.utils import iteritems\n", 45 | "from builtins import range\n", 46 | "\n", 47 | "import numpy as np\n", 48 | "from sklearn.metrics.pairwise import pairwise_distances\n", 49 | "\n", 50 | "\n", 51 | "def dist1(a, b):\n", 52 | " return np.linalg.norm(a - b)\n", 53 | "def dist2(a, b):\n", 54 | " 
return 1 - a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))\n", 55 | "\n", 56 | "# pick a distance type\n", 57 | "dist, metric = dist2, 'cosine'\n", 58 | "# dist, metric = dist1, 'euclidean'\n", 59 | "\n", 60 | "## faster\n", 61 | "def find_analogies(w1, w2, w3):\n", 62 | " for w in (w1, w2, w3):\n", 63 | " if w not in word2vec:\n", 64 | " print(\"%s not in dictionary\" % w)\n", 65 | " return\n", 66 | "\n", 67 | " king = word2vec[w1]\n", 68 | " man = word2vec[w2]\n", 69 | " woman = word2vec[w3]\n", 70 | " v0 = king - man + woman\n", 71 | "\n", 72 | " distances = pairwise_distances(v0.reshape(1, D), embedding, metric=metric).reshape(V)\n", 73 | " idxs = distances.argsort()[:4]\n", 74 | " for idx in idxs:\n", 75 | " word = idx2word[idx]\n", 76 | " if word not in (w1, w2, w3): \n", 77 | " best_word = word\n", 78 | " break\n", 79 | "\n", 80 | " print(w1, \"-\", w2, \"=\", best_word, \"-\", w3)\n", 81 | "\n", 82 | "\n", 83 | "def nearest_neighbors(w, n=5):\n", 84 | " if w not in word2vec:\n", 85 | " print(\"%s not in dictionary:\" % w)\n", 86 | " return\n", 87 | "\n", 88 | " v = word2vec[w]\n", 89 | " distances = pairwise_distances(v.reshape(1, D), embedding, metric=metric).reshape(V)\n", 90 | " idxs = distances.argsort()[1:n+1]\n", 91 | " print(\"neighbors of: %s\" % w)\n", 92 | " for idx in idxs:\n", 93 | " print(\"\\t%s\" % idx2word[idx])\n", 94 | "\n", 95 | "\n", 96 | "\n", 97 | "# load in pre-trained word vectors\n", 98 | "print('Loading word vectors...')\n", 99 | "word2vec = {}\n", 100 | "embedding = []\n", 101 | "idx2word = []\n", 102 | "with open('glove/glove.6B.50d.txt', encoding='utf-8') as f:\n", 103 | " # is just a space-separated text file in the format:\n", 104 | " # word vec[0] vec[1] vec[2] ...\n", 105 | " for line in f:\n", 106 | " values = line.split()\n", 107 | " word = values[0]\n", 108 | " vec = np.asarray(values[1:], dtype='float32')\n", 109 | " word2vec[word] = vec\n", 110 | " embedding.append(vec)\n", 111 | " idx2word.append(word)\n", 112 | 
"print('Found %s word vectors.' % len(word2vec))\n", 113 | "embedding = np.array(embedding)\n", 114 | "V, D = embedding.shape\n", 115 | "\n", 116 | "\n", 117 | "find_analogies('king', 'man', 'woman')\n", 118 | "find_analogies('france', 'paris', 'london')\n", 119 | "find_analogies('france', 'paris', 'rome')\n", 120 | "find_analogies('paris', 'france', 'italy')\n", 121 | "find_analogies('france', 'french', 'english')\n", 122 | "find_analogies('japan', 'japanese', 'chinese')\n", 123 | "find_analogies('japan', 'japanese', 'italian')\n", 124 | "find_analogies('japan', 'japanese', 'australian')\n", 125 | "find_analogies('december', 'november', 'june')\n", 126 | "find_analogies('miami', 'florida', 'texas')\n", 127 | "find_analogies('einstein', 'scientist', 'painter')\n", 128 | "find_analogies('china', 'rice', 'bread')\n", 129 | "find_analogies('man', 'woman', 'she')\n", 130 | "find_analogies('man', 'woman', 'aunt')\n", 131 | "find_analogies('man', 'woman', 'sister')\n", 132 | "find_analogies('man', 'woman', 'wife')\n", 133 | "find_analogies('man', 'woman', 'actress')\n", 134 | "find_analogies('man', 'woman', 'mother')\n", 135 | "find_analogies('heir', 'heiress', 'princess')\n", 136 | "find_analogies('nephew', 'niece', 'aunt')\n", 137 | "find_analogies('france', 'paris', 'tokyo')\n", 138 | "find_analogies('france', 'paris', 'beijing')\n", 139 | "find_analogies('february', 'january', 'november')\n", 140 | "find_analogies('france', 'paris', 'rome')\n", 141 | "find_analogies('paris', 'france', 'italy')" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 3, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "neighbors of: king\n", 154 | "\tprince\n", 155 | "\tqueen\n", 156 | "\tii\n", 157 | "\temperor\n", 158 | "\tson\n", 159 | "neighbors of: france\n", 160 | "\tfrench\n", 161 | "\tbelgium\n", 162 | "\tparis\n", 163 | "\tspain\n", 164 | "\tnetherlands\n", 165 | "neighbors of: 
japan\n", 166 | "\tjapanese\n", 167 | "\tchina\n", 168 | "\tkorea\n", 169 | "\ttokyo\n", 170 | "\ttaiwan\n", 171 | "neighbors of: einstein\n", 172 | "\trelativity\n", 173 | "\tbohr\n", 174 | "\tphysics\n", 175 | "\theisenberg\n", 176 | "\tfreud\n", 177 | "neighbors of: woman\n", 178 | "\tgirl\n", 179 | "\tman\n", 180 | "\tmother\n", 181 | "\ther\n", 182 | "\tboy\n", 183 | "neighbors of: nephew\n", 184 | "\tcousin\n", 185 | "\tbrother\n", 186 | "\tgrandson\n", 187 | "\tson\n", 188 | "\tuncle\n", 189 | "neighbors of: february\n", 190 | "\toctober\n", 191 | "\tdecember\n", 192 | "\tjanuary\n", 193 | "\taugust\n", 194 | "\tseptember\n", 195 | "neighbors of: rome\n", 196 | "\tnaples\n", 197 | "\tvenice\n", 198 | "\titaly\n", 199 | "\tturin\n", 200 | "\tpope\n" 201 | ] 202 | } 203 | ], 204 | "source": [ 205 | "nearest_neighbors('king')\n", 206 | "nearest_neighbors('france')\n", 207 | "nearest_neighbors('japan')\n", 208 | "nearest_neighbors('einstein')\n", 209 | "nearest_neighbors('woman')\n", 210 | "nearest_neighbors('nephew')\n", 211 | "nearest_neighbors('february')\n", 212 | "nearest_neighbors('rome')" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [] 221 | } 222 | ], 223 | "metadata": { 224 | "kernelspec": { 225 | "display_name": "Python 3", 226 | "language": "python", 227 | "name": "python3" 228 | }, 229 | "language_info": { 230 | "codemirror_mode": { 231 | "name": "ipython", 232 | "version": 3 233 | }, 234 | "file_extension": ".py", 235 | "mimetype": "text/x-python", 236 | "name": "python", 237 | "nbconvert_exporter": "python", 238 | "pygments_lexer": "ipython3", 239 | "version": "3.6.7" 240 | } 241 | }, 242 | "nbformat": 4, 243 | "nbformat_minor": 2 244 | } 245 | -------------------------------------------------------------------------------- /Pre-trained word-vectors from gensim-data.ipynb: -------------------------------------------------------------------------------- 
"""Analogies and nearest neighbours with pre-trained vectors from gensim-data.

Downloads (or reuses the cached copy of) the 100-dimensional GloVe vectors
trained on Wikipedia + Gigaword via gensim's downloader API, then runs the
same analogy and nearest-neighbour queries as the hand-rolled GloVe notebook.
"""
import gensim.downloader as api

# KeyedVectors instance; most_similar does the normalised vector arithmetic
word_vectors = api.load("glove-wiki-gigaword-100")


def find_analogies(w1, w2, w3):
    """Print the word best completing the analogy  w1 - w2 = ? - w3.

    Robustness fix: gensim raises KeyError on out-of-vocabulary words; guard
    and report instead, matching the sibling GloVe notebook's convention.
    """
    for w in (w1, w2, w3):
        if w not in word_vectors:
            print("%s not in dictionary" % w)
            return
    r = word_vectors.most_similar(positive=[w1, w3], negative=[w2])
    print("%s - %s = %s - %s" % (w1, w2, r[0][0], w3))


def nearest_neighbors(w):
    """Print the words most similar to w (gensim's default top 10)."""
    if w not in word_vectors:
        print("%s not in dictionary" % w)
        return
    r = word_vectors.most_similar(positive=[w])
    print("neighbors of: %s" % w)
    # similarity scores are not displayed, only the neighbour words
    for word, _ in r:
        print("\t%s" % word)


find_analogies('king', 'man', 'woman')
find_analogies('france', 'paris', 'london')
find_analogies('france', 'paris', 'rome')
find_analogies('paris', 'france', 'italy')
find_analogies('france', 'french', 'english')
find_analogies('japan', 'japanese', 'chinese')
find_analogies('japan', 'japanese', 'italian')
find_analogies('japan', 'japanese', 'australian')
find_analogies('december', 'november', 'june')
find_analogies('miami', 'florida', 'texas')
find_analogies('einstein', 'scientist', 'painter')
find_analogies('china', 'rice', 'bread')
find_analogies('man', 'woman', 'she')
find_analogies('man', 'woman', 'aunt')
find_analogies('man', 'woman', 'sister')
find_analogies('man', 'woman', 'wife')
find_analogies('man', 'woman', 'actress')
find_analogies('man', 'woman', 'mother')
find_analogies('heir', 'heiress', 'princess')
find_analogies('nephew', 'niece', 'aunt')
find_analogies('france', 'paris', 'tokyo')
find_analogies('france', 'paris', 'beijing')
find_analogies('february', 'january', 'november')
find_analogies('france', 'paris', 'rome')
find_analogies('paris', 'france', 'italy')

nearest_neighbors('king')
nearest_neighbors('france')
nearest_neighbors('japan')
nearest_neighbors('einstein')
nearest_neighbors('woman')
nearest_neighbors('nephew')
nearest_neighbors('february')
nearest_neighbors('rome')
"execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [] 220 | } 221 | ], 222 | "metadata": { 223 | "kernelspec": { 224 | "display_name": "Python 3", 225 | "language": "python", 226 | "name": "python3" 227 | }, 228 | "language_info": { 229 | "codemirror_mode": { 230 | "name": "ipython", 231 | "version": 3 232 | }, 233 | "file_extension": ".py", 234 | "mimetype": "text/x-python", 235 | "name": "python", 236 | "nbconvert_exporter": "python", 237 | "pygments_lexer": "ipython3", 238 | "version": "3.6.7" 239 | } 240 | }, 241 | "nbformat": 4, 242 | "nbformat_minor": 2 243 | } 244 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Natural-Language-Processing-with-Deep-Learning-in-Python- 2 | The repository for the course in Udemy 3 | -------------------------------------------------------------------------------- /data/readme: -------------------------------------------------------------------------------- 1 | 2 | --------------------------------------------------------------------------------