├── Distributional Semantics
│   ├── .DS_Store
│   ├── codes
│   │   ├── .DS_Store
│   │   ├── text-preprocessing.ipynb
│   │   ├── utils.py
│   │   ├── w2v-text-classification.ipynb
│   │   ├── w2v.ipynb
│   │   └── wiki-countries.w2v
│   ├── data
│   │   └── init.xml
│   └── images
│       ├── Emb1.png
│       ├── Emb2.png
│       ├── Emb3.png
│       ├── Emb4.png
│       ├── init
│       ├── king-queen.png
│       ├── man-king.png
│       ├── man-woman.png
│       └── woman-queen.png
├── Knowledge Graphs
│   ├── .DS_Store
│   └── codes
│       ├── .DS_Store
│       ├── intro-to-wordnet.ipynb
│       ├── lesk.ipynb
│       ├── wordnet-graph.ipynb
│       └── wordnet.png
├── README.md
└── Topic Modelling
    ├── .DS_Store
    ├── code files
    │   └── nmf-imdb-movie-reviews.ipynb
    ├── data
    │   └── init.xml
    └── images
        ├── .DS_Store
        ├── image1.png
        ├── image2.png
        ├── image3.png
        ├── image4.png
        ├── image5.png
        └── image6.png

--------------------------------------------------------------------------------
/Distributional Semantics/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ContentUpgrad/semantic_processing/c20bd62ff9120544d808c60718dcc7653b2853e3/Distributional Semantics/.DS_Store

--------------------------------------------------------------------------------
/Distributional Semantics/codes/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ContentUpgrad/semantic_processing/c20bd62ff9120544d808c60718dcc7653b2853e3/Distributional Semantics/codes/.DS_Store

--------------------------------------------------------------------------------
/Distributional Semantics/codes/utils.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding: utf-8
 3 | 
 4 | # In[1]:
 5 | 
 6 | 
 7 | from sklearn.decomposition import PCA
 8 | from sklearn.manifold import TSNE
 9 | import numpy as np
10 | import matplotlib.pyplot as plt
11 | from gensim.models.callbacks import CallbackAny2Vec
12 | from gensim.models import Word2Vec, KeyedVectors
13 | from tensorflow.keras.layers import Embedding
14 | 
15 | 
16 | class MetricCallback(CallbackAny2Vec):
17 |     """
18 |     Callback to print loss after each epoch
19 |     """
20 |     def __init__(self, every=10):
21 |         self.myloss = []
22 |         self.epoch = 0
23 |         self.every = every
24 | 
25 |     def on_epoch_end(self, model):
26 |         loss = model.get_latest_training_loss()
27 |         if self.epoch == 0:
28 |             self.myloss.append(loss)
29 |         else:
30 |             self.myloss.append(loss - self.loss_previous_step)
31 |         if self.epoch % self.every == 0:
32 |             print(f'Loss after epoch {self.epoch}: {self.myloss[-1]}')  # NOQA: T001
33 |         self.epoch += 1
34 |         self.loss_previous_step = loss
35 | 
36 | 
37 | def plot_arrows(starts, ends, wv, estimator=PCA, **kwargs):
38 |     if len(starts) != len(ends):
39 |         raise ValueError('starts and ends must be the same length.')
40 |     fig, ax = plt.subplots(figsize=kwargs.pop('figsize', (8, 8)))
41 |     X = wv[starts + ends]  # NOQA: N806
42 |     x_red = estimator(n_components=2).fit_transform(X)
43 |     plt.scatter(*x_red.T)
44 |     for i, word in enumerate(starts + ends):
45 |         plt.annotate(word, x_red[i])
46 |     xstart = x_red[:len(starts)]
47 |     xend = x_red[len(starts):]
48 |     for i, (start, end) in enumerate(zip(starts, ends)):
49 |         x1, y1 = xstart[i]
50 |         x2, y2 = xend[i]
51 |         plt.arrow(x1, y1, x2 - x1, y2 - y1)
52 | 
53 | 
54 | def plot_vectors(words, model, estimator=TSNE, **kwargs):
55 |     names = []
56 |     vectors = []
57 |     for word in words:
58 |         if word in model.wv:
59 |             names.append(word)
60 |             vectors.append(model.wv[word])
61 | 
62 |     X = np.r_[vectors]  # NOQA: N806
63 |     x_red = estimator(n_components=2).fit_transform(X)
64 |     fig, ax = plt.subplots(figsize=kwargs.pop('figsize', (16, 16)))  # NOQA: E912
65 |     ax.scatter(*x_red.T)
66 | 
67 |     for i, word in enumerate(names):
68 |         plt.annotate(word, x_red[i])
69 | 
70 | 
71 | def make_embedding_layer(model, tokenizer, MAX_SEQUENCE_LENGTH):  # NOQA: N803
72 |     """Build a frozen Keras Embedding layer from pretrained gensim vectors."""
73 |     word_index = tokenizer.word_index
74 |     if isinstance(model, Word2Vec):
75 |         wv = model.wv
76 |     elif isinstance(model, KeyedVectors):
77 |         wv = model
78 |     else:
79 |         raise TypeError('model must be a Word2Vec or KeyedVectors instance.')
80 |     # Row i holds the pretrained vector for token index i; words missing from
81 |     # the vocabulary (KeyError below) keep an all-zero row.
82 |     embedding_matrix = np.zeros((len(word_index) + 1, wv.vector_size))
83 |     for word, i in word_index.items():
84 |         try:
85 |             vector = wv.get_vector(word, False)
86 |             embedding_matrix[i] = vector
87 |         except KeyError:
88 |             continue
89 |     el = Embedding(
90 |         len(word_index) + 1, wv.vector_size, weights=[embedding_matrix],
91 |         input_length=MAX_SEQUENCE_LENGTH, trainable=False
92 |     )
93 |     return el
94 | 

--------------------------------------------------------------------------------
/Distributional Semantics/codes/w2v-text-classification.ipynb:
--------------------------------------------------------------------------------
1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "fitting-soccer", 6 | "metadata": {}, 7 | "source": [ 8 | "## The Problem: Large Movie Review Dataset\n", 9 | "### Classify movie reviews from IMDB into positive or negative sentiment.\n", 10 | "### Download the dataset [here](https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz)" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "id": "coordinated-amendment", 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "name": "stderr", 21 | "output_type": "stream", 22 | "text": [ 23 | "/Users/tejaswiniallikanti/miniconda3/lib/python3.8/site-packages/gensim/similarities/__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. 
`pip install python-Levenshtein`) to suppress this warning.\n", 24 | " warnings.warn(msg)\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "# imports\n", 30 | "\n", 31 | "from gensim.models import KeyedVectors\n", 32 | "import numpy as np\n", 33 | "import pandas as pd\n", 34 | "import matplotlib.pyplot as plt\n", 35 | "from sklearn.model_selection import train_test_split\n", 36 | "from tensorflow.keras.preprocessing import text_dataset_from_directory\n", 37 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", 38 | "from tensorflow.keras.preprocessing.text import Tokenizer\n", 39 | "from tensorflow.keras.layers import Embedding, Dense, Input, GlobalAveragePooling1D\n", 40 | "from tensorflow.keras.models import Sequential\n", 41 | "from tensorflow.keras.optimizers import Adam\n", 42 | "\n", 43 | "import utils" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "id": "interior-washer", 49 | "metadata": {}, 50 | "source": [ 51 | "## Exploring the data" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "id": "welsh-barcelona", 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | "Found 75000 files belonging to 3 classes.\n", 65 | "Found 25000 files belonging to 2 classes.\n" 66 | ] 67 | }, 68 | { 69 | "name": "stderr", 70 | "output_type": "stream", 71 | "text": [ 72 | ":11: SettingWithCopyWarning: \n", 73 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 74 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 75 | "\n", 76 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 77 | " xts['text'] = xts['text'].map(lambda x: x.decode())\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "# Importing & preprocessing the dataset\n", 83 | "\n", 84 | "train_ds = text_dataset_from_directory('../Distributional Semantics/data/aclImdb/train')\n", 85 | "test_ds = text_dataset_from_directory('../Distributional Semantics/data/aclImdb/test')\n", 86 | "\n", 87 | "dfTrain = pd.DataFrame(train_ds.unbatch().as_numpy_iterator(), columns=['text', 'label'])\n", 88 | "dfTest = pd.DataFrame(test_ds.unbatch().as_numpy_iterator(), columns=['text', 'label'])\n", 89 | "_, xts = train_test_split(dfTest, stratify=dfTest['label'], test_size=0.25)\n", 90 | "\n", 91 | "dfTrain['text'] = dfTrain['text'].map(lambda x: x.decode())\n", 92 | "xts['text'] = xts['text'].map(lambda x: x.decode())" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 4, 98 | "id": "right-visiting", 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "data": { 103 | "text/html": [ 104 | "
\n", 105 | "\n", 118 | "\n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | "
textlabel
72396For the first half of the film, as it actually attempts to make some sort of a story involving the theft of the Pink Panther, this movie almost works. However, when there are no more outtakes, or ...2
1353In Manhattan, the American middle class Jim Blandings (Cary Grant) lives with his wife Muriel (Myrna Loy) and two teenage daughters in a four bedroom and one bathroom only leased apartment. Jim wo...1
16953Here was a director and a writer who knew that they had a real story that needed a minimum amount of added-on work to make a fine movie. The time passing early on being marked by the fighter-jets ...2
24931I want to state first that I am a Christian (and that I do work in the film and TV industry) so I understand what it is like to work on a feature length film so props to the filmmakers in that reg...0
23975The King of Masks is a beautifully told story that pits the familial gender preference towards males against human preference for love and companionship. Set in 1930s China during a time of floods...1
\n", 154 | "
" 155 | ], 156 | "text/plain": [ 157 | " text \\\n", 158 | "72396 For the first half of the film, as it actually attempts to make some sort of a story involving the theft of the Pink Panther, this movie almost works. However, when there are no more outtakes, or ... \n", 159 | "1353 In Manhattan, the American middle class Jim Blandings (Cary Grant) lives with his wife Muriel (Myrna Loy) and two teenage daughters in a four bedroom and one bathroom only leased apartment. Jim wo... \n", 160 | "16953 Here was a director and a writer who knew that they had a real story that needed a minimum amount of added-on work to make a fine movie. The time passing early on being marked by the fighter-jets ... \n", 161 | "24931 I want to state first that I am a Christian (and that I do work in the film and TV industry) so I understand what it is like to work on a feature length film so props to the filmmakers in that reg... \n", 162 | "23975 The King of Masks is a beautifully told story that pits the familial gender preference towards males against human preference for love and companionship. Set in 1930s China during a time of floods... \n", 163 | "\n", 164 | " label \n", 165 | "72396 2 \n", 166 | "1353 1 \n", 167 | "16953 2 \n", 168 | "24931 0 \n", 169 | "23975 1 " 170 | ] 171 | }, 172 | "execution_count": 4, 173 | "metadata": {}, 174 | "output_type": "execute_result" 175 | } 176 | ], 177 | "source": [ 178 | "pd.options.display.max_colwidth = 200\n", 179 | "dfTrain.sample(n=5)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 5, 185 | "id": "japanese-brooklyn", 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "name": "stdout", 190 | "output_type": "stream", 191 | "text": [ 192 | "This is a great example of \"film noir,\" as every scene has some sort of shadow pattern on the wall, the floor, the faces. All shots are done with key light on the faces. The patterns suggest \"jail,\" \"locked up,\" \"flight\" (as in a train track), \"trapped,\" (as in a cobweb), and others. There isn't one scene that doesn't have a shadow in it! Even the day time sequences. And the actors that had great careers: Stanwyck, Gary Merrill, Claude Akins, even Jesse (the original maytag repairman) White, and, of course, George Sanders, who plays a \"deNazified\" ex-Nazi. Whew! Great stuff.\n" 193 | ] 194 | } 195 | ], 196 | "source": [ 197 | "print(dfTrain.loc[0, 'text'])" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "id": "rational-vault", 203 | "metadata": {}, 204 | "source": [ 205 | "## Tokenize the text" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 6, 211 | "id": "future-salad", 212 | "metadata": {}, 213 | "outputs": [ 214 | { 215 | "name": "stdout", 216 | "output_type": "stream", 217 | "text": [ 218 | "Found 153845 unique tokens.\n" 219 | ] 220 | } 221 | ], 222 | "source": [ 223 | "tokenizer = Tokenizer()\n", 224 | "tokenizer.fit_on_texts(dfTrain['text'].tolist())\n", 225 | "train_sequences = tokenizer.texts_to_sequences(dfTrain['text'].tolist())\n", 226 | "test_sequences = tokenizer.texts_to_sequences(xts['text'].tolist())\n", 227 | "\n", 228 | "\n", 229 | "word_index = tokenizer.word_index\n", 230 | "print('Found %s unique tokens.' 
% len(word_index))" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 7, 236 | "id": "headed-saver", 237 | "metadata": {}, 238 | "outputs": [ 239 | { 240 | "name": "stdout", 241 | "output_type": "stream", 242 | "text": [ 243 | "[11, 6, 3, 80, 501, 4, 19, 1460, 14, 172, 129, 44, 46, 435, 4, 2831, 6273, 20, 1, 1600, 1, 1808, 1, 1465, 29, 684, 23, 224, 15, 1296, 627, 20, 1, 1465, 1, 10593, 1478, 2618, 3008, 53, 2762, 14, 8, 3, 1233, 1441, 2387, 14, 8, 3, 29382, 2, 395, 47, 215, 27, 129, 12, 148, 25, 3, 2831, 8, 9, 56, 1, 250, 55, 814, 2, 1, 156, 12, 67, 80, 3704, 3755, 1964, 14386, 4144, 22703, 56, 3838, 1, 207, 58547, 29383, 461, 2, 4, 267, 745, 7188, 34, 284, 3, 87409, 1180, 2292, 15749, 80, 529]\n" 244 | ] 245 | } 246 | ], 247 | "source": [ 248 | "print(train_sequences[0])" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 8, 254 | "id": "oriental-copper", 255 | "metadata": {}, 256 | "outputs": [ 257 | { 258 | "name": "stdout", 259 | "output_type": "stream", 260 | "text": [ 261 | "['this', 'is', 'a', 'great', 'example', 'of', 'film', 'noir', 'as', 'every', 'scene', 'has', 'some', 'sort', 'of', 'shadow', 'pattern', 'on', 'the', 'wall', 'the', 'floor', 'the', 'faces', 'all', 'shots', 'are', 'done', 'with', 'key', 'light', 'on', 'the', 'faces', 'the', 'patterns', 'suggest', 'jail', 'locked', 'up', 'flight', 'as', 'in', 'a', 'train', 'track', 'trapped', 'as', 'in', 'a', 'cobweb', 'and', 'others', 'there', \"isn't\", 'one', 'scene', 'that', \"doesn't\", 'have', 'a', 'shadow', 'in', 'it', 'even', 'the', 'day', 'time', 'sequences', 'and', 'the', 'actors', 'that', 'had', 'great', 'careers', 'stanwyck', 'gary', 'merrill', 'claude', 'akins', 'even', 'jesse', 'the', 'original', 'maytag', 'repairman', 'white', 'and', 'of', 'course', 'george', 'sanders', 'who', 'plays', 'a', 'denazified', 'ex', 'nazi', 'whew', 'great', 'stuff']\n" 262 | ] 263 | } 264 | ], 265 | "source": [ 266 | "print([tokenizer.index_word[k] for k in train_sequences[0]])" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 27, 272 | "id": "subjective-mailman", 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "MAX_SEQUENCE_LENGTH = max([max(map(len, train_sequences)), max(map(len, test_sequences))])" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 28, 282 | "id": "promising-rochester", 283 | "metadata": {}, 284 | "outputs": [ 285 | { 286 | "data": { 287 | "text/plain": [ 288 | "2493" 289 | ] 290 | }, 291 | "execution_count": 28, 292 | "metadata": {}, 293 | "output_type": "execute_result" 294 | } 295 | ], 296 | "source": [ 297 | "MAX_SEQUENCE_LENGTH" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 29, 303 | "id": "surgical-specific", 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)\n", 308 | "test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 30, 314 | "id": "sexual-convenience", 315 | "metadata": {}, 316 | "outputs": [ 317 | { 318 | "name": "stdout", 319 | "output_type": "stream", 320 | "text": [ 321 | "['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 
(…long run of '' padding tokens elided; pad_sequences left-pads every review to MAX_SEQUENCE_LENGTH = 2493…) 
'', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'ina', 'garden', 'rocks', 'her', 'cooking', 'is', 'simple', 'tasty', 'and', 'fun', 'to', 'make', 'i', 'love', 'making', 'her', 'deserts', \"it's\", 'really', 'easy', 'she', 'talks', 'the', 'talks', 'and', 'walks', 'the', 'walks', 'i', 'mean', 'some', 'other', 'cooking', 'shows', 'the', 'ingredients', 'are', 'really', 'hard', 'to', 'find', 'but', 'ina', 'recipes', 'are', 'easy', 'and', 'great', 'i', 'mean', 'her', 'famous', 'roast', 'chicken', 'are', 'really', 'tasty', 'i', 'bought', 'her', 'book', 'about', '2', 'weeks', 'ago', 'and', 'the', 'recipes', 'are', 'easy', 'in', 'her', 'show', 'i', 'sometimes', 'like', 'to', 'cook', 'along', 'with', 'her', 'and', 'i', 'love', 'cooking', \"ina's\", 'show', 'makes', 'a', 'big', 'difference', 'for', 'new', 'people', 'who', 'are', 'trying', 'to', 'cook', 'watch', 'this', 'show', 'and', 'love', 'it', 'it', 'has', 'great', 'ideas', 'of', 'cooking', 'and', 'you', 'might', 'learn', 'something', 'new', 'like', 'i', 'learned', 'a', 'new', 'way', 'of', 'how', 'to', 'set', 'the', 'table', 'laugh', 'out', 'loud', 'but', 'the', 'point', 'is', 'she', 'is', 'a', 'great', 'host', 'a', 'chef', 'and', 'her', 'cooking', 'is', 'of', 'the', 'hook']\n" 322 | ] 323 | } 324 | ], 325 | "source": [ 326 | "print([tokenizer.index_word.get(k, '') for k in train_data[0]])" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "id": "agricultural-radius", 332 | "metadata": {}, 333 | "source": [ 334 | "# Train a classifier with Word Embeddings" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 31, 340 | "id": 
"human-laser", 341 | "metadata": {}, 342 | "outputs": [], 343 | "source": [ 344 | "countries_wiki = KeyedVectors.load('wiki-countries.w2v')" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 32, 350 | "id": "honey-occasions", 351 | "metadata": {}, 352 | "outputs": [ 353 | { 354 | "name": "stdout", 355 | "output_type": "stream", 356 | "text": [ 357 | "WARNING:tensorflow:Please add `keras.layers.InputLayer` instead of `keras.Input` to Sequential model. `keras.Input` is intended to be used by Functional model.\n" 358 | ] 359 | } 360 | ], 361 | "source": [ 362 | "embedding_layer = utils.make_embedding_layer(countries_wiki, tokenizer, MAX_SEQUENCE_LENGTH)\n", 363 | "countries_wiki_model = Sequential([\n", 364 | " Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32'),\n", 365 | " embedding_layer,\n", 366 | " GlobalAveragePooling1D(),\n", 367 | " Dense(128, activation='relu'),\n", 368 | " Dense(64, activation='relu'),\n", 369 | " Dense(1, activation='sigmoid')\n", 370 | "])\n", 371 | "countries_wiki_model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": 33, 377 | "id": "married-slovakia", 378 | "metadata": {}, 379 | "outputs": [ 380 | { 381 | "name": "stdout", 382 | "output_type": "stream", 383 | "text": [ 384 | "Epoch 1/30\n", 385 | "1172/1172 [==============================] - 24s 21ms/step - loss: -16641.9727 - accuracy: 0.1667 - val_loss: 65643.5391 - val_accuracy: 0.5000\n", 386 | "Epoch 2/30\n", 387 | "1172/1172 [==============================] - 27s 23ms/step - loss: -225707.7031 - accuracy: 0.1667 - val_loss: 447620.6250 - val_accuracy: 0.5000\n", 388 | "Epoch 3/30\n", 389 | "1172/1172 [==============================] - 37s 32ms/step - loss: -828230.2500 - accuracy: 0.1667 - val_loss: 1270633.3750 - val_accuracy: 0.5000\n", 390 | "Epoch 4/30\n", 391 | "1172/1172 [==============================] - 50s 43ms/step - loss: -1917466.7500 - accuracy: 0.1667 - val_loss: 2608856.2500 - val_accuracy: 0.5000\n", 392 | "Epoch 5/30\n", 393 | "1172/1172 [==============================] - 43s 37ms/step - loss: -3549600.5000 - accuracy: 0.1667 - val_loss: 4510364.0000 - val_accuracy: 0.5000\n", 394 | "Epoch 6/30\n", 395 | "1172/1172 [==============================] - 30s 26ms/step - loss: -5783278.5000 - accuracy: 0.1667 - val_loss: 7038023.0000 - val_accuracy: 0.5000\n", 396 | "Epoch 7/30\n", 397 | "1172/1172 [==============================] - 30s 25ms/step - loss: -8691590.0000 - accuracy: 0.1667 - val_loss: 10272849.0000 - val_accuracy: 0.5000\n", 398 | "Epoch 8/30\n", 399 | "1172/1172 [==============================] - 31s 27ms/step - loss: -12336643.0000 - accuracy: 0.1667 - val_loss: 14257598.0000 - val_accuracy: 0.5000\n", 400 | "Epoch 9/30\n", 401 | "1172/1172 [==============================] - 29s 25ms/step - loss: -16778410.0000 - accuracy: 0.1667 - val_loss: 19058352.0000 - val_accuracy: 0.5000\n", 402 | "Epoch 10/30\n", 403 | "1172/1172 [==============================] - 33s 28ms/step - loss: -22084874.0000 - accuracy: 0.1667 - val_loss: 24761140.0000 - val_accuracy: 0.5000\n", 404 | "Epoch 11/30\n", 405 | "1172/1172 [==============================] - 35s 30ms/step - loss: -28339982.0000 - accuracy: 0.1667 - val_loss: 31435292.0000 - val_accuracy: 0.5000\n", 406 | "Epoch 12/30\n", 407 | "1172/1172 [==============================] - 36s 30ms/step - loss: -35588056.0000 - accuracy: 0.1667 - val_loss: 39109404.0000 - val_accuracy: 0.5000\n", 408 | "Epoch 13/30\n", 409 
| "1172/1172 [==============================] - 31s 27ms/step - loss: -43863900.0000 - accuracy: 0.1667 - val_loss: 47811596.0000 - val_accuracy: 0.5000\n", 410 | "Epoch 14/30\n", 411 | "1172/1172 [==============================] - 34s 29ms/step - loss: -53257640.0000 - accuracy: 0.1667 - val_loss: 57697140.0000 - val_accuracy: 0.5000\n", 412 | "Epoch 15/30\n", 413 | "1172/1172 [==============================] - 33s 28ms/step - loss: -63865820.0000 - accuracy: 0.1667 - val_loss: 68784184.0000 - val_accuracy: 0.5000\n", 414 | "Epoch 16/30\n", 415 | "1172/1172 [==============================] - 33s 28ms/step - loss: -75711872.0000 - accuracy: 0.1667 - val_loss: 81125408.0000 - val_accuracy: 0.5000\n", 416 | "Epoch 17/30\n", 417 | "1172/1172 [==============================] - 33s 28ms/step - loss: -88845176.0000 - accuracy: 0.1667 - val_loss: 94777192.0000 - val_accuracy: 0.5000\n", 418 | "Epoch 18/30\n", 419 | "1172/1172 [==============================] - 33s 28ms/step - loss: -103363848.0000 - accuracy: 0.1667 - val_loss: 109816328.0000 - val_accuracy: 0.5000\n", 420 | "Epoch 19/30\n", 421 | "1172/1172 [==============================] - 32s 27ms/step - loss: -119283744.0000 - accuracy: 0.1667 - val_loss: 126276072.0000 - val_accuracy: 0.5000\n", 422 | "Epoch 20/30\n", 423 | "1172/1172 [==============================] - 32s 27ms/step - loss: -136716848.0000 - accuracy: 0.1667 - val_loss: 144300400.0000 - val_accuracy: 0.5000\n", 424 | "Epoch 21/30\n", 425 | "1172/1172 [==============================] - 32s 27ms/step - loss: -155722960.0000 - accuracy: 0.1667 - val_loss: 163854832.0000 - val_accuracy: 0.5000\n", 426 | "Epoch 22/30\n", 427 | "1172/1172 [==============================] - 32s 27ms/step - loss: -176284048.0000 - accuracy: 0.1667 - val_loss: 184962816.0000 - val_accuracy: 0.5000\n", 428 | "Epoch 23/30\n", 429 | "1172/1172 [==============================] - 31s 27ms/step - loss: -198497152.0000 - accuracy: 0.1667 - val_loss: 207794800.0000 - val_accuracy: 0.5000\n", 430 | "Epoch 24/30\n", 431 | "1172/1172 [==============================] - 32s 27ms/step - loss: -222518048.0000 - accuracy: 0.1667 - val_loss: 232431136.0000 - val_accuracy: 0.5000\n", 432 | "Epoch 25/30\n", 433 | "1172/1172 [==============================] - 33s 28ms/step - loss: -248399744.0000 - accuracy: 0.1667 - val_loss: 258972480.0000 - val_accuracy: 0.5000\n", 434 | "Epoch 26/30\n", 435 | "1172/1172 [==============================] - 35s 30ms/step - loss: -276105984.0000 - accuracy: 0.1667 - val_loss: 287263168.0000 - val_accuracy: 0.5000\n", 436 | "Epoch 27/30\n", 437 | "1172/1172 [==============================] - 41s 35ms/step - loss: -305662048.0000 - accuracy: 0.1667 - val_loss: 317435360.0000 - val_accuracy: 0.5000\n", 438 | "Epoch 28/30\n", 439 | "1172/1172 [==============================] - 35s 30ms/step - loss: -337288928.0000 - accuracy: 0.1667 - val_loss: 349765856.0000 - val_accuracy: 0.5000\n", 440 | "Epoch 29/30\n", 441 | "1172/1172 [==============================] - 33s 28ms/step - loss: -370958208.0000 - accuracy: 0.1667 - val_loss: 384041920.0000 - val_accuracy: 0.5000\n", 442 | "Epoch 30/30\n", 443 | "1172/1172 [==============================] - 44s 37ms/step - loss: -406680032.0000 - accuracy: 0.1667 - val_loss: 420405952.0000 - val_accuracy: 0.5000\n" 444 | ] 445 | } 446 | ], 447 | "source": [ 448 | "countries_wiki_history = countries_wiki_model.fit(\n", 449 | " train_data, dfTrain['label'].values,\n", 450 | " validation_data=(test_data, xts['label'].values),\n", 451 | " batch_size=64, 
epochs=30\n", 452 | ")" 453 | ] 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "id": "mighty-jersey", 458 | "metadata": {}, 459 | "source": [ 460 | "# Train with a different set of word embeddings\n", 461 | "\n", 462 | "## GloVe: Global Vectors for Word Representation\n", 463 | "### Download [here](http://nlp.stanford.edu/data/glove.6B.zip)" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": 15, 469 | "id": "voluntary-enemy", 470 | "metadata": {}, 471 | "outputs": [], 472 | "source": [ 473 | "glove_wiki = KeyedVectors.load_word2vec_format('data/glove.6B/glove.6B.300d.txt', binary=False, no_header=True)" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": 16, 479 | "id": "beginning-concert", 480 | "metadata": {}, 481 | "outputs": [ 482 | { 483 | "name": "stdout", 484 | "output_type": "stream", 485 | "text": [ 486 | "WARNING:tensorflow:Please add `keras.layers.InputLayer` instead of `keras.Input` to Sequential model. `keras.Input` is intended to be used by Functional model.\n" 487 | ] 488 | } 489 | ], 490 | "source": [ 491 | "embedding_layer = utils.make_embedding_layer(glove_wiki, tokenizer, MAX_SEQUENCE_LENGTH)\n", 492 | "\n", 493 | "glove_model = Sequential([\n", 494 | " Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32'),\n", 495 | " embedding_layer,\n", 496 | " GlobalAveragePooling1D(),\n", 497 | " Dense(128, activation='relu'),\n", 498 | " Dense(64, activation='relu'),\n", 499 | " Dense(1, activation='sigmoid')\n", 500 | "])\n", 501 | "glove_model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": 17, 507 | "id": "identical-breath", 508 | "metadata": {}, 509 | "outputs": [ 510 | { 511 | "name": "stdout", 512 | "output_type": "stream", 513 | "text": [ 514 | "Epoch 1/30\n", 515 | "2344/2344 [==============================] - 81s 33ms/step - loss: -50649.3477 - accuracy: 0.1667 - val_loss: 184496.8906 - val_accuracy: 0.5000\n", 516 | "Epoch 2/30\n", 517 | "2344/2344 [==============================] - 85s 36ms/step - loss: -566044.8125 - accuracy: 0.1667 - val_loss: 1068684.7500 - val_accuracy: 0.5000\n", 518 | "Epoch 3/30\n", 519 | "2344/2344 [==============================] - 76s 32ms/step - loss: -1918321.1250 - accuracy: 0.1667 - val_loss: 2877956.5000 - val_accuracy: 0.5000\n", 520 | "Epoch 4/30\n", 521 | "2344/2344 [==============================] - 75s 32ms/step - loss: -4323605.0000 - accuracy: 0.1667 - val_loss: 5836816.5000 - val_accuracy: 0.5000\n", 522 | "Epoch 5/30\n", 523 | "2344/2344 [==============================] - 80s 34ms/step - loss: -8016980.5000 - accuracy: 0.1667 - val_loss: 10163823.0000 - val_accuracy: 0.5000\n", 524 | "Epoch 6/30\n", 525 | "2344/2344 [==============================] - 83s 35ms/step - loss: -13169944.0000 - accuracy: 0.1667 - val_loss: 16015656.0000 - val_accuracy: 0.5000\n", 526 | "Epoch 7/30\n", 527 | "2344/2344 [==============================] - 68s 29ms/step - loss: -20012426.0000 - accuracy: 0.1667 - val_loss: 23654598.0000 - val_accuracy: 0.5000\n", 528 | "Epoch 8/30\n", 529 | "2344/2344 [==============================] - 69s 29ms/step - loss: -28770734.0000 - accuracy: 0.1667 - val_loss: 33268432.0000 - val_accuracy: 0.5000\n", 530 | "Epoch 9/30\n", 531 | "2344/2344 [==============================] - 68s 29ms/step - loss: -39600532.0000 - accuracy: 0.1667 - val_loss: 44991732.0000 - val_accuracy: 0.5000\n", 532 | "Epoch 10/30\n", 533 | "2344/2344 [==============================] - 
69s 29ms/step - loss: -52736648.0000 - accuracy: 0.1667 - val_loss: 59130552.0000 - val_accuracy: 0.5000\n", 534 | "Epoch 11/30\n", 535 | "2344/2344 [==============================] - 78s 33ms/step - loss: -68412664.0000 - accuracy: 0.1667 - val_loss: 75856912.0000 - val_accuracy: 0.5000\n", 536 | "Epoch 12/30\n", 537 | "2344/2344 [==============================] - 76s 33ms/step - loss: -86735552.0000 - accuracy: 0.1667 - val_loss: 95242024.0000 - val_accuracy: 0.5000\n", 538 | "Epoch 13/30\n", 539 | "2344/2344 [==============================] - 73s 31ms/step - loss: -108058864.0000 - accuracy: 0.1667 - val_loss: 117779864.0000 - val_accuracy: 0.5000\n", 540 | "Epoch 14/30\n", 541 | "2344/2344 [==============================] - 72s 31ms/step - loss: -132506200.0000 - accuracy: 0.1667 - val_loss: 143370016.0000 - val_accuracy: 0.5000\n", 542 | "Epoch 15/30\n", 543 | "2344/2344 [==============================] - 76s 33ms/step - loss: -160165152.0000 - accuracy: 0.1667 - val_loss: 172248944.0000 - val_accuracy: 0.5000\n", 544 | "Epoch 16/30\n", 545 | "2344/2344 [==============================] - 75s 32ms/step - loss: -191427376.0000 - accuracy: 0.1667 - val_loss: 204798768.0000 - val_accuracy: 0.5000\n", 546 | "Epoch 17/30\n", 547 | "2344/2344 [==============================] - 74s 32ms/step - loss: -226359504.0000 - accuracy: 0.1667 - val_loss: 241070832.0000 - val_accuracy: 0.5000\n", 548 | "Epoch 18/30\n", 549 | "2344/2344 [==============================] - 68s 29ms/step - loss: -265203296.0000 - accuracy: 0.1667 - val_loss: 281210496.0000 - val_accuracy: 0.5000\n", 550 | "Epoch 19/30\n", 551 | "2344/2344 [==============================] - 67s 29ms/step - loss: -308088864.0000 - accuracy: 0.1667 - val_loss: 325452800.0000 - val_accuracy: 0.5000\n", 552 | "Epoch 20/30\n", 553 | "2344/2344 [==============================] - 67s 29ms/step - loss: -355347392.0000 - accuracy: 0.1667 - val_loss: 374199872.0000 - val_accuracy: 0.5000\n", 554 | "Epoch 21/30\n", 555 | "2344/2344 [==============================] - 67s 29ms/step - loss: -407002880.0000 - accuracy: 0.1667 - val_loss: 427178944.0000 - val_accuracy: 0.5000\n", 556 | "Epoch 22/30\n", 557 | "2344/2344 [==============================] - 67s 29ms/step - loss: -463545408.0000 - accuracy: 0.1667 - val_loss: 485264896.0000 - val_accuracy: 0.5000\n", 558 | "Epoch 23/30\n", 559 | "2344/2344 [==============================] - 67s 29ms/step - loss: -524920672.0000 - accuracy: 0.1667 - val_loss: 548066752.0000 - val_accuracy: 0.5000\n", 560 | "Epoch 24/30\n", 561 | "2344/2344 [==============================] - 69s 30ms/step - loss: -591354880.0000 - accuracy: 0.1667 - val_loss: 615823040.0000 - val_accuracy: 0.5000\n", 562 | "Epoch 25/30\n", 563 | "2344/2344 [==============================] - 63s 27ms/step - loss: -663010240.0000 - accuracy: 0.1667 - val_loss: 689036928.0000 - val_accuracy: 0.5000\n", 564 | "Epoch 26/30\n", 565 | "2344/2344 [==============================] - 60s 25ms/step - loss: -740288000.0000 - accuracy: 0.1667 - val_loss: 767898048.0000 - val_accuracy: 0.5000\n", 566 | "Epoch 27/30\n", 567 | "2344/2344 [==============================] - 60s 26ms/step - loss: -823263232.0000 - accuracy: 0.1667 - val_loss: 852279872.0000 - val_accuracy: 0.5000\n", 568 | "Epoch 28/30\n", 569 | "2344/2344 [==============================] - 60s 26ms/step - loss: -912188800.0000 - accuracy: 0.1667 - val_loss: 942704448.0000 - val_accuracy: 0.5000\n", 570 | "Epoch 29/30\n", 571 | "2344/2344 [==============================] - 60s 25ms/step - loss: 
-1007166592.0000 - accuracy: 0.1667 - val_loss: 1039149120.0000 - val_accuracy: 0.5000\n", 572 | "Epoch 30/30\n", 573 | "2344/2344 [==============================] - 60s 25ms/step - loss: -1108177024.0000 - accuracy: 0.1667 - val_loss: 1141446528.0000 - val_accuracy: 0.5000\n" 574 | ] 575 | } 576 | ], 577 | "source": [ 578 | "glove_history = glove_model.fit(\n", 579 | " train_data, dfTrain['label'].values,\n", 580 | " validation_data=(test_data, xts['label'].values),\n", 581 | " batch_size=32, epochs=30\n", 582 | ")" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": 18, 588 | "id": "interior-season", 589 | "metadata": {}, 590 | "outputs": [ 591 | { 592 | "data": { 593 | "text/plain": [ 594 | "" 595 | ] 596 | }, 597 | "execution_count": 18, 598 | "metadata": {}, 599 | "output_type": "execute_result" 600 | }, 601 | { 602 | "data": { 603 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXsAAAD4CAYAAAANbUbJAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAWx0lEQVR4nO3dfZBV9Z3n8fcnDYQZQCIPI2ijMJZGsUGwuxG61VBsVMgDLLKyaDYGa9RQGyquGhV3E5nB2komMYYkRcZCfMo4CySMJm1iZCXRRG1Uuh0QedAAhdKMDw2EEbKIafjuH33papp+uE1fbO79fV5VXdzzO+d3zvdXp/j06d+591xFBGZmVtg+0d0FmJnZieewNzNLgMPezCwBDnszswQ47M3MEtCjuwtoadCgQTF8+PDuLsPMLK/U1tbuiojBba0/6cJ++PDh1NTUdHcZZmZ5RdJb7a33NI6ZWQIc9mZmCXDYm5kl4KSbszezj99f/vIX6urq+PDDD7u7FOtA7969KS4upmfPnp3q57A3M+rq6ujXrx/Dhw9HUneXY22ICHbv3k1dXR0jRozoVF9P45gZH374IQMHDnTQn+QkMXDgwOP6C8xhb2YADvo8cbznyWFvZpYAh72ZnRTeffddZs2axdlnn01paSmf+9znePPNN3N6jOeee47q6uo211dVVfGd73ynS8e45ZZbWLhwYdPylVdeyQ033NC0fNttt3HfffcddazZs2ezYsWKY/Z1ww03sHHjxi7Vc4TD3sy6XUQwffp0Jk6cyNatW6mtreXb3/427733Xk6P017YNzQ0MHXqVObNm9elY1RWVjYd4/Dhw+zatYsNGzY0ra+urqaioiKrYy1ZsoSRI0d2qZ4jHPZm1u2effZZevbsyZw5c5raLrzwQi699FIigttvv52SkhJGjRrF8uXLgcbg/sIXvtC0/dy5c3nkkUeAxseuzJ8/n4suuohRo0axefNmtm/fzv33388PfvADxowZw/PPP8/s2bOZM2cOF198MXfccQePPPIIc+fOBaC+vp4ZM2ZQXl5OeXk5L774IgC///3vGTNmDGPGjGHs2LHs27fvqLFUVFSwevVqADZs2EBJSQn9+vXjT3/6EwcPHmTTpk1cdNFFRx2ruW9961vMnj2bQ4cOMXHixJw9PsZvvTSzo/zDkxvY+O8f5HSfI08/hflfvKDN9a+//jqlpaWtrnv88cdZu3Yt69atY9euXZSXl3PZZZd1eMxBgwbx6quv8pOf/IR7772XJUuWMGfOHPr27cs3vvENAB588EHq6uqorq6mqKio6ZcFwM0338wtt9zCJZdcwttvv82VV17Jpk2buPfee1m0aBGVlZXs37+f3r17H3Xc008/nR49evD2229TXV3NhAkT2LlzJ6tXr6Z///6MGjWKXr16tVrz7bffzr59+3j44YdzfsPcYW9mJ7UXXniBa665hqKiIk477TQ+85nPsGbNGk455ZR2+1111VUAlJaW8vjjj7e53dVXX01RUdEx7atWrTpqvvyDDz5g//79VFZWcuutt/KlL32Jq666iuLi4mP6VlRUUF1dTXV1Nbfeeis7d+6kurqa/v37U1lZ2Wod99xzDxdffDGLFy9ud1zHy2FvZkdp7wr8RLngggtavUHZnh49enD48OGm5ZbvPf/kJz8JQFFREQ0NDW3up0+fPq22Hz58mJdeeumYK/d58+bx+c9/nqeeeorKykpWrlzJeeedd9Q2R+bt169fT0lJCcOGDeP73/8+p5xyCtdff32rxysvL6e2tpY9e/YwYMCAtgd+nDxnb2bdbtKkSRw8ePCoq9rXXnuN559/nksvvZTly5dz6NAh6uvr+cMf/sC4ceM466yz2LhxIwcPHmTv3r389re/7fA4/fr1O2aOvS1XXHEFP/7xj5uW165dC8DWrVsZNWoUd955J+Xl5WzevPmYvhUVFfzqV79iwIABFBUVMWDAAPbu3cvq1aupqKho9XiTJ09u+kWSbY2d4bA3s24niSeeeIJVq1Zx9tlnc8EFF3DXXXcxZMgQpk+fzujRo7nwwguZNGkS3/3udxkyZAjDhg1j5syZlJSUMHPmTMaOHdvhcb74xS/yxBNPNN2gbc+PfvQjampqGD16NCNHjuT+++8HYOHChZSUlDB69Gh69uzJlClTjuk7atQodu3axfjx449q69+/P4MGDWrzmFdffTU33ngjU6dO5cCBAx2OpzMUETndYVeVlZWFv7zE7OO1adMmzj///O4uw7LU2vmSVBsRZW318ZW9mVkCHPZmZglw2JuZJcBhb2aWAIe9mVkCHPZmZglw2JvZSeMXv/gFko76oNL27dspKSkBjn342RFjx45t+tBTQ0MDffv25bHHHmtaX1payquvvsrdd9/NqlWrgMaHpe3ateuYfbX1oad857A3s5PG0qVLueSSS1i6dGmn+jV/rPC6des499xzm5b//Oc/s3XrVi688EIWLFjAZz/72Xb31d7z7vNZVmEvabKkNyRtkXTMA5glzZZUL2lt5ueGTPsYSaslbZD0mqT/musBmFlh2L9/Py+88AIPPvggy5Yt61TfIw8eg8awnjNnTtOV/iuvvEJpaSlFRUWtfknIgQMHmDJlCg888AAAffv27fpgTkIdPghNUhGwCLgcqAPWS
…(remaining base64 image data for the val_accuracy comparison plot omitted)…\n", 604 | "text/plain": [ 605 | "
" 606 | ] 607 | }, 608 | "metadata": { 609 | "needs_background": "light" 610 | }, 611 | "output_type": "display_data" 612 | } 613 | ], 614 | "source": [ 615 | "plt.plot(countries_wiki_history.history['val_accuracy'], label='Countries Wiki')\n", 616 | "plt.plot(glove_history.history['val_accuracy'], label='All Wiki')\n", 617 | "plt.legend()" 618 | ] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": null, 623 | "id": "visible-universe", 624 | "metadata": {}, 625 | "outputs": [], 626 | "source": [] 627 | } 628 | ], 629 | "metadata": { 630 | "kernelspec": { 631 | "display_name": "Python 3", 632 | "language": "python", 633 | "name": "python3" 634 | }, 635 | "language_info": { 636 | "codemirror_mode": { 637 | "name": "ipython", 638 | "version": 3 639 | }, 640 | "file_extension": ".py", 641 | "mimetype": "text/x-python", 642 | "name": "python", 643 | "nbconvert_exporter": "python", 644 | "pygments_lexer": "ipython3", 645 | "version": "3.8.5" 646 | } 647 | }, 648 | "nbformat": 4, 649 | "nbformat_minor": 5 650 | } 651 | -------------------------------------------------------------------------------- /Distributional Semantics/codes/wiki-countries.w2v: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/semantic_processing/c20bd62ff9120544d808c60718dcc7653b2853e3/Distributional Semantics/codes/wiki-countries.w2v -------------------------------------------------------------------------------- /Distributional Semantics/data/init.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Init -------------------------------------------------------------------------------- /Distributional Semantics/images/Emb1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/semantic_processing/c20bd62ff9120544d808c60718dcc7653b2853e3/Distributional Semantics/images/Emb1.png -------------------------------------------------------------------------------- /Distributional Semantics/images/Emb2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/semantic_processing/c20bd62ff9120544d808c60718dcc7653b2853e3/Distributional Semantics/images/Emb2.png -------------------------------------------------------------------------------- /Distributional Semantics/images/Emb3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/semantic_processing/c20bd62ff9120544d808c60718dcc7653b2853e3/Distributional Semantics/images/Emb3.png -------------------------------------------------------------------------------- /Distributional Semantics/images/Emb4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/semantic_processing/c20bd62ff9120544d808c60718dcc7653b2853e3/Distributional Semantics/images/Emb4.png -------------------------------------------------------------------------------- /Distributional Semantics/images/init: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Distributional Semantics/images/king-queen.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ContentUpgrad/semantic_processing/c20bd62ff9120544d808c60718dcc7653b2853e3/Distributional Semantics/images/king-queen.png

--------------------------------------------------------------------------------
/Distributional Semantics/images/man-king.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ContentUpgrad/semantic_processing/c20bd62ff9120544d808c60718dcc7653b2853e3/Distributional Semantics/images/man-king.png

--------------------------------------------------------------------------------
/Distributional Semantics/images/man-woman.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ContentUpgrad/semantic_processing/c20bd62ff9120544d808c60718dcc7653b2853e3/Distributional Semantics/images/man-woman.png

--------------------------------------------------------------------------------
/Distributional Semantics/images/woman-queen.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ContentUpgrad/semantic_processing/c20bd62ff9120544d808c60718dcc7653b2853e3/Distributional Semantics/images/woman-queen.png

--------------------------------------------------------------------------------
/Knowledge Graphs/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ContentUpgrad/semantic_processing/c20bd62ff9120544d808c60718dcc7653b2853e3/Knowledge Graphs/.DS_Store

--------------------------------------------------------------------------------
/Knowledge Graphs/codes/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ContentUpgrad/semantic_processing/c20bd62ff9120544d808c60718dcc7653b2853e3/Knowledge Graphs/codes/.DS_Store

--------------------------------------------------------------------------------
/Knowledge Graphs/codes/intro-to-wordnet.ipynb:
--------------------------------------------------------------------------------
1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "raw", 5 | "id": "freelance-naples", 6 | "metadata": {}, 7 | "source": [ 8 | "----------------------------------------------------------------------\n", 9 | "Filename : intro-to-wordnet.ipynb\n", 10 | "Author : Jaidev Deshpande\n", 11 | "Purpose : Understanding Wordnet functionalities\n", 12 | "Libraries: nltk\n", 13 | "----------------------------------------------------------------------" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "id": "anonymous-canadian", 19 | "metadata": {}, 20 | "source": [ 21 | "## [WordNet®](https://wordnet.princeton.edu/) Tutorial\n", 22 | "\n", 23 | "### Navigating WordNet Relationships" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "id": "widespread-traveler", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "!pip install nltk" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "id": "vocational-clearance", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "from nltk import download" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "id": "sensitive-albany", 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "download('wordnet')" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "id": "weird-memory", 60 | "metadata": {}, 61 | "outputs": [], 62 | 
"source": [ 63 | "from nltk.corpus import wordnet" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "id": "developing-failure", 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "# Synsets\n", 74 | "\n", 75 | "tractor = wordnet.synsets('tractor')\n", 76 | "tractor" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "id": "seasonal-hungary", 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "# Definitions of senses\n", 87 | "\n", 88 | "[syn.definition() for syn in tractor]" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "id": "active-norwegian", 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "# Hypernyms: Relation between a concept and its superordinate\n", 99 | "\n", 100 | "tractor = wordnet.synset('tractor.n.01')\n", 101 | "tractor.hypernyms()" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "id": "arctic-customer", 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "self_propelled_vehicle = wordnet.synset('self-propelled_vehicle.n.01')\n", 112 | "self_propelled_vehicle.hypernyms()" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "id": "invalid-lease", 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "# Meronyms: Relation between a part and its whole\n", 123 | "\n", 124 | "wheeled_vehicle = wordnet.synset('wheeled_vehicle.n.01')\n", 125 | "wheeled_vehicle.part_meronyms()" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "id": "brown-weekend", 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "# Hyponyms: Relation between a concept and its subordinate\n", 136 | "\n", 137 | "wheeled_vehicle.hyponyms()" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "id": "theoretical-bargain", 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "# Holonyms: Relation between whole and its parts\n", 148 | "\n", 149 | "axle = wordnet.synset('axle.n.01')\n", 150 | "axle.part_holonyms()" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "id": "identical-shoulder", 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "self_propelled_vehicle.hyponyms()" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "id": "quick-strengthening", 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "motor_vehicle = wordnet.synset('motor_vehicle.n.01')\n", 171 | "motor_vehicle.hyponyms()" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "id": "egyptian-appliance", 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "car = wordnet.synset('car.n.01')\n", 182 | "car.part_meronyms()" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "id": "civic-worst", 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [] 192 | } 193 | ], 194 | "metadata": { 195 | "kernelspec": { 196 | "display_name": "Python 3", 197 | "language": "python", 198 | "name": "python3" 199 | }, 200 | "language_info": { 201 | "codemirror_mode": { 202 | "name": "ipython", 203 | "version": 3 204 | }, 205 | "file_extension": ".py", 206 | "mimetype": "text/x-python", 207 | "name": "python", 208 | "nbconvert_exporter": "python", 209 | "pygments_lexer": "ipython3", 210 | "version": "3.8.5" 211 | } 212 | }, 
213 | "nbformat": 4, 214 | "nbformat_minor": 5 215 | } 216 | -------------------------------------------------------------------------------- /Knowledge Graphs/codes/lesk.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "shaped-norway", 6 | "metadata": {}, 7 | "source": [ 8 | "## Word-Sense Disambiguation" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "genetic-terror", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from nltk.corpus import wordnet as wn\n", 19 | "from nltk import wsd" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "id": "adult-bangladesh", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "X = 'The die is cast.'\n", 30 | "Y = 'Roll the die to get a 6.'\n", 31 | "Z = 'What is dead may never die.'" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "id": "antique-burlington", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "wn.synsets('die')" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "id": "governing-montana", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "wn.synsets('die', pos=wn.NOUN)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "id": "located-bookmark", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "for syn in wn.synsets('die', pos=wn.NOUN):\n", 62 | " print(syn.definition())" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "id": "after-party", 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "for syn in wn.synsets('die', pos=wn.VERB):\n", 73 | " print(syn.definition())" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "id": "bacterial-effect", 79 | "metadata": {}, 80 | "source": [ 81 | "## Word-Sense Disambiguation with Lesk Algorithm" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "id": "threaded-tourism", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "print(X)\n", 92 | "wsd.lesk(X.split(), 'die')" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "id": "fluid-cargo", 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "_.definition()" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "id": "independent-melissa", 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "wsd.lesk(X.split(), 'die', pos=wn.NOUN).definition()" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "id": "progressive-origin", 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "print(Y)\n", 123 | "wsd.lesk(Y.split(), 'die').definition()" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "id": "proof-while", 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "wsd.lesk(Y.split(), 'die', pos=wn.NOUN).definition()" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "id": "middle-object", 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "print(Z)\n", 144 | "wsd.lesk(Z.split(), 'die').definition()" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "id": "organizational-joint", 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": 
[ 154 | "wsd.lesk(Z.split(), 'die', pos=wn.VERB).definition()" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "id": "valuable-harbor", 160 | "metadata": {}, 161 | "source": [ 162 | "## Automatic POS Tagging + Lesk with spaCy" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "id": "defensive-trailer", 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "!pip install spacy" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "id": "cellular-reflection", 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "from spacy.cli import download\n", 183 | "from spacy import load\n", 184 | "# download('en_core_web_sm')\n", 185 | "nlp = load('en_core_web_sm')" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "id": "radical-melbourne", 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "import warnings\n", 196 | "\n", 197 | "POS_MAP = {\n", 198 | " 'VERB': wn.VERB,\n", 199 | " 'NOUN': wn.NOUN,\n", 200 | " 'PROPN': wn.NOUN\n", 201 | "}\n", 202 | "\n", 203 | "\n", 204 | "def lesk(doc, word):\n", 205 | " found = False\n", 206 | " for token in doc:\n", 207 | " if token.text == word:\n", 208 | " word = token\n", 209 | " found = True\n", 210 | " break\n", 211 | " if not found:\n", 212 | " raise ValueError(f'Word \\\"{word}\\\" does not appear in the document: {doc.text}.')\n", 213 | " pos = POS_MAP.get(word.pos_, False)\n", 214 | " if not pos:\n", 215 | " warnings.warn(f'POS tag for {word.text} not found in wordnet. Falling back to default Lesk behaviour.')\n", 216 | " args = [c.text for c in doc], word.text\n", 217 | " kwargs = dict(pos=pos)\n", 218 | " return wsd.lesk(*args, **kwargs)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "id": "monetary-disaster", 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "doc = nlp('Roll the die to get a 6.')" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "id": "surgical-chrome", 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "lesk(doc, 'die')" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "id": "cordless-bankruptcy", 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "lesk(doc, 'die').definition()" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "id": "excess-consultancy", 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "lesk(nlp('I work at google.'), 'google').definition()" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "id": "infectious-binary", 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "lesk(nlp('I will google it.'), 'google').definition()" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "id": "laughing-carolina", 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [] 278 | } 279 | ], 280 | "metadata": { 281 | "kernelspec": { 282 | "display_name": "Python 3", 283 | "language": "python", 284 | "name": "python3" 285 | }, 286 | "language_info": { 287 | "codemirror_mode": { 288 | "name": "ipython", 289 | "version": 3 290 | }, 291 | "file_extension": ".py", 292 | "mimetype": "text/x-python", 293 | "name": "python", 294 | "nbconvert_exporter": "python", 295 | "pygments_lexer": "ipython3", 296 | "version": "3.8.5" 297 | } 298 | }, 299 | "nbformat": 4, 
300 | "nbformat_minor": 5 301 | } 302 | -------------------------------------------------------------------------------- /Knowledge Graphs/codes/wordnet-graph.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "advance-confidence", 6 | "metadata": {}, 7 | "source": [] 8 | }, 9 | { 10 | "cell_type": "markdown", 11 | "id": "prostate-franchise", 12 | "metadata": {}, 13 | "source": [ 14 | "" 15 | ] 16 | } 17 | ], 18 | "metadata": { 19 | "kernelspec": { 20 | "display_name": "Python 3", 21 | "language": "python", 22 | "name": "python3" 23 | }, 24 | "language_info": { 25 | "codemirror_mode": { 26 | "name": "ipython", 27 | "version": 3 28 | }, 29 | "file_extension": ".py", 30 | "mimetype": "text/x-python", 31 | "name": "python", 32 | "nbconvert_exporter": "python", 33 | "pygments_lexer": "ipython3", 34 | "version": "3.8.5" 35 | } 36 | }, 37 | "nbformat": 4, 38 | "nbformat_minor": 5 39 | } 40 | -------------------------------------------------------------------------------- /Knowledge Graphs/codes/wordnet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/semantic_processing/c20bd62ff9120544d808c60718dcc7653b2853e3/Knowledge Graphs/codes/wordnet.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Welcome to Semantic Processing Module 3 | 4 | ## TOC: 5 | - How to download files? 6 | - What is where? 7 | 8 | ### How to download files? 9 | ![](images/image1.png) 10 | Click on Code button and then click on Download ZIP 11 | OR 12 | Use `git clone https://github.com/ContentUpgrad/intro_to_neural_networks.git` command on your terminal if git is installed in your machine. 13 | 14 | 15 | ### What is where? 16 | The folder structure is given below: 17 | 18 | ![](images/image6.png) 19 | 20 | As you can see there are three main folders when you log in: 21 | 22 | 1. **Distributional Semantics** This is where all the code files regarding distributional semantics sessions are kept 23 | 2. **Knowledge Graphs** This is where all the code files regarding Knowledge Graph session are kept 24 | 3. **Topic Modelling**This is where all the code files regarding Topic Modelling session are kept 25 | 26 | When you click on any folder you will find the code and data folders as shown below: 27 | ![](images/image2.png) 28 | You will find all the code files of the session in code folder and data folder will be empty. Please note that you need to follow the instructions given in the segment for downloading data files and keep it in the data folder manually. 29 | 30 | #### Distributional Semantics 31 | You will find the following files in the code folder of Distributional Semantics 32 | ![](images/image4.png) 33 | The data files required can be found [here](https://drive.google.com/drive/u/0/folders/1KUnMvuufvo0yXS23EaI2EMNaq2lt5Ynh) 34 | 35 | #### Knowledge Graphs 36 | You will find the following files in the code folder of Distributional Semantics 37 | ![](images/image3.png) 38 | There are no data files required for this session. 
39 | 40 | #### Topic Modelling 41 | You will find the following files in the code folder of Topic Modelling 42 | ![](images/image5.png) 43 | The data files required can be found [here](https://drive.google.com/drive/u/0/folders/1umS1MgUXyra3KVF-6FsN8krHQ31lXhlX) 44 | 45 | 46 | #### Authors 47 | Tejaswini Allikanti, Jaidev Deshpande and Gunnvant Saini 48 | 49 | -------------------------------------------------------------------------------- /Topic Modelling/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/semantic_processing/c20bd62ff9120544d808c60718dcc7653b2853e3/Topic Modelling/.DS_Store -------------------------------------------------------------------------------- /Topic Modelling/code files/nmf-imdb-movie-reviews.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "interior-basics", 6 | "metadata": {}, 7 | "source": [ 8 | "# Inferring Topics from IMDB Reviews" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "established-malta", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import os\n", 20 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 21 | "from sklearn.decomposition import NMF\n", 22 | "import pandas as pd\n", 23 | "import matplotlib.pyplot as plt" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "id": "political-ability", 29 | "metadata": {}, 30 | "source": [ 31 | "## Exploring the Dataset: [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "id": "physical-speaker", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "ROOT = '../neuralnets/aclImdb/train/pos/'" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "id": "solar-universe", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "reviews = []\n", 52 | "for file in os.listdir(ROOT):\n", 53 | "    path = os.path.join(ROOT, file)\n", 54 | "    if os.path.isfile(path):\n", 55 | "        with open(path, 'r') as fin:\n", 56 | "            reviews.append(fin.read())" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 4, 62 | "id": "permanent-pride", 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "12500" 69 | ] 70 | }, 71 | "execution_count": 4, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "len(reviews)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 5, 83 | "id": "similar-commander", 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "Not wishing to give *anything* away here, I would just say this technically excellent, flawlessly acted and uplifting little flic will reward the viewer with an excellent hour and a half's entertainment: It will amuse, surprise, possibly embarrass occasionally and almost certainly tug at the heartstrings from time to time, as it approaches the inevitable, but not obvious, ending without becoming clichéd or predictable in any way. Most definitely recommended.

A previous User's Comment gives 8 out of 10 for the film and 10 out of 10 for both Branagh and Bonham-Carter's outstanding performances - I agree entirely....\n", 91 | "======================================================================================================================================================\n", 92 | "Wrestlemania 14 is not often looked as one of the great Wrestlemania's but I would personally put it, in my top 5, if not the top 3. It has so many great things, and it truly signified the birth of The Attitude Era, which was WWE's best era, in my opinion. HBK has the heart of a lion, and him putting over Austin like he did, on his way out, was pure class on his part. It has one of the hottest crowds you will ever see, and it has J.R and The King at their announcing best!.

Matches.

15 – team battle royal LOUD pop for L.O.D's return. I'm not a fan of battle royal's, and this is yet another average one. Very predictable, even when you 1st see it, it's obvious L.O.D would win. Looking at Sunny for 8 or so minutes though, definitely helps.

2/5

WWF Light Heavyweight Championship

Taka Michinoku|C| Vs Aguila.

Taka gets a surprising pop, with his entrance. Fast, high-flying, and very exciting. If these two had more time, they would have surely tore the roof off, with their stuff. Taka wins with the Michinoku driver.

3 1/2 /5

WWF European Championship.

Triple H|C| Vs Owen Hart Stipulation here, is Chyna is handcuffed to Slaughter. Nice pop for Owen, mixed reaction for Trips. A really, really underrated match, that ranks among one of my favorites for Wrestlemania, actually. The two mixed together very well, and Owen can go with anybody. Trips wins, with Chyna interference.

4/5

Mixed Tag match. Marc Mero&Sable Vs Goldust&Luna. Defining pop for Sable, unheard of that time, for woman. Sable actually looks hot, and the crowd is just eating her up!. Constant Sable chants, and them erupting almost every time she gets in the ring. Not bad for a Mixed tag match, it had entertaining antics, and passed the time well. Sable's team wins, when Sable hits the TKO.

2 1/2 /5

WWF Intercontinental Championship. Ken Shamrock Vs The Rock|C|. Before I review the match, I'd like to note The Rock showed off his immense potential, with his interview with Jennifer Flowers, before his match. Nice pop for Shamrock, big time heat for The Rock. Too disappointingly short, and I thought the ending was kinda stupid, though Shamrock's snapping antics were awesome to see, and the crowd went nuts for it. Rock keeps the title, when The Ref reverses the decision.

2/5

Dumpster match, for The WWF Tag Team Championship

Catcus Jack&Terry Funk Vs The New Age Outlaws. The Outlaws are not as over, as they were gonna be at this time. Crowd is actually somewhat dead for this, but I thought it had some great Hardcore bits, with some sick looking bumps. Cactus and Terry win the titles in the end.

3/5

The Undertaker vs Kane. Big time ovation, for The Undertaker. Much better than there outing at Wrestlemania 20, and for a big man vs big man match, this was really good. It was a great all out brawl, with The Undertaker taking a sick looking bump, through the table. WWE was smart, by making Kane looking strong, even through defeat. After 2 tombstone kick out's, Taker finally puts him away, with a 3rd one.

3 1/2 /5

WWF Championship.

Special Guest Enforcer \"Mike Tyson\"

HBK|C| Vs Steve Austin. Big heat for Tyson. Crowd goes ape sh*t for Austin, definitely one of the biggest pops I have heard. Mixed reaction, for HBK. This is truly a special match up, one of the greatest wrestlemania main events in history, you can tell when J.R is even out of breath. HBK gives it his all, in what was supposed to be his last match, and Austin has rarely been better. The animosity and electricity from the crowd is amazing, and it's as exciting as it gets. Austin wins with the stunner, with Tyson joining 3:16 by knocking out Michaels. Austin's celebratory victory, is a wonder to behold, with one of the nosiest crowd's you will ever see, King said it right, they were going nuts.

5/5

Bottom line. Wrestlemania 14 is one of the greatest for real. It has everything you want in a Wrestlemania, and truly kick started the Attitude Era. This is very special to me, because it was the 1st Wrestlemania I ever saw, back in 98. \"The Austin Era, has begun!\"

9 1/2 /10\n", 93 | "======================================================================================================================================================\n", 94 | "It could have been a better film. It does drag at points, and the central story shifts from Boyer completing his mission to Boyer avenging Wanda Hendrix's death, but Graham Greene is an author who is really hard to spoil. His stories are all morality tales, due to his own considerations of Catholicism, guilt and innocence (very relative terms in his world view), and the human condition.

Boyer is Luis Denard, a well-known concert pianist, who has sided with the Republicans in the Spanish Civil War. He has been sent to England to try to carry through an arms purchase deal that is desperately needed. Unfortunately for Denard he is literally on his own - everyone of his contacts turns out to be a willing turncoat for the Falagists of Spain. In particular Katina Paxinou (Mrs. Melendez) a grim boarding house keeper, and Peter Lorre (Mr. Contreras) a teacher of an \"esperanto\" type international language. Wanda Hendrix is the drudge of a girl (Else) who works for Mrs. Melendez. The local diplomat, Licata (Victor Francken) is already a willing associate of the Falangists.

The Brits (Holmes Herbert, Miles Mander, and best - if not worst - of the lot, George Coulouris) don't give much hope to Boyer's cause (which he soon grasps may be Britain's before long). Herbert and Mander just retreat behind the official policy of neutrality ordered by the Ramsay MacDonald's and Stanley Baldwin's governments during the Civil War. Coulouris here is a typical Col. Blimp type - always impeccable in his native English diction, he is sharp in showing his dislike for foreigners in general.

The one ray of hope is Lauren Bacall (Rose Cullen), here trying to play her role as well as she can - but she can't really. She's an aristocrat - the daughter of a Press lord. It was Bacall's second film, and (sad to say) almost sank her long career. She does act well, but the spark she showed in her first film was due to the dual effect of starring with Humphrey Bogart and being directed by Howard Hawks. Boyer is a fine actor, but he's not Bogie, and Herman Shumlin is not Hawks. Her next film returned her to Bogie and Hawks again, and her star resumed it's ascendancy.

It's a bleak film (as was the novel). Boyer's mission never succeeds, as he has too many hidden foes all over the place. But the villains are likewise also losers - frequently with their lives.

With Dan Seymour as a suspicious foreign tenant of Katina Paxinou (and the man who destroys her). It is well worth watching to catch the Warner's lot of character actors doing their best given the weakness in direction.\n", 95 | "======================================================================================================================================================\n" 96 | ] 97 | } 98 | ], 99 | "source": [ 100 | "for i in range(3):\n", 101 | " print(reviews[i])\n", 102 | " print('=' * 150)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "id": "blind-relative", 108 | "metadata": {}, 109 | "source": [ 110 | "## Feature Extraction" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 8, 116 | "id": "fuzzy-legislation", 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "data": { 121 | "text/html": [ 122 | "
\n", 123 | "\n", 136 | "\n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 
408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | "
00000000s0038300060070079008000830093638...élanémigréémigrésétaitétatétcêxtaseísøstbyeüber
00.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
10.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
20.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
30.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
40.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
..................................................................
124950.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
124960.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
124970.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
124980.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
124990.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
\n", 430 | "

12500 rows × 55428 columns

\n", 431 | "
" 432 | ], 433 | "text/plain": [ 434 | " 00 000 000s 003830 006 007 0079 0080 0083 0093638 ... élan \\\n", 435 | "0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n", 436 | "1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n", 437 | "2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n", 438 | "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n", 439 | "4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n", 440 | "... ... ... ... ... ... ... ... ... ... ... ... ... \n", 441 | "12495 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n", 442 | "12496 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n", 443 | "12497 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n", 444 | "12498 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n", 445 | "12499 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n", 446 | "\n", 447 | " émigré émigrés était état étc êxtase ís østbye über \n", 448 | "0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", 449 | "1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", 450 | "2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", 451 | "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", 452 | "4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", 453 | "... ... ... ... ... ... ... ... ... ... \n", 454 | "12495 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", 455 | "12496 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", 456 | "12497 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", 457 | "12498 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", 458 | "12499 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", 459 | "\n", 460 | "[12500 rows x 55428 columns]" 461 | ] 462 | }, 463 | "execution_count": 8, 464 | "metadata": {}, 465 | "output_type": "execute_result" 466 | } 467 | ], 468 | "source": [ 469 | "vect = TfidfVectorizer(stop_words='english')\n", 470 | "X = vect.fit_transform(reviews)\n", 471 | "\n", 472 | "pd.DataFrame(X.toarray(), columns=vect.get_feature_names())" 473 | ] 474 | }, 475 | { 476 | "cell_type": "markdown", 477 | "id": "close-advantage", 478 | "metadata": {}, 479 | "source": [ 480 | "## NMF Decomposition" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": 14, 486 | "id": "surprising-lindsay", 487 | "metadata": {}, 488 | "outputs": [ 489 | { 490 | "name": "stderr", 491 | "output_type": "stream", 492 | "text": [ 493 | "/home/jaidevd/anaconda3/lib/python3.7/site-packages/sklearn/decomposition/_nmf.py:315: FutureWarning: The 'init' value, when 'init=None' and n_components is less than n_samples and n_features, will be changed from 'nndsvd' to 'nndsvda' in 1.1 (renaming of 0.26).\n", 494 | " \"'nndsvda' in 1.1 (renaming of 0.26).\"), FutureWarning)\n", 495 | "/home/jaidevd/anaconda3/lib/python3.7/site-packages/sklearn/decomposition/_nmf.py:1091: ConvergenceWarning: Maximum number of iterations 200 reached. Increase it to improve convergence.\n", 496 | " \" improve convergence.\" % max_iter, ConvergenceWarning)\n" 497 | ] 498 | } 499 | ], 500 | "source": [ 501 | "N_TOPICS = 15\n", 502 | "nmf = NMF(n_components=N_TOPICS)\n", 503 | "W = nmf.fit_transform(X) # Document-topic matrix\n", 504 | "H = nmf.components_ # Topic-term matrix" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": 15, 510 | "id": "african-corps", 511 | "metadata": {}, 512 | "outputs": [ 513 | { 514 | "data": { 515 | "text/html": [ 516 | "
\n", 517 | "\n", 530 | "\n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | "
Word 1Word 2Word 3Word 4Word 5Word 6Word 7Word 8Word 9Word 10
Topic 1br10llspoilersendsimplyyesspoilerquitejust
Topic 2moviemovieswatchrecommend10seensawbestactorsdefinitely
Topic 3filmfilmsdirectorcharactersseencinemafestivalworkscenesart
Topic 4seriesepisodeepisodesseasontvcharacterstrekseasonsshowstelevision
Topic 5manrolecharacterperformancebestplaysjohnplayeddoesactor
Topic 6goodprettystorybadactingreallyjoblikednicelittle
Topic 7warworlddocumentarypeopleamericanhistorysoldiersmenwomenhitler
Topic 8funnycomedylaughhilariouseddiefunjokeshumorfunniestmurphy
Topic 9likethinkreallyjustdonpeopleknowsaydidnlot
Topic 10timeyearssawseendvdoldremembervemusicdisney
Topic 11lifestorylovefamilyrealcharacterspeopleyoungbeautifultrue
Topic 12bookjaneversionreadeyrenovelrochesterdaltontarzanemma
Topic 13horrorhousecreepyscarygorefilmshalloweenbudgetfanseffects
Topic 14greatactingreallyactorscastjobbestmusicwonderfuljust
Topic 15actionjackiechanscenesfukungfightmartialbournestory
\n", 744 | "
" 745 | ], 746 | "text/plain": [ 747 | " Word 1 Word 2 Word 3 Word 4 Word 5 Word 6 \\\n", 748 | "Topic 1 br 10 ll spoilers end simply \n", 749 | "Topic 2 movie movies watch recommend 10 seen \n", 750 | "Topic 3 film films director characters seen cinema \n", 751 | "Topic 4 series episode episodes season tv characters \n", 752 | "Topic 5 man role character performance best plays \n", 753 | "Topic 6 good pretty story bad acting really \n", 754 | "Topic 7 war world documentary people american history \n", 755 | "Topic 8 funny comedy laugh hilarious eddie fun \n", 756 | "Topic 9 like think really just don people \n", 757 | "Topic 10 time years saw seen dvd old \n", 758 | "Topic 11 life story love family real characters \n", 759 | "Topic 12 book jane version read eyre novel \n", 760 | "Topic 13 horror house creepy scary gore films \n", 761 | "Topic 14 great acting really actors cast job \n", 762 | "Topic 15 action jackie chan scenes fu kung \n", 763 | "\n", 764 | " Word 7 Word 8 Word 9 Word 10 \n", 765 | "Topic 1 yes spoiler quite just \n", 766 | "Topic 2 saw best actors definitely \n", 767 | "Topic 3 festival work scenes art \n", 768 | "Topic 4 trek seasons shows television \n", 769 | "Topic 5 john played does actor \n", 770 | "Topic 6 job liked nice little \n", 771 | "Topic 7 soldiers men women hitler \n", 772 | "Topic 8 jokes humor funniest murphy \n", 773 | "Topic 9 know say didn lot \n", 774 | "Topic 10 remember ve music disney \n", 775 | "Topic 11 people young beautiful true \n", 776 | "Topic 12 rochester dalton tarzan emma \n", 777 | "Topic 13 halloween budget fans effects \n", 778 | "Topic 14 best music wonderful just \n", 779 | "Topic 15 fight martial bourne story " 780 | ] 781 | }, 782 | "execution_count": 15, 783 | "metadata": {}, 784 | "output_type": "execute_result" 785 | } 786 | ], 787 | "source": [ 788 | "# Top 10 words per topic\n", 789 | "\n", 790 | "words = np.array(vect.get_feature_names())\n", 791 | "topic_words = pd.DataFrame(np.zeros((N_TOPICS, 10)), index=[f'Topic {i + 1}' for i in range(N_TOPICS)],\n", 792 | " columns=[f'Word {i + 1}' for i in range(10)]).astype(str)\n", 793 | "for i in range(N_TOPICS):\n", 794 | " ix = H[i].argsort()[::-1][:10]\n", 795 | " topic_words.iloc[i] = words[ix]\n", 796 | "\n", 797 | "topic_words" 798 | ] 799 | }, 800 | { 801 | "cell_type": "code", 802 | "execution_count": 16, 803 | "id": "thousand-clearance", 804 | "metadata": {}, 805 | "outputs": [], 806 | "source": [ 807 | "# Create a topic mapping\n", 808 | "\n", 809 | "topic_mapping = {\n", 810 | " 'Topic 4': 'TV',\n", 811 | " 'Topic 7': 'War',\n", 812 | " 'Topic 8': 'Comedy',\n", 813 | " 'Topic 12': 'Book Adaptation',\n", 814 | " 'Topic 13': 'Horror',\n", 815 | " 'Topic 15': 'Martial Arts / Action'\n", 816 | "}" 817 | ] 818 | }, 819 | { 820 | "cell_type": "code", 821 | "execution_count": 17, 822 | "id": "intellectual-somerset", 823 | "metadata": {}, 824 | "outputs": [ 825 | { 826 | "data": { 827 | "text/html": [ 828 | "
\n", 829 | "\n", 842 | "\n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | "
Topic 1Topic 2Topic 3Topic 4Topic 5Topic 6Topic 7Topic 8Topic 9Topic 10Topic 11Topic 12Topic 13Topic 14Topic 15max_topic
20.0283140.0000000.0221220.0014800.0230430.0020440.0309390.0000000.0063890.0000000.0007740.0072510.0000000.0035740.000000War
160.0002510.0000000.0015750.0000000.0291320.0022570.0000000.0331080.0162830.0000000.0123370.0000000.0035950.0119440.010159Comedy
180.0295740.0000000.0190100.0017970.0169060.0085740.0001290.0380100.0055580.0062500.0366520.0000000.0000000.0000000.000000Comedy
260.0151790.0003490.0000000.0000000.0159070.0123490.0000000.0343280.0157220.0088090.0043180.0000000.0000000.0019580.000922Comedy
270.0315230.0080990.0001710.0031510.0099750.0014110.0351580.0425880.0000000.0000000.0014250.0026240.0000000.0038650.002781Comedy
290.0000000.0006140.0000000.0000000.0000000.0148620.0000000.0149870.0109410.0000000.0000000.0015340.0662630.0000000.036239Horror
300.0234040.0121070.0168140.0000000.0081350.0096200.0013770.0403820.0008090.0045820.0048030.0011860.0141940.0000000.000000Comedy
310.0123240.0035540.0287530.0000000.0171250.0034830.0068040.0000000.0037020.0000000.0064490.0008330.0341610.0056820.000000Horror
340.0000000.0165030.0000000.0000000.0138250.0000000.0000000.0385670.0044790.0214620.0000000.0000000.0000000.0101320.000000Comedy
580.0002280.0466860.0000000.0000000.0001000.0000000.0048660.0000000.0016390.0137410.0370630.0692370.0000000.0120970.000000Book Adaptation
\n", 1057 | "
" 1058 | ], 1059 | "text/plain": [ 1060 | " Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6 Topic 7 \\\n", 1061 | "2 0.028314 0.000000 0.022122 0.001480 0.023043 0.002044 0.030939 \n", 1062 | "16 0.000251 0.000000 0.001575 0.000000 0.029132 0.002257 0.000000 \n", 1063 | "18 0.029574 0.000000 0.019010 0.001797 0.016906 0.008574 0.000129 \n", 1064 | "26 0.015179 0.000349 0.000000 0.000000 0.015907 0.012349 0.000000 \n", 1065 | "27 0.031523 0.008099 0.000171 0.003151 0.009975 0.001411 0.035158 \n", 1066 | "29 0.000000 0.000614 0.000000 0.000000 0.000000 0.014862 0.000000 \n", 1067 | "30 0.023404 0.012107 0.016814 0.000000 0.008135 0.009620 0.001377 \n", 1068 | "31 0.012324 0.003554 0.028753 0.000000 0.017125 0.003483 0.006804 \n", 1069 | "34 0.000000 0.016503 0.000000 0.000000 0.013825 0.000000 0.000000 \n", 1070 | "58 0.000228 0.046686 0.000000 0.000000 0.000100 0.000000 0.004866 \n", 1071 | "\n", 1072 | " Topic 8 Topic 9 Topic 10 Topic 11 Topic 12 Topic 13 Topic 14 \\\n", 1073 | "2 0.000000 0.006389 0.000000 0.000774 0.007251 0.000000 0.003574 \n", 1074 | "16 0.033108 0.016283 0.000000 0.012337 0.000000 0.003595 0.011944 \n", 1075 | "18 0.038010 0.005558 0.006250 0.036652 0.000000 0.000000 0.000000 \n", 1076 | "26 0.034328 0.015722 0.008809 0.004318 0.000000 0.000000 0.001958 \n", 1077 | "27 0.042588 0.000000 0.000000 0.001425 0.002624 0.000000 0.003865 \n", 1078 | "29 0.014987 0.010941 0.000000 0.000000 0.001534 0.066263 0.000000 \n", 1079 | "30 0.040382 0.000809 0.004582 0.004803 0.001186 0.014194 0.000000 \n", 1080 | "31 0.000000 0.003702 0.000000 0.006449 0.000833 0.034161 0.005682 \n", 1081 | "34 0.038567 0.004479 0.021462 0.000000 0.000000 0.000000 0.010132 \n", 1082 | "58 0.000000 0.001639 0.013741 0.037063 0.069237 0.000000 0.012097 \n", 1083 | "\n", 1084 | " Topic 15 max_topic \n", 1085 | "2 0.000000 War \n", 1086 | "16 0.010159 Comedy \n", 1087 | "18 0.000000 Comedy \n", 1088 | "26 0.000922 Comedy \n", 1089 | "27 0.002781 Comedy \n", 1090 | "29 0.036239 Horror \n", 1091 | "30 0.000000 Comedy \n", 1092 | "31 0.000000 Horror \n", 1093 | "34 0.000000 Comedy \n", 1094 | "58 0.000000 Book Adaptation " 1095 | ] 1096 | }, 1097 | "execution_count": 17, 1098 | "metadata": {}, 1099 | "output_type": "execute_result" 1100 | } 1101 | ], 1102 | "source": [ 1103 | "# Recall the document-topic matrix, W\n", 1104 | "\n", 1105 | "W = pd.DataFrame(W, columns=[f'Topic {i + 1}' for i in range(N_TOPICS)])\n", 1106 | "W['max_topic'] = W.apply(lambda x: topic_mapping.get(x.idxmax()), axis=1)\n", 1107 | "W[pd.notnull(W['max_topic'])].head(10)" 1108 | ] 1109 | }, 1110 | { 1111 | "cell_type": "code", 1112 | "execution_count": 21, 1113 | "id": "continent-intellectual", 1114 | "metadata": {}, 1115 | "outputs": [ 1116 | { 1117 | "data": { 1118 | "text/plain": [ 1119 | "'In my humble opinion, this movie did not receive the recognition it deserved. Robert Redford lives near me here in Provo, Utah, at Sundance. I enjoy most of his work, and this was my favorite. I\\'m sorry that more people didn\\'t appreciate it. My grandmother was an avid reader and read the book years before it came out on the big screen. She gave it to me to read after we had seen the movie together. The movie and book hit an emotional spot within my heart, and I was weepy for several days after seeing the movie. Sometimes love isn\\'t enough to keep our loved ones from hurting themselves. We see this in our own family relationships, yet our love and our families and their stories endure throughout generations of time. 
The cinematography was perfect and breathtaking -- I was awed by its beauty and how well it brought to life the words of the author of the book, Norman Maclean, \"But when I am alone in the half light of the canyon, all existence seems to fade to a being with my soul, and memories. And the sounds of the Big Black Foot River, and a four count rhythm, and the hope that a fish will rise. Eventually, all things merge into one, and a river runs through it. The river was cut by the world\\'s great flood and runs over rocks from the basement of time. On some of the rocks are timeless raindrops. Under the rocks are the words, and some of the words are theirs. I am haunted by waters.\" These words, taken from the book and spoken at the end of the movie (by Robert Redford who is narrating as Norman Maclean), are basically scripture, in my opinion. Any possible flaws the movie may have are overshadowed by the beauty and grace of the story and the cinematography. It was beautiful!'" 1120 | ] 1121 | }, 1122 | "execution_count": 21, 1123 | "metadata": {}, 1124 | "output_type": "execute_result" 1125 | } 1126 | ], 1127 | "source": [ 1128 | "reviews[58]" 1129 | ] 1130 | }, 1131 | { 1132 | "cell_type": "code", 1133 | "execution_count": null, 1134 | "id": "regulation-comparison", 1135 | "metadata": {}, 1136 | "outputs": [], 1137 | "source": [] 1138 | } 1139 | ], 1140 | "metadata": { 1141 | "kernelspec": { 1142 | "display_name": "Python 3", 1143 | "language": "python", 1144 | "name": "python3" 1145 | }, 1146 | "language_info": { 1147 | "codemirror_mode": { 1148 | "name": "ipython", 1149 | "version": 3 1150 | }, 1151 | "file_extension": ".py", 1152 | "mimetype": "text/x-python", 1153 | "name": "python", 1154 | "nbconvert_exporter": "python", 1155 | "pygments_lexer": "ipython3", 1156 | "version": "3.8.5" 1157 | } 1158 | }, 1159 | "nbformat": 4, 1160 | "nbformat_minor": 5 1161 | } 1162 | -------------------------------------------------------------------------------- /Topic Modelling/data/init.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Init -------------------------------------------------------------------------------- /images/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/semantic_processing/c20bd62ff9120544d808c60718dcc7653b2853e3/images/.DS_Store -------------------------------------------------------------------------------- /images/image1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/semantic_processing/c20bd62ff9120544d808c60718dcc7653b2853e3/images/image1.png -------------------------------------------------------------------------------- /images/image2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/semantic_processing/c20bd62ff9120544d808c60718dcc7653b2853e3/images/image2.png -------------------------------------------------------------------------------- /images/image3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/semantic_processing/c20bd62ff9120544d808c60718dcc7653b2853e3/images/image3.png -------------------------------------------------------------------------------- /images/image4.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ContentUpgrad/semantic_processing/c20bd62ff9120544d808c60718dcc7653b2853e3/images/image4.png -------------------------------------------------------------------------------- /images/image5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/semantic_processing/c20bd62ff9120544d808c60718dcc7653b2853e3/images/image5.png -------------------------------------------------------------------------------- /images/image6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ContentUpgrad/semantic_processing/c20bd62ff9120544d808c60718dcc7653b2853e3/images/image6.png --------------------------------------------------------------------------------