├── Knowledge Graphs
│   ├── init
│   ├── wordnet.png
│   ├── wordnet-graph.ipynb
│   ├── tree.py
│   ├── intro-to-wordnet.ipynb
│   └── lesk.ipynb
├── topic modelling
│   ├── init
│   └── nmf-imdb-movie-reviews.ipynb
├── neural networks for NLP
│   ├── init
│   ├── images
│   │   ├── init
│   │   ├── or.png
│   │   ├── ann.png
│   │   ├── nand.png
│   │   ├── xor.png
│   │   ├── ANDand.png
│   │   ├── combo.png
│   │   ├── deepnet.png
│   │   └── matmul.png
│   ├── imdb-reviews-classification.ipynb
│   ├── keras.ipynb
│   └── forward-pass.ipynb
├── _config.yml
└── distributional semantics
    ├── init
    ├── images
    │   ├── init
    │   ├── Emb1.png
    │   ├── Emb2.png
    │   ├── Emb3.png
    │   ├── Emb4.png
    │   ├── man-king.png
    │   ├── king-queen.png
    │   ├── man-woman.png
    │   └── woman-queen.png
    ├── utils.py
    └── w2v-text-classification.ipynb
/Knowledge Graphs/init:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/topic modelling/init:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/neural networks for NLP/init:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-cayman
--------------------------------------------------------------------------------
/distributional semantics/init:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/distributional semantics/images/init:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/neural networks for NLP/images/init:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/Knowledge Graphs/wordnet.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ContentUpgrad/Semantic-Processing/main/Knowledge Graphs/wordnet.png
--------------------------------------------------------------------------------
/neural networks for NLP/images/or.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ContentUpgrad/Semantic-Processing/main/neural networks for NLP/images/or.png
--------------------------------------------------------------------------------
/neural networks for NLP/images/ann.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ContentUpgrad/Semantic-Processing/main/neural networks for NLP/images/ann.png
--------------------------------------------------------------------------------
/neural networks for NLP/images/nand.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ContentUpgrad/Semantic-Processing/main/neural networks for NLP/images/nand.png
--------------------------------------------------------------------------------
/neural networks for NLP/images/xor.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ContentUpgrad/Semantic-Processing/main/neural networks for NLP/images/xor.png
--------------------------------------------------------------------------------
/distributional semantics/images/Emb1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ContentUpgrad/Semantic-Processing/main/distributional semantics/images/Emb1.png
--------------------------------------------------------------------------------
/distributional semantics/images/Emb2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ContentUpgrad/Semantic-Processing/main/distributional semantics/images/Emb2.png
--------------------------------------------------------------------------------
/distributional semantics/images/Emb3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ContentUpgrad/Semantic-Processing/main/distributional semantics/images/Emb3.png
--------------------------------------------------------------------------------
/distributional semantics/images/Emb4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ContentUpgrad/Semantic-Processing/main/distributional semantics/images/Emb4.png
--------------------------------------------------------------------------------
/neural networks for NLP/images/ANDand.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ContentUpgrad/Semantic-Processing/main/neural networks for NLP/images/ANDand.png
--------------------------------------------------------------------------------
/neural networks for NLP/images/combo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ContentUpgrad/Semantic-Processing/main/neural networks for NLP/images/combo.png
--------------------------------------------------------------------------------
/neural networks for NLP/images/deepnet.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ContentUpgrad/Semantic-Processing/main/neural networks for NLP/images/deepnet.png
--------------------------------------------------------------------------------
/neural networks for NLP/images/matmul.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ContentUpgrad/Semantic-Processing/main/neural networks for NLP/images/matmul.png
--------------------------------------------------------------------------------
/distributional semantics/images/man-king.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ContentUpgrad/Semantic-Processing/main/distributional semantics/images/man-king.png
--------------------------------------------------------------------------------
/distributional semantics/images/king-queen.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ContentUpgrad/Semantic-Processing/main/distributional semantics/images/king-queen.png
--------------------------------------------------------------------------------
/distributional semantics/images/man-woman.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ContentUpgrad/Semantic-Processing/main/distributional semantics/images/man-woman.png
--------------------------------------------------------------------------------
/distributional semantics/images/woman-queen.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ContentUpgrad/Semantic-Processing/main/distributional semantics/images/woman-queen.png
--------------------------------------------------------------------------------
/Knowledge Graphs/wordnet-graph.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "advance-confidence",
6 | "metadata": {},
7 | "source": []
8 | },
9 | {
10 | "cell_type": "markdown",
11 | "id": "prostate-franchise",
12 | "metadata": {},
13 | "source": [
14 | "
"
15 | ]
16 | }
17 | ],
18 | "metadata": {
19 | "kernelspec": {
20 | "display_name": "Python 3",
21 | "language": "python",
22 | "name": "python3"
23 | },
24 | "language_info": {
25 | "codemirror_mode": {
26 | "name": "ipython",
27 | "version": 3
28 | },
29 | "file_extension": ".py",
30 | "mimetype": "text/x-python",
31 | "name": "python",
32 | "nbconvert_exporter": "python",
33 | "pygments_lexer": "ipython3",
34 | "version": "3.7.7"
35 | }
36 | },
37 | "nbformat": 4,
38 | "nbformat_minor": 5
39 | }
40 |
--------------------------------------------------------------------------------
/Knowledge Graphs/tree.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """Exploring WordNet synsets, domains and the hypernym/hyponym tree via spacy-wordnet."""
3 | from spacy import load
4 | from spacy_wordnet.wordnet_annotator import WordnetAnnotator
5 | 
6 | # Load spaCy and attach the WordNet annotator to the pipeline
7 | nlp = load('en_core_web_sm')
8 | nlp.add_pipe(WordnetAnnotator(nlp.lang))
9 | 
10 | # Synsets and lemmas of a single token
11 | token = nlp('Calculator.')[0]
12 | print(token._.wordnet.synsets())
13 | meaning1, meaning2 = token._.wordnet.synsets()
14 | print(meaning1.name())
15 | print(meaning1.lemmas())
16 | print(meaning2.lemmas())
17 | 
18 | # WordNet domains
19 | print(token._.wordnet.wordnet_domains())
20 | domains = nlp('mathematics')[0]._.wordnet.wordnet_domains()
21 | print('science' in domains)
22 | print(nlp('pure_science')[0]._.wordnet.wordnet_domains())
23 | wnet = nlp('science')[0]._.wordnet
24 | print(wnet.wordnet_synsets_for_domain(['pure_science']))
25 | print(wnet.lemmas())
26 | 
27 | # Lemmas, synsets and common hypernyms
28 | token = nlp('human')[0]
29 | print(token._.wordnet.lemmas())
30 | print(token._.wordnet.synsets())
31 | print([s.lemmas() for s in token._.wordnet.synsets()])
32 | 
33 | man_syn = nlp('man')[0]._.wordnet.synsets()[0]
34 | woman_syn = nlp('woman')[0]._.wordnet.synsets()[0]
35 | print(man_syn.common_hypernyms(woman_syn))
36 | print(man_syn.entailments())
37 | print(man_syn.examples())
38 | print(man_syn.hypernyms())
39 | print(woman_syn.hypernyms())
40 | print(man_syn.hyponyms())
41 | print(woman_syn.hyponyms())
42 | print(man_syn.lemmas())
43 | print(man_syn.similar_tos())
44 | 
45 | # Walk up the hypernym chain until the root synset ('entity.n.01') is reached
46 | x = man_syn
47 | while x.hypernyms():
48 |     print(x)
49 |     x = x.hypernyms()[0]
50 | print(x)
51 | 
52 | # Walk down the first hyponym branch until a leaf synset is reached
53 | x = man_syn
54 | while x.hyponyms():
55 |     print(x)
56 |     x = x.hyponyms()[0]
57 | print(x)
58 | 
--------------------------------------------------------------------------------
/distributional semantics/utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # In[1]:
5 |
6 |
7 | from sklearn.decomposition import PCA
8 | from sklearn.manifold import TSNE
9 | import numpy as np
10 | import matplotlib.pyplot as plt
11 | from gensim.models.callbacks import CallbackAny2Vec
12 | from gensim.models import Word2Vec, KeyedVectors
13 | from tensorflow.keras.layers import Embedding
14 |
15 |
16 | class MetricCallback(CallbackAny2Vec):
17 | """
18 | Callback to print loss after each epoch
19 | """
20 | def __init__(self, every=10):
21 | self.myloss = []
22 | self.epoch = 0
23 | self.every = every
24 |
25 | def on_epoch_end(self, model):
26 | loss = model.get_latest_training_loss()
27 | if self.epoch == 0:
28 | self.myloss.append(loss)
29 | else:
30 | self.myloss.append(loss - self.loss_previous_step)
31 | if self.epoch % self.every == 0:
32 | print(f'Loss after epoch {self.epoch}: {self.myloss[-1]}') # NOQA: T001
33 | self.epoch += 1
34 | self.loss_previous_step = loss
35 |
36 |
37 | def plot_arrows(starts, ends, wv, estimator=PCA, **kwargs):
38 | if len(starts) != len(ends):
39 | raise ValueError('starts and ends must be the same length.')
40 | fig, ax = plt.subplots(figsize=kwargs.pop('figsize', (8, 8)))
41 | X = wv[starts + ends] # NOQA: N806
42 | x_red = estimator(n_components=2).fit_transform(X)
43 | plt.scatter(*x_red.T)
44 | for i, word in enumerate(starts + ends):
45 | plt.annotate(word, x_red[i])
46 | xstart = x_red[:len(starts)]
47 | xend = x_red[len(starts):]
48 | for i, (start, end) in enumerate(zip(starts, ends)):
49 | x1, y1 = xstart[i]
50 | x2, y2 = xend[i]
51 | plt.arrow(x1, y1, x2 - x1, y2 - y1)
52 |
53 |
54 | def plot_vectors(words, model, estimator=TSNE, **kwargs):
55 | names = []
56 | vectors = []
57 | for word in words:
58 | if word in model.wv:
59 | names.append(word)
60 | vectors.append(model.wv[word])
61 |
62 | X = np.r_[vectors] # NOQA: N806
63 | x_red = estimator(n_components=2).fit_transform(X)
64 | fig, ax = plt.subplots(figsize=kwargs.pop('figsize', (16, 16))) # NOQA: E912
65 | ax.scatter(*x_red.T)
66 |
67 | for i, word in enumerate(names):
68 | plt.annotate(word, x_red[i])
69 |
70 |
71 | def make_embedding_layer(model, tokenizer, MAX_SEQUENCE_LENGTH): # NOQA: N803
72 | word_index = tokenizer.word_index
73 | if isinstance(model, Word2Vec):
74 | wv = model.wv
75 | elif isinstance(model, KeyedVectors):
76 | wv = model
77 | embedding_matrix = np.zeros((len(word_index) + 1, wv.vector_size))
78 | for word, i in word_index.items():
79 | try:
80 | vector = wv.get_vector(word, False)
81 | embedding_matrix[i] = vector
82 | except KeyError:
83 | continue
84 | el = Embedding(
85 | len(word_index) + 1, wv.vector_size, weights=[embedding_matrix],
86 | input_length=MAX_SEQUENCE_LENGTH, trainable=False
87 | )
88 | return el
89 |
--------------------------------------------------------------------------------
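
A minimal sketch of how these helpers might be tied together, assuming gensim >= 4 (hence `vector_size`), TensorFlow 2.x, and a small inline corpus standing in for the real review text (the corpus and layer sizes here are illustrative):

    from sklearn.decomposition import PCA
    from tensorflow.keras.preprocessing.text import Tokenizer
    from gensim.models import Word2Vec
    from utils import MetricCallback, make_embedding_layer, plot_vectors

    # Toy corpus; in the course notebooks this would be the tokenized review text
    sentences = [['the', 'movie', 'was', 'great'], ['a', 'dull', 'plot'], ['great', 'plot']]

    # Train Word2Vec, printing the running loss every 10 epochs via MetricCallback
    w2v = Word2Vec(sentences, vector_size=50, min_count=1, compute_loss=True,
                   epochs=50, callbacks=[MetricCallback(every=10)])

    # Fit a Keras tokenizer on the raw text and build a frozen Embedding layer from the vectors
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([' '.join(s) for s in sentences])
    embedding_layer = make_embedding_layer(w2v, tokenizer, MAX_SEQUENCE_LENGTH=100)

    # Project a few word vectors to 2D and label them (PCA avoids t-SNE's minimum-sample limits here)
    plot_vectors(['movie', 'plot', 'great', 'dull'], w2v, estimator=PCA)
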
/neural networks for NLP/imdb-reviews-classification.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "respective-webster",
6 | "metadata": {},
7 | "source": [
8 | "# Text Classification"
9 | ]
10 | },
11 | {
12 | "cell_type": "raw",
13 | "id": "adverse-evolution",
14 | "metadata": {},
15 | "source": [
16 | "----------------------------------------------------------------------\n",
17 | "Filename : imdb-reviews-classification.ipynb\n",
18 | "Author : Jaidev Deshpande\n",
19 | "Purpose : Understanding text classification using keras\n",
20 | "Libraries: tensorflow.keras, numpy, sklearn and pandas\n",
21 | "----------------------------------------------------------------------"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "id": "seasonal-balloon",
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "# Imports\n",
32 | "\n",
33 | "from tensorflow.keras.preprocessing import text_dataset_from_directory\n",
34 | "from tensorflow.keras import Sequential\n",
35 | "from tensorflow.keras.layers import Dense\n",
36 | "from tensorflow.keras.optimizers import SGD\n",
37 | "\n",
38 | "import pandas as pd\n",
39 | "import numpy as np\n",
40 | "from sklearn.feature_extraction.text import TfidfVectorizer\n",
41 | "from sklearn.model_selection import train_test_split\n",
42 | "import matplotlib.pyplot as plt\n",
43 | "%matplotlib inline"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "id": "corrected-railway",
49 | "metadata": {},
50 | "source": [
51 | "## The Problem: Large Movie Dataset Review\n",
52 | "### Classify movie reviews from IMDB into positive or negative sentiment.\n",
53 | "### Download the dataset [here](https://drive.google.com/drive/u/0/folders/1hYw0TQbwcM2YWEiKN-2c_kHCPtHO_TMC)"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": null,
59 | "id": "fossil-vertical",
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
63 | "# Importing & preprocessing the dataset\n",
64 | "\n",
65 | "train_ds = text_dataset_from_directory('aclImdb/train')\n",
66 | "test_ds = text_dataset_from_directory('aclImdb/test')\n",
67 | "\n",
68 | "dfTrain = pd.DataFrame(train_ds.unbatch().as_numpy_iterator(), columns=['text', 'label'])\n",
69 | "dfTest = pd.DataFrame(test_ds.unbatch().as_numpy_iterator(), columns=['text', 'label'])\n",
70 | "_, xts = train_test_split(dfTest, stratify=dfTest['label'], test_size=0.25)"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "id": "vertical-northern",
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "# Look at a sample movie review\n",
81 | "\n",
82 | "print(dfTrain.loc[0, 'text'])"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "id": "virgin-illustration",
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "# Look at the label of the review\n",
93 | "\n",
94 | "dfTrain.loc[0, 'label']"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "id": "sustained-camcorder",
101 | "metadata": {},
102 | "outputs": [],
103 | "source": [
104 | "pd.options.display.max_colwidth = 100\n",
105 | "dfTrain.head()"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "id": "improved-negotiation",
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "dfTest.head()"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "id": "enormous-comment",
122 | "metadata": {},
123 | "outputs": [],
124 | "source": [
125 | "# Feature Extraction - Text to TFIDF\n",
126 | "\n",
127 | "vect = TfidfVectorizer(stop_words='english')\n",
128 | "XTrain = vect.fit_transform(dfTrain['text']).toarray()\n",
129 | "XTest = vect.transform(xts['text']).toarray()"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": null,
135 | "id": "included-healing",
136 | "metadata": {},
137 | "outputs": [],
138 | "source": [
139 | "XTrain.shape"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": null,
145 | "id": "according-russia",
146 | "metadata": {},
147 | "outputs": [],
148 | "source": [
149 | "# Assemble and compile the neural network\n",
150 | "\n",
151 | "model = Sequential([\n",
152 | " Dense(128, input_shape=(XTrain.shape[1],), activation='relu'),\n",
153 | " Dense(128, activation='relu'),\n",
154 | " Dense(64, activation='relu'),\n",
155 | " Dense(1, activation='sigmoid')\n",
156 | "])\n",
157 | "model.compile(loss='binary_crossentropy', optimizer=SGD(lr=1e-3), metrics=['accuracy'])"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "id": "protective-parking",
164 | "metadata": {
165 | "scrolled": true
166 | },
167 | "outputs": [],
168 | "source": [
169 | "# Train the neural network\n",
170 | "\n",
171 | "history = model.fit(XTrain, dfTrain['label'], batch_size=64, validation_data=(XTest, xts['label']), epochs=50)"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": null,
177 | "id": "completed-poultry",
178 | "metadata": {},
179 | "outputs": [],
180 | "source": [
181 | "# Visualize the learning curve\n",
182 | "\n",
183 | "plt.plot(history.history['accuracy'], label='Train Accuracy')\n",
184 | "plt.plot(history.history['val_accuracy'], label='Test Accuracy')\n",
185 | "plt.legend()"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": null,
191 | "id": "metallic-office",
192 | "metadata": {},
193 | "outputs": [],
194 | "source": []
195 | }
196 | ],
197 | "metadata": {
198 | "kernelspec": {
199 | "display_name": "Python 3",
200 | "language": "python",
201 | "name": "python3"
202 | },
203 | "language_info": {
204 | "codemirror_mode": {
205 | "name": "ipython",
206 | "version": 3
207 | },
208 | "file_extension": ".py",
209 | "mimetype": "text/x-python",
210 | "name": "python",
211 | "nbconvert_exporter": "python",
212 | "pygments_lexer": "ipython3",
213 | "version": "3.7.7"
214 | }
215 | },
216 | "nbformat": 4,
217 | "nbformat_minor": 5
218 | }
219 |
--------------------------------------------------------------------------------
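
If a numeric summary is wanted in addition to the accuracy curves, a follow-up cell along these lines could score the held-out split; it assumes `model`, `XTest` and `xts` from the cells above are still in memory:

    from sklearn.metrics import classification_report

    # Overall loss / accuracy on the held-out quarter of the test reviews
    loss, accuracy = model.evaluate(XTest, xts['label'], verbose=0)
    print(f'Test loss: {loss:.4f}, test accuracy: {accuracy:.4f}')

    # Threshold the sigmoid outputs at 0.5 to get hard labels and a per-class report
    preds = (model.predict(XTest).ravel() > 0.5).astype(int)
    print(classification_report(xts['label'], preds))
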
/neural networks for NLP/keras.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "strange-seafood",
6 | "metadata": {},
7 | "source": [
8 | "## [Installing Tensorflow / Keras](https://www.tensorflow.org/install)"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": null,
14 | "id": "aquatic-learning",
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "# Imports\n",
19 | "\n",
20 | "from tensorflow.keras.layers import Dense\n",
21 | "from tensorflow.keras import Sequential\n",
22 | "from tensorflow.keras.optimizers import SGD\n",
23 | "from tensorflow.keras.utils import to_categorical, plot_model\n",
24 | "import pandas as pd\n",
25 | "import numpy as np\n",
26 | "import matplotlib.pyplot as plt\n",
27 | "%matplotlib inline"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "id": "maritime-flavor",
33 | "metadata": {},
34 | "source": [
35 | "## Problem: **Credit Card Fraud Detection**\n",
36 | "### Given a list of 28 anonimyzed features and the amount of money involved in a credt card transaction, predict whether the transaction is likely to be fraudulent.\n",
37 | "### Source: https://www.kaggle.com/mlg-ulb/creditcardfraud/"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": null,
43 | "id": "occupied-eagle",
44 | "metadata": {},
45 | "outputs": [],
46 | "source": [
47 | "# Preview dataset\n",
48 | "\n",
49 | "df = pd.read_csv('creditcard.csv')\n",
50 | "df.drop(['Time'], axis=1, inplace=True)\n",
51 | "df.head()"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "id": "stock-imperial",
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "print(len(df))"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "id": "charged-parade",
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "# Check the proportion of the classes\n",
72 | "\n",
73 | "df['Class'].value_counts(normalize=True)"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "id": "level-telephone",
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "df['Class'].value_counts(normalize=True).plot(kind='pie')"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "id": "attached-potter",
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "# Define the input and the output\n",
94 | "\n",
95 | "y = df.pop('Class').values\n",
96 | "X = df.values"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": null,
102 | "id": "aging-cream",
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "# Assemble the network\n",
107 | "\n",
108 | "model = Sequential([\n",
109 | " Dense(16, input_shape=(29,), activation='relu'),\n",
110 | " Dense(8, activation='relu'),\n",
111 | " Dense(2, activation='softmax')\n",
112 | "])\n",
113 | "model.compile(loss='sparse_categorical_crossentropy', optimizer=SGD(lr=1e-3), metrics=['categorical_accuracy'])"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": null,
119 | "id": "experienced-terminal",
120 | "metadata": {},
121 | "outputs": [],
122 | "source": [
123 | "# View the network - layer by layer\n",
124 | "\n",
125 | "plot_model(model, show_shapes=True, rankdir='LR')"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": null,
131 | "id": "revolutionary-gothic",
132 | "metadata": {},
133 | "outputs": [],
134 | "source": [
135 | "model.layers"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": null,
141 | "id": "authorized-holiday",
142 | "metadata": {},
143 | "outputs": [],
144 | "source": [
145 | "layer1, layer2, layer3 = model.layers"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "id": "quarterly-participation",
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "# View the shapes of weights and biases for each layer\n",
156 | "\n",
157 | "l1_weight, l1_bias = layer1.weights\n",
158 | "print(l1_weight.shape)\n",
159 | "print(l1_bias.shape)"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": null,
165 | "id": "impaired-emphasis",
166 | "metadata": {},
167 | "outputs": [],
168 | "source": [
169 | "l2_weight, l2_bias = layer2.weights\n",
170 | "print(l2_weight.shape)\n",
171 | "print(l2_bias.shape)"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": null,
177 | "id": "contained-leave",
178 | "metadata": {},
179 | "outputs": [],
180 | "source": [
181 | "l3_weight, l3_bias = layer3.weights\n",
182 | "print(l3_weight.shape)\n",
183 | "print(l3_bias.shape)"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "id": "ultimate-times",
190 | "metadata": {
191 | "scrolled": true
192 | },
193 | "outputs": [],
194 | "source": [
195 | "# Train the model\n",
196 | "\n",
197 | "history = model.fit(X, y, batch_size=128, epochs=30)"
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": null,
203 | "id": "signal-fireplace",
204 | "metadata": {},
205 | "outputs": [],
206 | "source": [
207 | "# Plot the loss across epochs\n",
208 | "\n",
209 | "plt.plot(history.history['loss'], label='Loss')"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": null,
215 | "id": "advised-border",
216 | "metadata": {},
217 | "outputs": [],
218 | "source": [
219 | "# Plot the accuracy across epochs\n",
220 | "\n",
221 | "plt.plot(history.history['categorical_accuracy'], label='Accuracy')"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": null,
227 | "id": "attended-mileage",
228 | "metadata": {},
229 | "outputs": [],
230 | "source": []
231 | }
232 | ],
233 | "metadata": {
234 | "kernelspec": {
235 | "display_name": "Python 3",
236 | "language": "python",
237 | "name": "python3"
238 | },
239 | "language_info": {
240 | "codemirror_mode": {
241 | "name": "ipython",
242 | "version": 3
243 | },
244 | "file_extension": ".py",
245 | "mimetype": "text/x-python",
246 | "name": "python",
247 | "nbconvert_exporter": "python",
248 | "pygments_lexer": "ipython3",
249 | "version": "3.7.7"
250 | }
251 | },
252 | "nbformat": 4,
253 | "nbformat_minor": 5
254 | }
255 |
--------------------------------------------------------------------------------
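
The value_counts cell shows that fraudulent transactions are a tiny fraction of the data, so plain accuracy can look deceptively high. One common follow-up, sketched here under the assumption that `X`, `y` and `model` from the cells above are in memory, is to reweight the minority class during training:

    import numpy as np

    # Class weights inversely proportional to the class frequencies (illustrative choice)
    counts = np.bincount(y)
    class_weight = {0: 1.0, 1: counts[0] / counts[1]}
    history = model.fit(X, y, batch_size=128, epochs=30, class_weight=class_weight)

    # Hard class predictions from the softmax output
    preds = model.predict(X).argmax(axis=1)
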
/neural networks for NLP/forward-pass.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "looking-election",
6 | "metadata": {},
7 | "source": [
8 | "# Forward Pass"
9 | ]
10 | },
11 | {
12 | "cell_type": "raw",
13 | "id": "continuous-royalty",
14 | "metadata": {},
15 | "source": [
16 | "----------------------------------------------------------------------\n",
17 | "Filename : forward-pass.ipynb\n",
18 | "Author : Jaidev Deshpande\n",
19 | "Content : Understanding feed forward propagation\n",
20 | "Libraries: numpy, sklearn and pandas\n",
21 | "----------------------------------------------------------------------"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "id": "understood-island",
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "# Importing libraries\n",
32 | "\n",
33 | "from sklearn.datasets import load_iris\n",
34 | "import pandas as pd\n",
35 | "import numpy as np"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "id": "spoken-clarity",
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "# Load the Iris dataset\n",
46 | "\n",
47 | "iris = load_iris()\n",
48 | "X = iris.data\n",
49 | "y = iris.target\n",
50 | "df = pd.DataFrame(X, columns=iris.feature_names)\n",
51 | "df['species'] = iris.target_names[y]\n",
52 | "df.sample(n=5)"
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "id": "western-consensus",
58 | "metadata": {},
59 | "source": [
60 | "
"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "id": "detailed-nomination",
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "# Activation functions\n",
71 | "\n",
72 | "def sigmoid(x):\n",
73 | " return 1 / (1 + np.exp(-x))\n",
74 | "\n",
75 | "\n",
76 | "def softmax(x):\n",
77 | " return np.exp(x) / np.exp(x).sum()"
78 | ]
79 | },
80 | {
81 | "cell_type": "markdown",
82 | "id": "unable-typing",
83 | "metadata": {},
84 | "source": [
85 | "## Matrix Multiplication\n",
86 | "
"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "id": "played-brighton",
92 | "metadata": {},
93 | "source": [
94 | "## Layer 1 Weights, Biases and Activation"
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "id": "mounted-penetration",
100 | "metadata": {},
101 | "source": [
102 | "
"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "id": "loving-metropolitan",
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "df.head()"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "id": "split-tension",
119 | "metadata": {},
120 | "outputs": [],
121 | "source": [
122 | "x = X[:5]\n",
123 | "x"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": null,
129 | "id": "cross-finder",
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "# Initialize weights and biases\n",
134 | "\n",
135 | "w1 = np.random.rand(4, 5)\n",
136 | "b1 = np.random.rand(5)\n",
137 | "print(w1.shape)\n",
138 | "print(b1.shape)"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": null,
144 | "id": "recorded-prison",
145 | "metadata": {},
146 | "outputs": [],
147 | "source": [
148 | "# Take a data sample\n",
149 | "\n",
150 | "sample = x[0]\n",
151 | "sample"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": null,
157 | "id": "american-access",
158 | "metadata": {},
159 | "outputs": [],
160 | "source": [
161 | "print(w1)"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "id": "governing-reader",
168 | "metadata": {},
169 | "outputs": [],
170 | "source": [
171 | "# Multiply the sample with the first layer weights <𝐱,𝐰1>\n",
172 | "\n",
173 | "a1 = np.dot(sample, w1)\n",
174 | "a1"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": null,
180 | "id": "concrete-differential",
181 | "metadata": {},
182 | "outputs": [],
183 | "source": [
184 | "b1"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "id": "institutional-talent",
191 | "metadata": {},
192 | "outputs": [],
193 | "source": [
194 | "# Add the bias to the product <𝐱,𝐰1>+ b\n",
195 | "\n",
196 | "a1 + b1"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": null,
202 | "id": "balanced-storm",
203 | "metadata": {},
204 | "outputs": [],
205 | "source": [
206 | "# Apply the activation to get the output of the first layer f(<𝐱,𝐰1>+ b)\n",
207 | "\n",
208 | "f1 = sigmoid(a1 + b1)\n",
209 | "f1"
210 | ]
211 | },
212 | {
213 | "cell_type": "markdown",
214 | "id": "promising-exhibit",
215 | "metadata": {},
216 | "source": [
217 | "## Layer 2: Weights, Biases and Activations"
218 | ]
219 | },
220 | {
221 | "cell_type": "markdown",
222 | "id": "acknowledged-cruise",
223 | "metadata": {},
224 | "source": [
225 | "
"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": null,
231 | "id": "color-source",
232 | "metadata": {},
233 | "outputs": [],
234 | "source": [
235 | "w2 = np.random.rand(5, 7)\n",
236 | "b2 = np.random.rand(7)"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": null,
242 | "id": "visible-fifth",
243 | "metadata": {},
244 | "outputs": [],
245 | "source": [
246 | "a2 = np.dot(f1, w2) + b2\n",
247 | "a2"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": null,
253 | "id": "radical-serum",
254 | "metadata": {},
255 | "outputs": [],
256 | "source": [
257 | "f2 = sigmoid(a2)\n",
258 | "f2"
259 | ]
260 | },
261 | {
262 | "cell_type": "markdown",
263 | "id": "ahead-outreach",
264 | "metadata": {},
265 | "source": [
266 | "## Layer 3: Weights, Biases and Activations"
267 | ]
268 | },
269 | {
270 | "cell_type": "markdown",
271 | "id": "fuzzy-homeless",
272 | "metadata": {},
273 | "source": [
274 | "
"
275 | ]
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": null,
280 | "id": "conventional-white",
281 | "metadata": {},
282 | "outputs": [],
283 | "source": [
284 | "w3 = np.random.rand(7, 3)\n",
285 | "b3 = np.random.rand(3)"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": null,
291 | "id": "beautiful-springfield",
292 | "metadata": {},
293 | "outputs": [],
294 | "source": [
295 | "a3 = np.dot(f2, w3) + b3\n",
296 | "a3"
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": null,
302 | "id": "solid-inquiry",
303 | "metadata": {},
304 | "outputs": [],
305 | "source": [
306 | "f3 = softmax(a3)\n",
307 | "f3"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": null,
313 | "id": "valuable-science",
314 | "metadata": {},
315 | "outputs": [],
316 | "source": []
317 | }
318 | ],
319 | "metadata": {
320 | "kernelspec": {
321 | "display_name": "Python 3",
322 | "language": "python",
323 | "name": "python3"
324 | },
325 | "language_info": {
326 | "codemirror_mode": {
327 | "name": "ipython",
328 | "version": 3
329 | },
330 | "file_extension": ".py",
331 | "mimetype": "text/x-python",
332 | "name": "python",
333 | "nbconvert_exporter": "python",
334 | "pygments_lexer": "ipython3",
335 | "version": "3.7.7"
336 | }
337 | },
338 | "nbformat": 4,
339 | "nbformat_minor": 5
340 | }
341 |
--------------------------------------------------------------------------------
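
The layer-by-layer steps above can be folded into a single reusable function; a small standalone sketch that mirrors the notebook's shapes (4 inputs -> 5 -> 7 -> 3 classes) with random weights and the first Iris sample:

    import numpy as np

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    def softmax(x):
        return np.exp(x) / np.exp(x).sum()

    def forward(sample, weights, biases):
        """Propagate one sample through the network: sigmoid hidden layers, softmax output."""
        a = sample
        for w, b in zip(weights[:-1], biases[:-1]):
            a = sigmoid(np.dot(a, w) + b)
        return softmax(np.dot(a, weights[-1]) + biases[-1])

    # Same layer sizes as the notebook: 4 -> 5 -> 7 -> 3
    weights = [np.random.rand(4, 5), np.random.rand(5, 7), np.random.rand(7, 3)]
    biases = [np.random.rand(5), np.random.rand(7), np.random.rand(3)]
    probs = forward(np.array([5.1, 3.5, 1.4, 0.2]), weights, biases)
    print(probs, probs.sum())  # three class probabilities summing to 1
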
/Knowledge Graphs/intro-to-wordnet.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "raw",
5 | "id": "freelance-naples",
6 | "metadata": {},
7 | "source": [
8 | "----------------------------------------------------------------------\n",
9 | "Filename : intro-to-wordnet.ipynb\n",
10 | "Author : Jaidev Deshpande\n",
11 | "Purpose : Understanding Wordnet functionalities\n",
12 | "Libraries: nltk\n",
13 | "----------------------------------------------------------------------"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "id": "anonymous-canadian",
19 | "metadata": {},
20 | "source": [
21 | "## [WordNet®](https://wordnet.princeton.edu/) Tutorial\n",
22 | "\n",
23 | "### Navigating Wornet Relationships"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 1,
29 | "id": "widespread-traveler",
30 | "metadata": {},
31 | "outputs": [
32 | {
33 | "name": "stdout",
34 | "output_type": "stream",
35 | "text": [
36 | "Requirement already satisfied: nltk in /home/jaidevd/anaconda3/lib/python3.7/site-packages (3.3)\n",
37 | "Requirement already satisfied: six in /home/jaidevd/anaconda3/lib/python3.7/site-packages (from nltk) (1.15.0)\n",
38 | "\u001b[33mWARNING: You are using pip version 21.0.1; however, version 21.1 is available.\n",
39 | "You should consider upgrading via the '/home/jaidevd/anaconda3/bin/python -m pip install --upgrade pip' command.\u001b[0m\n"
40 | ]
41 | }
42 | ],
43 | "source": [
44 | "!pip install nltk"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 2,
50 | "id": "vocational-clearance",
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "from nltk import download"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 3,
60 | "id": "sensitive-albany",
61 | "metadata": {},
62 | "outputs": [
63 | {
64 | "name": "stdout",
65 | "output_type": "stream",
66 | "text": [
67 | "[nltk_data] Downloading package wordnet to /home/jaidevd/nltk_data...\n",
68 | "[nltk_data] Package wordnet is already up-to-date!\n"
69 | ]
70 | },
71 | {
72 | "data": {
73 | "text/plain": [
74 | "True"
75 | ]
76 | },
77 | "execution_count": 3,
78 | "metadata": {},
79 | "output_type": "execute_result"
80 | }
81 | ],
82 | "source": [
83 | "download('wordnet')"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 4,
89 | "id": "weird-memory",
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "from nltk.corpus import wordnet"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": 5,
99 | "id": "developing-failure",
100 | "metadata": {},
101 | "outputs": [
102 | {
103 | "data": {
104 | "text/plain": [
105 | "[Synset('tractor.n.01'), Synset('tractor.n.02')]"
106 | ]
107 | },
108 | "execution_count": 5,
109 | "metadata": {},
110 | "output_type": "execute_result"
111 | }
112 | ],
113 | "source": [
114 | "# Synsets\n",
115 | "\n",
116 | "tractor = wordnet.synsets('tractor')\n",
117 | "tractor"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 6,
123 | "id": "seasonal-hungary",
124 | "metadata": {},
125 | "outputs": [
126 | {
127 | "data": {
128 | "text/plain": [
129 | "['a wheeled vehicle with large wheels; used in farming and other applications',\n",
130 | " 'a truck that has a cab but no body; used for pulling large trailers or vans']"
131 | ]
132 | },
133 | "execution_count": 6,
134 | "metadata": {},
135 | "output_type": "execute_result"
136 | }
137 | ],
138 | "source": [
139 | "# Definitions of senses\n",
140 | "\n",
141 | "[syn.definition() for syn in tractor]"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": 7,
147 | "id": "active-norwegian",
148 | "metadata": {},
149 | "outputs": [
150 | {
151 | "data": {
152 | "text/plain": [
153 | "[Synset('self-propelled_vehicle.n.01')]"
154 | ]
155 | },
156 | "execution_count": 7,
157 | "metadata": {},
158 | "output_type": "execute_result"
159 | }
160 | ],
161 | "source": [
162 | "# Hypernyms: Relation between a concept and its superordinate\n",
163 | "\n",
164 | "tractor = wordnet.synset('tractor.n.01')\n",
165 | "tractor.hypernyms()"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": 8,
171 | "id": "arctic-customer",
172 | "metadata": {},
173 | "outputs": [
174 | {
175 | "data": {
176 | "text/plain": [
177 | "[Synset('wheeled_vehicle.n.01')]"
178 | ]
179 | },
180 | "execution_count": 8,
181 | "metadata": {},
182 | "output_type": "execute_result"
183 | }
184 | ],
185 | "source": [
186 | "self_propelled_vehicle = wordnet.synset('self-propelled_vehicle.n.01')\n",
187 | "self_propelled_vehicle.hypernyms()"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": 9,
193 | "id": "invalid-lease",
194 | "metadata": {},
195 | "outputs": [
196 | {
197 | "data": {
198 | "text/plain": [
199 | "[Synset('axle.n.01'),\n",
200 | " Synset('brake.n.01'),\n",
201 | " Synset('splasher.n.01'),\n",
202 | " Synset('wheel.n.01')]"
203 | ]
204 | },
205 | "execution_count": 9,
206 | "metadata": {},
207 | "output_type": "execute_result"
208 | }
209 | ],
210 | "source": [
211 | "# Meronyms: Relation between a part and its whole\n",
212 | "\n",
213 | "wheeled_vehicle = wordnet.synset('wheeled_vehicle.n.01')\n",
214 | "wheeled_vehicle.part_meronyms()"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": 10,
220 | "id": "brown-weekend",
221 | "metadata": {},
222 | "outputs": [
223 | {
224 | "data": {
225 | "text/plain": [
226 | "[Synset('baby_buggy.n.01'),\n",
227 | " Synset('bicycle.n.01'),\n",
228 | " Synset('boneshaker.n.01'),\n",
229 | " Synset('car.n.02'),\n",
230 | " Synset('handcart.n.01'),\n",
231 | " Synset('horse-drawn_vehicle.n.01'),\n",
232 | " Synset('motor_scooter.n.01'),\n",
233 | " Synset('rolling_stock.n.01'),\n",
234 | " Synset('scooter.n.02'),\n",
235 | " Synset('self-propelled_vehicle.n.01'),\n",
236 | " Synset('skateboard.n.01'),\n",
237 | " Synset('trailer.n.04'),\n",
238 | " Synset('tricycle.n.01'),\n",
239 | " Synset('unicycle.n.01'),\n",
240 | " Synset('wagon.n.01'),\n",
241 | " Synset('wagon.n.04'),\n",
242 | " Synset('welcome_wagon.n.01')]"
243 | ]
244 | },
245 | "execution_count": 10,
246 | "metadata": {},
247 | "output_type": "execute_result"
248 | }
249 | ],
250 | "source": [
251 | "# Hyponyms: Relation between a concept and its subordinate\n",
252 | "\n",
253 | "wheeled_vehicle.hyponyms()"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": 11,
259 | "id": "theoretical-bargain",
260 | "metadata": {},
261 | "outputs": [
262 | {
263 | "data": {
264 | "text/plain": [
265 | "[Synset('wheeled_vehicle.n.01')]"
266 | ]
267 | },
268 | "execution_count": 11,
269 | "metadata": {},
270 | "output_type": "execute_result"
271 | }
272 | ],
273 | "source": [
274 | "# Holonyms: Relation between whole and its parts\n",
275 | "\n",
276 | "axle = wordnet.synset('axle.n.01')\n",
277 | "axle.part_holonyms()"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": 12,
283 | "id": "identical-shoulder",
284 | "metadata": {},
285 | "outputs": [
286 | {
287 | "data": {
288 | "text/plain": [
289 | "[Synset('armored_vehicle.n.01'),\n",
290 | " Synset('carrier.n.02'),\n",
291 | " Synset('forklift.n.01'),\n",
292 | " Synset('locomotive.n.01'),\n",
293 | " Synset('motor_vehicle.n.01'),\n",
294 | " Synset('personnel_carrier.n.01'),\n",
295 | " Synset('reconnaissance_vehicle.n.01'),\n",
296 | " Synset('recreational_vehicle.n.01'),\n",
297 | " Synset('streetcar.n.01'),\n",
298 | " Synset('tracked_vehicle.n.01'),\n",
299 | " Synset('tractor.n.01'),\n",
300 | " Synset('weapons_carrier.n.01')]"
301 | ]
302 | },
303 | "execution_count": 12,
304 | "metadata": {},
305 | "output_type": "execute_result"
306 | }
307 | ],
308 | "source": [
309 | "self_propelled_vehicle.hyponyms()"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": 13,
315 | "id": "quick-strengthening",
316 | "metadata": {},
317 | "outputs": [
318 | {
319 | "data": {
320 | "text/plain": [
321 | "[Synset('amphibian.n.01'),\n",
322 | " Synset('bloodmobile.n.01'),\n",
323 | " Synset('car.n.01'),\n",
324 | " Synset('doodlebug.n.01'),\n",
325 | " Synset('four-wheel_drive.n.01'),\n",
326 | " Synset('go-kart.n.01'),\n",
327 | " Synset('golfcart.n.01'),\n",
328 | " Synset('hearse.n.01'),\n",
329 | " Synset('motorcycle.n.01'),\n",
330 | " Synset('snowplow.n.01'),\n",
331 | " Synset('truck.n.01')]"
332 | ]
333 | },
334 | "execution_count": 13,
335 | "metadata": {},
336 | "output_type": "execute_result"
337 | }
338 | ],
339 | "source": [
340 | "motor_vehicle = wordnet.synset('motor_vehicle.n.01')\n",
341 | "motor_vehicle.hyponyms()"
342 | ]
343 | },
344 | {
345 | "cell_type": "code",
346 | "execution_count": 14,
347 | "id": "egyptian-appliance",
348 | "metadata": {},
349 | "outputs": [
350 | {
351 | "data": {
352 | "text/plain": [
353 | "[Synset('accelerator.n.01'),\n",
354 | " Synset('air_bag.n.01'),\n",
355 | " Synset('auto_accessory.n.01'),\n",
356 | " Synset('automobile_engine.n.01'),\n",
357 | " Synset('automobile_horn.n.01'),\n",
358 | " Synset('buffer.n.06'),\n",
359 | " Synset('bumper.n.02'),\n",
360 | " Synset('car_door.n.01'),\n",
361 | " Synset('car_mirror.n.01'),\n",
362 | " Synset('car_seat.n.01'),\n",
363 | " Synset('car_window.n.01'),\n",
364 | " Synset('fender.n.01'),\n",
365 | " Synset('first_gear.n.01'),\n",
366 | " Synset('floorboard.n.02'),\n",
367 | " Synset('gasoline_engine.n.01'),\n",
368 | " Synset('glove_compartment.n.01'),\n",
369 | " Synset('grille.n.02'),\n",
370 | " Synset('high_gear.n.01'),\n",
371 | " Synset('hood.n.09'),\n",
372 | " Synset('luggage_compartment.n.01'),\n",
373 | " Synset('rear_window.n.01'),\n",
374 | " Synset('reverse.n.02'),\n",
375 | " Synset('roof.n.02'),\n",
376 | " Synset('running_board.n.01'),\n",
377 | " Synset('stabilizer_bar.n.01'),\n",
378 | " Synset('sunroof.n.01'),\n",
379 | " Synset('tail_fin.n.02'),\n",
380 | " Synset('third_gear.n.01'),\n",
381 | " Synset('window.n.02')]"
382 | ]
383 | },
384 | "execution_count": 14,
385 | "metadata": {},
386 | "output_type": "execute_result"
387 | }
388 | ],
389 | "source": [
390 | "car = wordnet.synset('car.n.01')\n",
391 | "car.part_meronyms()"
392 | ]
393 | },
394 | {
395 | "cell_type": "code",
396 | "execution_count": null,
397 | "id": "civic-worst",
398 | "metadata": {},
399 | "outputs": [],
400 | "source": []
401 | }
402 | ],
403 | "metadata": {
404 | "kernelspec": {
405 | "display_name": "Python 3",
406 | "language": "python",
407 | "name": "python3"
408 | },
409 | "language_info": {
410 | "codemirror_mode": {
411 | "name": "ipython",
412 | "version": 3
413 | },
414 | "file_extension": ".py",
415 | "mimetype": "text/x-python",
416 | "name": "python",
417 | "nbconvert_exporter": "python",
418 | "pygments_lexer": "ipython3",
419 | "version": "3.7.7"
420 | }
421 | },
422 | "nbformat": 4,
423 | "nbformat_minor": 5
424 | }
425 |
--------------------------------------------------------------------------------
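
Hypernyms chain all the way up to a single root synset (entity.n.01 for nouns); a short sketch, reusing the notebook's wordnet import, that walks a synset up to the root and also prints every hypernym path:

    from nltk.corpus import wordnet

    # Follow the first hypernym at each step until the root synset is reached
    syn = wordnet.synset('tractor.n.01')
    while syn.hypernyms():
        print(syn.name())
        syn = syn.hypernyms()[0]
    print(syn.name())  # 'entity.n.01'

    # Or let WordNet enumerate every path from the synset to the root
    for path in wordnet.synset('tractor.n.01').hypernym_paths():
        print([s.name() for s in path])
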
/Knowledge Graphs/lesk.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "shaped-norway",
6 | "metadata": {},
7 | "source": [
8 | "## Word-Sense Disambiguation"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 1,
14 | "id": "genetic-terror",
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "from nltk.corpus import wordnet as wn\n",
19 | "from nltk import wsd"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 2,
25 | "id": "adult-bangladesh",
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "X = 'The die is cast.'\n",
30 | "Y = 'Roll the die to get a 6.'\n",
31 | "Z = 'What is dead may never die.'"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 3,
37 | "id": "antique-burlington",
38 | "metadata": {},
39 | "outputs": [
40 | {
41 | "data": {
42 | "text/plain": [
43 | "[Synset('die.n.01'),\n",
44 | " Synset('die.n.02'),\n",
45 | " Synset('die.n.03'),\n",
46 | " Synset('die.v.01'),\n",
47 | " Synset('die.v.02'),\n",
48 | " Synset('die.v.03'),\n",
49 | " Synset('fail.v.04'),\n",
50 | " Synset('die.v.05'),\n",
51 | " Synset('die.v.06'),\n",
52 | " Synset('die.v.07'),\n",
53 | " Synset('die.v.08'),\n",
54 | " Synset('die.v.09'),\n",
55 | " Synset('die.v.10'),\n",
56 | " Synset('die.v.11')]"
57 | ]
58 | },
59 | "execution_count": 3,
60 | "metadata": {},
61 | "output_type": "execute_result"
62 | }
63 | ],
64 | "source": [
65 | "wn.synsets('die')"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 4,
71 | "id": "governing-montana",
72 | "metadata": {},
73 | "outputs": [
74 | {
75 | "data": {
76 | "text/plain": [
77 | "[Synset('die.n.01'), Synset('die.n.02'), Synset('die.n.03')]"
78 | ]
79 | },
80 | "execution_count": 4,
81 | "metadata": {},
82 | "output_type": "execute_result"
83 | }
84 | ],
85 | "source": [
86 | "wn.synsets('die', pos=wn.NOUN)"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 5,
92 | "id": "located-bookmark",
93 | "metadata": {},
94 | "outputs": [
95 | {
96 | "name": "stdout",
97 | "output_type": "stream",
98 | "text": [
99 | "a small cube with 1 to 6 spots on the six faces; used in gambling to generate random numbers\n",
100 | "a device used for shaping metal\n",
101 | "a cutting tool that is fitted into a diestock and used for cutting male (external) screw threads on screws or bolts or pipes or rods\n"
102 | ]
103 | }
104 | ],
105 | "source": [
106 | "for syn in wn.synsets('die', pos=wn.NOUN):\n",
107 | " print(syn.definition())"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": 6,
113 | "id": "after-party",
114 | "metadata": {},
115 | "outputs": [
116 | {
117 | "name": "stdout",
118 | "output_type": "stream",
119 | "text": [
120 | "pass from physical life and lose all bodily attributes and functions necessary to sustain life\n",
121 | "suffer or face the pain of death\n",
122 | "be brought to or as if to the point of death by an intense emotion such as embarrassment, amusement, or shame\n",
123 | "stop operating or functioning\n",
124 | "feel indifferent towards\n",
125 | "languish as with love or desire\n",
126 | "cut or shape with a die\n",
127 | "to be on base at the end of an inning, of a player\n",
128 | "lose sparkle or bouquet\n",
129 | "disappear or come to an end\n",
130 | "suffer spiritual death; be damned (in the religious sense)\n"
131 | ]
132 | }
133 | ],
134 | "source": [
135 | "for syn in wn.synsets('die', pos=wn.VERB):\n",
136 | " print(syn.definition())"
137 | ]
138 | },
139 | {
140 | "cell_type": "markdown",
141 | "id": "bacterial-effect",
142 | "metadata": {},
143 | "source": [
144 | "## Word-Sense Disambiguation with Lesk Algorithm"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": 7,
150 | "id": "threaded-tourism",
151 | "metadata": {},
152 | "outputs": [
153 | {
154 | "name": "stdout",
155 | "output_type": "stream",
156 | "text": [
157 | "The die is cast.\n"
158 | ]
159 | },
160 | {
161 | "data": {
162 | "text/plain": [
163 | "Synset('die.v.07')"
164 | ]
165 | },
166 | "execution_count": 7,
167 | "metadata": {},
168 | "output_type": "execute_result"
169 | }
170 | ],
171 | "source": [
172 | "print(X)\n",
173 | "wsd.lesk(X.split(), 'die')"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": 8,
179 | "id": "fluid-cargo",
180 | "metadata": {},
181 | "outputs": [
182 | {
183 | "data": {
184 | "text/plain": [
185 | "'cut or shape with a die'"
186 | ]
187 | },
188 | "execution_count": 8,
189 | "metadata": {},
190 | "output_type": "execute_result"
191 | }
192 | ],
193 | "source": [
194 | "_.definition()"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": 9,
200 | "id": "independent-melissa",
201 | "metadata": {},
202 | "outputs": [
203 | {
204 | "data": {
205 | "text/plain": [
206 | "'a cutting tool that is fitted into a diestock and used for cutting male (external) screw threads on screws or bolts or pipes or rods'"
207 | ]
208 | },
209 | "execution_count": 9,
210 | "metadata": {},
211 | "output_type": "execute_result"
212 | }
213 | ],
214 | "source": [
215 | "wsd.lesk(X.split(), 'die', pos=wn.NOUN).definition()"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 10,
221 | "id": "progressive-origin",
222 | "metadata": {},
223 | "outputs": [
224 | {
225 | "name": "stdout",
226 | "output_type": "stream",
227 | "text": [
228 | "Roll the die to get a 6.\n"
229 | ]
230 | },
231 | {
232 | "data": {
233 | "text/plain": [
234 | "'to be on base at the end of an inning, of a player'"
235 | ]
236 | },
237 | "execution_count": 10,
238 | "metadata": {},
239 | "output_type": "execute_result"
240 | }
241 | ],
242 | "source": [
243 | "print(Y)\n",
244 | "wsd.lesk(Y.split(), 'die').definition()"
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": 11,
250 | "id": "proof-while",
251 | "metadata": {},
252 | "outputs": [
253 | {
254 | "data": {
255 | "text/plain": [
256 | "'a small cube with 1 to 6 spots on the six faces; used in gambling to generate random numbers'"
257 | ]
258 | },
259 | "execution_count": 11,
260 | "metadata": {},
261 | "output_type": "execute_result"
262 | }
263 | ],
264 | "source": [
265 | "wsd.lesk(Y.split(), 'die', pos=wn.NOUN).definition()"
266 | ]
267 | },
268 | {
269 | "cell_type": "code",
270 | "execution_count": 12,
271 | "id": "middle-object",
272 | "metadata": {},
273 | "outputs": [
274 | {
275 | "name": "stdout",
276 | "output_type": "stream",
277 | "text": [
278 | "What is dead may never die.\n"
279 | ]
280 | },
281 | {
282 | "data": {
283 | "text/plain": [
284 | "'a cutting tool that is fitted into a diestock and used for cutting male (external) screw threads on screws or bolts or pipes or rods'"
285 | ]
286 | },
287 | "execution_count": 12,
288 | "metadata": {},
289 | "output_type": "execute_result"
290 | }
291 | ],
292 | "source": [
293 | "print(Z)\n",
294 | "wsd.lesk(Z.split(), 'die').definition()"
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": 13,
300 | "id": "organizational-joint",
301 | "metadata": {},
302 | "outputs": [
303 | {
304 | "data": {
305 | "text/plain": [
306 | "'stop operating or functioning'"
307 | ]
308 | },
309 | "execution_count": 13,
310 | "metadata": {},
311 | "output_type": "execute_result"
312 | }
313 | ],
314 | "source": [
315 | "wsd.lesk(Z.split(), 'die', pos=wn.VERB).definition()"
316 | ]
317 | },
318 | {
319 | "cell_type": "markdown",
320 | "id": "valuable-harbor",
321 | "metadata": {},
322 | "source": [
323 | "## Automatic POS Tagging + Lesk with spaCy"
324 | ]
325 | },
326 | {
327 | "cell_type": "code",
328 | "execution_count": null,
329 | "id": "defensive-trailer",
330 | "metadata": {},
331 | "outputs": [],
332 | "source": [
333 | "!pip install spacy"
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": 34,
339 | "id": "cellular-reflection",
340 | "metadata": {},
341 | "outputs": [],
342 | "source": [
343 | "from spacy.cli import download\n",
344 | "from spacy import load\n",
345 | "# download('en_core_web_sm')\n",
346 | "nlp = load('en_core_web_sm')"
347 | ]
348 | },
349 | {
350 | "cell_type": "code",
351 | "execution_count": 35,
352 | "id": "radical-melbourne",
353 | "metadata": {},
354 | "outputs": [],
355 | "source": [
356 | "import warnings\n",
357 | "\n",
358 | "POS_MAP = {\n",
359 | " 'VERB': wn.VERB,\n",
360 | " 'NOUN': wn.NOUN,\n",
361 | " 'PROPN': wn.NOUN\n",
362 | "}\n",
363 | "\n",
364 | "\n",
365 | "def lesk(doc, word):\n",
366 | " found = False\n",
367 | " for token in doc:\n",
368 | " if token.text == word:\n",
369 | " word = token\n",
370 | " found = True\n",
371 | " break\n",
372 | " if not found:\n",
373 | " raise ValueError(f'Word \\\"{word}\\\" does not appear in the document: {doc.text}.')\n",
374 | " pos = POS_MAP.get(word.pos_, False)\n",
375 | " if not pos:\n",
376 | " warnings.warn(f'POS tag for {word.text} not found in wordnet. Falling back to default Lesk behaviour.')\n",
377 | " args = [c.text for c in doc], word.text\n",
378 | " kwargs = dict(pos=pos)\n",
379 | " return wsd.lesk(*args, **kwargs)"
380 | ]
381 | },
382 | {
383 | "cell_type": "code",
384 | "execution_count": 36,
385 | "id": "monetary-disaster",
386 | "metadata": {},
387 | "outputs": [],
388 | "source": [
389 | "doc = nlp('Roll the die to get a 6.')"
390 | ]
391 | },
392 | {
393 | "cell_type": "code",
394 | "execution_count": 37,
395 | "id": "surgical-chrome",
396 | "metadata": {},
397 | "outputs": [
398 | {
399 | "data": {
400 | "text/plain": [
401 | "Synset('die.n.01')"
402 | ]
403 | },
404 | "execution_count": 37,
405 | "metadata": {},
406 | "output_type": "execute_result"
407 | }
408 | ],
409 | "source": [
410 | "lesk(doc, 'die')"
411 | ]
412 | },
413 | {
414 | "cell_type": "code",
415 | "execution_count": 38,
416 | "id": "cordless-bankruptcy",
417 | "metadata": {},
418 | "outputs": [
419 | {
420 | "data": {
421 | "text/plain": [
422 | "'a small cube with 1 to 6 spots on the six faces; used in gambling to generate random numbers'"
423 | ]
424 | },
425 | "execution_count": 38,
426 | "metadata": {},
427 | "output_type": "execute_result"
428 | }
429 | ],
430 | "source": [
431 | "lesk(doc, 'die').definition()"
432 | ]
433 | },
434 | {
435 | "cell_type": "code",
436 | "execution_count": 39,
437 | "id": "excess-consultancy",
438 | "metadata": {},
439 | "outputs": [
440 | {
441 | "data": {
442 | "text/plain": [
443 | "\"a widely used search engine that uses text-matching techniques to find web pages that are important and relevant to a user's search\""
444 | ]
445 | },
446 | "execution_count": 39,
447 | "metadata": {},
448 | "output_type": "execute_result"
449 | }
450 | ],
451 | "source": [
452 | "lesk(nlp('I work at google.'), 'google').definition()"
453 | ]
454 | },
455 | {
456 | "cell_type": "code",
457 | "execution_count": 40,
458 | "id": "infectious-binary",
459 | "metadata": {},
460 | "outputs": [
461 | {
462 | "data": {
463 | "text/plain": [
464 | "'search the internet (for information) using the Google search engine'"
465 | ]
466 | },
467 | "execution_count": 40,
468 | "metadata": {},
469 | "output_type": "execute_result"
470 | }
471 | ],
472 | "source": [
473 | "lesk(nlp('I will google it.'), 'google').definition()"
474 | ]
475 | },
476 | {
477 | "cell_type": "code",
478 | "execution_count": null,
479 | "id": "laughing-carolina",
480 | "metadata": {},
481 | "outputs": [],
482 | "source": []
483 | }
484 | ],
485 | "metadata": {
486 | "kernelspec": {
487 | "display_name": "Python 3",
488 | "language": "python",
489 | "name": "python3"
490 | },
491 | "language_info": {
492 | "codemirror_mode": {
493 | "name": "ipython",
494 | "version": 3
495 | },
496 | "file_extension": ".py",
497 | "mimetype": "text/x-python",
498 | "name": "python",
499 | "nbconvert_exporter": "python",
500 | "pygments_lexer": "ipython3",
501 | "version": "3.7.7"
502 | }
503 | },
504 | "nbformat": 4,
505 | "nbformat_minor": 5
506 | }
507 |
--------------------------------------------------------------------------------
/topic modelling/nmf-imdb-movie-reviews.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "interior-basics",
6 | "metadata": {},
7 | "source": [
8 | "# Inferring Topics from IMDB Reviews"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 1,
14 | "id": "established-malta",
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import numpy as np\n",
19 | "import os\n",
20 | "from sklearn.feature_extraction.text import TfidfVectorizer\n",
21 | "from sklearn.decomposition import NMF\n",
22 | "import pandas as pd\n",
23 | "import matplotlib.pyplot as plt"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "id": "political-ability",
29 | "metadata": {},
30 | "source": [
31 | "## Exploring the Dataset: [Large Movie Review Dataset](https://drive.google.com/drive/u/0/folders/1umS1MgUXyra3KVF-6FsN8krHQ31lXhlX)"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 2,
37 | "id": "physical-speaker",
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "ROOT = '../neuralnets/aclImdb/train/pos/'"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 3,
47 | "id": "solar-universe",
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "reviews = []\n",
52 | "for file in os.listdir(ROOT):\n",
53 | " path = os.path.join(ROOT, file)\n",
54 | " if os.path.isfile(path):\n",
55 | " with open(path, 'r') as fin:\n",
56 | " reviews.append(fin.read())"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": 4,
62 | "id": "permanent-pride",
63 | "metadata": {},
64 | "outputs": [
65 | {
66 | "data": {
67 | "text/plain": [
68 | "12500"
69 | ]
70 | },
71 | "execution_count": 4,
72 | "metadata": {},
73 | "output_type": "execute_result"
74 | }
75 | ],
76 | "source": [
77 | "len(reviews)"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 5,
83 | "id": "similar-commander",
84 | "metadata": {},
85 | "outputs": [
86 | {
87 | "name": "stdout",
88 | "output_type": "stream",
89 | "text": [
90 | "Not wishing to give *anything* away here, I would just say this technically excellent, flawlessly acted and uplifting little flic will reward the viewer with an excellent hour and a half's entertainment: It will amuse, surprise, possibly embarrass occasionally and almost certainly tug at the heartstrings from time to time, as it approaches the inevitable, but not obvious, ending without becoming clichéd or predictable in any way. Most definitely recommended.
A previous User's Comment gives 8 out of 10 for the film and 10 out of 10 for both Branagh and Bonham-Carter's outstanding performances - I agree entirely....\n",
91 | "======================================================================================================================================================\n",
92 | "Wrestlemania 14 is not often looked as one of the great Wrestlemania's but I would personally put it, in my top 5, if not the top 3. It has so many great things, and it truly signified the birth of The Attitude Era, which was WWE's best era, in my opinion. HBK has the heart of a lion, and him putting over Austin like he did, on his way out, was pure class on his part. It has one of the hottest crowds you will ever see, and it has J.R and The King at their announcing best!.
Matches.
15 team battle royal LOUD pop for L.O.D's return. I'm not a fan of battle royal's, and this is yet another average one. Very predictable, even when you 1st see it, it's obvious L.O.D would win. Looking at Sunny for 8 or so minutes though, definitely helps.
2/5
WWF Light Heavyweight Championship
Taka Michinoku|C| Vs Aguila.
Taka gets a surprising pop, with his entrance. Fast, high-flying, and very exciting. If these two had more time, they would have surely tore the roof off, with their stuff. Taka wins with the Michinoku driver.
3 1/2 /5
WWF European Championship.
Triple H|C| Vs Owen Hart Stipulation here, is Chyna is handcuffed to Slaughter. Nice pop for Owen, mixed reaction for Trips. A really, really underrated match, that ranks among one of my favorites for Wrestlemania, actually. The two mixed together very well, and Owen can go with anybody. Trips wins, with Chyna interference.
4/5
Mixed Tag match. Marc Mero&Sable Vs Goldust&Luna. Defining pop for Sable, unheard of that time, for woman. Sable actually looks hot, and the crowd is just eating her up!. Constant Sable chants, and them erupting almost every time she gets in the ring. Not bad for a Mixed tag match, it had entertaining antics, and passed the time well. Sable's team wins, when Sable hits the TKO.
2 1/2 /5
WWF Intercontinental Championship. Ken Shamrock Vs The Rock|C|. Before I review the match, I'd like to note The Rock showed off his immense potential, with his interview with Jennifer Flowers, before his match. Nice pop for Shamrock, big time heat for The Rock. Too disappointingly short, and I thought the ending was kinda stupid, though Shamrock's snapping antics were awesome to see, and the crowd went nuts for it. Rock keeps the title, when The Ref reverses the decision.
2/5
Dumpster match, for The WWF Tag Team Championship
Catcus Jack&Terry Funk Vs The New Age Outlaws. The Outlaws are not as over, as they were gonna be at this time. Crowd is actually somewhat dead for this, but I thought it had some great Hardcore bits, with some sick looking bumps. Cactus and Terry win the titles in the end.
3/5
The Undertaker vs Kane. Big time ovation, for The Undertaker. Much better than there outing at Wrestlemania 20, and for a big man vs big man match, this was really good. It was a great all out brawl, with The Undertaker taking a sick looking bump, through the table. WWE was smart, by making Kane looking strong, even through defeat. After 2 tombstone kick out's, Taker finally puts him away, with a 3rd one.
3 1/2 /5
WWF Championship.
Special Guest Enforcer \"Mike Tyson\"
HBK|C| Vs Steve Austin. Big heat for Tyson. Crowd goes ape sh*t for Austin, definitely one of the biggest pops I have heard. Mixed reaction, for HBK. This is truly a special match up, one of the greatest wrestlemania main events in history, you can tell when J.R is even out of breath. HBK gives it his all, in what was supposed to be his last match, and Austin has rarely been better. The animosity and electricity from the crowd is amazing, and it's as exciting as it gets. Austin wins with the stunner, with Tyson joining 3:16 by knocking out Michaels. Austin's celebratory victory, is a wonder to behold, with one of the nosiest crowd's you will ever see, King said it right, they were going nuts.
5/5
Bottom line. Wrestlemania 14 is one of the greatest for real. It has everything you want in a Wrestlemania, and truly kick started the Attitude Era. This is very special to me, because it was the 1st Wrestlemania I ever saw, back in 98. \"The Austin Era, has begun!\"
9 1/2 /10\n",
93 | "======================================================================================================================================================\n",
94 | "It could have been a better film. It does drag at points, and the central story shifts from Boyer completing his mission to Boyer avenging Wanda Hendrix's death, but Graham Greene is an author who is really hard to spoil. His stories are all morality tales, due to his own considerations of Catholicism, guilt and innocence (very relative terms in his world view), and the human condition.
Boyer is Luis Denard, a well-known concert pianist, who has sided with the Republicans in the Spanish Civil War. He has been sent to England to try to carry through an arms purchase deal that is desperately needed. Unfortunately for Denard he is literally on his own - everyone of his contacts turns out to be a willing turncoat for the Falagists of Spain. In particular Katina Paxinou (Mrs. Melendez) a grim boarding house keeper, and Peter Lorre (Mr. Contreras) a teacher of an \"esperanto\" type international language. Wanda Hendrix is the drudge of a girl (Else) who works for Mrs. Melendez. The local diplomat, Licata (Victor Francken) is already a willing associate of the Falangists.
The Brits (Holmes Herbert, Miles Mander, and best - if not worst - of the lot, George Coulouris) don't give much hope to Boyer's cause (which he soon grasps may be Britain's before long). Herbert and Mander just retreat behind the official policy of neutrality ordered by the Ramsay MacDonald's and Stanley Baldwin's governments during the Civil War. Coulouris here is a typical Col. Blimp type - always impeccable in his native English diction, he is sharp in showing his dislike for foreigners in general.
The one ray of hope is Lauren Bacall (Rose Cullen), here trying to play her role as well as she can - but she can't really. She's an aristocrat - the daughter of a Press lord. It was Bacall's second film, and (sad to say) almost sank her long career. She does act well, but the spark she showed in her first film was due to the dual effect of starring with Humphrey Bogart and being directed by Howard Hawks. Boyer is a fine actor, but he's not Bogie, and Herman Shumlin is not Hawks. Her next film returned her to Bogie and Hawks again, and her star resumed it's ascendancy.
It's a bleak film (as was the novel). Boyer's mission never succeeds, as he has too many hidden foes all over the place. But the villains are likewise also losers - frequently with their lives.
With Dan Seymour as a suspicious foreign tenant of Katina Paxinou (and the man who destroys her). It is well worth watching to catch the Warner's lot of character actors doing their best given the weakness in direction.\n",
95 | "======================================================================================================================================================\n"
96 | ]
97 | }
98 | ],
99 | "source": [
100 | "for i in range(3):\n",
101 | " print(reviews[i])\n",
102 | " print('=' * 150)"
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "id": "blind-relative",
108 | "metadata": {},
109 | "source": [
110 | "## Feature Extraction"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": 8,
116 | "id": "fuzzy-legislation",
117 | "metadata": {},
118 | "outputs": [
119 | {
120 | "data": {
121 | "text/html": [
122 | "
\n",
123 | "\n",
136 | "
\n",
137 | " \n",
138 | " \n",
139 | " | \n",
140 | " 00 | \n",
141 | " 000 | \n",
142 | " 000s | \n",
143 | " 003830 | \n",
144 | " 006 | \n",
145 | " 007 | \n",
146 | " 0079 | \n",
147 | " 0080 | \n",
148 | " 0083 | \n",
149 | " 0093638 | \n",
150 | " ... | \n",
151 | " élan | \n",
152 | " émigré | \n",
153 | " émigrés | \n",
154 | " était | \n",
155 | " état | \n",
156 | " étc | \n",
157 | " êxtase | \n",
158 | " ís | \n",
159 | " østbye | \n",
160 | " über | \n",
161 | "
\n",
162 | " \n",
163 | " \n",
164 | " \n",
165 | " | 0 | \n",
166 | " 0.0 | \n",
167 | " 0.0 | \n",
168 | " 0.0 | \n",
169 | " 0.0 | \n",
170 | " 0.0 | \n",
171 | " 0.0 | \n",
172 | " 0.0 | \n",
173 | " 0.0 | \n",
174 | " 0.0 | \n",
175 | " 0.0 | \n",
176 | " ... | \n",
177 | " 0.0 | \n",
178 | " 0.0 | \n",
179 | " 0.0 | \n",
180 | " 0.0 | \n",
181 | " 0.0 | \n",
182 | " 0.0 | \n",
183 | " 0.0 | \n",
184 | " 0.0 | \n",
185 | " 0.0 | \n",
186 | " 0.0 | \n",
187 | "
\n",
188 | " \n",
189 | " | 1 | \n",
190 | " 0.0 | \n",
191 | " 0.0 | \n",
192 | " 0.0 | \n",
193 | " 0.0 | \n",
194 | " 0.0 | \n",
195 | " 0.0 | \n",
196 | " 0.0 | \n",
197 | " 0.0 | \n",
198 | " 0.0 | \n",
199 | " 0.0 | \n",
200 | " ... | \n",
201 | " 0.0 | \n",
202 | " 0.0 | \n",
203 | " 0.0 | \n",
204 | " 0.0 | \n",
205 | " 0.0 | \n",
206 | " 0.0 | \n",
207 | " 0.0 | \n",
208 | " 0.0 | \n",
209 | " 0.0 | \n",
210 | " 0.0 | \n",
211 | "
\n",
212 | " \n",
213 | " | 2 | \n",
214 | " 0.0 | \n",
215 | " 0.0 | \n",
216 | " 0.0 | \n",
217 | " 0.0 | \n",
218 | " 0.0 | \n",
219 | " 0.0 | \n",
220 | " 0.0 | \n",
221 | " 0.0 | \n",
222 | " 0.0 | \n",
223 | " 0.0 | \n",
224 | " ... | \n",
225 | " 0.0 | \n",
226 | " 0.0 | \n",
227 | " 0.0 | \n",
228 | " 0.0 | \n",
229 | " 0.0 | \n",
230 | " 0.0 | \n",
231 | " 0.0 | \n",
232 | " 0.0 | \n",
233 | " 0.0 | \n",
234 | " 0.0 | \n",
235 | "
\n",
236 | " \n",
237 | " | 3 | \n",
238 | " 0.0 | \n",
239 | " 0.0 | \n",
240 | " 0.0 | \n",
241 | " 0.0 | \n",
242 | " 0.0 | \n",
243 | " 0.0 | \n",
244 | " 0.0 | \n",
245 | " 0.0 | \n",
246 | " 0.0 | \n",
247 | " 0.0 | \n",
248 | " ... | \n",
249 | " 0.0 | \n",
250 | " 0.0 | \n",
251 | " 0.0 | \n",
252 | " 0.0 | \n",
253 | " 0.0 | \n",
254 | " 0.0 | \n",
255 | " 0.0 | \n",
256 | " 0.0 | \n",
257 | " 0.0 | \n",
258 | " 0.0 | \n",
259 | "
\n",
260 | " \n",
261 | " | 4 | \n",
262 | " 0.0 | \n",
263 | " 0.0 | \n",
264 | " 0.0 | \n",
265 | " 0.0 | \n",
266 | " 0.0 | \n",
267 | " 0.0 | \n",
268 | " 0.0 | \n",
269 | " 0.0 | \n",
270 | " 0.0 | \n",
271 | " 0.0 | \n",
272 | " ... | \n",
273 | " 0.0 | \n",
274 | " 0.0 | \n",
275 | " 0.0 | \n",
276 | " 0.0 | \n",
277 | " 0.0 | \n",
278 | " 0.0 | \n",
279 | " 0.0 | \n",
280 | " 0.0 | \n",
281 | " 0.0 | \n",
282 | " 0.0 | \n",
283 | "
\n",
284 | " \n",
285 | " | ... | \n",
286 | " ... | \n",
287 | " ... | \n",
288 | " ... | \n",
289 | " ... | \n",
290 | " ... | \n",
291 | " ... | \n",
292 | " ... | \n",
293 | " ... | \n",
294 | " ... | \n",
295 | " ... | \n",
296 | " ... | \n",
297 | " ... | \n",
298 | " ... | \n",
299 | " ... | \n",
300 | " ... | \n",
301 | " ... | \n",
302 | " ... | \n",
303 | " ... | \n",
304 | " ... | \n",
305 | " ... | \n",
306 | " ... | \n",
307 | "
\n",
308 | " \n",
309 | " | 12495 | \n",
310 | " 0.0 | \n",
311 | " 0.0 | \n",
312 | " 0.0 | \n",
313 | " 0.0 | \n",
314 | " 0.0 | \n",
315 | " 0.0 | \n",
316 | " 0.0 | \n",
317 | " 0.0 | \n",
318 | " 0.0 | \n",
319 | " 0.0 | \n",
320 | " ... | \n",
321 | " 0.0 | \n",
322 | " 0.0 | \n",
323 | " 0.0 | \n",
324 | " 0.0 | \n",
325 | " 0.0 | \n",
326 | " 0.0 | \n",
327 | " 0.0 | \n",
328 | " 0.0 | \n",
329 | " 0.0 | \n",
330 | " 0.0 | \n",
331 | "
\n",
332 | " \n",
333 | " | 12496 | \n",
334 | " 0.0 | \n",
335 | " 0.0 | \n",
336 | " 0.0 | \n",
337 | " 0.0 | \n",
338 | " 0.0 | \n",
339 | " 0.0 | \n",
340 | " 0.0 | \n",
341 | " 0.0 | \n",
342 | " 0.0 | \n",
343 | " 0.0 | \n",
344 | " ... | \n",
345 | " 0.0 | \n",
346 | " 0.0 | \n",
347 | " 0.0 | \n",
348 | " 0.0 | \n",
349 | " 0.0 | \n",
350 | " 0.0 | \n",
351 | " 0.0 | \n",
352 | " 0.0 | \n",
353 | " 0.0 | \n",
354 | " 0.0 | \n",
355 | "
\n",
356 | " \n",
357 | " | 12497 | \n",
358 | " 0.0 | \n",
359 | " 0.0 | \n",
360 | " 0.0 | \n",
361 | " 0.0 | \n",
362 | " 0.0 | \n",
363 | " 0.0 | \n",
364 | " 0.0 | \n",
365 | " 0.0 | \n",
366 | " 0.0 | \n",
367 | " 0.0 | \n",
368 | " ... | \n",
369 | " 0.0 | \n",
370 | " 0.0 | \n",
371 | " 0.0 | \n",
372 | " 0.0 | \n",
373 | " 0.0 | \n",
374 | " 0.0 | \n",
375 | " 0.0 | \n",
376 | " 0.0 | \n",
377 | " 0.0 | \n",
378 | " 0.0 | \n",
379 | "
\n",
380 | " \n",
381 | " | 12498 | \n",
382 | " 0.0 | \n",
383 | " 0.0 | \n",
384 | " 0.0 | \n",
385 | " 0.0 | \n",
386 | " 0.0 | \n",
387 | " 0.0 | \n",
388 | " 0.0 | \n",
389 | " 0.0 | \n",
390 | " 0.0 | \n",
391 | " 0.0 | \n",
392 | " ... | \n",
393 | " 0.0 | \n",
394 | " 0.0 | \n",
395 | " 0.0 | \n",
396 | " 0.0 | \n",
397 | " 0.0 | \n",
398 | " 0.0 | \n",
399 | " 0.0 | \n",
400 | " 0.0 | \n",
401 | " 0.0 | \n",
402 | " 0.0 | \n",
403 | "
\n",
404 | " \n",
405 | " | 12499 | \n",
406 | " 0.0 | \n",
407 | " 0.0 | \n",
408 | " 0.0 | \n",
409 | " 0.0 | \n",
410 | " 0.0 | \n",
411 | " 0.0 | \n",
412 | " 0.0 | \n",
413 | " 0.0 | \n",
414 | " 0.0 | \n",
415 | " 0.0 | \n",
416 | " ... | \n",
417 | " 0.0 | \n",
418 | " 0.0 | \n",
419 | " 0.0 | \n",
420 | " 0.0 | \n",
421 | " 0.0 | \n",
422 | " 0.0 | \n",
423 | " 0.0 | \n",
424 | " 0.0 | \n",
425 | " 0.0 | \n",
426 | " 0.0 | \n",
427 | "
\n",
428 | " \n",
429 | "
\n",
430 | "
12500 rows × 55428 columns
\n",
431 | "
"
432 | ],
433 | "text/plain": [
434 | " 00 000 000s 003830 006 007 0079 0080 0083 0093638 ... élan \\\n",
435 | "0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n",
436 | "1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n",
437 | "2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n",
438 | "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n",
439 | "4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n",
440 | "... ... ... ... ... ... ... ... ... ... ... ... ... \n",
441 | "12495 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n",
442 | "12496 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n",
443 | "12497 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n",
444 | "12498 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n",
445 | "12499 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n",
446 | "\n",
447 | " émigré émigrés était état étc êxtase ís østbye über \n",
448 | "0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
449 | "1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
450 | "2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
451 | "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
452 | "4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
453 | "... ... ... ... ... ... ... ... ... ... \n",
454 | "12495 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
455 | "12496 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
456 | "12497 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
457 | "12498 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
458 | "12499 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
459 | "\n",
460 | "[12500 rows x 55428 columns]"
461 | ]
462 | },
463 | "execution_count": 8,
464 | "metadata": {},
465 | "output_type": "execute_result"
466 | }
467 | ],
468 | "source": [
469 | "vect = TfidfVectorizer(stop_words='english')\n",
470 | "X = vect.fit_transform(reviews)\n",
471 | "\n",
472 | "pd.DataFrame(X.toarray(), columns=vect.get_feature_names())"
473 | ]
474 | },
475 | {
476 | "cell_type": "markdown",
477 | "id": "close-advantage",
478 | "metadata": {},
479 | "source": [
480 | "## NMF Decomposition"
481 | ]
482 | },
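{
"cell_type": "markdown",
"id": "added-nmf-refresher",
"metadata": {},
"source": [
"*Added refresher (standard NMF formulation):* with the TF-IDF matrix $X \\in \\mathbb{R}^{m \\times n}$ ($m$ reviews, $n$ terms), NMF finds non-negative factors\n",
"\n",
"$$X \\approx W H, \\qquad W \\in \\mathbb{R}_{\\ge 0}^{m \\times k}, \\quad H \\in \\mathbb{R}_{\\ge 0}^{k \\times n},$$\n",
"\n",
"where $k$ is the number of topics (15 below). Each row of $W$ holds a document's weights over the topics, and each row of $H$ holds a topic's weights over the vocabulary."
]
},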
483 | {
484 | "cell_type": "code",
485 | "execution_count": 14,
486 | "id": "surprising-lindsay",
487 | "metadata": {},
488 | "outputs": [
489 | {
490 | "name": "stderr",
491 | "output_type": "stream",
492 | "text": [
493 | "/home/jaidevd/anaconda3/lib/python3.7/site-packages/sklearn/decomposition/_nmf.py:315: FutureWarning: The 'init' value, when 'init=None' and n_components is less than n_samples and n_features, will be changed from 'nndsvd' to 'nndsvda' in 1.1 (renaming of 0.26).\n",
494 | " \"'nndsvda' in 1.1 (renaming of 0.26).\"), FutureWarning)\n",
495 | "/home/jaidevd/anaconda3/lib/python3.7/site-packages/sklearn/decomposition/_nmf.py:1091: ConvergenceWarning: Maximum number of iterations 200 reached. Increase it to improve convergence.\n",
496 | " \" improve convergence.\" % max_iter, ConvergenceWarning)\n"
497 | ]
498 | }
499 | ],
500 | "source": [
501 | "N_TOPICS = 15\n",
502 | "nmf = NMF(n_components=N_TOPICS)\n",
503 | "W = nmf.fit_transform(X) # Document-topic matrix\n",
504 | "H = nmf.components_ # Topic-term matrix"
505 | ]
506 | },
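{
"cell_type": "markdown",
"id": "added-nmf-shape-note",
"metadata": {},
"source": [
"*Added sanity check (no recorded output):* the factor shapes should line up with the refresher above, and the fitted scikit-learn model exposes its Frobenius reconstruction error."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-nmf-shape-check",
"metadata": {},
"outputs": [],
"source": [
"# X: (documents, terms), W: (documents, topics), H: (topics, terms)\n",
"print(X.shape, W.shape, H.shape)\n",
"print('Reconstruction error (Frobenius):', nmf.reconstruction_err_)"
]
},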
507 | {
508 | "cell_type": "code",
509 | "execution_count": 15,
510 | "id": "african-corps",
511 | "metadata": {},
512 | "outputs": [
513 | {
514 | "data": {
515 | "text/html": [
516 | "\n",
517 | "\n",
530 | "
\n",
531 | " \n",
532 | " \n",
533 | " | \n",
534 | " Word 1 | \n",
535 | " Word 2 | \n",
536 | " Word 3 | \n",
537 | " Word 4 | \n",
538 | " Word 5 | \n",
539 | " Word 6 | \n",
540 | " Word 7 | \n",
541 | " Word 8 | \n",
542 | " Word 9 | \n",
543 | " Word 10 | \n",
544 | "
\n",
545 | " \n",
546 | " \n",
547 | " \n",
548 | " | Topic 1 | \n",
549 | " br | \n",
550 | " 10 | \n",
551 | " ll | \n",
552 | " spoilers | \n",
553 | " end | \n",
554 | " simply | \n",
555 | " yes | \n",
556 | " spoiler | \n",
557 | " quite | \n",
558 | " just | \n",
559 | "
\n",
560 | " \n",
561 | " | Topic 2 | \n",
562 | " movie | \n",
563 | " movies | \n",
564 | " watch | \n",
565 | " recommend | \n",
566 | " 10 | \n",
567 | " seen | \n",
568 | " saw | \n",
569 | " best | \n",
570 | " actors | \n",
571 | " definitely | \n",
572 | "
\n",
573 | " \n",
574 | " | Topic 3 | \n",
575 | " film | \n",
576 | " films | \n",
577 | " director | \n",
578 | " characters | \n",
579 | " seen | \n",
580 | " cinema | \n",
581 | " festival | \n",
582 | " work | \n",
583 | " scenes | \n",
584 | " art | \n",
585 | "
\n",
586 | " \n",
587 | " | Topic 4 | \n",
588 | " series | \n",
589 | " episode | \n",
590 | " episodes | \n",
591 | " season | \n",
592 | " tv | \n",
593 | " characters | \n",
594 | " trek | \n",
595 | " seasons | \n",
596 | " shows | \n",
597 | " television | \n",
598 | "
\n",
599 | " \n",
600 | " | Topic 5 | \n",
601 | " man | \n",
602 | " role | \n",
603 | " character | \n",
604 | " performance | \n",
605 | " best | \n",
606 | " plays | \n",
607 | " john | \n",
608 | " played | \n",
609 | " does | \n",
610 | " actor | \n",
611 | "
\n",
612 | " \n",
613 | " | Topic 6 | \n",
614 | " good | \n",
615 | " pretty | \n",
616 | " story | \n",
617 | " bad | \n",
618 | " acting | \n",
619 | " really | \n",
620 | " job | \n",
621 | " liked | \n",
622 | " nice | \n",
623 | " little | \n",
624 | "
\n",
625 | " \n",
626 | " | Topic 7 | \n",
627 | " war | \n",
628 | " world | \n",
629 | " documentary | \n",
630 | " people | \n",
631 | " american | \n",
632 | " history | \n",
633 | " soldiers | \n",
634 | " men | \n",
635 | " women | \n",
636 | " hitler | \n",
637 | "
\n",
638 | " \n",
639 | " | Topic 8 | \n",
640 | " funny | \n",
641 | " comedy | \n",
642 | " laugh | \n",
643 | " hilarious | \n",
644 | " eddie | \n",
645 | " fun | \n",
646 | " jokes | \n",
647 | " humor | \n",
648 | " funniest | \n",
649 | " murphy | \n",
650 | "
\n",
651 | " \n",
652 | " | Topic 9 | \n",
653 | " like | \n",
654 | " think | \n",
655 | " really | \n",
656 | " just | \n",
657 | " don | \n",
658 | " people | \n",
659 | " know | \n",
660 | " say | \n",
661 | " didn | \n",
662 | " lot | \n",
663 | "
\n",
664 | " \n",
665 | " | Topic 10 | \n",
666 | " time | \n",
667 | " years | \n",
668 | " saw | \n",
669 | " seen | \n",
670 | " dvd | \n",
671 | " old | \n",
672 | " remember | \n",
673 | " ve | \n",
674 | " music | \n",
675 | " disney | \n",
676 | "
\n",
677 | " \n",
678 | " | Topic 11 | \n",
679 | " life | \n",
680 | " story | \n",
681 | " love | \n",
682 | " family | \n",
683 | " real | \n",
684 | " characters | \n",
685 | " people | \n",
686 | " young | \n",
687 | " beautiful | \n",
688 | " true | \n",
689 | "
\n",
690 | " \n",
691 | " | Topic 12 | \n",
692 | " book | \n",
693 | " jane | \n",
694 | " version | \n",
695 | " read | \n",
696 | " eyre | \n",
697 | " novel | \n",
698 | " rochester | \n",
699 | " dalton | \n",
700 | " tarzan | \n",
701 | " emma | \n",
702 | "
\n",
703 | " \n",
704 | " | Topic 13 | \n",
705 | " horror | \n",
706 | " house | \n",
707 | " creepy | \n",
708 | " scary | \n",
709 | " gore | \n",
710 | " films | \n",
711 | " halloween | \n",
712 | " budget | \n",
713 | " fans | \n",
714 | " effects | \n",
715 | "
\n",
716 | " \n",
717 | " | Topic 14 | \n",
718 | " great | \n",
719 | " acting | \n",
720 | " really | \n",
721 | " actors | \n",
722 | " cast | \n",
723 | " job | \n",
724 | " best | \n",
725 | " music | \n",
726 | " wonderful | \n",
727 | " just | \n",
728 | "
\n",
729 | " \n",
730 | " | Topic 15 | \n",
731 | " action | \n",
732 | " jackie | \n",
733 | " chan | \n",
734 | " scenes | \n",
735 | " fu | \n",
736 | " kung | \n",
737 | " fight | \n",
738 | " martial | \n",
739 | " bourne | \n",
740 | " story | \n",
741 | "
\n",
742 | " \n",
743 | "
\n",
744 | "
"
745 | ],
746 | "text/plain": [
747 | " Word 1 Word 2 Word 3 Word 4 Word 5 Word 6 \\\n",
748 | "Topic 1 br 10 ll spoilers end simply \n",
749 | "Topic 2 movie movies watch recommend 10 seen \n",
750 | "Topic 3 film films director characters seen cinema \n",
751 | "Topic 4 series episode episodes season tv characters \n",
752 | "Topic 5 man role character performance best plays \n",
753 | "Topic 6 good pretty story bad acting really \n",
754 | "Topic 7 war world documentary people american history \n",
755 | "Topic 8 funny comedy laugh hilarious eddie fun \n",
756 | "Topic 9 like think really just don people \n",
757 | "Topic 10 time years saw seen dvd old \n",
758 | "Topic 11 life story love family real characters \n",
759 | "Topic 12 book jane version read eyre novel \n",
760 | "Topic 13 horror house creepy scary gore films \n",
761 | "Topic 14 great acting really actors cast job \n",
762 | "Topic 15 action jackie chan scenes fu kung \n",
763 | "\n",
764 | " Word 7 Word 8 Word 9 Word 10 \n",
765 | "Topic 1 yes spoiler quite just \n",
766 | "Topic 2 saw best actors definitely \n",
767 | "Topic 3 festival work scenes art \n",
768 | "Topic 4 trek seasons shows television \n",
769 | "Topic 5 john played does actor \n",
770 | "Topic 6 job liked nice little \n",
771 | "Topic 7 soldiers men women hitler \n",
772 | "Topic 8 jokes humor funniest murphy \n",
773 | "Topic 9 know say didn lot \n",
774 | "Topic 10 remember ve music disney \n",
775 | "Topic 11 people young beautiful true \n",
776 | "Topic 12 rochester dalton tarzan emma \n",
777 | "Topic 13 halloween budget fans effects \n",
778 | "Topic 14 best music wonderful just \n",
779 | "Topic 15 fight martial bourne story "
780 | ]
781 | },
782 | "execution_count": 15,
783 | "metadata": {},
784 | "output_type": "execute_result"
785 | }
786 | ],
787 | "source": [
788 | "# Top 10 words per topic\n",
789 | "\n",
790 | "words = np.array(vect.get_feature_names())\n",
791 | "topic_words = pd.DataFrame(np.zeros((N_TOPICS, 10)), index=[f'Topic {i + 1}' for i in range(N_TOPICS)],\n",
792 | " columns=[f'Word {i + 1}' for i in range(10)]).astype(str)\n",
793 | "for i in range(N_TOPICS):\n",
794 | " ix = H[i].argsort()[::-1][:10]\n",
795 | " topic_words.iloc[i] = words[ix]\n",
796 | "\n",
797 | "topic_words"
798 | ]
799 | },
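{
"cell_type": "markdown",
"id": "added-br-cleanup-note",
"metadata": {},
"source": [
"Note that the top term of Topic 1 is `br`, which is residue of the `<br />` HTML line breaks in the raw reviews rather than a meaningful topic word. A minimal cleanup one could apply before vectorising (added sketch, not re-run here) is to strip those tags first:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-br-cleanup",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"# Replace <br>, <br/> and <br /> tags with spaces before TF-IDF vectorisation\n",
"clean_reviews = [re.sub(r'<br\\s*/?>', ' ', review) for review in reviews]\n",
"# Refitting TfidfVectorizer on clean_reviews would drop 'br' from the vocabulary."
]
},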
800 | {
801 | "cell_type": "code",
802 | "execution_count": 16,
803 | "id": "thousand-clearance",
804 | "metadata": {},
805 | "outputs": [],
806 | "source": [
807 | "# Create a topic mapping\n",
808 | "\n",
809 | "topic_mapping = {\n",
810 | " 'Topic 4': 'TV',\n",
811 | " 'Topic 7': 'War',\n",
812 | " 'Topic 8': 'Comedy',\n",
813 | " 'Topic 12': 'Book Adaptation',\n",
814 | " 'Topic 13': 'Horror',\n",
815 | " 'Topic 15': 'Martial Arts / Action'\n",
816 | "}"
817 | ]
818 | },
819 | {
820 | "cell_type": "code",
821 | "execution_count": 17,
822 | "id": "intellectual-somerset",
823 | "metadata": {},
824 | "outputs": [
825 | {
826 | "data": {
827 | "text/html": [
828 | "\n",
829 | "\n",
842 | "
\n",
843 | " \n",
844 | " \n",
845 | " | \n",
846 | " Topic 1 | \n",
847 | " Topic 2 | \n",
848 | " Topic 3 | \n",
849 | " Topic 4 | \n",
850 | " Topic 5 | \n",
851 | " Topic 6 | \n",
852 | " Topic 7 | \n",
853 | " Topic 8 | \n",
854 | " Topic 9 | \n",
855 | " Topic 10 | \n",
856 | " Topic 11 | \n",
857 | " Topic 12 | \n",
858 | " Topic 13 | \n",
859 | " Topic 14 | \n",
860 | " Topic 15 | \n",
861 | " max_topic | \n",
862 | "
\n",
863 | " \n",
864 | " \n",
865 | " \n",
866 | " | 2 | \n",
867 | " 0.028314 | \n",
868 | " 0.000000 | \n",
869 | " 0.022122 | \n",
870 | " 0.001480 | \n",
871 | " 0.023043 | \n",
872 | " 0.002044 | \n",
873 | " 0.030939 | \n",
874 | " 0.000000 | \n",
875 | " 0.006389 | \n",
876 | " 0.000000 | \n",
877 | " 0.000774 | \n",
878 | " 0.007251 | \n",
879 | " 0.000000 | \n",
880 | " 0.003574 | \n",
881 | " 0.000000 | \n",
882 | " War | \n",
883 | "
\n",
884 | " \n",
885 | " | 16 | \n",
886 | " 0.000251 | \n",
887 | " 0.000000 | \n",
888 | " 0.001575 | \n",
889 | " 0.000000 | \n",
890 | " 0.029132 | \n",
891 | " 0.002257 | \n",
892 | " 0.000000 | \n",
893 | " 0.033108 | \n",
894 | " 0.016283 | \n",
895 | " 0.000000 | \n",
896 | " 0.012337 | \n",
897 | " 0.000000 | \n",
898 | " 0.003595 | \n",
899 | " 0.011944 | \n",
900 | " 0.010159 | \n",
901 | " Comedy | \n",
902 | "
\n",
903 | " \n",
904 | " | 18 | \n",
905 | " 0.029574 | \n",
906 | " 0.000000 | \n",
907 | " 0.019010 | \n",
908 | " 0.001797 | \n",
909 | " 0.016906 | \n",
910 | " 0.008574 | \n",
911 | " 0.000129 | \n",
912 | " 0.038010 | \n",
913 | " 0.005558 | \n",
914 | " 0.006250 | \n",
915 | " 0.036652 | \n",
916 | " 0.000000 | \n",
917 | " 0.000000 | \n",
918 | " 0.000000 | \n",
919 | " 0.000000 | \n",
920 | " Comedy | \n",
921 | "
\n",
922 | " \n",
923 | " | 26 | \n",
924 | " 0.015179 | \n",
925 | " 0.000349 | \n",
926 | " 0.000000 | \n",
927 | " 0.000000 | \n",
928 | " 0.015907 | \n",
929 | " 0.012349 | \n",
930 | " 0.000000 | \n",
931 | " 0.034328 | \n",
932 | " 0.015722 | \n",
933 | " 0.008809 | \n",
934 | " 0.004318 | \n",
935 | " 0.000000 | \n",
936 | " 0.000000 | \n",
937 | " 0.001958 | \n",
938 | " 0.000922 | \n",
939 | " Comedy | \n",
940 | "
\n",
941 | " \n",
942 | " | 27 | \n",
943 | " 0.031523 | \n",
944 | " 0.008099 | \n",
945 | " 0.000171 | \n",
946 | " 0.003151 | \n",
947 | " 0.009975 | \n",
948 | " 0.001411 | \n",
949 | " 0.035158 | \n",
950 | " 0.042588 | \n",
951 | " 0.000000 | \n",
952 | " 0.000000 | \n",
953 | " 0.001425 | \n",
954 | " 0.002624 | \n",
955 | " 0.000000 | \n",
956 | " 0.003865 | \n",
957 | " 0.002781 | \n",
958 | " Comedy | \n",
959 | "
\n",
960 | " \n",
961 | " | 29 | \n",
962 | " 0.000000 | \n",
963 | " 0.000614 | \n",
964 | " 0.000000 | \n",
965 | " 0.000000 | \n",
966 | " 0.000000 | \n",
967 | " 0.014862 | \n",
968 | " 0.000000 | \n",
969 | " 0.014987 | \n",
970 | " 0.010941 | \n",
971 | " 0.000000 | \n",
972 | " 0.000000 | \n",
973 | " 0.001534 | \n",
974 | " 0.066263 | \n",
975 | " 0.000000 | \n",
976 | " 0.036239 | \n",
977 | " Horror | \n",
978 | "
\n",
979 | " \n",
980 | " | 30 | \n",
981 | " 0.023404 | \n",
982 | " 0.012107 | \n",
983 | " 0.016814 | \n",
984 | " 0.000000 | \n",
985 | " 0.008135 | \n",
986 | " 0.009620 | \n",
987 | " 0.001377 | \n",
988 | " 0.040382 | \n",
989 | " 0.000809 | \n",
990 | " 0.004582 | \n",
991 | " 0.004803 | \n",
992 | " 0.001186 | \n",
993 | " 0.014194 | \n",
994 | " 0.000000 | \n",
995 | " 0.000000 | \n",
996 | " Comedy | \n",
997 | "
\n",
998 | " \n",
999 | " | 31 | \n",
1000 | " 0.012324 | \n",
1001 | " 0.003554 | \n",
1002 | " 0.028753 | \n",
1003 | " 0.000000 | \n",
1004 | " 0.017125 | \n",
1005 | " 0.003483 | \n",
1006 | " 0.006804 | \n",
1007 | " 0.000000 | \n",
1008 | " 0.003702 | \n",
1009 | " 0.000000 | \n",
1010 | " 0.006449 | \n",
1011 | " 0.000833 | \n",
1012 | " 0.034161 | \n",
1013 | " 0.005682 | \n",
1014 | " 0.000000 | \n",
1015 | " Horror | \n",
1016 | "
\n",
1017 | " \n",
1018 | " | 34 | \n",
1019 | " 0.000000 | \n",
1020 | " 0.016503 | \n",
1021 | " 0.000000 | \n",
1022 | " 0.000000 | \n",
1023 | " 0.013825 | \n",
1024 | " 0.000000 | \n",
1025 | " 0.000000 | \n",
1026 | " 0.038567 | \n",
1027 | " 0.004479 | \n",
1028 | " 0.021462 | \n",
1029 | " 0.000000 | \n",
1030 | " 0.000000 | \n",
1031 | " 0.000000 | \n",
1032 | " 0.010132 | \n",
1033 | " 0.000000 | \n",
1034 | " Comedy | \n",
1035 | "
\n",
1036 | " \n",
1037 | " | 58 | \n",
1038 | " 0.000228 | \n",
1039 | " 0.046686 | \n",
1040 | " 0.000000 | \n",
1041 | " 0.000000 | \n",
1042 | " 0.000100 | \n",
1043 | " 0.000000 | \n",
1044 | " 0.004866 | \n",
1045 | " 0.000000 | \n",
1046 | " 0.001639 | \n",
1047 | " 0.013741 | \n",
1048 | " 0.037063 | \n",
1049 | " 0.069237 | \n",
1050 | " 0.000000 | \n",
1051 | " 0.012097 | \n",
1052 | " 0.000000 | \n",
1053 | " Book Adaptation | \n",
1054 | "
\n",
1055 | " \n",
1056 | "
\n",
1057 | "
"
1058 | ],
1059 | "text/plain": [
1060 | " Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6 Topic 7 \\\n",
1061 | "2 0.028314 0.000000 0.022122 0.001480 0.023043 0.002044 0.030939 \n",
1062 | "16 0.000251 0.000000 0.001575 0.000000 0.029132 0.002257 0.000000 \n",
1063 | "18 0.029574 0.000000 0.019010 0.001797 0.016906 0.008574 0.000129 \n",
1064 | "26 0.015179 0.000349 0.000000 0.000000 0.015907 0.012349 0.000000 \n",
1065 | "27 0.031523 0.008099 0.000171 0.003151 0.009975 0.001411 0.035158 \n",
1066 | "29 0.000000 0.000614 0.000000 0.000000 0.000000 0.014862 0.000000 \n",
1067 | "30 0.023404 0.012107 0.016814 0.000000 0.008135 0.009620 0.001377 \n",
1068 | "31 0.012324 0.003554 0.028753 0.000000 0.017125 0.003483 0.006804 \n",
1069 | "34 0.000000 0.016503 0.000000 0.000000 0.013825 0.000000 0.000000 \n",
1070 | "58 0.000228 0.046686 0.000000 0.000000 0.000100 0.000000 0.004866 \n",
1071 | "\n",
1072 | " Topic 8 Topic 9 Topic 10 Topic 11 Topic 12 Topic 13 Topic 14 \\\n",
1073 | "2 0.000000 0.006389 0.000000 0.000774 0.007251 0.000000 0.003574 \n",
1074 | "16 0.033108 0.016283 0.000000 0.012337 0.000000 0.003595 0.011944 \n",
1075 | "18 0.038010 0.005558 0.006250 0.036652 0.000000 0.000000 0.000000 \n",
1076 | "26 0.034328 0.015722 0.008809 0.004318 0.000000 0.000000 0.001958 \n",
1077 | "27 0.042588 0.000000 0.000000 0.001425 0.002624 0.000000 0.003865 \n",
1078 | "29 0.014987 0.010941 0.000000 0.000000 0.001534 0.066263 0.000000 \n",
1079 | "30 0.040382 0.000809 0.004582 0.004803 0.001186 0.014194 0.000000 \n",
1080 | "31 0.000000 0.003702 0.000000 0.006449 0.000833 0.034161 0.005682 \n",
1081 | "34 0.038567 0.004479 0.021462 0.000000 0.000000 0.000000 0.010132 \n",
1082 | "58 0.000000 0.001639 0.013741 0.037063 0.069237 0.000000 0.012097 \n",
1083 | "\n",
1084 | " Topic 15 max_topic \n",
1085 | "2 0.000000 War \n",
1086 | "16 0.010159 Comedy \n",
1087 | "18 0.000000 Comedy \n",
1088 | "26 0.000922 Comedy \n",
1089 | "27 0.002781 Comedy \n",
1090 | "29 0.036239 Horror \n",
1091 | "30 0.000000 Comedy \n",
1092 | "31 0.000000 Horror \n",
1093 | "34 0.000000 Comedy \n",
1094 | "58 0.000000 Book Adaptation "
1095 | ]
1096 | },
1097 | "execution_count": 17,
1098 | "metadata": {},
1099 | "output_type": "execute_result"
1100 | }
1101 | ],
1102 | "source": [
1103 | "# Recall the document-topic matrix, W\n",
1104 | "\n",
1105 | "W = pd.DataFrame(W, columns=[f'Topic {i + 1}' for i in range(N_TOPICS)])\n",
1106 | "W['max_topic'] = W.apply(lambda x: topic_mapping.get(x.idxmax()), axis=1)\n",
1107 | "W[pd.notnull(W['max_topic'])].head(10)"
1108 | ]
1109 | },
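{
"cell_type": "markdown",
"id": "added-topic-counts-note",
"metadata": {},
"source": [
"*Optional follow-up (added sketch, no recorded output):* counting how many reviews fall under each hand-labelled topic gives a rough sense of how common each theme is. Reviews whose strongest topic was not mapped above come out as `None` and are dropped by `value_counts`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-topic-counts",
"metadata": {},
"outputs": [],
"source": [
"# Number of reviews per labelled topic (unmapped topics excluded)\n",
"W['max_topic'].value_counts(dropna=True)"
]
},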
1110 | {
1111 | "cell_type": "code",
1112 | "execution_count": 21,
1113 | "id": "continent-intellectual",
1114 | "metadata": {},
1115 | "outputs": [
1116 | {
1117 | "data": {
1118 | "text/plain": [
1119 | "'In my humble opinion, this movie did not receive the recognition it deserved. Robert Redford lives near me here in Provo, Utah, at Sundance. I enjoy most of his work, and this was my favorite. I\\'m sorry that more people didn\\'t appreciate it. My grandmother was an avid reader and read the book years before it came out on the big screen. She gave it to me to read after we had seen the movie together. The movie and book hit an emotional spot within my heart, and I was weepy for several days after seeing the movie. Sometimes love isn\\'t enough to keep our loved ones from hurting themselves. We see this in our own family relationships, yet our love and our families and their stories endure throughout generations of time. The cinematography was perfect and breathtaking -- I was awed by its beauty and how well it brought to life the words of the author of the book, Norman Maclean, \"But when I am alone in the half light of the canyon, all existence seems to fade to a being with my soul, and memories. And the sounds of the Big Black Foot River, and a four count rhythm, and the hope that a fish will rise. Eventually, all things merge into one, and a river runs through it. The river was cut by the world\\'s great flood and runs over rocks from the basement of time. On some of the rocks are timeless raindrops. Under the rocks are the words, and some of the words are theirs. I am haunted by waters.\" These words, taken from the book and spoken at the end of the movie (by Robert Redford who is narrating as Norman Maclean), are basically scripture, in my opinion. Any possible flaws the movie may have are overshadowed by the beauty and grace of the story and the cinematography. It was beautiful!'"
1120 | ]
1121 | },
1122 | "execution_count": 21,
1123 | "metadata": {},
1124 | "output_type": "execute_result"
1125 | }
1126 | ],
1127 | "source": [
1128 | "reviews[58]"
1129 | ]
1130 | },
1131 | {
1132 | "cell_type": "code",
1133 | "execution_count": null,
1134 | "id": "regulation-comparison",
1135 | "metadata": {},
1136 | "outputs": [],
1137 | "source": []
1138 | }
1139 | ],
1140 | "metadata": {
1141 | "kernelspec": {
1142 | "display_name": "Python 3",
1143 | "language": "python",
1144 | "name": "python3"
1145 | },
1146 | "language_info": {
1147 | "codemirror_mode": {
1148 | "name": "ipython",
1149 | "version": 3
1150 | },
1151 | "file_extension": ".py",
1152 | "mimetype": "text/x-python",
1153 | "name": "python",
1154 | "nbconvert_exporter": "python",
1155 | "pygments_lexer": "ipython3",
1156 | "version": "3.7.7"
1157 | }
1158 | },
1159 | "nbformat": 4,
1160 | "nbformat_minor": 5
1161 | }
1162 |
--------------------------------------------------------------------------------
/distributional semantics/w2v-text-classification.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "fitting-soccer",
6 | "metadata": {},
7 | "source": [
8 | "## The Problem: Large Movie Dataset Review\n",
9 | "### Classify movie reviews from IMDB into positive or negative sentiment.\n",
10 | "### Download the dataset [here](https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz)"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "id": "coordinated-amendment",
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "# imports\n",
21 | "\n",
22 | "from gensim.models import KeyedVectors\n",
23 | "import numpy as np\n",
24 | "import pandas as pd\n",
25 | "import matplotlib.pyplot as plt\n",
26 | "from sklearn.model_selection import train_test_split\n",
27 | "from tensorflow.keras.preprocessing import text_dataset_from_directory\n",
28 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
29 | "from tensorflow.keras.preprocessing.text import Tokenizer\n",
30 | "from tensorflow.keras.layers import Embedding, Dense, Input, GlobalAveragePooling1D\n",
31 | "from tensorflow.keras.models import Sequential\n",
32 | "from tensorflow.keras.optimizers import Adam\n",
33 | "\n",
34 | "import utils"
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "id": "interior-washer",
40 | "metadata": {},
41 | "source": [
42 | "## Exploring the data"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 2,
48 | "id": "welsh-barcelona",
49 | "metadata": {},
50 | "outputs": [
51 | {
52 | "name": "stdout",
53 | "output_type": "stream",
54 | "text": [
55 | "Found 25001 files belonging to 2 classes.\n",
56 | "Found 25000 files belonging to 2 classes.\n"
57 | ]
58 | },
59 | {
60 | "name": "stderr",
61 | "output_type": "stream",
62 | "text": [
63 | "/home/jaidevd/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:11: SettingWithCopyWarning: \n",
64 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
65 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
66 | "\n",
67 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
68 | " # This is added back by InteractiveShellApp.init_path()\n"
69 | ]
70 | }
71 | ],
72 | "source": [
73 | "# Importing & preprocessing the dataset\n",
74 | "\n",
75 | "train_ds = text_dataset_from_directory('../neuralnets/aclImdb/train')\n",
76 | "test_ds = text_dataset_from_directory('../neuralnets/aclImdb/test')\n",
77 | "\n",
78 | "dfTrain = pd.DataFrame(train_ds.unbatch().as_numpy_iterator(), columns=['text', 'label'])\n",
79 | "dfTest = pd.DataFrame(test_ds.unbatch().as_numpy_iterator(), columns=['text', 'label'])\n",
80 | "_, xts = train_test_split(dfTest, stratify=dfTest['label'], test_size=0.25)\n",
81 | "\n",
82 | "dfTrain['text'] = dfTrain['text'].map(lambda x: x.decode())\n",
83 | "xts['text'] = xts['text'].map(lambda x: x.decode())"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 4,
89 | "id": "right-visiting",
90 | "metadata": {},
91 | "outputs": [
92 | {
93 | "data": {
94 | "text/html": [
95 | "\n",
96 | "\n",
109 | "
\n",
110 | " \n",
111 | " \n",
112 | " | \n",
113 | " text | \n",
114 | " label | \n",
115 | "
\n",
116 | " \n",
117 | " \n",
118 | " \n",
119 | " | 4966 | \n",
120 | " Humm, an Italian movie starred by David hasselhoff and Linda Blair, I wasn´t expecting very much, to be honest and in fact, I took even less than I was expecting. It doesn´t mean this movie is the... | \n",
121 | " 0 | \n",
122 | "
\n",
123 | " \n",
124 | " | 24885 | \n",
125 | " Not only was this movie better than all the final season of H:LOTS. But it was better than any movie made for TV I have ever seen!<br /><br />Looking at the \"Top 250\" I see that only one small scr... | \n",
126 | " 1 | \n",
127 | "
\n",
128 | " \n",
129 | " | 2310 | \n",
130 | " This is a well-worn story about a man who marries to escape the hangman's noose, then sets about \"taming\" his reluctant bride. It manages to be sexist and racist at exactly the same time. We never... | \n",
131 | " 0 | \n",
132 | "
\n",
133 | " \n",
134 | " | 13648 | \n",
135 | " Being from a small town in Illinois myself, I can instantly relate to this movie. Considering the era it was made in, the townsfolk look uncomfortably like a lot of people I grew up with. Yes the ... | \n",
136 | " 1 | \n",
137 | "
\n",
138 | " \n",
139 | " | 4021 | \n",
140 | " (David H. Steinberg)'s script seemed initially having some real smart points that could've made good romantic comedy, BUT BUT BUT, oh dear ! What did ever happen in the way ???!!!! <br /><br />I'l... | \n",
141 | " 0 | \n",
142 | "
\n",
143 | " \n",
144 | "
\n",
145 | "
"
146 | ],
147 | "text/plain": [
148 | " text \\\n",
149 | "4966 Humm, an Italian movie starred by David hasselhoff and Linda Blair, I wasn´t expecting very much, to be honest and in fact, I took even less than I was expecting. It doesn´t mean this movie is the... \n",
150 | "24885 Not only was this movie better than all the final season of H:LOTS. But it was better than any movie made for TV I have ever seen!
Looking at the \"Top 250\" I see that only one small scr... \n",
151 | "2310 This is a well-worn story about a man who marries to escape the hangman's noose, then sets about \"taming\" his reluctant bride. It manages to be sexist and racist at exactly the same time. We never... \n",
152 | "13648 Being from a small town in Illinois myself, I can instantly relate to this movie. Considering the era it was made in, the townsfolk look uncomfortably like a lot of people I grew up with. Yes the ... \n",
153 | "4021 (David H. Steinberg)'s script seemed initially having some real smart points that could've made good romantic comedy, BUT BUT BUT, oh dear ! What did ever happen in the way ???!!!!
I'l... \n",
154 | "\n",
155 | " label \n",
156 | "4966 0 \n",
157 | "24885 1 \n",
158 | "2310 0 \n",
159 | "13648 1 \n",
160 | "4021 0 "
161 | ]
162 | },
163 | "execution_count": 4,
164 | "metadata": {},
165 | "output_type": "execute_result"
166 | }
167 | ],
168 | "source": [
169 | "pd.options.display.max_colwidth = 200\n",
170 | "dfTrain.sample(n=5)"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": 5,
176 | "id": "japanese-brooklyn",
177 | "metadata": {},
178 | "outputs": [
179 | {
180 | "name": "stdout",
181 | "output_type": "stream",
182 | "text": [
183 | "I'm sure this was one of those \"WOAH!\" attractions in 1982 when Epcot opened, but now it's just silly. The film's message is cliché. The Circle-Vision is disorienting. And that awful song at the end is grating. And I really wish they'd install seats. After so much walking, all you want to do is sit down for a few minutes. And when you hear there's a film to see it sounds pretty glamorous! You get entertained while sitting down, right? WRONG! You're standing there for 18+ minutes leaning against a short little railing. Disney should make a newer Maelstrom like attraction to liven things up and replace this dull, lackluster film. NOT FUN. Skip it. In fact, skip Canada altogether unless you're eating there. Move directly to the United Kingdom.\n"
184 | ]
185 | }
186 | ],
187 | "source": [
188 | "print(dfTrain.loc[0, 'text'])"
189 | ]
190 | },
191 | {
192 | "cell_type": "markdown",
193 | "id": "rational-vault",
194 | "metadata": {},
195 | "source": [
196 | "## Tokenize the text"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": 6,
202 | "id": "future-salad",
203 | "metadata": {},
204 | "outputs": [
205 | {
206 | "name": "stdout",
207 | "output_type": "stream",
208 | "text": [
209 | "Found 88582 unique tokens.\n"
210 | ]
211 | }
212 | ],
213 | "source": [
214 | "tokenizer = Tokenizer()\n",
215 | "tokenizer.fit_on_texts(dfTrain['text'].tolist())\n",
216 | "train_sequences = tokenizer.texts_to_sequences(dfTrain['text'].tolist())\n",
217 | "test_sequences = tokenizer.texts_to_sequences(xts['text'].tolist())\n",
218 | "\n",
219 | "\n",
220 | "word_index = tokenizer.word_index\n",
221 | "print('Found %s unique tokens.' % len(word_index))"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": 7,
227 | "id": "headed-saver",
228 | "metadata": {},
229 | "outputs": [
230 | {
231 | "name": "stdout",
232 | "output_type": "stream",
233 | "text": [
234 | "[143, 249, 11, 13, 28, 4, 145, 21906, 14896, 8, 6436, 51, 52009, 3050, 18, 147, 42, 40, 705, 1, 593, 746, 6, 1588, 1, 4201, 1768, 6, 21907, 2, 12, 370, 610, 30, 1, 127, 6, 8169, 2, 10, 63, 654, 3396, 23379, 7180, 100, 35, 73, 1282, 29, 22, 178, 5, 78, 6, 866, 177, 15, 3, 168, 231, 2, 51, 22, 839, 222, 3, 19, 5, 64, 9, 931, 181, 5794, 22, 76, 2162, 134, 1260, 177, 205, 352, 332, 2086, 47, 15, 3051, 231, 12258, 426, 3, 343, 114, 20600, 906, 141, 94, 3, 5926, 27631, 37, 3210, 5, 13420, 180, 53, 2, 5171, 11, 750, 5120, 19, 21, 250, 1769, 9, 8, 189, 1769, 3462, 3899, 891, 332, 1883, 47, 844, 2547, 5, 1, 2345, 4517]\n"
235 | ]
236 | }
237 | ],
238 | "source": [
239 | "print(train_sequences[0])"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": 8,
245 | "id": "oriental-copper",
246 | "metadata": {},
247 | "outputs": [
248 | {
249 | "name": "stdout",
250 | "output_type": "stream",
251 | "text": [
252 | "[\"i'm\", 'sure', 'this', 'was', 'one', 'of', 'those', 'woah', 'attractions', 'in', '1982', 'when', 'epcot', 'opened', 'but', 'now', \"it's\", 'just', 'silly', 'the', \"film's\", 'message', 'is', 'cliché', 'the', 'circle', 'vision', 'is', 'disorienting', 'and', 'that', 'awful', 'song', 'at', 'the', 'end', 'is', 'grating', 'and', 'i', 'really', 'wish', \"they'd\", 'install', 'seats', 'after', 'so', 'much', 'walking', 'all', 'you', 'want', 'to', 'do', 'is', 'sit', 'down', 'for', 'a', 'few', 'minutes', 'and', 'when', 'you', 'hear', \"there's\", 'a', 'film', 'to', 'see', 'it', 'sounds', 'pretty', 'glamorous', 'you', 'get', 'entertained', 'while', 'sitting', 'down', 'right', 'wrong', \"you're\", 'standing', 'there', 'for', '18', 'minutes', 'leaning', 'against', 'a', 'short', 'little', 'railing', 'disney', 'should', 'make', 'a', 'newer', 'maelstrom', 'like', 'attraction', 'to', 'liven', 'things', 'up', 'and', 'replace', 'this', 'dull', 'lackluster', 'film', 'not', 'fun', 'skip', 'it', 'in', 'fact', 'skip', 'canada', 'altogether', 'unless', \"you're\", 'eating', 'there', 'move', 'directly', 'to', 'the', 'united', 'kingdom']\n"
253 | ]
254 | }
255 | ],
256 | "source": [
257 | "print([tokenizer.index_word[k] for k in train_sequences[0]])"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": 9,
263 | "id": "subjective-mailman",
264 | "metadata": {},
265 | "outputs": [],
266 | "source": [
267 | "MAX_SEQUENCE_LENGTH = max([max(map(len, train_sequences)), max(map(len, test_sequences))])"
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": 10,
273 | "id": "promising-rochester",
274 | "metadata": {},
275 | "outputs": [
276 | {
277 | "data": {
278 | "text/plain": [
279 | "2493"
280 | ]
281 | },
282 | "execution_count": 10,
283 | "metadata": {},
284 | "output_type": "execute_result"
285 | }
286 | ],
287 | "source": [
288 | "MAX_SEQUENCE_LENGTH"
289 | ]
290 | },
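{
"cell_type": "markdown",
"id": "added-seqlen-note",
"metadata": {},
"source": [
"*Added check (no recorded output):* padding everything to the longest review (2493 tokens) is simple but memory-hungry. A quick look at the length distribution shows how far out in the tail that maximum sits."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-seqlen-hist",
"metadata": {},
"outputs": [],
"source": [
"# Distribution of review lengths (in tokens) before padding\n",
"lengths = [len(s) for s in train_sequences]\n",
"plt.hist(lengths, bins=50)\n",
"plt.xlabel('tokens per review')\n",
"plt.ylabel('number of reviews')\n",
"plt.show()"
]
},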
291 | {
292 | "cell_type": "code",
293 | "execution_count": 11,
294 | "id": "surgical-specific",
295 | "metadata": {},
296 | "outputs": [],
297 | "source": [
298 | "train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)\n",
299 | "test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)"
300 | ]
301 | },
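{
"cell_type": "markdown",
"id": "added-pad-shape-note",
"metadata": {},
"source": [
"*Added shape check (no recorded output):* `pad_sequences` returns dense integer arrays of shape `(n_reviews, MAX_SEQUENCE_LENGTH)`; by default it pads with zeros at the start of each sequence."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-pad-shape-check",
"metadata": {},
"outputs": [],
"source": [
"# Both arrays should have MAX_SEQUENCE_LENGTH columns; index 0 is the padding value\n",
"print(train_data.shape, test_data.shape)"
]
},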
302 | {
303 | "cell_type": "code",
304 | "execution_count": 12,
305 | "id": "sexual-convenience",
306 | "metadata": {},
307 | "outputs": [
308 | {
309 | "name": "stdout",
310 | "output_type": "stream",
311 | "text": [
312 | "['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '