├── README.md
├── glove-bilstm_paper_implementation.ipynb
├── glove-bilstm_experiment2.ipynb
├── CBOW MLP Ppaer Implementation.ipynb
├── glove-lstm_paper_experiment1.ipynb
├── CBOW MLP Sum Diff Product of Embeddings.ipynb
├── CBOW ML Dropout Regularisation.ipynb
└── CBOW MLP He initialisation.ipynb
/README.md:
--------------------------------------------------------------------------------
1 | # smai_project
2 |
3 |
4 | This repository contains the code which partially fulfills the requirement for our course **Statistical Methods in AI**. The project is titled **Natural language understanding on Quora Question pairs dataset**.
5 |
--------------------------------------------------------------------------------
/glove-bilstm_paper_implementation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
8 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
9 | "execution": {
10 | "iopub.execute_input": "2021-12-05T07:21:24.696231Z",
11 | "iopub.status.busy": "2021-12-05T07:21:24.695446Z",
12 | "iopub.status.idle": "2021-12-05T07:21:29.177093Z",
13 | "shell.execute_reply": "2021-12-05T07:21:29.176247Z",
14 | "shell.execute_reply.started": "2021-12-05T07:21:24.696089Z"
15 | }
16 | },
17 | "outputs": [],
18 | "source": [
19 | "import pandas as pd\n",
20 | "import numpy as np\n",
21 | "from tqdm import tqdm\n",
22 | "import tensorflow as tf\n",
23 | "from sklearn.metrics import f1_score"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 2,
29 | "metadata": {
30 | "execution": {
31 | "iopub.execute_input": "2021-12-05T07:21:29.179483Z",
32 | "iopub.status.busy": "2021-12-05T07:21:29.178997Z",
33 | "iopub.status.idle": "2021-12-05T07:21:32.486070Z",
34 | "shell.execute_reply": "2021-12-05T07:21:32.485330Z",
35 | "shell.execute_reply.started": "2021-12-05T07:21:29.179449Z"
36 | }
37 | },
38 | "outputs": [],
39 | "source": [
40 | "train = pd.read_csv('../input/smai-project-data/train_data.csv').fillna('')\n",
41 | "val = pd.read_csv('../input/smai-project-data/val_data.csv').fillna('')\n",
42 | "test = pd.read_csv('../input/smai-project-data/test_data.csv').fillna('')"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 3,
48 | "metadata": {
49 | "execution": {
50 | "iopub.execute_input": "2021-12-05T07:21:32.489624Z",
51 | "iopub.status.busy": "2021-12-05T07:21:32.489415Z",
52 | "iopub.status.idle": "2021-12-05T07:21:32.511292Z",
53 | "shell.execute_reply": "2021-12-05T07:21:32.510674Z",
54 | "shell.execute_reply.started": "2021-12-05T07:21:32.489594Z"
55 | }
56 | },
57 | "outputs": [
58 | {
59 | "data": {
60 | "text/html": [
61 | "
\n",
62 | "\n",
75 | "
\n",
76 | " \n",
77 | " \n",
78 | " | \n",
79 | " id | \n",
80 | " qid1 | \n",
81 | " qid2 | \n",
82 | " question1 | \n",
83 | " question2 | \n",
84 | " is_duplicate | \n",
85 | " question1_preprocessed | \n",
86 | " question2_preprocessed | \n",
87 | "
\n",
88 | " \n",
89 | " \n",
90 | " \n",
91 | " | 0 | \n",
92 | " 8067 | \n",
93 | " 15738 | \n",
94 | " 15739 | \n",
95 | " How do I play Pokémon GO in Korea? | \n",
96 | " How do I play Pokémon GO in China? | \n",
97 | " 0 | \n",
98 | " how do i play pok mon go in korea ? | \n",
99 | " how do i play pok mon go in china ? | \n",
100 | "
\n",
101 | " \n",
102 | " | 1 | \n",
103 | " 368101 | \n",
104 | " 12736 | \n",
105 | " 104117 | \n",
106 | " What are some of the best side dishes for crab... | \n",
107 | " What are some good side dishes for buffalo chi... | \n",
108 | " 0 | \n",
109 | " what are some of the best side dishes for crab... | \n",
110 | " what are some good side dishes for buffalo chi... | \n",
111 | "
\n",
112 | " \n",
113 | " | 2 | \n",
114 | " 70497 | \n",
115 | " 121486 | \n",
116 | " 121487 | \n",
117 | " Which is more advisable and better material fo... | \n",
118 | " What is the best server setup for buddypress? | \n",
119 | " 0 | \n",
120 | " which is more advisable and better material fo... | \n",
121 | " what is the best server setup for buddypress ? | \n",
122 | "
\n",
123 | " \n",
124 | " | 3 | \n",
125 | " 226567 | \n",
126 | " 254474 | \n",
127 | " 258192 | \n",
128 | " How do I improve logical programming skills? | \n",
129 | " How can I improve my logical skills for progra... | \n",
130 | " 1 | \n",
131 | " how do i improve logical programming skills ? | \n",
132 | " how can i improve my logical skills for progra... | \n",
133 | "
\n",
134 | " \n",
135 | " | 4 | \n",
136 | " 73186 | \n",
137 | " 48103 | \n",
138 | " 3062 | \n",
139 | " How close we are to see 3rd world war? | \n",
140 | " How close is a World War III? | \n",
141 | " 1 | \n",
142 | " how close we are to see 3rd world war ? | \n",
143 | " how close is a world war iii ? | \n",
144 | "
\n",
145 | " \n",
146 | "
\n",
147 | "
"
148 | ],
149 | "text/plain": [
150 | " id qid1 qid2 question1 \\\n",
151 | "0 8067 15738 15739 How do I play Pokémon GO in Korea? \n",
152 | "1 368101 12736 104117 What are some of the best side dishes for crab... \n",
153 | "2 70497 121486 121487 Which is more advisable and better material fo... \n",
154 | "3 226567 254474 258192 How do I improve logical programming skills? \n",
155 | "4 73186 48103 3062 How close we are to see 3rd world war? \n",
156 | "\n",
157 | " question2 is_duplicate \\\n",
158 | "0 How do I play Pokémon GO in China? 0 \n",
159 | "1 What are some good side dishes for buffalo chi... 0 \n",
160 | "2 What is the best server setup for buddypress? 0 \n",
161 | "3 How can I improve my logical skills for progra... 1 \n",
162 | "4 How close is a World War III? 1 \n",
163 | "\n",
164 | " question1_preprocessed \\\n",
165 | "0 how do i play pok mon go in korea ? \n",
166 | "1 what are some of the best side dishes for crab... \n",
167 | "2 which is more advisable and better material fo... \n",
168 | "3 how do i improve logical programming skills ? \n",
169 | "4 how close we are to see 3rd world war ? \n",
170 | "\n",
171 | " question2_preprocessed \n",
172 | "0 how do i play pok mon go in china ? \n",
173 | "1 what are some good side dishes for buffalo chi... \n",
174 | "2 what is the best server setup for buddypress ? \n",
175 | "3 how can i improve my logical skills for progra... \n",
176 | "4 how close is a world war iii ? "
177 | ]
178 | },
179 | "execution_count": 3,
180 | "metadata": {},
181 | "output_type": "execute_result"
182 | }
183 | ],
184 | "source": [
185 | "train.head()"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": 4,
191 | "metadata": {
192 | "execution": {
193 | "iopub.execute_input": "2021-12-05T07:21:32.514130Z",
194 | "iopub.status.busy": "2021-12-05T07:21:32.513901Z",
195 | "iopub.status.idle": "2021-12-05T07:21:32.524800Z",
196 | "shell.execute_reply": "2021-12-05T07:21:32.524106Z",
197 | "shell.execute_reply.started": "2021-12-05T07:21:32.514094Z"
198 | }
199 | },
200 | "outputs": [],
201 | "source": [
202 | "def buildVocabulary(reviews):\n",
203 | " tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False, split=' ')\n",
204 | " tokenizer.fit_on_texts(reviews)\n",
205 | " return tokenizer\n",
206 | "\n",
207 | "def getSequences(reviews, tokenizer, seq_maxlen):\n",
208 | " reviews_seq = tokenizer.texts_to_sequences(reviews)\n",
209 | " return np.array(tf.keras.preprocessing.sequence.pad_sequences(reviews_seq, maxlen=seq_maxlen))\n",
210 | "\n",
211 | "def loadGloveWordEmbeddings():\n",
212 | " embedding_vectors = {}\n",
213 | " with open('../input/glove840b300dtxt/glove.840B.300d.txt') as f:\n",
214 | " for line in tqdm(f):\n",
215 | " values = line.split(' ')\n",
216 | " word = values[0]\n",
217 | " coefs = np.asarray(values[1:], dtype='float32')\n",
218 | " embedding_vectors[word] = coefs\n",
219 | " return embedding_vectors\n",
220 | "\n",
221 | "def getEmbeddingWeightMatrix(embedding_vectors, word2idx): \n",
222 | " embedding_matrix = np.zeros((len(word2idx)+1, 300))\n",
223 | " for word, i in tqdm(word2idx.items()):\n",
224 | " embedding_vector = embedding_vectors.get(word)\n",
225 | " if embedding_vector is not None:\n",
226 | " embedding_matrix[i] = embedding_vector\n",
227 | " return embedding_matrix"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": 5,
233 | "metadata": {
234 | "execution": {
235 | "iopub.execute_input": "2021-12-05T07:21:32.526477Z",
236 | "iopub.status.busy": "2021-12-05T07:21:32.526215Z",
237 | "iopub.status.idle": "2021-12-05T07:22:04.067116Z",
238 | "shell.execute_reply": "2021-12-05T07:22:04.066259Z",
239 | "shell.execute_reply.started": "2021-12-05T07:21:32.526443Z"
240 | }
241 | },
242 | "outputs": [
243 | {
244 | "name": "stdout",
245 | "output_type": "stream",
246 | "text": [
247 | "119558\n"
248 | ]
249 | }
250 | ],
251 | "source": [
252 | "tokenizer = buildVocabulary(train['question1'].tolist()+train['question2'].tolist()+val['question1'].tolist()+val['question2'].tolist()+test['question1'].tolist()+test['question2'].tolist())\n",
253 | "vocab_size = len(tokenizer.word_index) + 1\n",
254 | "print(vocab_size)\n",
255 | "\n",
256 | "x_train1 = getSequences(train['question1'], tokenizer, 128)\n",
257 | "x_train2 = getSequences(train['question2'], tokenizer, 128)\n",
258 | "y_train = tf.keras.utils.to_categorical(train['is_duplicate'])\n",
259 | "\n",
260 | "x_val1 = getSequences(val['question1'], tokenizer, 128)\n",
261 | "x_val2 = getSequences(val['question2'], tokenizer, 128)\n",
262 | "y_val = tf.keras.utils.to_categorical(val['is_duplicate'])\n",
263 | "\n",
264 | "x_test1 = getSequences(test['question1'], tokenizer, 128)\n",
265 | "x_test2 = getSequences(test['question2'], tokenizer, 128)\n",
266 | "y_test = tf.keras.utils.to_categorical(test['is_duplicate'])"
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": 6,
272 | "metadata": {
273 | "execution": {
274 | "iopub.execute_input": "2021-12-05T07:22:04.068714Z",
275 | "iopub.status.busy": "2021-12-05T07:22:04.068455Z",
276 | "iopub.status.idle": "2021-12-05T07:26:17.684935Z",
277 | "shell.execute_reply": "2021-12-05T07:26:17.684071Z",
278 | "shell.execute_reply.started": "2021-12-05T07:22:04.068679Z"
279 | }
280 | },
281 | "outputs": [
282 | {
283 | "name": "stderr",
284 | "output_type": "stream",
285 | "text": [
286 | "2196018it [04:13, 8673.48it/s]\n"
287 | ]
288 | },
289 | {
290 | "name": "stdout",
291 | "output_type": "stream",
292 | "text": [
293 | "2196017\n"
294 | ]
295 | },
296 | {
297 | "name": "stderr",
298 | "output_type": "stream",
299 | "text": [
300 | "100%|██████████| 119557/119557 [00:00<00:00, 289620.78it/s]"
301 | ]
302 | },
303 | {
304 | "name": "stdout",
305 | "output_type": "stream",
306 | "text": [
307 | "(119558, 300)\n"
308 | ]
309 | },
310 | {
311 | "name": "stderr",
312 | "output_type": "stream",
313 | "text": [
314 | "\n"
315 | ]
316 | }
317 | ],
318 | "source": [
319 | "embedding_vectors = loadGloveWordEmbeddings()\n",
320 | "print(len(embedding_vectors))\n",
321 | "\n",
322 | "embedding_weight_matrix = getEmbeddingWeightMatrix(embedding_vectors, tokenizer.word_index)\n",
323 | "print(embedding_weight_matrix.shape)"
324 | ]
325 | },
326 | {
327 | "cell_type": "code",
328 | "execution_count": 8,
329 | "metadata": {
330 | "execution": {
331 | "iopub.execute_input": "2021-12-05T07:28:18.980466Z",
332 | "iopub.status.busy": "2021-12-05T07:28:18.979883Z",
333 | "iopub.status.idle": "2021-12-05T07:28:19.835475Z",
334 | "shell.execute_reply": "2021-12-05T07:28:19.834704Z",
335 | "shell.execute_reply.started": "2021-12-05T07:28:18.980425Z"
336 | }
337 | },
338 | "outputs": [
339 | {
340 | "name": "stderr",
341 | "output_type": "stream",
342 | "text": [
343 | "2021-12-05 07:28:19.052944: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 143469600 exceeds 10% of free system memory.\n",
344 | "2021-12-05 07:28:19.259332: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 143469600 exceeds 10% of free system memory.\n"
345 | ]
346 | }
347 | ],
348 | "source": [
349 | "inp1 = tf.keras.Input(shape=(x_train1.shape[1],))\n",
350 | "inp2 = tf.keras.Input(shape=(x_train2.shape[1],))\n",
351 | "\n",
352 | "inner1= tf.keras.layers.Embedding(input_dim=119558, output_dim=300, input_length=128, \n",
353 | " weights=[embedding_weight_matrix], trainable=False)(inp1)\n",
354 | "inner2= tf.keras.layers.Embedding(input_dim=119558, output_dim=300, input_length=128,\n",
355 | " weights=[embedding_weight_matrix], trainable=False)(inp2)\n",
356 | "\n",
357 | "inner = tf.keras.layers.concatenate([inner1+inner2, inner1-inner2, tf.math.multiply(inner1, inner2)], axis=-1)\n",
358 | "\n",
359 | "out = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(150, kernel_regularizer='l2', dropout=0.1, return_sequences=True))(inner)\n",
360 | "\n",
361 | "out = tf.keras.backend.mean(out, axis=1, keepdims=False)\n",
362 | "\n",
363 | "output = tf.keras.layers.Dense(2, kernel_regularizer='l2', activation='softmax')(out)\n",
364 | "\n",
365 | "model = tf.keras.Model(inputs = [inp1, inp2], outputs = output)"
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": 9,
371 | "metadata": {
372 | "execution": {
373 | "iopub.execute_input": "2021-12-05T07:28:22.230481Z",
374 | "iopub.status.busy": "2021-12-05T07:28:22.229633Z",
375 | "iopub.status.idle": "2021-12-05T07:28:22.254231Z",
376 | "shell.execute_reply": "2021-12-05T07:28:22.252329Z",
377 | "shell.execute_reply.started": "2021-12-05T07:28:22.230431Z"
378 | }
379 | },
380 | "outputs": [
381 | {
382 | "name": "stdout",
383 | "output_type": "stream",
384 | "text": [
385 | "Model: \"model\"\n",
386 | "__________________________________________________________________________________________________\n",
387 | "Layer (type) Output Shape Param # Connected to \n",
388 | "==================================================================================================\n",
389 | "input_3 (InputLayer) [(None, 128)] 0 \n",
390 | "__________________________________________________________________________________________________\n",
391 | "input_4 (InputLayer) [(None, 128)] 0 \n",
392 | "__________________________________________________________________________________________________\n",
393 | "embedding_2 (Embedding) (None, 128, 300) 35867400 input_3[0][0] \n",
394 | "__________________________________________________________________________________________________\n",
395 | "embedding_3 (Embedding) (None, 128, 300) 35867400 input_4[0][0] \n",
396 | "__________________________________________________________________________________________________\n",
397 | "tf.__operators__.add_1 (TFOpLam (None, 128, 300) 0 embedding_2[0][0] \n",
398 | " embedding_3[0][0] \n",
399 | "__________________________________________________________________________________________________\n",
400 | "tf.math.subtract_1 (TFOpLambda) (None, 128, 300) 0 embedding_2[0][0] \n",
401 | " embedding_3[0][0] \n",
402 | "__________________________________________________________________________________________________\n",
403 | "tf.math.multiply_1 (TFOpLambda) (None, 128, 300) 0 embedding_2[0][0] \n",
404 | " embedding_3[0][0] \n",
405 | "__________________________________________________________________________________________________\n",
406 | "concatenate_1 (Concatenate) (None, 128, 900) 0 tf.__operators__.add_1[0][0] \n",
407 | " tf.math.subtract_1[0][0] \n",
408 | " tf.math.multiply_1[0][0] \n",
409 | "__________________________________________________________________________________________________\n",
410 | "bidirectional_1 (Bidirectional) (None, 128, 300) 1261200 concatenate_1[0][0] \n",
411 | "__________________________________________________________________________________________________\n",
412 | "tf.math.reduce_mean_1 (TFOpLamb (None, 300) 0 bidirectional_1[0][0] \n",
413 | "__________________________________________________________________________________________________\n",
414 | "dense (Dense) (None, 2) 602 tf.math.reduce_mean_1[0][0] \n",
415 | "==================================================================================================\n",
416 | "Total params: 72,996,602\n",
417 | "Trainable params: 1,261,802\n",
418 | "Non-trainable params: 71,734,800\n",
419 | "__________________________________________________________________________________________________\n"
420 | ]
421 | }
422 | ],
423 | "source": [
424 | "model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics=['accuracy'])\n",
425 | "model.summary()"
426 | ]
427 | },
428 | {
429 | "cell_type": "code",
430 | "execution_count": 15,
431 | "metadata": {
432 | "execution": {
433 | "iopub.execute_input": "2021-12-05T07:54:34.407682Z",
434 | "iopub.status.busy": "2021-12-05T07:54:34.406978Z",
435 | "iopub.status.idle": "2021-12-05T08:02:56.589419Z",
436 | "shell.execute_reply": "2021-12-05T08:02:56.588628Z",
437 | "shell.execute_reply.started": "2021-12-05T07:54:34.407647Z"
438 | }
439 | },
440 | "outputs": [
441 | {
442 | "name": "stdout",
443 | "output_type": "stream",
444 | "text": [
445 | "Epoch 1/2\n",
446 | "8844/8844 [==============================] - 249s 28ms/step - loss: 0.5754 - accuracy: 0.7337 - val_loss: 0.5890 - val_accuracy: 0.7333\n",
447 | "\n",
448 | "Epoch 00001: val_loss improved from inf to 0.58903, saving model to weights.best.hdf5\n",
449 | "Epoch 2/2\n",
450 | "8844/8844 [==============================] - 249s 28ms/step - loss: 0.5752 - accuracy: 0.7361 - val_loss: 0.5766 - val_accuracy: 0.7340\n",
451 | "\n",
452 | "Epoch 00002: val_loss improved from 0.58903 to 0.57661, saving model to weights.best.hdf5\n"
453 | ]
454 | }
455 | ],
456 | "source": [
457 | "checkpoint_filepath = 'weights.best.hdf5'\n",
458 | "model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,\n",
459 | " verbose = 1, \n",
460 | " monitor = 'val_loss',\n",
461 | " save_best_only = True)\n",
462 | "\n",
463 | "history = model.fit((x_train1, x_train2), y_train,\n",
464 | " batch_size = 32,\n",
465 | " validation_data = ((x_val1, x_val2), y_val),\n",
466 | " validation_batch_size = 16,\n",
467 | " epochs=5,\n",
468 | " callbacks=[model_checkpoint_callback], \n",
469 | " verbose=1)"
470 | ]
471 | },
472 | {
473 | "cell_type": "code",
474 | "execution_count": 18,
475 | "metadata": {
476 | "execution": {
477 | "iopub.execute_input": "2021-12-05T08:02:57.030112Z",
478 | "iopub.status.busy": "2021-12-05T08:02:57.029591Z",
479 | "iopub.status.idle": "2021-12-05T08:04:06.879698Z",
480 | "shell.execute_reply": "2021-12-05T08:04:06.878884Z",
481 | "shell.execute_reply.started": "2021-12-05T08:02:57.030075Z"
482 | }
483 | },
484 | "outputs": [
485 | {
486 | "name": "stdout",
487 | "output_type": "stream",
488 | "text": [
489 | "10108/10108 [==============================] - 70s 7ms/step - loss: 0.5730 - accuracy: 0.7340\n",
490 | "loss on test data is 0.5730125308036804\n",
491 | "accuracy on test data is 0.7340275645256042\n"
492 | ]
493 | }
494 | ],
495 | "source": [
496 | "loss, accuracy = model.evaluate((x_test1, x_test2), y_test, batch_size=4, verbose=1)\n",
497 | "\n",
498 | "print('loss on test data is', loss)\n",
499 | "print('accuracy on test data is', accuracy)"
500 | ]
501 | },
502 | {
503 | "cell_type": "code",
504 | "execution_count": 19,
505 | "metadata": {
506 | "execution": {
507 | "iopub.execute_input": "2021-12-05T08:04:06.881174Z",
508 | "iopub.status.busy": "2021-12-05T08:04:06.880894Z",
509 | "iopub.status.idle": "2021-12-05T08:04:17.186473Z",
510 | "shell.execute_reply": "2021-12-05T08:04:17.185721Z",
511 | "shell.execute_reply.started": "2021-12-05T08:04:06.881138Z"
512 | }
513 | },
514 | "outputs": [
515 | {
516 | "name": "stdout",
517 | "output_type": "stream",
518 | "text": [
519 | "f1_score on test dataset is 0.6310516383599245\n"
520 | ]
521 | }
522 | ],
523 | "source": [
524 | "pred = model.predict((x_test1, x_test2))\n",
525 | "\n",
526 | "print('f1_score on test dataset is', f1_score(np.argmax(pred, axis=1), np.argmax(y_test, axis=1)))"
527 | ]
528 | },
529 | {
530 | "cell_type": "code",
531 | "execution_count": null,
532 | "metadata": {},
533 | "outputs": [],
534 | "source": []
535 | }
536 | ],
537 | "metadata": {
538 | "kernelspec": {
539 | "display_name": "Python 3 (ipykernel)",
540 | "language": "python",
541 | "name": "python3"
542 | },
543 | "language_info": {
544 | "codemirror_mode": {
545 | "name": "ipython",
546 | "version": 3
547 | },
548 | "file_extension": ".py",
549 | "mimetype": "text/x-python",
550 | "name": "python",
551 | "nbconvert_exporter": "python",
552 | "pygments_lexer": "ipython3",
553 | "version": "3.8.10"
554 | }
555 | },
556 | "nbformat": 4,
557 | "nbformat_minor": 4
558 | }
559 |
--------------------------------------------------------------------------------
/glove-bilstm_experiment2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
8 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
9 | "execution": {
10 | "iopub.execute_input": "2021-12-05T08:27:55.991770Z",
11 | "iopub.status.busy": "2021-12-05T08:27:55.991383Z",
12 | "iopub.status.idle": "2021-12-05T08:28:00.572205Z",
13 | "shell.execute_reply": "2021-12-05T08:28:00.571459Z",
14 | "shell.execute_reply.started": "2021-12-05T08:27:55.991676Z"
15 | }
16 | },
17 | "outputs": [],
18 | "source": [
19 | "import pandas as pd\n",
20 | "import numpy as np\n",
21 | "from tqdm import tqdm\n",
22 | "import tensorflow as tf\n",
23 | "from sklearn.metrics import f1_score"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 2,
29 | "metadata": {
30 | "execution": {
31 | "iopub.execute_input": "2021-12-05T08:28:00.575744Z",
32 | "iopub.status.busy": "2021-12-05T08:28:00.575547Z",
33 | "iopub.status.idle": "2021-12-05T08:28:03.698196Z",
34 | "shell.execute_reply": "2021-12-05T08:28:03.697341Z",
35 | "shell.execute_reply.started": "2021-12-05T08:28:00.575719Z"
36 | }
37 | },
38 | "outputs": [],
39 | "source": [
40 | "train = pd.read_csv('../input/smai-project-data/train_data.csv').fillna('')\n",
41 | "val = pd.read_csv('../input/smai-project-data/val_data.csv').fillna('')\n",
42 | "test = pd.read_csv('../input/smai-project-data/test_data.csv').fillna('')"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 3,
48 | "metadata": {
49 | "execution": {
50 | "iopub.execute_input": "2021-12-05T08:28:03.701370Z",
51 | "iopub.status.busy": "2021-12-05T08:28:03.699765Z",
52 | "iopub.status.idle": "2021-12-05T08:28:03.725788Z",
53 | "shell.execute_reply": "2021-12-05T08:28:03.725005Z",
54 | "shell.execute_reply.started": "2021-12-05T08:28:03.701323Z"
55 | }
56 | },
57 | "outputs": [
58 | {
59 | "data": {
60 | "text/html": [
61 | "\n",
62 | "\n",
75 | "
\n",
76 | " \n",
77 | " \n",
78 | " | \n",
79 | " id | \n",
80 | " qid1 | \n",
81 | " qid2 | \n",
82 | " question1 | \n",
83 | " question2 | \n",
84 | " is_duplicate | \n",
85 | " question1_preprocessed | \n",
86 | " question2_preprocessed | \n",
87 | "
\n",
88 | " \n",
89 | " \n",
90 | " \n",
91 | " | 0 | \n",
92 | " 8067 | \n",
93 | " 15738 | \n",
94 | " 15739 | \n",
95 | " How do I play Pokémon GO in Korea? | \n",
96 | " How do I play Pokémon GO in China? | \n",
97 | " 0 | \n",
98 | " how do i play pok mon go in korea ? | \n",
99 | " how do i play pok mon go in china ? | \n",
100 | "
\n",
101 | " \n",
102 | " | 1 | \n",
103 | " 368101 | \n",
104 | " 12736 | \n",
105 | " 104117 | \n",
106 | " What are some of the best side dishes for crab... | \n",
107 | " What are some good side dishes for buffalo chi... | \n",
108 | " 0 | \n",
109 | " what are some of the best side dishes for crab... | \n",
110 | " what are some good side dishes for buffalo chi... | \n",
111 | "
\n",
112 | " \n",
113 | " | 2 | \n",
114 | " 70497 | \n",
115 | " 121486 | \n",
116 | " 121487 | \n",
117 | " Which is more advisable and better material fo... | \n",
118 | " What is the best server setup for buddypress? | \n",
119 | " 0 | \n",
120 | " which is more advisable and better material fo... | \n",
121 | " what is the best server setup for buddypress ? | \n",
122 | "
\n",
123 | " \n",
124 | " | 3 | \n",
125 | " 226567 | \n",
126 | " 254474 | \n",
127 | " 258192 | \n",
128 | " How do I improve logical programming skills? | \n",
129 | " How can I improve my logical skills for progra... | \n",
130 | " 1 | \n",
131 | " how do i improve logical programming skills ? | \n",
132 | " how can i improve my logical skills for progra... | \n",
133 | "
\n",
134 | " \n",
135 | " | 4 | \n",
136 | " 73186 | \n",
137 | " 48103 | \n",
138 | " 3062 | \n",
139 | " How close we are to see 3rd world war? | \n",
140 | " How close is a World War III? | \n",
141 | " 1 | \n",
142 | " how close we are to see 3rd world war ? | \n",
143 | " how close is a world war iii ? | \n",
144 | "
\n",
145 | " \n",
146 | "
\n",
147 | "
"
148 | ],
149 | "text/plain": [
150 | " id qid1 qid2 question1 \\\n",
151 | "0 8067 15738 15739 How do I play Pokémon GO in Korea? \n",
152 | "1 368101 12736 104117 What are some of the best side dishes for crab... \n",
153 | "2 70497 121486 121487 Which is more advisable and better material fo... \n",
154 | "3 226567 254474 258192 How do I improve logical programming skills? \n",
155 | "4 73186 48103 3062 How close we are to see 3rd world war? \n",
156 | "\n",
157 | " question2 is_duplicate \\\n",
158 | "0 How do I play Pokémon GO in China? 0 \n",
159 | "1 What are some good side dishes for buffalo chi... 0 \n",
160 | "2 What is the best server setup for buddypress? 0 \n",
161 | "3 How can I improve my logical skills for progra... 1 \n",
162 | "4 How close is a World War III? 1 \n",
163 | "\n",
164 | " question1_preprocessed \\\n",
165 | "0 how do i play pok mon go in korea ? \n",
166 | "1 what are some of the best side dishes for crab... \n",
167 | "2 which is more advisable and better material fo... \n",
168 | "3 how do i improve logical programming skills ? \n",
169 | "4 how close we are to see 3rd world war ? \n",
170 | "\n",
171 | " question2_preprocessed \n",
172 | "0 how do i play pok mon go in china ? \n",
173 | "1 what are some good side dishes for buffalo chi... \n",
174 | "2 what is the best server setup for buddypress ? \n",
175 | "3 how can i improve my logical skills for progra... \n",
176 | "4 how close is a world war iii ? "
177 | ]
178 | },
179 | "execution_count": 3,
180 | "metadata": {},
181 | "output_type": "execute_result"
182 | }
183 | ],
184 | "source": [
185 | "train.head()"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": 4,
191 | "metadata": {
192 | "execution": {
193 | "iopub.execute_input": "2021-12-05T08:28:03.728265Z",
194 | "iopub.status.busy": "2021-12-05T08:28:03.727992Z",
195 | "iopub.status.idle": "2021-12-05T08:28:03.737968Z",
196 | "shell.execute_reply": "2021-12-05T08:28:03.736992Z",
197 | "shell.execute_reply.started": "2021-12-05T08:28:03.728229Z"
198 | }
199 | },
200 | "outputs": [],
201 | "source": [
202 | "def buildVocabulary(reviews):\n",
203 | " tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False, split=' ')\n",
204 | " tokenizer.fit_on_texts(reviews)\n",
205 | " return tokenizer\n",
206 | "\n",
207 | "def getSequences(reviews, tokenizer, seq_maxlen):\n",
208 | " reviews_seq = tokenizer.texts_to_sequences(reviews)\n",
209 | " return np.array(tf.keras.preprocessing.sequence.pad_sequences(reviews_seq, maxlen=seq_maxlen))\n",
210 | "\n",
211 | "def loadGloveWordEmbeddings():\n",
212 | " embedding_vectors = {}\n",
213 | " with open('../input/glove840b300dtxt/glove.840B.300d.txt') as f:\n",
214 | " for line in tqdm(f):\n",
215 | " values = line.split(' ')\n",
216 | " word = values[0]\n",
217 | " coefs = np.asarray(values[1:], dtype='float32')\n",
218 | " embedding_vectors[word] = coefs\n",
219 | " return embedding_vectors\n",
220 | "\n",
221 | "def getEmbeddingWeightMatrix(embedding_vectors, word2idx): \n",
222 | " embedding_matrix = np.zeros((len(word2idx)+1, 300))\n",
223 | " for word, i in tqdm(word2idx.items()):\n",
224 | " embedding_vector = embedding_vectors.get(word)\n",
225 | " if embedding_vector is not None:\n",
226 | " embedding_matrix[i] = embedding_vector\n",
227 | " return embedding_matrix"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": 5,
233 | "metadata": {
234 | "execution": {
235 | "iopub.execute_input": "2021-12-05T08:28:03.740285Z",
236 | "iopub.status.busy": "2021-12-05T08:28:03.739650Z",
237 | "iopub.status.idle": "2021-12-05T08:28:35.589006Z",
238 | "shell.execute_reply": "2021-12-05T08:28:35.588270Z",
239 | "shell.execute_reply.started": "2021-12-05T08:28:03.740250Z"
240 | }
241 | },
242 | "outputs": [
243 | {
244 | "name": "stdout",
245 | "output_type": "stream",
246 | "text": [
247 | "119558\n"
248 | ]
249 | }
250 | ],
251 | "source": [
252 | "tokenizer = buildVocabulary(train['question1'].tolist()+train['question2'].tolist()+val['question1'].tolist()+val['question2'].tolist()+test['question1'].tolist()+test['question2'].tolist())\n",
253 | "vocab_size = len(tokenizer.word_index) + 1\n",
254 | "print(vocab_size)\n",
255 | "\n",
256 | "x_train1 = getSequences(train['question1'], tokenizer, 128)\n",
257 | "x_train2 = getSequences(train['question2'], tokenizer, 128)\n",
258 | "y_train = tf.keras.utils.to_categorical(train['is_duplicate'])\n",
259 | "\n",
260 | "x_val1 = getSequences(val['question1'], tokenizer, 128)\n",
261 | "x_val2 = getSequences(val['question2'], tokenizer, 128)\n",
262 | "y_val = tf.keras.utils.to_categorical(val['is_duplicate'])\n",
263 | "\n",
264 | "x_test1 = getSequences(test['question1'], tokenizer, 128)\n",
265 | "x_test2 = getSequences(test['question2'], tokenizer, 128)\n",
266 | "y_test = tf.keras.utils.to_categorical(test['is_duplicate'])"
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": 6,
272 | "metadata": {
273 | "execution": {
274 | "iopub.execute_input": "2021-12-05T08:28:35.590734Z",
275 | "iopub.status.busy": "2021-12-05T08:28:35.590499Z",
276 | "iopub.status.idle": "2021-12-05T08:32:50.715266Z",
277 | "shell.execute_reply": "2021-12-05T08:32:50.714504Z",
278 | "shell.execute_reply.started": "2021-12-05T08:28:35.590699Z"
279 | }
280 | },
281 | "outputs": [
282 | {
283 | "name": "stderr",
284 | "output_type": "stream",
285 | "text": [
286 | "2196018it [04:14, 8621.81it/s]\n"
287 | ]
288 | },
289 | {
290 | "name": "stdout",
291 | "output_type": "stream",
292 | "text": [
293 | "2196017\n"
294 | ]
295 | },
296 | {
297 | "name": "stderr",
298 | "output_type": "stream",
299 | "text": [
300 | "100%|██████████| 119557/119557 [00:00<00:00, 296253.60it/s]"
301 | ]
302 | },
303 | {
304 | "name": "stdout",
305 | "output_type": "stream",
306 | "text": [
307 | "(119558, 300)\n"
308 | ]
309 | },
310 | {
311 | "name": "stderr",
312 | "output_type": "stream",
313 | "text": [
314 | "\n"
315 | ]
316 | }
317 | ],
318 | "source": [
319 | "embedding_vectors = loadGloveWordEmbeddings()\n",
320 | "print(len(embedding_vectors))\n",
321 | "\n",
322 | "embedding_weight_matrix = getEmbeddingWeightMatrix(embedding_vectors, tokenizer.word_index)\n",
323 | "print(embedding_weight_matrix.shape)"
324 | ]
325 | },
326 | {
327 | "cell_type": "code",
328 | "execution_count": 7,
329 | "metadata": {
330 | "execution": {
331 | "iopub.execute_input": "2021-12-05T08:32:50.717025Z",
332 | "iopub.status.busy": "2021-12-05T08:32:50.716763Z",
333 | "iopub.status.idle": "2021-12-05T08:32:54.266862Z",
334 | "shell.execute_reply": "2021-12-05T08:32:54.266168Z",
335 | "shell.execute_reply.started": "2021-12-05T08:32:50.716989Z"
336 | }
337 | },
338 | "outputs": [
339 | {
340 | "name": "stderr",
341 | "output_type": "stream",
342 | "text": [
343 | "2021-12-05 08:32:50.811328: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
344 | "2021-12-05 08:32:50.924265: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
345 | "2021-12-05 08:32:50.924979: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
346 | "2021-12-05 08:32:50.926274: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA\n",
347 | "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
348 | "2021-12-05 08:32:50.927066: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
349 | "2021-12-05 08:32:50.927770: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
350 | "2021-12-05 08:32:50.928410: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
351 | "2021-12-05 08:32:52.819781: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
352 | "2021-12-05 08:32:52.820602: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
353 | "2021-12-05 08:32:52.821254: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
354 | "2021-12-05 08:32:52.821837: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15403 MB memory: -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0\n",
355 | "2021-12-05 08:32:53.377677: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 143469600 exceeds 10% of free system memory.\n",
356 | "2021-12-05 08:32:53.611921: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 143469600 exceeds 10% of free system memory.\n"
357 | ]
358 | }
359 | ],
360 | "source": [
361 | "inp1 = tf.keras.Input(shape=(x_train1.shape[1],))\n",
362 | "inp2 = tf.keras.Input(shape=(x_train2.shape[1],))\n",
363 | "\n",
364 | "inner1= tf.keras.layers.Embedding(input_dim=119558, output_dim=300, input_length=128, \n",
365 | " weights=[embedding_weight_matrix], trainable=False)(inp1)\n",
366 | "inner2= tf.keras.layers.Embedding(input_dim=119558, output_dim=300, input_length=128,\n",
367 | " weights=[embedding_weight_matrix], trainable=False)(inp2)\n",
368 | "\n",
369 | "inner = tf.keras.layers.concatenate([inner1+inner2, inner1-inner2, tf.math.multiply(inner1, inner2)], axis=-1)\n",
370 | "\n",
371 | "out = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(150, kernel_regularizer='l2', return_sequences=True))(inner)\n",
372 | "\n",
373 | "out = tf.keras.backend.mean(out, axis=1, keepdims=False)\n",
374 | "\n",
375 | "output = tf.keras.layers.Dense(2, kernel_regularizer='l2', activation='softmax')(out)\n",
376 | "\n",
377 | "model = tf.keras.Model(inputs = [inp1, inp2], outputs = output)"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": 8,
383 | "metadata": {
384 | "execution": {
385 | "iopub.execute_input": "2021-12-05T08:32:54.268296Z",
386 | "iopub.status.busy": "2021-12-05T08:32:54.268035Z",
387 | "iopub.status.idle": "2021-12-05T08:32:54.288773Z",
388 | "shell.execute_reply": "2021-12-05T08:32:54.288146Z",
389 | "shell.execute_reply.started": "2021-12-05T08:32:54.268263Z"
390 | }
391 | },
392 | "outputs": [
393 | {
394 | "name": "stdout",
395 | "output_type": "stream",
396 | "text": [
397 | "Model: \"model\"\n",
398 | "__________________________________________________________________________________________________\n",
399 | "Layer (type) Output Shape Param # Connected to \n",
400 | "==================================================================================================\n",
401 | "input_1 (InputLayer) [(None, 128)] 0 \n",
402 | "__________________________________________________________________________________________________\n",
403 | "input_2 (InputLayer) [(None, 128)] 0 \n",
404 | "__________________________________________________________________________________________________\n",
405 | "embedding (Embedding) (None, 128, 300) 35867400 input_1[0][0] \n",
406 | "__________________________________________________________________________________________________\n",
407 | "embedding_1 (Embedding) (None, 128, 300) 35867400 input_2[0][0] \n",
408 | "__________________________________________________________________________________________________\n",
409 | "tf.__operators__.add (TFOpLambd (None, 128, 300) 0 embedding[0][0] \n",
410 | " embedding_1[0][0] \n",
411 | "__________________________________________________________________________________________________\n",
412 | "tf.math.subtract (TFOpLambda) (None, 128, 300) 0 embedding[0][0] \n",
413 | " embedding_1[0][0] \n",
414 | "__________________________________________________________________________________________________\n",
415 | "tf.math.multiply (TFOpLambda) (None, 128, 300) 0 embedding[0][0] \n",
416 | " embedding_1[0][0] \n",
417 | "__________________________________________________________________________________________________\n",
418 | "concatenate (Concatenate) (None, 128, 900) 0 tf.__operators__.add[0][0] \n",
419 | " tf.math.subtract[0][0] \n",
420 | " tf.math.multiply[0][0] \n",
421 | "__________________________________________________________________________________________________\n",
422 | "bidirectional (Bidirectional) (None, 128, 300) 1261200 concatenate[0][0] \n",
423 | "__________________________________________________________________________________________________\n",
424 | "tf.math.reduce_mean (TFOpLambda (None, 300) 0 bidirectional[0][0] \n",
425 | "__________________________________________________________________________________________________\n",
426 | "dense (Dense) (None, 2) 602 tf.math.reduce_mean[0][0] \n",
427 | "==================================================================================================\n",
428 | "Total params: 72,996,602\n",
429 | "Trainable params: 1,261,802\n",
430 | "Non-trainable params: 71,734,800\n",
431 | "__________________________________________________________________________________________________\n"
432 | ]
433 | }
434 | ],
435 | "source": [
436 | "model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics=['accuracy'])\n",
437 | "model.summary()"
438 | ]
439 | },
440 | {
441 | "cell_type": "code",
442 | "execution_count": 9,
443 | "metadata": {
444 | "execution": {
445 | "iopub.execute_input": "2021-12-05T08:32:54.290164Z",
446 | "iopub.status.busy": "2021-12-05T08:32:54.289923Z",
447 | "iopub.status.idle": "2021-12-05T08:54:19.362354Z",
448 | "shell.execute_reply": "2021-12-05T08:54:19.361515Z",
449 | "shell.execute_reply.started": "2021-12-05T08:32:54.290131Z"
450 | }
451 | },
452 | "outputs": [
453 | {
454 | "name": "stderr",
455 | "output_type": "stream",
456 | "text": [
457 | "2021-12-05 08:32:54.295725: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 144897536 exceeds 10% of free system memory.\n",
458 | "2021-12-05 08:32:54.402477: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 144897536 exceeds 10% of free system memory.\n",
459 | "2021-12-05 08:32:54.568071: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)\n"
460 | ]
461 | },
462 | {
463 | "name": "stdout",
464 | "output_type": "stream",
465 | "text": [
466 | "Epoch 1/5\n"
467 | ]
468 | },
469 | {
470 | "name": "stderr",
471 | "output_type": "stream",
472 | "text": [
473 | "2021-12-05 08:32:58.225519: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005\n"
474 | ]
475 | },
476 | {
477 | "name": "stdout",
478 | "output_type": "stream",
479 | "text": [
480 | "8844/8844 [==============================] - ETA: 0s - loss: 0.6319 - accuracy: 0.7057"
481 | ]
482 | },
483 | {
484 | "name": "stderr",
485 | "output_type": "stream",
486 | "text": [
487 | "2021-12-05 08:36:24.846568: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 41399296 exceeds 10% of free system memory.\n"
488 | ]
489 | },
490 | {
491 | "name": "stdout",
492 | "output_type": "stream",
493 | "text": [
494 | "8844/8844 [==============================] - 250s 28ms/step - loss: 0.6319 - accuracy: 0.7057 - val_loss: 0.5694 - val_accuracy: 0.7317\n",
495 | "\n",
496 | "Epoch 00001: val_loss improved from inf to 0.56941, saving model to weights.best.hdf5\n",
497 | "Epoch 2/5\n",
498 | "8844/8844 [==============================] - 245s 28ms/step - loss: 0.5812 - accuracy: 0.7274 - val_loss: 0.5639 - val_accuracy: 0.7362\n",
499 | "\n",
500 | "Epoch 00002: val_loss improved from 0.56941 to 0.56394, saving model to weights.best.hdf5\n",
501 | "Epoch 3/5\n",
502 | "8844/8844 [==============================] - 243s 28ms/step - loss: 0.5915 - accuracy: 0.7223 - val_loss: 0.6492 - val_accuracy: 0.6690\n",
503 | "\n",
504 | "Epoch 00003: val_loss did not improve from 0.56394\n",
505 | "Epoch 4/5\n",
506 | "8844/8844 [==============================] - 244s 28ms/step - loss: 0.5840 - accuracy: 0.7287 - val_loss: 0.5697 - val_accuracy: 0.7288\n",
507 | "\n",
508 | "Epoch 00004: val_loss did not improve from 0.56394\n",
509 | "Epoch 5/5\n",
510 | "8844/8844 [==============================] - 243s 28ms/step - loss: 0.5734 - accuracy: 0.7334 - val_loss: 0.5623 - val_accuracy: 0.7404\n",
511 | "\n",
512 | "Epoch 00005: val_loss improved from 0.56394 to 0.56234, saving model to weights.best.hdf5\n"
513 | ]
514 | }
515 | ],
516 | "source": [
517 | "checkpoint_filepath = 'weights.best.hdf5'\n",
518 | "model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,\n",
519 | " verbose = 1, \n",
520 | " monitor = 'val_loss',\n",
521 | " save_best_only = True)\n",
522 | "\n",
523 | "history = model.fit((x_train1, x_train2), y_train,\n",
524 | " batch_size = 32,\n",
525 | " validation_data = ((x_val1, x_val2), y_val),\n",
526 | " validation_batch_size = 16,\n",
527 | " epochs=5,\n",
528 | " callbacks=[model_checkpoint_callback], \n",
529 | " verbose=1)"
530 | ]
531 | },
532 | {
533 | "cell_type": "code",
534 | "execution_count": 12,
535 | "metadata": {
536 | "execution": {
537 | "iopub.execute_input": "2021-12-05T08:54:19.834213Z",
538 | "iopub.status.busy": "2021-12-05T08:54:19.833805Z",
539 | "iopub.status.idle": "2021-12-05T08:55:29.256507Z",
540 | "shell.execute_reply": "2021-12-05T08:55:29.255776Z",
541 | "shell.execute_reply.started": "2021-12-05T08:54:19.834175Z"
542 | }
543 | },
544 | "outputs": [
545 | {
546 | "name": "stdout",
547 | "output_type": "stream",
548 | "text": [
549 | "10108/10108 [==============================] - 69s 7ms/step - loss: 0.5629 - accuracy: 0.7411\n",
550 | "loss on test data is 0.5628555417060852\n",
551 | "accuracy on test data is 0.7410769462585449\n"
552 | ]
553 | }
554 | ],
555 | "source": [
556 | "loss, accuracy = model.evaluate((x_test1, x_test2), y_test, batch_size=4, verbose=1)\n",
557 | "\n",
558 | "print('loss on test data is', loss)\n",
559 | "print('accuracy on test data is', accuracy)"
560 | ]
561 | },
562 | {
563 | "cell_type": "code",
564 | "execution_count": 13,
565 | "metadata": {
566 | "execution": {
567 | "iopub.execute_input": "2021-12-05T08:55:29.258466Z",
568 | "iopub.status.busy": "2021-12-05T08:55:29.258123Z",
569 | "iopub.status.idle": "2021-12-05T08:55:40.114361Z",
570 | "shell.execute_reply": "2021-12-05T08:55:40.113551Z",
571 | "shell.execute_reply.started": "2021-12-05T08:55:29.258423Z"
572 | }
573 | },
574 | "outputs": [
575 | {
576 | "name": "stdout",
577 | "output_type": "stream",
578 | "text": [
579 | "f1_score on test dataset is 0.6306802145074795\n"
580 | ]
581 | }
582 | ],
583 | "source": [
584 | "pred = model.predict((x_test1, x_test2))\n",
585 | "\n",
586 | "print('f1_score on test dataset is', f1_score(np.argmax(pred, axis=1), np.argmax(y_test, axis=1)))"
587 | ]
588 | },
589 | {
590 | "cell_type": "code",
591 | "execution_count": null,
592 | "metadata": {},
593 | "outputs": [],
594 | "source": []
595 | }
596 | ],
597 | "metadata": {
598 | "kernelspec": {
599 | "display_name": "Python 3 (ipykernel)",
600 | "language": "python",
601 | "name": "python3"
602 | },
603 | "language_info": {
604 | "codemirror_mode": {
605 | "name": "ipython",
606 | "version": 3
607 | },
608 | "file_extension": ".py",
609 | "mimetype": "text/x-python",
610 | "name": "python",
611 | "nbconvert_exporter": "python",
612 | "pygments_lexer": "ipython3",
613 | "version": "3.8.10"
614 | }
615 | },
616 | "nbformat": 4,
617 | "nbformat_minor": 4
618 | }
619 |
--------------------------------------------------------------------------------
/CBOW MLP Ppaer Implementation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "8ca79fa1",
7 | "metadata": {
8 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
9 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
10 | "execution": {
11 | "iopub.execute_input": "2021-12-05T06:41:42.716152Z",
12 | "iopub.status.busy": "2021-12-05T06:41:42.714638Z",
13 | "iopub.status.idle": "2021-12-05T06:41:47.285389Z",
14 | "shell.execute_reply": "2021-12-05T06:41:47.284775Z",
15 | "shell.execute_reply.started": "2021-12-05T06:28:02.081372Z"
16 | },
17 | "papermill": {
18 | "duration": 4.588331,
19 | "end_time": "2021-12-05T06:41:47.285538",
20 | "exception": false,
21 | "start_time": "2021-12-05T06:41:42.697207",
22 | "status": "completed"
23 | },
24 | "tags": []
25 | },
26 | "outputs": [],
27 | "source": [
28 | "import pandas as pd\n",
29 | "import numpy as np\n",
30 | "from tqdm import tqdm\n",
31 | "import tensorflow as tf\n",
32 | "from gensim.models import KeyedVectors\n",
33 | "import gensim\n",
34 | "import re\n",
35 | "from sklearn.metrics import f1_score\n",
36 | "import matplotlib.pyplot as plt"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 2,
42 | "id": "57d8136d",
43 | "metadata": {
44 | "execution": {
45 | "iopub.execute_input": "2021-12-05T06:41:47.314041Z",
46 | "iopub.status.busy": "2021-12-05T06:41:47.313448Z",
47 | "iopub.status.idle": "2021-12-05T06:41:48.775380Z",
48 | "shell.execute_reply": "2021-12-05T06:41:48.774838Z",
49 | "shell.execute_reply.started": "2021-12-05T06:28:06.599367Z"
50 | },
51 | "papermill": {
52 | "duration": 1.478299,
53 | "end_time": "2021-12-05T06:41:48.775521",
54 | "exception": false,
55 | "start_time": "2021-12-05T06:41:47.297222",
56 | "status": "completed"
57 | },
58 | "tags": []
59 | },
60 | "outputs": [],
61 | "source": [
62 | "train = pd.read_csv('../input/quora-ques-pair/test_data.csv/test_data.csv').fillna('')\n",
63 | "val = pd.read_csv('../input/quora-ques-pair/val_data.csv/val_data.csv').fillna('')\n",
64 | "test = pd.read_csv('../input/quora-ques-pair/test_data.csv/test_data.csv').fillna('')"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 3,
70 | "id": "0fb8cffe",
71 | "metadata": {
72 | "execution": {
73 | "iopub.execute_input": "2021-12-05T06:41:48.800775Z",
74 | "iopub.status.busy": "2021-12-05T06:41:48.799943Z",
75 | "iopub.status.idle": "2021-12-05T06:41:48.804283Z",
76 | "shell.execute_reply": "2021-12-05T06:41:48.803802Z",
77 | "shell.execute_reply.started": "2021-12-05T06:28:07.887882Z"
78 | },
79 | "papermill": {
80 | "duration": 0.017602,
81 | "end_time": "2021-12-05T06:41:48.804397",
82 | "exception": false,
83 | "start_time": "2021-12-05T06:41:48.786795",
84 | "status": "completed"
85 | },
86 | "tags": []
87 | },
88 | "outputs": [],
89 | "source": [
90 | "word2vec_file = '../input/d/sandreds/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin'"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": 4,
96 | "id": "0a74cebd",
97 | "metadata": {
98 | "execution": {
99 | "iopub.execute_input": "2021-12-05T06:41:48.834263Z",
100 | "iopub.status.busy": "2021-12-05T06:41:48.833473Z",
101 | "iopub.status.idle": "2021-12-05T06:41:48.846019Z",
102 | "shell.execute_reply": "2021-12-05T06:41:48.846450Z",
103 | "shell.execute_reply.started": "2021-12-05T06:28:07.895005Z"
104 | },
105 | "papermill": {
106 | "duration": 0.031576,
107 | "end_time": "2021-12-05T06:41:48.846572",
108 | "exception": false,
109 | "start_time": "2021-12-05T06:41:48.814996",
110 | "status": "completed"
111 | },
112 | "tags": []
113 | },
114 | "outputs": [
115 | {
116 | "data": {
117 | "text/html": [
118 | "\n",
119 | "\n",
132 | "
\n",
133 | " \n",
134 | " \n",
135 | " | \n",
136 | " id | \n",
137 | " qid1 | \n",
138 | " qid2 | \n",
139 | " question1 | \n",
140 | " question2 | \n",
141 | " is_duplicate | \n",
142 | " question1_preprocessed | \n",
143 | " question2_preprocessed | \n",
144 | "
\n",
145 | " \n",
146 | " \n",
147 | " \n",
148 | " | 0 | \n",
149 | " 204673 | \n",
150 | " 93885 | \n",
151 | " 307635 | \n",
152 | " If there is a God, where is He! | \n",
153 | " Why is god a \"He\"? | \n",
154 | " 0 | \n",
155 | " if there is a god , where is he ! | \n",
156 | " why is god a `` he '' ? | \n",
157 | "
\n",
158 | " \n",
159 | " | 1 | \n",
160 | " 17716 | \n",
161 | " 2093 | \n",
162 | " 15628 | \n",
163 | " Do you believe that everything happens for a r... | \n",
164 | " Does everything happen for a reason? | \n",
165 | " 1 | \n",
166 | " do you believe that everything happens for a r... | \n",
167 | " does everything happen for a reason ? | \n",
168 | "
\n",
169 | " \n",
170 | " | 2 | \n",
171 | " 291767 | \n",
172 | " 352623 | \n",
173 | " 413255 | \n",
174 | " Will there always be web hosting that will sup... | \n",
175 | " Will there always be web hosting that supports... | \n",
176 | " 1 | \n",
177 | " will there always be web hosting that will sup... | \n",
178 | " will there always be web hosting that supports... | \n",
179 | "
\n",
180 | " \n",
181 | " | 3 | \n",
182 | " 203758 | \n",
183 | " 59824 | \n",
184 | " 67971 | \n",
185 | " What is the proof of Indian Army's surgical st... | \n",
186 | " Has India provided any proof of the surgical s... | \n",
187 | " 1 | \n",
188 | " what is the proof of indian army 's surgical s... | \n",
189 | " has india provided any proof of the surgical s... | \n",
190 | "
\n",
191 | " \n",
192 | " | 4 | \n",
193 | " 41747 | \n",
194 | " 75326 | \n",
195 | " 75327 | \n",
196 | " What do Indian Muslims think of Modi? | \n",
197 | " What do Indian Muslim think about PM Narendra ... | \n",
198 | " 1 | \n",
199 | " what do indian muslims think of modi ? | \n",
200 | " what do indian muslim think about pm narendra ... | \n",
201 | "
\n",
202 | " \n",
203 | "
\n",
204 | "
"
205 | ],
206 | "text/plain": [
207 | " id qid1 qid2 question1 \\\n",
208 | "0 204673 93885 307635 If there is a God, where is He! \n",
209 | "1 17716 2093 15628 Do you believe that everything happens for a r... \n",
210 | "2 291767 352623 413255 Will there always be web hosting that will sup... \n",
211 | "3 203758 59824 67971 What is the proof of Indian Army's surgical st... \n",
212 | "4 41747 75326 75327 What do Indian Muslims think of Modi? \n",
213 | "\n",
214 | " question2 is_duplicate \\\n",
215 | "0 Why is god a \"He\"? 0 \n",
216 | "1 Does everything happen for a reason? 1 \n",
217 | "2 Will there always be web hosting that supports... 1 \n",
218 | "3 Has India provided any proof of the surgical s... 1 \n",
219 | "4 What do Indian Muslim think about PM Narendra ... 1 \n",
220 | "\n",
221 | " question1_preprocessed \\\n",
222 | "0 if there is a god , where is he ! \n",
223 | "1 do you believe that everything happens for a r... \n",
224 | "2 will there always be web hosting that will sup... \n",
225 | "3 what is the proof of indian army 's surgical s... \n",
226 | "4 what do indian muslims think of modi ? \n",
227 | "\n",
228 | " question2_preprocessed \n",
229 | "0 why is god a `` he '' ? \n",
230 | "1 does everything happen for a reason ? \n",
231 | "2 will there always be web hosting that supports... \n",
232 | "3 has india provided any proof of the surgical s... \n",
233 | "4 what do indian muslim think about pm narendra ... "
234 | ]
235 | },
236 | "execution_count": 4,
237 | "metadata": {},
238 | "output_type": "execute_result"
239 | }
240 | ],
241 | "source": [
242 | "train.head()"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": 5,
248 | "id": "e183a4fe",
249 | "metadata": {
250 | "execution": {
251 | "iopub.execute_input": "2021-12-05T06:41:48.876516Z",
252 | "iopub.status.busy": "2021-12-05T06:41:48.875896Z",
253 | "iopub.status.idle": "2021-12-05T06:42:50.921199Z",
254 | "shell.execute_reply": "2021-12-05T06:42:50.920691Z",
255 | "shell.execute_reply.started": "2021-12-05T06:28:07.919112Z"
256 | },
257 | "papermill": {
258 | "duration": 62.06346,
259 | "end_time": "2021-12-05T06:42:50.921351",
260 | "exception": false,
261 | "start_time": "2021-12-05T06:41:48.857891",
262 | "status": "completed"
263 | },
264 | "tags": []
265 | },
266 | "outputs": [],
267 | "source": [
268 | "def buildVocabulary(reviews):\n",
269 | " tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False, split=' ')\n",
270 | " tokenizer.fit_on_texts(reviews)\n",
271 | " return tokenizer\n",
272 | "\n",
273 | "def getSequences(reviews, tokenizer, seq_maxlen):\n",
274 | " reviews_seq = tokenizer.texts_to_sequences(reviews)\n",
275 | " return np.array(tf.keras.preprocessing.sequence.pad_sequences(reviews_seq, maxlen=seq_maxlen))\n",
276 | "\n",
277 | "word2vec_model = KeyedVectors.load_word2vec_format(word2vec_file, binary = True)\n",
278 | "\n",
279 | "def getEmbeddingWeightMatrix(word2idx): \n",
280 | " embedding_matrix = np.zeros((len(word2idx)+1, 300))\n",
281 | " for word, i in tqdm(word2idx.items()):\n",
282 | " \n",
283 | " embedding_vector = word2vec_model[word] if word in word2vec_model else np.random.rand(1,300)\n",
284 | " if embedding_vector is not None:\n",
285 | " embedding_matrix[i] = embedding_vector\n",
286 | " return embedding_matrix"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": 6,
292 | "id": "40a3ebc0",
293 | "metadata": {
294 | "execution": {
295 | "iopub.execute_input": "2021-12-05T06:42:50.974813Z",
296 | "iopub.status.busy": "2021-12-05T06:42:50.963194Z",
297 | "iopub.status.idle": "2021-12-05T06:43:04.692817Z",
298 | "shell.execute_reply": "2021-12-05T06:43:04.691844Z",
299 | "shell.execute_reply.started": "2021-12-05T06:29:09.661554Z"
300 | },
301 | "papermill": {
302 | "duration": 13.76014,
303 | "end_time": "2021-12-05T06:43:04.692993",
304 | "exception": false,
305 | "start_time": "2021-12-05T06:42:50.932853",
306 | "status": "completed"
307 | },
308 | "tags": []
309 | },
310 | "outputs": [
311 | {
312 | "name": "stdout",
313 | "output_type": "stream",
314 | "text": [
315 | "67043\n"
316 | ]
317 | }
318 | ],
319 | "source": [
320 | "tokenizer = buildVocabulary(train['question1'].tolist()+train['question2'].tolist()+val['question1'].tolist()+val['question2'].tolist()+test['question1'].tolist()+test['question2'].tolist())\n",
321 | "vocab_size = len(tokenizer.word_index) + 1\n",
322 | "print(vocab_size)\n",
323 | "\n",
324 | "x_train1 = getSequences(train['question1'], tokenizer, 200)\n",
325 | "x_train2 = getSequences(train['question2'], tokenizer, 200)\n",
326 | "y_train = tf.keras.utils.to_categorical(train['is_duplicate'])\n",
327 | "\n",
328 | "x_val1 = getSequences(val['question1'], tokenizer, 200)\n",
329 | "x_val2 = getSequences(val['question2'], tokenizer, 200)\n",
330 | "y_val = tf.keras.utils.to_categorical(val['is_duplicate'])\n",
331 | "\n",
332 | "x_test1 = getSequences(test['question1'], tokenizer, 200)\n",
333 | "x_test2 = getSequences(test['question2'], tokenizer, 200)\n",
334 | "y_test = tf.keras.utils.to_categorical(test['is_duplicate'])"
335 | ]
336 | },
337 | {
338 | "cell_type": "code",
339 | "execution_count": 7,
340 | "id": "61abd026",
341 | "metadata": {
342 | "execution": {
343 | "iopub.execute_input": "2021-12-05T06:43:04.722083Z",
344 | "iopub.status.busy": "2021-12-05T06:43:04.721506Z",
345 | "iopub.status.idle": "2021-12-05T06:43:05.183883Z",
346 | "shell.execute_reply": "2021-12-05T06:43:05.182396Z",
347 | "shell.execute_reply.started": "2021-12-05T06:29:22.925663Z"
348 | },
349 | "papermill": {
350 | "duration": 0.478999,
351 | "end_time": "2021-12-05T06:43:05.184024",
352 | "exception": false,
353 | "start_time": "2021-12-05T06:43:04.705025",
354 | "status": "completed"
355 | },
356 | "tags": []
357 | },
358 | "outputs": [
359 | {
360 | "name": "stderr",
361 | "output_type": "stream",
362 | "text": [
363 | "100%|██████████| 67042/67042 [00:00<00:00, 148076.38it/s]"
364 | ]
365 | },
366 | {
367 | "name": "stdout",
368 | "output_type": "stream",
369 | "text": [
370 | "(67043, 300)\n"
371 | ]
372 | },
373 | {
374 | "name": "stderr",
375 | "output_type": "stream",
376 | "text": [
377 | "\n"
378 | ]
379 | }
380 | ],
381 | "source": [
382 | "#embedding_vectors = loadGloveWordEmbeddings()\n",
383 | "#print(len(embedding_vectors))\n",
384 | "\n",
385 | "embedding_weight_matrix = getEmbeddingWeightMatrix(tokenizer.word_index)\n",
386 | "print(embedding_weight_matrix.shape)"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": 8,
392 | "id": "59b405dc",
393 | "metadata": {
394 | "execution": {
395 | "iopub.execute_input": "2021-12-05T06:43:05.221581Z",
396 | "iopub.status.busy": "2021-12-05T06:43:05.221015Z",
397 | "iopub.status.idle": "2021-12-05T06:43:07.840042Z",
398 | "shell.execute_reply": "2021-12-05T06:43:07.839136Z",
399 | "shell.execute_reply.started": "2021-12-05T06:29:23.399993Z"
400 | },
401 | "papermill": {
402 | "duration": 2.642422,
403 | "end_time": "2021-12-05T06:43:07.840177",
404 | "exception": false,
405 | "start_time": "2021-12-05T06:43:05.197755",
406 | "status": "completed"
407 | },
408 | "tags": []
409 | },
410 | "outputs": [
411 | {
412 | "name": "stderr",
413 | "output_type": "stream",
414 | "text": [
415 | "2021-12-05 06:43:05.301034: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
416 | "2021-12-05 06:43:05.400333: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
417 | "2021-12-05 06:43:05.401024: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
418 | "2021-12-05 06:43:05.402338: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA\n",
419 | "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
420 | "2021-12-05 06:43:05.403313: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
421 | "2021-12-05 06:43:05.404059: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
422 | "2021-12-05 06:43:05.404737: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
423 | "2021-12-05 06:43:07.213294: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
424 | "2021-12-05 06:43:07.214235: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
425 | "2021-12-05 06:43:07.215011: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
426 | "2021-12-05 06:43:07.215600: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15403 MB memory: -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0\n"
427 | ]
428 | }
429 | ],
430 | "source": [
431 | "#he_initializer = tf.keras.initializers.HeUniform()\n",
432 | "inp1 = tf.keras.Input(shape=(x_train1.shape[1],))\n",
433 | "inp2 = tf.keras.Input(shape=(x_train2.shape[1],))\n",
434 | "\n",
435 | "inner1= tf.keras.layers.Embedding(input_dim=67043, output_dim=300, input_length=200, \n",
436 | " weights=[embedding_weight_matrix], trainable=True)(inp1)\n",
437 | "inner2= tf.keras.layers.Embedding(input_dim=67043, output_dim=300, input_length=200,\n",
438 | " weights=[embedding_weight_matrix], trainable=True)(inp2)\n",
439 | "\n",
440 | "\n",
441 | "#inner = tf.keras.layers.concatenate([inner1+inner2, inner1-inner2, tf.math.multiply(inner1, inner2)], axis=-1)\n",
442 | "\n",
443 | "inner1 = tf.keras.backend.sum(inner1, axis=1, keepdims=False)\n",
444 | "inner2 = tf.keras.backend.sum(inner2, axis=1, keepdims=False)\n",
445 | "inner = inner1+inner2\n",
446 | "inner = tf.keras.layers.Dense(200, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01))(inner)\n",
447 | "inner = tf.keras.layers.Dropout(0.2)(inner)\n",
448 | "inner = tf.keras.layers.Dense(120, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01))(inner)\n",
449 | "inner = tf.keras.layers.Dropout(0.2)(inner)\n",
450 | "inner = tf.keras.layers.Dense(60, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01))(inner)\n",
451 | "inner = tf.keras.layers.Dropout(0.2)(inner)\n",
452 | "output = tf.keras.layers.Dense(2, activation='softmax')(inner)\n",
453 | "\n",
454 | "model = tf.keras.Model(inputs = [inp1, inp2], outputs = output)"
455 | ]
456 | },
457 | {
458 | "cell_type": "code",
459 | "execution_count": 9,
460 | "id": "7eab1687",
461 | "metadata": {
462 | "execution": {
463 | "iopub.execute_input": "2021-12-05T06:43:07.876275Z",
464 | "iopub.status.busy": "2021-12-05T06:43:07.875614Z",
465 | "iopub.status.idle": "2021-12-05T06:43:07.891008Z",
466 | "shell.execute_reply": "2021-12-05T06:43:07.890387Z",
467 | "shell.execute_reply.started": "2021-12-05T06:29:26.103961Z"
468 | },
469 | "papermill": {
470 | "duration": 0.036541,
471 | "end_time": "2021-12-05T06:43:07.891138",
472 | "exception": false,
473 | "start_time": "2021-12-05T06:43:07.854597",
474 | "status": "completed"
475 | },
476 | "tags": []
477 | },
478 | "outputs": [
479 | {
480 | "name": "stdout",
481 | "output_type": "stream",
482 | "text": [
483 | "Model: \"model\"\n",
484 | "__________________________________________________________________________________________________\n",
485 | "Layer (type) Output Shape Param # Connected to \n",
486 | "==================================================================================================\n",
487 | "input_1 (InputLayer) [(None, 200)] 0 \n",
488 | "__________________________________________________________________________________________________\n",
489 | "input_2 (InputLayer) [(None, 200)] 0 \n",
490 | "__________________________________________________________________________________________________\n",
491 | "embedding (Embedding) (None, 200, 300) 20112900 input_1[0][0] \n",
492 | "__________________________________________________________________________________________________\n",
493 | "embedding_1 (Embedding) (None, 200, 300) 20112900 input_2[0][0] \n",
494 | "__________________________________________________________________________________________________\n",
495 | "tf.math.reduce_sum (TFOpLambda) (None, 300) 0 embedding[0][0] \n",
496 | "__________________________________________________________________________________________________\n",
497 | "tf.math.reduce_sum_1 (TFOpLambd (None, 300) 0 embedding_1[0][0] \n",
498 | "__________________________________________________________________________________________________\n",
499 | "tf.__operators__.add (TFOpLambd (None, 300) 0 tf.math.reduce_sum[0][0] \n",
500 | " tf.math.reduce_sum_1[0][0] \n",
501 | "__________________________________________________________________________________________________\n",
502 | "dense (Dense) (None, 200) 60200 tf.__operators__.add[0][0] \n",
503 | "__________________________________________________________________________________________________\n",
504 | "dropout (Dropout) (None, 200) 0 dense[0][0] \n",
505 | "__________________________________________________________________________________________________\n",
506 | "dense_1 (Dense) (None, 120) 24120 dropout[0][0] \n",
507 | "__________________________________________________________________________________________________\n",
508 | "dropout_1 (Dropout) (None, 120) 0 dense_1[0][0] \n",
509 | "__________________________________________________________________________________________________\n",
510 | "dense_2 (Dense) (None, 60) 7260 dropout_1[0][0] \n",
511 | "__________________________________________________________________________________________________\n",
512 | "dropout_2 (Dropout) (None, 60) 0 dense_2[0][0] \n",
513 | "__________________________________________________________________________________________________\n",
514 | "dense_3 (Dense) (None, 2) 122 dropout_2[0][0] \n",
515 | "==================================================================================================\n",
516 | "Total params: 40,317,502\n",
517 | "Trainable params: 40,317,502\n",
518 | "Non-trainable params: 0\n",
519 | "__________________________________________________________________________________________________\n"
520 | ]
521 | }
522 | ],
523 | "source": [
524 | "model.compile(optimizer = \"adam\", loss = 'categorical_crossentropy', metrics=['accuracy'])\n",
525 | "model.summary()"
526 | ]
527 | },
528 | {
529 | "cell_type": "code",
530 | "execution_count": 10,
531 | "id": "0aae2f11",
532 | "metadata": {
533 | "execution": {
534 | "iopub.execute_input": "2021-12-05T06:43:07.925895Z",
535 | "iopub.status.busy": "2021-12-05T06:43:07.925115Z",
536 | "iopub.status.idle": "2021-12-05T06:44:04.257889Z",
537 | "shell.execute_reply": "2021-12-05T06:44:04.258447Z",
538 | "shell.execute_reply.started": "2021-12-05T06:29:26.126805Z"
539 | },
540 | "papermill": {
541 | "duration": 56.353085,
542 | "end_time": "2021-12-05T06:44:04.258638",
543 | "exception": false,
544 | "start_time": "2021-12-05T06:43:07.905553",
545 | "status": "completed"
546 | },
547 | "tags": []
548 | },
549 | "outputs": [
550 | {
551 | "name": "stderr",
552 | "output_type": "stream",
553 | "text": [
554 | "2021-12-05 06:43:08.036642: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)\n"
555 | ]
556 | },
557 | {
558 | "name": "stdout",
559 | "output_type": "stream",
560 | "text": [
561 | "Epoch 1/4\n",
562 | "632/632 [==============================] - 14s 20ms/step - loss: 1.5897 - accuracy: 0.6573 - val_loss: 0.6889 - val_accuracy: 0.6883\n",
563 | "Epoch 2/4\n",
564 | "632/632 [==============================] - 13s 20ms/step - loss: 0.5678 - accuracy: 0.7383 - val_loss: 0.5906 - val_accuracy: 0.7223\n",
565 | "Epoch 3/4\n",
566 | "632/632 [==============================] - 12s 19ms/step - loss: 0.4759 - accuracy: 0.7885 - val_loss: 0.5830 - val_accuracy: 0.7369\n",
567 | "Epoch 4/4\n",
568 | "632/632 [==============================] - 12s 19ms/step - loss: 0.4199 - accuracy: 0.8166 - val_loss: 0.6234 - val_accuracy: 0.7322\n"
569 | ]
570 | }
571 | ],
572 | "source": [
573 | "save_weights = tf.keras.callbacks.ModelCheckpoint('cbow_mlp.h5', monitor='val_loss', save_best_only=True)\n",
574 | "checkpoint_filepath = 'weights.best.{epoch:01d}.hdf5'\n",
575 | "#model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,\n",
576 | "#verbose = 1,\n",
577 | "#monitor = 'val_loss',\n",
578 | "#save_best_only = False)\n",
579 | "history = model.fit((x_train1, x_train2), y_train,\n",
580 | " batch_size = 64,\n",
581 | " validation_data = ((x_val1, x_val2), y_val),\n",
582 | " validation_batch_size = 64,\n",
583 | " epochs=4, \n",
584 | " callbacks=[save_weights], \n",
585 | " verbose=1)"
586 | ]
587 | },
588 | {
589 | "cell_type": "code",
590 | "execution_count": 13,
591 | "id": "6195ba4e",
592 | "metadata": {
593 | "execution": {
594 | "iopub.execute_input": "2021-12-05T06:44:05.863472Z",
595 | "iopub.status.busy": "2021-12-05T06:44:05.862509Z",
596 | "iopub.status.idle": "2021-12-05T06:44:28.229271Z",
597 | "shell.execute_reply": "2021-12-05T06:44:28.229695Z",
598 | "shell.execute_reply.started": "2021-12-05T06:40:02.762209Z"
599 | },
600 | "papermill": {
601 | "duration": 22.559635,
602 | "end_time": "2021-12-05T06:44:28.229842",
603 | "exception": false,
604 | "start_time": "2021-12-05T06:44:05.670207",
605 | "status": "completed"
606 | },
607 | "tags": []
608 | },
609 | "outputs": [
610 | {
611 | "name": "stdout",
612 | "output_type": "stream",
613 | "text": [
614 | "10108/10108 [==============================] - 22s 2ms/step - loss: 0.3327 - accuracy: 0.8621\n",
615 | "loss on test data is 0.3327052593231201\n",
616 | "accuracy on test data is 0.8620792031288147\n"
617 | ]
618 | }
619 | ],
620 | "source": [
621 | "loss, accuracy = model.evaluate((x_test1, x_test2), y_test, batch_size=4, verbose=1)\n",
622 | "\n",
623 | "print('loss on test data is', loss)\n",
624 | "print('accuracy on test data is', accuracy)"
625 | ]
626 | },
627 | {
628 | "cell_type": "code",
629 | "execution_count": 14,
630 | "id": "3d4078c0",
631 | "metadata": {
632 | "execution": {
633 | "iopub.execute_input": "2021-12-05T06:44:28.830291Z",
634 | "iopub.status.busy": "2021-12-05T06:44:28.829339Z",
635 | "iopub.status.idle": "2021-12-05T06:44:31.520669Z",
636 | "shell.execute_reply": "2021-12-05T06:44:31.521533Z",
637 | "shell.execute_reply.started": "2021-12-05T06:40:27.475503Z"
638 | },
639 | "papermill": {
640 | "duration": 2.995351,
641 | "end_time": "2021-12-05T06:44:31.521775",
642 | "exception": false,
643 | "start_time": "2021-12-05T06:44:28.526424",
644 | "status": "completed"
645 | },
646 | "tags": []
647 | },
648 | "outputs": [
649 | {
650 | "name": "stdout",
651 | "output_type": "stream",
652 | "text": [
653 | "F1_score on test is 0.7974573192880495\n"
654 | ]
655 | }
656 | ],
657 | "source": [
658 | "pred = model.predict((x_test1, x_test2))\n",
659 | "print('F1_score on test is', f1_score(np.argmax(pred, axis=1), np.argmax(y_test, axis=1)))\n"
660 | ]
661 | },
662 | {
663 | "cell_type": "code",
664 | "execution_count": null,
665 | "id": "b54608e8",
666 | "metadata": {
667 | "papermill": {
668 | "duration": 0.488623,
669 | "end_time": "2021-12-05T06:44:32.446218",
670 | "exception": false,
671 | "start_time": "2021-12-05T06:44:31.957595",
672 | "status": "completed"
673 | },
674 | "tags": []
675 | },
676 | "outputs": [],
677 | "source": []
678 | }
679 | ],
680 | "metadata": {
681 | "kernelspec": {
682 | "display_name": "Python 3",
683 | "language": "python",
684 | "name": "python3"
685 | },
686 | "language_info": {
687 | "codemirror_mode": {
688 | "name": "ipython",
689 | "version": 3
690 | },
691 | "file_extension": ".py",
692 | "mimetype": "text/x-python",
693 | "name": "python",
694 | "nbconvert_exporter": "python",
695 | "pygments_lexer": "ipython3",
696 | "version": "3.8.8"
697 | },
698 | "papermill": {
699 | "default_parameters": {},
700 | "duration": 181.391801,
701 | "end_time": "2021-12-05T06:44:36.786152",
702 | "environment_variables": {},
703 | "exception": null,
704 | "input_path": "__notebook__.ipynb",
705 | "output_path": "__notebook__.ipynb",
706 | "parameters": {},
707 | "start_time": "2021-12-05T06:41:35.394351",
708 | "version": "2.3.3"
709 | }
710 | },
711 | "nbformat": 4,
712 | "nbformat_minor": 5
713 | }
714 |
--------------------------------------------------------------------------------
/glove-lstm_paper_experiment1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "e8f714bf",
7 | "metadata": {
8 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
9 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
10 | "execution": {
11 | "iopub.execute_input": "2021-11-27T17:31:54.134744Z",
12 | "iopub.status.busy": "2021-11-27T17:31:54.132593Z",
13 | "iopub.status.idle": "2021-11-27T17:31:59.374583Z",
14 | "shell.execute_reply": "2021-11-27T17:31:59.373668Z",
15 | "shell.execute_reply.started": "2021-11-27T17:18:16.053325Z"
16 | },
17 | "papermill": {
18 | "duration": 5.264768,
19 | "end_time": "2021-11-27T17:31:59.374813",
20 | "exception": false,
21 | "start_time": "2021-11-27T17:31:54.110045",
22 | "status": "completed"
23 | },
24 | "tags": []
25 | },
26 | "outputs": [],
27 | "source": [
28 | "import pandas as pd\n",
29 | "import numpy as np\n",
30 | "from tqdm import tqdm\n",
31 | "import tensorflow as tf\n",
32 | "from sklearn.metrics import f1_score"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 2,
38 | "id": "674aa0b8",
39 | "metadata": {
40 | "execution": {
41 | "iopub.execute_input": "2021-11-27T17:31:59.409729Z",
42 | "iopub.status.busy": "2021-11-27T17:31:59.408970Z",
43 | "iopub.status.idle": "2021-11-27T17:32:02.909462Z",
44 | "shell.execute_reply": "2021-11-27T17:32:02.908882Z",
45 | "shell.execute_reply.started": "2021-11-27T17:18:26.878800Z"
46 | },
47 | "papermill": {
48 | "duration": 3.520358,
49 | "end_time": "2021-11-27T17:32:02.909614",
50 | "exception": false,
51 | "start_time": "2021-11-27T17:31:59.389256",
52 | "status": "completed"
53 | },
54 | "tags": []
55 | },
56 | "outputs": [],
57 | "source": [
58 | "train = pd.read_csv('../input/smai-project-data/train_data.csv').fillna('')\n",
59 | "val = pd.read_csv('../input/smai-project-data/val_data.csv').fillna('')\n",
60 | "test = pd.read_csv('../input/smai-project-data/test_data.csv').fillna('')"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 3,
66 | "id": "0f8bc2ba",
67 | "metadata": {
68 | "execution": {
69 | "iopub.execute_input": "2021-11-27T17:32:02.947083Z",
70 | "iopub.status.busy": "2021-11-27T17:32:02.946007Z",
71 | "iopub.status.idle": "2021-11-27T17:32:02.962532Z",
72 | "shell.execute_reply": "2021-11-27T17:32:02.963124Z",
73 | "shell.execute_reply.started": "2021-11-27T17:18:30.138188Z"
74 | },
75 | "papermill": {
76 | "duration": 0.040354,
77 | "end_time": "2021-11-27T17:32:02.963310",
78 | "exception": false,
79 | "start_time": "2021-11-27T17:32:02.922956",
80 | "status": "completed"
81 | },
82 | "tags": []
83 | },
84 | "outputs": [
85 | {
86 | "data": {
87 | "text/html": [
88 | "\n",
89 | "\n",
102 | "
\n",
103 | " \n",
104 | " \n",
105 | " | \n",
106 | " id | \n",
107 | " qid1 | \n",
108 | " qid2 | \n",
109 | " question1 | \n",
110 | " question2 | \n",
111 | " is_duplicate | \n",
112 | " question1_preprocessed | \n",
113 | " question2_preprocessed | \n",
114 | "
\n",
115 | " \n",
116 | " \n",
117 | " \n",
118 | " | 0 | \n",
119 | " 8067 | \n",
120 | " 15738 | \n",
121 | " 15739 | \n",
122 | " How do I play Pokémon GO in Korea? | \n",
123 | " How do I play Pokémon GO in China? | \n",
124 | " 0 | \n",
125 | " how do i play pok mon go in korea ? | \n",
126 | " how do i play pok mon go in china ? | \n",
127 | "
\n",
128 | " \n",
129 | " | 1 | \n",
130 | " 368101 | \n",
131 | " 12736 | \n",
132 | " 104117 | \n",
133 | " What are some of the best side dishes for crab... | \n",
134 | " What are some good side dishes for buffalo chi... | \n",
135 | " 0 | \n",
136 | " what are some of the best side dishes for crab... | \n",
137 | " what are some good side dishes for buffalo chi... | \n",
138 | "
\n",
139 | " \n",
140 | " | 2 | \n",
141 | " 70497 | \n",
142 | " 121486 | \n",
143 | " 121487 | \n",
144 | " Which is more advisable and better material fo... | \n",
145 | " What is the best server setup for buddypress? | \n",
146 | " 0 | \n",
147 | " which is more advisable and better material fo... | \n",
148 | " what is the best server setup for buddypress ? | \n",
149 | "
\n",
150 | " \n",
151 | " | 3 | \n",
152 | " 226567 | \n",
153 | " 254474 | \n",
154 | " 258192 | \n",
155 | " How do I improve logical programming skills? | \n",
156 | " How can I improve my logical skills for progra... | \n",
157 | " 1 | \n",
158 | " how do i improve logical programming skills ? | \n",
159 | " how can i improve my logical skills for progra... | \n",
160 | "
\n",
161 | " \n",
162 | " | 4 | \n",
163 | " 73186 | \n",
164 | " 48103 | \n",
165 | " 3062 | \n",
166 | " How close we are to see 3rd world war? | \n",
167 | " How close is a World War III? | \n",
168 | " 1 | \n",
169 | " how close we are to see 3rd world war ? | \n",
170 | " how close is a world war iii ? | \n",
171 | "
\n",
172 | " \n",
173 | "
\n",
174 | "
"
175 | ],
176 | "text/plain": [
177 | " id qid1 qid2 question1 \\\n",
178 | "0 8067 15738 15739 How do I play Pokémon GO in Korea? \n",
179 | "1 368101 12736 104117 What are some of the best side dishes for crab... \n",
180 | "2 70497 121486 121487 Which is more advisable and better material fo... \n",
181 | "3 226567 254474 258192 How do I improve logical programming skills? \n",
182 | "4 73186 48103 3062 How close we are to see 3rd world war? \n",
183 | "\n",
184 | " question2 is_duplicate \\\n",
185 | "0 How do I play Pokémon GO in China? 0 \n",
186 | "1 What are some good side dishes for buffalo chi... 0 \n",
187 | "2 What is the best server setup for buddypress? 0 \n",
188 | "3 How can I improve my logical skills for progra... 1 \n",
189 | "4 How close is a World War III? 1 \n",
190 | "\n",
191 | " question1_preprocessed \\\n",
192 | "0 how do i play pok mon go in korea ? \n",
193 | "1 what are some of the best side dishes for crab... \n",
194 | "2 which is more advisable and better material fo... \n",
195 | "3 how do i improve logical programming skills ? \n",
196 | "4 how close we are to see 3rd world war ? \n",
197 | "\n",
198 | " question2_preprocessed \n",
199 | "0 how do i play pok mon go in china ? \n",
200 | "1 what are some good side dishes for buffalo chi... \n",
201 | "2 what is the best server setup for buddypress ? \n",
202 | "3 how can i improve my logical skills for progra... \n",
203 | "4 how close is a world war iii ? "
204 | ]
205 | },
206 | "execution_count": 3,
207 | "metadata": {},
208 | "output_type": "execute_result"
209 | }
210 | ],
211 | "source": [
212 | "train.head()"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": 4,
218 | "id": "3789630a",
219 | "metadata": {
220 | "execution": {
221 | "iopub.execute_input": "2021-11-27T17:32:03.002462Z",
222 | "iopub.status.busy": "2021-11-27T17:32:03.001753Z",
223 | "iopub.status.idle": "2021-11-27T17:32:03.006592Z",
224 | "shell.execute_reply": "2021-11-27T17:32:03.006031Z",
225 | "shell.execute_reply.started": "2021-11-27T17:18:30.162922Z"
226 | },
227 | "papermill": {
228 | "duration": 0.029215,
229 | "end_time": "2021-11-27T17:32:03.006773",
230 | "exception": false,
231 | "start_time": "2021-11-27T17:32:02.977558",
232 | "status": "completed"
233 | },
234 | "tags": []
235 | },
236 | "outputs": [],
237 | "source": [
238 | "def buildVocabulary(reviews):\n",
239 | " tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False, split=' ')\n",
240 | " tokenizer.fit_on_texts(reviews)\n",
241 | " return tokenizer\n",
242 | "\n",
243 | "def getSequences(reviews, tokenizer, seq_maxlen):\n",
244 | " reviews_seq = tokenizer.texts_to_sequences(reviews)\n",
245 | " return np.array(tf.keras.preprocessing.sequence.pad_sequences(reviews_seq, maxlen=seq_maxlen))\n",
246 | "\n",
247 | "def loadGloveWordEmbeddings():\n",
248 | " embedding_vectors = {}\n",
249 | " with open('../input/glove840b300dtxt/glove.840B.300d.txt') as f:\n",
250 | " for line in tqdm(f):\n",
251 | " values = line.split(' ')\n",
252 | " word = values[0]\n",
253 | " coefs = np.asarray(values[1:], dtype='float32')\n",
254 | " embedding_vectors[word] = coefs\n",
255 | " return embedding_vectors\n",
256 | "\n",
257 | "def getEmbeddingWeightMatrix(embedding_vectors, word2idx): \n",
258 | " embedding_matrix = np.zeros((len(word2idx)+1, 300))\n",
259 | " for word, i in tqdm(word2idx.items()):\n",
260 | " embedding_vector = embedding_vectors.get(word)\n",
261 | " if embedding_vector is not None:\n",
262 | " embedding_matrix[i] = embedding_vector\n",
263 | " return embedding_matrix"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": 5,
269 | "id": "33f503b8",
270 | "metadata": {
271 | "execution": {
272 | "iopub.execute_input": "2021-11-27T17:32:03.086757Z",
273 | "iopub.status.busy": "2021-11-27T17:32:03.043126Z",
274 | "iopub.status.idle": "2021-11-27T17:32:42.350295Z",
275 | "shell.execute_reply": "2021-11-27T17:32:42.349686Z",
276 | "shell.execute_reply.started": "2021-11-27T17:18:30.175848Z"
277 | },
278 | "papermill": {
279 | "duration": 39.329657,
280 | "end_time": "2021-11-27T17:32:42.350460",
281 | "exception": false,
282 | "start_time": "2021-11-27T17:32:03.020803",
283 | "status": "completed"
284 | },
285 | "tags": []
286 | },
287 | "outputs": [
288 | {
289 | "name": "stdout",
290 | "output_type": "stream",
291 | "text": [
292 | "119558\n"
293 | ]
294 | }
295 | ],
296 | "source": [
297 | "tokenizer = buildVocabulary(train['question1'].tolist()+train['question2'].tolist()+val['question1'].tolist()+val['question2'].tolist()+test['question1'].tolist()+test['question2'].tolist())\n",
298 | "vocab_size = len(tokenizer.word_index) + 1\n",
299 | "print(vocab_size)\n",
300 | "\n",
301 | "x_train1 = getSequences(train['question1'], tokenizer, 128)\n",
302 | "x_train2 = getSequences(train['question2'], tokenizer, 128)\n",
303 | "y_train = tf.keras.utils.to_categorical(train['is_duplicate'])\n",
304 | "\n",
305 | "x_val1 = getSequences(val['question1'], tokenizer, 128)\n",
306 | "x_val2 = getSequences(val['question2'], tokenizer, 128)\n",
307 | "y_val = tf.keras.utils.to_categorical(val['is_duplicate'])\n",
308 | "\n",
309 | "x_test1 = getSequences(test['question1'], tokenizer, 128)\n",
310 | "x_test2 = getSequences(test['question2'], tokenizer, 128)\n",
311 | "y_test = tf.keras.utils.to_categorical(test['is_duplicate'])"
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": 6,
317 | "id": "bdffb7fe",
318 | "metadata": {
319 | "execution": {
320 | "iopub.execute_input": "2021-11-27T17:32:42.385156Z",
321 | "iopub.status.busy": "2021-11-27T17:32:42.384482Z",
322 | "iopub.status.idle": "2021-11-27T17:37:39.763269Z",
323 | "shell.execute_reply": "2021-11-27T17:37:39.761448Z",
324 | "shell.execute_reply.started": "2021-11-27T17:19:01.967098Z"
325 | },
326 | "papermill": {
327 | "duration": 297.397774,
328 | "end_time": "2021-11-27T17:37:39.763433",
329 | "exception": false,
330 | "start_time": "2021-11-27T17:32:42.365659",
331 | "status": "completed"
332 | },
333 | "tags": []
334 | },
335 | "outputs": [
336 | {
337 | "name": "stderr",
338 | "output_type": "stream",
339 | "text": [
340 | "2196018it [04:56, 7397.00it/s]\n"
341 | ]
342 | },
343 | {
344 | "name": "stdout",
345 | "output_type": "stream",
346 | "text": [
347 | "2196017\n"
348 | ]
349 | },
350 | {
351 | "name": "stderr",
352 | "output_type": "stream",
353 | "text": [
354 | "100%|██████████| 119557/119557 [00:00<00:00, 255058.70it/s]"
355 | ]
356 | },
357 | {
358 | "name": "stdout",
359 | "output_type": "stream",
360 | "text": [
361 | "(119558, 300)\n"
362 | ]
363 | },
364 | {
365 | "name": "stderr",
366 | "output_type": "stream",
367 | "text": [
368 | "\n"
369 | ]
370 | }
371 | ],
372 | "source": [
373 | "embedding_vectors = loadGloveWordEmbeddings()\n",
374 | "print(len(embedding_vectors))\n",
375 | "\n",
376 | "embedding_weight_matrix = getEmbeddingWeightMatrix(embedding_vectors, tokenizer.word_index)\n",
377 | "print(embedding_weight_matrix.shape)"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": 7,
383 | "id": "e95c5bc7",
384 | "metadata": {
385 | "execution": {
386 | "iopub.execute_input": "2021-11-27T17:37:41.733535Z",
387 | "iopub.status.busy": "2021-11-27T17:37:41.732702Z",
388 | "iopub.status.idle": "2021-11-27T17:37:45.831178Z",
389 | "shell.execute_reply": "2021-11-27T17:37:45.831708Z",
390 | "shell.execute_reply.started": "2021-11-27T17:23:23.346592Z"
391 | },
392 | "papermill": {
393 | "duration": 5.091622,
394 | "end_time": "2021-11-27T17:37:45.831935",
395 | "exception": false,
396 | "start_time": "2021-11-27T17:37:40.740313",
397 | "status": "completed"
398 | },
399 | "tags": []
400 | },
401 | "outputs": [
402 | {
403 | "name": "stderr",
404 | "output_type": "stream",
405 | "text": [
406 | "2021-11-27 17:37:41.835873: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
407 | "2021-11-27 17:37:41.961540: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
408 | "2021-11-27 17:37:41.962835: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
409 | "2021-11-27 17:37:41.965578: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
410 | "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
411 | "2021-11-27 17:37:41.966902: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
412 | "2021-11-27 17:37:41.968136: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
413 | "2021-11-27 17:37:41.969191: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
414 | "2021-11-27 17:37:44.394226: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
415 | "2021-11-27 17:37:44.395501: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
416 | "2021-11-27 17:37:44.396537: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
417 | "2021-11-27 17:37:44.398460: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15403 MB memory: -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0\n",
418 | "2021-11-27 17:37:45.013946: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 143469600 exceeds 10% of free system memory.\n",
419 | "2021-11-27 17:37:45.271914: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 143469600 exceeds 10% of free system memory.\n"
420 | ]
421 | }
422 | ],
423 | "source": [
424 | "inp1 = tf.keras.Input(shape=(x_train1.shape[1],))\n",
425 | "inp2 = tf.keras.Input(shape=(x_train2.shape[1],))\n",
426 | "\n",
427 | "inner1= tf.keras.layers.Embedding(input_dim=119558, output_dim=300, input_length=128, \n",
428 | " weights=[embedding_weight_matrix], trainable=False)(inp1)\n",
429 | "inner2= tf.keras.layers.Embedding(input_dim=119558, output_dim=300, input_length=128,\n",
430 | " weights=[embedding_weight_matrix], trainable=False)(inp2)\n",
431 | "\n",
432 | "inner = tf.keras.layers.concatenate([inner1+inner2, inner1-inner2, tf.math.multiply(inner1, inner2)], axis=-1)\n",
433 | "\n",
434 | "out, h, c = tf.keras.layers.LSTM(200, return_sequences=False, return_state=True)(inner)\n",
435 | "\n",
436 | "output = tf.keras.layers.Dense(2, activation='softmax')(c)\n",
437 | "\n",
438 | "model = tf.keras.Model(inputs = [inp1, inp2], outputs = output)"
439 | ]
440 | },
441 | {
442 | "cell_type": "code",
443 | "execution_count": 8,
444 | "id": "3880b605",
445 | "metadata": {
446 | "execution": {
447 | "iopub.execute_input": "2021-11-27T17:37:48.855149Z",
448 | "iopub.status.busy": "2021-11-27T17:37:48.853991Z",
449 | "iopub.status.idle": "2021-11-27T17:37:48.878756Z",
450 | "shell.execute_reply": "2021-11-27T17:37:48.878201Z",
451 | "shell.execute_reply.started": "2021-11-27T17:23:26.835969Z"
452 | },
453 | "papermill": {
454 | "duration": 2.054917,
455 | "end_time": "2021-11-27T17:37:48.878932",
456 | "exception": false,
457 | "start_time": "2021-11-27T17:37:46.824015",
458 | "status": "completed"
459 | },
460 | "tags": []
461 | },
462 | "outputs": [
463 | {
464 | "name": "stdout",
465 | "output_type": "stream",
466 | "text": [
467 | "Model: \"model\"\n",
468 | "__________________________________________________________________________________________________\n",
469 | "Layer (type) Output Shape Param # Connected to \n",
470 | "==================================================================================================\n",
471 | "input_1 (InputLayer) [(None, 128)] 0 \n",
472 | "__________________________________________________________________________________________________\n",
473 | "input_2 (InputLayer) [(None, 128)] 0 \n",
474 | "__________________________________________________________________________________________________\n",
475 | "embedding (Embedding) (None, 128, 300) 35867400 input_1[0][0] \n",
476 | "__________________________________________________________________________________________________\n",
477 | "embedding_1 (Embedding) (None, 128, 300) 35867400 input_2[0][0] \n",
478 | "__________________________________________________________________________________________________\n",
479 | "tf.__operators__.add (TFOpLambd (None, 128, 300) 0 embedding[0][0] \n",
480 | " embedding_1[0][0] \n",
481 | "__________________________________________________________________________________________________\n",
482 | "tf.math.subtract (TFOpLambda) (None, 128, 300) 0 embedding[0][0] \n",
483 | " embedding_1[0][0] \n",
484 | "__________________________________________________________________________________________________\n",
485 | "tf.math.multiply (TFOpLambda) (None, 128, 300) 0 embedding[0][0] \n",
486 | " embedding_1[0][0] \n",
487 | "__________________________________________________________________________________________________\n",
488 | "concatenate (Concatenate) (None, 128, 900) 0 tf.__operators__.add[0][0] \n",
489 | " tf.math.subtract[0][0] \n",
490 | " tf.math.multiply[0][0] \n",
491 | "__________________________________________________________________________________________________\n",
492 | "lstm (LSTM) [(None, 200), (None, 880800 concatenate[0][0] \n",
493 | "__________________________________________________________________________________________________\n",
494 | "dense (Dense) (None, 2) 402 lstm[0][2] \n",
495 | "==================================================================================================\n",
496 | "Total params: 72,616,002\n",
497 | "Trainable params: 881,202\n",
498 | "Non-trainable params: 71,734,800\n",
499 | "__________________________________________________________________________________________________\n"
500 | ]
501 | }
502 | ],
503 | "source": [
504 | "model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics=['accuracy'])\n",
505 | "model.summary()"
506 | ]
507 | },
508 | {
509 | "cell_type": "code",
510 | "execution_count": 9,
511 | "id": "1b7808b5",
512 | "metadata": {
513 | "execution": {
514 | "iopub.execute_input": "2021-11-27T17:37:50.886838Z",
515 | "iopub.status.busy": "2021-11-27T17:37:50.885717Z",
516 | "iopub.status.idle": "2021-11-27T17:46:15.153633Z",
517 | "shell.execute_reply": "2021-11-27T17:46:15.154306Z",
518 | "shell.execute_reply.started": "2021-11-27T17:23:26.858668Z"
519 | },
520 | "papermill": {
521 | "duration": 505.288706,
522 | "end_time": "2021-11-27T17:46:15.154505",
523 | "exception": false,
524 | "start_time": "2021-11-27T17:37:49.865799",
525 | "status": "completed"
526 | },
527 | "tags": []
528 | },
529 | "outputs": [
530 | {
531 | "name": "stderr",
532 | "output_type": "stream",
533 | "text": [
534 | "2021-11-27 17:37:50.888003: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 144897536 exceeds 10% of free system memory.\n",
535 | "2021-11-27 17:37:51.004600: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 144897536 exceeds 10% of free system memory.\n",
536 | "2021-11-27 17:37:51.188913: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)\n"
537 | ]
538 | },
539 | {
540 | "name": "stdout",
541 | "output_type": "stream",
542 | "text": [
543 | "Epoch 1/3\n"
544 | ]
545 | },
546 | {
547 | "name": "stderr",
548 | "output_type": "stream",
549 | "text": [
550 | "2021-11-27 17:37:54.166583: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005\n"
551 | ]
552 | },
553 | {
554 | "name": "stdout",
555 | "output_type": "stream",
556 | "text": [
557 | "8841/8844 [============================>.] - ETA: 0s - loss: 0.4669 - accuracy: 0.7735"
558 | ]
559 | },
560 | {
561 | "name": "stderr",
562 | "output_type": "stream",
563 | "text": [
564 | "2021-11-27 17:40:06.040932: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 41399296 exceeds 10% of free system memory.\n"
565 | ]
566 | },
567 | {
568 | "name": "stdout",
569 | "output_type": "stream",
570 | "text": [
571 | "8844/8844 [==============================] - 164s 18ms/step - loss: 0.4669 - accuracy: 0.7735 - val_loss: 0.4332 - val_accuracy: 0.7948\n",
572 | "\n",
573 | "Epoch 00001: saving model to weights.best.1.hdf5\n",
574 | "Epoch 2/3\n",
575 | "8844/8844 [==============================] - 171s 19ms/step - loss: 0.3708 - accuracy: 0.8291 - val_loss: 0.4160 - val_accuracy: 0.8042\n",
576 | "\n",
577 | "Epoch 00002: saving model to weights.best.2.hdf5\n",
578 | "Epoch 3/3\n",
579 | "8844/8844 [==============================] - 159s 18ms/step - loss: 0.2868 - accuracy: 0.8736 - val_loss: 0.4415 - val_accuracy: 0.8081\n",
580 | "\n",
581 | "Epoch 00003: saving model to weights.best.3.hdf5\n"
582 | ]
583 | }
584 | ],
585 | "source": [
586 | "checkpoint_filepath = 'weights.best.{epoch:01d}.hdf5'\n",
587 | "model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,\n",
588 | " verbose = 1, \n",
589 | " monitor = 'val_loss',\n",
590 | " save_best_only = False)\n",
591 | "\n",
592 | "history = model.fit((x_train1, x_train2), y_train,\n",
593 | " batch_size = 32,\n",
594 | " validation_data = ((x_val1, x_val2), y_val),\n",
595 | " validation_batch_size = 16,\n",
596 | " epochs=3,\n",
597 | " callbacks=[model_checkpoint_callback], \n",
598 | " verbose=1)"
599 | ]
600 | },
601 | {
602 | "cell_type": "code",
603 | "execution_count": 12,
604 | "id": "8863f669",
605 | "metadata": {
606 | "execution": {
607 | "iopub.execute_input": "2021-11-27T17:46:36.740103Z",
608 | "iopub.status.busy": "2021-11-27T17:46:36.738488Z",
609 | "iopub.status.idle": "2021-11-27T17:47:58.715697Z",
610 | "shell.execute_reply": "2021-11-27T17:47:58.715075Z",
611 | "shell.execute_reply.started": "2021-11-27T17:29:15.189693Z"
612 | },
613 | "papermill": {
614 | "duration": 86.274629,
615 | "end_time": "2021-11-27T17:47:58.715889",
616 | "exception": false,
617 | "start_time": "2021-11-27T17:46:32.441260",
618 | "status": "completed"
619 | },
620 | "tags": []
621 | },
622 | "outputs": [
623 | {
624 | "name": "stdout",
625 | "output_type": "stream",
626 | "text": [
627 | "10108/10108 [==============================] - 59s 6ms/step - loss: 0.4340 - accuracy: 0.8087\n",
628 | "loss on test data is 0.43401026725769043\n",
629 | "accuracy on test data is 0.80872642993927\n"
630 | ]
631 | }
632 | ],
633 | "source": [
634 | "loss, accuracy = model.evaluate((x_test1, x_test2), y_test, batch_size=4, verbose=1)\n",
635 | "\n",
636 | "print('loss on test data is', loss)\n",
637 | "print('accuracy on test data is', accuracy)"
638 | ]
639 | },
640 | {
641 | "cell_type": "code",
642 | "execution_count": 13,
643 | "id": "da98475b",
644 | "metadata": {
645 | "execution": {
646 | "iopub.execute_input": "2021-11-27T17:48:06.451346Z",
647 | "iopub.status.busy": "2021-11-27T17:48:06.450104Z",
648 | "iopub.status.idle": "2021-11-27T17:48:17.164859Z",
649 | "shell.execute_reply": "2021-11-27T17:48:17.163722Z",
650 | "shell.execute_reply.started": "2021-11-27T17:30:03.625800Z"
651 | },
652 | "papermill": {
653 | "duration": 14.724145,
654 | "end_time": "2021-11-27T17:48:17.165031",
655 | "exception": false,
656 | "start_time": "2021-11-27T17:48:02.440886",
657 | "status": "completed"
658 | },
659 | "tags": []
660 | },
661 | "outputs": [
662 | {
663 | "name": "stdout",
664 | "output_type": "stream",
665 | "text": [
666 | "f1_score on test dataset is 0.7398311072233624\n"
667 | ]
668 | }
669 | ],
670 | "source": [
671 | "pred = model.predict((x_test1, x_test2))\n",
672 | "\n",
673 | "print('f1_score on test dataset is', f1_score(np.argmax(pred, axis=1), np.argmax(y_test, axis=1)))"
674 | ]
675 | },
676 | {
677 | "cell_type": "code",
678 | "execution_count": null,
679 | "id": "90b4b575",
680 | "metadata": {
681 | "papermill": {
682 | "duration": 3.647781,
683 | "end_time": "2021-11-27T17:48:24.449422",
684 | "exception": false,
685 | "start_time": "2021-11-27T17:48:20.801641",
686 | "status": "completed"
687 | },
688 | "tags": []
689 | },
690 | "outputs": [],
691 | "source": []
692 | }
693 | ],
694 | "metadata": {
695 | "kernelspec": {
696 | "display_name": "Python 3 (ipykernel)",
697 | "language": "python",
698 | "name": "python3"
699 | },
700 | "language_info": {
701 | "codemirror_mode": {
702 | "name": "ipython",
703 | "version": 3
704 | },
705 | "file_extension": ".py",
706 | "mimetype": "text/x-python",
707 | "name": "python",
708 | "nbconvert_exporter": "python",
709 | "pygments_lexer": "ipython3",
710 | "version": "3.8.10"
711 | },
712 | "papermill": {
713 | "default_parameters": {},
714 | "duration": 1006.545475,
715 | "end_time": "2021-11-27T17:48:31.257168",
716 | "environment_variables": {},
717 | "exception": null,
718 | "input_path": "__notebook__.ipynb",
719 | "output_path": "__notebook__.ipynb",
720 | "parameters": {},
721 | "start_time": "2021-11-27T17:31:44.711693",
722 | "version": "2.3.3"
723 | }
724 | },
725 | "nbformat": 4,
726 | "nbformat_minor": 5
727 | }
728 |
--------------------------------------------------------------------------------
/CBOW MLP Sum Diff Product of Embeddings.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "0ffba8bb",
7 | "metadata": {
8 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
9 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
10 | "execution": {
11 | "iopub.execute_input": "2021-11-27T13:27:02.120873Z",
12 | "iopub.status.busy": "2021-11-27T13:27:02.120117Z",
13 | "iopub.status.idle": "2021-11-27T13:27:06.777897Z",
14 | "shell.execute_reply": "2021-11-27T13:27:06.777261Z",
15 | "shell.execute_reply.started": "2021-11-27T13:25:33.869721Z"
16 | },
17 | "papermill": {
18 | "duration": 4.681193,
19 | "end_time": "2021-11-27T13:27:06.778056",
20 | "exception": false,
21 | "start_time": "2021-11-27T13:27:02.096863",
22 | "status": "completed"
23 | },
24 | "tags": []
25 | },
26 | "outputs": [],
27 | "source": [
28 | "import pandas as pd\n",
29 | "import numpy as np\n",
30 | "from tqdm import tqdm\n",
31 | "import tensorflow as tf\n",
32 | "from gensim.models import KeyedVectors\n",
33 | "import gensim\n",
34 | "import re\n",
35 | "from sklearn.metrics import f1_score"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 2,
41 | "id": "2d9327ba",
42 | "metadata": {
43 | "execution": {
44 | "iopub.execute_input": "2021-11-27T13:27:06.803571Z",
45 | "iopub.status.busy": "2021-11-27T13:27:06.802996Z",
46 | "iopub.status.idle": "2021-11-27T13:27:07.887920Z",
47 | "shell.execute_reply": "2021-11-27T13:27:07.886982Z",
48 | "shell.execute_reply.started": "2021-11-27T13:10:42.402545Z"
49 | },
50 | "papermill": {
51 | "duration": 1.099041,
52 | "end_time": "2021-11-27T13:27:07.888060",
53 | "exception": false,
54 | "start_time": "2021-11-27T13:27:06.789019",
55 | "status": "completed"
56 | },
57 | "tags": []
58 | },
59 | "outputs": [],
60 | "source": [
61 | "train = pd.read_csv('../input/quora-ques-pair/test_data.csv/test_data.csv').fillna('')\n",
62 | "val = pd.read_csv('../input/quora-ques-pair/val_data.csv/val_data.csv').fillna('')\n",
63 | "test = pd.read_csv('../input/quora-ques-pair/test_data.csv/test_data.csv').fillna('')"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 3,
69 | "id": "c0bfef54",
70 | "metadata": {
71 | "execution": {
72 | "iopub.execute_input": "2021-11-27T13:27:07.912052Z",
73 | "iopub.status.busy": "2021-11-27T13:27:07.911329Z",
74 | "iopub.status.idle": "2021-11-27T13:27:07.914031Z",
75 | "shell.execute_reply": "2021-11-27T13:27:07.913405Z",
76 | "shell.execute_reply.started": "2021-11-27T13:10:43.715098Z"
77 | },
78 | "papermill": {
79 | "duration": 0.015828,
80 | "end_time": "2021-11-27T13:27:07.914162",
81 | "exception": false,
82 | "start_time": "2021-11-27T13:27:07.898334",
83 | "status": "completed"
84 | },
85 | "tags": []
86 | },
87 | "outputs": [],
88 | "source": [
89 | "word2vec_file = '../input/d/sandreds/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin'"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 4,
95 | "id": "f318327c",
96 | "metadata": {
97 | "execution": {
98 | "iopub.execute_input": "2021-11-27T13:27:07.942268Z",
99 | "iopub.status.busy": "2021-11-27T13:27:07.941457Z",
100 | "iopub.status.idle": "2021-11-27T13:27:07.953929Z",
101 | "shell.execute_reply": "2021-11-27T13:27:07.954439Z",
102 | "shell.execute_reply.started": "2021-11-27T13:10:43.722268Z"
103 | },
104 | "papermill": {
105 | "duration": 0.030715,
106 | "end_time": "2021-11-27T13:27:07.954598",
107 | "exception": false,
108 | "start_time": "2021-11-27T13:27:07.923883",
109 | "status": "completed"
110 | },
111 | "tags": []
112 | },
113 | "outputs": [
114 | {
115 | "data": {
116 | "text/html": [
117 | "\n",
118 | "\n",
131 | "
\n",
132 | " \n",
133 | " \n",
134 | " | \n",
135 | " id | \n",
136 | " qid1 | \n",
137 | " qid2 | \n",
138 | " question1 | \n",
139 | " question2 | \n",
140 | " is_duplicate | \n",
141 | " question1_preprocessed | \n",
142 | " question2_preprocessed | \n",
143 | "
\n",
144 | " \n",
145 | " \n",
146 | " \n",
147 | " | 0 | \n",
148 | " 204673 | \n",
149 | " 93885 | \n",
150 | " 307635 | \n",
151 | " If there is a God, where is He! | \n",
152 | " Why is god a \"He\"? | \n",
153 | " 0 | \n",
154 | " if there is a god , where is he ! | \n",
155 | " why is god a `` he '' ? | \n",
156 | "
\n",
157 | " \n",
158 | " | 1 | \n",
159 | " 17716 | \n",
160 | " 2093 | \n",
161 | " 15628 | \n",
162 | " Do you believe that everything happens for a r... | \n",
163 | " Does everything happen for a reason? | \n",
164 | " 1 | \n",
165 | " do you believe that everything happens for a r... | \n",
166 | " does everything happen for a reason ? | \n",
167 | "
\n",
168 | " \n",
169 | " | 2 | \n",
170 | " 291767 | \n",
171 | " 352623 | \n",
172 | " 413255 | \n",
173 | " Will there always be web hosting that will sup... | \n",
174 | " Will there always be web hosting that supports... | \n",
175 | " 1 | \n",
176 | " will there always be web hosting that will sup... | \n",
177 | " will there always be web hosting that supports... | \n",
178 | "
\n",
179 | " \n",
180 | " | 3 | \n",
181 | " 203758 | \n",
182 | " 59824 | \n",
183 | " 67971 | \n",
184 | " What is the proof of Indian Army's surgical st... | \n",
185 | " Has India provided any proof of the surgical s... | \n",
186 | " 1 | \n",
187 | " what is the proof of indian army 's surgical s... | \n",
188 | " has india provided any proof of the surgical s... | \n",
189 | "
\n",
190 | " \n",
191 | " | 4 | \n",
192 | " 41747 | \n",
193 | " 75326 | \n",
194 | " 75327 | \n",
195 | " What do Indian Muslims think of Modi? | \n",
196 | " What do Indian Muslim think about PM Narendra ... | \n",
197 | " 1 | \n",
198 | " what do indian muslims think of modi ? | \n",
199 | " what do indian muslim think about pm narendra ... | \n",
200 | "
\n",
201 | " \n",
202 | "
\n",
203 | "
"
204 | ],
205 | "text/plain": [
206 | " id qid1 qid2 question1 \\\n",
207 | "0 204673 93885 307635 If there is a God, where is He! \n",
208 | "1 17716 2093 15628 Do you believe that everything happens for a r... \n",
209 | "2 291767 352623 413255 Will there always be web hosting that will sup... \n",
210 | "3 203758 59824 67971 What is the proof of Indian Army's surgical st... \n",
211 | "4 41747 75326 75327 What do Indian Muslims think of Modi? \n",
212 | "\n",
213 | " question2 is_duplicate \\\n",
214 | "0 Why is god a \"He\"? 0 \n",
215 | "1 Does everything happen for a reason? 1 \n",
216 | "2 Will there always be web hosting that supports... 1 \n",
217 | "3 Has India provided any proof of the surgical s... 1 \n",
218 | "4 What do Indian Muslim think about PM Narendra ... 1 \n",
219 | "\n",
220 | " question1_preprocessed \\\n",
221 | "0 if there is a god , where is he ! \n",
222 | "1 do you believe that everything happens for a r... \n",
223 | "2 will there always be web hosting that will sup... \n",
224 | "3 what is the proof of indian army 's surgical s... \n",
225 | "4 what do indian muslims think of modi ? \n",
226 | "\n",
227 | " question2_preprocessed \n",
228 | "0 why is god a `` he '' ? \n",
229 | "1 does everything happen for a reason ? \n",
230 | "2 will there always be web hosting that supports... \n",
231 | "3 has india provided any proof of the surgical s... \n",
232 | "4 what do indian muslim think about pm narendra ... "
233 | ]
234 | },
235 | "execution_count": 4,
236 | "metadata": {},
237 | "output_type": "execute_result"
238 | }
239 | ],
240 | "source": [
241 | "train.head()"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": 5,
247 | "id": "fdd3b917",
248 | "metadata": {
249 | "execution": {
250 | "iopub.execute_input": "2021-11-27T13:27:07.982238Z",
251 | "iopub.status.busy": "2021-11-27T13:27:07.981577Z",
252 | "iopub.status.idle": "2021-11-27T13:28:12.395623Z",
253 | "shell.execute_reply": "2021-11-27T13:28:12.396099Z",
254 | "shell.execute_reply.started": "2021-11-27T13:10:43.746605Z"
255 | },
256 | "papermill": {
257 | "duration": 64.431482,
258 | "end_time": "2021-11-27T13:28:12.396269",
259 | "exception": false,
260 | "start_time": "2021-11-27T13:27:07.964787",
261 | "status": "completed"
262 | },
263 | "tags": []
264 | },
265 | "outputs": [],
266 | "source": [
267 | "def buildVocabulary(reviews):\n",
268 | " tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False, split=' ')\n",
269 | " tokenizer.fit_on_texts(reviews)\n",
270 | " return tokenizer\n",
271 | "\n",
272 | "def getSequences(reviews, tokenizer, seq_maxlen):\n",
273 | " reviews_seq = tokenizer.texts_to_sequences(reviews)\n",
274 | " return np.array(tf.keras.preprocessing.sequence.pad_sequences(reviews_seq, maxlen=seq_maxlen))\n",
275 | "\n",
276 | "word2vec_model = KeyedVectors.load_word2vec_format(word2vec_file, binary = True)\n",
277 | "\n",
278 | "def getEmbeddingWeightMatrix(word2idx): \n",
279 | " embedding_matrix = np.zeros((len(word2idx)+1, 300))\n",
280 | " for word, i in tqdm(word2idx.items()):\n",
281 | " \n",
282 | " embedding_vector = word2vec_model[word] if word in word2vec_model else np.random.rand(1,300)\n",
283 | " if embedding_vector is not None:\n",
284 | " embedding_matrix[i] = embedding_vector\n",
285 | " return embedding_matrix"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": 6,
291 | "id": "47a83b4b",
292 | "metadata": {
293 | "execution": {
294 | "iopub.execute_input": "2021-11-27T13:28:12.451991Z",
295 | "iopub.status.busy": "2021-11-27T13:28:12.438857Z",
296 | "iopub.status.idle": "2021-11-27T13:28:26.338075Z",
297 | "shell.execute_reply": "2021-11-27T13:28:26.337092Z",
298 | "shell.execute_reply.started": "2021-11-27T13:11:45.518534Z"
299 | },
300 | "papermill": {
301 | "duration": 13.930379,
302 | "end_time": "2021-11-27T13:28:26.338242",
303 | "exception": false,
304 | "start_time": "2021-11-27T13:28:12.407863",
305 | "status": "completed"
306 | },
307 | "tags": []
308 | },
309 | "outputs": [
310 | {
311 | "name": "stdout",
312 | "output_type": "stream",
313 | "text": [
314 | "67043\n"
315 | ]
316 | }
317 | ],
318 | "source": [
319 | "tokenizer = buildVocabulary(train['question1'].tolist()+train['question2'].tolist()+val['question1'].tolist()+val['question2'].tolist()+test['question1'].tolist()+test['question2'].tolist())\n",
320 | "vocab_size = len(tokenizer.word_index) + 1\n",
321 | "print(vocab_size)\n",
322 | "\n",
323 | "x_train1 = getSequences(train['question1'], tokenizer, 200)\n",
324 | "x_train2 = getSequences(train['question2'], tokenizer, 200)\n",
325 | "y_train = tf.keras.utils.to_categorical(train['is_duplicate'])\n",
326 | "\n",
327 | "x_val1 = getSequences(val['question1'], tokenizer, 200)\n",
328 | "x_val2 = getSequences(val['question2'], tokenizer, 200)\n",
329 | "y_val = tf.keras.utils.to_categorical(val['is_duplicate'])\n",
330 | "\n",
331 | "x_test1 = getSequences(test['question1'], tokenizer, 200)\n",
332 | "x_test2 = getSequences(test['question2'], tokenizer, 200)\n",
333 | "y_test = tf.keras.utils.to_categorical(test['is_duplicate'])"
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": 7,
339 | "id": "064a2f92",
340 | "metadata": {
341 | "execution": {
342 | "iopub.execute_input": "2021-11-27T13:28:26.364903Z",
343 | "iopub.status.busy": "2021-11-27T13:28:26.364312Z",
344 | "iopub.status.idle": "2021-11-27T13:28:26.845268Z",
345 | "shell.execute_reply": "2021-11-27T13:28:26.843711Z",
346 | "shell.execute_reply.started": "2021-11-27T13:12:14.356043Z"
347 | },
348 | "papermill": {
349 | "duration": 0.496339,
350 | "end_time": "2021-11-27T13:28:26.845392",
351 | "exception": false,
352 | "start_time": "2021-11-27T13:28:26.349053",
353 | "status": "completed"
354 | },
355 | "tags": []
356 | },
357 | "outputs": [
358 | {
359 | "name": "stderr",
360 | "output_type": "stream",
361 | "text": [
362 | "100%|██████████| 67042/67042 [00:00<00:00, 142685.61it/s]"
363 | ]
364 | },
365 | {
366 | "name": "stdout",
367 | "output_type": "stream",
368 | "text": [
369 | "(67043, 300)\n"
370 | ]
371 | },
372 | {
373 | "name": "stderr",
374 | "output_type": "stream",
375 | "text": [
376 | "\n"
377 | ]
378 | }
379 | ],
380 | "source": [
381 | "#embedding_vectors = loadGloveWordEmbeddings()\n",
382 | "#print(len(embedding_vectors))\n",
383 | "\n",
384 | "embedding_weight_matrix = getEmbeddingWeightMatrix(tokenizer.word_index)\n",
385 | "print(embedding_weight_matrix.shape)"
386 | ]
387 | },
388 | {
389 | "cell_type": "code",
390 | "execution_count": 8,
391 | "id": "13dd3e2c",
392 | "metadata": {
393 | "execution": {
394 | "iopub.execute_input": "2021-11-27T13:28:26.881262Z",
395 | "iopub.status.busy": "2021-11-27T13:28:26.880656Z",
396 | "iopub.status.idle": "2021-11-27T13:28:29.585007Z",
397 | "shell.execute_reply": "2021-11-27T13:28:29.585470Z",
398 | "shell.execute_reply.started": "2021-11-27T13:21:50.654150Z"
399 | },
400 | "papermill": {
401 | "duration": 2.727389,
402 | "end_time": "2021-11-27T13:28:29.585638",
403 | "exception": false,
404 | "start_time": "2021-11-27T13:28:26.858249",
405 | "status": "completed"
406 | },
407 | "tags": []
408 | },
409 | "outputs": [
410 | {
411 | "name": "stderr",
412 | "output_type": "stream",
413 | "text": [
414 | "2021-11-27 13:28:26.961758: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
415 | "2021-11-27 13:28:27.082051: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
416 | "2021-11-27 13:28:27.082785: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
417 | "2021-11-27 13:28:27.084455: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA\n",
418 | "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
419 | "2021-11-27 13:28:27.085735: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
420 | "2021-11-27 13:28:27.086434: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
421 | "2021-11-27 13:28:27.087162: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
422 | "2021-11-27 13:28:28.950796: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
423 | "2021-11-27 13:28:28.951646: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
424 | "2021-11-27 13:28:28.952433: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
425 | "2021-11-27 13:28:28.953120: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15403 MB memory: -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0\n"
426 | ]
427 | }
428 | ],
429 | "source": [
430 | "inp1 = tf.keras.Input(shape=(x_train1.shape[1],))\n",
431 | "inp2 = tf.keras.Input(shape=(x_train2.shape[1],))\n",
432 | "\n",
433 | "inner1= tf.keras.layers.Embedding(input_dim=67043, output_dim=300, input_length=200, \n",
434 | " weights=[embedding_weight_matrix], trainable=True)(inp1)\n",
435 | "inner2= tf.keras.layers.Embedding(input_dim=67043, output_dim=300, input_length=200,\n",
436 | " weights=[embedding_weight_matrix], trainable=True)(inp2)\n",
437 | " \n",
438 | "inner = tf.keras.layers.concatenate([inner1+inner2, inner1-inner2, tf.math.multiply(inner1, inner2)], axis=-1)\n",
439 | "\n",
440 | "inner = tf.keras.backend.sum(inner, axis=1, keepdims=False)\n",
441 | "#tf.keras.regularizers.l2(l2=0.01)\n",
442 | "inner = tf.keras.layers.Dense(300, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01))(inner)\n",
443 | "inner = tf.keras.layers.Dropout(0.1)(inner)\n",
444 | "inner = tf.keras.layers.Dense(200, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01))(inner)\n",
445 | "inner = tf.keras.layers.Dropout(0.1)(inner)\n",
446 | "inner = tf.keras.layers.Dense(100, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01))(inner)\n",
447 | "inner = tf.keras.layers.Dropout(0.1)(inner)\n",
448 | "output = tf.keras.layers.Dense(2, activation='softmax')(inner)\n",
449 | "\n",
450 | "model = tf.keras.Model(inputs = [inp1, inp2], outputs = output)"
451 | ]
452 | },
453 | {
454 | "cell_type": "code",
455 | "execution_count": 9,
456 | "id": "2a0fb964",
457 | "metadata": {
458 | "execution": {
459 | "iopub.execute_input": "2021-11-27T13:28:29.619321Z",
460 | "iopub.status.busy": "2021-11-27T13:28:29.618811Z",
461 | "iopub.status.idle": "2021-11-27T13:28:29.634853Z",
462 | "shell.execute_reply": "2021-11-27T13:28:29.635434Z",
463 | "shell.execute_reply.started": "2021-11-27T13:21:51.909270Z"
464 | },
465 | "papermill": {
466 | "duration": 0.036653,
467 | "end_time": "2021-11-27T13:28:29.635595",
468 | "exception": false,
469 | "start_time": "2021-11-27T13:28:29.598942",
470 | "status": "completed"
471 | },
472 | "tags": []
473 | },
474 | "outputs": [
475 | {
476 | "name": "stdout",
477 | "output_type": "stream",
478 | "text": [
479 | "Model: \"model\"\n",
480 | "__________________________________________________________________________________________________\n",
481 | "Layer (type) Output Shape Param # Connected to \n",
482 | "==================================================================================================\n",
483 | "input_1 (InputLayer) [(None, 200)] 0 \n",
484 | "__________________________________________________________________________________________________\n",
485 | "input_2 (InputLayer) [(None, 200)] 0 \n",
486 | "__________________________________________________________________________________________________\n",
487 | "embedding (Embedding) (None, 200, 300) 20112900 input_1[0][0] \n",
488 | "__________________________________________________________________________________________________\n",
489 | "embedding_1 (Embedding) (None, 200, 300) 20112900 input_2[0][0] \n",
490 | "__________________________________________________________________________________________________\n",
491 | "tf.__operators__.add (TFOpLambd (None, 200, 300) 0 embedding[0][0] \n",
492 | " embedding_1[0][0] \n",
493 | "__________________________________________________________________________________________________\n",
494 | "tf.math.subtract (TFOpLambda) (None, 200, 300) 0 embedding[0][0] \n",
495 | " embedding_1[0][0] \n",
496 | "__________________________________________________________________________________________________\n",
497 | "tf.math.multiply (TFOpLambda) (None, 200, 300) 0 embedding[0][0] \n",
498 | " embedding_1[0][0] \n",
499 | "__________________________________________________________________________________________________\n",
500 | "concatenate (Concatenate) (None, 200, 900) 0 tf.__operators__.add[0][0] \n",
501 | " tf.math.subtract[0][0] \n",
502 | " tf.math.multiply[0][0] \n",
503 | "__________________________________________________________________________________________________\n",
504 | "tf.math.reduce_sum (TFOpLambda) (None, 900) 0 concatenate[0][0] \n",
505 | "__________________________________________________________________________________________________\n",
506 | "dense (Dense) (None, 300) 270300 tf.math.reduce_sum[0][0] \n",
507 | "__________________________________________________________________________________________________\n",
508 | "dropout (Dropout) (None, 300) 0 dense[0][0] \n",
509 | "__________________________________________________________________________________________________\n",
510 | "dense_1 (Dense) (None, 200) 60200 dropout[0][0] \n",
511 | "__________________________________________________________________________________________________\n",
512 | "dropout_1 (Dropout) (None, 200) 0 dense_1[0][0] \n",
513 | "__________________________________________________________________________________________________\n",
514 | "dense_2 (Dense) (None, 100) 20100 dropout_1[0][0] \n",
515 | "__________________________________________________________________________________________________\n",
516 | "dropout_2 (Dropout) (None, 100) 0 dense_2[0][0] \n",
517 | "__________________________________________________________________________________________________\n",
518 | "dense_3 (Dense) (None, 2) 202 dropout_2[0][0] \n",
519 | "==================================================================================================\n",
520 | "Total params: 40,576,602\n",
521 | "Trainable params: 40,576,602\n",
522 | "Non-trainable params: 0\n",
523 | "__________________________________________________________________________________________________\n"
524 | ]
525 | }
526 | ],
527 | "source": [
528 | "model.compile(optimizer = \"adam\", loss = 'categorical_crossentropy', metrics=['accuracy'])\n",
529 | "model.summary()"
530 | ]
531 | },
532 | {
533 | "cell_type": "code",
534 | "execution_count": 10,
535 | "id": "2769fd24",
536 | "metadata": {
537 | "execution": {
538 | "iopub.execute_input": "2021-11-27T13:28:29.669190Z",
539 | "iopub.status.busy": "2021-11-27T13:28:29.668319Z",
540 | "iopub.status.idle": "2021-11-27T13:29:27.302188Z",
541 | "shell.execute_reply": "2021-11-27T13:29:27.301703Z",
542 | "shell.execute_reply.started": "2021-11-27T13:21:52.934089Z"
543 | },
544 | "papermill": {
545 | "duration": 57.652535,
546 | "end_time": "2021-11-27T13:29:27.302336",
547 | "exception": false,
548 | "start_time": "2021-11-27T13:28:29.649801",
549 | "status": "completed"
550 | },
551 | "tags": []
552 | },
553 | "outputs": [
554 | {
555 | "name": "stderr",
556 | "output_type": "stream",
557 | "text": [
558 | "2021-11-27 13:28:29.784743: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)\n"
559 | ]
560 | },
561 | {
562 | "name": "stdout",
563 | "output_type": "stream",
564 | "text": [
565 | "Epoch 1/3\n",
566 | "632/632 [==============================] - 18s 26ms/step - loss: 1.5922 - accuracy: 0.6810 - val_loss: 0.6328 - val_accuracy: 0.7140\n",
567 | "Epoch 2/3\n",
568 | "632/632 [==============================] - 16s 25ms/step - loss: 0.5415 - accuracy: 0.7529 - val_loss: 0.5487 - val_accuracy: 0.7364\n",
569 | "Epoch 3/3\n",
570 | "632/632 [==============================] - 21s 33ms/step - loss: 0.4592 - accuracy: 0.7990 - val_loss: 0.5759 - val_accuracy: 0.7274\n"
571 | ]
572 | }
573 | ],
574 | "source": [
575 | "save_weights = tf.keras.callbacks.ModelCheckpoint('cbow_mlp.h5', monitor='val_loss', save_best_only=True)\n",
576 | "checkpoint_filepath = 'weights.best.{epoch:01d}.hdf5'\n",
577 | "#model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,\n",
578 | "#verbose = 1,\n",
579 | "#monitor = 'val_loss',\n",
580 | "#save_best_only = False)\n",
581 | "history = model.fit((x_train1, x_train2), y_train,\n",
582 | " batch_size = 64,\n",
583 | " validation_data = ((x_val1, x_val2), y_val),\n",
584 | " validation_batch_size = 32,\n",
585 | " epochs=3, \n",
586 | " callbacks=[save_weights], \n",
587 | " verbose=1)"
588 | ]
589 | },
590 | {
591 | "cell_type": "code",
592 | "execution_count": 11,
593 | "id": "ab5c9bea",
594 | "metadata": {
595 | "execution": {
596 | "iopub.execute_input": "2021-11-27T13:29:27.609229Z",
597 | "iopub.status.busy": "2021-11-27T13:29:27.608248Z",
598 | "iopub.status.idle": "2021-11-27T13:29:51.444179Z",
599 | "shell.execute_reply": "2021-11-27T13:29:51.445132Z",
600 | "shell.execute_reply.started": "2021-11-27T13:23:37.976281Z"
601 | },
602 | "papermill": {
603 | "duration": 23.995934,
604 | "end_time": "2021-11-27T13:29:51.445323",
605 | "exception": false,
606 | "start_time": "2021-11-27T13:29:27.449389",
607 | "status": "completed"
608 | },
609 | "tags": []
610 | },
611 | "outputs": [
612 | {
613 | "name": "stdout",
614 | "output_type": "stream",
615 | "text": [
616 | "10108/10108 [==============================] - 24s 2ms/step - loss: 0.3645 - accuracy: 0.8614\n",
617 | "loss on test data is 0.36454567313194275\n",
618 | "accuracy on test data is 0.8613866567611694\n"
619 | ]
620 | }
621 | ],
622 | "source": [
623 | "loss, accuracy = model.evaluate((x_test1, x_test2), y_test, batch_size=4, verbose=1)\n",
624 | "\n",
625 | "print('loss on test data is', loss)\n",
626 | "print('accuracy on test data is', accuracy)"
627 | ]
628 | },
629 | {
630 | "cell_type": "code",
631 | "execution_count": 12,
632 | "id": "072e0c3f",
633 | "metadata": {
634 | "execution": {
635 | "iopub.execute_input": "2021-11-27T13:29:52.033949Z",
636 | "iopub.status.busy": "2021-11-27T13:29:52.033003Z",
637 | "iopub.status.idle": "2021-11-27T13:29:54.764953Z",
638 | "shell.execute_reply": "2021-11-27T13:29:54.765365Z",
639 | "shell.execute_reply.started": "2021-11-27T13:25:52.168723Z"
640 | },
641 | "papermill": {
642 | "duration": 3.037087,
643 | "end_time": "2021-11-27T13:29:54.765519",
644 | "exception": false,
645 | "start_time": "2021-11-27T13:29:51.728432",
646 | "status": "completed"
647 | },
648 | "tags": []
649 | },
650 | "outputs": [
651 | {
652 | "name": "stdout",
653 | "output_type": "stream",
654 | "text": [
655 | "F1_score on test is 0.8275480059084195\n"
656 | ]
657 | }
658 | ],
659 | "source": [
660 | "pred = model.predict((x_test1, x_test2))\n",
661 | "print('F1_score on test is', f1_score(np.argmax(pred, axis=1), np.argmax(y_test, axis=1)))\n"
662 | ]
663 | },
664 | {
665 | "cell_type": "code",
666 | "execution_count": null,
667 | "id": "0b1a3522",
668 | "metadata": {
669 | "papermill": {
670 | "duration": 0.264673,
671 | "end_time": "2021-11-27T13:29:55.301567",
672 | "exception": false,
673 | "start_time": "2021-11-27T13:29:55.036894",
674 | "status": "completed"
675 | },
676 | "tags": []
677 | },
678 | "outputs": [],
679 | "source": []
680 | }
681 | ],
682 | "metadata": {
683 | "kernelspec": {
684 | "display_name": "Python 3",
685 | "language": "python",
686 | "name": "python3"
687 | },
688 | "language_info": {
689 | "codemirror_mode": {
690 | "name": "ipython",
691 | "version": 3
692 | },
693 | "file_extension": ".py",
694 | "mimetype": "text/x-python",
695 | "name": "python",
696 | "nbconvert_exporter": "python",
697 | "pygments_lexer": "ipython3",
698 | "version": "3.7.10"
699 | },
700 | "papermill": {
701 | "default_parameters": {},
702 | "duration": 184.393918,
703 | "end_time": "2021-11-27T13:29:59.169536",
704 | "environment_variables": {},
705 | "exception": null,
706 | "input_path": "__notebook__.ipynb",
707 | "output_path": "__notebook__.ipynb",
708 | "parameters": {},
709 | "start_time": "2021-11-27T13:26:54.775618",
710 | "version": "2.3.3"
711 | }
712 | },
713 | "nbformat": 4,
714 | "nbformat_minor": 5
715 | }
716 |
--------------------------------------------------------------------------------
/CBOW ML Dropout Regularisation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "7129363e",
7 | "metadata": {
8 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
9 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
10 | "execution": {
11 | "iopub.execute_input": "2021-11-27T13:39:40.100619Z",
12 | "iopub.status.busy": "2021-11-27T13:39:40.097692Z",
13 | "iopub.status.idle": "2021-11-27T13:39:45.510044Z",
14 | "shell.execute_reply": "2021-11-27T13:39:45.509398Z",
15 | "shell.execute_reply.started": "2021-11-27T13:34:29.669569Z"
16 | },
17 | "papermill": {
18 | "duration": 5.435369,
19 | "end_time": "2021-11-27T13:39:45.510212",
20 | "exception": false,
21 | "start_time": "2021-11-27T13:39:40.074843",
22 | "status": "completed"
23 | },
24 | "tags": []
25 | },
26 | "outputs": [],
27 | "source": [
28 | "import pandas as pd\n",
29 | "import numpy as np\n",
30 | "from tqdm import tqdm\n",
31 | "import tensorflow as tf\n",
32 | "from gensim.models import KeyedVectors\n",
33 | "import gensim\n",
34 | "import re\n",
35 | "from sklearn.metrics import f1_score\n",
36 | "import matplotlib.pyplot as plt"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 2,
42 | "id": "fa981eee",
43 | "metadata": {
44 | "execution": {
45 | "iopub.execute_input": "2021-11-27T13:39:45.539763Z",
46 | "iopub.status.busy": "2021-11-27T13:39:45.539259Z",
47 | "iopub.status.idle": "2021-11-27T13:39:46.766976Z",
48 | "shell.execute_reply": "2021-11-27T13:39:46.766492Z",
49 | "shell.execute_reply.started": "2021-11-27T13:10:42.402545Z"
50 | },
51 | "papermill": {
52 | "duration": 1.24347,
53 | "end_time": "2021-11-27T13:39:46.767141",
54 | "exception": false,
55 | "start_time": "2021-11-27T13:39:45.523671",
56 | "status": "completed"
57 | },
58 | "tags": []
59 | },
60 | "outputs": [],
61 | "source": [
62 | "train = pd.read_csv('../input/quora-ques-pair/test_data.csv/test_data.csv').fillna('')\n",
63 | "val = pd.read_csv('../input/quora-ques-pair/val_data.csv/val_data.csv').fillna('')\n",
64 | "test = pd.read_csv('../input/quora-ques-pair/test_data.csv/test_data.csv').fillna('')"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 3,
70 | "id": "18c9ee56",
71 | "metadata": {
72 | "execution": {
73 | "iopub.execute_input": "2021-11-27T13:39:46.793631Z",
74 | "iopub.status.busy": "2021-11-27T13:39:46.792950Z",
75 | "iopub.status.idle": "2021-11-27T13:39:46.795498Z",
76 | "shell.execute_reply": "2021-11-27T13:39:46.795872Z",
77 | "shell.execute_reply.started": "2021-11-27T13:10:43.715098Z"
78 | },
79 | "papermill": {
80 | "duration": 0.017601,
81 | "end_time": "2021-11-27T13:39:46.795999",
82 | "exception": false,
83 | "start_time": "2021-11-27T13:39:46.778398",
84 | "status": "completed"
85 | },
86 | "tags": []
87 | },
88 | "outputs": [],
89 | "source": [
90 | "word2vec_file = '../input/d/sandreds/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin'"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": 4,
96 | "id": "9a62dc09",
97 | "metadata": {
98 | "execution": {
99 | "iopub.execute_input": "2021-11-27T13:39:46.827484Z",
100 | "iopub.status.busy": "2021-11-27T13:39:46.826713Z",
101 | "iopub.status.idle": "2021-11-27T13:39:46.839866Z",
102 | "shell.execute_reply": "2021-11-27T13:39:46.840300Z",
103 | "shell.execute_reply.started": "2021-11-27T13:10:43.722268Z"
104 | },
105 | "papermill": {
106 | "duration": 0.033588,
107 | "end_time": "2021-11-27T13:39:46.840430",
108 | "exception": false,
109 | "start_time": "2021-11-27T13:39:46.806842",
110 | "status": "completed"
111 | },
112 | "tags": []
113 | },
114 | "outputs": [
115 | {
116 | "data": {
117 | "text/html": [
118 | "\n",
119 | "\n",
132 | "
\n",
133 | " \n",
134 | " \n",
135 | " | \n",
136 | " id | \n",
137 | " qid1 | \n",
138 | " qid2 | \n",
139 | " question1 | \n",
140 | " question2 | \n",
141 | " is_duplicate | \n",
142 | " question1_preprocessed | \n",
143 | " question2_preprocessed | \n",
144 | "
\n",
145 | " \n",
146 | " \n",
147 | " \n",
148 | " | 0 | \n",
149 | " 204673 | \n",
150 | " 93885 | \n",
151 | " 307635 | \n",
152 | " If there is a God, where is He! | \n",
153 | " Why is god a \"He\"? | \n",
154 | " 0 | \n",
155 | " if there is a god , where is he ! | \n",
156 | " why is god a `` he '' ? | \n",
157 | "
\n",
158 | " \n",
159 | " | 1 | \n",
160 | " 17716 | \n",
161 | " 2093 | \n",
162 | " 15628 | \n",
163 | " Do you believe that everything happens for a r... | \n",
164 | " Does everything happen for a reason? | \n",
165 | " 1 | \n",
166 | " do you believe that everything happens for a r... | \n",
167 | " does everything happen for a reason ? | \n",
168 | "
\n",
169 | " \n",
170 | " | 2 | \n",
171 | " 291767 | \n",
172 | " 352623 | \n",
173 | " 413255 | \n",
174 | " Will there always be web hosting that will sup... | \n",
175 | " Will there always be web hosting that supports... | \n",
176 | " 1 | \n",
177 | " will there always be web hosting that will sup... | \n",
178 | " will there always be web hosting that supports... | \n",
179 | "
\n",
180 | " \n",
181 | " | 3 | \n",
182 | " 203758 | \n",
183 | " 59824 | \n",
184 | " 67971 | \n",
185 | " What is the proof of Indian Army's surgical st... | \n",
186 | " Has India provided any proof of the surgical s... | \n",
187 | " 1 | \n",
188 | " what is the proof of indian army 's surgical s... | \n",
189 | " has india provided any proof of the surgical s... | \n",
190 | "
\n",
191 | " \n",
192 | " | 4 | \n",
193 | " 41747 | \n",
194 | " 75326 | \n",
195 | " 75327 | \n",
196 | " What do Indian Muslims think of Modi? | \n",
197 | " What do Indian Muslim think about PM Narendra ... | \n",
198 | " 1 | \n",
199 | " what do indian muslims think of modi ? | \n",
200 | " what do indian muslim think about pm narendra ... | \n",
201 | "
\n",
202 | " \n",
203 | "
\n",
204 | "
"
205 | ],
206 | "text/plain": [
207 | " id qid1 qid2 question1 \\\n",
208 | "0 204673 93885 307635 If there is a God, where is He! \n",
209 | "1 17716 2093 15628 Do you believe that everything happens for a r... \n",
210 | "2 291767 352623 413255 Will there always be web hosting that will sup... \n",
211 | "3 203758 59824 67971 What is the proof of Indian Army's surgical st... \n",
212 | "4 41747 75326 75327 What do Indian Muslims think of Modi? \n",
213 | "\n",
214 | " question2 is_duplicate \\\n",
215 | "0 Why is god a \"He\"? 0 \n",
216 | "1 Does everything happen for a reason? 1 \n",
217 | "2 Will there always be web hosting that supports... 1 \n",
218 | "3 Has India provided any proof of the surgical s... 1 \n",
219 | "4 What do Indian Muslim think about PM Narendra ... 1 \n",
220 | "\n",
221 | " question1_preprocessed \\\n",
222 | "0 if there is a god , where is he ! \n",
223 | "1 do you believe that everything happens for a r... \n",
224 | "2 will there always be web hosting that will sup... \n",
225 | "3 what is the proof of indian army 's surgical s... \n",
226 | "4 what do indian muslims think of modi ? \n",
227 | "\n",
228 | " question2_preprocessed \n",
229 | "0 why is god a `` he '' ? \n",
230 | "1 does everything happen for a reason ? \n",
231 | "2 will there always be web hosting that supports... \n",
232 | "3 has india provided any proof of the surgical s... \n",
233 | "4 what do indian muslim think about pm narendra ... "
234 | ]
235 | },
236 | "execution_count": 4,
237 | "metadata": {},
238 | "output_type": "execute_result"
239 | }
240 | ],
241 | "source": [
242 | "train.head()"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": 5,
248 | "id": "2e9f6d2a",
249 | "metadata": {
250 | "execution": {
251 | "iopub.execute_input": "2021-11-27T13:39:46.870196Z",
252 | "iopub.status.busy": "2021-11-27T13:39:46.869680Z",
253 | "iopub.status.idle": "2021-11-27T13:40:56.405543Z",
254 | "shell.execute_reply": "2021-11-27T13:40:56.404933Z",
255 | "shell.execute_reply.started": "2021-11-27T13:10:43.746605Z"
256 | },
257 | "papermill": {
258 | "duration": 69.55394,
259 | "end_time": "2021-11-27T13:40:56.405691",
260 | "exception": false,
261 | "start_time": "2021-11-27T13:39:46.851751",
262 | "status": "completed"
263 | },
264 | "tags": []
265 | },
266 | "outputs": [],
267 | "source": [
268 | "def buildVocabulary(reviews):\n",
269 | " tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False, split=' ')\n",
270 | " tokenizer.fit_on_texts(reviews)\n",
271 | " return tokenizer\n",
272 | "\n",
273 | "def getSequences(reviews, tokenizer, seq_maxlen):\n",
274 | " reviews_seq = tokenizer.texts_to_sequences(reviews)\n",
275 | " return np.array(tf.keras.preprocessing.sequence.pad_sequences(reviews_seq, maxlen=seq_maxlen))\n",
276 | "\n",
277 | "word2vec_model = KeyedVectors.load_word2vec_format(word2vec_file, binary = True)\n",
278 | "\n",
279 | "def getEmbeddingWeightMatrix(word2idx): \n",
280 | " embedding_matrix = np.zeros((len(word2idx)+1, 300))\n",
281 | " for word, i in tqdm(word2idx.items()):\n",
282 | " \n",
283 | " embedding_vector = word2vec_model[word] if word in word2vec_model else np.random.rand(1,300)\n",
284 | " if embedding_vector is not None:\n",
285 | " embedding_matrix[i] = embedding_vector\n",
286 | " return embedding_matrix"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": 6,
292 | "id": "1c97283f",
293 | "metadata": {
294 | "execution": {
295 | "iopub.execute_input": "2021-11-27T13:40:56.460244Z",
296 | "iopub.status.busy": "2021-11-27T13:40:56.459548Z",
297 | "iopub.status.idle": "2021-11-27T13:41:10.200708Z",
298 | "shell.execute_reply": "2021-11-27T13:41:10.200210Z",
299 | "shell.execute_reply.started": "2021-11-27T13:11:45.518534Z"
300 | },
301 | "papermill": {
302 | "duration": 13.783239,
303 | "end_time": "2021-11-27T13:41:10.200838",
304 | "exception": false,
305 | "start_time": "2021-11-27T13:40:56.417599",
306 | "status": "completed"
307 | },
308 | "tags": []
309 | },
310 | "outputs": [
311 | {
312 | "name": "stdout",
313 | "output_type": "stream",
314 | "text": [
315 | "67043\n"
316 | ]
317 | }
318 | ],
319 | "source": [
320 | "tokenizer = buildVocabulary(train['question1'].tolist()+train['question2'].tolist()+val['question1'].tolist()+val['question2'].tolist()+test['question1'].tolist()+test['question2'].tolist())\n",
321 | "vocab_size = len(tokenizer.word_index) + 1\n",
322 | "print(vocab_size)\n",
323 | "\n",
324 | "x_train1 = getSequences(train['question1'], tokenizer, 200)\n",
325 | "x_train2 = getSequences(train['question2'], tokenizer, 200)\n",
326 | "y_train = tf.keras.utils.to_categorical(train['is_duplicate'])\n",
327 | "\n",
328 | "x_val1 = getSequences(val['question1'], tokenizer, 200)\n",
329 | "x_val2 = getSequences(val['question2'], tokenizer, 200)\n",
330 | "y_val = tf.keras.utils.to_categorical(val['is_duplicate'])\n",
331 | "\n",
332 | "x_test1 = getSequences(test['question1'], tokenizer, 200)\n",
333 | "x_test2 = getSequences(test['question2'], tokenizer, 200)\n",
334 | "y_test = tf.keras.utils.to_categorical(test['is_duplicate'])"
335 | ]
336 | },
337 | {
338 | "cell_type": "code",
339 | "execution_count": 7,
340 | "id": "580d251a",
341 | "metadata": {
342 | "execution": {
343 | "iopub.execute_input": "2021-11-27T13:41:10.228748Z",
344 | "iopub.status.busy": "2021-11-27T13:41:10.228020Z",
345 | "iopub.status.idle": "2021-11-27T13:41:10.705699Z",
346 | "shell.execute_reply": "2021-11-27T13:41:10.704306Z",
347 | "shell.execute_reply.started": "2021-11-27T13:12:14.356043Z"
348 | },
349 | "papermill": {
350 | "duration": 0.493073,
351 | "end_time": "2021-11-27T13:41:10.705828",
352 | "exception": false,
353 | "start_time": "2021-11-27T13:41:10.212755",
354 | "status": "completed"
355 | },
356 | "tags": []
357 | },
358 | "outputs": [
359 | {
360 | "name": "stderr",
361 | "output_type": "stream",
362 | "text": [
363 | "100%|██████████| 67042/67042 [00:00<00:00, 143587.33it/s]"
364 | ]
365 | },
366 | {
367 | "name": "stdout",
368 | "output_type": "stream",
369 | "text": [
370 | "(67043, 300)\n"
371 | ]
372 | },
373 | {
374 | "name": "stderr",
375 | "output_type": "stream",
376 | "text": [
377 | "\n"
378 | ]
379 | }
380 | ],
381 | "source": [
382 | "#embedding_vectors = loadGloveWordEmbeddings()\n",
383 | "#print(len(embedding_vectors))\n",
384 | "\n",
385 | "embedding_weight_matrix = getEmbeddingWeightMatrix(tokenizer.word_index)\n",
386 | "print(embedding_weight_matrix.shape)"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": 8,
392 | "id": "376d9fdb",
393 | "metadata": {
394 | "execution": {
395 | "iopub.execute_input": "2021-11-27T13:41:10.744344Z",
396 | "iopub.status.busy": "2021-11-27T13:41:10.743779Z",
397 | "iopub.status.idle": "2021-11-27T13:41:13.935711Z",
398 | "shell.execute_reply": "2021-11-27T13:41:13.934772Z",
399 | "shell.execute_reply.started": "2021-11-27T13:27:27.669498Z"
400 | },
401 | "papermill": {
402 | "duration": 3.215972,
403 | "end_time": "2021-11-27T13:41:13.935848",
404 | "exception": false,
405 | "start_time": "2021-11-27T13:41:10.719876",
406 | "status": "completed"
407 | },
408 | "tags": []
409 | },
410 | "outputs": [
411 | {
412 | "name": "stderr",
413 | "output_type": "stream",
414 | "text": [
415 | "2021-11-27 13:41:10.832116: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
416 | "2021-11-27 13:41:10.981618: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
417 | "2021-11-27 13:41:10.982427: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
418 | "2021-11-27 13:41:10.983858: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA\n",
419 | "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
420 | "2021-11-27 13:41:10.985130: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
421 | "2021-11-27 13:41:10.985799: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
422 | "2021-11-27 13:41:10.986424: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
423 | "2021-11-27 13:41:13.311232: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
424 | "2021-11-27 13:41:13.311964: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
425 | "2021-11-27 13:41:13.312674: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
426 | "2021-11-27 13:41:13.313255: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15403 MB memory: -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0\n"
427 | ]
428 | }
429 | ],
430 | "source": [
431 | "#he_initializer = tf.keras.initializers.HeUniform()\n",
432 | "inp1 = tf.keras.Input(shape=(x_train1.shape[1],))\n",
433 | "inp2 = tf.keras.Input(shape=(x_train2.shape[1],))\n",
434 | "\n",
435 | "inner1= tf.keras.layers.Embedding(input_dim=67043, output_dim=300, input_length=200, \n",
436 | " weights=[embedding_weight_matrix], trainable=True)(inp1)\n",
437 | "inner2= tf.keras.layers.Embedding(input_dim=67043, output_dim=300, input_length=200,\n",
438 | " weights=[embedding_weight_matrix], trainable=True)(inp2)\n",
439 | " \n",
440 | "inner = tf.keras.layers.concatenate([inner1+inner2, inner1-inner2, tf.math.multiply(inner1, inner2)], axis=-1)\n",
441 | "\n",
442 | "inner = tf.keras.backend.sum(inner, axis=1, keepdims=False)\n",
443 | "inner = tf.keras.layers.Dense(300, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01))(inner)\n",
444 | "inner = tf.keras.layers.Dropout(0.2)(inner)\n",
445 | "inner = tf.keras.layers.Dense(200, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01))(inner)\n",
446 | "inner = tf.keras.layers.Dropout(0.2)(inner)\n",
447 | "inner = tf.keras.layers.Dense(100, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01))(inner)\n",
448 | "inner = tf.keras.layers.Dropout(0.2)(inner)\n",
449 | "output = tf.keras.layers.Dense(2, activation='softmax')(inner)\n",
450 | "\n",
451 | "model = tf.keras.Model(inputs = [inp1, inp2], outputs = output)"
452 | ]
453 | },
454 | {
455 | "cell_type": "code",
456 | "execution_count": 9,
457 | "id": "e7dba2d8",
458 | "metadata": {
459 | "execution": {
460 | "iopub.execute_input": "2021-11-27T13:41:13.973100Z",
461 | "iopub.status.busy": "2021-11-27T13:41:13.972255Z",
462 | "iopub.status.idle": "2021-11-27T13:41:13.987912Z",
463 | "shell.execute_reply": "2021-11-27T13:41:13.988527Z",
464 | "shell.execute_reply.started": "2021-11-27T13:27:28.324190Z"
465 | },
466 | "papermill": {
467 | "duration": 0.037826,
468 | "end_time": "2021-11-27T13:41:13.988697",
469 | "exception": false,
470 | "start_time": "2021-11-27T13:41:13.950871",
471 | "status": "completed"
472 | },
473 | "tags": []
474 | },
475 | "outputs": [
476 | {
477 | "name": "stdout",
478 | "output_type": "stream",
479 | "text": [
480 | "Model: \"model\"\n",
481 | "__________________________________________________________________________________________________\n",
482 | "Layer (type) Output Shape Param # Connected to \n",
483 | "==================================================================================================\n",
484 | "input_1 (InputLayer) [(None, 200)] 0 \n",
485 | "__________________________________________________________________________________________________\n",
486 | "input_2 (InputLayer) [(None, 200)] 0 \n",
487 | "__________________________________________________________________________________________________\n",
488 | "embedding (Embedding) (None, 200, 300) 20112900 input_1[0][0] \n",
489 | "__________________________________________________________________________________________________\n",
490 | "embedding_1 (Embedding) (None, 200, 300) 20112900 input_2[0][0] \n",
491 | "__________________________________________________________________________________________________\n",
492 | "tf.__operators__.add (TFOpLambd (None, 200, 300) 0 embedding[0][0] \n",
493 | " embedding_1[0][0] \n",
494 | "__________________________________________________________________________________________________\n",
495 | "tf.math.subtract (TFOpLambda) (None, 200, 300) 0 embedding[0][0] \n",
496 | " embedding_1[0][0] \n",
497 | "__________________________________________________________________________________________________\n",
498 | "tf.math.multiply (TFOpLambda) (None, 200, 300) 0 embedding[0][0] \n",
499 | " embedding_1[0][0] \n",
500 | "__________________________________________________________________________________________________\n",
501 | "concatenate (Concatenate) (None, 200, 900) 0 tf.__operators__.add[0][0] \n",
502 | " tf.math.subtract[0][0] \n",
503 | " tf.math.multiply[0][0] \n",
504 | "__________________________________________________________________________________________________\n",
505 | "tf.math.reduce_sum (TFOpLambda) (None, 900) 0 concatenate[0][0] \n",
506 | "__________________________________________________________________________________________________\n",
507 | "dense (Dense) (None, 300) 270300 tf.math.reduce_sum[0][0] \n",
508 | "__________________________________________________________________________________________________\n",
509 | "dropout (Dropout) (None, 300) 0 dense[0][0] \n",
510 | "__________________________________________________________________________________________________\n",
511 | "dense_1 (Dense) (None, 200) 60200 dropout[0][0] \n",
512 | "__________________________________________________________________________________________________\n",
513 | "dropout_1 (Dropout) (None, 200) 0 dense_1[0][0] \n",
514 | "__________________________________________________________________________________________________\n",
515 | "dense_2 (Dense) (None, 100) 20100 dropout_1[0][0] \n",
516 | "__________________________________________________________________________________________________\n",
517 | "dropout_2 (Dropout) (None, 100) 0 dense_2[0][0] \n",
518 | "__________________________________________________________________________________________________\n",
519 | "dense_3 (Dense) (None, 2) 202 dropout_2[0][0] \n",
520 | "==================================================================================================\n",
521 | "Total params: 40,576,602\n",
522 | "Trainable params: 40,576,602\n",
523 | "Non-trainable params: 0\n",
524 | "__________________________________________________________________________________________________\n"
525 | ]
526 | }
527 | ],
528 | "source": [
529 | "model.compile(optimizer = \"adam\", loss = 'categorical_crossentropy', metrics=['accuracy'])\n",
530 | "model.summary()"
531 | ]
532 | },
533 | {
534 | "cell_type": "code",
535 | "execution_count": 10,
536 | "id": "5c0c86ea",
537 | "metadata": {
538 | "execution": {
539 | "iopub.execute_input": "2021-11-27T13:41:14.024992Z",
540 | "iopub.status.busy": "2021-11-27T13:41:14.024188Z",
541 | "iopub.status.idle": "2021-11-27T13:42:26.517456Z",
542 | "shell.execute_reply": "2021-11-27T13:42:26.518855Z",
543 | "shell.execute_reply.started": "2021-11-27T13:27:29.396799Z"
544 | },
545 | "papermill": {
546 | "duration": 72.514821,
547 | "end_time": "2021-11-27T13:42:26.519128",
548 | "exception": false,
549 | "start_time": "2021-11-27T13:41:14.004307",
550 | "status": "completed"
551 | },
552 | "tags": []
553 | },
554 | "outputs": [
555 | {
556 | "name": "stderr",
557 | "output_type": "stream",
558 | "text": [
559 | "2021-11-27 13:41:14.139989: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)\n"
560 | ]
561 | },
562 | {
563 | "name": "stdout",
564 | "output_type": "stream",
565 | "text": [
566 | "Epoch 1/4\n",
567 | "632/632 [==============================] - 23s 33ms/step - loss: 1.4585 - accuracy: 0.6716 - val_loss: 0.5944 - val_accuracy: 0.7091\n",
568 | "Epoch 2/4\n",
569 | "632/632 [==============================] - 16s 25ms/step - loss: 0.5432 - accuracy: 0.7499 - val_loss: 0.5638 - val_accuracy: 0.7303\n",
570 | "Epoch 3/4\n",
571 | "632/632 [==============================] - 15s 24ms/step - loss: 0.4664 - accuracy: 0.7947 - val_loss: 0.6222 - val_accuracy: 0.7380\n",
572 | "Epoch 4/4\n",
573 | "632/632 [==============================] - 15s 24ms/step - loss: 0.4202 - accuracy: 0.8218 - val_loss: 0.5795 - val_accuracy: 0.7284\n"
574 | ]
575 | }
576 | ],
577 | "source": [
578 | "save_weights = tf.keras.callbacks.ModelCheckpoint('cbow_mlp.h5', monitor='val_loss', save_best_only=True)\n",
579 | "checkpoint_filepath = 'weights.best.{epoch:01d}.hdf5'\n",
580 | "#model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,\n",
581 | "#verbose = 1,\n",
582 | "#monitor = 'val_loss',\n",
583 | "#save_best_only = False)\n",
584 | "history = model.fit((x_train1, x_train2), y_train,\n",
585 | " batch_size = 64,\n",
586 | " validation_data = ((x_val1, x_val2), y_val),\n",
587 | " validation_batch_size = 32,\n",
588 | " epochs=4, \n",
589 | " callbacks=[save_weights], \n",
590 | " verbose=1)"
591 | ]
592 | },
593 | {
594 | "cell_type": "code",
595 | "execution_count": 13,
596 | "id": "8e786b68",
597 | "metadata": {
598 | "execution": {
599 | "iopub.execute_input": "2021-11-27T13:42:28.244754Z",
600 | "iopub.status.busy": "2021-11-27T13:42:28.243805Z",
601 | "iopub.status.idle": "2021-11-27T13:43:09.569995Z",
602 | "shell.execute_reply": "2021-11-27T13:43:09.570512Z",
603 | "shell.execute_reply.started": "2021-11-27T13:31:52.063332Z"
604 | },
605 | "papermill": {
606 | "duration": 41.528106,
607 | "end_time": "2021-11-27T13:43:09.570664",
608 | "exception": false,
609 | "start_time": "2021-11-27T13:42:28.042558",
610 | "status": "completed"
611 | },
612 | "tags": []
613 | },
614 | "outputs": [
615 | {
616 | "name": "stdout",
617 | "output_type": "stream",
618 | "text": [
619 | "10108/10108 [==============================] - 23s 2ms/step - loss: 0.3454 - accuracy: 0.8793\n",
620 | "loss on test data is 0.345432311296463\n",
621 | "accuracy on test data is 0.8792945742607117\n"
622 | ]
623 | }
624 | ],
625 | "source": [
626 | "loss, accuracy = model.evaluate((x_test1, x_test2), y_test, batch_size=4, verbose=1)\n",
627 | "\n",
628 | "print('loss on test data is', loss)\n",
629 | "print('accuracy on test data is', accuracy)"
630 | ]
631 | },
632 | {
633 | "cell_type": "code",
634 | "execution_count": 14,
635 | "id": "c84c1ff4",
636 | "metadata": {
637 | "execution": {
638 | "iopub.execute_input": "2021-11-27T13:43:10.313909Z",
639 | "iopub.status.busy": "2021-11-27T13:43:10.306261Z",
640 | "iopub.status.idle": "2021-11-27T13:43:11.829214Z",
641 | "shell.execute_reply": "2021-11-27T13:43:11.829678Z",
642 | "shell.execute_reply.started": "2021-11-27T13:33:47.897210Z"
643 | },
644 | "papermill": {
645 | "duration": 1.948284,
646 | "end_time": "2021-11-27T13:43:11.829824",
647 | "exception": false,
648 | "start_time": "2021-11-27T13:43:09.881540",
649 | "status": "completed"
650 | },
651 | "tags": []
652 | },
653 | "outputs": [
654 | {
655 | "name": "stdout",
656 | "output_type": "stream",
657 | "text": [
658 | "F1_score on test is 0.8467144113582108\n"
659 | ]
660 | }
661 | ],
662 | "source": [
663 | "pred = model.predict((x_test1, x_test2))\n",
664 | "print('F1_score on test is', f1_score(np.argmax(pred, axis=1), np.argmax(y_test, axis=1)))\n"
665 | ]
666 | },
667 | {
668 | "cell_type": "code",
669 | "execution_count": null,
670 | "id": "9929df87",
671 | "metadata": {
672 | "papermill": {
673 | "duration": 0.308571,
674 | "end_time": "2021-11-27T13:43:12.450411",
675 | "exception": false,
676 | "start_time": "2021-11-27T13:43:12.141840",
677 | "status": "completed"
678 | },
679 | "tags": []
680 | },
681 | "outputs": [],
682 | "source": []
683 | }
684 | ],
685 | "metadata": {
686 | "kernelspec": {
687 | "display_name": "Python 3",
688 | "language": "python",
689 | "name": "python3"
690 | },
691 | "language_info": {
692 | "codemirror_mode": {
693 | "name": "ipython",
694 | "version": 3
695 | },
696 | "file_extension": ".py",
697 | "mimetype": "text/x-python",
698 | "name": "python",
699 | "nbconvert_exporter": "python",
700 | "pygments_lexer": "ipython3",
701 | "version": "3.8.8"
702 | },
703 | "papermill": {
704 | "default_parameters": {},
705 | "duration": 223.597372,
706 | "end_time": "2021-11-27T13:43:15.659790",
707 | "environment_variables": {},
708 | "exception": null,
709 | "input_path": "__notebook__.ipynb",
710 | "output_path": "__notebook__.ipynb",
711 | "parameters": {},
712 | "start_time": "2021-11-27T13:39:32.062418",
713 | "version": "2.3.3"
714 | }
715 | },
716 | "nbformat": 4,
717 | "nbformat_minor": 5
718 | }
719 |
--------------------------------------------------------------------------------
/CBOW MLP He initialisation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "f4ff64c0",
7 | "metadata": {
8 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
9 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
10 | "execution": {
11 | "iopub.execute_input": "2021-11-27T13:47:47.616966Z",
12 | "iopub.status.busy": "2021-11-27T13:47:47.615429Z",
13 | "iopub.status.idle": "2021-11-27T13:47:52.415578Z",
14 | "shell.execute_reply": "2021-11-27T13:47:52.414840Z",
15 | "shell.execute_reply.started": "2021-11-27T13:34:29.669569Z"
16 | },
17 | "papermill": {
18 | "duration": 4.817962,
19 | "end_time": "2021-11-27T13:47:52.415761",
20 | "exception": false,
21 | "start_time": "2021-11-27T13:47:47.597799",
22 | "status": "completed"
23 | },
24 | "tags": []
25 | },
26 | "outputs": [],
27 | "source": [
28 | "import pandas as pd\n",
29 | "import numpy as np\n",
30 | "from tqdm import tqdm\n",
31 | "import tensorflow as tf\n",
32 | "from gensim.models import KeyedVectors\n",
33 | "import gensim\n",
34 | "import re\n",
35 | "from sklearn.metrics import f1_score\n",
36 | "import matplotlib.pyplot as plt"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 2,
42 | "id": "4978cce8",
43 | "metadata": {
44 | "execution": {
45 | "iopub.execute_input": "2021-11-27T13:47:52.446265Z",
46 | "iopub.status.busy": "2021-11-27T13:47:52.445619Z",
47 | "iopub.status.idle": "2021-11-27T13:47:53.538577Z",
48 | "shell.execute_reply": "2021-11-27T13:47:53.538101Z",
49 | "shell.execute_reply.started": "2021-11-27T13:10:42.402545Z"
50 | },
51 | "papermill": {
52 | "duration": 1.109649,
53 | "end_time": "2021-11-27T13:47:53.538734",
54 | "exception": false,
55 | "start_time": "2021-11-27T13:47:52.429085",
56 | "status": "completed"
57 | },
58 | "tags": []
59 | },
60 | "outputs": [],
61 | "source": [
62 | "train = pd.read_csv('../input/quora-ques-pair/test_data.csv/test_data.csv').fillna('')\n",
63 | "val = pd.read_csv('../input/quora-ques-pair/val_data.csv/val_data.csv').fillna('')\n",
64 | "test = pd.read_csv('../input/quora-ques-pair/test_data.csv/test_data.csv').fillna('')"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 3,
70 | "id": "93666f68",
71 | "metadata": {
72 | "execution": {
73 | "iopub.execute_input": "2021-11-27T13:47:53.565394Z",
74 | "iopub.status.busy": "2021-11-27T13:47:53.564877Z",
75 | "iopub.status.idle": "2021-11-27T13:47:53.568521Z",
76 | "shell.execute_reply": "2021-11-27T13:47:53.568123Z",
77 | "shell.execute_reply.started": "2021-11-27T13:10:43.715098Z"
78 | },
79 | "papermill": {
80 | "duration": 0.018444,
81 | "end_time": "2021-11-27T13:47:53.568628",
82 | "exception": false,
83 | "start_time": "2021-11-27T13:47:53.550184",
84 | "status": "completed"
85 | },
86 | "tags": []
87 | },
88 | "outputs": [],
89 | "source": [
90 | "word2vec_file = '../input/d/sandreds/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin'"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": 4,
96 | "id": "69688c16",
97 | "metadata": {
98 | "execution": {
99 | "iopub.execute_input": "2021-11-27T13:47:53.598579Z",
100 | "iopub.status.busy": "2021-11-27T13:47:53.597979Z",
101 | "iopub.status.idle": "2021-11-27T13:47:53.611503Z",
102 | "shell.execute_reply": "2021-11-27T13:47:53.611946Z",
103 | "shell.execute_reply.started": "2021-11-27T13:10:43.722268Z"
104 | },
105 | "papermill": {
106 | "duration": 0.03278,
107 | "end_time": "2021-11-27T13:47:53.612067",
108 | "exception": false,
109 | "start_time": "2021-11-27T13:47:53.579287",
110 | "status": "completed"
111 | },
112 | "tags": []
113 | },
114 | "outputs": [
115 | {
116 | "data": {
117 | "text/html": [
118 | "\n",
119 | "\n",
132 | "
\n",
133 | " \n",
134 | " \n",
135 | " | \n",
136 | " id | \n",
137 | " qid1 | \n",
138 | " qid2 | \n",
139 | " question1 | \n",
140 | " question2 | \n",
141 | " is_duplicate | \n",
142 | " question1_preprocessed | \n",
143 | " question2_preprocessed | \n",
144 | "
\n",
145 | " \n",
146 | " \n",
147 | " \n",
148 | " | 0 | \n",
149 | " 204673 | \n",
150 | " 93885 | \n",
151 | " 307635 | \n",
152 | " If there is a God, where is He! | \n",
153 | " Why is god a \"He\"? | \n",
154 | " 0 | \n",
155 | " if there is a god , where is he ! | \n",
156 | " why is god a `` he '' ? | \n",
157 | "
\n",
158 | " \n",
159 | " | 1 | \n",
160 | " 17716 | \n",
161 | " 2093 | \n",
162 | " 15628 | \n",
163 | " Do you believe that everything happens for a r... | \n",
164 | " Does everything happen for a reason? | \n",
165 | " 1 | \n",
166 | " do you believe that everything happens for a r... | \n",
167 | " does everything happen for a reason ? | \n",
168 | "
\n",
169 | " \n",
170 | " | 2 | \n",
171 | " 291767 | \n",
172 | " 352623 | \n",
173 | " 413255 | \n",
174 | " Will there always be web hosting that will sup... | \n",
175 | " Will there always be web hosting that supports... | \n",
176 | " 1 | \n",
177 | " will there always be web hosting that will sup... | \n",
178 | " will there always be web hosting that supports... | \n",
179 | "
\n",
180 | " \n",
181 | " | 3 | \n",
182 | " 203758 | \n",
183 | " 59824 | \n",
184 | " 67971 | \n",
185 | " What is the proof of Indian Army's surgical st... | \n",
186 | " Has India provided any proof of the surgical s... | \n",
187 | " 1 | \n",
188 | " what is the proof of indian army 's surgical s... | \n",
189 | " has india provided any proof of the surgical s... | \n",
190 | "
\n",
191 | " \n",
192 | " | 4 | \n",
193 | " 41747 | \n",
194 | " 75326 | \n",
195 | " 75327 | \n",
196 | " What do Indian Muslims think of Modi? | \n",
197 | " What do Indian Muslim think about PM Narendra ... | \n",
198 | " 1 | \n",
199 | " what do indian muslims think of modi ? | \n",
200 | " what do indian muslim think about pm narendra ... | \n",
201 | "
\n",
202 | " \n",
203 | "
\n",
204 | "
"
205 | ],
206 | "text/plain": [
207 | " id qid1 qid2 question1 \\\n",
208 | "0 204673 93885 307635 If there is a God, where is He! \n",
209 | "1 17716 2093 15628 Do you believe that everything happens for a r... \n",
210 | "2 291767 352623 413255 Will there always be web hosting that will sup... \n",
211 | "3 203758 59824 67971 What is the proof of Indian Army's surgical st... \n",
212 | "4 41747 75326 75327 What do Indian Muslims think of Modi? \n",
213 | "\n",
214 | " question2 is_duplicate \\\n",
215 | "0 Why is god a \"He\"? 0 \n",
216 | "1 Does everything happen for a reason? 1 \n",
217 | "2 Will there always be web hosting that supports... 1 \n",
218 | "3 Has India provided any proof of the surgical s... 1 \n",
219 | "4 What do Indian Muslim think about PM Narendra ... 1 \n",
220 | "\n",
221 | " question1_preprocessed \\\n",
222 | "0 if there is a god , where is he ! \n",
223 | "1 do you believe that everything happens for a r... \n",
224 | "2 will there always be web hosting that will sup... \n",
225 | "3 what is the proof of indian army 's surgical s... \n",
226 | "4 what do indian muslims think of modi ? \n",
227 | "\n",
228 | " question2_preprocessed \n",
229 | "0 why is god a `` he '' ? \n",
230 | "1 does everything happen for a reason ? \n",
231 | "2 will there always be web hosting that supports... \n",
232 | "3 has india provided any proof of the surgical s... \n",
233 | "4 what do indian muslim think about pm narendra ... "
234 | ]
235 | },
236 | "execution_count": 4,
237 | "metadata": {},
238 | "output_type": "execute_result"
239 | }
240 | ],
241 | "source": [
242 | "train.head()"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": 5,
248 | "id": "d38628e4",
249 | "metadata": {
250 | "execution": {
251 | "iopub.execute_input": "2021-11-27T13:47:53.641763Z",
252 | "iopub.status.busy": "2021-11-27T13:47:53.641235Z",
253 | "iopub.status.idle": "2021-11-27T13:48:55.410135Z",
254 | "shell.execute_reply": "2021-11-27T13:48:55.409588Z",
255 | "shell.execute_reply.started": "2021-11-27T13:10:43.746605Z"
256 | },
257 | "papermill": {
258 | "duration": 61.786788,
259 | "end_time": "2021-11-27T13:48:55.410279",
260 | "exception": false,
261 | "start_time": "2021-11-27T13:47:53.623491",
262 | "status": "completed"
263 | },
264 | "tags": []
265 | },
266 | "outputs": [],
267 | "source": [
268 | "def buildVocabulary(reviews):\n",
269 | " tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False, split=' ')\n",
270 | " tokenizer.fit_on_texts(reviews)\n",
271 | " return tokenizer\n",
272 | "\n",
273 | "def getSequences(reviews, tokenizer, seq_maxlen):\n",
274 | " reviews_seq = tokenizer.texts_to_sequences(reviews)\n",
275 | " return np.array(tf.keras.preprocessing.sequence.pad_sequences(reviews_seq, maxlen=seq_maxlen))\n",
276 | "\n",
277 | "word2vec_model = KeyedVectors.load_word2vec_format(word2vec_file, binary = True)\n",
278 | "\n",
279 | "def getEmbeddingWeightMatrix(word2idx): \n",
280 | " embedding_matrix = np.zeros((len(word2idx)+1, 300))\n",
281 | " for word, i in tqdm(word2idx.items()):\n",
282 | " \n",
283 | " embedding_vector = word2vec_model[word] if word in word2vec_model else np.random.rand(1,300)\n",
284 | " if embedding_vector is not None:\n",
285 | " embedding_matrix[i] = embedding_vector\n",
286 | " return embedding_matrix"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": 6,
292 | "id": "28c391f9",
293 | "metadata": {
294 | "execution": {
295 | "iopub.execute_input": "2021-11-27T13:48:55.462462Z",
296 | "iopub.status.busy": "2021-11-27T13:48:55.455009Z",
297 | "iopub.status.idle": "2021-11-27T13:49:09.398443Z",
298 | "shell.execute_reply": "2021-11-27T13:49:09.397946Z",
299 | "shell.execute_reply.started": "2021-11-27T13:11:45.518534Z"
300 | },
301 | "papermill": {
302 | "duration": 13.976622,
303 | "end_time": "2021-11-27T13:49:09.398601",
304 | "exception": false,
305 | "start_time": "2021-11-27T13:48:55.421979",
306 | "status": "completed"
307 | },
308 | "tags": []
309 | },
310 | "outputs": [
311 | {
312 | "name": "stdout",
313 | "output_type": "stream",
314 | "text": [
315 | "67043\n"
316 | ]
317 | }
318 | ],
319 | "source": [
320 | "tokenizer = buildVocabulary(train['question1'].tolist()+train['question2'].tolist()+val['question1'].tolist()+val['question2'].tolist()+test['question1'].tolist()+test['question2'].tolist())\n",
321 | "vocab_size = len(tokenizer.word_index) + 1\n",
322 | "print(vocab_size)\n",
323 | "\n",
324 | "x_train1 = getSequences(train['question1'], tokenizer, 200)\n",
325 | "x_train2 = getSequences(train['question2'], tokenizer, 200)\n",
326 | "y_train = tf.keras.utils.to_categorical(train['is_duplicate'])\n",
327 | "\n",
328 | "x_val1 = getSequences(val['question1'], tokenizer, 200)\n",
329 | "x_val2 = getSequences(val['question2'], tokenizer, 200)\n",
330 | "y_val = tf.keras.utils.to_categorical(val['is_duplicate'])\n",
331 | "\n",
332 | "x_test1 = getSequences(test['question1'], tokenizer, 200)\n",
333 | "x_test2 = getSequences(test['question2'], tokenizer, 200)\n",
334 | "y_test = tf.keras.utils.to_categorical(test['is_duplicate'])"
335 | ]
336 | },
337 | {
338 | "cell_type": "code",
339 | "execution_count": 7,
340 | "id": "2b48ce03",
341 | "metadata": {
342 | "execution": {
343 | "iopub.execute_input": "2021-11-27T13:49:09.427768Z",
344 | "iopub.status.busy": "2021-11-27T13:49:09.427184Z",
345 | "iopub.status.idle": "2021-11-27T13:49:09.887361Z",
346 | "shell.execute_reply": "2021-11-27T13:49:09.888055Z",
347 | "shell.execute_reply.started": "2021-11-27T13:12:14.356043Z"
348 | },
349 | "papermill": {
350 | "duration": 0.47744,
351 | "end_time": "2021-11-27T13:49:09.888245",
352 | "exception": false,
353 | "start_time": "2021-11-27T13:49:09.410805",
354 | "status": "completed"
355 | },
356 | "tags": []
357 | },
358 | "outputs": [
359 | {
360 | "name": "stderr",
361 | "output_type": "stream",
362 | "text": [
363 | "100%|██████████| 67042/67042 [00:00<00:00, 148266.73it/s]"
364 | ]
365 | },
366 | {
367 | "name": "stdout",
368 | "output_type": "stream",
369 | "text": [
370 | "(67043, 300)\n"
371 | ]
372 | },
373 | {
374 | "name": "stderr",
375 | "output_type": "stream",
376 | "text": [
377 | "\n"
378 | ]
379 | }
380 | ],
381 | "source": [
382 | "#embedding_vectors = loadGloveWordEmbeddings()\n",
383 | "#print(len(embedding_vectors))\n",
384 | "\n",
385 | "embedding_weight_matrix = getEmbeddingWeightMatrix(tokenizer.word_index)\n",
386 | "print(embedding_weight_matrix.shape)"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": 8,
392 | "id": "6d070a67",
393 | "metadata": {
394 | "execution": {
395 | "iopub.execute_input": "2021-11-27T13:49:09.928507Z",
396 | "iopub.status.busy": "2021-11-27T13:49:09.927955Z",
397 | "iopub.status.idle": "2021-11-27T13:49:12.560874Z",
398 | "shell.execute_reply": "2021-11-27T13:49:12.560395Z",
399 | "shell.execute_reply.started": "2021-11-27T13:40:42.738444Z"
400 | },
401 | "papermill": {
402 | "duration": 2.657589,
403 | "end_time": "2021-11-27T13:49:12.561002",
404 | "exception": false,
405 | "start_time": "2021-11-27T13:49:09.903413",
406 | "status": "completed"
407 | },
408 | "tags": []
409 | },
410 | "outputs": [
411 | {
412 | "name": "stderr",
413 | "output_type": "stream",
414 | "text": [
415 | "2021-11-27 13:49:10.021754: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
416 | "2021-11-27 13:49:10.127095: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
417 | "2021-11-27 13:49:10.127865: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
418 | "2021-11-27 13:49:10.129230: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA\n",
419 | "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
420 | "2021-11-27 13:49:10.130458: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
421 | "2021-11-27 13:49:10.131136: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
422 | "2021-11-27 13:49:10.131767: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
423 | "2021-11-27 13:49:11.945849: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
424 | "2021-11-27 13:49:11.946562: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
425 | "2021-11-27 13:49:11.947539: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
426 | "2021-11-27 13:49:11.948205: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15403 MB memory: -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0\n"
427 | ]
428 | }
429 | ],
430 | "source": [
431 | "he_initializer = tf.keras.initializers.HeUniform()\n",
432 | "inp1 = tf.keras.Input(shape=(x_train1.shape[1],))\n",
433 | "inp2 = tf.keras.Input(shape=(x_train2.shape[1],))\n",
434 | "\n",
435 | "inner1= tf.keras.layers.Embedding(input_dim=67043, output_dim=300, input_length=200, \n",
436 | " weights=[embedding_weight_matrix], trainable=True)(inp1)\n",
437 | "inner2= tf.keras.layers.Embedding(input_dim=67043, output_dim=300, input_length=200,\n",
438 | " weights=[embedding_weight_matrix], trainable=True)(inp2)\n",
439 | " \n",
440 | "inner = tf.keras.layers.concatenate([inner1+inner2, inner1-inner2, tf.math.multiply(inner1, inner2)], axis=-1)\n",
441 | "\n",
442 | "inner = tf.keras.backend.sum(inner, axis=1, keepdims=False)\n",
443 | "inner = tf.keras.layers.Dense(300, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01), kernel_initializer = he_initializer)(inner)\n",
444 | "inner = tf.keras.layers.Dropout(0.2)(inner)\n",
445 | "inner = tf.keras.layers.Dense(200, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01), kernel_initializer = he_initializer)(inner)\n",
446 | "inner = tf.keras.layers.Dropout(0.2)(inner)\n",
447 | "inner = tf.keras.layers.Dense(100, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01), kernel_initializer = he_initializer)(inner)\n",
448 | "inner = tf.keras.layers.Dropout(0.2)(inner)\n",
449 | "output = tf.keras.layers.Dense(2, activation='softmax')(inner)\n",
450 | "\n",
451 | "model = tf.keras.Model(inputs = [inp1, inp2], outputs = output)"
452 | ]
453 | },
454 | {
455 | "cell_type": "code",
456 | "execution_count": 9,
457 | "id": "060a6db0",
458 | "metadata": {
459 | "execution": {
460 | "iopub.execute_input": "2021-11-27T13:49:12.597283Z",
461 | "iopub.status.busy": "2021-11-27T13:49:12.596481Z",
462 | "iopub.status.idle": "2021-11-27T13:49:12.612017Z",
463 | "shell.execute_reply": "2021-11-27T13:49:12.611523Z",
464 | "shell.execute_reply.started": "2021-11-27T13:40:46.214676Z"
465 | },
466 | "papermill": {
467 | "duration": 0.036856,
468 | "end_time": "2021-11-27T13:49:12.612121",
469 | "exception": false,
470 | "start_time": "2021-11-27T13:49:12.575265",
471 | "status": "completed"
472 | },
473 | "tags": []
474 | },
475 | "outputs": [
476 | {
477 | "name": "stdout",
478 | "output_type": "stream",
479 | "text": [
480 | "Model: \"model\"\n",
481 | "__________________________________________________________________________________________________\n",
482 | "Layer (type) Output Shape Param # Connected to \n",
483 | "==================================================================================================\n",
484 | "input_1 (InputLayer) [(None, 200)] 0 \n",
485 | "__________________________________________________________________________________________________\n",
486 | "input_2 (InputLayer) [(None, 200)] 0 \n",
487 | "__________________________________________________________________________________________________\n",
488 | "embedding (Embedding) (None, 200, 300) 20112900 input_1[0][0] \n",
489 | "__________________________________________________________________________________________________\n",
490 | "embedding_1 (Embedding) (None, 200, 300) 20112900 input_2[0][0] \n",
491 | "__________________________________________________________________________________________________\n",
492 | "tf.__operators__.add (TFOpLambd (None, 200, 300) 0 embedding[0][0] \n",
493 | " embedding_1[0][0] \n",
494 | "__________________________________________________________________________________________________\n",
495 | "tf.math.subtract (TFOpLambda) (None, 200, 300) 0 embedding[0][0] \n",
496 | " embedding_1[0][0] \n",
497 | "__________________________________________________________________________________________________\n",
498 | "tf.math.multiply (TFOpLambda) (None, 200, 300) 0 embedding[0][0] \n",
499 | " embedding_1[0][0] \n",
500 | "__________________________________________________________________________________________________\n",
501 | "concatenate (Concatenate) (None, 200, 900) 0 tf.__operators__.add[0][0] \n",
502 | " tf.math.subtract[0][0] \n",
503 | " tf.math.multiply[0][0] \n",
504 | "__________________________________________________________________________________________________\n",
505 | "tf.math.reduce_sum (TFOpLambda) (None, 900) 0 concatenate[0][0] \n",
506 | "__________________________________________________________________________________________________\n",
507 | "dense (Dense) (None, 300) 270300 tf.math.reduce_sum[0][0] \n",
508 | "__________________________________________________________________________________________________\n",
509 | "dropout (Dropout) (None, 300) 0 dense[0][0] \n",
510 | "__________________________________________________________________________________________________\n",
511 | "dense_1 (Dense) (None, 200) 60200 dropout[0][0] \n",
512 | "__________________________________________________________________________________________________\n",
513 | "dropout_1 (Dropout) (None, 200) 0 dense_1[0][0] \n",
514 | "__________________________________________________________________________________________________\n",
515 | "dense_2 (Dense) (None, 100) 20100 dropout_1[0][0] \n",
516 | "__________________________________________________________________________________________________\n",
517 | "dropout_2 (Dropout) (None, 100) 0 dense_2[0][0] \n",
518 | "__________________________________________________________________________________________________\n",
519 | "dense_3 (Dense) (None, 2) 202 dropout_2[0][0] \n",
520 | "==================================================================================================\n",
521 | "Total params: 40,576,602\n",
522 | "Trainable params: 40,576,602\n",
523 | "Non-trainable params: 0\n",
524 | "__________________________________________________________________________________________________\n"
525 | ]
526 | }
527 | ],
528 | "source": [
529 | "model.compile(optimizer = \"adam\", loss = 'categorical_crossentropy', metrics=['accuracy'])\n",
530 | "model.summary()"
531 | ]
532 | },
533 | {
534 | "cell_type": "code",
535 | "execution_count": 10,
536 | "id": "cca93a0e",
537 | "metadata": {
538 | "execution": {
539 | "iopub.execute_input": "2021-11-27T13:49:12.646592Z",
540 | "iopub.status.busy": "2021-11-27T13:49:12.645774Z",
541 | "iopub.status.idle": "2021-11-27T13:50:35.620684Z",
542 | "shell.execute_reply": "2021-11-27T13:50:35.619421Z",
543 | "shell.execute_reply.started": "2021-11-27T13:42:31.052600Z"
544 | },
545 | "papermill": {
546 | "duration": 82.994418,
547 | "end_time": "2021-11-27T13:50:35.620868",
548 | "exception": false,
549 | "start_time": "2021-11-27T13:49:12.626450",
550 | "status": "completed"
551 | },
552 | "tags": []
553 | },
554 | "outputs": [
555 | {
556 | "name": "stdout",
557 | "output_type": "stream",
558 | "text": [
559 | "Epoch 1/4\n"
560 | ]
561 | },
562 | {
563 | "name": "stderr",
564 | "output_type": "stream",
565 | "text": [
566 | "2021-11-27 13:49:12.731819: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)\n"
567 | ]
568 | },
569 | {
570 | "name": "stdout",
571 | "output_type": "stream",
572 | "text": [
573 | "632/632 [==============================] - 18s 25ms/step - loss: 2.7537 - accuracy: 0.6674 - val_loss: 0.7220 - val_accuracy: 0.7186\n",
574 | "Epoch 2/4\n",
575 | "632/632 [==============================] - 14s 21ms/step - loss: 0.5893 - accuracy: 0.7484 - val_loss: 0.5575 - val_accuracy: 0.7417\n",
576 | "Epoch 3/4\n",
577 | "632/632 [==============================] - 16s 25ms/step - loss: 0.4798 - accuracy: 0.7898 - val_loss: 0.5790 - val_accuracy: 0.7410\n",
578 | "Epoch 4/4\n",
579 | "632/632 [==============================] - 16s 25ms/step - loss: 0.4240 - accuracy: 0.8221 - val_loss: 0.5786 - val_accuracy: 0.7387\n"
580 | ]
581 | }
582 | ],
583 | "source": [
584 | "save_weights = tf.keras.callbacks.ModelCheckpoint('cbow_mlp.h5', monitor='val_loss', save_best_only=True)\n",
585 | "checkpoint_filepath = 'weights.best.{epoch:01d}.hdf5'\n",
586 | "#model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,\n",
587 | "#verbose = 1,\n",
588 | "#monitor = 'val_loss',\n",
589 | "#save_best_only = False)\n",
590 | "history = model.fit((x_train1, x_train2), y_train,\n",
591 | " batch_size = 64,\n",
592 | " validation_data = ((x_val1, x_val2), y_val),\n",
593 | " validation_batch_size = 64,\n",
594 | " epochs=4, \n",
595 | " callbacks=[save_weights], \n",
596 | " verbose=1)"
597 | ]
598 | },
599 | {
600 | "cell_type": "code",
601 | "execution_count": 13,
602 | "id": "54a1dca3",
603 | "metadata": {
604 | "execution": {
605 | "iopub.execute_input": "2021-11-27T13:50:37.375741Z",
606 | "iopub.status.busy": "2021-11-27T13:50:37.374766Z",
607 | "iopub.status.idle": "2021-11-27T13:51:18.727911Z",
608 | "shell.execute_reply": "2021-11-27T13:51:18.728543Z",
609 | "shell.execute_reply.started": "2021-11-27T13:45:19.009457Z"
610 | },
611 | "papermill": {
612 | "duration": 41.560134,
613 | "end_time": "2021-11-27T13:51:18.728745",
614 | "exception": false,
615 | "start_time": "2021-11-27T13:50:37.168611",
616 | "status": "completed"
617 | },
618 | "tags": []
619 | },
620 | "outputs": [
621 | {
622 | "name": "stdout",
623 | "output_type": "stream",
624 | "text": [
625 | "10108/10108 [==============================] - 23s 2ms/step - loss: 0.3358 - accuracy: 0.8777\n",
626 | "loss on test data is 0.3357672095298767\n",
627 | "accuracy on test data is 0.8776867985725403\n"
628 | ]
629 | }
630 | ],
631 | "source": [
632 | "loss, accuracy = model.evaluate((x_test1, x_test2), y_test, batch_size=4, verbose=1)\n",
633 | "\n",
634 | "print('loss on test data is', loss)\n",
635 | "print('accuracy on test data is', accuracy)"
636 | ]
637 | },
638 | {
639 | "cell_type": "code",
640 | "execution_count": 14,
641 | "id": "112b1612",
642 | "metadata": {
643 | "execution": {
644 | "iopub.execute_input": "2021-11-27T13:51:19.723275Z",
645 | "iopub.status.busy": "2021-11-27T13:51:19.719029Z",
646 | "iopub.status.idle": "2021-11-27T13:51:22.786317Z",
647 | "shell.execute_reply": "2021-11-27T13:51:22.785865Z",
648 | "shell.execute_reply.started": "2021-11-27T13:47:08.430248Z"
649 | },
650 | "papermill": {
651 | "duration": 3.646437,
652 | "end_time": "2021-11-27T13:51:22.786439",
653 | "exception": false,
654 | "start_time": "2021-11-27T13:51:19.140002",
655 | "status": "completed"
656 | },
657 | "tags": []
658 | },
659 | "outputs": [
660 | {
661 | "name": "stdout",
662 | "output_type": "stream",
663 | "text": [
664 | "F1_score on test is 0.8322648485465214\n"
665 | ]
666 | }
667 | ],
668 | "source": [
669 | "pred = model.predict((x_test1, x_test2))\n",
670 | "print('F1_score on test is', f1_score(np.argmax(pred, axis=1), np.argmax(y_test, axis=1)))\n"
671 | ]
672 | },
673 | {
674 | "cell_type": "code",
675 | "execution_count": null,
676 | "id": "26e27882",
677 | "metadata": {
678 | "papermill": {
679 | "duration": 0.304682,
680 | "end_time": "2021-11-27T13:51:23.414985",
681 | "exception": false,
682 | "start_time": "2021-11-27T13:51:23.110303",
683 | "status": "completed"
684 | },
685 | "tags": []
686 | },
687 | "outputs": [],
688 | "source": []
689 | }
690 | ],
691 | "metadata": {
692 | "kernelspec": {
693 | "display_name": "Python 3",
694 | "language": "python",
695 | "name": "python3"
696 | },
697 | "language_info": {
698 | "codemirror_mode": {
699 | "name": "ipython",
700 | "version": 3
701 | },
702 | "file_extension": ".py",
703 | "mimetype": "text/x-python",
704 | "name": "python",
705 | "nbconvert_exporter": "python",
706 | "pygments_lexer": "ipython3",
707 | "version": "3.8.8"
708 | },
709 | "papermill": {
710 | "default_parameters": {},
711 | "duration": 226.464045,
712 | "end_time": "2021-11-27T13:51:26.695502",
713 | "environment_variables": {},
714 | "exception": null,
715 | "input_path": "__notebook__.ipynb",
716 | "output_path": "__notebook__.ipynb",
717 | "parameters": {},
718 | "start_time": "2021-11-27T13:47:40.231457",
719 | "version": "2.3.3"
720 | }
721 | },
722 | "nbformat": 4,
723 | "nbformat_minor": 5
724 | }
725 |
--------------------------------------------------------------------------------