├── README.md ├── glove-bilstm_paper_implementation.ipynb ├── glove-bilstm_experiment2.ipynb ├── CBOW MLP Ppaer Implementation.ipynb ├── glove-lstm_paper_experiment1.ipynb ├── CBOW MLP Sum Diff Product of Embeddings.ipynb ├── CBOW ML Dropout Regularisation.ipynb └── CBOW MLP He initialisation.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # smai_project 2 | 3 | 4 | This repository contains the code which partially fulfills the requirement for our course **Statistical Methods in AI**. The project is titled **Natural language understanding on Quora Question pairs dataset**. 5 | -------------------------------------------------------------------------------- /glove-bilstm_paper_implementation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 8 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", 9 | "execution": { 10 | "iopub.execute_input": "2021-12-05T07:21:24.696231Z", 11 | "iopub.status.busy": "2021-12-05T07:21:24.695446Z", 12 | "iopub.status.idle": "2021-12-05T07:21:29.177093Z", 13 | "shell.execute_reply": "2021-12-05T07:21:29.176247Z", 14 | "shell.execute_reply.started": "2021-12-05T07:21:24.696089Z" 15 | } 16 | }, 17 | "outputs": [], 18 | "source": [ 19 | "import pandas as pd\n", 20 | "import numpy as np\n", 21 | "from tqdm import tqdm\n", 22 | "import tensorflow as tf\n", 23 | "from sklearn.metrics import f1_score" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": { 30 | "execution": { 31 | "iopub.execute_input": "2021-12-05T07:21:29.179483Z", 32 | "iopub.status.busy": "2021-12-05T07:21:29.178997Z", 33 | "iopub.status.idle": "2021-12-05T07:21:32.486070Z", 34 | "shell.execute_reply": "2021-12-05T07:21:32.485330Z", 35 | "shell.execute_reply.started": 
"2021-12-05T07:21:29.179449Z" 36 | } 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "train = pd.read_csv('../input/smai-project-data/train_data.csv').fillna('')\n", 41 | "val = pd.read_csv('../input/smai-project-data/val_data.csv').fillna('')\n", 42 | "test = pd.read_csv('../input/smai-project-data/test_data.csv').fillna('')" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": { 49 | "execution": { 50 | "iopub.execute_input": "2021-12-05T07:21:32.489624Z", 51 | "iopub.status.busy": "2021-12-05T07:21:32.489415Z", 52 | "iopub.status.idle": "2021-12-05T07:21:32.511292Z", 53 | "shell.execute_reply": "2021-12-05T07:21:32.510674Z", 54 | "shell.execute_reply.started": "2021-12-05T07:21:32.489594Z" 55 | } 56 | }, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "text/html": [ 61 | "

\n", 62 | "\n", 75 | "\n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | "

	id	qid1	qid2	question1	question2	is_duplicate	question1_preprocessed	question2_preprocessed
0	8067	15738	15739	How do I play Pokémon GO in Korea?	How do I play Pokémon GO in China?	0	how do i play pok mon go in korea ?	how do i play pok mon go in china ?
1	368101	12736	104117	What are some of the best side dishes for crab...	What are some good side dishes for buffalo chi...	0	what are some of the best side dishes for crab...	what are some good side dishes for buffalo chi...
2	70497	121486	121487	Which is more advisable and better material fo...	What is the best server setup for buddypress?	0	which is more advisable and better material fo...	what is the best server setup for buddypress ?
3	226567	254474	258192	How do I improve logical programming skills?	How can I improve my logical skills for progra...	1	how do i improve logical programming skills ?	how can i improve my logical skills for progra...
4	73186	48103	3062	How close we are to see 3rd world war?	How close is a World War III?	1	how close we are to see 3rd world war ?	how close is a world war iii ?

\n", 147 | "

" 148 | ], 149 | "text/plain": [ 150 | " id qid1 qid2 question1 \\\n", 151 | "0 8067 15738 15739 How do I play Pokémon GO in Korea? \n", 152 | "1 368101 12736 104117 What are some of the best side dishes for crab... \n", 153 | "2 70497 121486 121487 Which is more advisable and better material fo... \n", 154 | "3 226567 254474 258192 How do I improve logical programming skills? \n", 155 | "4 73186 48103 3062 How close we are to see 3rd world war? \n", 156 | "\n", 157 | " question2 is_duplicate \\\n", 158 | "0 How do I play Pokémon GO in China? 0 \n", 159 | "1 What are some good side dishes for buffalo chi... 0 \n", 160 | "2 What is the best server setup for buddypress? 0 \n", 161 | "3 How can I improve my logical skills for progra... 1 \n", 162 | "4 How close is a World War III? 1 \n", 163 | "\n", 164 | " question1_preprocessed \\\n", 165 | "0 how do i play pok mon go in korea ? \n", 166 | "1 what are some of the best side dishes for crab... \n", 167 | "2 which is more advisable and better material fo... \n", 168 | "3 how do i improve logical programming skills ? \n", 169 | "4 how close we are to see 3rd world war ? \n", 170 | "\n", 171 | " question2_preprocessed \n", 172 | "0 how do i play pok mon go in china ? \n", 173 | "1 what are some good side dishes for buffalo chi... \n", 174 | "2 what is the best server setup for buddypress ? \n", 175 | "3 how can i improve my logical skills for progra... \n", 176 | "4 how close is a world war iii ? 
" 177 | ] 178 | }, 179 | "execution_count": 3, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "train.head()" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 4, 191 | "metadata": { 192 | "execution": { 193 | "iopub.execute_input": "2021-12-05T07:21:32.514130Z", 194 | "iopub.status.busy": "2021-12-05T07:21:32.513901Z", 195 | "iopub.status.idle": "2021-12-05T07:21:32.524800Z", 196 | "shell.execute_reply": "2021-12-05T07:21:32.524106Z", 197 | "shell.execute_reply.started": "2021-12-05T07:21:32.514094Z" 198 | } 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "def buildVocabulary(reviews):  # fit a case-preserving, space-split Keras Tokenizer on the given texts\n", 203 | "    tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False, split=' ')\n", 204 | "    tokenizer.fit_on_texts(reviews)\n", 205 | "    return tokenizer\n", 206 | "\n", 207 | "def getSequences(reviews, tokenizer, seq_maxlen):  # texts -> integer sequences of length seq_maxlen\n", 208 | "    reviews_seq = tokenizer.texts_to_sequences(reviews)\n", 209 | "    return np.array(tf.keras.preprocessing.sequence.pad_sequences(reviews_seq, maxlen=seq_maxlen))\n", 210 | "\n", 211 | "def loadGloveWordEmbeddings():  # parse the GloVe text file into {token: float32 vector}\n", 212 | "    embedding_vectors = {}\n", 213 | "    with open('../input/glove840b300dtxt/glove.840B.300d.txt', encoding='utf-8') as f:  # decode explicitly instead of relying on the platform default\n", 214 | "        for line in tqdm(f):\n", 215 | "            values = line.split(' ')\n", 216 | "            word = values[0]\n", 217 | "            coefs = np.asarray(values[1:], dtype='float32')\n", 218 | "            embedding_vectors[word] = coefs\n", 219 | "    return embedding_vectors\n", 220 | "\n", 221 | "def getEmbeddingWeightMatrix(embedding_vectors, word2idx):  # row i = vector of the word indexed i; rows stay zero for words without a vector\n", 222 | "    embedding_matrix = np.zeros((len(word2idx)+1, 300))\n", 223 | "    for word, i in tqdm(word2idx.items()):\n", 224 | "        embedding_vector = embedding_vectors.get(word)\n", 225 | "        if embedding_vector is not None:\n", 226 | "            embedding_matrix[i] = embedding_vector\n", 227 | "    return embedding_matrix" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 5, 233 | "metadata": { 234 | "execution": { 
235 | "iopub.execute_input": "2021-12-05T07:21:32.526477Z", 236 | "iopub.status.busy": "2021-12-05T07:21:32.526215Z", 237 | "iopub.status.idle": "2021-12-05T07:22:04.067116Z", 238 | "shell.execute_reply": "2021-12-05T07:22:04.066259Z", 239 | "shell.execute_reply.started": "2021-12-05T07:21:32.526443Z" 240 | } 241 | }, 242 | "outputs": [ 243 | { 244 | "name": "stdout", 245 | "output_type": "stream", 246 | "text": [ 247 | "119558\n" 248 | ] 249 | } 250 | ], 251 | "source": [ 252 | "tokenizer = buildVocabulary(train['question1'].tolist()+train['question2'].tolist()+val['question1'].tolist()+val['question2'].tolist()+test['question1'].tolist()+test['question2'].tolist())\n", 253 | "vocab_size = len(tokenizer.word_index) + 1\n", 254 | "print(vocab_size)\n", 255 | "\n", 256 | "x_train1 = getSequences(train['question1'], tokenizer, 128)\n", 257 | "x_train2 = getSequences(train['question2'], tokenizer, 128)\n", 258 | "y_train = tf.keras.utils.to_categorical(train['is_duplicate'])\n", 259 | "\n", 260 | "x_val1 = getSequences(val['question1'], tokenizer, 128)\n", 261 | "x_val2 = getSequences(val['question2'], tokenizer, 128)\n", 262 | "y_val = tf.keras.utils.to_categorical(val['is_duplicate'])\n", 263 | "\n", 264 | "x_test1 = getSequences(test['question1'], tokenizer, 128)\n", 265 | "x_test2 = getSequences(test['question2'], tokenizer, 128)\n", 266 | "y_test = tf.keras.utils.to_categorical(test['is_duplicate'])" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 6, 272 | "metadata": { 273 | "execution": { 274 | "iopub.execute_input": "2021-12-05T07:22:04.068714Z", 275 | "iopub.status.busy": "2021-12-05T07:22:04.068455Z", 276 | "iopub.status.idle": "2021-12-05T07:26:17.684935Z", 277 | "shell.execute_reply": "2021-12-05T07:26:17.684071Z", 278 | "shell.execute_reply.started": "2021-12-05T07:22:04.068679Z" 279 | } 280 | }, 281 | "outputs": [ 282 | { 283 | "name": "stderr", 284 | "output_type": "stream", 285 | "text": [ 286 | "2196018it [04:13, 
8673.48it/s]\n" 287 | ] 288 | }, 289 | { 290 | "name": "stdout", 291 | "output_type": "stream", 292 | "text": [ 293 | "2196017\n" 294 | ] 295 | }, 296 | { 297 | "name": "stderr", 298 | "output_type": "stream", 299 | "text": [ 300 | "100%|██████████| 119557/119557 [00:00<00:00, 289620.78it/s]" 301 | ] 302 | }, 303 | { 304 | "name": "stdout", 305 | "output_type": "stream", 306 | "text": [ 307 | "(119558, 300)\n" 308 | ] 309 | }, 310 | { 311 | "name": "stderr", 312 | "output_type": "stream", 313 | "text": [ 314 | "\n" 315 | ] 316 | } 317 | ], 318 | "source": [ 319 | "embedding_vectors = loadGloveWordEmbeddings()\n", 320 | "print(len(embedding_vectors))\n", 321 | "\n", 322 | "embedding_weight_matrix = getEmbeddingWeightMatrix(embedding_vectors, tokenizer.word_index)\n", 323 | "print(embedding_weight_matrix.shape)" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 8, 329 | "metadata": { 330 | "execution": { 331 | "iopub.execute_input": "2021-12-05T07:28:18.980466Z", 332 | "iopub.status.busy": "2021-12-05T07:28:18.979883Z", 333 | "iopub.status.idle": "2021-12-05T07:28:19.835475Z", 334 | "shell.execute_reply": "2021-12-05T07:28:19.834704Z", 335 | "shell.execute_reply.started": "2021-12-05T07:28:18.980425Z" 336 | } 337 | }, 338 | "outputs": [ 339 | { 340 | "name": "stderr", 341 | "output_type": "stream", 342 | "text": [ 343 | "2021-12-05 07:28:19.052944: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 143469600 exceeds 10% of free system memory.\n", 344 | "2021-12-05 07:28:19.259332: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 143469600 exceeds 10% of free system memory.\n" 345 | ] 346 | } 347 | ], 348 | "source": [ 349 | "inp1 = tf.keras.Input(shape=(x_train1.shape[1],))\n", 350 | "inp2 = tf.keras.Input(shape=(x_train2.shape[1],))\n", 351 | "\n", 352 | "inner1= tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=300, input_length=128,  # input_dim must equal the row count of embedding_weight_matrix\n", 353 | " weights=[embedding_weight_matrix], 
trainable=False)(inp1)\n", 354 | "inner2= tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=300, input_length=128,\n", 355 | " weights=[embedding_weight_matrix], trainable=False)(inp2)\n", 356 | "\n", 357 | "inner = tf.keras.layers.concatenate([inner1+inner2, inner1-inner2, tf.math.multiply(inner1, inner2)], axis=-1)\n", 358 | "\n", 359 | "out = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(150, kernel_regularizer='l2', dropout=0.1, return_sequences=True))(inner)\n", 360 | "\n", 361 | "out = tf.keras.backend.mean(out, axis=1, keepdims=False)\n", 362 | "\n", 363 | "output = tf.keras.layers.Dense(2, kernel_regularizer='l2', activation='softmax')(out)\n", 364 | "\n", 365 | "model = tf.keras.Model(inputs = [inp1, inp2], outputs = output)" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 9, 371 | "metadata": { 372 | "execution": { 373 | "iopub.execute_input": "2021-12-05T07:28:22.230481Z", 374 | "iopub.status.busy": "2021-12-05T07:28:22.229633Z", 375 | "iopub.status.idle": "2021-12-05T07:28:22.254231Z", 376 | "shell.execute_reply": "2021-12-05T07:28:22.252329Z", 377 | "shell.execute_reply.started": "2021-12-05T07:28:22.230431Z" 378 | } 379 | }, 380 | "outputs": [ 381 | { 382 | "name": "stdout", 383 | "output_type": "stream", 384 | "text": [ 385 | "Model: \"model\"\n", 386 | "__________________________________________________________________________________________________\n", 387 | "Layer (type) Output Shape Param # Connected to \n", 388 | "==================================================================================================\n", 389 | "input_3 (InputLayer) [(None, 128)] 0 \n", 390 | "__________________________________________________________________________________________________\n", 391 | "input_4 (InputLayer) [(None, 128)] 0 \n", 392 | "__________________________________________________________________________________________________\n", 393 | "embedding_2 (Embedding) (None, 128, 300) 35867400 input_3[0][0] \n", 394 
| "__________________________________________________________________________________________________\n", 395 | "embedding_3 (Embedding) (None, 128, 300) 35867400 input_4[0][0] \n", 396 | "__________________________________________________________________________________________________\n", 397 | "tf.__operators__.add_1 (TFOpLam (None, 128, 300) 0 embedding_2[0][0] \n", 398 | " embedding_3[0][0] \n", 399 | "__________________________________________________________________________________________________\n", 400 | "tf.math.subtract_1 (TFOpLambda) (None, 128, 300) 0 embedding_2[0][0] \n", 401 | " embedding_3[0][0] \n", 402 | "__________________________________________________________________________________________________\n", 403 | "tf.math.multiply_1 (TFOpLambda) (None, 128, 300) 0 embedding_2[0][0] \n", 404 | " embedding_3[0][0] \n", 405 | "__________________________________________________________________________________________________\n", 406 | "concatenate_1 (Concatenate) (None, 128, 900) 0 tf.__operators__.add_1[0][0] \n", 407 | " tf.math.subtract_1[0][0] \n", 408 | " tf.math.multiply_1[0][0] \n", 409 | "__________________________________________________________________________________________________\n", 410 | "bidirectional_1 (Bidirectional) (None, 128, 300) 1261200 concatenate_1[0][0] \n", 411 | "__________________________________________________________________________________________________\n", 412 | "tf.math.reduce_mean_1 (TFOpLamb (None, 300) 0 bidirectional_1[0][0] \n", 413 | "__________________________________________________________________________________________________\n", 414 | "dense (Dense) (None, 2) 602 tf.math.reduce_mean_1[0][0] \n", 415 | "==================================================================================================\n", 416 | "Total params: 72,996,602\n", 417 | "Trainable params: 1,261,802\n", 418 | "Non-trainable params: 71,734,800\n", 419 | 
"__________________________________________________________________________________________________\n" 420 | ] 421 | } 422 | ], 423 | "source": [ 424 | "model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics=['accuracy'])\n", 425 | "model.summary()" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 15, 431 | "metadata": { 432 | "execution": { 433 | "iopub.execute_input": "2021-12-05T07:54:34.407682Z", 434 | "iopub.status.busy": "2021-12-05T07:54:34.406978Z", 435 | "iopub.status.idle": "2021-12-05T08:02:56.589419Z", 436 | "shell.execute_reply": "2021-12-05T08:02:56.588628Z", 437 | "shell.execute_reply.started": "2021-12-05T07:54:34.407647Z" 438 | } 439 | }, 440 | "outputs": [ 441 | { 442 | "name": "stdout", 443 | "output_type": "stream", 444 | "text": [ 445 | "Epoch 1/2\n", 446 | "8844/8844 [==============================] - 249s 28ms/step - loss: 0.5754 - accuracy: 0.7337 - val_loss: 0.5890 - val_accuracy: 0.7333\n", 447 | "\n", 448 | "Epoch 00001: val_loss improved from inf to 0.58903, saving model to weights.best.hdf5\n", 449 | "Epoch 2/2\n", 450 | "8844/8844 [==============================] - 249s 28ms/step - loss: 0.5752 - accuracy: 0.7361 - val_loss: 0.5766 - val_accuracy: 0.7340\n", 451 | "\n", 452 | "Epoch 00002: val_loss improved from 0.58903 to 0.57661, saving model to weights.best.hdf5\n" 453 | ] 454 | } 455 | ], 456 | "source": [ 457 | "checkpoint_filepath = 'weights.best.hdf5'\n", 458 | "model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,\n", 459 | " verbose = 1, \n", 460 | " monitor = 'val_loss',\n", 461 | " save_best_only = True)\n", 462 | "\n", 463 | "history = model.fit((x_train1, x_train2), y_train,\n", 464 | " batch_size = 32,\n", 465 | " validation_data = ((x_val1, x_val2), y_val),\n", 466 | " validation_batch_size = 16,\n", 467 | " epochs=5,\n", 468 | " callbacks=[model_checkpoint_callback], \n", 469 | " verbose=1)" 470 | ] 471 | }, 472 | { 473 | 
"cell_type": "code", 474 | "execution_count": 18, 475 | "metadata": { 476 | "execution": { 477 | "iopub.execute_input": "2021-12-05T08:02:57.030112Z", 478 | "iopub.status.busy": "2021-12-05T08:02:57.029591Z", 479 | "iopub.status.idle": "2021-12-05T08:04:06.879698Z", 480 | "shell.execute_reply": "2021-12-05T08:04:06.878884Z", 481 | "shell.execute_reply.started": "2021-12-05T08:02:57.030075Z" 482 | } 483 | }, 484 | "outputs": [ 485 | { 486 | "name": "stdout", 487 | "output_type": "stream", 488 | "text": [ 489 | "10108/10108 [==============================] - 70s 7ms/step - loss: 0.5730 - accuracy: 0.7340\n", 490 | "loss on test data is 0.5730125308036804\n", 491 | "accuracy on test data is 0.7340275645256042\n" 492 | ] 493 | } 494 | ], 495 | "source": [ 496 | "loss, accuracy = model.evaluate((x_test1, x_test2), y_test, batch_size=4, verbose=1)\n", 497 | "\n", 498 | "print('loss on test data is', loss)\n", 499 | "print('accuracy on test data is', accuracy)" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": 19, 505 | "metadata": { 506 | "execution": { 507 | "iopub.execute_input": "2021-12-05T08:04:06.881174Z", 508 | "iopub.status.busy": "2021-12-05T08:04:06.880894Z", 509 | "iopub.status.idle": "2021-12-05T08:04:17.186473Z", 510 | "shell.execute_reply": "2021-12-05T08:04:17.185721Z", 511 | "shell.execute_reply.started": "2021-12-05T08:04:06.881138Z" 512 | } 513 | }, 514 | "outputs": [ 515 | { 516 | "name": "stdout", 517 | "output_type": "stream", 518 | "text": [ 519 | "f1_score on test dataset is 0.6310516383599245\n" 520 | ] 521 | } 522 | ], 523 | "source": [ 524 | "pred = model.predict((x_test1, x_test2))\n", 525 | "\n", 526 | "print('f1_score on test dataset is', f1_score(np.argmax(y_test, axis=1), np.argmax(pred, axis=1)))  # sklearn argument order is (y_true, y_pred)" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": {}, 533 | "outputs": [], 534 | "source": [] 535 | } 536 | ], 537 | "metadata": { 538 | "kernelspec": { 539 | 
"display_name": "Python 3 (ipykernel)", 540 | "language": "python", 541 | "name": "python3" 542 | }, 543 | "language_info": { 544 | "codemirror_mode": { 545 | "name": "ipython", 546 | "version": 3 547 | }, 548 | "file_extension": ".py", 549 | "mimetype": "text/x-python", 550 | "name": "python", 551 | "nbconvert_exporter": "python", 552 | "pygments_lexer": "ipython3", 553 | "version": "3.8.10" 554 | } 555 | }, 556 | "nbformat": 4, 557 | "nbformat_minor": 4 558 | } 559 | -------------------------------------------------------------------------------- /glove-bilstm_experiment2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 8 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", 9 | "execution": { 10 | "iopub.execute_input": "2021-12-05T08:27:55.991770Z", 11 | "iopub.status.busy": "2021-12-05T08:27:55.991383Z", 12 | "iopub.status.idle": "2021-12-05T08:28:00.572205Z", 13 | "shell.execute_reply": "2021-12-05T08:28:00.571459Z", 14 | "shell.execute_reply.started": "2021-12-05T08:27:55.991676Z" 15 | } 16 | }, 17 | "outputs": [], 18 | "source": [ 19 | "import pandas as pd\n", 20 | "import numpy as np\n", 21 | "from tqdm import tqdm\n", 22 | "import tensorflow as tf\n", 23 | "from sklearn.metrics import f1_score" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": { 30 | "execution": { 31 | "iopub.execute_input": "2021-12-05T08:28:00.575744Z", 32 | "iopub.status.busy": "2021-12-05T08:28:00.575547Z", 33 | "iopub.status.idle": "2021-12-05T08:28:03.698196Z", 34 | "shell.execute_reply": "2021-12-05T08:28:03.697341Z", 35 | "shell.execute_reply.started": "2021-12-05T08:28:00.575719Z" 36 | } 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "train = pd.read_csv('../input/smai-project-data/train_data.csv').fillna('')\n", 41 | "val = 
pd.read_csv('../input/smai-project-data/val_data.csv').fillna('')\n", 42 | "test = pd.read_csv('../input/smai-project-data/test_data.csv').fillna('')" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": { 49 | "execution": { 50 | "iopub.execute_input": "2021-12-05T08:28:03.701370Z", 51 | "iopub.status.busy": "2021-12-05T08:28:03.699765Z", 52 | "iopub.status.idle": "2021-12-05T08:28:03.725788Z", 53 | "shell.execute_reply": "2021-12-05T08:28:03.725005Z", 54 | "shell.execute_reply.started": "2021-12-05T08:28:03.701323Z" 55 | } 56 | }, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "text/html": [ 61 | "

	id	qid1	qid2	question1	question2	is_duplicate	question1_preprocessed	question2_preprocessed
0	8067	15738	15739	How do I play Pokémon GO in Korea?	How do I play Pokémon GO in China?	0	how do i play pok mon go in korea ?	how do i play pok mon go in china ?
1	368101	12736	104117	What are some of the best side dishes for crab...	What are some good side dishes for buffalo chi...	0	what are some of the best side dishes for crab...	what are some good side dishes for buffalo chi...
2	70497	121486	121487	Which is more advisable and better material fo...	What is the best server setup for buddypress?	0	which is more advisable and better material fo...	what is the best server setup for buddypress ?
3	226567	254474	258192	How do I improve logical programming skills?	How can I improve my logical skills for progra...	1	how do i improve logical programming skills ?	how can i improve my logical skills for progra...
4	73186	48103	3062	How close we are to see 3rd world war?	How close is a World War III?	1	how close we are to see 3rd world war ?	how close is a world war iii ?

\n", 147 | "

" 148 | ], 149 | "text/plain": [ 150 | " id qid1 qid2 question1 \\\n", 151 | "0 8067 15738 15739 How do I play Pokémon GO in Korea? \n", 152 | "1 368101 12736 104117 What are some of the best side dishes for crab... \n", 153 | "2 70497 121486 121487 Which is more advisable and better material fo... \n", 154 | "3 226567 254474 258192 How do I improve logical programming skills? \n", 155 | "4 73186 48103 3062 How close we are to see 3rd world war? \n", 156 | "\n", 157 | " question2 is_duplicate \\\n", 158 | "0 How do I play Pokémon GO in China? 0 \n", 159 | "1 What are some good side dishes for buffalo chi... 0 \n", 160 | "2 What is the best server setup for buddypress? 0 \n", 161 | "3 How can I improve my logical skills for progra... 1 \n", 162 | "4 How close is a World War III? 1 \n", 163 | "\n", 164 | " question1_preprocessed \\\n", 165 | "0 how do i play pok mon go in korea ? \n", 166 | "1 what are some of the best side dishes for crab... \n", 167 | "2 which is more advisable and better material fo... \n", 168 | "3 how do i improve logical programming skills ? \n", 169 | "4 how close we are to see 3rd world war ? \n", 170 | "\n", 171 | " question2_preprocessed \n", 172 | "0 how do i play pok mon go in china ? \n", 173 | "1 what are some good side dishes for buffalo chi... \n", 174 | "2 what is the best server setup for buddypress ? \n", 175 | "3 how can i improve my logical skills for progra... \n", 176 | "4 how close is a world war iii ? 
" 177 | ] 178 | }, 179 | "execution_count": 3, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "train.head()" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 4, 191 | "metadata": { 192 | "execution": { 193 | "iopub.execute_input": "2021-12-05T08:28:03.728265Z", 194 | "iopub.status.busy": "2021-12-05T08:28:03.727992Z", 195 | "iopub.status.idle": "2021-12-05T08:28:03.737968Z", 196 | "shell.execute_reply": "2021-12-05T08:28:03.736992Z", 197 | "shell.execute_reply.started": "2021-12-05T08:28:03.728229Z" 198 | } 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "def buildVocabulary(reviews):  # fit a case-preserving, space-split Keras Tokenizer on the given texts\n", 203 | "    tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False, split=' ')\n", 204 | "    tokenizer.fit_on_texts(reviews)\n", 205 | "    return tokenizer\n", 206 | "\n", 207 | "def getSequences(reviews, tokenizer, seq_maxlen):  # texts -> integer sequences of length seq_maxlen\n", 208 | "    reviews_seq = tokenizer.texts_to_sequences(reviews)\n", 209 | "    return np.array(tf.keras.preprocessing.sequence.pad_sequences(reviews_seq, maxlen=seq_maxlen))\n", 210 | "\n", 211 | "def loadGloveWordEmbeddings():  # parse the GloVe text file into {token: float32 vector}\n", 212 | "    embedding_vectors = {}\n", 213 | "    with open('../input/glove840b300dtxt/glove.840B.300d.txt', encoding='utf-8') as f:  # decode explicitly instead of relying on the platform default\n", 214 | "        for line in tqdm(f):\n", 215 | "            values = line.split(' ')\n", 216 | "            word = values[0]\n", 217 | "            coefs = np.asarray(values[1:], dtype='float32')\n", 218 | "            embedding_vectors[word] = coefs\n", 219 | "    return embedding_vectors\n", 220 | "\n", 221 | "def getEmbeddingWeightMatrix(embedding_vectors, word2idx):  # row i = vector of the word indexed i; rows stay zero for words without a vector\n", 222 | "    embedding_matrix = np.zeros((len(word2idx)+1, 300))\n", 223 | "    for word, i in tqdm(word2idx.items()):\n", 224 | "        embedding_vector = embedding_vectors.get(word)\n", 225 | "        if embedding_vector is not None:\n", 226 | "            embedding_matrix[i] = embedding_vector\n", 227 | "    return embedding_matrix" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 5, 233 | "metadata": { 234 | "execution": { 
235 | "iopub.execute_input": "2021-12-05T08:28:03.740285Z", 236 | "iopub.status.busy": "2021-12-05T08:28:03.739650Z", 237 | "iopub.status.idle": "2021-12-05T08:28:35.589006Z", 238 | "shell.execute_reply": "2021-12-05T08:28:35.588270Z", 239 | "shell.execute_reply.started": "2021-12-05T08:28:03.740250Z" 240 | } 241 | }, 242 | "outputs": [ 243 | { 244 | "name": "stdout", 245 | "output_type": "stream", 246 | "text": [ 247 | "119558\n" 248 | ] 249 | } 250 | ], 251 | "source": [ 252 | "tokenizer = buildVocabulary(train['question1'].tolist()+train['question2'].tolist()+val['question1'].tolist()+val['question2'].tolist()+test['question1'].tolist()+test['question2'].tolist())\n", 253 | "vocab_size = len(tokenizer.word_index) + 1\n", 254 | "print(vocab_size)\n", 255 | "\n", 256 | "x_train1 = getSequences(train['question1'], tokenizer, 128)\n", 257 | "x_train2 = getSequences(train['question2'], tokenizer, 128)\n", 258 | "y_train = tf.keras.utils.to_categorical(train['is_duplicate'])\n", 259 | "\n", 260 | "x_val1 = getSequences(val['question1'], tokenizer, 128)\n", 261 | "x_val2 = getSequences(val['question2'], tokenizer, 128)\n", 262 | "y_val = tf.keras.utils.to_categorical(val['is_duplicate'])\n", 263 | "\n", 264 | "x_test1 = getSequences(test['question1'], tokenizer, 128)\n", 265 | "x_test2 = getSequences(test['question2'], tokenizer, 128)\n", 266 | "y_test = tf.keras.utils.to_categorical(test['is_duplicate'])" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 6, 272 | "metadata": { 273 | "execution": { 274 | "iopub.execute_input": "2021-12-05T08:28:35.590734Z", 275 | "iopub.status.busy": "2021-12-05T08:28:35.590499Z", 276 | "iopub.status.idle": "2021-12-05T08:32:50.715266Z", 277 | "shell.execute_reply": "2021-12-05T08:32:50.714504Z", 278 | "shell.execute_reply.started": "2021-12-05T08:28:35.590699Z" 279 | } 280 | }, 281 | "outputs": [ 282 | { 283 | "name": "stderr", 284 | "output_type": "stream", 285 | "text": [ 286 | "2196018it [04:14, 
8621.81it/s]\n" 287 | ] 288 | }, 289 | { 290 | "name": "stdout", 291 | "output_type": "stream", 292 | "text": [ 293 | "2196017\n" 294 | ] 295 | }, 296 | { 297 | "name": "stderr", 298 | "output_type": "stream", 299 | "text": [ 300 | "100%|██████████| 119557/119557 [00:00<00:00, 296253.60it/s]" 301 | ] 302 | }, 303 | { 304 | "name": "stdout", 305 | "output_type": "stream", 306 | "text": [ 307 | "(119558, 300)\n" 308 | ] 309 | }, 310 | { 311 | "name": "stderr", 312 | "output_type": "stream", 313 | "text": [ 314 | "\n" 315 | ] 316 | } 317 | ], 318 | "source": [ 319 | "embedding_vectors = loadGloveWordEmbeddings()\n", 320 | "print(len(embedding_vectors))\n", 321 | "\n", 322 | "embedding_weight_matrix = getEmbeddingWeightMatrix(embedding_vectors, tokenizer.word_index)\n", 323 | "print(embedding_weight_matrix.shape)" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 7, 329 | "metadata": { 330 | "execution": { 331 | "iopub.execute_input": "2021-12-05T08:32:50.717025Z", 332 | "iopub.status.busy": "2021-12-05T08:32:50.716763Z", 333 | "iopub.status.idle": "2021-12-05T08:32:54.266862Z", 334 | "shell.execute_reply": "2021-12-05T08:32:54.266168Z", 335 | "shell.execute_reply.started": "2021-12-05T08:32:50.716989Z" 336 | } 337 | }, 338 | "outputs": [ 339 | { 340 | "name": "stderr", 341 | "output_type": "stream", 342 | "text": [ 343 | "2021-12-05 08:32:50.811328: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 344 | "2021-12-05 08:32:50.924265: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 345 | "2021-12-05 08:32:50.924979: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be 
at least one NUMA node, so returning NUMA node zero\n", 346 | "2021-12-05 08:32:50.926274: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA\n", 347 | "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", 348 | "2021-12-05 08:32:50.927066: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 349 | "2021-12-05 08:32:50.927770: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 350 | "2021-12-05 08:32:50.928410: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 351 | "2021-12-05 08:32:52.819781: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 352 | "2021-12-05 08:32:52.820602: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 353 | "2021-12-05 08:32:52.821254: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 354 | "2021-12-05 08:32:52.821837: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15403 MB memory: -> device: 
0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0\n", 355 | "2021-12-05 08:32:53.377677: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 143469600 exceeds 10% of free system memory.\n", 356 | "2021-12-05 08:32:53.611921: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 143469600 exceeds 10% of free system memory.\n" 357 | ] 358 | } 359 | ], 360 | "source": [ 361 | "inp1 = tf.keras.Input(shape=(x_train1.shape[1],))\n", 362 | "inp2 = tf.keras.Input(shape=(x_train2.shape[1],))\n", 363 | "\n", 364 | "inner1= tf.keras.layers.Embedding(input_dim=119558, output_dim=300, input_length=128, \n", 365 | " weights=[embedding_weight_matrix], trainable=False)(inp1)\n", 366 | "inner2= tf.keras.layers.Embedding(input_dim=119558, output_dim=300, input_length=128,\n", 367 | " weights=[embedding_weight_matrix], trainable=False)(inp2)\n", 368 | "\n", 369 | "inner = tf.keras.layers.concatenate([inner1+inner2, inner1-inner2, tf.math.multiply(inner1, inner2)], axis=-1)\n", 370 | "\n", 371 | "out = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(150, kernel_regularizer='l2', return_sequences=True))(inner)\n", 372 | "\n", 373 | "out = tf.keras.backend.mean(out, axis=1, keepdims=False)\n", 374 | "\n", 375 | "output = tf.keras.layers.Dense(2, kernel_regularizer='l2', activation='softmax')(out)\n", 376 | "\n", 377 | "model = tf.keras.Model(inputs = [inp1, inp2], outputs = output)" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 8, 383 | "metadata": { 384 | "execution": { 385 | "iopub.execute_input": "2021-12-05T08:32:54.268296Z", 386 | "iopub.status.busy": "2021-12-05T08:32:54.268035Z", 387 | "iopub.status.idle": "2021-12-05T08:32:54.288773Z", 388 | "shell.execute_reply": "2021-12-05T08:32:54.288146Z", 389 | "shell.execute_reply.started": "2021-12-05T08:32:54.268263Z" 390 | } 391 | }, 392 | "outputs": [ 393 | { 394 | "name": "stdout", 395 | "output_type": "stream", 396 | "text": [ 397 | 
"Model: \"model\"\n", 398 | "__________________________________________________________________________________________________\n", 399 | "Layer (type) Output Shape Param # Connected to \n", 400 | "==================================================================================================\n", 401 | "input_1 (InputLayer) [(None, 128)] 0 \n", 402 | "__________________________________________________________________________________________________\n", 403 | "input_2 (InputLayer) [(None, 128)] 0 \n", 404 | "__________________________________________________________________________________________________\n", 405 | "embedding (Embedding) (None, 128, 300) 35867400 input_1[0][0] \n", 406 | "__________________________________________________________________________________________________\n", 407 | "embedding_1 (Embedding) (None, 128, 300) 35867400 input_2[0][0] \n", 408 | "__________________________________________________________________________________________________\n", 409 | "tf.__operators__.add (TFOpLambd (None, 128, 300) 0 embedding[0][0] \n", 410 | " embedding_1[0][0] \n", 411 | "__________________________________________________________________________________________________\n", 412 | "tf.math.subtract (TFOpLambda) (None, 128, 300) 0 embedding[0][0] \n", 413 | " embedding_1[0][0] \n", 414 | "__________________________________________________________________________________________________\n", 415 | "tf.math.multiply (TFOpLambda) (None, 128, 300) 0 embedding[0][0] \n", 416 | " embedding_1[0][0] \n", 417 | "__________________________________________________________________________________________________\n", 418 | "concatenate (Concatenate) (None, 128, 900) 0 tf.__operators__.add[0][0] \n", 419 | " tf.math.subtract[0][0] \n", 420 | " tf.math.multiply[0][0] \n", 421 | "__________________________________________________________________________________________________\n", 422 | "bidirectional (Bidirectional) (None, 128, 300) 1261200 concatenate[0][0] \n", 423 
| "__________________________________________________________________________________________________\n", 424 | "tf.math.reduce_mean (TFOpLambda (None, 300) 0 bidirectional[0][0] \n", 425 | "__________________________________________________________________________________________________\n", 426 | "dense (Dense) (None, 2) 602 tf.math.reduce_mean[0][0] \n", 427 | "==================================================================================================\n", 428 | "Total params: 72,996,602\n", 429 | "Trainable params: 1,261,802\n", 430 | "Non-trainable params: 71,734,800\n", 431 | "__________________________________________________________________________________________________\n" 432 | ] 433 | } 434 | ], 435 | "source": [ 436 | "model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics=['accuracy'])\n", 437 | "model.summary()" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 9, 443 | "metadata": { 444 | "execution": { 445 | "iopub.execute_input": "2021-12-05T08:32:54.290164Z", 446 | "iopub.status.busy": "2021-12-05T08:32:54.289923Z", 447 | "iopub.status.idle": "2021-12-05T08:54:19.362354Z", 448 | "shell.execute_reply": "2021-12-05T08:54:19.361515Z", 449 | "shell.execute_reply.started": "2021-12-05T08:32:54.290131Z" 450 | } 451 | }, 452 | "outputs": [ 453 | { 454 | "name": "stderr", 455 | "output_type": "stream", 456 | "text": [ 457 | "2021-12-05 08:32:54.295725: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 144897536 exceeds 10% of free system memory.\n", 458 | "2021-12-05 08:32:54.402477: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 144897536 exceeds 10% of free system memory.\n", 459 | "2021-12-05 08:32:54.568071: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)\n" 460 | ] 461 | }, 462 | { 463 | "name": "stdout", 464 | "output_type": "stream", 465 | "text": [ 466 | "Epoch 1/5\n" 467 | ] 
468 | }, 469 | { 470 | "name": "stderr", 471 | "output_type": "stream", 472 | "text": [ 473 | "2021-12-05 08:32:58.225519: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005\n" 474 | ] 475 | }, 476 | { 477 | "name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "8844/8844 [==============================] - ETA: 0s - loss: 0.6319 - accuracy: 0.7057" 481 | ] 482 | }, 483 | { 484 | "name": "stderr", 485 | "output_type": "stream", 486 | "text": [ 487 | "2021-12-05 08:36:24.846568: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 41399296 exceeds 10% of free system memory.\n" 488 | ] 489 | }, 490 | { 491 | "name": "stdout", 492 | "output_type": "stream", 493 | "text": [ 494 | "8844/8844 [==============================] - 250s 28ms/step - loss: 0.6319 - accuracy: 0.7057 - val_loss: 0.5694 - val_accuracy: 0.7317\n", 495 | "\n", 496 | "Epoch 00001: val_loss improved from inf to 0.56941, saving model to weights.best.hdf5\n", 497 | "Epoch 2/5\n", 498 | "8844/8844 [==============================] - 245s 28ms/step - loss: 0.5812 - accuracy: 0.7274 - val_loss: 0.5639 - val_accuracy: 0.7362\n", 499 | "\n", 500 | "Epoch 00002: val_loss improved from 0.56941 to 0.56394, saving model to weights.best.hdf5\n", 501 | "Epoch 3/5\n", 502 | "8844/8844 [==============================] - 243s 28ms/step - loss: 0.5915 - accuracy: 0.7223 - val_loss: 0.6492 - val_accuracy: 0.6690\n", 503 | "\n", 504 | "Epoch 00003: val_loss did not improve from 0.56394\n", 505 | "Epoch 4/5\n", 506 | "8844/8844 [==============================] - 244s 28ms/step - loss: 0.5840 - accuracy: 0.7287 - val_loss: 0.5697 - val_accuracy: 0.7288\n", 507 | "\n", 508 | "Epoch 00004: val_loss did not improve from 0.56394\n", 509 | "Epoch 5/5\n", 510 | "8844/8844 [==============================] - 243s 28ms/step - loss: 0.5734 - accuracy: 0.7334 - val_loss: 0.5623 - val_accuracy: 0.7404\n", 511 | "\n", 512 | "Epoch 00005: val_loss improved from 0.56394 to 0.56234, 
saving model to weights.best.hdf5\n" 513 | ] 514 | } 515 | ], 516 | "source": [ 517 | "checkpoint_filepath = 'weights.best.hdf5'\n", 518 | "model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,\n", 519 | " verbose = 1, \n", 520 | " monitor = 'val_loss',\n", 521 | " save_best_only = True)\n", 522 | "\n", 523 | "history = model.fit((x_train1, x_train2), y_train,\n", 524 | " batch_size = 32,\n", 525 | " validation_data = ((x_val1, x_val2), y_val),\n", 526 | " validation_batch_size = 16,\n", 527 | " epochs=5,\n", 528 | " callbacks=[model_checkpoint_callback], \n", 529 | " verbose=1)" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 12, 535 | "metadata": { 536 | "execution": { 537 | "iopub.execute_input": "2021-12-05T08:54:19.834213Z", 538 | "iopub.status.busy": "2021-12-05T08:54:19.833805Z", 539 | "iopub.status.idle": "2021-12-05T08:55:29.256507Z", 540 | "shell.execute_reply": "2021-12-05T08:55:29.255776Z", 541 | "shell.execute_reply.started": "2021-12-05T08:54:19.834175Z" 542 | } 543 | }, 544 | "outputs": [ 545 | { 546 | "name": "stdout", 547 | "output_type": "stream", 548 | "text": [ 549 | "10108/10108 [==============================] - 69s 7ms/step - loss: 0.5629 - accuracy: 0.7411\n", 550 | "loss on test data is 0.5628555417060852\n", 551 | "accuracy on test data is 0.7410769462585449\n" 552 | ] 553 | } 554 | ], 555 | "source": [ 556 | "loss, accuracy = model.evaluate((x_test1, x_test2), y_test, batch_size=4, verbose=1)\n", 557 | "\n", 558 | "print('loss on test data is', loss)\n", 559 | "print('accuracy on test data is', accuracy)" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": 13, 565 | "metadata": { 566 | "execution": { 567 | "iopub.execute_input": "2021-12-05T08:55:29.258466Z", 568 | "iopub.status.busy": "2021-12-05T08:55:29.258123Z", 569 | "iopub.status.idle": "2021-12-05T08:55:40.114361Z", 570 | "shell.execute_reply": "2021-12-05T08:55:40.113551Z", 571 | 
"shell.execute_reply.started": "2021-12-05T08:55:29.258423Z" 572 | } 573 | }, 574 | "outputs": [ 575 | { 576 | "name": "stdout", 577 | "output_type": "stream", 578 | "text": [ 579 | "f1_score on test dataset is 0.6306802145074795\n" 580 | ] 581 | } 582 | ], 583 | "source": [ 584 | "pred = model.predict((x_test1, x_test2))\n", 585 | "\n", 586 | "print('f1_score on test dataset is', f1_score(np.argmax(pred, axis=1), np.argmax(y_test, axis=1)))" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": null, 592 | "metadata": {}, 593 | "outputs": [], 594 | "source": [] 595 | } 596 | ], 597 | "metadata": { 598 | "kernelspec": { 599 | "display_name": "Python 3 (ipykernel)", 600 | "language": "python", 601 | "name": "python3" 602 | }, 603 | "language_info": { 604 | "codemirror_mode": { 605 | "name": "ipython", 606 | "version": 3 607 | }, 608 | "file_extension": ".py", 609 | "mimetype": "text/x-python", 610 | "name": "python", 611 | "nbconvert_exporter": "python", 612 | "pygments_lexer": "ipython3", 613 | "version": "3.8.10" 614 | } 615 | }, 616 | "nbformat": 4, 617 | "nbformat_minor": 4 618 | } 619 | -------------------------------------------------------------------------------- /CBOW MLP Ppaer Implementation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "8ca79fa1", 7 | "metadata": { 8 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 9 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", 10 | "execution": { 11 | "iopub.execute_input": "2021-12-05T06:41:42.716152Z", 12 | "iopub.status.busy": "2021-12-05T06:41:42.714638Z", 13 | "iopub.status.idle": "2021-12-05T06:41:47.285389Z", 14 | "shell.execute_reply": "2021-12-05T06:41:47.284775Z", 15 | "shell.execute_reply.started": "2021-12-05T06:28:02.081372Z" 16 | }, 17 | "papermill": { 18 | "duration": 4.588331, 19 | "end_time": "2021-12-05T06:41:47.285538", 20 | 
"exception": false, 21 | "start_time": "2021-12-05T06:41:42.697207", 22 | "status": "completed" 23 | }, 24 | "tags": [] 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "import pandas as pd\n", 29 | "import numpy as np\n", 30 | "from tqdm import tqdm\n", 31 | "import tensorflow as tf\n", 32 | "from gensim.models import KeyedVectors\n", 33 | "import gensim\n", 34 | "import re\n", 35 | "from sklearn.metrics import f1_score\n", 36 | "import matplotlib.pyplot as plt" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "id": "57d8136d", 43 | "metadata": { 44 | "execution": { 45 | "iopub.execute_input": "2021-12-05T06:41:47.314041Z", 46 | "iopub.status.busy": "2021-12-05T06:41:47.313448Z", 47 | "iopub.status.idle": "2021-12-05T06:41:48.775380Z", 48 | "shell.execute_reply": "2021-12-05T06:41:48.774838Z", 49 | "shell.execute_reply.started": "2021-12-05T06:28:06.599367Z" 50 | }, 51 | "papermill": { 52 | "duration": 1.478299, 53 | "end_time": "2021-12-05T06:41:48.775521", 54 | "exception": false, 55 | "start_time": "2021-12-05T06:41:47.297222", 56 | "status": "completed" 57 | }, 58 | "tags": [] 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "train = pd.read_csv('../input/quora-ques-pair/test_data.csv/test_data.csv').fillna('')\n", 63 | "val = pd.read_csv('../input/quora-ques-pair/val_data.csv/val_data.csv').fillna('')\n", 64 | "test = pd.read_csv('../input/quora-ques-pair/test_data.csv/test_data.csv').fillna('')" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 3, 70 | "id": "0fb8cffe", 71 | "metadata": { 72 | "execution": { 73 | "iopub.execute_input": "2021-12-05T06:41:48.800775Z", 74 | "iopub.status.busy": "2021-12-05T06:41:48.799943Z", 75 | "iopub.status.idle": "2021-12-05T06:41:48.804283Z", 76 | "shell.execute_reply": "2021-12-05T06:41:48.803802Z", 77 | "shell.execute_reply.started": "2021-12-05T06:28:07.887882Z" 78 | }, 79 | "papermill": { 80 | "duration": 0.017602, 81 | "end_time": "2021-12-05T06:41:48.804397", 82 
| "exception": false, 83 | "start_time": "2021-12-05T06:41:48.786795", 84 | "status": "completed" 85 | }, 86 | "tags": [] 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "word2vec_file = '../input/d/sandreds/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin'" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 4, 96 | "id": "0a74cebd", 97 | "metadata": { 98 | "execution": { 99 | "iopub.execute_input": "2021-12-05T06:41:48.834263Z", 100 | "iopub.status.busy": "2021-12-05T06:41:48.833473Z", 101 | "iopub.status.idle": "2021-12-05T06:41:48.846019Z", 102 | "shell.execute_reply": "2021-12-05T06:41:48.846450Z", 103 | "shell.execute_reply.started": "2021-12-05T06:28:07.895005Z" 104 | }, 105 | "papermill": { 106 | "duration": 0.031576, 107 | "end_time": "2021-12-05T06:41:48.846572", 108 | "exception": false, 109 | "start_time": "2021-12-05T06:41:48.814996", 110 | "status": "completed" 111 | }, 112 | "tags": [] 113 | }, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/html": [ 118 | "

\n", 119 | "\n", 132 | "\n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | "

	id	qid1	qid2	question1	question2	is_duplicate	question1_preprocessed	question2_preprocessed
0	204673	93885	307635	If there is a God, where is He!	Why is god a \"He\"?	0	if there is a god , where is he !	why is god a `` he '' ?
1	17716	2093	15628	Do you believe that everything happens for a r...	Does everything happen for a reason?	1	do you believe that everything happens for a r...	does everything happen for a reason ?
2	291767	352623	413255	Will there always be web hosting that will sup...	Will there always be web hosting that supports...	1	will there always be web hosting that will sup...	will there always be web hosting that supports...
3	203758	59824	67971	What is the proof of Indian Army's surgical st...	Has India provided any proof of the surgical s...	1	what is the proof of indian army 's surgical s...	has india provided any proof of the surgical s...
4	41747	75326	75327	What do Indian Muslims think of Modi?	What do Indian Muslim think about PM Narendra ...	1	what do indian muslims think of modi ?	what do indian muslim think about pm narendra ...

\n", 204 | "

" 205 | ], 206 | "text/plain": [ 207 | " id qid1 qid2 question1 \\\n", 208 | "0 204673 93885 307635 If there is a God, where is He! \n", 209 | "1 17716 2093 15628 Do you believe that everything happens for a r... \n", 210 | "2 291767 352623 413255 Will there always be web hosting that will sup... \n", 211 | "3 203758 59824 67971 What is the proof of Indian Army's surgical st... \n", 212 | "4 41747 75326 75327 What do Indian Muslims think of Modi? \n", 213 | "\n", 214 | " question2 is_duplicate \\\n", 215 | "0 Why is god a \"He\"? 0 \n", 216 | "1 Does everything happen for a reason? 1 \n", 217 | "2 Will there always be web hosting that supports... 1 \n", 218 | "3 Has India provided any proof of the surgical s... 1 \n", 219 | "4 What do Indian Muslim think about PM Narendra ... 1 \n", 220 | "\n", 221 | " question1_preprocessed \\\n", 222 | "0 if there is a god , where is he ! \n", 223 | "1 do you believe that everything happens for a r... \n", 224 | "2 will there always be web hosting that will sup... \n", 225 | "3 what is the proof of indian army 's surgical s... \n", 226 | "4 what do indian muslims think of modi ? \n", 227 | "\n", 228 | " question2_preprocessed \n", 229 | "0 why is god a `` he '' ? \n", 230 | "1 does everything happen for a reason ? \n", 231 | "2 will there always be web hosting that supports... \n", 232 | "3 has india provided any proof of the surgical s... \n", 233 | "4 what do indian muslim think about pm narendra ... 
" 234 | ] 235 | }, 236 | "execution_count": 4, 237 | "metadata": {}, 238 | "output_type": "execute_result" 239 | } 240 | ], 241 | "source": [ 242 | "train.head()" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 5, 248 | "id": "e183a4fe", 249 | "metadata": { 250 | "execution": { 251 | "iopub.execute_input": "2021-12-05T06:41:48.876516Z", 252 | "iopub.status.busy": "2021-12-05T06:41:48.875896Z", 253 | "iopub.status.idle": "2021-12-05T06:42:50.921199Z", 254 | "shell.execute_reply": "2021-12-05T06:42:50.920691Z", 255 | "shell.execute_reply.started": "2021-12-05T06:28:07.919112Z" 256 | }, 257 | "papermill": { 258 | "duration": 62.06346, 259 | "end_time": "2021-12-05T06:42:50.921351", 260 | "exception": false, 261 | "start_time": "2021-12-05T06:41:48.857891", 262 | "status": "completed" 263 | }, 264 | "tags": [] 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "def buildVocabulary(reviews):\n", 269 | " tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False, split=' ')\n", 270 | " tokenizer.fit_on_texts(reviews)\n", 271 | " return tokenizer\n", 272 | "\n", 273 | "def getSequences(reviews, tokenizer, seq_maxlen):\n", 274 | " reviews_seq = tokenizer.texts_to_sequences(reviews)\n", 275 | " return np.array(tf.keras.preprocessing.sequence.pad_sequences(reviews_seq, maxlen=seq_maxlen))\n", 276 | "\n", 277 | "word2vec_model = KeyedVectors.load_word2vec_format(word2vec_file, binary = True)\n", 278 | "\n", 279 | "def getEmbeddingWeightMatrix(word2idx): \n", 280 | " embedding_matrix = np.zeros((len(word2idx)+1, 300))\n", 281 | " for word, i in tqdm(word2idx.items()):\n", 282 | " \n", 283 | " embedding_vector = word2vec_model[word] if word in word2vec_model else np.random.rand(1,300)\n", 284 | " if embedding_vector is not None:\n", 285 | " embedding_matrix[i] = embedding_vector\n", 286 | " return embedding_matrix" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 6, 292 | "id": "40a3ebc0", 293 | "metadata": { 294 | 
"execution": { 295 | "iopub.execute_input": "2021-12-05T06:42:50.974813Z", 296 | "iopub.status.busy": "2021-12-05T06:42:50.963194Z", 297 | "iopub.status.idle": "2021-12-05T06:43:04.692817Z", 298 | "shell.execute_reply": "2021-12-05T06:43:04.691844Z", 299 | "shell.execute_reply.started": "2021-12-05T06:29:09.661554Z" 300 | }, 301 | "papermill": { 302 | "duration": 13.76014, 303 | "end_time": "2021-12-05T06:43:04.692993", 304 | "exception": false, 305 | "start_time": "2021-12-05T06:42:50.932853", 306 | "status": "completed" 307 | }, 308 | "tags": [] 309 | }, 310 | "outputs": [ 311 | { 312 | "name": "stdout", 313 | "output_type": "stream", 314 | "text": [ 315 | "67043\n" 316 | ] 317 | } 318 | ], 319 | "source": [ 320 | "tokenizer = buildVocabulary(train['question1'].tolist()+train['question2'].tolist()+val['question1'].tolist()+val['question2'].tolist()+test['question1'].tolist()+test['question2'].tolist())\n", 321 | "vocab_size = len(tokenizer.word_index) + 1\n", 322 | "print(vocab_size)\n", 323 | "\n", 324 | "x_train1 = getSequences(train['question1'], tokenizer, 200)\n", 325 | "x_train2 = getSequences(train['question2'], tokenizer, 200)\n", 326 | "y_train = tf.keras.utils.to_categorical(train['is_duplicate'])\n", 327 | "\n", 328 | "x_val1 = getSequences(val['question1'], tokenizer, 200)\n", 329 | "x_val2 = getSequences(val['question2'], tokenizer, 200)\n", 330 | "y_val = tf.keras.utils.to_categorical(val['is_duplicate'])\n", 331 | "\n", 332 | "x_test1 = getSequences(test['question1'], tokenizer, 200)\n", 333 | "x_test2 = getSequences(test['question2'], tokenizer, 200)\n", 334 | "y_test = tf.keras.utils.to_categorical(test['is_duplicate'])" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 7, 340 | "id": "61abd026", 341 | "metadata": { 342 | "execution": { 343 | "iopub.execute_input": "2021-12-05T06:43:04.722083Z", 344 | "iopub.status.busy": "2021-12-05T06:43:04.721506Z", 345 | "iopub.status.idle": "2021-12-05T06:43:05.183883Z", 346 | 
"shell.execute_reply": "2021-12-05T06:43:05.182396Z", 347 | "shell.execute_reply.started": "2021-12-05T06:29:22.925663Z" 348 | }, 349 | "papermill": { 350 | "duration": 0.478999, 351 | "end_time": "2021-12-05T06:43:05.184024", 352 | "exception": false, 353 | "start_time": "2021-12-05T06:43:04.705025", 354 | "status": "completed" 355 | }, 356 | "tags": [] 357 | }, 358 | "outputs": [ 359 | { 360 | "name": "stderr", 361 | "output_type": "stream", 362 | "text": [ 363 | "100%|██████████| 67042/67042 [00:00<00:00, 148076.38it/s]" 364 | ] 365 | }, 366 | { 367 | "name": "stdout", 368 | "output_type": "stream", 369 | "text": [ 370 | "(67043, 300)\n" 371 | ] 372 | }, 373 | { 374 | "name": "stderr", 375 | "output_type": "stream", 376 | "text": [ 377 | "\n" 378 | ] 379 | } 380 | ], 381 | "source": [ 382 | "#embedding_vectors = loadGloveWordEmbeddings()\n", 383 | "#print(len(embedding_vectors))\n", 384 | "\n", 385 | "embedding_weight_matrix = getEmbeddingWeightMatrix(tokenizer.word_index)\n", 386 | "print(embedding_weight_matrix.shape)" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 8, 392 | "id": "59b405dc", 393 | "metadata": { 394 | "execution": { 395 | "iopub.execute_input": "2021-12-05T06:43:05.221581Z", 396 | "iopub.status.busy": "2021-12-05T06:43:05.221015Z", 397 | "iopub.status.idle": "2021-12-05T06:43:07.840042Z", 398 | "shell.execute_reply": "2021-12-05T06:43:07.839136Z", 399 | "shell.execute_reply.started": "2021-12-05T06:29:23.399993Z" 400 | }, 401 | "papermill": { 402 | "duration": 2.642422, 403 | "end_time": "2021-12-05T06:43:07.840177", 404 | "exception": false, 405 | "start_time": "2021-12-05T06:43:05.197755", 406 | "status": "completed" 407 | }, 408 | "tags": [] 409 | }, 410 | "outputs": [ 411 | { 412 | "name": "stderr", 413 | "output_type": "stream", 414 | "text": [ 415 | "2021-12-05 06:43:05.301034: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there 
must be at least one NUMA node, so returning NUMA node zero\n", 416 | "2021-12-05 06:43:05.400333: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 417 | "2021-12-05 06:43:05.401024: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 418 | "2021-12-05 06:43:05.402338: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA\n", 419 | "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", 420 | "2021-12-05 06:43:05.403313: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 421 | "2021-12-05 06:43:05.404059: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 422 | "2021-12-05 06:43:05.404737: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 423 | "2021-12-05 06:43:07.213294: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 424 | "2021-12-05 06:43:07.214235: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at 
least one NUMA node, so returning NUMA node zero\n", 425 | "2021-12-05 06:43:07.215011: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 426 | "2021-12-05 06:43:07.215600: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15403 MB memory: -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0\n" 427 | ] 428 | } 429 | ], 430 | "source": [ 431 | "#he_initializer = tf.keras.initializers.HeUniform()\n", 432 | "inp1 = tf.keras.Input(shape=(x_train1.shape[1],))\n", 433 | "inp2 = tf.keras.Input(shape=(x_train2.shape[1],))\n", 434 | "\n", 435 | "inner1= tf.keras.layers.Embedding(input_dim=67043, output_dim=300, input_length=200, \n", 436 | " weights=[embedding_weight_matrix], trainable=True)(inp1)\n", 437 | "inner2= tf.keras.layers.Embedding(input_dim=67043, output_dim=300, input_length=200,\n", 438 | " weights=[embedding_weight_matrix], trainable=True)(inp2)\n", 439 | "\n", 440 | "\n", 441 | "#inner = tf.keras.layers.concatenate([inner1+inner2, inner1-inner2, tf.math.multiply(inner1, inner2)], axis=-1)\n", 442 | "\n", 443 | "inner1 = tf.keras.backend.sum(inner1, axis=1, keepdims=False)\n", 444 | "inner2 = tf.keras.backend.sum(inner2, axis=1, keepdims=False)\n", 445 | "inner = inner1+inner2\n", 446 | "inner = tf.keras.layers.Dense(200, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01))(inner)\n", 447 | "inner = tf.keras.layers.Dropout(0.2)(inner)\n", 448 | "inner = tf.keras.layers.Dense(120, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01))(inner)\n", 449 | "inner = tf.keras.layers.Dropout(0.2)(inner)\n", 450 | "inner = tf.keras.layers.Dense(60, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01))(inner)\n", 451 | "inner = 
tf.keras.layers.Dropout(0.2)(inner)\n", 452 | "output = tf.keras.layers.Dense(2, activation='softmax')(inner)\n", 453 | "\n", 454 | "model = tf.keras.Model(inputs = [inp1, inp2], outputs = output)" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": 9, 460 | "id": "7eab1687", 461 | "metadata": { 462 | "execution": { 463 | "iopub.execute_input": "2021-12-05T06:43:07.876275Z", 464 | "iopub.status.busy": "2021-12-05T06:43:07.875614Z", 465 | "iopub.status.idle": "2021-12-05T06:43:07.891008Z", 466 | "shell.execute_reply": "2021-12-05T06:43:07.890387Z", 467 | "shell.execute_reply.started": "2021-12-05T06:29:26.103961Z" 468 | }, 469 | "papermill": { 470 | "duration": 0.036541, 471 | "end_time": "2021-12-05T06:43:07.891138", 472 | "exception": false, 473 | "start_time": "2021-12-05T06:43:07.854597", 474 | "status": "completed" 475 | }, 476 | "tags": [] 477 | }, 478 | "outputs": [ 479 | { 480 | "name": "stdout", 481 | "output_type": "stream", 482 | "text": [ 483 | "Model: \"model\"\n", 484 | "__________________________________________________________________________________________________\n", 485 | "Layer (type) Output Shape Param # Connected to \n", 486 | "==================================================================================================\n", 487 | "input_1 (InputLayer) [(None, 200)] 0 \n", 488 | "__________________________________________________________________________________________________\n", 489 | "input_2 (InputLayer) [(None, 200)] 0 \n", 490 | "__________________________________________________________________________________________________\n", 491 | "embedding (Embedding) (None, 200, 300) 20112900 input_1[0][0] \n", 492 | "__________________________________________________________________________________________________\n", 493 | "embedding_1 (Embedding) (None, 200, 300) 20112900 input_2[0][0] \n", 494 | "__________________________________________________________________________________________________\n", 495 | 
"tf.math.reduce_sum (TFOpLambda) (None, 300) 0 embedding[0][0] \n", 496 | "__________________________________________________________________________________________________\n", 497 | "tf.math.reduce_sum_1 (TFOpLambd (None, 300) 0 embedding_1[0][0] \n", 498 | "__________________________________________________________________________________________________\n", 499 | "tf.__operators__.add (TFOpLambd (None, 300) 0 tf.math.reduce_sum[0][0] \n", 500 | " tf.math.reduce_sum_1[0][0] \n", 501 | "__________________________________________________________________________________________________\n", 502 | "dense (Dense) (None, 200) 60200 tf.__operators__.add[0][0] \n", 503 | "__________________________________________________________________________________________________\n", 504 | "dropout (Dropout) (None, 200) 0 dense[0][0] \n", 505 | "__________________________________________________________________________________________________\n", 506 | "dense_1 (Dense) (None, 120) 24120 dropout[0][0] \n", 507 | "__________________________________________________________________________________________________\n", 508 | "dropout_1 (Dropout) (None, 120) 0 dense_1[0][0] \n", 509 | "__________________________________________________________________________________________________\n", 510 | "dense_2 (Dense) (None, 60) 7260 dropout_1[0][0] \n", 511 | "__________________________________________________________________________________________________\n", 512 | "dropout_2 (Dropout) (None, 60) 0 dense_2[0][0] \n", 513 | "__________________________________________________________________________________________________\n", 514 | "dense_3 (Dense) (None, 2) 122 dropout_2[0][0] \n", 515 | "==================================================================================================\n", 516 | "Total params: 40,317,502\n", 517 | "Trainable params: 40,317,502\n", 518 | "Non-trainable params: 0\n", 519 | 
"__________________________________________________________________________________________________\n" 520 | ] 521 | } 522 | ], 523 | "source": [ 524 | "model.compile(optimizer = \"adam\", loss = 'categorical_crossentropy', metrics=['accuracy'])\n", 525 | "model.summary()" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": 10, 531 | "id": "0aae2f11", 532 | "metadata": { 533 | "execution": { 534 | "iopub.execute_input": "2021-12-05T06:43:07.925895Z", 535 | "iopub.status.busy": "2021-12-05T06:43:07.925115Z", 536 | "iopub.status.idle": "2021-12-05T06:44:04.257889Z", 537 | "shell.execute_reply": "2021-12-05T06:44:04.258447Z", 538 | "shell.execute_reply.started": "2021-12-05T06:29:26.126805Z" 539 | }, 540 | "papermill": { 541 | "duration": 56.353085, 542 | "end_time": "2021-12-05T06:44:04.258638", 543 | "exception": false, 544 | "start_time": "2021-12-05T06:43:07.905553", 545 | "status": "completed" 546 | }, 547 | "tags": [] 548 | }, 549 | "outputs": [ 550 | { 551 | "name": "stderr", 552 | "output_type": "stream", 553 | "text": [ 554 | "2021-12-05 06:43:08.036642: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)\n" 555 | ] 556 | }, 557 | { 558 | "name": "stdout", 559 | "output_type": "stream", 560 | "text": [ 561 | "Epoch 1/4\n", 562 | "632/632 [==============================] - 14s 20ms/step - loss: 1.5897 - accuracy: 0.6573 - val_loss: 0.6889 - val_accuracy: 0.6883\n", 563 | "Epoch 2/4\n", 564 | "632/632 [==============================] - 13s 20ms/step - loss: 0.5678 - accuracy: 0.7383 - val_loss: 0.5906 - val_accuracy: 0.7223\n", 565 | "Epoch 3/4\n", 566 | "632/632 [==============================] - 12s 19ms/step - loss: 0.4759 - accuracy: 0.7885 - val_loss: 0.5830 - val_accuracy: 0.7369\n", 567 | "Epoch 4/4\n", 568 | "632/632 [==============================] - 12s 19ms/step - loss: 0.4199 - accuracy: 0.8166 - val_loss: 0.6234 - val_accuracy: 0.7322\n" 569 | ] 570 
| } 571 | ], 572 | "source": [ 573 | "save_weights = tf.keras.callbacks.ModelCheckpoint('cbow_mlp.h5', monitor='val_loss', save_best_only=True)\n", 574 | "checkpoint_filepath = 'weights.best.{epoch:01d}.hdf5'\n", 575 | "#model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,\n", 576 | "#verbose = 1,\n", 577 | "#monitor = 'val_loss',\n", 578 | "#save_best_only = False)\n", 579 | "history = model.fit((x_train1, x_train2), y_train,\n", 580 | " batch_size = 64,\n", 581 | " validation_data = ((x_val1, x_val2), y_val),\n", 582 | " validation_batch_size = 64,\n", 583 | " epochs=4, \n", 584 | " callbacks=[save_weights], \n", 585 | " verbose=1)" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": 13, 591 | "id": "6195ba4e", 592 | "metadata": { 593 | "execution": { 594 | "iopub.execute_input": "2021-12-05T06:44:05.863472Z", 595 | "iopub.status.busy": "2021-12-05T06:44:05.862509Z", 596 | "iopub.status.idle": "2021-12-05T06:44:28.229271Z", 597 | "shell.execute_reply": "2021-12-05T06:44:28.229695Z", 598 | "shell.execute_reply.started": "2021-12-05T06:40:02.762209Z" 599 | }, 600 | "papermill": { 601 | "duration": 22.559635, 602 | "end_time": "2021-12-05T06:44:28.229842", 603 | "exception": false, 604 | "start_time": "2021-12-05T06:44:05.670207", 605 | "status": "completed" 606 | }, 607 | "tags": [] 608 | }, 609 | "outputs": [ 610 | { 611 | "name": "stdout", 612 | "output_type": "stream", 613 | "text": [ 614 | "10108/10108 [==============================] - 22s 2ms/step - loss: 0.3327 - accuracy: 0.8621\n", 615 | "loss on test data is 0.3327052593231201\n", 616 | "accuracy on test data is 0.8620792031288147\n" 617 | ] 618 | } 619 | ], 620 | "source": [ 621 | "loss, accuracy = model.evaluate((x_test1, x_test2), y_test, batch_size=4, verbose=1)\n", 622 | "\n", 623 | "print('loss on test data is', loss)\n", 624 | "print('accuracy on test data is', accuracy)" 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | 
"execution_count": 14, 630 | "id": "3d4078c0", 631 | "metadata": { 632 | "execution": { 633 | "iopub.execute_input": "2021-12-05T06:44:28.830291Z", 634 | "iopub.status.busy": "2021-12-05T06:44:28.829339Z", 635 | "iopub.status.idle": "2021-12-05T06:44:31.520669Z", 636 | "shell.execute_reply": "2021-12-05T06:44:31.521533Z", 637 | "shell.execute_reply.started": "2021-12-05T06:40:27.475503Z" 638 | }, 639 | "papermill": { 640 | "duration": 2.995351, 641 | "end_time": "2021-12-05T06:44:31.521775", 642 | "exception": false, 643 | "start_time": "2021-12-05T06:44:28.526424", 644 | "status": "completed" 645 | }, 646 | "tags": [] 647 | }, 648 | "outputs": [ 649 | { 650 | "name": "stdout", 651 | "output_type": "stream", 652 | "text": [ 653 | "F1_score on test is 0.7974573192880495\n" 654 | ] 655 | } 656 | ], 657 | "source": [ 658 | "pred = model.predict((x_test1, x_test2))\n", 659 | "print('F1_score on test is', f1_score(np.argmax(pred, axis=1), np.argmax(y_test, axis=1)))\n" 660 | ] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "execution_count": null, 665 | "id": "b54608e8", 666 | "metadata": { 667 | "papermill": { 668 | "duration": 0.488623, 669 | "end_time": "2021-12-05T06:44:32.446218", 670 | "exception": false, 671 | "start_time": "2021-12-05T06:44:31.957595", 672 | "status": "completed" 673 | }, 674 | "tags": [] 675 | }, 676 | "outputs": [], 677 | "source": [] 678 | } 679 | ], 680 | "metadata": { 681 | "kernelspec": { 682 | "display_name": "Python 3", 683 | "language": "python", 684 | "name": "python3" 685 | }, 686 | "language_info": { 687 | "codemirror_mode": { 688 | "name": "ipython", 689 | "version": 3 690 | }, 691 | "file_extension": ".py", 692 | "mimetype": "text/x-python", 693 | "name": "python", 694 | "nbconvert_exporter": "python", 695 | "pygments_lexer": "ipython3", 696 | "version": "3.8.8" 697 | }, 698 | "papermill": { 699 | "default_parameters": {}, 700 | "duration": 181.391801, 701 | "end_time": "2021-12-05T06:44:36.786152", 702 | 
"environment_variables": {}, 703 | "exception": null, 704 | "input_path": "__notebook__.ipynb", 705 | "output_path": "__notebook__.ipynb", 706 | "parameters": {}, 707 | "start_time": "2021-12-05T06:41:35.394351", 708 | "version": "2.3.3" 709 | } 710 | }, 711 | "nbformat": 4, 712 | "nbformat_minor": 5 713 | } 714 | -------------------------------------------------------------------------------- /glove-lstm_paper_experiment1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "e8f714bf", 7 | "metadata": { 8 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 9 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", 10 | "execution": { 11 | "iopub.execute_input": "2021-11-27T17:31:54.134744Z", 12 | "iopub.status.busy": "2021-11-27T17:31:54.132593Z", 13 | "iopub.status.idle": "2021-11-27T17:31:59.374583Z", 14 | "shell.execute_reply": "2021-11-27T17:31:59.373668Z", 15 | "shell.execute_reply.started": "2021-11-27T17:18:16.053325Z" 16 | }, 17 | "papermill": { 18 | "duration": 5.264768, 19 | "end_time": "2021-11-27T17:31:59.374813", 20 | "exception": false, 21 | "start_time": "2021-11-27T17:31:54.110045", 22 | "status": "completed" 23 | }, 24 | "tags": [] 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "import pandas as pd\n", 29 | "import numpy as np\n", 30 | "from tqdm import tqdm\n", 31 | "import tensorflow as tf\n", 32 | "from sklearn.metrics import f1_score" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "id": "674aa0b8", 39 | "metadata": { 40 | "execution": { 41 | "iopub.execute_input": "2021-11-27T17:31:59.409729Z", 42 | "iopub.status.busy": "2021-11-27T17:31:59.408970Z", 43 | "iopub.status.idle": "2021-11-27T17:32:02.909462Z", 44 | "shell.execute_reply": "2021-11-27T17:32:02.908882Z", 45 | "shell.execute_reply.started": "2021-11-27T17:18:26.878800Z" 46 | }, 47 | "papermill": { 48 | "duration": 3.520358, 49 
| "end_time": "2021-11-27T17:32:02.909614", 50 | "exception": false, 51 | "start_time": "2021-11-27T17:31:59.389256", 52 | "status": "completed" 53 | }, 54 | "tags": [] 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "train = pd.read_csv('../input/smai-project-data/train_data.csv').fillna('')\n", 59 | "val = pd.read_csv('../input/smai-project-data/val_data.csv').fillna('')\n", 60 | "test = pd.read_csv('../input/smai-project-data/test_data.csv').fillna('')" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 3, 66 | "id": "0f8bc2ba", 67 | "metadata": { 68 | "execution": { 69 | "iopub.execute_input": "2021-11-27T17:32:02.947083Z", 70 | "iopub.status.busy": "2021-11-27T17:32:02.946007Z", 71 | "iopub.status.idle": "2021-11-27T17:32:02.962532Z", 72 | "shell.execute_reply": "2021-11-27T17:32:02.963124Z", 73 | "shell.execute_reply.started": "2021-11-27T17:18:30.138188Z" 74 | }, 75 | "papermill": { 76 | "duration": 0.040354, 77 | "end_time": "2021-11-27T17:32:02.963310", 78 | "exception": false, 79 | "start_time": "2021-11-27T17:32:02.922956", 80 | "status": "completed" 81 | }, 82 | "tags": [] 83 | }, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/html": [ 88 | "

\n", 89 | "\n", 102 | "\n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | "

	id	qid1	qid2	question1	question2	is_duplicate	question1_preprocessed	question2_preprocessed
0	8067	15738	15739	How do I play Pokémon GO in Korea?	How do I play Pokémon GO in China?	0	how do i play pok mon go in korea ?	how do i play pok mon go in china ?
1	368101	12736	104117	What are some of the best side dishes for crab...	What are some good side dishes for buffalo chi...	0	what are some of the best side dishes for crab...	what are some good side dishes for buffalo chi...
2	70497	121486	121487	Which is more advisable and better material fo...	What is the best server setup for buddypress?	0	which is more advisable and better material fo...	what is the best server setup for buddypress ?
3	226567	254474	258192	How do I improve logical programming skills?	How can I improve my logical skills for progra...	1	how do i improve logical programming skills ?	how can i improve my logical skills for progra...
4	73186	48103	3062	How close we are to see 3rd world war?	How close is a World War III?	1	how close we are to see 3rd world war ?	how close is a world war iii ?

\n", 174 | "

" 175 | ], 176 | "text/plain": [ 177 | " id qid1 qid2 question1 \\\n", 178 | "0 8067 15738 15739 How do I play Pokémon GO in Korea? \n", 179 | "1 368101 12736 104117 What are some of the best side dishes for crab... \n", 180 | "2 70497 121486 121487 Which is more advisable and better material fo... \n", 181 | "3 226567 254474 258192 How do I improve logical programming skills? \n", 182 | "4 73186 48103 3062 How close we are to see 3rd world war? \n", 183 | "\n", 184 | " question2 is_duplicate \\\n", 185 | "0 How do I play Pokémon GO in China? 0 \n", 186 | "1 What are some good side dishes for buffalo chi... 0 \n", 187 | "2 What is the best server setup for buddypress? 0 \n", 188 | "3 How can I improve my logical skills for progra... 1 \n", 189 | "4 How close is a World War III? 1 \n", 190 | "\n", 191 | " question1_preprocessed \\\n", 192 | "0 how do i play pok mon go in korea ? \n", 193 | "1 what are some of the best side dishes for crab... \n", 194 | "2 which is more advisable and better material fo... \n", 195 | "3 how do i improve logical programming skills ? \n", 196 | "4 how close we are to see 3rd world war ? \n", 197 | "\n", 198 | " question2_preprocessed \n", 199 | "0 how do i play pok mon go in china ? \n", 200 | "1 what are some good side dishes for buffalo chi... \n", 201 | "2 what is the best server setup for buddypress ? \n", 202 | "3 how can i improve my logical skills for progra... \n", 203 | "4 how close is a world war iii ? 
" 204 | ] 205 | }, 206 | "execution_count": 3, 207 | "metadata": {}, 208 | "output_type": "execute_result" 209 | } 210 | ], 211 | "source": [ 212 | "train.head()" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 4, 218 | "id": "3789630a", 219 | "metadata": { 220 | "execution": { 221 | "iopub.execute_input": "2021-11-27T17:32:03.002462Z", 222 | "iopub.status.busy": "2021-11-27T17:32:03.001753Z", 223 | "iopub.status.idle": "2021-11-27T17:32:03.006592Z", 224 | "shell.execute_reply": "2021-11-27T17:32:03.006031Z", 225 | "shell.execute_reply.started": "2021-11-27T17:18:30.162922Z" 226 | }, 227 | "papermill": { 228 | "duration": 0.029215, 229 | "end_time": "2021-11-27T17:32:03.006773", 230 | "exception": false, 231 | "start_time": "2021-11-27T17:32:02.977558", 232 | "status": "completed" 233 | }, 234 | "tags": [] 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "def buildVocabulary(reviews):\n", 239 | " tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False, split=' ')\n", 240 | " tokenizer.fit_on_texts(reviews)\n", 241 | " return tokenizer\n", 242 | "\n", 243 | "def getSequences(reviews, tokenizer, seq_maxlen):\n", 244 | " reviews_seq = tokenizer.texts_to_sequences(reviews)\n", 245 | " return np.array(tf.keras.preprocessing.sequence.pad_sequences(reviews_seq, maxlen=seq_maxlen))\n", 246 | "\n", 247 | "def loadGloveWordEmbeddings():\n", 248 | " embedding_vectors = {}\n", 249 | " with open('../input/glove840b300dtxt/glove.840B.300d.txt') as f:\n", 250 | " for line in tqdm(f):\n", 251 | " values = line.split(' ')\n", 252 | " word = values[0]\n", 253 | " coefs = np.asarray(values[1:], dtype='float32')\n", 254 | " embedding_vectors[word] = coefs\n", 255 | " return embedding_vectors\n", 256 | "\n", 257 | "def getEmbeddingWeightMatrix(embedding_vectors, word2idx): \n", 258 | " embedding_matrix = np.zeros((len(word2idx)+1, 300))\n", 259 | " for word, i in tqdm(word2idx.items()):\n", 260 | " embedding_vector = 
embedding_vectors.get(word)\n", 261 | " if embedding_vector is not None:\n", 262 | " embedding_matrix[i] = embedding_vector\n", 263 | " return embedding_matrix" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 5, 269 | "id": "33f503b8", 270 | "metadata": { 271 | "execution": { 272 | "iopub.execute_input": "2021-11-27T17:32:03.086757Z", 273 | "iopub.status.busy": "2021-11-27T17:32:03.043126Z", 274 | "iopub.status.idle": "2021-11-27T17:32:42.350295Z", 275 | "shell.execute_reply": "2021-11-27T17:32:42.349686Z", 276 | "shell.execute_reply.started": "2021-11-27T17:18:30.175848Z" 277 | }, 278 | "papermill": { 279 | "duration": 39.329657, 280 | "end_time": "2021-11-27T17:32:42.350460", 281 | "exception": false, 282 | "start_time": "2021-11-27T17:32:03.020803", 283 | "status": "completed" 284 | }, 285 | "tags": [] 286 | }, 287 | "outputs": [ 288 | { 289 | "name": "stdout", 290 | "output_type": "stream", 291 | "text": [ 292 | "119558\n" 293 | ] 294 | } 295 | ], 296 | "source": [ 297 | "tokenizer = buildVocabulary(train['question1'].tolist()+train['question2'].tolist()+val['question1'].tolist()+val['question2'].tolist()+test['question1'].tolist()+test['question2'].tolist())\n", 298 | "vocab_size = len(tokenizer.word_index) + 1\n", 299 | "print(vocab_size)\n", 300 | "\n", 301 | "x_train1 = getSequences(train['question1'], tokenizer, 128)\n", 302 | "x_train2 = getSequences(train['question2'], tokenizer, 128)\n", 303 | "y_train = tf.keras.utils.to_categorical(train['is_duplicate'])\n", 304 | "\n", 305 | "x_val1 = getSequences(val['question1'], tokenizer, 128)\n", 306 | "x_val2 = getSequences(val['question2'], tokenizer, 128)\n", 307 | "y_val = tf.keras.utils.to_categorical(val['is_duplicate'])\n", 308 | "\n", 309 | "x_test1 = getSequences(test['question1'], tokenizer, 128)\n", 310 | "x_test2 = getSequences(test['question2'], tokenizer, 128)\n", 311 | "y_test = tf.keras.utils.to_categorical(test['is_duplicate'])" 312 | ] 313 | }, 314 | { 315 | 
"cell_type": "code", 316 | "execution_count": 6, 317 | "id": "bdffb7fe", 318 | "metadata": { 319 | "execution": { 320 | "iopub.execute_input": "2021-11-27T17:32:42.385156Z", 321 | "iopub.status.busy": "2021-11-27T17:32:42.384482Z", 322 | "iopub.status.idle": "2021-11-27T17:37:39.763269Z", 323 | "shell.execute_reply": "2021-11-27T17:37:39.761448Z", 324 | "shell.execute_reply.started": "2021-11-27T17:19:01.967098Z" 325 | }, 326 | "papermill": { 327 | "duration": 297.397774, 328 | "end_time": "2021-11-27T17:37:39.763433", 329 | "exception": false, 330 | "start_time": "2021-11-27T17:32:42.365659", 331 | "status": "completed" 332 | }, 333 | "tags": [] 334 | }, 335 | "outputs": [ 336 | { 337 | "name": "stderr", 338 | "output_type": "stream", 339 | "text": [ 340 | "2196018it [04:56, 7397.00it/s]\n" 341 | ] 342 | }, 343 | { 344 | "name": "stdout", 345 | "output_type": "stream", 346 | "text": [ 347 | "2196017\n" 348 | ] 349 | }, 350 | { 351 | "name": "stderr", 352 | "output_type": "stream", 353 | "text": [ 354 | "100%|██████████| 119557/119557 [00:00<00:00, 255058.70it/s]" 355 | ] 356 | }, 357 | { 358 | "name": "stdout", 359 | "output_type": "stream", 360 | "text": [ 361 | "(119558, 300)\n" 362 | ] 363 | }, 364 | { 365 | "name": "stderr", 366 | "output_type": "stream", 367 | "text": [ 368 | "\n" 369 | ] 370 | } 371 | ], 372 | "source": [ 373 | "embedding_vectors = loadGloveWordEmbeddings()\n", 374 | "print(len(embedding_vectors))\n", 375 | "\n", 376 | "embedding_weight_matrix = getEmbeddingWeightMatrix(embedding_vectors, tokenizer.word_index)\n", 377 | "print(embedding_weight_matrix.shape)" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 7, 383 | "id": "e95c5bc7", 384 | "metadata": { 385 | "execution": { 386 | "iopub.execute_input": "2021-11-27T17:37:41.733535Z", 387 | "iopub.status.busy": "2021-11-27T17:37:41.732702Z", 388 | "iopub.status.idle": "2021-11-27T17:37:45.831178Z", 389 | "shell.execute_reply": "2021-11-27T17:37:45.831708Z", 390 | 
"shell.execute_reply.started": "2021-11-27T17:23:23.346592Z" 391 | }, 392 | "papermill": { 393 | "duration": 5.091622, 394 | "end_time": "2021-11-27T17:37:45.831935", 395 | "exception": false, 396 | "start_time": "2021-11-27T17:37:40.740313", 397 | "status": "completed" 398 | }, 399 | "tags": [] 400 | }, 401 | "outputs": [ 402 | { 403 | "name": "stderr", 404 | "output_type": "stream", 405 | "text": [ 406 | "2021-11-27 17:37:41.835873: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 407 | "2021-11-27 17:37:41.961540: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 408 | "2021-11-27 17:37:41.962835: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 409 | "2021-11-27 17:37:41.965578: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", 410 | "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", 411 | "2021-11-27 17:37:41.966902: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 412 | "2021-11-27 17:37:41.968136: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 413 | "2021-11-27 17:37:41.969191: I 
tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 414 | "2021-11-27 17:37:44.394226: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 415 | "2021-11-27 17:37:44.395501: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 416 | "2021-11-27 17:37:44.396537: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 417 | "2021-11-27 17:37:44.398460: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15403 MB memory: -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0\n", 418 | "2021-11-27 17:37:45.013946: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 143469600 exceeds 10% of free system memory.\n", 419 | "2021-11-27 17:37:45.271914: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 143469600 exceeds 10% of free system memory.\n" 420 | ] 421 | } 422 | ], 423 | "source": [ 424 | "inp1 = tf.keras.Input(shape=(x_train1.shape[1],))\n", 425 | "inp2 = tf.keras.Input(shape=(x_train2.shape[1],))\n", 426 | "\n", 427 | "inner1= tf.keras.layers.Embedding(input_dim=119558, output_dim=300, input_length=128, \n", 428 | " weights=[embedding_weight_matrix], trainable=False)(inp1)\n", 429 | "inner2= tf.keras.layers.Embedding(input_dim=119558, output_dim=300, input_length=128,\n", 430 | " weights=[embedding_weight_matrix], trainable=False)(inp2)\n", 431 | "\n", 432 | "inner = 
tf.keras.layers.concatenate([inner1+inner2, inner1-inner2, tf.math.multiply(inner1, inner2)], axis=-1)\n", 433 | "\n", 434 | "out, h, c = tf.keras.layers.LSTM(200, return_sequences=False, return_state=True)(inner)\n", 435 | "\n", 436 | "output = tf.keras.layers.Dense(2, activation='softmax')(c)\n", 437 | "\n", 438 | "model = tf.keras.Model(inputs = [inp1, inp2], outputs = output)" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": 8, 444 | "id": "3880b605", 445 | "metadata": { 446 | "execution": { 447 | "iopub.execute_input": "2021-11-27T17:37:48.855149Z", 448 | "iopub.status.busy": "2021-11-27T17:37:48.853991Z", 449 | "iopub.status.idle": "2021-11-27T17:37:48.878756Z", 450 | "shell.execute_reply": "2021-11-27T17:37:48.878201Z", 451 | "shell.execute_reply.started": "2021-11-27T17:23:26.835969Z" 452 | }, 453 | "papermill": { 454 | "duration": 2.054917, 455 | "end_time": "2021-11-27T17:37:48.878932", 456 | "exception": false, 457 | "start_time": "2021-11-27T17:37:46.824015", 458 | "status": "completed" 459 | }, 460 | "tags": [] 461 | }, 462 | "outputs": [ 463 | { 464 | "name": "stdout", 465 | "output_type": "stream", 466 | "text": [ 467 | "Model: \"model\"\n", 468 | "__________________________________________________________________________________________________\n", 469 | "Layer (type) Output Shape Param # Connected to \n", 470 | "==================================================================================================\n", 471 | "input_1 (InputLayer) [(None, 128)] 0 \n", 472 | "__________________________________________________________________________________________________\n", 473 | "input_2 (InputLayer) [(None, 128)] 0 \n", 474 | "__________________________________________________________________________________________________\n", 475 | "embedding (Embedding) (None, 128, 300) 35867400 input_1[0][0] \n", 476 | "__________________________________________________________________________________________________\n", 477 | 
"embedding_1 (Embedding) (None, 128, 300) 35867400 input_2[0][0] \n", 478 | "__________________________________________________________________________________________________\n", 479 | "tf.__operators__.add (TFOpLambd (None, 128, 300) 0 embedding[0][0] \n", 480 | " embedding_1[0][0] \n", 481 | "__________________________________________________________________________________________________\n", 482 | "tf.math.subtract (TFOpLambda) (None, 128, 300) 0 embedding[0][0] \n", 483 | " embedding_1[0][0] \n", 484 | "__________________________________________________________________________________________________\n", 485 | "tf.math.multiply (TFOpLambda) (None, 128, 300) 0 embedding[0][0] \n", 486 | " embedding_1[0][0] \n", 487 | "__________________________________________________________________________________________________\n", 488 | "concatenate (Concatenate) (None, 128, 900) 0 tf.__operators__.add[0][0] \n", 489 | " tf.math.subtract[0][0] \n", 490 | " tf.math.multiply[0][0] \n", 491 | "__________________________________________________________________________________________________\n", 492 | "lstm (LSTM) [(None, 200), (None, 880800 concatenate[0][0] \n", 493 | "__________________________________________________________________________________________________\n", 494 | "dense (Dense) (None, 2) 402 lstm[0][2] \n", 495 | "==================================================================================================\n", 496 | "Total params: 72,616,002\n", 497 | "Trainable params: 881,202\n", 498 | "Non-trainable params: 71,734,800\n", 499 | "__________________________________________________________________________________________________\n" 500 | ] 501 | } 502 | ], 503 | "source": [ 504 | "model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics=['accuracy'])\n", 505 | "model.summary()" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": 9, 511 | "id": "1b7808b5", 512 | "metadata": { 513 | "execution": { 514 | 
"iopub.execute_input": "2021-11-27T17:37:50.886838Z", 515 | "iopub.status.busy": "2021-11-27T17:37:50.885717Z", 516 | "iopub.status.idle": "2021-11-27T17:46:15.153633Z", 517 | "shell.execute_reply": "2021-11-27T17:46:15.154306Z", 518 | "shell.execute_reply.started": "2021-11-27T17:23:26.858668Z" 519 | }, 520 | "papermill": { 521 | "duration": 505.288706, 522 | "end_time": "2021-11-27T17:46:15.154505", 523 | "exception": false, 524 | "start_time": "2021-11-27T17:37:49.865799", 525 | "status": "completed" 526 | }, 527 | "tags": [] 528 | }, 529 | "outputs": [ 530 | { 531 | "name": "stderr", 532 | "output_type": "stream", 533 | "text": [ 534 | "2021-11-27 17:37:50.888003: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 144897536 exceeds 10% of free system memory.\n", 535 | "2021-11-27 17:37:51.004600: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 144897536 exceeds 10% of free system memory.\n", 536 | "2021-11-27 17:37:51.188913: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)\n" 537 | ] 538 | }, 539 | { 540 | "name": "stdout", 541 | "output_type": "stream", 542 | "text": [ 543 | "Epoch 1/3\n" 544 | ] 545 | }, 546 | { 547 | "name": "stderr", 548 | "output_type": "stream", 549 | "text": [ 550 | "2021-11-27 17:37:54.166583: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005\n" 551 | ] 552 | }, 553 | { 554 | "name": "stdout", 555 | "output_type": "stream", 556 | "text": [ 557 | "8841/8844 [============================>.] 
- ETA: 0s - loss: 0.4669 - accuracy: 0.7735" 558 | ] 559 | }, 560 | { 561 | "name": "stderr", 562 | "output_type": "stream", 563 | "text": [ 564 | "2021-11-27 17:40:06.040932: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 41399296 exceeds 10% of free system memory.\n" 565 | ] 566 | }, 567 | { 568 | "name": "stdout", 569 | "output_type": "stream", 570 | "text": [ 571 | "8844/8844 [==============================] - 164s 18ms/step - loss: 0.4669 - accuracy: 0.7735 - val_loss: 0.4332 - val_accuracy: 0.7948\n", 572 | "\n", 573 | "Epoch 00001: saving model to weights.best.1.hdf5\n", 574 | "Epoch 2/3\n", 575 | "8844/8844 [==============================] - 171s 19ms/step - loss: 0.3708 - accuracy: 0.8291 - val_loss: 0.4160 - val_accuracy: 0.8042\n", 576 | "\n", 577 | "Epoch 00002: saving model to weights.best.2.hdf5\n", 578 | "Epoch 3/3\n", 579 | "8844/8844 [==============================] - 159s 18ms/step - loss: 0.2868 - accuracy: 0.8736 - val_loss: 0.4415 - val_accuracy: 0.8081\n", 580 | "\n", 581 | "Epoch 00003: saving model to weights.best.3.hdf5\n" 582 | ] 583 | } 584 | ], 585 | "source": [ 586 | "checkpoint_filepath = 'weights.best.{epoch:01d}.hdf5'\n", 587 | "model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,\n", 588 | " verbose = 1, \n", 589 | " monitor = 'val_loss',\n", 590 | " save_best_only = False)\n", 591 | "\n", 592 | "history = model.fit((x_train1, x_train2), y_train,\n", 593 | " batch_size = 32,\n", 594 | " validation_data = ((x_val1, x_val2), y_val),\n", 595 | " validation_batch_size = 16,\n", 596 | " epochs=3,\n", 597 | " callbacks=[model_checkpoint_callback], \n", 598 | " verbose=1)" 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": 12, 604 | "id": "8863f669", 605 | "metadata": { 606 | "execution": { 607 | "iopub.execute_input": "2021-11-27T17:46:36.740103Z", 608 | "iopub.status.busy": "2021-11-27T17:46:36.738488Z", 609 | "iopub.status.idle": 
"2021-11-27T17:47:58.715697Z", 610 | "shell.execute_reply": "2021-11-27T17:47:58.715075Z", 611 | "shell.execute_reply.started": "2021-11-27T17:29:15.189693Z" 612 | }, 613 | "papermill": { 614 | "duration": 86.274629, 615 | "end_time": "2021-11-27T17:47:58.715889", 616 | "exception": false, 617 | "start_time": "2021-11-27T17:46:32.441260", 618 | "status": "completed" 619 | }, 620 | "tags": [] 621 | }, 622 | "outputs": [ 623 | { 624 | "name": "stdout", 625 | "output_type": "stream", 626 | "text": [ 627 | "10108/10108 [==============================] - 59s 6ms/step - loss: 0.4340 - accuracy: 0.8087\n", 628 | "loss on test data is 0.43401026725769043\n", 629 | "accuracy on test data is 0.80872642993927\n" 630 | ] 631 | } 632 | ], 633 | "source": [ 634 | "loss, accuracy = model.evaluate((x_test1, x_test2), y_test, batch_size=4, verbose=1)\n", 635 | "\n", 636 | "print('loss on test data is', loss)\n", 637 | "print('accuracy on test data is', accuracy)" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": 13, 643 | "id": "da98475b", 644 | "metadata": { 645 | "execution": { 646 | "iopub.execute_input": "2021-11-27T17:48:06.451346Z", 647 | "iopub.status.busy": "2021-11-27T17:48:06.450104Z", 648 | "iopub.status.idle": "2021-11-27T17:48:17.164859Z", 649 | "shell.execute_reply": "2021-11-27T17:48:17.163722Z", 650 | "shell.execute_reply.started": "2021-11-27T17:30:03.625800Z" 651 | }, 652 | "papermill": { 653 | "duration": 14.724145, 654 | "end_time": "2021-11-27T17:48:17.165031", 655 | "exception": false, 656 | "start_time": "2021-11-27T17:48:02.440886", 657 | "status": "completed" 658 | }, 659 | "tags": [] 660 | }, 661 | "outputs": [ 662 | { 663 | "name": "stdout", 664 | "output_type": "stream", 665 | "text": [ 666 | "f1_score on test dataset is 0.7398311072233624\n" 667 | ] 668 | } 669 | ], 670 | "source": [ 671 | "pred = model.predict((x_test1, x_test2))\n", 672 | "\n", 673 | "print('f1_score on test dataset is', f1_score(np.argmax(pred, axis=1), 
np.argmax(y_test, axis=1)))" 674 | ] 675 | }, 676 | { 677 | "cell_type": "code", 678 | "execution_count": null, 679 | "id": "90b4b575", 680 | "metadata": { 681 | "papermill": { 682 | "duration": 3.647781, 683 | "end_time": "2021-11-27T17:48:24.449422", 684 | "exception": false, 685 | "start_time": "2021-11-27T17:48:20.801641", 686 | "status": "completed" 687 | }, 688 | "tags": [] 689 | }, 690 | "outputs": [], 691 | "source": [] 692 | } 693 | ], 694 | "metadata": { 695 | "kernelspec": { 696 | "display_name": "Python 3 (ipykernel)", 697 | "language": "python", 698 | "name": "python3" 699 | }, 700 | "language_info": { 701 | "codemirror_mode": { 702 | "name": "ipython", 703 | "version": 3 704 | }, 705 | "file_extension": ".py", 706 | "mimetype": "text/x-python", 707 | "name": "python", 708 | "nbconvert_exporter": "python", 709 | "pygments_lexer": "ipython3", 710 | "version": "3.8.10" 711 | }, 712 | "papermill": { 713 | "default_parameters": {}, 714 | "duration": 1006.545475, 715 | "end_time": "2021-11-27T17:48:31.257168", 716 | "environment_variables": {}, 717 | "exception": null, 718 | "input_path": "__notebook__.ipynb", 719 | "output_path": "__notebook__.ipynb", 720 | "parameters": {}, 721 | "start_time": "2021-11-27T17:31:44.711693", 722 | "version": "2.3.3" 723 | } 724 | }, 725 | "nbformat": 4, 726 | "nbformat_minor": 5 727 | } 728 | -------------------------------------------------------------------------------- /CBOW MLP Sum Diff Product of Embeddings.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "0ffba8bb", 7 | "metadata": { 8 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 9 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", 10 | "execution": { 11 | "iopub.execute_input": "2021-11-27T13:27:02.120873Z", 12 | "iopub.status.busy": "2021-11-27T13:27:02.120117Z", 13 | "iopub.status.idle": "2021-11-27T13:27:06.777897Z", 14 | 
"shell.execute_reply": "2021-11-27T13:27:06.777261Z", 15 | "shell.execute_reply.started": "2021-11-27T13:25:33.869721Z" 16 | }, 17 | "papermill": { 18 | "duration": 4.681193, 19 | "end_time": "2021-11-27T13:27:06.778056", 20 | "exception": false, 21 | "start_time": "2021-11-27T13:27:02.096863", 22 | "status": "completed" 23 | }, 24 | "tags": [] 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "import pandas as pd\n", 29 | "import numpy as np\n", 30 | "from tqdm import tqdm\n", 31 | "import tensorflow as tf\n", 32 | "from gensim.models import KeyedVectors\n", 33 | "import gensim\n", 34 | "import re\n", 35 | "from sklearn.metrics import f1_score" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "id": "2d9327ba", 42 | "metadata": { 43 | "execution": { 44 | "iopub.execute_input": "2021-11-27T13:27:06.803571Z", 45 | "iopub.status.busy": "2021-11-27T13:27:06.802996Z", 46 | "iopub.status.idle": "2021-11-27T13:27:07.887920Z", 47 | "shell.execute_reply": "2021-11-27T13:27:07.886982Z", 48 | "shell.execute_reply.started": "2021-11-27T13:10:42.402545Z" 49 | }, 50 | "papermill": { 51 | "duration": 1.099041, 52 | "end_time": "2021-11-27T13:27:07.888060", 53 | "exception": false, 54 | "start_time": "2021-11-27T13:27:06.789019", 55 | "status": "completed" 56 | }, 57 | "tags": [] 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "train = pd.read_csv('../input/quora-ques-pair/test_data.csv/test_data.csv').fillna('')\n", 62 | "val = pd.read_csv('../input/quora-ques-pair/val_data.csv/val_data.csv').fillna('')\n", 63 | "test = pd.read_csv('../input/quora-ques-pair/test_data.csv/test_data.csv').fillna('')" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "id": "c0bfef54", 70 | "metadata": { 71 | "execution": { 72 | "iopub.execute_input": "2021-11-27T13:27:07.912052Z", 73 | "iopub.status.busy": "2021-11-27T13:27:07.911329Z", 74 | "iopub.status.idle": "2021-11-27T13:27:07.914031Z", 75 | "shell.execute_reply": 
"2021-11-27T13:27:07.913405Z", 76 | "shell.execute_reply.started": "2021-11-27T13:10:43.715098Z" 77 | }, 78 | "papermill": { 79 | "duration": 0.015828, 80 | "end_time": "2021-11-27T13:27:07.914162", 81 | "exception": false, 82 | "start_time": "2021-11-27T13:27:07.898334", 83 | "status": "completed" 84 | }, 85 | "tags": [] 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "word2vec_file = '../input/d/sandreds/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin'" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 4, 95 | "id": "f318327c", 96 | "metadata": { 97 | "execution": { 98 | "iopub.execute_input": "2021-11-27T13:27:07.942268Z", 99 | "iopub.status.busy": "2021-11-27T13:27:07.941457Z", 100 | "iopub.status.idle": "2021-11-27T13:27:07.953929Z", 101 | "shell.execute_reply": "2021-11-27T13:27:07.954439Z", 102 | "shell.execute_reply.started": "2021-11-27T13:10:43.722268Z" 103 | }, 104 | "papermill": { 105 | "duration": 0.030715, 106 | "end_time": "2021-11-27T13:27:07.954598", 107 | "exception": false, 108 | "start_time": "2021-11-27T13:27:07.923883", 109 | "status": "completed" 110 | }, 111 | "tags": [] 112 | }, 113 | "outputs": [ 114 | { 115 | "data": { 116 | "text/html": [ 117 | "

\n", 118 | "\n", 131 | "\n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | "

	id	qid1	qid2	question1	question2	is_duplicate	question1_preprocessed	question2_preprocessed
0	204673	93885	307635	If there is a God, where is He!	Why is god a \"He\"?	0	if there is a god , where is he !	why is god a `` he '' ?
1	17716	2093	15628	Do you believe that everything happens for a r...	Does everything happen for a reason?	1	do you believe that everything happens for a r...	does everything happen for a reason ?
2	291767	352623	413255	Will there always be web hosting that will sup...	Will there always be web hosting that supports...	1	will there always be web hosting that will sup...	will there always be web hosting that supports...
3	203758	59824	67971	What is the proof of Indian Army's surgical st...	Has India provided any proof of the surgical s...	1	what is the proof of indian army 's surgical s...	has india provided any proof of the surgical s...
4	41747	75326	75327	What do Indian Muslims think of Modi?	What do Indian Muslim think about PM Narendra ...	1	what do indian muslims think of modi ?	what do indian muslim think about pm narendra ...

\n", 203 | "

" 204 | ], 205 | "text/plain": [ 206 | " id qid1 qid2 question1 \\\n", 207 | "0 204673 93885 307635 If there is a God, where is He! \n", 208 | "1 17716 2093 15628 Do you believe that everything happens for a r... \n", 209 | "2 291767 352623 413255 Will there always be web hosting that will sup... \n", 210 | "3 203758 59824 67971 What is the proof of Indian Army's surgical st... \n", 211 | "4 41747 75326 75327 What do Indian Muslims think of Modi? \n", 212 | "\n", 213 | " question2 is_duplicate \\\n", 214 | "0 Why is god a \"He\"? 0 \n", 215 | "1 Does everything happen for a reason? 1 \n", 216 | "2 Will there always be web hosting that supports... 1 \n", 217 | "3 Has India provided any proof of the surgical s... 1 \n", 218 | "4 What do Indian Muslim think about PM Narendra ... 1 \n", 219 | "\n", 220 | " question1_preprocessed \\\n", 221 | "0 if there is a god , where is he ! \n", 222 | "1 do you believe that everything happens for a r... \n", 223 | "2 will there always be web hosting that will sup... \n", 224 | "3 what is the proof of indian army 's surgical s... \n", 225 | "4 what do indian muslims think of modi ? \n", 226 | "\n", 227 | " question2_preprocessed \n", 228 | "0 why is god a `` he '' ? \n", 229 | "1 does everything happen for a reason ? \n", 230 | "2 will there always be web hosting that supports... \n", 231 | "3 has india provided any proof of the surgical s... \n", 232 | "4 what do indian muslim think about pm narendra ... 
" 233 | ] 234 | }, 235 | "execution_count": 4, 236 | "metadata": {}, 237 | "output_type": "execute_result" 238 | } 239 | ], 240 | "source": [ 241 | "train.head()" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 5, 247 | "id": "fdd3b917", 248 | "metadata": { 249 | "execution": { 250 | "iopub.execute_input": "2021-11-27T13:27:07.982238Z", 251 | "iopub.status.busy": "2021-11-27T13:27:07.981577Z", 252 | "iopub.status.idle": "2021-11-27T13:28:12.395623Z", 253 | "shell.execute_reply": "2021-11-27T13:28:12.396099Z", 254 | "shell.execute_reply.started": "2021-11-27T13:10:43.746605Z" 255 | }, 256 | "papermill": { 257 | "duration": 64.431482, 258 | "end_time": "2021-11-27T13:28:12.396269", 259 | "exception": false, 260 | "start_time": "2021-11-27T13:27:07.964787", 261 | "status": "completed" 262 | }, 263 | "tags": [] 264 | }, 265 | "outputs": [], 266 | "source": [ 267 | "def buildVocabulary(reviews):\n", 268 | " tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False, split=' ')\n", 269 | " tokenizer.fit_on_texts(reviews)\n", 270 | " return tokenizer\n", 271 | "\n", 272 | "def getSequences(reviews, tokenizer, seq_maxlen):\n", 273 | " reviews_seq = tokenizer.texts_to_sequences(reviews)\n", 274 | " return np.array(tf.keras.preprocessing.sequence.pad_sequences(reviews_seq, maxlen=seq_maxlen))\n", 275 | "\n", 276 | "word2vec_model = KeyedVectors.load_word2vec_format(word2vec_file, binary = True)\n", 277 | "\n", 278 | "def getEmbeddingWeightMatrix(word2idx): \n", 279 | " embedding_matrix = np.zeros((len(word2idx)+1, 300))\n", 280 | " for word, i in tqdm(word2idx.items()):\n", 281 | " \n", 282 | " embedding_vector = word2vec_model[word] if word in word2vec_model else np.random.rand(1,300)\n", 283 | " if embedding_vector is not None:\n", 284 | " embedding_matrix[i] = embedding_vector\n", 285 | " return embedding_matrix" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 6, 291 | "id": "47a83b4b", 292 | "metadata": { 293 | 
"execution": { 294 | "iopub.execute_input": "2021-11-27T13:28:12.451991Z", 295 | "iopub.status.busy": "2021-11-27T13:28:12.438857Z", 296 | "iopub.status.idle": "2021-11-27T13:28:26.338075Z", 297 | "shell.execute_reply": "2021-11-27T13:28:26.337092Z", 298 | "shell.execute_reply.started": "2021-11-27T13:11:45.518534Z" 299 | }, 300 | "papermill": { 301 | "duration": 13.930379, 302 | "end_time": "2021-11-27T13:28:26.338242", 303 | "exception": false, 304 | "start_time": "2021-11-27T13:28:12.407863", 305 | "status": "completed" 306 | }, 307 | "tags": [] 308 | }, 309 | "outputs": [ 310 | { 311 | "name": "stdout", 312 | "output_type": "stream", 313 | "text": [ 314 | "67043\n" 315 | ] 316 | } 317 | ], 318 | "source": [ 319 | "tokenizer = buildVocabulary(train['question1'].tolist()+train['question2'].tolist()+val['question1'].tolist()+val['question2'].tolist()+test['question1'].tolist()+test['question2'].tolist())\n", 320 | "vocab_size = len(tokenizer.word_index) + 1\n", 321 | "print(vocab_size)\n", 322 | "\n", 323 | "x_train1 = getSequences(train['question1'], tokenizer, 200)\n", 324 | "x_train2 = getSequences(train['question2'], tokenizer, 200)\n", 325 | "y_train = tf.keras.utils.to_categorical(train['is_duplicate'])\n", 326 | "\n", 327 | "x_val1 = getSequences(val['question1'], tokenizer, 200)\n", 328 | "x_val2 = getSequences(val['question2'], tokenizer, 200)\n", 329 | "y_val = tf.keras.utils.to_categorical(val['is_duplicate'])\n", 330 | "\n", 331 | "x_test1 = getSequences(test['question1'], tokenizer, 200)\n", 332 | "x_test2 = getSequences(test['question2'], tokenizer, 200)\n", 333 | "y_test = tf.keras.utils.to_categorical(test['is_duplicate'])" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 7, 339 | "id": "064a2f92", 340 | "metadata": { 341 | "execution": { 342 | "iopub.execute_input": "2021-11-27T13:28:26.364903Z", 343 | "iopub.status.busy": "2021-11-27T13:28:26.364312Z", 344 | "iopub.status.idle": "2021-11-27T13:28:26.845268Z", 345 | 
"shell.execute_reply": "2021-11-27T13:28:26.843711Z", 346 | "shell.execute_reply.started": "2021-11-27T13:12:14.356043Z" 347 | }, 348 | "papermill": { 349 | "duration": 0.496339, 350 | "end_time": "2021-11-27T13:28:26.845392", 351 | "exception": false, 352 | "start_time": "2021-11-27T13:28:26.349053", 353 | "status": "completed" 354 | }, 355 | "tags": [] 356 | }, 357 | "outputs": [ 358 | { 359 | "name": "stderr", 360 | "output_type": "stream", 361 | "text": [ 362 | "100%|██████████| 67042/67042 [00:00<00:00, 142685.61it/s]" 363 | ] 364 | }, 365 | { 366 | "name": "stdout", 367 | "output_type": "stream", 368 | "text": [ 369 | "(67043, 300)\n" 370 | ] 371 | }, 372 | { 373 | "name": "stderr", 374 | "output_type": "stream", 375 | "text": [ 376 | "\n" 377 | ] 378 | } 379 | ], 380 | "source": [ 381 | "#embedding_vectors = loadGloveWordEmbeddings()\n", 382 | "#print(len(embedding_vectors))\n", 383 | "\n", 384 | "embedding_weight_matrix = getEmbeddingWeightMatrix(tokenizer.word_index)\n", 385 | "print(embedding_weight_matrix.shape)" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 8, 391 | "id": "13dd3e2c", 392 | "metadata": { 393 | "execution": { 394 | "iopub.execute_input": "2021-11-27T13:28:26.881262Z", 395 | "iopub.status.busy": "2021-11-27T13:28:26.880656Z", 396 | "iopub.status.idle": "2021-11-27T13:28:29.585007Z", 397 | "shell.execute_reply": "2021-11-27T13:28:29.585470Z", 398 | "shell.execute_reply.started": "2021-11-27T13:21:50.654150Z" 399 | }, 400 | "papermill": { 401 | "duration": 2.727389, 402 | "end_time": "2021-11-27T13:28:29.585638", 403 | "exception": false, 404 | "start_time": "2021-11-27T13:28:26.858249", 405 | "status": "completed" 406 | }, 407 | "tags": [] 408 | }, 409 | "outputs": [ 410 | { 411 | "name": "stderr", 412 | "output_type": "stream", 413 | "text": [ 414 | "2021-11-27 13:28:26.961758: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there 
must be at least one NUMA node, so returning NUMA node zero\n", 415 | "2021-11-27 13:28:27.082051: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 416 | "2021-11-27 13:28:27.082785: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 417 | "2021-11-27 13:28:27.084455: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA\n", 418 | "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", 419 | "2021-11-27 13:28:27.085735: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 420 | "2021-11-27 13:28:27.086434: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 421 | "2021-11-27 13:28:27.087162: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 422 | "2021-11-27 13:28:28.950796: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 423 | "2021-11-27 13:28:28.951646: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at 
least one NUMA node, so returning NUMA node zero\n", 424 | "2021-11-27 13:28:28.952433: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 425 | "2021-11-27 13:28:28.953120: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15403 MB memory: -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0\n" 426 | ] 427 | } 428 | ], 429 | "source": [ 430 | "inp1 = tf.keras.Input(shape=(x_train1.shape[1],))\n", 431 | "inp2 = tf.keras.Input(shape=(x_train2.shape[1],))\n", 432 | "\n", 433 | "inner1= tf.keras.layers.Embedding(input_dim=67043, output_dim=300, input_length=200, \n", 434 | " weights=[embedding_weight_matrix], trainable=True)(inp1)\n", 435 | "inner2= tf.keras.layers.Embedding(input_dim=67043, output_dim=300, input_length=200,\n", 436 | " weights=[embedding_weight_matrix], trainable=True)(inp2)\n", 437 | " \n", 438 | "inner = tf.keras.layers.concatenate([inner1+inner2, inner1-inner2, tf.math.multiply(inner1, inner2)], axis=-1)\n", 439 | "\n", 440 | "inner = tf.keras.backend.sum(inner, axis=1, keepdims=False)\n", 441 | "#tf.keras.regularizers.l2(l2=0.01)\n", 442 | "inner = tf.keras.layers.Dense(300, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01))(inner)\n", 443 | "inner = tf.keras.layers.Dropout(0.1)(inner)\n", 444 | "inner = tf.keras.layers.Dense(200, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01))(inner)\n", 445 | "inner = tf.keras.layers.Dropout(0.1)(inner)\n", 446 | "inner = tf.keras.layers.Dense(100, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01))(inner)\n", 447 | "inner = tf.keras.layers.Dropout(0.1)(inner)\n", 448 | "output = tf.keras.layers.Dense(2, activation='softmax')(inner)\n", 449 | "\n", 450 | "model = 
tf.keras.Model(inputs = [inp1, inp2], outputs = output)" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": 9, 456 | "id": "2a0fb964", 457 | "metadata": { 458 | "execution": { 459 | "iopub.execute_input": "2021-11-27T13:28:29.619321Z", 460 | "iopub.status.busy": "2021-11-27T13:28:29.618811Z", 461 | "iopub.status.idle": "2021-11-27T13:28:29.634853Z", 462 | "shell.execute_reply": "2021-11-27T13:28:29.635434Z", 463 | "shell.execute_reply.started": "2021-11-27T13:21:51.909270Z" 464 | }, 465 | "papermill": { 466 | "duration": 0.036653, 467 | "end_time": "2021-11-27T13:28:29.635595", 468 | "exception": false, 469 | "start_time": "2021-11-27T13:28:29.598942", 470 | "status": "completed" 471 | }, 472 | "tags": [] 473 | }, 474 | "outputs": [ 475 | { 476 | "name": "stdout", 477 | "output_type": "stream", 478 | "text": [ 479 | "Model: \"model\"\n", 480 | "__________________________________________________________________________________________________\n", 481 | "Layer (type) Output Shape Param # Connected to \n", 482 | "==================================================================================================\n", 483 | "input_1 (InputLayer) [(None, 200)] 0 \n", 484 | "__________________________________________________________________________________________________\n", 485 | "input_2 (InputLayer) [(None, 200)] 0 \n", 486 | "__________________________________________________________________________________________________\n", 487 | "embedding (Embedding) (None, 200, 300) 20112900 input_1[0][0] \n", 488 | "__________________________________________________________________________________________________\n", 489 | "embedding_1 (Embedding) (None, 200, 300) 20112900 input_2[0][0] \n", 490 | "__________________________________________________________________________________________________\n", 491 | "tf.__operators__.add (TFOpLambd (None, 200, 300) 0 embedding[0][0] \n", 492 | " embedding_1[0][0] \n", 493 | 
"__________________________________________________________________________________________________\n", 494 | "tf.math.subtract (TFOpLambda) (None, 200, 300) 0 embedding[0][0] \n", 495 | " embedding_1[0][0] \n", 496 | "__________________________________________________________________________________________________\n", 497 | "tf.math.multiply (TFOpLambda) (None, 200, 300) 0 embedding[0][0] \n", 498 | " embedding_1[0][0] \n", 499 | "__________________________________________________________________________________________________\n", 500 | "concatenate (Concatenate) (None, 200, 900) 0 tf.__operators__.add[0][0] \n", 501 | " tf.math.subtract[0][0] \n", 502 | " tf.math.multiply[0][0] \n", 503 | "__________________________________________________________________________________________________\n", 504 | "tf.math.reduce_sum (TFOpLambda) (None, 900) 0 concatenate[0][0] \n", 505 | "__________________________________________________________________________________________________\n", 506 | "dense (Dense) (None, 300) 270300 tf.math.reduce_sum[0][0] \n", 507 | "__________________________________________________________________________________________________\n", 508 | "dropout (Dropout) (None, 300) 0 dense[0][0] \n", 509 | "__________________________________________________________________________________________________\n", 510 | "dense_1 (Dense) (None, 200) 60200 dropout[0][0] \n", 511 | "__________________________________________________________________________________________________\n", 512 | "dropout_1 (Dropout) (None, 200) 0 dense_1[0][0] \n", 513 | "__________________________________________________________________________________________________\n", 514 | "dense_2 (Dense) (None, 100) 20100 dropout_1[0][0] \n", 515 | "__________________________________________________________________________________________________\n", 516 | "dropout_2 (Dropout) (None, 100) 0 dense_2[0][0] \n", 517 | 
"__________________________________________________________________________________________________\n", 518 | "dense_3 (Dense) (None, 2) 202 dropout_2[0][0] \n", 519 | "==================================================================================================\n", 520 | "Total params: 40,576,602\n", 521 | "Trainable params: 40,576,602\n", 522 | "Non-trainable params: 0\n", 523 | "__________________________________________________________________________________________________\n" 524 | ] 525 | } 526 | ], 527 | "source": [ 528 | "model.compile(optimizer = \"adam\", loss = 'categorical_crossentropy', metrics=['accuracy'])\n", 529 | "model.summary()" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 10, 535 | "id": "2769fd24", 536 | "metadata": { 537 | "execution": { 538 | "iopub.execute_input": "2021-11-27T13:28:29.669190Z", 539 | "iopub.status.busy": "2021-11-27T13:28:29.668319Z", 540 | "iopub.status.idle": "2021-11-27T13:29:27.302188Z", 541 | "shell.execute_reply": "2021-11-27T13:29:27.301703Z", 542 | "shell.execute_reply.started": "2021-11-27T13:21:52.934089Z" 543 | }, 544 | "papermill": { 545 | "duration": 57.652535, 546 | "end_time": "2021-11-27T13:29:27.302336", 547 | "exception": false, 548 | "start_time": "2021-11-27T13:28:29.649801", 549 | "status": "completed" 550 | }, 551 | "tags": [] 552 | }, 553 | "outputs": [ 554 | { 555 | "name": "stderr", 556 | "output_type": "stream", 557 | "text": [ 558 | "2021-11-27 13:28:29.784743: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)\n" 559 | ] 560 | }, 561 | { 562 | "name": "stdout", 563 | "output_type": "stream", 564 | "text": [ 565 | "Epoch 1/3\n", 566 | "632/632 [==============================] - 18s 26ms/step - loss: 1.5922 - accuracy: 0.6810 - val_loss: 0.6328 - val_accuracy: 0.7140\n", 567 | "Epoch 2/3\n", 568 | "632/632 [==============================] - 16s 25ms/step - loss: 0.5415 - accuracy: 
0.7529 - val_loss: 0.5487 - val_accuracy: 0.7364\n", 569 | "Epoch 3/3\n", 570 | "632/632 [==============================] - 21s 33ms/step - loss: 0.4592 - accuracy: 0.7990 - val_loss: 0.5759 - val_accuracy: 0.7274\n" 571 | ] 572 | } 573 | ], 574 | "source": [ 575 | "save_weights = tf.keras.callbacks.ModelCheckpoint('cbow_mlp.h5', monitor='val_loss', save_best_only=True)\n", 576 | "checkpoint_filepath = 'weights.best.{epoch:01d}.hdf5'\n", 577 | "#model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,\n", 578 | "#verbose = 1,\n", 579 | "#monitor = 'val_loss',\n", 580 | "#save_best_only = False)\n", 581 | "history = model.fit((x_train1, x_train2), y_train,\n", 582 | " batch_size = 64,\n", 583 | " validation_data = ((x_val1, x_val2), y_val),\n", 584 | " validation_batch_size = 32,\n", 585 | " epochs=3, \n", 586 | " callbacks=[save_weights], \n", 587 | " verbose=1)" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": 11, 593 | "id": "ab5c9bea", 594 | "metadata": { 595 | "execution": { 596 | "iopub.execute_input": "2021-11-27T13:29:27.609229Z", 597 | "iopub.status.busy": "2021-11-27T13:29:27.608248Z", 598 | "iopub.status.idle": "2021-11-27T13:29:51.444179Z", 599 | "shell.execute_reply": "2021-11-27T13:29:51.445132Z", 600 | "shell.execute_reply.started": "2021-11-27T13:23:37.976281Z" 601 | }, 602 | "papermill": { 603 | "duration": 23.995934, 604 | "end_time": "2021-11-27T13:29:51.445323", 605 | "exception": false, 606 | "start_time": "2021-11-27T13:29:27.449389", 607 | "status": "completed" 608 | }, 609 | "tags": [] 610 | }, 611 | "outputs": [ 612 | { 613 | "name": "stdout", 614 | "output_type": "stream", 615 | "text": [ 616 | "10108/10108 [==============================] - 24s 2ms/step - loss: 0.3645 - accuracy: 0.8614\n", 617 | "loss on test data is 0.36454567313194275\n", 618 | "accuracy on test data is 0.8613866567611694\n" 619 | ] 620 | } 621 | ], 622 | "source": [ 623 | "loss, accuracy = 
model.evaluate((x_test1, x_test2), y_test, batch_size=4, verbose=1)\n", 624 | "\n", 625 | "print('loss on test data is', loss)\n", 626 | "print('accuracy on test data is', accuracy)" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": 12, 632 | "id": "072e0c3f", 633 | "metadata": { 634 | "execution": { 635 | "iopub.execute_input": "2021-11-27T13:29:52.033949Z", 636 | "iopub.status.busy": "2021-11-27T13:29:52.033003Z", 637 | "iopub.status.idle": "2021-11-27T13:29:54.764953Z", 638 | "shell.execute_reply": "2021-11-27T13:29:54.765365Z", 639 | "shell.execute_reply.started": "2021-11-27T13:25:52.168723Z" 640 | }, 641 | "papermill": { 642 | "duration": 3.037087, 643 | "end_time": "2021-11-27T13:29:54.765519", 644 | "exception": false, 645 | "start_time": "2021-11-27T13:29:51.728432", 646 | "status": "completed" 647 | }, 648 | "tags": [] 649 | }, 650 | "outputs": [ 651 | { 652 | "name": "stdout", 653 | "output_type": "stream", 654 | "text": [ 655 | "F1_score on test is 0.8275480059084195\n" 656 | ] 657 | } 658 | ], 659 | "source": [ 660 | "pred = model.predict((x_test1, x_test2))\n", 661 | "print('F1_score on test is', f1_score(np.argmax(pred, axis=1), np.argmax(y_test, axis=1)))\n" 662 | ] 663 | }, 664 | { 665 | "cell_type": "code", 666 | "execution_count": null, 667 | "id": "0b1a3522", 668 | "metadata": { 669 | "papermill": { 670 | "duration": 0.264673, 671 | "end_time": "2021-11-27T13:29:55.301567", 672 | "exception": false, 673 | "start_time": "2021-11-27T13:29:55.036894", 674 | "status": "completed" 675 | }, 676 | "tags": [] 677 | }, 678 | "outputs": [], 679 | "source": [] 680 | } 681 | ], 682 | "metadata": { 683 | "kernelspec": { 684 | "display_name": "Python 3", 685 | "language": "python", 686 | "name": "python3" 687 | }, 688 | "language_info": { 689 | "codemirror_mode": { 690 | "name": "ipython", 691 | "version": 3 692 | }, 693 | "file_extension": ".py", 694 | "mimetype": "text/x-python", 695 | "name": "python", 696 | "nbconvert_exporter": 
"python", 697 | "pygments_lexer": "ipython3", 698 | "version": "3.7.10" 699 | }, 700 | "papermill": { 701 | "default_parameters": {}, 702 | "duration": 184.393918, 703 | "end_time": "2021-11-27T13:29:59.169536", 704 | "environment_variables": {}, 705 | "exception": null, 706 | "input_path": "__notebook__.ipynb", 707 | "output_path": "__notebook__.ipynb", 708 | "parameters": {}, 709 | "start_time": "2021-11-27T13:26:54.775618", 710 | "version": "2.3.3" 711 | } 712 | }, 713 | "nbformat": 4, 714 | "nbformat_minor": 5 715 | } 716 | -------------------------------------------------------------------------------- /CBOW ML Dropout Regularisation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "7129363e", 7 | "metadata": { 8 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 9 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", 10 | "execution": { 11 | "iopub.execute_input": "2021-11-27T13:39:40.100619Z", 12 | "iopub.status.busy": "2021-11-27T13:39:40.097692Z", 13 | "iopub.status.idle": "2021-11-27T13:39:45.510044Z", 14 | "shell.execute_reply": "2021-11-27T13:39:45.509398Z", 15 | "shell.execute_reply.started": "2021-11-27T13:34:29.669569Z" 16 | }, 17 | "papermill": { 18 | "duration": 5.435369, 19 | "end_time": "2021-11-27T13:39:45.510212", 20 | "exception": false, 21 | "start_time": "2021-11-27T13:39:40.074843", 22 | "status": "completed" 23 | }, 24 | "tags": [] 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "import pandas as pd\n", 29 | "import numpy as np\n", 30 | "from tqdm import tqdm\n", 31 | "import tensorflow as tf\n", 32 | "from gensim.models import KeyedVectors\n", 33 | "import gensim\n", 34 | "import re\n", 35 | "from sklearn.metrics import f1_score\n", 36 | "import matplotlib.pyplot as plt" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "id": "fa981eee", 43 | "metadata": { 44 | "execution": { 
45 | "iopub.execute_input": "2021-11-27T13:39:45.539763Z", 46 | "iopub.status.busy": "2021-11-27T13:39:45.539259Z", 47 | "iopub.status.idle": "2021-11-27T13:39:46.766976Z", 48 | "shell.execute_reply": "2021-11-27T13:39:46.766492Z", 49 | "shell.execute_reply.started": "2021-11-27T13:10:42.402545Z" 50 | }, 51 | "papermill": { 52 | "duration": 1.24347, 53 | "end_time": "2021-11-27T13:39:46.767141", 54 | "exception": false, 55 | "start_time": "2021-11-27T13:39:45.523671", 56 | "status": "completed" 57 | }, 58 | "tags": [] 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "train = pd.read_csv('../input/quora-ques-pair/test_data.csv/test_data.csv').fillna('')\n", 63 | "val = pd.read_csv('../input/quora-ques-pair/val_data.csv/val_data.csv').fillna('')\n", 64 | "test = pd.read_csv('../input/quora-ques-pair/test_data.csv/test_data.csv').fillna('')" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 3, 70 | "id": "18c9ee56", 71 | "metadata": { 72 | "execution": { 73 | "iopub.execute_input": "2021-11-27T13:39:46.793631Z", 74 | "iopub.status.busy": "2021-11-27T13:39:46.792950Z", 75 | "iopub.status.idle": "2021-11-27T13:39:46.795498Z", 76 | "shell.execute_reply": "2021-11-27T13:39:46.795872Z", 77 | "shell.execute_reply.started": "2021-11-27T13:10:43.715098Z" 78 | }, 79 | "papermill": { 80 | "duration": 0.017601, 81 | "end_time": "2021-11-27T13:39:46.795999", 82 | "exception": false, 83 | "start_time": "2021-11-27T13:39:46.778398", 84 | "status": "completed" 85 | }, 86 | "tags": [] 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "word2vec_file = '../input/d/sandreds/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin'" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 4, 96 | "id": "9a62dc09", 97 | "metadata": { 98 | "execution": { 99 | "iopub.execute_input": "2021-11-27T13:39:46.827484Z", 100 | "iopub.status.busy": "2021-11-27T13:39:46.826713Z", 101 | "iopub.status.idle": "2021-11-27T13:39:46.839866Z", 102 | 
"shell.execute_reply": "2021-11-27T13:39:46.840300Z", 103 | "shell.execute_reply.started": "2021-11-27T13:10:43.722268Z" 104 | }, 105 | "papermill": { 106 | "duration": 0.033588, 107 | "end_time": "2021-11-27T13:39:46.840430", 108 | "exception": false, 109 | "start_time": "2021-11-27T13:39:46.806842", 110 | "status": "completed" 111 | }, 112 | "tags": [] 113 | }, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/html": [ 118 | "

	id	qid1	qid2	question1	question2	is_duplicate	question1_preprocessed	question2_preprocessed
0	204673	93885	307635	If there is a God, where is He!	Why is god a \"He\"?	0	if there is a god , where is he !	why is god a `` he '' ?
1	17716	2093	15628	Do you believe that everything happens for a r...	Does everything happen for a reason?	1	do you believe that everything happens for a r...	does everything happen for a reason ?
2	291767	352623	413255	Will there always be web hosting that will sup...	Will there always be web hosting that supports...	1	will there always be web hosting that will sup...	will there always be web hosting that supports...
3	203758	59824	67971	What is the proof of Indian Army's surgical st...	Has India provided any proof of the surgical s...	1	what is the proof of indian army 's surgical s...	has india provided any proof of the surgical s...
4	41747	75326	75327	What do Indian Muslims think of Modi?	What do Indian Muslim think about PM Narendra ...	1	what do indian muslims think of modi ?	what do indian muslim think about pm narendra ...

\n", 204 | "

" 205 | ], 206 | "text/plain": [ 207 | " id qid1 qid2 question1 \\\n", 208 | "0 204673 93885 307635 If there is a God, where is He! \n", 209 | "1 17716 2093 15628 Do you believe that everything happens for a r... \n", 210 | "2 291767 352623 413255 Will there always be web hosting that will sup... \n", 211 | "3 203758 59824 67971 What is the proof of Indian Army's surgical st... \n", 212 | "4 41747 75326 75327 What do Indian Muslims think of Modi? \n", 213 | "\n", 214 | " question2 is_duplicate \\\n", 215 | "0 Why is god a \"He\"? 0 \n", 216 | "1 Does everything happen for a reason? 1 \n", 217 | "2 Will there always be web hosting that supports... 1 \n", 218 | "3 Has India provided any proof of the surgical s... 1 \n", 219 | "4 What do Indian Muslim think about PM Narendra ... 1 \n", 220 | "\n", 221 | " question1_preprocessed \\\n", 222 | "0 if there is a god , where is he ! \n", 223 | "1 do you believe that everything happens for a r... \n", 224 | "2 will there always be web hosting that will sup... \n", 225 | "3 what is the proof of indian army 's surgical s... \n", 226 | "4 what do indian muslims think of modi ? \n", 227 | "\n", 228 | " question2_preprocessed \n", 229 | "0 why is god a `` he '' ? \n", 230 | "1 does everything happen for a reason ? \n", 231 | "2 will there always be web hosting that supports... \n", 232 | "3 has india provided any proof of the surgical s... \n", 233 | "4 what do indian muslim think about pm narendra ... 
" 234 | ] 235 | }, 236 | "execution_count": 4, 237 | "metadata": {}, 238 | "output_type": "execute_result" 239 | } 240 | ], 241 | "source": [ 242 | "train.head()" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 5, 248 | "id": "2e9f6d2a", 249 | "metadata": { 250 | "execution": { 251 | "iopub.execute_input": "2021-11-27T13:39:46.870196Z", 252 | "iopub.status.busy": "2021-11-27T13:39:46.869680Z", 253 | "iopub.status.idle": "2021-11-27T13:40:56.405543Z", 254 | "shell.execute_reply": "2021-11-27T13:40:56.404933Z", 255 | "shell.execute_reply.started": "2021-11-27T13:10:43.746605Z" 256 | }, 257 | "papermill": { 258 | "duration": 69.55394, 259 | "end_time": "2021-11-27T13:40:56.405691", 260 | "exception": false, 261 | "start_time": "2021-11-27T13:39:46.851751", 262 | "status": "completed" 263 | }, 264 | "tags": [] 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "def buildVocabulary(reviews):\n", 269 | " tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False, split=' ')\n", 270 | " tokenizer.fit_on_texts(reviews)\n", 271 | " return tokenizer\n", 272 | "\n", 273 | "def getSequences(reviews, tokenizer, seq_maxlen):\n", 274 | " reviews_seq = tokenizer.texts_to_sequences(reviews)\n", 275 | " return np.array(tf.keras.preprocessing.sequence.pad_sequences(reviews_seq, maxlen=seq_maxlen))\n", 276 | "\n", 277 | "word2vec_model = KeyedVectors.load_word2vec_format(word2vec_file, binary = True)\n", 278 | "\n", 279 | "def getEmbeddingWeightMatrix(word2idx): \n", 280 | " embedding_matrix = np.zeros((len(word2idx)+1, 300))\n", 281 | " for word, i in tqdm(word2idx.items()):\n", 282 | " \n", 283 | " embedding_vector = word2vec_model[word] if word in word2vec_model else np.random.rand(1,300)\n", 284 | " if embedding_vector is not None:\n", 285 | " embedding_matrix[i] = embedding_vector\n", 286 | " return embedding_matrix" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 6, 292 | "id": "1c97283f", 293 | "metadata": { 294 | 
"execution": { 295 | "iopub.execute_input": "2021-11-27T13:40:56.460244Z", 296 | "iopub.status.busy": "2021-11-27T13:40:56.459548Z", 297 | "iopub.status.idle": "2021-11-27T13:41:10.200708Z", 298 | "shell.execute_reply": "2021-11-27T13:41:10.200210Z", 299 | "shell.execute_reply.started": "2021-11-27T13:11:45.518534Z" 300 | }, 301 | "papermill": { 302 | "duration": 13.783239, 303 | "end_time": "2021-11-27T13:41:10.200838", 304 | "exception": false, 305 | "start_time": "2021-11-27T13:40:56.417599", 306 | "status": "completed" 307 | }, 308 | "tags": [] 309 | }, 310 | "outputs": [ 311 | { 312 | "name": "stdout", 313 | "output_type": "stream", 314 | "text": [ 315 | "67043\n" 316 | ] 317 | } 318 | ], 319 | "source": [ 320 | "tokenizer = buildVocabulary(train['question1'].tolist()+train['question2'].tolist()+val['question1'].tolist()+val['question2'].tolist()+test['question1'].tolist()+test['question2'].tolist())\n", 321 | "vocab_size = len(tokenizer.word_index) + 1\n", 322 | "print(vocab_size)\n", 323 | "\n", 324 | "x_train1 = getSequences(train['question1'], tokenizer, 200)\n", 325 | "x_train2 = getSequences(train['question2'], tokenizer, 200)\n", 326 | "y_train = tf.keras.utils.to_categorical(train['is_duplicate'])\n", 327 | "\n", 328 | "x_val1 = getSequences(val['question1'], tokenizer, 200)\n", 329 | "x_val2 = getSequences(val['question2'], tokenizer, 200)\n", 330 | "y_val = tf.keras.utils.to_categorical(val['is_duplicate'])\n", 331 | "\n", 332 | "x_test1 = getSequences(test['question1'], tokenizer, 200)\n", 333 | "x_test2 = getSequences(test['question2'], tokenizer, 200)\n", 334 | "y_test = tf.keras.utils.to_categorical(test['is_duplicate'])" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 7, 340 | "id": "580d251a", 341 | "metadata": { 342 | "execution": { 343 | "iopub.execute_input": "2021-11-27T13:41:10.228748Z", 344 | "iopub.status.busy": "2021-11-27T13:41:10.228020Z", 345 | "iopub.status.idle": "2021-11-27T13:41:10.705699Z", 346 | 
"shell.execute_reply": "2021-11-27T13:41:10.704306Z", 347 | "shell.execute_reply.started": "2021-11-27T13:12:14.356043Z" 348 | }, 349 | "papermill": { 350 | "duration": 0.493073, 351 | "end_time": "2021-11-27T13:41:10.705828", 352 | "exception": false, 353 | "start_time": "2021-11-27T13:41:10.212755", 354 | "status": "completed" 355 | }, 356 | "tags": [] 357 | }, 358 | "outputs": [ 359 | { 360 | "name": "stderr", 361 | "output_type": "stream", 362 | "text": [ 363 | "100%|██████████| 67042/67042 [00:00<00:00, 143587.33it/s]" 364 | ] 365 | }, 366 | { 367 | "name": "stdout", 368 | "output_type": "stream", 369 | "text": [ 370 | "(67043, 300)\n" 371 | ] 372 | }, 373 | { 374 | "name": "stderr", 375 | "output_type": "stream", 376 | "text": [ 377 | "\n" 378 | ] 379 | } 380 | ], 381 | "source": [ 382 | "#embedding_vectors = loadGloveWordEmbeddings()\n", 383 | "#print(len(embedding_vectors))\n", 384 | "\n", 385 | "embedding_weight_matrix = getEmbeddingWeightMatrix(tokenizer.word_index)\n", 386 | "print(embedding_weight_matrix.shape)" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 8, 392 | "id": "376d9fdb", 393 | "metadata": { 394 | "execution": { 395 | "iopub.execute_input": "2021-11-27T13:41:10.744344Z", 396 | "iopub.status.busy": "2021-11-27T13:41:10.743779Z", 397 | "iopub.status.idle": "2021-11-27T13:41:13.935711Z", 398 | "shell.execute_reply": "2021-11-27T13:41:13.934772Z", 399 | "shell.execute_reply.started": "2021-11-27T13:27:27.669498Z" 400 | }, 401 | "papermill": { 402 | "duration": 3.215972, 403 | "end_time": "2021-11-27T13:41:13.935848", 404 | "exception": false, 405 | "start_time": "2021-11-27T13:41:10.719876", 406 | "status": "completed" 407 | }, 408 | "tags": [] 409 | }, 410 | "outputs": [ 411 | { 412 | "name": "stderr", 413 | "output_type": "stream", 414 | "text": [ 415 | "2021-11-27 13:41:10.832116: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there 
must be at least one NUMA node, so returning NUMA node zero\n", 416 | "2021-11-27 13:41:10.981618: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 417 | "2021-11-27 13:41:10.982427: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 418 | "2021-11-27 13:41:10.983858: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA\n", 419 | "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", 420 | "2021-11-27 13:41:10.985130: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 421 | "2021-11-27 13:41:10.985799: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 422 | "2021-11-27 13:41:10.986424: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 423 | "2021-11-27 13:41:13.311232: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 424 | "2021-11-27 13:41:13.311964: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at 
least one NUMA node, so returning NUMA node zero\n", 425 | "2021-11-27 13:41:13.312674: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 426 | "2021-11-27 13:41:13.313255: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15403 MB memory: -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0\n" 427 | ] 428 | } 429 | ], 430 | "source": [ 431 | "#he_initializer = tf.keras.initializers.HeUniform()\n", 432 | "inp1 = tf.keras.Input(shape=(x_train1.shape[1],))\n", 433 | "inp2 = tf.keras.Input(shape=(x_train2.shape[1],))\n", 434 | "\n", 435 | "inner1= tf.keras.layers.Embedding(input_dim=67043, output_dim=300, input_length=200, \n", 436 | " weights=[embedding_weight_matrix], trainable=True)(inp1)\n", 437 | "inner2= tf.keras.layers.Embedding(input_dim=67043, output_dim=300, input_length=200,\n", 438 | " weights=[embedding_weight_matrix], trainable=True)(inp2)\n", 439 | " \n", 440 | "inner = tf.keras.layers.concatenate([inner1+inner2, inner1-inner2, tf.math.multiply(inner1, inner2)], axis=-1)\n", 441 | "\n", 442 | "inner = tf.keras.backend.sum(inner, axis=1, keepdims=False)\n", 443 | "inner = tf.keras.layers.Dense(300, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01))(inner)\n", 444 | "inner = tf.keras.layers.Dropout(0.2)(inner)\n", 445 | "inner = tf.keras.layers.Dense(200, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01))(inner)\n", 446 | "inner = tf.keras.layers.Dropout(0.2)(inner)\n", 447 | "inner = tf.keras.layers.Dense(100, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01))(inner)\n", 448 | "inner = tf.keras.layers.Dropout(0.2)(inner)\n", 449 | "output = tf.keras.layers.Dense(2, activation='softmax')(inner)\n", 450 | "\n", 451 | "model = 
tf.keras.Model(inputs = [inp1, inp2], outputs = output)" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": 9, 457 | "id": "e7dba2d8", 458 | "metadata": { 459 | "execution": { 460 | "iopub.execute_input": "2021-11-27T13:41:13.973100Z", 461 | "iopub.status.busy": "2021-11-27T13:41:13.972255Z", 462 | "iopub.status.idle": "2021-11-27T13:41:13.987912Z", 463 | "shell.execute_reply": "2021-11-27T13:41:13.988527Z", 464 | "shell.execute_reply.started": "2021-11-27T13:27:28.324190Z" 465 | }, 466 | "papermill": { 467 | "duration": 0.037826, 468 | "end_time": "2021-11-27T13:41:13.988697", 469 | "exception": false, 470 | "start_time": "2021-11-27T13:41:13.950871", 471 | "status": "completed" 472 | }, 473 | "tags": [] 474 | }, 475 | "outputs": [ 476 | { 477 | "name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "Model: \"model\"\n", 481 | "__________________________________________________________________________________________________\n", 482 | "Layer (type) Output Shape Param # Connected to \n", 483 | "==================================================================================================\n", 484 | "input_1 (InputLayer) [(None, 200)] 0 \n", 485 | "__________________________________________________________________________________________________\n", 486 | "input_2 (InputLayer) [(None, 200)] 0 \n", 487 | "__________________________________________________________________________________________________\n", 488 | "embedding (Embedding) (None, 200, 300) 20112900 input_1[0][0] \n", 489 | "__________________________________________________________________________________________________\n", 490 | "embedding_1 (Embedding) (None, 200, 300) 20112900 input_2[0][0] \n", 491 | "__________________________________________________________________________________________________\n", 492 | "tf.__operators__.add (TFOpLambd (None, 200, 300) 0 embedding[0][0] \n", 493 | " embedding_1[0][0] \n", 494 | 
"__________________________________________________________________________________________________\n", 495 | "tf.math.subtract (TFOpLambda) (None, 200, 300) 0 embedding[0][0] \n", 496 | " embedding_1[0][0] \n", 497 | "__________________________________________________________________________________________________\n", 498 | "tf.math.multiply (TFOpLambda) (None, 200, 300) 0 embedding[0][0] \n", 499 | " embedding_1[0][0] \n", 500 | "__________________________________________________________________________________________________\n", 501 | "concatenate (Concatenate) (None, 200, 900) 0 tf.__operators__.add[0][0] \n", 502 | " tf.math.subtract[0][0] \n", 503 | " tf.math.multiply[0][0] \n", 504 | "__________________________________________________________________________________________________\n", 505 | "tf.math.reduce_sum (TFOpLambda) (None, 900) 0 concatenate[0][0] \n", 506 | "__________________________________________________________________________________________________\n", 507 | "dense (Dense) (None, 300) 270300 tf.math.reduce_sum[0][0] \n", 508 | "__________________________________________________________________________________________________\n", 509 | "dropout (Dropout) (None, 300) 0 dense[0][0] \n", 510 | "__________________________________________________________________________________________________\n", 511 | "dense_1 (Dense) (None, 200) 60200 dropout[0][0] \n", 512 | "__________________________________________________________________________________________________\n", 513 | "dropout_1 (Dropout) (None, 200) 0 dense_1[0][0] \n", 514 | "__________________________________________________________________________________________________\n", 515 | "dense_2 (Dense) (None, 100) 20100 dropout_1[0][0] \n", 516 | "__________________________________________________________________________________________________\n", 517 | "dropout_2 (Dropout) (None, 100) 0 dense_2[0][0] \n", 518 | 
"__________________________________________________________________________________________________\n", 519 | "dense_3 (Dense) (None, 2) 202 dropout_2[0][0] \n", 520 | "==================================================================================================\n", 521 | "Total params: 40,576,602\n", 522 | "Trainable params: 40,576,602\n", 523 | "Non-trainable params: 0\n", 524 | "__________________________________________________________________________________________________\n" 525 | ] 526 | } 527 | ], 528 | "source": [ 529 | "model.compile(optimizer = \"adam\", loss = 'categorical_crossentropy', metrics=['accuracy'])\n", 530 | "model.summary()" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": 10, 536 | "id": "5c0c86ea", 537 | "metadata": { 538 | "execution": { 539 | "iopub.execute_input": "2021-11-27T13:41:14.024992Z", 540 | "iopub.status.busy": "2021-11-27T13:41:14.024188Z", 541 | "iopub.status.idle": "2021-11-27T13:42:26.517456Z", 542 | "shell.execute_reply": "2021-11-27T13:42:26.518855Z", 543 | "shell.execute_reply.started": "2021-11-27T13:27:29.396799Z" 544 | }, 545 | "papermill": { 546 | "duration": 72.514821, 547 | "end_time": "2021-11-27T13:42:26.519128", 548 | "exception": false, 549 | "start_time": "2021-11-27T13:41:14.004307", 550 | "status": "completed" 551 | }, 552 | "tags": [] 553 | }, 554 | "outputs": [ 555 | { 556 | "name": "stderr", 557 | "output_type": "stream", 558 | "text": [ 559 | "2021-11-27 13:41:14.139989: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)\n" 560 | ] 561 | }, 562 | { 563 | "name": "stdout", 564 | "output_type": "stream", 565 | "text": [ 566 | "Epoch 1/4\n", 567 | "632/632 [==============================] - 23s 33ms/step - loss: 1.4585 - accuracy: 0.6716 - val_loss: 0.5944 - val_accuracy: 0.7091\n", 568 | "Epoch 2/4\n", 569 | "632/632 [==============================] - 16s 25ms/step - loss: 0.5432 - accuracy: 
0.7499 - val_loss: 0.5638 - val_accuracy: 0.7303\n", 570 | "Epoch 3/4\n", 571 | "632/632 [==============================] - 15s 24ms/step - loss: 0.4664 - accuracy: 0.7947 - val_loss: 0.6222 - val_accuracy: 0.7380\n", 572 | "Epoch 4/4\n", 573 | "632/632 [==============================] - 15s 24ms/step - loss: 0.4202 - accuracy: 0.8218 - val_loss: 0.5795 - val_accuracy: 0.7284\n" 574 | ] 575 | } 576 | ], 577 | "source": [ 578 | "save_weights = tf.keras.callbacks.ModelCheckpoint('cbow_mlp.h5', monitor='val_loss', save_best_only=True)\n", 579 | "checkpoint_filepath = 'weights.best.{epoch:01d}.hdf5'\n", 580 | "#model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,\n", 581 | "#verbose = 1,\n", 582 | "#monitor = 'val_loss',\n", 583 | "#save_best_only = False)\n", 584 | "history = model.fit((x_train1, x_train2), y_train,\n", 585 | " batch_size = 64,\n", 586 | " validation_data = ((x_val1, x_val2), y_val),\n", 587 | " validation_batch_size = 32,\n", 588 | " epochs=4, \n", 589 | " callbacks=[save_weights], \n", 590 | " verbose=1)" 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": 13, 596 | "id": "8e786b68", 597 | "metadata": { 598 | "execution": { 599 | "iopub.execute_input": "2021-11-27T13:42:28.244754Z", 600 | "iopub.status.busy": "2021-11-27T13:42:28.243805Z", 601 | "iopub.status.idle": "2021-11-27T13:43:09.569995Z", 602 | "shell.execute_reply": "2021-11-27T13:43:09.570512Z", 603 | "shell.execute_reply.started": "2021-11-27T13:31:52.063332Z" 604 | }, 605 | "papermill": { 606 | "duration": 41.528106, 607 | "end_time": "2021-11-27T13:43:09.570664", 608 | "exception": false, 609 | "start_time": "2021-11-27T13:42:28.042558", 610 | "status": "completed" 611 | }, 612 | "tags": [] 613 | }, 614 | "outputs": [ 615 | { 616 | "name": "stdout", 617 | "output_type": "stream", 618 | "text": [ 619 | "10108/10108 [==============================] - 23s 2ms/step - loss: 0.3454 - accuracy: 0.8793\n", 620 | "loss on test data 
is 0.345432311296463\n", 621 | "accuracy on test data is 0.8792945742607117\n" 622 | ] 623 | } 624 | ], 625 | "source": [ 626 | "loss, accuracy = model.evaluate((x_test1, x_test2), y_test, batch_size=4, verbose=1)\n", 627 | "\n", 628 | "print('loss on test data is', loss)\n", 629 | "print('accuracy on test data is', accuracy)" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": 14, 635 | "id": "c84c1ff4", 636 | "metadata": { 637 | "execution": { 638 | "iopub.execute_input": "2021-11-27T13:43:10.313909Z", 639 | "iopub.status.busy": "2021-11-27T13:43:10.306261Z", 640 | "iopub.status.idle": "2021-11-27T13:43:11.829214Z", 641 | "shell.execute_reply": "2021-11-27T13:43:11.829678Z", 642 | "shell.execute_reply.started": "2021-11-27T13:33:47.897210Z" 643 | }, 644 | "papermill": { 645 | "duration": 1.948284, 646 | "end_time": "2021-11-27T13:43:11.829824", 647 | "exception": false, 648 | "start_time": "2021-11-27T13:43:09.881540", 649 | "status": "completed" 650 | }, 651 | "tags": [] 652 | }, 653 | "outputs": [ 654 | { 655 | "name": "stdout", 656 | "output_type": "stream", 657 | "text": [ 658 | "F1_score on test is 0.8467144113582108\n" 659 | ] 660 | } 661 | ], 662 | "source": [ 663 | "pred = model.predict((x_test1, x_test2))\n", 664 | "print('F1_score on test is', f1_score(np.argmax(pred, axis=1), np.argmax(y_test, axis=1)))\n" 665 | ] 666 | }, 667 | { 668 | "cell_type": "code", 669 | "execution_count": null, 670 | "id": "9929df87", 671 | "metadata": { 672 | "papermill": { 673 | "duration": 0.308571, 674 | "end_time": "2021-11-27T13:43:12.450411", 675 | "exception": false, 676 | "start_time": "2021-11-27T13:43:12.141840", 677 | "status": "completed" 678 | }, 679 | "tags": [] 680 | }, 681 | "outputs": [], 682 | "source": [] 683 | } 684 | ], 685 | "metadata": { 686 | "kernelspec": { 687 | "display_name": "Python 3", 688 | "language": "python", 689 | "name": "python3" 690 | }, 691 | "language_info": { 692 | "codemirror_mode": { 693 | "name": "ipython", 
694 | "version": 3 695 | }, 696 | "file_extension": ".py", 697 | "mimetype": "text/x-python", 698 | "name": "python", 699 | "nbconvert_exporter": "python", 700 | "pygments_lexer": "ipython3", 701 | "version": "3.8.8" 702 | }, 703 | "papermill": { 704 | "default_parameters": {}, 705 | "duration": 223.597372, 706 | "end_time": "2021-11-27T13:43:15.659790", 707 | "environment_variables": {}, 708 | "exception": null, 709 | "input_path": "__notebook__.ipynb", 710 | "output_path": "__notebook__.ipynb", 711 | "parameters": {}, 712 | "start_time": "2021-11-27T13:39:32.062418", 713 | "version": "2.3.3" 714 | } 715 | }, 716 | "nbformat": 4, 717 | "nbformat_minor": 5 718 | } 719 | -------------------------------------------------------------------------------- /CBOW MLP He initialisation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "f4ff64c0", 7 | "metadata": { 8 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 9 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", 10 | "execution": { 11 | "iopub.execute_input": "2021-11-27T13:47:47.616966Z", 12 | "iopub.status.busy": "2021-11-27T13:47:47.615429Z", 13 | "iopub.status.idle": "2021-11-27T13:47:52.415578Z", 14 | "shell.execute_reply": "2021-11-27T13:47:52.414840Z", 15 | "shell.execute_reply.started": "2021-11-27T13:34:29.669569Z" 16 | }, 17 | "papermill": { 18 | "duration": 4.817962, 19 | "end_time": "2021-11-27T13:47:52.415761", 20 | "exception": false, 21 | "start_time": "2021-11-27T13:47:47.597799", 22 | "status": "completed" 23 | }, 24 | "tags": [] 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "import pandas as pd\n", 29 | "import numpy as np\n", 30 | "from tqdm import tqdm\n", 31 | "import tensorflow as tf\n", 32 | "from gensim.models import KeyedVectors\n", 33 | "import gensim\n", 34 | "import re\n", 35 | "from sklearn.metrics import f1_score\n", 36 | "import matplotlib.pyplot as 
plt" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "id": "4978cce8", 43 | "metadata": { 44 | "execution": { 45 | "iopub.execute_input": "2021-11-27T13:47:52.446265Z", 46 | "iopub.status.busy": "2021-11-27T13:47:52.445619Z", 47 | "iopub.status.idle": "2021-11-27T13:47:53.538577Z", 48 | "shell.execute_reply": "2021-11-27T13:47:53.538101Z", 49 | "shell.execute_reply.started": "2021-11-27T13:10:42.402545Z" 50 | }, 51 | "papermill": { 52 | "duration": 1.109649, 53 | "end_time": "2021-11-27T13:47:53.538734", 54 | "exception": false, 55 | "start_time": "2021-11-27T13:47:52.429085", 56 | "status": "completed" 57 | }, 58 | "tags": [] 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "train = pd.read_csv('../input/quora-ques-pair/train_data.csv/train_data.csv').fillna('')  # was test_data.csv (test-set leakage); NOTE(review): train path assumed to mirror the val/test layout - confirm\n", 63 | "val = pd.read_csv('../input/quora-ques-pair/val_data.csv/val_data.csv').fillna('')\n", 64 | "test = pd.read_csv('../input/quora-ques-pair/test_data.csv/test_data.csv').fillna('')" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 3, 70 | "id": "93666f68", 71 | "metadata": { 72 | "execution": { 73 | "iopub.execute_input": "2021-11-27T13:47:53.565394Z", 74 | "iopub.status.busy": "2021-11-27T13:47:53.564877Z", 75 | "iopub.status.idle": "2021-11-27T13:47:53.568521Z", 76 | "shell.execute_reply": "2021-11-27T13:47:53.568123Z", 77 | "shell.execute_reply.started": "2021-11-27T13:10:43.715098Z" 78 | }, 79 | "papermill": { 80 | "duration": 0.018444, 81 | "end_time": "2021-11-27T13:47:53.568628", 82 | "exception": false, 83 | "start_time": "2021-11-27T13:47:53.550184", 84 | "status": "completed" 85 | }, 86 | "tags": [] 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "word2vec_file = '../input/d/sandreds/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin'" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 4, 96 | "id": "69688c16", 97 | "metadata": { 98 | "execution": { 99 | "iopub.execute_input": 
"2021-11-27T13:47:53.598579Z", 100 | "iopub.status.busy": "2021-11-27T13:47:53.597979Z", 101 | "iopub.status.idle": "2021-11-27T13:47:53.611503Z", 102 | "shell.execute_reply": "2021-11-27T13:47:53.611946Z", 103 | "shell.execute_reply.started": "2021-11-27T13:10:43.722268Z" 104 | }, 105 | "papermill": { 106 | "duration": 0.03278, 107 | "end_time": "2021-11-27T13:47:53.612067", 108 | "exception": false, 109 | "start_time": "2021-11-27T13:47:53.579287", 110 | "status": "completed" 111 | }, 112 | "tags": [] 113 | }, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/html": [ 118 | "

	id	qid1	qid2	question1	question2	is_duplicate	question1_preprocessed	question2_preprocessed
0	204673	93885	307635	If there is a God, where is He!	Why is god a \"He\"?	0	if there is a god , where is he !	why is god a `` he '' ?
1	17716	2093	15628	Do you believe that everything happens for a r...	Does everything happen for a reason?	1	do you believe that everything happens for a r...	does everything happen for a reason ?
2	291767	352623	413255	Will there always be web hosting that will sup...	Will there always be web hosting that supports...	1	will there always be web hosting that will sup...	will there always be web hosting that supports...
3	203758	59824	67971	What is the proof of Indian Army's surgical st...	Has India provided any proof of the surgical s...	1	what is the proof of indian army 's surgical s...	has india provided any proof of the surgical s...
4	41747	75326	75327	What do Indian Muslims think of Modi?	What do Indian Muslim think about PM Narendra ...	1	what do indian muslims think of modi ?	what do indian muslim think about pm narendra ...

\n", 204 | "

" 205 | ], 206 | "text/plain": [ 207 | " id qid1 qid2 question1 \\\n", 208 | "0 204673 93885 307635 If there is a God, where is He! \n", 209 | "1 17716 2093 15628 Do you believe that everything happens for a r... \n", 210 | "2 291767 352623 413255 Will there always be web hosting that will sup... \n", 211 | "3 203758 59824 67971 What is the proof of Indian Army's surgical st... \n", 212 | "4 41747 75326 75327 What do Indian Muslims think of Modi? \n", 213 | "\n", 214 | " question2 is_duplicate \\\n", 215 | "0 Why is god a \"He\"? 0 \n", 216 | "1 Does everything happen for a reason? 1 \n", 217 | "2 Will there always be web hosting that supports... 1 \n", 218 | "3 Has India provided any proof of the surgical s... 1 \n", 219 | "4 What do Indian Muslim think about PM Narendra ... 1 \n", 220 | "\n", 221 | " question1_preprocessed \\\n", 222 | "0 if there is a god , where is he ! \n", 223 | "1 do you believe that everything happens for a r... \n", 224 | "2 will there always be web hosting that will sup... \n", 225 | "3 what is the proof of indian army 's surgical s... \n", 226 | "4 what do indian muslims think of modi ? \n", 227 | "\n", 228 | " question2_preprocessed \n", 229 | "0 why is god a `` he '' ? \n", 230 | "1 does everything happen for a reason ? \n", 231 | "2 will there always be web hosting that supports... \n", 232 | "3 has india provided any proof of the surgical s... \n", 233 | "4 what do indian muslim think about pm narendra ... 
" 234 | ] 235 | }, 236 | "execution_count": 4, 237 | "metadata": {}, 238 | "output_type": "execute_result" 239 | } 240 | ], 241 | "source": [ 242 | "train.head()" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 5, 248 | "id": "d38628e4", 249 | "metadata": { 250 | "execution": { 251 | "iopub.execute_input": "2021-11-27T13:47:53.641763Z", 252 | "iopub.status.busy": "2021-11-27T13:47:53.641235Z", 253 | "iopub.status.idle": "2021-11-27T13:48:55.410135Z", 254 | "shell.execute_reply": "2021-11-27T13:48:55.409588Z", 255 | "shell.execute_reply.started": "2021-11-27T13:10:43.746605Z" 256 | }, 257 | "papermill": { 258 | "duration": 61.786788, 259 | "end_time": "2021-11-27T13:48:55.410279", 260 | "exception": false, 261 | "start_time": "2021-11-27T13:47:53.623491", 262 | "status": "completed" 263 | }, 264 | "tags": [] 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "def buildVocabulary(reviews):\n", 269 | " tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False, split=' ')\n", 270 | " tokenizer.fit_on_texts(reviews)\n", 271 | " return tokenizer\n", 272 | "\n", 273 | "def getSequences(reviews, tokenizer, seq_maxlen):\n", 274 | " reviews_seq = tokenizer.texts_to_sequences(reviews)\n", 275 | " return np.array(tf.keras.preprocessing.sequence.pad_sequences(reviews_seq, maxlen=seq_maxlen))\n", 276 | "\n", 277 | "word2vec_model = KeyedVectors.load_word2vec_format(word2vec_file, binary = True)\n", 278 | "\n", 279 | "def getEmbeddingWeightMatrix(word2idx): \n", 280 | " embedding_matrix = np.zeros((len(word2idx)+1, 300))\n", 281 | " for word, i in tqdm(word2idx.items()):\n", 282 | " \n", 283 | " embedding_vector = word2vec_model[word] if word in word2vec_model else np.random.rand(1,300)\n", 284 | " if embedding_vector is not None:\n", 285 | " embedding_matrix[i] = embedding_vector\n", 286 | " return embedding_matrix" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 6, 292 | "id": "28c391f9", 293 | "metadata": { 294 | 
"execution": { 295 | "iopub.execute_input": "2021-11-27T13:48:55.462462Z", 296 | "iopub.status.busy": "2021-11-27T13:48:55.455009Z", 297 | "iopub.status.idle": "2021-11-27T13:49:09.398443Z", 298 | "shell.execute_reply": "2021-11-27T13:49:09.397946Z", 299 | "shell.execute_reply.started": "2021-11-27T13:11:45.518534Z" 300 | }, 301 | "papermill": { 302 | "duration": 13.976622, 303 | "end_time": "2021-11-27T13:49:09.398601", 304 | "exception": false, 305 | "start_time": "2021-11-27T13:48:55.421979", 306 | "status": "completed" 307 | }, 308 | "tags": [] 309 | }, 310 | "outputs": [ 311 | { 312 | "name": "stdout", 313 | "output_type": "stream", 314 | "text": [ 315 | "67043\n" 316 | ] 317 | } 318 | ], 319 | "source": [ 320 | "tokenizer = buildVocabulary(train['question1'].tolist()+train['question2'].tolist()+val['question1'].tolist()+val['question2'].tolist()+test['question1'].tolist()+test['question2'].tolist())\n", 321 | "vocab_size = len(tokenizer.word_index) + 1\n", 322 | "print(vocab_size)\n", 323 | "\n", 324 | "x_train1 = getSequences(train['question1'], tokenizer, 200)\n", 325 | "x_train2 = getSequences(train['question2'], tokenizer, 200)\n", 326 | "y_train = tf.keras.utils.to_categorical(train['is_duplicate'])\n", 327 | "\n", 328 | "x_val1 = getSequences(val['question1'], tokenizer, 200)\n", 329 | "x_val2 = getSequences(val['question2'], tokenizer, 200)\n", 330 | "y_val = tf.keras.utils.to_categorical(val['is_duplicate'])\n", 331 | "\n", 332 | "x_test1 = getSequences(test['question1'], tokenizer, 200)\n", 333 | "x_test2 = getSequences(test['question2'], tokenizer, 200)\n", 334 | "y_test = tf.keras.utils.to_categorical(test['is_duplicate'])" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 7, 340 | "id": "2b48ce03", 341 | "metadata": { 342 | "execution": { 343 | "iopub.execute_input": "2021-11-27T13:49:09.427768Z", 344 | "iopub.status.busy": "2021-11-27T13:49:09.427184Z", 345 | "iopub.status.idle": "2021-11-27T13:49:09.887361Z", 346 | 
"shell.execute_reply": "2021-11-27T13:49:09.888055Z", 347 | "shell.execute_reply.started": "2021-11-27T13:12:14.356043Z" 348 | }, 349 | "papermill": { 350 | "duration": 0.47744, 351 | "end_time": "2021-11-27T13:49:09.888245", 352 | "exception": false, 353 | "start_time": "2021-11-27T13:49:09.410805", 354 | "status": "completed" 355 | }, 356 | "tags": [] 357 | }, 358 | "outputs": [ 359 | { 360 | "name": "stderr", 361 | "output_type": "stream", 362 | "text": [ 363 | "100%|██████████| 67042/67042 [00:00<00:00, 148266.73it/s]" 364 | ] 365 | }, 366 | { 367 | "name": "stdout", 368 | "output_type": "stream", 369 | "text": [ 370 | "(67043, 300)\n" 371 | ] 372 | }, 373 | { 374 | "name": "stderr", 375 | "output_type": "stream", 376 | "text": [ 377 | "\n" 378 | ] 379 | } 380 | ], 381 | "source": [ 382 | "#embedding_vectors = loadGloveWordEmbeddings()\n", 383 | "#print(len(embedding_vectors))\n", 384 | "\n", 385 | "embedding_weight_matrix = getEmbeddingWeightMatrix(tokenizer.word_index)\n", 386 | "print(embedding_weight_matrix.shape)" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 8, 392 | "id": "6d070a67", 393 | "metadata": { 394 | "execution": { 395 | "iopub.execute_input": "2021-11-27T13:49:09.928507Z", 396 | "iopub.status.busy": "2021-11-27T13:49:09.927955Z", 397 | "iopub.status.idle": "2021-11-27T13:49:12.560874Z", 398 | "shell.execute_reply": "2021-11-27T13:49:12.560395Z", 399 | "shell.execute_reply.started": "2021-11-27T13:40:42.738444Z" 400 | }, 401 | "papermill": { 402 | "duration": 2.657589, 403 | "end_time": "2021-11-27T13:49:12.561002", 404 | "exception": false, 405 | "start_time": "2021-11-27T13:49:09.903413", 406 | "status": "completed" 407 | }, 408 | "tags": [] 409 | }, 410 | "outputs": [ 411 | { 412 | "name": "stderr", 413 | "output_type": "stream", 414 | "text": [ 415 | "2021-11-27 13:49:10.021754: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there 
must be at least one NUMA node, so returning NUMA node zero\n", 416 | "2021-11-27 13:49:10.127095: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 417 | "2021-11-27 13:49:10.127865: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 418 | "2021-11-27 13:49:10.129230: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA\n", 419 | "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", 420 | "2021-11-27 13:49:10.130458: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 421 | "2021-11-27 13:49:10.131136: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 422 | "2021-11-27 13:49:10.131767: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 423 | "2021-11-27 13:49:11.945849: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 424 | "2021-11-27 13:49:11.946562: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at 
least one NUMA node, so returning NUMA node zero\n", 425 | "2021-11-27 13:49:11.947539: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 426 | "2021-11-27 13:49:11.948205: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15403 MB memory: -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0\n" 427 | ] 428 | } 429 | ], 430 | "source": [ 431 | "he_initializer = tf.keras.initializers.HeUniform()\n", 432 | "inp1 = tf.keras.Input(shape=(x_train1.shape[1],))\n", 433 | "inp2 = tf.keras.Input(shape=(x_train2.shape[1],))\n", 434 | "\n", 435 | "inner1= tf.keras.layers.Embedding(input_dim=67043, output_dim=300, input_length=200, \n", 436 | " weights=[embedding_weight_matrix], trainable=True)(inp1)\n", 437 | "inner2= tf.keras.layers.Embedding(input_dim=67043, output_dim=300, input_length=200,\n", 438 | " weights=[embedding_weight_matrix], trainable=True)(inp2)\n", 439 | " \n", 440 | "inner = tf.keras.layers.concatenate([inner1+inner2, inner1-inner2, tf.math.multiply(inner1, inner2)], axis=-1)\n", 441 | "\n", 442 | "inner = tf.keras.backend.sum(inner, axis=1, keepdims=False)\n", 443 | "inner = tf.keras.layers.Dense(300, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01), kernel_initializer = he_initializer)(inner)\n", 444 | "inner = tf.keras.layers.Dropout(0.2)(inner)\n", 445 | "inner = tf.keras.layers.Dense(200, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01), kernel_initializer = he_initializer)(inner)\n", 446 | "inner = tf.keras.layers.Dropout(0.2)(inner)\n", 447 | "inner = tf.keras.layers.Dense(100, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01), kernel_initializer = he_initializer)(inner)\n", 448 | "inner = 
tf.keras.layers.Dropout(0.2)(inner)\n", 449 | "output = tf.keras.layers.Dense(2, activation='softmax')(inner)\n", 450 | "\n", 451 | "model = tf.keras.Model(inputs = [inp1, inp2], outputs = output)" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": 9, 457 | "id": "060a6db0", 458 | "metadata": { 459 | "execution": { 460 | "iopub.execute_input": "2021-11-27T13:49:12.597283Z", 461 | "iopub.status.busy": "2021-11-27T13:49:12.596481Z", 462 | "iopub.status.idle": "2021-11-27T13:49:12.612017Z", 463 | "shell.execute_reply": "2021-11-27T13:49:12.611523Z", 464 | "shell.execute_reply.started": "2021-11-27T13:40:46.214676Z" 465 | }, 466 | "papermill": { 467 | "duration": 0.036856, 468 | "end_time": "2021-11-27T13:49:12.612121", 469 | "exception": false, 470 | "start_time": "2021-11-27T13:49:12.575265", 471 | "status": "completed" 472 | }, 473 | "tags": [] 474 | }, 475 | "outputs": [ 476 | { 477 | "name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "Model: \"model\"\n", 481 | "__________________________________________________________________________________________________\n", 482 | "Layer (type) Output Shape Param # Connected to \n", 483 | "==================================================================================================\n", 484 | "input_1 (InputLayer) [(None, 200)] 0 \n", 485 | "__________________________________________________________________________________________________\n", 486 | "input_2 (InputLayer) [(None, 200)] 0 \n", 487 | "__________________________________________________________________________________________________\n", 488 | "embedding (Embedding) (None, 200, 300) 20112900 input_1[0][0] \n", 489 | "__________________________________________________________________________________________________\n", 490 | "embedding_1 (Embedding) (None, 200, 300) 20112900 input_2[0][0] \n", 491 | "__________________________________________________________________________________________________\n", 492 | 
"tf.__operators__.add (TFOpLambd (None, 200, 300) 0 embedding[0][0] \n", 493 | " embedding_1[0][0] \n", 494 | "__________________________________________________________________________________________________\n", 495 | "tf.math.subtract (TFOpLambda) (None, 200, 300) 0 embedding[0][0] \n", 496 | " embedding_1[0][0] \n", 497 | "__________________________________________________________________________________________________\n", 498 | "tf.math.multiply (TFOpLambda) (None, 200, 300) 0 embedding[0][0] \n", 499 | " embedding_1[0][0] \n", 500 | "__________________________________________________________________________________________________\n", 501 | "concatenate (Concatenate) (None, 200, 900) 0 tf.__operators__.add[0][0] \n", 502 | " tf.math.subtract[0][0] \n", 503 | " tf.math.multiply[0][0] \n", 504 | "__________________________________________________________________________________________________\n", 505 | "tf.math.reduce_sum (TFOpLambda) (None, 900) 0 concatenate[0][0] \n", 506 | "__________________________________________________________________________________________________\n", 507 | "dense (Dense) (None, 300) 270300 tf.math.reduce_sum[0][0] \n", 508 | "__________________________________________________________________________________________________\n", 509 | "dropout (Dropout) (None, 300) 0 dense[0][0] \n", 510 | "__________________________________________________________________________________________________\n", 511 | "dense_1 (Dense) (None, 200) 60200 dropout[0][0] \n", 512 | "__________________________________________________________________________________________________\n", 513 | "dropout_1 (Dropout) (None, 200) 0 dense_1[0][0] \n", 514 | "__________________________________________________________________________________________________\n", 515 | "dense_2 (Dense) (None, 100) 20100 dropout_1[0][0] \n", 516 | "__________________________________________________________________________________________________\n", 517 | "dropout_2 (Dropout) (None, 100) 
0 dense_2[0][0] \n", 518 | "__________________________________________________________________________________________________\n", 519 | "dense_3 (Dense) (None, 2) 202 dropout_2[0][0] \n", 520 | "==================================================================================================\n", 521 | "Total params: 40,576,602\n", 522 | "Trainable params: 40,576,602\n", 523 | "Non-trainable params: 0\n", 524 | "__________________________________________________________________________________________________\n" 525 | ] 526 | } 527 | ], 528 | "source": [ 529 | "model.compile(optimizer = \"adam\", loss = 'categorical_crossentropy', metrics=['accuracy'])\n", 530 | "model.summary()" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": 10, 536 | "id": "cca93a0e", 537 | "metadata": { 538 | "execution": { 539 | "iopub.execute_input": "2021-11-27T13:49:12.646592Z", 540 | "iopub.status.busy": "2021-11-27T13:49:12.645774Z", 541 | "iopub.status.idle": "2021-11-27T13:50:35.620684Z", 542 | "shell.execute_reply": "2021-11-27T13:50:35.619421Z", 543 | "shell.execute_reply.started": "2021-11-27T13:42:31.052600Z" 544 | }, 545 | "papermill": { 546 | "duration": 82.994418, 547 | "end_time": "2021-11-27T13:50:35.620868", 548 | "exception": false, 549 | "start_time": "2021-11-27T13:49:12.626450", 550 | "status": "completed" 551 | }, 552 | "tags": [] 553 | }, 554 | "outputs": [ 555 | { 556 | "name": "stdout", 557 | "output_type": "stream", 558 | "text": [ 559 | "Epoch 1/4\n" 560 | ] 561 | }, 562 | { 563 | "name": "stderr", 564 | "output_type": "stream", 565 | "text": [ 566 | "2021-11-27 13:49:12.731819: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)\n" 567 | ] 568 | }, 569 | { 570 | "name": "stdout", 571 | "output_type": "stream", 572 | "text": [ 573 | "632/632 [==============================] - 18s 25ms/step - loss: 2.7537 - accuracy: 0.6674 - val_loss: 0.7220 - val_accuracy: 
0.7186\n", 574 | "Epoch 2/4\n", 575 | "632/632 [==============================] - 14s 21ms/step - loss: 0.5893 - accuracy: 0.7484 - val_loss: 0.5575 - val_accuracy: 0.7417\n", 576 | "Epoch 3/4\n", 577 | "632/632 [==============================] - 16s 25ms/step - loss: 0.4798 - accuracy: 0.7898 - val_loss: 0.5790 - val_accuracy: 0.7410\n", 578 | "Epoch 4/4\n", 579 | "632/632 [==============================] - 16s 25ms/step - loss: 0.4240 - accuracy: 0.8221 - val_loss: 0.5786 - val_accuracy: 0.7387\n" 580 | ] 581 | } 582 | ], 583 | "source": [ 584 | "save_weights = tf.keras.callbacks.ModelCheckpoint('cbow_mlp.h5', monitor='val_loss', save_best_only=True)\n", 585 | "checkpoint_filepath = 'weights.best.{epoch:01d}.hdf5'\n", 586 | "#model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,\n", 587 | "#verbose = 1,\n", 588 | "#monitor = 'val_loss',\n", 589 | "#save_best_only = False)\n", 590 | "history = model.fit((x_train1, x_train2), y_train,\n", 591 | " batch_size = 64,\n", 592 | " validation_data = ((x_val1, x_val2), y_val),\n", 593 | " validation_batch_size = 64,\n", 594 | " epochs=4, \n", 595 | " callbacks=[save_weights], \n", 596 | " verbose=1)" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": 13, 602 | "id": "54a1dca3", 603 | "metadata": { 604 | "execution": { 605 | "iopub.execute_input": "2021-11-27T13:50:37.375741Z", 606 | "iopub.status.busy": "2021-11-27T13:50:37.374766Z", 607 | "iopub.status.idle": "2021-11-27T13:51:18.727911Z", 608 | "shell.execute_reply": "2021-11-27T13:51:18.728543Z", 609 | "shell.execute_reply.started": "2021-11-27T13:45:19.009457Z" 610 | }, 611 | "papermill": { 612 | "duration": 41.560134, 613 | "end_time": "2021-11-27T13:51:18.728745", 614 | "exception": false, 615 | "start_time": "2021-11-27T13:50:37.168611", 616 | "status": "completed" 617 | }, 618 | "tags": [] 619 | }, 620 | "outputs": [ 621 | { 622 | "name": "stdout", 623 | "output_type": "stream", 624 | "text": [ 625 | 
"10108/10108 [==============================] - 23s 2ms/step - loss: 0.3358 - accuracy: 0.8777\n", 626 | "loss on test data is 0.3357672095298767\n", 627 | "accuracy on test data is 0.8776867985725403\n" 628 | ] 629 | } 630 | ], 631 | "source": [ 632 | "loss, accuracy = model.evaluate((x_test1, x_test2), y_test, batch_size=4, verbose=1)\n", 633 | "\n", 634 | "print('loss on test data is', loss)\n", 635 | "print('accuracy on test data is', accuracy)" 636 | ] 637 | }, 638 | { 639 | "cell_type": "code", 640 | "execution_count": 14, 641 | "id": "112b1612", 642 | "metadata": { 643 | "execution": { 644 | "iopub.execute_input": "2021-11-27T13:51:19.723275Z", 645 | "iopub.status.busy": "2021-11-27T13:51:19.719029Z", 646 | "iopub.status.idle": "2021-11-27T13:51:22.786317Z", 647 | "shell.execute_reply": "2021-11-27T13:51:22.785865Z", 648 | "shell.execute_reply.started": "2021-11-27T13:47:08.430248Z" 649 | }, 650 | "papermill": { 651 | "duration": 3.646437, 652 | "end_time": "2021-11-27T13:51:22.786439", 653 | "exception": false, 654 | "start_time": "2021-11-27T13:51:19.140002", 655 | "status": "completed" 656 | }, 657 | "tags": [] 658 | }, 659 | "outputs": [ 660 | { 661 | "name": "stdout", 662 | "output_type": "stream", 663 | "text": [ 664 | "F1_score on test is 0.8322648485465214\n" 665 | ] 666 | } 667 | ], 668 | "source": [ 669 | "pred = model.predict((x_test1, x_test2))\n", 670 | "print('F1_score on test is', f1_score(np.argmax(pred, axis=1), np.argmax(y_test, axis=1)))\n" 671 | ] 672 | }, 673 | { 674 | "cell_type": "code", 675 | "execution_count": null, 676 | "id": "26e27882", 677 | "metadata": { 678 | "papermill": { 679 | "duration": 0.304682, 680 | "end_time": "2021-11-27T13:51:23.414985", 681 | "exception": false, 682 | "start_time": "2021-11-27T13:51:23.110303", 683 | "status": "completed" 684 | }, 685 | "tags": [] 686 | }, 687 | "outputs": [], 688 | "source": [] 689 | } 690 | ], 691 | "metadata": { 692 | "kernelspec": { 693 | "display_name": "Python 3", 694 | 
"language": "python", 695 | "name": "python3" 696 | }, 697 | "language_info": { 698 | "codemirror_mode": { 699 | "name": "ipython", 700 | "version": 3 701 | }, 702 | "file_extension": ".py", 703 | "mimetype": "text/x-python", 704 | "name": "python", 705 | "nbconvert_exporter": "python", 706 | "pygments_lexer": "ipython3", 707 | "version": "3.8.8" 708 | }, 709 | "papermill": { 710 | "default_parameters": {}, 711 | "duration": 226.464045, 712 | "end_time": "2021-11-27T13:51:26.695502", 713 | "environment_variables": {}, 714 | "exception": null, 715 | "input_path": "__notebook__.ipynb", 716 | "output_path": "__notebook__.ipynb", 717 | "parameters": {}, 718 | "start_time": "2021-11-27T13:47:40.231457", 719 | "version": "2.3.3" 720 | } 721 | }, 722 | "nbformat": 4, 723 | "nbformat_minor": 5 724 | } 725 | --------------------------------------------------------------------------------