├── README.md ├── glove-bilstm_paper_implementation.ipynb ├── glove-bilstm_experiment2.ipynb ├── CBOW MLP Ppaer Implementation.ipynb ├── glove-lstm_paper_experiment1.ipynb ├── CBOW MLP Sum Diff Product of Embeddings.ipynb ├── CBOW ML Dropout Regularisation.ipynb └── CBOW MLP He initialisation.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # smai_project 2 | 3 | 4 | This repository contains the code that partially fulfills the requirements of our course **Statistical Methods in AI**. The project is titled **Natural language understanding on Quora Question pairs dataset**. 5 | -------------------------------------------------------------------------------- /glove-bilstm_paper_implementation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 8 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", 9 | "execution": { 10 | "iopub.execute_input": "2021-12-05T07:21:24.696231Z", 11 | "iopub.status.busy": "2021-12-05T07:21:24.695446Z", 12 | "iopub.status.idle": "2021-12-05T07:21:29.177093Z", 13 | "shell.execute_reply": "2021-12-05T07:21:29.176247Z", 14 | "shell.execute_reply.started": "2021-12-05T07:21:24.696089Z" 15 | } 16 | }, 17 | "outputs": [], 18 | "source": [ 19 | "import pandas as pd\n", 20 | "import numpy as np\n", 21 | "from tqdm import tqdm\n", 22 | "import tensorflow as tf\n", 23 | "from sklearn.metrics import f1_score" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": { 30 | "execution": { 31 | "iopub.execute_input": "2021-12-05T07:21:29.179483Z", 32 | "iopub.status.busy": "2021-12-05T07:21:29.178997Z", 33 | "iopub.status.idle": "2021-12-05T07:21:32.486070Z", 34 | "shell.execute_reply": "2021-12-05T07:21:32.485330Z", 35 | "shell.execute_reply.started": 
"2021-12-05T07:21:29.179449Z" 36 | } 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "train = pd.read_csv('../input/smai-project-data/train_data.csv').fillna('')\n", 41 | "val = pd.read_csv('../input/smai-project-data/val_data.csv').fillna('')\n", 42 | "test = pd.read_csv('../input/smai-project-data/test_data.csv').fillna('')" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": { 49 | "execution": { 50 | "iopub.execute_input": "2021-12-05T07:21:32.489624Z", 51 | "iopub.status.busy": "2021-12-05T07:21:32.489415Z", 52 | "iopub.status.idle": "2021-12-05T07:21:32.511292Z", 53 | "shell.execute_reply": "2021-12-05T07:21:32.510674Z", 54 | "shell.execute_reply.started": "2021-12-05T07:21:32.489594Z" 55 | } 56 | }, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "text/html": [ 61 | "
\n", 62 | "\n", 75 | "\n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | "
idqid1qid2question1question2is_duplicatequestion1_preprocessedquestion2_preprocessed
080671573815739How do I play Pokémon GO in Korea?How do I play Pokémon GO in China?0how do i play pok mon go in korea ?how do i play pok mon go in china ?
136810112736104117What are some of the best side dishes for crab...What are some good side dishes for buffalo chi...0what are some of the best side dishes for crab...what are some good side dishes for buffalo chi...
270497121486121487Which is more advisable and better material fo...What is the best server setup for buddypress?0which is more advisable and better material fo...what is the best server setup for buddypress ?
3226567254474258192How do I improve logical programming skills?How can I improve my logical skills for progra...1how do i improve logical programming skills ?how can i improve my logical skills for progra...
473186481033062How close we are to see 3rd world war?How close is a World War III?1how close we are to see 3rd world war ?how close is a world war iii ?
\n", 147 | "
" 148 | ], 149 | "text/plain": [ 150 | " id qid1 qid2 question1 \\\n", 151 | "0 8067 15738 15739 How do I play Pokémon GO in Korea? \n", 152 | "1 368101 12736 104117 What are some of the best side dishes for crab... \n", 153 | "2 70497 121486 121487 Which is more advisable and better material fo... \n", 154 | "3 226567 254474 258192 How do I improve logical programming skills? \n", 155 | "4 73186 48103 3062 How close we are to see 3rd world war? \n", 156 | "\n", 157 | " question2 is_duplicate \\\n", 158 | "0 How do I play Pokémon GO in China? 0 \n", 159 | "1 What are some good side dishes for buffalo chi... 0 \n", 160 | "2 What is the best server setup for buddypress? 0 \n", 161 | "3 How can I improve my logical skills for progra... 1 \n", 162 | "4 How close is a World War III? 1 \n", 163 | "\n", 164 | " question1_preprocessed \\\n", 165 | "0 how do i play pok mon go in korea ? \n", 166 | "1 what are some of the best side dishes for crab... \n", 167 | "2 which is more advisable and better material fo... \n", 168 | "3 how do i improve logical programming skills ? \n", 169 | "4 how close we are to see 3rd world war ? \n", 170 | "\n", 171 | " question2_preprocessed \n", 172 | "0 how do i play pok mon go in china ? \n", 173 | "1 what are some good side dishes for buffalo chi... \n", 174 | "2 what is the best server setup for buddypress ? \n", 175 | "3 how can i improve my logical skills for progra... \n", 176 | "4 how close is a world war iii ? 
" 177 | ] 178 | }, 179 | "execution_count": 3, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "train.head()" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 4, 191 | "metadata": { 192 | "execution": { 193 | "iopub.execute_input": "2021-12-05T07:21:32.514130Z", 194 | "iopub.status.busy": "2021-12-05T07:21:32.513901Z", 195 | "iopub.status.idle": "2021-12-05T07:21:32.524800Z", 196 | "shell.execute_reply": "2021-12-05T07:21:32.524106Z", 197 | "shell.execute_reply.started": "2021-12-05T07:21:32.514094Z" 198 | } 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "def buildVocabulary(reviews):\n", 203 | " tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False, split=' ')\n", 204 | " tokenizer.fit_on_texts(reviews)\n", 205 | " return tokenizer\n", 206 | "\n", 207 | "def getSequences(reviews, tokenizer, seq_maxlen):\n", 208 | " reviews_seq = tokenizer.texts_to_sequences(reviews)\n", 209 | " return np.array(tf.keras.preprocessing.sequence.pad_sequences(reviews_seq, maxlen=seq_maxlen))\n", 210 | "\n", 211 | "def loadGloveWordEmbeddings():\n", 212 | " embedding_vectors = {}\n", 213 | " with open('../input/glove840b300dtxt/glove.840B.300d.txt') as f:\n", 214 | " for line in tqdm(f):\n", 215 | " values = line.split(' ')\n", 216 | " word = values[0]\n", 217 | " coefs = np.asarray(values[1:], dtype='float32')\n", 218 | " embedding_vectors[word] = coefs\n", 219 | " return embedding_vectors\n", 220 | "\n", 221 | "def getEmbeddingWeightMatrix(embedding_vectors, word2idx): \n", 222 | " embedding_matrix = np.zeros((len(word2idx)+1, 300))\n", 223 | " for word, i in tqdm(word2idx.items()):\n", 224 | " embedding_vector = embedding_vectors.get(word)\n", 225 | " if embedding_vector is not None:\n", 226 | " embedding_matrix[i] = embedding_vector\n", 227 | " return embedding_matrix" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 5, 233 | "metadata": { 234 | "execution": { 
235 | "iopub.execute_input": "2021-12-05T07:21:32.526477Z", 236 | "iopub.status.busy": "2021-12-05T07:21:32.526215Z", 237 | "iopub.status.idle": "2021-12-05T07:22:04.067116Z", 238 | "shell.execute_reply": "2021-12-05T07:22:04.066259Z", 239 | "shell.execute_reply.started": "2021-12-05T07:21:32.526443Z" 240 | } 241 | }, 242 | "outputs": [ 243 | { 244 | "name": "stdout", 245 | "output_type": "stream", 246 | "text": [ 247 | "119558\n" 248 | ] 249 | } 250 | ], 251 | "source": [ 252 | "tokenizer = buildVocabulary(train['question1'].tolist()+train['question2'].tolist()+val['question1'].tolist()+val['question2'].tolist()+test['question1'].tolist()+test['question2'].tolist())\n", 253 | "vocab_size = len(tokenizer.word_index) + 1\n", 254 | "print(vocab_size)\n", 255 | "\n", 256 | "x_train1 = getSequences(train['question1'], tokenizer, 128)\n", 257 | "x_train2 = getSequences(train['question2'], tokenizer, 128)\n", 258 | "y_train = tf.keras.utils.to_categorical(train['is_duplicate'])\n", 259 | "\n", 260 | "x_val1 = getSequences(val['question1'], tokenizer, 128)\n", 261 | "x_val2 = getSequences(val['question2'], tokenizer, 128)\n", 262 | "y_val = tf.keras.utils.to_categorical(val['is_duplicate'])\n", 263 | "\n", 264 | "x_test1 = getSequences(test['question1'], tokenizer, 128)\n", 265 | "x_test2 = getSequences(test['question2'], tokenizer, 128)\n", 266 | "y_test = tf.keras.utils.to_categorical(test['is_duplicate'])" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 6, 272 | "metadata": { 273 | "execution": { 274 | "iopub.execute_input": "2021-12-05T07:22:04.068714Z", 275 | "iopub.status.busy": "2021-12-05T07:22:04.068455Z", 276 | "iopub.status.idle": "2021-12-05T07:26:17.684935Z", 277 | "shell.execute_reply": "2021-12-05T07:26:17.684071Z", 278 | "shell.execute_reply.started": "2021-12-05T07:22:04.068679Z" 279 | } 280 | }, 281 | "outputs": [ 282 | { 283 | "name": "stderr", 284 | "output_type": "stream", 285 | "text": [ 286 | "2196018it [04:13, 
8673.48it/s]\n" 287 | ] 288 | }, 289 | { 290 | "name": "stdout", 291 | "output_type": "stream", 292 | "text": [ 293 | "2196017\n" 294 | ] 295 | }, 296 | { 297 | "name": "stderr", 298 | "output_type": "stream", 299 | "text": [ 300 | "100%|██████████| 119557/119557 [00:00<00:00, 289620.78it/s]" 301 | ] 302 | }, 303 | { 304 | "name": "stdout", 305 | "output_type": "stream", 306 | "text": [ 307 | "(119558, 300)\n" 308 | ] 309 | }, 310 | { 311 | "name": "stderr", 312 | "output_type": "stream", 313 | "text": [ 314 | "\n" 315 | ] 316 | } 317 | ], 318 | "source": [ 319 | "embedding_vectors = loadGloveWordEmbeddings()\n", 320 | "print(len(embedding_vectors))\n", 321 | "\n", 322 | "embedding_weight_matrix = getEmbeddingWeightMatrix(embedding_vectors, tokenizer.word_index)\n", 323 | "print(embedding_weight_matrix.shape)" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 8, 329 | "metadata": { 330 | "execution": { 331 | "iopub.execute_input": "2021-12-05T07:28:18.980466Z", 332 | "iopub.status.busy": "2021-12-05T07:28:18.979883Z", 333 | "iopub.status.idle": "2021-12-05T07:28:19.835475Z", 334 | "shell.execute_reply": "2021-12-05T07:28:19.834704Z", 335 | "shell.execute_reply.started": "2021-12-05T07:28:18.980425Z" 336 | } 337 | }, 338 | "outputs": [ 339 | { 340 | "name": "stderr", 341 | "output_type": "stream", 342 | "text": [ 343 | "2021-12-05 07:28:19.052944: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 143469600 exceeds 10% of free system memory.\n", 344 | "2021-12-05 07:28:19.259332: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 143469600 exceeds 10% of free system memory.\n" 345 | ] 346 | } 347 | ], 348 | "source": [ 349 | "inp1 = tf.keras.Input(shape=(x_train1.shape[1],))\n", 350 | "inp2 = tf.keras.Input(shape=(x_train2.shape[1],))\n", 351 | "\n", 352 | "inner1= tf.keras.layers.Embedding(input_dim=119558, output_dim=300, input_length=128, \n", 353 | " weights=[embedding_weight_matrix], 
trainable=False)(inp1)\n", 354 | "inner2= tf.keras.layers.Embedding(input_dim=119558, output_dim=300, input_length=128,\n", 355 | " weights=[embedding_weight_matrix], trainable=False)(inp2)\n", 356 | "\n", 357 | "inner = tf.keras.layers.concatenate([inner1+inner2, inner1-inner2, tf.math.multiply(inner1, inner2)], axis=-1)\n", 358 | "\n", 359 | "out = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(150, kernel_regularizer='l2', dropout=0.1, return_sequences=True))(inner)\n", 360 | "\n", 361 | "out = tf.keras.backend.mean(out, axis=1, keepdims=False)\n", 362 | "\n", 363 | "output = tf.keras.layers.Dense(2, kernel_regularizer='l2', activation='softmax')(out)\n", 364 | "\n", 365 | "model = tf.keras.Model(inputs = [inp1, inp2], outputs = output)" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 9, 371 | "metadata": { 372 | "execution": { 373 | "iopub.execute_input": "2021-12-05T07:28:22.230481Z", 374 | "iopub.status.busy": "2021-12-05T07:28:22.229633Z", 375 | "iopub.status.idle": "2021-12-05T07:28:22.254231Z", 376 | "shell.execute_reply": "2021-12-05T07:28:22.252329Z", 377 | "shell.execute_reply.started": "2021-12-05T07:28:22.230431Z" 378 | } 379 | }, 380 | "outputs": [ 381 | { 382 | "name": "stdout", 383 | "output_type": "stream", 384 | "text": [ 385 | "Model: \"model\"\n", 386 | "__________________________________________________________________________________________________\n", 387 | "Layer (type) Output Shape Param # Connected to \n", 388 | "==================================================================================================\n", 389 | "input_3 (InputLayer) [(None, 128)] 0 \n", 390 | "__________________________________________________________________________________________________\n", 391 | "input_4 (InputLayer) [(None, 128)] 0 \n", 392 | "__________________________________________________________________________________________________\n", 393 | "embedding_2 (Embedding) (None, 128, 300) 35867400 input_3[0][0] \n", 394 
| "__________________________________________________________________________________________________\n", 395 | "embedding_3 (Embedding) (None, 128, 300) 35867400 input_4[0][0] \n", 396 | "__________________________________________________________________________________________________\n", 397 | "tf.__operators__.add_1 (TFOpLam (None, 128, 300) 0 embedding_2[0][0] \n", 398 | " embedding_3[0][0] \n", 399 | "__________________________________________________________________________________________________\n", 400 | "tf.math.subtract_1 (TFOpLambda) (None, 128, 300) 0 embedding_2[0][0] \n", 401 | " embedding_3[0][0] \n", 402 | "__________________________________________________________________________________________________\n", 403 | "tf.math.multiply_1 (TFOpLambda) (None, 128, 300) 0 embedding_2[0][0] \n", 404 | " embedding_3[0][0] \n", 405 | "__________________________________________________________________________________________________\n", 406 | "concatenate_1 (Concatenate) (None, 128, 900) 0 tf.__operators__.add_1[0][0] \n", 407 | " tf.math.subtract_1[0][0] \n", 408 | " tf.math.multiply_1[0][0] \n", 409 | "__________________________________________________________________________________________________\n", 410 | "bidirectional_1 (Bidirectional) (None, 128, 300) 1261200 concatenate_1[0][0] \n", 411 | "__________________________________________________________________________________________________\n", 412 | "tf.math.reduce_mean_1 (TFOpLamb (None, 300) 0 bidirectional_1[0][0] \n", 413 | "__________________________________________________________________________________________________\n", 414 | "dense (Dense) (None, 2) 602 tf.math.reduce_mean_1[0][0] \n", 415 | "==================================================================================================\n", 416 | "Total params: 72,996,602\n", 417 | "Trainable params: 1,261,802\n", 418 | "Non-trainable params: 71,734,800\n", 419 | 
"__________________________________________________________________________________________________\n" 420 | ] 421 | } 422 | ], 423 | "source": [ 424 | "model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics=['accuracy'])\n", 425 | "model.summary()" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 15, 431 | "metadata": { 432 | "execution": { 433 | "iopub.execute_input": "2021-12-05T07:54:34.407682Z", 434 | "iopub.status.busy": "2021-12-05T07:54:34.406978Z", 435 | "iopub.status.idle": "2021-12-05T08:02:56.589419Z", 436 | "shell.execute_reply": "2021-12-05T08:02:56.588628Z", 437 | "shell.execute_reply.started": "2021-12-05T07:54:34.407647Z" 438 | } 439 | }, 440 | "outputs": [ 441 | { 442 | "name": "stdout", 443 | "output_type": "stream", 444 | "text": [ 445 | "Epoch 1/2\n", 446 | "8844/8844 [==============================] - 249s 28ms/step - loss: 0.5754 - accuracy: 0.7337 - val_loss: 0.5890 - val_accuracy: 0.7333\n", 447 | "\n", 448 | "Epoch 00001: val_loss improved from inf to 0.58903, saving model to weights.best.hdf5\n", 449 | "Epoch 2/2\n", 450 | "8844/8844 [==============================] - 249s 28ms/step - loss: 0.5752 - accuracy: 0.7361 - val_loss: 0.5766 - val_accuracy: 0.7340\n", 451 | "\n", 452 | "Epoch 00002: val_loss improved from 0.58903 to 0.57661, saving model to weights.best.hdf5\n" 453 | ] 454 | } 455 | ], 456 | "source": [ 457 | "checkpoint_filepath = 'weights.best.hdf5'\n", 458 | "model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,\n", 459 | " verbose = 1, \n", 460 | " monitor = 'val_loss',\n", 461 | " save_best_only = True)\n", 462 | "\n", 463 | "history = model.fit((x_train1, x_train2), y_train,\n", 464 | " batch_size = 32,\n", 465 | " validation_data = ((x_val1, x_val2), y_val),\n", 466 | " validation_batch_size = 16,\n", 467 | " epochs=5,\n", 468 | " callbacks=[model_checkpoint_callback], \n", 469 | " verbose=1)" 470 | ] 471 | }, 472 | { 473 | 
"cell_type": "code", 474 | "execution_count": 18, 475 | "metadata": { 476 | "execution": { 477 | "iopub.execute_input": "2021-12-05T08:02:57.030112Z", 478 | "iopub.status.busy": "2021-12-05T08:02:57.029591Z", 479 | "iopub.status.idle": "2021-12-05T08:04:06.879698Z", 480 | "shell.execute_reply": "2021-12-05T08:04:06.878884Z", 481 | "shell.execute_reply.started": "2021-12-05T08:02:57.030075Z" 482 | } 483 | }, 484 | "outputs": [ 485 | { 486 | "name": "stdout", 487 | "output_type": "stream", 488 | "text": [ 489 | "10108/10108 [==============================] - 70s 7ms/step - loss: 0.5730 - accuracy: 0.7340\n", 490 | "loss on test data is 0.5730125308036804\n", 491 | "accuracy on test data is 0.7340275645256042\n" 492 | ] 493 | } 494 | ], 495 | "source": [ 496 | "loss, accuracy = model.evaluate((x_test1, x_test2), y_test, batch_size=4, verbose=1)\n", 497 | "\n", 498 | "print('loss on test data is', loss)\n", 499 | "print('accuracy on test data is', accuracy)" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": 19, 505 | "metadata": { 506 | "execution": { 507 | "iopub.execute_input": "2021-12-05T08:04:06.881174Z", 508 | "iopub.status.busy": "2021-12-05T08:04:06.880894Z", 509 | "iopub.status.idle": "2021-12-05T08:04:17.186473Z", 510 | "shell.execute_reply": "2021-12-05T08:04:17.185721Z", 511 | "shell.execute_reply.started": "2021-12-05T08:04:06.881138Z" 512 | } 513 | }, 514 | "outputs": [ 515 | { 516 | "name": "stdout", 517 | "output_type": "stream", 518 | "text": [ 519 | "f1_score on test dataset is 0.6310516383599245\n" 520 | ] 521 | } 522 | ], 523 | "source": [ 524 | "pred = model.predict((x_test1, x_test2))\n", 525 | "\n", 526 | "print('f1_score on test dataset is', f1_score(np.argmax(pred, axis=1), np.argmax(y_test, axis=1)))" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": {}, 533 | "outputs": [], 534 | "source": [] 535 | } 536 | ], 537 | "metadata": { 538 | "kernelspec": { 539 | 
"display_name": "Python 3 (ipykernel)", 540 | "language": "python", 541 | "name": "python3" 542 | }, 543 | "language_info": { 544 | "codemirror_mode": { 545 | "name": "ipython", 546 | "version": 3 547 | }, 548 | "file_extension": ".py", 549 | "mimetype": "text/x-python", 550 | "name": "python", 551 | "nbconvert_exporter": "python", 552 | "pygments_lexer": "ipython3", 553 | "version": "3.8.10" 554 | } 555 | }, 556 | "nbformat": 4, 557 | "nbformat_minor": 4 558 | } 559 | -------------------------------------------------------------------------------- /glove-bilstm_experiment2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 8 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", 9 | "execution": { 10 | "iopub.execute_input": "2021-12-05T08:27:55.991770Z", 11 | "iopub.status.busy": "2021-12-05T08:27:55.991383Z", 12 | "iopub.status.idle": "2021-12-05T08:28:00.572205Z", 13 | "shell.execute_reply": "2021-12-05T08:28:00.571459Z", 14 | "shell.execute_reply.started": "2021-12-05T08:27:55.991676Z" 15 | } 16 | }, 17 | "outputs": [], 18 | "source": [ 19 | "import pandas as pd\n", 20 | "import numpy as np\n", 21 | "from tqdm import tqdm\n", 22 | "import tensorflow as tf\n", 23 | "from sklearn.metrics import f1_score" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": { 30 | "execution": { 31 | "iopub.execute_input": "2021-12-05T08:28:00.575744Z", 32 | "iopub.status.busy": "2021-12-05T08:28:00.575547Z", 33 | "iopub.status.idle": "2021-12-05T08:28:03.698196Z", 34 | "shell.execute_reply": "2021-12-05T08:28:03.697341Z", 35 | "shell.execute_reply.started": "2021-12-05T08:28:00.575719Z" 36 | } 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "train = pd.read_csv('../input/smai-project-data/train_data.csv').fillna('')\n", 41 | "val = 
pd.read_csv('../input/smai-project-data/val_data.csv').fillna('')\n", 42 | "test = pd.read_csv('../input/smai-project-data/test_data.csv').fillna('')" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": { 49 | "execution": { 50 | "iopub.execute_input": "2021-12-05T08:28:03.701370Z", 51 | "iopub.status.busy": "2021-12-05T08:28:03.699765Z", 52 | "iopub.status.idle": "2021-12-05T08:28:03.725788Z", 53 | "shell.execute_reply": "2021-12-05T08:28:03.725005Z", 54 | "shell.execute_reply.started": "2021-12-05T08:28:03.701323Z" 55 | } 56 | }, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "text/html": [ 61 | "
\n", 62 | "\n", 75 | "\n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | "
idqid1qid2question1question2is_duplicatequestion1_preprocessedquestion2_preprocessed
080671573815739How do I play Pokémon GO in Korea?How do I play Pokémon GO in China?0how do i play pok mon go in korea ?how do i play pok mon go in china ?
136810112736104117What are some of the best side dishes for crab...What are some good side dishes for buffalo chi...0what are some of the best side dishes for crab...what are some good side dishes for buffalo chi...
270497121486121487Which is more advisable and better material fo...What is the best server setup for buddypress?0which is more advisable and better material fo...what is the best server setup for buddypress ?
3226567254474258192How do I improve logical programming skills?How can I improve my logical skills for progra...1how do i improve logical programming skills ?how can i improve my logical skills for progra...
473186481033062How close we are to see 3rd world war?How close is a World War III?1how close we are to see 3rd world war ?how close is a world war iii ?
\n", 147 | "
" 148 | ], 149 | "text/plain": [ 150 | " id qid1 qid2 question1 \\\n", 151 | "0 8067 15738 15739 How do I play Pokémon GO in Korea? \n", 152 | "1 368101 12736 104117 What are some of the best side dishes for crab... \n", 153 | "2 70497 121486 121487 Which is more advisable and better material fo... \n", 154 | "3 226567 254474 258192 How do I improve logical programming skills? \n", 155 | "4 73186 48103 3062 How close we are to see 3rd world war? \n", 156 | "\n", 157 | " question2 is_duplicate \\\n", 158 | "0 How do I play Pokémon GO in China? 0 \n", 159 | "1 What are some good side dishes for buffalo chi... 0 \n", 160 | "2 What is the best server setup for buddypress? 0 \n", 161 | "3 How can I improve my logical skills for progra... 1 \n", 162 | "4 How close is a World War III? 1 \n", 163 | "\n", 164 | " question1_preprocessed \\\n", 165 | "0 how do i play pok mon go in korea ? \n", 166 | "1 what are some of the best side dishes for crab... \n", 167 | "2 which is more advisable and better material fo... \n", 168 | "3 how do i improve logical programming skills ? \n", 169 | "4 how close we are to see 3rd world war ? \n", 170 | "\n", 171 | " question2_preprocessed \n", 172 | "0 how do i play pok mon go in china ? \n", 173 | "1 what are some good side dishes for buffalo chi... \n", 174 | "2 what is the best server setup for buddypress ? \n", 175 | "3 how can i improve my logical skills for progra... \n", 176 | "4 how close is a world war iii ? 
" 177 | ] 178 | }, 179 | "execution_count": 3, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "train.head()" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 4, 191 | "metadata": { 192 | "execution": { 193 | "iopub.execute_input": "2021-12-05T08:28:03.728265Z", 194 | "iopub.status.busy": "2021-12-05T08:28:03.727992Z", 195 | "iopub.status.idle": "2021-12-05T08:28:03.737968Z", 196 | "shell.execute_reply": "2021-12-05T08:28:03.736992Z", 197 | "shell.execute_reply.started": "2021-12-05T08:28:03.728229Z" 198 | } 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "def buildVocabulary(reviews):\n", 203 | " tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False, split=' ')\n", 204 | " tokenizer.fit_on_texts(reviews)\n", 205 | " return tokenizer\n", 206 | "\n", 207 | "def getSequences(reviews, tokenizer, seq_maxlen):\n", 208 | " reviews_seq = tokenizer.texts_to_sequences(reviews)\n", 209 | " return np.array(tf.keras.preprocessing.sequence.pad_sequences(reviews_seq, maxlen=seq_maxlen))\n", 210 | "\n", 211 | "def loadGloveWordEmbeddings():\n", 212 | " embedding_vectors = {}\n", 213 | " with open('../input/glove840b300dtxt/glove.840B.300d.txt') as f:\n", 214 | " for line in tqdm(f):\n", 215 | " values = line.split(' ')\n", 216 | " word = values[0]\n", 217 | " coefs = np.asarray(values[1:], dtype='float32')\n", 218 | " embedding_vectors[word] = coefs\n", 219 | " return embedding_vectors\n", 220 | "\n", 221 | "def getEmbeddingWeightMatrix(embedding_vectors, word2idx): \n", 222 | " embedding_matrix = np.zeros((len(word2idx)+1, 300))\n", 223 | " for word, i in tqdm(word2idx.items()):\n", 224 | " embedding_vector = embedding_vectors.get(word)\n", 225 | " if embedding_vector is not None:\n", 226 | " embedding_matrix[i] = embedding_vector\n", 227 | " return embedding_matrix" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 5, 233 | "metadata": { 234 | "execution": { 
235 | "iopub.execute_input": "2021-12-05T08:28:03.740285Z", 236 | "iopub.status.busy": "2021-12-05T08:28:03.739650Z", 237 | "iopub.status.idle": "2021-12-05T08:28:35.589006Z", 238 | "shell.execute_reply": "2021-12-05T08:28:35.588270Z", 239 | "shell.execute_reply.started": "2021-12-05T08:28:03.740250Z" 240 | } 241 | }, 242 | "outputs": [ 243 | { 244 | "name": "stdout", 245 | "output_type": "stream", 246 | "text": [ 247 | "119558\n" 248 | ] 249 | } 250 | ], 251 | "source": [ 252 | "tokenizer = buildVocabulary(train['question1'].tolist()+train['question2'].tolist()+val['question1'].tolist()+val['question2'].tolist()+test['question1'].tolist()+test['question2'].tolist())\n", 253 | "vocab_size = len(tokenizer.word_index) + 1\n", 254 | "print(vocab_size)\n", 255 | "\n", 256 | "x_train1 = getSequences(train['question1'], tokenizer, 128)\n", 257 | "x_train2 = getSequences(train['question2'], tokenizer, 128)\n", 258 | "y_train = tf.keras.utils.to_categorical(train['is_duplicate'])\n", 259 | "\n", 260 | "x_val1 = getSequences(val['question1'], tokenizer, 128)\n", 261 | "x_val2 = getSequences(val['question2'], tokenizer, 128)\n", 262 | "y_val = tf.keras.utils.to_categorical(val['is_duplicate'])\n", 263 | "\n", 264 | "x_test1 = getSequences(test['question1'], tokenizer, 128)\n", 265 | "x_test2 = getSequences(test['question2'], tokenizer, 128)\n", 266 | "y_test = tf.keras.utils.to_categorical(test['is_duplicate'])" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 6, 272 | "metadata": { 273 | "execution": { 274 | "iopub.execute_input": "2021-12-05T08:28:35.590734Z", 275 | "iopub.status.busy": "2021-12-05T08:28:35.590499Z", 276 | "iopub.status.idle": "2021-12-05T08:32:50.715266Z", 277 | "shell.execute_reply": "2021-12-05T08:32:50.714504Z", 278 | "shell.execute_reply.started": "2021-12-05T08:28:35.590699Z" 279 | } 280 | }, 281 | "outputs": [ 282 | { 283 | "name": "stderr", 284 | "output_type": "stream", 285 | "text": [ 286 | "2196018it [04:14, 
8621.81it/s]\n" 287 | ] 288 | }, 289 | { 290 | "name": "stdout", 291 | "output_type": "stream", 292 | "text": [ 293 | "2196017\n" 294 | ] 295 | }, 296 | { 297 | "name": "stderr", 298 | "output_type": "stream", 299 | "text": [ 300 | "100%|██████████| 119557/119557 [00:00<00:00, 296253.60it/s]" 301 | ] 302 | }, 303 | { 304 | "name": "stdout", 305 | "output_type": "stream", 306 | "text": [ 307 | "(119558, 300)\n" 308 | ] 309 | }, 310 | { 311 | "name": "stderr", 312 | "output_type": "stream", 313 | "text": [ 314 | "\n" 315 | ] 316 | } 317 | ], 318 | "source": [ 319 | "embedding_vectors = loadGloveWordEmbeddings()\n", 320 | "print(len(embedding_vectors))\n", 321 | "\n", 322 | "embedding_weight_matrix = getEmbeddingWeightMatrix(embedding_vectors, tokenizer.word_index)\n", 323 | "print(embedding_weight_matrix.shape)" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 7, 329 | "metadata": { 330 | "execution": { 331 | "iopub.execute_input": "2021-12-05T08:32:50.717025Z", 332 | "iopub.status.busy": "2021-12-05T08:32:50.716763Z", 333 | "iopub.status.idle": "2021-12-05T08:32:54.266862Z", 334 | "shell.execute_reply": "2021-12-05T08:32:54.266168Z", 335 | "shell.execute_reply.started": "2021-12-05T08:32:50.716989Z" 336 | } 337 | }, 338 | "outputs": [ 339 | { 340 | "name": "stderr", 341 | "output_type": "stream", 342 | "text": [ 343 | "2021-12-05 08:32:50.811328: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 344 | "2021-12-05 08:32:50.924265: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 345 | "2021-12-05 08:32:50.924979: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be 
at least one NUMA node, so returning NUMA node zero\n", 346 | "2021-12-05 08:32:50.926274: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA\n", 347 | "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", 348 | "2021-12-05 08:32:50.927066: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 349 | "2021-12-05 08:32:50.927770: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 350 | "2021-12-05 08:32:50.928410: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 351 | "2021-12-05 08:32:52.819781: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 352 | "2021-12-05 08:32:52.820602: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 353 | "2021-12-05 08:32:52.821254: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 354 | "2021-12-05 08:32:52.821837: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15403 MB memory: -> device: 
0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0\n", 355 | "2021-12-05 08:32:53.377677: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 143469600 exceeds 10% of free system memory.\n", 356 | "2021-12-05 08:32:53.611921: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 143469600 exceeds 10% of free system memory.\n" 357 | ] 358 | } 359 | ], 360 | "source": [ 361 | "inp1 = tf.keras.Input(shape=(x_train1.shape[1],))\n", 362 | "inp2 = tf.keras.Input(shape=(x_train2.shape[1],))\n", 363 | "\n", 364 | "inner1= tf.keras.layers.Embedding(input_dim=119558, output_dim=300, input_length=128, \n", 365 | " weights=[embedding_weight_matrix], trainable=False)(inp1)\n", 366 | "inner2= tf.keras.layers.Embedding(input_dim=119558, output_dim=300, input_length=128,\n", 367 | " weights=[embedding_weight_matrix], trainable=False)(inp2)\n", 368 | "\n", 369 | "inner = tf.keras.layers.concatenate([inner1+inner2, inner1-inner2, tf.math.multiply(inner1, inner2)], axis=-1)\n", 370 | "\n", 371 | "out = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(150, kernel_regularizer='l2', return_sequences=True))(inner)\n", 372 | "\n", 373 | "out = tf.keras.backend.mean(out, axis=1, keepdims=False)\n", 374 | "\n", 375 | "output = tf.keras.layers.Dense(2, kernel_regularizer='l2', activation='softmax')(out)\n", 376 | "\n", 377 | "model = tf.keras.Model(inputs = [inp1, inp2], outputs = output)" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 8, 383 | "metadata": { 384 | "execution": { 385 | "iopub.execute_input": "2021-12-05T08:32:54.268296Z", 386 | "iopub.status.busy": "2021-12-05T08:32:54.268035Z", 387 | "iopub.status.idle": "2021-12-05T08:32:54.288773Z", 388 | "shell.execute_reply": "2021-12-05T08:32:54.288146Z", 389 | "shell.execute_reply.started": "2021-12-05T08:32:54.268263Z" 390 | } 391 | }, 392 | "outputs": [ 393 | { 394 | "name": "stdout", 395 | "output_type": "stream", 396 | "text": [ 397 | 
"Model: \"model\"\n", 398 | "__________________________________________________________________________________________________\n", 399 | "Layer (type) Output Shape Param # Connected to \n", 400 | "==================================================================================================\n", 401 | "input_1 (InputLayer) [(None, 128)] 0 \n", 402 | "__________________________________________________________________________________________________\n", 403 | "input_2 (InputLayer) [(None, 128)] 0 \n", 404 | "__________________________________________________________________________________________________\n", 405 | "embedding (Embedding) (None, 128, 300) 35867400 input_1[0][0] \n", 406 | "__________________________________________________________________________________________________\n", 407 | "embedding_1 (Embedding) (None, 128, 300) 35867400 input_2[0][0] \n", 408 | "__________________________________________________________________________________________________\n", 409 | "tf.__operators__.add (TFOpLambd (None, 128, 300) 0 embedding[0][0] \n", 410 | " embedding_1[0][0] \n", 411 | "__________________________________________________________________________________________________\n", 412 | "tf.math.subtract (TFOpLambda) (None, 128, 300) 0 embedding[0][0] \n", 413 | " embedding_1[0][0] \n", 414 | "__________________________________________________________________________________________________\n", 415 | "tf.math.multiply (TFOpLambda) (None, 128, 300) 0 embedding[0][0] \n", 416 | " embedding_1[0][0] \n", 417 | "__________________________________________________________________________________________________\n", 418 | "concatenate (Concatenate) (None, 128, 900) 0 tf.__operators__.add[0][0] \n", 419 | " tf.math.subtract[0][0] \n", 420 | " tf.math.multiply[0][0] \n", 421 | "__________________________________________________________________________________________________\n", 422 | "bidirectional (Bidirectional) (None, 128, 300) 1261200 concatenate[0][0] \n", 423 
| "__________________________________________________________________________________________________\n", 424 | "tf.math.reduce_mean (TFOpLambda (None, 300) 0 bidirectional[0][0] \n", 425 | "__________________________________________________________________________________________________\n", 426 | "dense (Dense) (None, 2) 602 tf.math.reduce_mean[0][0] \n", 427 | "==================================================================================================\n", 428 | "Total params: 72,996,602\n", 429 | "Trainable params: 1,261,802\n", 430 | "Non-trainable params: 71,734,800\n", 431 | "__________________________________________________________________________________________________\n" 432 | ] 433 | } 434 | ], 435 | "source": [ 436 | "model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics=['accuracy'])\n", 437 | "model.summary()" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 9, 443 | "metadata": { 444 | "execution": { 445 | "iopub.execute_input": "2021-12-05T08:32:54.290164Z", 446 | "iopub.status.busy": "2021-12-05T08:32:54.289923Z", 447 | "iopub.status.idle": "2021-12-05T08:54:19.362354Z", 448 | "shell.execute_reply": "2021-12-05T08:54:19.361515Z", 449 | "shell.execute_reply.started": "2021-12-05T08:32:54.290131Z" 450 | } 451 | }, 452 | "outputs": [ 453 | { 454 | "name": "stderr", 455 | "output_type": "stream", 456 | "text": [ 457 | "2021-12-05 08:32:54.295725: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 144897536 exceeds 10% of free system memory.\n", 458 | "2021-12-05 08:32:54.402477: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 144897536 exceeds 10% of free system memory.\n", 459 | "2021-12-05 08:32:54.568071: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)\n" 460 | ] 461 | }, 462 | { 463 | "name": "stdout", 464 | "output_type": "stream", 465 | "text": [ 466 | "Epoch 1/5\n" 467 | ] 
468 | }, 469 | { 470 | "name": "stderr", 471 | "output_type": "stream", 472 | "text": [ 473 | "2021-12-05 08:32:58.225519: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005\n" 474 | ] 475 | }, 476 | { 477 | "name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "8844/8844 [==============================] - ETA: 0s - loss: 0.6319 - accuracy: 0.7057" 481 | ] 482 | }, 483 | { 484 | "name": "stderr", 485 | "output_type": "stream", 486 | "text": [ 487 | "2021-12-05 08:36:24.846568: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 41399296 exceeds 10% of free system memory.\n" 488 | ] 489 | }, 490 | { 491 | "name": "stdout", 492 | "output_type": "stream", 493 | "text": [ 494 | "8844/8844 [==============================] - 250s 28ms/step - loss: 0.6319 - accuracy: 0.7057 - val_loss: 0.5694 - val_accuracy: 0.7317\n", 495 | "\n", 496 | "Epoch 00001: val_loss improved from inf to 0.56941, saving model to weights.best.hdf5\n", 497 | "Epoch 2/5\n", 498 | "8844/8844 [==============================] - 245s 28ms/step - loss: 0.5812 - accuracy: 0.7274 - val_loss: 0.5639 - val_accuracy: 0.7362\n", 499 | "\n", 500 | "Epoch 00002: val_loss improved from 0.56941 to 0.56394, saving model to weights.best.hdf5\n", 501 | "Epoch 3/5\n", 502 | "8844/8844 [==============================] - 243s 28ms/step - loss: 0.5915 - accuracy: 0.7223 - val_loss: 0.6492 - val_accuracy: 0.6690\n", 503 | "\n", 504 | "Epoch 00003: val_loss did not improve from 0.56394\n", 505 | "Epoch 4/5\n", 506 | "8844/8844 [==============================] - 244s 28ms/step - loss: 0.5840 - accuracy: 0.7287 - val_loss: 0.5697 - val_accuracy: 0.7288\n", 507 | "\n", 508 | "Epoch 00004: val_loss did not improve from 0.56394\n", 509 | "Epoch 5/5\n", 510 | "8844/8844 [==============================] - 243s 28ms/step - loss: 0.5734 - accuracy: 0.7334 - val_loss: 0.5623 - val_accuracy: 0.7404\n", 511 | "\n", 512 | "Epoch 00005: val_loss improved from 0.56394 to 0.56234, 
saving model to weights.best.hdf5\n" 513 | ] 514 | } 515 | ], 516 | "source": [ 517 | "checkpoint_filepath = 'weights.best.hdf5'\n", 518 | "model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,\n", 519 | " verbose = 1, \n", 520 | " monitor = 'val_loss',\n", 521 | " save_best_only = True)\n", 522 | "\n", 523 | "history = model.fit((x_train1, x_train2), y_train,\n", 524 | " batch_size = 32,\n", 525 | " validation_data = ((x_val1, x_val2), y_val),\n", 526 | " validation_batch_size = 16,\n", 527 | " epochs=5,\n", 528 | " callbacks=[model_checkpoint_callback], \n", 529 | " verbose=1)" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 12, 535 | "metadata": { 536 | "execution": { 537 | "iopub.execute_input": "2021-12-05T08:54:19.834213Z", 538 | "iopub.status.busy": "2021-12-05T08:54:19.833805Z", 539 | "iopub.status.idle": "2021-12-05T08:55:29.256507Z", 540 | "shell.execute_reply": "2021-12-05T08:55:29.255776Z", 541 | "shell.execute_reply.started": "2021-12-05T08:54:19.834175Z" 542 | } 543 | }, 544 | "outputs": [ 545 | { 546 | "name": "stdout", 547 | "output_type": "stream", 548 | "text": [ 549 | "10108/10108 [==============================] - 69s 7ms/step - loss: 0.5629 - accuracy: 0.7411\n", 550 | "loss on test data is 0.5628555417060852\n", 551 | "accuracy on test data is 0.7410769462585449\n" 552 | ] 553 | } 554 | ], 555 | "source": [ 556 | "loss, accuracy = model.evaluate((x_test1, x_test2), y_test, batch_size=4, verbose=1)\n", 557 | "\n", 558 | "print('loss on test data is', loss)\n", 559 | "print('accuracy on test data is', accuracy)" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": 13, 565 | "metadata": { 566 | "execution": { 567 | "iopub.execute_input": "2021-12-05T08:55:29.258466Z", 568 | "iopub.status.busy": "2021-12-05T08:55:29.258123Z", 569 | "iopub.status.idle": "2021-12-05T08:55:40.114361Z", 570 | "shell.execute_reply": "2021-12-05T08:55:40.113551Z", 571 | 
"shell.execute_reply.started": "2021-12-05T08:55:29.258423Z" 572 | } 573 | }, 574 | "outputs": [ 575 | { 576 | "name": "stdout", 577 | "output_type": "stream", 578 | "text": [ 579 | "f1_score on test dataset is 0.6306802145074795\n" 580 | ] 581 | } 582 | ], 583 | "source": [ 584 | "pred = model.predict((x_test1, x_test2))\n", 585 | "\n", 586 | "print('f1_score on test dataset is', f1_score(np.argmax(pred, axis=1), np.argmax(y_test, axis=1)))" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": null, 592 | "metadata": {}, 593 | "outputs": [], 594 | "source": [] 595 | } 596 | ], 597 | "metadata": { 598 | "kernelspec": { 599 | "display_name": "Python 3 (ipykernel)", 600 | "language": "python", 601 | "name": "python3" 602 | }, 603 | "language_info": { 604 | "codemirror_mode": { 605 | "name": "ipython", 606 | "version": 3 607 | }, 608 | "file_extension": ".py", 609 | "mimetype": "text/x-python", 610 | "name": "python", 611 | "nbconvert_exporter": "python", 612 | "pygments_lexer": "ipython3", 613 | "version": "3.8.10" 614 | } 615 | }, 616 | "nbformat": 4, 617 | "nbformat_minor": 4 618 | } 619 | -------------------------------------------------------------------------------- /CBOW MLP Ppaer Implementation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "8ca79fa1", 7 | "metadata": { 8 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 9 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", 10 | "execution": { 11 | "iopub.execute_input": "2021-12-05T06:41:42.716152Z", 12 | "iopub.status.busy": "2021-12-05T06:41:42.714638Z", 13 | "iopub.status.idle": "2021-12-05T06:41:47.285389Z", 14 | "shell.execute_reply": "2021-12-05T06:41:47.284775Z", 15 | "shell.execute_reply.started": "2021-12-05T06:28:02.081372Z" 16 | }, 17 | "papermill": { 18 | "duration": 4.588331, 19 | "end_time": "2021-12-05T06:41:47.285538", 20 | 
"exception": false, 21 | "start_time": "2021-12-05T06:41:42.697207", 22 | "status": "completed" 23 | }, 24 | "tags": [] 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "import pandas as pd\n", 29 | "import numpy as np\n", 30 | "from tqdm import tqdm\n", 31 | "import tensorflow as tf\n", 32 | "from gensim.models import KeyedVectors\n", 33 | "import gensim\n", 34 | "import re\n", 35 | "from sklearn.metrics import f1_score\n", 36 | "import matplotlib.pyplot as plt" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "id": "57d8136d", 43 | "metadata": { 44 | "execution": { 45 | "iopub.execute_input": "2021-12-05T06:41:47.314041Z", 46 | "iopub.status.busy": "2021-12-05T06:41:47.313448Z", 47 | "iopub.status.idle": "2021-12-05T06:41:48.775380Z", 48 | "shell.execute_reply": "2021-12-05T06:41:48.774838Z", 49 | "shell.execute_reply.started": "2021-12-05T06:28:06.599367Z" 50 | }, 51 | "papermill": { 52 | "duration": 1.478299, 53 | "end_time": "2021-12-05T06:41:48.775521", 54 | "exception": false, 55 | "start_time": "2021-12-05T06:41:47.297222", 56 | "status": "completed" 57 | }, 58 | "tags": [] 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "train = pd.read_csv('../input/quora-ques-pair/train_data.csv/train_data.csv').fillna('')\n", 63 | "val = pd.read_csv('../input/quora-ques-pair/val_data.csv/val_data.csv').fillna('')\n", 64 | "test = pd.read_csv('../input/quora-ques-pair/test_data.csv/test_data.csv').fillna('')" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 3, 70 | "id": "0fb8cffe", 71 | "metadata": { 72 | "execution": { 73 | "iopub.execute_input": "2021-12-05T06:41:48.800775Z", 74 | "iopub.status.busy": "2021-12-05T06:41:48.799943Z", 75 | "iopub.status.idle": "2021-12-05T06:41:48.804283Z", 76 | "shell.execute_reply": "2021-12-05T06:41:48.803802Z", 77 | "shell.execute_reply.started": "2021-12-05T06:28:07.887882Z" 78 | }, 79 | "papermill": { 80 | "duration": 0.017602, 81 | "end_time": "2021-12-05T06:41:48.804397", 82
| "exception": false, 83 | "start_time": "2021-12-05T06:41:48.786795", 84 | "status": "completed" 85 | }, 86 | "tags": [] 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "word2vec_file = '../input/d/sandreds/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin'" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 4, 96 | "id": "0a74cebd", 97 | "metadata": { 98 | "execution": { 99 | "iopub.execute_input": "2021-12-05T06:41:48.834263Z", 100 | "iopub.status.busy": "2021-12-05T06:41:48.833473Z", 101 | "iopub.status.idle": "2021-12-05T06:41:48.846019Z", 102 | "shell.execute_reply": "2021-12-05T06:41:48.846450Z", 103 | "shell.execute_reply.started": "2021-12-05T06:28:07.895005Z" 104 | }, 105 | "papermill": { 106 | "duration": 0.031576, 107 | "end_time": "2021-12-05T06:41:48.846572", 108 | "exception": false, 109 | "start_time": "2021-12-05T06:41:48.814996", 110 | "status": "completed" 111 | }, 112 | "tags": [] 113 | }, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/html": [ 118 | "
\n", 119 | "\n", 132 | "\n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | "
idqid1qid2question1question2is_duplicatequestion1_preprocessedquestion2_preprocessed
020467393885307635If there is a God, where is He!Why is god a \"He\"?0if there is a god , where is he !why is god a `` he '' ?
117716209315628Do you believe that everything happens for a r...Does everything happen for a reason?1do you believe that everything happens for a r...does everything happen for a reason ?
2291767352623413255Will there always be web hosting that will sup...Will there always be web hosting that supports...1will there always be web hosting that will sup...will there always be web hosting that supports...
32037585982467971What is the proof of Indian Army's surgical st...Has India provided any proof of the surgical s...1what is the proof of indian army 's surgical s...has india provided any proof of the surgical s...
4417477532675327What do Indian Muslims think of Modi?What do Indian Muslim think about PM Narendra ...1what do indian muslims think of modi ?what do indian muslim think about pm narendra ...
\n", 204 | "
" 205 | ], 206 | "text/plain": [ 207 | " id qid1 qid2 question1 \\\n", 208 | "0 204673 93885 307635 If there is a God, where is He! \n", 209 | "1 17716 2093 15628 Do you believe that everything happens for a r... \n", 210 | "2 291767 352623 413255 Will there always be web hosting that will sup... \n", 211 | "3 203758 59824 67971 What is the proof of Indian Army's surgical st... \n", 212 | "4 41747 75326 75327 What do Indian Muslims think of Modi? \n", 213 | "\n", 214 | " question2 is_duplicate \\\n", 215 | "0 Why is god a \"He\"? 0 \n", 216 | "1 Does everything happen for a reason? 1 \n", 217 | "2 Will there always be web hosting that supports... 1 \n", 218 | "3 Has India provided any proof of the surgical s... 1 \n", 219 | "4 What do Indian Muslim think about PM Narendra ... 1 \n", 220 | "\n", 221 | " question1_preprocessed \\\n", 222 | "0 if there is a god , where is he ! \n", 223 | "1 do you believe that everything happens for a r... \n", 224 | "2 will there always be web hosting that will sup... \n", 225 | "3 what is the proof of indian army 's surgical s... \n", 226 | "4 what do indian muslims think of modi ? \n", 227 | "\n", 228 | " question2_preprocessed \n", 229 | "0 why is god a `` he '' ? \n", 230 | "1 does everything happen for a reason ? \n", 231 | "2 will there always be web hosting that supports... \n", 232 | "3 has india provided any proof of the surgical s... \n", 233 | "4 what do indian muslim think about pm narendra ... 
" 234 | ] 235 | }, 236 | "execution_count": 4, 237 | "metadata": {}, 238 | "output_type": "execute_result" 239 | } 240 | ], 241 | "source": [ 242 | "train.head()" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 5, 248 | "id": "e183a4fe", 249 | "metadata": { 250 | "execution": { 251 | "iopub.execute_input": "2021-12-05T06:41:48.876516Z", 252 | "iopub.status.busy": "2021-12-05T06:41:48.875896Z", 253 | "iopub.status.idle": "2021-12-05T06:42:50.921199Z", 254 | "shell.execute_reply": "2021-12-05T06:42:50.920691Z", 255 | "shell.execute_reply.started": "2021-12-05T06:28:07.919112Z" 256 | }, 257 | "papermill": { 258 | "duration": 62.06346, 259 | "end_time": "2021-12-05T06:42:50.921351", 260 | "exception": false, 261 | "start_time": "2021-12-05T06:41:48.857891", 262 | "status": "completed" 263 | }, 264 | "tags": [] 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "def buildVocabulary(reviews):\n", 269 | " tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False, split=' ')\n", 270 | " tokenizer.fit_on_texts(reviews)\n", 271 | " return tokenizer\n", 272 | "\n", 273 | "def getSequences(reviews, tokenizer, seq_maxlen):\n", 274 | " reviews_seq = tokenizer.texts_to_sequences(reviews)\n", 275 | " return np.array(tf.keras.preprocessing.sequence.pad_sequences(reviews_seq, maxlen=seq_maxlen))\n", 276 | "\n", 277 | "word2vec_model = KeyedVectors.load_word2vec_format(word2vec_file, binary = True)\n", 278 | "\n", 279 | "def getEmbeddingWeightMatrix(word2idx): \n", 280 | " embedding_matrix = np.zeros((len(word2idx)+1, 300))\n", 281 | " for word, i in tqdm(word2idx.items()):\n", 282 | " \n", 283 | " embedding_vector = word2vec_model[word] if word in word2vec_model else np.random.rand(1,300)\n", 284 | " if embedding_vector is not None:\n", 285 | " embedding_matrix[i] = embedding_vector\n", 286 | " return embedding_matrix" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 6, 292 | "id": "40a3ebc0", 293 | "metadata": { 294 | 
"execution": { 295 | "iopub.execute_input": "2021-12-05T06:42:50.974813Z", 296 | "iopub.status.busy": "2021-12-05T06:42:50.963194Z", 297 | "iopub.status.idle": "2021-12-05T06:43:04.692817Z", 298 | "shell.execute_reply": "2021-12-05T06:43:04.691844Z", 299 | "shell.execute_reply.started": "2021-12-05T06:29:09.661554Z" 300 | }, 301 | "papermill": { 302 | "duration": 13.76014, 303 | "end_time": "2021-12-05T06:43:04.692993", 304 | "exception": false, 305 | "start_time": "2021-12-05T06:42:50.932853", 306 | "status": "completed" 307 | }, 308 | "tags": [] 309 | }, 310 | "outputs": [ 311 | { 312 | "name": "stdout", 313 | "output_type": "stream", 314 | "text": [ 315 | "67043\n" 316 | ] 317 | } 318 | ], 319 | "source": [ 320 | "tokenizer = buildVocabulary(train['question1'].tolist()+train['question2'].tolist()+val['question1'].tolist()+val['question2'].tolist()+test['question1'].tolist()+test['question2'].tolist())\n", 321 | "vocab_size = len(tokenizer.word_index) + 1\n", 322 | "print(vocab_size)\n", 323 | "\n", 324 | "x_train1 = getSequences(train['question1'], tokenizer, 200)\n", 325 | "x_train2 = getSequences(train['question2'], tokenizer, 200)\n", 326 | "y_train = tf.keras.utils.to_categorical(train['is_duplicate'])\n", 327 | "\n", 328 | "x_val1 = getSequences(val['question1'], tokenizer, 200)\n", 329 | "x_val2 = getSequences(val['question2'], tokenizer, 200)\n", 330 | "y_val = tf.keras.utils.to_categorical(val['is_duplicate'])\n", 331 | "\n", 332 | "x_test1 = getSequences(test['question1'], tokenizer, 200)\n", 333 | "x_test2 = getSequences(test['question2'], tokenizer, 200)\n", 334 | "y_test = tf.keras.utils.to_categorical(test['is_duplicate'])" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 7, 340 | "id": "61abd026", 341 | "metadata": { 342 | "execution": { 343 | "iopub.execute_input": "2021-12-05T06:43:04.722083Z", 344 | "iopub.status.busy": "2021-12-05T06:43:04.721506Z", 345 | "iopub.status.idle": "2021-12-05T06:43:05.183883Z", 346 | 
"shell.execute_reply": "2021-12-05T06:43:05.182396Z", 347 | "shell.execute_reply.started": "2021-12-05T06:29:22.925663Z" 348 | }, 349 | "papermill": { 350 | "duration": 0.478999, 351 | "end_time": "2021-12-05T06:43:05.184024", 352 | "exception": false, 353 | "start_time": "2021-12-05T06:43:04.705025", 354 | "status": "completed" 355 | }, 356 | "tags": [] 357 | }, 358 | "outputs": [ 359 | { 360 | "name": "stderr", 361 | "output_type": "stream", 362 | "text": [ 363 | "100%|██████████| 67042/67042 [00:00<00:00, 148076.38it/s]" 364 | ] 365 | }, 366 | { 367 | "name": "stdout", 368 | "output_type": "stream", 369 | "text": [ 370 | "(67043, 300)\n" 371 | ] 372 | }, 373 | { 374 | "name": "stderr", 375 | "output_type": "stream", 376 | "text": [ 377 | "\n" 378 | ] 379 | } 380 | ], 381 | "source": [ 382 | "#embedding_vectors = loadGloveWordEmbeddings()\n", 383 | "#print(len(embedding_vectors))\n", 384 | "\n", 385 | "embedding_weight_matrix = getEmbeddingWeightMatrix(tokenizer.word_index)\n", 386 | "print(embedding_weight_matrix.shape)" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 8, 392 | "id": "59b405dc", 393 | "metadata": { 394 | "execution": { 395 | "iopub.execute_input": "2021-12-05T06:43:05.221581Z", 396 | "iopub.status.busy": "2021-12-05T06:43:05.221015Z", 397 | "iopub.status.idle": "2021-12-05T06:43:07.840042Z", 398 | "shell.execute_reply": "2021-12-05T06:43:07.839136Z", 399 | "shell.execute_reply.started": "2021-12-05T06:29:23.399993Z" 400 | }, 401 | "papermill": { 402 | "duration": 2.642422, 403 | "end_time": "2021-12-05T06:43:07.840177", 404 | "exception": false, 405 | "start_time": "2021-12-05T06:43:05.197755", 406 | "status": "completed" 407 | }, 408 | "tags": [] 409 | }, 410 | "outputs": [ 411 | { 412 | "name": "stderr", 413 | "output_type": "stream", 414 | "text": [ 415 | "2021-12-05 06:43:05.301034: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there 
must be at least one NUMA node, so returning NUMA node zero\n", 416 | "2021-12-05 06:43:05.400333: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 417 | "2021-12-05 06:43:05.401024: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 418 | "2021-12-05 06:43:05.402338: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA\n", 419 | "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", 420 | "2021-12-05 06:43:05.403313: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 421 | "2021-12-05 06:43:05.404059: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 422 | "2021-12-05 06:43:05.404737: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 423 | "2021-12-05 06:43:07.213294: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 424 | "2021-12-05 06:43:07.214235: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at 
least one NUMA node, so returning NUMA node zero\n", 425 | "2021-12-05 06:43:07.215011: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 426 | "2021-12-05 06:43:07.215600: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15403 MB memory: -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0\n" 427 | ] 428 | } 429 | ], 430 | "source": [ 431 | "#he_initializer = tf.keras.initializers.HeUniform()\n", 432 | "inp1 = tf.keras.Input(shape=(x_train1.shape[1],))\n", 433 | "inp2 = tf.keras.Input(shape=(x_train2.shape[1],))\n", 434 | "\n", 435 | "inner1= tf.keras.layers.Embedding(input_dim=67043, output_dim=300, input_length=200, \n", 436 | " weights=[embedding_weight_matrix], trainable=True)(inp1)\n", 437 | "inner2= tf.keras.layers.Embedding(input_dim=67043, output_dim=300, input_length=200,\n", 438 | " weights=[embedding_weight_matrix], trainable=True)(inp2)\n", 439 | "\n", 440 | "\n", 441 | "#inner = tf.keras.layers.concatenate([inner1+inner2, inner1-inner2, tf.math.multiply(inner1, inner2)], axis=-1)\n", 442 | "\n", 443 | "inner1 = tf.keras.backend.sum(inner1, axis=1, keepdims=False)\n", 444 | "inner2 = tf.keras.backend.sum(inner2, axis=1, keepdims=False)\n", 445 | "inner = inner1+inner2\n", 446 | "inner = tf.keras.layers.Dense(200, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01))(inner)\n", 447 | "inner = tf.keras.layers.Dropout(0.2)(inner)\n", 448 | "inner = tf.keras.layers.Dense(120, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01))(inner)\n", 449 | "inner = tf.keras.layers.Dropout(0.2)(inner)\n", 450 | "inner = tf.keras.layers.Dense(60, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01))(inner)\n", 451 | "inner = 
tf.keras.layers.Dropout(0.2)(inner)\n", 452 | "output = tf.keras.layers.Dense(2, activation='softmax')(inner)\n", 453 | "\n", 454 | "model = tf.keras.Model(inputs = [inp1, inp2], outputs = output)" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": 9, 460 | "id": "7eab1687", 461 | "metadata": { 462 | "execution": { 463 | "iopub.execute_input": "2021-12-05T06:43:07.876275Z", 464 | "iopub.status.busy": "2021-12-05T06:43:07.875614Z", 465 | "iopub.status.idle": "2021-12-05T06:43:07.891008Z", 466 | "shell.execute_reply": "2021-12-05T06:43:07.890387Z", 467 | "shell.execute_reply.started": "2021-12-05T06:29:26.103961Z" 468 | }, 469 | "papermill": { 470 | "duration": 0.036541, 471 | "end_time": "2021-12-05T06:43:07.891138", 472 | "exception": false, 473 | "start_time": "2021-12-05T06:43:07.854597", 474 | "status": "completed" 475 | }, 476 | "tags": [] 477 | }, 478 | "outputs": [ 479 | { 480 | "name": "stdout", 481 | "output_type": "stream", 482 | "text": [ 483 | "Model: \"model\"\n", 484 | "__________________________________________________________________________________________________\n", 485 | "Layer (type) Output Shape Param # Connected to \n", 486 | "==================================================================================================\n", 487 | "input_1 (InputLayer) [(None, 200)] 0 \n", 488 | "__________________________________________________________________________________________________\n", 489 | "input_2 (InputLayer) [(None, 200)] 0 \n", 490 | "__________________________________________________________________________________________________\n", 491 | "embedding (Embedding) (None, 200, 300) 20112900 input_1[0][0] \n", 492 | "__________________________________________________________________________________________________\n", 493 | "embedding_1 (Embedding) (None, 200, 300) 20112900 input_2[0][0] \n", 494 | "__________________________________________________________________________________________________\n", 495 | 
"tf.math.reduce_sum (TFOpLambda) (None, 300) 0 embedding[0][0] \n", 496 | "__________________________________________________________________________________________________\n", 497 | "tf.math.reduce_sum_1 (TFOpLambd (None, 300) 0 embedding_1[0][0] \n", 498 | "__________________________________________________________________________________________________\n", 499 | "tf.__operators__.add (TFOpLambd (None, 300) 0 tf.math.reduce_sum[0][0] \n", 500 | " tf.math.reduce_sum_1[0][0] \n", 501 | "__________________________________________________________________________________________________\n", 502 | "dense (Dense) (None, 200) 60200 tf.__operators__.add[0][0] \n", 503 | "__________________________________________________________________________________________________\n", 504 | "dropout (Dropout) (None, 200) 0 dense[0][0] \n", 505 | "__________________________________________________________________________________________________\n", 506 | "dense_1 (Dense) (None, 120) 24120 dropout[0][0] \n", 507 | "__________________________________________________________________________________________________\n", 508 | "dropout_1 (Dropout) (None, 120) 0 dense_1[0][0] \n", 509 | "__________________________________________________________________________________________________\n", 510 | "dense_2 (Dense) (None, 60) 7260 dropout_1[0][0] \n", 511 | "__________________________________________________________________________________________________\n", 512 | "dropout_2 (Dropout) (None, 60) 0 dense_2[0][0] \n", 513 | "__________________________________________________________________________________________________\n", 514 | "dense_3 (Dense) (None, 2) 122 dropout_2[0][0] \n", 515 | "==================================================================================================\n", 516 | "Total params: 40,317,502\n", 517 | "Trainable params: 40,317,502\n", 518 | "Non-trainable params: 0\n", 519 | 
"__________________________________________________________________________________________________\n" 520 | ] 521 | } 522 | ], 523 | "source": [ 524 | "model.compile(optimizer = \"adam\", loss = 'categorical_crossentropy', metrics=['accuracy'])\n", 525 | "model.summary()" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": 10, 531 | "id": "0aae2f11", 532 | "metadata": { 533 | "execution": { 534 | "iopub.execute_input": "2021-12-05T06:43:07.925895Z", 535 | "iopub.status.busy": "2021-12-05T06:43:07.925115Z", 536 | "iopub.status.idle": "2021-12-05T06:44:04.257889Z", 537 | "shell.execute_reply": "2021-12-05T06:44:04.258447Z", 538 | "shell.execute_reply.started": "2021-12-05T06:29:26.126805Z" 539 | }, 540 | "papermill": { 541 | "duration": 56.353085, 542 | "end_time": "2021-12-05T06:44:04.258638", 543 | "exception": false, 544 | "start_time": "2021-12-05T06:43:07.905553", 545 | "status": "completed" 546 | }, 547 | "tags": [] 548 | }, 549 | "outputs": [ 550 | { 551 | "name": "stderr", 552 | "output_type": "stream", 553 | "text": [ 554 | "2021-12-05 06:43:08.036642: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)\n" 555 | ] 556 | }, 557 | { 558 | "name": "stdout", 559 | "output_type": "stream", 560 | "text": [ 561 | "Epoch 1/4\n", 562 | "632/632 [==============================] - 14s 20ms/step - loss: 1.5897 - accuracy: 0.6573 - val_loss: 0.6889 - val_accuracy: 0.6883\n", 563 | "Epoch 2/4\n", 564 | "632/632 [==============================] - 13s 20ms/step - loss: 0.5678 - accuracy: 0.7383 - val_loss: 0.5906 - val_accuracy: 0.7223\n", 565 | "Epoch 3/4\n", 566 | "632/632 [==============================] - 12s 19ms/step - loss: 0.4759 - accuracy: 0.7885 - val_loss: 0.5830 - val_accuracy: 0.7369\n", 567 | "Epoch 4/4\n", 568 | "632/632 [==============================] - 12s 19ms/step - loss: 0.4199 - accuracy: 0.8166 - val_loss: 0.6234 - val_accuracy: 0.7322\n" 569 | ] 570 
| } 571 | ], 572 | "source": [ 573 | "save_weights = tf.keras.callbacks.ModelCheckpoint('cbow_mlp.h5', monitor='val_loss', save_best_only=True)\n", 574 | "checkpoint_filepath = 'weights.best.{epoch:01d}.hdf5'\n", 575 | "#model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,\n", 576 | "#verbose = 1,\n", 577 | "#monitor = 'val_loss',\n", 578 | "#save_best_only = False)\n", 579 | "history = model.fit((x_train1, x_train2), y_train,\n", 580 | " batch_size = 64,\n", 581 | " validation_data = ((x_val1, x_val2), y_val),\n", 582 | " validation_batch_size = 64,\n", 583 | " epochs=4, \n", 584 | " callbacks=[save_weights], \n", 585 | " verbose=1)" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": 13, 591 | "id": "6195ba4e", 592 | "metadata": { 593 | "execution": { 594 | "iopub.execute_input": "2021-12-05T06:44:05.863472Z", 595 | "iopub.status.busy": "2021-12-05T06:44:05.862509Z", 596 | "iopub.status.idle": "2021-12-05T06:44:28.229271Z", 597 | "shell.execute_reply": "2021-12-05T06:44:28.229695Z", 598 | "shell.execute_reply.started": "2021-12-05T06:40:02.762209Z" 599 | }, 600 | "papermill": { 601 | "duration": 22.559635, 602 | "end_time": "2021-12-05T06:44:28.229842", 603 | "exception": false, 604 | "start_time": "2021-12-05T06:44:05.670207", 605 | "status": "completed" 606 | }, 607 | "tags": [] 608 | }, 609 | "outputs": [ 610 | { 611 | "name": "stdout", 612 | "output_type": "stream", 613 | "text": [ 614 | "10108/10108 [==============================] - 22s 2ms/step - loss: 0.3327 - accuracy: 0.8621\n", 615 | "loss on test data is 0.3327052593231201\n", 616 | "accuracy on test data is 0.8620792031288147\n" 617 | ] 618 | } 619 | ], 620 | "source": [ 621 | "loss, accuracy = model.evaluate((x_test1, x_test2), y_test, batch_size=4, verbose=1)\n", 622 | "\n", 623 | "print('loss on test data is', loss)\n", 624 | "print('accuracy on test data is', accuracy)" 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | 
"execution_count": 14, 630 | "id": "3d4078c0", 631 | "metadata": { 632 | "execution": { 633 | "iopub.execute_input": "2021-12-05T06:44:28.830291Z", 634 | "iopub.status.busy": "2021-12-05T06:44:28.829339Z", 635 | "iopub.status.idle": "2021-12-05T06:44:31.520669Z", 636 | "shell.execute_reply": "2021-12-05T06:44:31.521533Z", 637 | "shell.execute_reply.started": "2021-12-05T06:40:27.475503Z" 638 | }, 639 | "papermill": { 640 | "duration": 2.995351, 641 | "end_time": "2021-12-05T06:44:31.521775", 642 | "exception": false, 643 | "start_time": "2021-12-05T06:44:28.526424", 644 | "status": "completed" 645 | }, 646 | "tags": [] 647 | }, 648 | "outputs": [ 649 | { 650 | "name": "stdout", 651 | "output_type": "stream", 652 | "text": [ 653 | "F1_score on test is 0.7974573192880495\n" 654 | ] 655 | } 656 | ], 657 | "source": [ 658 | "pred = model.predict((x_test1, x_test2))\n", 659 | "print('F1_score on test is', f1_score(np.argmax(pred, axis=1), np.argmax(y_test, axis=1)))\n" 660 | ] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "execution_count": null, 665 | "id": "b54608e8", 666 | "metadata": { 667 | "papermill": { 668 | "duration": 0.488623, 669 | "end_time": "2021-12-05T06:44:32.446218", 670 | "exception": false, 671 | "start_time": "2021-12-05T06:44:31.957595", 672 | "status": "completed" 673 | }, 674 | "tags": [] 675 | }, 676 | "outputs": [], 677 | "source": [] 678 | } 679 | ], 680 | "metadata": { 681 | "kernelspec": { 682 | "display_name": "Python 3", 683 | "language": "python", 684 | "name": "python3" 685 | }, 686 | "language_info": { 687 | "codemirror_mode": { 688 | "name": "ipython", 689 | "version": 3 690 | }, 691 | "file_extension": ".py", 692 | "mimetype": "text/x-python", 693 | "name": "python", 694 | "nbconvert_exporter": "python", 695 | "pygments_lexer": "ipython3", 696 | "version": "3.8.8" 697 | }, 698 | "papermill": { 699 | "default_parameters": {}, 700 | "duration": 181.391801, 701 | "end_time": "2021-12-05T06:44:36.786152", 702 | 
"environment_variables": {}, 703 | "exception": null, 704 | "input_path": "__notebook__.ipynb", 705 | "output_path": "__notebook__.ipynb", 706 | "parameters": {}, 707 | "start_time": "2021-12-05T06:41:35.394351", 708 | "version": "2.3.3" 709 | } 710 | }, 711 | "nbformat": 4, 712 | "nbformat_minor": 5 713 | } 714 | -------------------------------------------------------------------------------- /glove-lstm_paper_experiment1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "e8f714bf", 7 | "metadata": { 8 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 9 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", 10 | "execution": { 11 | "iopub.execute_input": "2021-11-27T17:31:54.134744Z", 12 | "iopub.status.busy": "2021-11-27T17:31:54.132593Z", 13 | "iopub.status.idle": "2021-11-27T17:31:59.374583Z", 14 | "shell.execute_reply": "2021-11-27T17:31:59.373668Z", 15 | "shell.execute_reply.started": "2021-11-27T17:18:16.053325Z" 16 | }, 17 | "papermill": { 18 | "duration": 5.264768, 19 | "end_time": "2021-11-27T17:31:59.374813", 20 | "exception": false, 21 | "start_time": "2021-11-27T17:31:54.110045", 22 | "status": "completed" 23 | }, 24 | "tags": [] 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "import pandas as pd\n", 29 | "import numpy as np\n", 30 | "from tqdm import tqdm\n", 31 | "import tensorflow as tf\n", 32 | "from sklearn.metrics import f1_score" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "id": "674aa0b8", 39 | "metadata": { 40 | "execution": { 41 | "iopub.execute_input": "2021-11-27T17:31:59.409729Z", 42 | "iopub.status.busy": "2021-11-27T17:31:59.408970Z", 43 | "iopub.status.idle": "2021-11-27T17:32:02.909462Z", 44 | "shell.execute_reply": "2021-11-27T17:32:02.908882Z", 45 | "shell.execute_reply.started": "2021-11-27T17:18:26.878800Z" 46 | }, 47 | "papermill": { 48 | "duration": 3.520358, 49 
| "end_time": "2021-11-27T17:32:02.909614", 50 | "exception": false, 51 | "start_time": "2021-11-27T17:31:59.389256", 52 | "status": "completed" 53 | }, 54 | "tags": [] 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "train = pd.read_csv('../input/smai-project-data/train_data.csv').fillna('')\n", 59 | "val = pd.read_csv('../input/smai-project-data/val_data.csv').fillna('')\n", 60 | "test = pd.read_csv('../input/smai-project-data/test_data.csv').fillna('')" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 3, 66 | "id": "0f8bc2ba", 67 | "metadata": { 68 | "execution": { 69 | "iopub.execute_input": "2021-11-27T17:32:02.947083Z", 70 | "iopub.status.busy": "2021-11-27T17:32:02.946007Z", 71 | "iopub.status.idle": "2021-11-27T17:32:02.962532Z", 72 | "shell.execute_reply": "2021-11-27T17:32:02.963124Z", 73 | "shell.execute_reply.started": "2021-11-27T17:18:30.138188Z" 74 | }, 75 | "papermill": { 76 | "duration": 0.040354, 77 | "end_time": "2021-11-27T17:32:02.963310", 78 | "exception": false, 79 | "start_time": "2021-11-27T17:32:02.922956", 80 | "status": "completed" 81 | }, 82 | "tags": [] 83 | }, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/html": [ 88 | "
\n", 89 | "\n", 102 | "\n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | "
idqid1qid2question1question2is_duplicatequestion1_preprocessedquestion2_preprocessed
080671573815739How do I play Pokémon GO in Korea?How do I play Pokémon GO in China?0how do i play pok mon go in korea ?how do i play pok mon go in china ?
136810112736104117What are some of the best side dishes for crab...What are some good side dishes for buffalo chi...0what are some of the best side dishes for crab...what are some good side dishes for buffalo chi...
270497121486121487Which is more advisable and better material fo...What is the best server setup for buddypress?0which is more advisable and better material fo...what is the best server setup for buddypress ?
3226567254474258192How do I improve logical programming skills?How can I improve my logical skills for progra...1how do i improve logical programming skills ?how can i improve my logical skills for progra...
473186481033062How close we are to see 3rd world war?How close is a World War III?1how close we are to see 3rd world war ?how close is a world war iii ?
\n", 174 | "
" 175 | ], 176 | "text/plain": [ 177 | " id qid1 qid2 question1 \\\n", 178 | "0 8067 15738 15739 How do I play Pokémon GO in Korea? \n", 179 | "1 368101 12736 104117 What are some of the best side dishes for crab... \n", 180 | "2 70497 121486 121487 Which is more advisable and better material fo... \n", 181 | "3 226567 254474 258192 How do I improve logical programming skills? \n", 182 | "4 73186 48103 3062 How close we are to see 3rd world war? \n", 183 | "\n", 184 | " question2 is_duplicate \\\n", 185 | "0 How do I play Pokémon GO in China? 0 \n", 186 | "1 What are some good side dishes for buffalo chi... 0 \n", 187 | "2 What is the best server setup for buddypress? 0 \n", 188 | "3 How can I improve my logical skills for progra... 1 \n", 189 | "4 How close is a World War III? 1 \n", 190 | "\n", 191 | " question1_preprocessed \\\n", 192 | "0 how do i play pok mon go in korea ? \n", 193 | "1 what are some of the best side dishes for crab... \n", 194 | "2 which is more advisable and better material fo... \n", 195 | "3 how do i improve logical programming skills ? \n", 196 | "4 how close we are to see 3rd world war ? \n", 197 | "\n", 198 | " question2_preprocessed \n", 199 | "0 how do i play pok mon go in china ? \n", 200 | "1 what are some good side dishes for buffalo chi... \n", 201 | "2 what is the best server setup for buddypress ? \n", 202 | "3 how can i improve my logical skills for progra... \n", 203 | "4 how close is a world war iii ? 
" 204 | ] 205 | }, 206 | "execution_count": 3, 207 | "metadata": {}, 208 | "output_type": "execute_result" 209 | } 210 | ], 211 | "source": [ 212 | "train.head()" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 4, 218 | "id": "3789630a", 219 | "metadata": { 220 | "execution": { 221 | "iopub.execute_input": "2021-11-27T17:32:03.002462Z", 222 | "iopub.status.busy": "2021-11-27T17:32:03.001753Z", 223 | "iopub.status.idle": "2021-11-27T17:32:03.006592Z", 224 | "shell.execute_reply": "2021-11-27T17:32:03.006031Z", 225 | "shell.execute_reply.started": "2021-11-27T17:18:30.162922Z" 226 | }, 227 | "papermill": { 228 | "duration": 0.029215, 229 | "end_time": "2021-11-27T17:32:03.006773", 230 | "exception": false, 231 | "start_time": "2021-11-27T17:32:02.977558", 232 | "status": "completed" 233 | }, 234 | "tags": [] 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "def buildVocabulary(reviews):\n", 239 | " tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False, split=' ')\n", 240 | " tokenizer.fit_on_texts(reviews)\n", 241 | " return tokenizer\n", 242 | "\n", 243 | "def getSequences(reviews, tokenizer, seq_maxlen):\n", 244 | " reviews_seq = tokenizer.texts_to_sequences(reviews)\n", 245 | " return np.array(tf.keras.preprocessing.sequence.pad_sequences(reviews_seq, maxlen=seq_maxlen))\n", 246 | "\n", 247 | "def loadGloveWordEmbeddings():\n", 248 | " embedding_vectors = {}\n", 249 | " with open('../input/glove840b300dtxt/glove.840B.300d.txt') as f:\n", 250 | " for line in tqdm(f):\n", 251 | " values = line.split(' ')\n", 252 | " word = values[0]\n", 253 | " coefs = np.asarray(values[1:], dtype='float32')\n", 254 | " embedding_vectors[word] = coefs\n", 255 | " return embedding_vectors\n", 256 | "\n", 257 | "def getEmbeddingWeightMatrix(embedding_vectors, word2idx): \n", 258 | " embedding_matrix = np.zeros((len(word2idx)+1, 300))\n", 259 | " for word, i in tqdm(word2idx.items()):\n", 260 | " embedding_vector = 
embedding_vectors.get(word)\n", 261 | " if embedding_vector is not None:\n", 262 | " embedding_matrix[i] = embedding_vector\n", 263 | " return embedding_matrix" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 5, 269 | "id": "33f503b8", 270 | "metadata": { 271 | "execution": { 272 | "iopub.execute_input": "2021-11-27T17:32:03.086757Z", 273 | "iopub.status.busy": "2021-11-27T17:32:03.043126Z", 274 | "iopub.status.idle": "2021-11-27T17:32:42.350295Z", 275 | "shell.execute_reply": "2021-11-27T17:32:42.349686Z", 276 | "shell.execute_reply.started": "2021-11-27T17:18:30.175848Z" 277 | }, 278 | "papermill": { 279 | "duration": 39.329657, 280 | "end_time": "2021-11-27T17:32:42.350460", 281 | "exception": false, 282 | "start_time": "2021-11-27T17:32:03.020803", 283 | "status": "completed" 284 | }, 285 | "tags": [] 286 | }, 287 | "outputs": [ 288 | { 289 | "name": "stdout", 290 | "output_type": "stream", 291 | "text": [ 292 | "119558\n" 293 | ] 294 | } 295 | ], 296 | "source": [ 297 | "tokenizer = buildVocabulary(train['question1'].tolist()+train['question2'].tolist()+val['question1'].tolist()+val['question2'].tolist()+test['question1'].tolist()+test['question2'].tolist())\n", 298 | "vocab_size = len(tokenizer.word_index) + 1\n", 299 | "print(vocab_size)\n", 300 | "\n", 301 | "x_train1 = getSequences(train['question1'], tokenizer, 128)\n", 302 | "x_train2 = getSequences(train['question2'], tokenizer, 128)\n", 303 | "y_train = tf.keras.utils.to_categorical(train['is_duplicate'])\n", 304 | "\n", 305 | "x_val1 = getSequences(val['question1'], tokenizer, 128)\n", 306 | "x_val2 = getSequences(val['question2'], tokenizer, 128)\n", 307 | "y_val = tf.keras.utils.to_categorical(val['is_duplicate'])\n", 308 | "\n", 309 | "x_test1 = getSequences(test['question1'], tokenizer, 128)\n", 310 | "x_test2 = getSequences(test['question2'], tokenizer, 128)\n", 311 | "y_test = tf.keras.utils.to_categorical(test['is_duplicate'])" 312 | ] 313 | }, 314 | { 315 | 
"cell_type": "code", 316 | "execution_count": 6, 317 | "id": "bdffb7fe", 318 | "metadata": { 319 | "execution": { 320 | "iopub.execute_input": "2021-11-27T17:32:42.385156Z", 321 | "iopub.status.busy": "2021-11-27T17:32:42.384482Z", 322 | "iopub.status.idle": "2021-11-27T17:37:39.763269Z", 323 | "shell.execute_reply": "2021-11-27T17:37:39.761448Z", 324 | "shell.execute_reply.started": "2021-11-27T17:19:01.967098Z" 325 | }, 326 | "papermill": { 327 | "duration": 297.397774, 328 | "end_time": "2021-11-27T17:37:39.763433", 329 | "exception": false, 330 | "start_time": "2021-11-27T17:32:42.365659", 331 | "status": "completed" 332 | }, 333 | "tags": [] 334 | }, 335 | "outputs": [ 336 | { 337 | "name": "stderr", 338 | "output_type": "stream", 339 | "text": [ 340 | "2196018it [04:56, 7397.00it/s]\n" 341 | ] 342 | }, 343 | { 344 | "name": "stdout", 345 | "output_type": "stream", 346 | "text": [ 347 | "2196017\n" 348 | ] 349 | }, 350 | { 351 | "name": "stderr", 352 | "output_type": "stream", 353 | "text": [ 354 | "100%|██████████| 119557/119557 [00:00<00:00, 255058.70it/s]" 355 | ] 356 | }, 357 | { 358 | "name": "stdout", 359 | "output_type": "stream", 360 | "text": [ 361 | "(119558, 300)\n" 362 | ] 363 | }, 364 | { 365 | "name": "stderr", 366 | "output_type": "stream", 367 | "text": [ 368 | "\n" 369 | ] 370 | } 371 | ], 372 | "source": [ 373 | "embedding_vectors = loadGloveWordEmbeddings()\n", 374 | "print(len(embedding_vectors))\n", 375 | "\n", 376 | "embedding_weight_matrix = getEmbeddingWeightMatrix(embedding_vectors, tokenizer.word_index)\n", 377 | "print(embedding_weight_matrix.shape)" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 7, 383 | "id": "e95c5bc7", 384 | "metadata": { 385 | "execution": { 386 | "iopub.execute_input": "2021-11-27T17:37:41.733535Z", 387 | "iopub.status.busy": "2021-11-27T17:37:41.732702Z", 388 | "iopub.status.idle": "2021-11-27T17:37:45.831178Z", 389 | "shell.execute_reply": "2021-11-27T17:37:45.831708Z", 390 | 
"shell.execute_reply.started": "2021-11-27T17:23:23.346592Z" 391 | }, 392 | "papermill": { 393 | "duration": 5.091622, 394 | "end_time": "2021-11-27T17:37:45.831935", 395 | "exception": false, 396 | "start_time": "2021-11-27T17:37:40.740313", 397 | "status": "completed" 398 | }, 399 | "tags": [] 400 | }, 401 | "outputs": [ 402 | { 403 | "name": "stderr", 404 | "output_type": "stream", 405 | "text": [ 406 | "2021-11-27 17:37:41.835873: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 407 | "2021-11-27 17:37:41.961540: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 408 | "2021-11-27 17:37:41.962835: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 409 | "2021-11-27 17:37:41.965578: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", 410 | "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", 411 | "2021-11-27 17:37:41.966902: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 412 | "2021-11-27 17:37:41.968136: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 413 | "2021-11-27 17:37:41.969191: I 
tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 414 | "2021-11-27 17:37:44.394226: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 415 | "2021-11-27 17:37:44.395501: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 416 | "2021-11-27 17:37:44.396537: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 417 | "2021-11-27 17:37:44.398460: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15403 MB memory: -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0\n", 418 | "2021-11-27 17:37:45.013946: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 143469600 exceeds 10% of free system memory.\n", 419 | "2021-11-27 17:37:45.271914: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 143469600 exceeds 10% of free system memory.\n" 420 | ] 421 | } 422 | ], 423 | "source": [ 424 | "inp1 = tf.keras.Input(shape=(x_train1.shape[1],))\n", 425 | "inp2 = tf.keras.Input(shape=(x_train2.shape[1],))\n", 426 | "\n", 427 | "inner1= tf.keras.layers.Embedding(input_dim=119558, output_dim=300, input_length=128, \n", 428 | " weights=[embedding_weight_matrix], trainable=False)(inp1)\n", 429 | "inner2= tf.keras.layers.Embedding(input_dim=119558, output_dim=300, input_length=128,\n", 430 | " weights=[embedding_weight_matrix], trainable=False)(inp2)\n", 431 | "\n", 432 | "inner = 
tf.keras.layers.concatenate([inner1+inner2, inner1-inner2, tf.math.multiply(inner1, inner2)], axis=-1)\n", 433 | "\n", 434 | "out, h, c = tf.keras.layers.LSTM(200, return_sequences=False, return_state=True)(inner)\n", 435 | "\n", 436 | "output = tf.keras.layers.Dense(2, activation='softmax')(c)\n", 437 | "\n", 438 | "model = tf.keras.Model(inputs = [inp1, inp2], outputs = output)" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": 8, 444 | "id": "3880b605", 445 | "metadata": { 446 | "execution": { 447 | "iopub.execute_input": "2021-11-27T17:37:48.855149Z", 448 | "iopub.status.busy": "2021-11-27T17:37:48.853991Z", 449 | "iopub.status.idle": "2021-11-27T17:37:48.878756Z", 450 | "shell.execute_reply": "2021-11-27T17:37:48.878201Z", 451 | "shell.execute_reply.started": "2021-11-27T17:23:26.835969Z" 452 | }, 453 | "papermill": { 454 | "duration": 2.054917, 455 | "end_time": "2021-11-27T17:37:48.878932", 456 | "exception": false, 457 | "start_time": "2021-11-27T17:37:46.824015", 458 | "status": "completed" 459 | }, 460 | "tags": [] 461 | }, 462 | "outputs": [ 463 | { 464 | "name": "stdout", 465 | "output_type": "stream", 466 | "text": [ 467 | "Model: \"model\"\n", 468 | "__________________________________________________________________________________________________\n", 469 | "Layer (type) Output Shape Param # Connected to \n", 470 | "==================================================================================================\n", 471 | "input_1 (InputLayer) [(None, 128)] 0 \n", 472 | "__________________________________________________________________________________________________\n", 473 | "input_2 (InputLayer) [(None, 128)] 0 \n", 474 | "__________________________________________________________________________________________________\n", 475 | "embedding (Embedding) (None, 128, 300) 35867400 input_1[0][0] \n", 476 | "__________________________________________________________________________________________________\n", 477 | 
"embedding_1 (Embedding) (None, 128, 300) 35867400 input_2[0][0] \n", 478 | "__________________________________________________________________________________________________\n", 479 | "tf.__operators__.add (TFOpLambd (None, 128, 300) 0 embedding[0][0] \n", 480 | " embedding_1[0][0] \n", 481 | "__________________________________________________________________________________________________\n", 482 | "tf.math.subtract (TFOpLambda) (None, 128, 300) 0 embedding[0][0] \n", 483 | " embedding_1[0][0] \n", 484 | "__________________________________________________________________________________________________\n", 485 | "tf.math.multiply (TFOpLambda) (None, 128, 300) 0 embedding[0][0] \n", 486 | " embedding_1[0][0] \n", 487 | "__________________________________________________________________________________________________\n", 488 | "concatenate (Concatenate) (None, 128, 900) 0 tf.__operators__.add[0][0] \n", 489 | " tf.math.subtract[0][0] \n", 490 | " tf.math.multiply[0][0] \n", 491 | "__________________________________________________________________________________________________\n", 492 | "lstm (LSTM) [(None, 200), (None, 880800 concatenate[0][0] \n", 493 | "__________________________________________________________________________________________________\n", 494 | "dense (Dense) (None, 2) 402 lstm[0][2] \n", 495 | "==================================================================================================\n", 496 | "Total params: 72,616,002\n", 497 | "Trainable params: 881,202\n", 498 | "Non-trainable params: 71,734,800\n", 499 | "__________________________________________________________________________________________________\n" 500 | ] 501 | } 502 | ], 503 | "source": [ 504 | "model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics=['accuracy'])\n", 505 | "model.summary()" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": 9, 511 | "id": "1b7808b5", 512 | "metadata": { 513 | "execution": { 514 | 
"iopub.execute_input": "2021-11-27T17:37:50.886838Z", 515 | "iopub.status.busy": "2021-11-27T17:37:50.885717Z", 516 | "iopub.status.idle": "2021-11-27T17:46:15.153633Z", 517 | "shell.execute_reply": "2021-11-27T17:46:15.154306Z", 518 | "shell.execute_reply.started": "2021-11-27T17:23:26.858668Z" 519 | }, 520 | "papermill": { 521 | "duration": 505.288706, 522 | "end_time": "2021-11-27T17:46:15.154505", 523 | "exception": false, 524 | "start_time": "2021-11-27T17:37:49.865799", 525 | "status": "completed" 526 | }, 527 | "tags": [] 528 | }, 529 | "outputs": [ 530 | { 531 | "name": "stderr", 532 | "output_type": "stream", 533 | "text": [ 534 | "2021-11-27 17:37:50.888003: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 144897536 exceeds 10% of free system memory.\n", 535 | "2021-11-27 17:37:51.004600: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 144897536 exceeds 10% of free system memory.\n", 536 | "2021-11-27 17:37:51.188913: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)\n" 537 | ] 538 | }, 539 | { 540 | "name": "stdout", 541 | "output_type": "stream", 542 | "text": [ 543 | "Epoch 1/3\n" 544 | ] 545 | }, 546 | { 547 | "name": "stderr", 548 | "output_type": "stream", 549 | "text": [ 550 | "2021-11-27 17:37:54.166583: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005\n" 551 | ] 552 | }, 553 | { 554 | "name": "stdout", 555 | "output_type": "stream", 556 | "text": [ 557 | "8841/8844 [============================>.] 
- ETA: 0s - loss: 0.4669 - accuracy: 0.7735" 558 | ] 559 | }, 560 | { 561 | "name": "stderr", 562 | "output_type": "stream", 563 | "text": [ 564 | "2021-11-27 17:40:06.040932: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 41399296 exceeds 10% of free system memory.\n" 565 | ] 566 | }, 567 | { 568 | "name": "stdout", 569 | "output_type": "stream", 570 | "text": [ 571 | "8844/8844 [==============================] - 164s 18ms/step - loss: 0.4669 - accuracy: 0.7735 - val_loss: 0.4332 - val_accuracy: 0.7948\n", 572 | "\n", 573 | "Epoch 00001: saving model to weights.best.1.hdf5\n", 574 | "Epoch 2/3\n", 575 | "8844/8844 [==============================] - 171s 19ms/step - loss: 0.3708 - accuracy: 0.8291 - val_loss: 0.4160 - val_accuracy: 0.8042\n", 576 | "\n", 577 | "Epoch 00002: saving model to weights.best.2.hdf5\n", 578 | "Epoch 3/3\n", 579 | "8844/8844 [==============================] - 159s 18ms/step - loss: 0.2868 - accuracy: 0.8736 - val_loss: 0.4415 - val_accuracy: 0.8081\n", 580 | "\n", 581 | "Epoch 00003: saving model to weights.best.3.hdf5\n" 582 | ] 583 | } 584 | ], 585 | "source": [ 586 | "checkpoint_filepath = 'weights.best.{epoch:01d}.hdf5'\n", 587 | "model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,\n", 588 | " verbose = 1, \n", 589 | " monitor = 'val_loss',\n", 590 | " save_best_only = False)\n", 591 | "\n", 592 | "history = model.fit((x_train1, x_train2), y_train,\n", 593 | " batch_size = 32,\n", 594 | " validation_data = ((x_val1, x_val2), y_val),\n", 595 | " validation_batch_size = 16,\n", 596 | " epochs=3,\n", 597 | " callbacks=[model_checkpoint_callback], \n", 598 | " verbose=1)" 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": 12, 604 | "id": "8863f669", 605 | "metadata": { 606 | "execution": { 607 | "iopub.execute_input": "2021-11-27T17:46:36.740103Z", 608 | "iopub.status.busy": "2021-11-27T17:46:36.738488Z", 609 | "iopub.status.idle": 
"2021-11-27T17:47:58.715697Z", 610 | "shell.execute_reply": "2021-11-27T17:47:58.715075Z", 611 | "shell.execute_reply.started": "2021-11-27T17:29:15.189693Z" 612 | }, 613 | "papermill": { 614 | "duration": 86.274629, 615 | "end_time": "2021-11-27T17:47:58.715889", 616 | "exception": false, 617 | "start_time": "2021-11-27T17:46:32.441260", 618 | "status": "completed" 619 | }, 620 | "tags": [] 621 | }, 622 | "outputs": [ 623 | { 624 | "name": "stdout", 625 | "output_type": "stream", 626 | "text": [ 627 | "10108/10108 [==============================] - 59s 6ms/step - loss: 0.4340 - accuracy: 0.8087\n", 628 | "loss on test data is 0.43401026725769043\n", 629 | "accuracy on test data is 0.80872642993927\n" 630 | ] 631 | } 632 | ], 633 | "source": [ 634 | "loss, accuracy = model.evaluate((x_test1, x_test2), y_test, batch_size=4, verbose=1)\n", 635 | "\n", 636 | "print('loss on test data is', loss)\n", 637 | "print('accuracy on test data is', accuracy)" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": 13, 643 | "id": "da98475b", 644 | "metadata": { 645 | "execution": { 646 | "iopub.execute_input": "2021-11-27T17:48:06.451346Z", 647 | "iopub.status.busy": "2021-11-27T17:48:06.450104Z", 648 | "iopub.status.idle": "2021-11-27T17:48:17.164859Z", 649 | "shell.execute_reply": "2021-11-27T17:48:17.163722Z", 650 | "shell.execute_reply.started": "2021-11-27T17:30:03.625800Z" 651 | }, 652 | "papermill": { 653 | "duration": 14.724145, 654 | "end_time": "2021-11-27T17:48:17.165031", 655 | "exception": false, 656 | "start_time": "2021-11-27T17:48:02.440886", 657 | "status": "completed" 658 | }, 659 | "tags": [] 660 | }, 661 | "outputs": [ 662 | { 663 | "name": "stdout", 664 | "output_type": "stream", 665 | "text": [ 666 | "f1_score on test dataset is 0.7398311072233624\n" 667 | ] 668 | } 669 | ], 670 | "source": [ 671 | "pred = model.predict((x_test1, x_test2))\n", 672 | "\n", 673 | "print('f1_score on test dataset is', f1_score(np.argmax(pred, axis=1), 
np.argmax(y_test, axis=1)))" 674 | ] 675 | }, 676 | { 677 | "cell_type": "code", 678 | "execution_count": null, 679 | "id": "90b4b575", 680 | "metadata": { 681 | "papermill": { 682 | "duration": 3.647781, 683 | "end_time": "2021-11-27T17:48:24.449422", 684 | "exception": false, 685 | "start_time": "2021-11-27T17:48:20.801641", 686 | "status": "completed" 687 | }, 688 | "tags": [] 689 | }, 690 | "outputs": [], 691 | "source": [] 692 | } 693 | ], 694 | "metadata": { 695 | "kernelspec": { 696 | "display_name": "Python 3 (ipykernel)", 697 | "language": "python", 698 | "name": "python3" 699 | }, 700 | "language_info": { 701 | "codemirror_mode": { 702 | "name": "ipython", 703 | "version": 3 704 | }, 705 | "file_extension": ".py", 706 | "mimetype": "text/x-python", 707 | "name": "python", 708 | "nbconvert_exporter": "python", 709 | "pygments_lexer": "ipython3", 710 | "version": "3.8.10" 711 | }, 712 | "papermill": { 713 | "default_parameters": {}, 714 | "duration": 1006.545475, 715 | "end_time": "2021-11-27T17:48:31.257168", 716 | "environment_variables": {}, 717 | "exception": null, 718 | "input_path": "__notebook__.ipynb", 719 | "output_path": "__notebook__.ipynb", 720 | "parameters": {}, 721 | "start_time": "2021-11-27T17:31:44.711693", 722 | "version": "2.3.3" 723 | } 724 | }, 725 | "nbformat": 4, 726 | "nbformat_minor": 5 727 | } 728 | -------------------------------------------------------------------------------- /CBOW MLP Sum Diff Product of Embeddings.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "0ffba8bb", 7 | "metadata": { 8 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 9 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", 10 | "execution": { 11 | "iopub.execute_input": "2021-11-27T13:27:02.120873Z", 12 | "iopub.status.busy": "2021-11-27T13:27:02.120117Z", 13 | "iopub.status.idle": "2021-11-27T13:27:06.777897Z", 14 | 
"shell.execute_reply": "2021-11-27T13:27:06.777261Z", 15 | "shell.execute_reply.started": "2021-11-27T13:25:33.869721Z" 16 | }, 17 | "papermill": { 18 | "duration": 4.681193, 19 | "end_time": "2021-11-27T13:27:06.778056", 20 | "exception": false, 21 | "start_time": "2021-11-27T13:27:02.096863", 22 | "status": "completed" 23 | }, 24 | "tags": [] 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "import pandas as pd\n", 29 | "import numpy as np\n", 30 | "from tqdm import tqdm\n", 31 | "import tensorflow as tf\n", 32 | "from gensim.models import KeyedVectors\n", 33 | "import gensim\n", 34 | "import re\n", 35 | "from sklearn.metrics import f1_score" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "id": "2d9327ba", 42 | "metadata": { 43 | "execution": { 44 | "iopub.execute_input": "2021-11-27T13:27:06.803571Z", 45 | "iopub.status.busy": "2021-11-27T13:27:06.802996Z", 46 | "iopub.status.idle": "2021-11-27T13:27:07.887920Z", 47 | "shell.execute_reply": "2021-11-27T13:27:07.886982Z", 48 | "shell.execute_reply.started": "2021-11-27T13:10:42.402545Z" 49 | }, 50 | "papermill": { 51 | "duration": 1.099041, 52 | "end_time": "2021-11-27T13:27:07.888060", 53 | "exception": false, 54 | "start_time": "2021-11-27T13:27:06.789019", 55 | "status": "completed" 56 | }, 57 | "tags": [] 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "train = pd.read_csv('../input/quora-ques-pair/train_data.csv/train_data.csv').fillna('') # fixed: previously loaded test_data.csv, which trained the model on the test split\n", 62 | "val = pd.read_csv('../input/quora-ques-pair/val_data.csv/val_data.csv').fillna('')\n", 63 | "test = pd.read_csv('../input/quora-ques-pair/test_data.csv/test_data.csv').fillna('')" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "id": "c0bfef54", 70 | "metadata": { 71 | "execution": { 72 | "iopub.execute_input": "2021-11-27T13:27:07.912052Z", 73 | "iopub.status.busy": "2021-11-27T13:27:07.911329Z", 74 | "iopub.status.idle": "2021-11-27T13:27:07.914031Z", 75 | "shell.execute_reply":
"2021-11-27T13:27:07.913405Z", 76 | "shell.execute_reply.started": "2021-11-27T13:10:43.715098Z" 77 | }, 78 | "papermill": { 79 | "duration": 0.015828, 80 | "end_time": "2021-11-27T13:27:07.914162", 81 | "exception": false, 82 | "start_time": "2021-11-27T13:27:07.898334", 83 | "status": "completed" 84 | }, 85 | "tags": [] 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "word2vec_file = '../input/d/sandreds/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin'" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 4, 95 | "id": "f318327c", 96 | "metadata": { 97 | "execution": { 98 | "iopub.execute_input": "2021-11-27T13:27:07.942268Z", 99 | "iopub.status.busy": "2021-11-27T13:27:07.941457Z", 100 | "iopub.status.idle": "2021-11-27T13:27:07.953929Z", 101 | "shell.execute_reply": "2021-11-27T13:27:07.954439Z", 102 | "shell.execute_reply.started": "2021-11-27T13:10:43.722268Z" 103 | }, 104 | "papermill": { 105 | "duration": 0.030715, 106 | "end_time": "2021-11-27T13:27:07.954598", 107 | "exception": false, 108 | "start_time": "2021-11-27T13:27:07.923883", 109 | "status": "completed" 110 | }, 111 | "tags": [] 112 | }, 113 | "outputs": [ 114 | { 115 | "data": { 116 | "text/html": [ 117 | "
\n", 118 | "\n", 131 | "\n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | "
idqid1qid2question1question2is_duplicatequestion1_preprocessedquestion2_preprocessed
020467393885307635If there is a God, where is He!Why is god a \"He\"?0if there is a god , where is he !why is god a `` he '' ?
117716209315628Do you believe that everything happens for a r...Does everything happen for a reason?1do you believe that everything happens for a r...does everything happen for a reason ?
2291767352623413255Will there always be web hosting that will sup...Will there always be web hosting that supports...1will there always be web hosting that will sup...will there always be web hosting that supports...
32037585982467971What is the proof of Indian Army's surgical st...Has India provided any proof of the surgical s...1what is the proof of indian army 's surgical s...has india provided any proof of the surgical s...
4417477532675327What do Indian Muslims think of Modi?What do Indian Muslim think about PM Narendra ...1what do indian muslims think of modi ?what do indian muslim think about pm narendra ...
\n", 203 | "
" 204 | ], 205 | "text/plain": [ 206 | " id qid1 qid2 question1 \\\n", 207 | "0 204673 93885 307635 If there is a God, where is He! \n", 208 | "1 17716 2093 15628 Do you believe that everything happens for a r... \n", 209 | "2 291767 352623 413255 Will there always be web hosting that will sup... \n", 210 | "3 203758 59824 67971 What is the proof of Indian Army's surgical st... \n", 211 | "4 41747 75326 75327 What do Indian Muslims think of Modi? \n", 212 | "\n", 213 | " question2 is_duplicate \\\n", 214 | "0 Why is god a \"He\"? 0 \n", 215 | "1 Does everything happen for a reason? 1 \n", 216 | "2 Will there always be web hosting that supports... 1 \n", 217 | "3 Has India provided any proof of the surgical s... 1 \n", 218 | "4 What do Indian Muslim think about PM Narendra ... 1 \n", 219 | "\n", 220 | " question1_preprocessed \\\n", 221 | "0 if there is a god , where is he ! \n", 222 | "1 do you believe that everything happens for a r... \n", 223 | "2 will there always be web hosting that will sup... \n", 224 | "3 what is the proof of indian army 's surgical s... \n", 225 | "4 what do indian muslims think of modi ? \n", 226 | "\n", 227 | " question2_preprocessed \n", 228 | "0 why is god a `` he '' ? \n", 229 | "1 does everything happen for a reason ? \n", 230 | "2 will there always be web hosting that supports... \n", 231 | "3 has india provided any proof of the surgical s... \n", 232 | "4 what do indian muslim think about pm narendra ... 
" 233 | ] 234 | }, 235 | "execution_count": 4, 236 | "metadata": {}, 237 | "output_type": "execute_result" 238 | } 239 | ], 240 | "source": [ 241 | "train.head()" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 5, 247 | "id": "fdd3b917", 248 | "metadata": { 249 | "execution": { 250 | "iopub.execute_input": "2021-11-27T13:27:07.982238Z", 251 | "iopub.status.busy": "2021-11-27T13:27:07.981577Z", 252 | "iopub.status.idle": "2021-11-27T13:28:12.395623Z", 253 | "shell.execute_reply": "2021-11-27T13:28:12.396099Z", 254 | "shell.execute_reply.started": "2021-11-27T13:10:43.746605Z" 255 | }, 256 | "papermill": { 257 | "duration": 64.431482, 258 | "end_time": "2021-11-27T13:28:12.396269", 259 | "exception": false, 260 | "start_time": "2021-11-27T13:27:07.964787", 261 | "status": "completed" 262 | }, 263 | "tags": [] 264 | }, 265 | "outputs": [], 266 | "source": [ 267 | "def buildVocabulary(reviews):\n", 268 | " tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False, split=' ')\n", 269 | " tokenizer.fit_on_texts(reviews)\n", 270 | " return tokenizer\n", 271 | "\n", 272 | "def getSequences(reviews, tokenizer, seq_maxlen):\n", 273 | " reviews_seq = tokenizer.texts_to_sequences(reviews)\n", 274 | " return np.array(tf.keras.preprocessing.sequence.pad_sequences(reviews_seq, maxlen=seq_maxlen))\n", 275 | "\n", 276 | "word2vec_model = KeyedVectors.load_word2vec_format(word2vec_file, binary = True)\n", 277 | "\n", 278 | "def getEmbeddingWeightMatrix(word2idx): \n", 279 | " embedding_matrix = np.zeros((len(word2idx)+1, 300))\n", 280 | " for word, i in tqdm(word2idx.items()):\n", 281 | " \n", 282 | " embedding_vector = word2vec_model[word] if word in word2vec_model else np.random.rand(1,300)\n", 283 | " if embedding_vector is not None:\n", 284 | " embedding_matrix[i] = embedding_vector\n", 285 | " return embedding_matrix" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 6, 291 | "id": "47a83b4b", 292 | "metadata": { 293 | 
"execution": { 294 | "iopub.execute_input": "2021-11-27T13:28:12.451991Z", 295 | "iopub.status.busy": "2021-11-27T13:28:12.438857Z", 296 | "iopub.status.idle": "2021-11-27T13:28:26.338075Z", 297 | "shell.execute_reply": "2021-11-27T13:28:26.337092Z", 298 | "shell.execute_reply.started": "2021-11-27T13:11:45.518534Z" 299 | }, 300 | "papermill": { 301 | "duration": 13.930379, 302 | "end_time": "2021-11-27T13:28:26.338242", 303 | "exception": false, 304 | "start_time": "2021-11-27T13:28:12.407863", 305 | "status": "completed" 306 | }, 307 | "tags": [] 308 | }, 309 | "outputs": [ 310 | { 311 | "name": "stdout", 312 | "output_type": "stream", 313 | "text": [ 314 | "67043\n" 315 | ] 316 | } 317 | ], 318 | "source": [ 319 | "tokenizer = buildVocabulary(train['question1'].tolist()+train['question2'].tolist()+val['question1'].tolist()+val['question2'].tolist()+test['question1'].tolist()+test['question2'].tolist())\n", 320 | "vocab_size = len(tokenizer.word_index) + 1\n", 321 | "print(vocab_size)\n", 322 | "\n", 323 | "x_train1 = getSequences(train['question1'], tokenizer, 200)\n", 324 | "x_train2 = getSequences(train['question2'], tokenizer, 200)\n", 325 | "y_train = tf.keras.utils.to_categorical(train['is_duplicate'])\n", 326 | "\n", 327 | "x_val1 = getSequences(val['question1'], tokenizer, 200)\n", 328 | "x_val2 = getSequences(val['question2'], tokenizer, 200)\n", 329 | "y_val = tf.keras.utils.to_categorical(val['is_duplicate'])\n", 330 | "\n", 331 | "x_test1 = getSequences(test['question1'], tokenizer, 200)\n", 332 | "x_test2 = getSequences(test['question2'], tokenizer, 200)\n", 333 | "y_test = tf.keras.utils.to_categorical(test['is_duplicate'])" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 7, 339 | "id": "064a2f92", 340 | "metadata": { 341 | "execution": { 342 | "iopub.execute_input": "2021-11-27T13:28:26.364903Z", 343 | "iopub.status.busy": "2021-11-27T13:28:26.364312Z", 344 | "iopub.status.idle": "2021-11-27T13:28:26.845268Z", 345 | 
"shell.execute_reply": "2021-11-27T13:28:26.843711Z", 346 | "shell.execute_reply.started": "2021-11-27T13:12:14.356043Z" 347 | }, 348 | "papermill": { 349 | "duration": 0.496339, 350 | "end_time": "2021-11-27T13:28:26.845392", 351 | "exception": false, 352 | "start_time": "2021-11-27T13:28:26.349053", 353 | "status": "completed" 354 | }, 355 | "tags": [] 356 | }, 357 | "outputs": [ 358 | { 359 | "name": "stderr", 360 | "output_type": "stream", 361 | "text": [ 362 | "100%|██████████| 67042/67042 [00:00<00:00, 142685.61it/s]" 363 | ] 364 | }, 365 | { 366 | "name": "stdout", 367 | "output_type": "stream", 368 | "text": [ 369 | "(67043, 300)\n" 370 | ] 371 | }, 372 | { 373 | "name": "stderr", 374 | "output_type": "stream", 375 | "text": [ 376 | "\n" 377 | ] 378 | } 379 | ], 380 | "source": [ 381 | "#embedding_vectors = loadGloveWordEmbeddings()\n", 382 | "#print(len(embedding_vectors))\n", 383 | "\n", 384 | "embedding_weight_matrix = getEmbeddingWeightMatrix(tokenizer.word_index)\n", 385 | "print(embedding_weight_matrix.shape)" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 8, 391 | "id": "13dd3e2c", 392 | "metadata": { 393 | "execution": { 394 | "iopub.execute_input": "2021-11-27T13:28:26.881262Z", 395 | "iopub.status.busy": "2021-11-27T13:28:26.880656Z", 396 | "iopub.status.idle": "2021-11-27T13:28:29.585007Z", 397 | "shell.execute_reply": "2021-11-27T13:28:29.585470Z", 398 | "shell.execute_reply.started": "2021-11-27T13:21:50.654150Z" 399 | }, 400 | "papermill": { 401 | "duration": 2.727389, 402 | "end_time": "2021-11-27T13:28:29.585638", 403 | "exception": false, 404 | "start_time": "2021-11-27T13:28:26.858249", 405 | "status": "completed" 406 | }, 407 | "tags": [] 408 | }, 409 | "outputs": [ 410 | { 411 | "name": "stderr", 412 | "output_type": "stream", 413 | "text": [ 414 | "2021-11-27 13:28:26.961758: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there 
must be at least one NUMA node, so returning NUMA node zero\n", 415 | "2021-11-27 13:28:27.082051: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 416 | "2021-11-27 13:28:27.082785: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 417 | "2021-11-27 13:28:27.084455: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA\n", 418 | "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", 419 | "2021-11-27 13:28:27.085735: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 420 | "2021-11-27 13:28:27.086434: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 421 | "2021-11-27 13:28:27.087162: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 422 | "2021-11-27 13:28:28.950796: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 423 | "2021-11-27 13:28:28.951646: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at 
least one NUMA node, so returning NUMA node zero\n", 424 | "2021-11-27 13:28:28.952433: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 425 | "2021-11-27 13:28:28.953120: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15403 MB memory: -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0\n" 426 | ] 427 | } 428 | ], 429 | "source": [ 430 | "inp1 = tf.keras.Input(shape=(x_train1.shape[1],))\n", 431 | "inp2 = tf.keras.Input(shape=(x_train2.shape[1],))\n", 432 | "\n", 433 | "inner1= tf.keras.layers.Embedding(input_dim=67043, output_dim=300, input_length=200, \n", 434 | " weights=[embedding_weight_matrix], trainable=True)(inp1)\n", 435 | "inner2= tf.keras.layers.Embedding(input_dim=67043, output_dim=300, input_length=200,\n", 436 | " weights=[embedding_weight_matrix], trainable=True)(inp2)\n", 437 | " \n", 438 | "inner = tf.keras.layers.concatenate([inner1+inner2, inner1-inner2, tf.math.multiply(inner1, inner2)], axis=-1)\n", 439 | "\n", 440 | "inner = tf.keras.backend.sum(inner, axis=1, keepdims=False)\n", 441 | "#tf.keras.regularizers.l2(l2=0.01)\n", 442 | "inner = tf.keras.layers.Dense(300, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01))(inner)\n", 443 | "inner = tf.keras.layers.Dropout(0.1)(inner)\n", 444 | "inner = tf.keras.layers.Dense(200, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01))(inner)\n", 445 | "inner = tf.keras.layers.Dropout(0.1)(inner)\n", 446 | "inner = tf.keras.layers.Dense(100, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01))(inner)\n", 447 | "inner = tf.keras.layers.Dropout(0.1)(inner)\n", 448 | "output = tf.keras.layers.Dense(2, activation='softmax')(inner)\n", 449 | "\n", 450 | "model = 
tf.keras.Model(inputs = [inp1, inp2], outputs = output)" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": 9, 456 | "id": "2a0fb964", 457 | "metadata": { 458 | "execution": { 459 | "iopub.execute_input": "2021-11-27T13:28:29.619321Z", 460 | "iopub.status.busy": "2021-11-27T13:28:29.618811Z", 461 | "iopub.status.idle": "2021-11-27T13:28:29.634853Z", 462 | "shell.execute_reply": "2021-11-27T13:28:29.635434Z", 463 | "shell.execute_reply.started": "2021-11-27T13:21:51.909270Z" 464 | }, 465 | "papermill": { 466 | "duration": 0.036653, 467 | "end_time": "2021-11-27T13:28:29.635595", 468 | "exception": false, 469 | "start_time": "2021-11-27T13:28:29.598942", 470 | "status": "completed" 471 | }, 472 | "tags": [] 473 | }, 474 | "outputs": [ 475 | { 476 | "name": "stdout", 477 | "output_type": "stream", 478 | "text": [ 479 | "Model: \"model\"\n", 480 | "__________________________________________________________________________________________________\n", 481 | "Layer (type) Output Shape Param # Connected to \n", 482 | "==================================================================================================\n", 483 | "input_1 (InputLayer) [(None, 200)] 0 \n", 484 | "__________________________________________________________________________________________________\n", 485 | "input_2 (InputLayer) [(None, 200)] 0 \n", 486 | "__________________________________________________________________________________________________\n", 487 | "embedding (Embedding) (None, 200, 300) 20112900 input_1[0][0] \n", 488 | "__________________________________________________________________________________________________\n", 489 | "embedding_1 (Embedding) (None, 200, 300) 20112900 input_2[0][0] \n", 490 | "__________________________________________________________________________________________________\n", 491 | "tf.__operators__.add (TFOpLambd (None, 200, 300) 0 embedding[0][0] \n", 492 | " embedding_1[0][0] \n", 493 | 
"__________________________________________________________________________________________________\n", 494 | "tf.math.subtract (TFOpLambda) (None, 200, 300) 0 embedding[0][0] \n", 495 | " embedding_1[0][0] \n", 496 | "__________________________________________________________________________________________________\n", 497 | "tf.math.multiply (TFOpLambda) (None, 200, 300) 0 embedding[0][0] \n", 498 | " embedding_1[0][0] \n", 499 | "__________________________________________________________________________________________________\n", 500 | "concatenate (Concatenate) (None, 200, 900) 0 tf.__operators__.add[0][0] \n", 501 | " tf.math.subtract[0][0] \n", 502 | " tf.math.multiply[0][0] \n", 503 | "__________________________________________________________________________________________________\n", 504 | "tf.math.reduce_sum (TFOpLambda) (None, 900) 0 concatenate[0][0] \n", 505 | "__________________________________________________________________________________________________\n", 506 | "dense (Dense) (None, 300) 270300 tf.math.reduce_sum[0][0] \n", 507 | "__________________________________________________________________________________________________\n", 508 | "dropout (Dropout) (None, 300) 0 dense[0][0] \n", 509 | "__________________________________________________________________________________________________\n", 510 | "dense_1 (Dense) (None, 200) 60200 dropout[0][0] \n", 511 | "__________________________________________________________________________________________________\n", 512 | "dropout_1 (Dropout) (None, 200) 0 dense_1[0][0] \n", 513 | "__________________________________________________________________________________________________\n", 514 | "dense_2 (Dense) (None, 100) 20100 dropout_1[0][0] \n", 515 | "__________________________________________________________________________________________________\n", 516 | "dropout_2 (Dropout) (None, 100) 0 dense_2[0][0] \n", 517 | 
"__________________________________________________________________________________________________\n", 518 | "dense_3 (Dense) (None, 2) 202 dropout_2[0][0] \n", 519 | "==================================================================================================\n", 520 | "Total params: 40,576,602\n", 521 | "Trainable params: 40,576,602\n", 522 | "Non-trainable params: 0\n", 523 | "__________________________________________________________________________________________________\n" 524 | ] 525 | } 526 | ], 527 | "source": [ 528 | "model.compile(optimizer = \"adam\", loss = 'categorical_crossentropy', metrics=['accuracy'])\n", 529 | "model.summary()" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 10, 535 | "id": "2769fd24", 536 | "metadata": { 537 | "execution": { 538 | "iopub.execute_input": "2021-11-27T13:28:29.669190Z", 539 | "iopub.status.busy": "2021-11-27T13:28:29.668319Z", 540 | "iopub.status.idle": "2021-11-27T13:29:27.302188Z", 541 | "shell.execute_reply": "2021-11-27T13:29:27.301703Z", 542 | "shell.execute_reply.started": "2021-11-27T13:21:52.934089Z" 543 | }, 544 | "papermill": { 545 | "duration": 57.652535, 546 | "end_time": "2021-11-27T13:29:27.302336", 547 | "exception": false, 548 | "start_time": "2021-11-27T13:28:29.649801", 549 | "status": "completed" 550 | }, 551 | "tags": [] 552 | }, 553 | "outputs": [ 554 | { 555 | "name": "stderr", 556 | "output_type": "stream", 557 | "text": [ 558 | "2021-11-27 13:28:29.784743: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)\n" 559 | ] 560 | }, 561 | { 562 | "name": "stdout", 563 | "output_type": "stream", 564 | "text": [ 565 | "Epoch 1/3\n", 566 | "632/632 [==============================] - 18s 26ms/step - loss: 1.5922 - accuracy: 0.6810 - val_loss: 0.6328 - val_accuracy: 0.7140\n", 567 | "Epoch 2/3\n", 568 | "632/632 [==============================] - 16s 25ms/step - loss: 0.5415 - accuracy: 
0.7529 - val_loss: 0.5487 - val_accuracy: 0.7364\n", 569 | "Epoch 3/3\n", 570 | "632/632 [==============================] - 21s 33ms/step - loss: 0.4592 - accuracy: 0.7990 - val_loss: 0.5759 - val_accuracy: 0.7274\n" 571 | ] 572 | } 573 | ], 574 | "source": [ 575 | "save_weights = tf.keras.callbacks.ModelCheckpoint('cbow_mlp.h5', monitor='val_loss', save_best_only=True)\n", 576 | "checkpoint_filepath = 'weights.best.{epoch:01d}.hdf5'\n", 577 | "#model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,\n", 578 | "#verbose = 1,\n", 579 | "#monitor = 'val_loss',\n", 580 | "#save_best_only = False)\n", 581 | "history = model.fit((x_train1, x_train2), y_train,\n", 582 | " batch_size = 64,\n", 583 | " validation_data = ((x_val1, x_val2), y_val),\n", 584 | " validation_batch_size = 32,\n", 585 | " epochs=3, \n", 586 | " callbacks=[save_weights], \n", 587 | " verbose=1)" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": 11, 593 | "id": "ab5c9bea", 594 | "metadata": { 595 | "execution": { 596 | "iopub.execute_input": "2021-11-27T13:29:27.609229Z", 597 | "iopub.status.busy": "2021-11-27T13:29:27.608248Z", 598 | "iopub.status.idle": "2021-11-27T13:29:51.444179Z", 599 | "shell.execute_reply": "2021-11-27T13:29:51.445132Z", 600 | "shell.execute_reply.started": "2021-11-27T13:23:37.976281Z" 601 | }, 602 | "papermill": { 603 | "duration": 23.995934, 604 | "end_time": "2021-11-27T13:29:51.445323", 605 | "exception": false, 606 | "start_time": "2021-11-27T13:29:27.449389", 607 | "status": "completed" 608 | }, 609 | "tags": [] 610 | }, 611 | "outputs": [ 612 | { 613 | "name": "stdout", 614 | "output_type": "stream", 615 | "text": [ 616 | "10108/10108 [==============================] - 24s 2ms/step - loss: 0.3645 - accuracy: 0.8614\n", 617 | "loss on test data is 0.36454567313194275\n", 618 | "accuracy on test data is 0.8613866567611694\n" 619 | ] 620 | } 621 | ], 622 | "source": [ 623 | "loss, accuracy = 
model.evaluate((x_test1, x_test2), y_test, batch_size=4, verbose=1)\n", 624 | "\n", 625 | "print('loss on test data is', loss)\n", 626 | "print('accuracy on test data is', accuracy)" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": 12, 632 | "id": "072e0c3f", 633 | "metadata": { 634 | "execution": { 635 | "iopub.execute_input": "2021-11-27T13:29:52.033949Z", 636 | "iopub.status.busy": "2021-11-27T13:29:52.033003Z", 637 | "iopub.status.idle": "2021-11-27T13:29:54.764953Z", 638 | "shell.execute_reply": "2021-11-27T13:29:54.765365Z", 639 | "shell.execute_reply.started": "2021-11-27T13:25:52.168723Z" 640 | }, 641 | "papermill": { 642 | "duration": 3.037087, 643 | "end_time": "2021-11-27T13:29:54.765519", 644 | "exception": false, 645 | "start_time": "2021-11-27T13:29:51.728432", 646 | "status": "completed" 647 | }, 648 | "tags": [] 649 | }, 650 | "outputs": [ 651 | { 652 | "name": "stdout", 653 | "output_type": "stream", 654 | "text": [ 655 | "F1_score on test is 0.8275480059084195\n" 656 | ] 657 | } 658 | ], 659 | "source": [ 660 | "pred = model.predict((x_test1, x_test2))\n", 661 | "print('F1_score on test is', f1_score(np.argmax(pred, axis=1), np.argmax(y_test, axis=1)))\n" 662 | ] 663 | }, 664 | { 665 | "cell_type": "code", 666 | "execution_count": null, 667 | "id": "0b1a3522", 668 | "metadata": { 669 | "papermill": { 670 | "duration": 0.264673, 671 | "end_time": "2021-11-27T13:29:55.301567", 672 | "exception": false, 673 | "start_time": "2021-11-27T13:29:55.036894", 674 | "status": "completed" 675 | }, 676 | "tags": [] 677 | }, 678 | "outputs": [], 679 | "source": [] 680 | } 681 | ], 682 | "metadata": { 683 | "kernelspec": { 684 | "display_name": "Python 3", 685 | "language": "python", 686 | "name": "python3" 687 | }, 688 | "language_info": { 689 | "codemirror_mode": { 690 | "name": "ipython", 691 | "version": 3 692 | }, 693 | "file_extension": ".py", 694 | "mimetype": "text/x-python", 695 | "name": "python", 696 | "nbconvert_exporter": 
"python", 697 | "pygments_lexer": "ipython3", 698 | "version": "3.7.10" 699 | }, 700 | "papermill": { 701 | "default_parameters": {}, 702 | "duration": 184.393918, 703 | "end_time": "2021-11-27T13:29:59.169536", 704 | "environment_variables": {}, 705 | "exception": null, 706 | "input_path": "__notebook__.ipynb", 707 | "output_path": "__notebook__.ipynb", 708 | "parameters": {}, 709 | "start_time": "2021-11-27T13:26:54.775618", 710 | "version": "2.3.3" 711 | } 712 | }, 713 | "nbformat": 4, 714 | "nbformat_minor": 5 715 | } 716 | -------------------------------------------------------------------------------- /CBOW ML Dropout Regularisation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "7129363e", 7 | "metadata": { 8 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 9 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", 10 | "execution": { 11 | "iopub.execute_input": "2021-11-27T13:39:40.100619Z", 12 | "iopub.status.busy": "2021-11-27T13:39:40.097692Z", 13 | "iopub.status.idle": "2021-11-27T13:39:45.510044Z", 14 | "shell.execute_reply": "2021-11-27T13:39:45.509398Z", 15 | "shell.execute_reply.started": "2021-11-27T13:34:29.669569Z" 16 | }, 17 | "papermill": { 18 | "duration": 5.435369, 19 | "end_time": "2021-11-27T13:39:45.510212", 20 | "exception": false, 21 | "start_time": "2021-11-27T13:39:40.074843", 22 | "status": "completed" 23 | }, 24 | "tags": [] 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "import pandas as pd\n", 29 | "import numpy as np\n", 30 | "from tqdm import tqdm\n", 31 | "import tensorflow as tf\n", 32 | "from gensim.models import KeyedVectors\n", 33 | "import gensim\n", 34 | "import re\n", 35 | "from sklearn.metrics import f1_score\n", 36 | "import matplotlib.pyplot as plt" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "id": "fa981eee", 43 | "metadata": { 44 | "execution": { 
45 | "iopub.execute_input": "2021-11-27T13:39:45.539763Z", 46 | "iopub.status.busy": "2021-11-27T13:39:45.539259Z", 47 | "iopub.status.idle": "2021-11-27T13:39:46.766976Z", 48 | "shell.execute_reply": "2021-11-27T13:39:46.766492Z", 49 | "shell.execute_reply.started": "2021-11-27T13:10:42.402545Z" 50 | }, 51 | "papermill": { 52 | "duration": 1.24347, 53 | "end_time": "2021-11-27T13:39:46.767141", 54 | "exception": false, 55 | "start_time": "2021-11-27T13:39:45.523671", 56 | "status": "completed" 57 | }, 58 | "tags": [] 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "train = pd.read_csv('../input/quora-ques-pair/train_data.csv/train_data.csv').fillna('')\n", 63 | "val = pd.read_csv('../input/quora-ques-pair/val_data.csv/val_data.csv').fillna('')\n", 64 | "test = pd.read_csv('../input/quora-ques-pair/test_data.csv/test_data.csv').fillna('')" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 3, 70 | "id": "18c9ee56", 71 | "metadata": { 72 | "execution": { 73 | "iopub.execute_input": "2021-11-27T13:39:46.793631Z", 74 | "iopub.status.busy": "2021-11-27T13:39:46.792950Z", 75 | "iopub.status.idle": "2021-11-27T13:39:46.795498Z", 76 | "shell.execute_reply": "2021-11-27T13:39:46.795872Z", 77 | "shell.execute_reply.started": "2021-11-27T13:10:43.715098Z" 78 | }, 79 | "papermill": { 80 | "duration": 0.017601, 81 | "end_time": "2021-11-27T13:39:46.795999", 82 | "exception": false, 83 | "start_time": "2021-11-27T13:39:46.778398", 84 | "status": "completed" 85 | }, 86 | "tags": [] 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "word2vec_file = '../input/d/sandreds/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin'" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 4, 96 | "id": "9a62dc09", 97 | "metadata": { 98 | "execution": { 99 | "iopub.execute_input": "2021-11-27T13:39:46.827484Z", 100 | "iopub.status.busy": "2021-11-27T13:39:46.826713Z", 101 | "iopub.status.idle": "2021-11-27T13:39:46.839866Z", 102 | 
"shell.execute_reply": "2021-11-27T13:39:46.840300Z", 103 | "shell.execute_reply.started": "2021-11-27T13:10:43.722268Z" 104 | }, 105 | "papermill": { 106 | "duration": 0.033588, 107 | "end_time": "2021-11-27T13:39:46.840430", 108 | "exception": false, 109 | "start_time": "2021-11-27T13:39:46.806842", 110 | "status": "completed" 111 | }, 112 | "tags": [] 113 | }, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/html": [ 118 | "
\n", 119 | "\n", 132 | "\n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | "
idqid1qid2question1question2is_duplicatequestion1_preprocessedquestion2_preprocessed
020467393885307635If there is a God, where is He!Why is god a \"He\"?0if there is a god , where is he !why is god a `` he '' ?
117716209315628Do you believe that everything happens for a r...Does everything happen for a reason?1do you believe that everything happens for a r...does everything happen for a reason ?
2291767352623413255Will there always be web hosting that will sup...Will there always be web hosting that supports...1will there always be web hosting that will sup...will there always be web hosting that supports...
32037585982467971What is the proof of Indian Army's surgical st...Has India provided any proof of the surgical s...1what is the proof of indian army 's surgical s...has india provided any proof of the surgical s...
4417477532675327What do Indian Muslims think of Modi?What do Indian Muslim think about PM Narendra ...1what do indian muslims think of modi ?what do indian muslim think about pm narendra ...
\n", 204 | "
" 205 | ], 206 | "text/plain": [ 207 | " id qid1 qid2 question1 \\\n", 208 | "0 204673 93885 307635 If there is a God, where is He! \n", 209 | "1 17716 2093 15628 Do you believe that everything happens for a r... \n", 210 | "2 291767 352623 413255 Will there always be web hosting that will sup... \n", 211 | "3 203758 59824 67971 What is the proof of Indian Army's surgical st... \n", 212 | "4 41747 75326 75327 What do Indian Muslims think of Modi? \n", 213 | "\n", 214 | " question2 is_duplicate \\\n", 215 | "0 Why is god a \"He\"? 0 \n", 216 | "1 Does everything happen for a reason? 1 \n", 217 | "2 Will there always be web hosting that supports... 1 \n", 218 | "3 Has India provided any proof of the surgical s... 1 \n", 219 | "4 What do Indian Muslim think about PM Narendra ... 1 \n", 220 | "\n", 221 | " question1_preprocessed \\\n", 222 | "0 if there is a god , where is he ! \n", 223 | "1 do you believe that everything happens for a r... \n", 224 | "2 will there always be web hosting that will sup... \n", 225 | "3 what is the proof of indian army 's surgical s... \n", 226 | "4 what do indian muslims think of modi ? \n", 227 | "\n", 228 | " question2_preprocessed \n", 229 | "0 why is god a `` he '' ? \n", 230 | "1 does everything happen for a reason ? \n", 231 | "2 will there always be web hosting that supports... \n", 232 | "3 has india provided any proof of the surgical s... \n", 233 | "4 what do indian muslim think about pm narendra ... 
" 234 | ] 235 | }, 236 | "execution_count": 4, 237 | "metadata": {}, 238 | "output_type": "execute_result" 239 | } 240 | ], 241 | "source": [ 242 | "train.head()" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 5, 248 | "id": "2e9f6d2a", 249 | "metadata": { 250 | "execution": { 251 | "iopub.execute_input": "2021-11-27T13:39:46.870196Z", 252 | "iopub.status.busy": "2021-11-27T13:39:46.869680Z", 253 | "iopub.status.idle": "2021-11-27T13:40:56.405543Z", 254 | "shell.execute_reply": "2021-11-27T13:40:56.404933Z", 255 | "shell.execute_reply.started": "2021-11-27T13:10:43.746605Z" 256 | }, 257 | "papermill": { 258 | "duration": 69.55394, 259 | "end_time": "2021-11-27T13:40:56.405691", 260 | "exception": false, 261 | "start_time": "2021-11-27T13:39:46.851751", 262 | "status": "completed" 263 | }, 264 | "tags": [] 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "def buildVocabulary(reviews):\n", 269 | " tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False, split=' ')\n", 270 | " tokenizer.fit_on_texts(reviews)\n", 271 | " return tokenizer\n", 272 | "\n", 273 | "def getSequences(reviews, tokenizer, seq_maxlen):\n", 274 | " reviews_seq = tokenizer.texts_to_sequences(reviews)\n", 275 | " return np.array(tf.keras.preprocessing.sequence.pad_sequences(reviews_seq, maxlen=seq_maxlen))\n", 276 | "\n", 277 | "word2vec_model = KeyedVectors.load_word2vec_format(word2vec_file, binary = True)\n", 278 | "\n", 279 | "def getEmbeddingWeightMatrix(word2idx): \n", 280 | " embedding_matrix = np.zeros((len(word2idx)+1, 300))\n", 281 | " for word, i in tqdm(word2idx.items()):\n", 282 | " \n", 283 | " embedding_vector = word2vec_model[word] if word in word2vec_model else np.random.rand(1,300)\n", 284 | " if embedding_vector is not None:\n", 285 | " embedding_matrix[i] = embedding_vector\n", 286 | " return embedding_matrix" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 6, 292 | "id": "1c97283f", 293 | "metadata": { 294 | 
"execution": { 295 | "iopub.execute_input": "2021-11-27T13:40:56.460244Z", 296 | "iopub.status.busy": "2021-11-27T13:40:56.459548Z", 297 | "iopub.status.idle": "2021-11-27T13:41:10.200708Z", 298 | "shell.execute_reply": "2021-11-27T13:41:10.200210Z", 299 | "shell.execute_reply.started": "2021-11-27T13:11:45.518534Z" 300 | }, 301 | "papermill": { 302 | "duration": 13.783239, 303 | "end_time": "2021-11-27T13:41:10.200838", 304 | "exception": false, 305 | "start_time": "2021-11-27T13:40:56.417599", 306 | "status": "completed" 307 | }, 308 | "tags": [] 309 | }, 310 | "outputs": [ 311 | { 312 | "name": "stdout", 313 | "output_type": "stream", 314 | "text": [ 315 | "67043\n" 316 | ] 317 | } 318 | ], 319 | "source": [ 320 | "tokenizer = buildVocabulary(train['question1'].tolist()+train['question2'].tolist()+val['question1'].tolist()+val['question2'].tolist()+test['question1'].tolist()+test['question2'].tolist())\n", 321 | "vocab_size = len(tokenizer.word_index) + 1\n", 322 | "print(vocab_size)\n", 323 | "\n", 324 | "x_train1 = getSequences(train['question1'], tokenizer, 200)\n", 325 | "x_train2 = getSequences(train['question2'], tokenizer, 200)\n", 326 | "y_train = tf.keras.utils.to_categorical(train['is_duplicate'])\n", 327 | "\n", 328 | "x_val1 = getSequences(val['question1'], tokenizer, 200)\n", 329 | "x_val2 = getSequences(val['question2'], tokenizer, 200)\n", 330 | "y_val = tf.keras.utils.to_categorical(val['is_duplicate'])\n", 331 | "\n", 332 | "x_test1 = getSequences(test['question1'], tokenizer, 200)\n", 333 | "x_test2 = getSequences(test['question2'], tokenizer, 200)\n", 334 | "y_test = tf.keras.utils.to_categorical(test['is_duplicate'])" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 7, 340 | "id": "580d251a", 341 | "metadata": { 342 | "execution": { 343 | "iopub.execute_input": "2021-11-27T13:41:10.228748Z", 344 | "iopub.status.busy": "2021-11-27T13:41:10.228020Z", 345 | "iopub.status.idle": "2021-11-27T13:41:10.705699Z", 346 | 
"shell.execute_reply": "2021-11-27T13:41:10.704306Z", 347 | "shell.execute_reply.started": "2021-11-27T13:12:14.356043Z" 348 | }, 349 | "papermill": { 350 | "duration": 0.493073, 351 | "end_time": "2021-11-27T13:41:10.705828", 352 | "exception": false, 353 | "start_time": "2021-11-27T13:41:10.212755", 354 | "status": "completed" 355 | }, 356 | "tags": [] 357 | }, 358 | "outputs": [ 359 | { 360 | "name": "stderr", 361 | "output_type": "stream", 362 | "text": [ 363 | "100%|██████████| 67042/67042 [00:00<00:00, 143587.33it/s]" 364 | ] 365 | }, 366 | { 367 | "name": "stdout", 368 | "output_type": "stream", 369 | "text": [ 370 | "(67043, 300)\n" 371 | ] 372 | }, 373 | { 374 | "name": "stderr", 375 | "output_type": "stream", 376 | "text": [ 377 | "\n" 378 | ] 379 | } 380 | ], 381 | "source": [ 382 | "#embedding_vectors = loadGloveWordEmbeddings()\n", 383 | "#print(len(embedding_vectors))\n", 384 | "\n", 385 | "embedding_weight_matrix = getEmbeddingWeightMatrix(tokenizer.word_index)\n", 386 | "print(embedding_weight_matrix.shape)" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 8, 392 | "id": "376d9fdb", 393 | "metadata": { 394 | "execution": { 395 | "iopub.execute_input": "2021-11-27T13:41:10.744344Z", 396 | "iopub.status.busy": "2021-11-27T13:41:10.743779Z", 397 | "iopub.status.idle": "2021-11-27T13:41:13.935711Z", 398 | "shell.execute_reply": "2021-11-27T13:41:13.934772Z", 399 | "shell.execute_reply.started": "2021-11-27T13:27:27.669498Z" 400 | }, 401 | "papermill": { 402 | "duration": 3.215972, 403 | "end_time": "2021-11-27T13:41:13.935848", 404 | "exception": false, 405 | "start_time": "2021-11-27T13:41:10.719876", 406 | "status": "completed" 407 | }, 408 | "tags": [] 409 | }, 410 | "outputs": [ 411 | { 412 | "name": "stderr", 413 | "output_type": "stream", 414 | "text": [ 415 | "2021-11-27 13:41:10.832116: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there 
must be at least one NUMA node, so returning NUMA node zero\n", 416 | "2021-11-27 13:41:10.981618: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 417 | "2021-11-27 13:41:10.982427: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 418 | "2021-11-27 13:41:10.983858: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA\n", 419 | "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", 420 | "2021-11-27 13:41:10.985130: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 421 | "2021-11-27 13:41:10.985799: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 422 | "2021-11-27 13:41:10.986424: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 423 | "2021-11-27 13:41:13.311232: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 424 | "2021-11-27 13:41:13.311964: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at 
least one NUMA node, so returning NUMA node zero\n", 425 | "2021-11-27 13:41:13.312674: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 426 | "2021-11-27 13:41:13.313255: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15403 MB memory: -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0\n" 427 | ] 428 | } 429 | ], 430 | "source": [ 431 | "#he_initializer = tf.keras.initializers.HeUniform()\n", 432 | "inp1 = tf.keras.Input(shape=(x_train1.shape[1],))\n", 433 | "inp2 = tf.keras.Input(shape=(x_train2.shape[1],))\n", 434 | "\n", 435 | "inner1= tf.keras.layers.Embedding(input_dim=67043, output_dim=300, input_length=200, \n", 436 | " weights=[embedding_weight_matrix], trainable=True)(inp1)\n", 437 | "inner2= tf.keras.layers.Embedding(input_dim=67043, output_dim=300, input_length=200,\n", 438 | " weights=[embedding_weight_matrix], trainable=True)(inp2)\n", 439 | " \n", 440 | "inner = tf.keras.layers.concatenate([inner1+inner2, inner1-inner2, tf.math.multiply(inner1, inner2)], axis=-1)\n", 441 | "\n", 442 | "inner = tf.keras.backend.sum(inner, axis=1, keepdims=False)\n", 443 | "inner = tf.keras.layers.Dense(300, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01))(inner)\n", 444 | "inner = tf.keras.layers.Dropout(0.2)(inner)\n", 445 | "inner = tf.keras.layers.Dense(200, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01))(inner)\n", 446 | "inner = tf.keras.layers.Dropout(0.2)(inner)\n", 447 | "inner = tf.keras.layers.Dense(100, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01))(inner)\n", 448 | "inner = tf.keras.layers.Dropout(0.2)(inner)\n", 449 | "output = tf.keras.layers.Dense(2, activation='softmax')(inner)\n", 450 | "\n", 451 | "model = 
tf.keras.Model(inputs = [inp1, inp2], outputs = output)" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": 9, 457 | "id": "e7dba2d8", 458 | "metadata": { 459 | "execution": { 460 | "iopub.execute_input": "2021-11-27T13:41:13.973100Z", 461 | "iopub.status.busy": "2021-11-27T13:41:13.972255Z", 462 | "iopub.status.idle": "2021-11-27T13:41:13.987912Z", 463 | "shell.execute_reply": "2021-11-27T13:41:13.988527Z", 464 | "shell.execute_reply.started": "2021-11-27T13:27:28.324190Z" 465 | }, 466 | "papermill": { 467 | "duration": 0.037826, 468 | "end_time": "2021-11-27T13:41:13.988697", 469 | "exception": false, 470 | "start_time": "2021-11-27T13:41:13.950871", 471 | "status": "completed" 472 | }, 473 | "tags": [] 474 | }, 475 | "outputs": [ 476 | { 477 | "name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "Model: \"model\"\n", 481 | "__________________________________________________________________________________________________\n", 482 | "Layer (type) Output Shape Param # Connected to \n", 483 | "==================================================================================================\n", 484 | "input_1 (InputLayer) [(None, 200)] 0 \n", 485 | "__________________________________________________________________________________________________\n", 486 | "input_2 (InputLayer) [(None, 200)] 0 \n", 487 | "__________________________________________________________________________________________________\n", 488 | "embedding (Embedding) (None, 200, 300) 20112900 input_1[0][0] \n", 489 | "__________________________________________________________________________________________________\n", 490 | "embedding_1 (Embedding) (None, 200, 300) 20112900 input_2[0][0] \n", 491 | "__________________________________________________________________________________________________\n", 492 | "tf.__operators__.add (TFOpLambd (None, 200, 300) 0 embedding[0][0] \n", 493 | " embedding_1[0][0] \n", 494 | 
"__________________________________________________________________________________________________\n", 495 | "tf.math.subtract (TFOpLambda) (None, 200, 300) 0 embedding[0][0] \n", 496 | " embedding_1[0][0] \n", 497 | "__________________________________________________________________________________________________\n", 498 | "tf.math.multiply (TFOpLambda) (None, 200, 300) 0 embedding[0][0] \n", 499 | " embedding_1[0][0] \n", 500 | "__________________________________________________________________________________________________\n", 501 | "concatenate (Concatenate) (None, 200, 900) 0 tf.__operators__.add[0][0] \n", 502 | " tf.math.subtract[0][0] \n", 503 | " tf.math.multiply[0][0] \n", 504 | "__________________________________________________________________________________________________\n", 505 | "tf.math.reduce_sum (TFOpLambda) (None, 900) 0 concatenate[0][0] \n", 506 | "__________________________________________________________________________________________________\n", 507 | "dense (Dense) (None, 300) 270300 tf.math.reduce_sum[0][0] \n", 508 | "__________________________________________________________________________________________________\n", 509 | "dropout (Dropout) (None, 300) 0 dense[0][0] \n", 510 | "__________________________________________________________________________________________________\n", 511 | "dense_1 (Dense) (None, 200) 60200 dropout[0][0] \n", 512 | "__________________________________________________________________________________________________\n", 513 | "dropout_1 (Dropout) (None, 200) 0 dense_1[0][0] \n", 514 | "__________________________________________________________________________________________________\n", 515 | "dense_2 (Dense) (None, 100) 20100 dropout_1[0][0] \n", 516 | "__________________________________________________________________________________________________\n", 517 | "dropout_2 (Dropout) (None, 100) 0 dense_2[0][0] \n", 518 | 
"__________________________________________________________________________________________________\n", 519 | "dense_3 (Dense) (None, 2) 202 dropout_2[0][0] \n", 520 | "==================================================================================================\n", 521 | "Total params: 40,576,602\n", 522 | "Trainable params: 40,576,602\n", 523 | "Non-trainable params: 0\n", 524 | "__________________________________________________________________________________________________\n" 525 | ] 526 | } 527 | ], 528 | "source": [ 529 | "model.compile(optimizer = \"adam\", loss = 'categorical_crossentropy', metrics=['accuracy'])\n", 530 | "model.summary()" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": 10, 536 | "id": "5c0c86ea", 537 | "metadata": { 538 | "execution": { 539 | "iopub.execute_input": "2021-11-27T13:41:14.024992Z", 540 | "iopub.status.busy": "2021-11-27T13:41:14.024188Z", 541 | "iopub.status.idle": "2021-11-27T13:42:26.517456Z", 542 | "shell.execute_reply": "2021-11-27T13:42:26.518855Z", 543 | "shell.execute_reply.started": "2021-11-27T13:27:29.396799Z" 544 | }, 545 | "papermill": { 546 | "duration": 72.514821, 547 | "end_time": "2021-11-27T13:42:26.519128", 548 | "exception": false, 549 | "start_time": "2021-11-27T13:41:14.004307", 550 | "status": "completed" 551 | }, 552 | "tags": [] 553 | }, 554 | "outputs": [ 555 | { 556 | "name": "stderr", 557 | "output_type": "stream", 558 | "text": [ 559 | "2021-11-27 13:41:14.139989: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)\n" 560 | ] 561 | }, 562 | { 563 | "name": "stdout", 564 | "output_type": "stream", 565 | "text": [ 566 | "Epoch 1/4\n", 567 | "632/632 [==============================] - 23s 33ms/step - loss: 1.4585 - accuracy: 0.6716 - val_loss: 0.5944 - val_accuracy: 0.7091\n", 568 | "Epoch 2/4\n", 569 | "632/632 [==============================] - 16s 25ms/step - loss: 0.5432 - accuracy: 
0.7499 - val_loss: 0.5638 - val_accuracy: 0.7303\n", 570 | "Epoch 3/4\n", 571 | "632/632 [==============================] - 15s 24ms/step - loss: 0.4664 - accuracy: 0.7947 - val_loss: 0.6222 - val_accuracy: 0.7380\n", 572 | "Epoch 4/4\n", 573 | "632/632 [==============================] - 15s 24ms/step - loss: 0.4202 - accuracy: 0.8218 - val_loss: 0.5795 - val_accuracy: 0.7284\n" 574 | ] 575 | } 576 | ], 577 | "source": [ 578 | "save_weights = tf.keras.callbacks.ModelCheckpoint('cbow_mlp.h5', monitor='val_loss', save_best_only=True)\n", 579 | "checkpoint_filepath = 'weights.best.{epoch:01d}.hdf5'\n", 580 | "#model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,\n", 581 | "#verbose = 1,\n", 582 | "#monitor = 'val_loss',\n", 583 | "#save_best_only = False)\n", 584 | "history = model.fit((x_train1, x_train2), y_train,\n", 585 | " batch_size = 64,\n", 586 | " validation_data = ((x_val1, x_val2), y_val),\n", 587 | " validation_batch_size = 32,\n", 588 | " epochs=4, \n", 589 | " callbacks=[save_weights], \n", 590 | " verbose=1)" 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": 13, 596 | "id": "8e786b68", 597 | "metadata": { 598 | "execution": { 599 | "iopub.execute_input": "2021-11-27T13:42:28.244754Z", 600 | "iopub.status.busy": "2021-11-27T13:42:28.243805Z", 601 | "iopub.status.idle": "2021-11-27T13:43:09.569995Z", 602 | "shell.execute_reply": "2021-11-27T13:43:09.570512Z", 603 | "shell.execute_reply.started": "2021-11-27T13:31:52.063332Z" 604 | }, 605 | "papermill": { 606 | "duration": 41.528106, 607 | "end_time": "2021-11-27T13:43:09.570664", 608 | "exception": false, 609 | "start_time": "2021-11-27T13:42:28.042558", 610 | "status": "completed" 611 | }, 612 | "tags": [] 613 | }, 614 | "outputs": [ 615 | { 616 | "name": "stdout", 617 | "output_type": "stream", 618 | "text": [ 619 | "10108/10108 [==============================] - 23s 2ms/step - loss: 0.3454 - accuracy: 0.8793\n", 620 | "loss on test data 
is 0.345432311296463\n", 621 | "accuracy on test data is 0.8792945742607117\n" 622 | ] 623 | } 624 | ], 625 | "source": [ 626 | "loss, accuracy = model.evaluate((x_test1, x_test2), y_test, batch_size=4, verbose=1)\n", 627 | "\n", 628 | "print('loss on test data is', loss)\n", 629 | "print('accuracy on test data is', accuracy)" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": 14, 635 | "id": "c84c1ff4", 636 | "metadata": { 637 | "execution": { 638 | "iopub.execute_input": "2021-11-27T13:43:10.313909Z", 639 | "iopub.status.busy": "2021-11-27T13:43:10.306261Z", 640 | "iopub.status.idle": "2021-11-27T13:43:11.829214Z", 641 | "shell.execute_reply": "2021-11-27T13:43:11.829678Z", 642 | "shell.execute_reply.started": "2021-11-27T13:33:47.897210Z" 643 | }, 644 | "papermill": { 645 | "duration": 1.948284, 646 | "end_time": "2021-11-27T13:43:11.829824", 647 | "exception": false, 648 | "start_time": "2021-11-27T13:43:09.881540", 649 | "status": "completed" 650 | }, 651 | "tags": [] 652 | }, 653 | "outputs": [ 654 | { 655 | "name": "stdout", 656 | "output_type": "stream", 657 | "text": [ 658 | "F1_score on test is 0.8467144113582108\n" 659 | ] 660 | } 661 | ], 662 | "source": [ 663 | "pred = model.predict((x_test1, x_test2))\n", 664 | "print('F1_score on test is', f1_score(np.argmax(pred, axis=1), np.argmax(y_test, axis=1)))\n" 665 | ] 666 | }, 667 | { 668 | "cell_type": "code", 669 | "execution_count": null, 670 | "id": "9929df87", 671 | "metadata": { 672 | "papermill": { 673 | "duration": 0.308571, 674 | "end_time": "2021-11-27T13:43:12.450411", 675 | "exception": false, 676 | "start_time": "2021-11-27T13:43:12.141840", 677 | "status": "completed" 678 | }, 679 | "tags": [] 680 | }, 681 | "outputs": [], 682 | "source": [] 683 | } 684 | ], 685 | "metadata": { 686 | "kernelspec": { 687 | "display_name": "Python 3", 688 | "language": "python", 689 | "name": "python3" 690 | }, 691 | "language_info": { 692 | "codemirror_mode": { 693 | "name": "ipython", 
694 | "version": 3 695 | }, 696 | "file_extension": ".py", 697 | "mimetype": "text/x-python", 698 | "name": "python", 699 | "nbconvert_exporter": "python", 700 | "pygments_lexer": "ipython3", 701 | "version": "3.8.8" 702 | }, 703 | "papermill": { 704 | "default_parameters": {}, 705 | "duration": 223.597372, 706 | "end_time": "2021-11-27T13:43:15.659790", 707 | "environment_variables": {}, 708 | "exception": null, 709 | "input_path": "__notebook__.ipynb", 710 | "output_path": "__notebook__.ipynb", 711 | "parameters": {}, 712 | "start_time": "2021-11-27T13:39:32.062418", 713 | "version": "2.3.3" 714 | } 715 | }, 716 | "nbformat": 4, 717 | "nbformat_minor": 5 718 | } 719 | -------------------------------------------------------------------------------- /CBOW MLP He initialisation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "f4ff64c0", 7 | "metadata": { 8 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 9 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", 10 | "execution": { 11 | "iopub.execute_input": "2021-11-27T13:47:47.616966Z", 12 | "iopub.status.busy": "2021-11-27T13:47:47.615429Z", 13 | "iopub.status.idle": "2021-11-27T13:47:52.415578Z", 14 | "shell.execute_reply": "2021-11-27T13:47:52.414840Z", 15 | "shell.execute_reply.started": "2021-11-27T13:34:29.669569Z" 16 | }, 17 | "papermill": { 18 | "duration": 4.817962, 19 | "end_time": "2021-11-27T13:47:52.415761", 20 | "exception": false, 21 | "start_time": "2021-11-27T13:47:47.597799", 22 | "status": "completed" 23 | }, 24 | "tags": [] 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "import pandas as pd\n", 29 | "import numpy as np\n", 30 | "from tqdm import tqdm\n", 31 | "import tensorflow as tf\n", 32 | "from gensim.models import KeyedVectors\n", 33 | "import gensim\n", 34 | "import re\n", 35 | "from sklearn.metrics import f1_score\n", 36 | "import matplotlib.pyplot as 
plt" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "id": "4978cce8", 43 | "metadata": { 44 | "execution": { 45 | "iopub.execute_input": "2021-11-27T13:47:52.446265Z", 46 | "iopub.status.busy": "2021-11-27T13:47:52.445619Z", 47 | "iopub.status.idle": "2021-11-27T13:47:53.538577Z", 48 | "shell.execute_reply": "2021-11-27T13:47:53.538101Z", 49 | "shell.execute_reply.started": "2021-11-27T13:10:42.402545Z" 50 | }, 51 | "papermill": { 52 | "duration": 1.109649, 53 | "end_time": "2021-11-27T13:47:53.538734", 54 | "exception": false, 55 | "start_time": "2021-11-27T13:47:52.429085", 56 | "status": "completed" 57 | }, 58 | "tags": [] 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "train = pd.read_csv('../input/quora-ques-pair/train_data.csv/train_data.csv').fillna('')\n", 63 | "val = pd.read_csv('../input/quora-ques-pair/val_data.csv/val_data.csv').fillna('')\n", 64 | "test = pd.read_csv('../input/quora-ques-pair/test_data.csv/test_data.csv').fillna('')" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 3, 70 | "id": "93666f68", 71 | "metadata": { 72 | "execution": { 73 | "iopub.execute_input": "2021-11-27T13:47:53.565394Z", 74 | "iopub.status.busy": "2021-11-27T13:47:53.564877Z", 75 | "iopub.status.idle": "2021-11-27T13:47:53.568521Z", 76 | "shell.execute_reply": "2021-11-27T13:47:53.568123Z", 77 | "shell.execute_reply.started": "2021-11-27T13:10:43.715098Z" 78 | }, 79 | "papermill": { 80 | "duration": 0.018444, 81 | "end_time": "2021-11-27T13:47:53.568628", 82 | "exception": false, 83 | "start_time": "2021-11-27T13:47:53.550184", 84 | "status": "completed" 85 | }, 86 | "tags": [] 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "word2vec_file = '../input/d/sandreds/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin'" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 4, 96 | "id": "69688c16", 97 | "metadata": { 98 | "execution": { 99 | "iopub.execute_input": 
"2021-11-27T13:47:53.598579Z", 100 | "iopub.status.busy": "2021-11-27T13:47:53.597979Z", 101 | "iopub.status.idle": "2021-11-27T13:47:53.611503Z", 102 | "shell.execute_reply": "2021-11-27T13:47:53.611946Z", 103 | "shell.execute_reply.started": "2021-11-27T13:10:43.722268Z" 104 | }, 105 | "papermill": { 106 | "duration": 0.03278, 107 | "end_time": "2021-11-27T13:47:53.612067", 108 | "exception": false, 109 | "start_time": "2021-11-27T13:47:53.579287", 110 | "status": "completed" 111 | }, 112 | "tags": [] 113 | }, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/html": [ 118 | "
\n", 119 | "\n", 132 | "\n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | "
idqid1qid2question1question2is_duplicatequestion1_preprocessedquestion2_preprocessed
020467393885307635If there is a God, where is He!Why is god a \"He\"?0if there is a god , where is he !why is god a `` he '' ?
117716209315628Do you believe that everything happens for a r...Does everything happen for a reason?1do you believe that everything happens for a r...does everything happen for a reason ?
2291767352623413255Will there always be web hosting that will sup...Will there always be web hosting that supports...1will there always be web hosting that will sup...will there always be web hosting that supports...
32037585982467971What is the proof of Indian Army's surgical st...Has India provided any proof of the surgical s...1what is the proof of indian army 's surgical s...has india provided any proof of the surgical s...
4417477532675327What do Indian Muslims think of Modi?What do Indian Muslim think about PM Narendra ...1what do indian muslims think of modi ?what do indian muslim think about pm narendra ...
\n", 204 | "
" 205 | ], 206 | "text/plain": [ 207 | " id qid1 qid2 question1 \\\n", 208 | "0 204673 93885 307635 If there is a God, where is He! \n", 209 | "1 17716 2093 15628 Do you believe that everything happens for a r... \n", 210 | "2 291767 352623 413255 Will there always be web hosting that will sup... \n", 211 | "3 203758 59824 67971 What is the proof of Indian Army's surgical st... \n", 212 | "4 41747 75326 75327 What do Indian Muslims think of Modi? \n", 213 | "\n", 214 | " question2 is_duplicate \\\n", 215 | "0 Why is god a \"He\"? 0 \n", 216 | "1 Does everything happen for a reason? 1 \n", 217 | "2 Will there always be web hosting that supports... 1 \n", 218 | "3 Has India provided any proof of the surgical s... 1 \n", 219 | "4 What do Indian Muslim think about PM Narendra ... 1 \n", 220 | "\n", 221 | " question1_preprocessed \\\n", 222 | "0 if there is a god , where is he ! \n", 223 | "1 do you believe that everything happens for a r... \n", 224 | "2 will there always be web hosting that will sup... \n", 225 | "3 what is the proof of indian army 's surgical s... \n", 226 | "4 what do indian muslims think of modi ? \n", 227 | "\n", 228 | " question2_preprocessed \n", 229 | "0 why is god a `` he '' ? \n", 230 | "1 does everything happen for a reason ? \n", 231 | "2 will there always be web hosting that supports... \n", 232 | "3 has india provided any proof of the surgical s... \n", 233 | "4 what do indian muslim think about pm narendra ... 
# Preview the first rows of the training frame (shown as the cell output in the notebook).
train.head()


def buildVocabulary(reviews):
    """Fit a Keras ``Tokenizer`` on the given texts and return it.

    Args:
        reviews: Iterable of strings used to build the word index.

    Returns:
        The fitted ``tf.keras.preprocessing.text.Tokenizer``. It is created
        with ``lower=False`` (case is preserved) and ``split=' '``.
    """
    tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=False, split=' ')
    tokenizer.fit_on_texts(reviews)
    return tokenizer


def getSequences(reviews, tokenizer, seq_maxlen):
    """Convert texts to fixed-length integer sequences.

    Args:
        reviews: Iterable of strings to encode.
        tokenizer: A fitted tokenizer exposing ``texts_to_sequences``.
        seq_maxlen: Value passed as ``maxlen`` to ``pad_sequences``.

    Returns:
        A NumPy array built from the output of ``pad_sequences`` (called with
        its default padding/truncating settings).
    """
    reviews_seq = tokenizer.texts_to_sequences(reviews)
    return np.array(tf.keras.preprocessing.sequence.pad_sequences(reviews_seq, maxlen=seq_maxlen))


# Pre-trained vectors in binary word2vec format; `word2vec_file` and
# `KeyedVectors` are expected to be defined by an earlier cell.
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_file, binary = True)


def getEmbeddingWeightMatrix(word2idx, embedding_model=None, embedding_dim=300):
    """Build the initial weight matrix for an embedding layer.

    Args:
        word2idx: Mapping ``word -> integer index``. Indices are used as row
            numbers, so they must lie in ``1 .. len(word2idx)``.
        embedding_model: Object supporting ``word in model`` and
            ``model[word]``. Defaults to the module-level ``word2vec_model``.
        embedding_dim: Length of every embedding vector (default 300, the
            value previously hard-coded). NOTE(review): assumed to match the
            vector size of ``embedding_model`` -- confirm when changing it.

    Returns:
        ``np.ndarray`` of shape ``(len(word2idx) + 1, embedding_dim)``.
        Row 0 is never written and stays all-zero. Words missing from
        ``embedding_model`` get a vector drawn with ``np.random.rand``
        (uniform on [0, 1)).
    """
    if embedding_model is None:
        embedding_model = word2vec_model

    embedding_matrix = np.zeros((len(word2idx) + 1, embedding_dim))
    for word, i in tqdm(word2idx.items()):
        if word in embedding_model:
            embedding_matrix[i] = embedding_model[word]
        else:
            # Draw a 1-D vector so it matches the row shape directly; the
            # previous (1, 300) draw relied on assignment broadcasting.
            embedding_matrix[i] = np.random.rand(embedding_dim)
    return embedding_matrix
# Length every question is padded/truncated to; used for all six inputs below
# so the model's two input tensors always agree.
MAX_SEQ_LEN = 200

# One tokenizer is fitted on the raw question text of every split, in the
# order train q1, train q2, val q1, val q2, test q1, test q2.
# NOTE(review): this includes validation and test text in the vocabulary --
# confirm that is intended for the reported scores.
tokenizer = buildVocabulary([
    question
    for frame in (train, val, test)
    for column in ('question1', 'question2')
    for question in frame[column].tolist()
])
# +1 because the embedding matrix has one more row than there are words.
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

# Integer-encoded question pairs and one-hot labels for each split.
x_train1 = getSequences(train['question1'], tokenizer, MAX_SEQ_LEN)
x_train2 = getSequences(train['question2'], tokenizer, MAX_SEQ_LEN)
y_train = tf.keras.utils.to_categorical(train['is_duplicate'])

x_val1 = getSequences(val['question1'], tokenizer, MAX_SEQ_LEN)
x_val2 = getSequences(val['question2'], tokenizer, MAX_SEQ_LEN)
y_val = tf.keras.utils.to_categorical(val['is_duplicate'])

x_test1 = getSequences(test['question1'], tokenizer, MAX_SEQ_LEN)
x_test2 = getSequences(test['question2'], tokenizer, MAX_SEQ_LEN)
y_test = tf.keras.utils.to_categorical(test['is_duplicate'])
# Disabled alternative that loaded GloVe vectors instead of word2vec:
#embedding_vectors = loadGloveWordEmbeddings()
#print(len(embedding_vectors))

# Initial embedding weights, one row per tokenizer index (plus row 0).
embedding_weight_matrix = getEmbeddingWeightMatrix(word2idx=tokenizer.word_index)
# Report (rows, embedding dimension) as a sanity check.
print(np.shape(embedding_weight_matrix))
must be at least one NUMA node, so returning NUMA node zero\n", 416 | "2021-11-27 13:49:10.127095: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 417 | "2021-11-27 13:49:10.127865: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 418 | "2021-11-27 13:49:10.129230: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA\n", 419 | "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", 420 | "2021-11-27 13:49:10.130458: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 421 | "2021-11-27 13:49:10.131136: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 422 | "2021-11-27 13:49:10.131767: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 423 | "2021-11-27 13:49:11.945849: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n", 424 | "2021-11-27 13:49:11.946562: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at 
# CBOW-style MLP over a pair of questions:
#   embed both questions -> combine (sum, difference, product) ->
#   sum over the token axis -> three Dense+Dropout blocks -> 2-way softmax.
he_initializer = tf.keras.initializers.HeUniform()
inp1 = tf.keras.Input(shape=(x_train1.shape[1],))
inp2 = tf.keras.Input(shape=(x_train2.shape[1],))

# Take the embedding layer sizes from the weight matrix itself so the layers
# cannot drift out of sync with the vocabulary (previously hard-coded as
# 67043 x 300 with input_length=200).
vocab_rows, embedding_dim = embedding_weight_matrix.shape

# Two separate (not shared) trainable embedding layers, both initialised from
# the same pre-trained matrix.
inner1 = tf.keras.layers.Embedding(input_dim=vocab_rows, output_dim=embedding_dim,
                                   input_length=x_train1.shape[1],
                                   weights=[embedding_weight_matrix], trainable=True)(inp1)
inner2 = tf.keras.layers.Embedding(input_dim=vocab_rows, output_dim=embedding_dim,
                                   input_length=x_train2.shape[1],
                                   weights=[embedding_weight_matrix], trainable=True)(inp2)

# Element-wise sum, difference and product of the two embedded questions,
# concatenated along the feature axis.
inner = tf.keras.layers.concatenate([inner1+inner2, inner1-inner2, tf.math.multiply(inner1, inner2)], axis=-1)

# Collapse the token axis (axis=1) by summation, leaving one vector per pair.
inner = tf.keras.backend.sum(inner, axis=1, keepdims=False)

# Hidden stack: 300 -> 200 -> 100 units, each L2-regularised, He-initialised
# and followed by 20% dropout.
for units in (300, 200, 100):
    inner = tf.keras.layers.Dense(units, activation = 'relu', kernel_regularizer = tf.keras.regularizers.l2(l2=0.01), kernel_initializer = he_initializer)(inner)
    inner = tf.keras.layers.Dropout(0.2)(inner)

# Two output units to match the one-hot labels.
output = tf.keras.layers.Dense(2, activation='softmax')(inner)

model = tf.keras.Model(inputs = [inp1, inp2], outputs = output)
"tf.__operators__.add (TFOpLambd (None, 200, 300) 0 embedding[0][0] \n", 493 | " embedding_1[0][0] \n", 494 | "__________________________________________________________________________________________________\n", 495 | "tf.math.subtract (TFOpLambda) (None, 200, 300) 0 embedding[0][0] \n", 496 | " embedding_1[0][0] \n", 497 | "__________________________________________________________________________________________________\n", 498 | "tf.math.multiply (TFOpLambda) (None, 200, 300) 0 embedding[0][0] \n", 499 | " embedding_1[0][0] \n", 500 | "__________________________________________________________________________________________________\n", 501 | "concatenate (Concatenate) (None, 200, 900) 0 tf.__operators__.add[0][0] \n", 502 | " tf.math.subtract[0][0] \n", 503 | " tf.math.multiply[0][0] \n", 504 | "__________________________________________________________________________________________________\n", 505 | "tf.math.reduce_sum (TFOpLambda) (None, 900) 0 concatenate[0][0] \n", 506 | "__________________________________________________________________________________________________\n", 507 | "dense (Dense) (None, 300) 270300 tf.math.reduce_sum[0][0] \n", 508 | "__________________________________________________________________________________________________\n", 509 | "dropout (Dropout) (None, 300) 0 dense[0][0] \n", 510 | "__________________________________________________________________________________________________\n", 511 | "dense_1 (Dense) (None, 200) 60200 dropout[0][0] \n", 512 | "__________________________________________________________________________________________________\n", 513 | "dropout_1 (Dropout) (None, 200) 0 dense_1[0][0] \n", 514 | "__________________________________________________________________________________________________\n", 515 | "dense_2 (Dense) (None, 100) 20100 dropout_1[0][0] \n", 516 | "__________________________________________________________________________________________________\n", 517 | "dropout_2 (Dropout) (None, 100) 
# Configure training: Adam optimiser, categorical cross-entropy loss
# (labels are one-hot), accuracy reported as the metric.
model.compile(
    optimizer = "adam",
    loss = 'categorical_crossentropy',
    metrics = ['accuracy'],
)

# Print the layer-by-layer architecture and parameter counts.
model.summary()
# Filename pattern for the per-epoch checkpoint callback that is currently
# disabled (see the commented-out block below).
checkpoint_filepath = 'weights.best.{epoch:01d}.hdf5'
#model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,
#verbose = 1,
#monitor = 'val_loss',
#save_best_only = False)

# Active checkpoint: writes 'cbow_mlp.h5' whenever val_loss improves.
save_weights = tf.keras.callbacks.ModelCheckpoint('cbow_mlp.h5', monitor='val_loss', save_best_only=True)

# Train for 4 epochs, validating on the held-out validation pairs each epoch.
history = model.fit(
    x = (x_train1, x_train2),
    y = y_train,
    batch_size = 64,
    epochs = 4,
    verbose = 1,
    callbacks = [save_weights],
    validation_data = ((x_val1, x_val2), y_val),
    validation_batch_size = 64,
)
# Evaluate the in-memory model on the test pairs.
# NOTE(review): this scores `model` in its current state; the 'cbow_mlp.h5'
# checkpoint written during training is not reloaded here -- confirm which
# set of weights the reported numbers should come from.
loss, accuracy = model.evaluate((x_test1, x_test2), y_test, batch_size=4, verbose=1)

print('loss on test data is', loss)
print('accuracy on test data is', accuracy)

# Class predictions: argmax over the two softmax outputs / one-hot labels.
pred = model.predict((x_test1, x_test2))
y_true = np.argmax(y_test, axis=1)
y_pred = np.argmax(pred, axis=1)

# f1_score expects (y_true, y_pred); the arguments were previously swapped.
print('F1_score on test is', f1_score(y_true, y_pred))
"language": "python", 695 | "name": "python3" 696 | }, 697 | "language_info": { 698 | "codemirror_mode": { 699 | "name": "ipython", 700 | "version": 3 701 | }, 702 | "file_extension": ".py", 703 | "mimetype": "text/x-python", 704 | "name": "python", 705 | "nbconvert_exporter": "python", 706 | "pygments_lexer": "ipython3", 707 | "version": "3.8.8" 708 | }, 709 | "papermill": { 710 | "default_parameters": {}, 711 | "duration": 226.464045, 712 | "end_time": "2021-11-27T13:51:26.695502", 713 | "environment_variables": {}, 714 | "exception": null, 715 | "input_path": "__notebook__.ipynb", 716 | "output_path": "__notebook__.ipynb", 717 | "parameters": {}, 718 | "start_time": "2021-11-27T13:47:40.231457", 719 | "version": "2.3.3" 720 | } 721 | }, 722 | "nbformat": 4, 723 | "nbformat_minor": 5 724 | } 725 | --------------------------------------------------------------------------------