├── README.md ├── bow-with-basic-features.ipynb ├── bow-with-preprocessing-and-advanced-features.ipynb ├── initial_EDA.ipynb ├── only-bow.ipynb └── streamlit-app ├── Procfile ├── app.py ├── helper.py ├── readme.txt ├── requirements.txt └── setup.sh /README.md: -------------------------------------------------------------------------------- 1 | # quora-question-pairs 2 | An NLP project to determine whether two given questions are semantically the same or not. 3 | 4 | Dataset Link - https://www.kaggle.com/c/quora-question-pairs 5 | -------------------------------------------------------------------------------- /initial_EDA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 21, 6 | "id": "600ccbe8", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "import seaborn as sns\n", 13 | "import matplotlib.pyplot as plt" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 22, 19 | "id": "60425156", 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "data": { 24 | "text/plain": [ 25 | "(404290, 6)" 26 | ] 27 | }, 28 | "execution_count": 22, 29 | "metadata": {}, 30 | "output_type": "execute_result" 31 | } 32 | ], 33 | "source": [ 34 | "df = pd.read_csv(\"train.csv\")\n", 35 | "df.shape" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 25, 41 | "id": "5a3d86b9", 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/html": [ 47 | "
\n", 48 | "\n", 61 | "\n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | "
idqid1qid2question1question2is_duplicate
183268183268280288280289How did monkeys get to South America from Afri...I fucking hate my life, I'm black, poor nd liv...0
112930112930184684132960What is the best photo ever taken in your life?What is the best picture taken by you?1
300075300075348955422827What are some things new employees should know...What are some things new employees should know...0
223993223993296218184831Why do the British care about the Royal Family?Why has the UK retained the monarchy?0
171389171389177374264819Which is the most inspiring book to read?What is the most inspiring book you have ever ...0
357002357002486390486391Why can't I forget my girlfriend?Why can't I forget my first girlfriend?1
348760348760477337477338Which is greater rise in 1 degree Celsius or r...If I sit and hold 100 grams of ice at zero deg...0
119950119950194645194646What are some ways to amplify linear motion an...How do you amplify linear motion?1
209885209885314294314295How should one prepare for IAS when he is in h...How can I prepare for IAS from my first year o...1
23430234304388543886In the initial days of a SaaS startup, when th...I have to manage the entire operations and pro...0
\n", 166 | "
" 167 | ], 168 | "text/plain": [ 169 | " id qid1 qid2 \\\n", 170 | "183268 183268 280288 280289 \n", 171 | "112930 112930 184684 132960 \n", 172 | "300075 300075 348955 422827 \n", 173 | "223993 223993 296218 184831 \n", 174 | "171389 171389 177374 264819 \n", 175 | "357002 357002 486390 486391 \n", 176 | "348760 348760 477337 477338 \n", 177 | "119950 119950 194645 194646 \n", 178 | "209885 209885 314294 314295 \n", 179 | "23430 23430 43885 43886 \n", 180 | "\n", 181 | " question1 \\\n", 182 | "183268 How did monkeys get to South America from Afri... \n", 183 | "112930 What is the best photo ever taken in your life? \n", 184 | "300075 What are some things new employees should know... \n", 185 | "223993 Why do the British care about the Royal Family? \n", 186 | "171389 Which is the most inspiring book to read? \n", 187 | "357002 Why can't I forget my girlfriend? \n", 188 | "348760 Which is greater rise in 1 degree Celsius or r... \n", 189 | "119950 What are some ways to amplify linear motion an... \n", 190 | "209885 How should one prepare for IAS when he is in h... \n", 191 | "23430 In the initial days of a SaaS startup, when th... \n", 192 | "\n", 193 | " question2 is_duplicate \n", 194 | "183268 I fucking hate my life, I'm black, poor nd liv... 0 \n", 195 | "112930 What is the best picture taken by you? 1 \n", 196 | "300075 What are some things new employees should know... 0 \n", 197 | "223993 Why has the UK retained the monarchy? 0 \n", 198 | "171389 What is the most inspiring book you have ever ... 0 \n", 199 | "357002 Why can't I forget my first girlfriend? 1 \n", 200 | "348760 If I sit and hold 100 grams of ice at zero deg... 0 \n", 201 | "119950 How do you amplify linear motion? 1 \n", 202 | "209885 How can I prepare for IAS from my first year o... 1 \n", 203 | "23430 I have to manage the entire operations and pro... 
0 " 204 | ] 205 | }, 206 | "execution_count": 25, 207 | "metadata": {}, 208 | "output_type": "execute_result" 209 | } 210 | ], 211 | "source": [ 212 | "df.sample(10)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 4, 218 | "id": "37b00141", 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "name": "stdout", 223 | "output_type": "stream", 224 | "text": [ 225 | "\n", 226 | "RangeIndex: 404290 entries, 0 to 404289\n", 227 | "Data columns (total 6 columns):\n", 228 | " # Column Non-Null Count Dtype \n", 229 | "--- ------ -------------- ----- \n", 230 | " 0 id 404290 non-null int64 \n", 231 | " 1 qid1 404290 non-null int64 \n", 232 | " 2 qid2 404290 non-null int64 \n", 233 | " 3 question1 404289 non-null object\n", 234 | " 4 question2 404288 non-null object\n", 235 | " 5 is_duplicate 404290 non-null int64 \n", 236 | "dtypes: int64(4), object(2)\n", 237 | "memory usage: 18.5+ MB\n" 238 | ] 239 | } 240 | ], 241 | "source": [ 242 | "df.info()" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 5, 248 | "id": "c5b82789", 249 | "metadata": {}, 250 | "outputs": [ 251 | { 252 | "data": { 253 | "text/plain": [ 254 | "id 0\n", 255 | "qid1 0\n", 256 | "qid2 0\n", 257 | "question1 1\n", 258 | "question2 2\n", 259 | "is_duplicate 0\n", 260 | "dtype: int64" 261 | ] 262 | }, 263 | "execution_count": 5, 264 | "metadata": {}, 265 | "output_type": "execute_result" 266 | } 267 | ], 268 | "source": [ 269 | "# missing values\n", 270 | "df.isnull().sum()" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 6, 276 | "id": "e704abaf", 277 | "metadata": {}, 278 | "outputs": [ 279 | { 280 | "data": { 281 | "text/plain": [ 282 | "0" 283 | ] 284 | }, 285 | "execution_count": 6, 286 | "metadata": {}, 287 | "output_type": "execute_result" 288 | } 289 | ], 290 | "source": [ 291 | "# duplicate rows\n", 292 | "df.duplicated().sum()" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 17, 298 
| "id": "3f9bd6af", 299 | "metadata": {}, 300 | "outputs": [ 301 | { 302 | "name": "stdout", 303 | "output_type": "stream", 304 | "text": [ 305 | "0 255027\n", 306 | "1 149263\n", 307 | "Name: is_duplicate, dtype: int64\n", 308 | "0 63.080215\n", 309 | "1 36.919785\n", 310 | "Name: is_duplicate, dtype: float64\n" 311 | ] 312 | }, 313 | { 314 | "data": { 315 | "text/plain": [ 316 | "" 317 | ] 318 | }, 319 | "execution_count": 17, 320 | "metadata": {}, 321 | "output_type": "execute_result" 322 | }, 323 | { 324 | "data": { 325 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYoAAAD1CAYAAABOfbKwAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAQMElEQVR4nO3db6je5X3H8fdnphNZq0Q9is2fRWbKpsIshij0SUcgydoHWlB2fFDDFkgRhRb6YNonFiWgsFYQpmAxGKWrBttiWGtdph2lzKnHItXoXA7Vapqg6RKse6Bb0u8e3Ndp75zeuc7JSXJOYt4v+HH/7u/vuq5z3XDkk991/e5jqgpJko7kjxZ6ApKkk5tBIUnqMigkSV0GhSSpy6CQJHUZFJKkrkULPYHj7fzzz68VK1Ys9DQk6ZTy4osv/rqqxkZd+8gFxYoVK5iYmFjoaUjSKSXJL490zaUnSVKXQSFJ6jIoJEldBoUkqcugkCR1GRSSpC6DQpLUZVBIkro+cl+4O1WsuPUHCz2Fj5Q37/r8Qk9B+sia8Y4iybIkP07yWpKdSb7c6l9P8qskL7Xjc0N9bksymeT1JOuG6lcmeblduzdJWv3MJI+1+nNJVgz12ZBkVzs2HNdPL0ma0WzuKA4CX62qnyX5BPBikh3t2j1V9Q/DjZNcCowDlwGfBP41yaeq6hBwP7AJ+A/gh8B64ElgI3Cgqi5JMg7cDfxNknOB24FVQLWfvb2qDhzbx5YkzdaMdxRVtbeqftbO3wdeA5Z0ulwDPFpVH1bVG8AksDrJRcDZVfVsDf5H3Q8D1w712drOHwfWtLuNdcCOqtrfwmEHg3CRJM2To9rMbktCnwaea6Vbkvw8yZYki1ttCfD2ULfdrbaknU+vH9anqg4C7wHndcaaPq9NSSaSTOzbt+9oPpIkaQazDookHwe+C3ylqn7DYBnpz4ArgL3AN6aajuhenfpc+/y+UPVAVa2qqlVjYyP/Sq4kaY5mFRRJPsYgJL5dVd8DqKp3qupQVf0W+BawujXfDSwb6r4U2NPqS0fUD+uTZBFwDrC/M5YkaZ7M5qmnAA8Cr1XVN4fqFw01+wLwSjvfDoy3J5kuBlYCz1fVXuD9JFe3MW8EnhjqM/VE03XAM20f4ylgbZLFbWlrbatJkubJbJ56+gzwReDlJC+12teAG5JcwWAp6E3gSwBVtTPJNuBVBk9M3dyeeAK4CXgIOIvB005PtvqDwCNJJhncSYy3sfYnuRN4obW7o6r2z+WDSpLmZsagqKqfMnqv4IedPpuBzSPqE8DlI+ofANcfYawtwJaZ5ilJOjH8Ex6SpC6DQpLUZVBIkroMCklSl0EhSeoyKCRJXQaFJKnLoJAkdRkUkqQug0KS1GVQSJK6DApJUpdBIUnqMigkSV0GhSSpy6CQJHUZFJKkLoNCktRlUEiSugwKSVKXQSFJ6jIoJE
ldBoUkqcugkCR1GRSSpC6DQpLUZVBIkroMCklSl0EhSeoyKCRJXQaFJKlrxqBIsizJj5O8lmRnki+3+rlJdiTZ1V4XD/W5LclkkteTrBuqX5nk5Xbt3iRp9TOTPNbqzyVZMdRnQ/sZu5JsOK6fXpI0o9ncURwEvlpVfwFcDdyc5FLgVuDpqloJPN3e066NA5cB64H7kpzRxrof2ASsbMf6Vt8IHKiqS4B7gLvbWOcCtwNXAauB24cDSZJ04s0YFFW1t6p+1s7fB14DlgDXAFtbs63Ate38GuDRqvqwqt4AJoHVSS4Czq6qZ6uqgIen9Zka63FgTbvbWAfsqKr9VXUA2MHvw0WSNA+Oao+iLQl9GngOuLCq9sIgTIALWrMlwNtD3Xa32pJ2Pr1+WJ+qOgi8B5zXGUuSNE9mHRRJPg58F/hKVf2m13RErTr1ufYZntumJBNJJvbt29eZmiTpaM0qKJJ8jEFIfLuqvtfK77TlJNrru62+G1g21H0psKfVl46oH9YnySLgHGB/Z6zDVNUDVbWqqlaNjY3N5iNJkmZpNk89BXgQeK2qvjl0aTsw9RTSBuCJofp4e5LpYgab1s+35an3k1zdxrxxWp+psa4Dnmn7GE8Ba5MsbpvYa1tNkjRPFs2izWeALwIvJ3mp1b4G3AVsS7IReAu4HqCqdibZBrzK4Impm6vqUOt3E/AQcBbwZDtgEESPJJlkcCcx3sban+RO4IXW7o6q2j+3jypJmosZg6KqfsrovQKANUfosxnYPKI+AVw+ov4BLWhGXNsCbJlpnpKkE8NvZkuSugwKSVKXQSFJ6jIoJEldBoUkqcugkCR1GRSSpC6DQpLUZVBIkroMCklSl0EhSeoyKCRJXQaFJKnLoJAkdRkUkqQug0KS1GVQSJK6DApJUpdBIUnqMigkSV0GhSSpy6CQJHUZFJKkLoNCktRlUEiSugwKSVKXQSFJ6jIoJEldBoUkqcugkCR1GRSSpC6DQpLUNWNQJNmS5N0krwzVvp7kV0leasfnhq7dlmQyyetJ1g3Vr0zycrt2b5K0+plJHmv155KsGOqzIcmudmw4bp9akjRrs7mjeAhYP6J+T1Vd0Y4fAiS5FBgHLmt97ktyRmt/P7AJWNmOqTE3Ageq6hLgHuDuNta5wO3AVcBq4PYki4/6E0qSjsmMQVFVPwH2z3K8a4BHq+rDqnoDmARWJ7kIOLuqnq2qAh4Grh3qs7WdPw6saXcb64AdVbW/qg4AOxgdWJKkE+hY9ihuSfLztjQ19S/9JcDbQ212t9qSdj69flifqjoIvAec1xlLkjSPFs2x3/3AnUC1128AfwdkRNvq1Jljn8Mk2cRgWYvly5f35i1pFlbc+oOFnsJHxpt3fX6hp3DM5nRHUVXvVNWhqvot8C0Gewgw+Ff/sqGmS4E9rb50RP2wPkkWAecwWOo60lij5vNAVa2qqlVjY2Nz+UiSpCOYU1C0PYcpXwCmnojaDoy3J5kuZrBp/XxV7QXeT3J123+4EXhiqM/UE03XAc+0fYyngLVJFrelrbWtJkmaRzMuPSX5DvBZ4Pwkuxk8ifTZJFcwWAp6E/gSQFXtTLINeBU4CNxcVYfaUDcxeILqLODJdgA8CDySZJLBncR4G2t/kjuBF1q7O6pqtpvqkqTjZMagqKobRpQf7LTfDGweUZ8ALh9R/wC4/ghjbQG2zDRHSdKJ4zezJUldBoUkqcugkCR1GRSSpC6DQpLUZVBIkroMCklSl0EhSeoyKCRJXQaFJKnLoJAkdRkUkqQug0KS1GVQSJK6DApJUpdBIUnqMigkSV0GhSSpy6CQJHUZFJKkLoNCktRlUEiSugwKSVKXQSFJ6jIoJEldBoUkqcugkCR1GRSSpC6DQpLUZVBIkroMCklS14xBkWRLkneTvDJUOzfJjiS72uvioWu3JZlM8nqSdUP1K5O83K7dmyStfmaSx1r9uSQrhvpsaD9jV5INx+1TS5JmbTZ3FA8B66fVbg
WerqqVwNPtPUkuBcaBy1qf+5Kc0frcD2wCVrZjasyNwIGqugS4B7i7jXUucDtwFbAauH04kCRJ82PGoKiqnwD7p5WvAba2863AtUP1R6vqw6p6A5gEVie5CDi7qp6tqgIentZnaqzHgTXtbmMdsKOq9lfVAWAHfxhYkqQTbK57FBdW1V6A9npBqy8B3h5qt7vVlrTz6fXD+lTVQeA94LzOWJKkeXS8N7Mzolad+lz7HP5Dk01JJpJM7Nu3b1YTlSTNzlyD4p22nER7fbfVdwPLhtotBfa0+tIR9cP6JFkEnMNgqetIY/2BqnqgqlZV1aqxsbE5fiRJ0ihzDYrtwNRTSBuAJ4bq4+1JposZbFo/35an3k9yddt/uHFan6mxrgOeafsYTwFrkyxum9hrW02SNI8WzdQgyXeAzwLnJ9nN4Emku4BtSTYCbwHXA1TVziTbgFeBg8DNVXWoDXUTgyeozgKebAfAg8AjSSYZ3EmMt7H2J7kTeKG1u6Oqpm+qS5JOsBmDoqpuOMKlNUdovxnYPKI+AVw+ov4BLWhGXNsCbJlpjpKkE8dvZkuSugwKSVKXQSFJ6jIoJEldBoUkqcugkCR1GRSSpC6DQpLUZVBIkroMCklSl0EhSeoyKCRJXQaFJKnLoJAkdRkUkqQug0KS1GVQSJK6DApJUpdBIUnqMigkSV0GhSSpy6CQJHUZFJKkLoNCktRlUEiSugwKSVKXQSFJ6jIoJEldBoUkqcugkCR1GRSSpC6DQpLUdUxBkeTNJC8neSnJRKudm2RHkl3tdfFQ+9uSTCZ5Pcm6ofqVbZzJJPcmSaufmeSxVn8uyYpjma8k6egdjzuKv6qqK6pqVXt/K/B0Va0Enm7vSXIpMA5cBqwH7ktyRutzP7AJWNmO9a2+EThQVZcA9wB3H4f5SpKOwolYeroG2NrOtwLXDtUfraoPq+oNYBJYneQi4OyqeraqCnh4Wp+psR4H1kzdbUiS5sexBkUB/5LkxSSbWu3CqtoL0F4vaPUlwNtDfXe32pJ2Pr1+WJ+qOgi8B5x3jHOWJB2FRcfY/zNVtSfJBcCOJP/ZaTvqTqA69V6fwwcehNQmgOXLl/dnLEk6Ksd0R1FVe9rru8D3gdXAO205ifb6bmu+G1g21H0psKfVl46oH9YnySLgHGD/iHk8UFWrqmrV2NjYsXwkSdI0cw6KJH+S5BNT58Ba4BVgO7ChNdsAPNHOtwPj7UmmixlsWj/flqfeT3J123+4cVqfqbGuA55p+xiSpHlyLEtPFwLfb3vLi4B/qqofJXkB2JZkI/AWcD1AVe1Msg14FTgI3FxVh9pYNwEPAWcBT7YD4EHgkSSTDO4kxo9hvpKkOZhzUFTVL4C/HFH/b2DNEfpsBjaPqE8Al4+of0ALGknSwvCb2ZKkLoNCktRlUEiSugwKSVKXQSFJ6jIoJEldBoUkqcugkCR1GRSSpC6DQpLUZVBIkroMCklSl0EhSeoyKCRJXQaFJKnLoJAkdRkUkqQug0KS1GVQSJK6DApJUpdBIUnqMigkSV0GhSSpy6CQJHUZFJKkLoNCktRlUEiSugwKSVKXQSFJ6jIoJEldBoUkqcugkCR1nRJBkWR9kteTTCa5daHnI0mnk5M+KJKcAfwj8NfApcANSS5d2FlJ0unjpA8KYDUwWVW/qKr/BR4FrlngOUnSaWPRQk9gFpYAbw+93w1cNdwgySZgU3v7P0len6e5nQ7OB3690JOYSe5e6BlogZz0v5+n0O/mnx7pwqkQFBlRq8PeVD0APDA/0zm9JJmoqlULPQ9pFH8/58epsPS0G1g29H4psGeB5iJJp51TISheAFYmuTjJHwPjwPYFnpMknTZO+qWnqjqY5BbgKeAMYEtV7VzgaZ1OXNLTyczfz3mQqpq5lSTptHUqLD1JkhaQQSFJ6jIoJEldJ/1mtuZXkj9n8M33JQy+r7IH2F5Vry3oxCQtGO8o9DtJ/p7Bn0gJ8DyDR5MDfMc/xqiTWZ
K/Xeg5fJT51JN+J8l/AZdV1f9Nq/8xsLOqVi7MzKS+JG9V1fKFnsdHlUtPGvZb4JPAL6fVL2rXpAWT5OdHugRcOJ9zOd0YFBr2FeDpJLv4/R9iXA5cAtyyUJOSmguBdcCBafUA/z7/0zl9GBT6nar6UZJPMfjT7ksY/Ae4G3ihqg4t6OQk+Gfg41X10vQLSf5t3mdzGnGPQpLU5VNPkqQug0KS1GVQSJK6DApJUpdBIUnq+n/InXmx1HDi4wAAAABJRU5ErkJggg==\n", 326 | "text/plain": [ 327 | "
" 328 | ] 329 | }, 330 | "metadata": { 331 | "needs_background": "light" 332 | }, 333 | "output_type": "display_data" 334 | } 335 | ], 336 | "source": [ 337 | "# Distribution of duplicate and non-duplicate questions\n", 338 | "\n", 339 | "print(df['is_duplicate'].value_counts())\n", 340 | "print((df['is_duplicate'].value_counts()/df['is_duplicate'].count())*100)\n", 341 | "df['is_duplicate'].value_counts().plot(kind='bar')" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 19, 347 | "id": "788d2d08", 348 | "metadata": {}, 349 | "outputs": [ 350 | { 351 | "name": "stdout", 352 | "output_type": "stream", 353 | "text": [ 354 | "Number of unique questions 537933\n", 355 | "Number of questions getting repeated 111780\n" 356 | ] 357 | } 358 | ], 359 | "source": [ 360 | "# Repeated questions\n", 361 | "\n", 362 | "qid = pd.Series(df['qid1'].tolist() + df['qid2'].tolist())\n", 363 | "print('Number of unique questions',np.unique(qid).shape[0])\n", 364 | "x = qid.value_counts()>1\n", 365 | "print('Number of questions getting repeated',x[x].shape[0])" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 20, 371 | "id": "2fa5bb83", 372 | "metadata": {}, 373 | "outputs": [ 374 | { 375 | "data": { 376 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXcAAAD4CAYAAAAXUaZHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAQoklEQVR4nO3df6zdd13H8efLzjtk6PixobM/bOGOxf6jwHX88EcmTGyBbopE10ACWtdgMuKPqHSZMeEvBxpjiJNZdc7A7NLMCS0rmYriMFlg3WTYUiplDHbZoJ2YGdFkTN7+cU7Zydm97bnnR8+5n/t8JE3v93PO+X5fve1593Pf38/5flNVSJLa8h3TDiBJGj+LuyQ1yOIuSQ2yuEtSgyzuktSg86YdAOCiiy6qzZs3TzuGJK0q999//+NVdfFSj021uCfZAeyYn5/n8OHD04wiSatOki8t99hU2zJVdbCqdl944YXTjCFJzZlqcU+yI8neJ554YpoxJKk5ztwlqUHO3CWpQc7cJalBrnOXpAbZlpGkBtmWkaQGrfq2zOY9d7F5z13TjiFJM8W2jCQ1yLaMJDVo1bdlJEnPZHGXpAZZ3CWpQZ5QlaQGeUJVkhpkW0aSGmRxl6QGWdwlqUEWd0lqkKtlJKlBrpaRpAbZlpGkBlncJalBFndJapDFXZIaZHGXpAaNvbgnuSLJJ5LcnOSKce9fknR2AxX3JLckOZnkSN/4tiTHk5xIsqc7XMB/A88CFscbV5I0iEFn7rcC23oHkqwDbgK2A1uBnUm2Ap+oqu3Au4B3jy+qJGlQAxX3qroH+Hrf8OXAiap6qKqeBG4Hrq6qb3Uf/0/g/OX2mWR3ksNJDp86dWqI6JKk5YzSc18PPNKzvQisT/KmJH8KfAD44+VeXFV7q2qhqhYuvvjiEWJIkvqdN8Jrs8RYVdWdwJ0D7SDZAeyYn58fIYYkqd8oM/dFYGPP9gbg0ZXswGvLSNJkjFLc7wMuTbIlyRxwDXBgJTvwqpCSNBmDLoXcB9wLXJZkMcmuqnoKuA64GzgG7K+qoys5uDN3SZqMgXruVbVzmfFDwKFhD27PXZImo5nruW/ecxeb99w1hlSStPp5JyZJalAzM3dJ0tO8KqQkNci2jCQ1yLaMJDXItowkNci2jCQ1yLaMJDXItowkNcjiLkkNsrhLUoM8oSpJDfKEqiQ1yLaMJDXI4i5JDbK4S1KDmivu3rRDklwtI0lNcrWMJDWoubaMJMniLklNsrhLUoMs7pLUIIu7JDVoIsU9yQVJ7k/yxknsX5J0ZgMV9yS3JDmZ5Ejf+LYkx5OcSLKn56F3AfvHGVSSNLhBZ+63Att6B5KsA24CtgNbgZ1Jtia5Evgs8LUx5pQkrcB5gzypqu5Jsrlv+HLgRFU9BJDkduBq4DnABXQK/v8mOVRV3+rfZ5LdwG6ATZs2Df0HkCQ900DFfRnrgUd6theBV1TVdQBJ3g48vlRhB6iqvcBegIWFhRohx5JOX1/m4RvfMO5dS9LMG6W4Z4mxbxfpqrr1rDtIdgA75ufnR4ghSeo3ymqZRWBjz/YG4NGV7MBry0jSZIxS3O8DLk2yJckccA1wYCU78KqQkjQZgy6F3AfcC1yWZDHJrqp6CrgOuBs4BuyvqqMrObgzd0majEFXy+xcZvwQcGjYg9tzl6TJ8HruktQg78QkSQ1y5i5JDfKqkJLUoObbMpv33PXtT6tK0lphW0aSGmRbRpIa1HxbRpLWItsyktQg2zKS1CCLuyQ1yOIuSQ3yhKokNWjNnFD1w0yS1hLbMpLUIIu7JDXI4i5JDbK4S1KDXC0jSQ1aM6tlTnPVjKS1wLaMJDXI4i5JDbK4S1KDLO6S1KA1W9w9qSqpZWMv7kl+MMnNSe5I8ivj3r8k6ewGKu5JbklyMsmRvvFtSY4nOZFkD0BVHauqdwA/DyyMP7Ik6WwGnbnfCmzrHUiyDrgJ2A5sBXYm2dp97CrgX4CPjS2pJGlgAxX3qro
H+Hrf8OXAiap6qKqeBG4Hru4+/0BVvRp4yzjDSpIGc94Ir10PPNKzvQi8IskVwJuA84FDy704yW5gN8CmTZtGiCFJ6jdKcc8SY1VVHwc+frYXV9XeJI8BO+bm5l4+Qo6hnV4x8/CNb5jG4SVpYkZZLbMIbOzZ3gA8upIdTOPaMpK0FoxS3O8DLk2yJckccA1wYCU78KqQkjQZgy6F3AfcC1yWZDHJrqp6CrgOuBs4BuyvqqMrObgzd0majIF67lW1c5nxQ5zhpOnZJNkB7Jifnx92F2Nh711Sa9bc9dwlaS3wTkyS1CBn7j28S5OkVqzZq0JKUstsy0hSg2zLLMH2jKTVzraMJDXI4i5JDbLnfga2ZyStVvbcJalBtmUkqUEW9wHYnpG02thzl6QG2XOXpAbZllkB2zOSVguLuyQ1yOIuSQ2yuEtSg1wtMwR775Jm3UD3UJ2UqjoIHFxYWLh2mjmG1V/gvQerpFlhW0aSGmRxl6QGTbUt05reNo0tGknT5MxdkhpkcZekBk2kuCf5mSR/luTDSV43iWNIkpY3cHFPckuSk0mO9I1vS3I8yYkkewCq6kNVdS3wduAXxppYknRWK5m53wps6x1Isg64CdgObAV2Jtna85Tf6T6+5vhBJ0nTNPBqmaq6J8nmvuHLgRNV9RBAktuBq5McA24EPlpVD4wr7GrkB50kTcOoPff1wCM924vdsXcCVwJvTvKOpV6YZHeSw0kOnzp1asQYkqReo65zzxJjVVXvA953phdW1d4kjwE75ubmXj5iDklSj1Fn7ovAxp7tDcCjg754Ld6JyV68pHNh1OJ+H3Bpki1J5oBrgAODvni1XhVSkmbdSpZC7gPuBS5LsphkV1U9BVwH3A0cA/ZX1dFB97kWZ+6nOYOXNEkrWS2zc5nxQ8ChYQ6eZAewY35+fpiXS5KWMdXLD6zlmXs/Z/KSxsk7MUlSg7wT05Q5W5c0CV4VUpIaZFtGkhrkCdUZ44lVSeNgW0aSGmRxl6QG2XNfhWzdSDobl0KuIhZ0SYOaanHXYCzqklbKnvuMsqBLGsVUZ+5eOOzMLPCShuU6d0lqkG0ZSWqQxV2SGmRxl6QGWdwlqUF+QnUVG+WTqn7KVWqbq2UkqUG2ZRrgLFxSP4u7JDXI4i5JDbK4r3G2dKQ2WdwlqUEWd0lq0NivCpnkRcANwIVV9eZx719n199mefjGN0wpiaRpGWjmnuSWJCeTHOkb35bkeJITSfYAVNVDVbVrEmF1ZvbPJZ02aFvmVmBb70CSdcBNwHZgK7AzydaxppMkDWWg4l5V9wBf7xu+HDjRnak/CdwOXD3ogZPsTnI4yeFTp04NHFiT4axfassoJ1TXA4/0bC8C65O8IMnNwEuTXL/ci6tqL/Bu4IG5ubkRYuhc8z8CafaNUtyzxFhV1X9U1Tuq6sVV9Xtn2oHXlpGkyRhltcwisLFnewPw6Ep24D1Uz72zzbiXe9wVN9LqMsrM/T7g0iRbkswB1wAHVrIDZ+6SNBmDLoXcB9wLXJZkMcmuqnoKuA64GzgG7K+qoys5uNdzPzcm3R+3By/NnoHaMlW1c5nxQ8ChYQ9eVQeBgwsLC9cOuw9J0jN5+QFJapC32ZOkBnmbPUlq0NgvHLYSLoVcPU6fMO1dEulJVGl2OXOXpAZ5QlWSGmRxl6QG2XPXxHjTEGl67LlLUoNsy0hSgyzuktQgP6GqFTnT2nYvICbNDnvuktQg2zKS1CCLuyQ1yOIuSQ2yuEtSg1wtI0kNcrWMJDXItowkNcjiLkkNsrhLUoMs7pLUIIu7JDXI4i5JDRr7nZiSXAD8CfAk8PGqum3cx5AkndlAM/cktyQ5meRI3/i2JMeTnEiypzv8JuCOqroWuGrMeSVJAxi0LXMrsK13IMk64CZgO7AV2JlkK7ABeKT7tP8bT0xJ0koM1JapqnuSbO4bvhw4UVUPASS5HbgaWKRT4D/NGf7zSLIb2A2wadOmlebWDFvuhh1Ljff
fNPv0c06P92+v9HkrsdQ+xrHf0/tZLTcIH9efedz7as2kvzejnFBdz9MzdOgU9fXAncDPJXk/cHC5F1fV3qpaqKqFiy++eIQYkqR+o5xQzRJjVVXfAH5xoB0kO4Ad8/PzI8SQJPUbZea+CGzs2d4APDpaHEnSOIxS3O8DLk2yJckccA1wYCU78KqQkjQZgy6F3AfcC1yWZDHJrqp6CrgOuBs4BuyvqqMrObjXc5ekyRh0tczOZcYPAYeGPXhVHQQOLiwsXDvsPiRJz+TlBySpQd5mT5Ia5G32JKlBqappZyDJKeBLK3zZRcDjE4gzDmYbjtmGY7bhtJDtB6pqyU+BzkRxH0aSw1W1MO0cSzHbcMw2HLMNp/VsnlCVpAZZ3CWpQau5uO+ddoAzMNtwzDYcsw2n6WyrtucuSVreap65S5KWYXGXpAatyuK+zL1bp5VlY5J/SnIsydEkv9odf36Sv0/y+e7vz5tSvnVJ/jXJR2YpVzfLc5PckeRz3e/fq2YlX5Jf7/59HkmyL8mzppVtqXsYnylLkuu7743jSX56Ctl+v/t3+pkkf5vkubOSreex30xSSS6apWxJ3tk9/tEk7x0pW1Wtql/AOuALwIuAOeBBYOsU81wCvKz79XcD/07nnrLvBfZ0x/cA75lSvt8A/hr4SHd7JnJ1j/9XwC93v54DnjsL+ejcUeyLwHd1t/cDb59WNuAngJcBR3rGlszS/bf3IHA+sKX7Xll3jrO9Djiv+/V7Zilbd3wjnavZfgm4aFayAT8J/ANwfnf7haNkO6dvmjF9U14F3N2zfT1w/bRz9eT5MPBTwHHgku7YJcDxKWTZAHwMeE1PcZ96ru6xv6dbQNM3PvV8PH0LyefTuXLqR7oFa2rZgM19hWDJLP3vh24Re9W5zNb32M8Ct81SNuAO4IeAh3uK+9Sz0ZlEXLnE84bKthrbMsvdu3XqujcRfynwSeB7q+oxgO7vL5xCpD8Cfhv4Vs/YLOSCzk9ep4C/7LaN/jzJBbOQr6q+AvwB8GXgMeCJqvq7WcjWY7kss/b++CXgo92vp54tyVXAV6rqwb6Hpp4NeAnw40k+meSfk/zIKNlWY3Ff8t6t5zxFnyTPAf4G+LWq+q8ZyPNG4GRV3T/tLMs4j86Ppe+vqpcC36DTXpi6bv/6ajo/An8/cEGSt0431cBm5v2R5AbgKeC200NLPO2cZUvybOAG4HeXeniJsXP9fTsPeB7wSuC3gP1JwpDZVmNxn7l7tyb5TjqF/baqurM7/LUkl3QfvwQ4eY5j/ShwVZKHgduB1yT54AzkOm0RWKyqT3a376BT7Gch35XAF6vqVFV9E7gTePWMZDttuSwz8f5I8jbgjcBbqttLmIFsL6bzH/aD3ffFBuCBJN83A9noZrizOj5F5yfui4bNthqL+8j3bh2n7v+sfwEcq6o/7HnoAPC27tdvo9OLP2eq6vqq2lBVm+l8j/6xqt467Vw9+b4KPJLksu7Qa4HPMhv5vgy8Msmzu3+/r6VzK8lZyHbaclkOANckOT/JFuBS4FPnMliSbcC7gKuq6n96Hppqtqr6t6p6YVVt7r4vFukshvjqtLN1fYjO+TGSvITOIoPHh842yRMGEzwR8Xo6q1K+ANww5Sw/RudHpM8An+7+ej3wAjonMz/f/f35U8x4BU+fUJ2lXD8MHO5+7z5E50fSmcgHvBv4HHAE+ACdlQpTyQbso9P7/yadgrTrTFnotB6+QOek6/YpZDtBp0d8+v1w86xk63v8YbonVGchG51i/sHuv7kHgNeMks3LD0hSg1ZjW0aSdBYWd0lqkMVdkhpkcZekBlncJalBFndJapDFXZIa9P9yyn9QIsa7pwAAAABJRU5ErkJggg==\n", 377 | "text/plain": [ 378 | "
" 379 | ] 380 | }, 381 | "metadata": { 382 | "needs_background": "light" 383 | }, 384 | "output_type": "display_data" 385 | } 386 | ], 387 | "source": [ 388 | "# Repeated questions histogram\n", 389 | "\n", 390 | "plt.hist(qid.value_counts().values,bins=160)\n", 391 | "plt.yscale('log')\n", 392 | "plt.show()" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": null, 398 | "id": "f9573e2f", 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [] 402 | } 403 | ], 404 | "metadata": { 405 | "kernelspec": { 406 | "display_name": "Python 3", 407 | "language": "python", 408 | "name": "python3" 409 | }, 410 | "language_info": { 411 | "codemirror_mode": { 412 | "name": "ipython", 413 | "version": 3 414 | }, 415 | "file_extension": ".py", 416 | "mimetype": "text/x-python", 417 | "name": "python", 418 | "nbconvert_exporter": "python", 419 | "pygments_lexer": "ipython3", 420 | "version": "3.8.8" 421 | } 422 | }, 423 | "nbformat": 4, 424 | "nbformat_minor": 5 425 | } 426 | -------------------------------------------------------------------------------- /only-bow.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "b478deb3", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import seaborn as sns" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "id": "217d407d", 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "df = pd.read_csv('train.csv')" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "id": "0cb99da3", 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "data": { 34 | "text/plain": [ 35 | "(404290, 6)" 36 | ] 37 | }, 38 | "execution_count": 3, 39 | "metadata": {}, 40 | "output_type": "execute_result" 41 | } 42 | ], 43 | "source": [ 44 | 
"df.shape" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 4, 50 | "id": "e5e7ce9a", 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "data": { 55 | "text/html": [ 56 | "
\n", 57 | "\n", 70 | "\n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | "
idqid1qid2question1question2is_duplicate
0012What is the step by step guide to invest in sh...What is the step by step guide to invest in sh...0
1134What is the story of Kohinoor (Koh-i-Noor) Dia...What would happen if the Indian government sto...0
2256How can I increase the speed of my internet co...How can Internet speed be increased by hacking...0
3378Why am I mentally very lonely? How can I solve...Find the remainder when [math]23^{24}[/math] i...0
44910Which one dissolve in water quikly sugar, salt...Which fish would survive in salt water?0
\n", 130 | "
" 131 | ], 132 | "text/plain": [ 133 | " id qid1 qid2 question1 \\\n", 134 | "0 0 1 2 What is the step by step guide to invest in sh... \n", 135 | "1 1 3 4 What is the story of Kohinoor (Koh-i-Noor) Dia... \n", 136 | "2 2 5 6 How can I increase the speed of my internet co... \n", 137 | "3 3 7 8 Why am I mentally very lonely? How can I solve... \n", 138 | "4 4 9 10 Which one dissolve in water quikly sugar, salt... \n", 139 | "\n", 140 | " question2 is_duplicate \n", 141 | "0 What is the step by step guide to invest in sh... 0 \n", 142 | "1 What would happen if the Indian government sto... 0 \n", 143 | "2 How can Internet speed be increased by hacking... 0 \n", 144 | "3 Find the remainder when [math]23^{24}[/math] i... 0 \n", 145 | "4 Which fish would survive in salt water? 0 " 146 | ] 147 | }, 148 | "execution_count": 4, 149 | "metadata": {}, 150 | "output_type": "execute_result" 151 | } 152 | ], 153 | "source": [ 154 | "df.head()" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 5, 160 | "id": "94b6e88e", 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "new_df = df.sample(30000)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 6, 170 | "id": "5074efd7", 171 | "metadata": {}, 172 | "outputs": [ 173 | { 174 | "data": { 175 | "text/plain": [ 176 | "id 0\n", 177 | "qid1 0\n", 178 | "qid2 0\n", 179 | "question1 0\n", 180 | "question2 0\n", 181 | "is_duplicate 0\n", 182 | "dtype: int64" 183 | ] 184 | }, 185 | "execution_count": 6, 186 | "metadata": {}, 187 | "output_type": "execute_result" 188 | } 189 | ], 190 | "source": [ 191 | "new_df.isnull().sum()" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 7, 197 | "id": "e2763e28", 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "data": { 202 | "text/plain": [ 203 | "0" 204 | ] 205 | }, 206 | "execution_count": 7, 207 | "metadata": {}, 208 | "output_type": "execute_result" 209 | } 210 | ], 211 | "source": [ 212 | 
"new_df.duplicated().sum()" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 8, 218 | "id": "a1e18aeb", 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "data": { 223 | "text/html": [ 224 | "
\n", 225 | "\n", 238 | "\n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | "
question1question2
394861What is Mark Zuckerberg's degree?Is there any reservation for NT in upsc?
266389Why is Saltwater Taffy candy imported in Poland?Why is Saltwater Taffy candy imported in Portu...
32076Who are the best people on the Internet to fol...Who are the best people on the Internet to fol...
150593I have been with my partner for 8 years Last y...I split up with my wife two years ago. Officia...
128073Is it legal to earn income from online freelan...Can 4-5 employers file a petition for your H1-...
\n", 274 | "
" 275 | ], 276 | "text/plain": [ 277 | " question1 \\\n", 278 | "394861 What is Mark Zuckerberg's degree? \n", 279 | "266389 Why is Saltwater Taffy candy imported in Poland? \n", 280 | "32076 Who are the best people on the Internet to fol... \n", 281 | "150593 I have been with my partner for 8 years Last y... \n", 282 | "128073 Is it legal to earn income from online freelan... \n", 283 | "\n", 284 | " question2 \n", 285 | "394861 Is there any reservation for NT in upsc? \n", 286 | "266389 Why is Saltwater Taffy candy imported in Portu... \n", 287 | "32076 Who are the best people on the Internet to fol... \n", 288 | "150593 I split up with my wife two years ago. Officia... \n", 289 | "128073 Can 4-5 employers file a petition for your H1-... " 290 | ] 291 | }, 292 | "execution_count": 8, 293 | "metadata": {}, 294 | "output_type": "execute_result" 295 | } 296 | ], 297 | "source": [ 298 | "ques_df = new_df[['question1','question2']]\n", 299 | "ques_df.head()" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 9, 305 | "id": "dec56417", 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "from sklearn.feature_extraction.text import CountVectorizer\n", 310 | "# merge texts\n", 311 | "questions = list(ques_df['question1']) + list(ques_df['question2'])\n", 312 | "\n", 313 | "cv = CountVectorizer(max_features=3000)\n", 314 | "q1_arr, q2_arr = np.vsplit(cv.fit_transform(questions).toarray(),2)" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 10, 320 | "id": "88026075", 321 | "metadata": {}, 322 | "outputs": [ 323 | { 324 | "data": { 325 | "text/plain": [ 326 | "(30000, 6000)" 327 | ] 328 | }, 329 | "execution_count": 10, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "temp_df1 = pd.DataFrame(q1_arr, index= ques_df.index)\n", 336 | "temp_df2 = pd.DataFrame(q2_arr, index= ques_df.index)\n", 337 | "temp_df = pd.concat([temp_df1, temp_df2], axis=1)\n", 338 
| "temp_df.shape" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 15, 344 | "id": "2f202654", 345 | "metadata": {}, 346 | "outputs": [ 347 | { 348 | "data": { 349 | "text/html": [ 350 | "
\n", 351 | "\n", 364 | "\n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " 
\n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | "
0123456789...299129922993299429952996299729982999is_duplicate
3948610000000000...0000000000
2663890000000000...0000000001
320760000000000...0000000000
1505930000000000...0000000000
1280730000000000...0010000000
..................................................................
3500700000000000...0000000000
3013460000000000...0000000000
799320000000000...0000000000
747880000000000...0000000000
617700000000000...0000000000
\n", 658 | "

30000 rows × 6001 columns

\n", 659 | "
" 660 | ], 661 | "text/plain": [ 662 | " 0 1 2 3 4 5 6 7 8 9 ... 2991 2992 2993 2994 2995 2996 \\\n", 663 | "394861 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", 664 | "266389 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", 665 | "32076 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", 666 | "150593 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", 667 | "128073 0 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 \n", 668 | "... .. .. .. .. .. .. .. .. .. .. ... ... ... ... ... ... ... \n", 669 | "350070 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", 670 | "301346 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", 671 | "79932 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", 672 | "74788 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", 673 | "61770 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", 674 | "\n", 675 | " 2997 2998 2999 is_duplicate \n", 676 | "394861 0 0 0 0 \n", 677 | "266389 0 0 0 1 \n", 678 | "32076 0 0 0 0 \n", 679 | "150593 0 0 0 0 \n", 680 | "128073 0 0 0 0 \n", 681 | "... ... ... ... ... \n", 682 | "350070 0 0 0 0 \n", 683 | "301346 0 0 0 0 \n", 684 | "79932 0 0 0 0 \n", 685 | "74788 0 0 0 0 \n", 686 | "61770 0 0 0 0 \n", 687 | "\n", 688 | "[30000 rows x 6001 columns]" 689 | ] 690 | }, 691 | "execution_count": 15, 692 | "metadata": {}, 693 | "output_type": "execute_result" 694 | } 695 | ], 696 | "source": [ 697 | "temp_df" 698 | ] 699 | }, 700 | { 701 | "cell_type": "code", 702 | "execution_count": 11, 703 | "id": "b634b449", 704 | "metadata": {}, 705 | "outputs": [], 706 | "source": [ 707 | "temp_df['is_duplicate'] = new_df['is_duplicate']" 708 | ] 709 | }, 710 | { 711 | "cell_type": "code", 712 | "execution_count": 12, 713 | "id": "96f3125e", 714 | "metadata": {}, 715 | "outputs": [ 716 | { 717 | "data": { 718 | "text/html": [ 719 | "
\n", 720 | "\n", 733 | "\n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | "
0123456789...299129922993299429952996299729982999is_duplicate
3948610000000000...0000000000
2663890000000000...0000000001
320760000000000...0000000000
1505930000000000...0000000000
1280730000000000...0010000000
\n", 883 | "

5 rows × 6001 columns

\n", 884 | "
" 885 | ], 886 | "text/plain": [ 887 | " 0 1 2 3 4 5 6 7 8 9 ... 2991 2992 2993 2994 2995 2996 \\\n", 888 | "394861 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", 889 | "266389 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", 890 | "32076 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", 891 | "150593 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", 892 | "128073 0 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 \n", 893 | "\n", 894 | " 2997 2998 2999 is_duplicate \n", 895 | "394861 0 0 0 0 \n", 896 | "266389 0 0 0 1 \n", 897 | "32076 0 0 0 0 \n", 898 | "150593 0 0 0 0 \n", 899 | "128073 0 0 0 0 \n", 900 | "\n", 901 | "[5 rows x 6001 columns]" 902 | ] 903 | }, 904 | "execution_count": 12, 905 | "metadata": {}, 906 | "output_type": "execute_result" 907 | } 908 | ], 909 | "source": [ 910 | "temp_df.head()" 911 | ] 912 | }, 913 | { 914 | "cell_type": "code", 915 | "execution_count": 13, 916 | "id": "7dec87f7", 917 | "metadata": {}, 918 | "outputs": [], 919 | "source": [ 920 | "from sklearn.model_selection import train_test_split\n", 921 | "X_train,X_test,y_train,y_test = train_test_split(temp_df.iloc[:,0:-1].values,temp_df.iloc[:,-1].values,test_size=0.2,random_state=1)" 922 | ] 923 | }, 924 | { 925 | "cell_type": "code", 926 | "execution_count": 14, 927 | "id": "92d4785f", 928 | "metadata": {}, 929 | "outputs": [ 930 | { 931 | "data": { 932 | "text/plain": [ 933 | "0.742" 934 | ] 935 | }, 936 | "execution_count": 14, 937 | "metadata": {}, 938 | "output_type": "execute_result" 939 | } 940 | ], 941 | "source": [ 942 | "from sklearn.ensemble import RandomForestClassifier\n", 943 | "from sklearn.metrics import accuracy_score\n", 944 | "rf = RandomForestClassifier()\n", 945 | "rf.fit(X_train,y_train)\n", 946 | "y_pred = rf.predict(X_test)\n", 947 | "accuracy_score(y_test,y_pred)" 948 | ] 949 | }, 950 | { 951 | "cell_type": "code", 952 | "execution_count": 15, 953 | "id": "a69c5c5a", 954 | "metadata": {}, 955 | "outputs": [ 956 | { 957 | "name": "stderr", 958 | "output_type": "stream", 959 | "text": [ 960 | 
"C:\\Users\\91842\\anaconda3\\lib\\site-packages\\xgboost\\sklearn.py:1224: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].\n", 961 | " warnings.warn(label_encoder_deprecation_msg, UserWarning)\n" 962 | ] 963 | }, 964 | { 965 | "name": "stdout", 966 | "output_type": "stream", 967 | "text": [ 968 | "[14:13:04] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.0/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n" 969 | ] 970 | }, 971 | { 972 | "data": { 973 | "text/plain": [ 974 | "0.7328333333333333" 975 | ] 976 | }, 977 | "execution_count": 15, 978 | "metadata": {}, 979 | "output_type": "execute_result" 980 | } 981 | ], 982 | "source": [ 983 | "from xgboost import XGBClassifier\n", 984 | "xgb = XGBClassifier()\n", 985 | "xgb.fit(X_train,y_train)\n", 986 | "y_pred = xgb.predict(X_test)\n", 987 | "accuracy_score(y_test,y_pred)" 988 | ] 989 | }, 990 | { 991 | "cell_type": "code", 992 | "execution_count": null, 993 | "id": "9c576f36", 994 | "metadata": {}, 995 | "outputs": [], 996 | "source": [] 997 | } 998 | ], 999 | "metadata": { 1000 | "kernelspec": { 1001 | "display_name": "Python 3", 1002 | "language": "python", 1003 | "name": "python3" 1004 | }, 1005 | "language_info": { 1006 | "codemirror_mode": { 1007 | "name": "ipython", 1008 | "version": 3 1009 | }, 1010 | "file_extension": ".py", 1011 | "mimetype": "text/x-python", 1012 | "name": "python", 1013 | "nbconvert_exporter": "python", 1014 | "pygments_lexer": "ipython3", 1015 | "version": "3.8.8" 1016 | } 1017 | }, 1018 | "nbformat": 4, 1019 | 
"nbformat_minor": 5 1020 | } 1021 | -------------------------------------------------------------------------------- /streamlit-app/Procfile: -------------------------------------------------------------------------------- 1 | web: sh setup.sh && streamlit run app.py -------------------------------------------------------------------------------- /streamlit-app/app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import helper 3 | import pickle 4 | 5 | model = pickle.load(open('model.pkl','rb')) 6 | 7 | st.header('Duplicate Question Pairs') 8 | 9 | q1 = st.text_input('Enter question 1') 10 | q2 = st.text_input('Enter question 2') 11 | 12 | if st.button('Find'): 13 | query = helper.query_point_creator(q1,q2) 14 | result = model.predict(query)[0] 15 | 16 | if result: 17 | st.header('Duplicate') 18 | else: 19 | st.header('Not Duplicate') 20 | 21 | 22 | -------------------------------------------------------------------------------- /streamlit-app/helper.py: -------------------------------------------------------------------------------- 1 | import re 2 | from bs4 import BeautifulSoup 3 | import distance 4 | from fuzzywuzzy import fuzz 5 | import pickle 6 | import numpy as np 7 | 8 | cv = pickle.load(open('cv.pkl','rb')) 9 | 10 | 11 | def test_common_words(q1,q2): 12 | w1 = set(map(lambda word: word.lower().strip(), q1.split(" "))) 13 | w2 = set(map(lambda word: word.lower().strip(), q2.split(" "))) 14 | return len(w1 & w2) 15 | 16 | def test_total_words(q1,q2): 17 | w1 = set(map(lambda word: word.lower().strip(), q1.split(" "))) 18 | w2 = set(map(lambda word: word.lower().strip(), q2.split(" "))) 19 | return (len(w1) + len(w2)) 20 | 21 | 22 | def test_fetch_token_features(q1, q2): 23 | SAFE_DIV = 0.0001 24 | 25 | STOP_WORDS = pickle.load(open('stopwords.pkl','rb')) 26 | 27 | token_features = [0.0] * 8 28 | 29 | # Converting the Sentence into Tokens: 30 | q1_tokens = q1.split() 31 | q2_tokens = 
q2.split() 32 | 33 | if len(q1_tokens) == 0 or len(q2_tokens) == 0: 34 | return token_features 35 | 36 | # Get the non-stopwords in Questions 37 | q1_words = set([word for word in q1_tokens if word not in STOP_WORDS]) 38 | q2_words = set([word for word in q2_tokens if word not in STOP_WORDS]) 39 | 40 | # Get the stopwords in Questions 41 | q1_stops = set([word for word in q1_tokens if word in STOP_WORDS]) 42 | q2_stops = set([word for word in q2_tokens if word in STOP_WORDS]) 43 | 44 | # Get the common non-stopwords from Question pair 45 | common_word_count = len(q1_words.intersection(q2_words)) 46 | 47 | # Get the common stopwords from Question pair 48 | common_stop_count = len(q1_stops.intersection(q2_stops)) 49 | 50 | # Get the common Tokens from Question pair 51 | common_token_count = len(set(q1_tokens).intersection(set(q2_tokens))) 52 | 53 | token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV) 54 | token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV) 55 | token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV) 56 | token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV) 57 | token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV) 58 | token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV) 59 | 60 | # Last word of both question is same or not 61 | token_features[6] = int(q1_tokens[-1] == q2_tokens[-1]) 62 | 63 | # First word of both question is same or not 64 | token_features[7] = int(q1_tokens[0] == q2_tokens[0]) 65 | 66 | return token_features 67 | 68 | 69 | def test_fetch_length_features(q1, q2): 70 | length_features = [0.0] * 3 71 | 72 | # Converting the Sentence into Tokens: 73 | q1_tokens = q1.split() 74 | q2_tokens = q2.split() 75 | 76 | if len(q1_tokens) == 0 or len(q2_tokens) == 0: 77 | return length_features 78 | 79 | # Absolute length features 80 | 
length_features[0] = abs(len(q1_tokens) - len(q2_tokens)) 81 | 82 | # Average Token Length of both Questions 83 | length_features[1] = (len(q1_tokens) + len(q2_tokens)) / 2 84 | 85 | strs = list(distance.lcsubstrings(q1, q2)) 86 | length_features[2] = len(strs[0]) / (min(len(q1), len(q2)) + 1) 87 | 88 | return length_features 89 | 90 | 91 | def test_fetch_fuzzy_features(q1, q2): 92 | fuzzy_features = [0.0] * 4 93 | 94 | # fuzz_ratio 95 | fuzzy_features[0] = fuzz.QRatio(q1, q2) 96 | 97 | # fuzz_partial_ratio 98 | fuzzy_features[1] = fuzz.partial_ratio(q1, q2) 99 | 100 | # token_sort_ratio 101 | fuzzy_features[2] = fuzz.token_sort_ratio(q1, q2) 102 | 103 | # token_set_ratio 104 | fuzzy_features[3] = fuzz.token_set_ratio(q1, q2) 105 | 106 | return fuzzy_features 107 | 108 | 109 | def preprocess(q): 110 | q = str(q).lower().strip() 111 | 112 | # Replace certain special characters with their string equivalents 113 | q = q.replace('%', ' percent') 114 | q = q.replace('$', ' dollar ') 115 | q = q.replace('₹', ' rupee ') 116 | q = q.replace('€', ' euro ') 117 | q = q.replace('@', ' at ') 118 | 119 | # The pattern '[math]' appears around 900 times in the whole dataset. 
120 | q = q.replace('[math]', '') 121 | 122 | # Replacing some numbers with string equivalents (not perfect, can be done better to account for more cases) 123 | q = q.replace(',000,000,000 ', 'b ') 124 | q = q.replace(',000,000 ', 'm ') 125 | q = q.replace(',000 ', 'k ') 126 | q = re.sub(r'([0-9]+)000000000', r'\1b', q) 127 | q = re.sub(r'([0-9]+)000000', r'\1m', q) 128 | q = re.sub(r'([0-9]+)000', r'\1k', q) 129 | 130 | # Decontracting words 131 | # https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions 132 | # https://stackoverflow.com/a/19794953 133 | contractions = { 134 | "ain't": "am not", 135 | "aren't": "are not", 136 | "can't": "can not", 137 | "can't've": "can not have", 138 | "'cause": "because", 139 | "could've": "could have", 140 | "couldn't": "could not", 141 | "couldn't've": "could not have", 142 | "didn't": "did not", 143 | "doesn't": "does not", 144 | "don't": "do not", 145 | "hadn't": "had not", 146 | "hadn't've": "had not have", 147 | "hasn't": "has not", 148 | "haven't": "have not", 149 | "he'd": "he would", 150 | "he'd've": "he would have", 151 | "he'll": "he will", 152 | "he'll've": "he will have", 153 | "he's": "he is", 154 | "how'd": "how did", 155 | "how'd'y": "how do you", 156 | "how'll": "how will", 157 | "how's": "how is", 158 | "i'd": "i would", 159 | "i'd've": "i would have", 160 | "i'll": "i will", 161 | "i'll've": "i will have", 162 | "i'm": "i am", 163 | "i've": "i have", 164 | "isn't": "is not", 165 | "it'd": "it would", 166 | "it'd've": "it would have", 167 | "it'll": "it will", 168 | "it'll've": "it will have", 169 | "it's": "it is", 170 | "let's": "let us", 171 | "ma'am": "madam", 172 | "mayn't": "may not", 173 | "might've": "might have", 174 | "mightn't": "might not", 175 | "mightn't've": "might not have", 176 | "must've": "must have", 177 | "mustn't": "must not", 178 | "mustn't've": "must not have", 179 | "needn't": "need not", 180 | "needn't've": "need not have", 181 | "o'clock": "of the clock", 182 | 
"oughtn't": "ought not", 183 | "oughtn't've": "ought not have", 184 | "shan't": "shall not", 185 | "sha'n't": "shall not", 186 | "shan't've": "shall not have", 187 | "she'd": "she would", 188 | "she'd've": "she would have", 189 | "she'll": "she will", 190 | "she'll've": "she will have", 191 | "she's": "she is", 192 | "should've": "should have", 193 | "shouldn't": "should not", 194 | "shouldn't've": "should not have", 195 | "so've": "so have", 196 | "so's": "so as", 197 | "that'd": "that would", 198 | "that'd've": "that would have", 199 | "that's": "that is", 200 | "there'd": "there would", 201 | "there'd've": "there would have", 202 | "there's": "there is", 203 | "they'd": "they would", 204 | "they'd've": "they would have", 205 | "they'll": "they will", 206 | "they'll've": "they will have", 207 | "they're": "they are", 208 | "they've": "they have", 209 | "to've": "to have", 210 | "wasn't": "was not", 211 | "we'd": "we would", 212 | "we'd've": "we would have", 213 | "we'll": "we will", 214 | "we'll've": "we will have", 215 | "we're": "we are", 216 | "we've": "we have", 217 | "weren't": "were not", 218 | "what'll": "what will", 219 | "what'll've": "what will have", 220 | "what're": "what are", 221 | "what's": "what is", 222 | "what've": "what have", 223 | "when's": "when is", 224 | "when've": "when have", 225 | "where'd": "where did", 226 | "where's": "where is", 227 | "where've": "where have", 228 | "who'll": "who will", 229 | "who'll've": "who will have", 230 | "who's": "who is", 231 | "who've": "who have", 232 | "why's": "why is", 233 | "why've": "why have", 234 | "will've": "will have", 235 | "won't": "will not", 236 | "won't've": "will not have", 237 | "would've": "would have", 238 | "wouldn't": "would not", 239 | "wouldn't've": "would not have", 240 | "y'all": "you all", 241 | "y'all'd": "you all would", 242 | "y'all'd've": "you all would have", 243 | "y'all're": "you all are", 244 | "y'all've": "you all have", 245 | "you'd": "you would", 246 | "you'd've": "you 
would have", 247 | "you'll": "you will", 248 | "you'll've": "you will have", 249 | "you're": "you are", 250 | "you've": "you have" 251 | } 252 | 253 | q_decontracted = [] 254 | 255 | for word in q.split(): 256 | if word in contractions: 257 | word = contractions[word] 258 | 259 | q_decontracted.append(word) 260 | 261 | q = ' '.join(q_decontracted) 262 | q = q.replace("'ve", " have") 263 | q = q.replace("n't", " not") 264 | q = q.replace("'re", " are") 265 | q = q.replace("'ll", " will") 266 | 267 | # Removing HTML tags 268 | q = BeautifulSoup(q) 269 | q = q.get_text() 270 | 271 | # Remove punctuations 272 | pattern = re.compile('\W') 273 | q = re.sub(pattern, ' ', q).strip() 274 | 275 | return q 276 | 277 | 278 | def query_point_creator(q1, q2): 279 | input_query = [] 280 | 281 | # preprocess 282 | q1 = preprocess(q1) 283 | q2 = preprocess(q2) 284 | 285 | # fetch basic features 286 | input_query.append(len(q1)) 287 | input_query.append(len(q2)) 288 | 289 | input_query.append(len(q1.split(" "))) 290 | input_query.append(len(q2.split(" "))) 291 | 292 | input_query.append(test_common_words(q1, q2)) 293 | input_query.append(test_total_words(q1, q2)) 294 | input_query.append(round(test_common_words(q1, q2) / test_total_words(q1, q2), 2)) 295 | 296 | # fetch token features 297 | token_features = test_fetch_token_features(q1, q2) 298 | input_query.extend(token_features) 299 | 300 | # fetch length based features 301 | length_features = test_fetch_length_features(q1, q2) 302 | input_query.extend(length_features) 303 | 304 | # fetch fuzzy features 305 | fuzzy_features = test_fetch_fuzzy_features(q1, q2) 306 | input_query.extend(fuzzy_features) 307 | 308 | # bow feature for q1 309 | q1_bow = cv.transform([q1]).toarray() 310 | 311 | # bow feature for q2 312 | q2_bow = cv.transform([q2]).toarray() 313 | 314 | return np.hstack((np.array(input_query).reshape(1, 22), q1_bow, q2_bow)) -------------------------------------------------------------------------------- 
/streamlit-app/readme.txt: -------------------------------------------------------------------------------- 1 | This is the streamlit web app 2 | -------------------------------------------------------------------------------- /streamlit-app/requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit 2 | scikit-learn 3 | fuzzywuzzy 4 | distance 5 | bs4 -------------------------------------------------------------------------------- /streamlit-app/setup.sh: -------------------------------------------------------------------------------- 1 | mkdir -p ~/.streamlit/ 2 | 3 | echo "\ 4 | [server]\n\ 5 | port = $PORT\n\ 6 | enableCORS = false\n\ 7 | headless = true\n\ 8 | \n\ 9 | " > ~/.streamlit/config.toml --------------------------------------------------------------------------------