├── README.md ├── bow-with-basic-features.ipynb ├── bow-with-preprocessing-and-advanced-features.ipynb ├── initial_EDA.ipynb ├── only-bow.ipynb └── streamlit-app ├── Procfile ├── app.py ├── helper.py ├── readme.txt ├── requirements.txt └── setup.sh /README.md: -------------------------------------------------------------------------------- 1 | # quora-question-pairs 2 | An NLP project to determine whether two given questions are semantically the same or not. 3 | 4 | Dataset Link - https://www.kaggle.com/c/quora-question-pairs 5 | -------------------------------------------------------------------------------- /initial_EDA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 21, 6 | "id": "600ccbe8", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "import seaborn as sns\n", 13 | "import matplotlib.pyplot as plt" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 22, 19 | "id": "60425156", 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "data": { 24 | "text/plain": [ 25 | "(404290, 6)" 26 | ] 27 | }, 28 | "execution_count": 22, 29 | "metadata": {}, 30 | "output_type": "execute_result" 31 | } 32 | ], 33 | "source": [ 34 | "df = pd.read_csv(\"train.csv\")\n", 35 | "df.shape" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 25, 41 | "id": "5a3d86b9", 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/html": [ 47 | "

\n", 48 | "\n", 61 | "\n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | "

	id	qid1	qid2	question1	question2	is_duplicate
183268	183268	280288	280289	How did monkeys get to South America from Afri...	I fucking hate my life, I'm black, poor nd liv...	0
112930	112930	184684	132960	What is the best photo ever taken in your life?	What is the best picture taken by you?	1
300075	300075	348955	422827	What are some things new employees should know...	What are some things new employees should know...	0
223993	223993	296218	184831	Why do the British care about the Royal Family?	Why has the UK retained the monarchy?	0
171389	171389	177374	264819	Which is the most inspiring book to read?	What is the most inspiring book you have ever ...	0
357002	357002	486390	486391	Why can't I forget my girlfriend?	Why can't I forget my first girlfriend?	1
348760	348760	477337	477338	Which is greater rise in 1 degree Celsius or r...	If I sit and hold 100 grams of ice at zero deg...	0
119950	119950	194645	194646	What are some ways to amplify linear motion an...	How do you amplify linear motion?	1
209885	209885	314294	314295	How should one prepare for IAS when he is in h...	How can I prepare for IAS from my first year o...	1
23430	23430	43885	43886	In the initial days of a SaaS startup, when th...	I have to manage the entire operations and pro...	0

\n", 166 | "

" 167 | ], 168 | "text/plain": [ 169 | " id qid1 qid2 \\\n", 170 | "183268 183268 280288 280289 \n", 171 | "112930 112930 184684 132960 \n", 172 | "300075 300075 348955 422827 \n", 173 | "223993 223993 296218 184831 \n", 174 | "171389 171389 177374 264819 \n", 175 | "357002 357002 486390 486391 \n", 176 | "348760 348760 477337 477338 \n", 177 | "119950 119950 194645 194646 \n", 178 | "209885 209885 314294 314295 \n", 179 | "23430 23430 43885 43886 \n", 180 | "\n", 181 | " question1 \\\n", 182 | "183268 How did monkeys get to South America from Afri... \n", 183 | "112930 What is the best photo ever taken in your life? \n", 184 | "300075 What are some things new employees should know... \n", 185 | "223993 Why do the British care about the Royal Family? \n", 186 | "171389 Which is the most inspiring book to read? \n", 187 | "357002 Why can't I forget my girlfriend? \n", 188 | "348760 Which is greater rise in 1 degree Celsius or r... \n", 189 | "119950 What are some ways to amplify linear motion an... \n", 190 | "209885 How should one prepare for IAS when he is in h... \n", 191 | "23430 In the initial days of a SaaS startup, when th... \n", 192 | "\n", 193 | " question2 is_duplicate \n", 194 | "183268 I fucking hate my life, I'm black, poor nd liv... 0 \n", 195 | "112930 What is the best picture taken by you? 1 \n", 196 | "300075 What are some things new employees should know... 0 \n", 197 | "223993 Why has the UK retained the monarchy? 0 \n", 198 | "171389 What is the most inspiring book you have ever ... 0 \n", 199 | "357002 Why can't I forget my first girlfriend? 1 \n", 200 | "348760 If I sit and hold 100 grams of ice at zero deg... 0 \n", 201 | "119950 How do you amplify linear motion? 1 \n", 202 | "209885 How can I prepare for IAS from my first year o... 1 \n", 203 | "23430 I have to manage the entire operations and pro... 
0 " 204 | ] 205 | }, 206 | "execution_count": 25, 207 | "metadata": {}, 208 | "output_type": "execute_result" 209 | } 210 | ], 211 | "source": [ 212 | "df.sample(10)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 4, 218 | "id": "37b00141", 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "name": "stdout", 223 | "output_type": "stream", 224 | "text": [ 225 | "\n", 226 | "RangeIndex: 404290 entries, 0 to 404289\n", 227 | "Data columns (total 6 columns):\n", 228 | " # Column Non-Null Count Dtype \n", 229 | "--- ------ -------------- ----- \n", 230 | " 0 id 404290 non-null int64 \n", 231 | " 1 qid1 404290 non-null int64 \n", 232 | " 2 qid2 404290 non-null int64 \n", 233 | " 3 question1 404289 non-null object\n", 234 | " 4 question2 404288 non-null object\n", 235 | " 5 is_duplicate 404290 non-null int64 \n", 236 | "dtypes: int64(4), object(2)\n", 237 | "memory usage: 18.5+ MB\n" 238 | ] 239 | } 240 | ], 241 | "source": [ 242 | "df.info()" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 5, 248 | "id": "c5b82789", 249 | "metadata": {}, 250 | "outputs": [ 251 | { 252 | "data": { 253 | "text/plain": [ 254 | "id 0\n", 255 | "qid1 0\n", 256 | "qid2 0\n", 257 | "question1 1\n", 258 | "question2 2\n", 259 | "is_duplicate 0\n", 260 | "dtype: int64" 261 | ] 262 | }, 263 | "execution_count": 5, 264 | "metadata": {}, 265 | "output_type": "execute_result" 266 | } 267 | ], 268 | "source": [ 269 | "# missing values\n", 270 | "df.isnull().sum()" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 6, 276 | "id": "e704abaf", 277 | "metadata": {}, 278 | "outputs": [ 279 | { 280 | "data": { 281 | "text/plain": [ 282 | "0" 283 | ] 284 | }, 285 | "execution_count": 6, 286 | "metadata": {}, 287 | "output_type": "execute_result" 288 | } 289 | ], 290 | "source": [ 291 | "# duplicate rows\n", 292 | "df.duplicated().sum()" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 17, 298 
| "id": "3f9bd6af", 299 | "metadata": {}, 300 | "outputs": [ 301 | { 302 | "name": "stdout", 303 | "output_type": "stream", 304 | "text": [ 305 | "0 255027\n", 306 | "1 149263\n", 307 | "Name: is_duplicate, dtype: int64\n", 308 | "0 63.080215\n", 309 | "1 36.919785\n", 310 | "Name: is_duplicate, dtype: float64\n" 311 | ] 312 | }, 313 | { 314 | "data": { 315 | "text/plain": [ 316 | "" 317 | ] 318 | }, 319 | "execution_count": 17, 320 | "metadata": {}, 321 | "output_type": "execute_result" 322 | }, 323 | { 324 | "data": { 325 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYoAAAD1CAYAAABOfbKwAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAQMElEQVR4nO3db6je5X3H8fdnphNZq0Q9is2fRWbKpsIshij0SUcgydoHWlB2fFDDFkgRhRb6YNonFiWgsFYQpmAxGKWrBttiWGtdph2lzKnHItXoXA7Vapqg6RKse6Bb0u8e3Ndp75zeuc7JSXJOYt4v+HH/7u/vuq5z3XDkk991/e5jqgpJko7kjxZ6ApKkk5tBIUnqMigkSV0GhSSpy6CQJHUZFJKkrkULPYHj7fzzz68VK1Ys9DQk6ZTy4osv/rqqxkZd+8gFxYoVK5iYmFjoaUjSKSXJL490zaUnSVKXQSFJ6jIoJEldBoUkqcugkCR1GRSSpC6DQpLUZVBIkro+cl+4O1WsuPUHCz2Fj5Q37/r8Qk9B+sia8Y4iybIkP07yWpKdSb7c6l9P8qskL7Xjc0N9bksymeT1JOuG6lcmeblduzdJWv3MJI+1+nNJVgz12ZBkVzs2HNdPL0ma0WzuKA4CX62qnyX5BPBikh3t2j1V9Q/DjZNcCowDlwGfBP41yaeq6hBwP7AJ+A/gh8B64ElgI3Cgqi5JMg7cDfxNknOB24FVQLWfvb2qDhzbx5YkzdaMdxRVtbeqftbO3wdeA5Z0ulwDPFpVH1bVG8AksDrJRcDZVfVsDf5H3Q8D1w712drOHwfWtLuNdcCOqtrfwmEHg3CRJM2To9rMbktCnwaea6Vbkvw8yZYki1ttCfD2ULfdrbaknU+vH9anqg4C7wHndcaaPq9NSSaSTOzbt+9oPpIkaQazDookHwe+C3ylqn7DYBnpz4ArgL3AN6aajuhenfpc+/y+UPVAVa2qqlVjYyP/Sq4kaY5mFRRJPsYgJL5dVd8DqKp3qupQVf0W+BawujXfDSwb6r4U2NPqS0fUD+uTZBFwDrC/M5YkaZ7M5qmnAA8Cr1XVN4fqFw01+wLwSjvfDoy3J5kuBlYCz1fVXuD9JFe3MW8EnhjqM/VE03XAM20f4ylgbZLFbWlrbatJkubJbJ56+gzwReDlJC+12teAG5JcwWAp6E3gSwBVtTPJNuBVBk9M3dyeeAK4CXgIOIvB005PtvqDwCNJJhncSYy3sfYnuRN4obW7o6r2z+WDSpLmZsagqKqfMnqv4IedPpuBzSPqE8DlI+ofANcfYawtwJaZ5ilJOjH8Ex6SpC6DQpLUZVBIkroMCklSl0EhSeoyKCRJXQaFJKnLoJAkdRkUkqQug0KS1GVQSJK6DApJUpdBIUnqMigkSV0GhSSpy6CQJHUZFJKkLoNCktRlUEiSugwKSVKXQSFJ6jIoJE
ldBoUkqcugkCR1GRSSpC6DQpLUZVBIkroMCklSl0EhSeoyKCRJXQaFJKlrxqBIsizJj5O8lmRnki+3+rlJdiTZ1V4XD/W5LclkkteTrBuqX5nk5Xbt3iRp9TOTPNbqzyVZMdRnQ/sZu5JsOK6fXpI0o9ncURwEvlpVfwFcDdyc5FLgVuDpqloJPN3e066NA5cB64H7kpzRxrof2ASsbMf6Vt8IHKiqS4B7gLvbWOcCtwNXAauB24cDSZJ04s0YFFW1t6p+1s7fB14DlgDXAFtbs63Ate38GuDRqvqwqt4AJoHVSS4Czq6qZ6uqgIen9Zka63FgTbvbWAfsqKr9VXUA2MHvw0WSNA+Oao+iLQl9GngOuLCq9sIgTIALWrMlwNtD3Xa32pJ2Pr1+WJ+qOgi8B5zXGUuSNE9mHRRJPg58F/hKVf2m13RErTr1ufYZntumJBNJJvbt29eZmiTpaM0qKJJ8jEFIfLuqvtfK77TlJNrru62+G1g21H0psKfVl46oH9YnySLgHGB/Z6zDVNUDVbWqqlaNjY3N5iNJkmZpNk89BXgQeK2qvjl0aTsw9RTSBuCJofp4e5LpYgab1s+35an3k1zdxrxxWp+psa4Dnmn7GE8Ba5MsbpvYa1tNkjRPFs2izWeALwIvJ3mp1b4G3AVsS7IReAu4HqCqdibZBrzK4Impm6vqUOt3E/AQcBbwZDtgEESPJJlkcCcx3sban+RO4IXW7o6q2j+3jypJmosZg6KqfsrovQKANUfosxnYPKI+AVw+ov4BLWhGXNsCbJlpnpKkE8NvZkuSugwKSVKXQSFJ6jIoJEldBoUkqcugkCR1GRSSpC6DQpLUZVBIkroMCklSl0EhSeoyKCRJXQaFJKnLoJAkdRkUkqQug0KS1GVQSJK6DApJUpdBIUnqMigkSV0GhSSpy6CQJHUZFJKkLoNCktRlUEiSugwKSVKXQSFJ6jIoJEldBoUkqcugkCR1GRSSpC6DQpLUNWNQJNmS5N0krwzVvp7kV0leasfnhq7dlmQyyetJ1g3Vr0zycrt2b5K0+plJHmv155KsGOqzIcmudmw4bp9akjRrs7mjeAhYP6J+T1Vd0Y4fAiS5FBgHLmt97ktyRmt/P7AJWNmOqTE3Ageq6hLgHuDuNta5wO3AVcBq4PYki4/6E0qSjsmMQVFVPwH2z3K8a4BHq+rDqnoDmARWJ7kIOLuqnq2qAh4Grh3qs7WdPw6saXcb64AdVbW/qg4AOxgdWJKkE+hY9ihuSfLztjQ19S/9JcDbQ212t9qSdj69flifqjoIvAec1xlLkjSPFs2x3/3AnUC1128AfwdkRNvq1Jljn8Mk2cRgWYvly5f35i1pFlbc+oOFnsJHxpt3fX6hp3DM5nRHUVXvVNWhqvot8C0Gewgw+Ff/sqGmS4E9rb50RP2wPkkWAecwWOo60lij5vNAVa2qqlVjY2Nz+UiSpCOYU1C0PYcpXwCmnojaDoy3J5kuZrBp/XxV7QXeT3J123+4EXhiqM/UE03XAc+0fYyngLVJFrelrbWtJkmaRzMuPSX5DvBZ4Pwkuxk8ifTZJFcwWAp6E/gSQFXtTLINeBU4CNxcVYfaUDcxeILqLODJdgA8CDySZJLBncR4G2t/kjuBF1q7O6pqtpvqkqTjZMagqKobRpQf7LTfDGweUZ8ALh9R/wC4/ghjbQG2zDRHSdKJ4zezJUldBoUkqcugkCR1GRSSpC6DQpLUZVBIkroMCklSl0EhSeoyKCRJXQaFJKnLoJAkdRkUkqQug0KS1GVQSJK6DApJUpdBIUnqMigkSV0GhSSpy6CQJHUZFJKkLoNCktRlUEiSugwKSVKXQSFJ6jIoJEldBoUkqcugkCR1GRSSpC6DQpLUZVBIkroMCklS14xBkWRLkneTvDJUOzfJjiS72uvioWu3JZlM8nqSdUP1K5O83K7dmyStfmaSx1r9uSQrhvpsaD9jV5INx+1TS5JmbTZ3FA8B66fVbg
WerqqVwNPtPUkuBcaBy1qf+5Kc0frcD2wCVrZjasyNwIGqugS4B7i7jXUucDtwFbAauH04kCRJ82PGoKiqnwD7p5WvAba2863AtUP1R6vqw6p6A5gEVie5CDi7qp6tqgIentZnaqzHgTXtbmMdsKOq9lfVAWAHfxhYkqQTbK57FBdW1V6A9npBqy8B3h5qt7vVlrTz6fXD+lTVQeA94LzOWJKkeXS8N7Mzolad+lz7HP5Dk01JJpJM7Nu3b1YTlSTNzlyD4p22nER7fbfVdwPLhtotBfa0+tIR9cP6JFkEnMNgqetIY/2BqnqgqlZV1aqxsbE5fiRJ0ihzDYrtwNRTSBuAJ4bq4+1JposZbFo/35an3k9yddt/uHFan6mxrgOeafsYTwFrkyxum9hrW02SNI8WzdQgyXeAzwLnJ9nN4Emku4BtSTYCbwHXA1TVziTbgFeBg8DNVXWoDXUTgyeozgKebAfAg8AjSSYZ3EmMt7H2J7kTeKG1u6Oqpm+qS5JOsBmDoqpuOMKlNUdovxnYPKI+AVw+ov4BLWhGXNsCbJlpjpKkE8dvZkuSugwKSVKXQSFJ6jIoJEldBoUkqcugkCR1GRSSpC6DQpLUZVBIkroMCklSl0EhSeoyKCRJXQaFJKnLoJAkdRkUkqQug0KS1GVQSJK6DApJUpdBIUnqMigkSV0GhSSpy6CQJHUZFJKkLoNCktRlUEiSugwKSVKXQSFJ6jIoJEldBoUkqcugkCR1GRSSpC6DQpLUdUxBkeTNJC8neSnJRKudm2RHkl3tdfFQ+9uSTCZ5Pcm6ofqVbZzJJPcmSaufmeSxVn8uyYpjma8k6egdjzuKv6qqK6pqVXt/K/B0Va0Enm7vSXIpMA5cBqwH7ktyRutzP7AJWNmO9a2+EThQVZcA9wB3H4f5SpKOwolYeroG2NrOtwLXDtUfraoPq+oNYBJYneQi4OyqeraqCnh4Wp+psR4H1kzdbUiS5sexBkUB/5LkxSSbWu3CqtoL0F4vaPUlwNtDfXe32pJ2Pr1+WJ+qOgi8B5x3jHOWJB2FRcfY/zNVtSfJBcCOJP/ZaTvqTqA69V6fwwcehNQmgOXLl/dnLEk6Ksd0R1FVe9rru8D3gdXAO205ifb6bmu+G1g21H0psKfVl46oH9YnySLgHGD/iHk8UFWrqmrV2NjYsXwkSdI0cw6KJH+S5BNT58Ba4BVgO7ChNdsAPNHOtwPj7UmmixlsWj/flqfeT3J123+4cVqfqbGuA55p+xiSpHlyLEtPFwLfb3vLi4B/qqofJXkB2JZkI/AWcD1AVe1Msg14FTgI3FxVh9pYNwEPAWcBT7YD4EHgkSSTDO4kxo9hvpKkOZhzUFTVL4C/HFH/b2DNEfpsBjaPqE8Al4+of0ALGknSwvCb2ZKkLoNCktRlUEiSugwKSVKXQSFJ6jIoJEldBoUkqcugkCR1GRSSpC6DQpLUZVBIkroMCklSl0EhSeoyKCRJXQaFJKnLoJAkdRkUkqQug0KS1GVQSJK6DApJUpdBIUnqMigkSV0GhSSpy6CQJHUZFJKkLoNCktRlUEiSugwKSVKXQSFJ6jIoJEldBoUkqcugkCR1nRJBkWR9kteTTCa5daHnI0mnk5M+KJKcAfwj8NfApcANSS5d2FlJ0unjpA8KYDUwWVW/qKr/BR4FrlngOUnSaWPRQk9gFpYAbw+93w1cNdwgySZgU3v7P0len6e5nQ7OB3690JOYSe5e6BlogZz0v5+n0O/mnx7pwqkQFBlRq8PeVD0APDA/0zm9JJmoqlULPQ9pFH8/58epsPS0G1g29H4psGeB5iJJp51TISheAFYmuTjJHwPjwPYFnpMknTZO+qWnqjqY5BbgKeAMYEtV7VzgaZ1OXNLTyczfz3mQqpq5lSTptHUqLD1JkhaQQSFJ6jIoJEldJ/1mtuZXkj9n8M33JQy+r7IH2F5Vry3oxCQtGO8o9DtJ/p7Bn0gJ8DyDR5MDfMc/xqiTWZ
K/Xeg5fJT51JN+J8l/AZdV1f9Nq/8xsLOqVi7MzKS+JG9V1fKFnsdHlUtPGvZb4JPAL6fVL2rXpAWT5OdHugRcOJ9zOd0YFBr2FeDpJLv4/R9iXA5cAtyyUJOSmguBdcCBafUA/z7/0zl9GBT6nar6UZJPMfjT7ksY/Ae4G3ihqg4t6OQk+Gfg41X10vQLSf5t3mdzGnGPQpLU5VNPkqQug0KS1GVQSJK6DApJUpdBIUnq+n/InXmx1HDi4wAAAABJRU5ErkJggg==\n", 326 | "text/plain": [ 327 | "

" 328 | ] 329 | }, 330 | "metadata": { 331 | "needs_background": "light" 332 | }, 333 | "output_type": "display_data" 334 | } 335 | ], 336 | "source": [ 337 | "# Distribution of duplicate and non-duplicate questions\n", 338 | "\n", 339 | "print(df['is_duplicate'].value_counts())\n", 340 | "print((df['is_duplicate'].value_counts()/df['is_duplicate'].count())*100)\n", 341 | "df['is_duplicate'].value_counts().plot(kind='bar')" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 19, 347 | "id": "788d2d08", 348 | "metadata": {}, 349 | "outputs": [ 350 | { 351 | "name": "stdout", 352 | "output_type": "stream", 353 | "text": [ 354 | "Number of unique questions 537933\n", 355 | "Number of questions getting repeated 111780\n" 356 | ] 357 | } 358 | ], 359 | "source": [ 360 | "# Repeated questions\n", 361 | "\n", 362 | "qid = pd.Series(df['qid1'].tolist() + df['qid2'].tolist())\n", 363 | "print('Number of unique questions',np.unique(qid).shape[0])\n", 364 | "x = qid.value_counts()>1\n", 365 | "print('Number of questions getting repeated',x[x].shape[0])" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 20, 371 | "id": "2fa5bb83", 372 | "metadata": {}, 373 | "outputs": [ 374 | { 375 | "data": { 376 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXcAAAD4CAYAAAAXUaZHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAQoklEQVR4nO3df6zdd13H8efLzjtk6PixobM/bOGOxf6jwHX88EcmTGyBbopE10ACWtdgMuKPqHSZMeEvBxpjiJNZdc7A7NLMCS0rmYriMFlg3WTYUiplDHbZoJ2YGdFkTN7+cU7Zydm97bnnR8+5n/t8JE3v93PO+X5fve1593Pf38/5flNVSJLa8h3TDiBJGj+LuyQ1yOIuSQ2yuEtSgyzuktSg86YdAOCiiy6qzZs3TzuGJK0q999//+NVdfFSj021uCfZAeyYn5/n8OHD04wiSatOki8t99hU2zJVdbCqdl944YXTjCFJzZlqcU+yI8neJ554YpoxJKk5ztwlqUHO3CWpQc7cJalBrnOXpAbZlpGkBtmWkaQGrfq2zOY9d7F5z13TjiFJM8W2jCQ1yLaMJDVo1bdlJEnPZHGXpAZZ3CWpQZ5QlaQGeUJVkhpkW0aSGmRxl6QGWdwlqUEWd0lqkKtlJKlBrpaRpAbZlpGkBlncJalBFndJapDFXZIaZHGXpAaNvbgnuSLJJ5LcnOSKce9fknR2AxX3JLckOZnkSN/4tiTHk5xIsqc7XMB/A88CFscbV5I0iEFn7rcC23oHkqwDbgK2A1uBnUm2Ap+oqu3Au4B3jy+qJGlQAxX3qroH+Hrf8OXAiap6qKqeBG4Hrq6qb3Uf/0/g/OX2mWR3ksNJDp86dWqI6JKk5YzSc18PPNKzvQisT/KmJH8KfAD44+VeXFV7q2qhqhYuvvjiEWJIkvqdN8Jrs8RYVdWdwJ0D7SDZAeyYn58fIYYkqd8oM/dFYGPP9gbg0ZXswGvLSNJkjFLc7wMuTbIlyRxwDXBgJTvwqpCSNBmDLoXcB9wLXJZkMcmuqnoKuA64GzgG7K+qoys5uDN3SZqMgXruVbVzmfFDwKFhD27PXZImo5nruW/ecxeb99w1hlSStPp5JyZJalAzM3dJ0tO8KqQkNci2jCQ1yLaMJDXItowkNci2jCQ1yLaMJDXItowkNcjiLkkNsrhLUoM8oSpJDfKEqiQ1yLaMJDXI4i5JDbK4S1KDmivu3rRDklwtI0lNcrWMJDWoubaMJMniLklNsrhLUoMs7pLUIIu7JDVoIsU9yQVJ7k/yxknsX5J0ZgMV9yS3JDmZ5Ejf+LYkx5OcSLKn56F3AfvHGVSSNLhBZ+63Att6B5KsA24CtgNbgZ1Jtia5Evgs8LUx5pQkrcB5gzypqu5Jsrlv+HLgRFU9BJDkduBq4DnABXQK/v8mOVRV3+rfZ5LdwG6ATZs2Df0HkCQ900DFfRnrgUd6theBV1TVdQBJ3g48vlRhB6iqvcBegIWFhRohx5JOX1/m4RvfMO5dS9LMG6W4Z4mxbxfpqrr1rDtIdgA75ufnR4ghSeo3ymqZRWBjz/YG4NGV7MBry0jSZIxS3O8DLk2yJckccA1wYCU78KqQkjQZgy6F3AfcC1yWZDHJrqp6CrgOuBs4BuyvqqMrObgzd0majEFXy+xcZvwQcGjYg9tzl6TJ8HruktQg78QkSQ1y5i5JDfKqkJLUoObbMpv33PXtT6tK0lphW0aSGmRbRpIa1HxbRpLWItsyktQg2zKS1CCLuyQ1yOIuSQ3yhKokNWjNnFD1w0yS1hLbMpLUIIu7JDXI4i5JDbK4S1KDXC0jSQ1aM6tlTnPVjKS1wLaMJDXI4i5JDbK4S1KDLO6S1KA1W9w9qSqpZWMv7kl+MMnNSe5I8ivj3r8k6ewGKu5JbklyMsmRvvFtSY4nOZFkD0BVHauqdwA/DyyMP7Ik6WwGnbnfCmzrHUiyDrgJ2A5sBXYm2dp97CrgX4CPjS2pJGlgAxX3qro
H+Hrf8OXAiap6qKqeBG4Hru4+/0BVvRp4yzjDSpIGc94Ir10PPNKzvQi8IskVwJuA84FDy704yW5gN8CmTZtGiCFJ6jdKcc8SY1VVHwc+frYXV9XeJI8BO+bm5l4+Qo6hnV4x8/CNb5jG4SVpYkZZLbMIbOzZ3gA8upIdTOPaMpK0FoxS3O8DLk2yJckccA1wYCU78KqQkjQZgy6F3AfcC1yWZDHJrqp6CrgOuBs4BuyvqqMrObgzd0majIF67lW1c5nxQ5zhpOnZJNkB7Jifnx92F2Nh711Sa9bc9dwlaS3wTkyS1CBn7j28S5OkVqzZq0JKUstsy0hSg2zLLMH2jKTVzraMJDXI4i5JDbLnfga2ZyStVvbcJalBtmUkqUEW9wHYnpG02thzl6QG2XOXpAbZllkB2zOSVguLuyQ1yOIuSQ2yuEtSg1wtMwR775Jm3UD3UJ2UqjoIHFxYWLh2mjmG1V/gvQerpFlhW0aSGmRxl6QGTbUt05reNo0tGknT5MxdkhpkcZekBk2kuCf5mSR/luTDSV43iWNIkpY3cHFPckuSk0mO9I1vS3I8yYkkewCq6kNVdS3wduAXxppYknRWK5m53wps6x1Isg64CdgObAV2Jtna85Tf6T6+5vhBJ0nTNPBqmaq6J8nmvuHLgRNV9RBAktuBq5McA24EPlpVD4wr7GrkB50kTcOoPff1wCM924vdsXcCVwJvTvKOpV6YZHeSw0kOnzp1asQYkqReo65zzxJjVVXvA953phdW1d4kjwE75ubmXj5iDklSj1Fn7ovAxp7tDcCjg754Ld6JyV68pHNh1OJ+H3Bpki1J5oBrgAODvni1XhVSkmbdSpZC7gPuBS5LsphkV1U9BVwH3A0cA/ZX1dFB97kWZ+6nOYOXNEkrWS2zc5nxQ8ChYQ6eZAewY35+fpiXS5KWMdXLD6zlmXs/Z/KSxsk7MUlSg7wT05Q5W5c0CV4VUpIaZFtGkhrkCdUZ44lVSeNgW0aSGmRxl6QG2XNfhWzdSDobl0KuIhZ0SYOaanHXYCzqklbKnvuMsqBLGsVUZ+5eOOzMLPCShuU6d0lqkG0ZSWqQxV2SGmRxl6QGWdwlqUF+QnUVG+WTqn7KVWqbq2UkqUG2ZRrgLFxSP4u7JDXI4i5JDbK4r3G2dKQ2WdwlqUEWd0lq0NivCpnkRcANwIVV9eZx719n199mefjGN0wpiaRpGWjmnuSWJCeTHOkb35bkeJITSfYAVNVDVbVrEmF1ZvbPJZ02aFvmVmBb70CSdcBNwHZgK7AzydaxppMkDWWg4l5V9wBf7xu+HDjRnak/CdwOXD3ogZPsTnI4yeFTp04NHFiT4axfassoJ1TXA4/0bC8C65O8IMnNwEuTXL/ci6tqL/Bu4IG5ubkRYuhc8z8CafaNUtyzxFhV1X9U1Tuq6sVV9Xtn2oHXlpGkyRhltcwisLFnewPw6Ep24D1Uz72zzbiXe9wVN9LqMsrM/T7g0iRbkswB1wAHVrIDZ+6SNBmDLoXcB9wLXJZkMcmuqnoKuA64GzgG7K+qoys5uNdzPzcm3R+3By/NnoHaMlW1c5nxQ8ChYQ9eVQeBgwsLC9cOuw9J0jN5+QFJapC32ZOkBnmbPUlq0NgvHLYSLoVcPU6fMO1dEulJVGl2OXOXpAZ5QlWSGmRxl6QG2XPXxHjTEGl67LlLUoNsy0hSgyzuktQgP6GqFTnT2nYvICbNDnvuktQg2zKS1CCLuyQ1yOIuSQ2yuEtSg1wtI0kNcrWMJDXItowkNcjiLkkNsrhLUoMs7pLUIIu7JDXI4i5JDRr7nZiSXAD8CfAk8PGqum3cx5AkndlAM/cktyQ5meRI3/i2JMeTnEiypzv8JuCOqroWuGrMeSVJAxi0LXMrsK13IMk64CZgO7AV2JlkK7ABeKT7tP8bT0xJ0koM1JapqnuSbO4bvhw4UVUPASS5HbgaWKRT4D/NGf7zSLIb2A2wadOmlebWDFvuhh1Ljff
fNPv0c06P92+v9HkrsdQ+xrHf0/tZLTcIH9efedz7as2kvzejnFBdz9MzdOgU9fXAncDPJXk/cHC5F1fV3qpaqKqFiy++eIQYkqR+o5xQzRJjVVXfAH5xoB0kO4Ad8/PzI8SQJPUbZea+CGzs2d4APDpaHEnSOIxS3O8DLk2yJckccA1wYCU78KqQkjQZgy6F3AfcC1yWZDHJrqp6CrgOuBs4BuyvqqMrObjXc5ekyRh0tczOZcYPAYeGPXhVHQQOLiwsXDvsPiRJz+TlBySpQd5mT5Ia5G32JKlBqappZyDJKeBLK3zZRcDjE4gzDmYbjtmGY7bhtJDtB6pqyU+BzkRxH0aSw1W1MO0cSzHbcMw2HLMNp/VsnlCVpAZZ3CWpQau5uO+ddoAzMNtwzDYcsw2n6WyrtucuSVreap65S5KWYXGXpAatyuK+zL1bp5VlY5J/SnIsydEkv9odf36Sv0/y+e7vz5tSvnVJ/jXJR2YpVzfLc5PckeRz3e/fq2YlX5Jf7/59HkmyL8mzppVtqXsYnylLkuu7743jSX56Ctl+v/t3+pkkf5vkubOSreex30xSSS6apWxJ3tk9/tEk7x0pW1Wtql/AOuALwIuAOeBBYOsU81wCvKz79XcD/07nnrLvBfZ0x/cA75lSvt8A/hr4SHd7JnJ1j/9XwC93v54DnjsL+ejcUeyLwHd1t/cDb59WNuAngJcBR3rGlszS/bf3IHA+sKX7Xll3jrO9Djiv+/V7Zilbd3wjnavZfgm4aFayAT8J/ANwfnf7haNkO6dvmjF9U14F3N2zfT1w/bRz9eT5MPBTwHHgku7YJcDxKWTZAHwMeE1PcZ96ru6xv6dbQNM3PvV8PH0LyefTuXLqR7oFa2rZgM19hWDJLP3vh24Re9W5zNb32M8Ct81SNuAO4IeAh3uK+9Sz0ZlEXLnE84bKthrbMsvdu3XqujcRfynwSeB7q+oxgO7vL5xCpD8Cfhv4Vs/YLOSCzk9ep4C/7LaN/jzJBbOQr6q+AvwB8GXgMeCJqvq7WcjWY7kss/b++CXgo92vp54tyVXAV6rqwb6Hpp4NeAnw40k+meSfk/zIKNlWY3Ff8t6t5zxFnyTPAf4G+LWq+q8ZyPNG4GRV3T/tLMs4j86Ppe+vqpcC36DTXpi6bv/6ajo/An8/cEGSt0431cBm5v2R5AbgKeC200NLPO2cZUvybOAG4HeXeniJsXP9fTsPeB7wSuC3gP1JwpDZVmNxn7l7tyb5TjqF/baqurM7/LUkl3QfvwQ4eY5j/ShwVZKHgduB1yT54AzkOm0RWKyqT3a376BT7Gch35XAF6vqVFV9E7gTePWMZDttuSwz8f5I8jbgjcBbqttLmIFsL6bzH/aD3ffFBuCBJN83A9noZrizOj5F5yfui4bNthqL+8j3bh2n7v+sfwEcq6o/7HnoAPC27tdvo9OLP2eq6vqq2lBVm+l8j/6xqt467Vw9+b4KPJLksu7Qa4HPMhv5vgy8Msmzu3+/r6VzK8lZyHbaclkOANckOT/JFuBS4FPnMliSbcC7gKuq6n96Hppqtqr6t6p6YVVt7r4vFukshvjqtLN1fYjO+TGSvITOIoPHh842yRMGEzwR8Xo6q1K+ANww5Sw/RudHpM8An+7+ej3wAjonMz/f/f35U8x4BU+fUJ2lXD8MHO5+7z5E50fSmcgHvBv4HHAE+ACdlQpTyQbso9P7/yadgrTrTFnotB6+QOek6/YpZDtBp0d8+v1w86xk63v8YbonVGchG51i/sHuv7kHgNeMks3LD0hSg1ZjW0aSdBYWd0lqkMVdkhpkcZekBlncJalBFndJapDFXZIa9P9yyn9QIsa7pwAAAABJRU5ErkJggg==\n", 377 | "text/plain": [ 378 | "

" 379 | ] 380 | }, 381 | "metadata": { 382 | "needs_background": "light" 383 | }, 384 | "output_type": "display_data" 385 | } 386 | ], 387 | "source": [ 388 | "# Repeated questions histogram\n", 389 | "\n", 390 | "plt.hist(qid.value_counts().values,bins=160)\n", 391 | "plt.yscale('log')\n", 392 | "plt.show()" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": null, 398 | "id": "f9573e2f", 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [] 402 | } 403 | ], 404 | "metadata": { 405 | "kernelspec": { 406 | "display_name": "Python 3", 407 | "language": "python", 408 | "name": "python3" 409 | }, 410 | "language_info": { 411 | "codemirror_mode": { 412 | "name": "ipython", 413 | "version": 3 414 | }, 415 | "file_extension": ".py", 416 | "mimetype": "text/x-python", 417 | "name": "python", 418 | "nbconvert_exporter": "python", 419 | "pygments_lexer": "ipython3", 420 | "version": "3.8.8" 421 | } 422 | }, 423 | "nbformat": 4, 424 | "nbformat_minor": 5 425 | } 426 | -------------------------------------------------------------------------------- /only-bow.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "b478deb3", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import seaborn as sns" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "id": "217d407d", 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "df = pd.read_csv('train.csv')" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "id": "0cb99da3", 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "data": { 34 | "text/plain": [ 35 | "(404290, 6)" 36 | ] 37 | }, 38 | "execution_count": 3, 39 | "metadata": {}, 40 | "output_type": "execute_result" 41 | } 42 | ], 43 | "source": [ 44 | 
"df.shape" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 4, 50 | "id": "e5e7ce9a", 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "data": { 55 | "text/html": [ 56 | "

\n", 57 | "\n", 70 | "\n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | "

	id	qid1	qid2	question1	question2
0	0	1	2	What is the step by step guide to invest in sh...	What is the step by step guide to invest in sh...
1	1	3	4	What is the story of Kohinoor (Koh-i-Noor) Dia...	What would happen if the Indian government sto...
2	2	5	6	How can I increase the speed of my internet co...	How can Internet speed be increased by hacking...
3	3	7	8	Why am I mentally very lonely? How can I solve...	Find the remainder when [math]23^{24}[/math] i...
4	4	9	10	Which one dissolve in water quikly sugar, salt...	Which fish would survive in salt water?

\n", 130 | "

" 131 | ], 132 | "text/plain": [ 133 | " id qid1 qid2 question1 \\\n", 134 | "0 0 1 2 What is the step by step guide to invest in sh... \n", 135 | "1 1 3 4 What is the story of Kohinoor (Koh-i-Noor) Dia... \n", 136 | "2 2 5 6 How can I increase the speed of my internet co... \n", 137 | "3 3 7 8 Why am I mentally very lonely? How can I solve... \n", 138 | "4 4 9 10 Which one dissolve in water quikly sugar, salt... \n", 139 | "\n", 140 | " question2 is_duplicate \n", 141 | "0 What is the step by step guide to invest in sh... 0 \n", 142 | "1 What would happen if the Indian government sto... 0 \n", 143 | "2 How can Internet speed be increased by hacking... 0 \n", 144 | "3 Find the remainder when [math]23^{24}[/math] i... 0 \n", 145 | "4 Which fish would survive in salt water? 0 " 146 | ] 147 | }, 148 | "execution_count": 4, 149 | "metadata": {}, 150 | "output_type": "execute_result" 151 | } 152 | ], 153 | "source": [ 154 | "df.head()" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 5, 160 | "id": "94b6e88e", 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "new_df = df.sample(30000)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 6, 170 | "id": "5074efd7", 171 | "metadata": {}, 172 | "outputs": [ 173 | { 174 | "data": { 175 | "text/plain": [ 176 | "id 0\n", 177 | "qid1 0\n", 178 | "qid2 0\n", 179 | "question1 0\n", 180 | "question2 0\n", 181 | "is_duplicate 0\n", 182 | "dtype: int64" 183 | ] 184 | }, 185 | "execution_count": 6, 186 | "metadata": {}, 187 | "output_type": "execute_result" 188 | } 189 | ], 190 | "source": [ 191 | "new_df.isnull().sum()" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 7, 197 | "id": "e2763e28", 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "data": { 202 | "text/plain": [ 203 | "0" 204 | ] 205 | }, 206 | "execution_count": 7, 207 | "metadata": {}, 208 | "output_type": "execute_result" 209 | } 210 | ], 211 | "source": [ 212 | 
"new_df.duplicated().sum()" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 8, 218 | "id": "a1e18aeb", 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "data": { 223 | "text/html": [ 224 | "

\n", 225 | "\n", 238 | "\n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | "

	question1	question2
394861	What is Mark Zuckerberg's degree?	Is there any reservation for NT in upsc?
266389	Why is Saltwater Taffy candy imported in Poland?	Why is Saltwater Taffy candy imported in Portu...
32076	Who are the best people on the Internet to fol...	Who are the best people on the Internet to fol...
150593	I have been with my partner for 8 years Last y...	I split up with my wife two years ago. Officia...
128073	Is it legal to earn income from online freelan...	Can 4-5 employers file a petition for your H1-...

\n", 274 | "

" 275 | ], 276 | "text/plain": [ 277 | " question1 \\\n", 278 | "394861 What is Mark Zuckerberg's degree? \n", 279 | "266389 Why is Saltwater Taffy candy imported in Poland? \n", 280 | "32076 Who are the best people on the Internet to fol... \n", 281 | "150593 I have been with my partner for 8 years Last y... \n", 282 | "128073 Is it legal to earn income from online freelan... \n", 283 | "\n", 284 | " question2 \n", 285 | "394861 Is there any reservation for NT in upsc? \n", 286 | "266389 Why is Saltwater Taffy candy imported in Portu... \n", 287 | "32076 Who are the best people on the Internet to fol... \n", 288 | "150593 I split up with my wife two years ago. Officia... \n", 289 | "128073 Can 4-5 employers file a petition for your H1-... " 290 | ] 291 | }, 292 | "execution_count": 8, 293 | "metadata": {}, 294 | "output_type": "execute_result" 295 | } 296 | ], 297 | "source": [ 298 | "ques_df = new_df[['question1','question2']]\n", 299 | "ques_df.head()" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 9, 305 | "id": "dec56417", 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "from sklearn.feature_extraction.text import CountVectorizer\n", 310 | "# merge texts\n", 311 | "questions = list(ques_df['question1']) + list(ques_df['question2'])\n", 312 | "\n", 313 | "cv = CountVectorizer(max_features=3000)\n", 314 | "q1_arr, q2_arr = np.vsplit(cv.fit_transform(questions).toarray(),2)" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 10, 320 | "id": "88026075", 321 | "metadata": {}, 322 | "outputs": [ 323 | { 324 | "data": { 325 | "text/plain": [ 326 | "(30000, 6000)" 327 | ] 328 | }, 329 | "execution_count": 10, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "temp_df1 = pd.DataFrame(q1_arr, index= ques_df.index)\n", 336 | "temp_df2 = pd.DataFrame(q2_arr, index= ques_df.index)\n", 337 | "temp_df = pd.concat([temp_df1, temp_df2], axis=1)\n", 338 
| "temp_df.shape" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 15, 344 | "id": "2f202654", 345 | "metadata": {}, 346 | "outputs": [ 347 | { 348 | "data": { 349 | "text/html": [ 350 | "

\n", 351 | "\n", 364 | "\n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " 
\n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | "

	0	1	2	3	4	5	6	7	8	9	...	2991	2992	2993	2994	2995	2996	2997	2998	2999	is_duplicate
394861	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
266389	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	1
32076	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
150593	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
128073	0	0	0	0	0	0	0	0	0	0	...	0	0	1	0	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
350070	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
301346	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
79932	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
74788	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
61770	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0

\n", 658 | "

30000 rows × 6001 columns

\n", 659 | "

" 660 | ], 661 | "text/plain": [ 662 | " 0 1 2 3 4 5 6 7 8 9 ... 2991 2992 2993 2994 2995 2996 \\\n", 663 | "394861 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", 664 | "266389 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", 665 | "32076 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", 666 | "150593 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", 667 | "128073 0 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 \n", 668 | "... .. .. .. .. .. .. .. .. .. .. ... ... ... ... ... ... ... \n", 669 | "350070 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", 670 | "301346 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", 671 | "79932 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", 672 | "74788 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", 673 | "61770 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", 674 | "\n", 675 | " 2997 2998 2999 is_duplicate \n", 676 | "394861 0 0 0 0 \n", 677 | "266389 0 0 0 1 \n", 678 | "32076 0 0 0 0 \n", 679 | "150593 0 0 0 0 \n", 680 | "128073 0 0 0 0 \n", 681 | "... ... ... ... ... \n", 682 | "350070 0 0 0 0 \n", 683 | "301346 0 0 0 0 \n", 684 | "79932 0 0 0 0 \n", 685 | "74788 0 0 0 0 \n", 686 | "61770 0 0 0 0 \n", 687 | "\n", 688 | "[30000 rows x 6001 columns]" 689 | ] 690 | }, 691 | "execution_count": 15, 692 | "metadata": {}, 693 | "output_type": "execute_result" 694 | } 695 | ], 696 | "source": [ 697 | "temp_df" 698 | ] 699 | }, 700 | { 701 | "cell_type": "code", 702 | "execution_count": 11, 703 | "id": "b634b449", 704 | "metadata": {}, 705 | "outputs": [], 706 | "source": [ 707 | "temp_df['is_duplicate'] = new_df['is_duplicate']" 708 | ] 709 | }, 710 | { 711 | "cell_type": "code", 712 | "execution_count": 12, 713 | "id": "96f3125e", 714 | "metadata": {}, 715 | "outputs": [ 716 | { 717 | "data": { 718 | "text/html": [ 719 | "

\n", 720 | "\n", 733 | "\n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | "

	...	2993	is_duplicate
394861	...	0	0
266389	...	0	1
32076	...	0	0
150593	...	0	0
128073	...	1	0

\n", 883 | "

5 rows × 6001 columns

\n", 884 | "

" 885 | ], 886 | "text/plain": [ 887 | " 0 1 2 3 4 5 6 7 8 9 ... 2991 2992 2993 2994 2995 2996 \\\n", 888 | "394861 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", 889 | "266389 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", 890 | "32076 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", 891 | "150593 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n", 892 | "128073 0 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 \n", 893 | "\n", 894 | " 2997 2998 2999 is_duplicate \n", 895 | "394861 0 0 0 0 \n", 896 | "266389 0 0 0 1 \n", 897 | "32076 0 0 0 0 \n", 898 | "150593 0 0 0 0 \n", 899 | "128073 0 0 0 0 \n", 900 | "\n", 901 | "[5 rows x 6001 columns]" 902 | ] 903 | }, 904 | "execution_count": 12, 905 | "metadata": {}, 906 | "output_type": "execute_result" 907 | } 908 | ], 909 | "source": [ 910 | "temp_df.head()" 911 | ] 912 | }, 913 | { 914 | "cell_type": "code", 915 | "execution_count": 13, 916 | "id": "7dec87f7", 917 | "metadata": {}, 918 | "outputs": [], 919 | "source": [ 920 | "from sklearn.model_selection import train_test_split\n", 921 | "X_train,X_test,y_train,y_test = train_test_split(temp_df.iloc[:,0:-1].values,temp_df.iloc[:,-1].values,test_size=0.2,random_state=1)" 922 | ] 923 | }, 924 | { 925 | "cell_type": "code", 926 | "execution_count": 14, 927 | "id": "92d4785f", 928 | "metadata": {}, 929 | "outputs": [ 930 | { 931 | "data": { 932 | "text/plain": [ 933 | "0.742" 934 | ] 935 | }, 936 | "execution_count": 14, 937 | "metadata": {}, 938 | "output_type": "execute_result" 939 | } 940 | ], 941 | "source": [ 942 | "from sklearn.ensemble import RandomForestClassifier\n", 943 | "from sklearn.metrics import accuracy_score\n", 944 | "rf = RandomForestClassifier()\n", 945 | "rf.fit(X_train,y_train)\n", 946 | "y_pred = rf.predict(X_test)\n", 947 | "accuracy_score(y_test,y_pred)" 948 | ] 949 | }, 950 | { 951 | "cell_type": "code", 952 | "execution_count": 15, 953 | "id": "a69c5c5a", 954 | "metadata": {}, 955 | "outputs": [ 956 | { 957 | "name": "stderr", 958 | "output_type": "stream", 959 | "text": [ 960 | 
"C:\\Users\\91842\\anaconda3\\lib\\site-packages\\xgboost\\sklearn.py:1224: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].\n", 961 | " warnings.warn(label_encoder_deprecation_msg, UserWarning)\n" 962 | ] 963 | }, 964 | { 965 | "name": "stdout", 966 | "output_type": "stream", 967 | "text": [ 968 | "[14:13:04] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.0/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n" 969 | ] 970 | }, 971 | { 972 | "data": { 973 | "text/plain": [ 974 | "0.7328333333333333" 975 | ] 976 | }, 977 | "execution_count": 15, 978 | "metadata": {}, 979 | "output_type": "execute_result" 980 | } 981 | ], 982 | "source": [ 983 | "from xgboost import XGBClassifier\n", 984 | "xgb = XGBClassifier()\n", 985 | "xgb.fit(X_train,y_train)\n", 986 | "y_pred = xgb.predict(X_test)\n", 987 | "accuracy_score(y_test,y_pred)" 988 | ] 989 | }, 990 | { 991 | "cell_type": "code", 992 | "execution_count": null, 993 | "id": "9c576f36", 994 | "metadata": {}, 995 | "outputs": [], 996 | "source": [] 997 | } 998 | ], 999 | "metadata": { 1000 | "kernelspec": { 1001 | "display_name": "Python 3", 1002 | "language": "python", 1003 | "name": "python3" 1004 | }, 1005 | "language_info": { 1006 | "codemirror_mode": { 1007 | "name": "ipython", 1008 | "version": 3 1009 | }, 1010 | "file_extension": ".py", 1011 | "mimetype": "text/x-python", 1012 | "name": "python", 1013 | "nbconvert_exporter": "python", 1014 | "pygments_lexer": "ipython3", 1015 | "version": "3.8.8" 1016 | } 1017 | }, 1018 | "nbformat": 4, 1019 | 
def _lowered_word_set(question):
    """Return the distinct lower-cased, stripped tokens of *question*.

    The text is split on single spaces (``split(" ")``), so consecutive
    spaces and the empty string produce an empty-string token that is kept
    in the set.  NOTE(review): presumably this mirrors how the features
    were computed at training time -- confirm before "fixing" it.
    """
    return {word.lower().strip() for word in question.split(" ")}


def test_common_words(q1, q2):
    """Count the distinct tokens that appear in both questions.

    Args:
        q1: First question text.
        q2: Second question text.

    Returns:
        Size of the intersection of the two lower-cased token sets.
    """
    return len(_lowered_word_set(q1) & _lowered_word_set(q2))


def test_total_words(q1, q2):
    """Return the number of distinct tokens in q1 plus the number in q2.

    Tokens shared by both questions are counted twice (once per question).
    Because ``split(" ")`` always yields at least one token, the result is
    never smaller than 2.
    """
    return len(_lowered_word_set(q1)) + len(_lowered_word_set(q2))


# These are feature extractors, not tests; keep pytest from collecting them
# if this module is ever star-imported into a test module.
test_common_words.__test__ = False
test_total_words.__test__ = False
_STOP_WORDS_CACHE = None


def _load_stop_words():
    """Load the pickled stop-word collection once and reuse it afterwards.

    The file handle is closed deterministically via ``with``.  The loaded
    collection is wrapped in a frozenset for O(1) membership tests.
    NOTE(review): assumes stopwords.pkl holds an iterable of strings -- confirm.
    """
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        with open('stopwords.pkl', 'rb') as fh:
            _STOP_WORDS_CACHE = frozenset(pickle.load(fh))
    return _STOP_WORDS_CACHE


def test_fetch_token_features(q1, q2, stop_words=None):
    """Compute the eight token-overlap features for a question pair.

    Args:
        q1: First question text (tokenised with ``str.split()``).
        q2: Second question text.
        stop_words: Optional collection of stop words.  When omitted the
            collection pickled in ``stopwords.pkl`` is loaded (once).

    Returns:
        A list of 8 values:
          [0] common non-stop words / (min non-stop-word count + SAFE_DIV)
          [1] common non-stop words / (max non-stop-word count + SAFE_DIV)
          [2] common stop words     / (min stop-word count + SAFE_DIV)
          [3] common stop words     / (max stop-word count + SAFE_DIV)
          [4] common tokens         / (min token count + SAFE_DIV)
          [5] common tokens         / (max token count + SAFE_DIV)
          [6] 1 if the last tokens are equal, else 0
          [7] 1 if the first tokens are equal, else 0
        All zeros when either question has no tokens.
    """
    SAFE_DIV = 0.0001  # keeps the ratios defined when a count is zero

    token_features = [0.0] * 8

    q1_tokens = q1.split()
    q2_tokens = q2.split()

    if not q1_tokens or not q2_tokens:
        return token_features

    if stop_words is None:
        stop_words = _load_stop_words()

    # Distinct non-stop words / stop words of each question.
    q1_words = {word for word in q1_tokens if word not in stop_words}
    q2_words = {word for word in q2_tokens if word not in stop_words}
    q1_stops = {word for word in q1_tokens if word in stop_words}
    q2_stops = {word for word in q2_tokens if word in stop_words}

    common_word_count = len(q1_words & q2_words)
    common_stop_count = len(q1_stops & q2_stops)
    common_token_count = len(set(q1_tokens) & set(q2_tokens))

    token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    # Denominators here use the raw token counts (duplicates included).
    token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)

    # Whether the questions end / start with the same token.
    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])

    return token_features


# Feature extractor, not a test; keep pytest from collecting it if this
# module is ever star-imported into a test module.
test_fetch_token_features.__test__ = False
def test_fetch_length_features(q1, q2):
    """Compute the three length-based features for a question pair.

    Args:
        q1: First question text.
        q2: Second question text.

    Returns:
        A list of 3 values:
          [0] absolute difference of the two token counts
          [1] mean of the two token counts
          [2] length of a longest common substring divided by
              (min character length + 1); 0.0 when the questions share
              no substring at all
        All zeros when either question has no tokens.
    """
    length_features = [0.0] * 3

    q1_tokens = q1.split()
    q2_tokens = q2.split()

    if not q1_tokens or not q2_tokens:
        return length_features

    length_features[0] = abs(len(q1_tokens) - len(q2_tokens))
    length_features[1] = (len(q1_tokens) + len(q2_tokens)) / 2

    # lcsubstrings() yields an empty collection when the two strings have no
    # character in common; indexing [0] unconditionally raised IndexError.
    common_substrings = list(distance.lcsubstrings(q1, q2))
    if common_substrings:
        length_features[2] = len(common_substrings[0]) / (min(len(q1), len(q2)) + 1)

    return length_features


def test_fetch_fuzzy_features(q1, q2):
    """Return the four fuzzywuzzy similarity scores for a question pair.

    Returns:
        ``[QRatio, partial_ratio, token_sort_ratio, token_set_ratio]``
        in that order, each computed by ``fuzzywuzzy.fuzz`` on (q1, q2).
    """
    return [
        fuzz.QRatio(q1, q2),
        fuzz.partial_ratio(q1, q2),
        fuzz.token_sort_ratio(q1, q2),
        fuzz.token_set_ratio(q1, q2),
    ]


# Feature extractors, not tests; keep pytest from collecting them if this
# module is ever star-imported into a test module.
test_fetch_length_features.__test__ = False
test_fetch_fuzzy_features.__test__ = False
# Contraction -> expansion table, built once at import time instead of on
# every preprocess() call.
# https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
# https://stackoverflow.com/a/19794953
_CONTRACTIONS = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

# Raw string: '\W' in a plain literal is an invalid escape sequence.
_NON_WORD_RE = re.compile(r'\W')


def preprocess(q):
    """Normalise a raw question string before feature extraction.

    Steps, in order: lower-case and strip; spell out a few currency/symbol
    characters; drop the literal '[math]' marker; abbreviate large round
    numbers (k / m / b); expand English contractions; strip HTML markup;
    replace every non-word character with a space.

    Args:
        q: The question; converted with ``str()`` first.

    Returns:
        The cleaned, lower-cased question text.
    """
    q = str(q).lower().strip()

    # Replace certain special characters with their string equivalents
    for symbol, spelled_out in (
        ('%', ' percent'),
        ('$', ' dollar '),
        ('₹', ' rupee '),
        ('€', ' euro '),
        ('@', ' at '),
    ):
        q = q.replace(symbol, spelled_out)

    # Drop the literal '[math]' marker.
    q = q.replace('[math]', '')

    # Replace some round numbers with short forms (not exhaustive).
    # Order matters: the longest zero run must be handled first.
    q = q.replace(',000,000,000 ', 'b ')
    q = q.replace(',000,000 ', 'm ')
    q = q.replace(',000 ', 'k ')
    q = re.sub(r'([0-9]+)000000000', r'\1b', q)
    q = re.sub(r'([0-9]+)000000', r'\1m', q)
    q = re.sub(r'([0-9]+)000', r'\1k', q)

    # Expand whole-word contractions, then catch leftover suffixes
    # (e.g. a contraction glued to punctuation that missed the lookup).
    q = ' '.join(_CONTRACTIONS.get(word, word) for word in q.split())
    q = q.replace("'ve", " have")
    q = q.replace("n't", " not")
    q = q.replace("'re", " are")
    q = q.replace("'ll", " will")

    # Removing HTML tags.  The parser is named explicitly so the result does
    # not depend on which optional parsers happen to be installed.
    # NOTE(review): if the model was trained with a different parser
    # installed (e.g. lxml), confirm outputs match on markup-like input.
    q = BeautifulSoup(q, 'html.parser').get_text()

    # Remove punctuation
    q = _NON_WORD_RE.sub(' ', q).strip()

    return q
def query_point_creator(q1, q2):
    """Build the single-row feature matrix the model expects for a pair.

    Both questions are preprocessed, then 22 hand-crafted features are
    computed and concatenated with the bag-of-words vectors of q1 and q2
    produced by the module-level vectorizer ``cv``.

    Args:
        q1: First raw question.
        q2: Second raw question.

    Returns:
        A 2-D NumPy array with one row: the 22 hand-crafted features followed
        by the bag-of-words vector of q1 and then that of q2.
    """
    q1 = preprocess(q1)
    q2 = preprocess(q2)

    # Computed once and reused for the ratio below.  The total is never 0
    # because split(" ") always yields at least one token per question.
    common = test_common_words(q1, q2)
    total = test_total_words(q1, q2)

    # Basic features (7): char lengths, space-split word counts,
    # common / total words and their ratio.
    input_query = [
        len(q1),
        len(q2),
        len(q1.split(" ")),
        len(q2.split(" ")),
        common,
        total,
        round(common / total, 2),
    ]

    # Token (8), length (3) and fuzzy (4) features.
    input_query.extend(test_fetch_token_features(q1, q2))
    input_query.extend(test_fetch_length_features(q1, q2))
    input_query.extend(test_fetch_fuzzy_features(q1, q2))

    # Bag-of-words vectors for each question.
    q1_bow = cv.transform([q1]).toarray()
    q2_bow = cv.transform([q2]).toarray()

    # 22 = 7 basic + 8 token + 3 length + 4 fuzzy; the fixed width doubles as
    # a sanity check that no feature group changed size.
    handcrafted = np.array(input_query).reshape(1, 22)
    return np.hstack((handcrafted, q1_bow, q2_bow))
/streamlit-app/readme.txt: -------------------------------------------------------------------------------- 1 | This is the streamlit web app 2 | -------------------------------------------------------------------------------- /streamlit-app/requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit 2 | scikit-learn 3 | fuzzywuzzy 4 | distance 5 | bs4 -------------------------------------------------------------------------------- /streamlit-app/setup.sh: -------------------------------------------------------------------------------- 1 | mkdir -p ~/.streamlit/ 2 | 3 | echo "\ 4 | [server]\n\ 5 | port = $PORT\n\ 6 | enableCORS = false\n\ 7 | headless = true\n\ 8 | \n\ 9 | " > ~/.streamlit/config.toml --------------------------------------------------------------------------------