├── README.md
├── bow-with-basic-features.ipynb
├── bow-with-preprocessing-and-advanced-features.ipynb
├── initial_EDA.ipynb
├── only-bow.ipynb
└── streamlit-app
├── Procfile
├── app.py
├── helper.py
├── readme.txt
├── requirements.txt
└── setup.sh
/README.md:
--------------------------------------------------------------------------------
1 | # quora-question-pairs
2 | An NLP project to determine whether two given questions are semantically the same or not.
3 |
4 | Dataset Link - https://www.kaggle.com/c/quora-question-pairs
5 |
--------------------------------------------------------------------------------
/initial_EDA.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 21,
6 | "id": "600ccbe8",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import numpy as np\n",
11 | "import pandas as pd\n",
12 | "import seaborn as sns\n",
13 | "import matplotlib.pyplot as plt"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 22,
19 | "id": "60425156",
20 | "metadata": {},
21 | "outputs": [
22 | {
23 | "data": {
24 | "text/plain": [
25 | "(404290, 6)"
26 | ]
27 | },
28 | "execution_count": 22,
29 | "metadata": {},
30 | "output_type": "execute_result"
31 | }
32 | ],
33 | "source": [
34 | "df = pd.read_csv(\"train.csv\")\n",
35 | "df.shape"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 25,
41 | "id": "5a3d86b9",
42 | "metadata": {},
43 | "outputs": [
44 | {
45 | "data": {
46 | "text/html": [
47 | "
\n",
48 | "\n",
61 | "
\n",
62 | " \n",
63 | " \n",
64 | " | \n",
65 | " id | \n",
66 | " qid1 | \n",
67 | " qid2 | \n",
68 | " question1 | \n",
69 | " question2 | \n",
70 | " is_duplicate | \n",
71 | "
\n",
72 | " \n",
73 | " \n",
74 | " \n",
75 | " 183268 | \n",
76 | " 183268 | \n",
77 | " 280288 | \n",
78 | " 280289 | \n",
79 | " How did monkeys get to South America from Afri... | \n",
80 | " I fucking hate my life, I'm black, poor nd liv... | \n",
81 | " 0 | \n",
82 | "
\n",
83 | " \n",
84 | " 112930 | \n",
85 | " 112930 | \n",
86 | " 184684 | \n",
87 | " 132960 | \n",
88 | " What is the best photo ever taken in your life? | \n",
89 | " What is the best picture taken by you? | \n",
90 | " 1 | \n",
91 | "
\n",
92 | " \n",
93 | " 300075 | \n",
94 | " 300075 | \n",
95 | " 348955 | \n",
96 | " 422827 | \n",
97 | " What are some things new employees should know... | \n",
98 | " What are some things new employees should know... | \n",
99 | " 0 | \n",
100 | "
\n",
101 | " \n",
102 | " 223993 | \n",
103 | " 223993 | \n",
104 | " 296218 | \n",
105 | " 184831 | \n",
106 | " Why do the British care about the Royal Family? | \n",
107 | " Why has the UK retained the monarchy? | \n",
108 | " 0 | \n",
109 | "
\n",
110 | " \n",
111 | " 171389 | \n",
112 | " 171389 | \n",
113 | " 177374 | \n",
114 | " 264819 | \n",
115 | " Which is the most inspiring book to read? | \n",
116 | " What is the most inspiring book you have ever ... | \n",
117 | " 0 | \n",
118 | "
\n",
119 | " \n",
120 | " 357002 | \n",
121 | " 357002 | \n",
122 | " 486390 | \n",
123 | " 486391 | \n",
124 | " Why can't I forget my girlfriend? | \n",
125 | " Why can't I forget my first girlfriend? | \n",
126 | " 1 | \n",
127 | "
\n",
128 | " \n",
129 | " 348760 | \n",
130 | " 348760 | \n",
131 | " 477337 | \n",
132 | " 477338 | \n",
133 | " Which is greater rise in 1 degree Celsius or r... | \n",
134 | " If I sit and hold 100 grams of ice at zero deg... | \n",
135 | " 0 | \n",
136 | "
\n",
137 | " \n",
138 | " 119950 | \n",
139 | " 119950 | \n",
140 | " 194645 | \n",
141 | " 194646 | \n",
142 | " What are some ways to amplify linear motion an... | \n",
143 | " How do you amplify linear motion? | \n",
144 | " 1 | \n",
145 | "
\n",
146 | " \n",
147 | " 209885 | \n",
148 | " 209885 | \n",
149 | " 314294 | \n",
150 | " 314295 | \n",
151 | " How should one prepare for IAS when he is in h... | \n",
152 | " How can I prepare for IAS from my first year o... | \n",
153 | " 1 | \n",
154 | "
\n",
155 | " \n",
156 | " 23430 | \n",
157 | " 23430 | \n",
158 | " 43885 | \n",
159 | " 43886 | \n",
160 | " In the initial days of a SaaS startup, when th... | \n",
161 | " I have to manage the entire operations and pro... | \n",
162 | " 0 | \n",
163 | "
\n",
164 | " \n",
165 | "
\n",
166 | "
"
167 | ],
168 | "text/plain": [
169 | " id qid1 qid2 \\\n",
170 | "183268 183268 280288 280289 \n",
171 | "112930 112930 184684 132960 \n",
172 | "300075 300075 348955 422827 \n",
173 | "223993 223993 296218 184831 \n",
174 | "171389 171389 177374 264819 \n",
175 | "357002 357002 486390 486391 \n",
176 | "348760 348760 477337 477338 \n",
177 | "119950 119950 194645 194646 \n",
178 | "209885 209885 314294 314295 \n",
179 | "23430 23430 43885 43886 \n",
180 | "\n",
181 | " question1 \\\n",
182 | "183268 How did monkeys get to South America from Afri... \n",
183 | "112930 What is the best photo ever taken in your life? \n",
184 | "300075 What are some things new employees should know... \n",
185 | "223993 Why do the British care about the Royal Family? \n",
186 | "171389 Which is the most inspiring book to read? \n",
187 | "357002 Why can't I forget my girlfriend? \n",
188 | "348760 Which is greater rise in 1 degree Celsius or r... \n",
189 | "119950 What are some ways to amplify linear motion an... \n",
190 | "209885 How should one prepare for IAS when he is in h... \n",
191 | "23430 In the initial days of a SaaS startup, when th... \n",
192 | "\n",
193 | " question2 is_duplicate \n",
194 | "183268 I fucking hate my life, I'm black, poor nd liv... 0 \n",
195 | "112930 What is the best picture taken by you? 1 \n",
196 | "300075 What are some things new employees should know... 0 \n",
197 | "223993 Why has the UK retained the monarchy? 0 \n",
198 | "171389 What is the most inspiring book you have ever ... 0 \n",
199 | "357002 Why can't I forget my first girlfriend? 1 \n",
200 | "348760 If I sit and hold 100 grams of ice at zero deg... 0 \n",
201 | "119950 How do you amplify linear motion? 1 \n",
202 | "209885 How can I prepare for IAS from my first year o... 1 \n",
203 | "23430 I have to manage the entire operations and pro... 0 "
204 | ]
205 | },
206 | "execution_count": 25,
207 | "metadata": {},
208 | "output_type": "execute_result"
209 | }
210 | ],
211 | "source": [
212 | "df.sample(10)"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": 4,
218 | "id": "37b00141",
219 | "metadata": {},
220 | "outputs": [
221 | {
222 | "name": "stdout",
223 | "output_type": "stream",
224 | "text": [
225 | "\n",
226 | "RangeIndex: 404290 entries, 0 to 404289\n",
227 | "Data columns (total 6 columns):\n",
228 | " # Column Non-Null Count Dtype \n",
229 | "--- ------ -------------- ----- \n",
230 | " 0 id 404290 non-null int64 \n",
231 | " 1 qid1 404290 non-null int64 \n",
232 | " 2 qid2 404290 non-null int64 \n",
233 | " 3 question1 404289 non-null object\n",
234 | " 4 question2 404288 non-null object\n",
235 | " 5 is_duplicate 404290 non-null int64 \n",
236 | "dtypes: int64(4), object(2)\n",
237 | "memory usage: 18.5+ MB\n"
238 | ]
239 | }
240 | ],
241 | "source": [
242 | "df.info()"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": 5,
248 | "id": "c5b82789",
249 | "metadata": {},
250 | "outputs": [
251 | {
252 | "data": {
253 | "text/plain": [
254 | "id 0\n",
255 | "qid1 0\n",
256 | "qid2 0\n",
257 | "question1 1\n",
258 | "question2 2\n",
259 | "is_duplicate 0\n",
260 | "dtype: int64"
261 | ]
262 | },
263 | "execution_count": 5,
264 | "metadata": {},
265 | "output_type": "execute_result"
266 | }
267 | ],
268 | "source": [
269 | "# missing values\n",
270 | "df.isnull().sum()"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": 6,
276 | "id": "e704abaf",
277 | "metadata": {},
278 | "outputs": [
279 | {
280 | "data": {
281 | "text/plain": [
282 | "0"
283 | ]
284 | },
285 | "execution_count": 6,
286 | "metadata": {},
287 | "output_type": "execute_result"
288 | }
289 | ],
290 | "source": [
291 | "# duplicate rows\n",
292 | "df.duplicated().sum()"
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": 17,
298 | "id": "3f9bd6af",
299 | "metadata": {},
300 | "outputs": [
301 | {
302 | "name": "stdout",
303 | "output_type": "stream",
304 | "text": [
305 | "0 255027\n",
306 | "1 149263\n",
307 | "Name: is_duplicate, dtype: int64\n",
308 | "0 63.080215\n",
309 | "1 36.919785\n",
310 | "Name: is_duplicate, dtype: float64\n"
311 | ]
312 | },
313 | {
314 | "data": {
315 | "text/plain": [
316 | ""
317 | ]
318 | },
319 | "execution_count": 17,
320 | "metadata": {},
321 | "output_type": "execute_result"
322 | },
323 | {
324 | "data": {
325 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYoAAAD1CAYAAABOfbKwAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAQMElEQVR4nO3db6je5X3H8fdnphNZq0Q9is2fRWbKpsIshij0SUcgydoHWlB2fFDDFkgRhRb6YNonFiWgsFYQpmAxGKWrBttiWGtdph2lzKnHItXoXA7Vapqg6RKse6Bb0u8e3Ndp75zeuc7JSXJOYt4v+HH/7u/vuq5z3XDkk991/e5jqgpJko7kjxZ6ApKkk5tBIUnqMigkSV0GhSSpy6CQJHUZFJKkrkULPYHj7fzzz68VK1Ys9DQk6ZTy4osv/rqqxkZd+8gFxYoVK5iYmFjoaUjSKSXJL490zaUnSVKXQSFJ6jIoJEldBoUkqcugkCR1GRSSpC6DQpLUZVBIkro+cl+4O1WsuPUHCz2Fj5Q37/r8Qk9B+sia8Y4iybIkP07yWpKdSb7c6l9P8qskL7Xjc0N9bksymeT1JOuG6lcmeblduzdJWv3MJI+1+nNJVgz12ZBkVzs2HNdPL0ma0WzuKA4CX62qnyX5BPBikh3t2j1V9Q/DjZNcCowDlwGfBP41yaeq6hBwP7AJ+A/gh8B64ElgI3Cgqi5JMg7cDfxNknOB24FVQLWfvb2qDhzbx5YkzdaMdxRVtbeqftbO3wdeA5Z0ulwDPFpVH1bVG8AksDrJRcDZVfVsDf5H3Q8D1w712drOHwfWtLuNdcCOqtrfwmEHg3CRJM2To9rMbktCnwaea6Vbkvw8yZYki1ttCfD2ULfdrbaknU+vH9anqg4C7wHndcaaPq9NSSaSTOzbt+9oPpIkaQazDookHwe+C3ylqn7DYBnpz4ArgL3AN6aajuhenfpc+/y+UPVAVa2qqlVjYyP/Sq4kaY5mFRRJPsYgJL5dVd8DqKp3qupQVf0W+BawujXfDSwb6r4U2NPqS0fUD+uTZBFwDrC/M5YkaZ7M5qmnAA8Cr1XVN4fqFw01+wLwSjvfDoy3J5kuBlYCz1fVXuD9JFe3MW8EnhjqM/VE03XAM20f4ylgbZLFbWlrbatJkubJbJ56+gzwReDlJC+12teAG5JcwWAp6E3gSwBVtTPJNuBVBk9M3dyeeAK4CXgIOIvB005PtvqDwCNJJhncSYy3sfYnuRN4obW7o6r2z+WDSpLmZsagqKqfMnqv4IedPpuBzSPqE8DlI+ofANcfYawtwJaZ5ilJOjH8Ex6SpC6DQpLUZVBIkroMCklSl0EhSeoyKCRJXQaFJKnLoJAkdRkUkqQug0KS1GVQSJK6DApJUpdBIUnqMigkSV0GhSSpy6CQJHUZFJKkLoNCktRlUEiSugwKSVKXQSFJ6jIoJEldBoUkqcugkCR1GRSSpC6DQpLUZVBIkroMCklSl0EhSeoyKCRJXQaFJKlrxqBIsizJj5O8lmRnki+3+rlJdiTZ1V4XD/W5LclkkteTrBuqX5nk5Xbt3iRp9TOTPNbqzyVZMdRnQ/sZu5JsOK6fXpI0o9ncURwEvlpVfwFcDdyc5FLgVuDpqloJPN3e066NA5cB64H7kpzRxrof2ASsbMf6Vt8IHKiqS4B7gLvbWOcCtwNXAauB24cDSZJ04s0YFFW1t6p+1s7fB14DlgDXAFtbs63Ate38GuDRqvqwqt4AJoHVSS4Czq6qZ6uqgIen9Zka63FgTbvbWAfsqKr9VXUA2MHvw0WSNA+Oao+iLQl9GngOuLCq9sIgTIALWrMlwNtD3Xa32pJ2Pr1+WJ+qOgi8B5zXGUuSNE9mHRRJPg58F/hKVf2m13RErTr1ufYZntumJBNJJvbt29eZmiTpaM0qKJJ8jEFIfLuqvtfK77TlJNrru62+G1g21H0psKfVl46oH9YnySLgHGB/
Z6zDVNUDVbWqqlaNjY3N5iNJkmZpNk89BXgQeK2qvjl0aTsw9RTSBuCJofp4e5LpYgab1s+35an3k1zdxrxxWp+psa4Dnmn7GE8Ba5MsbpvYa1tNkjRPFs2izWeALwIvJ3mp1b4G3AVsS7IReAu4HqCqdibZBrzK4Impm6vqUOt3E/AQcBbwZDtgEESPJJlkcCcx3sban+RO4IXW7o6q2j+3jypJmosZg6KqfsrovQKANUfosxnYPKI+AVw+ov4BLWhGXNsCbJlpnpKkE8NvZkuSugwKSVKXQSFJ6jIoJEldBoUkqcugkCR1GRSSpC6DQpLUZVBIkroMCklSl0EhSeoyKCRJXQaFJKnLoJAkdRkUkqQug0KS1GVQSJK6DApJUpdBIUnqMigkSV0GhSSpy6CQJHUZFJKkLoNCktRlUEiSugwKSVKXQSFJ6jIoJEldBoUkqcugkCR1GRSSpC6DQpLUNWNQJNmS5N0krwzVvp7kV0leasfnhq7dlmQyyetJ1g3Vr0zycrt2b5K0+plJHmv155KsGOqzIcmudmw4bp9akjRrs7mjeAhYP6J+T1Vd0Y4fAiS5FBgHLmt97ktyRmt/P7AJWNmOqTE3Ageq6hLgHuDuNta5wO3AVcBq4PYki4/6E0qSjsmMQVFVPwH2z3K8a4BHq+rDqnoDmARWJ7kIOLuqnq2qAh4Grh3qs7WdPw6saXcb64AdVbW/qg4AOxgdWJKkE+hY9ihuSfLztjQ19S/9JcDbQ212t9qSdj69flifqjoIvAec1xlLkjSPFs2x3/3AnUC1128AfwdkRNvq1Jljn8Mk2cRgWYvly5f35i1pFlbc+oOFnsJHxpt3fX6hp3DM5nRHUVXvVNWhqvot8C0Gewgw+Ff/sqGmS4E9rb50RP2wPkkWAecwWOo60lij5vNAVa2qqlVjY2Nz+UiSpCOYU1C0PYcpXwCmnojaDoy3J5kuZrBp/XxV7QXeT3J123+4EXhiqM/UE03XAc+0fYyngLVJFrelrbWtJkmaRzMuPSX5DvBZ4Pwkuxk8ifTZJFcwWAp6E/gSQFXtTLINeBU4CNxcVYfaUDcxeILqLODJdgA8CDySZJLBncR4G2t/kjuBF1q7O6pqtpvqkqTjZMagqKobRpQf7LTfDGweUZ8ALh9R/wC4/ghjbQG2zDRHSdKJ4zezJUldBoUkqcugkCR1GRSSpC6DQpLUZVBIkroMCklSl0EhSeoyKCRJXQaFJKnLoJAkdRkUkqQug0KS1GVQSJK6DApJUpdBIUnqMigkSV0GhSSpy6CQJHUZFJKkLoNCktRlUEiSugwKSVKXQSFJ6jIoJEldBoUkqcugkCR1GRSSpC6DQpLUZVBIkroMCklS14xBkWRLkneTvDJUOzfJjiS72uvioWu3JZlM8nqSdUP1K5O83K7dmyStfmaSx1r9uSQrhvpsaD9jV5INx+1TS5JmbTZ3FA8B66fVbgWerqqVwNPtPUkuBcaBy1qf+5Kc0frcD2wCVrZjasyNwIGqugS4B7i7jXUucDtwFbAauH04kCRJ82PGoKiqnwD7p5WvAba2863AtUP1R6vqw6p6A5gEVie5CDi7qp6tqgIentZnaqzHgTXtbmMdsKOq9lfVAWAHfxhYkqQTbK57FBdW1V6A9npBqy8B3h5qt7vVlrTz6fXD+lTVQeA94LzOWJKkeXS8N7Mzolad+lz7HP5Dk01JJpJM7Nu3b1YTlSTNzlyD4p22nER7fbfVdwPLhtotBfa0+tIR9cP6JFkEnMNgqetIY/2BqnqgqlZV1aqxsbE5fiRJ0ihzDYrtwNRTSBuAJ4bq4+1JposZbFo/35an3k9yddt/uHFan6mxrgOeafsYTwFrkyxum9hrW02SNI8WzdQgyXeAzwLnJ9nN4Emku4BtSTYCbwHXA1TVziTbgFeBg8DNVXWoDXUTgyeozgKebAfAg8AjSSYZ3EmMt7H2J7kTeKG1u6Oqpm+qS5JOsBmDoqpuOMKl
NUdovxnYPKI+AVw+ov4BLWhGXNsCbJlpjpKkE8dvZkuSugwKSVKXQSFJ6jIoJEldBoUkqcugkCR1GRSSpC6DQpLUZVBIkroMCklSl0EhSeoyKCRJXQaFJKnLoJAkdRkUkqQug0KS1GVQSJK6DApJUpdBIUnqMigkSV0GhSSpy6CQJHUZFJKkLoNCktRlUEiSugwKSVKXQSFJ6jIoJEldBoUkqcugkCR1GRSSpC6DQpLUdUxBkeTNJC8neSnJRKudm2RHkl3tdfFQ+9uSTCZ5Pcm6ofqVbZzJJPcmSaufmeSxVn8uyYpjma8k6egdjzuKv6qqK6pqVXt/K/B0Va0Enm7vSXIpMA5cBqwH7ktyRutzP7AJWNmO9a2+EThQVZcA9wB3H4f5SpKOwolYeroG2NrOtwLXDtUfraoPq+oNYBJYneQi4OyqeraqCnh4Wp+psR4H1kzdbUiS5sexBkUB/5LkxSSbWu3CqtoL0F4vaPUlwNtDfXe32pJ2Pr1+WJ+qOgi8B5x3jHOWJB2FRcfY/zNVtSfJBcCOJP/ZaTvqTqA69V6fwwcehNQmgOXLl/dnLEk6Ksd0R1FVe9rru8D3gdXAO205ifb6bmu+G1g21H0psKfVl46oH9YnySLgHGD/iHk8UFWrqmrV2NjYsXwkSdI0cw6KJH+S5BNT58Ba4BVgO7ChNdsAPNHOtwPj7UmmixlsWj/flqfeT3J123+4cVqfqbGuA55p+xiSpHlyLEtPFwLfb3vLi4B/qqofJXkB2JZkI/AWcD1AVe1Msg14FTgI3FxVh9pYNwEPAWcBT7YD4EHgkSSTDO4kxo9hvpKkOZhzUFTVL4C/HFH/b2DNEfpsBjaPqE8Al4+of0ALGknSwvCb2ZKkLoNCktRlUEiSugwKSVKXQSFJ6jIoJEldBoUkqcugkCR1GRSSpC6DQpLUZVBIkroMCklSl0EhSeoyKCRJXQaFJKnLoJAkdRkUkqQug0KS1GVQSJK6DApJUpdBIUnqMigkSV0GhSSpy6CQJHUZFJKkLoNCktRlUEiSugwKSVKXQSFJ6jIoJEldBoUkqcugkCR1nRJBkWR9kteTTCa5daHnI0mnk5M+KJKcAfwj8NfApcANSS5d2FlJ0unjpA8KYDUwWVW/qKr/BR4FrlngOUnSaWPRQk9gFpYAbw+93w1cNdwgySZgU3v7P0len6e5nQ7OB3690JOYSe5e6BlogZz0v5+n0O/mnx7pwqkQFBlRq8PeVD0APDA/0zm9JJmoqlULPQ9pFH8/58epsPS0G1g29H4psGeB5iJJp51TISheAFYmuTjJHwPjwPYFnpMknTZO+qWnqjqY5BbgKeAMYEtV7VzgaZ1OXNLTyczfz3mQqpq5lSTptHUqLD1JkhaQQSFJ6jIoJEldJ/1mtuZXkj9n8M33JQy+r7IH2F5Vry3oxCQtGO8o9DtJ/p7Bn0gJ8DyDR5MDfMc/xqiTWZK/Xeg5fJT51JN+J8l/AZdV1f9Nq/8xsLOqVi7MzKS+JG9V1fKFnsdHlUtPGvZb4JPAL6fVL2rXpAWT5OdHugRcOJ9zOd0YFBr2FeDpJLv4/R9iXA5cAtyyUJOSmguBdcCBafUA/z7/0zl9GBT6nar6UZJPMfjT7ksY/Ae4G3ihqg4t6OQk+Gfg41X10vQLSf5t3mdzGnGPQpLU5VNPkqQug0KS1GVQSJK6DApJUpdBIUnq+n/InXmx1HDi4wAAAABJRU5ErkJggg==\n",
326 | "text/plain": [
327 | ""
328 | ]
329 | },
330 | "metadata": {
331 | "needs_background": "light"
332 | },
333 | "output_type": "display_data"
334 | }
335 | ],
336 | "source": [
337 | "# Distribution of duplicate and non-duplicate questions\n",
338 | "\n",
339 | "print(df['is_duplicate'].value_counts())\n",
340 | "print((df['is_duplicate'].value_counts()/df['is_duplicate'].count())*100)\n",
341 | "df['is_duplicate'].value_counts().plot(kind='bar')"
342 | ]
343 | },
344 | {
345 | "cell_type": "code",
346 | "execution_count": 19,
347 | "id": "788d2d08",
348 | "metadata": {},
349 | "outputs": [
350 | {
351 | "name": "stdout",
352 | "output_type": "stream",
353 | "text": [
354 | "Number of unique questions 537933\n",
355 | "Number of questions getting repeated 111780\n"
356 | ]
357 | }
358 | ],
359 | "source": [
360 | "# Repeated questions\n",
361 | "\n",
362 | "qid = pd.Series(df['qid1'].tolist() + df['qid2'].tolist())\n",
363 | "print('Number of unique questions',np.unique(qid).shape[0])\n",
364 | "x = qid.value_counts()>1\n",
365 | "print('Number of questions getting repeated',x[x].shape[0])"
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": 20,
371 | "id": "2fa5bb83",
372 | "metadata": {},
373 | "outputs": [
374 | {
375 | "data": {
376 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD4CAYAAAAXUaZHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAQoklEQVR4nO3df6zdd13H8efLzjtk6PixobM/bOGOxf6jwHX88EcmTGyBbopE10ACWtdgMuKPqHSZMeEvBxpjiJNZdc7A7NLMCS0rmYriMFlg3WTYUiplDHbZoJ2YGdFkTN7+cU7Zydm97bnnR8+5n/t8JE3v93PO+X5fve1593Pf38/5flNVSJLa8h3TDiBJGj+LuyQ1yOIuSQ2yuEtSgyzuktSg86YdAOCiiy6qzZs3TzuGJK0q999//+NVdfFSj021uCfZAeyYn5/n8OHD04wiSatOki8t99hU2zJVdbCqdl944YXTjCFJzZlqcU+yI8neJ554YpoxJKk5ztwlqUHO3CWpQc7cJalBrnOXpAbZlpGkBtmWkaQGrfq2zOY9d7F5z13TjiFJM8W2jCQ1yLaMJDVo1bdlJEnPZHGXpAZZ3CWpQZ5QlaQGeUJVkhpkW0aSGmRxl6QGWdwlqUEWd0lqkKtlJKlBrpaRpAbZlpGkBlncJalBFndJapDFXZIaZHGXpAaNvbgnuSLJJ5LcnOSKce9fknR2AxX3JLckOZnkSN/4tiTHk5xIsqc7XMB/A88CFscbV5I0iEFn7rcC23oHkqwDbgK2A1uBnUm2Ap+oqu3Au4B3jy+qJGlQAxX3qroH+Hrf8OXAiap6qKqeBG4Hrq6qb3Uf/0/g/OX2mWR3ksNJDp86dWqI6JKk5YzSc18PPNKzvQisT/KmJH8KfAD44+VeXFV7q2qhqhYuvvjiEWJIkvqdN8Jrs8RYVdWdwJ0D7SDZAeyYn58fIYYkqd8oM/dFYGPP9gbg0ZXswGvLSNJkjFLc7wMuTbIlyRxwDXBgJTvwqpCSNBmDLoXcB9wLXJZkMcmuqnoKuA64GzgG7K+qoys5uDN3SZqMgXruVbVzmfFDwKFhD27PXZImo5nruW/ecxeb99w1hlSStPp5JyZJalAzM3dJ0tO8KqQkNci2jCQ1yLaMJDXItowkNci2jCQ1yLaMJDXItowkNcjiLkkNsrhLUoM8oSpJDfKEqiQ1yLaMJDXI4i5JDbK4S1KDmivu3rRDklwtI0lNcrWMJDWoubaMJMniLklNsrhLUoMs7pLUIIu7JDVoIsU9yQVJ7k/yxknsX5J0ZgMV9yS3JDmZ5Ejf+LYkx5OcSLKn56F3AfvHGVSSNLhBZ+63Att6B5KsA24CtgNbgZ1Jtia5Evgs8LUx5pQkrcB5gzypqu5Jsrlv+HLgRFU9BJDkduBq4DnABXQK/v8mOVRV3+rfZ5LdwG6ATZs2Df0HkCQ900DFfRnrgUd6theBV1TVdQBJ3g48vlRhB6iqvcBegIWFhRohx5JOX1/m4RvfMO5dS9LMG6W4Z4mxbxfpqrr1rDtIdgA75ufnR4ghSeo3ymqZRWBjz/YG4NGV7MBry0jSZIxS3O8DLk2yJckccA1wYCU78KqQkjQZgy6F3AfcC1yWZDHJrqp6CrgOuBs4BuyvqqMrObgzd0majEFXy+xcZvwQcGjYg9tzl6TJ8HruktQg78QkSQ1y5i5JDfKqkJLUoObbMpv33PXtT6tK0lphW0aSGmRbRpIa1HxbRpLWItsyktQg2zKS1CCLuyQ1yOIuSQ3yhKokNWjNnFD1w0yS1hLbMpLUIIu7JDXI4i5JDbK4S1KDXC0jSQ1aM6tlTnPVjKS1wLaMJDXI4i5JDbK4S1KDLO6S1KA1W9w9qSqpZWMv7kl+MMnNSe5I8ivj3r8k6ewGKu5JbklyMsmRvvFtSY4nOZFkD0BVHauqdwA/DyyMP7Ik6WwGnbnfCmzrHUiyDrgJ2A5sBXYm2dp97Crg
X4CPjS2pJGlgAxX3qroH+Hrf8OXAiap6qKqeBG4Hru4+/0BVvRp4yzjDSpIGc94Ir10PPNKzvQi8IskVwJuA84FDy704yW5gN8CmTZtGiCFJ6jdKcc8SY1VVHwc+frYXV9XeJI8BO+bm5l4+Qo6hnV4x8/CNb5jG4SVpYkZZLbMIbOzZ3gA8upIdTOPaMpK0FoxS3O8DLk2yJckccA1wYCU78KqQkjQZgy6F3AfcC1yWZDHJrqp6CrgOuBs4BuyvqqMrObgzd0majIF67lW1c5nxQ5zhpOnZJNkB7Jifnx92F2Nh711Sa9bc9dwlaS3wTkyS1CBn7j28S5OkVqzZq0JKUstsy0hSg2zLLMH2jKTVzraMJDXI4i5JDbLnfga2ZyStVvbcJalBtmUkqUEW9wHYnpG02thzl6QG2XOXpAbZllkB2zOSVguLuyQ1yOIuSQ2yuEtSg1wtMwR775Jm3UD3UJ2UqjoIHFxYWLh2mjmG1V/gvQerpFlhW0aSGmRxl6QGTbUt05reNo0tGknT5MxdkhpkcZekBk2kuCf5mSR/luTDSV43iWNIkpY3cHFPckuSk0mO9I1vS3I8yYkkewCq6kNVdS3wduAXxppYknRWK5m53wps6x1Isg64CdgObAV2Jtna85Tf6T6+5vhBJ0nTNPBqmaq6J8nmvuHLgRNV9RBAktuBq5McA24EPlpVD4wr7GrkB50kTcOoPff1wCM924vdsXcCVwJvTvKOpV6YZHeSw0kOnzp1asQYkqReo65zzxJjVVXvA953phdW1d4kjwE75ubmXj5iDklSj1Fn7ovAxp7tDcCjg754Ld6JyV68pHNh1OJ+H3Bpki1J5oBrgAODvni1XhVSkmbdSpZC7gPuBS5LsphkV1U9BVwH3A0cA/ZX1dFB97kWZ+6nOYOXNEkrWS2zc5nxQ8ChYQ6eZAewY35+fpiXS5KWMdXLD6zlmXs/Z/KSxsk7MUlSg7wT05Q5W5c0CV4VUpIaZFtGkhrkCdUZ44lVSeNgW0aSGmRxl6QG2XNfhWzdSDobl0KuIhZ0SYOaanHXYCzqklbKnvuMsqBLGsVUZ+5eOOzMLPCShuU6d0lqkG0ZSWqQxV2SGmRxl6QGWdwlqUF+QnUVG+WTqn7KVWqbq2UkqUG2ZRrgLFxSP4u7JDXI4i5JDbK4r3G2dKQ2WdwlqUEWd0lq0NivCpnkRcANwIVV9eZx719n199mefjGN0wpiaRpGWjmnuSWJCeTHOkb35bkeJITSfYAVNVDVbVrEmF1ZvbPJZ02aFvmVmBb70CSdcBNwHZgK7AzydaxppMkDWWg4l5V9wBf7xu+HDjRnak/CdwOXD3ogZPsTnI4yeFTp04NHFiT4axfassoJ1TXA4/0bC8C65O8IMnNwEuTXL/ci6tqL/Bu4IG5ubkRYuhc8z8CafaNUtyzxFhV1X9U1Tuq6sVV9Xtn2oHXlpGkyRhltcwisLFnewPw6Ep24D1Uz72zzbiXe9wVN9LqMsrM/T7g0iRbkswB1wAHVrIDZ+6SNBmDLoXcB9wLXJZkMcmuqnoKuA64GzgG7K+qoys5uNdzPzcm3R+3By/NnoHaMlW1c5nxQ8ChYQ9eVQeBgwsLC9cOuw9J0jN5+QFJapC32ZOkBnmbPUlq0NgvHLYSLoVcPU6fMO1dEulJVGl2OXOXpAZ5QlWSGmRxl6QG2XPXxHjTEGl67LlLUoNsy0hSgyzuktQgP6GqFTnT2nYvICbNDnvuktQg2zKS1CCLuyQ1yOIuSQ2yuEtSg1wtI0kNcrWMJDXItowkNcjiLkkNsrhLUoMs7pLUIIu7JDXI4i5JDRr7nZiSXAD8CfAk8PGqum3cx5AkndlAM/cktyQ5meRI3/i2JMeTnEiypzv8JuCOqroWuGrMeSVJAxi0LXMrsK13IMk64CZgO7AV2JlkK7ABeKT7tP8bT0xJ0koM1JapqnuSbO4bvhw4UVUPASS5HbgaWKRT4D/NGf7zSLIb2A2w
adOmlebWDFvuhh1LjfffNPv0c06P92+v9HkrsdQ+xrHf0/tZLTcIH9efedz7as2kvzejnFBdz9MzdOgU9fXAncDPJXk/cHC5F1fV3qpaqKqFiy++eIQYkqR+o5xQzRJjVVXfAH5xoB0kO4Ad8/PzI8SQJPUbZea+CGzs2d4APDpaHEnSOIxS3O8DLk2yJckccA1wYCU78KqQkjQZgy6F3AfcC1yWZDHJrqp6CrgOuBs4BuyvqqMrObjXc5ekyRh0tczOZcYPAYeGPXhVHQQOLiwsXDvsPiRJz+TlBySpQd5mT5Ia5G32JKlBqappZyDJKeBLK3zZRcDjE4gzDmYbjtmGY7bhtJDtB6pqyU+BzkRxH0aSw1W1MO0cSzHbcMw2HLMNp/VsnlCVpAZZ3CWpQau5uO+ddoAzMNtwzDYcsw2n6WyrtucuSVreap65S5KWYXGXpAatyuK+zL1bp5VlY5J/SnIsydEkv9odf36Sv0/y+e7vz5tSvnVJ/jXJR2YpVzfLc5PckeRz3e/fq2YlX5Jf7/59HkmyL8mzppVtqXsYnylLkuu7743jSX56Ctl+v/t3+pkkf5vkubOSreex30xSSS6apWxJ3tk9/tEk7x0pW1Wtql/AOuALwIuAOeBBYOsU81wCvKz79XcD/07nnrLvBfZ0x/cA75lSvt8A/hr4SHd7JnJ1j/9XwC93v54DnjsL+ejcUeyLwHd1t/cDb59WNuAngJcBR3rGlszS/bf3IHA+sKX7Xll3jrO9Djiv+/V7Zilbd3wjnavZfgm4aFayAT8J/ANwfnf7haNkO6dvmjF9U14F3N2zfT1w/bRz9eT5MPBTwHHgku7YJcDxKWTZAHwMeE1PcZ96ru6xv6dbQNM3PvV8PH0LyefTuXLqR7oFa2rZgM19hWDJLP3vh24Re9W5zNb32M8Ct81SNuAO4IeAh3uK+9Sz0ZlEXLnE84bKthrbMsvdu3XqujcRfynwSeB7q+oxgO7vL5xCpD8Cfhv4Vs/YLOSCzk9ep4C/7LaN/jzJBbOQr6q+AvwB8GXgMeCJqvq7WcjWY7kss/b++CXgo92vp54tyVXAV6rqwb6Hpp4NeAnw40k+meSfk/zIKNlWY3Ff8t6t5zxFnyTPAf4G+LWq+q8ZyPNG4GRV3T/tLMs4j86Ppe+vqpcC36DTXpi6bv/6ajo/An8/cEGSt0431cBm5v2R5AbgKeC200NLPO2cZUvybOAG4HeXeniJsXP9fTsPeB7wSuC3gP1JwpDZVmNxn7l7tyb5TjqF/baqurM7/LUkl3QfvwQ4eY5j/ShwVZKHgduB1yT54AzkOm0RWKyqT3a376BT7Gch35XAF6vqVFV9E7gTePWMZDttuSwz8f5I8jbgjcBbqttLmIFsL6bzH/aD3ffFBuCBJN83A9noZrizOj5F5yfui4bNthqL+8j3bh2n7v+sfwEcq6o/7HnoAPC27tdvo9OLP2eq6vqq2lBVm+l8j/6xqt467Vw9+b4KPJLksu7Qa4HPMhv5vgy8Msmzu3+/r6VzK8lZyHbaclkOANckOT/JFuBS4FPnMliSbcC7gKuq6n96Hppqtqr6t6p6YVVt7r4vFukshvjqtLN1fYjO+TGSvITOIoPHh842yRMGEzwR8Xo6q1K+ANww5Sw/RudHpM8An+7+ej3wAjonMz/f/f35U8x4BU+fUJ2lXD8MHO5+7z5E50fSmcgHvBv4HHAE+ACdlQpTyQbso9P7/yadgrTrTFnotB6+QOek6/YpZDtBp0d8+v1w86xk63v8YbonVGchG51i/sHuv7kHgNeMks3LD0hSg1ZjW0aSdBYWd0lqkMVdkhpkcZekBlncJalBFndJapDFXZIa9P9yyn9QIsa7pwAAAABJRU5ErkJggg==\n",
377 | "text/plain": [
378 | ""
379 | ]
380 | },
381 | "metadata": {
382 | "needs_background": "light"
383 | },
384 | "output_type": "display_data"
385 | }
386 | ],
387 | "source": [
388 | "# Repeated questions histogram\n",
389 | "\n",
390 | "plt.hist(qid.value_counts().values,bins=160)\n",
391 | "plt.yscale('log')\n",
392 | "plt.show()"
393 | ]
394 | },
395 | {
396 | "cell_type": "code",
397 | "execution_count": null,
398 | "id": "f9573e2f",
399 | "metadata": {},
400 | "outputs": [],
401 | "source": []
402 | }
403 | ],
404 | "metadata": {
405 | "kernelspec": {
406 | "display_name": "Python 3",
407 | "language": "python",
408 | "name": "python3"
409 | },
410 | "language_info": {
411 | "codemirror_mode": {
412 | "name": "ipython",
413 | "version": 3
414 | },
415 | "file_extension": ".py",
416 | "mimetype": "text/x-python",
417 | "name": "python",
418 | "nbconvert_exporter": "python",
419 | "pygments_lexer": "ipython3",
420 | "version": "3.8.8"
421 | }
422 | },
423 | "nbformat": 4,
424 | "nbformat_minor": 5
425 | }
426 |
--------------------------------------------------------------------------------
/only-bow.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "b478deb3",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import numpy as np\n",
11 | "import pandas as pd\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "import seaborn as sns"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 2,
19 | "id": "217d407d",
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "df = pd.read_csv('train.csv')"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 3,
29 | "id": "0cb99da3",
30 | "metadata": {},
31 | "outputs": [
32 | {
33 | "data": {
34 | "text/plain": [
35 | "(404290, 6)"
36 | ]
37 | },
38 | "execution_count": 3,
39 | "metadata": {},
40 | "output_type": "execute_result"
41 | }
42 | ],
43 | "source": [
44 | "df.shape"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 4,
50 | "id": "e5e7ce9a",
51 | "metadata": {},
52 | "outputs": [
53 | {
54 | "data": {
55 | "text/html": [
56 | "\n",
57 | "\n",
70 | "
\n",
71 | " \n",
72 | " \n",
73 | " | \n",
74 | " id | \n",
75 | " qid1 | \n",
76 | " qid2 | \n",
77 | " question1 | \n",
78 | " question2 | \n",
79 | " is_duplicate | \n",
80 | "
\n",
81 | " \n",
82 | " \n",
83 | " \n",
84 | " 0 | \n",
85 | " 0 | \n",
86 | " 1 | \n",
87 | " 2 | \n",
88 | " What is the step by step guide to invest in sh... | \n",
89 | " What is the step by step guide to invest in sh... | \n",
90 | " 0 | \n",
91 | "
\n",
92 | " \n",
93 | " 1 | \n",
94 | " 1 | \n",
95 | " 3 | \n",
96 | " 4 | \n",
97 | " What is the story of Kohinoor (Koh-i-Noor) Dia... | \n",
98 | " What would happen if the Indian government sto... | \n",
99 | " 0 | \n",
100 | "
\n",
101 | " \n",
102 | " 2 | \n",
103 | " 2 | \n",
104 | " 5 | \n",
105 | " 6 | \n",
106 | " How can I increase the speed of my internet co... | \n",
107 | " How can Internet speed be increased by hacking... | \n",
108 | " 0 | \n",
109 | "
\n",
110 | " \n",
111 | " 3 | \n",
112 | " 3 | \n",
113 | " 7 | \n",
114 | " 8 | \n",
115 | " Why am I mentally very lonely? How can I solve... | \n",
116 | " Find the remainder when [math]23^{24}[/math] i... | \n",
117 | " 0 | \n",
118 | "
\n",
119 | " \n",
120 | " 4 | \n",
121 | " 4 | \n",
122 | " 9 | \n",
123 | " 10 | \n",
124 | " Which one dissolve in water quikly sugar, salt... | \n",
125 | " Which fish would survive in salt water? | \n",
126 | " 0 | \n",
127 | "
\n",
128 | " \n",
129 | "
\n",
130 | "
"
131 | ],
132 | "text/plain": [
133 | " id qid1 qid2 question1 \\\n",
134 | "0 0 1 2 What is the step by step guide to invest in sh... \n",
135 | "1 1 3 4 What is the story of Kohinoor (Koh-i-Noor) Dia... \n",
136 | "2 2 5 6 How can I increase the speed of my internet co... \n",
137 | "3 3 7 8 Why am I mentally very lonely? How can I solve... \n",
138 | "4 4 9 10 Which one dissolve in water quikly sugar, salt... \n",
139 | "\n",
140 | " question2 is_duplicate \n",
141 | "0 What is the step by step guide to invest in sh... 0 \n",
142 | "1 What would happen if the Indian government sto... 0 \n",
143 | "2 How can Internet speed be increased by hacking... 0 \n",
144 | "3 Find the remainder when [math]23^{24}[/math] i... 0 \n",
145 | "4 Which fish would survive in salt water? 0 "
146 | ]
147 | },
148 | "execution_count": 4,
149 | "metadata": {},
150 | "output_type": "execute_result"
151 | }
152 | ],
153 | "source": [
154 | "df.head()"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": 5,
160 | "id": "94b6e88e",
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "new_df = df.sample(30000)"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 6,
170 | "id": "5074efd7",
171 | "metadata": {},
172 | "outputs": [
173 | {
174 | "data": {
175 | "text/plain": [
176 | "id 0\n",
177 | "qid1 0\n",
178 | "qid2 0\n",
179 | "question1 0\n",
180 | "question2 0\n",
181 | "is_duplicate 0\n",
182 | "dtype: int64"
183 | ]
184 | },
185 | "execution_count": 6,
186 | "metadata": {},
187 | "output_type": "execute_result"
188 | }
189 | ],
190 | "source": [
191 | "new_df.isnull().sum()"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": 7,
197 | "id": "e2763e28",
198 | "metadata": {},
199 | "outputs": [
200 | {
201 | "data": {
202 | "text/plain": [
203 | "0"
204 | ]
205 | },
206 | "execution_count": 7,
207 | "metadata": {},
208 | "output_type": "execute_result"
209 | }
210 | ],
211 | "source": [
212 | "new_df.duplicated().sum()"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": 8,
218 | "id": "a1e18aeb",
219 | "metadata": {},
220 | "outputs": [
221 | {
222 | "data": {
223 | "text/html": [
224 | "\n",
225 | "\n",
238 | "
\n",
239 | " \n",
240 | " \n",
241 | " | \n",
242 | " question1 | \n",
243 | " question2 | \n",
244 | "
\n",
245 | " \n",
246 | " \n",
247 | " \n",
248 | " 394861 | \n",
249 | " What is Mark Zuckerberg's degree? | \n",
250 | " Is there any reservation for NT in upsc? | \n",
251 | "
\n",
252 | " \n",
253 | " 266389 | \n",
254 | " Why is Saltwater Taffy candy imported in Poland? | \n",
255 | " Why is Saltwater Taffy candy imported in Portu... | \n",
256 | "
\n",
257 | " \n",
258 | " 32076 | \n",
259 | " Who are the best people on the Internet to fol... | \n",
260 | " Who are the best people on the Internet to fol... | \n",
261 | "
\n",
262 | " \n",
263 | " 150593 | \n",
264 | " I have been with my partner for 8 years Last y... | \n",
265 | " I split up with my wife two years ago. Officia... | \n",
266 | "
\n",
267 | " \n",
268 | " 128073 | \n",
269 | " Is it legal to earn income from online freelan... | \n",
270 | " Can 4-5 employers file a petition for your H1-... | \n",
271 | "
\n",
272 | " \n",
273 | "
\n",
274 | "
"
275 | ],
276 | "text/plain": [
277 | " question1 \\\n",
278 | "394861 What is Mark Zuckerberg's degree? \n",
279 | "266389 Why is Saltwater Taffy candy imported in Poland? \n",
280 | "32076 Who are the best people on the Internet to fol... \n",
281 | "150593 I have been with my partner for 8 years Last y... \n",
282 | "128073 Is it legal to earn income from online freelan... \n",
283 | "\n",
284 | " question2 \n",
285 | "394861 Is there any reservation for NT in upsc? \n",
286 | "266389 Why is Saltwater Taffy candy imported in Portu... \n",
287 | "32076 Who are the best people on the Internet to fol... \n",
288 | "150593 I split up with my wife two years ago. Officia... \n",
289 | "128073 Can 4-5 employers file a petition for your H1-... "
290 | ]
291 | },
292 | "execution_count": 8,
293 | "metadata": {},
294 | "output_type": "execute_result"
295 | }
296 | ],
297 | "source": [
298 | "ques_df = new_df[['question1','question2']]\n",
299 | "ques_df.head()"
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": 9,
305 | "id": "dec56417",
306 | "metadata": {},
307 | "outputs": [],
308 | "source": [
309 | "from sklearn.feature_extraction.text import CountVectorizer\n",
310 | "# merge texts\n",
311 | "questions = list(ques_df['question1']) + list(ques_df['question2'])\n",
312 | "\n",
313 | "cv = CountVectorizer(max_features=3000)\n",
314 | "q1_arr, q2_arr = np.vsplit(cv.fit_transform(questions).toarray(),2)"
315 | ]
316 | },
317 | {
318 | "cell_type": "code",
319 | "execution_count": 10,
320 | "id": "88026075",
321 | "metadata": {},
322 | "outputs": [
323 | {
324 | "data": {
325 | "text/plain": [
326 | "(30000, 6000)"
327 | ]
328 | },
329 | "execution_count": 10,
330 | "metadata": {},
331 | "output_type": "execute_result"
332 | }
333 | ],
334 | "source": [
335 | "temp_df1 = pd.DataFrame(q1_arr, index= ques_df.index)\n",
336 | "temp_df2 = pd.DataFrame(q2_arr, index= ques_df.index)\n",
337 | "temp_df = pd.concat([temp_df1, temp_df2], axis=1)\n",
338 | "temp_df.shape"
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": 15,
344 | "id": "2f202654",
345 | "metadata": {},
346 | "outputs": [
347 | {
348 | "data": {
349 | "text/html": [
350 | "\n",
351 | "\n",
364 | "
\n",
365 | " \n",
366 | " \n",
367 | " | \n",
368 | " 0 | \n",
369 | " 1 | \n",
370 | " 2 | \n",
371 | " 3 | \n",
372 | " 4 | \n",
373 | " 5 | \n",
374 | " 6 | \n",
375 | " 7 | \n",
376 | " 8 | \n",
377 | " 9 | \n",
378 | " ... | \n",
379 | " 2991 | \n",
380 | " 2992 | \n",
381 | " 2993 | \n",
382 | " 2994 | \n",
383 | " 2995 | \n",
384 | " 2996 | \n",
385 | " 2997 | \n",
386 | " 2998 | \n",
387 | " 2999 | \n",
388 | " is_duplicate | \n",
389 | "
\n",
390 | " \n",
391 | " \n",
392 | " \n",
393 | " 394861 | \n",
394 | " 0 | \n",
395 | " 0 | \n",
396 | " 0 | \n",
397 | " 0 | \n",
398 | " 0 | \n",
399 | " 0 | \n",
400 | " 0 | \n",
401 | " 0 | \n",
402 | " 0 | \n",
403 | " 0 | \n",
404 | " ... | \n",
405 | " 0 | \n",
406 | " 0 | \n",
407 | " 0 | \n",
408 | " 0 | \n",
409 | " 0 | \n",
410 | " 0 | \n",
411 | " 0 | \n",
412 | " 0 | \n",
413 | " 0 | \n",
414 | " 0 | \n",
415 | "
\n",
416 | " \n",
417 | " 266389 | \n",
418 | " 0 | \n",
419 | " 0 | \n",
420 | " 0 | \n",
421 | " 0 | \n",
422 | " 0 | \n",
423 | " 0 | \n",
424 | " 0 | \n",
425 | " 0 | \n",
426 | " 0 | \n",
427 | " 0 | \n",
428 | " ... | \n",
429 | " 0 | \n",
430 | " 0 | \n",
431 | " 0 | \n",
432 | " 0 | \n",
433 | " 0 | \n",
434 | " 0 | \n",
435 | " 0 | \n",
436 | " 0 | \n",
437 | " 0 | \n",
438 | " 1 | \n",
439 | "
\n",
440 | " \n",
441 | " 32076 | \n",
442 | " 0 | \n",
443 | " 0 | \n",
444 | " 0 | \n",
445 | " 0 | \n",
446 | " 0 | \n",
447 | " 0 | \n",
448 | " 0 | \n",
449 | " 0 | \n",
450 | " 0 | \n",
451 | " 0 | \n",
452 | " ... | \n",
453 | " 0 | \n",
454 | " 0 | \n",
455 | " 0 | \n",
456 | " 0 | \n",
457 | " 0 | \n",
458 | " 0 | \n",
459 | " 0 | \n",
460 | " 0 | \n",
461 | " 0 | \n",
462 | " 0 | \n",
463 | "
\n",
464 | " \n",
465 | " 150593 | \n",
466 | " 0 | \n",
467 | " 0 | \n",
468 | " 0 | \n",
469 | " 0 | \n",
470 | " 0 | \n",
471 | " 0 | \n",
472 | " 0 | \n",
473 | " 0 | \n",
474 | " 0 | \n",
475 | " 0 | \n",
476 | " ... | \n",
477 | " 0 | \n",
478 | " 0 | \n",
479 | " 0 | \n",
480 | " 0 | \n",
481 | " 0 | \n",
482 | " 0 | \n",
483 | " 0 | \n",
484 | " 0 | \n",
485 | " 0 | \n",
486 | " 0 | \n",
487 | "
\n",
488 | " \n",
489 | " 128073 | \n",
490 | " 0 | \n",
491 | " 0 | \n",
492 | " 0 | \n",
493 | " 0 | \n",
494 | " 0 | \n",
495 | " 0 | \n",
496 | " 0 | \n",
497 | " 0 | \n",
498 | " 0 | \n",
499 | " 0 | \n",
500 | " ... | \n",
501 | " 0 | \n",
502 | " 0 | \n",
503 | " 1 | \n",
504 | " 0 | \n",
505 | " 0 | \n",
506 | " 0 | \n",
507 | " 0 | \n",
508 | " 0 | \n",
509 | " 0 | \n",
510 | " 0 | \n",
511 | "
\n",
512 | " \n",
513 | " ... | \n",
514 | " ... | \n",
515 | " ... | \n",
516 | " ... | \n",
517 | " ... | \n",
518 | " ... | \n",
519 | " ... | \n",
520 | " ... | \n",
521 | " ... | \n",
522 | " ... | \n",
523 | " ... | \n",
524 | " ... | \n",
525 | " ... | \n",
526 | " ... | \n",
527 | " ... | \n",
528 | " ... | \n",
529 | " ... | \n",
530 | " ... | \n",
531 | " ... | \n",
532 | " ... | \n",
533 | " ... | \n",
534 | " ... | \n",
535 | "
\n",
536 | " \n",
537 | " 350070 | \n",
538 | " 0 | \n",
539 | " 0 | \n",
540 | " 0 | \n",
541 | " 0 | \n",
542 | " 0 | \n",
543 | " 0 | \n",
544 | " 0 | \n",
545 | " 0 | \n",
546 | " 0 | \n",
547 | " 0 | \n",
548 | " ... | \n",
549 | " 0 | \n",
550 | " 0 | \n",
551 | " 0 | \n",
552 | " 0 | \n",
553 | " 0 | \n",
554 | " 0 | \n",
555 | " 0 | \n",
556 | " 0 | \n",
557 | " 0 | \n",
558 | " 0 | \n",
559 | "
\n",
560 | " \n",
561 | " 301346 | \n",
562 | " 0 | \n",
563 | " 0 | \n",
564 | " 0 | \n",
565 | " 0 | \n",
566 | " 0 | \n",
567 | " 0 | \n",
568 | " 0 | \n",
569 | " 0 | \n",
570 | " 0 | \n",
571 | " 0 | \n",
572 | " ... | \n",
573 | " 0 | \n",
574 | " 0 | \n",
575 | " 0 | \n",
576 | " 0 | \n",
577 | " 0 | \n",
578 | " 0 | \n",
579 | " 0 | \n",
580 | " 0 | \n",
581 | " 0 | \n",
582 | " 0 | \n",
583 | "
\n",
584 | " \n",
585 | " 79932 | \n",
586 | " 0 | \n",
587 | " 0 | \n",
588 | " 0 | \n",
589 | " 0 | \n",
590 | " 0 | \n",
591 | " 0 | \n",
592 | " 0 | \n",
593 | " 0 | \n",
594 | " 0 | \n",
595 | " 0 | \n",
596 | " ... | \n",
597 | " 0 | \n",
598 | " 0 | \n",
599 | " 0 | \n",
600 | " 0 | \n",
601 | " 0 | \n",
602 | " 0 | \n",
603 | " 0 | \n",
604 | " 0 | \n",
605 | " 0 | \n",
606 | " 0 | \n",
607 | "
\n",
608 | " \n",
609 | " 74788 | \n",
610 | " 0 | \n",
611 | " 0 | \n",
612 | " 0 | \n",
613 | " 0 | \n",
614 | " 0 | \n",
615 | " 0 | \n",
616 | " 0 | \n",
617 | " 0 | \n",
618 | " 0 | \n",
619 | " 0 | \n",
620 | " ... | \n",
621 | " 0 | \n",
622 | " 0 | \n",
623 | " 0 | \n",
624 | " 0 | \n",
625 | " 0 | \n",
626 | " 0 | \n",
627 | " 0 | \n",
628 | " 0 | \n",
629 | " 0 | \n",
630 | " 0 | \n",
631 | "
\n",
632 | " \n",
633 | " 61770 | \n",
634 | " 0 | \n",
635 | " 0 | \n",
636 | " 0 | \n",
637 | " 0 | \n",
638 | " 0 | \n",
639 | " 0 | \n",
640 | " 0 | \n",
641 | " 0 | \n",
642 | " 0 | \n",
643 | " 0 | \n",
644 | " ... | \n",
645 | " 0 | \n",
646 | " 0 | \n",
647 | " 0 | \n",
648 | " 0 | \n",
649 | " 0 | \n",
650 | " 0 | \n",
651 | " 0 | \n",
652 | " 0 | \n",
653 | " 0 | \n",
654 | " 0 | \n",
655 | "
\n",
656 | " \n",
657 | "
\n",
658 | "
30000 rows × 6001 columns
\n",
659 | "
"
660 | ],
661 | "text/plain": [
662 | " 0 1 2 3 4 5 6 7 8 9 ... 2991 2992 2993 2994 2995 2996 \\\n",
663 | "394861 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n",
664 | "266389 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n",
665 | "32076 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n",
666 | "150593 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n",
667 | "128073 0 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 \n",
668 | "... .. .. .. .. .. .. .. .. .. .. ... ... ... ... ... ... ... \n",
669 | "350070 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n",
670 | "301346 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n",
671 | "79932 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n",
672 | "74788 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n",
673 | "61770 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n",
674 | "\n",
675 | " 2997 2998 2999 is_duplicate \n",
676 | "394861 0 0 0 0 \n",
677 | "266389 0 0 0 1 \n",
678 | "32076 0 0 0 0 \n",
679 | "150593 0 0 0 0 \n",
680 | "128073 0 0 0 0 \n",
681 | "... ... ... ... ... \n",
682 | "350070 0 0 0 0 \n",
683 | "301346 0 0 0 0 \n",
684 | "79932 0 0 0 0 \n",
685 | "74788 0 0 0 0 \n",
686 | "61770 0 0 0 0 \n",
687 | "\n",
688 | "[30000 rows x 6001 columns]"
689 | ]
690 | },
691 | "execution_count": 15,
692 | "metadata": {},
693 | "output_type": "execute_result"
694 | }
695 | ],
696 | "source": [
697 | "temp_df"
698 | ]
699 | },
700 | {
701 | "cell_type": "code",
702 | "execution_count": 11,
703 | "id": "b634b449",
704 | "metadata": {},
705 | "outputs": [],
706 | "source": [
707 | "temp_df['is_duplicate'] = new_df['is_duplicate']"
708 | ]
709 | },
710 | {
711 | "cell_type": "code",
712 | "execution_count": 12,
713 | "id": "96f3125e",
714 | "metadata": {},
715 | "outputs": [
716 | {
717 | "data": {
718 | "text/html": [
719 | "\n",
720 | "\n",
733 | "
\n",
734 | " \n",
735 | " \n",
736 | " | \n",
737 | " 0 | \n",
738 | " 1 | \n",
739 | " 2 | \n",
740 | " 3 | \n",
741 | " 4 | \n",
742 | " 5 | \n",
743 | " 6 | \n",
744 | " 7 | \n",
745 | " 8 | \n",
746 | " 9 | \n",
747 | " ... | \n",
748 | " 2991 | \n",
749 | " 2992 | \n",
750 | " 2993 | \n",
751 | " 2994 | \n",
752 | " 2995 | \n",
753 | " 2996 | \n",
754 | " 2997 | \n",
755 | " 2998 | \n",
756 | " 2999 | \n",
757 | " is_duplicate | \n",
758 | "
\n",
759 | " \n",
760 | " \n",
761 | " \n",
762 | " 394861 | \n",
763 | " 0 | \n",
764 | " 0 | \n",
765 | " 0 | \n",
766 | " 0 | \n",
767 | " 0 | \n",
768 | " 0 | \n",
769 | " 0 | \n",
770 | " 0 | \n",
771 | " 0 | \n",
772 | " 0 | \n",
773 | " ... | \n",
774 | " 0 | \n",
775 | " 0 | \n",
776 | " 0 | \n",
777 | " 0 | \n",
778 | " 0 | \n",
779 | " 0 | \n",
780 | " 0 | \n",
781 | " 0 | \n",
782 | " 0 | \n",
783 | " 0 | \n",
784 | "
\n",
785 | " \n",
786 | " 266389 | \n",
787 | " 0 | \n",
788 | " 0 | \n",
789 | " 0 | \n",
790 | " 0 | \n",
791 | " 0 | \n",
792 | " 0 | \n",
793 | " 0 | \n",
794 | " 0 | \n",
795 | " 0 | \n",
796 | " 0 | \n",
797 | " ... | \n",
798 | " 0 | \n",
799 | " 0 | \n",
800 | " 0 | \n",
801 | " 0 | \n",
802 | " 0 | \n",
803 | " 0 | \n",
804 | " 0 | \n",
805 | " 0 | \n",
806 | " 0 | \n",
807 | " 1 | \n",
808 | "
\n",
809 | " \n",
810 | " 32076 | \n",
811 | " 0 | \n",
812 | " 0 | \n",
813 | " 0 | \n",
814 | " 0 | \n",
815 | " 0 | \n",
816 | " 0 | \n",
817 | " 0 | \n",
818 | " 0 | \n",
819 | " 0 | \n",
820 | " 0 | \n",
821 | " ... | \n",
822 | " 0 | \n",
823 | " 0 | \n",
824 | " 0 | \n",
825 | " 0 | \n",
826 | " 0 | \n",
827 | " 0 | \n",
828 | " 0 | \n",
829 | " 0 | \n",
830 | " 0 | \n",
831 | " 0 | \n",
832 | "
\n",
833 | " \n",
834 | " 150593 | \n",
835 | " 0 | \n",
836 | " 0 | \n",
837 | " 0 | \n",
838 | " 0 | \n",
839 | " 0 | \n",
840 | " 0 | \n",
841 | " 0 | \n",
842 | " 0 | \n",
843 | " 0 | \n",
844 | " 0 | \n",
845 | " ... | \n",
846 | " 0 | \n",
847 | " 0 | \n",
848 | " 0 | \n",
849 | " 0 | \n",
850 | " 0 | \n",
851 | " 0 | \n",
852 | " 0 | \n",
853 | " 0 | \n",
854 | " 0 | \n",
855 | " 0 | \n",
856 | "
\n",
857 | " \n",
858 | " 128073 | \n",
859 | " 0 | \n",
860 | " 0 | \n",
861 | " 0 | \n",
862 | " 0 | \n",
863 | " 0 | \n",
864 | " 0 | \n",
865 | " 0 | \n",
866 | " 0 | \n",
867 | " 0 | \n",
868 | " 0 | \n",
869 | " ... | \n",
870 | " 0 | \n",
871 | " 0 | \n",
872 | " 1 | \n",
873 | " 0 | \n",
874 | " 0 | \n",
875 | " 0 | \n",
876 | " 0 | \n",
877 | " 0 | \n",
878 | " 0 | \n",
879 | " 0 | \n",
880 | "
\n",
881 | " \n",
882 | "
\n",
883 | "
5 rows × 6001 columns
\n",
884 | "
"
885 | ],
886 | "text/plain": [
887 | " 0 1 2 3 4 5 6 7 8 9 ... 2991 2992 2993 2994 2995 2996 \\\n",
888 | "394861 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n",
889 | "266389 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n",
890 | "32076 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n",
891 | "150593 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 \n",
892 | "128073 0 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 \n",
893 | "\n",
894 | " 2997 2998 2999 is_duplicate \n",
895 | "394861 0 0 0 0 \n",
896 | "266389 0 0 0 1 \n",
897 | "32076 0 0 0 0 \n",
898 | "150593 0 0 0 0 \n",
899 | "128073 0 0 0 0 \n",
900 | "\n",
901 | "[5 rows x 6001 columns]"
902 | ]
903 | },
904 | "execution_count": 12,
905 | "metadata": {},
906 | "output_type": "execute_result"
907 | }
908 | ],
909 | "source": [
910 | "temp_df.head()"
911 | ]
912 | },
913 | {
914 | "cell_type": "code",
915 | "execution_count": 13,
916 | "id": "7dec87f7",
917 | "metadata": {},
918 | "outputs": [],
919 | "source": [
920 | "from sklearn.model_selection import train_test_split\n",
921 | "X_train,X_test,y_train,y_test = train_test_split(temp_df.iloc[:,0:-1].values,temp_df.iloc[:,-1].values,test_size=0.2,random_state=1)"
922 | ]
923 | },
924 | {
925 | "cell_type": "code",
926 | "execution_count": 14,
927 | "id": "92d4785f",
928 | "metadata": {},
929 | "outputs": [
930 | {
931 | "data": {
932 | "text/plain": [
933 | "0.742"
934 | ]
935 | },
936 | "execution_count": 14,
937 | "metadata": {},
938 | "output_type": "execute_result"
939 | }
940 | ],
941 | "source": [
942 | "from sklearn.ensemble import RandomForestClassifier\n",
943 | "from sklearn.metrics import accuracy_score\n",
944 | "rf = RandomForestClassifier()\n",
945 | "rf.fit(X_train,y_train)\n",
946 | "y_pred = rf.predict(X_test)\n",
947 | "accuracy_score(y_test,y_pred)"
948 | ]
949 | },
950 | {
951 | "cell_type": "code",
952 | "execution_count": 15,
953 | "id": "a69c5c5a",
954 | "metadata": {},
955 | "outputs": [
956 | {
957 | "name": "stderr",
958 | "output_type": "stream",
959 | "text": [
960 | "C:\\Users\\91842\\anaconda3\\lib\\site-packages\\xgboost\\sklearn.py:1224: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].\n",
961 | " warnings.warn(label_encoder_deprecation_msg, UserWarning)\n"
962 | ]
963 | },
964 | {
965 | "name": "stdout",
966 | "output_type": "stream",
967 | "text": [
968 | "[14:13:04] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.0/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.\n"
969 | ]
970 | },
971 | {
972 | "data": {
973 | "text/plain": [
974 | "0.7328333333333333"
975 | ]
976 | },
977 | "execution_count": 15,
978 | "metadata": {},
979 | "output_type": "execute_result"
980 | }
981 | ],
982 | "source": [
983 | "from xgboost import XGBClassifier\n",
984 | "xgb = XGBClassifier()\n",
985 | "xgb.fit(X_train,y_train)\n",
986 | "y_pred = xgb.predict(X_test)\n",
987 | "accuracy_score(y_test,y_pred)"
988 | ]
989 | },
990 | {
991 | "cell_type": "code",
992 | "execution_count": null,
993 | "id": "9c576f36",
994 | "metadata": {},
995 | "outputs": [],
996 | "source": []
997 | }
998 | ],
999 | "metadata": {
1000 | "kernelspec": {
1001 | "display_name": "Python 3",
1002 | "language": "python",
1003 | "name": "python3"
1004 | },
1005 | "language_info": {
1006 | "codemirror_mode": {
1007 | "name": "ipython",
1008 | "version": 3
1009 | },
1010 | "file_extension": ".py",
1011 | "mimetype": "text/x-python",
1012 | "name": "python",
1013 | "nbconvert_exporter": "python",
1014 | "pygments_lexer": "ipython3",
1015 | "version": "3.8.8"
1016 | }
1017 | },
1018 | "nbformat": 4,
1019 | "nbformat_minor": 5
1020 | }
1021 |
--------------------------------------------------------------------------------
/streamlit-app/Procfile:
--------------------------------------------------------------------------------
1 | web: sh setup.sh && streamlit run app.py
--------------------------------------------------------------------------------
/streamlit-app/app.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import helper
3 | import pickle
4 |
# Load the pickled classifier. The context manager closes the file handle
# as soon as unpickling finishes instead of leaving it to the garbage
# collector.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

st.header('Duplicate Question Pairs')

q1 = st.text_input('Enter question 1')
q2 = st.text_input('Enter question 2')

if st.button('Find'):
    # Turn the two raw questions into the single feature row the model expects.
    query = helper.query_point_creator(q1, q2)
    result = model.predict(query)[0]

    # A truthy prediction is shown as "Duplicate", anything else as "Not Duplicate".
    if result:
        st.header('Duplicate')
    else:
        st.header('Not Duplicate')
20 |
21 |
22 |
--------------------------------------------------------------------------------
/streamlit-app/helper.py:
--------------------------------------------------------------------------------
1 | import re
2 | from bs4 import BeautifulSoup
3 | import distance
4 | from fuzzywuzzy import fuzz
5 | import pickle
6 | import numpy as np
7 |
8 | cv = pickle.load(open('cv.pkl','rb'))
9 |
10 |
11 | def test_common_words(q1,q2):
12 | w1 = set(map(lambda word: word.lower().strip(), q1.split(" ")))
13 | w2 = set(map(lambda word: word.lower().strip(), q2.split(" ")))
14 | return len(w1 & w2)
15 |
16 | def test_total_words(q1,q2):
17 | w1 = set(map(lambda word: word.lower().strip(), q1.split(" ")))
18 | w2 = set(map(lambda word: word.lower().strip(), q2.split(" ")))
19 | return (len(w1) + len(w2))
20 |
21 |
22 | def test_fetch_token_features(q1, q2):
23 | SAFE_DIV = 0.0001
24 |
25 | STOP_WORDS = pickle.load(open('stopwords.pkl','rb'))
26 |
27 | token_features = [0.0] * 8
28 |
29 | # Converting the Sentence into Tokens:
30 | q1_tokens = q1.split()
31 | q2_tokens = q2.split()
32 |
33 | if len(q1_tokens) == 0 or len(q2_tokens) == 0:
34 | return token_features
35 |
36 | # Get the non-stopwords in Questions
37 | q1_words = set([word for word in q1_tokens if word not in STOP_WORDS])
38 | q2_words = set([word for word in q2_tokens if word not in STOP_WORDS])
39 |
40 | # Get the stopwords in Questions
41 | q1_stops = set([word for word in q1_tokens if word in STOP_WORDS])
42 | q2_stops = set([word for word in q2_tokens if word in STOP_WORDS])
43 |
44 | # Get the common non-stopwords from Question pair
45 | common_word_count = len(q1_words.intersection(q2_words))
46 |
47 | # Get the common stopwords from Question pair
48 | common_stop_count = len(q1_stops.intersection(q2_stops))
49 |
50 | # Get the common Tokens from Question pair
51 | common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))
52 |
53 | token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
54 | token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
55 | token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
56 | token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
57 | token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
58 | token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
59 |
60 | # Last word of both question is same or not
61 | token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
62 |
63 | # First word of both question is same or not
64 | token_features[7] = int(q1_tokens[0] == q2_tokens[0])
65 |
66 | return token_features
67 |
68 |
69 | def test_fetch_length_features(q1, q2):
70 | length_features = [0.0] * 3
71 |
72 | # Converting the Sentence into Tokens:
73 | q1_tokens = q1.split()
74 | q2_tokens = q2.split()
75 |
76 | if len(q1_tokens) == 0 or len(q2_tokens) == 0:
77 | return length_features
78 |
79 | # Absolute length features
80 | length_features[0] = abs(len(q1_tokens) - len(q2_tokens))
81 |
82 | # Average Token Length of both Questions
83 | length_features[1] = (len(q1_tokens) + len(q2_tokens)) / 2
84 |
85 | strs = list(distance.lcsubstrings(q1, q2))
86 | length_features[2] = len(strs[0]) / (min(len(q1), len(q2)) + 1)
87 |
88 | return length_features
89 |
90 |
def test_fetch_fuzzy_features(q1, q2):
    """Return four fuzzywuzzy similarity scores for a pair of questions.

    The scores are, in order: ``QRatio``, ``partial_ratio``,
    ``token_sort_ratio`` and ``token_set_ratio``.
    """
    scorers = (
        fuzz.QRatio,            # fuzz_ratio
        fuzz.partial_ratio,     # fuzz_partial_ratio
        fuzz.token_sort_ratio,  # token_sort_ratio
        fuzz.token_set_ratio,   # token_set_ratio
    )
    return [scorer(q1, q2) for scorer in scorers]
107 |
108 |
# Contraction -> expansion table used by preprocess(). Built once at import
# time instead of on every call.
# https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
# https://stackoverflow.com/a/19794953
_CONTRACTIONS = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

# Any single non-word character. Written as a raw string so the backslash is
# not treated as a (invalid) string escape sequence.
_NON_WORD_PATTERN = re.compile(r'\W')


def preprocess(q):
    """Normalise a question string before feature extraction.

    Steps, in order: lower-case and strip; spell out a few currency/symbol
    characters; drop the ``[math]`` marker; abbreviate round thousands,
    millions and billions as ``k``/``m``/``b``; expand English contractions;
    strip HTML markup; replace every non-word character with a space and
    strip the result.

    Args:
        q: The raw question. It is converted with ``str()`` first, so
            non-string values are accepted.

    Returns:
        The cleaned question as a string.
    """
    q = str(q).lower().strip()

    # Replace certain special characters with their string equivalents
    q = q.replace('%', ' percent')
    q = q.replace('$', ' dollar ')
    q = q.replace('₹', ' rupee ')
    q = q.replace('€', ' euro ')
    q = q.replace('@', ' at ')

    # The pattern '[math]' appears around 900 times in the whole dataset.
    q = q.replace('[math]', '')

    # Replacing some numbers with string equivalents (not perfect, can be done
    # better to account for more cases). Largest magnitude first so a billion
    # is not consumed as a million or a thousand.
    q = q.replace(',000,000,000 ', 'b ')
    q = q.replace(',000,000 ', 'm ')
    q = q.replace(',000 ', 'k ')
    q = re.sub(r'([0-9]+)000000000', r'\1b', q)
    q = re.sub(r'([0-9]+)000000', r'\1m', q)
    q = re.sub(r'([0-9]+)000', r'\1k', q)

    # Decontracting words: whole-token lookup first ...
    q = ' '.join(_CONTRACTIONS.get(word, word) for word in q.split())

    # ... then generic suffix expansion for anything the table did not cover.
    q = q.replace("'ve", " have")
    q = q.replace("n't", " not")
    q = q.replace("'re", " are")
    q = q.replace("'ll", " will")

    # Removing HTML tags. The parser is named explicitly so the result does
    # not depend on which optional parsers happen to be installed.
    q = BeautifulSoup(q, "html.parser").get_text()

    # Remove punctuations
    q = _NON_WORD_PATTERN.sub(' ', q).strip()

    return q
276 |
277 |
def query_point_creator(q1, q2):
    """Build the single-row feature matrix for one pair of questions.

    Both questions are preprocessed, then 22 hand-crafted features
    (7 basic, 8 token, 3 length, 4 fuzzy) are computed and placed in front of
    the bag-of-words vectors of question 1 and question 2 produced by the
    module-level vectorizer ``cv``.

    Returns:
        The horizontally stacked array with one row.
    """
    # preprocess
    q1 = preprocess(q1)
    q2 = preprocess(q2)

    # basic features: character lengths, space-split word counts,
    # shared / total word counts and their rounded ratio
    shared = test_common_words(q1, q2)
    total = test_total_words(q1, q2)
    features = [
        len(q1),
        len(q2),
        len(q1.split(" ")),
        len(q2.split(" ")),
        shared,
        total,
        round(shared / total, 2),
    ]

    # token, length based and fuzzy features, in that order
    features.extend(test_fetch_token_features(q1, q2))
    features.extend(test_fetch_length_features(q1, q2))
    features.extend(test_fetch_fuzzy_features(q1, q2))

    # bow features for q1 and q2
    bow_first = cv.transform([q1]).toarray()
    bow_second = cv.transform([q2]).toarray()

    handcrafted = np.array(features).reshape(1, 22)
    return np.hstack((handcrafted, bow_first, bow_second))
--------------------------------------------------------------------------------
/streamlit-app/readme.txt:
--------------------------------------------------------------------------------
1 | This is the streamlit web app
2 |
--------------------------------------------------------------------------------
/streamlit-app/requirements.txt:
--------------------------------------------------------------------------------
1 | streamlit
2 | scikit-learn
3 | fuzzywuzzy
4 | distance
5 | bs4
--------------------------------------------------------------------------------
/streamlit-app/setup.sh:
--------------------------------------------------------------------------------
# Create the Streamlit config directory and write the server settings.
mkdir -p ~/.streamlit/

# A here-document is used instead of `echo "...\n..."` because whether echo
# expands "\n" depends on the shell; the here-document produces real line
# breaks everywhere. $PORT is expanded from the environment when this runs.
cat > ~/.streamlit/config.toml <<EOF
[server]
port = $PORT
enableCORS = false
headless = true

EOF
--------------------------------------------------------------------------------