├── LICENSE
├── Preprocessing Cheat - Logistic Regression.ipynb
└── README.md
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Ajay Halthor
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Preprocessing Cheat - Logistic Regression.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 349,
6 | "metadata": {
7 | "scrolled": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "#!pip3 install statsmodels"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "Let's see how Logistic Regression acts with 5 techniques:\n",
19 | "1. Standardization of Numerical Variables\n",
20 | "2. Encoding of Categorical Variables\n",
21 | "3. Data Imbalance\n",
22 | "4. Collinearity\n",
23 | "5. Missing Values"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 1,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "import numpy as np\n",
33 | "import pandas as pd\n",
34 | "from sklearn.impute import SimpleImputer\n",
35 | "from sklearn.linear_model import LogisticRegression\n",
36 | "from sklearn.metrics import roc_auc_score, average_precision_score\n",
37 | "from sklearn.model_selection import train_test_split\n",
38 | "from sklearn.pipeline import Pipeline\n",
39 | "from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder\n",
40 | "from sklearn_pandas import DataFrameMapper\n",
41 | "from snape.make_dataset import make_dataset"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {},
47 | "source": [
48 | "Check out snape [here](https://github.com/mbernico/snape)"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 71,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "def get_data(categorical_features=True,\n",
58 | " balanced=True, \n",
59 | " correlated_features=False, \n",
60 | " missing_values=False,\n",
61 | " dataset_size=12000):\n",
62 | " \n",
63 | " if categorical_features:\n",
64 | " label_list = []\n",
65 | " N_CATEGORICAL = 4\n",
66 | " for i in range(N_CATEGORICAL):\n",
67 | " num_classes = np.random.randint(2, 10)\n",
68 | " labels = list(np.arange(num_classes))\n",
69 | " labels = [f'str_{i}' for i in labels]\n",
70 | " label_list.append(labels)\n",
71 | " \n",
72 | " if correlated_features:\n",
73 | " N_REDUNDANT = 1\n",
74 | " N_REPEATED = 1\n",
75 | " N_INFORMATIVE = 8 - N_REDUNDANT - N_REPEATED\n",
76 | " \n",
77 | " conf = {\n",
78 | " \"type\": \"classification\",\n",
79 | " \"n_classes\": 2,\n",
80 | " \"n_samples\": dataset_size,\n",
81 | " \"n_features\": 8,\n",
82 | " \"out_path\": \"./\",\n",
83 | " \"output\": \"my_dataset\",\n",
84 | " \"n_informative\": N_INFORMATIVE if correlated_features else 8,\n",
85 | " \"n_repeated\": N_REPEATED if correlated_features else 0,\n",
86 | " \"n_redundant\": N_REDUNDANT if correlated_features else 0,\n",
87 | " \"n_clusters\": 2,\n",
88 | " \"weights\": [0.5, 0.5] if balanced else [0.9, 0.1],\n",
89 | " \"pct_missing\": 0.70 if missing_values else 0.00,\n",
90 | " \"n_categorical\": N_CATEGORICAL if categorical_features else 0,\n",
91 | " \"random_seed\":42,\n",
92 | " \"label_list\":label_list if categorical_features else []\n",
93 | " }\n",
94 | "\n",
95 | " make_dataset(config=conf)\n",
96 | " df = pd.read_csv('my_dataset_train.csv')\n",
97 | " \n",
98 | " label = 'y'\n",
99 | " categorical_features = [col for col in df.columns if (df[col].dtype==object) & (col != label)]\n",
100 | " numerical_features = [col for col in df.columns if (col not in categorical_features) & (col != label)]\n",
101 | " \n",
102 | " return df, label, categorical_features, numerical_features\n",
103 | "\n",
104 | "def evaluation(pipeline, X, y):\n",
105 | " y_predict_proba = pipeline.predict_proba(X)[:, 1]\n",
106 | " return{\n",
107 | " 'auc': roc_auc_score(y, y_predict_proba),\n",
108 | " 'pr-auc': average_precision_score(y, y_predict_proba)\n",
109 | " }"
110 | ]
111 | },
112 | {
113 | "cell_type": "markdown",
114 | "metadata": {},
115 | "source": [
116 | "# Logistic Regression"
117 | ]
118 | },
119 | {
120 | "cell_type": "markdown",
121 | "metadata": {},
122 | "source": [
123 | "## 1.1 Standardization"
124 | ]
125 | },
126 | {
127 | "cell_type": "markdown",
128 | "metadata": {},
129 | "source": [
130 | "### Without Standardization"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 108,
136 | "metadata": {},
137 | "outputs": [
138 | {
139 | "name": "stdout",
140 | "output_type": "stream",
141 | "text": [
142 | "--------------------------------------------------------------------------------\n",
143 | "Warning: n_repeated not in configuration, defaulting to 0\n",
144 | "Warning: n_clusters_per_class not in configuration, defaulting to 2\n",
145 | "Warning: effective_rank not in configuration, defaulting to None\n",
146 | "Warning: tail_strength not in configuration, defaulting to 0.5\n",
147 | "Warning: noise not in configuration, defaulting to 0.0\n",
148 | "Warning: shuffle not in configuration, defaulting to True\n",
149 | "Creating Classification Dataset...\n",
150 | "Warning: insert_dollar not in configuration, defaulting to 'No'\n",
151 | "Warning: insert_percent not in configuration, defaulting to 'No'\n",
152 | "Warning: star_schema not in configuration, defaulting to 'No'\n",
153 | "Writing Train/Test Datasets\n"
154 | ]
155 | },
156 | {
157 | "data": {
158 | "text/html": [
159 | "
\n",
160 | "\n",
173 | "
\n",
174 | " \n",
175 | " \n",
176 | " | \n",
177 | " x0 | \n",
178 | " x1 | \n",
179 | " x2 | \n",
180 | " x3 | \n",
181 | " x4 | \n",
182 | " x5 | \n",
183 | " x6 | \n",
184 | " x7 | \n",
185 | "
\n",
186 | " \n",
187 | " \n",
188 | " \n",
189 | " count | \n",
190 | " 96000.000000 | \n",
191 | " 96000.000000 | \n",
192 | " 96000.000000 | \n",
193 | " 96000.000000 | \n",
194 | " 96000.000000 | \n",
195 | " 96000.000000 | \n",
196 | " 96000.000000 | \n",
197 | " 96000.000000 | \n",
198 | "
\n",
199 | " \n",
200 | " mean | \n",
201 | " -1.645228 | \n",
202 | " 2.244024 | \n",
203 | " 1.045137 | \n",
204 | " 2.714882 | \n",
205 | " -0.020220 | \n",
206 | " -0.004654 | \n",
207 | " -1.536356 | \n",
208 | " -4.197611 | \n",
209 | "
\n",
210 | " \n",
211 | " std | \n",
212 | " 6.209578 | \n",
213 | " 8.602877 | \n",
214 | " 1.733465 | \n",
215 | " 9.096954 | \n",
216 | " 11.988777 | \n",
217 | " 4.719140 | \n",
218 | " 2.810519 | \n",
219 | " 15.886376 | \n",
220 | "
\n",
221 | " \n",
222 | " min | \n",
223 | " -35.268886 | \n",
224 | " -34.012532 | \n",
225 | " -7.229214 | \n",
226 | " -35.163348 | \n",
227 | " -49.768093 | \n",
228 | " -20.945358 | \n",
229 | " -13.822828 | \n",
230 | " -78.725850 | \n",
231 | "
\n",
232 | " \n",
233 | " 25% | \n",
234 | " -5.758226 | \n",
235 | " -3.452518 | \n",
236 | " -0.106077 | \n",
237 | " -3.561230 | \n",
238 | " -8.276808 | \n",
239 | " -3.241806 | \n",
240 | " -3.394866 | \n",
241 | " -15.068164 | \n",
242 | "
\n",
243 | " \n",
244 | " 50% | \n",
245 | " -1.709702 | \n",
246 | " 2.393561 | \n",
247 | " 1.051668 | \n",
248 | " 2.789480 | \n",
249 | " 0.106544 | \n",
250 | " -0.308207 | \n",
251 | " -1.546129 | \n",
252 | " -3.386493 | \n",
253 | "
\n",
254 | " \n",
255 | " 75% | \n",
256 | " 2.515567 | \n",
257 | " 8.106976 | \n",
258 | " 2.192445 | \n",
259 | " 9.022244 | \n",
260 | " 8.357289 | \n",
261 | " 3.115309 | \n",
262 | " 0.302129 | \n",
263 | " 7.250865 | \n",
264 | "
\n",
265 | " \n",
266 | " max | \n",
267 | " 25.206965 | \n",
268 | " 42.072130 | \n",
269 | " 9.851492 | \n",
270 | " 43.896856 | \n",
271 | " 43.189071 | \n",
272 | " 21.426677 | \n",
273 | " 13.305723 | \n",
274 | " 62.477698 | \n",
275 | "
\n",
276 | " \n",
277 | "
\n",
278 | "
"
279 | ],
280 | "text/plain": [
281 | " x0 x1 x2 x3 x4 \\\n",
282 | "count 96000.000000 96000.000000 96000.000000 96000.000000 96000.000000 \n",
283 | "mean -1.645228 2.244024 1.045137 2.714882 -0.020220 \n",
284 | "std 6.209578 8.602877 1.733465 9.096954 11.988777 \n",
285 | "min -35.268886 -34.012532 -7.229214 -35.163348 -49.768093 \n",
286 | "25% -5.758226 -3.452518 -0.106077 -3.561230 -8.276808 \n",
287 | "50% -1.709702 2.393561 1.051668 2.789480 0.106544 \n",
288 | "75% 2.515567 8.106976 2.192445 9.022244 8.357289 \n",
289 | "max 25.206965 42.072130 9.851492 43.896856 43.189071 \n",
290 | "\n",
291 | " x5 x6 x7 \n",
292 | "count 96000.000000 96000.000000 96000.000000 \n",
293 | "mean -0.004654 -1.536356 -4.197611 \n",
294 | "std 4.719140 2.810519 15.886376 \n",
295 | "min -20.945358 -13.822828 -78.725850 \n",
296 | "25% -3.241806 -3.394866 -15.068164 \n",
297 | "50% -0.308207 -1.546129 -3.386493 \n",
298 | "75% 3.115309 0.302129 7.250865 \n",
299 | "max 21.426677 13.305723 62.477698 "
300 | ]
301 | },
302 | "execution_count": 108,
303 | "metadata": {},
304 | "output_type": "execute_result"
305 | }
306 | ],
307 | "source": [
308 | "df, label, categorical_features, numerical_features = get_data(categorical_features=False, dataset_size=120000)\n",
309 | "df[numerical_features].describe()"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": 111,
315 | "metadata": {},
316 | "outputs": [],
317 | "source": [
318 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n",
319 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n",
320 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]"
321 | ]
322 | },
323 | {
324 | "cell_type": "code",
325 | "execution_count": 114,
326 | "metadata": {},
327 | "outputs": [
328 | {
329 | "name": "stdout",
330 | "output_type": "stream",
331 | "text": [
332 | "[Pipeline] ............... (step 1 of 1) Processing clf, total= 0.3s\n"
333 | ]
334 | },
335 | {
336 | "data": {
337 | "text/plain": [
338 | "{'auc': 0.8148784308322949, 'pr-auc': 0.818032430163559}"
339 | ]
340 | },
341 | "execution_count": 114,
342 | "metadata": {},
343 | "output_type": "execute_result"
344 | }
345 | ],
346 | "source": [
347 | "clf = LogisticRegression()\n",
348 | "pipeline = Pipeline([\n",
349 | " ('clf', clf)\n",
350 | "], verbose=True)\n",
351 | "\n",
352 | "pipeline.fit(X_train[numerical_features], y_train)\n",
353 | "evaluation(pipeline, X_test[numerical_features], y_test)"
354 | ]
355 | },
356 | {
357 | "cell_type": "markdown",
358 | "metadata": {},
359 | "source": [
360 | "### With Standardization"
361 | ]
362 | },
363 | {
364 | "cell_type": "code",
365 | "execution_count": 115,
366 | "metadata": {},
367 | "outputs": [
368 | {
369 | "name": "stdout",
370 | "output_type": "stream",
371 | "text": [
372 | "[Pipeline] ........ (step 1 of 2) Processing preprocess, total= 0.1s\n",
373 | "[Pipeline] ............... (step 2 of 2) Processing clf, total= 0.1s\n"
374 | ]
375 | },
376 | {
377 | "data": {
378 | "text/plain": [
379 | "{'auc': 0.8148798631692816, 'pr-auc': 0.8180303186841142}"
380 | ]
381 | },
382 | "execution_count": 115,
383 | "metadata": {},
384 | "output_type": "execute_result"
385 | }
386 | ],
387 | "source": [
388 | "num = [([n], [StandardScaler()]) for n in numerical_features]\n",
389 | "mapper = DataFrameMapper(num, df_out=True)\n",
390 | "\n",
391 | "clf = LogisticRegression()\n",
392 | "pipeline = Pipeline([\n",
393 | " ('preprocess', mapper),\n",
394 | " ('clf', clf)\n",
395 | "], verbose=True)\n",
396 | "\n",
397 | "pipeline.fit(X_train[numerical_features], y_train)\n",
398 | "evaluation(pipeline, X_test[numerical_features], y_test)"
399 | ]
400 | },
401 | {
402 | "cell_type": "markdown",
403 | "metadata": {},
404 | "source": [
405 | "**Result**\n",
406 | "- No need to scale for logistic regression accuracy. But convergence is faster. [More info here](https://stats.stackexchange.com/questions/48360/is-standardization-needed-before-fitting-logistic-regression#:~:text=3%20Answers&text=Standardization%20isn't%20required%20for,the%20technique%20used%20for%20optimization.&text=Otherwise%2C%20you%20can%20run%20your,standardization%20treatment%20on%20the%20features)"
407 | ]
408 | },
409 | {
410 | "cell_type": "markdown",
411 | "metadata": {},
412 | "source": [
413 | "## 1.2 Encoding"
414 | ]
415 | },
416 | {
417 | "cell_type": "markdown",
418 | "metadata": {},
419 | "source": [
420 | "We need numeric encoding for logistic regression."
421 | ]
422 | },
423 | {
424 | "cell_type": "code",
425 | "execution_count": 3,
426 | "metadata": {},
427 | "outputs": [
428 | {
429 | "name": "stdout",
430 | "output_type": "stream",
431 | "text": [
432 | "--------------------------------------------------------------------------------\n",
433 | "Warning: n_repeated not in configuration, defaulting to 0\n",
434 | "Warning: n_clusters_per_class not in configuration, defaulting to 2\n",
435 | "Warning: effective_rank not in configuration, defaulting to None\n",
436 | "Warning: tail_strength not in configuration, defaulting to 0.5\n",
437 | "Warning: noise not in configuration, defaulting to 0.0\n",
438 | "Warning: shuffle not in configuration, defaulting to True\n",
439 | "Creating Classification Dataset...\n",
440 | "Creating Categorical Features...\n",
441 | "Warning: insert_dollar not in configuration, defaulting to 'No'\n",
442 | "Warning: insert_percent not in configuration, defaulting to 'No'\n",
443 | "Warning: star_schema not in configuration, defaulting to 'No'\n",
444 | "Writing Train/Test Datasets\n"
445 | ]
446 | }
447 | ],
448 | "source": [
449 | "df, label, categorical_features, numerical_features = get_data()"
450 | ]
451 | },
452 | {
453 | "cell_type": "markdown",
454 | "metadata": {},
455 | "source": [
456 | "### One Hot Encoding"
457 | ]
458 | },
459 | {
460 | "cell_type": "code",
461 | "execution_count": 10,
462 | "metadata": {},
463 | "outputs": [
464 | {
465 | "name": "stdout",
466 | "output_type": "stream",
467 | "text": [
468 | "[Pipeline] ........ (step 1 of 2) Processing preprocess, total= 0.1s\n",
469 | "[Pipeline] ............... (step 2 of 2) Processing clf, total= 0.9s\n"
470 | ]
471 | },
472 | {
473 | "data": {
474 | "text/plain": [
475 | "{'auc': 0.8304397645792462, 'pr-auc': 0.80297861579569}"
476 | ]
477 | },
478 | "execution_count": 10,
479 | "metadata": {},
480 | "output_type": "execute_result"
481 | }
482 | ],
483 | "source": [
484 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n",
485 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n",
486 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]\n",
487 | "\n",
488 | "num = [([n], [SimpleImputer()]) for n in numerical_features]\n",
489 | "cat = [([c], [OneHotEncoder()]) for c in categorical_features]\n",
490 | "mapper = DataFrameMapper(cat + num, df_out=True)\n",
491 | "\n",
492 | "clf = LogisticRegression(max_iter=1000)\n",
493 | "pipeline = Pipeline([\n",
494 | " ('preprocess', mapper),\n",
495 | " ('clf', clf)\n",
496 | "], verbose=True)\n",
497 | "\n",
498 | "pipeline.fit(X_train, y_train)\n",
499 | "evaluation(pipeline, X_test, y_test)"
500 | ]
501 | },
502 | {
503 | "cell_type": "code",
504 | "execution_count": 11,
505 | "metadata": {
506 | "scrolled": true
507 | },
508 | "outputs": [
509 | {
510 | "data": {
511 | "text/html": [
512 | "\n",
513 | "\n",
526 | "
\n",
527 | " \n",
528 | " \n",
529 | " | \n",
530 | " 8640 | \n",
531 | " 8641 | \n",
532 | " 8642 | \n",
533 | " 8643 | \n",
534 | " 8644 | \n",
535 | "
\n",
536 | " \n",
537 | " \n",
538 | " \n",
539 | " x1_x0_str_0 | \n",
540 | " 0.000000 | \n",
541 | " 0.000000 | \n",
542 | " 0.000000 | \n",
543 | " 0.000000 | \n",
544 | " 0.000000 | \n",
545 | "
\n",
546 | " \n",
547 | " x1_x0_str_1 | \n",
548 | " 0.000000 | \n",
549 | " 0.000000 | \n",
550 | " 0.000000 | \n",
551 | " 0.000000 | \n",
552 | " 0.000000 | \n",
553 | "
\n",
554 | " \n",
555 | " x1_x0_str_2 | \n",
556 | " 1.000000 | \n",
557 | " 0.000000 | \n",
558 | " 0.000000 | \n",
559 | " 1.000000 | \n",
560 | " 1.000000 | \n",
561 | "
\n",
562 | " \n",
563 | " x1_x0_str_3 | \n",
564 | " 0.000000 | \n",
565 | " 1.000000 | \n",
566 | " 1.000000 | \n",
567 | " 0.000000 | \n",
568 | " 0.000000 | \n",
569 | "
\n",
570 | " \n",
571 | " x1_x0_str_4 | \n",
572 | " 0.000000 | \n",
573 | " 0.000000 | \n",
574 | " 0.000000 | \n",
575 | " 0.000000 | \n",
576 | " 0.000000 | \n",
577 | "
\n",
578 | " \n",
579 | " x1_x0_str_5 | \n",
580 | " 0.000000 | \n",
581 | " 0.000000 | \n",
582 | " 0.000000 | \n",
583 | " 0.000000 | \n",
584 | " 0.000000 | \n",
585 | "
\n",
586 | " \n",
587 | " x3_x0_str_0 | \n",
588 | " 0.000000 | \n",
589 | " 0.000000 | \n",
590 | " 0.000000 | \n",
591 | " 0.000000 | \n",
592 | " 0.000000 | \n",
593 | "
\n",
594 | " \n",
595 | " x3_x0_str_1 | \n",
596 | " 0.000000 | \n",
597 | " 0.000000 | \n",
598 | " 1.000000 | \n",
599 | " 0.000000 | \n",
600 | " 0.000000 | \n",
601 | "
\n",
602 | " \n",
603 | " x3_x0_str_2 | \n",
604 | " 0.000000 | \n",
605 | " 1.000000 | \n",
606 | " 0.000000 | \n",
607 | " 1.000000 | \n",
608 | " 1.000000 | \n",
609 | "
\n",
610 | " \n",
611 | " x3_x0_str_3 | \n",
612 | " 1.000000 | \n",
613 | " 0.000000 | \n",
614 | " 0.000000 | \n",
615 | " 0.000000 | \n",
616 | " 0.000000 | \n",
617 | "
\n",
618 | " \n",
619 | " x3_x0_str_4 | \n",
620 | " 0.000000 | \n",
621 | " 0.000000 | \n",
622 | " 0.000000 | \n",
623 | " 0.000000 | \n",
624 | " 0.000000 | \n",
625 | "
\n",
626 | " \n",
627 | " x6_x0_str_0 | \n",
628 | " 0.000000 | \n",
629 | " 0.000000 | \n",
630 | " 0.000000 | \n",
631 | " 0.000000 | \n",
632 | " 0.000000 | \n",
633 | "
\n",
634 | " \n",
635 | " x6_x0_str_1 | \n",
636 | " 0.000000 | \n",
637 | " 0.000000 | \n",
638 | " 0.000000 | \n",
639 | " 0.000000 | \n",
640 | " 0.000000 | \n",
641 | "
\n",
642 | " \n",
643 | " x6_x0_str_2 | \n",
644 | " 0.000000 | \n",
645 | " 0.000000 | \n",
646 | " 0.000000 | \n",
647 | " 1.000000 | \n",
648 | " 0.000000 | \n",
649 | "
\n",
650 | " \n",
651 | " x6_x0_str_3 | \n",
652 | " 1.000000 | \n",
653 | " 1.000000 | \n",
654 | " 0.000000 | \n",
655 | " 0.000000 | \n",
656 | " 0.000000 | \n",
657 | "
\n",
658 | " \n",
659 | " x6_x0_str_4 | \n",
660 | " 0.000000 | \n",
661 | " 0.000000 | \n",
662 | " 1.000000 | \n",
663 | " 0.000000 | \n",
664 | " 1.000000 | \n",
665 | "
\n",
666 | " \n",
667 | " x6_x0_str_5 | \n",
668 | " 0.000000 | \n",
669 | " 0.000000 | \n",
670 | " 0.000000 | \n",
671 | " 0.000000 | \n",
672 | " 0.000000 | \n",
673 | "
\n",
674 | " \n",
675 | " x6_x0_str_6 | \n",
676 | " 0.000000 | \n",
677 | " 0.000000 | \n",
678 | " 0.000000 | \n",
679 | " 0.000000 | \n",
680 | " 0.000000 | \n",
681 | "
\n",
682 | " \n",
683 | " x6_x0_str_7 | \n",
684 | " 0.000000 | \n",
685 | " 0.000000 | \n",
686 | " 0.000000 | \n",
687 | " 0.000000 | \n",
688 | " 0.000000 | \n",
689 | "
\n",
690 | " \n",
691 | " x7_x0_str_0 | \n",
692 | " 0.000000 | \n",
693 | " 0.000000 | \n",
694 | " 0.000000 | \n",
695 | " 0.000000 | \n",
696 | " 0.000000 | \n",
697 | "
\n",
698 | " \n",
699 | " x7_x0_str_1 | \n",
700 | " 0.000000 | \n",
701 | " 0.000000 | \n",
702 | " 0.000000 | \n",
703 | " 0.000000 | \n",
704 | " 1.000000 | \n",
705 | "
\n",
706 | " \n",
707 | " x7_x0_str_2 | \n",
708 | " 0.000000 | \n",
709 | " 0.000000 | \n",
710 | " 0.000000 | \n",
711 | " 0.000000 | \n",
712 | " 0.000000 | \n",
713 | "
\n",
714 | " \n",
715 | " x7_x0_str_3 | \n",
716 | " 1.000000 | \n",
717 | " 0.000000 | \n",
718 | " 1.000000 | \n",
719 | " 1.000000 | \n",
720 | " 0.000000 | \n",
721 | "
\n",
722 | " \n",
723 | " x7_x0_str_4 | \n",
724 | " 0.000000 | \n",
725 | " 0.000000 | \n",
726 | " 0.000000 | \n",
727 | " 0.000000 | \n",
728 | " 0.000000 | \n",
729 | "
\n",
730 | " \n",
731 | " x7_x0_str_5 | \n",
732 | " 0.000000 | \n",
733 | " 1.000000 | \n",
734 | " 0.000000 | \n",
735 | " 0.000000 | \n",
736 | " 0.000000 | \n",
737 | "
\n",
738 | " \n",
739 | " x7_x0_str_6 | \n",
740 | " 0.000000 | \n",
741 | " 0.000000 | \n",
742 | " 0.000000 | \n",
743 | " 0.000000 | \n",
744 | " 0.000000 | \n",
745 | "
\n",
746 | " \n",
747 | " x0 | \n",
748 | " -10.703380 | \n",
749 | " -4.189989 | \n",
750 | " 10.965457 | \n",
751 | " 11.707606 | \n",
752 | " -10.140494 | \n",
753 | "
\n",
754 | " \n",
755 | " x2 | \n",
756 | " -12.941344 | \n",
757 | " -14.909158 | \n",
758 | " -21.448502 | \n",
759 | " -6.947473 | \n",
760 | " -36.795258 | \n",
761 | "
\n",
762 | " \n",
763 | " x4 | \n",
764 | " -6.464533 | \n",
765 | " 7.366311 | \n",
766 | " 3.887812 | \n",
767 | " -8.306792 | \n",
768 | " -7.842345 | \n",
769 | "
\n",
770 | " \n",
771 | " x5 | \n",
772 | " -0.328846 | \n",
773 | " -11.833629 | \n",
774 | " 13.592603 | \n",
775 | " 10.200299 | \n",
776 | " -3.358164 | \n",
777 | "
\n",
778 | " \n",
779 | "
\n",
780 | "
"
781 | ],
782 | "text/plain": [
783 | " 8640 8641 8642 8643 8644\n",
784 | "x1_x0_str_0 0.000000 0.000000 0.000000 0.000000 0.000000\n",
785 | "x1_x0_str_1 0.000000 0.000000 0.000000 0.000000 0.000000\n",
786 | "x1_x0_str_2 1.000000 0.000000 0.000000 1.000000 1.000000\n",
787 | "x1_x0_str_3 0.000000 1.000000 1.000000 0.000000 0.000000\n",
788 | "x1_x0_str_4 0.000000 0.000000 0.000000 0.000000 0.000000\n",
789 | "x1_x0_str_5 0.000000 0.000000 0.000000 0.000000 0.000000\n",
790 | "x3_x0_str_0 0.000000 0.000000 0.000000 0.000000 0.000000\n",
791 | "x3_x0_str_1 0.000000 0.000000 1.000000 0.000000 0.000000\n",
792 | "x3_x0_str_2 0.000000 1.000000 0.000000 1.000000 1.000000\n",
793 | "x3_x0_str_3 1.000000 0.000000 0.000000 0.000000 0.000000\n",
794 | "x3_x0_str_4 0.000000 0.000000 0.000000 0.000000 0.000000\n",
795 | "x6_x0_str_0 0.000000 0.000000 0.000000 0.000000 0.000000\n",
796 | "x6_x0_str_1 0.000000 0.000000 0.000000 0.000000 0.000000\n",
797 | "x6_x0_str_2 0.000000 0.000000 0.000000 1.000000 0.000000\n",
798 | "x6_x0_str_3 1.000000 1.000000 0.000000 0.000000 0.000000\n",
799 | "x6_x0_str_4 0.000000 0.000000 1.000000 0.000000 1.000000\n",
800 | "x6_x0_str_5 0.000000 0.000000 0.000000 0.000000 0.000000\n",
801 | "x6_x0_str_6 0.000000 0.000000 0.000000 0.000000 0.000000\n",
802 | "x6_x0_str_7 0.000000 0.000000 0.000000 0.000000 0.000000\n",
803 | "x7_x0_str_0 0.000000 0.000000 0.000000 0.000000 0.000000\n",
804 | "x7_x0_str_1 0.000000 0.000000 0.000000 0.000000 1.000000\n",
805 | "x7_x0_str_2 0.000000 0.000000 0.000000 0.000000 0.000000\n",
806 | "x7_x0_str_3 1.000000 0.000000 1.000000 1.000000 0.000000\n",
807 | "x7_x0_str_4 0.000000 0.000000 0.000000 0.000000 0.000000\n",
808 | "x7_x0_str_5 0.000000 1.000000 0.000000 0.000000 0.000000\n",
809 | "x7_x0_str_6 0.000000 0.000000 0.000000 0.000000 0.000000\n",
810 | "x0 -10.703380 -4.189989 10.965457 11.707606 -10.140494\n",
811 | "x2 -12.941344 -14.909158 -21.448502 -6.947473 -36.795258\n",
812 | "x4 -6.464533 7.366311 3.887812 -8.306792 -7.842345\n",
813 | "x5 -0.328846 -11.833629 13.592603 10.200299 -3.358164"
814 | ]
815 | },
816 | "execution_count": 11,
817 | "metadata": {},
818 | "output_type": "execute_result"
819 | }
820 | ],
821 | "source": [
822 | "preprocessed_X_test = mapper.transform(X_test)\n",
823 | "preprocessed_X_test.head().T"
824 | ]
825 | },
826 | {
827 | "cell_type": "markdown",
828 | "metadata": {},
829 | "source": [
830 | "### Ordinal Encoding"
831 | ]
832 | },
833 | {
834 | "cell_type": "code",
835 | "execution_count": 13,
836 | "metadata": {},
837 | "outputs": [
838 | {
839 | "name": "stdout",
840 | "output_type": "stream",
841 | "text": [
842 | "[Pipeline] ........ (step 1 of 2) Processing preprocess, total= 0.1s\n",
843 | "[Pipeline] ............... (step 2 of 2) Processing clf, total= 0.1s\n"
844 | ]
845 | },
846 | {
847 | "data": {
848 | "text/plain": [
849 | "{'auc': 0.8194499904512231, 'pr-auc': 0.7996358755932719}"
850 | ]
851 | },
852 | "execution_count": 13,
853 | "metadata": {},
854 | "output_type": "execute_result"
855 | }
856 | ],
857 | "source": [
858 | "num = [([n], [SimpleImputer()]) for n in numerical_features]\n",
859 | "cat = [([c], [OrdinalEncoder()]) for c in categorical_features]\n",
860 | "mapper = DataFrameMapper(cat + num, df_out=True)\n",
861 | "\n",
862 | "clf = LogisticRegression()\n",
863 | "pipeline = Pipeline([\n",
864 | " ('preprocess', mapper),\n",
865 | " ('clf', clf)\n",
866 | "], verbose=True)\n",
867 | "\n",
868 | "pipeline.fit(X_train, y_train)\n",
869 | "evaluation(pipeline, X_test, y_test)"
870 | ]
871 | },
872 | {
873 | "cell_type": "code",
874 | "execution_count": 145,
875 | "metadata": {},
876 | "outputs": [
877 | {
878 | "data": {
879 | "text/html": [
880 | "\n",
881 | "\n",
894 | "
\n",
895 | " \n",
896 | " \n",
897 | " | \n",
898 | " 9000 | \n",
899 | " 9001 | \n",
900 | " 9002 | \n",
901 | " 9003 | \n",
902 | " 9004 | \n",
903 | "
\n",
904 | " \n",
905 | " \n",
906 | " \n",
907 | " feat_5 | \n",
908 | " 1.000000 | \n",
909 | " 3.000000 | \n",
910 | " 6.000000 | \n",
911 | " 0.000000 | \n",
912 | " 5.000000 | \n",
913 | "
\n",
914 | " \n",
915 | " feat_6 | \n",
916 | " 2.000000 | \n",
917 | " 3.000000 | \n",
918 | " 0.000000 | \n",
919 | " 1.000000 | \n",
920 | " 0.000000 | \n",
921 | "
\n",
922 | " \n",
923 | " feat_7 | \n",
924 | " 0.000000 | \n",
925 | " 2.000000 | \n",
926 | " 6.000000 | \n",
927 | " 4.000000 | \n",
928 | " 1.000000 | \n",
929 | "
\n",
930 | " \n",
931 | " feat_8 | \n",
932 | " 4.000000 | \n",
933 | " 7.000000 | \n",
934 | " 7.000000 | \n",
935 | " 0.000000 | \n",
936 | " 5.000000 | \n",
937 | "
\n",
938 | " \n",
939 | " feat_1 | \n",
940 | " -0.068768 | \n",
941 | " 0.425899 | \n",
942 | " 1.930354 | \n",
943 | " 1.157980 | \n",
944 | " -1.304169 | \n",
945 | "
\n",
946 | " \n",
947 | " feat_2 | \n",
948 | " -1.222878 | \n",
949 | " 0.293660 | \n",
950 | " 1.729959 | \n",
951 | " -0.716538 | \n",
952 | " 1.169799 | \n",
953 | "
\n",
954 | " \n",
955 | " feat_3 | \n",
956 | " -0.714906 | \n",
957 | " 1.509702 | \n",
958 | " -0.429593 | \n",
959 | " -0.708234 | \n",
960 | " -0.304866 | \n",
961 | "
\n",
962 | " \n",
963 | " feat_4 | \n",
964 | " -0.823643 | \n",
965 | " 1.997845 | \n",
966 | " 0.105752 | \n",
967 | " -0.953579 | \n",
968 | " 0.690543 | \n",
969 | "
\n",
970 | " \n",
971 | "
\n",
972 | "
"
973 | ],
974 | "text/plain": [
975 | " 9000 9001 9002 9003 9004\n",
976 | "feat_5 1.000000 3.000000 6.000000 0.000000 5.000000\n",
977 | "feat_6 2.000000 3.000000 0.000000 1.000000 0.000000\n",
978 | "feat_7 0.000000 2.000000 6.000000 4.000000 1.000000\n",
979 | "feat_8 4.000000 7.000000 7.000000 0.000000 5.000000\n",
980 | "feat_1 -0.068768 0.425899 1.930354 1.157980 -1.304169\n",
981 | "feat_2 -1.222878 0.293660 1.729959 -0.716538 1.169799\n",
982 | "feat_3 -0.714906 1.509702 -0.429593 -0.708234 -0.304866\n",
983 | "feat_4 -0.823643 1.997845 0.105752 -0.953579 0.690543"
984 | ]
985 | },
986 | "execution_count": 145,
987 | "metadata": {},
988 | "output_type": "execute_result"
989 | }
990 | ],
991 | "source": [
992 | "preprocessed_X_test = mapper.transform(X_test)\n",
993 | "preprocessed_X_test.head().T"
994 | ]
995 | },
996 | {
997 | "cell_type": "markdown",
998 | "metadata": {},
999 | "source": [
1000 | "**Result**: \n",
1001 | "- `OrdinalEncoding` works when relationships exist between categorical variables (size, weather). Otherwise, prefer `OneHotEncoding`\n",
1002 | "- `OneHotEncoding` takes up space. Hence more training time"
1003 | ]
1004 | },
1005 | {
1006 | "cell_type": "markdown",
1007 | "metadata": {},
1008 | "source": [
1009 | "## 1.3 Data Imbalance"
1010 | ]
1011 | },
1012 | {
1013 | "cell_type": "markdown",
1014 | "metadata": {},
1015 | "source": [
1016 | "What happens if the training data isn't balanced?"
1017 | ]
1018 | },
1019 | {
1020 | "cell_type": "markdown",
1021 | "metadata": {},
1022 | "source": [
1023 | "### Unbalanced"
1024 | ]
1025 | },
1026 | {
1027 | "cell_type": "code",
1028 | "execution_count": 102,
1029 | "metadata": {},
1030 | "outputs": [
1031 | {
1032 | "name": "stdout",
1033 | "output_type": "stream",
1034 | "text": [
1035 | "--------------------------------------------------------------------------------\n",
1036 | "Warning: n_clusters_per_class not in configuration, defaulting to 2\n",
1037 | "Warning: effective_rank not in configuration, defaulting to None\n",
1038 | "Warning: tail_strength not in configuration, defaulting to 0.5\n",
1039 | "Warning: noise not in configuration, defaulting to 0.0\n",
1040 | "Warning: shuffle not in configuration, defaulting to True\n",
1041 | "Creating Classification Dataset...\n",
1042 | "Creating Categorical Features...\n",
1043 | "Warning: insert_dollar not in configuration, defaulting to 'No'\n",
1044 | "Warning: insert_percent not in configuration, defaulting to 'No'\n",
1045 | "Warning: star_schema not in configuration, defaulting to 'No'\n",
1046 | "Writing Train/Test Datasets\n"
1047 | ]
1048 | }
1049 | ],
1050 | "source": [
1051 | "df, label, categorical_features, numerical_features = get_data(balanced=False)\n",
1052 | "\n",
1053 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n",
1054 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n",
1055 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]"
1056 | ]
1057 | },
1058 | {
1059 | "cell_type": "code",
1060 | "execution_count": 103,
1061 | "metadata": {},
1062 | "outputs": [
1063 | {
1064 | "data": {
1065 | "text/plain": [
1066 | "0 8599\n",
1067 | "1 1001\n",
1068 | "Name: y, dtype: int64"
1069 | ]
1070 | },
1071 | "execution_count": 103,
1072 | "metadata": {},
1073 | "output_type": "execute_result"
1074 | }
1075 | ],
1076 | "source": [
1077 | "df[label].value_counts()"
1078 | ]
1079 | },
1080 | {
1081 | "cell_type": "code",
1082 | "execution_count": 104,
1083 | "metadata": {},
1084 | "outputs": [
1085 | {
1086 | "data": {
1087 | "text/plain": [
1088 | "{'auc': 0.7869518716577542, 'pr-auc': 0.39239809756882393}"
1089 | ]
1090 | },
1091 | "execution_count": 104,
1092 | "metadata": {},
1093 | "output_type": "execute_result"
1094 | }
1095 | ],
1096 | "source": [
1097 | "num = [([n], [SimpleImputer()]) for n in numerical_features]\n",
1098 | "cat = [([c], [OrdinalEncoder()]) for c in categorical_features]\n",
1099 | "mapper = DataFrameMapper(cat + num, df_out=True)\n",
1100 | "\n",
1101 | "clf = LogisticRegression()\n",
1102 | "pipeline = Pipeline([\n",
1103 | " ('preprocess', mapper),\n",
1104 | " ('clf', clf)\n",
1105 | "])\n",
1106 | "\n",
1107 | "pipeline.fit(X_train, y_train)\n",
1108 | "evaluation(pipeline, X_test, y_test)"
1109 | ]
1110 | },
1111 | {
1112 | "cell_type": "code",
1113 | "execution_count": 105,
1114 | "metadata": {},
1115 | "outputs": [],
1116 | "source": [
1117 | "y_predict_proba = pipeline.predict_proba(X_test)[:, 1]"
1118 | ]
1119 | },
1120 | {
1121 | "cell_type": "code",
1122 | "execution_count": 106,
1123 | "metadata": {},
1124 | "outputs": [
1125 | {
1126 | "data": {
1127 | "text/plain": [
1128 | "0.10815533327119523"
1129 | ]
1130 | },
1131 | "execution_count": 106,
1132 | "metadata": {},
1133 | "output_type": "execute_result"
1134 | }
1135 | ],
1136 | "source": [
1137 | "y_predict_proba.mean()"
1138 | ]
1139 | },
1140 | {
1141 | "cell_type": "markdown",
1142 | "metadata": {},
1143 | "source": [
1144 | "## Balanced"
1145 | ]
1146 | },
1147 | {
1148 | "cell_type": "code",
1149 | "execution_count": 35,
1150 | "metadata": {},
1151 | "outputs": [
1152 | {
1153 | "name": "stdout",
1154 | "output_type": "stream",
1155 | "text": [
1156 | "--------------------------------------------------------------------------------\n",
1157 | "Warning: n_repeated not in configuration, defaulting to 0\n",
1158 | "Warning: n_clusters_per_class not in configuration, defaulting to 2\n",
1159 | "Warning: effective_rank not in configuration, defaulting to None\n",
1160 | "Warning: tail_strength not in configuration, defaulting to 0.5\n",
1161 | "Warning: noise not in configuration, defaulting to 0.0\n",
1162 | "Warning: shuffle not in configuration, defaulting to True\n",
1163 | "Creating Classification Dataset...\n",
1164 | "Creating Categorical Features...\n",
1165 | "Warning: insert_dollar not in configuration, defaulting to 'No'\n",
1166 | "Warning: insert_percent not in configuration, defaulting to 'No'\n",
1167 | "Warning: star_schema not in configuration, defaulting to 'No'\n",
1168 | "Writing Train/Test Datasets\n"
1169 | ]
1170 | },
1171 | {
1172 | "data": {
1173 | "text/plain": [
1174 | "{'auc': 0.7949023220244715, 'pr-auc': 0.7742073929744453}"
1175 | ]
1176 | },
1177 | "execution_count": 35,
1178 | "metadata": {},
1179 | "output_type": "execute_result"
1180 | }
1181 | ],
1182 | "source": [
1183 | "df, label, categorical_features, numerical_features = get_data(balanced=True)\n",
1184 | "\n",
1185 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n",
1186 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n",
1187 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]\n",
1188 | "\n",
1189 | "num = [([n], [SimpleImputer()]) for n in numerical_features]\n",
1190 | "cat = [([c], [OrdinalEncoder()]) for c in categorical_features]\n",
1191 | "mapper = DataFrameMapper(cat + num, df_out=True)\n",
1192 | "\n",
1193 | "clf = LogisticRegression()\n",
1194 | "pipeline = Pipeline([\n",
1195 | " ('preprocess', mapper),\n",
1196 | " ('clf', clf)\n",
1197 | "])\n",
1198 | "\n",
1199 | "pipeline.fit(X_train, y_train)\n",
1200 | "evaluation(pipeline, X_test, y_test)"
1201 | ]
1202 | },
1203 | {
1204 | "cell_type": "code",
1205 | "execution_count": 36,
1206 | "metadata": {},
1207 | "outputs": [
1208 | {
1209 | "data": {
1210 | "text/plain": [
1211 | "0.4994547544271453"
1212 | ]
1213 | },
1214 | "execution_count": 36,
1215 | "metadata": {},
1216 | "output_type": "execute_result"
1217 | }
1218 | ],
1219 | "source": [
1220 | "y_predict_proba = pipeline.predict_proba(X_test)[:, 1]\n",
1221 | "y_predict_proba.mean()"
1222 | ]
1223 | },
1224 | {
1225 | "cell_type": "markdown",
1226 | "metadata": {},
1227 | "source": [
1228 | "## Dealing with unbalanced data by over-weighting"
1229 | ]
1230 | },
1231 | {
1232 | "cell_type": "code",
1233 | "execution_count": 101,
1234 | "metadata": {},
1235 | "outputs": [
1236 | {
1237 | "name": "stdout",
1238 | "output_type": "stream",
1239 | "text": [
1240 | "--------------------------------------------------------------------------------\n",
1241 | "Warning: n_clusters_per_class not in configuration, defaulting to 2\n",
1242 | "Warning: effective_rank not in configuration, defaulting to None\n",
1243 | "Warning: tail_strength not in configuration, defaulting to 0.5\n",
1244 | "Warning: noise not in configuration, defaulting to 0.0\n",
1245 | "Warning: shuffle not in configuration, defaulting to True\n",
1246 | "Creating Classification Dataset...\n",
1247 | "Creating Categorical Features...\n",
1248 | "Warning: insert_dollar not in configuration, defaulting to 'No'\n",
1249 | "Warning: insert_percent not in configuration, defaulting to 'No'\n",
1250 | "Warning: star_schema not in configuration, defaulting to 'No'\n",
1251 | "Writing Train/Test Datasets\n"
1252 | ]
1253 | },
1254 | {
1255 | "data": {
1256 | "text/plain": [
1257 | "{'auc': 0.8113720373994346, 'pr-auc': 0.30360454333181025}"
1258 | ]
1259 | },
1260 | "execution_count": 101,
1261 | "metadata": {},
1262 | "output_type": "execute_result"
1263 | }
1264 | ],
1265 | "source": [
1266 | "df, label, categorical_features, numerical_features = get_data(balanced=False)\n",
1267 | "\n",
1268 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n",
1269 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n",
1270 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]\n",
1271 | "\n",
1272 | "num = [([n], [SimpleImputer()]) for n in numerical_features]\n",
1273 | "cat = [([c], [OrdinalEncoder()]) for c in categorical_features]\n",
1274 | "mapper = DataFrameMapper(cat + num, df_out=True)\n",
1275 | "\n",
1276 | "clf = LogisticRegression(class_weight='balanced')\n",
1277 | "pipeline = Pipeline([\n",
1278 | " ('preprocess', mapper),\n",
1279 | " ('clf', clf)\n",
1280 | "])\n",
1281 | "\n",
1282 | "pipeline.fit(X_train, y_train)\n",
1283 | "evaluation(pipeline, X_test, y_test)"
1284 | ]
1285 | },
1286 | {
1287 | "cell_type": "markdown",
1288 | "metadata": {},
1289 | "source": [
1290 | "**Result**:\n",
1291 | "- Having an unbalanced dataset doesn't harm ROC AUC much, but harms precision-recall metrics of the positive class.\n",
1292 | "- This is mostly due to lower predicted probability values. "
1293 | ]
1294 | },
1295 | {
1296 | "cell_type": "markdown",
1297 | "metadata": {},
1298 | "source": [
1299 | "## 1.4 Correlated Features"
1300 | ]
1301 | },
1302 | {
1303 | "cell_type": "code",
1304 | "execution_count": 72,
1305 | "metadata": {},
1306 | "outputs": [
1307 | {
1308 | "name": "stdout",
1309 | "output_type": "stream",
1310 | "text": [
1311 | "--------------------------------------------------------------------------------\n",
1312 | "Warning: n_clusters_per_class not in configuration, defaulting to 2\n",
1313 | "Warning: effective_rank not in configuration, defaulting to None\n",
1314 | "Warning: tail_strength not in configuration, defaulting to 0.5\n",
1315 | "Warning: noise not in configuration, defaulting to 0.0\n",
1316 | "Warning: shuffle not in configuration, defaulting to True\n",
1317 | "Creating Classification Dataset...\n",
1318 | "Warning: insert_dollar not in configuration, defaulting to 'No'\n",
1319 | "Warning: insert_percent not in configuration, defaulting to 'No'\n",
1320 | "Warning: star_schema not in configuration, defaulting to 'No'\n",
1321 | "Writing Train/Test Datasets\n"
1322 | ]
1323 | }
1324 | ],
1325 | "source": [
1326 | "df, label, categorical_features, numerical_features = get_data(categorical_features=False, correlated_features=True)"
1327 | ]
1328 | },
1329 | {
1330 | "cell_type": "code",
1331 | "execution_count": 74,
1332 | "metadata": {},
1333 | "outputs": [
1334 | {
1335 | "data": {
1336 | "text/plain": [
1337 | "{'auc': 0.9194931452103352, 'pr-auc': 0.8982012865508728}"
1338 | ]
1339 | },
1340 | "execution_count": 74,
1341 | "metadata": {},
1342 | "output_type": "execute_result"
1343 | }
1344 | ],
1345 | "source": [
1346 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n",
1347 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n",
1348 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]\n",
1349 | "\n",
1350 | "num = [([n], [SimpleImputer()]) for n in numerical_features]\n",
1351 | "mapper = DataFrameMapper(num, df_out=True)\n",
1352 | "\n",
1353 | "clf = LogisticRegression()\n",
1354 | "pipeline = Pipeline([\n",
1355 | " ('preprocess', mapper),\n",
1356 | " ('clf', clf)\n",
1357 | "])\n",
1358 | "\n",
1359 | "pipeline.fit(X_train, y_train)\n",
1360 | "evaluation(pipeline, X_test, y_test)"
1361 | ]
1362 | },
1363 | {
1364 | "cell_type": "code",
1365 | "execution_count": 75,
1366 | "metadata": {},
1367 | "outputs": [
1368 | {
1369 | "data": {
1370 | "text/html": [
1371 | "\n",
1372 | "OLS Regression Results\n",
1373 | "\n",
1374 | " Dep. Variable: | y | R-squared: | 0.483 | \n",
1375 | "
\n",
1376 | "\n",
1377 | " Model: | OLS | Adj. R-squared: | 0.483 | \n",
1378 | "
\n",
1379 | "\n",
1380 | " Method: | Least Squares | F-statistic: | 1345. | \n",
1381 | "
\n",
1382 | "\n",
1383 | " Date: | Sat, 10 Apr 2021 | Prob (F-statistic): | 0.00 | \n",
1384 | "
\n",
1385 | "\n",
1386 | " Time: | 14:31:47 | Log-Likelihood: | -3420.3 | \n",
1387 | "
\n",
1388 | "\n",
1389 | " No. Observations: | 8640 | AIC: | 6855. | \n",
1390 | "
\n",
1391 | "\n",
1392 | " Df Residuals: | 8633 | BIC: | 6904. | \n",
1393 | "
\n",
1394 | "\n",
1395 | " Df Model: | 6 | | | \n",
1396 | "
\n",
1397 | "\n",
1398 | " Covariance Type: | nonrobust | | | \n",
1399 | "
\n",
1400 | "
\n",
1401 | "\n",
1402 | "\n",
1403 | " | coef | std err | t | P>|t| | [0.025 | 0.975] | \n",
1404 | "
\n",
1405 | "\n",
1406 | " const | 0.5906 | 0.006 | 104.957 | 0.000 | 0.580 | 0.602 | \n",
1407 | "
\n",
1408 | "\n",
1409 | " x0 | -0.0043 | 0.000 | -20.533 | 0.000 | -0.005 | -0.004 | \n",
1410 | "
\n",
1411 | "\n",
1412 | " x1 | 0.0335 | 0.002 | 19.438 | 0.000 | 0.030 | 0.037 | \n",
1413 | "
\n",
1414 | "\n",
1415 | " x2 | 0.0447 | 0.001 | 43.084 | 0.000 | 0.043 | 0.047 | \n",
1416 | "
\n",
1417 | "\n",
1418 | " x3 | -0.0076 | 0.000 | -20.533 | 0.000 | -0.008 | -0.007 | \n",
1419 | "
\n",
1420 | "\n",
1421 | " x4 | -0.0142 | 0.001 | -27.006 | 0.000 | -0.015 | -0.013 | \n",
1422 | "
\n",
1423 | "\n",
1424 | " x5 | 0.0125 | 0.000 | 45.550 | 0.000 | 0.012 | 0.013 | \n",
1425 | "
\n",
1426 | "\n",
1427 | " x6 | -0.0017 | 0.002 | -0.997 | 0.319 | -0.005 | 0.002 | \n",
1428 | "
\n",
1429 | "\n",
1430 | " x7 | 0.0270 | 0.001 | 28.009 | 0.000 | 0.025 | 0.029 | \n",
1431 | "
\n",
1432 | "
\n",
1433 | "\n",
1434 | "\n",
1435 | " Omnibus: | 341.439 | Durbin-Watson: | 2.027 | \n",
1436 | "
\n",
1437 | "\n",
1438 | " Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 353.022 | \n",
1439 | "
\n",
1440 | "\n",
1441 | " Skew: | -0.467 | Prob(JB): | 2.20e-77 | \n",
1442 | "
\n",
1443 | "\n",
1444 | " Kurtosis: | 2.670 | Cond. No. | 1.54e+16 | \n",
1445 | "
\n",
1446 | "
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.5e-26. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular."
1447 | ],
1448 | "text/plain": [
1449 | "\n",
1450 | "\"\"\"\n",
1451 | " OLS Regression Results \n",
1452 | "==============================================================================\n",
1453 | "Dep. Variable: y R-squared: 0.483\n",
1454 | "Model: OLS Adj. R-squared: 0.483\n",
1455 | "Method: Least Squares F-statistic: 1345.\n",
1456 | "Date: Sat, 10 Apr 2021 Prob (F-statistic): 0.00\n",
1457 | "Time: 14:31:47 Log-Likelihood: -3420.3\n",
1458 | "No. Observations: 8640 AIC: 6855.\n",
1459 | "Df Residuals: 8633 BIC: 6904.\n",
1460 | "Df Model: 6 \n",
1461 | "Covariance Type: nonrobust \n",
1462 | "==============================================================================\n",
1463 | " coef std err t P>|t| [0.025 0.975]\n",
1464 | "------------------------------------------------------------------------------\n",
1465 | "const 0.5906 0.006 104.957 0.000 0.580 0.602\n",
1466 | "x0 -0.0043 0.000 -20.533 0.000 -0.005 -0.004\n",
1467 | "x1 0.0335 0.002 19.438 0.000 0.030 0.037\n",
1468 | "x2 0.0447 0.001 43.084 0.000 0.043 0.047\n",
1469 | "x3 -0.0076 0.000 -20.533 0.000 -0.008 -0.007\n",
1470 | "x4 -0.0142 0.001 -27.006 0.000 -0.015 -0.013\n",
1471 | "x5 0.0125 0.000 45.550 0.000 0.012 0.013\n",
1472 | "x6 -0.0017 0.002 -0.997 0.319 -0.005 0.002\n",
1473 | "x7 0.0270 0.001 28.009 0.000 0.025 0.029\n",
1474 | "==============================================================================\n",
1475 | "Omnibus: 341.439 Durbin-Watson: 2.027\n",
1476 | "Prob(Omnibus): 0.000 Jarque-Bera (JB): 353.022\n",
1477 | "Skew: -0.467 Prob(JB): 2.20e-77\n",
1478 | "Kurtosis: 2.670 Cond. No. 1.54e+16\n",
1479 | "==============================================================================\n",
1480 | "\n",
1481 | "Notes:\n",
1482 | "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
1483 | "[2] The smallest eigenvalue is 1.5e-26. This might indicate that there are\n",
1484 | "strong multicollinearity problems or that the design matrix is singular.\n",
1485 | "\"\"\""
1486 | ]
1487 | },
1488 | "execution_count": 75,
1489 | "metadata": {},
1490 | "output_type": "execute_result"
1491 | }
1492 | ],
1493 | "source": [
1494 | "import statsmodels.api as sm\n",
1495 | "preprocessed_X_train = mapper.transform(X_train)\n",
1496 | "preprocessed_X_train = sm.add_constant(preprocessed_X_train)\n",
1497 | "results = sm.OLS(y_train, preprocessed_X_train).fit()\n",
1498 | "results.summary()"
1499 | ]
1500 | },
1501 | {
1502 | "cell_type": "code",
1503 | "execution_count": 76,
1504 | "metadata": {},
1505 | "outputs": [
1506 | {
1507 | "name": "stdout",
1508 | "output_type": "stream",
1509 | "text": [
1510 | "x0, inf\n",
1511 | "x1, inf\n",
1512 | "x2, inf\n",
1513 | "x3, inf\n",
1514 | "x4, inf\n",
1515 | "x5, inf\n",
1516 | "x6, inf\n",
1517 | "x7, inf\n"
1518 | ]
1519 | },
1520 | {
1521 | "name": "stderr",
1522 | "output_type": "stream",
1523 | "text": [
1524 | "/usr/local/lib/python3.7/site-packages/statsmodels/stats/outliers_influence.py:193: RuntimeWarning: divide by zero encountered in double_scalars\n",
1525 | " vif = 1. / (1. - r_squared_i)\n"
1526 | ]
1527 | }
1528 | ],
1529 | "source": [
1530 | "from statsmodels.stats.outliers_influence import variance_inflation_factor\n",
1531 | "for column in numerical_features:\n",
1532 | " print(f\"\"\"{column}, {variance_inflation_factor(\n",
1533 | " preprocessed_X_train.values, \n",
1534 | " list(preprocessed_X_train.columns).index(column))}\"\"\")"
1535 | ]
1536 | },
1537 | {
1538 | "cell_type": "code",
1539 | "execution_count": 89,
1540 | "metadata": {},
1541 | "outputs": [
1542 | {
1543 | "data": {
1544 | "text/html": [
1545 | "\n",
1546 | "\n",
1559 | "
\n",
1560 | " \n",
1561 | " \n",
1562 | " | \n",
1563 | " x0 | \n",
1564 | " x1 | \n",
1565 | " x2 | \n",
1566 | " x3 | \n",
1567 | " x4 | \n",
1568 | " x5 | \n",
1569 | " x6 | \n",
1570 | " x7 | \n",
1571 | " y | \n",
1572 | "
\n",
1573 | " \n",
1574 | " \n",
1575 | " \n",
1576 | " x0 | \n",
1577 | " 1.000000 | \n",
1578 | " 0.132384 | \n",
1579 | " -0.097071 | \n",
1580 | " 1.000000 | \n",
1581 | " -0.035234 | \n",
1582 | " -0.162566 | \n",
1583 | " 0.346866 | \n",
1584 | " 0.567626 | \n",
1585 | " -0.000326 | \n",
1586 | "
\n",
1587 | " \n",
1588 | " x1 | \n",
1589 | " 0.132384 | \n",
1590 | " 1.000000 | \n",
1591 | " 0.029556 | \n",
1592 | " 0.132384 | \n",
1593 | " 0.143301 | \n",
1594 | " -0.434811 | \n",
1595 | " 0.091475 | \n",
1596 | " 0.211035 | \n",
1597 | " -0.020443 | \n",
1598 | "
\n",
1599 | " \n",
1600 | " x2 | \n",
1601 | " -0.097071 | \n",
1602 | " 0.029556 | \n",
1603 | " 1.000000 | \n",
1604 | " -0.097071 | \n",
1605 | " 0.272320 | \n",
1606 | " 0.001597 | \n",
1607 | " -0.077077 | \n",
1608 | " -0.546263 | \n",
1609 | " 0.275935 | \n",
1610 | "
\n",
1611 | " \n",
1612 | " x3 | \n",
1613 | " 1.000000 | \n",
1614 | " 0.132384 | \n",
1615 | " -0.097071 | \n",
1616 | " 1.000000 | \n",
1617 | " -0.035234 | \n",
1618 | " -0.162566 | \n",
1619 | " 0.346866 | \n",
1620 | " 0.567626 | \n",
1621 | " -0.000326 | \n",
1622 | "
\n",
1623 | " \n",
1624 | " x4 | \n",
1625 | " -0.035234 | \n",
1626 | " 0.143301 | \n",
1627 | " 0.272320 | \n",
1628 | " -0.035234 | \n",
1629 | " 1.000000 | \n",
1630 | " -0.144259 | \n",
1631 | " 0.144366 | \n",
1632 | " 0.314752 | \n",
1633 | " -0.008192 | \n",
1634 | "
\n",
1635 | " \n",
1636 | " x5 | \n",
1637 | " -0.162566 | \n",
1638 | " -0.434811 | \n",
1639 | " 0.001597 | \n",
1640 | " -0.162566 | \n",
1641 | " -0.144259 | \n",
1642 | " 1.000000 | \n",
1643 | " 0.120178 | \n",
1644 | " 0.083330 | \n",
1645 | " 0.544321 | \n",
1646 | "
\n",
1647 | " \n",
1648 | " x6 | \n",
1649 | " 0.346866 | \n",
1650 | " 0.091475 | \n",
1651 | " -0.077077 | \n",
1652 | " 0.346866 | \n",
1653 | " 0.144366 | \n",
1654 | " 0.120178 | \n",
1655 | " 1.000000 | \n",
1656 | " 0.649177 | \n",
1657 | " 0.308940 | \n",
1658 | "
\n",
1659 | " \n",
1660 | " x7 | \n",
1661 | " 0.567626 | \n",
1662 | " 0.211035 | \n",
1663 | " -0.546263 | \n",
1664 | " 0.567626 | \n",
1665 | " 0.314752 | \n",
1666 | " 0.083330 | \n",
1667 | " 0.649177 | \n",
1668 | " 1.000000 | \n",
1669 | " 0.071201 | \n",
1670 | "
\n",
1671 | " \n",
1672 | " y | \n",
1673 | " -0.000326 | \n",
1674 | " -0.020443 | \n",
1675 | " 0.275935 | \n",
1676 | " -0.000326 | \n",
1677 | " -0.008192 | \n",
1678 | " 0.544321 | \n",
1679 | " 0.308940 | \n",
1680 | " 0.071201 | \n",
1681 | " 1.000000 | \n",
1682 | "
\n",
1683 | " \n",
1684 | "
\n",
1685 | "
"
1686 | ],
1687 | "text/plain": [
1688 | " x0 x1 x2 x3 x4 x5 x6 \\\n",
1689 | "x0 1.000000 0.132384 -0.097071 1.000000 -0.035234 -0.162566 0.346866 \n",
1690 | "x1 0.132384 1.000000 0.029556 0.132384 0.143301 -0.434811 0.091475 \n",
1691 | "x2 -0.097071 0.029556 1.000000 -0.097071 0.272320 0.001597 -0.077077 \n",
1692 | "x3 1.000000 0.132384 -0.097071 1.000000 -0.035234 -0.162566 0.346866 \n",
1693 | "x4 -0.035234 0.143301 0.272320 -0.035234 1.000000 -0.144259 0.144366 \n",
1694 | "x5 -0.162566 -0.434811 0.001597 -0.162566 -0.144259 1.000000 0.120178 \n",
1695 | "x6 0.346866 0.091475 -0.077077 0.346866 0.144366 0.120178 1.000000 \n",
1696 | "x7 0.567626 0.211035 -0.546263 0.567626 0.314752 0.083330 0.649177 \n",
1697 | "y -0.000326 -0.020443 0.275935 -0.000326 -0.008192 0.544321 0.308940 \n",
1698 | "\n",
1699 | " x7 y \n",
1700 | "x0 0.567626 -0.000326 \n",
1701 | "x1 0.211035 -0.020443 \n",
1702 | "x2 -0.546263 0.275935 \n",
1703 | "x3 0.567626 -0.000326 \n",
1704 | "x4 0.314752 -0.008192 \n",
1705 | "x5 0.083330 0.544321 \n",
1706 | "x6 0.649177 0.308940 \n",
1707 | "x7 1.000000 0.071201 \n",
1708 | "y 0.071201 1.000000 "
1709 | ]
1710 | },
1711 | "execution_count": 89,
1712 | "metadata": {},
1713 | "output_type": "execute_result"
1714 | }
1715 | ],
1716 | "source": [
1717 | "df.corr()"
1718 | ]
1719 | },
1720 | {
1721 | "cell_type": "markdown",
1722 | "metadata": {},
1723 | "source": [
1724 | "### Start with removing perfect multicollinearity"
1725 | ]
1726 | },
1727 | {
1728 | "cell_type": "code",
1729 | "execution_count": 90,
1730 | "metadata": {},
1731 | "outputs": [
1732 | {
1733 | "data": {
1734 | "text/plain": [
1735 | "{'auc': 0.9194974891835068, 'pr-auc': 0.8982064967028441}"
1736 | ]
1737 | },
1738 | "execution_count": 90,
1739 | "metadata": {},
1740 | "output_type": "execute_result"
1741 | }
1742 | ],
1743 | "source": [
1744 | "numerical_features = ['x0', 'x1', 'x2', 'x4', 'x5', 'x6', 'x7'] # remove x3\n",
1745 | "\n",
1746 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n",
1747 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n",
1748 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]\n",
1749 | "\n",
1750 | "num = [([n], [SimpleImputer()]) for n in numerical_features]\n",
1751 | "mapper = DataFrameMapper(num, df_out=True)\n",
1752 | "\n",
1753 | "clf = LogisticRegression()\n",
1754 | "pipeline = Pipeline([\n",
1755 | " ('preprocess', mapper),\n",
1756 | " ('clf', clf)\n",
1757 | "])\n",
1758 | "\n",
1759 | "pipeline.fit(X_train, y_train)\n",
1760 | "evaluation(pipeline, X_test, y_test)"
1761 | ]
1762 | },
1763 | {
1764 | "cell_type": "code",
1765 | "execution_count": 91,
1766 | "metadata": {},
1767 | "outputs": [
1768 | {
1769 | "data": {
1770 | "text/html": [
1771 | "\n",
1772 | "OLS Regression Results\n",
1773 | "\n",
1774 | " Dep. Variable: | y | R-squared: | 0.483 | \n",
1775 | "
\n",
1776 | "\n",
1777 | " Model: | OLS | Adj. R-squared: | 0.483 | \n",
1778 | "
\n",
1779 | "\n",
1780 | " Method: | Least Squares | F-statistic: | 1345. | \n",
1781 | "
\n",
1782 | "\n",
1783 | " Date: | Sat, 10 Apr 2021 | Prob (F-statistic): | 0.00 | \n",
1784 | "
\n",
1785 | "\n",
1786 | " Time: | 14:37:17 | Log-Likelihood: | -3420.3 | \n",
1787 | "
\n",
1788 | "\n",
1789 | " No. Observations: | 8640 | AIC: | 6855. | \n",
1790 | "
\n",
1791 | "\n",
1792 | " Df Residuals: | 8633 | BIC: | 6904. | \n",
1793 | "
\n",
1794 | "\n",
1795 | " Df Model: | 6 | | | \n",
1796 | "
\n",
1797 | "\n",
1798 | " Covariance Type: | nonrobust | | | \n",
1799 | "
\n",
1800 | "
\n",
1801 | "\n",
1802 | "\n",
1803 | " | coef | std err | t | P>|t| | [0.025 | 0.975] | \n",
1804 | "
\n",
1805 | "\n",
1806 | " const | 0.5906 | 0.006 | 104.957 | 0.000 | 0.580 | 0.602 | \n",
1807 | "
\n",
1808 | "\n",
1809 | " x0 | -0.0172 | 0.001 | -20.533 | 0.000 | -0.019 | -0.016 | \n",
1810 | "
\n",
1811 | "\n",
1812 | " x1 | 0.0357 | 0.002 | 19.852 | 0.000 | 0.032 | 0.039 | \n",
1813 | "
\n",
1814 | "\n",
1815 | " x2 | 0.0438 | 0.001 | 43.527 | 0.000 | 0.042 | 0.046 | \n",
1816 | "
\n",
1817 | "\n",
1818 | " x4 | -0.0138 | 0.001 | -26.951 | 0.000 | -0.015 | -0.013 | \n",
1819 | "
\n",
1820 | "\n",
1821 | " x5 | 0.0127 | 0.000 | 47.049 | 0.000 | 0.012 | 0.013 | \n",
1822 | "
\n",
1823 | "\n",
1824 | " x6 | -0.0005 | 0.002 | -0.294 | 0.769 | -0.004 | 0.003 | \n",
1825 | "
\n",
1826 | "\n",
1827 | " x7 | 0.0260 | 0.001 | 28.111 | 0.000 | 0.024 | 0.028 | \n",
1828 | "
\n",
1829 | "
\n",
1830 | "\n",
1831 | "\n",
1832 | " Omnibus: | 341.439 | Durbin-Watson: | 2.027 | \n",
1833 | "
\n",
1834 | "\n",
1835 | " Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 353.022 | \n",
1836 | "
\n",
1837 | "\n",
1838 | " Skew: | -0.467 | Prob(JB): | 2.20e-77 | \n",
1839 | "
\n",
1840 | "\n",
1841 | " Kurtosis: | 2.670 | Cond. No. | 1.67e+16 | \n",
1842 | "
\n",
1843 | "
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 9e-27. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular."
1844 | ],
1845 | "text/plain": [
1846 | "\n",
1847 | "\"\"\"\n",
1848 | " OLS Regression Results \n",
1849 | "==============================================================================\n",
1850 | "Dep. Variable: y R-squared: 0.483\n",
1851 | "Model: OLS Adj. R-squared: 0.483\n",
1852 | "Method: Least Squares F-statistic: 1345.\n",
1853 | "Date: Sat, 10 Apr 2021 Prob (F-statistic): 0.00\n",
1854 | "Time: 14:37:17 Log-Likelihood: -3420.3\n",
1855 | "No. Observations: 8640 AIC: 6855.\n",
1856 | "Df Residuals: 8633 BIC: 6904.\n",
1857 | "Df Model: 6 \n",
1858 | "Covariance Type: nonrobust \n",
1859 | "==============================================================================\n",
1860 | " coef std err t P>|t| [0.025 0.975]\n",
1861 | "------------------------------------------------------------------------------\n",
1862 | "const 0.5906 0.006 104.957 0.000 0.580 0.602\n",
1863 | "x0 -0.0172 0.001 -20.533 0.000 -0.019 -0.016\n",
1864 | "x1 0.0357 0.002 19.852 0.000 0.032 0.039\n",
1865 | "x2 0.0438 0.001 43.527 0.000 0.042 0.046\n",
1866 | "x4 -0.0138 0.001 -26.951 0.000 -0.015 -0.013\n",
1867 | "x5 0.0127 0.000 47.049 0.000 0.012 0.013\n",
1868 | "x6 -0.0005 0.002 -0.294 0.769 -0.004 0.003\n",
1869 | "x7 0.0260 0.001 28.111 0.000 0.024 0.028\n",
1870 | "==============================================================================\n",
1871 | "Omnibus: 341.439 Durbin-Watson: 2.027\n",
1872 | "Prob(Omnibus): 0.000 Jarque-Bera (JB): 353.022\n",
1873 | "Skew: -0.467 Prob(JB): 2.20e-77\n",
1874 | "Kurtosis: 2.670 Cond. No. 1.67e+16\n",
1875 | "==============================================================================\n",
1876 | "\n",
1877 | "Notes:\n",
1878 | "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
1879 | "[2] The smallest eigenvalue is 9e-27. This might indicate that there are\n",
1880 | "strong multicollinearity problems or that the design matrix is singular.\n",
1881 | "\"\"\""
1882 | ]
1883 | },
1884 | "execution_count": 91,
1885 | "metadata": {},
1886 | "output_type": "execute_result"
1887 | }
1888 | ],
1889 | "source": [
1890 | "preprocessed_X_train = mapper.transform(X_train)\n",
1891 | "preprocessed_X_train = sm.add_constant(preprocessed_X_train)\n",
1892 | "results = sm.OLS(y_train, preprocessed_X_train).fit()\n",
1893 | "results.summary()"
1894 | ]
1895 | },
1896 | {
1897 | "cell_type": "code",
1898 | "execution_count": 88,
1899 | "metadata": {},
1900 | "outputs": [
1901 | {
1902 | "name": "stdout",
1903 | "output_type": "stream",
1904 | "text": [
1905 | "x0, inf\n",
1906 | "x1, inf\n",
1907 | "x2, inf\n",
1908 | "x4, inf\n",
1909 | "x5, inf\n",
1910 | "x6, inf\n",
1911 | "x7, inf\n"
1912 | ]
1913 | },
1914 | {
1915 | "name": "stderr",
1916 | "output_type": "stream",
1917 | "text": [
1918 | "/usr/local/lib/python3.7/site-packages/statsmodels/stats/outliers_influence.py:193: RuntimeWarning: divide by zero encountered in double_scalars\n",
1919 | " vif = 1. / (1. - r_squared_i)\n"
1920 | ]
1921 | }
1922 | ],
1923 | "source": [
1924 | "for column in numerical_features:\n",
1925 | " print(f\"\"\"{column}, {variance_inflation_factor(\n",
1926 | " preprocessed_X_train.values, \n",
1927 | " list(preprocessed_X_train.columns).index(column))}\"\"\")"
1928 | ]
1929 | },
1930 | {
1931 | "cell_type": "markdown",
1932 | "metadata": {},
1933 | "source": [
1934 | "Removing a feature with perfect multicollinearity:\n",
1935 | "- Improves interpretability of the coefficients (like `x0` here)\n",
1936 | "- Logistic Regression doesn't lose performance. "
1937 | ]
1938 | },
1939 | {
1940 | "cell_type": "markdown",
1941 | "metadata": {},
1942 | "source": [
1943 | "## Remove multicollinearity"
1944 | ]
1945 | },
1946 | {
1947 | "cell_type": "code",
1948 | "execution_count": 92,
1949 | "metadata": {},
1950 | "outputs": [
1951 | {
1952 | "data": {
1953 | "text/plain": [
1954 | "{'auc': 0.9194974891835068, 'pr-auc': 0.8982064967028441}"
1955 | ]
1956 | },
1957 | "execution_count": 92,
1958 | "metadata": {},
1959 | "output_type": "execute_result"
1960 | }
1961 | ],
1962 | "source": [
1963 | "numerical_features = ['x0', 'x1', 'x2', 'x4', 'x5', 'x7'] # remove x3, x6\n",
1964 | "\n",
1965 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n",
1966 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n",
1967 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]\n",
1968 | "\n",
1969 | "num = [([n], [SimpleImputer()]) for n in numerical_features]\n",
1970 | "mapper = DataFrameMapper(num, df_out=True)\n",
1971 | "\n",
1972 | "clf = LogisticRegression()\n",
1973 | "pipeline = Pipeline([\n",
1974 | " ('preprocess', mapper),\n",
1975 | " ('clf', clf)\n",
1976 | "])\n",
1977 | "\n",
1978 | "pipeline.fit(X_train, y_train)\n",
1979 | "evaluation(pipeline, X_test, y_test)"
1980 | ]
1981 | },
1982 | {
1983 | "cell_type": "code",
1984 | "execution_count": 93,
1985 | "metadata": {},
1986 | "outputs": [
1987 | {
1988 | "data": {
1989 | "text/html": [
1990 | "\n",
1991 | "OLS Regression Results\n",
1992 | "\n",
1993 | " Dep. Variable: | y | R-squared: | 0.483 | \n",
1994 | "
\n",
1995 | "\n",
1996 | " Model: | OLS | Adj. R-squared: | 0.483 | \n",
1997 | "
\n",
1998 | "\n",
1999 | " Method: | Least Squares | F-statistic: | 1345. | \n",
2000 | "
\n",
2001 | "\n",
2002 | " Date: | Sat, 10 Apr 2021 | Prob (F-statistic): | 0.00 | \n",
2003 | "
\n",
2004 | "\n",
2005 | " Time: | 14:38:52 | Log-Likelihood: | -3420.3 | \n",
2006 | "
\n",
2007 | "\n",
2008 | " No. Observations: | 8640 | AIC: | 6855. | \n",
2009 | "
\n",
2010 | "\n",
2011 | " Df Residuals: | 8633 | BIC: | 6904. | \n",
2012 | "
\n",
2013 | "\n",
2014 | " Df Model: | 6 | | | \n",
2015 | "
\n",
2016 | "\n",
2017 | " Covariance Type: | nonrobust | | | \n",
2018 | "
\n",
2019 | "
\n",
2020 | "\n",
2021 | "\n",
2022 | " | coef | std err | t | P>|t| | [0.025 | 0.975] | \n",
2023 | "
\n",
2024 | "\n",
2025 | " const | 0.5906 | 0.006 | 104.957 | 0.000 | 0.580 | 0.602 | \n",
2026 | "
\n",
2027 | "\n",
2028 | " x0 | -0.0169 | 0.001 | -17.744 | 0.000 | -0.019 | -0.015 | \n",
2029 | "
\n",
2030 | "\n",
2031 | " x1 | 0.0366 | 0.005 | 7.684 | 0.000 | 0.027 | 0.046 | \n",
2032 | "
\n",
2033 | "\n",
2034 | " x2 | 0.0434 | 0.001 | 40.460 | 0.000 | 0.041 | 0.045 | \n",
2035 | "
\n",
2036 | "\n",
2037 | " x4 | -0.0137 | 0.001 | -25.047 | 0.000 | -0.015 | -0.013 | \n",
2038 | "
\n",
2039 | "\n",
2040 | " x5 | 0.0128 | 0.000 | 33.306 | 0.000 | 0.012 | 0.014 | \n",
2041 | "
\n",
2042 | "\n",
2043 | " x7 | 0.0256 | 0.001 | 27.862 | 0.000 | 0.024 | 0.027 | \n",
2044 | "
\n",
2045 | "
\n",
2046 | "\n",
2047 | "\n",
2048 | " Omnibus: | 341.439 | Durbin-Watson: | 2.027 | \n",
2049 | "
\n",
2050 | "\n",
2051 | " Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 353.022 | \n",
2052 | "
\n",
2053 | "\n",
2054 | " Skew: | -0.467 | Prob(JB): | 2.20e-77 | \n",
2055 | "
\n",
2056 | "\n",
2057 | " Kurtosis: | 2.670 | Cond. No. | 30.1 | \n",
2058 | "
\n",
2059 | "
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified."
2060 | ],
2061 | "text/plain": [
2062 | "\n",
2063 | "\"\"\"\n",
2064 | " OLS Regression Results \n",
2065 | "==============================================================================\n",
2066 | "Dep. Variable: y R-squared: 0.483\n",
2067 | "Model: OLS Adj. R-squared: 0.483\n",
2068 | "Method: Least Squares F-statistic: 1345.\n",
2069 | "Date: Sat, 10 Apr 2021 Prob (F-statistic): 0.00\n",
2070 | "Time: 14:38:52 Log-Likelihood: -3420.3\n",
2071 | "No. Observations: 8640 AIC: 6855.\n",
2072 | "Df Residuals: 8633 BIC: 6904.\n",
2073 | "Df Model: 6 \n",
2074 | "Covariance Type: nonrobust \n",
2075 | "==============================================================================\n",
2076 | " coef std err t P>|t| [0.025 0.975]\n",
2077 | "------------------------------------------------------------------------------\n",
2078 | "const 0.5906 0.006 104.957 0.000 0.580 0.602\n",
2079 | "x0 -0.0169 0.001 -17.744 0.000 -0.019 -0.015\n",
2080 | "x1 0.0366 0.005 7.684 0.000 0.027 0.046\n",
2081 | "x2 0.0434 0.001 40.460 0.000 0.041 0.045\n",
2082 | "x4 -0.0137 0.001 -25.047 0.000 -0.015 -0.013\n",
2083 | "x5 0.0128 0.000 33.306 0.000 0.012 0.014\n",
2084 | "x7 0.0256 0.001 27.862 0.000 0.024 0.027\n",
2085 | "==============================================================================\n",
2086 | "Omnibus: 341.439 Durbin-Watson: 2.027\n",
2087 | "Prob(Omnibus): 0.000 Jarque-Bera (JB): 353.022\n",
2088 | "Skew: -0.467 Prob(JB): 2.20e-77\n",
2089 | "Kurtosis: 2.670 Cond. No. 30.1\n",
2090 | "==============================================================================\n",
2091 | "\n",
2092 | "Notes:\n",
2093 | "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
2094 | "\"\"\""
2095 | ]
2096 | },
2097 | "execution_count": 93,
2098 | "metadata": {},
2099 | "output_type": "execute_result"
2100 | }
2101 | ],
2102 | "source": [
2103 | "preprocessed_X_train = mapper.transform(X_train)\n",
2104 | "preprocessed_X_train = sm.add_constant(preprocessed_X_train)\n",
2105 | "results = sm.OLS(y_train, preprocessed_X_train).fit()\n",
2106 | "results.summary()"
2107 | ]
2108 | },
2109 | {
2110 | "cell_type": "code",
2111 | "execution_count": 94,
2112 | "metadata": {},
2113 | "outputs": [
2114 | {
2115 | "name": "stdout",
2116 | "output_type": "stream",
2117 | "text": [
2118 | "x0, 4.358204798860465\n",
2119 | "x1, 1.8526871839909662\n",
2120 | "x2, 5.622338237184614\n",
2121 | "x4, 4.123960180952725\n",
2122 | "x5, 2.6095687697415917\n",
2123 | "x7, 10.922197872534808\n"
2124 | ]
2125 | }
2126 | ],
2127 | "source": [
2128 | "for column in numerical_features:\n",
2129 | " print(f\"\"\"{column}, {variance_inflation_factor(\n",
2130 | " preprocessed_X_train.values, \n",
2131 | " list(preprocessed_X_train.columns).index(column))}\"\"\")"
2132 | ]
2133 | },
2134 | {
2135 | "cell_type": "markdown",
2136 | "metadata": {},
2137 | "source": [
2138 |     "Removing `x6`, we lost neither explainability nor performance."
2139 | ]
2140 | },
2141 | {
2142 | "cell_type": "markdown",
2143 | "metadata": {},
2144 | "source": [
2145 |     "#### Remove x7 with high VIF"
2146 | ]
2147 | },
2148 | {
2149 | "cell_type": "code",
2150 | "execution_count": 95,
2151 | "metadata": {},
2152 | "outputs": [
2153 | {
2154 | "data": {
2155 | "text/plain": [
2156 | "{'auc': 0.8916873729387849, 'pr-auc': 0.858019953399781}"
2157 | ]
2158 | },
2159 | "execution_count": 95,
2160 | "metadata": {},
2161 | "output_type": "execute_result"
2162 | }
2163 | ],
2164 | "source": [
2165 | "numerical_features = ['x0', 'x1', 'x2', 'x4', 'x5'] # remove x3, x6, x7\n",
2166 | "\n",
2167 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n",
2168 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n",
2169 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]\n",
2170 | "\n",
2171 | "num = [([n], [SimpleImputer()]) for n in numerical_features]\n",
2172 | "mapper = DataFrameMapper(num, df_out=True)\n",
2173 | "\n",
2174 | "clf = LogisticRegression()\n",
2175 | "pipeline = Pipeline([\n",
2176 | " ('preprocess', mapper),\n",
2177 | " ('clf', clf)\n",
2178 | "])\n",
2179 | "\n",
2180 | "pipeline.fit(X_train, y_train)\n",
2181 | "evaluation(pipeline, X_test, y_test)"
2182 | ]
2183 | },
2184 | {
2185 | "cell_type": "code",
2186 | "execution_count": 96,
2187 | "metadata": {},
2188 | "outputs": [
2189 | {
2190 | "data": {
2191 | "text/html": [
2192 | "\n",
2193 | "OLS Regression Results\n",
2194 | "\n",
2195 | " Dep. Variable: | y | R-squared: | 0.437 | \n",
2196 | "
\n",
2197 | "\n",
2198 | " Model: | OLS | Adj. R-squared: | 0.436 | \n",
2199 | "
\n",
2200 | "\n",
2201 | " Method: | Least Squares | F-statistic: | 1338. | \n",
2202 | "
\n",
2203 | "\n",
2204 | " Date: | Sat, 10 Apr 2021 | Prob (F-statistic): | 0.00 | \n",
2205 | "
\n",
2206 | "\n",
2207 | " Time: | 14:40:42 | Log-Likelihood: | -3792.2 | \n",
2208 | "
\n",
2209 | "\n",
2210 | " No. Observations: | 8640 | AIC: | 7596. | \n",
2211 | "
\n",
2212 | "\n",
2213 | " Df Residuals: | 8634 | BIC: | 7639. | \n",
2214 | "
\n",
2215 | "\n",
2216 | " Df Model: | 5 | | | \n",
2217 | "
\n",
2218 | "\n",
2219 | " Covariance Type: | nonrobust | | | \n",
2220 | "
\n",
2221 | "
\n",
2222 | "\n",
2223 | "\n",
2224 | " | coef | std err | t | P>|t| | [0.025 | 0.975] | \n",
2225 | "
\n",
2226 | "\n",
2227 | " const | 0.6395 | 0.006 | 114.609 | 0.000 | 0.629 | 0.650 | \n",
2228 | "
\n",
2229 | "\n",
2230 | " x0 | 0.0063 | 0.000 | 12.833 | 0.000 | 0.005 | 0.007 | \n",
2231 | "
\n",
2232 | "\n",
2233 | " x1 | 0.1125 | 0.004 | 27.526 | 0.000 | 0.104 | 0.121 | \n",
2234 | "
\n",
2235 | "\n",
2236 | " x2 | 0.0166 | 0.000 | 33.605 | 0.000 | 0.016 | 0.018 | \n",
2237 | "
\n",
2238 | "\n",
2239 | " x4 | -0.0007 | 0.000 | -2.314 | 0.021 | -0.001 | -0.000 | \n",
2240 | "
\n",
2241 | "\n",
2242 | " x5 | 0.0204 | 0.000 | 73.414 | 0.000 | 0.020 | 0.021 | \n",
2243 | "
\n",
2244 | "
\n",
2245 | "\n",
2246 | "\n",
2247 | " Omnibus: | 381.662 | Durbin-Watson: | 2.031 | \n",
2248 | "
\n",
2249 | "\n",
2250 | " Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 330.564 | \n",
2251 | "
\n",
2252 | "\n",
2253 | " Skew: | -0.412 | Prob(JB): | 1.66e-72 | \n",
2254 | "
\n",
2255 | "\n",
2256 | " Kurtosis: | 2.511 | Cond. No. | 26.7 | \n",
2257 | "
\n",
2258 | "
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified."
2259 | ],
2260 | "text/plain": [
2261 | "\n",
2262 | "\"\"\"\n",
2263 | " OLS Regression Results \n",
2264 | "==============================================================================\n",
2265 | "Dep. Variable: y R-squared: 0.437\n",
2266 | "Model: OLS Adj. R-squared: 0.436\n",
2267 | "Method: Least Squares F-statistic: 1338.\n",
2268 | "Date: Sat, 10 Apr 2021 Prob (F-statistic): 0.00\n",
2269 | "Time: 14:40:42 Log-Likelihood: -3792.2\n",
2270 | "No. Observations: 8640 AIC: 7596.\n",
2271 | "Df Residuals: 8634 BIC: 7639.\n",
2272 | "Df Model: 5 \n",
2273 | "Covariance Type: nonrobust \n",
2274 | "==============================================================================\n",
2275 | " coef std err t P>|t| [0.025 0.975]\n",
2276 | "------------------------------------------------------------------------------\n",
2277 | "const 0.6395 0.006 114.609 0.000 0.629 0.650\n",
2278 | "x0 0.0063 0.000 12.833 0.000 0.005 0.007\n",
2279 | "x1 0.1125 0.004 27.526 0.000 0.104 0.121\n",
2280 | "x2 0.0166 0.000 33.605 0.000 0.016 0.018\n",
2281 | "x4 -0.0007 0.000 -2.314 0.021 -0.001 -0.000\n",
2282 | "x5 0.0204 0.000 73.414 0.000 0.020 0.021\n",
2283 | "==============================================================================\n",
2284 | "Omnibus: 381.662 Durbin-Watson: 2.031\n",
2285 | "Prob(Omnibus): 0.000 Jarque-Bera (JB): 330.564\n",
2286 | "Skew: -0.412 Prob(JB): 1.66e-72\n",
2287 | "Kurtosis: 2.511 Cond. No. 26.7\n",
2288 | "==============================================================================\n",
2289 | "\n",
2290 | "Notes:\n",
2291 | "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
2292 | "\"\"\""
2293 | ]
2294 | },
2295 | "execution_count": 96,
2296 | "metadata": {},
2297 | "output_type": "execute_result"
2298 | }
2299 | ],
2300 | "source": [
2301 | "preprocessed_X_train = mapper.transform(X_train)\n",
2302 | "preprocessed_X_train = sm.add_constant(preprocessed_X_train)\n",
2303 | "results = sm.OLS(y_train, preprocessed_X_train).fit()\n",
2304 | "results.summary()"
2305 | ]
2306 | },
2307 | {
2308 | "cell_type": "code",
2309 | "execution_count": 97,
2310 | "metadata": {},
2311 | "outputs": [
2312 | {
2313 | "name": "stdout",
2314 | "output_type": "stream",
2315 | "text": [
2316 | "x0, 1.0434492528061576\n",
2317 | "x1, 1.2487373171729157\n",
2318 | "x2, 1.089610333638892\n",
2319 | "x4, 1.1174255753042328\n",
2320 | "x5, 1.2630916367080673\n"
2321 | ]
2322 | }
2323 | ],
2324 | "source": [
2325 | "for column in numerical_features:\n",
2326 | " print(f\"\"\"{column}, {variance_inflation_factor(\n",
2327 | " preprocessed_X_train.values, \n",
2328 | " list(preprocessed_X_train.columns).index(column))}\"\"\")"
2329 | ]
2330 | },
2331 | {
2332 | "cell_type": "markdown",
2333 | "metadata": {},
2334 | "source": [
2335 | "Removing `x7`:\n",
2336 | "- Helped explainability \n",
2337 | "- Negatively impacted performance"
2338 | ]
2339 | },
2340 | {
2341 | "cell_type": "markdown",
2342 | "metadata": {},
2343 | "source": [
2344 |     "Remedy: add polynomial terms, or try other models that capture more complex interactions."
2345 | ]
2346 | },
2347 | {
2348 | "cell_type": "markdown",
2349 | "metadata": {},
2350 | "source": [
2351 | "## 1.5 Missing Values"
2352 | ]
2353 | },
2354 | {
2355 | "cell_type": "code",
2356 | "execution_count": 98,
2357 | "metadata": {},
2358 | "outputs": [
2359 | {
2360 | "name": "stdout",
2361 | "output_type": "stream",
2362 | "text": [
2363 | "--------------------------------------------------------------------------------\n",
2364 | "Warning: n_clusters_per_class not in configuration, defaulting to 2\n",
2365 | "Warning: effective_rank not in configuration, defaulting to None\n",
2366 | "Warning: tail_strength not in configuration, defaulting to 0.5\n",
2367 | "Warning: noise not in configuration, defaulting to 0.0\n",
2368 | "Warning: shuffle not in configuration, defaulting to True\n",
2369 | "Creating Classification Dataset...\n",
2370 | "Creating Categorical Features...\n",
2371 | "Warning: insert_dollar not in configuration, defaulting to 'No'\n",
2372 | "Warning: insert_percent not in configuration, defaulting to 'No'\n",
2373 | "Warning: star_schema not in configuration, defaulting to 'No'\n",
2374 | "Writing Train/Test Datasets\n"
2375 | ]
2376 | }
2377 | ],
2378 | "source": [
2379 | "df, label, categorical_features, numerical_features = get_data(missing_values=True)"
2380 | ]
2381 | },
2382 | {
2383 | "cell_type": "code",
2384 | "execution_count": 99,
2385 | "metadata": {},
2386 | "outputs": [
2387 | {
2388 | "ename": "ValueError",
2389 | "evalue": "Input contains NaN, infinity or a value too large for dtype('float64').",
2390 | "output_type": "error",
2391 | "traceback": [
2392 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
2393 | "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
2394 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 12\u001b[0m ])\n\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 14\u001b[0;31m \u001b[0mpipeline\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnumerical_features\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 15\u001b[0m \u001b[0mevaluation\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpipeline\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnumerical_features\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
2395 | "\u001b[0;32m/usr/local/lib/python3.7/site-packages/sklearn/pipeline.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[1;32m 344\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_final_estimator\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;34m'passthrough'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 345\u001b[0m \u001b[0mfit_params_last_step\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfit_params_steps\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msteps\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 346\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_final_estimator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mXt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params_last_step\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 347\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 348\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
2396 | "\u001b[0;32m/usr/local/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 1344\u001b[0m X, y = self._validate_data(X, y, accept_sparse='csr', dtype=_dtype,\n\u001b[1;32m 1345\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"C\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1346\u001b[0;31m accept_large_sparse=solver != 'liblinear')\n\u001b[0m\u001b[1;32m 1347\u001b[0m \u001b[0mcheck_classification_targets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1348\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclasses_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munique\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
2397 | "\u001b[0;32m/usr/local/lib/python3.7/site-packages/sklearn/base.py\u001b[0m in \u001b[0;36m_validate_data\u001b[0;34m(self, X, y, reset, validate_separately, **check_params)\u001b[0m\n\u001b[1;32m 431\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcheck_y_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 432\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 433\u001b[0;31m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_X_y\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcheck_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 434\u001b[0m \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 435\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
2398 | "\u001b[0;32m/usr/local/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36minner_f\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mextra_args\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mall_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mextra_args\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0;31m# extra_args > 0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
2399 | "\u001b[0;32m/usr/local/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_X_y\u001b[0;34m(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)\u001b[0m\n\u001b[1;32m 819\u001b[0m \u001b[0mensure_min_samples\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mensure_min_samples\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 820\u001b[0m \u001b[0mensure_min_features\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mensure_min_features\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 821\u001b[0;31m estimator=estimator)\n\u001b[0m\u001b[1;32m 822\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmulti_output\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 823\u001b[0m y = check_array(y, accept_sparse='csr', force_all_finite=True,\n",
2400 | "\u001b[0;32m/usr/local/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36minner_f\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mextra_args\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mall_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mextra_args\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0;31m# extra_args > 0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
2401 | "\u001b[0;32m/usr/local/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)\u001b[0m\n\u001b[1;32m 662\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mforce_all_finite\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 663\u001b[0m _assert_all_finite(array,\n\u001b[0;32m--> 664\u001b[0;31m allow_nan=force_all_finite == 'allow-nan')\n\u001b[0m\u001b[1;32m 665\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 666\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mensure_min_samples\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
2402 | "\u001b[0;32m/usr/local/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36m_assert_all_finite\u001b[0;34m(X, allow_nan, msg_dtype)\u001b[0m\n\u001b[1;32m 104\u001b[0m \u001b[0mmsg_err\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 105\u001b[0m (type_err,\n\u001b[0;32m--> 106\u001b[0;31m msg_dtype if msg_dtype is not None else X.dtype)\n\u001b[0m\u001b[1;32m 107\u001b[0m )\n\u001b[1;32m 108\u001b[0m \u001b[0;31m# for object dtype data, we only check for NaNs (GH-13254)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
2403 | "\u001b[0;31mValueError\u001b[0m: Input contains NaN, infinity or a value too large for dtype('float64')."
2404 | ]
2405 | }
2406 | ],
2407 | "source": [
2408 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n",
2409 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n",
2410 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]\n",
2411 | "\n",
2412 | "num = [([n], [SimpleImputer()]) for n in numerical_features]\n",
2413 | "mapper = DataFrameMapper(num, df_out=True)\n",
2414 | "\n",
2415 | "clf = LogisticRegression()\n",
2416 | "pipeline = Pipeline([\n",
2417 | " #('preprocess', mapper),\n",
2418 | " ('clf', clf)\n",
2419 | "])\n",
2420 | "\n",
2421 | "pipeline.fit(X_train[numerical_features], y_train)\n",
2422 | "evaluation(pipeline, X_test[numerical_features], y_test)"
2423 | ]
2424 | },
2425 | {
2426 | "cell_type": "code",
2427 | "execution_count": 100,
2428 | "metadata": {},
2429 | "outputs": [
2430 | {
2431 | "data": {
2432 | "text/plain": [
2433 | "{'auc': 0.7473034970984109, 'pr-auc': 0.676792150205654}"
2434 | ]
2435 | },
2436 | "execution_count": 100,
2437 | "metadata": {},
2438 | "output_type": "execute_result"
2439 | }
2440 | ],
2441 | "source": [
2442 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n",
2443 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n",
2444 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]\n",
2445 | "\n",
2446 | "num = [([n], [SimpleImputer()]) for n in numerical_features] # Impute values\n",
2447 | "mapper = DataFrameMapper(num, df_out=True)\n",
2448 | " \n",
2449 | "clf = LogisticRegression()\n",
2450 | "pipeline = Pipeline([\n",
2451 | " ('preprocess', mapper),\n",
2452 | " ('clf', clf)\n",
2453 | "])\n",
2454 | "\n",
2455 | "pipeline.fit(X_train[numerical_features], y_train)\n",
2456 | "evaluation(pipeline, X_test[numerical_features], y_test)"
2457 | ]
2458 | },
2459 | {
2460 | "cell_type": "markdown",
2461 | "metadata": {},
2462 | "source": [
2463 | "**Result**\n",
2464 |     "- Logistic Regression can't handle missing values. Best to impute with the mean."
2465 | ]
2466 | },
2467 | {
2468 | "cell_type": "markdown",
2469 | "metadata": {},
2470 | "source": [
2471 | "## Summary \n",
2472 | "\n",
2473 | "Let's see how Logistic Regression acts with 5 techniques:\n",
2474 | "1. **Standardization of Numerical Variables**\n",
2475 | " - Performance doesn't necessarily improve. But convergence is faster during training\n",
2476 | "2. **Encoding of Categorical Variables**\n",
2477 | " - We can use ordinal encoding if the categories are related (size). Otherwise, use one hot encoding\n",
2478 | "3. **Data Imbalance**\n",
2479 | " - Perform overweighting of the minor class and undersampling of the major class\n",
2480 |     "4. **Collinearity**\n",
2481 |     "    - Remove features which exhibit perfect multicollinearity\n",
2482 | " - try different modeling strategies to ensure the model is capturing non-linear interactions\n",
2483 | "5. **Missing Values**\n",
2484 | " - Impute with mean (or a constant value). This is problem specific"
2485 | ]
2486 | }
2487 | ],
2488 | "metadata": {
2489 | "kernelspec": {
2490 | "display_name": "Python 3",
2491 | "language": "python",
2492 | "name": "python3"
2493 | },
2494 | "language_info": {
2495 | "codemirror_mode": {
2496 | "name": "ipython",
2497 | "version": 3
2498 | },
2499 | "file_extension": ".py",
2500 | "mimetype": "text/x-python",
2501 | "name": "python",
2502 | "nbconvert_exporter": "python",
2503 | "pygments_lexer": "ipython3",
2504 | "version": "3.7.4"
2505 | }
2506 | },
2507 | "nbformat": 4,
2508 | "nbformat_minor": 4
2509 | }
2510 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # preprocessing-cheat-sheet
2 |
3 | Preprocessing cheat sheet for some machine learning algorithms. Starting with Logistic Regression for now. This might grow in the future
4 |
--------------------------------------------------------------------------------