├── LICENSE ├── Preprocessing Cheat - Logistic Regression.ipynb └── README.md /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Ajay Halthor 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Preprocessing Cheat - Logistic Regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 349, 6 | "metadata": { 7 | "scrolled": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "#!pip3 install statsmodels" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "Let's see how Logistic Regression acts with 5 techniques:\n", 19 | "1. Standardization of Numerical Variables\n", 20 | "2. 
Encoding of Categorical Variables\n", 21 | "3. Data Imbalance\n", 22 | "4. Collinearity\n", 23 | "5. Missing Values" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 1, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "import numpy as np\n", 33 | "import pandas as pd\n", 34 | "from sklearn.impute import SimpleImputer\n", 35 | "from sklearn.linear_model import LogisticRegression\n", 36 | "from sklearn.metrics import roc_auc_score, average_precision_score\n", 37 | "from sklearn.model_selection import train_test_split\n", 38 | "from sklearn.pipeline import Pipeline\n", 39 | "from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder\n", 40 | "from sklearn_pandas import DataFrameMapper\n", 41 | "from snape.make_dataset import make_dataset" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "Check out snape [here](https://github.com/mbernico/snape)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 71, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "def get_data(categorical_features=True,\n", 58 | " balanced=True, \n", 59 | " correlated_features=False, \n", 60 | " missing_values=False,\n", 61 | " dataset_size=12000):\n", 62 | " \n", 63 | " if categorical_features:\n", 64 | " label_list = []\n", 65 | " N_CATEGORICAL = 4\n", 66 | " for i in range(N_CATEGORICAL):\n", 67 | " num_classes = np.random.randint(2, 10)\n", 68 | " labels = list(np.arange(num_classes))\n", 69 | " labels = [f'str_{i}' for i in labels]\n", 70 | " label_list.append(labels)\n", 71 | " \n", 72 | " if correlated_features:\n", 73 | " N_REDUNDANT = 1\n", 74 | " N_REPEATED = 1\n", 75 | " N_INFORMATIVE = 8 - N_REDUNDANT - N_REPEATED\n", 76 | " \n", 77 | " conf = {\n", 78 | " \"type\": \"classification\",\n", 79 | " \"n_classes\": 2,\n", 80 | " \"n_samples\": dataset_size,\n", 81 | " \"n_features\": 8,\n", 82 | " \"out_path\": \"./\",\n", 83 | " \"output\": 
\"my_dataset\",\n", 84 | " \"n_informative\": N_INFORMATIVE if correlated_features else 8,\n", 85 | " \"n_repeated\": N_REPEATED if correlated_features else 0,\n", 86 | " \"n_redundant\": N_REDUNDANT if correlated_features else 0,\n", 87 | " \"n_clusters\": 2,\n", 88 | " \"weights\": [0.5, 0.5] if balanced else [0.9, 0.1],\n", 89 | " \"pct_missing\": 0.70 if missing_values else 0.00,\n", 90 | " \"n_categorical\": N_CATEGORICAL if categorical_features else 0,\n", 91 | " \"random_seed\":42,\n", 92 | " \"label_list\":label_list if categorical_features else []\n", 93 | " }\n", 94 | "\n", 95 | " make_dataset(config=conf)\n", 96 | " df = pd.read_csv('my_dataset_train.csv')\n", 97 | " \n", 98 | " label = 'y'\n", 99 | " categorical_features = [col for col in df.columns if (df[col].dtype==object) & (col != label)]\n", 100 | " numerical_features = [col for col in df.columns if (col not in categorical_features) & (col != label)]\n", 101 | " \n", 102 | " return df, label, categorical_features, numerical_features\n", 103 | "\n", 104 | "def evaluation(pipeline, X, y):\n", 105 | " y_predict_proba = pipeline.predict_proba(X)[:, 1]\n", 106 | " return{\n", 107 | " 'auc': roc_auc_score(y, y_predict_proba),\n", 108 | " 'pr-auc': average_precision_score(y, y_predict_proba)\n", 109 | " }" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "# Logistic Regression" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "## 1.1 Standardization" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "### Without Standardization" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 108, 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "name": "stdout", 140 | "output_type": "stream", 141 | "text": [ 142 | "--------------------------------------------------------------------------------\n", 143 | "Warning: n_repeated not 
in configuration, defaulting to 0\n", 144 | "Warning: n_clusters_per_class not in configuration, defaulting to 2\n", 145 | "Warning: effective_rank not in configuration, defaulting to None\n", 146 | "Warning: tail_strength not in configuration, defaulting to 0.5\n", 147 | "Warning: noise not in configuration, defaulting to 0.0\n", 148 | "Warning: shuffle not in configuration, defaulting to True\n", 149 | "Creating Classification Dataset...\n", 150 | "Warning: insert_dollar not in configuration, defaulting to 'No'\n", 151 | "Warning: insert_percent not in configuration, defaulting to 'No'\n", 152 | "Warning: star_schema not in configuration, defaulting to 'No'\n", 153 | "Writing Train/Test Datasets\n" 154 | ] 155 | }, 156 | { 157 | "data": { 158 | "text/html": [ 159 | "
\n", 160 | "\n", 173 | "\n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | "
x0x1x2x3x4x5x6x7
count96000.00000096000.00000096000.00000096000.00000096000.00000096000.00000096000.00000096000.000000
mean-1.6452282.2440241.0451372.714882-0.020220-0.004654-1.536356-4.197611
std6.2095788.6028771.7334659.09695411.9887774.7191402.81051915.886376
min-35.268886-34.012532-7.229214-35.163348-49.768093-20.945358-13.822828-78.725850
25%-5.758226-3.452518-0.106077-3.561230-8.276808-3.241806-3.394866-15.068164
50%-1.7097022.3935611.0516682.7894800.106544-0.308207-1.546129-3.386493
75%2.5155678.1069762.1924459.0222448.3572893.1153090.3021297.250865
max25.20696542.0721309.85149243.89685643.18907121.42667713.30572362.477698
\n", 278 | "
" 279 | ], 280 | "text/plain": [ 281 | " x0 x1 x2 x3 x4 \\\n", 282 | "count 96000.000000 96000.000000 96000.000000 96000.000000 96000.000000 \n", 283 | "mean -1.645228 2.244024 1.045137 2.714882 -0.020220 \n", 284 | "std 6.209578 8.602877 1.733465 9.096954 11.988777 \n", 285 | "min -35.268886 -34.012532 -7.229214 -35.163348 -49.768093 \n", 286 | "25% -5.758226 -3.452518 -0.106077 -3.561230 -8.276808 \n", 287 | "50% -1.709702 2.393561 1.051668 2.789480 0.106544 \n", 288 | "75% 2.515567 8.106976 2.192445 9.022244 8.357289 \n", 289 | "max 25.206965 42.072130 9.851492 43.896856 43.189071 \n", 290 | "\n", 291 | " x5 x6 x7 \n", 292 | "count 96000.000000 96000.000000 96000.000000 \n", 293 | "mean -0.004654 -1.536356 -4.197611 \n", 294 | "std 4.719140 2.810519 15.886376 \n", 295 | "min -20.945358 -13.822828 -78.725850 \n", 296 | "25% -3.241806 -3.394866 -15.068164 \n", 297 | "50% -0.308207 -1.546129 -3.386493 \n", 298 | "75% 3.115309 0.302129 7.250865 \n", 299 | "max 21.426677 13.305723 62.477698 " 300 | ] 301 | }, 302 | "execution_count": 108, 303 | "metadata": {}, 304 | "output_type": "execute_result" 305 | } 306 | ], 307 | "source": [ 308 | "df, label, categorical_features, numerical_features = get_data(categorical_features=False, dataset_size=120000)\n", 309 | "df[numerical_features].describe()" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 111, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n", 319 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n", 320 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 114, 326 | "metadata": {}, 327 | "outputs": [ 328 | { 329 | "name": "stdout", 330 | "output_type": "stream", 331 | "text": [ 332 | "[Pipeline] ............... 
(step 1 of 1) Processing clf, total= 0.3s\n" 333 | ] 334 | }, 335 | { 336 | "data": { 337 | "text/plain": [ 338 | "{'auc': 0.8148784308322949, 'pr-auc': 0.818032430163559}" 339 | ] 340 | }, 341 | "execution_count": 114, 342 | "metadata": {}, 343 | "output_type": "execute_result" 344 | } 345 | ], 346 | "source": [ 347 | "clf = LogisticRegression()\n", 348 | "pipeline = Pipeline([\n", 349 | " ('clf', clf)\n", 350 | "], verbose=True)\n", 351 | "\n", 352 | "pipeline.fit(X_train[numerical_features], y_train)\n", 353 | "evaluation(pipeline, X_test[numerical_features], y_test)" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "### With Standardization" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 115, 366 | "metadata": {}, 367 | "outputs": [ 368 | { 369 | "name": "stdout", 370 | "output_type": "stream", 371 | "text": [ 372 | "[Pipeline] ........ (step 1 of 2) Processing preprocess, total= 0.1s\n", 373 | "[Pipeline] ............... (step 2 of 2) Processing clf, total= 0.1s\n" 374 | ] 375 | }, 376 | { 377 | "data": { 378 | "text/plain": [ 379 | "{'auc': 0.8148798631692816, 'pr-auc': 0.8180303186841142}" 380 | ] 381 | }, 382 | "execution_count": 115, 383 | "metadata": {}, 384 | "output_type": "execute_result" 385 | } 386 | ], 387 | "source": [ 388 | "num = [([n], [StandardScaler()]) for n in numerical_features]\n", 389 | "mapper = DataFrameMapper(num, df_out=True)\n", 390 | "\n", 391 | "clf = LogisticRegression()\n", 392 | "pipeline = Pipeline([\n", 393 | " ('preprocess', mapper),\n", 394 | " ('clf', clf)\n", 395 | "], verbose=True)\n", 396 | "\n", 397 | "pipeline.fit(X_train[numerical_features], y_train)\n", 398 | "evaluation(pipeline, X_test[numerical_features], y_test)" 399 | ] 400 | }, 401 | { 402 | "cell_type": "markdown", 403 | "metadata": {}, 404 | "source": [ 405 | "**Result**\n", 406 | "- No need to scale for logistic regression accuracy. But convergence is faster. 
[More info here](https://stats.stackexchange.com/questions/48360/is-standardization-needed-before-fitting-logistic-regression#:~:text=3%20Answers&text=Standardization%20isn't%20required%20for,the%20technique%20used%20for%20optimization.&text=Otherwise%2C%20you%20can%20run%20your,standardization%20treatment%20on%20the%20features)" 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "metadata": {}, 412 | "source": [ 413 | "## 1.2 Encoding" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": {}, 419 | "source": [ 420 | "We need numeric encoding for logistic regression." 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 3, 426 | "metadata": {}, 427 | "outputs": [ 428 | { 429 | "name": "stdout", 430 | "output_type": "stream", 431 | "text": [ 432 | "--------------------------------------------------------------------------------\n", 433 | "Warning: n_repeated not in configuration, defaulting to 0\n", 434 | "Warning: n_clusters_per_class not in configuration, defaulting to 2\n", 435 | "Warning: effective_rank not in configuration, defaulting to None\n", 436 | "Warning: tail_strength not in configuration, defaulting to 0.5\n", 437 | "Warning: noise not in configuration, defaulting to 0.0\n", 438 | "Warning: shuffle not in configuration, defaulting to True\n", 439 | "Creating Classification Dataset...\n", 440 | "Creating Categorical Features...\n", 441 | "Warning: insert_dollar not in configuration, defaulting to 'No'\n", 442 | "Warning: insert_percent not in configuration, defaulting to 'No'\n", 443 | "Warning: star_schema not in configuration, defaulting to 'No'\n", 444 | "Writing Train/Test Datasets\n" 445 | ] 446 | } 447 | ], 448 | "source": [ 449 | "df, label, categorical_features, numerical_features = get_data()" 450 | ] 451 | }, 452 | { 453 | "cell_type": "markdown", 454 | "metadata": {}, 455 | "source": [ 456 | "### One Hot Encoding" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | 
"execution_count": 10, 462 | "metadata": {}, 463 | "outputs": [ 464 | { 465 | "name": "stdout", 466 | "output_type": "stream", 467 | "text": [ 468 | "[Pipeline] ........ (step 1 of 2) Processing preprocess, total= 0.1s\n", 469 | "[Pipeline] ............... (step 2 of 2) Processing clf, total= 0.9s\n" 470 | ] 471 | }, 472 | { 473 | "data": { 474 | "text/plain": [ 475 | "{'auc': 0.8304397645792462, 'pr-auc': 0.80297861579569}" 476 | ] 477 | }, 478 | "execution_count": 10, 479 | "metadata": {}, 480 | "output_type": "execute_result" 481 | } 482 | ], 483 | "source": [ 484 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n", 485 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n", 486 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]\n", 487 | "\n", 488 | "num = [([n], [SimpleImputer()]) for n in numerical_features]\n", 489 | "cat = [([c], [OneHotEncoder()]) for c in categorical_features]\n", 490 | "mapper = DataFrameMapper(cat + num, df_out=True)\n", 491 | "\n", 492 | "clf = LogisticRegression(max_iter=1000)\n", 493 | "pipeline = Pipeline([\n", 494 | " ('preprocess', mapper),\n", 495 | " ('clf', clf)\n", 496 | "], verbose=True)\n", 497 | "\n", 498 | "pipeline.fit(X_train, y_train)\n", 499 | "evaluation(pipeline, X_test, y_test)" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": 11, 505 | "metadata": { 506 | "scrolled": true 507 | }, 508 | "outputs": [ 509 | { 510 | "data": { 511 | "text/html": [ 512 | "
\n", 513 | "\n", 526 | "\n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " 
\n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | "
86408641864286438644
x1_x0_str_00.0000000.0000000.0000000.0000000.000000
x1_x0_str_10.0000000.0000000.0000000.0000000.000000
x1_x0_str_21.0000000.0000000.0000001.0000001.000000
x1_x0_str_30.0000001.0000001.0000000.0000000.000000
x1_x0_str_40.0000000.0000000.0000000.0000000.000000
x1_x0_str_50.0000000.0000000.0000000.0000000.000000
x3_x0_str_00.0000000.0000000.0000000.0000000.000000
x3_x0_str_10.0000000.0000001.0000000.0000000.000000
x3_x0_str_20.0000001.0000000.0000001.0000001.000000
x3_x0_str_31.0000000.0000000.0000000.0000000.000000
x3_x0_str_40.0000000.0000000.0000000.0000000.000000
x6_x0_str_00.0000000.0000000.0000000.0000000.000000
x6_x0_str_10.0000000.0000000.0000000.0000000.000000
x6_x0_str_20.0000000.0000000.0000001.0000000.000000
x6_x0_str_31.0000001.0000000.0000000.0000000.000000
x6_x0_str_40.0000000.0000001.0000000.0000001.000000
x6_x0_str_50.0000000.0000000.0000000.0000000.000000
x6_x0_str_60.0000000.0000000.0000000.0000000.000000
x6_x0_str_70.0000000.0000000.0000000.0000000.000000
x7_x0_str_00.0000000.0000000.0000000.0000000.000000
x7_x0_str_10.0000000.0000000.0000000.0000001.000000
x7_x0_str_20.0000000.0000000.0000000.0000000.000000
x7_x0_str_31.0000000.0000001.0000001.0000000.000000
x7_x0_str_40.0000000.0000000.0000000.0000000.000000
x7_x0_str_50.0000001.0000000.0000000.0000000.000000
x7_x0_str_60.0000000.0000000.0000000.0000000.000000
x0-10.703380-4.18998910.96545711.707606-10.140494
x2-12.941344-14.909158-21.448502-6.947473-36.795258
x4-6.4645337.3663113.887812-8.306792-7.842345
x5-0.328846-11.83362913.59260310.200299-3.358164
\n", 780 | "
" 781 | ], 782 | "text/plain": [ 783 | " 8640 8641 8642 8643 8644\n", 784 | "x1_x0_str_0 0.000000 0.000000 0.000000 0.000000 0.000000\n", 785 | "x1_x0_str_1 0.000000 0.000000 0.000000 0.000000 0.000000\n", 786 | "x1_x0_str_2 1.000000 0.000000 0.000000 1.000000 1.000000\n", 787 | "x1_x0_str_3 0.000000 1.000000 1.000000 0.000000 0.000000\n", 788 | "x1_x0_str_4 0.000000 0.000000 0.000000 0.000000 0.000000\n", 789 | "x1_x0_str_5 0.000000 0.000000 0.000000 0.000000 0.000000\n", 790 | "x3_x0_str_0 0.000000 0.000000 0.000000 0.000000 0.000000\n", 791 | "x3_x0_str_1 0.000000 0.000000 1.000000 0.000000 0.000000\n", 792 | "x3_x0_str_2 0.000000 1.000000 0.000000 1.000000 1.000000\n", 793 | "x3_x0_str_3 1.000000 0.000000 0.000000 0.000000 0.000000\n", 794 | "x3_x0_str_4 0.000000 0.000000 0.000000 0.000000 0.000000\n", 795 | "x6_x0_str_0 0.000000 0.000000 0.000000 0.000000 0.000000\n", 796 | "x6_x0_str_1 0.000000 0.000000 0.000000 0.000000 0.000000\n", 797 | "x6_x0_str_2 0.000000 0.000000 0.000000 1.000000 0.000000\n", 798 | "x6_x0_str_3 1.000000 1.000000 0.000000 0.000000 0.000000\n", 799 | "x6_x0_str_4 0.000000 0.000000 1.000000 0.000000 1.000000\n", 800 | "x6_x0_str_5 0.000000 0.000000 0.000000 0.000000 0.000000\n", 801 | "x6_x0_str_6 0.000000 0.000000 0.000000 0.000000 0.000000\n", 802 | "x6_x0_str_7 0.000000 0.000000 0.000000 0.000000 0.000000\n", 803 | "x7_x0_str_0 0.000000 0.000000 0.000000 0.000000 0.000000\n", 804 | "x7_x0_str_1 0.000000 0.000000 0.000000 0.000000 1.000000\n", 805 | "x7_x0_str_2 0.000000 0.000000 0.000000 0.000000 0.000000\n", 806 | "x7_x0_str_3 1.000000 0.000000 1.000000 1.000000 0.000000\n", 807 | "x7_x0_str_4 0.000000 0.000000 0.000000 0.000000 0.000000\n", 808 | "x7_x0_str_5 0.000000 1.000000 0.000000 0.000000 0.000000\n", 809 | "x7_x0_str_6 0.000000 0.000000 0.000000 0.000000 0.000000\n", 810 | "x0 -10.703380 -4.189989 10.965457 11.707606 -10.140494\n", 811 | "x2 -12.941344 -14.909158 -21.448502 -6.947473 -36.795258\n", 812 | "x4 -6.464533 
7.366311 3.887812 -8.306792 -7.842345\n", 813 | "x5 -0.328846 -11.833629 13.592603 10.200299 -3.358164" 814 | ] 815 | }, 816 | "execution_count": 11, 817 | "metadata": {}, 818 | "output_type": "execute_result" 819 | } 820 | ], 821 | "source": [ 822 | "preprocessed_X_test = mapper.transform(X_test)\n", 823 | "preprocessed_X_test.head().T" 824 | ] 825 | }, 826 | { 827 | "cell_type": "markdown", 828 | "metadata": {}, 829 | "source": [ 830 | "### Ordinal Encoding" 831 | ] 832 | }, 833 | { 834 | "cell_type": "code", 835 | "execution_count": 13, 836 | "metadata": {}, 837 | "outputs": [ 838 | { 839 | "name": "stdout", 840 | "output_type": "stream", 841 | "text": [ 842 | "[Pipeline] ........ (step 1 of 2) Processing preprocess, total= 0.1s\n", 843 | "[Pipeline] ............... (step 2 of 2) Processing clf, total= 0.1s\n" 844 | ] 845 | }, 846 | { 847 | "data": { 848 | "text/plain": [ 849 | "{'auc': 0.8194499904512231, 'pr-auc': 0.7996358755932719}" 850 | ] 851 | }, 852 | "execution_count": 13, 853 | "metadata": {}, 854 | "output_type": "execute_result" 855 | } 856 | ], 857 | "source": [ 858 | "num = [([n], [SimpleImputer()]) for n in numerical_features]\n", 859 | "cat = [([c], [OrdinalEncoder()]) for c in categorical_features]\n", 860 | "mapper = DataFrameMapper(cat + num, df_out=True)\n", 861 | "\n", 862 | "clf = LogisticRegression()\n", 863 | "pipeline = Pipeline([\n", 864 | " ('preprocess', mapper),\n", 865 | " ('clf', clf)\n", 866 | "], verbose=True)\n", 867 | "\n", 868 | "pipeline.fit(X_train, y_train)\n", 869 | "evaluation(pipeline, X_test, y_test)" 870 | ] 871 | }, 872 | { 873 | "cell_type": "code", 874 | "execution_count": 145, 875 | "metadata": {}, 876 | "outputs": [ 877 | { 878 | "data": { 879 | "text/html": [ 880 | "
\n", 881 | "\n", 894 | "\n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | "
90009001900290039004
feat_51.0000003.0000006.0000000.0000005.000000
feat_62.0000003.0000000.0000001.0000000.000000
feat_70.0000002.0000006.0000004.0000001.000000
feat_84.0000007.0000007.0000000.0000005.000000
feat_1-0.0687680.4258991.9303541.157980-1.304169
feat_2-1.2228780.2936601.729959-0.7165381.169799
feat_3-0.7149061.509702-0.429593-0.708234-0.304866
feat_4-0.8236431.9978450.105752-0.9535790.690543
\n", 972 | "
" 973 | ], 974 | "text/plain": [ 975 | " 9000 9001 9002 9003 9004\n", 976 | "feat_5 1.000000 3.000000 6.000000 0.000000 5.000000\n", 977 | "feat_6 2.000000 3.000000 0.000000 1.000000 0.000000\n", 978 | "feat_7 0.000000 2.000000 6.000000 4.000000 1.000000\n", 979 | "feat_8 4.000000 7.000000 7.000000 0.000000 5.000000\n", 980 | "feat_1 -0.068768 0.425899 1.930354 1.157980 -1.304169\n", 981 | "feat_2 -1.222878 0.293660 1.729959 -0.716538 1.169799\n", 982 | "feat_3 -0.714906 1.509702 -0.429593 -0.708234 -0.304866\n", 983 | "feat_4 -0.823643 1.997845 0.105752 -0.953579 0.690543" 984 | ] 985 | }, 986 | "execution_count": 145, 987 | "metadata": {}, 988 | "output_type": "execute_result" 989 | } 990 | ], 991 | "source": [ 992 | "preprocessed_X_test = mapper.transform(X_test)\n", 993 | "preprocessed_X_test.head().T" 994 | ] 995 | }, 996 | { 997 | "cell_type": "markdown", 998 | "metadata": {}, 999 | "source": [ 1000 | "**Result**: \n", 1001 | "- `OrdinalEncoding` works when relationships exist between categorical variables (size, weather). Otherwise, prefer `OneHotEncoding`\n", 1002 | "- `OneHotEncoding` takes up space. Hence more training time" 1003 | ] 1004 | }, 1005 | { 1006 | "cell_type": "markdown", 1007 | "metadata": {}, 1008 | "source": [ 1009 | "## 1.3 Data Imbalance" 1010 | ] 1011 | }, 1012 | { 1013 | "cell_type": "markdown", 1014 | "metadata": {}, 1015 | "source": [ 1016 | "What happens if the training data isn't balanced?" 
1017 | ] 1018 | }, 1019 | { 1020 | "cell_type": "markdown", 1021 | "metadata": {}, 1022 | "source": [ 1023 | "### Unbalanced" 1024 | ] 1025 | }, 1026 | { 1027 | "cell_type": "code", 1028 | "execution_count": 102, 1029 | "metadata": {}, 1030 | "outputs": [ 1031 | { 1032 | "name": "stdout", 1033 | "output_type": "stream", 1034 | "text": [ 1035 | "--------------------------------------------------------------------------------\n", 1036 | "Warning: n_clusters_per_class not in configuration, defaulting to 2\n", 1037 | "Warning: effective_rank not in configuration, defaulting to None\n", 1038 | "Warning: tail_strength not in configuration, defaulting to 0.5\n", 1039 | "Warning: noise not in configuration, defaulting to 0.0\n", 1040 | "Warning: shuffle not in configuration, defaulting to True\n", 1041 | "Creating Classification Dataset...\n", 1042 | "Creating Categorical Features...\n", 1043 | "Warning: insert_dollar not in configuration, defaulting to 'No'\n", 1044 | "Warning: insert_percent not in configuration, defaulting to 'No'\n", 1045 | "Warning: star_schema not in configuration, defaulting to 'No'\n", 1046 | "Writing Train/Test Datasets\n" 1047 | ] 1048 | } 1049 | ], 1050 | "source": [ 1051 | "df, label, categorical_features, numerical_features = get_data(balanced=False)\n", 1052 | "\n", 1053 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n", 1054 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n", 1055 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]" 1056 | ] 1057 | }, 1058 | { 1059 | "cell_type": "code", 1060 | "execution_count": 103, 1061 | "metadata": {}, 1062 | "outputs": [ 1063 | { 1064 | "data": { 1065 | "text/plain": [ 1066 | "0 8599\n", 1067 | "1 1001\n", 1068 | "Name: y, dtype: int64" 1069 | ] 1070 | }, 1071 | "execution_count": 103, 1072 | "metadata": {}, 1073 | "output_type": "execute_result" 1074 | } 1075 | ], 1076 | "source": [ 1077 | 
"df[label].value_counts()" 1078 | ] 1079 | }, 1080 | { 1081 | "cell_type": "code", 1082 | "execution_count": 104, 1083 | "metadata": {}, 1084 | "outputs": [ 1085 | { 1086 | "data": { 1087 | "text/plain": [ 1088 | "{'auc': 0.7869518716577542, 'pr-auc': 0.39239809756882393}" 1089 | ] 1090 | }, 1091 | "execution_count": 104, 1092 | "metadata": {}, 1093 | "output_type": "execute_result" 1094 | } 1095 | ], 1096 | "source": [ 1097 | "num = [([n], [SimpleImputer()]) for n in numerical_features]\n", 1098 | "cat = [([c], [OrdinalEncoder()]) for c in categorical_features]\n", 1099 | "mapper = DataFrameMapper(cat + num, df_out=True)\n", 1100 | "\n", 1101 | "clf = LogisticRegression()\n", 1102 | "pipeline = Pipeline([\n", 1103 | " ('preprocess', mapper),\n", 1104 | " ('clf', clf)\n", 1105 | "])\n", 1106 | "\n", 1107 | "pipeline.fit(X_train, y_train)\n", 1108 | "evaluation(pipeline, X_test, y_test)" 1109 | ] 1110 | }, 1111 | { 1112 | "cell_type": "code", 1113 | "execution_count": 105, 1114 | "metadata": {}, 1115 | "outputs": [], 1116 | "source": [ 1117 | "y_predict_proba = pipeline.predict_proba(X_test)[:, 1]" 1118 | ] 1119 | }, 1120 | { 1121 | "cell_type": "code", 1122 | "execution_count": 106, 1123 | "metadata": {}, 1124 | "outputs": [ 1125 | { 1126 | "data": { 1127 | "text/plain": [ 1128 | "0.10815533327119523" 1129 | ] 1130 | }, 1131 | "execution_count": 106, 1132 | "metadata": {}, 1133 | "output_type": "execute_result" 1134 | } 1135 | ], 1136 | "source": [ 1137 | "y_predict_proba.mean()" 1138 | ] 1139 | }, 1140 | { 1141 | "cell_type": "markdown", 1142 | "metadata": {}, 1143 | "source": [ 1144 | "## Balanced" 1145 | ] 1146 | }, 1147 | { 1148 | "cell_type": "code", 1149 | "execution_count": 35, 1150 | "metadata": {}, 1151 | "outputs": [ 1152 | { 1153 | "name": "stdout", 1154 | "output_type": "stream", 1155 | "text": [ 1156 | "--------------------------------------------------------------------------------\n", 1157 | "Warning: n_repeated not in configuration, defaulting to 
0\n", 1158 | "Warning: n_clusters_per_class not in configuration, defaulting to 2\n", 1159 | "Warning: effective_rank not in configuration, defaulting to None\n", 1160 | "Warning: tail_strength not in configuration, defaulting to 0.5\n", 1161 | "Warning: noise not in configuration, defaulting to 0.0\n", 1162 | "Warning: shuffle not in configuration, defaulting to True\n", 1163 | "Creating Classification Dataset...\n", 1164 | "Creating Categorical Features...\n", 1165 | "Warning: insert_dollar not in configuration, defaulting to 'No'\n", 1166 | "Warning: insert_percent not in configuration, defaulting to 'No'\n", 1167 | "Warning: star_schema not in configuration, defaulting to 'No'\n", 1168 | "Writing Train/Test Datasets\n" 1169 | ] 1170 | }, 1171 | { 1172 | "data": { 1173 | "text/plain": [ 1174 | "{'auc': 0.7949023220244715, 'pr-auc': 0.7742073929744453}" 1175 | ] 1176 | }, 1177 | "execution_count": 35, 1178 | "metadata": {}, 1179 | "output_type": "execute_result" 1180 | } 1181 | ], 1182 | "source": [ 1183 | "df, label, categorical_features, numerical_features = get_data(balanced=True)\n", 1184 | "\n", 1185 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n", 1186 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n", 1187 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]\n", 1188 | "\n", 1189 | "num = [([n], [SimpleImputer()]) for n in numerical_features]\n", 1190 | "cat = [([c], [OrdinalEncoder()]) for c in categorical_features]\n", 1191 | "mapper = DataFrameMapper(cat + num, df_out=True)\n", 1192 | "\n", 1193 | "clf = LogisticRegression()\n", 1194 | "pipeline = Pipeline([\n", 1195 | " ('preprocess', mapper),\n", 1196 | " ('clf', clf)\n", 1197 | "])\n", 1198 | "\n", 1199 | "pipeline.fit(X_train, y_train)\n", 1200 | "evaluation(pipeline, X_test, y_test)" 1201 | ] 1202 | }, 1203 | { 1204 | "cell_type": "code", 1205 | "execution_count": 36, 1206 | "metadata": {}, 
1207 | "outputs": [ 1208 | { 1209 | "data": { 1210 | "text/plain": [ 1211 | "0.4994547544271453" 1212 | ] 1213 | }, 1214 | "execution_count": 36, 1215 | "metadata": {}, 1216 | "output_type": "execute_result" 1217 | } 1218 | ], 1219 | "source": [ 1220 | "y_predict_proba = pipeline.predict_proba(X_test)[:, 1]\n", 1221 | "y_predict_proba.mean()" 1222 | ] 1223 | }, 1224 | { 1225 | "cell_type": "markdown", 1226 | "metadata": {}, 1227 | "source": [ 1228 | "## Dealing with unbalanced data by over weighting" 1229 | ] 1230 | }, 1231 | { 1232 | "cell_type": "code", 1233 | "execution_count": 101, 1234 | "metadata": {}, 1235 | "outputs": [ 1236 | { 1237 | "name": "stdout", 1238 | "output_type": "stream", 1239 | "text": [ 1240 | "--------------------------------------------------------------------------------\n", 1241 | "Warning: n_clusters_per_class not in configuration, defaulting to 2\n", 1242 | "Warning: effective_rank not in configuration, defaulting to None\n", 1243 | "Warning: tail_strength not in configuration, defaulting to 0.5\n", 1244 | "Warning: noise not in configuration, defaulting to 0.0\n", 1245 | "Warning: shuffle not in configuration, defaulting to True\n", 1246 | "Creating Classification Dataset...\n", 1247 | "Creating Categorical Features...\n", 1248 | "Warning: insert_dollar not in configuration, defaulting to 'No'\n", 1249 | "Warning: insert_percent not in configuration, defaulting to 'No'\n", 1250 | "Warning: star_schema not in configuration, defaulting to 'No'\n", 1251 | "Writing Train/Test Datasets\n" 1252 | ] 1253 | }, 1254 | { 1255 | "data": { 1256 | "text/plain": [ 1257 | "{'auc': 0.8113720373994346, 'pr-auc': 0.30360454333181025}" 1258 | ] 1259 | }, 1260 | "execution_count": 101, 1261 | "metadata": {}, 1262 | "output_type": "execute_result" 1263 | } 1264 | ], 1265 | "source": [ 1266 | "df, label, categorical_features, numerical_features = get_data(balanced=False)\n", 1267 | "\n", 1268 | "train_df, test_df = train_test_split(df, test_size=0.1, 
shuffle=False)\n", 1269 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n", 1270 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]\n", 1271 | "\n", 1272 | "num = [([n], [SimpleImputer()]) for n in numerical_features]\n", 1273 | "cat = [([c], [OrdinalEncoder()]) for c in categorical_features]\n", 1274 | "mapper = DataFrameMapper(cat + num, df_out=True)\n", 1275 | "\n", 1276 | "clf = LogisticRegression(class_weight='balanced')\n", 1277 | "pipeline = Pipeline([\n", 1278 | " ('preprocess', mapper),\n", 1279 | " ('clf', clf)\n", 1280 | "])\n", 1281 | "\n", 1282 | "pipeline.fit(X_train, y_train)\n", 1283 | "evaluation(pipeline, X_test, y_test)" 1284 | ] 1285 | }, 1286 | { 1287 | "cell_type": "markdown", 1288 | "metadata": {}, 1289 | "source": [ 1290 | "**Result**:\n", 1291 | "- Having an unbalanced dataset doesn't harm accuracy, but harms precision-recall metrics of the positive class.\n", 1292 | "- This is mostly due to lower predicted probability values. 
" 1293 | ] 1294 | }, 1295 | { 1296 | "cell_type": "markdown", 1297 | "metadata": {}, 1298 | "source": [ 1299 | "## 1.4 Correlated Features" 1300 | ] 1301 | }, 1302 | { 1303 | "cell_type": "code", 1304 | "execution_count": 72, 1305 | "metadata": {}, 1306 | "outputs": [ 1307 | { 1308 | "name": "stdout", 1309 | "output_type": "stream", 1310 | "text": [ 1311 | "--------------------------------------------------------------------------------\n", 1312 | "Warning: n_clusters_per_class not in configuration, defaulting to 2\n", 1313 | "Warning: effective_rank not in configuration, defaulting to None\n", 1314 | "Warning: tail_strength not in configuration, defaulting to 0.5\n", 1315 | "Warning: noise not in configuration, defaulting to 0.0\n", 1316 | "Warning: shuffle not in configuration, defaulting to True\n", 1317 | "Creating Classification Dataset...\n", 1318 | "Warning: insert_dollar not in configuration, defaulting to 'No'\n", 1319 | "Warning: insert_percent not in configuration, defaulting to 'No'\n", 1320 | "Warning: star_schema not in configuration, defaulting to 'No'\n", 1321 | "Writing Train/Test Datasets\n" 1322 | ] 1323 | } 1324 | ], 1325 | "source": [ 1326 | "df, label, categorical_features, numerical_features = get_data(categorical_features=False, correlated_features=True)" 1327 | ] 1328 | }, 1329 | { 1330 | "cell_type": "code", 1331 | "execution_count": 74, 1332 | "metadata": {}, 1333 | "outputs": [ 1334 | { 1335 | "data": { 1336 | "text/plain": [ 1337 | "{'auc': 0.9194931452103352, 'pr-auc': 0.8982012865508728}" 1338 | ] 1339 | }, 1340 | "execution_count": 74, 1341 | "metadata": {}, 1342 | "output_type": "execute_result" 1343 | } 1344 | ], 1345 | "source": [ 1346 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n", 1347 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n", 1348 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]\n", 1349 | "\n", 1350 | "num = 
[([n], [SimpleImputer()]) for n in numerical_features]\n", 1351 | "mapper = DataFrameMapper(num, df_out=True)\n", 1352 | "\n", 1353 | "clf = LogisticRegression()\n", 1354 | "pipeline = Pipeline([\n", 1355 | " ('preprocess', mapper),\n", 1356 | " ('clf', clf)\n", 1357 | "])\n", 1358 | "\n", 1359 | "pipeline.fit(X_train, y_train)\n", 1360 | "evaluation(pipeline, X_test, y_test)" 1361 | ] 1362 | }, 1363 | { 1364 | "cell_type": "code", 1365 | "execution_count": 75, 1366 | "metadata": {}, 1367 | "outputs": [ 1368 | { 1369 | "data": { 1370 | "text/html": [ 1371 | "\n", 1372 | "\n", 1373 | "\n", 1374 | " \n", 1375 | "\n", 1376 | "\n", 1377 | " \n", 1378 | "\n", 1379 | "\n", 1380 | " \n", 1381 | "\n", 1382 | "\n", 1383 | " \n", 1384 | "\n", 1385 | "\n", 1386 | " \n", 1387 | "\n", 1388 | "\n", 1389 | " \n", 1390 | "\n", 1391 | "\n", 1392 | " \n", 1393 | "\n", 1394 | "\n", 1395 | " \n", 1396 | "\n", 1397 | "\n", 1398 | " \n", 1399 | "\n", 1400 | "
OLS Regression Results
Dep. Variable: y R-squared: 0.483
Model: OLS Adj. R-squared: 0.483
Method: Least Squares F-statistic: 1345.
Date: Sat, 10 Apr 2021 Prob (F-statistic): 0.00
Time: 14:31:47 Log-Likelihood: -3420.3
No. Observations: 8640 AIC: 6855.
Df Residuals: 8633 BIC: 6904.
Df Model: 6
Covariance Type: nonrobust
\n", 1401 | "\n", 1402 | "\n", 1403 | " \n", 1404 | "\n", 1405 | "\n", 1406 | " \n", 1407 | "\n", 1408 | "\n", 1409 | " \n", 1410 | "\n", 1411 | "\n", 1412 | " \n", 1413 | "\n", 1414 | "\n", 1415 | " \n", 1416 | "\n", 1417 | "\n", 1418 | " \n", 1419 | "\n", 1420 | "\n", 1421 | " \n", 1422 | "\n", 1423 | "\n", 1424 | " \n", 1425 | "\n", 1426 | "\n", 1427 | " \n", 1428 | "\n", 1429 | "\n", 1430 | " \n", 1431 | "\n", 1432 | "
coef std err t P>|t| [0.025 0.975]
const 0.5906 0.006 104.957 0.000 0.580 0.602
x0 -0.0043 0.000 -20.533 0.000 -0.005 -0.004
x1 0.0335 0.002 19.438 0.000 0.030 0.037
x2 0.0447 0.001 43.084 0.000 0.043 0.047
x3 -0.0076 0.000 -20.533 0.000 -0.008 -0.007
x4 -0.0142 0.001 -27.006 0.000 -0.015 -0.013
x5 0.0125 0.000 45.550 0.000 0.012 0.013
x6 -0.0017 0.002 -0.997 0.319 -0.005 0.002
x7 0.0270 0.001 28.009 0.000 0.025 0.029
\n", 1433 | "\n", 1434 | "\n", 1435 | " \n", 1436 | "\n", 1437 | "\n", 1438 | " \n", 1439 | "\n", 1440 | "\n", 1441 | " \n", 1442 | "\n", 1443 | "\n", 1444 | " \n", 1445 | "\n", 1446 | "
Omnibus: 341.439 Durbin-Watson: 2.027
Prob(Omnibus): 0.000 Jarque-Bera (JB): 353.022
Skew: -0.467 Prob(JB): 2.20e-77
Kurtosis: 2.670 Cond. No. 1.54e+16


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.5e-26. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular." 1447 | ], 1448 | "text/plain": [ 1449 | "\n", 1450 | "\"\"\"\n", 1451 | " OLS Regression Results \n", 1452 | "==============================================================================\n", 1453 | "Dep. Variable: y R-squared: 0.483\n", 1454 | "Model: OLS Adj. R-squared: 0.483\n", 1455 | "Method: Least Squares F-statistic: 1345.\n", 1456 | "Date: Sat, 10 Apr 2021 Prob (F-statistic): 0.00\n", 1457 | "Time: 14:31:47 Log-Likelihood: -3420.3\n", 1458 | "No. Observations: 8640 AIC: 6855.\n", 1459 | "Df Residuals: 8633 BIC: 6904.\n", 1460 | "Df Model: 6 \n", 1461 | "Covariance Type: nonrobust \n", 1462 | "==============================================================================\n", 1463 | " coef std err t P>|t| [0.025 0.975]\n", 1464 | "------------------------------------------------------------------------------\n", 1465 | "const 0.5906 0.006 104.957 0.000 0.580 0.602\n", 1466 | "x0 -0.0043 0.000 -20.533 0.000 -0.005 -0.004\n", 1467 | "x1 0.0335 0.002 19.438 0.000 0.030 0.037\n", 1468 | "x2 0.0447 0.001 43.084 0.000 0.043 0.047\n", 1469 | "x3 -0.0076 0.000 -20.533 0.000 -0.008 -0.007\n", 1470 | "x4 -0.0142 0.001 -27.006 0.000 -0.015 -0.013\n", 1471 | "x5 0.0125 0.000 45.550 0.000 0.012 0.013\n", 1472 | "x6 -0.0017 0.002 -0.997 0.319 -0.005 0.002\n", 1473 | "x7 0.0270 0.001 28.009 0.000 0.025 0.029\n", 1474 | "==============================================================================\n", 1475 | "Omnibus: 341.439 Durbin-Watson: 2.027\n", 1476 | "Prob(Omnibus): 0.000 Jarque-Bera (JB): 353.022\n", 1477 | "Skew: -0.467 Prob(JB): 2.20e-77\n", 1478 | "Kurtosis: 2.670 Cond. No. 1.54e+16\n", 1479 | "==============================================================================\n", 1480 | "\n", 1481 | "Notes:\n", 1482 | "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", 1483 | "[2] The smallest eigenvalue is 1.5e-26. 
This might indicate that there are\n", 1484 | "strong multicollinearity problems or that the design matrix is singular.\n", 1485 | "\"\"\"" 1486 | ] 1487 | }, 1488 | "execution_count": 75, 1489 | "metadata": {}, 1490 | "output_type": "execute_result" 1491 | } 1492 | ], 1493 | "source": [ 1494 | "import statsmodels.api as sm\n", 1495 | "preprocessed_X_train = mapper.transform(X_train)\n", 1496 | "preprocessed_X_train = sm.add_constant(preprocessed_X_train)\n", 1497 | "results = sm.OLS(y_train, preprocessed_X_train).fit()\n", 1498 | "results.summary()" 1499 | ] 1500 | }, 1501 | { 1502 | "cell_type": "code", 1503 | "execution_count": 76, 1504 | "metadata": {}, 1505 | "outputs": [ 1506 | { 1507 | "name": "stdout", 1508 | "output_type": "stream", 1509 | "text": [ 1510 | "x0, inf\n", 1511 | "x1, inf\n", 1512 | "x2, inf\n", 1513 | "x3, inf\n", 1514 | "x4, inf\n", 1515 | "x5, inf\n", 1516 | "x6, inf\n", 1517 | "x7, inf\n" 1518 | ] 1519 | }, 1520 | { 1521 | "name": "stderr", 1522 | "output_type": "stream", 1523 | "text": [ 1524 | "/usr/local/lib/python3.7/site-packages/statsmodels/stats/outliers_influence.py:193: RuntimeWarning: divide by zero encountered in double_scalars\n", 1525 | " vif = 1. / (1. - r_squared_i)\n" 1526 | ] 1527 | } 1528 | ], 1529 | "source": [ 1530 | "from statsmodels.stats.outliers_influence import variance_inflation_factor\n", 1531 | "for column in numerical_features:\n", 1532 | " print(f\"\"\"{column}, {variance_inflation_factor(\n", 1533 | " preprocessed_X_train.values, \n", 1534 | " list(preprocessed_X_train.columns).index(column))}\"\"\")" 1535 | ] 1536 | }, 1537 | { 1538 | "cell_type": "code", 1539 | "execution_count": 89, 1540 | "metadata": {}, 1541 | "outputs": [ 1542 | { 1543 | "data": { 1544 | "text/html": [ 1545 | "
\n", 1546 | "\n", 1559 | "\n", 1560 | " \n", 1561 | " \n", 1562 | " \n", 1563 | " \n", 1564 | " \n", 1565 | " \n", 1566 | " \n", 1567 | " \n", 1568 | " \n", 1569 | " \n", 1570 | " \n", 1571 | " \n", 1572 | " \n", 1573 | " \n", 1574 | " \n", 1575 | " \n", 1576 | " \n", 1577 | " \n", 1578 | " \n", 1579 | " \n", 1580 | " \n", 1581 | " \n", 1582 | " \n", 1583 | " \n", 1584 | " \n", 1585 | " \n", 1586 | " \n", 1587 | " \n", 1588 | " \n", 1589 | " \n", 1590 | " \n", 1591 | " \n", 1592 | " \n", 1593 | " \n", 1594 | " \n", 1595 | " \n", 1596 | " \n", 1597 | " \n", 1598 | " \n", 1599 | " \n", 1600 | " \n", 1601 | " \n", 1602 | " \n", 1603 | " \n", 1604 | " \n", 1605 | " \n", 1606 | " \n", 1607 | " \n", 1608 | " \n", 1609 | " \n", 1610 | " \n", 1611 | " \n", 1612 | " \n", 1613 | " \n", 1614 | " \n", 1615 | " \n", 1616 | " \n", 1617 | " \n", 1618 | " \n", 1619 | " \n", 1620 | " \n", 1621 | " \n", 1622 | " \n", 1623 | " \n", 1624 | " \n", 1625 | " \n", 1626 | " \n", 1627 | " \n", 1628 | " \n", 1629 | " \n", 1630 | " \n", 1631 | " \n", 1632 | " \n", 1633 | " \n", 1634 | " \n", 1635 | " \n", 1636 | " \n", 1637 | " \n", 1638 | " \n", 1639 | " \n", 1640 | " \n", 1641 | " \n", 1642 | " \n", 1643 | " \n", 1644 | " \n", 1645 | " \n", 1646 | " \n", 1647 | " \n", 1648 | " \n", 1649 | " \n", 1650 | " \n", 1651 | " \n", 1652 | " \n", 1653 | " \n", 1654 | " \n", 1655 | " \n", 1656 | " \n", 1657 | " \n", 1658 | " \n", 1659 | " \n", 1660 | " \n", 1661 | " \n", 1662 | " \n", 1663 | " \n", 1664 | " \n", 1665 | " \n", 1666 | " \n", 1667 | " \n", 1668 | " \n", 1669 | " \n", 1670 | " \n", 1671 | " \n", 1672 | " \n", 1673 | " \n", 1674 | " \n", 1675 | " \n", 1676 | " \n", 1677 | " \n", 1678 | " \n", 1679 | " \n", 1680 | " \n", 1681 | " \n", 1682 | " \n", 1683 | " \n", 1684 | "
x0x1x2x3x4x5x6x7y
x01.0000000.132384-0.0970711.000000-0.035234-0.1625660.3468660.567626-0.000326
x10.1323841.0000000.0295560.1323840.143301-0.4348110.0914750.211035-0.020443
x2-0.0970710.0295561.000000-0.0970710.2723200.001597-0.077077-0.5462630.275935
x31.0000000.132384-0.0970711.000000-0.035234-0.1625660.3468660.567626-0.000326
x4-0.0352340.1433010.272320-0.0352341.000000-0.1442590.1443660.314752-0.008192
x5-0.162566-0.4348110.001597-0.162566-0.1442591.0000000.1201780.0833300.544321
x60.3468660.091475-0.0770770.3468660.1443660.1201781.0000000.6491770.308940
x70.5676260.211035-0.5462630.5676260.3147520.0833300.6491771.0000000.071201
y-0.000326-0.0204430.275935-0.000326-0.0081920.5443210.3089400.0712011.000000
\n", 1685 | "
" 1686 | ], 1687 | "text/plain": [ 1688 | " x0 x1 x2 x3 x4 x5 x6 \\\n", 1689 | "x0 1.000000 0.132384 -0.097071 1.000000 -0.035234 -0.162566 0.346866 \n", 1690 | "x1 0.132384 1.000000 0.029556 0.132384 0.143301 -0.434811 0.091475 \n", 1691 | "x2 -0.097071 0.029556 1.000000 -0.097071 0.272320 0.001597 -0.077077 \n", 1692 | "x3 1.000000 0.132384 -0.097071 1.000000 -0.035234 -0.162566 0.346866 \n", 1693 | "x4 -0.035234 0.143301 0.272320 -0.035234 1.000000 -0.144259 0.144366 \n", 1694 | "x5 -0.162566 -0.434811 0.001597 -0.162566 -0.144259 1.000000 0.120178 \n", 1695 | "x6 0.346866 0.091475 -0.077077 0.346866 0.144366 0.120178 1.000000 \n", 1696 | "x7 0.567626 0.211035 -0.546263 0.567626 0.314752 0.083330 0.649177 \n", 1697 | "y -0.000326 -0.020443 0.275935 -0.000326 -0.008192 0.544321 0.308940 \n", 1698 | "\n", 1699 | " x7 y \n", 1700 | "x0 0.567626 -0.000326 \n", 1701 | "x1 0.211035 -0.020443 \n", 1702 | "x2 -0.546263 0.275935 \n", 1703 | "x3 0.567626 -0.000326 \n", 1704 | "x4 0.314752 -0.008192 \n", 1705 | "x5 0.083330 0.544321 \n", 1706 | "x6 0.649177 0.308940 \n", 1707 | "x7 1.000000 0.071201 \n", 1708 | "y 0.071201 1.000000 " 1709 | ] 1710 | }, 1711 | "execution_count": 89, 1712 | "metadata": {}, 1713 | "output_type": "execute_result" 1714 | } 1715 | ], 1716 | "source": [ 1717 | "df.corr()" 1718 | ] 1719 | }, 1720 | { 1721 | "cell_type": "markdown", 1722 | "metadata": {}, 1723 | "source": [ 1724 | "### Start with removing perfectly multicolinearity" 1725 | ] 1726 | }, 1727 | { 1728 | "cell_type": "code", 1729 | "execution_count": 90, 1730 | "metadata": {}, 1731 | "outputs": [ 1732 | { 1733 | "data": { 1734 | "text/plain": [ 1735 | "{'auc': 0.9194974891835068, 'pr-auc': 0.8982064967028441}" 1736 | ] 1737 | }, 1738 | "execution_count": 90, 1739 | "metadata": {}, 1740 | "output_type": "execute_result" 1741 | } 1742 | ], 1743 | "source": [ 1744 | "numerical_features = ['x0', 'x1', 'x2', 'x4', 'x5', 'x6', 'x7'] # remove x3\n", 1745 | "\n", 1746 | "train_df, test_df = 
train_test_split(df, test_size=0.1, shuffle=False)\n", 1747 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n", 1748 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]\n", 1749 | "\n", 1750 | "num = [([n], [SimpleImputer()]) for n in numerical_features]\n", 1751 | "mapper = DataFrameMapper(num, df_out=True)\n", 1752 | "\n", 1753 | "clf = LogisticRegression()\n", 1754 | "pipeline = Pipeline([\n", 1755 | " ('preprocess', mapper),\n", 1756 | " ('clf', clf)\n", 1757 | "])\n", 1758 | "\n", 1759 | "pipeline.fit(X_train, y_train)\n", 1760 | "evaluation(pipeline, X_test, y_test)" 1761 | ] 1762 | }, 1763 | { 1764 | "cell_type": "code", 1765 | "execution_count": 91, 1766 | "metadata": {}, 1767 | "outputs": [ 1768 | { 1769 | "data": { 1770 | "text/html": [ 1771 | "\n", 1772 | "\n", 1773 | "\n", 1774 | " \n", 1775 | "\n", 1776 | "\n", 1777 | " \n", 1778 | "\n", 1779 | "\n", 1780 | " \n", 1781 | "\n", 1782 | "\n", 1783 | " \n", 1784 | "\n", 1785 | "\n", 1786 | " \n", 1787 | "\n", 1788 | "\n", 1789 | " \n", 1790 | "\n", 1791 | "\n", 1792 | " \n", 1793 | "\n", 1794 | "\n", 1795 | " \n", 1796 | "\n", 1797 | "\n", 1798 | " \n", 1799 | "\n", 1800 | "
OLS Regression Results
Dep. Variable: y R-squared: 0.483
Model: OLS Adj. R-squared: 0.483
Method: Least Squares F-statistic: 1345.
Date: Sat, 10 Apr 2021 Prob (F-statistic): 0.00
Time: 14:37:17 Log-Likelihood: -3420.3
No. Observations: 8640 AIC: 6855.
Df Residuals: 8633 BIC: 6904.
Df Model: 6
Covariance Type: nonrobust
\n", 1801 | "\n", 1802 | "\n", 1803 | " \n", 1804 | "\n", 1805 | "\n", 1806 | " \n", 1807 | "\n", 1808 | "\n", 1809 | " \n", 1810 | "\n", 1811 | "\n", 1812 | " \n", 1813 | "\n", 1814 | "\n", 1815 | " \n", 1816 | "\n", 1817 | "\n", 1818 | " \n", 1819 | "\n", 1820 | "\n", 1821 | " \n", 1822 | "\n", 1823 | "\n", 1824 | " \n", 1825 | "\n", 1826 | "\n", 1827 | " \n", 1828 | "\n", 1829 | "
coef std err t P>|t| [0.025 0.975]
const 0.5906 0.006 104.957 0.000 0.580 0.602
x0 -0.0172 0.001 -20.533 0.000 -0.019 -0.016
x1 0.0357 0.002 19.852 0.000 0.032 0.039
x2 0.0438 0.001 43.527 0.000 0.042 0.046
x4 -0.0138 0.001 -26.951 0.000 -0.015 -0.013
x5 0.0127 0.000 47.049 0.000 0.012 0.013
x6 -0.0005 0.002 -0.294 0.769 -0.004 0.003
x7 0.0260 0.001 28.111 0.000 0.024 0.028
\n", 1830 | "\n", 1831 | "\n", 1832 | " \n", 1833 | "\n", 1834 | "\n", 1835 | " \n", 1836 | "\n", 1837 | "\n", 1838 | " \n", 1839 | "\n", 1840 | "\n", 1841 | " \n", 1842 | "\n", 1843 | "
Omnibus: 341.439 Durbin-Watson: 2.027
Prob(Omnibus): 0.000 Jarque-Bera (JB): 353.022
Skew: -0.467 Prob(JB): 2.20e-77
Kurtosis: 2.670 Cond. No. 1.67e+16


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 9e-27. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular." 1844 | ], 1845 | "text/plain": [ 1846 | "\n", 1847 | "\"\"\"\n", 1848 | " OLS Regression Results \n", 1849 | "==============================================================================\n", 1850 | "Dep. Variable: y R-squared: 0.483\n", 1851 | "Model: OLS Adj. R-squared: 0.483\n", 1852 | "Method: Least Squares F-statistic: 1345.\n", 1853 | "Date: Sat, 10 Apr 2021 Prob (F-statistic): 0.00\n", 1854 | "Time: 14:37:17 Log-Likelihood: -3420.3\n", 1855 | "No. Observations: 8640 AIC: 6855.\n", 1856 | "Df Residuals: 8633 BIC: 6904.\n", 1857 | "Df Model: 6 \n", 1858 | "Covariance Type: nonrobust \n", 1859 | "==============================================================================\n", 1860 | " coef std err t P>|t| [0.025 0.975]\n", 1861 | "------------------------------------------------------------------------------\n", 1862 | "const 0.5906 0.006 104.957 0.000 0.580 0.602\n", 1863 | "x0 -0.0172 0.001 -20.533 0.000 -0.019 -0.016\n", 1864 | "x1 0.0357 0.002 19.852 0.000 0.032 0.039\n", 1865 | "x2 0.0438 0.001 43.527 0.000 0.042 0.046\n", 1866 | "x4 -0.0138 0.001 -26.951 0.000 -0.015 -0.013\n", 1867 | "x5 0.0127 0.000 47.049 0.000 0.012 0.013\n", 1868 | "x6 -0.0005 0.002 -0.294 0.769 -0.004 0.003\n", 1869 | "x7 0.0260 0.001 28.111 0.000 0.024 0.028\n", 1870 | "==============================================================================\n", 1871 | "Omnibus: 341.439 Durbin-Watson: 2.027\n", 1872 | "Prob(Omnibus): 0.000 Jarque-Bera (JB): 353.022\n", 1873 | "Skew: -0.467 Prob(JB): 2.20e-77\n", 1874 | "Kurtosis: 2.670 Cond. No. 1.67e+16\n", 1875 | "==============================================================================\n", 1876 | "\n", 1877 | "Notes:\n", 1878 | "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", 1879 | "[2] The smallest eigenvalue is 9e-27. 
This might indicate that there are\n", 1880 | "strong multicollinearity problems or that the design matrix is singular.\n", 1881 | "\"\"\"" 1882 | ] 1883 | }, 1884 | "execution_count": 91, 1885 | "metadata": {}, 1886 | "output_type": "execute_result" 1887 | } 1888 | ], 1889 | "source": [ 1890 | "preprocessed_X_train = mapper.transform(X_train)\n", 1891 | "preprocessed_X_train = sm.add_constant(preprocessed_X_train)\n", 1892 | "results = sm.OLS(y_train, preprocessed_X_train).fit()\n", 1893 | "results.summary()" 1894 | ] 1895 | }, 1896 | { 1897 | "cell_type": "code", 1898 | "execution_count": 88, 1899 | "metadata": {}, 1900 | "outputs": [ 1901 | { 1902 | "name": "stdout", 1903 | "output_type": "stream", 1904 | "text": [ 1905 | "x0, inf\n", 1906 | "x1, inf\n", 1907 | "x2, inf\n", 1908 | "x4, inf\n", 1909 | "x5, inf\n", 1910 | "x6, inf\n", 1911 | "x7, inf\n" 1912 | ] 1913 | }, 1914 | { 1915 | "name": "stderr", 1916 | "output_type": "stream", 1917 | "text": [ 1918 | "/usr/local/lib/python3.7/site-packages/statsmodels/stats/outliers_influence.py:193: RuntimeWarning: divide by zero encountered in double_scalars\n", 1919 | " vif = 1. / (1. - r_squared_i)\n" 1920 | ] 1921 | } 1922 | ], 1923 | "source": [ 1924 | "for column in numerical_features:\n", 1925 | " print(f\"\"\"{column}, {variance_inflation_factor(\n", 1926 | " preprocessed_X_train.values, \n", 1927 | " list(preprocessed_X_train.columns).index(column))}\"\"\")" 1928 | ] 1929 | }, 1930 | { 1931 | "cell_type": "markdown", 1932 | "metadata": {}, 1933 | "source": [ 1934 | "Removing feature with perfect multicolinearity:\n", 1935 | "- Improves interpretability of the coefficients (like `x0` here)\n", 1936 | "- Logistic Regression doesn't lose performance. 
" 1937 | ] 1938 | }, 1939 | { 1940 | "cell_type": "markdown", 1941 | "metadata": {}, 1942 | "source": [ 1943 | "## Remove multicolinearity" 1944 | ] 1945 | }, 1946 | { 1947 | "cell_type": "code", 1948 | "execution_count": 92, 1949 | "metadata": {}, 1950 | "outputs": [ 1951 | { 1952 | "data": { 1953 | "text/plain": [ 1954 | "{'auc': 0.9194974891835068, 'pr-auc': 0.8982064967028441}" 1955 | ] 1956 | }, 1957 | "execution_count": 92, 1958 | "metadata": {}, 1959 | "output_type": "execute_result" 1960 | } 1961 | ], 1962 | "source": [ 1963 | "numerical_features = ['x0', 'x1', 'x2', 'x4', 'x5', 'x7'] # remove x3, x6\n", 1964 | "\n", 1965 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n", 1966 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n", 1967 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]\n", 1968 | "\n", 1969 | "num = [([n], [SimpleImputer()]) for n in numerical_features]\n", 1970 | "mapper = DataFrameMapper(num, df_out=True)\n", 1971 | "\n", 1972 | "clf = LogisticRegression()\n", 1973 | "pipeline = Pipeline([\n", 1974 | " ('preprocess', mapper),\n", 1975 | " ('clf', clf)\n", 1976 | "])\n", 1977 | "\n", 1978 | "pipeline.fit(X_train, y_train)\n", 1979 | "evaluation(pipeline, X_test, y_test)" 1980 | ] 1981 | }, 1982 | { 1983 | "cell_type": "code", 1984 | "execution_count": 93, 1985 | "metadata": {}, 1986 | "outputs": [ 1987 | { 1988 | "data": { 1989 | "text/html": [ 1990 | "\n", 1991 | "\n", 1992 | "\n", 1993 | " \n", 1994 | "\n", 1995 | "\n", 1996 | " \n", 1997 | "\n", 1998 | "\n", 1999 | " \n", 2000 | "\n", 2001 | "\n", 2002 | " \n", 2003 | "\n", 2004 | "\n", 2005 | " \n", 2006 | "\n", 2007 | "\n", 2008 | " \n", 2009 | "\n", 2010 | "\n", 2011 | " \n", 2012 | "\n", 2013 | "\n", 2014 | " \n", 2015 | "\n", 2016 | "\n", 2017 | " \n", 2018 | "\n", 2019 | "
OLS Regression Results
Dep. Variable: y R-squared: 0.483
Model: OLS Adj. R-squared: 0.483
Method: Least Squares F-statistic: 1345.
Date: Sat, 10 Apr 2021 Prob (F-statistic): 0.00
Time: 14:38:52 Log-Likelihood: -3420.3
No. Observations: 8640 AIC: 6855.
Df Residuals: 8633 BIC: 6904.
Df Model: 6
Covariance Type: nonrobust
\n", 2020 | "\n", 2021 | "\n", 2022 | " \n", 2023 | "\n", 2024 | "\n", 2025 | " \n", 2026 | "\n", 2027 | "\n", 2028 | " \n", 2029 | "\n", 2030 | "\n", 2031 | " \n", 2032 | "\n", 2033 | "\n", 2034 | " \n", 2035 | "\n", 2036 | "\n", 2037 | " \n", 2038 | "\n", 2039 | "\n", 2040 | " \n", 2041 | "\n", 2042 | "\n", 2043 | " \n", 2044 | "\n", 2045 | "
coef std err t P>|t| [0.025 0.975]
const 0.5906 0.006 104.957 0.000 0.580 0.602
x0 -0.0169 0.001 -17.744 0.000 -0.019 -0.015
x1 0.0366 0.005 7.684 0.000 0.027 0.046
x2 0.0434 0.001 40.460 0.000 0.041 0.045
x4 -0.0137 0.001 -25.047 0.000 -0.015 -0.013
x5 0.0128 0.000 33.306 0.000 0.012 0.014
x7 0.0256 0.001 27.862 0.000 0.024 0.027
\n", 2046 | "\n", 2047 | "\n", 2048 | " \n", 2049 | "\n", 2050 | "\n", 2051 | " \n", 2052 | "\n", 2053 | "\n", 2054 | " \n", 2055 | "\n", 2056 | "\n", 2057 | " \n", 2058 | "\n", 2059 | "
Omnibus: 341.439 Durbin-Watson: 2.027
Prob(Omnibus): 0.000 Jarque-Bera (JB): 353.022
Skew: -0.467 Prob(JB): 2.20e-77
Kurtosis: 2.670 Cond. No. 30.1


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified." 2060 | ], 2061 | "text/plain": [ 2062 | "\n", 2063 | "\"\"\"\n", 2064 | " OLS Regression Results \n", 2065 | "==============================================================================\n", 2066 | "Dep. Variable: y R-squared: 0.483\n", 2067 | "Model: OLS Adj. R-squared: 0.483\n", 2068 | "Method: Least Squares F-statistic: 1345.\n", 2069 | "Date: Sat, 10 Apr 2021 Prob (F-statistic): 0.00\n", 2070 | "Time: 14:38:52 Log-Likelihood: -3420.3\n", 2071 | "No. Observations: 8640 AIC: 6855.\n", 2072 | "Df Residuals: 8633 BIC: 6904.\n", 2073 | "Df Model: 6 \n", 2074 | "Covariance Type: nonrobust \n", 2075 | "==============================================================================\n", 2076 | " coef std err t P>|t| [0.025 0.975]\n", 2077 | "------------------------------------------------------------------------------\n", 2078 | "const 0.5906 0.006 104.957 0.000 0.580 0.602\n", 2079 | "x0 -0.0169 0.001 -17.744 0.000 -0.019 -0.015\n", 2080 | "x1 0.0366 0.005 7.684 0.000 0.027 0.046\n", 2081 | "x2 0.0434 0.001 40.460 0.000 0.041 0.045\n", 2082 | "x4 -0.0137 0.001 -25.047 0.000 -0.015 -0.013\n", 2083 | "x5 0.0128 0.000 33.306 0.000 0.012 0.014\n", 2084 | "x7 0.0256 0.001 27.862 0.000 0.024 0.027\n", 2085 | "==============================================================================\n", 2086 | "Omnibus: 341.439 Durbin-Watson: 2.027\n", 2087 | "Prob(Omnibus): 0.000 Jarque-Bera (JB): 353.022\n", 2088 | "Skew: -0.467 Prob(JB): 2.20e-77\n", 2089 | "Kurtosis: 2.670 Cond. No. 
30.1\n", 2090 | "==============================================================================\n", 2091 | "\n", 2092 | "Notes:\n", 2093 | "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", 2094 | "\"\"\"" 2095 | ] 2096 | }, 2097 | "execution_count": 93, 2098 | "metadata": {}, 2099 | "output_type": "execute_result" 2100 | } 2101 | ], 2102 | "source": [ 2103 | "preprocessed_X_train = mapper.transform(X_train)\n", 2104 | "preprocessed_X_train = sm.add_constant(preprocessed_X_train)\n", 2105 | "results = sm.OLS(y_train, preprocessed_X_train).fit()\n", 2106 | "results.summary()" 2107 | ] 2108 | }, 2109 | { 2110 | "cell_type": "code", 2111 | "execution_count": 94, 2112 | "metadata": {}, 2113 | "outputs": [ 2114 | { 2115 | "name": "stdout", 2116 | "output_type": "stream", 2117 | "text": [ 2118 | "x0, 4.358204798860465\n", 2119 | "x1, 1.8526871839909662\n", 2120 | "x2, 5.622338237184614\n", 2121 | "x4, 4.123960180952725\n", 2122 | "x5, 2.6095687697415917\n", 2123 | "x7, 10.922197872534808\n" 2124 | ] 2125 | } 2126 | ], 2127 | "source": [ 2128 | "for column in numerical_features:\n", 2129 | " print(f\"\"\"{column}, {variance_inflation_factor(\n", 2130 | " preprocessed_X_train.values, \n", 2131 | " list(preprocessed_X_train.columns).index(column))}\"\"\")" 2132 | ] 2133 | }, 2134 | { 2135 | "cell_type": "markdown", 2136 | "metadata": {}, 2137 | "source": [ 2138 | "Removing `x6`, we didn't lose explainability nor performance " 2139 | ] 2140 | }, 2141 | { 2142 | "cell_type": "markdown", 2143 | "metadata": {}, 2144 | "source": [ 2145 | "#### Remove x7 with high VAR" 2146 | ] 2147 | }, 2148 | { 2149 | "cell_type": "code", 2150 | "execution_count": 95, 2151 | "metadata": {}, 2152 | "outputs": [ 2153 | { 2154 | "data": { 2155 | "text/plain": [ 2156 | "{'auc': 0.8916873729387849, 'pr-auc': 0.858019953399781}" 2157 | ] 2158 | }, 2159 | "execution_count": 95, 2160 | "metadata": {}, 2161 | "output_type": "execute_result" 2162 | } 2163 | 
], 2164 | "source": [ 2165 | "numerical_features = ['x0', 'x1', 'x2', 'x4', 'x5'] # remove x3, x6, x7\n", 2166 | "\n", 2167 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n", 2168 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n", 2169 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]\n", 2170 | "\n", 2171 | "num = [([n], [SimpleImputer()]) for n in numerical_features]\n", 2172 | "mapper = DataFrameMapper(num, df_out=True)\n", 2173 | "\n", 2174 | "clf = LogisticRegression()\n", 2175 | "pipeline = Pipeline([\n", 2176 | " ('preprocess', mapper),\n", 2177 | " ('clf', clf)\n", 2178 | "])\n", 2179 | "\n", 2180 | "pipeline.fit(X_train, y_train)\n", 2181 | "evaluation(pipeline, X_test, y_test)" 2182 | ] 2183 | }, 2184 | { 2185 | "cell_type": "code", 2186 | "execution_count": 96, 2187 | "metadata": {}, 2188 | "outputs": [ 2189 | { 2190 | "data": { 2191 | "text/html": [ 2192 | "\n", 2193 | "\n", 2194 | "\n", 2195 | " \n", 2196 | "\n", 2197 | "\n", 2198 | " \n", 2199 | "\n", 2200 | "\n", 2201 | " \n", 2202 | "\n", 2203 | "\n", 2204 | " \n", 2205 | "\n", 2206 | "\n", 2207 | " \n", 2208 | "\n", 2209 | "\n", 2210 | " \n", 2211 | "\n", 2212 | "\n", 2213 | " \n", 2214 | "\n", 2215 | "\n", 2216 | " \n", 2217 | "\n", 2218 | "\n", 2219 | " \n", 2220 | "\n", 2221 | "
OLS Regression Results
Dep. Variable: y R-squared: 0.437
Model: OLS Adj. R-squared: 0.436
Method: Least Squares F-statistic: 1338.
Date: Sat, 10 Apr 2021 Prob (F-statistic): 0.00
Time: 14:40:42 Log-Likelihood: -3792.2
No. Observations: 8640 AIC: 7596.
Df Residuals: 8634 BIC: 7639.
Df Model: 5
Covariance Type: nonrobust
\n", 2222 | "\n", 2223 | "\n", 2224 | " \n", 2225 | "\n", 2226 | "\n", 2227 | " \n", 2228 | "\n", 2229 | "\n", 2230 | " \n", 2231 | "\n", 2232 | "\n", 2233 | " \n", 2234 | "\n", 2235 | "\n", 2236 | " \n", 2237 | "\n", 2238 | "\n", 2239 | " \n", 2240 | "\n", 2241 | "\n", 2242 | " \n", 2243 | "\n", 2244 | "
coef std err t P>|t| [0.025 0.975]
const 0.6395 0.006 114.609 0.000 0.629 0.650
x0 0.0063 0.000 12.833 0.000 0.005 0.007
x1 0.1125 0.004 27.526 0.000 0.104 0.121
x2 0.0166 0.000 33.605 0.000 0.016 0.018
x4 -0.0007 0.000 -2.314 0.021 -0.001 -0.000
x5 0.0204 0.000 73.414 0.000 0.020 0.021
\n", 2245 | "\n", 2246 | "\n", 2247 | " \n", 2248 | "\n", 2249 | "\n", 2250 | " \n", 2251 | "\n", 2252 | "\n", 2253 | " \n", 2254 | "\n", 2255 | "\n", 2256 | " \n", 2257 | "\n", 2258 | "
Omnibus: 381.662 Durbin-Watson: 2.031
Prob(Omnibus): 0.000 Jarque-Bera (JB): 330.564
Skew: -0.412 Prob(JB): 1.66e-72
Kurtosis: 2.511 Cond. No. 26.7


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified." 2259 | ], 2260 | "text/plain": [ 2261 | "\n", 2262 | "\"\"\"\n", 2263 | " OLS Regression Results \n", 2264 | "==============================================================================\n", 2265 | "Dep. Variable: y R-squared: 0.437\n", 2266 | "Model: OLS Adj. R-squared: 0.436\n", 2267 | "Method: Least Squares F-statistic: 1338.\n", 2268 | "Date: Sat, 10 Apr 2021 Prob (F-statistic): 0.00\n", 2269 | "Time: 14:40:42 Log-Likelihood: -3792.2\n", 2270 | "No. Observations: 8640 AIC: 7596.\n", 2271 | "Df Residuals: 8634 BIC: 7639.\n", 2272 | "Df Model: 5 \n", 2273 | "Covariance Type: nonrobust \n", 2274 | "==============================================================================\n", 2275 | " coef std err t P>|t| [0.025 0.975]\n", 2276 | "------------------------------------------------------------------------------\n", 2277 | "const 0.6395 0.006 114.609 0.000 0.629 0.650\n", 2278 | "x0 0.0063 0.000 12.833 0.000 0.005 0.007\n", 2279 | "x1 0.1125 0.004 27.526 0.000 0.104 0.121\n", 2280 | "x2 0.0166 0.000 33.605 0.000 0.016 0.018\n", 2281 | "x4 -0.0007 0.000 -2.314 0.021 -0.001 -0.000\n", 2282 | "x5 0.0204 0.000 73.414 0.000 0.020 0.021\n", 2283 | "==============================================================================\n", 2284 | "Omnibus: 381.662 Durbin-Watson: 2.031\n", 2285 | "Prob(Omnibus): 0.000 Jarque-Bera (JB): 330.564\n", 2286 | "Skew: -0.412 Prob(JB): 1.66e-72\n", 2287 | "Kurtosis: 2.511 Cond. No. 
26.7\n", 2288 | "==============================================================================\n", 2289 | "\n", 2290 | "Notes:\n", 2291 | "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", 2292 | "\"\"\"" 2293 | ] 2294 | }, 2295 | "execution_count": 96, 2296 | "metadata": {}, 2297 | "output_type": "execute_result" 2298 | } 2299 | ], 2300 | "source": [ 2301 | "preprocessed_X_train = mapper.transform(X_train)\n", 2302 | "preprocessed_X_train = sm.add_constant(preprocessed_X_train)\n", 2303 | "results = sm.OLS(y_train, preprocessed_X_train).fit()\n", 2304 | "results.summary()" 2305 | ] 2306 | }, 2307 | { 2308 | "cell_type": "code", 2309 | "execution_count": 97, 2310 | "metadata": {}, 2311 | "outputs": [ 2312 | { 2313 | "name": "stdout", 2314 | "output_type": "stream", 2315 | "text": [ 2316 | "x0, 1.0434492528061576\n", 2317 | "x1, 1.2487373171729157\n", 2318 | "x2, 1.089610333638892\n", 2319 | "x4, 1.1174255753042328\n", 2320 | "x5, 1.2630916367080673\n" 2321 | ] 2322 | } 2323 | ], 2324 | "source": [ 2325 | "for column in numerical_features:\n", 2326 | " print(f\"\"\"{column}, {variance_inflation_factor(\n", 2327 | " preprocessed_X_train.values, \n", 2328 | " list(preprocessed_X_train.columns).index(column))}\"\"\")" 2329 | ] 2330 | }, 2331 | { 2332 | "cell_type": "markdown", 2333 | "metadata": {}, 2334 | "source": [ 2335 | "Removing `x7`:\n", 2336 | "- Helped explainability \n", 2337 | "- Negatively impacted performance" 2338 | ] 2339 | }, 2340 | { 2341 | "cell_type": "markdown", 2342 | "metadata": {}, 2343 | "source": [ 2344 | "Remedy: Add polynomial terms, Try other models that capture more complex interactions. 
" 2345 | ] 2346 | }, 2347 | { 2348 | "cell_type": "markdown", 2349 | "metadata": {}, 2350 | "source": [ 2351 | "## 1.5 Missing Values" 2352 | ] 2353 | }, 2354 | { 2355 | "cell_type": "code", 2356 | "execution_count": 98, 2357 | "metadata": {}, 2358 | "outputs": [ 2359 | { 2360 | "name": "stdout", 2361 | "output_type": "stream", 2362 | "text": [ 2363 | "--------------------------------------------------------------------------------\n", 2364 | "Warning: n_clusters_per_class not in configuration, defaulting to 2\n", 2365 | "Warning: effective_rank not in configuration, defaulting to None\n", 2366 | "Warning: tail_strength not in configuration, defaulting to 0.5\n", 2367 | "Warning: noise not in configuration, defaulting to 0.0\n", 2368 | "Warning: shuffle not in configuration, defaulting to True\n", 2369 | "Creating Classification Dataset...\n", 2370 | "Creating Categorical Features...\n", 2371 | "Warning: insert_dollar not in configuration, defaulting to 'No'\n", 2372 | "Warning: insert_percent not in configuration, defaulting to 'No'\n", 2373 | "Warning: star_schema not in configuration, defaulting to 'No'\n", 2374 | "Writing Train/Test Datasets\n" 2375 | ] 2376 | } 2377 | ], 2378 | "source": [ 2379 | "df, label, categorical_features, numerical_features = get_data(missing_values=True)" 2380 | ] 2381 | }, 2382 | { 2383 | "cell_type": "code", 2384 | "execution_count": 99, 2385 | "metadata": {}, 2386 | "outputs": [ 2387 | { 2388 | "ename": "ValueError", 2389 | "evalue": "Input contains NaN, infinity or a value too large for dtype('float64').", 2390 | "output_type": "error", 2391 | "traceback": [ 2392 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 2393 | "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", 2394 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 12\u001b[0m ])\n\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 14\u001b[0;31m 
\u001b[0mpipeline\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnumerical_features\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 15\u001b[0m \u001b[0mevaluation\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpipeline\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnumerical_features\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 2395 | "\u001b[0;32m/usr/local/lib/python3.7/site-packages/sklearn/pipeline.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[1;32m 344\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_final_estimator\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;34m'passthrough'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 345\u001b[0m \u001b[0mfit_params_last_step\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfit_params_steps\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msteps\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 346\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_final_estimator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mXt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params_last_step\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 347\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 348\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 2396 | "\u001b[0;32m/usr/local/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 1344\u001b[0m X, y = self._validate_data(X, y, accept_sparse='csr', dtype=_dtype,\n\u001b[1;32m 1345\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"C\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1346\u001b[0;31m accept_large_sparse=solver != 'liblinear')\n\u001b[0m\u001b[1;32m 1347\u001b[0m \u001b[0mcheck_classification_targets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1348\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclasses_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munique\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 2397 | "\u001b[0;32m/usr/local/lib/python3.7/site-packages/sklearn/base.py\u001b[0m in \u001b[0;36m_validate_data\u001b[0;34m(self, X, y, reset, validate_separately, **check_params)\u001b[0m\n\u001b[1;32m 431\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcheck_y_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 432\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 433\u001b[0;31m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mcheck_X_y\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcheck_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 434\u001b[0m \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 435\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 2398 | "\u001b[0;32m/usr/local/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36minner_f\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mextra_args\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mall_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mextra_args\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0;31m# extra_args > 0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 2399 | "\u001b[0;32m/usr/local/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_X_y\u001b[0;34m(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, 
estimator)\u001b[0m\n\u001b[1;32m 819\u001b[0m \u001b[0mensure_min_samples\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mensure_min_samples\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 820\u001b[0m \u001b[0mensure_min_features\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mensure_min_features\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 821\u001b[0;31m estimator=estimator)\n\u001b[0m\u001b[1;32m 822\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmulti_output\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 823\u001b[0m y = check_array(y, accept_sparse='csr', force_all_finite=True,\n", 2400 | "\u001b[0;32m/usr/local/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36minner_f\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mextra_args\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mall_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mextra_args\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0;31m# extra_args > 0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 2401 | 
"\u001b[0;32m/usr/local/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)\u001b[0m\n\u001b[1;32m 662\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mforce_all_finite\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 663\u001b[0m _assert_all_finite(array,\n\u001b[0;32m--> 664\u001b[0;31m allow_nan=force_all_finite == 'allow-nan')\n\u001b[0m\u001b[1;32m 665\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 666\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mensure_min_samples\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 2402 | "\u001b[0;32m/usr/local/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36m_assert_all_finite\u001b[0;34m(X, allow_nan, msg_dtype)\u001b[0m\n\u001b[1;32m 104\u001b[0m \u001b[0mmsg_err\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 105\u001b[0m (type_err,\n\u001b[0;32m--> 106\u001b[0;31m msg_dtype if msg_dtype is not None else X.dtype)\n\u001b[0m\u001b[1;32m 107\u001b[0m )\n\u001b[1;32m 108\u001b[0m \u001b[0;31m# for object dtype data, we only check for NaNs (GH-13254)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 2403 | "\u001b[0;31mValueError\u001b[0m: Input contains NaN, infinity or a value too large for dtype('float64')." 
2404 | ] 2405 | } 2406 | ], 2407 | "source": [ 2408 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n", 2409 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n", 2410 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]\n", 2411 | "\n", 2412 | "num = [([n], [SimpleImputer()]) for n in numerical_features]\n", 2413 | "mapper = DataFrameMapper(num, df_out=True)\n", 2414 | "\n", 2415 | "clf = LogisticRegression()\n", 2416 | "pipeline = Pipeline([\n", 2417 | " #('preprocess', mapper),\n", 2418 | " ('clf', clf)\n", 2419 | "])\n", 2420 | "\n", 2421 | "pipeline.fit(X_train[numerical_features], y_train)\n", 2422 | "evaluation(pipeline, X_test[numerical_features], y_test)" 2423 | ] 2424 | }, 2425 | { 2426 | "cell_type": "code", 2427 | "execution_count": 100, 2428 | "metadata": {}, 2429 | "outputs": [ 2430 | { 2431 | "data": { 2432 | "text/plain": [ 2433 | "{'auc': 0.7473034970984109, 'pr-auc': 0.676792150205654}" 2434 | ] 2435 | }, 2436 | "execution_count": 100, 2437 | "metadata": {}, 2438 | "output_type": "execute_result" 2439 | } 2440 | ], 2441 | "source": [ 2442 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n", 2443 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n", 2444 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]\n", 2445 | "\n", 2446 | "num = [([n], [SimpleImputer()]) for n in numerical_features] # Impute values\n", 2447 | "mapper = DataFrameMapper(num, df_out=True)\n", 2448 | " \n", 2449 | "clf = LogisticRegression()\n", 2450 | "pipeline = Pipeline([\n", 2451 | " ('preprocess', mapper),\n", 2452 | " ('clf', clf)\n", 2453 | "])\n", 2454 | "\n", 2455 | "pipeline.fit(X_train[numerical_features], y_train)\n", 2456 | "evaluation(pipeline, X_test[numerical_features], y_test)" 2457 | ] 2458 | }, 2459 | { 2460 | "cell_type": "markdown", 2461 | "metadata": {}, 2462 
| "source": [ 2463 | "**Result**\n", 2464 | "- Logistic Regression can't handle missing values. Best Imupute with mean" 2465 | ] 2466 | }, 2467 | { 2468 | "cell_type": "markdown", 2469 | "metadata": {}, 2470 | "source": [ 2471 | "## Summary \n", 2472 | "\n", 2473 | "Let's see how Logistic Regression acts with 5 techniques:\n", 2474 | "1. **Standardization of Numerical Variables**\n", 2475 | " - Performance doesn't necessarily improve. But convergence is faster during training\n", 2476 | "2. **Encoding of Categorical Variables**\n", 2477 | " - We can use ordinal encoding if the categories are related (size). Otherwise, use one hot encoding\n", 2478 | "3. **Data Imbalance**\n", 2479 | " - Perform overweighting of the minor class and undersampling of the major class\n", 2480 | "4. **Colinearity**\n", 2481 | " - remove features which exhibit perfect multicolinearity\n", 2482 | " - try different modeling strategies to ensure the model is capturing non-linear interactions\n", 2483 | "5. **Missing Values**\n", 2484 | " - Impute with mean (or a constant value). This is problem specific" 2485 | ] 2486 | } 2487 | ], 2488 | "metadata": { 2489 | "kernelspec": { 2490 | "display_name": "Python 3", 2491 | "language": "python", 2492 | "name": "python3" 2493 | }, 2494 | "language_info": { 2495 | "codemirror_mode": { 2496 | "name": "ipython", 2497 | "version": 3 2498 | }, 2499 | "file_extension": ".py", 2500 | "mimetype": "text/x-python", 2501 | "name": "python", 2502 | "nbconvert_exporter": "python", 2503 | "pygments_lexer": "ipython3", 2504 | "version": "3.7.4" 2505 | } 2506 | }, 2507 | "nbformat": 4, 2508 | "nbformat_minor": 4 2509 | } 2510 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # preprocessing-cheat-sheet 2 | 3 | Preprocessing cheat sheet for some machine learning algorithms. Starting with Logistic Regression for now. 
This might grow in the future. 4 | --------------------------------------------------------------------------------