├── LICENSE
├── Preprocessing Cheat - Logistic Regression.ipynb
└── README.md
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Ajay Halthor
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Preprocessing Cheat - Logistic Regression.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 349,
6 | "metadata": {
7 | "scrolled": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "#!pip3 install statsmodels"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "Let's see how Logistic Regression acts with 5 techniques:\n",
19 | "1. Standardization of Numerical Variables\n",
20 | "2. Encoding of Categorical Variables\n",
21 | "3. Data Imbalance\n",
22 | "4. Collinearity\n",
23 | "5. Missing Values"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 1,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "import numpy as np\n",
33 | "import pandas as pd\n",
34 | "from sklearn.impute import SimpleImputer\n",
35 | "from sklearn.linear_model import LogisticRegression\n",
36 | "from sklearn.metrics import roc_auc_score, average_precision_score\n",
37 | "from sklearn.model_selection import train_test_split\n",
38 | "from sklearn.pipeline import Pipeline\n",
39 | "from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder\n",
40 | "from sklearn_pandas import DataFrameMapper\n",
41 | "from snape.make_dataset import make_dataset"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {},
47 | "source": [
48 | "Check out snape [here](https://github.com/mbernico/snape)"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 71,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "def get_data(categorical_features=True,\n",
58 | " balanced=True, \n",
59 | " correlated_features=False, \n",
60 | " missing_values=False,\n",
61 | " dataset_size=12000):\n",
62 | " \n",
63 | " if categorical_features:\n",
64 | " label_list = []\n",
65 | " N_CATEGORICAL = 4\n",
66 | " for i in range(N_CATEGORICAL):\n",
67 | " num_classes = np.random.randint(2, 10)\n",
68 | " labels = list(np.arange(num_classes))\n",
69 | " labels = [f'str_{i}' for i in labels]\n",
70 | " label_list.append(labels)\n",
71 | " \n",
72 | " if correlated_features:\n",
73 | " N_REDUNDANT = 1\n",
74 | " N_REPEATED = 1\n",
75 | " N_INFORMATIVE = 8 - N_REDUNDANT - N_REPEATED\n",
76 | " \n",
77 | " conf = {\n",
78 | " \"type\": \"classification\",\n",
79 | " \"n_classes\": 2,\n",
80 | " \"n_samples\": dataset_size,\n",
81 | " \"n_features\": 8,\n",
82 | " \"out_path\": \"./\",\n",
83 | " \"output\": \"my_dataset\",\n",
84 | " \"n_informative\": N_INFORMATIVE if correlated_features else 8,\n",
85 | " \"n_repeated\": N_REPEATED if correlated_features else 0,\n",
86 | " \"n_redundant\": N_REDUNDANT if correlated_features else 0,\n",
87 | " \"n_clusters\": 2,\n",
88 | " \"weights\": [0.5, 0.5] if balanced else [0.9, 0.1],\n",
89 | " \"pct_missing\": 0.70 if missing_values else 0.00,\n",
90 | " \"n_categorical\": N_CATEGORICAL if categorical_features else 0,\n",
91 | " \"random_seed\":42,\n",
92 | " \"label_list\":label_list if categorical_features else []\n",
93 | " }\n",
94 | "\n",
95 | " make_dataset(config=conf)\n",
96 | " df = pd.read_csv('my_dataset_train.csv')\n",
97 | " \n",
98 | " label = 'y'\n",
99 | " categorical_features = [col for col in df.columns if (df[col].dtype==object) & (col != label)]\n",
100 | " numerical_features = [col for col in df.columns if (col not in categorical_features) & (col != label)]\n",
101 | " \n",
102 | " return df, label, categorical_features, numerical_features\n",
103 | "\n",
104 | "def evaluation(pipeline, X, y):\n",
105 | " y_predict_proba = pipeline.predict_proba(X)[:, 1]\n",
106 | " return{\n",
107 | " 'auc': roc_auc_score(y, y_predict_proba),\n",
108 | " 'pr-auc': average_precision_score(y, y_predict_proba)\n",
109 | " }"
110 | ]
111 | },
112 | {
113 | "cell_type": "markdown",
114 | "metadata": {},
115 | "source": [
116 | "# Logistic Regression"
117 | ]
118 | },
119 | {
120 | "cell_type": "markdown",
121 | "metadata": {},
122 | "source": [
123 | "## 1.1 Standardization"
124 | ]
125 | },
126 | {
127 | "cell_type": "markdown",
128 | "metadata": {},
129 | "source": [
130 | "### Without Standardization"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 108,
136 | "metadata": {},
137 | "outputs": [
138 | {
139 | "name": "stdout",
140 | "output_type": "stream",
141 | "text": [
142 | "--------------------------------------------------------------------------------\n",
143 | "Warning: n_repeated not in configuration, defaulting to 0\n",
144 | "Warning: n_clusters_per_class not in configuration, defaulting to 2\n",
145 | "Warning: effective_rank not in configuration, defaulting to None\n",
146 | "Warning: tail_strength not in configuration, defaulting to 0.5\n",
147 | "Warning: noise not in configuration, defaulting to 0.0\n",
148 | "Warning: shuffle not in configuration, defaulting to True\n",
149 | "Creating Classification Dataset...\n",
150 | "Warning: insert_dollar not in configuration, defaulting to 'No'\n",
151 | "Warning: insert_percent not in configuration, defaulting to 'No'\n",
152 | "Warning: star_schema not in configuration, defaulting to 'No'\n",
153 | "Writing Train/Test Datasets\n"
154 | ]
155 | },
156 | {
157 | "data": {
158 | "text/html": [
159 | "
\n",
160 | "\n",
173 | "
\n",
174 | " \n",
175 | " \n",
176 | " | \n",
177 | " x0 | \n",
178 | " x1 | \n",
179 | " x2 | \n",
180 | " x3 | \n",
181 | " x4 | \n",
182 | " x5 | \n",
183 | " x6 | \n",
184 | " x7 | \n",
185 | "
\n",
186 | " \n",
187 | " \n",
188 | " \n",
189 | " count | \n",
190 | " 96000.000000 | \n",
191 | " 96000.000000 | \n",
192 | " 96000.000000 | \n",
193 | " 96000.000000 | \n",
194 | " 96000.000000 | \n",
195 | " 96000.000000 | \n",
196 | " 96000.000000 | \n",
197 | " 96000.000000 | \n",
198 | "
\n",
199 | " \n",
200 | " mean | \n",
201 | " -1.645228 | \n",
202 | " 2.244024 | \n",
203 | " 1.045137 | \n",
204 | " 2.714882 | \n",
205 | " -0.020220 | \n",
206 | " -0.004654 | \n",
207 | " -1.536356 | \n",
208 | " -4.197611 | \n",
209 | "
\n",
210 | " \n",
211 | " std | \n",
212 | " 6.209578 | \n",
213 | " 8.602877 | \n",
214 | " 1.733465 | \n",
215 | " 9.096954 | \n",
216 | " 11.988777 | \n",
217 | " 4.719140 | \n",
218 | " 2.810519 | \n",
219 | " 15.886376 | \n",
220 | "
\n",
221 | " \n",
222 | " min | \n",
223 | " -35.268886 | \n",
224 | " -34.012532 | \n",
225 | " -7.229214 | \n",
226 | " -35.163348 | \n",
227 | " -49.768093 | \n",
228 | " -20.945358 | \n",
229 | " -13.822828 | \n",
230 | " -78.725850 | \n",
231 | "
\n",
232 | " \n",
233 | " 25% | \n",
234 | " -5.758226 | \n",
235 | " -3.452518 | \n",
236 | " -0.106077 | \n",
237 | " -3.561230 | \n",
238 | " -8.276808 | \n",
239 | " -3.241806 | \n",
240 | " -3.394866 | \n",
241 | " -15.068164 | \n",
242 | "
\n",
243 | " \n",
244 | " 50% | \n",
245 | " -1.709702 | \n",
246 | " 2.393561 | \n",
247 | " 1.051668 | \n",
248 | " 2.789480 | \n",
249 | " 0.106544 | \n",
250 | " -0.308207 | \n",
251 | " -1.546129 | \n",
252 | " -3.386493 | \n",
253 | "
\n",
254 | " \n",
255 | " 75% | \n",
256 | " 2.515567 | \n",
257 | " 8.106976 | \n",
258 | " 2.192445 | \n",
259 | " 9.022244 | \n",
260 | " 8.357289 | \n",
261 | " 3.115309 | \n",
262 | " 0.302129 | \n",
263 | " 7.250865 | \n",
264 | "
\n",
265 | " \n",
266 | " max | \n",
267 | " 25.206965 | \n",
268 | " 42.072130 | \n",
269 | " 9.851492 | \n",
270 | " 43.896856 | \n",
271 | " 43.189071 | \n",
272 | " 21.426677 | \n",
273 | " 13.305723 | \n",
274 | " 62.477698 | \n",
275 | "
\n",
276 | " \n",
277 | "
\n",
278 | "
"
279 | ],
280 | "text/plain": [
281 | " x0 x1 x2 x3 x4 \\\n",
282 | "count 96000.000000 96000.000000 96000.000000 96000.000000 96000.000000 \n",
283 | "mean -1.645228 2.244024 1.045137 2.714882 -0.020220 \n",
284 | "std 6.209578 8.602877 1.733465 9.096954 11.988777 \n",
285 | "min -35.268886 -34.012532 -7.229214 -35.163348 -49.768093 \n",
286 | "25% -5.758226 -3.452518 -0.106077 -3.561230 -8.276808 \n",
287 | "50% -1.709702 2.393561 1.051668 2.789480 0.106544 \n",
288 | "75% 2.515567 8.106976 2.192445 9.022244 8.357289 \n",
289 | "max 25.206965 42.072130 9.851492 43.896856 43.189071 \n",
290 | "\n",
291 | " x5 x6 x7 \n",
292 | "count 96000.000000 96000.000000 96000.000000 \n",
293 | "mean -0.004654 -1.536356 -4.197611 \n",
294 | "std 4.719140 2.810519 15.886376 \n",
295 | "min -20.945358 -13.822828 -78.725850 \n",
296 | "25% -3.241806 -3.394866 -15.068164 \n",
297 | "50% -0.308207 -1.546129 -3.386493 \n",
298 | "75% 3.115309 0.302129 7.250865 \n",
299 | "max 21.426677 13.305723 62.477698 "
300 | ]
301 | },
302 | "execution_count": 108,
303 | "metadata": {},
304 | "output_type": "execute_result"
305 | }
306 | ],
307 | "source": [
308 | "df, label, categorical_features, numerical_features = get_data(categorical_features=False, dataset_size=120000)\n",
309 | "df[numerical_features].describe()"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": 111,
315 | "metadata": {},
316 | "outputs": [],
317 | "source": [
318 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n",
319 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n",
320 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]"
321 | ]
322 | },
323 | {
324 | "cell_type": "code",
325 | "execution_count": 114,
326 | "metadata": {},
327 | "outputs": [
328 | {
329 | "name": "stdout",
330 | "output_type": "stream",
331 | "text": [
332 | "[Pipeline] ............... (step 1 of 1) Processing clf, total= 0.3s\n"
333 | ]
334 | },
335 | {
336 | "data": {
337 | "text/plain": [
338 | "{'auc': 0.8148784308322949, 'pr-auc': 0.818032430163559}"
339 | ]
340 | },
341 | "execution_count": 114,
342 | "metadata": {},
343 | "output_type": "execute_result"
344 | }
345 | ],
346 | "source": [
347 | "clf = LogisticRegression()\n",
348 | "pipeline = Pipeline([\n",
349 | " ('clf', clf)\n",
350 | "], verbose=True)\n",
351 | "\n",
352 | "pipeline.fit(X_train[numerical_features], y_train)\n",
353 | "evaluation(pipeline, X_test[numerical_features], y_test)"
354 | ]
355 | },
356 | {
357 | "cell_type": "markdown",
358 | "metadata": {},
359 | "source": [
360 | "### With Standardization"
361 | ]
362 | },
363 | {
364 | "cell_type": "code",
365 | "execution_count": 115,
366 | "metadata": {},
367 | "outputs": [
368 | {
369 | "name": "stdout",
370 | "output_type": "stream",
371 | "text": [
372 | "[Pipeline] ........ (step 1 of 2) Processing preprocess, total= 0.1s\n",
373 | "[Pipeline] ............... (step 2 of 2) Processing clf, total= 0.1s\n"
374 | ]
375 | },
376 | {
377 | "data": {
378 | "text/plain": [
379 | "{'auc': 0.8148798631692816, 'pr-auc': 0.8180303186841142}"
380 | ]
381 | },
382 | "execution_count": 115,
383 | "metadata": {},
384 | "output_type": "execute_result"
385 | }
386 | ],
387 | "source": [
388 | "num = [([n], [StandardScaler()]) for n in numerical_features]\n",
389 | "mapper = DataFrameMapper(num, df_out=True)\n",
390 | "\n",
391 | "clf = LogisticRegression()\n",
392 | "pipeline = Pipeline([\n",
393 | " ('preprocess', mapper),\n",
394 | " ('clf', clf)\n",
395 | "], verbose=True)\n",
396 | "\n",
397 | "pipeline.fit(X_train[numerical_features], y_train)\n",
398 | "evaluation(pipeline, X_test[numerical_features], y_test)"
399 | ]
400 | },
401 | {
402 | "cell_type": "markdown",
403 | "metadata": {},
404 | "source": [
405 | "**Result**\n",
406 | "- No need to scale for logistic regression accuracy. But convergence is faster. [More info here](https://stats.stackexchange.com/questions/48360/is-standardization-needed-before-fitting-logistic-regression#:~:text=3%20Answers&text=Standardization%20isn't%20required%20for,the%20technique%20used%20for%20optimization.&text=Otherwise%2C%20you%20can%20run%20your,standardization%20treatment%20on%20the%20features)"
407 | ]
408 | },
409 | {
410 | "cell_type": "markdown",
411 | "metadata": {},
412 | "source": [
413 | "## 1.2 Encoding"
414 | ]
415 | },
416 | {
417 | "cell_type": "markdown",
418 | "metadata": {},
419 | "source": [
420 | "We need numeric encoding for logistic regression."
421 | ]
422 | },
423 | {
424 | "cell_type": "code",
425 | "execution_count": 3,
426 | "metadata": {},
427 | "outputs": [
428 | {
429 | "name": "stdout",
430 | "output_type": "stream",
431 | "text": [
432 | "--------------------------------------------------------------------------------\n",
433 | "Warning: n_repeated not in configuration, defaulting to 0\n",
434 | "Warning: n_clusters_per_class not in configuration, defaulting to 2\n",
435 | "Warning: effective_rank not in configuration, defaulting to None\n",
436 | "Warning: tail_strength not in configuration, defaulting to 0.5\n",
437 | "Warning: noise not in configuration, defaulting to 0.0\n",
438 | "Warning: shuffle not in configuration, defaulting to True\n",
439 | "Creating Classification Dataset...\n",
440 | "Creating Categorical Features...\n",
441 | "Warning: insert_dollar not in configuration, defaulting to 'No'\n",
442 | "Warning: insert_percent not in configuration, defaulting to 'No'\n",
443 | "Warning: star_schema not in configuration, defaulting to 'No'\n",
444 | "Writing Train/Test Datasets\n"
445 | ]
446 | }
447 | ],
448 | "source": [
449 | "df, label, categorical_features, numerical_features = get_data()"
450 | ]
451 | },
452 | {
453 | "cell_type": "markdown",
454 | "metadata": {},
455 | "source": [
456 | "### One Hot Encoding"
457 | ]
458 | },
459 | {
460 | "cell_type": "code",
461 | "execution_count": 10,
462 | "metadata": {},
463 | "outputs": [
464 | {
465 | "name": "stdout",
466 | "output_type": "stream",
467 | "text": [
468 | "[Pipeline] ........ (step 1 of 2) Processing preprocess, total= 0.1s\n",
469 | "[Pipeline] ............... (step 2 of 2) Processing clf, total= 0.9s\n"
470 | ]
471 | },
472 | {
473 | "data": {
474 | "text/plain": [
475 | "{'auc': 0.8304397645792462, 'pr-auc': 0.80297861579569}"
476 | ]
477 | },
478 | "execution_count": 10,
479 | "metadata": {},
480 | "output_type": "execute_result"
481 | }
482 | ],
483 | "source": [
484 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n",
485 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n",
486 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]\n",
487 | "\n",
488 | "num = [([n], [SimpleImputer()]) for n in numerical_features]\n",
489 | "cat = [([c], [OneHotEncoder()]) for c in categorical_features]\n",
490 | "mapper = DataFrameMapper(cat + num, df_out=True)\n",
491 | "\n",
492 | "clf = LogisticRegression(max_iter=1000)\n",
493 | "pipeline = Pipeline([\n",
494 | " ('preprocess', mapper),\n",
495 | " ('clf', clf)\n",
496 | "], verbose=True)\n",
497 | "\n",
498 | "pipeline.fit(X_train, y_train)\n",
499 | "evaluation(pipeline, X_test, y_test)"
500 | ]
501 | },
502 | {
503 | "cell_type": "code",
504 | "execution_count": 11,
505 | "metadata": {
506 | "scrolled": true
507 | },
508 | "outputs": [
509 | {
510 | "data": {
511 | "text/html": [
512 | "\n",
513 | "\n",
526 | "
\n",
527 | " \n",
528 | " \n",
529 | " | \n",
530 | " 8640 | \n",
531 | " 8641 | \n",
532 | " 8642 | \n",
533 | " 8643 | \n",
534 | " 8644 | \n",
535 | "
\n",
536 | " \n",
537 | " \n",
538 | " \n",
539 | " x1_x0_str_0 | \n",
540 | " 0.000000 | \n",
541 | " 0.000000 | \n",
542 | " 0.000000 | \n",
543 | " 0.000000 | \n",
544 | " 0.000000 | \n",
545 | "
\n",
546 | " \n",
547 | " x1_x0_str_1 | \n",
548 | " 0.000000 | \n",
549 | " 0.000000 | \n",
550 | " 0.000000 | \n",
551 | " 0.000000 | \n",
552 | " 0.000000 | \n",
553 | "
\n",
554 | " \n",
555 | " x1_x0_str_2 | \n",
556 | " 1.000000 | \n",
557 | " 0.000000 | \n",
558 | " 0.000000 | \n",
559 | " 1.000000 | \n",
560 | " 1.000000 | \n",
561 | "
\n",
562 | " \n",
563 | " x1_x0_str_3 | \n",
564 | " 0.000000 | \n",
565 | " 1.000000 | \n",
566 | " 1.000000 | \n",
567 | " 0.000000 | \n",
568 | " 0.000000 | \n",
569 | "
\n",
570 | " \n",
571 | " x1_x0_str_4 | \n",
572 | " 0.000000 | \n",
573 | " 0.000000 | \n",
574 | " 0.000000 | \n",
575 | " 0.000000 | \n",
576 | " 0.000000 | \n",
577 | "
\n",
578 | " \n",
579 | " x1_x0_str_5 | \n",
580 | " 0.000000 | \n",
581 | " 0.000000 | \n",
582 | " 0.000000 | \n",
583 | " 0.000000 | \n",
584 | " 0.000000 | \n",
585 | "
\n",
586 | " \n",
587 | " x3_x0_str_0 | \n",
588 | " 0.000000 | \n",
589 | " 0.000000 | \n",
590 | " 0.000000 | \n",
591 | " 0.000000 | \n",
592 | " 0.000000 | \n",
593 | "
\n",
594 | " \n",
595 | " x3_x0_str_1 | \n",
596 | " 0.000000 | \n",
597 | " 0.000000 | \n",
598 | " 1.000000 | \n",
599 | " 0.000000 | \n",
600 | " 0.000000 | \n",
601 | "
\n",
602 | " \n",
603 | " x3_x0_str_2 | \n",
604 | " 0.000000 | \n",
605 | " 1.000000 | \n",
606 | " 0.000000 | \n",
607 | " 1.000000 | \n",
608 | " 1.000000 | \n",
609 | "
\n",
610 | " \n",
611 | " x3_x0_str_3 | \n",
612 | " 1.000000 | \n",
613 | " 0.000000 | \n",
614 | " 0.000000 | \n",
615 | " 0.000000 | \n",
616 | " 0.000000 | \n",
617 | "
\n",
618 | " \n",
619 | " x3_x0_str_4 | \n",
620 | " 0.000000 | \n",
621 | " 0.000000 | \n",
622 | " 0.000000 | \n",
623 | " 0.000000 | \n",
624 | " 0.000000 | \n",
625 | "
\n",
626 | " \n",
627 | " x6_x0_str_0 | \n",
628 | " 0.000000 | \n",
629 | " 0.000000 | \n",
630 | " 0.000000 | \n",
631 | " 0.000000 | \n",
632 | " 0.000000 | \n",
633 | "
\n",
634 | " \n",
635 | " x6_x0_str_1 | \n",
636 | " 0.000000 | \n",
637 | " 0.000000 | \n",
638 | " 0.000000 | \n",
639 | " 0.000000 | \n",
640 | " 0.000000 | \n",
641 | "
\n",
642 | " \n",
643 | " x6_x0_str_2 | \n",
644 | " 0.000000 | \n",
645 | " 0.000000 | \n",
646 | " 0.000000 | \n",
647 | " 1.000000 | \n",
648 | " 0.000000 | \n",
649 | "
\n",
650 | " \n",
651 | " x6_x0_str_3 | \n",
652 | " 1.000000 | \n",
653 | " 1.000000 | \n",
654 | " 0.000000 | \n",
655 | " 0.000000 | \n",
656 | " 0.000000 | \n",
657 | "
\n",
658 | " \n",
659 | " x6_x0_str_4 | \n",
660 | " 0.000000 | \n",
661 | " 0.000000 | \n",
662 | " 1.000000 | \n",
663 | " 0.000000 | \n",
664 | " 1.000000 | \n",
665 | "
\n",
666 | " \n",
667 | " x6_x0_str_5 | \n",
668 | " 0.000000 | \n",
669 | " 0.000000 | \n",
670 | " 0.000000 | \n",
671 | " 0.000000 | \n",
672 | " 0.000000 | \n",
673 | "
\n",
674 | " \n",
675 | " x6_x0_str_6 | \n",
676 | " 0.000000 | \n",
677 | " 0.000000 | \n",
678 | " 0.000000 | \n",
679 | " 0.000000 | \n",
680 | " 0.000000 | \n",
681 | "
\n",
682 | " \n",
683 | " x6_x0_str_7 | \n",
684 | " 0.000000 | \n",
685 | " 0.000000 | \n",
686 | " 0.000000 | \n",
687 | " 0.000000 | \n",
688 | " 0.000000 | \n",
689 | "
\n",
690 | " \n",
691 | " x7_x0_str_0 | \n",
692 | " 0.000000 | \n",
693 | " 0.000000 | \n",
694 | " 0.000000 | \n",
695 | " 0.000000 | \n",
696 | " 0.000000 | \n",
697 | "
\n",
698 | " \n",
699 | " x7_x0_str_1 | \n",
700 | " 0.000000 | \n",
701 | " 0.000000 | \n",
702 | " 0.000000 | \n",
703 | " 0.000000 | \n",
704 | " 1.000000 | \n",
705 | "
\n",
706 | " \n",
707 | " x7_x0_str_2 | \n",
708 | " 0.000000 | \n",
709 | " 0.000000 | \n",
710 | " 0.000000 | \n",
711 | " 0.000000 | \n",
712 | " 0.000000 | \n",
713 | "
\n",
714 | " \n",
715 | " x7_x0_str_3 | \n",
716 | " 1.000000 | \n",
717 | " 0.000000 | \n",
718 | " 1.000000 | \n",
719 | " 1.000000 | \n",
720 | " 0.000000 | \n",
721 | "
\n",
722 | " \n",
723 | " x7_x0_str_4 | \n",
724 | " 0.000000 | \n",
725 | " 0.000000 | \n",
726 | " 0.000000 | \n",
727 | " 0.000000 | \n",
728 | " 0.000000 | \n",
729 | "
\n",
730 | " \n",
731 | " x7_x0_str_5 | \n",
732 | " 0.000000 | \n",
733 | " 1.000000 | \n",
734 | " 0.000000 | \n",
735 | " 0.000000 | \n",
736 | " 0.000000 | \n",
737 | "
\n",
738 | " \n",
739 | " x7_x0_str_6 | \n",
740 | " 0.000000 | \n",
741 | " 0.000000 | \n",
742 | " 0.000000 | \n",
743 | " 0.000000 | \n",
744 | " 0.000000 | \n",
745 | "
\n",
746 | " \n",
747 | " x0 | \n",
748 | " -10.703380 | \n",
749 | " -4.189989 | \n",
750 | " 10.965457 | \n",
751 | " 11.707606 | \n",
752 | " -10.140494 | \n",
753 | "
\n",
754 | " \n",
755 | " x2 | \n",
756 | " -12.941344 | \n",
757 | " -14.909158 | \n",
758 | " -21.448502 | \n",
759 | " -6.947473 | \n",
760 | " -36.795258 | \n",
761 | "
\n",
762 | " \n",
763 | " x4 | \n",
764 | " -6.464533 | \n",
765 | " 7.366311 | \n",
766 | " 3.887812 | \n",
767 | " -8.306792 | \n",
768 | " -7.842345 | \n",
769 | "
\n",
770 | " \n",
771 | " x5 | \n",
772 | " -0.328846 | \n",
773 | " -11.833629 | \n",
774 | " 13.592603 | \n",
775 | " 10.200299 | \n",
776 | " -3.358164 | \n",
777 | "
\n",
778 | " \n",
779 | "
\n",
780 | "
"
781 | ],
782 | "text/plain": [
783 | " 8640 8641 8642 8643 8644\n",
784 | "x1_x0_str_0 0.000000 0.000000 0.000000 0.000000 0.000000\n",
785 | "x1_x0_str_1 0.000000 0.000000 0.000000 0.000000 0.000000\n",
786 | "x1_x0_str_2 1.000000 0.000000 0.000000 1.000000 1.000000\n",
787 | "x1_x0_str_3 0.000000 1.000000 1.000000 0.000000 0.000000\n",
788 | "x1_x0_str_4 0.000000 0.000000 0.000000 0.000000 0.000000\n",
789 | "x1_x0_str_5 0.000000 0.000000 0.000000 0.000000 0.000000\n",
790 | "x3_x0_str_0 0.000000 0.000000 0.000000 0.000000 0.000000\n",
791 | "x3_x0_str_1 0.000000 0.000000 1.000000 0.000000 0.000000\n",
792 | "x3_x0_str_2 0.000000 1.000000 0.000000 1.000000 1.000000\n",
793 | "x3_x0_str_3 1.000000 0.000000 0.000000 0.000000 0.000000\n",
794 | "x3_x0_str_4 0.000000 0.000000 0.000000 0.000000 0.000000\n",
795 | "x6_x0_str_0 0.000000 0.000000 0.000000 0.000000 0.000000\n",
796 | "x6_x0_str_1 0.000000 0.000000 0.000000 0.000000 0.000000\n",
797 | "x6_x0_str_2 0.000000 0.000000 0.000000 1.000000 0.000000\n",
798 | "x6_x0_str_3 1.000000 1.000000 0.000000 0.000000 0.000000\n",
799 | "x6_x0_str_4 0.000000 0.000000 1.000000 0.000000 1.000000\n",
800 | "x6_x0_str_5 0.000000 0.000000 0.000000 0.000000 0.000000\n",
801 | "x6_x0_str_6 0.000000 0.000000 0.000000 0.000000 0.000000\n",
802 | "x6_x0_str_7 0.000000 0.000000 0.000000 0.000000 0.000000\n",
803 | "x7_x0_str_0 0.000000 0.000000 0.000000 0.000000 0.000000\n",
804 | "x7_x0_str_1 0.000000 0.000000 0.000000 0.000000 1.000000\n",
805 | "x7_x0_str_2 0.000000 0.000000 0.000000 0.000000 0.000000\n",
806 | "x7_x0_str_3 1.000000 0.000000 1.000000 1.000000 0.000000\n",
807 | "x7_x0_str_4 0.000000 0.000000 0.000000 0.000000 0.000000\n",
808 | "x7_x0_str_5 0.000000 1.000000 0.000000 0.000000 0.000000\n",
809 | "x7_x0_str_6 0.000000 0.000000 0.000000 0.000000 0.000000\n",
810 | "x0 -10.703380 -4.189989 10.965457 11.707606 -10.140494\n",
811 | "x2 -12.941344 -14.909158 -21.448502 -6.947473 -36.795258\n",
812 | "x4 -6.464533 7.366311 3.887812 -8.306792 -7.842345\n",
813 | "x5 -0.328846 -11.833629 13.592603 10.200299 -3.358164"
814 | ]
815 | },
816 | "execution_count": 11,
817 | "metadata": {},
818 | "output_type": "execute_result"
819 | }
820 | ],
821 | "source": [
822 | "preprocessed_X_test = mapper.transform(X_test)\n",
823 | "preprocessed_X_test.head().T"
824 | ]
825 | },
826 | {
827 | "cell_type": "markdown",
828 | "metadata": {},
829 | "source": [
830 | "### Ordinal Encoding"
831 | ]
832 | },
833 | {
834 | "cell_type": "code",
835 | "execution_count": 13,
836 | "metadata": {},
837 | "outputs": [
838 | {
839 | "name": "stdout",
840 | "output_type": "stream",
841 | "text": [
842 | "[Pipeline] ........ (step 1 of 2) Processing preprocess, total= 0.1s\n",
843 | "[Pipeline] ............... (step 2 of 2) Processing clf, total= 0.1s\n"
844 | ]
845 | },
846 | {
847 | "data": {
848 | "text/plain": [
849 | "{'auc': 0.8194499904512231, 'pr-auc': 0.7996358755932719}"
850 | ]
851 | },
852 | "execution_count": 13,
853 | "metadata": {},
854 | "output_type": "execute_result"
855 | }
856 | ],
857 | "source": [
858 | "num = [([n], [SimpleImputer()]) for n in numerical_features]\n",
859 | "cat = [([c], [OrdinalEncoder()]) for c in categorical_features]\n",
860 | "mapper = DataFrameMapper(cat + num, df_out=True)\n",
861 | "\n",
862 | "clf = LogisticRegression()\n",
863 | "pipeline = Pipeline([\n",
864 | " ('preprocess', mapper),\n",
865 | " ('clf', clf)\n",
866 | "], verbose=True)\n",
867 | "\n",
868 | "pipeline.fit(X_train, y_train)\n",
869 | "evaluation(pipeline, X_test, y_test)"
870 | ]
871 | },
872 | {
873 | "cell_type": "code",
874 | "execution_count": 145,
875 | "metadata": {},
876 | "outputs": [
877 | {
878 | "data": {
879 | "text/html": [
880 | "\n",
881 | "\n",
894 | "
\n",
895 | " \n",
896 | " \n",
897 | " | \n",
898 | " 9000 | \n",
899 | " 9001 | \n",
900 | " 9002 | \n",
901 | " 9003 | \n",
902 | " 9004 | \n",
903 | "
\n",
904 | " \n",
905 | " \n",
906 | " \n",
907 | " feat_5 | \n",
908 | " 1.000000 | \n",
909 | " 3.000000 | \n",
910 | " 6.000000 | \n",
911 | " 0.000000 | \n",
912 | " 5.000000 | \n",
913 | "
\n",
914 | " \n",
915 | " feat_6 | \n",
916 | " 2.000000 | \n",
917 | " 3.000000 | \n",
918 | " 0.000000 | \n",
919 | " 1.000000 | \n",
920 | " 0.000000 | \n",
921 | "
\n",
922 | " \n",
923 | " feat_7 | \n",
924 | " 0.000000 | \n",
925 | " 2.000000 | \n",
926 | " 6.000000 | \n",
927 | " 4.000000 | \n",
928 | " 1.000000 | \n",
929 | "
\n",
930 | " \n",
931 | " feat_8 | \n",
932 | " 4.000000 | \n",
933 | " 7.000000 | \n",
934 | " 7.000000 | \n",
935 | " 0.000000 | \n",
936 | " 5.000000 | \n",
937 | "
\n",
938 | " \n",
939 | " feat_1 | \n",
940 | " -0.068768 | \n",
941 | " 0.425899 | \n",
942 | " 1.930354 | \n",
943 | " 1.157980 | \n",
944 | " -1.304169 | \n",
945 | "
\n",
946 | " \n",
947 | " feat_2 | \n",
948 | " -1.222878 | \n",
949 | " 0.293660 | \n",
950 | " 1.729959 | \n",
951 | " -0.716538 | \n",
952 | " 1.169799 | \n",
953 | "
\n",
954 | " \n",
955 | " feat_3 | \n",
956 | " -0.714906 | \n",
957 | " 1.509702 | \n",
958 | " -0.429593 | \n",
959 | " -0.708234 | \n",
960 | " -0.304866 | \n",
961 | "
\n",
962 | " \n",
963 | " feat_4 | \n",
964 | " -0.823643 | \n",
965 | " 1.997845 | \n",
966 | " 0.105752 | \n",
967 | " -0.953579 | \n",
968 | " 0.690543 | \n",
969 | "
\n",
970 | " \n",
971 | "
\n",
972 | "
"
973 | ],
974 | "text/plain": [
975 | " 9000 9001 9002 9003 9004\n",
976 | "feat_5 1.000000 3.000000 6.000000 0.000000 5.000000\n",
977 | "feat_6 2.000000 3.000000 0.000000 1.000000 0.000000\n",
978 | "feat_7 0.000000 2.000000 6.000000 4.000000 1.000000\n",
979 | "feat_8 4.000000 7.000000 7.000000 0.000000 5.000000\n",
980 | "feat_1 -0.068768 0.425899 1.930354 1.157980 -1.304169\n",
981 | "feat_2 -1.222878 0.293660 1.729959 -0.716538 1.169799\n",
982 | "feat_3 -0.714906 1.509702 -0.429593 -0.708234 -0.304866\n",
983 | "feat_4 -0.823643 1.997845 0.105752 -0.953579 0.690543"
984 | ]
985 | },
986 | "execution_count": 145,
987 | "metadata": {},
988 | "output_type": "execute_result"
989 | }
990 | ],
991 | "source": [
992 | "preprocessed_X_test = mapper.transform(X_test)\n",
993 | "preprocessed_X_test.head().T"
994 | ]
995 | },
996 | {
997 | "cell_type": "markdown",
998 | "metadata": {},
999 | "source": [
1000 | "**Result**: \n",
1001 | "- `OrdinalEncoding` works when relationships exist between categorical variables (size, weather). Otherwise, prefer `OneHotEncoding`\n",
1002 | "- `OneHotEncoding` takes up space. Hence more training time"
1003 | ]
1004 | },
1005 | {
1006 | "cell_type": "markdown",
1007 | "metadata": {},
1008 | "source": [
1009 | "## 1.3 Data Imbalance"
1010 | ]
1011 | },
1012 | {
1013 | "cell_type": "markdown",
1014 | "metadata": {},
1015 | "source": [
1016 | "What happens if the training data isn't balanced?"
1017 | ]
1018 | },
1019 | {
1020 | "cell_type": "markdown",
1021 | "metadata": {},
1022 | "source": [
1023 | "### Unbalanced"
1024 | ]
1025 | },
1026 | {
1027 | "cell_type": "code",
1028 | "execution_count": 102,
1029 | "metadata": {},
1030 | "outputs": [
1031 | {
1032 | "name": "stdout",
1033 | "output_type": "stream",
1034 | "text": [
1035 | "--------------------------------------------------------------------------------\n",
1036 | "Warning: n_clusters_per_class not in configuration, defaulting to 2\n",
1037 | "Warning: effective_rank not in configuration, defaulting to None\n",
1038 | "Warning: tail_strength not in configuration, defaulting to 0.5\n",
1039 | "Warning: noise not in configuration, defaulting to 0.0\n",
1040 | "Warning: shuffle not in configuration, defaulting to True\n",
1041 | "Creating Classification Dataset...\n",
1042 | "Creating Categorical Features...\n",
1043 | "Warning: insert_dollar not in configuration, defaulting to 'No'\n",
1044 | "Warning: insert_percent not in configuration, defaulting to 'No'\n",
1045 | "Warning: star_schema not in configuration, defaulting to 'No'\n",
1046 | "Writing Train/Test Datasets\n"
1047 | ]
1048 | }
1049 | ],
1050 | "source": [
1051 | "df, label, categorical_features, numerical_features = get_data(balanced=False)\n",
1052 | "\n",
1053 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n",
1054 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n",
1055 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]"
1056 | ]
1057 | },
1058 | {
1059 | "cell_type": "code",
1060 | "execution_count": 103,
1061 | "metadata": {},
1062 | "outputs": [
1063 | {
1064 | "data": {
1065 | "text/plain": [
1066 | "0 8599\n",
1067 | "1 1001\n",
1068 | "Name: y, dtype: int64"
1069 | ]
1070 | },
1071 | "execution_count": 103,
1072 | "metadata": {},
1073 | "output_type": "execute_result"
1074 | }
1075 | ],
1076 | "source": [
1077 | "df[label].value_counts()"
1078 | ]
1079 | },
1080 | {
1081 | "cell_type": "code",
1082 | "execution_count": 104,
1083 | "metadata": {},
1084 | "outputs": [
1085 | {
1086 | "data": {
1087 | "text/plain": [
1088 | "{'auc': 0.7869518716577542, 'pr-auc': 0.39239809756882393}"
1089 | ]
1090 | },
1091 | "execution_count": 104,
1092 | "metadata": {},
1093 | "output_type": "execute_result"
1094 | }
1095 | ],
1096 | "source": [
1097 | "num = [([n], [SimpleImputer()]) for n in numerical_features]\n",
1098 | "cat = [([c], [OrdinalEncoder()]) for c in categorical_features]\n",
1099 | "mapper = DataFrameMapper(cat + num, df_out=True)\n",
1100 | "\n",
1101 | "clf = LogisticRegression()\n",
1102 | "pipeline = Pipeline([\n",
1103 | " ('preprocess', mapper),\n",
1104 | " ('clf', clf)\n",
1105 | "])\n",
1106 | "\n",
1107 | "pipeline.fit(X_train, y_train)\n",
1108 | "evaluation(pipeline, X_test, y_test)"
1109 | ]
1110 | },
1111 | {
1112 | "cell_type": "code",
1113 | "execution_count": 105,
1114 | "metadata": {},
1115 | "outputs": [],
1116 | "source": [
1117 | "y_predict_proba = pipeline.predict_proba(X_test)[:, 1]"
1118 | ]
1119 | },
1120 | {
1121 | "cell_type": "code",
1122 | "execution_count": 106,
1123 | "metadata": {},
1124 | "outputs": [
1125 | {
1126 | "data": {
1127 | "text/plain": [
1128 | "0.10815533327119523"
1129 | ]
1130 | },
1131 | "execution_count": 106,
1132 | "metadata": {},
1133 | "output_type": "execute_result"
1134 | }
1135 | ],
1136 | "source": [
1137 | "y_predict_proba.mean()"
1138 | ]
1139 | },
1140 | {
1141 | "cell_type": "markdown",
1142 | "metadata": {},
1143 | "source": [
1144 | "## Balanced"
1145 | ]
1146 | },
1147 | {
1148 | "cell_type": "code",
1149 | "execution_count": 35,
1150 | "metadata": {},
1151 | "outputs": [
1152 | {
1153 | "name": "stdout",
1154 | "output_type": "stream",
1155 | "text": [
1156 | "--------------------------------------------------------------------------------\n",
1157 | "Warning: n_repeated not in configuration, defaulting to 0\n",
1158 | "Warning: n_clusters_per_class not in configuration, defaulting to 2\n",
1159 | "Warning: effective_rank not in configuration, defaulting to None\n",
1160 | "Warning: tail_strength not in configuration, defaulting to 0.5\n",
1161 | "Warning: noise not in configuration, defaulting to 0.0\n",
1162 | "Warning: shuffle not in configuration, defaulting to True\n",
1163 | "Creating Classification Dataset...\n",
1164 | "Creating Categorical Features...\n",
1165 | "Warning: insert_dollar not in configuration, defaulting to 'No'\n",
1166 | "Warning: insert_percent not in configuration, defaulting to 'No'\n",
1167 | "Warning: star_schema not in configuration, defaulting to 'No'\n",
1168 | "Writing Train/Test Datasets\n"
1169 | ]
1170 | },
1171 | {
1172 | "data": {
1173 | "text/plain": [
1174 | "{'auc': 0.7949023220244715, 'pr-auc': 0.7742073929744453}"
1175 | ]
1176 | },
1177 | "execution_count": 35,
1178 | "metadata": {},
1179 | "output_type": "execute_result"
1180 | }
1181 | ],
1182 | "source": [
1183 | "df, label, categorical_features, numerical_features = get_data(balanced=True)\n",
1184 | "\n",
1185 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n",
1186 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n",
1187 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]\n",
1188 | "\n",
1189 | "num = [([n], [SimpleImputer()]) for n in numerical_features]\n",
1190 | "cat = [([c], [OrdinalEncoder()]) for c in categorical_features]\n",
1191 | "mapper = DataFrameMapper(cat + num, df_out=True)\n",
1192 | "\n",
1193 | "clf = LogisticRegression()\n",
1194 | "pipeline = Pipeline([\n",
1195 | " ('preprocess', mapper),\n",
1196 | " ('clf', clf)\n",
1197 | "])\n",
1198 | "\n",
1199 | "pipeline.fit(X_train, y_train)\n",
1200 | "evaluation(pipeline, X_test, y_test)"
1201 | ]
1202 | },
1203 | {
1204 | "cell_type": "code",
1205 | "execution_count": 36,
1206 | "metadata": {},
1207 | "outputs": [
1208 | {
1209 | "data": {
1210 | "text/plain": [
1211 | "0.4994547544271453"
1212 | ]
1213 | },
1214 | "execution_count": 36,
1215 | "metadata": {},
1216 | "output_type": "execute_result"
1217 | }
1218 | ],
1219 | "source": [
1220 | "y_predict_proba = pipeline.predict_proba(X_test)[:, 1]\n",
1221 | "y_predict_proba.mean()"
1222 | ]
1223 | },
1224 | {
1225 | "cell_type": "markdown",
1226 | "metadata": {},
1227 | "source": [
1228 | "## Dealing with unbalanced data by over-weighting"
1229 | ]
1230 | },
1231 | {
1232 | "cell_type": "code",
1233 | "execution_count": 101,
1234 | "metadata": {},
1235 | "outputs": [
1236 | {
1237 | "name": "stdout",
1238 | "output_type": "stream",
1239 | "text": [
1240 | "--------------------------------------------------------------------------------\n",
1241 | "Warning: n_clusters_per_class not in configuration, defaulting to 2\n",
1242 | "Warning: effective_rank not in configuration, defaulting to None\n",
1243 | "Warning: tail_strength not in configuration, defaulting to 0.5\n",
1244 | "Warning: noise not in configuration, defaulting to 0.0\n",
1245 | "Warning: shuffle not in configuration, defaulting to True\n",
1246 | "Creating Classification Dataset...\n",
1247 | "Creating Categorical Features...\n",
1248 | "Warning: insert_dollar not in configuration, defaulting to 'No'\n",
1249 | "Warning: insert_percent not in configuration, defaulting to 'No'\n",
1250 | "Warning: star_schema not in configuration, defaulting to 'No'\n",
1251 | "Writing Train/Test Datasets\n"
1252 | ]
1253 | },
1254 | {
1255 | "data": {
1256 | "text/plain": [
1257 | "{'auc': 0.8113720373994346, 'pr-auc': 0.30360454333181025}"
1258 | ]
1259 | },
1260 | "execution_count": 101,
1261 | "metadata": {},
1262 | "output_type": "execute_result"
1263 | }
1264 | ],
1265 | "source": [
1266 | "df, label, categorical_features, numerical_features = get_data(balanced=False)\n",
1267 | "\n",
1268 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n",
1269 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n",
1270 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]\n",
1271 | "\n",
1272 | "num = [([n], [SimpleImputer()]) for n in numerical_features]\n",
1273 | "cat = [([c], [OrdinalEncoder()]) for c in categorical_features]\n",
1274 | "mapper = DataFrameMapper(cat + num, df_out=True)\n",
1275 | "\n",
1276 | "clf = LogisticRegression(class_weight='balanced')\n",
1277 | "pipeline = Pipeline([\n",
1278 | " ('preprocess', mapper),\n",
1279 | " ('clf', clf)\n",
1280 | "])\n",
1281 | "\n",
1282 | "pipeline.fit(X_train, y_train)\n",
1283 | "evaluation(pipeline, X_test, y_test)"
1284 | ]
1285 | },
1286 | {
1287 | "cell_type": "markdown",
1288 | "metadata": {},
1289 | "source": [
1290 | "**Result**:\n",
1291 | "- Having an unbalanced dataset doesn't harm ROC AUC much, but harms precision-recall metrics of the positive class.\n",
1292 | "- This is mostly due to lower predicted probability values. "
1293 | ]
1294 | },
1295 | {
1296 | "cell_type": "markdown",
1297 | "metadata": {},
1298 | "source": [
1299 | "## 1.4 Correlated Features"
1300 | ]
1301 | },
1302 | {
1303 | "cell_type": "code",
1304 | "execution_count": 72,
1305 | "metadata": {},
1306 | "outputs": [
1307 | {
1308 | "name": "stdout",
1309 | "output_type": "stream",
1310 | "text": [
1311 | "--------------------------------------------------------------------------------\n",
1312 | "Warning: n_clusters_per_class not in configuration, defaulting to 2\n",
1313 | "Warning: effective_rank not in configuration, defaulting to None\n",
1314 | "Warning: tail_strength not in configuration, defaulting to 0.5\n",
1315 | "Warning: noise not in configuration, defaulting to 0.0\n",
1316 | "Warning: shuffle not in configuration, defaulting to True\n",
1317 | "Creating Classification Dataset...\n",
1318 | "Warning: insert_dollar not in configuration, defaulting to 'No'\n",
1319 | "Warning: insert_percent not in configuration, defaulting to 'No'\n",
1320 | "Warning: star_schema not in configuration, defaulting to 'No'\n",
1321 | "Writing Train/Test Datasets\n"
1322 | ]
1323 | }
1324 | ],
1325 | "source": [
1326 | "df, label, categorical_features, numerical_features = get_data(categorical_features=False, correlated_features=True)"
1327 | ]
1328 | },
1329 | {
1330 | "cell_type": "code",
1331 | "execution_count": 74,
1332 | "metadata": {},
1333 | "outputs": [
1334 | {
1335 | "data": {
1336 | "text/plain": [
1337 | "{'auc': 0.9194931452103352, 'pr-auc': 0.8982012865508728}"
1338 | ]
1339 | },
1340 | "execution_count": 74,
1341 | "metadata": {},
1342 | "output_type": "execute_result"
1343 | }
1344 | ],
1345 | "source": [
1346 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n",
1347 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n",
1348 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]\n",
1349 | "\n",
1350 | "num = [([n], [SimpleImputer()]) for n in numerical_features]\n",
1351 | "mapper = DataFrameMapper(num, df_out=True)\n",
1352 | "\n",
1353 | "clf = LogisticRegression()\n",
1354 | "pipeline = Pipeline([\n",
1355 | " ('preprocess', mapper),\n",
1356 | " ('clf', clf)\n",
1357 | "])\n",
1358 | "\n",
1359 | "pipeline.fit(X_train, y_train)\n",
1360 | "evaluation(pipeline, X_test, y_test)"
1361 | ]
1362 | },
1363 | {
1364 | "cell_type": "code",
1365 | "execution_count": 75,
1366 | "metadata": {},
1367 | "outputs": [
1368 | {
1369 | "data": {
1370 | "text/html": [
1371 | "\n",
1372 | "OLS Regression Results\n",
1373 | "\n",
1374 | " Dep. Variable: | y | R-squared: | 0.483 | \n",
1375 | "
\n",
1376 | "\n",
1377 | " Model: | OLS | Adj. R-squared: | 0.483 | \n",
1378 | "
\n",
1379 | "\n",
1380 | " Method: | Least Squares | F-statistic: | 1345. | \n",
1381 | "
\n",
1382 | "\n",
1383 | " Date: | Sat, 10 Apr 2021 | Prob (F-statistic): | 0.00 | \n",
1384 | "
\n",
1385 | "\n",
1386 | " Time: | 14:31:47 | Log-Likelihood: | -3420.3 | \n",
1387 | "
\n",
1388 | "\n",
1389 | " No. Observations: | 8640 | AIC: | 6855. | \n",
1390 | "
\n",
1391 | "\n",
1392 | " Df Residuals: | 8633 | BIC: | 6904. | \n",
1393 | "
\n",
1394 | "\n",
1395 | " Df Model: | 6 | | | \n",
1396 | "
\n",
1397 | "\n",
1398 | " Covariance Type: | nonrobust | | | \n",
1399 | "
\n",
1400 | "
\n",
1401 | "\n",
1402 | "\n",
1403 | " | coef | std err | t | P>|t| | [0.025 | 0.975] | \n",
1404 | "
\n",
1405 | "\n",
1406 | " const | 0.5906 | 0.006 | 104.957 | 0.000 | 0.580 | 0.602 | \n",
1407 | "
\n",
1408 | "\n",
1409 | " x0 | -0.0043 | 0.000 | -20.533 | 0.000 | -0.005 | -0.004 | \n",
1410 | "
\n",
1411 | "\n",
1412 | " x1 | 0.0335 | 0.002 | 19.438 | 0.000 | 0.030 | 0.037 | \n",
1413 | "
\n",
1414 | "\n",
1415 | " x2 | 0.0447 | 0.001 | 43.084 | 0.000 | 0.043 | 0.047 | \n",
1416 | "
\n",
1417 | "\n",
1418 | " x3 | -0.0076 | 0.000 | -20.533 | 0.000 | -0.008 | -0.007 | \n",
1419 | "
\n",
1420 | "\n",
1421 | " x4 | -0.0142 | 0.001 | -27.006 | 0.000 | -0.015 | -0.013 | \n",
1422 | "
\n",
1423 | "\n",
1424 | " x5 | 0.0125 | 0.000 | 45.550 | 0.000 | 0.012 | 0.013 | \n",
1425 | "
\n",
1426 | "\n",
1427 | " x6 | -0.0017 | 0.002 | -0.997 | 0.319 | -0.005 | 0.002 | \n",
1428 | "
\n",
1429 | "\n",
1430 | " x7 | 0.0270 | 0.001 | 28.009 | 0.000 | 0.025 | 0.029 | \n",
1431 | "
\n",
1432 | "
\n",
1433 | "\n",
1434 | "\n",
1435 | " Omnibus: | 341.439 | Durbin-Watson: | 2.027 | \n",
1436 | "
\n",
1437 | "\n",
1438 | " Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 353.022 | \n",
1439 | "
\n",
1440 | "\n",
1441 | " Skew: | -0.467 | Prob(JB): | 2.20e-77 | \n",
1442 | "
\n",
1443 | "\n",
1444 | " Kurtosis: | 2.670 | Cond. No. | 1.54e+16 | \n",
1445 | "
\n",
1446 | "
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.5e-26. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular."
1447 | ],
1448 | "text/plain": [
1449 | "\n",
1450 | "\"\"\"\n",
1451 | " OLS Regression Results \n",
1452 | "==============================================================================\n",
1453 | "Dep. Variable: y R-squared: 0.483\n",
1454 | "Model: OLS Adj. R-squared: 0.483\n",
1455 | "Method: Least Squares F-statistic: 1345.\n",
1456 | "Date: Sat, 10 Apr 2021 Prob (F-statistic): 0.00\n",
1457 | "Time: 14:31:47 Log-Likelihood: -3420.3\n",
1458 | "No. Observations: 8640 AIC: 6855.\n",
1459 | "Df Residuals: 8633 BIC: 6904.\n",
1460 | "Df Model: 6 \n",
1461 | "Covariance Type: nonrobust \n",
1462 | "==============================================================================\n",
1463 | " coef std err t P>|t| [0.025 0.975]\n",
1464 | "------------------------------------------------------------------------------\n",
1465 | "const 0.5906 0.006 104.957 0.000 0.580 0.602\n",
1466 | "x0 -0.0043 0.000 -20.533 0.000 -0.005 -0.004\n",
1467 | "x1 0.0335 0.002 19.438 0.000 0.030 0.037\n",
1468 | "x2 0.0447 0.001 43.084 0.000 0.043 0.047\n",
1469 | "x3 -0.0076 0.000 -20.533 0.000 -0.008 -0.007\n",
1470 | "x4 -0.0142 0.001 -27.006 0.000 -0.015 -0.013\n",
1471 | "x5 0.0125 0.000 45.550 0.000 0.012 0.013\n",
1472 | "x6 -0.0017 0.002 -0.997 0.319 -0.005 0.002\n",
1473 | "x7 0.0270 0.001 28.009 0.000 0.025 0.029\n",
1474 | "==============================================================================\n",
1475 | "Omnibus: 341.439 Durbin-Watson: 2.027\n",
1476 | "Prob(Omnibus): 0.000 Jarque-Bera (JB): 353.022\n",
1477 | "Skew: -0.467 Prob(JB): 2.20e-77\n",
1478 | "Kurtosis: 2.670 Cond. No. 1.54e+16\n",
1479 | "==============================================================================\n",
1480 | "\n",
1481 | "Notes:\n",
1482 | "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
1483 | "[2] The smallest eigenvalue is 1.5e-26. This might indicate that there are\n",
1484 | "strong multicollinearity problems or that the design matrix is singular.\n",
1485 | "\"\"\""
1486 | ]
1487 | },
1488 | "execution_count": 75,
1489 | "metadata": {},
1490 | "output_type": "execute_result"
1491 | }
1492 | ],
1493 | "source": [
1494 | "import statsmodels.api as sm\n",
1495 | "preprocessed_X_train = mapper.transform(X_train)\n",
1496 | "preprocessed_X_train = sm.add_constant(preprocessed_X_train)\n",
1497 | "results = sm.OLS(y_train, preprocessed_X_train).fit()\n",
1498 | "results.summary()"
1499 | ]
1500 | },
1501 | {
1502 | "cell_type": "code",
1503 | "execution_count": 76,
1504 | "metadata": {},
1505 | "outputs": [
1506 | {
1507 | "name": "stdout",
1508 | "output_type": "stream",
1509 | "text": [
1510 | "x0, inf\n",
1511 | "x1, inf\n",
1512 | "x2, inf\n",
1513 | "x3, inf\n",
1514 | "x4, inf\n",
1515 | "x5, inf\n",
1516 | "x6, inf\n",
1517 | "x7, inf\n"
1518 | ]
1519 | },
1520 | {
1521 | "name": "stderr",
1522 | "output_type": "stream",
1523 | "text": [
1524 | "/usr/local/lib/python3.7/site-packages/statsmodels/stats/outliers_influence.py:193: RuntimeWarning: divide by zero encountered in double_scalars\n",
1525 | " vif = 1. / (1. - r_squared_i)\n"
1526 | ]
1527 | }
1528 | ],
1529 | "source": [
1530 | "from statsmodels.stats.outliers_influence import variance_inflation_factor\n",
1531 | "for column in numerical_features:\n",
1532 | " print(f\"\"\"{column}, {variance_inflation_factor(\n",
1533 | " preprocessed_X_train.values, \n",
1534 | " list(preprocessed_X_train.columns).index(column))}\"\"\")"
1535 | ]
1536 | },
1537 | {
1538 | "cell_type": "code",
1539 | "execution_count": 89,
1540 | "metadata": {},
1541 | "outputs": [
1542 | {
1543 | "data": {
1544 | "text/html": [
1545 | "\n",
1546 | "\n",
1559 | "
\n",
1560 | " \n",
1561 | " \n",
1562 | " | \n",
1563 | " x0 | \n",
1564 | " x1 | \n",
1565 | " x2 | \n",
1566 | " x3 | \n",
1567 | " x4 | \n",
1568 | " x5 | \n",
1569 | " x6 | \n",
1570 | " x7 | \n",
1571 | " y | \n",
1572 | "
\n",
1573 | " \n",
1574 | " \n",
1575 | " \n",
1576 | " x0 | \n",
1577 | " 1.000000 | \n",
1578 | " 0.132384 | \n",
1579 | " -0.097071 | \n",
1580 | " 1.000000 | \n",
1581 | " -0.035234 | \n",
1582 | " -0.162566 | \n",
1583 | " 0.346866 | \n",
1584 | " 0.567626 | \n",
1585 | " -0.000326 | \n",
1586 | "
\n",
1587 | " \n",
1588 | " x1 | \n",
1589 | " 0.132384 | \n",
1590 | " 1.000000 | \n",
1591 | " 0.029556 | \n",
1592 | " 0.132384 | \n",
1593 | " 0.143301 | \n",
1594 | " -0.434811 | \n",
1595 | " 0.091475 | \n",
1596 | " 0.211035 | \n",
1597 | " -0.020443 | \n",
1598 | "
\n",
1599 | " \n",
1600 | " x2 | \n",
1601 | " -0.097071 | \n",
1602 | " 0.029556 | \n",
1603 | " 1.000000 | \n",
1604 | " -0.097071 | \n",
1605 | " 0.272320 | \n",
1606 | " 0.001597 | \n",
1607 | " -0.077077 | \n",
1608 | " -0.546263 | \n",
1609 | " 0.275935 | \n",
1610 | "
\n",
1611 | " \n",
1612 | " x3 | \n",
1613 | " 1.000000 | \n",
1614 | " 0.132384 | \n",
1615 | " -0.097071 | \n",
1616 | " 1.000000 | \n",
1617 | " -0.035234 | \n",
1618 | " -0.162566 | \n",
1619 | " 0.346866 | \n",
1620 | " 0.567626 | \n",
1621 | " -0.000326 | \n",
1622 | "
\n",
1623 | " \n",
1624 | " x4 | \n",
1625 | " -0.035234 | \n",
1626 | " 0.143301 | \n",
1627 | " 0.272320 | \n",
1628 | " -0.035234 | \n",
1629 | " 1.000000 | \n",
1630 | " -0.144259 | \n",
1631 | " 0.144366 | \n",
1632 | " 0.314752 | \n",
1633 | " -0.008192 | \n",
1634 | "
\n",
1635 | " \n",
1636 | " x5 | \n",
1637 | " -0.162566 | \n",
1638 | " -0.434811 | \n",
1639 | " 0.001597 | \n",
1640 | " -0.162566 | \n",
1641 | " -0.144259 | \n",
1642 | " 1.000000 | \n",
1643 | " 0.120178 | \n",
1644 | " 0.083330 | \n",
1645 | " 0.544321 | \n",
1646 | "
\n",
1647 | " \n",
1648 | " x6 | \n",
1649 | " 0.346866 | \n",
1650 | " 0.091475 | \n",
1651 | " -0.077077 | \n",
1652 | " 0.346866 | \n",
1653 | " 0.144366 | \n",
1654 | " 0.120178 | \n",
1655 | " 1.000000 | \n",
1656 | " 0.649177 | \n",
1657 | " 0.308940 | \n",
1658 | "
\n",
1659 | " \n",
1660 | " x7 | \n",
1661 | " 0.567626 | \n",
1662 | " 0.211035 | \n",
1663 | " -0.546263 | \n",
1664 | " 0.567626 | \n",
1665 | " 0.314752 | \n",
1666 | " 0.083330 | \n",
1667 | " 0.649177 | \n",
1668 | " 1.000000 | \n",
1669 | " 0.071201 | \n",
1670 | "
\n",
1671 | " \n",
1672 | " y | \n",
1673 | " -0.000326 | \n",
1674 | " -0.020443 | \n",
1675 | " 0.275935 | \n",
1676 | " -0.000326 | \n",
1677 | " -0.008192 | \n",
1678 | " 0.544321 | \n",
1679 | " 0.308940 | \n",
1680 | " 0.071201 | \n",
1681 | " 1.000000 | \n",
1682 | "
\n",
1683 | " \n",
1684 | "
\n",
1685 | "
"
1686 | ],
1687 | "text/plain": [
1688 | " x0 x1 x2 x3 x4 x5 x6 \\\n",
1689 | "x0 1.000000 0.132384 -0.097071 1.000000 -0.035234 -0.162566 0.346866 \n",
1690 | "x1 0.132384 1.000000 0.029556 0.132384 0.143301 -0.434811 0.091475 \n",
1691 | "x2 -0.097071 0.029556 1.000000 -0.097071 0.272320 0.001597 -0.077077 \n",
1692 | "x3 1.000000 0.132384 -0.097071 1.000000 -0.035234 -0.162566 0.346866 \n",
1693 | "x4 -0.035234 0.143301 0.272320 -0.035234 1.000000 -0.144259 0.144366 \n",
1694 | "x5 -0.162566 -0.434811 0.001597 -0.162566 -0.144259 1.000000 0.120178 \n",
1695 | "x6 0.346866 0.091475 -0.077077 0.346866 0.144366 0.120178 1.000000 \n",
1696 | "x7 0.567626 0.211035 -0.546263 0.567626 0.314752 0.083330 0.649177 \n",
1697 | "y -0.000326 -0.020443 0.275935 -0.000326 -0.008192 0.544321 0.308940 \n",
1698 | "\n",
1699 | " x7 y \n",
1700 | "x0 0.567626 -0.000326 \n",
1701 | "x1 0.211035 -0.020443 \n",
1702 | "x2 -0.546263 0.275935 \n",
1703 | "x3 0.567626 -0.000326 \n",
1704 | "x4 0.314752 -0.008192 \n",
1705 | "x5 0.083330 0.544321 \n",
1706 | "x6 0.649177 0.308940 \n",
1707 | "x7 1.000000 0.071201 \n",
1708 | "y 0.071201 1.000000 "
1709 | ]
1710 | },
1711 | "execution_count": 89,
1712 | "metadata": {},
1713 | "output_type": "execute_result"
1714 | }
1715 | ],
1716 | "source": [
1717 | "df.corr()"
1718 | ]
1719 | },
1720 | {
1721 | "cell_type": "markdown",
1722 | "metadata": {},
1723 | "source": [
1724 | "### Start with removing perfect multicollinearity"
1725 | ]
1726 | },
1727 | {
1728 | "cell_type": "code",
1729 | "execution_count": 90,
1730 | "metadata": {},
1731 | "outputs": [
1732 | {
1733 | "data": {
1734 | "text/plain": [
1735 | "{'auc': 0.9194974891835068, 'pr-auc': 0.8982064967028441}"
1736 | ]
1737 | },
1738 | "execution_count": 90,
1739 | "metadata": {},
1740 | "output_type": "execute_result"
1741 | }
1742 | ],
1743 | "source": [
1744 | "numerical_features = ['x0', 'x1', 'x2', 'x4', 'x5', 'x6', 'x7'] # remove x3\n",
1745 | "\n",
1746 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n",
1747 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n",
1748 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]\n",
1749 | "\n",
1750 | "num = [([n], [SimpleImputer()]) for n in numerical_features]\n",
1751 | "mapper = DataFrameMapper(num, df_out=True)\n",
1752 | "\n",
1753 | "clf = LogisticRegression()\n",
1754 | "pipeline = Pipeline([\n",
1755 | " ('preprocess', mapper),\n",
1756 | " ('clf', clf)\n",
1757 | "])\n",
1758 | "\n",
1759 | "pipeline.fit(X_train, y_train)\n",
1760 | "evaluation(pipeline, X_test, y_test)"
1761 | ]
1762 | },
1763 | {
1764 | "cell_type": "code",
1765 | "execution_count": 91,
1766 | "metadata": {},
1767 | "outputs": [
1768 | {
1769 | "data": {
1770 | "text/html": [
1771 | "\n",
1772 | "OLS Regression Results\n",
1773 | "\n",
1774 | " Dep. Variable: | y | R-squared: | 0.483 | \n",
1775 | "
\n",
1776 | "\n",
1777 | " Model: | OLS | Adj. R-squared: | 0.483 | \n",
1778 | "
\n",
1779 | "\n",
1780 | " Method: | Least Squares | F-statistic: | 1345. | \n",
1781 | "
\n",
1782 | "\n",
1783 | " Date: | Sat, 10 Apr 2021 | Prob (F-statistic): | 0.00 | \n",
1784 | "
\n",
1785 | "\n",
1786 | " Time: | 14:37:17 | Log-Likelihood: | -3420.3 | \n",
1787 | "
\n",
1788 | "\n",
1789 | " No. Observations: | 8640 | AIC: | 6855. | \n",
1790 | "
\n",
1791 | "\n",
1792 | " Df Residuals: | 8633 | BIC: | 6904. | \n",
1793 | "
\n",
1794 | "\n",
1795 | " Df Model: | 6 | | | \n",
1796 | "
\n",
1797 | "\n",
1798 | " Covariance Type: | nonrobust | | | \n",
1799 | "
\n",
1800 | "
\n",
1801 | "\n",
1802 | "\n",
1803 | " | coef | std err | t | P>|t| | [0.025 | 0.975] | \n",
1804 | "
\n",
1805 | "\n",
1806 | " const | 0.5906 | 0.006 | 104.957 | 0.000 | 0.580 | 0.602 | \n",
1807 | "
\n",
1808 | "\n",
1809 | " x0 | -0.0172 | 0.001 | -20.533 | 0.000 | -0.019 | -0.016 | \n",
1810 | "
\n",
1811 | "\n",
1812 | " x1 | 0.0357 | 0.002 | 19.852 | 0.000 | 0.032 | 0.039 | \n",
1813 | "
\n",
1814 | "\n",
1815 | " x2 | 0.0438 | 0.001 | 43.527 | 0.000 | 0.042 | 0.046 | \n",
1816 | "
\n",
1817 | "\n",
1818 | " x4 | -0.0138 | 0.001 | -26.951 | 0.000 | -0.015 | -0.013 | \n",
1819 | "
\n",
1820 | "\n",
1821 | " x5 | 0.0127 | 0.000 | 47.049 | 0.000 | 0.012 | 0.013 | \n",
1822 | "
\n",
1823 | "\n",
1824 | " x6 | -0.0005 | 0.002 | -0.294 | 0.769 | -0.004 | 0.003 | \n",
1825 | "
\n",
1826 | "\n",
1827 | " x7 | 0.0260 | 0.001 | 28.111 | 0.000 | 0.024 | 0.028 | \n",
1828 | "
\n",
1829 | "
\n",
1830 | "\n",
1831 | "\n",
1832 | " Omnibus: | 341.439 | Durbin-Watson: | 2.027 | \n",
1833 | "
\n",
1834 | "\n",
1835 | " Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 353.022 | \n",
1836 | "
\n",
1837 | "\n",
1838 | " Skew: | -0.467 | Prob(JB): | 2.20e-77 | \n",
1839 | "
\n",
1840 | "\n",
1841 | " Kurtosis: | 2.670 | Cond. No. | 1.67e+16 | \n",
1842 | "
\n",
1843 | "
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 9e-27. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular."
1844 | ],
1845 | "text/plain": [
1846 | "\n",
1847 | "\"\"\"\n",
1848 | " OLS Regression Results \n",
1849 | "==============================================================================\n",
1850 | "Dep. Variable: y R-squared: 0.483\n",
1851 | "Model: OLS Adj. R-squared: 0.483\n",
1852 | "Method: Least Squares F-statistic: 1345.\n",
1853 | "Date: Sat, 10 Apr 2021 Prob (F-statistic): 0.00\n",
1854 | "Time: 14:37:17 Log-Likelihood: -3420.3\n",
1855 | "No. Observations: 8640 AIC: 6855.\n",
1856 | "Df Residuals: 8633 BIC: 6904.\n",
1857 | "Df Model: 6 \n",
1858 | "Covariance Type: nonrobust \n",
1859 | "==============================================================================\n",
1860 | " coef std err t P>|t| [0.025 0.975]\n",
1861 | "------------------------------------------------------------------------------\n",
1862 | "const 0.5906 0.006 104.957 0.000 0.580 0.602\n",
1863 | "x0 -0.0172 0.001 -20.533 0.000 -0.019 -0.016\n",
1864 | "x1 0.0357 0.002 19.852 0.000 0.032 0.039\n",
1865 | "x2 0.0438 0.001 43.527 0.000 0.042 0.046\n",
1866 | "x4 -0.0138 0.001 -26.951 0.000 -0.015 -0.013\n",
1867 | "x5 0.0127 0.000 47.049 0.000 0.012 0.013\n",
1868 | "x6 -0.0005 0.002 -0.294 0.769 -0.004 0.003\n",
1869 | "x7 0.0260 0.001 28.111 0.000 0.024 0.028\n",
1870 | "==============================================================================\n",
1871 | "Omnibus: 341.439 Durbin-Watson: 2.027\n",
1872 | "Prob(Omnibus): 0.000 Jarque-Bera (JB): 353.022\n",
1873 | "Skew: -0.467 Prob(JB): 2.20e-77\n",
1874 | "Kurtosis: 2.670 Cond. No. 1.67e+16\n",
1875 | "==============================================================================\n",
1876 | "\n",
1877 | "Notes:\n",
1878 | "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
1879 | "[2] The smallest eigenvalue is 9e-27. This might indicate that there are\n",
1880 | "strong multicollinearity problems or that the design matrix is singular.\n",
1881 | "\"\"\""
1882 | ]
1883 | },
1884 | "execution_count": 91,
1885 | "metadata": {},
1886 | "output_type": "execute_result"
1887 | }
1888 | ],
1889 | "source": [
1890 | "preprocessed_X_train = mapper.transform(X_train)\n",
1891 | "preprocessed_X_train = sm.add_constant(preprocessed_X_train)\n",
1892 | "results = sm.OLS(y_train, preprocessed_X_train).fit()\n",
1893 | "results.summary()"
1894 | ]
1895 | },
1896 | {
1897 | "cell_type": "code",
1898 | "execution_count": 88,
1899 | "metadata": {},
1900 | "outputs": [
1901 | {
1902 | "name": "stdout",
1903 | "output_type": "stream",
1904 | "text": [
1905 | "x0, inf\n",
1906 | "x1, inf\n",
1907 | "x2, inf\n",
1908 | "x4, inf\n",
1909 | "x5, inf\n",
1910 | "x6, inf\n",
1911 | "x7, inf\n"
1912 | ]
1913 | },
1914 | {
1915 | "name": "stderr",
1916 | "output_type": "stream",
1917 | "text": [
1918 | "/usr/local/lib/python3.7/site-packages/statsmodels/stats/outliers_influence.py:193: RuntimeWarning: divide by zero encountered in double_scalars\n",
1919 | " vif = 1. / (1. - r_squared_i)\n"
1920 | ]
1921 | }
1922 | ],
1923 | "source": [
1924 | "for column in numerical_features:\n",
1925 | " print(f\"\"\"{column}, {variance_inflation_factor(\n",
1926 | " preprocessed_X_train.values, \n",
1927 | " list(preprocessed_X_train.columns).index(column))}\"\"\")"
1928 | ]
1929 | },
1930 | {
1931 | "cell_type": "markdown",
1932 | "metadata": {},
1933 | "source": [
1934 | "Removing a feature with perfect multicollinearity:\n",
1935 | "- Improves interpretability of the coefficients (like `x0` here)\n",
1936 | "- Logistic Regression doesn't lose performance. "
1937 | ]
1938 | },
1939 | {
1940 | "cell_type": "markdown",
1941 | "metadata": {},
1942 | "source": [
1943 | "## Remove multicollinearity"
1944 | ]
1945 | },
1946 | {
1947 | "cell_type": "code",
1948 | "execution_count": 92,
1949 | "metadata": {},
1950 | "outputs": [
1951 | {
1952 | "data": {
1953 | "text/plain": [
1954 | "{'auc': 0.9194974891835068, 'pr-auc': 0.8982064967028441}"
1955 | ]
1956 | },
1957 | "execution_count": 92,
1958 | "metadata": {},
1959 | "output_type": "execute_result"
1960 | }
1961 | ],
1962 | "source": [
1963 | "numerical_features = ['x0', 'x1', 'x2', 'x4', 'x5', 'x7'] # remove x3, x6\n",
1964 | "\n",
1965 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n",
1966 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n",
1967 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]\n",
1968 | "\n",
1969 | "num = [([n], [SimpleImputer()]) for n in numerical_features]\n",
1970 | "mapper = DataFrameMapper(num, df_out=True)\n",
1971 | "\n",
1972 | "clf = LogisticRegression()\n",
1973 | "pipeline = Pipeline([\n",
1974 | " ('preprocess', mapper),\n",
1975 | " ('clf', clf)\n",
1976 | "])\n",
1977 | "\n",
1978 | "pipeline.fit(X_train, y_train)\n",
1979 | "evaluation(pipeline, X_test, y_test)"
1980 | ]
1981 | },
1982 | {
1983 | "cell_type": "code",
1984 | "execution_count": 93,
1985 | "metadata": {},
1986 | "outputs": [
1987 | {
1988 | "data": {
1989 | "text/html": [
1990 | "\n",
1991 | "OLS Regression Results\n",
1992 | "\n",
1993 | " Dep. Variable: | y | R-squared: | 0.483 | \n",
1994 | "
\n",
1995 | "\n",
1996 | " Model: | OLS | Adj. R-squared: | 0.483 | \n",
1997 | "
\n",
1998 | "\n",
1999 | " Method: | Least Squares | F-statistic: | 1345. | \n",
2000 | "
\n",
2001 | "\n",
2002 | " Date: | Sat, 10 Apr 2021 | Prob (F-statistic): | 0.00 | \n",
2003 | "
\n",
2004 | "\n",
2005 | " Time: | 14:38:52 | Log-Likelihood: | -3420.3 | \n",
2006 | "
\n",
2007 | "\n",
2008 | " No. Observations: | 8640 | AIC: | 6855. | \n",
2009 | "
\n",
2010 | "\n",
2011 | " Df Residuals: | 8633 | BIC: | 6904. | \n",
2012 | "
\n",
2013 | "\n",
2014 | " Df Model: | 6 | | | \n",
2015 | "
\n",
2016 | "\n",
2017 | " Covariance Type: | nonrobust | | | \n",
2018 | "
\n",
2019 | "
\n",
2020 | "\n",
2021 | "\n",
2022 | " | coef | std err | t | P>|t| | [0.025 | 0.975] | \n",
2023 | "
\n",
2024 | "\n",
2025 | " const | 0.5906 | 0.006 | 104.957 | 0.000 | 0.580 | 0.602 | \n",
2026 | "
\n",
2027 | "\n",
2028 | " x0 | -0.0169 | 0.001 | -17.744 | 0.000 | -0.019 | -0.015 | \n",
2029 | "
\n",
2030 | "\n",
2031 | " x1 | 0.0366 | 0.005 | 7.684 | 0.000 | 0.027 | 0.046 | \n",
2032 | "
\n",
2033 | "\n",
2034 | " x2 | 0.0434 | 0.001 | 40.460 | 0.000 | 0.041 | 0.045 | \n",
2035 | "
\n",
2036 | "\n",
2037 | " x4 | -0.0137 | 0.001 | -25.047 | 0.000 | -0.015 | -0.013 | \n",
2038 | "
\n",
2039 | "\n",
2040 | " x5 | 0.0128 | 0.000 | 33.306 | 0.000 | 0.012 | 0.014 | \n",
2041 | "
\n",
2042 | "\n",
2043 | " x7 | 0.0256 | 0.001 | 27.862 | 0.000 | 0.024 | 0.027 | \n",
2044 | "
\n",
2045 | "
\n",
2046 | "\n",
2047 | "\n",
2048 | " Omnibus: | 341.439 | Durbin-Watson: | 2.027 | \n",
2049 | "
\n",
2050 | "\n",
2051 | " Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 353.022 | \n",
2052 | "
\n",
2053 | "\n",
2054 | " Skew: | -0.467 | Prob(JB): | 2.20e-77 | \n",
2055 | "
\n",
2056 | "\n",
2057 | " Kurtosis: | 2.670 | Cond. No. | 30.1 | \n",
2058 | "
\n",
2059 | "
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified."
2060 | ],
2061 | "text/plain": [
2062 | "\n",
2063 | "\"\"\"\n",
2064 | " OLS Regression Results \n",
2065 | "==============================================================================\n",
2066 | "Dep. Variable: y R-squared: 0.483\n",
2067 | "Model: OLS Adj. R-squared: 0.483\n",
2068 | "Method: Least Squares F-statistic: 1345.\n",
2069 | "Date: Sat, 10 Apr 2021 Prob (F-statistic): 0.00\n",
2070 | "Time: 14:38:52 Log-Likelihood: -3420.3\n",
2071 | "No. Observations: 8640 AIC: 6855.\n",
2072 | "Df Residuals: 8633 BIC: 6904.\n",
2073 | "Df Model: 6 \n",
2074 | "Covariance Type: nonrobust \n",
2075 | "==============================================================================\n",
2076 | " coef std err t P>|t| [0.025 0.975]\n",
2077 | "------------------------------------------------------------------------------\n",
2078 | "const 0.5906 0.006 104.957 0.000 0.580 0.602\n",
2079 | "x0 -0.0169 0.001 -17.744 0.000 -0.019 -0.015\n",
2080 | "x1 0.0366 0.005 7.684 0.000 0.027 0.046\n",
2081 | "x2 0.0434 0.001 40.460 0.000 0.041 0.045\n",
2082 | "x4 -0.0137 0.001 -25.047 0.000 -0.015 -0.013\n",
2083 | "x5 0.0128 0.000 33.306 0.000 0.012 0.014\n",
2084 | "x7 0.0256 0.001 27.862 0.000 0.024 0.027\n",
2085 | "==============================================================================\n",
2086 | "Omnibus: 341.439 Durbin-Watson: 2.027\n",
2087 | "Prob(Omnibus): 0.000 Jarque-Bera (JB): 353.022\n",
2088 | "Skew: -0.467 Prob(JB): 2.20e-77\n",
2089 | "Kurtosis: 2.670 Cond. No. 30.1\n",
2090 | "==============================================================================\n",
2091 | "\n",
2092 | "Notes:\n",
2093 | "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
2094 | "\"\"\""
2095 | ]
2096 | },
2097 | "execution_count": 93,
2098 | "metadata": {},
2099 | "output_type": "execute_result"
2100 | }
2101 | ],
2102 | "source": [
2103 | "preprocessed_X_train = mapper.transform(X_train)\n",
2104 | "preprocessed_X_train = sm.add_constant(preprocessed_X_train)\n",
2105 | "results = sm.OLS(y_train, preprocessed_X_train).fit()\n",
2106 | "results.summary()"
2107 | ]
2108 | },
2109 | {
2110 | "cell_type": "code",
2111 | "execution_count": 94,
2112 | "metadata": {},
2113 | "outputs": [
2114 | {
2115 | "name": "stdout",
2116 | "output_type": "stream",
2117 | "text": [
2118 | "x0, 4.358204798860465\n",
2119 | "x1, 1.8526871839909662\n",
2120 | "x2, 5.622338237184614\n",
2121 | "x4, 4.123960180952725\n",
2122 | "x5, 2.6095687697415917\n",
2123 | "x7, 10.922197872534808\n"
2124 | ]
2125 | }
2126 | ],
2127 | "source": [
2128 | "for column in numerical_features:\n",
2129 | " print(f\"\"\"{column}, {variance_inflation_factor(\n",
2130 | " preprocessed_X_train.values, \n",
2131 | " list(preprocessed_X_train.columns).index(column))}\"\"\")"
2132 | ]
2133 | },
2134 | {
2135 | "cell_type": "markdown",
2136 | "metadata": {},
2137 | "source": [
2138 |     "Removing `x6`, we lost neither explainability nor performance."
2139 | ]
2140 | },
2141 | {
2142 | "cell_type": "markdown",
2143 | "metadata": {},
2144 | "source": [
2145 |     "#### Remove x7 with high VIF"
2146 | ]
2147 | },
2148 | {
2149 | "cell_type": "code",
2150 | "execution_count": 95,
2151 | "metadata": {},
2152 | "outputs": [
2153 | {
2154 | "data": {
2155 | "text/plain": [
2156 | "{'auc': 0.8916873729387849, 'pr-auc': 0.858019953399781}"
2157 | ]
2158 | },
2159 | "execution_count": 95,
2160 | "metadata": {},
2161 | "output_type": "execute_result"
2162 | }
2163 | ],
2164 | "source": [
2165 | "numerical_features = ['x0', 'x1', 'x2', 'x4', 'x5'] # remove x3, x6, x7\n",
2166 | "\n",
2167 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n",
2168 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n",
2169 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]\n",
2170 | "\n",
2171 | "num = [([n], [SimpleImputer()]) for n in numerical_features]\n",
2172 | "mapper = DataFrameMapper(num, df_out=True)\n",
2173 | "\n",
2174 | "clf = LogisticRegression()\n",
2175 | "pipeline = Pipeline([\n",
2176 | " ('preprocess', mapper),\n",
2177 | " ('clf', clf)\n",
2178 | "])\n",
2179 | "\n",
2180 | "pipeline.fit(X_train, y_train)\n",
2181 | "evaluation(pipeline, X_test, y_test)"
2182 | ]
2183 | },
2184 | {
2185 | "cell_type": "code",
2186 | "execution_count": 96,
2187 | "metadata": {},
2188 | "outputs": [
2189 | {
2190 | "data": {
2191 | "text/html": [
2192 | "\n",
2193 | "OLS Regression Results\n",
2194 | "\n",
2195 | " Dep. Variable: | y | R-squared: | 0.437 | \n",
2196 | "
\n",
2197 | "\n",
2198 | " Model: | OLS | Adj. R-squared: | 0.436 | \n",
2199 | "
\n",
2200 | "\n",
2201 | " Method: | Least Squares | F-statistic: | 1338. | \n",
2202 | "
\n",
2203 | "\n",
2204 | " Date: | Sat, 10 Apr 2021 | Prob (F-statistic): | 0.00 | \n",
2205 | "
\n",
2206 | "\n",
2207 | " Time: | 14:40:42 | Log-Likelihood: | -3792.2 | \n",
2208 | "
\n",
2209 | "\n",
2210 | " No. Observations: | 8640 | AIC: | 7596. | \n",
2211 | "
\n",
2212 | "\n",
2213 | " Df Residuals: | 8634 | BIC: | 7639. | \n",
2214 | "
\n",
2215 | "\n",
2216 | " Df Model: | 5 | | | \n",
2217 | "
\n",
2218 | "\n",
2219 | " Covariance Type: | nonrobust | | | \n",
2220 | "
\n",
2221 | "
\n",
2222 | "\n",
2223 | "\n",
2224 | " | coef | std err | t | P>|t| | [0.025 | 0.975] | \n",
2225 | "
\n",
2226 | "\n",
2227 | " const | 0.6395 | 0.006 | 114.609 | 0.000 | 0.629 | 0.650 | \n",
2228 | "
\n",
2229 | "\n",
2230 | " x0 | 0.0063 | 0.000 | 12.833 | 0.000 | 0.005 | 0.007 | \n",
2231 | "
\n",
2232 | "\n",
2233 | " x1 | 0.1125 | 0.004 | 27.526 | 0.000 | 0.104 | 0.121 | \n",
2234 | "
\n",
2235 | "\n",
2236 | " x2 | 0.0166 | 0.000 | 33.605 | 0.000 | 0.016 | 0.018 | \n",
2237 | "
\n",
2238 | "\n",
2239 | " x4 | -0.0007 | 0.000 | -2.314 | 0.021 | -0.001 | -0.000 | \n",
2240 | "
\n",
2241 | "\n",
2242 | " x5 | 0.0204 | 0.000 | 73.414 | 0.000 | 0.020 | 0.021 | \n",
2243 | "
\n",
2244 | "
\n",
2245 | "\n",
2246 | "\n",
2247 | " Omnibus: | 381.662 | Durbin-Watson: | 2.031 | \n",
2248 | "
\n",
2249 | "\n",
2250 | " Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 330.564 | \n",
2251 | "
\n",
2252 | "\n",
2253 | " Skew: | -0.412 | Prob(JB): | 1.66e-72 | \n",
2254 | "
\n",
2255 | "\n",
2256 | " Kurtosis: | 2.511 | Cond. No. | 26.7 | \n",
2257 | "
\n",
2258 | "
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified."
2259 | ],
2260 | "text/plain": [
2261 | "\n",
2262 | "\"\"\"\n",
2263 | " OLS Regression Results \n",
2264 | "==============================================================================\n",
2265 | "Dep. Variable: y R-squared: 0.437\n",
2266 | "Model: OLS Adj. R-squared: 0.436\n",
2267 | "Method: Least Squares F-statistic: 1338.\n",
2268 | "Date: Sat, 10 Apr 2021 Prob (F-statistic): 0.00\n",
2269 | "Time: 14:40:42 Log-Likelihood: -3792.2\n",
2270 | "No. Observations: 8640 AIC: 7596.\n",
2271 | "Df Residuals: 8634 BIC: 7639.\n",
2272 | "Df Model: 5 \n",
2273 | "Covariance Type: nonrobust \n",
2274 | "==============================================================================\n",
2275 | " coef std err t P>|t| [0.025 0.975]\n",
2276 | "------------------------------------------------------------------------------\n",
2277 | "const 0.6395 0.006 114.609 0.000 0.629 0.650\n",
2278 | "x0 0.0063 0.000 12.833 0.000 0.005 0.007\n",
2279 | "x1 0.1125 0.004 27.526 0.000 0.104 0.121\n",
2280 | "x2 0.0166 0.000 33.605 0.000 0.016 0.018\n",
2281 | "x4 -0.0007 0.000 -2.314 0.021 -0.001 -0.000\n",
2282 | "x5 0.0204 0.000 73.414 0.000 0.020 0.021\n",
2283 | "==============================================================================\n",
2284 | "Omnibus: 381.662 Durbin-Watson: 2.031\n",
2285 | "Prob(Omnibus): 0.000 Jarque-Bera (JB): 330.564\n",
2286 | "Skew: -0.412 Prob(JB): 1.66e-72\n",
2287 | "Kurtosis: 2.511 Cond. No. 26.7\n",
2288 | "==============================================================================\n",
2289 | "\n",
2290 | "Notes:\n",
2291 | "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
2292 | "\"\"\""
2293 | ]
2294 | },
2295 | "execution_count": 96,
2296 | "metadata": {},
2297 | "output_type": "execute_result"
2298 | }
2299 | ],
2300 | "source": [
2301 | "preprocessed_X_train = mapper.transform(X_train)\n",
2302 | "preprocessed_X_train = sm.add_constant(preprocessed_X_train)\n",
2303 | "results = sm.OLS(y_train, preprocessed_X_train).fit()\n",
2304 | "results.summary()"
2305 | ]
2306 | },
2307 | {
2308 | "cell_type": "code",
2309 | "execution_count": 97,
2310 | "metadata": {},
2311 | "outputs": [
2312 | {
2313 | "name": "stdout",
2314 | "output_type": "stream",
2315 | "text": [
2316 | "x0, 1.0434492528061576\n",
2317 | "x1, 1.2487373171729157\n",
2318 | "x2, 1.089610333638892\n",
2319 | "x4, 1.1174255753042328\n",
2320 | "x5, 1.2630916367080673\n"
2321 | ]
2322 | }
2323 | ],
2324 | "source": [
2325 | "for column in numerical_features:\n",
2326 | " print(f\"\"\"{column}, {variance_inflation_factor(\n",
2327 | " preprocessed_X_train.values, \n",
2328 | " list(preprocessed_X_train.columns).index(column))}\"\"\")"
2329 | ]
2330 | },
2331 | {
2332 | "cell_type": "markdown",
2333 | "metadata": {},
2334 | "source": [
2335 | "Removing `x7`:\n",
2336 | "- Helped explainability \n",
2337 | "- Negatively impacted performance"
2338 | ]
2339 | },
2340 | {
2341 | "cell_type": "markdown",
2342 | "metadata": {},
2343 | "source": [
2344 |     "Remedy: add polynomial terms, or try other models that capture more complex interactions."
2345 | ]
2346 | },
2347 | {
2348 | "cell_type": "markdown",
2349 | "metadata": {},
2350 | "source": [
2351 | "## 1.5 Missing Values"
2352 | ]
2353 | },
2354 | {
2355 | "cell_type": "code",
2356 | "execution_count": 98,
2357 | "metadata": {},
2358 | "outputs": [
2359 | {
2360 | "name": "stdout",
2361 | "output_type": "stream",
2362 | "text": [
2363 | "--------------------------------------------------------------------------------\n",
2364 | "Warning: n_clusters_per_class not in configuration, defaulting to 2\n",
2365 | "Warning: effective_rank not in configuration, defaulting to None\n",
2366 | "Warning: tail_strength not in configuration, defaulting to 0.5\n",
2367 | "Warning: noise not in configuration, defaulting to 0.0\n",
2368 | "Warning: shuffle not in configuration, defaulting to True\n",
2369 | "Creating Classification Dataset...\n",
2370 | "Creating Categorical Features...\n",
2371 | "Warning: insert_dollar not in configuration, defaulting to 'No'\n",
2372 | "Warning: insert_percent not in configuration, defaulting to 'No'\n",
2373 | "Warning: star_schema not in configuration, defaulting to 'No'\n",
2374 | "Writing Train/Test Datasets\n"
2375 | ]
2376 | }
2377 | ],
2378 | "source": [
2379 | "df, label, categorical_features, numerical_features = get_data(missing_values=True)"
2380 | ]
2381 | },
2382 | {
2383 | "cell_type": "code",
2384 | "execution_count": 99,
2385 | "metadata": {},
2386 | "outputs": [
2387 | {
2388 | "ename": "ValueError",
2389 | "evalue": "Input contains NaN, infinity or a value too large for dtype('float64').",
2390 | "output_type": "error",
2391 | "traceback": [
2392 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
2393 | "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
2394 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 12\u001b[0m ])\n\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 14\u001b[0;31m \u001b[0mpipeline\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnumerical_features\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 15\u001b[0m \u001b[0mevaluation\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpipeline\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnumerical_features\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
2395 | "\u001b[0;32m/usr/local/lib/python3.7/site-packages/sklearn/pipeline.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[1;32m 344\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_final_estimator\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;34m'passthrough'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 345\u001b[0m \u001b[0mfit_params_last_step\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfit_params_steps\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msteps\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 346\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_final_estimator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mXt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params_last_step\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 347\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 348\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
2396 | "\u001b[0;32m/usr/local/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 1344\u001b[0m X, y = self._validate_data(X, y, accept_sparse='csr', dtype=_dtype,\n\u001b[1;32m 1345\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"C\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1346\u001b[0;31m accept_large_sparse=solver != 'liblinear')\n\u001b[0m\u001b[1;32m 1347\u001b[0m \u001b[0mcheck_classification_targets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1348\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclasses_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munique\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
2397 | "\u001b[0;32m/usr/local/lib/python3.7/site-packages/sklearn/base.py\u001b[0m in \u001b[0;36m_validate_data\u001b[0;34m(self, X, y, reset, validate_separately, **check_params)\u001b[0m\n\u001b[1;32m 431\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcheck_y_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 432\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 433\u001b[0;31m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_X_y\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcheck_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 434\u001b[0m \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 435\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
2398 | "\u001b[0;32m/usr/local/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36minner_f\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mextra_args\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mall_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mextra_args\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0;31m# extra_args > 0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
2399 | "\u001b[0;32m/usr/local/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_X_y\u001b[0;34m(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)\u001b[0m\n\u001b[1;32m 819\u001b[0m \u001b[0mensure_min_samples\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mensure_min_samples\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 820\u001b[0m \u001b[0mensure_min_features\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mensure_min_features\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 821\u001b[0;31m estimator=estimator)\n\u001b[0m\u001b[1;32m 822\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmulti_output\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 823\u001b[0m y = check_array(y, accept_sparse='csr', force_all_finite=True,\n",
2400 | "\u001b[0;32m/usr/local/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36minner_f\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mextra_args\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mall_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mextra_args\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0;31m# extra_args > 0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
2401 | "\u001b[0;32m/usr/local/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)\u001b[0m\n\u001b[1;32m 662\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mforce_all_finite\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 663\u001b[0m _assert_all_finite(array,\n\u001b[0;32m--> 664\u001b[0;31m allow_nan=force_all_finite == 'allow-nan')\n\u001b[0m\u001b[1;32m 665\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 666\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mensure_min_samples\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
2402 | "\u001b[0;32m/usr/local/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36m_assert_all_finite\u001b[0;34m(X, allow_nan, msg_dtype)\u001b[0m\n\u001b[1;32m 104\u001b[0m \u001b[0mmsg_err\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 105\u001b[0m (type_err,\n\u001b[0;32m--> 106\u001b[0;31m msg_dtype if msg_dtype is not None else X.dtype)\n\u001b[0m\u001b[1;32m 107\u001b[0m )\n\u001b[1;32m 108\u001b[0m \u001b[0;31m# for object dtype data, we only check for NaNs (GH-13254)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
2403 | "\u001b[0;31mValueError\u001b[0m: Input contains NaN, infinity or a value too large for dtype('float64')."
2404 | ]
2405 | }
2406 | ],
2407 | "source": [
2408 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n",
2409 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n",
2410 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]\n",
2411 | "\n",
2412 | "num = [([n], [SimpleImputer()]) for n in numerical_features]\n",
2413 | "mapper = DataFrameMapper(num, df_out=True)\n",
2414 | "\n",
2415 | "clf = LogisticRegression()\n",
2416 | "pipeline = Pipeline([\n",
2417 | " #('preprocess', mapper),\n",
2418 | " ('clf', clf)\n",
2419 | "])\n",
2420 | "\n",
2421 | "pipeline.fit(X_train[numerical_features], y_train)\n",
2422 | "evaluation(pipeline, X_test[numerical_features], y_test)"
2423 | ]
2424 | },
2425 | {
2426 | "cell_type": "code",
2427 | "execution_count": 100,
2428 | "metadata": {},
2429 | "outputs": [
2430 | {
2431 | "data": {
2432 | "text/plain": [
2433 | "{'auc': 0.7473034970984109, 'pr-auc': 0.676792150205654}"
2434 | ]
2435 | },
2436 | "execution_count": 100,
2437 | "metadata": {},
2438 | "output_type": "execute_result"
2439 | }
2440 | ],
2441 | "source": [
2442 | "train_df, test_df = train_test_split(df, test_size=0.1, shuffle=False)\n",
2443 | "X_train, y_train = train_df[categorical_features + numerical_features], train_df[label]\n",
2444 | "X_test, y_test = test_df[categorical_features + numerical_features], test_df[label]\n",
2445 | "\n",
2446 | "num = [([n], [SimpleImputer()]) for n in numerical_features] # Impute values\n",
2447 | "mapper = DataFrameMapper(num, df_out=True)\n",
2448 | " \n",
2449 | "clf = LogisticRegression()\n",
2450 | "pipeline = Pipeline([\n",
2451 | " ('preprocess', mapper),\n",
2452 | " ('clf', clf)\n",
2453 | "])\n",
2454 | "\n",
2455 | "pipeline.fit(X_train[numerical_features], y_train)\n",
2456 | "evaluation(pipeline, X_test[numerical_features], y_test)"
2457 | ]
2458 | },
2459 | {
2460 | "cell_type": "markdown",
2461 | "metadata": {},
2462 | "source": [
2463 | "**Result**\n",
2464 |     "- Logistic Regression can't handle missing values. Best to impute with the mean."
2465 | ]
2466 | },
2467 | {
2468 | "cell_type": "markdown",
2469 | "metadata": {},
2470 | "source": [
2471 | "## Summary \n",
2472 | "\n",
2473 | "Let's see how Logistic Regression acts with 5 techniques:\n",
2474 | "1. **Standardization of Numerical Variables**\n",
2475 | " - Performance doesn't necessarily improve. But convergence is faster during training\n",
2476 | "2. **Encoding of Categorical Variables**\n",
2477 | " - We can use ordinal encoding if the categories are related (size). Otherwise, use one hot encoding\n",
2478 | "3. **Data Imbalance**\n",
2479 | " - Perform overweighting of the minor class and undersampling of the major class\n",
2480 |     "4. **Collinearity**\n",
2481 |     "    - Remove features which exhibit perfect multicollinearity\n",
2482 | " - try different modeling strategies to ensure the model is capturing non-linear interactions\n",
2483 | "5. **Missing Values**\n",
2484 | " - Impute with mean (or a constant value). This is problem specific"
2485 | ]
2486 | }
2487 | ],
2488 | "metadata": {
2489 | "kernelspec": {
2490 | "display_name": "Python 3",
2491 | "language": "python",
2492 | "name": "python3"
2493 | },
2494 | "language_info": {
2495 | "codemirror_mode": {
2496 | "name": "ipython",
2497 | "version": 3
2498 | },
2499 | "file_extension": ".py",
2500 | "mimetype": "text/x-python",
2501 | "name": "python",
2502 | "nbconvert_exporter": "python",
2503 | "pygments_lexer": "ipython3",
2504 | "version": "3.7.4"
2505 | }
2506 | },
2507 | "nbformat": 4,
2508 | "nbformat_minor": 4
2509 | }
2510 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # preprocessing-cheat-sheet
2 |
3 | Preprocessing cheat sheet for some machine learning algorithms. Starting with Logistic Regression for now. This might grow in the future
4 |
--------------------------------------------------------------------------------