├── Chapter01 ├── .ipynb_checkpoints │ └── ch1-code-snippets-checkpoint.ipynb └── ch1-code-snippets.ipynb ├── Chapter02 ├── ch2-1-diamond-prices.ipynb └── ch2-2-credit-card-default.ipynb ├── Chapter03 ├── .ipynb_checkpoints │ ├── ch3-1-eda-diamond-prices-checkpoint.ipynb │ └── ch3-2-eda-credit-card-default-checkpoint.ipynb ├── ch3-1-eda-diamond-prices.ipynb └── ch3-2-eda-credit-card-default.ipynb ├── Chapter04 ├── .ipynb_checkpoints │ └── ch4-overfitting-example-checkpoint.ipynb ├── ch4-overfitting-example.ipynb └── ch4-predicting-diamond-prices.ipynb ├── Chapter05 ├── .ipynb_checkpoints │ └── ch5-predicting-credit-card-default-checkpoint.ipynb └── ch5-predicting-credit-card-default.ipynb ├── Chapter06 ├── .ipynb_checkpoints │ ├── ch6-1-regression-with-neural-networks-checkpoint.ipynb │ └── ch6-2-classification-with-neural-networks-checkpoint.ipynb ├── ch6-1-regression-with-neural-networks.ipynb ├── ch6-2-classification-with-neural-networks.ipynb └── class_initial_w.h5 ├── Chapter07 ├── .ipynb_checkpoints │ └── ch7-credit-card-def-model-tuning-and-evaluation-checkpoint.ipynb ├── ch7-credit-card-def-model-tuning-and-evaluation.ipynb └── ch7-diamond-prices-model-tuning-and-evaluation.ipynb ├── Chapter08 ├── .ipynb_checkpoints │ ├── ch8-credit-card-def-model-tuning-checkpoint.ipynb │ └── ch8-diamond-prices-model-tuning-checkpoint.ipynb ├── ch8-credit-card-def-model-tuning.ipynb └── ch8-diamond-prices-model-tuning.ipynb ├── Chapter09 ├── Model │ ├── diamond-prices-model.h5 │ ├── pca.joblib │ └── scaler.joblib ├── dash-example-no-user-inputs.py ├── dash-example-user-inputs.py ├── diamonds-model-training.py └── predict-diamond-prices.py ├── Data ├── credit_card_default.csv └── diamonds.csv ├── LICENSE ├── README.md ├── conda-cheatsheet.pdf └── requirements.txt /Chapter02/ch2-2-credit-card-default.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introducing the credit card default dataset" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Data Set Information:\n", 15 | "\n", 16 | "**This research aimed at the case of customers' default payments in Taiwan.**\n", 17 | "\n", 18 | "### Features description:\n", 19 | "\n", 20 | "- LIMIT_BAL: Amount of the given credit (NT dollar): it includes both the individual consumer credit and his/her family (supplementary) credit. \n", 21 | "- SEX: Gender (1 = male; 2 = female). \n", 22 | "- EDUCATION: Education (1 = graduate school; 2 = university; 3 = high school; 4 = others). \n", 23 | "- MARRIAGE: Marital status (1 = married; 2 = single; 3 = others). \n", 24 | "- AGE: Age (year). \n", 25 | "- PAY_1 - PAY_6: History of past payment. We tracked the past monthly payment records (from April to September, 2005) as follows: PAY_1 = the repayment status in September, 2005; PAY_2 = the repayment status in August, 2005; . . .; PAY_6 = the repayment status in April, 2005. The measurement scale for the repayment status is: -1 = pay duly; 1 = payment delay for one month; 2 = payment delay for two months; . . .; 8 = payment delay for eight months; 9 = payment delay for nine months and above.\n", 26 | "- BILL_AMT1-BILL_AMT6: Amount of bill statement (NT dollar). BILL_AMT1 = amount of bill statement in September, 2005; BILL_AMT2 = amount of bill statement in August, 2005; . . .; BILL_AMT6 = amount of bill statement in April, 2005.
\n", 27 | "- PAY_AMT1-PAY_AMT6: Amount of previous payment (NT dollar).\n", 28 | "- default payment next month: **positive class: default | negative class: pay**" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 19, 34 | "metadata": { 35 | "collapsed": true 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "import numpy as np\n", 40 | "import pandas as pd\n", 41 | "import os" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 20, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "data": { 51 | "text/html": [ 52 | "
\n", 53 | "\n", 66 | "\n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | "
LIMIT_BALSEXEDUCATIONMARRIAGEAGEPAY_1PAY_2PAY_3PAY_4PAY_5...BILL_AMT4BILL_AMT5BILL_AMT6PAY_AMT1PAY_AMT2PAY_AMT3PAY_AMT4PAY_AMT5PAY_AMT6default payment next month
ID
1200002212422-1-1-2...000068900001
212000022226-12000...3272345532610100010001000020001
3900002223400000...1433114948155491518150010001000100050000
4500002213700000...2831428959295472000201912001100106910000
55000012157-10-100...2094019146191312000366811000090006896790
\n", 240 | "

5 rows × 24 columns

\n", 241 | "
" 242 | ], 243 | "text/plain": [ 244 | " LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_1 PAY_2 PAY_3 PAY_4 \\\n", 245 | "ID \n", 246 | "1 20000 2 2 1 24 2 2 -1 -1 \n", 247 | "2 120000 2 2 2 26 -1 2 0 0 \n", 248 | "3 90000 2 2 2 34 0 0 0 0 \n", 249 | "4 50000 2 2 1 37 0 0 0 0 \n", 250 | "5 50000 1 2 1 57 -1 0 -1 0 \n", 251 | "\n", 252 | " PAY_5 ... BILL_AMT4 BILL_AMT5 BILL_AMT6 \\\n", 253 | "ID ... \n", 254 | "1 -2 ... 0 0 0 \n", 255 | "2 0 ... 3272 3455 3261 \n", 256 | "3 0 ... 14331 14948 15549 \n", 257 | "4 0 ... 28314 28959 29547 \n", 258 | "5 0 ... 20940 19146 19131 \n", 259 | "\n", 260 | " PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 \\\n", 261 | "ID \n", 262 | "1 0 689 0 0 0 0 \n", 263 | "2 0 1000 1000 1000 0 2000 \n", 264 | "3 1518 1500 1000 1000 1000 5000 \n", 265 | "4 2000 2019 1200 1100 1069 1000 \n", 266 | "5 2000 36681 10000 9000 689 679 \n", 267 | "\n", 268 | " default payment next month \n", 269 | "ID \n", 270 | "1 1 \n", 271 | "2 1 \n", 272 | "3 0 \n", 273 | "4 0 \n", 274 | "5 0 \n", 275 | "\n", 276 | "[5 rows x 24 columns]" 277 | ] 278 | }, 279 | "execution_count": 20, 280 | "metadata": {}, 281 | "output_type": "execute_result" 282 | } 283 | ], 284 | "source": [ 285 | "DATA_DIR = '../data'\n", 286 | "FILE_NAME = 'credit_card_default.csv'\n", 287 | "data_path = os.path.join(DATA_DIR, FILE_NAME)\n", 288 | "ccd = pd.read_csv(data_path, index_col=\"ID\")\n", 289 | "ccd.head()" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 21, 295 | "metadata": {}, 296 | "outputs": [ 297 | { 298 | "data": { 299 | "text/plain": [ 300 | "(30000, 24)" 301 | ] 302 | }, 303 | "execution_count": 21, 304 | "metadata": {}, 305 | "output_type": "execute_result" 306 | } 307 | ], 308 | "source": [ 309 | "ccd.shape" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 22, 315 | "metadata": { 316 | "collapsed": true 317 | }, 318 | "outputs": [], 319 | "source": [ 320 | "ccd.rename(columns=lambda x: x.lower(), inplace=True)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": {}, 326 | "source": [ 327 | "## Numerical features" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 23, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "bill_amt_features = ['bill_amt'+ str(i) for i in range(1,7)]\n", 337 | "pay_amt_features = ['pay_amt'+ str(i) for i in range(1,7)]\n", 338 | "numerical_features = ['limit_bal','age'] + bill_amt_features + pay_amt_features" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 24, 344 | "metadata": {}, 345 | "outputs": [ 346 | { 347 | "data": { 348 | "text/html": [ 349 | "
\n", 350 | "\n", 363 | "\n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | "
limit_balage
count30000.00000030000.000000
mean167484.32266735.485500
std129747.6615679.217904
min10000.00000021.000000
25%50000.00000028.000000
50%140000.00000034.000000
75%240000.00000041.000000
max1000000.00000079.000000
\n", 414 | "
" 415 | ], 416 | "text/plain": [ 417 | " limit_bal age\n", 418 | "count 30000.000000 30000.000000\n", 419 | "mean 167484.322667 35.485500\n", 420 | "std 129747.661567 9.217904\n", 421 | "min 10000.000000 21.000000\n", 422 | "25% 50000.000000 28.000000\n", 423 | "50% 140000.000000 34.000000\n", 424 | "75% 240000.000000 41.000000\n", 425 | "max 1000000.000000 79.000000" 426 | ] 427 | }, 428 | "execution_count": 24, 429 | "metadata": {}, 430 | "output_type": "execute_result" 431 | } 432 | ], 433 | "source": [ 434 | "ccd[['limit_bal','age']].describe()" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": 25, 440 | "metadata": {}, 441 | "outputs": [ 442 | { 443 | "data": { 444 | "text/html": [ 445 | "
\n", 446 | "\n", 459 | "\n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | "
bill_amt1bill_amt2bill_amt3bill_amt4bill_amt5bill_amt6
count30000.030000.030000.030000.030000.030000.0
mean51223.049179.047013.043263.040311.038872.0
std73636.071174.069349.064333.060797.059554.0
min-165580.0-69777.0-157264.0-170000.0-81334.0-339603.0
25%3559.02985.02666.02327.01763.01256.0
50%22382.021200.020088.019052.018104.017071.0
75%67091.064006.060165.054506.050190.049198.0
max964511.0983931.01664089.0891586.0927171.0961664.0
\n", 546 | "
" 547 | ], 548 | "text/plain": [ 549 | " bill_amt1 bill_amt2 bill_amt3 bill_amt4 bill_amt5 bill_amt6\n", 550 | "count 30000.0 30000.0 30000.0 30000.0 30000.0 30000.0\n", 551 | "mean 51223.0 49179.0 47013.0 43263.0 40311.0 38872.0\n", 552 | "std 73636.0 71174.0 69349.0 64333.0 60797.0 59554.0\n", 553 | "min -165580.0 -69777.0 -157264.0 -170000.0 -81334.0 -339603.0\n", 554 | "25% 3559.0 2985.0 2666.0 2327.0 1763.0 1256.0\n", 555 | "50% 22382.0 21200.0 20088.0 19052.0 18104.0 17071.0\n", 556 | "75% 67091.0 64006.0 60165.0 54506.0 50190.0 49198.0\n", 557 | "max 964511.0 983931.0 1664089.0 891586.0 927171.0 961664.0" 558 | ] 559 | }, 560 | "execution_count": 25, 561 | "metadata": {}, 562 | "output_type": "execute_result" 563 | } 564 | ], 565 | "source": [ 566 | "ccd[bill_amt_features].describe().round()" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": 26, 572 | "metadata": {}, 573 | "outputs": [ 574 | { 575 | "data": { 576 | "text/html": [ 577 | "
\n", 578 | "\n", 591 | "\n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | "
pay_amt1pay_amt2pay_amt3pay_amt4pay_amt5pay_amt6
count30000.030000.030000.030000.030000.030000.0
mean5664.05921.05226.04826.04799.05216.0
std16563.023041.017607.015666.015278.017777.0
min0.00.00.00.00.00.0
25%1000.0833.0390.0296.0252.0118.0
50%2100.02009.01800.01500.01500.01500.0
75%5006.05000.04505.04013.04032.04000.0
max873552.01684259.0896040.0621000.0426529.0528666.0
\n", 678 | "
" 679 | ], 680 | "text/plain": [ 681 | " pay_amt1 pay_amt2 pay_amt3 pay_amt4 pay_amt5 pay_amt6\n", 682 | "count 30000.0 30000.0 30000.0 30000.0 30000.0 30000.0\n", 683 | "mean 5664.0 5921.0 5226.0 4826.0 4799.0 5216.0\n", 684 | "std 16563.0 23041.0 17607.0 15666.0 15278.0 17777.0\n", 685 | "min 0.0 0.0 0.0 0.0 0.0 0.0\n", 686 | "25% 1000.0 833.0 390.0 296.0 252.0 118.0\n", 687 | "50% 2100.0 2009.0 1800.0 1500.0 1500.0 1500.0\n", 688 | "75% 5006.0 5000.0 4505.0 4013.0 4032.0 4000.0\n", 689 | "max 873552.0 1684259.0 896040.0 621000.0 426529.0 528666.0" 690 | ] 691 | }, 692 | "execution_count": 26, 693 | "metadata": {}, 694 | "output_type": "execute_result" 695 | } 696 | ], 697 | "source": [ 698 | "ccd[pay_amt_features].describe().round()" 699 | ] 700 | }, 701 | { 702 | "cell_type": "markdown", 703 | "metadata": {}, 704 | "source": [ 705 | "## Encoding categorical features" 706 | ] 707 | }, 708 | { 709 | "cell_type": "code", 710 | "execution_count": 27, 711 | "metadata": {}, 712 | "outputs": [ 713 | { 714 | "data": { 715 | "text/plain": [ 716 | "ID\n", 717 | "1 0\n", 718 | "2 0\n", 719 | "3 0\n", 720 | "4 0\n", 721 | "5 1\n", 722 | "6 1\n", 723 | "7 1\n", 724 | "8 0\n", 725 | "9 0\n", 726 | "10 1\n", 727 | "Name: male, dtype: int32" 728 | ] 729 | }, 730 | "execution_count": 27, 731 | "metadata": {}, 732 | "output_type": "execute_result" 733 | } 734 | ], 735 | "source": [ 736 | "ccd['male'] = (ccd['sex'] == 1).astype('int')\n", 737 | "ccd['male'].head(n=10)" 738 | ] 739 | }, 740 | { 741 | "cell_type": "code", 742 | "execution_count": 28, 743 | "metadata": {}, 744 | "outputs": [ 745 | { 746 | "data": { 747 | "text/plain": [ 748 | "0.39626666666666666" 749 | ] 750 | }, 751 | "execution_count": 28, 752 | "metadata": {}, 753 | "output_type": "execute_result" 754 | } 755 | ], 756 | "source": [ 757 | "ccd['male'].mean()" 758 | ] 759 | }, 760 | { 761 | "cell_type": "code", 762 | "execution_count": 29, 763 | "metadata": {}, 764 | "outputs": [ 765 | { 766 | "data": { 767 | "text/plain": [ 768 | "0 14\n", 769 | "1 10585\n", 770 | "2 14030\n", 771 | "3 4917\n", 772 | "4 123\n", 773 | "5 280\n", 774 | "6 51\n", 775 | "Name: education, dtype: int64" 776 | ] 777 | }, 778 | "execution_count": 29, 779 | "metadata": {}, 780 | "output_type": "execute_result" 781 | } 782 | ], 783 | "source": [ 784 | "ccd['education'].value_counts(sort=False)" 785 | ] 786 | }, 787 | { 788 | "cell_type": "code", 789 | "execution_count": 30, 790 | "metadata": { 791 | "collapsed": true 792 | }, 793 | "outputs": [], 794 | "source": [ 795 | "ccd['grad_school'] = (ccd['education'] == 1).astype('int')\n", 796 | "ccd['university'] = (ccd['education'] == 2).astype('int')\n", 797 | "ccd['high_school'] = (ccd['education'] == 3).astype('int')" 798 | ] 799 | }, 800 | { 801 | "cell_type": "code", 802 | "execution_count": 31, 803 | "metadata": {}, 804 | "outputs": [ 805 | { 806 | "data": { 807 | "text/plain": [ 808 | "ID\n", 809 | "48 5\n", 810 | "70 5\n", 811 | "359 4\n", 812 | "386 5\n", 813 | "449 4\n", 814 | "Name: education, dtype: int64" 815 | ] 816 | }, 817 | "execution_count": 31, 818 | "metadata": {}, 819 | "output_type": "execute_result" 820 | } 821 | ], 822 | "source": [ 823 | "ccd.loc[(ccd['grad_school']==0) & (ccd['university']==0) & (ccd['high_school']==0)]['education'].head()" 824 | ] 825 | }, 826 | { 827 | "cell_type": "markdown", 828 | "metadata": {}, 829 | "source": [ 830 | "## Low variance features" 831 | ] 832 | }, 833 | { 834 | "cell_type": "code", 835 | "execution_count": 32, 836 | "metadata": {}, 837 | "outputs": [ 838 | 
{ 839 | "data": { 840 | "text/plain": [ 841 | "1 13713\n", 842 | "2 15964\n", 843 | "3 323\n", 844 | "Name: marriage, dtype: int64" 845 | ] 846 | }, 847 | "execution_count": 32, 848 | "metadata": {}, 849 | "output_type": "execute_result" 850 | } 851 | ], 852 | "source": [ 853 | "ccd['marriage'].value_counts(sort=False)" 854 | ] 855 | }, 856 | { 857 | "cell_type": "code", 858 | "execution_count": 33, 859 | "metadata": { 860 | "collapsed": true 861 | }, 862 | "outputs": [], 863 | "source": [ 864 | "ccd['single'] = (ccd['marriage'] == 2).astype('int')\n", 865 | "ccd['marital_other'] = (ccd['marriage'] == 3).astype('int')" 866 | ] 867 | }, 868 | { 869 | "cell_type": "code", 870 | "execution_count": 34, 871 | "metadata": {}, 872 | "outputs": [ 873 | { 874 | "name": "stdout", 875 | "output_type": "stream", 876 | "text": [ 877 | "Proportion of singles: 0.5321333333333333\n", 878 | "Proportion of other marital status: 0.010766666666666667\n" 879 | ] 880 | } 881 | ], 882 | "source": [ 883 | "print(\"Proportion of singles: \", ccd['single'].mean())\n", 884 | "print(\"Proportion of other marital status: \", ccd['marital_other'].mean())" 885 | ] 886 | }, 887 | { 888 | "cell_type": "code", 889 | "execution_count": 35, 890 | "metadata": {}, 891 | "outputs": [ 892 | { 893 | "name": "stdout", 894 | "output_type": "stream", 895 | "text": [ 896 | "0.24816786226195736\n", 897 | "0.24897574808047968\n" 898 | ] 899 | } 900 | ], 901 | "source": [ 902 | "ccd['married'] = (ccd['marriage'] == 1).astype('int')\n", 903 | "print(ccd['married'].var())\n", 904 | "print(ccd['single'].var())" 905 | ] 906 | }, 907 | { 908 | "cell_type": "code", 909 | "execution_count": 36, 910 | "metadata": {}, 911 | "outputs": [ 912 | { 913 | "data": { 914 | "text/plain": [ 915 | "0.9892333333333333" 916 | ] 917 | }, 918 | "execution_count": 36, 919 | "metadata": {}, 920 | "output_type": "execute_result" 921 | } 922 | ], 923 | "source": [ 924 | "(ccd['married'] == (1 - ccd['single'])).mean()" 925 | ] 926 | }, 927 | { 928 | "cell_type": "markdown", 929 | "metadata": {}, 930 | "source": [ 931 | "## A brief introduction to Feature Engineering" 932 | ] 933 | }, 934 | { 935 | "cell_type": "code", 936 | "execution_count": 37, 937 | "metadata": {}, 938 | "outputs": [ 939 | { 940 | "data": { 941 | "text/plain": [ 942 | "-2 2759\n", 943 | "-1 5686\n", 944 | " 0 14737\n", 945 | " 1 3688\n", 946 | " 2 2667\n", 947 | " 3 322\n", 948 | " 4 76\n", 949 | " 5 26\n", 950 | " 6 11\n", 951 | " 7 9\n", 952 | " 8 19\n", 953 | "Name: pay_1, dtype: int64" 954 | ] 955 | }, 956 | "execution_count": 37, 957 | "metadata": {}, 958 | "output_type": "execute_result" 959 | } 960 | ], 961 | "source": [ 962 | "ccd['pay_1'].value_counts().sort_index()" 963 | ] 964 | }, 965 | { 966 | "cell_type": "code", 967 | "execution_count": 38, 968 | "metadata": {}, 969 | "outputs": [], 970 | "source": [ 971 | "# fixing the pay_i features\n", 972 | "pay_features= ['pay_' + str(i) for i in range(1,7)]\n", 973 | "for x in pay_features:\n", 974 | " ccd.loc[ccd[x] <= 0, x] = 0" 975 | ] 976 | }, 977 | { 978 | "cell_type": "code", 979 | "execution_count": 39, 980 | "metadata": { 981 | "collapsed": true 982 | }, 983 | "outputs": [], 984 | "source": [ 985 | "# producing delayed features\n", 986 | "delayed_features = ['delayed_' + str(i) for i in range(1,7)]\n", 987 | "for pay, delayed in zip(pay_features, delayed_features):\n", 988 | " ccd[delayed] = (ccd[pay] > 0).astype(int)" 989 | ] 990 | }, 991 | { 992 | "cell_type": "code", 993 | "execution_count": 44, 994 | "metadata": {}, 995 | 
"outputs": [ 996 | { 997 | "data": { 998 | "text/plain": [ 999 | "delayed_1 0.227267\n", 1000 | "delayed_2 0.147933\n", 1001 | "delayed_3 0.140433\n", 1002 | "delayed_4 0.117000\n", 1003 | "delayed_5 0.098933\n", 1004 | "delayed_6 0.102633\n", 1005 | "dtype: float64" 1006 | ] 1007 | }, 1008 | "execution_count": 44, 1009 | "metadata": {}, 1010 | "output_type": "execute_result" 1011 | } 1012 | ], 1013 | "source": [ 1014 | "ccd[delayed_features].mean()" 1015 | ] 1016 | }, 1017 | { 1018 | "cell_type": "code", 1019 | "execution_count": null, 1020 | "metadata": { 1021 | "collapsed": true 1022 | }, 1023 | "outputs": [], 1024 | "source": [ 1025 | "ccd['months_delayed'] = ccd[delayed_features].sum(axis=1)" 1026 | ] 1027 | }, 1028 | { 1029 | "cell_type": "markdown", 1030 | "metadata": {}, 1031 | "source": [ 1032 | "Done." 1033 | ] 1034 | }, 1035 | { 1036 | "cell_type": "code", 1037 | "execution_count": null, 1038 | "metadata": { 1039 | "collapsed": true 1040 | }, 1041 | "outputs": [], 1042 | "source": [] 1043 | } 1044 | ], 1045 | "metadata": { 1046 | "kernelspec": { 1047 | "display_name": "Python 3", 1048 | "language": "python", 1049 | "name": "python3" 1050 | }, 1051 | "language_info": { 1052 | "codemirror_mode": { 1053 | "name": "ipython", 1054 | "version": 3 1055 | }, 1056 | "file_extension": ".py", 1057 | "mimetype": "text/x-python", 1058 | "name": "python", 1059 | "nbconvert_exporter": "python", 1060 | "pygments_lexer": "ipython3", 1061 | "version": "3.6.1" 1062 | } 1063 | }, 1064 | "nbformat": 4, 1065 | "nbformat_minor": 2 1066 | } 1067 | -------------------------------------------------------------------------------- /Chapter05/.ipynb_checkpoints/ch5-predicting-credit-card-default-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Predicting Credit Card Default\n", 8 | "\n", 9 | "If you are using Windows, don't forget to add:\n", 10 | "\n", 11 | "C:\\Users\\\"user_name\"\\Anaconda3\\\"environment_name\"\\Library\\bin\\graphviz\\\n", 12 | "\n", 13 | "to the PATH environment variable" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import numpy as np\n", 23 | "import pandas as pd\n", 24 | "import matplotlib.pyplot as plt\n", 25 | "import seaborn as sns\n", 26 | "import os\n", 27 | "%matplotlib inline" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "### Back with the credit card default dataset" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# Loading the dataset\n", 44 | "DATA_DIR = '../data'\n", 45 | "FILE_NAME = 'credit_card_default.csv'\n", 46 | "data_path = os.path.join(DATA_DIR, FILE_NAME)\n", 47 | "ccd = pd.read_csv(data_path, index_col=\"ID\")\n", 48 | "ccd.rename(columns=lambda x: x.lower(), inplace=True)\n", 49 | "ccd.rename(columns={'default payment next month':'default'}, inplace=True)\n", 50 | "\n", 51 | "# getting the groups of features\n", 52 | "bill_amt_features = ['bill_amt'+ str(i) for i in range(1,7)]\n", 53 | "pay_amt_features = ['pay_amt'+ str(i) for i in range(1,7)]\n", 54 | "numerical_features = ['limit_bal','age'] + bill_amt_features + pay_amt_features\n", 55 | "\n", 56 | "# Creating creating binary features\n", 57 | "ccd['male'] = (ccd['sex'] == 1).astype('int')\n", 58 | "ccd['grad_school'] = 
(ccd['education'] == 1).astype('int')\n", 59 | "ccd['university'] = (ccd['education'] == 2).astype('int')\n", 60 | "#ccd['high_school'] = (ccd['education'] == 3).astype('int')\n", 61 | "ccd['married'] = (ccd['marriage'] == 1).astype('int')\n", 62 | "\n", 63 | "# simplifying pay features \n", 64 | "pay_features= ['pay_' + str(i) for i in range(1,7)]\n", 65 | "for x in pay_features:\n", 66 | " ccd.loc[ccd[x] <= 0, x] = 0\n", 67 | "\n", 68 | "# simplifying delayed features\n", 69 | "delayed_features = ['delayed_' + str(i) for i in range(1,7)]\n", 70 | "for pay, delayed in zip(pay_features, delayed_features):\n", 71 | " ccd[delayed] = (ccd[pay] > 0).astype(int)\n", 72 | " \n", 73 | "# creating a new feature: months delayed\n", 74 | "ccd['months_delayed'] = ccd[delayed_features].sum(axis=1)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "## Splitting the dataset" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "numerical_features = numerical_features + ['months_delayed']\n", 91 | "binary_features = ['male','married','grad_school','university']\n", 92 | "X = ccd[numerical_features + binary_features]\n", 93 | "y = ccd['default'].astype(int)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "from sklearn.model_selection import train_test_split\n", 103 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=5/30, random_state=101)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "scrolled": true 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "# 1. Import the class you will use\n", 115 | "from sklearn.preprocessing import StandardScaler\n", 116 | "# 2. Create an instance of the class\n", 117 | "scaler = StandardScaler()\n", 118 | "# 3. Use the fit method of the instance\n", 119 | "scaler.fit(X_train[numerical_features])\n", 120 | "# 4. 
Use the transform method to perform the transformation\n", 121 | "X_train.loc[:, numerical_features] = scaler.transform(X_train[numerical_features])" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "## Logistic Regression" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "### A simple Logistic Regression model" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "from sklearn.linear_model import LogisticRegression\n", 145 | "simple_log_reg = LogisticRegression(C=1e6)\n", 146 | "simple_log_reg.fit(X_train['months_delayed'].values.reshape(-1, 1), y_train)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "print(\"W0: {}, W1: {}\".format(simple_log_reg.intercept_[0], simple_log_reg.coef_[0][0]))" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "def get_probs(months_delayed):\n", 165 | " m = scaler.mean_[-1]\n", 166 | " std = scaler.var_[-1]**.5\n", 167 | " x = (months_delayed - m)/std\n", 168 | " prob_default = 1/(1+np.exp(-simple_log_reg.intercept_[0] + -simple_log_reg.coef_[0][0]*x))\n", 169 | " return prob_default" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "months = np.arange(13)\n", 179 | "pred_probs = get_probs(months)\n", 180 | "pd.DataFrame({'months': months, 'pred_probs':pred_probs})" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "fig, ax = plt.subplots()\n", 190 | "ax.plot(months, pred_probs)\n", 191 | "ax.set_xlabel('Months delayed')\n", 192 | "ax.set_ylabel('Probability of default')\n", 193 | "ax.grid()" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "### A complete Logistic Regression model" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "log_reg = LogisticRegression(C=1e6)\n", 210 | "log_reg.fit(X_train, y_train)" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "prob_log_reg = log_reg.predict_proba(X_train)\n", 220 | "prob_log_reg[:10]" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "y_pred_log_reg = log_reg.predict(X_train)\n", 230 | "y_pred_log_reg[:10]" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "np.all(y_pred_log_reg == (prob_log_reg[:,1] > 0.5))" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "pd.Series(data=log_reg.coef_[0], index=X_train.columns).sort_values(ascending=False).round(2)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "from sklearn.metrics import accuracy_score\n", 258 | "accuracy_log_reg 
= accuracy_score(y_true=y_train, y_pred=y_pred_log_reg)\n", 259 | "accuracy_log_reg" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "## Classification Trees" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "from sklearn.tree import DecisionTreeClassifier\n", 276 | "class_tree = DecisionTreeClassifier(max_depth=3)\n", 277 | "class_tree.fit(X_train, y_train)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "from sklearn.externals.six import StringIO \n", 287 | "from sklearn.tree import export_graphviz\n", 288 | "from IPython.display import Image \n", 289 | "import pydotplus" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "dot_data = StringIO()\n", 299 | "export_graphviz(decision_tree=class_tree,\n", 300 | " out_file=dot_data,\n", 301 | " filled=True,\n", 302 | " rounded=True,\n", 303 | " feature_names = X_train.columns,\n", 304 | " class_names = ['pay','default'],\n", 305 | " special_characters=True)\n", 306 | "graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) \n", 307 | "Image(graph.create_png())" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": {}, 314 | "outputs": [], 315 | "source": [ 316 | "dot_data = StringIO()\n", 317 | "export_graphviz(decision_tree=class_tree,\n", 318 | " out_file=dot_data,\n", 319 | " filled=True,\n", 320 | " rounded=True,\n", 321 | " proportion=True,\n", 322 | " feature_names = X_train.columns,\n", 323 | " class_names = ['pay','default'],\n", 324 | " special_characters=True)\n", 325 | "graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) \n", 326 | "Image(graph.create_png())" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "### How trees work" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "from sklearn.datasets import make_blobs" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "A, b = make_blobs(n_samples=200, n_features=2, cluster_std=0.6,\n", 352 | " centers=[[-0.5,-1],[0.5,0.5]], shuffle=False, random_state=42)\n", 353 | "plt.scatter(A[:, 0], A[:, 1], c=b)\n", 354 | "plt.xlabel('X1', size=15)\n", 355 | "plt.ylabel('X2', size=15);" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": {}, 362 | "outputs": [], 363 | "source": [ 364 | "plt.scatter(A[:, 0], A[:, 1], c=b)\n", 365 | "plt.axhline(-0.6, c='red')\n", 366 | "plt.xlabel('X1', size=15)\n", 367 | "plt.ylabel('X2', size=15);" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "plt.scatter(A[:, 0], A[:, 1], c=b)\n", 377 | "plt.axhline(-0.6, c='red')\n", 378 | "plt.axvline(x=-0.1, ymin=0.34, c='red')\n", 379 | "plt.xlabel('X1', size=15)\n", 380 | "plt.ylabel('X2', size=15);" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": null, 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "plt.scatter(A[:, 0], A[:, 1], c=b)\n", 390 | "plt.axhline(-0.6, 
c='red')\n", 391 | "plt.axvline(x=-0.1, ymin=0.34, c='red')\n", 392 | "plt.axvline(x=0.7, ymax=0.34, c='red')\n", 393 | "plt.xlabel('X1', size=15)\n", 394 | "plt.ylabel('X2', size=15);" 395 | ] 396 | }, 397 | { 398 | "cell_type": "markdown", 399 | "metadata": {}, 400 | "source": [ 401 | "### Training a larger classification tree" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [ 410 | "class_tree = DecisionTreeClassifier(max_depth=6, min_samples_split=50)\n", 411 | "class_tree.fit(X_train, y_train)\n", 412 | "y_pred_class_tree = class_tree.predict(X_train)" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "accuracy_class_tree = accuracy_score(y_true=y_train, y_pred=y_pred_class_tree)\n", 422 | "accuracy_class_tree" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": [ 431 | "pd.Series(data=class_tree.feature_importances_, index=X_train.columns).sort_values(ascending=False).round(3)" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [ 440 | "pd.Series(data=class_tree.feature_importances_, index=X_train.columns).sort_values(ascending=False).plot(kind='bar');" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": {}, 446 | "source": [ 447 | "## Random Forests" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "from sklearn.ensemble import RandomForestClassifier\n", 457 | "rf = RandomForestClassifier(n_estimators=99,\n", 458 | " max_features=6,\n", 459 | " max_depth=6,\n", 460 | " min_samples_split=100,\n", 461 | " random_state=85)\n", 462 | "rf.fit(X_train, y_train)\n", 463 | "y_pred_rf = rf.predict(X_train)" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": {}, 470 | "outputs": [], 471 | "source": [ 472 | "accuracy_rf = accuracy_score(y_true=y_train, y_pred=y_pred_rf)\n", 473 | "accuracy_rf" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": null, 479 | "metadata": {}, 480 | "outputs": [], 481 | "source": [ 482 | "pd.Series(data=rf.feature_importances_, index=X_train.columns).sort_values(ascending=False).round(3)" 483 | ] 484 | }, 485 | { 486 | "cell_type": "markdown", 487 | "metadata": {}, 488 | "source": [ 489 | "## Training vs Testing Error" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "metadata": {}, 496 | "outputs": [], 497 | "source": [ 498 | "y_pred_null = np.zeros_like(y_test)\n", 499 | "accuracy_score(y_true=y_test, y_pred=y_pred_null)" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "## Remember to also standarize the numerical features in the testing set\n", 509 | "X_test.loc[:, numerical_features] = scaler.transform(X_test[numerical_features])" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": null, 515 | "metadata": {}, 516 | "outputs": [], 517 | "source": [ 518 | "## Calculating accuracy\n", 519 | "accuracies = pd.DataFrame(columns=['train', 'test'], index=['LogisticReg','ClassTree','RF'])\n", 520 | "model_dict = {'LogisticReg': log_reg, 
'ClassTree': class_tree, 'RF': rf}\n", 521 | "for name, model in model_dict.items():\n", 522 | " accuracies.loc[name, 'train'] = accuracy_score(y_true=y_train, y_pred=model.predict(X_train))\n", 523 | " accuracies.loc[name, 'test'] = accuracy_score(y_true=y_test, y_pred=model.predict(X_test))\n", 524 | "\n", 525 | "accuracies" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": null, 531 | "metadata": {}, 532 | "outputs": [], 533 | "source": [ 534 | "fig, ax = plt.subplots()\n", 535 | "accuracies.sort_values(by='test', ascending=False).plot(kind='barh', ax=ax, zorder=3)\n", 536 | "ax.grid(zorder=0)" 537 | ] 538 | }, 539 | { 540 | "cell_type": "markdown", 541 | "metadata": {}, 542 | "source": [ 543 | "## Multiclass classification" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": null, 549 | "metadata": {}, 550 | "outputs": [], 551 | "source": [ 552 | "# Loading the iris dataset\n", 553 | "from sklearn.datasets import load_iris\n", 554 | "iris = load_iris()\n", 555 | "# Training the logistic regression model\n", 556 | "iris_log_reg = LogisticRegression(C=1e5)\n", 557 | "iris_log_reg.fit(iris.data, iris.target)\n", 558 | "iris_probs = iris_log_reg.predict_proba(iris.data)\n", 559 | "iris_pred = iris_log_reg.predict(iris.data)" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": null, 565 | "metadata": {}, 566 | "outputs": [], 567 | "source": [ 568 | "iris_pred_df = pd.DataFrame(iris_probs, columns=iris.target_names).round(4)\n", 569 | "iris_pred_df['predicted_class'] = iris.target_names[iris_pred]\n", 570 | "iris_pred_df.sample(12)" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": null, 576 | "metadata": {}, 577 | "outputs": [], 578 | "source": [] 579 | } 580 | ], 581 | "metadata": { 582 | "kernelspec": { 583 | "display_name": "Python 3", 584 | "language": "python", 585 | "name": "python3" 586 | }, 587 | "language_info": { 588 | "codemirror_mode": { 589 | "name": "ipython", 590 | "version": 3 591 | }, 592 | "file_extension": ".py", 593 | "mimetype": "text/x-python", 594 | "name": "python", 595 | "nbconvert_exporter": "python", 596 | "pygments_lexer": "ipython3", 597 | "version": "3.6.10" 598 | } 599 | }, 600 | "nbformat": 4, 601 | "nbformat_minor": 2 602 | } 603 | -------------------------------------------------------------------------------- /Chapter05/ch5-predicting-credit-card-default.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Predicting Credit Card Default\n", 8 | "\n", 9 | "If you are using Windows, don't forget to add:\n", 10 | "\n", 11 | "C:\\Users\\\"user_name\"\\Anaconda3\\\"environment_name\"\\Library\\bin\\graphviz\\\n", 12 | "\n", 13 | "to the PATH environment variable" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import numpy as np\n", 23 | "import pandas as pd\n", 24 | "import matplotlib.pyplot as plt\n", 25 | "import seaborn as sns\n", 26 | "import os\n", 27 | "%matplotlib inline" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "### Back with the credit card default dataset" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# Loading the dataset\n", 44 | "DATA_DIR = '../data'\n", 45 | "FILE_NAME = 
'credit_card_default.csv'\n", 46 | "data_path = os.path.join(DATA_DIR, FILE_NAME)\n", 47 | "ccd = pd.read_csv(data_path, index_col=\"ID\")\n", 48 | "ccd.rename(columns=lambda x: x.lower(), inplace=True)\n", 49 | "ccd.rename(columns={'default payment next month':'default'}, inplace=True)\n", 50 | "\n", 51 | "# getting the groups of features\n", 52 | "bill_amt_features = ['bill_amt'+ str(i) for i in range(1,7)]\n", 53 | "pay_amt_features = ['pay_amt'+ str(i) for i in range(1,7)]\n", 54 | "numerical_features = ['limit_bal','age'] + bill_amt_features + pay_amt_features\n", 55 | "\n", 56 | "# Creating creating binary features\n", 57 | "ccd['male'] = (ccd['sex'] == 1).astype('int')\n", 58 | "ccd['grad_school'] = (ccd['education'] == 1).astype('int')\n", 59 | "ccd['university'] = (ccd['education'] == 2).astype('int')\n", 60 | "#ccd['high_school'] = (ccd['education'] == 3).astype('int')\n", 61 | "ccd['married'] = (ccd['marriage'] == 1).astype('int')\n", 62 | "\n", 63 | "# simplifying pay features \n", 64 | "pay_features= ['pay_' + str(i) for i in range(1,7)]\n", 65 | "for x in pay_features:\n", 66 | " ccd.loc[ccd[x] <= 0, x] = 0\n", 67 | "\n", 68 | "# simplifying delayed features\n", 69 | "delayed_features = ['delayed_' + str(i) for i in range(1,7)]\n", 70 | "for pay, delayed in zip(pay_features, delayed_features):\n", 71 | " ccd[delayed] = (ccd[pay] > 0).astype(int)\n", 72 | " \n", 73 | "# creating a new feature: months delayed\n", 74 | "ccd['months_delayed'] = ccd[delayed_features].sum(axis=1)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "## Splitting the dataset" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "numerical_features = numerical_features + ['months_delayed']\n", 91 | "binary_features = ['male','married','grad_school','university']\n", 92 | "X = ccd[numerical_features + binary_features]\n", 93 | "y = ccd['default'].astype(int)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "from sklearn.model_selection import train_test_split\n", 103 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=5/30, random_state=101)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "scrolled": true 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "# 1. Import the class you will use\n", 115 | "from sklearn.preprocessing import StandardScaler\n", 116 | "# 2. Create an instance of the class\n", 117 | "scaler = StandardScaler()\n", 118 | "# 3. Use the fit method of the instance\n", 119 | "scaler.fit(X_train[numerical_features])\n", 120 | "# 4. 
Use the transform method to perform the transformation\n", 121 | "X_train.loc[:, numerical_features] = scaler.transform(X_train[numerical_features])" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "## Logistic Regression" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "### A simple Logistic Regression model" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "from sklearn.linear_model import LogisticRegression\n", 145 | "simple_log_reg = LogisticRegression(C=1e6)\n", 146 | "simple_log_reg.fit(X_train['months_delayed'].values.reshape(-1, 1), y_train)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "print(\"W0: {}, W1: {}\".format(simple_log_reg.intercept_[0], simple_log_reg.coef_[0][0]))" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "def get_probs(months_delayed):\n", 165 | " m = scaler.mean_[-1]\n", 166 | " std = scaler.var_[-1]**.5\n", 167 | " x = (months_delayed - m)/std\n", 168 | " prob_default = 1/(1+np.exp(-simple_log_reg.intercept_[0] + -simple_log_reg.coef_[0][0]*x))\n", 169 | " return prob_default" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "months = np.arange(13)\n", 179 | "pred_probs = get_probs(months)\n", 180 | "pd.DataFrame({'months': months, 'pred_probs':pred_probs})" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "fig, ax = plt.subplots()\n", 190 | "ax.plot(months, pred_probs)\n", 191 | "ax.set_xlabel('Months delayed')\n", 192 | "ax.set_ylabel('Probability of default')\n", 193 | "ax.grid()" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "### A complete Logistic Regression model" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "log_reg = LogisticRegression(C=1e6)\n", 210 | "log_reg.fit(X_train, y_train)" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "prob_log_reg = log_reg.predict_proba(X_train)\n", 220 | "prob_log_reg[:10]" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "y_pred_log_reg = log_reg.predict(X_train)\n", 230 | "y_pred_log_reg[:10]" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "np.all(y_pred_log_reg == (prob_log_reg[:,1] > 0.5))" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "pd.Series(data=log_reg.coef_[0], index=X_train.columns).sort_values(ascending=False).round(2)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "from sklearn.metrics import accuracy_score\n", 258 | "accuracy_log_reg 
= accuracy_score(y_true=y_train, y_pred=y_pred_log_reg)\n", 259 | "accuracy_log_reg" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "## Classification Trees" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "from sklearn.tree import DecisionTreeClassifier\n", 276 | "class_tree = DecisionTreeClassifier(max_depth=3)\n", 277 | "class_tree.fit(X_train, y_train)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "from sklearn.externals.six import StringIO \n", 287 | "from sklearn.tree import export_graphviz\n", 288 | "from IPython.display import Image \n", 289 | "import pydotplus" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "dot_data = StringIO()\n", 299 | "export_graphviz(decision_tree=class_tree,\n", 300 | " out_file=dot_data,\n", 301 | " filled=True,\n", 302 | " rounded=True,\n", 303 | " feature_names = X_train.columns,\n", 304 | " class_names = ['pay','default'],\n", 305 | " special_characters=True)\n", 306 | "graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) \n", 307 | "Image(graph.create_png())" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": {}, 314 | "outputs": [], 315 | "source": [ 316 | "dot_data = StringIO()\n", 317 | "export_graphviz(decision_tree=class_tree,\n", 318 | " out_file=dot_data,\n", 319 | " filled=True,\n", 320 | " rounded=True,\n", 321 | " proportion=True,\n", 322 | " feature_names = X_train.columns,\n", 323 | " class_names = ['pay','default'],\n", 324 | " special_characters=True)\n", 325 | "graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) \n", 326 | "Image(graph.create_png())" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "### How trees work" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "from sklearn.datasets import make_blobs" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "A, b = make_blobs(n_samples=200, n_features=2, cluster_std=0.6,\n", 352 | " centers=[[-0.5,-1],[0.5,0.5]], shuffle=False, random_state=42)\n", 353 | "plt.scatter(A[:, 0], A[:, 1], c=b)\n", 354 | "plt.xlabel('X1', size=15)\n", 355 | "plt.ylabel('X2', size=15);" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": {}, 362 | "outputs": [], 363 | "source": [ 364 | "plt.scatter(A[:, 0], A[:, 1], c=b)\n", 365 | "plt.axhline(-0.6, c='red')\n", 366 | "plt.xlabel('X1', size=15)\n", 367 | "plt.ylabel('X2', size=15);" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "plt.scatter(A[:, 0], A[:, 1], c=b)\n", 377 | "plt.axhline(-0.6, c='red')\n", 378 | "plt.axvline(x=-0.1, ymin=0.34, c='red')\n", 379 | "plt.xlabel('X1', size=15)\n", 380 | "plt.ylabel('X2', size=15);" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": null, 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "plt.scatter(A[:, 0], A[:, 1], c=b)\n", 390 | "plt.axhline(-0.6, 
c='red')\n", 391 | "plt.axvline(x=-0.1, ymin=0.34, c='red')\n", 392 | "plt.axvline(x=0.7, ymax=0.34, c='red')\n", 393 | "plt.xlabel('X1', size=15)\n", 394 | "plt.ylabel('X2', size=15);" 395 | ] 396 | }, 397 | { 398 | "cell_type": "markdown", 399 | "metadata": {}, 400 | "source": [ 401 | "### Training a larger classification tree" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [ 410 | "class_tree = DecisionTreeClassifier(max_depth=6, min_samples_split=50)\n", 411 | "class_tree.fit(X_train, y_train)\n", 412 | "y_pred_class_tree = class_tree.predict(X_train)" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "accuracy_class_tree = accuracy_score(y_true=y_train, y_pred=y_pred_class_tree)\n", 422 | "accuracy_class_tree" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": [ 431 | "pd.Series(data=class_tree.feature_importances_, index=X_train.columns).sort_values(ascending=False).round(3)" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [ 440 | "pd.Series(data=class_tree.feature_importances_, index=X_train.columns).sort_values(ascending=False).plot(kind='bar');" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": {}, 446 | "source": [ 447 | "## Random Forests" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "from sklearn.ensemble import RandomForestClassifier\n", 457 | "rf = RandomForestClassifier(n_estimators=99,\n", 458 | " max_features=6,\n", 459 | " max_depth=6,\n", 460 | " min_samples_split=100,\n", 461 | " random_state=85)\n", 462 | "rf.fit(X_train, y_train)\n", 463 | "y_pred_rf = rf.predict(X_train)" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": {}, 470 | "outputs": [], 471 | "source": [ 472 | "accuracy_rf = accuracy_score(y_true=y_train, y_pred=y_pred_rf)\n", 473 | "accuracy_rf" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": null, 479 | "metadata": {}, 480 | "outputs": [], 481 | "source": [ 482 | "pd.Series(data=rf.feature_importances_, index=X_train.columns).sort_values(ascending=False).round(3)" 483 | ] 484 | }, 485 | { 486 | "cell_type": "markdown", 487 | "metadata": {}, 488 | "source": [ 489 | "## Training vs Testing Error" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "metadata": {}, 496 | "outputs": [], 497 | "source": [ 498 | "y_pred_null = np.zeros_like(y_test)\n", 499 | "accuracy_score(y_true=y_test, y_pred=y_pred_null)" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "## Remember to also standarize the numerical features in the testing set\n", 509 | "X_test.loc[:, numerical_features] = scaler.transform(X_test[numerical_features])" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": null, 515 | "metadata": {}, 516 | "outputs": [], 517 | "source": [ 518 | "## Calculating accuracy\n", 519 | "accuracies = pd.DataFrame(columns=['train', 'test'], index=['LogisticReg','ClassTree','RF'])\n", 520 | "model_dict = {'LogisticReg': log_reg, 
'ClassTree': class_tree, 'RF': rf}\n", 521 | "for name, model in model_dict.items():\n", 522 | " accuracies.loc[name, 'train'] = accuracy_score(y_true=y_train, y_pred=model.predict(X_train))\n", 523 | " accuracies.loc[name, 'test'] = accuracy_score(y_true=y_test, y_pred=model.predict(X_test))\n", 524 | "\n", 525 | "accuracies" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": null, 531 | "metadata": {}, 532 | "outputs": [], 533 | "source": [ 534 | "fig, ax = plt.subplots()\n", 535 | "accuracies.sort_values(by='test', ascending=False).plot(kind='barh', ax=ax, zorder=3)\n", 536 | "ax.grid(zorder=0)" 537 | ] 538 | }, 539 | { 540 | "cell_type": "markdown", 541 | "metadata": {}, 542 | "source": [ 543 | "## Multiclass classification" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": null, 549 | "metadata": {}, 550 | "outputs": [], 551 | "source": [ 552 | "# Loading the iris dataset\n", 553 | "from sklearn.datasets import load_iris\n", 554 | "iris = load_iris()\n", 555 | "# Training the logistic regression model\n", 556 | "iris_log_reg = LogisticRegression(C=1e5)\n", 557 | "iris_log_reg.fit(iris.data, iris.target)\n", 558 | "iris_probs = iris_log_reg.predict_proba(iris.data)\n", 559 | "iris_pred = iris_log_reg.predict(iris.data)" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": null, 565 | "metadata": {}, 566 | "outputs": [], 567 | "source": [ 568 | "iris_pred_df = pd.DataFrame(iris_probs, columns=iris.target_names).round(4)\n", 569 | "iris_pred_df['predicted_class'] = iris.target_names[iris_pred]\n", 570 | "iris_pred_df.sample(12)" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": null, 576 | "metadata": {}, 577 | "outputs": [], 578 | "source": [] 579 | } 580 | ], 581 | "metadata": { 582 | "kernelspec": { 583 | "display_name": "Python 3", 584 | "language": "python", 585 | "name": "python3" 586 | }, 587 | "language_info": { 588 | "codemirror_mode": { 589 | "name": "ipython", 590 | "version": 3 591 | }, 592 | "file_extension": ".py", 593 | "mimetype": "text/x-python", 594 | "name": "python", 595 | "nbconvert_exporter": "python", 596 | "pygments_lexer": "ipython3", 597 | "version": "3.6.10" 598 | } 599 | }, 600 | "nbformat": 4, 601 | "nbformat_minor": 2 602 | } 603 | -------------------------------------------------------------------------------- /Chapter06/.ipynb_checkpoints/ch6-2-classification-with-neural-networks-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Predicting Credit Card Default with Neural Networks" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "import matplotlib.pyplot as plt\n", 19 | "import seaborn as sns\n", 20 | "import os\n", 21 | "%matplotlib inline" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "### Back with the credit card default dataset" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# Loading the dataset\n", 38 | "DATA_DIR = '../data'\n", 39 | "FILE_NAME = 'credit_card_default.csv'\n", 40 | "data_path = os.path.join(DATA_DIR, FILE_NAME)\n", 41 | "ccd = pd.read_csv(data_path, index_col=\"ID\")\n", 42 | "ccd.rename(columns=lambda x: 
x.lower(), inplace=True)\n", 43 | "ccd.rename(columns={'default payment next month':'default'}, inplace=True)\n", 44 | "\n", 45 | "# getting the groups of features\n", 46 | "bill_amt_features = ['bill_amt'+ str(i) for i in range(1,7)]\n", 47 | "pay_amt_features = ['pay_amt'+ str(i) for i in range(1,7)]\n", 48 | "numerical_features = ['limit_bal','age'] + bill_amt_features + pay_amt_features\n", 49 | "\n", 50 | "# Creating binary features\n", 51 | "ccd['male'] = (ccd['sex'] == 1).astype('int')\n", 52 | "ccd['grad_school'] = (ccd['education'] == 1).astype('int')\n", 53 | "ccd['university'] = (ccd['education'] == 2).astype('int')\n", 54 | "#ccd['high_school'] = (ccd['education'] == 3).astype('int')\n", 55 | "ccd['married'] = (ccd['marriage'] == 1).astype('int')\n", 56 | "\n", 57 | "# simplifying pay features \n", 58 | "pay_features= ['pay_' + str(i) for i in range(1,7)]\n", 59 | "for x in pay_features:\n", 60 | " ccd.loc[ccd[x] <= 0, x] = 0\n", 61 | "\n", 62 | "# simplifying delayed features\n", 63 | "delayed_features = ['delayed_' + str(i) for i in range(1,7)]\n", 64 | "for pay, delayed in zip(pay_features, delayed_features):\n", 65 | " ccd[delayed] = (ccd[pay] > 0).astype(int)\n", 66 | " \n", 67 | "# creating a new feature: months delayed\n", 68 | "ccd['months_delayed'] = ccd[delayed_features].sum(axis=1)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "## Split and standardize the dataset" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 3, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "name": "stderr", 85 | "output_type": "stream", 86 | "text": [ 87 | "C:\\Anaconda\\envs\\ho-pawp\\lib\\site-packages\\pandas\\core\\indexing.py:966: SettingWithCopyWarning: \n", 88 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 89 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 90 | "\n", 91 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 92 | " self.obj[item] = s\n", 93 | "C:\\Anaconda\\envs\\ho-pawp\\lib\\site-packages\\pandas\\core\\indexing.py:966: SettingWithCopyWarning: \n", 94 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 95 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 96 | "\n", 97 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 98 | " self.obj[item] = s\n" 99 | ] 100 | } 101 | ], 102 | "source": [ 103 | "numerical_features = numerical_features + ['months_delayed']\n", 104 | "binary_features = ['male','married','grad_school','university']\n", 105 | "X = ccd[numerical_features + binary_features]\n", 106 | "y = ccd['default'].astype(int)\n", 107 | "\n", 108 | "## Split\n", 109 | "from sklearn.model_selection import train_test_split\n", 110 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=5/30, random_state=101)\n", 111 | "\n", 112 | "## Standardize\n", 113 | "from sklearn.preprocessing import StandardScaler\n", 114 | "scaler = StandardScaler()\n", 115 | "scaler.fit(X_train[numerical_features])\n", 116 | "X_train.loc[:, numerical_features] = scaler.transform(X_train[numerical_features])\n", 117 | "# Also standardize the testing set\n", 118 | "X_test.loc[:, numerical_features] = scaler.transform(X_test[numerical_features])" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 |
"### Building the neural network for classification" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 4, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "name": "stderr", 135 | "output_type": "stream", 136 | "text": [ 137 | "Using TensorFlow backend.\n" 138 | ] 139 | } 140 | ], 141 | "source": [ 142 | "from keras.models import Sequential\n", 143 | "nn_classifier = Sequential()" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 5, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "from keras.layers import Dense\n", 153 | "n_input = X_train.shape[1]\n", 154 | "n_units_hidden = 64\n", 155 | "nn_classifier.add(Dense(units=n_units_hidden, activation='relu', input_shape=(n_input,)))" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 6, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "# add 2nd hidden layer\n", 165 | "nn_classifier.add(Dense(units=n_units_hidden, activation='relu'))\n", 166 | "# add 3rd hidden layer\n", 167 | "nn_classifier.add(Dense(units=n_units_hidden, activation='relu'))\n", 168 | "# add 4th hidden layer\n", 169 | "nn_classifier.add(Dense(units=n_units_hidden, activation='relu'))\n", 170 | "# add 5th hidden layer\n", 171 | "nn_classifier.add(Dense(units=n_units_hidden, activation='relu'))" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 7, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "# output layer\n", 181 | "nn_classifier.add(Dense(1, activation='sigmoid'))" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "### Training the network" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 8, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "## compiling step\n", 198 | "nn_classifier.compile(loss='binary_crossentropy', optimizer='adam')" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 9, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "Model: \"sequential_1\"\n", 211 | "_________________________________________________________________\n", 212 | "Layer (type) Output Shape Param # \n", 213 | "=================================================================\n", 214 | "dense_1 (Dense) (None, 64) 1280 \n", 215 | "_________________________________________________________________\n", 216 | "dense_2 (Dense) (None, 64) 4160 \n", 217 | "_________________________________________________________________\n", 218 | "dense_3 (Dense) (None, 64) 4160 \n", 219 | "_________________________________________________________________\n", 220 | "dense_4 (Dense) (None, 64) 4160 \n", 221 | "_________________________________________________________________\n", 222 | "dense_5 (Dense) (None, 64) 4160 \n", 223 | "_________________________________________________________________\n", 224 | "dense_6 (Dense) (None, 1) 65 \n", 225 | "=================================================================\n", 226 | "Total params: 17,985\n", 227 | "Trainable params: 17,985\n", 228 | "Non-trainable params: 0\n", 229 | "_________________________________________________________________\n" 230 | ] 231 | } 232 | ], 233 | "source": [ 234 | "nn_classifier.summary()" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 10, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 |
"nn_classifier.save_weights('class_initial_w.h5')" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 11, 249 | "metadata": {}, 250 | "outputs": [ 251 | { 252 | "name": "stdout", 253 | "output_type": "stream", 254 | "text": [ 255 | "Epoch 1/150\n", 256 | "25000/25000 [==============================] - 1s 30us/step - loss: 0.4690\n", 257 | "Epoch 2/150\n", 258 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4481\n", 259 | "Epoch 3/150\n", 260 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4446\n", 261 | "Epoch 4/150\n", 262 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4428\n", 263 | "Epoch 5/150\n", 264 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4405\n", 265 | "Epoch 6/150\n", 266 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4399\n", 267 | "Epoch 7/150\n", 268 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4388\n", 269 | "Epoch 8/150\n", 270 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4379\n", 271 | "Epoch 9/150\n", 272 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4365\n", 273 | "Epoch 10/150\n", 274 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4353\n", 275 | "Epoch 11/150\n", 276 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4348\n", 277 | "Epoch 12/150\n", 278 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4334\n", 279 | "Epoch 13/150\n", 280 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4325\n", 281 | "Epoch 14/150\n", 282 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4303\n", 283 | "Epoch 15/150\n", 284 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4289\n", 285 | "Epoch 16/150\n", 286 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4282\n", 287 | "Epoch 17/150\n", 288 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4262\n", 289 | "Epoch 18/150\n", 290 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4243\n", 291 | "Epoch 19/150\n", 292 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4240\n", 293 | "Epoch 20/150\n", 294 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4217\n", 295 | "Epoch 21/150\n", 296 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4203\n", 297 | "Epoch 22/150\n", 298 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4180\n", 299 | "Epoch 23/150\n", 300 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4166\n", 301 | "Epoch 24/150\n", 302 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4145\n", 303 | "Epoch 25/150\n", 304 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4128\n", 305 | "Epoch 26/150\n", 306 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4113\n", 307 | "Epoch 27/150\n", 308 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4088\n", 309 | "Epoch 28/150\n", 310 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4065\n", 311 | "Epoch 29/150\n", 312 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4038\n", 313 | "Epoch 30/150\n", 314 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4034\n", 315 | 
"Epoch 31/150\n", 316 | "25000/25000 [==============================] - ETA: 0s - loss: 0.397 - 1s 22us/step - loss: 0.3986\n", 317 | "Epoch 32/150\n", 318 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3986\n", 319 | "Epoch 33/150\n", 320 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3968\n", 321 | "Epoch 34/150\n", 322 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3920\n", 323 | "Epoch 35/150\n", 324 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3888\n", 325 | "Epoch 36/150\n", 326 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3870\n", 327 | "Epoch 37/150\n", 328 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3869\n", 329 | "Epoch 38/150\n", 330 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3828\n", 331 | "Epoch 39/150\n", 332 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3821\n", 333 | "Epoch 40/150\n", 334 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3779\n", 335 | "Epoch 41/150\n", 336 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3760\n", 337 | "Epoch 42/150\n", 338 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3739\n", 339 | "Epoch 43/150\n", 340 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3742\n", 341 | "Epoch 44/150\n", 342 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3683\n", 343 | "Epoch 45/150\n", 344 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3663\n", 345 | "Epoch 46/150\n", 346 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3641\n", 347 | "Epoch 47/150\n", 348 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3625\n", 349 | "Epoch 48/150\n", 350 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3605\n", 351 | "Epoch 49/150\n", 352 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3571\n", 353 | "Epoch 50/150\n", 354 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3525\n", 355 | "Epoch 51/150\n", 356 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3547\n", 357 | "Epoch 52/150\n", 358 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3493\n", 359 | "Epoch 53/150\n", 360 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3481\n", 361 | "Epoch 54/150\n", 362 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3484\n", 363 | "Epoch 55/150\n", 364 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3442\n", 365 | "Epoch 56/150\n", 366 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3426\n", 367 | "Epoch 57/150\n", 368 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3386\n", 369 | "Epoch 58/150\n", 370 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3404\n", 371 | "Epoch 59/150\n", 372 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3381\n", 373 | "Epoch 60/150\n", 374 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.3370\n", 375 | "Epoch 61/150\n", 376 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3307\n", 377 | "Epoch 62/150\n", 378 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3301\n", 379 | "Epoch 
63/150\n", 380 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3283\n", 381 | "Epoch 64/150\n", 382 | "25000/25000 [==============================] - 1s 26us/step - loss: 0.3248\n", 383 | "Epoch 65/150\n", 384 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3261\n", 385 | "Epoch 66/150\n", 386 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3221\n", 387 | "Epoch 67/150\n", 388 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3203\n", 389 | "Epoch 68/150\n", 390 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3228\n", 391 | "Epoch 69/150\n", 392 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3181\n", 393 | "Epoch 70/150\n", 394 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3193\n", 395 | "Epoch 71/150\n", 396 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3115\n", 397 | "Epoch 72/150\n", 398 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3155\n", 399 | "Epoch 73/150\n", 400 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3161\n", 401 | "Epoch 74/150\n", 402 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.3071\n", 403 | "Epoch 75/150\n", 404 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3089\n", 405 | "Epoch 76/150\n", 406 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3074\n", 407 | "Epoch 77/150\n", 408 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3079\n", 409 | "Epoch 78/150\n", 410 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3001\n", 411 | "Epoch 79/150\n", 412 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3061\n", 413 | "Epoch 80/150\n", 414 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.3023\n", 415 | "Epoch 81/150\n", 416 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3015\n", 417 | "Epoch 82/150\n", 418 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.2933\n", 419 | "Epoch 83/150\n", 420 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2968\n", 421 | "Epoch 84/150\n", 422 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2973\n", 423 | "Epoch 85/150\n", 424 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2949\n", 425 | "Epoch 86/150\n", 426 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2941\n", 427 | "Epoch 87/150\n", 428 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.2835\n", 429 | "Epoch 88/150\n", 430 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2889\n", 431 | "Epoch 89/150\n", 432 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.2862\n", 433 | "Epoch 90/150\n", 434 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2876\n", 435 | "Epoch 91/150\n", 436 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2809\n", 437 | "Epoch 92/150\n", 438 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2876\n", 439 | "Epoch 93/150\n" 440 | ] 441 | }, 442 | { 443 | "name": "stdout", 444 | "output_type": "stream", 445 | "text": [ 446 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2804\n", 447 | "Epoch 94/150\n", 448 | "25000/25000 
[==============================] - 1s 23us/step - loss: 0.2834\n", 449 | "Epoch 95/150\n", 450 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2803\n", 451 | "Epoch 96/150\n", 452 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2822\n", 453 | "Epoch 97/150\n", 454 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2810\n", 455 | "Epoch 98/150\n", 456 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2751\n", 457 | "Epoch 99/150\n", 458 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2754\n", 459 | "Epoch 100/150\n", 460 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2708\n", 461 | "Epoch 101/150\n", 462 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2708\n", 463 | "Epoch 102/150\n", 464 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2890\n", 465 | "Epoch 103/150\n", 466 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2701\n", 467 | "Epoch 104/150\n", 468 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2674\n", 469 | "Epoch 105/150\n", 470 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2684\n", 471 | "Epoch 106/150\n", 472 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2673\n", 473 | "Epoch 107/150\n", 474 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2690\n", 475 | "Epoch 108/150\n", 476 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2659\n", 477 | "Epoch 109/150\n", 478 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2624\n", 479 | "Epoch 110/150\n", 480 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2663\n", 481 | "Epoch 111/150\n", 482 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2614\n", 483 | "Epoch 112/150\n", 484 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2583\n", 485 | "Epoch 113/150\n", 486 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2560\n", 487 | "Epoch 114/150\n", 488 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2597\n", 489 | "Epoch 115/150\n", 490 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2599\n", 491 | "Epoch 116/150\n", 492 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2570\n", 493 | "Epoch 117/150\n", 494 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2552\n", 495 | "Epoch 118/150\n", 496 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2508\n", 497 | "Epoch 119/150\n", 498 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2494\n", 499 | "Epoch 120/150\n", 500 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2518\n", 501 | "Epoch 121/150\n", 502 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2473\n", 503 | "Epoch 122/150\n", 504 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2574\n", 505 | "Epoch 123/150\n", 506 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2521\n", 507 | "Epoch 124/150\n", 508 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2537\n", 509 | "Epoch 125/150\n", 510 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2490\n", 511 | "Epoch 126/150\n", 512 | "25000/25000 
[==============================] - 1s 23us/step - loss: 0.2459\n", 513 | "Epoch 127/150\n", 514 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2457\n", 515 | "Epoch 128/150\n", 516 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2479\n", 517 | "Epoch 129/150\n", 518 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2511\n", 519 | "Epoch 130/150\n", 520 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2422\n", 521 | "Epoch 131/150\n", 522 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2419\n", 523 | "Epoch 132/150\n", 524 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2390\n", 525 | "Epoch 133/150\n", 526 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2396\n", 527 | "Epoch 134/150\n", 528 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2336\n", 529 | "Epoch 135/150\n", 530 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2331\n", 531 | "Epoch 136/150\n", 532 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2507\n", 533 | "Epoch 137/150\n", 534 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2420\n", 535 | "Epoch 138/150\n", 536 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2434\n", 537 | "Epoch 139/150\n", 538 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2335\n", 539 | "Epoch 140/150\n", 540 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2268\n", 541 | "Epoch 141/150\n", 542 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2317\n", 543 | "Epoch 142/150\n", 544 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2314\n", 545 | "Epoch 143/150\n", 546 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2426\n", 547 | "Epoch 144/150\n", 548 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2306\n", 549 | "Epoch 145/150\n", 550 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2402\n", 551 | "Epoch 146/150\n", 552 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2297\n", 553 | "Epoch 147/150\n", 554 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2253\n", 555 | "Epoch 148/150\n", 556 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2187\n", 557 | "Epoch 149/150\n", 558 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2243\n", 559 | "Epoch 150/150\n", 560 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2256\n" 561 | ] 562 | }, 563 | { 564 | "data": { 565 | "text/plain": [ 566 | "" 567 | ] 568 | }, 569 | "execution_count": 11, 570 | "metadata": {}, 571 | "output_type": "execute_result" 572 | } 573 | ], 574 | "source": [ 575 | "batch_size = 64\n", 576 | "n_epochs = 150\n", 577 | "nn_classifier.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_size)" 578 | ] 579 | }, 580 | { 581 | "cell_type": "markdown", 582 | "metadata": {}, 583 | "source": [ 584 | "## Evaluating predictions" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": 12, 590 | "metadata": {}, 591 | "outputs": [], 592 | "source": [ 593 | "## Getting the probabilities\n", 594 | "y_pred_train_prob = nn_classifier.predict(X_train)\n", 595 | "y_pred_test_prob = nn_classifier.predict(X_test)\n", 596 | "\n", 597 | "## Classifications 
from predictions\n", 598 | "y_pred_train = (y_pred_train_prob > 0.5).astype(int)\n", 599 | "y_pred_test = (y_pred_test_prob > 0.5).astype(int)" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": 13, 605 | "metadata": {}, 606 | "outputs": [ 607 | { 608 | "name": "stdout", 609 | "output_type": "stream", 610 | "text": [ 611 | "Train Accuracy: 0.903 \n", 612 | "Test Accuracy: 0.750\n" 613 | ] 614 | } 615 | ], 616 | "source": [ 617 | "from sklearn.metrics import accuracy_score\n", 618 | "train_acc = accuracy_score(y_true=y_train, y_pred=y_pred_train)\n", 619 | "test_acc = accuracy_score(y_true=y_test, y_pred=y_pred_test)\n", 620 | "print(\"Train Accuracy: {:0.3f} \\nTest Accuracy: {:0.3f}\".format(train_acc, test_acc))" 621 | ] 622 | }, 623 | { 624 | "cell_type": "markdown", 625 | "metadata": {}, 626 | "source": [ 627 | "## Re-training the network with fewer epochs" 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": 14, 633 | "metadata": {}, 634 | "outputs": [], 635 | "source": [ 636 | "## load the initial weights\n", 637 | "nn_classifier.load_weights('class_initial_w.h5')" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": null, 643 | "metadata": { 644 | "scrolled": true 645 | }, 646 | "outputs": [ 647 | { 648 | "name": "stdout", 649 | "output_type": "stream", 650 | "text": [ 651 | "Epoch 1/50\n", 652 | "25000/25000 [==============================] - 1s 30us/step - loss: 0.4680\n", 653 | "Epoch 2/50\n", 654 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4479\n", 655 | "Epoch 3/50\n", 656 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4454\n", 657 | "Epoch 4/50\n", 658 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4430\n", 659 | "Epoch 5/50\n", 660 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4407\n", 661 | "Epoch 6/50\n", 662 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4401\n", 663 | "Epoch 7/50\n", 664 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4381\n", 665 | "Epoch 8/50\n", 666 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4372\n", 667 | "Epoch 9/50\n", 668 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4356\n", 669 | "Epoch 10/50\n", 670 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4350\n", 671 | "Epoch 11/50\n", 672 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4334\n", 673 | "Epoch 12/50\n", 674 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4323\n", 675 | "Epoch 13/50\n", 676 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4320\n", 677 | "Epoch 14/50\n", 678 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4302\n", 679 | "Epoch 15/50\n", 680 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.4284\n", 681 | "Epoch 16/50\n", 682 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4278\n", 683 | "Epoch 17/50\n", 684 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4260\n", 685 | "Epoch 18/50\n", 686 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4249\n", 687 | "Epoch 19/50\n", 688 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4226\n", 689 | "Epoch 20/50\n", 690 | "10752/25000 [===========>..................]
- ETA: 0s - loss: 0.4187" 691 | ] 692 | } 693 | ], 694 | "source": [ 695 | "batch_size = 64\n", 696 | "n_epochs = 50\n", 697 | "nn_classifier.compile(loss='binary_crossentropy', optimizer='adam')\n", 698 | "nn_classifier.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_size)" 699 | ] 700 | }, 701 | { 702 | "cell_type": "code", 703 | "execution_count": null, 704 | "metadata": {}, 705 | "outputs": [], 706 | "source": [ 707 | "## Getting the probabilities\n", 708 | "y_pred_train_prob = nn_classifier.predict(X_train)\n", 709 | "y_pred_test_prob = nn_classifier.predict(X_test)\n", 710 | "\n", 711 | "## Classifications from predictions\n", 712 | "y_pred_train = (y_pred_train_prob > 0.5).astype(int)\n", 713 | "y_pred_test = (y_pred_test_prob > 0.5).astype(int)\n", 714 | "\n", 715 | "## Calculating accuracy\n", 716 | "train_acc = accuracy_score(y_true=y_train, y_pred=y_pred_train)\n", 717 | "test_acc = accuracy_score(y_true=y_test, y_pred=y_pred_test)\n", 718 | "print(\"Train Accuracy: {:0.3f} \\nTest Accuracy: {:0.3f}\".format(train_acc, test_acc))" 719 | ] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "execution_count": null, 724 | "metadata": {}, 725 | "outputs": [], 726 | "source": [] 727 | }, 728 | { 729 | "cell_type": "code", 730 | "execution_count": null, 731 | "metadata": {}, 732 | "outputs": [], 733 | "source": [] 734 | }, 735 | { 736 | "cell_type": "code", 737 | "execution_count": null, 738 | "metadata": {}, 739 | "outputs": [], 740 | "source": [] 741 | }, 742 | { 743 | "cell_type": "code", 744 | "execution_count": null, 745 | "metadata": {}, 746 | "outputs": [], 747 | "source": [] 748 | } 749 | ], 750 | "metadata": { 751 | "kernelspec": { 752 | "display_name": "Python 3", 753 | "language": "python", 754 | "name": "python3" 755 | }, 756 | "language_info": { 757 | "codemirror_mode": { 758 | "name": "ipython", 759 | "version": 3 760 | }, 761 | "file_extension": ".py", 762 | "mimetype": "text/x-python", 763 | "name": "python", 764 | "nbconvert_exporter": "python", 765 | "pygments_lexer": "ipython3", 766 | "version": "3.6.10" 767 | } 768 | }, 769 | "nbformat": 4, 770 | "nbformat_minor": 2 771 | } 772 | -------------------------------------------------------------------------------- /Chapter06/ch6-2-classification-with-neural-networks.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Predicting Credit Card Default with Neural Networks" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "import matplotlib.pyplot as plt\n", 19 | "import seaborn as sns\n", 20 | "import os\n", 21 | "%matplotlib inline" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "### Back with the credit card default dataset" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# Loading the dataset\n", 38 | "DATA_DIR = '../data'\n", 39 | "FILE_NAME = 'credit_card_default.csv'\n", 40 | "data_path = os.path.join(DATA_DIR, FILE_NAME)\n", 41 | "ccd = pd.read_csv(data_path, index_col=\"ID\")\n", 42 | "ccd.rename(columns=lambda x: x.lower(), inplace=True)\n", 43 | "ccd.rename(columns={'default payment next month':'default'}, inplace=True)\n", 44 | "\n", 45 | "# getting the groups of features\n", 46 |
"bill_amt_features = ['bill_amt'+ str(i) for i in range(1,7)]\n", 47 | "pay_amt_features = ['pay_amt'+ str(i) for i in range(1,7)]\n", 48 | "numerical_features = ['limit_bal','age'] + bill_amt_features + pay_amt_features\n", 49 | "\n", 50 | "# Creating binary features\n", 51 | "ccd['male'] = (ccd['sex'] == 1).astype('int')\n", 52 | "ccd['grad_school'] = (ccd['education'] == 1).astype('int')\n", 53 | "ccd['university'] = (ccd['education'] == 2).astype('int')\n", 54 | "#ccd['high_school'] = (ccd['education'] == 3).astype('int')\n", 55 | "ccd['married'] = (ccd['marriage'] == 1).astype('int')\n", 56 | "\n", 57 | "# simplifying pay features \n", 58 | "pay_features= ['pay_' + str(i) for i in range(1,7)]\n", 59 | "for x in pay_features:\n", 60 | " ccd.loc[ccd[x] <= 0, x] = 0\n", 61 | "\n", 62 | "# simplifying delayed features\n", 63 | "delayed_features = ['delayed_' + str(i) for i in range(1,7)]\n", 64 | "for pay, delayed in zip(pay_features, delayed_features):\n", 65 | " ccd[delayed] = (ccd[pay] > 0).astype(int)\n", 66 | " \n", 67 | "# creating a new feature: months delayed\n", 68 | "ccd['months_delayed'] = ccd[delayed_features].sum(axis=1)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "## Split and standardize the dataset" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 3, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "name": "stderr", 85 | "output_type": "stream", 86 | "text": [ 87 | "C:\\Anaconda\\envs\\ho-pawp\\lib\\site-packages\\pandas\\core\\indexing.py:966: SettingWithCopyWarning: \n", 88 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 89 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 90 | "\n", 91 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 92 | " self.obj[item] = s\n", 93 | "C:\\Anaconda\\envs\\ho-pawp\\lib\\site-packages\\pandas\\core\\indexing.py:966: SettingWithCopyWarning: \n", 94 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 95 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 96 | "\n", 97 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 98 | " self.obj[item] = s\n" 99 | ] 100 | } 101 | ], 102 | "source": [ 103 | "numerical_features = numerical_features + ['months_delayed']\n", 104 | "binary_features = ['male','married','grad_school','university']\n", 105 | "X = ccd[numerical_features + binary_features]\n", 106 | "y = ccd['default'].astype(int)\n", 107 | "\n", 108 | "## Split\n", 109 | "from sklearn.model_selection import train_test_split\n", 110 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=5/30, random_state=101)\n", 111 | "\n", 112 | "## Standardize\n", 113 | "from sklearn.preprocessing import StandardScaler\n", 114 | "scaler = StandardScaler()\n", 115 | "scaler.fit(X_train[numerical_features])\n", 116 | "X_train.loc[:, numerical_features] = scaler.transform(X_train[numerical_features])\n", 117 | "# Also standardize the testing set\n", 118 | "X_test.loc[:, numerical_features] = scaler.transform(X_test[numerical_features])" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "### Building the neural network for classification" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 4, 131 | "metadata": {}, 132 | "outputs":
[ 133 | { 134 | "name": "stderr", 135 | "output_type": "stream", 136 | "text": [ 137 | "Using TensorFlow backend.\n" 138 | ] 139 | } 140 | ], 141 | "source": [ 142 | "from keras.models import Sequential\n", 143 | "nn_classifier = Sequential()" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 5, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "from keras.layers import Dense\n", 153 | "n_input = X_train.shape[1]\n", 154 | "n_units_hidden = 64\n", 155 | "nn_classifier.add(Dense(units=n_units_hidden, activation='relu', input_shape=(n_input,)))" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 6, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "# add 2nd hidden layer\n", 165 | "nn_classifier.add(Dense(units=n_units_hidden, activation='relu'))\n", 166 | "# add 3rd hidden layer\n", 167 | "nn_classifier.add(Dense(units=n_units_hidden, activation='relu'))\n", 168 | "# add 4th hidden layer\n", 169 | "nn_classifier.add(Dense(units=n_units_hidden, activation='relu'))\n", 170 | "# add 5th hidden layer\n", 171 | "nn_classifier.add(Dense(units=n_units_hidden, activation='relu'))" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 7, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "# output layer\n", 181 | "nn_classifier.add(Dense(1, activation='sigmoid'))" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "### Training the network" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 8, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "## compiling step\n", 198 | "nn_classifier.compile(loss='binary_crossentropy', optimizer='adam')" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 9, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "Model: \"sequential_1\"\n", 211 | "_________________________________________________________________\n", 212 | "Layer (type) Output Shape Param # \n", 213 | "=================================================================\n", 214 | "dense_1 (Dense) (None, 64) 1280 \n", 215 | "_________________________________________________________________\n", 216 | "dense_2 (Dense) (None, 64) 4160 \n", 217 | "_________________________________________________________________\n", 218 | "dense_3 (Dense) (None, 64) 4160 \n", 219 | "_________________________________________________________________\n", 220 | "dense_4 (Dense) (None, 64) 4160 \n", 221 | "_________________________________________________________________\n", 222 | "dense_5 (Dense) (None, 64) 4160 \n", 223 | "_________________________________________________________________\n", 224 | "dense_6 (Dense) (None, 1) 65 \n", 225 | "=================================================================\n", 226 | "Total params: 17,985\n", 227 | "Trainable params: 17,985\n", 228 | "Non-trainable params: 0\n", 229 | "_________________________________________________________________\n" 230 | ] 231 | } 232 | ], 233 | "source": [ 234 | "nn_classifier.summary()" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 10, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "nn_classifier.save_weights('class_initial_w.h5')" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 11, 249 | "metadata": {}, 250 | "outputs": [ 251 | { 252 | "name": "stdout", 253 |
"output_type": "stream", 254 | "text": [ 255 | "Epoch 1/150\n", 256 | "25000/25000 [==============================] - 1s 30us/step - loss: 0.4690\n", 257 | "Epoch 2/150\n", 258 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4481\n", 259 | "Epoch 3/150\n", 260 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4446\n", 261 | "Epoch 4/150\n", 262 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4428\n", 263 | "Epoch 5/150\n", 264 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4405\n", 265 | "Epoch 6/150\n", 266 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4399\n", 267 | "Epoch 7/150\n", 268 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4388\n", 269 | "Epoch 8/150\n", 270 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4379\n", 271 | "Epoch 9/150\n", 272 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4365\n", 273 | "Epoch 10/150\n", 274 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4353\n", 275 | "Epoch 11/150\n", 276 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4348\n", 277 | "Epoch 12/150\n", 278 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4334\n", 279 | "Epoch 13/150\n", 280 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4325\n", 281 | "Epoch 14/150\n", 282 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4303\n", 283 | "Epoch 15/150\n", 284 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4289\n", 285 | "Epoch 16/150\n", 286 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4282\n", 287 | "Epoch 17/150\n", 288 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4262\n", 289 | "Epoch 18/150\n", 290 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4243\n", 291 | "Epoch 19/150\n", 292 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4240\n", 293 | "Epoch 20/150\n", 294 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4217\n", 295 | "Epoch 21/150\n", 296 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4203\n", 297 | "Epoch 22/150\n", 298 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4180\n", 299 | "Epoch 23/150\n", 300 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4166\n", 301 | "Epoch 24/150\n", 302 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4145\n", 303 | "Epoch 25/150\n", 304 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4128\n", 305 | "Epoch 26/150\n", 306 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4113\n", 307 | "Epoch 27/150\n", 308 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4088\n", 309 | "Epoch 28/150\n", 310 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4065\n", 311 | "Epoch 29/150\n", 312 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4038\n", 313 | "Epoch 30/150\n", 314 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4034\n", 315 | "Epoch 31/150\n", 316 | "25000/25000 [==============================] - ETA: 0s - loss: 0.397 - 1s 22us/step - loss: 0.3986\n", 317 | "Epoch 32/150\n", 318 | "25000/25000 [==============================] - 1s 
22us/step - loss: 0.3986\n", 319 | "Epoch 33/150\n", 320 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3968\n", 321 | "Epoch 34/150\n", 322 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3920\n", 323 | "Epoch 35/150\n", 324 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3888\n", 325 | "Epoch 36/150\n", 326 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3870\n", 327 | "Epoch 37/150\n", 328 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3869\n", 329 | "Epoch 38/150\n", 330 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3828\n", 331 | "Epoch 39/150\n", 332 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3821\n", 333 | "Epoch 40/150\n", 334 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3779\n", 335 | "Epoch 41/150\n", 336 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3760\n", 337 | "Epoch 42/150\n", 338 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3739\n", 339 | "Epoch 43/150\n", 340 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3742\n", 341 | "Epoch 44/150\n", 342 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3683\n", 343 | "Epoch 45/150\n", 344 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3663\n", 345 | "Epoch 46/150\n", 346 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3641\n", 347 | "Epoch 47/150\n", 348 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3625\n", 349 | "Epoch 48/150\n", 350 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3605\n", 351 | "Epoch 49/150\n", 352 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3571\n", 353 | "Epoch 50/150\n", 354 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3525\n", 355 | "Epoch 51/150\n", 356 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3547\n", 357 | "Epoch 52/150\n", 358 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3493\n", 359 | "Epoch 53/150\n", 360 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3481\n", 361 | "Epoch 54/150\n", 362 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3484\n", 363 | "Epoch 55/150\n", 364 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3442\n", 365 | "Epoch 56/150\n", 366 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3426\n", 367 | "Epoch 57/150\n", 368 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3386\n", 369 | "Epoch 58/150\n", 370 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3404\n", 371 | "Epoch 59/150\n", 372 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3381\n", 373 | "Epoch 60/150\n", 374 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.3370\n", 375 | "Epoch 61/150\n", 376 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3307\n", 377 | "Epoch 62/150\n", 378 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3301\n", 379 | "Epoch 63/150\n", 380 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3283\n", 381 | "Epoch 64/150\n", 382 | "25000/25000 [==============================] - 1s 26us/step - loss: 0.3248\n", 383 | 
"Epoch 65/150\n", 384 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3261\n", 385 | "Epoch 66/150\n", 386 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3221\n", 387 | "Epoch 67/150\n", 388 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3203\n", 389 | "Epoch 68/150\n", 390 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3228\n", 391 | "Epoch 69/150\n", 392 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3181\n", 393 | "Epoch 70/150\n", 394 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3193\n", 395 | "Epoch 71/150\n", 396 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3115\n", 397 | "Epoch 72/150\n", 398 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3155\n", 399 | "Epoch 73/150\n", 400 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3161\n", 401 | "Epoch 74/150\n", 402 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.3071\n", 403 | "Epoch 75/150\n", 404 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3089\n", 405 | "Epoch 76/150\n", 406 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3074\n", 407 | "Epoch 77/150\n", 408 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3079\n", 409 | "Epoch 78/150\n", 410 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3001\n", 411 | "Epoch 79/150\n", 412 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3061\n", 413 | "Epoch 80/150\n", 414 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.3023\n", 415 | "Epoch 81/150\n", 416 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3015\n", 417 | "Epoch 82/150\n", 418 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.2933\n", 419 | "Epoch 83/150\n", 420 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2968\n", 421 | "Epoch 84/150\n", 422 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2973\n", 423 | "Epoch 85/150\n", 424 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2949\n", 425 | "Epoch 86/150\n", 426 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2941\n", 427 | "Epoch 87/150\n", 428 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.2835\n", 429 | "Epoch 88/150\n", 430 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2889\n", 431 | "Epoch 89/150\n", 432 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.2862\n", 433 | "Epoch 90/150\n", 434 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2876\n", 435 | "Epoch 91/150\n", 436 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2809\n", 437 | "Epoch 92/150\n", 438 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2876\n", 439 | "Epoch 93/150\n" 440 | ] 441 | }, 442 | { 443 | "name": "stdout", 444 | "output_type": "stream", 445 | "text": [ 446 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2804\n", 447 | "Epoch 94/150\n", 448 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2834\n", 449 | "Epoch 95/150\n", 450 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2803\n", 451 | "Epoch 96/150\n", 452 | "25000/25000 
[==============================] - 1s 23us/step - loss: 0.2822\n", 453 | "Epoch 97/150\n", 454 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2810\n", 455 | "Epoch 98/150\n", 456 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2751\n", 457 | "Epoch 99/150\n", 458 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2754\n", 459 | "Epoch 100/150\n", 460 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2708\n", 461 | "Epoch 101/150\n", 462 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2708\n", 463 | "Epoch 102/150\n", 464 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2890\n", 465 | "Epoch 103/150\n", 466 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2701\n", 467 | "Epoch 104/150\n", 468 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2674\n", 469 | "Epoch 105/150\n", 470 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2684\n", 471 | "Epoch 106/150\n", 472 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2673\n", 473 | "Epoch 107/150\n", 474 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2690\n", 475 | "Epoch 108/150\n", 476 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2659\n", 477 | "Epoch 109/150\n", 478 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2624\n", 479 | "Epoch 110/150\n", 480 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2663\n", 481 | "Epoch 111/150\n", 482 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2614\n", 483 | "Epoch 112/150\n", 484 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2583\n", 485 | "Epoch 113/150\n", 486 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2560\n", 487 | "Epoch 114/150\n", 488 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2597\n", 489 | "Epoch 115/150\n", 490 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2599\n", 491 | "Epoch 116/150\n", 492 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2570\n", 493 | "Epoch 117/150\n", 494 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2552\n", 495 | "Epoch 118/150\n", 496 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2508\n", 497 | "Epoch 119/150\n", 498 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2494\n", 499 | "Epoch 120/150\n", 500 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2518\n", 501 | "Epoch 121/150\n", 502 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2473\n", 503 | "Epoch 122/150\n", 504 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2574\n", 505 | "Epoch 123/150\n", 506 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2521\n", 507 | "Epoch 124/150\n", 508 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2537\n", 509 | "Epoch 125/150\n", 510 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2490\n", 511 | "Epoch 126/150\n", 512 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2459\n", 513 | "Epoch 127/150\n", 514 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2457\n", 515 | "Epoch 128/150\n", 516 | "25000/25000 
[==============================] - 1s 23us/step - loss: 0.2479\n", 517 | "Epoch 129/150\n", 518 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2511\n", 519 | "Epoch 130/150\n", 520 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2422\n", 521 | "Epoch 131/150\n", 522 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2419\n", 523 | "Epoch 132/150\n", 524 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2390\n", 525 | "Epoch 133/150\n", 526 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2396\n", 527 | "Epoch 134/150\n", 528 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2336\n", 529 | "Epoch 135/150\n", 530 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2331\n", 531 | "Epoch 136/150\n", 532 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2507\n", 533 | "Epoch 137/150\n", 534 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2420\n", 535 | "Epoch 138/150\n", 536 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2434\n", 537 | "Epoch 139/150\n", 538 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2335\n", 539 | "Epoch 140/150\n", 540 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2268\n", 541 | "Epoch 141/150\n", 542 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2317\n", 543 | "Epoch 142/150\n", 544 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2314\n", 545 | "Epoch 143/150\n", 546 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2426\n", 547 | "Epoch 144/150\n", 548 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2306\n", 549 | "Epoch 145/150\n", 550 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2402\n", 551 | "Epoch 146/150\n", 552 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2297\n", 553 | "Epoch 147/150\n", 554 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2253\n", 555 | "Epoch 148/150\n", 556 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2187\n", 557 | "Epoch 149/150\n", 558 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2243\n", 559 | "Epoch 150/150\n", 560 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2256\n" 561 | ] 562 | }, 563 | { 564 | "data": { 565 | "text/plain": [ 566 | "" 567 | ] 568 | }, 569 | "execution_count": 11, 570 | "metadata": {}, 571 | "output_type": "execute_result" 572 | } 573 | ], 574 | "source": [ 575 | "batch_size = 64\n", 576 | "n_epochs = 150\n", 577 | "nn_classifier.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_size)" 578 | ] 579 | }, 580 | { 581 | "cell_type": "markdown", 582 | "metadata": {}, 583 | "source": [ 584 | "## Evaluating predictions" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": 12, 590 | "metadata": {}, 591 | "outputs": [], 592 | "source": [ 593 | "## Getting the probabilities\n", 594 | "y_pred_train_prob = nn_classifier.predict(X_train)\n", 595 | "y_pred_test_prob = nn_classifier.predict(X_test)\n", 596 | "\n", 597 | "## Classifications from predictions\n", 598 | "y_pred_train = (y_pred_train_prob > 0.5).astype(int)\n", 599 | "y_pred_test = (y_pred_test_prob > 0.5).astype(int)" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": 13, 
605 | "metadata": {}, 606 | "outputs": [ 607 | { 608 | "name": "stdout", 609 | "output_type": "stream", 610 | "text": [ 611 | "Train Accuracy: 0.903 \n", 612 | "Test Accuracy: 0.750\n" 613 | ] 614 | } 615 | ], 616 | "source": [ 617 | "from sklearn.metrics import accuracy_score\n", 618 | "train_acc = accuracy_score(y_true=y_train, y_pred=y_pred_train)\n", 619 | "test_acc = accuracy_score(y_true=y_test, y_pred=y_pred_test)\n", 620 | "print(\"Train Accuracy: {:0.3f} \\nTest Accuracy: {:0.3f}\".format(train_acc, test_acc))" 621 | ] 622 | }, 623 | { 624 | "cell_type": "markdown", 625 | "metadata": {}, 626 | "source": [ 627 | "## Re-training the network with fewer epochs" 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": 14, 633 | "metadata": {}, 634 | "outputs": [], 635 | "source": [ 636 | "## load the initial weights\n", 637 | "nn_classifier.load_weights('class_initial_w.h5')" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": 15, 643 | "metadata": { 644 | "scrolled": true 645 | }, 646 | "outputs": [ 647 | { 648 | "name": "stdout", 649 | "output_type": "stream", 650 | "text": [ 651 | "Epoch 1/50\n", 652 | "25000/25000 [==============================] - 1s 30us/step - loss: 0.4680\n", 653 | "Epoch 2/50\n", 654 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4479\n", 655 | "Epoch 3/50\n", 656 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4454\n", 657 | "Epoch 4/50\n", 658 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4430\n", 659 | "Epoch 5/50\n", 660 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4407\n", 661 | "Epoch 6/50\n", 662 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4401\n", 663 | "Epoch 7/50\n", 664 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4381\n", 665 | "Epoch 8/50\n", 666 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4372\n", 667 | "Epoch 9/50\n", 668 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4356\n", 669 | "Epoch 10/50\n", 670 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4350\n", 671 | "Epoch 11/50\n", 672 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4334\n", 673 | "Epoch 12/50\n", 674 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4323\n", 675 | "Epoch 13/50\n", 676 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4320\n", 677 | "Epoch 14/50\n", 678 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4302\n", 679 | "Epoch 15/50\n", 680 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.4284\n", 681 | "Epoch 16/50\n", 682 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4278\n", 683 | "Epoch 17/50\n", 684 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4260\n", 685 | "Epoch 18/50\n", 686 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4249\n", 687 | "Epoch 19/50\n", 688 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4226\n", 689 | "Epoch 20/50\n", 690 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.4216\n", 691 | "Epoch 21/50\n", 692 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4197\n", 693 | "Epoch 22/50\n", 694 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.4178\n", 695 | "Epoch 23/50\n",
696 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.4163\n", 697 | "Epoch 24/50\n", 698 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.4139\n", 699 | "Epoch 25/50\n", 700 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.4110\n", 701 | "Epoch 26/50\n", 702 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.4116\n", 703 | "Epoch 27/50\n", 704 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.4079\n", 705 | "Epoch 28/50\n", 706 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.4056\n", 707 | "Epoch 29/50\n", 708 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.4032\n", 709 | "Epoch 30/50\n", 710 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.4008\n", 711 | "Epoch 31/50\n", 712 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.3989\n", 713 | "Epoch 32/50\n", 714 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3959\n", 715 | "Epoch 33/50\n", 716 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3938\n", 717 | "Epoch 34/50\n", 718 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3893\n", 719 | "Epoch 35/50\n", 720 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.3885\n", 721 | "Epoch 36/50\n", 722 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3854\n", 723 | "Epoch 37/50\n", 724 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3817\n", 725 | "Epoch 38/50\n", 726 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3805\n", 727 | "Epoch 39/50\n", 728 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3804\n", 729 | "Epoch 40/50\n", 730 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3751\n", 731 | "Epoch 41/50\n", 732 | "25000/25000 [==============================] - 1s 26us/step - loss: 0.3745\n", 733 | "Epoch 42/50\n", 734 | "25000/25000 [==============================] - 1s 26us/step - loss: 0.3709\n", 735 | "Epoch 43/50\n", 736 | "25000/25000 [==============================] - 1s 26us/step - loss: 0.3712\n", 737 | "Epoch 44/50\n", 738 | "25000/25000 [==============================] - 1s 29us/step - loss: 0.3657\n", 739 | "Epoch 45/50\n", 740 | "25000/25000 [==============================] - 1s 34us/step - loss: 0.3628\n", 741 | "Epoch 46/50\n", 742 | "25000/25000 [==============================] - 1s 30us/step - loss: 0.3600\n", 743 | "Epoch 47/50\n", 744 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3573\n", 745 | "Epoch 48/50\n", 746 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3576\n", 747 | "Epoch 49/50\n", 748 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3536\n", 749 | "Epoch 50/50\n", 750 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3502\n" 751 | ] 752 | }, 753 | { 754 | "data": { 755 | "text/plain": [ 756 | "" 757 | ] 758 | }, 759 | "execution_count": 15, 760 | "metadata": {}, 761 | "output_type": "execute_result" 762 | } 763 | ], 764 | "source": [ 765 | "batch_size = 64\n", 766 | "n_epochs = 50\n", 767 | "nn_classifier.compile(loss='binary_crossentropy', optimizer='adam')\n", 768 | "nn_classifier.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_size)" 769 | ] 770 | }, 771 | { 772 | "cell_type": "code", 773 | "execution_count": 16, 774 | "metadata": 
{}, 775 | "outputs": [ 776 | { 777 | "name": "stdout", 778 | "output_type": "stream", 779 | "text": [ 780 | "Train Accuracy: 0.845 \n", 781 | "Test Accuracy: 0.782\n" 782 | ] 783 | } 784 | ], 785 | "source": [ 786 | "## Getting the probabilities\n", 787 | "y_pred_train_prob = nn_classifier.predict(X_train)\n", 788 | "y_pred_test_prob = nn_classifier.predict(X_test)\n", 789 | "\n", 790 | "## Classifications from predictions\n", 791 | "y_pred_train = (y_pred_train_prob > 0.5).astype(int)\n", 792 | "y_pred_test = (y_pred_test_prob > 0.5).astype(int)\n", 793 | "\n", 794 | "## Calculating accuracy\n", 795 | "train_acc = accuracy_score(y_true=y_train, y_pred=y_pred_train)\n", 796 | "test_acc = accuracy_score(y_true=y_test, y_pred=y_pred_test)\n", 797 | "print(\"Train Accuracy: {:0.3f} \\nTest Accuracy: {:0.3f}\".format(train_acc, test_acc))" 798 | ] 799 | }, 800 | { 801 | "cell_type": "code", 802 | "execution_count": null, 803 | "metadata": {}, 804 | "outputs": [], 805 | "source": [] 806 | }, 807 | { 808 | "cell_type": "code", 809 | "execution_count": null, 810 | "metadata": {}, 811 | "outputs": [], 812 | "source": [] 813 | }, 814 | { 815 | "cell_type": "code", 816 | "execution_count": null, 817 | "metadata": {}, 818 | "outputs": [], 819 | "source": [] 820 | }, 821 | { 822 | "cell_type": "code", 823 | "execution_count": null, 824 | "metadata": {}, 825 | "outputs": [], 826 | "source": [] 827 | } 828 | ], 829 | "metadata": { 830 | "kernelspec": { 831 | "display_name": "Python 3", 832 | "language": "python", 833 | "name": "python3" 834 | }, 835 | "language_info": { 836 | "codemirror_mode": { 837 | "name": "ipython", 838 | "version": 3 839 | }, 840 | "file_extension": ".py", 841 | "mimetype": "text/x-python", 842 | "name": "python", 843 | "nbconvert_exporter": "python", 844 | "pygments_lexer": "ipython3", 845 | "version": "3.6.10" 846 | } 847 | }, 848 | "nbformat": 4, 849 | "nbformat_minor": 2 850 | } 851 | -------------------------------------------------------------------------------- /Chapter06/class_initial_w.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Predictive-Analytics-with-Python/e049d9d2870f596892b62aeddec312788d9d0c2c/Chapter06/class_initial_w.h5 -------------------------------------------------------------------------------- /Chapter08/.ipynb_checkpoints/ch8-credit-card-def-model-tuning-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Credit Card Default: Model Tuning and Improving Performance" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "#### Importing libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import pandas as pd\n", 25 | "import matplotlib.pyplot as plt\n", 26 | "import seaborn as sns\n", 27 | "import os\n", 28 | "\n", 29 | "pd.options.mode.chained_assignment = None\n", 30 | "%matplotlib inline" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "#### Loading and preparing the dataset" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# Loading the dataset\n", 47 | "DATA_DIR = '../data'\n", 48 | "FILE_NAME = 
'credit_card_default.csv'\n", 49 | "data_path = os.path.join(DATA_DIR, FILE_NAME)\n", 50 | "ccd = pd.read_csv(data_path, index_col=\"ID\")\n", 51 | "ccd.rename(columns=lambda x: x.lower(), inplace=True)\n", 52 | "ccd.rename(columns={'default payment next month':'default'}, inplace=True)\n", 53 | "\n", 54 | "# getting the groups of features\n", 55 | "bill_amt_features = ['bill_amt'+ str(i) for i in range(1,7)]\n", 56 | "pay_amt_features = ['pay_amt'+ str(i) for i in range(1,7)]\n", 57 | "numerical_features = ['limit_bal','age'] + bill_amt_features + pay_amt_features\n", 58 | "\n", 59 | "# Creating binary features\n", 60 | "ccd['male'] = (ccd['sex'] == 1).astype('int')\n", 61 | "ccd['grad_school'] = (ccd['education'] == 1).astype('int')\n", 62 | "ccd['university'] = (ccd['education'] == 2).astype('int')\n", 63 | "ccd['married'] = (ccd['marriage'] == 1).astype('int')\n", 64 | "\n", 65 | "# simplifying pay features \n", 66 | "pay_features= ['pay_' + str(i) for i in range(1,7)]\n", 67 | "for x in pay_features:\n", 68 | " ccd.loc[ccd[x] <= 0, x] = 0\n", 69 | "\n", 70 | "# simplifying delayed features\n", 71 | "delayed_features = ['delayed_' + str(i) for i in range(1,7)]\n", 72 | "for pay, delayed in zip(pay_features, delayed_features):\n", 73 | " ccd[delayed] = (ccd[pay] > 0).astype(int)\n", 74 | " \n", 75 | "# creating a new feature: months delayed\n", 76 | "ccd['months_delayed'] = ccd[delayed_features].sum(axis=1)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "#### Splitting and standardizing the dataset" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 3, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "numerical_features = numerical_features + ['months_delayed']\n", 93 | "binary_features = ['male','married','grad_school','university']\n", 94 | "X = ccd[numerical_features + binary_features]\n", 95 | "y = ccd['default'].astype(int)\n", 96 | "\n", 97 | "## Split\n", 98 | "from sklearn.model_selection import train_test_split\n", 99 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=5/30, random_state=25)\n", 100 | "\n", 101 | "## Standardize\n", 102 | "from sklearn.preprocessing import StandardScaler\n", 103 | "scaler = StandardScaler()\n", 104 | "scaler.fit(X[numerical_features])\n", 105 | "X_train.loc[:, numerical_features] = scaler.transform(X_train[numerical_features])\n", 106 | "X_test.loc[:, numerical_features] = scaler.transform(X_test[numerical_features])" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "## Optimizing more than one parameter" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "#### Reference model" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 4, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "from sklearn.model_selection import cross_val_score\n", 130 | "from sklearn.ensemble import RandomForestClassifier\n", 131 | "ref_rf = RandomForestClassifier(n_estimators=25,\n", 132 | " max_features=4,\n", 133 | " max_depth=4,\n", 134 | " random_state=61)\n", 135 | "\n", 136 | "ref_rf_scores = cross_val_score(ref_rf, X_train, y_train, scoring='roc_auc', cv=10)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 5, 142 | "metadata": { 143 | "scrolled": true 144 | }, 145 | "outputs": [ 146 | { 147 | "name": "stdout", 148 | "output_type": "stream", 149 | "text": [ 150 | "Mean AUC for 
reference model: 0.7589\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "print(\"Mean AUC for reference model: {:0.4f}\".format(ref_rf_scores.mean()))" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "#### Grid Search CV" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 6, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "from sklearn.model_selection import GridSearchCV\n", 172 | "param_grid = {\"n_estimators\":[25,100,200,400],\n", 173 | " \"max_features\":[4,10,19],\n", 174 | " \"max_depth\":[4,8,16,20]}" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [ 182 | { 183 | "name": "stdout", 184 | "output_type": "stream", 185 | "text": [ 186 | "Fitting 5 folds for each of 48 candidates, totalling 240 fits\n" 187 | ] 188 | }, 189 | { 190 | "name": "stderr", 191 | "output_type": "stream", 192 | "text": [ 193 | "[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.\n" 194 | ] 195 | } 196 | ], 197 | "source": [ 198 | "rf = RandomForestClassifier(random_state=17)\n", 199 | "grid_search = GridSearchCV(estimator=rf,\n", 200 | " param_grid=param_grid,\n", 201 | " scoring='roc_auc',\n", 202 | " cv=5,\n", 203 | " verbose=1,\n", 204 | " n_jobs=4)\n", 205 | "grid_search.fit(X_train, y_train)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "gs_results = pd.Series(grid_search.cv_results_['mean_test_score'], index=grid_search.cv_results_['params'])\n", 215 | "gs_results.sort_values(ascending=False)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "from sklearn.metrics import precision_recall_curve\n", 225 | "## Fitting the initial (not tuned) model:\n", 226 | "ref_rf.fit(X_train, y_train)\n", 227 | "\n", 228 | "## Getting the probabilities\n", 229 | "y_prob_tuned = grid_search.predict_proba(X_test)[:,1]\n", 230 | "y_prob_not_tuned = ref_rf.predict_proba(X_test)[:,1]\n", 231 | "\n", 232 | "## Values for plotting the curves\n", 233 | "prec_tuned, recall_tuned, _ = precision_recall_curve(y_test, y_prob_tuned)\n", 234 | "prec_not_tuned, recall_not_tuned, _ = precision_recall_curve(y_test, y_prob_not_tuned)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "fig, ax = plt.subplots(figsize=(8,5))\n", 244 | "ax.plot(prec_tuned, recall_tuned, label='Tuned Model')\n", 245 | "ax.plot(prec_not_tuned, recall_not_tuned, label='Not Tuned Model')\n", 246 | "ax.set_title('Precision-recall curves', fontsize=16)\n", 247 | "ax.set_xlabel('Precision', fontsize=14)\n", 248 | "ax.set_ylabel('Recall', fontsize=14)\n", 249 | "ax.set_xlim(0.3,0.7); ax.set_ylim(0.1,0.9)\n", 250 | "ax.legend(); ax.grid();" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": {}, 278 | "outputs": [], 279 
| "source": [] 280 | } 281 | ], 282 | "metadata": { 283 | "kernelspec": { 284 | "display_name": "Python 3", 285 | "language": "python", 286 | "name": "python3" 287 | }, 288 | "language_info": { 289 | "codemirror_mode": { 290 | "name": "ipython", 291 | "version": 3 292 | }, 293 | "file_extension": ".py", 294 | "mimetype": "text/x-python", 295 | "name": "python", 296 | "nbconvert_exporter": "python", 297 | "pygments_lexer": "ipython3", 298 | "version": "3.6.10" 299 | } 300 | }, 301 | "nbformat": 4, 302 | "nbformat_minor": 2 303 | } 304 | -------------------------------------------------------------------------------- /Chapter08/.ipynb_checkpoints/ch8-diamond-prices-model-tuning-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Diamond Prices: Model Tuning and Improving Performance" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "#### Importing libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import pandas as pd\n", 25 | "import matplotlib.pyplot as plt\n", 26 | "import seaborn as sns\n", 27 | "import os\n", 28 | "\n", 29 | "pd.options.mode.chained_assignment = None\n", 30 | "%matplotlib inline" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "#### Loading the dataset" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "DATA_DIR = '../data'\n", 47 | "FILE_NAME = 'diamonds.csv'\n", 48 | "data_path = os.path.join(DATA_DIR, FILE_NAME)\n", 49 | "diamonds = pd.read_csv(data_path)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "#### Preparing the dataset" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 3, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "## Preparation done from Chapter 2\n", 66 | "diamonds = diamonds.loc[(diamonds['x']>0) | (diamonds['y']>0)]\n", 67 | "diamonds.loc[11182, 'x'] = diamonds['x'].median()\n", 68 | "diamonds.loc[11182, 'z'] = diamonds['z'].median()\n", 69 | "diamonds = diamonds.loc[~((diamonds['y'] > 30) | (diamonds['z'] > 30))]\n", 70 | "diamonds = pd.concat([diamonds, pd.get_dummies(diamonds['cut'], prefix='cut', drop_first=True)], axis=1)\n", 71 | "diamonds = pd.concat([diamonds, pd.get_dummies(diamonds['color'], prefix='color', drop_first=True)], axis=1)\n", 72 | "diamonds = pd.concat([diamonds, pd.get_dummies(diamonds['clarity'], prefix='clarity', drop_first=True)], axis=1)\n", 73 | "\n", 74 | "## Dimensionality reduction\n", 75 | "from sklearn.decomposition import PCA\n", 76 | "pca = PCA(n_components=1, random_state=123)\n", 77 | "diamonds['dim_index'] = pca.fit_transform(diamonds[['x','y','z']])\n", 78 | "diamonds.drop(['x','y','z'], axis=1, inplace=True)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 4, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price',\n", 90 | " 'cut_Good', 'cut_Ideal', 'cut_Premium', 'cut_Very Good', 'color_E',\n", 91 | " 'color_F', 'color_G', 'color_H', 'color_I', 'color_J', 'clarity_IF',\n", 92 | " 'clarity_SI1', 'clarity_SI2', 'clarity_VS1', 'clarity_VS2',\n", 93 | " 
'clarity_VVS1', 'clarity_VVS2', 'dim_index'],\n", 94 | " dtype='object')" 95 | ] 96 | }, 97 | "execution_count": 4, 98 | "metadata": {}, 99 | "output_type": "execute_result" 100 | } 101 | ], 102 | "source": [ 103 | "diamonds.columns" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "#### Train-test split" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 5, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "X = diamonds.drop(['cut','color','clarity','price'], axis=1)\n", 120 | "y = diamonds['price']\n", 121 | "\n", 122 | "from sklearn.model_selection import train_test_split\n", 123 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=7)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "#### Standarization: centering and scaling " 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 6, 136 | "metadata": { 137 | "scrolled": true 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "numerical_features = ['carat', 'depth', 'table', 'dim_index']\n", 142 | "from sklearn.preprocessing import StandardScaler\n", 143 | "scaler = StandardScaler()\n", 144 | "scaler.fit(X_train[numerical_features])\n", 145 | "X_train.loc[:, numerical_features] = scaler.fit_transform(X_train[numerical_features])\n", 146 | "X_test.loc[:, numerical_features] = scaler.transform(X_test[numerical_features])" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "## Optimizing a single hyper-parameter" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 7, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=13)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 8, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "from sklearn.neighbors import KNeighborsRegressor\n", 172 | "from sklearn.metrics import mean_absolute_error\n", 173 | "\n", 174 | "candidates = np.arange(4,16)\n", 175 | "mae_metrics = []\n", 176 | "for k in candidates:\n", 177 | " model = KNeighborsRegressor(n_neighbors=k, weights='distance', metric='minkowski', leaf_size=50, n_jobs=4)\n", 178 | " model.fit(X_train, y_train)\n", 179 | " y_pred = model.predict(X_val)\n", 180 | " metric = mean_absolute_error(y_true=y_val, y_pred=y_pred)\n", 181 | " mae_metrics.append(metric)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 9, 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "data": { 191 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAfgAAAFBCAYAAACb7b3CAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3deXxU1f3/8deHECAQIKwBEjYVUHYI4tYquIuIVOpW0bb2+3Nvtdal2Nal1mKLVrupbdVaRaVacSkuWJXUFZF9D4IESNiFBAIBsnx+f8yEDiHAAJO5k+H9fDzmwcydM3femWg+c+499xxzd0RERCS51As6gIiIiMSeCryIiEgSUoEXERFJQirwIiIiSUgFXkREJAmpwIuIiCSh+kEHiKXWrVt7ly5dYrrPbdu20aRJk5ju83AlWqZEywPKFC1lio4yRUeZDqw28syYMWOju7fZ6wl3T5pbTk6Ox9qUKVNivs/DlWiZEi2PuzJFS5mio0zRUaYDq408wHSvoSbqEL2IiEgSUoEXERFJQirwIiIiSUgFXkREJAmpwIuIiCQhFXgREZEkpAIvIiKShJJqohsREZFE9NqsQsZNzqOwqJSsqR9w+zk9GDkgq1bfUwVeRESkFr02q5AxE+dRWlYBQGFRKWMmzgOo1SKvQ/QiIiK1aNzkvN3FvUppWQXjJufV6vuqBy8iIhJjO8oqmL2qiC+Wb6KwqLTGNqv3sT1W4l7gzSwFmA4UuvvwiO23AeOANu6+MWJ7J2AhcK+7PxTvvCIiIgdSXFrGjBWbmLZ8M1/kb2JuQRFlFQ5A/XpGeaXv9ZoOGWm1mimIHvzNwCKgWdUGM+sInAWsrKH9I8Db8YkmIiJyYOu27GDa8k18kb+Jacs3kbduK+6hYt4nuzlXn9KV47u0ZFCXFuTmbdjjHDxAWmoKt5/To1YzxrXAm1k2cD7wAHBrxFOPAHcAr1drPxL4CtgWr4wiIiKR3J3lG7eFi3moh75y03YAGjdIYWCnFpzXuz3Hd23BgI4tSGuQssfrqwbS7R5Fn5GWlKPoHyVUyJtWbTCzEYQO188xMyK2NwHuJNSzvy3OOUVE5AhVUeksWrNldw/9i/zNbCzZCUDLJg0Y1LkFV53UmeO7tKRnh2akphx4vPrIAVmMHJBFbm4uQ4YMqeWfIMRCS8nG4Y3MhgPD3P0GMxtCqGhfAkwBznb3YjPLBwa5+0YzewiY5u4vmdm9QElN5+DN7BrgGoDMzMycCRMmxDR3SUkJ6enpMd3n4Uq0TImWB5QpWsoUHWWKTl3NtKvCWV5cSd7mCpZsrmTp5gp2hI+mt2pkdG9Zjx4tUujeIoX2TYzIzmht5DlYQ4cOneHug6pvj2eBHwtcCZQDjQidg38b+CawPdwsG1gNDAZeBjqGt2cAlcDd7v6nfb3HoEGDfPr06THNHc9vW9FKtEyJlgeUKVrKFB1lik4iZdpjYplqh8SLS8uYuWIz0/I38cXyTcwtKGZXRSUA3TPTOb5LSwZ3bcnxXVrGfCBcbXxGZlZjgY/bIXp3HwOMCYcZAtzm7qMi20T24AkV/qrt9xLqwe+zuIuIiEDNE8vc/q85vDJjFRu3lbF47ZbdA+J6ZzXne6d0CQ2I69yCFk0aBJw+dnQdvIiIJJWaJpYpq3A+Xvo1Jx/TipvP6MbgLi3p3ymDxg2StwwG8pO5ey6QW8P2Lvtof2+tBhIRkaSxvwlknv+/E+OYJFiaqlZERJJGeUUlDevXXNpqe2KZRKMCLyIiScHduevVeeworyQ1Zc+R7vGYWCbRqMCLiEhS+O3kPF6aXsCPzujGuG/3IyvcY8/KSGPsRX1qfWKZRJO8owtEROSI8dTHy3k8dxnfOaETPz6zG2YW94llEo168CIiUqe9PruQ+yct5Lze7bj/wt6HNRFNMlGBFxGROuu/Szbwk5fmcOJRLXnk0v6k1FNxr6ICLyIiddLsVUVcP34G3TKb8terBtEoNeXALzqCqMCLiEids3R9Cd//+zRapzfkH1cfT7NGqUFHSjgq8CIiUqesKS7lu09PI6We8dwPBtO2aaOgIyUkjaIXEZE6o2j7Lr779DSKS8uYcM2JdG7VJOhICUs9eBERqRNKd1Xwg39MJ3/jdv56VQ69s5oHHSmhqQcvIiIJr7yikptemMnMlZv583cGcvLRrYOOlPDUgxcRkYTm7oyZOI/3F6/nlxf2Zlif9kFHqhNU4EVEJKH95p08Xp5RwM1ndOPKEzsHHafOUIEXEZGE9eRHX/HEf5dxxQmduOXMbkHHqVNU4EVEJCG9OquAX725iGF92vFLTUF70FTgRUQk4UzJW8/tL8/lpKNaaQraQ6QCLyIiCWXWys3cMH4m3TOb8tercmhYX1PQHgoVeBERSRhL12/l+898QZumDXnm6uNpqiloD5kKvIiIJIQ1xaVc9dQ06terpyloY0AFXkREAle0fRdXPTWNLTvKeeb7x2sK2hhQgRcRkUBVTUG74mtNQRtLmqpWREQCU1ZRyY3hKWgf0xS0MaUevIiIBMLd+ekr8/hg8Xruv7A352kK2phSgRcRkUA8+PZiXplZwC1ndmO0pqCNORV4ERGJu799+BV/+fArrjyxMzefoSloa0PcC7yZpZjZLDObVG37bWbmZtY6/PgsM5thZvPC/54e76wiIhJ7E2cW8MBboSlo7x3RS1PQ1pIgBtndDCwCmlVtMLOOwFnAyoh2G4EL3H21mfUGJgNZ8QwqIiKxNSVvPXf8ay4nH60paGtbXHvwZpYNnA88We2pR4A7AK/a4O6z3H11+OECoJGZNYxLUBERibmZ4Sloj23flL9cqSloa1u8D9E/SqiQV1ZtMLMRQKG7z9nP60YBs9x9Zy3nExGRWrB0/VaufuYL2jZryN+/N1hT0MaBufuBW8XijcyGA8Pc/QYzGwLcBlwCTAHOdvdiM8sHBrn7xojX9QLeCLdZVsN+rwGuAcjMzMyZMGFCTHOXlJSQnp4e030erkTLlGh5QJmipUzRUabo7CvT16WVPPD5Dsor4ecnNqJt4/j1LRPtc6qNPEOHDp3h7oP2esLd43IDxgIFQD6wFtgOvAKsD2/LB8oJnYdvF35NNrAEOCWa98jJyfFYmzJlSsz3ebgSLVOi5XFXpmgpU3SUKTo1ZdpUstPPeDjXe9/9js8vLEqITEGqjTzAdK+hJsZtkJ27jwHGAFT14N19VGSbyB68mWUAbwJj3P2TeOUUEZHY2L6rnKv/8QUrN23nH98fTK8OmoI2nhL5OvibgGOAX5jZ7PCtbdChRETkwMoqKrnx+ZnMWVXEHy7rz0lHtwo60hEnkLno3T0XyK1he5eI+78CfhW3UCIiEhOVlc6dr8xlSt4Gfv2tPpzbW1PQBiGRe/AiIlIHPfjOYibOLOTWs7rznRM6BR3niKUCLyIiMfPXD5fx1w+/4qqTOvPD048JOs4RTcvFiojIYXltViHjJudRWFQKLKZ/dnPuuUBT0AZNPXgRETlkr80qZMzEeeHiHrJ43Vb+PWf1fl4l8aACLy
Iih2zc5DxKyyr22LajrJJxk/MCSiRVVOBFROSQrY7ouUezXeJHBV5ERA5JZaXToH7NZaRDRlqc00h1KvAiInJI/vjBUnaWV5KasudgurTUFG4/p0dAqaSKCryIiBy0KXnrefT9JVw0IIvfjupLVrjHnpWRxtiL+jByQFbACUWXyYmIyEFZtWk7t0yYTY/MpjzwrT6kNUjhWwOzyc3NZciQIUHHkzD14EVEJGo7yiq4bvwMKt35y5U5pDVICTqS7IN68CIiEhV35+evzWfB6i089d1BdG7VJOhIsh/qwYuISFRenLaKf80o4EenH8MZx2UGHUcOQAVeREQOaPaqIu59YwGndW/DzWd2DzqOREEFXkRE9uvrkp1cP34GbZs15PeX9SelnuaYrwt0Dl5ERPapvKKSH744i6+37WLi9SeT0bhB0JEkSurBi4jIPj38nyV8uuxrfjWyN72zmgcdRw6CCryIiNTonflreTx3GZcP7sQlgzoGHUcOkgq8iIjs5asNJdz28hz6ZTfn3hE9g44jh0AFXkRE9rBtZznXPjeDBvXr8djoHBrW12Q2dZEKvIiI7Obu3PnKXJZtKOGPlw/YPce81D0q8CIistvTn+Qzae4abjunB6cc0zroOHIYVOBFRASAacs38eu3FnF2z0yuP+3ooOPIYVKBFxER1m/ZwY0vzKRzy8Y8dEk/zDSZTV2niW5ERI5wu8orueH5mZTsKOf5/zuBZo1Sg44kMaACLyJyhPv1W4uYvmIzf7h8AN0zmwYdR2JEh+hFRI5gr88u5JlP87n6lK6M6Nch6DgSQ3Ev8GaWYmazzGxSte23mZmbWeuIbWPMbKmZ5ZnZOfHOKiKSzBav3cJPX5nH8V1aMGbYsUHHkRgL4hD9zcAioFnVBjPrCJwFrIzY1hO4DOgFdADeM7Pu7l4R37giIslny44yrntuBumN6vPn7wwkNUUHdJNNXH+jZpYNnA88We2pR4A7AI/YdiEwwd13uvtyYCkwOC5BRUSSWGWl85OX5lCwuZTHrhhI22aNgo4ktSDeX9keJVTIK6s2mNkIoNDd51RrmwWsinhcEN4mIiKH4fH/LuM/C9fxs/OP4/guLYOOI7XE3P3ArWLxRmbDgWHufoOZDQFuAy4BpgBnu3uxmeUDg9x9o5n9GfjM3ceHX/8U8Ja7v1Jtv9cA1wBkZmbmTJgwIaa5S0pKSE9Pj+k+D1eiZUq0PKBM0VKm6CRTpvkby3l4+k5OaJ/CtX0bxvR692T6nGpLbeQZOnToDHcftNcT7h6XGzCWUC88H1gLbAdeAdaHt+UD5YTOw7cDxgBjIl4/GThpf++Rk5PjsTZlypSY7/NwJVqmRMvjrkzRUqboJEumVZu2ef/7JvvZv/uvb9tZlhCZaluiZaqNPMB0r6Emxu0QvbuPcfdsd+9CaPDcB+4+yt3bunuX8PYCYKC7rwXeAC4zs4Zm1hXoBkyLV14RkWSyo6yCG56fSXmF88SVOTRuoGlQkl3C/obdfYGZvQQsJNSzv9E1gl5E5JDc9+8FzC0o5q9X5tC1dZOg40gcBFLg3T0XyK1he5dqjx8AHohLKBGRJPXPL1by4rRV3Dj0aM7u1S7oOBInuvBRRCSJzS0o4hevL+Abx7Tm1rN6BB1H4kgFXkQkSW3atovrx8+kTXpD/nD5AFLqaYW4I0nCnoMXEZFDV1Hp3DxhFhu27uTl606iZZMGQUeSOFOBFxFJQo++t4SPvtzI2Iv60K9jRtBxJAA6RC8ikmTeW7iOP36wlEsHdeTywZ2CjiMBUYEXEUki+Ru38eOXZtMnqzn3Xdgr6DgSIBV4EZEksX1XOdeNn0FKPeOxKwbSKDUl6EgSIJ2DFxFJAu7OmInzyFu3lWe+P5iOLRsHHUkCph68iEgSePazFbw+ezW3ntmd07q3CTqOJAAVeBGROm7Gik3cP2khZx7XlhuHHhN0HEkQKvAiInXY+q07uOH5mWS1SOPhS/pTT5PZSJgKvIhIHVVWUclNL8yiuLSMJ0bn0DwtNehIkkA0yE5EpI76zduLmbZ8E49c2o/j2jcLOo4kGBV4EZE65LVZhYybnEdhUSmwnG8e04pvDcgOOpYkIB2iFxGpI16bVciYifPCxT3kixWbeW1WYYCpJFGpwIuI1BHjJudRWlaxx7YdZZWMm5wXUCJJZCrwIiJ1wJri0j167pFW72O7HNl0Dl5EJIHtLK/gqY+X86cPlu6zTYeMtDgmkrpCBV5EJEHl5q3nvn8vZPnGbZzVM5MTu7bkoXeX7HGYPi01hdvP6RFgSklUKvAiIglm5dfb+eWkhby3aB1HtW7CM98/niE92gLQKr3h7lH0WRlp3H5OD0YOyAo4sSQiFXgRkQRRuquCx/+7jCf+u4z69YyfnncsV5/SlQb1/zdcauSALEYOyCI3N5chQ4YEF1YSngq8iEjA3J3JC9Zy/6RFFBaVMqJfB+4adhztmjcKOprUYSrwIiIBWrq+hPv+vYCPvtzIse2aMuGaEznxqFZBx5IkoAIvIhKArTvK+OMHS3n64+WkNUjh3gt6MvrEztRP0dXLEhsq8CIiceTuvDa7kF+/tZgNW3dy6aCO3H5uD1qnNww6miQZFXgRkThZsLqYe99YwBf5m+mX3Zy/XTWI/h0zgo4lSSruBd7MUoDpQKG7Dzez+4ELgUpgPfA9d19tZqnAk8DAcM5n3X1svPOKiByuou27ePjdJTz/+QoyGjfgN6P6cHFOR63dLrUqiB78zcAioGptw3Hu/gsAM/sRcDdwHXAx0NDd+5hZY2Chmb3o7vkBZBYROWgVlc5L01fx23cWU1xaxlUndeHHZ3aneWOt2y61L64F3syygfOBB4BbAdx9S0STJoCH7zvQxMzqA2nALiCyrYhIwpq5cjP3vL6AeYXFDO7akvtG9NKa7RJX8e7BPwrcATSN3GhmDwBXAcXA0PDmfxE6dL8GaAz82N03xS+qiMjB27B1J799ZzEvzyggs1lDfn9Zf0b064CZDsdLfJm7H7hVLN7IbDgwzN1vMLMhwG3uPrxamzFAI3e/x8xOAW4Avge0AD4CznP3r6q95hrgGoDMzMycCRMmxDR3SUkJ6enpMd3n4Uq0TImWB5QpWsoUnWgyVVQ6768s59Wlu9hVAed0SeWCo1NJq187hb2ufk7xlmiZaiPP0KFDZ7j7oL2ecPcD3oBfA40jHg8D0iIeNyM0CG5/+xgLFAD5wFpgOzC+WpvOwPzw/T8DV0Y89zRwyf7eIycnx2NtypQpMd/n4Uq0TImWx12ZoqVM0TlQpk+XbvSzf/df73znJB/95FRfun5r4JmCoEwHVht5gOleQ02MdkaFO4HIrxwTgPYRj9OAK/a3A3cf4+7Z7t4FuAz4wN1Hm1m3iGYjgMXh+yuB0y2kCXBixHMiIoFbU1zKTS/M5PK/TWXbrnL+cmUOz149mKPbJE6PUY5c0Z6Dr36MKZbHnB40sx6ELpNbQWgEPYR68H8H5off7+/uPjeG7ysickh2llfw5EehNdor3fnxmd259rSjaJSaEnQ0kd0CmejG3
XOB3PD9UftoU0LoUjkRkYQxZfF67vv3AvK/3s45vTL5+fk96diycdCxRPaimexERPbhtVmFu9dez/z0PVqnN2DB6q0c1aYJz149mFO7twk6osg+HUyBv87MSiJe9wMz+zr8uOk+XiMiUie9NquQMRPnUVpWAcC6LTtZt2UnI/q256FL+u+xRrtIIoq2wK8Evh/xeC3wnRraiIgkhQffXry7uEeasbJIxV3qhKgKfHjku4hIUnN3Plv2Nc9NXcHaLTtqbLO6qDTOqUQOTUzOwYcvY7vc3Z+Mxf5EROJpy44yJs4o4LmpK1i2YRstGqeS3rA+JTvL92rbISMtgIQiB++wCryZnQT8ALiU0KVsKvAiUmcsXruF5z5bwauzCtm+q4J+HTN46OJ+DO/bnnfmr93jHDxAWmoKt5/TI8DEItE76AJvZq0IzRv/f8CxwJuEivyk2EYTEYm9XeWVvLNgLeM/W8G0/E00rF+PEf06cOVJnemb/b+12UcOyALYPYo+KyON28/psXu7SKKLusCb2TmEivoFwFTgEeBx4KfuvrB24omIxMaa4lJe+HwlL05bxcaSnXRq2Zi7hh3LxTkdadGkQY2vGTkgi5EDssjNzWXIkCHxDSxymKIq8GaWD+wAngNu9/Ca7Gb2eK0lExE5TO7Op8u+5rnPVvCfReuodOf0Hm0ZfVJnTuvWhnr1tMKbJK9oe/DtgNeB2cCq2osjInL4aho09/++eRRXnNBJs87JESPaAt+R0LKtDwFPm9kEYDwQn7VmRUSisGjNFp6buoLXwoPm+nfM4OGL+3F+3/aaJ16OONFeB78BGAeMM7NvEhpUNyX8+mvN7K/uvqD2YoqI1GxXeSVvz1/D+Kkr+CJ/8+5Bc1ed1IU+2c2DjicSmIMeRe/uHwEfmdmPCM1mdzXwQzPLc/fjYh1QRKQmq4tKeXHa/wbNdW7VmJ8NO46LB2WT0bjmQXMiR5JDvg7e3bcATwBPmFkfQiPsRURqjbvzydKveW5qPv9ZuA4Hzji2LaNP7MypGjQnsodoR9G/UdtBRET2pbi0jFdmFDD+8xV8tWEbLZs04NrTjuY7gzVoTmRfou3BDwdWEF7DXUQk1iKXZs2a+gG3n9OD7plNdw+aKy0LDZr73SX9GNZHg+ZEDiTaAv8QMBo4Ffg78Iy7F9RaKhE5olRfmrWwqJRbX5pNpUPD+vW4sH8HrjxRg+ZEDka0o+jvMLMxwPmEBtX9zMxygaeA1929rPYiikiyGzc5b6+lWSsdmqfV57+3D9WgOZFDEPWixu5e4e5vuPtIoCuhy+R+BRSaWXptBRSR5LevJVi3lJaruIscoqgLfDVNgAwgHShBE96IyGHIbN6oxu1amlXk0EVd4M0szcy+a2YfAvOAzsB33f0od99WawlFJKlVVjoZaal7bdfSrCKHJ9rL5P5KaM33Lwmddx/h7kW1GUxEjgxPfvwVi9du5ds52Xy27GstzSoSI9GOov8/YCWwBjgPOM9s7wkl3H1E7KKJSLKbs6qI376Tx7m92jHu230xMy3NKhIj0Rb4Z9F5dhGJoS07yvjhi7PIbNaI34wKFXcRiZ1oL5P7Xi3nEJEjiLtz18R5FBaV8tK1J9K88d7n4EXk8BzqKHoRkUP20vRVTJq7hlvP6k5O55ZBxxFJSnEv8GaWYmazzGxS+PH9ZjbXzGab2btm1iGibV8z+8zMFpjZPDOr+VoaEakzvly3lXveWMApx7Ti+tOODjqOSNIKogd/M7Ao4vE4d+/r7v2BScDdAGZWHxgPXOfuvYAhgGbME6nDdpRV8MMXZ9GkQX0euaS/Vn8TqUVxLfBmlk1outsnq7aFl52t0oT/DeY7G5jr7nPC7b529z3nshSROuVXby5k8dqtPHxJP9o20wE5kdp0yOvBH6JHgTuAppEbzewB4CqgGBga3twdcDObDLQBJrj7b+OYVURi6O15axg/dSXXnnoUQ3q0DTqOSNIz9/hc/WZmw4Fh7n6DmQ0BbnP34dXajAEaufs9ZnYbcCNwPLAdeB/4ubu/X+011wDXAGRmZuZMmDAhprlLSkpIT0+sqfYTLVOi5QFlila8Mm3YXsndn5bSvkk97jqhEfX3c2j+SP6cDoYyRSfRMtVGnqFDh85w90F7PeHucbkBY4ECIB9YS6hoj6/WpjMwP3z/MkLL0lY99wvg9v29R05OjsfalClTYr7Pw5VomRItj7syRSsemXaVV/i3/vyx9777HV+xcVtCZDpYyhQdZTqw2sgDTPcaamLczsG7+xh3z3b3LuHi/YG7jzazbhHNRgCLw/cnA33NrHF4wN1pwMJ45RWR2Hj0vSXMXFnEry/qQ6dWjYOOI3LEiPc5+Jo8aGY9gEpgBXAdgLtvNrPfAV8QGnj3lru/GVxMETlYH3+5kcdyl3HZ8R25oF+HA79ARGImkALv7rlAbvj+qP20G0/oUjkRqWM2bN3Jj1+azdFt0rnngl5BxxE54iRCD15EkkxlpfOTl+ewpbSM534wmLQGKUFHEjniaKpaEYm5v330FR8u2cAvhvfk2HbNgo4jckRSgReRmJq9qohxk/M4r3c7rjihU9BxRI5YKvAiEjOhJWBnktmsEQ9epCVgRYKkc/AiEhPuzpiJ81hdtIOXrj1JS8CKBEw9eBGJiX9+sYo3dy8B2yLoOCJHPBV4ETlsX67byr3/XsA3jmmtJWBFEoQKvIgclh1lFdz0wizSG9bnd5f20xKwIglC5+BF5LDcP2kheeu28o+rB9O2qZaAFUkU6sGLyCF7a94anv98JdeedhSndW8TdBwRiaACLyKHZNWm7dz5ylz6dczgtrN7BB1HRKpRgReRg1ZWUcnNE2aBwx8vG0Bqiv6UiCQanYMXkYP2yH9CS8D+8fIBWgJWJEHpa7eIHJSPvtzA4/9dxuWDtQSsSCJTgReRqG3YupMf/3MOx7RJ5+7hWgJWJJHpEL2IRKWy0rn1pdls3VHG8/93gpaAFUlw6sGLSFT+9tFXfPTlRu6+oCc92jUNOo6IHIAKvIgc0KyVmxk3OY9hfdrxncFaAlakLlCBF5H9Ki4t44cvziKzWSPGaglYkTpD5+BFZJ/cnbsmzmNN8Q5evu4kmqdpCViRukI9eBHZpwlfrOLNeWv4ydndGdhJS8CK1CUq8CJSoyXrtnJfeAnY607VErAidY0KvIjsJbQE7EwtAStSh+kcvIjs5ZeTFrJkXQnPaglYkTpLPXgR2cObc9fwQngJ2FO1BKxInaUCLyK7rdq0nZ9OnEt/LQErUufFvcCbWYqZzTKzSeHH95vZXDObbWbvmlmHau07mVmJmd0W76wiR5Kyikp+VLUE7OVaAlakrgvi/+CbgUURj8e5e1937w9MAu6u1v4R4O14hRM5Uv3uP0uYtbKIB0f1pWNLLQErUtfFtcCbWTZwPvBk1TZ33xLRpAngEe1HAl8BC+KVUeRI9OGSDTyeu4zLB3fi/L7tg44jIjEQ71H0jwJ3AHusVGFmDwBXAcXA0PC2JsCdwFmADs+L1JINW3dy60tz6J6Zzt3DewYdR0RixNz9
wK1i8UZmw4Fh7n6DmQ0BbnP34dXajAEaufs9ZvYQMM3dXzKze4ESd3+ohv1eA1wDkJmZmTNhwoSY5i4pKSE9PT2m+zxciZYp0fKAMh3Ip6vLeGVJGV/vqCS1nlFRCb88JY3spsGfd0+kz6mKMkVHmQ6sNvIMHTp0hrsP2usJd4/LDRgLFAD5wFpgOzC+WpvOwPzw/Y/CbfOBImATcNP+3iMnJ8djbcqUKTHf5+FKtEyJlsddmfbn1ZkFfuzP3/bOd07afTvmrjf91ZkFQUdz98T5nCIpU3SU6cBqIw8w3WuoiXH7uu7uY9w92927AJcBH7j7aDPrFtFsBLA43P6b7t4l3P5R4Nfu/qd45RVJVuMm51FaVrHHtrIKZ9zkvIASiUhtSISZ7B40sx5AJbACuC7gPCJJq6yiksKi0hqfW72P7SJSNwVS4N09F8gN3x8VRft7a0acYCYAAByhSURBVDeRSHJbv2UHL0xbyQufr9xnmw4ZaXFMJCK1LRF68CJSC9yd6Ss2849P83ln/lrKK53Turfhwv7pjJ+6gtKyyt1t01JTuP0czVwnkkxU4EWSzPZd5bw+ezXPfraCRWu20LRRfb57chdGn9iZrq2bANCrQ3PGTc6jsKiUrIw0bj+nByMHZAWcXERiSQVeJEnkb9zGc1NX8PL0VWzZUc6x7Zoy9qI+XNi/A40b7Pm/+sgBWYwckEVubi5DhgwJJrCI1CoVeJE6rLLSyV2ynmc/W0Fu3gbq1zPO7d2Oq07qwvFdWmCmddxFjlQq8CJ1UNH2Xbw8vYDnpq5g5abttGnakFvO7MblgzuR2Uzrt4uICnyd8tqswv+dN536gc6bHoHmFxbz3GcreG12ITvLKxncpSW3n9ODc3q1o0H94GehE5HEoQJfR7w2q5AxE+ftnqCksKiUMRPnAajIJ7ld5ZW8PX8Nz362ghkrNpOWmsJFA7O48sQu9OzQLOh4IpKgVODriLFvL9pr9rHSsgru+/cCenVoxlFt0kmpp/OtyWRt8Q5e+HwFL0xbxcaSnXRp1Zifn38cF+d0pHnj1KDjiUiCU4FPcNt2lvPnKUtZt2Vnjc9v3l7GWY98SOMGKfTq0Iw+WRn0zW5O76zmHNW6CfVU9OsUd+fz5Zt49rN8Ji9YR6U7Q3u05aqTOnNqtzb6fYpI1FTgE5S78/rs1Yx9exHrtuwkLTVlrx48QNumDbnz3GOZV1jM3IIiXpi2gqc/CU1gkt6wPj07NKNvVnP6ZDenT1ZzurRS0U9E23aW8+qsQp79LJ8l60ponpbKD77RldEndKZTq8ZBxxOROkgFPgHNKyjm3n8vYMaKzfTJas5jVwxk1abSPc7BQ2j2sbuGHcfIAVmMyskGoLyikqUbSphXUBwu+sU8N3UFO8tDRb9pw/r0ympG3+wMemc1p29Wczq3aqzLqQKybEMJz322gldmFLB1Zzm9OjTjt6P6ckG/DqQ1SAk6nojUYSrwCWRjyU7GvZPHSzNW0apJA347qi/fzsmmXj0jp3OozYFmH6ufUo9j2zXj2HbNuHhQRyC0wMiX60qYX1jM3MIi5hUU88wn+eyqCBf9RvXpE9HL75uVQceWaSr6MVL96oefnNWdpmmpPPtZPh99uZHUFGNYn/ZcdVIXBnbK0OcuIjGhAp8Ayioq+cen+fz+vS8pLavgB6d05UdndqNZoz0HUh3q7GOpKfXo2aEZPTs045LjQ0V/V3klS9ZtDRf9YuYXFvP0x8spq3AAmqel7lH0+2Q1J7vFnkVfl+0dWE1XP/zk5Tk40K5ZI35yVncuG9yJNk0bBhtURJKOCnzA/rtkA7/89wKWbdjGqd3bcPfwnhzTNr3W37dB/Xr0zgoNxrssvG1neQVL1pYwr7CYeYVFzC0o5m8ffkV5ZajoZzRO3V3sS8vKeeHzVbsP/euyvZrVtPa6Ay0bp/LRnUNJTdG16yJSO1TgA5K/cRu/enMh7y1aT+dWjXnyqkGccVzbQA/PNqyfEuqxZzcHOgGwo6yCvLVbQ0W/INTb/8uHX1ERLvqRSssqGDc5TwU+wr7WWN+8vUzFXURqlQp8nG3bWc6fpizlqY+Wk5pi3HnusVz9jS40rJ+YA6oapabQr2MG/Tpm7N62o6yC437xDnuX+H0XtCNVZrNGrN2yY6/tWntdRGqbCnycuDuvzS5k7FuLWb91JxcNzOLOc4+tk/OGN0pNoUNGGoU1FHMVrv8pq6gkrcHevXStvS4i8aBjhHEwt6CIUY9/yo//OYd2zRsx8YaT+d0l/etkca9y+zk9SEvd+6jD2T3bBpAmMT349mKWb9zOlSd2Iiv8xScrI42xF/XRaQwRqXXqwdeiDVt3Mm7yYl6eURC67O3bffn2wOykmGimqkBVjaLv0LwRqSnGP6cXcPkJneme2TTghMF6c+4anvp4Od87uQv3jujF/aC110UkrlTga8Gu8tBlb394P3TZ2/99oys/PGPvy97quuqX7a0t3sHwP37Mtc/N4LUbT6F5WnL9vNFaun4rd/xrDgM7ZXDXsOOCjiMiRygdoo+x3Lz1nPv7D3ngrUXkdGnB5B+fys/O75l0xb0m7Zo3Cs+6t51b/zmbyhpG2ie7bTvLuW78TBqlpvDnKwZqCVcRCYz++sRI/sZt/OCZL/je37/AHZ7+3iCe+f5gjm5T+9e0J5LBXVvyi+E9eX/xen7//pdBx4krd+fOV+by1YYS/nj5ANo314BDEQmODtEfppKd5fzpg6U8/XHosrefnncs3z8lcS97i4erTurM3IJifv/+l/TOas5ZPTODjhQXf/8kn0lz13Dnucdy8jGtg44jIkc4FfhDVFnpvDqrkN+8E7rsbdTAbO48twdt6/DI+FgxMx74Vm/y1m3h1n/O5rWbTkn6IxnT8zfx67cWcVbPTK477aig44iI6BD9oZizqohRT3zKT16eQ/uMNF694WQevqSfinuERqkpPDE6h9T69bj2uRmU7CwPOlKtWb91Bzc8P5PsFmk8fEk/LRYjIglBBf4grN+6g9tfnsOFf/6EVZtKeejifrx6/ckM6NQi6GgJKbtFY/50+QC+2lDCbS/NwT35Bt2VV1TywxdmsWVHGY+PzjkiBlOKSN2gQ/T7ELlSWofP3mdQlxZ8sHgDO8sruPbUo7jp9GNoqj/mB3TyMa25a9hx/OrNRTyWu4wbhx4TdKSYGjc5j8+Xb+J3l/TjuPbNgo4jIrJb3Au8maUA04FCdx9uZvcDFwKVwHrge+6+2szOAh4EGgC7gNvd/YN4ZKy+xOfq4h28MWcNx7Vryp+vGMhRSX4+OdZ+8I2uzCko5qF38+jVoRlDeiTHbHfvzF/DXz78itEnduKigdlBxxER2UMQh+hvBhZFPB7n7n3dvT8wCbg7vH0jcIG79wG+CzwXr4A1LfEJsGVHuYr7ITAzfjOqDz0ym/KjF2ex4uttQUc6bF9tKOG2l+fSr2MGvxjeM+g4IiJ7iWuBN7Ns4Hzgyapt7r4lokkTQstl4+6z3H11ePsCoJGZNYxHzn2tiKaV0g5d4wb1+euVgzAzrn1uBtt
31d1Bd9t3lXP9+JmkphiPXTHwiL4kUkQSV7x78I8CdxA6HL+bmT1gZquAK/hfDz7SKGCWu++s/Yj7XhFNK6Udnk6tGvOHyweQt24rd74yr04OunN3xkycx5L1W/nD5QN2LyIjIpJoLF5/ZM1sODDM3W8wsyHAbe4+vFqbMUAjd78nYlsv4A3gbHdfVsN+rwGuAcjMzMyZMGHCYWf9dHUZz8zfxa6IryEN6sH3ejfg5A7BD6wrKSkhPT1xThUcbJ5Jy3bxry/LuLRHA87rWjufZ219Ru+tKGP8ol1c1C2VEUc3SIhMh0OZoqNM0VGmA6uNPEOHDp3h7oP2esLd43IDxgIFQD6wFtgOjK/WpjMwP+JxNrAEOCWa98jJyfFYeXVmgZ889n3vfOckP3ns+/7qzIKY7ftwTZkyJegIezjYPJWVlX7dc9O9608n+SdfbkiITNGYsWKTH3PXm37136d5RUVlQmQ6XMoUHWWKjjIdWG3kAaZ7DTUxbofo3X2Mu2e7exfgMuADdx9tZt0imo0AFgOYWQbwJjDG3T+JV84qIwdk8clPT+eZc5vwyU9P1/rdMWRmjLu4H0e3SeemF2dRsHl70JEOaGPJTm4YP5P2zdP43SX9k2LJXxFJbokw0c2DZjbfzOYCZxMaZQ9wE3AM8Aszmx2+Jcf1VUJ6w/r85cocysoruW78DHbUcNVCoqiodH704iw2b9/F46MH0rxx8KdpREQOJJAC7+65Hj7/7u6j3L23hy6Vu8DdC8Pbf+XuTdy9f8RtfRB5pXYc1SadRy/rz/zCLdz1auIOunv43Tw+XfY194/sTa8OzYOOIyISlUTowcsR7IzjMrnlzG5MnFnIs5+tCDrOXt5dsJbHcpdx+eCOXDKoY9BxRESipgIvgfvR6d0487i23D9pIdOWbwo6zm75G7fxk5fm0CerOfdc0CvoOCIiB0UFXgJXr57xu0v706llY254fgZri3cEHYnSXRVcN34GKeHJbBqlajIbEalbVOAlITRrlMpfrszZXVh3lgc36M7d+dlr88hbt5VHL+1Px5aNA8siInKoVOAlYXTLbMrDl/Rj9qoi7n1jQWA5nv98JRNnFnLzGd2SZmEcETnyqMBLQjm3d3tuGHI0L05bxQufr4z7+89eVcQv/72Q07q34UendzvwC0REEpQKvCScn5zdg1O7t+GeN+Yzc+XmuL3vpm27uGH8DNo0bcijl2oyGxGp21TgJeGk1DP+cFl/2jdP4/rxM1i/tfYH3VVUOjdPmMXGkl08MTqHFk0Obp55EZFEowIvCSmjcQP+cmUOW0rLufH5mewqrzzwiw7D799bwkdfbuS+C3vRJ1uT2YhI3acCLwnruPbN+M23+/JF/mYeeHNhrb3PB4vX8YcPlnJxTjaXHa/JbEQkOdQPOoDI/ozo14F5BUX87aPl9MnO4Ns52THd/8qvt3PLhNn0bN+M+0f2xkzn3UUkOagHLwnvznOP5eSjW3HXq/OYV1Acs/3uKKvg+udnAPDE6BxNZiMiSUUFXhJe/ZR6/PHyAbRJb8i1z03n65KdMdnv3a/PZ8HqLTxyaX86tdJkNiKSXFTgpU5old6QJ0bnsHHbLm56YRblFYc36G7CtJW8NL2AH55+DGcclxmjlCIiiUMFXuqMPtnNGfutPnz21dc8+PbiQ97PvIJi7n5jAd/s1ppbzuwew4QiIolDg+ykThmVk83cgiKe/Hg5fbKbc2H/rIN6fdH2XVz//AxaN2nA7y8bQIomsxGRJKUevNQ5Px/ek8FdWnLnK3NZuHpL1K+rrHRu+eds1m3ZwWOjc2ipyWxEJImpwEudk5pSjz9dMYDmaalcO346Rdt3RfW6P3zwJbl5G7j7gl7075hRyylFRIKlAi91UtumjXh8dA7rinfywxdnUVHp+22fm7ee37//JRcNyGL0CZ3ilFJEJDgq8FJnDezUgvsu7MVHX27k4Xfz9tlu1abt3PLP2fTIbMoD3+qjyWxE5IigAi912uWDO3H54E48lruMt+et2ev5HWUV3PD8TCoqnMdH55DWQJPZiMiRQQVe6rx7R/RkQKcMfvLyHJas27rHc/f9eyHzCot5+JJ+dG3dJKCEIiLxp8vkpM5rWD+Fx6/IYfgfP+Y7f5tKako91hTvIOPDdynaXsZ1px3N2b3aBR1TRCSu1IOXpNCueSO+c0JHNpbsYk1xaP34ou1l1DPo3lY9dxE58qjAS9J4ZUbhXtsqHR7+z5cBpBERCZYKvCSN1UWlB7VdRCSZxb3Am1mKmc0ys0nhx/eb2Vwzm21m75pZh4i2Y8xsqZnlmdk58c4qdUuHjLSD2i4iksyC6MHfDCyKeDzO3fu6e39gEnA3gJn1BC4DegHnAo+Zma5xkn26/ZwepFVb0z0tNYXbz+kRUCIRkeDEtcCbWTZwPvBk1TZ3j5xMvAlQNSXZhcAEd9/p7suBpcDgeGWVumfkgCzGXtSHrHCPPSsjjbEX9WHkgINbkEZEJBnE+zK5R4E7gKaRG83sAeAqoBgYGt6cBUyNaFYQ3iayTyMHZDFyQBa5ubkMGTIk6DgiIoEx9/3P4R2zNzIbDgxz9xvMbAhwm7sPr9ZmDNDI3e8xsz8Dn7n7+PBzTwFvufsr1V5zDXANQGZmZs6ECRNimrukpIT09PSY7vNwJVqmRMsDyhQtZYqOMkVHmQ6sNvIMHTp0hrsP2usJd4/LDRhLqBeeD6wFtgPjq7XpDMwP3x8DjIl4bjJw0v7eIycnx2NtypQpMd/n4Uq0TImWx12ZoqVM0VGm6CjTgdVGHmC611AT43YO3t3HuHu2u3chNHjuA3cfbWbdIpqNABaH778BXGZmDc2sK9ANmBavvCIiInVZIkxV+6CZ9QAqgRXAdQDuvsDMXgIWAuXAje5eEVxMERGRuiOQAu/uuUBu+P6o/bR7AHggPqlERESSh2ayExERSUIq8CIiIklIBV5ERCQJxe06+Hgwsw2EBurFUmtgY4z3ebgSLVOi5QFlipYyRUeZoqNMB1YbeTq7e5vqG5OqwNcGM5vuNU0gEKBEy5RoeUCZoqVM0VGm6CjTgcUzjw7Ri4iIJCEVeBERkSSkAn9gfw06QA0SLVOi5QFlipYyRUeZoqNMBxa3PDoHLyIikoTUgxcREUlCKvD7YWYpZjbLzCYFnQXAzPLNbJ6ZzTaz6UHnATCzDDP7l5ktNrNFZnZSwHl6hD+fqtsWM7slyEzhXD82swVmNt/MXjSzRgmQ6eZwngVBfUZm9rSZrTez+RHbWprZf8zsy/C/LRIg08Xhz6nSzOI+InsfmcaF/7+ba2avmllGwHnuD2eZbWbvmlmHeOXZV6aI524zMzez1kFnMrN7zaww4m/UsNp6fxX4/bsZWBR0iGqGunv/BLrs4/fAO+5+LNCPgD8vd88Lfz79gRxCyxK/GmQmM8sCfgQMcvfeQAqhFRWDzNQb+H/AYEK/t+HVVnaMl2eAc6tt+ynwvrt3A94PPw4603zgIuDDOGep8gx7Z/oP0Nvd+wJLCC2xHWSece7eN/z/3iTg7jjm2VcmzKwjcB
awMs55YB+ZgEeq/k65+1u19eYq8PtgZtnA+cCTQWdJVGbWDDgVeArA3Xe5e1GwqfZwBrDM3WM9+dGhqA+kmVl9oDGwOuA8xwFT3X27u5cD/wW+Fe8Q7v4hsKna5guBf4Tv/wMYGXQmd1/k7nnxzFHt/WvK9G74dwcwFcgOOM+WiIdNgLgO8NrHf0sAjwB3xDsP7DdTXKjA79ujhP6jqAw6SAQH3jWzGWZ2TdBhgKOADcDfw6cynjSzJkGHinAZ8GLQIdy9EHiIUA9iDVDs7u8Gm4r5wKlm1srMGgPDgI4BZ6qS6e5rAML/tg04T11wNfB20CHM7AEzWwVcQfx78DXlGQEUuvucoLNUc1P4dMbTtXkKSgW+BmY2HFjv7jOCzlLNKe4+EDgPuNHMTg04T31gIPC4uw8AthH/w6k1MrMGwAjg5QTI0oJQr7Qr0AFoYmajg8zk7ouA3xA6zPsOMAco3++LJCGZ2c8I/e6eDzqLu//M3TuGs9wUZJbwF9efkQBfNKp5HDga6E/oC//DtfVGKvA1OwUYYWb5wATgdDMbH2wkcPfV4X/XEzqvPDjYRBQABe7+efjxvwgV/ERwHjDT3dcFHQQ4E1ju7hvcvQyYCJwccCbc/Sl3H+jupxI6jPhl0JnC1plZe4Dwv+sDzpOwzOy7wHDgCk+sa55fAEYFnOFoQl+q54T/lmcDM82sXZCh3H2du1e4eyXwN2rx77gKfA3cfYy7Z7t7F0KHeT9w90B7XGbWxMyaVt0HziZ0mDUw7r4WWGVmPcKbzgAWBhgp0uUkwOH5sJXAiWbW2MyM0OcU+OBNM2sb/rcToQFkifJ5vQF8N3z/u8DrAWZJWGZ2LnAnMMLdtydAnshBmiOAxUFlAXD3ee7e1t27hP+WFwADw3+3AlP15TXsW9Ti3/H6tbVjiblM4NVQfaA+8IK7vxNsJAB+CDwfPiT+FfD9gPNUHZo7C7g26CwA7v65mf0LmEnoUOosEmN2rVfMrBVQBtzo7pvjHcDMXgSGAK3NrAC4B3gQeMnMfkDoy9HFCZBpE/BHoA3wppnNdvdzAs40BmgI/Cf8d2Gqu18XYJ5h4S/7lYRW9YxLlv1lcven4pkhmkzAEDPrT2hMVT61+HdKM9mJiIgkIR2iFxERSUIq8CIiIklIBV5ERCQJqcCLiIgkIRV4ERGRJKQCLyIikoRU4EUOk5k9YzUsKWxmg8JLVHaJf6oj175+HzHcf5fw73VQxLbGZvaOmS0PaFU+kb1oohuRJBdewa4iXlOZmllqeEreOi3azy281sCbQFNC60UEvVKgCKAevEhcWMhSM7ut2vZu4d7gwPBjN7ObzOxNM9tuZiuqL0xjZllmNsHMNodvb0b2Gs3sXjObb2bfM7NlwE5Cy3dWz5RrZk+Y2e8j9jXOzOpFtBltZl+Y2VYzW29mL4fXt696fkg48zAzm2Zmu4BzzOxoM3vdzNaa2TYzmxlexCny/fPN7O5wj3urma0ys0vNLCP885WY2Zdmdna11/UM/8xVmV6sml/czO4lNL3t+eFcbmZDYvm5VcvSgf+tEX+qirskEhV4kTgI9wKfIrSsZ6SrgdnuPjNi232E5mPvT2hK22erDgeHp+GdAuwATgNOIrQi1Xvh56p0Bb5DaJrXfuH2NbmC0N+BkwhNmXkNcEvE8w0ITa/Zj9CiJq2pec763wA/B44FPgfSCS1felb4ta8AE83s2GqvuwWYRmiRopcIrf/+AvBW+Of/EBhvZo3CP3/78Lb5hBbpODP8Xm+Ev5g8FN7Pe0D78O3TWvjcAI4BPiE0x/mZQUz1K7Jf7q6bbrodxg14htAc8yXVbtsJzTfdJdyuHaF5308MP04BCoGbIvblwN+q7f89YHz4/tWEVn2ziOdTgK+BS8KP7w2/T+YBcucCS6rt6+eEVgjc12uODWfMDj8eEn48KorPaSrw84jH+cCLEY/Tw/v6Q8S2LuFtg8KPfwm8X22/LcJtBkf8PiZVaxPLz60q007gUyA16P8GddOtppt68CKx8SGhHmfk7TuRDTy0itUk/teLPxdoxd7reH9Ww+Oe4fs5hHqZW8OHsEuAYkJF7uiI1xR4eKlcM/tmVdvw7YqIdlPdPfIc82dAlpk1C792YPhQ+woz2wpMD7frVC3j9MgHFlr98LdmtjB8OLwEGFTD6+ZGfD5VX4rmRTxftdxv24if/9TInwdYFX4u8uev7qA/tyi8TugowmVRtheJKw2yE4mN7e6+NHKDmWXU0O5J4AUzu4VQoZ/oB3dotx4wm5qLyqaI+9si7k8n9IWjSlQFzELLEk8mdAThSkLrsrcGPiJ06D7StmqPHyL0BeY2Qj3n7cCzNbyu+mA8r7at6stHvYh/3wzvt7r9/VyH8rkdyG8JfbbPmFmKuz9zEK8VqXUq8CLx9Q6whdBSmhcAw2pocyLwdLXHVevHzyS01v1Gdy+K5g3dvRRYuo+nTzAzi+jFnwisdvctZpZDqKDf5e7LAczsomjeE/gG8Ky7vxJ+XSNCPeUlUb5+X2YClwArfN8j9XcROvxe/XUH9blFw91/a2ZlwFNmVt/dn4zVvkUOlw7Ri8SRu1cQKt5jCZ1/f7+GZheZ2f8Lj7AfA5wBPBp+7nlCPdXXzew0M+tqZqea2cN2aNdfdwAeNbMeZvZt4HbgkfBzKwmdZ77JzI4ys/OB+6Pc7xLgW+FD/H2A8UCjQ8hX3Z+B5sA/zeyEcK4zzeyvZtY03CYf6B3+mVqbWSqx/9x2c/dHgJuBv5hZra3tLXKwVOBF4u9pQoeq/17t/HeVe4FRhM5PXw98392/AHD37cCpwFfAy8BiQiPPWwCHMor7eUK93c+BvxEa6f9I+L02ELrkbCSwkNBo+luj3O+thA7pf0RoNP3U8P3D4qHL0E4BKgkdDVlAqOjvDN8I/xyLCB0+30Do2vRYf27Vc/0JuBF4zMxuONz9icSC1fz3RURqi5mdQOjyqqPcfWW15xy42N3/FYccucB8d7+ptt9LROJP5+BF4sTMGgIdgV8Br1Yv7iIisaRD9CLxczmQR+jSuGgPdYuIHBIdohcREUlC6sGLiIgkIRV4ERGRJKQCLyIikoRU4EVERJKQCryIiEgSUoEXERFJQv8fvYtmdBjFjMIAAAAASUVORK5CYII=\n", 192 | "text/plain": [ 193 | "
" 194 | ] 195 | }, 196 | "metadata": { 197 | "needs_background": "light" 198 | }, 199 | "output_type": "display_data" 200 | } 201 | ], 202 | "source": [ 203 | "fig, ax = plt.subplots(figsize=(8,5))\n", 204 | "ax.plot(candidates, mae_metrics, \"o-\")\n", 205 | "ax.set_xlabel('Hyper-parameter K', fontsize=14)\n", 206 | "ax.set_ylabel('MAE', fontsize=14)\n", 207 | "ax.set_xticks(candidates)\n", 208 | "ax.grid();" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "#### Recalculating train-set split" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 10, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=7)\n", 225 | "scaler = StandardScaler()\n", 226 | "scaler.fit(X_train[numerical_features])\n", 227 | "X_train.loc[:, numerical_features] = scaler.fit_transform(X_train[numerical_features])\n", 228 | "X_test.loc[:, numerical_features] = scaler.transform(X_test[numerical_features])" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "#### Optimizing with cross-validation" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "from sklearn.model_selection import cross_val_score\n", 245 | "candidates = np.arange(4,16)\n", 246 | "mean_mae = []\n", 247 | "std_mae = []\n", 248 | "for k in candidates:\n", 249 | " model = KNeighborsRegressor(n_neighbors=k, weights='distance', metric='minkowski', leaf_size=50, n_jobs=4)\n", 250 | " cv_results = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=10)\n", 251 | " mean_score, std_score = -1*cv_results.mean(), cv_results.std()\n", 252 | " mean_mae.append(mean_score)\n", 253 | " std_mae.append(std_score)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": { 260 | "scrolled": true 261 | }, 262 | "outputs": [], 263 | "source": [ 264 | "fig, ax = plt.subplots(figsize=(8,5))\n", 265 | "ax.plot(candidates, mean_mae, \"o-\")\n", 266 | "ax.set_xlabel('Hyper-parameter K', fontsize=14)\n", 267 | "ax.set_ylabel('Mean MAE', fontsize=14)\n", 268 | "ax.set_xticks(candidates)\n", 269 | "ax.grid();" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "fig, ax = plt.subplots(figsize=(8,5))\n", 279 | "ax.plot(candidates, std_mae, \"o-\")\n", 280 | "ax.set_xlabel('Hyper-parameter K', fontsize=14)\n", 281 | "ax.set_ylabel('Standard deviation of MAE', fontsize=14)\n", 282 | "ax.set_xticks(candidates)\n", 283 | "ax.grid();" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "# Improving Performance" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": {}, 296 | "source": [ 297 | "## Improving our diamond price predictions" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "### Fitting a neural network" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "from keras.models import Sequential\n", 314 | "from keras.layers import Dense\n", 315 | "\n", 316 | "n_input = X_train.shape[1]\n", 317 | "n_hidden1 = 32\n", 318 | "n_hidden2 = 16\n", 319 | "n_hidden3 = 8\n", 320 | 
"\n", 321 | "nn_reg = Sequential()\n", 322 | "nn_reg.add(Dense(units=n_hidden1, activation='relu', input_shape=(n_input,)))\n", 323 | "nn_reg.add(Dense(units=n_hidden2, activation='relu'))\n", 324 | "nn_reg.add(Dense(units=n_hidden3, activation='relu'))\n", 325 | "# output layer\n", 326 | "nn_reg.add(Dense(units=1, activation=None))" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": { 333 | "scrolled": true 334 | }, 335 | "outputs": [], 336 | "source": [ 337 | "batch_size = 32\n", 338 | "n_epochs = 40\n", 339 | "nn_reg.compile(loss='mean_absolute_error', optimizer='adam')\n", 340 | "nn_reg.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_size, validation_split=0.05)" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "y_pred = nn_reg.predict(X_test).flatten()\n", 350 | "mae_neural_net = mean_absolute_error(y_test, y_pred)\n", 351 | "print(\"MAE Neural Network: {:0.2f}\".format(mae_neural_net))" 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "metadata": {}, 357 | "source": [ 358 | "### Transforming the target" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "diamonds['price'].hist(bins=25, ec='k', figsize=(8,5))\n", 368 | "plt.title(\"Distribution of diamond prices\", fontsize=16)\n", 369 | "plt.grid(False);" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "y_train = np.log(y_train)\n", 379 | "pd.Series(y_train).hist(bins=25, ec='k', figsize=(8,5))\n", 380 | "plt.title(\"Distribution of log diamond prices\", fontsize=16)\n", 381 | "plt.grid(False);" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "nn_reg = Sequential()\n", 391 | "nn_reg.add(Dense(units=n_hidden1, activation='relu', input_shape=(n_input,)))\n", 392 | "nn_reg.add(Dense(units=n_hidden2, activation='relu'))\n", 393 | "nn_reg.add(Dense(units=n_hidden3, activation='relu'))\n", 394 | "# output layer\n", 395 | "nn_reg.add(Dense(units=1, activation=None))" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": { 402 | "scrolled": true 403 | }, 404 | "outputs": [], 405 | "source": [ 406 | "batch_size = 32\n", 407 | "n_epochs = 40\n", 408 | "nn_reg.compile(loss='mean_absolute_error', optimizer='adam')\n", 409 | "nn_reg.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_size, validation_split=0.05)" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "y_pred = nn_reg.predict(X_test).flatten()\n", 419 | "y_pred = np.exp(y_pred)\n", 420 | "mae_neural_net2 = mean_absolute_error(y_test, y_pred)\n", 421 | "print(\"MAE Neural Network (modified target): {:0.2f}\".format(mae_neural_net2))" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [ 430 | "100*(mae_neural_net - mae_neural_net2)/mae_neural_net2" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": {}, 436 | "source": [ 437 | "#### Analyzing the results" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "metadata": {}, 444 
| "outputs": [], 445 | "source": [ 446 | "fig, ax = plt.subplots(figsize=(8,5))\n", 447 | "residuals = y_test - y_pred\n", 448 | "ax.scatter(y_test, residuals, s=3)\n", 449 | "ax.set_title('Residuals vs. Observed Prices', fontsize=16)\n", 450 | "ax.set_xlabel('Observed prices', fontsize=14)\n", 451 | "ax.set_ylabel('Residuals', fontsize=14)\n", 452 | "ax.grid();" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": null, 458 | "metadata": {}, 459 | "outputs": [], 460 | "source": [ 461 | "mask_7500 = y_test <=7500\n", 462 | "mae_neural_less_7500 = mean_absolute_error(y_test[mask_7500], y_pred[mask_7500])\n", 463 | "print(\"MAE considering price <= 7500: {:0.2f}\".format(mae_neural_less_7500))" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": {}, 470 | "outputs": [], 471 | "source": [ 472 | "fig, ax = plt.subplots(figsize=(8,5))\n", 473 | "percent_residuals = (y_test - y_pred)/y_test\n", 474 | "ax.scatter(y_test, percent_residuals, s=3)\n", 475 | "ax.set_title('Pecent residuals vs. Observed Prices', fontsize=16)\n", 476 | "ax.set_xlabel('Observed prices', fontsize=14)\n", 477 | "ax.set_ylabel('Pecent residuals', fontsize=14)\n", 478 | "ax.axhline(y=0.15, color='r'); ax.axhline(y=-0.15, color='r'); \n", 479 | "ax.grid();" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": null, 485 | "metadata": {}, 486 | "outputs": [], 487 | "source": [] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": null, 492 | "metadata": {}, 493 | "outputs": [], 494 | "source": [] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": null, 499 | "metadata": {}, 500 | "outputs": [], 501 | "source": [] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [] 509 | } 510 | ], 511 | "metadata": { 512 | "kernelspec": { 513 | "display_name": "Python 3", 514 | "language": "python", 515 | "name": "python3" 516 | }, 517 | "language_info": { 518 | "codemirror_mode": { 519 | "name": "ipython", 520 | "version": 3 521 | }, 522 | "file_extension": ".py", 523 | "mimetype": "text/x-python", 524 | "name": "python", 525 | "nbconvert_exporter": "python", 526 | "pygments_lexer": "ipython3", 527 | "version": "3.6.10" 528 | } 529 | }, 530 | "nbformat": 4, 531 | "nbformat_minor": 2 532 | } 533 | -------------------------------------------------------------------------------- /Chapter09/Model/diamond-prices-model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Predictive-Analytics-with-Python/e049d9d2870f596892b62aeddec312788d9d0c2c/Chapter09/Model/diamond-prices-model.h5 -------------------------------------------------------------------------------- /Chapter09/Model/pca.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Predictive-Analytics-with-Python/e049d9d2870f596892b62aeddec312788d9d0c2c/Chapter09/Model/pca.joblib -------------------------------------------------------------------------------- /Chapter09/Model/scaler.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Predictive-Analytics-with-Python/e049d9d2870f596892b62aeddec312788d9d0c2c/Chapter09/Model/scaler.joblib 
-------------------------------------------------------------------------------- /Chapter09/dash-example-no-user-inputs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Alvaro Fuentes 4 | Chapter 9. Hands-On Predictive Analytics with Python 5 | Building a basic static app 6 | """ 7 | ## imports 8 | import dash 9 | import dash_core_components as dcc 10 | import dash_html_components as html 11 | import plotly.graph_objs as go 12 | import pandas as pd 13 | import os 14 | 15 | ## Importing the dataset 16 | DATA_DIR = '../data' 17 | FILE_NAME = 'diamonds.csv' 18 | data_path = os.path.join(DATA_DIR, FILE_NAME) 19 | diamonds = pd.read_csv(data_path) 20 | 21 | ## Creating the app 22 | app = dash.Dash(__name__) 23 | 24 | # Creating a Plotly figure 25 | trace = go.Histogram( 26 | x = diamonds['price'] 27 | ) 28 | 29 | layout = go.Layout( 30 | title = 'Diamond Prices', 31 | xaxis = dict(title='Price'), 32 | yaxis = dict(title='Count') 33 | ) 34 | 35 | figure = go.Figure( 36 | data = [trace], 37 | layout = layout 38 | ) 39 | 40 | app.layout = html.Div([ 41 | html.H1('My first Dash App'), 42 | html.H2('Histogram of diamond prices'), 43 | html.P('This is some normal text, we can use it to describe something about the application.'), 44 | dcc.Graph(id='my-histogram', figure=figure) 45 | ]) 46 | 47 | 48 | if __name__ == '__main__': 49 | app.run_server(debug=True) 50 | -------------------------------------------------------------------------------- /Chapter09/dash-example-user-inputs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Alvaro Fuentes 4 | Chapter 9. Hands-On Predictive Analytics with Python 5 | Building a basic interactive app 6 | """ 7 | ## imports 8 | import dash 9 | import dash_core_components as dcc 10 | import dash_html_components as html 11 | from dash.dependencies import Input, Output 12 | import plotly.graph_objs as go 13 | import pandas as pd 14 | import os 15 | 16 | ## Importing the dataset 17 | DATA_DIR = '../data' 18 | FILE_NAME = 'diamonds.csv' 19 | data_path = os.path.join(DATA_DIR, FILE_NAME) 20 | diamonds = pd.read_csv(data_path) 21 | diamonds = diamonds.sample(n=2000) 22 | 23 | 24 | app = dash.Dash(__name__) 25 | 26 | app.css.append_css({ 27 | 'external_url': 'https://codepen.io/chriddyp/pen/bWLwgP.css' 28 | }) 29 | 30 | numerical_features = ['price','carat','depth','table','x','y','z'] 31 | options_dropdown = [{'label':x.upper(), 'value':x} for x in numerical_features] 32 | 33 | dd_x_var = dcc.Dropdown( 34 | id='x-var', 35 | options = options_dropdown, 36 | value = 'carat' 37 | ) 38 | 39 | div_x_var = html.Div( 40 | children=[html.H4('Variable for x axis: '), dd_x_var], 41 | className="six columns" 42 | ) 43 | 44 | 45 | dd_y_var = dcc.Dropdown( 46 | id='y-var', 47 | options = options_dropdown, 48 | value = 'price' 49 | ) 50 | 51 | div_y_var = html.Div( 52 | children=[html.H4('Variable for y axis: '), dd_y_var], 53 | className="six columns" 54 | ) 55 | 56 | app.layout = html.Div(children=[ 57 | html.H1('Adding interactive controls'), 58 | html.H2('Interactive scatter plot example'), 59 | html.Div( 60 | children=[div_x_var, div_y_var], 61 | className="row" 62 | ), 63 | dcc.Graph(id='scatter') 64 | ]) 65 | 66 | 67 | @app.callback( 68 | Output(component_id='scatter', component_property='figure'), 69 | [Input(component_id='x-var', component_property='value'), Input(component_id='y-var', 
component_property='value')]) 70 | def scatter_plot(x_col, y_col): 71 | trace = go.Scatter( 72 | x = diamonds[x_col], 73 | y = diamonds[y_col], 74 | mode = 'markers' 75 | ) 76 | 77 | layout = go.Layout( 78 | title = 'Scatter plot', 79 | xaxis = dict(title = x_col.upper()), 80 | yaxis = dict(title = y_col.upper()) 81 | ) 82 | 83 | output_plot = go.Figure( 84 | data = [trace], 85 | layout = layout 86 | ) 87 | 88 | return output_plot 89 | 90 | 91 | if __name__ == '__main__': 92 | app.run_server(debug=True) -------------------------------------------------------------------------------- /Chapter09/diamonds-model-training.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Alvaro Fuentes 4 | Chapter 9. Hands-On Predictive Analytics with Python 5 | Producing the predictive model's objects 6 | """ 7 | ## Imports 8 | import numpy as np 9 | import pandas as pd 10 | import os 11 | from keras.models import Sequential 12 | from keras.layers import Dense 13 | from sklearn.externals import joblib 14 | 15 | ## Loading the dataset 16 | DATA_DIR = '../data' 17 | FILE_NAME = 'diamonds.csv' 18 | data_path = os.path.join(DATA_DIR, FILE_NAME) 19 | diamonds = pd.read_csv(data_path) 20 | 21 | 22 | ## Preparing the dataset 23 | diamonds = diamonds.loc[(diamonds['x']>0) | (diamonds['y']>0)] 24 | diamonds.loc[11182, 'x'] = diamonds['x'].median() 25 | diamonds.loc[11182, 'z'] = diamonds['z'].median() 26 | diamonds = diamonds.loc[~((diamonds['y'] > 30) | (diamonds['z'] > 30))] 27 | diamonds = pd.concat([diamonds, pd.get_dummies(diamonds['cut'], prefix='cut', drop_first=True)], axis=1) 28 | diamonds = pd.concat([diamonds, pd.get_dummies(diamonds['color'], prefix='color', drop_first=True)], axis=1) 29 | diamonds = pd.concat([diamonds, pd.get_dummies(diamonds['clarity'], prefix='clarity', drop_first=True)], axis=1) 30 | 31 | ## Dimensionality reduction 32 | from sklearn.decomposition import PCA 33 | pca = PCA(n_components=1, random_state=123) 34 | diamonds['dim_index'] = pca.fit_transform(diamonds[['x','y','z']]) 35 | diamonds.drop(['x','y','z'], axis=1, inplace=True) 36 | 37 | ## Creating X and y 38 | X = diamonds.drop(['cut','color','clarity','price'], axis=1) 39 | y = np.log(diamonds['price']) 40 | 41 | ## Standarization: centering and scaling 42 | numerical_features = ['carat', 'depth', 'table', 'dim_index'] 43 | from sklearn.preprocessing import StandardScaler 44 | scaler = StandardScaler() 45 | X.loc[:, numerical_features] = scaler.fit_transform(X[numerical_features]) 46 | 47 | ## Building the neural network 48 | n_input = X.shape[1] 49 | n_hidden1 = 32 50 | n_hidden2 = 16 51 | n_hidden3 = 8 52 | 53 | nn_reg = Sequential() 54 | nn_reg.add(Dense(units=n_hidden1, activation='relu', input_shape=(n_input,))) 55 | nn_reg.add(Dense(units=n_hidden2, activation='relu')) 56 | nn_reg.add(Dense(units=n_hidden3, activation='relu')) 57 | # output layer 58 | nn_reg.add(Dense(units=1, activation=None)) 59 | 60 | ## Training the neural network 61 | batch_size = 32 62 | n_epochs = 40 63 | nn_reg.compile(loss='mean_absolute_error', optimizer='adam') 64 | nn_reg.fit(X, y, epochs=n_epochs, batch_size=batch_size) 65 | 66 | ## Serializing: 67 | # PCA 68 | joblib.dump(pca, './Model/pca.joblib') 69 | 70 | # Scaler 71 | joblib.dump(scaler, './Model/scaler.joblib') 72 | 73 | # Trained model 74 | nn_reg.save("./Model/diamond-prices-model.h5") -------------------------------------------------------------------------------- /Chapter09/predict-diamond-prices.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Alvaro Fuentes 4 | Chapter 9. Hands-On Predictive Analytics with Python 5 | Building the web application 6 | """ 7 | 8 | import dash 9 | import dash_core_components as dcc 10 | import dash_html_components as html 11 | from dash.dependencies import Input, Output 12 | 13 | from keras.models import load_model 14 | from sklearn.externals import joblib 15 | 16 | import numpy as np 17 | import pandas as pd 18 | 19 | app = dash.Dash(__name__) 20 | app.css.append_css({ 21 | 'external_url': 'https://codepen.io/chriddyp/pen/bWLwgP.css' 22 | }) 23 | 24 | model = load_model('./Model/diamond-prices-model.h5') 25 | pca = joblib.load('./Model/pca.joblib') 26 | scaler = joblib.load('./Model/scaler.joblib') 27 | model._make_predict_function() 28 | 29 | ## Div for carat 30 | input_carat = dcc.Input( 31 | id='carat', 32 | type='number', 33 | value=0.7) 34 | 35 | div_carat = html.Div( 36 | children=[html.H3('Carat:'), input_carat], 37 | className="four columns" 38 | ) 39 | 40 | ## Div for depth 41 | input_depth = dcc.Input( 42 | id='depth', 43 | placeholder='', 44 | type='number', 45 | value=60) 46 | 47 | div_depth = html.Div( 48 | children=[html.H3('Depth:'), input_depth], 49 | className="four columns" 50 | ) 51 | 52 | ## Div for table 53 | input_table = dcc.Input( 54 | id='table', 55 | placeholder='', 56 | type='number', 57 | value=60) 58 | 59 | div_table = html.Div( 60 | children=[html.H3('Table:'), input_table], 61 | className="four columns" 62 | ) 63 | 64 | ## Div for x 65 | input_x = dcc.Input( 66 | id='x', 67 | placeholder='', 68 | type='number', 69 | value=5) 70 | 71 | div_x = html.Div( 72 | children=[html.H3('x value:'), input_x], 73 | className="four columns" 74 | ) 75 | 76 | ## Div for y 77 | input_y = dcc.Input( 78 | id='y', 79 | placeholder='', 80 | type='number', 81 | value=5) 82 | 83 | div_y = html.Div( 84 | children=[html.H3('y value:'), input_y], 85 | className="four columns" 86 | ) 87 | 88 | ## Div for z 89 | input_z = dcc.Input( 90 | id='z', 91 | placeholder='', 92 | type='number', 93 | value=3) 94 | 95 | div_z = html.Div( 96 | children=[html.H3('z value: '), input_z], 97 | className="four columns" 98 | ) 99 | 100 | ## Div for cut 101 | cut_values = ['Fair', 'Good', 'Ideal', 'Premium', 'Very Good'] 102 | cut_options = [{'label': x, 'value': x} for x in cut_values] 103 | input_cut = dcc.Dropdown( 104 | id='cut', 105 | options = cut_options, 106 | value = 'Ideal' 107 | ) 108 | 109 | div_cut = html.Div( 110 | children=[html.H3('Cut:'), input_cut], 111 | className="four columns" 112 | ) 113 | 114 | ## Div for color 115 | color_values = ['D', 'E', 'F', 'G', 'H', 'I', 'J'] 116 | color_options = [{'label': x, 'value': x} for x in color_values] 117 | input_color = dcc.Dropdown( 118 | id='color', 119 | options = color_options, 120 | value = 'G' 121 | ) 122 | 123 | div_color = html.Div( 124 | children=[html.H3('Color:'), input_color], 125 | className="four columns" 126 | ) 127 | 128 | ## Div for clarity 129 | clarity_values = ['I1', 'IF', 'SI1', 'SI2', 'VS1', 'VS2', 'VVS1', 'VVS2'] 130 | clarity_options = [{'label': x, 'value': x} for x in clarity_values] 131 | input_clarity = dcc.Dropdown( 132 | id='clarity', 133 | options = clarity_options, 134 | value = 'SI1' 135 | ) 136 | 137 | div_clarity = html.Div( 138 | children=[html.H3('Clarity:'), input_clarity], 139 | className="four columns" 140 | ) 141 | 142 | ## Div for numerical characteristics 143 | div_numerical =
html.Div( 144 | children = [div_carat, div_depth, div_table], 145 | className="row" 146 | ) 147 | 148 | ## Div for dimensions 149 | div_dimensions = html.Div( 150 | children = [div_x, div_y, div_z], 151 | className="row" 152 | ) 153 | 154 | ## Div for categorical 155 | div_categorical = html.Div( 156 | children = [div_cut, div_color, div_clarity], 157 | className="row" 158 | ) 159 | 160 | def get_prediction(carat, depth, table, x, y, z, cut, color, clarity): 161 | '''takes the inputs from the user and produces the price prediction''' 162 | 163 | cols = ['carat', 'depth', 'table', 164 | 'cut_Good', 'cut_Ideal', 'cut_Premium', 'cut_Very Good', 165 | 'color_E', 'color_F', 'color_G', 'color_H', 'color_I', 'color_J', 166 | 'clarity_IF','clarity_SI1', 'clarity_SI2', 'clarity_VS1', 'clarity_VS2','clarity_VVS1', 'clarity_VVS2', 167 | 'dim_index'] 168 | 169 | cut_dict = {x: 'cut_' + x for x in cut_values[1:]} 170 | color_dict = {x: 'color_' + x for x in color_values[1:]} 171 | clarity_dict = {x: 'clarity_' + x for x in clarity_values[1:]} 172 | 173 | ## produce a dataframe with a single row of zeros 174 | df = pd.DataFrame(data = np.zeros((1,len(cols))), columns = cols) 175 | 176 | ## get the numeric characteristics 177 | df.loc[0,'carat'] = carat 178 | df.loc[0,'depth'] = depth 179 | df.loc[0,'table'] = table 180 | 181 | ## transform dimensions into a single dim_index using PCA 182 | dims_df = pd.DataFrame(data=[[x, y, z]], columns=['x','y','z']) 183 | df.loc[0,'dim_index'] = pca.transform(dims_df).flatten()[0] 184 | 185 | ## Use the one-hot encoding for the categorical features 186 | if cut!='Fair': 187 | df.loc[0, cut_dict[cut]] = 1 188 | 189 | if color!='D': 190 | df.loc[0, color_dict[color]] = 1 191 | 192 | if clarity != 'I1': 193 | df.loc[0, clarity_dict[clarity]] = 1 194 | 195 | ## Scale the numerical features using the trained scaler 196 | numerical_features = ['carat', 'depth', 'table', 'dim_index'] 197 | df.loc[:,numerical_features] = scaler.transform(df.loc[:,numerical_features]) 198 | 199 | ## Get the predictions using our trained neural network 200 | prediction = model.predict(df.values).flatten()[0] 201 | 202 | ## Transform the log-prices to prices 203 | prediction = np.exp(prediction) 204 | 205 | return int(prediction) 206 | 207 | ## App layout 208 | app.layout = html.Div([ 209 | html.H1('IDR Predict diamond prices'), 210 | 211 | html.H2('Enter the diamond characteristics to get the predicted price'), 212 | 213 | html.Div( 214 | children=[div_numerical, div_dimensions, div_categorical] 215 | ), 216 | html.H1(id='output', 217 | style={'margin-top': '50px', 'text-align': 'center'}) 218 | ]) 219 | 220 | predictors = ['carat', 'depth', 'table', 'x', 'y', 'z', 'cut', 'color', 'clarity'] 221 | @app.callback( 222 | Output('output', 'children'), 223 | [Input(x, 'value') for x in predictors]) 224 | def show_prediction(carat, depth, table, x, y, z, cut, color, clarity): 225 | pred = get_prediction(carat, depth, table, x, y, z, cut, color, clarity) 226 | return str("Predicted Price: {:,}".format(pred)) 227 | 228 | 229 | if __name__ == '__main__': 230 | app.run_server(debug=True) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without 
restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Hands-On Predictive Analytics with Python 5 | 6 | 7 | 8 | This is the code repository for [Hands-On Predictive Analytics with Python](https://www.packtpub.com/big-data-and-business-intelligence/hands-predictive-analytics-python?utm_source=github&utm_medium=repository&utm_campaign=9781789138719), published by Packt. 9 | 10 | **Master the complete predictive analytics process, from problem definition to model deployment** 11 | 12 | ## What is this book about? 13 | This book will teach you all the processes you need to build a predictive analytics solution: understanding the problem, preparing datasets, exploring relationships, model building, tuning, evaluation, and deployment. You'll learn to use Python and its data analytics ecosystem to implement the main techniques used in real-world projects. 14 | 15 | This book covers the following exciting features: 16 | * Get to grips with the main concepts and principles of predictive analytics 17 | * Learn about the stages involved in producing complete predictive analytics solutions 18 | * Understand how to define a problem, propose a solution, and prepare a dataset 19 | * Use visualizations to explore relationships and gain insights into the dataset 20 | * Learn to build regression and classification models using scikit-learn 21 | * Use Keras to build powerful neural network models that produce accurate predictions 22 | * Learn to serve a model's predictions as a web application 23 | 24 | If you feel this book is for you, get your [copy](https://www.amazon.com/dp/178913871X) today! 25 | 26 | https://www.packtpub.com/ 28 | 29 | ## Instructions and Navigations 30 | 31 | ### Installation 32 | To be able to run the code of the book without any problems, please do the following: 33 | 1. Download the Anaconda distribution for your system; you can find the installers [here](https://www.anaconda.com) 34 | 1. Once you have installed the Anaconda distribution, create a new Python 3.6 environment with the packages you will need. 35 | To create the environment (named `ho-pawp`, but you can use any other name you like), run the following command 36 | in the Anaconda Prompt terminal: `conda create --name ho-pawp --file requirements.txt` 37 | 38 | For a quick guide on conda, refer to the conda-cheatsheet.pdf in this repo. 39 | ### Using the code files 40 | 41 | All of the code is organized into folders. Most of the code consists of Jupyter Notebooks. For example, Chapter02.
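A minimal sketch of a typical session, assuming the `ho-pawp` environment created in the Installation section above (older conda versions use `activate ho-pawp` instead of `conda activate ho-pawp`):

```
# activate the environment and start Jupyter from the repository root
conda activate ho-pawp
jupyter notebook

# the Chapter09 deployment scripts are plain Python files; run them from inside
# that folder so the relative paths to the Data and Model folders resolve
cd Chapter09
python diamonds-model-training.py
python predict-diamond-prices.py
```

The training script writes `pca.joblib`, `scaler.joblib`, and `diamond-prices-model.h5` into `Chapter09/Model`, and the prediction script serves the Dash app locally (by default at http://127.0.0.1:8050).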
42 | 43 | The code will look like the following: 44 | ``` 45 | carat_values = np.arange(0.5, 5.5, 0.5) 46 | preds = first_ml_model(carat_values) 47 | pd.DataFrame({"Carat": carat_values, "Predicted price":preds}) 48 | ``` 49 | 50 | **The following is what you need for this book:** 51 | This book is aimed at data scientists, data engineers, software engineers, and business analysts. Students and professionals who constantly work with data in quantitative fields such as finance, economics, and business, and who would like to build models to make predictions, will also find this book useful. In general, this book is aimed at all professionals who would like to focus on the practical implementation of predictive analytics with Python. 52 | 53 | With the following software and hardware list you can run all code files present in the book (Chapters 1-9). 54 | ### Software and Hardware List 55 | | Chapter | Software required | OS required | 56 | | ------- | ------------------------------------ | ----------------------------------- | 57 | | 1-9 | Python 3.6 or higher; Jupyter Notebook; recent versions of NumPy, pandas, Matplotlib, Seaborn, and scikit-learn; recent installations of TensorFlow and Keras; the basic Dash libraries | Windows, Mac OS X, and Linux (Any) | 58 | 59 | We also provide a PDF file that has color images of the screenshots/diagrams used in this book. [Click here to download it](). 60 | 61 | ### Related products 62 | * TensorFlow: Powerful Predictive Analytics with TensorFlow [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/tensorflow-powerful-predictive-analytics-tensorflow?utm_source=github&utm_medium=repository&utm_campaign=9781789136913) [[Amazon]](https://www.amazon.com/dp/1789136911) 63 | 64 | * Building Machine Learning Systems with Python - Third Edition [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/building-machine-learning-systems-python-third-edition?utm_source=github&utm_medium=repository&utm_campaign=9781788623223) [[Amazon]](https://www.amazon.com/dp/1788623223) 65 | 66 | 67 | ## Get to Know the Author 68 | **Alvaro Fuentes** is a Senior Data Scientist with more than 13 years of experience in analytical roles. 69 | He holds an M.S. in applied mathematics and an M.S. in quantitative economics. He has been working for one of the top global 70 | management consulting firms, solving analytical and AI problems in industries such as banking, telecom, and mining. 71 | He worked for many years in the Central Bank of Guatemala as an economic analyst, building models for economic and financial data. 72 | He is a big Python fan and has been using it routinely for 5+ years to analyze data and to build and deploy analytical models that transform data into intelligence. 73 | 74 | 75 | ### Suggestions and Feedback 76 | [Click here](https://docs.google.com/forms/d/e/1FAIpQLSdy7dATC6QmEL81FIUuymZ0Wy9vH1jHkvpY57OiMeKGqib_Ow/viewform) if you have any feedback or suggestions. 77 | 78 | 79 | ### Download a free PDF 80 | 81 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.
82 | https://packt.link/free-ebook/9781789138719
-------------------------------------------------------------------------------- /conda-cheatsheet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Predictive-Analytics-with-Python/e049d9d2870f596892b62aeddec312788d9d0c2c/conda-cheatsheet.pdf -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: win-64 4 | _tflow_select=2.2.0=eigen 5 | absl-py=0.9.0=py36_0 6 | astor=0.8.0=py36_0 7 | attrs=19.3.0=py_0 8 | backcall=0.2.0=py_0 9 | blas=1.0=mkl 10 | bleach=3.1.5=py_0 11 | blinker=1.4=py36_0 12 | brotli=1.0.7=pypi_0 13 | brotlipy=0.7.0=py36he774522_1000 14 | ca-certificates=2020.6.24=0 15 | cachetools=4.1.0=py_1 16 | certifi=2020.6.20=py36_0 17 | cffi=1.14.0=py36h7a1dbc1_0 18 | chardet=3.0.4=py36_1003 19 | click=7.1.2=py_0 20 | colorama=0.4.3=py_0 21 | cryptography=2.9.2=py36h7a1dbc1_0 22 | cycler=0.10.0=py36h009560c_0 23 | dash=0.28.5=pypi_0 24 | dash-core-components=0.35.0=pypi_0 25 | dash-html-components=0.13.2=pypi_0 26 | dash-renderer=0.15.0=pypi_0 27 | decorator=4.4.2=py_0 28 | defusedxml=0.6.0=py_0 29 | entrypoints=0.3=py36_0 30 | flask=1.1.2=pypi_0 31 | flask-compress=1.5.0=pypi_0 32 | freetype=2.10.2=hd328e21_0 33 | gast=0.2.2=py36_0 34 | google-auth=1.17.2=py_0 35 | google-auth-oauthlib=0.4.1=py_2 36 | google-pasta=0.2.0=py_0 37 | graphviz=2.38=hfd603c8_2 38 | grpcio=1.27.2=py36h351948d_0 39 | h5py=2.10.0=py36h5e291fa_0 40 | hdf5=1.10.4=h7ebc959_0 41 | icc_rt=2019.0.0=h0cc432a_1 42 | icu=58.2=ha925a31_3 43 | idna=2.10=py_0 44 | importlib-metadata=1.7.0=py36_0 45 | importlib_metadata=1.7.0=0 46 | intel-openmp=2020.1=216 47 | ipykernel=5.3.2=py36h5ca1d4c_0 48 | ipython=7.16.1=py36h5ca1d4c_0 49 | ipython_genutils=0.2.0=py36_0 50 | ipywidgets=7.5.1=py_0 51 | itsdangerous=1.1.0=pypi_0 52 | jedi=0.17.1=py36_0 53 | jinja2=2.11.2=py_0 54 | joblib=0.16.0=py_0 55 | jpeg=9b=hb83a4c4_2 56 | jsonschema=3.2.0=py36_0 57 | jupyter=1.0.0=py36_7 58 | jupyter_client=6.1.6=py_0 59 | jupyter_console=6.1.0=py_0 60 | jupyter_core=4.6.3=py36_0 61 | keras=2.3.1=0 62 | keras-applications=1.0.8=py_1 63 | keras-base=2.3.1=py36_0 64 | keras-preprocessing=1.1.0=py_1 65 | kiwisolver=1.2.0=py36h74a9793_0 66 | libpng=1.6.37=h2a8f88b_0 67 | libprotobuf=3.12.3=h7bd577a_0 68 | libsodium=1.0.18=h62dcd97_0 69 | m2w64-gcc-libgfortran=5.3.0=6 70 | m2w64-gcc-libs=5.3.0=7 71 | m2w64-gcc-libs-core=5.3.0=7 72 | m2w64-gmp=6.1.0=2 73 | m2w64-libwinpthread-git=5.0.0.4634.697f757=2 74 | markdown=3.1.1=py36_0 75 | markupsafe=1.1.1=py36he774522_0 76 | matplotlib=3.2.2=0 77 | matplotlib-base=3.2.2=py36h64f37c6_0 78 | mistune=0.8.4=py36he774522_0 79 | mkl=2020.1=216 80 | mkl-service=2.3.0=py36hb782905_0 81 | mkl_fft=1.1.0=py36h45dec08_0 82 | mkl_random=1.1.1=py36h47e9c7a_0 83 | msys2-conda-epoch=20160418=1 84 | nbconvert=5.6.1=py36_0 85 | nbformat=5.0.7=py_0 86 | notebook=6.0.3=py36_0 87 | numpy=1.18.5=py36h6530119_0 88 | numpy-base=1.18.5=py36hc3f5095_0 89 | oauthlib=3.1.0=py_0 90 | openssl=1.1.1g=he774522_0 91 | opt_einsum=3.1.0=py_0 92 | packaging=20.4=py_0 93 | pandas=1.0.5=py36h47e9c7a_0 94 | pandoc=2.10=0 95 | pandocfilters=1.4.2=py36_1 96 | parso=0.7.0=py_0 97 | pickleshare=0.7.5=py36_0 98 | pip=20.1.1=py36_1 99 | plotly=4.9.0=pypi_0 100 | prometheus_client=0.8.0=py_0 101 | prompt-toolkit=3.0.5=py_0 
102 | prompt_toolkit=3.0.5=0 103 | protobuf=3.12.3=py36h33f27b4_0 104 | pyasn1=0.4.8=py_0 105 | pyasn1-modules=0.2.7=py_0 106 | pycparser=2.20=py_2 107 | pydotplus=2.0.2=pypi_0 108 | pygments=2.6.1=py_0 109 | pyjwt=1.7.1=py36_0 110 | pyopenssl=19.1.0=py_1 111 | pyparsing=2.4.7=py_0 112 | pyqt=5.9.2=py36h6538335_2 113 | pyreadline=2.1=py36_1 114 | pyrsistent=0.16.0=py36he774522_0 115 | pysocks=1.7.1=py36_0 116 | python=3.6.10=h9f7ef89_2 117 | python-dateutil=2.8.1=py_0 118 | pytz=2020.1=py_0 119 | pywin32=227=py36he774522_1 120 | pywinpty=0.5.7=py36_0 121 | pyyaml=5.3.1=py36he774522_1 122 | pyzmq=19.0.1=py36ha925a31_1 123 | qt=5.9.7=vc14h73c81de_0 124 | qtconsole=4.7.5=py_0 125 | qtpy=1.9.0=py_0 126 | requests=2.24.0=py_0 127 | requests-oauthlib=1.3.0=py_0 128 | retrying=1.3.3=pypi_0 129 | rsa=4.0=py_0 130 | scikit-learn=0.22=py36h6288b17_0 131 | scipy=1.5.0=py36h9439919_0 132 | seaborn=0.10.1=py_0 133 | send2trash=1.5.0=py36_0 134 | setuptools=49.2.0=py36_0 135 | sip=4.19.8=py36h6538335_0 136 | six=1.15.0=py_0 137 | sqlite=3.32.3=h2a8f88b_0 138 | tensorboard=2.2.1=pyh532a8cf_0 139 | tensorboard-plugin-wit=1.6.0=py_0 140 | tensorflow=2.1.0=eigen_py36hdbbabfe_0 141 | tensorflow-base=2.1.0=eigen_py36h49b2757_0 142 | tensorflow-estimator=2.1.0=pyhd54b08b_0 143 | termcolor=1.1.0=py36_1 144 | terminado=0.8.3=py36_0 145 | testpath=0.4.4=py_0 146 | threadpoolctl=2.1.0=pyh5ca1d4c_0 147 | tornado=6.0.4=py36he774522_1 148 | traitlets=4.3.3=py36_0 149 | urllib3=1.25.9=py_0 150 | vc=14.1=h0510ff6_4 151 | vs2015_runtime=14.16.27012=hf0eaf9b_3 152 | wcwidth=0.2.5=py_0 153 | webencodings=0.5.1=py36_1 154 | werkzeug=0.16.1=py_0 155 | wheel=0.34.2=py36_0 156 | widgetsnbextension=3.5.1=py36_0 157 | win_inet_pton=1.1.0=py36_0 158 | wincertstore=0.2=py36h7fe50ca_0 159 | winpty=0.4.3=4 160 | wrapt=1.12.1=py36he774522_1 161 | yaml=0.2.5=he774522_0 162 | zeromq=4.3.2=ha925a31_2 163 | zipp=3.1.0=py_0 164 | zlib=1.2.11=h62dcd97_4 165 | --------------------------------------------------------------------------------
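The header of requirements.txt above gives the generic `conda create` form; a concrete instantiation, using the environment name suggested in the README, would be the following sketch (note the `# platform: win-64` line — the exact pinned builds are only expected to resolve on 64-bit Windows):

```
conda create --name ho-pawp --file requirements.txt
conda activate ho-pawp
```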