├── Chapter01 ├── .ipynb_checkpoints │ └── ch1-code-snippets-checkpoint.ipynb └── ch1-code-snippets.ipynb ├── Chapter02 ├── ch2-1-diamond-prices.ipynb └── ch2-2-credit-card-default.ipynb ├── Chapter03 ├── .ipynb_checkpoints │ ├── ch3-1-eda-diamond-prices-checkpoint.ipynb │ └── ch3-2-eda-credit-card-default-checkpoint.ipynb ├── ch3-1-eda-diamond-prices.ipynb └── ch3-2-eda-credit-card-default.ipynb ├── Chapter04 ├── .ipynb_checkpoints │ └── ch4-overfitting-example-checkpoint.ipynb ├── ch4-overfitting-example.ipynb └── ch4-predicting-diamond-prices.ipynb ├── Chapter05 ├── .ipynb_checkpoints │ └── ch5-predicting-credit-card-default-checkpoint.ipynb └── ch5-predicting-credit-card-default.ipynb ├── Chapter06 ├── .ipynb_checkpoints │ ├── ch6-1-regression-with-neural-networks-checkpoint.ipynb │ └── ch6-2-classification-with-neural-networks-checkpoint.ipynb ├── ch6-1-regression-with-neural-networks.ipynb ├── ch6-2-classification-with-neural-networks.ipynb └── class_initial_w.h5 ├── Chapter07 ├── .ipynb_checkpoints │ └── ch7-credit-card-def-model-tuning-and-evaluation-checkpoint.ipynb ├── ch7-credit-card-def-model-tuning-and-evaluation.ipynb └── ch7-diamond-prices-model-tuning-and-evaluation.ipynb ├── Chapter08 ├── .ipynb_checkpoints │ ├── ch8-credit-card-def-model-tuning-checkpoint.ipynb │ └── ch8-diamond-prices-model-tuning-checkpoint.ipynb ├── ch8-credit-card-def-model-tuning.ipynb └── ch8-diamond-prices-model-tuning.ipynb ├── Chapter09 ├── Model │ ├── diamond-prices-model.h5 │ ├── pca.joblib │ └── scaler.joblib ├── dash-example-no-user-inputs.py ├── dash-example-user-inputs.py ├── diamonds-model-training.py └── predict-diamond-prices.py ├── Data ├── credit_card_default.csv └── diamonds.csv ├── LICENSE ├── README.md ├── conda-cheatsheet.pdf └── requirements.txt /Chapter02/ch2-2-credit-card-default.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introducing the credit card default dataset" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Data Set Information:\n", 15 | "\n", 16 | "**This research aimed at the case of customers' default payments in Taiwan.**\n", 17 | "\n", 18 | "### Features description:\n", 19 | "\n", 20 | "- LIMIT_BAL: Amount of the given credit (NT dollar): it includes both the individual consumer credit and his/her family (supplementary) credit. \n", 21 | "- SEX: Gender (1 = male; 2 = female). \n", 22 | "- EDUCATION: Education (1 = graduate school; 2 = university; 3 = high school; 4 = others). \n", 23 | "- MARRIAGE: Marital status (1 = married; 2 = single; 3 = others). \n", 24 | "- AGE: Age (year). \n", 25 | "- PAY_1 - PAY_6: History of past payment. We tracked the past monthly payment records (from April to September, 2005) as follows: PAY_1 = the repayment status in September, 2005; PAY_2 = the repayment status in August, 2005; . . .; PAY_6 = the repayment status in April, 2005. The measurement scale for the repayment status is: -1 = pay duly; 1 = payment delay for one month; 2 = payment delay for two months; . . .; 8 = payment delay for eight months; 9 = payment delay for nine months and above.\n", 26 | "- BILL_AMT1-BILL_AMT6: Amount of bill statement (NT dollar). BILL_AMT1 = amount of bill statement in September, 2005; BILL_AMT2 = amount of bill statement in August, 2005; . . .; BILL_AMT6 = amount of bill statement in April, 2005.
\n", 27 | "- PAY_AMT1-PAY_AMT6: Amount of previous payment (NT dollar).\n", 28 | "- default payment next month: **positive class: default | negative class: pay**" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 19, 34 | "metadata": { 35 | "collapsed": true 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "import numpy as np\n", 40 | "import pandas as pd\n", 41 | "import os" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 20, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "data": { 51 | "text/html": [ 52 | "
\n", 53 | "\n", 66 | "\n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | "
LIMIT_BALSEXEDUCATIONMARRIAGEAGEPAY_1PAY_2PAY_3PAY_4PAY_5...BILL_AMT4BILL_AMT5BILL_AMT6PAY_AMT1PAY_AMT2PAY_AMT3PAY_AMT4PAY_AMT5PAY_AMT6default payment next month
ID
1200002212422-1-1-2...000068900001
212000022226-12000...3272345532610100010001000020001
3900002223400000...1433114948155491518150010001000100050000
4500002213700000...2831428959295472000201912001100106910000
55000012157-10-100...2094019146191312000366811000090006896790
\n", 240 | "

5 rows × 24 columns

\n", 241 | "
" 242 | ], 243 | "text/plain": [ 244 | " LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_1 PAY_2 PAY_3 PAY_4 \\\n", 245 | "ID \n", 246 | "1 20000 2 2 1 24 2 2 -1 -1 \n", 247 | "2 120000 2 2 2 26 -1 2 0 0 \n", 248 | "3 90000 2 2 2 34 0 0 0 0 \n", 249 | "4 50000 2 2 1 37 0 0 0 0 \n", 250 | "5 50000 1 2 1 57 -1 0 -1 0 \n", 251 | "\n", 252 | " PAY_5 ... BILL_AMT4 BILL_AMT5 BILL_AMT6 \\\n", 253 | "ID ... \n", 254 | "1 -2 ... 0 0 0 \n", 255 | "2 0 ... 3272 3455 3261 \n", 256 | "3 0 ... 14331 14948 15549 \n", 257 | "4 0 ... 28314 28959 29547 \n", 258 | "5 0 ... 20940 19146 19131 \n", 259 | "\n", 260 | " PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 \\\n", 261 | "ID \n", 262 | "1 0 689 0 0 0 0 \n", 263 | "2 0 1000 1000 1000 0 2000 \n", 264 | "3 1518 1500 1000 1000 1000 5000 \n", 265 | "4 2000 2019 1200 1100 1069 1000 \n", 266 | "5 2000 36681 10000 9000 689 679 \n", 267 | "\n", 268 | " default payment next month \n", 269 | "ID \n", 270 | "1 1 \n", 271 | "2 1 \n", 272 | "3 0 \n", 273 | "4 0 \n", 274 | "5 0 \n", 275 | "\n", 276 | "[5 rows x 24 columns]" 277 | ] 278 | }, 279 | "execution_count": 20, 280 | "metadata": {}, 281 | "output_type": "execute_result" 282 | } 283 | ], 284 | "source": [ 285 | "DATA_DIR = '../data'\n", 286 | "FILE_NAME = 'credit_card_default.csv'\n", 287 | "data_path = os.path.join(DATA_DIR, FILE_NAME)\n", 288 | "ccd = pd.read_csv(data_path, index_col=\"ID\")\n", 289 | "ccd.head()" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 21, 295 | "metadata": {}, 296 | "outputs": [ 297 | { 298 | "data": { 299 | "text/plain": [ 300 | "(30000, 24)" 301 | ] 302 | }, 303 | "execution_count": 21, 304 | "metadata": {}, 305 | "output_type": "execute_result" 306 | } 307 | ], 308 | "source": [ 309 | "ccd.shape" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 22, 315 | "metadata": { 316 | "collapsed": true 317 | }, 318 | "outputs": [], 319 | "source": [ 320 | "ccd.rename(columns=lambda x: x.lower(), inplace=True)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": {}, 326 | "source": [ 327 | "## Numerical features" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 23, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "bill_amt_features = ['bill_amt'+ str(i) for i in range(1,7)]\n", 337 | "pay_amt_features = ['pay_amt'+ str(i) for i in range(1,7)]\n", 338 | "numerical_features = ['limit_bal','age'] + bill_amt_features + pay_amt_features" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 24, 344 | "metadata": {}, 345 | "outputs": [ 346 | { 347 | "data": { 348 | "text/html": [ 349 | "
\n", 350 | "\n", 363 | "\n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | "
limit_balage
count30000.00000030000.000000
mean167484.32266735.485500
std129747.6615679.217904
min10000.00000021.000000
25%50000.00000028.000000
50%140000.00000034.000000
75%240000.00000041.000000
max1000000.00000079.000000
\n", 414 | "
" 415 | ], 416 | "text/plain": [ 417 | " limit_bal age\n", 418 | "count 30000.000000 30000.000000\n", 419 | "mean 167484.322667 35.485500\n", 420 | "std 129747.661567 9.217904\n", 421 | "min 10000.000000 21.000000\n", 422 | "25% 50000.000000 28.000000\n", 423 | "50% 140000.000000 34.000000\n", 424 | "75% 240000.000000 41.000000\n", 425 | "max 1000000.000000 79.000000" 426 | ] 427 | }, 428 | "execution_count": 24, 429 | "metadata": {}, 430 | "output_type": "execute_result" 431 | } 432 | ], 433 | "source": [ 434 | "ccd[['limit_bal','age']].describe()" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": 25, 440 | "metadata": {}, 441 | "outputs": [ 442 | { 443 | "data": { 444 | "text/html": [ 445 | "
\n", 446 | "\n", 459 | "\n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | "
bill_amt1bill_amt2bill_amt3bill_amt4bill_amt5bill_amt6
count30000.030000.030000.030000.030000.030000.0
mean51223.049179.047013.043263.040311.038872.0
std73636.071174.069349.064333.060797.059554.0
min-165580.0-69777.0-157264.0-170000.0-81334.0-339603.0
25%3559.02985.02666.02327.01763.01256.0
50%22382.021200.020088.019052.018104.017071.0
75%67091.064006.060165.054506.050190.049198.0
max964511.0983931.01664089.0891586.0927171.0961664.0
\n", 546 | "
" 547 | ], 548 | "text/plain": [ 549 | " bill_amt1 bill_amt2 bill_amt3 bill_amt4 bill_amt5 bill_amt6\n", 550 | "count 30000.0 30000.0 30000.0 30000.0 30000.0 30000.0\n", 551 | "mean 51223.0 49179.0 47013.0 43263.0 40311.0 38872.0\n", 552 | "std 73636.0 71174.0 69349.0 64333.0 60797.0 59554.0\n", 553 | "min -165580.0 -69777.0 -157264.0 -170000.0 -81334.0 -339603.0\n", 554 | "25% 3559.0 2985.0 2666.0 2327.0 1763.0 1256.0\n", 555 | "50% 22382.0 21200.0 20088.0 19052.0 18104.0 17071.0\n", 556 | "75% 67091.0 64006.0 60165.0 54506.0 50190.0 49198.0\n", 557 | "max 964511.0 983931.0 1664089.0 891586.0 927171.0 961664.0" 558 | ] 559 | }, 560 | "execution_count": 25, 561 | "metadata": {}, 562 | "output_type": "execute_result" 563 | } 564 | ], 565 | "source": [ 566 | "ccd[bill_amt_features].describe().round()" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": 26, 572 | "metadata": {}, 573 | "outputs": [ 574 | { 575 | "data": { 576 | "text/html": [ 577 | "
\n", 578 | "\n", 591 | "\n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | "
pay_amt1pay_amt2pay_amt3pay_amt4pay_amt5pay_amt6
count30000.030000.030000.030000.030000.030000.0
mean5664.05921.05226.04826.04799.05216.0
std16563.023041.017607.015666.015278.017777.0
min0.00.00.00.00.00.0
25%1000.0833.0390.0296.0252.0118.0
50%2100.02009.01800.01500.01500.01500.0
75%5006.05000.04505.04013.04032.04000.0
max873552.01684259.0896040.0621000.0426529.0528666.0
\n", 678 | "
" 679 | ], 680 | "text/plain": [ 681 | " pay_amt1 pay_amt2 pay_amt3 pay_amt4 pay_amt5 pay_amt6\n", 682 | "count 30000.0 30000.0 30000.0 30000.0 30000.0 30000.0\n", 683 | "mean 5664.0 5921.0 5226.0 4826.0 4799.0 5216.0\n", 684 | "std 16563.0 23041.0 17607.0 15666.0 15278.0 17777.0\n", 685 | "min 0.0 0.0 0.0 0.0 0.0 0.0\n", 686 | "25% 1000.0 833.0 390.0 296.0 252.0 118.0\n", 687 | "50% 2100.0 2009.0 1800.0 1500.0 1500.0 1500.0\n", 688 | "75% 5006.0 5000.0 4505.0 4013.0 4032.0 4000.0\n", 689 | "max 873552.0 1684259.0 896040.0 621000.0 426529.0 528666.0" 690 | ] 691 | }, 692 | "execution_count": 26, 693 | "metadata": {}, 694 | "output_type": "execute_result" 695 | } 696 | ], 697 | "source": [ 698 | "ccd[pay_amt_features].describe().round()" 699 | ] 700 | }, 701 | { 702 | "cell_type": "markdown", 703 | "metadata": {}, 704 | "source": [ 705 | "## Encoding categorical features" 706 | ] 707 | }, 708 | { 709 | "cell_type": "code", 710 | "execution_count": 27, 711 | "metadata": {}, 712 | "outputs": [ 713 | { 714 | "data": { 715 | "text/plain": [ 716 | "ID\n", 717 | "1 0\n", 718 | "2 0\n", 719 | "3 0\n", 720 | "4 0\n", 721 | "5 1\n", 722 | "6 1\n", 723 | "7 1\n", 724 | "8 0\n", 725 | "9 0\n", 726 | "10 1\n", 727 | "Name: male, dtype: int32" 728 | ] 729 | }, 730 | "execution_count": 27, 731 | "metadata": {}, 732 | "output_type": "execute_result" 733 | } 734 | ], 735 | "source": [ 736 | "ccd['male'] = (ccd['sex'] == 1).astype('int')\n", 737 | "ccd['male'].head(n=10)" 738 | ] 739 | }, 740 | { 741 | "cell_type": "code", 742 | "execution_count": 28, 743 | "metadata": {}, 744 | "outputs": [ 745 | { 746 | "data": { 747 | "text/plain": [ 748 | "0.39626666666666666" 749 | ] 750 | }, 751 | "execution_count": 28, 752 | "metadata": {}, 753 | "output_type": "execute_result" 754 | } 755 | ], 756 | "source": [ 757 | "ccd['male'].mean()" 758 | ] 759 | }, 760 | { 761 | "cell_type": "code", 762 | "execution_count": 29, 763 | "metadata": {}, 764 | "outputs": [ 765 | { 766 | "data": { 767 | "text/plain": [ 768 | "0 14\n", 769 | "1 10585\n", 770 | "2 14030\n", 771 | "3 4917\n", 772 | "4 123\n", 773 | "5 280\n", 774 | "6 51\n", 775 | "Name: education, dtype: int64" 776 | ] 777 | }, 778 | "execution_count": 29, 779 | "metadata": {}, 780 | "output_type": "execute_result" 781 | } 782 | ], 783 | "source": [ 784 | "ccd['education'].value_counts(sort=False)" 785 | ] 786 | }, 787 | { 788 | "cell_type": "code", 789 | "execution_count": 30, 790 | "metadata": { 791 | "collapsed": true 792 | }, 793 | "outputs": [], 794 | "source": [ 795 | "ccd['grad_school'] = (ccd['education'] == 1).astype('int')\n", 796 | "ccd['university'] = (ccd['education'] == 2).astype('int')\n", 797 | "ccd['high_school'] = (ccd['education'] == 3).astype('int')" 798 | ] 799 | }, 800 | { 801 | "cell_type": "code", 802 | "execution_count": 31, 803 | "metadata": {}, 804 | "outputs": [ 805 | { 806 | "data": { 807 | "text/plain": [ 808 | "ID\n", 809 | "48 5\n", 810 | "70 5\n", 811 | "359 4\n", 812 | "386 5\n", 813 | "449 4\n", 814 | "Name: education, dtype: int64" 815 | ] 816 | }, 817 | "execution_count": 31, 818 | "metadata": {}, 819 | "output_type": "execute_result" 820 | } 821 | ], 822 | "source": [ 823 | "ccd.loc[(ccd['grad_school']==0) & (ccd['university']==0) & (ccd['high_school']==0)]['education'].head()" 824 | ] 825 | }, 826 | { 827 | "cell_type": "markdown", 828 | "metadata": {}, 829 | "source": [ 830 | "## Low variance features" 831 | ] 832 | }, 833 | { 834 | "cell_type": "code", 835 | "execution_count": 32, 836 | "metadata": {}, 837 | "outputs": [ 838 | 
{ 839 | "data": { 840 | "text/plain": [ 841 | "1 13713\n", 842 | "2 15964\n", 843 | "3 323\n", 844 | "Name: marriage, dtype: int64" 845 | ] 846 | }, 847 | "execution_count": 32, 848 | "metadata": {}, 849 | "output_type": "execute_result" 850 | } 851 | ], 852 | "source": [ 853 | "ccd['marriage'].value_counts(sort=False)" 854 | ] 855 | }, 856 | { 857 | "cell_type": "code", 858 | "execution_count": 33, 859 | "metadata": { 860 | "collapsed": true 861 | }, 862 | "outputs": [], 863 | "source": [ 864 | "ccd['single'] = (ccd['marriage'] == 2).astype('int')\n", 865 | "ccd['marital_other'] = (ccd['marriage'] == 3).astype('int')" 866 | ] 867 | }, 868 | { 869 | "cell_type": "code", 870 | "execution_count": 34, 871 | "metadata": {}, 872 | "outputs": [ 873 | { 874 | "name": "stdout", 875 | "output_type": "stream", 876 | "text": [ 877 | "Proportion of singles: 0.5321333333333333\n", 878 | "Proportion of other marital status: 0.010766666666666667\n" 879 | ] 880 | } 881 | ], 882 | "source": [ 883 | "print(\"Proportion of singles: \", ccd['single'].mean())\n", 884 | "print(\"Proportion of other marital status: \", ccd['marital_other'].mean())" 885 | ] 886 | }, 887 | { 888 | "cell_type": "code", 889 | "execution_count": 35, 890 | "metadata": {}, 891 | "outputs": [ 892 | { 893 | "name": "stdout", 894 | "output_type": "stream", 895 | "text": [ 896 | "0.24816786226195736\n", 897 | "0.24897574808047968\n" 898 | ] 899 | } 900 | ], 901 | "source": [ 902 | "ccd['married'] = (ccd['marriage'] == 1).astype('int')\n", 903 | "print(ccd['married'].var())\n", 904 | "print(ccd['single'].var())" 905 | ] 906 | }, 907 | { 908 | "cell_type": "code", 909 | "execution_count": 36, 910 | "metadata": {}, 911 | "outputs": [ 912 | { 913 | "data": { 914 | "text/plain": [ 915 | "0.9892333333333333" 916 | ] 917 | }, 918 | "execution_count": 36, 919 | "metadata": {}, 920 | "output_type": "execute_result" 921 | } 922 | ], 923 | "source": [ 924 | "(ccd['married'] == (1 - ccd['single'])).mean()" 925 | ] 926 | }, 927 | { 928 | "cell_type": "markdown", 929 | "metadata": {}, 930 | "source": [ 931 | "## A brief introduction to Feature Engineering" 932 | ] 933 | }, 934 | { 935 | "cell_type": "code", 936 | "execution_count": 37, 937 | "metadata": {}, 938 | "outputs": [ 939 | { 940 | "data": { 941 | "text/plain": [ 942 | "-2 2759\n", 943 | "-1 5686\n", 944 | " 0 14737\n", 945 | " 1 3688\n", 946 | " 2 2667\n", 947 | " 3 322\n", 948 | " 4 76\n", 949 | " 5 26\n", 950 | " 6 11\n", 951 | " 7 9\n", 952 | " 8 19\n", 953 | "Name: pay_1, dtype: int64" 954 | ] 955 | }, 956 | "execution_count": 37, 957 | "metadata": {}, 958 | "output_type": "execute_result" 959 | } 960 | ], 961 | "source": [ 962 | "ccd['pay_1'].value_counts().sort_index()" 963 | ] 964 | }, 965 | { 966 | "cell_type": "code", 967 | "execution_count": 38, 968 | "metadata": {}, 969 | "outputs": [], 970 | "source": [ 971 | "# fixing the pay_i features\n", 972 | "pay_features= ['pay_' + str(i) for i in range(1,7)]\n", 973 | "for x in pay_features:\n", 974 | " ccd.loc[ccd[x] <= 0, x] = 0" 975 | ] 976 | }, 977 | { 978 | "cell_type": "code", 979 | "execution_count": 39, 980 | "metadata": { 981 | "collapsed": true 982 | }, 983 | "outputs": [], 984 | "source": [ 985 | "# producing delayed features\n", 986 | "delayed_features = ['delayed_' + str(i) for i in range(1,7)]\n", 987 | "for pay, delayed in zip(pay_features, delayed_features):\n", 988 | " ccd[delayed] = (ccd[pay] > 0).astype(int)" 989 | ] 990 | }, 991 | { 992 | "cell_type": "code", 993 | "execution_count": 44, 994 | "metadata": {}, 995 | 
"outputs": [ 996 | { 997 | "data": { 998 | "text/plain": [ 999 | "delayed_1 0.227267\n", 1000 | "delayed_2 0.147933\n", 1001 | "delayed_3 0.140433\n", 1002 | "delayed_4 0.117000\n", 1003 | "delayed_5 0.098933\n", 1004 | "delayed_6 0.102633\n", 1005 | "dtype: float64" 1006 | ] 1007 | }, 1008 | "execution_count": 44, 1009 | "metadata": {}, 1010 | "output_type": "execute_result" 1011 | } 1012 | ], 1013 | "source": [ 1014 | "ccd[delayed_features].mean()" 1015 | ] 1016 | }, 1017 | { 1018 | "cell_type": "code", 1019 | "execution_count": null, 1020 | "metadata": { 1021 | "collapsed": true 1022 | }, 1023 | "outputs": [], 1024 | "source": [ 1025 | "ccd['months_delayed'] = ccd[delayed_features].sum(axis=1)" 1026 | ] 1027 | }, 1028 | { 1029 | "cell_type": "markdown", 1030 | "metadata": {}, 1031 | "source": [ 1032 | "Done." 1033 | ] 1034 | }, 1035 | { 1036 | "cell_type": "code", 1037 | "execution_count": null, 1038 | "metadata": { 1039 | "collapsed": true 1040 | }, 1041 | "outputs": [], 1042 | "source": [] 1043 | } 1044 | ], 1045 | "metadata": { 1046 | "kernelspec": { 1047 | "display_name": "Python 3", 1048 | "language": "python", 1049 | "name": "python3" 1050 | }, 1051 | "language_info": { 1052 | "codemirror_mode": { 1053 | "name": "ipython", 1054 | "version": 3 1055 | }, 1056 | "file_extension": ".py", 1057 | "mimetype": "text/x-python", 1058 | "name": "python", 1059 | "nbconvert_exporter": "python", 1060 | "pygments_lexer": "ipython3", 1061 | "version": "3.6.1" 1062 | } 1063 | }, 1064 | "nbformat": 4, 1065 | "nbformat_minor": 2 1066 | } 1067 | -------------------------------------------------------------------------------- /Chapter05/.ipynb_checkpoints/ch5-predicting-credit-card-default-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Predicting Credit Card Default\n", 8 | "\n", 9 | "If you are using Windows, don't forget to add:\n", 10 | "\n", 11 | "C:\\Users\\\"user_name\"\\Anaconda3\\\"environment_name\"\\Library\\bin\\graphviz\\\n", 12 | "\n", 13 | "to the PATH environment variable" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import numpy as np\n", 23 | "import pandas as pd\n", 24 | "import matplotlib.pyplot as plt\n", 25 | "import seaborn as sns\n", 26 | "import os\n", 27 | "%matplotlib inline" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "### Back with the credit card default dataset" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# Loading the dataset\n", 44 | "DATA_DIR = '../data'\n", 45 | "FILE_NAME = 'credit_card_default.csv'\n", 46 | "data_path = os.path.join(DATA_DIR, FILE_NAME)\n", 47 | "ccd = pd.read_csv(data_path, index_col=\"ID\")\n", 48 | "ccd.rename(columns=lambda x: x.lower(), inplace=True)\n", 49 | "ccd.rename(columns={'default payment next month':'default'}, inplace=True)\n", 50 | "\n", 51 | "# getting the groups of features\n", 52 | "bill_amt_features = ['bill_amt'+ str(i) for i in range(1,7)]\n", 53 | "pay_amt_features = ['pay_amt'+ str(i) for i in range(1,7)]\n", 54 | "numerical_features = ['limit_bal','age'] + bill_amt_features + pay_amt_features\n", 55 | "\n", 56 | "# Creating creating binary features\n", 57 | "ccd['male'] = (ccd['sex'] == 1).astype('int')\n", 58 | "ccd['grad_school'] = 
(ccd['education'] == 1).astype('int')\n", 59 | "ccd['university'] = (ccd['education'] == 2).astype('int')\n", 60 | "#ccd['high_school'] = (ccd['education'] == 3).astype('int')\n", 61 | "ccd['married'] = (ccd['marriage'] == 1).astype('int')\n", 62 | "\n", 63 | "# simplifying pay features \n", 64 | "pay_features= ['pay_' + str(i) for i in range(1,7)]\n", 65 | "for x in pay_features:\n", 66 | " ccd.loc[ccd[x] <= 0, x] = 0\n", 67 | "\n", 68 | "# simplifying delayed features\n", 69 | "delayed_features = ['delayed_' + str(i) for i in range(1,7)]\n", 70 | "for pay, delayed in zip(pay_features, delayed_features):\n", 71 | " ccd[delayed] = (ccd[pay] > 0).astype(int)\n", 72 | " \n", 73 | "# creating a new feature: months delayed\n", 74 | "ccd['months_delayed'] = ccd[delayed_features].sum(axis=1)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "## Splitting the dataset" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "numerical_features = numerical_features + ['months_delayed']\n", 91 | "binary_features = ['male','married','grad_school','university']\n", 92 | "X = ccd[numerical_features + binary_features]\n", 93 | "y = ccd['default'].astype(int)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "from sklearn.model_selection import train_test_split\n", 103 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=5/30, random_state=101)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "scrolled": true 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "# 1. Import the class you will use\n", 115 | "from sklearn.preprocessing import StandardScaler\n", 116 | "# 2. Create an instance of the class\n", 117 | "scaler = StandardScaler()\n", 118 | "# 3. Use the fit method of the instance\n", 119 | "scaler.fit(X_train[numerical_features])\n", 120 | "# 4. 
Use the transform method to perform the transformation\n", 121 | "X_train.loc[:, numerical_features] = scaler.transform(X_train[numerical_features])" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "## Logistic Regression" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "### A simple Logistic Regression model" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "from sklearn.linear_model import LogisticRegression\n", 145 | "simple_log_reg = LogisticRegression(C=1e6)\n", 146 | "simple_log_reg.fit(X_train['months_delayed'].values.reshape(-1, 1), y_train)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "print(\"W0: {}, W1: {}\".format(simple_log_reg.intercept_[0], simple_log_reg.coef_[0][0]))" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "def get_probs(months_delayed):\n", 165 | " m = scaler.mean_[-1]\n", 166 | " std = scaler.var_[-1]**.5\n", 167 | " x = (months_delayed - m)/std\n", 168 | " prob_default = 1/(1+np.exp(-simple_log_reg.intercept_[0] + -simple_log_reg.coef_[0][0]*x))\n", 169 | " return prob_default" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "months = np.arange(13)\n", 179 | "pred_probs = get_probs(months)\n", 180 | "pd.DataFrame({'months': months, 'pred_probs':pred_probs})" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "fig, ax = plt.subplots()\n", 190 | "ax.plot(months, pred_probs)\n", 191 | "ax.set_xlabel('Months delayed')\n", 192 | "ax.set_ylabel('Probability of default')\n", 193 | "ax.grid()" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "### A complete Logistic Regression model" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "log_reg = LogisticRegression(C=1e6)\n", 210 | "log_reg.fit(X_train, y_train)" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "prob_log_reg = log_reg.predict_proba(X_train)\n", 220 | "prob_log_reg[:10]" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "y_pred_log_reg = log_reg.predict(X_train)\n", 230 | "y_pred_log_reg[:10]" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "np.all(y_pred_log_reg == (prob_log_reg[:,1] > 0.5))" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "pd.Series(data=log_reg.coef_[0], index=X_train.columns).sort_values(ascending=False).round(2)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "from sklearn.metrics import accuracy_score\n", 258 | "accuracy_log_reg 
= accuracy_score(y_true=y_train, y_pred=y_pred_log_reg)\n", 259 | "accuracy_log_reg" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "## Classification Trees" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "from sklearn.tree import DecisionTreeClassifier\n", 276 | "class_tree = DecisionTreeClassifier(max_depth=3)\n", 277 | "class_tree.fit(X_train, y_train)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "from sklearn.externals.six import StringIO \n", 287 | "from sklearn.tree import export_graphviz\n", 288 | "from IPython.display import Image \n", 289 | "import pydotplus" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "dot_data = StringIO()\n", 299 | "export_graphviz(decision_tree=class_tree,\n", 300 | " out_file=dot_data,\n", 301 | " filled=True,\n", 302 | " rounded=True,\n", 303 | " feature_names = X_train.columns,\n", 304 | " class_names = ['pay','default'],\n", 305 | " special_characters=True)\n", 306 | "graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) \n", 307 | "Image(graph.create_png())" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": {}, 314 | "outputs": [], 315 | "source": [ 316 | "dot_data = StringIO()\n", 317 | "export_graphviz(decision_tree=class_tree,\n", 318 | " out_file=dot_data,\n", 319 | " filled=True,\n", 320 | " rounded=True,\n", 321 | " proportion=True,\n", 322 | " feature_names = X_train.columns,\n", 323 | " class_names = ['pay','default'],\n", 324 | " special_characters=True)\n", 325 | "graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) \n", 326 | "Image(graph.create_png())" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "### How trees work" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "from sklearn.datasets import make_blobs" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "A, b = make_blobs(n_samples=200, n_features=2, cluster_std=0.6,\n", 352 | " centers=[[-0.5,-1],[0.5,0.5]], shuffle=False, random_state=42)\n", 353 | "plt.scatter(A[:, 0], A[:, 1], c=b)\n", 354 | "plt.xlabel('X1', size=15)\n", 355 | "plt.ylabel('X2', size=15);" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": {}, 362 | "outputs": [], 363 | "source": [ 364 | "plt.scatter(A[:, 0], A[:, 1], c=b)\n", 365 | "plt.axhline(-0.6, c='red')\n", 366 | "plt.xlabel('X1', size=15)\n", 367 | "plt.ylabel('X2', size=15);" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "plt.scatter(A[:, 0], A[:, 1], c=b)\n", 377 | "plt.axhline(-0.6, c='red')\n", 378 | "plt.axvline(x=-0.1, ymin=0.34, c='red')\n", 379 | "plt.xlabel('X1', size=15)\n", 380 | "plt.ylabel('X2', size=15);" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": null, 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "plt.scatter(A[:, 0], A[:, 1], c=b)\n", 390 | "plt.axhline(-0.6, 
c='red')\n", 391 | "plt.axvline(x=-0.1, ymin=0.34, c='red')\n", 392 | "plt.axvline(x=0.7, ymax=0.34, c='red')\n", 393 | "plt.xlabel('X1', size=15)\n", 394 | "plt.ylabel('X2', size=15);" 395 | ] 396 | }, 397 | { 398 | "cell_type": "markdown", 399 | "metadata": {}, 400 | "source": [ 401 | "### Training a larger classification tree" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [ 410 | "class_tree = DecisionTreeClassifier(max_depth=6, min_samples_split=50)\n", 411 | "class_tree.fit(X_train, y_train)\n", 412 | "y_pred_class_tree = class_tree.predict(X_train)" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "accuracy_class_tree = accuracy_score(y_true=y_train, y_pred=y_pred_class_tree)\n", 422 | "accuracy_class_tree" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": [ 431 | "pd.Series(data=class_tree.feature_importances_, index=X_train.columns).sort_values(ascending=False).round(3)" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [ 440 | "pd.Series(data=class_tree.feature_importances_, index=X_train.columns).sort_values(ascending=False).plot(kind='bar');" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": {}, 446 | "source": [ 447 | "## Random Forests" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "from sklearn.ensemble import RandomForestClassifier\n", 457 | "rf = RandomForestClassifier(n_estimators=99,\n", 458 | " max_features=6,\n", 459 | " max_depth=6,\n", 460 | " min_samples_split=100,\n", 461 | " random_state=85)\n", 462 | "rf.fit(X_train, y_train)\n", 463 | "y_pred_rf = rf.predict(X_train)" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": {}, 470 | "outputs": [], 471 | "source": [ 472 | "accuracy_rf = accuracy_score(y_true=y_train, y_pred=y_pred_rf)\n", 473 | "accuracy_rf" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": null, 479 | "metadata": {}, 480 | "outputs": [], 481 | "source": [ 482 | "pd.Series(data=rf.feature_importances_, index=X_train.columns).sort_values(ascending=False).round(3)" 483 | ] 484 | }, 485 | { 486 | "cell_type": "markdown", 487 | "metadata": {}, 488 | "source": [ 489 | "## Training vs Testing Error" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "metadata": {}, 496 | "outputs": [], 497 | "source": [ 498 | "y_pred_null = np.zeros_like(y_test)\n", 499 | "accuracy_score(y_true=y_test, y_pred=y_pred_null)" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "## Remember to also standarize the numerical features in the testing set\n", 509 | "X_test.loc[:, numerical_features] = scaler.transform(X_test[numerical_features])" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": null, 515 | "metadata": {}, 516 | "outputs": [], 517 | "source": [ 518 | "## Calculating accuracy\n", 519 | "accuracies = pd.DataFrame(columns=['train', 'test'], index=['LogisticReg','ClassTree','RF'])\n", 520 | "model_dict = {'LogisticReg': log_reg, 
'ClassTree': class_tree, 'RF': rf}\n", 521 | "for name, model in model_dict.items():\n", 522 | " accuracies.loc[name, 'train'] = accuracy_score(y_true=y_train, y_pred=model.predict(X_train))\n", 523 | " accuracies.loc[name, 'test'] = accuracy_score(y_true=y_test, y_pred=model.predict(X_test))\n", 524 | "\n", 525 | "accuracies" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": null, 531 | "metadata": {}, 532 | "outputs": [], 533 | "source": [ 534 | "fig, ax = plt.subplots()\n", 535 | "accuracies.sort_values(by='test', ascending=False).plot(kind='barh', ax=ax, zorder=3)\n", 536 | "ax.grid(zorder=0)" 537 | ] 538 | }, 539 | { 540 | "cell_type": "markdown", 541 | "metadata": {}, 542 | "source": [ 543 | "## Multiclass classification" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": null, 549 | "metadata": {}, 550 | "outputs": [], 551 | "source": [ 552 | "# Loading the iris dataset\n", 553 | "from sklearn.datasets import load_iris\n", 554 | "iris = load_iris()\n", 555 | "# Training the logistic regression model\n", 556 | "iris_log_reg = LogisticRegression(C=1e5)\n", 557 | "iris_log_reg.fit(iris.data, iris.target)\n", 558 | "iris_probs = iris_log_reg.predict_proba(iris.data)\n", 559 | "iris_pred = iris_log_reg.predict(iris.data)" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": null, 565 | "metadata": {}, 566 | "outputs": [], 567 | "source": [ 568 | "iris_pred_df = pd.DataFrame(iris_probs, columns=iris.target_names).round(4)\n", 569 | "iris_pred_df['predicted_class'] = iris.target_names[iris_pred]\n", 570 | "iris_pred_df.sample(12)" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": null, 576 | "metadata": {}, 577 | "outputs": [], 578 | "source": [] 579 | } 580 | ], 581 | "metadata": { 582 | "kernelspec": { 583 | "display_name": "Python 3", 584 | "language": "python", 585 | "name": "python3" 586 | }, 587 | "language_info": { 588 | "codemirror_mode": { 589 | "name": "ipython", 590 | "version": 3 591 | }, 592 | "file_extension": ".py", 593 | "mimetype": "text/x-python", 594 | "name": "python", 595 | "nbconvert_exporter": "python", 596 | "pygments_lexer": "ipython3", 597 | "version": "3.6.10" 598 | } 599 | }, 600 | "nbformat": 4, 601 | "nbformat_minor": 2 602 | } 603 | -------------------------------------------------------------------------------- /Chapter05/ch5-predicting-credit-card-default.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Predicting Credit Card Default\n", 8 | "\n", 9 | "If you are using Windows, don't forget to add:\n", 10 | "\n", 11 | "C:\\Users\\\"user_name\"\\Anaconda3\\\"environment_name\"\\Library\\bin\\graphviz\\\n", 12 | "\n", 13 | "to the PATH environment variable" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import numpy as np\n", 23 | "import pandas as pd\n", 24 | "import matplotlib.pyplot as plt\n", 25 | "import seaborn as sns\n", 26 | "import os\n", 27 | "%matplotlib inline" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "### Back with the credit card default dataset" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# Loading the dataset\n", 44 | "DATA_DIR = '../data'\n", 45 | "FILE_NAME = 
'credit_card_default.csv'\n", 46 | "data_path = os.path.join(DATA_DIR, FILE_NAME)\n", 47 | "ccd = pd.read_csv(data_path, index_col=\"ID\")\n", 48 | "ccd.rename(columns=lambda x: x.lower(), inplace=True)\n", 49 | "ccd.rename(columns={'default payment next month':'default'}, inplace=True)\n", 50 | "\n", 51 | "# getting the groups of features\n", 52 | "bill_amt_features = ['bill_amt'+ str(i) for i in range(1,7)]\n", 53 | "pay_amt_features = ['pay_amt'+ str(i) for i in range(1,7)]\n", 54 | "numerical_features = ['limit_bal','age'] + bill_amt_features + pay_amt_features\n", 55 | "\n", 56 | "# Creating creating binary features\n", 57 | "ccd['male'] = (ccd['sex'] == 1).astype('int')\n", 58 | "ccd['grad_school'] = (ccd['education'] == 1).astype('int')\n", 59 | "ccd['university'] = (ccd['education'] == 2).astype('int')\n", 60 | "#ccd['high_school'] = (ccd['education'] == 3).astype('int')\n", 61 | "ccd['married'] = (ccd['marriage'] == 1).astype('int')\n", 62 | "\n", 63 | "# simplifying pay features \n", 64 | "pay_features= ['pay_' + str(i) for i in range(1,7)]\n", 65 | "for x in pay_features:\n", 66 | " ccd.loc[ccd[x] <= 0, x] = 0\n", 67 | "\n", 68 | "# simplifying delayed features\n", 69 | "delayed_features = ['delayed_' + str(i) for i in range(1,7)]\n", 70 | "for pay, delayed in zip(pay_features, delayed_features):\n", 71 | " ccd[delayed] = (ccd[pay] > 0).astype(int)\n", 72 | " \n", 73 | "# creating a new feature: months delayed\n", 74 | "ccd['months_delayed'] = ccd[delayed_features].sum(axis=1)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "## Splitting the dataset" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "numerical_features = numerical_features + ['months_delayed']\n", 91 | "binary_features = ['male','married','grad_school','university']\n", 92 | "X = ccd[numerical_features + binary_features]\n", 93 | "y = ccd['default'].astype(int)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "from sklearn.model_selection import train_test_split\n", 103 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=5/30, random_state=101)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "scrolled": true 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "# 1. Import the class you will use\n", 115 | "from sklearn.preprocessing import StandardScaler\n", 116 | "# 2. Create an instance of the class\n", 117 | "scaler = StandardScaler()\n", 118 | "# 3. Use the fit method of the instance\n", 119 | "scaler.fit(X_train[numerical_features])\n", 120 | "# 4. 
Use the transform method to perform the transformation\n", 121 | "X_train.loc[:, numerical_features] = scaler.transform(X_train[numerical_features])" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "## Logistic Regression" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "### A simple Logistic Regression model" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "from sklearn.linear_model import LogisticRegression\n", 145 | "simple_log_reg = LogisticRegression(C=1e6)\n", 146 | "simple_log_reg.fit(X_train['months_delayed'].values.reshape(-1, 1), y_train)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "print(\"W0: {}, W1: {}\".format(simple_log_reg.intercept_[0], simple_log_reg.coef_[0][0]))" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "def get_probs(months_delayed):\n", 165 | " m = scaler.mean_[-1]\n", 166 | " std = scaler.var_[-1]**.5\n", 167 | " x = (months_delayed - m)/std\n", 168 | " prob_default = 1/(1+np.exp(-simple_log_reg.intercept_[0] + -simple_log_reg.coef_[0][0]*x))\n", 169 | " return prob_default" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "months = np.arange(13)\n", 179 | "pred_probs = get_probs(months)\n", 180 | "pd.DataFrame({'months': months, 'pred_probs':pred_probs})" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "fig, ax = plt.subplots()\n", 190 | "ax.plot(months, pred_probs)\n", 191 | "ax.set_xlabel('Months delayed')\n", 192 | "ax.set_ylabel('Probability of default')\n", 193 | "ax.grid()" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "### A complete Logistic Regression model" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "log_reg = LogisticRegression(C=1e6)\n", 210 | "log_reg.fit(X_train, y_train)" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "prob_log_reg = log_reg.predict_proba(X_train)\n", 220 | "prob_log_reg[:10]" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "y_pred_log_reg = log_reg.predict(X_train)\n", 230 | "y_pred_log_reg[:10]" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "np.all(y_pred_log_reg == (prob_log_reg[:,1] > 0.5))" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "pd.Series(data=log_reg.coef_[0], index=X_train.columns).sort_values(ascending=False).round(2)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "from sklearn.metrics import accuracy_score\n", 258 | "accuracy_log_reg 
= accuracy_score(y_true=y_train, y_pred=y_pred_log_reg)\n", 259 | "accuracy_log_reg" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "## Classification Trees" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "from sklearn.tree import DecisionTreeClassifier\n", 276 | "class_tree = DecisionTreeClassifier(max_depth=3)\n", 277 | "class_tree.fit(X_train, y_train)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "from sklearn.externals.six import StringIO \n", 287 | "from sklearn.tree import export_graphviz\n", 288 | "from IPython.display import Image \n", 289 | "import pydotplus" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "dot_data = StringIO()\n", 299 | "export_graphviz(decision_tree=class_tree,\n", 300 | " out_file=dot_data,\n", 301 | " filled=True,\n", 302 | " rounded=True,\n", 303 | " feature_names = X_train.columns,\n", 304 | " class_names = ['pay','default'],\n", 305 | " special_characters=True)\n", 306 | "graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) \n", 307 | "Image(graph.create_png())" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": {}, 314 | "outputs": [], 315 | "source": [ 316 | "dot_data = StringIO()\n", 317 | "export_graphviz(decision_tree=class_tree,\n", 318 | " out_file=dot_data,\n", 319 | " filled=True,\n", 320 | " rounded=True,\n", 321 | " proportion=True,\n", 322 | " feature_names = X_train.columns,\n", 323 | " class_names = ['pay','default'],\n", 324 | " special_characters=True)\n", 325 | "graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) \n", 326 | "Image(graph.create_png())" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "### How trees work" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "from sklearn.datasets import make_blobs" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "A, b = make_blobs(n_samples=200, n_features=2, cluster_std=0.6,\n", 352 | " centers=[[-0.5,-1],[0.5,0.5]], shuffle=False, random_state=42)\n", 353 | "plt.scatter(A[:, 0], A[:, 1], c=b)\n", 354 | "plt.xlabel('X1', size=15)\n", 355 | "plt.ylabel('X2', size=15);" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": {}, 362 | "outputs": [], 363 | "source": [ 364 | "plt.scatter(A[:, 0], A[:, 1], c=b)\n", 365 | "plt.axhline(-0.6, c='red')\n", 366 | "plt.xlabel('X1', size=15)\n", 367 | "plt.ylabel('X2', size=15);" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "plt.scatter(A[:, 0], A[:, 1], c=b)\n", 377 | "plt.axhline(-0.6, c='red')\n", 378 | "plt.axvline(x=-0.1, ymin=0.34, c='red')\n", 379 | "plt.xlabel('X1', size=15)\n", 380 | "plt.ylabel('X2', size=15);" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": null, 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "plt.scatter(A[:, 0], A[:, 1], c=b)\n", 390 | "plt.axhline(-0.6, 
c='red')\n", 391 | "plt.axvline(x=-0.1, ymin=0.34, c='red')\n", 392 | "plt.axvline(x=0.7, ymax=0.34, c='red')\n", 393 | "plt.xlabel('X1', size=15)\n", 394 | "plt.ylabel('X2', size=15);" 395 | ] 396 | }, 397 | { 398 | "cell_type": "markdown", 399 | "metadata": {}, 400 | "source": [ 401 | "### Training a larger classification tree" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [ 410 | "class_tree = DecisionTreeClassifier(max_depth=6, min_samples_split=50)\n", 411 | "class_tree.fit(X_train, y_train)\n", 412 | "y_pred_class_tree = class_tree.predict(X_train)" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "accuracy_class_tree = accuracy_score(y_true=y_train, y_pred=y_pred_class_tree)\n", 422 | "accuracy_class_tree" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": [ 431 | "pd.Series(data=class_tree.feature_importances_, index=X_train.columns).sort_values(ascending=False).round(3)" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [ 440 | "pd.Series(data=class_tree.feature_importances_, index=X_train.columns).sort_values(ascending=False).plot(kind='bar');" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": {}, 446 | "source": [ 447 | "## Random Forests" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "from sklearn.ensemble import RandomForestClassifier\n", 457 | "rf = RandomForestClassifier(n_estimators=99,\n", 458 | " max_features=6,\n", 459 | " max_depth=6,\n", 460 | " min_samples_split=100,\n", 461 | " random_state=85)\n", 462 | "rf.fit(X_train, y_train)\n", 463 | "y_pred_rf = rf.predict(X_train)" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": {}, 470 | "outputs": [], 471 | "source": [ 472 | "accuracy_rf = accuracy_score(y_true=y_train, y_pred=y_pred_rf)\n", 473 | "accuracy_rf" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": null, 479 | "metadata": {}, 480 | "outputs": [], 481 | "source": [ 482 | "pd.Series(data=rf.feature_importances_, index=X_train.columns).sort_values(ascending=False).round(3)" 483 | ] 484 | }, 485 | { 486 | "cell_type": "markdown", 487 | "metadata": {}, 488 | "source": [ 489 | "## Training vs Testing Error" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "metadata": {}, 496 | "outputs": [], 497 | "source": [ 498 | "y_pred_null = np.zeros_like(y_test)\n", 499 | "accuracy_score(y_true=y_test, y_pred=y_pred_null)" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "## Remember to also standarize the numerical features in the testing set\n", 509 | "X_test.loc[:, numerical_features] = scaler.transform(X_test[numerical_features])" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": null, 515 | "metadata": {}, 516 | "outputs": [], 517 | "source": [ 518 | "## Calculating accuracy\n", 519 | "accuracies = pd.DataFrame(columns=['train', 'test'], index=['LogisticReg','ClassTree','RF'])\n", 520 | "model_dict = {'LogisticReg': log_reg, 
'ClassTree': class_tree, 'RF': rf}\n", 521 | "for name, model in model_dict.items():\n", 522 | " accuracies.loc[name, 'train'] = accuracy_score(y_true=y_train, y_pred=model.predict(X_train))\n", 523 | " accuracies.loc[name, 'test'] = accuracy_score(y_true=y_test, y_pred=model.predict(X_test))\n", 524 | "\n", 525 | "accuracies" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": null, 531 | "metadata": {}, 532 | "outputs": [], 533 | "source": [ 534 | "fig, ax = plt.subplots()\n", 535 | "accuracies.sort_values(by='test', ascending=False).plot(kind='barh', ax=ax, zorder=3)\n", 536 | "ax.grid(zorder=0)" 537 | ] 538 | }, 539 | { 540 | "cell_type": "markdown", 541 | "metadata": {}, 542 | "source": [ 543 | "## Multiclass classification" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": null, 549 | "metadata": {}, 550 | "outputs": [], 551 | "source": [ 552 | "# Loading the iris dataset\n", 553 | "from sklearn.datasets import load_iris\n", 554 | "iris = load_iris()\n", 555 | "# Training the logistic regression model\n", 556 | "iris_log_reg = LogisticRegression(C=1e5)\n", 557 | "iris_log_reg.fit(iris.data, iris.target)\n", 558 | "iris_probs = iris_log_reg.predict_proba(iris.data)\n", 559 | "iris_pred = iris_log_reg.predict(iris.data)" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": null, 565 | "metadata": {}, 566 | "outputs": [], 567 | "source": [ 568 | "iris_pred_df = pd.DataFrame(iris_probs, columns=iris.target_names).round(4)\n", 569 | "iris_pred_df['predicted_class'] = iris.target_names[iris_pred]\n", 570 | "iris_pred_df.sample(12)" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": null, 576 | "metadata": {}, 577 | "outputs": [], 578 | "source": [] 579 | } 580 | ], 581 | "metadata": { 582 | "kernelspec": { 583 | "display_name": "Python 3", 584 | "language": "python", 585 | "name": "python3" 586 | }, 587 | "language_info": { 588 | "codemirror_mode": { 589 | "name": "ipython", 590 | "version": 3 591 | }, 592 | "file_extension": ".py", 593 | "mimetype": "text/x-python", 594 | "name": "python", 595 | "nbconvert_exporter": "python", 596 | "pygments_lexer": "ipython3", 597 | "version": "3.6.10" 598 | } 599 | }, 600 | "nbformat": 4, 601 | "nbformat_minor": 2 602 | } 603 | -------------------------------------------------------------------------------- /Chapter06/.ipynb_checkpoints/ch6-2-classification-with-neural-networks-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Predicting Credit Card Default with Neural Networks" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "import matplotlib.pyplot as plt\n", 19 | "import seaborn as sns\n", 20 | "import os\n", 21 | "%matplotlib inline" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "### Back with the credit card default dataset" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# Loading the dataset\n", 38 | "DATA_DIR = '../data'\n", 39 | "FILE_NAME = 'credit_card_default.csv'\n", 40 | "data_path = os.path.join(DATA_DIR, FILE_NAME)\n", 41 | "ccd = pd.read_csv(data_path, index_col=\"ID\")\n", 42 | "ccd.rename(columns=lambda x: 
x.lower(), inplace=True)\n", 43 | "ccd.rename(columns={'default payment next month':'default'}, inplace=True)\n", 44 | "\n", 45 | "# getting the groups of features\n", 46 | "bill_amt_features = ['bill_amt'+ str(i) for i in range(1,7)]\n", 47 | "pay_amt_features = ['pay_amt'+ str(i) for i in range(1,7)]\n", 48 | "numerical_features = ['limit_bal','age'] + bill_amt_features + pay_amt_features\n", 49 | "\n", 50 | "# Creating binary features\n", 51 | "ccd['male'] = (ccd['sex'] == 1).astype('int')\n", 52 | "ccd['grad_school'] = (ccd['education'] == 1).astype('int')\n", 53 | "ccd['university'] = (ccd['education'] == 2).astype('int')\n", 54 | "#ccd['high_school'] = (ccd['education'] == 3).astype('int')\n", 55 | "ccd['married'] = (ccd['marriage'] == 1).astype('int')\n", 56 | "\n", 57 | "# simplifying pay features \n", 58 | "pay_features= ['pay_' + str(i) for i in range(1,7)]\n", 59 | "for x in pay_features:\n", 60 | " ccd.loc[ccd[x] <= 0, x] = 0\n", 61 | "\n", 62 | "# simplifying delayed features\n", 63 | "delayed_features = ['delayed_' + str(i) for i in range(1,7)]\n", 64 | "for pay, delayed in zip(pay_features, delayed_features):\n", 65 | " ccd[delayed] = (ccd[pay] > 0).astype(int)\n", 66 | " \n", 67 | "# creating a new feature: months delayed\n", 68 | "ccd['months_delayed'] = ccd[delayed_features].sum(axis=1)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "## Split and standardize the dataset" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 3, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "name": "stderr", 85 | "output_type": "stream", 86 | "text": [ 87 | "C:\\Anaconda\\envs\\ho-pawp\\lib\\site-packages\\pandas\\core\\indexing.py:966: SettingWithCopyWarning: \n", 88 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 89 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 90 | "\n", 91 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 92 | " self.obj[item] = s\n", 93 | "C:\\Anaconda\\envs\\ho-pawp\\lib\\site-packages\\pandas\\core\\indexing.py:966: SettingWithCopyWarning: \n", 94 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 95 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 96 | "\n", 97 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 98 | " self.obj[item] = s\n" 99 | ] 100 | } 101 | ], 102 | "source": [ 103 | "numerical_features = numerical_features + ['months_delayed']\n", 104 | "binary_features = ['male','married','grad_school','university']\n", 105 | "X = ccd[numerical_features + binary_features]\n", 106 | "y = ccd['default'].astype(int)\n", 107 | "\n", 108 | "## Split\n", 109 | "from sklearn.model_selection import train_test_split\n", 110 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=5/30, random_state=101)\n", 111 | "\n", 112 | "## Standardize\n", 113 | "from sklearn.preprocessing import StandardScaler\n", 114 | "scaler = StandardScaler()\n", 115 | "scaler.fit(X_train[numerical_features])\n", 116 | "X_train.loc[:, numerical_features] = scaler.transform(X_train[numerical_features])\n", 117 | "# Also standardize the testing set\n", 118 | "X_test.loc[:, numerical_features] = scaler.transform(X_test[numerical_features])" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 |
"### Building the neural network for classification" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 4, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "name": "stderr", 135 | "output_type": "stream", 136 | "text": [ 137 | "Using TensorFlow backend.\n" 138 | ] 139 | } 140 | ], 141 | "source": [ 142 | "from keras.models import Sequential\n", 143 | "nn_classifier = Sequential()" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 5, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "from keras.layers import Dense\n", 153 | "n_input = X_train.shape[1]\n", 154 | "n_units_hidden = 64\n", 155 | "nn_classifier.add(Dense(units=n_units_hidden, activation='relu', input_shape=(n_input,)))" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 6, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "# add 2nd hidden layer\n", 165 | "nn_classifier.add(Dense(units=n_units_hidden, activation='relu'))\n", 166 | "# add 3rd hidden layer\n", 167 | "nn_classifier.add(Dense(units=n_units_hidden, activation='relu'))\n", 168 | "# add 4th hidden layer\n", 169 | "nn_classifier.add(Dense(units=n_units_hidden, activation='relu'))\n", 170 | "# add 5th hidden layer\n", 171 | "nn_classifier.add(Dense(units=n_units_hidden, activation='relu'))" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 7, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "# output layer\n", 181 | "nn_classifier.add(Dense(1, activation='sigmoid'))" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "### Training the network" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 8, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "## compiling step\n", 198 | "nn_classifier.compile(loss='binary_crossentropy', optimizer='adam')" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 9, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "Model: \"sequential_1\"\n", 211 | "_________________________________________________________________\n", 212 | "Layer (type) Output Shape Param # \n", 213 | "=================================================================\n", 214 | "dense_1 (Dense) (None, 64) 1280 \n", 215 | "_________________________________________________________________\n", 216 | "dense_2 (Dense) (None, 64) 4160 \n", 217 | "_________________________________________________________________\n", 218 | "dense_3 (Dense) (None, 64) 4160 \n", 219 | "_________________________________________________________________\n", 220 | "dense_4 (Dense) (None, 64) 4160 \n", 221 | "_________________________________________________________________\n", 222 | "dense_5 (Dense) (None, 64) 4160 \n", 223 | "_________________________________________________________________\n", 224 | "dense_6 (Dense) (None, 1) 65 \n", 225 | "=================================================================\n", 226 | "Total params: 17,985\n", 227 | "Trainable params: 17,985\n", 228 | "Non-trainable params: 0\n", 229 | "_________________________________________________________________\n" 230 | ] 231 | } 232 | ], 233 | "source": [ 234 | "nn_classifier.summary()" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 10, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 |
"nn_classifier.save_weights('class_initial_w.h5')" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 11, 249 | "metadata": {}, 250 | "outputs": [ 251 | { 252 | "name": "stdout", 253 | "output_type": "stream", 254 | "text": [ 255 | "Epoch 1/150\n", 256 | "25000/25000 [==============================] - 1s 30us/step - loss: 0.4690\n", 257 | "Epoch 2/150\n", 258 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4481\n", 259 | "Epoch 3/150\n", 260 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4446\n", 261 | "Epoch 4/150\n", 262 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4428\n", 263 | "Epoch 5/150\n", 264 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4405\n", 265 | "Epoch 6/150\n", 266 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4399\n", 267 | "Epoch 7/150\n", 268 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4388\n", 269 | "Epoch 8/150\n", 270 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4379\n", 271 | "Epoch 9/150\n", 272 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4365\n", 273 | "Epoch 10/150\n", 274 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4353\n", 275 | "Epoch 11/150\n", 276 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4348\n", 277 | "Epoch 12/150\n", 278 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4334\n", 279 | "Epoch 13/150\n", 280 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4325\n", 281 | "Epoch 14/150\n", 282 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4303\n", 283 | "Epoch 15/150\n", 284 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4289\n", 285 | "Epoch 16/150\n", 286 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4282\n", 287 | "Epoch 17/150\n", 288 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4262\n", 289 | "Epoch 18/150\n", 290 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4243\n", 291 | "Epoch 19/150\n", 292 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4240\n", 293 | "Epoch 20/150\n", 294 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4217\n", 295 | "Epoch 21/150\n", 296 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4203\n", 297 | "Epoch 22/150\n", 298 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4180\n", 299 | "Epoch 23/150\n", 300 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4166\n", 301 | "Epoch 24/150\n", 302 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4145\n", 303 | "Epoch 25/150\n", 304 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4128\n", 305 | "Epoch 26/150\n", 306 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4113\n", 307 | "Epoch 27/150\n", 308 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4088\n", 309 | "Epoch 28/150\n", 310 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4065\n", 311 | "Epoch 29/150\n", 312 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4038\n", 313 | "Epoch 30/150\n", 314 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4034\n", 315 | 
"Epoch 31/150\n", 316 | "25000/25000 [==============================] - ETA: 0s - loss: 0.397 - 1s 22us/step - loss: 0.3986\n", 317 | "Epoch 32/150\n", 318 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3986\n", 319 | "Epoch 33/150\n", 320 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3968\n", 321 | "Epoch 34/150\n", 322 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3920\n", 323 | "Epoch 35/150\n", 324 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3888\n", 325 | "Epoch 36/150\n", 326 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3870\n", 327 | "Epoch 37/150\n", 328 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3869\n", 329 | "Epoch 38/150\n", 330 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3828\n", 331 | "Epoch 39/150\n", 332 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3821\n", 333 | "Epoch 40/150\n", 334 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3779\n", 335 | "Epoch 41/150\n", 336 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3760\n", 337 | "Epoch 42/150\n", 338 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3739\n", 339 | "Epoch 43/150\n", 340 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3742\n", 341 | "Epoch 44/150\n", 342 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3683\n", 343 | "Epoch 45/150\n", 344 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3663\n", 345 | "Epoch 46/150\n", 346 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3641\n", 347 | "Epoch 47/150\n", 348 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3625\n", 349 | "Epoch 48/150\n", 350 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3605\n", 351 | "Epoch 49/150\n", 352 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3571\n", 353 | "Epoch 50/150\n", 354 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3525\n", 355 | "Epoch 51/150\n", 356 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3547\n", 357 | "Epoch 52/150\n", 358 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3493\n", 359 | "Epoch 53/150\n", 360 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3481\n", 361 | "Epoch 54/150\n", 362 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3484\n", 363 | "Epoch 55/150\n", 364 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3442\n", 365 | "Epoch 56/150\n", 366 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3426\n", 367 | "Epoch 57/150\n", 368 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3386\n", 369 | "Epoch 58/150\n", 370 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3404\n", 371 | "Epoch 59/150\n", 372 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3381\n", 373 | "Epoch 60/150\n", 374 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.3370\n", 375 | "Epoch 61/150\n", 376 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3307\n", 377 | "Epoch 62/150\n", 378 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3301\n", 379 | "Epoch 
63/150\n", 380 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3283\n", 381 | "Epoch 64/150\n", 382 | "25000/25000 [==============================] - 1s 26us/step - loss: 0.3248\n", 383 | "Epoch 65/150\n", 384 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3261\n", 385 | "Epoch 66/150\n", 386 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3221\n", 387 | "Epoch 67/150\n", 388 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3203\n", 389 | "Epoch 68/150\n", 390 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3228\n", 391 | "Epoch 69/150\n", 392 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3181\n", 393 | "Epoch 70/150\n", 394 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3193\n", 395 | "Epoch 71/150\n", 396 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3115\n", 397 | "Epoch 72/150\n", 398 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3155\n", 399 | "Epoch 73/150\n", 400 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3161\n", 401 | "Epoch 74/150\n", 402 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.3071\n", 403 | "Epoch 75/150\n", 404 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3089\n", 405 | "Epoch 76/150\n", 406 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3074\n", 407 | "Epoch 77/150\n", 408 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3079\n", 409 | "Epoch 78/150\n", 410 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3001\n", 411 | "Epoch 79/150\n", 412 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3061\n", 413 | "Epoch 80/150\n", 414 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.3023\n", 415 | "Epoch 81/150\n", 416 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3015\n", 417 | "Epoch 82/150\n", 418 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.2933\n", 419 | "Epoch 83/150\n", 420 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2968\n", 421 | "Epoch 84/150\n", 422 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2973\n", 423 | "Epoch 85/150\n", 424 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2949\n", 425 | "Epoch 86/150\n", 426 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2941\n", 427 | "Epoch 87/150\n", 428 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.2835\n", 429 | "Epoch 88/150\n", 430 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2889\n", 431 | "Epoch 89/150\n", 432 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.2862\n", 433 | "Epoch 90/150\n", 434 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2876\n", 435 | "Epoch 91/150\n", 436 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2809\n", 437 | "Epoch 92/150\n", 438 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2876\n", 439 | "Epoch 93/150\n" 440 | ] 441 | }, 442 | { 443 | "name": "stdout", 444 | "output_type": "stream", 445 | "text": [ 446 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2804\n", 447 | "Epoch 94/150\n", 448 | "25000/25000 
[==============================] - 1s 23us/step - loss: 0.2834\n", 449 | "Epoch 95/150\n", 450 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2803\n", 451 | "Epoch 96/150\n", 452 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2822\n", 453 | "Epoch 97/150\n", 454 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2810\n", 455 | "Epoch 98/150\n", 456 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2751\n", 457 | "Epoch 99/150\n", 458 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2754\n", 459 | "Epoch 100/150\n", 460 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2708\n", 461 | "Epoch 101/150\n", 462 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2708\n", 463 | "Epoch 102/150\n", 464 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2890\n", 465 | "Epoch 103/150\n", 466 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2701\n", 467 | "Epoch 104/150\n", 468 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2674\n", 469 | "Epoch 105/150\n", 470 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2684\n", 471 | "Epoch 106/150\n", 472 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2673\n", 473 | "Epoch 107/150\n", 474 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2690\n", 475 | "Epoch 108/150\n", 476 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2659\n", 477 | "Epoch 109/150\n", 478 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2624\n", 479 | "Epoch 110/150\n", 480 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2663\n", 481 | "Epoch 111/150\n", 482 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2614\n", 483 | "Epoch 112/150\n", 484 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2583\n", 485 | "Epoch 113/150\n", 486 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2560\n", 487 | "Epoch 114/150\n", 488 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2597\n", 489 | "Epoch 115/150\n", 490 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2599\n", 491 | "Epoch 116/150\n", 492 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2570\n", 493 | "Epoch 117/150\n", 494 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2552\n", 495 | "Epoch 118/150\n", 496 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2508\n", 497 | "Epoch 119/150\n", 498 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2494\n", 499 | "Epoch 120/150\n", 500 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2518\n", 501 | "Epoch 121/150\n", 502 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2473\n", 503 | "Epoch 122/150\n", 504 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2574\n", 505 | "Epoch 123/150\n", 506 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2521\n", 507 | "Epoch 124/150\n", 508 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2537\n", 509 | "Epoch 125/150\n", 510 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2490\n", 511 | "Epoch 126/150\n", 512 | "25000/25000 
[==============================] - 1s 23us/step - loss: 0.2459\n", 513 | "Epoch 127/150\n", 514 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2457\n", 515 | "Epoch 128/150\n", 516 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2479\n", 517 | "Epoch 129/150\n", 518 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2511\n", 519 | "Epoch 130/150\n", 520 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2422\n", 521 | "Epoch 131/150\n", 522 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2419\n", 523 | "Epoch 132/150\n", 524 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2390\n", 525 | "Epoch 133/150\n", 526 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2396\n", 527 | "Epoch 134/150\n", 528 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2336\n", 529 | "Epoch 135/150\n", 530 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2331\n", 531 | "Epoch 136/150\n", 532 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2507\n", 533 | "Epoch 137/150\n", 534 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2420\n", 535 | "Epoch 138/150\n", 536 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2434\n", 537 | "Epoch 139/150\n", 538 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2335\n", 539 | "Epoch 140/150\n", 540 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2268\n", 541 | "Epoch 141/150\n", 542 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2317\n", 543 | "Epoch 142/150\n", 544 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2314\n", 545 | "Epoch 143/150\n", 546 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2426\n", 547 | "Epoch 144/150\n", 548 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2306\n", 549 | "Epoch 145/150\n", 550 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2402\n", 551 | "Epoch 146/150\n", 552 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2297\n", 553 | "Epoch 147/150\n", 554 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2253\n", 555 | "Epoch 148/150\n", 556 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2187\n", 557 | "Epoch 149/150\n", 558 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2243\n", 559 | "Epoch 150/150\n", 560 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2256\n" 561 | ] 562 | }, 563 | { 564 | "data": { 565 | "text/plain": [ 566 | "" 567 | ] 568 | }, 569 | "execution_count": 11, 570 | "metadata": {}, 571 | "output_type": "execute_result" 572 | } 573 | ], 574 | "source": [ 575 | "batch_size = 64\n", 576 | "n_epochs = 150\n", 577 | "nn_classifier.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_size)" 578 | ] 579 | }, 580 | { 581 | "cell_type": "markdown", 582 | "metadata": {}, 583 | "source": [ 584 | "## Evaluating predictions" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": 12, 590 | "metadata": {}, 591 | "outputs": [], 592 | "source": [ 593 | "## Getting the probabilities\n", 594 | "y_pred_train_prob = nn_classifier.predict(X_train)\n", 595 | "y_pred_test_prob = nn_classifier.predict(X_test)\n", 596 | "\n", 597 | "## Classifications 
from predictions\n", 598 | "y_pred_train = (y_pred_train_prob > 0.5).astype(int)\n", 599 | "y_pred_test = (y_pred_test_prob > 0.5).astype(int)" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": 13, 605 | "metadata": {}, 606 | "outputs": [ 607 | { 608 | "name": "stdout", 609 | "output_type": "stream", 610 | "text": [ 611 | "Train Accuracy: 0.903 \n", 612 | "Test Accuracy: 0.750\n" 613 | ] 614 | } 615 | ], 616 | "source": [ 617 | "from sklearn.metrics import accuracy_score\n", 618 | "train_acc = accuracy_score(y_true=y_train, y_pred=y_pred_train)\n", 619 | "test_acc = accuracy_score(y_true=y_test, y_pred=y_pred_test)\n", 620 | "print(\"Train Accuracy: {:0.3f} \\nTest Accuracy: {:0.3f}\".format(train_acc, test_acc))" 621 | ] 622 | }, 623 | { 624 | "cell_type": "markdown", 625 | "metadata": {}, 626 | "source": [ 627 | "## Re-training the network with fewer epochs" 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": 14, 633 | "metadata": {}, 634 | "outputs": [], 635 | "source": [ 636 | "## load the initial weights\n", 637 | "nn_classifier.load_weights('class_initial_w.h5')" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": null, 643 | "metadata": { 644 | "scrolled": true 645 | }, 646 | "outputs": [ 647 | { 648 | "name": "stdout", 649 | "output_type": "stream", 650 | "text": [ 651 | "Epoch 1/50\n", 652 | "25000/25000 [==============================] - 1s 30us/step - loss: 0.4680\n", 653 | "Epoch 2/50\n", 654 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4479\n", 655 | "Epoch 3/50\n", 656 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4454\n", 657 | "Epoch 4/50\n", 658 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4430\n", 659 | "Epoch 5/50\n", 660 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4407\n", 661 | "Epoch 6/50\n", 662 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4401\n", 663 | "Epoch 7/50\n", 664 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4381\n", 665 | "Epoch 8/50\n", 666 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4372\n", 667 | "Epoch 9/50\n", 668 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4356\n", 669 | "Epoch 10/50\n", 670 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4350\n", 671 | "Epoch 11/50\n", 672 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4334\n", 673 | "Epoch 12/50\n", 674 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4323\n", 675 | "Epoch 13/50\n", 676 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4320\n", 677 | "Epoch 14/50\n", 678 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4302\n", 679 | "Epoch 15/50\n", 680 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.4284\n", 681 | "Epoch 16/50\n", 682 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4278\n", 683 | "Epoch 17/50\n", 684 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4260\n", 685 | "Epoch 18/50\n", 686 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4249\n", 687 | "Epoch 19/50\n", 688 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4226\n", 689 | "Epoch 20/50\n", 690 | "10752/25000 [===========>..................]
- ETA: 0s - loss: 0.4187" 691 | ] 692 | } 693 | ], 694 | "source": [ 695 | "batch_size = 64\n", 696 | "n_epochs = 50\n", 697 | "nn_classifier.compile(loss='binary_crossentropy', optimizer='adam')\n", 698 | "nn_classifier.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_size)" 699 | ] 700 | }, 701 | { 702 | "cell_type": "code", 703 | "execution_count": null, 704 | "metadata": {}, 705 | "outputs": [], 706 | "source": [ 707 | "## Getting the probabilities\n", 708 | "y_pred_train_prob = nn_classifier.predict(X_train)\n", 709 | "y_pred_test_prob = nn_classifier.predict(X_test)\n", 710 | "\n", 711 | "## Classifications from predictions\n", 712 | "y_pred_train = (y_pred_train_prob > 0.5).astype(int)\n", 713 | "y_pred_test = (y_pred_test_prob > 0.5).astype(int)\n", 714 | "\n", 715 | "## Calculating accuracy\n", 716 | "train_acc = accuracy_score(y_true=y_train, y_pred=y_pred_train)\n", 717 | "test_acc = accuracy_score(y_true=y_test, y_pred=y_pred_test)\n", 718 | "print(\"Train Accuracy: {:0.3f} \\nTest Accuracy: {:0.3f}\".format(train_acc, test_acc))" 719 | ] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "execution_count": null, 724 | "metadata": {}, 725 | "outputs": [], 726 | "source": [] 727 | }, 728 | { 729 | "cell_type": "code", 730 | "execution_count": null, 731 | "metadata": {}, 732 | "outputs": [], 733 | "source": [] 734 | }, 735 | { 736 | "cell_type": "code", 737 | "execution_count": null, 738 | "metadata": {}, 739 | "outputs": [], 740 | "source": [] 741 | }, 742 | { 743 | "cell_type": "code", 744 | "execution_count": null, 745 | "metadata": {}, 746 | "outputs": [], 747 | "source": [] 748 | } 749 | ], 750 | "metadata": { 751 | "kernelspec": { 752 | "display_name": "Python 3", 753 | "language": "python", 754 | "name": "python3" 755 | }, 756 | "language_info": { 757 | "codemirror_mode": { 758 | "name": "ipython", 759 | "version": 3 760 | }, 761 | "file_extension": ".py", 762 | "mimetype": "text/x-python", 763 | "name": "python", 764 | "nbconvert_exporter": "python", 765 | "pygments_lexer": "ipython3", 766 | "version": "3.6.10" 767 | } 768 | }, 769 | "nbformat": 4, 770 | "nbformat_minor": 2 771 | } 772 | -------------------------------------------------------------------------------- /Chapter06/ch6-2-classification-with-neural-networks.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Predicting Credit Card Default with Neural Networks" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "import matplotlib.pyplot as plt\n", 19 | "import seaborn as sns\n", 20 | "import os\n", 21 | "%matplotlib inline" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "### Back with the credit card default dataset" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# Loading the dataset\n", 38 | "DATA_DIR = '../data'\n", 39 | "FILE_NAME = 'credit_card_default.csv'\n", 40 | "data_path = os.path.join(DATA_DIR, FILE_NAME)\n", 41 | "ccd = pd.read_csv(data_path, index_col=\"ID\")\n", 42 | "ccd.rename(columns=lambda x: x.lower(), inplace=True)\n", 43 | "ccd.rename(columns={'default payment next month':'default'}, inplace=True)\n", 44 | "\n", 45 | "# getting the groups of features\n", 46 |
"bill_amt_features = ['bill_amt'+ str(i) for i in range(1,7)]\n", 47 | "pay_amt_features = ['pay_amt'+ str(i) for i in range(1,7)]\n", 48 | "numerical_features = ['limit_bal','age'] + bill_amt_features + pay_amt_features\n", 49 | "\n", 50 | "# Creating binary features\n", 51 | "ccd['male'] = (ccd['sex'] == 1).astype('int')\n", 52 | "ccd['grad_school'] = (ccd['education'] == 1).astype('int')\n", 53 | "ccd['university'] = (ccd['education'] == 2).astype('int')\n", 54 | "#ccd['high_school'] = (ccd['education'] == 3).astype('int')\n", 55 | "ccd['married'] = (ccd['marriage'] == 1).astype('int')\n", 56 | "\n", 57 | "# simplifying pay features \n", 58 | "pay_features= ['pay_' + str(i) for i in range(1,7)]\n", 59 | "for x in pay_features:\n", 60 | " ccd.loc[ccd[x] <= 0, x] = 0\n", 61 | "\n", 62 | "# simplifying delayed features\n", 63 | "delayed_features = ['delayed_' + str(i) for i in range(1,7)]\n", 64 | "for pay, delayed in zip(pay_features, delayed_features):\n", 65 | " ccd[delayed] = (ccd[pay] > 0).astype(int)\n", 66 | " \n", 67 | "# creating a new feature: months delayed\n", 68 | "ccd['months_delayed'] = ccd[delayed_features].sum(axis=1)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "## Split and standardize the dataset" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 3, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "name": "stderr", 85 | "output_type": "stream", 86 | "text": [ 87 | "C:\\Anaconda\\envs\\ho-pawp\\lib\\site-packages\\pandas\\core\\indexing.py:966: SettingWithCopyWarning: \n", 88 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 89 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 90 | "\n", 91 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 92 | " self.obj[item] = s\n", 93 | "C:\\Anaconda\\envs\\ho-pawp\\lib\\site-packages\\pandas\\core\\indexing.py:966: SettingWithCopyWarning: \n", 94 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 95 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 96 | "\n", 97 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 98 | " self.obj[item] = s\n" 99 | ] 100 | } 101 | ], 102 | "source": [ 103 | "numerical_features = numerical_features + ['months_delayed']\n", 104 | "binary_features = ['male','married','grad_school','university']\n", 105 | "X = ccd[numerical_features + binary_features]\n", 106 | "y = ccd['default'].astype(int)\n", 107 | "\n", 108 | "## Split\n", 109 | "from sklearn.model_selection import train_test_split\n", 110 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=5/30, random_state=101)\n", 111 | "\n", 112 | "## Standardize\n", 113 | "from sklearn.preprocessing import StandardScaler\n", 114 | "scaler = StandardScaler()\n", 115 | "scaler.fit(X_train[numerical_features])\n", 116 | "X_train.loc[:, numerical_features] = scaler.transform(X_train[numerical_features])\n", 117 | "# Also standardize the testing set\n", 118 | "X_test.loc[:, numerical_features] = scaler.transform(X_test[numerical_features])" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "### Building the neural network for classification" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 4, 131 | "metadata": {}, 132 | "outputs":
[ 133 | { 134 | "name": "stderr", 135 | "output_type": "stream", 136 | "text": [ 137 | "Using TensorFlow backend.\n" 138 | ] 139 | } 140 | ], 141 | "source": [ 142 | "from keras.models import Sequential\n", 143 | "nn_classifier = Sequential()" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 5, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "from keras.layers import Dense\n", 153 | "n_input = X_train.shape[1]\n", 154 | "n_units_hidden = 64\n", 155 | "nn_classifier.add(Dense(units=n_units_hidden, activation='relu', input_shape=(n_input,)))" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 6, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "# add 2nd hidden layer\n", 165 | "nn_classifier.add(Dense(units=n_units_hidden, activation='relu'))\n", 166 | "# add 3rd hidden layer\n", 167 | "nn_classifier.add(Dense(units=n_units_hidden, activation='relu'))\n", 168 | "# add 4th hidden layer\n", 169 | "nn_classifier.add(Dense(units=n_units_hidden, activation='relu'))\n", 170 | "# add 5th hidden layer\n", 171 | "nn_classifier.add(Dense(units=n_units_hidden, activation='relu'))" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 7, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "# output layer\n", 181 | "nn_classifier.add(Dense(1, activation='sigmoid'))" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "### Training the network" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 8, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "## compiling step\n", 198 | "nn_classifier.compile(loss='binary_crossentropy', optimizer='adam')" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 9, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "Model: \"sequential_1\"\n", 211 | "_________________________________________________________________\n", 212 | "Layer (type) Output Shape Param # \n", 213 | "=================================================================\n", 214 | "dense_1 (Dense) (None, 64) 1280 \n", 215 | "_________________________________________________________________\n", 216 | "dense_2 (Dense) (None, 64) 4160 \n", 217 | "_________________________________________________________________\n", 218 | "dense_3 (Dense) (None, 64) 4160 \n", 219 | "_________________________________________________________________\n", 220 | "dense_4 (Dense) (None, 64) 4160 \n", 221 | "_________________________________________________________________\n", 222 | "dense_5 (Dense) (None, 64) 4160 \n", 223 | "_________________________________________________________________\n", 224 | "dense_6 (Dense) (None, 1) 65 \n", 225 | "=================================================================\n", 226 | "Total params: 17,985\n", 227 | "Trainable params: 17,985\n", 228 | "Non-trainable params: 0\n", 229 | "_________________________________________________________________\n" 230 | ] 231 | } 232 | ], 233 | "source": [ 234 | "nn_classifier.summary()" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 10, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "nn_classifier.save_weights('class_initial_w.h5')" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 11, 249 | "metadata": {}, 250 | "outputs": [ 251 | { 252 | "name": "stdout", 253 |
"output_type": "stream", 254 | "text": [ 255 | "Epoch 1/150\n", 256 | "25000/25000 [==============================] - 1s 30us/step - loss: 0.4690\n", 257 | "Epoch 2/150\n", 258 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4481\n", 259 | "Epoch 3/150\n", 260 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4446\n", 261 | "Epoch 4/150\n", 262 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4428\n", 263 | "Epoch 5/150\n", 264 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4405\n", 265 | "Epoch 6/150\n", 266 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4399\n", 267 | "Epoch 7/150\n", 268 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4388\n", 269 | "Epoch 8/150\n", 270 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4379\n", 271 | "Epoch 9/150\n", 272 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4365\n", 273 | "Epoch 10/150\n", 274 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4353\n", 275 | "Epoch 11/150\n", 276 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4348\n", 277 | "Epoch 12/150\n", 278 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4334\n", 279 | "Epoch 13/150\n", 280 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4325\n", 281 | "Epoch 14/150\n", 282 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4303\n", 283 | "Epoch 15/150\n", 284 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4289\n", 285 | "Epoch 16/150\n", 286 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4282\n", 287 | "Epoch 17/150\n", 288 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4262\n", 289 | "Epoch 18/150\n", 290 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4243\n", 291 | "Epoch 19/150\n", 292 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4240\n", 293 | "Epoch 20/150\n", 294 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4217\n", 295 | "Epoch 21/150\n", 296 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4203\n", 297 | "Epoch 22/150\n", 298 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4180\n", 299 | "Epoch 23/150\n", 300 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4166\n", 301 | "Epoch 24/150\n", 302 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4145\n", 303 | "Epoch 25/150\n", 304 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4128\n", 305 | "Epoch 26/150\n", 306 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4113\n", 307 | "Epoch 27/150\n", 308 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4088\n", 309 | "Epoch 28/150\n", 310 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4065\n", 311 | "Epoch 29/150\n", 312 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4038\n", 313 | "Epoch 30/150\n", 314 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4034\n", 315 | "Epoch 31/150\n", 316 | "25000/25000 [==============================] - ETA: 0s - loss: 0.397 - 1s 22us/step - loss: 0.3986\n", 317 | "Epoch 32/150\n", 318 | "25000/25000 [==============================] - 1s 
22us/step - loss: 0.3986\n", 319 | "Epoch 33/150\n", 320 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3968\n", 321 | "Epoch 34/150\n", 322 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3920\n", 323 | "Epoch 35/150\n", 324 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3888\n", 325 | "Epoch 36/150\n", 326 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3870\n", 327 | "Epoch 37/150\n", 328 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3869\n", 329 | "Epoch 38/150\n", 330 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3828\n", 331 | "Epoch 39/150\n", 332 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3821\n", 333 | "Epoch 40/150\n", 334 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3779\n", 335 | "Epoch 41/150\n", 336 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3760\n", 337 | "Epoch 42/150\n", 338 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3739\n", 339 | "Epoch 43/150\n", 340 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3742\n", 341 | "Epoch 44/150\n", 342 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3683\n", 343 | "Epoch 45/150\n", 344 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3663\n", 345 | "Epoch 46/150\n", 346 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3641\n", 347 | "Epoch 47/150\n", 348 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3625\n", 349 | "Epoch 48/150\n", 350 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3605\n", 351 | "Epoch 49/150\n", 352 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3571\n", 353 | "Epoch 50/150\n", 354 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3525\n", 355 | "Epoch 51/150\n", 356 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3547\n", 357 | "Epoch 52/150\n", 358 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3493\n", 359 | "Epoch 53/150\n", 360 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3481\n", 361 | "Epoch 54/150\n", 362 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3484\n", 363 | "Epoch 55/150\n", 364 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3442\n", 365 | "Epoch 56/150\n", 366 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3426\n", 367 | "Epoch 57/150\n", 368 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3386\n", 369 | "Epoch 58/150\n", 370 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3404\n", 371 | "Epoch 59/150\n", 372 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3381\n", 373 | "Epoch 60/150\n", 374 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.3370\n", 375 | "Epoch 61/150\n", 376 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3307\n", 377 | "Epoch 62/150\n", 378 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3301\n", 379 | "Epoch 63/150\n", 380 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3283\n", 381 | "Epoch 64/150\n", 382 | "25000/25000 [==============================] - 1s 26us/step - loss: 0.3248\n", 383 | 
"Epoch 65/150\n", 384 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3261\n", 385 | "Epoch 66/150\n", 386 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3221\n", 387 | "Epoch 67/150\n", 388 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3203\n", 389 | "Epoch 68/150\n", 390 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3228\n", 391 | "Epoch 69/150\n", 392 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3181\n", 393 | "Epoch 70/150\n", 394 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3193\n", 395 | "Epoch 71/150\n", 396 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3115\n", 397 | "Epoch 72/150\n", 398 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3155\n", 399 | "Epoch 73/150\n", 400 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3161\n", 401 | "Epoch 74/150\n", 402 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.3071\n", 403 | "Epoch 75/150\n", 404 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3089\n", 405 | "Epoch 76/150\n", 406 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3074\n", 407 | "Epoch 77/150\n", 408 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3079\n", 409 | "Epoch 78/150\n", 410 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3001\n", 411 | "Epoch 79/150\n", 412 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3061\n", 413 | "Epoch 80/150\n", 414 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.3023\n", 415 | "Epoch 81/150\n", 416 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3015\n", 417 | "Epoch 82/150\n", 418 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.2933\n", 419 | "Epoch 83/150\n", 420 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2968\n", 421 | "Epoch 84/150\n", 422 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2973\n", 423 | "Epoch 85/150\n", 424 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2949\n", 425 | "Epoch 86/150\n", 426 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2941\n", 427 | "Epoch 87/150\n", 428 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.2835\n", 429 | "Epoch 88/150\n", 430 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2889\n", 431 | "Epoch 89/150\n", 432 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.2862\n", 433 | "Epoch 90/150\n", 434 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2876\n", 435 | "Epoch 91/150\n", 436 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2809\n", 437 | "Epoch 92/150\n", 438 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2876\n", 439 | "Epoch 93/150\n" 440 | ] 441 | }, 442 | { 443 | "name": "stdout", 444 | "output_type": "stream", 445 | "text": [ 446 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2804\n", 447 | "Epoch 94/150\n", 448 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2834\n", 449 | "Epoch 95/150\n", 450 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2803\n", 451 | "Epoch 96/150\n", 452 | "25000/25000 
[==============================] - 1s 23us/step - loss: 0.2822\n", 453 | "Epoch 97/150\n", 454 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2810\n", 455 | "Epoch 98/150\n", 456 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2751\n", 457 | "Epoch 99/150\n", 458 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2754\n", 459 | "Epoch 100/150\n", 460 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2708\n", 461 | "Epoch 101/150\n", 462 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2708\n", 463 | "Epoch 102/150\n", 464 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2890\n", 465 | "Epoch 103/150\n", 466 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2701\n", 467 | "Epoch 104/150\n", 468 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2674\n", 469 | "Epoch 105/150\n", 470 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2684\n", 471 | "Epoch 106/150\n", 472 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2673\n", 473 | "Epoch 107/150\n", 474 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2690\n", 475 | "Epoch 108/150\n", 476 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2659\n", 477 | "Epoch 109/150\n", 478 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2624\n", 479 | "Epoch 110/150\n", 480 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2663\n", 481 | "Epoch 111/150\n", 482 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2614\n", 483 | "Epoch 112/150\n", 484 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2583\n", 485 | "Epoch 113/150\n", 486 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2560\n", 487 | "Epoch 114/150\n", 488 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2597\n", 489 | "Epoch 115/150\n", 490 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2599\n", 491 | "Epoch 116/150\n", 492 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2570\n", 493 | "Epoch 117/150\n", 494 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2552\n", 495 | "Epoch 118/150\n", 496 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2508\n", 497 | "Epoch 119/150\n", 498 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2494\n", 499 | "Epoch 120/150\n", 500 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2518\n", 501 | "Epoch 121/150\n", 502 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2473\n", 503 | "Epoch 122/150\n", 504 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2574\n", 505 | "Epoch 123/150\n", 506 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2521\n", 507 | "Epoch 124/150\n", 508 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2537\n", 509 | "Epoch 125/150\n", 510 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2490\n", 511 | "Epoch 126/150\n", 512 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2459\n", 513 | "Epoch 127/150\n", 514 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2457\n", 515 | "Epoch 128/150\n", 516 | "25000/25000 
[==============================] - 1s 23us/step - loss: 0.2479\n", 517 | "Epoch 129/150\n", 518 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2511\n", 519 | "Epoch 130/150\n", 520 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2422\n", 521 | "Epoch 131/150\n", 522 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2419\n", 523 | "Epoch 132/150\n", 524 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2390\n", 525 | "Epoch 133/150\n", 526 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2396\n", 527 | "Epoch 134/150\n", 528 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2336\n", 529 | "Epoch 135/150\n", 530 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2331\n", 531 | "Epoch 136/150\n", 532 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2507\n", 533 | "Epoch 137/150\n", 534 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2420\n", 535 | "Epoch 138/150\n", 536 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2434\n", 537 | "Epoch 139/150\n", 538 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2335\n", 539 | "Epoch 140/150\n", 540 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2268\n", 541 | "Epoch 141/150\n", 542 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2317\n", 543 | "Epoch 142/150\n", 544 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2314\n", 545 | "Epoch 143/150\n", 546 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2426\n", 547 | "Epoch 144/150\n", 548 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2306\n", 549 | "Epoch 145/150\n", 550 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2402\n", 551 | "Epoch 146/150\n", 552 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2297\n", 553 | "Epoch 147/150\n", 554 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2253\n", 555 | "Epoch 148/150\n", 556 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2187\n", 557 | "Epoch 149/150\n", 558 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2243\n", 559 | "Epoch 150/150\n", 560 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2256\n" 561 | ] 562 | }, 563 | { 564 | "data": { 565 | "text/plain": [ 566 | "" 567 | ] 568 | }, 569 | "execution_count": 11, 570 | "metadata": {}, 571 | "output_type": "execute_result" 572 | } 573 | ], 574 | "source": [ 575 | "batch_size = 64\n", 576 | "n_epochs = 150\n", 577 | "nn_classifier.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_size)" 578 | ] 579 | }, 580 | { 581 | "cell_type": "markdown", 582 | "metadata": {}, 583 | "source": [ 584 | "## Evaluating predictions" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": 12, 590 | "metadata": {}, 591 | "outputs": [], 592 | "source": [ 593 | "## Getting the probabilities\n", 594 | "y_pred_train_prob = nn_classifier.predict(X_train)\n", 595 | "y_pred_test_prob = nn_classifier.predict(X_test)\n", 596 | "\n", 597 | "## Classifications from predictions\n", 598 | "y_pred_train = (y_pred_train_prob > 0.5).astype(int)\n", 599 | "y_pred_test = (y_pred_test_prob > 0.5).astype(int)" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": 13, 
605 | "metadata": {}, 606 | "outputs": [ 607 | { 608 | "name": "stdout", 609 | "output_type": "stream", 610 | "text": [ 611 | "Train Accuracy: 0.903 \n", 612 | "Test Accuracy: 0.750\n" 613 | ] 614 | } 615 | ], 616 | "source": [ 617 | "from sklearn.metrics import accuracy_score\n", 618 | "train_acc = accuracy_score(y_true=y_train, y_pred=y_pred_train)\n", 619 | "test_acc = accuracy_score(y_true=y_test, y_pred=y_pred_test)\n", 620 | "print(\"Train Accuracy: {:0.3f} \\nTest Accuracy: {:0.3f}\".format(train_acc, test_acc))" 621 | ] 622 | }, 623 | { 624 | "cell_type": "markdown", 625 | "metadata": {}, 626 | "source": [ 627 | "## Re-training the network with fewer epochs" 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": 14, 633 | "metadata": {}, 634 | "outputs": [], 635 | "source": [ 636 | "## load the initial weights\n", 637 | "nn_classifier.load_weights('class_initial_w.h5')" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": 15, 643 | "metadata": { 644 | "scrolled": true 645 | }, 646 | "outputs": [ 647 | { 648 | "name": "stdout", 649 | "output_type": "stream", 650 | "text": [ 651 | "Epoch 1/50\n", 652 | "25000/25000 [==============================] - 1s 30us/step - loss: 0.4680\n", 653 | "Epoch 2/50\n", 654 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4479\n", 655 | "Epoch 3/50\n", 656 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4454\n", 657 | "Epoch 4/50\n", 658 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4430\n", 659 | "Epoch 5/50\n", 660 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4407\n", 661 | "Epoch 6/50\n", 662 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4401\n", 663 | "Epoch 7/50\n", 664 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4381\n", 665 | "Epoch 8/50\n", 666 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4372\n", 667 | "Epoch 9/50\n", 668 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4356\n", 669 | "Epoch 10/50\n", 670 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4350\n", 671 | "Epoch 11/50\n", 672 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4334\n", 673 | "Epoch 12/50\n", 674 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4323\n", 675 | "Epoch 13/50\n", 676 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4320\n", 677 | "Epoch 14/50\n", 678 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4302\n", 679 | "Epoch 15/50\n", 680 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.4284\n", 681 | "Epoch 16/50\n", 682 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4278\n", 683 | "Epoch 17/50\n", 684 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4260\n", 685 | "Epoch 18/50\n", 686 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4249\n", 687 | "Epoch 19/50\n", 688 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4226\n", 689 | "Epoch 20/50\n", 690 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.4216\n", 691 | "Epoch 21/50\n", 692 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4197\n", 693 | "Epoch 22/50\n", 694 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.4178\n", 695 | "Epoch 23/50\n",
696 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.4163\n", 697 | "Epoch 24/50\n", 698 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.4139\n", 699 | "Epoch 25/50\n", 700 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.4110\n", 701 | "Epoch 26/50\n", 702 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.4116\n", 703 | "Epoch 27/50\n", 704 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.4079\n", 705 | "Epoch 28/50\n", 706 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.4056\n", 707 | "Epoch 29/50\n", 708 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.4032\n", 709 | "Epoch 30/50\n", 710 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.4008\n", 711 | "Epoch 31/50\n", 712 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.3989\n", 713 | "Epoch 32/50\n", 714 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3959\n", 715 | "Epoch 33/50\n", 716 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3938\n", 717 | "Epoch 34/50\n", 718 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3893\n", 719 | "Epoch 35/50\n", 720 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.3885\n", 721 | "Epoch 36/50\n", 722 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3854\n", 723 | "Epoch 37/50\n", 724 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3817\n", 725 | "Epoch 38/50\n", 726 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3805\n", 727 | "Epoch 39/50\n", 728 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3804\n", 729 | "Epoch 40/50\n", 730 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3751\n", 731 | "Epoch 41/50\n", 732 | "25000/25000 [==============================] - 1s 26us/step - loss: 0.3745\n", 733 | "Epoch 42/50\n", 734 | "25000/25000 [==============================] - 1s 26us/step - loss: 0.3709\n", 735 | "Epoch 43/50\n", 736 | "25000/25000 [==============================] - 1s 26us/step - loss: 0.3712\n", 737 | "Epoch 44/50\n", 738 | "25000/25000 [==============================] - 1s 29us/step - loss: 0.3657\n", 739 | "Epoch 45/50\n", 740 | "25000/25000 [==============================] - 1s 34us/step - loss: 0.3628\n", 741 | "Epoch 46/50\n", 742 | "25000/25000 [==============================] - 1s 30us/step - loss: 0.3600\n", 743 | "Epoch 47/50\n", 744 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3573\n", 745 | "Epoch 48/50\n", 746 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3576\n", 747 | "Epoch 49/50\n", 748 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3536\n", 749 | "Epoch 50/50\n", 750 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3502\n" 751 | ] 752 | }, 753 | { 754 | "data": { 755 | "text/plain": [ 756 | "" 757 | ] 758 | }, 759 | "execution_count": 15, 760 | "metadata": {}, 761 | "output_type": "execute_result" 762 | } 763 | ], 764 | "source": [ 765 | "batch_size = 64\n", 766 | "n_epochs = 50\n", 767 | "nn_classifier.compile(loss='binary_crossentropy', optimizer='adam')\n", 768 | "nn_classifier.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_size)" 769 | ] 770 | }, 771 | { 772 | "cell_type": "code", 773 | "execution_count": 16, 774 | "metadata": 
{}, 775 | "outputs": [ 776 | { 777 | "name": "stdout", 778 | "output_type": "stream", 779 | "text": [ 780 | "Train Accuracy: 0.845 \n", 781 | "Test Accuracy: 0.782\n" 782 | ] 783 | } 784 | ], 785 | "source": [ 786 | "## Getting the probabilities\n", 787 | "y_pred_train_prob = nn_classifier.predict(X_train)\n", 788 | "y_pred_test_prob = nn_classifier.predict(X_test)\n", 789 | "\n", 790 | "## Classifications from predictions\n", 791 | "y_pred_train = (y_pred_train_prob > 0.5).astype(int)\n", 792 | "y_pred_test = (y_pred_test_prob > 0.5).astype(int)\n", 793 | "\n", 794 | "## Calculating accuracy\n", 795 | "train_acc = accuracy_score(y_true=y_train, y_pred=y_pred_train)\n", 796 | "test_acc = accuracy_score(y_true=y_test, y_pred=y_pred_test)\n", 797 | "print(\"Train Accuracy: {:0.3f} \\nTest Accuracy: {:0.3f}\".format(train_acc, test_acc))" 798 | ] 799 | }, 800 | { 801 | "cell_type": "code", 802 | "execution_count": null, 803 | "metadata": {}, 804 | "outputs": [], 805 | "source": [] 806 | }, 807 | { 808 | "cell_type": "code", 809 | "execution_count": null, 810 | "metadata": {}, 811 | "outputs": [], 812 | "source": [] 813 | }, 814 | { 815 | "cell_type": "code", 816 | "execution_count": null, 817 | "metadata": {}, 818 | "outputs": [], 819 | "source": [] 820 | }, 821 | { 822 | "cell_type": "code", 823 | "execution_count": null, 824 | "metadata": {}, 825 | "outputs": [], 826 | "source": [] 827 | } 828 | ], 829 | "metadata": { 830 | "kernelspec": { 831 | "display_name": "Python 3", 832 | "language": "python", 833 | "name": "python3" 834 | }, 835 | "language_info": { 836 | "codemirror_mode": { 837 | "name": "ipython", 838 | "version": 3 839 | }, 840 | "file_extension": ".py", 841 | "mimetype": "text/x-python", 842 | "name": "python", 843 | "nbconvert_exporter": "python", 844 | "pygments_lexer": "ipython3", 845 | "version": "3.6.10" 846 | } 847 | }, 848 | "nbformat": 4, 849 | "nbformat_minor": 2 850 | } 851 | -------------------------------------------------------------------------------- /Chapter06/class_initial_w.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Predictive-Analytics-with-Python/e049d9d2870f596892b62aeddec312788d9d0c2c/Chapter06/class_initial_w.h5 -------------------------------------------------------------------------------- /Chapter08/.ipynb_checkpoints/ch8-credit-card-def-model-tuning-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Credit Card Default: Model Tuning and Improving Performance" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "#### Importing libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import pandas as pd\n", 25 | "import matplotlib.pyplot as plt\n", 26 | "import seaborn as sns\n", 27 | "import os\n", 28 | "\n", 29 | "pd.options.mode.chained_assignment = None\n", 30 | "%matplotlib inline" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "#### Loading and preparing the dataset" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# Loading the dataset\n", 47 | "DATA_DIR = '../data'\n", 48 | "FILE_NAME = 
'credit_card_default.csv'\n", 49 | "data_path = os.path.join(DATA_DIR, FILE_NAME)\n", 50 | "ccd = pd.read_csv(data_path, index_col=\"ID\")\n", 51 | "ccd.rename(columns=lambda x: x.lower(), inplace=True)\n", 52 | "ccd.rename(columns={'default payment next month':'default'}, inplace=True)\n", 53 | "\n", 54 | "# getting the groups of features\n", 55 | "bill_amt_features = ['bill_amt'+ str(i) for i in range(1,7)]\n", 56 | "pay_amt_features = ['pay_amt'+ str(i) for i in range(1,7)]\n", 57 | "numerical_features = ['limit_bal','age'] + bill_amt_features + pay_amt_features\n", 58 | "\n", 59 | "# Creating binary features\n", 60 | "ccd['male'] = (ccd['sex'] == 1).astype('int')\n", 61 | "ccd['grad_school'] = (ccd['education'] == 1).astype('int')\n", 62 | "ccd['university'] = (ccd['education'] == 2).astype('int')\n", 63 | "ccd['married'] = (ccd['marriage'] == 1).astype('int')\n", 64 | "\n", 65 | "# simplifying pay features \n", 66 | "pay_features= ['pay_' + str(i) for i in range(1,7)]\n", 67 | "for x in pay_features:\n", 68 | " ccd.loc[ccd[x] <= 0, x] = 0\n", 69 | "\n", 70 | "# simplifying delayed features\n", 71 | "delayed_features = ['delayed_' + str(i) for i in range(1,7)]\n", 72 | "for pay, delayed in zip(pay_features, delayed_features):\n", 73 | " ccd[delayed] = (ccd[pay] > 0).astype(int)\n", 74 | " \n", 75 | "# creating a new feature: months delayed\n", 76 | "ccd['months_delayed'] = ccd[delayed_features].sum(axis=1)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "#### Splitting and standardizing the dataset" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 3, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "numerical_features = numerical_features + ['months_delayed']\n", 93 | "binary_features = ['male','married','grad_school','university']\n", 94 | "X = ccd[numerical_features + binary_features]\n", 95 | "y = ccd['default'].astype(int)\n", 96 | "\n", 97 | "## Split\n", 98 | "from sklearn.model_selection import train_test_split\n", 99 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=5/30, random_state=25)\n", 100 | "\n", 101 | "## Standardize\n", 102 | "from sklearn.preprocessing import StandardScaler\n", 103 | "scaler = StandardScaler()\n", 104 | "scaler.fit(X[numerical_features])\n", 105 | "X_train.loc[:, numerical_features] = scaler.transform(X_train[numerical_features])\n", 106 | "X_test.loc[:, numerical_features] = scaler.transform(X_test[numerical_features])" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "## Optimizing more than one parameter" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "#### Reference model" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 4, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "from sklearn.model_selection import cross_val_score\n", 130 | "from sklearn.ensemble import RandomForestClassifier\n", 131 | "ref_rf = RandomForestClassifier(n_estimators=25,\n", 132 | " max_features=4,\n", 133 | " max_depth=4,\n", 134 | " random_state=61)\n", 135 | "\n", 136 | "ref_rf_scores = cross_val_score(ref_rf, X_train, y_train, scoring='roc_auc', cv=10)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 5, 142 | "metadata": { 143 | "scrolled": true 144 | }, 145 | "outputs": [ 146 | { 147 | "name": "stdout", 148 | "output_type": "stream", 149 | "text": [ 150 | "Mean AUC for 
reference model: 0.7589\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "print(\"Mean AUC for reference model: {:0.4f}\".format(ref_rf_scores.mean()))" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "#### Grid Search CV" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 6, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "from sklearn.model_selection import GridSearchCV\n", 172 | "param_grid = {\"n_estimators\":[25,100,200,400],\n", 173 | " \"max_features\":[4,10,19],\n", 174 | " \"max_depth\":[4,8,16,20]}" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [ 182 | { 183 | "name": "stdout", 184 | "output_type": "stream", 185 | "text": [ 186 | "Fitting 5 folds for each of 48 candidates, totalling 240 fits\n" 187 | ] 188 | }, 189 | { 190 | "name": "stderr", 191 | "output_type": "stream", 192 | "text": [ 193 | "[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.\n" 194 | ] 195 | } 196 | ], 197 | "source": [ 198 | "rf = RandomForestClassifier(random_state=17)\n", 199 | "grid_search = GridSearchCV(estimator=rf,\n", 200 | " param_grid=param_grid,\n", 201 | " scoring='roc_auc',\n", 202 | " cv=5,\n", 203 | " verbose=1,\n", 204 | " n_jobs=4)\n", 205 | "grid_search.fit(X_train, y_train)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "gs_results = pd.Series(grid_search.cv_results_['mean_test_score'], index=grid_search.cv_results_['params'])\n", 215 | "gs_results.sort_values(ascending=False)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "from sklearn.metrics import precision_recall_curve\n", 225 | "## Fitting the initial (not tuned) model:\n", 226 | "ref_rf.fit(X_train, y_train)\n", 227 | "\n", 228 | "## Getting the probabilities\n", 229 | "y_prob_tuned = grid_search.predict_proba(X_test)[:,1]\n", 230 | "y_prob_not_tuned = ref_rf.predict_proba(X_test)[:,1]\n", 231 | "\n", 232 | "## Values for plotting the curves\n", 233 | "prec_tuned, recall_tuned, _ = precision_recall_curve(y_test, y_prob_tuned)\n", 234 | "prec_not_tuned, recall_not_tuned, _ = precision_recall_curve(y_test, y_prob_not_tuned)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "fig, ax = plt.subplots(figsize=(8,5))\n", 244 | "ax.plot(prec_tuned, recall_tuned, label='Tuned Model')\n", 245 | "ax.plot(prec_not_tuned, recall_not_tuned, label='Not Tuned Model')\n", 246 | "ax.set_title('Precision-recall curves', fontsize=16)\n", 247 | "ax.set_xlabel('Precision', fontsize=14)\n", 248 | "ax.set_ylabel('Recall', fontsize=14)\n", 249 | "ax.set_xlim(0.3,0.7); ax.set_ylim(0.1,0.9)\n", 250 | "ax.legend(); ax.grid();" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": {}, 278 | "outputs": [], 279 
| "source": [] 280 | } 281 | ], 282 | "metadata": { 283 | "kernelspec": { 284 | "display_name": "Python 3", 285 | "language": "python", 286 | "name": "python3" 287 | }, 288 | "language_info": { 289 | "codemirror_mode": { 290 | "name": "ipython", 291 | "version": 3 292 | }, 293 | "file_extension": ".py", 294 | "mimetype": "text/x-python", 295 | "name": "python", 296 | "nbconvert_exporter": "python", 297 | "pygments_lexer": "ipython3", 298 | "version": "3.6.10" 299 | } 300 | }, 301 | "nbformat": 4, 302 | "nbformat_minor": 2 303 | } 304 | -------------------------------------------------------------------------------- /Chapter08/.ipynb_checkpoints/ch8-diamond-prices-model-tuning-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Diamond Prices: Model Tuning and Improving Performance" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "#### Importing libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import pandas as pd\n", 25 | "import matplotlib.pyplot as plt\n", 26 | "import seaborn as sns\n", 27 | "import os\n", 28 | "\n", 29 | "pd.options.mode.chained_assignment = None\n", 30 | "%matplotlib inline" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "#### Loading the dataset" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "DATA_DIR = '../data'\n", 47 | "FILE_NAME = 'diamonds.csv'\n", 48 | "data_path = os.path.join(DATA_DIR, FILE_NAME)\n", 49 | "diamonds = pd.read_csv(data_path)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "#### Preparing the dataset" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 3, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "## Preparation done from Chapter 2\n", 66 | "diamonds = diamonds.loc[(diamonds['x']>0) | (diamonds['y']>0)]\n", 67 | "diamonds.loc[11182, 'x'] = diamonds['x'].median()\n", 68 | "diamonds.loc[11182, 'z'] = diamonds['z'].median()\n", 69 | "diamonds = diamonds.loc[~((diamonds['y'] > 30) | (diamonds['z'] > 30))]\n", 70 | "diamonds = pd.concat([diamonds, pd.get_dummies(diamonds['cut'], prefix='cut', drop_first=True)], axis=1)\n", 71 | "diamonds = pd.concat([diamonds, pd.get_dummies(diamonds['color'], prefix='color', drop_first=True)], axis=1)\n", 72 | "diamonds = pd.concat([diamonds, pd.get_dummies(diamonds['clarity'], prefix='clarity', drop_first=True)], axis=1)\n", 73 | "\n", 74 | "## Dimensionality reduction\n", 75 | "from sklearn.decomposition import PCA\n", 76 | "pca = PCA(n_components=1, random_state=123)\n", 77 | "diamonds['dim_index'] = pca.fit_transform(diamonds[['x','y','z']])\n", 78 | "diamonds.drop(['x','y','z'], axis=1, inplace=True)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 4, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price',\n", 90 | " 'cut_Good', 'cut_Ideal', 'cut_Premium', 'cut_Very Good', 'color_E',\n", 91 | " 'color_F', 'color_G', 'color_H', 'color_I', 'color_J', 'clarity_IF',\n", 92 | " 'clarity_SI1', 'clarity_SI2', 'clarity_VS1', 'clarity_VS2',\n", 93 | " 
'clarity_VVS1', 'clarity_VVS2', 'dim_index'],\n", 94 | " dtype='object')" 95 | ] 96 | }, 97 | "execution_count": 4, 98 | "metadata": {}, 99 | "output_type": "execute_result" 100 | } 101 | ], 102 | "source": [ 103 | "diamonds.columns" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "#### Train-test split" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 5, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "X = diamonds.drop(['cut','color','clarity','price'], axis=1)\n", 120 | "y = diamonds['price']\n", 121 | "\n", 122 | "from sklearn.model_selection import train_test_split\n", 123 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=7)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "#### Standarization: centering and scaling " 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 6, 136 | "metadata": { 137 | "scrolled": true 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "numerical_features = ['carat', 'depth', 'table', 'dim_index']\n", 142 | "from sklearn.preprocessing import StandardScaler\n", 143 | "scaler = StandardScaler()\n", 144 | "scaler.fit(X_train[numerical_features])\n", 145 | "X_train.loc[:, numerical_features] = scaler.fit_transform(X_train[numerical_features])\n", 146 | "X_test.loc[:, numerical_features] = scaler.transform(X_test[numerical_features])" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "## Optimizing a single hyper-parameter" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 7, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=13)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 8, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "from sklearn.neighbors import KNeighborsRegressor\n", 172 | "from sklearn.metrics import mean_absolute_error\n", 173 | "\n", 174 | "candidates = np.arange(4,16)\n", 175 | "mae_metrics = []\n", 176 | "for k in candidates:\n", 177 | " model = KNeighborsRegressor(n_neighbors=k, weights='distance', metric='minkowski', leaf_size=50, n_jobs=4)\n", 178 | " model.fit(X_train, y_train)\n", 179 | " y_pred = model.predict(X_val)\n", 180 | " metric = mean_absolute_error(y_true=y_val, y_pred=y_pred)\n", 181 | " mae_metrics.append(metric)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 9, 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "data": { 191 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAfgAAAFBCAYAAACb7b3CAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3deXxU1f3/8deHECAQIKwBEjYVUHYI4tYquIuIVOpW0bb2+3Nvtdal2Nal1mKLVrupbdVaRaVacSkuWJXUFZF9D4IESNiFBAIBsnx+f8yEDiHAAJO5k+H9fDzmwcydM3femWg+c+499xxzd0RERCS51As6gIiIiMSeCryIiEgSUoEXERFJQirwIiIiSUgFXkREJAmpwIuIiCSh+kEHiKXWrVt7ly5dYrrPbdu20aRJk5ju83AlWqZEywPKFC1lio4yRUeZDqw28syYMWOju7fZ6wl3T5pbTk6Ox9qUKVNivs/DlWiZEi2PuzJFS5mio0zRUaYDq408wHSvoSbqEL2IiEgSUoEXERFJQirwIiIiSUgFXkREJAmpwIuIiCQhFXgREZEkpAIvIiKShJJqohsREZFE9NqsQsZNzqOwqJSsqR9w+zk9GDkgq1bfUwVeRESkFr02q5AxE+dRWlYBQGFRKWMmzgOo1SKvQ/QiIiK1aNzkvN3FvUppWQXjJufV6vuqBy8iIhJjO8oqmL2qiC+Wb6KwqLTGNqv3sT1W4l7gzSwFmA4UuvvwiO23AeOANu6+MWJ7J2AhcK+7PxTvvCIiIgdSXFrGjBWbmLZ8M1/kb2JuQRFlFQ5A/XpGeaXv9ZoOGWm1mimIHvzNwCKgWdUGM+sInAWsrKH9I8Db8YkmIiJyYOu27GDa8k18kb+Jacs3kbduK+6hYt4nuzlXn9KV47u0ZFCXFuTmbdjjHDxAWmoKt5/To1YzxrXAm1k2cD7wAHBrxFOPAHcAr1drPxL4CtgWr4wiIiKR3J3lG7eFi3moh75y03YAGjdIYWCnFpzXuz3Hd23BgI4tSGuQssfrqwbS7R5Fn5GWlKPoHyVUyJtWbTCzEYQO188xMyK2NwHuJNSzvy3OOUVE5AhVUeksWrNldw/9i/zNbCzZCUDLJg0Y1LkFV53UmeO7tKRnh2akphx4vPrIAVmMHJBFbm4uQ4YMqeWfIMRCS8nG4Y3MhgPD3P0GMxtCqGhfAkwBznb3YjPLBwa5+0YzewiY5u4vmdm9QElN5+DN7BrgGoDMzMycCRMmxDR3SUkJ6enpMd3n4Uq0TImWB5QpWsoUHWWKTl3NtKvCWV5cSd7mCpZsrmTp5gp2hI+mt2pkdG9Zjx4tUujeIoX2TYzIzmht5DlYQ4cOneHug6pvj2eBHwtcCZQDjQidg38b+CawPdwsG1gNDAZeBjqGt2cAlcDd7v6nfb3HoEGDfPr06THNHc9vW9FKtEyJlgeUKVrKFB1lik4iZdpjYplqh8SLS8uYuWIz0/I38cXyTcwtKGZXRSUA3TPTOb5LSwZ3bcnxXVrGfCBcbXxGZlZjgY/bIXp3HwOMCYcZAtzm7qMi20T24AkV/qrt9xLqwe+zuIuIiEDNE8vc/q85vDJjFRu3lbF47ZbdA+J6ZzXne6d0CQ2I69yCFk0aBJw+dnQdvIiIJJWaJpYpq3A+Xvo1Jx/TipvP6MbgLi3p3ymDxg2StwwG8pO5ey6QW8P2Lvtof2+tBhIRkaSxvwlknv+/E+OYJFiaqlZERJJGeUUlDevXXNpqe2KZRKMCLyIiScHduevVeeworyQ1Zc+R7vGYWCbRqMCLiEhS+O3kPF6aXsCPzujGuG/3IyvcY8/KSGPsRX1qfWKZRJO8owtEROSI8dTHy3k8dxnfOaETPz6zG2YW94llEo168CIiUqe9PruQ+yct5Lze7bj/wt6HNRFNMlGBFxGROuu/Szbwk5fmcOJRLXnk0v6k1FNxr6ICLyIiddLsVUVcP34G3TKb8terBtEoNeXALzqCqMCLiEids3R9Cd//+zRapzfkH1cfT7NGqUFHSjgq8CIiUqesKS7lu09PI6We8dwPBtO2aaOgIyUkjaIXEZE6o2j7Lr779DSKS8uYcM2JdG7VJOhICUs9eBERqRNKd1Xwg39MJ3/jdv56VQ69s5oHHSmhqQcvIiIJr7yikptemMnMlZv583cGcvLRrYOOlPDUgxcRkYTm7oyZOI/3F6/nlxf2Zlif9kFHqhNU4EVEJKH95p08Xp5RwM1ndOPKEzsHHafOUIEXEZGE9eRHX/HEf5dxxQmduOXMbkHHqVNU4EVEJCG9OquAX725iGF92vFLTUF70FTgRUQk4UzJW8/tL8/lpKNaaQraQ6QCLyIiCWXWys3cMH4m3TOb8tercmhYX1PQHgoVeBERSRhL12/l+898QZumDXnm6uNpqiloD5kKvIiIJIQ1xaVc9dQ06terpyloY0AFXkREAle0fRdXPTWNLTvKeeb7x2sK2hhQgRcRkUBVTUG74mtNQRtLmqpWREQCU1ZRyY3hKWgf0xS0MaUevIiIBMLd+ekr8/hg8Xruv7A352kK2phSgRcRkUA8+PZiXplZwC1ndmO0pqCNORV4ERGJu799+BV/+fArrjyxMzefoSloa0PcC7yZpZjZLDObVG37bWbmZtY6/PgsM5thZvPC/54e76wiIhJ7E2cW8MBboSlo7x3RS1PQ1pIgBtndDCwCmlVtMLOOwFnAyoh2G4EL3H21mfUGJgNZ8QwqIiKxNSVvPXf8ay4nH60paGtbXHvwZpYNnA88We2pR4A7AK/a4O6z3H11+OECoJGZNYxLUBERibmZ4Sloj23flL9cqSloa1u8D9E/SqiQV1ZtMLMRQKG7z9nP60YBs9x9Zy3nExGRWrB0/VaufuYL2jZryN+/N1hT0MaBufuBW8XijcyGA8Pc/QYzGwLcBlwCTAHOdvdiM8sHBrn7xojX9QLeCLdZVsN+rwGuAcjMzMyZMGFCTHOXlJSQnp4e030erkTLlGh5QJmipUzRUabo7CvT16WVPPD5Dsor4ecnNqJt4/j1LRPtc6qNPEOHDp3h7oP2esLd43IDxgIFQD6wFtgOvAKsD2/LB8oJnYdvF35NNrAEOCWa98jJyfFYmzJlSsz3ebgSLVOi5XFXpmgpU3SUKTo1ZdpUstPPeDjXe9/9js8vLEqITEGqjTzAdK+hJsZtkJ27jwHGAFT14N19VGSbyB68mWUAbwJj3P2TeOUUEZHY2L6rnKv/8QUrN23nH98fTK8OmoI2nhL5OvibgGOAX5jZ7PCtbdChRETkwMoqKrnx+ZnMWVXEHy7rz0lHtwo60hEnkLno3T0XyK1he5eI+78CfhW3UCIiEhOVlc6dr8xlSt4Gfv2tPpzbW1PQBiGRe/AiIlIHPfjOYibOLOTWs7rznRM6BR3niKUCLyIiMfPXD5fx1w+/4qqTOvPD048JOs4RTcvFiojIYXltViHjJudRWFQKLKZ/dnPuuUBT0AZNPXgRETlkr80qZMzEeeHiHrJ43Vb+PWf1fl4l8aACLy
Iih2zc5DxKyyr22LajrJJxk/MCSiRVVOBFROSQrY7ouUezXeJHBV5ERA5JZaXToH7NZaRDRlqc00h1KvAiInJI/vjBUnaWV5KasudgurTUFG4/p0dAqaSKCryIiBy0KXnrefT9JVw0IIvfjupLVrjHnpWRxtiL+jByQFbACUWXyYmIyEFZtWk7t0yYTY/MpjzwrT6kNUjhWwOzyc3NZciQIUHHkzD14EVEJGo7yiq4bvwMKt35y5U5pDVICTqS7IN68CIiEhV35+evzWfB6i089d1BdG7VJOhIsh/qwYuISFRenLaKf80o4EenH8MZx2UGHUcOQAVeREQOaPaqIu59YwGndW/DzWd2DzqOREEFXkRE9uvrkp1cP34GbZs15PeX9SelnuaYrwt0Dl5ERPapvKKSH744i6+37WLi9SeT0bhB0JEkSurBi4jIPj38nyV8uuxrfjWyN72zmgcdRw6CCryIiNTonflreTx3GZcP7sQlgzoGHUcOkgq8iIjs5asNJdz28hz6ZTfn3hE9g44jh0AFXkRE9rBtZznXPjeDBvXr8djoHBrW12Q2dZEKvIiI7Obu3PnKXJZtKOGPlw/YPce81D0q8CIistvTn+Qzae4abjunB6cc0zroOHIYVOBFRASAacs38eu3FnF2z0yuP+3ooOPIYVKBFxER1m/ZwY0vzKRzy8Y8dEk/zDSZTV2niW5ERI5wu8orueH5mZTsKOf5/zuBZo1Sg44kMaACLyJyhPv1W4uYvmIzf7h8AN0zmwYdR2JEh+hFRI5gr88u5JlP87n6lK6M6Nch6DgSQ3Ev8GaWYmazzGxSte23mZmbWeuIbWPMbKmZ5ZnZOfHOKiKSzBav3cJPX5nH8V1aMGbYsUHHkRgL4hD9zcAioFnVBjPrCJwFrIzY1hO4DOgFdADeM7Pu7l4R37giIslny44yrntuBumN6vPn7wwkNUUHdJNNXH+jZpYNnA88We2pR4A7AI/YdiEwwd13uvtyYCkwOC5BRUSSWGWl85OX5lCwuZTHrhhI22aNgo4ktSDeX9keJVTIK6s2mNkIoNDd51RrmwWsinhcEN4mIiKH4fH/LuM/C9fxs/OP4/guLYOOI7XE3P3ArWLxRmbDgWHufoOZDQFuAy4BpgBnu3uxmeUDg9x9o5n9GfjM3ceHX/8U8Ja7v1Jtv9cA1wBkZmbmTJgwIaa5S0pKSE9Pj+k+D1eiZUq0PKBM0VKm6CRTpvkby3l4+k5OaJ/CtX0bxvR692T6nGpLbeQZOnToDHcftNcT7h6XGzCWUC88H1gLbAdeAdaHt+UD5YTOw7cDxgBjIl4/GThpf++Rk5PjsTZlypSY7/NwJVqmRMvjrkzRUqboJEumVZu2ef/7JvvZv/uvb9tZlhCZaluiZaqNPMB0r6Emxu0QvbuPcfdsd+9CaPDcB+4+yt3bunuX8PYCYKC7rwXeAC4zs4Zm1hXoBkyLV14RkWSyo6yCG56fSXmF88SVOTRuoGlQkl3C/obdfYGZvQQsJNSzv9E1gl5E5JDc9+8FzC0o5q9X5tC1dZOg40gcBFLg3T0XyK1he5dqjx8AHohLKBGRJPXPL1by4rRV3Dj0aM7u1S7oOBInuvBRRCSJzS0o4hevL+Abx7Tm1rN6BB1H4kgFXkQkSW3atovrx8+kTXpD/nD5AFLqaYW4I0nCnoMXEZFDV1Hp3DxhFhu27uTl606iZZMGQUeSOFOBFxFJQo++t4SPvtzI2Iv60K9jRtBxJAA6RC8ikmTeW7iOP36wlEsHdeTywZ2CjiMBUYEXEUki+Ru38eOXZtMnqzn3Xdgr6DgSIBV4EZEksX1XOdeNn0FKPeOxKwbSKDUl6EgSIJ2DFxFJAu7OmInzyFu3lWe+P5iOLRsHHUkCph68iEgSePazFbw+ezW3ntmd07q3CTqOJAAVeBGROm7Gik3cP2khZx7XlhuHHhN0HEkQKvAiInXY+q07uOH5mWS1SOPhS/pTT5PZSJgKvIhIHVVWUclNL8yiuLSMJ0bn0DwtNehIkkA0yE5EpI76zduLmbZ8E49c2o/j2jcLOo4kGBV4EZE65LVZhYybnEdhUSmwnG8e04pvDcgOOpYkIB2iFxGpI16bVciYifPCxT3kixWbeW1WYYCpJFGpwIuI1BHjJudRWlaxx7YdZZWMm5wXUCJJZCrwIiJ1wJri0j167pFW72O7HNl0Dl5EJIHtLK/gqY+X86cPlu6zTYeMtDgmkrpCBV5EJEHl5q3nvn8vZPnGbZzVM5MTu7bkoXeX7HGYPi01hdvP6RFgSklUKvAiIglm5dfb+eWkhby3aB1HtW7CM98/niE92gLQKr3h7lH0WRlp3H5OD0YOyAo4sSQiFXgRkQRRuquCx/+7jCf+u4z69YyfnncsV5/SlQb1/zdcauSALEYOyCI3N5chQ4YEF1YSngq8iEjA3J3JC9Zy/6RFFBaVMqJfB+4adhztmjcKOprUYSrwIiIBWrq+hPv+vYCPvtzIse2aMuGaEznxqFZBx5IkoAIvIhKArTvK+OMHS3n64+WkNUjh3gt6MvrEztRP0dXLEhsq8CIiceTuvDa7kF+/tZgNW3dy6aCO3H5uD1qnNww6miQZFXgRkThZsLqYe99YwBf5m+mX3Zy/XTWI/h0zgo4lSSruBd7MUoDpQKG7Dzez+4ELgUpgPfA9d19tZqnAk8DAcM5n3X1svPOKiByuou27ePjdJTz/+QoyGjfgN6P6cHFOR63dLrUqiB78zcAioGptw3Hu/gsAM/sRcDdwHXAx0NDd+5hZY2Chmb3o7vkBZBYROWgVlc5L01fx23cWU1xaxlUndeHHZ3aneWOt2y61L64F3syygfOBB4BbAdx9S0STJoCH7zvQxMzqA2nALiCyrYhIwpq5cjP3vL6AeYXFDO7akvtG9NKa7RJX8e7BPwrcATSN3GhmDwBXAcXA0PDmfxE6dL8GaAz82N03xS+qiMjB27B1J799ZzEvzyggs1lDfn9Zf0b064CZDsdLfJm7H7hVLN7IbDgwzN1vMLMhwG3uPrxamzFAI3e/x8xOAW4Avge0AD4CznP3r6q95hrgGoDMzMycCRMmxDR3SUkJ6enpMd3n4Uq0TImWB5QpWsoUnWgyVVQ6768s59Wlu9hVAed0SeWCo1NJq187hb2ufk7xlmiZaiPP0KFDZ7j7oL2ecPcD3oBfA40jHg8D0iIeNyM0CG5/+xgLFAD5wFpgOzC+WpvOwPzw/T8DV0Y89zRwyf7eIycnx2NtypQpMd/n4Uq0TImWx12ZoqVM0TlQpk+XbvSzf/df73znJB/95FRfun5r4JmCoEwHVht5gOleQ02MdkaFO4HIrxwTgPYRj9OAK/a3A3cf4+7Z7t4FuAz4wN1Hm1m3iGYjgMXh+yuB0y2kCXBixHMiIoFbU1zKTS/M5PK/TWXbrnL+cmUOz149mKPbJE6PUY5c0Z6Dr36MKZbHnB40sx6ELpNbQWgEPYR68H8H5off7+/uPjeG7ysickh2llfw5EehNdor3fnxmd259rSjaJSaEnQ0kd0CmejG3
XOB3PD9UftoU0LoUjkRkYQxZfF67vv3AvK/3s45vTL5+fk96diycdCxRPaimexERPbhtVmFu9dez/z0PVqnN2DB6q0c1aYJz149mFO7twk6osg+HUyBv87MSiJe9wMz+zr8uOk+XiMiUie9NquQMRPnUVpWAcC6LTtZt2UnI/q256FL+u+xRrtIIoq2wK8Evh/xeC3wnRraiIgkhQffXry7uEeasbJIxV3qhKgKfHjku4hIUnN3Plv2Nc9NXcHaLTtqbLO6qDTOqUQOTUzOwYcvY7vc3Z+Mxf5EROJpy44yJs4o4LmpK1i2YRstGqeS3rA+JTvL92rbISMtgIQiB++wCryZnQT8ALiU0KVsKvAiUmcsXruF5z5bwauzCtm+q4J+HTN46OJ+DO/bnnfmr93jHDxAWmoKt5/TI8DEItE76AJvZq0IzRv/f8CxwJuEivyk2EYTEYm9XeWVvLNgLeM/W8G0/E00rF+PEf06cOVJnemb/b+12UcOyALYPYo+KyON28/psXu7SKKLusCb2TmEivoFwFTgEeBx4KfuvrB24omIxMaa4lJe+HwlL05bxcaSnXRq2Zi7hh3LxTkdadGkQY2vGTkgi5EDssjNzWXIkCHxDSxymKIq8GaWD+wAngNu9/Ca7Gb2eK0lExE5TO7Op8u+5rnPVvCfReuodOf0Hm0ZfVJnTuvWhnr1tMKbJK9oe/DtgNeB2cCq2osjInL4aho09/++eRRXnNBJs87JESPaAt+R0LKtDwFPm9kEYDwQn7VmRUSisGjNFp6buoLXwoPm+nfM4OGL+3F+3/aaJ16OONFeB78BGAeMM7NvEhpUNyX8+mvN7K/uvqD2YoqI1GxXeSVvz1/D+Kkr+CJ/8+5Bc1ed1IU+2c2DjicSmIMeRe/uHwEfmdmPCM1mdzXwQzPLc/fjYh1QRKQmq4tKeXHa/wbNdW7VmJ8NO46LB2WT0bjmQXMiR5JDvg7e3bcATwBPmFkfQiPsRURqjbvzydKveW5qPv9ZuA4Hzji2LaNP7MypGjQnsodoR9G/UdtBRET2pbi0jFdmFDD+8xV8tWEbLZs04NrTjuY7gzVoTmRfou3BDwdWEF7DXUQk1iKXZs2a+gG3n9OD7plNdw+aKy0LDZr73SX9GNZHg+ZEDiTaAv8QMBo4Ffg78Iy7F9RaKhE5olRfmrWwqJRbX5pNpUPD+vW4sH8HrjxRg+ZEDka0o+jvMLMxwPmEBtX9zMxygaeA1929rPYiikiyGzc5b6+lWSsdmqfV57+3D9WgOZFDEPWixu5e4e5vuPtIoCuhy+R+BRSaWXptBRSR5LevJVi3lJaruIscoqgLfDVNgAwgHShBE96IyGHIbN6oxu1amlXk0EVd4M0szcy+a2YfAvOAzsB33f0od99WawlFJKlVVjoZaal7bdfSrCKHJ9rL5P5KaM33Lwmddx/h7kW1GUxEjgxPfvwVi9du5ds52Xy27GstzSoSI9GOov8/YCWwBjgPOM9s7wkl3H1E7KKJSLKbs6qI376Tx7m92jHu230xMy3NKhIj0Rb4Z9F5dhGJoS07yvjhi7PIbNaI34wKFXcRiZ1oL5P7Xi3nEJEjiLtz18R5FBaV8tK1J9K88d7n4EXk8BzqKHoRkUP20vRVTJq7hlvP6k5O55ZBxxFJSnEv8GaWYmazzGxS+PH9ZjbXzGab2btm1iGibV8z+8zMFpjZPDOr+VoaEakzvly3lXveWMApx7Ti+tOODjqOSNIKogd/M7Ao4vE4d+/r7v2BScDdAGZWHxgPXOfuvYAhgGbME6nDdpRV8MMXZ9GkQX0euaS/Vn8TqUVxLfBmlk1outsnq7aFl52t0oT/DeY7G5jr7nPC7b529z3nshSROuVXby5k8dqtPHxJP9o20wE5kdp0yOvBH6JHgTuAppEbzewB4CqgGBga3twdcDObDLQBJrj7b+OYVURi6O15axg/dSXXnnoUQ3q0DTqOSNIz9/hc/WZmw4Fh7n6DmQ0BbnP34dXajAEaufs9ZnYbcCNwPLAdeB/4ubu/X+011wDXAGRmZuZMmDAhprlLSkpIT0+sqfYTLVOi5QFlila8Mm3YXsndn5bSvkk97jqhEfX3c2j+SP6cDoYyRSfRMtVGnqFDh85w90F7PeHucbkBY4ECIB9YS6hoj6/WpjMwP3z/MkLL0lY99wvg9v29R05OjsfalClTYr7Pw5VomRItj7syRSsemXaVV/i3/vyx9777HV+xcVtCZDpYyhQdZTqw2sgDTPcaamLczsG7+xh3z3b3LuHi/YG7jzazbhHNRgCLw/cnA33NrHF4wN1pwMJ45RWR2Hj0vSXMXFnEry/qQ6dWjYOOI3LEiPc5+Jo8aGY9gEpgBXAdgLtvNrPfAV8QGnj3lru/GVxMETlYH3+5kcdyl3HZ8R25oF+HA79ARGImkALv7rlAbvj+qP20G0/oUjkRqWM2bN3Jj1+azdFt0rnngl5BxxE54iRCD15EkkxlpfOTl+ewpbSM534wmLQGKUFHEjniaKpaEYm5v330FR8u2cAvhvfk2HbNgo4jckRSgReRmJq9qohxk/M4r3c7rjihU9BxRI5YKvAiEjOhJWBnktmsEQ9epCVgRYKkc/AiEhPuzpiJ81hdtIOXrj1JS8CKBEw9eBGJiX9+sYo3dy8B2yLoOCJHPBV4ETlsX67byr3/XsA3jmmtJWBFEoQKvIgclh1lFdz0wizSG9bnd5f20xKwIglC5+BF5LDcP2kheeu28o+rB9O2qZaAFUkU6sGLyCF7a94anv98JdeedhSndW8TdBwRiaACLyKHZNWm7dz5ylz6dczgtrN7BB1HRKpRgReRg1ZWUcnNE2aBwx8vG0Bqiv6UiCQanYMXkYP2yH9CS8D+8fIBWgJWJEHpa7eIHJSPvtzA4/9dxuWDtQSsSCJTgReRqG3YupMf/3MOx7RJ5+7hWgJWJJHpEL2IRKWy0rn1pdls3VHG8/93gpaAFUlw6sGLSFT+9tFXfPTlRu6+oCc92jUNOo6IHIAKvIgc0KyVmxk3OY9hfdrxncFaAlakLlCBF5H9Ki4t44cvziKzWSPGaglYkTpD5+BFZJ/cnbsmzmNN8Q5evu4kmqdpCViRukI9eBHZpwlfrOLNeWv4ydndGdhJS8CK1CUq8CJSoyXrtnJfeAnY607VErAidY0KvIjsJbQE7EwtAStSh+kcvIjs5ZeTFrJkXQnPaglYkTpLPXgR2cObc9fwQngJ2FO1BKxInaUCLyK7rdq0nZ9OnEt/LQErUufFvcCbWYqZzTKzSeHH95vZXDObbWbvmlmHau07mVmJmd0W76wiR5Kyikp+VLUE7OVaAlakrgvi/+CbgUURj8e5e1937w9MAu6u1v4R4O14hRM5Uv3uP0uYtbKIB0f1pWNLLQErUtfFtcCbWTZwPvBk1TZ33xLRpAngEe1HAl8BC+KVUeRI9OGSDTyeu4zLB3fi/L7tg44jIjEQ71H0jwJ3AHusVGFmDwBXAcXA0PC2JsCdwFmADs+L1JINW3dy60tz6J6Zzt3DewYdR0RixNz9
wK1i8UZmw4Fh7n6DmQ0BbnP34dXajAEaufs9ZvYQMM3dXzKze4ESd3+ohv1eA1wDkJmZmTNhwoSY5i4pKSE9PT2m+zxciZYp0fKAMh3Ip6vLeGVJGV/vqCS1nlFRCb88JY3spsGfd0+kz6mKMkVHmQ6sNvIMHTp0hrsP2usJd4/LDRgLFAD5wFpgOzC+WpvOwPzw/Y/CbfOBImATcNP+3iMnJ8djbcqUKTHf5+FKtEyJlsddmfbn1ZkFfuzP3/bOd07afTvmrjf91ZkFQUdz98T5nCIpU3SU6cBqIw8w3WuoiXH7uu7uY9w92927AJcBH7j7aDPrFtFsBLA43P6b7t4l3P5R4Nfu/qd45RVJVuMm51FaVrHHtrIKZ9zkvIASiUhtSISZ7B40sx5AJbACuC7gPCJJq6yiksKi0hqfW72P7SJSNwVS4N09F8gN3x8VRft7a0acYCYAAByhSURBVDeRSHJbv2UHL0xbyQufr9xnmw4ZaXFMJCK1LRF68CJSC9yd6Ss2849P83ln/lrKK53Turfhwv7pjJ+6gtKyyt1t01JTuP0czVwnkkxU4EWSzPZd5bw+ezXPfraCRWu20LRRfb57chdGn9iZrq2bANCrQ3PGTc6jsKiUrIw0bj+nByMHZAWcXERiSQVeJEnkb9zGc1NX8PL0VWzZUc6x7Zoy9qI+XNi/A40b7Pm/+sgBWYwckEVubi5DhgwJJrCI1CoVeJE6rLLSyV2ynmc/W0Fu3gbq1zPO7d2Oq07qwvFdWmCmddxFjlQq8CJ1UNH2Xbw8vYDnpq5g5abttGnakFvO7MblgzuR2Uzrt4uICnyd8tqswv+dN536gc6bHoHmFxbz3GcreG12ITvLKxncpSW3n9ODc3q1o0H94GehE5HEoQJfR7w2q5AxE+ftnqCksKiUMRPnAajIJ7ld5ZW8PX8Nz362ghkrNpOWmsJFA7O48sQu9OzQLOh4IpKgVODriLFvL9pr9rHSsgru+/cCenVoxlFt0kmpp/OtyWRt8Q5e+HwFL0xbxcaSnXRp1Zifn38cF+d0pHnj1KDjiUiCU4FPcNt2lvPnKUtZt2Vnjc9v3l7GWY98SOMGKfTq0Iw+WRn0zW5O76zmHNW6CfVU9OsUd+fz5Zt49rN8Ji9YR6U7Q3u05aqTOnNqtzb6fYpI1FTgE5S78/rs1Yx9exHrtuwkLTVlrx48QNumDbnz3GOZV1jM3IIiXpi2gqc/CU1gkt6wPj07NKNvVnP6ZDenT1ZzurRS0U9E23aW8+qsQp79LJ8l60ponpbKD77RldEndKZTq8ZBxxOROkgFPgHNKyjm3n8vYMaKzfTJas5jVwxk1abSPc7BQ2j2sbuGHcfIAVmMyskGoLyikqUbSphXUBwu+sU8N3UFO8tDRb9pw/r0ympG3+wMemc1p29Wczq3aqzLqQKybEMJz322gldmFLB1Zzm9OjTjt6P6ckG/DqQ1SAk6nojUYSrwCWRjyU7GvZPHSzNW0apJA347qi/fzsmmXj0jp3OozYFmH6ufUo9j2zXj2HbNuHhQRyC0wMiX60qYX1jM3MIi5hUU88wn+eyqCBf9RvXpE9HL75uVQceWaSr6MVL96oefnNWdpmmpPPtZPh99uZHUFGNYn/ZcdVIXBnbK0OcuIjGhAp8Ayioq+cen+fz+vS8pLavgB6d05UdndqNZoz0HUh3q7GOpKfXo2aEZPTs045LjQ0V/V3klS9ZtDRf9YuYXFvP0x8spq3AAmqel7lH0+2Q1J7vFnkVfl+0dWE1XP/zk5Tk40K5ZI35yVncuG9yJNk0bBhtURJKOCnzA/rtkA7/89wKWbdjGqd3bcPfwnhzTNr3W37dB/Xr0zgoNxrssvG1neQVL1pYwr7CYeYVFzC0o5m8ffkV5ZajoZzRO3V3sS8vKeeHzVbsP/euyvZrVtPa6Ay0bp/LRnUNJTdG16yJSO1TgA5K/cRu/enMh7y1aT+dWjXnyqkGccVzbQA/PNqyfEuqxZzcHOgGwo6yCvLVbQ0W/INTb/8uHX1ERLvqRSssqGDc5TwU+wr7WWN+8vUzFXURqlQp8nG3bWc6fpizlqY+Wk5pi3HnusVz9jS40rJ+YA6oapabQr2MG/Tpm7N62o6yC437xDnuX+H0XtCNVZrNGrN2yY6/tWntdRGqbCnycuDuvzS5k7FuLWb91JxcNzOLOc4+tk/OGN0pNoUNGGoU1FHMVrv8pq6gkrcHevXStvS4i8aBjhHEwt6CIUY9/yo//OYd2zRsx8YaT+d0l/etkca9y+zk9SEvd+6jD2T3bBpAmMT349mKWb9zOlSd2Iiv8xScrI42xF/XRaQwRqXXqwdeiDVt3Mm7yYl6eURC67O3bffn2wOykmGimqkBVjaLv0LwRqSnGP6cXcPkJneme2TTghMF6c+4anvp4Od87uQv3jujF/aC110UkrlTga8Gu8tBlb394P3TZ2/99oys/PGPvy97quuqX7a0t3sHwP37Mtc/N4LUbT6F5WnL9vNFaun4rd/xrDgM7ZXDXsOOCjiMiRygdoo+x3Lz1nPv7D3ngrUXkdGnB5B+fys/O75l0xb0m7Zo3Cs+6t51b/zmbyhpG2ie7bTvLuW78TBqlpvDnKwZqCVcRCYz++sRI/sZt/OCZL/je37/AHZ7+3iCe+f5gjm5T+9e0J5LBXVvyi+E9eX/xen7//pdBx4krd+fOV+by1YYS/nj5ANo314BDEQmODtEfppKd5fzpg6U8/XHosrefnncs3z8lcS97i4erTurM3IJifv/+l/TOas5ZPTODjhQXf/8kn0lz13Dnucdy8jGtg44jIkc4FfhDVFnpvDqrkN+8E7rsbdTAbO48twdt6/DI+FgxMx74Vm/y1m3h1n/O5rWbTkn6IxnT8zfx67cWcVbPTK477aig44iI6BD9oZizqohRT3zKT16eQ/uMNF694WQevqSfinuERqkpPDE6h9T69bj2uRmU7CwPOlKtWb91Bzc8P5PsFmk8fEk/LRYjIglBBf4grN+6g9tfnsOFf/6EVZtKeejifrx6/ckM6NQi6GgJKbtFY/50+QC+2lDCbS/NwT35Bt2VV1TywxdmsWVHGY+PzjkiBlOKSN2gQ/T7ELlSWofP3mdQlxZ8sHgDO8sruPbUo7jp9GNoqj/mB3TyMa25a9hx/OrNRTyWu4wbhx4TdKSYGjc5j8+Xb+J3l/TjuPbNgo4jIrJb3Au8maUA04FCdx9uZvcDFwKVwHrge+6+2szOAh4EGgC7gNvd/YN4ZKy+xOfq4h28MWcNx7Vryp+vGMhRSX4+OdZ+8I2uzCko5qF38+jVoRlDeiTHbHfvzF/DXz78itEnduKigdlBxxER2UMQh+hvBhZFPB7n7n3dvT8wCbg7vH0jcIG79wG+CzwXr4A1LfEJsGVHuYr7ITAzfjOqDz0ym/KjF2ex4uttQUc6bF9tKOG2l+fSr2MGvxjeM+g4IiJ7iWuBN7Ns4Hzgyapt7r4lokkTQstl4+6z3H11ePsCoJGZNYxHzn2tiKaV0g5d4wb1+euVgzAzrn1uBtt
31d1Bd9t3lXP9+JmkphiPXTHwiL4kUkQSV7x78I8CdxA6HL+bmT1gZquAK/hfDz7SKGCWu++s/Yj7XhFNK6Udnk6tGvOHyweQt24rd74yr04OunN3xkycx5L1W/nD5QN2LyIjIpJoLF5/ZM1sODDM3W8wsyHAbe4+vFqbMUAjd78nYlsv4A3gbHdfVsN+rwGuAcjMzMyZMGHCYWf9dHUZz8zfxa6IryEN6sH3ejfg5A7BD6wrKSkhPT1xThUcbJ5Jy3bxry/LuLRHA87rWjufZ219Ru+tKGP8ol1c1C2VEUc3SIhMh0OZoqNM0VGmA6uNPEOHDp3h7oP2esLd43IDxgIFQD6wFtgOjK/WpjMwP+JxNrAEOCWa98jJyfFYeXVmgZ889n3vfOckP3ns+/7qzIKY7ftwTZkyJegIezjYPJWVlX7dc9O9608n+SdfbkiITNGYsWKTH3PXm37136d5RUVlQmQ6XMoUHWWKjjIdWG3kAaZ7DTUxbofo3X2Mu2e7exfgMuADdx9tZt0imo0AFgOYWQbwJjDG3T+JV84qIwdk8clPT+eZc5vwyU9P1/rdMWRmjLu4H0e3SeemF2dRsHl70JEOaGPJTm4YP5P2zdP43SX9k2LJXxFJbokw0c2DZjbfzOYCZxMaZQ9wE3AM8Aszmx2+Jcf1VUJ6w/r85cocysoruW78DHbUcNVCoqiodH704iw2b9/F46MH0rxx8KdpREQOJJAC7+65Hj7/7u6j3L23hy6Vu8DdC8Pbf+XuTdy9f8RtfRB5pXYc1SadRy/rz/zCLdz1auIOunv43Tw+XfY194/sTa8OzYOOIyISlUTowcsR7IzjMrnlzG5MnFnIs5+tCDrOXt5dsJbHcpdx+eCOXDKoY9BxRESipgIvgfvR6d0487i23D9pIdOWbwo6zm75G7fxk5fm0CerOfdc0CvoOCIiB0UFXgJXr57xu0v706llY254fgZri3cEHYnSXRVcN34GKeHJbBqlajIbEalbVOAlITRrlMpfrszZXVh3lgc36M7d+dlr88hbt5VHL+1Px5aNA8siInKoVOAlYXTLbMrDl/Rj9qoi7n1jQWA5nv98JRNnFnLzGd2SZmEcETnyqMBLQjm3d3tuGHI0L05bxQufr4z7+89eVcQv/72Q07q34UendzvwC0REEpQKvCScn5zdg1O7t+GeN+Yzc+XmuL3vpm27uGH8DNo0bcijl2oyGxGp21TgJeGk1DP+cFl/2jdP4/rxM1i/tfYH3VVUOjdPmMXGkl08MTqHFk0Obp55EZFEowIvCSmjcQP+cmUOW0rLufH5mewqrzzwiw7D799bwkdfbuS+C3vRJ1uT2YhI3acCLwnruPbN+M23+/JF/mYeeHNhrb3PB4vX8YcPlnJxTjaXHa/JbEQkOdQPOoDI/ozo14F5BUX87aPl9MnO4Ns52THd/8qvt3PLhNn0bN+M+0f2xkzn3UUkOagHLwnvznOP5eSjW3HXq/OYV1Acs/3uKKvg+udnAPDE6BxNZiMiSUUFXhJe/ZR6/PHyAbRJb8i1z03n65KdMdnv3a/PZ8HqLTxyaX86tdJkNiKSXFTgpU5old6QJ0bnsHHbLm56YRblFYc36G7CtJW8NL2AH55+DGcclxmjlCIiiUMFXuqMPtnNGfutPnz21dc8+PbiQ97PvIJi7n5jAd/s1ppbzuwew4QiIolDg+ykThmVk83cgiKe/Hg5fbKbc2H/rIN6fdH2XVz//AxaN2nA7y8bQIomsxGRJKUevNQ5Px/ek8FdWnLnK3NZuHpL1K+rrHRu+eds1m3ZwWOjc2ipyWxEJImpwEudk5pSjz9dMYDmaalcO346Rdt3RfW6P3zwJbl5G7j7gl7075hRyylFRIKlAi91UtumjXh8dA7rinfywxdnUVHp+22fm7ee37//JRcNyGL0CZ3ilFJEJDgq8FJnDezUgvsu7MVHX27k4Xfz9tlu1abt3PLP2fTIbMoD3+qjyWxE5IigAi912uWDO3H54E48lruMt+et2ev5HWUV3PD8TCoqnMdH55DWQJPZiMiRQQVe6rx7R/RkQKcMfvLyHJas27rHc/f9eyHzCot5+JJ+dG3dJKCEIiLxp8vkpM5rWD+Fx6/IYfgfP+Y7f5tKako91hTvIOPDdynaXsZ1px3N2b3aBR1TRCSu1IOXpNCueSO+c0JHNpbsYk1xaP34ou1l1DPo3lY9dxE58qjAS9J4ZUbhXtsqHR7+z5cBpBERCZYKvCSN1UWlB7VdRCSZxb3Am1mKmc0ys0nhx/eb2Vwzm21m75pZh4i2Y8xsqZnlmdk58c4qdUuHjLSD2i4iksyC6MHfDCyKeDzO3fu6e39gEnA3gJn1BC4DegHnAo+Zma5xkn26/ZwepFVb0z0tNYXbz+kRUCIRkeDEtcCbWTZwPvBk1TZ3j5xMvAlQNSXZhcAEd9/p7suBpcDgeGWVumfkgCzGXtSHrHCPPSsjjbEX9WHkgINbkEZEJBnE+zK5R4E7gKaRG83sAeAqoBgYGt6cBUyNaFYQ3iayTyMHZDFyQBa5ubkMGTIk6DgiIoEx9/3P4R2zNzIbDgxz9xvMbAhwm7sPr9ZmDNDI3e8xsz8Dn7n7+PBzTwFvufsr1V5zDXANQGZmZs6ECRNimrukpIT09PSY7vNwJVqmRMsDyhQtZYqOMkVHmQ6sNvIMHTp0hrsP2usJd4/LDRhLqBeeD6wFtgPjq7XpDMwP3x8DjIl4bjJw0v7eIycnx2NtypQpMd/n4Uq0TImWx12ZoqVM0VGm6CjTgdVGHmC611AT43YO3t3HuHu2u3chNHjuA3cfbWbdIpqNABaH778BXGZmDc2sK9ANmBavvCIiInVZIkxV+6CZ9QAqgRXAdQDuvsDMXgIWAuXAje5eEVxMERGRuiOQAu/uuUBu+P6o/bR7AHggPqlERESSh2ayExERSUIq8CIiIklIBV5ERCQJxe06+Hgwsw2EBurFUmtgY4z3ebgSLVOi5QFlipYyRUeZoqNMB1YbeTq7e5vqG5OqwNcGM5vuNU0gEKBEy5RoeUCZoqVM0VGm6CjTgcUzjw7Ri4iIJCEVeBERkSSkAn9gfw06QA0SLVOi5QFlipYyRUeZoqNMBxa3PDoHLyIikoTUgxcREUlCKvD7YWYpZjbLzCYFnQXAzPLNbJ6ZzTaz6UHnATCzDDP7l5ktNrNFZnZSwHl6hD+fqtsWM7slyEzhXD82swVmNt/MXjSzRgmQ6eZwngVBfUZm9rSZrTez+RHbWprZf8zsy/C/LRIg08Xhz6nSzOI+InsfmcaF/7+ba2avmllGwHnuD2eZbWbvmlmHeOXZV6aI524zMzez1kFnMrN7zaww4m/UsNp6fxX4/bsZWBR0iGqGunv/BLrs4/fAO+5+LNCPgD8vd88Lfz79gRxCyxK/GmQmM8sCfgQMcvfeQAqhFRWDzNQb+H/AYEK/t+HVVnaMl2eAc6tt+ynwvrt3A94PPw4603zgIuDDOGep8gx7Z/oP0Nvd+wJLCC2xHWSece7eN/z/3iTg7jjm2VcmzKwjcB
awMs55YB+ZgEeq/k65+1u19eYq8PtgZtnA+cCTQWdJVGbWDDgVeArA3Xe5e1GwqfZwBrDM3WM9+dGhqA+kmVl9oDGwOuA8xwFT3X27u5cD/wW+Fe8Q7v4hsKna5guBf4Tv/wMYGXQmd1/k7nnxzFHt/WvK9G74dwcwFcgOOM+WiIdNgLgO8NrHf0sAjwB3xDsP7DdTXKjA79ujhP6jqAw6SAQH3jWzGWZ2TdBhgKOADcDfw6cynjSzJkGHinAZ8GLQIdy9EHiIUA9iDVDs7u8Gm4r5wKlm1srMGgPDgI4BZ6qS6e5rAML/tg04T11wNfB20CHM7AEzWwVcQfx78DXlGQEUuvucoLNUc1P4dMbTtXkKSgW+BmY2HFjv7jOCzlLNKe4+EDgPuNHMTg04T31gIPC4uw8AthH/w6k1MrMGwAjg5QTI0oJQr7Qr0AFoYmajg8zk7ouA3xA6zPsOMAco3++LJCGZ2c8I/e6eDzqLu//M3TuGs9wUZJbwF9efkQBfNKp5HDga6E/oC//DtfVGKvA1OwUYYWb5wATgdDMbH2wkcPfV4X/XEzqvPDjYRBQABe7+efjxvwgV/ERwHjDT3dcFHQQ4E1ju7hvcvQyYCJwccCbc/Sl3H+jupxI6jPhl0JnC1plZe4Dwv+sDzpOwzOy7wHDgCk+sa55fAEYFnOFoQl+q54T/lmcDM82sXZCh3H2du1e4eyXwN2rx77gKfA3cfYy7Z7t7F0KHeT9w90B7XGbWxMyaVt0HziZ0mDUw7r4WWGVmPcKbzgAWBhgp0uUkwOH5sJXAiWbW2MyM0OcU+OBNM2sb/rcToQFkifJ5vQF8N3z/u8DrAWZJWGZ2LnAnMMLdtydAnshBmiOAxUFlAXD3ee7e1t27hP+WFwADw3+3AlP15TXsW9Ti3/H6tbVjiblM4NVQfaA+8IK7vxNsJAB+CDwfPiT+FfD9gPNUHZo7C7g26CwA7v65mf0LmEnoUOosEmN2rVfMrBVQBtzo7pvjHcDMXgSGAK3NrAC4B3gQeMnMfkDoy9HFCZBpE/BHoA3wppnNdvdzAs40BmgI/Cf8d2Gqu18XYJ5h4S/7lYRW9YxLlv1lcven4pkhmkzAEDPrT2hMVT61+HdKM9mJiIgkIR2iFxERSUIq8CIiIklIBV5ERCQJqcCLiIgkIRV4ERGRJKQCLyIikoRU4EUOk5k9YzUsKWxmg8JLVHaJf6oj175+HzHcf5fw73VQxLbGZvaOmS0PaFU+kb1oohuRJBdewa4iXlOZmllqeEreOi3azy281sCbQFNC60UEvVKgCKAevEhcWMhSM7ut2vZu4d7gwPBjN7ObzOxNM9tuZiuqL0xjZllmNsHMNodvb0b2Gs3sXjObb2bfM7NlwE5Cy3dWz5RrZk+Y2e8j9jXOzOpFtBltZl+Y2VYzW29mL4fXt696fkg48zAzm2Zmu4BzzOxoM3vdzNaa2TYzmxlexCny/fPN7O5wj3urma0ys0vNLCP885WY2Zdmdna11/UM/8xVmV6sml/czO4lNL3t+eFcbmZDYvm5VcvSgf+tEX+qirskEhV4kTgI9wKfIrSsZ6SrgdnuPjNi232E5mPvT2hK22erDgeHp+GdAuwATgNOIrQi1Xvh56p0Bb5DaJrXfuH2NbmC0N+BkwhNmXkNcEvE8w0ITa/Zj9CiJq2pec763wA/B44FPgfSCS1felb4ta8AE83s2GqvuwWYRmiRopcIrf/+AvBW+Of/EBhvZo3CP3/78Lb5hBbpODP8Xm+Ev5g8FN7Pe0D78O3TWvjcAI4BPiE0x/mZQUz1K7Jf7q6bbrodxg14htAc8yXVbtsJzTfdJdyuHaF5308MP04BCoGbIvblwN+q7f89YHz4/tWEVn2ziOdTgK+BS8KP7w2/T+YBcucCS6rt6+eEVgjc12uODWfMDj8eEn48KorPaSrw84jH+cCLEY/Tw/v6Q8S2LuFtg8KPfwm8X22/LcJtBkf8PiZVaxPLz60q007gUyA16P8GddOtppt68CKx8SGhHmfk7TuRDTy0itUk/teLPxdoxd7reH9Ww+Oe4fs5hHqZW8OHsEuAYkJF7uiI1xR4eKlcM/tmVdvw7YqIdlPdPfIc82dAlpk1C792YPhQ+woz2wpMD7frVC3j9MgHFlr98LdmtjB8OLwEGFTD6+ZGfD5VX4rmRTxftdxv24if/9TInwdYFX4u8uev7qA/tyi8TugowmVRtheJKw2yE4mN7e6+NHKDmWXU0O5J4AUzu4VQoZ/oB3dotx4wm5qLyqaI+9si7k8n9IWjSlQFzELLEk8mdAThSkLrsrcGPiJ06D7StmqPHyL0BeY2Qj3n7cCzNbyu+mA8r7at6stHvYh/3wzvt7r9/VyH8rkdyG8JfbbPmFmKuz9zEK8VqXUq8CLx9Q6whdBSmhcAw2pocyLwdLXHVevHzyS01v1Gdy+K5g3dvRRYuo+nTzAzi+jFnwisdvctZpZDqKDf5e7LAczsomjeE/gG8Ky7vxJ+XSNCPeUlUb5+X2YClwArfN8j9XcROvxe/XUH9blFw91/a2ZlwFNmVt/dn4zVvkUOlw7Ri8SRu1cQKt5jCZ1/f7+GZheZ2f8Lj7AfA5wBPBp+7nlCPdXXzew0M+tqZqea2cN2aNdfdwAeNbMeZvZt4HbgkfBzKwmdZ77JzI4ys/OB+6Pc7xLgW+FD/H2A8UCjQ8hX3Z+B5sA/zeyEcK4zzeyvZtY03CYf6B3+mVqbWSqx/9x2c/dHgJuBv5hZra3tLXKwVOBF4u9pQoeq/17t/HeVe4FRhM5PXw98392/AHD37cCpwFfAy8BiQiPPWwCHMor7eUK93c+BvxEa6f9I+L02ELrkbCSwkNBo+luj3O+thA7pf0RoNP3U8P3D4qHL0E4BKgkdDVlAqOjvDN8I/xyLCB0+30Do2vRYf27Vc/0JuBF4zMxuONz9icSC1fz3RURqi5mdQOjyqqPcfWW15xy42N3/FYccucB8d7+ptt9LROJP5+BF4sTMGgIdgV8Br1Yv7iIisaRD9CLxczmQR+jSuGgPdYuIHBIdohcREUlC6sGLiIgkIRV4ERGRJKQCLyIikoRU4EVERJKQCryIiEgSUoEXERFJQv8fvYtmdBjFjMIAAAAASUVORK5CYII=\n", 192 | "text/plain": [ 193 | "
" 194 | ] 195 | }, 196 | "metadata": { 197 | "needs_background": "light" 198 | }, 199 | "output_type": "display_data" 200 | } 201 | ], 202 | "source": [ 203 | "fig, ax = plt.subplots(figsize=(8,5))\n", 204 | "ax.plot(candidates, mae_metrics, \"o-\")\n", 205 | "ax.set_xlabel('Hyper-parameter K', fontsize=14)\n", 206 | "ax.set_ylabel('MAE', fontsize=14)\n", 207 | "ax.set_xticks(candidates)\n", 208 | "ax.grid();" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "#### Recalculating train-set split" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 10, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=7)\n", 225 | "scaler = StandardScaler()\n", 226 | "scaler.fit(X_train[numerical_features])\n", 227 | "X_train.loc[:, numerical_features] = scaler.fit_transform(X_train[numerical_features])\n", 228 | "X_test.loc[:, numerical_features] = scaler.transform(X_test[numerical_features])" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "#### Optimizing with cross-validation" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "from sklearn.model_selection import cross_val_score\n", 245 | "candidates = np.arange(4,16)\n", 246 | "mean_mae = []\n", 247 | "std_mae = []\n", 248 | "for k in candidates:\n", 249 | " model = KNeighborsRegressor(n_neighbors=k, weights='distance', metric='minkowski', leaf_size=50, n_jobs=4)\n", 250 | " cv_results = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=10)\n", 251 | " mean_score, std_score = -1*cv_results.mean(), cv_results.std()\n", 252 | " mean_mae.append(mean_score)\n", 253 | " std_mae.append(std_score)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": { 260 | "scrolled": true 261 | }, 262 | "outputs": [], 263 | "source": [ 264 | "fig, ax = plt.subplots(figsize=(8,5))\n", 265 | "ax.plot(candidates, mean_mae, \"o-\")\n", 266 | "ax.set_xlabel('Hyper-parameter K', fontsize=14)\n", 267 | "ax.set_ylabel('Mean MAE', fontsize=14)\n", 268 | "ax.set_xticks(candidates)\n", 269 | "ax.grid();" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "fig, ax = plt.subplots(figsize=(8,5))\n", 279 | "ax.plot(candidates, std_mae, \"o-\")\n", 280 | "ax.set_xlabel('Hyper-parameter K', fontsize=14)\n", 281 | "ax.set_ylabel('Standard deviation of MAE', fontsize=14)\n", 282 | "ax.set_xticks(candidates)\n", 283 | "ax.grid();" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "# Improving Performance" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": {}, 296 | "source": [ 297 | "## Improving our diamond price predictions" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "### Fitting a neural network" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "from keras.models import Sequential\n", 314 | "from keras.layers import Dense\n", 315 | "\n", 316 | "n_input = X_train.shape[1]\n", 317 | "n_hidden1 = 32\n", 318 | "n_hidden2 = 16\n", 319 | "n_hidden3 = 8\n", 320 | 
"\n", 321 | "nn_reg = Sequential()\n", 322 | "nn_reg.add(Dense(units=n_hidden1, activation='relu', input_shape=(n_input,)))\n", 323 | "nn_reg.add(Dense(units=n_hidden2, activation='relu'))\n", 324 | "nn_reg.add(Dense(units=n_hidden3, activation='relu'))\n", 325 | "# output layer\n", 326 | "nn_reg.add(Dense(units=1, activation=None))" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": { 333 | "scrolled": true 334 | }, 335 | "outputs": [], 336 | "source": [ 337 | "batch_size = 32\n", 338 | "n_epochs = 40\n", 339 | "nn_reg.compile(loss='mean_absolute_error', optimizer='adam')\n", 340 | "nn_reg.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_size, validation_split=0.05)" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "y_pred = nn_reg.predict(X_test).flatten()\n", 350 | "mae_neural_net = mean_absolute_error(y_test, y_pred)\n", 351 | "print(\"MAE Neural Network: {:0.2f}\".format(mae_neural_net))" 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "metadata": {}, 357 | "source": [ 358 | "### Transforming the target" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "diamonds['price'].hist(bins=25, ec='k', figsize=(8,5))\n", 368 | "plt.title(\"Distribution of diamond prices\", fontsize=16)\n", 369 | "plt.grid(False);" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "y_train = np.log(y_train)\n", 379 | "pd.Series(y_train).hist(bins=25, ec='k', figsize=(8,5))\n", 380 | "plt.title(\"Distribution of log diamond prices\", fontsize=16)\n", 381 | "plt.grid(False);" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "nn_reg = Sequential()\n", 391 | "nn_reg.add(Dense(units=n_hidden1, activation='relu', input_shape=(n_input,)))\n", 392 | "nn_reg.add(Dense(units=n_hidden2, activation='relu'))\n", 393 | "nn_reg.add(Dense(units=n_hidden3, activation='relu'))\n", 394 | "# output layer\n", 395 | "nn_reg.add(Dense(units=1, activation=None))" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": { 402 | "scrolled": true 403 | }, 404 | "outputs": [], 405 | "source": [ 406 | "batch_size = 32\n", 407 | "n_epochs = 40\n", 408 | "nn_reg.compile(loss='mean_absolute_error', optimizer='adam')\n", 409 | "nn_reg.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_size, validation_split=0.05)" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "y_pred = nn_reg.predict(X_test).flatten()\n", 419 | "y_pred = np.exp(y_pred)\n", 420 | "mae_neural_net2 = mean_absolute_error(y_test, y_pred)\n", 421 | "print(\"MAE Neural Network (modified target): {:0.2f}\".format(mae_neural_net2))" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [ 430 | "100*(mae_neural_net - mae_neural_net2)/mae_neural_net2" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": {}, 436 | "source": [ 437 | "#### Analyzing the results" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "metadata": {}, 444 
| "outputs": [], 445 | "source": [ 446 | "fig, ax = plt.subplots(figsize=(8,5))\n", 447 | "residuals = y_test - y_pred\n", 448 | "ax.scatter(y_test, residuals, s=3)\n", 449 | "ax.set_title('Residuals vs. Observed Prices', fontsize=16)\n", 450 | "ax.set_xlabel('Observed prices', fontsize=14)\n", 451 | "ax.set_ylabel('Residuals', fontsize=14)\n", 452 | "ax.grid();" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": null, 458 | "metadata": {}, 459 | "outputs": [], 460 | "source": [ 461 | "mask_7500 = y_test <=7500\n", 462 | "mae_neural_less_7500 = mean_absolute_error(y_test[mask_7500], y_pred[mask_7500])\n", 463 | "print(\"MAE considering price <= 7500: {:0.2f}\".format(mae_neural_less_7500))" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": {}, 470 | "outputs": [], 471 | "source": [ 472 | "fig, ax = plt.subplots(figsize=(8,5))\n", 473 | "percent_residuals = (y_test - y_pred)/y_test\n", 474 | "ax.scatter(y_test, percent_residuals, s=3)\n", 475 | "ax.set_title('Pecent residuals vs. Observed Prices', fontsize=16)\n", 476 | "ax.set_xlabel('Observed prices', fontsize=14)\n", 477 | "ax.set_ylabel('Pecent residuals', fontsize=14)\n", 478 | "ax.axhline(y=0.15, color='r'); ax.axhline(y=-0.15, color='r'); \n", 479 | "ax.grid();" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": null, 485 | "metadata": {}, 486 | "outputs": [], 487 | "source": [] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": null, 492 | "metadata": {}, 493 | "outputs": [], 494 | "source": [] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": null, 499 | "metadata": {}, 500 | "outputs": [], 501 | "source": [] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [] 509 | } 510 | ], 511 | "metadata": { 512 | "kernelspec": { 513 | "display_name": "Python 3", 514 | "language": "python", 515 | "name": "python3" 516 | }, 517 | "language_info": { 518 | "codemirror_mode": { 519 | "name": "ipython", 520 | "version": 3 521 | }, 522 | "file_extension": ".py", 523 | "mimetype": "text/x-python", 524 | "name": "python", 525 | "nbconvert_exporter": "python", 526 | "pygments_lexer": "ipython3", 527 | "version": "3.6.10" 528 | } 529 | }, 530 | "nbformat": 4, 531 | "nbformat_minor": 2 532 | } 533 | -------------------------------------------------------------------------------- /Chapter09/Model/diamond-prices-model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Predictive-Analytics-with-Python/e049d9d2870f596892b62aeddec312788d9d0c2c/Chapter09/Model/diamond-prices-model.h5 -------------------------------------------------------------------------------- /Chapter09/Model/pca.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Predictive-Analytics-with-Python/e049d9d2870f596892b62aeddec312788d9d0c2c/Chapter09/Model/pca.joblib -------------------------------------------------------------------------------- /Chapter09/Model/scaler.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Predictive-Analytics-with-Python/e049d9d2870f596892b62aeddec312788d9d0c2c/Chapter09/Model/scaler.joblib 
-------------------------------------------------------------------------------- /Chapter09/dash-example-no-user-inputs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Alvaro Fuentes 4 | Chapter 9. Hands-On Predictive Analytics with Python 5 | Building a basic static app 6 | """ 7 | ## imports 8 | import dash 9 | import dash_core_components as dcc 10 | import dash_html_components as html 11 | import plotly.graph_objs as go 12 | import pandas as pd 13 | import os 14 | 15 | ## Importing the dataset 16 | DATA_DIR = '../data' 17 | FILE_NAME = 'diamonds.csv' 18 | data_path = os.path.join(DATA_DIR, FILE_NAME) 19 | diamonds = pd.read_csv(data_path) 20 | 21 | ## Creating the app 22 | app = dash.Dash(__name__) 23 | 24 | # Creating a Plotly figure 25 | trace = go.Histogram( 26 | x = diamonds['price'] 27 | ) 28 | 29 | layout = go.Layout( 30 | title = 'Diamond Prices', 31 | xaxis = dict(title='Price'), 32 | yaxis = dict(title='Count') 33 | ) 34 | 35 | figure = go.Figure( 36 | data = [trace], 37 | layout = layout 38 | ) 39 | 40 | app.layout = html.Div([ 41 | html.H1('My first Dash App'), 42 | html.H2('Histogram of diamond prices'), 43 | html.P('This is some normal text, we can use it to describe something about the application.'), 44 | dcc.Graph(id='my-histogram', figure=figure) 45 | ]) 46 | 47 | 48 | if __name__ == '__main__': 49 | app.run_server(debug=True) 50 | -------------------------------------------------------------------------------- /Chapter09/dash-example-user-inputs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Alvaro Fuentes 4 | Chapter 9. Hands-On Predictive Analytics with Python 5 | Building a basic interactive app 6 | """ 7 | ## imports 8 | import dash 9 | import dash_core_components as dcc 10 | import dash_html_components as html 11 | from dash.dependencies import Input, Output 12 | import plotly.graph_objs as go 13 | import pandas as pd 14 | import os 15 | 16 | ## Importing the dataset 17 | DATA_DIR = '../data' 18 | FILE_NAME = 'diamonds.csv' 19 | data_path = os.path.join(DATA_DIR, FILE_NAME) 20 | diamonds = pd.read_csv(data_path) 21 | diamonds = diamonds.sample(n=2000) 22 | 23 | 24 | app = dash.Dash(__name__) 25 | 26 | app.css.append_css({ 27 | 'external_url': 'https://codepen.io/chriddyp/pen/bWLwgP.css' 28 | }) 29 | 30 | numerical_features = ['price','carat','depth','table','x','y','z'] 31 | options_dropdown = [{'label':x.upper(), 'value':x} for x in numerical_features] 32 | 33 | dd_x_var = dcc.Dropdown( 34 | id='x-var', 35 | options = options_dropdown, 36 | value = 'carat' 37 | ) 38 | 39 | div_x_var = html.Div( 40 | children=[html.H4('Variable for x axis: '), dd_x_var], 41 | className="six columns" 42 | ) 43 | 44 | 45 | dd_y_var = dcc.Dropdown( 46 | id='y-var', 47 | options = options_dropdown, 48 | value = 'price' 49 | ) 50 | 51 | div_y_var = html.Div( 52 | children=[html.H4('Variable for y axis: '), dd_y_var], 53 | className="six columns" 54 | ) 55 | 56 | app.layout = html.Div(children=[ 57 | html.H1('Adding interactive controls'), 58 | html.H2('Interactive scatter plot example'), 59 | html.Div( 60 | children=[div_x_var, div_y_var], 61 | className="row" 62 | ), 63 | dcc.Graph(id='scatter') 64 | ]) 65 | 66 | 67 | @app.callback( 68 | Output(component_id='scatter', component_property='figure'), 69 | [Input(component_id='x-var', component_property='value'), Input(component_id='y-var', 
component_property='value')]) 70 | def scatter_plot(x_col, y_col): 71 | trace = go.Scatter( 72 | x = diamonds[x_col], 73 | y = diamonds[y_col], 74 | mode = 'markers' 75 | ) 76 | 77 | layout = go.Layout( 78 | title = 'Scatter plot', 79 | xaxis = dict(title = x_col.upper()), 80 | yaxis = dict(title = y_col.upper()) 81 | ) 82 | 83 | output_plot = go.Figure( 84 | data = [trace], 85 | layout = layout 86 | ) 87 | 88 | return output_plot 89 | 90 | 91 | if __name__ == '__main__': 92 | app.run_server(debug=True) -------------------------------------------------------------------------------- /Chapter09/diamonds-model-training.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Alvaro Fuentes 4 | Chapter 9. Hands-On Predictive Analytics with Python 5 | Producing the predictive model's objects 6 | """ 7 | ## Imports 8 | import numpy as np 9 | import pandas as pd 10 | import os 11 | from keras.models import Sequential 12 | from keras.layers import Dense 13 | from sklearn.externals import joblib 14 | 15 | ## Loading the dataset 16 | DATA_DIR = '../data' 17 | FILE_NAME = 'diamonds.csv' 18 | data_path = os.path.join(DATA_DIR, FILE_NAME) 19 | diamonds = pd.read_csv(data_path) 20 | 21 | 22 | ## Preparing the dataset 23 | diamonds = diamonds.loc[(diamonds['x']>0) | (diamonds['y']>0)] 24 | diamonds.loc[11182, 'x'] = diamonds['x'].median() 25 | diamonds.loc[11182, 'z'] = diamonds['z'].median() 26 | diamonds = diamonds.loc[~((diamonds['y'] > 30) | (diamonds['z'] > 30))] 27 | diamonds = pd.concat([diamonds, pd.get_dummies(diamonds['cut'], prefix='cut', drop_first=True)], axis=1) 28 | diamonds = pd.concat([diamonds, pd.get_dummies(diamonds['color'], prefix='color', drop_first=True)], axis=1) 29 | diamonds = pd.concat([diamonds, pd.get_dummies(diamonds['clarity'], prefix='clarity', drop_first=True)], axis=1) 30 | 31 | ## Dimensionality reduction 32 | from sklearn.decomposition import PCA 33 | pca = PCA(n_components=1, random_state=123) 34 | diamonds['dim_index'] = pca.fit_transform(diamonds[['x','y','z']]) 35 | diamonds.drop(['x','y','z'], axis=1, inplace=True) 36 | 37 | ## Creating X and y 38 | X = diamonds.drop(['cut','color','clarity','price'], axis=1) 39 | y = np.log(diamonds['price']) 40 | 41 | ## Standarization: centering and scaling 42 | numerical_features = ['carat', 'depth', 'table', 'dim_index'] 43 | from sklearn.preprocessing import StandardScaler 44 | scaler = StandardScaler() 45 | X.loc[:, numerical_features] = scaler.fit_transform(X[numerical_features]) 46 | 47 | ## Building the neural network 48 | n_input = X.shape[1] 49 | n_hidden1 = 32 50 | n_hidden2 = 16 51 | n_hidden3 = 8 52 | 53 | nn_reg = Sequential() 54 | nn_reg.add(Dense(units=n_hidden1, activation='relu', input_shape=(n_input,))) 55 | nn_reg.add(Dense(units=n_hidden2, activation='relu')) 56 | nn_reg.add(Dense(units=n_hidden3, activation='relu')) 57 | # output layer 58 | nn_reg.add(Dense(units=1, activation=None)) 59 | 60 | ## Training the neural network 61 | batch_size = 32 62 | n_epochs = 40 63 | nn_reg.compile(loss='mean_absolute_error', optimizer='adam') 64 | nn_reg.fit(X, y, epochs=n_epochs, batch_size=batch_size) 65 | 66 | ## Serializing: 67 | # PCA 68 | joblib.dump(pca, './Model/pca.joblib') 69 | 70 | # Scaler 71 | joblib.dump(scaler, './Model/scaler.joblib') 72 | 73 | # Trained model 74 | nn_reg.save("./Model/diamond-prices-model.h5") -------------------------------------------------------------------------------- /Chapter09/predict-diamond-prices.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Alvaro Fuentes 4 | Chapter 9. Hands-On Predictive Analytics with Python 5 | Building the web application 6 | """ 7 | 8 | import dash 9 | import dash_core_components as dcc 10 | import dash_html_components as html 11 | from dash.dependencies import Input, Output 12 | 13 | from keras.models import load_model 14 | from sklearn.externals import joblib 15 | 16 | import numpy as np 17 | import pandas as pd 18 | 19 | app = dash.Dash(__name__) 20 | app.css.append_css({ 21 | 'external_url': 'https://codepen.io/chriddyp/pen/bWLwgP.css' 22 | }) 23 | 24 | model = load_model('./Model/diamond-prices-model.h5') 25 | pca = joblib.load('./Model/pca.joblib') 26 | scaler = joblib.load('./Model/scaler.joblib') 27 | model._make_predict_function() 28 | 29 | ## Div for carat 30 | input_carat = dcc.Input( 31 | id='carat', 32 | type='number', 33 | value=0.7) 34 | 35 | div_carat = html.Div( 36 | children=[html.H3('Carat:'), input_carat], 37 | className="four columns" 38 | ) 39 | 40 | ## Div for depth 41 | input_depth = dcc.Input( 42 | id='depth', 43 | placeholder='', 44 | type='number', 45 | value=60) 46 | 47 | div_depth = html.Div( 48 | children=[html.H3('Depth:'), input_depth], 49 | className="four columns" 50 | ) 51 | 52 | ## Div for table 53 | input_table = dcc.Input( 54 | id='table', 55 | placeholder='', 56 | type='number', 57 | value=60) 58 | 59 | div_table = html.Div( 60 | children=[html.H3('Table:'), input_table], 61 | className="four columns" 62 | ) 63 | 64 | ## Div for x 65 | input_x = dcc.Input( 66 | id='x', 67 | placeholder='', 68 | type='number', 69 | value=5) 70 | 71 | div_x = html.Div( 72 | children=[html.H3('x value:'), input_x], 73 | className="four columns" 74 | ) 75 | 76 | ## Div for y 77 | input_y = dcc.Input( 78 | id='y', 79 | placeholder='', 80 | type='number', 81 | value=5) 82 | 83 | div_y = html.Div( 84 | children=[html.H3('y value:'), input_y], 85 | className="four columns" 86 | ) 87 | 88 | ## Div for z 89 | input_z = dcc.Input( 90 | id='z', 91 | placeholder='', 92 | type='number', 93 | value=3) 94 | 95 | div_z = html.Div( 96 | children=[html.H3('z value: '), input_z], 97 | className="four columns" 98 | ) 99 | 100 | ## Div for cut 101 | cut_values = ['Fair', 'Good', 'Ideal', 'Premium', 'Very Good'] 102 | cut_options = [{'label': x, 'value': x} for x in cut_values] 103 | input_cut = dcc.Dropdown( 104 | id='cut', 105 | options = cut_options, 106 | value = 'Ideal' 107 | ) 108 | 109 | div_cut = html.Div( 110 | children=[html.H3('Cut:'), input_cut], 111 | className="four columns" 112 | ) 113 | 114 | ## Div for color 115 | color_values = ['D', 'E', 'F', 'G', 'H', 'I', 'J'] 116 | color_options = [{'label': x, 'value': x} for x in color_values] 117 | input_color = dcc.Dropdown( 118 | id='color', 119 | options = color_options, 120 | value = 'G' 121 | ) 122 | 123 | div_color = html.Div( 124 | children=[html.H3('Color:'), input_color], 125 | className="four columns" 126 | ) 127 | 128 | ## Div for clarity 129 | clarity_values = ['I1', 'IF', 'SI1', 'SI2', 'VS1', 'VS2', 'VVS1', 'VVS2'] 130 | clarity_options = [{'label': x, 'value': x} for x in clarity_values] 131 | input_clarity = dcc.Dropdown( 132 | id='clarity', 133 | options = clarity_options, 134 | value = 'SI1' 135 | ) 136 | 137 | div_clarity = html.Div( 138 | children=[html.H3('Clarity:'), input_clarity], 139 | className="four columns" 140 | ) 141 | 142 | ## Div for numerical characteristics 143 | div_numerical =
html.Div( 144 | children = [div_carat, div_depth, div_table], 145 | className="row" 146 | ) 147 | 148 | ## Div for dimensions 149 | div_dimensions = html.Div( 150 | children = [div_x, div_y, div_z], 151 | className="row" 152 | ) 153 | 154 | ## Div for categorical 155 | div_categorical = html.Div( 156 | children = [div_cut, div_color, div_clarity], 157 | className="row" 158 | ) 159 | 160 | def get_prediction(carat, depth, table, x, y, z, cut, color, clarity): 161 | '''takes the inputs from the user and produces the price prediction''' 162 | 163 | cols = ['carat', 'depth', 'table', 164 | 'cut_Good', 'cut_Ideal', 'cut_Premium', 'cut_Very Good', 165 | 'color_E', 'color_F', 'color_G', 'color_H', 'color_I', 'color_J', 166 | 'clarity_IF','clarity_SI1', 'clarity_SI2', 'clarity_VS1', 'clarity_VS2','clarity_VVS1', 'clarity_VVS2', 167 | 'dim_index'] 168 | 169 | cut_dict = {x: 'cut_' + x for x in cut_values[1:]} 170 | color_dict = {x: 'color_' + x for x in color_values[1:]} 171 | clarity_dict = {x: 'clarity_' + x for x in clarity_values[1:]} 172 | 173 | ## produce a dataframe with a single row of zeros 174 | df = pd.DataFrame(data = np.zeros((1,len(cols))), columns = cols) 175 | 176 | ## get the numeric characteristics 177 | df.loc[0,'carat'] = carat 178 | df.loc[0,'depth'] = depth 179 | df.loc[0,'table'] = table 180 | 181 | ## transform dimensions into a single dim_index using PCA 182 | dims_df = pd.DataFrame(data=[[x, y, z]], columns=['x','y','z']) 183 | df.loc[0,'dim_index'] = pca.transform(dims_df).flatten()[0] 184 | 185 | ## Use the one-hot encoding for the categorical features 186 | if cut!='Fair': 187 | df.loc[0, cut_dict[cut]] = 1 188 | 189 | if color!='D': 190 | df.loc[0, color_dict[color]] = 1 191 | 192 | if clarity != 'I1': 193 | df.loc[0, clarity_dict[clarity]] = 1 194 | 195 | ## Scale the numerical features using the trained scaler 196 | numerical_features = ['carat', 'depth', 'table', 'dim_index'] 197 | df.loc[:,numerical_features] = scaler.transform(df.loc[:,numerical_features]) 198 | 199 | ## Get the predictions using our trained neural network 200 | prediction = model.predict(df.values).flatten()[0] 201 | 202 | ## Transform the log-prices to prices 203 | prediction = np.exp(prediction) 204 | 205 | return int(prediction) 206 | 207 | ## App layout 208 | app.layout = html.Div([ 209 | html.H1('IDR Predict diamond prices'), 210 | 211 | html.H2('Enter the diamond characteristics to get the predicted price'), 212 | 213 | html.Div( 214 | children=[div_numerical, div_dimensions, div_categorical] 215 | ), 216 | html.H1(id='output', 217 | style={'margin-top': '50px', 'text-align': 'center'}) 218 | ]) 219 | 220 | predictors = ['carat', 'depth', 'table', 'x', 'y', 'z', 'cut', 'color', 'clarity'] 221 | @app.callback( 222 | Output('output', 'children'), 223 | [Input(x, 'value') for x in predictors]) 224 | def show_prediction(carat, depth, table, x, y, z, cut, color, clarity): 225 | pred = get_prediction(carat, depth, table, x, y, z, cut, color, clarity) 226 | return str("Predicted Price: {:,}".format(pred)) 227 | 228 | 229 | if __name__ == '__main__': 230 | app.run_server(debug=True) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without 
restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Hands-On Predictive Analytics with Python 5 | 6 | 7 | 8 | This is the code repository for [Hands-On Predictive Analytics with Python](https://www.packtpub.com/big-data-and-business-intelligence/hands-predictive-analytics-python?utm_source=github&utm_medium=repository&utm_campaign=9781789138719), published by Packt. 9 | 10 | **Master the complete predictive analytics process, from problem definition to model deployment** 11 | 12 | ## What is this book about? 13 | This book will teach you all the processes you need to build a predictive analytics solution: understanding the problem, preparing datasets, exploring relationships, model building, tuning, evaluation, and deployment. You'll learn to use Python and its data analytics ecosystem to implement the main techniques used in real-world projects. 14 | 15 | This book covers the following exciting features: 16 | * Get to grips with the main concepts and principles of predictive analytics 17 | * Learn about the stages involved in producing complete predictive analytics solutions 18 | * Understand how to define a problem, propose a solution, and prepare a dataset 19 | * Use visualizations to explore relationships and gain insights into the dataset 20 | * Learn to build regression and classification models using scikit-learn 21 | * Use Keras to build powerful neural network models that produce accurate predictions 22 | * Learn to serve a model's predictions as a web application 23 | 24 | If you feel this book is for you, get your [copy](https://www.amazon.com/dp/178913871X) today! 25 | 26 | https://www.packtpub.com/ 28 | 29 | ## Instructions and Navigations 30 | 31 | ### Installation 32 | To be able to run the code of the book without any problems, please do the following: 33 | 1. Download the Anaconda distribution for your system; you can find the installers [here](https://www.anaconda.com) 34 | 1. Once you have installed the Anaconda distribution, create a new Python 3.6 environment with the packages you will need. 35 | To create the environment (named `ho-pawp`, but you can use any other name you like), run the following command 36 | in the Anaconda Prompt terminal: `conda create --name ho-pawp --file requirements.txt` 37 | 38 | For a quick guide on conda, refer to the conda-cheatsheet.pdf in this repo. 39 | ### Using the code files 40 | 41 | All of the code is organized into folders. Most of the code consists of Jupyter Notebooks. For example, Chapter02.
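A minimal sketch of a typical session, assuming the `ho-pawp` environment created in the Installation section above (older conda versions use `activate ho-pawp` instead of `conda activate ho-pawp`):

```
# activate the environment and start Jupyter from the repository root
conda activate ho-pawp
jupyter notebook

# the Chapter09 deployment scripts are plain Python files; run them from inside
# that folder so the relative paths to the Data and Model folders resolve
cd Chapter09
python diamonds-model-training.py
python predict-diamond-prices.py
```

The training script writes `pca.joblib`, `scaler.joblib`, and `diamond-prices-model.h5` into `Chapter09/Model`, and the prediction script serves the Dash app locally (by default at http://127.0.0.1:8050).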
42 | 43 | The code will look like the following: 44 | ``` 45 | carat_values = np.arange(0.5, 5.5, 0.5) 46 | preds = first_ml_model(carat_values) 47 | pd.DataFrame({"Carat": carat_values, "Predicted price":preds}) 48 | ``` 49 | 50 | **The following is what you need for this book:** 51 | This book is aimed at data scientists, data engineers, software engineers, and business analysts. Students and professionals who constantly work with data in quantitative fields such as finance, economics, and business, and who would like to build models to make predictions, will also find this book useful. In general, this book is aimed at all professionals who would like to focus on the practical implementation of predictive analytics with Python. 52 | 53 | With the following software and hardware list you can run all code files present in the book (Chapters 1-9). 54 | ### Software and Hardware List 55 | | Chapter | Software required | OS required | 56 | | ------- | ------------------------------------ | ----------------------------------- | 57 | | 1-9 | Python 3.6 or higher; Jupyter Notebook; recent versions of NumPy, pandas, Matplotlib, Seaborn, and scikit-learn; recent installations of TensorFlow and Keras; the basic Dash libraries | Windows, Mac OS X, and Linux (Any) | 58 | 59 | We also provide a PDF file that has color images of the screenshots/diagrams used in this book. [Click here to download it](). 60 | 61 | ### Related products 62 | * TensorFlow: Powerful Predictive Analytics with TensorFlow [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/tensorflow-powerful-predictive-analytics-tensorflow?utm_source=github&utm_medium=repository&utm_campaign=9781789136913) [[Amazon]](https://www.amazon.com/dp/1789136911) 63 | 64 | * Building Machine Learning Systems with Python - Third Edition [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/building-machine-learning-systems-python-third-edition?utm_source=github&utm_medium=repository&utm_campaign=9781788623223) [[Amazon]](https://www.amazon.com/dp/1788623223) 65 | 66 | 67 | ## Get to Know the Author 68 | **Alvaro Fuentes** is a Senior Data Scientist with more than 13 years of experience in analytical roles. 69 | He holds an M.S. in applied mathematics and an M.S. in quantitative economics. He has been working for one of the top global 70 | management consulting firms, solving analytical and AI problems in industries such as banking, telecom, and mining. 71 | He worked for many years in the Central Bank of Guatemala as an economic analyst, building models for economic and financial data. 72 | He is a big Python fan and has been using it routinely for 5+ years to analyze data and to build and deploy analytical models that transform data into intelligence. 73 | 74 | 75 | ### Suggestions and Feedback 76 | [Click here](https://docs.google.com/forms/d/e/1FAIpQLSdy7dATC6QmEL81FIUuymZ0Wy9vH1jHkvpY57OiMeKGqib_Ow/viewform) if you have any feedback or suggestions. 77 | 78 | 79 | ### Download a free PDF 80 | 81 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.
82 | https://packt.link/free-ebook/9781789138719
-------------------------------------------------------------------------------- /conda-cheatsheet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-Predictive-Analytics-with-Python/e049d9d2870f596892b62aeddec312788d9d0c2c/conda-cheatsheet.pdf -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: win-64 4 | _tflow_select=2.2.0=eigen 5 | absl-py=0.9.0=py36_0 6 | astor=0.8.0=py36_0 7 | attrs=19.3.0=py_0 8 | backcall=0.2.0=py_0 9 | blas=1.0=mkl 10 | bleach=3.1.5=py_0 11 | blinker=1.4=py36_0 12 | brotli=1.0.7=pypi_0 13 | brotlipy=0.7.0=py36he774522_1000 14 | ca-certificates=2020.6.24=0 15 | cachetools=4.1.0=py_1 16 | certifi=2020.6.20=py36_0 17 | cffi=1.14.0=py36h7a1dbc1_0 18 | chardet=3.0.4=py36_1003 19 | click=7.1.2=py_0 20 | colorama=0.4.3=py_0 21 | cryptography=2.9.2=py36h7a1dbc1_0 22 | cycler=0.10.0=py36h009560c_0 23 | dash=0.28.5=pypi_0 24 | dash-core-components=0.35.0=pypi_0 25 | dash-html-components=0.13.2=pypi_0 26 | dash-renderer=0.15.0=pypi_0 27 | decorator=4.4.2=py_0 28 | defusedxml=0.6.0=py_0 29 | entrypoints=0.3=py36_0 30 | flask=1.1.2=pypi_0 31 | flask-compress=1.5.0=pypi_0 32 | freetype=2.10.2=hd328e21_0 33 | gast=0.2.2=py36_0 34 | google-auth=1.17.2=py_0 35 | google-auth-oauthlib=0.4.1=py_2 36 | google-pasta=0.2.0=py_0 37 | graphviz=2.38=hfd603c8_2 38 | grpcio=1.27.2=py36h351948d_0 39 | h5py=2.10.0=py36h5e291fa_0 40 | hdf5=1.10.4=h7ebc959_0 41 | icc_rt=2019.0.0=h0cc432a_1 42 | icu=58.2=ha925a31_3 43 | idna=2.10=py_0 44 | importlib-metadata=1.7.0=py36_0 45 | importlib_metadata=1.7.0=0 46 | intel-openmp=2020.1=216 47 | ipykernel=5.3.2=py36h5ca1d4c_0 48 | ipython=7.16.1=py36h5ca1d4c_0 49 | ipython_genutils=0.2.0=py36_0 50 | ipywidgets=7.5.1=py_0 51 | itsdangerous=1.1.0=pypi_0 52 | jedi=0.17.1=py36_0 53 | jinja2=2.11.2=py_0 54 | joblib=0.16.0=py_0 55 | jpeg=9b=hb83a4c4_2 56 | jsonschema=3.2.0=py36_0 57 | jupyter=1.0.0=py36_7 58 | jupyter_client=6.1.6=py_0 59 | jupyter_console=6.1.0=py_0 60 | jupyter_core=4.6.3=py36_0 61 | keras=2.3.1=0 62 | keras-applications=1.0.8=py_1 63 | keras-base=2.3.1=py36_0 64 | keras-preprocessing=1.1.0=py_1 65 | kiwisolver=1.2.0=py36h74a9793_0 66 | libpng=1.6.37=h2a8f88b_0 67 | libprotobuf=3.12.3=h7bd577a_0 68 | libsodium=1.0.18=h62dcd97_0 69 | m2w64-gcc-libgfortran=5.3.0=6 70 | m2w64-gcc-libs=5.3.0=7 71 | m2w64-gcc-libs-core=5.3.0=7 72 | m2w64-gmp=6.1.0=2 73 | m2w64-libwinpthread-git=5.0.0.4634.697f757=2 74 | markdown=3.1.1=py36_0 75 | markupsafe=1.1.1=py36he774522_0 76 | matplotlib=3.2.2=0 77 | matplotlib-base=3.2.2=py36h64f37c6_0 78 | mistune=0.8.4=py36he774522_0 79 | mkl=2020.1=216 80 | mkl-service=2.3.0=py36hb782905_0 81 | mkl_fft=1.1.0=py36h45dec08_0 82 | mkl_random=1.1.1=py36h47e9c7a_0 83 | msys2-conda-epoch=20160418=1 84 | nbconvert=5.6.1=py36_0 85 | nbformat=5.0.7=py_0 86 | notebook=6.0.3=py36_0 87 | numpy=1.18.5=py36h6530119_0 88 | numpy-base=1.18.5=py36hc3f5095_0 89 | oauthlib=3.1.0=py_0 90 | openssl=1.1.1g=he774522_0 91 | opt_einsum=3.1.0=py_0 92 | packaging=20.4=py_0 93 | pandas=1.0.5=py36h47e9c7a_0 94 | pandoc=2.10=0 95 | pandocfilters=1.4.2=py36_1 96 | parso=0.7.0=py_0 97 | pickleshare=0.7.5=py36_0 98 | pip=20.1.1=py36_1 99 | plotly=4.9.0=pypi_0 100 | prometheus_client=0.8.0=py_0 101 | prompt-toolkit=3.0.5=py_0 
102 | prompt_toolkit=3.0.5=0 103 | protobuf=3.12.3=py36h33f27b4_0 104 | pyasn1=0.4.8=py_0 105 | pyasn1-modules=0.2.7=py_0 106 | pycparser=2.20=py_2 107 | pydotplus=2.0.2=pypi_0 108 | pygments=2.6.1=py_0 109 | pyjwt=1.7.1=py36_0 110 | pyopenssl=19.1.0=py_1 111 | pyparsing=2.4.7=py_0 112 | pyqt=5.9.2=py36h6538335_2 113 | pyreadline=2.1=py36_1 114 | pyrsistent=0.16.0=py36he774522_0 115 | pysocks=1.7.1=py36_0 116 | python=3.6.10=h9f7ef89_2 117 | python-dateutil=2.8.1=py_0 118 | pytz=2020.1=py_0 119 | pywin32=227=py36he774522_1 120 | pywinpty=0.5.7=py36_0 121 | pyyaml=5.3.1=py36he774522_1 122 | pyzmq=19.0.1=py36ha925a31_1 123 | qt=5.9.7=vc14h73c81de_0 124 | qtconsole=4.7.5=py_0 125 | qtpy=1.9.0=py_0 126 | requests=2.24.0=py_0 127 | requests-oauthlib=1.3.0=py_0 128 | retrying=1.3.3=pypi_0 129 | rsa=4.0=py_0 130 | scikit-learn=0.22=py36h6288b17_0 131 | scipy=1.5.0=py36h9439919_0 132 | seaborn=0.10.1=py_0 133 | send2trash=1.5.0=py36_0 134 | setuptools=49.2.0=py36_0 135 | sip=4.19.8=py36h6538335_0 136 | six=1.15.0=py_0 137 | sqlite=3.32.3=h2a8f88b_0 138 | tensorboard=2.2.1=pyh532a8cf_0 139 | tensorboard-plugin-wit=1.6.0=py_0 140 | tensorflow=2.1.0=eigen_py36hdbbabfe_0 141 | tensorflow-base=2.1.0=eigen_py36h49b2757_0 142 | tensorflow-estimator=2.1.0=pyhd54b08b_0 143 | termcolor=1.1.0=py36_1 144 | terminado=0.8.3=py36_0 145 | testpath=0.4.4=py_0 146 | threadpoolctl=2.1.0=pyh5ca1d4c_0 147 | tornado=6.0.4=py36he774522_1 148 | traitlets=4.3.3=py36_0 149 | urllib3=1.25.9=py_0 150 | vc=14.1=h0510ff6_4 151 | vs2015_runtime=14.16.27012=hf0eaf9b_3 152 | wcwidth=0.2.5=py_0 153 | webencodings=0.5.1=py36_1 154 | werkzeug=0.16.1=py_0 155 | wheel=0.34.2=py36_0 156 | widgetsnbextension=3.5.1=py36_0 157 | win_inet_pton=1.1.0=py36_0 158 | wincertstore=0.2=py36h7fe50ca_0 159 | winpty=0.4.3=4 160 | wrapt=1.12.1=py36he774522_1 161 | yaml=0.2.5=he774522_0 162 | zeromq=4.3.2=ha925a31_2 163 | zipp=3.1.0=py_0 164 | zlib=1.2.11=h62dcd97_4 165 | --------------------------------------------------------------------------------
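The header of requirements.txt above gives the generic `conda create` form; a concrete instantiation, using the environment name suggested in the README, would be the following sketch (note the `# platform: win-64` line — the exact pinned builds are only expected to resolve on 64-bit Windows):

```
conda create --name ho-pawp --file requirements.txt
conda activate ho-pawp
```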