├── Chapter01
├── .ipynb_checkpoints
│ └── ch1-code-snippets-checkpoint.ipynb
└── ch1-code-snippets.ipynb
├── Chapter02
├── ch2-1-diamond-prices.ipynb
└── ch2-2-credit-card-default.ipynb
├── Chapter03
├── .ipynb_checkpoints
│ ├── ch3-1-eda-diamond-prices-checkpoint.ipynb
│ └── ch3-2-eda-credit-card-default-checkpoint.ipynb
├── ch3-1-eda-diamond-prices.ipynb
└── ch3-2-eda-credit-card-default.ipynb
├── Chapter04
├── .ipynb_checkpoints
│ └── ch4-overfitting-example-checkpoint.ipynb
├── ch4-overfitting-example.ipynb
└── ch4-predicting-diamond-prices.ipynb
├── Chapter05
├── .ipynb_checkpoints
│ └── ch5-predicting-credit-card-default-checkpoint.ipynb
└── ch5-predicting-credit-card-default.ipynb
├── Chapter06
├── .ipynb_checkpoints
│ ├── ch6-1-regression-with-neural-networks-checkpoint.ipynb
│ └── ch6-2-classification-with-neural-networks-checkpoint.ipynb
├── ch6-1-regression-with-neural-networks.ipynb
├── ch6-2-classification-with-neural-networks.ipynb
└── class_initial_w.h5
├── Chapter07
├── .ipynb_checkpoints
│ └── ch7-credit-card-def-model-tuning-and-evaluation-checkpoint.ipynb
├── ch7-credit-card-def-model-tuning-and-evaluation.ipynb
└── ch7-diamond-prices-model-tuning-and-evaluation.ipynb
├── Chapter08
├── .ipynb_checkpoints
│ ├── ch8-credit-card-def-model-tuning-checkpoint.ipynb
│ └── ch8-diamond-prices-model-tuning-checkpoint.ipynb
├── ch8-credit-card-def-model-tuning.ipynb
└── ch8-diamond-prices-model-tuning.ipynb
├── Chapter09
├── Model
│ ├── diamond-prices-model.h5
│ ├── pca.joblib
│ └── scaler.joblib
├── dash-example-no-user-inputs.py
├── dash-example-user-inputs.py
├── diamonds-model-training.py
└── predict-diamond-prices.py
├── Data
├── credit_card_default.csv
└── diamonds.csv
├── LICENSE
├── README.md
├── conda-cheatsheet.pdf
└── requirements.txt
/Chapter02/ch2-2-credit-card-default.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Introducing the credit card default dataset"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "### Data Set Information:\n",
15 | "\n",
16 | "**This research aimed at the case of customers default payments in Taiwan**\n",
17 | "\n",
18 | "### Features description:\n",
19 | "\n",
20 | "- LIMIT_BAL: Amount of the given credit (NT dollar): it includes both the individual consumer credit and his/her family (supplementary) credit. \n",
21 | "- SEX: Gender (1 = male; 2 = female). \n",
22 | "- EDUCATION: Education (1 = graduate school; 2 = university; 3 = high school; 4 = others). \n",
23 | "- MARRIAGE: Marital status (1 = married; 2 = single; 3 = others). \n",
24 | "- AGE: Age (year). \n",
25 | "- PAY_1 - PAY_6: History of past payment. We tracked the past monthly payment records (from April to September, 2005) as follows: PAY_1 = the repayment status in September, 2005; PAY_2 = the repayment status in August, 2005; . . .; PAY_6 = the repayment status in April, 2005. The measurement scale for the repayment status is: -1 = pay duly; 1 = payment delay for one month; 2 = payment delay for two months; . . .; 8 = payment delay for eight months; 9 = payment delay for nine months and above.\n",
26 | "- BILL_AMT1-BILL_AMT6: Amount of bill statement (NT dollar). BILL_AMT1 = amount of bill statement in September, 2005; BILL_AMT2 = amount of bill statement in August, 2005; . . .; BILL_AMT6 = amount of bill statement in April, 2005. \n",
27 | "- PAY_AMT1-PAY_AMT6: Amount of previous payment (NT dollar).\n",
28 | "- default payment next month: **positive class: default | negative class: pay**"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 19,
34 | "metadata": {
35 | "collapsed": true
36 | },
37 | "outputs": [],
38 | "source": [
39 | "import numpy as np\n",
40 | "import pandas as pd\n",
41 | "import os"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 20,
47 | "metadata": {},
48 | "outputs": [
49 | {
50 | "data": {
51 | "text/html": [
52 | "
\n",
53 | "\n",
66 | "
\n",
67 | " \n",
68 | " \n",
69 | " | \n",
70 | " LIMIT_BAL | \n",
71 | " SEX | \n",
72 | " EDUCATION | \n",
73 | " MARRIAGE | \n",
74 | " AGE | \n",
75 | " PAY_1 | \n",
76 | " PAY_2 | \n",
77 | " PAY_3 | \n",
78 | " PAY_4 | \n",
79 | " PAY_5 | \n",
80 | " ... | \n",
81 | " BILL_AMT4 | \n",
82 | " BILL_AMT5 | \n",
83 | " BILL_AMT6 | \n",
84 | " PAY_AMT1 | \n",
85 | " PAY_AMT2 | \n",
86 | " PAY_AMT3 | \n",
87 | " PAY_AMT4 | \n",
88 | " PAY_AMT5 | \n",
89 | " PAY_AMT6 | \n",
90 | " default payment next month | \n",
91 | "
\n",
92 | " \n",
93 | " ID | \n",
94 | " | \n",
95 | " | \n",
96 | " | \n",
97 | " | \n",
98 | " | \n",
99 | " | \n",
100 | " | \n",
101 | " | \n",
102 | " | \n",
103 | " | \n",
104 | " | \n",
105 | " | \n",
106 | " | \n",
107 | " | \n",
108 | " | \n",
109 | " | \n",
110 | " | \n",
111 | " | \n",
112 | " | \n",
113 | " | \n",
114 | " | \n",
115 | "
\n",
116 | " \n",
117 | " \n",
118 | " \n",
119 | " 1 | \n",
120 | " 20000 | \n",
121 | " 2 | \n",
122 | " 2 | \n",
123 | " 1 | \n",
124 | " 24 | \n",
125 | " 2 | \n",
126 | " 2 | \n",
127 | " -1 | \n",
128 | " -1 | \n",
129 | " -2 | \n",
130 | " ... | \n",
131 | " 0 | \n",
132 | " 0 | \n",
133 | " 0 | \n",
134 | " 0 | \n",
135 | " 689 | \n",
136 | " 0 | \n",
137 | " 0 | \n",
138 | " 0 | \n",
139 | " 0 | \n",
140 | " 1 | \n",
141 | "
\n",
142 | " \n",
143 | " 2 | \n",
144 | " 120000 | \n",
145 | " 2 | \n",
146 | " 2 | \n",
147 | " 2 | \n",
148 | " 26 | \n",
149 | " -1 | \n",
150 | " 2 | \n",
151 | " 0 | \n",
152 | " 0 | \n",
153 | " 0 | \n",
154 | " ... | \n",
155 | " 3272 | \n",
156 | " 3455 | \n",
157 | " 3261 | \n",
158 | " 0 | \n",
159 | " 1000 | \n",
160 | " 1000 | \n",
161 | " 1000 | \n",
162 | " 0 | \n",
163 | " 2000 | \n",
164 | " 1 | \n",
165 | "
\n",
166 | " \n",
167 | " 3 | \n",
168 | " 90000 | \n",
169 | " 2 | \n",
170 | " 2 | \n",
171 | " 2 | \n",
172 | " 34 | \n",
173 | " 0 | \n",
174 | " 0 | \n",
175 | " 0 | \n",
176 | " 0 | \n",
177 | " 0 | \n",
178 | " ... | \n",
179 | " 14331 | \n",
180 | " 14948 | \n",
181 | " 15549 | \n",
182 | " 1518 | \n",
183 | " 1500 | \n",
184 | " 1000 | \n",
185 | " 1000 | \n",
186 | " 1000 | \n",
187 | " 5000 | \n",
188 | " 0 | \n",
189 | "
\n",
190 | " \n",
191 | " 4 | \n",
192 | " 50000 | \n",
193 | " 2 | \n",
194 | " 2 | \n",
195 | " 1 | \n",
196 | " 37 | \n",
197 | " 0 | \n",
198 | " 0 | \n",
199 | " 0 | \n",
200 | " 0 | \n",
201 | " 0 | \n",
202 | " ... | \n",
203 | " 28314 | \n",
204 | " 28959 | \n",
205 | " 29547 | \n",
206 | " 2000 | \n",
207 | " 2019 | \n",
208 | " 1200 | \n",
209 | " 1100 | \n",
210 | " 1069 | \n",
211 | " 1000 | \n",
212 | " 0 | \n",
213 | "
\n",
214 | " \n",
215 | " 5 | \n",
216 | " 50000 | \n",
217 | " 1 | \n",
218 | " 2 | \n",
219 | " 1 | \n",
220 | " 57 | \n",
221 | " -1 | \n",
222 | " 0 | \n",
223 | " -1 | \n",
224 | " 0 | \n",
225 | " 0 | \n",
226 | " ... | \n",
227 | " 20940 | \n",
228 | " 19146 | \n",
229 | " 19131 | \n",
230 | " 2000 | \n",
231 | " 36681 | \n",
232 | " 10000 | \n",
233 | " 9000 | \n",
234 | " 689 | \n",
235 | " 679 | \n",
236 | " 0 | \n",
237 | "
\n",
238 | " \n",
239 | "
\n",
240 | "
5 rows × 24 columns
\n",
241 | "
"
242 | ],
243 | "text/plain": [
244 | " LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_1 PAY_2 PAY_3 PAY_4 \\\n",
245 | "ID \n",
246 | "1 20000 2 2 1 24 2 2 -1 -1 \n",
247 | "2 120000 2 2 2 26 -1 2 0 0 \n",
248 | "3 90000 2 2 2 34 0 0 0 0 \n",
249 | "4 50000 2 2 1 37 0 0 0 0 \n",
250 | "5 50000 1 2 1 57 -1 0 -1 0 \n",
251 | "\n",
252 | " PAY_5 ... BILL_AMT4 BILL_AMT5 BILL_AMT6 \\\n",
253 | "ID ... \n",
254 | "1 -2 ... 0 0 0 \n",
255 | "2 0 ... 3272 3455 3261 \n",
256 | "3 0 ... 14331 14948 15549 \n",
257 | "4 0 ... 28314 28959 29547 \n",
258 | "5 0 ... 20940 19146 19131 \n",
259 | "\n",
260 | " PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 \\\n",
261 | "ID \n",
262 | "1 0 689 0 0 0 0 \n",
263 | "2 0 1000 1000 1000 0 2000 \n",
264 | "3 1518 1500 1000 1000 1000 5000 \n",
265 | "4 2000 2019 1200 1100 1069 1000 \n",
266 | "5 2000 36681 10000 9000 689 679 \n",
267 | "\n",
268 | " default payment next month \n",
269 | "ID \n",
270 | "1 1 \n",
271 | "2 1 \n",
272 | "3 0 \n",
273 | "4 0 \n",
274 | "5 0 \n",
275 | "\n",
276 | "[5 rows x 24 columns]"
277 | ]
278 | },
279 | "execution_count": 20,
280 | "metadata": {},
281 | "output_type": "execute_result"
282 | }
283 | ],
284 | "source": [
285 | "DATA_DIR = '../data'\n",
286 | "FILE_NAME = 'credit_card_default.csv'\n",
287 | "data_path = os.path.join(DATA_DIR, FILE_NAME)\n",
288 | "ccd = pd.read_csv(data_path, index_col=\"ID\")\n",
289 | "ccd.head()"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": 21,
295 | "metadata": {},
296 | "outputs": [
297 | {
298 | "data": {
299 | "text/plain": [
300 | "(30000, 24)"
301 | ]
302 | },
303 | "execution_count": 21,
304 | "metadata": {},
305 | "output_type": "execute_result"
306 | }
307 | ],
308 | "source": [
309 | "ccd.shape"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": 22,
315 | "metadata": {
316 | "collapsed": true
317 | },
318 | "outputs": [],
319 | "source": [
320 | "ccd.rename(columns=lambda x: x.lower(), inplace=True)"
321 | ]
322 | },
323 | {
324 | "cell_type": "markdown",
325 | "metadata": {},
326 | "source": [
327 | "## Numerical features"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": 23,
333 | "metadata": {},
334 | "outputs": [],
335 | "source": [
336 | "bill_amt_features = ['bill_amt'+ str(i) for i in range(1,7)]\n",
337 | "pay_amt_features = ['pay_amt'+ str(i) for i in range(1,7)]\n",
338 | "numerical_features = ['limit_bal','age'] + bill_amt_features + pay_amt_features"
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": 24,
344 | "metadata": {},
345 | "outputs": [
346 | {
347 | "data": {
348 | "text/html": [
349 | "\n",
350 | "\n",
363 | "
\n",
364 | " \n",
365 | " \n",
366 | " | \n",
367 | " limit_bal | \n",
368 | " age | \n",
369 | "
\n",
370 | " \n",
371 | " \n",
372 | " \n",
373 | " count | \n",
374 | " 30000.000000 | \n",
375 | " 30000.000000 | \n",
376 | "
\n",
377 | " \n",
378 | " mean | \n",
379 | " 167484.322667 | \n",
380 | " 35.485500 | \n",
381 | "
\n",
382 | " \n",
383 | " std | \n",
384 | " 129747.661567 | \n",
385 | " 9.217904 | \n",
386 | "
\n",
387 | " \n",
388 | " min | \n",
389 | " 10000.000000 | \n",
390 | " 21.000000 | \n",
391 | "
\n",
392 | " \n",
393 | " 25% | \n",
394 | " 50000.000000 | \n",
395 | " 28.000000 | \n",
396 | "
\n",
397 | " \n",
398 | " 50% | \n",
399 | " 140000.000000 | \n",
400 | " 34.000000 | \n",
401 | "
\n",
402 | " \n",
403 | " 75% | \n",
404 | " 240000.000000 | \n",
405 | " 41.000000 | \n",
406 | "
\n",
407 | " \n",
408 | " max | \n",
409 | " 1000000.000000 | \n",
410 | " 79.000000 | \n",
411 | "
\n",
412 | " \n",
413 | "
\n",
414 | "
"
415 | ],
416 | "text/plain": [
417 | " limit_bal age\n",
418 | "count 30000.000000 30000.000000\n",
419 | "mean 167484.322667 35.485500\n",
420 | "std 129747.661567 9.217904\n",
421 | "min 10000.000000 21.000000\n",
422 | "25% 50000.000000 28.000000\n",
423 | "50% 140000.000000 34.000000\n",
424 | "75% 240000.000000 41.000000\n",
425 | "max 1000000.000000 79.000000"
426 | ]
427 | },
428 | "execution_count": 24,
429 | "metadata": {},
430 | "output_type": "execute_result"
431 | }
432 | ],
433 | "source": [
434 | "ccd[['limit_bal','age']].describe()"
435 | ]
436 | },
437 | {
438 | "cell_type": "code",
439 | "execution_count": 25,
440 | "metadata": {},
441 | "outputs": [
442 | {
443 | "data": {
444 | "text/html": [
445 | "\n",
446 | "\n",
459 | "
\n",
460 | " \n",
461 | " \n",
462 | " | \n",
463 | " bill_amt1 | \n",
464 | " bill_amt2 | \n",
465 | " bill_amt3 | \n",
466 | " bill_amt4 | \n",
467 | " bill_amt5 | \n",
468 | " bill_amt6 | \n",
469 | "
\n",
470 | " \n",
471 | " \n",
472 | " \n",
473 | " count | \n",
474 | " 30000.0 | \n",
475 | " 30000.0 | \n",
476 | " 30000.0 | \n",
477 | " 30000.0 | \n",
478 | " 30000.0 | \n",
479 | " 30000.0 | \n",
480 | "
\n",
481 | " \n",
482 | " mean | \n",
483 | " 51223.0 | \n",
484 | " 49179.0 | \n",
485 | " 47013.0 | \n",
486 | " 43263.0 | \n",
487 | " 40311.0 | \n",
488 | " 38872.0 | \n",
489 | "
\n",
490 | " \n",
491 | " std | \n",
492 | " 73636.0 | \n",
493 | " 71174.0 | \n",
494 | " 69349.0 | \n",
495 | " 64333.0 | \n",
496 | " 60797.0 | \n",
497 | " 59554.0 | \n",
498 | "
\n",
499 | " \n",
500 | " min | \n",
501 | " -165580.0 | \n",
502 | " -69777.0 | \n",
503 | " -157264.0 | \n",
504 | " -170000.0 | \n",
505 | " -81334.0 | \n",
506 | " -339603.0 | \n",
507 | "
\n",
508 | " \n",
509 | " 25% | \n",
510 | " 3559.0 | \n",
511 | " 2985.0 | \n",
512 | " 2666.0 | \n",
513 | " 2327.0 | \n",
514 | " 1763.0 | \n",
515 | " 1256.0 | \n",
516 | "
\n",
517 | " \n",
518 | " 50% | \n",
519 | " 22382.0 | \n",
520 | " 21200.0 | \n",
521 | " 20088.0 | \n",
522 | " 19052.0 | \n",
523 | " 18104.0 | \n",
524 | " 17071.0 | \n",
525 | "
\n",
526 | " \n",
527 | " 75% | \n",
528 | " 67091.0 | \n",
529 | " 64006.0 | \n",
530 | " 60165.0 | \n",
531 | " 54506.0 | \n",
532 | " 50190.0 | \n",
533 | " 49198.0 | \n",
534 | "
\n",
535 | " \n",
536 | " max | \n",
537 | " 964511.0 | \n",
538 | " 983931.0 | \n",
539 | " 1664089.0 | \n",
540 | " 891586.0 | \n",
541 | " 927171.0 | \n",
542 | " 961664.0 | \n",
543 | "
\n",
544 | " \n",
545 | "
\n",
546 | "
"
547 | ],
548 | "text/plain": [
549 | " bill_amt1 bill_amt2 bill_amt3 bill_amt4 bill_amt5 bill_amt6\n",
550 | "count 30000.0 30000.0 30000.0 30000.0 30000.0 30000.0\n",
551 | "mean 51223.0 49179.0 47013.0 43263.0 40311.0 38872.0\n",
552 | "std 73636.0 71174.0 69349.0 64333.0 60797.0 59554.0\n",
553 | "min -165580.0 -69777.0 -157264.0 -170000.0 -81334.0 -339603.0\n",
554 | "25% 3559.0 2985.0 2666.0 2327.0 1763.0 1256.0\n",
555 | "50% 22382.0 21200.0 20088.0 19052.0 18104.0 17071.0\n",
556 | "75% 67091.0 64006.0 60165.0 54506.0 50190.0 49198.0\n",
557 | "max 964511.0 983931.0 1664089.0 891586.0 927171.0 961664.0"
558 | ]
559 | },
560 | "execution_count": 25,
561 | "metadata": {},
562 | "output_type": "execute_result"
563 | }
564 | ],
565 | "source": [
566 | "ccd[bill_amt_features].describe().round()"
567 | ]
568 | },
569 | {
570 | "cell_type": "code",
571 | "execution_count": 26,
572 | "metadata": {},
573 | "outputs": [
574 | {
575 | "data": {
576 | "text/html": [
577 | "\n",
578 | "\n",
591 | "
\n",
592 | " \n",
593 | " \n",
594 | " | \n",
595 | " pay_amt1 | \n",
596 | " pay_amt2 | \n",
597 | " pay_amt3 | \n",
598 | " pay_amt4 | \n",
599 | " pay_amt5 | \n",
600 | " pay_amt6 | \n",
601 | "
\n",
602 | " \n",
603 | " \n",
604 | " \n",
605 | " count | \n",
606 | " 30000.0 | \n",
607 | " 30000.0 | \n",
608 | " 30000.0 | \n",
609 | " 30000.0 | \n",
610 | " 30000.0 | \n",
611 | " 30000.0 | \n",
612 | "
\n",
613 | " \n",
614 | " mean | \n",
615 | " 5664.0 | \n",
616 | " 5921.0 | \n",
617 | " 5226.0 | \n",
618 | " 4826.0 | \n",
619 | " 4799.0 | \n",
620 | " 5216.0 | \n",
621 | "
\n",
622 | " \n",
623 | " std | \n",
624 | " 16563.0 | \n",
625 | " 23041.0 | \n",
626 | " 17607.0 | \n",
627 | " 15666.0 | \n",
628 | " 15278.0 | \n",
629 | " 17777.0 | \n",
630 | "
\n",
631 | " \n",
632 | " min | \n",
633 | " 0.0 | \n",
634 | " 0.0 | \n",
635 | " 0.0 | \n",
636 | " 0.0 | \n",
637 | " 0.0 | \n",
638 | " 0.0 | \n",
639 | "
\n",
640 | " \n",
641 | " 25% | \n",
642 | " 1000.0 | \n",
643 | " 833.0 | \n",
644 | " 390.0 | \n",
645 | " 296.0 | \n",
646 | " 252.0 | \n",
647 | " 118.0 | \n",
648 | "
\n",
649 | " \n",
650 | " 50% | \n",
651 | " 2100.0 | \n",
652 | " 2009.0 | \n",
653 | " 1800.0 | \n",
654 | " 1500.0 | \n",
655 | " 1500.0 | \n",
656 | " 1500.0 | \n",
657 | "
\n",
658 | " \n",
659 | " 75% | \n",
660 | " 5006.0 | \n",
661 | " 5000.0 | \n",
662 | " 4505.0 | \n",
663 | " 4013.0 | \n",
664 | " 4032.0 | \n",
665 | " 4000.0 | \n",
666 | "
\n",
667 | " \n",
668 | " max | \n",
669 | " 873552.0 | \n",
670 | " 1684259.0 | \n",
671 | " 896040.0 | \n",
672 | " 621000.0 | \n",
673 | " 426529.0 | \n",
674 | " 528666.0 | \n",
675 | "
\n",
676 | " \n",
677 | "
\n",
678 | "
"
679 | ],
680 | "text/plain": [
681 | " pay_amt1 pay_amt2 pay_amt3 pay_amt4 pay_amt5 pay_amt6\n",
682 | "count 30000.0 30000.0 30000.0 30000.0 30000.0 30000.0\n",
683 | "mean 5664.0 5921.0 5226.0 4826.0 4799.0 5216.0\n",
684 | "std 16563.0 23041.0 17607.0 15666.0 15278.0 17777.0\n",
685 | "min 0.0 0.0 0.0 0.0 0.0 0.0\n",
686 | "25% 1000.0 833.0 390.0 296.0 252.0 118.0\n",
687 | "50% 2100.0 2009.0 1800.0 1500.0 1500.0 1500.0\n",
688 | "75% 5006.0 5000.0 4505.0 4013.0 4032.0 4000.0\n",
689 | "max 873552.0 1684259.0 896040.0 621000.0 426529.0 528666.0"
690 | ]
691 | },
692 | "execution_count": 26,
693 | "metadata": {},
694 | "output_type": "execute_result"
695 | }
696 | ],
697 | "source": [
698 | "ccd[pay_amt_features].describe().round()"
699 | ]
700 | },
701 | {
702 | "cell_type": "markdown",
703 | "metadata": {},
704 | "source": [
705 | "## Encoding categorical features"
706 | ]
707 | },
708 | {
709 | "cell_type": "code",
710 | "execution_count": 27,
711 | "metadata": {},
712 | "outputs": [
713 | {
714 | "data": {
715 | "text/plain": [
716 | "ID\n",
717 | "1 0\n",
718 | "2 0\n",
719 | "3 0\n",
720 | "4 0\n",
721 | "5 1\n",
722 | "6 1\n",
723 | "7 1\n",
724 | "8 0\n",
725 | "9 0\n",
726 | "10 1\n",
727 | "Name: male, dtype: int32"
728 | ]
729 | },
730 | "execution_count": 27,
731 | "metadata": {},
732 | "output_type": "execute_result"
733 | }
734 | ],
735 | "source": [
736 | "ccd['male'] = (ccd['sex'] == 1).astype('int')\n",
737 | "ccd['male'].head(n=10)"
738 | ]
739 | },
740 | {
741 | "cell_type": "code",
742 | "execution_count": 28,
743 | "metadata": {},
744 | "outputs": [
745 | {
746 | "data": {
747 | "text/plain": [
748 | "0.39626666666666666"
749 | ]
750 | },
751 | "execution_count": 28,
752 | "metadata": {},
753 | "output_type": "execute_result"
754 | }
755 | ],
756 | "source": [
757 | "ccd['male'].mean()"
758 | ]
759 | },
760 | {
761 | "cell_type": "code",
762 | "execution_count": 29,
763 | "metadata": {},
764 | "outputs": [
765 | {
766 | "data": {
767 | "text/plain": [
768 | "0 14\n",
769 | "1 10585\n",
770 | "2 14030\n",
771 | "3 4917\n",
772 | "4 123\n",
773 | "5 280\n",
774 | "6 51\n",
775 | "Name: education, dtype: int64"
776 | ]
777 | },
778 | "execution_count": 29,
779 | "metadata": {},
780 | "output_type": "execute_result"
781 | }
782 | ],
783 | "source": [
784 | "ccd['education'].value_counts(sort=False)"
785 | ]
786 | },
787 | {
788 | "cell_type": "code",
789 | "execution_count": 30,
790 | "metadata": {
791 | "collapsed": true
792 | },
793 | "outputs": [],
794 | "source": [
795 | "ccd['grad_school'] = (ccd['education'] == 1).astype('int')\n",
796 | "ccd['university'] = (ccd['education'] == 2).astype('int')\n",
797 | "ccd['high_school'] = (ccd['education'] == 3).astype('int')"
798 | ]
799 | },
800 | {
801 | "cell_type": "code",
802 | "execution_count": 31,
803 | "metadata": {},
804 | "outputs": [
805 | {
806 | "data": {
807 | "text/plain": [
808 | "ID\n",
809 | "48 5\n",
810 | "70 5\n",
811 | "359 4\n",
812 | "386 5\n",
813 | "449 4\n",
814 | "Name: education, dtype: int64"
815 | ]
816 | },
817 | "execution_count": 31,
818 | "metadata": {},
819 | "output_type": "execute_result"
820 | }
821 | ],
822 | "source": [
823 | "ccd.loc[(ccd['grad_school']==0) & (ccd['university']==0) & (ccd['high_school']==0)]['education'].head()"
824 | ]
825 | },
826 | {
827 | "cell_type": "markdown",
828 | "metadata": {},
829 | "source": [
830 | "## Low variance features"
831 | ]
832 | },
833 | {
834 | "cell_type": "code",
835 | "execution_count": 32,
836 | "metadata": {},
837 | "outputs": [
838 | {
839 | "data": {
840 | "text/plain": [
841 | "1 13713\n",
842 | "2 15964\n",
843 | "3 323\n",
844 | "Name: marriage, dtype: int64"
845 | ]
846 | },
847 | "execution_count": 32,
848 | "metadata": {},
849 | "output_type": "execute_result"
850 | }
851 | ],
852 | "source": [
853 | "ccd['marriage'].value_counts(sort=False)"
854 | ]
855 | },
856 | {
857 | "cell_type": "code",
858 | "execution_count": 33,
859 | "metadata": {
860 | "collapsed": true
861 | },
862 | "outputs": [],
863 | "source": [
864 | "ccd['single'] = (ccd['marriage'] == 2).astype('int')\n",
865 | "ccd['marital_other'] = (ccd['marriage'] == 3).astype('int')"
866 | ]
867 | },
868 | {
869 | "cell_type": "code",
870 | "execution_count": 34,
871 | "metadata": {},
872 | "outputs": [
873 | {
874 | "name": "stdout",
875 | "output_type": "stream",
876 | "text": [
877 | "Proportion of singles: 0.5321333333333333\n",
878 | "Proportion of other marital status: 0.010766666666666667\n"
879 | ]
880 | }
881 | ],
882 | "source": [
883 | "print(\"Proportion of singles: \", ccd['single'].mean())\n",
884 | "print(\"Proportion of other marital status: \", ccd['marital_other'].mean())"
885 | ]
886 | },
887 | {
888 | "cell_type": "code",
889 | "execution_count": 35,
890 | "metadata": {},
891 | "outputs": [
892 | {
893 | "name": "stdout",
894 | "output_type": "stream",
895 | "text": [
896 | "0.24816786226195736\n",
897 | "0.24897574808047968\n"
898 | ]
899 | }
900 | ],
901 | "source": [
902 | "ccd['married'] = (ccd['marriage'] == 1).astype('int')\n",
903 | "print(ccd['married'].var())\n",
904 | "print(ccd['single'].var())"
905 | ]
906 | },
907 | {
908 | "cell_type": "code",
909 | "execution_count": 36,
910 | "metadata": {},
911 | "outputs": [
912 | {
913 | "data": {
914 | "text/plain": [
915 | "0.9892333333333333"
916 | ]
917 | },
918 | "execution_count": 36,
919 | "metadata": {},
920 | "output_type": "execute_result"
921 | }
922 | ],
923 | "source": [
924 | "(ccd['married'] == (1 - ccd['single'])).mean()"
925 | ]
926 | },
927 | {
928 | "cell_type": "markdown",
929 | "metadata": {},
930 | "source": [
931 | "## A brief introduction to Feature Engineering"
932 | ]
933 | },
934 | {
935 | "cell_type": "code",
936 | "execution_count": 37,
937 | "metadata": {},
938 | "outputs": [
939 | {
940 | "data": {
941 | "text/plain": [
942 | "-2 2759\n",
943 | "-1 5686\n",
944 | " 0 14737\n",
945 | " 1 3688\n",
946 | " 2 2667\n",
947 | " 3 322\n",
948 | " 4 76\n",
949 | " 5 26\n",
950 | " 6 11\n",
951 | " 7 9\n",
952 | " 8 19\n",
953 | "Name: pay_1, dtype: int64"
954 | ]
955 | },
956 | "execution_count": 37,
957 | "metadata": {},
958 | "output_type": "execute_result"
959 | }
960 | ],
961 | "source": [
962 | "ccd['pay_1'].value_counts().sort_index()"
963 | ]
964 | },
965 | {
966 | "cell_type": "code",
967 | "execution_count": 38,
968 | "metadata": {},
969 | "outputs": [],
970 | "source": [
971 | "# fixing the pay_i features\n",
972 | "pay_features= ['pay_' + str(i) for i in range(1,7)]\n",
973 | "for x in pay_features:\n",
974 | " ccd.loc[ccd[x] <= 0, x] = 0"
975 | ]
976 | },
977 | {
978 | "cell_type": "code",
979 | "execution_count": 39,
980 | "metadata": {
981 | "collapsed": true
982 | },
983 | "outputs": [],
984 | "source": [
985 | "# producing delayed features\n",
986 | "delayed_features = ['delayed_' + str(i) for i in range(1,7)]\n",
987 | "for pay, delayed in zip(pay_features, delayed_features):\n",
988 | " ccd[delayed] = (ccd[pay] > 0).astype(int)"
989 | ]
990 | },
991 | {
992 | "cell_type": "code",
993 | "execution_count": 44,
994 | "metadata": {},
995 | "outputs": [
996 | {
997 | "data": {
998 | "text/plain": [
999 | "delayed_1 0.227267\n",
1000 | "delayed_2 0.147933\n",
1001 | "delayed_3 0.140433\n",
1002 | "delayed_4 0.117000\n",
1003 | "delayed_5 0.098933\n",
1004 | "delayed_6 0.102633\n",
1005 | "dtype: float64"
1006 | ]
1007 | },
1008 | "execution_count": 44,
1009 | "metadata": {},
1010 | "output_type": "execute_result"
1011 | }
1012 | ],
1013 | "source": [
1014 | "ccd[delayed_features].mean()"
1015 | ]
1016 | },
1017 | {
1018 | "cell_type": "code",
1019 | "execution_count": null,
1020 | "metadata": {
1021 | "collapsed": true
1022 | },
1023 | "outputs": [],
1024 | "source": [
1025 | "ccd['months_delayed'] = ccd[delayed_features].sum(axis=1)"
1026 | ]
1027 | },
1028 | {
1029 | "cell_type": "markdown",
1030 | "metadata": {},
1031 | "source": [
1032 | "Done."
1033 | ]
1034 | },
1035 | {
1036 | "cell_type": "code",
1037 | "execution_count": null,
1038 | "metadata": {
1039 | "collapsed": true
1040 | },
1041 | "outputs": [],
1042 | "source": []
1043 | }
1044 | ],
1045 | "metadata": {
1046 | "kernelspec": {
1047 | "display_name": "Python 3",
1048 | "language": "python",
1049 | "name": "python3"
1050 | },
1051 | "language_info": {
1052 | "codemirror_mode": {
1053 | "name": "ipython",
1054 | "version": 3
1055 | },
1056 | "file_extension": ".py",
1057 | "mimetype": "text/x-python",
1058 | "name": "python",
1059 | "nbconvert_exporter": "python",
1060 | "pygments_lexer": "ipython3",
1061 | "version": "3.6.1"
1062 | }
1063 | },
1064 | "nbformat": 4,
1065 | "nbformat_minor": 2
1066 | }
1067 |
--------------------------------------------------------------------------------
/Chapter05/.ipynb_checkpoints/ch5-predicting-credit-card-default-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Predicting Credit Card Default\n",
8 | "\n",
9 | "If you are using Windows, don't forget to add:\n",
10 | "\n",
11 | "C:\\Users\\\"user_name\"\\Anaconda3\\\"environment_name\"\\Library\\bin\\graphviz\\\n",
12 | "\n",
13 | "to the PATH environment variable"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "import numpy as np\n",
23 | "import pandas as pd\n",
24 | "import matplotlib.pyplot as plt\n",
25 | "import seaborn as sns\n",
26 | "import os\n",
27 | "%matplotlib inline"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "### Back with the credit card default dataset"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "# Loading the dataset\n",
44 | "DATA_DIR = '../data'\n",
45 | "FILE_NAME = 'credit_card_default.csv'\n",
46 | "data_path = os.path.join(DATA_DIR, FILE_NAME)\n",
47 | "ccd = pd.read_csv(data_path, index_col=\"ID\")\n",
48 | "ccd.rename(columns=lambda x: x.lower(), inplace=True)\n",
49 | "ccd.rename(columns={'default payment next month':'default'}, inplace=True)\n",
50 | "\n",
51 | "# getting the groups of features\n",
52 | "bill_amt_features = ['bill_amt'+ str(i) for i in range(1,7)]\n",
53 | "pay_amt_features = ['pay_amt'+ str(i) for i in range(1,7)]\n",
54 | "numerical_features = ['limit_bal','age'] + bill_amt_features + pay_amt_features\n",
55 | "\n",
56 | "# Creating binary features\n",
57 | "ccd['male'] = (ccd['sex'] == 1).astype('int')\n",
58 | "ccd['grad_school'] = (ccd['education'] == 1).astype('int')\n",
59 | "ccd['university'] = (ccd['education'] == 2).astype('int')\n",
60 | "#ccd['high_school'] = (ccd['education'] == 3).astype('int')\n",
61 | "ccd['married'] = (ccd['marriage'] == 1).astype('int')\n",
62 | "\n",
63 | "# simplifying pay features \n",
64 | "pay_features= ['pay_' + str(i) for i in range(1,7)]\n",
65 | "for x in pay_features:\n",
66 | " ccd.loc[ccd[x] <= 0, x] = 0\n",
67 | "\n",
68 | "# creating delayed features\n",
69 | "delayed_features = ['delayed_' + str(i) for i in range(1,7)]\n",
70 | "for pay, delayed in zip(pay_features, delayed_features):\n",
71 | " ccd[delayed] = (ccd[pay] > 0).astype(int)\n",
72 | " \n",
73 | "# creating a new feature: months delayed\n",
74 | "ccd['months_delayed'] = ccd[delayed_features].sum(axis=1)"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "## Splitting the dataset"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": null,
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "numerical_features = numerical_features + ['months_delayed']\n",
91 | "binary_features = ['male','married','grad_school','university']\n",
92 | "X = ccd[numerical_features + binary_features]\n",
93 | "y = ccd['default'].astype(int)"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {},
100 | "outputs": [],
101 | "source": [
102 | "from sklearn.model_selection import train_test_split\n",
103 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=5/30, random_state=101)"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {
110 | "scrolled": true
111 | },
112 | "outputs": [],
113 | "source": [
114 | "# 1. Import the class you will use\n",
115 | "from sklearn.preprocessing import StandardScaler\n",
116 | "# 2. Create an instance of the class\n",
117 | "scaler = StandardScaler()\n",
118 | "# 3. Use the fit method of the instance\n",
119 | "scaler.fit(X_train[numerical_features])\n",
120 | "# 4. Use the transform method to perform the transformation\n",
121 | "X_train.loc[:, numerical_features] = scaler.transform(X_train[numerical_features])"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "metadata": {},
127 | "source": [
128 | "## Logistic Regression"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {},
134 | "source": [
135 | "### A simple Logistic Regression model"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": null,
141 | "metadata": {},
142 | "outputs": [],
143 | "source": [
144 | "from sklearn.linear_model import LogisticRegression\n",
145 | "simple_log_reg = LogisticRegression(C=1e6)\n",
146 | "simple_log_reg.fit(X_train['months_delayed'].values.reshape(-1, 1), y_train)"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": null,
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "print(\"W0: {}, W1: {}\".format(simple_log_reg.intercept_[0], simple_log_reg.coef_[0][0]))"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "def get_probs(months_delayed):\n",
165 | " m = scaler.mean_[-1]\n",
166 | " std = scaler.var_[-1]**.5\n",
167 | " x = (months_delayed - m)/std\n",
168 | " prob_default = 1/(1+np.exp(-simple_log_reg.intercept_[0] + -simple_log_reg.coef_[0][0]*x))\n",
169 | " return prob_default"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": null,
175 | "metadata": {},
176 | "outputs": [],
177 | "source": [
178 | "months = np.arange(13)\n",
179 | "pred_probs = get_probs(months)\n",
180 | "pd.DataFrame({'months': months, 'pred_probs':pred_probs})"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": null,
186 | "metadata": {},
187 | "outputs": [],
188 | "source": [
189 | "fig, ax = plt.subplots()\n",
190 | "ax.plot(months, pred_probs)\n",
191 | "ax.set_xlabel('Months delayed')\n",
192 | "ax.set_ylabel('Probability of default')\n",
193 | "ax.grid()"
194 | ]
195 | },
196 | {
197 | "cell_type": "markdown",
198 | "metadata": {},
199 | "source": [
200 | "### A complete Logistic Regression model"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": null,
206 | "metadata": {},
207 | "outputs": [],
208 | "source": [
209 | "log_reg = LogisticRegression(C=1e6)\n",
210 | "log_reg.fit(X_train, y_train)"
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": null,
216 | "metadata": {},
217 | "outputs": [],
218 | "source": [
219 | "prob_log_reg = log_reg.predict_proba(X_train)\n",
220 | "prob_log_reg[:10]"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": null,
226 | "metadata": {},
227 | "outputs": [],
228 | "source": [
229 | "y_pred_log_reg = log_reg.predict(X_train)\n",
230 | "y_pred_log_reg[:10]"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": null,
236 | "metadata": {},
237 | "outputs": [],
238 | "source": [
239 | "np.all(y_pred_log_reg == (prob_log_reg[:,1] > 0.5))"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": null,
245 | "metadata": {},
246 | "outputs": [],
247 | "source": [
248 | "pd.Series(data=log_reg.coef_[0], index=X_train.columns).sort_values(ascending=False).round(2)"
249 | ]
250 | },
251 | {
252 | "cell_type": "code",
253 | "execution_count": null,
254 | "metadata": {},
255 | "outputs": [],
256 | "source": [
257 | "from sklearn.metrics import accuracy_score\n",
258 | "accuracy_log_reg = accuracy_score(y_true=y_train, y_pred=y_pred_log_reg)\n",
259 | "accuracy_log_reg"
260 | ]
261 | },
262 | {
263 | "cell_type": "markdown",
264 | "metadata": {},
265 | "source": [
266 | "## Classification Trees"
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": null,
272 | "metadata": {},
273 | "outputs": [],
274 | "source": [
275 | "from sklearn.tree import DecisionTreeClassifier\n",
276 | "class_tree = DecisionTreeClassifier(max_depth=3)\n",
277 | "class_tree.fit(X_train, y_train)"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": null,
283 | "metadata": {},
284 | "outputs": [],
285 | "source": [
286 | "from io import StringIO\n",
287 | "from sklearn.tree import export_graphviz\n",
288 | "from IPython.display import Image \n",
289 | "import pydotplus"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": null,
295 | "metadata": {},
296 | "outputs": [],
297 | "source": [
298 | "dot_data = StringIO()\n",
299 | "export_graphviz(decision_tree=class_tree,\n",
300 | " out_file=dot_data,\n",
301 | " filled=True,\n",
302 | " rounded=True,\n",
303 | " feature_names = X_train.columns,\n",
304 | " class_names = ['pay','default'],\n",
305 | " special_characters=True)\n",
306 | "graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) \n",
307 | "Image(graph.create_png())"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": null,
313 | "metadata": {},
314 | "outputs": [],
315 | "source": [
316 | "dot_data = StringIO()\n",
317 | "export_graphviz(decision_tree=class_tree,\n",
318 | " out_file=dot_data,\n",
319 | " filled=True,\n",
320 | " rounded=True,\n",
321 | " proportion=True,\n",
322 | " feature_names = X_train.columns,\n",
323 | " class_names = ['pay','default'],\n",
324 | " special_characters=True)\n",
325 | "graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) \n",
326 | "Image(graph.create_png())"
327 | ]
328 | },
329 | {
330 | "cell_type": "markdown",
331 | "metadata": {},
332 | "source": [
333 | "### How trees work"
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": null,
339 | "metadata": {},
340 | "outputs": [],
341 | "source": [
342 | "from sklearn.datasets import make_blobs"
343 | ]
344 | },
345 | {
346 | "cell_type": "code",
347 | "execution_count": null,
348 | "metadata": {},
349 | "outputs": [],
350 | "source": [
351 | "A, b = make_blobs(n_samples=200, n_features=2, cluster_std=0.6,\n",
352 | " centers=[[-0.5,-1],[0.5,0.5]], shuffle=False, random_state=42)\n",
353 | "plt.scatter(A[:, 0], A[:, 1], c=b)\n",
354 | "plt.xlabel('X1', size=15)\n",
355 | "plt.ylabel('X2', size=15);"
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": null,
361 | "metadata": {},
362 | "outputs": [],
363 | "source": [
364 | "plt.scatter(A[:, 0], A[:, 1], c=b)\n",
365 | "plt.axhline(-0.6, c='red')\n",
366 | "plt.xlabel('X1', size=15)\n",
367 | "plt.ylabel('X2', size=15);"
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": null,
373 | "metadata": {},
374 | "outputs": [],
375 | "source": [
376 | "plt.scatter(A[:, 0], A[:, 1], c=b)\n",
377 | "plt.axhline(-0.6, c='red')\n",
378 | "plt.axvline(x=-0.1, ymin=0.34, c='red')\n",
379 | "plt.xlabel('X1', size=15)\n",
380 | "plt.ylabel('X2', size=15);"
381 | ]
382 | },
383 | {
384 | "cell_type": "code",
385 | "execution_count": null,
386 | "metadata": {},
387 | "outputs": [],
388 | "source": [
389 | "plt.scatter(A[:, 0], A[:, 1], c=b)\n",
390 | "plt.axhline(-0.6, c='red')\n",
391 | "plt.axvline(x=-0.1, ymin=0.34, c='red')\n",
392 | "plt.axvline(x=0.7, ymax=0.34, c='red')\n",
393 | "plt.xlabel('X1', size=15)\n",
394 | "plt.ylabel('X2', size=15);"
395 | ]
396 | },
397 | {
398 | "cell_type": "markdown",
399 | "metadata": {},
400 | "source": [
401 | "### Training a larger classification tree"
402 | ]
403 | },
404 | {
405 | "cell_type": "code",
406 | "execution_count": null,
407 | "metadata": {},
408 | "outputs": [],
409 | "source": [
410 | "class_tree = DecisionTreeClassifier(max_depth=6, min_samples_split=50)\n",
411 | "class_tree.fit(X_train, y_train)\n",
412 | "y_pred_class_tree = class_tree.predict(X_train)"
413 | ]
414 | },
415 | {
416 | "cell_type": "code",
417 | "execution_count": null,
418 | "metadata": {},
419 | "outputs": [],
420 | "source": [
421 | "accuracy_class_tree = accuracy_score(y_true=y_train, y_pred=y_pred_class_tree)\n",
422 | "accuracy_class_tree"
423 | ]
424 | },
425 | {
426 | "cell_type": "code",
427 | "execution_count": null,
428 | "metadata": {},
429 | "outputs": [],
430 | "source": [
431 | "pd.Series(data=class_tree.feature_importances_, index=X_train.columns).sort_values(ascending=False).round(3)"
432 | ]
433 | },
434 | {
435 | "cell_type": "code",
436 | "execution_count": null,
437 | "metadata": {},
438 | "outputs": [],
439 | "source": [
440 | "pd.Series(data=class_tree.feature_importances_, index=X_train.columns).sort_values(ascending=False).plot(kind='bar');"
441 | ]
442 | },
443 | {
444 | "cell_type": "markdown",
445 | "metadata": {},
446 | "source": [
447 | "## Random Forests"
448 | ]
449 | },
450 | {
451 | "cell_type": "code",
452 | "execution_count": null,
453 | "metadata": {},
454 | "outputs": [],
455 | "source": [
456 | "from sklearn.ensemble import RandomForestClassifier\n",
457 | "rf = RandomForestClassifier(n_estimators=99,\n",
458 | " max_features=6,\n",
459 | " max_depth=6,\n",
460 | " min_samples_split=100,\n",
461 | " random_state=85)\n",
462 | "rf.fit(X_train, y_train)\n",
463 | "y_pred_rf = rf.predict(X_train)"
464 | ]
465 | },
466 | {
467 | "cell_type": "code",
468 | "execution_count": null,
469 | "metadata": {},
470 | "outputs": [],
471 | "source": [
472 | "accuracy_rf = accuracy_score(y_true=y_train, y_pred=y_pred_rf)\n",
473 | "accuracy_rf"
474 | ]
475 | },
476 | {
477 | "cell_type": "code",
478 | "execution_count": null,
479 | "metadata": {},
480 | "outputs": [],
481 | "source": [
482 | "pd.Series(data=rf.feature_importances_, index=X_train.columns).sort_values(ascending=False).round(3)"
483 | ]
484 | },
485 | {
486 | "cell_type": "markdown",
487 | "metadata": {},
488 | "source": [
489 | "## Training vs Testing Error"
490 | ]
491 | },
492 | {
493 | "cell_type": "code",
494 | "execution_count": null,
495 | "metadata": {},
496 | "outputs": [],
497 | "source": [
498 | "y_pred_null = np.zeros_like(y_test)\n",
499 | "accuracy_score(y_true=y_test, y_pred=y_pred_null)"
500 | ]
501 | },
502 | {
503 | "cell_type": "code",
504 | "execution_count": null,
505 | "metadata": {},
506 | "outputs": [],
507 | "source": [
508 | "## Remember to also standardize the numerical features in the testing set\n",
509 | "X_test.loc[:, numerical_features] = scaler.transform(X_test[numerical_features])"
510 | ]
511 | },
512 | {
513 | "cell_type": "code",
514 | "execution_count": null,
515 | "metadata": {},
516 | "outputs": [],
517 | "source": [
518 | "## Calculating accuracy\n",
519 | "accuracies = pd.DataFrame(columns=['train', 'test'], index=['LogisticReg','ClassTree','RF'])\n",
520 | "model_dict = {'LogisticReg': log_reg, 'ClassTree': class_tree, 'RF': rf}\n",
521 | "for name, model in model_dict.items():\n",
522 | " accuracies.loc[name, 'train'] = accuracy_score(y_true=y_train, y_pred=model.predict(X_train))\n",
523 | " accuracies.loc[name, 'test'] = accuracy_score(y_true=y_test, y_pred=model.predict(X_test))\n",
524 | "\n",
525 | "accuracies"
526 | ]
527 | },
528 | {
529 | "cell_type": "code",
530 | "execution_count": null,
531 | "metadata": {},
532 | "outputs": [],
533 | "source": [
534 | "fig, ax = plt.subplots()\n",
535 | "accuracies.sort_values(by='test', ascending=False).plot(kind='barh', ax=ax, zorder=3)\n",
536 | "ax.grid(zorder=0)"
537 | ]
538 | },
539 | {
540 | "cell_type": "markdown",
541 | "metadata": {},
542 | "source": [
543 | "## Multiclass classification"
544 | ]
545 | },
546 | {
547 | "cell_type": "code",
548 | "execution_count": null,
549 | "metadata": {},
550 | "outputs": [],
551 | "source": [
552 | "# Loading the iris dataset\n",
553 | "from sklearn.datasets import load_iris\n",
554 | "iris = load_iris()\n",
555 | "# Training the logistic regression model\n",
556 | "iris_log_reg = LogisticRegression(C=1e5)\n",
557 | "iris_log_reg.fit(iris.data, iris.target)\n",
558 | "iris_probs = iris_log_reg.predict_proba(iris.data)\n",
559 | "iris_pred = iris_log_reg.predict(iris.data)"
560 | ]
561 | },
562 | {
563 | "cell_type": "code",
564 | "execution_count": null,
565 | "metadata": {},
566 | "outputs": [],
567 | "source": [
568 | "iris_pred_df = pd.DataFrame(iris_probs, columns=iris.target_names).round(4)\n",
569 | "iris_pred_df['predicted_class'] = iris.target_names[iris_pred]\n",
570 | "iris_pred_df.sample(12)"
571 | ]
572 | },
573 | {
574 | "cell_type": "code",
575 | "execution_count": null,
576 | "metadata": {},
577 | "outputs": [],
578 | "source": []
579 | }
580 | ],
581 | "metadata": {
582 | "kernelspec": {
583 | "display_name": "Python 3",
584 | "language": "python",
585 | "name": "python3"
586 | },
587 | "language_info": {
588 | "codemirror_mode": {
589 | "name": "ipython",
590 | "version": 3
591 | },
592 | "file_extension": ".py",
593 | "mimetype": "text/x-python",
594 | "name": "python",
595 | "nbconvert_exporter": "python",
596 | "pygments_lexer": "ipython3",
597 | "version": "3.6.10"
598 | }
599 | },
600 | "nbformat": 4,
601 | "nbformat_minor": 2
602 | }
603 |
--------------------------------------------------------------------------------
/Chapter05/ch5-predicting-credit-card-default.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Predicting Credit Card Default\n",
8 | "\n",
9 | "If you are using Windows, don't forget to add:\n",
10 | "\n",
11 | "C:\\Users\\\"user_name\"\\Anaconda3\\\"environment_name\"\\Library\\bin\\graphviz\\\n",
12 | "\n",
13 | "to the PATH environment variable"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "import numpy as np\n",
23 | "import pandas as pd\n",
24 | "import matplotlib.pyplot as plt\n",
25 | "import seaborn as sns\n",
26 | "import os\n",
27 | "%matplotlib inline"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "### Back with the credit card default dataset"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "# Loading the dataset\n",
44 | "DATA_DIR = '../Data'\n",
45 | "FILE_NAME = 'credit_card_default.csv'\n",
46 | "data_path = os.path.join(DATA_DIR, FILE_NAME)\n",
47 | "ccd = pd.read_csv(data_path, index_col=\"ID\")\n",
48 | "ccd.rename(columns=lambda x: x.lower(), inplace=True)\n",
49 | "ccd.rename(columns={'default payment next month':'default'}, inplace=True)\n",
50 | "\n",
51 | "# getting the groups of features\n",
52 | "bill_amt_features = ['bill_amt'+ str(i) for i in range(1,7)]\n",
53 | "pay_amt_features = ['pay_amt'+ str(i) for i in range(1,7)]\n",
54 | "numerical_features = ['limit_bal','age'] + bill_amt_features + pay_amt_features\n",
55 | "\n",
56 | "# Creating binary features\n",
57 | "ccd['male'] = (ccd['sex'] == 1).astype('int')\n",
58 | "ccd['grad_school'] = (ccd['education'] == 1).astype('int')\n",
59 | "ccd['university'] = (ccd['education'] == 2).astype('int')\n",
60 | "#ccd['high_school'] = (ccd['education'] == 3).astype('int')\n",
61 | "ccd['married'] = (ccd['marriage'] == 1).astype('int')\n",
62 | "\n",
63 | "# simplifying pay features \n",
64 | "pay_features= ['pay_' + str(i) for i in range(1,7)]\n",
65 | "for x in pay_features:\n",
66 | " ccd.loc[ccd[x] <= 0, x] = 0\n",
67 | "\n",
68 | "# simplifying delayed features\n",
69 | "delayed_features = ['delayed_' + str(i) for i in range(1,7)]\n",
70 | "for pay, delayed in zip(pay_features, delayed_features):\n",
71 | " ccd[delayed] = (ccd[pay] > 0).astype(int)\n",
72 | " \n",
73 | "# creating a new feature: months delayed\n",
74 | "ccd['months_delayed'] = ccd[delayed_features].sum(axis=1)"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "## Splitting the dataset"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": null,
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "numerical_features = numerical_features + ['months_delayed']\n",
91 | "binary_features = ['male','married','grad_school','university']\n",
92 | "X = ccd[numerical_features + binary_features]\n",
93 | "y = ccd['default'].astype(int)"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {},
100 | "outputs": [],
101 | "source": [
102 | "from sklearn.model_selection import train_test_split\n",
103 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=5/30, random_state=101)"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {
110 | "scrolled": true
111 | },
112 | "outputs": [],
113 | "source": [
114 | "# 1. Import the class you will use\n",
115 | "from sklearn.preprocessing import StandardScaler\n",
116 | "# 2. Create an instance of the class\n",
117 | "scaler = StandardScaler()\n",
118 | "# 3. Use the fit method of the instance\n",
119 | "scaler.fit(X_train[numerical_features])\n",
120 | "# 4. Use the transform method to perform the transformation\n",
121 | "X_train.loc[:, numerical_features] = scaler.transform(X_train[numerical_features])"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "metadata": {},
127 | "source": [
128 | "## Logistic Regression"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {},
134 | "source": [
135 | "### A simple Logistic Regression model"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": null,
141 | "metadata": {},
142 | "outputs": [],
143 | "source": [
144 | "from sklearn.linear_model import LogisticRegression\n",
145 | "simple_log_reg = LogisticRegression(C=1e6)\n",
146 | "simple_log_reg.fit(X_train['months_delayed'].values.reshape(-1, 1), y_train)"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": null,
152 | "metadata": {},
153 | "outputs": [],
154 | "source": [
155 | "print(\"W0: {}, W1: {}\".format(simple_log_reg.intercept_[0], simple_log_reg.coef_[0][0]))"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "def get_probs(months_delayed):\n",
165 | " m = scaler.mean_[-1]\n",
166 | " std = scaler.var_[-1]**.5\n",
167 | " x = (months_delayed - m)/std\n",
168 | "    prob_default = 1/(1+np.exp(-(simple_log_reg.intercept_[0] + simple_log_reg.coef_[0][0]*x)))\n",
169 | " return prob_default"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": null,
175 | "metadata": {},
176 | "outputs": [],
177 | "source": [
178 | "months = np.arange(13)\n",
179 | "pred_probs = get_probs(months)\n",
180 | "pd.DataFrame({'months': months, 'pred_probs':pred_probs})"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": null,
186 | "metadata": {},
187 | "outputs": [],
188 | "source": [
189 | "fig, ax = plt.subplots()\n",
190 | "ax.plot(months, pred_probs)\n",
191 | "ax.set_xlabel('Months delayed')\n",
192 | "ax.set_ylabel('Probability of default')\n",
193 | "ax.grid()"
194 | ]
195 | },
196 | {
197 | "cell_type": "markdown",
198 | "metadata": {},
199 | "source": [
200 | "### A complete Logistic Regression model"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": null,
206 | "metadata": {},
207 | "outputs": [],
208 | "source": [
209 | "log_reg = LogisticRegression(C=1e6)\n",
210 | "log_reg.fit(X_train, y_train)"
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": null,
216 | "metadata": {},
217 | "outputs": [],
218 | "source": [
219 | "prob_log_reg = log_reg.predict_proba(X_train)\n",
220 | "prob_log_reg[:10]"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": null,
226 | "metadata": {},
227 | "outputs": [],
228 | "source": [
229 | "y_pred_log_reg = log_reg.predict(X_train)\n",
230 | "y_pred_log_reg[:10]"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": null,
236 | "metadata": {},
237 | "outputs": [],
238 | "source": [
239 | "np.all(y_pred_log_reg == (prob_log_reg[:,1] > 0.5))"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": null,
245 | "metadata": {},
246 | "outputs": [],
247 | "source": [
248 | "pd.Series(data=log_reg.coef_[0], index=X_train.columns).sort_values(ascending=False).round(2)"
249 | ]
250 | },
251 | {
252 | "cell_type": "code",
253 | "execution_count": null,
254 | "metadata": {},
255 | "outputs": [],
256 | "source": [
257 | "from sklearn.metrics import accuracy_score\n",
258 | "accuracy_log_reg = accuracy_score(y_true=y_train, y_pred=y_pred_log_reg)\n",
259 | "accuracy_log_reg"
260 | ]
261 | },
262 | {
263 | "cell_type": "markdown",
264 | "metadata": {},
265 | "source": [
266 | "## Classification Trees"
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": null,
272 | "metadata": {},
273 | "outputs": [],
274 | "source": [
275 | "from sklearn.tree import DecisionTreeClassifier\n",
276 | "class_tree = DecisionTreeClassifier(max_depth=3)\n",
277 | "class_tree.fit(X_train, y_train)"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": null,
283 | "metadata": {},
284 | "outputs": [],
285 | "source": [
286 | "from io import StringIO\n",
287 | "from sklearn.tree import export_graphviz\n",
288 | "from IPython.display import Image \n",
289 | "import pydotplus"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": null,
295 | "metadata": {},
296 | "outputs": [],
297 | "source": [
298 | "dot_data = StringIO()\n",
299 | "export_graphviz(decision_tree=class_tree,\n",
300 | " out_file=dot_data,\n",
301 | " filled=True,\n",
302 | " rounded=True,\n",
303 | " feature_names = X_train.columns,\n",
304 | " class_names = ['pay','default'],\n",
305 | " special_characters=True)\n",
306 | "graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) \n",
307 | "Image(graph.create_png())"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": null,
313 | "metadata": {},
314 | "outputs": [],
315 | "source": [
316 | "dot_data = StringIO()\n",
317 | "export_graphviz(decision_tree=class_tree,\n",
318 | " out_file=dot_data,\n",
319 | " filled=True,\n",
320 | " rounded=True,\n",
321 | " proportion=True,\n",
322 | " feature_names = X_train.columns,\n",
323 | " class_names = ['pay','default'],\n",
324 | " special_characters=True)\n",
325 | "graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) \n",
326 | "Image(graph.create_png())"
327 | ]
328 | },
329 | {
330 | "cell_type": "markdown",
331 | "metadata": {},
332 | "source": [
333 | "### How trees work"
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": null,
339 | "metadata": {},
340 | "outputs": [],
341 | "source": [
342 | "from sklearn.datasets import make_blobs"
343 | ]
344 | },
345 | {
346 | "cell_type": "code",
347 | "execution_count": null,
348 | "metadata": {},
349 | "outputs": [],
350 | "source": [
351 | "A, b = make_blobs(n_samples=200, n_features=2, cluster_std=0.6,\n",
352 | " centers=[[-0.5,-1],[0.5,0.5]], shuffle=False, random_state=42)\n",
353 | "plt.scatter(A[:, 0], A[:, 1], c=b)\n",
354 | "plt.xlabel('X1', size=15)\n",
355 | "plt.ylabel('X2', size=15);"
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": null,
361 | "metadata": {},
362 | "outputs": [],
363 | "source": [
364 | "plt.scatter(A[:, 0], A[:, 1], c=b)\n",
365 | "plt.axhline(-0.6, c='red')\n",
366 | "plt.xlabel('X1', size=15)\n",
367 | "plt.ylabel('X2', size=15);"
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": null,
373 | "metadata": {},
374 | "outputs": [],
375 | "source": [
376 | "plt.scatter(A[:, 0], A[:, 1], c=b)\n",
377 | "plt.axhline(-0.6, c='red')\n",
378 | "plt.axvline(x=-0.1, ymin=0.34, c='red')\n",
379 | "plt.xlabel('X1', size=15)\n",
380 | "plt.ylabel('X2', size=15);"
381 | ]
382 | },
383 | {
384 | "cell_type": "code",
385 | "execution_count": null,
386 | "metadata": {},
387 | "outputs": [],
388 | "source": [
389 | "plt.scatter(A[:, 0], A[:, 1], c=b)\n",
390 | "plt.axhline(-0.6, c='red')\n",
391 | "plt.axvline(x=-0.1, ymin=0.34, c='red')\n",
392 | "plt.axvline(x=0.7, ymax=0.34, c='red')\n",
393 | "plt.xlabel('X1', size=15)\n",
394 | "plt.ylabel('X2', size=15);"
395 | ]
396 | },
397 | {
398 | "cell_type": "markdown",
399 | "metadata": {},
400 | "source": [
401 | "### Training a larger classification tree"
402 | ]
403 | },
404 | {
405 | "cell_type": "code",
406 | "execution_count": null,
407 | "metadata": {},
408 | "outputs": [],
409 | "source": [
410 | "class_tree = DecisionTreeClassifier(max_depth=6, min_samples_split=50)\n",
411 | "class_tree.fit(X_train, y_train)\n",
412 | "y_pred_class_tree = class_tree.predict(X_train)"
413 | ]
414 | },
415 | {
416 | "cell_type": "code",
417 | "execution_count": null,
418 | "metadata": {},
419 | "outputs": [],
420 | "source": [
421 | "accuracy_class_tree = accuracy_score(y_true=y_train, y_pred=y_pred_class_tree)\n",
422 | "accuracy_class_tree"
423 | ]
424 | },
425 | {
426 | "cell_type": "code",
427 | "execution_count": null,
428 | "metadata": {},
429 | "outputs": [],
430 | "source": [
431 | "pd.Series(data=class_tree.feature_importances_, index=X_train.columns).sort_values(ascending=False).round(3)"
432 | ]
433 | },
434 | {
435 | "cell_type": "code",
436 | "execution_count": null,
437 | "metadata": {},
438 | "outputs": [],
439 | "source": [
440 | "pd.Series(data=class_tree.feature_importances_, index=X_train.columns).sort_values(ascending=False).plot(kind='bar');"
441 | ]
442 | },
443 | {
444 | "cell_type": "markdown",
445 | "metadata": {},
446 | "source": [
447 | "## Random Forests"
448 | ]
449 | },
450 | {
451 | "cell_type": "code",
452 | "execution_count": null,
453 | "metadata": {},
454 | "outputs": [],
455 | "source": [
456 | "from sklearn.ensemble import RandomForestClassifier\n",
457 | "rf = RandomForestClassifier(n_estimators=99,\n",
458 | " max_features=6,\n",
459 | " max_depth=6,\n",
460 | " min_samples_split=100,\n",
461 | " random_state=85)\n",
462 | "rf.fit(X_train, y_train)\n",
463 | "y_pred_rf = rf.predict(X_train)"
464 | ]
465 | },
466 | {
467 | "cell_type": "code",
468 | "execution_count": null,
469 | "metadata": {},
470 | "outputs": [],
471 | "source": [
472 | "accuracy_rf = accuracy_score(y_true=y_train, y_pred=y_pred_rf)\n",
473 | "accuracy_rf"
474 | ]
475 | },
476 | {
477 | "cell_type": "code",
478 | "execution_count": null,
479 | "metadata": {},
480 | "outputs": [],
481 | "source": [
482 | "pd.Series(data=rf.feature_importances_, index=X_train.columns).sort_values(ascending=False).round(3)"
483 | ]
484 | },
485 | {
486 | "cell_type": "markdown",
487 | "metadata": {},
488 | "source": [
489 | "## Training vs Testing Error"
490 | ]
491 | },
492 | {
493 | "cell_type": "code",
494 | "execution_count": null,
495 | "metadata": {},
496 | "outputs": [],
497 | "source": [
498 | "y_pred_null = np.zeros_like(y_test)\n",
499 | "accuracy_score(y_true=y_test, y_pred=y_pred_null)"
500 | ]
501 | },
502 | {
503 | "cell_type": "code",
504 | "execution_count": null,
505 | "metadata": {},
506 | "outputs": [],
507 | "source": [
508 | "## Remember to also standardize the numerical features in the testing set\n",
509 | "X_test.loc[:, numerical_features] = scaler.transform(X_test[numerical_features])"
510 | ]
511 | },
512 | {
513 | "cell_type": "code",
514 | "execution_count": null,
515 | "metadata": {},
516 | "outputs": [],
517 | "source": [
518 | "## Calculating accuracy\n",
519 | "accuracies = pd.DataFrame(columns=['train', 'test'], index=['LogisticReg','ClassTree','RF'])\n",
520 | "model_dict = {'LogisticReg': log_reg, 'ClassTree': class_tree, 'RF': rf}\n",
521 | "for name, model in model_dict.items():\n",
522 | " accuracies.loc[name, 'train'] = accuracy_score(y_true=y_train, y_pred=model.predict(X_train))\n",
523 | " accuracies.loc[name, 'test'] = accuracy_score(y_true=y_test, y_pred=model.predict(X_test))\n",
524 | "\n",
525 | "accuracies"
526 | ]
527 | },
528 | {
529 | "cell_type": "code",
530 | "execution_count": null,
531 | "metadata": {},
532 | "outputs": [],
533 | "source": [
534 | "fig, ax = plt.subplots()\n",
535 | "accuracies.sort_values(by='test', ascending=False).plot(kind='barh', ax=ax, zorder=3)\n",
536 | "ax.grid(zorder=0)"
537 | ]
538 | },
539 | {
540 | "cell_type": "markdown",
541 | "metadata": {},
542 | "source": [
543 | "## Multiclass classification"
544 | ]
545 | },
546 | {
547 | "cell_type": "code",
548 | "execution_count": null,
549 | "metadata": {},
550 | "outputs": [],
551 | "source": [
552 | "# Loading the iris dataset\n",
553 | "from sklearn.datasets import load_iris\n",
554 | "iris = load_iris()\n",
555 | "# Training the logistic regression model\n",
556 | "iris_log_reg = LogisticRegression(C=1e5)\n",
557 | "iris_log_reg.fit(iris.data, iris.target)\n",
558 | "iris_probs = iris_log_reg.predict_proba(iris.data)\n",
559 | "iris_pred = iris_log_reg.predict(iris.data)"
560 | ]
561 | },
562 | {
563 | "cell_type": "code",
564 | "execution_count": null,
565 | "metadata": {},
566 | "outputs": [],
567 | "source": [
568 | "iris_pred_df = pd.DataFrame(iris_probs, columns=iris.target_names).round(4)\n",
569 | "iris_pred_df['predicted_class'] = iris.target_names[iris_pred]\n",
570 | "iris_pred_df.sample(12)"
571 | ]
572 | },
573 | {
574 | "cell_type": "code",
575 | "execution_count": null,
576 | "metadata": {},
577 | "outputs": [],
578 | "source": []
579 | }
580 | ],
581 | "metadata": {
582 | "kernelspec": {
583 | "display_name": "Python 3",
584 | "language": "python",
585 | "name": "python3"
586 | },
587 | "language_info": {
588 | "codemirror_mode": {
589 | "name": "ipython",
590 | "version": 3
591 | },
592 | "file_extension": ".py",
593 | "mimetype": "text/x-python",
594 | "name": "python",
595 | "nbconvert_exporter": "python",
596 | "pygments_lexer": "ipython3",
597 | "version": "3.6.10"
598 | }
599 | },
600 | "nbformat": 4,
601 | "nbformat_minor": 2
602 | }
603 |
--------------------------------------------------------------------------------
/Chapter06/.ipynb_checkpoints/ch6-2-classification-with-neural-networks-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Predicting Credit Card Default with Neural Networks"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import numpy as np\n",
17 | "import pandas as pd\n",
18 | "import matplotlib.pyplot as plt\n",
19 | "import seaborn as sns\n",
20 | "import os\n",
21 | "%matplotlib inline"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "### Back with the credit card default dataset"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 2,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "# Loading the dataset\n",
38 | "DATA_DIR = '../Data'\n",
39 | "FILE_NAME = 'credit_card_default.csv'\n",
40 | "data_path = os.path.join(DATA_DIR, FILE_NAME)\n",
41 | "ccd = pd.read_csv(data_path, index_col=\"ID\")\n",
42 | "ccd.rename(columns=lambda x: x.lower(), inplace=True)\n",
43 | "ccd.rename(columns={'default payment next month':'default'}, inplace=True)\n",
44 | "\n",
45 | "# getting the groups of features\n",
46 | "bill_amt_features = ['bill_amt'+ str(i) for i in range(1,7)]\n",
47 | "pay_amt_features = ['pay_amt'+ str(i) for i in range(1,7)]\n",
48 | "numerical_features = ['limit_bal','age'] + bill_amt_features + pay_amt_features\n",
49 | "\n",
50 | "# Creating binary features\n",
51 | "ccd['male'] = (ccd['sex'] == 1).astype('int')\n",
52 | "ccd['grad_school'] = (ccd['education'] == 1).astype('int')\n",
53 | "ccd['university'] = (ccd['education'] == 2).astype('int')\n",
54 | "#ccd['high_school'] = (ccd['education'] == 3).astype('int')\n",
55 | "ccd['married'] = (ccd['marriage'] == 1).astype('int')\n",
56 | "\n",
57 | "# simplifying pay features \n",
58 | "pay_features= ['pay_' + str(i) for i in range(1,7)]\n",
59 | "for x in pay_features:\n",
60 | " ccd.loc[ccd[x] <= 0, x] = 0\n",
61 | "\n",
62 | "# simplifying delayed features\n",
63 | "delayed_features = ['delayed_' + str(i) for i in range(1,7)]\n",
64 | "for pay, delayed in zip(pay_features, delayed_features):\n",
65 | " ccd[delayed] = (ccd[pay] > 0).astype(int)\n",
66 | " \n",
67 | "# creating a new feature: months delayed\n",
68 | "ccd['months_delayed'] = ccd[delayed_features].sum(axis=1)"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 | "## Split and standardize the dataset"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 3,
81 | "metadata": {},
82 | "outputs": [
83 | {
84 | "name": "stderr",
85 | "output_type": "stream",
86 | "text": [
87 | "C:\\Anaconda\\envs\\ho-pawp\\lib\\site-packages\\pandas\\core\\indexing.py:966: SettingWithCopyWarning: \n",
88 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
89 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
90 | "\n",
91 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
92 | " self.obj[item] = s\n",
93 | "C:\\Anaconda\\envs\\ho-pawp\\lib\\site-packages\\pandas\\core\\indexing.py:966: SettingWithCopyWarning: \n",
94 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
95 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
96 | "\n",
97 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
98 | " self.obj[item] = s\n"
99 | ]
100 | }
101 | ],
102 | "source": [
103 | "numerical_features = numerical_features + ['months_delayed']\n",
104 | "binary_features = ['male','married','grad_school','university']\n",
105 | "X = ccd[numerical_features + binary_features]\n",
106 | "y = ccd['default'].astype(int)\n",
107 | "\n",
108 | "## Split\n",
109 | "from sklearn.model_selection import train_test_split\n",
110 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=5/30, random_state=101)\n",
111 | "\n",
112 | "## Standardize\n",
113 | "from sklearn.preprocessing import StandardScaler\n",
114 | "scaler = StandardScaler()\n",
115 | "scaler.fit(X_train[numerical_features])\n",
116 | "X_train.loc[:, numerical_features] = scaler.transform(X_train[numerical_features])\n",
117 | "# Standardize also the testing set\n",
118 | "X_test.loc[:, numerical_features] = scaler.transform(X_test[numerical_features])"
119 | ]
120 | },
121 | {
122 | "cell_type": "markdown",
123 | "metadata": {},
124 | "source": [
125 | "### Building the neural network for classification"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": 4,
131 | "metadata": {},
132 | "outputs": [
133 | {
134 | "name": "stderr",
135 | "output_type": "stream",
136 | "text": [
137 | "Using TensorFlow backend.\n"
138 | ]
139 | }
140 | ],
141 | "source": [
142 | "from keras.models import Sequential\n",
143 | "nn_classifier = Sequential()"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 5,
149 | "metadata": {},
150 | "outputs": [],
151 | "source": [
152 | "from keras.layers import Dense\n",
153 | "n_input = X_train.shape[1]\n",
154 | "n_units_hidden = 64\n",
155 | "nn_classifier.add(Dense(units=n_units_hidden, activation='relu', input_shape=(n_input,)))"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 6,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "# add 2nd hidden layer\n",
165 | "nn_classifier.add(Dense(units=n_units_hidden, activation='relu'))\n",
166 | "# add 3rd hidden layer\n",
167 | "nn_classifier.add(Dense(units=n_units_hidden, activation='relu'))\n",
168 | "# add 4th hidden layer\n",
169 | "nn_classifier.add(Dense(units=n_units_hidden, activation='relu'))\n",
170 | "# add 5th hidden layer\n",
171 | "nn_classifier.add(Dense(units=n_units_hidden, activation='relu'))"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": 7,
177 | "metadata": {},
178 | "outputs": [],
179 | "source": [
180 | "# output layer\n",
181 | "nn_classifier.add(Dense(1, activation='sigmoid'))"
182 | ]
183 | },
184 | {
185 | "cell_type": "markdown",
186 | "metadata": {},
187 | "source": [
188 | "### Training the network"
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": 8,
194 | "metadata": {},
195 | "outputs": [],
196 | "source": [
197 | "## compiling step\n",
198 | "nn_classifier.compile(loss='binary_crossentropy', optimizer='adam')"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": 9,
204 | "metadata": {},
205 | "outputs": [
206 | {
207 | "name": "stdout",
208 | "output_type": "stream",
209 | "text": [
210 | "Model: \"sequential_1\"\n",
211 | "_________________________________________________________________\n",
212 | "Layer (type) Output Shape Param # \n",
213 | "=================================================================\n",
214 | "dense_1 (Dense) (None, 64) 1280 \n",
215 | "_________________________________________________________________\n",
216 | "dense_2 (Dense) (None, 64) 4160 \n",
217 | "_________________________________________________________________\n",
218 | "dense_3 (Dense) (None, 64) 4160 \n",
219 | "_________________________________________________________________\n",
220 | "dense_4 (Dense) (None, 64) 4160 \n",
221 | "_________________________________________________________________\n",
222 | "dense_5 (Dense) (None, 64) 4160 \n",
223 | "_________________________________________________________________\n",
224 | "dense_6 (Dense) (None, 1) 65 \n",
225 | "=================================================================\n",
226 | "Total params: 17,985\n",
227 | "Trainable params: 17,985\n",
228 | "Non-trainable params: 0\n",
229 | "_________________________________________________________________\n"
230 | ]
231 | }
232 | ],
233 | "source": [
234 | "nn_classifier.summary()"
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": 10,
240 | "metadata": {},
241 | "outputs": [],
242 | "source": [
243 | "nn_classifier.save_weights('class_initial_w.h5')"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": 11,
249 | "metadata": {},
250 | "outputs": [
251 | {
252 | "name": "stdout",
253 | "output_type": "stream",
254 | "text": [
255 | "Epoch 1/150\n",
256 | "25000/25000 [==============================] - 1s 30us/step - loss: 0.4690\n",
257 | "Epoch 2/150\n",
258 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4481\n",
259 | "Epoch 3/150\n",
260 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4446\n",
261 | "Epoch 4/150\n",
262 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4428\n",
263 | "Epoch 5/150\n",
264 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4405\n",
265 | "Epoch 6/150\n",
266 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4399\n",
267 | "Epoch 7/150\n",
268 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4388\n",
269 | "Epoch 8/150\n",
270 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4379\n",
271 | "Epoch 9/150\n",
272 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4365\n",
273 | "Epoch 10/150\n",
274 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4353\n",
275 | "Epoch 11/150\n",
276 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4348\n",
277 | "Epoch 12/150\n",
278 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4334\n",
279 | "Epoch 13/150\n",
280 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4325\n",
281 | "Epoch 14/150\n",
282 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4303\n",
283 | "Epoch 15/150\n",
284 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4289\n",
285 | "Epoch 16/150\n",
286 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4282\n",
287 | "Epoch 17/150\n",
288 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4262\n",
289 | "Epoch 18/150\n",
290 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4243\n",
291 | "Epoch 19/150\n",
292 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4240\n",
293 | "Epoch 20/150\n",
294 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4217\n",
295 | "Epoch 21/150\n",
296 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4203\n",
297 | "Epoch 22/150\n",
298 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4180\n",
299 | "Epoch 23/150\n",
300 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4166\n",
301 | "Epoch 24/150\n",
302 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4145\n",
303 | "Epoch 25/150\n",
304 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4128\n",
305 | "Epoch 26/150\n",
306 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4113\n",
307 | "Epoch 27/150\n",
308 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4088\n",
309 | "Epoch 28/150\n",
310 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4065\n",
311 | "Epoch 29/150\n",
312 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4038\n",
313 | "Epoch 30/150\n",
314 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4034\n",
315 | "Epoch 31/150\n",
316 | "25000/25000 [==============================] - ETA: 0s - loss: 0.397 - 1s 22us/step - loss: 0.3986\n",
317 | "Epoch 32/150\n",
318 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3986\n",
319 | "Epoch 33/150\n",
320 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3968\n",
321 | "Epoch 34/150\n",
322 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3920\n",
323 | "Epoch 35/150\n",
324 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3888\n",
325 | "Epoch 36/150\n",
326 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3870\n",
327 | "Epoch 37/150\n",
328 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3869\n",
329 | "Epoch 38/150\n",
330 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3828\n",
331 | "Epoch 39/150\n",
332 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3821\n",
333 | "Epoch 40/150\n",
334 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3779\n",
335 | "Epoch 41/150\n",
336 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3760\n",
337 | "Epoch 42/150\n",
338 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3739\n",
339 | "Epoch 43/150\n",
340 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3742\n",
341 | "Epoch 44/150\n",
342 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3683\n",
343 | "Epoch 45/150\n",
344 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3663\n",
345 | "Epoch 46/150\n",
346 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3641\n",
347 | "Epoch 47/150\n",
348 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3625\n",
349 | "Epoch 48/150\n",
350 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3605\n",
351 | "Epoch 49/150\n",
352 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3571\n",
353 | "Epoch 50/150\n",
354 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3525\n",
355 | "Epoch 51/150\n",
356 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3547\n",
357 | "Epoch 52/150\n",
358 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3493\n",
359 | "Epoch 53/150\n",
360 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3481\n",
361 | "Epoch 54/150\n",
362 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3484\n",
363 | "Epoch 55/150\n",
364 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3442\n",
365 | "Epoch 56/150\n",
366 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3426\n",
367 | "Epoch 57/150\n",
368 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3386\n",
369 | "Epoch 58/150\n",
370 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3404\n",
371 | "Epoch 59/150\n",
372 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3381\n",
373 | "Epoch 60/150\n",
374 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.3370\n",
375 | "Epoch 61/150\n",
376 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3307\n",
377 | "Epoch 62/150\n",
378 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3301\n",
379 | "Epoch 63/150\n",
380 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3283\n",
381 | "Epoch 64/150\n",
382 | "25000/25000 [==============================] - 1s 26us/step - loss: 0.3248\n",
383 | "Epoch 65/150\n",
384 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3261\n",
385 | "Epoch 66/150\n",
386 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3221\n",
387 | "Epoch 67/150\n",
388 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3203\n",
389 | "Epoch 68/150\n",
390 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3228\n",
391 | "Epoch 69/150\n",
392 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3181\n",
393 | "Epoch 70/150\n",
394 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3193\n",
395 | "Epoch 71/150\n",
396 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3115\n",
397 | "Epoch 72/150\n",
398 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3155\n",
399 | "Epoch 73/150\n",
400 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3161\n",
401 | "Epoch 74/150\n",
402 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.3071\n",
403 | "Epoch 75/150\n",
404 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3089\n",
405 | "Epoch 76/150\n",
406 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3074\n",
407 | "Epoch 77/150\n",
408 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3079\n",
409 | "Epoch 78/150\n",
410 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3001\n",
411 | "Epoch 79/150\n",
412 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3061\n",
413 | "Epoch 80/150\n",
414 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.3023\n",
415 | "Epoch 81/150\n",
416 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3015\n",
417 | "Epoch 82/150\n",
418 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.2933\n",
419 | "Epoch 83/150\n",
420 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2968\n",
421 | "Epoch 84/150\n",
422 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2973\n",
423 | "Epoch 85/150\n",
424 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2949\n",
425 | "Epoch 86/150\n",
426 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2941\n",
427 | "Epoch 87/150\n",
428 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.2835\n",
429 | "Epoch 88/150\n",
430 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2889\n",
431 | "Epoch 89/150\n",
432 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.2862\n",
433 | "Epoch 90/150\n",
434 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2876\n",
435 | "Epoch 91/150\n",
436 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2809\n",
437 | "Epoch 92/150\n",
438 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2876\n",
439 | "Epoch 93/150\n"
440 | ]
441 | },
442 | {
443 | "name": "stdout",
444 | "output_type": "stream",
445 | "text": [
446 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2804\n",
447 | "Epoch 94/150\n",
448 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2834\n",
449 | "Epoch 95/150\n",
450 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2803\n",
451 | "Epoch 96/150\n",
452 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2822\n",
453 | "Epoch 97/150\n",
454 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2810\n",
455 | "Epoch 98/150\n",
456 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2751\n",
457 | "Epoch 99/150\n",
458 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2754\n",
459 | "Epoch 100/150\n",
460 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2708\n",
461 | "Epoch 101/150\n",
462 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2708\n",
463 | "Epoch 102/150\n",
464 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2890\n",
465 | "Epoch 103/150\n",
466 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2701\n",
467 | "Epoch 104/150\n",
468 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2674\n",
469 | "Epoch 105/150\n",
470 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2684\n",
471 | "Epoch 106/150\n",
472 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2673\n",
473 | "Epoch 107/150\n",
474 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2690\n",
475 | "Epoch 108/150\n",
476 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2659\n",
477 | "Epoch 109/150\n",
478 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2624\n",
479 | "Epoch 110/150\n",
480 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2663\n",
481 | "Epoch 111/150\n",
482 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2614\n",
483 | "Epoch 112/150\n",
484 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2583\n",
485 | "Epoch 113/150\n",
486 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2560\n",
487 | "Epoch 114/150\n",
488 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2597\n",
489 | "Epoch 115/150\n",
490 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2599\n",
491 | "Epoch 116/150\n",
492 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2570\n",
493 | "Epoch 117/150\n",
494 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2552\n",
495 | "Epoch 118/150\n",
496 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2508\n",
497 | "Epoch 119/150\n",
498 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2494\n",
499 | "Epoch 120/150\n",
500 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2518\n",
501 | "Epoch 121/150\n",
502 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2473\n",
503 | "Epoch 122/150\n",
504 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2574\n",
505 | "Epoch 123/150\n",
506 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2521\n",
507 | "Epoch 124/150\n",
508 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2537\n",
509 | "Epoch 125/150\n",
510 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2490\n",
511 | "Epoch 126/150\n",
512 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2459\n",
513 | "Epoch 127/150\n",
514 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2457\n",
515 | "Epoch 128/150\n",
516 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2479\n",
517 | "Epoch 129/150\n",
518 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2511\n",
519 | "Epoch 130/150\n",
520 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2422\n",
521 | "Epoch 131/150\n",
522 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2419\n",
523 | "Epoch 132/150\n",
524 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2390\n",
525 | "Epoch 133/150\n",
526 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2396\n",
527 | "Epoch 134/150\n",
528 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2336\n",
529 | "Epoch 135/150\n",
530 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2331\n",
531 | "Epoch 136/150\n",
532 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2507\n",
533 | "Epoch 137/150\n",
534 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2420\n",
535 | "Epoch 138/150\n",
536 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2434\n",
537 | "Epoch 139/150\n",
538 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2335\n",
539 | "Epoch 140/150\n",
540 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2268\n",
541 | "Epoch 141/150\n",
542 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2317\n",
543 | "Epoch 142/150\n",
544 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2314\n",
545 | "Epoch 143/150\n",
546 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2426\n",
547 | "Epoch 144/150\n",
548 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2306\n",
549 | "Epoch 145/150\n",
550 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2402\n",
551 | "Epoch 146/150\n",
552 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2297\n",
553 | "Epoch 147/150\n",
554 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2253\n",
555 | "Epoch 148/150\n",
556 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2187\n",
557 | "Epoch 149/150\n",
558 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2243\n",
559 | "Epoch 150/150\n",
560 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2256\n"
561 | ]
562 | },
563 | {
564 | "data": {
565 | "text/plain": [
566 | ""
567 | ]
568 | },
569 | "execution_count": 11,
570 | "metadata": {},
571 | "output_type": "execute_result"
572 | }
573 | ],
574 | "source": [
575 | "batch_size = 64\n",
576 | "n_epochs = 150\n",
577 | "nn_classifier.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_size)"
578 | ]
579 | },
580 | {
581 | "cell_type": "markdown",
582 | "metadata": {},
583 | "source": [
584 | "## Evaluating predictions"
585 | ]
586 | },
587 | {
588 | "cell_type": "code",
589 | "execution_count": 12,
590 | "metadata": {},
591 | "outputs": [],
592 | "source": [
593 | "## Getting the probabilities\n",
594 | "y_pred_train_prob = nn_classifier.predict(X_train)\n",
595 | "y_pred_test_prob = nn_classifier.predict(X_test)\n",
596 | "\n",
597 | "## Classifications from predictions\n",
598 | "y_pred_train = (y_pred_train_prob > 0.5).astype(int)\n",
599 | "y_pred_test = (y_pred_test_prob > 0.5).astype(int)"
600 | ]
601 | },
602 | {
603 | "cell_type": "code",
604 | "execution_count": 13,
605 | "metadata": {},
606 | "outputs": [
607 | {
608 | "name": "stdout",
609 | "output_type": "stream",
610 | "text": [
611 | "Train Accuracy: 0.903 \n",
612 | "Test Accuracy: 0.750\n"
613 | ]
614 | }
615 | ],
616 | "source": [
617 | "from sklearn.metrics import accuracy_score\n",
618 | "train_acc = accuracy_score(y_true=y_train, y_pred=y_pred_train)\n",
619 | "test_acc = accuracy_score(y_true=y_test, y_pred=y_pred_test)\n",
620 | "print(\"Train Accuracy: {:0.3f} \\nTest Accuracy: {:0.3f}\".format(train_acc, test_acc))"
621 | ]
622 | },
623 | {
624 | "cell_type": "markdown",
625 | "metadata": {},
626 | "source": [
627 | "## Re-training the network with fewer epochs"
628 | ]
629 | },
630 | {
631 | "cell_type": "code",
632 | "execution_count": 14,
633 | "metadata": {},
634 | "outputs": [],
635 | "source": [
636 | "## load the initial weights\n",
637 | "nn_classifier.load_weights('class_initial_w.h5')"
638 | ]
639 | },
640 | {
641 | "cell_type": "code",
642 | "execution_count": null,
643 | "metadata": {
644 | "scrolled": true
645 | },
646 | "outputs": [
647 | {
648 | "name": "stdout",
649 | "output_type": "stream",
650 | "text": [
651 | "Epoch 1/50\n",
652 | "25000/25000 [==============================] - 1s 30us/step - loss: 0.4680\n",
653 | "Epoch 2/50\n",
654 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4479\n",
655 | "Epoch 3/50\n",
656 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4454\n",
657 | "Epoch 4/50\n",
658 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4430\n",
659 | "Epoch 5/50\n",
660 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4407\n",
661 | "Epoch 6/50\n",
662 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4401\n",
663 | "Epoch 7/50\n",
664 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4381\n",
665 | "Epoch 8/50\n",
666 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4372\n",
667 | "Epoch 9/50\n",
668 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4356\n",
669 | "Epoch 10/50\n",
670 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4350\n",
671 | "Epoch 11/50\n",
672 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4334\n",
673 | "Epoch 12/50\n",
674 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4323\n",
675 | "Epoch 13/50\n",
676 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4320\n",
677 | "Epoch 14/50\n",
678 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4302\n",
679 | "Epoch 15/50\n",
680 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.4284\n",
681 | "Epoch 16/50\n",
682 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4278\n",
683 | "Epoch 17/50\n",
684 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4260\n",
685 | "Epoch 18/50\n",
686 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4249\n",
687 | "Epoch 19/50\n",
688 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4226\n",
689 | "Epoch 20/50\n",
690 | "10752/25000 [===========>..................] - ETA: 0s - loss: 0.4187"
691 | ]
692 | }
693 | ],
694 | "source": [
695 | "batch_size = 64\n",
696 | "n_epochs = 50\n",
697 | "nn_classifier.compile(loss='binary_crossentropy', optimizer='adam')\n",
698 | "nn_classifier.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_size)"
699 | ]
700 | },
701 | {
702 | "cell_type": "code",
703 | "execution_count": null,
704 | "metadata": {},
705 | "outputs": [],
706 | "source": [
707 | "## Getting the probabilities\n",
708 | "y_pred_train_prob = nn_classifier.predict(X_train)\n",
709 | "y_pred_test_prob = nn_classifier.predict(X_test)\n",
710 | "\n",
711 | "## Classifications from predictions\n",
712 | "y_pred_train = (y_pred_train_prob > 0.5).astype(int)\n",
713 | "y_pred_test = (y_pred_test_prob > 0.5).astype(int)\n",
714 | "\n",
715 | "## Calculating accuracy\n",
716 | "train_acc = accuracy_score(y_true=y_train, y_pred=y_pred_train)\n",
717 | "test_acc = accuracy_score(y_true=y_test, y_pred=y_pred_test)\n",
718 | "print(\"Train Accuracy: {:0.3f} \\nTest Accuracy: {:0.3f}\".format(train_acc, test_acc))"
719 | ]
720 | },
721 | {
722 | "cell_type": "code",
723 | "execution_count": null,
724 | "metadata": {},
725 | "outputs": [],
726 | "source": []
727 | },
728 | {
729 | "cell_type": "code",
730 | "execution_count": null,
731 | "metadata": {},
732 | "outputs": [],
733 | "source": []
734 | },
735 | {
736 | "cell_type": "code",
737 | "execution_count": null,
738 | "metadata": {},
739 | "outputs": [],
740 | "source": []
741 | },
742 | {
743 | "cell_type": "code",
744 | "execution_count": null,
745 | "metadata": {},
746 | "outputs": [],
747 | "source": []
748 | }
749 | ],
750 | "metadata": {
751 | "kernelspec": {
752 | "display_name": "Python 3",
753 | "language": "python",
754 | "name": "python3"
755 | },
756 | "language_info": {
757 | "codemirror_mode": {
758 | "name": "ipython",
759 | "version": 3
760 | },
761 | "file_extension": ".py",
762 | "mimetype": "text/x-python",
763 | "name": "python",
764 | "nbconvert_exporter": "python",
765 | "pygments_lexer": "ipython3",
766 | "version": "3.6.10"
767 | }
768 | },
769 | "nbformat": 4,
770 | "nbformat_minor": 2
771 | }
772 |
--------------------------------------------------------------------------------
/Chapter06/ch6-2-classification-with-neural-networks.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Predicting Credit Card Default with Neural Networks"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import numpy as np\n",
17 | "import pandas as pd\n",
18 | "import matplotlib.pyplot as plt\n",
19 | "import seaborn as sns\n",
20 | "import os\n",
21 | "%matplotlib inline"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "### Back with the credit card default dataset"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 2,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "# Loading the dataset\n",
38 | "DATA_DIR = '../data'\n",
39 | "FILE_NAME = 'credit_card_default.csv'\n",
40 | "data_path = os.path.join(DATA_DIR, FILE_NAME)\n",
41 | "ccd = pd.read_csv(data_path, index_col=\"ID\")\n",
42 | "ccd.rename(columns=lambda x: x.lower(), inplace=True)\n",
43 | "ccd.rename(columns={'default payment next month':'default'}, inplace=True)\n",
44 | "\n",
45 | "# getting the groups of features\n",
46 | "bill_amt_features = ['bill_amt'+ str(i) for i in range(1,7)]\n",
47 | "pay_amt_features = ['pay_amt'+ str(i) for i in range(1,7)]\n",
48 | "numerical_features = ['limit_bal','age'] + bill_amt_features + pay_amt_features\n",
49 | "\n",
50 | "# Creating binary features\n",
51 | "ccd['male'] = (ccd['sex'] == 1).astype('int')\n",
52 | "ccd['grad_school'] = (ccd['education'] == 1).astype('int')\n",
53 | "ccd['university'] = (ccd['education'] == 2).astype('int')\n",
54 | "#ccd['high_school'] = (ccd['education'] == 3).astype('int')\n",
55 | "ccd['married'] = (ccd['marriage'] == 1).astype('int')\n",
56 | "\n",
57 | "# simplifying pay features \n",
58 | "pay_features= ['pay_' + str(i) for i in range(1,7)]\n",
59 | "for x in pay_features:\n",
60 | " ccd.loc[ccd[x] <= 0, x] = 0\n",
61 | "\n",
62 | "# simplifying delayed features\n",
63 | "delayed_features = ['delayed_' + str(i) for i in range(1,7)]\n",
64 | "for pay, delayed in zip(pay_features, delayed_features):\n",
65 | " ccd[delayed] = (ccd[pay] > 0).astype(int)\n",
66 | " \n",
67 | "# creating a new feature: months delayed\n",
68 | "ccd['months_delayed'] = ccd[delayed_features].sum(axis=1)"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 | "## Split and standardize the dataset"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 3,
81 | "metadata": {},
82 | "outputs": [
83 | {
84 | "name": "stderr",
85 | "output_type": "stream",
86 | "text": [
87 | "C:\\Anaconda\\envs\\ho-pawp\\lib\\site-packages\\pandas\\core\\indexing.py:966: SettingWithCopyWarning: \n",
88 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
89 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
90 | "\n",
91 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
92 | " self.obj[item] = s\n",
93 | "C:\\Anaconda\\envs\\ho-pawp\\lib\\site-packages\\pandas\\core\\indexing.py:966: SettingWithCopyWarning: \n",
94 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
95 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
96 | "\n",
97 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
98 | " self.obj[item] = s\n"
99 | ]
100 | }
101 | ],
102 | "source": [
103 | "numerical_features = numerical_features + ['months_delayed']\n",
104 | "binary_features = ['male','married','grad_school','university']\n",
105 | "X = ccd[numerical_features + binary_features]\n",
106 | "y = ccd['default'].astype(int)\n",
107 | "\n",
108 | "## Split\n",
109 | "from sklearn.model_selection import train_test_split\n",
110 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=5/30, random_state=101)\n",
111 | "\n",
112 | "## Standardize\n",
113 | "from sklearn.preprocessing import StandardScaler\n",
114 | "scaler = StandardScaler()\n",
115 | "scaler.fit(X_train[numerical_features])\n",
116 | "X_train.loc[:, numerical_features] = scaler.transform(X_train[numerical_features])\n",
117 | "# Standardize also the testing set\n",
118 | "X_test.loc[:, numerical_features] = scaler.transform(X_test[numerical_features])"
119 | ]
120 | },
121 | {
122 | "cell_type": "markdown",
123 | "metadata": {},
124 | "source": [
125 | "### Building the neural network for classification"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": 4,
131 | "metadata": {},
132 | "outputs": [
133 | {
134 | "name": "stderr",
135 | "output_type": "stream",
136 | "text": [
137 | "Using TensorFlow backend.\n"
138 | ]
139 | }
140 | ],
141 | "source": [
142 | "from keras.models import Sequential\n",
143 | "nn_classifier = Sequential()"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 5,
149 | "metadata": {},
150 | "outputs": [],
151 | "source": [
152 | "from keras.layers import Dense\n",
153 | "n_input = X_train.shape[1]\n",
154 | "n_units_hidden = 64\n",
155 | "nn_classifier.add(Dense(units=n_units_hidden, activation='relu', input_shape=(n_input,)))"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 6,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "# add 2nd hidden layer\n",
165 | "nn_classifier.add(Dense(units=n_units_hidden, activation='relu'))\n",
166 | "# add 3rd hidden layer\n",
167 | "nn_classifier.add(Dense(units=n_units_hidden, activation='relu'))\n",
168 | "# add 4th hidden layer\n",
169 | "nn_classifier.add(Dense(units=n_units_hidden, activation='relu'))\n",
170 | "# add 5th hidden layer\n",
171 | "nn_classifier.add(Dense(units=n_units_hidden, activation='relu'))"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": 7,
177 | "metadata": {},
178 | "outputs": [],
179 | "source": [
180 | "# output layer\n",
181 | "nn_classifier.add(Dense(1, activation='sigmoid'))"
182 | ]
183 | },
184 | {
185 | "cell_type": "markdown",
186 | "metadata": {},
187 | "source": [
188 | "### Training the network"
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": 8,
194 | "metadata": {},
195 | "outputs": [],
196 | "source": [
197 | "## compiling step\n",
198 | "nn_classifier.compile(loss='binary_crossentropy', optimizer='adam')"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": 9,
204 | "metadata": {},
205 | "outputs": [
206 | {
207 | "name": "stdout",
208 | "output_type": "stream",
209 | "text": [
210 | "Model: \"sequential_1\"\n",
211 | "_________________________________________________________________\n",
212 | "Layer (type) Output Shape Param # \n",
213 | "=================================================================\n",
214 | "dense_1 (Dense) (None, 64) 1280 \n",
215 | "_________________________________________________________________\n",
216 | "dense_2 (Dense) (None, 64) 4160 \n",
217 | "_________________________________________________________________\n",
218 | "dense_3 (Dense) (None, 64) 4160 \n",
219 | "_________________________________________________________________\n",
220 | "dense_4 (Dense) (None, 64) 4160 \n",
221 | "_________________________________________________________________\n",
222 | "dense_5 (Dense) (None, 64) 4160 \n",
223 | "_________________________________________________________________\n",
224 | "dense_6 (Dense) (None, 1) 65 \n",
225 | "=================================================================\n",
226 | "Total params: 17,985\n",
227 | "Trainable params: 17,985\n",
228 | "Non-trainable params: 0\n",
229 | "_________________________________________________________________\n"
230 | ]
231 | }
232 | ],
233 | "source": [
234 | "nn_classifier.summary()"
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": 10,
240 | "metadata": {},
241 | "outputs": [],
242 | "source": [
243 | "nn_classifier.save_weights('class_initial_w.h5')"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": 11,
249 | "metadata": {},
250 | "outputs": [
251 | {
252 | "name": "stdout",
253 | "output_type": "stream",
254 | "text": [
255 | "Epoch 1/150\n",
256 | "25000/25000 [==============================] - 1s 30us/step - loss: 0.4690\n",
257 | "Epoch 2/150\n",
258 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4481\n",
259 | "Epoch 3/150\n",
260 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4446\n",
261 | "Epoch 4/150\n",
262 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4428\n",
263 | "Epoch 5/150\n",
264 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4405\n",
265 | "Epoch 6/150\n",
266 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4399\n",
267 | "Epoch 7/150\n",
268 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4388\n",
269 | "Epoch 8/150\n",
270 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4379\n",
271 | "Epoch 9/150\n",
272 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4365\n",
273 | "Epoch 10/150\n",
274 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4353\n",
275 | "Epoch 11/150\n",
276 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4348\n",
277 | "Epoch 12/150\n",
278 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4334\n",
279 | "Epoch 13/150\n",
280 | "25000/25000 [==============================] - 1s 21us/step - loss: 0.4325\n",
281 | "Epoch 14/150\n",
282 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4303\n",
283 | "Epoch 15/150\n",
284 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4289\n",
285 | "Epoch 16/150\n",
286 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4282\n",
287 | "Epoch 17/150\n",
288 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4262\n",
289 | "Epoch 18/150\n",
290 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4243\n",
291 | "Epoch 19/150\n",
292 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4240\n",
293 | "Epoch 20/150\n",
294 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4217\n",
295 | "Epoch 21/150\n",
296 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4203\n",
297 | "Epoch 22/150\n",
298 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4180\n",
299 | "Epoch 23/150\n",
300 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4166\n",
301 | "Epoch 24/150\n",
302 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4145\n",
303 | "Epoch 25/150\n",
304 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4128\n",
305 | "Epoch 26/150\n",
306 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4113\n",
307 | "Epoch 27/150\n",
308 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4088\n",
309 | "Epoch 28/150\n",
310 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4065\n",
311 | "Epoch 29/150\n",
312 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4038\n",
313 | "Epoch 30/150\n",
314 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4034\n",
315 | "Epoch 31/150\n",
316 | "25000/25000 [==============================] - ETA: 0s - loss: 0.397 - 1s 22us/step - loss: 0.3986\n",
317 | "Epoch 32/150\n",
318 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3986\n",
319 | "Epoch 33/150\n",
320 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3968\n",
321 | "Epoch 34/150\n",
322 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3920\n",
323 | "Epoch 35/150\n",
324 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3888\n",
325 | "Epoch 36/150\n",
326 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3870\n",
327 | "Epoch 37/150\n",
328 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3869\n",
329 | "Epoch 38/150\n",
330 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3828\n",
331 | "Epoch 39/150\n",
332 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3821\n",
333 | "Epoch 40/150\n",
334 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3779\n",
335 | "Epoch 41/150\n",
336 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3760\n",
337 | "Epoch 42/150\n",
338 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3739\n",
339 | "Epoch 43/150\n",
340 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3742\n",
341 | "Epoch 44/150\n",
342 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3683\n",
343 | "Epoch 45/150\n",
344 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3663\n",
345 | "Epoch 46/150\n",
346 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3641\n",
347 | "Epoch 47/150\n",
348 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3625\n",
349 | "Epoch 48/150\n",
350 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3605\n",
351 | "Epoch 49/150\n",
352 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3571\n",
353 | "Epoch 50/150\n",
354 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3525\n",
355 | "Epoch 51/150\n",
356 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3547\n",
357 | "Epoch 52/150\n",
358 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3493\n",
359 | "Epoch 53/150\n",
360 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3481\n",
361 | "Epoch 54/150\n",
362 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3484\n",
363 | "Epoch 55/150\n",
364 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3442\n",
365 | "Epoch 56/150\n",
366 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3426\n",
367 | "Epoch 57/150\n",
368 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3386\n",
369 | "Epoch 58/150\n",
370 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3404\n",
371 | "Epoch 59/150\n",
372 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3381\n",
373 | "Epoch 60/150\n",
374 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.3370\n",
375 | "Epoch 61/150\n",
376 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3307\n",
377 | "Epoch 62/150\n",
378 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3301\n",
379 | "Epoch 63/150\n",
380 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3283\n",
381 | "Epoch 64/150\n",
382 | "25000/25000 [==============================] - 1s 26us/step - loss: 0.3248\n",
383 | "Epoch 65/150\n",
384 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3261\n",
385 | "Epoch 66/150\n",
386 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3221\n",
387 | "Epoch 67/150\n",
388 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3203\n",
389 | "Epoch 68/150\n",
390 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3228\n",
391 | "Epoch 69/150\n",
392 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3181\n",
393 | "Epoch 70/150\n",
394 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3193\n",
395 | "Epoch 71/150\n",
396 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3115\n",
397 | "Epoch 72/150\n",
398 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3155\n",
399 | "Epoch 73/150\n",
400 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3161\n",
401 | "Epoch 74/150\n",
402 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.3071\n",
403 | "Epoch 75/150\n",
404 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3089\n",
405 | "Epoch 76/150\n",
406 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3074\n",
407 | "Epoch 77/150\n",
408 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3079\n",
409 | "Epoch 78/150\n",
410 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3001\n",
411 | "Epoch 79/150\n",
412 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3061\n",
413 | "Epoch 80/150\n",
414 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.3023\n",
415 | "Epoch 81/150\n",
416 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3015\n",
417 | "Epoch 82/150\n",
418 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.2933\n",
419 | "Epoch 83/150\n",
420 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2968\n",
421 | "Epoch 84/150\n",
422 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2973\n",
423 | "Epoch 85/150\n",
424 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2949\n",
425 | "Epoch 86/150\n",
426 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2941\n",
427 | "Epoch 87/150\n",
428 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.2835\n",
429 | "Epoch 88/150\n",
430 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2889\n",
431 | "Epoch 89/150\n",
432 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.2862\n",
433 | "Epoch 90/150\n",
434 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2876\n",
435 | "Epoch 91/150\n",
436 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2809\n",
437 | "Epoch 92/150\n",
438 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.2876\n",
439 | "Epoch 93/150\n"
440 | ]
441 | },
442 | {
443 | "name": "stdout",
444 | "output_type": "stream",
445 | "text": [
446 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2804\n",
447 | "Epoch 94/150\n",
448 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2834\n",
449 | "Epoch 95/150\n",
450 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2803\n",
451 | "Epoch 96/150\n",
452 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2822\n",
453 | "Epoch 97/150\n",
454 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2810\n",
455 | "Epoch 98/150\n",
456 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2751\n",
457 | "Epoch 99/150\n",
458 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2754\n",
459 | "Epoch 100/150\n",
460 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2708\n",
461 | "Epoch 101/150\n",
462 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2708\n",
463 | "Epoch 102/150\n",
464 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2890\n",
465 | "Epoch 103/150\n",
466 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2701\n",
467 | "Epoch 104/150\n",
468 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2674\n",
469 | "Epoch 105/150\n",
470 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2684\n",
471 | "Epoch 106/150\n",
472 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2673\n",
473 | "Epoch 107/150\n",
474 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2690\n",
475 | "Epoch 108/150\n",
476 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2659\n",
477 | "Epoch 109/150\n",
478 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2624\n",
479 | "Epoch 110/150\n",
480 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2663\n",
481 | "Epoch 111/150\n",
482 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2614\n",
483 | "Epoch 112/150\n",
484 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2583\n",
485 | "Epoch 113/150\n",
486 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2560\n",
487 | "Epoch 114/150\n",
488 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2597\n",
489 | "Epoch 115/150\n",
490 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2599\n",
491 | "Epoch 116/150\n",
492 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2570\n",
493 | "Epoch 117/150\n",
494 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2552\n",
495 | "Epoch 118/150\n",
496 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2508\n",
497 | "Epoch 119/150\n",
498 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2494\n",
499 | "Epoch 120/150\n",
500 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2518\n",
501 | "Epoch 121/150\n",
502 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2473\n",
503 | "Epoch 122/150\n",
504 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2574\n",
505 | "Epoch 123/150\n",
506 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2521\n",
507 | "Epoch 124/150\n",
508 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2537\n",
509 | "Epoch 125/150\n",
510 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2490\n",
511 | "Epoch 126/150\n",
512 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2459\n",
513 | "Epoch 127/150\n",
514 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2457\n",
515 | "Epoch 128/150\n",
516 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2479\n",
517 | "Epoch 129/150\n",
518 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2511\n",
519 | "Epoch 130/150\n",
520 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2422\n",
521 | "Epoch 131/150\n",
522 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2419\n",
523 | "Epoch 132/150\n",
524 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2390\n",
525 | "Epoch 133/150\n",
526 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2396\n",
527 | "Epoch 134/150\n",
528 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2336\n",
529 | "Epoch 135/150\n",
530 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2331\n",
531 | "Epoch 136/150\n",
532 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2507\n",
533 | "Epoch 137/150\n",
534 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2420\n",
535 | "Epoch 138/150\n",
536 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2434\n",
537 | "Epoch 139/150\n",
538 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2335\n",
539 | "Epoch 140/150\n",
540 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2268\n",
541 | "Epoch 141/150\n",
542 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2317\n",
543 | "Epoch 142/150\n",
544 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2314\n",
545 | "Epoch 143/150\n",
546 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2426\n",
547 | "Epoch 144/150\n",
548 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2306\n",
549 | "Epoch 145/150\n",
550 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.2402\n",
551 | "Epoch 146/150\n",
552 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2297\n",
553 | "Epoch 147/150\n",
554 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2253\n",
555 | "Epoch 148/150\n",
556 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2187\n",
557 | "Epoch 149/150\n",
558 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2243\n",
559 | "Epoch 150/150\n",
560 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.2256\n"
561 | ]
562 | },
563 | {
564 | "data": {
565 | "text/plain": [
566 | ""
567 | ]
568 | },
569 | "execution_count": 11,
570 | "metadata": {},
571 | "output_type": "execute_result"
572 | }
573 | ],
574 | "source": [
575 | "batch_size = 64\n",
576 | "n_epochs = 150\n",
577 | "nn_classifier.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_size)"
578 | ]
579 | },
580 | {
581 | "cell_type": "markdown",
582 | "metadata": {},
583 | "source": [
584 | "## Evaluating predictions"
585 | ]
586 | },
587 | {
588 | "cell_type": "code",
589 | "execution_count": 12,
590 | "metadata": {},
591 | "outputs": [],
592 | "source": [
593 | "## Getting the probabilities\n",
594 | "y_pred_train_prob = nn_classifier.predict(X_train)\n",
595 | "y_pred_test_prob = nn_classifier.predict(X_test)\n",
596 | "\n",
597 | "## Classifications from predictions\n",
598 | "y_pred_train = (y_pred_train_prob > 0.5).astype(int)\n",
599 | "y_pred_test = (y_pred_test_prob > 0.5).astype(int)"
600 | ]
601 | },
602 | {
603 | "cell_type": "code",
604 | "execution_count": 13,
605 | "metadata": {},
606 | "outputs": [
607 | {
608 | "name": "stdout",
609 | "output_type": "stream",
610 | "text": [
611 | "Train Accuracy: 0.903 \n",
612 | "Test Accuracy: 0.750\n"
613 | ]
614 | }
615 | ],
616 | "source": [
617 | "from sklearn.metrics import accuracy_score\n",
618 | "train_acc = accuracy_score(y_true=y_train, y_pred=y_pred_train)\n",
619 | "test_acc = accuracy_score(y_true=y_test, y_pred=y_pred_test)\n",
620 | "print(\"Train Accuracy: {:0.3f} \\nTest Accuracy: {:0.3f}\".format(train_acc, test_acc))"
621 | ]
622 | },
623 | {
624 | "cell_type": "markdown",
625 | "metadata": {},
626 | "source": [
627 | "## Re-training the network with fewer epochs"
628 | ]
629 | },
630 | {
631 | "cell_type": "code",
632 | "execution_count": 14,
633 | "metadata": {},
634 | "outputs": [],
635 | "source": [
636 | "## load the initial weights\n",
637 | "nn_classifier.load_weights('class_initial_w.h5')"
638 | ]
639 | },
640 | {
641 | "cell_type": "code",
642 | "execution_count": 15,
643 | "metadata": {
644 | "scrolled": true
645 | },
646 | "outputs": [
647 | {
648 | "name": "stdout",
649 | "output_type": "stream",
650 | "text": [
651 | "Epoch 1/50\n",
652 | "25000/25000 [==============================] - 1s 30us/step - loss: 0.4680\n",
653 | "Epoch 2/50\n",
654 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4479\n",
655 | "Epoch 3/50\n",
656 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4454\n",
657 | "Epoch 4/50\n",
658 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4430\n",
659 | "Epoch 5/50\n",
660 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4407\n",
661 | "Epoch 6/50\n",
662 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4401\n",
663 | "Epoch 7/50\n",
664 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4381\n",
665 | "Epoch 8/50\n",
666 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4372\n",
667 | "Epoch 9/50\n",
668 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4356\n",
669 | "Epoch 10/50\n",
670 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4350\n",
671 | "Epoch 11/50\n",
672 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4334\n",
673 | "Epoch 12/50\n",
674 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4323\n",
675 | "Epoch 13/50\n",
676 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4320\n",
677 | "Epoch 14/50\n",
678 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4302\n",
679 | "Epoch 15/50\n",
680 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.4284\n",
681 | "Epoch 16/50\n",
682 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.4278\n",
683 | "Epoch 17/50\n",
684 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4260\n",
685 | "Epoch 18/50\n",
686 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4249\n",
687 | "Epoch 19/50\n",
688 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4226\n",
689 | "Epoch 20/50\n",
690 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.4216\n",
691 | "Epoch 21/50\n",
692 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.4197\n",
693 | "Epoch 22/50\n",
694 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.4178\n",
695 | "Epoch 23/50\n",
696 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.4163\n",
697 | "Epoch 24/50\n",
698 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.4139\n",
699 | "Epoch 25/50\n",
700 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.4110\n",
701 | "Epoch 26/50\n",
702 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.4116\n",
703 | "Epoch 27/50\n",
704 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.4079\n",
705 | "Epoch 28/50\n",
706 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.4056\n",
707 | "Epoch 29/50\n",
708 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.4032\n",
709 | "Epoch 30/50\n",
710 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.4008\n",
711 | "Epoch 31/50\n",
712 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.3989\n",
713 | "Epoch 32/50\n",
714 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3959\n",
715 | "Epoch 33/50\n",
716 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3938\n",
717 | "Epoch 34/50\n",
718 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3893\n",
719 | "Epoch 35/50\n",
720 | "25000/25000 [==============================] - 1s 25us/step - loss: 0.3885\n",
721 | "Epoch 36/50\n",
722 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3854\n",
723 | "Epoch 37/50\n",
724 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3817\n",
725 | "Epoch 38/50\n",
726 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3805\n",
727 | "Epoch 39/50\n",
728 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3804\n",
729 | "Epoch 40/50\n",
730 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3751\n",
731 | "Epoch 41/50\n",
732 | "25000/25000 [==============================] - 1s 26us/step - loss: 0.3745\n",
733 | "Epoch 42/50\n",
734 | "25000/25000 [==============================] - 1s 26us/step - loss: 0.3709\n",
735 | "Epoch 43/50\n",
736 | "25000/25000 [==============================] - 1s 26us/step - loss: 0.3712\n",
737 | "Epoch 44/50\n",
738 | "25000/25000 [==============================] - 1s 29us/step - loss: 0.3657\n",
739 | "Epoch 45/50\n",
740 | "25000/25000 [==============================] - 1s 34us/step - loss: 0.3628\n",
741 | "Epoch 46/50\n",
742 | "25000/25000 [==============================] - 1s 30us/step - loss: 0.3600\n",
743 | "Epoch 47/50\n",
744 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3573\n",
745 | "Epoch 48/50\n",
746 | "25000/25000 [==============================] - 1s 23us/step - loss: 0.3576\n",
747 | "Epoch 49/50\n",
748 | "25000/25000 [==============================] - 1s 24us/step - loss: 0.3536\n",
749 | "Epoch 50/50\n",
750 | "25000/25000 [==============================] - 1s 22us/step - loss: 0.3502\n"
751 | ]
752 | },
753 | {
754 | "data": {
755 | "text/plain": [
756 | ""
757 | ]
758 | },
759 | "execution_count": 15,
760 | "metadata": {},
761 | "output_type": "execute_result"
762 | }
763 | ],
764 | "source": [
765 | "batch_size = 64\n",
766 | "n_epochs = 50\n",
767 | "nn_classifier.compile(loss='binary_crossentropy', optimizer='adam')\n",
768 | "nn_classifier.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_size)"
769 | ]
770 | },
771 | {
772 | "cell_type": "code",
773 | "execution_count": 16,
774 | "metadata": {},
775 | "outputs": [
776 | {
777 | "name": "stdout",
778 | "output_type": "stream",
779 | "text": [
780 | "Train Accuracy: 0.845 \n",
781 | "Test Accuracy: 0.782\n"
782 | ]
783 | }
784 | ],
785 | "source": [
786 | "## Getting the probabilities\n",
787 | "y_pred_train_prob = nn_classifier.predict(X_train)\n",
788 | "y_pred_test_prob = nn_classifier.predict(X_test)\n",
789 | "\n",
790 | "## Classifications from predictions\n",
791 | "y_pred_train = (y_pred_train_prob > 0.5).astype(int)\n",
792 | "y_pred_test = (y_pred_test_prob > 0.5).astype(int)\n",
793 | "\n",
794 | "## Calculating accuracy\n",
795 | "train_acc = accuracy_score(y_true=y_train, y_pred=y_pred_train)\n",
796 | "test_acc = accuracy_score(y_true=y_test, y_pred=y_pred_test)\n",
797 | "print(\"Train Accuracy: {:0.3f} \\nTest Accuracy: {:0.3f}\".format(train_acc, test_acc))"
798 | ]
799 | },
800 | {
801 | "cell_type": "code",
802 | "execution_count": null,
803 | "metadata": {},
804 | "outputs": [],
805 | "source": []
806 | },
807 | {
808 | "cell_type": "code",
809 | "execution_count": null,
810 | "metadata": {},
811 | "outputs": [],
812 | "source": []
813 | },
814 | {
815 | "cell_type": "code",
816 | "execution_count": null,
817 | "metadata": {},
818 | "outputs": [],
819 | "source": []
820 | },
821 | {
822 | "cell_type": "code",
823 | "execution_count": null,
824 | "metadata": {},
825 | "outputs": [],
826 | "source": []
827 | }
828 | ],
829 | "metadata": {
830 | "kernelspec": {
831 | "display_name": "Python 3",
832 | "language": "python",
833 | "name": "python3"
834 | },
835 | "language_info": {
836 | "codemirror_mode": {
837 | "name": "ipython",
838 | "version": 3
839 | },
840 | "file_extension": ".py",
841 | "mimetype": "text/x-python",
842 | "name": "python",
843 | "nbconvert_exporter": "python",
844 | "pygments_lexer": "ipython3",
845 | "version": "3.6.10"
846 | }
847 | },
848 | "nbformat": 4,
849 | "nbformat_minor": 2
850 | }
851 |
--------------------------------------------------------------------------------
/Chapter06/class_initial_w.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Hands-On-Predictive-Analytics-with-Python/e049d9d2870f596892b62aeddec312788d9d0c2c/Chapter06/class_initial_w.h5
--------------------------------------------------------------------------------
/Chapter08/.ipynb_checkpoints/ch8-credit-card-def-model-tuning-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Credit Card Default: Model Tuning and Improving Performance"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "#### Importing libraries"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 1,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "import numpy as np\n",
24 | "import pandas as pd\n",
25 | "import matplotlib.pyplot as plt\n",
26 | "import seaborn as sns\n",
27 | "import os\n",
28 | "\n",
29 | "pd.options.mode.chained_assignment = None\n",
30 | "%matplotlib inline"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "#### Loading and preparing the dataset"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 2,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "# Loading the dataset\n",
47 | "DATA_DIR = '../data'\n",
48 | "FILE_NAME = 'credit_card_default.csv'\n",
49 | "data_path = os.path.join(DATA_DIR, FILE_NAME)\n",
50 | "ccd = pd.read_csv(data_path, index_col=\"ID\")\n",
51 | "ccd.rename(columns=lambda x: x.lower(), inplace=True)\n",
52 | "ccd.rename(columns={'default payment next month':'default'}, inplace=True)\n",
53 | "\n",
54 | "# getting the groups of features\n",
55 | "bill_amt_features = ['bill_amt'+ str(i) for i in range(1,7)]\n",
56 | "pay_amt_features = ['pay_amt'+ str(i) for i in range(1,7)]\n",
57 | "numerical_features = ['limit_bal','age'] + bill_amt_features + pay_amt_features\n",
58 | "\n",
59 | "# Creating binary features\n",
60 | "ccd['male'] = (ccd['sex'] == 1).astype('int')\n",
61 | "ccd['grad_school'] = (ccd['education'] == 1).astype('int')\n",
62 | "ccd['university'] = (ccd['education'] == 2).astype('int')\n",
63 | "ccd['married'] = (ccd['marriage'] == 1).astype('int')\n",
64 | "\n",
65 | "# simplifying pay features \n",
66 | "pay_features= ['pay_' + str(i) for i in range(1,7)]\n",
67 | "for x in pay_features:\n",
68 | " ccd.loc[ccd[x] <= 0, x] = 0\n",
69 | "\n",
70 | "# simplifying delayed features\n",
71 | "delayed_features = ['delayed_' + str(i) for i in range(1,7)]\n",
72 | "for pay, delayed in zip(pay_features, delayed_features):\n",
73 | " ccd[delayed] = (ccd[pay] > 0).astype(int)\n",
74 | " \n",
75 | "# creating a new feature: months delayed\n",
76 | "ccd['months_delayed'] = ccd[delayed_features].sum(axis=1)"
77 | ]
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "metadata": {},
82 | "source": [
 83 |     "#### Splitting and standardizing the dataset"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 3,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "numerical_features = numerical_features + ['months_delayed']\n",
93 | "binary_features = ['male','married','grad_school','university']\n",
94 | "X = ccd[numerical_features + binary_features]\n",
95 | "y = ccd['default'].astype(int)\n",
96 | "\n",
97 | "## Split\n",
98 | "from sklearn.model_selection import train_test_split\n",
99 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=5/30, random_state=25)\n",
100 | "\n",
101 |     "## Standardize\n",
102 | "from sklearn.preprocessing import StandardScaler\n",
103 | "scaler = StandardScaler()\n",
104 | "scaler.fit(X[numerical_features])\n",
105 | "X_train.loc[:, numerical_features] = scaler.transform(X_train[numerical_features])\n",
106 | "X_test.loc[:, numerical_features] = scaler.transform(X_test[numerical_features])"
107 | ]
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "metadata": {},
112 | "source": [
113 | "## Optimizing more than one parameter"
114 | ]
115 | },
116 | {
117 | "cell_type": "markdown",
118 | "metadata": {},
119 | "source": [
120 | "#### Reference model"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": 4,
126 | "metadata": {},
127 | "outputs": [],
128 | "source": [
129 | "from sklearn.model_selection import cross_val_score\n",
130 | "from sklearn.ensemble import RandomForestClassifier\n",
131 | "ref_rf = RandomForestClassifier(n_estimators=25,\n",
132 | " max_features=4,\n",
133 | " max_depth=4,\n",
134 | " random_state=61)\n",
135 | "\n",
136 | "ref_rf_scores = cross_val_score(ref_rf, X_train, y_train, scoring='roc_auc', cv=10)"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 5,
142 | "metadata": {
143 | "scrolled": true
144 | },
145 | "outputs": [
146 | {
147 | "name": "stdout",
148 | "output_type": "stream",
149 | "text": [
150 | "Mean AUC for reference model: 0.7589\n"
151 | ]
152 | }
153 | ],
154 | "source": [
155 | "print(\"Mean AUC for reference model: {:0.4f}\".format(ref_rf_scores.mean()))"
156 | ]
157 | },
158 | {
159 | "cell_type": "markdown",
160 | "metadata": {},
161 | "source": [
162 | "#### Grid Search CV"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": 6,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": [
171 | "from sklearn.model_selection import GridSearchCV\n",
172 | "param_grid = {\"n_estimators\":[25,100,200,400],\n",
173 | " \"max_features\":[4,10,19],\n",
174 | " \"max_depth\":[4,8,16,20]}"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": null,
180 | "metadata": {},
181 | "outputs": [
182 | {
183 | "name": "stdout",
184 | "output_type": "stream",
185 | "text": [
186 | "Fitting 5 folds for each of 48 candidates, totalling 240 fits\n"
187 | ]
188 | },
189 | {
190 | "name": "stderr",
191 | "output_type": "stream",
192 | "text": [
193 | "[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.\n"
194 | ]
195 | }
196 | ],
197 | "source": [
198 | "rf = RandomForestClassifier(random_state=17)\n",
199 | "grid_search = GridSearchCV(estimator=rf,\n",
200 | " param_grid=param_grid,\n",
201 | " scoring='roc_auc',\n",
202 | " cv=5,\n",
203 | " verbose=1,\n",
204 | " n_jobs=4)\n",
205 | "grid_search.fit(X_train, y_train)"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": null,
211 | "metadata": {},
212 | "outputs": [],
213 | "source": [
214 | "gs_results = pd.Series(grid_search.cv_results_['mean_test_score'], index=grid_search.cv_results_['params'])\n",
215 | "gs_results.sort_values(ascending=False)"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": null,
221 | "metadata": {},
222 | "outputs": [],
223 | "source": [
224 | "from sklearn.metrics import precision_recall_curve\n",
225 | "## Fitting the initial (not tuned) model:\n",
226 | "ref_rf.fit(X_train, y_train)\n",
227 | "\n",
228 |     "## Getting the probabilities\n",
229 | "y_prob_tunned = grid_search.predict_proba(X_test)[:,1]\n",
230 | "y_prob_not_tunned = ref_rf.predict_proba(X_test)[:,1]\n",
231 | "\n",
232 | "## Values for plotting the curves\n",
233 | "prec_tuned, recall_tuned, _ = precision_recall_curve(y_test, y_prob_tunned)\n",
234 | "prec_not_tuned, recall_not_tuned, _ = precision_recall_curve(y_test, y_prob_not_tunned)"
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": null,
240 | "metadata": {},
241 | "outputs": [],
242 | "source": [
243 | "fig, ax = plt.subplots(figsize=(8,5))\n",
244 | "ax.plot(prec_tuned, recall_tuned, label='Tuned Model')\n",
245 | "ax.plot(prec_not_tuned, recall_not_tuned, label='Not Tuned Model')\n",
246 | "ax.set_title('Precision-recall curves', fontsize=16)\n",
247 | "ax.set_xlabel('Precision', fontsize=14)\n",
248 | "ax.set_ylabel('Recall', fontsize=14)\n",
249 | "ax.set_xlim(0.3,0.7); ax.set_ylim(0.1,0.9)\n",
250 | "ax.legend(); ax.grid();"
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "execution_count": null,
256 | "metadata": {},
257 | "outputs": [],
258 | "source": []
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": null,
263 | "metadata": {},
264 | "outputs": [],
265 | "source": []
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": null,
270 | "metadata": {},
271 | "outputs": [],
272 | "source": []
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": null,
277 | "metadata": {},
278 | "outputs": [],
279 | "source": []
280 | }
281 | ],
282 | "metadata": {
283 | "kernelspec": {
284 | "display_name": "Python 3",
285 | "language": "python",
286 | "name": "python3"
287 | },
288 | "language_info": {
289 | "codemirror_mode": {
290 | "name": "ipython",
291 | "version": 3
292 | },
293 | "file_extension": ".py",
294 | "mimetype": "text/x-python",
295 | "name": "python",
296 | "nbconvert_exporter": "python",
297 | "pygments_lexer": "ipython3",
298 | "version": "3.6.10"
299 | }
300 | },
301 | "nbformat": 4,
302 | "nbformat_minor": 2
303 | }
304 |
--------------------------------------------------------------------------------
/Chapter08/.ipynb_checkpoints/ch8-diamond-prices-model-tuning-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Diamond Prices: Model Tuning and Improving Performance"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "#### Importing libraries"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 1,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "import numpy as np\n",
24 | "import pandas as pd\n",
25 | "import matplotlib.pyplot as plt\n",
26 | "import seaborn as sns\n",
27 | "import os\n",
28 | "\n",
29 | "pd.options.mode.chained_assignment = None\n",
30 | "%matplotlib inline"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "#### Loading the dataset"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 2,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "DATA_DIR = '../data'\n",
47 | "FILE_NAME = 'diamonds.csv'\n",
48 | "data_path = os.path.join(DATA_DIR, FILE_NAME)\n",
49 | "diamonds = pd.read_csv(data_path)"
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "metadata": {},
55 | "source": [
56 | "#### Preparing the dataset"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": 3,
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "## Preparation done from Chapter 2\n",
66 | "diamonds = diamonds.loc[(diamonds['x']>0) | (diamonds['y']>0)]\n",
67 | "diamonds.loc[11182, 'x'] = diamonds['x'].median()\n",
68 | "diamonds.loc[11182, 'z'] = diamonds['z'].median()\n",
69 | "diamonds = diamonds.loc[~((diamonds['y'] > 30) | (diamonds['z'] > 30))]\n",
70 | "diamonds = pd.concat([diamonds, pd.get_dummies(diamonds['cut'], prefix='cut', drop_first=True)], axis=1)\n",
71 | "diamonds = pd.concat([diamonds, pd.get_dummies(diamonds['color'], prefix='color', drop_first=True)], axis=1)\n",
72 | "diamonds = pd.concat([diamonds, pd.get_dummies(diamonds['clarity'], prefix='clarity', drop_first=True)], axis=1)\n",
73 | "\n",
74 | "## Dimensionality reduction\n",
75 | "from sklearn.decomposition import PCA\n",
76 | "pca = PCA(n_components=1, random_state=123)\n",
77 | "diamonds['dim_index'] = pca.fit_transform(diamonds[['x','y','z']])\n",
78 | "diamonds.drop(['x','y','z'], axis=1, inplace=True)"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 4,
84 | "metadata": {},
85 | "outputs": [
86 | {
87 | "data": {
88 | "text/plain": [
89 | "Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price',\n",
90 | " 'cut_Good', 'cut_Ideal', 'cut_Premium', 'cut_Very Good', 'color_E',\n",
91 | " 'color_F', 'color_G', 'color_H', 'color_I', 'color_J', 'clarity_IF',\n",
92 | " 'clarity_SI1', 'clarity_SI2', 'clarity_VS1', 'clarity_VS2',\n",
93 | " 'clarity_VVS1', 'clarity_VVS2', 'dim_index'],\n",
94 | " dtype='object')"
95 | ]
96 | },
97 | "execution_count": 4,
98 | "metadata": {},
99 | "output_type": "execute_result"
100 | }
101 | ],
102 | "source": [
103 | "diamonds.columns"
104 | ]
105 | },
106 | {
107 | "cell_type": "markdown",
108 | "metadata": {},
109 | "source": [
110 | "#### Train-test split"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": 5,
116 | "metadata": {},
117 | "outputs": [],
118 | "source": [
119 | "X = diamonds.drop(['cut','color','clarity','price'], axis=1)\n",
120 | "y = diamonds['price']\n",
121 | "\n",
122 | "from sklearn.model_selection import train_test_split\n",
123 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=7)"
124 | ]
125 | },
126 | {
127 | "cell_type": "markdown",
128 | "metadata": {},
129 | "source": [
130 |     "#### Standardization: centering and scaling "
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 6,
136 | "metadata": {
137 | "scrolled": true
138 | },
139 | "outputs": [],
140 | "source": [
141 | "numerical_features = ['carat', 'depth', 'table', 'dim_index']\n",
142 | "from sklearn.preprocessing import StandardScaler\n",
143 | "scaler = StandardScaler()\n",
144 | "scaler.fit(X_train[numerical_features])\n",
145 | "X_train.loc[:, numerical_features] = scaler.fit_transform(X_train[numerical_features])\n",
146 | "X_test.loc[:, numerical_features] = scaler.transform(X_test[numerical_features])"
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "metadata": {},
152 | "source": [
153 | "## Optimizing a single hyper-parameter"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": 7,
159 | "metadata": {},
160 | "outputs": [],
161 | "source": [
162 | "X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=13)"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": 8,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": [
171 | "from sklearn.neighbors import KNeighborsRegressor\n",
172 | "from sklearn.metrics import mean_absolute_error\n",
173 | "\n",
174 | "candidates = np.arange(4,16)\n",
175 | "mae_metrics = []\n",
176 | "for k in candidates:\n",
177 | " model = KNeighborsRegressor(n_neighbors=k, weights='distance', metric='minkowski', leaf_size=50, n_jobs=4)\n",
178 | " model.fit(X_train, y_train)\n",
179 | " y_pred = model.predict(X_val)\n",
180 | " metric = mean_absolute_error(y_true=y_val, y_pred=y_pred)\n",
181 | " mae_metrics.append(metric)"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 9,
187 | "metadata": {},
188 | "outputs": [
189 | {
190 | "data": {
191 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAfgAAAFBCAYAAACb7b3CAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3deXxU1f3/8deHECAQIKwBEjYVUHYI4tYquIuIVOpW0bb2+3Nvtdal2Nal1mKLVrupbdVaRaVacSkuWJXUFZF9D4IESNiFBAIBsnx+f8yEDiHAAJO5k+H9fDzmwcydM3femWg+c+499xxzd0RERCS51As6gIiIiMSeCryIiEgSUoEXERFJQirwIiIiSUgFXkREJAmpwIuIiCSh+kEHiKXWrVt7ly5dYrrPbdu20aRJk5ju83AlWqZEywPKFC1lio4yRUeZDqw28syYMWOju7fZ6wl3T5pbTk6Ox9qUKVNivs/DlWiZEi2PuzJFS5mio0zRUaYDq408wHSvoSbqEL2IiEgSUoEXERFJQirwIiIiSUgFXkREJAmpwIuIiCQhFXgREZEkpAIvIiKShJJqohsREZFE9NqsQsZNzqOwqJSsqR9w+zk9GDkgq1bfUwVeRESkFr02q5AxE+dRWlYBQGFRKWMmzgOo1SKvQ/QiIiK1aNzkvN3FvUppWQXjJufV6vuqBy8iIhJjO8oqmL2qiC+Wb6KwqLTGNqv3sT1W4l7gzSwFmA4UuvvwiO23AeOANu6+MWJ7J2AhcK+7PxTvvCIiIgdSXFrGjBWbmLZ8M1/kb2JuQRFlFQ5A/XpGeaXv9ZoOGWm1mimIHvzNwCKgWdUGM+sInAWsrKH9I8Db8YkmIiJyYOu27GDa8k18kb+Jacs3kbduK+6hYt4nuzlXn9KV47u0ZFCXFuTmbdjjHDxAWmoKt5/To1YzxrXAm1k2cD7wAHBrxFOPAHcAr1drPxL4CtgWr4wiIiKR3J3lG7eFi3moh75y03YAGjdIYWCnFpzXuz3Hd23BgI4tSGuQssfrqwbS7R5Fn5GWlKPoHyVUyJtWbTCzEYQO188xMyK2NwHuJNSzvy3OOUVE5AhVUeksWrNldw/9i/zNbCzZCUDLJg0Y1LkFV53UmeO7tKRnh2akphx4vPrIAVmMHJBFbm4uQ4YMqeWfIMRCS8nG4Y3MhgPD3P0GMxtCqGhfAkwBznb3YjPLBwa5+0YzewiY5u4vmdm9QElN5+DN7BrgGoDMzMycCRMmxDR3SUkJ6enpMd3n4Uq0TImWB5QpWsoUHWWKTl3NtKvCWV5cSd7mCpZsrmTp5gp2hI+mt2pkdG9Zjx4tUujeIoX2TYzIzmht5DlYQ4cOneHug6pvj2eBHwtcCZQDjQidg38b+CawPdwsG1gNDAZeBjqGt2cAlcDd7v6nfb3HoEGDfPr06THNHc9vW9FKtEyJlgeUKVrKFB1lik4iZdpjYplqh8SLS8uYuWIz0/I38cXyTcwtKGZXRSUA3TPTOb5LSwZ3bcnxXVrGfCBcbXxGZlZjgY/bIXp3HwOMCYcZAtzm7qMi20T24AkV/qrt9xLqwe+zuIuIiEDNE8vc/q85vDJjFRu3lbF47ZbdA+J6ZzXne6d0CQ2I69yCFk0aBJw+dnQdvIiIJJWaJpYpq3A+Xvo1Jx/TipvP6MbgLi3p3ymDxg2StwwG8pO5ey6QW8P2Lvtof2+tBhIRkaSxvwlknv+/E+OYJFiaqlZERJJGeUUlDevXXNpqe2KZRKMCLyIiScHduevVeeworyQ1Zc+R7vGYWCbRqMCLiEhS+O3kPF6aXsCPzujGuG/3IyvcY8/KSGPsRX1qfWKZRJO8owtEROSI8dTHy3k8dxnfOaETPz6zG2YW94llEo168CIiUqe9PruQ+yct5Lze7bj/wt6HNRFNMlGBFxGROuu/Szbwk5fmcOJRLXnk0v6k1FNxr6ICLyIi
ddLsVUVcP34G3TKb8terBtEoNeXALzqCqMCLiEids3R9Cd//+zRapzfkH1cfT7NGqUFHSjgq8CIiUqesKS7lu09PI6We8dwPBtO2aaOgIyUkjaIXEZE6o2j7Lr779DSKS8uYcM2JdG7VJOhICUs9eBERqRNKd1Xwg39MJ3/jdv56VQ69s5oHHSmhqQcvIiIJr7yikptemMnMlZv583cGcvLRrYOOlPDUgxcRkYTm7oyZOI/3F6/nlxf2Zlif9kFHqhNU4EVEJKH95p08Xp5RwM1ndOPKEzsHHafOUIEXEZGE9eRHX/HEf5dxxQmduOXMbkHHqVNU4EVEJCG9OquAX725iGF92vFLTUF70FTgRUQk4UzJW8/tL8/lpKNaaQraQ6QCLyIiCWXWys3cMH4m3TOb8tercmhYX1PQHgoVeBERSRhL12/l+898QZumDXnm6uNpqiloD5kKvIiIJIQ1xaVc9dQ06terpyloY0AFXkREAle0fRdXPTWNLTvKeeb7x2sK2hhQgRcRkUBVTUG74mtNQRtLmqpWREQCU1ZRyY3hKWgf0xS0MaUevIiIBMLd+ekr8/hg8Xruv7A352kK2phSgRcRkUA8+PZiXplZwC1ndmO0pqCNORV4ERGJu799+BV/+fArrjyxMzefoSloa0PcC7yZpZjZLDObVG37bWbmZtY6/PgsM5thZvPC/54e76wiIhJ7E2cW8MBboSlo7x3RS1PQ1pIgBtndDCwCmlVtMLOOwFnAyoh2G4EL3H21mfUGJgNZ8QwqIiKxNSVvPXf8ay4nH60paGtbXHvwZpYNnA88We2pR4A7AK/a4O6z3H11+OECoJGZNYxLUBERibmZ4Sloj23flL9cqSloa1u8D9E/SqiQV1ZtMLMRQKG7z9nP60YBs9x9Zy3nExGRWrB0/VaufuYL2jZryN+/N1hT0MaBufuBW8XijcyGA8Pc/QYzGwLcBlwCTAHOdvdiM8sHBrn7xojX9QLeCLdZVsN+rwGuAcjMzMyZMGFCTHOXlJSQnp4e030erkTLlGh5QJmipUzRUabo7CvT16WVPPD5Dsor4ecnNqJt4/j1LRPtc6qNPEOHDp3h7oP2esLd43IDxgIFQD6wFtgOvAKsD2/LB8oJnYdvF35NNrAEOCWa98jJyfFYmzJlSsz3ebgSLVOi5XFXpmgpU3SUKTo1ZdpUstPPeDjXe9/9js8vLEqITEGqjTzAdK+hJsZtkJ27jwHGAFT14N19VGSbyB68mWUAbwJj3P2TeOUUEZHY2L6rnKv/8QUrN23nH98fTK8OmoI2nhL5OvibgGOAX5jZ7PCtbdChRETkwMoqKrnx+ZnMWVXEHy7rz0lHtwo60hEnkLno3T0XyK1he5eI+78CfhW3UCIiEhOVlc6dr8xlSt4Gfv2tPpzbW1PQBiGRe/AiIlIHPfjOYibOLOTWs7rznRM6BR3niKUCLyIiMfPXD5fx1w+/4qqTOvPD048JOs4RTcvFiojIYXltViHjJudRWFQKLKZ/dnPuuUBT0AZNPXgRETlkr80qZMzEeeHiHrJ43Vb+PWf1fl4l8aACLyIih2zc5DxKyyr22LajrJJxk/MCSiRVVOBFROSQrY7ouUezXeJHBV5ERA5JZaXToH7NZaRDRlqc00h1KvAiInJI/vjBUnaWV5KasudgurTUFG4/p0dAqaSKCryIiBy0KXnrefT9JVw0IIvfjupLVrjHnpWRxtiL+jByQFbACUWXyYmIyEFZtWk7t0yYTY/MpjzwrT6kNUjhWwOzyc3NZciQIUHHkzD14EVEJGo7yiq4bvwMKt35y5U5pDVICTqS7IN68CIiEhV35+evzWfB6i089d1BdG7VJOhIsh/qwYuISFRenLaKf80o4EenH8MZx2UGHUcOQAVeREQOaPaqIu59YwGndW/DzWd2DzqOREEFXkRE9uvrkp1cP34GbZs15PeX9SelnuaYrwt0Dl5ERPapvKKSH744i6+37WLi9SeT
0bhB0JEkSurBi4jIPj38nyV8uuxrfjWyN72zmgcdRw6CCryIiNTonflreTx3GZcP7sQlgzoGHUcOkgq8iIjs5asNJdz28hz6ZTfn3hE9g44jh0AFXkRE9rBtZznXPjeDBvXr8djoHBrW12Q2dZEKvIiI7Obu3PnKXJZtKOGPlw/YPce81D0q8CIistvTn+Qzae4abjunB6cc0zroOHIYVOBFRASAacs38eu3FnF2z0yuP+3ooOPIYVKBFxER1m/ZwY0vzKRzy8Y8dEk/zDSZTV2niW5ERI5wu8orueH5mZTsKOf5/zuBZo1Sg44kMaACLyJyhPv1W4uYvmIzf7h8AN0zmwYdR2JEh+hFRI5gr88u5JlP87n6lK6M6Nch6DgSQ3Ev8GaWYmazzGxSte23mZmbWeuIbWPMbKmZ5ZnZOfHOKiKSzBav3cJPX5nH8V1aMGbYsUHHkRgL4hD9zcAioFnVBjPrCJwFrIzY1hO4DOgFdADeM7Pu7l4R37giIslny44yrntuBumN6vPn7wwkNUUHdJNNXH+jZpYNnA88We2pR4A7AI/YdiEwwd13uvtyYCkwOC5BRUSSWGWl85OX5lCwuZTHrhhI22aNgo4ktSDeX9keJVTIK6s2mNkIoNDd51RrmwWsinhcEN4mIiKH4fH/LuM/C9fxs/OP4/guLYOOI7XE3P3ArWLxRmbDgWHufoOZDQFuAy4BpgBnu3uxmeUDg9x9o5n9GfjM3ceHX/8U8Ja7v1Jtv9cA1wBkZmbmTJgwIaa5S0pKSE9Pj+k+D1eiZUq0PKBM0VKm6CRTpvkby3l4+k5OaJ/CtX0bxvR692T6nGpLbeQZOnToDHcftNcT7h6XGzCWUC88H1gLbAdeAdaHt+UD5YTOw7cDxgBjIl4/GThpf++Rk5PjsTZlypSY7/NwJVqmRMvjrkzRUqboJEumVZu2ef/7JvvZv/uvb9tZlhCZaluiZaqNPMB0r6Emxu0QvbuPcfdsd+9CaPDcB+4+yt3bunuX8PYCYKC7rwXeAC4zs4Zm1hXoBkyLV14RkWSyo6yCG56fSXmF88SVOTRuoGlQkl3C/obdfYGZvQQsJNSzv9E1gl5E5JDc9+8FzC0o5q9X5tC1dZOg40gcBFLg3T0XyK1he5dqjx8AHohLKBGRJPXPL1by4rRV3Dj0aM7u1S7oOBInuvBRRCSJzS0o4hevL+Abx7Tm1rN6BB1H4kgFXkQkSW3atovrx8+kTXpD/nD5AFLqaYW4I0nCnoMXEZFDV1Hp3DxhFhu27uTl606iZZMGQUeSOFOBFxFJQo++t4SPvtzI2Iv60K9jRtBxJAA6RC8ikmTeW7iOP36wlEsHdeTywZ2CjiMBUYEXEUki+Ru38eOXZtMnqzn3Xdgr6DgSIBV4EZEksX1XOdeNn0FKPeOxKwbSKDUl6EgSIJ2DFxFJAu7OmInzyFu3lWe+P5iOLRsHHUkCph68iEgSePazFbw+ezW3ntmd07q3CTqOJAAVeBGROm7Gik3cP2khZx7XlhuHHhN0HEkQKvAiInXY+q07uOH5mWS1SOPhS/pTT5PZSJgKvIhIHVVWUclNL8yiuLSMJ0bn0DwtNehIkkA0yE5EpI76zduLmbZ8E49c2o/j2jcLOo4kGBV4EZE65LVZhYybnEdhUSmwnG8e04pvDcgOOpYkIB2iFxGpI16bVciYifPCxT3kixWbeW1WYYCpJFGpwIuI1BHjJudRWlaxx7YdZZWMm5wXUCJJZCrwIiJ1wJri0j167pFW72O7HNl0Dl5EJIHtLK/gqY+X86cPlu6zTYeMtDgmkrpCBV5EJEHl5q3nvn8vZPnGbZzVM5MTu7bkoXeX7HGYPi01hdvP6RFgSklUKvAiIglm5dfb+eWkhby3aB1HtW7CM98/niE92gLQKr3h7lH0WRlp3H5OD0YOyAo4sSQiFXgRkQRRuquCx/+7jCf+u4z69YyfnncsV5/SlQb1/zdcauSALEYOyCI3N5ch
Q4YEF1YSngq8iEjA3J3JC9Zy/6RFFBaVMqJfB+4adhztmjcKOprUYSrwIiIBWrq+hPv+vYCPvtzIse2aMuGaEznxqFZBx5IkoAIvIhKArTvK+OMHS3n64+WkNUjh3gt6MvrEztRP0dXLEhsq8CIiceTuvDa7kF+/tZgNW3dy6aCO3H5uD1qnNww6miQZFXgRkThZsLqYe99YwBf5m+mX3Zy/XTWI/h0zgo4lSSruBd7MUoDpQKG7Dzez+4ELgUpgPfA9d19tZqnAk8DAcM5n3X1svPOKiByuou27ePjdJTz/+QoyGjfgN6P6cHFOR63dLrUqiB78zcAioGptw3Hu/gsAM/sRcDdwHXAx0NDd+5hZY2Chmb3o7vkBZBYROWgVlc5L01fx23cWU1xaxlUndeHHZ3aneWOt2y61L64F3syygfOBB4BbAdx9S0STJoCH7zvQxMzqA2nALiCyrYhIwpq5cjP3vL6AeYXFDO7akvtG9NKa7RJX8e7BPwrcATSN3GhmDwBXAcXA0PDmfxE6dL8GaAz82N03xS+qiMjB27B1J799ZzEvzyggs1lDfn9Zf0b064CZDsdLfJm7H7hVLN7IbDgwzN1vMLMhwG3uPrxamzFAI3e/x8xOAW4Avge0AD4CznP3r6q95hrgGoDMzMycCRMmxDR3SUkJ6enpMd3n4Uq0TImWB5QpWsoUnWgyVVQ6768s59Wlu9hVAed0SeWCo1NJq187hb2ufk7xlmiZaiPP0KFDZ7j7oL2ecPcD3oBfA40jHg8D0iIeNyM0CG5/+xgLFAD5wFpgOzC+WpvOwPzw/T8DV0Y89zRwyf7eIycnx2NtypQpMd/n4Uq0TImWx12ZoqVM0TlQpk+XbvSzf/df73znJB/95FRfun5r4JmCoEwHVht5gOleQ02MdkaFO4HIrxwTgPYRj9OAK/a3A3cf4+7Z7t4FuAz4wN1Hm1m3iGYjgMXh+yuB0y2kCXBixHMiIoFbU1zKTS/M5PK/TWXbrnL+cmUOz149mKPbJE6PUY5c0Z6Dr36MKZbHnB40sx6ELpNbQWgEPYR68H8H5off7+/uPjeG7ysickh2llfw5EehNdor3fnxmd259rSjaJSaEnQ0kd0CmejG3XOB3PD9UftoU0LoUjkRkYQxZfF67vv3AvK/3s45vTL5+fk96diycdCxRPaimexERPbhtVmFu9dez/z0PVqnN2DB6q0c1aYJz149mFO7twk6osg+HUyBv87MSiJe9wMz+zr8uOk+XiMiUie9NquQMRPnUVpWAcC6LTtZt2UnI/q256FL+u+xRrtIIoq2wK8Evh/xeC3wnRraiIgkhQffXry7uEeasbJIxV3qhKgKfHjku4hIUnN3Plv2Nc9NXcHaLTtqbLO6qDTOqUQOTUzOwYcvY7vc3Z+Mxf5EROJpy44yJs4o4LmpK1i2YRstGqeS3rA+JTvL92rbISMtgIQiB++wCryZnQT8ALiU0KVsKvAiUmcsXruF5z5bwauzCtm+q4J+HTN46OJ+DO/bnnfmr93jHDxAWmoKt5/TI8DEItE76AJvZq0IzRv/f8CxwJuEivyk2EYTEYm9XeWVvLNgLeM/W8G0/E00rF+PEf06cOVJnemb/b+12UcOyALYPYo+KyON28/psXu7SKKLusCb2TmEivoFwFTgEeBx4KfuvrB24omIxMaa4lJe+HwlL05bxcaSnXRq2Zi7hh3LxTkdadGkQY2vGTkgi5EDssjNzWXIkCHxDSxymKIq8GaWD+wAngNu9/Ca7Gb2eK0lExE5TO7Op8u+5rnPVvCfReuodOf0Hm0ZfVJnTuvWhnr1tMKbJK9oe/DtgNeB2cCq2osjInL4aho09/++eRRXnNBJs87JESPaAt+R0LKtDwFPm9kEYDwQn7VmRUSisGjNFp6buoLXwoPm+nfM4OGL+3F+3/aaJ16OONFeB78BGAeMM7NvEhpUNyX8+mvN7K/uvqD2YoqI1GxXeSVvz1/D+Kkr
+CJ/8+5Bc1ed1IU+2c2DjicSmIMeRe/uHwEfmdmPCM1mdzXwQzPLc/fjYh1QRKQmq4tKeXHa/wbNdW7VmJ8NO46LB2WT0bjmQXMiR5JDvg7e3bcATwBPmFkfQiPsRURqjbvzydKveW5qPv9ZuA4Hzji2LaNP7MypGjQnsodoR9G/UdtBRET2pbi0jFdmFDD+8xV8tWEbLZs04NrTjuY7gzVoTmRfou3BDwdWEF7DXUQk1iKXZs2a+gG3n9OD7plNdw+aKy0LDZr73SX9GNZHg+ZEDiTaAv8QMBo4Ffg78Iy7F9RaKhE5olRfmrWwqJRbX5pNpUPD+vW4sH8HrjxRg+ZEDka0o+jvMLMxwPmEBtX9zMxygaeA1929rPYiikiyGzc5b6+lWSsdmqfV57+3D9WgOZFDEPWixu5e4e5vuPtIoCuhy+R+BRSaWXptBRSR5LevJVi3lJaruIscoqgLfDVNgAwgHShBE96IyGHIbN6oxu1amlXk0EVd4M0szcy+a2YfAvOAzsB33f0od99WawlFJKlVVjoZaal7bdfSrCKHJ9rL5P5KaM33Lwmddx/h7kW1GUxEjgxPfvwVi9du5ds52Xy27GstzSoSI9GOov8/YCWwBjgPOM9s7wkl3H1E7KKJSLKbs6qI376Tx7m92jHu230xMy3NKhIj0Rb4Z9F5dhGJoS07yvjhi7PIbNaI34wKFXcRiZ1oL5P7Xi3nEJEjiLtz18R5FBaV8tK1J9K88d7n4EXk8BzqKHoRkUP20vRVTJq7hlvP6k5O55ZBxxFJSnEv8GaWYmazzGxS+PH9ZjbXzGab2btm1iGibV8z+8zMFpjZPDOr+VoaEakzvly3lXveWMApx7Ti+tOODjqOSNIKogd/M7Ao4vE4d+/r7v2BScDdAGZWHxgPXOfuvYAhgGbME6nDdpRV8MMXZ9GkQX0euaS/Vn8TqUVxLfBmlk1outsnq7aFl52t0oT/DeY7G5jr7nPC7b529z3nshSROuVXby5k8dqtPHxJP9o20wE5kdp0yOvBH6JHgTuAppEbzewB4CqgGBga3twdcDObDLQBJrj7b+OYVURi6O15axg/dSXXnnoUQ3q0DTqOSNIz9/hc/WZmw4Fh7n6DmQ0BbnP34dXajAEaufs9ZnYbcCNwPLAdeB/4ubu/X+011wDXAGRmZuZMmDAhprlLSkpIT0+sqfYTLVOi5QFlila8Mm3YXsndn5bSvkk97jqhEfX3c2j+SP6cDoYyRSfRMtVGnqFDh85w90F7PeHucbkBY4ECIB9YS6hoj6/WpjMwP3z/MkLL0lY99wvg9v29R05OjsfalClTYr7Pw5VomRItj7syRSsemXaVV/i3/vyx9777HV+xcVtCZDpYyhQdZTqw2sgDTPcaamLczsG7+xh3z3b3LuHi/YG7jzazbhHNRgCLw/cnA33NrHF4wN1pwMJ45RWR2Hj0vSXMXFnEry/qQ6dWjYOOI3LEiPc5+Jo8aGY9gEpgBXAdgLtvNrPfAV8QGnj3lru/GVxMETlYH3+5kcdyl3HZ8R25oF+HA79ARGImkALv7rlAbvj+qP20G0/oUjkRqWM2bN3Jj1+azdFt0rnngl5BxxE54iRCD15EkkxlpfOTl+ewpbSM534wmLQGKUFHEjniaKpaEYm5v330FR8u2cAvhvfk2HbNgo4jckRSgReRmJq9qohxk/M4r3c7rjihU9BxRI5YKvAiEjOhJWBnktmsEQ9epCVgRYKkc/AiEhPuzpiJ81hdtIOXrj1JS8CKBEw9eBGJiX9+sYo3dy8B2yLoOCJHPBV4ETlsX67byr3/XsA3jmmtJWBFEoQKvIgclh1lFdz0wizSG9bnd5f20xKwIglC5+BF5LDcP2kheeu28o+rB9O2qZaAFUkU6sGLyCF7a94anv98JdeedhSndW8TdBwRiaACLyKHZNWm7dz5ylz6dczgtrN7BB1HRKpRgReRg1ZWUcnNE2aBwx8vG0Bqiv6UiCQa
nYMXkYP2yH9CS8D+8fIBWgJWJEHpa7eIHJSPvtzA4/9dxuWDtQSsSCJTgReRqG3YupMf/3MOx7RJ5+7hWgJWJJHpEL2IRKWy0rn1pdls3VHG8/93gpaAFUlw6sGLSFT+9tFXfPTlRu6+oCc92jUNOo6IHIAKvIgc0KyVmxk3OY9hfdrxncFaAlakLlCBF5H9Ki4t44cvziKzWSPGaglYkTpD5+BFZJ/cnbsmzmNN8Q5evu4kmqdpCViRukI9eBHZpwlfrOLNeWv4ydndGdhJS8CK1CUq8CJSoyXrtnJfeAnY607VErAidY0KvIjsJbQE7EwtAStSh+kcvIjs5ZeTFrJkXQnPaglYkTpLPXgR2cObc9fwQngJ2FO1BKxInaUCLyK7rdq0nZ9OnEt/LQErUufFvcCbWYqZzTKzSeHH95vZXDObbWbvmlmHau07mVmJmd0W76wiR5Kyikp+VLUE7OVaAlakrgvi/+CbgUURj8e5e1937w9MAu6u1v4R4O14hRM5Uv3uP0uYtbKIB0f1pWNLLQErUtfFtcCbWTZwPvBk1TZ33xLRpAngEe1HAl8BC+KVUeRI9OGSDTyeu4zLB3fi/L7tg44jIjEQ71H0jwJ3AHusVGFmDwBXAcXA0PC2JsCdwFmADs+L1JINW3dy60tz6J6Zzt3DewYdR0RixNz9wK1i8UZmw4Fh7n6DmQ0BbnP34dXajAEaufs9ZvYQMM3dXzKze4ESd3+ohv1eA1wDkJmZmTNhwoSY5i4pKSE9PT2m+zxciZYp0fKAMh3Ip6vLeGVJGV/vqCS1nlFRCb88JY3spsGfd0+kz6mKMkVHmQ6sNvIMHTp0hrsP2usJd4/LDRgLFAD5wFpgOzC+WpvOwPzw/Y/CbfOBImATcNP+3iMnJ8djbcqUKTHf5+FKtEyJlsddmfbn1ZkFfuzP3/bOd07afTvmrjf91ZkFQUdz98T5nCIpU3SU6cBqIw8w3WuoiXH7uu7uY9w92927AJcBH7j7aDPrFtFsBLA43P6b7t4l3P5R4Nfu/qd45RVJVuMm51FaVrHHtrIKZ9zkvIASiUhtSISZ7B40sx5AJbACuC7gPCJJq6yiksKi0hqfW72P7SJSNwVS4N09F8gN3x8VRft7a0acYCYAAByhSURBVDeRSHJbv2UHL0xbyQufr9xnmw4ZaXFMJCK1LRF68CJSC9yd6Ss2849P83ln/lrKK53Turfhwv7pjJ+6gtKyyt1t01JTuP0czVwnkkxU4EWSzPZd5bw+ezXPfraCRWu20LRRfb57chdGn9iZrq2bANCrQ3PGTc6jsKiUrIw0bj+nByMHZAWcXERiSQVeJEnkb9zGc1NX8PL0VWzZUc6x7Zoy9qI+XNi/A40b7Pm/+sgBWYwckEVubi5DhgwJJrCI1CoVeJE6rLLSyV2ynmc/W0Fu3gbq1zPO7d2Oq07qwvFdWmCmddxFjlQq8CJ1UNH2Xbw8vYDnpq5g5abttGnakFvO7MblgzuR2Uzrt4uICnyd8tqswv+dN536gc6bHoHmFxbz3GcreG12ITvLKxncpSW3n9ODc3q1o0H94GehE5HEoQJfR7w2q5AxE+ftnqCksKiUMRPnAajIJ7ld5ZW8PX8Nz362ghkrNpOWmsJFA7O48sQu9OzQLOh4IpKgVODriLFvL9pr9rHSsgru+/cCenVoxlFt0kmpp/OtyWRt8Q5e+HwFL0xbxcaSnXRp1Zifn38cF+d0pHnj1KDjiUiCU4FPcNt2lvPnKUtZt2Vnjc9v3l7GWY98SOMGKfTq0Iw+WRn0zW5O76zmHNW6CfVU9OsUd+fz5Zt49rN8Ji9YR6U7Q3u05aqTOnNqtzb6fYpI1FTgE5S78/rs1Yx9exHrtuwkLTVlrx48QNumDbnz3GOZV1jM3IIiXpi2gqc/CU1gkt6wPj07NKNvVnP6ZDenT1ZzurRS0U9E23aW8+qsQp79LJ8l60ponpbKD77RldEndKZTq8ZBxxOROkgF
PgHNKyjm3n8vYMaKzfTJas5jVwxk1abSPc7BQ2j2sbuGHcfIAVmMyskGoLyikqUbSphXUBwu+sU8N3UFO8tDRb9pw/r0ympG3+wMemc1p29Wczq3aqzLqQKybEMJz322gldmFLB1Zzm9OjTjt6P6ckG/DqQ1SAk6nojUYSrwCWRjyU7GvZPHSzNW0apJA347qi/fzsmmXj0jp3OozYFmH6ufUo9j2zXj2HbNuHhQRyC0wMiX60qYX1jM3MIi5hUU88wn+eyqCBf9RvXpE9HL75uVQceWaSr6MVL96oefnNWdpmmpPPtZPh99uZHUFGNYn/ZcdVIXBnbK0OcuIjGhAp8Ayioq+cen+fz+vS8pLavgB6d05UdndqNZoz0HUh3q7GOpKfXo2aEZPTs045LjQ0V/V3klS9ZtDRf9YuYXFvP0x8spq3AAmqel7lH0+2Q1J7vFnkVfl+0dWE1XP/zk5Tk40K5ZI35yVncuG9yJNk0bBhtURJKOCnzA/rtkA7/89wKWbdjGqd3bcPfwnhzTNr3W37dB/Xr0zgoNxrssvG1neQVL1pYwr7CYeYVFzC0o5m8ffkV5ZajoZzRO3V3sS8vKeeHzVbsP/euyvZrVtPa6Ay0bp/LRnUNJTdG16yJSO1TgA5K/cRu/enMh7y1aT+dWjXnyqkGccVzbQA/PNqyfEuqxZzcHOgGwo6yCvLVbQ0W/INTb/8uHX1ERLvqRSssqGDc5TwU+wr7WWN+8vUzFXURqlQp8nG3bWc6fpizlqY+Wk5pi3HnusVz9jS40rJ+YA6oapabQr2MG/Tpm7N62o6yC437xDnuX+H0XtCNVZrNGrN2yY6/tWntdRGqbCnycuDuvzS5k7FuLWb91JxcNzOLOc4+tk/OGN0pNoUNGGoU1FHMVrv8pq6gkrcHevXStvS4i8aBjhHEwt6CIUY9/yo//OYd2zRsx8YaT+d0l/etkca9y+zk9SEvd+6jD2T3bBpAmMT349mKWb9zOlSd2Iiv8xScrI42xF/XRaQwRqXXqwdeiDVt3Mm7yYl6eURC67O3bffn2wOykmGimqkBVjaLv0LwRqSnGP6cXcPkJneme2TTghMF6c+4anvp4Od87uQv3jujF/aC110UkrlTga8Gu8tBlb394P3TZ2/99oys/PGPvy97quuqX7a0t3sHwP37Mtc/N4LUbT6F5WnL9vNFaun4rd/xrDgM7ZXDXsOOCjiMiRygdoo+x3Lz1nPv7D3ngrUXkdGnB5B+fys/O75l0xb0m7Zo3Cs+6t51b/zmbyhpG2ie7bTvLuW78TBqlpvDnKwZqCVcRCYz++sRI/sZt/OCZL/je37/AHZ7+3iCe+f5gjm5T+9e0J5LBXVvyi+E9eX/xen7//pdBx4krd+fOV+by1YYS/nj5ANo314BDEQmODtEfppKd5fzpg6U8/XHosrefnncs3z8lcS97i4erTurM3IJifv/+l/TOas5ZPTODjhQXf/8kn0lz13Dnucdy8jGtg44jIkc4FfhDVFnpvDqrkN+8E7rsbdTAbO48twdt6/DI+FgxMx74Vm/y1m3h1n/O5rWbTkn6IxnT8zfx67cWcVbPTK477aig44iI6BD9oZizqohRT3zKT16eQ/uMNF694WQevqSfinuERqkpPDE6h9T69bj2uRmU7CwPOlKtWb91Bzc8P5PsFmk8fEk/LRYjIglBBf4grN+6g9tfnsOFf/6EVZtKeejifrx6/ckM6NQi6GgJKbtFY/50+QC+2lDCbS/NwT35Bt2VV1TywxdmsWVHGY+PzjkiBlOKSN2gQ/T7ELlSWofP3mdQlxZ8sHgDO8sruPbUo7jp9GNoqj/mB3TyMa25a9hx/OrNRTyWu4wbhx4TdKSYGjc5j8+Xb+J3l/TjuPbNgo4jIrJb3Au8maUA04FCdx9uZvcDFwKVwHrge+6+2szOAh4EGgC7gNvd/YN4ZKy+xOfq4h28MWcNx7Vryp+vGMhRSX4+OdZ+8I2uzCko
5qF38+jVoRlDeiTHbHfvzF/DXz78itEnduKigdlBxxER2UMQh+hvBhZFPB7n7n3dvT8wCbg7vH0jcIG79wG+CzwXr4A1LfEJsGVHuYr7ITAzfjOqDz0ym/KjF2ex4uttQUc6bF9tKOG2l+fSr2MGvxjeM+g4IiJ7iWuBN7Ns4Hzgyapt7r4lokkTQstl4+6z3H11ePsCoJGZNYxHzn2tiKaV0g5d4wb1+euVgzAzrn1uBtt31d1Bd9t3lXP9+JmkphiPXTHwiL4kUkQSV7x78I8CdxA6HL+bmT1gZquAK/hfDz7SKGCWu++s/Yj7XhFNK6Udnk6tGvOHyweQt24rd74yr04OunN3xkycx5L1W/nD5QN2LyIjIpJoLF5/ZM1sODDM3W8wsyHAbe4+vFqbMUAjd78nYlsv4A3gbHdfVsN+rwGuAcjMzMyZMGHCYWf9dHUZz8zfxa6IryEN6sH3ejfg5A7BD6wrKSkhPT1xThUcbJ5Jy3bxry/LuLRHA87rWjufZ219Ru+tKGP8ol1c1C2VEUc3SIhMh0OZoqNM0VGmA6uNPEOHDp3h7oP2esLd43IDxgIFQD6wFtgOjK/WpjMwP+JxNrAEOCWa98jJyfFYeXVmgZ889n3vfOckP3ns+/7qzIKY7ftwTZkyJegIezjYPJWVlX7dc9O9608n+SdfbkiITNGYsWKTH3PXm37136d5RUVlQmQ6XMoUHWWKjjIdWG3kAaZ7DTUxbofo3X2Mu2e7exfgMuADdx9tZt0imo0AFgOYWQbwJjDG3T+JV84qIwdk8clPT+eZc5vwyU9P1/rdMWRmjLu4H0e3SeemF2dRsHl70JEOaGPJTm4YP5P2zdP43SX9k2LJXxFJbokw0c2DZjbfzOYCZxMaZQ9wE3AM8Aszmx2+Jcf1VUJ6w/r85cocysoruW78DHbUcNVCoqiodH704iw2b9/F46MH0rxx8KdpREQOJJAC7+65Hj7/7u6j3L23hy6Vu8DdC8Pbf+XuTdy9f8RtfRB5pXYc1SadRy/rz/zCLdz1auIOunv43Tw+XfY194/sTa8OzYOOIyISlUTowcsR7IzjMrnlzG5MnFnIs5+tCDrOXt5dsJbHcpdx+eCOXDKoY9BxRESipgIvgfvR6d0487i23D9pIdOWbwo6zm75G7fxk5fm0CerOfdc0CvoOCIiB0UFXgJXr57xu0v706llY254fgZri3cEHYnSXRVcN34GKeHJbBqlajIbEalbVOAlITRrlMpfrszZXVh3lgc36M7d+dlr88hbt5VHL+1Px5aNA8siInKoVOAlYXTLbMrDl/Rj9qoi7n1jQWA5nv98JRNnFnLzGd2SZmEcETnyqMBLQjm3d3tuGHI0L05bxQufr4z7+89eVcQv/72Q07q34UendzvwC0REEpQKvCScn5zdg1O7t+GeN+Yzc+XmuL3vpm27uGH8DNo0bcijl2oyGxGp21TgJeGk1DP+cFl/2jdP4/rxM1i/tfYH3VVUOjdPmMXGkl08MTqHFk0Obp55EZFEowIvCSmjcQP+cmUOW0rLufH5mewqrzzwiw7D799bwkdfbuS+C3vRJ1uT2YhI3acCLwnruPbN+M23+/JF/mYeeHNhrb3PB4vX8YcPlnJxTjaXHa/JbEQkOdQPOoDI/ozo14F5BUX87aPl9MnO4Ns52THd/8qvt3PLhNn0bN+M+0f2xkzn3UUkOagHLwnvznOP5eSjW3HXq/OYV1Acs/3uKKvg+udnAPDE6BxNZiMiSUUFXhJe/ZR6/PHyAbRJb8i1z03n65KdMdnv3a/PZ8HqLTxyaX86tdJkNiKSXFTgpU5old6QJ0bnsHHbLm56YRblFYc36G7CtJW8NL2AH55+DGcclxmjlCIiiUMFXuqMPtnNGfutPnz21dc8+PbiQ97PvIJi7n5jAd/s1ppbzuwew4QiIolDg+ykThmVk83cgiKe/Hg5fbKbc2H/rIN6fdH2XVz//AxaN2nA7y8bQIom
sxGRJKUevNQ5Px/ek8FdWnLnK3NZuHpL1K+rrHRu+eds1m3ZwWOjc2ipyWxEJImpwEudk5pSjz9dMYDmaalcO346Rdt3RfW6P3zwJbl5G7j7gl7075hRyylFRIKlAi91UtumjXh8dA7rinfywxdnUVHp+22fm7ee37//JRcNyGL0CZ3ilFJEJDgq8FJnDezUgvsu7MVHX27k4Xfz9tlu1abt3PLP2fTIbMoD3+qjyWxE5IigAi912uWDO3H54E48lruMt+et2ev5HWUV3PD8TCoqnMdH55DWQJPZiMiRQQVe6rx7R/RkQKcMfvLyHJas27rHc/f9eyHzCot5+JJ+dG3dJKCEIiLxp8vkpM5rWD+Fx6/IYfgfP+Y7f5tKako91hTvIOPDdynaXsZ1px3N2b3aBR1TRCSu1IOXpNCueSO+c0JHNpbsYk1xaP34ou1l1DPo3lY9dxE58qjAS9J4ZUbhXtsqHR7+z5cBpBERCZYKvCSN1UWlB7VdRCSZxb3Am1mKmc0ys0nhx/eb2Vwzm21m75pZh4i2Y8xsqZnlmdk58c4qdUuHjLSD2i4iksyC6MHfDCyKeDzO3fu6e39gEnA3gJn1BC4DegHnAo+Zma5xkn26/ZwepFVb0z0tNYXbz+kRUCIRkeDEtcCbWTZwPvBk1TZ3j5xMvAlQNSXZhcAEd9/p7suBpcDgeGWVumfkgCzGXtSHrHCPPSsjjbEX9WHkgINbkEZEJBnE+zK5R4E7gKaRG83sAeAqoBgYGt6cBUyNaFYQ3iayTyMHZDFyQBa5ubkMGTIk6DgiIoEx9/3P4R2zNzIbDgxz9xvMbAhwm7sPr9ZmDNDI3e8xsz8Dn7n7+PBzTwFvufsr1V5zDXANQGZmZs6ECRNimrukpIT09PSY7vNwJVqmRMsDyhQtZYqOMkVHmQ6sNvIMHTp0hrsP2usJd4/LDRhLqBeeD6wFtgPjq7XpDMwP3x8DjIl4bjJw0v7eIycnx2NtypQpMd/n4Uq0TImWx12ZoqVM0VGm6CjTgdVGHmC611AT43YO3t3HuHu2u3chNHjuA3cfbWbdIpqNABaH778BXGZmDc2sK9ANmBavvCIiInVZIkxV+6CZ9QAqgRXAdQDuvsDMXgIWAuXAje5eEVxMERGRuiOQAu/uuUBu+P6o/bR7AHggPqlERESSh2ayExERSUIq8CIiIklIBV5ERCQJxe06+Hgwsw2EBurFUmtgY4z3ebgSLVOi5QFlipYyRUeZoqNMB1YbeTq7e5vqG5OqwNcGM5vuNU0gEKBEy5RoeUCZoqVM0VGm6CjTgcUzjw7Ri4iIJCEVeBERkSSkAn9gfw06QA0SLVOi5QFlipYyRUeZoqNMBxa3PDoHLyIikoTUgxcREUlCKvD7YWYpZjbLzCYFnQXAzPLNbJ6ZzTaz6UHnATCzDDP7l5ktNrNFZnZSwHl6hD+fqtsWM7slyEzhXD82swVmNt/MXjSzRgmQ6eZwngVBfUZm9rSZrTez+RHbWprZf8zsy/C/LRIg08Xhz6nSzOI+InsfmcaF/7+ba2avmllGwHnuD2eZbWbvmlmHeOXZV6aI524zMzez1kFnMrN7zaww4m/UsNp6fxX4/bsZWBR0iGqGunv/BLrs4/fAO+5+LNCPgD8vd88Lfz79gRxCyxK/GmQmM8sCfgQMcvfeQAqhFRWDzNQb+H/AYEK/t+HVVnaMl2eAc6tt+ynwvrt3A94PPw4603zgIuDDOGep8gx7Z/oP0Nvd+wJLCC2xHWSece7eN/z/3iTg7jjm2VcmzKwjcBawMs55YB+ZgEeq/k65+1u19eYq8PtgZtnA+cCTQWdJVGbWDDgVeArA3Xe5e1GwqfZwBrDM3WM9+dGhqA+kmVl9oDGwOuA8xwFT3X27u5cD/wW+Fe8Q7v4hsKna5guBf4Tv/wMYGXQmd1/k7nnxzFHt/WvK9G74dwcwFcgOOM+WiIdNgLgO8NrHf0sAjwB3xDsP7DdTXKjA79uj
hP6jqAw6SAQH3jWzGWZ2TdBhgKOADcDfw6cynjSzJkGHinAZ8GLQIdy9EHiIUA9iDVDs7u8Gm4r5wKlm1srMGgPDgI4BZ6qS6e5rAML/tg04T11wNfB20CHM7AEzWwVcQfx78DXlGQEUuvucoLNUc1P4dMbTtXkKSgW+BmY2HFjv7jOCzlLNKe4+EDgPuNHMTg04T31gIPC4uw8AthH/w6k1MrMGwAjg5QTI0oJQr7Qr0AFoYmajg8zk7ouA3xA6zPsOMAco3++LJCGZ2c8I/e6eDzqLu//M3TuGs9wUZJbwF9efkQBfNKp5HDga6E/oC//DtfVGKvA1OwUYYWb5wATgdDMbH2wkcPfV4X/XEzqvPDjYRBQABe7+efjxvwgV/ERwHjDT3dcFHQQ4E1ju7hvcvQyYCJwccCbc/Sl3H+jupxI6jPhl0JnC1plZe4Dwv+sDzpOwzOy7wHDgCk+sa55fAEYFnOFoQl+q54T/lmcDM82sXZCh3H2du1e4eyXwN2rx77gKfA3cfYy7Z7t7F0KHeT9w90B7XGbWxMyaVt0HziZ0mDUw7r4WWGVmPcKbzgAWBhgp0uUkwOH5sJXAiWbW2MyM0OcU+OBNM2sb/rcToQFkifJ5vQF8N3z/u8DrAWZJWGZ2LnAnMMLdtydAnshBmiOAxUFlAXD3ee7e1t27hP+WFwADw3+3AlP15TXsW9Ti3/H6tbVjiblM4NVQfaA+8IK7vxNsJAB+CDwfPiT+FfD9gPNUHZo7C7g26CwA7v65mf0LmEnoUOosEmN2rVfMrBVQBtzo7pvjHcDMXgSGAK3NrAC4B3gQeMnMfkDoy9HFCZBpE/BHoA3wppnNdvdzAs40BmgI/Cf8d2Gqu18XYJ5h4S/7lYRW9YxLlv1lcven4pkhmkzAEDPrT2hMVT61+HdKM9mJiIgkIR2iFxERSUIq8CIiIklIBV5ERCQJqcCLiIgkIRV4ERGRJKQCLyIikoRU4EUOk5k9YzUsKWxmg8JLVHaJf6oj175+HzHcf5fw73VQxLbGZvaOmS0PaFU+kb1oohuRJBdewa4iXlOZmllqeEreOi3azy281sCbQFNC60UEvVKgCKAevEhcWMhSM7ut2vZu4d7gwPBjN7ObzOxNM9tuZiuqL0xjZllmNsHMNodvb0b2Gs3sXjObb2bfM7NlwE5Cy3dWz5RrZk+Y2e8j9jXOzOpFtBltZl+Y2VYzW29mL4fXt696fkg48zAzm2Zmu4BzzOxoM3vdzNaa2TYzmxlexCny/fPN7O5wj3urma0ys0vNLCP885WY2Zdmdna11/UM/8xVmV6sml/czO4lNL3t+eFcbmZDYvm5VcvSgf+tEX+qirskEhV4kTgI9wKfIrSsZ6SrgdnuPjNi232E5mPvT2hK22erDgeHp+GdAuwATgNOIrQi1Xvh56p0Bb5DaJrXfuH2NbmC0N+BkwhNmXkNcEvE8w0ITa/Zj9CiJq2pec763wA/B44FPgfSCS1felb4ta8AE83s2GqvuwWYRmiRopcIrf/+AvBW+Of/EBhvZo3CP3/78Lb5hBbpODP8Xm+Ev5g8FN7Pe0D78O3TWvjcAI4BPiE0x/mZQUz1K7Jf7q6bbrodxg14htAc8yXVbtsJzTfdJdyuHaF5308MP04BCoGbIvblwN+q7f89YHz4/tWEVn2ziOdTgK+BS8KP7w2/T+YBcucCS6rt6+eEVgjc12uODWfMDj8eEn48KorPaSrw84jH+cCLEY/Tw/v6Q8S2LuFtg8KPfwm8X22/LcJtBkf8PiZVaxPLz60q007gUyA16P8GddOtppt68CKx8SGhHmfk7TuRDTy0itUk/teLPxdoxd7reH9Ww+Oe4fs5hHqZW8OHsEuAYkJF7uiI1xR4eKlcM/tmVdvw7YqIdlPdPfIc82dAlpk1C792YPhQ+woz2wpMD7frVC3j9MgHFlr98LdmtjB8OLwEGFTD6+ZGfD5VX4rmRTxftdxv24if/9TInwdY
FX4u8uev7qA/tyi8TugowmVRtheJKw2yE4mN7e6+NHKDmWXU0O5J4AUzu4VQoZ/oB3dotx4wm5qLyqaI+9si7k8n9IWjSlQFzELLEk8mdAThSkLrsrcGPiJ06D7StmqPHyL0BeY2Qj3n7cCzNbyu+mA8r7at6stHvYh/3wzvt7r9/VyH8rkdyG8JfbbPmFmKuz9zEK8VqXUq8CLx9Q6whdBSmhcAw2pocyLwdLXHVevHzyS01v1Gdy+K5g3dvRRYuo+nTzAzi+jFnwisdvctZpZDqKDf5e7LAczsomjeE/gG8Ky7vxJ+XSNCPeUlUb5+X2YClwArfN8j9XcROvxe/XUH9blFw91/a2ZlwFNmVt/dn4zVvkUOlw7Ri8SRu1cQKt5jCZ1/f7+GZheZ2f8Lj7AfA5wBPBp+7nlCPdXXzew0M+tqZqea2cN2aNdfdwAeNbMeZvZt4HbgkfBzKwmdZ77JzI4ys/OB+6Pc7xLgW+FD/H2A8UCjQ8hX3Z+B5sA/zeyEcK4zzeyvZtY03CYf6B3+mVqbWSqx/9x2c/dHgJuBv5hZra3tLXKwVOBF4u9pQoeq/17t/HeVe4FRhM5PXw98392/AHD37cCpwFfAy8BiQiPPWwCHMor7eUK93c+BvxEa6f9I+L02ELrkbCSwkNBo+luj3O+thA7pf0RoNP3U8P3D4qHL0E4BKgkdDVlAqOjvDN8I/xyLCB0+30Do2vRYf27Vc/0JuBF4zMxuONz9icSC1fz3RURqi5mdQOjyqqPcfWW15xy42N3/FYccucB8d7+ptt9LROJP5+BF4sTMGgIdgV8Br1Yv7iIisaRD9CLxczmQR+jSuGgPdYuIHBIdohcREUlC6sGLiIgkIRV4ERGRJKQCLyIikoRU4EVERJKQCryIiEgSUoEXERFJQv8fvYtmdBjFjMIAAAAASUVORK5CYII=\n",
192 | "text/plain": [
193 | ""
194 | ]
195 | },
196 | "metadata": {
197 | "needs_background": "light"
198 | },
199 | "output_type": "display_data"
200 | }
201 | ],
202 | "source": [
203 | "fig, ax = plt.subplots(figsize=(8,5))\n",
204 | "ax.plot(candidates, mae_metrics, \"o-\")\n",
205 | "ax.set_xlabel('Hyper-parameter K', fontsize=14)\n",
206 | "ax.set_ylabel('MAE', fontsize=14)\n",
207 | "ax.set_xticks(candidates)\n",
208 | "ax.grid();"
209 | ]
210 | },
211 | {
212 | "cell_type": "markdown",
213 | "metadata": {},
214 | "source": [
215 | "#### Recalculating train-set split"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 10,
221 | "metadata": {},
222 | "outputs": [],
223 | "source": [
224 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=7)\n",
225 | "scaler = StandardScaler()\n",
226 | "scaler.fit(X_train[numerical_features])\n",
227 | "X_train.loc[:, numerical_features] = scaler.fit_transform(X_train[numerical_features])\n",
228 | "X_test.loc[:, numerical_features] = scaler.transform(X_test[numerical_features])"
229 | ]
230 | },
231 | {
232 | "cell_type": "markdown",
233 | "metadata": {},
234 | "source": [
235 | "#### Optimizing with cross-validation"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": null,
241 | "metadata": {},
242 | "outputs": [],
243 | "source": [
244 | "from sklearn.model_selection import cross_val_score\n",
245 | "candidates = np.arange(4,16)\n",
246 | "mean_mae = []\n",
247 | "std_mae = []\n",
248 | "for k in candidates:\n",
249 | " model = KNeighborsRegressor(n_neighbors=k, weights='distance', metric='minkowski', leaf_size=50, n_jobs=4)\n",
250 | " cv_results = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=10)\n",
251 | " mean_score, std_score = -1*cv_results.mean(), cv_results.std()\n",
252 | " mean_mae.append(mean_score)\n",
253 | " std_mae.append(std_score)"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": null,
259 | "metadata": {
260 | "scrolled": true
261 | },
262 | "outputs": [],
263 | "source": [
264 | "fig, ax = plt.subplots(figsize=(8,5))\n",
265 | "ax.plot(candidates, mean_mae, \"o-\")\n",
266 | "ax.set_xlabel('Hyper-parameter K', fontsize=14)\n",
267 | "ax.set_ylabel('Mean MAE', fontsize=14)\n",
268 | "ax.set_xticks(candidates)\n",
269 | "ax.grid();"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": null,
275 | "metadata": {},
276 | "outputs": [],
277 | "source": [
278 | "fig, ax = plt.subplots(figsize=(8,5))\n",
279 | "ax.plot(candidates, std_mae, \"o-\")\n",
280 | "ax.set_xlabel('Hyper-parameter K', fontsize=14)\n",
281 | "ax.set_ylabel('Standard deviation of MAE', fontsize=14)\n",
282 | "ax.set_xticks(candidates)\n",
283 | "ax.grid();"
284 | ]
285 | },
286 | {
287 | "cell_type": "markdown",
288 | "metadata": {},
289 | "source": [
290 | "# Improving Performance"
291 | ]
292 | },
293 | {
294 | "cell_type": "markdown",
295 | "metadata": {},
296 | "source": [
297 | "## Improving our diamond price predictions"
298 | ]
299 | },
300 | {
301 | "cell_type": "markdown",
302 | "metadata": {},
303 | "source": [
304 | "### Fitting a neural network"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": null,
310 | "metadata": {},
311 | "outputs": [],
312 | "source": [
313 | "from keras.models import Sequential\n",
314 | "from keras.layers import Dense\n",
315 | "\n",
316 | "n_input = X_train.shape[1]\n",
317 | "n_hidden1 = 32\n",
318 | "n_hidden2 = 16\n",
319 | "n_hidden3 = 8\n",
320 | "\n",
321 | "nn_reg = Sequential()\n",
322 | "nn_reg.add(Dense(units=n_hidden1, activation='relu', input_shape=(n_input,)))\n",
323 | "nn_reg.add(Dense(units=n_hidden2, activation='relu'))\n",
324 | "nn_reg.add(Dense(units=n_hidden3, activation='relu'))\n",
325 | "# output layer\n",
326 | "nn_reg.add(Dense(units=1, activation=None))"
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": null,
332 | "metadata": {
333 | "scrolled": true
334 | },
335 | "outputs": [],
336 | "source": [
337 | "batch_size = 32\n",
338 | "n_epochs = 40\n",
339 | "nn_reg.compile(loss='mean_absolute_error', optimizer='adam')\n",
340 | "nn_reg.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_size, validation_split=0.05)"
341 | ]
342 | },
343 | {
344 | "cell_type": "code",
345 | "execution_count": null,
346 | "metadata": {},
347 | "outputs": [],
348 | "source": [
349 | "y_pred = nn_reg.predict(X_test).flatten()\n",
350 | "mae_neural_net = mean_absolute_error(y_test, y_pred)\n",
351 | "print(\"MAE Neural Network: {:0.2f}\".format(mae_neural_net))"
352 | ]
353 | },
354 | {
355 | "cell_type": "markdown",
356 | "metadata": {},
357 | "source": [
358 | "### Transforming the target"
359 | ]
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": null,
364 | "metadata": {},
365 | "outputs": [],
366 | "source": [
367 | "diamonds['price'].hist(bins=25, ec='k', figsize=(8,5))\n",
368 | "plt.title(\"Distribution of diamond prices\", fontsize=16)\n",
369 | "plt.grid(False);"
370 | ]
371 | },
372 | {
373 | "cell_type": "code",
374 | "execution_count": null,
375 | "metadata": {},
376 | "outputs": [],
377 | "source": [
378 | "y_train = np.log(y_train)\n",
379 | "pd.Series(y_train).hist(bins=25, ec='k', figsize=(8,5))\n",
380 | "plt.title(\"Distribution of log diamond prices\", fontsize=16)\n",
381 | "plt.grid(False);"
382 | ]
383 | },
384 | {
385 | "cell_type": "code",
386 | "execution_count": null,
387 | "metadata": {},
388 | "outputs": [],
389 | "source": [
390 | "nn_reg = Sequential()\n",
391 | "nn_reg.add(Dense(units=n_hidden1, activation='relu', input_shape=(n_input,)))\n",
392 | "nn_reg.add(Dense(units=n_hidden2, activation='relu'))\n",
393 | "nn_reg.add(Dense(units=n_hidden3, activation='relu'))\n",
394 | "# output layer\n",
395 | "nn_reg.add(Dense(units=1, activation=None))"
396 | ]
397 | },
398 | {
399 | "cell_type": "code",
400 | "execution_count": null,
401 | "metadata": {
402 | "scrolled": true
403 | },
404 | "outputs": [],
405 | "source": [
406 | "batch_size = 32\n",
407 | "n_epochs = 40\n",
408 | "nn_reg.compile(loss='mean_absolute_error', optimizer='adam')\n",
409 | "nn_reg.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_size, validation_split=0.05)"
410 | ]
411 | },
412 | {
413 | "cell_type": "code",
414 | "execution_count": null,
415 | "metadata": {},
416 | "outputs": [],
417 | "source": [
418 | "y_pred = nn_reg.predict(X_test).flatten()\n",
419 | "y_pred = np.exp(y_pred)\n",
420 | "mae_neural_net2 = mean_absolute_error(y_test, y_pred)\n",
421 | "print(\"MAE Neural Network (modified target): {:0.2f}\".format(mae_neural_net2))"
422 | ]
423 | },
424 | {
425 | "cell_type": "code",
426 | "execution_count": null,
427 | "metadata": {},
428 | "outputs": [],
429 | "source": [
430 | "100*(mae_neural_net - mae_neural_net2)/mae_neural_net2"
431 | ]
432 | },
433 | {
434 | "cell_type": "markdown",
435 | "metadata": {},
436 | "source": [
437 | "#### Analyzing the results"
438 | ]
439 | },
440 | {
441 | "cell_type": "code",
442 | "execution_count": null,
443 | "metadata": {},
444 | "outputs": [],
445 | "source": [
446 | "fig, ax = plt.subplots(figsize=(8,5))\n",
447 | "residuals = y_test - y_pred\n",
448 | "ax.scatter(y_test, residuals, s=3)\n",
449 | "ax.set_title('Residuals vs. Observed Prices', fontsize=16)\n",
450 | "ax.set_xlabel('Observed prices', fontsize=14)\n",
451 | "ax.set_ylabel('Residuals', fontsize=14)\n",
452 | "ax.grid();"
453 | ]
454 | },
455 | {
456 | "cell_type": "code",
457 | "execution_count": null,
458 | "metadata": {},
459 | "outputs": [],
460 | "source": [
461 | "mask_7500 = y_test <=7500\n",
462 | "mae_neural_less_7500 = mean_absolute_error(y_test[mask_7500], y_pred[mask_7500])\n",
463 | "print(\"MAE considering price <= 7500: {:0.2f}\".format(mae_neural_less_7500))"
464 | ]
465 | },
466 | {
467 | "cell_type": "code",
468 | "execution_count": null,
469 | "metadata": {},
470 | "outputs": [],
471 | "source": [
472 | "fig, ax = plt.subplots(figsize=(8,5))\n",
473 | "percent_residuals = (y_test - y_pred)/y_test\n",
474 | "ax.scatter(y_test, percent_residuals, s=3)\n",
475 | "ax.set_title('Percent residuals vs. Observed Prices', fontsize=16)\n",
476 | "ax.set_xlabel('Observed prices', fontsize=14)\n",
477 | "ax.set_ylabel('Percent residuals', fontsize=14)\n",
478 | "ax.axhline(y=0.15, color='r'); ax.axhline(y=-0.15, color='r'); \n",
479 | "ax.grid();"
480 | ]
481 | },
482 | {
483 | "cell_type": "code",
484 | "execution_count": null,
485 | "metadata": {},
486 | "outputs": [],
487 | "source": []
488 | },
489 | {
490 | "cell_type": "code",
491 | "execution_count": null,
492 | "metadata": {},
493 | "outputs": [],
494 | "source": []
495 | },
496 | {
497 | "cell_type": "code",
498 | "execution_count": null,
499 | "metadata": {},
500 | "outputs": [],
501 | "source": []
502 | },
503 | {
504 | "cell_type": "code",
505 | "execution_count": null,
506 | "metadata": {},
507 | "outputs": [],
508 | "source": []
509 | }
510 | ],
511 | "metadata": {
512 | "kernelspec": {
513 | "display_name": "Python 3",
514 | "language": "python",
515 | "name": "python3"
516 | },
517 | "language_info": {
518 | "codemirror_mode": {
519 | "name": "ipython",
520 | "version": 3
521 | },
522 | "file_extension": ".py",
523 | "mimetype": "text/x-python",
524 | "name": "python",
525 | "nbconvert_exporter": "python",
526 | "pygments_lexer": "ipython3",
527 | "version": "3.6.10"
528 | }
529 | },
530 | "nbformat": 4,
531 | "nbformat_minor": 2
532 | }
533 |
--------------------------------------------------------------------------------
/Chapter09/Model/diamond-prices-model.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Hands-On-Predictive-Analytics-with-Python/e049d9d2870f596892b62aeddec312788d9d0c2c/Chapter09/Model/diamond-prices-model.h5
--------------------------------------------------------------------------------
/Chapter09/Model/pca.joblib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Hands-On-Predictive-Analytics-with-Python/e049d9d2870f596892b62aeddec312788d9d0c2c/Chapter09/Model/pca.joblib
--------------------------------------------------------------------------------
/Chapter09/Model/scaler.joblib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Hands-On-Predictive-Analytics-with-Python/e049d9d2870f596892b62aeddec312788d9d0c2c/Chapter09/Model/scaler.joblib
--------------------------------------------------------------------------------
/Chapter09/dash-example-no-user-inputs.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
@author: Alvaro Fuentes
Chapter 9. Hands-On Predictive Analytics with Python
Building a basic static app: a single page showing a histogram of
diamond prices, with no interactive controls.
"""
## imports
import dash
import dash_core_components as dcc
import dash_html_components as html
import plotly.graph_objs as go
import pandas as pd
import os

## Importing the dataset
# BUGFIX: the repository folder is named 'Data' (capital D); the original
# '../data' only worked on case-insensitive file systems (Windows/macOS)
# and failed on Linux.
DATA_DIR = '../Data'
FILE_NAME = 'diamonds.csv'
data_path = os.path.join(DATA_DIR, FILE_NAME)
diamonds = pd.read_csv(data_path)

## Creating the app
app = dash.Dash(__name__)

# Creating a Plotly figure: histogram of the 'price' column
trace = go.Histogram(
    x = diamonds['price']
)

layout = go.Layout(
    title = 'Diamond Prices',
    xaxis = dict(title='Price'),
    yaxis = dict(title='Count')
)

figure = go.Figure(
    data = [trace],
    layout = layout
)

# Static page layout: title, subtitle, a paragraph of text, and the graph
app.layout = html.Div([
    html.H1('My first Dash App'),
    html.H2('Histogram of diamond prices'),
    html.P('This is some normal text, we can use it to describe something about the application.'),
    dcc.Graph(id='my-histogram', figure=figure)
])


if __name__ == '__main__':
    # debug=True enables hot reloading; turn off for production use
    app.run_server(debug=True)
--------------------------------------------------------------------------------
/Chapter09/dash-example-user-inputs.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
@author: Alvaro Fuentes
Chapter 9. Hands-On Predictive Analytics with Python
Building a basic interactive app: two dropdowns choose the x/y variables
of a scatter plot, redrawn by a callback on every change.
"""
## imports
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import plotly.graph_objs as go
import pandas as pd
import os

## Importing the dataset
# BUGFIX: the repository folder is named 'Data' (capital D); the original
# '../data' only worked on case-insensitive file systems and failed on Linux.
DATA_DIR = '../Data'
FILE_NAME = 'diamonds.csv'
data_path = os.path.join(DATA_DIR, FILE_NAME)
diamonds = pd.read_csv(data_path)
# Plot a 2,000-row sample to keep the scatter plot responsive in the browser
diamonds = diamonds.sample(n=2000)


app = dash.Dash(__name__)

# External stylesheet providing the "row"/"six columns" grid classes
app.css.append_css({
    'external_url': 'https://codepen.io/chriddyp/pen/bWLwgP.css'
})

## Dropdown options: one entry per numerical column in the dataset
numerical_features = ['price','carat','depth','table','x','y','z']
options_dropdown = [{'label':x.upper(), 'value':x} for x in numerical_features]

dd_x_var = dcc.Dropdown(
    id='x-var',
    options = options_dropdown,
    value = 'carat'
)

div_x_var = html.Div(
    children=[html.H4('Variable for x axis: '), dd_x_var],
    className="six columns"
)


dd_y_var = dcc.Dropdown(
    id='y-var',
    options = options_dropdown,
    value = 'price'
)

div_y_var = html.Div(
    children=[html.H4('Variable for y axis: '), dd_y_var],
    className="six columns"
)

app.layout = html.Div(children=[
    html.H1('Adding interactive controls'),
    html.H2('Interactive scatter plot example'),
    html.Div(
        children=[div_x_var, div_y_var],
        className="row"
    ),
    dcc.Graph(id='scatter')
])


@app.callback(
    Output(component_id='scatter', component_property='figure'),
    [Input(component_id='x-var', component_property='value'), Input(component_id='y-var', component_property='value')])
def scatter_plot(x_col, y_col):
    '''Rebuild the scatter figure whenever either dropdown value changes.'''
    trace = go.Scatter(
        x = diamonds[x_col],
        y = diamonds[y_col],
        mode = 'markers'
    )

    layout = go.Layout(
        title = 'Scatter plot',
        xaxis = dict(title = x_col.upper()),
        yaxis = dict(title = y_col.upper())
    )

    output_plot = go.Figure(
        data = [trace],
        layout = layout
    )

    return output_plot


if __name__ == '__main__':
    app.run_server(debug=True)
--------------------------------------------------------------------------------
/Chapter09/diamonds-model-training.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
@author: Alvaro Fuentes
Chapter 9. Hands-On Predictive Analytics with Python
Producing the predictive model's objects: trains the diamond-price neural
network and serializes the PCA, the scaler, and the trained model so the
web application (predict-diamond-prices.py) can reload them.
"""
## Imports
import numpy as np
import pandas as pd
import os
from keras.models import Sequential
from keras.layers import Dense
# BUGFIX: sklearn.externals.joblib was deprecated in scikit-learn 0.21 and
# removed in 0.23; import the standalone joblib package instead.
import joblib

## Loading the dataset
# BUGFIX: the repository folder is named 'Data' (capital D); the original
# '../data' only worked on case-insensitive file systems and failed on Linux.
DATA_DIR = '../Data'
FILE_NAME = 'diamonds.csv'
data_path = os.path.join(DATA_DIR, FILE_NAME)
diamonds = pd.read_csv(data_path)


## Preparing the dataset (same cleaning steps as the book's EDA chapters)
# Keep rows where at least one of x, y is positive (drops all-zero dimensions)
diamonds = diamonds.loc[(diamonds['x']>0) | (diamonds['y']>0)]
# Row 11182 has corrupted dimension values; impute with column medians.
# NOTE(review): relies on the original CSV row order/index -- confirm.
diamonds.loc[11182, 'x'] = diamonds['x'].median()
diamonds.loc[11182, 'z'] = diamonds['z'].median()
# Remove the extreme dimension outliers identified during EDA
diamonds = diamonds.loc[~((diamonds['y'] > 30) | (diamonds['z'] > 30))]
# One-hot encode the categorical features, dropping the first level of each
# (the dropped level becomes the all-zeros baseline)
diamonds = pd.concat([diamonds, pd.get_dummies(diamonds['cut'], prefix='cut', drop_first=True)], axis=1)
diamonds = pd.concat([diamonds, pd.get_dummies(diamonds['color'], prefix='color', drop_first=True)], axis=1)
diamonds = pd.concat([diamonds, pd.get_dummies(diamonds['clarity'], prefix='clarity', drop_first=True)], axis=1)

## Dimensionality reduction: collapse x, y, z into a single PCA component
from sklearn.decomposition import PCA
pca = PCA(n_components=1, random_state=123)
diamonds['dim_index'] = pca.fit_transform(diamonds[['x','y','z']])
diamonds.drop(['x','y','z'], axis=1, inplace=True)

## Creating X and y
# The target is log(price): prices are heavily right-skewed, and the book
# showed the log transform improves the network's MAE.
X = diamonds.drop(['cut','color','clarity','price'], axis=1)
y = np.log(diamonds['price'])

## Standardization: centering and scaling of the numerical features
numerical_features = ['carat', 'depth', 'table', 'dim_index']
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X.loc[:, numerical_features] = scaler.fit_transform(X[numerical_features])

## Building the neural network: 3 ReLU hidden layers (32 -> 16 -> 8)
n_input = X.shape[1]
n_hidden1 = 32
n_hidden2 = 16
n_hidden3 = 8

nn_reg = Sequential()
nn_reg.add(Dense(units=n_hidden1, activation='relu', input_shape=(n_input,)))
nn_reg.add(Dense(units=n_hidden2, activation='relu'))
nn_reg.add(Dense(units=n_hidden3, activation='relu'))
# output layer: a single linear unit for regression
nn_reg.add(Dense(units=1, activation=None))

## Training the neural network on the full dataset (deployment model)
batch_size = 32
n_epochs = 40
nn_reg.compile(loss='mean_absolute_error', optimizer='adam')
nn_reg.fit(X, y, epochs=n_epochs, batch_size=batch_size)

## Serializing the fitted objects for the web application:
# PCA
joblib.dump(pca, './Model/pca.joblib')

# Scaler
joblib.dump(scaler, './Model/scaler.joblib')

# Trained model
nn_reg.save("./Model/diamond-prices-model.h5")
--------------------------------------------------------------------------------
/Chapter09/predict-diamond-prices.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
@author: Alvaro Fuentes
Chapter 9. Hands-On Predictive Analytics with Python
Building the web application: loads the serialized model objects and
serves an interactive page that predicts diamond prices.
"""

import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output

from keras.models import load_model
# BUGFIX: sklearn.externals.joblib was deprecated in scikit-learn 0.21 and
# removed in 0.23; import the standalone joblib package instead.
import joblib

import numpy as np
import pandas as pd

app = dash.Dash(__name__)
# External stylesheet providing the "row"/"four columns" grid classes
app.css.append_css({
    'external_url': 'https://codepen.io/chriddyp/pen/bWLwgP.css'
})

## Deserialize the objects produced by diamonds-model-training.py
model = load_model('./Model/diamond-prices-model.h5')
pca = joblib.load('./Model/pca.joblib')
scaler = joblib.load('./Model/scaler.joblib')
# Finalize the Keras predict function before the server starts handling
# requests (avoids graph issues when Flask serves from worker threads).
model._make_predict_function()

# BUGFIX: dcc.Input's `type` maps to the HTML input `type` attribute; the
# valid value is 'number' -- the original 'numeric' is not a recognized
# type and silently fell back to a plain text box.

## Div for carat
input_carat = dcc.Input(
    id='carat',
    type='number',
    value=0.7)

div_carat = html.Div(
    children=[html.H3('Carat:'), input_carat],
    className="four columns"
)

## Div for depth
input_depth = dcc.Input(
    id='depth',
    placeholder='',
    type='number',
    value=60)

div_depth = html.Div(
    children=[html.H3('Depth:'), input_depth],
    className="four columns"
)

## Div for table
input_table = dcc.Input(
    id='table',
    placeholder='',
    type='number',
    value=60)

div_table = html.Div(
    children=[html.H3('Table:'), input_table],
    className="four columns"
)

## Div for x
input_x = dcc.Input(
    id='x',
    placeholder='',
    type='number',
    value=5)

div_x = html.Div(
    children=[html.H3('x value:'), input_x],
    className="four columns"
)

## Div for y
input_y = dcc.Input(
    id='y',
    placeholder='',
    type='number',
    value=5)

div_y = html.Div(
    children=[html.H3('y value:'), input_y],
    className="four columns"
)

## Div for z
input_z = dcc.Input(
    id='z',
    placeholder='',
    type='number',
    value=3)

div_z = html.Div(
    children=[html.H3('z value: '), input_z],
    className="four columns"
)
99 |
## Dropdown selectors for the categorical features. The value lists must
## match the category levels present in the training data; the first entry
## of each list is the baseline level dropped during one-hot encoding.
cut_values = ['Fair', 'Good', 'Ideal', 'Premium', 'Very Good']
color_values = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_values = ['I1', 'IF', 'SI1', 'SI2', 'VS1', 'VS2', 'VVS1', 'VVS2']

def _dropdown_div(title, element_id, values, default):
    '''Build a four-column div containing a labelled dropdown.'''
    selector = dcc.Dropdown(
        id=element_id,
        options=[{'label': v, 'value': v} for v in values],
        value=default
    )
    return html.Div(
        children=[html.H3(title), selector],
        className="four columns"
    )

div_cut = _dropdown_div('Cut:', 'cut', cut_values, 'Ideal')
div_color = _dropdown_div('Color:', 'color', color_values, 'G')
div_clarity = _dropdown_div('Clarity:', 'clarity', clarity_values, 'SI1')

## Row of numerical characteristics
div_numerical = html.Div(
    children=[div_carat, div_depth, div_table],
    className="row"
)

## Row of dimension inputs
div_dimensions = html.Div(
    children=[div_x, div_y, div_z],
    className="row"
)

## Row of categorical dropdowns
div_categorical = html.Div(
    children=[div_cut, div_color, div_clarity],
    className="row"
)
159 |
def get_prediction(carat, depth, table, x, y, z, cut, color, clarity):
    '''takes the inputs from the user and produces the price prediction

    Applies the same preprocessing used at training time (PCA on the
    dimensions, one-hot encoding with dropped baselines, standard scaling),
    runs the neural network, and converts the predicted log-price back to
    a price. Returns the prediction truncated to an int (dollars).
    Uses the module-level `model`, `pca`, `scaler`, and `*_values` objects.
    '''

    # Column order must match the training design matrix exactly; the
    # network receives raw values, not named columns.
    cols = ['carat', 'depth', 'table',
            'cut_Good', 'cut_Ideal', 'cut_Premium', 'cut_Very Good',
            'color_E', 'color_F', 'color_G', 'color_H', 'color_I', 'color_J',
            'clarity_IF','clarity_SI1', 'clarity_SI2', 'clarity_VS1', 'clarity_VS2','clarity_VVS1', 'clarity_VVS2',
            'dim_index']

    # Map each non-baseline category level to its dummy-column name.
    # (The comprehension variable `x` has its own scope and does not clobber
    # the `x` dimension parameter.)
    cut_dict = {x: 'cut_' + x for x in cut_values[1:]}
    color_dict = {x: 'color_' + x for x in color_values[1:]}
    clarity_dict = {x: 'clarity_' + x for x in clarity_values[1:]}

    ## produce a dataframe with a single row of zeros
    df = pd.DataFrame(data = np.zeros((1,len(cols))), columns = cols)

    ## get the numeric characteristics
    df.loc[0,'carat'] = carat
    df.loc[0,'depth'] = depth
    df.loc[0,'table'] = table

    ## transform dimensions into a single dim_index using PCA
    dims_df = pd.DataFrame(data=[[x, y, z]], columns=['x','y','z'])
    df.loc[0,'dim_index'] = pca.transform(dims_df).flatten()[0]

    ## Use the one-hot encoding for the categorical features
    # The baseline levels ('Fair', 'D', 'I1') were dropped during training,
    # so they are represented by all-zero dummies and need no assignment.
    if cut!='Fair':
        df.loc[0, cut_dict[cut]] = 1

    if color!='D':
        df.loc[0, color_dict[color]] = 1

    if clarity != 'I1':
        df.loc[0, clarity_dict[clarity]] = 1

    ## Scale the numerical features using the trained scaler
    numerical_features = ['carat', 'depth', 'table', 'dim_index']
    df.loc[:,numerical_features] = scaler.transform(df.loc[:,numerical_features])

    ## Get the predictions using our trained neural network
    prediction = model.predict(df.values).flatten()[0]

    ## Transform the log-prices to prices (the network was trained on log(price))
    prediction = np.exp(prediction)

    return int(prediction)
206 |
## App layout: heading, the three input rows, and the prediction output
# NOTE(review): the 'IDR ' prefix in the H1 title looks accidental -- confirm.
app.layout = html.Div([
    html.H1('IDR Predict diamond prices'),

    html.H2('Enter the diamond characteristics to get the predicted price'),

    html.Div(
        children=[div_numerical, div_dimensions, div_categorical]
    ),
    # Prediction text, filled in by the show_prediction callback below
    html.H1(id='output',
        style={'margin-top': '50px', 'text-align': 'center'})
])

# The order of `predictors` must match show_prediction's parameter order:
# Dash passes callback inputs positionally.
predictors = ['carat', 'depth', 'table', 'x', 'y', 'z', 'cut', 'color', 'clarity']
@app.callback(
    Output('output', 'children'),
    [Input(x, 'value') for x in predictors])
def show_prediction(carat, depth, table, x, y, z, cut, color, clarity):
    '''Recompute and display the price whenever any input value changes.'''
    pred = get_prediction(carat, depth, table, x, y, z, cut, color, clarity)
    return str("Predicted Price: {:,}".format(pred))


if __name__ == '__main__':
    app.run_server(debug=True)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Packt
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | # Hands-On Predictive Analytics with Python
5 |
6 |
7 |
8 | This is the code repository for [Hands-On Predictive Analytics with Python](https://www.packtpub.com/big-data-and-business-intelligence/hands-predictive-analytics-python?utm_source=github&utm_medium=repository&utm_campaign=9781789138719), published by Packt.
9 |
10 | **Master the complete predictive analytics process, from problem definition to model deployment**
11 |
12 | ## What is this book about?
13 | This book will teach you all the processes you need to build a predictive analytics solution: understanding the problem, preparing datasets, exploring relationships, model building, tuning, evaluation, and deployment. You'll learn to use Python and its data analytics ecosystem to implement the main techniques used in real-world projects.
14 |
15 | This book covers the following exciting features:
16 | * Get to grips with the main concepts and principles of predictive analytics
17 | * Learn about the stages involved in producing complete predictive analytics solutions
18 | * Understand how to define a problem, propose a solution, and prepare a dataset
19 | * Use visualizations to explore relationships and gain insights into the dataset
20 | * Learn to build regression and classification models using scikit-learn
21 | * Use Keras to build powerful neural network models that produce accurate predictions
22 | * Learn to serve a model's predictions as a web application
23 |
24 | If you feel this book is for you, get your [copy](https://www.amazon.com/dp/178913871X) today!
25 |
26 |
28 |
29 | ## Instructions and Navigations
30 |
31 | ### Installation
32 | To be able to run the code of the book without any problems, please do the following:
33 | 1. Download the Anaconda distribution for your system, you can find the installers [here](https://www.anaconda.com)
34 | 1. Once you have installed the Anaconda distribution, create a new Python 3.6 environment with the packages you will need.
35 | To create the environment (named `ho-pawp`, but you can use any other name you like) run the following command
36 | in the Anaconda Prompt terminal `conda create --name ho-pawp --file requirements.txt `
37 |
38 | For a quick guide on conda refer to the conda-cheatsheet.pdf in this repo.
39 | ### Using the code files
40 |
41 | All of the code is organized into folders. Most of the code consists of Jupyter Notebooks. For example, Chapter02.
42 |
43 | The code will look like the following:
44 | ```
45 | carat_values = np.arange(0.5, 5.5, 0.5)
46 | preds = first_ml_model(carat_values)
47 | pd.DataFrame({"Carat": carat_values, "Predicted price":preds})
48 | ```
49 |
50 | **Following is what you need for this book:**
51 | This book is aimed at data scientists, data engineers, software engineers, and business analysts. Also, students and professionals who are constantly working with data in quantitative fields such as finance, economics, and business, among others, who would like to build models to make predictions will find this book useful. In general, this book is aimed at all professionals who would like to focus on the practical implementation of predictive analytics with Python.
52 |
53 | With the following software and hardware list you can run all code files present in the book (Chapter 1-13).
54 | ### Software and Hardware List
55 | | Chapter | Software required | OS required |
56 | | ------- | ------------------------------------ | ----------------------------------- |
57 | | 1-9 | Python 3.6 or higher, Jupyter Notebook, Recent versions of the following Python libraries: NumPy, pandas, and matplotlib, Seaborn, scikit-learn, Recent installations of TensorFlow and Keras, Basic libraries for Dash | Windows, Mac OS X, and Linux (Any) |
58 |
59 | We also provide a PDF file that has color images of the screenshots/diagrams used in this book. [Click here to download it]().
60 |
61 | ### Related products
62 | * TensorFlow: Powerful Predictive Analytics with TensorFlow [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/tensorflow-powerful-predictive-analytics-tensorflow?utm_source=github&utm_medium=repository&utm_campaign=9781789136913) [[Amazon]](https://www.amazon.com/dp/1789136911)
63 |
64 | * Building Machine Learning Systems with Python - Third Edition [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/building-machine-learning-systems-python-third-edition?utm_source=github&utm_medium=repository&utm_campaign=9781788623223) [[Amazon]](https://www.amazon.com/dp/1788623223)
65 |
66 |
67 | ## Get to Know the Author
68 | **Alvaro Fuentes** is a Senior Data Scientist with more than 13 years of experience in analytical roles.
69 | He holds an M.S. in applied mathematics and an M.S. in quantitative economics. He has been working for one of the top global
70 | management consulting firms solving analytical and AI problems in different industries like Banking, Telco, Mining and others.
71 | He worked for many years in the Central Bank of Guatemala as an economic analyst, building models for economic and financial data.
72 | He is a big Python fan and has been using it routinely for 5+ years to analyze data and to build and deploy analytical models that transform data into intelligence.
73 |
74 |
75 | ### Suggestions and Feedback
76 | [Click here](https://docs.google.com/forms/d/e/1FAIpQLSdy7dATC6QmEL81FIUuymZ0Wy9vH1jHkvpY57OiMeKGqib_Ow/viewform) if you have any feedback or suggestions.
77 |
78 |
79 | ### Download a free PDF
80 |
81 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.
82 | https://packt.link/free-ebook/9781789138719
--------------------------------------------------------------------------------
/conda-cheatsheet.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Hands-On-Predictive-Analytics-with-Python/e049d9d2870f596892b62aeddec312788d9d0c2c/conda-cheatsheet.pdf
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # This file may be used to create an environment using:
2 | # $ conda create --name <env> --file <this file>
3 | # platform: win-64
4 | _tflow_select=2.2.0=eigen
5 | absl-py=0.9.0=py36_0
6 | astor=0.8.0=py36_0
7 | attrs=19.3.0=py_0
8 | backcall=0.2.0=py_0
9 | blas=1.0=mkl
10 | bleach=3.1.5=py_0
11 | blinker=1.4=py36_0
12 | brotli=1.0.7=pypi_0
13 | brotlipy=0.7.0=py36he774522_1000
14 | ca-certificates=2020.6.24=0
15 | cachetools=4.1.0=py_1
16 | certifi=2020.6.20=py36_0
17 | cffi=1.14.0=py36h7a1dbc1_0
18 | chardet=3.0.4=py36_1003
19 | click=7.1.2=py_0
20 | colorama=0.4.3=py_0
21 | cryptography=2.9.2=py36h7a1dbc1_0
22 | cycler=0.10.0=py36h009560c_0
23 | dash=0.28.5=pypi_0
24 | dash-core-components=0.35.0=pypi_0
25 | dash-html-components=0.13.2=pypi_0
26 | dash-renderer=0.15.0=pypi_0
27 | decorator=4.4.2=py_0
28 | defusedxml=0.6.0=py_0
29 | entrypoints=0.3=py36_0
30 | flask=1.1.2=pypi_0
31 | flask-compress=1.5.0=pypi_0
32 | freetype=2.10.2=hd328e21_0
33 | gast=0.2.2=py36_0
34 | google-auth=1.17.2=py_0
35 | google-auth-oauthlib=0.4.1=py_2
36 | google-pasta=0.2.0=py_0
37 | graphviz=2.38=hfd603c8_2
38 | grpcio=1.27.2=py36h351948d_0
39 | h5py=2.10.0=py36h5e291fa_0
40 | hdf5=1.10.4=h7ebc959_0
41 | icc_rt=2019.0.0=h0cc432a_1
42 | icu=58.2=ha925a31_3
43 | idna=2.10=py_0
44 | importlib-metadata=1.7.0=py36_0
45 | importlib_metadata=1.7.0=0
46 | intel-openmp=2020.1=216
47 | ipykernel=5.3.2=py36h5ca1d4c_0
48 | ipython=7.16.1=py36h5ca1d4c_0
49 | ipython_genutils=0.2.0=py36_0
50 | ipywidgets=7.5.1=py_0
51 | itsdangerous=1.1.0=pypi_0
52 | jedi=0.17.1=py36_0
53 | jinja2=2.11.2=py_0
54 | joblib=0.16.0=py_0
55 | jpeg=9b=hb83a4c4_2
56 | jsonschema=3.2.0=py36_0
57 | jupyter=1.0.0=py36_7
58 | jupyter_client=6.1.6=py_0
59 | jupyter_console=6.1.0=py_0
60 | jupyter_core=4.6.3=py36_0
61 | keras=2.3.1=0
62 | keras-applications=1.0.8=py_1
63 | keras-base=2.3.1=py36_0
64 | keras-preprocessing=1.1.0=py_1
65 | kiwisolver=1.2.0=py36h74a9793_0
66 | libpng=1.6.37=h2a8f88b_0
67 | libprotobuf=3.12.3=h7bd577a_0
68 | libsodium=1.0.18=h62dcd97_0
69 | m2w64-gcc-libgfortran=5.3.0=6
70 | m2w64-gcc-libs=5.3.0=7
71 | m2w64-gcc-libs-core=5.3.0=7
72 | m2w64-gmp=6.1.0=2
73 | m2w64-libwinpthread-git=5.0.0.4634.697f757=2
74 | markdown=3.1.1=py36_0
75 | markupsafe=1.1.1=py36he774522_0
76 | matplotlib=3.2.2=0
77 | matplotlib-base=3.2.2=py36h64f37c6_0
78 | mistune=0.8.4=py36he774522_0
79 | mkl=2020.1=216
80 | mkl-service=2.3.0=py36hb782905_0
81 | mkl_fft=1.1.0=py36h45dec08_0
82 | mkl_random=1.1.1=py36h47e9c7a_0
83 | msys2-conda-epoch=20160418=1
84 | nbconvert=5.6.1=py36_0
85 | nbformat=5.0.7=py_0
86 | notebook=6.0.3=py36_0
87 | numpy=1.18.5=py36h6530119_0
88 | numpy-base=1.18.5=py36hc3f5095_0
89 | oauthlib=3.1.0=py_0
90 | openssl=1.1.1g=he774522_0
91 | opt_einsum=3.1.0=py_0
92 | packaging=20.4=py_0
93 | pandas=1.0.5=py36h47e9c7a_0
94 | pandoc=2.10=0
95 | pandocfilters=1.4.2=py36_1
96 | parso=0.7.0=py_0
97 | pickleshare=0.7.5=py36_0
98 | pip=20.1.1=py36_1
99 | plotly=4.9.0=pypi_0
100 | prometheus_client=0.8.0=py_0
101 | prompt-toolkit=3.0.5=py_0
102 | prompt_toolkit=3.0.5=0
103 | protobuf=3.12.3=py36h33f27b4_0
104 | pyasn1=0.4.8=py_0
105 | pyasn1-modules=0.2.7=py_0
106 | pycparser=2.20=py_2
107 | pydotplus=2.0.2=pypi_0
108 | pygments=2.6.1=py_0
109 | pyjwt=1.7.1=py36_0
110 | pyopenssl=19.1.0=py_1
111 | pyparsing=2.4.7=py_0
112 | pyqt=5.9.2=py36h6538335_2
113 | pyreadline=2.1=py36_1
114 | pyrsistent=0.16.0=py36he774522_0
115 | pysocks=1.7.1=py36_0
116 | python=3.6.10=h9f7ef89_2
117 | python-dateutil=2.8.1=py_0
118 | pytz=2020.1=py_0
119 | pywin32=227=py36he774522_1
120 | pywinpty=0.5.7=py36_0
121 | pyyaml=5.3.1=py36he774522_1
122 | pyzmq=19.0.1=py36ha925a31_1
123 | qt=5.9.7=vc14h73c81de_0
124 | qtconsole=4.7.5=py_0
125 | qtpy=1.9.0=py_0
126 | requests=2.24.0=py_0
127 | requests-oauthlib=1.3.0=py_0
128 | retrying=1.3.3=pypi_0
129 | rsa=4.0=py_0
130 | scikit-learn=0.22=py36h6288b17_0
131 | scipy=1.5.0=py36h9439919_0
132 | seaborn=0.10.1=py_0
133 | send2trash=1.5.0=py36_0
134 | setuptools=49.2.0=py36_0
135 | sip=4.19.8=py36h6538335_0
136 | six=1.15.0=py_0
137 | sqlite=3.32.3=h2a8f88b_0
138 | tensorboard=2.2.1=pyh532a8cf_0
139 | tensorboard-plugin-wit=1.6.0=py_0
140 | tensorflow=2.1.0=eigen_py36hdbbabfe_0
141 | tensorflow-base=2.1.0=eigen_py36h49b2757_0
142 | tensorflow-estimator=2.1.0=pyhd54b08b_0
143 | termcolor=1.1.0=py36_1
144 | terminado=0.8.3=py36_0
145 | testpath=0.4.4=py_0
146 | threadpoolctl=2.1.0=pyh5ca1d4c_0
147 | tornado=6.0.4=py36he774522_1
148 | traitlets=4.3.3=py36_0
149 | urllib3=1.25.9=py_0
150 | vc=14.1=h0510ff6_4
151 | vs2015_runtime=14.16.27012=hf0eaf9b_3
152 | wcwidth=0.2.5=py_0
153 | webencodings=0.5.1=py36_1
154 | werkzeug=0.16.1=py_0
155 | wheel=0.34.2=py36_0
156 | widgetsnbextension=3.5.1=py36_0
157 | win_inet_pton=1.1.0=py36_0
158 | wincertstore=0.2=py36h7fe50ca_0
159 | winpty=0.4.3=4
160 | wrapt=1.12.1=py36he774522_1
161 | yaml=0.2.5=he774522_0
162 | zeromq=4.3.2=ha925a31_2
163 | zipp=3.1.0=py_0
164 | zlib=1.2.11=h62dcd97_4
165 |
--------------------------------------------------------------------------------