├── .Module-01b-Explore-Regression.ipynb.icloud
├── .gitignore
├── Exercise.ipynb
├── LICENSE
├── Module-00-Installation-check.ipynb
├── Module-01a-Frame-Regression.ipynb
├── Module-01c-Model-Regression.ipynb
├── Module-02a-Intuition-Logistic.ipynb
├── Module-02b-Explore-Logistic.ipynb
├── Module-02c-Model-Logistic.ipynb
├── Module-03a-Intuition-Trees.ipynb
├── Module-03b-Model-Trees.ipynb
├── Module-03c-Model-Evaluation.ipynb
├── Module-03d-Model-Bagging.ipynb
├── Module-03e-Model-RandomForest.ipynb
├── Module-03f-Model-Boosting.ipynb
├── Module-03g-Model-HyperParameterOpt.ipynb
├── Module-04a-Regression-Basic.ipynb
├── Module-05a-ML-Pipeline.ipynb
├── README.md
├── curriculum.md
├── data
├── cars_small.csv
├── creditRisk-tree.xlsx
├── creditRisk.csv
├── historical_loan.csv
├── housing_test.csv
├── housing_train.csv
├── loan_data.csv
└── loan_data_clean.csv
├── environment.yml
├── img
├── bias_variance.png
├── confusion_matrix.jpg
├── confusion_matrix2.png
├── cross_validation.png
├── generalisation_error.png
├── linear_models.png
├── logistic-curve.png
├── logistic_regression.png
├── model_complexity.png
├── model_complexity_error.png
├── model_selection.png
├── overfitting.png
├── precision_recall.png
├── random_forest.png
├── regression_error.png
├── regularization.png
├── roc-curves.png
├── simple_complex.png
├── tree_titanic.png
└── validation.png
├── installation.md
├── modelvis-local.py
├── outline.md
├── outline.pdf
├── pre-requisites.md
├── reference
├── .Module-01b-reference.ipynb.icloud
├── Module-01a-reference.ipynb
├── Module-01c-reference.ipynb
├── Module-02b-reference.ipynb
└── Module-02c-reference.ipynb
├── schedule.md
├── tree.dot
├── tree2.dot
└── tree_3.dot
/.Module-01b-Explore-Regression.ipynb.icloud:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amitkaps/applied-machine-learning/9752d1cd19a530b36c81632c291a6607ab3049d3/.Module-01b-Explore-Regression.ipynb.icloud
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints/
2 | __pycache__/
3 | Credit Risk Modeling - creating train and test.ipynb
4 | data/videoGames.csv
5 |
6 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Amit Kapoor
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Module-00-Installation-check.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Installation Check\n",
8 | "\n",
  9 |     "This notebook is just to check we have all the libraries installed and ready to go"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import importlib"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 2,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "working_libs = [\"jupyterlab\"]\n",
28 | "basic_libs = [\"numpy\", \"pandas\"]\n",
29 | "ml_libs = [\"sklearn\", \"joblib\"]\n",
30 | "vis_libs = [\"matplotlib\", \"seaborn\", \"altair\", \"plotnine\", \"modelvis\", \"yellowbrick\"]"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 3,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "libs = working_libs + basic_libs + ml_libs + vis_libs"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 4,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "def get_version(libs):\n",
49 | " for lib in libs:\n",
50 | " module = importlib.import_module(lib)\n",
51 | " ver = getattr(module, \"__version__\")\n",
52 | " print(ver, \"\\t \", lib)"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 5,
58 | "metadata": {},
59 | "outputs": [
60 | {
61 | "name": "stdout",
62 | "output_type": "stream",
63 | "text": [
64 | "0.35.4 \t jupyterlab\n",
65 | "1.15.4 \t numpy\n",
66 | "0.23.4 \t pandas\n",
67 | "0.20.0 \t sklearn\n",
68 | "0.13.0 \t joblib\n",
69 | "3.0.2 \t matplotlib\n",
70 | "0.9.0 \t seaborn\n",
71 | "2.2.2 \t altair\n",
72 | "0.5.1 \t plotnine\n",
73 | "0.1.6 \t modelvis\n"
74 | ]
75 | }
76 | ],
77 | "source": [
78 | "get_version(libs)"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 6,
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "lib_rqmt = []\n",
88 | "def requirements(libs):\n",
89 | " for lib in libs:\n",
90 | " module = importlib.import_module(lib)\n",
91 | " ver = getattr(module, \"__version__\")\n",
92 | " lib_ver = \"- \"+lib+\">=\"+ver\n",
93 | " lib_rqmt.append(lib_ver)\n",
94 | " print(lib_ver)"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 7,
100 | "metadata": {},
101 | "outputs": [
102 | {
103 | "name": "stdout",
104 | "output_type": "stream",
105 | "text": [
106 | "- jupyterlab>=0.35.4\n",
107 | "- numpy>=1.15.4\n",
108 | "- pandas>=0.23.4\n",
109 | "- sklearn>=0.20.0\n",
110 | "- joblib>=0.13.0\n",
111 | "- matplotlib>=3.0.2\n",
112 | "- seaborn>=0.9.0\n",
113 | "- altair>=2.2.2\n",
114 | "- plotnine>=0.5.1\n",
115 | "- modelvis>=0.1.6\n"
116 | ]
117 | }
118 | ],
119 | "source": [
120 | "requirements(libs)"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": 8,
126 | "metadata": {},
127 | "outputs": [
128 | {
129 | "data": {
130 | "text/plain": [
131 | "['- jupyterlab>=0.35.4',\n",
132 | " '- numpy>=1.15.4',\n",
133 | " '- pandas>=0.23.4',\n",
134 | " '- sklearn>=0.20.0',\n",
135 | " '- joblib>=0.13.0',\n",
136 | " '- matplotlib>=3.0.2',\n",
137 | " '- seaborn>=0.9.0',\n",
138 | " '- altair>=2.2.2',\n",
139 | " '- plotnine>=0.5.1',\n",
140 | " '- modelvis>=0.1.6']"
141 | ]
142 | },
143 | "execution_count": 8,
144 | "metadata": {},
145 | "output_type": "execute_result"
146 | }
147 | ],
148 | "source": [
149 | "lib_rqmt"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "metadata": {},
156 | "outputs": [],
157 | "source": []
158 | }
159 | ],
160 | "metadata": {
161 | "kernelspec": {
162 | "display_name": "Python 3",
163 | "language": "python",
164 | "name": "python3"
165 | },
166 | "language_info": {
167 | "codemirror_mode": {
168 | "name": "ipython",
169 | "version": 3
170 | },
171 | "file_extension": ".py",
172 | "mimetype": "text/x-python",
173 | "name": "python",
174 | "nbconvert_exporter": "python",
175 | "pygments_lexer": "ipython3",
176 | "version": "3.7.1"
177 | }
178 | },
179 | "nbformat": 4,
180 | "nbformat_minor": 2
181 | }
182 |
--------------------------------------------------------------------------------
/Module-02b-Explore-Logistic.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Logistic - Explore the Data"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Raw Data\n",
15 | "\n",
16 | "You are provided with the following data: **loan_data.csv** \n",
17 | "This is the historical data that the bank has provided. It has the following columns\n",
18 | "\n",
19 | "**Application Attributes**:\n",
20 | "- `years`: Number of years the applicant has been employed \n",
21 | "- `ownership`: Whether the applicant owns a house or not \n",
22 | "- `income`: Annual income of the applicant \n",
23 | "- `age`: Age of the applicant \n",
24 | "\n",
25 | "**Behavioural Attributes**:\n",
26 | "- `grade`: Credit grade of the applicant\n",
27 | "\n",
28 | "**Outcome Variable**:\n",
29 | "- `amount` : Amount of Loan provided to the applicant \n",
30 | "- `interest`: Interest rate charged for the applicant \n",
31 | "- `default` : Whether the applicant has defaulted or not "
32 | ]
33 | },
34 | {
35 | "cell_type": "markdown",
36 | "metadata": {},
37 | "source": [
38 | "Let us build some intuition around the Loan Data"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "## Frame the Problem\n",
46 | "\n",
47 | "- What are the features\n",
 48 |     "- What is the target"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": null,
54 | "metadata": {
55 | "collapsed": true
56 | },
57 | "outputs": [],
58 | "source": []
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "metadata": {},
63 | "source": [
 64 |     "## Load the Refined Data"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 4,
70 | "metadata": {
71 | "collapsed": true
72 | },
73 | "outputs": [],
74 | "source": [
75 | "#Load the libraries\n",
76 | "import numpy as np\n",
77 | "import pandas as pd\n",
78 | "import matplotlib.pyplot as plt\n",
79 | "import seaborn as sns"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 5,
85 | "metadata": {
86 | "collapsed": true
87 | },
88 | "outputs": [],
89 | "source": [
90 | "#Default Variables\n",
91 | "%matplotlib inline\n",
92 | "plt.rcParams['figure.figsize'] = (16,9)\n",
93 | "plt.rcParams['font.size'] = 18\n",
94 | "plt.style.use('fivethirtyeight')\n",
95 | "pd.set_option('display.float_format', lambda x: '%.2f' % x)"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 6,
101 | "metadata": {
102 | "collapsed": true
103 | },
104 | "outputs": [],
105 | "source": [
106 | "#Load the dataset\n",
107 | "df = pd.read_csv(\"data/loan_data_clean.csv\")"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": 7,
113 | "metadata": {},
114 | "outputs": [
115 | {
116 | "data": {
117 | "text/html": [
118 | "
\n",
119 | "
\n",
120 | " \n",
121 | " \n",
122 | " | \n",
123 | " default | \n",
124 | " amount | \n",
125 | " interest | \n",
126 | " grade | \n",
127 | " years | \n",
128 | " ownership | \n",
129 | " income | \n",
130 | " age | \n",
131 | "
\n",
132 | " \n",
133 | " \n",
134 | " \n",
135 | " 0 | \n",
136 | " 0 | \n",
137 | " 5000 | \n",
138 | " 10.65 | \n",
139 | " B | \n",
140 | " 10.00 | \n",
141 | " RENT | \n",
142 | " 24000.00 | \n",
143 | " 33 | \n",
144 | "
\n",
145 | " \n",
146 | " 1 | \n",
147 | " 0 | \n",
148 | " 2400 | \n",
149 | " 10.99 | \n",
150 | " C | \n",
151 | " 25.00 | \n",
152 | " RENT | \n",
153 | " 12252.00 | \n",
154 | " 31 | \n",
155 | "
\n",
156 | " \n",
157 | " 2 | \n",
158 | " 0 | \n",
159 | " 10000 | \n",
160 | " 13.49 | \n",
161 | " C | \n",
162 | " 13.00 | \n",
163 | " RENT | \n",
164 | " 49200.00 | \n",
165 | " 24 | \n",
166 | "
\n",
167 | " \n",
168 | " 3 | \n",
169 | " 0 | \n",
170 | " 5000 | \n",
171 | " 10.99 | \n",
172 | " A | \n",
173 | " 3.00 | \n",
174 | " RENT | \n",
175 | " 36000.00 | \n",
176 | " 39 | \n",
177 | "
\n",
178 | " \n",
179 | " 4 | \n",
180 | " 0 | \n",
181 | " 3000 | \n",
182 | " 10.99 | \n",
183 | " E | \n",
184 | " 9.00 | \n",
185 | " RENT | \n",
186 | " 48000.00 | \n",
187 | " 24 | \n",
188 | "
\n",
189 | " \n",
190 | "
\n",
191 | "
"
192 | ],
193 | "text/plain": [
194 | " default amount interest grade years ownership income age\n",
195 | "0 0 5000 10.65 B 10.00 RENT 24000.00 33\n",
196 | "1 0 2400 10.99 C 25.00 RENT 12252.00 31\n",
197 | "2 0 10000 13.49 C 13.00 RENT 49200.00 24\n",
198 | "3 0 5000 10.99 A 3.00 RENT 36000.00 39\n",
199 | "4 0 3000 10.99 E 9.00 RENT 48000.00 24"
200 | ]
201 | },
202 | "execution_count": 7,
203 | "metadata": {},
204 | "output_type": "execute_result"
205 | }
206 | ],
207 | "source": [
208 | "df.head()"
209 | ]
210 | },
211 | {
212 | "cell_type": "markdown",
213 | "metadata": {},
214 | "source": [
215 | "## Dual Variable Exploration"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 1,
221 | "metadata": {},
222 | "outputs": [],
223 | "source": [
224 | "# Create a crosstab of default and grade\n"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": 2,
230 | "metadata": {},
231 | "outputs": [],
232 | "source": [
233 | "# Create a crosstab of default and grade - percentage by default type\n"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": 3,
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "# Create a crosstab of default and grade - percentage by all type\n"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": 4,
248 | "metadata": {},
249 | "outputs": [],
250 | "source": [
251 | "# Create a crosstab of default and grade - percentage by default type\n"
252 | ]
253 | },
254 | {
255 | "cell_type": "markdown",
256 | "metadata": {},
257 | "source": [
258 | "### Explore the impact of `ownership` with `default`"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": null,
264 | "metadata": {
265 | "collapsed": true
266 | },
267 | "outputs": [],
268 | "source": []
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": null,
273 | "metadata": {
274 | "collapsed": true
275 | },
276 | "outputs": [],
277 | "source": []
278 | },
279 | {
280 | "cell_type": "code",
281 | "execution_count": null,
282 | "metadata": {
283 | "collapsed": true
284 | },
285 | "outputs": [],
286 | "source": []
287 | },
288 | {
289 | "cell_type": "markdown",
290 | "metadata": {},
291 | "source": [
292 |     "### Explore the impact of `age` with `default`"
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": null,
298 | "metadata": {},
299 | "outputs": [],
300 | "source": []
301 | },
302 | {
303 | "cell_type": "markdown",
304 | "metadata": {},
305 | "source": [
306 | "### Explore the impact of `income` with `default`"
307 | ]
308 | },
309 | {
310 | "cell_type": "code",
311 | "execution_count": null,
312 | "metadata": {},
313 | "outputs": [],
314 | "source": []
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": 17,
319 | "metadata": {
320 | "collapsed": true
321 | },
322 | "outputs": [],
323 | "source": [
324 | "# Create the transformed income variable\n"
325 | ]
326 | },
327 | {
328 | "cell_type": "markdown",
329 | "metadata": {},
330 | "source": [
331 | "### Explore the impact of `years` with `default`"
332 | ]
333 | },
334 | {
335 | "cell_type": "code",
336 | "execution_count": null,
337 | "metadata": {
338 | "collapsed": true
339 | },
340 | "outputs": [],
341 | "source": []
342 | },
343 | {
344 | "cell_type": "markdown",
345 | "metadata": {},
346 | "source": [
347 | "## Three Variable Exploration"
348 | ]
349 | },
350 | {
351 | "cell_type": "code",
352 | "execution_count": 5,
353 | "metadata": {},
354 | "outputs": [],
355 | "source": [
356 | "#Plot age, years and default\n"
357 | ]
358 | },
359 | {
360 | "cell_type": "markdown",
361 | "metadata": {},
362 | "source": [
363 | "### Explore the relationship of `age`, `income` and `default`"
364 | ]
365 | },
366 | {
367 | "cell_type": "code",
368 | "execution_count": null,
369 | "metadata": {},
370 | "outputs": [],
371 | "source": []
372 | },
373 | {
374 | "cell_type": "code",
375 | "execution_count": null,
376 | "metadata": {
377 | "collapsed": true
378 | },
379 | "outputs": [],
380 | "source": []
381 | },
382 | {
383 | "cell_type": "markdown",
384 | "metadata": {},
385 | "source": [
386 |     "### Explore the relationship of `age`, `grade` and `default`"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": null,
392 | "metadata": {},
393 | "outputs": [],
394 | "source": []
395 | },
396 | {
397 | "cell_type": "code",
398 | "execution_count": null,
399 | "metadata": {
400 | "collapsed": true
401 | },
402 | "outputs": [],
403 | "source": []
404 | }
405 | ],
406 | "metadata": {
407 | "anaconda-cloud": {},
408 | "kernelspec": {
409 | "display_name": "Python 3",
410 | "language": "python",
411 | "name": "python3"
412 | },
413 | "language_info": {
414 | "codemirror_mode": {
415 | "name": "ipython",
416 | "version": 3
417 | },
418 | "file_extension": ".py",
419 | "mimetype": "text/x-python",
420 | "name": "python",
421 | "nbconvert_exporter": "python",
422 | "pygments_lexer": "ipython3",
423 | "version": "3.7.1"
424 | }
425 | },
426 | "nbformat": 4,
427 | "nbformat_minor": 2
428 | }
429 |
--------------------------------------------------------------------------------
/Module-03f-Model-Boosting.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Gradient Boosting Machines"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "**Boosting** \n",
15 | "\n",
16 | "Combine weak learners to build a strong model\n",
17 | "\n",
18 | "How does this work?\n",
19 | "\n",
20 | "Build a base model:\n",
21 | "\n",
22 | "`Y = Model1(x) + Error1`\n",
23 | "\n",
 24 |     "Models are abstractions. There will be error between predictions and actual values\n",
25 | "\n",
 26 |     "What if this error can be modeled? Say:\n",
27 | "\n",
28 | "`Error1 = Model2(x) + Error2`\n",
29 | "\n",
30 | "If modeled right, this will improve the accuracy of the predictions.\n",
31 | "\n",
32 | "And we can continue:\n",
33 | "\n",
34 | "`Error2 = Model3(x) + Error3`\n",
35 | "\n",
36 | "Combining these three steps, we have:\n",
37 | "\n",
38 | "`Y = Model1(x) + Model2(x) + Model3(x) + Error3`\n",
39 | "\n",
40 | "And if we find weights(parameters) for these models?\n",
41 | "\n",
42 | "$$ Y = \\alpha Model1(x) + \\beta Model2(x) + \\gamma Model3(x) + Error3$$"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {
48 | "collapsed": true
49 | },
50 | "source": [
51 | "### Intuition\n",
52 | "\n",
53 | ""
54 | ]
55 | },
56 | {
57 | "cell_type": "markdown",
58 | "metadata": {
59 | "collapsed": true
60 | },
61 | "source": [
62 | "### AdaBoost\n",
63 | "\n",
64 | "AdaBoost is done using decision stump"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "metadata": {},
70 | "source": [
71 | "**Exercise**\n",
72 | "\n",
73 | "Run `sklearn.ensemble.AdaBoostClassifier`"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {
80 | "collapsed": true
81 | },
82 | "outputs": [],
83 | "source": []
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {
89 | "collapsed": true
90 | },
91 | "outputs": [],
92 | "source": []
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "metadata": {},
97 | "source": [
98 | "`xgboost`"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 5,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "import xgboost as xgb\n",
108 | "import pandas as pd\n",
109 | "import numpy as np"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 6,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "#Read the data\n",
119 | "df = pd.read_csv(\"data/historical_loan.csv\")\n",
120 | "\n",
121 | "# refine the data\n",
122 | "df.years = df.years.fillna(np.mean(df.years))\n",
123 | "\n",
124 | "#Load the preprocessing module\n",
125 | "from sklearn import preprocessing\n",
126 | "categorical_variables = df.dtypes[df.dtypes==\"object\"].index.tolist()\n",
127 | "for i in categorical_variables:\n",
128 | " lbl = preprocessing.LabelEncoder()\n",
129 | " lbl.fit(list(df[i]))\n",
130 | " df[i] = lbl.transform(df[i])"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 11,
136 | "metadata": {},
137 | "outputs": [
138 | {
139 | "data": {
140 | "text/html": [
141 | "\n",
142 | "\n",
155 | "
\n",
156 | " \n",
157 | " \n",
158 | " | \n",
159 | " default | \n",
160 | " amount | \n",
161 | " grade | \n",
162 | " years | \n",
163 | " ownership | \n",
164 | " income | \n",
165 | " age | \n",
166 | "
\n",
167 | " \n",
168 | " \n",
169 | " \n",
170 | " 0 | \n",
171 | " 0 | \n",
172 | " 1000 | \n",
173 | " 1 | \n",
174 | " 2.0 | \n",
175 | " 3 | \n",
176 | " 19200.0 | \n",
177 | " 24 | \n",
178 | "
\n",
179 | " \n",
180 | " 1 | \n",
181 | " 1 | \n",
182 | " 6500 | \n",
183 | " 0 | \n",
184 | " 2.0 | \n",
185 | " 0 | \n",
186 | " 66000.0 | \n",
187 | " 28 | \n",
188 | "
\n",
189 | " \n",
190 | " 2 | \n",
191 | " 0 | \n",
192 | " 2400 | \n",
193 | " 0 | \n",
194 | " 2.0 | \n",
195 | " 3 | \n",
196 | " 60000.0 | \n",
197 | " 36 | \n",
198 | "
\n",
199 | " \n",
200 | " 3 | \n",
201 | " 0 | \n",
202 | " 10000 | \n",
203 | " 2 | \n",
204 | " 3.0 | \n",
205 | " 3 | \n",
206 | " 62000.0 | \n",
207 | " 24 | \n",
208 | "
\n",
209 | " \n",
210 | " 4 | \n",
211 | " 1 | \n",
212 | " 4000 | \n",
213 | " 2 | \n",
214 | " 2.0 | \n",
215 | " 3 | \n",
216 | " 20000.0 | \n",
217 | " 28 | \n",
218 | "
\n",
219 | " \n",
220 | "
\n",
221 | "
"
222 | ],
223 | "text/plain": [
224 | " default amount grade years ownership income age\n",
225 | "0 0 1000 1 2.0 3 19200.0 24\n",
226 | "1 1 6500 0 2.0 0 66000.0 28\n",
227 | "2 0 2400 0 2.0 3 60000.0 36\n",
228 | "3 0 10000 2 3.0 3 62000.0 24\n",
229 | "4 1 4000 2 2.0 3 20000.0 28"
230 | ]
231 | },
232 | "execution_count": 11,
233 | "metadata": {},
234 | "output_type": "execute_result"
235 | }
236 | ],
237 | "source": [
238 | "df.head()"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": 12,
244 | "metadata": {},
245 | "outputs": [],
246 | "source": [
247 | "# Setup the features and target\n",
248 | "X = df.iloc[:,1:]\n",
249 | "y = df.iloc[:,0]"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": 13,
255 | "metadata": {
256 | "collapsed": true
257 | },
258 | "outputs": [],
259 | "source": [
260 | "from sklearn.model_selection import train_test_split"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": 14,
266 | "metadata": {},
267 | "outputs": [],
268 | "source": [
269 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
270 | ]
271 | },
272 | {
273 | "cell_type": "markdown",
274 | "metadata": {},
275 | "source": [
276 | "Details on the various parameters for xgboost can be found here:\n",
277 | "https://github.com/dmlc/xgboost/blob/master/doc/parameter.md\n",
278 | "\n",
279 | "\n",
280 | " General Parameters\n",
281 | "------------------\n",
282 | "* booster [default=gbtree]\n",
283 | " - which booster to use, can be gbtree, gblinear or dart. gbtree and dart use tree based model while gblinear uses linear function.\n",
284 | "* silent [default=0]\n",
285 | " - 0 means printing running messages, 1 means silent mode.\n",
286 | "* nthread [default to maximum number of threads available if not set]\n",
287 | " - number of parallel threads used to run xgboost\n",
288 | "* num_pbuffer [set automatically by xgboost, no need to be set by user]\n",
289 | " - size of prediction buffer, normally set to number of training instances. The buffers are used to save the prediction results of last boosting step.\n",
290 | "* num_feature [set automatically by xgboost, no need to be set by user]\n",
291 | " - feature dimension used in boosting, set to maximum dimension of the feature\n",
292 | "\n",
293 | "Parameters for Tree Booster\n",
294 | "---------------------------\n",
295 | "* eta [default=0.3]\n",
296 | " - step size shrinkage used in update to prevents overfitting. After each boosting step, we can directly get the weights of new features. and eta actually shrinks the feature weights to make the boosting process more conservative.\n",
297 | " - range: [0,1]\n",
298 | "* gamma [default=0]\n",
299 | " - minimum loss reduction required to make a further partition on a leaf node of the tree. The larger, the more conservative the algorithm will be.\n",
300 | " - range: [0,∞]\n",
301 | "* max_depth [default=6]\n",
302 | " - maximum depth of a tree, increase this value will make the model more complex / likely to be overfitting.\n",
303 | " - range: [1,∞]\n",
304 | "* min_child_weight [default=1]\n",
305 | " - minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be.\n",
306 | " - range: [0,∞]\n",
307 | "* max_delta_step [default=0]\n",
308 | " - Maximum delta step we allow each tree's weight estimation to be. If the value is set to 0, it means there is no constraint. If it is set to a positive value, it can help making the update step more conservative. Usually this parameter is not needed, but it might help in logistic regression when class is extremely imbalanced. Set it to value of 1-10 might help control the update\n",
309 | " - range: [0,∞]\n",
310 | "* subsample [default=1]\n",
311 | " - subsample ratio of the training instance. Setting it to 0.5 means that XGBoost randomly collected half of the data instances to grow trees and this will prevent overfitting.\n",
312 | " - range: (0,1]\n",
313 | "* colsample_bytree [default=1]\n",
314 | " - subsample ratio of columns when constructing each tree.\n",
315 | " - range: (0,1]\n",
316 | "* colsample_bylevel [default=1]\n",
317 | " - subsample ratio of columns for each split, in each level.\n",
318 | " - range: (0,1]\n",
319 | "* lambda [default=1]\n",
320 | " - L2 regularization term on weights, increase this value will make model more conservative.\n",
321 | "* alpha [default=0]\n",
322 | " - L1 regularization term on weights, increase this value will make model more conservative.\n",
323 | "* tree_method, string [default='auto']\n",
324 | " - The tree construction algorithm used in XGBoost(see description in the [reference paper](http://arxiv.org/abs/1603.02754))\n",
325 | " - Distributed and external memory version only support approximate algorithm.\n",
326 | " - Choices: {'auto', 'exact', 'approx'}\n",
327 | " - 'auto': Use heuristic to choose faster one.\n",
328 | " - For small to medium dataset, exact greedy will be used.\n",
329 | " - For very large-dataset, approximate algorithm will be chosen.\n",
330 | " - Because old behavior is always use exact greedy in single machine,\n",
331 | " user will get a message when approximate algorithm is chosen to notify this choice.\n",
332 | " - 'exact': Exact greedy algorithm.\n",
333 | " - 'approx': Approximate greedy algorithm using sketching and histogram.\n",
334 | "* sketch_eps, [default=0.03]\n",
335 | " - This is only used for approximate greedy algorithm.\n",
336 | " - This roughly translated into ```O(1 / sketch_eps)``` number of bins.\n",
337 | " Compared to directly select number of bins, this comes with theoretical guarantee with sketch accuracy.\n",
338 | " - Usually user does not have to tune this.\n",
339 | " but consider setting to a lower number for more accurate enumeration.\n",
340 | " - range: (0, 1)\n",
341 | "* scale_pos_weight, [default=1]\n",
342 | " - Control the balance of positive and negative weights, useful for unbalanced classes. A typical value to consider: sum(negative cases) / sum(positive cases) See [Parameters Tuning](how_to/param_tuning.md) for more discussion. Also see Higgs Kaggle competition demo for examples: [R](../demo/kaggle-higgs/higgs-train.R ), [py1](../demo/kaggle-higgs/higgs-numpy.py ), [py2](../demo/kaggle-higgs/higgs-cv.py ), [py3](../demo/guide-python/cross_validation.py)\n",
343 | "* updater_seq, [default=\"grow_colmaker,prune\"]\n",
344 |     "  - A comma separated string mentioning the sequence of Tree updaters that should be run. A tree updater is a pluggable operation performed on the tree at every step using the gradient information. Tree updaters can be registered using the plugin system provided.\n",
345 | "\n",
346 | "\n",
347 | "Learning Task Parameters\n",
348 | "------------------------\n",
349 | "Specify the learning task and the corresponding learning objective. The objective options are below:\n",
350 | "* objective [ default=reg:linear ]\n",
351 | " - \"reg:linear\" --linear regression\n",
352 | " - \"reg:logistic\" --logistic regression\n",
353 | " - \"binary:logistic\" --logistic regression for binary classification, output probability\n",
354 | " - \"binary:logitraw\" --logistic regression for binary classification, output score before logistic transformation\n",
355 | " - \"count:poisson\" --poisson regression for count data, output mean of poisson distribution\n",
356 | " - max_delta_step is set to 0.7 by default in poisson regression (used to safeguard optimization)\n",
357 | " - \"multi:softmax\" --set XGBoost to do multiclass classification using the softmax objective, you also need to set num_class(number of classes)\n",
358 | " - \"multi:softprob\" --same as softmax, but output a vector of ndata * nclass, which can be further reshaped to ndata, nclass matrix. The result contains predicted probability of each data point belonging to each class.\n",
359 | " - \"rank:pairwise\" --set XGBoost to do ranking task by minimizing the pairwise loss\n",
360 | " - \"reg:gamma\" --gamma regression for severity data, output mean of gamma distribution\n",
361 | " - \"reg:tweedie\" --tweedie regression for insurance data\n",
362 | " - tweedie_variance_power is set to 1.5 by default in tweedie regression and must be in the range [1, 2)\n",
363 | "* base_score [ default=0.5 ]\n",
364 | " - the initial prediction score of all instances, global bias\n",
365 | " - for sufficient number of iterations, changing this value will not have too much effect.\n",
366 | "* eval_metric [ default according to objective ]\n",
367 | " - evaluation metrics for validation data, a default metric will be assigned according to objective (rmse for regression, and error for classification, mean average precision for ranking )\n",
368 | " - User can add multiple evaluation metrics, for python user, remember to pass the metrics in as list of parameters pairs instead of map, so that latter 'eval_metric' won't override previous one\n",
369 | " - The choices are listed below:\n",
370 | " - \"rmse\": [root mean square error](http://en.wikipedia.org/wiki/Root_mean_square_error)\n",
371 | " - \"mae\": [mean absolute error](https://en.wikipedia.org/wiki/Mean_absolute_error)\n",
372 | " - \"logloss\": negative [log-likelihood](http://en.wikipedia.org/wiki/Log-likelihood)\n",
373 | " - \"error\": Binary classification error rate. It is calculated as #(wrong cases)/#(all cases). For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances.\n",
374 | " - \"merror\": Multiclass classification error rate. It is calculated as #(wrong cases)/#(all cases).\n",
375 | " - \"mlogloss\": [Multiclass logloss](https://www.kaggle.com/wiki/MultiClassLogLoss)\n",
376 | " - \"auc\": [Area under the curve](http://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_curve) for ranking evaluation.\n",
377 |     "        - \"ndcg\": [Normalized Discounted Cumulative Gain](http://en.wikipedia.org/wiki/NDCG)\n",
378 |     "        - \"map\": [Mean average precision](http://en.wikipedia.org/wiki/Mean_average_precision#Mean_average_precision)\n",
379 | " - \"ndcg@n\",\"map@n\": n can be assigned as an integer to cut off the top positions in the lists for evaluation.\n",
380 |     "        - \"ndcg-\",\"map-\",\"ndcg@n-\",\"map@n-\": In XGBoost, NDCG and MAP will evaluate the score of a list without any positive samples as 1. By adding \"-\" in the evaluation metric XGBoost will evaluate these scores as 0 to be consistent under some conditions, such as when\n",
381 |     "      training repeatedly\n",
382 | " - \"gamma-deviance\": [residual deviance for gamma regression]\n",
383 | "* seed [ default=0 ]\n",
384 | " - random number seed.\n",
385 | "\n",
386 | "\n"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": 15,
392 | "metadata": {
393 | "collapsed": true
394 | },
395 | "outputs": [],
396 | "source": [
397 | "#Parameters\n",
398 | "\n",
399 | "params = {}\n",
400 | "params[\"min_child_weight\"] = 3\n",
401 | "params[\"subsample\"] = 0.7\n",
402 | "params[\"colsample_bytree\"] = 0.7\n",
403 | "params[\"scale_pos_weight\"] = 1\n",
404 | "params[\"silent\"] = 0\n",
405 | "params[\"max_depth\"] = 4\n",
406 | "params[\"nthread\"] = 6\n",
407 | "params[\"gamma\"] = 1\n",
408 | "params[\"objective\"] = \"binary:logistic\"\n",
409 | "params[\"eta\"] = 0.005\n",
410 | "params[\"base_score\"] = 0.1\n",
411 | "params[\"eval_metric\"] = \"auc\"\n",
412 | "params[\"seed\"] = 123\n",
413 | "\n",
414 | "plst = list(params.items())\n",
415 | "num_rounds = 40"
416 | ]
417 | },
418 | {
419 | "cell_type": "code",
420 | "execution_count": 16,
421 | "metadata": {},
422 | "outputs": [],
423 | "source": [
424 | "xgtrain = xgb.DMatrix(X_train, label=y_train)\n",
425 | "watchlist = [(xgtrain, 'train')]"
426 | ]
427 | },
428 | {
429 | "cell_type": "code",
430 | "execution_count": 17,
431 | "metadata": {
432 | "collapsed": true
433 | },
434 | "outputs": [],
435 | "source": [
436 | "model_xgboost = xgb.train(plst, xgtrain, num_rounds)"
437 | ]
438 | },
439 | {
440 | "cell_type": "code",
441 | "execution_count": 18,
442 | "metadata": {
443 | "collapsed": true
444 | },
445 | "outputs": [],
446 | "source": [
447 | "import matplotlib.pyplot as plt\n",
448 | "%matplotlib inline"
449 | ]
450 | },
451 | {
452 | "cell_type": "code",
453 | "execution_count": 19,
454 | "metadata": {},
455 | "outputs": [
456 | {
457 | "data": {
458 | "text/plain": [
459 | ""
460 | ]
461 | },
462 | "execution_count": 19,
463 | "metadata": {},
464 | "output_type": "execute_result"
465 | },
466 | {
467 | "data": {
468 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAhwAAAFrCAYAAACaMVCYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3XmYnWV9//H3J0RkDQQECWu0QEFZIqASXIqCFgRNXFqK\nWAFRS1sKuED5uQZaFalKg7hUQKBCQQFlECyFIhSqBAgYVkEQYkERREDCTuD7++M8iYcwITPDPDkz\nk/frus41z7mf5XzPzQnzmfu+zzmpKiRJkto0rtcFSJKksc/AIUmSWmfgkCRJrTNwSJKk1hk4JElS\n6wwckiSpdQYOSSNekm8m+XSv65A0dPFzOKSxK8lc4KXA013Nm1bVb17ANXcETqmq9V9YdaNTkpOA\nu6rqU72uRRpNHOGQxr63V9UqXbchh43hkGR8Lx//hUiyXK9rkEYrA4e0jEqyfZKfJnkwybXNyMWC\nffsm+XmSeUluT/I3TfvKwH8C6yZ5uLmtm+SkJP/cdf6OSe7quj83yT8muQ54JMn45ryzkvwuyR1J\nDnyeWhdef8G1kxya5N4kdyeZnuRtSX6R5P4kn+g6d0aSM5N8t3k+1yTZumv/5kkuafrhxiTvWORx\nv5HkR0keAfYD9gIObZ77D5vjDkvyy+b6NyV5Z9c19knyv0m+lOSB5rnu2rV/jSQnJvlNs//srn27\nJ5nT1PbTJFsN+D+wNMIYOKRlUJL1gPOAfwbWAD4OnJVkreaQe4HdgQnAvsDRSbapqkeAXYHfDGHE\nZE9gN2B14Bngh8C1wHrATsDBSf58gNdaB1ihOfczwHHA+4BtgTcAn0ny8q7jpwFnNM/1P4Czk7wo\nyYuaOi4A1gb+ATg1yZ92nfte4HPAqsC/A6cCRzXP/e3NMb9sHnc14HDglCSTuq7xWuAW4CXAUcAJ\nSdLs+w6wEvDKpoajAZJsA3wb+BtgTeDfgHOSvHiAfSSNKAYOaew7u/kL+cGuv57fB/yoqn5UVc9U\n1YXAbOBtAFV1XlX9sjr+h84v5De8wDqOqao7q+ox4NXAWlV1RFU9WVW30wkNfzXAaz0FfK6qngJO\np/OLfGZVzauqG4Ebge7RgKur6szm+K/QCSvbN7dVgCObOn4MnEsnHC3QV1U/afrp8f6Kqaozquo3\nzTHfBW4FXtN1yK+q6riqeho4GZgEvLQJJbsC+1fVA1X1VNPfAB8C/q2qrqiqp6vqZOCJpmZp1Bm1\nc6mSBmx6Vf33Im0bAX+R5O1dbS8CLgZohvw/C2xK5w+TlYDrX2Addy7y+OsmebCrbTngsgFe6/fN\nL2+Ax5qf93Ttf4xOkHjOY1fVM810z7oL9lXVM13H/orOyEl/dfcryfuBjwKTm6ZV6ISgBX7b9fiP\nNoMbq9AZcbm/qh7o57IbAXsn+YeutuW76pZGFQOHtGy6E/hOVX1o0R3NkP1ZwPvp/HX/VDMysmAK\noL+3tj1CJ5QssE4/x3SfdydwR1VtMpTih2CDBRtJxgHrAwumgjZIMq4rdGwI/KLr3EWf77PuJ9mI\nzujMTsDlVfV0kjn8sb+ez53AGklWr6oH+9n3uar63ACuI414TqlIy6ZTgLcn+fMkyyVZoVmMuT6d\nv6JfDPwOmN+Mdry169x7gDWTrNbVNgd4W7MAch3g4CU8/pXAQ81C0hWbGrZI8uphe4bPtm2SdzXv\nkDmYztTELOAKOmHp0GZNx47A2+lM0yzOPUD3+pCV6YSQ30FnwS2wxUCKqqq76SzC/XqSiU0Nb2x2\nHwfsn+S16Vg5yW5JVh3gc5ZGFAOHtAyqqjvpLKT8BJ1flHcChwDjqmoecCDwPeABOosmz+k692bg\nNOD2Zl3IunQWPl4LzKWz3uO7S3j8p+n8Yp8C3AHcBxxPZ9FlG/qAPeg8n78G3tWsl3gSeAeddRT3\nAV8H3t88x8U5AXjFgjUxVXUT8GXgcj
phZEvgJ4Oo7a/prEm5mc5i3YMBqmo2nXUcxzZ13wbsM4jr\nSiOKH/wlaUxLMgPYuKre1+tapGWZIxySJKl1Bg5JktQ6p1QkSVLrHOGQJEmt83M4htGECRNq0003\n7XUZo8qDDz7I6quv3usyRh37bfDss8Gzz4ZmWeu3q6+++r6qWmtJxxk4htHaa6/N7Nmze13GqNLX\n18e0adN6XcaoY78Nnn02ePbZ0Cxr/ZbkVwM5zikVSZLUOgOHJElqnYFDkiS1zsAhSZJaZ+CQJEmt\nM3BIkqTWGTgkSVLrDBySJKl1Bg5JktQ6A4ckSWqdgUOSJLXOwCFJklpn4JAkSa0zcEiSpNYZOCRJ\nUusMHJIkqXUGDkmS1DoDhyRJap2BQ5Iktc7AIUmSWmfgkCRJrTNwSJKk1hk4JElS6wwckiSpdQYO\nSZLUOgOHJElqnYFDkiS1zsAhSZJaZ+CQJEmtM3BIkqTWGTgkSVLrDBySJKl1Bg5JktQ6A4ckSWqd\ngUOSJLXOwCFJklpn4JAkSa1LVfW6hjFj4403rvnvmdnrMkaVmVPnc9Dl43tdxqhjvw2efTZ49tnQ\nDGe/zT1yt+e0feADH+Dcc89l7bXX5oYbbgDg/vvvZ4899mDu3LlMnjyZ733ve0ycOJFLLrmEadOm\n8bKXvQyAd73rXXzmM58ZltoWSHJ1VW23pONG5AhHkp/2ugZJkkaiffbZh/PPP/9ZbUceeSQ77bQT\nt956KzvttBNHHnnkwn1veMMbmDNnDnPmzBn2sDEYIzJwVNUOva5BkqSR6I1vfCNrrLHGs9r6+vrY\ne++9Adh77705++yze1Ha8xqRgSPJw83PHZNckuTMJDcnOTVJmn2vTvLTJNcmuTLJqklWSHJikuuT\n/CzJm5pj90lydpIfJrkjyQFJPtocMyvJGs1xf5Lk/CRXJ7ksyWa96wVJkgbmnnvuYdKkSQBMmjSJ\ne++9d+G+yy+/nK233ppdd92VG2+8sVclMhom514FvBL4DfAT4HVJrgS+C+xRVVclmQA8BhwEUFVb\nNmHhgiSbNtfZornWCsBtwD9W1auSHA28H/hX4FvA/lV1a5LXAl8H3vx8xSWZAXwWYOLEiZw4df7w\nPfNlxEz7bEjst8GzzwbPPhua4eq3vr6+ftvvuece5s2bt3D/U0899axjF9x/9NFH+drXvsaKK67I\n7Nmzectb3sI3vvGNYaltsEZD4Liyqu4CSDIHmAz8Abi7qq4CqKqHmv2vB77atN2c5FfAgsBxcVXN\nA+Yl+QPww6b9emCrJKsAOwBnNIMoAC9eUnFVNQOYAZ1Foy6wGhwXpQ2N/TZ49tng2WdD0/aiUYC5\nc+dyzDHHMG3aNADWX399XvOa1zBp0iTuvvtu1ltvvYX7Fpg2bRrf+c53eN3rXsdLXvKSYalvMEbk\nlMoinujafppOSArQ39tr0k9bf9d5puv+M801xwEPVtWUrtvmQy9bkqSl4x3veAcnn3wyACeffPLC\nsPHb3/6WBe9GvfLKK3nmmWdYc801e1LjaAgc/bkZWDfJqwGa9RvjgUuBvZq2TYENgVsGcsFmlOSO\nJH/RnJ8kW7dRvCRJQ7XnnnsydepUbrnlFtZff31OOOEEDjvsMC688EI22WQTLrzwQg477DAAzjzz\nTLbYYgu23nprDjzwQE4//XS6RvGXqlE5VlZVTybZA/hqkhXprN/Ymc6ai28muR6YD+xTVU8MonP3\nAr6R5FPAi4DTgWsHU9vihr/Uv76+PvtsCOy3wbPPBs8+G5q2++20007rt/2iiy56TtsBBxzAAQcc\n0FotgzEiA0dVrdL8vAS4pKv9gK7tq4Dt+zl9n36udxJwUtf9yf3tq6o7gF2GXrkkSerPaJ1SkSRJ\no4iBQ5Iktc7AIUmSWmfgkCRJrTNwSJKk1hk4JElS6wwckiSpdQYOSZLUOgOHJElqnYFDkiS1zsAh\nSZ
JaZ+CQJEmtM3BIkqTWGTgkSVLrDBySJKl1Bg5JktQ6A4ckSWqdgUOSJLXOwCFJklpn4JAkSa0z\ncEiSpNYZOCRJUusMHJIkqXUGDkmS1DoDhyRJap2BQ5Iktc7AIUmSWmfgkCRJrTNwSJKk1hk4JElS\n6wwckiSpdQYOSZLUuvG9LmCsmXzYeb0uYVSZOdU+Gwr7bfDss8Fbmn0298jdlsrjqHcc4ZAkjVhH\nH300r3zlK9liiy3Yc889efzxx3nDG97AlClTmDJlCuuuuy7Tp0/vdZkaAEc4+pFkOvCLqrqp17VI\n0rLq17/+Nccccww33XQTK664In/5l3/J6aefzmWXXbbwmHe/+91Mmzath1VqoBzh6N904BW9LkKS\nlnXz58/nscceY/78+Tz66KOsu+66C/fNmzePH//4x45wjBIjLnAkOTvJ1UluTPLhpu3hJF9s2v87\nyWuSXJLk9iTvaI5ZIcmJSa5P8rMkb2ra90lybNf1z02yY9d1P5fk2iSzkrw0yQ7AO4B/STInyZ8s\n9U6QJLHeeuvx8Y9/nA033JBJkyax2mqr8da3vnXh/h/84AfstNNOTJgwoYdVaqBSVb2u4VmSrFFV\n9ydZEbgK+DPgPuBtVfWfSX4ArAzsRmcU4uSqmpLkY8AWVbVvks2AC4BNgb8CtquqA5rrnwt8qaou\nSVLAO6rqh0mOAh6qqn9OchJwblWdOYB6ZwCfBZg4cSInnnjicHaHJC2zHn74Yb74xS/y8Y9/nJVX\nXpmjjjqKHXbYgR133BGAI444gp133pkddtiht4Uu46ZPn351VW23pONG4hqOA5O8s9neANgEeBI4\nv2m7Hniiqp5Kcj0wuWl/PfBVgKq6Ocmv6ASO5/MkcG6zfTXwlsEWW1UzgBkAG2+8cR10+Ujs0pFr\n5tT52GeDZ78Nnn02eEuzz/p7l8oZZ5zBq1/9at7//vcD8MgjjzBr1iymTZvG73//e+bOncvhhx/O\nCiussFRqHKi+vj7XlfRjRE2pNFMdOwNTq2pr4GfACsBT9cehmGeAJwCq6hn+GJqymMvO59nPs/uV\n2X3dpxmZAUySlkkbbrghs2bN4tFHH6WquOiii9h8882BThjZfffdR1zY0OKNqMABrAY8UFWPNtMi\n2w/i3EuBvQCSbApsCNwCzAWmJBmXZAPgNQO41jxg1cEULkkaXq997Wt5z3vewzbbbMOWW27JM888\nw4c//GEATj/9dPbcc88eV6jBGGl/0Z8P7J/kOjphYdYgzv068M1mmmU+sE9VPZHkJ8AddKZibgCu\nGcC1TgeOS3Ig8J6q+uVAi/DDawanr6/PPhsC+23w7LPBGwl9dvjhh3P44Yc/p/2SSy5Z+sXoBRlR\ngaOqngB27WfXKl3HzFjknFWan48D+/RzzaIZ+ehnX/d1zwTObLZ/gm+LlSRp2Iy0KRVJkjQGGTgk\nSVLrDBySJKl1Bg5JktQ6A4ckSWqdgUOSJLXOwCFJklpn4JAkSa0zcEiSpNYZOCRJUusMHJIkqXUG\nDkmS1DoDhyRJap2BQ5Iktc7AIUmSWmfgkCRJrTNwSJKk1hk4JElS6wwckiSpdQYOSZLUOgOHJElq\nnYFDkiS1zsAhSZJaZ+CQJEmtM3BIkqTWGTgkSVLrDBySJKl1Bg5JktQ6A4ckSWqdgUOSJLXOwCFJ\nklpn4JAkSa0b3+sCxprJh53X6xJGlZlT7bOhsN8G74X22dwjd3tO24MPPsgHP/hBbrjhBpLw7W9/\nm6lTpwLwpS99iUMOOYTf/e53vOQlLxny40pjxTIfOJLMBbarqvt6XYuk0eWggw5il1124cwzz+TJ\nJ5/k0UcfBeDOO+/kwgsvZMMNN+xxhdLIMSanVJIs80FKUrseeughLr30Uvbbbz8All9+eVZffXUA\nPvKRj3DUUUeRpJclSiPKqPzFnOTTwF7AncB9wNXA7sBPgdcB5yT5
BfApYHng98BeVXVPkjWB04C1\ngCuBdF33fcCBzTlXAH9XVU8vreclafS4/fbbWWuttdh333259tpr2XbbbZk5cyYXXXQR6623Hltv\nvXWvS5RGlFRVr2sYlCTbAccDU+kEpmuAf6MTOG6qqr9rjpsIPFhVleSDwOZV9bEkxwD3VdURSXYD\nzqUTPtYCjgLeVVVPJfk6MKuq/n0J9cwAPgswceJETjzxxOF/0pJGnNtuu41DDz2UI488kk033ZTj\njz+e8ePHc+ONNzJjxgxWXnllPvShD/HlL3+ZCRMm9LpcqTXTp0+/uqq2W9JxozFwHAxMrKrPNve/\nAvyGTuD4bFX9T9O+JfBlYBKdEYs7qmqXJHPohIrbm+PuBzYF/gr4BHBv81ArAqdV1YyB1rbxxhvX\n/PfMfOFPchkyc+p8Drp8VA609ZT9NngvtM8WXTT629/+lu233565c+cCcNlllzFjxgyuv/56Vlpp\nJQDuuusu1l13Xa688krWWWedIT92r/T19TFt2rRelzHqLGv9lmRAgWNAaziSfDTJas32d5LcnOSt\nL7TIIXq+SdFHura/ChxbVVsCfwOs0LWvv5QV4OSqmtLc/nQwYUPSsmWdddZhgw024JZbbgHgoosu\nYptttuHee+9l7ty5zJ07l/XXX59rrrlmVIYNabgNdNHoPlX1hyRvAtYGPgB8vr2yntf/Am9PskKS\nVYDnvletYzXg18323l3tl9JZ/0GSXYGJTftFwHuSrN3sWyPJRsNdvKSx46tf/Sp77bUXW221FXPm\nzOETn/hEr0uSRqyBji8uWDj5JuDUqvppkp68w6WqrkpyDnAt8CtgNvCHfg6dAZyR5NfALOBlTfvh\nwGlJrgH+B/i/5ro3JfkUcEHz3J4C/r55DEl6jilTpjB79uzF7l8w3SJp4IHjsSSfpDMy8Lp03uu1\nfHtlLdGXqmpGkpXojFh8uaqO6z6gqvqAvkVPrKrfA93TQR/p2vdd4LsvpLD+PhxIi9fX12efDYH9\nNnj2mdRbA55SofMujkOq6rfAy4FT2ypqAL7VLP68Bjirqq7pYS2SJGkJBjTCUVW/AA5OslZz/5fA\nF9osbAn1vLdXjy1JkgZvoO9SeW2SX9EZUSDJdkm+1WplkiRpzBjolMpXgF3pfKonVTWbzid6SpIk\nLdFAA8fyVXXTIm1PDncxkiRpbBpo4Hii+cyLAkjyCuDx1qqSJEljykDfFvt54AJg3SQnAbsA72ur\nKEmSNLYM9F0qP0pyM/DndD4C/J+r6rZWK5MkSWPGEgNHkuWAs6vq7cA32i9JkiSNNUtcw1FVTwMr\n9uqjzCVJ0ug30DUcVwDfT/IfwMMLGqvqR61UJUmSxpSBBo4dmp9/29VWgIFDkiQt0UAXjb6p7UIk\nSdLYNaDAkeRt/bU7pSJJkgZioFMqh3RtrwBMofO9KgYOSZK0REOaUmk+afSjrVQkSZLGnCG91bX5\nXpWthrkWSZI0Rg1lDcc44NUMMaxIkqRlz1DWcMwHfgn8xfCXI0mSxqKBBo7pVfWH7oYkE1qoR5Ik\njUEDnRa5uJ+2S4axDkmSNIY97whHkvHA8sC4JCvS+aZYgNWAlVquTZIkjRFLGuH4JJ3vTtkSeKTZ\nfhj4OXBqu6VJkqSx4nkDR1UdXlXjgG9U1biu2+pV9U9LqUZJkjTKDWgNR1Ud0HYhkiRp7BpQ4Eiy\nVZLLkzya5OkFt7aLkyRJY8NA3xb7DeBTwFeAXYC/B+a1VZQkSRpbBvq22BWq6iJgXFXdXVWfAnZt\nsS5JkjSGDDRwzG9+3p9k6yRrAhu1VJMkSRpjBjql8t0mZHwB+F9gOeAzrVUlSZLGlIF+Pf1Xms3z\nk6xBZ4rFNRySJGlABvoulSTZL8kXq+opYM0kO7RcmyRJGiMGuobjK8BOwLTm/jzgX1upSJIkjTkD\nXcPxJuBVwDUAVfX7JCu0VtUo
Nvmw83pdwqgyc6p9NhRjpd/mHrlbv+2TJ09m1VVXZbnllmP8+PHM\nnj2bT3/60/T19TFu3DjWXnttTjrpJNZdd92lXLGkoRroCMfjVVUL7iQZxx+/yE2Sht3FF1/MnDlz\nmD17NgCHHHII1113HXPmzGH33XfniCOO6HGFkgZjoIHj+iR70VnOMZnOB4Fd1lZRvZBkuV7XIGnx\nJkyYsHD7kUceIfFvHmk0GWjg+CiwIzAJuKI575CWalqiJP+U5KCu+59LcmCSQ5JcleS6JId37T87\nydVJbkzy4a72h5MckeQKYGqSI5Pc1Jz/paX8tCQ1kvDWt76Vbbfdlm9961sL2z/5yU+ywQYbcOqp\npzrCIY0y6Zopee7O5MtV9bFm+y1VdeFSq+x5NKMs36+qbZrpnVuBT9BZ2Po3dKZ7zgGOqqpLk6xR\nVfcnWRG4CvizZh1KAXtU1feat/teDmxWVZVk9ap6cAC1zAA+CzBx4kROPPHEYX++0rLm/vvvZ401\n1uDBBx9kxowZfOhDH+KVr3zlwv1nnnkmTz31FHvuuWcPq5QEMH369KurarslHbekRaNv6tr+IjAi\nAkdVzU3y+ySvAl4K/Ax4NfDWZhtgFWAT4FLgwCTvbNo3aNp/DzwNnNW0PwQ8Dhyf5Dzg3AHWMgOY\nAbDxxhvXQZcPdB2uAGZOnY99Nnhjpd8Wt2i02x133MHyyy/PtGnTFrZNmTKF3Xbb7VltS9LX1zeo\n42WfDZX91r8lTalkMdsjwfHAPsC+wLfp1PeFqprS3DauqhOS7AjsDEytqq3pBJIF77B5vKqeBqiq\n+cBr6ASQ6cD5S/PJSOp45JFHmDdv3sLtCy64gC222IJbb7114THnnHMOm222Wa9KlDQES/oT6cVJ\nNqfzy7x7G4CquqnN4pbgB8ARwIuA99L5vpd/SnJqVT2cZD3gKWA14IGqejTJZsD2/V0sySrASlX1\noySzgNuWyrOQ9Cz33HMP73xnZ0By/vz5vPe972WXXXbh3e9+N7fccgvjxo1jo4024pvf/GaPK5U0\nGEsKHCsBP+q6371dwMuHvaIBqqonk1wMPNiMUlzQBKLLm9XrDwPvozNSsX+S64BbgFmLueSqQF/z\n+SIBPjKUugYyRKw/6uvrs8+GYCz328tf/nKuvfba57SfddZZ/RwtabR43sBRVZOXUh2D1iwW3R74\niwVtVTUTmNnP4bv2d42qWqVr+246UyqSJGmYDfRtsSNKklfQmfK4qKpuXdLxkiSpt0blMvdm7UjP\npnMkSdLgjMoRDkmSNLoYOCRJUusMHJIkqXUGDkmS1DoDhyRJap2BQ5Iktc7AIUmSWmfgkCRJrTNw\nSJKk1hk4JElS6wwckiSpdQYOSZLUOgOHJElqnYFDkiS1zsAhSZJaZ+CQJEmtM3BIkqTWGTgkSVLr\nDBySJKl1Bg5JktQ6A4ckSWqdgUOSJLXOwCFJklpn4JAkSa0zcEiSpNYZOCRJUusMHJIkqXUGDkmS\n1DoDhyRJap2BQ5Iktc7AIUmSWje+1wWMNZMPO6/XJYwqM6faZ0MxmH6be+Rui9339NNPs91227He\neutx7rnnst9++zF79myqik033ZSTTjqJVVZZZbjKlrQMc4RDWobNnDmTzTfffOH9o48+mmuvvZbr\nrruODTfckGOPPbaH1UkaSwwc0jLqrrvu4rzzzuODH/zgwrYJEyYAUFU89thjJOlVeZLGmDEfOJKc\nneTqJDcm+XDTtl+SXyS5JMlxSY5t2tdKclaSq5rb63pbvdSegw8+mKOOOopx4579v4F9992XddZZ\nh5tvvpl/+Id/6FF1ksaaVFWva2hVkjWq6v4kKwJXAX8O/ATYBpgH/Bi4tqoOSPIfwNer6n+TbAj8\nV1VtvtiLd64/A/gswMSJEznxxBNbfDbS8Ljqqqu4+uqr2X///bn++uvp6+vjU5/61ML9Tz/9NM
cd\ndxybbLIJO+20Uw8rlTTSTZ8+/eqq2m5Jxy0Li0YPTPLOZnsD4K+B/6mq+wGSnAFs2uzfGXhF1zDy\nhCSrVtW8xV28qmYAMwA23njjOujyZaFLh8/MqfOxzwZvMP3W36LRWbNmcf3113PQQQfx+OOP89BD\nD3HGGWdwyimnLDxmzTXX5F/+5V845phjhq3uXurr62PatGm9LmNUsc+Gxn7r35ieUkmyI50QMbWq\ntgZ+BtzyPKeMa46d0tzWe76wIY1WX/jCF7jrrruYO3cup59+Om9+85v5zne+w2233QZ01nD88Ic/\nZLPNNutxpZLGijEdOIDVgAeq6tEkmwHbAysBf5ZkYpLxwLu7jr8AOGDBnSRTlmq1Ug9VFXvvvTdb\nbrklW265JXfffTef+cxnel2WpDFirI9lnw/sn+Q6OiMbs4BfA58HrgB+A9wE/KE5/kDga83x44FL\ngf0H84DP95kHeq6+vj77bAiGs9923HFHdtxxRwB+8pOfDMs1JWlRYzpwVNUTwK6LtieZXVXfakY4\nfkBnZIOqug/YY+lWKUnS2DfWp1QWZ0aSOcANwB3A2T2uR5KkMW1Mj3AsTlV9vNc1SJK0LFlWRzgk\nSdJSZOCQJEmtM3BIkqTWGTgkSVLrDBySJKl1Bg5JktQ6A4ckSWqdgUOSJLXOwCFJklpn4JAkSa0z\ncEiSpNYZOCRJUusMHJIkqXUGDkmS1DoDhyRJap2BQ5Iktc7AIUmSWmfgkCRJrTNwSJKk1hk4JElS\n6wwckiSpdQYOSZLUOgOHJElqnYFDkiS1zsAhSZJaZ+CQJEmtM3BIkqTWGTgkSVLrDBySJKl1Bg5J\nktS68b0uYKyZfNh5vS6hdXOP3O05bR/4wAc499xzWXvttbnhhhsAmDFjBscddxxrrbUWAJ///Od5\n29vetlRrlSSNDMvECEeSyUluWMy+I5LsvLRrGmv22Wcfzj///Oe0f+QjH2HOnDnMmTPHsCFJy7Ax\nNcKRZHxVzR/MOVX1mbbqWZa88Y1vZO7cub0uQ5I0QrU6wpHko0luaG4HJzk0yYHNvqOT/LjZ3inJ\nKc32w0k+l+TaJLOSvLRpXyvJWUmuam6va9pnJPlWkguAf0/yyiRXJpmT5LokmzTlLJfkuCQ3Jrkg\nyYrN+ScleU+zPTfJF5vzr0yycZv9syw49thj2WqrrfjABz7AAw880OtyJEk90lrgSLItsC/wWmB7\n4EPAZcAbmkO2A1ZJ8iLg9c0+gJWBWVW1NXBpcx7ATODoqno18G7g+K6H2xaYVlXvBfYHZlbVlOYx\n7mqO2QT3BSkUAAAJdklEQVT4WlW9EniwuUZ/Hqqq1wDHAv86xKcv4G//9m/55S9/yZw5c5g0aRIf\n+9jHel2SJKlH2pxSeT3wg6p6BCDJ94HXANsmWRV4AriGTih4A3Bgc96TwLnN9tXAW5rtnYFXJFlw\n/QnNdQDOqarHmu3LgU8mWR/4flXd2pxzR1XN6bru5MXUfVrXz6OX9CSTzAA+CzBx4kROnDqoGZ1R\nqa+vr9/2e+65h3nz5vW7f6ONNuKUU07pd9/irqfnZ78Nnn02ePbZ0Nhvz9Vm4Eg/bQXMpTPy8VPg\nOuBNwJ8AP2+Oeaqqqtl+uqvGccDUrmDReZBOmHhk4QNU/UeSK4DdgP9K8kHgdjoBZ4GngRUXU3ct\nZrv/g6tmADMANt544zro8jG1LKZf/b1LBWDu3Lkcc8wxTJs2DYC7776bSZMmAXD00UczderUhfsW\n6Ovre06blsx+Gzz7bPDss6Gx3/rX5hqOS4HpSVZKsjLwTjrTJpcCH29+XkZnCmROV8hYnAuAAxbc\nSTKlv4OSvBy4vaqOAc4Bthpk3Xt0/bx8kOcus/bcc0+mTp3KLbfcwvrrr88JJ5zAoYceypZbbslW\nW23FxRdfzNFHL3HASJI0RrX253hVXZPkJODKpun4qvpZkj
WATwKXV9UjSR7nj+s3ns+BwNeSXEen\n7kvphJVF7QG8L8lTwG+BI4AJgyj9xc0IyThgz0Gct0w77bTTntO233779aASSdJI1Or4f1V9BfjK\nIm0XAS/qur/pIvtX6do+Eziz2b6PP44+dB8/Y5H7XwC+sMhh9wNbdB3zpa7tfRY59mtVdfjin9Xz\nW9x0gyRJy7Jl4oO/JElSb439FY6DUFWTe12DJEljkSMckiSpdQYOSZLUOgOHJElqnYFDkiS1zsAh\nSZJaZ+CQJEmtM3BIkqTWGTgkSVLrDBySJKl1Bg5JktQ6A4ckSWqdgUOSJLXOwCFJklpn4JAkSa0z\ncEiSpNYZOCRJUusMHJIkqXUGDkmS1DoDhyRJap2BQ5Iktc7AIUmSWmfgkCRJrTNwSJKk1hk4JElS\n6wwckiSpdQYOSZLUOgOHJElqnYFDkiS1zsAhSZJaZ+CQJEmtM3BIkqTWGTgkSVLrDBySJKl1Bg5J\nktQ6A4ckSWpdqqrXNYwZSR4Bft7rOkaZdYHf9LqIUch+Gzz7bPDss6FZ1vpto6paa0kHGTiGUZKq\nqvS6jtHEPhsa+23w7LPBs8+Gxn7rn1MqkiSpdQYOSZLUOgPH8Dq81wWMQvbZ0Nhvg2efDZ59NjT2\nWz9cwyFJklrnCIckSWqdgUOSJLXOwCFJklpn4JAkSa0zcEiSpNYZOCRJUusMHMMkyS5JbklyW5LD\nel3PSJRkgyQXJ/l5khuTHNS0r5HkwiS3Nj8n9rrWkSbJckl+luTc5v7LklzR9Nl3kyzf6xpHmiSr\nJzkzyc3Na26qr7Xnl+Qjzb/NG5KclmQFX2vPluTbSe5NckNXW7+vq3Qc0/xeuC7JNr2rvPcMHMMg\nyXLA14BdgVcAeyZ5RW+rGpHmAx+rqs2B7YG/b/rpMOCiqtoEuKi5r2c7iGd/MeAXgaObPnsA2K8n\nVY1sM4Hzq2ozYGs6/edrbTGSrAccCGxXVVsAywF/ha+1RZ0E7LJI2+JeV7sCmzS3DwPfWEo1jkgG\njuHxGuC2qrq9qp4ETgem9bimEaeq7q6qa5rteXR+AaxHp69Obg47GZjemwpHpiTrA7sBxzf3A7wZ\nOLM5xD5bRJIJwBuBEwCq6smqehBfa0syHlgxyXhgJeBufK09S1VdCty/SPPiXlfTgH+vjlnA6kkm\nLZ1KRx4Dx/BYD7iz6/5dTZsWI8lk4FXAFcBLq+pu6IQSYO3eVTYi/StwKPBMc39N4MGqmt/c9/X2\nXC8Hfgec2ExFHZ9kZXytLVZV/Rr4EvB/dILGH4Cr8bU2EIt7Xfm7oYuBY3j09zXEfmb8YiRZBTgL\nOLiqHup1PSNZkt2Be6vq6u7mfg719fZs44FtgG9U1auAR3D65Hk16w6mAS8D1gVWpjMlsChfawPn\nv9UuBo7hcRewQdf99YHf9KiWES3Ji+iEjVOr6vtN8z0Lhhmbn/f2qr4R6HXAO5LMpTNV92Y6Ix6r\nN8Pe4OutP3cBd1XVFc39M+kEEF9ri7czcEdV/a6qngK+D+yAr7WBWNzryt8NXQwcw+MqYJNmNffy\ndBZandPjmkacZu3BCcDPq+orXbvOAfZutvcG+pZ2bSNVVf2/qlq/qibTeV39uKr2Ai4G3tMcZp8t\noqp+C9yZ5E+bpp2Am/C19nz+D9g+yUrNv9UFfeZrbckW97o6B3h/826V7YE/LJh6WRb5bbHDJMnb\n6PzluRzw7ar6XI9LGnGSvB64DLieP65H+ASddRzfAzak8z+9v6iqRRdlLfOS7Ah8vKp2T/JyOiMe\nawA/A95XVU/0sr6RJskUOgttlwduB/al80eWr7XFSHI4sAedd5T9DPggnTUHvtYaSU4DdgReAtwD\nfBY4m35eV01wO5bOu1oeBfatqtm9qHskMHBIkqTWOaUiSZJaZ+CQJEmtM3BIkqTWGTgkSVLrDByS\nJKl145d8iCQNv+bDzB
5vbgAXV9VHeleRpDYZOCT10nuq6oYlHza8kowDqvxcAGmpcUpF0oiVZO0k\n/53k+uZ2dNe+/9e0XZvkp02IIMk/JrmhuZ3YfHcPSWYkOSXJ2cC1dD6y+0+T/GeSq5rr7NubZyqN\nfY5wSOqlM5MsmFL5x6r6r0X27wX8qqp2hoVfMEaSvYF3AK+rqoeSrFlVzyTZFfhrOt8BMo/OV4V/\nGvjH5npvBLapqvua7wf5b2Cvqro5yarA7CSXV9XN7T1ladlk4JDUS0uaUpkFfDTJvwD/AywIJLvT\n+SbYhwCq6vdN+87A6Qvak3wLmNl1vR9V1X3N9qbA5sDpnU+gBuDFTZuBQxpmBg5JI1ZVXd58J8pb\n6IxcHAa8nv6/9pumfdF1Gd33H17k2PuqasowlSvpebiGQ9KIleRlwENVdTrwUWDbZq3GD4G/baZB\nSLJmc8qFwF8lWbX54qwP0pk26c8twKNJ/rrr8TZLMqGlpyMt0xzhkDSS7Qh8LMl8On8g7d+s1fh3\nOt9iOqvZNy/JG6vqP5NsBVzenD8b+Of+LlxV85O8HfjXJIfQ+abne4C/bPcpScsmvy1WkiS1zikV\nSZLUOgOHJElqnYFDkiS1zsAhSZJaZ+CQJEmtM3BIkqTWGTgkSVLr/j/4hads5pETUQAAAABJRU5E\nrkJggg==\n",
469 | "text/plain": [
470 | ""
471 | ]
472 | },
473 | "metadata": {},
474 | "output_type": "display_data"
475 | }
476 | ],
477 | "source": [
478 | "#Variable Importance Plot\n",
479 | "plt.style.use('seaborn-notebook')\n",
480 | "xgb.plot_importance(model_xgboost)"
481 | ]
482 | },
483 | {
484 | "cell_type": "code",
485 | "execution_count": 21,
486 | "metadata": {
487 | "scrolled": true
488 | },
489 | "outputs": [
490 | {
491 | "data": {
492 | "text/plain": [
493 | ""
494 | ]
495 | },
496 | "execution_count": 21,
497 | "metadata": {},
498 | "output_type": "execute_result"
499 | },
500 | {
501 | "data": {
502 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAe0AAAELCAYAAAD0qd5kAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xd4FNX6B/DvphECIRCIcAOhhSIgoKAIiohIR0FALgZR\nAUGU+NOgKMWCKFVBQJBy6eK9AQSkd0OPSA9NOqFITyC9bfL+/tjssJvsZktm9szsvp/nyZPd2Zlz\n3p2dnXdn5sw5OiICY4wxxtTPS3QAjDHGGLMPJ23GGGNMIzhpM8YYYxrBSZsxxhjTCE7ajDHGmEZw\n0maMMcY0gpM2Y4wxphGctBljjDGN4KTNGGOMaYSP6ACs4G7aGGOqdPXqVYSHh6NmzZro168fSpUq\nhX379mHlypUYM2YMvv76a9EhMu3S2ZxBpd2YqjIoxphnIiJ4eXnB3v3lF198gZSUFPz0008KR8bc\nDCdtxhgrDn9/f2RmZjq17IkTJ9CoUSOZI2JujJM2Y4w5q0yZMkhOTi5WGffv30eFChVkioi5OZtJ\nmxuiMcaYBV26dCl2wgYgJewhQ4YUuyzG+EibMcYKOHPmDOrXry97udevX0dYWJjs5TK3wafHGWPM\nEX5+fsjOzhYdBvNMfHqcMcYcoXTC7tChg6LlM/fGR9qMMZZPp9PZfVsXYwrgI23GGLOXqxL2mjVr\nXFIPcz98pM0YYwAePnyIsmXLuqSu+vXr48yZMy6pi2kKH2kzxpg9unXr5rK6tm/f7rK6mHvhpM0Y\nYwCOHz8ue5kHDx60OL1y5cqy18U8AydtxhgD8Pbbb1t9bevWrQCAO3fuAAB+//13AMCoUaMAADt2\n7EBCQoI0v/E0+/37962WGRMTU7yAmUfipM0YYwACAgIsTg8NDcWWLVsAABUrVgQALFu2DFOnTsX4\n8eMBAG3btkX58uVRrlw5AIbr4wDw8ssvW60vIyNDttiZ5+CkzRhjAP7++2+L0318fPDpp59Kz3U6\nHfbu3Yt+/foBMAwK0qJFC4wePRoPHjxAeno6Jk2ahM6dO2PevHlW63v88cdljZ95Bm49zhhjsH6P\n9gsvvICxY8fixRdfFBAV8zDcepwxxuyxZMkSi9P37t0re8IeM2aMrOUxz8FH2owx5mLe3t7Izc0V\nHQZTHz7SZowxe7mqRTcnbOYsTtqMMZbPWmM0Of3f//2f4nUw98WnxxljzMSqVavQs2dP0WEwz8Sn\nxxljzBFBQUGKlV23bl3FymaegY+0GWOsgNKlSyM1NVXWMuPi4tC4cWNZy2Ruh4+0GWPMUampqXj6\n6cuylVe1YkVO2EwWfKTNGGMF5OUBXl6GrkZ9fHzg6+vrdFmhoaG4efMm0Lw5cOCAjFEyN8RH2owx\n5oghQwwJGwBKliwJHx8fzJ0716myXn31VUPCBjhhM1nwkTZjjOUbPhyYNMn66zqdDtnZ2UUeeXfo\n0AFjxoxB8+bNFYiQuTmbR9qctBljDMDYscCXXzq2zPnz55GamoomTZrYv9DDh0D+0J2MFcBJmzHG\nVGfPHqBVK9FRMPXha9qMMVaUIkbPVE6rVsDkyQIqZlrHR9qMMY8VHQ1ERAgM4KWXgJ07BQbAVIZP\njzPGWEFEwPr1QNeuoiMBEBwMJCaKjoKpAydtxhgzlZsLeHuLjqIALy/DzeHM0/E1bcYYMxUbKzoC\nC/LyAAX7PGfug5M2Y8xjHD4MvPCCY8vMmjWr0LQff/xRpohMJCUBLVvKXy5zK5y0GWNuLyUFOH0a\nOHVqMQ4ePAgAGDduHABgUoHeVJYtW4Y1a9bgzp07AAB/f38QEWbMmAEiwvbt29G3b18AQHp6OgBD\nX+ULFy4sfqD79gHjx0tP9Xo9li9fnv8eUnD9+vXi18E0jZM2Y8ytPXgABAYCDRoARITRo0dDp9Ph\niy++APAoeRtFREQgIyMDFStWxLlz5z
Bq1CjExcVh7dq1iIuLQ6NGjRAVFQUACAgIwM8//4zAwEB0\nlatV26hRhlMCAHx8fNC7d28AwPHjx6HX6+Wpg2kWJ23GmFtLTn70uH///ti8eTNmzZqFXbt25b9u\nmKFz587SfGlpaQCArKwsJCUlITIyEtu3b0dkZCQaNmyIrKwsad6MjAw0b94cDx48kC/omjWBxESz\n4UG7dOmCzZs3y1cH0yRuPc4Yc1u3bwOVKj16HhUVhWnTplmcNyMjAyVLlnSqnrlz5+Lll19GrVq1\nnFqesXx8yxdjzPNcvmxojF2+vOhIGHMI3/LFGPMsf/9tOLvsNgn74UPRETAV4SNtxphbSUsDSpUS\nHYXMypbl5O0Z+EibMeY5srLcMGEDhoTNA4wwcNJmjLmBmBhArwdKlBAdiYKGDTMMMMI8Gp8eZ4xp\n2oYNwCuviI7ChXiAEXfGp8cZY+q0dOlSVKxY0WIvX2+++SYCAgKKXN44vkaXLkpEp2KJicAPPxge\n8yAjHoePtBljLhUWFuZQd5zR0dFo0aIFqlevLk3z9zdcv87LA3Q2j03clJeXYYxRC/vwunXrIjw8\nHJs2bSr0WuvWrdGpUycMHz7cFVEyx/B92owxdWjfvj22bdvm9PKrV69Gjx49AJgnanXuwhRWuTJw\n86bhsckKqF69OuLj4+0uJjg4GIl8ql1N+PQ4Y0y8pUuXFithA5ASdtmyhG++sXqQ6Rn++cfw5iMi\nAJ0OJfJb4DmSsAFICXvdunVyR8gUwkfajDFFlSxZEhkZGbKVd/nyZdSsWVO28rTu6aefxuH8AUaK\no0ePHli9erUMEbFi4CNtxpg4devWlTVhA0DNmjUxePBgWcvUqh49esiSsAHD5YdRo0bJUhZTDh9p\nM8Y0qV27dti+fbvoMIQhIug8thWe2+IjbcaYGEofDW/fvh23bt1StA41W7ZsmSLlfvXVV4qUy+TB\nR9qMMdlt2rTJbHxqpQQFBSEpKUnxetSmbdu22LFjh+gwmPz4SJsx5nqrVq1yST2emLABYOXKlYqW\nz6fd1YuPtBljmnb9+nWEhYWJDsOt8PVyYfhImzHm3t58803RIbjU8uXLFa9Dp9Ph5MmTitfDHMdJ\nmzEmq7i4OKeWi4qKsvpaUUd9f/75p1P1aVX//v2LtXzDhg2xceNGACiyf/d+/foVqx6mDE7ajDFZ\nFXUbVlZWFjp27AhfX1/odDoYL88NGjQIR44cAQD89ttv0vyBgYE262vXrl0xI9YWa/e9T5s2DTVq\n1AAA7N69Gx07dsSyZctw5swZAMD8+fNx//59nDx5El3yR1kp6h76o0ePyhw5k4OP6AAYY+7Fx8f6\nbuXLL7/Eli1bpISdkZGBkiVLYt68eTh37hw+/fRTTJkyBQBQvnx5pKSk2KxPr9fLFruWJSQkoEWL\nFgCAW7duYcuWLdJrY8eOhbe3NypUqAAAqFy5Mv755x8hcbLi4aTNGJPVM888Y/W1yZMno169etLz\nRo0aoWrVqjhw4AC+/fZbTJkyRUroCQkJyMzMhL+/f5H1/fXXX7LFrgXWLhXcvHkTPXv2hF6vR0RE\nBNLT0zF48GBkZWXhyy+/BADUqFEDiYmJdrW6r1WrlqxxM3lw63HGmMt4eXkhT+YxoOvXry+dAvYE\nM2bMwP/93/9Zff3UqVPo1KmTQ8OfWhITE4M2bdoUqwzmMB6akzHm3rKzs+Hn5yc6DLeSkJCA8uXL\niw7DE/EtX4wx12vatKnL6vLEhL17925Fy69ataqi5TPncdJmjMmuZ8+eLqnn6aefdkk9atO7d29F\ny09LS1O0fOY8Pj3OGJOdry9QtWodXLp0XnQobkup8a8bNmzIHauIw9e0GWOu4+MDuOoOrNKlSyM1\nNdU1lXkSnQ5QZ17wBHxNmzGmrLg4YPZsw+OCCfu7775TpM5KlSpxwgawf/9+Wcu7cuWKIWHv3m34\nY6
rDR9qMMad5eQG27uBq1aoV9uzZ45qAPJCPj48sHcwEBgYW7sxmzx6gVatil83sxkfajDF5XbgA\n/PCD4bE9t1zv2bMHDRo0kKVuProuTK/X45133nE6cScnJyM3N9dy73OtWhkaKDDV4CNtxphdFi8G\n6tUDnn3WueUPHjyIZs2aOV1/gwYNcPr0aaeX9wSlSpVyqOX3Y489hrt379qe8e23gSVLDNe7mZL4\nSJsxVjzG26D79XM+YQOQEnbp0qUdWq569eoAwAnbDsaEvX79euh0OkyePFk6As/Ozsa4cePg5eWF\n3NxcALAvYQPAL78YEvbPPysSN7MfH2kzxixatQoICgLatlWm/MaNG6Nv37747LPPCr12/PhxtG3b\nFtu2bUOTJk2UCcCDREbKmG///ttwyoUpgW/5Yow5plQpQEjfGvfuASEhAip2f7ImbQDo3x9YtEjG\nAlk+Pj3OGLOP8XIld4bFbFq0CGjdWnQUHomTNmMeLD4eGDfO8FidJ92Yau3aBUyaJDoKj8PjaTPm\noYwdX33xhehImGYNH25oqZidLToSj8FH2ox5kDt3gBEjDI/5yJrJIjsb8PcXHYXH4KTNmAdYvx7Y\ntAmoWBGYOFF0NMztZGYCCxeKjsIjcNJmzI2VKWP4/+qrQOfOYmNhbm7AAKB7d9FRuD1O2oy5oV27\ngOXLgeTk4pfVvHnz4hfC3MbmzZutv/j77/zrUGGctBlzIxUqGP63bg0888xl1KtXD4MGDUKvXr3w\n8OFD9OnTx2z+H3/8Ea+//joiIiKkXrLmz59v9j8+Ph46nQ5RUVEAgL179xarO9Ki9O3bF6Ghofjk\nk09w7NgxRerwRGfP/gIA6N+/P9577z1ERkYWuT2Y9ppm7O/duD1cvXrVbHvIyspCx44dHxWyaRNg\n0nvdtGnTMGHCBP5M5UJEavxjjDkAFr41lStXJiKicePGmcxnPuPZs2cLTUtOTqaEhATpf8WKFUmn\n00mv16lTR8bITdy9SwMHDiQAFBgYSERE58+fV6YuDzNkyKPPPjs7W5pubXtYtmyZNG3btm1E9Gi7\nmD17ttn2MGzYMMuVLl5MREQ3btygChUq8GdqH5v5kXtEY0yj7t8Hxo8HfvzR9rzp6enw8/ODj4/h\nLs9169aha9euFufNzs6Gn5+f9L+gixcvYvTo0fjvf/9brPgLKdAj2uHDh/H000/LW4eHiowEunff\ngbYmfdLq9XppeyjK3bt38dhjj1ndHo4cOYKePXsiPj7eZln8mdrE3Zgy5o7sGcf6t99+Q69evWSv\n29/fH8nJyRZ34MXC3ZgqpnbtN3DkyH9QxtgyUUZeXl7Is7YxPvkkcPy47HW6Me7GlDF3kZICDBpk\neGzPONZKJGwAyMzMlD9hM0W1b79MkYQNwHrCBgwJW6H2D56KkzZjKte1K7B0KRAYCMybJzoaxhx0\n8CD35CMjPj3OmEpVqgTcvi06Chfi0+OK8PIy5ExfX4G9jV68CNSqJahyTeHT44xpTZ8+wOzZHpaw\nmWKMg3FFRgoMolatR9d2WLHwkTZjKlGjBnDliugoBOIjbcUYB4cRbvVqoEcP0VGoGR9pM6Z2xrOG\nHp2wmSLi4uLwww8/IDT0GiZPnoxTp06JDYgTdrHxkTZzCxcuXMDUqVMRGxuL7OxshIWFISIiAv36\n9RMdmkXp6cAHHwBLloiOREX4SLtYUlJSEBQUhIyMDJQoUcLm/KmpqQgMDERWVpZr7wYoWxZ4+NB1\n9WkL36fN3FeLFi3QunVrTJgwocj5VqxYgTfeeAPXrl1DlSpVXBSddVWqADduiI5ChThpO+Xvv//G\nggULMHnyZKfLKF26NM6dO4fKlSvLGBlzAidt5l4iIyMxcuTIYiXfChUq4P79+zJGZVt2NhARAaxa\n5dJqtYWTtkOysrKwevVqREREyFbm1KlTERUVBZ3OZu4onrffBn75
Rdk6tImTNnMPJ0+eROXKlREc\nHCxbmTqdDkpv//37AzNnAqVKKVqNe+Ckbbdu3bph7dq1ipVfqlQppKWlKVY+AMO1oXfeUbYO7eGG\naEz7/P390bBhQ1kTNmAYLOfxxx+XtUyjvDygfXtg0SJO2Hbx8gIee8zQzJkVqVevXoombABIS0tD\nixYtFK2DOYeTNlO1qKgoZGZmKlb+2bNnkZKSgt9++02W8v7v/4AHDww5aNs2WYr0DHfvio5AEy5e\nvCjbtmrLn3/+ib179ypXwTvvADL/EPcEfHqcqdahQ4fwzDPPuKy+wYMHY+7cuU4ty+MiyMDXF8jJ\nER2FKqWnp+P+/fuoWrWqy+s+cOAAmjdvrlwFej1gx2hjHoJPjzNtWrVqlUsTNgDMnTsXGzdudGiZ\nihUN/zlhF8+GDRuwYsAALFiwoOgBKDxUQECAkIQNAM2bN8cbb7yhXAV836ND+Eibqc5zzz2H2NhY\nIXXfv38fFSpUsDnf888D+/e7ICA31qJFCwwYMACDLHRvef36dTzzzDM4dOgQwsLCBESnHiVKlEBW\nVpboMJRtuEnE7RkMuPU405ZmzZrh4MGDQmP417/+hVu3bknPw8KAs2cNDcq4X4jia968OQ4cOGD3\n/KdOnUJwcDBCQ0MVjEqd4uLi0LhxY9FhSOz9Ueuwr74CvvtO/nK1h0+PM20RnbAB4NatW/D19QUA\nVKhg6AglMNDwGids582ZMwcAHErYAPDEE08gNDQU73jg7UGiTolbc1upUWy++84whOc33yhTvhvh\nI22mGuXKlcODBw9EhyEJDARSUw2PvbyA3Fyx8WiZXB3aXL16FdWqVZMhIvUbNGgQ5qlwAPWpU6di\n6NCh8hZavz7w99+Gx+rMSa7CR9pMO9SUsAHgiSdagMiwD+GE7bxp06bJ1gNdtWrVULp0aVnKUju1\nXst39EyJXc6ckb9MN8VH2kwV+vXrh8WLF4sOw0xGRgZKliwpOgxmwccff4zp06eLDkMxQ4cOxdSp\nU0WH4XpeXsCdO57cM57NI22+OY6pQrt27USHUEjJkiVBRMr3w+zGOnbsiC1btshe7vTp05GcnIwy\nZcrIXrYayN37n9zatm2LHTt2WHwtMjISTzzxhHMF//wzvKOjkZvfpkROH3zwgexlisBJmwn3559/\n4s033xQdhkW+vr7Q6/Wiw9AsJRK20TfffIMff/xRsfJF+uqrr0SHUCRbfSi4S4JUI76mzYTr0qWL\n6BCsevfdd0WHoFlKt3x214StBbaGw2XK4aTNhOvYsaPoEKxytltTBsyfP1/xOqydotWyGTNmiA7B\nLjExMaJD8EjcEI0Jd/v2bVSqVEl0GEyDXDG8qqsFBQUhKSlJdBg2Weu5MDIyEj///LOAiNwC3/LF\n1E/phG3rdqP93B+p7KZMmVKs5U+cOCF1b1rUuM4dOnQoVj1qlJyc7JJ6+vTpAwC4du0aAPP2BydO\nnLC5/J9//mlXPfaecTl27FiheStUqIBr165Jt4OG5LcqHzp0KMqXLw/A0C87AAwfPtxsWXc9E8BJ\nm6laXFyclAByc3PRvXt3DBo0CBEREYiKigJg+LJHRkZCp9OhR48eyM2/qfrDDz/EuHHjEBISghs3\nbuCVV17BqlWrAABvvfUWAMOY2s8//7yAd+be/jZ2lGFBfHw8AGDv3r1o1qwZBg0aJO2Mz549iy1b\ntqBRo0ZSxyKlihiQvEGDBvIFrRJeXkXvlo1nFnQ6HVq1aoVhw4YBABo1aoTAwEA88cQTSElJAQD0\n7dsXoaGhaNmyJY7nj2oTmN+9X/Xq1QE8anvQqVMnqXw5x5nX6XR47733EBkZiWnTpqFGjRo4fPgw\nDh8+jH379mHfvn24fv06nnrqKQwcOBDAo54RGzZsaNY24s0338TRo0cxdepUjB8/HgDQtWtX5OTk\nmDUYDQ8PR5s2bWR7D6pCRGr8
Y4yIiAybKFGfPn0IAE2cONHmvHXq1JGmBQYGStON87Rt25ZiY2Ol\naRUqVJA7bI8XFRVlcx7j5zRu3DhpWrt27czm+eOPP6hy5cpUuXJli2UMHDiwGFGqk+n2akl6ejoR\nEYWFhZlNnzRpEuXm5tLrr78uTRs4cCABoNdee42IiIKDg6XXRo4cabZ8qVKlpMdZWVlOxzlkyBCz\n5/Pnz6fs7GwiIvryyy8pIiJCem38+PE0fvx42r9/v9kymZmZRPRoG0lMTKRbt24REZGvr6/VWC5f\nvixN+/zzz22+BxWymR/5mjZTvRs3bqBKlSpWXy/YtWVSUhKCgoIKzZeeni6dSgOA7Oxs6PV6s2lM\nHqdPn7Z5FHzx4kWMHj0a//3vf52ux8fHx+1uyRN5nT4nJ0fqd9+W0NBQ3Lx5s9B0ua5p2zs4SXZ2\nNvz8/HD58mXUrFkTgO19horxNW2mfrNnzy7ydVtfvoJ9UVtK2NnZ2cjOzjab5ufnxwlbIfactq5W\nrVqxEjYAvP3228VaXo1EDoxib8IGIH//4wXYO5qYn58fAEgJG7C9z9AyPtJmwoWEhODevXuiw7Do\n7t27eOyxx0SHoUmvvPIKNmzYIDoMzTl79qys15SVotfr4eNTuH8ubj1eLHykzdSvV69eokOwqk6d\nOqJD0Kx+/fopWn5OTo6i5YuihYQdFxdnMWEz5XHSZsLNmjVLdAhWGceAZo57/fXXkZWVpVj57ni7\nl1FiYqLoEIr05JNPig7BY/HpcaYK7thJBgPWrl2Lbt26yV5uqVKlirx/W+vU3sAuLy/P5q1pzCl8\nepxpg9rG0gaAjz76SHQImtetWzcsW7ZM9nLdOWEDkO6zVitO2OLwkTZTjX379qFly5aiw5Bo+LYR\n1cnNzYW3t7csZW3dutWtT40bqfXsk9rPAmgcH2kz7VDT2MgjRozghC0jb29vNGnSpNjlbNq0ySMS\nNgBVJmwAnLAF4yNtpipNmzbFkSNHhMaQmJiI4OBgoTG4q8aNGyMuLs7h5YKDgw2Ns3Q6QJ37LEX4\n+/sjMzNTdBiSsLAwXL9+XXQY7oyPtJm2HD58WHQIGD16tOgQ3FZcXBxiY2OxevVqu5dp3Ljxo9bU\nRICb3upliZoSNgBO2CrAR9pMdWrVWomLF18XUnfJkiWRkZEhpG5PNHHiRIwcuRfDhtXH448/jsTE\nRMyZMwfVqlXD1q1bLffQlZcHeFBDqCVLlgjtJc0oNjYWzz33nOgw3J3NI21O2kxVrl8HwsLENEp7\n/vnneZhOAf7zH+C99xxcaPBgYO5cReJRI9FjbHPPgC7Dp8eZNixZYvgfFmb437JlS7vH65VDeHg4\nJ2xBHE7YgCFhT5okeyxqlZSUhKlTpwqp+3bz5pywVYSPtJlwlSoBt29bfu2tt97C0qVLFa3fy8sL\neXl5itbBFPLFF8C4caKjcBlXN0yrUKEC7t+/D6SlAUWMa85kw0faTN2eftp6wgYgJeyqVavKXjcR\nYc+ePZywtcyDEjZgaJj21VdfKV7PP//8g4ULFxoSNmBI2EOGKF4vs42PtJkwd+8Cjpx1u3fvHn7/\n/Xe859T51EeSkpJQo0YN1ffv7Ak++ACwMTKrbVevAgWGZ/UESp0hKrJTl9WrgR49ZK+TSbghGlMf\nPz+gwNDWDgsKCsKpU6cQZrwIboennnoK0dHRmhhFyVPIdtt1+fJAQoIMBWnPk08+iePHjxe7nOrV\nqyM+Pt72jLdvG65pMSXw6XGmLsHBxU/YgOFoOSwsDCtXroROp0OXLl0QExODrKwsEBGuXbuGCRMm\nwMvLC1988QUA4NixY5ywVWb4cJkKSkgAmjaVqTBtOX78OJYsWeJ0D37BwcHIzc21L2EDgPGUOROC\nj7SZy7RqBezZIzoK5tZycwGZ+jjXsldffRUbNmxA79690b59ewQHByMhIQGbN2/GqlWrMGbMGH
z9\n9dfOVzByJDBhgnwBMyM+Pc7Emz0biIgAypYVHQlze/fuASEhoqNQnchI4OefZS70n3+AypVlLtTj\n8elxJta2bYbGRpywmUuEhADca5drVK5saJTAXIqTNlNMYCDQvr3oKJhaTZ6sUMGxsQoVzAohAlq3\nFh2FR+GkzRTRqROQkiI6CqZmn32mYOHceMJ1du0CTp8WHYXH4KTNZGXsWXLzZrFxMPX79lsFC2/V\nSuEKmJkGDTyqL3iRuCEakw33u8AcQeSCS6LnzgF16ypcifop0hCtoORkoEwZhStxe9wQjbmGvz8n\nbGa/O3dc1IYpPNwFlTAAhoTdoYPoKNweJ21WbKmpgAvHMGBu4JVXnFvO0ljn94vq7MPHx6PG3hZu\n61agQgXRUbg13pqZ07780vB/5871ZtM7duyI1q1bIzo6utAyq1evhr+/v9m0gpdo2rRpg/Xrzct0\nhK7AIVyJEiWcLospIynJ0G3mkCFDEB4ejoEDB1qcT6fTYeTIkQCAvLw81KhRQ9q2QkJCMG7cOISE\nhCAzMxN6vR4bNmzA4sWLUa5cuUeF5OUB+cneOM/o0aMVf49qo9frceLECaSlpUGn02HmzJm4cuWK\n2TzGHga/NH65TezatQsAMGfOnELfMbPv8P37wPXrAAyf3/r16/HLL78AMIzax4qHkzZzypIlwNix\nhsevvvoq2rRpI722ZcsW7Nq1C9euXSu0XOPGjZGVlSU9j4yMhE6nQ2RkpDQtPDwcDRs2BAD89ttv\nDsdW5A6FqcKECcDnn3+O7777Dnfu3MH8+fMBAHUtXH9u3LgxAODixYsAIG1b9+7dw6xZswAYhqz0\n8fHB1atX8e233+LIkSPmhbz4IgBI85TywGEmfX190ahRI/j5+YGI8OGHH6JmzZpm8xi/d8b/ALB9\n+3YAQNOmTaVBdky/Y5999lmh7xy2bJHma9iwIUaMGAEA+PXXX+V9U56IiNT4x1Sse/fC09q1a0dE\nROnp6WbTMzMziyzr6tWrZv8L2rt3L02YMMGJKA0ePnzo9LLM9Rz5vHJycixOv3LlCi1YsKDwCzDf\ntVy/ft2h2LRsyBCi7du3m02ztv4KunPnDhERZWVlWXz98OHDVK1atcIvTJxo9vTQoUN21efhbOZH\nbj3O7JaXZxjso8DZbej1evj4+IgJijFmk5Ktx7Ozs+Hn52f5RQ8efc1J3HqcyeOjjwzteQombACc\nsJlDLl8WWLmxIwEmG6sJGzAkbJPLYaz4OGkzm/z8gJ9+Eh0Fcxf57crEGD5cnrFhmf0iIkRH4FY4\nabMivfN/uPhAAAAgAElEQVQO7+OYfHQ6YM0aoHp1gUFs2CCwcg/EvS7JipM2s0ivN/wtWSI6Egfp\ndI/+mCplZwPx8QID6NEDGDZMYAAeaPVq0RG4DU7arBAvL0OfFJq+VJ2XJzoCZsGiRYC3t+goYBhi\nTK8XHYVnGT5cdARugZM2M+PtrfF8Z2wUw0faQk2fPh09e/ZEeHg4atasiddeew2TJ09Gv35ATo7o\n6PKFhoqOwLNMmgSULSs6Cs3jpM0kH34I5OaKjqKYsrKAKVNER+FRjh49ikqVKiHB5Naejz/+GKtW\nrcKlS5dw+fJlrFmzBsPyT0nrdEBaWhrCw8Oxc+dOUWEDAQH8487VHj40DOBiocc1Zh9O2gwDBxqO\nrmfOFB2J47744gt4eXlhyJAhOHv2rGHiJ58gJycH//vf/1C5cmV07twZ6enpYgN1Q8HBwQCAJk2a\n4Pbt2yhfvrzdy5YqVQqXLl3CSy+9BAAoWbKkIjEWqXVrw3+T3viMBg8eDJ1Oh88++wyXTe5RS0tL\nw7x58xAYGIjatWtzb3uO8vYGzp8Hvv/e4ss5OTkICwtDq1atEBMTg0yTQQ0OHDiAqKgo6HQ6rFy5\n0lURqw53ruLhfH1VdLrSTjqdDleuXEF1J5ogV61aFRcvXi
z63lJWJC8vL+QpdA2lRIkSZt3cKu6v\nv4DmzQEihISE4MqVKyhdurTDxQQEBGDXrl1o1qyZAkEWn0uG5rTXyy8DMTGGsVlh6PZ4w4YNmOnE\nUcO9e/fQrFmzQn2oaxh3rsKs0+m0lbAHDBiAnJwcEJFTCRsArl27Bj8/P/zrX/+SNzgP0aFDB8US\nNgBkZWXh3XffVaz8Qp59FrFBQQAMCcCZhA0A6enpaNasGf8YtMcffwB16iAjIwNlypRBx44dnUrY\nAKQfWhs3bsTRo0dlDlSdOGl7oOxsYNQo6Yeu6n3++ec4d+4cFi5cCF9fX1nKvHXrFgAgMDBQlvLc\n3csvvwwA2Lp1q+J1LViwAADQoEEDReu5cOEC0tLS8NzDh7KVmZ3fqYHSsWtdpaQklCxZEsnJybKU\n16VLFzRp0gR37tzBvXv3ZClTrfj0uIfp3RtYvlx0FParXbs2Lly4oGgdW7ZsQe3atREeHq5oPVrl\n4+MDvaDbo5Q6FR8REWFx6Fg56fV65OXlqeLoWy2nxxMTE6W2EEoqXbo0UlNTFa9HAXx6nD3i76+t\nhB0bG6t4wgYM43+Hh4ejd+/eitelNd7e3sISNmAYQ9tb5hu7Hzx4oHjCBgw/dvz8/LB06VLF69IC\nLy8vlyRsAEhNTcXjjz/ukrpcjZO2h9DpAJOGmKrXoEEDPPfccy6tc/ny5WatVT3dzZs3kauCewBz\nc3MLj4/tpPfffx/lypWTpSx7vfXWWy6tT40GDx6saFsIS86ePYv27du7tE5X4KTt5tLTDdeu1XkV\nxLK//voLp0+fFlK3v6VhzDzQgQMHEKqizkeaNm1a7CPWpk2bYs6cOTJF5Bgiwrlz54TULdqOHTsw\nd+5cIXVv27ZNVduxHDhpq8yMGTOg0+kwYsQIxMXF4cSJE1iwYAECAgIwYcIEh8rq0kV7/UfUq1cP\nzz77rNAY3Oj2Eac1b95cdAiFFOeIdejQobIdrTtDp9Ohbt26wuoXqW3btkLrv3nzJqpWrSo0BlkR\nkRr/PE5gYKDd844fP54SExOLnKdUqeJG5HqRkZGiQ5AcPXpUdAjCBAUFiQ7Bqnr16jm8TPXq1RWI\nxDlff/21kHqHDBFSLS1fvlxMxdplMz9y63HBlixZgooVK6Jjx44OLztmzBiMHj1aej5gALBuHRAV\npc1eAuPj452+/1oJmzZtQufOnUWHwdxMSkqKy281FNF6/MKFC6hdu7ZrKy1C9erVES90eDm72Dwv\nyklbID8/P+m+TmfpdDqpK0XjafDjx4HGjYsbnWu9/vrrquuaMCkpCUH5HW94CtPtSa0c6TWtbNmy\neCjjfdhyWLVqFXr27Omy+kwvj7nyoz1z5gzq16/vugrt8MQTT+DUqVOiwygKJ221kvPeV29vb3h5\n5UKv11aDM1NEBJ0KL75PmzYNUVFRosNgbubq1auoVq2aS+rq3RtYsQL47jvXnYHTwo8/leL7tNVo\n7Nixst77mpubi+DgippN2DqdTpUJGwDCwsJEh+Ay3377regQ7PbGG2/YnGfQoEEuiMQ5w104trSx\nbwZXXjJTc8KeNm2a6BCKhY+0Bbh06RL3vmXCVb0ksaJVqVIFN27cEB2GXdq1a4ft27cXOU+nTp2w\nefNmF0Wkbq4cGCg5ORllypRxTWVOqFatGq5evSo6DGv4SFttnn76acUStlY7cVB7wp4xY4boEFxC\nKwkbgM2EnZeXp/qEXdTZJbnPPMmdsIvq31vt7UBUnLDtwknbxQ4fPqxY2VrsLlHNpzCNxowZIzoE\nZkG7du2svubI2N6i/PHHH6JDUIRxMB6mDE7aLvTSSy8pXkelSpUUr0NOyzXQGbonDPm3ceNG0SE4\n7MyZM1Zf6969uwsjcU6bNm1Eh6AILeyDPv74Y9EhOI2Ttgtdu3ZN8TqaNWumeB1y+uyzz0SHYJNb\n9aZkhahuJotjyJAhVl
9zZUMvpj1avuTFSduFijsWsemvQ2sDW6xdu7ZYdbiaq37xTp8+vdA0R0Z6\nun79upzhqM769etdXufixYsLTXPkMxk8eLDV11zVZej8+fMBAN9//7007eDBg9JjOX4M5RRxQXrs\n2LFmzwt2wTt79mwAQFZWFmJjY6XpGzZsAPBoP2KMPzo6WvoMjMsap6tJdHQ01q1bh0OHDkmxmca+\nbNmyIpevXLmy4jEqxp5u0wT8eZRz584REVFKSgqdPHmSiIjGjh0rvX7hwoVCy9jqxtRdTJgwgYiI\n0tLSSK/X0/z584mIaNq0aURENG/ePJo6dSoRkbTu+vTpIy3v4+NDderUkZ7PmDGDiIhGjRpFixYt\nIiKikSNH2oxj2bJlxXwn6mbYFVi3adMmunHjBqWnp9PPP/9Mt2/fpqSkJCIi2rJlC61bt46IDOtp\n2rRp9Msvv1BmZibNmTOHiAzbsPFzIiLq16+f9DgrK4uIDNt07969ici+z6Q4jNtVbm4uERH9+eef\nREQ0evRoIiI6depUoe2qTJky0vI+Pj6Ul5cnPU9ISKBGjRoREdHGjRuJiKRttSimZZiy9nncv3+f\n0tPT6caNG7Rr1y5p/ZqqX78+ERHdu3dPeh4fHy+9bnzPRIZ1/tFHH5nVOWvWLPLy8pKWBVAozrt3\n71qMz7iurDHu686cOUNTpkyRYklOTqZr167RunXrKDk5mYiIduzYQWPGjKGYmBhp+8rJySlUpun7\nadeuHZUtW7bIGIhI2s5UyGZ+FJ2cOWkT0YMHD6h27doUEBBApfI7DR+S31nwc889J81X8Mvm7nJz\nc2n69OlERHT27FkCQEuXLqWnnnqKiAw7mSpVqlCTJk3Mllu4cCEdO3aMtm7dSkRE4eHhtGrVqkLl\np6WlEZF9CWLevHnFei9q5+PjY3MeALRgwQKz582aNTObp0WLFgSA+vbtK03T6XSFyvruu++IiEiv\n10tJ25SSSdt0uyIi6tu3L4WHh9PatWuJiOiTTz6h+fPnS4nCqGrVqmbbFRHRiy++SEREc+fOpW++\n+YaIDD8ajcva+jFk/OFTkKXlunbtSnXr1qUzZ86YvZ6eni79ERF16NCBiIhiYmKIiKhEiRKFyjJ+\n3omJidSqVSuzOlu1aiU9Ni5bpUoVs+WtJe3Y2FiL042M+7qwsDBp2rvvvkubNm2iJUuWmM07cOBA\nunjxovTc2vgMoaGhZs99fX3N9pWWDBw4sMjXBbKZH/n0uAr06NEDCQkJaNSokdTyslevXgCA/fv3\no1u3bsjIyICX16OPyxPGff73v/+NixcvAoDUbWV2drZ0CmzhwoW4ceMGEhISpGWICLGxsXjyySfR\nvn17dOzYERcvXsTIkSPNyo6KipJOJT548MBmLK7qvUoUW539bNq0CYCh32xTcXFx0jjJixYtwuXL\nlwE8WqdEhLy8PHTr1s1suNWpU6cCMPTmZ5x38uTJUqcc9nwmzjJuV8buTRMTE5GdnY2uXbsCACZM\nmIDFixfjt99+k5YhIty6dctsu9qxYwdef/11AMB7772HyZMno3PnzmjRogVeffVVxMbGol69ekXG\n4sj9zOvWrUPv3r0LdQ1asmRJ6e+3335DQEAAgEcNXzMzM/HEE09I+wwiki5NZGZmYvfu3QAgjYK2\ne/duqY/uzMxMHDt2zO7Tyba+J8Z9XVpamjQtMTERvXv3RmpqKgAgLS0NDx48QHZ2trQdXLp0CcnJ\nyfjxxx8BwGycbOP32DitT58+Nsfu1kAf5FZx5youNGXKFHz66adWX587dy4CAwPRp08fp+vYsmWL\nU4OPiBIbG4vnnntOdBgeLygoCElJSaLDcMj+/fvx/PPPiw5DEWrvBvTevXsICQkRHYbTVLx+ue9x\nNfH19S2yUYkcVLwxWtSyZUvs27dPdBgez9YPSjVq3749tm3bZvG1n376CR999JGLI5KP
2r/HWk/a\nL774onSGQWU4aavJ+fPnUadOHUXrWLNmDV577TVF65CT2ndOADBx4kSMGDFCdBisgKK2nVKlSpmd\ngtUatX8vtJ60Dxw4gObNm4sOwxLuxlRNlE7YADSVsAHg77//Fh2CTdwjmjoZr7NbooUfWZ06dRId\ngiKGDRsmOgSbVJqw7eIjOgBP0717d/z++++KlF2/fv0ie4lSo8cff1x0CDZlZGSIDsElNm7ciC5d\nuogOwy5XrlwpMul99dVXSElJQWBgoAujckzDhg2LfF3NYwkYG4RZovYf4seOHcNTTz0lOgyn8elx\nAby8vGy2bnTUvn370LJlS1nLdJU2bdogJiZGdBgW5ebmwtvbW3QYLlG3bl2cO3dOdBh2KVmypM0f\nUwEBAUhPT3dRRI7Zs2cPWrVqJToMj1SnTh2cP39edBjW8OlxNcrLy5P9epWPj3ZPmqj5x0aJEiVE\nh+AyWknYAOxq6a7WhA0AnTt3Fh2Conbs2CE6BKtUnLDtwklbEDmH3svNzdX0NZpvv/1Wtbfu2Lp/\n2d2sWbNGdAg2nTt3Dn5+fnbNO23aNIWjcY7xnmR35YpxFpzx3nvviQ6h2DhpC2Ic66A4tx0Yj4zc\n4fTt/v37RYdQSOnSpUWH4HJaeM+O9OHfv39/BSNxjq+vr+gQFDdgwADZxwSXw5w5c0SHUGyctAUY\nOBCYNcvw+MUXX8TKlSsd/uUdFBTkskERXEVtX3J3PxqypG3btphl3DhVaPfu3Q7dfx0UFITHHntM\nwYgco9frFe+rQS3U1l5q5syZZr1KapX234HG9O8P5A8MJHn99ddRunRpu5KWTqfD/fv3Ndd7lT3U\n9CVXc6tjpVWoUEF0CFY502Dx7t27CkTinLZt24oOwaVq1KghOgQAhvuyP/zwQ9FhyIJbj7tQ377A\nr7/aP/+RI0eQl5eHJk2auMUpcHuooRvWMmXKIDk5WWgMolWpUgU3btwQHYaZFi1a4M8//3Rq2UGD\nBmHevHkyR+SY2rVr48KFC0JjEOHll1/GH3/8IToMreAe0dSid29g+XLRUWhDo0aNcOLECSF1BwcH\nIzExUUjdalOxYkXcuXNHdBgAgCeffBLHjx8vVhm7du1C69at5QnIQUrc5qklGzZswCuvvCKk7osX\nL6JWrVpC6nYC3/KlBj16cMJ2xIkTJ4Qc5ZUvX54Ttok7d+5g9OjRosNAdHR0sRM2ALRu3Vqxjo2K\nUk6n8+iEDQCvvPIKVq5c6dI6U1JScOTIES0lbLvwkbbCunUD1q4VHYU29e/fH4sWLXJJXR9//DGm\nT5/ukrqYY4hI1kaK9erVc1mvXU2bNjUMeVmnDqDx+4Pl0LVrV6xbt84ldT18+BBly5Z1SV0y4iNt\nkTp35oRdHIsWLUJOTo7ZeNlKmDVrFidsG5Qc39oaYytrue8q+Pvvv3H16lXF7+GuW7euNEY1zp8H\nwsIUrU8L1q1bp3hr/qFDhwIAytaurWg9onDSVkj79kAR4xkwO/n6+qJ8+fKYNGmS7K3Ly5UrBwAY\nYrxpnlk0caJhXZUsWdJldXp5eSl6P3O1atUQFRWFN954Q/ayjbcVFeph7vp1QHtHfrIztuYfPny4\nrOXGxMQgKioKU6dONUy4d89w5ORm+PS4Atq0AVTalbbmTJoEmH63Bw8ejI8++ggNGjRwukxubGa/\nmjWBy5cfPU9LS0N8fHyx1n9Rbt68CW9vb1SsWFGR8q1p3bo1evTo4fQY3BkZGQgICLDvh6W/P5CZ\n6VQ97sjHx6dYPQ+OGTMG7du3R4sWLSzPoNMB6sxzltg+rUREavzTrFatREfgPo4csf7a8uXLqXHj\nxnaXtX79eoooVUqGqDzHDz9Yfy00NFT2+urVqyd7mY6aPHkydejQwe7558+fT3Xr1nW8olGjHF/G\njeXl5REAWrZsmV3zZ2dnk7e3N/3nP/+xr4L09GJE
51I28yMfacvo+ecBFfbG6TH27duHAwcOICcn\nB6GhoejZs6d5t5y3bwOVKokLUENq1ACuXLE9X6tWrTBv3jyne+e7efMmOnTogJMnTzq1vCvExMTg\n8OHDiI8vixdeCESvXr3kGaBHW0eALnf69BUcO7Yfly5dQrly5fDyyy8rdoZHRfhI21WaNRMdgXsJ\nCREdgecaO9a55X799VcCQIsXL7Y6T3R0NAGw/whJRXbuVKBQX18FCnUPCxbIXGBMjMwFKoKPtF2h\naVPA2EiUFZ+iByD16wNnzihUuPb98w9QubJChR84AGh4NLpduwBF+mYJDARSUhQoWNs6dAAcGBvG\nPg8fqr0xIN/ypbSnnuKELTdFf0c+95yChWvfvXsKFi77HthNpKQAp0+LjkJ1du1SoNCyZQGNDxqi\n7egFa9gQOHZMdBTu5fPPFa6g4GgtTNK/P/DkkwpWsGSJgoVrXIMGQHi46ChUJTtboYLz8oCICIUK\nVx4nbSfVrw+ouO2MJj3/PPD99y6oaM0aF1SiLWXLAop3PmdPyzZPdukS8PTToqNQje7dFSw8OtrQ\nMFWDOGk7oW5dviyqBJe1vFdBf9pq8v77hkt9ivOwYSmdcvgw0KmT6ChUoXdvhSuoVAlYvVrhSuQn\nw30LnqVWLeDiRdFRuB+XtsWJi3NRRerXogXg5GiXjuvQwUUVadzmzYZTuBq/9lpcL73kLpXIy7O3\nCgfVqMEJWwleXgIazzZr5uIK1WfFChcmbAB45x0XVqZxXl4uaOChbgp3UW5QrhzwwQcuqEg+nLTt\nVLWqfJfk7t+/L09BbuDhQ8NBhaPS0tKKV/GnnxZveY1LSwP+/W9nlivGeg8JcX5ZT/T998BPP4mO\nwv3Nng3I0VmOi3DStkOVKsC1a4Ber8eJEycQHh6OgQMHYubMmbhSIJOvXLkSI0eONJvWunVrREdH\nY1f+PQwhISHINOl7ePHixdLgFQUVHOHorbfekuEdqYeXF1C9enUAQGpqqtX5pEEAAOTl5eHtt9+W\n1n9IfjIw9qql1+uh0+lQsmRJdO3a1XKB+RfM9Ho9NmzYgKCgIJz3kKETly4FSpUC1q9fbza9Y8eO\nheY1Xe/+/v54++23pf61C/bx0KZNG+h0OowaNQpHjx61WLfpdp+UlCT7CF6u1LFjR+k7XVBGRgZK\nlChRZE9vOp0OTZs2RXJyMoDC6xMA8NFHQP441Mb5f/nlFwDuty8oqGnTpmbPDxw4IO1LC1q9ejX8\n/f3NplnaPtevXy+tbzN6vXQ7qE6nw/r165Gd33y9RIkSxXkb8rOnBxYBf6rxr389emxYXUSlTPqw\nNk4zOnv2LEVHRxcqZ+LEiZScnEwJCQmFlqlRowZdunTJYv06nc7secFltaxLF8P/n3/+md588036\n5JNPpNe2bdtmNu8Ck+6Rzp07RxEREUT0aH14e3vTuXPnpHkA0JEjR+iVV16xHsD27URENHPmTAoM\nDCzWe9GSrKxHj0+dOkVJSUnS86LWOwBpvXt7e5v9JyIaOHAgAaCcnBz64IMPbMah1+vpu+++c+o9\niLBzJ9Hu3bvN1pfxO23J448/Lm2TderUMftPZFif77zzDhGRxX2GJC/PbP5/5e+U3GlfYM3u3bsL\nTZs4cWKhaRcvXjRbH8b1PGTIEGnawIED6cqVK0RUxPpes4Z0Oh1duXKF5s6dS0REvq7ttc5mfhSd\nnFWdtB97zPz59vydvFFOTo5D5WWZ7i1NXLlyxWznaM3Zs2cdqk+Lrl69SkREd+7cISKi999/3+q8\nxvVv7XMYNmwYbdy40XplNWuaPc3MzHQkVE3assX8+YMHD4iIKD1/QAXjel++fLnTdWzbto22bt1K\nEyZMKHK+vPxkpBU7dxKNHDmSiAw/dogefadtra+HDx+a/TeVmZlJMTExVK1aNesFZGYSHT8uPT10\n6JADkWuXcX3H
x8ebTb97926RyxnXs3F/Ysq4vrOzs23Wb+nzUhh3Y+qskBCFe4fycOPHA6NGiY7C\ns7RvD2zb9ui5Xq+XZ+ALR+j1mrp+aEqxbkwdce2aoYGNmztyBHjiiSwxp6ZzcgAFx3K3gbsxdUb5\n8pywlfTiiypK2AkJoiNwiRIlzBM2ANcnbADIvx7LnFS1KvDggegoFLd8ucBryb6+gElbDrXhpF1A\nuXIesx8XIiMD2L1bdBQmqlQRHYHiAgKArCzRUeTj/seL7+5d0REoTpF+xx0xdCjwwguCg7CMk3a+\n9u2BoCCV/YjV6Qx/LunbU3m+vkBSkugoTMyeDWRmGtaxG/LyMnQFmZ4uOpJ83t6Gm8M1uL51OkM/\nHMOHi44Ehi4ZY2OB3FxNrktbdDrg0CHD5iLU3r1AmTKqW8d8TRuGS2y5uYbrVTt3urJmG4wbizo/\nI4dMnAgY74RT1dtxo3VckOreWk4O4OcHnD8P1K4tOhqHqG5dAoZfZYbWxKIjkdW2bYbO8/R6wYl7\nxYpHfanm5bkqedusxOOTtl7/qM3B2bOGH7GqUaMGcPOmis5tOk+VOz0AOHECaNxYhYEVT4kShlGS\nunYF1q4VHY0JRQdLV0758kBioopCJ3rUzWm1akB8vNBw5KaazeS//wX69jV8oUz6GFAQJ21bdDrD\nNezgYFfV6Jm8vJzr+cwl/PwUHAdQDNXs9Aoq2IRdQ7p2BdatEx2FBar9sJ3XsiWwb5/oKEx4extO\nxypPu63HX3JRR+5E8ibs6dOny1eYzHbs2CGs7uIk7MjISPkCsUShhK143EVwZh9+z8otE7J+F12Y\nsOXeh7gqYc+aNcuxBVyQsO3tuU6uHu6UTNhOxahwwnZkX6HapM0YY4wxc5y0GWOMMY3gpM0YY4xp\nhCaSdk5OjtXXhhe4cXL27NlmzxcvXgwAZqNxRUdHIz3/5tWDBw8CANbmN7HduHGjNIrMlClTpGWM\n8znijz/+sPpa37597Yp727ZtyMu/IGyM8dq1a1i6dCkAWBxlyBh/dHR0oVHI7DF06FCL05OTk5GY\nmGgxTqO7+R0/GN9Pamqq2ee3YcMGAMCKFSsAAN+b3IMen98C1tl2AdbinjVrVpFxX7lyRYrbOP3h\nw4fIzb+OZdwOYmJipGWMcZt+FqbbiyMGDhxocfrevXsLTYuNjZUeZ2VlSduGkWmMI0aMkKb9888/\nAMzf94QJExyOtajv4vvvv2/2vOC2Z9wmTKd//fXXAAzbqul2CwApKSnS99S4bk3XtyOK+i46sw+J\ni4vDjRs38Ndff0nTjDEuXLiw0LRvvvnG4ZgBw/u1puA+ZFuB9gLGbdQ0buN3a+HChdKoeuPGjQMA\ns/dibfQyRxS1rdi7/zMVHR2NNWvWWPweqiXGYcOGFZpmzDem38NitX2yp4NyAX/UunXrQj2pGwfV\nMI6c07lzZ/rnn3+k1+/du0dERPXr1yciIi8vL7Pn9evXp1GjRlGNGjWIiGj16tU0e/ZsQy/t+SPE\nAJAGMsjNzS00ny3Tpk2THp8+fZqIHg00smnTJiIyjGq1du1am3EbrVixolCMRh999BERPepY3zT+\nggoOeGKNsfwxY8YQEdGnn35KREQ+Pj5m8xWMs1+/fmbT69evTwAoJCSEiMzXJwAp9hUrVlDp0qVp\n4sSJFBYWRkREZcuWlco1HanHnrh37NghxZ2SkkJz5syxGGdBBd+PcYQwY9xERPv27ZPqWbFihdln\nYTqfI3FXqVKFiAzbhTFuIqLXXnvNbD7TuogMgygYtw3j52+M8f79+2bzvvDCC2bvLzs7mypWrGg2\nj7VBGOz9LpquZ3u+iwXNmjWLEhMTqXfv3tK0Z555htLS0oiIKCMjw6GRrUzjNn4XjXEvWbJEitue\nfYjp9GHDhknzN2vWjBo1akSlS5eWpi1dupT+97//EZH5NmFv7MbtwNTYsWOJ6N
H61ul0ZvsQ43ep\n4D7CuH3Ur19f+m4ZLVmyxOwzN74X0/Icjd3SfKb7vxs3bji8/yOiQttFwe+hI4qKMT093aEYLW3H\nBRnzDZHhe1jwcyAy21fYzI+aONI2OnLkCH799Vc89thjqFOnDurWrYvQ0FBkZGQgIyNDGrs2LCwM\nAKSjkEuXLknTx40bh8uXLwMAunfvXqgOIkLV/A75jb9yLc3niN9//x3NmjVDp06d8L///Q99+/ZF\n165dC8VtjNMYt/FIz/h+TGO01gLSNH4qRqvSefPm4c6dOwCAAQMGSHEZY87IyCgUZ3h4uFn8ly5d\nwoQJExAQEAAA8PLygq+vr7Q+jx8/DgA4d+4c4uLiMGLECBw6dAirV6/GU0895VTcoaGh0hjQAwYM\nQOnSpTF48GCzuI1x5hZoEVrw/WzevBmAYTvYuHEjAKBly5bwzu/x4dy5c2afRVFHRbZUrlwZp0+f\nluIGgCZNmpjF7V2gp4lq1aoV2jaMMZYvX97s/aWkpJi9v+K28j1y5Ahq1apl9l00Xc/2fBcB4N13\n31gcpC8AAAQ1SURBVJXKXLZsGcqVK4dly5ZJ0+bOnYuAgACEhobC39+/yG3f3rh//fVXvP3220Xu\nQwp+F43balhYGCZPngwA+PLLL+Hj44MePXogLi4OoaGhAICdO3fi+eefB2C+TRTn+3jz5k00a9ZM\nWt9EZLYPMcZnZLqNGuM2frcAoF+/fpgxYwZCQkKkM3jG9yI30/1flSpVHN7/AZC2i99//x1A4e+h\nXDHGx8cXGaO17dmUcV9hZMw3gOF7aPo5OMWezC7gz+Kv+++//56Sk5PppZdeouDgYPr3v/9t8ZdN\n9+7d6eTJk2bDua1YsYK6d+9OH3/8sdm8BY+0iYieffZZ2rx5s8X5bDE90o6NjSUioubNm9PMmTNp\n48aNNGnSJDpz5ozV5T/77DOzuPPy8qSxYY0xpqWl0cqVK4mI6L333iOiR0NYHj16lJ599lkiIgoK\nCjIr25Ej7czMTDpz5gydOHGC9uzZQykpKRaP9Ixat25NwcHB9J///MdseqVKleibb76R3suvv/5q\n9l6M/P39KTU1lYjMxxwmcuxIOywsjJYvXy7FnZycXOgIMjg4WHq8f/9+WrFiRaG4T548SS+88ILZ\ndpCWlkYzZsyQjoSN01auXFloe3Ek7nLlytGsWbPo6NGjZuu74PKffvopZWRkEJFhfTdo0EBaV8bP\n3xgjEdGMGTPop59+orS0NJowYUKh4Q2Lc6T9/fffU69evYr8Lhq/c9a+i0amR6M//PCDdKbo2Wef\npWnTpklD0q5cudJs27fFNG7jd9G4D0lKSipyH0JEVuM2DsuZk5NDs2bNotKlS5O/v7/Z+2nUqJHZ\nNjFjxgzpzJUtlo60d+7cSTNnzpTWd8HvD5Hh+3XkyBE6efKk2TZqur6N20tqairduXOHsrOzacaM\nGZSdnS29F6JH+xUjS/VZYmk+0/0fgCL3f0SFh+EkMt8uLH0PHVFUjDD0EWI1RuN6tBSjpX2A8cyI\nMW7jcLVF7ONs5kfRydmhpK0FpklbbexN2mpjb/JTG63F7UjS1gKtxm0paYtWnKStNmqM0W1PjzPG\nGGOejJM2Y4wxphGctBljjDGNUO2AIXL1YasUa+uN41YGx628hIQEBFvoiF+NsZrS0jo2pbW47ckV\naopdo+uXR/lijDHGNMJm0vZxRRROUOdPIcYYY0wgvqbNGGOMaQQnbcYYY0wjOGkzxhhjGsFJmzHG\nGNMITtqMMcaYRnDSZowxxjSCkzZjjDGmEZy0GWOMMY3gpM0YY4xpBCdtxhhjTCM4aTPGGGMawUmb\nMcYY0whO2owxxphGcNJmjDHGNIKTNmOMMaYRnLQZY4wxjeCkzRhjjGkEJ23GGGNMIzhpM8YYYxrB\nSZsxxhjTCE7ajDHGmEZw0maMMcY0gpM2Y4
wxphGctBljjDGN4KTNGGOMaQQnbcYYY0wjOGkzxhhj\nGvH/DaSDseFYqwcAAAAASUVORK5CYII=\n",
503 | "text/plain": [
504 | ""
505 | ]
506 | },
507 | "metadata": {},
508 | "output_type": "display_data"
509 | }
510 | ],
511 | "source": [
512 | "xgb.plot_tree(model_xgboost, num_trees=39)"
513 | ]
514 | },
515 | {
516 | "cell_type": "code",
517 | "execution_count": null,
518 | "metadata": {
519 | "collapsed": true
520 | },
521 | "outputs": [],
522 | "source": []
523 | },
524 | {
525 | "cell_type": "code",
526 | "execution_count": 22,
527 | "metadata": {},
528 | "outputs": [],
529 | "source": [
530 | "# Prediction using xgboost"
531 | ]
532 | },
533 | {
534 | "cell_type": "code",
535 | "execution_count": 23,
536 | "metadata": {
537 | "collapsed": true
538 | },
539 | "outputs": [],
540 | "source": [
541 | "xgb_predict = model_xgboost.predict(xgb.DMatrix(X_test))"
542 | ]
543 | },
544 | {
545 | "cell_type": "code",
546 | "execution_count": 24,
547 | "metadata": {},
548 | "outputs": [
549 | {
550 | "data": {
551 | "text/plain": [
552 | "(1546,)"
553 | ]
554 | },
555 | "execution_count": 24,
556 | "metadata": {},
557 | "output_type": "execute_result"
558 | }
559 | ],
560 | "source": [
561 | "xgb_predict.shape"
562 | ]
563 | },
564 | {
565 | "cell_type": "markdown",
566 | "metadata": {},
567 | "source": [
568 | "**Exercise**"
569 | ]
570 | },
571 | {
572 | "cell_type": "markdown",
573 | "metadata": {},
574 | "source": [
575 | "1. Run the model for 200 trees \n",
576 | "2. Using that model, predict and observe the AUC on test\n",
577 | "3. Now, predict only using the first 120 trees from that model. *Hint* : Use `ntree_limit` option \n",
578 | "4. Use `xgboost's` cross-validation method. Sample code:\n",
579 | "\n",
580 | "```xgb.cv(parameters, train_matrix, num_round, nfold,\n",
581 | " metrics={'error'}, seed = 0,\n",
582 | " callbacks=[xgb.callback.print_evaluation(show_stdv=True)])```\n",
583 | "\n"
584 | ]
585 | },
586 | {
587 | "cell_type": "code",
588 | "execution_count": null,
589 | "metadata": {},
590 | "outputs": [],
591 | "source": []
592 | },
593 | {
594 | "cell_type": "markdown",
595 | "metadata": {},
596 | "source": [
597 | "**Early Stopping**\n",
598 | "\n",
599 | "Source: [xgboost docs](http://xgboost.readthedocs.io/en/latest/python/python_intro.html) \n",
600 | "\n",
601 | "If you have a validation set, you can use early stopping to find the optimal number of boosting rounds.\n",
602 | "\n",
603 | "`train(..., evals=evals, early_stopping_rounds=10)`\n",
604 | "\n",
605 | "The model will train until the validation score stops improving. Validation error needs to decrease at least once in every `early_stopping_rounds` rounds to continue training.\n",
606 | "\n",
607 | "If early stopping occurs, the model will have three additional fields: `bst.best_score, bst.best_iteration and bst.best_ntree_limit`. Note that train() will return a model from the last iteration, not the best one.\n",
608 | "\n",
609 | "This works with both metrics to minimize (RMSE, log loss, etc.) and to maximize (MAP, NDCG, AUC). Note that if you specify more than one evaluation metric the last one in param['eval_metric'] is used for early stopping."
610 | ]
611 | },
612 | {
613 | "cell_type": "code",
614 | "execution_count": null,
615 | "metadata": {
616 | "collapsed": true
617 | },
618 | "outputs": [],
619 | "source": []
620 | },
621 | {
622 | "cell_type": "code",
623 | "execution_count": null,
624 | "metadata": {
625 | "collapsed": true
626 | },
627 | "outputs": [],
628 | "source": []
629 | },
630 | {
631 | "cell_type": "markdown",
632 | "metadata": {},
633 | "source": [
634 | "### References\n",
635 | "\n",
636 | "- https://www.analyticsvidhya.com/blog/2015/09/complete-guide-boosting-methods/\n",
637 | "- https://www.analyticsvidhya.com/blog/2015/05/boosting-algorithms-simplified/\n",
638 | "- https://xgboost.readthedocs.io/\n"
639 | ]
640 | },
641 | {
642 | "cell_type": "code",
643 | "execution_count": null,
644 | "metadata": {
645 | "collapsed": true
646 | },
647 | "outputs": [],
648 | "source": []
649 | }
650 | ],
651 | "metadata": {
652 | "anaconda-cloud": {},
653 | "kernelspec": {
654 | "display_name": "Python [conda root]",
655 | "language": "python",
656 | "name": "conda-root-py"
657 | },
658 | "language_info": {
659 | "codemirror_mode": {
660 | "name": "ipython",
661 | "version": 3
662 | },
663 | "file_extension": ".py",
664 | "mimetype": "text/x-python",
665 | "name": "python",
666 | "nbconvert_exporter": "python",
667 | "pygments_lexer": "ipython3",
668 | "version": "3.6.2"
669 | }
670 | },
671 | "nbformat": 4,
672 | "nbformat_minor": 1
673 | }
674 |
--------------------------------------------------------------------------------
/Module-03g-Model-HyperParameterOpt.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Introduction to parameter tuning\n",
8 | "\n",
9 | "**Hyper-parameters**\n",
10 | "\n",
11 | "A machine learning model is a mathematical formula with a number of parameters that are learnt from the data. That is the crux of machine learning: fitting a model to the data.\n",
12 | "\n",
13 | "However, there is another kind of parameters that cannot be directly learned from the regular training process. These parameters express “higher-level” properties of the model such as its complexity or how fast it should learn. They are called hyperparameters. Hyperparameters are usually fixed before the actual training process begins.\n",
14 | "\n",
15 | "So, how are hyperparameters decided?\n",
16 | "\n",
17 | "Broadly speaking, this is done by setting different values for those hyperparameters, training different models, and deciding which ones work best by testing them.\n",
18 | "\n",
19 | "So, to summarize. Hyperparameters:\n",
20 | "\n",
21 | "- Define higher level concepts about the model such as complexity, or capacity to learn.\n",
22 | "- Cannot be learned directly from the data in the standard model training process and need to be predefined.\n",
23 | "- Can be decided by setting different values, training different models, and choosing the values that perform best on held-out data\n",
24 | "\n",
25 | "Some examples of hyperparameters:\n",
26 | "\n",
27 | "- Number of leaves or depth of a tree\n",
28 | "- Number of latent factors in a matrix factorization\n",
29 | "- Learning rate (in many models)\n",
30 | "- Number of hidden layers in a deep neural network\n",
31 | "- Number of clusters in a k-means clustering\n",
32 | "\n",
33 | "source: [Quora](https://www.quora.com/What-are-hyperparameters-in-machine-learning)\n"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 1,
39 | "metadata": {
40 | "collapsed": true
41 | },
42 | "outputs": [],
43 | "source": [
44 | "import pandas as pd\n",
45 | "import numpy as np\n",
46 | "import matplotlib.pyplot as plt\n",
47 | "%matplotlib inline\n",
48 | "plt.style.use('fivethirtyeight')"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 2,
54 | "metadata": {
55 | "collapsed": true
56 | },
57 | "outputs": [],
58 | "source": [
59 | "# Read the data\n",
61 | "df = pd.read_csv(\"data/historical_loan.csv\")\n",
62 | "\n",
63 | "# refine the data\n",
64 | "df.years = df.years.fillna(np.mean(df.years))\n"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 3,
70 | "metadata": {
71 | "collapsed": true
72 | },
73 | "outputs": [],
74 | "source": [
75 | "# Setup the features and target\n",
76 | "X = df.iloc[:,1:]\n",
77 | "y = df.iloc[:,0]\n"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 4,
83 | "metadata": {
84 | "collapsed": true
85 | },
86 | "outputs": [],
87 | "source": [
88 | "from sklearn.model_selection import train_test_split\n",
89 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
90 | ]
91 | },
92 | {
93 | "cell_type": "markdown",
94 | "metadata": {},
95 | "source": [
96 | "**Basic checks**\n",
97 | "\n",
98 | "Check if the columns are the same in train and test.\n",
99 | "\n",
100 | "What else will you check? [**Discuss**]"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": 5,
106 | "metadata": {},
107 | "outputs": [
108 | {
109 | "data": {
110 | "text/plain": [
111 | "Index(['amount', 'grade', 'years', 'ownership', 'income', 'age'], dtype='object')"
112 | ]
113 | },
114 | "execution_count": 5,
115 | "metadata": {},
116 | "output_type": "execute_result"
117 | }
118 | ],
119 | "source": [
120 | "X_train.columns"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": 6,
126 | "metadata": {},
127 | "outputs": [
128 | {
129 | "data": {
130 | "text/plain": [
131 | "Index(['amount', 'grade', 'years', 'ownership', 'income', 'age'], dtype='object')"
132 | ]
133 | },
134 | "execution_count": 6,
135 | "metadata": {},
136 | "output_type": "execute_result"
137 | }
138 | ],
139 | "source": [
140 | "X_test.columns"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": 7,
146 | "metadata": {},
147 | "outputs": [
148 | {
149 | "name": "stdout",
150 | "output_type": "stream",
151 | "text": [
152 | "(6181, 6) (1546, 6)\n"
153 | ]
154 | }
155 | ],
156 | "source": [
157 | "print(X_train.shape, X_test.shape)"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": 9,
163 | "metadata": {},
164 | "outputs": [
165 | {
166 | "name": "stdout",
167 | "output_type": "stream",
168 | "text": [
169 | "train\n",
170 | "amount int64\n",
171 | "grade object\n",
172 | "years float64\n",
173 | "ownership object\n",
174 | "income float64\n",
175 | "age int64\n",
176 | "dtype: object\n",
177 | "\n",
178 | "test\n",
179 | "amount int64\n",
180 | "grade object\n",
181 | "years float64\n",
182 | "ownership object\n",
183 | "income float64\n",
184 | "age int64\n",
185 | "dtype: object\n"
186 | ]
187 | }
188 | ],
189 | "source": [
190 | "print(\"train\")\n",
191 | "print(X_train.dtypes)\n",
192 | "print()\n",
193 | "print(\"test\")\n",
194 | "print(X_test.dtypes)"
195 | ]
196 | },
197 | {
198 | "cell_type": "markdown",
199 | "metadata": {},
200 | "source": [
201 | "The categorical data should be encoded.\n",
202 | "\n",
203 | "We saw LabelEncoder earlier. Now, we will use one-hot encoding"
204 | ]
205 | },
206 | {
207 | "cell_type": "markdown",
208 | "metadata": {},
209 | "source": [
210 | "### One-hot encoding\n",
211 | "\n",
212 | ""
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": 11,
218 | "metadata": {},
219 | "outputs": [],
220 | "source": [
221 | "X_train_updated = pd.get_dummies(X_train)"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": 12,
227 | "metadata": {},
228 | "outputs": [
229 | {
230 | "data": {
231 | "text/plain": [
232 | "(6181, 6)"
233 | ]
234 | },
235 | "execution_count": 12,
236 | "metadata": {},
237 | "output_type": "execute_result"
238 | }
239 | ],
240 | "source": [
241 | "X_train.shape"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": 13,
247 | "metadata": {},
248 | "outputs": [
249 | {
250 | "data": {
251 | "text/plain": [
252 | "(6181, 15)"
253 | ]
254 | },
255 | "execution_count": 13,
256 | "metadata": {},
257 | "output_type": "execute_result"
258 | }
259 | ],
260 | "source": [
261 | "X_train_updated.shape"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": 15,
267 | "metadata": {},
268 | "outputs": [
269 | {
270 | "data": {
271 | "text/plain": [
272 | "amount 14500.0\n",
273 | "years 11.0\n",
274 | "income 64000.0\n",
275 | "age 35.0\n",
276 | "grade_A 1.0\n",
277 | "grade_B 0.0\n",
278 | "grade_C 0.0\n",
279 | "grade_D 0.0\n",
280 | "grade_E 0.0\n",
281 | "grade_F 0.0\n",
282 | "grade_G 0.0\n",
283 | "ownership_MORTGAGE 1.0\n",
284 | "ownership_OTHER 0.0\n",
285 | "ownership_OWN 0.0\n",
286 | "ownership_RENT 0.0\n",
287 | "Name: 303, dtype: float64"
288 | ]
289 | },
290 | "execution_count": 15,
291 | "metadata": {},
292 | "output_type": "execute_result"
293 | }
294 | ],
295 | "source": [
296 | "#print the first record\n",
297 | "X_train_updated.iloc[0]"
298 | ]
299 | },
300 | {
301 | "cell_type": "markdown",
302 | "metadata": {},
303 | "source": [
304 | "**Exercise**\n",
305 | "Apply one-hot encoding to test dataset and store in test_updated"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": null,
311 | "metadata": {
312 | "collapsed": true
313 | },
314 | "outputs": [],
315 | "source": [
316 | "#Code here"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": 16,
322 | "metadata": {
323 | "collapsed": true
324 | },
325 | "outputs": [],
326 | "source": [
327 | "X_test_updated = pd.get_dummies(X_test)"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": 18,
333 | "metadata": {},
334 | "outputs": [
335 | {
336 | "name": "stdout",
337 | "output_type": "stream",
338 | "text": [
339 | "(1546, 6) (1546, 15)\n"
340 | ]
341 | }
342 | ],
343 | "source": [
344 | "print(X_test.shape, X_test_updated.shape)"
345 | ]
346 | },
347 | {
348 | "cell_type": "code",
349 | "execution_count": 20,
350 | "metadata": {},
351 | "outputs": [
352 | {
353 | "data": {
354 | "text/plain": [
355 | "amount 3000.0\n",
356 | "years 1.0\n",
357 | "income 49800.0\n",
358 | "age 22.0\n",
359 | "grade_A 1.0\n",
360 | "grade_B 0.0\n",
361 | "grade_C 0.0\n",
362 | "grade_D 0.0\n",
363 | "grade_E 0.0\n",
364 | "grade_F 0.0\n",
365 | "grade_G 0.0\n",
366 | "ownership_MORTGAGE 0.0\n",
367 | "ownership_OTHER 0.0\n",
368 | "ownership_OWN 0.0\n",
369 | "ownership_RENT 1.0\n",
370 | "Name: 2184, dtype: float64"
371 | ]
372 | },
373 | "execution_count": 20,
374 | "metadata": {},
375 | "output_type": "execute_result"
376 | }
377 | ],
378 | "source": [
379 | "#print the first record\n",
380 | "X_test_updated.iloc[1]"
381 | ]
382 | },
383 | {
384 | "cell_type": "code",
385 | "execution_count": 23,
386 | "metadata": {},
387 | "outputs": [
388 | {
389 | "name": "stdout",
390 | "output_type": "stream",
391 | "text": [
392 | "(6181, 15) (6181,)\n"
393 | ]
394 | }
395 | ],
396 | "source": [
397 | "print(X_train_updated.shape, y_train.shape)"
398 | ]
399 | },
400 | {
401 | "cell_type": "code",
402 | "execution_count": 24,
403 | "metadata": {
404 | "collapsed": true
405 | },
406 | "outputs": [],
407 | "source": [
408 | "#Let's build random forest model"
409 | ]
410 | },
411 | {
412 | "cell_type": "code",
413 | "execution_count": 25,
414 | "metadata": {
415 | "collapsed": true
416 | },
417 | "outputs": [],
418 | "source": [
419 | "from sklearn.ensemble import RandomForestClassifier"
420 | ]
421 | },
422 | {
423 | "cell_type": "code",
424 | "execution_count": 43,
425 | "metadata": {
426 | "collapsed": true
427 | },
428 | "outputs": [],
429 | "source": [
430 | "model_rf = RandomForestClassifier(n_estimators=100,\n",
431 | " criterion=\"gini\",\n",
432 | " max_depth=5,\n",
433 | " min_samples_split=2,\n",
434 | " min_samples_leaf= 1,\n",
435 | " oob_score=True,\n",
436 | " n_jobs=-1\n",
437 | " )"
438 | ]
439 | },
440 | {
441 | "cell_type": "code",
442 | "execution_count": 44,
443 | "metadata": {},
444 | "outputs": [
445 | {
446 | "data": {
447 | "text/plain": [
448 | "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
449 | " max_depth=5, max_features='auto', max_leaf_nodes=None,\n",
450 | " min_impurity_decrease=0.0, min_impurity_split=None,\n",
451 | " min_samples_leaf=1, min_samples_split=2,\n",
452 | " min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,\n",
453 | " oob_score=True, random_state=None, verbose=0, warm_start=False)"
454 | ]
455 | },
456 | "execution_count": 44,
457 | "metadata": {},
458 | "output_type": "execute_result"
459 | }
460 | ],
461 | "source": [
462 | "model_rf.fit(X_train_updated, y_train)"
463 | ]
464 | },
465 | {
466 | "cell_type": "code",
467 | "execution_count": 45,
468 | "metadata": {},
469 | "outputs": [
470 | {
471 | "data": {
472 | "text/plain": [
473 | "0.63873159682899205"
474 | ]
475 | },
476 | "execution_count": 45,
477 | "metadata": {},
478 | "output_type": "execute_result"
479 | }
480 | ],
481 | "source": [
482 | "model_rf.oob_score_"
483 | ]
484 | },
485 | {
486 | "cell_type": "markdown",
487 | "metadata": {},
488 | "source": [
489 | "Let's do cross validation and see what the generalization error is"
490 | ]
491 | },
492 | {
493 | "cell_type": "markdown",
494 | "metadata": {},
495 | "source": [
496 | "### Cross-validation"
497 | ]
498 | },
499 | {
500 | "cell_type": "code",
501 | "execution_count": 46,
502 | "metadata": {
503 | "collapsed": true
504 | },
505 | "outputs": [],
506 | "source": [
507 | "from sklearn.model_selection import cross_val_score\n",
508 | "from sklearn.metrics import roc_curve, auc"
509 | ]
510 | },
511 | {
512 | "cell_type": "code",
513 | "execution_count": 47,
514 | "metadata": {
515 | "collapsed": true
516 | },
517 | "outputs": [],
518 | "source": [
519 | "model_rf = RandomForestClassifier(n_estimators=100,\n",
520 | " criterion=\"gini\",\n",
521 | " max_depth=5,\n",
522 | " min_samples_split=2,\n",
523 | " min_samples_leaf= 1,\n",
524 | " oob_score=True,\n",
525 | " n_jobs=-1\n",
526 | " )"
527 | ]
528 | },
529 | {
530 | "cell_type": "code",
531 | "execution_count": 48,
532 | "metadata": {},
533 | "outputs": [
534 | {
535 | "name": "stdout",
536 | "output_type": "stream",
537 | "text": [
538 | "CPU times: user 112 ms, sys: 64.7 ms, total: 176 ms\n",
539 | "Wall time: 2.18 s\n"
540 | ]
541 | }
542 | ],
543 | "source": [
544 | "%%time\n",
545 | "\n",
546 | "#Or use %%timeit -n1 -r1 to time the cell\n",
547 | "\n",
548 | "cross_val_score_rf = cross_val_score(model_rf, \n",
549 | " X_train_updated, \n",
550 | " y_train, scoring=\"roc_auc\",\n",
551 | " cv=5,\n",
552 | " n_jobs=-1\n",
553 | " )"
554 | ]
555 | },
556 | {
557 | "cell_type": "code",
558 | "execution_count": 49,
559 | "metadata": {},
560 | "outputs": [
561 | {
562 | "data": {
563 | "text/plain": [
564 | "array([ 0.6969647 , 0.68786796, 0.69946444, 0.69435555, 0.67146693])"
565 | ]
566 | },
567 | "execution_count": 49,
568 | "metadata": {},
569 | "output_type": "execute_result"
570 | }
571 | ],
572 | "source": [
573 | "cross_val_score_rf"
574 | ]
575 | },
576 | {
577 | "cell_type": "markdown",
578 | "metadata": {},
579 | "source": [
580 | "**Exercise**"
581 | ]
582 | },
583 | {
584 | "cell_type": "code",
585 | "execution_count": 50,
586 | "metadata": {},
587 | "outputs": [
588 | {
589 | "data": {
590 | "text/plain": [
591 | "0.69002391398907892"
592 | ]
593 | },
594 | "execution_count": 50,
595 | "metadata": {},
596 | "output_type": "execute_result"
597 | }
598 | ],
599 | "source": [
600 | "#What is the average cross validation score?\n",
601 | "np.mean(cross_val_score_rf)"
602 | ]
603 | },
604 | {
605 | "cell_type": "markdown",
606 | "metadata": {},
607 | "source": [
608 | "#### Grid Search"
609 | ]
610 | },
611 | {
612 | "cell_type": "markdown",
613 | "metadata": {},
614 | "source": [
615 | "The above was for some arbitrary chosen parameter value.\n",
616 | "\n",
617 | "How do we run the model on various choices of hyper-parameters?\n"
618 | ]
619 | },
620 | {
621 | "cell_type": "code",
622 | "execution_count": 51,
623 | "metadata": {
624 | "collapsed": true
625 | },
626 | "outputs": [],
627 | "source": [
628 | "from sklearn.model_selection import GridSearchCV\n",
629 | "from sklearn.metrics import classification_report"
630 | ]
631 | },
632 | {
633 | "cell_type": "code",
634 | "execution_count": 59,
635 | "metadata": {},
636 | "outputs": [
637 | {
638 | "name": "stdout",
639 | "output_type": "stream",
640 | "text": [
641 | "# Tuning hyper-parameters for roc_auc\n",
642 | "\n",
643 | "Best parameters set found on development set:\n",
644 | "\n",
645 | "{'max_depth': 6, 'n_estimators': 100}\n",
646 | "\n",
647 | "Grid scores on development set:\n",
648 | "\n",
649 | "0.684 (+/-0.022) for {'max_depth': 3, 'n_estimators': 50}\n",
650 | "0.684 (+/-0.022) for {'max_depth': 3, 'n_estimators': 100}\n",
651 | "0.687 (+/-0.018) for {'max_depth': 4, 'n_estimators': 50}\n",
652 | "0.687 (+/-0.022) for {'max_depth': 4, 'n_estimators': 100}\n",
653 | "0.687 (+/-0.016) for {'max_depth': 5, 'n_estimators': 50}\n",
654 | "0.690 (+/-0.021) for {'max_depth': 5, 'n_estimators': 100}\n",
655 | "0.691 (+/-0.022) for {'max_depth': 6, 'n_estimators': 50}\n",
656 | "0.692 (+/-0.020) for {'max_depth': 6, 'n_estimators': 100}\n",
657 | "\n",
658 | "Detailed classification report:\n",
659 | "\n",
660 | "The model is trained on the full development set.\n",
661 | "The scores are computed on the full evaluation set.\n",
662 | "\n",
663 | "AUC: 0.630219677953\n",
664 | " precision recall f1-score support\n",
665 | "\n",
666 | " 0 0.63 0.71 0.67 807\n",
667 | " 1 0.64 0.55 0.59 739\n",
668 | "\n",
669 | "avg / total 0.63 0.63 0.63 1546\n",
670 | "\n",
671 | "\n",
672 | "1 loop, best of 1: 22.9 s per loop\n"
673 | ]
674 | }
675 | ],
676 | "source": [
677 | "%%timeit -n1 -r1\n",
678 | "\n",
679 | "# Set the parameters by cross-validation\n",
680 | "tuned_parameters = [{'n_estimators': [50,100], \n",
681 | " 'max_depth': [3, 4, 5, 6]\n",
682 | " }]\n",
683 | "\n",
684 | "scores = ['roc_auc']\n",
685 | "\n",
686 | "for score in scores:\n",
687 | " print(\"# Tuning hyper-parameters for %s\" % score)\n",
688 | " print()\n",
689 | "\n",
690 | " clf = GridSearchCV(RandomForestClassifier(n_jobs=-1), \n",
691 | " tuned_parameters, cv=5,\n",
692 | " scoring='%s' % score)\n",
693 | " clf\n",
694 | " clf.fit(X_train_updated, y_train)\n",
695 | "\n",
696 | " print(\"Best parameters set found on development set:\")\n",
697 | " print()\n",
698 | " print(clf.best_params_)\n",
699 | " print()\n",
700 | " print(\"Grid scores on development set:\")\n",
701 | " print()\n",
702 | " means = clf.cv_results_['mean_test_score']\n",
703 | " stds = clf.cv_results_['std_test_score']\n",
704 | " for mean, std, params in zip(means, stds, clf.cv_results_['params']):\n",
705 | " print(\"%0.3f (+/-%0.03f) for %r\"\n",
706 | " % (mean, std * 2, params))\n",
707 | " print()\n",
708 | "\n",
709 | " print(\"Detailed classification report:\")\n",
710 | " print()\n",
711 | " print(\"The model is trained on the full development set.\")\n",
712 | " print(\"The scores are computed on the full evaluation set.\")\n",
713 | " print()\n",
714 | " y_true, y_pred = y_test, clf.predict(X_test_updated)\n",
715 | " \n",
716 | " false_positive_rate, true_positive_rate, thresholds = roc_curve(y_true, y_pred)\n",
717 | " roc_auc = auc(false_positive_rate, true_positive_rate)\n",
718 | " print(\"AUC:\", roc_auc)\n",
719 | " \n",
720 | " print(classification_report(y_true, y_pred))\n",
721 | " print()\n"
722 | ]
723 | },
724 | {
725 | "cell_type": "markdown",
726 | "metadata": {},
727 | "source": [
728 | "**Exercise**\n",
729 | "\n",
730 | "- For `max_depth` include - 6, 10\n",
731 | "- Add `min_samples_split`, `min_samples_leaf` to the grid search\n",
732 | "- In addition to `roc_auc`, add `precision` and `recall` "
733 | ]
734 | },
735 | {
736 | "cell_type": "code",
737 | "execution_count": null,
738 | "metadata": {
739 | "collapsed": true
740 | },
741 | "outputs": [],
742 | "source": []
743 | },
744 | {
745 | "cell_type": "markdown",
746 | "metadata": {},
747 | "source": [
748 | "**Challenges with `grid_search`**\n",
749 | "\n",
750 | "Discuss"
751 | ]
752 | },
753 | {
754 | "cell_type": "code",
755 | "execution_count": null,
756 | "metadata": {
757 | "collapsed": true
758 | },
759 | "outputs": [],
760 | "source": []
761 | },
762 | {
763 | "cell_type": "markdown",
764 | "metadata": {},
765 | "source": [
766 | "### Randomized Search"
767 | ]
768 | },
769 | {
770 | "cell_type": "code",
771 | "execution_count": 56,
772 | "metadata": {
773 | "collapsed": true
774 | },
775 | "outputs": [],
776 | "source": [
777 | "from sklearn.model_selection import RandomizedSearchCV\n",
778 | "from scipy.stats import randint as sp_randint"
779 | ]
780 | },
781 | {
782 | "cell_type": "code",
783 | "execution_count": 60,
784 | "metadata": {},
785 | "outputs": [
786 | {
787 | "name": "stdout",
788 | "output_type": "stream",
789 | "text": [
790 | "# Tuning hyper-parameters for roc_auc\n",
791 | "\n",
792 | "Best parameters set found on development set:\n",
793 | "\n",
794 | "{'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 3, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 50}\n",
795 | "\n",
796 | "Grid scores on development set:\n",
797 | "\n",
798 | "0.702 (+/-0.024) for {'bootstrap': True, 'criterion': 'gini', 'max_depth': None, 'max_features': 7, 'min_samples_leaf': 5, 'min_samples_split': 4, 'n_estimators': 100}\n",
799 | "0.686 (+/-0.020) for {'bootstrap': True, 'criterion': 'gini', 'max_depth': 4, 'max_features': 5, 'min_samples_leaf': 7, 'min_samples_split': 7, 'n_estimators': 50}\n",
800 | "0.687 (+/-0.016) for {'bootstrap': True, 'criterion': 'gini', 'max_depth': 4, 'max_features': 8, 'min_samples_leaf': 9, 'min_samples_split': 2, 'n_estimators': 100}\n",
801 | "0.685 (+/-0.018) for {'bootstrap': False, 'criterion': 'gini', 'max_depth': 4, 'max_features': 8, 'min_samples_leaf': 7, 'min_samples_split': 2, 'n_estimators': 100}\n",
802 | "0.690 (+/-0.019) for {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 6, 'max_features': 4, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}\n",
803 | "0.685 (+/-0.019) for {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 3, 'max_features': 6, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 50}\n",
804 | "0.703 (+/-0.024) for {'bootstrap': True, 'criterion': 'entropy', 'max_depth': None, 'max_features': 1, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 100}\n",
805 | "0.682 (+/-0.023) for {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 3, 'max_features': 10, 'min_samples_leaf': 3, 'min_samples_split': 10, 'n_estimators': 100}\n",
806 | "0.697 (+/-0.023) for {'bootstrap': True, 'criterion': 'entropy', 'max_depth': None, 'max_features': 9, 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 50}\n",
807 | "0.684 (+/-0.018) for {'bootstrap': False, 'criterion': 'gini', 'max_depth': 3, 'max_features': 7, 'min_samples_leaf': 8, 'min_samples_split': 4, 'n_estimators': 100}\n",
808 | "0.686 (+/-0.019) for {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 4, 'max_features': 5, 'min_samples_leaf': 7, 'min_samples_split': 7, 'n_estimators': 50}\n",
809 | "0.692 (+/-0.019) for {'bootstrap': True, 'criterion': 'gini', 'max_depth': 6, 'max_features': 8, 'min_samples_leaf': 9, 'min_samples_split': 3, 'n_estimators': 100}\n",
810 | "0.693 (+/-0.020) for {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 6, 'max_features': 4, 'min_samples_leaf': 7, 'min_samples_split': 8, 'n_estimators': 100}\n",
811 | "0.692 (+/-0.018) for {'bootstrap': True, 'criterion': 'gini', 'max_depth': 6, 'max_features': 5, 'min_samples_leaf': 7, 'min_samples_split': 5, 'n_estimators': 100}\n",
812 | "0.703 (+/-0.021) for {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 2, 'min_samples_leaf': 6, 'min_samples_split': 6, 'n_estimators': 50}\n",
813 | "0.692 (+/-0.019) for {'bootstrap': True, 'criterion': 'gini', 'max_depth': 6, 'max_features': 8, 'min_samples_leaf': 3, 'min_samples_split': 10, 'n_estimators': 100}\n",
814 | "0.689 (+/-0.020) for {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 6, 'max_features': 6, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 50}\n",
815 | "0.701 (+/-0.022) for {'bootstrap': True, 'criterion': 'entropy', 'max_depth': None, 'max_features': 4, 'min_samples_leaf': 8, 'min_samples_split': 2, 'n_estimators': 100}\n",
816 | "0.704 (+/-0.026) for {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 3, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 50}\n",
817 | "0.687 (+/-0.020) for {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 4, 'max_features': 2, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}\n",
818 | "\n",
819 | "Detailed classification report:\n",
820 | "\n",
821 | "The model is trained on the full development set.\n",
822 | "The scores are computed on the full evaluation set.\n",
823 | "\n",
824 | "1 loop, best of 1: 30.7 s per loop\n"
825 | ]
826 | }
827 | ],
828 | "source": [
829 | "%%timeit -n1 -r1\n",
830 | "\n",
831 | "# Set the parameters by cross-validation\n",
832 | "tuned_parameters = { \"n_estimators\": [50,100], \n",
833 | " \"max_depth\": [3, 4, 6, None],\n",
834 | " \"max_features\": sp_randint(1, 11),\n",
835 | " \"min_samples_split\": sp_randint(2, 11),\n",
836 | " \"min_samples_leaf\": sp_randint(1, 11),\n",
837 | " \"bootstrap\": [True, False],\n",
838 | " \"criterion\": [\"gini\", \"entropy\"]\n",
839 | " }\n",
840 | "\n",
841 | "scores = ['roc_auc']\n",
842 | "\n",
843 | "\n",
844 | "n_iter_search = 20\n",
845 | "\n",
846 | "for score in scores:\n",
847 | " print(\"# Tuning hyper-parameters for %s\" % score)\n",
848 | " print()\n",
849 | "\n",
850 | " clf = RandomizedSearchCV(RandomForestClassifier(n_jobs=-1), \n",
851 | " param_distributions = tuned_parameters, \n",
852 | " n_iter = n_iter_search,\n",
853 | " n_jobs=-1,\n",
854 | " cv=5,\n",
855 | " scoring='%s' % score)\n",
856 | " clf.fit(X_train_updated, y_train)\n",
857 | "\n",
858 | " print(\"Best parameters set found on development set:\")\n",
859 | " print()\n",
860 | " print(clf.best_params_)\n",
861 | " print()\n",
862 | " print(\"Grid scores on development set:\")\n",
863 | " print()\n",
864 | " means = clf.cv_results_['mean_test_score']\n",
865 | " stds = clf.cv_results_['std_test_score']\n",
866 | " for mean, std, params in zip(means, stds, clf.cv_results_['params']):\n",
867 | " print(\"%0.3f (+/-%0.03f) for %r\"\n",
868 | " % (mean, std * 2, params))\n",
869 | " print()\n",
870 | "\n",
871 | " print(\"Detailed classification report:\")\n",
872 | " print()\n",
873 | " print(\"The model is trained on the full development set.\")\n",
874 | " print(\"The scores are computed on the full evaluation set.\")\n",
875 | " print()\n",
876 | " y_true, y_pred = y_test, clf.predict(X_test_updated)\n",
877 | " \n",
878 | " #false_positive_rate, true_positive_rate, thresholds = roc_curve(y_true, y_pred)\n",
879 | " #roc_auc = auc(false_positive_rate, true_positive_rate)\n",
880 | " #print(\"AUC:\", roc_auc)\n",
881 | " \n",
882 | " #print(classification_report(y_true, y_pred))\n",
883 | " #print()\n"
884 | ]
885 | },
886 | {
887 | "cell_type": "code",
888 | "execution_count": null,
889 | "metadata": {
890 | "collapsed": true
891 | },
892 | "outputs": [],
893 | "source": []
894 | }
895 | ],
896 | "metadata": {
897 | "anaconda-cloud": {},
898 | "kernelspec": {
899 | "display_name": "Python 3",
900 | "language": "python",
901 | "name": "python3"
902 | },
903 | "language_info": {
904 | "codemirror_mode": {
905 | "name": "ipython",
906 | "version": 3
907 | },
908 | "file_extension": ".py",
909 | "mimetype": "text/x-python",
910 | "name": "python",
911 | "nbconvert_exporter": "python",
912 | "pygments_lexer": "ipython3",
913 | "version": "3.7.1"
914 | }
915 | },
916 | "nbformat": 4,
917 | "nbformat_minor": 2
918 | }
919 |
--------------------------------------------------------------------------------
/Module-05a-ML-Pipeline.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Machine Learning Supervised Pipeline"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Frame\n",
15 | "\n",
16 | "Supervised Learning - Regression\n",
17 | "\n",
18 | "- `y`: Predict Sale Price\n",
19 | "- `X`: Features about the house\n",
20 | "- `score`: Mean Squared Error"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 5,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "import numpy as np\n",
30 | "import pandas as pd\n",
31 | "import matplotlib.pyplot as plt\n",
32 | "plt.style.use(\"ggplot\")"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 3,
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "%matplotlib inline"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {},
47 | "source": [
48 | "## Acquire"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 41,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "data = pd.read_csv(\"http://bit.do/df-housing\")"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 44,
63 | "metadata": {},
64 | "outputs": [],
65 | "source": [
66 | "df = data[['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt', 'FireplaceQu', 'LotFrontage']]"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": 49,
72 | "metadata": {},
73 | "outputs": [
74 | {
75 | "data": {
76 | "text/html": [
77 | "\n",
78 | "\n",
91 | "
\n",
92 | " \n",
93 | " \n",
94 | " | \n",
95 | " SalePrice | \n",
96 | " OverallQual | \n",
97 | " GrLivArea | \n",
98 | " GarageCars | \n",
99 | " TotalBsmtSF | \n",
100 | " FullBath | \n",
101 | " YearBuilt | \n",
102 | " FireplaceQu | \n",
103 | " LotFrontage | \n",
104 | "
\n",
105 | " \n",
106 | " \n",
107 | " \n",
108 | " 0 | \n",
109 | " 208500 | \n",
110 | " 7 | \n",
111 | " 1710 | \n",
112 | " 2 | \n",
113 | " 856 | \n",
114 | " 2 | \n",
115 | " 2003 | \n",
116 | " NaN | \n",
117 | " 65.0 | \n",
118 | "
\n",
119 | " \n",
120 | " 1 | \n",
121 | " 181500 | \n",
122 | " 6 | \n",
123 | " 1262 | \n",
124 | " 2 | \n",
125 | " 1262 | \n",
126 | " 2 | \n",
127 | " 1976 | \n",
128 | " TA | \n",
129 | " 80.0 | \n",
130 | "
\n",
131 | " \n",
132 | " 2 | \n",
133 | " 223500 | \n",
134 | " 7 | \n",
135 | " 1786 | \n",
136 | " 2 | \n",
137 | " 920 | \n",
138 | " 2 | \n",
139 | " 2001 | \n",
140 | " TA | \n",
141 | " 68.0 | \n",
142 | "
\n",
143 | " \n",
144 | " 3 | \n",
145 | " 140000 | \n",
146 | " 7 | \n",
147 | " 1717 | \n",
148 | " 3 | \n",
149 | " 756 | \n",
150 | " 1 | \n",
151 | " 1915 | \n",
152 | " Gd | \n",
153 | " 60.0 | \n",
154 | "
\n",
155 | " \n",
156 | " 4 | \n",
157 | " 250000 | \n",
158 | " 8 | \n",
159 | " 2198 | \n",
160 | " 3 | \n",
161 | " 1145 | \n",
162 | " 2 | \n",
163 | " 2000 | \n",
164 | " TA | \n",
165 | " 84.0 | \n",
166 | "
\n",
167 | " \n",
168 | "
\n",
169 | "
"
170 | ],
171 | "text/plain": [
172 | " SalePrice OverallQual GrLivArea GarageCars TotalBsmtSF FullBath \\\n",
173 | "0 208500 7 1710 2 856 2 \n",
174 | "1 181500 6 1262 2 1262 2 \n",
175 | "2 223500 7 1786 2 920 2 \n",
176 | "3 140000 7 1717 3 756 1 \n",
177 | "4 250000 8 2198 3 1145 2 \n",
178 | "\n",
179 | " YearBuilt FireplaceQu LotFrontage \n",
180 | "0 2003 NaN 65.0 \n",
181 | "1 1976 TA 80.0 \n",
182 | "2 2001 TA 68.0 \n",
183 | "3 1915 Gd 60.0 \n",
184 | "4 2000 TA 84.0 "
185 | ]
186 | },
187 | "execution_count": 49,
188 | "metadata": {},
189 | "output_type": "execute_result"
190 | }
191 | ],
192 | "source": [
193 | "df.head()"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": 56,
199 | "metadata": {},
200 | "outputs": [],
201 | "source": [
202 | "X_raw = df.drop('SalePrice', axis=1)\n",
203 | "y_raw = df.SalePrice"
204 | ]
205 | },
206 | {
207 | "cell_type": "markdown",
208 | "metadata": {},
209 | "source": [
210 | "# Pipeline\n",
211 | "\n",
212 | "- Refine\n",
213 | "- Transform"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": 59,
219 | "metadata": {},
220 | "outputs": [],
221 | "source": [
222 | "from sklearn.pipeline import Pipeline\n",
223 | "from sklearn.compose import ColumnTransformer\n",
224 | "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
225 | "from sklearn.impute import SimpleImputer"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": 73,
231 | "metadata": {},
232 | "outputs": [],
233 | "source": [
234 | "cat_cols = ['FireplaceQu']\n",
235 | "num_cols = ['GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt', \"OverallQual\" ,'LotFrontage']"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": 74,
241 | "metadata": {},
242 | "outputs": [],
243 | "source": [
244 | "cat_si_step = ('si', SimpleImputer(strategy='constant', fill_value='MISSING'))\n",
245 | "cat_ohe_step = ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))\n",
246 | "cat_steps = [cat_si_step, cat_ohe_step]\n",
247 | "cat_pipe = Pipeline(cat_steps)\n",
248 | "cat_transformers = [('cat', cat_pipe, cat_cols)]"
249 | ]
250 | },
251 | {
252 | "cell_type": "code",
253 | "execution_count": 75,
254 | "metadata": {},
255 | "outputs": [],
256 | "source": [
257 | "num_si_step = ('si', SimpleImputer(strategy='median'))\n",
258 | "num_ss_step = ('ss', StandardScaler())\n",
259 | "num_steps = [num_si_step, num_ss_step]\n",
260 | "num_pipe = Pipeline(num_steps)\n",
261 | "num_transformers = [('num', num_pipe, num_cols)]"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": 76,
267 | "metadata": {},
268 | "outputs": [
269 | {
270 | "data": {
271 | "text/plain": [
272 | "(1460, 13)"
273 | ]
274 | },
275 | "execution_count": 76,
276 | "metadata": {},
277 | "output_type": "execute_result"
278 | }
279 | ],
280 | "source": [
281 | "transformers = [('cat', cat_pipe, cat_cols),\n",
282 | " ('num', num_pipe, num_cols)]\n",
283 | "ct = ColumnTransformer(transformers=transformers)\n",
284 | "X_encoded = ct.fit_transform(X_raw)\n",
285 | "X_encoded.shape"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": 78,
291 | "metadata": {},
292 | "outputs": [],
293 | "source": [
294 | "from sklearn.linear_model import Ridge"
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": 79,
300 | "metadata": {},
301 | "outputs": [
302 | {
303 | "data": {
304 | "text/plain": [
305 | "Pipeline(memory=None,\n",
306 | " steps=[('transform', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,\n",
307 | " transformer_weights=None,\n",
308 | " transformers=[('cat', Pipeline(memory=None,\n",
309 | " steps=[('si', SimpleImputer(copy=True, fill_value='MISSING', missing_values=nan,\n",
310 | " strategy='constant', verbos...it_intercept=True, max_iter=None,\n",
311 | " normalize=False, random_state=None, solver='auto', tol=0.001))])"
312 | ]
313 | },
314 | "execution_count": 79,
315 | "metadata": {},
316 | "output_type": "execute_result"
317 | }
318 | ],
319 | "source": [
320 | "ml_pipe = Pipeline([('transform', ct), ('ridge', Ridge())])\n",
321 | "ml_pipe.fit(X_raw, y_raw)"
322 | ]
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": 81,
327 | "metadata": {},
328 | "outputs": [
329 | {
330 | "data": {
331 | "text/plain": [
332 | "0.7772097032829757"
333 | ]
334 | },
335 | "execution_count": 81,
336 | "metadata": {},
337 | "output_type": "execute_result"
338 | }
339 | ],
340 | "source": [
341 | "ml_pipe.score(X_raw, y_raw)"
342 | ]
343 | },
344 | {
345 | "cell_type": "code",
346 | "execution_count": 83,
347 | "metadata": {},
348 | "outputs": [
349 | {
350 | "data": {
351 | "text/plain": [
352 | "0.7550365007527828"
353 | ]
354 | },
355 | "execution_count": 83,
356 | "metadata": {},
357 | "output_type": "execute_result"
358 | }
359 | ],
360 | "source": [
361 | "from sklearn.model_selection import KFold, cross_val_score\n",
362 | "kf = KFold(n_splits=5, shuffle=True, random_state=123)\n",
363 | "cross_val_score(ml_pipe, X_raw, y_raw, cv=kf).mean()"
364 | ]
365 | },
366 | {
367 | "cell_type": "code",
368 | "execution_count": 85,
369 | "metadata": {},
370 | "outputs": [
371 | {
372 | "data": {
373 | "text/plain": [
374 | "{'ridge__alpha': 50, 'transform__num__si__strategy': 'median'}"
375 | ]
376 | },
377 | "execution_count": 85,
378 | "metadata": {},
379 | "output_type": "execute_result"
380 | }
381 | ],
382 | "source": [
383 | "from sklearn.model_selection import GridSearchCV\n",
384 | "param_grid = {\n",
385 | " 'transform__num__si__strategy': ['mean', 'median'],\n",
386 | " 'ridge__alpha': [.001, 0.1, 1.0, 5, 10, 50, 100, 1000],\n",
387 | " }\n",
388 | "gs = GridSearchCV(ml_pipe, param_grid, cv=kf)\n",
389 | "gs.fit(X_raw, y_raw)\n",
390 | "gs.best_params_"
391 | ]
392 | },
393 | {
394 | "cell_type": "code",
395 | "execution_count": null,
396 | "metadata": {},
397 | "outputs": [],
398 | "source": []
399 | }
400 | ],
401 | "metadata": {
402 | "kernelspec": {
403 | "display_name": "Python 3",
404 | "language": "python",
405 | "name": "python3"
406 | },
407 | "language_info": {
408 | "codemirror_mode": {
409 | "name": "ipython",
410 | "version": 3
411 | },
412 | "file_extension": ".py",
413 | "mimetype": "text/x-python",
414 | "name": "python",
415 | "nbconvert_exporter": "python",
416 | "pygments_lexer": "ipython3",
417 | "version": "3.6.6"
418 | }
419 | },
420 | "nbformat": 4,
421 | "nbformat_minor": 2
422 | }
423 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Applied Machine Learning
2 |
3 | Workshop information @ http://amitkaps.com/ml
4 |
5 | - [Curriculum](curriculum.md) - The scope of the workshop.
6 | - [Installation](installation.md) - To get the software installed for the workshop.
7 | - [Pre-requisites](pre-requisites.md) - To get yourself ready for the workshop.
8 | - [Schedule](schedule.md) - The broad schedule for the workshop
9 | - [Overview](overview.md) - The overview presentation for the workshop.
10 |
--------------------------------------------------------------------------------
/curriculum.md:
--------------------------------------------------------------------------------
1 | # Curriculum
2 |
3 | ## Module 0: Introduction
4 | - What is Machine Learning
5 | - Types of ML: Supervised, Unsupervised, Reinforcement
6 | - Types of ML problems: Regression, Classification
7 |
8 | ## Module 1: Linear Models
9 | - Linear Regression
10 | - Logistic Regression
11 |
12 | ## Module 2: Model Evaluation
13 | - Training and Validation
14 | - Model Evaluation Metrics - Accuracy, RMSE, ROC, AUC, Confusion Matrix, Precision, Recall, F1 Score
15 | - Overfitting and Bias-Variance trade-off
16 | - Regularization (L1/L2)
17 | - K-fold Cross Validation
18 |
19 | ## Module 3: Tree-based Models
20 | - Decision Trees
21 | - Bagging and Boosting
22 | - Random Forest
23 | - Gradient Boosting Machines
24 | - Feature Importance
25 |
26 | ## Module 4: Model Selection
27 | - Model Pipelines
28 | - Feature Engineering
29 | - Ensemble Models (Advanced)
30 | - Unbalanced Classes (Advanced)
31 |
--------------------------------------------------------------------------------
/data/cars_small.csv:
--------------------------------------------------------------------------------
1 | brand,model,price,kmpl,bhp,type
2 | Chevrolet,Beat,421,18.6,79,Hatchback
3 | Chevrolet,Sail,551,18.2,82,Sedan
4 | Chevrolet,Sail Hatchback,468,18.2,82,Hatchback
5 | Chevrolet,Spark,345,16.2,62,Hatchback
6 | Fiat,Linea Classic,612,14.9,89,Sedan
7 | Fiat,Linea,700,15.7,112,Sedan
8 | Fiat,Punto Evo,499,15.8,67,Hatchback
9 | Ford,Classic,506,14.1,100,Sedan
10 | Ford,Figo,414,15.3,70,Hatchback
11 | Honda,Amaze,519,18,87,Sedan
12 | Honda,Brio,421,19.4,87,Hatchback
13 | Hyundai,EON,302,21.1,55,Hatchback
14 | Hyundai,i10,418,19.8,68,Hatchback
15 | Hyundai,i20,523,18.6,82,Hatchback
16 | Hyundai,Verna,774,17.4,106,Sedan
17 | Hyundai,Xcent,496,19.1,82,Sedan
18 | Suzuki,Alto,315,24.1,67,Hatchback
19 | Suzuki,Alto 800,248,22.7,47,Hatchback
20 | Suzuki,Celerio,392,23.1,67,Hatchback
21 | Suzuki,Ciaz,725,20.7,91,Sedan
22 | Suzuki,Estilo,338,19,67,Hatchback
23 | Suzuki,Ritz,442,18.5,86,Hatchback
24 | Suzuki,Swift,462,20.4,83,Hatchback
25 | Suzuki,Swift DZire,508,19.1,86,Sedan
26 | Suzuki,SX4,715,16.5,103,Sedan
27 | Suzuki,Wagon-R,363,20.5,67,Hatchback
28 | Nissan,Datsun GO,312,20.6,67,Hatchback
29 | Nissan,Micra,413,19.5,67,Hatchback
30 | Nissan,Sunny,699,16.9,98,Sedan
31 | Renault,Pulse,446,18,74,Hatchback
32 | Renault,Scala,724,16.9,98,Sedan
33 | San,Storm,595,16,59,Sedan
34 | Skoda,Fabia,503,16.4,75,Hatchback
35 | Skoda,Rapid,756,15,104,Sedan
36 | Tata,Indigo,499,14,65,Sedan
37 | Tata,Nano,199,23.9,38,Hatchback
38 | Tata,Zest,481,17.6,89,Sedan
39 | Toyota,Etios,603,16.8,89,Sedan
40 | Toyota,Etios Liva,500,17.7,79,Hatchback
41 | Volkswagen,Polo,535,16.5,74,Hatchback
42 | Volkswagen,Up,360,21,74,Hatchback
43 | Volkswagen,Vento,785,16.1,104,Sedan
--------------------------------------------------------------------------------
/data/creditRisk-tree.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amitkaps/applied-machine-learning/9752d1cd19a530b36c81632c291a6607ab3049d3/data/creditRisk-tree.xlsx
--------------------------------------------------------------------------------
/data/creditRisk.csv:
--------------------------------------------------------------------------------
1 | Income,Credit History,Risk
2 | 0,Unknown,High
3 | 0,Bad,High
4 | 0,Good,High
5 | 14000,Unknown,High
6 | 14000,Bad,High
7 | 14000,Good,High
8 | 16000,Unknown,Moderate
9 | 16000,Bad,High
10 | 16000,Good,Moderate
11 | 34000,Unknown,Moderate
12 | 34000,Bad,High
13 | 34000,Good,Moderate
14 | 36000,Unknown,Low
15 | 36000,Bad,Moderate
16 | 36000,Good,Low
17 | 70000,Unknown,Low
18 | 70000,Bad,Moderate
19 | 70000,Good,Low
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: workshop
2 | dependencies:
3 | - jupyterlab
4 | - numpy
5 | - pandas
6 | - scikit-learn
7 | - joblib
8 | - matplotlib
9 | - plotnine
10 | - seaborn
11 | - altair
12 | - graphviz
13 | - pip:
14 | - modelvis
15 | - yellowbrick
--------------------------------------------------------------------------------
/img/bias_variance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amitkaps/applied-machine-learning/9752d1cd19a530b36c81632c291a6607ab3049d3/img/bias_variance.png
--------------------------------------------------------------------------------
/img/confusion_matrix.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amitkaps/applied-machine-learning/9752d1cd19a530b36c81632c291a6607ab3049d3/img/confusion_matrix.jpg
--------------------------------------------------------------------------------
/img/confusion_matrix2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amitkaps/applied-machine-learning/9752d1cd19a530b36c81632c291a6607ab3049d3/img/confusion_matrix2.png
--------------------------------------------------------------------------------
/img/cross_validation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amitkaps/applied-machine-learning/9752d1cd19a530b36c81632c291a6607ab3049d3/img/cross_validation.png
--------------------------------------------------------------------------------
/img/generalisation_error.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amitkaps/applied-machine-learning/9752d1cd19a530b36c81632c291a6607ab3049d3/img/generalisation_error.png
--------------------------------------------------------------------------------
/img/linear_models.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amitkaps/applied-machine-learning/9752d1cd19a530b36c81632c291a6607ab3049d3/img/linear_models.png
--------------------------------------------------------------------------------
/img/logistic-curve.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amitkaps/applied-machine-learning/9752d1cd19a530b36c81632c291a6607ab3049d3/img/logistic-curve.png
--------------------------------------------------------------------------------
/img/logistic_regression.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amitkaps/applied-machine-learning/9752d1cd19a530b36c81632c291a6607ab3049d3/img/logistic_regression.png
--------------------------------------------------------------------------------
/img/model_complexity.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amitkaps/applied-machine-learning/9752d1cd19a530b36c81632c291a6607ab3049d3/img/model_complexity.png
--------------------------------------------------------------------------------
/img/model_complexity_error.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amitkaps/applied-machine-learning/9752d1cd19a530b36c81632c291a6607ab3049d3/img/model_complexity_error.png
--------------------------------------------------------------------------------
/img/model_selection.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amitkaps/applied-machine-learning/9752d1cd19a530b36c81632c291a6607ab3049d3/img/model_selection.png
--------------------------------------------------------------------------------
/img/overfitting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amitkaps/applied-machine-learning/9752d1cd19a530b36c81632c291a6607ab3049d3/img/overfitting.png
--------------------------------------------------------------------------------
/img/precision_recall.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amitkaps/applied-machine-learning/9752d1cd19a530b36c81632c291a6607ab3049d3/img/precision_recall.png
--------------------------------------------------------------------------------
/img/random_forest.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amitkaps/applied-machine-learning/9752d1cd19a530b36c81632c291a6607ab3049d3/img/random_forest.png
--------------------------------------------------------------------------------
/img/regression_error.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amitkaps/applied-machine-learning/9752d1cd19a530b36c81632c291a6607ab3049d3/img/regression_error.png
--------------------------------------------------------------------------------
/img/regularization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amitkaps/applied-machine-learning/9752d1cd19a530b36c81632c291a6607ab3049d3/img/regularization.png
--------------------------------------------------------------------------------
/img/roc-curves.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amitkaps/applied-machine-learning/9752d1cd19a530b36c81632c291a6607ab3049d3/img/roc-curves.png
--------------------------------------------------------------------------------
/img/simple_complex.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amitkaps/applied-machine-learning/9752d1cd19a530b36c81632c291a6607ab3049d3/img/simple_complex.png
--------------------------------------------------------------------------------
/img/tree_titanic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amitkaps/applied-machine-learning/9752d1cd19a530b36c81632c291a6607ab3049d3/img/tree_titanic.png
--------------------------------------------------------------------------------
/img/validation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amitkaps/applied-machine-learning/9752d1cd19a530b36c81632c291a6607ab3049d3/img/validation.png
--------------------------------------------------------------------------------
/installation.md:
--------------------------------------------------------------------------------
1 | # Installation
2 |
3 | ## Hardware pre-requisites
4 | * Laptop running Linux / OSX / Windows operating system
5 | * Minimum 4GB of RAM
6 | * Laptop charger
7 |
8 | ## Recommended Installation
9 | We will be using **Python 3** for the workshop. Users should install the Anaconda distribution from Continuum - [https://www.continuum.io/downloads](https://www.continuum.io/downloads). We request you to check if your Operating System is 32 bit or 64 bit and download the corresponding Anaconda installer
10 |
11 | To find out what architecture your OS is, here are references for Windows, OSX and Ubuntu:
12 | - Windows and OSX: [http://www.akaipro.com/kb/article/1616#os_32_or_64_bit](http://www.akaipro.com/kb/article/1616#os_32_or_64_bit)
13 | - Ubuntu: [http://askubuntu.com/a/41334](http://askubuntu.com/a/41334)
14 |
15 | Please note that installing Anaconda is the **recommended** option for doing the workshop.
16 |
17 | ## Post Anaconda Installation
18 |
19 | Please run the below commands from your command line interface
20 |
21 | ```conda install seaborn```
22 |
23 | ```pip install pydotplus```
24 |
--------------------------------------------------------------------------------
/modelvis-local.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import matplotlib.pyplot as plt
4 | from sklearn import tree
5 | from sklearn.tree import DecisionTreeClassifier
6 |
7 | def plot_classifier_2d(clf, data, target):
8 | x_min, x_max = data.iloc[:,0].min(), data.iloc[:,0].max()
9 | y_min, y_max = data.iloc[:,1].min(), data.iloc[:,1].max()
10 | xx, yy = np.meshgrid(np.arange(x_min, x_max, (x_max - x_min)/100), np.arange(y_min, y_max, (y_max - y_min)/100))
11 | Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:,0]
12 | Z = Z.reshape(xx.shape)
13 | cs = plt.contourf(xx, yy, Z, cmap=plt.cm.magma, alpha = 0.5)
14 | plt.scatter(x = data.iloc[:,0], y = data.iloc[:,1], c = target, s = 50, alpha = 0.3)
15 | plt.colorbar(cs)
16 |
17 | if __name__ == "__main__":
18 | print("welcome to model visualisation")
19 |
20 |
--------------------------------------------------------------------------------
/outline.md:
--------------------------------------------------------------------------------
1 | theme: Olive Green, 8
2 | autoscale: true
3 |
4 | # [fit] **Applied Machine Learning**
5 |
6 |
7 |
8 | *Amit Kapoor*
9 | @amitkaps
10 |
11 |
12 | *Bargava Subramanian*
13 | @bargava
14 |
15 | ---
16 |
17 | # **Getting Started**
18 | - Download the Repo: [https://github.com/amitkaps/applied-machine-learning](https://github.com/amitkaps/applied-machine-learning)
19 | - Finish installation
20 | - Run `jupyter notebook` in the console
21 |
22 | ---
23 |
24 | # **Schedule**
25 |
26 | 0900 - 0930: Breakfast
27 | 0930 - 1115: **Session 1** - *Conceptual*
28 | 1115 - 1130: Tea Break
29 | 1130 - 1315: **Session 2** - *Coding*
30 | 1315 - 1400: Lunch
31 | 1400 - 1530: **Session 3** - *Conceptual*
32 | 1530 - 1545: Tea Break
33 | 1545 - 1700: **Session 4** - *Coding*
34 |
35 | ---
36 |
37 | # **Data-Driven Lens**
38 |
39 | > "Data is a clue to the End Truth"
40 | -- Josh Smith
41 |
42 | ---
43 |
44 | # **Metaphor**
45 |
46 | - A start-up providing loans to the consumer
47 | - Running for the last few years
48 | - Now planning to adopt a data-driven lens
49 |
50 | What are the **type of questions** you can ask?
51 |
52 | ---
53 |
54 | # **Type of Questions**
55 | - What is the trend of loan defaults?
56 | - Do older customers have more loan defaults?
57 | - Which customer is likely to have a loan default?
58 | - Why do customers default on their loan?
59 |
60 | ---
61 |
62 | # **Type of Questions**
63 | - Descriptive
64 | - Inquisitive
65 | - Predictive
66 | - Causal
67 |
68 | ---
69 |
70 | # **Data-driven Analytics**
71 | - **Descriptive**: Understand Pattern, Trends, Outlier
72 | - **Inquisitive**: Conduct Hypothesis Testing
73 | - **Predictive**: Make a prediction
74 | - **Causal**: Establish a causal link
75 |
76 | ---
77 |
78 | # **Prediction Challenge**
79 |
80 | > It’s tough to make predictions, especially about the future.
81 | -- Yogi Berra
82 |
83 | ---
84 |
85 | # **How to make a Prediction?**
86 | - **Human Learning**: Make a *Judgement*
87 | - **Machine Programmed**: Create explicit *Rules*
88 | - **Machine Learning**: Learn from *Data*
89 |
90 | ---
91 |
92 | # **Machine Learning (ML)**
93 |
94 | > [Machine learning is the] field of study that gives computers the ability to learn without being explicitly programmed.
95 | -- *Arthur Samuel*
96 |
97 |
98 |
99 | > Machine learning is the study of computer algorithms that improve automatically through experience
100 | -- *Tom Mitchell*
101 |
102 | ---
103 |
104 | # **Machine Learning: Essence**
105 |
106 | - A pattern exists
107 | - It cannot be pinned down mathematically
108 | - Have data on it to learn from
109 |
110 | > "Use a set of observations (data) to uncover an underlying process"
111 |
112 | ---
113 |
114 | # **Machine Learning**
115 | - **Theory**
116 | - **Paradigms**
117 | - **Models**
118 | - **Methods**
119 | - **Process**
120 |
121 | ---
122 |
123 | # **Applied ML - Approach**
124 | - **Theory**: Understand Key Concepts (Intuition)
125 | - **Paradigms**: Limit to One (Supervised)
126 | - **Models**: Use Two Types (Linear, Trees)
127 | - **Methods**: Apply Key Ones (Validation, Selection)
128 | - **Process**: Code the Approach (Real Examples)
129 |
130 | ---
131 |
132 | # **ML Theory: Data Types**
133 |
134 | - What are the types of data on which we are learning?
135 | - Can you give example of say measuring temperature?
136 |
137 | ---
138 |
139 | # **Data Types e.g. Temperature**
140 |
141 | - **Categorical**
142 | - *Nominal*: Burned, Not Burned
143 | - *Ordinal*: Hot, Warm, Cold
144 | - **Continuous**
145 | - *Interval*: 30 °C, 40 °C, 80 °C
146 | - *Ratio*: 30 K, 40 K, 50 K
147 |
148 | ---
149 |
150 | # **Data Types - Operations**
151 |
152 | - **Categorical**
153 | - *Nominal*: = , !=
154 | - *Ordinal*: =, !=, >, <
155 | - **Continuous**
156 | - *Interval*: =, !=, >, <, -, % of diff
157 | - *Ratio*: =, !=, >, <, -, +, %
158 |
159 | ---
160 |
161 | # **Case Example**
162 |
163 | *Context*: Loan Approval
164 |
165 | *Customer Application*
166 | - **age**: age of the applicant
167 | - **income**: annual income of the applicant
168 | - **year**: no. of years of employment
169 | - **ownership**: type of house owned
170 | - **grade**: credit grade for the applicant
171 |
172 | *Question* - How much loan **amount** to approve?
173 |
174 | ---
175 |
176 | # **Historical Data**
177 |
178 | ```
179 | age income years ownership grade amount
180 | --- ------- ----- --------- ------- -------
181 | 31 12252 25.0 RENT C 2400
182 | 24 49200 13.0 RENT C 10000
183 | 28 75000 11.0 OWN B 12000
184 | 27 110000 13.0 MORTGAGE A 3600
185 | 33 24000 10.0 RENT B 5000
186 |
187 | ```
188 |
189 | ---
190 |
191 | # **Data Types**
192 |
193 | - **Categorical**
194 | - *Nominal*: home owner [rent, own, mortgage]
195 | - *Ordinal*: credit grade [A > B > C > D > E]
196 | - **Continuous**
197 | - *Interval*: approval date [20/04/16, 19/11/15]
198 | - *Ratio*: loan amount [3000, 10000]
199 |
200 | ---
201 |
202 | # **ML Terminology**
203 |
204 | **Features**: $$\mathbf{x}$$
205 | - `age`, `income`, `years`, `ownership`, `grade`
206 |
207 |
208 | **Target**: $$y$$
209 | - `amount`
210 |
211 | **Training Data**: $$ (\mathbf{x}_{1}, y_{1}), (\mathbf{x}_{2}, y_{2}) ... (\mathbf{x}_{n}, y_{n}) $$
212 | - historical records
213 |
214 | ---
215 |
216 | # **ML Paradigm: Supervised**
217 |
218 | Given a set of **feature** $$\mathbf{x}$$, to predict the value of **target** $$y$$
219 |
220 | Learning Paradigm: **Supervised**
221 |
222 | - If $$y$$ is *continuous* - **Regression**
223 | - If $$y$$ is *categorical* - **Classification**
224 |
225 | ---
226 |
227 | # **ML Theory: Formulation**
228 | - **Features** $$\mathbf{x}$$ *(customer application)*
229 | - **Target** $$y$$ *(loan amount)*
230 | - **Target Function** $$\mathcal{f}: \mathcal{X} \to \mathcal{y}$$ (ideal formula)
231 | - **Data** $$ (\mathbf{x}_{1}, y_{1}), (\mathbf{x}_{2}, y_{2}) ... (\mathbf{x}_{n}, y_{n}) $$ *(historical records)*
232 | - **Final Hypothesis** $$\mathcal{g}: \mathcal{X} \to \mathcal{y}$$ (formula to use)
233 | - **Hypothesis Set** $$ \mathcal{H} $$ (all possible formulas)
234 | - **Learning Algorithm** $$\mathcal{A}$$ (how to learn the formula)
235 |
236 |
237 | ---
238 |
239 | # **ML Theory: Formulation**
240 |
241 | $$\text{unknown target function}$$
242 | $$\mathcal{f}: \mathcal{X} \to \mathcal{y}$$
243 | $$ | $$
244 | $$\text{training data}$$
245 | $$ (\mathbf{x}_{1}, y_{1}), (\mathbf{x}_{2}, y_{2}) ... (\mathbf{x}_{n}, y_{n}) $$
246 | $$ | $$
247 | $$\text{hypothesis set} \quad \rightarrow \quad \text{learning algorithm} \qquad \qquad \qquad \qquad $$
248 | $$ \mathcal{H} \qquad \qquad \qquad \qquad \qquad \mathcal{A} \qquad \qquad \qquad \qquad \qquad $$
249 | $$ | $$
250 | $$\text{final hypothesis}$$
251 | $$\mathcal{g} \to \mathcal{f}$$
252 |
253 | ---
254 |
255 | # **ML Theory: Learning Model**
256 |
257 | The Learning Model is composed of the two elements
258 |
259 | - The Hypothesis Set: $$ \mathcal{H} = \{\mathcal{h}\} \qquad \mathcal{g} \in \mathcal{H} $$
260 | - Learning Algorithm: $$ \mathcal{A} $$
261 |
262 | ---
263 |
264 | # **ML Theory: Formulation (Simplified)**
265 |
266 | $$\text{unknown target function}$$
267 | $$ y = \mathcal{f}(\mathbf{x})$$
268 | $$ | $$
269 | $$\text{training data}$$
270 | $$ (\mathbf{x}_{1}, y_{1}), (\mathbf{x}_{2}, y_{2}) ... (\mathbf{x}_{n}, y_{n}) $$
271 | $$ | $$
272 | $$\text{hypothesis set} \quad \rightarrow \quad \text{learning algorithm} \qquad \qquad \qquad \qquad $$
273 | $$ \{ \mathcal{h}(\mathbf{x})\} \qquad \qquad \qquad \qquad \mathcal{A} \qquad \qquad \qquad \qquad \qquad $$
274 | $$ | $$
275 | $$\text{final hypothesis}$$
276 | $$\mathcal{g}(\mathbf{x}) \to \mathcal{f}(\mathbf{x})$$
277 |
278 | ---
279 |
280 | # **Linear Algorithms**
281 |
282 | $$ s = \sum_{i=1}^d w_{i} x_{i} $$
283 |
284 |
285 | 
286 |
287 | ---
288 |
289 | # **Simple Hypothesis Set: Linear Regression**
290 |
291 | For $$d$$ features in training data,
292 |
293 | $$ h(\mathbf{x}) = \sum_{i=1}^d w_{i} x_{i} $$
294 |
295 | How do we choose the right $$w_{i}$$?
296 |
297 | ---
298 |
299 | # **Error**
300 |
301 | 
302 |
303 | [^1]: Source - Learning from Data
304 |
305 | ---
306 |
307 | # **Error Measure - MSE**
308 |
309 | How well does $$ h(\mathbf{x}) $$ approximate to $$ f(\mathbf{x}) $$
310 |
311 | We will use squared error $$ {( h(\mathbf{x}) - f(\mathbf{x}))}^2 $$
312 |
313 | $$ E_{in}(h) = \frac{1}{N} \sum_{i=1}^N {( h(\mathbf{x}_i) - y_i)}^2 $$
314 |
315 | ---
316 |
317 | # **Learning Algorithm - Linear Regression**
318 |
319 | - Linear Regression algorithm aims to minimise $$ E_{in}(h)$$
320 | - **One-Step Learning** -> Solves to give $$g(\mathbf{x})$$
321 |
322 | $$g(\mathbf{x}) = \hat{y} $$
323 |
324 |
325 | $$ E_{in}(g) = \frac{1}{N} \sum_{i=1}^N {( \hat{y}_i - y_i)}^2 $$
326 |
327 |
328 | ---
329 |
330 | # **Machine Learning Process**
331 |
332 | - *Frame*: Problem definition
333 | - *Acquire*: Data ingestion
334 | - *Refine*: Data wrangling
335 | - *Transform*: Feature creation
336 | - *Explore*: Feature selection
337 | - *Model*: Model creation & assessment
338 | - *Insight*: Communication
339 |
340 | ---
341 |
342 | # **Frame**
343 |
344 | **Variables**
345 | - `age`, `income`, `years`, `ownership`, `grade`, `amount`, `default` and `interest`
346 |
347 | - What are the **Features**: $$\mathbf{x}$$ ?
348 | - What are the **Target**: $$y$$
349 |
350 | ---
351 |
352 | # **Frame**
353 |
354 | **Features**: $$\mathbf{x}$$
355 | - `age`
356 | - `income`
357 | - `years`
358 | - `ownership`
359 | - `grade`
360 |
361 | **Target**: $$y$$
362 | - `amount` * (1 - `default`)
363 |
364 |
365 | ---
366 |
367 | # **Acquire**
368 |
369 | - Simple! Just read the data from `csv` file
370 |
371 | ---
372 |
373 | # **Refine - Missing Value**
374 |
375 | - **REMOVE** - NAN rows
376 | - **IMPUTATION** - Replace them with something?
377 | - Mean
378 | - Median
379 | - Fixed Number - Domain Relevant
380 | - High Number (999) - Issue with modelling
381 | - **BINNING** - Categorical variable and "Missing" becomes a category
382 | - **DOMAIN SPECIFIC** - Entry error, pipeline, etc.
383 |
384 | ---
385 |
386 | # **Refine - Outlier Treatment**
387 |
388 | - What is an outlier?
389 | - Descriptive Plots
390 | - Histogram
391 | - Box-Plot
392 | - Measuring
393 | - Z-score
394 | - Modified Z-score > 3.5
395 | where modified Z-score = 0.6745 * (x - x_median) / MAD
396 |
397 | ---
398 |
399 | # **Explore**
400 |
401 | - Single Variable Exploration
402 | - Dual Variable Exploration
403 | - Multi Variable Exploration
404 |
405 | ---
406 |
407 | # **Transform**
408 |
409 | Encodings
410 | - One Hot Encoding
411 | - Label Encoding
412 |
413 | Feature Transformation
414 | - Log Transform
415 | - Sqrt Transform
416 |
417 | ---
418 |
419 | # **Model - Linear Regression**
420 |
421 | **Parameters**
422 | - fit_intercept
423 | - normalization
424 |
425 | **Error Measure**
426 | - mean squared error
427 |
428 | ---
429 |
430 |
431 | # **Real-World Challenge - Noise**
432 |
433 | - The "target function" $$f$$ is not always a *function*
434 | - Not unique target value for same input
435 | - Need to add noise $$N(0,\sigma)$$
436 |
437 | $$ y = f(\mathbf{x}) + \epsilon(\mathbf{x}) $$
438 |
439 | ---
440 |
441 | # **Noise Implication**
442 |
443 | The best model we can create will have an expected error of $$\sigma^2$$
444 |
445 | If Noise ($$\sigma$$) is large, that means feature set does not capture large enough factors in the underlying process
446 | - Need to create **better features**
447 | - Need to find **new features**
448 |
449 | ---
450 |
451 | # **When are we learning?**
452 |
453 | Learning is defined as $$g≈f$$, which happens when
454 |
455 | (1) Can we make $$E_{out}(g)$$ is close enough to $$E_{in}(g)$$?
456 |
457 | $$E_{out}(g)≈E_{in}(g)$$
458 |
459 | (2) Can we make $$E_{in}(g)$$ small enough?
460 |
461 | $$E_{in}(g)≈0$$
462 |
463 | ---
464 |
465 | # **ML Theory: Generalisation**
466 |
467 | 
468 |
469 | For Learning, $$E_{out}(g)≈E_{in}(g)$$
470 |
471 | To find the generalisation error, we need to split our data into training and test samples
472 |
473 | Given large $$N$$, the expected generalisation error should be zero
474 |
475 | ---
476 |
477 | # **ML Theory: Generalisation**
478 |
479 | For Learning, $$E_{in}(g)≈0$$
480 |
481 | **Complex Model**: Better chance of approximating $$f$$
482 | **Simple Model**: Better chance of generalising $$E_{out}$$
483 |
484 | Lets try by increasing the model complexity - More features through interaction effect
485 |
486 | ---
487 |
488 | # **ML Theory: Model Complexity**
489 |
490 | 
491 |
492 | ---
493 |
494 | 
495 | # **ML Theory: Bias-Variance**
496 |
497 | For Learning, $$E_{in}(g)≈0$$
498 |
499 | Given large $$N$$, the expected error should be the bias
500 |
501 | - **Bias** are the simplifying assumptions made by a model to make the target function easier to learn.
502 | - **Variance** is the amount that the estimate of the target function will change if different training data was used.
503 |
504 | ---
505 |
506 | # **ML Theory: Bias-Variance Tradeoff**
507 |
508 | 
509 |
510 | ---
511 |
512 | 
513 |
514 | # **ML Theory: Overfitting**
515 |
516 | - Simple Target Function
517 | - 5th data point - noisy
518 | - 4th order polynomial fit
519 |
520 | $$E_{in}=0$$, $$E_{out}$$ is large
521 |
522 | *Overfitting* - Fitting the data more than warranted, and hence **fitting the noise**
523 |
524 | ---
525 |
526 | # **ML Theory: Addressing Overfitting**
527 |
528 | $$E_{out}(h) = E_{in}(h) + \text{overfit penalty}$$
529 |
530 | - **Regularization**: Not letting the weights grow
531 | - Ridge: add $$||w||^2$$ to error minimisation
532 | - Lasso: add $$||w||$$ to error minimisation
533 | - **Validation**: Checking when we reach bottom point
534 |
535 | ---
536 |
537 | # **Regularization - Ridge**
538 |
539 | $$ Minimize \quad E_{in}(w) + \frac{\lambda}{N}||w||^2 $$
540 |
541 | 
542 |
543 | ---
544 |
545 | 
546 |
547 | # **Validation**
548 |
549 | Validation set: $$K$$
550 | Training set: $$N-K$$
551 |
552 | Rule of Thumb: $$K = \frac{N}{5}$$
553 |
554 | Note: The validation set is used for learning
555 |
556 | ---
557 |
558 | # **Cross Validation**
559 |
560 | Repeats the process 5-times
561 |
562 | 
563 |
564 | ---
565 |
566 | 
567 |
568 | # **Model Selection**
569 |
570 | How to choose between competing model?
571 |
572 | Choose the function $$g_{m}$$ with
573 | lowest cross-validation error $$E_{m}$$
574 |
575 | ---
576 |
577 | # **Applied ML**
578 | - **Theory**: Formulation, Generalisation, Bias-Variance, Overfitting
579 | - **Paradigms**: Supervised - Regression
580 | - **Models**: Linear - OLS, Ridge, Lasso
581 | - **Methods**: Regularisation, Validation
582 | - **Process**: Frame, Acquire, Refine, Transform, Explore, Model
583 |
584 |
585 | ---
586 |
587 | ## **Classification Problem**
588 |
589 | *Context*: Loan Default
590 |
591 | *Customer Application*
592 | - **age**: age of the applicant
593 | - **income**: annual income of the applicant
594 | - **year**: no. of years of employment
595 | - **ownership**: type of house owned
596 | - **grade**: credit grade for the applicant
597 | - **amount**: loan amount given
598 | - **interest**: interest rate of loan
599 |
600 | *Question* - Who is likely to **default**?
601 |
602 | ---
603 |
604 | # **Linear Models**
605 |
606 | $$ s = \sum_{i=1}^d w_{i} x_{i} $$
607 |
608 |
609 | 
610 |
611 | ---
612 |
613 | # **Logistic (Sigmoid) Function**
614 |
615 | ### $$ \theta (s)={\frac {e^{s}}{e^{s}+1}}={\frac {1}{1+e^{-s}}}$$
616 |
617 | 
618 |
619 | ---
620 |
621 | # **Logistic Relationship**
622 |
623 | Find the $$ w_{i} $$ weights that best fit:
624 | $$ y=1 $$ if $$ \sum_{i=1}^d w_{i} x_{i} > 0$$
625 | $$ y=0$$, otherwise
626 |
627 | Follows:
628 |
629 | $$ \theta(y_i)={\frac {1}{1+e^{-(\sum_{i=1}^d w_{i} x_{i})}}} $$
630 |
631 | ---
632 |
633 | # **Error - Likelihood / Probabilities**
634 |
635 | Where, $$h(\mathbf{x}) = \sum_{i=1}^d w_{i} x_{i} $$
636 |
637 | Minimise the **negative log-likelihood**
638 |
639 | $$E(\mathbf{h}) = - \frac{1}{N} ln \left( \prod_{i=1}^N \theta (y_i h(\mathbf{x})) \right)$$
640 |
641 |
642 |
643 | ---
644 |
645 | # **Learning Algorithm - Logistic**
646 |
647 | - Logistic Regression algorithm aims to minimise $$ E_{in}(h)$$
648 | - **Iterative Method** -> Solves to give $$g(\mathbf{x})$$
649 |
650 | $$g(\mathbf{x}) = \hat{y} $$
651 |
652 | $$ E_{in}(g) = \frac{1}{N} \sum_{i=1}^N ln( 1 + e^{-y_i \hat{y_i}})$$
653 |
654 |
655 | ---
656 |
657 | # **Error Metric - Confusion Matrix**
658 |
659 | 
660 |
661 | ---
662 |
663 | # **Model Evaluation**
664 |
665 | **Classification Metrics**
666 |
667 | 
668 |
669 | Recall (TPR) = TP / (TP + FN)
670 |
671 | Precision = TP / (TP + FP)
672 |
673 | Specificity (TNR) = TN / (TN + FP)
674 |
675 | ---
676 |
677 | # **Model Evaluation**
678 |
679 | **Receiver Operating Characteristic Curve**
680 |
681 | Plot of TPR vs FPR at different discrimination threshold
682 |
683 | 
684 |
685 | ---
686 |
687 | # **Decision Tree**
688 |
689 | Example: Survivor on Titanic
690 |
691 | 
692 |
693 | ---
694 |
695 | # **Decision Tree**
696 |
697 | - Easy to interpret
698 | - Little data preparation
699 | - Scales well with data
700 | - White-box model
701 | - Instability – changing variables, altering sequence
702 | - Overfitting
703 |
704 | ---
705 |
706 | # **Bagging**
707 |
708 | - Also called bootstrap aggregation, reduces variance
709 | - Uses decision trees and uses a model averaging approach
710 |
711 | ---
712 |
713 | # **Random Forest**
714 |
715 | - Combines bagging idea and random selection of features.
716 | - Similar to decision trees are constructed – but at each split, a random subset of features is used.
717 |
718 | 
719 |
720 | ---
721 |
722 | > If you torture the data enough, it will confess.
723 | -- Ronald Coase
724 |
725 | ---
726 |
727 | # **Challenges**
728 | - Data Snooping
729 | - Selection Bias
730 | - Survivor Bias
731 | - Omitted Variable Bias
732 | - Black-box model Vs White-Box model
733 | - Adherence to regulations
734 |
735 | ---
736 |
737 |
738 | # **Applied ML**
739 | - **Theory**: Formulation, Generalisation, Bias-Variance, Overfitting
740 | - **Paradigms**: Supervised - Regression & Classification
741 | - **Models**: Linear Models, Tree Models
742 | - **Methods**: Regularisation, Validation, Aggregation
743 | - **Process**: Frame, Acquire, Refine, Transform, Explore, Model
744 |
745 |
746 | ---
--------------------------------------------------------------------------------
/outline.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amitkaps/applied-machine-learning/9752d1cd19a530b36c81632c291a6607ab3049d3/outline.pdf
--------------------------------------------------------------------------------
/pre-requisites.md:
--------------------------------------------------------------------------------
1 | # Pre-requisites
2 |
3 | - Programming knowledge is mandatory. Attendee should be able to write conditional statements, use loops, be comfortable writing functions and be able to understand code snippets and come up with programming logic.
4 | - Participants should have a basic familiarity of Python. Specifically, we expect participants to know the first three sections from this: [http://anandology.com/python-practice-book/](http://anandology.com/python-practice-book/)
5 | - Participants should have experience with using `Pandas` and `Jupyter Notebook`. At the bare minimum, you should be able to understand and run the code in this [The Art of Data Science](https://github.com/amitkaps/art-data-science) repo. Refer to the Onion Notebook's and especially the Acquire, Refine, Transform and Explore sections.
6 |
--------------------------------------------------------------------------------
/reference/.Module-01b-reference.ipynb.icloud:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amitkaps/applied-machine-learning/9752d1cd19a530b36c81632c291a6607ab3049d3/reference/.Module-01b-reference.ipynb.icloud
--------------------------------------------------------------------------------
/reference/Module-01c-reference.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Transform and Model"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 |     "Let us build a Regression Model for predicting the amount to be approved"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "#Load the libraries\n",
26 | "import numpy as np\n",
27 | "import pandas as pd\n",
28 | "import matplotlib.pyplot as plt\n",
29 | "import seaborn as sns"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 3,
35 | "metadata": {
36 | "collapsed": true
37 | },
38 | "outputs": [],
39 | "source": [
40 | "#Default Variables\n",
41 | "%matplotlib inline\n",
42 | "plt.rcParams['figure.figsize'] = (16,9)\n",
43 | "plt.rcParams['font.size'] = 18\n",
44 | "plt.style.use('fivethirtyeight')\n",
45 | "pd.set_option('display.float_format', lambda x: '%.2f' % x)"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 4,
51 | "metadata": {
52 | "collapsed": true
53 | },
54 | "outputs": [],
55 | "source": [
56 | "#Load the dataset\n",
57 | "df = pd.read_csv(\"data/loan_data_clean.csv\")"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 5,
63 | "metadata": {
64 | "collapsed": false
65 | },
66 | "outputs": [
67 | {
68 | "data": {
69 | "text/html": [
70 | "\n",
71 | "
\n",
72 | " \n",
73 | " \n",
74 | " | \n",
75 | " default | \n",
76 | " amount | \n",
77 | " interest | \n",
78 | " grade | \n",
79 | " years | \n",
80 | " ownership | \n",
81 | " income | \n",
82 | " age | \n",
83 | "
\n",
84 | " \n",
85 | " \n",
86 | " \n",
87 | " 0 | \n",
88 | " 0 | \n",
89 | " 5000 | \n",
90 | " 10.65 | \n",
91 | " B | \n",
92 | " 10.00 | \n",
93 | " RENT | \n",
94 | " 24000.00 | \n",
95 | " 33 | \n",
96 | "
\n",
97 | " \n",
98 | " 1 | \n",
99 | " 0 | \n",
100 | " 2400 | \n",
101 | " 10.99 | \n",
102 | " C | \n",
103 | " 25.00 | \n",
104 | " RENT | \n",
105 | " 12252.00 | \n",
106 | " 31 | \n",
107 | "
\n",
108 | " \n",
109 | " 2 | \n",
110 | " 0 | \n",
111 | " 10000 | \n",
112 | " 13.49 | \n",
113 | " C | \n",
114 | " 13.00 | \n",
115 | " RENT | \n",
116 | " 49200.00 | \n",
117 | " 24 | \n",
118 | "
\n",
119 | " \n",
120 | " 3 | \n",
121 | " 0 | \n",
122 | " 5000 | \n",
123 | " 10.99 | \n",
124 | " A | \n",
125 | " 3.00 | \n",
126 | " RENT | \n",
127 | " 36000.00 | \n",
128 | " 39 | \n",
129 | "
\n",
130 | " \n",
131 | " 4 | \n",
132 | " 0 | \n",
133 | " 3000 | \n",
134 | " 10.99 | \n",
135 | " E | \n",
136 | " 9.00 | \n",
137 | " RENT | \n",
138 | " 48000.00 | \n",
139 | " 24 | \n",
140 | "
\n",
141 | " \n",
142 | "
\n",
143 | "
"
144 | ],
145 | "text/plain": [
146 | " default amount interest grade years ownership income age\n",
147 | "0 0 5000 10.65 B 10.00 RENT 24000.00 33\n",
148 | "1 0 2400 10.99 C 25.00 RENT 12252.00 31\n",
149 | "2 0 10000 13.49 C 13.00 RENT 49200.00 24\n",
150 | "3 0 5000 10.99 A 3.00 RENT 36000.00 39\n",
151 | "4 0 3000 10.99 E 9.00 RENT 48000.00 24"
152 | ]
153 | },
154 | "execution_count": 5,
155 | "metadata": {},
156 | "output_type": "execute_result"
157 | }
158 | ],
159 | "source": [
160 | "df.head()"
161 | ]
162 | },
163 | {
164 | "cell_type": "markdown",
165 | "metadata": {},
166 | "source": [
167 | "## Transform Variables\n",
168 | "\n",
169 | "Let us create feature and target"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": 8,
175 | "metadata": {
176 | "collapsed": false
177 | },
178 | "outputs": [],
179 | "source": [
180 | "# Select the initial feature set\n",
181 | "df_X = df[['age', 'income', 'ownership' , 'years', 'grade']]"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 11,
187 | "metadata": {
188 | "collapsed": false
189 | },
190 | "outputs": [],
191 | "source": [
192 | "# Convert the categorical variables in to numerical values\n",
193 | "df_X = pd.get_dummies(df_X)"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": 12,
199 | "metadata": {
200 | "collapsed": false
201 | },
202 | "outputs": [],
203 | "source": [
204 | "# Create the feature set X\n",
205 | "X = df_X"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": 54,
211 | "metadata": {
212 | "collapsed": true
213 | },
214 | "outputs": [],
215 | "source": [
216 | "# Create the target from amount and default\n",
217 | "df['amount_non_default'] = df['amount'] * (1- df['default'])"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": 55,
223 | "metadata": {
224 | "collapsed": true
225 | },
226 | "outputs": [],
227 | "source": [
228 | "y = df['amount_non_default']"
229 | ]
230 | },
231 | {
232 | "cell_type": "markdown",
233 | "metadata": {},
234 | "source": [
235 | "## Build Model - Linear Regression"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": 125,
241 | "metadata": {
242 | "collapsed": true
243 | },
244 | "outputs": [],
245 | "source": [
246 | "# import the sklearn linear model\n",
247 | "from sklearn.linear_model import LinearRegression"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": 126,
253 | "metadata": {
254 | "collapsed": true
255 | },
256 | "outputs": [],
257 | "source": [
258 | "# initiate the Linear Regression Model\n",
259 | "model_ols = LinearRegression(normalize=True)"
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": 127,
265 | "metadata": {
266 | "collapsed": false
267 | },
268 | "outputs": [
269 | {
270 | "data": {
271 | "text/plain": [
272 | "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)"
273 | ]
274 | },
275 | "execution_count": 127,
276 | "metadata": {},
277 | "output_type": "execute_result"
278 | }
279 | ],
280 | "source": [
281 | "# Review the parameters in the Linear Regression\n",
282 | "model_ols"
283 | ]
284 | },
285 | {
286 | "cell_type": "code",
287 | "execution_count": 128,
288 | "metadata": {
289 | "collapsed": false
290 | },
291 | "outputs": [
292 | {
293 | "data": {
294 | "text/plain": [
295 | "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)"
296 | ]
297 | },
298 | "execution_count": 128,
299 | "metadata": {},
300 | "output_type": "execute_result"
301 | }
302 | ],
303 | "source": [
304 | "# Review the parameters in the Linear Regression\n",
305 | "model_ols.fit(X,y)"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": 129,
311 | "metadata": {
312 | "collapsed": false
313 | },
314 | "outputs": [
315 | {
316 | "data": {
317 | "text/plain": [
318 | "array([ 7.99973540e+00, 3.41670491e-02, 3.75524906e+01,\n",
319 | " -3.40659174e+16, -3.40659174e+16, -3.40659174e+16,\n",
320 | " -3.40659174e+16, 1.42620966e+17, 1.42620966e+17,\n",
321 | " 1.42620966e+17, 1.42620966e+17, 1.42620966e+17,\n",
322 | " 1.42620966e+17, 1.42620966e+17])"
323 | ]
324 | },
325 | "execution_count": 129,
326 | "metadata": {},
327 | "output_type": "execute_result"
328 | }
329 | ],
330 | "source": [
331 |     "# What are the coefficients of the model\n",
332 | "model_ols.coef_"
333 | ]
334 | },
335 | {
336 | "cell_type": "code",
337 | "execution_count": 130,
338 | "metadata": {
339 | "collapsed": false
340 | },
341 | "outputs": [
342 | {
343 | "data": {
344 | "text/plain": [
345 | "-1.0855504871673947e+17"
346 | ]
347 | },
348 | "execution_count": 130,
349 | "metadata": {},
350 | "output_type": "execute_result"
351 | }
352 | ],
353 | "source": [
354 | "# What is the intercept of the model\n",
355 | "model_ols.intercept_"
356 | ]
357 | },
358 | {
359 | "cell_type": "markdown",
360 | "metadata": {},
361 | "source": [
362 | "## Calculate Model - Predictions & Error"
363 | ]
364 | },
365 | {
366 | "cell_type": "code",
367 | "execution_count": 131,
368 | "metadata": {
369 | "collapsed": true
370 | },
371 | "outputs": [],
372 | "source": [
373 | "# predict the y\n",
374 | "y_pred_ols = model_ols.predict(X)"
375 | ]
376 | },
377 | {
378 | "cell_type": "code",
379 | "execution_count": 132,
380 | "metadata": {
381 | "collapsed": true
382 | },
383 | "outputs": [],
384 | "source": [
385 | "# import metrics from sklearn\n",
386 | "from sklearn import metrics"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": 133,
392 | "metadata": {
393 | "collapsed": false
394 | },
395 | "outputs": [
396 | {
397 | "data": {
398 | "text/plain": [
399 | "40138831.382730052"
400 | ]
401 | },
402 | "execution_count": 133,
403 | "metadata": {},
404 | "output_type": "execute_result"
405 | }
406 | ],
407 | "source": [
408 |     "# Calculate mean squared error\n",
409 | "metrics.mean_squared_error(y_pred_ols, y)"
410 | ]
411 | },
412 | {
413 | "cell_type": "markdown",
414 | "metadata": {},
415 | "source": [
416 | "## Evaluate Model"
417 | ]
418 | },
419 | {
420 | "cell_type": "code",
421 | "execution_count": 134,
422 | "metadata": {
423 | "collapsed": false
424 | },
425 | "outputs": [
426 | {
427 | "data": {
428 | "text/plain": [
429 | "0.097940101660104362"
430 | ]
431 | },
432 | "execution_count": 134,
433 | "metadata": {},
434 | "output_type": "execute_result"
435 | }
436 | ],
437 | "source": [
438 | "# What is the score given by the model\n",
439 | "model_ols.score(X,y)"
440 | ]
441 | },
442 | {
443 | "cell_type": "code",
444 | "execution_count": 135,
445 | "metadata": {
446 | "collapsed": false
447 | },
448 | "outputs": [
449 | {
450 | "data": {
451 | "text/plain": [
452 | "6335.5213978590627"
453 | ]
454 | },
455 | "execution_count": 135,
456 | "metadata": {},
457 | "output_type": "execute_result"
458 | }
459 | ],
460 | "source": [
461 | "# What is the root mean square error\n",
462 | "np.sqrt(metrics.mean_squared_error(y_pred_ols, y))"
463 | ]
464 | },
465 | {
466 | "cell_type": "code",
467 | "execution_count": 136,
468 | "metadata": {
469 | "collapsed": false
470 | },
471 | "outputs": [
472 | {
473 | "data": {
474 | "text/plain": [
475 | "6670.7111933826272"
476 | ]
477 | },
478 | "execution_count": 136,
479 | "metadata": {},
480 | "output_type": "execute_result"
481 | }
482 | ],
483 | "source": [
484 | "# How does rmse compare with standard deviation of the target\n",
485 | "df.amount_non_default.std()"
486 | ]
487 | },
488 | {
489 | "cell_type": "markdown",
490 | "metadata": {},
491 | "source": [
492 | "## Generalisation Error"
493 | ]
494 | },
495 | {
496 | "cell_type": "code",
497 | "execution_count": 137,
498 | "metadata": {
499 | "collapsed": true
500 | },
501 | "outputs": [],
502 | "source": [
503 | "# Get the module for train test split\n",
504 | "from sklearn.model_selection import train_test_split"
505 | ]
506 | },
507 | {
508 | "cell_type": "code",
509 | "execution_count": 138,
510 | "metadata": {
511 | "collapsed": false
512 | },
513 | "outputs": [],
514 | "source": [
515 | "#Split the data in test and training - 20% and 80%\n",
516 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
517 | ]
518 | },
519 | {
520 | "cell_type": "code",
521 | "execution_count": 139,
522 | "metadata": {
523 | "collapsed": true
524 | },
525 | "outputs": [],
526 | "source": [
527 | "#Initiate the model\n",
528 | "model_ols_split = LinearRegression()"
529 | ]
530 | },
531 | {
532 | "cell_type": "code",
533 | "execution_count": 140,
534 | "metadata": {
535 | "collapsed": false
536 | },
537 | "outputs": [
538 | {
539 | "data": {
540 | "text/plain": [
541 | "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)"
542 | ]
543 | },
544 | "execution_count": 140,
545 | "metadata": {},
546 | "output_type": "execute_result"
547 | }
548 | ],
549 | "source": [
550 | "#Fit the model\n",
551 | "model_ols_split.fit(X_train, y_train)"
552 | ]
553 | },
554 | {
555 | "cell_type": "code",
556 | "execution_count": 142,
557 | "metadata": {
558 | "collapsed": false
559 | },
560 | "outputs": [],
561 | "source": [
562 | "# Make predictions for test and train\n",
563 | "y_pred_split_train = model_ols_split.predict(X_train)\n",
564 | "y_pred_split_test = model_ols_split.predict(X_test)"
565 | ]
566 | },
567 | {
568 | "cell_type": "code",
569 | "execution_count": 145,
570 | "metadata": {
571 | "collapsed": true
572 | },
573 | "outputs": [],
574 | "source": [
575 | "#Find the errors for test and train\n",
576 | "error_ols_split_train = metrics.mean_squared_error(y_pred_split_train, y_train)\n",
577 | "error_ols_split_test = metrics.mean_squared_error(y_pred_split_test, y_test)"
578 | ]
579 | },
580 | {
581 | "cell_type": "code",
582 | "execution_count": 148,
583 | "metadata": {
584 | "collapsed": false
585 | },
586 | "outputs": [
587 | {
588 | "data": {
589 | "text/plain": [
590 | "(40196922.046010435, 39906625.742081515)"
591 | ]
592 | },
593 | "execution_count": 148,
594 | "metadata": {},
595 | "output_type": "execute_result"
596 | }
597 | ],
598 | "source": [
599 | "error_ols_split_train, error_ols_split_test"
600 | ]
601 | },
602 | {
603 | "cell_type": "code",
604 | "execution_count": 147,
605 | "metadata": {
606 | "collapsed": false
607 | },
608 | "outputs": [
609 | {
610 | "data": {
611 | "text/plain": [
612 | "-290296.30392891914"
613 | ]
614 | },
615 | "execution_count": 147,
616 | "metadata": {},
617 | "output_type": "execute_result"
618 | }
619 | ],
620 | "source": [
621 | "# Find the generalisation error\n",
622 | "generalisation_error = error_ols_split_test - error_ols_split_train\n",
623 | "generalisation_error"
624 | ]
625 | },
626 | {
627 | "cell_type": "markdown",
628 | "metadata": {},
629 | "source": [
630 | "## Build Complex Model"
631 | ]
632 | },
633 | {
634 | "cell_type": "code",
635 | "execution_count": 150,
636 | "metadata": {
637 | "collapsed": true
638 | },
639 | "outputs": [],
640 | "source": [
641 | "# Import Polynomial Features\n",
642 | "from sklearn.preprocessing import PolynomialFeatures"
643 | ]
644 | },
645 | {
646 | "cell_type": "code",
647 | "execution_count": 246,
648 | "metadata": {
649 | "collapsed": true
650 | },
651 | "outputs": [],
652 | "source": [
653 | "# Initiate Polynomial Features for Degree 2\n",
654 | "poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)"
655 | ]
656 | },
657 | {
658 | "cell_type": "code",
659 | "execution_count": 247,
660 | "metadata": {
661 | "collapsed": false
662 | },
663 | "outputs": [],
664 | "source": [
665 | "# Create Polynomial Features\n",
666 | "X_poly = poly.fit_transform(X)"
667 | ]
668 | },
669 | {
670 | "cell_type": "code",
671 | "execution_count": 248,
672 | "metadata": {
673 | "collapsed": false
674 | },
675 | "outputs": [
676 | {
677 | "data": {
678 | "text/plain": [
679 | "(29091, 105)"
680 | ]
681 | },
682 | "execution_count": 248,
683 | "metadata": {},
684 | "output_type": "execute_result"
685 | }
686 | ],
687 | "source": [
688 | "# See the new dataset\n",
689 | "X_poly.shape"
690 | ]
691 | },
692 | {
693 | "cell_type": "code",
694 | "execution_count": 309,
695 | "metadata": {
696 | "collapsed": false
697 | },
698 | "outputs": [],
699 | "source": [
700 | "#Create split and train\n",
701 | "X_poly_train, X_poly_test, y_poly_train, y_poly_test = train_test_split(\n",
702 | " X_poly, y, test_size=0.2, random_state=42)"
703 | ]
704 | },
705 | {
706 | "cell_type": "code",
707 | "execution_count": 324,
708 | "metadata": {
709 | "collapsed": true
710 | },
711 | "outputs": [],
712 | "source": [
713 | "# Initiate the model\n",
714 | "model_ols_poly = LinearRegression(normalize=True)"
715 | ]
716 | },
717 | {
718 | "cell_type": "code",
719 | "execution_count": 325,
720 | "metadata": {
721 | "collapsed": false
722 | },
723 | "outputs": [
724 | {
725 | "data": {
726 | "text/plain": [
727 | "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)"
728 | ]
729 | },
730 | "execution_count": 325,
731 | "metadata": {},
732 | "output_type": "execute_result"
733 | }
734 | ],
735 | "source": [
736 | "# Fit the model\n",
737 | "model_ols_poly.fit(X_poly_train, y_poly_train)"
738 | ]
739 | },
740 | {
741 | "cell_type": "code",
742 | "execution_count": 326,
743 | "metadata": {
744 | "collapsed": false
745 | },
746 | "outputs": [],
747 | "source": [
748 | "# Make predictions for test and train\n",
749 | "y_pred_poly_train = model_ols_poly.predict(X_poly_train)\n",
750 | "y_pred_poly_test = model_ols_poly.predict(X_poly_test)"
751 | ]
752 | },
753 | {
754 | "cell_type": "code",
755 | "execution_count": 327,
756 | "metadata": {
757 | "collapsed": true
758 | },
759 | "outputs": [],
760 | "source": [
761 | "#Find the errors for test and train\n",
762 | "error_ols_poly_train = metrics.mean_squared_error(y_pred_poly_train, y_poly_train)\n",
763 | "error_ols_poly_test = metrics.mean_squared_error(y_pred_poly_test, y_poly_test)"
764 | ]
765 | },
766 | {
767 | "cell_type": "code",
768 | "execution_count": 328,
769 | "metadata": {
770 | "collapsed": false
771 | },
772 | "outputs": [
773 | {
774 | "data": {
775 | "text/plain": [
776 | "(38576398.190701269, 39240001.867846712)"
777 | ]
778 | },
779 | "execution_count": 328,
780 | "metadata": {},
781 | "output_type": "execute_result"
782 | }
783 | ],
784 | "source": [
785 | "error_ols_poly_train, error_ols_poly_test"
786 | ]
787 | },
788 | {
789 | "cell_type": "code",
790 | "execution_count": 329,
791 | "metadata": {
792 | "collapsed": false
793 | },
794 | "outputs": [
795 | {
796 | "data": {
797 | "text/plain": [
798 | "663603.67714544386"
799 | ]
800 | },
801 | "execution_count": 329,
802 | "metadata": {},
803 | "output_type": "execute_result"
804 | }
805 | ],
806 | "source": [
807 | "# Find the generalisation error\n",
808 | "generalisation_poly_error = error_ols_poly_test - error_ols_poly_train\n",
809 | "generalisation_poly_error"
810 | ]
811 | },
812 | {
813 | "cell_type": "markdown",
814 | "metadata": {},
815 | "source": [
816 | "For Discussion\n",
817 | "- Why has the generalisation error gone up?\n",
818 | "- Should a complex model perform better than a simple one? "
819 | ]
820 | },
821 | {
822 | "cell_type": "markdown",
823 | "metadata": {},
824 | "source": [
825 | "## Regularization - Ridge"
826 | ]
827 | },
828 | {
829 | "cell_type": "code",
830 | "execution_count": 358,
831 | "metadata": {
832 | "collapsed": true
833 | },
834 | "outputs": [],
835 | "source": [
836 | "# Get ridge regression from linear_models\n",
837 | "from sklearn.linear_model import Ridge"
838 | ]
839 | },
840 | {
841 | "cell_type": "code",
842 | "execution_count": 396,
843 | "metadata": {
844 | "collapsed": true
845 | },
846 | "outputs": [],
847 | "source": [
848 | "# Initiate model\n",
849 | "model_ridge = Ridge(alpha = 0.1, normalize=True)"
850 | ]
851 | },
852 | {
853 | "cell_type": "code",
854 | "execution_count": 397,
855 | "metadata": {
856 | "collapsed": false
857 | },
858 | "outputs": [
859 | {
860 | "data": {
861 | "text/plain": [
862 | "Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,\n",
863 | " normalize=True, random_state=None, solver='auto', tol=0.001)"
864 | ]
865 | },
866 | "execution_count": 397,
867 | "metadata": {},
868 | "output_type": "execute_result"
869 | }
870 | ],
871 | "source": [
872 | "# Fit the model\n",
873 | "model_ridge.fit(X_poly_train, y_poly_train)"
874 | ]
875 | },
876 | {
877 | "cell_type": "code",
878 | "execution_count": 398,
879 | "metadata": {
880 | "collapsed": true
881 | },
882 | "outputs": [],
883 | "source": [
884 | "# Make predictions for test and train\n",
885 | "y_pred_ridge_train = model_ridge.predict(X_poly_train)\n",
886 | "y_pred_ridge_test = model_ridge.predict(X_poly_test)"
887 | ]
888 | },
889 | {
890 | "cell_type": "code",
891 | "execution_count": 399,
892 | "metadata": {
893 | "collapsed": true
894 | },
895 | "outputs": [],
896 | "source": [
897 | "#Find the errors for test and train\n",
898 | "error_ridge_train = metrics.mean_squared_error(y_pred_ridge_train, y_poly_train)\n",
899 | "error_ridge_test = metrics.mean_squared_error(y_pred_ridge_test, y_poly_test)"
900 | ]
901 | },
902 | {
903 | "cell_type": "code",
904 | "execution_count": 400,
905 | "metadata": {
906 | "collapsed": false
907 | },
908 | "outputs": [
909 | {
910 | "data": {
911 | "text/plain": [
912 | "(39171885.390347548, 39591479.19005619)"
913 | ]
914 | },
915 | "execution_count": 400,
916 | "metadata": {},
917 | "output_type": "execute_result"
918 | }
919 | ],
920 | "source": [
921 | "error_ridge_train, error_ridge_test"
922 | ]
923 | },
924 | {
925 | "cell_type": "code",
926 | "execution_count": 401,
927 | "metadata": {
928 | "collapsed": false
929 | },
930 | "outputs": [
931 | {
932 | "data": {
933 | "text/plain": [
934 | "419593.79970864207"
935 | ]
936 | },
937 | "execution_count": 401,
938 | "metadata": {},
939 | "output_type": "execute_result"
940 | }
941 | ],
942 | "source": [
943 | "# Find the generalisation error\n",
944 | "generalisation_ridge_error = error_ridge_test - error_ridge_train\n",
945 | "generalisation_ridge_error"
946 | ]
947 | },
948 | {
949 | "cell_type": "markdown",
950 | "metadata": {},
951 | "source": [
952 | "## Cross Validation"
953 | ]
954 | },
955 | {
956 | "cell_type": "markdown",
957 | "metadata": {},
958 | "source": [
959 | "Finding alpha using Cross Validation"
960 | ]
961 | },
962 | {
963 | "cell_type": "code",
964 | "execution_count": 402,
965 | "metadata": {
966 | "collapsed": true
967 | },
968 | "outputs": [],
969 | "source": [
970 | "# Get ridge regression from linear_models\n",
971 | "from sklearn.linear_model import RidgeCV"
972 | ]
973 | },
974 | {
975 | "cell_type": "code",
976 | "execution_count": 403,
977 | "metadata": {
978 | "collapsed": false
979 | },
980 | "outputs": [],
981 | "source": [
982 | "# Initiate model with alphas = 0.1, 0.001, 0.0001\n",
983 | "model_ridge_CV = RidgeCV(alphas=[0.1, 0.001, 0.0001], normalize = True)"
984 | ]
985 | },
986 | {
987 | "cell_type": "code",
988 | "execution_count": 404,
989 | "metadata": {
990 | "collapsed": false
991 | },
992 | "outputs": [
993 | {
994 | "data": {
995 | "text/plain": [
996 | "RidgeCV(alphas=[0.1, 0.001, 0.0001], cv=None, fit_intercept=True,\n",
997 | " gcv_mode=None, normalize=True, scoring=None, store_cv_values=False)"
998 | ]
999 | },
1000 | "execution_count": 404,
1001 | "metadata": {},
1002 | "output_type": "execute_result"
1003 | }
1004 | ],
1005 | "source": [
1006 | "# Fit the model\n",
1007 | "model_ridge_CV.fit(X_poly_train, y_poly_train)"
1008 | ]
1009 | },
1010 | {
1011 | "cell_type": "code",
1012 | "execution_count": 405,
1013 | "metadata": {
1014 | "collapsed": false
1015 | },
1016 | "outputs": [
1017 | {
1018 | "data": {
1019 | "text/plain": [
1020 | "0.001"
1021 | ]
1022 | },
1023 | "execution_count": 405,
1024 | "metadata": {},
1025 | "output_type": "execute_result"
1026 | }
1027 | ],
1028 | "source": [
1029 | "# Find the correct alpha\n",
1030 | "model_ridge_CV.alpha_"
1031 | ]
1032 | },
1033 | {
1034 | "cell_type": "markdown",
1035 | "metadata": {},
1036 | "source": [
1037 | "## Exercise: Regularization - Lasso"
1038 | ]
1039 | },
1040 | {
1041 | "cell_type": "code",
1042 | "execution_count": null,
1043 | "metadata": {
1044 | "collapsed": true
1045 | },
1046 | "outputs": [],
1047 | "source": []
1048 | },
1049 | {
1050 | "cell_type": "code",
1051 | "execution_count": null,
1052 | "metadata": {
1053 | "collapsed": true
1054 | },
1055 | "outputs": [],
1056 | "source": []
1057 | },
1058 | {
1059 | "cell_type": "code",
1060 | "execution_count": null,
1061 | "metadata": {
1062 | "collapsed": true
1063 | },
1064 | "outputs": [],
1065 | "source": []
1066 | },
1067 | {
1068 | "cell_type": "code",
1069 | "execution_count": null,
1070 | "metadata": {
1071 | "collapsed": true
1072 | },
1073 | "outputs": [],
1074 | "source": []
1075 | },
1076 | {
1077 | "cell_type": "code",
1078 | "execution_count": null,
1079 | "metadata": {
1080 | "collapsed": true
1081 | },
1082 | "outputs": [],
1083 | "source": []
1084 | },
1085 | {
1086 | "cell_type": "code",
1087 | "execution_count": null,
1088 | "metadata": {
1089 | "collapsed": true
1090 | },
1091 | "outputs": [],
1092 | "source": []
1093 | },
1094 | {
1095 | "cell_type": "code",
1096 | "execution_count": null,
1097 | "metadata": {
1098 | "collapsed": true
1099 | },
1100 | "outputs": [],
1101 | "source": []
1102 | }
1103 | ],
1104 | "metadata": {
1105 | "anaconda-cloud": {},
1106 | "kernelspec": {
1107 | "display_name": "Python [conda root]",
1108 | "language": "python",
1109 | "name": "conda-root-py"
1110 | },
1111 | "language_info": {
1112 | "codemirror_mode": {
1113 | "name": "ipython",
1114 | "version": 3
1115 | },
1116 | "file_extension": ".py",
1117 | "mimetype": "text/x-python",
1118 | "name": "python",
1119 | "nbconvert_exporter": "python",
1120 | "pygments_lexer": "ipython3",
1121 | "version": "3.5.2"
1122 | }
1123 | },
1124 | "nbformat": 4,
1125 | "nbformat_minor": 1
1126 | }
1127 |
--------------------------------------------------------------------------------
/schedule.md:
--------------------------------------------------------------------------------
1 | # Schedule
2 |
3 | ## Day 1
4 |
5 | ### Session 0: Installation (0900 - 0930)
6 |
7 | ### Session 1: (0930 - 1115)
8 | - Introduction to Machine Learning
9 | - Data Science Pipeline: Frame - Acquire - Refine - Explore - Model - Insight
10 | - Types of ML Problems
11 | - Features and Targets
12 | - ML Thought Process: Regression & Classification
13 |
14 | ### Session 2: (1130 - 1315)
15 | - ML Thought Process (contd.)
16 | - Measurement & Metrics
17 | - Overfitting, Bias & Variance
18 | - Regularization
19 | - Evaluation and Cross Validation
20 |
21 | ### Session 3: (1400 - 1530)
22 | - Hands-on Session: Linear Regression
23 |
24 | ### Session 4: (1545 - 1700)
25 | - Hands-on Session: Logistic Regression
26 |
27 | ## Day 2
28 |
29 | ### Session 1: (0930 - 1115)
30 | - Simple Trees and Challenges
31 | - Ensembles - Bagging, Patching, Random Subspace
32 | - Random Forest
33 | - Measurement: Variable Importance, OOB
34 | - Gradient Boosting
35 |
36 | ### Session 2: (1130 - 1315)
37 | - Hands-on Session: Trees
38 | - Hands-on Session: Random Forest
39 | - Hands-on Session: Gradient Boosting
40 |
41 | ### Session 3: (1400 - 1530)
42 | - Feature Engineering
43 | - Unbalanced Classes (Advanced)
44 | - Model Pipelines
45 | - Hands-on Session: Pipelines
46 |
47 | ### Session 4: (1545 - 1700)
48 | - Hands-on Session: Pipelines (contd.)
49 | - Practical Guidelines for ML
50 | - Next Steps
51 | - Wrap-up and Feedback
52 |
53 | ### Optional Session (1700 - 1800)
54 | - Office Hours
55 |
56 |
57 | ===================================================================
58 |
59 | ### Food and Hydration
60 | - 0900 - 0930: Breakfast
61 | - 1115 - 1130: Tea Break
62 | - 1315 - 1400: Lunch
63 | - 1530 - 1545: Tea Break
64 |
--------------------------------------------------------------------------------
/tree.dot:
--------------------------------------------------------------------------------
1 | digraph Tree {
2 | node [shape=box, style="filled, rounded", color="black", fontname=helvetica] ;
3 | edge [fontname=helvetica] ;
4 | 0 [label=gini = 0.642
samples = 18
value = [4, 6, 8]
class = High>, fillcolor="#8139e52a"] ;
5 | 1 [label=samples = 6
value = [0, 0, 6]
class = High>, fillcolor="#8139e5ff"] ;
6 | 0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
7 | 2 [label=gini = 0.611
samples = 12
value = [4, 6, 2]
class = Moderate>, fillcolor="#39e58140"] ;
8 | 0 -> 2 [labeldistance=2.5, labelangle=-45, headlabel="False"] ;
9 | 3 [label=gini = 0.444
samples = 6
value = [0, 4, 2]
class = Moderate>, fillcolor="#39e5817f"] ;
10 | 2 -> 3 ;
11 | 4 [label=samples = 2
value = [0, 0, 2]
class = High>, fillcolor="#8139e5ff"] ;
12 | 3 -> 4 ;
13 | 5 [label=samples = 4
value = [0, 4, 0]
class = Moderate>, fillcolor="#39e581ff"] ;
14 | 3 -> 5 ;
15 | 6 [label=gini = 0.444
samples = 6
value = [4, 2, 0]
class = Low>, fillcolor="#e581397f"] ;
16 | 2 -> 6 ;
17 | 7 [label=samples = 2
value = [0, 2, 0]
class = Moderate>, fillcolor="#39e581ff"] ;
18 | 6 -> 7 ;
19 | 8 [label=samples = 4
value = [4, 0, 0]
class = Low>, fillcolor="#e58139ff"] ;
20 | 6 -> 8 ;
21 | }
--------------------------------------------------------------------------------
/tree2.dot:
--------------------------------------------------------------------------------
1 | digraph Tree {
2 | node [shape=box, style="filled, rounded", color="black", fontname=helvetica] ;
3 | edge [fontname=helvetica] ;
4 | 0 [label=gini = 0.499
samples = 7727
value = [4030, 3697]
class = no>, fillcolor="#e5813915"] ;
5 | 1 [label=gini = 0.466
samples = 4555
value = [2869, 1686]
class = no>, fillcolor="#e5813969"] ;
6 | 0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
7 | 2 [label=gini = 0.411
samples = 2099
value = [1492, 607]
class = no>, fillcolor="#e5813997"] ;
8 | 1 -> 2 ;
9 | 3 [label=gini = 0.498
samples = 200
value = [107, 93]
class = no>, fillcolor="#e5813921"] ;
10 | 2 -> 3 ;
11 | 4 [label=samples = 149
value = [84, 65]
class = no>, fillcolor="#e581393a"] ;
12 | 3 -> 4 ;
13 | 5 [label=samples = 51
value = [23, 28]
class = yes>, fillcolor="#399de52e"] ;
14 | 3 -> 5 ;
15 | 6 [label=gini = 0.395
samples = 1899
value = [1385, 514]
class = no>, fillcolor="#e58139a0"] ;
16 | 2 -> 6 ;
17 | 7 [label=samples = 1815
value = [1313, 502]
class = no>, fillcolor="#e581399e"] ;
18 | 6 -> 7 ;
19 | 8 [label=samples = 84
value = [72, 12]
class = no>, fillcolor="#e58139d4"] ;
20 | 6 -> 8 ;
21 | 9 [label=gini = 0.493
samples = 2456
value = [1377, 1079]
class = no>, fillcolor="#e5813937"] ;
22 | 1 -> 9 ;
23 | 10 [label=gini = 0.497
samples = 1587
value = [856, 731]
class = no>, fillcolor="#e5813925"] ;
24 | 9 -> 10 ;
25 | 11 [label=samples = 1585
value = [856, 729]
class = no>, fillcolor="#e5813926"] ;
26 | 10 -> 11 ;
27 | 12 [label=samples = 2
value = [0, 2]
class = yes>, fillcolor="#399de5ff"] ;
28 | 10 -> 12 ;
29 | 13 [label=gini = 0.48
samples = 869
value = [521, 348]
class = no>, fillcolor="#e5813955"] ;
30 | 9 -> 13 ;
31 | 14 [label=samples = 281
value = [188, 93]
class = no>, fillcolor="#e5813981"] ;
32 | 13 -> 14 ;
33 | 15 [label=samples = 588
value = [333, 255]
class = no>, fillcolor="#e581393c"] ;
34 | 13 -> 15 ;
35 | 16 [label=gini = 0.464
samples = 3172
value = [1161, 2011]
class = yes>, fillcolor="#399de56c"] ;
36 | 0 -> 16 [labeldistance=2.5, labelangle=-45, headlabel="False"] ;
37 | 17 [label=gini = 0.468
samples = 3060
value = [1144, 1916]
class = yes>, fillcolor="#399de567"] ;
38 | 16 -> 17 ;
39 | 18 [label=gini = 0.481
samples = 1685
value = [677, 1008]
class = yes>, fillcolor="#399de554"] ;
40 | 17 -> 18 ;
41 | 19 [label=samples = 1650
value = [669, 981]
class = yes>, fillcolor="#399de551"] ;
42 | 18 -> 19 ;
43 | 20 [label=samples = 35
value = [8, 27]
class = yes>, fillcolor="#399de5b3"] ;
44 | 18 -> 20 ;
45 | 21 [label=gini = 0.449
samples = 1375
value = [467, 908]
class = yes>, fillcolor="#399de57c"] ;
46 | 17 -> 21 ;
47 | 22 [label=samples = 1346
value = [450, 896]
class = yes>, fillcolor="#399de57f"] ;
48 | 21 -> 22 ;
49 | 23 [label=samples = 29
value = [17, 12]
class = no>, fillcolor="#e581394b"] ;
50 | 21 -> 23 ;
51 | 24 [label=gini = 0.257
samples = 112
value = [17, 95]
class = yes>, fillcolor="#399de5d1"] ;
52 | 16 -> 24 ;
53 | 25 [label=samples = 1
value = [1, 0]
class = no>, fillcolor="#e58139ff"] ;
54 | 24 -> 25 ;
55 | 26 [label=gini = 0.247
samples = 111
value = [16, 95]
class = yes>, fillcolor="#399de5d4"] ;
56 | 24 -> 26 ;
57 | 27 [label=samples = 6
value = [2, 4]
class = yes>, fillcolor="#399de57f"] ;
58 | 26 -> 27 ;
59 | 28 [label=samples = 105
value = [14, 91]
class = yes>, fillcolor="#399de5d8"] ;
60 | 26 -> 28 ;
61 | }
--------------------------------------------------------------------------------
/tree_3.dot:
--------------------------------------------------------------------------------
1 | digraph Tree {
2 | node [shape=box, style="filled, rounded", color="black", fontname=helvetica] ;
3 | edge [fontname=helvetica] ;
4 | 0 [label=gini = 0.499
samples = 7727
value = [4030, 3697]
class = no>, fillcolor="#e5813915"] ;
5 | 1 [label=gini = 0.498
samples = 7316
value = [3860, 3456]
class = no>, fillcolor="#e581391b"] ;
6 | 0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
7 | 2 [label=gini = 0.5
samples = 2411
value = [1186, 1225]
class = yes>, fillcolor="#399de508"] ;
8 | 1 -> 2 ;
9 | 3 [label=samples = 975
value = [437, 538]
class = yes>, fillcolor="#399de530"] ;
10 | 2 -> 3 ;
11 | 4 [label=samples = 1436
value = [749, 687]
class = no>, fillcolor="#e5813915"] ;
12 | 2 -> 4 ;
13 | 5 [label=gini = 0.496
samples = 4905
value = [2674, 2231]
class = no>, fillcolor="#e581392a"] ;
14 | 1 -> 5 ;
15 | 6 [label=samples = 772
value = [386, 386]
class = no>, fillcolor="#e5813900"] ;
16 | 5 -> 6 ;
17 | 7 [label=samples = 4133
value = [2288, 1845]
class = no>, fillcolor="#e5813931"] ;
18 | 5 -> 7 ;
19 | 8 [label=gini = 0.485
samples = 411
value = [170, 241]
class = yes>, fillcolor="#399de54b"] ;
20 | 0 -> 8 [labeldistance=2.5, labelangle=-45, headlabel="False"] ;
21 | 9 [label=gini = 0.471
samples = 340
value = [129, 211]
class = yes>, fillcolor="#399de563"] ;
22 | 8 -> 9 ;
23 | 10 [label=samples = 35
value = [19, 16]
class = no>, fillcolor="#e5813928"] ;
24 | 9 -> 10 ;
25 | 11 [label=samples = 305
value = [110, 195]
class = yes>, fillcolor="#399de56f"] ;
26 | 9 -> 11 ;
27 | 12 [label=gini = 0.488
samples = 71
value = [41, 30]
class = no>, fillcolor="#e5813944"] ;
28 | 8 -> 12 ;
29 | 13 [label=samples = 68
value = [41, 27]
class = no>, fillcolor="#e5813957"] ;
30 | 12 -> 13 ;
31 | 14 [label=samples = 3
value = [0, 3]
class = yes>, fillcolor="#399de5ff"] ;
32 | 12 -> 14 ;
33 | }
--------------------------------------------------------------------------------