├── Chapter04_Lab.ipynb
├── Chapter06_Lab1.ipynb
├── Chapter06_Lab2.ipynb
├── Chapter07_Lab.ipynb
├── README.md
├── chapter03_lab.ipynb
├── chapter05_lab.ipynb
└── chapter06_lab3.ipynb
/Chapter06_Lab2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 6.5 Lab 2: Ridge Regression and the Lasso\n",
8 | "\n",
9 | "\n",
10 | "\n",
11 | "
\n",
17 | "\n",
18 | "\n",
19 | "We will now perform ridge regression and the lasso in order to predict **Salary** on the **Hitters** data. Before proceeding ensure that the missing values have been removed from the data.\n",
20 | "\n",
21 | "> You can download **Hitters** data set for from **[here.](https://drive.google.com/file/d/1e2NqNJGkCTAGBee8JHGNGCJHplG5R2YQ/view?usp=sharing)**"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 1,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "import pandas as pd\n",
31 | "import numpy as np\n",
32 | "import matplotlib.pyplot as plt\n",
33 | "import seaborn as sns\n",
34 | "%matplotlib inline"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 2,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "from sklearn.preprocessing import LabelEncoder \n",
44 | "from sklearn.model_selection import train_test_split\n",
45 | "from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV\n",
46 | "from sklearn.metrics import mean_squared_error\n",
47 | "from sklearn.preprocessing import scale"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 3,
53 | "metadata": {},
54 | "outputs": [
55 | {
56 | "data": {
57 | "text/plain": [
58 | "(322, 20)"
59 | ]
60 | },
61 | "execution_count": 3,
62 | "metadata": {},
63 | "output_type": "execute_result"
64 | }
65 | ],
66 | "source": [
67 | "hitters = pd.read_csv('Hitters.csv')\n",
68 | "hitters.shape"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 4,
74 | "metadata": {},
75 | "outputs": [
76 | {
77 | "data": {
78 | "text/plain": [
79 | "59"
80 | ]
81 | },
82 | "execution_count": 4,
83 | "metadata": {},
84 | "output_type": "execute_result"
85 | }
86 | ],
87 | "source": [
88 | "hitters.Salary.isna().sum() # number of NaN valeus in Salary column"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 5,
94 | "metadata": {},
95 | "outputs": [
96 | {
97 | "data": {
98 | "text/plain": [
99 | "(263, 20)"
100 | ]
101 | },
102 | "execution_count": 5,
103 | "metadata": {},
104 | "output_type": "execute_result"
105 | }
106 | ],
107 | "source": [
108 | "hitters = hitters.dropna(axis=0)\n",
109 | "hitters.shape"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 6,
115 | "metadata": {},
116 | "outputs": [],
117 | "source": [
118 | "X = hitters.loc[:, hitters.columns != 'Salary']\n",
119 | "y = hitters['Salary']"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": 7,
125 | "metadata": {},
126 | "outputs": [],
127 | "source": [
128 | "def get_index(df):\n",
129 | " col_index = {}\n",
130 | " column = df.columns\n",
131 | " for col in column:\n",
132 | " col_index[col] = df.columns.get_loc(col)\n",
133 | " return col_index"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": 8,
139 | "metadata": {},
140 | "outputs": [
141 | {
142 | "name": "stdout",
143 | "output_type": "stream",
144 | "text": [
145 | "{'AtBat': 0, 'Hits': 1, 'HmRun': 2, 'Runs': 3, 'RBI': 4, 'Walks': 5, 'Years': 6, 'CAtBat': 7, 'CHits': 8, 'CHmRun': 9, 'CRuns': 10, 'CRBI': 11, 'CWalks': 12, 'League': 13, 'Division': 14, 'PutOuts': 15, 'Assists': 16, 'Errors': 17, 'NewLeague': 18}\n"
146 | ]
147 | }
148 | ],
149 | "source": [
150 | "column_index = get_index(X)\n",
151 | "print(column_index)"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": 9,
157 | "metadata": {},
158 | "outputs": [],
159 | "source": [
160 | "def label_encoder(df, index):\n",
161 | " \"\"\"\n",
162 | " We will covnert categorical values into the dummies\n",
163 | " using LabelEncoder from scikit-learn\n",
164 | "\n",
165 | " \"\"\"\n",
166 | " le = LabelEncoder()\n",
167 | " for c in index:\n",
168 | " df.iloc[:, c] = le.fit_transform(df.iloc[:, c])\n",
169 | " return df"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": 10,
175 | "metadata": {},
176 | "outputs": [
177 | {
178 | "name": "stderr",
179 | "output_type": "stream",
180 | "text": [
181 | "F:\\Anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:494: SettingWithCopyWarning: \n",
182 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
183 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
184 | "\n",
185 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
186 | " self.obj[item] = s\n"
187 | ]
188 | }
189 | ],
190 | "source": [
191 | "X = label_encoder(X, [13, 14, 18]) # running previously defined function will turn categorical variables into dummies"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": 11,
197 | "metadata": {},
198 | "outputs": [
199 | {
200 | "data": {
201 | "text/html": [
202 | "\n",
203 | "\n",
216 | "
\n",
217 | " \n",
218 | " \n",
219 | " \n",
220 | " AtBat \n",
221 | " Hits \n",
222 | " HmRun \n",
223 | " Runs \n",
224 | " RBI \n",
225 | " Walks \n",
226 | " Years \n",
227 | " CAtBat \n",
228 | " CHits \n",
229 | " CHmRun \n",
230 | " CRuns \n",
231 | " CRBI \n",
232 | " CWalks \n",
233 | " League \n",
234 | " Division \n",
235 | " PutOuts \n",
236 | " Assists \n",
237 | " Errors \n",
238 | " NewLeague \n",
239 | " \n",
240 | " \n",
241 | " \n",
242 | " \n",
243 | " -Alan Ashby \n",
244 | " 315 \n",
245 | " 81 \n",
246 | " 7 \n",
247 | " 24 \n",
248 | " 38 \n",
249 | " 39 \n",
250 | " 14 \n",
251 | " 3449 \n",
252 | " 835 \n",
253 | " 69 \n",
254 | " 321 \n",
255 | " 414 \n",
256 | " 375 \n",
257 | " 1 \n",
258 | " 1 \n",
259 | " 632 \n",
260 | " 43 \n",
261 | " 10 \n",
262 | " 1 \n",
263 | " \n",
264 | " \n",
265 | " -Alvin Davis \n",
266 | " 479 \n",
267 | " 130 \n",
268 | " 18 \n",
269 | " 66 \n",
270 | " 72 \n",
271 | " 76 \n",
272 | " 3 \n",
273 | " 1624 \n",
274 | " 457 \n",
275 | " 63 \n",
276 | " 224 \n",
277 | " 266 \n",
278 | " 263 \n",
279 | " 0 \n",
280 | " 1 \n",
281 | " 880 \n",
282 | " 82 \n",
283 | " 14 \n",
284 | " 0 \n",
285 | " \n",
286 | " \n",
287 | " -Andre Dawson \n",
288 | " 496 \n",
289 | " 141 \n",
290 | " 20 \n",
291 | " 65 \n",
292 | " 78 \n",
293 | " 37 \n",
294 | " 11 \n",
295 | " 5628 \n",
296 | " 1575 \n",
297 | " 225 \n",
298 | " 828 \n",
299 | " 838 \n",
300 | " 354 \n",
301 | " 1 \n",
302 | " 0 \n",
303 | " 200 \n",
304 | " 11 \n",
305 | " 3 \n",
306 | " 1 \n",
307 | " \n",
308 | " \n",
309 | "
\n",
310 | "
"
311 | ],
312 | "text/plain": [
313 | " AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits \\\n",
314 | "-Alan Ashby 315 81 7 24 38 39 14 3449 835 \n",
315 | "-Alvin Davis 479 130 18 66 72 76 3 1624 457 \n",
316 | "-Andre Dawson 496 141 20 65 78 37 11 5628 1575 \n",
317 | "\n",
318 | " CHmRun CRuns CRBI CWalks League Division PutOuts \\\n",
319 | "-Alan Ashby 69 321 414 375 1 1 632 \n",
320 | "-Alvin Davis 63 224 266 263 0 1 880 \n",
321 | "-Andre Dawson 225 828 838 354 1 0 200 \n",
322 | "\n",
323 | " Assists Errors NewLeague \n",
324 | "-Alan Ashby 43 10 1 \n",
325 | "-Alvin Davis 82 14 0 \n",
326 | "-Andre Dawson 11 3 1 "
327 | ]
328 | },
329 | "execution_count": 11,
330 | "metadata": {},
331 | "output_type": "execute_result"
332 | }
333 | ],
334 | "source": [
335 | "X.head(3)"
336 | ]
337 | },
338 | {
339 | "cell_type": "code",
340 | "execution_count": 12,
341 | "metadata": {},
342 | "outputs": [
343 | {
344 | "data": {
345 | "text/plain": [
346 | "Index(['AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks', 'Years', 'CAtBat',\n",
347 | " 'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks', 'League', 'Division',\n",
348 | " 'PutOuts', 'Assists', 'Errors', 'NewLeague'],\n",
349 | " dtype='object')"
350 | ]
351 | },
352 | "execution_count": 12,
353 | "metadata": {},
354 | "output_type": "execute_result"
355 | }
356 | ],
357 | "source": [
358 | "X.columns"
359 | ]
360 | },
361 | {
362 | "cell_type": "markdown",
363 | "metadata": {},
364 | "source": [
365 | "----\n",
366 | "\n",
367 | "## _6.6.1 Ridge Regression_\n",
368 | "\n",
369 | "`Ridge` method from `sklearn` performs ridge regression for an automatically selected range of $\\lambda$. However, here we have chosen to implement the function over a grid of values ranging $\\lambda=10^{10}$ to $\\lambda=10^{-2}$, essentially covering the full range of scenarios from the null model containing only the intercept, to the least-squares fit."
370 | ]
371 | },
372 | {
373 | "cell_type": "code",
374 | "execution_count": 13,
375 | "metadata": {},
376 | "outputs": [
377 | {
378 | "data": {
379 | "text/plain": [
380 | "array([5.00000000e+09, 3.78231664e+09, 2.86118383e+09, 2.16438064e+09,\n",
381 | " 1.63727458e+09, 1.23853818e+09, 9.36908711e+08, 7.08737081e+08,\n",
382 | " 5.36133611e+08, 4.05565415e+08, 3.06795364e+08, 2.32079442e+08,\n",
383 | " 1.75559587e+08, 1.32804389e+08, 1.00461650e+08, 7.59955541e+07,\n",
384 | " 5.74878498e+07, 4.34874501e+07, 3.28966612e+07, 2.48851178e+07,\n",
385 | " 1.88246790e+07, 1.42401793e+07, 1.07721735e+07, 8.14875417e+06,\n",
386 | " 6.16423370e+06, 4.66301673e+06, 3.52740116e+06, 2.66834962e+06,\n",
387 | " 2.01850863e+06, 1.52692775e+06, 1.15506485e+06, 8.73764200e+05,\n",
388 | " 6.60970574e+05, 5.00000000e+05, 3.78231664e+05, 2.86118383e+05,\n",
389 | " 2.16438064e+05, 1.63727458e+05, 1.23853818e+05, 9.36908711e+04,\n",
390 | " 7.08737081e+04, 5.36133611e+04, 4.05565415e+04, 3.06795364e+04,\n",
391 | " 2.32079442e+04, 1.75559587e+04, 1.32804389e+04, 1.00461650e+04,\n",
392 | " 7.59955541e+03, 5.74878498e+03, 4.34874501e+03, 3.28966612e+03,\n",
393 | " 2.48851178e+03, 1.88246790e+03, 1.42401793e+03, 1.07721735e+03,\n",
394 | " 8.14875417e+02, 6.16423370e+02, 4.66301673e+02, 3.52740116e+02,\n",
395 | " 2.66834962e+02, 2.01850863e+02, 1.52692775e+02, 1.15506485e+02,\n",
396 | " 8.73764200e+01, 6.60970574e+01, 5.00000000e+01, 3.78231664e+01,\n",
397 | " 2.86118383e+01, 2.16438064e+01, 1.63727458e+01, 1.23853818e+01,\n",
398 | " 9.36908711e+00, 7.08737081e+00, 5.36133611e+00, 4.05565415e+00,\n",
399 | " 3.06795364e+00, 2.32079442e+00, 1.75559587e+00, 1.32804389e+00,\n",
400 | " 1.00461650e+00, 7.59955541e-01, 5.74878498e-01, 4.34874501e-01,\n",
401 | " 3.28966612e-01, 2.48851178e-01, 1.88246790e-01, 1.42401793e-01,\n",
402 | " 1.07721735e-01, 8.14875417e-02, 6.16423370e-02, 4.66301673e-02,\n",
403 | " 3.52740116e-02, 2.66834962e-02, 2.01850863e-02, 1.52692775e-02,\n",
404 | " 1.15506485e-02, 8.73764200e-03, 6.60970574e-03, 5.00000000e-03])"
405 | ]
406 | },
407 | "execution_count": 13,
408 | "metadata": {},
409 | "output_type": "execute_result"
410 | }
411 | ],
412 | "source": [
413 | "grid = 10**np.linspace(10, -2, 100)*0.5\n",
414 | "grid"
415 | ]
416 | },
417 | {
418 | "cell_type": "markdown",
419 | "metadata": {},
420 | "source": [
421 | "We will build a function `ridge_coefs` to run models for the $\\lambda$ values defined in the previous step."
422 | ]
423 | },
424 | {
425 | "cell_type": "code",
426 | "execution_count": 14,
427 | "metadata": {},
428 | "outputs": [],
429 | "source": [
430 | "def ridge_coefs(X, y, alphas):\n",
431 | " \n",
432 | " coefs = []\n",
433 | " ridge = Ridge(normalize = True)\n",
434 | " \n",
435 | " for a in alphas:\n",
436 | " ridge.set_params(alpha = a)\n",
437 | " ridge.fit(X, y)\n",
438 | " coefs.append(ridge.coef_)\n",
439 | " \n",
440 | " return coefs"
441 | ]
442 | },
443 | {
444 | "cell_type": "code",
445 | "execution_count": 15,
446 | "metadata": {},
447 | "outputs": [],
448 | "source": [
449 | "ridge_coefs = ridge_coefs(X, y, grid)"
450 | ]
451 | },
452 | {
453 | "cell_type": "code",
454 | "execution_count": 16,
455 | "metadata": {},
456 | "outputs": [
457 | {
458 | "name": "stdout",
459 | "output_type": "stream",
460 | "text": [
461 | "(100, 19)\n"
462 | ]
463 | }
464 | ],
465 | "source": [
466 | "print(np.shape(ridge_coefs))"
467 | ]
468 | },
469 | {
470 | "cell_type": "markdown",
471 | "metadata": {},
472 | "source": [
473 | "Associated with each value of $\\lambda$ is a vector of ridge regression coefficients, stored in a matrix that can be accessed by `coefs` in the below function. In this case, it is a 19×100 matrix, with 19 rows (one for each predictor, plus an intercept) and 100 columns (one for each value of $\\lambda$)."
474 | ]
475 | },
476 | {
477 | "cell_type": "code",
478 | "execution_count": 17,
479 | "metadata": {},
480 | "outputs": [
481 | {
482 | "data": {
483 | "text/plain": [
484 | "Text(0, 0.5, 'weights')"
485 | ]
486 | },
487 | "execution_count": 17,
488 | "metadata": {},
489 | "output_type": "execute_result"
490 | },
491 | {
492 | "data": {
493 | "image/png": "\n",
494 | "text/plain": [
495 | ""
496 | ]
497 | },
498 | "metadata": {
499 | "needs_background": "light"
500 | },
501 | "output_type": "display_data"
502 | }
503 | ],
504 | "source": [
505 | "ax = plt.gca()\n",
506 | "ax.plot(grid, ridge_coefs)\n",
507 | "ax.set_xscale('log')\n",
508 | "plt.axis('tight')\n",
509 | "plt.xlabel('lambda')\n",
510 | "plt.ylabel('weights')"
511 | ]
512 | },
513 | {
514 | "cell_type": "markdown",
515 | "metadata": {},
516 | "source": [
517 | "We expect the coefficient estimates to be much smaller, in terms of $l_2$ norm, when a large value of $\\lambda$ is used, as compared to when a small value of $\\lambda$ is used.\n",
518 | "\n",
519 | "We now split the samples into a training set and a test set in order to estimate the test error of ridge regression and the lasso. We will use `train_test_split` method from `sklearn` to split the model in half."
520 | ]
521 | },
522 | {
523 | "cell_type": "code",
524 | "execution_count": 18,
525 | "metadata": {},
526 | "outputs": [],
527 | "source": [
528 | "# we split the training and test data\n",
529 | "from sklearn.model_selection import train_test_split\n",
530 | "X_train, X_test , y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1)"
531 | ]
532 | },
533 | {
534 | "cell_type": "markdown",
535 | "metadata": {},
536 | "source": [
537 | "Next, we fit a ridge regression model on the training set, and evaluate its MSE on the test set, using $\\lambda$ = 4. Note the use of the `predict()` function again."
538 | ]
539 | },
540 | {
541 | "cell_type": "code",
542 | "execution_count": 19,
543 | "metadata": {},
544 | "outputs": [
545 | {
546 | "name": "stdout",
547 | "output_type": "stream",
548 | "text": [
549 | "AtBat 0.098658\n",
550 | "Hits 0.446094\n",
551 | "HmRun 1.412107\n",
552 | "Runs 0.660773\n",
553 | "RBI 0.843403\n",
554 | "Walks 1.008473\n",
555 | "Years 2.779882\n",
556 | "CAtBat 0.008244\n",
557 | "CHits 0.034149\n",
558 | "CHmRun 0.268634\n",
559 | "CRuns 0.070407\n",
560 | "CRBI 0.070060\n",
561 | "CWalks 0.082795\n",
562 | "League 4.241051\n",
563 | "Division -30.768885\n",
564 | "PutOuts 0.104747\n",
565 | "Assists -0.003739\n",
566 | "Errors 0.268363\n",
567 | "NewLeague 4.123474\n",
568 | "dtype: float64\n",
569 | "\n",
570 | "MSE: 106216.52238005561\n"
571 | ]
572 | }
573 | ],
574 | "source": [
575 | "ridge2 = Ridge(normalize = True, alpha = 4)\n",
576 | "ridge2.fit(X_train, y_train)\n",
577 | "ridge2_pred = ridge2.predict(X_test)\n",
578 | "\n",
579 | "print(pd.Series(ridge2.coef_, index = X.columns)) # Print coefficients\n",
580 | "print('\\nMSE: ', mean_squared_error(y_test, ridge2_pred)) # MSE of the model"
581 | ]
582 | },
583 | {
584 | "cell_type": "markdown",
585 | "metadata": {},
586 | "source": [
587 | "* The test MSE is 106216.52\n",
588 | "\n",
589 | "Note that if we had instead simply fit a model with just an intercept.\n",
590 | "\n",
591 | "Now let's look at the MSE value resulting by fitting a ridge regression model with a very large value of $\\lambda$. Note that 1e10 means 1010."
592 | ]
593 | },
594 | {
595 | "cell_type": "code",
596 | "execution_count": 20,
597 | "metadata": {},
598 | "outputs": [
599 | {
600 | "name": "stdout",
601 | "output_type": "stream",
602 | "text": [
603 | "AtBat 1.317464e-10\n",
604 | "Hits 4.647486e-10\n",
605 | "HmRun 2.079865e-09\n",
606 | "Runs 7.726175e-10\n",
607 | "RBI 9.390640e-10\n",
608 | "Walks 9.769219e-10\n",
609 | "Years 3.961442e-09\n",
610 | "CAtBat 1.060533e-11\n",
611 | "CHits 3.993605e-11\n",
612 | "CHmRun 2.959428e-10\n",
613 | "CRuns 8.245247e-11\n",
614 | "CRBI 7.795451e-11\n",
615 | "CWalks 9.894387e-11\n",
616 | "League -2.501281e-09\n",
617 | "Division -1.549951e-08\n",
618 | "PutOuts 7.268991e-11\n",
619 | "Assists -2.615885e-12\n",
620 | "Errors 2.084514e-10\n",
621 | "NewLeague -2.023196e-09\n",
622 | "dtype: float64\n",
623 | "\n",
624 | "MSE: 172862.23580379886\n"
625 | ]
626 | }
627 | ],
628 | "source": [
629 | "ridge3 = Ridge(normalize = True, alpha = 1e10)\n",
630 | "ridge3.fit(X_train, y_train)\n",
631 | "ridge3_pred = ridge3.predict(X_test)\n",
632 | "\n",
633 | "print(pd.Series(ridge3.coef_, index = X.columns)) # Print coefficients\n",
634 | "print('\\nMSE: ', mean_squared_error(y_test, ridge3_pred)) # MSE of the model"
635 | ]
636 | },
637 | {
638 | "cell_type": "markdown",
639 | "metadata": {},
640 | "source": [
641 | "* The test MSE is 172862\n",
642 | "\n",
643 | "So fitting a ridge regression model with $\\lambda$ = 4 leads to a much lower test MSE than fitting a model with just an intercept. We now check whether there is any benefit to performing ridge regression with $\\lambda$ = 4 instead of just performing least squares regression.\n",
644 | "\n",
645 | "Recall that least squares is simply ridge regression with $\\lambda$ = 0."
646 | ]
647 | },
648 | {
649 | "cell_type": "code",
650 | "execution_count": 21,
651 | "metadata": {},
652 | "outputs": [
653 | {
654 | "name": "stdout",
655 | "output_type": "stream",
656 | "text": [
657 | "AtBat -1.821115\n",
658 | "Hits 4.259156\n",
659 | "HmRun -4.773401\n",
660 | "Runs -0.038760\n",
661 | "RBI 3.984578\n",
662 | "Walks 3.470126\n",
663 | "Years 9.498236\n",
664 | "CAtBat -0.605129\n",
665 | "CHits 2.174979\n",
666 | "CHmRun 2.979306\n",
667 | "CRuns 0.266356\n",
668 | "CRBI -0.598456\n",
669 | "CWalks 0.171383\n",
670 | "League 133.743163\n",
671 | "Division -113.743875\n",
672 | "PutOuts 0.421063\n",
673 | "Assists 0.464379\n",
674 | "Errors -6.024576\n",
675 | "NewLeague -81.927763\n",
676 | "dtype: float64\n",
677 | "\n",
678 | "MSE: 116690.46856659841\n"
679 | ]
680 | }
681 | ],
682 | "source": [
683 | "ridge4= Ridge(normalize = True, alpha = 0)\n",
684 | "ridge4.fit(X_train, y_train)\n",
685 | "ridge4_pred = ridge4.predict(X_test)\n",
686 | "\n",
687 | "print(pd.Series(ridge4.coef_, index = X.columns)) # Print coefficients\n",
688 | "print('\\nMSE: ', mean_squared_error(y_test, ridge4_pred)) # MSE of the model"
689 | ]
690 | },
691 | {
692 | "cell_type": "markdown",
693 | "metadata": {},
694 | "source": [
695 | "We see that ridge regression with $\\lambda$ = 4 indeed improves model over the simple least-squares model.\n",
696 | "\n",
697 | "In general, instead of arbitrarily choosing $\\lambda$ = 4, it would be better to use cross-validation to choose the tuning parameter $\\lambda$. We can do this using the built-in cross-validation function, `RidgeCV`."
698 | ]
699 | },
700 | {
701 | "cell_type": "code",
702 | "execution_count": 22,
703 | "metadata": {},
704 | "outputs": [
705 | {
706 | "name": "stdout",
707 | "output_type": "stream",
708 | "text": [
709 | "Lambda that results in lowest CV error: 0.5749\n"
710 | ]
711 | },
712 | {
713 | "name": "stderr",
714 | "output_type": "stream",
715 | "text": [
716 | "F:\\Anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_search.py:814: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.\n",
717 | " DeprecationWarning)\n"
718 | ]
719 | }
720 | ],
721 | "source": [
722 | "ridge_cv = RidgeCV(alphas = grid, scoring = 'neg_mean_squared_error', normalize=True, cv=10)\n",
723 | "ridge_cv.fit(X_train, y_train)\n",
724 | "print('Lambda that results in lowest CV error: ', round(ridge_cv.alpha_, 4))"
725 | ]
726 | },
727 | {
728 | "cell_type": "markdown",
729 | "metadata": {},
730 | "source": [
731 | "What is the test MSE associated with this value of $\\lambda$?"
732 | ]
733 | },
734 | {
735 | "cell_type": "code",
736 | "execution_count": 23,
737 | "metadata": {},
738 | "outputs": [
739 | {
740 | "data": {
741 | "text/plain": [
742 | "99825.6489629273"
743 | ]
744 | },
745 | "execution_count": 23,
746 | "metadata": {},
747 | "output_type": "execute_result"
748 | }
749 | ],
750 | "source": [
751 | "ridge5 = Ridge(alpha = ridge_cv.alpha_, normalize = True)\n",
752 | "ridge5.fit(X_train, y_train)\n",
753 | "mean_squared_error(y_test, ridge5.predict(X_test))"
754 | ]
755 | },
756 | {
757 | "cell_type": "markdown",
758 | "metadata": {},
759 | "source": [
760 | "This represents a further improvement over the test MSE that we got using $\\lambda$ = 4. Finally, we refit our ridge regression model on the full data set, using the value of $\\lambda$ chosen by cross-validation, and examine the coefficient estimates."
761 | ]
762 | },
763 | {
764 | "cell_type": "code",
765 | "execution_count": 24,
766 | "metadata": {},
767 | "outputs": [
768 | {
769 | "name": "stdout",
770 | "output_type": "stream",
771 | "text": [
772 | "AtBat -0.013534\n",
773 | "Hits 0.864143\n",
774 | "HmRun -0.291615\n",
775 | "Runs 0.721999\n",
776 | "RBI 1.318149\n",
777 | "Walks 2.092914\n",
778 | "Years 1.200022\n",
779 | "CAtBat 0.006651\n",
780 | "CHits 0.052020\n",
781 | "CHmRun 0.513053\n",
782 | "CRuns 0.106745\n",
783 | "CRBI 0.128286\n",
784 | "CWalks 0.098953\n",
785 | "League 18.915652\n",
786 | "Division -92.175508\n",
787 | "PutOuts 0.280247\n",
788 | "Assists -0.017241\n",
789 | "Errors -0.315756\n",
790 | "NewLeague 9.936172\n",
791 | "dtype: float64\n"
792 | ]
793 | }
794 | ],
795 | "source": [
796 | "print(pd.Series(ridge5.coef_, index = X.columns))"
797 | ]
798 | },
799 | {
800 | "cell_type": "markdown",
801 | "metadata": {},
802 | "source": [
803 | "As expected, none of the coefficients are zero—ridge regression does not perform variable selection!\n",
804 | "\n",
805 | "-------\n",
806 | "## _6.6.2 The Lasso_\n",
807 | "\n",
808 | "We saw that ridge regression with a wise choice of $\\lambda$ can outperform the least squares as well as the null model on the Hitters data set. We now ask whether the lasso can yield either a more accurate or a more interpretable model than ridge regression. In order to fit a lasso model, we use `Lasso` method from `sklearn` library. Other than that change, we proceed just as we did in fitting a ridge model."
809 | ]
810 | },
811 | {
812 | "cell_type": "code",
813 | "execution_count": 25,
814 | "metadata": {},
815 | "outputs": [],
816 | "source": [
817 | "def lasso_coefs(X, y, grid):\n",
818 | " \n",
819 | " lasso_coefs = []\n",
820 | " lasso = Lasso(max_iter=100000, normalize=True) # setup maximum iterations to 100000\n",
821 | " \n",
822 | " for g in grid:\n",
823 | " lasso.set_params(alpha = g)\n",
824 | " lasso.fit(X, y)\n",
825 | " lasso_coefs.append(lasso.coef_)\n",
826 | " \n",
827 | " return lasso_coefs"
828 | ]
829 | },
830 | {
831 | "cell_type": "code",
832 | "execution_count": 26,
833 | "metadata": {},
834 | "outputs": [],
835 | "source": [
836 | "lasso_coef = lasso_coefs(scale(X_train), y_train, grid)"
837 | ]
838 | },
839 | {
840 | "cell_type": "code",
841 | "execution_count": 27,
842 | "metadata": {},
843 | "outputs": [
844 | {
845 | "data": {
846 | "text/plain": [
847 | "Text(0, 0.5, 'weights')"
848 | ]
849 | },
850 | "execution_count": 27,
851 | "metadata": {},
852 | "output_type": "execute_result"
853 | },
854 | {
855 | "data": {
856 | "image/png": "\n",
857 | "text/plain": [
858 | ""
859 | ]
860 | },
861 | "metadata": {
862 | "needs_background": "light"
863 | },
864 | "output_type": "display_data"
865 | }
866 | ],
867 | "source": [
868 | "ax = plt.gca()\n",
869 | "ax.plot(grid*2, lasso_coef)\n",
870 | "ax.set_xscale('log')\n",
871 | "plt.axis('tight')\n",
872 | "plt.xlabel('alpha')\n",
873 | "plt.ylabel('weights')"
874 | ]
875 | },
876 | {
877 | "cell_type": "markdown",
878 | "metadata": {},
879 | "source": [
880 | "We can see from the coefficient plot that depending on the choice of tuning parameter, some of the coefficients will be exactly equal to zero. We now perform cross-validation and compute the associated test error."
881 | ]
882 | },
883 | {
884 | "cell_type": "code",
885 | "execution_count": 28,
886 | "metadata": {},
887 | "outputs": [
888 | {
889 | "name": "stdout",
890 | "output_type": "stream",
891 | "text": [
892 | "Lambda that results in lowest CV error is: 2.403\n"
893 | ]
894 | }
895 | ],
896 | "source": [
897 | "lasso_cv = LassoCV(alphas=None, normalize=True, cv=10, max_iter = 100000)\n",
898 | "lasso_cv.fit(X_train, y_train)\n",
899 | "print('Lambda that results in lowest CV error is: ', round(lasso_cv.alpha_, 4))"
900 | ]
901 | },
902 | {
903 | "cell_type": "code",
904 | "execution_count": 29,
905 | "metadata": {},
906 | "outputs": [
907 | {
908 | "data": {
909 | "text/plain": [
910 | "104960.66341829994"
911 | ]
912 | },
913 | "execution_count": 29,
914 | "metadata": {},
915 | "output_type": "execute_result"
916 | }
917 | ],
918 | "source": [
919 | "lasso = Lasso(max_iter = 10000, normalize = True)\n",
920 | "lasso.set_params(alpha=lasso_cv.alpha_)\n",
921 | "lasso_pred = lasso.fit(X_train, y_train).predict(X_test)\n",
922 | "mean_squared_error(y_test, lasso_pred)"
923 | ]
924 | },
925 | {
926 | "cell_type": "markdown",
927 | "metadata": {},
928 | "source": [
929 | "This is substantially lower than the test set MSE of the null model and of the least-squares, and very similar to the test MSE of ridge regression with $\\lambda$ chosen by cross-validation.\n",
930 | "\n",
931 | "However, the lasso has a substantial advantage over ridge regression in that the resulting coefficient estimates are sparse. Here we see that 12 of the 19 coefficient estimates are exactly zero. So the lasso model with $\\lambda$ chosen by cross-validation contains only seven variables."
932 | ]
933 | },
934 | {
935 | "cell_type": "code",
936 | "execution_count": 30,
937 | "metadata": {},
938 | "outputs": [
939 | {
940 | "data": {
941 | "text/plain": [
942 | "AtBat 0.000000\n",
943 | "Hits 1.082448\n",
944 | "HmRun 0.000000\n",
945 | "Runs 0.000000\n",
946 | "RBI 0.000000\n",
947 | "Walks 2.906385\n",
948 | "Years 0.000000\n",
949 | "CAtBat 0.000000\n",
950 | "CHits 0.000000\n",
951 | "CHmRun 0.219372\n",
952 | "CRuns 0.000000\n",
953 | "CRBI 0.513974\n",
954 | "CWalks 0.000000\n",
955 | "League 0.000000\n",
956 | "Division -89.064199\n",
957 | "PutOuts 0.368401\n",
958 | "Assists -0.000000\n",
959 | "Errors -0.000000\n",
960 | "NewLeague 0.000000\n",
961 | "dtype: float64"
962 | ]
963 | },
964 | "execution_count": 30,
965 | "metadata": {},
966 | "output_type": "execute_result"
967 | }
968 | ],
969 | "source": [
970 | "pd.Series(lasso.coef_, index=X.columns)"
971 | ]
972 | },
973 | {
974 | "cell_type": "code",
975 | "execution_count": null,
976 | "metadata": {},
977 | "outputs": [],
978 | "source": []
979 | }
980 | ],
981 | "metadata": {
982 | "kernelspec": {
983 | "display_name": "Python 3",
984 | "language": "python",
985 | "name": "python3"
986 | },
987 | "language_info": {
988 | "codemirror_mode": {
989 | "name": "ipython",
990 | "version": 3
991 | },
992 | "file_extension": ".py",
993 | "mimetype": "text/x-python",
994 | "name": "python",
995 | "nbconvert_exporter": "python",
996 | "pygments_lexer": "ipython3",
997 | "version": "3.7.4"
998 | }
999 | },
1000 | "nbformat": 4,
1001 | "nbformat_minor": 2
1002 | }
1003 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Welcome to The Repository of The Lab Exercises For ITSL
2 |
3 | The lab works are performed in python jupyter notebook and they follow:
4 |
5 | An Introduction To Statistical Learning with Applications in R
6 |
7 |
8 |
9 | This is a great introductory book for people who are interested in machine learning. This book explores the most commonly used supervised and unsupervised machine learning principles. Prior knowledge of basic statistics and linear algebra is helpful but not necessary. However, as the name of the book suggests all the applications of the given concepts and models are implemented using R statistical software.
10 |
11 |
12 | With the increasing popularity of the Python language due to its versatility and user-friendly syntax, lots of newcomers will try to learn machine learning using Python. The goal of this project is to replicate all the lab works using the Python programming language. Notebooks will be provided with all the steps which are performed in the book. I hope these guides will be helpful. The book consists of 10 chapters and the lab exercises start from chapter three. This project follows the ITSL in the same order.
13 |
14 | _* Notes: Some of the R functions are not directly transferable. If you see something is omitted that is because or it was previously entailed somewhere or it's just meaningless. I periodically try to implement functions where there is a repetition of steps. If you have any suggestions or improvements please share!_
15 |
16 | ---------
17 | ### Content:
18 | - Chapter 3 - Linear Regression
19 | - Simple linear regression
20 | - Multiple linear regression
21 | - Interaction Term
22 | - Non-linear Transformations of the Predictors
23 |
24 | - Chapter 4 - Classification
25 | - Logistic Regression
26 | - Linear Discriminant Analysis
27 | - Quadratic Discriminant Analysis
28 | - KNN
29 | - An Application to Caravan Insurance Data
30 |
31 | - Chapter 5 - Resampling Methods
32 | - Leave-One-Out Cross-Validation
33 | - k-Fold Cross-Validation
34 | - The Bootstrap
35 |
36 | - Chapter 6 - Linear Model Selection and Regularization
37 | - Lab 1: Subset Selection Methods
38 | - Best Subset Selection
39 | - Forward and Backward Stepwise Selection
40 | - Choosing Among Models
41 | - Lab 2: Ridge Regression and the Lasso
42 | - Ridge Regression
43 | - The Lasso
44 | - Lab 3: PCR and PLS Regression
45 | - Principal Components Regression
46 | - Partial Least Squares
47 |
48 | - Chapter 7 - Moving Beyond Linearity
49 | - Non-linear Modeling
50 | - Splines
51 | - GAMs
52 |
53 | - Chapter 8 - Tree Based Methods
54 | - Fitting Classification Trees
55 | - Fitting Regression Trees
56 | - Bagging and Random Forests
57 | - Boosting
58 |
59 | more to come...
60 |
61 | --------
62 | ### Datasets used in Lab
63 |
64 | _Chapter 3_ | [Boston.csv](https://rb.gy/n576o8) | [Carseats.csv](https://rb.gy/0p5fob) |
65 |
66 | _Chapter 4_ | [Smarket.csv](https://rb.gy/eg0zwb) | [Caravan.csv](https://rb.gy/pfbpzi) |
67 |
68 | _Chapter 5_ | [Auto.csv](https://rb.gy/ijqxck) | [Portfolio.csv](https://rb.gy/twjsui) |
69 |
70 | _Chapter 6_ | [Hitters.csv](https://drive.google.com/file/d/1e2NqNJGkCTAGBee8JHGNGCJHplG5R2YQ/view?usp=sharing) |
71 |
72 | _Chapter 7_ | [Wage.csv](https://drive.google.com/file/d/1puA-UrAstmnJfb7XjJlp6oLK9vb2l9LD/view?usp=sharing) |
73 |
74 | _Chapter 8_ |
75 |
76 | ---------
77 | ### Follow me on social media
78 | [](https://twitter.com/bexxmodd)
79 | [](https://www.linkedin.com/in/bmodebadze)
80 |
--------------------------------------------------------------------------------
/chapter05_lab.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 5.3 Lab: Cross-Validation and the Bootstrap\n",
8 | "\n",
9 | "\n",
10 | "\n",
11 | "\n",
19 | "\n",
20 | "\n",
21 | "In this lab, we explore the resampling techniques covered in this chapter\n",
22 | "\n",
23 | "\n",
24 | "## _5.3.1 The Validation Set Approach_\n",
25 | "\n",
26 | "We explore the use of the validation set approach in order to estimate the test error rates that result from fitting various linear models on the **Auto** data set.\n",
27 | "> You can download the data set for from **[here](https://drive.google.com/file/d/1hrotP0JbpDXTCVKjaGi0WJl1IdAjHqrO/view?usp=sharing)**"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 1,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "import pandas as pd\n",
37 | "import numpy as np\n",
38 | "import matplotlib.pyplot as plt\n",
39 | "import seaborn as sns\n",
40 | "%matplotlib inline"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 2,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "data": {
50 | "text/html": [
51 | "\n",
52 | "\n",
65 | "
\n",
66 | " \n",
67 | " \n",
68 | " \n",
69 | " mpg \n",
70 | " cylinders \n",
71 | " displacement \n",
72 | " horsepower \n",
73 | " weight \n",
74 | " acceleration \n",
75 | " year \n",
76 | " origin \n",
77 | " name \n",
78 | " \n",
79 | " \n",
80 | " \n",
81 | " \n",
82 | " 0 \n",
83 | " 18.0 \n",
84 | " 8 \n",
85 | " 307.0 \n",
86 | " 130 \n",
87 | " 3504 \n",
88 | " 12.0 \n",
89 | " 70 \n",
90 | " 1 \n",
91 | " chevrolet chevelle malibu \n",
92 | " \n",
93 | " \n",
94 | " 1 \n",
95 | " 15.0 \n",
96 | " 8 \n",
97 | " 350.0 \n",
98 | " 165 \n",
99 | " 3693 \n",
100 | " 11.5 \n",
101 | " 70 \n",
102 | " 1 \n",
103 | " buick skylark 320 \n",
104 | " \n",
105 | " \n",
106 | " 2 \n",
107 | " 18.0 \n",
108 | " 8 \n",
109 | " 318.0 \n",
110 | " 150 \n",
111 | " 3436 \n",
112 | " 11.0 \n",
113 | " 70 \n",
114 | " 1 \n",
115 | " plymouth satellite \n",
116 | " \n",
117 | " \n",
118 | " 3 \n",
119 | " 16.0 \n",
120 | " 8 \n",
121 | " 304.0 \n",
122 | " 150 \n",
123 | " 3433 \n",
124 | " 12.0 \n",
125 | " 70 \n",
126 | " 1 \n",
127 | " amc rebel sst \n",
128 | " \n",
129 | " \n",
130 | " 4 \n",
131 | " 17.0 \n",
132 | " 8 \n",
133 | " 302.0 \n",
134 | " 140 \n",
135 | " 3449 \n",
136 | " 10.5 \n",
137 | " 70 \n",
138 | " 1 \n",
139 | " ford torino \n",
140 | " \n",
141 | " \n",
142 | "
\n",
143 | "
"
144 | ],
145 | "text/plain": [
146 | " mpg cylinders displacement horsepower weight acceleration year \\\n",
147 | "0 18.0 8 307.0 130 3504 12.0 70 \n",
148 | "1 15.0 8 350.0 165 3693 11.5 70 \n",
149 | "2 18.0 8 318.0 150 3436 11.0 70 \n",
150 | "3 16.0 8 304.0 150 3433 12.0 70 \n",
151 | "4 17.0 8 302.0 140 3449 10.5 70 \n",
152 | "\n",
153 | " origin name \n",
154 | "0 1 chevrolet chevelle malibu \n",
155 | "1 1 buick skylark 320 \n",
156 | "2 1 plymouth satellite \n",
157 | "3 1 amc rebel sst \n",
158 | "4 1 ford torino "
159 | ]
160 | },
161 | "execution_count": 2,
162 | "metadata": {},
163 | "output_type": "execute_result"
164 | }
165 | ],
166 | "source": [
167 | "auto = pd.read_csv('Auto.csv')\n",
168 | "auto.head()"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": 3,
174 | "metadata": {},
175 | "outputs": [
176 | {
177 | "name": "stdout",
178 | "output_type": "stream",
179 | "text": [
180 | "\n",
181 | "RangeIndex: 397 entries, 0 to 396\n",
182 | "Data columns (total 9 columns):\n",
183 | "mpg 397 non-null float64\n",
184 | "cylinders 397 non-null int64\n",
185 | "displacement 397 non-null float64\n",
186 | "horsepower 397 non-null object\n",
187 | "weight 397 non-null int64\n",
188 | "acceleration 397 non-null float64\n",
189 | "year 397 non-null int64\n",
190 | "origin 397 non-null int64\n",
191 | "name 397 non-null object\n",
192 | "dtypes: float64(3), int64(4), object(2)\n",
193 | "memory usage: 28.0+ KB\n"
194 | ]
195 | }
196 | ],
197 | "source": [
198 | "auto.info()"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": 4,
204 | "metadata": {},
205 | "outputs": [
206 | {
207 | "data": {
208 | "text/html": [
209 | "\n",
210 | "\n",
223 | "
\n",
224 | " \n",
225 | " \n",
226 | " \n",
227 | " mpg \n",
228 | " cylinders \n",
229 | " displacement \n",
230 | " weight \n",
231 | " acceleration \n",
232 | " year \n",
233 | " origin \n",
234 | " \n",
235 | " \n",
236 | " \n",
237 | " \n",
238 | " count \n",
239 | " 397.000000 \n",
240 | " 397.000000 \n",
241 | " 397.000000 \n",
242 | " 397.000000 \n",
243 | " 397.000000 \n",
244 | " 397.000000 \n",
245 | " 397.000000 \n",
246 | " \n",
247 | " \n",
248 | " mean \n",
249 | " 23.515869 \n",
250 | " 5.458438 \n",
251 | " 193.532746 \n",
252 | " 2970.261965 \n",
253 | " 15.555668 \n",
254 | " 75.994962 \n",
255 | " 1.574307 \n",
256 | " \n",
257 | " \n",
258 | " std \n",
259 | " 7.825804 \n",
260 | " 1.701577 \n",
261 | " 104.379583 \n",
262 | " 847.904119 \n",
263 | " 2.749995 \n",
264 | " 3.690005 \n",
265 | " 0.802549 \n",
266 | " \n",
267 | " \n",
268 | " min \n",
269 | " 9.000000 \n",
270 | " 3.000000 \n",
271 | " 68.000000 \n",
272 | " 1613.000000 \n",
273 | " 8.000000 \n",
274 | " 70.000000 \n",
275 | " 1.000000 \n",
276 | " \n",
277 | " \n",
278 | " 25% \n",
279 | " 17.500000 \n",
280 | " 4.000000 \n",
281 | " 104.000000 \n",
282 | " 2223.000000 \n",
283 | " 13.800000 \n",
284 | " 73.000000 \n",
285 | " 1.000000 \n",
286 | " \n",
287 | " \n",
288 | " 50% \n",
289 | " 23.000000 \n",
290 | " 4.000000 \n",
291 | " 146.000000 \n",
292 | " 2800.000000 \n",
293 | " 15.500000 \n",
294 | " 76.000000 \n",
295 | " 1.000000 \n",
296 | " \n",
297 | " \n",
298 | " 75% \n",
299 | " 29.000000 \n",
300 | " 8.000000 \n",
301 | " 262.000000 \n",
302 | " 3609.000000 \n",
303 | " 17.100000 \n",
304 | " 79.000000 \n",
305 | " 2.000000 \n",
306 | " \n",
307 | " \n",
308 | " max \n",
309 | " 46.600000 \n",
310 | " 8.000000 \n",
311 | " 455.000000 \n",
312 | " 5140.000000 \n",
313 | " 24.800000 \n",
314 | " 82.000000 \n",
315 | " 3.000000 \n",
316 | " \n",
317 | " \n",
318 | "
\n",
319 | "
"
320 | ],
321 | "text/plain": [
322 | " mpg cylinders displacement weight acceleration \\\n",
323 | "count 397.000000 397.000000 397.000000 397.000000 397.000000 \n",
324 | "mean 23.515869 5.458438 193.532746 2970.261965 15.555668 \n",
325 | "std 7.825804 1.701577 104.379583 847.904119 2.749995 \n",
326 | "min 9.000000 3.000000 68.000000 1613.000000 8.000000 \n",
327 | "25% 17.500000 4.000000 104.000000 2223.000000 13.800000 \n",
328 | "50% 23.000000 4.000000 146.000000 2800.000000 15.500000 \n",
329 | "75% 29.000000 8.000000 262.000000 3609.000000 17.100000 \n",
330 | "max 46.600000 8.000000 455.000000 5140.000000 24.800000 \n",
331 | "\n",
332 | " year origin \n",
333 | "count 397.000000 397.000000 \n",
334 | "mean 75.994962 1.574307 \n",
335 | "std 3.690005 0.802549 \n",
336 | "min 70.000000 1.000000 \n",
337 | "25% 73.000000 1.000000 \n",
338 | "50% 76.000000 1.000000 \n",
339 | "75% 79.000000 2.000000 \n",
340 | "max 82.000000 3.000000 "
341 | ]
342 | },
343 | "execution_count": 4,
344 | "metadata": {},
345 | "output_type": "execute_result"
346 | }
347 | ],
348 | "source": [
349 | "auto.describe()"
350 | ]
351 | },
352 | {
353 | "cell_type": "code",
354 | "execution_count": 5,
355 | "metadata": {},
356 | "outputs": [
357 | {
358 | "name": "stdout",
359 | "output_type": "stream",
360 | "text": [
361 | "\n",
362 | "RangeIndex: 397 entries, 0 to 396\n",
363 | "Data columns (total 9 columns):\n",
364 | "mpg 397 non-null float64\n",
365 | "cylinders 397 non-null int64\n",
366 | "displacement 397 non-null float64\n",
367 | "horsepower 397 non-null int32\n",
368 | "weight 397 non-null int64\n",
369 | "acceleration 397 non-null float64\n",
370 | "year 397 non-null int64\n",
371 | "origin 397 non-null int64\n",
372 | "name 397 non-null object\n",
373 | "dtypes: float64(3), int32(1), int64(4), object(1)\n",
374 | "memory usage: 26.5+ KB\n"
375 | ]
376 | }
377 | ],
378 | "source": [
379 | "auto['horsepower'] = auto['horsepower'].replace('?', '105')\n",
380 | "auto = auto.astype({'horsepower':'int32'})\n",
381 | "auto.info()"
382 | ]
383 | },
384 | {
385 | "cell_type": "markdown",
386 | "metadata": {},
387 | "source": [
388 | "We begin by using `sklearn` function to split the set of observations into two halves, by selecting a random subset of 196 observations out of the original 392 observations. We refer to these observations as the training set."
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": 6,
394 | "metadata": {},
395 | "outputs": [],
396 | "source": [
397 | "X = auto.iloc[:, 1:-1]\n",
398 | "y = auto.iloc[:, 0]"
399 | ]
400 | },
401 | {
402 | "cell_type": "code",
403 | "execution_count": 7,
404 | "metadata": {},
405 | "outputs": [],
406 | "source": [
407 | "from sklearn.model_selection import train_test_split\n",
408 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1)"
409 | ]
410 | },
411 | {
412 | "cell_type": "markdown",
413 | "metadata": {},
414 | "source": [
415 | "We start by fitting the linear model into the training set with **mpg** and **hp**."
416 | ]
417 | },
418 | {
419 | "cell_type": "code",
420 | "execution_count": 8,
421 | "metadata": {},
422 | "outputs": [],
423 | "source": [
424 | "hp_train = X_train[['horsepower']].values\n",
425 | "hp_test = X_test[['horsepower']].values"
426 | ]
427 | },
428 | {
429 | "cell_type": "code",
430 | "execution_count": 9,
431 | "metadata": {},
432 | "outputs": [
433 | {
434 | "name": "stdout",
435 | "output_type": "stream",
436 | "text": [
437 | "24.627375790258384\n"
438 | ]
439 | }
440 | ],
441 | "source": [
442 | "from sklearn.linear_model import LinearRegression\n",
443 | "from sklearn.metrics import mean_squared_error\n",
444 | "\n",
445 | "lm = LinearRegression()\n",
446 | "lm1 = lm.fit(hp_train, y_train)\n",
447 | "pred = lm1.predict(hp_test)\n",
448 | "print(mean_squared_error(y_test, pred))"
449 | ]
450 | },
451 | {
452 | "cell_type": "markdown",
453 | "metadata": {},
454 | "source": [
455 | "* the estimated test MSE for the linear regression fit is 24.63\n",
456 | "\n",
457 | "We can use the `polynomial` function to estimate the test error for the polynomial\n",
458 | "and cubic regressions."
459 | ]
460 | },
461 | {
462 | "cell_type": "markdown",
463 | "metadata": {},
464 | "source": [
465 | "from sklearn.preprocessing import PolynomialFeatures\n",
466 | "\n",
467 | "poly2 = PolynomialFeatures(degree=2) # we choose the degree of the polynomial function\n",
468 | "poly2_train = poly2.fit_transform(X_train) # we fit into the train data and then transform\n",
469 | "poly2_test = poly2.transform(X_test) # but we only transform the data of the trainning set\n",
470 | "pm = lm.fit(poly2_train, y_train) # we simply run linear model with transformed data\n",
471 | "y_pred2 = pm.predict(poly2_test)"
472 | ]
473 | },
474 | {
475 | "cell_type": "code",
476 | "execution_count": 10,
477 | "metadata": {},
478 | "outputs": [
479 | {
480 | "name": "stdout",
481 | "output_type": "stream",
482 | "text": [
483 | "19.523081625165556\n",
484 | "19.691109995296056\n"
485 | ]
486 | }
487 | ],
488 | "source": [
489 | "from sklearn.preprocessing import PolynomialFeatures\n",
490 | "\n",
491 | "# Quadratic\n",
492 | "poly = PolynomialFeatures(degree=2)\n",
493 | "X_train2 = poly.fit_transform(hp_train)\n",
494 | "X_test2 = poly.transform(hp_test)\n",
495 | "model = lm1.fit(X_train2, y_train)\n",
496 | "print(mean_squared_error(y_test, model.predict(X_test2)))\n",
497 | "\n",
498 | "# Cubic\n",
499 | "poly = PolynomialFeatures(degree=3)\n",
500 | "X_train3 = poly.fit_transform(hp_train)\n",
501 | "X_test3 = poly.transform(hp_test)\n",
502 | "\n",
503 | "model1 = lm1.fit(X_train3, y_train)\n",
504 | "print(mean_squared_error(y_test, model1.predict(X_test3)))"
505 | ]
506 | },
507 | {
508 | "cell_type": "markdown",
509 | "metadata": {},
510 | "source": [
511 | "* For a Quadratic, the model error rate is 19.52\n",
512 | "* For a Cubic model, the error rate is 19.69\n",
513 | "\n",
514 | "If we choose a different training set instead, then we will obtain somewhat different errors on the validation set."
515 | ]
516 | },
517 | {
518 | "cell_type": "code",
519 | "execution_count": 11,
520 | "metadata": {},
521 | "outputs": [],
522 | "source": [
523 | "from sklearn.model_selection import train_test_split\n",
524 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=2)"
525 | ]
526 | },
527 | {
528 | "cell_type": "code",
529 | "execution_count": 12,
530 | "metadata": {},
531 | "outputs": [],
532 | "source": [
533 | "hp_train = X_train[['horsepower']].values\n",
534 | "hp_test = X_test[['horsepower']].values"
535 | ]
536 | },
537 | {
538 | "cell_type": "code",
539 | "execution_count": 13,
540 | "metadata": {},
541 | "outputs": [
542 | {
543 | "name": "stdout",
544 | "output_type": "stream",
545 | "text": [
546 | "24.16498032778625\n",
547 | "19.142361681198697\n",
548 | "19.6895604435906\n"
549 | ]
550 | }
551 | ],
552 | "source": [
553 | "# Linear\n",
554 | "lm = LinearRegression()\n",
555 | "lm1 = lm.fit(hp_train, y_train)\n",
556 | "pred = lm1.predict(hp_test)\n",
557 | "print(mean_squared_error(y_test, pred))\n",
558 | "\n",
559 | "# Quadratic\n",
560 | "poly = PolynomialFeatures(degree=2)\n",
561 | "X_train2 = poly.fit_transform(hp_train)\n",
562 | "X_test2 = poly.transform(hp_test)\n",
563 | "model = lm1.fit(X_train2, y_train)\n",
564 | "print(mean_squared_error(y_test, model.predict(X_test2)))\n",
565 | "\n",
566 | "# Cubic\n",
567 | "poly = PolynomialFeatures(degree=3)\n",
568 | "X_train3 = poly.fit_transform(hp_train)\n",
569 | "X_test3 = poly.transform(hp_test)\n",
570 | "\n",
571 | "model1 = lm1.fit(X_train3, y_train)\n",
572 | "print(mean_squared_error(y_test, model1.predict(X_test3)))"
573 | ]
574 | },
575 | {
576 | "cell_type": "markdown",
577 | "metadata": {},
578 | "source": [
579 | "Using this split of the observations into a training set and a validation set, we find that the validation set error rates for the models:\n",
580 | "* with the linear term is 24.16\n",
581 | "* with the quadratic term is 19.14\n",
582 | "* with cubic terms is 19.69\n",
583 | "\n",
584 | "We can run multiple polynomials and compare their results. For that, I will use the function which calculates MSE's for a range of polynomial models."
585 | ]
586 | },
587 | {
588 | "cell_type": "code",
589 | "execution_count": 14,
590 | "metadata": {},
591 | "outputs": [],
592 | "source": [
593 | "def poly_mse(X_train, X_test, y_train, y_test, degree=10):\n",
594 | " mses = []\n",
595 | " degrees = []\n",
596 | " for deg in range (1, degree+1):\n",
597 | " poly_reg = PolynomialFeatures(degree= deg)\n",
598 | " poly_train = poly_reg.fit_transform(X_train)\n",
599 | " poly_test = poly_reg.transform(X_test)\n",
600 | " pred = lm.fit(poly_train, y_train).predict(poly_test)\n",
601 | " MSE = mean_squared_error(y_test, pred)\n",
602 | " print('Polynomial model with {} degrees has the MSE of {}'.format(deg,\n",
603 | " round(MSE, 4)))\n",
604 | " mses.append(MSE)\n",
605 | " degrees.append(deg)\n",
606 | " return mses, degrees"
607 | ]
608 | },
609 | {
610 | "cell_type": "code",
611 | "execution_count": 15,
612 | "metadata": {},
613 | "outputs": [
614 | {
615 | "name": "stdout",
616 | "output_type": "stream",
617 | "text": [
618 | "Polynomial model with 1 degrees has the MSE of 24.165\n",
619 | "Polynomial model with 2 degrees has the MSE of 19.1424\n",
620 | "Polynomial model with 3 degrees has the MSE of 19.6896\n",
621 | "Polynomial model with 4 degrees has the MSE of 20.1684\n",
622 | "Polynomial model with 5 degrees has the MSE of 19.6966\n",
623 | "Polynomial model with 6 degrees has the MSE of 19.4147\n",
624 | "Polynomial model with 7 degrees has the MSE of 19.699\n",
625 | "Polynomial model with 8 degrees has the MSE of 19.9226\n",
626 | "Polynomial model with 9 degrees has the MSE of 19.9133\n",
627 | "Polynomial model with 10 degrees has the MSE of 19.7294\n"
628 | ]
629 | }
630 | ],
631 | "source": [
632 | "mses, degrees = poly_mse(hp_train, hp_test, y_train, y_test)"
633 | ]
634 | },
635 | {
636 | "cell_type": "markdown",
637 | "metadata": {},
638 | "source": [
639 | "Let's plot a graph to visualize our findings."
640 | ]
641 | },
642 | {
643 | "cell_type": "code",
644 | "execution_count": 16,
645 | "metadata": {},
646 | "outputs": [
647 | {
648 | "data": {
649 | "text/plain": [
650 | "(1, 10)"
651 | ]
652 | },
653 | "execution_count": 16,
654 | "metadata": {},
655 | "output_type": "execute_result"
656 | },
657 | {
658 | "data": {
659 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAEWCAYAAACJ0YulAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nO3dd5xV9Z3/8deboqiADTQWcBSRIiIgGg2WRLOJumDa2lKsWZNdNZo1xTVZo+smm/JbUjabX9ZfjBpbjNHsRsSWRAMaG72IiIpiQcVCESQy8Pn98T135s4wwxTmzLl35v18POYxd84t5zOX4bzvt5zvUURgZmYG0KPoAszMrHI4FMzMrI5DwczM6jgUzMysjkPBzMzqOBTMzKyOQ8GaJekKSTcWXUc5SZ+RdF8rH9vp9bdln5IelPT5vGtqtM+zJD1U9vM7kvZrzWPbsa+7JZ3Z3udbMRwK3YCk5yW9mx0AXpN0raS+RdfVHhFxU0R8ZGtfR9IHJYWkOxptPzjb/uDW7qOjSeojaaWkY5u474eSftvW14yIvhHxXAfUtlkYRsQJEXH91r62dS6HQvcxKSL6AuOAQ4FvFlxPJVgBfEDSrmXbzgSeLqieLYqI9cCtwBnl2yX1BE4HfAC2reZQ6GYi4mXgbmAUgKQ9Jf1e0luSnpH09009T9Jdki5stG2epI9nt0PSFyUtkfS2pP+SpOy+HpK+KekFSa9L+pWkHbP7arLnni3pxey5X5R0aPb6KyX9tGyfjbs/fpw9b7WkmZKOasPb8R7wP8Bp2Wv1BE4Bbmr0e35A0hOSVmXfP1B2376S/ixpjaT7gQGNnnu4pL9kv8dcSR9s5v3dP3udVZLekHRrMzVfD3xK0vZl2z5K+r98d/Zal0p6NqvpSUmfaO4NyN77/bPbu2Z/C6slPQ4MafTYJt9rSccDlwGnZq3Rudn2uu6xVv4NnClpWfb7f6O5mi1fDoVuRtIg4ERgdrbpFuAlYE/g74DvSDquiadeD3y27HUOBvYCppY9ZiKpFXIw6eD60Wz7WdnXh4D9gL7AT2no/cBQ4FTgR8A3gA8DBwKnSDqmmV/pCWAMsAtwM3CbpD7NPLYpv6L+k/dHgYXAK2W/5y7AXcBPgF2BycBdZa2Lm4GZpDC4itTSKD13r+y5/5bV9xXgdkkDm6jjKuA+YGdgb+A/myo2Iv4CLAc+Wbb5c8DNEVGb/fwscBSwI3AlcKOkPVp4HwD+C1gP7AGck32Va/K9joh7gO8At2bdUQc38dpn0fLfwJHAMOA44HJJI1pRs3Uwh0L38T+SVgIPAX8mHfwHkf4jfj0i1kfEHOAXpINMY/8LDJU0NPv5c6SDwHtlj/luRKyMiGXAA6QDCMBngMkR8VxEvAP8M3CapF5lz70qq+E+YC1wS0S8nrVspgNjm/qlIuLGiHgzImoj4j+AbUkHllbJDrK7SBpGCodfNXrI3wJLIuKGbB+3AE8BkyQNJoXgv0TEXyNiGnBn2XM/C0yNiKkRsSki7gdmkEK5sQ3APsCe2fuwpQHeuiCT1B/4GGVdRxFxW0S8ku3zVmAJcNiW3oeslfQp4PKIWBsRC2jUHbWV73Vr/gaujIh3I2IuMJf04cI6mUOh+/h4ROwUEftExD9GxLuk1sFbEbGm7HEvkFoADUTEX4HfAJ+V1IPUh31Do4e9WnZ7HenTINl+Xmi0j17A7mXbXiu7/W4TPzc5MC7pEkmLsm6XlaRPxwOaeuwW3ABcQPoU+7tG9zWuvVT/Xtl9b0fE2kb3lewDnJx1Ha3M6juS9Em8sa8BAh6XtFBS40/p5X4FfChrifwd8ExElFp+SDpD0pyyfY6i5fdkIOnf5MVmfpetfa9b8zfQ3N+PdaJeLT/EurBXSJ+S+5UFw2Dg5WYefz3pAPoQsC4iHmnDfvYp+3kwUEs68O/d5qozWZ/210ndDQsjYpOkt0kH17a4AXgG+FVErMuGQpqrHVL995C6cXaWtENZMAwGSksPvwjcEBFNjtOUi4hXgb/Pfq8jgT9ImhYRzzTx2GWSppM+fZ9AWetG0j7A/yO9J49ExEZJc2j5PVlB+jcZRGoJlX6X0uu29F63tNxyLn8D1vHcUujGIuJF4C/AvytNdxwNnEujgdayxz8CbAL+g81bCVtyC/DlbFC2L/X9z7UtPK8l/UgHlhVAL0mXA/3b+iIRsRQ4hjSO0dhU4ABJn5bUS9KpwEhgSkS8QOoOulLSNtnBfFLZc28kdTN9VFLP7D3+oKTNDoKSTi7b/jbpILtxC2VfT2rdTKDhv9cO2XNXZK97Ntmkghbeg43AHcAVkraXNJKy8RFafq9fA2qyVmRT8vobsA7mULDTgRrSJ7nfAd/K+r6b8yvgINIBr7V+SQqRacBS0mDmhVt8RuvcS5px8zSpO2I9Dbs/Wi0iHoqIV5rY/iZpAP0S4E1SN8/EiHgje8inSYPkbwHfouxTexa6HyPNzFmR1fZVmv5/dyjwmKR3gN8DF2Vh1Zzfkgal/xgRy8v2+SQptB8hHagPAh5u6ffPXEDqsnkVuA64tuy+lt7r27Lvb0qa1cRr5/U3YB1MvsiOtYWkM4DzIuLIomsxs47nloK1WjY3/h+Bq4uuxczy4VCwVpH0UVIXyGukOepm1gW5+8jMzOq4pWBmZnWq4jyFAQMGRE1NTdFlmJlVlZkzZ74REU0tq9KsqgiFmpoaZsyYUXQZZmZVRVLjs/Fb5O4jMzOr41AwM7M6DgUzM6vjUDAzszoOBTMzq+NQMDOzOg4FMzOrUx2hsGRJ0RWYmXUL1REKq1fD+vVFV2Fm1uVVRygALFxYdAVmZl1e9YTC7NktP8bMzLZK9YTCnDlFV2Bm1uXlFgqSBkl6QNIiSQslXdTo/q9ICkkDWvWCDgUzs9zluUpqLXBJRMyS1A+YKen+iHhS0iDgb4BlrX61uXNh0yboUT2NGzOzapPbETYilkfErOz2GmARsFd29w+BrwGtu+xb797wzjvw3HN5lGpmZplO+dgtqQYYCzwm6STg5YiY28JzzpM0Q9KM93plDRoPNpuZ5Sr3UJDUF7gduJjUpfQN4PKWnhcRV0fE+IgYv82OO6aNHlcwM8tVrqEgqTcpEG6KiDuAIcC+wFxJzwN7A7MkvW+LL7T99um7Q8HMLFe5DTRLEnANsCgiJgNExHxgt7LHPA+Mj4g3tvhiDgUzs06RZ0thAvA54FhJc7KvE9v1SttuC/36wSuvwOuvd2iRZmZWL8/ZRw9FhCJidESMyb6mNnpMTYuthJKDD07f525xfNrMzLZC9Uz6HzMmffcMJDOz3FRfKHhcwcwsNw4FMzOrUz2hcOCB0LMnLF4M69YVXY2ZWZdUPaHQpw+MGJHWP1qwoOhqzMy6pOoJBYCxY9N3dyGZmeWiukLBM5DMzHJVnaHgloKZWS6qKxRKJ7DNmwcbNxZbi5lZF1RdobDrrjBoUJp99MwzRVdjZtblVFcogLuQzMxyVH2hUJqB5MFmM7MOV32h4JaCmVluHApmZlan+kKhpgb694fXXoNXXy26GjOzLqX6QkFya8HMLCfVFwrg5S7MzHJSnaHg5S7MzHJR3aHgloKZWYeqzlAYORJ694YlS+Cdd4quxsysy6jOUNhmmxQMETB/ftHVmJl1GdUZCuAuJDOzHFRvKHi5CzOzDle9oeCWgplZh6veUChdW2H+fKitLbYWM7MuonpDYaed0pIX69fD008XXY2ZWZdQvaEA7kIyM+tg1R0KXu7CzKxDVXcoeLkLM7MO1TVCYc6cdCKbmZltldxCQdIgSQ9IWiRpoaSLsu1XSZonaY6k+yTt2e6dDBoEO+8Mb7wBr7zSYbWbmXVXebYUaoFLImIEcDhwvqSRwA8iYnREjAGmAJe3ew++toKZWYfKLRQiYnlEzMpurwEWAXtFxOqyh+0AbF2/j0PBzKzD9OqMnUiqAcYCj2U/fxs4A1gFfKiZ55wHnAcwePDg5l/cM5DMzDpM7gPNkvoCtwMXl1oJEfGNiBgE3ARc0NTzIuLqiBgfEeMHDhzY/A48A8nMrMPkGgqSepMC4aaIuKOJh9wMfGqrdjJ8eFpK+9lnYfXqlh9vZmbNynP2kYBrgEURMbls+9Cyh50EPLVVO+rdG0aNSrfnzduqlzIz6+7ybClMAD4HHJtNP50j6UTgu5IWSJoHfAS4aKv35MFmM7MOkdtAc0Q8BKiJu6Z2+M4cCmZmHaK6z2gu8QV3zMw6RNcIhdGj0/cFC2DDhmJrMTOrYl0jFPr3hyFD4L334KmtG7c2M+vOukYogMcVzMw6gEPBzMzqdJ1Q8HIXZmZbreuEQvlyF762gplZu3SdUNhzTxgwAN5+G158sehqzMyqUtcJBV9bwcxsq3WdUACHgpnZVnIomJlZna4VCp6BZGa2VbpWKBxwAPTpA0uXwsqVRVdjZlZ1ulYo9OoFBx2Ubs+dW2wtZmZVqGuFAnhcwcxsKzgUzMysjkPBzMzqdL1QGD06nci2cGFaStvMzFqt64VC374wdGi62M6TTxZdjZlZVel6oQDuQjIzayeHgpmZ1XEomJlZna4ZCuXLXfjaCmZmrdY1Q+F974Pdd4dVq+D554uuxsysanTNUAB3IZmZtYNDwczM6jgUzMysjkPBzMzqdN1QGDoUtt8eli2Dt94quhozs6rQdUOhZ8+0DhK4tWBm1kq5hYKkQZIekLRI0kJJF2XbfyDpKUnzJP1O0k551eAuJDOztsmzpVALXBIRI4DDgfMljQTuB0ZFxGjgaeCfc6vAoWBm1ia5hUJELI+IWdntNcAiYK+IuC8iarOHPQrsnVcNDgUzs7bplDEFSTXAWOCxRnedA9zdzHPOkzRD0owVK1a0b8cHHQQ9esCiRbB+fftew8ysG8k9FCT1BW4HLo6I1WXbv0HqYrqpqedFxNURMT4ixg8cOLB9O99+exg2DGpr00V3zMxsi3INBUm9SYFwU0TcUbb9TGAi8JmInFescxeSmVmr5Tn7SMA1wKKImFy2/Xjg68BJEbEur/3XcSiYmbVarxxfewLwOWC+pNIR+TLgJ8C2wP0pN3g0Ir6YWxUOBTOzVsstFCLiIUBN3DU1r302qRQKc+fCpk1p4NnMzJq0xSOkpM+W3Z7Q6L4L8iqqQ+22G+y5J6xZA0uXFl2NmVlFa+lj8z+V3f7PRved08G15KfUWpg9u9g6zMwqXEuhoGZuN/Vz5fK4gplZq7QUCtHM7aZ+rlwOBTOzVmlpoHm4pHmkVsGQ7DbZz/vlWllHciiYmbVKS6EwolOqyNuQIdC3L7z8MqxYAe09Q9rMrIvbYvdRRLxQ/gW8A4wDBmQ/V4cePeDgg9NttxbMzJrV0pTUKZJGZbf3ABaQZh3dIOniTqiv47gLycysRS0NNO8bEQuy22cD90fEJOD9VNOUVHAomJm1QkuhsKHs9nFkZyNn10fYlFdRuXAomJm1qKWB5hclXQi8RBpLuAdA0nZA75xr61ijRqXrNj/1FLz7Lmy3XdEVmZlVnJZaCucCBwJnAadGxMps++HAtTnW1fH69IERI9L6RwsWtPx4M7NuaIsthYh4HdhsBdOIeAB4IK+icjNmTAqE2bPh0EOLrsbMrOJsMRQk/X5L90fESR1bTs7GjIEbb/S4gplZM1oaUzgCeBG4hXR95epZ76gpHmw2M9uilkLhfcDfAKcDnwbuAm6JiOq84HHpBLZ582DjxjTwbGZmdVo6o3ljRNwTEWeSBpefAR7MZiRVnwEDYO+9Ye1aePbZoqsxM6s4LV6GTNK2kj4J3AicT7qc5h15F5absWPTd3chmZltpqVlLq4H/kI6R+HKiDg0Iq6KiJc7pbo8+II7ZmbNamlM4XPAWuAA4EtS3TizgIiI/jnWlg8PNpuZNaul8xS63lXuHQpmZs3qegf9ltTUQP/+8Oqr6cvMzOp0v1Aov7bC3LnF1mJmVmG6XyhA/QwkDzabmTXQPUPB4wpmZk1yKJiZWZ3uGQojR0KvXvD00+nsZjMzA7prKGy7bQqGCJg/v+hqzMwqRvcMBfByF2ZmTei+oeDlLszMNuNQcEvBzKxObqEgaZCkByQtkrRQ0kXZ9pOznzdJGp/X/ltUfm2F2trCyjAzqyR5thRqgUsiYgTpWgznSxoJLAA+CUzLcd8t23ln2GcfWL8eliwptBQzs0qRWyhExPKImJXdXgMsAvaKiEURsTiv/baJu5DMzBrolDEFSTXAWNJ1nlv7nPMkzZA0Y8WKFfkU5hlIZmYN5B4KkvoCtwMXR8Tq1j4vIq6OiPERMX7gwIH5FOcZSGZmDeQaCpJ6kwLhpoiovEt4lncfRRRbi5lZBchz9pGAa4BFETE5r/1slcGDYaedYMUKWL686GrMzAqXZ0thAulynsdKmpN9nSjpE5JeAo4A7pJ0b441bJnkwWYzszItXaO53SLiIdK1nJvyu7z222Zjx8KDD6ZQOPHEoqsxMytU9z2jucSDzWZmdRwK7j4yM6vjUBg+HLbZBp55BtasKboaM7NCORS22QYOPDDdnjev2FrMzArmUAB3IZmZZRwK4OUuzMwyDgXwDCQzs4xDAWD06PR9wQLYsKHYWszMCuRQANhxR9hvP/jrX2FxZazqbWZWBIdCiQebzcwcCnUcCmZmDoU6pRlIHmw2s27MoVDiayuYmTkU6uy1F+y6K7z1Frz0UtHVmJkVwqFQ4msrmJk5FBpwKJhZN+dQKOflLsysm3MolPNyF2bWzTkUyg0bBttuC0uXwsqVRVdjZtbpHArlevWCgw5Kt31tBTPrhhwKjXmw2cy6MYdCYw4FM+vGHAqNeQaSmXVjDoXGDjoonci2YAG8917R1ZiZdSqHQmP9+sH++6eL7SxaVHQ1ZmadyqHQFI8rdIxXX4WbbkpTfM2sKjgUmuJQ2DqvvQaXXJKuZvfZz8IBB8DnPw/PPVd0ZWbWAodCUxwK7fPGG/D1r6cwmDwZ3n0XDjsMNm2Ca65J4XDuuQ4HswrmUGhK+QwkX1uhZW++CZddBjU18P3vw7p1MGkSzJwJjz0GTz0FZ56Z3stf/tLhYFbBHApNed/7YLfd0lIXL7xQdDWV6+234V/+BfbdF/7932HtWjjxRHj8cfj972HcuPS4oUPhuuvqwwHqw+Gcc+DZZwv7FcysodxCQdIgSQ9IWiRpoaSLsu27SLpf0pLs+8551dBuvrbClq1aBVdemVoG//ZvsGYNfOQj8MgjcNddcOihTT+vPBzOOittu/batOaUw8GsIuTZUqgFLomIEcDhwPmSRgKXAn+MiKHAH7OfK49DYXOrV6cQqKmBK65IPx93HDz0ENx7Lxx+eOteZ//9Uxg4HMwqTm6hEBHLI2JWdnsNsAjYC/gYcH32sOuBj+dVw1ZxKNR7553UPbTvvqm7aOVKOOYY+POf4Q9/gAkT2ve6pXBYvBjOPjttK4XD2WfDM8903O9gZq3SKWMKkmqAscBjwO4RsRxScAC7NfOc8yTNkDRjxYoVnVFmQ17uIo0R/OAHKQwuuyxdv/rII+FPf4IHH4Sjj+6Y/QwZksYYysPhuutg+HCHg1knyz0UJPUFbgcujojVrX1eRFwdEeMjYvzAgQPzK7A5Q4fCdtulgea33ur8/Rdp3bo0pXS//eBrX0tTTY84Au67D6ZNgw99KJ/9lofDOeekbaVwOOssh4NZJ1DkOOVSUm9gCnBvREzOti0GPhgRyyXtATwYEcO29Drjx4+PGTNm5FZnsw4/PE2p/NOf8jsQVpL16+G//xu++910NjKk8wyuvBI++tE0AN+ZnnsOvv1tuP562LgRevZMJ8N985up68m6hwhYsgSmT0/jV08+mbb37Ak9erTv+9Y8t6nX6N07fXgZOxZ22aXY96uMpJkRMb5NT4qIXL4AAb8CftRo+w+AS7PblwLfb+m1DjnkkCjEF74QARGTJxez/86yfn3ET38aseee6feFiHHjIqZMidi0qejqIp59NuLccyN69ky19egRccYZEU8/XXRllocNGyKeeCLihz+M+OQnI3bbrf7vshq+ampS3VddFXHXXRHLlxf2VgIzoo3H7txaCpKOBKYD84FN2ebLSOMKvwEGA8uAkyNii/0zhbUUfv5z+Id/gDPOSJ9Wu5r33kvdNd/+Nrz0Utp28MGpZXDSSZ3fMmjJc8/Bd76T/i1qa9OntFLLYejQoquz9lq3Dh59NLUCpk9PU5vXrm34mN13T+NZRx0FhxySPplv3JjOlt+4seHtlr635bGtfc66dTB/Psydm87kb2yPPdJ5O+PGpdbEuHEweHDu/8fa01LItfuooxQWCo8+mvrSR49O/9hdxYYN6cB61VWwbFnaNmpUCoOPfzwdbCvZ0qUpHK67rj4cPvOZFA4HHFB0ddaSN96Ahx+u7w6aOTP9O5YbOrQ+BI48MnUXVtqHlKbU1qYxsdmzYdas9DV7dpq+3dguu2weFPvv36H//xwKHW3durSUdo8eaVrmttt2fg0dqbYWbrghhUFp5dKRI9M5B5/6VOWHQWMOh8oXkSZrTJ9eHwKNl6Tv0SNNAS8FwJFHplUFuopNm1IrtxQQs2alIHzzzc0f27dvfUCUwmLEiHT9+HZwKORhxIh0ktXMmfXLNlSb2lq4+eYUBqUZPMOGwbe+BaeckgbKqtnzz6dwuPba+nD49KdTOAzb4hwG62ibNqULVJUCYPp0ePnlho/p0ydN4ii1BI44In346k4iUpdteWti1qzN3ytI79fo0Q2DYtSotL0FDoU8nH46/PrXaZXP0jTJarFxI9x6a+oWevrptG3//VMYnH569YdBYw6HzvfXv8ITT9SHwMMPp2VQyu2yS30L4Kij0oFtm22KqbfSvfba5l1PTS0c2asXHHhgfVCMG5fGA3fYocHDHAp5+N734NJL4cIL4Sc/KaaGttq0CW67LYVBqam+335w+eWpe6WdTdGq8fzz6QzsX/7S4dDRVq6Ev/ylvhXwxBMpGMoNHpwO/qXuoBEjqq9rspK8/XY6ibYUFLNmpXGLxsduKf19lwWFjj3WodDh7r0Xjj8+/YFPm1ZMDa21aRPccUcKgwUL0raamnQwPOOMNGOjO2kqHE4/PS3V4XBonZdfrg+A6dPTDJvGx4xRoxqOBwweXEyt3ck778C8eQ2DYuHCzQbsBQ6FDvfaa2nQq1+/9CmpUj/x3H13atHMm5d+HjQohcFZZ7mp/sIL9eGwYUP6N/zwh9M1HyZOTMFpSW1tagnceWf6Wry44f29e6dVcEtdQR/4QEWdrNWtrV+fgqEsKPT44w6FXOy5JyxfngZphwwpro6mLF4M//RPMHVq+nmvvdI6ReeeW/2zpTpa43AoGTUqhcOkSfD+93e9sZaWrFoF99yTQuDuuxsu69KvXzrwl1oChx2Wln+xquAxhbyceGL6z/Lb36apm5Vg1Sr4139N4xy1tdC/f+oWueCCVs1K6NZWrEghOmVK6h5cs6b+vl13Tf/ekyala0TsuGNxdebp2WfrWwPTpjXsdhg6NP3+kyalFXC7W7djF9KeUOjiI44dZMyYFApz5hQfChs3ptk1l12WDm4SfP7z6ToHu+9ebG3VYuDAdAW4M89MZ3VPm5YC4s4700yPG25IX716pZVgS91M1bze0saN6UzhUhCUnyvQs2f97zlpksdbujm3FFrjN7+BU09NB4Y77yyujunT4aKL0jQ1SM35H/+4es+fqDQR6ZyUUkA8/HAavC8ZNqw+ICZMqPxZXKtXp5bQnXemllH5yVI77pgmUEyaBCec4HGBLsrdR3l5+ul0QNh7b3jxxc7f/7JlaQnrW29NP++9d7rOwamnVsep/9Xqrbca9rWXz7/faad0MJ04MX3fuUKuKrt0aX1r4M9/bjh2MmRIfWvgqKPcLdQNOBTysmlT6rNfuzZ12QwY0Dn7XbcuHfy/9720yFafPvD1r6eA2H77zqnBkg0b6mflTJnScFZOz56p5VBqRQwb1nlhvXFjWt69FAQLF9bf16NHGiQuBcHw4f4Q0c04FPI0YUI6KNx/f5rOmKeI1GX11a/Wt0xOOQW+/33YZ598922ts2RJCocpUzYfqC19Ip84MX0i7+gpwWvWpAselbqFyq9M2K9fw26hzvoAYxXJoZCn88+Hn/0sfXL/ylfy28/s2WncYPr09POYMWncoKMufWkdb9Wq1Hc/Zcrmfff9+6cLFE2cmGY1tfcg/cIL9a2BBx9MA+Ql++5b3xo4+mifl2J1PPsoT2PGpO95XbP59dfTyWa/+EVqKQwYkNbxOeec7jdvvtrsuGNqyZ1ySurOefTR+sHqhQvTkiO33Za6bo44ov6ciAMPbL47Z9MmePzx+iCYP7/+Pqlht9DIke4Wsg7jlkJrPfFEOnHnwAPrl5DoCO+9Bz/9aVqaYvXqNKPlwgvTOkU77dRx+7FiLF0Kd92VQuKBBxp+wq+pSQExcSJ88INp3OL++1MI3HVX+qBQ0rdvanFMmpRaHEVct9yqjruP8vTuu/XL+65Z0zFndU6dCl/+cv0KpscfDz/8YRoQtK5nzRr4wx+aPujvsEMalyhfXG6ffepbA8cc4zPUrc3cfZSn7bZLB+uFC1NL4dBD2/9ajZemOOCAFAYnntgxtVpl6tcPPvGJ9LVpE8yYUT+bac6c1AV0+OH1QTBqlLuFrNM5FNpizJgUCnPmtC8Umlqa4vLLU3eRBwe7lx49UnfkYYelix+9+mrqOvRsIStYhS75WaHaO9i8cWMaQB46FCZPTj9//vOp2+iSSxwIllbidSBYBXBLoS3aEwpemsLMqohbCm1RCoW5cxuuidOUZcvgtNPSvPHZs9PSFLfckk50ciCYWYVyKLTFgAHp4L52bVp6uCnr1qXppcOHp7WK+vRJ10RevDiFhAcOzayCORTaqtRaKHUHlUSkEBg+HK64Ik1hPfXUFAZXXOG1isysKjgU2qqpcYXZs9M88tNOS2sVjRmTuol+/Wtfr9bMqopDoa3KQ+H11+G88+CQQ9KA8oABcPXVaf75UUcVW6eZWTt49lFblUJh2rQ0xbS0NMWXvpQuh+mlKcysijkU2mrffdOZqaXr+p5wQjob2ZcwNLMuwN1HbdWjB1xwAYwfn9avmYHLJlMAAAeCSURBVDrVgWBmXYYXxDMz66LasyBebi0FSb+U9LqkBWXbDpb0iKT5ku6U1D+v/ZuZWdvl2X10HXB8o22/AC6NiIOA3wFfzXH/ZmbWRrmFQkRMA95qtHkYMC27fT/wqbz2b2ZmbdfZA80LgJOy2ycDgzp5/2ZmtgWdHQrnAOdLmgn0A95r7oGSzpM0Q9KMFStWdFqBZmbdWaeGQkQ8FREfiYhDgFuAZlaVg4i4OiLGR8T4gb4erZlZp+jUUJC0W/a9B/BN4OeduX8zM9uyPKek3gI8AgyT9JKkc4HTJT0NPAW8Alyb1/7NzKztquLkNUlrgMVF19HIAOCNootopBJrgsqsyzW1jmtqvUqsa1hE9GvLE6pl7aPFbT0rL2+SZrim1qnEulxT67im1qvEuiS1eSkIr31kZmZ1HApmZlanWkLh6qILaIJrar1KrMs1tY5rar1KrKvNNVXFQLOZmXWOamkpmJlZJ3AomJlZnYoOhaauyVA0SYMkPSBpkaSFki6qgJr6SHpc0tyspiuLrqlEUk9JsyVNKboWAEnPZ9fzmNOe6Xp5kbSTpN9Keir72zqi4HqGZe9R6Wu1pIuLrCmr68vZ3/gCSbdI6lMBNV2U1bOwyPeomWvY7CLpfklLsu87t/Q6FR0KNH1NhqLVApdExAjgcNICfyMLrumvwLERcTAwBjhe0uEF11RyEbCo6CIa+VBEjKmwOeU/Bu6JiOHAwRT8nkXE4uw9GgMcAqwjXQOlMJL2Ar4EjI+IUUBP4LSCaxoF/D1wGOnfbaKkoQWVcx2bHy8vBf4YEUOBP2Y/b1FFh0Iz12QoVEQsj4hZ2e01pP+8exVcU0TEO9mPvbOvwmcQSNob+FvSxZWsGdkVCI8GrgGIiPciYmWxVTVwHPBsRLxQdCGkE263k9QL2J60XE6RRgCPRsS6iKgF/gx8oohCmjlefgy4Prt9PfDxll6nokOh0kmqAcYCjxVbSV03zRzgdeD+iCi8JuBHwNeATUUXUiaA+yTNlHRe0cVk9gNWANdmXW2/kLRD0UWVOY20qnGhIuJl4P8Ay4DlwKqIuK/YqlgAHC1pV0nbAydSWdeJ2T0ilkP6QAvs1tITHArtJKkvcDtwcUSsLrqeiNiYNfX3Bg7LmrWFkTQReD0iZhZZRxMmRMQ44ARS19/RRRdE+vQ7Dvi/ETEWWEsrmvmdQdI2pAtj3VYBtexM+uS7L7AnsIOkzxZZU0QsAr5HupLkPcBcUhdz1XIotIOk3qRAuCki7ii6nnJZt8ODFD8WMwE4SdLzwK+BYyXdWGxJEBGvZN9fJ/WRH1ZsRQC8BLxU1rr7LSkkKsEJwKyIeK3oQoAPA0sjYkVEbADuAD5QcE1ExDURMS4ijiZ13ywpuqYyr0naAyD7/npLT3AotJEkkfp+F0XE5KLrAZA0UNJO2e3tSP95niqypoj454jYOyJqSN0Pf4qIQj/VSdpBUr/SbeAjpOZ/oSLiVeBFScOyTccBTxZYUrnTqYCuo8wy4HBJ22f/D4+jAiYxlF0nZjDwSSrn/QL4PXBmdvtM4H9bekJFr5KaXZPhg8AASS8B34qIa4qtignA54D5WR8+wGURMbXAmvYArpfUkxT0v4mIipgCWmF2B36Xjif0Am6OiHuKLanOhcBNWXfNc8DZBddD1kf+N8AXiq4FICIek/RbYBapi2Y2lbG0xO2SdgU2AOdHxNtFFNHU8RL4LvCb7Ho2y4CTW3wdL3NhZmYl7j4yM7M6DgUzM6vjUDAzszoOBTMzq+NQMDOzOhU9JdWso0naCMwnrQ9VS1oP5kcRUUlLcZgVxqFg3c272XIgpZOObgZ2JM3p3iqSekbExq19HbMiufvIuq1sqYvzgAuU9JT0A0lPSJon6QsAknpI+lm2Xv4USVMl/V123/OSLpf0EHCypCGS7skW3JsuaXj2uIGSbs9e+wlJE7Ltx5Rds2B26Yxrs6K4pWDdWkQ8J6kHafXIj5FW3jxU0rbAw5LuI11PoAY4KHvcIuCXZS+zPiKOBJD0R+CLEbFE0vuBnwHHkq6X8MOIeChbDuFe0rLLXyGdBftwtsji+vx/a7PmORTMQNn3jwCjS60AUrfSUOBI4LZs3OFVSQ80ev6tULdy7geA27KlNAC2zb5/GBhZtr1/1ip4GJgs6Sbgjoh4qUN/M7M2cihYtyZpP2AjafVIARdGxL2NHvO3LbzM2ux7D2BlacyikR7AERHxbqPt35V0F2kd/kclfTgiCl3M0Lo3jylYtyVpIPBz4KeRFgG7F/iHbGl0JB2Qrab6EPCpbGxhd9KiY5vJrquxVNLJ2fMl6eDs7vuAC8r2XRrsHhIR8yPie8AMYHgOv6pZqzkUrLvZLhvUXQj8gXSwvjK77xekJatnKV38/L9JrenbSdc8KG17DFjVzOt/BjhX0lxgIWmcArJrC2cD2E8CX8y2X6x00fe5wLvA3R33q5q1nVdJNWsFSX0j4p1sieTHSVdwe7Xousw6mscUzFpnSnYho22AqxwI1lW5pWBmZnU8pmBmZnUcCmZmVsehYGZmdRwKZmZWx6FgZmZ1/j/HLGodKQaqIwAAAABJRU5ErkJggg==\n",
660 | "text/plain": [
661 | ""
662 | ]
663 | },
664 | "metadata": {
665 | "needs_background": "light"
666 | },
667 | "output_type": "display_data"
668 | }
669 | ],
670 | "source": [
671 | "fig, ax = plt.subplots(figsize=(6,4))\n",
672 | "ax.plot(degrees, mses, linewidth=2, color='r')\n",
673 | "ax.set_title('Polynomial Models Validation')\n",
674 | "ax.set_ylabel('MSE')\n",
675 | "ax.set_xlabel('Degrees')\n",
676 | "ax.set_xlim(1, 10)"
677 | ]
678 | },
679 | {
680 | "cell_type": "markdown",
681 | "metadata": {},
682 | "source": [
683 | "These results are consistent with our previous findings: a model that predicts **mpg** using a quadratic function of **horsepower** performs better than a model that involves only a linear function of **horsepower**, and there is little evidence in favor of a model that uses a cubic function of **horsepower**.\n",
684 | "\n",
685 | "--------\n",
686 | "## _5.3.2 Leave-One-Out Cross-Validation_\n",
687 | "\n",
688 | "The LOOCV estimate can be automatically computed for any generalized linear model using the `LeaveOneOut()` and `KFold()` functions."
689 | ]
690 | },
691 | {
692 | "cell_type": "code",
693 | "execution_count": 17,
694 | "metadata": {},
695 | "outputs": [
696 | {
697 | "data": {
698 | "text/plain": [
699 | "397"
700 | ]
701 | },
702 | "execution_count": 17,
703 | "metadata": {},
704 | "output_type": "execute_result"
705 | }
706 | ],
707 | "source": [
708 | "linear_model = lm.fit(X_train, y_train)\n",
709 | "\n",
710 | "from sklearn.model_selection import LeaveOneOut # import LOOCV\n",
711 | "loo = LeaveOneOut()\n",
712 | "loo.get_n_splits(X)"
713 | ]
714 | },
715 | {
716 | "cell_type": "code",
717 | "execution_count": 18,
718 | "metadata": {},
719 | "outputs": [],
720 | "source": [
721 | "X2 = X[['horsepower']]\n",
722 | "X2 = X2.values\n",
723 | "y2 = y.values"
724 | ]
725 | },
726 | {
727 | "cell_type": "markdown",
728 | "metadata": {},
729 | "source": [
730 | "In python we don't have specific function to use the LOOCV, but the same result can be achieved by setting `kFold()` bin number to the LOO amount"
731 | ]
732 | },
733 | {
734 | "cell_type": "code",
735 | "execution_count": 19,
736 | "metadata": {},
737 | "outputs": [
738 | {
739 | "name": "stdout",
740 | "output_type": "stream",
741 | "text": [
742 | "LOOCV with 392 folds has the MSE: 24.9644\n"
743 | ]
744 | }
745 | ],
746 | "source": [
747 | "from sklearn.model_selection import KFold\n",
748 | "from sklearn.model_selection import cross_val_score\n",
749 | "loocv = KFold(n_splits=392, random_state=0, shuffle=False)\n",
750 | "\n",
751 | "loocv_results = cross_val_score(linear_model, X2, y2, scoring='neg_mean_squared_error', cv=loocv)\n",
752 | "mean_mse = np.abs(loocv_results).mean()\n",
753 | "print('LOOCV with {} folds has the MSE: {}'.format(str(len(loocv_results)), round(mean_mse, 4)))"
754 | ]
755 | },
756 | {
757 | "cell_type": "markdown",
758 | "metadata": {},
759 | "source": [
760 | "We can repeat this procedure for increasingly complex polynomial fits. To automate the process and not repeat each step multiple times I will convert it into the function which takes X and y values, a number of k-Folds, and the number of degrees we want to run our model."
761 | ]
762 | },
763 | {
764 | "cell_type": "code",
765 | "execution_count": 20,
766 | "metadata": {},
767 | "outputs": [],
768 | "source": [
769 | "def cross_val_kfold(X, y, k, degree):\n",
770 | " accu_mean = []\n",
771 | " for d in range (1, degree+1):\n",
772 | " model = PolynomialFeatures(degree= d)\n",
773 | " X_transformed = model.fit_transform(X)\n",
774 | " pm = lm.fit(X_transformed, y)\n",
775 | " accuracy = - cross_val_score(estimator=pm, X=X_transformed, y=y, cv=k,\n",
776 | " scoring='neg_mean_squared_error').mean()\n",
777 | " print('Polynomial Degree: {}, \\tMSE: {}'.format(d, round(accuracy, 4)))\n",
778 | " accu_mean.append(accuracy)"
779 | ]
780 | },
781 | {
782 | "cell_type": "code",
783 | "execution_count": 21,
784 | "metadata": {},
785 | "outputs": [
786 | {
787 | "name": "stdout",
788 | "output_type": "stream",
789 | "text": [
790 | "Polynomial Degree: 1, \tMSE: 24.9644\n",
791 | "Polynomial Degree: 2, \tMSE: 20.6398\n",
792 | "Polynomial Degree: 3, \tMSE: 20.7481\n",
793 | "Polynomial Degree: 4, \tMSE: 20.8618\n",
794 | "Polynomial Degree: 5, \tMSE: 20.5814\n"
795 | ]
796 | }
797 | ],
798 | "source": [
799 | "cross_val_kfold(X2, y2, loocv, 5)"
800 | ]
801 | },
802 | {
803 | "cell_type": "markdown",
804 | "metadata": {},
805 | "source": [
806 | "We see a sharp drop in the estimated test MSE between the linear and quadratic fits, but then no clear improvement from using\n",
807 | "higher-order polynomials.\n",
808 | "\n",
809 | "---------\n",
810 | "## _5.3.3 k-Fold Cross-Validation_\n",
811 | "As mentioned above we will use the same `KFold` function to implement k-Fold Cross-Validation. We start by using k=10, which is a common choice for k.\n",
812 | "\n",
813 | "the CV errors corresponding to the polynomial fit of orders one to ten."
814 | ]
815 | },
816 | {
817 | "cell_type": "code",
818 | "execution_count": 22,
819 | "metadata": {},
820 | "outputs": [
821 | {
822 | "name": "stdout",
823 | "output_type": "stream",
824 | "text": [
825 | "Polynomial Degree: 1, \tMSE: 28.4122\n",
826 | "Polynomial Degree: 2, \tMSE: 22.5188\n",
827 | "Polynomial Degree: 3, \tMSE: 22.6204\n",
828 | "Polynomial Degree: 4, \tMSE: 22.6927\n",
829 | "Polynomial Degree: 5, \tMSE: 22.4232\n",
830 | "Polynomial Degree: 6, \tMSE: 22.3025\n",
831 | "Polynomial Degree: 7, \tMSE: 22.4999\n",
832 | "Polynomial Degree: 8, \tMSE: 22.582\n",
833 | "Polynomial Degree: 9, \tMSE: 22.5067\n",
834 | "Polynomial Degree: 10, \tMSE: 22.4745\n"
835 | ]
836 | }
837 | ],
838 | "source": [
839 | "kf = KFold(n_splits=10, random_state=0, shuffle=False)\n",
840 | "accu_mean = cross_val_kfold(X2, y2, kf, 10)"
841 | ]
842 | },
843 | {
844 | "cell_type": "markdown",
845 | "metadata": {},
846 | "source": [
847 | "The computation time is much shorter than that of LOOCV. We still see little evidence that using cubic or higher-order polynomial terms leads to lower test error than simply using a quadratic fit. We also see that MSE has slightly increased compared to the validation set approach. \n",
848 | "\n",
849 | "We saw that the two errors of the cross-validation are essentially the same when LOOCV is performed. When we instead perform a k-fold CV, then the two numbers differ slightly. This is because with added precision while testing we reduced bias and this slightly increased the variance.\n",
850 | "\n",
851 | "---------\n",
852 | "## _5.3.4 The Bootstrap_\n",
853 | "\n",
854 | "We illustrate the use of the bootstrap in the simple example involving estimating the accuracy of the linear regression model on the **Auto** data set.\n",
855 | "\n",
856 | "### Estimating the Accuracy of a Statistic of Interest\n",
857 | "One of the great advantages of the bootstrap approach is that it can be applied in almost all situations. No complicated mathematical calculations are required\n",
858 | "\n",
859 | "* First, we must create a function that computes the statistic of interest.\n",
860 | "* Second, we perform bootstrap by repeatedly sampling observations from the data set with replacement.\n",
861 | "\n",
862 | "The Portfolio data set in the ISLR package has variables called X and Y.\n",
863 | "> Get the data set from **[here](https://drive.google.com/file/d/1fKYDMtJqrok3g-Ak3YZwqwA55u-k3BLV/view?usp=sharing)**"
864 | ]
865 | },
866 | {
867 | "cell_type": "code",
868 | "execution_count": 23,
869 | "metadata": {},
870 | "outputs": [
871 | {
872 | "data": {
873 | "text/html": [
874 | "\n",
875 | "\n",
888 | "
\n",
889 | " \n",
890 | " \n",
891 | " \n",
892 | " X \n",
893 | " Y \n",
894 | " \n",
895 | " \n",
896 | " \n",
897 | " \n",
898 | " 0 \n",
899 | " -0.895251 \n",
900 | " -0.234924 \n",
901 | " \n",
902 | " \n",
903 | " 1 \n",
904 | " -1.562454 \n",
905 | " -0.885176 \n",
906 | " \n",
907 | " \n",
908 | " 2 \n",
909 | " -0.417090 \n",
910 | " 0.271888 \n",
911 | " \n",
912 | " \n",
913 | " 3 \n",
914 | " 1.044356 \n",
915 | " -0.734198 \n",
916 | " \n",
917 | " \n",
918 | " 4 \n",
919 | " -0.315568 \n",
920 | " 0.841983 \n",
921 | " \n",
922 | " \n",
923 | "
\n",
924 | "
"
925 | ],
926 | "text/plain": [
927 | " X Y\n",
928 | "0 -0.895251 -0.234924\n",
929 | "1 -1.562454 -0.885176\n",
930 | "2 -0.417090 0.271888\n",
931 | "3 1.044356 -0.734198\n",
932 | "4 -0.315568 0.841983"
933 | ]
934 | },
935 | "execution_count": 23,
936 | "metadata": {},
937 | "output_type": "execute_result"
938 | }
939 | ],
940 | "source": [
941 | "portfolio = pd.read_csv('Portfolio.csv')\n",
942 | "portfolio.head()"
943 | ]
944 | },
945 | {
946 | "cell_type": "code",
947 | "execution_count": 24,
948 | "metadata": {},
949 | "outputs": [
950 | {
951 | "data": {
952 | "text/html": [
953 | "\n",
954 | "\n",
967 | "
\n",
968 | " \n",
969 | " \n",
970 | " \n",
971 | " X \n",
972 | " Y \n",
973 | " \n",
974 | " \n",
975 | " \n",
976 | " \n",
977 | " count \n",
978 | " 100.000000 \n",
979 | " 100.000000 \n",
980 | " \n",
981 | " \n",
982 | " mean \n",
983 | " -0.077132 \n",
984 | " -0.096945 \n",
985 | " \n",
986 | " \n",
987 | " std \n",
988 | " 1.062376 \n",
989 | " 1.143782 \n",
990 | " \n",
991 | " \n",
992 | " min \n",
993 | " -2.432764 \n",
994 | " -2.725281 \n",
995 | " \n",
996 | " \n",
997 | " 25% \n",
998 | " -0.888474 \n",
999 | " -0.885722 \n",
1000 | " \n",
1001 | " \n",
1002 | " 50% \n",
1003 | " -0.268889 \n",
1004 | " -0.228708 \n",
1005 | " \n",
1006 | " \n",
1007 | " 75% \n",
1008 | " 0.558093 \n",
1009 | " 0.806708 \n",
1010 | " \n",
1011 | " \n",
1012 | " max \n",
1013 | " 2.460336 \n",
1014 | " 2.565985 \n",
1015 | " \n",
1016 | " \n",
1017 | "
\n",
1018 | "
"
1019 | ],
1020 | "text/plain": [
1021 | " X Y\n",
1022 | "count 100.000000 100.000000\n",
1023 | "mean -0.077132 -0.096945\n",
1024 | "std 1.062376 1.143782\n",
1025 | "min -2.432764 -2.725281\n",
1026 | "25% -0.888474 -0.885722\n",
1027 | "50% -0.268889 -0.228708\n",
1028 | "75% 0.558093 0.806708\n",
1029 | "max 2.460336 2.565985"
1030 | ]
1031 | },
1032 | "execution_count": 24,
1033 | "metadata": {},
1034 | "output_type": "execute_result"
1035 | }
1036 | ],
1037 | "source": [
1038 | "portfolio.describe()"
1039 | ]
1040 | },
1041 | {
1042 | "cell_type": "code",
1043 | "execution_count": 25,
1044 | "metadata": {},
1045 | "outputs": [
1046 | {
1047 | "name": "stdout",
1048 | "output_type": "stream",
1049 | "text": [
1050 | "\n",
1051 | "RangeIndex: 100 entries, 0 to 99\n",
1052 | "Data columns (total 2 columns):\n",
1053 | "X 100 non-null float64\n",
1054 | "Y 100 non-null float64\n",
1055 | "dtypes: float64(2)\n",
1056 | "memory usage: 1.7 KB\n"
1057 | ]
1058 | }
1059 | ],
1060 | "source": [
1061 | "portfolio.info()"
1062 | ]
1063 | },
1064 | {
1065 | "cell_type": "markdown",
1066 | "metadata": {},
1067 | "source": [
1068 | "There is no specific function to perform the bootstrap but we can easily create one, which takes input (X, Y) and returns the estimate for $\\alpha$."
1069 | ]
1070 | },
1071 | {
1072 | "cell_type": "code",
1073 | "execution_count": 26,
1074 | "metadata": {},
1075 | "outputs": [],
1076 | "source": [
1077 | "def alpha_calc(X, y):\n",
1078 | " alpha = ((np.var(y) - np.cov(X,y)) / (np.var(X) + np.var(y) - 2 * np.cov(X,y)))\n",
1079 | " return alpha"
1080 | ]
1081 | },
1082 | {
1083 | "cell_type": "markdown",
1084 | "metadata": {},
1085 | "source": [
1086 | "This function returns, or outputs, an estimate for $\\alpha$. For instance, the following command tells `python` to estimate $\\alpha$ using all 100 observations."
1087 | ]
1088 | },
1089 | {
1090 | "cell_type": "code",
1091 | "execution_count": 27,
1092 | "metadata": {},
1093 | "outputs": [
1094 | {
1095 | "name": "stdout",
1096 | "output_type": "stream",
1097 | "text": [
1098 | "[1.07270947 0.57665115]\n"
1099 | ]
1100 | }
1101 | ],
1102 | "source": [
1103 | "X = portfolio['X'][0:100]\n",
1104 | "y = portfolio['Y'][0:100]\n",
1105 | "print(alpha_calc(X, y)[0])"
1106 | ]
1107 | },
1108 | {
1109 | "cell_type": "markdown",
1110 | "metadata": {},
1111 | "source": [
1112 | "We can create a function that will allow us to perform a random selection of 100 observations from the range of one to 100 with replacement. This is similar to building a new bootstrap data set and calculating $\\alpha$ based on the new data set."
1113 | ]
1114 | },
1115 | {
1116 | "cell_type": "code",
1117 | "execution_count": 28,
1118 | "metadata": {},
1119 | "outputs": [
1120 | {
1121 | "name": "stdout",
1122 | "output_type": "stream",
1123 | "text": [
1124 | "[0.96081732 0.5795274 ]\n"
1125 | ]
1126 | }
1127 | ],
1128 | "source": [
1129 | "def bootstrap(df, fraction, n):\n",
1130 | " result = 0\n",
1131 | " for i in range(0, n + 1):\n",
1132 | " sample = df.sample(frac= fraction, replace=True)\n",
1133 | " X = sample.X[:100]\n",
1134 | " y = sample.Y[:100]\n",
1135 | " alpha_result = alpha_calc(X,y)\n",
1136 | " result += alpha_result\n",
1137 | " final_result = result / n\n",
1138 | " return final_result\n",
1139 | "\n",
1140 | "print(bootstrap(portfolio, 1, 1000)[0])"
1141 | ]
1142 | },
1143 | {
1144 | "cell_type": "markdown",
1145 | "metadata": {},
1146 | "source": [
1147 | "The final output shows that using the original data, $\\alpha$ is approximately 0.58.\n",
1148 | "\n",
1149 | "\n",
1150 | "### Estimating the Accuracy of a Linear Regression Model\n",
1151 | "\n",
1152 | "The bootstrap approach can be used to assess the variability of the coefficient estimates and predictions from a statistical learning method. Here we use the bootstrap approach to assess the variability of the estimates for $β_0$ and $β_1$, the intercept and slope terms for the linear regression model that uses **horsepower** to predict **mpg** in the **Auto** data set. We will compare the estimates obtained using the bootstrap to those obtained using the formulas for $SE(\\hat{\\beta}_0)$ and $SE(\\hat{\\beta}_1)$ described in Lab 3."
1153 | ]
1154 | },
1155 | {
1156 | "cell_type": "code",
1157 | "execution_count": 29,
1158 | "metadata": {},
1159 | "outputs": [
1160 | {
1161 | "name": "stdout",
1162 | "output_type": "stream",
1163 | "text": [
1164 | "Coeficient: \tIntercept:\n",
1165 | " [-0.15781924] \t 40.004202891898615\n"
1166 | ]
1167 | }
1168 | ],
1169 | "source": [
1170 | "X3 = auto[['horsepower']].values\n",
1171 | "y3 = auto['mpg']\n",
1172 | "bootl = lm.fit(X3, y3)\n",
1173 | "\n",
1174 | "print('Coeficient: \\tIntercept:\\n',\n",
1175 | " bootl.coef_, '\\t', bootl.intercept_)"
1176 | ]
1177 | },
1178 | {
1179 | "cell_type": "markdown",
1180 | "metadata": {},
1181 | "source": [
1182 | "We can use the same approach to create bootstrap estimates for the intercept and slope terms by randomly sampling from among the observations with replacement. We can compute the standard errors of 1,000 bootstrap estimates for the intercept and slope terms."
1183 | ]
1184 | },
1185 | {
1186 | "cell_type": "code",
1187 | "execution_count": 30,
1188 | "metadata": {},
1189 | "outputs": [
1190 | {
1191 | "name": "stdout",
1192 | "output_type": "stream",
1193 | "text": [
1194 | "Coeficient: \tIntercept:\n",
1195 | " [-0.16079905] \t 40.4657628966397\n"
1196 | ]
1197 | }
1198 | ],
1199 | "source": [
1200 | "from sklearn.utils import resample\n",
1201 | "\n",
1202 | "X_sample, y_sample = resample(X3, y3, n_samples=1000)\n",
1203 | "bootl2 = lm.fit(X_sample,y_sample)\n",
1204 | "\n",
1205 | "print('Coeficient: \\tIntercept:\\n',\n",
1206 | " bootl2.coef_, '\\t', bootl2.intercept_)"
1207 | ]
1208 | },
1209 | {
1210 | "cell_type": "code",
1211 | "execution_count": null,
1212 | "metadata": {},
1213 | "outputs": [],
1214 | "source": []
1215 | }
1216 | ],
1217 | "metadata": {
1218 | "kernelspec": {
1219 | "display_name": "Python 3",
1220 | "language": "python",
1221 | "name": "python3"
1222 | },
1223 | "language_info": {
1224 | "codemirror_mode": {
1225 | "name": "ipython",
1226 | "version": 3
1227 | },
1228 | "file_extension": ".py",
1229 | "mimetype": "text/x-python",
1230 | "name": "python",
1231 | "nbconvert_exporter": "python",
1232 | "pygments_lexer": "ipython3",
1233 | "version": "3.7.4"
1234 | }
1235 | },
1236 | "nbformat": 4,
1237 | "nbformat_minor": 2
1238 | }
1239 |
--------------------------------------------------------------------------------
/chapter06_lab3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 6.7 Lab 3: PLS Regression\n",
8 | "\n",
9 | "\n",
10 | "\n",
11 | "\n"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 28,
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "%matplotlib inline\n",
25 | "\n",
26 | "import pandas as pd\n",
27 | "import numpy as np\n",
28 | "import matplotlib.pyplot as plt\n",
29 | "\n",
30 | "from sklearn.preprocessing import LabelEncoder\n",
31 | "from sklearn.preprocessing import scale\n",
32 | "from sklearn.model_selection import train_test_split\n",
33 | "from sklearn.linear_model import LinearRegression\n",
34 | "from sklearn.cross_decomposition import PLSRegression, PLSSVD\n",
35 | "from sklearn.metrics import mean_squared_error\n",
36 | "from sklearn.model_selection import KFold\n",
37 | "from sklearn.model_selection import cross_val_score"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "Before we run the PLS regression we start with data preparation. Just like in previous Labs for Chapter 6 we'll be using **Hitters** data set and the procedure is the copy of previous data preparation.\n",
45 | "\n",
46 | "> You can download **Hitters** data set for from **[here.](https://drive.google.com/file/d/1e2NqNJGkCTAGBee8JHGNGCJHplG5R2YQ/view?usp=sharing)**\n"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 2,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "hitters = pd.read_csv('Hitters.csv')\n",
56 | "hitters = hitters.dropna(axis=0)"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": 4,
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "X = hitters.loc[:, hitters.columns != 'Salary']\n",
66 | "y = hitters['Salary']"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": 5,
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "def get_index(df):\n",
76 | " col_index = {}\n",
77 | " column = df.columns\n",
78 | " for col in column:\n",
79 | " col_index[col] = df.columns.get_loc(col)\n",
80 | " return col_index"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 6,
86 | "metadata": {},
87 | "outputs": [
88 | {
89 | "name": "stdout",
90 | "output_type": "stream",
91 | "text": [
92 | "{'AtBat': 0, 'Hits': 1, 'HmRun': 2, 'Runs': 3, 'RBI': 4, 'Walks': 5, 'Years': 6, 'CAtBat': 7, 'CHits': 8, 'CHmRun': 9, 'CRuns': 10, 'CRBI': 11, 'CWalks': 12, 'League': 13, 'Division': 14, 'PutOuts': 15, 'Assists': 16, 'Errors': 17, 'NewLeague': 18}\n"
93 | ]
94 | }
95 | ],
96 | "source": [
97 | "column_index = get_index(X)\n",
98 | "print(column_index)"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 7,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "def label_encoder(df, index):\n",
108 | " \"\"\"\n",
109 | " We will covnert categorical values into the dummies\n",
110 | " using LabelEncoder from scikit-learn\n",
111 | "\n",
112 | " \"\"\"\n",
113 | " le = LabelEncoder()\n",
114 | " for c in index:\n",
115 | " df.iloc[:, c] = le.fit_transform(df.iloc[:, c])\n",
116 | " return df"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": 10,
122 | "metadata": {},
123 | "outputs": [
124 | {
125 | "name": "stderr",
126 | "output_type": "stream",
127 | "text": [
128 | "F:\\Anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py:494: SettingWithCopyWarning: \n",
129 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
130 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
131 | "\n",
132 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
133 | " self.obj[item] = s\n"
134 | ]
135 | }
136 | ],
137 | "source": [
138 | "X = label_encoder(X, [13, 14, 18]) # running previously defined function will turn categorical variables into dummies"
139 | ]
140 | },
141 | {
142 | "cell_type": "markdown",
143 | "metadata": {},
144 | "source": [
145 | "## _6.7.2 Partial Least Squares_\n",
146 | "\n",
147 | "We implement partial least squares (PLS) using the `PLSRegression` function, also in the `sklearn` library. We will run several models using k-Fold. "
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": 14,
153 | "metadata": {},
154 | "outputs": [],
155 | "source": [
156 | "# Split into training and test sets\n",
157 | "X_train, X_test , y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1)"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": 38,
163 | "metadata": {},
164 | "outputs": [],
165 | "source": [
166 | "def pls_kfold(X, y, k):\n",
167 | " \n",
168 | " kf = KFold(n_splits=k, shuffle=True, random_state=1)\n",
169 | " pls_mse = []\n",
170 | "\n",
171 | " for i in np.arange(1, 20):\n",
172 | " pls = PLSRegression(n_components=i)\n",
173 | " score = cross_val_score(pls, scale(X), y,\n",
174 | " cv=kf, scoring='neg_mean_squared_error').mean()\n",
175 | " pls_mse.append(-score)\n",
176 | " return pls_mse"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 39,
182 | "metadata": {},
183 | "outputs": [
184 | {
185 | "data": {
186 | "text/plain": [
187 | "(-1, 19.9)"
188 | ]
189 | },
190 | "execution_count": 39,
191 | "metadata": {},
192 | "output_type": "execute_result"
193 | },
194 | {
195 | "data": {
196 | "image/png": "\n",
197 | "text/plain": [
198 | ""
199 | ]
200 | },
201 | "metadata": {
202 | "needs_background": "light"
203 | },
204 | "output_type": "display_data"
205 | }
206 | ],
207 | "source": [
208 | "# Plot results\n",
209 | "plt.plot(np.arange(1, 20), np.array(pls_kfold(X_train, y_train, 10)), '-o', color='r')\n",
210 | "plt.xlabel('Number of principal components in regression')\n",
211 | "plt.ylabel('MSE')\n",
212 | "plt.title('Salary')\n",
213 | "plt.xlim(xmin=-1)"
214 | ]
215 | },
216 | {
217 | "cell_type": "markdown",
218 | "metadata": {},
219 | "source": [
220 | "The lowest cross-validation error occurs when only $M = 2$ partial least squares dimensions are used. We now evaluate the corresponding test set MSE:"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": 45,
226 | "metadata": {},
227 | "outputs": [
228 | {
229 | "name": "stdout",
230 | "output_type": "stream",
231 | "text": [
232 | "MSE: 104838.51042760801\n"
233 | ]
234 | }
235 | ],
236 | "source": [
237 | "pls = PLSRegression(n_components=2)\n",
238 | "pls.fit(scale(X_train), y_train)\n",
239 | "\n",
240 | "print('MSE: ', mean_squared_error(y_test, pls.predict(scale(X_test))))"
241 | ]
242 | },
243 | {
244 | "cell_type": "markdown",
245 | "metadata": {},
246 | "source": [
247 | "The test MSE is comparable to, but slightly higher than, the test MSE obtained using ridge regression and the lasso.\n",
248 | "\n",
249 | "Finally, we perform PLS using the full data set, using M = 2, the number of components identified by cross-validation."
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": 51,
255 | "metadata": {},
256 | "outputs": [
257 | {
258 | "name": "stdout",
259 | "output_type": "stream",
260 | "text": [
261 | "MSE: 108660.36160213721\n"
262 | ]
263 | }
264 | ],
265 | "source": [
266 | "pls = PLSRegression(n_components=2)\n",
267 | "pls.fit(scale(X), y)\n",
268 | "\n",
269 | "print('MSE: ', mean_squared_error(y, pls.predict(scale(X))))"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": null,
275 | "metadata": {},
276 | "outputs": [],
277 | "source": []
278 | }
279 | ],
280 | "metadata": {
281 | "kernelspec": {
282 | "display_name": "Python 3",
283 | "language": "python",
284 | "name": "python3"
285 | },
286 | "language_info": {
287 | "codemirror_mode": {
288 | "name": "ipython",
289 | "version": 3
290 | },
291 | "file_extension": ".py",
292 | "mimetype": "text/x-python",
293 | "name": "python",
294 | "nbconvert_exporter": "python",
295 | "pygments_lexer": "ipython3",
296 | "version": "3.7.4"
297 | }
298 | },
299 | "nbformat": 4,
300 | "nbformat_minor": 2
301 | }
302 |
--------------------------------------------------------------------------------