├── images
│   ├── nn.png
│   ├── nn2.png
│   ├── animals
│   │   ├── bug.jpg
│   │   ├── bird.jpg
│   │   ├── bird2.jpg
│   │   ├── bird3.jpg
│   │   ├── elba.jpg
│   │   ├── hamish.jpg
│   │   ├── poodle.jpg
│   │   ├── tortoise.jpg
│   │   ├── butterfly.jpg
│   │   └── butterfly2.jpg
│   └── isl_small.png
├── .gitignore
├── _bookdown.yml
├── ISLRv2-solutions.Rproj
├── index.Rmd
├── _output.yml
├── DESCRIPTION
├── .vscode
│   └── settings.json
├── islrv2.css
├── .github
│   └── workflows
│       └── github-actions.yml
├── LICENSE.md
├── README.md
├── 13-multiple-testing.Rmd
├── 02-statistical-learning.Rmd
├── 11-survival-analysis-and-censored-data.Rmd
├── 05-resampling-methods.Rmd
├── 12-unsupervised-learning.Rmd
├── 09-support-vector-mechines.Rmd
├── 08-tree-based-methods.Rmd
├── 10-deep-learning.Rmd
├── 07-moving-beyond-linearity.Rmd
└── data
    └── Auto.data
/images/nn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/nn.png
--------------------------------------------------------------------------------
/images/nn2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/nn2.png
--------------------------------------------------------------------------------
/images/animals/bug.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/animals/bug.jpg
--------------------------------------------------------------------------------
/images/isl_small.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/isl_small.png
--------------------------------------------------------------------------------
/images/animals/bird.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/animals/bird.jpg
--------------------------------------------------------------------------------
/images/animals/bird2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/animals/bird2.jpg
--------------------------------------------------------------------------------
/images/animals/bird3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/animals/bird3.jpg
--------------------------------------------------------------------------------
/images/animals/elba.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/animals/elba.jpg
--------------------------------------------------------------------------------
/images/animals/hamish.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/animals/hamish.jpg
--------------------------------------------------------------------------------
/images/animals/poodle.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/animals/poodle.jpg
--------------------------------------------------------------------------------
/images/animals/tortoise.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/animals/tortoise.jpg
--------------------------------------------------------------------------------
/images/animals/butterfly.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/animals/butterfly.jpg
--------------------------------------------------------------------------------
/images/animals/butterfly2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/animals/butterfly2.jpg
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | _book
6 | .DS_Store
7 | islrv2-solutions.rds
8 | _bookdown_files
9 | [0-9]*.md
10 | [0-9]*_files/
11 |
12 |
--------------------------------------------------------------------------------
/_bookdown.yml:
--------------------------------------------------------------------------------
1 | book_filename: "islrv2-solutions"
2 | language:
3 | ui:
4 | chapter_name: ""
5 | delete_merged_file: true
6 | new_session: yes
7 | repo: https://github.com/danhalligan/ISLRv2-solutions/
8 |
--------------------------------------------------------------------------------
/ISLRv2-solutions.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: knitr
13 | LaTeX: pdfLaTeX
14 |
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 |
18 | BuildType: Website
19 |
20 | SpellingDictionary: en_US
21 |
--------------------------------------------------------------------------------
/index.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: An Introduction to Statistical Learning
3 | subtitle: Exercise solutions in R
4 | site: bookdown::bookdown_site
5 | github-repo: danhalligan/ISLRv2-solutions
6 | url: 'https\://danhalligan.github.io/ISLRv2-solutions/'
7 | ---
8 |
9 | # Introduction
10 |
11 | This bookdown document provides solutions for exercises in the book
12 | ["An Introduction to Statistical Learning with Applications in R"](https://www.statlearning.com/),
13 | second edition, by Gareth James, Daniela Witten, Trevor Hastie and Robert Tibshirani.
14 |
15 | 
16 |
--------------------------------------------------------------------------------
/_output.yml:
--------------------------------------------------------------------------------
1 | bookdown::gitbook:
2 | css: [islrv2.css, https://cdn.jsdelivr.net/gh/aaaakshat/cm-web-fonts@latest/fonts.css]
3 | config:
4 | toc:
5 | collapse: section
6 | before: |
7 | ISLRv2 Solutions
8 | after: |
9 | Published with bookdown
10 | edit: https://github.com/danhalligan/ISLRv2-solutions/edit/main/%s
11 | sharing:
12 | facebook: no
13 | twitter: no
14 | github: yes
15 | all: []
16 | github-repo: danhalligan/ISLRv2-solutions
17 | url: https://danhalligan.github.io/ISLRv2-solutions/
18 |
19 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: placeholder
2 | Type: Book
3 | Title: Does not matter.
4 | Version: 0.0.1
5 | biocviews:
6 | Imports:
7 | ape,
8 | BART,
9 | bookdown,
10 | boot,
11 | class,
12 | corrplot,
13 | e1071,
14 | gam,
15 | gbm,
16 | ggfortify,
17 | ggplot2,
18 | ggtree,
19 | glmnet,
20 | ISLR2,
21 | keras,
22 | knitr,
23 | leaps,
24 | MASS,
25 | neuralnet,
26 | plotly,
27 | pls,
28 | randomForest,
29 | showtext,
30 | sigmoid,
31 | splines,
32 | survival,
33 | tree,
34 | tidyverse,
35 | xfun
36 | Remotes:
37 | rstudio/bookdown,
38 | github::YuLab-SMU/ggtree
39 |
40 |
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "cSpell.words": [
3 | "ANOVA",
4 | "Bonferroni",
5 | "boxplots",
6 | "covariate",
7 | "dendrogram",
8 | "dendrograms",
9 | "exponentiating",
10 | "FWER",
11 | "hypergeometric",
12 | "Kaplan",
13 | "logit",
14 | "LOOCV",
15 | "multinomial",
16 | "overfit",
17 | "overfitting",
18 | "quantile",
19 | "quantiles",
20 | "scalings",
21 | "scatterplot",
22 | "scatterplots",
23 | "softmax",
24 | "stochasticity",
25 | "timepoint"
26 | ],
27 | "cSpell.enabled": true,
28 | "cSpell.language": "en-US"
29 | }
--------------------------------------------------------------------------------
/islrv2.css:
--------------------------------------------------------------------------------
1 | #header .title {
2 | color: #2E71B6;
3 | font-family: "Computer Modern Serif", serif;
4 | font-weight: normal;
5 | font-size: 3em;
6 | letter-spacing: -0.8px;
7 | line-height: 1.2;
8 | }
9 |
10 | h1,
11 | h2,
12 | h3 {
13 | color: #2E71B6;
14 | font-family: "Computer Modern Serif", serif;
15 | font-weight: normal;
16 | }
17 |
18 | .book .book-body .page-wrapper .page-inner section.normal h1 {
19 | font-size: 2.5em;
20 | }
21 |
22 | .book .book-body .page-wrapper .page-inner section.normal h2 {
23 | font-size: 2em;
24 | font-style: italic;
25 | }
26 |
27 | h3>.header-section-number {
28 | display: none;
29 | }
30 |
31 | .book .book-body .page-wrapper .page-inner section.normal pre.sourceCode {
32 | background: #FDF5E7;
33 | }
34 |
35 | .book .book-body .page-wrapper .page-inner {
36 | max-width: 1000px !important;
37 | }
--------------------------------------------------------------------------------
/.github/workflows/github-actions.yml:
--------------------------------------------------------------------------------
1 | name: Build and Deploy
2 | on: [push, workflow_dispatch]
3 | permissions:
4 | contents: write
5 | jobs:
6 | build-and-deploy:
7 | concurrency: ci-${{ github.ref }}
8 | runs-on: ubuntu-latest
9 | steps:
10 | - uses: actions/checkout@v2
11 |
12 | - uses: r-lib/actions/setup-pandoc@v1
13 |
14 | - uses: r-lib/actions/setup-r@v2
15 | with:
16 | use-public-rspm: true
17 |
18 | - uses: r-lib/actions/setup-r-dependencies@v2
19 | with:
20 | cache: false
21 |
22 | - name: Setup keras
23 | run: Rscript -e "keras::install_keras()"
24 |
25 | - name: Build 🔧
26 | run: Rscript -e "bookdown::render_book('index.Rmd', 'bookdown::gitbook')"
27 |
28 | - name: Deploy 🚀
29 | uses: JamesIves/github-pages-deploy-action@v4.4.0
30 | with:
31 | folder: _book
32 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | Copyright (c) 2022 ISLRv2-solutions authors
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # An Introduction to Statistical Learning with Applications in R second edition solutions
2 |
3 | [](https://github.com/danhalligan/ISLRv2-solutions/actions/workflows/github-actions.yml)
4 | 
5 |
6 | 
7 |
8 | This repository provides my solutions for all exercises in the book
9 | ["An Introduction to Statistical Learning with Applications in R"](https://www.statlearning.com/),
10 | second edition,
11 | by Gareth James, Daniela Witten, Trevor Hastie and Robert Tibshirani.
12 |
13 | If you use these solutions or find them useful, please star this repository!
14 |
15 | ## About these solutions
16 |
17 | The solutions are written in [bookdown] format using my
18 | [ISLRv2 solutions template](https://github.com/danhalligan/ISLRv2-solutions-template).
19 |
20 | For anyone reading this book, I believe there is great value in deriving the
21 | solutions yourself, and the template above can be forked to provide a great
22 | starting point. I've created template Rmarkdown files for each chapter and
23 | transcribed all questions as quotes within the chapter files, leaving space
24 | for you to write your solutions either as text or as R code chunks.
25 | For more details, refer to the template repository.
26 |
27 | I've tried my best to provide solutions to each problem in this book, and I
28 | believe my answers should be (at least for the most part) correct. The solutions
29 | are relatively concise but hopefully comprehensive enough to address the
30 | purpose of each question.
31 |
32 | If, when reading these solutions, you notice any inconsistencies or mistakes,
33 | or have any other questions, please feel free to raise an issue or provide
34 | suggestions, and I'll happily take a look and try to incorporate them.
35 |
36 | ## Compiled solutions
37 |
38 | The HTML book is compiled from the Rmarkdown files using a [GitHub Actions]
39 | workflow and then automatically deployed to [GitHub pages].
40 |
41 | You can view these solutions hosted from GitHub at the following location:
42 |
43 | * <https://danhalligan.github.io/ISLRv2-solutions/>
44 |
45 | ## Building the solutions
46 |
47 | The R dependencies are provided in the `DESCRIPTION` file and you can install them by running:
48 |
49 | ```r
50 | devtools::install()
51 | ```
52 |
53 | Other than that, you need the Python packages [tensorflow and keras] installed in the Python environment that you use to run the examples:
54 |
55 | ```bash
56 | pip3 install tensorflow keras
57 | ```
58 |
59 | Rebuild the book with compiled solutions by running:
60 |
61 | ```r
62 | bookdown::render_book('index.Rmd', 'bookdown::gitbook')
63 | ```
64 |
65 | To render only one file/chapter (e.g. solutions for chapter 2), use:
66 |
67 | ```r
68 | rmarkdown::render(input="02-statistical-learning.Rmd", output_format=bookdown::pdf_document2())
69 | ```
70 |
71 | [bookdown]: https://github.com/rstudio/bookdown
72 | [GitHub Actions]: https://docs.github.com/en/actions
73 | [GitHub pages]: https://pages.github.com/
74 | [tensorflow and keras]: https://tensorflow.rstudio.com/install/
75 |
--------------------------------------------------------------------------------
/13-multiple-testing.Rmd:
--------------------------------------------------------------------------------
1 | # Multiple Testing
2 |
3 | ## Conceptual
4 |
5 | ### Question 1
6 |
7 | > Suppose we test $m$ null hypotheses, all of which are true. We control the
8 | > Type I error for each null hypothesis at level $\alpha$. For each sub-problem,
9 | > justify your answer.
10 | >
11 | > a. In total, how many Type I errors do we expect to make?
12 |
13 | We expect $m\alpha$.
14 |
15 | > b. Suppose that the m tests that we perform are independent. What is the
16 | > family-wise error rate associated with these m tests?
17 | >
18 | > _Hint: If two events A and B are independent, then Pr(A ∩ B) = Pr(A) Pr(B)._
19 |
20 | The family-wise error rate (FWER) is defined as the probability of making at
21 | least one Type I error. We can think of this as 1 minus the probability of
22 | no type I errors, which is:
23 |
24 | $1 - (1 - \alpha)^m$
25 |
26 | Alternatively, for two tests this is: Pr(A ∪ B) = Pr(A) + Pr(B) - Pr(A ∩ B).
27 | For independent tests this is $\alpha + \alpha - \alpha^2$
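
As a quick numerical check (a minimal sketch; the values of `alpha` and `m` below are illustrative, not taken from the question), we can evaluate this FWER for a few values of $m$:

```{r}
# FWER = 1 - (1 - alpha)^m for m independent tests, each at level alpha
alpha <- 0.05
m <- c(1, 2, 5, 10, 100)
setNames(1 - (1 - alpha)^m, paste0("m = ", m))
```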
28 |
29 | > c. Suppose that $m = 2$, and that the p-values for the two tests are
30 | > positively correlated, so that if one is small then the other will tend to
31 | > be small as well, and if one is large then the other will tend to be large.
32 | > How does the family-wise error rate associated with these $m = 2$ tests
33 | > qualitatively compare to the answer in (b) with $m = 2$?
34 | >
35 | > _Hint: First, suppose that the two p-values are perfectly correlated._
36 |
37 | If they were perfectly correlated, we would effectively be performing a single
38 | test (thus the FWER would be $\alpha$). Therefore, when the tests are positively
39 | correlated, we can expect the FWER to be less than in (b).
40 |
41 | Alternatively, as above, FWER = Pr(A ∪ B) = Pr(A) + Pr(B) - Pr(A ∩ B).
42 | For perfectly positively correlated tests Pr(A ∩ B) = $\alpha$, so the
43 | FWER is $\alpha$, which is smaller than in (b).
44 |
45 | > d. Suppose again that $m = 2$, but that now the p-values for the two tests are
46 | > negatively correlated, so that if one is large then the other will tend to
47 | > be small. How does the family-wise error rate associated with these $m = 2$
48 | > tests qualitatively compare to the answer in (b) with $m = 2$?
49 | >
50 | > _Hint: First, suppose that whenever one p-value is less than $\alpha$,_
51 | > _then the other will be greater than $\alpha$. In other words, we can_
52 | > _never reject both null hypotheses._
53 |
54 | Taking the equation above, for two tests,
55 | FWER = Pr(A ∪ B) = Pr(A) + Pr(B) - Pr(A ∩ B). In the case considered in the
56 | hint, Pr(A ∩ B) = 0, so Pr(A ∪ B) = $2\alpha$, which is larger than in (b).
57 |
58 | ### Question 2
59 |
60 | > Suppose that we test $m$ hypotheses, and control the Type I error for each
61 | > hypothesis at level $\alpha$. Assume that all $m$ p-values are independent,
62 | > and that all null hypotheses are true.
63 | >
64 | > a. Let the random variable $A_j$ equal 1 if the $j$th null hypothesis is
65 | > rejected, and 0 otherwise. What is the distribution of $A_j$?
66 |
67 | $A_j$ follows a Bernoulli distribution: $A_j \sim \text{Bernoulli}(\alpha)$.
68 |
69 | > b. What is the distribution of $\sum_{j=1}^m A_j$?
70 |
71 | It follows a binomial distribution: $\sum_{j=1}^m A_j \sim \text{Bin}(m, \alpha)$.
72 |
73 | > c. What is the standard deviation of the number of Type I errors that we will
74 | > make?
75 |
76 | The variance of a Binomial is $npq$, so for this situation the standard
77 | deviation would be $\sqrt{m \alpha (1-\alpha)}$.
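
For instance (illustrative numbers, not part of the question), with $m = 100$ and $\alpha = 0.05$:

```{r}
# Standard deviation of the number of Type I errors: sqrt(m * alpha * (1 - alpha))
m <- 100
alpha <- 0.05
sqrt(m * alpha * (1 - alpha))
```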
78 |
79 | ### Question 3
80 |
81 | > Suppose we test $m$ null hypotheses, and control the Type I error for the
82 | > $j$th null hypothesis at level $\alpha_j$, for $j=1,...,m$. Argue that the
83 | > family-wise error rate is no greater than $\sum_{j=1}^m \alpha_j$.
84 |
85 | $p(A \cup B) = p(A) + p(B)$ if $A$ and $B$ are mutually exclusive, and
86 | $p(A) + p(B) - p(A \cap B)$ otherwise. Since $p(A \cap B)$ must be
87 | non-negative, $p(A \cup B) \le p(A) + p(B)$ (whether independent or not).
88 |
89 | Therefore, the probability of a type I error in _any_ of $m$ hypotheses can
90 | be no larger than the sum of the probabilities for each individual hypothesis
91 | (which is $\alpha_j$ for the $j$th).
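
A small simulation sketch (my own illustration, with arbitrary $\alpha_j$ values) agrees with this bound:

```{r}
# Estimate the FWER when the j-th true null is tested at level alpha_j and
# compare it with the union bound sum(alpha_j)
set.seed(1)
alpha_j <- c(0.01, 0.02, 0.03, 0.04)
fwer <- mean(replicate(10000, any(runif(length(alpha_j)) < alpha_j)))
c(estimated_FWER = fwer, union_bound = sum(alpha_j))
```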
92 |
93 | ### Question 4
94 |
95 | > Suppose we test $m = 10$ hypotheses, and obtain the p-values shown in Table
96 | > 13.4.
97 |
98 | ```{r}
99 | pvals <- c(0.0011, 0.031, 0.017, 0.32, 0.11, 0.90, 0.07, 0.006, 0.004, 0.0009)
100 | names(pvals) <- paste0("H", sprintf("%02d", 1:10))
101 | ```
102 |
103 | > a. Suppose that we wish to control the Type I error for each null hypothesis
104 | > at level $\alpha = 0.05$. Which null hypotheses will we reject?
105 |
106 | ```{r}
107 | names(which(pvals < 0.05))
108 | ```
109 |
110 | We reject all null hypotheses where $p < 0.05$.
111 |
112 | > b. Now suppose that we wish to control the FWER at level $\alpha = 0.05$.
113 | > Which null hypotheses will we reject? Justify your answer.
114 |
115 | ```{r}
116 | names(which(pvals < 0.05 / 10))
117 | ```
118 |
119 | We reject all null hypotheses where $p < 0.005$.
120 |
121 | > c. Now suppose that we wish to control the FDR at level $q = 0.05$. Which null
122 | > hypotheses will we reject? Justify your answer.
123 |
124 | ```{r}
125 | names(which(p.adjust(pvals, "fdr") < 0.05))
126 | ```
127 |
128 | We reject all null hypotheses where $q < 0.05$.
129 |
130 | > d. Now suppose that we wish to control the FDR at level $q = 0.2$. Which null
131 | > hypotheses will we reject? Justify your answer.
132 |
133 | ```{r}
134 | names(which(p.adjust(pvals, "fdr") < 0.2))
135 | ```
136 |
137 | We reject all null hypotheses where $q < 0.2$.
138 |
139 | > e. Of the null hypotheses rejected at FDR level $q = 0.2$, approximately how
140 | > many are false positives? Justify your answer.
141 |
142 | We expect around 20% of the rejections to be false positives (in this case,
143 | roughly 2 of the 8 rejected hypotheses), as the quick calculation below shows.
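
As a quick check (using the `pvals` defined above):

```{r}
# Approximate expected number of false positives: q x (number of rejections)
0.2 * sum(p.adjust(pvals, "fdr") < 0.2)
```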
144 |
145 | ### Question 5
146 |
147 | > For this problem, you will make up p-values that lead to a certain number of
148 | > rejections using the Bonferroni and Holm procedures.
149 | >
150 | > a. Give an example of five p-values (i.e. five numbers between 0 and 1 which,
151 | > for the purpose of this problem, we will interpret as p-values) for which
152 | > both Bonferroni’s method and Holm’s method reject exactly one null hypothesis
153 | > when controlling the FWER at level 0.1.
154 |
155 | In this case, for Bonferroni, we need one p-value to be less than $0.1 / 5 = 0.02$ and the others to be above.
156 | For Holm's method, we also need the most significant p-value to be below $0.1/(5 + 1 - 1) = 0.02$,
157 | and the second smallest to be at least $0.1/(5 + 1 - 2) = 0.025$ so that no further hypotheses are rejected.
158 |
159 | An example would be: 1, 1, 1, 1, 0.001.
160 |
161 | ```{r}
162 | pvals <- c(1, 1, 1, 1, 0.001)
163 | sum(p.adjust(pvals, method = "bonferroni") < 0.1)
164 | sum(p.adjust(pvals, method = "holm") < 0.1)
165 | ```
166 |
167 | > b. Now give an example of five p-values for which Bonferroni rejects one
168 | > null hypothesis and Holm rejects more than one null hypothesis at level 0.1.
169 |
170 | An example would be: 1, 1, 1, 0.02, 0.001. For Holm's method we reject two
171 | because $0.02 < 0.1/(5 + 1 - 2) = 0.025$, whereas Bonferroni requires $p < 0.02$.
172 |
173 | ```{r}
174 | pvals <- c(1, 1, 1, 0.02, 0.001)
175 | sum(p.adjust(pvals, method = "bonferroni") < 0.1)
176 | sum(p.adjust(pvals, method = "holm") < 0.1)
177 | ```
178 |
179 | ### Question 6
180 |
181 | > For each of the three panels in Figure 13.3, answer the following questions:
182 |
183 | * There are always: 8 positives (red) and 2 negatives (black).
184 | * False / true positives are black / red points _below_ the line respectively.
185 | * False / true negatives are red / black points _above_ the line respectively.
186 | * Type I / II errors are the same as false positives and false negatives
187 | respectively.
188 |
189 | > a. How many false positives, false negatives, true positives, true negatives,
190 | > Type I errors, and Type II errors result from applying the Bonferroni
191 | > procedure to control the FWER at level $\alpha = 0.05$?
192 |
193 | | Panel | FP | FN | TP | TN | Type I | Type II |
194 | |------ |--- |--- |--- |--- |------- |-------- |
195 | | 1 | 0 | 1 | 7 | 2 | 0 | 1 |
196 | | 2 | 0 | 1 | 7 | 2 | 0 | 1 |
197 | | 3 | 0 | 5 | 3 | 2 | 0 | 5 |
198 |
199 | > b. How many false positives, false negatives, true positives, true negatives,
200 | > Type I errors, and Type II errors result from applying the Holm procedure to
201 | > control the FWER at level $\alpha = 0.05$?
202 |
203 | | Panel | FP | FN | TP | TN | Type I | Type II |
204 | |------ |--- |--- |--- |--- |------- |-------- |
205 | | 1 | 0 | 1 | 7 | 2 | 0 | 1 |
206 | | 2 | 0 | 0 | 8 | 2 | 0 | 0 |
207 | | 3 | 0 | 0 | 8 | 2 | 0 | 0 |
208 |
209 | > c. What is the false discovery rate associated with using the Bonferroni
210 | > procedure to control the FWER at level $\alpha = 0.05$?
211 |
212 | The false discovery rate is the expected ratio of false positives (FP) to the
213 | total number of positives (FP + TP).
214 |
215 | For panels 1, 2, 3 this would be 0/7, 0/7 and 0/3 respectively.
216 |
217 | > d. What is the false discovery rate associated with using the Holm procedure
218 | > to control the FWER at level $\alpha = 0.05$?
219 |
220 | For panels 1, 2, 3 this would be 0/7, 0/8 and 0/8 respectively.
221 |
222 | > e. How would the answers to (a) and (c) change if we instead used the
223 | > Bonferroni procedure to control the FWER at level $\alpha = 0.001$?
224 |
225 | This would equate to a more stringent threshold. We would not call any more
226 | false positives, so the results would not change.
227 |
228 | ## Applied
229 |
230 | ### Question 7
231 |
232 | > This problem makes use of the `Carseats` dataset in the `ISLR2` package.
233 | >
234 | > a. For each quantitative variable in the dataset besides `Sales`, fit a linear
235 | > model to predict `Sales` using that quantitative variable. Report the p-values
236 | > associated with the coefficients for the variables. That is, for each model of
237 | > the form $Y = \beta_0 + \beta_1X + \epsilon$, report the p-value associated
238 | > with the coefficient $\beta_1$. Here, $Y$ represents `Sales` and $X$
239 | > represents one of the other quantitative variables.
240 |
241 | ```{r}
242 | library(ISLR2)
243 |
244 | nm <- c("CompPrice", "Income", "Advertising", "Population", "Price", "Age")
245 | pvals <- sapply(nm, function(n) {
246 | summary(lm(Carseats[["Sales"]] ~ Carseats[[n]]))$coef[2, 4]
247 | })
248 | ```
249 |
250 | > b. Suppose we control the Type I error at level $\alpha = 0.05$ for the
251 | > p-values obtained in (a). Which null hypotheses do we reject?
252 |
253 | ```{r}
254 | names(which(pvals < 0.05))
255 | ```
256 |
257 | > c. Now suppose we control the FWER at level 0.05 for the p-values. Which null
258 | > hypotheses do we reject?
259 |
260 | ```{r}
261 | names(which(pvals < 0.05 / length(nm)))
262 | ```
263 |
264 | > d. Finally, suppose we control the FDR at level 0.2 for the p-values. Which
265 | > null hypotheses do we reject?
266 |
267 | ```{r}
268 | names(which(p.adjust(pvals, "fdr") < 0.2))
269 | ```
270 |
271 | ### Question 8
272 |
273 | > In this problem, we will simulate data from $m = 100$ fund managers.
274 | >
275 | > ```r
276 | > set.seed(1)
277 | > n <- 20
278 | > m <- 100
279 | > X <- matrix(rnorm(n * m), ncol = m)
280 | > ```
281 |
282 | ```{r}
283 | set.seed(1)
284 | n <- 20
285 | m <- 100
286 | X <- matrix(rnorm(n * m), ncol = m)
287 | ```
288 |
289 | > These data represent each fund manager’s percentage returns for each of $n =
290 | > 20$ months. We wish to test the null hypothesis that each fund manager’s
291 | > percentage returns have population mean equal to zero. Notice that we
292 | > simulated the data in such a way that each fund manager’s percentage returns
293 | > do have population mean zero; in other words, all $m$ null hypotheses are true.
294 | >
295 | > a. Conduct a one-sample $t$-test for each fund manager, and plot a histogram
296 | > of the $p$-values obtained.
297 |
298 | ```{r}
299 | pvals <- apply(X, 2, function(p) t.test(p)$p.value)
300 | hist(pvals, main = NULL)
301 | ```
302 |
303 | > b. If we control Type I error for each null hypothesis at level $\alpha =
304 | > 0.05$, then how many null hypotheses do we reject?
305 |
306 | ```{r}
307 | sum(pvals < 0.05)
308 | ```
309 | > c. If we control the FWER at level 0.05, then how many null hypotheses do we
310 | > reject?
311 |
312 | ```{r}
313 | sum(pvals < 0.05 / length(pvals))
314 | ```
315 |
316 | > d. If we control the FDR at level 0.05, then how many null hypotheses do we
317 | > reject?
318 |
319 | ```{r}
320 | sum(p.adjust(pvals, "fdr") < 0.05)
321 | ```
322 |
323 | > e. Now suppose we “cherry-pick” the 10 fund managers who perform the best in
324 | > our data. If we control the FWER for just these 10 fund managers at level
325 | > 0.05, then how many null hypotheses do we reject? If we control the FDR for
326 | > just these 10 fund managers at level 0.05, then how many null hypotheses do we
327 | > reject?
328 |
329 | ```{r}
330 | best <- order(apply(X, 2, sum), decreasing = TRUE)[1:10]
331 | sum(pvals[best] < 0.05 / 10)
332 | sum(p.adjust(pvals[best], "fdr") < 0.05)
333 | ```
334 |
335 | > f. Explain why the analysis in (e) is misleading.
336 | >
337 | > _Hint The standard approaches for controlling the FWER and FDR assume that all
338 | > tested null hypotheses are adjusted for multiplicity, and that no
339 | > “cherry-picking” of the smallest p-values has occurred. What goes wrong if we
340 | > cherry-pick?_
341 |
342 | This is misleading because we are not correctly accounting for all of the tests
343 | performed. Cherry-picking is similar to repeating a test until, by chance, we
344 | find a significant result; see the simulation sketch below.
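
The following simulation sketch (my own illustration, not part of the original analysis) repeats the cherry-picking procedure from (e) on data where every null hypothesis is true. Correcting only for the 10 selected tests yields far more "discoveries" than the nominal 5% FWER would suggest:

```{r}
# Repeatedly simulate 100 null fund managers, cherry-pick the 10 best, and
# check whether any appear "significant" after correcting for only 10 tests.
set.seed(42)
hits <- replicate(1000, {
  X <- matrix(rnorm(20 * 100), ncol = 100)
  pv <- apply(X, 2, function(x) t.test(x)$p.value)
  best <- order(colSums(X), decreasing = TRUE)[1:10]
  any(pv[best] < 0.05 / 10)
})
mean(hits) # typically far above the nominal 0.05
```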
345 |
--------------------------------------------------------------------------------
/02-statistical-learning.Rmd:
--------------------------------------------------------------------------------
1 | # Statistical Learning
2 |
3 | ## Conceptual
4 |
5 | ### Question 1
6 |
7 | > For each of parts (a) through (d), indicate whether we would generally expect
8 | > the performance of a flexible statistical learning method to be better or
9 | > worse than an inflexible method. Justify your answer.
10 | >
11 | > a. The sample size $n$ is extremely large, and the number of predictors $p$ is
12 | > small.
13 |
14 | Flexible best - opposite of b.
15 |
16 | > b. The number of predictors $p$ is extremely large, and the number of
17 | > observations $n$ is small.
18 |
19 | Inflexible best - high chance of some predictors being randomly associated.
20 |
21 | > c. The relationship between the predictors and response is highly
22 | > non-linear.
23 |
24 | Flexible best - inflexible leads to high bias.
25 |
26 | > d. The variance of the error terms, i.e. $\sigma^2 = Var(\epsilon)$, is
27 | > extremely high.
28 |
29 | Inflexible best - opposite of c.
30 |
31 | ### Question 2
32 |
33 | > Explain whether each scenario is a classification or regression problem, and
34 | > indicate whether we are most interested in inference or prediction. Finally,
35 | > provide $n$ and $p$.
36 | >
37 | > a. We collect a set of data on the top 500 firms in the US. For each firm
38 | > we record profit, number of employees, industry and the CEO salary. We are
39 | > interested in understanding which factors affect CEO salary.
40 |
41 | $n=500$, $p=3$, regression, inference.
42 |
43 | > b. We are considering launching a new product and wish to know whether
44 | > it will be a success or a failure. We collect data on 20 similar products
45 | > that were previously launched. For each product we have recorded whether it
46 | > was a success or failure, price charged for the product, marketing budget,
47 | > competition price, and ten other variables.
48 |
49 | $n=20$, $p=13$, classification, prediction.
50 |
51 | > c. We are interested in predicting the % change in the USD/Euro exchange
52 | > rate in relation to the weekly changes in the world stock markets. Hence we
53 | > collect weekly data for all of 2012. For each week we record the % change
54 | > in the USD/Euro, the % change in the US market, the % change in the British
55 | > market, and the % change in the German market.
56 |
57 | $n=52$, $p=3$, regression, prediction.
58 |
59 | ### Question 3
60 |
61 | > We now revisit the bias-variance decomposition.
62 | >
63 | > a. Provide a sketch of typical (squared) bias, variance, training error,
64 | > test error, and Bayes (or irreducible) error curves, on a single plot, as
65 | > we go from less flexible statistical learning methods towards more flexible
66 | > approaches. The x-axis should represent the amount of flexibility in the
67 | > method, and the y-axis should represent the values for each curve. There
68 | > should be five curves. Make sure to label each one.
69 | >
70 | > b. Explain why each of the five curves has the shape displayed in
71 | > part (a).
72 |
73 | * (squared) bias: Decreases with increasing flexibility (Generally, more
74 | flexible methods result in less bias).
75 | * variance: Increases with increasing flexibility (In general, more flexible
76 | statistical methods have higher variance).
77 | * training error: Decreases with model flexibility (More complex models will
78 | better fit the training data).
79 | * test error: Decreases initially, then increases due to overfitting (the fall
80 | in bias is eventually outweighed by the rise in variance); see the sketch below.
81 | * Bayes (irreducible) error: fixed (does not change with model).
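
The following simulation sketch (my own illustration, not from the book: polynomial fits of increasing degree to simulated data with a non-linear truth) reproduces the typical shapes of the training and test error curves:

```{r}
# Training vs test MSE as flexibility (polynomial degree) increases
set.seed(1)
n <- 100
f <- function(x) sin(2 * x)                      # true non-linear relationship
train <- data.frame(x = runif(n, -2, 2))
train$y <- f(train$x) + rnorm(n, sd = 0.5)
test <- data.frame(x = runif(n, -2, 2))
test$y <- f(test$x) + rnorm(n, sd = 0.5)
degrees <- 1:10
errs <- sapply(degrees, function(d) {
  fit <- lm(y ~ poly(x, d), data = train)
  c(
    train = mean((train$y - predict(fit, train))^2),
    test  = mean((test$y - predict(fit, test))^2)
  )
})
matplot(degrees, t(errs), type = "l", lty = 1, col = c("blue", "red"),
        xlab = "Flexibility (polynomial degree)", ylab = "MSE")
legend("topright", c("train", "test"), lty = 1, col = c("blue", "red"))
```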
82 |
83 | ### Question 4
84 |
85 | > You will now think of some real-life applications for statistical learning.
86 | >
87 | > a. Describe three real-life applications in which classification might
88 | > be useful. Describe the response, as well as the predictors. Is the goal of
89 | > each application inference or prediction? Explain your answer.
90 |
91 | * Coffee machine cleaned? (day of week, person assigned), inference.
92 | * Is a flight delayed? (airline, airport etc), inference.
93 | * Beer type (IPA, pilsner etc.), prediction.
94 |
95 | > b. Describe three real-life applications in which regression might be
96 | > useful. Describe the response, as well as the predictors. Is the goal of
97 | > each application inference or prediction? Explain your answer.
98 |
99 | * Amount of bonus paid (profitability, client feedback), prediction.
100 | * Person's height, prediction.
101 | * House price, inference.
102 |
103 | > c. Describe three real-life applications in which cluster analysis might be
104 | > useful.
105 |
106 | * RNAseq tumour gene expression data.
107 | * SNPs in human populations.
108 | * Frequencies of mutations (with base pair context) in somatic mutation data.
109 |
110 | ### Question 5
111 |
112 | > What are the advantages and disadvantages of a very flexible (versus a less
113 | > flexible) approach for regression or classification? Under what circumstances
114 | > might a more flexible approach be preferred to a less flexible approach? When
115 | > might a less flexible approach be preferred?
116 |
117 | An inflexible approach is more interpretable and requires fewer observations, but can be biased.
118 | A flexible approach can overfit (high variance). Where $n$ is large or the relationship is non-linear,
119 | a flexible approach will be preferred; with limited data, or when inference and interpretability matter, a less flexible approach is preferred.
120 |
121 | ### Question 6
122 |
123 | > Describe the differences between a parametric and a non-parametric statistical
124 | > learning approach. What are the advantages of a parametric approach to
125 | > regression or classification (as opposed to a non-parametric approach)? What
126 | > are its disadvantages?
127 |
128 | A parametric approach assumes a functional form and estimates a fixed set of
129 | model parameters. Parametric models can be more interpretable, as there is an
130 | explicit model for how the data are generated. The disadvantage is that the
131 | assumed model might not reflect reality: if it is too far from the truth,
132 | estimates will be poor, while more flexible parametric models require more
133 | parameters and can overfit. Non-parametric approaches do not assume a functional
134 | form, so a large number of observations may be needed to obtain accurate estimates.
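
As a brief illustration (a sketch on simulated data, not part of the original answer), we can contrast a parametric fit (a linear model) with a non-parametric fit (loess) when the true relationship is non-linear:

```{r}
# Parametric (lm: assumes a linear form) vs non-parametric (loess: no fixed form)
set.seed(1)
x <- runif(100, 0, 10)
y <- sin(x) + rnorm(100, sd = 0.3)
plot(x, y, col = "grey", pch = 20)
ord <- order(x)
lines(x[ord], predict(lm(y ~ x))[ord], col = "red")      # biased: wrong form
lines(x[ord], predict(loess(y ~ x))[ord], col = "blue")  # tracks the pattern
legend("topright", c("lm", "loess"), lty = 1, col = c("red", "blue"))
```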
135 |
136 | ### Question 7
137 |
138 | > The table below provides a training data set containing six observations,
139 | > three predictors, and one qualitative response variable.
140 | >
141 | > | Obs. | $X_1$ | $X_2$ | $X_3$ | $Y$ |
142 | > |------|-------|-------|-------|-------|
143 | > | 1 | 0 | 3 | 0 | Red |
144 | > | 2 | 2 | 0 | 0 | Red |
145 | > | 3 | 0 | 1 | 3 | Red |
146 | > | 4 | 0 | 1 | 2 | Green |
147 | > | 5 | -1 | 0 | 1 | Green |
148 | > | 6 | 1 | 1 | 1 | Red |
149 | >
150 | > Suppose we wish to use this data set to make a prediction for $Y$ when
151 | > $X_1 = X_2 = X_3 = 0$ using $K$-nearest neighbors.
152 | >
153 | > a. Compute the Euclidean distance between each observation and the test
154 | > point, $X_1 = X_2 = X_3 = 0$.
155 |
156 | ```{r}
157 | dat <- data.frame(
158 | "x1" = c(0, 2, 0, 0, -1, 1),
159 | "x2" = c(3, 0, 1, 1, 0, 1),
160 | "x3" = c(0, 0, 3, 2, 1, 1),
161 | "y" = c("Red", "Red", "Red", "Green", "Green", "Red")
162 | )
163 |
164 | # Euclidean distance between points and c(0, 0, 0)
165 | dist <- sqrt(dat[["x1"]]^2 + dat[["x2"]]^2 + dat[["x3"]]^2)
166 | signif(dist, 3)
167 | ```
168 |
169 | > b. What is our prediction with $K = 1$? Why?
170 |
171 | ```{r}
172 | knn <- function(k) {
173 | names(which.max(table(dat[["y"]][order(dist)[1:k]])))
174 | }
175 | knn(1)
176 | ```
177 |
178 | Green (based on data point 5 only)
179 |
180 | > c. What is our prediction with $K = 3$? Why?
181 |
182 | ```{r}
183 | knn(3)
184 | ```
185 |
186 | Red (based on data points 2, 5, 6)
187 |
188 | > d. If the Bayes decision boundary in this problem is highly non-linear, then
189 | > would we expect the best value for $K$ to be large or small? Why?
190 |
191 | Small. A large $K$ averages over many neighbours, which smooths the decision boundary towards linearity, so a small $K$ is needed to capture a highly non-linear Bayes boundary; see the sketch below.
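
To illustrate (a sketch on simulated two-class data; the `class` package is already listed in the DESCRIPTION), the KNN decision boundary becomes smoother as $K$ increases:

```{r}
# KNN decision regions for a circular class boundary with K = 1 and K = 50
library(class)
set.seed(1)
train_x <- matrix(rnorm(400), ncol = 2)
train_y <- factor(ifelse(rowSums(train_x^2) > 1.5, "red", "green"))
grid <- expand.grid(x1 = seq(-3, 3, 0.1), x2 = seq(-3, 3, 0.1))
par(mfrow = c(1, 2))
for (k in c(1, 50)) {
  pred <- knn(train_x, grid, train_y, k = k)
  plot(grid, col = ifelse(pred == "red", "mistyrose", "palegreen"),
    pch = 15, cex = 0.4, main = paste("K =", k))
  points(train_x, col = as.character(train_y), pch = 20)
}
```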
192 |
193 | ## Applied
194 |
195 | ### Question 8
196 |
197 | > This exercise relates to the `College` data set, which can be found in
198 | > the file `College.csv`. It contains a number of variables for 777 different
199 | > universities and colleges in the US. The variables are
200 | >
201 | > * `Private` : Public/private indicator
202 | > * `Apps` : Number of applications received
203 | > * `Accept` : Number of applicants accepted
204 | > * `Enroll` : Number of new students enrolled
205 | > * `Top10perc` : New students from top 10% of high school class
206 | > * `Top25perc` : New students from top 25% of high school class
207 | > * `F.Undergrad` : Number of full-time undergraduates
208 | > * `P.Undergrad` : Number of part-time undergraduates
209 | > * `Outstate` : Out-of-state tuition
210 | > * `Room.Board` : Room and board costs
211 | > * `Books` : Estimated book costs
212 | > * `Personal` : Estimated personal spending
213 | > * `PhD` : Percent of faculty with Ph.D.'s
214 | > * `Terminal` : Percent of faculty with terminal degree
215 | > * `S.F.Ratio` : Student/faculty ratio
216 | > * `perc.alumni` : Percent of alumni who donate
217 | > * `Expend` : Instructional expenditure per student
218 | > * `Grad.Rate` : Graduation rate
219 | >
220 | > Before reading the data into `R`, it can be viewed in Excel or a text
221 | > editor.
222 | >
223 | > a. Use the `read.csv()` function to read the data into `R`. Call the loaded
224 | > data `college`. Make sure that you have the directory set to the correct
225 | > location for the data.
226 |
227 | ```{r}
228 | college <- read.csv("data/College.csv")
229 | ```
230 |
231 | > b. Look at the data using the `View()` function. You should notice that the
232 | > first column is just the name of each university. We don't really want `R`
233 | > to treat this as data. However, it may be handy to have these names for
234 | > later. Try the following commands:
235 | >
236 | > ```r
237 | > rownames(college) <- college[, 1]
238 | > View(college)
239 | > ```
240 | >
241 | > You should see that there is now a `row.names` column with the name of
242 | > each university recorded. This means that R has given each row a name
243 | > corresponding to the appropriate university. `R` will not try to perform
244 | > calculations on the row names. However, we still need to eliminate the
245 | > first column in the data where the names are stored. Try
246 | >
247 | > ```r
248 | > college <- college [, -1]
249 | > View(college)
250 | > ```
251 | >
252 | > Now you should see that the first data column is `Private`. Note that
253 | > another column labeled `row.names` now appears before the `Private` column.
254 | > However, this is not a data column but rather the name that R is giving to
255 | > each row.
256 |
257 | ```{r}
258 | rownames(college) <- college[, 1]
259 | college <- college[, -1]
260 | ```
261 |
262 | > c.
263 | > i. Use the `summary()` function to produce a numerical summary of the
264 | > variables in the data set.
265 | > ii. Use the `pairs()` function to produce a scatterplot matrix of the
266 | > first ten columns or variables of the data. Recall that you can
267 | > reference the first ten columns of a matrix A using `A[,1:10]`.
268 | > iii. Use the `plot()` function to produce side-by-side boxplots of
269 | > `Outstate` versus `Private`.
270 | > iv. Create a new qualitative variable, called `Elite`, by _binning_ the
271 | > `Top10perc` variable. We are going to divide universities into two
272 | > groups based on whether or not the proportion of students coming from
273 | > the top 10% of their high school classes exceeds 50%.
274 | >
275 | > ```r
276 | > > Elite <- rep("No", nrow(college))
277 | > > Elite[college$Top10perc > 50] <- "Yes"
278 | > > Elite <- as.factor(Elite)
279 | > > college <- data.frame(college, Elite)
280 | > ```
281 | >
282 | > Use the `summary()` function to see how many elite universities there
283 | > are. Now use the `plot()` function to produce side-by-side boxplots of
284 | > `Outstate` versus `Elite`.
285 | > v. Use the `hist()` function to produce some histograms with differing
286 | > numbers of bins for a few of the quantitative variables. You may find
287 | > the command `par(mfrow=c(2,2))` useful: it will divide the print
288 | > window into four regions so that four plots can be made
289 | > simultaneously. Modifying the arguments to this function will divide
290 | > the screen in other ways.
291 | > vi. Continue exploring the data, and provide a brief summary of what you
292 | > discover.
293 |
294 | ```{r}
295 | summary(college)
296 |
297 | college$Private <- college$Private == "Yes"
298 | pairs(college[, 1:10], cex = 0.2)
299 | plot(college$Outstate ~ factor(college$Private), xlab = "Private", ylab = "Outstate")
300 |
301 | college$Elite <- factor(ifelse(college$Top10perc > 50, "Yes", "No"))
302 | summary(college$Elite)
303 | plot(college$Outstate ~ college$Elite, xlab = "Elite", ylab = "Outstate")
304 |
305 | par(mfrow = c(2, 2))
306 | for (n in c(5, 10, 20, 50)) {
307 | hist(college$Enroll, breaks = n, main = paste("n =", n), xlab = "Enroll")
308 | }
309 |
310 | chisq.test(college$Private, college$Elite)
311 | ```
312 |
313 | Whether a college is private and whether it is elite are not independent (the chi-squared test above rejects independence).
314 |
315 | ### Question 9
316 |
317 | > This exercise involves the Auto data set studied in the lab. Make sure
318 | > that the missing values have been removed from the data.
319 |
320 | ```{r}
321 | x <- read.table("data/Auto.data", header = TRUE, na.strings = "?")
322 | x <- na.omit(x)
323 | ```
324 |
325 | > a. Which of the predictors are quantitative, and which are qualitative?
326 |
327 | ```{r}
328 | sapply(x, class)
329 | numeric <- which(sapply(x, class) == "numeric")
330 | names(numeric)
331 | ```
332 |
333 | > b. What is the range of each quantitative predictor? You can answer this using
334 | > the `range()` function.
335 |
336 | ```{r}
337 | sapply(x[, numeric], function(x) diff(range(x)))
338 | ```
339 |
340 | > c. What is the mean and standard deviation of each quantitative predictor?
341 |
342 | ```{r}
343 | library(tidyverse)
344 | library(knitr)
345 |
346 | x[, numeric] |>
347 | pivot_longer(everything()) |>
348 | group_by(name) |>
349 | summarise(
350 | Mean = mean(value),
351 | SD = sd(value)
352 | ) |>
353 | kable()
354 | ```
355 |
356 | > d. Now remove the 10th through 85th observations. What is the range, mean, and
357 | > standard deviation of each predictor in the subset of the data that
358 | > remains?
359 |
360 | ```{r}
361 | x[-(10:85), numeric] |>
362 | pivot_longer(everything()) |>
363 | group_by(name) |>
364 | summarise(
365 | Range = diff(range(value)),
366 | Mean = mean(value),
367 | SD = sd(value)
368 | ) |>
369 | kable()
370 | ```
371 |
372 | > e. Using the full data set, investigate the predictors graphically, using
373 | > scatterplots or other tools of your choice. Create some plots highlighting
374 | > the relationships among the predictors. Comment on your findings.
375 |
376 | ```{r}
377 | pairs(x[, numeric], cex = 0.2)
378 | cor(x[, numeric]) |>
379 | kable()
380 |
381 | heatmap(cor(x[, numeric]), cexRow = 1.1, cexCol = 1.1, margins = c(8, 8))
382 | ```
383 |
384 | Many of the variables appear to be highly (positively or negatively) correlated
385 | with some relationships being non-linear.
386 |
387 | > f. Suppose that we wish to predict gas mileage (`mpg`) on the basis of the
388 | > other variables. Do your plots suggest that any of the other variables
389 | > might be useful in predicting `mpg`? Justify your answer.
390 |
391 | Yes: several of the other variables are strongly correlated with `mpg`. However,
392 | horsepower, weight and displacement are highly related to each other, so they carry overlapping information.
393 |
394 | ### Question 10
395 |
396 | > This exercise involves the `Boston` housing data set.
397 | >
398 | > a. To begin, load in the `Boston` data set. The `Boston` data set is part of
399 | > the `ISLR2` library in R.
400 | > ```r
401 | > > library(ISLR2)
402 | > ```
403 | > Now the data set is contained in the object `Boston`.
404 | > ```r
405 | > > Boston
406 | > ```
407 | > Read about the data set:
408 | > ```r
409 | > > ?Boston
410 | > ```
411 | > How many rows are in this data set? How many columns? What do the rows and
412 | > columns represent?
413 |
414 | ```{r}
415 | library(ISLR2)
416 | dim(Boston)
417 | ```
418 |
419 | > b. Make some pairwise scatterplots of the predictors (columns) in this data
420 | > set. Describe your findings.
421 |
422 | ```{r, message = FALSE, warning = FALSE}
423 | library(ggplot2)
424 | library(tidyverse)
425 | ```
426 |
427 | ```{r}
428 | ggplot(Boston, aes(nox, rm)) +
429 | geom_point()
430 | ggplot(Boston, aes(ptratio, rm)) +
431 | geom_point()
432 | heatmap(cor(Boston, method = "spearman"), cexRow = 1.1, cexCol = 1.1)
433 | ```
434 |
435 | > c. Are any of the predictors associated with per capita crime rate? If so,
436 | > explain the relationship.
437 |
438 | Yes. Most of the other variables show some association with `crim`: for example, crime rates tend to be higher in tracts with high `rad` and `tax`, and lower where `medv` and `dis` are high.
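
One quick way to see this (a small check computed from the data rather than quoted from the book) is to rank the correlations with `crim`:

```{r}
# Correlation of every variable with per-capita crime rate
sort(cor(Boston)[, "crim"], decreasing = TRUE)
```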
439 |
440 | > d. Do any of the census tracts of Boston appear to have particularly high
441 | > crime rates? Tax rates? Pupil-teacher ratios? Comment on the range of each
442 | > predictor.
443 |
444 | ```{r}
445 | Boston |>
446 | pivot_longer(cols = 1:13) |>
447 | filter(name %in% c("crim", "tax", "ptratio")) |>
448 | ggplot(aes(value)) +
449 | geom_histogram(bins = 20) +
450 | facet_wrap(~name, scales = "free", ncol = 1)
451 | ```
452 |
453 | Yes, particularly crime and tax rates.
454 |
455 | > e. How many of the census tracts in this data set bound the Charles river?
456 |
457 | ```{r}
458 | sum(Boston$chas)
459 | ```
460 |
461 | > f. What is the median pupil-teacher ratio among the towns in this data set?
462 |
463 | ```{r}
464 | median(Boston$ptratio)
465 | ```
466 |
467 | > g. Which census tract of Boston has lowest median value of owner-occupied
468 | > homes? What are the values of the other predictors for that census tract,
469 | > and how do those values compare to the overall ranges for those predictors?
470 | > Comment on your findings.
471 |
472 | ```{r}
473 | Boston[Boston$medv == min(Boston$medv), ] |>
474 | kable()
475 |
476 | sapply(Boston, quantile) |>
477 | kable()
478 | ```
479 |
480 | > h. In this data set, how many of the census tract average more than seven
481 | > rooms per dwelling? More than eight rooms per dwelling? Comment on the
482 | > census tracts that average more than eight rooms per dwelling.
483 |
484 | ```{r}
485 | sum(Boston$rm > 7)
486 | sum(Boston$rm > 8)
487 | ```
488 |
489 | Let's compare median statistics for those census tracts with more than eight
490 | rooms per dwelling on average, with the statistics for those with fewer.
491 |
492 | ```{r}
493 | Boston |>
494 | mutate(
495 | `log(crim)` = log(crim),
496 | `log(zn)` = log(zn)
497 | ) |>
498 | select(-c(crim, zn)) |>
499 | pivot_longer(!rm) |>
500 | mutate(">8 rooms" = rm > 8) |>
501 | ggplot(aes(`>8 rooms`, value)) +
502 | geom_boxplot() +
503 | facet_wrap(~name, scales = "free")
504 | ```
505 |
506 | Census tracts with big average properties (more than eight rooms per dwelling)
507 | have higher median value (`medv`), a lower proportion of non-retail
508 | business acres (`indus`), a lower pupil-teacher ratio (`ptratio`), a lower
509 | status of the population (`lstat`) among other differences.
510 |
--------------------------------------------------------------------------------
/11-survival-analysis-and-censored-data.Rmd:
--------------------------------------------------------------------------------
1 | # Survival Analysis and Censored Data
2 |
3 | ## Conceptual
4 |
5 | ### Question 1
6 |
7 | > For each example, state whether or not the censoring mechanism is independent.
8 | > Justify your answer.
9 | >
10 | > a. In a study of disease relapse, due to a careless research scientist, all
11 | > patients whose phone numbers begin with the number "2" are lost to follow up.
12 |
13 | Independent. There's no reason to think disease relapse should be related to
14 | the first digit of a phone number.
15 |
16 | > b. In a study of longevity, a formatting error causes all patient ages that
17 | > exceed 99 years to be lost (i.e. we know that those patients are more than 99
18 | > years old, but we do not know their exact ages).
19 |
20 | Not independent. Older patients are more likely to see an event than younger patients.
21 |
22 | > c. Hospital A conducts a study of longevity. However, very sick patients tend
23 | > to be transferred to Hospital B, and are lost to follow up.
24 |
25 | Not independent. Sick patients are more likely to see an event than healthy patients.
26 |
27 | > d. In a study of unemployment duration, the people who find work earlier are
28 | > less motivated to stay in touch with study investigators, and therefore are
29 | > more likely to be lost to follow up.
30 |
31 | Not independent. More employable individuals are more likely to see an event.
32 |
33 | > e. In a study of pregnancy duration, women who deliver their babies pre-term
34 | > are more likely to do so away from their usual hospital, and thus are more
35 | > likely to be censored, relative to women who deliver full-term babies.
36 |
37 | Not independent. Delivery away from hospital will be associated with pregnancy
38 | duration.
39 |
40 | > f. A researcher wishes to model the number of years of education of the
41 | > residents of a small town. Residents who enroll in college out of town are
42 | > more likely to be lost to follow up, and are also more likely to attend
43 | > graduate school, relative to those who attend college in town.
44 |
45 | Not independent. Years of education will be associated with enrolling in out
46 | of town colleges.
47 |
48 | > g. Researchers conduct a study of disease-free survival (i.e. time until
49 | > disease relapse following treatment). Patients who have not relapsed within
50 | > five years are considered to be cured, and thus their survival time is
51 | > censored at five years.
52 |
53 | In other words, we assume all events happen within five years, so
54 | censoring at five years is equivalent to not censoring at all, and
55 | the censoring is therefore independent.
56 |
57 | > h. We wish to model the failure time for some electrical component. This
58 | > component can be manufactured in Iowa or in Pittsburgh, with no difference in
59 | > quality. The Iowa factory opened five years ago, and so components
60 | > manufactured in Iowa are censored at five years. The Pittsburgh factory opened
61 | > two years ago, so those components are censored at two years.
62 |
63 | If there is no difference in quality then location and therefore censoring is
64 | independent of failure time.
65 |
66 | > i. We wish to model the failure time of an electrical component made in two
67 | > different factories, one of which opened before the other. We have reason to
68 | > believe that the components manufactured in the factory that opened earlier
69 | > are of higher quality.
70 |
71 | In this case, the difference in opening times of the two locations will mean
72 | that any difference in quality between locations will be associated with
73 | censoring, so censoring is not independent.
74 |
75 | ### Question 2
76 |
77 | > We conduct a study with $n = 4$ participants who have just purchased cell
78 | > phones, in order to model the time until phone replacement. The first
79 | > participant replaces her phone after 1.2 years. The second participant still
80 | > has not replaced her phone at the end of the two-year study period. The third
81 | > participant changes her phone number and is lost to follow up (but has not yet
82 | > replaced her phone) 1.5 years into the study. The fourth participant replaces
83 | > her phone after 0.2 years.
84 | >
85 | > For each of the four participants ($i = 1,..., 4$), answer the following
86 | > questions using the notation introduced in Section 11.1:
87 | >
88 | > a. Is the participant's cell phone replacement time censored?
89 |
90 | No, Yes, Yes and No. Censoring occurs when we do not know if or when the phone
91 | was replaced.
92 |
93 | > b. Is the value of $c_i$ known, and if so, then what is it?
94 |
95 | $c_i$ is the censoring time. For the four participants these are: NA, 2, 1.5 and NA.
96 |
97 | > c. Is the value of $t_i$ known, and if so, then what is it?
98 |
99 | $t_i$ is time to event. For the four participants these are: 1.2, NA, NA and
100 | 0.2.
101 |
102 | > d. Is the value of $y_i$ known, and if so, then what is it?
103 |
104 | $y_i$ is the observed time. For the four participants these are: 1.2, 2, 1.5 and
105 | 0.2.
106 |
107 | > e. Is the value of $\delta_i$ known, and if so, then what is it?
108 |
109 | $\delta_i$ is an indicator for censoring. The nomenclature introduced here
110 | defines this to be 1 if we observe the true "survival" time and 0 if we observe
111 | the censored time. Therefore, for these participants, the values are: 1, 0, 0
112 | and 1.
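
As a small illustration (using the `survival` package, which is also loaded in the next question), the observed times and censoring indicators can be combined into a `Surv` object, where `+` marks a censored observation:

```{r}
library(survival)
# y_i with delta_i: a "+" indicates a censored observation
Surv(time = c(1.2, 2, 1.5, 0.2), event = c(1, 0, 0, 1))
```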
113 |
114 | ### Question 3
115 |
116 | > For the example in Exercise 2, report the values of $K$, $d_1,...,d_K$,
117 | > $r_1,...,r_K$, and $q_1,...,q_K$, where this notation was defined in Section
118 | > 11.3.
119 |
120 | * $K$ is the number of unique deaths, which is 2.
121 | * $d_k$ represents the unique death times, which are: 0.2, 1.2.
122 | * $r_k$ denotes the number of patients alive and in the study just before $d_k$.
123 | Note the first event is for patient 4, then patient 1, then patient 3 is
124 | censored and finally the study ends with patient 2 still enrolled. Therefore
125 | $r_k$ takes the values 4 and 3.
126 | * $q_k$ denotes the number of patients who died at time $d_k$, therefore this
127 | takes values: 1, 1.
128 |
129 | We can check by using the `survival` package.
130 |
131 | ```{r}
132 | library(survival)
133 | x <- Surv(c(1.2, 2, 1.5, 0.2), event = c(1, 0, 0, 1))
134 | summary(survfit(x ~ 1))
135 | ```
136 |
137 | ### Question 4
138 |
139 | > This problem makes use of the Kaplan-Meier survival curve displayed in Figure
140 | > 11.9. The raw data that went into plotting this survival curve is given in
141 | > Table 11.4. The covariate column of that table is not needed for this problem.
142 | >
143 | > a. What is the estimated probability of survival past 50 days?
144 |
145 | There are 2 events that happen before 50 days. The numbers at
146 | risk $r_k$ are 5 and 4 (one observation was censored early on), so the survival
147 | probability is $\frac{4}{5} \times \frac{3}{4} = 0.6$.
148 |
149 | Equivalently, we can use the survival package.
150 |
151 | ```{r, message = FALSE, warning = FALSE}
152 | library(tidyverse)
153 | ```
154 |
155 | ```{r}
156 | table_data <- tribble(
157 | ~Y, ~D, ~X,
158 | 26.5, 1, 0.1,
159 | 37.2, 1, 11,
160 | 57.3, 1, -0.3,
161 | 90.8, 0, 2.8,
162 | 20.2, 0, 1.8,
163 | 89.8, 0, 0.4
164 | )
165 | x <- Surv(table_data$Y, table_data$D)
166 | summary(survfit(x ~ 1))
167 | ```
168 |
169 | > b. Write out an analytical expression for the estimated survival function. For
170 | > instance, your answer might be something along the lines of
171 | >
172 | > $$
173 | > \hat{S}(t) = \begin{cases}
174 | > 0.8 & \text{if } t < 31\\
175 | > 0.5 & \text{if } 31 \le t < 77\\
176 | > 0.22 & \text{if } 77 \le t
177 | > \end{cases}
178 | > $$
179 | >
180 | > (The previous equation is for illustration only: it is not the correct
181 | > answer!)
182 |
183 | $$
184 | \hat{S}(t) = \begin{cases}
185 | 1 & \text{if } t < 26.5 \\
186 | 0.8 & \text{if } 26.5 \le t < 37.2 \\
187 | 0.6 & \text{if } 37.2 \le t < 57.3 \\
188 | 0.4 & \text{if } 57.3 \le t
189 | \end{cases}
190 | $$
191 |
192 |
193 | ### Question 5
194 |
195 | > Sketch the survival function given by the equation
196 | >
197 | > $$
198 | > \hat{S}(t) = \begin{cases}
199 | > 0.8, & \text{if } t < 31\\
200 | > 0.5, & \text{if } 31 \le t < 77\\
201 | > 0.22 & \text{if } 77 \le t
202 | > \end{cases}
203 | > $$
204 | >
205 | > Your answer should look something like Figure 11.9.
206 |
207 | We can draw this plot, or even engineer data that will generate the required
208 | plot...
209 |
210 | ```{r}
211 | plot(NULL,
212 | xlim = c(0, 100),
213 | ylim = c(0, 1),
214 | ylab = "Estimated Probability of Survival",
215 | xlab = "Time in Days"
216 | )
217 | lines(
218 | c(0, 31, 31, 77, 77, 100),
219 | c(0.8, 0.8, 0.5, 0.5, 0.22, 0.22)
220 | )
221 | ```
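    | 
    | Alternatively, as a minimal sketch, base R's `stepfun()` can draw the same step
    | function directly (the values below are simply those given in the equation
    | above):
    | 
    | ```{r}
    | s <- stepfun(c(31, 77), c(0.8, 0.5, 0.22))
    | plot(s, do.points = FALSE, xlim = c(0, 100), ylim = c(0, 1),
    |   xlab = "Time in Days", ylab = "Estimated Probability of Survival", main = "")
    | ```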
222 |
223 | ### Question 6
224 |
225 | > This problem makes use of the data displayed in Figure 11.1. In completing
226 | > this problem, you can refer to the observation times as $y_1,...,y_4$. The
227 | > ordering of these observation times can be seen from Figure 11.1; their exact
228 | > values are not required.
229 | >
230 | > a. Report the values of $\delta_1,...,\delta_4$, $K$, $d_1,...,d_K$,
231 | > $r_1,...,r_K$, and $q_1,...,q_K$. The relevant notation is defined in Sections
232 | > 11.1 and 11.3.
233 |
234 | * $\delta$ values are: 1, 0, 1, 0.
235 | * $K$ is 2.
236 | * $d$ values are $y_3$ and $y_1$.
237 | * $r$ values are 4 and 2.
238 | * $q$ values are 1 and 1.
239 |
240 | > b. Sketch the Kaplan-Meier survival curve corresponding to this data set. (You
241 | > do not need to use any software to do this---you can sketch it by hand using
242 | > the results obtained in (a).)
243 |
244 | ```{r}
245 | plot(NULL,
246 | xlim = c(0, 350),
247 | ylim = c(0, 1),
248 | ylab = "Estimated Probability of Survival",
249 | xlab = "Time in Days"
250 | )
251 | lines(
252 | c(0, 150, 150, 300, 300, 350),
253 | c(1, 1, 0.75, 0.75, 0.375, 0.375)
254 | )
255 | ```
256 |
257 | We can check these results with the `survival` package, using approximate
    | observation times read from Figure 11.1:
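    | 
    | ```{r}
    | # observation times are approximate readings from Figure 11.1
    | x <- Surv(c(300, 350, 150, 250), c(1, 0, 1, 0))
    | summary(survfit(x ~ 1))
    | ```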
258 |
259 | > c. Based on the survival curve estimated in (b), what is the probability that
260 | > the event occurs within 200 days? What is the probability that the event does
261 | > not occur within 310 days?
262 |
263 | The probability that the event occurs within 200 days is
    | $1 - S(200) = 1 - 0.75 = 0.25$, and the probability that the event does _not_
    | occur within 310 days is $S(310) = 0.375$.
264 |
265 | > d. Write out an expression for the estimated survival curve from (b).
266 |
267 | $$
268 | \hat{S}(t) = \begin{cases}
269 | 1 & \text{if } t < y_3 \\
270 | 0.75 & \text{if } y_3 \le t < y_1 \\
271 | 0.375 & \text{if } y_1 \le t
272 | \end{cases}
273 | $$
274 |
275 | ### Question 7
276 |
277 | > In this problem, we will derive (11.5) and (11.6), which are needed for the
278 | > construction of the log-rank test statistic (11.8). Recall the notation in
279 | > Table 11.1.
280 | >
281 | > a. Assume that there is no difference between the survival functions of the
282 | > two groups. Then we can think of $q_{1k}$ as the number of failures if we draw
283 | > $r_{1k}$ observations, without replacement, from a risk set of $r_k$
284 | > observations that contains a total of $q_k$ failures. Argue that $q_{1k}$
285 | > follows a hypergeometric distribution. Write the parameters of this
286 | > distribution in terms of $r_{1k}$, $r_k$, and $q_k$.
287 |
288 | A hypergeometric distribution models sampling without replacement from a
289 | finite pool where each draw is either a success or a failure. This fits the
290 | situation here, where we have a finite number of observations in the risk set.
291 | 
292 | The hypergeometric distribution is parameterized as $k$ successes in $n$ draws, without replacement, from a population of size $N$ that contains $K$ successes.
293 | 
294 | Mapping to our situation, $q_{1k}$ corresponds to $k$, $r_{1k}$ to $n$, $r_k$ to $N$, and $q_k$ to $K$.
295 |
296 | > b. Given your previous answer, and the properties of the hypergeometric
297 | > distribution, what are the mean and variance of $q_{1k}$? Compare your answer
298 | > to (11.5) and (11.6).
299 |
300 | With the above parameterization, the mean, $nK/N$, is $r_{1k} q_k/r_k$.
301 | The variance, $\frac{nK}{N} \frac{N-K}{N} \frac{N-n}{N-1}$, is
302 |
303 | $$
304 | r_{1k} \frac{q_k}{r_k} \frac{r_k-q_k}{r_k} \frac{r_k - r_{1k}}{r_k - 1}
305 | $$
306 |
307 | These are equivalent to 11.5 and 11.6.
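    | 
    | As a quick sanity check, we can compare these formulas against the moments of
    | the hypergeometric distribution in R for some made-up values of $r_k$,
    | $r_{1k}$ and $q_k$:
    | 
    | ```{r}
    | # hypothetical values: r_k = 10 at risk, q_k = 3 failures, r_1k = 4 in group 1
    | r_k <- 10; q_k <- 3; r_1k <- 4
    | q1 <- 0:min(q_k, r_1k)
    | p <- dhyper(q1, m = q_k, n = r_k - q_k, k = r_1k)
    | c(sum(q1 * p), r_1k * q_k / r_k)  # distribution mean vs. (11.5)
    | c(sum((q1 - sum(q1 * p))^2 * p),  # distribution variance vs. (11.6)
    |   r_1k * (q_k / r_k) * ((r_k - q_k) / r_k) * ((r_k - r_1k) / (r_k - 1)))
    | ```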
308 |
309 | ### Question 8
310 |
311 | > Recall that the survival function $S(t)$, the hazard function $h(t)$, and the
312 | > density function $f(t)$ are defined in (11.2), (11.9), and (11.11),
313 | > respectively. Furthermore, define $F(t) = 1 - S(t)$. Show that the following
314 | > relationships hold:
315 | >
316 | > $$
317 | > f(t) = dF(t)/dt \\
318 | > S(t) = \exp\left(-\int_0^t h(u)du\right)
319 | > $$
320 |
321 | If $F(t) = 1 - S(t)$, then $F(t)$ is the *cumulative distribution function*
322 | (cdf) for $t$.
323 | 
324 | For a continuous distribution, the cdf can be expressed as an integral of the
325 | *probability density function* (pdf) up to the value $t$, i.e.
326 | $F(t) = \int_{-\infty}^{t} f(x) \, dx$. Equivalently, the derivative of the cdf
327 | is the pdf: $f(t) = \frac{d F(t)}{dt}$, which is the first identity.
328 |
329 | Then, $h(t) = \frac{f(t)}{S(t)} = \frac{dF(t)/dt}{S(t)} = \frac{-dS(t)/dt}{S(t)}$,
330 | which, by the chain rule, can be rewritten as $h(t) = -\frac{d}{dt}\log{S(t)}$.
331 | Integrating both sides from $0$ to $t$ gives $\int_0^t h(u) du = -\log S(t)$
332 | (since $S(0) = 1$), and exponentiating yields $S(t) = \exp\left(-\int_0^t h(u) du\right)$.
333 |
334 | ### Question 9
335 |
336 | > In this exercise, we will explore the consequences of assuming that the
337 | > survival times follow an exponential distribution.
338 | >
339 | > a. Suppose that a survival time follows an $Exp(\lambda)$ distribution, so
340 | > that its density function is $f(t) = \lambda\exp(-\lambda t)$. Using the
341 | > relationships provided in Exercise 8, show that $S(t) = \exp(-\lambda t)$.
342 |
343 | Using the relationships from Exercise 8, the cdf is
    | $F(t) = \int_0^t \lambda\exp(-\lambda x) dx = 1 - \exp(-\lambda t)$, and
344 | $S(t) = 1 - F(t)$.
345 | 
346 | Hence, $S(t) = \exp(-\lambda t)$.
347 |
348 | > b. Now suppose that each of $n$ independent survival times follows an
349 | > $\exp(\lambda)$ distribution. Write out an expression for the likelihood
350 | > function (11.13).
351 |
352 | The reference to (11.13) gives us the following formula:
353 |
354 | $$
355 | L = \prod_{i=1}^{n} h(y_i)^{\delta_i} S(y_i)
356 | $$
357 |
358 | (11.10) also gives us
359 |
360 | $$
361 | h(t) = \frac{f(t)}{S(t)}
362 | $$
363 |
364 | Plugging in the expressions from part (a), we get
365 |
366 | \begin{align*}
367 | h(t) &= \frac{\lambda \exp(- \lambda t)}{\exp(- \lambda t)} \\
368 | &= \lambda
369 | \end{align*}
370 |
371 | Substituting into (11.13), we get the following likelihood expression:
372 |
373 | $$
374 | \ell = \prod_i \lambda^{\delta_i} e^{- \lambda y_i}
375 | $$
376 |
377 | > c. Show that the maximum likelihood estimator for $\lambda$ is
378 | > $$
379 | > \hat\lambda = \sum_{i=1}^n \delta_i / \sum_{i=1}^n y_i.
380 | > $$
381 |
382 | Take the log likelihood.
383 |
384 | \begin{align*}
385 | \log \ell &= \sum_i \log \left( \lambda^{\delta_i} e^{- \lambda y_i} \right) \\
386 | &= \sum_i{\delta_i\log\lambda - \lambda y_i \log e} \\
387 | &= \sum_i{\delta_i\log\lambda - \lambda y_i} \\
388 | &= \log\lambda\sum_i{\delta_i} - \lambda\sum_i{y_i}
389 | \end{align*}
390 |
391 | Differentiating this expression with respect to $\lambda$ we get:
392 |
393 | $$
394 | \frac{d \log \ell}{d \lambda} = \frac{\sum_i{\delta_i}}{\lambda} - \sum_i{y_i}
395 | $$
396 |
397 | This function is maximised when its gradient is 0. Setting the derivative to
    | zero and solving for $\lambda$ gives the MLE
398 | $\hat\lambda = \sum_{i=1}^n \delta_i / \sum_{i=1}^n y_i$.
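    | 
    | As a sanity check, we can simulate exponential survival times with independent
    | censoring (a hypothetical example with a true rate of $\lambda = 0.5$; the
    | censoring rate is arbitrary) and confirm that this estimator recovers
    | $\lambda$:
    | 
    | ```{r}
    | set.seed(42)
    | n <- 2000
    | true_time <- rexp(n, rate = 0.5)
    | cens_time <- rexp(n, rate = 0.2)
    | y <- pmin(true_time, cens_time)              # observed time
    | delta <- as.numeric(true_time <= cens_time)  # 1 if the death was observed
    | sum(delta) / sum(y)                          # MLE, should be close to 0.5
    | ```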
399 |
400 | > d. Use your answer to (c) to derive an estimator of the mean survival time.
401 | >
402 | > _Hint: For (d), recall that the mean of an $Exp(\lambda)$ random variable is
403 | > $1/\lambda$._
404 |
405 | The estimated mean survival time is $1/\hat\lambda$, which, given the above, is
406 | $\sum_{i=1}^n y_i / \sum_{i=1}^n \delta_i$. This can be thought of as
407 | the total observation time divided by the total number of deaths.
408 |
409 | ## Applied
410 |
411 | ### Question 10
412 |
413 | > This exercise focuses on the brain tumor data, which is included in the
414 | > `ISLR2` `R` library.
415 | >
416 | > a. Plot the Kaplan-Meier survival curve with ±1 standard error bands, using
417 | > the `survfit()` function in the `survival` package.
418 |
419 | ```{r}
420 | library(ISLR2)
421 | x <- Surv(BrainCancer$time, BrainCancer$status)
422 | plot(survfit(x ~ 1),
423 | xlab = "Months",
424 | ylab = "Estimated Probability of Survival",
425 | col = "steelblue",
426 |   conf.int = 0.67 # a ~67% interval corresponds roughly to ±1 standard error
427 | )
428 | ```
429 |
430 | > b. Draw a bootstrap sample of size $n = 88$ from the pairs ($y_i$,
431 | > $\delta_i$), and compute the resulting Kaplan-Meier survival curve. Repeat
432 | > this process $B = 200$ times. Use the results to obtain an estimate of the
433 | > standard error of the Kaplan-Meier survival curve at each timepoint. Compare
434 | > this to the standard errors obtained in (a).
435 |
436 | ```{r}
437 | plot(survfit(x ~ 1),
438 | xlab = "Months",
439 | ylab = "Estimated Probability of Survival",
440 | col = "steelblue",
441 | conf.int = 0.67
442 | )
443 | fit <- survfit(x ~ 1)
444 | dat <- tibble(time = c(0, fit$time))
445 | for (i in 1:200) {
446 | y <- survfit(sample(x, 88, replace = TRUE) ~ 1)
447 | y <- tibble(time = c(0, y$time), "s{i}" := c(1, y$surv))
448 | dat <- left_join(dat, y, by = "time")
449 | }
450 | res <- fill(dat, starts_with("s")) |>
451 | rowwise() |>
452 | transmute(sd = sd(c_across(starts_with("s"))))
453 | se <- res$sd[2:nrow(res)]
454 | lines(fit$time, fit$surv - se, lty = 2, col = "red")
455 | lines(fit$time, fit$surv + se, lty = 2, col = "red")
456 | ```
457 |
458 | > c. Fit a Cox proportional hazards model that uses all of the predictors to
459 | > predict survival. Summarize the main findings.
460 |
461 | ```{r}
462 | fit <- coxph(Surv(time, status) ~ sex + diagnosis + loc + ki + gtv + stereo, data = BrainCancer)
463 | fit
464 | ```
465 |
466 | `diagnosisHG` and `ki` are highly significant.
467 |
468 | > d. Stratify the data by the value of `ki`. (Since only one observation has
469 | > `ki=40`, you can group that observation together with the observations that
470 | > have `ki=60`.) Plot Kaplan-Meier survival curves for each of the five strata,
471 | > adjusted for the other predictors.
472 |
473 | To adjust for the other predictors, we fit a model that includes them and use
474 | this model to predict new, artificial, data in which `ki` takes each possible
475 | value while the remaining predictors are fixed at their mode (for categorical
476 | variables) or mean (for continuous variables).
477 |
478 | ```{r}
479 | library(ggfortify)
480 |
481 | modaldata <- data.frame(
482 | sex = rep("Female", 5),
483 | diagnosis = rep("Meningioma", 5),
484 | loc = rep("Supratentorial", 5),
485 | ki = c(60, 70, 80, 90, 100),
486 | gtv = rep(mean(BrainCancer$gtv), 5),
487 | stereo = rep("SRT", 5)
488 | )
489 | survplots <- survfit(fit, newdata = modaldata)
490 | plot(survplots, xlab = "Months", ylab = "Survival Probability", col = 2:6)
491 | legend("bottomleft", c("60", "70", "80", "90", "100"), col = 2:6, lty = 1)
492 | ```
493 |
494 | ### Question 11
495 |
496 | > This example makes use of the data in Table 11.4.
497 | >
498 | > a. Create two groups of observations. In Group 1, $X < 2$, whereas in Group 2,
499 | > $X \ge 2$. Plot the Kaplan-Meier survival curves corresponding to the two
500 | > groups. Be sure to label the curves so that it is clear which curve
501 | > corresponds to which group. By eye, does there appear to be a difference
502 | > between the two groups' survival curves?
503 |
504 | ```{r}
505 | x <- split(Surv(table_data$Y, table_data$D), table_data$X < 2)
506 | plot(NULL, xlim = c(0, 100), ylim = c(0, 1), ylab = "Survival Probability")
507 | lines(survfit(x[[1]] ~ 1), conf.int = FALSE, col = 2)
508 | lines(survfit(x[[2]] ~ 1), conf.int = FALSE, col = 3)
509 | legend("bottomleft", c(">= 2", "<2"), col = 2:3, lty = 1)
510 | ```
511 |
512 | There does not appear to be any difference between the curves.
513 |
514 | > b. Fit Cox's proportional hazards model, using the group indicator as a
515 | > covariate. What is the estimated coefficient? Write a sentence providing the
516 | > interpretation of this coefficient, in terms of the hazard or the
517 | > instantaneous probability of the event. Is there evidence that the true
518 | > coefficient value is non-zero?
519 |
520 | ```{r}
521 | fit <- coxph(Surv(Y, D) ~ X < 2, data = table_data)
522 | fit
523 | ```
524 |
525 | The coefficient is $0.3401$. This implies a slightly increased hazard when
526 | $X < 2$, but it is not significantly different from zero ($p = 0.8$).
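    | 
    | Exponentiating the coefficient gives the implied hazard ratio for observations
    | with $X < 2$ relative to those with $X \ge 2$ (roughly 1.4):
    | 
    | ```{r}
    | exp(coef(fit))
    | ```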
527 |
528 | > c. Recall from Section 11.5.2 that in the case of a single binary covariate,
529 | > the log-rank test statistic should be identical to the score statistic for the
530 | > Cox model. Conduct a log-rank test to determine whether there is a difference
531 | > between the survival curves for the two groups. How does the p-value for the
532 | > log-rank test statistic compare to the $p$-value for the score statistic for
533 | > the Cox model from (b)?
534 |
535 | ```{r}
536 | summary(fit)$sctest
537 | survdiff(Surv(Y, D) ~ X < 2, data = table_data)$chisq
538 | ```
539 |
540 | They are identical.
541 |
--------------------------------------------------------------------------------
/05-resampling-methods.Rmd:
--------------------------------------------------------------------------------
1 | # Resampling Methods
2 |
3 | ## Conceptual
4 |
5 | ### Question 1
6 |
7 | > Using basic statistical properties of the variance, as well as single-
8 | > variable calculus, derive (5.6). In other words, prove that $\alpha$ given by
9 | > (5.6) does indeed minimize $Var(\alpha X + (1 - \alpha)Y)$.
10 |
11 | Equation 5.6 is:
12 |
13 | $$
14 | \alpha = \frac{\sigma^2_Y - \sigma_{XY}}{\sigma^2_X + \sigma^2_Y - 2\sigma_{XY}}
15 | $$
16 |
17 | Remember that:
18 |
19 | $$
20 | \mathrm{Var}(aX) = a^2\mathrm{Var}(X), \\
21 | \mathrm{Var}(X + Y) = \mathrm{Var}(X) + \mathrm{Var}(Y) + 2\mathrm{Cov}(X,Y), \\
22 | \mathrm{Cov}(aX, bY) = ab\mathrm{Cov}(X, Y)
23 | $$
24 |
25 | If we define $\sigma^2_X = \mathrm{Var}(X)$, $\sigma^2_Y = \mathrm{Var}(Y)$ and
26 | $\sigma_{XY} = \mathrm{Cov}(X, Y)$
27 |
28 | \begin{align}
29 | Var(\alpha X + (1 - \alpha)Y)
30 | &= \alpha^2\sigma^2_X + (1-\alpha)^2\sigma^2_Y + 2\alpha(1 - \alpha)\sigma_{XY} \\
31 | &= \alpha^2\sigma^2_X + \sigma^2_Y - 2\alpha\sigma^2_Y + \alpha^2\sigma^2_Y +
32 | 2\alpha\sigma_{XY} - 2\alpha^2\sigma_{XY}
33 | \end{align}
34 |
35 | Now we want to find when the rate of change of this function is 0 with respect
36 | to $\alpha$, so we compute the partial derivative, set to 0 and solve.
37 |
38 | $$
39 | \frac{\partial}{\partial{\alpha}} =
40 | 2\alpha\sigma^2_X - 2\sigma^2_Y + 2\alpha\sigma^2_Y + 2\sigma_{XY} - 4\alpha\sigma_{XY} = 0
41 | $$
42 |
43 | Moving $\alpha$ terms to the same side:
44 |
45 | $$
46 | \alpha\sigma^2_X + \alpha\sigma^2_Y - 2\alpha\sigma_{XY} = \sigma^2_Y - \sigma_{XY}
47 | $$
48 |
49 | $$
50 | \alpha = \frac{\sigma^2_Y - \sigma_{XY}}{\sigma^2_X + \sigma^2_Y - 2\sigma_{XY}}
51 | $$
52 |
53 | We should also show that this is a minimum, i.e. that the second partial
54 | derivative with respect to $\alpha$ is $\ge 0$.
55 |
56 | \begin{align}
57 | \frac{\partial^2}{\partial{\alpha^2}}
58 | &= 2\sigma^2_X + 2\sigma^2_Y - 4\sigma_{XY} \\
59 | &= 2(\sigma^2_X + \sigma^2_Y - 2\sigma_{XY}) \\
60 | &= 2\mathrm{Var}(X - Y)
61 | \end{align}
62 |
63 | Since the variance of $X - Y$ is non-negative, this is $\ge 0$, so the
   | stationary point is indeed a minimum.
64 |
65 | ### Question 2
66 |
67 | > We will now derive the probability that a given observation is part of a
68 | > bootstrap sample. Suppose that we obtain a bootstrap sample from a set of n
69 | > observations.
70 | >
71 | > a. What is the probability that the first bootstrap observation is _not_ the
72 | > $j$th observation from the original sample? Justify your answer.
73 |
74 | This is one minus the probability that it _is_ the $j$th observation, i.e.
   | $1 - 1/n$, since each of the $n$ original observations is equally likely to
   | be drawn.
75 |
76 | > b. What is the probability that the second bootstrap observation is _not_ the
77 | > $j$th observation from the original sample?
78 |
79 | Since each bootstrap observation is a random sample, this probability is the
80 | same ($1 - 1/n$).
81 |
82 | > c. Argue that the probability that the $j$th observation is _not_ in the
83 | > bootstrap sample is $(1 - 1/n)^n$.
84 |
85 | For the $j$th observation to not be in the sample, it would have to _not_ be
86 | picked in each of the $n$ draws. Since the draws are independent, the
87 | probability is $(1 - 1/n)^n$.
88 |
89 | > d. When $n = 5$, what is the probability that the $j$th observation is in the
90 | > bootstrap sample?
91 |
92 | ```{r}
93 | n <- 5
94 | 1 - (1 - 1 / n)^n
95 | ```
96 |
97 | $p = 0.67$
98 |
99 | > e. When $n = 100$, what is the probability that the $j$th observation is in
100 | > the bootstrap sample?
101 |
102 | ```{r}
103 | n <- 100
104 | 1 - (1 - 1 / n)^n
105 | ```
106 |
107 | $p = 0.63$
108 |
109 | > f. When $n = 10,000$, what is the probability that the $j$th observation is
110 | > in the bootstrap sample?
111 |
112 | ```{r}
113 | n <- 10000
114 | 1 - (1 - 1 / n)^n
115 | ```
116 |
117 | $p = 0.63$
118 |
119 | > g. Create a plot that displays, for each integer value of $n$ from 1 to
120 | > 100,000, the probability that the $j$th observation is in the bootstrap
121 | > sample. Comment on what you observe.
122 |
123 | ```{r}
124 | x <- sapply(1:100000, function(n) 1 - (1 - 1 / n)^n)
125 | plot(x, log = "x", type = "o")
126 | ```
127 |
128 | The probability rapidly approaches 0.63 with increasing $n$.
129 |
130 | Note that $$e^x = \lim_{n \to \infty} \left(1 + \frac{x}{n}\right)^n,$$ so with
131 | $x = -1$, we can see that our limit is $1 - e^{-1} = 1 - 1/e \approx 0.632$.
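    | 
    | For comparison with the plot above:
    | 
    | ```{r}
    | 1 - exp(-1)
    | ```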
132 |
133 | > h. We will now investigate numerically the probability that a bootstrap
134 | > sample of size $n = 100$ contains the $j$th observation. Here $j = 4$. We
135 | > repeatedly create bootstrap samples, and each time we record whether or not
136 | > the fourth observation is contained in the bootstrap sample.
137 | >
138 | > ```r
139 | > > store <- rep (NA, 10000)
140 | > > for (i in 1:10000) {
141 | > store[i] <- sum(sample(1:100, rep = TRUE) == 4) > 0
142 | > }
143 | > > mean(store)
144 | > ```
145 | >
146 | > Comment on the results obtained.
147 |
148 | ```{r}
149 | store <- replicate(10000, sum(sample(1:100, replace = TRUE) == 4) > 0)
150 | mean(store)
151 | ```
152 |
153 | The probability of including $4$ when resampling numbers $1...100$ is close to
154 | $1 - (1 - 1/100)^{100}$.
155 |
156 | ### Question 3
157 |
158 | > 3. We now review $k$-fold cross-validation.
159 | >
160 | > a. Explain how $k$-fold cross-validation is implemented.
161 |
162 | We divide our data into $k$ (approximately equal) subsets. Then, for each
163 | subset in turn, we train on the other $k - 1$ subsets combined and generate
    | predictions for the held-out subset. The $k$ resulting error estimates are
    | then averaged. A minimal manual sketch is shown below.
164 |
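    | As a sketch of the procedure (using the built-in `mtcars` data and an
    | arbitrary linear model purely for illustration):
    | 
    | ```{r}
    | set.seed(42)
    | k <- 5
    | folds <- sample(rep(1:k, length.out = nrow(mtcars))) # random fold labels
    | cv_errors <- sapply(1:k, function(i) {
    |   fit <- lm(mpg ~ wt + hp, data = mtcars[folds != i, ]) # train on k - 1 folds
    |   pred <- predict(fit, newdata = mtcars[folds == i, ])  # predict held-out fold
    |   mean((mtcars$mpg[folds == i] - pred)^2)               # fold test MSE
    | })
    | mean(cv_errors) # k-fold CV estimate of the test MSE
    | ```
    | 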
165 | > b. What are the advantages and disadvantages of $k$-fold cross-validation
166 | > relative to:
167 | > i. The validation set approach?
168 | > ii. LOOCV?
169 |
170 | When using a validation set, we can only train on a portion of the data as we
171 | must reserve the rest for validation. As a result, the approach tends to
172 | overestimate the test error rate (assuming we then train using the complete
173 | data for future prediction). It is also sensitive to which observations are
174 | included in training vs. validation. It is, however, cheap in terms of
175 | processing time (as we only have to fit one model).
176 |
177 | When using LOOCV, we can train on $n-1$ observations; however, the models we
178 | fit each differ only by the inclusion (and exclusion) of a single observation,
179 | so they are highly correlated with one another. As a result, the LOOCV error
180 | estimate can have higher variance than $k$-fold cross-validation with a
181 | smaller $k$ (e.g. 5 or 10). LOOCV is also costly in terms of processing time,
    | since it requires fitting $n$ models.
182 |
183 | ### Question 4
184 |
185 | > Suppose that we use some statistical learning method to make a prediction for
186 | > the response $Y$ for a particular value of the predictor $X$. Carefully
187 | > describe how we might estimate the standard deviation of our prediction.
188 |
189 | We could address this with the bootstrap. Our procedure would be to (jointly)
190 | resample the $(X, Y)$ observations with replacement and refit our model many
191 | times. For each refit we record the prediction at the value of $X$ of
192 | interest; the standard deviation of these predictions across the bootstrap
    | samples estimates the standard deviation of our prediction. A sketch follows.
193 |
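    | A minimal sketch of this idea, using the built-in `mtcars` data, an arbitrary
    | linear model, and a hypothetical prediction point (`wt = 3`, `hp = 150`)
    | purely for illustration:
    | 
    | ```{r}
    | library(boot)
    | pred.fn <- function(data, i) {
    |   fit <- lm(mpg ~ wt + hp, data = data[i, ])           # refit on resampled rows
    |   predict(fit, newdata = data.frame(wt = 3, hp = 150)) # prediction of interest
    | }
    | set.seed(42)
    | boot(mtcars, pred.fn, R = 1000) # the "std. error" column is our estimate
    | ```
    | 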
194 | ## Applied
195 |
196 | ### Question 5
197 |
198 | > In Chapter 4, we used logistic regression to predict the probability of
199 | > `default` using `income` and `balance` on the `Default` data set. We will now
200 | > estimate the test error of this logistic regression model using the
201 | > validation set approach. Do not forget to set a random seed before beginning
202 | > your analysis.
203 | >
204 | > a. Fit a logistic regression model that uses `income` and `balance` to predict
205 | > `default`.
206 |
207 | ```{r}
208 | library(ISLR2)
209 | set.seed(42)
210 | fit <- glm(default ~ income + balance, data = Default, family = "binomial")
211 | ```
212 |
213 | > b. Using the validation set approach, estimate the test error of this model.
214 | > In order to do this, you must perform the following steps:
215 | > i. Split the sample set into a training set and a validation set.
216 | > ii. Fit a multiple logistic regression model using only the training
217 | > observations.
218 | > iii. Obtain a prediction of default status for each individual in
219 | > the validation set by computing the posterior probability of
220 | > default for that individual, and classifying the individual to
221 | > the `default` category if the posterior probability is greater
222 | > than 0.5.
223 | > iv. Compute the validation set error, which is the fraction of
224 | > the observations in the validation set that are misclassified.
225 |
226 | ```{r}
227 | train <- sample(nrow(Default), nrow(Default) / 2)
228 | fit <- glm(default ~ income + balance, data = Default, family = "binomial", subset = train)
229 | pred <- ifelse(predict(fit, newdata = Default[-train, ], type = "response") > 0.5, "Yes", "No")
230 | table(pred, Default$default[-train])
231 | mean(pred != Default$default[-train])
232 | ```
233 |
234 | > c. Repeat the process in (b) three times, using three different splits of the
235 | > observations into a training set and a validation set. Comment on the
236 | > results obtained.
237 |
238 | ```{r}
239 | replicate(3, {
240 | train <- sample(nrow(Default), nrow(Default) / 2)
241 | fit <- glm(default ~ income + balance, data = Default, family = "binomial", subset = train)
242 | pred <- ifelse(predict(fit, newdata = Default[-train, ], type = "response") > 0.5, "Yes", "No")
243 | mean(pred != Default$default[-train])
244 | })
245 | ```
246 |
247 | The results obtained are variable and depend on the samples allocated to
248 | training vs. test.
249 |
250 | > d. Now consider a logistic regression model that predicts the probability of
251 | > `default` using `income`, `balance`, and a dummy variable for `student`.
252 | > Estimate the test error for this model using the validation set approach.
253 | > Comment on whether or not including a dummy variable for `student` leads to
254 | > a reduction in the test error rate.
255 |
256 | ```{r}
257 | replicate(3, {
258 | train <- sample(nrow(Default), nrow(Default) / 2)
259 | fit <- glm(default ~ income + balance + student, data = Default, family = "binomial", subset = train)
260 | pred <- ifelse(predict(fit, newdata = Default[-train, ], type = "response") > 0.5, "Yes", "No")
261 | mean(pred != Default$default[-train])
262 | })
263 | ```
264 |
265 | Including `student` does not seem to make a substantial improvement to the
266 | test error.
267 |
268 | ### Question 6
269 |
270 | > We continue to consider the use of a logistic regression model to predict the
271 | > probability of `default` using `income` and `balance` on the `Default` data
272 | > set. In particular, we will now compute estimates for the standard errors of
273 | > the `income` and `balance` logistic regression coefficients in two different
274 | > ways: (1) using the bootstrap, and (2) using the standard formula for
275 | > computing the standard errors in the `glm()` function. Do not forget to set a
276 | > random seed before beginning your analysis.
277 | >
278 | > a. Using the `summary()` and `glm()` functions, determine the estimated
279 | > standard errors for the coefficients associated with `income` and
280 | > `balance` in a multiple logistic regression model that uses both
281 | > predictors.
282 |
283 | ```{r}
284 | fit <- glm(default ~ income + balance, data = Default, family = "binomial")
285 | summary(fit)
286 | ```
287 |
288 | The standard errors reported by `summary()` are 5.0e-6 for `income`
289 | ($\beta_1$) and 2.3e-4 for `balance` ($\beta_2$).
290 |
291 | > b. Write a function, `boot.fn()`, that takes as input the `Default` data set
292 | > as well as an index of the observations, and that outputs the coefficient
293 | > estimates for `income` and `balance` in the multiple logistic regression
294 | > model.
295 |
296 | ```{r}
297 | boot.fn <- function(x, i) {
298 | fit <- glm(default ~ income + balance, data = x[i, ], family = "binomial")
299 | coef(fit)[-1]
300 | }
301 | ```
302 |
303 | > c. Use the `boot()` function together with your `boot.fn()` function to
304 | > estimate the standard errors of the logistic regression coefficients for
305 | > income and balance.
306 |
307 | ```{r, cache = TRUE}
308 | library(boot)
309 | set.seed(42)
310 | boot(Default, boot.fn, R = 1000)
311 | ```
312 |
313 | > d. Comment on the estimated standard errors obtained using the `glm()`
314 | > function and using your bootstrap function.
315 |
316 | The standard errors obtained by bootstrapping are similar to those estimated
317 | by `glm`.
318 |
319 | ### Question 7
320 |
321 | > In Sections 5.3.2 and 5.3.3, we saw that the `cv.glm()` function can be used
322 | > in order to compute the LOOCV test error estimate. Alternatively, one could
323 | > compute those quantities using just the `glm()` and `predict.glm()`
324 | > functions, and a for loop. You will now take this approach in order to
325 | > compute the LOOCV error for a simple logistic regression model on the `Weekly`
326 | > data set. Recall that in the context of classification problems, the LOOCV
327 | > error is given in (5.4).
328 | >
329 | > a. Fit a logistic regression model that predicts `Direction` using `Lag1` and
330 | > `Lag2`.
331 |
332 | ```{r}
333 | fit <- glm(Direction ~ Lag1 + Lag2, data = Weekly, family = "binomial")
334 | ```
335 |
336 | > b. Fit a logistic regression model that predicts `Direction` using `Lag1` and
337 | > `Lag2` _using all but the first observation_.
338 |
339 | ```{r}
340 | fit <- glm(Direction ~ Lag1 + Lag2, data = Weekly[-1, ], family = "binomial")
341 | ```
342 |
343 | > c. Use the model from (b) to predict the direction of the first
344 | > observation. You can do this by predicting that the first observation will
345 | > go up if $P($`Direction="Up" | Lag1 , Lag2`$) > 0.5$. Was this observation
346 | > correctly classified?
347 |
348 | ```{r}
349 | predict(fit, newdata = Weekly[1, , drop = FALSE], type = "response") > 0.5
350 | ```
351 |
352 | Yes the observation was correctly classified.
353 |
354 | > d. Write a for loop from $i = 1$ to $i = n$, where $n$ is the number of
355 | > observations in the data set, that performs each of the following steps:
356 | > i. Fit a logistic regression model using all but the $i$th observation
357 | > to predict `Direction` using `Lag1` and `Lag2` .
358 | > ii. Compute the posterior probability of the market moving up
359 | > for the $i$th observation.
360 | > iii. Use the posterior probability for the $i$th observation in order
361 | > to predict whether or not the market moves up.
362 | > iv. Determine whether or not an error was made in predicting the
363 | > direction for the $i$th observation. If an error was made, then
364 | > indicate this as a 1, and otherwise indicate it as a 0.
365 |
366 | ```{r}
367 | error <- numeric(nrow(Weekly))
368 | for (i in 1:nrow(Weekly)) {
369 | fit <- glm(Direction ~ Lag1 + Lag2, data = Weekly[-i, ], family = "binomial")
370 | p <- predict(fit, newdata = Weekly[i, , drop = FALSE], type = "response") > 0.5
371 |   error[i] <- ifelse(p, "Up", "Down") != Weekly$Direction[i] # 1 if misclassified
372 | }
373 | ```
374 |
375 | > e. Take the average of the $n$ numbers obtained in (d) in order to obtain the
376 | > LOOCV estimate for the test error. Comment on the results.
377 |
378 | ```{r}
379 | mean(error)
380 | ```
381 |
382 | The LOOCV test error rate is 45%, which implies that our predictions are
383 | marginally more often correct than not.
384 |
385 | ### Question 8
386 |
387 | > We will now perform cross-validation on a simulated data set.
388 | >
389 | > a. Generate a simulated data set as follows:
390 | > ```r
391 | > > set.seed(1)
392 | > > x <- rnorm(100)
393 | > > y <- x - 2 *x^2 + rnorm(100)
394 | > ```
395 | > In this data set, what is $n$ and what is $p$? Write out the model
396 | > used to generate the data in equation form.
397 |
398 | ```{r}
399 | set.seed(1)
400 | x <- rnorm(100)
401 | y <- x - 2 * x^2 + rnorm(100)
402 | ```
403 |
404 | $n$ is 100 and $p$ is 1 (there are 100 observations and $y$ is predicted with
405 | a single variable $x$). The model equation is: $$y = -2x^2 + x + \epsilon$$.
406 |
407 | > b. Create a scatterplot of $X$ against $Y$. Comment on what you find.
408 |
409 | ```{r}
410 | plot(x, y)
411 | ```
412 |
413 | $y$ has a (negative) quadratic relationship with $x$.
414 |
415 | > c. Set a random seed, and then compute the LOOCV errors that result from
416 | > fitting the following four models using least squares:
417 | > i. $Y = \beta_0 + \beta_1 X + \epsilon$
418 | > ii. $Y = \beta_0 + \beta_1 X + \beta_2 X^2 + \epsilon$
419 | > iii. $Y = \beta_0 + \beta_1 X + \beta_2 X^2 + \beta_3 X^3 + \epsilon$
420 | > iv. $Y = \beta_0 + \beta_1 X + \beta_2 X^2 + \beta_3 X^3 + \beta_4 X^4 + \epsilon$.
421 | >
422 | > Note you may find it helpful to use the `data.frame()` function
423 | > to create a single data set containing both $X$ and $Y$.
424 |
425 | ```{r}
426 | library(boot)
427 | set.seed(42)
428 | dat <- data.frame(x, y)
429 | sapply(1:4, function(i) cv.glm(dat, glm(y ~ poly(x, i)))$delta[1])
430 | ```
431 |
432 | > d. Repeat (c) using another random seed, and report your results.
433 | > Are your results the same as what you got in (c)? Why?
434 |
435 | ```{r}
436 | set.seed(43)
437 | dat <- data.frame(x, y)
438 | sapply(1:4, function(i) cv.glm(dat, glm(y ~ poly(x, i)))$delta[1])
439 | ```
440 |
441 | The results are the same because we are using LOOCV. When doing this, the model
442 | is fit leaving each one of the observations out in turn, and thus there is no
443 | stochasticity involved.
444 |
445 | > e. Which of the models in (c) had the smallest LOOCV error? Is this what you
446 | > expected? Explain your answer.
447 |
448 | The second model had the smallest LOOCV error. This is what we would expect,
449 | since the model used to generate the data was quadratic and we are measuring
450 | the test (rather than training) error rate to evaluate performance.
451 |
452 | > f. Comment on the statistical significance of the coefficient estimates
453 | > that results from fitting each of the models in (c) using least squares.
454 | > Do these results agree with the conclusions drawn based on the
455 | > cross-validation results?
456 |
457 | ```{r}
458 | for (i in 1:4) printCoefmat(coef(summary(glm(y ~ poly(x, i), data = dat))))
459 | ```
460 |
461 | We can see that the coefficients in the first model are not highly significant,
462 | but all terms ($\beta_0, \beta_1$ and $\beta_2$) are in the quadratic model.
463 | After this, subsequent $\beta_n$ terms are not significant. Therefore, these
464 | results agree with those from cross-validation.
465 |
466 | ### Question 9
467 |
468 | > We will now consider the `Boston` housing data set, from the `ISLR2`
469 | > library.
470 | >
471 | > a. Based on this data set, provide an estimate for the population mean of
472 | > `medv`. Call this estimate $\hat\mu$.
473 |
474 | ```{r}
475 | (mu <- mean(Boston$medv))
476 | ```
477 |
478 | > b. Provide an estimate of the standard error of $\hat\mu$. Interpret this
479 | > result.
480 | >
481 | > _Hint: We can compute the standard error of the sample mean by
482 | > dividing the sample standard deviation by the square root of the number of
483 | > observations._
484 |
485 | ```{r}
486 | sd(Boston$medv) / sqrt(length(Boston$medv))
487 | ```
488 |
489 | > c. Now estimate the standard error of $\hat\mu$ using the bootstrap. How does
490 | > this compare to your answer from (b)?
491 |
492 | ```{r}
493 | set.seed(42)
494 | (bs <- boot(Boston$medv, function(v, i) mean(v[i]), 10000))
495 | ```
496 |
497 | The standard error using the bootstrap (0.403) is very close to that
498 | obtained from the formula above (0.409).
499 |
500 | > d. Based on your bootstrap estimate from (c), provide a 95% confidence
501 | > interval for the mean of `medv`. Compare it to the results obtained using
502 | > `t.test(Boston$medv)`.
503 | >
504 | > _Hint: You can approximate a 95% confidence interval using the
505 | > formula $[\hat\mu - 2SE(\hat\mu), \hat\mu + 2SE(\hat\mu)].$_
506 |
507 | ```{r}
508 | se <- sd(bs$t)
509 | c(mu - 2 * se, mu + 2 * se)
510 | ```
511 |
512 | > e. Based on this data set, provide an estimate, $\hat\mu_{med}$, for the
513 | > median value of `medv` in the population.
514 |
515 | ```{r}
516 | median(Boston$medv)
517 | ```
518 |
519 | > f. We now would like to estimate the standard error of $\hat\mu_{med}$.
520 | > Unfortunately, there is no simple formula for computing the standard error
521 | > of the median. Instead, estimate the standard error of the median using
522 | > the bootstrap. Comment on your findings.
523 |
524 | ```{r}
525 | set.seed(42)
526 | boot(Boston$medv, function(v, i) median(v[i]), 10000)
527 | ```
528 |
529 | The estimated standard error of the median is 0.374. This is lower than the
530 | standard error of the mean.
531 |
532 | > g. Based on this data set, provide an estimate for the tenth percentile of
533 | > `medv` in Boston census tracts. Call this quantity $\hat\mu_{0.1}$. (You
534 | > can use the `quantile()` function.)
535 |
536 | ```{r}
537 | quantile(Boston$medv, 0.1)
538 | ```
539 |
540 | > h. Use the bootstrap to estimate the standard error of $\hat\mu_{0.1}$.
541 | > Comment on your findings.
542 |
543 | ```{r}
544 | set.seed(42)
545 | boot(Boston$medv, function(v, i) quantile(v[i], 0.1), 10000)
546 | ```
547 |
548 | We get a standard error of ~0.5. This is higher than the standard error of the
549 | median. Nevertheless, the standard error is quite small relative to the
550 | estimate, so we can be fairly confident about the value of the 10th percentile.
551 |
--------------------------------------------------------------------------------
/12-unsupervised-learning.Rmd:
--------------------------------------------------------------------------------
1 | # Unsupervised Learning
2 |
3 | ## Conceptual
4 |
5 | ### Question 1
6 |
7 | > This problem involves the $K$-means clustering algorithm.
8 | >
9 | > a. Prove (12.18).
10 |
11 | 12.18 is:
12 |
13 | $$
14 | \frac{1}{|C_k|}\sum_{i,i' \in C_k} \sum_{j=1}^p (x_{ij} - x_{i'j})^2 =
15 | 2 \sum_{i \in C_k} \sum_{j=1}^p (x_{ij} - \bar{x}_{kj})^2
16 | $$
17 |
18 | where $$\bar{x}_{kj} = \frac{1}{|C_k|}\sum_{i \in C_k} x_{ij}$$
19 |
20 | On the left-hand side we compute the squared difference between each pair of
21 | observations (indexed by $i$ and $i'$). On the right-hand side we compute the
22 | squared difference between each observation and the cluster mean. Intuitively
23 | the identity is plausible (the factor of 2 is present because each pair is
24 | counted twice). To prove it formally:
25 |
26 | Note first that,
27 | \begin{align}
28 | (x_{ij} - x_{i'j})^2
29 | = & ((x_{ij} - \bar{x}_{kj}) - (x_{i'j} - \bar{x}_{kj}))^2 \\
30 | = & (x_{ij} - \bar{x}_{kj})^2 -
31 | 2(x_{ij} - \bar{x}_{kj})(x_{i'j} - \bar{x}_{kj}) +
32 | (x_{i'j} - \bar{x}_{kj})^2
33 | \end{align}
34 |
35 | Note that the first term is independent of $i'$ and the last is independent of
36 | $i$.
37 |
38 | Therefore, the left-hand side of 12.18 can be written as:
39 |
40 | \begin{align}
41 | \frac{1}{|C_k|}\sum_{i,i' \in C_k} \sum_{j=1}^p (x_{ij} - x_{i'j})^2
42 | = & \frac{1}{|C_k|}\sum_{i,i' \in C_k}\sum_{j=1}^p (x_{ij} - \bar{x}_{kj})^2 -
43 | \frac{1}{|C_k|}\sum_{i,i' \in C_k}\sum_{j=1}^p 2(x_{ij} - \bar{x}_{kj})(x_{i'j} - \bar{x}_{kj}) +
44 | \frac{1}{|C_k|}\sum_{i,i' \in C_k}\sum_{j=1}^p (x_{i'j} - \bar{x}_{kj})^2 \\
45 | = & \frac{|C_k|}{|C_k|}\sum_{i \in C_k}\sum_{j=1}^p (x_{ij} - \bar{x}_{kj})^2 -
46 | \frac{2}{|C_k|}\sum_{i,i' \in C_k}\sum_{j=1}^p (x_{ij} - \bar{x}_{kj})(x_{i'j} - \bar{x}_{kj}) +
47 | \frac{|C_k|}{|C_k|}\sum_{i \in C_k}\sum_{j=1}^p (x_{ij} - \bar{x}_{kj})^2 \\
48 | = & 2 \sum_{i \in C_k}\sum_{j=1}^p (x_{ij} - \bar{x}_{kj})^2
49 | \end{align}
50 |
51 | Note that we can drop the term containing
52 | $(x_{ij} - \bar{x}_{kj})(x_{i'j} - \bar{x}_{kj})$ since this is 0 when summed
53 | over combinations of $i$ and $i'$ for a given $j$.
54 |
55 | > b. On the basis of this identity, argue that the $K$-means clustering
56 | > algorithm (Algorithm 12.2) decreases the objective (12.17) at each
57 | > iteration.
58 |
59 | Equation 12.18 shows that the within-cluster sum of squared pairwise distances
60 | is equal to twice the sum of squared distances from each observation to the
61 | cluster mean. In each iteration of Algorithm 12.2, recomputing the centroids as
62 | the cluster means minimizes this sum for the current assignment, and then
   | reassigning each observation to its closest centroid can only reduce it
   | further. Hence the objective (12.17) never increases; a small numerical
   | illustration follows.
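    | 
    | A small illustration with made-up two-dimensional data: implementing the two
    | steps of the algorithm directly and tracking the total squared distance from
    | each observation to its assigned centroid, which should never increase.
    | 
    | ```{r}
    | set.seed(42)
    | X <- matrix(rnorm(100 * 2), ncol = 2)
    | cluster <- sample(1:3, nrow(X), replace = TRUE) # random initial assignment
    | objective <- numeric(10)
    | for (iter in 1:10) {
    |   # step 2(a): centroids are the per-cluster means
    |   centroids <- sapply(1:3, function(k) colMeans(X[cluster == k, , drop = FALSE]))
    |   # step 2(b): assign each observation to its closest centroid
    |   d2 <- sapply(1:3, function(k) colSums((t(X) - centroids[, k])^2))
    |   cluster <- apply(d2, 1, which.min)
    |   objective[iter] <- sum(d2[cbind(seq_len(nrow(X)), cluster)])
    | }
    | objective # non-increasing
    | ```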
63 |
64 | ### Question 2
65 |
66 | > Suppose that we have four observations, for which we compute a dissimilarity
67 | > matrix, given by
68 | >
69 | > \begin{bmatrix}
70 | > & 0.3 & 0.4 & 0.7 \\
71 | > 0.3 & & 0.5 & 0.8 \\
72 | > 0.4 & 0.5 & & 0.45 \\
73 | > 0.7 & 0.8 & 0.45 & \\
74 | > \end{bmatrix}
75 | >
76 | > For instance, the dissimilarity between the first and second observations is
77 | > 0.3, and the dissimilarity between the second and fourth observations is 0.8.
78 | >
79 | > a. On the basis of this dissimilarity matrix, sketch the dendrogram that
80 | > results from hierarchically clustering these four observations using
81 | > complete linkage. Be sure to indicate on the plot the height at which each
82 | > fusion occurs, as well as the observations corresponding to each leaf in
83 | > the dendrogram.
84 |
85 | ```{r}
86 | m <- matrix(c(0, 0.3, 0.4, 0.7, 0.3, 0, 0.5, 0.8, 0.4, 0.5, 0., 0.45, 0.7, 0.8, 0.45, 0), ncol = 4)
87 | c1 <- hclust(as.dist(m), method = "complete")
88 | plot(c1)
89 | ```
90 |
91 | > b. Repeat (a), this time using single linkage clustering.
92 |
93 | ```{r}
94 | c2 <- hclust(as.dist(m), method = "single")
95 | plot(c2)
96 | ```
97 |
98 | > c. Suppose that we cut the dendrogram obtained in (a) such that two clusters
99 | > result. Which observations are in each cluster?
100 |
101 | ```{r}
102 | table(1:4, cutree(c1, 2))
103 | ```
104 |
105 | > d. Suppose that we cut the dendrogram obtained in (b) such that two clusters
106 | > result. Which observations are in each cluster?
107 |
108 | ```{r}
109 | table(1:4, cutree(c2, 2))
110 | ```
111 |
112 | > e. It is mentioned in the chapter that at each fusion in the dendrogram, the
113 | > position of the two clusters being fused can be swapped without changing
114 | > the meaning of the dendrogram. Draw a dendrogram that is equivalent to the
115 | > dendrogram in (a), for which two or more of the leaves are repositioned,
116 | > but for which the meaning of the dendrogram is the same.
117 |
118 | ```{r}
119 | plot(c1, labels = c(2, 1, 3, 4))
120 | ```
121 |
122 | ### Question 3
123 |
124 | > In this problem, you will perform $K$-means clustering manually, with $K = 2$,
125 | > on a small example with $n = 6$ observations and $p = 2$ features. The
126 | > observations are as follows.
127 | >
128 | > | Obs. | $X_1$ | $X_2$ |
129 | > |------|-------|-------|
130 | > | 1 | 1 | 4 |
131 | > | 2 | 1 | 3 |
132 | > | 3 | 0 | 4 |
133 | > | 4 | 5 | 1 |
134 | > | 5 | 6 | 2 |
135 | > | 6 | 4 | 0 |
136 | >
137 | > a. Plot the observations.
138 |
139 | ```{r}
140 | library(ggplot2)
141 | d <- data.frame(
142 | x1 = c(1, 1, 0, 5, 6, 4),
143 | x2 = c(4, 3, 4, 1, 2, 0)
144 | )
145 | ggplot(d, aes(x = x1, y = x2)) +
146 | geom_point()
147 | ```
148 |
149 | > b. Randomly assign a cluster label to each observation. You can use the
150 | > `sample()` command in `R` to do this. Report the cluster labels for each
151 | > observation.
152 |
153 | ```{r}
154 | set.seed(42)
155 | d$cluster <- sample(c(1, 2), size = nrow(d), replace = TRUE)
156 | ```
157 |
158 | > c. Compute the centroid for each cluster.
159 |
160 | ```{r}
161 | centroids <- sapply(c(1, 2), function(i) colMeans(d[d$cluster == i, 1:2]))
162 | ```
163 |
164 | > d. Assign each observation to the centroid to which it is closest, in terms of
165 | > Euclidean distance. Report the cluster labels for each observation.
166 |
167 | ```{r}
168 | dist <- sapply(1:2, function(i) {
169 | sqrt((d$x1 - centroids[1, i])^2 + (d$x2 - centroids[2, i])^2)
170 | })
171 | d$cluster <- apply(dist, 1, which.min)
172 | ```
173 |
174 | > e. Repeat (c) and (d) until the answers obtained stop changing.
175 |
176 | ```{r}
177 | centroids <- sapply(c(1, 2), function(i) colMeans(d[d$cluster == i, 1:2]))
178 | dist <- sapply(1:2, function(i) {
179 | sqrt((d$x1 - centroids[1, i])^2 + (d$x2 - centroids[2, i])^2)
180 | })
181 | d$cluster <- apply(dist, 1, which.min)
182 | ```
183 |
184 | In this case, we get stable labels after the first iteration.
185 |
186 | > f. In your plot from (a), color the observations according to the cluster
187 | > labels obtained.
188 |
189 | ```{r}
190 | ggplot(d, aes(x = x1, y = x2, color = factor(cluster))) +
191 | geom_point()
192 | ```
193 |
194 | ### Question 4
195 |
196 | > Suppose that for a particular data set, we perform hierarchical clustering
197 | > using single linkage and using complete linkage. We obtain two dendrograms.
198 | >
199 | > a. At a certain point on the single linkage dendrogram, the clusters {1, 2, 3}
200 | > and {4, 5} fuse. On the complete linkage dendrogram, the clusters {1, 2, 3}
201 | > and {4, 5} also fuse at a certain point. Which fusion will occur higher on
202 | > the tree, or will they fuse at the same height, or is there not enough
203 | > information to tell?
204 |
205 | The complete linkage fusion will likely be higher in the tree since single
206 | linkage is defined as being the minimum distance between two clusters. However,
207 | there is a chance that they could be at the same height (so technically there
208 | is not enough information to tell).
209 |
210 | > b. At a certain point on the single linkage dendrogram, the clusters {5} and
211 | > {6} fuse. On the complete linkage dendrogram, the clusters {5} and {6} also
212 | > fuse at a certain point. Which fusion will occur higher on the tree, or
213 | > will they fuse at the same height, or is there not enough information to
214 | > tell?
215 |
216 | They will fuse at the same height (the algorithm for calculating distance is
217 | the same when the clusters are of size 1).
218 |
219 | ### Question 5
220 |
221 | > In words, describe the results that you would expect if you performed
222 | > $K$-means clustering of the eight shoppers in Figure 12.16, on the basis of
223 | > their sock and computer purchases, with $K = 2$. Give three answers, one for
224 | > each of the variable scalings displayed. Explain.
225 |
226 | Where the variables are scaled to have standard deviation one, we would expect
227 | the clusters to correspond to whether or not the shopper purchased a computer.
228 | With the raw numbers of items purchased, socks dominate, so we would expect the
229 | clusters to represent low vs. high numbers of sock purchases. With purchases
    | measured in dollars, the computer purchase dominates, so we would again expect
    | clusters defined by whether a computer was bought.
230 |
231 | To test, we can run the analysis in R:
232 |
233 | ```{r}
234 | set.seed(42)
235 | dat <- data.frame(
236 | socks = c(8, 11, 7, 6, 5, 6, 7, 8),
237 | computers = c(0, 0, 0, 0, 1, 1, 1, 1)
238 | )
239 | kmeans(dat, 2)$cluster
240 | kmeans(scale(dat), 2)$cluster
241 | dat$computers <- dat$computers * 2000
242 | kmeans(dat, 2)$cluster
243 | ```
244 |
245 | ### Question 6
246 |
247 | > We saw in Section 12.2.2 that the principal component loading and score
248 | > vectors provide an approximation to a matrix, in the sense of (12.5).
249 | > Specifically, the principal component score and loading vectors solve the
250 | > optimization problem given in (12.6).
251 | >
252 | > Now, suppose that the $M$ principal component score vectors $z_{im}$, $m = 1,...,M$,
253 | > are known. Using (12.6), explain that the first $M$ principal component
254 | > loading vectors $\phi_{jm}$, $m = 1,...,M$, can be obtained by performing $M$
255 | > separate least squares linear regressions. In each regression, the principal
256 | > component score vectors are the predictors, and one of the features of the
257 | > data matrix is the response.
258 |
259 | ## Applied
260 |
261 | ### Question 7
262 |
263 | > In the chapter, we mentioned the use of correlation-based distance and
264 | > Euclidean distance as dissimilarity measures for hierarchical clustering.
265 | > It turns out that these two measures are almost equivalent: if each
266 | > observation has been centered to have mean zero and standard deviation one,
267 | > and if we let $r_{ij}$ denote the correlation between the $i$th and $j$th
268 | > observations, then the quantity $1 - r_{ij}$ is proportional to the squared
269 | > Euclidean distance between the ith and jth observations.
270 | >
271 | > On the `USArrests` data, show that this proportionality holds.
272 | >
273 | > _Hint: The Euclidean distance can be calculated using the `dist()` function,_
274 | > _and correlations can be calculated using the `cor()` function._
275 |
276 | ```{r}
277 | dat <- t(scale(t(USArrests)))
278 | d1 <- dist(dat)^2
279 | d2 <- as.dist(1 - cor(t(dat)))
280 | plot(d1, d2)
281 | ```
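    | 
    | The plot shows an exact linear relationship. Since each observation has been
    | scaled to have (sample) standard deviation one across its $p = 4$ features,
    | the squared Euclidean distance works out to $2(p - 1)(1 - r_{ij}) = 6(1 -
    | r_{ij})$, which we can check directly (the constant 6 is specific to this
    | data set):
    | 
    | ```{r}
    | summary(as.numeric(d1) / as.numeric(d2))
    | ```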
282 |
283 | ### Question 8
284 |
285 | > In Section 12.2.3, a formula for calculating PVE was given in Equation
286 | > 12.10. We also saw that the PVE can be obtained using the `sdev` output of the
287 | > `prcomp()` function.
288 | >
289 | > On the `USArrests` data, calculate PVE in two ways:
290 | >
291 | > a. Using the `sdev` output of the `prcomp()` function, as was done in Section
292 | > 12.2.3.
293 |
294 | ```{r}
295 | pr <- prcomp(USArrests, scale = TRUE)
296 | pr$sdev^2 / sum(pr$sdev^2)
297 | ```
298 |
299 | > b. By applying Equation 12.10 directly. That is, use the `prcomp()` function to
300 | > compute the principal component loadings. Then, use those loadings in
301 | > Equation 12.10 to obtain the PVE.
302 | >
303 | > These two approaches should give the same results.
304 |
305 | ```{r}
306 | colSums(pr$x^2) / sum(colSums(scale(USArrests)^2))
307 | ```
308 |
309 | > _Hint: You will only obtain the same results in (a) and (b) if the same_
310 | > _data is used in both cases. For instance, if in (a) you performed_
311 | > _`prcomp()` using centered and scaled variables, then you must center and_
312 | > _scale the variables before applying Equation 12.10 in (b)._
313 |
314 | ### Question 9
315 |
316 | > Consider the `USArrests` data. We will now perform hierarchical clustering on
317 | > the states.
318 | >
319 | > a. Using hierarchical clustering with complete linkage and Euclidean distance,
320 | > cluster the states.
321 |
322 | ```{r}
323 | set.seed(42)
324 | hc <- hclust(dist(USArrests), method = "complete")
325 | ```
326 |
327 | > b. Cut the dendrogram at a height that results in three distinct clusters.
328 | > Which states belong to which clusters?
329 |
330 | ```{r}
331 | ct <- cutree(hc, 3)
332 | sapply(1:3, function(i) names(ct)[ct == i])
333 | ```
334 |
335 | > c. Hierarchically cluster the states using complete linkage and Euclidean
336 | > distance, _after scaling the variables to have standard deviation one_.
337 |
338 | ```{r}
339 | hc2 <- hclust(dist(scale(USArrests)), method = "complete")
340 | ```
341 |
342 | > d. What effect does scaling the variables have on the hierarchical clustering
343 | > obtained? In your opinion, should the variables be scaled before the
344 | > inter-observation dissimilarities are computed? Provide a justification for
345 | > your answer.
346 |
347 | ```{r}
348 | ct <- cutree(hc2, 3) # use the clustering on the scaled data
349 | sapply(1:3, function(i) names(ct)[ct == i])
350 | ```
351 |
352 | Scaling results in different clusters and the choice of whether to scale or
353 | not depends on the data in question. In this case, the variables are:
354 |
355 | - `Murder`: numeric, murder arrests (per 100,000)
356 | - `Assault`: numeric, assault arrests (per 100,000)
357 | - `UrbanPop`: numeric, percent urban population
358 | - `Rape`: numeric, rape arrests (per 100,000)
359 |
360 | These variables are not naturally on the same scale, and the units involved
361 | are somewhat arbitrary (for example, `Murder` could be measured per 1 million
362 | rather than per 100,000), so in this case I would argue the data should be
363 | scaled.
364 |
365 | ### Question 10
366 |
367 | > In this problem, you will generate simulated data, and then perform PCA and
368 | > $K$-means clustering on the data.
369 | >
370 | > a. Generate a simulated data set with 20 observations in each of three classes
371 | > (i.e. 60 observations total), and 50 variables.
372 | >
373 | > _Hint: There are a number of functions in `R` that you can use to generate_
374 | > _data. One example is the `rnorm()` function; `runif()` is another option._
375 | > _Be sure to add a mean shift to the observations in each class so that_
376 | > _there are three distinct classes._
377 |
378 | ```{r}
379 | set.seed(42)
380 | data <- matrix(rnorm(60 * 50), ncol = 50)
381 | classes <- rep(c("A", "B", "C"), each = 20)
382 | dimnames(data) <- list(classes, paste0("v", 1:50))
383 | data[classes == "B", 1:10] <- data[classes == "B", 1:10] + 1.2
384 | data[classes == "C", 5:30] <- data[classes == "C", 5:30] + 1
385 | ```
386 |
387 | > b. Perform PCA on the 60 observations and plot the first two principal
388 | > component score vectors. Use a different color to indicate the
389 | > observations in each of the three classes. If the three classes appear
390 | > separated in this plot, then continue on to part (c). If not, then return
391 | > to part (a) and modify the simulation so that there is greater separation
392 | > between the three classes. Do not continue to part (c) until the three
393 | > classes show at least some separation in the first two principal component
394 | > score vectors.
395 |
396 | ```{r}
397 | pca <- prcomp(data)
398 | ggplot(
399 | data.frame(Class = classes, PC1 = pca$x[, 1], PC2 = pca$x[, 2]),
400 | aes(x = PC1, y = PC2, col = Class)
401 | ) +
402 | geom_point()
403 | ```
404 |
405 | > c. Perform $K$-means clustering of the observations with $K = 3$. How well do
406 | > the clusters that you obtained in $K$-means clustering compare to the true
407 | > class labels?
408 | >
409 | > _Hint: You can use the `table()` function in `R` to compare the true class_
410 | > _labels to the class labels obtained by clustering. Be careful how you_
411 | > _interpret the results: $K$-means clustering will arbitrarily number the_
412 | > _clusters, so you cannot simply check whether the true class labels and_
413 | > _clustering labels are the same._
414 |
415 | ```{r}
416 | km <- kmeans(data, 3)$cluster
417 | table(km, names(km))
418 | ```
419 |
420 | $K$-means separates out the clusters nearly perfectly.
421 |
422 | > d. Perform $K$-means clustering with $K = 2$. Describe your results.
423 |
424 | ```{r}
425 | km <- kmeans(data, 2)$cluster
426 | table(km, names(km))
427 | ```
428 |
429 | $K$-means effectively defines cluster 2 to be class B, but cluster 1 is a mix
430 | of classes A and C.
431 |
432 | > e. Now perform $K$-means clustering with $K = 4$, and describe your results.
433 |
434 | ```{r}
435 | km <- kmeans(data, 4)$cluster
436 | table(km, names(km))
437 | ```
438 |
439 | $K$-means effectively defines cluster 1 to be class B and cluster 2 to be
440 | class A, but clusters 3 and 4 are split over class C.
441 |
442 | > f. Now perform $K$-means clustering with $K = 3$ on the first two principal
443 | > component score vectors, rather than on the raw data. That is, perform
444 | > $K$-means clustering on the $60 \times 2$ matrix of which the first column
445 | > is the first principal component score vector, and the second column is
446 | > the second principal component score vector. Comment on the results.
447 |
448 | ```{r}
449 | km <- kmeans(pca$x[, 1:2], 3)$cluster
450 | table(km, names(km))
451 | ```
452 |
453 | $K$-means again separates out the clusters nearly perfectly.
454 |
455 | > g. Using the `scale()` function, perform $K$-means clustering with $K = 3$ on
456 | > the data _after scaling each variable to have standard deviation one_. How
457 | > do these results compare to those obtained in (b)? Explain.
458 |
459 | ```{r}
460 | km <- kmeans(scale(data), 3)$cluster
461 | table(km, names(km))
462 | ```
463 |
464 | $K$-means appears to perform less well on the scaled data in this case.
465 |
466 | ### Question 11
467 |
468 | > Write an `R` function to perform matrix completion as in Algorithm 12.1, and
469 | > as outlined in Section 12.5.2. In each iteration, the function should keep
470 | > track of the relative error, as well as the iteration count. Iterations should
471 | > continue until the relative error is small enough or until some maximum number
472 | > of iterations is reached (set a default value for this maximum number).
473 | > Furthermore, there should be an option to print out the progress in each
474 | > iteration.
475 | >
476 | > Test your function on the `Boston` data. First, standardize the features to
477 | > have mean zero and standard deviation one using the `scale()` function. Run an
478 | > experiment where you randomly leave out an increasing (and nested) number of
479 | > observations from 5% to 30%, in steps of 5%. Apply Algorithm 12.1 with $M =
480 | > 1,2,...,8$. Display the approximation error as a function of the fraction of
481 | > observations that are missing, and the value of $M$, averaged over 10
482 | > repetitions of the experiment.
483 |
484 | ### Question 12
485 |
486 | > In Section 12.5.2, Algorithm 12.1 was implemented using the `svd()` function.
487 | > However, given the connection between the `svd()` function and the `prcomp()`
488 | > function highlighted in the lab, we could have instead implemented the
489 | > algorithm using `prcomp()`.
490 | >
491 | > Write a function to implement Algorithm 12.1 that makes use of `prcomp()`
492 | > rather than `svd()`.
493 |
494 | ### Question 13
495 |
496 | > On the book website, `www.StatLearning.com`, there is a gene expression data
497 | > set (`Ch12Ex13.csv`) that consists of 40 tissue samples with measurements on
498 | > 1,000 genes. The first 20 samples are from healthy patients, while the
499 | > second 20 are from a diseased group.
500 | >
501 | > a. Load in the data using `read.csv()`. You will need to select `header = F`.
502 |
503 | ```{r}
504 | data <- read.csv("data/Ch12Ex13.csv", header = FALSE)
505 | colnames(data) <- c(paste0("H", 1:20), paste0("D", 1:20))
506 | ```
507 |
508 | > b. Apply hierarchical clustering to the samples using correlation-based
509 | > distance, and plot the dendrogram. Do the genes separate the samples into
510 | > the two groups? Do your results depend on the type of linkage used?
511 |
512 | ```{r}
513 | hc.complete <- hclust(as.dist(1 - cor(data)), method = "complete")
514 | plot(hc.complete)
515 |
516 | hc.complete <- hclust(as.dist(1 - cor(data)), method = "average")
517 | plot(hc.complete)
518 |
519 | hc.complete <- hclust(as.dist(1 - cor(data)), method = "single")
520 | plot(hc.complete)
521 | ```
522 |
523 | Yes, the samples clearly separate into the two groups, although the results
524 | depend somewhat on the linkage method used. In the case of average linkage,
525 | the diseased samples all fall within a subset of the healthy samples.
526 |
527 | > c. Your collaborator wants to know which genes differ the most across the two
528 | > groups. Suggest a way to answer this question, and apply it here.
529 |
530 | This is probably best achieved with a supervised approach. A simple method
531 | would be to determine which genes show the most significant differences
532 | between the groups by applying a t-test to each gene. We can then select those
533 | with an FDR-adjusted p-value less than some given threshold (e.g. 0.05).
534 |
535 | ```{r}
536 | class <- factor(rep(c("Healthy", "Diseased"), each = 20))
537 | pvals <- apply(data, 1, function(v) t.test(v ~ class)$p.value)
    | pvals <- p.adjust(pvals, method = "fdr") # FDR adjustment, as described above
538 | which(pvals < 0.05)
539 | ```
540 |
--------------------------------------------------------------------------------
/09-support-vector-mechines.Rmd:
--------------------------------------------------------------------------------
1 | # Support Vector Machines
2 |
3 | ## Conceptual
4 |
5 | ### Question 1
6 |
7 | > This problem involves hyperplanes in two dimensions.
8 | >
9 | > a. Sketch the hyperplane $1 + 3X_1 - X_2 = 0$. Indicate the set of points for
10 | > which $1 + 3X_1 - X_2 > 0$, as well as the set of points for which
11 | > $1 + 3X_1 - X_2 < 0$.
12 |
13 | ```{r}
14 | library(ggplot2)
15 | xlim <- c(-10, 10)
16 | ylim <- c(-30, 30)
17 | points <- expand.grid(
18 | X1 = seq(xlim[1], xlim[2], length.out = 50),
19 | X2 = seq(ylim[1], ylim[2], length.out = 50)
20 | )
21 | p <- ggplot(points, aes(x = X1, y = X2)) +
22 | geom_abline(intercept = 1, slope = 3) + # X2 = 1 + 3X1
23 | theme_bw()
24 | p + geom_point(aes(color = 1 + 3 * X1 - X2 > 0), size = 0.1) +
25 | scale_color_discrete(name = "1 + 3X1 - X2 > 0")
26 | ```
27 |
28 | > b. On the same plot, sketch the hyperplane $-2 + X_1 + 2X_2 = 0$. Indicate the
29 | > set of points for which $-2 + X_1 + 2X_2 > 0$, as well as the set of points
30 | > for which $-2 + X_1 + 2X_2 < 0$.
31 |
32 | ```{r}
33 | p + geom_abline(intercept = 1, slope = -1 / 2) + # X2 = 1 - X1/2
34 | geom_point(
35 | aes(color = interaction(1 + 3 * X1 - X2 > 0, -2 + X1 + 2 * X2 > 0)),
36 | size = 0.5
37 | ) +
38 | scale_color_discrete(name = "(1 + 3X1 - X2 > 0).(-2 + X1 + 2X2 > 0)")
39 | ```
40 |
41 | ### Question 2
42 |
43 | > We have seen that in $p = 2$ dimensions, a linear decision boundary takes the
44 | > form $\beta_0 + \beta_1X_1 + \beta_2X_2 = 0$. We now investigate a non-linear
45 | > decision boundary.
46 | >
47 | > a. Sketch the curve $$(1+X_1)^2 +(2-X_2)^2 = 4$$.
48 |
49 | ```{r}
50 | points <- expand.grid(
51 | X1 = seq(-4, 2, length.out = 100),
52 | X2 = seq(-1, 5, length.out = 100)
53 | )
54 | p <- ggplot(points, aes(x = X1, y = X2, z = (1 + X1)^2 + (2 - X2)^2 - 4)) +
55 | geom_contour(breaks = 0, colour = "black") +
56 | theme_bw()
57 | p
58 | ```
59 |
60 | > b. On your sketch, indicate the set of points for which
61 | > $$(1 + X_1)^2 + (2 - X_2)^2 > 4,$$ as well as the set of points for which
62 | > $$(1 + X_1)^2 + (2 - X_2)^2 \leq 4.$$
63 |
64 | ```{r}
65 | p + geom_point(aes(color = (1 + X1)^2 + (2 - X2)^2 - 4 > 0), size = 0.1)
66 | ```
67 |
68 | > c. Suppose that a classifier assigns an observation to the blue class if $$(1
69 | > + X_1)^2 + (2 - X_2)^2 > 4,$$ and to the red class otherwise. To what class
70 | > is the observation $(0, 0)$ classified? $(-1, 1)$? $(2, 2)$? $(3, 8)$?
71 |
72 | ```{r}
73 | points <- data.frame(
74 | X1 = c(0, -1, 2, 3),
75 | X2 = c(0, 1, 2, 8)
76 | )
77 | ifelse((1 + points$X1)^2 + (2 - points$X2)^2 > 4, "blue", "red")
78 | ```
79 |
80 | > d. Argue that while the decision boundary in (c) is not linear in terms of
81 | > $X_1$ and $X_2$, it is linear in terms of $X_1$, $X_1^2$, $X_2$, and
82 | > $X_2^2$.
83 |
84 | The decision boundary is $$(1 + X_1)^2 + (2 - X_2)^2 -4 = 0$$ which we can expand
85 | to:
86 | $$1 + 2X_1 + X_1^2 + 4 - 4X_2 + X_2^2 - 4 = 0$$
87 | which is linear in terms of $X_1$, $X_1^2$, $X_2$, $X_2^2$.
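
As a quick numerical check that the quadratic rule and the expanded linear rule
agree everywhere on a grid of points:

```{r}
grid <- expand.grid(X1 = seq(-4, 2, 0.1), X2 = seq(-1, 5, 0.1))
quadratic <- with(grid, (1 + X1)^2 + (2 - X2)^2 - 4 > 0)
expanded <- with(grid, 1 + 2 * X1 + X1^2 + 4 - 4 * X2 + X2^2 - 4 > 0)
all(quadratic == expanded)
```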
88 |
89 | ### Question 3
90 |
91 | > Here we explore the maximal margin classifier on a toy data set.
92 | >
93 | > a. We are given $n = 7$ observations in $p = 2$ dimensions. For each
94 | > observation, there is an associated class label.
95 | >
96 | > | Obs. | $X_1$ | $X_2$ | $Y$ |
97 | > |------|-------|-------|------|
98 | > | 1 | 3 | 4 | Red |
99 | > | 2 | 2 | 2 | Red |
100 | > | 3 | 4 | 4 | Red |
101 | > | 4 | 1 | 4 | Red |
102 | > | 5 | 2 | 1 | Blue |
103 | > | 6 | 4 | 3 | Blue |
104 | > | 7 | 4 | 1 | Blue |
105 | >
106 | > Sketch the observations.
107 |
108 | ```{r}
109 | data <- data.frame(
110 | X1 = c(3, 2, 4, 1, 2, 4, 4),
111 | X2 = c(4, 2, 4, 4, 1, 3, 1),
112 | Y = c(rep("Red", 4), rep("Blue", 3))
113 | )
114 | p <- ggplot(data, aes(x = X1, y = X2, color = Y)) +
115 | geom_point(size = 2) +
116 | scale_colour_identity() +
117 | coord_cartesian(xlim = c(0.5, 4.5), ylim = c(0.5, 4.5))
118 | p
119 | ```
120 |
121 | > b. Sketch the optimal separating hyperplane, and provide the equation for this
122 | > hyperplane (of the form (9.1)).
123 |
124 | ```{r}
125 | library(e1071)
126 |
127 | fit <- svm(as.factor(Y) ~ ., data = data, kernel = "linear", cost = 10, scale = FALSE)
128 |
129 | # Extract beta_0, beta_1, beta_2
130 | beta <- c(
131 | -fit$rho,
132 | drop(t(fit$coefs) %*% as.matrix(data[fit$index, 1:2]))
133 | )
134 | names(beta) <- c("B0", "B1", "B2")
135 | p <- p + geom_abline(intercept = -beta[1] / beta[3], slope = -beta[2] / beta[3], lty = 2)
136 | p
137 | ```
138 |
139 | > c. Describe the classification rule for the maximal margin classifier. It
140 | > should be something along the lines of "Classify to Red if $\beta_0 +
141 | > \beta_1X_1 + \beta_2X_2 > 0$, and classify to Blue otherwise." Provide the
142 | > values for $\beta_0, \beta_1,$ and $\beta_2$.
143 |
144 | Classify to red if $\beta_0 + \beta_1X_1 + \beta_2X_2 > 0$ and blue otherwise
145 | where $\beta_0 = `r round(beta[1])`$, $\beta_1 = `r round(beta[2])`$,
146 | $\beta_2 = `r round(beta[3])`$.
147 |
148 | > d. On your sketch, indicate the margin for the maximal margin hyperplane.
149 |
150 | ```{r}
151 | p <- p + geom_ribbon(
152 | aes(x = x, ymin = ymin, ymax = ymax),
153 | data = data.frame(x = c(0, 5), ymin = c(-1, 4), ymax = c(0, 5)),
154 | alpha = 0.1, fill = "blue",
155 | inherit.aes = FALSE
156 | )
157 | p
158 | ```
159 |
160 | > e. Indicate the support vectors for the maximal margin classifier.
161 |
162 | ```{r}
163 | p <- p + geom_point(data = data[fit$index, ], size = 4)
164 | p
165 | ```
166 |
167 | The support vectors (from the svm fit object) are shown above. Arguably,
168 | there's another support vector, since four points exactly touch the margin.
169 |
170 | > f. Argue that a slight movement of the seventh observation would not affect
171 | > the maximal margin hyperplane.
172 |
173 | ```{r}
174 | p + geom_point(data = data[7, , drop = FALSE], size = 4, color = "purple")
175 | ```
176 |
177 | The 7th point is shown in purple above. It is not a support vector, and it is
178 | not close to the margin, so small changes in its $X_1$, $X_2$ values would not
179 | affect the maximal margin hyperplane.
180 |
181 | > g. Sketch a hyperplane that is _not_ the optimal separating hyperplane, and
182 | > provide the equation for this hyperplane.
183 |
184 | A non-optimal hyperplane that still separates the blue and red points would
185 | be one that touches the (red) point at X1 = 2, X2 = 2 and the (blue) point at
186 | X1 = 4, X2 = 3. This gives the line $y = x/2 + 1$, or, in the form of (9.1),
187 | $\beta_0 = -1$, $\beta_1 = -1/2$, $\beta_2 = 1$.
188 |
189 | ```{r}
190 | p + geom_abline(intercept = 1, slope = 0.5, lty = 2, col = "red")
191 | ```
192 |
193 | > h. Draw an additional observation on the plot so that the two classes are no
194 | > longer separable by a hyperplane.
195 |
196 | ```{r}
197 | p + geom_point(data = data.frame(X1 = 1, X2 = 3, Y = "Blue"), shape = 15, size = 4)
198 | ```
199 |
200 | ## Applied
201 |
202 | ### Question 4
203 |
204 | > Generate a simulated two-class data set with 100 observations and two features
205 | > in which there is a visible but non-linear separation between the two classes.
206 | > Show that in this setting, a support vector machine with a polynomial kernel
207 | > (with degree greater than 1) or a radial kernel will outperform a support
208 | > vector classifier on the training data. Which technique performs best on the
209 | > test data? Make plots and report training and test error rates in order to
210 | > back up your assertions.
211 |
212 | ```{r}
213 | set.seed(10)
214 | data <- data.frame(
215 | x = runif(100),
216 | y = runif(100)
217 | )
218 | score <- (2 * data$x - 0.5)^2 + (data$y)^2 - 0.5
219 | data$class <- factor(ifelse(score > 0, "red", "blue"))
220 |
221 | p <- ggplot(data, aes(x = x, y = y, color = class)) +
222 | geom_point(size = 2) +
223 | scale_colour_identity()
224 | p
225 |
226 | train <- 1:50
227 | test <- 51:100
228 |
229 | fits <- list(
230 | "Radial" = svm(class ~ ., data = data[train, ], kernel = "radial"),
231 | "Polynomial" = svm(class ~ ., data = data[train, ], kernel = "polynomial", degree = 2),
232 | "Linear" = svm(class ~ ., data = data[train, ], kernel = "linear")
233 | )
234 |
235 | err <- function(model, data) {
236 | out <- table(predict(model, data), data$class)
237 | (out[1, 2] + out[2, 1]) / sum(out)
238 | }
239 | plot(fits[[1]], data)
240 | plot(fits[[2]], data)
241 | plot(fits[[3]], data)
242 | sapply(fits, err, data = data[train, ])
243 | sapply(fits, err, data = data[test, ])
244 | ```
245 |
246 | In this case, the radial kernel performs best, followed by the linear kernel,
247 | with the 2nd-degree polynomial performing worst. The ordering of these models
248 | is the same for the training and test data sets.
249 |
250 | ### Question 5
251 |
252 | > We have seen that we can fit an SVM with a non-linear kernel in order to
253 | > perform classification using a non-linear decision boundary. We will now see
254 | > that we can also obtain a non-linear decision boundary by performing logistic
255 | > regression using non-linear transformations of the features.
256 | >
257 | > a. Generate a data set with $n = 500$ and $p = 2$, such that the observations
258 | > belong to two classes with a quadratic decision boundary between them. For
259 | > instance, you can do this as follows:
260 | >
261 | > ```r
262 | > > x1 <- runif(500) - 0.5
263 | > > x2 <- runif(500) - 0.5
264 | > > y <- 1 * (x1^2 - x2^2 > 0)
265 | > ```
266 |
267 | ```{r}
268 | set.seed(42)
269 | train <- data.frame(
270 | x1 = runif(500) - 0.5,
271 | x2 = runif(500) - 0.5
272 | )
273 | train$y <- factor(as.numeric((train$x1^2 - train$x2^2 > 0)))
274 | ```
275 |
276 | > b. Plot the observations, colored according to their class labels. Your plot
277 | > should display $X_1$ on the $x$-axis, and $X_2$ on the $y$-axis.
278 |
279 | ```{r}
280 | p <- ggplot(train, aes(x = x1, y = x2, color = y)) +
281 | geom_point(size = 2)
282 | p
283 | ```
284 |
285 | > c. Fit a logistic regression model to the data, using $X_1$ and $X_2$ as
286 | > predictors.
287 |
288 | ```{r}
289 | fit1 <- glm(y ~ ., data = train, family = "binomial")
290 | ```
291 |
292 | > d. Apply this model to the _training data_ in order to obtain a predicted class
293 | > label for each training observation. Plot the observations, colored
294 | > according to the _predicted_ class labels. The decision boundary should be
295 | > linear.
296 |
297 | ```{r}
298 | plot_model <- function(fit) {
299 | if (inherits(fit, "svm")) {
300 | train$p <- predict(fit)
301 | } else {
302 | train$p <- factor(as.numeric(predict(fit) > 0))
303 | }
304 | ggplot(train, aes(x = x1, y = x2, color = p)) +
305 | geom_point(size = 2)
306 | }
307 |
308 | plot_model(fit1)
309 | ```
310 |
311 | > e. Now fit a logistic regression model to the data using non-linear functions
312 | > of $X_1$ and $X_2$ as predictors (e.g. $X_1^2, X_1 \times X_2, \log(X_2),$
313 | > and so forth).
314 |
315 | ```{r}
316 | fit2 <- glm(y ~ poly(x1, 2) + poly(x2, 2), data = train, family = "binomial")
317 | ```
318 |
319 | > f. Apply this model to the _training data_ in order to obtain a predicted
320 | > class label for each training observation. Plot the observations, colored
321 | > according to the _predicted_ class labels. The decision boundary should be
322 | > obviously non-linear. If it is not, then repeat (a)-(e) until you come up
323 | > with an example in which the predicted class labels are obviously
324 | > non-linear.
325 |
326 | ```{r}
327 | plot_model(fit2)
328 | ```
329 |
330 | > g. Fit a support vector classifier to the data with $X_1$ and $X_2$ as
331 | > predictors. Obtain a class prediction for each training observation. Plot
332 | > the observations, colored according to the _predicted class labels_.
333 |
334 | ```{r}
335 | fit3 <- svm(y ~ x1 + x2, data = train, kernel = "linear")
336 | plot_model(fit3)
337 | ```
338 |
339 | > h. Fit a SVM using a non-linear kernel to the data. Obtain a class prediction
340 | > for each training observation. Plot the observations, colored according to
341 | > the _predicted class labels_.
342 |
343 | ```{r}
344 | fit4 <- svm(y ~ x1 + x2, data = train, kernel = "polynomial", degree = 2)
345 | plot_model(fit4)
346 | ```
347 |
348 | > i. Comment on your results.
349 |
350 | When simulating data with a quadratic decision boundary, a logistic model with
351 | quadratic transformations of the variables and an SVM with a quadratic kernel
352 | both produce much better (and similar) fits than the standard linear methods.
353 |
354 | ### Question 6
355 |
356 | > At the end of Section 9.6.1, it is claimed that in the case of data that is
357 | > just barely linearly separable, a support vector classifier with a small
358 | > value of `cost` that misclassifies a couple of training observations may
359 | > perform better on test data than one with a huge value of `cost` that does not
360 | > misclassify any training observations. You will now investigate this claim.
361 | >
362 | > a. Generate two-class data with $p = 2$ in such a way that the classes are
363 | > just barely linearly separable.
364 |
365 | ```{r}
366 | set.seed(2)
367 |
368 | # Simulate data that is separable by a line at y = 2.5
369 | data <- data.frame(
370 | x = rnorm(200),
371 | class = sample(c("red", "blue"), 200, replace = TRUE)
372 | )
373 | data$y <- (data$class == "red") * 5 + rnorm(200)
374 |
375 | # Add barely separable points (these are simulated "noise" values)
376 | newdata <- data.frame(x = rnorm(30))
377 | newdata$y <- 1.5 * newdata$x + 3 + rnorm(30, 0, 1)
378 | newdata$class <- ifelse((1.5 * newdata$x + 3) - newdata$y > 0, "blue", "red")
379 |
380 | data <- rbind(data, newdata)
381 |
382 | # remove any that cause misclassification, leaving data that is barely linearly
383 | # separable, but along an axis that is not y = 2.5 (which would be correct
384 | # for the "true" data)
385 | data <- data[!(data$class == "red") == ((1.5 * data$x + 3 - data$y) > 0), ]
386 | data <- data[sample(seq_len(nrow(data)), 200), ]
387 |
388 | p <- ggplot(data, aes(x = x, y = y, color = class)) +
389 | geom_point(size = 2) +
390 | scale_colour_identity() +
391 | geom_abline(intercept = 3, slope = 1.5, lty = 2)
392 | p
393 | ```
394 |
395 | > b. Compute the cross-validation error rates for support vector classifiers
396 | > with a range of `cost` values. How many training errors are misclassified
397 | > for each value of `cost` considered, and how does this relate to the
398 | > cross-validation errors obtained?
399 |
400 | Number of training observations misclassified for each value of `cost`:
401 |
402 | ```{r}
403 | costs <- 10^seq(-3, 5)
404 |
405 | sapply(costs, function(cost) {
406 | fit <- svm(as.factor(class) ~ ., data = data, kernel = "linear", cost = cost)
407 | pred <- predict(fit, data)
408 | sum(pred != data$class)
409 | })
410 | ```
411 |
412 | Cross-validation errors
413 |
414 | ```{r}
415 | out <- tune(svm, as.factor(class) ~ ., data = data, kernel = "linear", ranges = list(cost = costs))
416 | summary(out)
417 | data.frame(
418 | cost = out$performances$cost,
419 | misclass = out$performances$error * nrow(data)
420 | )
421 | ```
422 |
423 | > c. Generate an appropriate test data set, and compute the test errors
424 | > corresponding to each of the values of `cost` considered. Which value of
425 | > `cost` leads to the fewest test errors, and how does this compare to the
426 | > values of `cost` that yield the fewest training errors and the fewest
427 | > cross-validation errors?
428 |
429 | ```{r}
430 | set.seed(2)
431 | test <- data.frame(
432 | x = rnorm(200),
433 | class = sample(c("red", "blue"), 200, replace = TRUE)
434 | )
435 | test$y <- (test$class == "red") * 5 + rnorm(200)
436 | p + geom_point(data = test, pch = 21)
437 |
438 | (errs <- sapply(costs, function(cost) {
439 | fit <- svm(as.factor(class) ~ ., data = data, kernel = "linear", cost = cost)
440 | pred <- predict(fit, test)
441 | sum(pred != test$class)
442 | }))
443 | (cost <- costs[which.min(errs)])
444 |
445 | (fit <- svm(as.factor(class) ~ ., data = data, kernel = "linear", cost = cost))
446 |
447 | test$prediction <- predict(fit, test)
448 | p <- ggplot(test, aes(x = x, y = y, color = class, shape = prediction == class)) +
449 | geom_point(size = 2) +
450 | scale_colour_identity()
451 | p
452 | ```
453 |
454 | > d. Discuss your results.
455 |
456 | A large cost leads to overfitting as the model finds the perfect linear
457 | separation between red and blue in the training data. A lower cost then
458 | leads to improved prediction in the test data.
459 |
460 | ### Question 7
461 |
462 | > In this problem, you will use support vector approaches in order to predict
463 | > whether a given car gets high or low gas mileage based on the `Auto` data set.
464 | >
465 | > a. Create a binary variable that takes on a 1 for cars with gas mileage above
466 | > the median, and a 0 for cars with gas mileage below the median.
467 |
468 | ```{r}
469 | library(ISLR2)
470 | data <- Auto
471 | data$high_mpg <- as.factor(as.numeric(data$mpg > median(data$mpg)))
472 | ```
473 |
474 | > b. Fit a support vector classifier to the data with various values of `cost`,
475 | > in order to predict whether a car gets high or low gas mileage. Report the
476 | > cross-validation errors associated with different values of this parameter.
477 | > Comment on your results. Note you will need to fit the classifier without
478 | > the gas mileage variable to produce sensible results.
479 |
480 | ```{r}
481 | set.seed(42)
482 | costs <- 10^seq(-4, 3, by = 0.5)
483 | results <- list()
484 | f <- high_mpg ~ displacement + horsepower + weight
485 | results$linear <- tune(svm, f,
486 | data = data, kernel = "linear",
487 | ranges = list(cost = costs)
488 | )
489 | summary(results$linear)
490 | ```
491 |
492 | > c. Now repeat (b), this time using SVMs with radial and polynomial basis
493 | > kernels, with different values of `gamma` and `degree` and `cost`. Comment
494 | > on your results.
495 |
496 | ```{r}
497 | results$polynomial <- tune(svm, f,
498 | data = data, kernel = "polynomial",
499 | ranges = list(cost = costs, degree = 1:3)
500 | )
501 | summary(results$polynomial)
502 |
503 | results$radial <- tune(svm, f,
504 | data = data, kernel = "radial",
505 | ranges = list(cost = costs, gamma = 10^(-2:1))
506 | )
507 | summary(results$radial)
508 |
509 | sapply(results, function(x) x$best.performance)
510 | sapply(results, function(x) x$best.parameters)
511 | ```
512 |
513 | > d. Make some plots to back up your assertions in (b) and (c).
514 | >
515 | > _Hint: In the lab, we used the `plot()` function for `svm` objects only in
516 | > cases with $p = 2$. When $p > 2$, you can use the `plot()` function to
517 | > create plots displaying pairs of variables at a time. Essentially, instead
518 | > of typing_
519 | >
520 | > ```r
521 | > > plot(svmfit, dat)
522 | > ```
523 | >
524 | > _where `svmfit` contains your fitted model and dat is a data frame
525 | > containing your data, you can type_
526 | >
527 | > ```r
528 | > > plot(svmfit, dat, x1 ∼ x4)
529 | > ```
530 | >
531 | > _in order to plot just the first and fourth variables. However, you must
532 | > replace `x1` and `x4` with the correct variable names. To find out more,
533 | > type `?plot.svm`._
534 |
535 | ```{r}
536 | table(predict(results$radial$best.model, data), data$high_mpg)
537 |
538 | plot(results$radial$best.model, data, horsepower ~ displacement)
539 | plot(results$radial$best.model, data, horsepower ~ weight)
540 | plot(results$radial$best.model, data, displacement ~ weight)
541 | ```
542 |
543 | ### Question 8
544 |
545 | > This problem involves the `OJ` data set which is part of the `ISLR2` package.
546 | >
547 | > a. Create a training set containing a random sample of 800 observations, and a
548 | > test set containing the remaining observations.
549 |
550 | ```{r}
551 | set.seed(42)
552 | train <- sample(seq_len(nrow(OJ)), 800)
553 | test <- setdiff(seq_len(nrow(OJ)), train)
554 | ```
555 |
556 | > b. Fit a support vector classifier to the training data using `cost = 0.01`,
557 | > with `Purchase` as the response and the other variables as predictors. Use
558 | > the `summary()` function to produce summary statistics, and describe the
559 | > results obtained.
560 |
561 | ```{r}
562 | fit <- svm(Purchase ~ ., data = OJ[train, ], kernel = "linear", cost = 0.01)
563 | summary(fit)
564 | ```
565 |
566 | > c. What are the training and test error rates?
567 |
568 | ```{r}
569 | err <- function(model, data) {
570 | t <- table(predict(model, data), data[["Purchase"]])
571 | 1 - sum(diag(t)) / sum(t)
572 | }
573 | errs <- function(model) {
574 | c(train = err(model, OJ[train, ]), test = err(model, OJ[test, ]))
575 | }
576 | errs(fit)
577 | ```
578 |
579 | > d. Use the `tune()` function to select an optimal cost. Consider values in the
580 | > range 0.01 to 10.
581 |
582 | ```{r}
583 | tuned <- tune(svm, Purchase ~ .,
584 | data = OJ[train, ], kernel = "linear",
585 | ranges = list(cost = 10^seq(-2, 1, length.out = 10))
586 | )
587 | tuned$best.parameters
588 | summary(tuned)
589 | ```
590 |
591 | > e. Compute the training and test error rates using this new value for `cost`.
592 |
593 | ```{r}
594 | errs(tuned$best.model)
595 | ```
596 |
597 | > f. Repeat parts (b) through (e) using a support vector machine with a radial
598 | > kernel. Use the default value for `gamma`.
599 |
600 | ```{r}
601 | tuned2 <- tune(svm, Purchase ~ .,
602 | data = OJ[train, ], kernel = "radial",
603 | ranges = list(cost = 10^seq(-2, 1, length.out = 10))
604 | )
605 | tuned2$best.parameters
606 | errs(tuned2$best.model)
607 | ```
608 |
609 | > g. Repeat parts (b) through (e) using a support vector machine with a
610 | > polynomial kernel. Set `degree = 2`.
611 |
612 | ```{r}
613 | tuned3 <- tune(svm, Purchase ~ .,
614 | data = OJ[train, ], kernel = "polynomial",
615 | ranges = list(cost = 10^seq(-2, 1, length.out = 10)), degree = 2
616 | )
617 | tuned3$best.parameters
618 | errs(tuned3$best.model)
619 | ```
620 |
621 | > h. Overall, which approach seems to give the best results on this data?
622 |
623 | Overall, the radial kernel appears to perform best in this case.
624 |
--------------------------------------------------------------------------------
/08-tree-based-methods.Rmd:
--------------------------------------------------------------------------------
1 | # Tree-Based Methods
2 |
3 | ## Conceptual
4 |
5 | ### Question 1
6 |
7 | > Draw an example (of your own invention) of a partition of two-dimensional
8 | > feature space that could result from recursive binary splitting. Your example
9 | > should contain at least six regions. Draw a decision tree corresponding to
10 | > this partition. Be sure to label all aspects of your figures, including the
11 | > regions $R_1, R_2, ...,$ the cutpoints $t_1, t_2, ...,$ and so forth.
12 | >
13 | > _Hint: Your result should look something like Figures 8.1 and 8.2._
14 |
15 | ```{r, message = FALSE, warning = FALSE}
16 | library(showtext)
17 | showtext::showtext_auto()
18 | library(ggplot2)
19 | library(tidyverse)
20 | library(ggtree)
21 | ```
22 |
23 | ```{r}
24 | tree <- ape::read.tree(text = "(((R1:1,R2:1)N1:2,R3:4)N2:2,(R4:2,(R5:1,R6:1)N3:2)N4:5)R;")
25 | tree$node.label <- c("Age < 40", "Weight < 100", "Weight < 70", "Age < 60", "Weight < 80")
26 |
27 | ggtree(tree, ladderize = FALSE) + scale_x_reverse() + coord_flip() +
28 | geom_tiplab(vjust = 2, hjust = 0.5) +
29 | geom_text2(aes(label = label, subset = !isTip), hjust = -0.1, vjust = -1)
30 | ```
31 |
32 | ```{r}
33 | plot(NULL,
34 | xlab = "Age (years)", ylab = "Weight (kg)",
35 | xlim = c(0, 100), ylim = c(40, 160), xaxs = "i", yaxs = "i"
36 | )
37 | abline(v = 40, col = "red", lty = 2)
38 | lines(c(0, 40), c(100, 100), col = "blue", lty = 2)
39 | lines(c(0, 40), c(70, 70), col = "blue", lty = 2)
40 | abline(v = 60, col = "red", lty = 2)
41 | lines(c(60, 100), c(80, 80), col = "blue", lty = 2)
42 |
43 | text(
44 | c(20, 20, 20, 50, 80, 80),
45 | c(55, 85, 130, 100, 60, 120),
46 | labels = c("R1", "R2", "R3", "R4", "R5", "R6")
47 | )
48 | ```
49 |
50 | ### Question 2
51 |
52 | > It is mentioned in Section 8.2.3 that boosting using depth-one trees (or
53 | > _stumps_) leads to an _additive_ model: that is, a model of the form
54 | > $$
55 | > f(X) = \sum_{j=1}^p f_j(X_j).
56 | > $$
57 | > Explain why this is the case. You can begin with (8.12) in Algorithm 8.2.
58 |
59 | Equation 8.12 (the boosted model output by Algorithm 8.2) is:
60 |
61 | $$
62 | \hat{f}(x) = \sum_{b=1}^B \lambda \hat{f}^b(x)
63 | $$
64 |
65 | where $\hat{f}^b(x)$ represents the $b$th tree, which in this case has a single
66 | split and therefore involves only one predictor. Grouping the stumps by the
67 | predictor they split on gives an additive model in the individual predictors;
68 | depth-two trees would additionally allow interactions between pairs of variables.
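
Making the grouping step explicit: if stump $\hat{f}^b$ splits on predictor
$j(b)$, then

$$
f(X) = \sum_{b=1}^B \lambda \hat{f}^b(X)
     = \sum_{j=1}^p \sum_{b:\, j(b)=j} \lambda \hat{f}^b(X_j)
     = \sum_{j=1}^p f_j(X_j),
$$

where $f_j(X_j) = \sum_{b:\, j(b)=j} \lambda \hat{f}^b(X_j)$ collects the stumps
that split on $X_j$.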
69 |
70 | ### Question 3
71 |
72 | > Consider the Gini index, classification error, and cross-entropy in a simple
73 | > classification setting with two classes. Create a single plot that displays
74 | > each of these quantities as a function of $\hat{p}_{m1}$. The $x$-axis should
75 | > display $\hat{p}_{m1}$, ranging from 0 to 1, and the $y$-axis should display
76 | > the value of the Gini index, classification error, and entropy.
77 | >
78 | > _Hint: In a setting with two classes, $\hat{p}_{m1} = 1 - \hat{p}_{m2}$. You
79 | > could make this plot by hand, but it will be much easier to make in `R`._
80 |
81 | The *Gini index* is defined by
82 |
83 | $$G = \sum_{k=1}^{K} \hat{p}_{mk}(1 - \hat{p}_{mk})$$
84 |
85 | *Entropy* is given by
86 |
87 | $$D = -\sum_{k=1}^{K} \hat{p}_{mk}\log(\hat{p}_{mk})$$
88 |
89 | The *classification error* is
90 |
91 | $$E = 1 - \max_k(\hat{p}_{mk})$$
92 |
93 | ```{r}
94 | # These expressions simplify as follows when there are only two classes
95 | p <- seq(0, 1, length.out = 100)
96 | data.frame(
97 | x = p,
98 | "Gini index" = p * (1 - p) * 2,
99 | "Entropy" = -(p * log(p) + (1 - p) * log(1 - p)),
100 | "Classification error" = 1 - pmax(p, 1 - p),
101 | check.names = FALSE
102 | ) |>
103 | pivot_longer(!x) |>
104 | ggplot(aes(x = x, y = value, color = name)) +
105 | geom_line(na.rm = TRUE)
106 | ```
107 |
108 | ### Question 4
109 |
110 | > This question relates to the plots in Figure 8.12.
111 | >
112 | > a. Sketch the tree corresponding to the partition of the predictor space
113 | > illustrated in the left-hand panel of Figure 8.12. The numbers inside the
114 | > boxes indicate the mean of $Y$ within each region.
115 |
116 | ```{r}
117 | tree <- ape::read.tree(text = "(((3:1.5,(10:1,0:1)A:1)B:1,15:2)C:1,5:2)D;")
118 | tree$node.label <- c("X1 < 1", "X2 < 1", "X1 < 0", "X2 < 0")
119 |
120 | ggtree(tree, ladderize = FALSE) + scale_x_reverse() + coord_flip() +
121 | geom_tiplab(vjust = 2, hjust = 0.5) +
122 | geom_text2(aes(label = label, subset = !isTip), hjust = -0.1, vjust = -1)
123 | ```
124 |
125 | > b. Create a diagram similar to the left-hand panel of Figure 8.12, using the
126 | > tree illustrated in the right-hand panel of the same figure. You should
127 | > divide up the predictor space into the correct regions, and indicate the
128 | > mean for each region.
129 |
130 | ```{r}
131 | plot(NULL, xlab = "X1", ylab = "X2", xlim = c(-1, 2), ylim = c(0, 3), xaxs = "i", yaxs = "i")
132 | abline(h = 1, col = "red", lty = 2)
133 | lines(c(1, 1), c(0, 1), col = "blue", lty = 2)
134 | lines(c(-1, 2), c(2, 2), col = "red", lty = 2)
135 | lines(c(0, 0), c(1, 2), col = "blue", lty = 2)
136 | text(
137 | c(0, 1.5, -0.5, 1, 0.5),
138 | c(0.5, 0.5, 1.5, 1.5, 2.5),
139 | labels = c("-1.80", "0.63", "-1.06", "0.21", "2.49")
140 | )
141 | ```
142 |
143 | ### Question 5
144 |
145 | > Suppose we produce ten bootstrapped samples from a data set containing red and
146 | > green classes. We then apply a classification tree to each bootstrapped sample
147 | > and, for a specific value of $X$, produce 10 estimates of
148 | > $P(\textrm{Class is Red}|X)$:
149 | > $$0.1, 0.15, 0.2, 0.2, 0.55, 0.6, 0.6, 0.65, 0.7, \textrm{and } 0.75.$$
150 | > There are two common ways to combine these results together into a single
151 | > class prediction. One is the majority vote approach discussed in this chapter.
152 | > The second approach is to classify based on the average probability. In this
153 | > example, what is the final classification under each of these two approaches?
154 |
155 | ```{r}
156 | x <- c(0.1, 0.15, 0.2, 0.2, 0.55, 0.6, 0.6, 0.65, 0.7, 0.75)
157 | ifelse(mean(x > 0.5) > 0.5, "red", "green") # majority vote
158 | ifelse(mean(x) > 0.5, "red", "green") # average probability
159 | ```
160 |
161 | ### Question 6
162 |
163 | > Provide a detailed explanation of the algorithm that is used to fit a
164 | > regression tree.
165 |
166 | First we perform recursive binary splitting of the data, choosing at each
167 | split the predictor and cutpoint that minimize the RSS, and continuing until
168 | each terminal node contains fewer than some minimum number of observations
169 | (e.g. 5). We then apply cost-complexity pruning, which indexes a sequence of
170 | subtrees by a tuning parameter $\alpha$. Using K-fold cross-validation, we
171 | select the $\alpha$ that minimizes the cross-validation error. The final tree
172 | is the tree grown on the full data set, pruned with this value of $\alpha$.
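
A minimal sketch of these steps, using the `tree` package with the `Boston`
data purely as an illustration:

```{r}
library(tree)
library(ISLR2)
fit <- tree(medv ~ ., data = Boston) # recursive binary splitting on RSS
cv <- cv.tree(fit) # K-fold CV over the cost-complexity pruning sequence
best <- cv$size[which.min(cv$dev)] # subtree size minimising the CV deviance
pruned <- prune.tree(fit, best = best) # final pruned tree
```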
172 |
173 | ## Applied
174 |
175 | ### Question 7
176 |
177 | > In the lab, we applied random forests to the `Boston` data using `mtry = 6`
178 | > and using `ntree = 25` and `ntree = 500`. Create a plot displaying the test
179 | > error resulting from random forests on this data set for a more comprehensive
180 | > range of values for `mtry` and `ntree`. You can model your plot after Figure
181 | > 8.10. Describe the results obtained.
182 |
183 | ```{r}
184 | library(ISLR2)
185 | library(randomForest)
186 | set.seed(42)
187 |
188 | train <- sample(c(TRUE, FALSE), nrow(Boston), replace = TRUE)
189 |
190 | rf_err <- function(mtry) {
191 | randomForest(
192 | Boston[train, -13],
193 | y = Boston[train, 13],
194 | xtest = Boston[!train, -13],
195 | ytest = Boston[!train, 13],
196 | mtry = mtry,
197 | ntree = 500
198 | )$test$mse
199 | }
200 | res <- lapply(c(1, 2, 3, 5, 7, 10, 12), rf_err)
201 | names(res) <- c(1, 2, 3, 5, 7, 10, 12)
202 | data.frame(res, check.names = FALSE) |>
203 | mutate(n = 1:500) |>
204 | pivot_longer(!n) |>
205 | ggplot(aes(x = n, y = value, color = name)) +
206 | geom_line(na.rm = TRUE) +
207 | xlab("Number of trees") +
208 | ylab("Error") +
209 | scale_y_log10() +
210 | scale_color_discrete(name = "No. variables at\neach split")
211 | ```
212 |
213 | ### Question 8
214 |
215 | > In the lab, a classification tree was applied to the `Carseats` data set after
216 | > converting `Sales` into a qualitative response variable. Now we will seek to
217 | > predict `Sales` using regression trees and related approaches, treating the
218 | > response as a quantitative variable.
219 | >
220 | > a. Split the data set into a training set and a test set.
221 |
222 | ```{r}
223 | set.seed(42)
224 | train <- sample(c(TRUE, FALSE), nrow(Carseats), replace = TRUE)
225 | ```
226 |
227 | > b. Fit a regression tree to the training set. Plot the tree, and interpret the
228 | > results. What test error rate do you obtain?
229 |
230 | ```{r}
231 | library(tree)
232 | tr <- tree(Sales ~ ., data = Carseats[train, ])
233 | summary(tr)
234 | plot(tr)
235 | text(tr, pretty = 0, digits = 2, cex = 0.8)
236 |
237 | carseats_mse <- function(model) {
238 | p <- predict(model, newdata = Carseats[!train, ])
239 | mean((p - Carseats[!train, "Sales"])^2)
240 | }
241 | carseats_mse(tr)
242 | ```
243 |
244 | > c. Use cross-validation in order to determine the optimal level of tree
245 | > complexity. Does pruning the tree improve the test error rate?
246 |
247 | ```{r}
248 | res <- cv.tree(tr)
249 | plot(res$size, res$dev, type = "b", xlab = "Tree size", ylab = "Deviance")
250 | min <- which.min(res$dev)
251 | abline(v = res$size[min], lty = 2, col = "red")
252 | ```
253 |
254 | Pruning improves performance very slightly (though this is not repeatable in
255 | different rounds of cross-validation). Arguably, a good balance is achieved
256 | when the tree size is 11.
257 |
258 | ```{r}
259 | ptr <- prune.tree(tr, best = 11)
260 | plot(ptr)
261 | text(ptr, pretty = 0, digits = 2, cex = 0.8)
262 | carseats_mse(ptr)
263 | ```
264 |
265 | > d. Use the bagging approach in order to analyze this data. What test error
266 | > rate do you obtain? Use the `importance()` function to determine which
267 | > variables are most important.
268 |
269 | ```{r}
270 | # Here we can use random Forest with mtry = 10 = p (the number of predictor
271 | # variables) to perform bagging
272 | bagged <- randomForest(Sales ~ .,
273 | data = Carseats[train, ], mtry = 10,
274 | ntree = 200, importance = TRUE
275 | )
276 | carseats_mse(bagged)
277 | importance(bagged)
278 | ```
279 |
280 | The test MSE is ~2.8, which is a substantial improvement over the pruned
281 | regression tree above.
282 |
283 | > e. Use random forests to analyze this data. What test error rate do you
284 | > obtain? Use the `importance()` function to determine which variables are
285 | > most important. Describe the effect of $m$, the number of variables
286 | > considered at each split, on the error rate obtained.
287 |
288 | ```{r}
289 | rf <- randomForest(Sales ~ .,
290 | data = Carseats[train, ], mtry = 3,
291 | ntree = 500, importance = TRUE
292 | )
293 | carseats_mse(rf)
294 | importance(rf)
295 | ```
296 |
297 | The test MSE is ~3.0, which is a substantial improvement over the pruned
298 | regression tree above, although not quite as good as the bagging approach.
299 |
300 | > f. Now analyze the data using BART, and report your results.
301 |
302 | ```{r}
303 | library(BART)
304 |
305 | # For ease, we'll create a fake "predict" method that just returns
306 | # yhat.test.mean regardless of provided "newdata"
307 | predict.wbart <- function(model, ...) model$yhat.test.mean
308 |
309 | bartfit <- gbart(Carseats[train, 2:11], Carseats[train, 1],
310 | x.test = Carseats[!train, 2:11]
311 | )
312 | carseats_mse(bartfit)
313 | ```
314 |
315 | The test MSE is ~1.6, which is an improvement over both random forests and
316 | bagging.
317 |
318 | ### Question 9
319 |
320 | > This problem involves the `OJ` data set which is part of the `ISLR2` package.
321 | >
322 | > a. Create a training set containing a random sample of 800 observations, and a
323 | > test set containing the remaining observations.
324 |
325 | ```{r}
326 | set.seed(42)
327 | train <- sample(1:nrow(OJ), 800)
328 | test <- setdiff(1:nrow(OJ), train)
329 | ```
330 |
331 | > b. Fit a tree to the training data, with `Purchase` as the response and the
332 | > other variables except for `Buy` as predictors. Use the `summary()`
333 | > function to produce summary statistics about the tree, and describe the
334 | > results obtained. What is the training error rate? How many terminal nodes
335 | > does the tree have?
336 |
337 | ```{r}
338 | tr <- tree(Purchase ~ ., data = OJ[train, ])
339 | summary(tr)
340 | ```
341 |
342 | > c. Type in the name of the tree object in order to get a detailed text output.
343 | > Pick one of the terminal nodes, and interpret the information displayed.
344 |
345 | ```{r}
346 | tr
347 | ```
348 |
349 | > d. Create a plot of the tree, and interpret the results.
350 |
351 | ```{r}
352 | plot(tr)
353 | text(tr, pretty = 0, digits = 2, cex = 0.8)
354 | ```
355 |
356 | > e. Predict the response on the test data, and produce a confusion matrix
357 | > comparing the test labels to the predicted test labels. What is the test
358 | > error rate?
359 |
360 | ```{r}
361 | table(predict(tr, OJ[test, ], type = "class"), OJ[test, "Purchase"])
362 | mean(predict(tr, OJ[test, ], type = "class") != OJ[test, "Purchase"])
362 | ```
363 |
364 | > f. Apply the `cv.tree()` function to the training set in order to determine
365 | > the optimal tree size.
366 |
367 | ```{r}
368 | set.seed(42)
369 | res <- cv.tree(tr)
370 | ```
371 |
372 | > g. Produce a plot with tree size on the $x$-axis and cross-validated
373 | > classification error rate on the $y$-axis.
374 |
375 | ```{r}
376 | plot(res$size, res$dev, type = "b", xlab = "Tree size", ylab = "Deviance")
377 | min <- which.min(res$dev)
378 | abline(v = res$size[min], lty = 2, col = "red")
379 | ```
380 |
381 | > h. Which tree size corresponds to the lowest cross-validated classification
382 | > error rate?
383 |
384 | ```{r}
385 | res$size[min]
386 | ```
387 |
388 | > i. Produce a pruned tree corresponding to the optimal tree size obtained using
389 | > cross-validation. If cross-validation does not lead to selection of a
390 | > pruned tree, then create a pruned tree with five terminal nodes.
391 |
392 | ```{r}
393 | ptr <- prune.tree(tr, best = res$size[min])
394 | plot(ptr)
395 | text(ptr, pretty = 0, digits = 2, cex = 0.8)
396 | ```
397 |
398 | > j. Compare the training error rates between the pruned and unpruned trees.
399 | > Which is higher?
400 |
401 | ```{r}
402 | oj_misclass <- function(model) {
403 | summary(model)$misclass[1] / summary(model)$misclass[2]
404 | }
405 | oj_misclass(tr)
406 | oj_misclass(ptr)
407 | ```
408 |
409 | The training misclassification error rate is slightly higher for the pruned tree.
410 |
411 | > k. Compare the test error rates between the pruned and unpruned trees. Which
412 | > is higher?
413 |
414 | ```{r}
415 | oj_err <- function(model) {
416 | p <- predict(model, newdata = OJ[test, ], type = "class")
417 | mean(p != OJ[test, "Purchase"])
418 | }
419 | oj_err(tr)
420 | oj_err(ptr)
421 | ```
422 |
423 | The test misclassification error rate is slightly higher for the pruned tree.
424 |
425 | ### Question 10
426 |
427 | > We now use boosting to predict `Salary` in the `Hitters` data set.
428 | >
429 | > a. Remove the observations for whom the salary information is unknown, and
430 | > then log-transform the salaries.
431 |
432 | ```{r}
433 | dat <- Hitters
434 | dat <- dat[!is.na(dat$Salary), ]
435 | dat$Salary <- log(dat$Salary)
436 | ```
437 |
438 | > b. Create a training set consisting of the first 200 observations, and a test
439 | > set consisting of the remaining observations.
440 |
441 | ```{r}
442 | train <- 1:200
443 | test <- setdiff(1:nrow(dat), train)
444 | ```
445 |
446 | > c. Perform boosting on the training set with 1,000 trees for a range of values
447 | > of the shrinkage parameter $\lambda$. Produce a plot with different
448 | > shrinkage values on the $x$-axis and the corresponding training set MSE on
449 | > the $y$-axis.
450 |
451 | ```{r}
452 | library(gbm)
453 | set.seed(42)
454 | lambdas <- 10^seq(-3, 0, by = 0.1)
455 | fits <- lapply(lambdas, function(lam) {
456 | gbm(Salary ~ .,
457 | data = dat[train, ], distribution = "gaussian",
458 | n.trees = 1000, shrinkage = lam
459 | )
460 | })
461 | errs <- sapply(fits, function(fit) {
462 | p <- predict(fit, dat[train, ], n.trees = 1000)
463 | mean((p - dat[train, ]$Salary)^2)
464 | })
465 | plot(lambdas, errs,
466 | type = "b", xlab = "Shrinkage values",
467 | ylab = "Training MSE", log = "xy"
468 | )
469 | ```
470 |
471 | > d. Produce a plot with different shrinkage values on the $x$-axis and the
472 | > corresponding test set MSE on the $y$-axis.
473 |
474 | ```{r}
475 | errs <- sapply(fits, function(fit) {
476 | p <- predict(fit, dat[test, ], n.trees = 1000)
477 | mean((p - dat[test, ]$Salary)^2)
478 | })
479 | plot(lambdas, errs,
480 | type = "b", xlab = "Shrinkage values",
481 |   ylab = "Test MSE", log = "xy"
482 | )
483 | min(errs)
484 | abline(v = lambdas[which.min(errs)], lty = 2, col = "red")
485 | ```
486 |
487 | > e. Compare the test MSE of boosting to the test MSE that results from applying
488 | > two of the regression approaches seen in Chapters 3 and 6.
489 |
490 | Linear regression
491 |
492 | ```{r}
493 | fit1 <- lm(Salary ~ ., data = dat[train, ])
494 | mean((predict(fit1, dat[test, ]) - dat[test, "Salary"])^2)
495 | ```
496 |
497 | The lasso (`alpha = 1`)
498 |
499 | ```{r}
500 | library(glmnet)
501 | x <- model.matrix(Salary ~ ., data = dat[train, ])
502 | x.test <- model.matrix(Salary ~ ., data = dat[test, ])
503 | y <- dat[train, "Salary"]
504 | fit2 <- glmnet(x, y, alpha = 1)
505 | mean((predict(fit2, s = 0.1, newx = x.test) - dat[test, "Salary"])^2)
506 | ```
507 |
508 | > f. Which variables appear to be the most important predictors in the boosted
509 | > model?
510 |
511 | ```{r}
512 | summary(fits[[which.min(errs)]])
513 | ```
514 |
515 | > g. Now apply bagging to the training set. What is the test set MSE for this
516 | > approach?
517 |
518 | ```{r}
519 | set.seed(42)
520 | bagged <- randomForest(Salary ~ ., data = dat[train, ], mtry = 19, ntree = 1000)
521 | mean((predict(bagged, newdata = dat[test, ]) - dat[test, "Salary"])^2)
522 | ```
523 |
524 | ### Question 11
525 |
526 | > This question uses the `Caravan` data set.
527 | >
528 | > a. Create a training set consisting of the first 1,000 observations, and a
529 | > test set consisting of the remaining observations.
530 |
531 | ```{r}
532 | train <- 1:1000
533 | test <- setdiff(1:nrow(Caravan), train)
534 | ```
535 |
536 | > b. Fit a boosting model to the training set with `Purchase` as the response
537 | > and the other variables as predictors. Use 1,000 trees, and a shrinkage
538 | > value of 0.01. Which predictors appear to be the most important?
539 |
540 | ```{r}
541 | set.seed(42)
542 | fit <- gbm(as.numeric(Purchase == "Yes") ~ ., data = Caravan[train, ], n.trees = 1000, shrinkage = 0.01)
543 | head(summary(fit))
544 | ```
545 |
546 | > c. Use the boosting model to predict the response on the test data. Predict
547 | > that a person will make a purchase if the estimated probability of purchase
548 | > is greater than 20%. Form a confusion matrix. What fraction of the people
549 | > predicted to make a purchase do in fact make one? How does this compare
550 | > with the results obtained from applying KNN or logistic regression to this
551 | > data set?
552 |
553 | ```{r}
554 | p <- predict(fit, Caravan[test, ], n.trees = 1000, type = "response")
555 | table(p > 0.2, Caravan[test, "Purchase"] == "Yes")
556 | sum(p > 0.2 & Caravan[test, "Purchase"] == "Yes") / sum(p > 0.2)
557 | ```
558 |
559 | 141 (109 + 32) people are predicted to make a purchase. Of these, 32 actually
560 | do, which is 21%.
560 |
561 | ```{r}
562 | # Logistic regression
563 | fit <- glm(Purchase == "Yes" ~ ., data = Caravan[train, ], family = "binomial")
564 | p <- predict(fit, Caravan[test, ], type = "response")
565 | table(p > 0.2, Caravan[test, "Purchase"] == "Yes")
566 | sum(p > 0.2 & Caravan[test, "Purchase"] == "Yes") / sum(p > 0.2)
567 | ```
568 |
569 | For logistic regression we correctly predict 14% of those predicted to purchase.
570 |
571 | ```{r}
572 | library(class)
573 | # KNN
574 | fit <- knn(Caravan[train, -86], Caravan[test, -86], Caravan$Purchase[train])
575 | table(fit, Caravan[test, "Purchase"] == "Yes")
576 | sum(fit == "Yes" & Caravan[test, "Purchase"] == "Yes") / sum(fit == "Yes")
577 | ```
578 |
579 | For KNN we correctly predict 8.7% of those predicted to purchase.
580 |
581 | ### Question 12
582 |
583 | > Apply boosting, bagging, random forests and BART to a data set of your choice.
584 | > Be sure to fit the models on a training set and to evaluate their performance
585 | > on a test set. How accurate are the results compared to simple methods like
586 | > linear or logistic regression? Which of these approaches yields the best
587 | > performance?
588 |
589 | Here I'm going to use the `College` dataset (as in Question 10 from Chapter 7),
590 | so that we can compare performance with the GAM we previously built. In this
591 | model we are trying to predict `Outstate` using the other variables in `College`.
592 |
593 | ```{r}
594 | library(gam)
595 | set.seed(42)
596 | train <- sample(1:nrow(College), 400)
597 | test <- setdiff(1:nrow(College), train)
598 |
599 | # Linear regression
600 | lr <- gam(Outstate ~ ., data = College[train, ])
601 |
602 | # GAM from chapter 7
603 | gam <- gam(Outstate ~ Private + s(Room.Board, 2) + s(PhD, 2) +
604 | s(perc.alumni, 2) + s(Expend, 2) + s(Grad.Rate, 2), data = College[train, ])
605 |
606 | # Boosting
607 | boosted <- gbm(Outstate ~ ., data = College[train, ], n.trees = 1000, shrinkage = 0.01)
608 |
609 | # Bagging (random forest with mtry = no. predictors)
610 | bagged <- randomForest(Outstate ~ ., data = College[train, ], mtry = 17, ntree = 1000)
611 |
612 | # Random forest with mtry = sqrt(no. predictors)
613 | rf <- randomForest(Outstate ~ ., data = College[train, ], mtry = 4, ntree = 1000)
614 |
615 | # BART
616 | pred <- setdiff(colnames(College), "Outstate")
617 | bart <- gbart(College[train, pred], College[train, "Outstate"],
618 | x.test = College[test, pred]
619 | )
620 |
621 | mse <- function(model, ...) {
622 | pred <- predict(model, College[test, ], ...)
623 | mean((College$Outstate[test] - pred)^2)
624 | }
625 |
626 | res <- c(
627 | "Linear regression" = mse(lr),
628 | "GAM" = mse(gam),
629 | "Boosting" = mse(boosted, n.trees = 1000),
630 | "Bagging" = mse(bagged),
631 | "Random forest" = mse(rf),
632 | "BART" = mse(bart)
633 | )
634 | res <- data.frame("MSE" = res)
635 | res$Model <- factor(row.names(res), levels = rev(row.names(res)))
636 | ggplot(res, aes(Model, MSE)) +
637 | coord_flip() +
638 | geom_bar(stat = "identity", fill = "steelblue")
639 | ```
640 |
641 | In this case, it looks like bagging produces the best performing model in terms
642 | of test mean square error.
643 |
--------------------------------------------------------------------------------
/10-deep-learning.Rmd:
--------------------------------------------------------------------------------
1 | # Deep Learning
2 |
3 | ## Conceptual
4 |
5 | ### Question 1
6 |
7 | > Consider a neural network with two hidden layers: $p = 4$ input units, 2 units
8 | > in the first hidden layer, 3 units in the second hidden layer, and a single
9 | > output.
10 | >
11 | > a. Draw a picture of the network, similar to Figures 10.1 or 10.4.
12 |
13 | ```{r, echo=FALSE, out.width="80%"}
14 | knitr::include_graphics("images/nn.png")
15 | ```
16 |
17 | > b. Write out an expression for $f(X)$, assuming ReLU activation functions. Be
18 | > as explicit as you can!
19 |
20 | The three layers (from our final output layer back to the start of our network)
21 | can be described as:
22 |
23 | \begin{align*}
24 | f(X) &= g(w_{0}^{(3)} + \sum^{K_2}_{l=1} w_{l}^{(3)} A_l^{(2)}) \\
25 | A_l^{(2)} &= h_l^{(2)}(X) = g(w_{l0}^{(2)} + \sum_{k=1}^{K_1} w_{lk}^{(2)} A_k^{(1)})\\
26 | A_k^{(1)} &= h_k^{(1)}(X) = g(w_{k0}^{(1)} + \sum_{j=1}^p w_{kj}^{(1)} X_j) \\
27 | \end{align*}
28 |
29 | for $l = 1, ..., K_2 = 3$ and $k = 1, ..., K_1 = 2$ and $p = 4$, where,
30 |
31 | $$
32 | g(z) = (z)_+ = \begin{cases}
33 | 0, & \text{if } z < 0 \\
34 | z, & \text{otherwise}
35 | \end{cases}
36 | $$
37 |
38 | > c. Now plug in some values for the coefficients and write out the value of
39 | > $f(X)$.
40 |
41 | We can perhaps achieve this most easily by fitting a real model. Note,
42 | in the plot shown here, we also include the "bias" or intercept terms.
43 |
44 | ```{r}
45 | library(ISLR2)
46 | library(neuralnet)
47 | library(sigmoid)
48 | set.seed(5)
49 | train <- sample(seq_len(nrow(ISLR2::Boston)), nrow(ISLR2::Boston) * 2 / 3)
50 |
51 | net <- neuralnet(crim ~ lstat + medv + ptratio + rm,
52 | data = ISLR2::Boston[train, ],
53 | act.fct = relu,
54 | hidden = c(2, 3)
55 | )
56 | plot(net)
57 | ```
58 |
59 | We can make a prediction for a given observation using this object.
60 |
61 | Firstly, let's find an "ambiguous" test sample
62 |
63 | ```{r}
64 | p <- predict(net, ISLR2::Boston[-train, ])
65 | x <- ISLR2::Boston[-train, ][which.min(abs(p - mean(c(max(p), min(p))))), ]
66 | x <- x[, c("lstat", "medv", "ptratio", "rm")]
67 | predict(net, x)
68 | ```
69 |
70 | Or, repeating by "hand":
71 |
72 | ```{r}
73 | g <- function(x) ifelse(x > 0, x, 0) # relu activation function
74 | w <- net$weights[[1]] # the estimated weights for each layer
75 | v <- as.numeric(x) # our input predictors
76 |
77 | # to calculate our prediction we can take the dot product of our predictors
78 | # (with 1 at the start for the bias term) and each layer's weights (lw)
79 | for (lw in w) v <- g(c(1, v) %*% lw)
80 | v
81 | ```
82 |
83 | > d. How many parameters are there?
84 |
85 | ```{r}
86 | length(unlist(net$weights))
87 | ```
88 |
89 | There are $4 \times 2 + 2 + 2 \times 3 + 3 + 3 \times 1 + 1 = 23$ parameters.
90 |
91 | ### Question 2
92 |
93 | > Consider the _softmax_ function in (10.13) (see also (4.13) on page 141)
94 | > for modeling multinomial probabilities.
95 | >
96 | > a. In (10.13), show that if we add a constant $c$ to each of the $z_l$, then
97 | > the probability is unchanged.
98 |
99 | If we add a constant $c$ to each $Z_l$ in equation 10.13 we get:
100 |
101 | \begin{align*}
102 | Pr(Y=m|X)
103 | &= \frac{e^{Z_m+c}}{\sum_{l=0}^9e^{Z_l+c}} \\
104 | &= \frac{e^{Z_m}e^c}{\sum_{l=0}^9e^{Z_l}e^c} \\
105 | &= \frac{e^{Z_m}e^c}{e^c\sum_{l=0}^9e^{Z_l}} \\
106 | &= \frac{e^{Z_m}}{\sum_{l=0}^9e^{Z_l}} \\
107 | \end{align*}
108 |
109 | which is just equation 10.13.
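
A quick numerical check of this invariance, using arbitrary scores $z$ and a
constant $c = 5$:

```{r}
softmax <- function(z) exp(z) / sum(exp(z))
z <- c(1.2, -0.3, 2.5)
all.equal(softmax(z), softmax(z + 5))
```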
110 |
111 | > b. In (4.13), show that if we add constants $c_j$, $j = 0,1,...,p$, to each of
112 | > the corresponding coefficients for each of the classes, then the predictions
113 | > at any new point $x$ are unchanged.
114 |
115 | 4.13 is
116 |
117 | $$
118 | Pr(Y=k|X=x) = \frac
119 | {e^{\beta_{K0} + \beta_{K1}x_1 + ... + \beta_{Kp}x_p}}
120 | {\sum_{l=1}^K e^{\beta_{l0} + \beta_{l1}x_1 + ... + \beta_{lp}x_p}}
121 | $$
122 |
123 | adding constants $c_j$ to the corresponding coefficients of each class gives:
124 |
125 | \begin{align*}
126 | Pr(Y=k|X=x)
127 | &= \frac
128 | {e^{(\beta_{K0} + c_0) + (\beta_{K1} + c_1)x_1 + ... + (\beta_{Kp} + c_p)x_p}}
129 | {\sum_{l=1}^K e^{(\beta_{l0} + c_0) + (\beta_{l1} + c_1)x_1 + ... + (\beta_{lp} + c_p)x_p}} \\
130 | &= \frac
131 | {e^{c_0 + c_1x_1 + ... + c_px_p}e^{\beta_{K0} + \beta_{K1}x_1 + ... + \beta_{Kp}x_p}}
132 | {\sum_{l=1}^K e^{c_0 + c_1x_1 + ... + c_px_p}e^{\beta_{l0} + \beta_{l1}x_1 + ... + \beta_{lp}x_p}} \\
133 | &= \frac
134 | {e^{c_0 + c_1x_1 + ... + c_px_p}e^{\beta_{K0} + \beta_{K1}x_1 + ... + \beta_{Kp}x_p}}
135 | {e^{c_0 + c_1x_1 + ... + c_px_p}\sum_{l=1}^K e^{\beta_{l0} + \beta_{l1}x_1 + ... + \beta_{lp}x_p}} \\
136 | &= \frac
137 | {e^{\beta_{K0} + \beta_{K1}x_1 + ... + \beta_{Kp}x_p}}
138 | {\sum_{l=1}^K e^{\beta_{l0} + \beta_{l1}x_1 + ... + \beta_{lp}x_p}} \\
139 | \end{align*}
140 |
141 | which collapses to 4.13 (with the same argument as above).
142 |
143 | > This shows that the softmax function is _over-parametrized_. However,
144 | > regularization and SGD typically constrain the solutions so that this is not a
145 | > problem.
146 |
147 | ### Question 3
148 |
149 | > Show that the negative multinomial log-likelihood (10.14) is equivalent to
150 | > the negative log of the likelihood expression (4.5) when there are $M = 2$
151 | > classes.
152 |
153 | Equation 10.14 is
154 |
155 | $$
156 | -\sum_{i=1}^n \sum_{m=0}^9 y_{im}\log(f_m(x_i))
157 | $$
158 |
159 | Equation 4.5 is:
160 |
161 | $$
162 | \ell(\beta_0, \beta_1) = \prod_{i:y_i=1}p(x_i) \prod_{i':y_i'=0}(1-p(x_i'))
163 | $$
164 |
165 | So, $\log(\ell)$ is:
166 |
167 | \begin{align*}
168 | \log(\ell)
169 | &= \log \left( \prod_{i:y_i=1}p(x_i) \prod_{i':y_i'=0}(1-p(x_i')) \right ) \\
170 | &= \sum_{i:y_i=1}\log(p(x_i)) + \sum_{i':y_i'=0}\log(1-p(x_i')) \\
171 | \end{align*}
172 |
173 | If we set $y_i$ to be an indicator variable such that $y_{i1}$ and $y_{i0}$ are
174 | 1 and 0 (or 0 and 1) when our $i$th observation is 1 (or 0) respectively, then
175 | we can write:
176 |
177 | $$
178 | \log(\ell) = \sum_{i}y_{i1}\log(p(x_i)) + \sum_{i}y_{i0}\log(1-p(x_i))
179 | $$
180 |
181 | If we also let $f_1(x) = p(x)$ and $f_0(x) = 1 - p(x)$ then:
182 |
183 | \begin{align*}
184 | \log(\ell)
185 | &= \sum_i y_{i1}\log(f_1(x_i)) + \sum_{i}y_{i0}\log(f_0(x_i)) \\
186 | &= \sum_i \sum_{m=0}^1 y_{im}\log(f_m(x_i)) \\
187 | \end{align*}
188 |
189 | When we take the negative of this, it is equivalent to 10.14 for two classes
190 | ($m = 0,1$).
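
A quick numerical check that the two expressions agree when $M = 2$ (the
probabilities and labels below are arbitrary):

```{r}
set.seed(1)
p <- runif(5) # fitted probabilities f_1(x_i) = p(x_i)
y <- rbinom(5, 1, p) # observed classes
# negative log of the likelihood in (4.5)
-sum(y * log(p) + (1 - y) * log(1 - p))
# negative multinomial log-likelihood (10.14) with m = 0, 1
-sum(cbind(1 - y, y) * log(cbind(1 - p, p)))
```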
191 |
192 | ### Question 4
193 |
194 | > Consider a CNN that takes in $32 \times 32$ grayscale images and has a single
195 | > convolution layer with three $5 \times 5$ convolution filters (without
196 | > boundary padding).
197 | >
198 | > a. Draw a sketch of the input and first hidden layer similar to Figure 10.8.
199 |
200 | ```{r, echo=FALSE, out.width="50%"}
201 | knitr::include_graphics("images/nn2.png")
202 | ```
203 |
204 | Note that, because there is no boundary padding, each of the three convolved
205 | feature maps will be a $28 \times 28$ array.
206 |
207 | > b. How many parameters are in this model?
208 |
209 | There are 3 convolution filters, each with $5 \times 5$ weights (plus 3 bias
210 | terms) to estimate, therefore $3 \times 5 \times 5 + 3 = 78$ parameters.
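
If the keras R package and a backend are available, this count can be checked
directly (chunk not evaluated here):

```{r, eval = FALSE}
library(keras)
m <- keras_model_sequential() |>
  layer_conv_2d(
    filters = 3, kernel_size = c(5, 5),
    input_shape = c(32, 32, 1) # 32x32 grayscale input
  )
m # reports (5 * 5 * 1 + 1) * 3 = 78 parameters
```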
211 |
212 | > c. Explain how this model can be thought of as an ordinary feed-forward
213 | > neural network with the individual pixels as inputs, and with constraints on
214 | > the weights in the hidden units. What are the constraints?
215 |
216 | We can think of a convolution layer as a regularized fully connected layer.
217 | The regularization in this case is due to not all inputs being connected to
218 | all outputs, and weights being shared between connections.
219 |
220 | Each output node in the convolved image can be thought of as taking inputs from
221 | a limited number of input pixels (the neighboring pixels), with a set of
222 | weights specified by the convolution layer which are then shared by the
223 | connections to all other output nodes.
224 |
225 | > d. If there were no constraints, then how many weights would there be in the
226 | > ordinary feed-forward neural network in (c)?
227 |
228 | With no constraints, we would connect every input pixel in our original
229 | 32 × 32 image to every output unit in each of the three feature maps, with a
230 | bias term for each output unit. So each output unit would require 32 × 32
231 | weights plus 1 bias term. This would give a total of
232 | (32 × 32 + 1) × 28 × 28 × 3 = 2,410,800 parameters.
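
As a check of the arithmetic:

```{r}
(32 * 32 + 1) * 28 * 28 * 3
```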
233 |
234 | ### Question 5
235 |
236 | > In Table 10.2 on page 433, we see that the ordering of the three methods with
237 | > respect to mean absolute error is different from the ordering with respect to
238 | > test set $R^2$. How can this be?
239 |
240 | Mean absolute error considers _absolute_ differences between predictions and
241 | observed values, whereas $R^2$ considers the (normalized) sum of _squared_
242 | differences. Larger errors therefore contribute relatively more to $R^2$ than
243 | to mean absolute error, so the two criteria can rank models differently.
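
A small illustration with made-up predictions: model A has the lower mean
absolute error, while model B has the higher $R^2$, because the squared-error
criterion penalizes A's single large error more heavily.

```{r}
y <- c(0, 0, 0, 0, 10)
a <- c(1, 1, 1, 1, 6) # several small errors plus one large error
b <- c(2, 2, 2, 2, 9) # larger typical errors, but no large error
r2 <- function(pred) 1 - sum((y - pred)^2) / sum((y - mean(y))^2)
c(MAE_A = mean(abs(y - a)), MAE_B = mean(abs(y - b)))
c(R2_A = r2(a), R2_B = r2(b))
```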
244 |
245 | ## Applied
246 |
247 | ### Question 6
248 |
249 | > Consider the simple function $R(\beta) = sin(\beta) + \beta/10$.
250 | >
251 | > a. Draw a graph of this function over the range $\beta \in [-6, 6]$.
252 |
253 | ```{r}
254 | r <- function(x) sin(x) + x / 10
255 | x <- seq(-6, 6, 0.1)
256 | plot(x, r(x), type = "l")
257 | ```
258 |
259 | > b. What is the derivative of this function?
260 |
261 | $$
262 | \cos(x) + 1/10
263 | $$
264 |
265 | > c. Given $\beta^0 = 2.3$, run gradient descent to find a local minimum of
266 | > $R(\beta)$ using a learning rate of $\rho = 0.1$. Show each of
267 | > $\beta^0, \beta^1, ...$ in your plot, as well as the final answer.
268 |
269 | The derivative of our function, i.e. $\cos(x) + 1/10$, gives us the gradient at
270 | a given $x$. For gradient descent, we move $x$ a little in the _opposite_
271 | direction to the gradient, with learning rate $\rho = 0.1$:
272 |
273 | $$
274 | x^{m+1} = x^m - \rho (\cos(x^m) + 1/10)
275 | $$
276 |
277 | ```{r}
278 | iter <- function(x, rho) x - rho * (cos(x) + 1 / 10)
279 | gd <- function(start, rho = 0.1) {
280 | b <- start
281 | v <- b
282 |   while (abs(b - iter(b, rho)) > 1e-8) {
283 |     b <- iter(b, rho)
284 | v <- c(v, b)
285 | }
286 | v
287 | }
288 |
289 | res <- gd(2.3)
290 | res[length(res)]
291 | ```
292 |
293 | ```{r}
294 | plot(x, r(x), type = "l")
295 | points(res, r(res), col = "red", pch = 19)
296 | ```
297 |
298 |
299 | > d. Repeat with $\beta^0 = 1.4$.
300 |
301 | ```{r}
302 | res <- gd(1.4)
303 | res[length(res)]
304 | ```
305 |
306 | ```{r}
307 | plot(x, r(x), type = "l")
308 | points(res, r(res), col = "red", pch = 19)
309 | ```
310 |
311 | ### Question 7
312 |
313 | > Fit a neural network to the `Default` data. Use a single hidden layer with 10
314 | > units, and dropout regularization. Have a look at Labs 10.9.1--10.9.2 for
315 | > guidance. Compare the classification performance of your model with that of
316 | > linear logistic regression.
317 |
318 | ```{r, cache = TRUE}
319 | library(keras)
320 |
321 | dat <- ISLR2::Boston
322 | x <- scale(model.matrix(crim ~ . - 1, data = dat))
323 | n <- nrow(dat)
324 | ntest <- trunc(n / 3)
325 | testid <- sample(1:n, ntest)
326 | y <- dat$crim
327 |
328 | # linear regression (for comparison)
329 | lfit <- lm(crim ~ ., data = dat[-testid, ])
330 | lpred <- predict(lfit, dat[testid, ])
331 | with(dat[testid, ], mean(abs(lpred - crim)))
332 |
333 | # keras
334 | nn <- keras_model_sequential() |>
335 | layer_dense(units = 10, activation = "relu", input_shape = ncol(x)) |>
336 | layer_dropout(rate = 0.4) |>
337 | layer_dense(units = 1)
338 |
339 | compile(nn,
340 | loss = "mse",
341 | optimizer = optimizer_rmsprop(),
342 | metrics = list("mean_absolute_error")
343 | )
344 |
345 | history <- fit(nn,
346 | x[-testid, ], y[-testid],
347 | epochs = 100,
348 | batch_size = 26,
349 | validation_data = list(x[testid, ], y[testid]),
350 | verbose = 0
351 | )
352 | plot(history, smooth = FALSE)
353 | npred <- predict(nn, x[testid, ])
354 | mean(abs(y[testid] - npred))
355 | ```
356 |
357 | In this case, the neural network outperforms the linear model, having a lower
358 | mean absolute error on the test data.
359 |
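For context, we can also compute the test error of a null model that always predicts
the mean of the training response (a rough baseline, reusing the objects defined in
the chunk above):

```{r}
mean(abs(y[testid] - mean(y[-testid])))
```
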
360 | ### Question 8
361 |
362 | > From your collection of personal photographs, pick 10 images of animals (such
363 | > as dogs, cats, birds, farm animals, etc.). If the subject does not occupy a
364 | > reasonable part of the image, then crop the image. Now use a pretrained image
365 | > classification CNN as in Lab 10.9.4 to predict the class of each of your
366 | > images, and report the probabilities for the top five predicted classes for
367 | > each image.
368 |
369 | ```{r, echo=FALSE}
370 | knitr::include_graphics(c(
371 | "images/animals/bird.jpg",
372 | "images/animals/bird2.jpg",
373 | "images/animals/bird3.jpg",
374 | "images/animals/bug.jpg",
375 | "images/animals/butterfly.jpg",
376 | "images/animals/butterfly2.jpg",
377 | "images/animals/elba.jpg",
378 | "images/animals/hamish.jpg",
379 | "images/animals/poodle.jpg",
380 | "images/animals/tortoise.jpg"
381 | ))
382 | ```
383 |
384 | ```{r}
385 | library(keras)
386 | images <- list.files("images/animals")
387 | x <- array(dim = c(length(images), 224, 224, 3))
388 | for (i in seq_len(length(images))) {
389 | img <- image_load(paste0("images/animals/", images[i]), target_size = c(224, 224))
390 | x[i, , , ] <- image_to_array(img)
391 | }
392 |
393 | model <- application_resnet50(weights = "imagenet")
394 |
395 | pred <- model |>
396 | predict(x) |>
397 | imagenet_decode_predictions(top = 5)
398 |
399 | names(pred) <- images
400 | print(pred)
401 | ```
402 |
403 | ### Question 9
404 |
405 | > Fit a lag-5 autoregressive model to the `NYSE` data, as described in the text
406 | > and Lab 10.9.6. Refit the model with a 12-level factor representing the
407 | > month. Does this factor improve the performance of the model?
408 |
409 | Fitting the model as described in the text.
410 |
411 | ```{r}
412 | library(tidyverse)
413 | library(ISLR2)
414 | xdata <- data.matrix(NYSE[, c("DJ_return", "log_volume", "log_volatility")])
415 | istrain <- NYSE[, "train"]
416 | xdata <- scale(xdata)
417 |
418 | lagm <- function(x, k = 1) {
419 | n <- nrow(x)
420 | pad <- matrix(NA, k, ncol(x))
421 | rbind(pad, x[1:(n - k), ])
422 | }
423 |
424 | arframe <- data.frame(
425 | log_volume = xdata[, "log_volume"],
426 | L1 = lagm(xdata, 1),
427 | L2 = lagm(xdata, 2),
428 | L3 = lagm(xdata, 3),
429 | L4 = lagm(xdata, 4),
430 | L5 = lagm(xdata, 5)
431 | )
432 |
433 | arframe <- arframe[-(1:5), ]
434 | istrain <- istrain[-(1:5)]
435 |
436 | arfit <- lm(log_volume ~ ., data = arframe[istrain, ])
437 | arpred <- predict(arfit, arframe[!istrain, ])
438 | V0 <- var(arframe[!istrain, "log_volume"])
439 | 1 - mean((arpred - arframe[!istrain, "log_volume"])^2) / V0
440 | ```
441 |
442 | Now we add month (and work with tidyverse).
443 |
444 | ```{r}
445 | arframe$month <- as.factor(str_match(NYSE$date, "-(\\d+)-")[, 2])[-(1:5)]
446 | arfit2 <- lm(log_volume ~ ., data = arframe[istrain, ])
447 | arpred2 <- predict(arfit2, arframe[!istrain, ])
448 | V0 <- var(arframe[!istrain, "log_volume"])
449 | 1 - mean((arpred2 - arframe[!istrain, "log_volume"])^2) / V0
450 | ```
451 |
452 | Adding month as a factor marginally improves the test $R^2$ of our model (from
453 | 0.413223 to 0.4170418). An ANOVA comparison indicates that this is a significant
454 | improvement in fit, and model 2 also has a lower AIC.
455 |
456 | ```{r}
457 | anova(arfit, arfit2)
458 | AIC(arfit, arfit2)
459 | ```
460 |
461 | ### Question 10
462 |
463 | > In Section 10.9.6, we showed how to fit a linear AR model to the `NYSE` data
464 | > using the `lm()` function. However, we also mentioned that we can "flatten"
465 | > the short sequences produced for the RNN model in order to fit a linear AR
466 | > model. Use this latter approach to fit a linear AR model to the NYSE data.
467 | > Compare the test $R^2$ of this linear AR model to that of the linear AR model
468 | > that we fit in the lab. What are the advantages/disadvantages of each
469 | > approach?
470 |
471 | The `lm` model is the same as that fit above:
472 |
473 | ```{r}
474 | arfit <- lm(log_volume ~ ., data = arframe[istrain, ])
475 | arpred <- predict(arfit, arframe[!istrain, ])
476 | V0 <- var(arframe[!istrain, "log_volume"])
477 | 1 - mean((arpred - arframe[!istrain, "log_volume"])^2) / V0
478 | ```
479 |
480 | Now we reshape the data for the RNN:
481 |
482 | ```{r}
483 | n <- nrow(arframe)
484 | xrnn <- data.matrix(arframe[, -1])
485 | xrnn <- array(xrnn, c(n, 3, 5))
486 | xrnn <- xrnn[, , 5:1]
487 | xrnn <- aperm(xrnn, c(1, 3, 2))
488 | ```
489 |
490 | We can add a "flatten" layer to turn the reshaped data into a long vector of
491 | predictors, resulting in a linear AR model:
492 |
493 | ```{r}
494 | model <- keras_model_sequential() |>
495 | layer_flatten(input_shape = c(5, 3)) |>
496 | layer_dense(units = 1)
497 | ```
498 |
499 | Now let's fit this model.
500 |
501 | ```{r}
502 | model |>
503 | compile(optimizer = optimizer_rmsprop(), loss = "mse")
504 |
505 | history <- model |>
506 | fit(
507 | xrnn[istrain, , ],
508 | arframe[istrain, "log_volume"],
509 | batch_size = 64,
510 | epochs = 200,
511 | validation_data = list(xrnn[!istrain, , ], arframe[!istrain, "log_volume"]),
512 | verbose = 0
513 | )
514 |
515 | plot(history, smooth = FALSE)
516 | kpred <- predict(model, xrnn[!istrain, , ])
517 | 1 - mean((kpred - arframe[!istrain, "log_volume"])^2) / V0
518 | ```
519 |
520 | Both models estimate the same number of coefficients/weights (16):
521 |
522 | ```{r}
523 | coef(arfit)
524 | model$get_weights()
525 | ```
526 |
527 | The flattened RNN has a lower $R^2$ on the test data than our `lm` model
528 | above. The `lm` model is quicker to fit and conceptually simpler, and it also
529 | lets us directly inspect the coefficients for the different variables.
530 | 
531 | The flattened RNN is, however, regularized to some extent because the data are
532 | processed in mini-batches.
533 |
534 | ### Question 11
535 |
536 | > Repeat the previous exercise, but now fit a nonlinear AR model by "flattening"
537 | > the short sequences produced for the RNN model.
538 |
539 | From the book:
540 |
541 | > To fit a nonlinear AR model, we could add in a hidden layer.
542 |
543 | ```{r, c10q11}
544 | xfun::cache_rds({
545 | model <- keras_model_sequential() |>
546 | layer_flatten(input_shape = c(5, 3)) |>
547 | layer_dense(units = 32, activation = "relu") |>
548 | layer_dropout(rate = 0.4) |>
549 | layer_dense(units = 1)
550 |
551 | model |> compile(
552 | loss = "mse",
553 | optimizer = optimizer_rmsprop(),
554 | metrics = "mse"
555 | )
556 |
557 | history <- model |>
558 | fit(
559 | xrnn[istrain, , ],
560 | arframe[istrain, "log_volume"],
561 | batch_size = 64,
562 | epochs = 200,
563 | validation_data = list(xrnn[!istrain, , ], arframe[!istrain, "log_volume"]),
564 | verbose = 0
565 | )
566 |
567 | plot(history, smooth = FALSE, metrics = "mse")
568 | kpred <- predict(model, xrnn[!istrain, , ])
569 | 1 - mean((kpred - arframe[!istrain, "log_volume"])^2) / V0
570 | })
571 | ```
572 |
573 | This approach improves our $R^2$ over the linear model above.
574 |
575 | ### Question 12
576 |
577 | > Consider the RNN fit to the `NYSE` data in Section 10.9.6. Modify the code to
578 | > allow inclusion of the variable `day_of_week`, and fit the RNN. Compute the
579 | > test $R^2$.
580 |
581 | To accomplish this, I'll include day of the week as one of the lagged variables
582 | in the RNN. Thus, the input for each observation will be 5 lags of 4 variables
583 | (a 5 x 4 array, rather than 5 x 3).
584 |
585 | ```{r, c10q12}
586 | xfun::cache_rds({
587 | xdata <- data.matrix(
588 | NYSE[, c("day_of_week", "DJ_return", "log_volume", "log_volatility")]
589 | )
590 | istrain <- NYSE[, "train"]
591 | xdata <- scale(xdata)
592 |
593 | arframe <- data.frame(
594 | log_volume = xdata[, "log_volume"],
595 | L1 = lagm(xdata, 1),
596 | L2 = lagm(xdata, 2),
597 | L3 = lagm(xdata, 3),
598 | L4 = lagm(xdata, 4),
599 | L5 = lagm(xdata, 5)
600 | )
601 | arframe <- arframe[-(1:5), ]
602 | istrain <- istrain[-(1:5)]
603 |
604 | n <- nrow(arframe)
605 | xrnn <- data.matrix(arframe[, -1])
606 | xrnn <- array(xrnn, c(n, 4, 5))
607 | xrnn <- xrnn[, , 5:1]
608 | xrnn <- aperm(xrnn, c(1, 3, 2))
609 | dim(xrnn)
610 |
611 | model <- keras_model_sequential() |>
612 | layer_simple_rnn(
613 | units = 12,
614 | input_shape = list(5, 4),
615 | dropout = 0.1,
616 | recurrent_dropout = 0.1
617 | ) |>
618 | layer_dense(units = 1)
619 |
620 | model |> compile(optimizer = optimizer_rmsprop(), loss = "mse")
621 |
622 | history <- model |>
623 | fit(
624 | xrnn[istrain, , ],
625 | arframe[istrain, "log_volume"],
626 | batch_size = 64,
627 | epochs = 200,
628 | validation_data = list(xrnn[!istrain, , ], arframe[!istrain, "log_volume"]),
629 | verbose = 0
630 | )
631 |
632 | kpred <- predict(model, xrnn[!istrain, , ])
633 | 1 - mean((kpred - arframe[!istrain, "log_volume"])^2) / V0
634 | })
635 | ```
636 |
637 | ### Question 13
638 |
639 | > Repeat the analysis of Lab 10.9.5 on the `IMDb` data using a similarly
640 | > structured neural network. There we used a dictionary of size 10,000. Consider
641 | > the effects of varying the dictionary size. Try the values 1000, 3000, 5000,
642 | > and 10,000, and compare the results.
643 |
644 | ```{r, c10q13}
645 | xfun::cache_rds({
646 | library(knitr)
647 | accuracy <- c()
648 | for (max_features in c(1000, 3000, 5000, 10000)) {
649 | imdb <- dataset_imdb(num_words = max_features)
650 | c(c(x_train, y_train), c(x_test, y_test)) %<-% imdb
651 |
652 | maxlen <- 500
653 | x_train <- pad_sequences(x_train, maxlen = maxlen)
654 | x_test <- pad_sequences(x_test, maxlen = maxlen)
655 |
656 | model <- keras_model_sequential() |>
657 | layer_embedding(input_dim = max_features, output_dim = 32) |>
658 | layer_lstm(units = 32) |>
659 | layer_dense(units = 1, activation = "sigmoid")
660 |
661 | model |> compile(
662 | optimizer = "rmsprop",
663 | loss = "binary_crossentropy",
664 | metrics = "acc"
665 | )
666 |
667 | history <- fit(model, x_train, y_train,
668 | epochs = 10,
669 | batch_size = 128,
670 | validation_data = list(x_test, y_test),
671 | verbose = 0
672 | )
673 |
674 | predy <- predict(model, x_test) > 0.5
675 | accuracy <- c(accuracy, mean(abs(y_test == as.numeric(predy))))
676 | }
677 |
678 | tibble(
679 | "Max Features" = c(1000, 3000, 5000, 10000),
680 | "Accuracy" = accuracy
681 | ) |>
682 | kable()
683 | })
684 | ```
685 |
686 | Varying the dictionary size does not have a substantial impact on our estimates
687 | of accuracy. However, the models take a substantial amount of time to fit, and
688 | it is not clear we are finding the best-fitting model in each case. For
689 | example, the model using a dictionary size of 10,000 obtained an accuracy of
690 | 0.8721 in the text, which differs from the estimate obtained here by about as
691 | much as the estimates for the different dictionary sizes differ from each other.
692 |
--------------------------------------------------------------------------------
/07-moving-beyond-linearity.Rmd:
--------------------------------------------------------------------------------
1 | # Moving Beyond Linearity
2 |
3 | ## Conceptual
4 |
5 | ### Question 1
6 |
7 | > It was mentioned in the chapter that a cubic regression spline with one knot
8 | > at $\xi$ can be obtained using a basis of the form $x, x^2, x^3, (x-\xi)^3_+$,
9 | > where $(x-\xi)^3_+ = (x-\xi)^3$ if $x>\xi$ and equals 0 otherwise. We will now
10 | > show that a function of the form
11 | > $$
12 | > f(x)=\beta_0 +\beta_1x+\beta_2x^2 +\beta_3x^3 +\beta_4(x-\xi)^3_+
13 | > $$
14 | > is indeed a cubic regression spline, regardless of the values of
15 | > $\beta_0, \beta_1, \beta_2, \beta_3,\beta_4$.
16 | >
17 | > a. Find a cubic polynomial
18 | > $$
19 | > f_1(x) = a_1 + b_1x + c_1x^2 + d_1x^3
20 | > $$
21 | > such that $f(x) = f_1(x)$ for all $x \le \xi$. Express $a_1,b_1,c_1,d_1$ in
22 | > terms of $\beta_0, \beta_1, \beta_2, \beta_3, \beta_4$.
23 |
24 | In this case, for $x \le \xi$, the cubic polynomial simply has terms
25 | $a_1 = \beta_0$, $b_1 = \beta_1$, $c_1 = \beta_2$, $d_1 = \beta_3$
26 |
27 | > b. Find a cubic polynomial
28 | > $$
29 | > f_2(x) = a_2 + b_2x + c_2x^2 + d_2x^3
30 | > $$
31 | > such that $f(x) = f_2(x)$ for all $x > \xi$. Express $a_2, b_2, c_2, d_2$ in
32 | > terms of $\beta_0, \beta_1, \beta_2, \beta_3, \beta_4$. We have now established
33 | > that $f(x)$ is a piecewise polynomial.
34 |
35 | For $x \gt \xi$, the cubic polynomial would be (we now include the $\beta_4$ term):
36 | \begin{align}
37 | f(x) = & \beta_0 + \beta_1x + \beta_2x^2 + \beta_3x^3 + \beta_4(x-\xi)^3 \\
38 | = & \beta_0 + \beta_1x + \beta_2x^2 + \beta_3x^3 + \beta_4(x^3 - 3x^2\xi + 3x\xi^2 -\xi^3) \\
39 | = & \beta_0 - \beta_4\xi^3 + (\beta_1 + 3\beta_4\xi^2)x +
40 |     (\beta_2 - 3\beta_4\xi)x^2 + (\beta_3 + \beta_4)x^3
41 | \end{align}
42 | 
43 | Therefore,
44 | $a_2 = \beta_0 - \beta_4\xi^3$, $b_2 = \beta_1 + 3\beta_4\xi^2$,
45 | $c_2 = \beta_2 - 3\beta_4\xi$, $d_2 = \beta_3 + \beta_4$.
46 |
47 | > c. Show that $f_1(\xi) = f_2(\xi)$. That is, $f(x)$ is continuous at $\xi$.
48 |
49 | To do this, we replace $x$ with $\xi$ in the above equations and simplify.
50 |
51 | \begin{align}
52 | f_1(\xi) = \beta_0 + \beta_1\xi + \beta_2\xi^2 + \beta_3\xi^3
53 | \end{align}
54 |
55 | \begin{align}
56 | f_2(\xi) = & \beta_0 - \beta_4\xi^3 + (\beta_1 + 3\beta_4\xi^2)\xi +
57 | (\beta_2 - 3\beta_4\xi)\xi^2 + (\beta_3 + \beta_4)\xi^3 \\
58 | = & \beta_0 - \beta_4\xi^3 + \beta_1\xi + 3\beta_4\xi^3 +
59 | \beta_2\xi^2 - 3\beta_4\xi^3 + \beta_3\xi^3 + \beta_4\xi^3 \\
60 | = & \beta_0 + \beta_1\xi + \beta_2\xi^2 + \beta_3\xi^3
61 | \end{align}
62 |
63 | > d. Show that $f_1'(\xi) = f_2'(\xi)$. That is, $f'(x)$ is continuous at $\xi$.
64 |
65 | To do this we differentiate the above with respect to $x$.
66 |
67 | $$
68 | f_1'(x) = \beta_1 + 2\beta_2x + 3\beta_3x^2 \\
69 | f_1'(\xi) = \beta_1 + 2\beta_2\xi + 3\beta_3\xi^2
70 | $$
71 |
72 | \begin{align}
73 | f_2'(x) & = \beta_1 + 3\beta_4\xi^2 + 2(\beta_2 - 3\beta_4\xi)x + 3(\beta_3 + \beta_4)x^2 \\
74 | f_2'(\xi) & = \beta_1 + 3\beta_4\xi^2 + 2(\beta_2 - 3\beta_4\xi)\xi + 3(\beta_3 + \beta_4)\xi^2 \\
75 | & = \beta_1 + 3\beta_4\xi^2 + 2\beta_2\xi - 6\beta_4\xi^2 + 3\beta_3\xi^2 + 3\beta_4\xi^2 \\
76 | & = \beta_1 + 2\beta_2\xi + 3\beta_3\xi^2
77 | \end{align}
78 |
79 | > e. Show that $f_1''(\xi) = f_2''(\xi)$. That is, $f''(x)$ is continuous at $\xi$.
80 | >
81 | > Therefore, $f(x)$ is indeed a cubic spline.
82 |
83 | $$
84 | f_1''(x) = 2\beta_2 + 6\beta_3x \\
85 | f_1''(\xi) = 2\beta_2 + 6\beta_3\xi
86 | $$
87 |
88 | $$
89 | f_2''(x) = 2\beta_2 - 6\beta_4\xi + 6(\beta_3 + \beta_4)x \\
90 | $$
91 | \begin{align}
92 | f_2''(\xi) & = 2\beta_2 - 6\beta_4\xi + 6\beta_3\xi + 6\beta_4\xi \\
93 | & = 2\beta_2 + 6\beta_3\xi
94 | \end{align}
95 |
96 | > _Hint: Parts (d) and (e) of this problem require knowledge of single-variable
97 | > calculus. As a reminder, given a cubic polynomial_
98 | > $$f_1(x) = a_1 + b_1x + c_1x^2 + d_1x^3,$$
99 | > _the first derivative takes the form_
100 | > $$f_1'(x) = b_1 + 2c_1x + 3d_1x^2$$
101 | > _and the second derivative takes the form_
102 | > $$f_1''(x) = 2c_1 + 6d_1x.$$
103 |
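As a quick numerical check on parts (c)--(e) (a minimal sketch with arbitrary,
made-up coefficients), we can verify that $f_1$ and $f_2$ and their first two
derivatives agree at $\xi$:

```{r}
b <- c(1, 2, -1, 0.5, 3) # made-up beta_0, ..., beta_4
xi <- 1.5
f1 <- function(x) b[1] + b[2] * x + b[3] * x^2 + b[4] * x^3
f2 <- function(x) {
  (b[1] - b[5] * xi^3) + (b[2] + 3 * b[5] * xi^2) * x +
    (b[3] - 3 * b[5] * xi) * x^2 + (b[4] + b[5]) * x^3
}
h <- 1e-4
# differences in value, first (central difference) and second (central difference)
# derivatives at xi; all should be approximately zero
c(
  f1(xi) - f2(xi),
  (f1(xi + h) - f1(xi - h)) / (2 * h) - (f2(xi + h) - f2(xi - h)) / (2 * h),
  (f1(xi + h) - 2 * f1(xi) + f1(xi - h)) / h^2 -
    (f2(xi + h) - 2 * f2(xi) + f2(xi - h)) / h^2
)
```
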
104 | ### Question 2
105 |
106 | > Suppose that a curve $\hat{g}$ is computed to smoothly fit a set of $n$ points
107 | > using the following formula:
108 | > $$
109 | > \DeclareMathOperator*{\argmin}{arg\,min} % Jan Hlavacek
110 | > \hat{g} = \argmin_g \left(\sum_{i=1}^n (y_i - g(x_i))^2 + \lambda \int \left[ g^{(m)}(x) \right]^2 dx \right),
111 | > $$
112 | > where $g^{(m)}$ represents the $m$th derivative of $g$ (and $g^{(0)} = g$).
113 | > Provide example sketches of $\hat{g}$ in each of the following scenarios.
114 | >
115 | > a. $\lambda=\infty, m=0$.
116 |
117 | Here we penalize $g$ itself and an infinite $\lambda$ means that this penalty
118 | dominates. This means that $\hat{g}$ will be 0.
119 |
120 | > b. $\lambda=\infty, m=1$.
121 |
122 | Here we penalize the first derivative (the slope) of $g$ and an infinite
123 | $\lambda$ means that this penalty dominates. Thus the slope will be 0 and the
124 | fit will be a horizontal line at the best-fitting constant, i.e. the mean of $y$.
125 |
126 | > c. $\lambda=\infty, m=2$.
127 |
128 | Here we penalize the second derivative (the change of slope) of $g$ and an
129 | infinite $\lambda$ means that this penalty dominates. Thus the fit will be a
130 | straight line: the least squares linear fit to the data.
131 |
132 | > d. $\lambda=\infty, m=3$.
133 |
134 | Here we penalize the third derivative (the rate of change of curvature) of $g$
135 | and an infinite $\lambda$ means that this penalty dominates. The third
136 | derivative is forced to 0, so the fit will be a quadratic function (its second
137 | derivative is constant): the least squares quadratic fit to the data.
138 |
139 | > e. $\lambda=0, m=3$.
140 |
141 | Here we penalize the third derivative, but a value of $\lambda = 0$ means that
142 | there is no penalty. As a result, the curve is able to interpolate all points.
143 |
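These limiting cases can be sketched on some made-up data (the data and colours here
are arbitrary): the least squares constant, linear and quadratic fits correspond to
(b), (c) and (d), while an interpolating spline illustrates (e).

```{r}
set.seed(42)
x <- sort(runif(30))
y <- sin(4 * x) + rnorm(30, sd = 0.2)
xs <- seq(0, 1, length.out = 200)

plot(x, y, pch = 19, col = "steelblue")
abline(h = 0, lty = 2)                         # (a) lambda = Inf, m = 0: g = 0
abline(h = mean(y), col = "red", lty = 2)      # (b) m = 1: constant fit (mean of y)
abline(lm(y ~ x), col = "darkgreen", lty = 2)  # (c) m = 2: least squares line
lines(xs, predict(lm(y ~ poly(x, 2)), data.frame(x = xs)),
      col = "purple", lty = 2)                 # (d) m = 3: least squares quadratic
lines(spline(x, y, n = 200), col = "orange")   # (e) lambda = 0: interpolates the points
```
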
144 | ### Question 3
145 |
146 | > Suppose we fit a curve with basis functions
147 | > $b_1(X) = X$,
148 | > $b_2(X) = (X - 1)^2I(X \geq 1)$.
149 | > (Note that $I(X \geq 1)$ equals 1 for $X \geq 1$ and 0 otherwise.) We fit the
150 | > linear regression model
151 | > $$Y = \beta_0 +\beta_1b_1(X) + \beta_2b_2(X) + \epsilon,$$
152 | > and obtain coefficient estimates
153 | > $\hat{\beta}_0 = 1, \hat{\beta}_1 = 1, \hat{\beta}_2 = -2$.
154 | > Sketch the estimated curve between $X = -2$ and $X = 2$. Note the intercepts,
155 | > slopes, and other relevant information.
156 |
157 | ```{r}
158 | x <- seq(-2, 2, length.out = 1000)
159 | f <- function(x) 1 + x + -2 * (x - 1)^2 * I(x >= 1)
160 | plot(x, f(x), type = "l")
161 | grid()
162 | ```
163 |
164 | ### Question 4
165 |
166 | > Suppose we fit a curve with basis functions
167 | > $b_1(X) = I(0 \leq X \leq 2) - (X -1)I(1 \leq X \leq 2),$
168 | > $b_2(X) = (X -3)I(3 \leq X \leq 4) + I(4 \lt X \leq 5)$.
169 | > We fit the linear regression model
170 | > $$Y = \beta_0 +\beta_1b_1(X) + \beta_2b_2(X) + \epsilon,$$
171 | > and obtain coefficient estimates
172 | > $\hat{\beta}_0 = 1, \hat{\beta}_1 = 1, \hat{\beta}_2 = 3$.
173 | > Sketch the estimated curve between $X = -2$ and $X = 6$. Note the intercepts,
174 | > slopes, and other relevant information.
175 |
176 | ```{r}
177 | x <- seq(-2, 6, length.out = 1000)
178 | b1 <- function(x) I(0 <= x & x <= 2) - (x - 1) * I(1 <= x & x <= 2)
179 | b2 <- function(x) (x - 3) * I(3 <= x & x <= 4) + I(4 < x & x <= 5)
180 | f <- function(x) 1 + 1 * b1(x) + 3 * b2(x)
181 | plot(x, f(x), type = "l")
182 | grid()
183 | ```
184 |
185 | ### Question 5
186 |
187 | > Consider two curves, $\hat{g}_1$ and $\hat{g}_2$, defined by
188 | >
189 | > $$
190 | > \hat{g}_1 = \argmin_g \left(\sum_{i=1}^n (y_i - g(x_i))^2 +
191 | > \lambda \int \left[ g^{(3)}(x) \right]^2 dx \right),
192 | > $$
193 | > $$
194 | > \hat{g}_2 = \argmin_g \left(\sum_{i=1}^n (y_i - g(x_i))^2 +
195 | > \lambda \int \left[ g^{(4)}(x) \right]^2 dx \right),
196 | > $$
197 | >
198 | > where $g^{(m)}$ represents the $m$th derivative of $g$.
199 | >
200 | > a. As $\lambda \to \infty$, will $\hat{g}_1$ or $\hat{g}_2$ have the smaller
201 | > training RSS?
202 |
203 | As $\lambda \to \infty$, $\hat{g}_1$ is forced towards $g^{(3)} = 0$ (a quadratic), whereas
204 | $\hat{g}_2$ is forced towards $g^{(4)} = 0$ (a cubic). Being more flexible, $\hat{g}_2$ will have the smaller training RSS.
205 |
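As a rough illustration on made-up data (the data here are arbitrary): a cubic fit
always achieves a training RSS at least as small as a quadratic fit, because the
quadratic is a special case of the cubic.

```{r}
set.seed(42)
x <- runif(50)
y <- sin(3 * x) + rnorm(50, sd = 0.3)
c(
  quadratic = sum(resid(lm(y ~ poly(x, 2)))^2), # cf. g_1 as lambda -> infinity
  cubic     = sum(resid(lm(y ~ poly(x, 3)))^2)  # cf. g_2 as lambda -> infinity
)
```
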
206 | > b. As $\lambda \to \infty$, will $\hat{g}_1$ or $\hat{g}_2$ have the smaller
207 | > test RSS?
208 |
209 | We cannot tell which function will produce the smaller test RSS, but there is a
210 | chance that $\hat{g}_1$ will if $\hat{g}_2$ overfits the data.
211 |
212 | > c. For $\lambda = 0$, will $\hat{g}_1$ or $\hat{g}_2$ have the smaller training
213 | > and test RSS?
214 |
215 | When $\lambda = 0$ there is no penalty, so both functions will give the same
216 | result: perfect interpolation of the training data. Thus training RSS will be
217 | 0 but test RSS could be high.
218 |
219 | ## Applied
220 |
221 | ### Question 6
222 |
223 | > In this exercise, you will further analyze the `Wage` data set considered
224 | > throughout this chapter.
225 | >
226 | > a. Perform polynomial regression to predict `wage` using `age`. Use
227 | > cross-validation to select the optimal degree $d$ for the polynomial. What
228 | > degree was chosen, and how does this compare to the results of hypothesis
229 | > testing using ANOVA? Make a plot of the resulting polynomial fit to the data.
230 |
231 | ```{r}
232 | library(ISLR2)
233 | library(boot)
234 | library(ggplot2)
235 | set.seed(42)
236 | res <- sapply(1:6, function(i) {
237 | fit <- glm(wage ~ poly(age, i), data = Wage)
238 | cv.glm(Wage, fit, K = 5)$delta[1]
239 | })
240 | which.min(res)
241 | plot(1:6, res, xlab = "Degree", ylab = "Test MSE", type = "l")
242 | abline(v = which.min(res), col = "red", lty = 2)
243 |
244 | fit <- glm(wage ~ poly(age, which.min(res)), data = Wage)
245 | plot(Wage$age, Wage$wage, pch = 19, cex = 0.4, col = alpha("steelblue", 0.4))
246 | points(1:100, predict(fit, data.frame(age = 1:100)), type = "l", col = "red")
247 |
248 | summary(glm(wage ~ poly(age, 6), data = Wage))
249 |
250 | fit1 <- lm(wage ~ poly(age, 1), data = Wage)
251 | fit2 <- lm(wage ~ poly(age, 2), data = Wage)
252 | fit3 <- lm(wage ~ poly(age, 3), data = Wage)
253 | fit4 <- lm(wage ~ poly(age, 4), data = Wage)
254 | fit5 <- lm(wage ~ poly(age, 5), data = Wage)
255 | anova(fit1, fit2, fit3, fit4, fit5)
256 | ```
257 |
258 | The selected degree is 4. When testing with ANOVA, degrees 1, 2 and 3 are highly
259 | significant and 4 is marginal.
260 |
261 | > b. Fit a step function to predict `wage` using `age`, and perform
262 | > cross-validation to choose the optimal number of cuts. Make a plot of the fit
263 | > obtained.
264 |
265 | ```{r}
266 | set.seed(42)
267 | res <- sapply(2:10, function(i) {
268 | Wage$cats <- cut(Wage$age, i)
269 | fit <- glm(wage ~ cats, data = Wage)
270 | cv.glm(Wage, fit, K = 5)$delta[1]
271 | })
272 | names(res) <- 2:10
273 | plot(2:10, res, xlab = "Cuts", ylab = "Test MSE", type = "l")
274 | which.min(res)
275 | abline(v = names(which.min(res)), col = "red", lty = 2)
276 |
277 | fit <- glm(wage ~ cut(age, 8), data = Wage)
278 | plot(Wage$age, Wage$wage, pch = 19, cex = 0.4, col = alpha("steelblue", 0.4))
279 | points(18:80, predict(fit, data.frame(age = 18:80)), type = "l", col = "red")
280 | ```
281 |
282 | ### Question 7
283 |
284 | > The `Wage` data set contains a number of other features not explored in this
285 | > chapter, such as marital status (`maritl`), job class (`jobclass`), and others.
286 | > Explore the relationships between some of these other predictors and `wage`, and
287 | > use non-linear fitting techniques in order to fit flexible models to the data.
288 | > Create plots of the results obtained, and write a summary of your findings.
289 |
290 | ```{r}
291 | plot(Wage$year, Wage$wage, pch = 19, cex = 0.4, col = alpha("steelblue", 0.4))
292 | plot(Wage$age, Wage$wage, pch = 19, cex = 0.4, col = alpha("steelblue", 0.4))
293 | plot(Wage$maritl, Wage$wage, pch = 19, cex = 0.4, col = alpha("steelblue", 0.4))
294 | plot(Wage$jobclass, Wage$wage, pch = 19, cex = 0.4, col = alpha("steelblue", 0.4))
295 | plot(Wage$education, Wage$wage, pch = 19, cex = 0.4, col = alpha("steelblue", 0.4))
296 | ```
297 |
298 | We have a mix of categorical and continuous variables and also want to
299 | incorporate non-linear aspects of the continuous variables. A GAM is a good
300 | choice to model this situation.
301 |
302 | ```{r}
303 | library(gam)
304 | fit0 <- gam(wage ~ s(year, 4) + s(age, 5) + education, data = Wage)
305 | fit1 <- gam(wage ~ s(year, 4) + s(age, 5) + education + jobclass, data = Wage)
306 | fit2 <- gam(wage ~ s(year, 4) + s(age, 5) + education + maritl, data = Wage)
307 | fit3 <- gam(wage ~ s(year, 4) + s(age, 5) + education + jobclass + maritl, data = Wage)
308 | anova(fit0, fit1, fit2, fit3)
309 | par(mfrow = c(2, 3))
310 | plot(fit3, se = TRUE, col = "blue")
311 | ```
312 |
313 | ### Question 8
314 |
315 | > Fit some of the non-linear models investigated in this chapter to the `Auto`
316 | > data set. Is there evidence for non-linear relationships in this data set?
317 | > Create some informative plots to justify your answer.
318 |
319 | Here we want to explore a range of non-linear models. First let's look at the
320 | relationships between the variables in the data.
321 |
322 | ```{r}
323 | pairs(Auto, cex = 0.4, pch = 19)
324 | ```
325 |
326 | It does appear that there are some non-linear relationships (e.g.
327 | horsepower / weight and mpg). We will pick one variable (horsepower) to predict
328 | mpg and try the range of models discussed in this chapter. We will measure
329 | test MSE through cross-validation to compare the models.
330 |
331 | ```{r}
332 | library(tidyverse)
333 | set.seed(42)
334 | fit <- glm(mpg ~ horsepower, data = Auto)
335 | err <- cv.glm(Auto, fit, K = 10)$delta[1]
336 |
337 | fit1 <- glm(mpg ~ poly(horsepower, 4), data = Auto)
338 | err1 <- cv.glm(Auto, fit1, K = 10)$delta[1]
339 |
340 | q <- quantile(Auto$horsepower)
341 | Auto$hp_cats <- cut(Auto$horsepower, breaks = q, include.lowest = TRUE)
342 | fit2 <- glm(mpg ~ hp_cats, data = Auto)
343 | err2 <- cv.glm(Auto, fit2, K = 10)$delta[1]
344 |
345 | fit3 <- glm(mpg ~ bs(horsepower, df = 4), data = Auto)
346 | err3 <- cv.glm(Auto, fit3, K = 10)$delta[1]
347 |
348 | fit4 <- glm(mpg ~ ns(horsepower, 4), data = Auto)
349 | err4 <- cv.glm(Auto, fit4, K = 10)$delta[1]
350 |
351 | fit5 <- gam(mpg ~ s(horsepower, df = 4), data = Auto)
352 | # rough 10-fold cross-validation for gam.
353 | err5 <- mean(replicate(10, {
354 | b <- cut(sample(seq_along(Auto$horsepower)), 10)
355 | pred <- numeric()
356 | for (i in 1:10) {
357 | train <- b %in% levels(b)[-i]
358 | f <- gam(mpg ~ s(horsepower, df = 4), data = Auto[train, ])
359 | pred[!train] <- predict(f, Auto[!train, ])
360 | }
361 | mean((Auto$mpg - pred)^2) # MSE
362 | }))
363 |
364 | c(err, err1, err2, err3, err4, err5)
365 | anova(fit, fit1, fit2, fit3, fit4, fit5)
366 |
367 | x <- seq(min(Auto$horsepower), max(Auto$horsepower), length.out = 1000)
368 | pred <- data.frame(
369 | x = x,
370 | "Linear" = predict(fit, data.frame(horsepower = x)),
371 | "Polynomial" = predict(fit1, data.frame(horsepower = x)),
372 | "Step" = predict(fit2, data.frame(hp_cats = cut(x, breaks = q, include.lowest = TRUE))),
373 | "Regression spline" = predict(fit3, data.frame(horsepower = x)),
374 | "Natural spline" = predict(fit4, data.frame(horsepower = x)),
375 | "Smoothing spline" = predict(fit5, data.frame(horsepower = x)),
376 | check.names = FALSE
377 | )
378 | pred <- pivot_longer(pred, -x)
379 | ggplot(Auto, aes(horsepower, mpg)) +
380 | geom_point(color = alpha("steelblue", 0.4)) +
381 | geom_line(data = pred, aes(x, value, color = name)) +
382 | theme_bw()
383 | ```
384 |
385 | ### Question 9
386 |
387 | > This question uses the variables `dis` (the weighted mean of distances to five
388 | > Boston employment centers) and `nox` (nitrogen oxides concentration in parts per
389 | > 10 million) from the `Boston` data. We will treat `dis` as the predictor and
390 | > `nox` as the response.
391 | >
392 | > a. Use the `poly()` function to fit a cubic polynomial regression to predict
393 | > `nox` using `dis`. Report the regression output, and plot the resulting data
394 | > and polynomial fits.
395 |
396 | ```{r}
397 | fit <- glm(nox ~ poly(dis, 3), data = Boston)
398 | summary(fit)
399 | plot(nox ~ dis, data = Boston, col = alpha("steelblue", 0.4), pch = 19)
400 | x <- seq(min(Boston$dis), max(Boston$dis), length.out = 1000)
401 | lines(x, predict(fit, data.frame(dis = x)), col = "red", lty = 2)
402 | ```
403 |
404 | > b. Plot the polynomial fits for a range of different polynomial degrees (say,
405 | > from 1 to 10), and report the associated residual sum of squares.
406 |
407 | ```{r}
408 | fits <- lapply(1:10, function(i) glm(nox ~ poly(dis, i), data = Boston))
409 |
410 | x <- seq(min(Boston$dis), max(Boston$dis), length.out = 1000)
411 | pred <- data.frame(lapply(fits, function(fit) predict(fit, data.frame(dis = x))))
412 | colnames(pred) <- 1:10
413 | pred$x <- x
414 | pred <- pivot_longer(pred, !x)
415 | ggplot(Boston, aes(dis, nox)) +
416 | geom_point(color = alpha("steelblue", 0.4)) +
417 | geom_line(data = pred, aes(x, value, color = name)) +
418 | theme_bw()
419 |
420 | # residual sum of squares
421 | do.call(anova, fits)[, 2]
422 | ```
423 |
424 | > c. Perform cross-validation or another approach to select the optimal degree
425 | > for the polynomial, and explain your results.
426 |
427 | ```{r}
428 | res <- sapply(1:10, function(i) {
429 | fit <- glm(nox ~ poly(dis, i), data = Boston)
430 | cv.glm(Boston, fit, K = 10)$delta[1]
431 | })
432 | which.min(res)
433 | ```
434 |
435 | The optimal degree is 3 based on cross-validation. Higher values tend to
436 | lead to overfitting.
437 |
438 | > d. Use the `bs()` function to fit a regression spline to predict `nox` using
439 | > `dis`. Report the output for the fit using four degrees of freedom. How did
440 | > you choose the knots? Plot the resulting fit.
441 |
442 | ```{r}
443 | fit <- glm(nox ~ bs(dis, df = 4), data = Boston)
444 | summary(fit)
445 |
446 | plot(nox ~ dis, data = Boston, col = alpha("steelblue", 0.4), pch = 19)
447 | x <- seq(min(Boston$dis), max(Boston$dis), length.out = 1000)
448 | lines(x, predict(fit, data.frame(dis = x)), col = "red", lty = 2)
449 | ```
450 |
451 | With four degrees of freedom (and a cubic spline), there is a single internal
452 | knot, which `bs()` places at the median of `dis`.
452 |
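We can confirm where the knot was placed; `splines::bs()` stores the internal knots
as an attribute of the basis object:

```{r}
attr(bs(Boston$dis, df = 4), "knots")
median(Boston$dis)
```
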
453 | > e. Now fit a regression spline for a range of degrees of freedom, and plot the
454 | > resulting fits and report the resulting RSS. Describe the results obtained.
455 |
456 | ```{r}
457 | fits <- lapply(3:10, function(i) {
458 | glm(nox ~ bs(dis, df = i), data = Boston)
459 | })
460 |
461 | x <- seq(min(Boston$dis), max(Boston$dis), length.out = 1000)
462 | pred <- data.frame(lapply(fits, function(fit) predict(fit, data.frame(dis = x))))
463 | colnames(pred) <- 3:10
464 | pred$x <- x
465 | pred <- pivot_longer(pred, !x)
466 | ggplot(Boston, aes(dis, nox)) +
467 | geom_point(color = alpha("steelblue", 0.4)) +
468 | geom_line(data = pred, aes(x, value, color = name)) +
469 | theme_bw()
470 | ```
471 |
472 | At high numbers of degrees of freedom the splines overfit the data (particularly
473 | at extreme ends of the distribution of the predictor variable).
474 |
475 | > f. Perform cross-validation or another approach in order to select the best
476 | > degrees of freedom for a regression spline on this data. Describe your
477 | > results.
478 |
479 | ```{r}
480 | set.seed(42)
481 | err <- sapply(3:10, function(i) {
482 | fit <- glm(nox ~ bs(dis, df = i), data = Boston)
483 | suppressWarnings(cv.glm(Boston, fit, K = 10)$delta[1])
484 | })
485 | which.min(err)
486 | ```
487 |
488 | This approach would select 4 degrees of freedom for the spline.
489 |
490 | ### Question 10
491 |
492 | > This question relates to the `College` data set.
493 | >
494 | > a. Split the data into a training set and a test set. Using out-of-state tuition
495 | > as the response and the other variables as the predictors, perform forward
496 | > stepwise selection on the training set in order to identify a satisfactory
497 | > model that uses just a subset of the predictors.
498 |
499 | ```{r}
500 | library(leaps)
501 |
502 | # helper function to predict from a regsubsets model
503 | predict.regsubsets <- function(object, newdata, id, ...) {
504 | form <- as.formula(object$call[[2]])
505 | mat <- model.matrix(form, newdata)
506 | coefi <- coef(object, id = id)
507 | xvars <- names(coefi)
508 | mat[, xvars] %*% coefi
509 | }
510 |
511 | set.seed(42)
512 | train <- rep(TRUE, nrow(College))
513 | train[sample(1:nrow(College), nrow(College) * 1 / 3)] <- FALSE
514 | fit <- regsubsets(Outstate ~ ., data = College[train, ], nvmax = 17, method = "forward")
515 |
516 | plot(summary(fit)$bic, type = "b")
517 | which.min(summary(fit)$bic)
518 |
519 | # or via the validation set (test) error
520 | err <- sapply(1:17, function(i) {
521 |   # test MSE of the model with i predictors
522 |   mean((College$Outstate[!train] - predict(fit, College[!train, ], i))^2)
523 | })
524 | which.min(err)
525 | min(summary(fit)$bic)
526 | ```
527 |
528 | For the sake of simplicity, we'll choose the model with 6 predictors:
529 |
530 | ```{r}
531 | coef(fit, id = 6)
532 | ```
533 |
534 | > b. Fit a GAM on the training data, using out-of-state tuition as the response
535 | > and the features selected in the previous step as the predictors. Plot the
536 | > results, and explain your findings.
537 |
538 | ```{r}
539 | fit <- gam(Outstate ~ Private + s(Room.Board, 2) + s(PhD, 2) + s(perc.alumni, 2) +
540 | s(Expend, 2) + s(Grad.Rate, 2), data = College[train, ])
541 | ```
542 |
543 | > c. Evaluate the model obtained on the test set, and explain the results
544 | > obtained.
545 |
546 | ```{r}
547 | pred <- predict(fit, College[!train, ])
548 | err_gam <- mean((College$Outstate[!train] - pred)^2)
549 | plot(err, ylim = c(min(err_gam, err), max(err)), type = "b")
550 | abline(h = err_gam, col = "red", lty = 2)
551 |
552 | # r-squared
553 | 1 - err_gam / mean((College$Outstate[!train] - mean(College$Outstate[!train]))^2)
554 | ```
555 |
556 | > d. For which variables, if any, is there evidence of a non-linear relationship
557 | > with the response?
558 |
559 | ```{r}
560 | summary(fit)
561 | ```
562 |
563 | Non-linear relationships are significant for `Expend` and `PhD`.
564 |
565 | ### Question 11
566 |
567 | > In Section 7.7, it was mentioned that GAMs are generally fit using a
568 | > _backfitting_ approach. The idea behind backfitting is actually quite simple. We
569 | > will now explore backfitting in the context of multiple linear regression.
570 | >
571 | > Suppose that we would like to perform multiple linear regression, but we do not
572 | > have software to do so. Instead, we only have software to perform simple linear
573 | > regression. Therefore, we take the following iterative approach: we repeatedly
574 | > hold all but one coefficient estimate fixed at its current value, and update
575 | > only that coefficient estimate using a simple linear regression. The process is
576 | > continued until _convergence_---that is, until the coefficient estimates stop
577 | > changing.
578 | >
579 | > We now try this out on a toy example.
580 | >
581 | > a. Generate a response $Y$ and two predictors $X_1$ and $X_2$, with $n = 100$.
582 |
583 | ```{r}
584 | set.seed(42)
585 | x1 <- rnorm(100)
586 | x2 <- rnorm(100)
587 | y <- 2 + 0.2 * x1 + 4 * x2 + rnorm(100)
588 | ```
589 |
590 | > b. Initialize $\hat{\beta}_1$ to take on a value of your choice. It does not
591 | >    matter what value you choose.
592 |
593 | ```{r}
594 | beta1 <- 20
595 | ```
596 |
597 | > c. Keeping $\hat{\beta}_1$ fixed, fit the model
598 | > $$Y - \hat{\beta}_1X_1 = \beta_0 + \beta_2X_2 + \epsilon.$$
599 | > You can do this as follows:
600 | >
601 | > ```r
602 | > > a <- y - beta1 * x1
603 | > > beta2 <- lm(a ~ x2)$coef[2]
604 | > ```
605 |
606 | ```{r}
607 | a <- y - beta1 * x1
608 | beta2 <- lm(a ~ x2)$coef[2]
609 | ```
610 |
611 | > d. Keeping $\hat{\beta}_2$ fixed, fit the model
612 | > $$Y - \hat{\beta}_2X_2 = \beta_0 + \beta_1 X_1 + \epsilon.$$
613 | > You can do this as follows:
614 | >
615 | > ```r
616 | > > a <- y - beta2 * x2
617 | > > beta1 <- lm(a ~ x1)$coef[2]
618 | > ```
619 |
620 | ```{r}
621 | a <- y - beta2 * x2
622 | beta1 <- lm(a ~ x1)$coef[2]
623 | ```
624 |
625 | > e. Write a for loop to repeat (c) and (d) 1,000 times. Report the estimates of
626 | > $\hat{\beta}_0, \hat{\beta}_1,$ and $\hat{\beta}_2$ at each iteration of the
627 | > for loop. Create a plot in which each of these values is displayed, with
628 | > $\hat{\beta}_0, \hat{\beta}_1,$ and $\hat{\beta}_2$ each shown in a different
629 | > color.
630 |
631 | ```{r}
632 | res <- matrix(NA, nrow = 1000, ncol = 3)
633 | colnames(res) <- c("beta0", "beta1", "beta2")
634 | beta1 <- 20
635 | for (i in 1:1000) {
636 | beta2 <- lm(y - beta1 * x1 ~ x2)$coef[2]
637 | beta1 <- lm(y - beta2 * x2 ~ x1)$coef[2]
638 | beta0 <- lm(y - beta2 * x2 ~ x1)$coef[1]
639 | res[i, ] <- c(beta0, beta1, beta2)
640 | }
641 | res <- as.data.frame(res)
642 | res$Iteration <- 1:1000
643 | res <- pivot_longer(res, !Iteration)
644 | p <- ggplot(res, aes(x = Iteration, y = value, color = name)) +
645 | geom_line() +
646 | geom_point() +
647 | scale_x_continuous(trans = "log10")
648 | p
649 | ```
650 |
651 | > f. Compare your answer in (e) to the results of simply performing multiple
652 | > linear regression to predict $Y$ using $X_1$ and $X_2$. Use the `abline()`
653 | > function to overlay those multiple linear regression coefficient estimates on
654 | > the plot obtained in (e).
655 |
656 | ```{r}
657 | fit <- lm(y ~ x1 + x2)
658 | coef(fit)
659 | p + geom_hline(yintercept = coef(fit), lty = 2)
660 | ```
661 |
662 | > g. On this data set, how many backfitting iterations were required in order to
663 | > obtain a "good" approximation to the multiple regression coefficient
664 | > estimates?
665 |
666 | In this case, good estimates were obtained after 3 iterations.
667 |
668 | ### Question 12
669 |
670 | > This problem is a continuation of the previous exercise. In a toy example with
671 | > $p = 100$, show that one can approximate the multiple linear regression
672 | > coefficient estimates by repeatedly performing simple linear regression in a
673 | > backfitting procedure. How many backfitting iterations are required in order to
674 | > obtain a "good" approximation to the multiple regression coefficient estimates?
675 | > Create a plot to justify your answer.
676 |
677 | ```{r}
678 | set.seed(42)
679 |
680 | p <- 100
681 | n <- 1000
682 |
683 | betas <- rnorm(p) * 5
684 | x <- matrix(rnorm(n * p), ncol = p, nrow = n)
685 | y <- (x %*% betas) + rnorm(n) # ignore beta0 for simplicity
686 |
687 | # multiple regression
688 | fit <- lm(y ~ x - 1)
689 | coef(fit)
690 |
691 | # backfitting
692 | backfit <- function(x, y, iter = 20) {
693 | beta <- matrix(0, ncol = ncol(x), nrow = iter + 1)
694 | for (i in 1:iter) {
695 | for (k in 1:ncol(x)) {
696 | residual <- y - (x[, -k] %*% beta[i, -k])
697 | beta[i + 1, k] <- lm(residual ~ x[, k])$coef[2]
698 | }
699 | }
700 | beta[-1, ]
701 | }
702 | res <- backfit(x, y)
703 | error <- rowMeans(sweep(res, 2, betas)^2)
704 | plot(error, log = "x", type = "b")
705 |
706 | # backfitting error
707 | error[length(error)]
708 |
709 | # lm error
710 | mean((coef(fit) - betas)^2)
711 | ```
712 |
713 | We need around 5 to 6 iterations to obtain a good estimate of the coefficients.
714 |
--------------------------------------------------------------------------------
/data/Auto.data:
--------------------------------------------------------------------------------
1 | mpg cylinders displacement horsepower weight acceleration year origin name
2 | 18.0 8 307.0 130.0 3504. 12.0 70 1 "chevrolet chevelle malibu"
3 | 15.0 8 350.0 165.0 3693. 11.5 70 1 "buick skylark 320"
4 | 18.0 8 318.0 150.0 3436. 11.0 70 1 "plymouth satellite"
5 | 16.0 8 304.0 150.0 3433. 12.0 70 1 "amc rebel sst"
6 | 17.0 8 302.0 140.0 3449. 10.5 70 1 "ford torino"
7 | 15.0 8 429.0 198.0 4341. 10.0 70 1 "ford galaxie 500"
8 | 14.0 8 454.0 220.0 4354. 9.0 70 1 "chevrolet impala"
9 | 14.0 8 440.0 215.0 4312. 8.5 70 1 "plymouth fury iii"
10 | 14.0 8 455.0 225.0 4425. 10.0 70 1 "pontiac catalina"
11 | 15.0 8 390.0 190.0 3850. 8.5 70 1 "amc ambassador dpl"
12 | 15.0 8 383.0 170.0 3563. 10.0 70 1 "dodge challenger se"
13 | 14.0 8 340.0 160.0 3609. 8.0 70 1 "plymouth 'cuda 340"
14 | 15.0 8 400.0 150.0 3761. 9.5 70 1 "chevrolet monte carlo"
15 | 14.0 8 455.0 225.0 3086. 10.0 70 1 "buick estate wagon (sw)"
16 | 24.0 4 113.0 95.00 2372. 15.0 70 3 "toyota corona mark ii"
17 | 22.0 6 198.0 95.00 2833. 15.5 70 1 "plymouth duster"
18 | 18.0 6 199.0 97.00 2774. 15.5 70 1 "amc hornet"
19 | 21.0 6 200.0 85.00 2587. 16.0 70 1 "ford maverick"
20 | 27.0 4 97.00 88.00 2130. 14.5 70 3 "datsun pl510"
21 | 26.0 4 97.00 46.00 1835. 20.5 70 2 "volkswagen 1131 deluxe sedan"
22 | 25.0 4 110.0 87.00 2672. 17.5 70 2 "peugeot 504"
23 | 24.0 4 107.0 90.00 2430. 14.5 70 2 "audi 100 ls"
24 | 25.0 4 104.0 95.00 2375. 17.5 70 2 "saab 99e"
25 | 26.0 4 121.0 113.0 2234. 12.5 70 2 "bmw 2002"
26 | 21.0 6 199.0 90.00 2648. 15.0 70 1 "amc gremlin"
27 | 10.0 8 360.0 215.0 4615. 14.0 70 1 "ford f250"
28 | 10.0 8 307.0 200.0 4376. 15.0 70 1 "chevy c20"
29 | 11.0 8 318.0 210.0 4382. 13.5 70 1 "dodge d200"
30 | 9.0 8 304.0 193.0 4732. 18.5 70 1 "hi 1200d"
31 | 27.0 4 97.00 88.00 2130. 14.5 71 3 "datsun pl510"
32 | 28.0 4 140.0 90.00 2264. 15.5 71 1 "chevrolet vega 2300"
33 | 25.0 4 113.0 95.00 2228. 14.0 71 3 "toyota corona"
34 | 25.0 4 98.00 ? 2046. 19.0 71 1 "ford pinto"
35 | 19.0 6 232.0 100.0 2634. 13.0 71 1 "amc gremlin"
36 | 16.0 6 225.0 105.0 3439. 15.5 71 1 "plymouth satellite custom"
37 | 17.0 6 250.0 100.0 3329. 15.5 71 1 "chevrolet chevelle malibu"
38 | 19.0 6 250.0 88.00 3302. 15.5 71 1 "ford torino 500"
39 | 18.0 6 232.0 100.0 3288. 15.5 71 1 "amc matador"
40 | 14.0 8 350.0 165.0 4209. 12.0 71 1 "chevrolet impala"
41 | 14.0 8 400.0 175.0 4464. 11.5 71 1 "pontiac catalina brougham"
42 | 14.0 8 351.0 153.0 4154. 13.5 71 1 "ford galaxie 500"
43 | 14.0 8 318.0 150.0 4096. 13.0 71 1 "plymouth fury iii"
44 | 12.0 8 383.0 180.0 4955. 11.5 71 1 "dodge monaco (sw)"
45 | 13.0 8 400.0 170.0 4746. 12.0 71 1 "ford country squire (sw)"
46 | 13.0 8 400.0 175.0 5140. 12.0 71 1 "pontiac safari (sw)"
47 | 18.0 6 258.0 110.0 2962. 13.5 71 1 "amc hornet sportabout (sw)"
48 | 22.0 4 140.0 72.00 2408. 19.0 71 1 "chevrolet vega (sw)"
49 | 19.0 6 250.0 100.0 3282. 15.0 71 1 "pontiac firebird"
50 | 18.0 6 250.0 88.00 3139. 14.5 71 1 "ford mustang"
51 | 23.0 4 122.0 86.00 2220. 14.0 71 1 "mercury capri 2000"
52 | 28.0 4 116.0 90.00 2123. 14.0 71 2 "opel 1900"
53 | 30.0 4 79.00 70.00 2074. 19.5 71 2 "peugeot 304"
54 | 30.0 4 88.00 76.00 2065. 14.5 71 2 "fiat 124b"
55 | 31.0 4 71.00 65.00 1773. 19.0 71 3 "toyota corolla 1200"
56 | 35.0 4 72.00 69.00 1613. 18.0 71 3 "datsun 1200"
57 | 27.0 4 97.00 60.00 1834. 19.0 71 2 "volkswagen model 111"
58 | 26.0 4 91.00 70.00 1955. 20.5 71 1 "plymouth cricket"
59 | 24.0 4 113.0 95.00 2278. 15.5 72 3 "toyota corona hardtop"
60 | 25.0 4 97.50 80.00 2126. 17.0 72 1 "dodge colt hardtop"
61 | 23.0 4 97.00 54.00 2254. 23.5 72 2 "volkswagen type 3"
62 | 20.0 4 140.0 90.00 2408. 19.5 72 1 "chevrolet vega"
63 | 21.0 4 122.0 86.00 2226. 16.5 72 1 "ford pinto runabout"
64 | 13.0 8 350.0 165.0 4274. 12.0 72 1 "chevrolet impala"
65 | 14.0 8 400.0 175.0 4385. 12.0 72 1 "pontiac catalina"
66 | 15.0 8 318.0 150.0 4135. 13.5 72 1 "plymouth fury iii"
67 | 14.0 8 351.0 153.0 4129. 13.0 72 1 "ford galaxie 500"
68 | 17.0 8 304.0 150.0 3672. 11.5 72 1 "amc ambassador sst"
69 | 11.0 8 429.0 208.0 4633. 11.0 72 1 "mercury marquis"
70 | 13.0 8 350.0 155.0 4502. 13.5 72 1 "buick lesabre custom"
71 | 12.0 8 350.0 160.0 4456. 13.5 72 1 "oldsmobile delta 88 royale"
72 | 13.0 8 400.0 190.0 4422. 12.5 72 1 "chrysler newport royal"
73 | 19.0 3 70.00 97.00 2330. 13.5 72 3 "mazda rx2 coupe"
74 | 15.0 8 304.0 150.0 3892. 12.5 72 1 "amc matador (sw)"
75 | 13.0 8 307.0 130.0 4098. 14.0 72 1 "chevrolet chevelle concours (sw)"
76 | 13.0 8 302.0 140.0 4294. 16.0 72 1 "ford gran torino (sw)"
77 | 14.0 8 318.0 150.0 4077. 14.0 72 1 "plymouth satellite custom (sw)"
78 | 18.0 4 121.0 112.0 2933. 14.5 72 2 "volvo 145e (sw)"
79 | 22.0 4 121.0 76.00 2511. 18.0 72 2 "volkswagen 411 (sw)"
80 | 21.0 4 120.0 87.00 2979. 19.5 72 2 "peugeot 504 (sw)"
81 | 26.0 4 96.00 69.00 2189. 18.0 72 2 "renault 12 (sw)"
82 | 22.0 4 122.0 86.00 2395. 16.0 72 1 "ford pinto (sw)"
83 | 28.0 4 97.00 92.00 2288. 17.0 72 3 "datsun 510 (sw)"
84 | 23.0 4 120.0 97.00 2506. 14.5 72 3 "toyouta corona mark ii (sw)"
85 | 28.0 4 98.00 80.00 2164. 15.0 72 1 "dodge colt (sw)"
86 | 27.0 4 97.00 88.00 2100. 16.5 72 3 "toyota corolla 1600 (sw)"
87 | 13.0 8 350.0 175.0 4100. 13.0 73 1 "buick century 350"
88 | 14.0 8 304.0 150.0 3672. 11.5 73 1 "amc matador"
89 | 13.0 8 350.0 145.0 3988. 13.0 73 1 "chevrolet malibu"
90 | 14.0 8 302.0 137.0 4042. 14.5 73 1 "ford gran torino"
91 | 15.0 8 318.0 150.0 3777. 12.5 73 1 "dodge coronet custom"
92 | 12.0 8 429.0 198.0 4952. 11.5 73 1 "mercury marquis brougham"
93 | 13.0 8 400.0 150.0 4464. 12.0 73 1 "chevrolet caprice classic"
94 | 13.0 8 351.0 158.0 4363. 13.0 73 1 "ford ltd"
95 | 14.0 8 318.0 150.0 4237. 14.5 73 1 "plymouth fury gran sedan"
96 | 13.0 8 440.0 215.0 4735. 11.0 73 1 "chrysler new yorker brougham"
97 | 12.0 8 455.0 225.0 4951. 11.0 73 1 "buick electra 225 custom"
98 | 13.0 8 360.0 175.0 3821. 11.0 73 1 "amc ambassador brougham"
99 | 18.0 6 225.0 105.0 3121. 16.5 73 1 "plymouth valiant"
100 | 16.0 6 250.0 100.0 3278. 18.0 73 1 "chevrolet nova custom"
101 | 18.0 6 232.0 100.0 2945. 16.0 73 1 "amc hornet"
102 | 18.0 6 250.0 88.00 3021. 16.5 73 1 "ford maverick"
103 | 23.0 6 198.0 95.00 2904. 16.0 73 1 "plymouth duster"
104 | 26.0 4 97.00 46.00 1950. 21.0 73 2 "volkswagen super beetle"
105 | 11.0 8 400.0 150.0 4997. 14.0 73 1 "chevrolet impala"
106 | 12.0 8 400.0 167.0 4906. 12.5 73 1 "ford country"
107 | 13.0 8 360.0 170.0 4654. 13.0 73 1 "plymouth custom suburb"
108 | 12.0 8 350.0 180.0 4499. 12.5 73 1 "oldsmobile vista cruiser"
109 | 18.0 6 232.0 100.0 2789. 15.0 73 1 "amc gremlin"
110 | 20.0 4 97.00 88.00 2279. 19.0 73 3 "toyota carina"
111 | 21.0 4 140.0 72.00 2401. 19.5 73 1 "chevrolet vega"
112 | 22.0 4 108.0 94.00 2379. 16.5 73 3 "datsun 610"
113 | 18.0 3 70.00 90.00 2124. 13.5 73 3 "maxda rx3"
114 | 19.0 4 122.0 85.00 2310. 18.5 73 1 "ford pinto"
115 | 21.0 6 155.0 107.0 2472. 14.0 73 1 "mercury capri v6"
116 | 26.0 4 98.00 90.00 2265. 15.5 73 2 "fiat 124 sport coupe"
117 | 15.0 8 350.0 145.0 4082. 13.0 73 1 "chevrolet monte carlo s"
118 | 16.0 8 400.0 230.0 4278. 9.50 73 1 "pontiac grand prix"
119 | 29.0 4 68.00 49.00 1867. 19.5 73 2 "fiat 128"
120 | 24.0 4 116.0 75.00 2158. 15.5 73 2 "opel manta"
121 | 20.0 4 114.0 91.00 2582. 14.0 73 2 "audi 100ls"
122 | 19.0 4 121.0 112.0 2868. 15.5 73 2 "volvo 144ea"
123 | 15.0 8 318.0 150.0 3399. 11.0 73 1 "dodge dart custom"
124 | 24.0 4 121.0 110.0 2660. 14.0 73 2 "saab 99le"
125 | 20.0 6 156.0 122.0 2807. 13.5 73 3 "toyota mark ii"
126 | 11.0 8 350.0 180.0 3664. 11.0 73 1 "oldsmobile omega"
127 | 20.0 6 198.0 95.00 3102. 16.5 74 1 "plymouth duster"
128 | 21.0 6 200.0 ? 2875. 17.0 74 1 "ford maverick"
129 | 19.0 6 232.0 100.0 2901. 16.0 74 1 "amc hornet"
130 | 15.0 6 250.0 100.0 3336. 17.0 74 1 "chevrolet nova"
131 | 31.0 4 79.00 67.00 1950. 19.0 74 3 "datsun b210"
132 | 26.0 4 122.0 80.00 2451. 16.5 74 1 "ford pinto"
133 | 32.0 4 71.00 65.00 1836. 21.0 74 3 "toyota corolla 1200"
134 | 25.0 4 140.0 75.00 2542. 17.0 74 1 "chevrolet vega"
135 | 16.0 6 250.0 100.0 3781. 17.0 74 1 "chevrolet chevelle malibu classic"
136 | 16.0 6 258.0 110.0 3632. 18.0 74 1 "amc matador"
137 | 18.0 6 225.0 105.0 3613. 16.5 74 1 "plymouth satellite sebring"
138 | 16.0 8 302.0 140.0 4141. 14.0 74 1 "ford gran torino"
139 | 13.0 8 350.0 150.0 4699. 14.5 74 1 "buick century luxus (sw)"
140 | 14.0 8 318.0 150.0 4457. 13.5 74 1 "dodge coronet custom (sw)"
141 | 14.0 8 302.0 140.0 4638. 16.0 74 1 "ford gran torino (sw)"
142 | 14.0 8 304.0 150.0 4257. 15.5 74 1 "amc matador (sw)"
143 | 29.0 4 98.00 83.00 2219. 16.5 74 2 "audi fox"
144 | 26.0 4 79.00 67.00 1963. 15.5 74 2 "volkswagen dasher"
145 | 26.0 4 97.00 78.00 2300. 14.5 74 2 "opel manta"
146 | 31.0 4 76.00 52.00 1649. 16.5 74 3 "toyota corona"
147 | 32.0 4 83.00 61.00 2003. 19.0 74 3 "datsun 710"
148 | 28.0 4 90.00 75.00 2125. 14.5 74 1 "dodge colt"
149 | 24.0 4 90.00 75.00 2108. 15.5 74 2 "fiat 128"
150 | 26.0 4 116.0 75.00 2246. 14.0 74 2 "fiat 124 tc"
151 | 24.0 4 120.0 97.00 2489. 15.0 74 3 "honda civic"
152 | 26.0 4 108.0 93.00 2391. 15.5 74 3 "subaru"
153 | 31.0 4 79.00 67.00 2000. 16.0 74 2 "fiat x1.9"
154 | 19.0 6 225.0 95.00 3264. 16.0 75 1 "plymouth valiant custom"
155 | 18.0 6 250.0 105.0 3459. 16.0 75 1 "chevrolet nova"
156 | 15.0 6 250.0 72.00 3432. 21.0 75 1 "mercury monarch"
157 | 15.0 6 250.0 72.00 3158. 19.5 75 1 "ford maverick"
158 | 16.0 8 400.0 170.0 4668. 11.5 75 1 "pontiac catalina"
159 | 15.0 8 350.0 145.0 4440. 14.0 75 1 "chevrolet bel air"
160 | 16.0 8 318.0 150.0 4498. 14.5 75 1 "plymouth grand fury"
161 | 14.0 8 351.0 148.0 4657. 13.5 75 1 "ford ltd"
162 | 17.0 6 231.0 110.0 3907. 21.0 75 1 "buick century"
163 | 16.0 6 250.0 105.0 3897. 18.5 75 1 "chevroelt chevelle malibu"
164 | 15.0 6 258.0 110.0 3730. 19.0 75 1 "amc matador"
165 | 18.0 6 225.0 95.00 3785. 19.0 75 1 "plymouth fury"
166 | 21.0 6 231.0 110.0 3039. 15.0 75 1 "buick skyhawk"
167 | 20.0 8 262.0 110.0 3221. 13.5 75 1 "chevrolet monza 2+2"
168 | 13.0 8 302.0 129.0 3169. 12.0 75 1 "ford mustang ii"
169 | 29.0 4 97.00 75.00 2171. 16.0 75 3 "toyota corolla"
170 | 23.0 4 140.0 83.00 2639. 17.0 75 1 "ford pinto"
171 | 20.0 6 232.0 100.0 2914. 16.0 75 1 "amc gremlin"
172 | 23.0 4 140.0 78.00 2592. 18.5 75 1 "pontiac astro"
173 | 24.0 4 134.0 96.00 2702. 13.5 75 3 "toyota corona"
174 | 25.0 4 90.00 71.00 2223. 16.5 75 2 "volkswagen dasher"
175 | 24.0 4 119.0 97.00 2545. 17.0 75 3 "datsun 710"
176 | 18.0 6 171.0 97.00 2984. 14.5 75 1 "ford pinto"
177 | 29.0 4 90.00 70.00 1937. 14.0 75 2 "volkswagen rabbit"
178 | 19.0 6 232.0 90.00 3211. 17.0 75 1 "amc pacer"
179 | 23.0 4 115.0 95.00 2694. 15.0 75 2 "audi 100ls"
180 | 23.0 4 120.0 88.00 2957. 17.0 75 2 "peugeot 504"
181 | 22.0 4 121.0 98.00 2945. 14.5 75 2 "volvo 244dl"
182 | 25.0 4 121.0 115.0 2671. 13.5 75 2 "saab 99le"
183 | 33.0 4 91.00 53.00 1795. 17.5 75 3 "honda civic cvcc"
184 | 28.0 4 107.0 86.00 2464. 15.5 76 2 "fiat 131"
185 | 25.0 4 116.0 81.00 2220. 16.9 76 2 "opel 1900"
186 | 25.0 4 140.0 92.00 2572. 14.9 76 1 "capri ii"
187 | 26.0 4 98.00 79.00 2255. 17.7 76 1 "dodge colt"
188 | 27.0 4 101.0 83.00 2202. 15.3 76 2 "renault 12tl"
189 | 17.5 8 305.0 140.0 4215. 13.0 76 1 "chevrolet chevelle malibu classic"
190 | 16.0 8 318.0 150.0 4190. 13.0 76 1 "dodge coronet brougham"
191 | 15.5 8 304.0 120.0 3962. 13.9 76 1 "amc matador"
192 | 14.5 8 351.0 152.0 4215. 12.8 76 1 "ford gran torino"
193 | 22.0 6 225.0 100.0 3233. 15.4 76 1 "plymouth valiant"
194 | 22.0 6 250.0 105.0 3353. 14.5 76 1 "chevrolet nova"
195 | 24.0 6 200.0 81.00 3012. 17.6 76 1 "ford maverick"
196 | 22.5 6 232.0 90.00 3085. 17.6 76 1 "amc hornet"
197 | 29.0 4 85.00 52.00 2035. 22.2 76 1 "chevrolet chevette"
198 | 24.5 4 98.00 60.00 2164. 22.1 76 1 "chevrolet woody"
199 | 29.0 4 90.00 70.00 1937. 14.2 76 2 "vw rabbit"
200 | 33.0 4 91.00 53.00 1795. 17.4 76 3 "honda civic"
201 | 20.0 6 225.0 100.0 3651. 17.7 76 1 "dodge aspen se"
202 | 18.0 6 250.0 78.00 3574. 21.0 76 1 "ford granada ghia"
203 | 18.5 6 250.0 110.0 3645. 16.2 76 1 "pontiac ventura sj"
204 | 17.5 6 258.0 95.00 3193. 17.8 76 1 "amc pacer d/l"
205 | 29.5 4 97.00 71.00 1825. 12.2 76 2 "volkswagen rabbit"
206 | 32.0 4 85.00 70.00 1990. 17.0 76 3 "datsun b-210"
207 | 28.0 4 97.00 75.00 2155. 16.4 76 3 "toyota corolla"
208 | 26.5 4 140.0 72.00 2565. 13.6 76 1 "ford pinto"
209 | 20.0 4 130.0 102.0 3150. 15.7 76 2 "volvo 245"
210 | 13.0 8 318.0 150.0 3940. 13.2 76 1 "plymouth volare premier v8"
211 | 19.0 4 120.0 88.00 3270. 21.9 76 2 "peugeot 504"
212 | 19.0 6 156.0 108.0 2930. 15.5 76 3 "toyota mark ii"
213 | 16.5 6 168.0 120.0 3820. 16.7 76 2 "mercedes-benz 280s"
214 | 16.5 8 350.0 180.0 4380. 12.1 76 1 "cadillac seville"
215 | 13.0 8 350.0 145.0 4055. 12.0 76 1 "chevy c10"
216 | 13.0 8 302.0 130.0 3870. 15.0 76 1 "ford f108"
217 | 13.0 8 318.0 150.0 3755. 14.0 76 1 "dodge d100"
218 | 31.5 4 98.00 68.00 2045. 18.5 77 3 "honda accord cvcc"
219 | 30.0 4 111.0 80.00 2155. 14.8 77 1 "buick opel isuzu deluxe"
220 | 36.0 4 79.00 58.00 1825. 18.6 77 2 "renault 5 gtl"
221 | 25.5 4 122.0 96.00 2300. 15.5 77 1 "plymouth arrow gs"
222 | 33.5 4 85.00 70.00 1945. 16.8 77 3 "datsun f-10 hatchback"
223 | 17.5 8 305.0 145.0 3880. 12.5 77 1 "chevrolet caprice classic"
224 | 17.0 8 260.0 110.0 4060. 19.0 77 1 "oldsmobile cutlass supreme"
225 | 15.5 8 318.0 145.0 4140. 13.7 77 1 "dodge monaco brougham"
226 | 15.0 8 302.0 130.0 4295. 14.9 77 1 "mercury cougar brougham"
227 | 17.5 6 250.0 110.0 3520. 16.4 77 1 "chevrolet concours"
228 | 20.5 6 231.0 105.0 3425. 16.9 77 1 "buick skylark"
229 | 19.0 6 225.0 100.0 3630. 17.7 77 1 "plymouth volare custom"
230 | 18.5 6 250.0 98.00 3525. 19.0 77 1 "ford granada"
231 | 16.0 8 400.0 180.0 4220. 11.1 77 1 "pontiac grand prix lj"
232 | 15.5 8 350.0 170.0 4165. 11.4 77 1 "chevrolet monte carlo landau"
233 | 15.5 8 400.0 190.0 4325. 12.2 77 1 "chrysler cordoba"
234 | 16.0 8 351.0 149.0 4335. 14.5 77 1 "ford thunderbird"
235 | 29.0 4 97.00 78.00 1940. 14.5 77 2 "volkswagen rabbit custom"
236 | 24.5 4 151.0 88.00 2740. 16.0 77 1 "pontiac sunbird coupe"
237 | 26.0 4 97.00 75.00 2265. 18.2 77 3 "toyota corolla liftback"
238 | 25.5 4 140.0 89.00 2755. 15.8 77 1 "ford mustang ii 2+2"
239 | 30.5 4 98.00 63.00 2051. 17.0 77 1 "chevrolet chevette"
240 | 33.5 4 98.00 83.00 2075. 15.9 77 1 "dodge colt m/m"
241 | 30.0 4 97.00 67.00 1985. 16.4 77 3 "subaru dl"
242 | 30.5 4 97.00 78.00 2190. 14.1 77 2 "volkswagen dasher"
243 | 22.0 6 146.0 97.00 2815. 14.5 77 3 "datsun 810"
244 | 21.5 4 121.0 110.0 2600. 12.8 77 2 "bmw 320i"
245 | 21.5 3 80.00 110.0 2720. 13.5 77 3 "mazda rx-4"
246 | 43.1 4 90.00 48.00 1985. 21.5 78 2 "volkswagen rabbit custom diesel"
247 | 36.1 4 98.00 66.00 1800. 14.4 78 1 "ford fiesta"
248 | 32.8 4 78.00 52.00 1985. 19.4 78 3 "mazda glc deluxe"
249 | 39.4 4 85.00 70.00 2070. 18.6 78 3 "datsun b210 gx"
250 | 36.1 4 91.00 60.00 1800. 16.4 78 3 "honda civic cvcc"
251 | 19.9 8 260.0 110.0 3365. 15.5 78 1 "oldsmobile cutlass salon brougham"
252 | 19.4 8 318.0 140.0 3735. 13.2 78 1 "dodge diplomat"
253 | 20.2 8 302.0 139.0 3570. 12.8 78 1 "mercury monarch ghia"
254 | 19.2 6 231.0 105.0 3535. 19.2 78 1 "pontiac phoenix lj"
255 | 20.5 6 200.0 95.00 3155. 18.2 78 1 "chevrolet malibu"
256 | 20.2 6 200.0 85.00 2965. 15.8 78 1 "ford fairmont (auto)"
257 | 25.1 4 140.0 88.00 2720. 15.4 78 1 "ford fairmont (man)"
258 | 20.5 6 225.0 100.0 3430. 17.2 78 1 "plymouth volare"
259 | 19.4 6 232.0 90.00 3210. 17.2 78 1 "amc concord"
260 | 20.6 6 231.0 105.0 3380. 15.8 78 1 "buick century special"
261 | 20.8 6 200.0 85.00 3070. 16.7 78 1 "mercury zephyr"
262 | 18.6 6 225.0 110.0 3620. 18.7 78 1 "dodge aspen"
263 | 18.1 6 258.0 120.0 3410. 15.1 78 1 "amc concord d/l"
264 | 19.2 8 305.0 145.0 3425. 13.2 78 1 "chevrolet monte carlo landau"
265 | 17.7 6 231.0 165.0 3445. 13.4 78 1 "buick regal sport coupe (turbo)"
266 | 18.1 8 302.0 139.0 3205. 11.2 78 1 "ford futura"
267 | 17.5 8 318.0 140.0 4080. 13.7 78 1 "dodge magnum xe"
268 | 30.0 4 98.00 68.00 2155. 16.5 78 1 "chevrolet chevette"
269 | 27.5 4 134.0 95.00 2560. 14.2 78 3 "toyota corona"
270 | 27.2 4 119.0 97.00 2300. 14.7 78 3 "datsun 510"
271 | 30.9 4 105.0 75.00 2230. 14.5 78 1 "dodge omni"
272 | 21.1 4 134.0 95.00 2515. 14.8 78 3 "toyota celica gt liftback"
273 | 23.2 4 156.0 105.0 2745. 16.7 78 1 "plymouth sapporo"
274 | 23.8 4 151.0 85.00 2855. 17.6 78 1 "oldsmobile starfire sx"
275 | 23.9 4 119.0 97.00 2405. 14.9 78 3 "datsun 200-sx"
276 | 20.3 5 131.0 103.0 2830. 15.9 78 2 "audi 5000"
277 | 17.0 6 163.0 125.0 3140. 13.6 78 2 "volvo 264gl"
278 | 21.6 4 121.0 115.0 2795. 15.7 78 2 "saab 99gle"
279 | 16.2 6 163.0 133.0 3410. 15.8 78 2 "peugeot 604sl"
280 | 31.5 4 89.00 71.00 1990. 14.9 78 2 "volkswagen scirocco"
281 | 29.5 4 98.00 68.00 2135. 16.6 78 3 "honda accord lx"
282 | 21.5 6 231.0 115.0 3245. 15.4 79 1 "pontiac lemans v6"
283 | 19.8 6 200.0 85.00 2990. 18.2 79 1 "mercury zephyr 6"
284 | 22.3 4 140.0 88.00 2890. 17.3 79 1 "ford fairmont 4"
285 | 20.2 6 232.0 90.00 3265. 18.2 79 1 "amc concord dl 6"
286 | 20.6 6 225.0 110.0 3360. 16.6 79 1 "dodge aspen 6"
287 | 17.0 8 305.0 130.0 3840. 15.4 79 1 "chevrolet caprice classic"
288 | 17.6 8 302.0 129.0 3725. 13.4 79 1 "ford ltd landau"
289 | 16.5 8 351.0 138.0 3955. 13.2 79 1 "mercury grand marquis"
290 | 18.2 8 318.0 135.0 3830. 15.2 79 1 "dodge st. regis"
291 | 16.9 8 350.0 155.0 4360. 14.9 79 1 "buick estate wagon (sw)"
292 | 15.5 8 351.0 142.0 4054. 14.3 79 1 "ford country squire (sw)"
293 | 19.2 8 267.0 125.0 3605. 15.0 79 1 "chevrolet malibu classic (sw)"
294 | 18.5 8 360.0 150.0 3940. 13.0 79 1 "chrysler lebaron town @ country (sw)"
295 | 31.9 4 89.00 71.00 1925. 14.0 79 2 "vw rabbit custom"
296 | 34.1 4 86.00 65.00 1975. 15.2 79 3 "maxda glc deluxe"
297 | 35.7 4 98.00 80.00 1915. 14.4 79 1 "dodge colt hatchback custom"
298 | 27.4 4 121.0 80.00 2670. 15.0 79 1 "amc spirit dl"
299 | 25.4 5 183.0 77.00 3530. 20.1 79 2 "mercedes benz 300d"
300 | 23.0 8 350.0 125.0 3900. 17.4 79 1 "cadillac eldorado"
301 | 27.2 4 141.0 71.00 3190. 24.8 79 2 "peugeot 504"
302 | 23.9 8 260.0 90.00 3420. 22.2 79 1 "oldsmobile cutlass salon brougham"
303 | 34.2 4 105.0 70.00 2200. 13.2 79 1 "plymouth horizon"
304 | 34.5 4 105.0 70.00 2150. 14.9 79 1 "plymouth horizon tc3"
305 | 31.8 4 85.00 65.00 2020. 19.2 79 3 "datsun 210"
306 | 37.3 4 91.00 69.00 2130. 14.7 79 2 "fiat strada custom"
307 | 28.4 4 151.0 90.00 2670. 16.0 79 1 "buick skylark limited"
308 | 28.8 6 173.0 115.0 2595. 11.3 79 1 "chevrolet citation"
309 | 26.8 6 173.0 115.0 2700. 12.9 79 1 "oldsmobile omega brougham"
310 | 33.5 4 151.0 90.00 2556. 13.2 79 1 "pontiac phoenix"
311 | 41.5 4 98.00 76.00 2144. 14.7 80 2 "vw rabbit"
312 | 38.1 4 89.00 60.00 1968. 18.8 80 3 "toyota corolla tercel"
313 | 32.1 4 98.00 70.00 2120. 15.5 80 1 "chevrolet chevette"
314 | 37.2 4 86.00 65.00 2019. 16.4 80 3 "datsun 310"
315 | 28.0 4 151.0 90.00 2678. 16.5 80 1 "chevrolet citation"
316 | 26.4 4 140.0 88.00 2870. 18.1 80 1 "ford fairmont"
317 | 24.3 4 151.0 90.00 3003. 20.1 80 1 "amc concord"
318 | 19.1 6 225.0 90.00 3381. 18.7 80 1 "dodge aspen"
319 | 34.3 4 97.00 78.00 2188. 15.8 80 2 "audi 4000"
320 | 29.8 4 134.0 90.00 2711. 15.5 80 3 "toyota corona liftback"
321 | 31.3 4 120.0 75.00 2542. 17.5 80 3 "mazda 626"
322 | 37.0 4 119.0 92.00 2434. 15.0 80 3 "datsun 510 hatchback"
323 | 32.2 4 108.0 75.00 2265. 15.2 80 3 "toyota corolla"
324 | 46.6 4 86.00 65.00 2110. 17.9 80 3 "mazda glc"
325 | 27.9 4 156.0 105.0 2800. 14.4 80 1 "dodge colt"
326 | 40.8 4 85.00 65.00 2110. 19.2 80 3 "datsun 210"
327 | 44.3 4 90.00 48.00 2085. 21.7 80 2 "vw rabbit c (diesel)"
328 | 43.4 4 90.00 48.00 2335. 23.7 80 2 "vw dasher (diesel)"
329 | 36.4 5 121.0 67.00 2950. 19.9 80 2 "audi 5000s (diesel)"
330 | 30.0 4 146.0 67.00 3250. 21.8 80 2 "mercedes-benz 240d"
331 | 44.6 4 91.00 67.00 1850. 13.8 80 3 "honda civic 1500 gl"
332 | 40.9 4 85.00 ? 1835. 17.3 80 2 "renault lecar deluxe"
333 | 33.8 4 97.00 67.00 2145. 18.0 80 3 "subaru dl"
334 | 29.8 4 89.00 62.00 1845. 15.3 80 2 "vokswagen rabbit"
335 | 32.7 6 168.0 132.0 2910. 11.4 80 3 "datsun 280-zx"
336 | 23.7 3 70.00 100.0 2420. 12.5 80 3 "mazda rx-7 gs"
337 | 35.0 4 122.0 88.00 2500. 15.1 80 2 "triumph tr7 coupe"
338 | 23.6 4 140.0 ? 2905. 14.3 80 1 "ford mustang cobra"
339 | 32.4 4 107.0 72.00 2290. 17.0 80 3 "honda accord"
340 | 27.2 4 135.0 84.00 2490. 15.7 81 1 "plymouth reliant"
341 | 26.6 4 151.0 84.00 2635. 16.4 81 1 "buick skylark"
342 | 25.8 4 156.0 92.00 2620. 14.4 81 1 "dodge aries wagon (sw)"
343 | 23.5 6 173.0 110.0 2725. 12.6 81 1 "chevrolet citation"
344 | 30.0 4 135.0 84.00 2385. 12.9 81 1 "plymouth reliant"
345 | 39.1 4 79.00 58.00 1755. 16.9 81 3 "toyota starlet"
346 | 39.0 4 86.00 64.00 1875. 16.4 81 1 "plymouth champ"
347 | 35.1 4 81.00 60.00 1760. 16.1 81 3 "honda civic 1300"
348 | 32.3 4 97.00 67.00 2065. 17.8 81 3 "subaru"
349 | 37.0 4 85.00 65.00 1975. 19.4 81 3 "datsun 210 mpg"
350 | 37.7 4 89.00 62.00 2050. 17.3 81 3 "toyota tercel"
351 | 34.1 4 91.00 68.00 1985. 16.0 81 3 "mazda glc 4"
352 | 34.7 4 105.0 63.00 2215. 14.9 81 1 "plymouth horizon 4"
353 | 34.4 4 98.00 65.00 2045. 16.2 81 1 "ford escort 4w"
354 | 29.9 4 98.00 65.00 2380. 20.7 81 1 "ford escort 2h"
355 | 33.0 4 105.0 74.00 2190. 14.2 81 2 "volkswagen jetta"
356 | 34.5 4 100.0 ? 2320. 15.8 81 2 "renault 18i"
357 | 33.7 4 107.0 75.00 2210. 14.4 81 3 "honda prelude"
358 | 32.4 4 108.0 75.00 2350. 16.8 81 3 "toyota corolla"
359 | 32.9 4 119.0 100.0 2615. 14.8 81 3 "datsun 200sx"
360 | 31.6 4 120.0 74.00 2635. 18.3 81 3 "mazda 626"
361 | 28.1 4 141.0 80.00 3230. 20.4 81 2 "peugeot 505s turbo diesel"
362 | 30.7 6 145.0 76.00 3160. 19.6 81 2 "volvo diesel"
363 | 25.4 6 168.0 116.0 2900. 12.6 81 3 "toyota cressida"
364 | 24.2 6 146.0 120.0 2930. 13.8 81 3 "datsun 810 maxima"
365 | 22.4 6 231.0 110.0 3415. 15.8 81 1 "buick century"
366 | 26.6 8 350.0 105.0 3725. 19.0 81 1 "oldsmobile cutlass ls"
367 | 20.2 6 200.0 88.00 3060. 17.1 81 1 "ford granada gl"
368 | 17.6 6 225.0 85.00 3465. 16.6 81 1 "chrysler lebaron salon"
369 | 28.0 4 112.0 88.00 2605. 19.6 82 1 "chevrolet cavalier"
370 | 27.0 4 112.0 88.00 2640. 18.6 82 1 "chevrolet cavalier wagon"
371 | 34.0 4 112.0 88.00 2395. 18.0 82 1 "chevrolet cavalier 2-door"
372 | 31.0 4 112.0 85.00 2575. 16.2 82 1 "pontiac j2000 se hatchback"
373 | 29.0 4 135.0 84.00 2525. 16.0 82 1 "dodge aries se"
374 | 27.0 4 151.0 90.00 2735. 18.0 82 1 "pontiac phoenix"
375 | 24.0 4 140.0 92.00 2865. 16.4 82 1 "ford fairmont futura"
376 | 36.0 4 105.0 74.00 1980. 15.3 82 2 "volkswagen rabbit l"
377 | 37.0 4 91.00 68.00 2025. 18.2 82 3 "mazda glc custom l"
378 | 31.0 4 91.00 68.00 1970. 17.6 82 3 "mazda glc custom"
379 | 38.0 4 105.0 63.00 2125. 14.7 82 1 "plymouth horizon miser"
380 | 36.0 4 98.00 70.00 2125. 17.3 82 1 "mercury lynx l"
381 | 36.0 4 120.0 88.00 2160. 14.5 82 3 "nissan stanza xe"
382 | 36.0 4 107.0 75.00 2205. 14.5 82 3 "honda accord"
383 | 34.0 4 108.0 70.00 2245 16.9 82 3 "toyota corolla"
384 | 38.0 4 91.00 67.00 1965. 15.0 82 3 "honda civic"
385 | 32.0 4 91.00 67.00 1965. 15.7 82 3 "honda civic (auto)"
386 | 38.0 4 91.00 67.00 1995. 16.2 82 3 "datsun 310 gx"
387 | 25.0 6 181.0 110.0 2945. 16.4 82 1 "buick century limited"
388 | 38.0 6 262.0 85.00 3015. 17.0 82 1 "oldsmobile cutlass ciera (diesel)"
389 | 26.0 4 156.0 92.00 2585. 14.5 82 1 "chrysler lebaron medallion"
390 | 22.0 6 232.0 112.0 2835 14.7 82 1 "ford granada l"
391 | 32.0 4 144.0 96.00 2665. 13.9 82 3 "toyota celica gt"
392 | 36.0 4 135.0 84.00 2370. 13.0 82 1 "dodge charger 2.2"
393 | 27.0 4 151.0 90.00 2950. 17.3 82 1 "chevrolet camaro"
394 | 27.0 4 140.0 86.00 2790. 15.6 82 1 "ford mustang gl"
395 | 44.0 4 97.00 52.00 2130. 24.6 82 2 "vw pickup"
396 | 32.0 4 135.0 84.00 2295. 11.6 82 1 "dodge rampage"
397 | 28.0 4 120.0 79.00 2625. 18.6 82 1 "ford ranger"
398 | 31.0 4 119.0 82.00 2720. 19.4 82 1 "chevy s-10"
399 |
--------------------------------------------------------------------------------