├── images
│   ├── nn.png
│   ├── nn2.png
│   ├── animals
│   │   ├── bug.jpg
│   │   ├── bird.jpg
│   │   ├── bird2.jpg
│   │   ├── bird3.jpg
│   │   ├── elba.jpg
│   │   ├── hamish.jpg
│   │   ├── poodle.jpg
│   │   ├── tortoise.jpg
│   │   ├── butterfly.jpg
│   │   └── butterfly2.jpg
│   └── isl_small.png
├── .gitignore
├── _bookdown.yml
├── ISLRv2-solutions.Rproj
├── index.Rmd
├── _output.yml
├── DESCRIPTION
├── .vscode
│   └── settings.json
├── islrv2.css
├── .github
│   └── workflows
│       └── github-actions.yml
├── LICENSE.md
├── README.md
├── 13-multiple-testing.Rmd
├── 02-statistical-learning.Rmd
├── 11-survival-analysis-and-censored-data.Rmd
├── 05-resampling-methods.Rmd
├── 12-unsupervised-learning.Rmd
├── 09-support-vector-mechines.Rmd
├── 08-tree-based-methods.Rmd
├── 10-deep-learning.Rmd
├── 07-moving-beyond-linearity.Rmd
└── data
    └── Auto.data
/images/nn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/nn.png
--------------------------------------------------------------------------------
/images/nn2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/nn2.png
--------------------------------------------------------------------------------
/images/animals/bug.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/animals/bug.jpg
--------------------------------------------------------------------------------
/images/isl_small.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/isl_small.png
--------------------------------------------------------------------------------
/images/animals/bird.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/animals/bird.jpg
--------------------------------------------------------------------------------
/images/animals/bird2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/animals/bird2.jpg
--------------------------------------------------------------------------------
/images/animals/bird3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/animals/bird3.jpg
--------------------------------------------------------------------------------
/images/animals/elba.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/animals/elba.jpg
--------------------------------------------------------------------------------
/images/animals/hamish.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/animals/hamish.jpg
--------------------------------------------------------------------------------
/images/animals/poodle.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/animals/poodle.jpg
--------------------------------------------------------------------------------
/images/animals/tortoise.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/animals/tortoise.jpg
--------------------------------------------------------------------------------
/images/animals/butterfly.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/animals/butterfly.jpg
--------------------------------------------------------------------------------
/images/animals/butterfly2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/animals/butterfly2.jpg
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | _book
6 | .DS_Store
7 | islrv2-solutions.rds
8 | _bookdown_files
9 | [0-9]*.md
10 | [0-9]*_files/
11 |
12 |
--------------------------------------------------------------------------------
/_bookdown.yml:
--------------------------------------------------------------------------------
1 | book_filename: "islrv2-solutions"
2 | language:
3 | ui:
4 | chapter_name: ""
5 | delete_merged_file: true
6 | new_session: yes
7 | repo: https://github.com/danhalligan/ISLRv2-solutions/
8 |
--------------------------------------------------------------------------------
/ISLRv2-solutions.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: knitr
13 | LaTeX: pdfLaTeX
14 |
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 |
18 | BuildType: Website
19 |
20 | SpellingDictionary: en_US
21 |
--------------------------------------------------------------------------------
/index.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: An Introduction to Statistical Learning
3 | subtitle: Exercise solutions in R
4 | site: bookdown::bookdown_site
5 | github-repo: danhalligan/ISLRv2-solutions
6 | url: 'https\://danhalligan.github.io/ISLRv2-solutions/'
7 | ---
8 |
9 | # Introduction
10 |
11 | This bookdown document provides solutions for exercises in the book
12 | ["An Introduction to Statistical Learning with Applications in R"](https://www.statlearning.com/),
13 | second edition, by Gareth James, Daniela Witten, Trevor Hastie and Robert Tibshirani.
14 |
15 | 
16 |
--------------------------------------------------------------------------------
/_output.yml:
--------------------------------------------------------------------------------
1 | bookdown::gitbook:
2 | css: [islrv2.css, https://cdn.jsdelivr.net/gh/aaaakshat/cm-web-fonts@latest/fonts.css]
3 | config:
4 | toc:
5 | collapse: section
6 | before: |
7 | ISLRv2 Solutions
8 | after: |
9 | Published with bookdown
10 | edit: https://github.com/danhalligan/ISLRv2-solutions/edit/main/%s
11 | sharing:
12 | facebook: no
13 | twitter: no
14 | github: yes
15 | all: []
16 | github-repo: danhalligan/ISLRv2-solutions
17 | url: https://danhalligan.github.io/ISLRv2-solutions/
18 |
19 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: placeholder
2 | Type: Book
3 | Title: Does not matter.
4 | Version: 0.0.1
5 | biocviews:
6 | Imports:
7 | ape,
8 | BART,
9 | bookdown,
10 | boot,
11 | class,
12 | corrplot,
13 | e1071,
14 | gam,
15 | gbm,
16 | ggfortify,
17 | ggplot2,
18 | ggtree,
19 | glmnet,
20 | ISLR2,
21 | keras,
22 | knitr,
23 | leaps,
24 | MASS,
25 | neuralnet,
26 | plotly,
27 | pls,
28 | randomForest,
29 | showtext,
30 | sigmoid,
31 | splines,
32 | survival,
33 | tree,
34 | tidyverse,
35 | xfun
36 | Remotes:
37 | rstudio/bookdown,
38 | github::YuLab-SMU/ggtree
39 |
40 |
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "cSpell.words": [
3 | "ANOVA",
4 | "Bonferroni",
5 | "boxplots",
6 | "covariate",
7 | "dendrogram",
8 | "dendrograms",
9 | "exponentiating",
10 | "FWER",
11 | "hypergeometric",
12 | "Kaplan",
13 | "logit",
14 | "LOOCV",
15 | "multinomial",
16 | "overfit",
17 | "overfitting",
18 | "quantile",
19 | "quantiles",
20 | "scalings",
21 | "scatterplot",
22 | "scatterplots",
23 | "softmax",
24 | "stochasticity",
25 | "timepoint"
26 | ],
27 | "cSpell.enabled": true,
28 | "cSpell.language": "en-US"
29 | }
--------------------------------------------------------------------------------
/islrv2.css:
--------------------------------------------------------------------------------
1 | #header .title {
2 | color: #2E71B6;
3 | font-family: "Computer Modern Serif", serif;
4 | font-weight: normal;
5 | font-size: 3em;
6 | letter-spacing: -0.8px;
7 | line-height: 1.2;
8 | }
9 |
10 | h1,
11 | h2,
12 | h3 {
13 | color: #2E71B6;
14 | font-family: "Computer Modern Serif", serif;
15 | font-weight: normal;
16 | }
17 |
18 | .book .book-body .page-wrapper .page-inner section.normal h1 {
19 | font-size: 2.5em;
20 | }
21 |
22 | .book .book-body .page-wrapper .page-inner section.normal h2 {
23 | font-size: 2em;
24 | font-style: italic;
25 | }
26 |
27 | h3>.header-section-number {
28 | display: none;
29 | }
30 |
31 | .book .book-body .page-wrapper .page-inner section.normal pre.sourceCode {
32 | background: #FDF5E7;
33 | }
34 |
35 | .book .book-body .page-wrapper .page-inner {
36 | max-width: 1000px !important;
37 | }
--------------------------------------------------------------------------------
/.github/workflows/github-actions.yml:
--------------------------------------------------------------------------------
1 | name: Build and Deploy
2 | on: [push, workflow_dispatch]
3 | permissions:
4 | contents: write
5 | jobs:
6 | build-and-deploy:
7 | concurrency: ci-${{ github.ref }}
8 | runs-on: ubuntu-latest
9 | steps:
10 | - uses: actions/checkout@v2
11 |
12 | - uses: r-lib/actions/setup-pandoc@v1
13 |
14 | - uses: r-lib/actions/setup-r@v2
15 | with:
16 | use-public-rspm: true
17 |
18 | - uses: r-lib/actions/setup-r-dependencies@v2
19 | with:
20 | cache: false
21 |
22 | - name: Setup keras
23 | run: Rscript -e "keras::install_keras()"
24 |
25 | - name: Build 🔧
26 | run: Rscript -e "bookdown::render_book('index.Rmd', 'bookdown::gitbook')"
27 |
28 | - name: Deploy 🚀
29 | uses: JamesIves/github-pages-deploy-action@v4.4.0
30 | with:
31 | folder: _book
32 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | Copyright (c) 2022 ISLRv2-solutions authors
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # An Introduction to Statistical Learning with Applications in R second edition solutions
2 |
3 | [](https://github.com/danhalligan/ISLRv2-solutions/actions/workflows/github-actions.yml)
4 | 
5 |
6 | 
7 |
8 | This repository provides my solutions for all exercises in the book
9 | ["An Introduction to Statistical Learning with Applications in R"](https://www.statlearning.com/),
10 | second edition,
11 | by Gareth James, Daniela Witten, Trevor Hastie and Robert Tibshirani.
12 |
13 | If you use these solutions or find them useful, please star this repository!
14 |
15 | ## About these solutions
16 |
17 | The solutions are written in [bookdown] format using my
18 | [ISLRv2 solutions template](https://github.com/danhalligan/ISLRv2-solutions-template).
19 |
20 | For anyone reading this book, I believe there is great value in deriving the
21 | solutions yourself, and the template above can be forked to provide a great
22 | starting point. I've created template Rmarkdown files for each chapter and
23 | transcribed all questions as quotes within the chapter files, leaving space
24 | for you to write your solutions either as text or as R code chunks.
25 | For more details, refer to the template repository.
26 |
27 | I've tried my best to provide solutions to each problem in this book, and I
28 | believe my answers should be (at least for the most part) correct. The solutions
29 | are relatively concise but hopefully comprehensive enough to address the
30 | purpose of each question.
31 |
32 | If, when reading these solutions, you notice any inconsistencies or mistakes,
33 | or have any other questions, please feel free to raise an issue or provide
34 | suggestions, and I'll happily take a look and try to incorporate them.
35 |
36 | ## Compiled solutions
37 |
38 | The HTML book is compiled from the Rmarkdown files using a [GitHub Actions]
39 | workflow and then automatically deployed to [GitHub pages].
40 |
41 | You can view these solutions hosted from GitHub at the following location:
42 |
43 | * <https://danhalligan.github.io/ISLRv2-solutions/>
44 |
45 | ## Building the solutions
46 |
47 | The R dependencies are provided in the `DESCRIPTION` file and you can install them by running:
48 |
49 | ```r
50 | devtools::install()
51 | ```
52 |
53 | Other than that, you need the Python packages [tensorflow and keras] installed in the Python environment that you use to run the examples:
54 |
55 | ```bash
56 | pip3 install tensorflow keras
57 | ```
58 |
59 | Rebuild the book with compiled solutions by running:
60 |
61 | ```r
62 | bookdown::render_book('index.Rmd', 'bookdown::gitbook')
63 | ```
64 |
65 | To render only one file/chapter (e.g. solutions for chapter 2), use:
66 |
67 | ```r
68 | rmarkdown::render(input="02-statistical-learning.Rmd", output_format=bookdown::pdf_document2())
69 | ```
70 |
71 | [bookdown]: https://github.com/rstudio/bookdown
72 | [GitHub Actions]: https://docs.github.com/en/actions
73 | [GitHub pages]: https://pages.github.com/
74 | [tensorflow and keras]: https://tensorflow.rstudio.com/install/
75 |
--------------------------------------------------------------------------------
/13-multiple-testing.Rmd:
--------------------------------------------------------------------------------
1 | # Multiple Testing
2 |
3 | ## Conceptual
4 |
5 | ### Question 1
6 |
7 | > Suppose we test $m$ null hypotheses, all of which are true. We control the
8 | > Type I error for each null hypothesis at level $\alpha$. For each sub-problem,
9 | > justify your answer.
10 | >
11 | > a. In total, how many Type I errors do we expect to make?
12 |
13 | We expect $m\alpha$.
14 |
15 | > b. Suppose that the m tests that we perform are independent. What is the
16 | > family-wise error rate associated with these m tests?
17 | >
18 | > _Hint: If two events A and B are independent, then Pr(A ∩ B) = Pr(A) Pr(B)._
19 |
20 | The family-wise error rate (FWER) is defined as the probability of making at
21 | least one Type I error. We can think of this as 1 minus the probability of
22 | no type I errors, which is:
23 |
24 | $1 - (1 - \alpha)^m$
25 |
26 | Alternatively, for two tests this is: Pr(A ∪ B) = Pr(A) + Pr(B) - Pr(A ∩ B).
27 | For independent tests this is $\alpha + \alpha - \alpha^2$
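
As a quick numerical check (a minimal sketch; the values of `alpha` and `m` below are illustrative, not taken from the question), we can evaluate this FWER for a few values of $m$:

```{r}
# FWER = 1 - (1 - alpha)^m for m independent tests, each at level alpha
alpha <- 0.05
m <- c(1, 2, 5, 10, 100)
setNames(1 - (1 - alpha)^m, paste0("m = ", m))
```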
28 |
29 | > c. Suppose that $m = 2$, and that the p-values for the two tests are
30 | > positively correlated, so that if one is small then the other will tend to
31 | > be small as well, and if one is large then the other will tend to be large.
32 | > How does the family-wise error rate associated with these $m = 2$ tests
33 | > qualitatively compare to the answer in (b) with $m = 2$?
34 | >
35 | > _Hint: First, suppose that the two p-values are perfectly correlated._
36 |
37 | If they were perfectly correlated, we would effectively be performing a single
38 | test (thus the FWER would be $\alpha$). Therefore, when the tests are positively
39 | correlated, we can expect the FWER to be less than in (b).
40 |
41 | Alternatively, as above, FWER = Pr(A ∪ B) = Pr(A) + Pr(B) - Pr(A ∩ B).
42 | For perfectly positively correlated tests Pr(A ∩ B) = $\alpha$, so the
43 | FWER is $\alpha$, which is smaller than in (b).
44 |
45 | > d. Suppose again that $m = 2$, but that now the p-values for the two tests are
46 | > negatively correlated, so that if one is large then the other will tend to
47 | > be small. How does the family-wise error rate associated with these $m = 2$
48 | > tests qualitatively compare to the answer in (b) with $m = 2$?
49 | >
50 | > _Hint: First, suppose that whenever one p-value is less than $\alpha$,_
51 | > _then the other will be greater than $\alpha$. In other words, we can_
52 | > _never reject both null hypotheses._
53 |
54 | Taking the equation above, for two tests,
55 | FWER = Pr(A ∪ B) = Pr(A) + Pr(B) - Pr(A ∩ B). In the case considered in the
56 | hint, Pr(A ∩ B) = 0, so Pr(A ∪ B) = $2\alpha$, which is larger than in (b).
57 |
58 | ### Question 2
59 |
60 | > Suppose that we test $m$ hypotheses, and control the Type I error for each
61 | > hypothesis at level $\alpha$. Assume that all $m$ p-values are independent,
62 | > and that all null hypotheses are true.
63 | >
64 | > a. Let the random variable $A_j$ equal 1 if the $j$th null hypothesis is
65 | > rejected, and 0 otherwise. What is the distribution of $A_j$?
66 |
67 | $A_j$ follows a Bernoulli distribution: $A_j \sim \text{Bernoulli}(\alpha)$.
68 |
69 | > b. What is the distribution of $\sum_{j=1}^m A_j$?
70 |
71 | It follows a binomial distribution: $\sum_{j=1}^m A_j \sim \text{Bin}(m, \alpha)$.
72 |
73 | > c. What is the standard deviation of the number of Type I errors that we will
74 | > make?
75 |
76 | The variance of a Binomial is $npq$, so for this situation the standard
77 | deviation would be $\sqrt{m \alpha (1-\alpha)}$.
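
For instance (illustrative numbers, not part of the question), with $m = 100$ and $\alpha = 0.05$:

```{r}
# Standard deviation of the number of Type I errors: sqrt(m * alpha * (1 - alpha))
m <- 100
alpha <- 0.05
sqrt(m * alpha * (1 - alpha))
```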
78 |
79 | ### Question 3
80 |
81 | > Suppose we test $m$ null hypotheses, and control the Type I error for the
82 | > $j$th null hypothesis at level $\alpha_j$, for $j=1,...,m$. Argue that the
83 | > family-wise error rate is no greater than $\sum_{j=1}^m \alpha_j$.
84 |
85 | $p(A \cup B) = p(A) + p(B)$ if $A$ and $B$ are mutually exclusive, and
86 | $p(A) + p(B) - p(A \cap B)$ otherwise. Since $p(A \cap B)$ must be
87 | non-negative, $p(A \cup B) \le p(A) + p(B)$ (whether independent or not).
88 |
89 | Therefore, the probability of a type I error in _any_ of $m$ hypotheses can
90 | be no larger than the sum of the probabilities for each individual hypothesis
91 | (which is $\alpha_j$ for the $j$th).
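
A small simulation sketch (my own illustration, with arbitrary $\alpha_j$ values) agrees with this bound:

```{r}
# Estimate the FWER when the j-th true null is tested at level alpha_j and
# compare it with the union bound sum(alpha_j)
set.seed(1)
alpha_j <- c(0.01, 0.02, 0.03, 0.04)
fwer <- mean(replicate(10000, any(runif(length(alpha_j)) < alpha_j)))
c(estimated_FWER = fwer, union_bound = sum(alpha_j))
```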
92 |
93 | ### Question 4
94 |
95 | > Suppose we test $m = 10$ hypotheses, and obtain the p-values shown in Table
96 | > 13.4.
97 |
98 | ```{r}
99 | pvals <- c(0.0011, 0.031, 0.017, 0.32, 0.11, 0.90, 0.07, 0.006, 0.004, 0.0009)
100 | names(pvals) <- paste0("H", sprintf("%02d", 1:10))
101 | ```
102 |
103 | > a. Suppose that we wish to control the Type I error for each null hypothesis
104 | > at level $\alpha = 0.05$. Which null hypotheses will we reject?
105 |
106 | ```{r}
107 | names(which(pvals < 0.05))
108 | ```
109 |
110 | We reject all null hypotheses where $p < 0.05$.
111 |
112 | > b. Now suppose that we wish to control the FWER at level $\alpha = 0.05$.
113 | > Which null hypotheses will we reject? Justify your answer.
114 |
115 | ```{r}
116 | names(which(pvals < 0.05 / 10))
117 | ```
118 |
119 | We reject all null hypotheses where $p < 0.005$.
120 |
121 | > c. Now suppose that we wish to control the FDR at level $q = 0.05$. Which null
122 | > hypotheses will we reject? Justify your answer.
123 |
124 | ```{r}
125 | names(which(p.adjust(pvals, "fdr") < 0.05))
126 | ```
127 |
128 | We reject all null hypotheses where $q < 0.05$.
129 |
130 | > d. Now suppose that we wish to control the FDR at level $q = 0.2$. Which null
131 | > hypotheses will we reject? Justify your answer.
132 |
133 | ```{r}
134 | names(which(p.adjust(pvals, "fdr") < 0.2))
135 | ```
136 |
137 | We reject all null hypotheses where $q < 0.2$.
138 |
139 | > e. Of the null hypotheses rejected at FDR level $q = 0.2$, approximately how
140 | > many are false positives? Justify your answer.
141 |
142 | We expect around 20% of the rejections to be false positives (in this case,
143 | roughly 2 of the 8 rejected hypotheses), as the quick calculation below shows.
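
As a quick check (using the `pvals` defined above):

```{r}
# Approximate expected number of false positives: q x (number of rejections)
0.2 * sum(p.adjust(pvals, "fdr") < 0.2)
```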
144 |
145 | ### Question 5
146 |
147 | > For this problem, you will make up p-values that lead to a certain number of
148 | > rejections using the Bonferroni and Holm procedures.
149 | >
150 | > a. Give an example of five p-values (i.e. five numbers between 0 and 1 which,
151 | > for the purpose of this problem, we will interpret as p-values) for which
152 | > both Bonferroni’s method and Holm’s method reject exactly one null hypothesis
153 | > when controlling the FWER at level 0.1.
154 |
155 | In this case, for Bonferroni, we need one p-value to be less than $0.1 / 5 = 0.02$ and the others to be above.
156 | For Holm's method, we also need the most significant p-value to be below $0.1/(5 + 1 - 1) = 0.02$,
157 | and the second smallest to be at least $0.1/(5 + 1 - 2) = 0.025$ so that no further hypotheses are rejected.
158 |
159 | An example would be: 1, 1, 1, 1, 0.001.
160 |
161 | ```{r}
162 | pvals <- c(1, 1, 1, 1, 0.001)
163 | sum(p.adjust(pvals, method = "bonferroni") < 0.1)
164 | sum(p.adjust(pvals, method = "holm") < 0.1)
165 | ```
166 |
167 | > b. Now give an example of five p-values for which Bonferroni rejects one
168 | > null hypothesis and Holm rejects more than one null hypothesis at level 0.1.
169 |
170 | An example would be: 1, 1, 1, 0.02, 0.001. For Holm's method we reject two
171 | because $0.02 < 0.1/(5 + 1 - 2) = 0.025$, whereas Bonferroni requires $p < 0.02$.
172 |
173 | ```{r}
174 | pvals <- c(1, 1, 1, 0.02, 0.001)
175 | sum(p.adjust(pvals, method = "bonferroni") < 0.1)
176 | sum(p.adjust(pvals, method = "holm") < 0.1)
177 | ```
178 |
179 | ### Question 6
180 |
181 | > For each of the three panels in Figure 13.3, answer the following questions:
182 |
183 | * There are always: 8 positives (red) and 2 negatives (black).
184 | * False / true positives are black / red points _below_ the line respectively.
185 | * False / true negatives are red / black points _above_ the line respectively.
186 | * Type I / II errors are the same as false positives and false negatives
187 | respectively.
188 |
189 | > a. How many false positives, false negatives, true positives, true negatives,
190 | > Type I errors, and Type II errors result from applying the Bonferroni
191 | > procedure to control the FWER at level $\alpha = 0.05$?
192 |
193 | | Panel | FP | FN | TP | TN | Type I | Type II |
194 | |------ |--- |--- |--- |--- |------- |-------- |
195 | | 1 | 0 | 1 | 7 | 2 | 0 | 1 |
196 | | 2 | 0 | 1 | 7 | 2 | 0 | 1 |
197 | | 3 | 0 | 5 | 3 | 2 | 0 | 5 |
198 |
199 | > b. How many false positives, false negatives, true positives, true negatives,
200 | > Type I errors, and Type II errors result from applying the Holm procedure to
201 | > control the FWER at level $\alpha = 0.05$?
202 |
203 | | Panel | FP | FN | TP | TN | Type I | Type II |
204 | |------ |--- |--- |--- |--- |------- |-------- |
205 | | 1 | 0 | 1 | 7 | 2 | 0 | 1 |
206 | | 2 | 0 | 0 | 8 | 2 | 0 | 0 |
207 | | 3 | 0 | 0 | 8 | 2 | 0 | 0 |
208 |
209 | > c. What is the false discovery rate associated with using the Bonferroni
210 | > procedure to control the FWER at level $\alpha = 0.05$?
211 |
212 | The false discovery rate is the expected ratio of false positives (FP) to the
213 | total number of positives (FP + TP).
214 |
215 | For panels 1, 2, 3 this would be 0/7, 0/7 and 0/3 respectively.
216 |
217 | > d. What is the false discovery rate associated with using the Holm procedure
218 | > to control the FWER at level $\alpha = 0.05$?
219 |
220 | For panels 1, 2, 3 this would be 0/7, 0/8 and 0/8 respectively.
221 |
222 | > e. How would the answers to (a) and (c) change if we instead used the
223 | > Bonferroni procedure to control the FWER at level $\alpha = 0.001$?
224 |
225 | This would equate to a more stringent threshold. We would not call any more
226 | false positives, so the results would not change.
227 |
228 | ## Applied
229 |
230 | ### Question 7
231 |
232 | > This problem makes use of the `Carseats` dataset in the `ISLR2` package.
233 | >
234 | > a. For each quantitative variable in the dataset besides `Sales`, fit a linear
235 | > model to predict `Sales` using that quantitative variable. Report the p-values
236 | > associated with the coefficients for the variables. That is, for each model of
237 | > the form $Y = \beta_0 + \beta_1X + \epsilon$, report the p-value associated
238 | > with the coefficient $\beta_1$. Here, $Y$ represents `Sales` and $X$
239 | > represents one of the other quantitative variables.
240 |
241 | ```{r}
242 | library(ISLR2)
243 |
244 | nm <- c("CompPrice", "Income", "Advertising", "Population", "Price", "Age")
245 | pvals <- sapply(nm, function(n) {
246 | summary(lm(Carseats[["Sales"]] ~ Carseats[[n]]))$coef[2, 4]
247 | })
248 | ```
249 |
250 | > b. Suppose we control the Type I error at level $\alpha = 0.05$ for the
251 | > p-values obtained in (a). Which null hypotheses do we reject?
252 |
253 | ```{r}
254 | names(which(pvals < 0.05))
255 | ```
256 |
257 | > c. Now suppose we control the FWER at level 0.05 for the p-values. Which null
258 | > hypotheses do we reject?
259 |
260 | ```{r}
261 | names(which(pvals < 0.05 / length(nm)))
262 | ```
263 |
264 | > d. Finally, suppose we control the FDR at level 0.2 for the p-values. Which
265 | > null hypotheses do we reject?
266 |
267 | ```{r}
268 | names(which(p.adjust(pvals, "fdr") < 0.2))
269 | ```
270 |
271 | ### Question 8
272 |
273 | > In this problem, we will simulate data from $m = 100$ fund managers.
274 | >
275 | > ```r
276 | > set.seed(1)
277 | > n <- 20
278 | > m <- 100
279 | > X <- matrix(rnorm(n * m), ncol = m)
280 | > ```
281 |
282 | ```{r}
283 | set.seed(1)
284 | n <- 20
285 | m <- 100
286 | X <- matrix(rnorm(n * m), ncol = m)
287 | ```
288 |
289 | > These data represent each fund manager’s percentage returns for each of $n =
290 | > 20$ months. We wish to test the null hypothesis that each fund manager’s
291 | > percentage returns have population mean equal to zero. Notice that we
292 | > simulated the data in such a way that each fund manager’s percentage returns
293 | > do have population mean zero; in other words, all $m$ null hypotheses are true.
294 | >
295 | > a. Conduct a one-sample $t$-test for each fund manager, and plot a histogram
296 | > of the $p$-values obtained.
297 |
298 | ```{r}
299 | pvals <- apply(X, 2, function(p) t.test(p)$p.value)
300 | hist(pvals, main = NULL)
301 | ```
302 |
303 | > b. If we control Type I error for each null hypothesis at level $\alpha =
304 | > 0.05$, then how many null hypotheses do we reject?
305 |
306 | ```{r}
307 | sum(pvals < 0.05)
308 | ```
309 | > c. If we control the FWER at level 0.05, then how many null hypotheses do we
310 | > reject?
311 |
312 | ```{r}
313 | sum(pvals < 0.05 / length(pvals))
314 | ```
315 |
316 | > d. If we control the FDR at level 0.05, then how many null hypotheses do we
317 | > reject?
318 |
319 | ```{r}
320 | sum(p.adjust(pvals, "fdr") < 0.05)
321 | ```
322 |
323 | > e. Now suppose we “cherry-pick” the 10 fund managers who perform the best in
324 | > our data. If we control the FWER for just these 10 fund managers at level
325 | > 0.05, then how many null hypotheses do we reject? If we control the FDR for
326 | > just these 10 fund managers at level 0.05, then how many null hypotheses do we
327 | > reject?
328 |
329 | ```{r}
330 | best <- order(apply(X, 2, sum), decreasing = TRUE)[1:10]
331 | sum(pvals[best] < 0.05 / 10)
332 | sum(p.adjust(pvals[best], "fdr") < 0.05)
333 | ```
334 |
335 | > f. Explain why the analysis in (e) is misleading.
336 | >
337 | > _Hint The standard approaches for controlling the FWER and FDR assume that all
338 | > tested null hypotheses are adjusted for multiplicity, and that no
339 | > “cherry-picking” of the smallest p-values has occurred. What goes wrong if we
340 | > cherry-pick?_
341 |
342 | This is misleading because we are not correctly accounting for all of the tests
343 | performed. Cherry-picking is similar to repeating a test until, by chance, we
344 | find a significant result; see the simulation sketch below.
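
The following simulation sketch (my own illustration, not part of the original analysis) repeats the cherry-picking procedure from (e) on data where every null hypothesis is true. Correcting only for the 10 selected tests yields far more "discoveries" than the nominal 5% FWER would suggest:

```{r}
# Repeatedly simulate 100 null fund managers, cherry-pick the 10 best, and
# check whether any appear "significant" after correcting for only 10 tests.
set.seed(42)
hits <- replicate(1000, {
  X <- matrix(rnorm(20 * 100), ncol = 100)
  pv <- apply(X, 2, function(x) t.test(x)$p.value)
  best <- order(colSums(X), decreasing = TRUE)[1:10]
  any(pv[best] < 0.05 / 10)
})
mean(hits) # typically far above the nominal 0.05
```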
345 |
--------------------------------------------------------------------------------
/02-statistical-learning.Rmd:
--------------------------------------------------------------------------------
1 | # Statistical Learning
2 |
3 | ## Conceptual
4 |
5 | ### Question 1
6 |
7 | > For each of parts (a) through (d), indicate whether we would generally expect
8 | > the performance of a flexible statistical learning method to be better or
9 | > worse than an inflexible method. Justify your answer.
10 | >
11 | > a. The sample size $n$ is extremely large, and the number of predictors $p$ is
12 | > small.
13 |
14 | Flexible best - opposite of b.
15 |
16 | > b. The number of predictors $p$ is extremely large, and the number of
17 | > observations $n$ is small.
18 |
19 | Inflexible best - high chance of some predictors being randomly associated.
20 |
21 | > c. The relationship between the predictors and response is highly
22 | > non-linear.
23 |
24 | Flexible best - inflexible leads to high bias.
25 |
26 | > d. The variance of the error terms, i.e. $\sigma^2 = Var(\epsilon)$, is
27 | > extremely high.
28 |
29 | Inflexible best - opposite of c.
30 |
31 | ### Question 2
32 |
33 | > Explain whether each scenario is a classification or regression problem, and
34 | > indicate whether we are most interested in inference or prediction. Finally,
35 | > provide $n$ and $p$.
36 | >
37 | > a. We collect a set of data on the top 500 firms in the US. For each firm
38 | > we record profit, number of employees, industry and the CEO salary. We are
39 | > interested in understanding which factors affect CEO salary.
40 |
41 | $n=500$, $p=3$, regression, inference.
42 |
43 | > b. We are considering launching a new product and wish to know whether
44 | > it will be a success or a failure. We collect data on 20 similar products
45 | > that were previously launched. For each product we have recorded whether it
46 | > was a success or failure, price charged for the product, marketing budget,
47 | > competition price, and ten other variables.
48 |
49 | $n=20$, $p=13$, classification, prediction.
50 |
51 | > c. We are interested in predicting the % change in the USD/Euro exchange
52 | > rate in relation to the weekly changes in the world stock markets. Hence we
53 | > collect weekly data for all of 2012. For each week we record the % change
54 | > in the USD/Euro, the % change in the US market, the % change in the British
55 | > market, and the % change in the German market.
56 |
57 | $n=52$, $p=3$, regression, prediction.
58 |
59 | ### Question 3
60 |
61 | > We now revisit the bias-variance decomposition.
62 | >
63 | > a. Provide a sketch of typical (squared) bias, variance, training error,
64 | > test error, and Bayes (or irreducible) error curves, on a single plot, as
65 | > we go from less flexible statistical learning methods towards more flexible
66 | > approaches. The x-axis should represent the amount of flexibility in the
67 | > method, and the y-axis should represent the values for each curve. There
68 | > should be five curves. Make sure to label each one.
69 | >
70 | > b. Explain why each of the five curves has the shape displayed in
71 | > part (a).
72 |
73 | * (squared) bias: Decreases with increasing flexibility (Generally, more
74 | flexible methods result in less bias).
75 | * variance: Increases with increasing flexibility (In general, more flexible
76 | statistical methods have higher variance).
77 | * training error: Decreases with model flexibility (More complex models will
78 | better fit the training data).
79 | * test error: Decreases initially, then increases due to overfitting (the fall
80 | in bias is eventually outweighed by the rise in variance); see the sketch below.
81 | * Bayes (irreducible) error: fixed (does not change with model).
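
The following simulation sketch (my own illustration, not from the book: polynomial fits of increasing degree to simulated data with a non-linear truth) reproduces the typical shapes of the training and test error curves:

```{r}
# Training vs test MSE as flexibility (polynomial degree) increases
set.seed(1)
n <- 100
f <- function(x) sin(2 * x)                      # true non-linear relationship
train <- data.frame(x = runif(n, -2, 2))
train$y <- f(train$x) + rnorm(n, sd = 0.5)
test <- data.frame(x = runif(n, -2, 2))
test$y <- f(test$x) + rnorm(n, sd = 0.5)
degrees <- 1:10
errs <- sapply(degrees, function(d) {
  fit <- lm(y ~ poly(x, d), data = train)
  c(
    train = mean((train$y - predict(fit, train))^2),
    test  = mean((test$y - predict(fit, test))^2)
  )
})
matplot(degrees, t(errs), type = "l", lty = 1, col = c("blue", "red"),
        xlab = "Flexibility (polynomial degree)", ylab = "MSE")
legend("topright", c("train", "test"), lty = 1, col = c("blue", "red"))
```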
82 |
83 | ### Question 4
84 |
85 | > You will now think of some real-life applications for statistical learning.
86 | >
87 | > a. Describe three real-life applications in which classification might
88 | > be useful. Describe the response, as well as the predictors. Is the goal of
89 | > each application inference or prediction? Explain your answer.
90 |
91 | * Coffee machine cleaned? (day of week, person assigned), inference.
92 | * Is a flight delayed? (airline, airport etc), inference.
93 | * Beer type (IPA, pilsner etc.), prediction.
94 |
95 | > b. Describe three real-life applications in which regression might be
96 | > useful. Describe the response, as well as the predictors. Is the goal of
97 | > each application inference or prediction? Explain your answer.
98 |
99 | * Amount of bonus paid (profitability, client feedback), prediction.
100 | * Person's height, prediction.
101 | * House price, inference.
102 |
103 | > c. Describe three real-life applications in which cluster analysis might be
104 | > useful.
105 |
106 | * RNAseq tumour gene expression data.
107 | * SNPs in human populations.
108 | * Frequencies of mutations (with base pair context) in somatic mutation data.
109 |
110 | ### Question 5
111 |
112 | > What are the advantages and disadvantages of a very flexible (versus a less
113 | > flexible) approach for regression or classification? Under what circumstances
114 | > might a more flexible approach be preferred to a less flexible approach? When
115 | > might a less flexible approach be preferred?
116 |
117 | An inflexible approach is more interpretable and requires fewer observations, but can be biased.
118 | A flexible approach can overfit (high variance). Where $n$ is large or the relationship is non-linear,
119 | a flexible approach will be preferred; with limited data, or when inference and interpretability matter, a less flexible approach is preferred.
120 |
121 | ### Question 6
122 |
123 | > Describe the differences between a parametric and a non-parametric statistical
124 | > learning approach. What are the advantages of a parametric approach to
125 | > regression or classification (as opposed to a non-parametric approach)? What
126 | > are its disadvantages?
127 |
128 | A parametric approach assumes a functional form and estimates a fixed set of
129 | model parameters. Parametric models can be more interpretable, as there is an
130 | explicit model for how the data are generated. The disadvantage is that the
131 | assumed model might not reflect reality: if it is too far from the truth,
132 | estimates will be poor, while more flexible parametric models require more
133 | parameters and can overfit. Non-parametric approaches do not assume a functional
134 | form, so a large number of observations may be needed to obtain accurate estimates.
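
As a brief illustration (a sketch on simulated data, not part of the original answer), we can contrast a parametric fit (a linear model) with a non-parametric fit (loess) when the true relationship is non-linear:

```{r}
# Parametric (lm: assumes a linear form) vs non-parametric (loess: no fixed form)
set.seed(1)
x <- runif(100, 0, 10)
y <- sin(x) + rnorm(100, sd = 0.3)
plot(x, y, col = "grey", pch = 20)
ord <- order(x)
lines(x[ord], predict(lm(y ~ x))[ord], col = "red")      # biased: wrong form
lines(x[ord], predict(loess(y ~ x))[ord], col = "blue")  # tracks the pattern
legend("topright", c("lm", "loess"), lty = 1, col = c("red", "blue"))
```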
135 |
136 | ### Question 7
137 |
138 | > The table below provides a training data set containing six observations,
139 | > three predictors, and one qualitative response variable.
140 | >
141 | > | Obs. | $X_1$ | $X_2$ | $X_3$ | $Y$ |
142 | > |------|-------|-------|-------|-------|
143 | > | 1 | 0 | 3 | 0 | Red |
144 | > | 2 | 2 | 0 | 0 | Red |
145 | > | 3 | 0 | 1 | 3 | Red |
146 | > | 4 | 0 | 1 | 2 | Green |
147 | > | 5 | -1 | 0 | 1 | Green |
148 | > | 6 | 1 | 1 | 1 | Red |
149 | >
150 | > Suppose we wish to use this data set to make a prediction for $Y$ when
151 | > $X_1 = X_2 = X_3 = 0$ using $K$-nearest neighbors.
152 | >
153 | > a. Compute the Euclidean distance between each observation and the test
154 | > point, $X_1 = X_2 = X_3 = 0$.
155 |
156 | ```{r}
157 | dat <- data.frame(
158 | "x1" = c(0, 2, 0, 0, -1, 1),
159 | "x2" = c(3, 0, 1, 1, 0, 1),
160 | "x3" = c(0, 0, 3, 2, 1, 1),
161 | "y" = c("Red", "Red", "Red", "Green", "Green", "Red")
162 | )
163 |
164 | # Euclidean distance between points and c(0, 0, 0)
165 | dist <- sqrt(dat[["x1"]]^2 + dat[["x2"]]^2 + dat[["x3"]]^2)
166 | signif(dist, 3)
167 | ```
168 |
169 | > b. What is our prediction with $K = 1$? Why?
170 |
171 | ```{r}
172 | knn <- function(k) {
173 | names(which.max(table(dat[["y"]][order(dist)[1:k]])))
174 | }
175 | knn(1)
176 | ```
177 |
178 | Green (based on data point 5 only)
179 |
180 | > c. What is our prediction with $K = 3$? Why?
181 |
182 | ```{r}
183 | knn(3)
184 | ```
185 |
186 | Red (based on data points 2, 5, 6)
187 |
188 | > d. If the Bayes decision boundary in this problem is highly non-linear, then
189 | > would we expect the best value for $K$ to be large or small? Why?
190 |
191 | Small. A large $K$ averages over many neighbours, which smooths the decision boundary towards linearity, so a small $K$ is needed to capture a highly non-linear Bayes boundary; see the sketch below.
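
To illustrate (a sketch on simulated two-class data; the `class` package is already listed in the DESCRIPTION), the KNN decision boundary becomes smoother as $K$ increases:

```{r}
# KNN decision regions for a circular class boundary with K = 1 and K = 50
library(class)
set.seed(1)
train_x <- matrix(rnorm(400), ncol = 2)
train_y <- factor(ifelse(rowSums(train_x^2) > 1.5, "red", "green"))
grid <- expand.grid(x1 = seq(-3, 3, 0.1), x2 = seq(-3, 3, 0.1))
par(mfrow = c(1, 2))
for (k in c(1, 50)) {
  pred <- knn(train_x, grid, train_y, k = k)
  plot(grid, col = ifelse(pred == "red", "mistyrose", "palegreen"),
    pch = 15, cex = 0.4, main = paste("K =", k))
  points(train_x, col = as.character(train_y), pch = 20)
}
```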
192 |
193 | ## Applied
194 |
195 | ### Question 8
196 |
197 | > This exercise relates to the `College` data set, which can be found in
198 | > the file `College.csv`. It contains a number of variables for 777 different
199 | > universities and colleges in the US. The variables are
200 | >
201 | > * `Private` : Public/private indicator
202 | > * `Apps` : Number of applications received
203 | > * `Accept` : Number of applicants accepted
204 | > * `Enroll` : Number of new students enrolled
205 | > * `Top10perc` : New students from top 10% of high school class
206 | > * `Top25perc` : New students from top 25% of high school class
207 | > * `F.Undergrad` : Number of full-time undergraduates
208 | > * `P.Undergrad` : Number of part-time undergraduates
209 | > * `Outstate` : Out-of-state tuition
210 | > * `Room.Board` : Room and board costs
211 | > * `Books` : Estimated book costs
212 | > * `Personal` : Estimated personal spending
213 | > * `PhD` : Percent of faculty with Ph.D.'s
214 | > * `Terminal` : Percent of faculty with terminal degree
215 | > * `S.F.Ratio` : Student/faculty ratio
216 | > * `perc.alumni` : Percent of alumni who donate
217 | > * `Expend` : Instructional expenditure per student
218 | > * `Grad.Rate` : Graduation rate
219 | >
220 | > Before reading the data into `R`, it can be viewed in Excel or a text
221 | > editor.
222 | >
223 | > a. Use the `read.csv()` function to read the data into `R`. Call the loaded
224 | > data `college`. Make sure that you have the directory set to the correct
225 | > location for the data.
226 |
227 | ```{r}
228 | college <- read.csv("data/College.csv")
229 | ```
230 |
231 | > b. Look at the data using the `View()` function. You should notice that the
232 | > first column is just the name of each university. We don't really want `R`
233 | > to treat this as data. However, it may be handy to have these names for
234 | > later. Try the following commands:
235 | >
236 | > ```r
237 | > rownames(college) <- college[, 1]
238 | > View(college)
239 | > ```
240 | >
241 | > You should see that there is now a `row.names` column with the name of
242 | > each university recorded. This means that R has given each row a name
243 | > corresponding to the appropriate university. `R` will not try to perform
244 | > calculations on the row names. However, we still need to eliminate the
245 | > first column in the data where the names are stored. Try
246 | >
247 | > ```r
248 | > college <- college [, -1]
249 | > View(college)
250 | > ```
251 | >
252 | > Now you should see that the first data column is `Private`. Note that
253 | > another column labeled `row.names` now appears before the `Private` column.
254 | > However, this is not a data column but rather the name that R is giving to
255 | > each row.
256 |
257 | ```{r}
258 | rownames(college) <- college[, 1]
259 | college <- college[, -1]
260 | ```
261 |
262 | > c.
263 | > i. Use the `summary()` function to produce a numerical summary of the
264 | > variables in the data set.
265 | > ii. Use the `pairs()` function to produce a scatterplot matrix of the
266 | > first ten columns or variables of the data. Recall that you can
267 | > reference the first ten columns of a matrix A using `A[,1:10]`.
268 | > iii. Use the `plot()` function to produce side-by-side boxplots of
269 | > `Outstate` versus `Private`.
270 | > iv. Create a new qualitative variable, called `Elite`, by _binning_ the
271 | > `Top10perc` variable. We are going to divide universities into two
272 | > groups based on whether or not the proportion of students coming from
273 | > the top 10% of their high school classes exceeds 50%.
274 | >
275 | > ```r
276 | > > Elite <- rep("No", nrow(college))
277 | > > Elite[college$Top10perc > 50] <- "Yes"
278 | > > Elite <- as.factor(Elite)
279 | > > college <- data.frame(college, Elite)
280 | > ```
281 | >
282 | > Use the `summary()` function to see how many elite universities there
283 | > are. Now use the `plot()` function to produce side-by-side boxplots of
284 | > `Outstate` versus `Elite`.
285 | > v. Use the `hist()` function to produce some histograms with differing
286 | > numbers of bins for a few of the quantitative variables. You may find
287 | > the command `par(mfrow=c(2,2))` useful: it will divide the print
288 | > window into four regions so that four plots can be made
289 | > simultaneously. Modifying the arguments to this function will divide
290 | > the screen in other ways.
291 | > vi. Continue exploring the data, and provide a brief summary of what you
292 | > discover.
293 |
294 | ```{r}
295 | summary(college)
296 |
297 | college$Private <- college$Private == "Yes"
298 | pairs(college[, 1:10], cex = 0.2)
299 | plot(college$Outstate ~ factor(college$Private), xlab = "Private", ylab = "Outstate")
300 |
301 | college$Elite <- factor(ifelse(college$Top10perc > 50, "Yes", "No"))
302 | summary(college$Elite)
303 | plot(college$Outstate ~ college$Elite, xlab = "Elite", ylab = "Outstate")
304 |
305 | par(mfrow = c(2, 2))
306 | for (n in c(5, 10, 20, 50)) {
307 | hist(college$Enroll, breaks = n, main = paste("n =", n), xlab = "Enroll")
308 | }
309 |
310 | chisq.test(college$Private, college$Elite)
311 | ```
312 |
313 | Whether a college is private and whether it is elite are not independent (the chi-squared test above rejects independence).
314 |
315 | ### Question 9
316 |
317 | > This exercise involves the Auto data set studied in the lab. Make sure
318 | > that the missing values have been removed from the data.
319 |
320 | ```{r}
321 | x <- read.table("data/Auto.data", header = TRUE, na.strings = "?")
322 | x <- na.omit(x)
323 | ```
324 |
325 | > a. Which of the predictors are quantitative, and which are qualitative?
326 |
327 | ```{r}
328 | sapply(x, class)
329 | numeric <- which(sapply(x, class) == "numeric")
330 | names(numeric)
331 | ```
332 |
333 | > b. What is the range of each quantitative predictor? You can answer this using
334 | > the `range()` function.
335 |
336 | ```{r}
337 | sapply(x[, numeric], function(x) diff(range(x)))
338 | ```
339 |
340 | > c. What is the mean and standard deviation of each quantitative predictor?
341 |
342 | ```{r}
343 | library(tidyverse)
344 | library(knitr)
345 |
346 | x[, numeric] |>
347 | pivot_longer(everything()) |>
348 | group_by(name) |>
349 | summarise(
350 | Mean = mean(value),
351 | SD = sd(value)
352 | ) |>
353 | kable()
354 | ```
355 |
356 | > d. Now remove the 10th through 85th observations. What is the range, mean, and
357 | > standard deviation of each predictor in the subset of the data that
358 | > remains?
359 |
360 | ```{r}
361 | x[-(10:85), numeric] |>
362 | pivot_longer(everything()) |>
363 | group_by(name) |>
364 | summarise(
365 | Range = diff(range(value)),
366 | Mean = mean(value),
367 | SD = sd(value)
368 | ) |>
369 | kable()
370 | ```
371 |
372 | > e. Using the full data set, investigate the predictors graphically, using
373 | > scatterplots or other tools of your choice. Create some plots highlighting
374 | > the relationships among the predictors. Comment on your findings.
375 |
376 | ```{r}
377 | pairs(x[, numeric], cex = 0.2)
378 | cor(x[, numeric]) |>
379 | kable()
380 |
381 | heatmap(cor(x[, numeric]), cexRow = 1.1, cexCol = 1.1, margins = c(8, 8))
382 | ```
383 |
384 | Many of the variables appear to be highly (positively or negatively) correlated
385 | with some relationships being non-linear.
386 |
387 | > f. Suppose that we wish to predict gas mileage (`mpg`) on the basis of the
388 | > other variables. Do your plots suggest that any of the other variables
389 | > might be useful in predicting `mpg`? Justify your answer.
390 |
391 | Yes: several of the other variables are strongly correlated with `mpg`. However,
392 | horsepower, weight and displacement are highly related to each other, so they carry overlapping information.
393 |
394 | ### Question 10
395 |
396 | > This exercise involves the `Boston` housing data set.
397 | >
398 | > a. To begin, load in the `Boston` data set. The `Boston` data set is part of
399 | > the `ISLR2` library in R.
400 | > ```r
401 | > > library(ISLR2)
402 | > ```
403 | > Now the data set is contained in the object `Boston`.
404 | > ```r
405 | > > Boston
406 | > ```
407 | > Read about the data set:
408 | > ```r
409 | > > ?Boston
410 | > ```
411 | > How many rows are in this data set? How many columns? What do the rows and
412 | > columns represent?
413 |
414 | ```{r}
415 | library(ISLR2)
416 | dim(Boston)
417 | ```
418 |
419 | > b. Make some pairwise scatterplots of the predictors (columns) in this data
420 | > set. Describe your findings.
421 |
422 | ```{r, message = FALSE, warning = FALSE}
423 | library(ggplot2)
424 | library(tidyverse)
425 | ```
426 |
427 | ```{r}
428 | ggplot(Boston, aes(nox, rm)) +
429 | geom_point()
430 | ggplot(Boston, aes(ptratio, rm)) +
431 | geom_point()
432 | heatmap(cor(Boston, method = "spearman"), cexRow = 1.1, cexCol = 1.1)
433 | ```
434 |
435 | > c. Are any of the predictors associated with per capita crime rate? If so,
436 | > explain the relationship.
437 |
438 | Yes. Most of the other variables show some association with `crim`: for example, crime rates tend to be higher in tracts with high `rad` and `tax`, and lower where `medv` and `dis` are high.
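
One quick way to see this (a small check computed from the data rather than quoted from the book) is to rank the correlations with `crim`:

```{r}
# Correlation of every variable with per-capita crime rate
sort(cor(Boston)[, "crim"], decreasing = TRUE)
```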
439 |
440 | > d. Do any of the census tracts of Boston appear to have particularly high
441 | > crime rates? Tax rates? Pupil-teacher ratios? Comment on the range of each
442 | > predictor.
443 |
444 | ```{r}
445 | Boston |>
446 | pivot_longer(cols = 1:13) |>
447 | filter(name %in% c("crim", "tax", "ptratio")) |>
448 | ggplot(aes(value)) +
449 | geom_histogram(bins = 20) +
450 | facet_wrap(~name, scales = "free", ncol = 1)
451 | ```
452 |
453 | Yes, particularly crime and tax rates.
454 |
455 | > e. How many of the census tracts in this data set bound the Charles river?
456 |
457 | ```{r}
458 | sum(Boston$chas)
459 | ```
460 |
461 | > f. What is the median pupil-teacher ratio among the towns in this data set?
462 |
463 | ```{r}
464 | median(Boston$ptratio)
465 | ```
466 |
467 | > g. Which census tract of Boston has lowest median value of owner-occupied
468 | > homes? What are the values of the other predictors for that census tract,
469 | > and how do those values compare to the overall ranges for those predictors?
470 | > Comment on your findings.
471 |
472 | ```{r}
473 | Boston[Boston$medv == min(Boston$medv), ] |>
474 | kable()
475 |
476 | sapply(Boston, quantile) |>
477 | kable()
478 | ```
479 |
480 | > h. In this data set, how many of the census tract average more than seven
481 | > rooms per dwelling? More than eight rooms per dwelling? Comment on the
482 | > census tracts that average more than eight rooms per dwelling.
483 |
484 | ```{r}
485 | sum(Boston$rm > 7)
486 | sum(Boston$rm > 8)
487 | ```
488 |
489 | Let's compare median statistics for those census tracts with more than eight
490 | rooms per dwelling on average, with the statistics for those with fewer.
491 |
492 | ```{r}
493 | Boston |>
494 | mutate(
495 | `log(crim)` = log(crim),
496 | `log(zn)` = log(zn)
497 | ) |>
498 | select(-c(crim, zn)) |>
499 | pivot_longer(!rm) |>
500 | mutate(">8 rooms" = rm > 8) |>
501 | ggplot(aes(`>8 rooms`, value)) +
502 | geom_boxplot() +
503 | facet_wrap(~name, scales = "free")
504 | ```
505 |
506 | Census tracts with big average properties (more than eight rooms per dwelling)
507 | have higher median value (`medv`), a lower proportion of non-retail
508 | business acres (`indus`), a lower pupil-teacher ratio (`ptratio`), a lower
509 | status of the population (`lstat`) among other differences.
510 |
--------------------------------------------------------------------------------
/11-survival-analysis-and-censored-data.Rmd:
--------------------------------------------------------------------------------
1 | # Survival Analysis and Censored Data
2 |
3 | ## Conceptual
4 |
5 | ### Question 1
6 |
7 | > For each example, state whether or not the censoring mechanism is independent.
8 | > Justify your answer.
9 | >
10 | > a. In a study of disease relapse, due to a careless research scientist, all
11 | > patients whose phone numbers begin with the number "2" are lost to follow up.
12 |
13 | Independent. There's no reason to think disease relapse should be related to
14 | the first digit of a phone number.
15 |
16 | > b. In a study of longevity, a formatting error causes all patient ages that
17 | > exceed 99 years to be lost (i.e. we know that those patients are more than 99
18 | > years old, but we do not know their exact ages).
19 |
20 | Not independent. Older patients are more likely to see an event than younger patients.
21 |
22 | > c. Hospital A conducts a study of longevity. However, very sick patients tend
23 | > to be transferred to Hospital B, and are lost to follow up.
24 |
25 | Not independent. Sick patients are more likely to see an event than healthy patients.
26 |
27 | > d. In a study of unemployment duration, the people who find work earlier are
28 | > less motivated to stay in touch with study investigators, and therefore are
29 | > more likely to be lost to follow up.
30 |
31 | Not independent. More employable individuals are more likely to see an event.
32 |
33 | > e. In a study of pregnancy duration, women who deliver their babies pre-term
34 | > are more likely to do so away from their usual hospital, and thus are more
35 | > likely to be censored, relative to women who deliver full-term babies.
36 |
37 | Not independent. Delivery away from hospital will be associated with pregnancy
38 | duration.
39 |
40 | > f. A researcher wishes to model the number of years of education of the
41 | > residents of a small town. Residents who enroll in college out of town are
42 | > more likely to be lost to follow up, and are also more likely to attend
43 | > graduate school, relative to those who attend college in town.
44 |
45 | Not independent. Years of education will be associated with enrolling in out
46 | of town colleges.
47 |
48 | > g. Researchers conduct a study of disease-free survival (i.e. time until
49 | > disease relapse following treatment). Patients who have not relapsed within
50 | > five years are considered to be cured, and thus their survival time is
51 | > censored at five years.
52 |
53 | In other words, we assume all events happen within five years, so
54 | censoring at five years is equivalent to not censoring at all, and
55 | the censoring is therefore independent.
56 |
57 | > h. We wish to model the failure time for some electrical component. This
58 | > component can be manufactured in Iowa or in Pittsburgh, with no difference in
59 | > quality. The Iowa factory opened five years ago, and so components
60 | > manufactured in Iowa are censored at five years. The Pittsburgh factory opened
61 | > two years ago, so those components are censored at two years.
62 |
63 | If there is no difference in quality then location and therefore censoring is
64 | independent of failure time.
65 |
66 | > i. We wish to model the failure time of an electrical component made in two
67 | > different factories, one of which opened before the other. We have reason to
68 | > believe that the components manufactured in the factory that opened earlier
69 | > are of higher quality.
70 |
71 | In this case, the difference in opening times of the two locations will mean
72 | that any difference in quality between locations will be associated with
73 | censoring, so censoring is not independent.
74 |
75 | ### Question 2
76 |
77 | > We conduct a study with $n = 4$ participants who have just purchased cell
78 | > phones, in order to model the time until phone replacement. The first
79 | > participant replaces her phone after 1.2 years. The second participant still
80 | > has not replaced her phone at the end of the two-year study period. The third
81 | > participant changes her phone number and is lost to follow up (but has not yet
82 | > replaced her phone) 1.5 years into the study. The fourth participant replaces
83 | > her phone after 0.2 years.
84 | >
85 | > For each of the four participants ($i = 1,..., 4$), answer the following
86 | > questions using the notation introduced in Section 11.1:
87 | >
88 | > a. Is the participant's cell phone replacement time censored?
89 |
90 | No, Yes, Yes and No. Censoring occurs when we do not know if or when the phone
91 | was replaced.
92 |
93 | > b. Is the value of $c_i$ known, and if so, then what is it?
94 |
95 | $c_i$ is the censoring time. For the four participants these are: NA, 2, 1.5 and NA.
96 |
97 | > c. Is the value of $t_i$ known, and if so, then what is it?
98 |
99 | $t_i$ is time to event. For the four participants these are: 1.2, NA, NA and
100 | 0.2.
101 |
102 | > d. Is the value of $y_i$ known, and if so, then what is it?
103 |
104 | $y_i$ is the observed time. For the four participants these are: 1.2, 2, 1.5 and
105 | 0.2.
106 |
107 | > e. Is the value of $\delta_i$ known, and if so, then what is it?
108 |
109 | $\delta_i$ is an indicator for censoring. The nomenclature introduced here
110 | defines this to be 1 if we observe the true "survival" time and 0 if we observe
111 | the censored time. Therefore, for these participants, the values are: 1, 0, 0
112 | and 1.
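
As a small illustration (using the `survival` package, which is also loaded in the next question), the observed times and censoring indicators can be combined into a `Surv` object, where `+` marks a censored observation:

```{r}
library(survival)
# y_i with delta_i: a "+" indicates a censored observation
Surv(time = c(1.2, 2, 1.5, 0.2), event = c(1, 0, 0, 1))
```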
113 |
114 | ### Question 3
115 |
116 | > For the example in Exercise 2, report the values of $K$, $d_1,...,d_K$,
117 | > $r_1,...,r_K$, and $q_1,...,q_K$, where this notation was defined in Section
118 | > 11.3.
119 |
120 | * $K$ is the number of unique deaths, which is 2.
121 | * $d_k$ represents the unique death times, which are: 0.2, 1.2.
122 | * $r_k$ denotes the number of patients alive and in the study just before $d_k$.
123 | Note the first event is for patient 4, then patient 1, then patient 3 is
124 | censored and finally the study ends with patient 2 still enrolled. Therefore
125 | $r_k$ takes the values 4 and 3.
126 | * $q_k$ denotes the number of patients who died at time $d_k$, therefore this
127 | takes values: 1, 1.
128 |
129 | We can check by using the `survival` package.
130 |
131 | ```{r}
132 | library(survival)
133 | x <- Surv(c(1.2, 2, 1.5, 0.2), event = c(1, 0, 0, 1))
134 | summary(survfit(x ~ 1))
135 | ```
136 |
137 | ### Question 4
138 |
139 | > This problem makes use of the Kaplan-Meier survival curve displayed in Figure
140 | > 11.9. The raw data that went into plotting this survival curve is given in
141 | > Table 11.4. The covariate column of that table is not needed for this problem.
142 | >
143 | > a. What is the estimated probability of survival past 50 days?
144 |
145 | There are 2 events that happen before 50 days. The numbers at
146 | risk $r_k$ are 5 and 4 (one observation was censored early on), so the survival
147 | probability is $\frac{4}{5} \times \frac{3}{4} = 0.6$.
148 |
149 | Equivalently, we can use the survival package.
150 |
151 | ```{r, message = FALSE, warning = FALSE}
152 | library(tidyverse)
153 | ```
154 |
155 | ```{r}
156 | table_data <- tribble(
157 | ~Y, ~D, ~X,
158 | 26.5, 1, 0.1,
159 | 37.2, 1, 11,
160 | 57.3, 1, -0.3,
161 | 90.8, 0, 2.8,
162 | 20.2, 0, 1.8,
163 | 89.8, 0, 0.4
164 | )
165 | x <- Surv(table_data$Y, table_data$D)
166 | summary(survfit(x ~ 1))
167 | ```
168 |
169 | > b. Write out an analytical expression for the estimated survival function. For
170 | > instance, your answer might be something along the lines of
171 | >
172 | > $$
173 | > \hat{S}(t) = \begin{cases}
174 | > 0.8 & \text{if } t < 31\\
175 | > 0.5 & \text{if } 31 \le t < 77\\
176 | > 0.22 & \text{if } 77 \le t
177 | > \end{cases}
178 | > $$
179 | >
180 | > (The previous equation is for illustration only: it is not the correct
181 | > answer!)
182 |
183 | $$
184 | \hat{S}(t) = \begin{cases}
185 | 1 & \text{if } t < 26.5 \\
186 | 0.8 & \text{if } 26.5 \le t < 37.2 \\
187 | 0.6 & \text{if } 37.2 \le t < 57.3 \\
188 | 0.4 & \text{if } 57.3 \le t
189 | \end{cases}
190 | $$
191 |
192 |
193 | ### Question 5
194 |
195 | > Sketch the survival function given by the equation
196 | >
197 | > $$
198 | > \hat{S}(t) = \begin{cases}
199 | > 0.8, & \text{if } t < 31\\
200 | > 0.5, & \text{if } 31 \le t < 77\\
201 | > 0.22 & \text{if } 77 \le t
202 | > \end{cases}
203 | > $$
204 | >
205 | > Your answer should look something like Figure 11.9.
206 |
207 | We can draw this plot, or even engineer data that will generate the required
208 | plot...
209 |
210 | ```{r}
211 | plot(NULL,
212 | xlim = c(0, 100),
213 | ylim = c(0, 1),
214 | ylab = "Estimated Probability of Survival",
215 | xlab = "Time in Days"
216 | )
217 | lines(
218 | c(0, 31, 31, 77, 77, 100),
219 | c(0.8, 0.8, 0.5, 0.5, 0.22, 0.22)
220 | )
221 | ```
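    | 
    | Alternatively, as a minimal sketch, base R's `stepfun()` can draw the same step
    | function directly (the values below are simply those given in the equation
    | above):
    | 
    | ```{r}
    | s <- stepfun(c(31, 77), c(0.8, 0.5, 0.22))
    | plot(s, do.points = FALSE, xlim = c(0, 100), ylim = c(0, 1),
    |   xlab = "Time in Days", ylab = "Estimated Probability of Survival", main = "")
    | ```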
222 |
223 | ### Question 6
224 |
225 | > This problem makes use of the data displayed in Figure 11.1. In completing
226 | > this problem, you can refer to the observation times as $y_1,...,y_4$. The
227 | > ordering of these observation times can be seen from Figure 11.1; their exact
228 | > values are not required.
229 | >
230 | > a. Report the values of $\delta_1,...,\delta_4$, $K$, $d_1,...,d_K$,
231 | > $r_1,...,r_K$, and $q_1,...,q_K$. The relevant notation is defined in Sections
232 | > 11.1 and 11.3.
233 |
234 | * $\delta$ values are: 1, 0, 1, 0.
235 | * $K$ is 2.
236 | * $d$ values are $y_3$ and $y_1$.
237 | * $r$ values are 4 and 2.
238 | * $q$ values are 1 and 1.
239 |
240 | > b. Sketch the Kaplan-Meier survival curve corresponding to this data set. (You
241 | > do not need to use any software to do this---you can sketch it by hand using
242 | > the results obtained in (a).)
243 |
244 | ```{r}
245 | plot(NULL,
246 | xlim = c(0, 350),
247 | ylim = c(0, 1),
248 | ylab = "Estimated Probability of Survival",
249 | xlab = "Time in Days"
250 | )
251 | lines(
252 | c(0, 150, 150, 300, 300, 350),
253 | c(1, 1, 0.75, 0.75, 0.375, 0.375)
254 | )
255 | ```
256 |
257 | We can check these results with the `survival` package, using approximate
    | observation times read from Figure 11.1:
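    | 
    | ```{r}
    | # observation times are approximate readings from Figure 11.1
    | x <- Surv(c(300, 350, 150, 250), c(1, 0, 1, 0))
    | summary(survfit(x ~ 1))
    | ```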
258 |
259 | > c. Based on the survival curve estimated in (b), what is the probability that
260 | > the event occurs within 200 days? What is the probability that the event does
261 | > not occur within 310 days?
262 |
263 | The probability that the event occurs within 200 days is
    | $1 - S(200) = 1 - 0.75 = 0.25$, and the probability that the event does _not_
    | occur within 310 days is $S(310) = 0.375$.
264 |
265 | > d. Write out an expression for the estimated survival curve from (b).
266 |
267 | $$
268 | \hat{S}(t) = \begin{cases}
269 | 1 & \text{if } t < y_3 \\
270 | 0.75 & \text{if } y_3 \le t < y_1 \\
271 | 0.375 & \text{if } y_1 \le t
272 | \end{cases}
273 | $$
274 |
275 | ### Question 7
276 |
277 | > In this problem, we will derive (11.5) and (11.6), which are needed for the
278 | > construction of the log-rank test statistic (11.8). Recall the notation in
279 | > Table 11.1.
280 | >
281 | > a. Assume that there is no difference between the survival functions of the
282 | > two groups. Then we can think of $q_{1k}$ as the number of failures if we draw
283 | > $r_{1k}$ observations, without replacement, from a risk set of $r_k$
284 | > observations that contains a total of $q_k$ failures. Argue that $q_{1k}$
285 | > follows a hypergeometric distribution. Write the parameters of this
286 | > distribution in terms of $r_{1k}$, $r_k$, and $q_k$.
287 |
288 | A hypergeometric distribution models sampling without replacement from a
289 | finite pool where each draw is either a success or a failure. This fits the
290 | situation here, where we have a finite number of observations in the risk set.
291 | 
292 | The hypergeometric distribution is parameterized as $k$ successes in $n$ draws, without replacement, from a population of size $N$ that contains $K$ successes.
293 | 
294 | Mapping to our situation, $q_{1k}$ corresponds to $k$, $r_{1k}$ to $n$, $r_k$ to $N$, and $q_k$ to $K$.
295 |
296 | > b. Given your previous answer, and the properties of the hypergeometric
297 | > distribution, what are the mean and variance of $q_{1k}$? Compare your answer
298 | > to (11.5) and (11.6).
299 |
300 | With the above parameterization, the mean, $nK/N$, is $r_{1k} q_k/r_k$.
301 | The variance, $\frac{nK}{N} \frac{N-K}{N} \frac{N-n}{N-1}$, is
302 |
303 | $$
304 | r_{1k} \frac{q_k}{r_k} \frac{r_k-q_k}{r_k} \frac{r_k - r_{1k}}{r_k - 1}
305 | $$
306 |
307 | These are equivalent to 11.5 and 11.6.
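    | 
    | As a quick sanity check, we can compare these formulas against the moments of
    | the hypergeometric distribution in R for some made-up values of $r_k$,
    | $r_{1k}$ and $q_k$:
    | 
    | ```{r}
    | # hypothetical values: r_k = 10 at risk, q_k = 3 failures, r_1k = 4 in group 1
    | r_k <- 10; q_k <- 3; r_1k <- 4
    | q1 <- 0:min(q_k, r_1k)
    | p <- dhyper(q1, m = q_k, n = r_k - q_k, k = r_1k)
    | c(sum(q1 * p), r_1k * q_k / r_k)  # distribution mean vs. (11.5)
    | c(sum((q1 - sum(q1 * p))^2 * p),  # distribution variance vs. (11.6)
    |   r_1k * (q_k / r_k) * ((r_k - q_k) / r_k) * ((r_k - r_1k) / (r_k - 1)))
    | ```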
308 |
309 | ### Question 8
310 |
311 | > Recall that the survival function $S(t)$, the hazard function $h(t)$, and the
312 | > density function $f(t)$ are defined in (11.2), (11.9), and (11.11),
313 | > respectively. Furthermore, define $F(t) = 1 - S(t)$. Show that the following
314 | > relationships hold:
315 | >
316 | > $$
317 | > f(t) = dF(t)/dt \\
318 | > S(t) = \exp\left(-\int_0^t h(u)du\right)
319 | > $$
320 |
321 | If $F(t) = 1 - S(t)$, then $F(t)$ is the *cumulative distribution function*
322 | (cdf) for $t$.
323 | 
324 | For a continuous distribution, the cdf can be expressed as an integral of the
325 | *probability density function* (pdf) up to the value $t$, i.e.
326 | $F(t) = \int_{-\infty}^{t} f(x) \, dx$. Equivalently, the derivative of the cdf
327 | is the pdf: $f(t) = \frac{d F(t)}{dt}$, which is the first identity.
328 |
329 | Then, $h(t) = \frac{f(t)}{S(t)} = \frac{dF(t)/dt}{S(t)} = \frac{-dS(t)/dt}{S(t)}$,
330 | which, by the chain rule, can be rewritten as $h(t) = -\frac{d}{dt}\log{S(t)}$.
331 | Integrating both sides from $0$ to $t$ gives $\int_0^t h(u) du = -\log S(t)$
332 | (since $S(0) = 1$), and exponentiating yields $S(t) = \exp\left(-\int_0^t h(u) du\right)$.
333 |
334 | ### Question 9
335 |
336 | > In this exercise, we will explore the consequences of assuming that the
337 | > survival times follow an exponential distribution.
338 | >
339 | > a. Suppose that a survival time follows an $Exp(\lambda)$ distribution, so
340 | > that its density function is $f(t) = \lambda\exp(-\lambda t)$. Using the
341 | > relationships provided in Exercise 8, show that $S(t) = \exp(-\lambda t)$.
342 |
343 | Using the relationships from Exercise 8, the cdf is
    | $F(t) = \int_0^t \lambda\exp(-\lambda x) dx = 1 - \exp(-\lambda t)$, and
344 | $S(t) = 1 - F(t)$.
345 | 
346 | Hence, $S(t) = \exp(-\lambda t)$.
347 |
348 | > b. Now suppose that each of $n$ independent survival times follows an
349 | > $\exp(\lambda)$ distribution. Write out an expression for the likelihood
350 | > function (11.13).
351 |
352 | The reference to (11.13) gives us the following formula:
353 |
354 | $$
355 | L = \prod_{i=1}^{n} h(y_i)^{\delta_i} S(y_i)
356 | $$
357 |
358 | (11.10) also gives us
359 |
360 | $$
361 | h(t) = \frac{f(t)}{S(t)}
362 | $$
363 |
364 | Plugging in the expressions from part (a), we get
365 |
366 | \begin{align*}
367 | h(t) &= \frac{\lambda \exp(- \lambda t)}{\exp(- \lambda t)} \\
368 | &= \lambda
369 | \end{align*}
370 |
371 | Substituting into (11.13), we get the following likelihood expression:
372 |
373 | $$
374 | \ell = \prod_i \lambda^{\delta_i} e^{- \lambda y_i}
375 | $$
376 |
377 | > c. Show that the maximum likelihood estimator for $\lambda$ is
378 | > $$
379 | > \hat\lambda = \sum_{i=1}^n \delta_i / \sum_{i=1}^n y_i.
380 | > $$
381 |
382 | Take the log likelihood.
383 |
384 | \begin{align*}
385 | \log \ell &= \sum_i \log \left( \lambda^{\delta_i} e^{- \lambda y_i} \right) \\
386 | &= \sum_i{\delta_i\log\lambda - \lambda y_i \log e} \\
387 | &= \sum_i{\delta_i\log\lambda - \lambda y_i} \\
388 | &= \log\lambda\sum_i{\delta_i} - \lambda\sum_i{y_i}
389 | \end{align*}
390 |
391 | Differentiating this expression with respect to $\lambda$ we get:
392 |
393 | $$
394 | \frac{d \log \ell}{d \lambda} = \frac{\sum_i{\delta_i}}{\lambda} - \sum_i{y_i}
395 | $$
396 |
397 | This function is maximised when its gradient is 0. Setting the derivative to
    | zero and solving for $\lambda$ gives the MLE
398 | $\hat\lambda = \sum_{i=1}^n \delta_i / \sum_{i=1}^n y_i$.
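    | 
    | As a sanity check, we can simulate exponential survival times with independent
    | censoring (a hypothetical example with a true rate of $\lambda = 0.5$; the
    | censoring rate is arbitrary) and confirm that this estimator recovers
    | $\lambda$:
    | 
    | ```{r}
    | set.seed(42)
    | n <- 2000
    | true_time <- rexp(n, rate = 0.5)
    | cens_time <- rexp(n, rate = 0.2)
    | y <- pmin(true_time, cens_time)              # observed time
    | delta <- as.numeric(true_time <= cens_time)  # 1 if the death was observed
    | sum(delta) / sum(y)                          # MLE, should be close to 0.5
    | ```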
399 |
400 | > d. Use your answer to (c) to derive an estimator of the mean survival time.
401 | >
402 | > _Hint: For (d), recall that the mean of an $Exp(\lambda)$ random variable is
403 | > $1/\lambda$._
404 |
405 | The estimated mean survival time is $1/\hat\lambda$, which, given the above, is
406 | $\sum_{i=1}^n y_i / \sum_{i=1}^n \delta_i$. This can be thought of as
407 | the total observation time divided by the total number of deaths.
408 |
409 | ## Applied
410 |
411 | ### Question 10
412 |
413 | > This exercise focuses on the brain tumor data, which is included in the
414 | > `ISLR2` `R` library.
415 | >
416 | > a. Plot the Kaplan-Meier survival curve with ±1 standard error bands, using
417 | > the `survfit()` function in the `survival` package.
418 |
419 | ```{r}
420 | library(ISLR2)
421 | x <- Surv(BrainCancer$time, BrainCancer$status)
422 | plot(survfit(x ~ 1),
423 | xlab = "Months",
424 | ylab = "Estimated Probability of Survival",
425 | col = "steelblue",
426 |   conf.int = 0.67 # a ~67% interval corresponds roughly to ±1 standard error
427 | )
428 | ```
429 |
430 | > b. Draw a bootstrap sample of size $n = 88$ from the pairs ($y_i$,
431 | > $\delta_i$), and compute the resulting Kaplan-Meier survival curve. Repeat
432 | > this process $B = 200$ times. Use the results to obtain an estimate of the
433 | > standard error of the Kaplan-Meier survival curve at each timepoint. Compare
434 | > this to the standard errors obtained in (a).
435 |
436 | ```{r}
437 | plot(survfit(x ~ 1),
438 | xlab = "Months",
439 | ylab = "Estimated Probability of Survival",
440 | col = "steelblue",
441 | conf.int = 0.67
442 | )
443 | fit <- survfit(x ~ 1)
444 | dat <- tibble(time = c(0, fit$time))
445 | for (i in 1:200) {
446 | y <- survfit(sample(x, 88, replace = TRUE) ~ 1)
447 | y <- tibble(time = c(0, y$time), "s{i}" := c(1, y$surv))
448 | dat <- left_join(dat, y, by = "time")
449 | }
450 | res <- fill(dat, starts_with("s")) |>
451 | rowwise() |>
452 | transmute(sd = sd(c_across(starts_with("s"))))
453 | se <- res$sd[2:nrow(res)]
454 | lines(fit$time, fit$surv - se, lty = 2, col = "red")
455 | lines(fit$time, fit$surv + se, lty = 2, col = "red")
456 | ```
457 |
458 | > c. Fit a Cox proportional hazards model that uses all of the predictors to
459 | > predict survival. Summarize the main findings.
460 |
461 | ```{r}
462 | fit <- coxph(Surv(time, status) ~ sex + diagnosis + loc + ki + gtv + stereo, data = BrainCancer)
463 | fit
464 | ```
465 |
466 | `diagnosisHG` and `ki` are highly significant.
467 |
468 | > d. Stratify the data by the value of `ki`. (Since only one observation has
469 | > `ki=40`, you can group that observation together with the observations that
470 | > have `ki=60`.) Plot Kaplan-Meier survival curves for each of the five strata,
471 | > adjusted for the other predictors.
472 |
473 | To adjust for the other predictors, we fit a model that includes them and use
474 | this model to predict new, artificial, data in which `ki` takes each possible
475 | value while the remaining predictors are fixed at their mode (for categorical
476 | variables) or mean (for continuous variables).
477 |
478 | ```{r}
479 | library(ggfortify)
480 |
481 | modaldata <- data.frame(
482 | sex = rep("Female", 5),
483 | diagnosis = rep("Meningioma", 5),
484 | loc = rep("Supratentorial", 5),
485 | ki = c(60, 70, 80, 90, 100),
486 | gtv = rep(mean(BrainCancer$gtv), 5),
487 | stereo = rep("SRT", 5)
488 | )
489 | survplots <- survfit(fit, newdata = modaldata)
490 | plot(survplots, xlab = "Months", ylab = "Survival Probability", col = 2:6)
491 | legend("bottomleft", c("60", "70", "80", "90", "100"), col = 2:6, lty = 1)
492 | ```
493 |
494 | ### Question 11
495 |
496 | > This example makes use of the data in Table 11.4.
497 | >
498 | > a. Create two groups of observations. In Group 1, $X < 2$, whereas in Group 2,
499 | > $X \ge 2$. Plot the Kaplan-Meier survival curves corresponding to the two
500 | > groups. Be sure to label the curves so that it is clear which curve
501 | > corresponds to which group. By eye, does there appear to be a difference
502 | > between the two groups' survival curves?
503 |
504 | ```{r}
505 | x <- split(Surv(table_data$Y, table_data$D), table_data$X < 2)
506 | plot(NULL, xlim = c(0, 100), ylim = c(0, 1), ylab = "Survival Probability")
507 | lines(survfit(x[[1]] ~ 1), conf.int = FALSE, col = 2)
508 | lines(survfit(x[[2]] ~ 1), conf.int = FALSE, col = 3)
509 | legend("bottomleft", c(">= 2", "<2"), col = 2:3, lty = 1)
510 | ```
511 |
512 | There does not appear to be any difference between the curves.
513 |
514 | > b. Fit Cox's proportional hazards model, using the group indicator as a
515 | > covariate. What is the estimated coefficient? Write a sentence providing the
516 | > interpretation of this coefficient, in terms of the hazard or the
517 | > instantaneous probability of the event. Is there evidence that the true
518 | > coefficient value is non-zero?
519 |
520 | ```{r}
521 | fit <- coxph(Surv(Y, D) ~ X < 2, data = table_data)
522 | fit
523 | ```
524 |
525 | The coefficient is $0.3401$. This implies a slightly increased hazard when
526 | $X < 2$, but it is not significantly different from zero ($p = 0.8$).
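    | 
    | Exponentiating the coefficient gives the implied hazard ratio for observations
    | with $X < 2$ relative to those with $X \ge 2$ (roughly 1.4):
    | 
    | ```{r}
    | exp(coef(fit))
    | ```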
527 |
528 | > c. Recall from Section 11.5.2 that in the case of a single binary covariate,
529 | > the log-rank test statistic should be identical to the score statistic for the
530 | > Cox model. Conduct a log-rank test to determine whether there is a difference
531 | > between the survival curves for the two groups. How does the p-value for the
532 | > log-rank test statistic compare to the $p$-value for the score statistic for
533 | > the Cox model from (b)?
534 |
535 | ```{r}
536 | summary(fit)$sctest
537 | survdiff(Surv(Y, D) ~ X < 2, data = table_data)$chisq
538 | ```
539 |
540 | They are identical.
541 |
--------------------------------------------------------------------------------
/05-resampling-methods.Rmd:
--------------------------------------------------------------------------------
1 | # Resampling Methods
2 |
3 | ## Conceptual
4 |
5 | ### Question 1
6 |
7 | > Using basic statistical properties of the variance, as well as single-
8 | > variable calculus, derive (5.6). In other words, prove that $\alpha$ given by
9 | > (5.6) does indeed minimize $Var(\alpha X + (1 - \alpha)Y)$.
10 |
11 | Equation 5.6 is:
12 |
13 | $$
14 | \alpha = \frac{\sigma^2_Y - \sigma_{XY}}{\sigma^2_X + \sigma^2_Y - 2\sigma_{XY}}
15 | $$
16 |
17 | Remember that:
18 |
19 | $$
20 | \mathrm{Var}(aX) = a^2\mathrm{Var}(X), \\
21 | \mathrm{Var}(X + Y) = \mathrm{Var}(X) + \mathrm{Var}(Y) + 2\mathrm{Cov}(X,Y), \\
22 | \mathrm{Cov}(aX, bY) = ab\mathrm{Cov}(X, Y)
23 | $$
24 |
25 | If we define $\sigma^2_X = \mathrm{Var}(X)$, $\sigma^2_Y = \mathrm{Var}(Y)$ and
26 | $\sigma_{XY} = \mathrm{Cov}(X, Y)$
27 |
28 | \begin{align}
29 | Var(\alpha X + (1 - \alpha)Y)
30 | &= \alpha^2\sigma^2_X + (1-\alpha)^2\sigma^2_Y + 2\alpha(1 - \alpha)\sigma_{XY} \\
31 | &= \alpha^2\sigma^2_X + \sigma^2_Y - 2\alpha\sigma^2_Y + \alpha^2\sigma^2_Y +
32 | 2\alpha\sigma_{XY} - 2\alpha^2\sigma_{XY}
33 | \end{align}
34 |
35 | Now we want to find when the rate of change of this function is 0 with respect
36 | to $\alpha$, so we compute the partial derivative, set to 0 and solve.
37 |
38 | $$
39 | \frac{\partial}{\partial{\alpha}} =
40 | 2\alpha\sigma^2_X - 2\sigma^2_Y + 2\alpha\sigma^2_Y + 2\sigma_{XY} - 4\alpha\sigma_{XY} = 0
41 | $$
42 |
43 | Moving $\alpha$ terms to the same side:
44 |
45 | $$
46 | \alpha\sigma^2_X + \alpha\sigma^2_Y - 2\alpha\sigma_{XY} = \sigma^2_Y - \sigma_{XY}
47 | $$
48 |
49 | $$
50 | \alpha = \frac{\sigma^2_Y - \sigma_{XY}}{\sigma^2_X + \sigma^2_Y - 2\sigma_{XY}}
51 | $$
52 |
53 | We should also show that this is a minimum, i.e. that the second partial
54 | derivative with respect to $\alpha$ is $\ge 0$.
55 |
56 | \begin{align}
57 | \frac{\partial^2}{\partial{\alpha^2}}
58 | &= 2\sigma^2_X + 2\sigma^2_Y - 4\sigma_{XY} \\
59 | &= 2(\sigma^2_X + \sigma^2_Y - 2\sigma_{XY}) \\
60 | &= 2\mathrm{Var}(X - Y)
61 | \end{align}
62 |
63 | Since the variance of $X - Y$ is non-negative, this is $\ge 0$, so the
   | stationary point is indeed a minimum.
64 |
65 | ### Question 2
66 |
67 | > We will now derive the probability that a given observation is part of a
68 | > bootstrap sample. Suppose that we obtain a bootstrap sample from a set of n
69 | > observations.
70 | >
71 | > a. What is the probability that the first bootstrap observation is _not_ the
72 | > $j$th observation from the original sample? Justify your answer.
73 |
74 | This is one minus the probability that it _is_ the $j$th observation, i.e.
   | $1 - 1/n$, since each of the $n$ original observations is equally likely to
   | be drawn.
75 |
76 | > b. What is the probability that the second bootstrap observation is _not_ the
77 | > $j$th observation from the original sample?
78 |
79 | Since each bootstrap observation is a random sample, this probability is the
80 | same ($1 - 1/n$).
81 |
82 | > c. Argue that the probability that the $j$th observation is _not_ in the
83 | > bootstrap sample is $(1 - 1/n)^n$.
84 |
85 | For the $j$th observation to not be in the sample, it would have to _not_ be
86 | picked in each of the $n$ draws. Since the draws are independent, the
87 | probability is $(1 - 1/n)^n$.
88 |
89 | > d. When $n = 5$, what is the probability that the $j$th observation is in the
90 | > bootstrap sample?
91 |
92 | ```{r}
93 | n <- 5
94 | 1 - (1 - 1 / n)^n
95 | ```
96 |
97 | $p = 0.67$
98 |
99 | > e. When $n = 100$, what is the probability that the $j$th observation is in
100 | > the bootstrap sample?
101 |
102 | ```{r}
103 | n <- 100
104 | 1 - (1 - 1 / n)^n
105 | ```
106 |
107 | $p = 0.63$
108 |
109 | > f. When $n = 10,000$, what is the probability that the $j$th observation is
110 | > in the bootstrap sample?
111 |
112 | ```{r}
113 | n <- 10000
114 | 1 - (1 - 1 / n)^n
115 | ```
116 |
117 | $p = 0.63$
118 |
119 | > g. Create a plot that displays, for each integer value of $n$ from 1 to
120 | > 100,000, the probability that the $j$th observation is in the bootstrap
121 | > sample. Comment on what you observe.
122 |
123 | ```{r}
124 | x <- sapply(1:100000, function(n) 1 - (1 - 1 / n)^n)
125 | plot(x, log = "x", type = "o")
126 | ```
127 |
128 | The probability rapidly approaches 0.63 with increasing $n$.
129 |
130 | Note that $$e^x = \lim_{n \to \infty} \left(1 + \frac{x}{n}\right)^n,$$ so with
131 | $x = -1$, we can see that our limit is $1 - e^{-1} = 1 - 1/e \approx 0.632$.
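    | 
    | For comparison with the plot above:
    | 
    | ```{r}
    | 1 - exp(-1)
    | ```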
132 |
133 | > h. We will now investigate numerically the probability that a bootstrap
134 | > sample of size $n = 100$ contains the $j$th observation. Here $j = 4$. We
135 | > repeatedly create bootstrap samples, and each time we record whether or not
136 | > the fourth observation is contained in the bootstrap sample.
137 | >
138 | > ```r
139 | > > store <- rep (NA, 10000)
140 | > > for (i in 1:10000) {
141 | > store[i] <- sum(sample(1:100, rep = TRUE) == 4) > 0
142 | > }
143 | > > mean(store)
144 | > ```
145 | >
146 | > Comment on the results obtained.
147 |
148 | ```{r}
149 | store <- replicate(10000, sum(sample(1:100, replace = TRUE) == 4) > 0)
150 | mean(store)
151 | ```
152 |
153 | The probability of including $4$ when resampling numbers $1...100$ is close to
154 | $1 - (1 - 1/100)^{100}$.
155 |
156 | ### Question 3
157 |
158 | > 3. We now review $k$-fold cross-validation.
159 | >
160 | > a. Explain how $k$-fold cross-validation is implemented.
161 |
162 | We divide our data into $k$ (approximately equal) subsets. Then, for each
163 | subset in turn, we train on the other $k - 1$ subsets combined and generate
    | predictions for the held-out subset. The $k$ resulting error estimates are
    | then averaged. A minimal manual sketch is shown below.
164 |
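    | As a sketch of the procedure (using the built-in `mtcars` data and an
    | arbitrary linear model purely for illustration):
    | 
    | ```{r}
    | set.seed(42)
    | k <- 5
    | folds <- sample(rep(1:k, length.out = nrow(mtcars))) # random fold labels
    | cv_errors <- sapply(1:k, function(i) {
    |   fit <- lm(mpg ~ wt + hp, data = mtcars[folds != i, ]) # train on k - 1 folds
    |   pred <- predict(fit, newdata = mtcars[folds == i, ])  # predict held-out fold
    |   mean((mtcars$mpg[folds == i] - pred)^2)               # fold test MSE
    | })
    | mean(cv_errors) # k-fold CV estimate of the test MSE
    | ```
    | 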
165 | > b. What are the advantages and disadvantages of $k$-fold cross-validation
166 | > relative to:
167 | > i. The validation set approach?
168 | > ii. LOOCV?
169 |
170 | When using a validation set, we can only train on a portion of the data as we
171 | must reserve the rest for validation. As a result, the approach tends to
172 | overestimate the test error rate (assuming we then train using the complete
173 | data for future prediction). It is also sensitive to which observations are
174 | included in training vs. validation. It is, however, cheap in terms of
175 | processing time (as we only have to fit one model).
176 |
177 | When using LOOCV, we can train on $n-1$ observations; however, the models we
178 | fit each differ only by the inclusion (and exclusion) of a single observation,
179 | so they are highly correlated with one another. As a result, the LOOCV error
180 | estimate can have higher variance than $k$-fold cross-validation with a
181 | smaller $k$ (e.g. 5 or 10). LOOCV is also costly in terms of processing time,
    | since it requires fitting $n$ models.
182 |
183 | ### Question 4
184 |
185 | > Suppose that we use some statistical learning method to make a prediction for
186 | > the response $Y$ for a particular value of the predictor $X$. Carefully
187 | > describe how we might estimate the standard deviation of our prediction.
188 |
189 | We could address this with the bootstrap. Our procedure would be to (jointly)
190 | resample the $(X, Y)$ observations with replacement and refit our model many
191 | times. For each refit we record the prediction at the value of $X$ of
192 | interest; the standard deviation of these predictions across the bootstrap
    | samples estimates the standard deviation of our prediction. A sketch follows.
193 |
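    | A minimal sketch of this idea, using the built-in `mtcars` data, an arbitrary
    | linear model, and a hypothetical prediction point (`wt = 3`, `hp = 150`)
    | purely for illustration:
    | 
    | ```{r}
    | library(boot)
    | pred.fn <- function(data, i) {
    |   fit <- lm(mpg ~ wt + hp, data = data[i, ])           # refit on resampled rows
    |   predict(fit, newdata = data.frame(wt = 3, hp = 150)) # prediction of interest
    | }
    | set.seed(42)
    | boot(mtcars, pred.fn, R = 1000) # the "std. error" column is our estimate
    | ```
    | 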
194 | ## Applied
195 |
196 | ### Question 5
197 |
198 | > In Chapter 4, we used logistic regression to predict the probability of
199 | > `default` using `income` and `balance` on the `Default` data set. We will now
200 | > estimate the test error of this logistic regression model using the
201 | > validation set approach. Do not forget to set a random seed before beginning
202 | > your analysis.
203 | >
204 | > a. Fit a logistic regression model that uses `income` and `balance` to predict
205 | > `default`.
206 |
207 | ```{r}
208 | library(ISLR2)
209 | set.seed(42)
210 | fit <- glm(default ~ income + balance, data = Default, family = "binomial")
211 | ```
212 |
213 | > b. Using the validation set approach, estimate the test error of this model.
214 | > In order to do this, you must perform the following steps:
215 | > i. Split the sample set into a training set and a validation set.
216 | > ii. Fit a multiple logistic regression model using only the training
217 | > observations.
218 | > iii. Obtain a prediction of default status for each individual in
219 | > the validation set by computing the posterior probability of
220 | > default for that individual, and classifying the individual to
221 | > the `default` category if the posterior probability is greater
222 | > than 0.5.
223 | > iv. Compute the validation set error, which is the fraction of
224 | > the observations in the validation set that are misclassified.
225 |
226 | ```{r}
227 | train <- sample(nrow(Default), nrow(Default) / 2)
228 | fit <- glm(default ~ income + balance, data = Default, family = "binomial", subset = train)
229 | pred <- ifelse(predict(fit, newdata = Default[-train, ], type = "response") > 0.5, "Yes", "No")
230 | table(pred, Default$default[-train])
231 | mean(pred != Default$default[-train])
232 | ```
233 |
234 | > c. Repeat the process in (b) three times, using three different splits of the
235 | > observations into a training set and a validation set. Comment on the
236 | > results obtained.
237 |
238 | ```{r}
239 | replicate(3, {
240 | train <- sample(nrow(Default), nrow(Default) / 2)
241 | fit <- glm(default ~ income + balance, data = Default, family = "binomial", subset = train)
242 | pred <- ifelse(predict(fit, newdata = Default[-train, ], type = "response") > 0.5, "Yes", "No")
243 | mean(pred != Default$default[-train])
244 | })
245 | ```
246 |
247 | The results obtained are variable and depend on the samples allocated to
248 | training vs. test.
249 |
250 | > d. Now consider a logistic regression model that predicts the probability of
251 | > `default` using `income`, `balance`, and a dummy variable for `student`.
252 | > Estimate the test error for this model using the validation set approach.
253 | > Comment on whether or not including a dummy variable for `student` leads to
254 | > a reduction in the test error rate.
255 |
256 | ```{r}
257 | replicate(3, {
258 | train <- sample(nrow(Default), nrow(Default) / 2)
259 | fit <- glm(default ~ income + balance + student, data = Default, family = "binomial", subset = train)
260 | pred <- ifelse(predict(fit, newdata = Default[-train, ], type = "response") > 0.5, "Yes", "No")
261 | mean(pred != Default$default[-train])
262 | })
263 | ```
264 |
265 | Including `student` does not seem to make a substantial improvement to the
266 | test error.
267 |
268 | ### Question 6
269 |
270 | > We continue to consider the use of a logistic regression model to predict the
271 | > probability of `default` using `income` and `balance` on the `Default` data
272 | > set. In particular, we will now compute estimates for the standard errors of
273 | > the `income` and `balance` logistic regression coefficients in two different
274 | > ways: (1) using the bootstrap, and (2) using the standard formula for
275 | > computing the standard errors in the `glm()` function. Do not forget to set a
276 | > random seed before beginning your analysis.
277 | >
278 | > a. Using the `summary()` and `glm()` functions, determine the estimated
279 | > standard errors for the coefficients associated with `income` and
280 | > `balance` in a multiple logistic regression model that uses both
281 | > predictors.
282 |
283 | ```{r}
284 | fit <- glm(default ~ income + balance, data = Default, family = "binomial")
285 | summary(fit)
286 | ```
287 |
288 | The standard errors reported by `summary()` are 5.0e-6 for `income`
289 | ($\beta_1$) and 2.3e-4 for `balance` ($\beta_2$).
290 |
291 | > b. Write a function, `boot.fn()`, that takes as input the `Default` data set
292 | > as well as an index of the observations, and that outputs the coefficient
293 | > estimates for `income` and `balance` in the multiple logistic regression
294 | > model.
295 |
296 | ```{r}
297 | boot.fn <- function(x, i) {
298 | fit <- glm(default ~ income + balance, data = x[i, ], family = "binomial")
299 | coef(fit)[-1]
300 | }
301 | ```
302 |
303 | > c. Use the `boot()` function together with your `boot.fn()` function to
304 | > estimate the standard errors of the logistic regression coefficients for
305 | > income and balance.
306 |
307 | ```{r, cache = TRUE}
308 | library(boot)
309 | set.seed(42)
310 | boot(Default, boot.fn, R = 1000)
311 | ```
312 |
313 | > d. Comment on the estimated standard errors obtained using the `glm()`
314 | > function and using your bootstrap function.
315 |
316 | The standard errors obtained by bootstrapping are similar to those estimated
317 | by `glm`.
318 |
319 | ### Question 7
320 |
321 | > In Sections 5.3.2 and 5.3.3, we saw that the `cv.glm()` function can be used
322 | > in order to compute the LOOCV test error estimate. Alternatively, one could
323 | > compute those quantities using just the `glm()` and `predict.glm()`
324 | > functions, and a for loop. You will now take this approach in order to
325 | > compute the LOOCV error for a simple logistic regression model on the `Weekly`
326 | > data set. Recall that in the context of classification problems, the LOOCV
327 | > error is given in (5.4).
328 | >
329 | > a. Fit a logistic regression model that predicts `Direction` using `Lag1` and
330 | > `Lag2`.
331 |
332 | ```{r}
333 | fit <- glm(Direction ~ Lag1 + Lag2, data = Weekly, family = "binomial")
334 | ```
335 |
336 | > b. Fit a logistic regression model that predicts `Direction` using `Lag1` and
337 | > `Lag2` _using all but the first observation_.
338 |
339 | ```{r}
340 | fit <- glm(Direction ~ Lag1 + Lag2, data = Weekly[-1, ], family = "binomial")
341 | ```
342 |
343 | > c. Use the model from (b) to predict the direction of the first
344 | > observation. You can do this by predicting that the first observation will
345 | > go up if $P($`Direction="Up" | Lag1 , Lag2`$) > 0.5$. Was this observation
346 | > correctly classified?
347 |
348 | ```{r}
349 | predict(fit, newdata = Weekly[1, , drop = FALSE], type = "response") > 0.5
350 | ```
351 |
352 | Yes the observation was correctly classified.
353 |
354 | > d. Write a for loop from $i = 1$ to $i = n$, where $n$ is the number of
355 | > observations in the data set, that performs each of the following steps:
356 | > i. Fit a logistic regression model using all but the $i$th observation
357 | > to predict `Direction` using `Lag1` and `Lag2` .
358 | > ii. Compute the posterior probability of the market moving up
359 | > for the $i$th observation.
360 | > iii. Use the posterior probability for the $i$th observation in order
361 | > to predict whether or not the market moves up.
362 | > iv. Determine whether or not an error was made in predicting the
363 | > direction for the $i$th observation. If an error was made, then
364 | > indicate this as a 1, and otherwise indicate it as a 0.
365 |
366 | ```{r}
367 | error <- numeric(nrow(Weekly))
368 | for (i in 1:nrow(Weekly)) {
369 | fit <- glm(Direction ~ Lag1 + Lag2, data = Weekly[-i, ], family = "binomial")
370 | p <- predict(fit, newdata = Weekly[i, , drop = FALSE], type = "response") > 0.5
371 |   error[i] <- ifelse(p, "Up", "Down") != Weekly$Direction[i] # 1 if misclassified
372 | }
373 | ```
374 |
375 | > e. Take the average of the $n$ numbers obtained in (d) in order to obtain the
376 | > LOOCV estimate for the test error. Comment on the results.
377 |
378 | ```{r}
379 | mean(error)
380 | ```
381 |
382 | The LOOCV test error rate is 45%, which implies that our predictions are
383 | marginally more often correct than not.
384 |
385 | ### Question 8
386 |
387 | > We will now perform cross-validation on a simulated data set.
388 | >
389 | > a. Generate a simulated data set as follows:
390 | > ```r
391 | > > set.seed(1)
392 | > > x <- rnorm(100)
393 | > > y <- x - 2 *x^2 + rnorm(100)
394 | > ```
395 | > In this data set, what is $n$ and what is $p$? Write out the model
396 | > used to generate the data in equation form.
397 |
398 | ```{r}
399 | set.seed(1)
400 | x <- rnorm(100)
401 | y <- x - 2 * x^2 + rnorm(100)
402 | ```
403 |
404 | $n$ is 100 and $p$ is 1 (there are 100 observations and $y$ is predicted with
405 | a single variable $x$). The model equation is: $$y = -2x^2 + x + \epsilon$$.
406 |
407 | > b. Create a scatterplot of $X$ against $Y$. Comment on what you find.
408 |
409 | ```{r}
410 | plot(x, y)
411 | ```
412 |
413 | $y$ has a (negative) quadratic relationship with $x$.
414 |
415 | > c. Set a random seed, and then compute the LOOCV errors that result from
416 | > fitting the following four models using least squares:
417 | > i. $Y = \beta_0 + \beta_1 X + \epsilon$
418 | > ii. $Y = \beta_0 + \beta_1 X + \beta_2 X^2 + \epsilon$
419 | > iii. $Y = \beta_0 + \beta_1 X + \beta_2 X^2 + \beta_3 X^3 + \epsilon$
420 | > iv. $Y = \beta_0 + \beta_1 X + \beta_2 X^2 + \beta_3 X^3 + \beta_4 X^4 + \epsilon$.
421 | >
422 | > Note you may find it helpful to use the `data.frame()` function
423 | > to create a single data set containing both $X$ and $Y$.
424 |
425 | ```{r}
426 | library(boot)
427 | set.seed(42)
428 | dat <- data.frame(x, y)
429 | sapply(1:4, function(i) cv.glm(dat, glm(y ~ poly(x, i)))$delta[1])
430 | ```
431 |
432 | > d. Repeat (c) using another random seed, and report your results.
433 | > Are your results the same as what you got in (c)? Why?
434 |
435 | ```{r}
436 | set.seed(43)
437 | dat <- data.frame(x, y)
438 | sapply(1:4, function(i) cv.glm(dat, glm(y ~ poly(x, i)))$delta[1])
439 | ```
440 |
441 | The results are the same because we are using LOOCV. When doing this, the model
442 | is fit leaving each one of the observations out in turn, and thus there is no
443 | stochasticity involved.
444 |
445 | > e. Which of the models in (c) had the smallest LOOCV error? Is this what you
446 | > expected? Explain your answer.
447 |
448 | The second model had the smallest LOOCV error. This is what we would expect,
449 | since the model used to generate the data was quadratic and we are measuring
450 | the test (rather than training) error rate to evaluate performance.
451 |
452 | > f. Comment on the statistical significance of the coefficient estimates
453 | > that results from fitting each of the models in (c) using least squares.
454 | > Do these results agree with the conclusions drawn based on the
455 | > cross-validation results?
456 |
457 | ```{r}
458 | for (i in 1:4) printCoefmat(coef(summary(glm(y ~ poly(x, i), data = dat))))
459 | ```
460 |
461 | We can see that the coefficients in the first model are not highly significant,
462 | but all terms ($\beta_0, \beta_1$ and $\beta_2$) are in the quadratic model.
463 | After this, subsequent $\beta_n$ terms are not significant. Therefore, these
464 | results agree with those from cross-validation.
465 |
466 | ### Question 9
467 |
468 | > We will now consider the `Boston` housing data set, from the `ISLR2`
469 | > library.
470 | >
471 | > a. Based on this data set, provide an estimate for the population mean of
472 | > `medv`. Call this estimate $\hat\mu$.
473 |
474 | ```{r}
475 | (mu <- mean(Boston$medv))
476 | ```
477 |
478 | > b. Provide an estimate of the standard error of $\hat\mu$. Interpret this
479 | > result.
480 | >
481 | > _Hint: We can compute the standard error of the sample mean by
482 | > dividing the sample standard deviation by the square root of the number of
483 | > observations._
484 |
485 | ```{r}
486 | sd(Boston$medv) / sqrt(length(Boston$medv))
487 | ```
488 |
489 | > c. Now estimate the standard error of $\hat\mu$ using the bootstrap. How does
490 | > this compare to your answer from (b)?
491 |
492 | ```{r}
493 | set.seed(42)
494 | (bs <- boot(Boston$medv, function(v, i) mean(v[i]), 10000))
495 | ```
496 |
497 | The standard error using the bootstrap (0.403) is very close to that
498 | obtained from the formula above (0.409).
499 |
500 | > d. Based on your bootstrap estimate from (c), provide a 95% confidence
501 | > interval for the mean of `medv`. Compare it to the results obtained using
502 | > `t.test(Boston$medv)`.
503 | >
504 | > _Hint: You can approximate a 95% confidence interval using the
505 | > formula $[\hat\mu - 2SE(\hat\mu), \hat\mu + 2SE(\hat\mu)].$_
506 |
507 | ```{r}
508 | se <- sd(bs$t)
509 | c(mu - 2 * se, mu + 2 * se)
510 | ```
511 |
512 | > e. Based on this data set, provide an estimate, $\hat\mu_{med}$, for the
513 | > median value of `medv` in the population.
514 |
515 | ```{r}
516 | median(Boston$medv)
517 | ```
518 |
519 | > f. We now would like to estimate the standard error of $\hat\mu_{med}$.
520 | > Unfortunately, there is no simple formula for computing the standard error
521 | > of the median. Instead, estimate the standard error of the median using
522 | > the bootstrap. Comment on your findings.
523 |
524 | ```{r}
525 | set.seed(42)
526 | boot(Boston$medv, function(v, i) median(v[i]), 10000)
527 | ```
528 |
529 | The estimated standard error of the median is 0.374. This is lower than the
530 | standard error of the mean.
531 |
532 | > g. Based on this data set, provide an estimate for the tenth percentile of
533 | > `medv` in Boston census tracts. Call this quantity $\hat\mu_{0.1}$. (You
534 | > can use the `quantile()` function.)
535 |
536 | ```{r}
537 | quantile(Boston$medv, 0.1)
538 | ```
539 |
540 | > h. Use the bootstrap to estimate the standard error of $\hat\mu_{0.1}$.
541 | > Comment on your findings.
542 |
543 | ```{r}
544 | set.seed(42)
545 | boot(Boston$medv, function(v, i) quantile(v[i], 0.1), 10000)
546 | ```
547 |
548 | We get a standard error of ~0.5. This is higher than the standard error of the
549 | median. Nevertheless, the standard error is quite small relative to the
550 | estimate, so we can be fairly confident about the value of the 10th percentile.
551 |
--------------------------------------------------------------------------------
/12-unsupervised-learning.Rmd:
--------------------------------------------------------------------------------
1 | # Unsupervised Learning
2 |
3 | ## Conceptual
4 |
5 | ### Question 1
6 |
7 | > This problem involves the $K$-means clustering algorithm.
8 | >
9 | > a. Prove (12.18).
10 |
11 | 12.18 is:
12 |
13 | $$
14 | \frac{1}{|C_k|}\sum_{i,i' \in C_k} \sum_{j=1}^p (x_{ij} - x_{i'j})^2 =
15 | 2 \sum_{i \in C_k} \sum_{j=1}^p (x_{ij} - \bar{x}_{kj})^2
16 | $$
17 |
18 | where $$\bar{x}_{kj} = \frac{1}{|C_k|}\sum_{i \in C_k} x_{ij}$$
19 |
20 | On the left-hand side we compute the squared difference between each pair of
21 | observations (indexed by $i$ and $i'$). On the right-hand side we compute the
22 | squared difference between each observation and the cluster mean. Intuitively
23 | the identity is plausible (the factor of 2 is present because each pair is
24 | counted twice). To prove it formally:
25 |
26 | Note first that,
27 | \begin{align}
28 | (x_{ij} - x_{i'j})^2
29 | = & ((x_{ij} - \bar{x}_{kj}) - (x_{i'j} - \bar{x}_{kj}))^2 \\
30 | = & (x_{ij} - \bar{x}_{kj})^2 -
31 | 2(x_{ij} - \bar{x}_{kj})(x_{i'j} - \bar{x}_{kj}) +
32 | (x_{i'j} - \bar{x}_{kj})^2
33 | \end{align}
34 |
35 | Note that the first term is independent of $i'$ and the last is independent of
36 | $i$.
37 |
38 | Therefore, the left-hand side of 12.18 can be written as:
39 |
40 | \begin{align}
41 | \frac{1}{|C_k|}\sum_{i,i' \in C_k} \sum_{j=1}^p (x_{ij} - x_{i'j})^2
42 | = & \frac{1}{|C_k|}\sum_{i,i' \in C_k}\sum_{j=1}^p (x_{ij} - \bar{x}_{kj})^2 -
43 | \frac{1}{|C_k|}\sum_{i,i' \in C_k}\sum_{j=1}^p 2(x_{ij} - \bar{x}_{kj})(x_{i'j} - \bar{x}_{kj}) +
44 | \frac{1}{|C_k|}\sum_{i,i' \in C_k}\sum_{j=1}^p (x_{i'j} - \bar{x}_{kj})^2 \\
45 | = & \frac{|C_k|}{|C_k|}\sum_{i \in C_k}\sum_{j=1}^p (x_{ij} - \bar{x}_{kj})^2 -
46 | \frac{2}{|C_k|}\sum_{i,i' \in C_k}\sum_{j=1}^p (x_{ij} - \bar{x}_{kj})(x_{i'j} - \bar{x}_{kj}) +
47 | \frac{|C_k|}{|C_k|}\sum_{i \in C_k}\sum_{j=1}^p (x_{ij} - \bar{x}_{kj})^2 \\
48 | = & 2 \sum_{i \in C_k}\sum_{j=1}^p (x_{ij} - \bar{x}_{kj})^2
49 | \end{align}
50 |
51 | Note that we can drop the term containing
52 | $(x_{ij} - \bar{x}_{kj})(x_{i'j} - \bar{x}_{kj})$ since this is 0 when summed
53 | over combinations of $i$ and $i'$ for a given $j$.
54 |
55 | > b. On the basis of this identity, argue that the $K$-means clustering
56 | > algorithm (Algorithm 12.2) decreases the objective (12.17) at each
57 | > iteration.
58 |
59 | Equation 12.18 shows that the within-cluster sum of squared pairwise distances
60 | is equal to twice the sum of squared distances from each observation to the
61 | cluster mean. In each iteration of Algorithm 12.2, recomputing the centroids as
62 | the cluster means minimizes this sum for the current assignment, and then
   | reassigning each observation to its closest centroid can only reduce it
   | further. Hence the objective (12.17) never increases; a small numerical
   | illustration follows.
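    | 
    | A small illustration with made-up two-dimensional data: implementing the two
    | steps of the algorithm directly and tracking the total squared distance from
    | each observation to its assigned centroid, which should never increase.
    | 
    | ```{r}
    | set.seed(42)
    | X <- matrix(rnorm(100 * 2), ncol = 2)
    | cluster <- sample(1:3, nrow(X), replace = TRUE) # random initial assignment
    | objective <- numeric(10)
    | for (iter in 1:10) {
    |   # step 2(a): centroids are the per-cluster means
    |   centroids <- sapply(1:3, function(k) colMeans(X[cluster == k, , drop = FALSE]))
    |   # step 2(b): assign each observation to its closest centroid
    |   d2 <- sapply(1:3, function(k) colSums((t(X) - centroids[, k])^2))
    |   cluster <- apply(d2, 1, which.min)
    |   objective[iter] <- sum(d2[cbind(seq_len(nrow(X)), cluster)])
    | }
    | objective # non-increasing
    | ```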
63 |
64 | ### Question 2
65 |
66 | > Suppose that we have four observations, for which we compute a dissimilarity
67 | > matrix, given by
68 | >
69 | > \begin{bmatrix}
70 | > & 0.3 & 0.4 & 0.7 \\
71 | > 0.3 & & 0.5 & 0.8 \\
72 | > 0.4 & 0.5 & & 0.45 \\
73 | > 0.7 & 0.8 & 0.45 & \\
74 | > \end{bmatrix}
75 | >
76 | > For instance, the dissimilarity between the first and second observations is
77 | > 0.3, and the dissimilarity between the second and fourth observations is 0.8.
78 | >
79 | > a. On the basis of this dissimilarity matrix, sketch the dendrogram that
80 | > results from hierarchically clustering these four observations using
81 | > complete linkage. Be sure to indicate on the plot the height at which each
82 | > fusion occurs, as well as the observations corresponding to each leaf in
83 | > the dendrogram.
84 |
85 | ```{r}
86 | m <- matrix(c(0, 0.3, 0.4, 0.7, 0.3, 0, 0.5, 0.8, 0.4, 0.5, 0., 0.45, 0.7, 0.8, 0.45, 0), ncol = 4)
87 | c1 <- hclust(as.dist(m), method = "complete")
88 | plot(c1)
89 | ```
90 |
91 | > b. Repeat (a), this time using single linkage clustering.
92 |
93 | ```{r}
94 | c2 <- hclust(as.dist(m), method = "single")
95 | plot(c2)
96 | ```
97 |
98 | > c. Suppose that we cut the dendrogram obtained in (a) such that two clusters
99 | > result. Which observations are in each cluster?
100 |
101 | ```{r}
102 | table(1:4, cutree(c1, 2))
103 | ```
104 |
105 | > d. Suppose that we cut the dendrogram obtained in (b) such that two clusters
106 | > result. Which observations are in each cluster?
107 |
108 | ```{r}
109 | table(1:4, cutree(c2, 2))
110 | ```
111 |
112 | > e. It is mentioned in the chapter that at each fusion in the dendrogram, the
113 | > position of the two clusters being fused can be swapped without changing
114 | > the meaning of the dendrogram. Draw a dendrogram that is equivalent to the
115 | > dendrogram in (a), for which two or more of the leaves are repositioned,
116 | > but for which the meaning of the dendrogram is the same.
117 |
118 | ```{r}
119 | plot(c1, labels = c(2, 1, 3, 4))
120 | ```
121 |
122 | ### Question 3
123 |
124 | > In this problem, you will perform $K$-means clustering manually, with $K = 2$,
125 | > on a small example with $n = 6$ observations and $p = 2$ features. The
126 | > observations are as follows.
127 | >
128 | > | Obs. | $X_1$ | $X_2$ |
129 | > |------|-------|-------|
130 | > | 1 | 1 | 4 |
131 | > | 2 | 1 | 3 |
132 | > | 3 | 0 | 4 |
133 | > | 4 | 5 | 1 |
134 | > | 5 | 6 | 2 |
135 | > | 6 | 4 | 0 |
136 | >
137 | > a. Plot the observations.
138 |
139 | ```{r}
140 | library(ggplot2)
141 | d <- data.frame(
142 | x1 = c(1, 1, 0, 5, 6, 4),
143 | x2 = c(4, 3, 4, 1, 2, 0)
144 | )
145 | ggplot(d, aes(x = x1, y = x2)) +
146 | geom_point()
147 | ```
148 |
149 | > b. Randomly assign a cluster label to each observation. You can use the
150 | > `sample()` command in `R` to do this. Report the cluster labels for each
151 | > observation.
152 |
153 | ```{r}
154 | set.seed(42)
155 | d$cluster <- sample(c(1, 2), size = nrow(d), replace = TRUE)
156 | ```
157 |
158 | > c. Compute the centroid for each cluster.
159 |
160 | ```{r}
161 | centroids <- sapply(c(1, 2), function(i) colMeans(d[d$cluster == i, 1:2]))
162 | ```
163 |
164 | > d. Assign each observation to the centroid to which it is closest, in terms of
165 | > Euclidean distance. Report the cluster labels for each observation.
166 |
167 | ```{r}
168 | dist <- sapply(1:2, function(i) {
169 | sqrt((d$x1 - centroids[1, i])^2 + (d$x2 - centroids[2, i])^2)
170 | })
171 | d$cluster <- apply(dist, 1, which.min)
172 | ```
173 |
174 | > e. Repeat (c) and (d) until the answers obtained stop changing.
175 |
176 | ```{r}
177 | centroids <- sapply(c(1, 2), function(i) colMeans(d[d$cluster == i, 1:2]))
178 | dist <- sapply(1:2, function(i) {
179 | sqrt((d$x1 - centroids[1, i])^2 + (d$x2 - centroids[2, i])^2)
180 | })
181 | d$cluster <- apply(dist, 1, which.min)
182 | ```
183 |
184 | In this case, we get stable labels after the first iteration.
185 |
186 | > f. In your plot from (a), color the observations according to the cluster
187 | > labels obtained.
188 |
189 | ```{r}
190 | ggplot(d, aes(x = x1, y = x2, color = factor(cluster))) +
191 | geom_point()
192 | ```
193 |
194 | ### Question 4
195 |
196 | > Suppose that for a particular data set, we perform hierarchical clustering
197 | > using single linkage and using complete linkage. We obtain two dendrograms.
198 | >
199 | > a. At a certain point on the single linkage dendrogram, the clusters {1, 2, 3}
200 | > and {4, 5} fuse. On the complete linkage dendrogram, the clusters {1, 2, 3}
201 | > and {4, 5} also fuse at a certain point. Which fusion will occur higher on
202 | > the tree, or will they fuse at the same height, or is there not enough
203 | > information to tell?
204 |
205 | The complete linkage fusion will likely be higher in the tree since single
206 | linkage is defined as being the minimum distance between two clusters. However,
207 | there is a chance that they could be at the same height (so technically there
208 | is not enough information to tell).
209 |
210 | > b. At a certain point on the single linkage dendrogram, the clusters {5} and
211 | > {6} fuse. On the complete linkage dendrogram, the clusters {5} and {6} also
212 | > fuse at a certain point. Which fusion will occur higher on the tree, or
213 | > will they fuse at the same height, or is there not enough information to
214 | > tell?
215 |
216 | They will fuse at the same height (the algorithm for calculating distance is
217 | the same when the clusters are of size 1).
218 |
219 | ### Question 5
220 |
221 | > In words, describe the results that you would expect if you performed
222 | > $K$-means clustering of the eight shoppers in Figure 12.16, on the basis of
223 | > their sock and computer purchases, with $K = 2$. Give three answers, one for
224 | > each of the variable scalings displayed. Explain.
225 |
226 | Where the variables are scaled to have standard deviation one, we would expect
227 | the clusters to correspond to whether or not the shopper purchased a computer.
228 | With the raw numbers of items purchased, socks dominate, so we would expect the
229 | clusters to represent low vs. high numbers of sock purchases. With purchases
    | measured in dollars, the computer purchase dominates, so we would again expect
    | clusters defined by whether a computer was bought.
230 |
231 | To test, we can run the analysis in R:
232 |
233 | ```{r}
234 | set.seed(42)
235 | dat <- data.frame(
236 | socks = c(8, 11, 7, 6, 5, 6, 7, 8),
237 | computers = c(0, 0, 0, 0, 1, 1, 1, 1)
238 | )
239 | kmeans(dat, 2)$cluster
240 | kmeans(scale(dat), 2)$cluster
241 | dat$computers <- dat$computers * 2000
242 | kmeans(dat, 2)$cluster
243 | ```
244 |
245 | ### Question 6
246 |
247 | > We saw in Section 12.2.2 that the principal component loading and score
248 | > vectors provide an approximation to a matrix, in the sense of (12.5).
249 | > Specifically, the principal component score and loading vectors solve the
250 | > optimization problem given in (12.6).
251 | >
252 | > Now, suppose that the $M$ principal component score vectors $z_{im}$, $m = 1,...,M$,
253 | > are known. Using (12.6), explain that the first $M$ principal component
254 | > loading vectors $\phi_{jm}$, $m = 1,...,M$, can be obtained by performing $M$
255 | > separate least squares linear regressions. In each regression, the principal
256 | > component score vectors are the predictors, and one of the features of the
257 | > data matrix is the response.
258 |
259 | ## Applied
260 |
261 | ### Question 7
262 |
263 | > In the chapter, we mentioned the use of correlation-based distance and
264 | > Euclidean distance as dissimilarity measures for hierarchical clustering.
265 | > It turns out that these two measures are almost equivalent: if each
266 | > observation has been centered to have mean zero and standard deviation one,
267 | > and if we let $r_{ij}$ denote the correlation between the $i$th and $j$th
268 | > observations, then the quantity $1 - r_{ij}$ is proportional to the squared
269 | > Euclidean distance between the ith and jth observations.
270 | >
271 | > On the `USArrests` data, show that this proportionality holds.
272 | >
273 | > _Hint: The Euclidean distance can be calculated using the `dist()` function,_
274 | > _and correlations can be calculated using the `cor()` function._
275 |
276 | ```{r}
277 | dat <- t(scale(t(USArrests)))
278 | d1 <- dist(dat)^2
279 | d2 <- as.dist(1 - cor(t(dat)))
280 | plot(d1, d2)
281 | ```
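    | 
    | The plot shows an exact linear relationship. Since each observation has been
    | scaled to have (sample) standard deviation one across its $p = 4$ features,
    | the squared Euclidean distance works out to $2(p - 1)(1 - r_{ij}) = 6(1 -
    | r_{ij})$, which we can check directly (the constant 6 is specific to this
    | data set):
    | 
    | ```{r}
    | summary(as.numeric(d1) / as.numeric(d2))
    | ```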
282 |
283 | ### Question 8
284 |
285 | > In Section 12.2.3, a formula for calculating PVE was given in Equation
286 | > 12.10. We also saw that the PVE can be obtained using the `sdev` output of the
287 | > `prcomp()` function.
288 | >
289 | > On the `USArrests` data, calculate PVE in two ways:
290 | >
291 | > a. Using the `sdev` output of the `prcomp()` function, as was done in Section
292 | > 12.2.3.
293 |
294 | ```{r}
295 | pr <- prcomp(USArrests, scale = TRUE)
296 | pr$sdev^2 / sum(pr$sdev^2)
297 | ```
298 |
299 | > b. By applying Equation 12.10 directly. That is, use the `prcomp()` function to
300 | > compute the principal component loadings. Then, use those loadings in
301 | > Equation 12.10 to obtain the PVE.
302 | >
303 | > These two approaches should give the same results.
304 |
305 | ```{r}
306 | colSums(pr$x^2) / sum(colSums(scale(USArrests)^2))
307 | ```
308 |
309 | > _Hint: You will only obtain the same results in (a) and (b) if the same_
310 | > _data is used in both cases. For instance, if in (a) you performed_
311 | > _`prcomp()` using centered and scaled variables, then you must center and_
312 | > _scale the variables before applying Equation 12.10 in (b)._
313 |
314 | ### Question 9
315 |
316 | > Consider the `USArrests` data. We will now perform hierarchical clustering on
317 | > the states.
318 | >
319 | > a. Using hierarchical clustering with complete linkage and Euclidean distance,
320 | > cluster the states.
321 |
322 | ```{r}
323 | set.seed(42)
324 | hc <- hclust(dist(USArrests), method = "complete")
325 | ```
326 |
327 | > b. Cut the dendrogram at a height that results in three distinct clusters.
328 | > Which states belong to which clusters?
329 |
330 | ```{r}
331 | ct <- cutree(hc, 3)
332 | sapply(1:3, function(i) names(ct)[ct == i])
333 | ```
334 |
335 | > c. Hierarchically cluster the states using complete linkage and Euclidean
336 | > distance, _after scaling the variables to have standard deviation one_.
337 |
338 | ```{r}
339 | hc2 <- hclust(dist(scale(USArrests)), method = "complete")
340 | ```
341 |
342 | > d. What effect does scaling the variables have on the hierarchical clustering
343 | > obtained? In your opinion, should the variables be scaled before the
344 | > inter-observation dissimilarities are computed? Provide a justification for
345 | > your answer.
346 |
347 | ```{r}
348 | ct <- cutree(hc2, 3) # use the clustering on the scaled data
349 | sapply(1:3, function(i) names(ct)[ct == i])
350 | ```
351 |
352 | Scaling results in different clusters and the choice of whether to scale or
353 | not depends on the data in question. In this case, the variables are:
354 |
355 | - `Murder`: numeric, murder arrests (per 100,000)
356 | - `Assault`: numeric, assault arrests (per 100,000)
357 | - `UrbanPop`: numeric, percent urban population
358 | - `Rape`: numeric, rape arrests (per 100,000)
359 |
360 | These variables are not naturally on the same scale, and the units involved
361 | are somewhat arbitrary (for example, `Murder` could be measured per 1 million
362 | rather than per 100,000), so in this case I would argue the data should be
363 | scaled.
364 |
365 | ### Question 10
366 |
367 | > In this problem, you will generate simulated data, and then perform PCA and
368 | > $K$-means clustering on the data.
369 | >
370 | > a. Generate a simulated data set with 20 observations in each of three classes
371 | > (i.e. 60 observations total), and 50 variables.
372 | >
373 | > _Hint: There are a number of functions in `R` that you can use to generate_
374 | > _data. One example is the `rnorm()` function; `runif()` is another option._
375 | > _Be sure to add a mean shift to the observations in each class so that_
376 | > _there are three distinct classes._
377 |
378 | ```{r}
379 | set.seed(42)
380 | data <- matrix(rnorm(60 * 50), ncol = 50)
381 | classes <- rep(c("A", "B", "C"), each = 20)
382 | dimnames(data) <- list(classes, paste0("v", 1:50))
383 | data[classes == "B", 1:10] <- data[classes == "B", 1:10] + 1.2
384 | data[classes == "C", 5:30] <- data[classes == "C", 5:30] + 1
385 | ```
386 |
387 | > b. Perform PCA on the 60 observations and plot the first two principal
388 | > component score vectors. Use a different color to indicate the
389 | > observations in each of the three classes. If the three classes appear
390 | > separated in this plot, then continue on to part (c). If not, then return
391 | > to part (a) and modify the simulation so that there is greater separation
392 | > between the three classes. Do not continue to part (c) until the three
393 | > classes show at least some separation in the first two principal component
394 | > score vectors.
395 |
396 | ```{r}
397 | pca <- prcomp(data)
398 | ggplot(
399 | data.frame(Class = classes, PC1 = pca$x[, 1], PC2 = pca$x[, 2]),
400 | aes(x = PC1, y = PC2, col = Class)
401 | ) +
402 | geom_point()
403 | ```
404 |
405 | > c. Perform $K$-means clustering of the observations with $K = 3$. How well do
406 | > the clusters that you obtained in $K$-means clustering compare to the true
407 | > class labels?
408 | >
409 | > _Hint: You can use the `table()` function in `R` to compare the true class_
410 | > _labels to the class labels obtained by clustering. Be careful how you_
411 | > _interpret the results: $K$-means clustering will arbitrarily number the_
412 | > _clusters, so you cannot simply check whether the true class labels and_
413 | > _clustering labels are the same._
414 |
415 | ```{r}
416 | km <- kmeans(data, 3)$cluster
417 | table(km, names(km))
418 | ```
419 |
420 | $K$-means separates out the clusters nearly perfectly.
421 |
422 | > d. Perform $K$-means clustering with $K = 2$. Describe your results.
423 |
424 | ```{r}
425 | km <- kmeans(data, 2)$cluster
426 | table(km, names(km))
427 | ```
428 |
429 | $K$-means effectively defines cluster 2 to be class B, but cluster 1 is a mix
430 | of classes A and C.
431 |
432 | > e. Now perform $K$-means clustering with $K = 4$, and describe your results.
433 |
434 | ```{r}
435 | km <- kmeans(data, 4)$cluster
436 | table(km, names(km))
437 | ```
438 |
439 | $K$-means effectively defines cluster 1 to be class B and cluster 2 to be
440 | class A, but clusters 3 and 4 are split over class C.
441 |
442 | > f. Now perform $K$-means clustering with $K = 3$ on the first two principal
443 | > component score vectors, rather than on the raw data. That is, perform
444 | > $K$-means clustering on the $60 \times 2$ matrix of which the first column
445 | > is the first principal component score vector, and the second column is
446 | > the second principal component score vector. Comment on the results.
447 |
448 | ```{r}
449 | km <- kmeans(pca$x[, 1:2], 3)$cluster
450 | table(km, names(km))
451 | ```
452 |
453 | $K$-means again separates out the clusters nearly perfectly.
454 |
455 | > g. Using the `scale()` function, perform $K$-means clustering with $K = 3$ on
456 | > the data _after scaling each variable to have standard deviation one_. How
457 | > do these results compare to those obtained in (b)? Explain.
458 |
459 | ```{r}
460 | km <- kmeans(scale(data), 3)$cluster
461 | table(km, names(km))
462 | ```
463 |
464 | $K$-means appears to perform less well on the scaled data in this case.
465 |
466 | ### Question 11
467 |
468 | > Write an `R` function to perform matrix completion as in Algorithm 12.1, and
469 | > as outlined in Section 12.5.2. In each iteration, the function should keep
470 | > track of the relative error, as well as the iteration count. Iterations should
471 | > continue until the relative error is small enough or until some maximum number
472 | > of iterations is reached (set a default value for this maximum number).
473 | > Furthermore, there should be an option to print out the progress in each
474 | > iteration.
475 | >
476 | > Test your function on the `Boston` data. First, standardize the features to
477 | > have mean zero and standard deviation one using the `scale()` function. Run an
478 | > experiment where you randomly leave out an increasing (and nested) number of
479 | > observations from 5% to 30%, in steps of 5%. Apply Algorithm 12.1 with $M =
480 | > 1,2,...,8$. Display the approximation error as a function of the fraction of
481 | > observations that are missing, and the value of $M$, averaged over 10
482 | > repetitions of the experiment.
483 |
484 | ### Question 12
485 |
486 | > In Section 12.5.2, Algorithm 12.1 was implemented using the `svd()` function.
487 | > However, given the connection between the `svd()` function and the `prcomp()`
488 | > function highlighted in the lab, we could have instead implemented the
489 | > algorithm using `prcomp()`.
490 | >
491 | > Write a function to implement Algorithm 12.1 that makes use of `prcomp()`
492 | > rather than `svd()`.
493 |
494 | ### Question 13
495 |
496 | > On the book website, `www.StatLearning.com`, there is a gene expression data
497 | > set (`Ch12Ex13.csv`) that consists of 40 tissue samples with measurements on
498 | > 1,000 genes. The first 20 samples are from healthy patients, while the
499 | > second 20 are from a diseased group.
500 | >
501 | > a. Load in the data using `read.csv()`. You will need to select `header = F`.
502 |
503 | ```{r}
504 | data <- read.csv("data/Ch12Ex13.csv", header = FALSE)
505 | colnames(data) <- c(paste0("H", 1:20), paste0("D", 1:20))
506 | ```
507 |
508 | > b. Apply hierarchical clustering to the samples using correlation-based
509 | > distance, and plot the dendrogram. Do the genes separate the samples into
510 | > the two groups? Do your results depend on the type of linkage used?
511 |
512 | ```{r}
513 | hc.complete <- hclust(as.dist(1 - cor(data)), method = "complete")
514 | plot(hc.complete)
515 |
516 | hc.complete <- hclust(as.dist(1 - cor(data)), method = "average")
517 | plot(hc.complete)
518 |
519 | hc.complete <- hclust(as.dist(1 - cor(data)), method = "single")
520 | plot(hc.complete)
521 | ```
522 |
523 | Yes, the samples clearly separate into the two groups, although the results
524 | depend somewhat on the linkage method used. In the case of average linkage,
525 | the diseased samples all fall within a subset of the healthy samples.
526 |
527 | > c. Your collaborator wants to know which genes differ the most across the two
528 | > groups. Suggest a way to answer this question, and apply it here.
529 |
530 | This is probably best achieved with a supervised approach. A simple method
531 | would be to determine which genes show the most significant differences
532 | between the groups by applying a t-test to each gene. We can then select those
533 | with an FDR-adjusted p-value less than some given threshold (e.g. 0.05).
534 |
535 | ```{r}
536 | class <- factor(rep(c("Healthy", "Diseased"), each = 20))
537 | pvals <- apply(data, 1, function(v) t.test(v ~ class)$p.value)
    | pvals <- p.adjust(pvals, method = "fdr") # FDR adjustment, as described above
538 | which(pvals < 0.05)
539 | ```
540 |
--------------------------------------------------------------------------------
/09-support-vector-mechines.Rmd:
--------------------------------------------------------------------------------
1 | # Support Vector Machines
2 |
3 | ## Conceptual
4 |
5 | ### Question 1
6 |
7 | > This problem involves hyperplanes in two dimensions.
8 | >
9 | > a. Sketch the hyperplane $1 + 3X_1 - X_2 = 0$. Indicate the set of points for
10 | > which $1 + 3X_1 - X_2 > 0$, as well as the set of points for which
11 | > $1 + 3X_1 - X_2 < 0$.
12 |
13 | ```{r}
14 | library(ggplot2)
15 | xlim <- c(-10, 10)
16 | ylim <- c(-30, 30)
17 | points <- expand.grid(
18 | X1 = seq(xlim[1], xlim[2], length.out = 50),
19 | X2 = seq(ylim[1], ylim[2], length.out = 50)
20 | )
21 | p <- ggplot(points, aes(x = X1, y = X2)) +
22 | geom_abline(intercept = 1, slope = 3) + # X2 = 1 + 3X1
23 | theme_bw()
24 | p + geom_point(aes(color = 1 + 3 * X1 - X2 > 0), size = 0.1) +
25 | scale_color_discrete(name = "1 + 3X1 - X2 > 0")
26 | ```
27 |
28 | > b. On the same plot, sketch the hyperplane $-2 + X_1 + 2X_2 = 0$. Indicate the
29 | > set of points for which $-2 + X_1 + 2X_2 > 0$, as well as the set of points
30 | > for which $-2 + X_1 + 2X_2 < 0$.
31 |
32 | ```{r}
33 | p + geom_abline(intercept = 1, slope = -1 / 2) + # X2 = 1 - X1/2
34 | geom_point(
35 | aes(color = interaction(1 + 3 * X1 - X2 > 0, -2 + X1 + 2 * X2 > 0)),
36 | size = 0.5
37 | ) +
38 | scale_color_discrete(name = "(1 + 3X1 - X2 > 0).(-2 + X1 + 2X2 > 0)")
39 | ```
40 |
41 | ### Question 2
42 |
43 | > We have seen that in $p = 2$ dimensions, a linear decision boundary takes the
44 | > form $\beta_0 + \beta_1X_1 + \beta_2X_2 = 0$. We now investigate a non-linear
45 | > decision boundary.
46 | >
47 | > a. Sketch the curve $$(1+X_1)^2 +(2-X_2)^2 = 4$$.
48 |
49 | ```{r}
50 | points <- expand.grid(
51 | X1 = seq(-4, 2, length.out = 100),
52 | X2 = seq(-1, 5, length.out = 100)
53 | )
54 | p <- ggplot(points, aes(x = X1, y = X2, z = (1 + X1)^2 + (2 - X2)^2 - 4)) +
55 | geom_contour(breaks = 0, colour = "black") +
56 | theme_bw()
57 | p
58 | ```
59 |
60 | > b. On your sketch, indicate the set of points for which
61 | > $$(1 + X_1)^2 + (2 - X_2)^2 > 4,$$ as well as the set of points for which
62 | > $$(1 + X_1)^2 + (2 - X_2)^2 \leq 4.$$
63 |
64 | ```{r}
65 | p + geom_point(aes(color = (1 + X1)^2 + (2 - X2)^2 - 4 > 0), size = 0.1)
66 | ```
67 |
68 | > c. Suppose that a classifier assigns an observation to the blue class if $$(1
69 | > + X_1)^2 + (2 - X_2)^2 > 4,$$ and to the red class otherwise. To what class
70 | > is the observation $(0, 0)$ classified? $(-1, 1)$? $(2, 2)$? $(3, 8)$?
71 |
72 | ```{r}
73 | points <- data.frame(
74 | X1 = c(0, -1, 2, 3),
75 | X2 = c(0, 1, 2, 8)
76 | )
77 | ifelse((1 + points$X1)^2 + (2 - points$X2)^2 > 4, "blue", "red")
78 | ```
79 |
80 | > d. Argue that while the decision boundary in (c) is not linear in terms of
81 | > $X_1$ and $X_2$, it is linear in terms of $X_1$, $X_1^2$, $X_2$, and
82 | > $X_2^2$.
83 |
84 | The decision boundary is $$(1 + X_1)^2 + (2 - X_2)^2 -4 = 0$$ which we can expand
85 | to:
86 | $$1 + 2X_1 + X_1^2 + 4 - 4X_2 + X_2^2 - 4 = 0$$
87 | which is linear in terms of $X_1$, $X_1^2$, $X_2$, $X_2^2$.
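
As a quick numerical check that the quadratic rule and the expanded linear rule
agree everywhere on a grid of points:

```{r}
grid <- expand.grid(X1 = seq(-4, 2, 0.1), X2 = seq(-1, 5, 0.1))
quadratic <- with(grid, (1 + X1)^2 + (2 - X2)^2 - 4 > 0)
expanded <- with(grid, 1 + 2 * X1 + X1^2 + 4 - 4 * X2 + X2^2 - 4 > 0)
all(quadratic == expanded)
```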
88 |
89 | ### Question 3
90 |
91 | > Here we explore the maximal margin classifier on a toy data set.
92 | >
93 | > a. We are given $n = 7$ observations in $p = 2$ dimensions. For each
94 | > observation, there is an associated class label.
95 | >
96 | > | Obs. | $X_1$ | $X_2$ | $Y$ |
97 | > |------|-------|-------|------|
98 | > | 1 | 3 | 4 | Red |
99 | > | 2 | 2 | 2 | Red |
100 | > | 3 | 4 | 4 | Red |
101 | > | 4 | 1 | 4 | Red |
102 | > | 5 | 2 | 1 | Blue |
103 | > | 6 | 4 | 3 | Blue |
104 | > | 7 | 4 | 1 | Blue |
105 | >
106 | > Sketch the observations.
107 |
108 | ```{r}
109 | data <- data.frame(
110 | X1 = c(3, 2, 4, 1, 2, 4, 4),
111 | X2 = c(4, 2, 4, 4, 1, 3, 1),
112 | Y = c(rep("Red", 4), rep("Blue", 3))
113 | )
114 | p <- ggplot(data, aes(x = X1, y = X2, color = Y)) +
115 | geom_point(size = 2) +
116 | scale_colour_identity() +
117 | coord_cartesian(xlim = c(0.5, 4.5), ylim = c(0.5, 4.5))
118 | p
119 | ```
120 |
121 | > b. Sketch the optimal separating hyperplane, and provide the equation for this
122 | > hyperplane (of the form (9.1)).
123 |
124 | ```{r}
125 | library(e1071)
126 |
127 | fit <- svm(as.factor(Y) ~ ., data = data, kernel = "linear", cost = 10, scale = FALSE)
128 |
129 | # Extract beta_0, beta_1, beta_2
130 | beta <- c(
131 | -fit$rho,
132 | drop(t(fit$coefs) %*% as.matrix(data[fit$index, 1:2]))
133 | )
134 | names(beta) <- c("B0", "B1", "B2")
135 | p <- p + geom_abline(intercept = -beta[1] / beta[3], slope = -beta[2] / beta[3], lty = 2)
136 | p
137 | ```
138 |
139 | > c. Describe the classification rule for the maximal margin classifier. It
140 | > should be something along the lines of "Classify to Red if $\beta_0 +
141 | > \beta_1X_1 + \beta_2X_2 > 0$, and classify to Blue otherwise." Provide the
142 | > values for $\beta_0, \beta_1,$ and $\beta_2$.
143 |
144 | Classify to red if $\beta_0 + \beta_1X_1 + \beta_2X_2 > 0$ and blue otherwise
145 | where $\beta_0 = `r round(beta[1])`$, $\beta_1 = `r round(beta[2])`$,
146 | $\beta_2 = `r round(beta[3])`$.
147 |
148 | > d. On your sketch, indicate the margin for the maximal margin hyperplane.
149 |
150 | ```{r}
151 | p <- p + geom_ribbon(
152 | aes(x = x, ymin = ymin, ymax = ymax),
153 | data = data.frame(x = c(0, 5), ymin = c(-1, 4), ymax = c(0, 5)),
154 | alpha = 0.1, fill = "blue",
155 | inherit.aes = FALSE
156 | )
157 | p
158 | ```
159 |
160 | > e. Indicate the support vectors for the maximal margin classifier.
161 |
162 | ```{r}
163 | p <- p + geom_point(data = data[fit$index, ], size = 4)
164 | p
165 | ```
166 |
167 | The support vectors (from the svm fit object) are shown above. Arguably,
168 | there's another support vector, since four points exactly touch the margin.
169 |
170 | > f. Argue that a slight movement of the seventh observation would not affect
171 | > the maximal margin hyperplane.
172 |
173 | ```{r}
174 | p + geom_point(data = data[7, , drop = FALSE], size = 4, color = "purple")
175 | ```
176 |
177 | The 7th point is shown in purple above. It is not a support vector, and it is
178 | not close to the margin, so small changes in its $X_1$, $X_2$ values would not
179 | affect the maximal margin hyperplane.
180 |
181 | > g. Sketch a hyperplane that is _not_ the optimal separating hyperplane, and
182 | > provide the equation for this hyperplane.
183 |
184 | A non-optimal hyperplane that still separates the blue and red points would
185 | be one that touches the (red) point at X1 = 2, X2 = 2 and the (blue) point at
186 | X1 = 4, X2 = 3. This gives the line $y = x/2 + 1$, or, in the form of (9.1),
187 | $\beta_0 = -1$, $\beta_1 = -1/2$, $\beta_2 = 1$.
188 |
189 | ```{r}
190 | p + geom_abline(intercept = 1, slope = 0.5, lty = 2, col = "red")
191 | ```
192 |
193 | > h. Draw an additional observation on the plot so that the two classes are no
194 | > longer separable by a hyperplane.
195 |
196 | ```{r}
197 | p + geom_point(data = data.frame(X1 = 1, X2 = 3, Y = "Blue"), shape = 15, size = 4)
198 | ```
199 |
200 | ## Applied
201 |
202 | ### Question 4
203 |
204 | > Generate a simulated two-class data set with 100 observations and two features
205 | > in which there is a visible but non-linear separation between the two classes.
206 | > Show that in this setting, a support vector machine with a polynomial kernel
207 | > (with degree greater than 1) or a radial kernel will outperform a support
208 | > vector classifier on the training data. Which technique performs best on the
209 | > test data? Make plots and report training and test error rates in order to
210 | > back up your assertions.
211 |
212 | ```{r}
213 | set.seed(10)
214 | data <- data.frame(
215 | x = runif(100),
216 | y = runif(100)
217 | )
218 | score <- (2 * data$x - 0.5)^2 + (data$y)^2 - 0.5
219 | data$class <- factor(ifelse(score > 0, "red", "blue"))
220 |
221 | p <- ggplot(data, aes(x = x, y = y, color = class)) +
222 | geom_point(size = 2) +
223 | scale_colour_identity()
224 | p
225 |
226 | train <- 1:50
227 | test <- 51:100
228 |
229 | fits <- list(
230 | "Radial" = svm(class ~ ., data = data[train, ], kernel = "radial"),
231 | "Polynomial" = svm(class ~ ., data = data[train, ], kernel = "polynomial", degree = 2),
232 | "Linear" = svm(class ~ ., data = data[train, ], kernel = "linear")
233 | )
234 |
235 | err <- function(model, data) {
236 | out <- table(predict(model, data), data$class)
237 | (out[1, 2] + out[2, 1]) / sum(out)
238 | }
239 | plot(fits[[1]], data)
240 | plot(fits[[2]], data)
241 | plot(fits[[3]], data)
242 | sapply(fits, err, data = data[train, ])
243 | sapply(fits, err, data = data[test, ])
244 | ```
245 |
246 | In this case, the radial kernel performs best, followed by the linear kernel,
247 | with the 2nd-degree polynomial performing worst. The ordering of these models
248 | is the same for the training and test data sets.
249 |
250 | ### Question 5
251 |
252 | > We have seen that we can fit an SVM with a non-linear kernel in order to
253 | > perform classification using a non-linear decision boundary. We will now see
254 | > that we can also obtain a non-linear decision boundary by performing logistic
255 | > regression using non-linear transformations of the features.
256 | >
257 | > a. Generate a data set with $n = 500$ and $p = 2$, such that the observations
258 | > belong to two classes with a quadratic decision boundary between them. For
259 | > instance, you can do this as follows:
260 | >
261 | > ```r
262 | > > x1 <- runif(500) - 0.5
263 | > > x2 <- runif(500) - 0.5
264 | > > y <- 1 * (x1^2 - x2^2 > 0)
265 | > ```
266 |
267 | ```{r}
268 | set.seed(42)
269 | train <- data.frame(
270 | x1 = runif(500) - 0.5,
271 | x2 = runif(500) - 0.5
272 | )
273 | train$y <- factor(as.numeric((train$x1^2 - train$x2^2 > 0)))
274 | ```
275 |
276 | > b. Plot the observations, colored according to their class labels. Your plot
277 | > should display $X_1$ on the $x$-axis, and $X_2$ on the $y$-axis.
278 |
279 | ```{r}
280 | p <- ggplot(train, aes(x = x1, y = x2, color = y)) +
281 | geom_point(size = 2)
282 | p
283 | ```
284 |
285 | > c. Fit a logistic regression model to the data, using $X_1$ and $X_2$ as
286 | > predictors.
287 |
288 | ```{r}
289 | fit1 <- glm(y ~ ., data = train, family = "binomial")
290 | ```
291 |
292 | > d. Apply this model to the _training data_ in order to obtain a predicted class
293 | > label for each training observation. Plot the observations, colored
294 | > according to the _predicted_ class labels. The decision boundary should be
295 | > linear.
296 |
297 | ```{r}
298 | plot_model <- function(fit) {
299 | if (inherits(fit, "svm")) {
300 | train$p <- predict(fit)
301 | } else {
302 | train$p <- factor(as.numeric(predict(fit) > 0))
303 | }
304 | ggplot(train, aes(x = x1, y = x2, color = p)) +
305 | geom_point(size = 2)
306 | }
307 |
308 | plot_model(fit1)
309 | ```
310 |
311 | > e. Now fit a logistic regression model to the data using non-linear functions
312 | > of $X_1$ and $X_2$ as predictors (e.g. $X_1^2, X_1 \times X_2, \log(X_2),$
313 | > and so forth).
314 |
315 | ```{r}
316 | fit2 <- glm(y ~ poly(x1, 2) + poly(x2, 2), data = train, family = "binomial")
317 | ```
318 |
319 | > f. Apply this model to the _training data_ in order to obtain a predicted
320 | > class label for each training observation. Plot the observations, colored
321 | > according to the _predicted_ class labels. The decision boundary should be
322 | > obviously non-linear. If it is not, then repeat (a)-(e) until you come up
323 | > with an example in which the predicted class labels are obviously
324 | > non-linear.
325 |
326 | ```{r}
327 | plot_model(fit2)
328 | ```
329 |
330 | > g. Fit a support vector classifier to the data with $X_1$ and $X_2$ as
331 | > predictors. Obtain a class prediction for each training observation. Plot
332 | > the observations, colored according to the _predicted class labels_.
333 |
334 | ```{r}
335 | fit3 <- svm(y ~ x1 + x2, data = train, kernel = "linear")
336 | plot_model(fit3)
337 | ```
338 |
339 | > h. Fit a SVM using a non-linear kernel to the data. Obtain a class prediction
340 | > for each training observation. Plot the observations, colored according to
341 | > the _predicted class labels_.
342 |
343 | ```{r}
344 | fit4 <- svm(y ~ x1 + x2, data = train, kernel = "polynomial", degree = 2)
345 | plot_model(fit4)
346 | ```
347 |
348 | > i. Comment on your results.
349 |
350 | When simulating data with a quadratic decision boundary, a logistic model with
351 | quadratic transformations of the variables and an SVM with a quadratic kernel
352 | both produce much better (and similar) fits than the standard linear methods.
353 |
354 | ### Question 6
355 |
356 | > At the end of Section 9.6.1, it is claimed that in the case of data that is
357 | > just barely linearly separable, a support vector classifier with a small
358 | > value of `cost` that misclassifies a couple of training observations may
359 | > perform better on test data than one with a huge value of `cost` that does not
360 | > misclassify any training observations. You will now investigate this claim.
361 | >
362 | > a. Generate two-class data with $p = 2$ in such a way that the classes are
363 | > just barely linearly separable.
364 |
365 | ```{r}
366 | set.seed(2)
367 |
368 | # Simulate data that is separable by a line at y = 2.5
369 | data <- data.frame(
370 | x = rnorm(200),
371 | class = sample(c("red", "blue"), 200, replace = TRUE)
372 | )
373 | data$y <- (data$class == "red") * 5 + rnorm(200)
374 |
375 | # Add barely separable points (these are simulated "noise" values)
376 | newdata <- data.frame(x = rnorm(30))
377 | newdata$y <- 1.5 * newdata$x + 3 + rnorm(30, 0, 1)
378 | newdata$class <- ifelse((1.5 * newdata$x + 3) - newdata$y > 0, "blue", "red")
379 |
380 | data <- rbind(data, newdata)
381 |
382 | # remove any that cause misclassification, leaving data that is barely linearly
383 | # separable, but along an axis that is not y = 2.5 (which would be correct
384 | # for the "true" data)
385 | data <- data[!(data$class == "red") == ((1.5 * data$x + 3 - data$y) > 0), ]
386 | data <- data[sample(seq_len(nrow(data)), 200), ]
387 |
388 | p <- ggplot(data, aes(x = x, y = y, color = class)) +
389 | geom_point(size = 2) +
390 | scale_colour_identity() +
391 | geom_abline(intercept = 3, slope = 1.5, lty = 2)
392 | p
393 | ```
394 |
395 | > b. Compute the cross-validation error rates for support vector classifiers
396 | > with a range of `cost` values. How many training errors are misclassified
397 | > for each value of `cost` considered, and how does this relate to the
398 | > cross-validation errors obtained?
399 |
400 | Number of training observations misclassified for each value of `cost`:
401 |
402 | ```{r}
403 | costs <- 10^seq(-3, 5)
404 |
405 | sapply(costs, function(cost) {
406 | fit <- svm(as.factor(class) ~ ., data = data, kernel = "linear", cost = cost)
407 | pred <- predict(fit, data)
408 | sum(pred != data$class)
409 | })
410 | ```
411 |
412 | Cross-validation errors
413 |
414 | ```{r}
415 | out <- tune(svm, as.factor(class) ~ ., data = data, kernel = "linear", ranges = list(cost = costs))
416 | summary(out)
417 | data.frame(
418 | cost = out$performances$cost,
419 | misclass = out$performances$error * nrow(data)
420 | )
421 | ```
422 |
423 | > c. Generate an appropriate test data set, and compute the test errors
424 | > corresponding to each of the values of `cost` considered. Which value of
425 | > `cost` leads to the fewest test errors, and how does this compare to the
426 | > values of `cost` that yield the fewest training errors and the fewest
427 | > cross-validation errors?
428 |
429 | ```{r}
430 | set.seed(2)
431 | test <- data.frame(
432 | x = rnorm(200),
433 | class = sample(c("red", "blue"), 200, replace = TRUE)
434 | )
435 | test$y <- (test$class == "red") * 5 + rnorm(200)
436 | p + geom_point(data = test, pch = 21)
437 |
438 | (errs <- sapply(costs, function(cost) {
439 | fit <- svm(as.factor(class) ~ ., data = data, kernel = "linear", cost = cost)
440 | pred <- predict(fit, test)
441 | sum(pred != test$class)
442 | }))
443 | (cost <- costs[which.min(errs)])
444 |
445 | (fit <- svm(as.factor(class) ~ ., data = data, kernel = "linear", cost = cost))
446 |
447 | test$prediction <- predict(fit, test)
448 | p <- ggplot(test, aes(x = x, y = y, color = class, shape = prediction == class)) +
449 | geom_point(size = 2) +
450 | scale_colour_identity()
451 | p
452 | ```
453 |
454 | > d. Discuss your results.
455 |
456 | A large cost leads to overfitting as the model finds the perfect linear
457 | separation between red and blue in the training data. A lower cost then
458 | leads to improved prediction in the test data.
459 |
460 | ### Question 7
461 |
462 | > In this problem, you will use support vector approaches in order to predict
463 | > whether a given car gets high or low gas mileage based on the `Auto` data set.
464 | >
465 | > a. Create a binary variable that takes on a 1 for cars with gas mileage above
466 | > the median, and a 0 for cars with gas mileage below the median.
467 |
468 | ```{r}
469 | library(ISLR2)
470 | data <- Auto
471 | data$high_mpg <- as.factor(as.numeric(data$mpg > median(data$mpg)))
472 | ```
473 |
474 | > b. Fit a support vector classifier to the data with various values of `cost`,
475 | > in order to predict whether a car gets high or low gas mileage. Report the
476 | > cross-validation errors associated with different values of this parameter.
477 | > Comment on your results. Note you will need to fit the classifier without
478 | > the gas mileage variable to produce sensible results.
479 |
480 | ```{r}
481 | set.seed(42)
482 | costs <- 10^seq(-4, 3, by = 0.5)
483 | results <- list()
484 | f <- high_mpg ~ displacement + horsepower + weight
485 | results$linear <- tune(svm, f,
486 | data = data, kernel = "linear",
487 | ranges = list(cost = costs)
488 | )
489 | summary(results$linear)
490 | ```
491 |
492 | > c. Now repeat (b), this time using SVMs with radial and polynomial basis
493 | > kernels, with different values of `gamma` and `degree` and `cost`. Comment
494 | > on your results.
495 |
496 | ```{r}
497 | results$polynomial <- tune(svm, f,
498 | data = data, kernel = "polynomial",
499 | ranges = list(cost = costs, degree = 1:3)
500 | )
501 | summary(results$polynomial)
502 |
503 | results$radial <- tune(svm, f,
504 | data = data, kernel = "radial",
505 | ranges = list(cost = costs, gamma = 10^(-2:1))
506 | )
507 | summary(results$radial)
508 |
509 | sapply(results, function(x) x$best.performance)
510 | sapply(results, function(x) x$best.parameters)
511 | ```
512 |
513 | > d. Make some plots to back up your assertions in (b) and (c).
514 | >
515 | > _Hint: In the lab, we used the `plot()` function for `svm` objects only in
516 | > cases with $p = 2$. When $p > 2$, you can use the `plot()` function to
517 | > create plots displaying pairs of variables at a time. Essentially, instead
518 | > of typing_
519 | >
520 | > ```r
521 | > > plot(svmfit, dat)
522 | > ```
523 | >
524 | > _where `svmfit` contains your fitted model and dat is a data frame
525 | > containing your data, you can type_
526 | >
527 | > ```r
528 | > > plot(svmfit, dat, x1 ∼ x4)
529 | > ```
530 | >
531 | > _in order to plot just the first and fourth variables. However, you must
532 | > replace `x1` and `x4` with the correct variable names. To find out more,
533 | > type `?plot.svm`._
534 |
535 | ```{r}
536 | table(predict(results$radial$best.model, data), data$high_mpg)
537 |
538 | plot(results$radial$best.model, data, horsepower ~ displacement)
539 | plot(results$radial$best.model, data, horsepower ~ weight)
540 | plot(results$radial$best.model, data, displacement ~ weight)
541 | ```
542 |
543 | ### Question 8
544 |
545 | > This problem involves the `OJ` data set which is part of the `ISLR2` package.
546 | >
547 | > a. Create a training set containing a random sample of 800 observations, and a
548 | > test set containing the remaining observations.
549 |
550 | ```{r}
551 | set.seed(42)
552 | train <- sample(seq_len(nrow(OJ)), 800)
553 | test <- setdiff(seq_len(nrow(OJ)), train)
554 | ```
555 |
556 | > b. Fit a support vector classifier to the training data using `cost = 0.01`,
557 | > with `Purchase` as the response and the other variables as predictors. Use
558 | > the `summary()` function to produce summary statistics, and describe the
559 | > results obtained.
560 |
561 | ```{r}
562 | fit <- svm(Purchase ~ ., data = OJ[train, ], kernel = "linear", cost = 0.01)
563 | summary(fit)
564 | ```
565 |
566 | > c. What are the training and test error rates?
567 |
568 | ```{r}
569 | err <- function(model, data) {
570 | t <- table(predict(model, data), data[["Purchase"]])
571 | 1 - sum(diag(t)) / sum(t)
572 | }
573 | errs <- function(model) {
574 | c(train = err(model, OJ[train, ]), test = err(model, OJ[test, ]))
575 | }
576 | errs(fit)
577 | ```
578 |
579 | > d. Use the `tune()` function to select an optimal cost. Consider values in the
580 | > range 0.01 to 10.
581 |
582 | ```{r}
583 | tuned <- tune(svm, Purchase ~ .,
584 | data = OJ[train, ], kernel = "linear",
585 | ranges = list(cost = 10^seq(-2, 1, length.out = 10))
586 | )
587 | tuned$best.parameters
588 | summary(tuned)
589 | ```
590 |
591 | > e. Compute the training and test error rates using this new value for `cost`.
592 |
593 | ```{r}
594 | errs(tuned$best.model)
595 | ```
596 |
597 | > f. Repeat parts (b) through (e) using a support vector machine with a radial
598 | > kernel. Use the default value for `gamma`.
599 |
600 | ```{r}
601 | tuned2 <- tune(svm, Purchase ~ .,
602 | data = OJ[train, ], kernel = "radial",
603 | ranges = list(cost = 10^seq(-2, 1, length.out = 10))
604 | )
605 | tuned2$best.parameters
606 | errs(tuned2$best.model)
607 | ```
608 |
609 | > g. Repeat parts (b) through (e) using a support vector machine with a
610 | > polynomial kernel. Set `degree = 2`.
611 |
612 | ```{r}
613 | tuned3 <- tune(svm, Purchase ~ .,
614 | data = OJ[train, ], kernel = "polynomial",
615 | ranges = list(cost = 10^seq(-2, 1, length.out = 10)), degree = 2
616 | )
617 | tuned3$best.parameters
618 | errs(tuned3$best.model)
619 | ```
620 |
621 | > h. Overall, which approach seems to give the best results on this data?
622 |
623 | Overall, the radial kernel appears to perform best in this case.
624 |
--------------------------------------------------------------------------------
/08-tree-based-methods.Rmd:
--------------------------------------------------------------------------------
1 | # Tree-Based Methods
2 |
3 | ## Conceptual
4 |
5 | ### Question 1
6 |
7 | > Draw an example (of your own invention) of a partition of two-dimensional
8 | > feature space that could result from recursive binary splitting. Your example
9 | > should contain at least six regions. Draw a decision tree corresponding to
10 | > this partition. Be sure to label all aspects of your figures, including the
11 | > regions $R_1, R_2, ...,$ the cutpoints $t_1, t_2, ...,$ and so forth.
12 | >
13 | > _Hint: Your result should look something like Figures 8.1 and 8.2._
14 |
15 | ```{r, message = FALSE, warning = FALSE}
16 | library(showtext)
17 | showtext::showtext_auto()
18 | library(ggplot2)
19 | library(tidyverse)
20 | library(ggtree)
21 | ```
22 |
23 | ```{r}
24 | tree <- ape::read.tree(text = "(((R1:1,R2:1)N1:2,R3:4)N2:2,(R4:2,(R5:1,R6:1)N3:2)N4:5)R;")
25 | tree$node.label <- c("Age < 40", "Weight < 100", "Weight < 70", "Age < 60", "Weight < 80")
26 |
27 | ggtree(tree, ladderize = FALSE) + scale_x_reverse() + coord_flip() +
28 | geom_tiplab(vjust = 2, hjust = 0.5) +
29 | geom_text2(aes(label = label, subset = !isTip), hjust = -0.1, vjust = -1)
30 | ```
31 |
32 | ```{r}
33 | plot(NULL,
34 | xlab = "Age (years)", ylab = "Weight (kg)",
35 | xlim = c(0, 100), ylim = c(40, 160), xaxs = "i", yaxs = "i"
36 | )
37 | abline(v = 40, col = "red", lty = 2)
38 | lines(c(0, 40), c(100, 100), col = "blue", lty = 2)
39 | lines(c(0, 40), c(70, 70), col = "blue", lty = 2)
40 | abline(v = 60, col = "red", lty = 2)
41 | lines(c(60, 100), c(80, 80), col = "blue", lty = 2)
42 |
43 | text(
44 | c(20, 20, 20, 50, 80, 80),
45 | c(55, 85, 130, 100, 60, 120),
46 | labels = c("R1", "R2", "R3", "R4", "R5", "R6")
47 | )
48 | ```
49 |
50 | ### Question 2
51 |
52 | > It is mentioned in Section 8.2.3 that boosting using depth-one trees (or
53 | > _stumps_) leads to an _additive_ model: that is, a model of the form
54 | > $$
55 | > f(X) = \sum_{j=1}^p f_j(X_j).
56 | > $$
57 | > Explain why this is the case. You can begin with (8.12) in Algorithm 8.2.
58 |
59 | Equation 8.12 (the boosted model output by Algorithm 8.2) is:
60 |
61 | $$
62 | \hat{f}(x) = \sum_{b=1}^B \lambda \hat{f}^b(x)
63 | $$
64 |
65 | where $\hat{f}^b(x)$ represents the $b$th tree, which in this case has a single
66 | split and therefore involves only one predictor. Grouping the stumps by the
67 | predictor they split on gives an additive model in the individual predictors;
68 | depth-two trees would additionally allow interactions between pairs of variables.
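
Making the grouping step explicit: if stump $\hat{f}^b$ splits on predictor
$j(b)$, then

$$
f(X) = \sum_{b=1}^B \lambda \hat{f}^b(X)
     = \sum_{j=1}^p \sum_{b:\, j(b)=j} \lambda \hat{f}^b(X_j)
     = \sum_{j=1}^p f_j(X_j),
$$

where $f_j(X_j) = \sum_{b:\, j(b)=j} \lambda \hat{f}^b(X_j)$ collects the stumps
that split on $X_j$.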
69 |
70 | ### Question 3
71 |
72 | > Consider the Gini index, classification error, and cross-entropy in a simple
73 | > classification setting with two classes. Create a single plot that displays
74 | > each of these quantities as a function of $\hat{p}_{m1}$. The $x$-axis should
75 | > display $\hat{p}_{m1}$, ranging from 0 to 1, and the $y$-axis should display
76 | > the value of the Gini index, classification error, and entropy.
77 | >
78 | > _Hint: In a setting with two classes, $\hat{p}_{m1} = 1 - \hat{p}_{m2}$. You
79 | > could make this plot by hand, but it will be much easier to make in `R`._
80 |
81 | The *Gini index* is defined by
82 |
83 | $$G = \sum_{k=1}^{K} \hat{p}_{mk}(1 - \hat{p}_{mk})$$
84 |
85 | *Entropy* is given by
86 |
87 | $$D = -\sum_{k=1}^{K} \hat{p}_{mk}\log(\hat{p}_{mk})$$
88 |
89 | The *classification error* is
90 |
91 | $$E = 1 - \max_k(\hat{p}_{mk})$$
92 |
93 | ```{r}
94 | # These expressions simplify as follows when there are only two classes
95 | p <- seq(0, 1, length.out = 100)
96 | data.frame(
97 | x = p,
98 | "Gini index" = p * (1 - p) * 2,
99 | "Entropy" = -(p * log(p) + (1 - p) * log(1 - p)),
100 | "Classification error" = 1 - pmax(p, 1 - p),
101 | check.names = FALSE
102 | ) |>
103 | pivot_longer(!x) |>
104 | ggplot(aes(x = x, y = value, color = name)) +
105 | geom_line(na.rm = TRUE)
106 | ```
107 |
108 | ### Question 4
109 |
110 | > This question relates to the plots in Figure 8.12.
111 | >
112 | > a. Sketch the tree corresponding to the partition of the predictor space
113 | > illustrated in the left-hand panel of Figure 8.12. The numbers inside the
114 | > boxes indicate the mean of $Y$ within each region.
115 |
116 | ```{r}
117 | tree <- ape::read.tree(text = "(((3:1.5,(10:1,0:1)A:1)B:1,15:2)C:1,5:2)D;")
118 | tree$node.label <- c("X1 < 1", "X2 < 1", "X1 < 0", "X2 < 0")
119 |
120 | ggtree(tree, ladderize = FALSE) + scale_x_reverse() + coord_flip() +
121 | geom_tiplab(vjust = 2, hjust = 0.5) +
122 | geom_text2(aes(label = label, subset = !isTip), hjust = -0.1, vjust = -1)
123 | ```
124 |
125 | > b. Create a diagram similar to the left-hand panel of Figure 8.12, using the
126 | > tree illustrated in the right-hand panel of the same figure. You should
127 | > divide up the predictor space into the correct regions, and indicate the
128 | > mean for each region.
129 |
130 | ```{r}
131 | plot(NULL, xlab = "X1", ylab = "X2", xlim = c(-1, 2), ylim = c(0, 3), xaxs = "i", yaxs = "i")
132 | abline(h = 1, col = "red", lty = 2)
133 | lines(c(1, 1), c(0, 1), col = "blue", lty = 2)
134 | lines(c(-1, 2), c(2, 2), col = "red", lty = 2)
135 | lines(c(0, 0), c(1, 2), col = "blue", lty = 2)
136 | text(
137 | c(0, 1.5, -0.5, 1, 0.5),
138 | c(0.5, 0.5, 1.5, 1.5, 2.5),
139 | labels = c("-1.80", "0.63", "-1.06", "0.21", "2.49")
140 | )
141 | ```
142 |
143 | ### Question 5
144 |
145 | > Suppose we produce ten bootstrapped samples from a data set containing red and
146 | > green classes. We then apply a classification tree to each bootstrapped sample
147 | > and, for a specific value of $X$, produce 10 estimates of
148 | > $P(\textrm{Class is Red}|X)$:
149 | > $$0.1, 0.15, 0.2, 0.2, 0.55, 0.6, 0.6, 0.65, 0.7, \textrm{and } 0.75.$$
150 | > There are two common ways to combine these results together into a single
151 | > class prediction. One is the majority vote approach discussed in this chapter.
152 | > The second approach is to classify based on the average probability. In this
153 | > example, what is the final classification under each of these two approaches?
154 |
155 | ```{r}
156 | x <- c(0.1, 0.15, 0.2, 0.2, 0.55, 0.6, 0.6, 0.65, 0.7, 0.75)
157 | ifelse(mean(x > 0.5) > 0.5, "red", "green") # majority vote
158 | ifelse(mean(x) > 0.5, "red", "green") # average probability
159 | ```
160 |
161 | ### Question 6
162 |
163 | > Provide a detailed explanation of the algorithm that is used to fit a
164 | > regression tree.
165 |
166 | First we perform recursive binary splitting of the data, choosing at each
167 | split the predictor and cutpoint that minimize the RSS, and continuing until
168 | each terminal node contains fewer than some minimum number of observations
169 | (e.g. 5). We then apply cost-complexity pruning, which indexes a sequence of
170 | subtrees by a tuning parameter $\alpha$. Using K-fold cross-validation, we
171 | select the $\alpha$ that minimizes the cross-validation error. The final tree
172 | is the tree grown on the full data set, pruned with this value of $\alpha$.
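
A minimal sketch of these steps, using the `tree` package with the `Boston`
data purely as an illustration:

```{r}
library(tree)
library(ISLR2)
fit <- tree(medv ~ ., data = Boston) # recursive binary splitting on RSS
cv <- cv.tree(fit) # K-fold CV over the cost-complexity pruning sequence
best <- cv$size[which.min(cv$dev)] # subtree size minimising the CV deviance
pruned <- prune.tree(fit, best = best) # final pruned tree
```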
172 |
173 | ## Applied
174 |
175 | ### Question 7
176 |
177 | > In the lab, we applied random forests to the `Boston` data using `mtry = 6`
178 | > and using `ntree = 25` and `ntree = 500`. Create a plot displaying the test
179 | > error resulting from random forests on this data set for a more comprehensive
180 | > range of values for `mtry` and `ntree`. You can model your plot after Figure
181 | > 8.10. Describe the results obtained.
182 |
183 | ```{r}
184 | library(ISLR2)
185 | library(randomForest)
186 | set.seed(42)
187 |
188 | train <- sample(c(TRUE, FALSE), nrow(Boston), replace = TRUE)
189 |
190 | rf_err <- function(mtry) {
191 | randomForest(
192 | Boston[train, -13],
193 | y = Boston[train, 13],
194 | xtest = Boston[!train, -13],
195 | ytest = Boston[!train, 13],
196 | mtry = mtry,
197 | ntree = 500
198 | )$test$mse
199 | }
200 | res <- lapply(c(1, 2, 3, 5, 7, 10, 12), rf_err)
201 | names(res) <- c(1, 2, 3, 5, 7, 10, 12)
202 | data.frame(res, check.names = FALSE) |>
203 | mutate(n = 1:500) |>
204 | pivot_longer(!n) |>
205 | ggplot(aes(x = n, y = value, color = name)) +
206 | geom_line(na.rm = TRUE) +
207 | xlab("Number of trees") +
208 | ylab("Error") +
209 | scale_y_log10() +
210 | scale_color_discrete(name = "No. variables at\neach split")
211 | ```
212 |
213 | ### Question 8
214 |
215 | > In the lab, a classification tree was applied to the `Carseats` data set after
216 | > converting `Sales` into a qualitative response variable. Now we will seek to
217 | > predict `Sales` using regression trees and related approaches, treating the
218 | > response as a quantitative variable.
219 | >
220 | > a. Split the data set into a training set and a test set.
221 |
222 | ```{r}
223 | set.seed(42)
224 | train <- sample(c(TRUE, FALSE), nrow(Carseats), replace = TRUE)
225 | ```
226 |
227 | > b. Fit a regression tree to the training set. Plot the tree, and interpret the
228 | > results. What test error rate do you obtain?
229 |
230 | ```{r}
231 | library(tree)
232 | tr <- tree(Sales ~ ., data = Carseats[train, ])
233 | summary(tr)
234 | plot(tr)
235 | text(tr, pretty = 0, digits = 2, cex = 0.8)
236 |
237 | carseats_mse <- function(model) {
238 | p <- predict(model, newdata = Carseats[!train, ])
239 | mean((p - Carseats[!train, "Sales"])^2)
240 | }
241 | carseats_mse(tr)
242 | ```
243 |
244 | > c. Use cross-validation in order to determine the optimal level of tree
245 | > complexity. Does pruning the tree improve the test error rate?
246 |
247 | ```{r}
248 | res <- cv.tree(tr)
249 | plot(res$size, res$dev, type = "b", xlab = "Tree size", ylab = "Deviance")
250 | min <- which.min(res$dev)
251 | abline(v = res$size[min], lty = 2, col = "red")
252 | ```
253 |
254 | Pruning improves performance very slightly (though this is not repeatable in
255 | different rounds of cross-validation). Arguably, a good balance is achieved
256 | when the tree size is 11.
257 |
258 | ```{r}
259 | ptr <- prune.tree(tr, best = 11)
260 | plot(ptr)
261 | text(ptr, pretty = 0, digits = 2, cex = 0.8)
262 | carseats_mse(ptr)
263 | ```
264 |
265 | > d. Use the bagging approach in order to analyze this data. What test error
266 | > rate do you obtain? Use the `importance()` function to determine which
267 | > variables are most important.
268 |
269 | ```{r}
270 | # Here we can use random Forest with mtry = 10 = p (the number of predictor
271 | # variables) to perform bagging
272 | bagged <- randomForest(Sales ~ .,
273 | data = Carseats[train, ], mtry = 10,
274 | ntree = 200, importance = TRUE
275 | )
276 | carseats_mse(bagged)
277 | importance(bagged)
278 | ```
279 |
280 | The test MSE is ~2.8, which is a substantial improvement over the pruned
281 | regression tree above.
282 |
283 | > e. Use random forests to analyze this data. What test error rate do you
284 | > obtain? Use the `importance()` function to determine which variables are
285 | > most important. Describe the effect of $m$, the number of variables
286 | > considered at each split, on the error rate obtained.
287 |
288 | ```{r}
289 | rf <- randomForest(Sales ~ .,
290 | data = Carseats[train, ], mtry = 3,
291 | ntree = 500, importance = TRUE
292 | )
293 | carseats_mse(rf)
294 | importance(rf)
295 | ```
296 |
297 | The test MSE is ~3.0, which is a substantial improvement over the pruned
298 | regression tree above, although not quite as good as the bagging approach.
299 |
300 | > f. Now analyze the data using BART, and report your results.
301 |
302 | ```{r}
303 | library(BART)
304 |
305 | # For ease, we'll create a fake "predict" method that just returns
306 | # yhat.test.mean regardless of provided "newdata"
307 | predict.wbart <- function(model, ...) model$yhat.test.mean
308 |
309 | bartfit <- gbart(Carseats[train, 2:11], Carseats[train, 1],
310 | x.test = Carseats[!train, 2:11]
311 | )
312 | carseats_mse(bartfit)
313 | ```
314 |
315 | The test MSE is ~1.6, which is an improvement over both random forests and
316 | bagging.
317 |
318 | ### Question 9
319 |
320 | > This problem involves the `OJ` data set which is part of the `ISLR2` package.
321 | >
322 | > a. Create a training set containing a random sample of 800 observations, and a
323 | > test set containing the remaining observations.
324 |
325 | ```{r}
326 | set.seed(42)
327 | train <- sample(1:nrow(OJ), 800)
328 | test <- setdiff(1:nrow(OJ), train)
329 | ```
330 |
331 | > b. Fit a tree to the training data, with `Purchase` as the response and the
332 | > other variables except for `Buy` as predictors. Use the `summary()`
333 | > function to produce summary statistics about the tree, and describe the
334 | > results obtained. What is the training error rate? How many terminal nodes
335 | > does the tree have?
336 |
337 | ```{r}
338 | tr <- tree(Purchase ~ ., data = OJ[train, ])
339 | summary(tr)
340 | ```
341 |
342 | > c. Type in the name of the tree object in order to get a detailed text output.
343 | > Pick one of the terminal nodes, and interpret the information displayed.
344 |
345 | ```{r}
346 | tr
347 | ```
348 |
349 | > d. Create a plot of the tree, and interpret the results.
350 |
351 | ```{r}
352 | plot(tr)
353 | text(tr, pretty = 0, digits = 2, cex = 0.8)
354 | ```
355 |
356 | > e. Predict the response on the test data, and produce a confusion matrix
357 | > comparing the test labels to the predicted test labels. What is the test
358 | > error rate?
359 |
360 | ```{r}
361 | table(predict(tr, OJ[test, ], type = "class"), OJ[test, "Purchase"])
362 | mean(predict(tr, OJ[test, ], type = "class") != OJ[test, "Purchase"])
362 | ```
363 |
364 | > f. Apply the `cv.tree()` function to the training set in order to determine
365 | > the optimal tree size.
366 |
367 | ```{r}
368 | set.seed(42)
369 | res <- cv.tree(tr)
370 | ```
371 |
372 | > g. Produce a plot with tree size on the $x$-axis and cross-validated
373 | > classification error rate on the $y$-axis.
374 |
375 | ```{r}
376 | plot(res$size, res$dev, type = "b", xlab = "Tree size", ylab = "Deviance")
377 | min <- which.min(res$dev)
378 | abline(v = res$size[min], lty = 2, col = "red")
379 | ```
380 |
381 | > h. Which tree size corresponds to the lowest cross-validated classification
382 | > error rate?
383 |
384 | ```{r}
385 | res$size[min]
386 | ```
387 |
388 | > i. Produce a pruned tree corresponding to the optimal tree size obtained using
389 | > cross-validation. If cross-validation does not lead to selection of a
390 | > pruned tree, then create a pruned tree with five terminal nodes.
391 |
392 | ```{r}
393 | ptr <- prune.tree(tr, best = res$size[min])
394 | plot(ptr)
395 | text(ptr, pretty = 0, digits = 2, cex = 0.8)
396 | ```
397 |
398 | > j. Compare the training error rates between the pruned and unpruned trees.
399 | > Which is higher?
400 |
401 | ```{r}
402 | oj_misclass <- function(model) {
403 | summary(model)$misclass[1] / summary(model)$misclass[2]
404 | }
405 | oj_misclass(tr)
406 | oj_misclass(ptr)
407 | ```
408 |
409 | The training misclassification error rate is slightly higher for the pruned tree.
410 |
411 | > k. Compare the test error rates between the pruned and unpruned trees. Which
412 | > is higher?
413 |
414 | ```{r}
415 | oj_err <- function(model) {
416 | p <- predict(model, newdata = OJ[test, ], type = "class")
417 | mean(p != OJ[test, "Purchase"])
418 | }
419 | oj_err(tr)
420 | oj_err(ptr)
421 | ```
422 |
423 | The test misclassification error rate is slightly higher for the pruned tree.
424 |
425 | ### Question 10
426 |
427 | > We now use boosting to predict `Salary` in the `Hitters` data set.
428 | >
429 | > a. Remove the observations for whom the salary information is unknown, and
430 | > then log-transform the salaries.
431 |
432 | ```{r}
433 | dat <- Hitters
434 | dat <- dat[!is.na(dat$Salary), ]
435 | dat$Salary <- log(dat$Salary)
436 | ```
437 |
438 | > b. Create a training set consisting of the first 200 observations, and a test
439 | > set consisting of the remaining observations.
440 |
441 | ```{r}
442 | train <- 1:200
443 | test <- setdiff(1:nrow(dat), train)
444 | ```
445 |
446 | > c. Perform boosting on the training set with 1,000 trees for a range of values
447 | > of the shrinkage parameter $\lambda$. Produce a plot with different
448 | > shrinkage values on the $x$-axis and the corresponding training set MSE on
449 | > the $y$-axis.
450 |
451 | ```{r}
452 | library(gbm)
453 | set.seed(42)
454 | lambdas <- 10^seq(-3, 0, by = 0.1)
455 | fits <- lapply(lambdas, function(lam) {
456 | gbm(Salary ~ .,
457 | data = dat[train, ], distribution = "gaussian",
458 | n.trees = 1000, shrinkage = lam
459 | )
460 | })
461 | errs <- sapply(fits, function(fit) {
462 | p <- predict(fit, dat[train, ], n.trees = 1000)
463 | mean((p - dat[train, ]$Salary)^2)
464 | })
465 | plot(lambdas, errs,
466 | type = "b", xlab = "Shrinkage values",
467 | ylab = "Training MSE", log = "xy"
468 | )
469 | ```
470 |
471 | > d. Produce a plot with different shrinkage values on the $x$-axis and the
472 | > corresponding test set MSE on the $y$-axis.
473 |
474 | ```{r}
475 | errs <- sapply(fits, function(fit) {
476 | p <- predict(fit, dat[test, ], n.trees = 1000)
477 | mean((p - dat[test, ]$Salary)^2)
478 | })
479 | plot(lambdas, errs,
480 | type = "b", xlab = "Shrinkage values",
481 |   ylab = "Test MSE", log = "xy"
482 | )
483 | min(errs)
484 | abline(v = lambdas[which.min(errs)], lty = 2, col = "red")
485 | ```
486 |
487 | > e. Compare the test MSE of boosting to the test MSE that results from applying
488 | > two of the regression approaches seen in Chapters 3 and 6.
489 |
490 | Linear regression
491 |
492 | ```{r}
493 | fit1 <- lm(Salary ~ ., data = dat[train, ])
494 | mean((predict(fit1, dat[test, ]) - dat[test, "Salary"])^2)
495 | ```
496 |
497 | The lasso (`alpha = 1`)
498 |
499 | ```{r}
500 | library(glmnet)
501 | x <- model.matrix(Salary ~ ., data = dat[train, ])
502 | x.test <- model.matrix(Salary ~ ., data = dat[test, ])
503 | y <- dat[train, "Salary"]
504 | fit2 <- glmnet(x, y, alpha = 1)
505 | mean((predict(fit2, s = 0.1, newx = x.test) - dat[test, "Salary"])^2)
506 | ```
507 |
508 | > f. Which variables appear to be the most important predictors in the boosted
509 | > model?
510 |
511 | ```{r}
512 | summary(fits[[which.min(errs)]])
513 | ```
514 |
515 | > g. Now apply bagging to the training set. What is the test set MSE for this
516 | > approach?
517 |
518 | ```{r}
519 | set.seed(42)
520 | bagged <- randomForest(Salary ~ ., data = dat[train, ], mtry = 19, ntree = 1000)
521 | mean((predict(bagged, newdata = dat[test, ]) - dat[test, "Salary"])^2)
522 | ```
523 |
524 | ### Question 11
525 |
526 | > This question uses the `Caravan` data set.
527 | >
528 | > a. Create a training set consisting of the first 1,000 observations, and a
529 | > test set consisting of the remaining observations.
530 |
531 | ```{r}
532 | train <- 1:1000
533 | test <- setdiff(1:nrow(Caravan), train)
534 | ```
535 |
536 | > b. Fit a boosting model to the training set with `Purchase` as the response
537 | > and the other variables as predictors. Use 1,000 trees, and a shrinkage
538 | > value of 0.01. Which predictors appear to be the most important?
539 |
540 | ```{r}
541 | set.seed(42)
542 | fit <- gbm(as.numeric(Purchase == "Yes") ~ ., data = Caravan[train, ], n.trees = 1000, shrinkage = 0.01)
543 | head(summary(fit))
544 | ```
545 |
546 | > c. Use the boosting model to predict the response on the test data. Predict
547 | > that a person will make a purchase if the estimated probability of purchase
548 | > is greater than 20%. Form a confusion matrix. What fraction of the people
549 | > predicted to make a purchase do in fact make one? How does this compare
550 | > with the results obtained from applying KNN or logistic regression to this
551 | > data set?
552 |
553 | ```{r}
554 | p <- predict(fit, Caravan[test, ], n.trees = 1000, type = "response")
555 | table(p > 0.2, Caravan[test, "Purchase"] == "Yes")
556 | sum(p > 0.2 & Caravan[test, "Purchase"] == "Yes") / sum(p > 0.2)
557 | ```
558 |
559 | 141 (109 + 32) people are predicted to make a purchase. Of these, 32 actually
560 | do, which is 21%.
560 |
561 | ```{r}
562 | # Logistic regression
563 | fit <- glm(Purchase == "Yes" ~ ., data = Caravan[train, ], family = "binomial")
564 | p <- predict(fit, Caravan[test, ], type = "response")
565 | table(p > 0.2, Caravan[test, "Purchase"] == "Yes")
566 | sum(p > 0.2 & Caravan[test, "Purchase"] == "Yes") / sum(p > 0.2)
567 | ```
568 |
569 | For logistic regression we correctly predict 14% of those predicted to purchase.
570 |
571 | ```{r}
572 | library(class)
573 | # KNN
574 | fit <- knn(Caravan[train, -86], Caravan[test, -86], Caravan$Purchase[train])
575 | table(fit, Caravan[test, "Purchase"] == "Yes")
576 | sum(fit == "Yes" & Caravan[test, "Purchase"] == "Yes") / sum(fit == "Yes")
577 | ```
578 |
579 | For KNN we correctly predict 8.7% of those predicted to purchase.
580 |
581 | ### Question 12
582 |
583 | > Apply boosting, bagging, random forests and BART to a data set of your choice.
584 | > Be sure to fit the models on a training set and to evaluate their performance
585 | > on a test set. How accurate are the results compared to simple methods like
586 | > linear or logistic regression? Which of these approaches yields the best
587 | > performance?
588 |
589 | Here I'm going to use the `College` dataset (as in Question 10 from Chapter 7),
590 | so that we can compare performance with the GAM we previously built. In this
591 | model we are trying to predict `Outstate` using the other variables in `College`.
592 |
593 | ```{r}
594 | library(gam)
595 | set.seed(42)
596 | train <- sample(1:nrow(College), 400)
597 | test <- setdiff(1:nrow(College), train)
598 |
599 | # Linear regression
600 | lr <- gam(Outstate ~ ., data = College[train, ])
601 |
602 | # GAM from chapter 7
603 | gam <- gam(Outstate ~ Private + s(Room.Board, 2) + s(PhD, 2) +
604 | s(perc.alumni, 2) + s(Expend, 2) + s(Grad.Rate, 2), data = College[train, ])
605 |
606 | # Boosting
607 | boosted <- gbm(Outstate ~ ., data = College[train, ], n.trees = 1000, shrinkage = 0.01)
608 |
609 | # Bagging (random forest with mtry = no. predictors)
610 | bagged <- randomForest(Outstate ~ ., data = College[train, ], mtry = 17, ntree = 1000)
611 |
612 | # Random forest with mtry = sqrt(no. predictors)
613 | rf <- randomForest(Outstate ~ ., data = College[train, ], mtry = 4, ntree = 1000)
614 |
615 | # BART
616 | pred <- setdiff(colnames(College), "Outstate")
617 | bart <- gbart(College[train, pred], College[train, "Outstate"],
618 | x.test = College[test, pred]
619 | )
620 |
621 | mse <- function(model, ...) {
622 | pred <- predict(model, College[test, ], ...)
623 | mean((College$Outstate[test] - pred)^2)
624 | }
625 |
626 | res <- c(
627 | "Linear regression" = mse(lr),
628 | "GAM" = mse(gam),
629 | "Boosting" = mse(boosted, n.trees = 1000),
630 | "Bagging" = mse(bagged),
631 | "Random forest" = mse(rf),
632 | "BART" = mse(bart)
633 | )
634 | res <- data.frame("MSE" = res)
635 | res$Model <- factor(row.names(res), levels = rev(row.names(res)))
636 | ggplot(res, aes(Model, MSE)) +
637 | coord_flip() +
638 | geom_bar(stat = "identity", fill = "steelblue")
639 | ```
640 |
641 | In this case, it looks like bagging produces the best performing model in terms
642 | of test mean square error.
643 |
--------------------------------------------------------------------------------
/10-deep-learning.Rmd:
--------------------------------------------------------------------------------
1 | # Deep Learning
2 |
3 | ## Conceptual
4 |
5 | ### Question 1
6 |
7 | > Consider a neural network with two hidden layers: $p = 4$ input units, 2 units
8 | > in the first hidden layer, 3 units in the second hidden layer, and a single
9 | > output.
10 | >
11 | > a. Draw a picture of the network, similar to Figures 10.1 or 10.4.
12 |
13 | ```{r, echo=FALSE, out.width="80%"}
14 | knitr::include_graphics("images/nn.png")
15 | ```
16 |
17 | > b. Write out an expression for $f(X)$, assuming ReLU activation functions. Be
18 | > as explicit as you can!
19 |
20 | The three layers (from our final output layer back to the start of our network)
21 | can be described as:
22 |
23 | \begin{align*}
24 | f(X) &= g(w_{0}^{(3)} + \sum^{K_2}_{l=1} w_{l}^{(3)} A_l^{(2)}) \\
25 | A_l^{(2)} &= h_l^{(2)}(X) = g(w_{l0}^{(2)} + \sum_{k=1}^{K_1} w_{lk}^{(2)} A_k^{(1)})\\
26 | A_k^{(1)} &= h_k^{(1)}(X) = g(w_{k0}^{(1)} + \sum_{j=1}^p w_{kj}^{(1)} X_j) \\
27 | \end{align*}
28 |
29 | for $l = 1, ..., K_2 = 3$ and $k = 1, ..., K_1 = 2$ and $p = 4$, where,
30 |
31 | $$
32 | g(z) = (z)_+ = \begin{cases}
33 | 0, & \text{if } z < 0 \\
34 | z, & \text{otherwise}
35 | \end{cases}
36 | $$
37 |
38 | > c. Now plug in some values for the coefficients and write out the value of
39 | > $f(X)$.
40 |
41 | We can perhaps achieve this most easily by fitting a real model. Note,
42 | in the plot shown here, we also include the "bias" or intercept terms.
43 |
44 | ```{r}
45 | library(ISLR2)
46 | library(neuralnet)
47 | library(sigmoid)
48 | set.seed(5)
49 | train <- sample(seq_len(nrow(ISLR2::Boston)), nrow(ISLR2::Boston) * 2 / 3)
50 |
51 | net <- neuralnet(crim ~ lstat + medv + ptratio + rm,
52 | data = ISLR2::Boston[train, ],
53 | act.fct = relu,
54 | hidden = c(2, 3)
55 | )
56 | plot(net)
57 | ```
58 |
59 | We can make a prediction for a given observation using this object.
60 |
61 | Firstly, let's find an "ambiguous" test sample
62 |
63 | ```{r}
64 | p <- predict(net, ISLR2::Boston[-train, ])
65 | x <- ISLR2::Boston[-train, ][which.min(abs(p - mean(c(max(p), min(p))))), ]
66 | x <- x[, c("lstat", "medv", "ptratio", "rm")]
67 | predict(net, x)
68 | ```
69 |
70 | Or, repeating by "hand":
71 |
72 | ```{r}
73 | g <- function(x) ifelse(x > 0, x, 0) # relu activation function
74 | w <- net$weights[[1]] # the estimated weights for each layer
75 | v <- as.numeric(x) # our input predictors
76 |
77 | # to calculate our prediction we can take the dot product of our predictors
78 | # (with 1 at the start for the bias term) and each layer's weights (lw)
79 | for (lw in w) v <- g(c(1, v) %*% lw)
80 | v
81 | ```
82 |
83 | > d. How many parameters are there?
84 |
85 | ```{r}
86 | length(unlist(net$weights))
87 | ```
88 |
89 | There are $4 \times 2 + 2 + 2 \times 3 + 3 + 3 \times 1 + 1 = 23$ parameters.
90 |
91 | ### Question 2
92 |
93 | > Consider the _softmax_ function in (10.13) (see also (4.13) on page 141)
94 | > for modeling multinomial probabilities.
95 | >
96 | > a. In (10.13), show that if we add a constant $c$ to each of the $z_l$, then
97 | > the probability is unchanged.
98 |
99 | If we add a constant $c$ to each $Z_l$ in equation 10.13 we get:
100 |
101 | \begin{align*}
102 | Pr(Y=m|X)
103 | &= \frac{e^{Z_m+c}}{\sum_{l=0}^9e^{Z_l+c}} \\
104 | &= \frac{e^{Z_m}e^c}{\sum_{l=0}^9e^{Z_l}e^c} \\
105 | &= \frac{e^{Z_m}e^c}{e^c\sum_{l=0}^9e^{Z_l}} \\
106 | &= \frac{e^{Z_m}}{\sum_{l=0}^9e^{Z_l}} \\
107 | \end{align*}
108 |
109 | which is just equation 10.13.
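
A quick numerical check of this invariance, using arbitrary scores $z$ and a
constant $c = 5$:

```{r}
softmax <- function(z) exp(z) / sum(exp(z))
z <- c(1.2, -0.3, 2.5)
all.equal(softmax(z), softmax(z + 5))
```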
110 |
111 | > b. In (4.13), show that if we add constants $c_j$, $j = 0,1,...,p$, to each of
112 | > the corresponding coefficients for each of the classes, then the predictions
113 | > at any new point $x$ are unchanged.
114 |
115 | 4.13 is
116 |
117 | $$
118 | Pr(Y=k|X=x) = \frac
119 | {e^{\beta_{K0} + \beta_{K1}x_1 + ... + \beta_{Kp}x_p}}
120 | {\sum_{l=1}^K e^{\beta_{l0} + \beta_{l1}x_1 + ... + \beta_{lp}x_p}}
121 | $$
122 |
123 | adding constants $c_j$ to the corresponding coefficients of each class gives:
124 |
125 | \begin{align*}
126 | Pr(Y=k|X=x)
127 | &= \frac
128 | {e^{(\beta_{K0} + c_0) + (\beta_{K1} + c_1)x_1 + ... + (\beta_{Kp} + c_p)x_p}}
129 | {\sum_{l=1}^K e^{(\beta_{l0} + c_0) + (\beta_{l1} + c_1)x_1 + ... + (\beta_{lp} + c_p)x_p}} \\
130 | &= \frac
131 | {e^{c_0 + c_1x_1 + ... + c_px_p}e^{\beta_{K0} + \beta_{K1}x_1 + ... + \beta_{Kp}x_p}}
132 | {\sum_{l=1}^K e^{c_0 + c_1x_1 + ... + c_px_p}e^{\beta_{l0} + \beta_{l1}x_1 + ... + \beta_{lp}x_p}} \\
133 | &= \frac
134 | {e^{c_0 + c_1x_1 + ... + c_px_p}e^{\beta_{K0} + \beta_{K1}x_1 + ... + \beta_{Kp}x_p}}
135 | {e^{c_0 + c_1x_1 + ... + c_px_p}\sum_{l=1}^K e^{\beta_{l0} + \beta_{l1}x_1 + ... + \beta_{lp}x_p}} \\
136 | &= \frac
137 | {e^{\beta_{K0} + \beta_{K1}x_1 + ... + \beta_{Kp}x_p}}
138 | {\sum_{l=1}^K e^{\beta_{l0} + \beta_{l1}x_1 + ... + \beta_{lp}x_p}} \\
139 | \end{align*}
140 |
141 | which collapses to 4.13 (with the same argument as above).
142 |
143 | > This shows that the softmax function is _over-parametrized_. However,
144 | > regularization and SGD typically constrain the solutions so that this is not a
145 | > problem.
146 |
147 | ### Question 3
148 |
149 | > Show that the negative multinomial log-likelihood (10.14) is equivalent to
150 | > the negative log of the likelihood expression (4.5) when there are $M = 2$
151 | > classes.
152 |
153 | Equation 10.14 is
154 |
155 | $$
156 | -\sum_{i=1}^n \sum_{m=0}^9 y_{im}\log(f_m(x_i))
157 | $$
158 |
159 | Equation 4.5 is:
160 |
161 | $$
162 | \ell(\beta_0, \beta_1) = \prod_{i:y_i=1}p(x_i) \prod_{i':y_i'=0}(1-p(x_i'))
163 | $$
164 |
165 | So, $\log(\ell)$ is:
166 |
167 | \begin{align*}
168 | \log(\ell)
169 | &= \log \left( \prod_{i:y_i=1}p(x_i) \prod_{i':y_i'=0}(1-p(x_i')) \right ) \\
170 | &= \sum_{i:y_i=1}\log(p(x_i)) + \sum_{i':y_i'=0}\log(1-p(x_i')) \\
171 | \end{align*}
172 |
173 | If we set $y_i$ to be an indicator variable such that $y_{i1}$ and $y_{i0}$ are
174 | 1 and 0 (or 0 and 1) when our $i$th observation is 1 (or 0) respectively, then
175 | we can write:
176 |
177 | $$
178 | \log(\ell) = \sum_{i}y_{i1}\log(p(x_i)) + \sum_{i}y_{i0}\log(1-p(x_i))
179 | $$
180 |
181 | If we also let $f_1(x) = p(x)$ and $f_0(x) = 1 - p(x)$ then:
182 |
183 | \begin{align*}
184 | \log(\ell)
185 | &= \sum_i y_{i1}\log(f_1(x_i)) + \sum_{i}y_{i0}\log(f_0(x_i)) \\
186 | &= \sum_i \sum_{m=0}^1 y_{im}\log(f_m(x_i)) \\
187 | \end{align*}
188 |
189 | When we take the negative of this, it is equivalent to 10.14 for two classes
190 | ($m = 0,1$).
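
A quick numerical check that the two expressions agree when $M = 2$ (the
probabilities and labels below are arbitrary):

```{r}
set.seed(1)
p <- runif(5) # fitted probabilities f_1(x_i) = p(x_i)
y <- rbinom(5, 1, p) # observed classes
# negative log of the likelihood in (4.5)
-sum(y * log(p) + (1 - y) * log(1 - p))
# negative multinomial log-likelihood (10.14) with m = 0, 1
-sum(cbind(1 - y, y) * log(cbind(1 - p, p)))
```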
191 |
192 | ### Question 4
193 |
194 | > Consider a CNN that takes in $32 \times 32$ grayscale images and has a single
195 | > convolution layer with three $5 \times 5$ convolution filters (without
196 | > boundary padding).
197 | >
198 | > a. Draw a sketch of the input and first hidden layer similar to Figure 10.8.
199 |
200 | ```{r, echo=FALSE, out.width="50%"}
201 | knitr::include_graphics("images/nn2.png")
202 | ```
203 |
204 | Note that, because there is no boundary padding, each of the three convolved
205 | feature maps will be a $28 \times 28$ array.
206 |
207 | > b. How many parameters are in this model?
208 |
209 | There are 3 convolution filters, each with $5 \times 5$ weights (plus 3 bias
210 | terms) to estimate, therefore $3 \times 5 \times 5 + 3 = 78$ parameters.
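
If the keras R package and a backend are available, this count can be checked
directly (chunk not evaluated here):

```{r, eval = FALSE}
library(keras)
m <- keras_model_sequential() |>
  layer_conv_2d(
    filters = 3, kernel_size = c(5, 5),
    input_shape = c(32, 32, 1) # 32x32 grayscale input
  )
m # reports (5 * 5 * 1 + 1) * 3 = 78 parameters
```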
211 |
212 | > c. Explain how this model can be thought of as an ordinary feed-forward
213 | > neural network with the individual pixels as inputs, and with constraints on
214 | > the weights in the hidden units. What are the constraints?
215 |
216 | We can think of a convolution layer as a regularized fully connected layer.
217 | The regularization in this case is due to not all inputs being connected to
218 | all outputs, and weights being shared between connections.
219 |
220 | Each output node in the convolved image can be thought of as taking inputs from
221 | a limited number of input pixels (the neighboring pixels), with a set of
222 | weights specified by the convolution layer which are then shared by the
223 | connections to all other output nodes.
224 |
225 | > d. If there were no constraints, then how many weights would there be in the
226 | > ordinary feed-forward neural network in (c)?
227 |
228 | With no constraints, we would connect every input pixel in our original
229 | 32 × 32 image to every output unit in each of the three feature maps, with a
230 | bias term for each output unit. So each output unit would require 32 × 32
231 | weights plus 1 bias term. This would give a total of
232 | (32 × 32 + 1) × 28 × 28 × 3 = 2,410,800 parameters.
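
As a check of the arithmetic:

```{r}
(32 * 32 + 1) * 28 * 28 * 3
```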
233 |
234 | ### Question 5
235 |
236 | > In Table 10.2 on page 433, we see that the ordering of the three methods with
237 | > respect to mean absolute error is different from the ordering with respect to
238 | > test set $R^2$. How can this be?
239 |
240 | Mean absolute error considers _absolute_ differences between predictions and
241 | observed values, whereas $R^2$ considers the (normalized) sum of _squared_
242 | differences. Larger errors therefore contribute relatively more to $R^2$ than
243 | to mean absolute error, so the two criteria can rank models differently.
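
A small illustration with made-up predictions: model A has the lower mean
absolute error, while model B has the higher $R^2$, because the squared-error
criterion penalizes A's single large error more heavily.

```{r}
y <- c(0, 0, 0, 0, 10)
a <- c(1, 1, 1, 1, 6) # several small errors plus one large error
b <- c(2, 2, 2, 2, 9) # larger typical errors, but no large error
r2 <- function(pred) 1 - sum((y - pred)^2) / sum((y - mean(y))^2)
c(MAE_A = mean(abs(y - a)), MAE_B = mean(abs(y - b)))
c(R2_A = r2(a), R2_B = r2(b))
```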
244 |
245 | ## Applied
246 |
247 | ### Question 6
248 |
249 | > Consider the simple function $R(\beta) = sin(\beta) + \beta/10$.
250 | >
251 | > a. Draw a graph of this function over the range $\beta \in [-6, 6]$.
252 |
253 | ```{r}
254 | r <- function(x) sin(x) + x / 10
255 | x <- seq(-6, 6, 0.1)
256 | plot(x, r(x), type = "l")
257 | ```
258 |
259 | > b. What is the derivative of this function?
260 |
261 | $$
262 | \cos(x) + 1/10
263 | $$
264 |
265 | > c. Given $\beta^0 = 2.3$, run gradient descent to find a local minimum of
266 | > $R(\beta)$ using a learning rate of $\rho = 0.1$. Show each of
267 | > $\beta^0, \beta^1, ...$ in your plot, as well as the final answer.
268 |
269 | The derivative of our function, i.e. $\cos(x) + 1/10$, gives us the gradient at
270 | a given $x$. For gradient descent, we move $x$ a little in the _opposite_
271 | direction to the gradient, with learning rate $\rho = 0.1$:
272 |
273 | $$
274 | x^{m+1} = x^m - \rho (\cos(x^m) + 1/10)
275 | $$
276 |
277 | ```{r}
278 | iter <- function(x, rho) x - rho * (cos(x) + 1 / 10)
279 | gd <- function(start, rho = 0.1) {
280 | b <- start
281 | v <- b
282 |   while (abs(b - iter(b, rho)) > 1e-8) {
283 |     b <- iter(b, rho)
284 | v <- c(v, b)
285 | }
286 | v
287 | }
288 |
289 | res <- gd(2.3)
290 | res[length(res)]
291 | ```
292 |
293 | ```{r}
294 | plot(x, r(x), type = "l")
295 | points(res, r(res), col = "red", pch = 19)
296 | ```
297 |
298 |
299 | > d. Repeat with $\beta^0 = 1.4$.
300 |
301 | ```{r}
302 | res <- gd(1.4)
303 | res[length(res)]
304 | ```
305 |
306 | ```{r}
307 | plot(x, r(x), type = "l")
308 | points(res, r(res), col = "red", pch = 19)
309 | ```
310 |
311 | ### Question 7
312 |
313 | > Fit a neural network to the `Default` data. Use a single hidden layer with 10
314 | > units, and dropout regularization. Have a look at Labs 10.9.1--10.9.2 for
315 | > guidance. Compare the classification performance of your model with that of
316 | > linear logistic regression.
317 |
318 | ```{r, cache = TRUE}
319 | library(keras)
320 |
321 | dat <- ISLR2::Boston
322 | x <- scale(model.matrix(crim ~ . - 1, data = dat))
323 | n <- nrow(dat)
324 | ntest <- trunc(n / 3)
325 | testid <- sample(1:n, ntest)
326 | y <- dat$crim
327 |
328 | # linear regression (for comparison)
329 | lfit <- lm(crim ~ ., data = dat[-testid, ])
330 | lpred <- predict(lfit, dat[testid, ])
331 | with(dat[testid, ], mean(abs(lpred - crim)))
332 |
333 | # keras
334 | nn <- keras_model_sequential() |>
335 | layer_dense(units = 10, activation = "relu", input_shape = ncol(x)) |>
336 | layer_dropout(rate = 0.4) |>
337 | layer_dense(units = 1)
338 |
339 | compile(nn,
340 | loss = "mse",
341 | optimizer = optimizer_rmsprop(),
342 | metrics = list("mean_absolute_error")
343 | )
344 |
345 | history <- fit(nn,
346 | x[-testid, ], y[-testid],
347 | epochs = 100,
348 | batch_size = 26,
349 | validation_data = list(x[testid, ], y[testid]),
350 | verbose = 0
351 | )
352 | plot(history, smooth = FALSE)
353 | npred <- predict(nn, x[testid, ])
354 | mean(abs(y[testid] - npred))
355 | ```
356 |
357 | In this case, the neural network outperforms the linear model, having a lower
358 | mean absolute error on the test data.
359 |
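For context, we can also compute the test error of a null model that always predicts
the mean of the training response (a rough baseline, reusing the objects defined in
the chunk above):

```{r}
mean(abs(y[testid] - mean(y[-testid])))
```
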
360 | ### Question 8
361 |
362 | > From your collection of personal photographs, pick 10 images of animals (such
363 | > as dogs, cats, birds, farm animals, etc.). If the subject does not occupy a
364 | > reasonable part of the image, then crop the image. Now use a pretrained image
365 | > classification CNN as in Lab 10.9.4 to predict the class of each of your
366 | > images, and report the probabilities for the top five predicted classes for
367 | > each image.
368 |
369 | ```{r, echo=FALSE}
370 | knitr::include_graphics(c(
371 | "images/animals/bird.jpg",
372 | "images/animals/bird2.jpg",
373 | "images/animals/bird3.jpg",
374 | "images/animals/bug.jpg",
375 | "images/animals/butterfly.jpg",
376 | "images/animals/butterfly2.jpg",
377 | "images/animals/elba.jpg",
378 | "images/animals/hamish.jpg",
379 | "images/animals/poodle.jpg",
380 | "images/animals/tortoise.jpg"
381 | ))
382 | ```
383 |
384 | ```{r}
385 | library(keras)
386 | images <- list.files("images/animals")
387 | x <- array(dim = c(length(images), 224, 224, 3))
388 | for (i in seq_len(length(images))) {
389 | img <- image_load(paste0("images/animals/", images[i]), target_size = c(224, 224))
390 | x[i, , , ] <- image_to_array(img)
391 | }
392 |
393 | model <- application_resnet50(weights = "imagenet")
394 |
395 | pred <- model |>
396 | predict(x) |>
397 | imagenet_decode_predictions(top = 5)
398 |
399 | names(pred) <- images
400 | print(pred)
401 | ```
402 |
403 | ### Question 9
404 |
405 | > Fit a lag-5 autoregressive model to the `NYSE` data, as described in the text
406 | > and Lab 10.9.6. Refit the model with a 12-level factor representing the
407 | > month. Does this factor improve the performance of the model?
408 |
409 | Fitting the model as described in the text.
410 |
411 | ```{r}
412 | library(tidyverse)
413 | library(ISLR2)
414 | xdata <- data.matrix(NYSE[, c("DJ_return", "log_volume", "log_volatility")])
415 | istrain <- NYSE[, "train"]
416 | xdata <- scale(xdata)
417 |
418 | lagm <- function(x, k = 1) {
419 | n <- nrow(x)
420 | pad <- matrix(NA, k, ncol(x))
421 | rbind(pad, x[1:(n - k), ])
422 | }
423 |
424 | arframe <- data.frame(
425 | log_volume = xdata[, "log_volume"],
426 | L1 = lagm(xdata, 1),
427 | L2 = lagm(xdata, 2),
428 | L3 = lagm(xdata, 3),
429 | L4 = lagm(xdata, 4),
430 | L5 = lagm(xdata, 5)
431 | )
432 |
433 | arframe <- arframe[-(1:5), ]
434 | istrain <- istrain[-(1:5)]
435 |
436 | arfit <- lm(log_volume ~ ., data = arframe[istrain, ])
437 | arpred <- predict(arfit, arframe[!istrain, ])
438 | V0 <- var(arframe[!istrain, "log_volume"])
439 | 1 - mean((arpred - arframe[!istrain, "log_volume"])^2) / V0
440 | ```
441 |
442 | Now we add month (and work with tidyverse).
443 |
444 | ```{r}
445 | arframe$month <- as.factor(str_match(NYSE$date, "-(\\d+)-")[, 2])[-(1:5)]
446 | arfit2 <- lm(log_volume ~ ., data = arframe[istrain, ])
447 | arpred2 <- predict(arfit2, arframe[!istrain, ])
448 | V0 <- var(arframe[!istrain, "log_volume"])
449 | 1 - mean((arpred2 - arframe[!istrain, "log_volume"])^2) / V0
450 | ```
451 |
452 | Adding month as a factor marginally improves the test $R^2$ of our model (from
453 | 0.413223 to 0.4170418). An ANOVA comparison indicates that this is a significant
454 | improvement in fit, and model 2 also has a lower AIC.
455 |
456 | ```{r}
457 | anova(arfit, arfit2)
458 | AIC(arfit, arfit2)
459 | ```
460 |
461 | ### Question 10
462 |
463 | > In Section 10.9.6, we showed how to fit a linear AR model to the `NYSE` data
464 | > using the `lm()` function. However, we also mentioned that we can "flatten"
465 | > the short sequences produced for the RNN model in order to fit a linear AR
466 | > model. Use this latter approach to fit a linear AR model to the NYSE data.
467 | > Compare the test $R^2$ of this linear AR model to that of the linear AR model
468 | > that we fit in the lab. What are the advantages/disadvantages of each
469 | > approach?
470 |
471 | The `lm` model is the same as that fit above:
472 |
473 | ```{r}
474 | arfit <- lm(log_volume ~ ., data = arframe[istrain, ])
475 | arpred <- predict(arfit, arframe[!istrain, ])
476 | V0 <- var(arframe[!istrain, "log_volume"])
477 | 1 - mean((arpred - arframe[!istrain, "log_volume"])^2) / V0
478 | ```
479 |
480 | Now we reshape the data for the RNN:
481 |
482 | ```{r}
483 | n <- nrow(arframe)
484 | xrnn <- data.matrix(arframe[, -1])
485 | xrnn <- array(xrnn, c(n, 3, 5))
486 | xrnn <- xrnn[, , 5:1]
487 | xrnn <- aperm(xrnn, c(1, 3, 2))
488 | ```
489 |
490 | We can add a "flatten" layer to turn the reshaped data into a long vector of
491 | predictors, resulting in a linear AR model:
492 |
493 | ```{r}
494 | model <- keras_model_sequential() |>
495 | layer_flatten(input_shape = c(5, 3)) |>
496 | layer_dense(units = 1)
497 | ```
498 |
499 | Now let's fit this model.
500 |
501 | ```{r}
502 | model |>
503 | compile(optimizer = optimizer_rmsprop(), loss = "mse")
504 |
505 | history <- model |>
506 | fit(
507 | xrnn[istrain, , ],
508 | arframe[istrain, "log_volume"],
509 | batch_size = 64,
510 | epochs = 200,
511 | validation_data = list(xrnn[!istrain, , ], arframe[!istrain, "log_volume"]),
512 | verbose = 0
513 | )
514 |
515 | plot(history, smooth = FALSE)
516 | kpred <- predict(model, xrnn[!istrain, , ])
517 | 1 - mean((kpred - arframe[!istrain, "log_volume"])^2) / V0
518 | ```
519 |
520 | Both models estimate the same number of coefficients/weights (16):
521 |
522 | ```{r}
523 | coef(arfit)
524 | model$get_weights()
525 | ```
526 |
527 | The flattened RNN has a lower $R^2$ on the test data than our `lm` model
528 | above. The `lm` model is quicker to fit and conceptually simpler, and it also
529 | lets us directly inspect the coefficients for the different variables.
530 | 
531 | The flattened RNN is, however, regularized to some extent because the data are
532 | processed in mini-batches.
533 |
534 | ### Question 11
535 |
536 | > Repeat the previous exercise, but now fit a nonlinear AR model by "flattening"
537 | > the short sequences produced for the RNN model.
538 |
539 | From the book:
540 |
541 | > To fit a nonlinear AR model, we could add in a hidden layer.
542 |
543 | ```{r, c10q11}
544 | xfun::cache_rds({
545 | model <- keras_model_sequential() |>
546 | layer_flatten(input_shape = c(5, 3)) |>
547 | layer_dense(units = 32, activation = "relu") |>
548 | layer_dropout(rate = 0.4) |>
549 | layer_dense(units = 1)
550 |
551 | model |> compile(
552 | loss = "mse",
553 | optimizer = optimizer_rmsprop(),
554 | metrics = "mse"
555 | )
556 |
557 | history <- model |>
558 | fit(
559 | xrnn[istrain, , ],
560 | arframe[istrain, "log_volume"],
561 | batch_size = 64,
562 | epochs = 200,
563 | validation_data = list(xrnn[!istrain, , ], arframe[!istrain, "log_volume"]),
564 | verbose = 0
565 | )
566 |
567 | plot(history, smooth = FALSE, metrics = "mse")
568 | kpred <- predict(model, xrnn[!istrain, , ])
569 | 1 - mean((kpred - arframe[!istrain, "log_volume"])^2) / V0
570 | })
571 | ```
572 |
573 | This approach improves our $R^2$ over the linear model above.
574 |
575 | ### Question 12
576 |
577 | > Consider the RNN fit to the `NYSE` data in Section 10.9.6. Modify the code to
578 | > allow inclusion of the variable `day_of_week`, and fit the RNN. Compute the
579 | > test $R^2$.
580 |
581 | To accomplish this, I'll include day of the week as one of the lagged variables
582 | in the RNN. Thus, the input for each observation will be 5 lags of 4 variables
583 | (a 5 x 4 array, rather than 5 x 3).
584 |
585 | ```{r, c10q12}
586 | xfun::cache_rds({
587 | xdata <- data.matrix(
588 | NYSE[, c("day_of_week", "DJ_return", "log_volume", "log_volatility")]
589 | )
590 | istrain <- NYSE[, "train"]
591 | xdata <- scale(xdata)
592 |
593 | arframe <- data.frame(
594 | log_volume = xdata[, "log_volume"],
595 | L1 = lagm(xdata, 1),
596 | L2 = lagm(xdata, 2),
597 | L3 = lagm(xdata, 3),
598 | L4 = lagm(xdata, 4),
599 | L5 = lagm(xdata, 5)
600 | )
601 | arframe <- arframe[-(1:5), ]
602 | istrain <- istrain[-(1:5)]
603 |
604 | n <- nrow(arframe)
605 | xrnn <- data.matrix(arframe[, -1])
606 | xrnn <- array(xrnn, c(n, 4, 5))
607 | xrnn <- xrnn[, , 5:1]
608 | xrnn <- aperm(xrnn, c(1, 3, 2))
609 | dim(xrnn)
610 |
611 | model <- keras_model_sequential() |>
612 | layer_simple_rnn(
613 | units = 12,
614 | input_shape = list(5, 4),
615 | dropout = 0.1,
616 | recurrent_dropout = 0.1
617 | ) |>
618 | layer_dense(units = 1)
619 |
620 | model |> compile(optimizer = optimizer_rmsprop(), loss = "mse")
621 |
622 | history <- model |>
623 | fit(
624 | xrnn[istrain, , ],
625 | arframe[istrain, "log_volume"],
626 | batch_size = 64,
627 | epochs = 200,
628 | validation_data = list(xrnn[!istrain, , ], arframe[!istrain, "log_volume"]),
629 | verbose = 0
630 | )
631 |
632 | kpred <- predict(model, xrnn[!istrain, , ])
633 | 1 - mean((kpred - arframe[!istrain, "log_volume"])^2) / V0
634 | })
635 | ```
636 |
637 | ### Question 13
638 |
639 | > Repeat the analysis of Lab 10.9.5 on the `IMDb` data using a similarly
640 | > structured neural network. There we used a dictionary of size 10,000. Consider
641 | > the effects of varying the dictionary size. Try the values 1000, 3000, 5000,
642 | > and 10,000, and compare the results.
643 |
644 | ```{r, c10q13}
645 | xfun::cache_rds({
646 | library(knitr)
647 | accuracy <- c()
648 | for (max_features in c(1000, 3000, 5000, 10000)) {
649 | imdb <- dataset_imdb(num_words = max_features)
650 | c(c(x_train, y_train), c(x_test, y_test)) %<-% imdb
651 |
652 | maxlen <- 500
653 | x_train <- pad_sequences(x_train, maxlen = maxlen)
654 | x_test <- pad_sequences(x_test, maxlen = maxlen)
655 |
656 | model <- keras_model_sequential() |>
657 | layer_embedding(input_dim = max_features, output_dim = 32) |>
658 | layer_lstm(units = 32) |>
659 | layer_dense(units = 1, activation = "sigmoid")
660 |
661 | model |> compile(
662 | optimizer = "rmsprop",
663 | loss = "binary_crossentropy",
664 | metrics = "acc"
665 | )
666 |
667 | history <- fit(model, x_train, y_train,
668 | epochs = 10,
669 | batch_size = 128,
670 | validation_data = list(x_test, y_test),
671 | verbose = 0
672 | )
673 |
674 | predy <- predict(model, x_test) > 0.5
675 | accuracy <- c(accuracy, mean(abs(y_test == as.numeric(predy))))
676 | }
677 |
678 | tibble(
679 | "Max Features" = c(1000, 3000, 5000, 10000),
680 | "Accuracy" = accuracy
681 | ) |>
682 | kable()
683 | })
684 | ```
685 |
686 | Varying the dictionary size does not have a substantial impact on our estimates
687 | of accuracy. However, the models take a substantial amount of time to fit, and
688 | it is not clear we are finding the best-fitting model in each case. For
689 | example, the model using a dictionary size of 10,000 obtained an accuracy of
690 | 0.8721 in the text, which differs from the estimate obtained here by about as
691 | much as the estimates for the different dictionary sizes differ from each other.
692 |
--------------------------------------------------------------------------------
/07-moving-beyond-linearity.Rmd:
--------------------------------------------------------------------------------
1 | # Moving Beyond Linearity
2 |
3 | ## Conceptual
4 |
5 | ### Question 1
6 |
7 | > It was mentioned in the chapter that a cubic regression spline with one knot
8 | > at $\xi$ can be obtained using a basis of the form $x, x^2, x^3, (x-\xi)^3_+$,
9 | > where $(x-\xi)^3_+ = (x-\xi)^3$ if $x>\xi$ and equals 0 otherwise. We will now
10 | > show that a function of the form
11 | > $$
12 | > f(x)=\beta_0 +\beta_1x+\beta_2x^2 +\beta_3x^3 +\beta_4(x-\xi)^3_+
13 | > $$
14 | > is indeed a cubic regression spline, regardless of the values of
15 | > $\beta_0, \beta_1, \beta_2, \beta_3,\beta_4$.
16 | >
17 | > a. Find a cubic polynomial
18 | > $$
19 | > f_1(x) = a_1 + b_1x + c_1x^2 + d_1x^3
20 | > $$
21 | > such that $f(x) = f_1(x)$ for all $x \le \xi$. Express $a_1,b_1,c_1,d_1$ in
22 | > terms of $\beta_0, \beta_1, \beta_2, \beta_3, \beta_4$.
23 |
24 | In this case, for $x \le \xi$, the cubic polynomial simply has terms
25 | $a_1 = \beta_0$, $b_1 = \beta_1$, $c_1 = \beta_2$, $d_1 = \beta_3$
26 |
27 | > b. Find a cubic polynomial
28 | > $$
29 | > f_2(x) = a_2 + b_2x + c_2x^2 + d_2x^3
30 | > $$
31 | > such that $f(x) = f_2(x)$ for all $x > \xi$. Express $a_2, b_2, c_2, d_2$ in
32 | > terms of $\beta_0, \beta_1, \beta_2, \beta_3, \beta_4$. We have now established
33 | > that $f(x)$ is a piecewise polynomial.
34 |
35 | For $x \gt \xi$, the cubic polynomial would be (we now include the $\beta_4$ term):
36 | \begin{align}
37 | f(x) = & \beta_0 + \beta_1x + \beta_2x^2 + \beta_3x^3 + \beta_4(x-\xi)^3 \\
38 | = & \beta_0 + \beta_1x + \beta_2x^2 + \beta_3x^3 + \beta_4(x^3 - 3x^2\xi + 3x\xi^2 -\xi^3) \\
39 | = & \beta_0 - \beta_4\xi^3 + (\beta_1 + 3\beta_4\xi^2)x +
40 |     (\beta_2 - 3\beta_4\xi)x^2 + (\beta_3 + \beta_4)x^3
41 | \end{align}
42 | 
43 | Therefore,
44 | $a_2 = \beta_0 - \beta_4\xi^3$, $b_2 = \beta_1 + 3\beta_4\xi^2$,
45 | $c_2 = \beta_2 - 3\beta_4\xi$, $d_2 = \beta_3 + \beta_4$.
46 |
47 | > c. Show that $f_1(\xi) = f_2(\xi)$. That is, $f(x)$ is continuous at $\xi$.
48 |
49 | To do this, we replace $x$ with $\xi$ in the above equations and simplify.
50 |
51 | \begin{align}
52 | f_1(\xi) = \beta_0 + \beta_1\xi + \beta_2\xi^2 + \beta_3\xi^3
53 | \end{align}
54 |
55 | \begin{align}
56 | f_2(\xi) = & \beta_0 - \beta_4\xi^3 + (\beta_1 + 3\beta_4\xi^2)\xi +
57 | (\beta_2 - 3\beta_4\xi)\xi^2 + (\beta_3 + \beta_4)\xi^3 \\
58 | = & \beta_0 - \beta_4\xi^3 + \beta_1\xi + 3\beta_4\xi^3 +
59 | \beta_2\xi^2 - 3\beta_4\xi^3 + \beta_3\xi^3 + \beta_4\xi^3 \\
60 | = & \beta_0 + \beta_1\xi + \beta_2\xi^2 + \beta_3\xi^3
61 | \end{align}
62 |
63 | > d. Show that $f_1'(\xi) = f_2'(\xi)$. That is, $f'(x)$ is continuous at $\xi$.
64 |
65 | To do this we differentiate the above with respect to $x$.
66 |
67 | $$
68 | f_1'(x) = \beta_1 + 2\beta_2x + 3\beta_3x^2 \\
69 | f_1'(\xi) = \beta_1 + 2\beta_2\xi + 3\beta_3\xi^2
70 | $$
71 |
72 | \begin{align}
73 | f_2'(x) & = \beta_1 + 3\beta_4\xi^2 + 2(\beta_2 - 3\beta_4\xi)x + 3(\beta_3 + \beta_4)x^2 \\
74 | f_2'(\xi) & = \beta_1 + 3\beta_4\xi^2 + 2(\beta_2 - 3\beta_4\xi)\xi + 3(\beta_3 + \beta_4)\xi^2 \\
75 | & = \beta_1 + 3\beta_4\xi^2 + 2\beta_2\xi - 6\beta_4\xi^2 + 3\beta_3\xi^2 + 3\beta_4\xi^2 \\
76 | & = \beta_1 + 2\beta_2\xi + 3\beta_3\xi^2
77 | \end{align}
78 |
79 | > e. Show that $f_1''(\xi) = f_2''(\xi)$. That is, $f''(x)$ is continuous at $\xi$.
80 | >
81 | > Therefore, $f(x)$ is indeed a cubic spline.
82 |
83 | $$
84 | f_1''(x) = 2\beta_2 + 6\beta_3x \\
85 | f_1''(\xi) = 2\beta_2 + 6\beta_3\xi
86 | $$
87 |
88 | $$
89 | f_2''(x) = 2\beta_2 - 6\beta_4\xi + 6(\beta_3 + \beta_4)x \\
90 | $$
91 | \begin{align}
92 | f_2''(\xi) & = 2\beta_2 - 6\beta_4\xi + 6\beta_3\xi + 6\beta_4\xi \\
93 | & = 2\beta_2 + 6\beta_3\xi
94 | \end{align}
95 |
96 | > _Hint: Parts (d) and (e) of this problem require knowledge of single-variable
97 | > calculus. As a reminder, given a cubic polynomial_
98 | > $$f_1(x) = a_1 + b_1x + c_1x^2 + d_1x^3,$$
99 | > _the first derivative takes the form_
100 | > $$f_1'(x) = b_1 + 2c_1x + 3d_1x^2$$
101 | > _and the second derivative takes the form_
102 | > $$f_1''(x) = 2c_1 + 6d_1x.$$
103 |
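As a quick numerical check on parts (c)--(e) (a minimal sketch with arbitrary,
made-up coefficients), we can verify that $f_1$ and $f_2$ and their first two
derivatives agree at $\xi$:

```{r}
b <- c(1, 2, -1, 0.5, 3) # made-up beta_0, ..., beta_4
xi <- 1.5
f1 <- function(x) b[1] + b[2] * x + b[3] * x^2 + b[4] * x^3
f2 <- function(x) {
  (b[1] - b[5] * xi^3) + (b[2] + 3 * b[5] * xi^2) * x +
    (b[3] - 3 * b[5] * xi) * x^2 + (b[4] + b[5]) * x^3
}
h <- 1e-4
# differences in value, first (central difference) and second (central difference)
# derivatives at xi; all should be approximately zero
c(
  f1(xi) - f2(xi),
  (f1(xi + h) - f1(xi - h)) / (2 * h) - (f2(xi + h) - f2(xi - h)) / (2 * h),
  (f1(xi + h) - 2 * f1(xi) + f1(xi - h)) / h^2 -
    (f2(xi + h) - 2 * f2(xi) + f2(xi - h)) / h^2
)
```
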
104 | ### Question 2
105 |
106 | > Suppose that a curve $\hat{g}$ is computed to smoothly fit a set of $n$ points
107 | > using the following formula:
108 | > $$
109 | > \DeclareMathOperator*{\argmin}{arg\,min} % Jan Hlavacek
110 | > \hat{g} = \argmin_g \left(\sum_{i=1}^n (y_i - g(x_i))^2 + \lambda \int \left[ g^{(m)}(x) \right]^2 dx \right),
111 | > $$
112 | > where $g^{(m)}$ represents the $m$th derivative of $g$ (and $g^{(0)} = g$).
113 | > Provide example sketches of $\hat{g}$ in each of the following scenarios.
114 | >
115 | > a. $\lambda=\infty, m=0$.
116 |
117 | Here we penalize $g$ itself and an infinite $\lambda$ means that this penalty
118 | dominates. This means that $\hat{g}$ will be 0.
119 |
120 | > b. $\lambda=\infty, m=1$.
121 |
122 | Here we penalize the first derivative (the slope) of $g$ and an infinite
123 | $\lambda$ means that this penalty dominates. Thus the slope will be 0 and the
124 | fit will be a horizontal line at the best-fitting constant, i.e. the mean of $y$.
125 |
126 | > c. $\lambda=\infty, m=2$.
127 |
128 | Here we penalize the second derivative (the change of slope) of $g$ and an
129 | infinite $\lambda$ means that this penalty dominates. Thus the fit will be a
130 | straight line: the least squares linear fit to the data.
131 |
132 | > d. $\lambda=\infty, m=3$.
133 |
134 | Here we penalize the third derivative (the rate of change of curvature) of $g$
135 | and an infinite $\lambda$ means that this penalty dominates. The third
136 | derivative is forced to 0, so the fit will be a quadratic function (its second
137 | derivative is constant): the least squares quadratic fit to the data.
138 |
139 | > e. $\lambda=0, m=3$.
140 |
141 | Here we penalize the third derivative, but a value of $\lambda = 0$ means that
142 | there is no penalty. As a result, the curve is able to interpolate all points.
143 |
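These limiting cases can be sketched on some made-up data (the data and colours here
are arbitrary): the least squares constant, linear and quadratic fits correspond to
(b), (c) and (d), while an interpolating spline illustrates (e).

```{r}
set.seed(42)
x <- sort(runif(30))
y <- sin(4 * x) + rnorm(30, sd = 0.2)
xs <- seq(0, 1, length.out = 200)

plot(x, y, pch = 19, col = "steelblue")
abline(h = 0, lty = 2)                         # (a) lambda = Inf, m = 0: g = 0
abline(h = mean(y), col = "red", lty = 2)      # (b) m = 1: constant fit (mean of y)
abline(lm(y ~ x), col = "darkgreen", lty = 2)  # (c) m = 2: least squares line
lines(xs, predict(lm(y ~ poly(x, 2)), data.frame(x = xs)),
      col = "purple", lty = 2)                 # (d) m = 3: least squares quadratic
lines(spline(x, y, n = 200), col = "orange")   # (e) lambda = 0: interpolates the points
```
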
144 | ### Question 3
145 |
146 | > Suppose we fit a curve with basis functions
147 | > $b_1(X) = X$,
148 | > $b_2(X) = (X - 1)^2I(X \geq 1)$.
149 | > (Note that $I(X \geq 1)$ equals 1 for $X \geq 1$ and 0 otherwise.) We fit the
150 | > linear regression model
151 | > $$Y = \beta_0 +\beta_1b_1(X) + \beta_2b_2(X) + \epsilon,$$
152 | > and obtain coefficient estimates
153 | > $\hat{\beta}_0 = 1, \hat{\beta}_1 = 1, \hat{\beta}_2 = -2$.
154 | > Sketch the estimated curve between $X = -2$ and $X = 2$. Note the intercepts,
155 | > slopes, and other relevant information.
156 |
157 | ```{r}
158 | x <- seq(-2, 2, length.out = 1000)
159 | f <- function(x) 1 + x + -2 * (x - 1)^2 * I(x >= 1)
160 | plot(x, f(x), type = "l")
161 | grid()
162 | ```
163 |
164 | ### Question 4
165 |
166 | > Suppose we fit a curve with basis functions
167 | > $b_1(X) = I(0 \leq X \leq 2) - (X -1)I(1 \leq X \leq 2),$
168 | > $b_2(X) = (X -3)I(3 \leq X \leq 4) + I(4 \lt X \leq 5)$.
169 | > We fit the linear regression model
170 | > $$Y = \beta_0 +\beta_1b_1(X) + \beta_2b_2(X) + \epsilon,$$
171 | > and obtain coefficient estimates
172 | > $\hat{\beta}_0 = 1, \hat{\beta}_1 = 1, \hat{\beta}_2 = 3$.
173 | > Sketch the estimated curve between $X = -2$ and $X = 6$. Note the intercepts,
174 | > slopes, and other relevant information.
175 |
176 | ```{r}
177 | x <- seq(-2, 6, length.out = 1000)
178 | b1 <- function(x) I(0 <= x & x <= 2) - (x - 1) * I(1 <= x & x <= 2)
179 | b2 <- function(x) (x - 3) * I(3 <= x & x <= 4) + I(4 < x & x <= 5)
180 | f <- function(x) 1 + 1 * b1(x) + 3 * b2(x)
181 | plot(x, f(x), type = "l")
182 | grid()
183 | ```
184 |
185 | ### Question 5
186 |
187 | > Consider two curves, $\hat{g}_1$ and $\hat{g}_2$, defined by
188 | >
189 | > $$
190 | > \hat{g}_1 = \argmin_g \left(\sum_{i=1}^n (y_i - g(x_i))^2 +
191 | > \lambda \int \left[ g^{(3)}(x) \right]^2 dx \right),
192 | > $$
193 | > $$
194 | > \hat{g}_2 = \argmin_g \left(\sum_{i=1}^n (y_i - g(x_i))^2 +
195 | > \lambda \int \left[ g^{(4)}(x) \right]^2 dx \right),
196 | > $$
197 | >
198 | > where $g^{(m)}$ represents the $m$th derivative of $g$.
199 | >
200 | > a. As $\lambda \to \infty$, will $\hat{g}_1$ or $\hat{g}_2$ have the smaller
201 | > training RSS?
202 |
203 | As $\lambda \to \infty$, $\hat{g}_1$ is forced towards $g^{(3)} = 0$ (a quadratic), whereas
204 | $\hat{g}_2$ is forced towards $g^{(4)} = 0$ (a cubic). Being more flexible, $\hat{g}_2$ will have the smaller training RSS.
205 |
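As a rough illustration on made-up data (the data here are arbitrary): a cubic fit
always achieves a training RSS at least as small as a quadratic fit, because the
quadratic is a special case of the cubic.

```{r}
set.seed(42)
x <- runif(50)
y <- sin(3 * x) + rnorm(50, sd = 0.3)
c(
  quadratic = sum(resid(lm(y ~ poly(x, 2)))^2), # cf. g_1 as lambda -> infinity
  cubic     = sum(resid(lm(y ~ poly(x, 3)))^2)  # cf. g_2 as lambda -> infinity
)
```
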
206 | > b. As $\lambda \to \infty$, will $\hat{g}_1$ or $\hat{g}_2$ have the smaller
207 | > test RSS?
208 |
209 | We cannot tell which function will produce the smaller test RSS, but there is a
210 | chance that $\hat{g}_1$ will if $\hat{g}_2$ overfits the data.
211 |
212 | > c. For $\lambda = 0$, will $\hat{g}_1$ or $\hat{g}_2$ have the smaller training
213 | > and test RSS?
214 |
215 | When $\lambda = 0$ there is no penalty, so both functions will give the same
216 | result: perfect interpolation of the training data. Thus training RSS will be
217 | 0 but test RSS could be high.
218 |
219 | ## Applied
220 |
221 | ### Question 6
222 |
223 | > In this exercise, you will further analyze the `Wage` data set considered
224 | > throughout this chapter.
225 | >
226 | > a. Perform polynomial regression to predict `wage` using `age`. Use
227 | > cross-validation to select the optimal degree $d$ for the polynomial. What
228 | > degree was chosen, and how does this compare to the results of hypothesis
229 | > testing using ANOVA? Make a plot of the resulting polynomial fit to the data.
230 |
231 | ```{r}
232 | library(ISLR2)
233 | library(boot)
234 | library(ggplot2)
235 | set.seed(42)
236 | res <- sapply(1:6, function(i) {
237 | fit <- glm(wage ~ poly(age, i), data = Wage)
238 | cv.glm(Wage, fit, K = 5)$delta[1]
239 | })
240 | which.min(res)
241 | plot(1:6, res, xlab = "Degree", ylab = "Test MSE", type = "l")
242 | abline(v = which.min(res), col = "red", lty = 2)
243 |
244 | fit <- glm(wage ~ poly(age, which.min(res)), data = Wage)
245 | plot(Wage$age, Wage$wage, pch = 19, cex = 0.4, col = alpha("steelblue", 0.4))
246 | points(1:100, predict(fit, data.frame(age = 1:100)), type = "l", col = "red")
247 |
248 | summary(glm(wage ~ poly(age, 6), data = Wage))
249 |
250 | fit1 <- lm(wage ~ poly(age, 1), data = Wage)
251 | fit2 <- lm(wage ~ poly(age, 2), data = Wage)
252 | fit3 <- lm(wage ~ poly(age, 3), data = Wage)
253 | fit4 <- lm(wage ~ poly(age, 4), data = Wage)
254 | fit5 <- lm(wage ~ poly(age, 5), data = Wage)
255 | anova(fit1, fit2, fit3, fit4, fit5)
256 | ```
257 |
258 | The selected degree is 4. When testing with ANOVA, degrees 1, 2 and 3 are highly
259 | significant and 4 is marginal.
260 |
261 | > b. Fit a step function to predict `wage` using `age`, and perform
262 | > cross-validation to choose the optimal number of cuts. Make a plot of the fit
263 | > obtained.
264 |
265 | ```{r}
266 | set.seed(42)
267 | res <- sapply(2:10, function(i) {
268 | Wage$cats <- cut(Wage$age, i)
269 | fit <- glm(wage ~ cats, data = Wage)
270 | cv.glm(Wage, fit, K = 5)$delta[1]
271 | })
272 | names(res) <- 2:10
273 | plot(2:10, res, xlab = "Cuts", ylab = "Test MSE", type = "l")
274 | which.min(res)
275 | abline(v = names(which.min(res)), col = "red", lty = 2)
276 |
277 | fit <- glm(wage ~ cut(age, 8), data = Wage)
278 | plot(Wage$age, Wage$wage, pch = 19, cex = 0.4, col = alpha("steelblue", 0.4))
279 | points(18:80, predict(fit, data.frame(age = 18:80)), type = "l", col = "red")
280 | ```
281 |
282 | ### Question 7
283 |
284 | > The `Wage` data set contains a number of other features not explored in this
285 | > chapter, such as marital status (`maritl`), job class (`jobclass`), and others.
286 | > Explore the relationships between some of these other predictors and `wage`, and
287 | > use non-linear fitting techniques in order to fit flexible models to the data.
288 | > Create plots of the results obtained, and write a summary of your findings.
289 |
290 | ```{r}
291 | plot(Wage$year, Wage$wage, pch = 19, cex = 0.4, col = alpha("steelblue", 0.4))
292 | plot(Wage$age, Wage$wage, pch = 19, cex = 0.4, col = alpha("steelblue", 0.4))
293 | plot(Wage$maritl, Wage$wage, pch = 19, cex = 0.4, col = alpha("steelblue", 0.4))
294 | plot(Wage$jobclass, Wage$wage, pch = 19, cex = 0.4, col = alpha("steelblue", 0.4))
295 | plot(Wage$education, Wage$wage, pch = 19, cex = 0.4, col = alpha("steelblue", 0.4))
296 | ```
297 |
298 | We have a mix of categorical and continuous variables and also want to
299 | incorporate non-linear aspects of the continuous variables. A GAM is a good
300 | choice to model this situation.
301 |
302 | ```{r}
303 | library(gam)
304 | fit0 <- gam(wage ~ s(year, 4) + s(age, 5) + education, data = Wage)
305 | fit1 <- gam(wage ~ s(year, 4) + s(age, 5) + education + jobclass, data = Wage)
306 | fit2 <- gam(wage ~ s(year, 4) + s(age, 5) + education + maritl, data = Wage)
307 | fit3 <- gam(wage ~ s(year, 4) + s(age, 5) + education + jobclass + maritl, data = Wage)
308 | anova(fit0, fit1, fit2, fit3)
309 | par(mfrow = c(2, 3))
310 | plot(fit3, se = TRUE, col = "blue")
311 | ```
312 |
313 | ### Question 8
314 |
315 | > Fit some of the non-linear models investigated in this chapter to the `Auto`
316 | > data set. Is there evidence for non-linear relationships in this data set?
317 | > Create some informative plots to justify your answer.
318 |
319 | Here we want to explore a range of non-linear models. First let's look at the
320 | relationships between the variables in the data.
321 |
322 | ```{r}
323 | pairs(Auto, cex = 0.4, pch = 19)
324 | ```
325 |
326 | It does appear that there are some non-linear relationships (e.g.
327 | horsepower / weight and mpg). We will pick one variable (horsepower) to predict
328 | mpg and try the range of models discussed in this chapter. We will measure
329 | test MSE through cross-validation to compare the models.
330 |
331 | ```{r}
332 | library(tidyverse)
333 | set.seed(42)
334 | fit <- glm(mpg ~ horsepower, data = Auto)
335 | err <- cv.glm(Auto, fit, K = 10)$delta[1]
336 |
337 | fit1 <- glm(mpg ~ poly(horsepower, 4), data = Auto)
338 | err1 <- cv.glm(Auto, fit1, K = 10)$delta[1]
339 |
340 | q <- quantile(Auto$horsepower)
341 | Auto$hp_cats <- cut(Auto$horsepower, breaks = q, include.lowest = TRUE)
342 | fit2 <- glm(mpg ~ hp_cats, data = Auto)
343 | err2 <- cv.glm(Auto, fit2, K = 10)$delta[1]
344 |
345 | fit3 <- glm(mpg ~ bs(horsepower, df = 4), data = Auto)
346 | err3 <- cv.glm(Auto, fit3, K = 10)$delta[1]
347 |
348 | fit4 <- glm(mpg ~ ns(horsepower, 4), data = Auto)
349 | err4 <- cv.glm(Auto, fit4, K = 10)$delta[1]
350 |
351 | fit5 <- gam(mpg ~ s(horsepower, df = 4), data = Auto)
352 | # rough 10-fold cross-validation for gam.
353 | err5 <- mean(replicate(10, {
354 | b <- cut(sample(seq_along(Auto$horsepower)), 10)
355 | pred <- numeric()
356 | for (i in 1:10) {
357 | train <- b %in% levels(b)[-i]
358 | f <- gam(mpg ~ s(horsepower, df = 4), data = Auto[train, ])
359 | pred[!train] <- predict(f, Auto[!train, ])
360 | }
361 | mean((Auto$mpg - pred)^2) # MSE
362 | }))
363 |
364 | c(err, err1, err2, err3, err4, err5)
365 | anova(fit, fit1, fit2, fit3, fit4, fit5)
366 |
367 | x <- seq(min(Auto$horsepower), max(Auto$horsepower), length.out = 1000)
368 | pred <- data.frame(
369 | x = x,
370 | "Linear" = predict(fit, data.frame(horsepower = x)),
371 | "Polynomial" = predict(fit1, data.frame(horsepower = x)),
372 | "Step" = predict(fit2, data.frame(hp_cats = cut(x, breaks = q, include.lowest = TRUE))),
373 | "Regression spline" = predict(fit3, data.frame(horsepower = x)),
374 | "Natural spline" = predict(fit4, data.frame(horsepower = x)),
375 | "Smoothing spline" = predict(fit5, data.frame(horsepower = x)),
376 | check.names = FALSE
377 | )
378 | pred <- pivot_longer(pred, -x)
379 | ggplot(Auto, aes(horsepower, mpg)) +
380 | geom_point(color = alpha("steelblue", 0.4)) +
381 | geom_line(data = pred, aes(x, value, color = name)) +
382 | theme_bw()
383 | ```
384 |
385 | ### Question 9
386 |
387 | > This question uses the variables `dis` (the weighted mean of distances to five
388 | > Boston employment centers) and `nox` (nitrogen oxides concentration in parts per
389 | > 10 million) from the `Boston` data. We will treat `dis` as the predictor and
390 | > `nox` as the response.
391 | >
392 | > a. Use the `poly()` function to fit a cubic polynomial regression to predict
393 | > `nox` using `dis`. Report the regression output, and plot the resulting data
394 | > and polynomial fits.
395 |
396 | ```{r}
397 | fit <- glm(nox ~ poly(dis, 3), data = Boston)
398 | summary(fit)
399 | plot(nox ~ dis, data = Boston, col = alpha("steelblue", 0.4), pch = 19)
400 | x <- seq(min(Boston$dis), max(Boston$dis), length.out = 1000)
401 | lines(x, predict(fit, data.frame(dis = x)), col = "red", lty = 2)
402 | ```
403 |
404 | > b. Plot the polynomial fits for a range of different polynomial degrees (say,
405 | > from 1 to 10), and report the associated residual sum of squares.
406 |
407 | ```{r}
408 | fits <- lapply(1:10, function(i) glm(nox ~ poly(dis, i), data = Boston))
409 |
410 | x <- seq(min(Boston$dis), max(Boston$dis), length.out = 1000)
411 | pred <- data.frame(lapply(fits, function(fit) predict(fit, data.frame(dis = x))))
412 | colnames(pred) <- 1:10
413 | pred$x <- x
414 | pred <- pivot_longer(pred, !x)
415 | ggplot(Boston, aes(dis, nox)) +
416 | geom_point(color = alpha("steelblue", 0.4)) +
417 | geom_line(data = pred, aes(x, value, color = name)) +
418 | theme_bw()
419 |
420 | # residual sum of squares
421 | do.call(anova, fits)[, 2]
422 | ```
423 |
424 | > c. Perform cross-validation or another approach to select the optimal degree
425 | > for the polynomial, and explain your results.
426 |
427 | ```{r}
428 | res <- sapply(1:10, function(i) {
429 | fit <- glm(nox ~ poly(dis, i), data = Boston)
430 | cv.glm(Boston, fit, K = 10)$delta[1]
431 | })
432 | which.min(res)
433 | ```
434 |
435 | The optimal degree is 3 based on cross-validation. Higher values tend to
436 | lead to overfitting.
437 |
438 | > d. Use the `bs()` function to fit a regression spline to predict `nox` using
439 | > `dis`. Report the output for the fit using four degrees of freedom. How did
440 | > you choose the knots? Plot the resulting fit.
441 |
442 | ```{r}
443 | fit <- glm(nox ~ bs(dis, df = 4), data = Boston)
444 | summary(fit)
445 |
446 | plot(nox ~ dis, data = Boston, col = alpha("steelblue", 0.4), pch = 19)
447 | x <- seq(min(Boston$dis), max(Boston$dis), length.out = 1000)
448 | lines(x, predict(fit, data.frame(dis = x)), col = "red", lty = 2)
449 | ```
450 |
451 | With four degrees of freedom (and a cubic spline), there is a single internal
452 | knot, which `bs()` places at the median of `dis`.
452 |
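We can confirm where the knot was placed; `splines::bs()` stores the internal knots
as an attribute of the basis object:

```{r}
attr(bs(Boston$dis, df = 4), "knots")
median(Boston$dis)
```
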
453 | > e. Now fit a regression spline for a range of degrees of freedom, and plot the
454 | > resulting fits and report the resulting RSS. Describe the results obtained.
455 |
456 | ```{r}
457 | fits <- lapply(3:10, function(i) {
458 | glm(nox ~ bs(dis, df = i), data = Boston)
459 | })
460 |
461 | x <- seq(min(Boston$dis), max(Boston$dis), length.out = 1000)
462 | pred <- data.frame(lapply(fits, function(fit) predict(fit, data.frame(dis = x))))
463 | colnames(pred) <- 3:10
464 | pred$x <- x
465 | pred <- pivot_longer(pred, !x)
466 | ggplot(Boston, aes(dis, nox)) +
467 | geom_point(color = alpha("steelblue", 0.4)) +
468 | geom_line(data = pred, aes(x, value, color = name)) +
469 | theme_bw()
470 | ```
471 |
472 | At high numbers of degrees of freedom the splines overfit the data (particularly
473 | at extreme ends of the distribution of the predictor variable).
474 |
475 | > f. Perform cross-validation or another approach in order to select the best
476 | > degrees of freedom for a regression spline on this data. Describe your
477 | > results.
478 |
479 | ```{r}
480 | set.seed(42)
481 | err <- sapply(3:10, function(i) {
482 | fit <- glm(nox ~ bs(dis, df = i), data = Boston)
483 | suppressWarnings(cv.glm(Boston, fit, K = 10)$delta[1])
484 | })
485 | which.min(err)
486 | ```
487 |
488 | This approach would select 4 degrees of freedom for the spline.
489 |
490 | ### Question 10
491 |
492 | > This question relates to the `College` data set.
493 | >
494 | > a. Split the data into a training set and a test set. Using out-of-state tuition
495 | > as the response and the other variables as the predictors, perform forward
496 | > stepwise selection on the training set in order to identify a satisfactory
497 | > model that uses just a subset of the predictors.
498 |
499 | ```{r}
500 | library(leaps)
501 |
502 | # helper function to predict from a regsubsets model
503 | predict.regsubsets <- function(object, newdata, id, ...) {
504 | form <- as.formula(object$call[[2]])
505 | mat <- model.matrix(form, newdata)
506 | coefi <- coef(object, id = id)
507 | xvars <- names(coefi)
508 | mat[, xvars] %*% coefi
509 | }
510 |
511 | set.seed(42)
512 | train <- rep(TRUE, nrow(College))
513 | train[sample(1:nrow(College), nrow(College) * 1 / 3)] <- FALSE
514 | fit <- regsubsets(Outstate ~ ., data = College[train, ], nvmax = 17, method = "forward")
515 |
516 | plot(summary(fit)$bic, type = "b")
517 | which.min(summary(fit)$bic)
518 |
519 | # or via the validation set (test) error
520 | err <- sapply(1:17, function(i) {
521 |   # test MSE of the model with i predictors
522 |   mean((College$Outstate[!train] - predict(fit, College[!train, ], i))^2)
523 | })
524 | which.min(err)
525 | min(summary(fit)$bic)
526 | ```
527 |
528 | For the sake of simplicity, we'll choose the model with 6 predictors:
529 |
530 | ```{r}
531 | coef(fit, id = 6)
532 | ```
533 |
534 | > b. Fit a GAM on the training data, using out-of-state tuition as the response
535 | > and the features selected in the previous step as the predictors. Plot the
536 | > results, and explain your findings.
537 |
538 | ```{r}
539 | fit <- gam(Outstate ~ Private + s(Room.Board, 2) + s(PhD, 2) + s(perc.alumni, 2) +
540 | s(Expend, 2) + s(Grad.Rate, 2), data = College[train, ])
541 | ```
542 |
543 | > c. Evaluate the model obtained on the test set, and explain the results
544 | > obtained.
545 |
546 | ```{r}
547 | pred <- predict(fit, College[!train, ])
548 | err_gam <- mean((College$Outstate[!train] - pred)^2)
549 | plot(err, ylim = c(min(err_gam, err), max(err)), type = "b")
550 | abline(h = err_gam, col = "red", lty = 2)
551 |
552 | # r-squared
553 | 1 - err_gam / mean((College$Outstate[!train] - mean(College$Outstate[!train]))^2)
554 | ```
555 |
556 | > d. For which variables, if any, is there evidence of a non-linear relationship
557 | > with the response?
558 |
559 | ```{r}
560 | summary(fit)
561 | ```
562 |
563 | Non-linear relationships are significant for `Expend` and `PhD`.
564 |
565 | ### Question 11
566 |
567 | > In Section 7.7, it was mentioned that GAMs are generally fit using a
568 | > _backfitting_ approach. The idea behind backfitting is actually quite simple. We
569 | > will now explore backfitting in the context of multiple linear regression.
570 | >
571 | > Suppose that we would like to perform multiple linear regression, but we do not
572 | > have software to do so. Instead, we only have software to perform simple linear
573 | > regression. Therefore, we take the following iterative approach: we repeatedly
574 | > hold all but one coefficient estimate fixed at its current value, and update
575 | > only that coefficient estimate using a simple linear regression. The process is
576 | > continued until _convergence_---that is, until the coefficient estimates stop
577 | > changing.
578 | >
579 | > We now try this out on a toy example.
580 | >
581 | > a. Generate a response $Y$ and two predictors $X_1$ and $X_2$, with $n = 100$.
582 |
583 | ```{r}
584 | set.seed(42)
585 | x1 <- rnorm(100)
586 | x2 <- rnorm(100)
587 | y <- 2 + 0.2 * x1 + 4 * x2 + rnorm(100)
588 | ```
589 |
590 | > b. Initialize $\hat{\beta}_1$ to take on a value of your choice. It does not
591 | >    matter what value you choose.
592 |
593 | ```{r}
594 | beta1 <- 20
595 | ```
596 |
597 | > c. Keeping $\hat{\beta}_1$ fixed, fit the model
598 | > $$Y - \hat{\beta}_1X_1 = \beta_0 + \beta_2X_2 + \epsilon.$$
599 | > You can do this as follows:
600 | >
601 | > ```r
602 | > > a <- y - beta1 * x1
603 | > > beta2 <- lm(a ~ x2)$coef[2]
604 | > ```
605 |
606 | ```{r}
607 | a <- y - beta1 * x1
608 | beta2 <- lm(a ~ x2)$coef[2]
609 | ```
610 |
611 | > d. Keeping $\hat{\beta}_2$ fixed, fit the model
612 | > $$Y - \hat{\beta}_2X_2 = \beta_0 + \beta_1 X_1 + \epsilon.$$
613 | > You can do this as follows:
614 | >
615 | > ```r
616 | > > a <- y - beta2 * x2
617 | > > beta1 <- lm(a ~ x1)$coef[2]
618 | > ```
619 |
620 | ```{r}
621 | a <- y - beta2 * x2
622 | beta1 <- lm(a ~ x1)$coef[2]
623 | ```
624 |
625 | > e. Write a for loop to repeat (c) and (d) 1,000 times. Report the estimates of
626 | > $\hat{\beta}_0, \hat{\beta}_1,$ and $\hat{\beta}_2$ at each iteration of the
627 | > for loop. Create a plot in which each of these values is displayed, with
628 | > $\hat{\beta}_0, \hat{\beta}_1,$ and $\hat{\beta}_2$ each shown in a different
629 | > color.
630 |
631 | ```{r}
632 | res <- matrix(NA, nrow = 1000, ncol = 3)
633 | colnames(res) <- c("beta0", "beta1", "beta2")
634 | beta1 <- 20
635 | for (i in 1:1000) {
636 | beta2 <- lm(y - beta1 * x1 ~ x2)$coef[2]
637 | beta1 <- lm(y - beta2 * x2 ~ x1)$coef[2]
638 | beta0 <- lm(y - beta2 * x2 ~ x1)$coef[1]
639 | res[i, ] <- c(beta0, beta1, beta2)
640 | }
641 | res <- as.data.frame(res)
642 | res$Iteration <- 1:1000
643 | res <- pivot_longer(res, !Iteration)
644 | p <- ggplot(res, aes(x = Iteration, y = value, color = name)) +
645 | geom_line() +
646 | geom_point() +
647 | scale_x_continuous(trans = "log10")
648 | p
649 | ```
650 |
651 | > f. Compare your answer in (e) to the results of simply performing multiple
652 | > linear regression to predict $Y$ using $X_1$ and $X_2$. Use the `abline()`
653 | > function to overlay those multiple linear regression coefficient estimates on
654 | > the plot obtained in (e).
655 |
656 | ```{r}
657 | fit <- lm(y ~ x1 + x2)
658 | coef(fit)
659 | p + geom_hline(yintercept = coef(fit), lty = 2)
660 | ```
661 |
662 | > g. On this data set, how many backfitting iterations were required in order to
663 | > obtain a "good" approximation to the multiple regression coefficient
664 | > estimates?
665 |
666 | In this case, good estimates were obtained after 3 iterations.
667 |
668 | ### Question 12
669 |
670 | > This problem is a continuation of the previous exercise. In a toy example with
671 | > $p = 100$, show that one can approximate the multiple linear regression
672 | > coefficient estimates by repeatedly performing simple linear regression in a
673 | > backfitting procedure. How many backfitting iterations are required in order to
674 | > obtain a "good" approximation to the multiple regression coefficient estimates?
675 | > Create a plot to justify your answer.
676 |
677 | ```{r}
678 | set.seed(42)
679 |
680 | p <- 100
681 | n <- 1000
682 |
683 | betas <- rnorm(p) * 5
684 | x <- matrix(rnorm(n * p), ncol = p, nrow = n)
685 | y <- (x %*% betas) + rnorm(n) # ignore beta0 for simplicity
686 |
687 | # multiple regression
688 | fit <- lm(y ~ x - 1)
689 | coef(fit)
690 |
691 | # backfitting
692 | backfit <- function(x, y, iter = 20) {
693 | beta <- matrix(0, ncol = ncol(x), nrow = iter + 1)
694 | for (i in 1:iter) {
695 | for (k in 1:ncol(x)) {
696 | residual <- y - (x[, -k] %*% beta[i, -k])
697 | beta[i + 1, k] <- lm(residual ~ x[, k])$coef[2]
698 | }
699 | }
700 | beta[-1, ]
701 | }
702 | res <- backfit(x, y)
703 | error <- rowMeans(sweep(res, 2, betas)^2)
704 | plot(error, log = "x", type = "b")
705 |
706 | # backfitting error
707 | error[length(error)]
708 |
709 | # lm error
710 | mean((coef(fit) - betas)^2)
711 | ```
712 |
713 | We need around 5 to 6 iterations to obtain a good estimate of the coefficients.
714 |
--------------------------------------------------------------------------------
/data/Auto.data:
--------------------------------------------------------------------------------
1 | mpg cylinders displacement horsepower weight acceleration year origin name
2 | 18.0 8 307.0 130.0 3504. 12.0 70 1 "chevrolet chevelle malibu"
3 | 15.0 8 350.0 165.0 3693. 11.5 70 1 "buick skylark 320"
4 | 18.0 8 318.0 150.0 3436. 11.0 70 1 "plymouth satellite"
5 | 16.0 8 304.0 150.0 3433. 12.0 70 1 "amc rebel sst"
6 | 17.0 8 302.0 140.0 3449. 10.5 70 1 "ford torino"
7 | 15.0 8 429.0 198.0 4341. 10.0 70 1 "ford galaxie 500"
8 | 14.0 8 454.0 220.0 4354. 9.0 70 1 "chevrolet impala"
9 | 14.0 8 440.0 215.0 4312. 8.5 70 1 "plymouth fury iii"
10 | 14.0 8 455.0 225.0 4425. 10.0 70 1 "pontiac catalina"
11 | 15.0 8 390.0 190.0 3850. 8.5 70 1 "amc ambassador dpl"
12 | 15.0 8 383.0 170.0 3563. 10.0 70 1 "dodge challenger se"
13 | 14.0 8 340.0 160.0 3609. 8.0 70 1 "plymouth 'cuda 340"
14 | 15.0 8 400.0 150.0 3761. 9.5 70 1 "chevrolet monte carlo"
15 | 14.0 8 455.0 225.0 3086. 10.0 70 1 "buick estate wagon (sw)"
16 | 24.0 4 113.0 95.00 2372. 15.0 70 3 "toyota corona mark ii"
17 | 22.0 6 198.0 95.00 2833. 15.5 70 1 "plymouth duster"
18 | 18.0 6 199.0 97.00 2774. 15.5 70 1 "amc hornet"
19 | 21.0 6 200.0 85.00 2587. 16.0 70 1 "ford maverick"
20 | 27.0 4 97.00 88.00 2130. 14.5 70 3 "datsun pl510"
21 | 26.0 4 97.00 46.00 1835. 20.5 70 2 "volkswagen 1131 deluxe sedan"
22 | 25.0 4 110.0 87.00 2672. 17.5 70 2 "peugeot 504"
23 | 24.0 4 107.0 90.00 2430. 14.5 70 2 "audi 100 ls"
24 | 25.0 4 104.0 95.00 2375. 17.5 70 2 "saab 99e"
25 | 26.0 4 121.0 113.0 2234. 12.5 70 2 "bmw 2002"
26 | 21.0 6 199.0 90.00 2648. 15.0 70 1 "amc gremlin"
27 | 10.0 8 360.0 215.0 4615. 14.0 70 1 "ford f250"
28 | 10.0 8 307.0 200.0 4376. 15.0 70 1 "chevy c20"
29 | 11.0 8 318.0 210.0 4382. 13.5 70 1 "dodge d200"
30 | 9.0 8 304.0 193.0 4732. 18.5 70 1 "hi 1200d"
31 | 27.0 4 97.00 88.00 2130. 14.5 71 3 "datsun pl510"
32 | 28.0 4 140.0 90.00 2264. 15.5 71 1 "chevrolet vega 2300"
33 | 25.0 4 113.0 95.00 2228. 14.0 71 3 "toyota corona"
34 | 25.0 4 98.00 ? 2046. 19.0 71 1 "ford pinto"
35 | 19.0 6 232.0 100.0 2634. 13.0 71 1 "amc gremlin"
36 | 16.0 6 225.0 105.0 3439. 15.5 71 1 "plymouth satellite custom"
37 | 17.0 6 250.0 100.0 3329. 15.5 71 1 "chevrolet chevelle malibu"
38 | 19.0 6 250.0 88.00 3302. 15.5 71 1 "ford torino 500"
39 | 18.0 6 232.0 100.0 3288. 15.5 71 1 "amc matador"
40 | 14.0 8 350.0 165.0 4209. 12.0 71 1 "chevrolet impala"
41 | 14.0 8 400.0 175.0 4464. 11.5 71 1 "pontiac catalina brougham"
42 | 14.0 8 351.0 153.0 4154. 13.5 71 1 "ford galaxie 500"
43 | 14.0 8 318.0 150.0 4096. 13.0 71 1 "plymouth fury iii"
44 | 12.0 8 383.0 180.0 4955. 11.5 71 1 "dodge monaco (sw)"
45 | 13.0 8 400.0 170.0 4746. 12.0 71 1 "ford country squire (sw)"
46 | 13.0 8 400.0 175.0 5140. 12.0 71 1 "pontiac safari (sw)"
47 | 18.0 6 258.0 110.0 2962. 13.5 71 1 "amc hornet sportabout (sw)"
48 | 22.0 4 140.0 72.00 2408. 19.0 71 1 "chevrolet vega (sw)"
49 | 19.0 6 250.0 100.0 3282. 15.0 71 1 "pontiac firebird"
50 | 18.0 6 250.0 88.00 3139. 14.5 71 1 "ford mustang"
51 | 23.0 4 122.0 86.00 2220. 14.0 71 1 "mercury capri 2000"
52 | 28.0 4 116.0 90.00 2123. 14.0 71 2 "opel 1900"
53 | 30.0 4 79.00 70.00 2074. 19.5 71 2 "peugeot 304"
54 | 30.0 4 88.00 76.00 2065. 14.5 71 2 "fiat 124b"
55 | 31.0 4 71.00 65.00 1773. 19.0 71 3 "toyota corolla 1200"
56 | 35.0 4 72.00 69.00 1613. 18.0 71 3 "datsun 1200"
57 | 27.0 4 97.00 60.00 1834. 19.0 71 2 "volkswagen model 111"
58 | 26.0 4 91.00 70.00 1955. 20.5 71 1 "plymouth cricket"
59 | 24.0 4 113.0 95.00 2278. 15.5 72 3 "toyota corona hardtop"
60 | 25.0 4 97.50 80.00 2126. 17.0 72 1 "dodge colt hardtop"
61 | 23.0 4 97.00 54.00 2254. 23.5 72 2 "volkswagen type 3"
62 | 20.0 4 140.0 90.00 2408. 19.5 72 1 "chevrolet vega"
63 | 21.0 4 122.0 86.00 2226. 16.5 72 1 "ford pinto runabout"
64 | 13.0 8 350.0 165.0 4274. 12.0 72 1 "chevrolet impala"
65 | 14.0 8 400.0 175.0 4385. 12.0 72 1 "pontiac catalina"
66 | 15.0 8 318.0 150.0 4135. 13.5 72 1 "plymouth fury iii"
67 | 14.0 8 351.0 153.0 4129. 13.0 72 1 "ford galaxie 500"
68 | 17.0 8 304.0 150.0 3672. 11.5 72 1 "amc ambassador sst"
69 | 11.0 8 429.0 208.0 4633. 11.0 72 1 "mercury marquis"
70 | 13.0 8 350.0 155.0 4502. 13.5 72 1 "buick lesabre custom"
71 | 12.0 8 350.0 160.0 4456. 13.5 72 1 "oldsmobile delta 88 royale"
72 | 13.0 8 400.0 190.0 4422. 12.5 72 1 "chrysler newport royal"
73 | 19.0 3 70.00 97.00 2330. 13.5 72 3 "mazda rx2 coupe"
74 | 15.0 8 304.0 150.0 3892. 12.5 72 1 "amc matador (sw)"
75 | 13.0 8 307.0 130.0 4098. 14.0 72 1 "chevrolet chevelle concours (sw)"
76 | 13.0 8 302.0 140.0 4294. 16.0 72 1 "ford gran torino (sw)"
77 | 14.0 8 318.0 150.0 4077. 14.0 72 1 "plymouth satellite custom (sw)"
78 | 18.0 4 121.0 112.0 2933. 14.5 72 2 "volvo 145e (sw)"
79 | 22.0 4 121.0 76.00 2511. 18.0 72 2 "volkswagen 411 (sw)"
80 | 21.0 4 120.0 87.00 2979. 19.5 72 2 "peugeot 504 (sw)"
81 | 26.0 4 96.00 69.00 2189. 18.0 72 2 "renault 12 (sw)"
82 | 22.0 4 122.0 86.00 2395. 16.0 72 1 "ford pinto (sw)"
83 | 28.0 4 97.00 92.00 2288. 17.0 72 3 "datsun 510 (sw)"
84 | 23.0 4 120.0 97.00 2506. 14.5 72 3 "toyouta corona mark ii (sw)"
85 | 28.0 4 98.00 80.00 2164. 15.0 72 1 "dodge colt (sw)"
86 | 27.0 4 97.00 88.00 2100. 16.5 72 3 "toyota corolla 1600 (sw)"
87 | 13.0 8 350.0 175.0 4100. 13.0 73 1 "buick century 350"
88 | 14.0 8 304.0 150.0 3672. 11.5 73 1 "amc matador"
89 | 13.0 8 350.0 145.0 3988. 13.0 73 1 "chevrolet malibu"
90 | 14.0 8 302.0 137.0 4042. 14.5 73 1 "ford gran torino"
91 | 15.0 8 318.0 150.0 3777. 12.5 73 1 "dodge coronet custom"
92 | 12.0 8 429.0 198.0 4952. 11.5 73 1 "mercury marquis brougham"
93 | 13.0 8 400.0 150.0 4464. 12.0 73 1 "chevrolet caprice classic"
94 | 13.0 8 351.0 158.0 4363. 13.0 73 1 "ford ltd"
95 | 14.0 8 318.0 150.0 4237. 14.5 73 1 "plymouth fury gran sedan"
96 | 13.0 8 440.0 215.0 4735. 11.0 73 1 "chrysler new yorker brougham"
97 | 12.0 8 455.0 225.0 4951. 11.0 73 1 "buick electra 225 custom"
98 | 13.0 8 360.0 175.0 3821. 11.0 73 1 "amc ambassador brougham"
99 | 18.0 6 225.0 105.0 3121. 16.5 73 1 "plymouth valiant"
100 | 16.0 6 250.0 100.0 3278. 18.0 73 1 "chevrolet nova custom"
101 | 18.0 6 232.0 100.0 2945. 16.0 73 1 "amc hornet"
102 | 18.0 6 250.0 88.00 3021. 16.5 73 1 "ford maverick"
103 | 23.0 6 198.0 95.00 2904. 16.0 73 1 "plymouth duster"
104 | 26.0 4 97.00 46.00 1950. 21.0 73 2 "volkswagen super beetle"
105 | 11.0 8 400.0 150.0 4997. 14.0 73 1 "chevrolet impala"
106 | 12.0 8 400.0 167.0 4906. 12.5 73 1 "ford country"
107 | 13.0 8 360.0 170.0 4654. 13.0 73 1 "plymouth custom suburb"
108 | 12.0 8 350.0 180.0 4499. 12.5 73 1 "oldsmobile vista cruiser"
109 | 18.0 6 232.0 100.0 2789. 15.0 73 1 "amc gremlin"
110 | 20.0 4 97.00 88.00 2279. 19.0 73 3 "toyota carina"
111 | 21.0 4 140.0 72.00 2401. 19.5 73 1 "chevrolet vega"
112 | 22.0 4 108.0 94.00 2379. 16.5 73 3 "datsun 610"
113 | 18.0 3 70.00 90.00 2124. 13.5 73 3 "maxda rx3"
114 | 19.0 4 122.0 85.00 2310. 18.5 73 1 "ford pinto"
115 | 21.0 6 155.0 107.0 2472. 14.0 73 1 "mercury capri v6"
116 | 26.0 4 98.00 90.00 2265. 15.5 73 2 "fiat 124 sport coupe"
117 | 15.0 8 350.0 145.0 4082. 13.0 73 1 "chevrolet monte carlo s"
118 | 16.0 8 400.0 230.0 4278. 9.50 73 1 "pontiac grand prix"
119 | 29.0 4 68.00 49.00 1867. 19.5 73 2 "fiat 128"
120 | 24.0 4 116.0 75.00 2158. 15.5 73 2 "opel manta"
121 | 20.0 4 114.0 91.00 2582. 14.0 73 2 "audi 100ls"
122 | 19.0 4 121.0 112.0 2868. 15.5 73 2 "volvo 144ea"
123 | 15.0 8 318.0 150.0 3399. 11.0 73 1 "dodge dart custom"
124 | 24.0 4 121.0 110.0 2660. 14.0 73 2 "saab 99le"
125 | 20.0 6 156.0 122.0 2807. 13.5 73 3 "toyota mark ii"
126 | 11.0 8 350.0 180.0 3664. 11.0 73 1 "oldsmobile omega"
127 | 20.0 6 198.0 95.00 3102. 16.5 74 1 "plymouth duster"
128 | 21.0 6 200.0 ? 2875. 17.0 74 1 "ford maverick"
129 | 19.0 6 232.0 100.0 2901. 16.0 74 1 "amc hornet"
130 | 15.0 6 250.0 100.0 3336. 17.0 74 1 "chevrolet nova"
131 | 31.0 4 79.00 67.00 1950. 19.0 74 3 "datsun b210"
132 | 26.0 4 122.0 80.00 2451. 16.5 74 1 "ford pinto"
133 | 32.0 4 71.00 65.00 1836. 21.0 74 3 "toyota corolla 1200"
134 | 25.0 4 140.0 75.00 2542. 17.0 74 1 "chevrolet vega"
135 | 16.0 6 250.0 100.0 3781. 17.0 74 1 "chevrolet chevelle malibu classic"
136 | 16.0 6 258.0 110.0 3632. 18.0 74 1 "amc matador"
137 | 18.0 6 225.0 105.0 3613. 16.5 74 1 "plymouth satellite sebring"
138 | 16.0 8 302.0 140.0 4141. 14.0 74 1 "ford gran torino"
139 | 13.0 8 350.0 150.0 4699. 14.5 74 1 "buick century luxus (sw)"
140 | 14.0 8 318.0 150.0 4457. 13.5 74 1 "dodge coronet custom (sw)"
141 | 14.0 8 302.0 140.0 4638. 16.0 74 1 "ford gran torino (sw)"
142 | 14.0 8 304.0 150.0 4257. 15.5 74 1 "amc matador (sw)"
143 | 29.0 4 98.00 83.00 2219. 16.5 74 2 "audi fox"
144 | 26.0 4 79.00 67.00 1963. 15.5 74 2 "volkswagen dasher"
145 | 26.0 4 97.00 78.00 2300. 14.5 74 2 "opel manta"
146 | 31.0 4 76.00 52.00 1649. 16.5 74 3 "toyota corona"
147 | 32.0 4 83.00 61.00 2003. 19.0 74 3 "datsun 710"
148 | 28.0 4 90.00 75.00 2125. 14.5 74 1 "dodge colt"
149 | 24.0 4 90.00 75.00 2108. 15.5 74 2 "fiat 128"
150 | 26.0 4 116.0 75.00 2246. 14.0 74 2 "fiat 124 tc"
151 | 24.0 4 120.0 97.00 2489. 15.0 74 3 "honda civic"
152 | 26.0 4 108.0 93.00 2391. 15.5 74 3 "subaru"
153 | 31.0 4 79.00 67.00 2000. 16.0 74 2 "fiat x1.9"
154 | 19.0 6 225.0 95.00 3264. 16.0 75 1 "plymouth valiant custom"
155 | 18.0 6 250.0 105.0 3459. 16.0 75 1 "chevrolet nova"
156 | 15.0 6 250.0 72.00 3432. 21.0 75 1 "mercury monarch"
157 | 15.0 6 250.0 72.00 3158. 19.5 75 1 "ford maverick"
158 | 16.0 8 400.0 170.0 4668. 11.5 75 1 "pontiac catalina"
159 | 15.0 8 350.0 145.0 4440. 14.0 75 1 "chevrolet bel air"
160 | 16.0 8 318.0 150.0 4498. 14.5 75 1 "plymouth grand fury"
161 | 14.0 8 351.0 148.0 4657. 13.5 75 1 "ford ltd"
162 | 17.0 6 231.0 110.0 3907. 21.0 75 1 "buick century"
163 | 16.0 6 250.0 105.0 3897. 18.5 75 1 "chevroelt chevelle malibu"
164 | 15.0 6 258.0 110.0 3730. 19.0 75 1 "amc matador"
165 | 18.0 6 225.0 95.00 3785. 19.0 75 1 "plymouth fury"
166 | 21.0 6 231.0 110.0 3039. 15.0 75 1 "buick skyhawk"
167 | 20.0 8 262.0 110.0 3221. 13.5 75 1 "chevrolet monza 2+2"
168 | 13.0 8 302.0 129.0 3169. 12.0 75 1 "ford mustang ii"
169 | 29.0 4 97.00 75.00 2171. 16.0 75 3 "toyota corolla"
170 | 23.0 4 140.0 83.00 2639. 17.0 75 1 "ford pinto"
171 | 20.0 6 232.0 100.0 2914. 16.0 75 1 "amc gremlin"
172 | 23.0 4 140.0 78.00 2592. 18.5 75 1 "pontiac astro"
173 | 24.0 4 134.0 96.00 2702. 13.5 75 3 "toyota corona"
174 | 25.0 4 90.00 71.00 2223. 16.5 75 2 "volkswagen dasher"
175 | 24.0 4 119.0 97.00 2545. 17.0 75 3 "datsun 710"
176 | 18.0 6 171.0 97.00 2984. 14.5 75 1 "ford pinto"
177 | 29.0 4 90.00 70.00 1937. 14.0 75 2 "volkswagen rabbit"
178 | 19.0 6 232.0 90.00 3211. 17.0 75 1 "amc pacer"
179 | 23.0 4 115.0 95.00 2694. 15.0 75 2 "audi 100ls"
180 | 23.0 4 120.0 88.00 2957. 17.0 75 2 "peugeot 504"
181 | 22.0 4 121.0 98.00 2945. 14.5 75 2 "volvo 244dl"
182 | 25.0 4 121.0 115.0 2671. 13.5 75 2 "saab 99le"
183 | 33.0 4 91.00 53.00 1795. 17.5 75 3 "honda civic cvcc"
184 | 28.0 4 107.0 86.00 2464. 15.5 76 2 "fiat 131"
185 | 25.0 4 116.0 81.00 2220. 16.9 76 2 "opel 1900"
186 | 25.0 4 140.0 92.00 2572. 14.9 76 1 "capri ii"
187 | 26.0 4 98.00 79.00 2255. 17.7 76 1 "dodge colt"
188 | 27.0 4 101.0 83.00 2202. 15.3 76 2 "renault 12tl"
189 | 17.5 8 305.0 140.0 4215. 13.0 76 1 "chevrolet chevelle malibu classic"
190 | 16.0 8 318.0 150.0 4190. 13.0 76 1 "dodge coronet brougham"
191 | 15.5 8 304.0 120.0 3962. 13.9 76 1 "amc matador"
192 | 14.5 8 351.0 152.0 4215. 12.8 76 1 "ford gran torino"
193 | 22.0 6 225.0 100.0 3233. 15.4 76 1 "plymouth valiant"
194 | 22.0 6 250.0 105.0 3353. 14.5 76 1 "chevrolet nova"
195 | 24.0 6 200.0 81.00 3012. 17.6 76 1 "ford maverick"
196 | 22.5 6 232.0 90.00 3085. 17.6 76 1 "amc hornet"
197 | 29.0 4 85.00 52.00 2035. 22.2 76 1 "chevrolet chevette"
198 | 24.5 4 98.00 60.00 2164. 22.1 76 1 "chevrolet woody"
199 | 29.0 4 90.00 70.00 1937. 14.2 76 2 "vw rabbit"
200 | 33.0 4 91.00 53.00 1795. 17.4 76 3 "honda civic"
201 | 20.0 6 225.0 100.0 3651. 17.7 76 1 "dodge aspen se"
202 | 18.0 6 250.0 78.00 3574. 21.0 76 1 "ford granada ghia"
203 | 18.5 6 250.0 110.0 3645. 16.2 76 1 "pontiac ventura sj"
204 | 17.5 6 258.0 95.00 3193. 17.8 76 1 "amc pacer d/l"
205 | 29.5 4 97.00 71.00 1825. 12.2 76 2 "volkswagen rabbit"
206 | 32.0 4 85.00 70.00 1990. 17.0 76 3 "datsun b-210"
207 | 28.0 4 97.00 75.00 2155. 16.4 76 3 "toyota corolla"
208 | 26.5 4 140.0 72.00 2565. 13.6 76 1 "ford pinto"
209 | 20.0 4 130.0 102.0 3150. 15.7 76 2 "volvo 245"
210 | 13.0 8 318.0 150.0 3940. 13.2 76 1 "plymouth volare premier v8"
211 | 19.0 4 120.0 88.00 3270. 21.9 76 2 "peugeot 504"
212 | 19.0 6 156.0 108.0 2930. 15.5 76 3 "toyota mark ii"
213 | 16.5 6 168.0 120.0 3820. 16.7 76 2 "mercedes-benz 280s"
214 | 16.5 8 350.0 180.0 4380. 12.1 76 1 "cadillac seville"
215 | 13.0 8 350.0 145.0 4055. 12.0 76 1 "chevy c10"
216 | 13.0 8 302.0 130.0 3870. 15.0 76 1 "ford f108"
217 | 13.0 8 318.0 150.0 3755. 14.0 76 1 "dodge d100"
218 | 31.5 4 98.00 68.00 2045. 18.5 77 3 "honda accord cvcc"
219 | 30.0 4 111.0 80.00 2155. 14.8 77 1 "buick opel isuzu deluxe"
220 | 36.0 4 79.00 58.00 1825. 18.6 77 2 "renault 5 gtl"
221 | 25.5 4 122.0 96.00 2300. 15.5 77 1 "plymouth arrow gs"
222 | 33.5 4 85.00 70.00 1945. 16.8 77 3 "datsun f-10 hatchback"
223 | 17.5 8 305.0 145.0 3880. 12.5 77 1 "chevrolet caprice classic"
224 | 17.0 8 260.0 110.0 4060. 19.0 77 1 "oldsmobile cutlass supreme"
225 | 15.5 8 318.0 145.0 4140. 13.7 77 1 "dodge monaco brougham"
226 | 15.0 8 302.0 130.0 4295. 14.9 77 1 "mercury cougar brougham"
227 | 17.5 6 250.0 110.0 3520. 16.4 77 1 "chevrolet concours"
228 | 20.5 6 231.0 105.0 3425. 16.9 77 1 "buick skylark"
229 | 19.0 6 225.0 100.0 3630. 17.7 77 1 "plymouth volare custom"
230 | 18.5 6 250.0 98.00 3525. 19.0 77 1 "ford granada"
231 | 16.0 8 400.0 180.0 4220. 11.1 77 1 "pontiac grand prix lj"
232 | 15.5 8 350.0 170.0 4165. 11.4 77 1 "chevrolet monte carlo landau"
233 | 15.5 8 400.0 190.0 4325. 12.2 77 1 "chrysler cordoba"
234 | 16.0 8 351.0 149.0 4335. 14.5 77 1 "ford thunderbird"
235 | 29.0 4 97.00 78.00 1940. 14.5 77 2 "volkswagen rabbit custom"
236 | 24.5 4 151.0 88.00 2740. 16.0 77 1 "pontiac sunbird coupe"
237 | 26.0 4 97.00 75.00 2265. 18.2 77 3 "toyota corolla liftback"
238 | 25.5 4 140.0 89.00 2755. 15.8 77 1 "ford mustang ii 2+2"
239 | 30.5 4 98.00 63.00 2051. 17.0 77 1 "chevrolet chevette"
240 | 33.5 4 98.00 83.00 2075. 15.9 77 1 "dodge colt m/m"
241 | 30.0 4 97.00 67.00 1985. 16.4 77 3 "subaru dl"
242 | 30.5 4 97.00 78.00 2190. 14.1 77 2 "volkswagen dasher"
243 | 22.0 6 146.0 97.00 2815. 14.5 77 3 "datsun 810"
244 | 21.5 4 121.0 110.0 2600. 12.8 77 2 "bmw 320i"
245 | 21.5 3 80.00 110.0 2720. 13.5 77 3 "mazda rx-4"
246 | 43.1 4 90.00 48.00 1985. 21.5 78 2 "volkswagen rabbit custom diesel"
247 | 36.1 4 98.00 66.00 1800. 14.4 78 1 "ford fiesta"
248 | 32.8 4 78.00 52.00 1985. 19.4 78 3 "mazda glc deluxe"
249 | 39.4 4 85.00 70.00 2070. 18.6 78 3 "datsun b210 gx"
250 | 36.1 4 91.00 60.00 1800. 16.4 78 3 "honda civic cvcc"
251 | 19.9 8 260.0 110.0 3365. 15.5 78 1 "oldsmobile cutlass salon brougham"
252 | 19.4 8 318.0 140.0 3735. 13.2 78 1 "dodge diplomat"
253 | 20.2 8 302.0 139.0 3570. 12.8 78 1 "mercury monarch ghia"
254 | 19.2 6 231.0 105.0 3535. 19.2 78 1 "pontiac phoenix lj"
255 | 20.5 6 200.0 95.00 3155. 18.2 78 1 "chevrolet malibu"
256 | 20.2 6 200.0 85.00 2965. 15.8 78 1 "ford fairmont (auto)"
257 | 25.1 4 140.0 88.00 2720. 15.4 78 1 "ford fairmont (man)"
258 | 20.5 6 225.0 100.0 3430. 17.2 78 1 "plymouth volare"
259 | 19.4 6 232.0 90.00 3210. 17.2 78 1 "amc concord"
260 | 20.6 6 231.0 105.0 3380. 15.8 78 1 "buick century special"
261 | 20.8 6 200.0 85.00 3070. 16.7 78 1 "mercury zephyr"
262 | 18.6 6 225.0 110.0 3620. 18.7 78 1 "dodge aspen"
263 | 18.1 6 258.0 120.0 3410. 15.1 78 1 "amc concord d/l"
264 | 19.2 8 305.0 145.0 3425. 13.2 78 1 "chevrolet monte carlo landau"
265 | 17.7 6 231.0 165.0 3445. 13.4 78 1 "buick regal sport coupe (turbo)"
266 | 18.1 8 302.0 139.0 3205. 11.2 78 1 "ford futura"
267 | 17.5 8 318.0 140.0 4080. 13.7 78 1 "dodge magnum xe"
268 | 30.0 4 98.00 68.00 2155. 16.5 78 1 "chevrolet chevette"
269 | 27.5 4 134.0 95.00 2560. 14.2 78 3 "toyota corona"
270 | 27.2 4 119.0 97.00 2300. 14.7 78 3 "datsun 510"
271 | 30.9 4 105.0 75.00 2230. 14.5 78 1 "dodge omni"
272 | 21.1 4 134.0 95.00 2515. 14.8 78 3 "toyota celica gt liftback"
273 | 23.2 4 156.0 105.0 2745. 16.7 78 1 "plymouth sapporo"
274 | 23.8 4 151.0 85.00 2855. 17.6 78 1 "oldsmobile starfire sx"
275 | 23.9 4 119.0 97.00 2405. 14.9 78 3 "datsun 200-sx"
276 | 20.3 5 131.0 103.0 2830. 15.9 78 2 "audi 5000"
277 | 17.0 6 163.0 125.0 3140. 13.6 78 2 "volvo 264gl"
278 | 21.6 4 121.0 115.0 2795. 15.7 78 2 "saab 99gle"
279 | 16.2 6 163.0 133.0 3410. 15.8 78 2 "peugeot 604sl"
280 | 31.5 4 89.00 71.00 1990. 14.9 78 2 "volkswagen scirocco"
281 | 29.5 4 98.00 68.00 2135. 16.6 78 3 "honda accord lx"
282 | 21.5 6 231.0 115.0 3245. 15.4 79 1 "pontiac lemans v6"
283 | 19.8 6 200.0 85.00 2990. 18.2 79 1 "mercury zephyr 6"
284 | 22.3 4 140.0 88.00 2890. 17.3 79 1 "ford fairmont 4"
285 | 20.2 6 232.0 90.00 3265. 18.2 79 1 "amc concord dl 6"
286 | 20.6 6 225.0 110.0 3360. 16.6 79 1 "dodge aspen 6"
287 | 17.0 8 305.0 130.0 3840. 15.4 79 1 "chevrolet caprice classic"
288 | 17.6 8 302.0 129.0 3725. 13.4 79 1 "ford ltd landau"
289 | 16.5 8 351.0 138.0 3955. 13.2 79 1 "mercury grand marquis"
290 | 18.2 8 318.0 135.0 3830. 15.2 79 1 "dodge st. regis"
291 | 16.9 8 350.0 155.0 4360. 14.9 79 1 "buick estate wagon (sw)"
292 | 15.5 8 351.0 142.0 4054. 14.3 79 1 "ford country squire (sw)"
293 | 19.2 8 267.0 125.0 3605. 15.0 79 1 "chevrolet malibu classic (sw)"
294 | 18.5 8 360.0 150.0 3940. 13.0 79 1 "chrysler lebaron town @ country (sw)"
295 | 31.9 4 89.00 71.00 1925. 14.0 79 2 "vw rabbit custom"
296 | 34.1 4 86.00 65.00 1975. 15.2 79 3 "maxda glc deluxe"
297 | 35.7 4 98.00 80.00 1915. 14.4 79 1 "dodge colt hatchback custom"
298 | 27.4 4 121.0 80.00 2670. 15.0 79 1 "amc spirit dl"
299 | 25.4 5 183.0 77.00 3530. 20.1 79 2 "mercedes benz 300d"
300 | 23.0 8 350.0 125.0 3900. 17.4 79 1 "cadillac eldorado"
301 | 27.2 4 141.0 71.00 3190. 24.8 79 2 "peugeot 504"
302 | 23.9 8 260.0 90.00 3420. 22.2 79 1 "oldsmobile cutlass salon brougham"
303 | 34.2 4 105.0 70.00 2200. 13.2 79 1 "plymouth horizon"
304 | 34.5 4 105.0 70.00 2150. 14.9 79 1 "plymouth horizon tc3"
305 | 31.8 4 85.00 65.00 2020. 19.2 79 3 "datsun 210"
306 | 37.3 4 91.00 69.00 2130. 14.7 79 2 "fiat strada custom"
307 | 28.4 4 151.0 90.00 2670. 16.0 79 1 "buick skylark limited"
308 | 28.8 6 173.0 115.0 2595. 11.3 79 1 "chevrolet citation"
309 | 26.8 6 173.0 115.0 2700. 12.9 79 1 "oldsmobile omega brougham"
310 | 33.5 4 151.0 90.00 2556. 13.2 79 1 "pontiac phoenix"
311 | 41.5 4 98.00 76.00 2144. 14.7 80 2 "vw rabbit"
312 | 38.1 4 89.00 60.00 1968. 18.8 80 3 "toyota corolla tercel"
313 | 32.1 4 98.00 70.00 2120. 15.5 80 1 "chevrolet chevette"
314 | 37.2 4 86.00 65.00 2019. 16.4 80 3 "datsun 310"
315 | 28.0 4 151.0 90.00 2678. 16.5 80 1 "chevrolet citation"
316 | 26.4 4 140.0 88.00 2870. 18.1 80 1 "ford fairmont"
317 | 24.3 4 151.0 90.00 3003. 20.1 80 1 "amc concord"
318 | 19.1 6 225.0 90.00 3381. 18.7 80 1 "dodge aspen"
319 | 34.3 4 97.00 78.00 2188. 15.8 80 2 "audi 4000"
320 | 29.8 4 134.0 90.00 2711. 15.5 80 3 "toyota corona liftback"
321 | 31.3 4 120.0 75.00 2542. 17.5 80 3 "mazda 626"
322 | 37.0 4 119.0 92.00 2434. 15.0 80 3 "datsun 510 hatchback"
323 | 32.2 4 108.0 75.00 2265. 15.2 80 3 "toyota corolla"
324 | 46.6 4 86.00 65.00 2110. 17.9 80 3 "mazda glc"
325 | 27.9 4 156.0 105.0 2800. 14.4 80 1 "dodge colt"
326 | 40.8 4 85.00 65.00 2110. 19.2 80 3 "datsun 210"
327 | 44.3 4 90.00 48.00 2085. 21.7 80 2 "vw rabbit c (diesel)"
328 | 43.4 4 90.00 48.00 2335. 23.7 80 2 "vw dasher (diesel)"
329 | 36.4 5 121.0 67.00 2950. 19.9 80 2 "audi 5000s (diesel)"
330 | 30.0 4 146.0 67.00 3250. 21.8 80 2 "mercedes-benz 240d"
331 | 44.6 4 91.00 67.00 1850. 13.8 80 3 "honda civic 1500 gl"
332 | 40.9 4 85.00 ? 1835. 17.3 80 2 "renault lecar deluxe"
333 | 33.8 4 97.00 67.00 2145. 18.0 80 3 "subaru dl"
334 | 29.8 4 89.00 62.00 1845. 15.3 80 2 "vokswagen rabbit"
335 | 32.7 6 168.0 132.0 2910. 11.4 80 3 "datsun 280-zx"
336 | 23.7 3 70.00 100.0 2420. 12.5 80 3 "mazda rx-7 gs"
337 | 35.0 4 122.0 88.00 2500. 15.1 80 2 "triumph tr7 coupe"
338 | 23.6 4 140.0 ? 2905. 14.3 80 1 "ford mustang cobra"
339 | 32.4 4 107.0 72.00 2290. 17.0 80 3 "honda accord"
340 | 27.2 4 135.0 84.00 2490. 15.7 81 1 "plymouth reliant"
341 | 26.6 4 151.0 84.00 2635. 16.4 81 1 "buick skylark"
342 | 25.8 4 156.0 92.00 2620. 14.4 81 1 "dodge aries wagon (sw)"
343 | 23.5 6 173.0 110.0 2725. 12.6 81 1 "chevrolet citation"
344 | 30.0 4 135.0 84.00 2385. 12.9 81 1 "plymouth reliant"
345 | 39.1 4 79.00 58.00 1755. 16.9 81 3 "toyota starlet"
346 | 39.0 4 86.00 64.00 1875. 16.4 81 1 "plymouth champ"
347 | 35.1 4 81.00 60.00 1760. 16.1 81 3 "honda civic 1300"
348 | 32.3 4 97.00 67.00 2065. 17.8 81 3 "subaru"
349 | 37.0 4 85.00 65.00 1975. 19.4 81 3 "datsun 210 mpg"
350 | 37.7 4 89.00 62.00 2050. 17.3 81 3 "toyota tercel"
351 | 34.1 4 91.00 68.00 1985. 16.0 81 3 "mazda glc 4"
352 | 34.7 4 105.0 63.00 2215. 14.9 81 1 "plymouth horizon 4"
353 | 34.4 4 98.00 65.00 2045. 16.2 81 1 "ford escort 4w"
354 | 29.9 4 98.00 65.00 2380. 20.7 81 1 "ford escort 2h"
355 | 33.0 4 105.0 74.00 2190. 14.2 81 2 "volkswagen jetta"
356 | 34.5 4 100.0 ? 2320. 15.8 81 2 "renault 18i"
357 | 33.7 4 107.0 75.00 2210. 14.4 81 3 "honda prelude"
358 | 32.4 4 108.0 75.00 2350. 16.8 81 3 "toyota corolla"
359 | 32.9 4 119.0 100.0 2615. 14.8 81 3 "datsun 200sx"
360 | 31.6 4 120.0 74.00 2635. 18.3 81 3 "mazda 626"
361 | 28.1 4 141.0 80.00 3230. 20.4 81 2 "peugeot 505s turbo diesel"
362 | 30.7 6 145.0 76.00 3160. 19.6 81 2 "volvo diesel"
363 | 25.4 6 168.0 116.0 2900. 12.6 81 3 "toyota cressida"
364 | 24.2 6 146.0 120.0 2930. 13.8 81 3 "datsun 810 maxima"
365 | 22.4 6 231.0 110.0 3415. 15.8 81 1 "buick century"
366 | 26.6 8 350.0 105.0 3725. 19.0 81 1 "oldsmobile cutlass ls"
367 | 20.2 6 200.0 88.00 3060. 17.1 81 1 "ford granada gl"
368 | 17.6 6 225.0 85.00 3465. 16.6 81 1 "chrysler lebaron salon"
369 | 28.0 4 112.0 88.00 2605. 19.6 82 1 "chevrolet cavalier"
370 | 27.0 4 112.0 88.00 2640. 18.6 82 1 "chevrolet cavalier wagon"
371 | 34.0 4 112.0 88.00 2395. 18.0 82 1 "chevrolet cavalier 2-door"
372 | 31.0 4 112.0 85.00 2575. 16.2 82 1 "pontiac j2000 se hatchback"
373 | 29.0 4 135.0 84.00 2525. 16.0 82 1 "dodge aries se"
374 | 27.0 4 151.0 90.00 2735. 18.0 82 1 "pontiac phoenix"
375 | 24.0 4 140.0 92.00 2865. 16.4 82 1 "ford fairmont futura"
376 | 36.0 4 105.0 74.00 1980. 15.3 82 2 "volkswagen rabbit l"
377 | 37.0 4 91.00 68.00 2025. 18.2 82 3 "mazda glc custom l"
378 | 31.0 4 91.00 68.00 1970. 17.6 82 3 "mazda glc custom"
379 | 38.0 4 105.0 63.00 2125. 14.7 82 1 "plymouth horizon miser"
380 | 36.0 4 98.00 70.00 2125. 17.3 82 1 "mercury lynx l"
381 | 36.0 4 120.0 88.00 2160. 14.5 82 3 "nissan stanza xe"
382 | 36.0 4 107.0 75.00 2205. 14.5 82 3 "honda accord"
383 | 34.0 4 108.0 70.00 2245 16.9 82 3 "toyota corolla"
384 | 38.0 4 91.00 67.00 1965. 15.0 82 3 "honda civic"
385 | 32.0 4 91.00 67.00 1965. 15.7 82 3 "honda civic (auto)"
386 | 38.0 4 91.00 67.00 1995. 16.2 82 3 "datsun 310 gx"
387 | 25.0 6 181.0 110.0 2945. 16.4 82 1 "buick century limited"
388 | 38.0 6 262.0 85.00 3015. 17.0 82 1 "oldsmobile cutlass ciera (diesel)"
389 | 26.0 4 156.0 92.00 2585. 14.5 82 1 "chrysler lebaron medallion"
390 | 22.0 6 232.0 112.0 2835 14.7 82 1 "ford granada l"
391 | 32.0 4 144.0 96.00 2665. 13.9 82 3 "toyota celica gt"
392 | 36.0 4 135.0 84.00 2370. 13.0 82 1 "dodge charger 2.2"
393 | 27.0 4 151.0 90.00 2950. 17.3 82 1 "chevrolet camaro"
394 | 27.0 4 140.0 86.00 2790. 15.6 82 1 "ford mustang gl"
395 | 44.0 4 97.00 52.00 2130. 24.6 82 2 "vw pickup"
396 | 32.0 4 135.0 84.00 2295. 11.6 82 1 "dodge rampage"
397 | 28.0 4 120.0 79.00 2625. 18.6 82 1 "ford ranger"
398 | 31.0 4 119.0 82.00 2720. 19.4 82 1 "chevy s-10"
399 |
--------------------------------------------------------------------------------