├── images ├── nn.png ├── nn2.png ├── animals │ ├── bug.jpg │ ├── bird.jpg │ ├── bird2.jpg │ ├── bird3.jpg │ ├── elba.jpg │ ├── hamish.jpg │ ├── poodle.jpg │ ├── tortoise.jpg │ ├── butterfly.jpg │ └── butterfly2.jpg └── isl_small.png ├── .gitignore ├── _bookdown.yml ├── ISLRv2-solutions.Rproj ├── index.Rmd ├── _output.yml ├── DESCRIPTION ├── .vscode └── settings.json ├── islrv2.css ├── .github └── workflows │ └── github-actions.yml ├── LICENSE.md ├── README.md ├── 13-multiple-testing.Rmd ├── 02-statistical-learning.Rmd ├── 11-survival-analysis-and-censored-data.Rmd ├── 05-resampling-methods.Rmd ├── 12-unsupervised-learning.Rmd ├── 09-support-vector-mechines.Rmd ├── 08-tree-based-methods.Rmd ├── 10-deep-learning.Rmd ├── 07-moving-beyond-linearity.Rmd └── data └── Auto.data /images/nn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/nn.png -------------------------------------------------------------------------------- /images/nn2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/nn2.png -------------------------------------------------------------------------------- /images/animals/bug.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/animals/bug.jpg -------------------------------------------------------------------------------- /images/isl_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/isl_small.png -------------------------------------------------------------------------------- /images/animals/bird.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/animals/bird.jpg -------------------------------------------------------------------------------- /images/animals/bird2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/animals/bird2.jpg -------------------------------------------------------------------------------- /images/animals/bird3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/animals/bird3.jpg -------------------------------------------------------------------------------- /images/animals/elba.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/animals/elba.jpg -------------------------------------------------------------------------------- /images/animals/hamish.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/animals/hamish.jpg -------------------------------------------------------------------------------- /images/animals/poodle.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/animals/poodle.jpg -------------------------------------------------------------------------------- 
/images/animals/tortoise.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/animals/tortoise.jpg -------------------------------------------------------------------------------- /images/animals/butterfly.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/animals/butterfly.jpg -------------------------------------------------------------------------------- /images/animals/butterfly2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danhalligan/ISLRv2-solutions/HEAD/images/animals/butterfly2.jpg -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | _book 6 | .DS_Store 7 | islrv2-solutions.rds 8 | _bookdown_files 9 | [0-9]*.md 10 | [0-9]*_files/ 11 | 12 | -------------------------------------------------------------------------------- /_bookdown.yml: -------------------------------------------------------------------------------- 1 | book_filename: "islrv2-solutions" 2 | language: 3 | ui: 4 | chapter_name: "" 5 | delete_merged_file: true 6 | new_session: yes 7 | repo: https://github.com/danhalligan/ISLRv2-solutions/ 8 | -------------------------------------------------------------------------------- /ISLRv2-solutions.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Website 19 | 20 | SpellingDictionary: en_US 21 | -------------------------------------------------------------------------------- /index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: An Introduction to Statistical Learning 3 | subtitle: Exercise solutions in R 4 | site: bookdown::bookdown_site 5 | github-repo: danhalligan/ISLRv2-solutions 6 | url: 'https\://danhalligan.github.io/ISLRv2-solutions/' 7 | --- 8 | 9 | # Introduction 10 | 11 | This bookdown document provides solutions for exercises in the book 12 | ["An Introduction to Statistical Learning with Applications in R"](https://www.statlearning.com/), 13 | second edition, by Gareth James, Daniela Witten, Trevor Hastie and Robert Tibshirani. 14 | 15 | ![](images/isl_small.png) 16 | -------------------------------------------------------------------------------- /_output.yml: -------------------------------------------------------------------------------- 1 | bookdown::gitbook: 2 | css: [islrv2.css, https://cdn.jsdelivr.net/gh/aaaakshat/cm-web-fonts@latest/fonts.css] 3 | config: 4 | toc: 5 | collapse: section 6 | before: | 7 |
<li>ISLRv2 Solutions</li>
8 |       after: |
9 |         <li>Published with bookdown</li>
  • 10 | edit: https://github.com/danhalligan/ISLRv2-solutions/edit/main/%s 11 | sharing: 12 | facebook: no 13 | twitter: no 14 | github: yes 15 | all: [] 16 | github-repo: danhalligan/ISLRv2-solutions 17 | url: https://danhalligan.github.io/ISLRv2-solutions/ 18 | 19 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: placeholder 2 | Type: Book 3 | Title: Does not matter. 4 | Version: 0.0.1 5 | biocviews: 6 | Imports: 7 | ape, 8 | BART, 9 | bookdown, 10 | boot, 11 | class, 12 | corrplot, 13 | e1071, 14 | gam, 15 | gbm, 16 | ggfortify, 17 | ggplot2, 18 | ggtree, 19 | glmnet, 20 | ISLR2, 21 | keras, 22 | knitr, 23 | leaps, 24 | MASS, 25 | neuralnet, 26 | plotly, 27 | pls, 28 | randomForest, 29 | showtext, 30 | sigmoid, 31 | splines, 32 | survival, 33 | tree, 34 | tidyverse, 35 | xfun 36 | Remotes: 37 | rstudio/bookdown, 38 | github::YuLab-SMU/ggtree 39 | 40 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "cSpell.words": [ 3 | "ANOVA", 4 | "Bonferroni", 5 | "boxplots", 6 | "covariate", 7 | "dendrogram", 8 | "dendrograms", 9 | "exponentiating", 10 | "FWER", 11 | "hypergeometric", 12 | "Kaplan", 13 | "logit", 14 | "LOOCV", 15 | "multinomial", 16 | "overfit", 17 | "overfitting", 18 | "quantile", 19 | "quantiles", 20 | "scalings", 21 | "scatterplot", 22 | "scatterplots", 23 | "softmax", 24 | "stochasticity", 25 | "timepoint" 26 | ], 27 | "cSpell.enabled": true, 28 | "cSpell.language": "en-US" 29 | } -------------------------------------------------------------------------------- /islrv2.css: -------------------------------------------------------------------------------- 1 | #header .title { 2 | color: #2E71B6; 3 | font-family: "Computer Modern Serif", serif; 4 | font-weight: normal; 5 | font-size: 3em; 6 | letter-spacing: -0.8px; 7 | line-height: 1.2; 8 | } 9 | 10 | h1, 11 | h2, 12 | h3 { 13 | color: #2E71B6; 14 | font-family: "Computer Modern Serif", serif; 15 | font-weight: normal; 16 | } 17 | 18 | .book .book-body .page-wrapper .page-inner section.normal h1 { 19 | font-size: 2.5em; 20 | } 21 | 22 | .book .book-body .page-wrapper .page-inner section.normal h2 { 23 | font-size: 2em; 24 | font-style: italic; 25 | } 26 | 27 | h3>.header-section-number { 28 | display: none; 29 | } 30 | 31 | .book .book-body .page-wrapper .page-inner section.normal pre.sourceCode { 32 | background: #FDF5E7; 33 | } 34 | 35 | .book .book-body .page-wrapper .page-inner { 36 | max-width: 1000px !important; 37 | } -------------------------------------------------------------------------------- /.github/workflows/github-actions.yml: -------------------------------------------------------------------------------- 1 | name: Build and Deploy 2 | on: [push, workflow_dispatch] 3 | permissions: 4 | contents: write 5 | jobs: 6 | build-and-deploy: 7 | concurrency: ci-${{ github.ref }} 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v2 11 | 12 | - uses: r-lib/actions/setup-pandoc@v1 13 | 14 | - uses: r-lib/actions/setup-r@v2 15 | with: 16 | use-public-rspm: true 17 | 18 | - uses: r-lib/actions/setup-r-dependencies@v2 19 | with: 20 | cache: false 21 | 22 | - name: Setup keras 23 | run: Rscript -e "keras::install_keras()" 24 | 25 | - name: Build 🔧 26 | run: Rscript -e "bookdown::render_book('index.Rmd', 'bookdown::gitbook')" 27 
| 28 | - name: Deploy 🚀 29 | uses: JamesIves/github-pages-deploy-action@v4.4.0 30 | with: 31 | folder: _book 32 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2022 ISLRv2-solutions authors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # An Introduction to Statistical Learning with Applications in R second edition solutions 2 | 3 | [![Build and Deploy](https://github.com/danhalligan/ISLRv2-solutions/actions/workflows/github-actions.yml/badge.svg)](https://github.com/danhalligan/ISLRv2-solutions/actions/workflows/github-actions.yml) 4 | ![License](https://img.shields.io/github/license/danhalligan/ISLRv2-solutions) 5 | 6 | ![ISLR v2 cover](images/isl_small.png) 7 | 8 | This repository provides my solutions for all exercises in the book 9 | ["An Introduction to Statistical Learning with Applications in R"](https://www.statlearning.com/), 10 | second edition, 11 | by Gareth James, Daniela Witten, Trevor Hastie and Robert Tibshirani. 12 | 13 | If you use these solutions or find them useful, please star this repository! 14 | 15 | ## About these solutions 16 | 17 | The solutions are written in [bookdown] format using (my) 18 | [ISLRv2 solutions template](https://github.com/danhalligan/ISLRv2-solutions-template). 19 | 20 | For anyone reading this book, I believe there is great value in deriving 21 | the solutions yourself, and the template above can be forked to provide a 22 | great starting point as I've created template Rmarkdown files for each chapter 23 | and transcribed all questions as quotes within the chapter files leaving space 24 | for you to write your solutions either as text, or by writing R code chunks. 25 | For more details, refer to the template repository. 26 | 27 | I've tried my best to provide solutions to each problem in this book, and I 28 | believe my answers should be (at least for the most part) correct. The solutions 29 | are relatively concise but hopefully comprehensive enough to address the 30 | purpose of each question. 
31 | 32 | If, when reading these solutions, you notice any inconsistencies, mistakes or 33 | have any other questions, please feel free to raise an issue or provide 34 | suggestions and I'll happily take a look and try to incorporate. 35 | 36 | ## Compiled solutions 37 | 38 | The HTML book is compiled from the Rmarkdown files in using a [GitHub Actions] 39 | workflow and then automatically deployed to [GitHub pages]. 40 | 41 | You can view these solutions hosted from GitHub at the following location: 42 | 43 | * 44 | 45 | ## Building the solutions 46 | 47 | The R dependencies are provided in the `DESCRIPTION` file and you can install them by running: 48 | 49 | ```r 50 | devtools::install() 51 | ``` 52 | 53 | Other than that you need the Python packages [tensorflow and keras] installed in the Python environment you are using to run the examples: 54 | 55 | ```bash 56 | pip3 install tensorflow keras 57 | ``` 58 | 59 | Rebuild the book with compiled solutions by running: 60 | 61 | ```r 62 | bookdown::render_book('index.Rmd', 'bookdown::gitbook') 63 | ``` 64 | 65 | To render only one file/chapter (e.g. solutions for chapter 2), use: 66 | 67 | ```r 68 | rmarkdown::render(input="02-statistical-learning.Rmd", output_format=bookdown::pdf_document2()) 69 | ``` 70 | 71 | [bookdown]: https://github.com/rstudio/bookdown 72 | [GitHub Actions]: https://docs.github.com/en/actions 73 | [GitHub pages]: https://pages.github.com/ 74 | [tensorflow and keras]: https://tensorflow.rstudio.com/install/ 75 | -------------------------------------------------------------------------------- /13-multiple-testing.Rmd: -------------------------------------------------------------------------------- 1 | # Multiple Testing 2 | 3 | ## Conceptual 4 | 5 | ### Question 1 6 | 7 | > Suppose we test $m$ null hypotheses, all of which are true. We control the 8 | > Type I error for each null hypothesis at level $\alpha$. For each sub-problem, 9 | > justify your answer. 10 | > 11 | > a. In total, how many Type I errors do we expect to make? 12 | 13 | We expect $m\alpha$. 14 | 15 | > b. Suppose that the m tests that we perform are independent. What is the 16 | > family-wise error rate associated with these m tests? 17 | > 18 | > _Hint: If two events A and B are independent, then Pr(A ∩ B) = Pr(A) Pr(B)._ 19 | 20 | The family-wise error rate (FWER) is defined as the probability of making at 21 | least one Type I error. We can think of this as 1 minus the probability of 22 | no type I errors, which is: 23 | 24 | $1 - (1 - \alpha)^m$ 25 | 26 | Alternatively, for two tests this is: Pr(A ∪ B) = Pr(A) + Pr(B) - Pr(A ∩ B). 27 | For independent tests this is $\alpha + \alpha - \alpha^2$ 28 | 29 | > c. Suppose that $m = 2$, and that the p-values for the two tests are 30 | > positively correlated, so that if one is small then the other will tend to 31 | > be small as well, and if one is large then the other will tend to be large. 32 | > How does the family-wise error rate associated with these $m = 2$ tests 33 | > qualitatively compare to the answer in (b) with $m = 2$? 34 | > 35 | > _Hint: First, suppose that the two p-values are perfectly correlated._ 36 | 37 | If they were perfectly correlated, we would effectively be performing a single 38 | test (thus FWER would be $alpha$). In the case when they are positively 39 | correlated therefore, we can expect the FWER to be less than in b. 40 | 41 | Alternatively, as above, FWEW = Pr(A ∪ B) = Pr(A) + Pr(B) - Pr(A ∩ B). 
42 | For perfectly positively correlated tests Pr(A ∩ B) = $\alpha$, so the 43 | FWEW is $\alpha$ which is smaller than b. 44 | 45 | > d. Suppose again that $m = 2$, but that now the p-values for the two tests are 46 | > negatively correlated, so that if one is large then the other will tend to 47 | > be small. How does the family-wise error rate associated with these $m = 2$ 48 | > tests qualitatively compare to the answer in (b) with $m = 2$? 49 | > 50 | > _Hint: First, suppose that whenever one p-value is less than $\alpha$,_ 51 | > _then the other will be greater than $\alpha$. In other words, we can_ 52 | > _never reject both null hypotheses._ 53 | 54 | Taking the equation above, for two tests, 55 | FWEW = Pr(A ∪ B) = Pr(A) + Pr(B) - Pr(A ∩ B). In the case considered in the 56 | hint Pr(A ∩ B) = 0, so Pr(A ∪ B) = $2\alpha$, which is larger than b. 57 | 58 | ### Question 2 59 | 60 | > Suppose that we test $m$ hypotheses, and control the Type I error for each 61 | > hypothesis at level $\alpha$. Assume that all $m$ p-values are independent, 62 | > and that all null hypotheses are true. 63 | > 64 | > a. Let the random variable $A_j$ equal 1 if the $j$th null hypothesis is 65 | > rejected, and 0 otherwise. What is the distribution of $A_j$? 66 | 67 | $A_j$ follows a Bernoulli distribution: $A_j \sim \text{Bernoulli}(p)$ 68 | 69 | > b. What is the distribution of $\sum_{j=1}^m A_j$? 70 | 71 | Follows a binomial distribution $\sum_{j=1}^m A_j \sim Bi(m, \alpha)$. 72 | 73 | > c. What is the standard deviation of the number of Type I errors that we will 74 | > make? 75 | 76 | The variance of a Binomial is $npq$, so for this situation the standard 77 | deviation would be $\sqrt{m \alpha (1-\alpha)}$. 78 | 79 | ### Question 3 80 | 81 | > Suppose we test $m$ null hypotheses, and control the Type I error for the 82 | > $j$th null hypothesis at level $\alpha_j$, for $j=1,...,m$. Argue that the 83 | > family-wise error rate is no greater than $\sum_{j=1}^m \alpha_j$. 84 | 85 | $p(A \cup B) = p(A) + p(B)$ if $A$ and $B$ are independent or 86 | $p(A) + p(B) - p(A \cap B)$ when they are not. Since $p(A \cap B)$ must be 87 | positive, $p(A \cup B) < p(A) + p(B)$ (whether independent or not). 88 | 89 | Therefore, the probability of a type I error in _any_ of $m$ hypotheses can 90 | be no larger than the sum of the probabilities for each individual hypothesis 91 | (which is $\alpha_j$ for the $j$th). 92 | 93 | ### Question 4 94 | 95 | > Suppose we test $m = 10$ hypotheses, and obtain the p-values shown in Table 96 | > 13.4. 97 | 98 | ```{r} 99 | pvals <- c(0.0011, 0.031, 0.017, 0.32, 0.11, 0.90, 0.07, 0.006, 0.004, 0.0009) 100 | names(pvals) <- paste0("H", sprintf("%02d", 1:10)) 101 | ``` 102 | 103 | > a. Suppose that we wish to control the Type I error for each null hypothesis 104 | > at level $\alpha = 0.05$. Which null hypotheses will we reject? 105 | 106 | ```{r} 107 | names(which(pvals < 0.05)) 108 | ``` 109 | 110 | We reject all NULL hypotheses where $p < 0.05$. 111 | 112 | > b. Now suppose that we wish to control the FWER at level $\alpha = 0.05$. 113 | > Which null hypotheses will we reject? Justify your answer. 114 | 115 | ```{r} 116 | names(which(pvals < 0.05 / 10)) 117 | ``` 118 | 119 | We reject all NULL hypotheses where $p < 0.005$. 120 | 121 | > c. Now suppose that we wish to control the FDR at level $q = 0.05$. Which null 122 | > hypotheses will we reject? Justify your answer. 
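For reference, the Benjamini-Hochberg step-up rule that `p.adjust(..., "fdr")` applies below can also be written out by hand. The helper below (`bh_reject`, a made-up name for this sketch) rejects the $k$ smallest p-values, where $k$ is the largest index such that the $i$th smallest p-value is at most $(i/m)q$.

```{r}
bh_reject <- function(p, q) {
  # Step-up rule: find the largest k with p_(k) <= (k/m) * q,
  # then reject the k smallest p-values.
  m <- length(p)
  ord <- order(p)
  below <- p[ord] <= (seq_len(m) / m) * q
  k <- if (any(below)) max(which(below)) else 0
  rejected <- rep(FALSE, m)
  if (k > 0) rejected[ord[seq_len(k)]] <- TRUE
  setNames(rejected, names(p))
}
names(which(bh_reject(pvals, 0.05)))
```

This should agree with the `p.adjust()` call used in the answer below.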
123 | 124 | ```{r} 125 | names(which(p.adjust(pvals, "fdr") < 0.05)) 126 | ``` 127 | 128 | We reject all NULL hypotheses where $q < 0.05$. 129 | 130 | > d. Now suppose that we wish to control the FDR at level $q = 0.2$. Which null 131 | > hypotheses will we reject? Justify your answer. 132 | 133 | ```{r} 134 | names(which(p.adjust(pvals, "fdr") < 0.2)) 135 | ``` 136 | 137 | We reject all NULL hypotheses where $q < 0.2$. 138 | 139 | > e. Of the null hypotheses rejected at FDR level $q = 0.2$, approximately how 140 | > many are false positives? Justify your answer. 141 | 142 | We expect 20% (in this case 2 out of the 8) rejections to be false (false 143 | positives). 144 | 145 | ### Question 5 146 | 147 | > For this problem, you will make up p-values that lead to a certain number of 148 | > rejections using the Bonferroni and Holm procedures. 149 | > 150 | > a. Give an example of five p-values (i.e. five numbers between 0 and 1 which, 151 | > for the purpose of this problem, we will interpret as p-values) for which 152 | > both Bonferroni’s method and Holm’s method reject exactly one null hypothesis 153 | > when controlling the FWER at level 0.1. 154 | 155 | In this case, for Bonferroni, we need one p-value to be less than $0.1 / 5 = 156 | 0.02$. and the others to be above. For Holm's method, we need the most 157 | significant p-value to be below $0.1/(5 + 1 - 1) = 0.02$ also. 158 | 159 | An example would be: 1, 1, 1, 1, 0.001. 160 | 161 | ```{r} 162 | pvals <- c(1, 1, 1, 1, 0.001) 163 | sum(p.adjust(pvals, method = "bonferroni") < 0.1) 164 | sum(p.adjust(pvals, method = "holm") < 0.1) 165 | ``` 166 | 167 | > b. Now give an example of five p-values for which Bonferroni rejects one 168 | > null hypothesis and Holm rejects more than one null hypothesis at level 0.1. 169 | 170 | An example would be: 1, 1, 1, 0.02, 0.001. For Holm's method we reject two 171 | because $0.02 < 0.1/(5 + 1 - 2)$. 172 | 173 | ```{r} 174 | pvals <- c(1, 1, 1, 0.02, 0.001) 175 | sum(p.adjust(pvals, method = "bonferroni") < 0.1) 176 | sum(p.adjust(pvals, method = "holm") < 0.1) 177 | ``` 178 | 179 | ### Question 6 180 | 181 | > For each of the three panels in Figure 13.3, answer the following questions: 182 | 183 | * There are always: 8 positives (red) and 2 negatives (black). 184 | * False / true positives are black / red points _below_ the line respectively. 185 | * False / true negatives are red / black points _above_ the line respectively. 186 | * Type I / II errors are the same as false positives and false negatives 187 | respectively. 188 | 189 | > a. How many false positives, false negatives, true positives, true negatives, 190 | > Type I errors, and Type II errors result from applying the Bonferroni 191 | > procedure to control the FWER at level $\alpha = 0.05$? 192 | 193 | | Panel | FP | FN | TP | TN | Type I | Type II | 194 | |------ |--- |--- |--- |--- |------- |-------- | 195 | | 1 | 0 | 1 | 7 | 2 | 0 | 1 | 196 | | 2 | 0 | 1 | 7 | 2 | 0 | 1 | 197 | | 3 | 0 | 5 | 3 | 2 | 0 | 5 | 198 | 199 | > b. How many false positives, false negatives, true positives, true negatives, 200 | > Type I errors, and Type II errors result from applying the Holm procedure to 201 | > control the FWER at level $\alpha = 0.05$? 202 | 203 | | Panel | FP | FN | TP | TN | Type I | Type II | 204 | |------ |--- |--- |--- |--- |------- |-------- | 205 | | 1 | 0 | 1 | 7 | 2 | 0 | 1 | 206 | | 2 | 0 | 0 | 8 | 2 | 0 | 0 | 207 | | 3 | 0 | 0 | 8 | 2 | 0 | 0 | 208 | 209 | > c. 
What is the false discovery rate associated with using the Bonferroni 210 | > procedure to control the FWER at level $\alpha = 0.05$? 211 | 212 | False discovery rate is the expected ratio of false positives (FP) to total 213 | positive (FP + TP). 214 | 215 | For panels 1, 2, 3 this would be 0/7, 0/7 and 0/3 respectively. 216 | 217 | > d. What is the false discovery rate associated with using the Holm procedure 218 | > to control the FWER at level $\alpha = 0.05$? 219 | 220 | For panels 1, 2, 3 this would be 0/7, 0/8 and 0/8 respectively. 221 | 222 | > e. How would the answers to (a) and (c) change if we instead used the 223 | > Bonferroni procedure to control the FWER at level $\alpha = 0.001$? 224 | 225 | This would equate to a more stringent threshold. We would not call any more 226 | false positives, so the results would not change. 227 | 228 | ## Applied 229 | 230 | ### Question 7 231 | 232 | > This problem makes use of the `Carseats` dataset in the `ISLR2` package. 233 | > 234 | > a. For each quantitative variable in the dataset besides `Sales`, fit a linear 235 | > model to predict `Sales` using that quantitative variable. Report the p-values 236 | > associated with the coefficients for the variables. That is, for each model of 237 | > the form $Y = \beta_0 + \beta_1X + \epsilon$, report the p-value associated 238 | > with the coefficient $\beta_1$. Here, $Y$ represents `Sales` and $X$ 239 | > represents one of the other quantitative variables. 240 | 241 | ```{r} 242 | library(ISLR2) 243 | 244 | nm <- c("CompPrice", "Income", "Advertising", "Population", "Price", "Age") 245 | pvals <- sapply(nm, function(n) { 246 | summary(lm(Carseats[["Sales"]] ~ Carseats[[n]]))$coef[2, 4] 247 | }) 248 | ``` 249 | 250 | > b. Suppose we control the Type I error at level $\alpha = 0.05$ for the 251 | > p-values obtained in (a). Which null hypotheses do we reject? 252 | 253 | ```{r} 254 | names(which(pvals < 0.05)) 255 | ``` 256 | 257 | > c. Now suppose we control the FWER at level 0.05 for the p-values. Which null 258 | > hypotheses do we reject? 259 | 260 | ```{r} 261 | names(which(pvals < 0.05 / length(nm))) 262 | ``` 263 | 264 | > d. Finally, suppose we control the FDR at level 0.2 for the p-values. Which 265 | > null hypotheses do we reject? 266 | 267 | ```{r} 268 | names(which(p.adjust(pvals, "fdr") < 0.2)) 269 | ``` 270 | 271 | ### Question 8 272 | 273 | > In this problem, we will simulate data from $m = 100$ fund managers. 274 | > 275 | > ```r 276 | > set.seed(1) 277 | > n <- 20 278 | > m <- 100 279 | > X <- matrix(rnorm(n * m), ncol = m) 280 | > ``` 281 | 282 | ```{r} 283 | set.seed(1) 284 | n <- 20 285 | m <- 100 286 | X <- matrix(rnorm(n * m), ncol = m) 287 | ``` 288 | 289 | > These data represent each fund manager’s percentage returns for each of $n = 290 | > 20$ months. We wish to test the null hypothesis that each fund manager’s 291 | > percentage returns have population mean equal to zero. Notice that we 292 | > simulated the data in such a way that each fund manager’s percentage returns 293 | > do have population mean zero; in other words, all $m$ null hypotheses are true. 294 | > 295 | > a. Conduct a one-sample $t$-test for each fund manager, and plot a histogram 296 | > of the $p$-values obtained. 297 | 298 | ```{r} 299 | pvals <- apply(X, 2, function(p) t.test(p)$p.value) 300 | hist(pvals, main = NULL) 301 | ``` 302 | 303 | > b. If we control Type I error for each null hypothesis at level $\alpha = 304 | > 0.05$, then how many null hypotheses do we reject? 
305 | 306 | ```{r} 307 | sum(pvals < 0.05) 308 | ``` 309 | > c. If we control the FWER at level 0.05, then how many null hypotheses do we 310 | > reject? 311 | 312 | ```{r} 313 | sum(pvals < 0.05 / length(pvals)) 314 | ``` 315 | 316 | > d. If we control the FDR at level 0.05, then how many null hypotheses do we 317 | > reject? 318 | 319 | ```{r} 320 | sum(p.adjust(pvals, "fdr") < 0.05) 321 | ``` 322 | 323 | > e. Now suppose we “cherry-pick” the 10 fund managers who perform the best in 324 | > our data. If we control the FWER for just these 10 fund managers at level 325 | > 0.05, then how many null hypotheses do we reject? If we control the FDR for 326 | > just these 10 fund managers at level 0.05, then how many null hypotheses do we 327 | > reject? 328 | 329 | ```{r} 330 | best <- order(apply(X, 2, sum), decreasing = TRUE)[1:10] 331 | sum(pvals[best] < 0.05 / 10) 332 | sum(p.adjust(pvals[best], "fdr") < 0.05) 333 | ``` 334 | 335 | > f. Explain why the analysis in (e) is misleading. 336 | > 337 | > _Hint The standard approaches for controlling the FWER and FDR assume that all 338 | > tested null hypotheses are adjusted for multiplicity, and that no 339 | > “cherry-picking” of the smallest p-values has occurred. What goes wrong if we 340 | > cherry-pick?_ 341 | 342 | This is misleading because we are not correctly accounting for all tests 343 | performed. Cherry picking the similar to repeating a test until by chance we 344 | find a significant result. 345 | -------------------------------------------------------------------------------- /02-statistical-learning.Rmd: -------------------------------------------------------------------------------- 1 | # Statistical Learning 2 | 3 | ## Conceptual 4 | 5 | ### Question 1 6 | 7 | > For each of parts (a) through (d), indicate whether we would generally expect 8 | > the performance of a flexible statistical learning method to be better or 9 | > worse than an inflexible method. Justify your answer. 10 | > 11 | > a. The sample size $n$ is extremely large, and the number of predictors $p$ is 12 | > small. 13 | 14 | Flexible best - opposite of b. 15 | 16 | > b. The number of predictors $p$ is extremely large, and the number of 17 | > observations $n$ is small. 18 | 19 | Inflexible best - high chance of some predictors being randomly associated. 20 | 21 | > c. The relationship between the predictors and response is highly 22 | > non-linear. 23 | 24 | Flexible best - inflexible leads to high bias. 25 | 26 | > d. The variance of the error terms, i.e. $\sigma^2 = Var(\epsilon)$, is 27 | > extremely high. 28 | 29 | Inflexible best - opposite of c. 30 | 31 | ### Question 2 32 | 33 | > Explain whether each scenario is a classification or regression problem, and 34 | > indicate whether we are most interested in inference or prediction. Finally, 35 | > provide $n$ and $p$. 36 | > 37 | > a. We collect a set of data on the top 500 firms in the US. For each firm 38 | > we record profit, number of employees, industry and the CEO salary. We are 39 | > interested in understanding which factors affect CEO salary. 40 | 41 | $n=500$, $p=3$, regression, inference. 42 | 43 | > b. We are considering launching a new product and wish to know whether 44 | > it will be a success or a failure. We collect data on 20 similar products 45 | > that were previously launched. For each product we have recorded whether it 46 | > was a success or failure, price charged for the product, marketing budget, 47 | > competition price, and ten other variables. 
48 | 49 | $n=20$, $p=13$, classification, prediction. 50 | 51 | > c. We are interested in predicting the % change in the USD/Euro exchange 52 | > rate in relation to the weekly changes in the world stock markets. Hence we 53 | > collect weekly data for all of 2012. For each week we record the % change 54 | > in the USD/Euro, the % change in the US market, the % change in the British 55 | > market, and the % change in the German market. 56 | 57 | $n=52$, $p=3$, regression, prediction. 58 | 59 | ### Question 3 60 | 61 | > We now revisit the bias-variance decomposition. 62 | > 63 | > a. Provide a sketch of typical (squared) bias, variance, training error, 64 | > test error, and Bayes (or irreducible) error curves, on a single plot, as 65 | > we go from less flexible statistical learning methods towards more flexible 66 | > approaches. The x-axis should represent the amount of flexibility in the 67 | > method, and the y-axis should represent the values for each curve. There 68 | > should be five curves. Make sure to label each one. 69 | > 70 | > b. Explain why each of the five curves has the shape displayed in 71 | > part (a). 72 | 73 | * (squared) bias: Decreases with increasing flexibility (Generally, more 74 | flexible methods result in less bias). 75 | * variance: Increases with increasing flexibility (In general, more flexible 76 | statistical methods have higher variance). 77 | * training error: Decreases with model flexibility (More complex models will 78 | better fit the training data). 79 | * test error: Decreases initially, then increases due to overfitting (less 80 | bias but more training error). 81 | * Bayes (irreducible) error: fixed (does not change with model). 82 | 83 | ### Question 4 84 | 85 | > You will now think of some real-life applications for statistical learning. 86 | > 87 | > a. Describe three real-life applications in which classification might 88 | > be useful. Describe the response, as well as the predictors. Is the goal of 89 | > each application inference or prediction? Explain your answer. 90 | 91 | * Coffee machine cleaned? (day of week, person assigned), inference. 92 | * Is a flight delayed? (airline, airport etc), inference. 93 | * Beer type (IPA, pilsner etc.), prediction. 94 | 95 | > b. Describe three real-life applications in which regression might be 96 | > useful. Describe the response, as well as the predictors. Is the goal of 97 | > each application inference or prediction? Explain your answer. 98 | 99 | * Amount of bonus paid (profitability, client feedback), prediction. 100 | * Person's height, prediction. 101 | * House price, inference. 102 | 103 | > c. Describe three real-life applications in which cluster analysis might be 104 | > useful. 105 | 106 | * RNAseq tumour gene expression data. 107 | * SNPs in human populations. 108 | * Frequencies of mutations (with base pair context) in somatic mutation data. 109 | 110 | ### Question 5 111 | 112 | > What are the advantages and disadvantages of a very flexible (versus a less 113 | > flexible) approach for regression or classification? Under what circumstances 114 | > might a more flexible approach be preferred to a less flexible approach? When 115 | > might a less flexible approach be preferred? 116 | 117 | Inflexible is more interpretable, fewer observations required, can be biased. 118 | Flexible can overfit (high error variance). In cases where we have high $n$ or 119 | non-linear patterns flexible will be preferred. 
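To make the trade-off concrete, here is a minimal sketch (with an arbitrary seed and noise level) that simulates a non-linear relationship and compares an inflexible linear fit with a flexible smoothing spline.

```{r}
set.seed(42)
x_sim <- runif(100, 0, 10)
y_sim <- sin(x_sim) + rnorm(100, sd = 0.3)   # non-linear truth plus noise
fit_linear <- lm(y_sim ~ x_sim)              # inflexible: cannot capture the curvature
fit_spline <- smooth.spline(x_sim, y_sim)    # flexible: tracks the sine-shaped trend
plot(x_sim, y_sim, col = "grey50", pch = 19, cex = 0.6)
abline(fit_linear, col = "red", lwd = 2)
lines(fit_spline, col = "blue", lwd = 2)
```

With many observations and a non-linear pattern the flexible fit wins; with few observations or a truly linear relationship the inflexible fit would be preferred.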
120 | 121 | ### Question 6 122 | 123 | > Describe the differences between a parametric and a non-parametric statistical 124 | > learning approach. What are the advantages of a parametric approach to 125 | > regression or classification (as opposed to a non-parametric approach)? What 126 | > are its disadvantages? 127 | 128 | Parametric uses (model) parameters. Parametric models can be more interpretable 129 | as there is a model behind how data is generated. However, the disadvantage is 130 | that the model might not reflect reality. If the model is too far from the 131 | truth, estimates will be poor and more flexible models can fit many different 132 | forms and require more parameters (leading to overfitting). Non-parametric 133 | approaches do not estimate a small number of parameters, so a large number 134 | of observations may be needed to obtain accurate estimates. 135 | 136 | ### Question 7 137 | 138 | > The table below provides a training data set containing six observations, 139 | > three predictors, and one qualitative response variable. 140 | > 141 | > | Obs. | $X_1$ | $X_2$ | $X_3$ | $Y$ | 142 | > |------|-------|-------|-------|-------| 143 | > | 1 | 0 | 3 | 0 | Red | 144 | > | 2 | 2 | 0 | 0 | Red | 145 | > | 3 | 0 | 1 | 3 | Red | 146 | > | 4 | 0 | 1 | 2 | Green | 147 | > | 5 | -1 | 0 | 1 | Green | 148 | > | 6 | 1 | 1 | 1 | Red | 149 | > 150 | > Suppose we wish to use this data set to make a prediction for $Y$ when 151 | > $X_1 = X_2 = X_3 = 0$ using $K$-nearest neighbors. 152 | > 153 | > a. Compute the Euclidean distance between each observation and the test 154 | > point, $X_1 = X_2 = X_3 = 0$. 155 | 156 | ```{r} 157 | dat <- data.frame( 158 | "x1" = c(0, 2, 0, 0, -1, 1), 159 | "x2" = c(3, 0, 1, 1, 0, 1), 160 | "x3" = c(0, 0, 3, 2, 1, 1), 161 | "y" = c("Red", "Red", "Red", "Green", "Green", "Red") 162 | ) 163 | 164 | # Euclidean distance between points and c(0, 0, 0) 165 | dist <- sqrt(dat[["x1"]]^2 + dat[["x2"]]^2 + dat[["x3"]]^2) 166 | signif(dist, 3) 167 | ``` 168 | 169 | > b. What is our prediction with $K = 1$? Why? 170 | 171 | ```{r} 172 | knn <- function(k) { 173 | names(which.max(table(dat[["y"]][order(dist)[1:k]]))) 174 | } 175 | knn(1) 176 | ``` 177 | 178 | Green (based on data point 5 only) 179 | 180 | > c. What is our prediction with $K = 3$? Why? 181 | 182 | ```{r} 183 | knn(3) 184 | ``` 185 | 186 | Red (based on data points 2, 5, 6) 187 | 188 | > d. If the Bayes decision boundary in this problem is highly non-linear, then 189 | > would we expect the best value for $K$ to be large or small? Why? 190 | 191 | Small (high $k$ leads to linear boundaries due to averaging) 192 | 193 | ## Applied 194 | 195 | ### Question 8 196 | 197 | > This exercise relates to the `College` data set, which can be found in 198 | > the file `College.csv`. It contains a number of variables for 777 different 199 | > universities and colleges in the US. 
The variables are 200 | > 201 | > * `Private` : Public/private indicator 202 | > * `Apps` : Number of applications received 203 | > * `Accept` : Number of applicants accepted 204 | > * `Enroll` : Number of new students enrolled 205 | > * `Top10perc` : New students from top 10% of high school class 206 | > * `Top25perc` : New students from top 25% of high school class 207 | > * `F.Undergrad` : Number of full-time undergraduates 208 | > * `P.Undergrad` : Number of part-time undergraduates 209 | > * `Outstate` : Out-of-state tuition 210 | > * `Room.Board` : Room and board costs 211 | > * `Books` : Estimated book costs 212 | > * `Personal` : Estimated personal spending 213 | > * `PhD` : Percent of faculty with Ph.D.'s 214 | > * `Terminal` : Percent of faculty with terminal degree 215 | > * `S.F.Ratio` : Student/faculty ratio 216 | > * `perc.alumni` : Percent of alumni who donate 217 | > * `Expend` : Instructional expenditure per student 218 | > * `Grad.Rate` : Graduation rate 219 | > 220 | > Before reading the data into `R`, it can be viewed in Excel or a text 221 | > editor. 222 | > 223 | > a. Use the `read.csv()` function to read the data into `R`. Call the loaded 224 | > data `college`. Make sure that you have the directory set to the correct 225 | > location for the data. 226 | 227 | ```{r} 228 | college <- read.csv("data/College.csv") 229 | ``` 230 | 231 | > b. Look at the data using the `View()` function. You should notice that the 232 | > first column is just the name of each university. We don't really want `R` 233 | > to treat this as data. However, it may be handy to have these names for 234 | > later. Try the following commands: 235 | > 236 | > ```r 237 | > rownames(college) <- college[, 1] 238 | > View(college) 239 | > ``` 240 | > 241 | > You should see that there is now a `row.names` column with the name of 242 | > each university recorded. This means that R has given each row a name 243 | > corresponding to the appropriate university. `R` will not try to perform 244 | > calculations on the row names. However, we still need to eliminate the 245 | > first column in the data where the names are stored. Try 246 | > 247 | > ```r 248 | > college <- college [, -1] 249 | > View(college) 250 | > ``` 251 | > 252 | > Now you should see that the first data column is `Private`. Note that 253 | > another column labeled `row.names` now appears before the `Private` column. 254 | > However, this is not a data column but rather the name that R is giving to 255 | > each row. 256 | 257 | ```{r} 258 | rownames(college) <- college[, 1] 259 | college <- college[, -1] 260 | ``` 261 | 262 | > c. 263 | > i. Use the `summary()` function to produce a numerical summary of the 264 | > variables in the data set. 265 | > ii. Use the `pairs()` function to produce a scatterplot matrix of the 266 | > first ten columns or variables of the data. Recall that you can 267 | > reference the first ten columns of a matrix A using `A[,1:10]`. 268 | > iii. Use the `plot()` function to produce side-by-side boxplots of 269 | > `Outstate` versus `Private`. 270 | > iv. Create a new qualitative variable, called `Elite`, by _binning_ the 271 | > `Top10perc` variable. We are going to divide universities into two 272 | > groups based on whether or not the proportion of students coming from 273 | > the top 10% of their high school classes exceeds 50%. 
274 | > 275 | > ```r 276 | > > Elite <- rep("No", nrow(college)) 277 | > > Elite[college$Top10perc > 50] <- "Yes" 278 | > > Elite <- as.factor(Elite) 279 | > > college <- data.frame(college, Elite) 280 | > ``` 281 | > 282 | > Use the `summary()` function to see how many elite universities there 283 | > are. Now use the `plot()` function to produce side-by-side boxplots of 284 | > `Outstate` versus `Elite`. 285 | > v. Use the `hist()` function to produce some histograms with differing 286 | > numbers of bins for a few of the quantitative variables. You may find 287 | > the command `par(mfrow=c(2,2))` useful: it will divide the print 288 | > window into four regions so that four plots can be made 289 | > simultaneously. Modifying the arguments to this function will divide 290 | > the screen in other ways. 291 | > vi. Continue exploring the data, and provide a brief summary of what you 292 | > discover. 293 | 294 | ```{r} 295 | summary(college) 296 | 297 | college$Private <- college$Private == "Yes" 298 | pairs(college[, 1:10], cex = 0.2) 299 | plot(college$Outstate ~ factor(college$Private), xlab = "Private", ylab = "Outstate") 300 | 301 | college$Elite <- factor(ifelse(college$Top10perc > 50, "Yes", "No")) 302 | summary(college$Elite) 303 | plot(college$Outstate ~ college$Elite, xlab = "Elite", ylab = "Outstate") 304 | 305 | par(mfrow = c(2, 2)) 306 | for (n in c(5, 10, 20, 50)) { 307 | hist(college$Enroll, breaks = n, main = paste("n =", n), xlab = "Enroll") 308 | } 309 | 310 | chisq.test(college$Private, college$Elite) 311 | ``` 312 | 313 | Whether a college is Private and Elite is not random! 314 | 315 | ### Question 9 316 | 317 | > This exercise involves the Auto data set studied in the lab. Make sure 318 | > that the missing values have been removed from the data. 319 | 320 | ```{r} 321 | x <- read.table("data/Auto.data", header = TRUE, na.strings = "?") 322 | x <- na.omit(x) 323 | ``` 324 | 325 | > a. Which of the predictors are quantitative, and which are qualitative? 326 | 327 | ```{r} 328 | sapply(x, class) 329 | numeric <- which(sapply(x, class) == "numeric") 330 | names(numeric) 331 | ``` 332 | 333 | > b. What is the range of each quantitative predictor? You can answer this using 334 | > the `range()` function. 335 | 336 | ```{r} 337 | sapply(x[, numeric], function(x) diff(range(x))) 338 | ``` 339 | 340 | > c. What is the mean and standard deviation of each quantitative predictor? 341 | 342 | ```{r} 343 | library(tidyverse) 344 | library(knitr) 345 | 346 | x[, numeric] |> 347 | pivot_longer(everything()) |> 348 | group_by(name) |> 349 | summarise( 350 | Mean = mean(value), 351 | SD = sd(value) 352 | ) |> 353 | kable() 354 | ``` 355 | 356 | > d. Now remove the 10th through 85th observations. What is the range, mean, and 357 | > standard deviation of each predictor in the subset of the data that 358 | > remains? 359 | 360 | ```{r} 361 | x[-(10:85), numeric] |> 362 | pivot_longer(everything()) |> 363 | group_by(name) |> 364 | summarise( 365 | Range = diff(range(value)), 366 | Mean = mean(value), 367 | SD = sd(value) 368 | ) |> 369 | kable() 370 | ``` 371 | 372 | > e. Using the full data set, investigate the predictors graphically, using 373 | > scatterplots or other tools of your choice. Create some plots highlighting 374 | > the relationships among the predictors. Comment on your findings. 
375 | 376 | ```{r} 377 | pairs(x[, numeric], cex = 0.2) 378 | cor(x[, numeric]) |> 379 | kable() 380 | 381 | heatmap(cor(x[, numeric]), cexRow = 1.1, cexCol = 1.1, margins = c(8, 8)) 382 | ``` 383 | 384 | Many of the variables appear to be highly (positively or negatively) correlated 385 | with some relationships being non-linear. 386 | 387 | > f. Suppose that we wish to predict gas mileage (`mpg`) on the basis of the 388 | > other variables. Do your plots suggest that any of the other variables 389 | > might be useful in predicting `mpg`? Justify your answer. 390 | 391 | Yes, since other variables are correlated. However, horsepower, weight and 392 | displacement are highly related. 393 | 394 | ### Question 10 395 | 396 | > This exercise involves the `Boston` housing data set. 397 | > 398 | > a. To begin, load in the `Boston` data set. The `Boston` data set is part of 399 | > the `ISLR2` library in R. 400 | > ```r 401 | > > library(ISLR2) 402 | > ``` 403 | > Now the data set is contained in the object `Boston`. 404 | > ```r 405 | > > Boston 406 | > ``` 407 | > Read about the data set: 408 | > ```r 409 | > > ?Boston 410 | > ``` 411 | > How many rows are in this data set? How many columns? What do the rows and 412 | > columns represent? 413 | 414 | ```{r} 415 | library(ISLR2) 416 | dim(Boston) 417 | ``` 418 | 419 | > b. Make some pairwise scatterplots of the predictors (columns) in this data 420 | > set. Describe your findings. 421 | 422 | ```{r, message = FALSE, warning = FALSE} 423 | library(ggplot2) 424 | library(tidyverse) 425 | ``` 426 | 427 | ```{r} 428 | ggplot(Boston, aes(nox, rm)) + 429 | geom_point() 430 | ggplot(Boston, aes(ptratio, rm)) + 431 | geom_point() 432 | heatmap(cor(Boston, method = "spearman"), cexRow = 1.1, cexCol = 1.1) 433 | ``` 434 | 435 | > c. Are any of the predictors associated with per capita crime rate? If so, 436 | > explain the relationship. 437 | 438 | Yes 439 | 440 | > d. Do any of the census tracts of Boston appear to have particularly high 441 | > crime rates? Tax rates? Pupil-teacher ratios? Comment on the range of each 442 | > predictor. 443 | 444 | ```{r} 445 | Boston |> 446 | pivot_longer(cols = 1:13) |> 447 | filter(name %in% c("crim", "tax", "ptratio")) |> 448 | ggplot(aes(value)) + 449 | geom_histogram(bins = 20) + 450 | facet_wrap(~name, scales = "free", ncol = 1) 451 | ``` 452 | 453 | Yes, particularly crime and tax rates. 454 | 455 | > e. How many of the census tracts in this data set bound the Charles river? 456 | 457 | ```{r} 458 | sum(Boston$chas) 459 | ``` 460 | 461 | > f. What is the median pupil-teacher ratio among the towns in this data set? 462 | 463 | ```{r} 464 | median(Boston$ptratio) 465 | ``` 466 | 467 | > g. Which census tract of Boston has lowest median value of owner-occupied 468 | > homes? What are the values of the other predictors for that census tract, 469 | > and how do those values compare to the overall ranges for those predictors? 470 | > Comment on your findings. 471 | 472 | ```{r} 473 | Boston[Boston$medv == min(Boston$medv), ] |> 474 | kable() 475 | 476 | sapply(Boston, quantile) |> 477 | kable() 478 | ``` 479 | 480 | > h. In this data set, how many of the census tract average more than seven 481 | > rooms per dwelling? More than eight rooms per dwelling? Comment on the 482 | > census tracts that average more than eight rooms per dwelling. 
483 | 484 | ```{r} 485 | sum(Boston$rm > 7) 486 | sum(Boston$rm > 8) 487 | ``` 488 | 489 | Let's compare median statistics for those census tracts with more than eight 490 | rooms per dwelling on average, with the statistics for those with fewer. 491 | 492 | ```{r} 493 | Boston |> 494 | mutate( 495 | `log(crim)` = log(crim), 496 | `log(zn)` = log(zn) 497 | ) |> 498 | select(-c(crim, zn)) |> 499 | pivot_longer(!rm) |> 500 | mutate(">8 rooms" = rm > 8) |> 501 | ggplot(aes(`>8 rooms`, value)) + 502 | geom_boxplot() + 503 | facet_wrap(~name, scales = "free") 504 | ``` 505 | 506 | Census tracts with big average properties (more than eight rooms per dwelling) 507 | have higher median value (`medv`), a lower proportion of non-retail 508 | business acres (`indus`), a lower pupil-teacher ratio (`ptratio`), a lower 509 | status of the population (`lstat`) among other differences. 510 | -------------------------------------------------------------------------------- /11-survival-analysis-and-censored-data.Rmd: -------------------------------------------------------------------------------- 1 | # Survival Analysis and Censored Data 2 | 3 | ## Conceptual 4 | 5 | ### Question 1 6 | 7 | > For each example, state whether or not the censoring mechanism is independent. 8 | > Justify your answer. 9 | > 10 | > a. In a study of disease relapse, due to a careless research scientist, all 11 | > patients whose phone numbers begin with the number "2" are lost to follow up. 12 | 13 | Independent. There's no reason to think disease relapse should be related to 14 | the first digit of a phone number. 15 | 16 | > b. In a study of longevity, a formatting error causes all patient ages that 17 | > exceed 99 years to be lost (i.e. we know that those patients are more than 99 18 | > years old, but we do not know their exact ages). 19 | 20 | Not independent. Older patients are more likely to see an event that younger. 21 | 22 | > c. Hospital A conducts a study of longevity. However, very sick patients tend 23 | > to be transferred to Hospital B, and are lost to follow up. 24 | 25 | Not independent. Sick patients are more likely to see an event that healthy. 26 | 27 | > d. In a study of unemployment duration, the people who find work earlier are 28 | > less motivated to stay in touch with study investigators, and therefore are 29 | > more likely to be lost to follow up. 30 | 31 | Not independent. More employable individuals are more likely to see an event. 32 | 33 | > e. In a study of pregnancy duration, women who deliver their babies pre-term 34 | > are more likely to do so away from their usual hospital, and thus are more 35 | > likely to be censored, relative to women who deliver full-term babies. 36 | 37 | Not independent. Delivery away from hospital will be associated with pregnancy 38 | duration. 39 | 40 | > f. A researcher wishes to model the number of years of education of the 41 | > residents of a small town. Residents who enroll in college out of town are 42 | > more likely to be lost to follow up, and are also more likely to attend 43 | > graduate school, relative to those who attend college in town. 44 | 45 | Not independent. Years of education will be associated with enrolling in out 46 | of town colleges. 47 | 48 | > g. Researchers conduct a study of disease-free survival (i.e. time until 49 | > disease relapse following treatment). Patients who have not relapsed within 50 | > five years are considered to be cured, and thus their survival time is 51 | > censored at five years. 
52 | 53 | In other words we assume all events happen within five years, so 54 | censoring after this time is equivalent to not censoring at all so 55 | the censoring is independent. 56 | 57 | > h. We wish to model the failure time for some electrical component. This 58 | > component can be manufactured in Iowa or in Pittsburgh, with no difference in 59 | > quality. The Iowa factory opened five years ago, and so components 60 | > manufactured in Iowa are censored at five years. The Pittsburgh factory opened 61 | > two years ago, so those components are censored at two years. 62 | 63 | If there is no difference in quality then location and therefore censoring is 64 | independent of failure time. 65 | 66 | > i. We wish to model the failure time of an electrical component made in two 67 | > different factories, one of which opened before the other. We have reason to 68 | > believe that the components manufactured in the factory that opened earlier 69 | > are of higher quality. 70 | 71 | In this case, the difference in opening times of the two locations will mean 72 | that any difference in quality between locations will be associated with 73 | censoring, so censoring is not independent. 74 | 75 | ### Question 2 76 | 77 | > We conduct a study with $n = 4$ participants who have just purchased cell 78 | > phones, in order to model the time until phone replacement. The first 79 | > participant replaces her phone after 1.2 years. The second participant still 80 | > has not replaced her phone at the end of the two-year study period. The third 81 | > participant changes her phone number and is lost to follow up (but has not yet 82 | > replaced her phone) 1.5 years into the study. The fourth participant replaces 83 | > her phone after 0.2 years. 84 | > 85 | > For each of the four participants ($i = 1,..., 4$), answer the following 86 | > questions using the notation introduced in Section 11.1: 87 | > 88 | > a. Is the participant's cell phone replacement time censored? 89 | 90 | No, Yes, Yes and No. Censoring occurs when we do not know if or when the phone 91 | was replaced. 92 | 93 | > b. Is the value of $c_i$ known, and if so, then what is it? 94 | 95 | $c_i$ is censoring time. For the four participants these are: NA. 2. 1.5 and NA. 96 | 97 | > c. Is the value of $t_i$ known, and if so, then what is it? 98 | 99 | $t_i$ is time to event. For the four participants these are: 1.2, NA, NA and 100 | 0.2. 101 | 102 | > d. Is the value of $y_i$ known, and if so, then what is it? 103 | 104 | $y_i$ is the observed time. For the four participants these are: 1.2, 2, 1.5 and 105 | 0.2. 106 | 107 | > e. Is the value of $\delta_i$ known, and if so, then what is it? 108 | 109 | $\delta_i$ is an indicator for censoring. The nomenclature introduced here 110 | defines this to be 1 if we observe the true "survival" time and 0 if we observe 111 | the censored time. Therefore, for these participants, the values are: 1, 0, 0 112 | and 1. 113 | 114 | ### Question 3 115 | 116 | > For the example in Exercise 2, report the values of $K$, $d_1,...,d_K$, 117 | > $r_1,...,r_K$, and $q_1,...,q_K$, where this notation was defined in Section 118 | > 11.3. 119 | 120 | * $K$ is the number of unique deaths, which is 2. 121 | * $d_k$ represents the unique death times, which are: 0.2, 1.2. 122 | * $r_k$ denotes the number of patients alive and in the study just before $d_k$. 123 | Note the first event is for patient 4, then patient 1, then patient 3 is 124 | censored and finally the study ends with patient 2 still involved. 
Therefore 125 | $r_k$ takes values are: 4, 3. 126 | * $q_k$ denotes the number of patients who died at time $d_k$, therefore this 127 | takes values: 1, 1. 128 | 129 | We can check by using the `survival` package. 130 | 131 | ```{r} 132 | library(survival) 133 | x <- Surv(c(1.2, 2, 1.5, 0.2), event = c(1, 0, 0, 1)) 134 | summary(survfit(x ~ 1)) 135 | ``` 136 | 137 | ### Question 4 138 | 139 | > This problem makes use of the Kaplan-Meier survival curve displayed in Figure 140 | > 11.9. The raw data that went into plotting this survival curve is given in 141 | > Table 11.4. The covariate column of that table is not needed for this problem. 142 | > 143 | > a. What is the estimated probability of survival past 50 days? 144 | 145 | There are 2 events that happen before 50 days. The number at 146 | risk $r_k$ are 5 and 4 (one was censored early on), thus survival probability is 147 | $4/5 * 3/4 = 0.6$. 148 | 149 | Equivalently, we can use the survival package. 150 | 151 | ```{r, message = FALSE, warning = FALSE} 152 | library(tidyverse) 153 | ``` 154 | 155 | ```{r} 156 | table_data <- tribble( 157 | ~Y, ~D, ~X, 158 | 26.5, 1, 0.1, 159 | 37.2, 1, 11, 160 | 57.3, 1, -0.3, 161 | 90.8, 0, 2.8, 162 | 20.2, 0, 1.8, 163 | 89.8, 0, 0.4 164 | ) 165 | x <- Surv(table_data$Y, table_data$D) 166 | summary(survfit(x ~ 1)) 167 | ``` 168 | 169 | > b. Write out an analytical expression for the estimated survival function. For 170 | > instance, your answer might be something along the lines of 171 | > 172 | > $$ 173 | > \hat{S}(t) = \begin{cases} 174 | > 0.8 & \text{if } t < 31\\ 175 | > 0.5 & \text{if } 31 \le t < 77\\ 176 | > 0.22 & \text{if } 77 \le t 177 | > \end{cases} 178 | > $$ 179 | > 180 | > (The previous equation is for illustration only: it is not the correct 181 | > answer!) 182 | 183 | $$ 184 | \hat{S}(t) = \begin{cases} 185 | 1 & \text{if } t < 26.5 \\ 186 | 0.8 & \text{if } 26.5 \le t < 37.2 \\ 187 | 0.6 & \text{if } 37.2 \le t < 57.3 \\ 188 | 0.4 & \text{if } 57.3 \le t 189 | \end{cases} 190 | $$ 191 | 192 | 193 | ### Question 5 194 | 195 | > Sketch the survival function given by the equation 196 | > 197 | > $$ 198 | > \hat{S}(t) = \begin{cases} 199 | > 0.8, & \text{if } t < 31\\ 200 | > 0.5, & \text{if } 31 \le t < 77\\ 201 | > 0.22 & \text{if } 77 \le t 202 | > \end{cases} 203 | > $$ 204 | > 205 | > Your answer should look something like Figure 11.9. 206 | 207 | We can draw this plot, or even engineer data that will generate the required 208 | plot... 209 | 210 | ```{r} 211 | plot(NULL, 212 | xlim = c(0, 100), 213 | ylim = c(0, 1), 214 | ylab = "Estimated Probability of Survival", 215 | xlab = "Time in Days" 216 | ) 217 | lines( 218 | c(0, 31, 31, 77, 77, 100), 219 | c(0.8, 0.8, 0.5, 0.5, 0.22, 0.22) 220 | ) 221 | ``` 222 | 223 | ### Question 6 224 | 225 | > This problem makes use of the data displayed in Figure 11.1. In completing 226 | > this problem, you can refer to the observation times as $y_1,...,y_4$. The 227 | > ordering of these observation times can be seen from Figure 11.1; their exact 228 | > values are not required. 229 | > 230 | > a. Report the values of $\delta_1,...,\delta_4$, $K$, $d_1,...,d_K$, 231 | > $r_1,...,r_K$, and $q_1,...,q_K$. The relevant notation is defined in Sections 232 | > 11.1 and 11.3. 233 | 234 | * $\delta$ values are: 1, 0, 1, 0. 235 | * $K$ is 2 236 | * $d$ values are $y_3$ and $y_1$. 237 | * $r$ values are 4 and 2. 238 | * $q$ values are 1 and 1. 239 | 240 | > b. Sketch the Kaplan-Meier survival curve corresponding to this data set. 
(You 241 | > do not need to use any software to do this---you can sketch it by hand using 242 | > the results obtained in (a).) 243 | 244 | ```{r} 245 | plot(NULL, 246 | xlim = c(0, 350), 247 | ylim = c(0, 1), 248 | ylab = "Estimated Probability of Survival", 249 | xlab = "Time in Days" 250 | ) 251 | lines( 252 | c(0, 150, 150, 300, 300, 350), 253 | c(1, 1, 0.75, 0.75, 0.375, 0.375) 254 | ) 255 | ``` 256 | 257 | Equivalently, we could construct a `Surv` object that reproduces this curve: ```{r} x <- Surv(c(300, 350, 150, 250), c(1, 0, 1, 0)) ``` 258 | 259 | > c. Based on the survival curve estimated in (b), what is the probability that 260 | > the event occurs within 200 days? What is the probability that the event does 261 | > not occur within 310 days? 262 | 263 | 0.25 and 0.375. 264 | 265 | > d. Write out an expression for the estimated survival curve from (b). 266 | 267 | $$ 268 | \hat{S}(t) = \begin{cases} 269 | 1 & \text{if } t < y_3 \\ 270 | 0.75 & \text{if } y_3 \le t < y_1 \\ 271 | 0.375 & \text{if } y_1 \le t 272 | \end{cases} 273 | $$ 274 | 275 | ### Question 7 276 | 277 | > In this problem, we will derive (11.5) and (11.6), which are needed for the 278 | > construction of the log-rank test statistic (11.8). Recall the notation in 279 | > Table 11.1. 280 | > 281 | > a. Assume that there is no difference between the survival functions of the 282 | > two groups. Then we can think of $q_{1k}$ as the number of failures if we draw 283 | > $r_{1k}$ observations, without replacement, from a risk set of $r_k$ 284 | > observations that contains a total of $q_k$ failures. Argue that $q_{1k}$ 285 | > follows a hypergeometric distribution. Write the parameters of this 286 | > distribution in terms of $r_{1k}$, $r_k$, and $q_k$. 287 | 288 | A hypergeometric distribution models sampling without replacement from a finite 289 | pool where each sample is a success or failure. This fits the situation here, 290 | where we have a finite number of samples in the risk set. 291 | 292 | The hypergeometric distribution is parameterized as $k$ successes in $n$ draws, without replacement, from a population of size $N$ with $K$ objects with that feature. 293 | 294 | Mapping to our situation, $q_{1k}$ is $k$, $r_{1k}$ is $n$, $r_k$ is $N$ and $q_k$ is $K$. 295 | 296 | > b. Given your previous answer, and the properties of the hypergeometric 297 | > distribution, what are the mean and variance of $q_{1k}$? Compare your answer 298 | > to (11.5) and (11.6). 299 | 300 | With the above parameterization, the mean ($n K/N$) is $r_{1k} q_k/r_k$. 301 | The variance, $n \frac{K}{N} \frac{N-K}{N} \frac{N-n}{N-1}$, is 302 | 303 | $$ 304 | r_{1k} \frac{q_k}{r_k} \frac{r_k-q_k}{r_k} \frac{r_k - r_{1k}}{r_k - 1} 305 | $$ 306 | 307 | These are equivalent to 11.5 and 11.6. 308 | 309 | ### Question 8 310 | 311 | > Recall that the survival function $S(t)$, the hazard function $h(t)$, and the 312 | > density function $f(t)$ are defined in (11.2), (11.9), and (11.11), 313 | > respectively. Furthermore, define $F(t) = 1 - S(t)$. Show that the following 314 | > relationships hold: 315 | > 316 | > $$ 317 | > f(t) = dF(t)/dt \\ 318 | > S(t) = \exp\left(-\int_0^t h(u)du\right) 319 | > $$ 320 | 321 | If $F(t) = 1 - S(t)$, then $F(t)$ is the *cumulative distribution function* (cdf) 322 | for $t$. 323 | 324 | For a continuous distribution, a cdf, e.g. $F(t)$, can be expressed as an 325 | integral (up to some value $t$) of the *probability density function* (pdf), 326 | i.e. $F(t) = \int_{-\infty}^t f(u) \, du$. Equivalently, the derivative of the cdf 327 | is its pdf: $f(t) = \frac{d F(t)}{dt}$.
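As a quick numerical sanity check of the two identities we are asked to show (a sketch only, using an exponential distribution with an arbitrary rate as the example), we can verify that differentiating the cdf recovers the pdf, and that exponentiating minus the cumulative hazard recovers the survival function:

```r
# Sanity check with an Exp(lambda) example; lambda = 0.5 is arbitrary.
lambda <- 0.5
t <- seq(0.1, 5, by = 0.1)
mid <- (t[-1] + t[-length(t)]) / 2

# f(t) = dF(t)/dt: a numerical derivative of the cdf matches the pdf
all.equal(diff(pexp(t, lambda)) / diff(t), dexp(mid, lambda), tolerance = 1e-3)

# S(t) = exp(-int_0^t h(u) du): the exponential has constant hazard lambda,
# so the cumulative hazard is simply lambda * t
all.equal(exp(-lambda * t), 1 - pexp(t, lambda))
```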
328 | 329 | Then, 330 | $h(t) = \frac{f(t)}{S(t)} = \frac{dF(t)/dt}{S(t)} = \frac{-dS(t)/dt}{S(t)}$. 331 | From basic calculus, this can be rewritten as $h(t) = -\frac{d}{dt}\log{S(t)}$. 332 | Integrating and then exponentiating we get the second identity. 333 | 334 | ### Question 9 335 | 336 | > In this exercise, we will explore the consequences of assuming that the 337 | > survival times follow an exponential distribution. 338 | > 339 | > a. Suppose that a survival time follows an $Exp(\lambda)$ distribution, so 340 | > that its density function is $f(t) = \lambda\exp(-\lambda t)$. Using the 341 | > relationships provided in Exercise 8, show that $S(t) = \exp(-\lambda t)$. 342 | 343 | The cdf of an exponential distribution is $1 - \exp(-\lambda x)$ and 344 | $S(t)$ is $1 - F(t)$ where $F(t)$ is the cdf. 345 | 346 | Hence, $S(t) = \exp(-\lambda t)$. 347 | 348 | > b. Now suppose that each of $n$ independent survival times follows an 349 | > $\exp(\lambda)$ distribution. Write out an expression for the likelihood 350 | > function (11.13). 351 | 352 | The reference to (11.13) gives us the following formula: 353 | 354 | $$ 355 | L = \prod_{i=1}^{n} h(y_i)^{\delta_i} S(y_i) 356 | $$ 357 | 358 | (11.10) also gives us 359 | 360 | $$ 361 | h(t) = \frac{f(t)}{S(t)} 362 | $$ 363 | 364 | Plugging in the expressions from part (a), we get 365 | 366 | \begin{align*} 367 | h(t) &= \frac{\lambda \exp(- \lambda t)}{\exp(- \lambda t)} \\ 368 | &= \lambda 369 | \end{align*} 370 | 371 | Using (11.13), we get the following loss expression: 372 | 373 | $$ 374 | \ell = \prod_i \lambda^{\delta_i} e^{- \lambda y_i} 375 | $$ 376 | 377 | > c. Show that the maximum likelihood estimator for $\lambda$ is 378 | > $$ 379 | > \hat\lambda = \sum_{i=1}^n \delta_i / \sum_{i=1}^n y_i. 380 | > $$ 381 | 382 | Take the log likelihood. 383 | 384 | \begin{align*} 385 | \log \ell &= \sum_i \log \left( \lambda^{\delta_i} e^{- \lambda y_i} \right) \\ 386 | &= \sum_i{\delta_i\log\lambda - \lambda y_i \log e} \\ 387 | &= \sum_i{\delta_i\log\lambda - \lambda y_i} \\ 388 | &= \log\lambda\sum_i{\delta_i} - \lambda\sum_i{y_i} 389 | \end{align*} 390 | 391 | Differentiating this expression with respect to $\lambda$ we get: 392 | 393 | $$ 394 | \frac{d \log \ell}{d \lambda} = \frac{\sum_i{\delta_i}}{\lambda} - \sum_i{y_i} 395 | $$ 396 | 397 | This function maximises when its gradient is 0. Solving for this gives a MLE of 398 | $\hat\lambda = \sum_{i=1}^n \delta_i / \sum_{i=1}^n y_i$. 399 | 400 | > d. Use your answer to (c) to derive an estimator of the mean survival time. 401 | > 402 | > _Hint: For (d), recall that the mean of an $Exp(\lambda)$ random variable is 403 | > $1/\lambda$._ 404 | 405 | Estimated mean survival would be $1/\lambda$ which given the above would be 406 | $\sum_{i=1}^n y_i / \sum_{i=1}^n \delta_i$, which can be thought of as 407 | the total observation time over the total number of deaths. 408 | 409 | ## Applied 410 | 411 | ### Question 10 412 | 413 | > This exercise focuses on the brain tumor data, which is included in the 414 | > `ISLR2` `R` library. 415 | > 416 | > a. Plot the Kaplan-Meier survival curve with ±1 standard error bands, using 417 | > the `survfit()` function in the `survival` package. 418 | 419 | ```{r} 420 | library(ISLR2) 421 | x <- Surv(BrainCancer$time, BrainCancer$status) 422 | plot(survfit(x ~ 1), 423 | xlab = "Months", 424 | ylab = "Estimated Probability of Survival", 425 | col = "steelblue", 426 | conf.int = 0.67 427 | ) 428 | ``` 429 | 430 | > b. 
Draw a bootstrap sample of size $n = 88$ from the pairs ($y_i$, 431 | > $\delta_i$), and compute the resulting Kaplan-Meier survival curve. Repeat 432 | > this process $B = 200$ times. Use the results to obtain an estimate of the 433 | > standard error of the Kaplan-Meier survival curve at each timepoint. Compare 434 | > this to the standard errors obtained in (a). 435 | 436 | ```{r} 437 | plot(survfit(x ~ 1), 438 | xlab = "Months", 439 | ylab = "Estimated Probability of Survival", 440 | col = "steelblue", 441 | conf.int = 0.67 442 | ) 443 | fit <- survfit(x ~ 1) 444 | dat <- tibble(time = c(0, fit$time)) 445 | for (i in 1:200) { 446 | y <- survfit(sample(x, 88, replace = TRUE) ~ 1) 447 | y <- tibble(time = c(0, y$time), "s{i}" := c(1, y$surv)) 448 | dat <- left_join(dat, y, by = "time") 449 | } 450 | res <- fill(dat, starts_with("s")) |> 451 | rowwise() |> 452 | transmute(sd = sd(c_across(starts_with("s")))) 453 | se <- res$sd[2:nrow(res)] 454 | lines(fit$time, fit$surv - se, lty = 2, col = "red") 455 | lines(fit$time, fit$surv + se, lty = 2, col = "red") 456 | ``` 457 | 458 | > c. Fit a Cox proportional hazards model that uses all of the predictors to 459 | > predict survival. Summarize the main findings. 460 | 461 | ```{r} 462 | fit <- coxph(Surv(time, status) ~ sex + diagnosis + loc + ki + gtv + stereo, data = BrainCancer) 463 | fit 464 | ``` 465 | 466 | `diagnosisHG` and `ki` are highly significant. 467 | 468 | > d. Stratify the data by the value of `ki`. (Since only one observation has 469 | > `ki=40`, you can group that observation together with the observations that 470 | > have `ki=60`.) Plot Kaplan-Meier survival curves for each of the five strata, 471 | > adjusted for the other predictors. 472 | 473 | To adjust for other predictors, we fit a model that includes those predictors 474 | and use this model to predict new, artificial, data where we allow `ki` to 475 | take each possible value, but set the other predictors to be the mode or mean 476 | of the other predictors. 477 | 478 | ```{r} 479 | library(ggfortify) 480 | 481 | modaldata <- data.frame( 482 | sex = rep("Female", 5), 483 | diagnosis = rep("Meningioma", 5), 484 | loc = rep("Supratentorial", 5), 485 | ki = c(60, 70, 80, 90, 100), 486 | gtv = rep(mean(BrainCancer$gtv), 5), 487 | stereo = rep("SRT", 5) 488 | ) 489 | survplots <- survfit(fit, newdata = modaldata) 490 | plot(survplots, xlab = "Months", ylab = "Survival Probability", col = 2:6) 491 | legend("bottomleft", c("60", "70", "80", "90", "100"), col = 2:6, lty = 1) 492 | ``` 493 | 494 | ### Question 11 495 | 496 | > This example makes use of the data in Table 11.4. 497 | > 498 | > a. Create two groups of observations. In Group 1, $X < 2$, whereas in Group 2, 499 | > $X \ge 2$. Plot the Kaplan-Meier survival curves corresponding to the two 500 | > groups. Be sure to label the curves so that it is clear which curve 501 | > corresponds to which group. By eye, does there appear to be a difference 502 | > between the two groups' survival curves? 503 | 504 | ```{r} 505 | x <- split(Surv(table_data$Y, table_data$D), table_data$X < 2) 506 | plot(NULL, xlim = c(0, 100), ylim = c(0, 1), ylab = "Survival Probability") 507 | lines(survfit(x[[1]] ~ 1), conf.int = FALSE, col = 2) 508 | lines(survfit(x[[2]] ~ 1), conf.int = FALSE, col = 3) 509 | legend("bottomleft", c(">= 2", "<2"), col = 2:3, lty = 1) 510 | ``` 511 | 512 | There does not appear to be any difference between the curves. 513 | 514 | > b. 
Fit Cox's proportional hazards model, using the group indicator as a 515 | > covariate. What is the estimated coefficient? Write a sentence providing the 516 | > interpretation of this coefficient, in terms of the hazard or the 517 | > instantaneous probability of the event. Is there evidence that the true 518 | > coefficient value is non-zero? 519 | 520 | ```{r} 521 | fit <- coxph(Surv(Y, D) ~ X < 2, data = table_data) 522 | fit 523 | ``` 524 | 525 | The coefficient is $0.3401$. This implies a slightly increased hazard when 526 | $X < 2$ but it is not significantly different to zero (P = 0.8). 527 | 528 | > c. Recall from Section 11.5.2 that in the case of a single binary covariate, 529 | > the log-rank test statistic should be identical to the score statistic for the 530 | > Cox model. Conduct a log-rank test to determine whether there is a difference 531 | > between the survival curves for the two groups. How does the p-value for the 532 | > log-rank test statistic compare to the $p$-value for the score statistic for 533 | > the Cox model from (b)? 534 | 535 | ```{r} 536 | summary(fit)$sctest 537 | survdiff(Surv(Y, D) ~ X < 2, data = table_data)$chisq 538 | ``` 539 | 540 | They are identical. 541 | -------------------------------------------------------------------------------- /05-resampling-methods.Rmd: -------------------------------------------------------------------------------- 1 | # Resampling Methods 2 | 3 | ## Conceptual 4 | 5 | ### Question 1 6 | 7 | > Using basic statistical properties of the variance, as well as single- 8 | > variable calculus, derive (5.6). In other words, prove that $\alpha$ given by 9 | > (5.6) does indeed minimize $Var(\alpha X + (1 - \alpha)Y)$. 10 | 11 | Equation 5.6 is: 12 | 13 | $$ 14 | \alpha = \frac{\sigma^2_Y - \sigma_{XY}}{\sigma^2_X + \sigma^2_Y - 2\sigma_{XY}} 15 | $$ 16 | 17 | Remember that: 18 | 19 | $$ 20 | Var(aX) = a^2Var(X), \\ 21 | \mathrm{Var}(X + Y) = \mathrm{Var}(X) + \mathrm{Var}(Y) + 2\mathrm{Cov}(X,Y), \\ 22 | \mathrm{Cov}(aX, bY) = ab\mathrm{Cov}(X, Y) 23 | $$ 24 | 25 | If we define $\sigma^2_X = \mathrm{Var}(X)$, $\sigma^2_Y = \mathrm{Var}(Y)$ and 26 | $\sigma_{XY} = \mathrm{Cov}(X, Y)$ 27 | 28 | \begin{align} 29 | Var(\alpha X + (1 - \alpha)Y) 30 | &= \alpha^2\sigma^2_X + (1-\alpha)^2\sigma^2_Y + 2\alpha(1 - \alpha)\sigma_{XY} \\ 31 | &= \alpha^2\sigma^2_X + \sigma^2_Y - 2\alpha\sigma^2_Y + \alpha^2\sigma^2_Y + 32 | 2\alpha\sigma_{XY} - 2\alpha^2\sigma_{XY} 33 | \end{align} 34 | 35 | Now we want to find when the rate of change of this function is 0 with respect 36 | to $\alpha$, so we compute the partial derivative, set to 0 and solve. 37 | 38 | $$ 39 | \frac{\partial}{\partial{\alpha}} = 40 | 2\alpha\sigma^2_X - 2\sigma^2_Y + 2\alpha\sigma^2_Y + 2\sigma_{XY} - 4\alpha\sigma_{XY} = 0 41 | $$ 42 | 43 | Moving $\alpha$ terms to the same side: 44 | 45 | $$ 46 | \alpha\sigma^2_X + \alpha\sigma^2_Y - 2\alpha\sigma_{XY} = \sigma^2_Y - \sigma_{XY} 47 | $$ 48 | 49 | $$ 50 | \alpha = \frac{\sigma^2_Y - \sigma_{XY}}{\sigma^2_X + \sigma^2_Y - 2\sigma_{XY}} 51 | $$ 52 | 53 | We should also show that this is a minimum, so that the second partial 54 | derivative wrt $\alpha$ is $>= 0$. 55 | 56 | \begin{align} 57 | \frac{\partial^2}{\partial{\alpha^2}} 58 | &= 2\sigma^2_X + 2\sigma^2_Y - 4\sigma_{XY} \\ 59 | &= 2(\sigma^2_X + \sigma^2_Y - 2\sigma_{XY}) \\ 60 | &= 2\mathrm{Var}(X - Y) 61 | \end{align} 62 | 63 | Since variance is positive, then this must be positive. 
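As a quick numerical check (a sketch with arbitrarily chosen variances and covariance), we can compare the analytical value of $\alpha$ from (5.6) with a brute-force grid search over $\mathrm{Var}(\alpha X + (1 - \alpha)Y)$:

```r
# Simulate correlated X and Y (the parameters here are arbitrary).
set.seed(1)
n <- 1e5
x <- rnorm(n)
y <- 0.5 * x + rnorm(n, sd = 2)

# Analytical alpha from (5.6), using sample variances and covariance.
(alpha <- (var(y) - cov(x, y)) / (var(x) + var(y) - 2 * cov(x, y)))

# Brute-force check: evaluate the variance over a grid of alpha values.
alphas <- seq(0, 1, by = 0.001)
v <- sapply(alphas, function(a) var(a * x + (1 - a) * y))
alphas[which.min(v)]
```

The two values should agree to the resolution of the grid, since the sample variance of $\alpha X + (1 - \alpha)Y$ is exactly the quadratic in $\alpha$ minimized above.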
64 | 65 | ### Question 2 66 | 67 | > We will now derive the probability that a given observation is part of a 68 | > bootstrap sample. Suppose that we obtain a bootstrap sample from a set of n 69 | > observations. 70 | > 71 | > a. What is the probability that the first bootstrap observation is _not_ the 72 | > $j$th observation from the original sample? Justify your answer. 73 | 74 | This is 1 - probability that it is the $j$th = $1 - 1/n$. 75 | 76 | > b. What is the probability that the second bootstrap observation is _not_ the 77 | > $j$th observation from the original sample? 78 | 79 | Since each bootstrap observation is a random sample, this probability is the 80 | same ($1 - 1/n$). 81 | 82 | > c. Argue that the probability that the $j$th observation is _not_ in the 83 | > bootstrap sample is $(1 - 1/n)^n$. 84 | 85 | For the $j$th observation to not be in the sample, it would have to _not_ be 86 | picked for each of $n$ positions, so not picked for $1, 2, ..., n$, thus 87 | the probability is $(1 - 1/n)^n$. 88 | 89 | > d. When $n = 5$, what is the probability that the $j$th observation is in the 90 | > bootstrap sample? 91 | 92 | ```{r} 93 | n <- 5 94 | 1 - (1 - 1 / n)^n 95 | ``` 96 | 97 | $p = 0.67$ 98 | 99 | > e. When $n = 100$, what is the probability that the $j$th observation is in 100 | > the bootstrap sample? 101 | 102 | ```{r} 103 | n <- 100 104 | 1 - (1 - 1 / n)^n 105 | ``` 106 | 107 | $p = 0.64$ 108 | 109 | > f. When $n = 10,000$, what is the probability that the $j$th observation is 110 | > in the bootstrap sample? 111 | 112 | ```{r} 113 | n <- 10000 114 | 1 - (1 - 1 / n)^n 115 | ``` 116 | 117 | $p = 0.63$ 118 | 119 | > g. Create a plot that displays, for each integer value of $n$ from 1 to 120 | > 100,000, the probability that the $j$th observation is in the bootstrap 121 | > sample. Comment on what you observe. 122 | 123 | ```{r} 124 | x <- sapply(1:100000, function(n) 1 - (1 - 1 / n)^n) 125 | plot(x, log = "x", type = "o") 126 | ``` 127 | 128 | The probability rapidly approaches 0.63 with increasing $n$. 129 | 130 | Note that $$e^x = \lim_{n \to \infty} \left(1 + \frac{x}{n}\right)^n,$$ so with $x = -1$, we 131 | can see that $(1 - 1/n)^n \to e^{-1}$, and hence our limit is $1 - e^{-1} = 1 - 1/e \approx 0.632$. 132 | 133 | > h. We will now investigate numerically the probability that a bootstrap 134 | > sample of size $n = 100$ contains the $j$th observation. Here $j = 4$. We 135 | > repeatedly create bootstrap samples, and each time we record whether or not 136 | > the fourth observation is contained in the bootstrap sample. 137 | > 138 | > ```r 139 | > > store <- rep (NA, 10000) 140 | > > for (i in 1:10000) { 141 | > store[i] <- sum(sample(1:100, rep = TRUE) == 4) > 0 142 | > } 143 | > > mean(store) 144 | > ``` 145 | > 146 | > Comment on the results obtained. 147 | 148 | ```{r} 149 | store <- replicate(10000, sum(sample(1:100, replace = TRUE) == 4) > 0) 150 | mean(store) 151 | ``` 152 | 153 | The probability of including $4$ when resampling numbers $1...100$ is close to 154 | $1 - (1 - 1/100)^{100}$. 155 | 156 | ### Question 3 157 | 158 | > 3. We now review $k$-fold cross-validation. 159 | > 160 | > a. Explain how $k$-fold cross-validation is implemented. 161 | 162 | We divide our data into $k$ (approximately equal) subsets, and then generate 163 | predictions for each subset in turn, training on the remaining $k - 1$ subsets combined. 164 | 165 | > b. What are the advantages and disadvantages of $k$-fold cross-validation 166 | > relative to: 167 | > i. The validation set approach? 168 | > ii. LOOCV?
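The trade-offs are discussed below; as a concrete (and purely illustrative) companion, here is a small simulated comparison of the three approaches using `cv.glm()` from the `boot` package on arbitrary simulated data:

```r
# Compare a single validation split, LOOCV and 10-fold CV estimates of the
# test MSE for a quadratic fit (data and model are arbitrary illustrations).
library(boot)
set.seed(1)
sim <- data.frame(x = rnorm(100))
sim$y <- sim$x^2 + rnorm(100)

# Validation set approach: a single 50/50 split
train <- sample(100, 50)
fit <- glm(y ~ poly(x, 2), data = sim, subset = train)
mean((sim$y - predict(fit, sim))[-train]^2)

# LOOCV and 10-fold CV via cv.glm()
fit <- glm(y ~ poly(x, 2), data = sim)
cv.glm(sim, fit)$delta[1]          # LOOCV
cv.glm(sim, fit, K = 10)$delta[1]  # 10-fold CV
```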
169 | 170 | When using a validation set, we can only train on a small portion of the data 171 | as we must reserve the rest for validation. As a result it can overestimate the 172 | test error rate (assuming we then train using the complete data for future 173 | prediction). It is also sensitive to which observations are including in train 174 | vs. test. It is, however, low cost in terms of processing time (as we only 175 | have to fit one model). 176 | 177 | When using LOOCV, we can train on $n-1$ observations, however, the trained 178 | models we generate each differ only by the inclusion (and exclusion) of a 179 | single observation. As a result, LOOCV can have high variance (the models fit 180 | will be similar, and might be quite different to what we would obtain with a 181 | different data set). LOOCV is also costly in terms of processing time. 182 | 183 | ### Question 4 184 | 185 | > Suppose that we use some statistical learning method to make a prediction for 186 | > the response $Y$ for a particular value of the predictor $X$. Carefully 187 | > describe how we might estimate the standard deviation of our prediction. 188 | 189 | We could address this with bootstrapping. Our procedure would be to (jointly) 190 | resample $Y$ and $X$ variables and fit our model many times. For each model we 191 | could obtain a summary of our prediction and calculate the standard deviation 192 | over bootstrapped samples. 193 | 194 | ## Applied 195 | 196 | ### Question 5 197 | 198 | > In Chapter 4, we used logistic regression to predict the probability of 199 | > `default` using `income` and `balance` on the `Default` data set. We will now 200 | > estimate the test error of this logistic regression model using the 201 | > validation set approach. Do not forget to set a random seed before beginning 202 | > your analysis. 203 | > 204 | > a. Fit a logistic regression model that uses `income` and `balance` to predict 205 | > `default`. 206 | 207 | ```{r} 208 | library(ISLR2) 209 | set.seed(42) 210 | fit <- glm(default ~ income + balance, data = Default, family = "binomial") 211 | ``` 212 | 213 | > b. Using the validation set approach, estimate the test error of this model. 214 | > In order to do this, you must perform the following steps: 215 | > i. Split the sample set into a training set and a validation set. 216 | > ii. Fit a multiple logistic regression model using only the training 217 | > observations. 218 | > iii. Obtain a prediction of default status for each individual in 219 | > the validation set by computing the posterior probability of 220 | > default for that individual, and classifying the individual to 221 | > the `default` category if the posterior probability is greater 222 | > than 0.5. 223 | > iv. Compute the validation set error, which is the fraction of 224 | > the observations in the validation set that are misclassified. 225 | 226 | ```{r} 227 | train <- sample(nrow(Default), nrow(Default) / 2) 228 | fit <- glm(default ~ income + balance, data = Default, family = "binomial", subset = train) 229 | pred <- ifelse(predict(fit, newdata = Default[-train, ], type = "response") > 0.5, "Yes", "No") 230 | table(pred, Default$default[-train]) 231 | mean(pred != Default$default[-train]) 232 | ``` 233 | 234 | > c. Repeat the process in (b) three times, using three different splits of the 235 | > observations into a training set and a validation set. Comment on the 236 | > results obtained. 
237 | 238 | ```{r} 239 | replicate(3, { 240 | train <- sample(nrow(Default), nrow(Default) / 2) 241 | fit <- glm(default ~ income + balance, data = Default, family = "binomial", subset = train) 242 | pred <- ifelse(predict(fit, newdata = Default[-train, ], type = "response") > 0.5, "Yes", "No") 243 | mean(pred != Default$default[-train]) 244 | }) 245 | ``` 246 | 247 | The results obtained are variable and depend on the samples allocated to 248 | training vs. test. 249 | 250 | > d. Now consider a logistic regression model that predicts the probability of 251 | > `default` using `income`, `balance`, and a dummy variable for `student`. 252 | > Estimate the test error for this model using the validation set approach. 253 | > Comment on whether or not including a dummy variable for `student` leads to 254 | > a reduction in the test error rate. 255 | 256 | ```{r} 257 | replicate(3, { 258 | train <- sample(nrow(Default), nrow(Default) / 2) 259 | fit <- glm(default ~ income + balance + student, data = Default, family = "binomial", subset = train) 260 | pred <- ifelse(predict(fit, newdata = Default[-train, ], type = "response") > 0.5, "Yes", "No") 261 | mean(pred != Default$default[-train]) 262 | }) 263 | ``` 264 | 265 | Including `student` does not seem to make a substantial improvement to the 266 | test error. 267 | 268 | ### Question 6 269 | 270 | > We continue to consider the use of a logistic regression model to predict the 271 | > probability of `default` using `income` and `balance` on the `Default` data 272 | > set. In particular, we will now compute estimates for the standard errors of 273 | > the `income` and `balance` logistic regression coefficients in two different 274 | > ways: (1) using the bootstrap, and (2) using the standard formula for 275 | > computing the standard errors in the `glm()` function. Do not forget to set a 276 | > random seed before beginning your analysis. 277 | > 278 | > a. Using the `summary()` and `glm()` functions, determine the estimated 279 | > standard errors for the coefficients associated with `income` and 280 | > `balance` in a multiple logistic regression model that uses both 281 | > predictors. 282 | 283 | ```{r} 284 | fit <- glm(default ~ income + balance, data = Default, family = "binomial") 285 | summary(fit) 286 | ``` 287 | 288 | The standard errors estimated by `glm()` (i.e. from the model summary) are $\beta_1$ = 5.0e-6 and 289 | $\beta_2$ = 2.3e-4. 290 | 291 | > b. Write a function, `boot.fn()`, that takes as input the `Default` data set 292 | > as well as an index of the observations, and that outputs the coefficient 293 | > estimates for `income` and `balance` in the multiple logistic regression 294 | > model. 295 | 296 | ```{r} 297 | boot.fn <- function(x, i) { 298 | fit <- glm(default ~ income + balance, data = x[i, ], family = "binomial") 299 | coef(fit)[-1] 300 | } 301 | ``` 302 | 303 | > c. Use the `boot()` function together with your `boot.fn()` function to 304 | > estimate the standard errors of the logistic regression coefficients for 305 | > income and balance. 306 | 307 | ```{r, cache = TRUE} 308 | library(boot) 309 | set.seed(42) 310 | boot(Default, boot.fn, R = 1000) 311 | ``` 312 | 313 | > d. Comment on the estimated standard errors obtained using the `glm()` 314 | > function and using your bootstrap function. 315 | 316 | The standard errors obtained by bootstrapping are similar to those estimated 317 | by `glm`.
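To make the comparison in (d) concrete, we can place the two sets of standard errors side by side. This is just a sketch: it assumes `fit` still holds the model from (a) and re-runs the bootstrap from (c), storing the result this time.

```r
# Standard errors from the glm summary vs the bootstrap replicates.
set.seed(42)
bs <- boot(Default, boot.fn, R = 1000)
cbind(
  glm = summary(fit)$coefficients[c("income", "balance"), "Std. Error"],
  bootstrap = apply(bs$t, 2, sd)
)
```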
318 | 319 | ### Question 7 320 | 321 | > In Sections 5.3.2 and 5.3.3, we saw that the `cv.glm()` function can be used 322 | > in order to compute the LOOCV test error estimate. Alternatively, one could 323 | > compute those quantities using just the `glm()` and `predict.glm()` 324 | > functions, and a for loop. You will now take this approach in order to 325 | > compute the LOOCV error for a simple logistic regression model on the `Weekly` 326 | > data set. Recall that in the context of classification problems, the LOOCV 327 | > error is given in (5.4). 328 | > 329 | > a. Fit a logistic regression model that predicts `Direction` using `Lag1` and 330 | > `Lag2`. 331 | 332 | ```{r} 333 | fit <- glm(Direction ~ Lag1 + Lag2, data = Weekly, family = "binomial") 334 | ``` 335 | 336 | > b. Fit a logistic regression model that predicts `Direction` using `Lag1` and 337 | > `Lag2` _using all but the first observation_. 338 | 339 | ```{r} 340 | fit <- glm(Direction ~ Lag1 + Lag2, data = Weekly[-1, ], family = "binomial") 341 | ``` 342 | 343 | > c. Use the model from (b) to predict the direction of the first 344 | > observation. You can do this by predicting that the first observation will 345 | > go up if $P($`Direction="Up" | Lag1 , Lag2`$) > 0.5$. Was this observation 346 | > correctly classified? 347 | 348 | ```{r} 349 | predict(fit, newdata = Weekly[1, , drop = FALSE], type = "response") > 0.5 350 | ``` 351 | 352 | Yes the observation was correctly classified. 353 | 354 | > d. Write a for loop from $i = 1$ to $i = n$, where $n$ is the number of 355 | > observations in the data set, that performs each of the following steps: 356 | > i. Fit a logistic regression model using all but the $i$th observation 357 | > to predict `Direction` using `Lag1` and `Lag2` . 358 | > ii. Compute the posterior probability of the market moving up 359 | > for the $i$th observation. 360 | > iii. Use the posterior probability for the $i$th observation in order 361 | > to predict whether or not the market moves up. 362 | > iv. Determine whether or not an error was made in predicting the 363 | > direction for the $i$th observation. If an error was made, then 364 | > indicate this as a 1, and otherwise indicate it as a 0. 365 | 366 | ```{r} 367 | error <- numeric(nrow(Weekly)) 368 | for (i in 1:nrow(Weekly)) { 369 | fit <- glm(Direction ~ Lag1 + Lag2, data = Weekly[-i, ], family = "binomial") 370 | p <- predict(fit, newdata = Weekly[i, , drop = FALSE], type = "response") > 0.5 371 | error[i] <- ifelse(p, "Down", "Up") == Weekly$Direction[i] 372 | } 373 | ``` 374 | 375 | > e. Take the average of the $n$ numbers obtained in (d) in order to obtain the 376 | > LOOCV estimate for the test error. Comment on the results. 377 | 378 | ```{r} 379 | mean(error) 380 | ``` 381 | 382 | The LOOCV test error rate is 45% which implies that our predictions are 383 | marginally more often correct than not. 384 | 385 | ### Question 8 386 | 387 | > We will now perform cross-validation on a simulated data set. 388 | > 389 | > a. Generate a simulated data set as follows: 390 | > ```r 391 | > > set.seed(1) 392 | > > x <- rnorm(100) 393 | > > y <- x - 2 *x^2 + rnorm(100) 394 | > ``` 395 | > In this data set, what is $n$ and what is $p$? Write out the model 396 | > used to generate the data in equation form. 397 | 398 | ```{r} 399 | set.seed(1) 400 | x <- rnorm(100) 401 | y <- x - 2 * x^2 + rnorm(100) 402 | ``` 403 | 404 | $n$ is 100 and $p$ is 1 (there are 100 observations and $y$ is predicted with 405 | a single variable $x$). 
The model equation is: $$y = -2x^2 + x + \epsilon$$. 406 | 407 | > b. Create a scatterplot of $X$ against $Y$. Comment on what you find. 408 | 409 | ```{r} 410 | plot(x, y) 411 | ``` 412 | 413 | $y$ has a (negative) quadratic relationship with $x$. 414 | 415 | > c. Set a random seed, and then compute the LOOCV errors that result from 416 | > fitting the following four models using least squares: 417 | > i. $Y = \beta_0 + \beta_1 X + \epsilon$ 418 | > ii. $Y = \beta_0 + \beta_1 X + \beta_2 X^2 + \epsilon$ 419 | > iii. $Y = \beta_0 + \beta_1 X + \beta_2 X^2 + \beta_3 X^3 + \epsilon$ 420 | > iv. $Y = \beta_0 + \beta_1 X + \beta_2 X^2 + \beta_3 X^3 + \beta_4 X^4 + \epsilon$. 421 | > 422 | > Note you may find it helpful to use the `data.frame()` function 423 | > to create a single data set containing both $X$ and $Y$. 424 | 425 | ```{r} 426 | library(boot) 427 | set.seed(42) 428 | dat <- data.frame(x, y) 429 | sapply(1:4, function(i) cv.glm(dat, glm(y ~ poly(x, i)))$delta[1]) 430 | ``` 431 | 432 | > d. Repeat (c) using another random seed, and report your results. 433 | > Are your results the same as what you got in (c)? Why? 434 | 435 | ```{r} 436 | set.seed(43) 437 | dat <- data.frame(x, y) 438 | sapply(1:4, function(i) cv.glm(dat, glm(y ~ poly(x, i)))$delta[1]) 439 | ``` 440 | 441 | The results are the same because we are using LOOCV. When doing this, the model 442 | is fit leaving each one of the observations out in turn, and thus there is no 443 | stochasticity involved. 444 | 445 | > e. Which of the models in (c) had the smallest LOOCV error? Is this what you 446 | > expected? Explain your answer. 447 | 448 | The second model had the smallest LOOCV. This what would be expected since the 449 | model to generate the data was quadratic and we are measuring the test (rather 450 | than training) error rate to evaluate performance. 451 | 452 | > f. Comment on the statistical significance of the coefficient estimates 453 | > that results from fitting each of the models in (c) using least squares. 454 | > Do these results agree with the conclusions drawn based on the 455 | > cross-validation results? 456 | 457 | ```{r} 458 | for (i in 1:4) printCoefmat(coef(summary(glm(y ~ poly(x, i), data = dat)))) 459 | ``` 460 | 461 | We can see that the coefficients in the first model are not highly significant, 462 | but all terms ($\beta_0, \beta_1$ and $\beta_2$) are in the quadratic model. 463 | After this, subsequent $\beta_n$ terms are not significant. Therefore, these 464 | results agree with those from cross-validation. 465 | 466 | ### Question 9 467 | 468 | > We will now consider the `Boston` housing data set, from the `ISLR2` 469 | > library. 470 | > 471 | > a. Based on this data set, provide an estimate for the population mean of 472 | > `medv`. Call this estimate $\hat\mu$. 473 | 474 | ```{r} 475 | (mu <- mean(Boston$medv)) 476 | ``` 477 | 478 | > b. Provide an estimate of the standard error of $\hat\mu$. Interpret this 479 | > result. 480 | > 481 | > _Hint: We can compute the standard error of the sample mean by 482 | > dividing the sample standard deviation by the square root of the number of 483 | > observations._ 484 | 485 | ```{r} 486 | sd(Boston$medv) / sqrt(length(Boston$medv)) 487 | ``` 488 | 489 | > c. Now estimate the standard error of $\hat\mu$ using the bootstrap. How does 490 | > this compare to your answer from (b)? 
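The bootstrap mechanics can also be written out by hand before using the `boot` package (the `boot()`-based estimate follows below) — a minimal sketch:

```r
# A bare-bones bootstrap of the sample mean (illustration only; the answer
# below uses boot() instead).
set.seed(42)
boot_means <- replicate(10000, mean(sample(Boston$medv, replace = TRUE)))
sd(boot_means)
```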
491 | 492 | ```{r} 493 | set.seed(42) 494 | (bs <- boot(Boston$medv, function(v, i) mean(v[i]), 10000)) 495 | ``` 496 | 497 | The standard error using the bootstrap (0.403) is very close to that 498 | obtained from the formula above (0.409). 499 | 500 | > d. Based on your bootstrap estimate from (c), provide a 95% confidence 501 | > interval for the mean of `medv`. Compare it to the results obtained using 502 | > `t.test(Boston$medv)`. 503 | > 504 | > _Hint: You can approximate a 95% confidence interval using the 505 | > formula $[\hat\mu - 2SE(\hat\mu), \hat\mu + 2SE(\hat\mu)].$_ 506 | 507 | ```{r} 508 | se <- sd(bs$t) 509 | c(mu - 2 * se, mu + 2 * se) 510 | ``` 511 | 512 | > e. Based on this data set, provide an estimate, $\hat\mu_{med}$, for the 513 | > median value of `medv` in the population. 514 | 515 | ```{r} 516 | median(Boston$medv) 517 | ``` 518 | 519 | > f. We now would like to estimate the standard error of $\hat\mu_{med}$. 520 | > Unfortunately, there is no simple formula for computing the standard error 521 | > of the median. Instead, estimate the standard error of the median using 522 | > the bootstrap. Comment on your findings. 523 | 524 | ```{r} 525 | set.seed(42) 526 | boot(Boston$medv, function(v, i) median(v[i]), 10000) 527 | ``` 528 | 529 | The estimated standard error of the median is 0.374. This is lower than the 530 | standard error of the mean. 531 | 532 | > g. Based on this data set, provide an estimate for the tenth percentile of 533 | > `medv` in Boston census tracts. Call this quantity $\hat\mu_{0.1}$. (You 534 | > can use the `quantile()` function.) 535 | 536 | ```{r} 537 | quantile(Boston$medv, 0.1) 538 | ``` 539 | 540 | > h. Use the bootstrap to estimate the standard error of $\hat\mu_{0.1}$. 541 | > Comment on your findings. 542 | 543 | ```{r} 544 | set.seed(42) 545 | boot(Boston$medv, function(v, i) quantile(v[i], 0.1), 10000) 546 | ``` 547 | 548 | We get a standard error of ~0.5. This is higher than the standard error of the 549 | median. Nevertheless the standard error is quite small, thus we can be fairly 550 | confidence about the value of the 10th percentile. 551 | -------------------------------------------------------------------------------- /12-unsupervised-learning.Rmd: -------------------------------------------------------------------------------- 1 | # Unsupervised Learning 2 | 3 | ## Conceptual 4 | 5 | ### Question 1 6 | 7 | > This problem involves the $K$-means clustering algorithm. 8 | > 9 | > a. Prove (12.18). 10 | 11 | 12.18 is: 12 | 13 | $$ 14 | \frac{1}{|C_k|}\sum_{i,i' \in C_k} \sum_{j=1}^p (x_{ij} - x_{i'j})^2 = 15 | 2 \sum_{i \in C_k} \sum_{j=1}^p (x_{ij} - \bar{x}_{kj})^2 16 | $$ 17 | 18 | where $$\bar{x}_{kj} = \frac{1}{|C_k|}\sum_{i \in C_k} x_{ij}$$ 19 | 20 | On the left hand side we compute the difference between each observation 21 | (indexed by $i$ and $i'$). In the second we compute the difference between 22 | each observation and the mean. Intuitively this identity is clear (the factor 23 | of 2 is present because we calculate the difference between each pair twice). 24 | However, to prove. 25 | 26 | Note first that, 27 | \begin{align} 28 | (x_{ij} - x_{i'j})^2 29 | = & ((x_{ij} - \bar{x}_{kj}) - (x_{i'j} - \bar{x}_{kj}))^2 \\ 30 | = & (x_{ij} - \bar{x}_{kj})^2 - 31 | 2(x_{ij} - \bar{x}_{kj})(x_{i'j} - \bar{x}_{kj}) + 32 | (x_{i'j} - \bar{x}_{kj})^2 33 | \end{align} 34 | 35 | Note that the first term is independent of $i'$ and the last is independent of 36 | $i$. 
37 | 38 | Therefore, 10.12 can be written as: 39 | 40 | \begin{align} 41 | \frac{1}{|C_k|}\sum_{i,i' \in C_k} \sum_{j=1}^p (x_{ij} - x_{i'j})^2 42 | = & \frac{1}{|C_k|}\sum_{i,i' \in C_k}\sum_{j=1}^p (x_{ij} - \bar{x}_{kj})^2 - 43 | \frac{1}{|C_k|}\sum_{i,i' \in C_k}\sum_{j=1}^p 2(x_{ij} - \bar{x}_{kj})(x_{i'j} - \bar{x}_{kj}) + 44 | \frac{1}{|C_k|}\sum_{i,i' \in C_k}\sum_{j=1}^p (x_{i'j} - \bar{x}_{kj})^2 \\ 45 | = & \frac{|C_k|}{|C_k|}\sum_{i \in C_k}\sum_{j=1}^p (x_{ij} - \bar{x}_{kj})^2 - 46 | \frac{2}{|C_k|}\sum_{i,i' \in C_k}\sum_{j=1}^p (x_{ij} - \bar{x}_{kj})(x_{i'j} - \bar{x}_{kj}) + 47 | \frac{|C_k|}{|C_k|}\sum_{i \in C_k}\sum_{j=1}^p (x_{ij} - \bar{x}_{kj})^2 \\ 48 | = & 2 \sum_{i \in C_k}\sum_{j=1}^p (x_{ij} - \bar{x}_{kj})^2 49 | \end{align} 50 | 51 | Note that we can drop the term containing 52 | $(x_{ij} - \bar{x}_{kj})(x_{i'j} - \bar{x}_{kj})$ since this is 0 when summed 53 | over combinations of $i$ and $i'$ for a given $j$. 54 | 55 | > b. On the basis of this identity, argue that the $K$-means clustering 56 | > algorithm (Algorithm 12.2) decreases the objective (12.17) at each 57 | > iteration. 58 | 59 | Equation 10.12 demonstrates that the euclidean distance between each possible 60 | pair of samples can be related to the difference from each sample to the mean 61 | of the cluster. The K-means algorithm works by minimizing the euclidean distance 62 | to each centroid, thus also minimizes the within-cluster variance. 63 | 64 | ### Question 2 65 | 66 | > Suppose that we have four observations, for which we compute a dissimilarity 67 | > matrix, given by 68 | > 69 | > \begin{bmatrix} 70 | > & 0.3 & 0.4 & 0.7 \\ 71 | > 0.3 & & 0.5 & 0.8 \\ 72 | > 0.4 & 0.5 & & 0.45 \\ 73 | > 0.7 & 0.8 & 0.45 & \\ 74 | > \end{bmatrix} 75 | > 76 | > For instance, the dissimilarity between the first and second observations is 77 | > 0.3, and the dissimilarity between the second and fourth observations is 0.8. 78 | > 79 | > a. On the basis of this dissimilarity matrix, sketch the dendrogram that 80 | > results from hierarchically clustering these four observations using 81 | > complete linkage. Be sure to indicate on the plot the height at which each 82 | > fusion occurs, as well as the observations corresponding to each leaf in 83 | > the dendrogram. 84 | 85 | ```{r} 86 | m <- matrix(c(0, 0.3, 0.4, 0.7, 0.3, 0, 0.5, 0.8, 0.4, 0.5, 0., 0.45, 0.7, 0.8, 0.45, 0), ncol = 4) 87 | c1 <- hclust(as.dist(m), method = "complete") 88 | plot(c1) 89 | ``` 90 | 91 | > b. Repeat (a), this time using single linkage clustering. 92 | 93 | ```{r} 94 | c2 <- hclust(as.dist(m), method = "single") 95 | plot(c2) 96 | ``` 97 | 98 | > c. Suppose that we cut the dendrogram obtained in (a) such that two clusters 99 | > result. Which observations are in each cluster? 100 | 101 | ```{r} 102 | table(1:4, cutree(c1, 2)) 103 | ``` 104 | 105 | > d. Suppose that we cut the dendrogram obtained in (b) such that two clusters 106 | > result. Which observations are in each cluster? 107 | 108 | ```{r} 109 | table(1:4, cutree(c2, 2)) 110 | ``` 111 | 112 | > e. It is mentioned in the chapter that at each fusion in the dendrogram, the 113 | > position of the two clusters being fused can be swapped without changing 114 | > the meaning of the dendrogram. Draw a dendrogram that is equivalent to the 115 | > dendrogram in (a), for which two or more of the leaves are repositioned, 116 | > but for which the meaning of the dendrogram is the same. 
117 | 118 | ```{r} 119 | plot(c1, labels = c(2, 1, 3, 4)) 120 | ``` 121 | 122 | ### Question 3 123 | 124 | > In this problem, you will perform $K$-means clustering manually, with $K = 2$, 125 | > on a small example with $n = 6$ observations and $p = 2$ features. The 126 | > observations are as follows. 127 | > 128 | > | Obs. | $X_1$ | $X_2$ | 129 | > |------|-------|-------| 130 | > | 1 | 1 | 4 | 131 | > | 2 | 1 | 3 | 132 | > | 3 | 0 | 4 | 133 | > | 4 | 5 | 1 | 134 | > | 5 | 6 | 2 | 135 | > | 6 | 4 | 0 | 136 | > 137 | > a. Plot the observations. 138 | 139 | ```{r} 140 | library(ggplot2) 141 | d <- data.frame( 142 | x1 = c(1, 1, 0, 5, 6, 4), 143 | x2 = c(4, 3, 4, 1, 2, 0) 144 | ) 145 | ggplot(d, aes(x = x1, y = x2)) + 146 | geom_point() 147 | ``` 148 | 149 | > b. Randomly assign a cluster label to each observation. You can use the 150 | > `sample()` command in `R` to do this. Report the cluster labels for each 151 | > observation. 152 | 153 | ```{r} 154 | set.seed(42) 155 | d$cluster <- sample(c(1, 2), size = nrow(d), replace = TRUE) 156 | ``` 157 | 158 | > c. Compute the centroid for each cluster. 159 | 160 | ```{r} 161 | centroids <- sapply(c(1, 2), function(i) colMeans(d[d$cluster == i, 1:2])) 162 | ``` 163 | 164 | > d. Assign each observation to the centroid to which it is closest, in terms of 165 | > Euclidean distance. Report the cluster labels for each observation. 166 | 167 | ```{r} 168 | dist <- sapply(1:2, function(i) { 169 | sqrt((d$x1 - centroids[1, i])^2 + (d$x2 - centroids[2, i])^2) 170 | }) 171 | d$cluster <- apply(dist, 1, which.min) 172 | ``` 173 | 174 | > e. Repeat (c) and (d) until the answers obtained stop changing. 175 | 176 | ```{r} 177 | centroids <- sapply(c(1, 2), function(i) colMeans(d[d$cluster == i, 1:2])) 178 | dist <- sapply(1:2, function(i) { 179 | sqrt((d$x1 - centroids[1, i])^2 + (d$x2 - centroids[2, i])^2) 180 | }) 181 | d$cluster <- apply(dist, 1, which.min) 182 | ``` 183 | 184 | In this case, we get stable labels after the first iteration. 185 | 186 | > f. In your plot from (a), color the observations according to the cluster 187 | > labels obtained. 188 | 189 | ```{r} 190 | ggplot(d, aes(x = x1, y = x2, color = factor(cluster))) + 191 | geom_point() 192 | ``` 193 | 194 | ### Question 4 195 | 196 | > Suppose that for a particular data set, we perform hierarchical clustering 197 | > using single linkage and using complete linkage. We obtain two dendrograms. 198 | > 199 | > a. At a certain point on the single linkage dendrogram, the clusters {1, 2, 3} 200 | > and {4, 5} fuse. On the complete linkage dendrogram, the clusters {1, 2, 3} 201 | > and {4, 5} also fuse at a certain point. Which fusion will occur higher on 202 | > the tree, or will they fuse at the same height, or is there not enough 203 | > information to tell? 204 | 205 | The complete linkage fusion will likely be higher in the tree since single 206 | linkage is defined as being the minimum distance between two clusters. However, 207 | there is a chance that they could be at the same height (so technically there 208 | is not enough information to tell). 209 | 210 | > b. At a certain point on the single linkage dendrogram, the clusters {5} and 211 | > {6} fuse. On the complete linkage dendrogram, the clusters {5} and {6} also 212 | > fuse at a certain point. Which fusion will occur higher on the tree, or 213 | > will they fuse at the same height, or is there not enough information to 214 | > tell? 
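As a small numerical illustration (re-using the `c1` and `c2` objects fitted to the toy dissimilarity matrix in Question 2 above), we can check both behaviours for that particular matrix before giving the answer below; this is an illustration, not a general proof.

```r
# Merge heights for complete (c1) and single (c2) linkage on the Question 2
# dissimilarity matrix: for this matrix, no complete-linkage fusion is lower
# than the corresponding single-linkage one, and the first fusion (two
# singletons) occurs at exactly the same height.
rbind(complete = c1$height, single = c2$height)
all(c1$height >= c2$height)
c1$height[1] == c2$height[1]
```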
215 | 216 | They will fuse at the same height (the algorithm for calculating distance is 217 | the same when the clusters are of size 1). 218 | 219 | ### Question 5 220 | 221 | > In words, describe the results that you would expect if you performed 222 | > $K$-means clustering of the eight shoppers in Figure 12.16, on the basis of 223 | > their sock and computer purchases, with $K = 2$. Give three answers, one for 224 | > each of the variable scalings displayed. Explain. 225 | 226 | In cases where variables are scaled we would expect clusters to correspond 227 | to whether or not the retainer sold a computer. In the first case (raw numbers 228 | of items sold), we would expect clusters to represent low vs high numbers of 229 | sock purchases. 230 | 231 | To test, we can run the analysis in R: 232 | 233 | ```{r} 234 | set.seed(42) 235 | dat <- data.frame( 236 | socks = c(8, 11, 7, 6, 5, 6, 7, 8), 237 | computers = c(0, 0, 0, 0, 1, 1, 1, 1) 238 | ) 239 | kmeans(dat, 2)$cluster 240 | kmeans(scale(dat), 2)$cluster 241 | dat$computers <- dat$computers * 2000 242 | kmeans(dat, 2)$cluster 243 | ``` 244 | 245 | ### Question 6 246 | 247 | > We saw in Section 12.2.2 that the principal component loading and score 248 | > vectors provide an approximation to a matrix, in the sense of (12.5). 249 | > Specifically, the principal component score and loading vectors solve the 250 | > optimization problem given in (12.6). 251 | > 252 | > Now, suppose that the M principal component score vectors zim, $m = 1,...,M$, 253 | > are known. Using (12.6), explain that the first $M$ principal component 254 | > loading vectors $\phi_{jm}$, $m = 1,...,M$, can be obtaining by performing $M$ 255 | > separate least squares linear regressions. In each regression, the principal 256 | > component score vectors are the predictors, and one of the features of the 257 | > data matrix is the response. 258 | 259 | ## Applied 260 | 261 | ### Question 7 262 | 263 | > In the chapter, we mentioned the use of correlation-based distance and 264 | > Euclidean distance as dissimilarity measures for hierarchical clustering. 265 | > It turns out that these two measures are almost equivalent: if each 266 | > observation has been centered to have mean zero and standard deviation one, 267 | > and if we let $r_{ij}$ denote the correlation between the $i$th and $j$th 268 | > observations, then the quantity $1 - r_{ij}$ is proportional to the squared 269 | > Euclidean distance between the ith and jth observations. 270 | > 271 | > On the `USArrests` data, show that this proportionality holds. 272 | > 273 | > _Hint: The Euclidean distance can be calculated using the `dist()` function,_ 274 | > _and correlations can be calculated using the `cor()` function._ 275 | 276 | ```{r} 277 | dat <- t(scale(t(USArrests))) 278 | d1 <- dist(dat)^2 279 | d2 <- as.dist(1 - cor(t(dat))) 280 | plot(d1, d2) 281 | ``` 282 | 283 | ### Question 8 284 | 285 | > In Section 12.2.3, a formula for calculating PVE was given in Equation 286 | > 12.10. We also saw that the PVE can be obtained using the `sdev` output of the 287 | > `prcomp()` function. 288 | > 289 | > On the `USArrests` data, calculate PVE in two ways: 290 | > 291 | > a. Using the `sdev` output of the `prcomp()` function, as was done in Section 292 | > 12.2.3. 293 | 294 | ```{r} 295 | pr <- prcomp(USArrests, scale = TRUE) 296 | pr$sdev^2 / sum(pr$sdev^2) 297 | ``` 298 | 299 | > b. By applying Equation 12.10 directly. That is, use the `prcomp()` function to 300 | > compute the principal component loadings. 
Then, use those loadings in 301 | > Equation 12.10 to obtain the PVE. 302 | > 303 | > These two approaches should give the same results. 304 | 305 | ```{r} 306 | colSums(pr$x^2) / sum(colSums(scale(USArrests)^2)) 307 | ``` 308 | 309 | > _Hint: You will only obtain the same results in (a) and (b) if the same_ 310 | > _data is used in both cases. For instance, if in (a) you performed_ 311 | > _`prcomp()` using centered and scaled variables, then you must center and_ 312 | > _scale the variables before applying Equation 12.10 in (b)._ 313 | 314 | ### Question 9 315 | 316 | > Consider the `USArrests` data. We will now perform hierarchical clustering on 317 | > the states. 318 | > 319 | > a. Using hierarchical clustering with complete linkage and Euclidean distance, 320 | > cluster the states. 321 | 322 | ```{r} 323 | set.seed(42) 324 | hc <- hclust(dist(USArrests), method = "complete") 325 | ``` 326 | 327 | > b. Cut the dendrogram at a height that results in three distinct clusters. 328 | > Which states belong to which clusters? 329 | 330 | ```{r} 331 | ct <- cutree(hc, 3) 332 | sapply(1:3, function(i) names(ct)[ct == i]) 333 | ``` 334 | 335 | > c. Hierarchically cluster the states using complete linkage and Euclidean 336 | > distance, _after scaling the variables to have standard deviation one_. 337 | 338 | ```{r} 339 | hc2 <- hclust(dist(scale(USArrests)), method = "complete") 340 | ``` 341 | 342 | > d. What effect does scaling the variables have on the hierarchical clustering 343 | > obtained? In your opinion, should the variables be scaled before the 344 | > inter-observation dissimilarities are computed? Provide a justification for 345 | > your answer. 346 | 347 | ```{r} 348 | ct2 <- cutree(hc2, 3) 349 | sapply(1:3, function(i) names(ct2)[ct2 == i]) 350 | ``` 351 | 352 | Scaling results in different clusters and the choice of whether to scale or 353 | not depends on the data in question. In this case, the variables are: 354 | 355 | - `Murder` (numeric): Murder arrests (per 100,000) 356 | - `Assault` (numeric): Assault arrests (per 100,000) 357 | - `UrbanPop` (numeric): Percent urban population 358 | - `Rape` (numeric): Rape arrests (per 100,000) 359 | 360 | These variables are not naturally on the same unit and the units involved are 361 | somewhat arbitrary (so for example, Murder could be measured per 1 million 362 | rather than per 100,000) so in this case I would argue the data should be 363 | scaled. 364 | 365 | ### Question 10 366 | 367 | > In this problem, you will generate simulated data, and then perform PCA and 368 | > $K$-means clustering on the data. 369 | > 370 | > a. Generate a simulated data set with 20 observations in each of three classes 371 | > (i.e. 60 observations total), and 50 variables. 372 | > 373 | > _Hint: There are a number of functions in `R` that you can use to generate_ 374 | > _data. One example is the `rnorm()` function; `runif()` is another option._ 375 | > _Be sure to add a mean shift to the observations in each class so that_ 376 | > _there are three distinct classes._ 377 | 378 | ```{r} 379 | set.seed(42) 380 | data <- matrix(rnorm(60 * 50), ncol = 50) 381 | classes <- rep(c("A", "B", "C"), each = 20) 382 | dimnames(data) <- list(classes, paste0("v", 1:50)) 383 | data[classes == "B", 1:10] <- data[classes == "B", 1:10] + 1.2 384 | data[classes == "C", 5:30] <- data[classes == "C", 5:30] + 1 385 | ``` 386 | 387 | > b. Perform PCA on the 60 observations and plot the first two principal 388 | > component score vectors.
Use a different color to indicate the 389 | > observations in each of the three classes. If the three classes appear 390 | > separated in this plot, then continue on to part (c). If not, then return 391 | > to part (a) and modify the simulation so that there is greater separation 392 | > between the three classes. Do not continue to part (c) until the three 393 | > classes show at least some separation in the first two principal component 394 | > score vectors. 395 | 396 | ```{r} 397 | pca <- prcomp(data) 398 | ggplot( 399 | data.frame(Class = classes, PC1 = pca$x[, 1], PC2 = pca$x[, 2]), 400 | aes(x = PC1, y = PC2, col = Class) 401 | ) + 402 | geom_point() 403 | ``` 404 | 405 | > c. Perform $K$-means clustering of the observations with $K = 3$. How well do 406 | > the clusters that you obtained in $K$-means clustering compare to the true 407 | > class labels? 408 | > 409 | > _Hint: You can use the `table()` function in `R` to compare the true class_ 410 | > _labels to the class labels obtained by clustering. Be careful how you_ 411 | > _interpret the results: $K$-means clustering will arbitrarily number the_ 412 | > _clusters, so you cannot simply check whether the true class labels and_ 413 | > _clustering labels are the same._ 414 | 415 | ```{r} 416 | km <- kmeans(data, 3)$cluster 417 | table(km, names(km)) 418 | ``` 419 | 420 | $K$-means separates out the clusters nearly perfectly. 421 | 422 | > d. Perform $K$-means clustering with $K = 2$. Describe your results. 423 | 424 | ```{r} 425 | km <- kmeans(data, 2)$cluster 426 | table(km, names(km)) 427 | ``` 428 | 429 | $K$-means effectively defines cluster 2 to be class B, but cluster 1 is a mix 430 | of classes A and B. 431 | 432 | > e. Now perform $K$-means clustering with $K = 4$, and describe your results. 433 | 434 | ```{r} 435 | km <- kmeans(data, 4)$cluster 436 | table(km, names(km)) 437 | ``` 438 | 439 | $K$-means effectively defines cluster 1 to be class B, cluster 2 to be class A 440 | but clusters 3 and 4 are split over class C. 441 | 442 | > f. Now perform $K$-means clustering with $K = 3$ on the first two principal 443 | > component score vectors, rather than on the raw data. That is, perform 444 | > $K$-means clustering on the $60 \times 2$ matrix of which the first column 445 | > is the first principal component score vector, and the second column is 446 | > the second principal component score vector. Comment on the results. 447 | 448 | ```{r} 449 | km <- kmeans(pca$x[, 1:2], 3)$cluster 450 | table(km, names(km)) 451 | ``` 452 | 453 | $K$-means again separates out the clusters nearly perfectly. 454 | 455 | > g. Using the `scale()` function, perform $K$-means clustering with $K = 3$ on 456 | > the data _after scaling each variable to have standard deviation one_. How 457 | > do these results compare to those obtained in (b)? Explain. 458 | 459 | ```{r} 460 | km <- kmeans(scale(data), 3)$cluster 461 | table(km, names(km)) 462 | ``` 463 | 464 | $K$-means appears to perform less well on the scaled data in this case. 465 | 466 | ### Question 11 467 | 468 | > Write an `R` function to perform matrix completion as in Algorithm 12.1, and 469 | > as outlined in Section 12.5.2. In each iteration, the function should keep 470 | > track of the relative error, as well as the iteration count. Iterations should 471 | > continue until the relative error is small enough or until some maximum number 472 | > of iterations is reached (set a default value for this maximum number). 
473 | > Furthermore, there should be an option to print out the progress in each 474 | > iteration. 475 | > 476 | > Test your function on the `Boston` data. First, standardize the features to 477 | > have mean zero and standard deviation one using the `scale()` function. Run an 478 | > experiment where you randomly leave out an increasing (and nested) number of 479 | > observations from 5% to 30%, in steps of 5%. Apply Algorithm 12.1 with $M = 480 | > 1,2,...,8$. Display the approximation error as a function of the fraction of 481 | > observations that are missing, and the value of $M$, averaged over 10 482 | > repetitions of the experiment. 483 | 484 | ### Question 12 485 | 486 | > In Section 12.5.2, Algorithm 12.1 was implemented using the `svd()` function. 487 | > However, given the connection between the `svd()` function and the `prcomp()` 488 | > function highlighted in the lab, we could have instead implemented the 489 | > algorithm using `prcomp()`. 490 | > 491 | > Write a function to implement Algorithm 12.1 that makes use of `prcomp()` 492 | > rather than `svd()`. 493 | 494 | ### Question 13 495 | 496 | > On the book website, `www.StatLearning.com`, there is a gene expression data 497 | > set (`Ch12Ex13.csv`) that consists of 40 tissue samples with measurements on 498 | > 1,000 genes. The first 20 samples are from healthy patients, while the 499 | > second 20 are from a diseased group. 500 | > 501 | > a. Load in the data using `read.csv()`. You will need to select `header = F`. 502 | 503 | ```{r} 504 | data <- read.csv("data/Ch12Ex13.csv", header = FALSE) 505 | colnames(data) <- c(paste0("H", 1:20), paste0("D", 1:20)) 506 | ``` 507 | 508 | > b. Apply hierarchical clustering to the samples using correlation-based 509 | > distance, and plot the dendrogram. Do the genes separate the samples into 510 | > the two groups? Do your results depend on the type of linkage used? 511 | 512 | ```{r} 513 | hc.complete <- hclust(as.dist(1 - cor(data)), method = "complete") 514 | plot(hc.complete) 515 | 516 | hc.complete <- hclust(as.dist(1 - cor(data)), method = "average") 517 | plot(hc.complete) 518 | 519 | hc.complete <- hclust(as.dist(1 - cor(data)), method = "single") 520 | plot(hc.complete) 521 | ``` 522 | 523 | Yes the samples clearly separate into the two groups, although the results 524 | depend somewhat on the linkage method used. In the case of average clustering, 525 | the disease samples all fall within a subset of the healthy samples. 526 | 527 | > c. Your collaborator wants to know which genes differ the most across the two 528 | > groups. Suggest a way to answer this question, and apply it here. 529 | 530 | This is probably best achieved with a supervised approach. A simple method 531 | would be to determine which genes show the most significant differences between 532 | the groups by applying a t-test to each group. We can then select those with a 533 | FDR adjusted p-value less than some given threshold (e.g. 0.05). 534 | 535 | ```{r} 536 | class <- factor(rep(c("Healthy", "Diseased"), each = 20)) 537 | pvals <- p.adjust(apply(data, 1, function(v) t.test(v ~ class)$p.value)) 538 | which(pvals < 0.05) 539 | ``` 540 | -------------------------------------------------------------------------------- /09-support-vector-mechines.Rmd: -------------------------------------------------------------------------------- 1 | # Support Vector Machines 2 | 3 | ## Conceptual 4 | 5 | ### Question 1 6 | 7 | > This problem involves hyperplanes in two dimensions. 8 | > 9 | > a. 
Sketch the hyperplane $1 + 3X_1 - X_2 = 0$. Indicate the set of points for 10 | > which $1 + 3X_1 - X_2 > 0$, as well as the set of points for which 11 | > $1 + 3X_1 - X_2 < 0$. 12 | 13 | ```{r} 14 | library(ggplot2) 15 | xlim <- c(-10, 10) 16 | ylim <- c(-30, 30) 17 | points <- expand.grid( 18 | X1 = seq(xlim[1], xlim[2], length.out = 50), 19 | X2 = seq(ylim[1], ylim[2], length.out = 50) 20 | ) 21 | p <- ggplot(points, aes(x = X1, y = X2)) + 22 | geom_abline(intercept = 1, slope = 3) + # X2 = 1 + 3X1 23 | theme_bw() 24 | p + geom_point(aes(color = 1 + 3 * X1 - X2 > 0), size = 0.1) + 25 | scale_color_discrete(name = "1 + 3X1 - X2 > 0") 26 | ``` 27 | 28 | > b. On the same plot, sketch the hyperplane $-2 + X_1 + 2X_2 = 0$. Indicate the 29 | > set of points for which $-2 + X_1 + 2X_2 > 0$, as well as the set of points 30 | > for which $-2 + X_1 + 2X_2 < 0$. 31 | 32 | ```{r} 33 | p + geom_abline(intercept = 1, slope = -1 / 2) + # X2 = 1 - X1/2 34 | geom_point( 35 | aes(color = interaction(1 + 3 * X1 - X2 > 0, -2 + X1 + 2 * X2 > 0)), 36 | size = 0.5 37 | ) + 38 | scale_color_discrete(name = "(1 + 3X1 - X2 > 0).(-2 + X1 + 2X2 > 0)") 39 | ``` 40 | 41 | ### Question 2 42 | 43 | > We have seen that in $p = 2$ dimensions, a linear decision boundary takes the 44 | > form $\beta_0 + \beta_1X_1 + \beta_2X_2 = 0$. We now investigate a non-linear 45 | > decision boundary. 46 | > 47 | > a. Sketch the curve $$(1+X_1)^2 +(2-X_2)^2 = 4$$. 48 | 49 | ```{r} 50 | points <- expand.grid( 51 | X1 = seq(-4, 2, length.out = 100), 52 | X2 = seq(-1, 5, length.out = 100) 53 | ) 54 | p <- ggplot(points, aes(x = X1, y = X2, z = (1 + X1)^2 + (2 - X2)^2 - 4)) + 55 | geom_contour(breaks = 0, colour = "black") + 56 | theme_bw() 57 | p 58 | ``` 59 | 60 | > b. On your sketch, indicate the set of points for which 61 | > $$(1 + X_1)^2 + (2 - X_2)^2 > 4,$$ as well as the set of points for which 62 | > $$(1 + X_1)^2 + (2 - X_2)^2 \leq 4.$$ 63 | 64 | ```{r} 65 | p + geom_point(aes(color = (1 + X1)^2 + (2 - X2)^2 - 4 > 0), size = 0.1) 66 | ``` 67 | 68 | > c. Suppose that a classifier assigns an observation to the blue class if $$(1 69 | > + X_1)^2 + (2 - X_2)^2 > 4,$$ and to the red class otherwise. To what class 70 | > is the observation $(0, 0)$ classified? $(-1, 1)$? $(2, 2)$? $(3, 8)$? 71 | 72 | ```{r} 73 | points <- data.frame( 74 | X1 = c(0, -1, 2, 3), 75 | X2 = c(0, 1, 2, 8) 76 | ) 77 | ifelse((1 + points$X1)^2 + (2 - points$X2)^2 > 4, "blue", "red") 78 | ``` 79 | 80 | > d. Argue that while the decision boundary in (c) is not linear in terms of 81 | > $X_1$ and $X_2$, it is linear in terms of $X_1$, $X_1^2$, $X_2$, and 82 | > $X_2^2$. 83 | 84 | The decision boundary is $$(1 + X_1)^2 + (2 - X_2)^2 -4 = 0$$ which we can expand 85 | to: 86 | $$1 + 2X_1 + X_1^2 + 4 - 4X_2 + X_2^2 - 4 = 0$$ 87 | which is linear in terms of $X_1$, $X_1^2$, $X_2$, $X_2^2$. 88 | 89 | ### Question 3 90 | 91 | > Here we explore the maximal margin classifier on a toy data set. 92 | > 93 | > a. We are given $n = 7$ observations in $p = 2$ dimensions. For each 94 | > observation, there is an associated class label. 95 | > 96 | > | Obs. | $X_1$ | $X_2$ | $Y$ | 97 | > |------|-------|-------|------| 98 | > | 1 | 3 | 4 | Red | 99 | > | 2 | 2 | 2 | Red | 100 | > | 3 | 4 | 4 | Red | 101 | > | 4 | 1 | 4 | Red | 102 | > | 5 | 2 | 1 | Blue | 103 | > | 6 | 4 | 3 | Blue | 104 | > | 7 | 4 | 1 | Blue | 105 | > 106 | > Sketch the observations. 
107 | 108 | ```{r} 109 | data <- data.frame( 110 | X1 = c(3, 2, 4, 1, 2, 4, 4), 111 | X2 = c(4, 2, 4, 4, 1, 3, 1), 112 | Y = c(rep("Red", 4), rep("Blue", 3)) 113 | ) 114 | p <- ggplot(data, aes(x = X1, y = X2, color = Y)) + 115 | geom_point(size = 2) + 116 | scale_colour_identity() + 117 | coord_cartesian(xlim = c(0.5, 4.5), ylim = c(0.5, 4.5)) 118 | p 119 | ``` 120 | 121 | > b. Sketch the optimal separating hyperplane, and provide the equation for this 122 | > hyperplane (of the form (9.1)). 123 | 124 | ```{r} 125 | library(e1071) 126 | 127 | fit <- svm(as.factor(Y) ~ ., data = data, kernel = "linear", cost = 10, scale = FALSE) 128 | 129 | # Extract beta_0, beta_1, beta_2 130 | beta <- c( 131 | -fit$rho, 132 | drop(t(fit$coefs) %*% as.matrix(data[fit$index, 1:2])) 133 | ) 134 | names(beta) <- c("B0", "B1", "B2") 135 | p <- p + geom_abline(intercept = -beta[1] / beta[3], slope = -beta[2] / beta[3], lty = 2) 136 | p 137 | ``` 138 | 139 | > c. Describe the classification rule for the maximal margin classifier. It 140 | > should be something along the lines of "Classify to Red if $\beta_0 + 141 | > \beta_1X_1 + \beta_2X_2 > 0$, and classify to Blue otherwise." Provide the 142 | > values for $\beta_0, \beta_1,$ and $\beta_2$. 143 | 144 | Classify to red if $\beta_0 + \beta_1X_1 + \beta_2X_2 > 0$ and blue otherwise 145 | where $\beta_0 = `r round(beta[1])`$, $\beta_1 = `r round(beta[2])`$, 146 | $\beta_2 = `r round(beta[3])`$. 147 | 148 | > d. On your sketch, indicate the margin for the maximal margin hyperplane. 149 | 150 | ```{r} 151 | p <- p + geom_ribbon( 152 | aes(x = x, ymin = ymin, ymax = ymax), 153 | data = data.frame(x = c(0, 5), ymin = c(-1, 4), ymax = c(0, 5)), 154 | alpha = 0.1, fill = "blue", 155 | inherit.aes = FALSE 156 | ) 157 | p 158 | ``` 159 | 160 | > e. Indicate the support vectors for the maximal margin classifier. 161 | 162 | ```{r} 163 | p <- p + geom_point(data = data[fit$index, ], size = 4) 164 | p 165 | ``` 166 | 167 | The support vectors (from the svm fit object) are shown above. Arguably, 168 | there's another support vector, since four points exactly touch the margin. 169 | 170 | > f. Argue that a slight movement of the seventh observation would not affect 171 | > the maximal margin hyperplane. 172 | 173 | ```{r} 174 | p + geom_point(data = data[7, , drop = FALSE], size = 4, color = "purple") 175 | ``` 176 | 177 | The 7th point is shown in purple above. It is not a support vector, and not 178 | close to the margin, so small changes in its X1, X2 values would not affect the 179 | current calculated margin. 180 | 181 | > g. Sketch a hyperplane that is _not_ the optimal separating hyperplane, and 182 | > provide the equation for this hyperplane. 183 | 184 | A non-optimal hyperline that still separates the blue and red points would 185 | be one that touches the (red) point at X1 = 2, X2 = 2 and the (blue) point at 186 | X1 = 4, X2 = 3. This gives line $y = x/2 + 1$ or, when $\beta_0 = -1$, 187 | $\beta_1 = -1/2$, $\beta_2 = 1$. 188 | 189 | ```{r} 190 | p + geom_abline(intercept = 1, slope = 0.5, lty = 2, col = "red") 191 | ``` 192 | 193 | > h. Draw an additional observation on the plot so that the two classes are no 194 | > longer separable by a hyperplane. 
195 | 196 | ```{r} 197 | p + geom_point(data = data.frame(X1 = 1, X2 = 3, Y = "Blue"), shape = 15, size = 4) 198 | ``` 199 | 200 | ## Applied 201 | 202 | ### Question 4 203 | 204 | > Generate a simulated two-class data set with 100 observations and two features 205 | > in which there is a visible but non-linear separation between the two classes. 206 | > Show that in this setting, a support vector machine with a polynomial kernel 207 | > (with degree greater than 1) or a radial kernel will outperform a support 208 | > vector classifier on the training data. Which technique performs best on the 209 | > test data? Make plots and report training and test error rates in order to 210 | > back up your assertions. 211 | 212 | ```{r} 213 | set.seed(10) 214 | data <- data.frame( 215 | x = runif(100), 216 | y = runif(100) 217 | ) 218 | score <- (2 * data$x - 0.5)^2 + (data$y)^2 - 0.5 219 | data$class <- factor(ifelse(score > 0, "red", "blue")) 220 | 221 | p <- ggplot(data, aes(x = x, y = y, color = class)) + 222 | geom_point(size = 2) + 223 | scale_colour_identity() 224 | p 225 | 226 | train <- 1:50 227 | test <- 51:100 228 | 229 | fits <- list( 230 | "Radial" = svm(class ~ ., data = data[train, ], kernel = "radial"), 231 | "Polynomial" = svm(class ~ ., data = data[train, ], kernel = "polynomial", degree = 2), 232 | "Linear" = svm(class ~ ., data = data[train, ], kernel = "linear") 233 | ) 234 | 235 | err <- function(model, data) { 236 | out <- table(predict(model, data), data$class) 237 | (out[1, 2] + out[2, 1]) / sum(out) 238 | } 239 | plot(fits[[1]], data) 240 | plot(fits[[2]], data) 241 | plot(fits[[3]], data) 242 | sapply(fits, err, data = data[train, ]) 243 | sapply(fits, err, data = data[test, ]) 244 | ``` 245 | 246 | In this case, the radial kernel performs best, followed by a linear kernel with 247 | the 2nd degree polynomial performing worst. The ordering of these models is the 248 | same for the training and test data sets. 249 | 250 | ### Question 5 251 | 252 | > We have seen that we can fit an SVM with a non-linear kernel in order to 253 | > perform classification using a non-linear decision boundary. We will now see 254 | > that we can also obtain a non-linear decision boundary by performing logistic 255 | > regression using non-linear transformations of the features. 256 | > 257 | > a. Generate a data set with $n = 500$ and $p = 2$, such that the observations 258 | > belong to two classes with a quadratic decision boundary between them. For 259 | > instance, you can do this as follows: 260 | > 261 | > ```r 262 | > > x1 <- runif(500) - 0.5 263 | > > x2 <- runif(500) - 0.5 264 | > > y <- 1 * (x1^2 - x2^2 > 0) 265 | > ``` 266 | 267 | ```{r} 268 | set.seed(42) 269 | train <- data.frame( 270 | x1 = runif(500) - 0.5, 271 | x2 = runif(500) - 0.5 272 | ) 273 | train$y <- factor(as.numeric((train$x1^2 - train$x2^2 > 0))) 274 | ``` 275 | 276 | > b. Plot the observations, colored according to their class labels. Your plot 277 | > should display $X_1$ on the $x$-axis, and $X_2$ on the $y$-axis. 278 | 279 | ```{r} 280 | p <- ggplot(train, aes(x = x1, y = x2, color = y)) + 281 | geom_point(size = 2) 282 | p 283 | ``` 284 | 285 | > c. Fit a logistic regression model to the data, using $X_1$ and $X_2$ as 286 | > predictors. 287 | 288 | ```{r} 289 | fit1 <- glm(y ~ ., data = train, family = "binomial") 290 | ``` 291 | 292 | > d. Apply this model to the _training data_ in order to obtain a predicted class 293 | > label for each training observation. 
Plot the observations, colored 294 | > according to the _predicted_ class labels. The decision boundary should be 295 | > linear. 296 | 297 | ```{r} 298 | plot_model <- function(fit) { 299 | if (inherits(fit, "svm")) { 300 | train$p <- predict(fit) 301 | } else { 302 | train$p <- factor(as.numeric(predict(fit) > 0)) 303 | } 304 | ggplot(train, aes(x = x1, y = x2, color = p)) + 305 | geom_point(size = 2) 306 | } 307 | 308 | plot_model(fit1) 309 | ``` 310 | 311 | > e. Now fit a logistic regression model to the data using non-linear functions 312 | > of $X_1$ and $X_2$ as predictors (e.g. $X_1^2, X_1 \times X_2, \log(X_2),$ 313 | > and so forth). 314 | 315 | ```{r} 316 | fit2 <- glm(y ~ poly(x1, 2) + poly(x2, 2), data = train, family = "binomial") 317 | ``` 318 | 319 | > f. Apply this model to the _training data_ in order to obtain a predicted 320 | > class label for each training observation. Plot the observations, colored 321 | > according to the _predicted_ class labels. The decision boundary should be 322 | > obviously non-linear. If it is not, then repeat (a)-(e) until you come up 323 | > with an example in which the predicted class labels are obviously 324 | > non-linear. 325 | 326 | ```{r} 327 | plot_model(fit2) 328 | ``` 329 | 330 | > g. Fit a support vector classifier to the data with $X_1$ and $X_2$ as 331 | > predictors. Obtain a class prediction for each training observation. Plot 332 | > the observations, colored according to the _predicted class labels_. 333 | 334 | ```{r} 335 | fit3 <- svm(y ~ x1 + x2, data = train, kernel = "linear") 336 | plot_model(fit3) 337 | ``` 338 | 339 | > h. Fit a SVM using a non-linear kernel to the data. Obtain a class prediction 340 | > for each training observation. Plot the observations, colored according to 341 | > the _predicted class labels_. 342 | 343 | ```{r} 344 | fit4 <- svm(y ~ x1 + x2, data = train, kernel = "polynomial", degree = 2) 345 | plot_model(fit4) 346 | ``` 347 | 348 | > i. Comment on your results. 349 | 350 | When simulating data with a quadratic decision boundary, a logistic model with 351 | quadratic transformations of the variables and an svm model with a quadratic 352 | kernel both produce much better (and similar fits) than standard linear methods. 353 | 354 | ### Question 6 355 | 356 | > At the end of Section 9.6.1, it is claimed that in the case of data that is 357 | > just barely linearly separable, a support vector classifier with a small 358 | > value of `cost` that misclassifies a couple of training observations may 359 | > perform better on test data than one with a huge value of `cost` that does not 360 | > misclassify any training observations. You will now investigate this claim. 361 | > 362 | > a. Generate two-class data with $p = 2$ in such a way that the classes are 363 | > just barely linearly separable. 
364 |
365 | ```{r}
366 | set.seed(2)
367 |
368 | # Simulate data that is separable by a line at y = 2.5
369 | data <- data.frame(
370 | x = rnorm(200),
371 | class = sample(c("red", "blue"), 200, replace = TRUE)
372 | )
373 | data$y <- (data$class == "red") * 5 + rnorm(200)
374 |
375 | # Add barely separable points (these are simulated "noise" values)
376 | newdata <- data.frame(x = rnorm(30))
377 | newdata$y <- 1.5 * newdata$x + 3 + rnorm(30, 0, 1)
378 | newdata$class <- ifelse((1.5 * newdata$x + 3) - newdata$y > 0, "blue", "red")
379 |
380 | data <- rbind(data, newdata)
381 |
382 | # Remove any points that cause misclassification, leaving data that is barely
383 | # linearly separable, but along an axis that is not y = 2.5 (which would be
384 | # correct for the "true" data).
385 | data <- data[!(data$class == "red") == ((1.5 * data$x + 3 - data$y) > 0), ]
386 | data <- data[sample(seq_len(nrow(data)), 200), ]
387 |
388 | p <- ggplot(data, aes(x = x, y = y, color = class)) +
389 | geom_point(size = 2) +
390 | scale_colour_identity() +
391 | geom_abline(intercept = 3, slope = 1.5, lty = 2)
392 | p
393 | ```
394 |
395 | > b. Compute the cross-validation error rates for support vector classifiers
396 | > with a range of `cost` values. How many training errors are misclassified
397 | > for each value of `cost` considered, and how does this relate to the
398 | > cross-validation errors obtained?
399 |
400 | First, the number of training observations misclassified at each value of `cost`:
401 |
402 | ```{r}
403 | costs <- 10^seq(-3, 5)
404 |
405 | sapply(costs, function(cost) {
406 | fit <- svm(as.factor(class) ~ ., data = data, kernel = "linear", cost = cost)
407 | pred <- predict(fit, data)
408 | sum(pred != data$class)
409 | })
410 | ```
411 |
412 | The cross-validation errors:
413 |
414 | ```{r}
415 | out <- tune(svm, as.factor(class) ~ ., data = data, kernel = "linear", ranges = list(cost = costs))
416 | summary(out)
417 | data.frame(
418 | cost = out$performances$cost,
419 | misclass = out$performances$error * nrow(data)
420 | )
421 | ```
422 |
423 | > c. Generate an appropriate test data set, and compute the test errors
424 | > corresponding to each of the values of `cost` considered. Which value of
425 | > `cost` leads to the fewest test errors, and how does this compare to the
426 | > values of `cost` that yield the fewest training errors and the fewest
427 | > cross-validation errors?
428 |
429 | ```{r}
430 | set.seed(2)
431 | test <- data.frame(
432 | x = rnorm(200),
433 | class = sample(c("red", "blue"), 200, replace = TRUE)
434 | )
435 | test$y <- (test$class == "red") * 5 + rnorm(200)
436 | p + geom_point(data = test, pch = 21)
437 |
438 | (errs <- sapply(costs, function(cost) {
439 | fit <- svm(as.factor(class) ~ ., data = data, kernel = "linear", cost = cost)
440 | pred <- predict(fit, test)
441 | sum(pred != test$class)
442 | }))
443 | (cost <- costs[which.min(errs)])
444 |
445 | (fit <- svm(as.factor(class) ~ ., data = data, kernel = "linear", cost = cost))
446 |
447 | test$prediction <- predict(fit, test)
448 | p <- ggplot(test, aes(x = x, y = y, color = class, shape = prediction == class)) +
449 | geom_point(size = 2) +
450 | scale_colour_identity()
451 | p
452 | ```
453 |
454 | > d. Discuss your results.
455 |
456 | A large cost leads to overfitting, as the model finds the perfect linear
457 | separation between red and blue in the training data. A lower cost then
458 | leads to improved prediction on the test data.
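As a quick check of this, we can count the support vectors at each value of `cost` (a small sketch reusing the `data` and `costs` objects defined above; `tot.nSV` is the total number of support vectors stored in an `e1071::svm` fit). Smaller values of `cost` give a softer, wider margin and therefore many support vectors, while large values leave only a few.

```{r}
# Smaller cost -> wider, softer margin -> more support vectors.
sapply(costs, function(cost) {
  svm(as.factor(class) ~ ., data = data, kernel = "linear", cost = cost)$tot.nSV
})
```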
459 | 460 | ### Question 7 461 | 462 | > In this problem, you will use support vector approaches in order to predict 463 | > whether a given car gets high or low gas mileage based on the `Auto` data set. 464 | > 465 | > a. Create a binary variable that takes on a 1 for cars with gas mileage above 466 | > the median, and a 0 for cars with gas mileage below the median. 467 | 468 | ```{r} 469 | library(ISLR2) 470 | data <- Auto 471 | data$high_mpg <- as.factor(as.numeric(data$mpg > median(data$mpg))) 472 | ``` 473 | 474 | > b. Fit a support vector classifier to the data with various values of `cost`, 475 | > in order to predict whether a car gets high or low gas mileage. Report the 476 | > cross-validation errors associated with different values of this parameter. 477 | > Comment on your results. Note you will need to fit the classifier without 478 | > the gas mileage variable to produce sensible results. 479 | 480 | ```{r} 481 | set.seed(42) 482 | costs <- 10^seq(-4, 3, by = 0.5) 483 | results <- list() 484 | f <- high_mpg ~ displacement + horsepower + weight 485 | results$linear <- tune(svm, f, 486 | data = data, kernel = "linear", 487 | ranges = list(cost = costs) 488 | ) 489 | summary(results$linear) 490 | ``` 491 | 492 | > c. Now repeat (b), this time using SVMs with radial and polynomial basis 493 | > kernels, with different values of `gamma` and `degree` and `cost`. Comment 494 | > on your results. 495 | 496 | ```{r} 497 | results$polynomial <- tune(svm, f, 498 | data = data, kernel = "polynomial", 499 | ranges = list(cost = costs, degree = 1:3) 500 | ) 501 | summary(results$polynomial) 502 | 503 | results$radial <- tune(svm, f, 504 | data = data, kernel = "radial", 505 | ranges = list(cost = costs, gamma = 10^(-2:1)) 506 | ) 507 | summary(results$radial) 508 | 509 | sapply(results, function(x) x$best.performance) 510 | sapply(results, function(x) x$best.parameters) 511 | ``` 512 | 513 | > d. Make some plots to back up your assertions in (b) and (c). 514 | > 515 | > _Hint: In the lab, we used the `plot()` function for `svm` objects only in 516 | > cases with $p = 2$. When $p > 2$, you can use the `plot()` function to 517 | > create plots displaying pairs of variables at a time. Essentially, instead 518 | > of typing_ 519 | > 520 | > ```r 521 | > > plot(svmfit, dat) 522 | > ``` 523 | > 524 | > _where `svmfit` contains your fitted model and dat is a data frame 525 | > containing your data, you can type_ 526 | > 527 | > ```r 528 | > > plot(svmfit, dat, x1 ∼ x4) 529 | > ``` 530 | > 531 | > _in order to plot just the first and fourth variables. However, you must 532 | > replace `x1` and `x4` with the correct variable names. To find out more, 533 | > type `?plot.svm`._ 534 | 535 | ```{r} 536 | table(predict(results$radial$best.model, data), data$high_mpg) 537 | 538 | plot(results$radial$best.model, data, horsepower ~ displacement) 539 | plot(results$radial$best.model, data, horsepower ~ weight) 540 | plot(results$radial$best.model, data, displacement ~ weight) 541 | ``` 542 | 543 | ### Question 8 544 | 545 | > This problem involves the `OJ` data set which is part of the `ISLR2` package. 546 | > 547 | > a. Create a training set containing a random sample of 800 observations, and a 548 | > test set containing the remaining observations. 549 | 550 | ```{r} 551 | set.seed(42) 552 | train <- sample(seq_len(nrow(OJ)), 800) 553 | test <- setdiff(seq_len(nrow(OJ)), train) 554 | ``` 555 | 556 | > b. 
Fit a support vector classifier to the training data using `cost = 0.01`, 557 | > with `Purchase` as the response and the other variables as predictors. Use 558 | > the `summary()` function to produce summary statistics, and describe the 559 | > results obtained. 560 | 561 | ```{r} 562 | fit <- svm(Purchase ~ ., data = OJ[train, ], kernel = "linear", cost = 0.01) 563 | summary(fit) 564 | ``` 565 | 566 | > c. What are the training and test error rates? 567 | 568 | ```{r} 569 | err <- function(model, data) { 570 | t <- table(predict(model, data), data[["Purchase"]]) 571 | 1 - sum(diag(t)) / sum(t) 572 | } 573 | errs <- function(model) { 574 | c(train = err(model, OJ[train, ]), test = err(model, OJ[test, ])) 575 | } 576 | errs(fit) 577 | ``` 578 | 579 | > d. Use the `tune()` function to select an optimal cost. Consider values in the 580 | > range 0.01 to 10. 581 | 582 | ```{r} 583 | tuned <- tune(svm, Purchase ~ ., 584 | data = OJ[train, ], kernel = "linear", 585 | ranges = list(cost = 10^seq(-2, 1, length.out = 10)) 586 | ) 587 | tuned$best.parameters 588 | summary(tuned) 589 | ``` 590 | 591 | > e. Compute the training and test error rates using this new value for `cost`. 592 | 593 | ```{r} 594 | errs(tuned$best.model) 595 | ``` 596 | 597 | > f. Repeat parts (b) through (e) using a support vector machine with a radial 598 | > kernel. Use the default value for `gamma`. 599 | 600 | ```{r} 601 | tuned2 <- tune(svm, Purchase ~ ., 602 | data = OJ[train, ], kernel = "radial", 603 | ranges = list(cost = 10^seq(-2, 1, length.out = 10)) 604 | ) 605 | tuned2$best.parameters 606 | errs(tuned2$best.model) 607 | ``` 608 | 609 | > g. Repeat parts (b) through (e) using a support vector machine with a 610 | > polynomial kernel. Set `degree = 2`. 611 | 612 | ```{r} 613 | tuned3 <- tune(svm, Purchase ~ ., 614 | data = OJ[train, ], kernel = "polynomial", 615 | ranges = list(cost = 10^seq(-2, 1, length.out = 10)), degree = 2 616 | ) 617 | tuned3$best.parameters 618 | errs(tuned3$best.model) 619 | ``` 620 | 621 | > h. Overall, which approach seems to give the best results on this data? 622 | 623 | Overall the "radial" kernel appears to perform best in this case. 624 | -------------------------------------------------------------------------------- /08-tree-based-methods.Rmd: -------------------------------------------------------------------------------- 1 | # Tree-Based Methods 2 | 3 | ## Conceptual 4 | 5 | ### Question 1 6 | 7 | > Draw an example (of your own invention) of a partition of two-dimensional 8 | > feature space that could result from recursive binary splitting. Your example 9 | > should contain at least six regions. Draw a decision tree corresponding to 10 | > this partition. Be sure to label all aspects of your figures, including the 11 | > regions $R_1, R_2, ...,$ the cutpoints $t_1, t_2, ...,$ and so forth. 
12 | >
13 | > _Hint: Your result should look something like Figures 8.1 and 8.2._
14 |
15 | ```{r, message = FALSE, warning = FALSE}
16 | library(showtext)
17 | showtext::showtext_auto()
18 | library(ggplot2)
19 | library(tidyverse)
20 | library(ggtree)
21 | ```
22 |
23 | ```{r}
24 | tree <- ape::read.tree(text = "(((R1:1,R2:1)N1:2,R3:4)N2:2,(R4:2,(R5:1,R6:1)R3:2)N4:5)R;")
25 | tree$node.label <- c("Age < 40", "Weight < 100", "Weight < 70", "Age < 60", "Weight < 80")
26 |
27 | ggtree(tree, ladderize = FALSE) + scale_x_reverse() + coord_flip() +
28 | geom_tiplab(vjust = 2, hjust = 0.5) +
29 | geom_text2(aes(label = label, subset = !isTip), hjust = -0.1, vjust = -1)
30 | ```
31 |
32 | ```{r}
33 | plot(NULL,
34 | xlab = "Age (years)", ylab = "Weight (kg)",
35 | xlim = c(0, 100), ylim = c(40, 160), xaxs = "i", yaxs = "i"
36 | )
37 | abline(v = 40, col = "red", lty = 2)
38 | lines(c(0, 40), c(100, 100), col = "blue", lty = 2)
39 | lines(c(0, 40), c(70, 70), col = "blue", lty = 2)
40 | abline(v = 60, col = "red", lty = 2)
41 | lines(c(60, 100), c(80, 80), col = "blue", lty = 2)
42 |
43 | text(
44 | c(20, 20, 20, 50, 80, 80),
45 | c(55, 85, 130, 100, 60, 120),
46 | labels = c("R1", "R2", "R3", "R4", "R5", "R6")
47 | )
48 | ```
49 |
50 | ### Question 2
51 |
52 | > It is mentioned in Section 8.2.3 that boosting using depth-one trees (or
53 | > _stumps_) leads to an _additive_ model: that is, a model of the form
54 | > $$
55 | > f(X) = \sum_{j=1}^p f_j(X_j).
56 | > $$
57 | > Explain why this is the case. You can begin with (8.12) in Algorithm 8.2.
58 |
59 | Equation 8.12 (the boosted model output in Algorithm 8.2) is:
60 |
61 | $$
62 | \hat{f}(x) = \sum_{b=1}^B \lambda \hat{f}^b(x)
63 | $$
64 |
65 | where $\hat{f}^b(x)$ represents the $b$th tree, which here has a single split.
66 | Since each depth-one tree (stump) is a function of just one predictor, the
67 | terms in this sum can be grouped by predictor, giving a model of the form
68 | $f(X) = \sum_{j=1}^p f_j(X_j)$, i.e. an additive model. Trees of depth two would allow for interactions between pairs of variables.
69 |
70 | ### Question 3
71 |
72 | > Consider the Gini index, classification error, and cross-entropy in a simple
73 | > classification setting with two classes. Create a single plot that displays
74 | > each of these quantities as a function of $\hat{p}_{m1}$. The $x$-axis should
75 | > display $\hat{p}_{m1}$, ranging from 0 to 1, and the $y$-axis should display
76 | > the value of the Gini index, classification error, and entropy.
77 | >
78 | > _Hint: In a setting with two classes, $\hat{p}_{m1} = 1 - \hat{p}_{m2}$. You
79 | > could make this plot by hand, but it will be much easier to make in `R`._
80 |
81 | The *Gini index* is defined by
82 |
83 | $$G = \sum_{k=1}^{K} \hat{p}_{mk}(1 - \hat{p}_{mk})$$
84 |
85 | *Entropy* is given by
86 |
87 | $$D = -\sum_{k=1}^{K} \hat{p}_{mk}\log(\hat{p}_{mk})$$
88 |
89 | The *classification error* is
90 |
91 | $$E = 1 - \max_k(\hat{p}_{mk})$$
92 |
93 | ```{r}
94 | # These definitions assume there are only two classes
95 | p <- seq(0, 1, length.out = 100)
96 | data.frame(
97 | x = p,
98 | "Gini index" = p * (1 - p) * 2,
99 | "Entropy" = -(p * log(p) + (1 - p) * log(1 - p)),
100 | "Classification error" = 1 - pmax(p, 1 - p),
101 | check.names = FALSE
102 | ) |>
103 | pivot_longer(!x) |>
104 | ggplot(aes(x = x, y = value, color = name)) +
105 | geom_line(na.rm = TRUE)
106 | ```
107 |
108 | ### Question 4
109 |
110 | > This question relates to the plots in Figure 8.12.
111 | >
112 | > a. Sketch the tree corresponding to the partition of the predictor space
113 | > illustrated in the left-hand panel of Figure 8.12.
The numbers inside the 114 | > boxes indicate the mean of $Y$ within each region. 115 | 116 | ```{r} 117 | tree <- ape::read.tree(text = "(((3:1.5,(10:1,0:1)A:1)B:1,15:2)C:1,5:2)D;") 118 | tree$node.label <- c("X1 < 1", "X2 < 1", "X1 < 0", "X2 < 0") 119 | 120 | ggtree(tree, ladderize = FALSE) + scale_x_reverse() + coord_flip() + 121 | geom_tiplab(vjust = 2, hjust = 0.5) + 122 | geom_text2(aes(label = label, subset = !isTip), hjust = -0.1, vjust = -1) 123 | ``` 124 | 125 | > b. Create a diagram similar to the left-hand panel of Figure 8.12, using the 126 | > tree illustrated in the right-hand panel of the same figure. You should 127 | > divide up the predictor space into the correct regions, and indicate the 128 | > mean for each region. 129 | 130 | ```{r} 131 | plot(NULL, xlab = "X1", ylab = "X2", xlim = c(-1, 2), ylim = c(0, 3), xaxs = "i", yaxs = "i") 132 | abline(h = 1, col = "red", lty = 2) 133 | lines(c(1, 1), c(0, 1), col = "blue", lty = 2) 134 | lines(c(-1, 2), c(2, 2), col = "red", lty = 2) 135 | lines(c(0, 0), c(1, 2), col = "blue", lty = 2) 136 | text( 137 | c(0, 1.5, -0.5, 1, 0.5), 138 | c(0.5, 0.5, 1.5, 1.5, 2.5), 139 | labels = c("-1.80", "0.63", "-1.06", "0.21", "2.49") 140 | ) 141 | ``` 142 | 143 | ### Question 5 144 | 145 | > Suppose we produce ten bootstrapped samples from a data set containing red and 146 | > green classes. We then apply a classification tree to each bootstrapped sample 147 | > and, for a specific value of $X$, produce 10 estimates of 148 | > $P(\textrm{Class is Red}|X)$: 149 | > $$0.1, 0.15, 0.2, 0.2, 0.55, 0.6, 0.6, 0.65, 0.7, \textrm{and } 0.75.$$ 150 | > There are two common ways to combine these results together into a single 151 | > class prediction. One is the majority vote approach discussed in this chapter. 152 | > The second approach is to classify based on the average probability. In this 153 | > example, what is the final classification under each of these two approaches? 154 | 155 | ```{r} 156 | x <- c(0.1, 0.15, 0.2, 0.2, 0.55, 0.6, 0.6, 0.65, 0.7, 0.75) 157 | ifelse(mean(x > 0.5), "red", "green") # majority vote 158 | ifelse(mean(x) > 0.5, "red", "green") # average probability 159 | ``` 160 | 161 | ### Question 6 162 | 163 | > Provide a detailed explanation of the algorithm that is used to fit a 164 | > regression tree. 165 | 166 | First we perform binary recursive splitting of the data, to minimize RSS at 167 | each split. This is continued until there are n samples present in each leaf. 168 | Then we prune the tree to a set of subtrees determined by a parameter $\alpha$. 169 | Using K-fold CV, we select $\alpha$ to minimize the cross validation error. The 170 | final tree is then calculated using the complete dataset with the selected 171 | $\alpha$ value. 172 | 173 | ## Applied 174 | 175 | ### Question 7 176 | 177 | > In the lab, we applied random forests to the `Boston` data using `mtry = 6` 178 | > and using `ntree = 25` and `ntree = 500`. Create a plot displaying the test 179 | > error resulting from random forests on this data set for a more comprehensive 180 | > range of values for `mtry` and `ntree`. You can model your plot after Figure 181 | > 8.10. Describe the results obtained. 
182 | 183 | ```{r} 184 | library(ISLR2) 185 | library(randomForest) 186 | set.seed(42) 187 | 188 | train <- sample(c(TRUE, FALSE), nrow(Boston), replace = TRUE) 189 | 190 | rf_err <- function(mtry) { 191 | randomForest( 192 | Boston[train, -13], 193 | y = Boston[train, 13], 194 | xtest = Boston[!train, -13], 195 | ytest = Boston[!train, 13], 196 | mtry = mtry, 197 | ntree = 500 198 | )$test$mse 199 | } 200 | res <- lapply(c(1, 2, 3, 5, 7, 10, 12), rf_err) 201 | names(res) <- c(1, 2, 3, 5, 7, 10, 12) 202 | data.frame(res, check.names = FALSE) |> 203 | mutate(n = 1:500) |> 204 | pivot_longer(!n) |> 205 | ggplot(aes(x = n, y = value, color = name)) + 206 | geom_line(na.rm = TRUE) + 207 | xlab("Number of trees") + 208 | ylab("Error") + 209 | scale_y_log10() + 210 | scale_color_discrete(name = "No. variables at\neach split") 211 | ``` 212 | 213 | ### Question 8 214 | 215 | > In the lab, a classification tree was applied to the `Carseats` data set after 216 | > converting `Sales` into a qualitative response variable. Now we will seek to 217 | > predict `Sales` using regression trees and related approaches, treating the 218 | > response as a quantitative variable. 219 | > 220 | > a. Split the data set into a training set and a test set. 221 | 222 | ```{r} 223 | set.seed(42) 224 | train <- sample(c(TRUE, FALSE), nrow(Carseats), replace = TRUE) 225 | ``` 226 | 227 | > b. Fit a regression tree to the training set. Plot the tree, and interpret the 228 | > results. What test error rate do you obtain? 229 | 230 | ```{r} 231 | library(tree) 232 | tr <- tree(Sales ~ ., data = Carseats[train, ]) 233 | summary(tr) 234 | plot(tr) 235 | text(tr, pretty = 0, digits = 2, cex = 0.8) 236 | 237 | carseats_mse <- function(model) { 238 | p <- predict(model, newdata = Carseats[!train, ]) 239 | mean((p - Carseats[!train, "Sales"])^2) 240 | } 241 | carseats_mse(tr) 242 | ``` 243 | 244 | > c. Use cross-validation in order to determine the optimal level of tree 245 | > complexity. Does pruning the tree improve the test error rate? 246 | 247 | ```{r} 248 | res <- cv.tree(tr) 249 | plot(res$size, res$dev, type = "b", xlab = "Tree size", ylab = "Deviance") 250 | min <- which.min(res$dev) 251 | abline(v = res$size[min], lty = 2, col = "red") 252 | ``` 253 | 254 | Pruning improves performance very slightly (though this is not repeatable in 255 | different rounds of cross-validation). Arguably, a good balance is achieved 256 | when the tree size is 11. 257 | 258 | ```{r} 259 | ptr <- prune.tree(tr, best = 11) 260 | plot(ptr) 261 | text(ptr, pretty = 0, digits = 2, cex = 0.8) 262 | carseats_mse(ptr) 263 | ``` 264 | 265 | > d. Use the bagging approach in order to analyze this data. What test error 266 | > rate do you obtain? Use the `importance()` function to determine which 267 | > variables are most important. 268 | 269 | ```{r} 270 | # Here we can use random Forest with mtry = 10 = p (the number of predictor 271 | # variables) to perform bagging 272 | bagged <- randomForest(Sales ~ ., 273 | data = Carseats[train, ], mtry = 10, 274 | ntree = 200, importance = TRUE 275 | ) 276 | carseats_mse(bagged) 277 | importance(bagged) 278 | ``` 279 | 280 | The test error rate is ~2.8 which is a substantial improvement over the pruned 281 | regression tree above. 282 | 283 | > e. Use random forests to analyze this data. What test error rate do you 284 | > obtain? Use the `importance()` function to determine which variables are 285 | > most important. 
Describe the effect of $m$, the number of variables 286 | > considered at each split, on the error rate obtained. 287 | 288 | ```{r} 289 | rf <- randomForest(Sales ~ ., 290 | data = Carseats[train, ], mtry = 3, 291 | ntree = 500, importance = TRUE 292 | ) 293 | carseats_mse(rf) 294 | importance(rf) 295 | ``` 296 | 297 | The test error rate is ~3.0 which is a substantial improvement over the pruned 298 | regression tree above, although not quite as good as the bagging approach. 299 | 300 | > f. Now analyze the data using BART, and report your results. 301 | 302 | ```{r} 303 | library(BART) 304 | 305 | # For ease, we'll create a fake "predict" method that just returns 306 | # yhat.test.mean regardless of provided "newdata" 307 | predict.wbart <- function(model, ...) model$yhat.test.mean 308 | 309 | bartfit <- gbart(Carseats[train, 2:11], Carseats[train, 1], 310 | x.test = Carseats[!train, 2:11] 311 | ) 312 | carseats_mse(bartfit) 313 | ``` 314 | 315 | The test error rate is ~1.6 which is an improvement over random forest and 316 | bagging. 317 | 318 | ### Question 9 319 | 320 | > This problem involves the `OJ` data set which is part of the `ISLR2` package. 321 | > 322 | > a. Create a training set containing a random sample of 800 observations, and a 323 | > test set containing the remaining observations. 324 | 325 | ```{r} 326 | set.seed(42) 327 | train <- sample(1:nrow(OJ), 800) 328 | test <- setdiff(1:nrow(OJ), train) 329 | ``` 330 | 331 | > b. Fit a tree to the training data, with `Purchase` as the response and the 332 | > other variables except for `Buy` as predictors. Use the `summary()` 333 | > function to produce summary statistics about the tree, and describe the 334 | > results obtained. What is the training error rate? How many terminal nodes 335 | > does the tree have? 336 | 337 | ```{r} 338 | tr <- tree(Purchase ~ ., data = OJ[train, ]) 339 | summary(tr) 340 | ``` 341 | 342 | > c. Type in the name of the tree object in order to get a detailed text output. 343 | > Pick one of the terminal nodes, and interpret the information displayed. 344 | 345 | ```{r} 346 | tr 347 | ``` 348 | 349 | > d. Create a plot of the tree, and interpret the results. 350 | 351 | ```{r} 352 | plot(tr) 353 | text(tr, pretty = 0, digits = 2, cex = 0.8) 354 | ``` 355 | 356 | > e. Predict the response on the test data, and produce a confusion matrix 357 | > comparing the test labels to the predicted test labels. What is the test 358 | > error rate? 359 | 360 | ```{r} 361 | table(predict(tr, OJ[test, ], type = "class"), OJ[test, "Purchase"]) 362 | ``` 363 | 364 | > f. Apply the `cv.tree()` function to the training set in order to determine 365 | > the optimal tree size. 366 | 367 | ```{r} 368 | set.seed(42) 369 | res <- cv.tree(tr) 370 | ``` 371 | 372 | > g. Produce a plot with tree size on the $x$-axis and cross-validated 373 | > classification error rate on the $y$-axis. 374 | 375 | ```{r} 376 | plot(res$size, res$dev, type = "b", xlab = "Tree size", ylab = "Deviance") 377 | min <- which.min(res$dev) 378 | abline(v = res$size[min], lty = 2, col = "red") 379 | ``` 380 | 381 | > h. Which tree size corresponds to the lowest cross-validated classification 382 | > error rate? 383 | 384 | ```{r} 385 | res$size[min] 386 | ``` 387 | 388 | > i. Produce a pruned tree corresponding to the optimal tree size obtained using 389 | > cross-validation. If cross-validation does not lead to selection of a 390 | > pruned tree, then create a pruned tree with five terminal nodes. 
391 | 392 | ```{r} 393 | ptr <- prune.tree(tr, best = res$size[min]) 394 | plot(ptr) 395 | text(ptr, pretty = 0, digits = 2, cex = 0.8) 396 | ``` 397 | 398 | > j. Compare the training error rates between the pruned and unpruned trees. 399 | > Which is higher? 400 | 401 | ```{r} 402 | oj_misclass <- function(model) { 403 | summary(model)$misclass[1] / summary(model)$misclass[2] 404 | } 405 | oj_misclass(tr) 406 | oj_misclass(ptr) 407 | ``` 408 | 409 | The training misclassification error rate is slightly higher for the pruned tree. 410 | 411 | > k. Compare the test error rates between the pruned and unpruned trees. Which 412 | > is higher? 413 | 414 | ```{r} 415 | oj_err <- function(model) { 416 | p <- predict(model, newdata = OJ[test, ], type = "class") 417 | mean(p != OJ[test, "Purchase"]) 418 | } 419 | oj_err(tr) 420 | oj_err(ptr) 421 | ``` 422 | 423 | The test misclassification error rate is slightly higher for the pruned tree. 424 | 425 | ### Question 10 426 | 427 | > We now use boosting to predict `Salary` in the `Hitters` data set. 428 | > 429 | > a. Remove the observations for whom the salary information is unknown, and 430 | > then log-transform the salaries. 431 | 432 | ```{r} 433 | dat <- Hitters 434 | dat <- dat[!is.na(dat$Salary), ] 435 | dat$Salary <- log(dat$Salary) 436 | ``` 437 | 438 | > b. Create a training set consisting of the first 200 observations, and a test 439 | > set consisting of the remaining observations. 440 | 441 | ```{r} 442 | train <- 1:200 443 | test <- setdiff(1:nrow(dat), train) 444 | ``` 445 | 446 | > c. Perform boosting on the training set with 1,000 trees for a range of values 447 | > of the shrinkage parameter $\lambda$. Produce a plot with different 448 | > shrinkage values on the $x$-axis and the corresponding training set MSE on 449 | > the $y$-axis. 450 | 451 | ```{r} 452 | library(gbm) 453 | set.seed(42) 454 | lambdas <- 10^seq(-3, 0, by = 0.1) 455 | fits <- lapply(lambdas, function(lam) { 456 | gbm(Salary ~ ., 457 | data = dat[train, ], distribution = "gaussian", 458 | n.trees = 1000, shrinkage = lam 459 | ) 460 | }) 461 | errs <- sapply(fits, function(fit) { 462 | p <- predict(fit, dat[train, ], n.trees = 1000) 463 | mean((p - dat[train, ]$Salary)^2) 464 | }) 465 | plot(lambdas, errs, 466 | type = "b", xlab = "Shrinkage values", 467 | ylab = "Training MSE", log = "xy" 468 | ) 469 | ``` 470 | 471 | > d. Produce a plot with different shrinkage values on the $x$-axis and the 472 | > corresponding test set MSE on the $y$-axis. 473 | 474 | ```{r} 475 | errs <- sapply(fits, function(fit) { 476 | p <- predict(fit, dat[test, ], n.trees = 1000) 477 | mean((p - dat[test, ]$Salary)^2) 478 | }) 479 | plot(lambdas, errs, 480 | type = "b", xlab = "Shrinkage values", 481 | ylab = "Training MSE", log = "xy" 482 | ) 483 | min(errs) 484 | abline(v = lambdas[which.min(errs)], lty = 2, col = "red") 485 | ``` 486 | 487 | > e. Compare the test MSE of boosting to the test MSE that results from applying 488 | > two of the regression approaches seen in Chapters 3 and 6. 
489 |
490 | Linear regression
491 |
492 | ```{r}
493 | fit1 <- lm(Salary ~ ., data = dat[train, ])
494 | mean((predict(fit1, dat[test, ]) - dat[test, "Salary"])^2)
495 | ```
496 |
497 | The lasso (`alpha = 1` in `glmnet` gives the lasso penalty; `alpha = 0` would give ridge regression)
498 |
499 | ```{r}
500 | library(glmnet)
501 | x <- model.matrix(Salary ~ ., data = dat[train, ])
502 | x.test <- model.matrix(Salary ~ ., data = dat[test, ])
503 | y <- dat[train, "Salary"]
504 | fit2 <- glmnet(x, y, alpha = 1)
505 | mean((predict(fit2, s = 0.1, newx = x.test) - dat[test, "Salary"])^2)
506 | ```
507 |
508 | > f. Which variables appear to be the most important predictors in the boosted
509 | > model?
510 |
511 | ```{r}
512 | summary(fits[[which.min(errs)]])
513 | ```
514 |
515 | > g. Now apply bagging to the training set. What is the test set MSE for this
516 | > approach?
517 |
518 | ```{r}
519 | set.seed(42)
520 | bagged <- randomForest(Salary ~ ., data = dat[train, ], mtry = 19, ntree = 1000)
521 | mean((predict(bagged, newdata = dat[test, ]) - dat[test, "Salary"])^2)
522 | ```
523 |
524 | ### Question 11
525 |
526 | > This question uses the `Caravan` data set.
527 | >
528 | > a. Create a training set consisting of the first 1,000 observations, and a
529 | > test set consisting of the remaining observations.
530 |
531 | ```{r}
532 | train <- 1:1000
533 | test <- setdiff(1:nrow(Caravan), train)
534 | ```
535 |
536 | > b. Fit a boosting model to the training set with `Purchase` as the response
537 | > and the other variables as predictors. Use 1,000 trees, and a shrinkage
538 | > value of 0.01. Which predictors appear to be the most important?
539 |
540 | ```{r}
541 | set.seed(42)
542 | fit <- gbm(as.numeric(Purchase == "Yes") ~ ., data = Caravan[train, ], n.trees = 1000, shrinkage = 0.01)
543 | head(summary(fit))
544 | ```
545 |
546 | > c. Use the boosting model to predict the response on the test data. Predict
547 | > that a person will make a purchase if the estimated probability of purchase
548 | > is greater than 20%. Form a confusion matrix. What fraction of the people
549 | > predicted to make a purchase do in fact make one? How does this compare
550 | > with the results obtained from applying KNN or logistic regression to this
551 | > data set?
552 |
553 | ```{r}
554 | p <- predict(fit, Caravan[test, ], n.trees = 1000, type = "response")
555 | table(p > 0.2, Caravan[test, "Purchase"] == "Yes")
556 | sum(p > 0.2 & Caravan[test, "Purchase"] == "Yes") / sum(p > 0.2)
557 | ```
558 |
559 | 141 (109 + 32) people are predicted to purchase. Of these, 32 actually do (roughly 23%).
560 |
561 | ```{r}
562 | # Logistic regression
563 | fit <- glm(Purchase == "Yes" ~ ., data = Caravan[train, ], family = "binomial")
564 | p <- predict(fit, Caravan[test, ], type = "response")
565 | table(p > 0.2, Caravan[test, "Purchase"] == "Yes")
566 | sum(p > 0.2 & Caravan[test, "Purchase"] == "Yes") / sum(p > 0.2)
567 | ```
568 |
569 | For logistic regression, we correctly predict 14% of those predicted to purchase.
570 |
571 | ```{r}
572 | library(class)
573 | # KNN
574 | fit <- knn(Caravan[train, -86], Caravan[test, -86], Caravan$Purchase[train])
575 | table(fit, Caravan[test, "Purchase"] == "Yes")
576 | sum(fit == "Yes" & Caravan[test, "Purchase"] == "Yes") / sum(fit == "Yes")
577 | ```
578 |
579 | For KNN, we correctly predict 8.7% of those predicted to purchase.
580 |
581 | ### Question 12
582 |
583 | > Apply boosting, bagging, random forests and BART to a data set of your choice.
584 | > Be sure to fit the models on a training set and to evaluate their performance
585 | > on a test set.
How accurate are the results compared to simple methods like 586 | > linear or logistic regression? Which of these approaches yields the best 587 | > performance? 588 | 589 | Here I'm going to use the College dataset (used in Question 10 from Chapter 7 590 | to compare performance with the GAM we previously built). In this model we 591 | were trying to predict `Outstate` using the other variables in `College`. 592 | 593 | ```{r} 594 | library(gam) 595 | set.seed(42) 596 | train <- sample(1:nrow(College), 400) 597 | test <- setdiff(1:nrow(College), train) 598 | 599 | # Linear regression 600 | lr <- gam(Outstate ~ ., data = College[train, ]) 601 | 602 | # GAM from chapter 7 603 | gam <- gam(Outstate ~ Private + s(Room.Board, 2) + s(PhD, 2) + 604 | s(perc.alumni, 2) + s(Expend, 2) + s(Grad.Rate, 2), data = College[train, ]) 605 | 606 | # Boosting 607 | boosted <- gbm(Outstate ~ ., data = College[train, ], n.trees = 1000, shrinkage = 0.01) 608 | 609 | # Bagging (random forest with mtry = no. predictors) 610 | bagged <- randomForest(Outstate ~ ., data = College[train, ], mtry = 17, ntree = 1000) 611 | 612 | # Random forest with mtry = sqrt(no. predictors) 613 | rf <- randomForest(Outstate ~ ., data = College[train, ], mtry = 4, ntree = 1000) 614 | 615 | # BART 616 | pred <- setdiff(colnames(College), "Outstate") 617 | bart <- gbart(College[train, pred], College[train, "Outstate"], 618 | x.test = College[test, pred] 619 | ) 620 | 621 | mse <- function(model, ...) { 622 | pred <- predict(model, College[test, ], ...) 623 | mean((College$Outstate[test] - pred)^2) 624 | } 625 | 626 | res <- c( 627 | "Linear regression" = mse(lr), 628 | "GAM" = mse(gam), 629 | "Boosting" = mse(boosted, n.trees = 1000), 630 | "Bagging" = mse(bagged), 631 | "Random forest" = mse(rf), 632 | "BART" = mse(bart) 633 | ) 634 | res <- data.frame("MSE" = res) 635 | res$Model <- factor(row.names(res), levels = rev(row.names(res))) 636 | ggplot(res, aes(Model, MSE)) + 637 | coord_flip() + 638 | geom_bar(stat = "identity", fill = "steelblue") 639 | ``` 640 | 641 | In this case, it looks like bagging produces the best performing model in terms 642 | of test mean square error. 643 | -------------------------------------------------------------------------------- /10-deep-learning.Rmd: -------------------------------------------------------------------------------- 1 | # Deep Learning 2 | 3 | ## Conceptual 4 | 5 | ### Question 1 6 | 7 | > Consider a neural network with two hidden layers: $p = 4$ input units, 2 units 8 | > in the first hidden layer, 3 units in the second hidden layer, and a single 9 | > output. 10 | > 11 | > a. Draw a picture of the network, similar to Figures 10.1 or 10.4. 12 | 13 | ```{r, echo=FALSE, out.width="80%"} 14 | knitr::include_graphics("images/nn.png") 15 | ``` 16 | 17 | > b. Write out an expression for $f(X)$, assuming ReLU activation functions. Be 18 | > as explicit as you can! 19 | 20 | The three layers (from our final output layer back to the start of our network) 21 | can be described as: 22 | 23 | \begin{align*} 24 | f(X) &= g(w_{0}^{(3)} + \sum^{K_2}_{l=1} w_{l}^{(3)} A_l^{(2)}) \\ 25 | A_l^{(2)} &= h_l^{(2)}(X) = g(w_{l0}^{(2)} + \sum_{k=1}^{K_1} w_{lk}^{(2)} A_k^{(1)})\\ 26 | A_k^{(1)} &= h_k^{(1)}(X) = g(w_{k0}^{(1)} + \sum_{j=1}^p w_{kj}^{(1)} X_j) \\ 27 | \end{align*} 28 | 29 | for $l = 1, ..., K_2 = 3$ and $k = 1, ..., K_1 = 2$ and $p = 4$, where, 30 | 31 | $$ 32 | g(z) = (z)_+ = \begin{cases} 33 | 0, & \text{if } z < 0 \\ 34 | z, & \text{otherwise} 35 | \end{cases} 36 | $$ 37 | 38 | > c. 
Now plug in some values for the coefficients and write out the value of 39 | > $f(X)$. 40 | 41 | We can perhaps achieve this most easily by fitting a real model. Note, 42 | in the plot shown here, we also include the "bias" or intercept terms. 43 | 44 | ```{r} 45 | library(ISLR2) 46 | library(neuralnet) 47 | library(sigmoid) 48 | set.seed(5) 49 | train <- sample(seq_len(nrow(ISLR2::Boston)), nrow(ISLR2::Boston) * 2 / 3) 50 | 51 | net <- neuralnet(crim ~ lstat + medv + ptratio + rm, 52 | data = ISLR2::Boston[train, ], 53 | act.fct = relu, 54 | hidden = c(2, 3) 55 | ) 56 | plot(net) 57 | ``` 58 | 59 | We can make a prediction for a given observation using this object. 60 | 61 | Firstly, let's find an "ambiguous" test sample 62 | 63 | ```{r} 64 | p <- predict(net, ISLR2::Boston[-train, ]) 65 | x <- ISLR2::Boston[-train, ][which.min(abs(p - mean(c(max(p), min(p))))), ] 66 | x <- x[, c("lstat", "medv", "ptratio", "rm")] 67 | predict(net, x) 68 | ``` 69 | 70 | Or, repeating by "hand": 71 | 72 | ```{r} 73 | g <- function(x) ifelse(x > 0, x, 0) # relu activation function 74 | w <- net$weights[[1]] # the estimated weights for each layer 75 | v <- as.numeric(x) # our input predictors 76 | 77 | # to calculate our prediction we can take the dot product of our predictors 78 | # (with 1 at the start for the bias term) and our layer weights, lw) 79 | for (lw in w) v <- g(c(1, v) %*% lw) 80 | v 81 | ``` 82 | 83 | > d. How many parameters are there? 84 | 85 | ```{r} 86 | length(unlist(net$weights)) 87 | ``` 88 | 89 | There are $4*2+2 + 2*3+3 + 3*1+1 = 23$ parameters. 90 | 91 | ### Question 2 92 | 93 | > Consider the _softmax_ function in (10.13) (see also (4.13) on page 141) 94 | > for modeling multinomial probabilities. 95 | > 96 | > a. In (10.13), show that if we add a constant $c$ to each of the $z_l$, then 97 | > the probability is unchanged. 98 | 99 | If we add a constant $c$ to each $Z_l$ in equation 10.13 we get: 100 | 101 | \begin{align*} 102 | Pr(Y=m|X) 103 | &= \frac{e^{Z_m+c}}{\sum_{l=0}^9e^{Z_l+c}} \\ 104 | &= \frac{e^{Z_m}e^c}{\sum_{l=0}^9e^{Z_l}e^c} \\ 105 | &= \frac{e^{Z_m}e^c}{e^c\sum_{l=0}^9e^{Z_l}} \\ 106 | &= \frac{e^{Z_m}}{\sum_{l=0}^9e^{Z_l}} \\ 107 | \end{align*} 108 | 109 | which is just equation 10.13. 110 | 111 | > b. In (4.13), show that if we add constants $c_j$, $j = 0,1,...,p$, to each of 112 | > the corresponding coefficients for each of the classes, then the predictions 113 | > at any new point $x$ are unchanged. 114 | 115 | 4.13 is 116 | 117 | $$ 118 | Pr(Y=k|X=x) = \frac 119 | {e^{\beta_{K0} + \beta_{K1}x_1 + ... + \beta_{Kp}x_p}} 120 | {\sum_{l=1}^K e^{\beta_{l0} + \beta_{l1}x1 + ... + \beta_{lp}x_p}} 121 | $$ 122 | 123 | adding constants $c_j$ to each class gives: 124 | 125 | \begin{align*} 126 | Pr(Y=k|X=x) 127 | &= \frac 128 | {e^{\beta_{K0} + \beta_{K1}x_1 + c_1 + ... + \beta_{Kp}x_p + c_p}} 129 | {\sum_{l=1}^K e^{\beta_{l0} + \beta_{l1}x1 + c_1 + ... + \beta_{lp}x_p + c_p}} \\ 130 | &= \frac 131 | {e^{c1 + ... + c_p}e^{\beta_{K0} + \beta_{K1}x_1 + ... + \beta_{Kp}x_p}} 132 | {\sum_{l=1}^K e^{c1 + ... + c_p}e^{\beta_{l0} + \beta_{l1}x1 + ... + \beta_{lp}x_p}} \\ 133 | &= \frac 134 | {e^{c1 + ... + c_p}e^{\beta_{K0} + \beta_{K1}x_1 + ... + \beta_{Kp}x_p}} 135 | {e^{c1 + ... + c_p}\sum_{l=1}^K e^{\beta_{l0} + \beta_{l1}x1 + ... + \beta_{lp}x_p}} \\ 136 | &= \frac 137 | {e^{\beta_{K0} + \beta_{K1}x_1 + ... + \beta_{Kp}x_p}} 138 | {\sum_{l=1}^K e^{\beta_{l0} + \beta_{l1}x1 + ... 
+ \beta_{lp}x_p}} \\ 139 | \end{align*} 140 | 141 | which collapses to 4.13 (with the same argument as above). 142 | 143 | > This shows that the softmax function is _over-parametrized_. However, 144 | > regularization and SGD typically constrain the solutions so that this is not a 145 | > problem. 146 | 147 | ### Question 3 148 | 149 | > Show that the negative multinomial log-likelihood (10.14) is equivalent to 150 | > the negative log of the likelihood expression (4.5) when there are $M = 2$ 151 | > classes. 152 | 153 | Equation 10.14 is 154 | 155 | $$ 156 | -\sum_{i=1}^n \sum_{m=0}^9 y_{im}\log(f_m(x_i)) 157 | $$ 158 | 159 | Equation 4.5 is: 160 | 161 | $$ 162 | \ell(\beta_0, \beta_1) = \prod_{i:y_i=1}p(x_i) \prod_{i':y_i'=0}(1-p(x_i')) 163 | $$ 164 | 165 | So, $\log(\ell)$ is: 166 | 167 | \begin{align*} 168 | \log(\ell) 169 | &= \log \left( \prod_{i:y_i=1}p(x_i) \prod_{i':y_i'=0}(1-p(x_i')) \right ) \\ 170 | &= \sum_{i:y_1=1}\log(p(x_i)) + \sum_{i':y_i'=0}\log(1-p(x_i')) \\ 171 | \end{align*} 172 | 173 | If we set $y_i$ to be an indicator variable such that $y_{i1}$ and $y_{i0}$ are 174 | 1 and 0 (or 0 and 1) when our $i$th observation is 1 (or 0) respectively, then 175 | we can write: 176 | 177 | $$ 178 | \log(\ell) = \sum_{i}y_{i1}\log(p(x_i)) + \sum_{i}y_{i0}\log(1-p(x_i')) 179 | $$ 180 | 181 | If we also let $f_1(x) = p(x)$ and $f_0(x) = 1 - p(x)$ then: 182 | 183 | \begin{align*} 184 | \log(\ell) 185 | &= \sum_i y_{i1}\log(f_1(x_i)) + \sum_{i}y_{i0}\log(f_0(x_i')) \\ 186 | &= \sum_i \sum_{m=0}^1 y_{im}\log(f_m(x_i)) \\ 187 | \end{align*} 188 | 189 | When we take the negative of this, it is equivalent to 10.14 for two classes 190 | ($m = 0,1$). 191 | 192 | ### Question 4 193 | 194 | > Consider a CNN that takes in $32 \times 32$ grayscale images and has a single 195 | > convolution layer with three $5 \times 5$ convolution filters (without 196 | > boundary padding). 197 | > 198 | > a. Draw a sketch of the input and first hidden layer similar to Figure 10.8. 199 | 200 | ```{r, echo=FALSE, out.width="50%"} 201 | knitr::include_graphics("images/nn2.png") 202 | ``` 203 | 204 | Note that, because there is no boundary padding, each convolution layer will 205 | consist of a 28x28 array. 206 | 207 | > b. How many parameters are in this model? 208 | 209 | There are 3 convolution matrices each with 5x5 weights (plus 3 bias terms) to 210 | estimate, therefore $3 \times 5 \times 5 + 3 = 78$ parameters 211 | 212 | > c. Explain how this model can be thought of as an ordinary feed-forward 213 | > neural network with the individual pixels as inputs, and with constraints on 214 | > the weights in the hidden units. What are the constraints? 215 | 216 | We can think of a convolution layer as a regularized fully connected layer. 217 | The regularization in this case is due to not all inputs being connected to 218 | all outputs, and weights being shared between connections. 219 | 220 | Each output node in the convolved image can be thought of as taking inputs from 221 | a limited number of input pixels (the neighboring pixels), with a set of 222 | weights specified by the convolution layer which are then shared by the 223 | connections to all other output nodes. 224 | 225 | > d. If there were no constraints, then how many weights would there be in the 226 | > ordinary feed-forward neural network in (c)? 227 | 228 | With no constraints, we would connect each input pixel in our original 32x32 229 | image to each output pixel in each of our convolution layers, with an bias 230 | term for each original pixel. 
So each output pixel would require 32x32 weights
231 | + 1 bias term. This would give a total of (32×32+1)×28×28×3 = 2,410,800
232 | parameters.
233 |
234 | ### Question 5
235 |
236 | > In Table 10.2 on page 433, we see that the ordering of the three methods with
237 | > respect to mean absolute error is different from the ordering with respect to
238 | > test set $R^2$. How can this be?
239 |
240 | Mean absolute error considers _absolute_ differences between predictions and
241 | observed values, whereas $R^2$ considers the (normalized) sum of _squared_
242 | differences. Large errors therefore contribute relatively more to $R^2$ than to mean
243 | absolute error (a single large residual is inflated by squaring), so the two metrics can rank the methods differently.
244 |
245 | ## Applied
246 |
247 | ### Question 6
248 |
249 | > Consider the simple function $R(\beta) = sin(\beta) + \beta/10$.
250 | >
251 | > a. Draw a graph of this function over the range $\beta \in [-6, 6]$.
252 |
253 | ```{r}
254 | r <- function(x) sin(x) + x / 10
255 | x <- seq(-6, 6, 0.1)
256 | plot(x, r(x), type = "l")
257 | ```
258 |
259 | > b. What is the derivative of this function?
260 |
261 | $$
262 | \cos(\beta) + 1/10
263 | $$
264 |
265 | > c. Given $\beta^0 = 2.3$, run gradient descent to find a local minimum of
266 | > $R(\beta)$ using a learning rate of $\rho = 0.1$. Show each of
267 | > $\beta^0, \beta^1, ...$ in your plot, as well as the final answer.
268 |
269 | The derivative of our function, $\cos(x) + 1/10$ (writing $x$ in place of $\beta$ to match the code), gives us the gradient at
270 | a given $x$. For gradient descent, we move $x$ a little in the _opposite_
271 | direction, for some learning rate $\rho = 0.1$:
272 |
273 | $$
274 | x^{m+1} = x^m - \rho (\cos(x^m) + 1/10)
275 | $$
276 |
277 | ```{r}
278 | iter <- function(x, rho) x - rho * (cos(x) + 1 / 10)
279 | gd <- function(start, rho = 0.1) {
280 | b <- start
281 | v <- b
282 | while (abs(b - iter(b, rho)) > 1e-8) {
283 | b <- iter(b, rho)
284 | v <- c(v, b)
285 | }
286 | v
287 | }
288 |
289 | res <- gd(2.3)
290 | res[length(res)]
291 | ```
292 |
293 | ```{r}
294 | plot(x, r(x), type = "l")
295 | points(res, r(res), col = "red", pch = 19)
296 | ```
297 |
298 |
299 | > d. Repeat with $\beta^0 = 1.4$.
300 |
301 | ```{r}
302 | res <- gd(1.4)
303 | res[length(res)]
304 | ```
305 |
306 | ```{r}
307 | plot(x, r(x), type = "l")
308 | points(res, r(res), col = "red", pch = 19)
309 | ```
310 |
311 | ### Question 7
312 |
313 | > Fit a neural network to the `Default` data. Use a single hidden layer with 10
314 | > units, and dropout regularization. Have a look at Labs 10.9.1--10.9.2 for
315 | > guidance. Compare the classification performance of your model with that of
316 | > linear logistic regression.
317 |
318 | ```{r, cache = TRUE}
319 | library(keras)
320 |
321 | dat <- ISLR2::Boston
322 | x <- scale(model.matrix(crim ~ .
- 1, data = dat))
323 | n <- nrow(dat)
324 | ntest <- trunc(n / 3)
325 | testid <- sample(1:n, ntest)
326 | y <- dat$crim
327 |
328 | # linear regression
329 | lfit <- lm(crim ~ ., data = dat[-testid, ])
330 | lpred <- predict(lfit, dat[testid, ])
331 | with(dat[testid, ], mean(abs(lpred - crim)))
332 |
333 | # keras
334 | nn <- keras_model_sequential() |>
335 | layer_dense(units = 10, activation = "relu", input_shape = ncol(x)) |>
336 | layer_dropout(rate = 0.4) |>
337 | layer_dense(units = 1)
338 |
339 | compile(nn,
340 | loss = "mse",
341 | optimizer = optimizer_rmsprop(),
342 | metrics = list("mean_absolute_error")
343 | )
344 |
345 | history <- fit(nn,
346 | x[-testid, ], y[-testid],
347 | epochs = 100,
348 | batch_size = 26,
349 | validation_data = list(x[testid, ], y[testid]),
350 | verbose = 0
351 | )
352 | plot(history, smooth = FALSE)
353 | npred <- predict(nn, x[testid, ])
354 | mean(abs(y[testid] - npred))
355 | ```
356 |
357 | In this case, the neural network outperforms the linear model, with a lower
358 | mean absolute error on the test data.
359 |
360 | ### Question 8
361 |
362 | > From your collection of personal photographs, pick 10 images of animals (such
363 | > as dogs, cats, birds, farm animals, etc.). If the subject does not occupy a
364 | > reasonable part of the image, then crop the image. Now use a pretrained image
365 | > classification CNN as in Lab 10.9.4 to predict the class of each of your
366 | > images, and report the probabilities for the top five predicted classes for
367 | > each image.
368 |
369 | ```{r, echo=FALSE}
370 | knitr::include_graphics(c(
371 | "images/animals/bird.jpg",
372 | "images/animals/bird2.jpg",
373 | "images/animals/bird3.jpg",
374 | "images/animals/bug.jpg",
375 | "images/animals/butterfly.jpg",
376 | "images/animals/butterfly2.jpg",
377 | "images/animals/elba.jpg",
378 | "images/animals/hamish.jpg",
379 | "images/animals/poodle.jpg",
380 | "images/animals/tortoise.jpg"
381 | ))
382 | ```
383 |
384 | ```{r}
385 | library(keras)
386 | images <- list.files("images/animals")
387 | x <- array(dim = c(length(images), 224, 224, 3))
388 | for (i in seq_len(length(images))) {
389 | img <- image_load(paste0("images/animals/", images[i]), target_size = c(224, 224))
390 | x[i, , , ] <- image_to_array(img)
391 | }
392 |
393 | model <- application_resnet50(weights = "imagenet")
394 |
395 | pred <- model |>
396 | predict(x) |>
397 | imagenet_decode_predictions(top = 5)
398 |
399 | names(pred) <- images
400 | print(pred)
401 | ```
402 |
403 | ### Question 9
404 |
405 | > Fit a lag-5 autoregressive model to the `NYSE` data, as described in the text
406 | > and Lab 10.9.6. Refit the model with a 12-level factor representing the
407 | > month. Does this factor improve the performance of the model?
408 |
409 | Fitting the model as described in the text.
410 | 411 | ```{r} 412 | library(tidyverse) 413 | library(ISLR2) 414 | xdata <- data.matrix(NYSE[, c("DJ_return", "log_volume", "log_volatility")]) 415 | istrain <- NYSE[, "train"] 416 | xdata <- scale(xdata) 417 | 418 | lagm <- function(x, k = 1) { 419 | n <- nrow(x) 420 | pad <- matrix(NA, k, ncol(x)) 421 | rbind(pad, x[1:(n - k), ]) 422 | } 423 | 424 | arframe <- data.frame( 425 | log_volume = xdata[, "log_volume"], 426 | L1 = lagm(xdata, 1), 427 | L2 = lagm(xdata, 2), 428 | L3 = lagm(xdata, 3), 429 | L4 = lagm(xdata, 4), 430 | L5 = lagm(xdata, 5) 431 | ) 432 | 433 | arframe <- arframe[-(1:5), ] 434 | istrain <- istrain[-(1:5)] 435 | 436 | arfit <- lm(log_volume ~ ., data = arframe[istrain, ]) 437 | arpred <- predict(arfit, arframe[!istrain, ]) 438 | V0 <- var(arframe[!istrain, "log_volume"]) 439 | 1 - mean((arpred - arframe[!istrain, "log_volume"])^2) / V0 440 | ``` 441 | 442 | Now we add month (and work with tidyverse). 443 | 444 | ```{r} 445 | arframe$month <- as.factor(str_match(NYSE$date, "-(\\d+)-")[, 2])[-(1:5)] 446 | arfit2 <- lm(log_volume ~ ., data = arframe[istrain, ]) 447 | arpred2 <- predict(arfit2, arframe[!istrain, ]) 448 | V0 <- var(arframe[!istrain, "log_volume"]) 449 | 1 - mean((arpred2 - arframe[!istrain, "log_volume"])^2) / V0 450 | ``` 451 | 452 | Adding month as a factor marginally improves the $R^2$ of our model (from 453 | 0.413223 to 0.4170418). This is a significant improvement in fit and model 454 | 2 has a lower AIC. 455 | 456 | ```{r} 457 | anova(arfit, arfit2) 458 | AIC(arfit, arfit2) 459 | ``` 460 | 461 | ### Question 10 462 | 463 | > In Section 10.9.6, we showed how to fit a linear AR model to the `NYSE` data 464 | > using the `lm()` function. However, we also mentioned that we can "flatten" 465 | > the short sequences produced for the RNN model in order to fit a linear AR 466 | > model. Use this latter approach to fit a linear AR model to the NYSE data. 467 | > Compare the test $R^2$ of this linear AR model to that of the linear AR model 468 | > that we fit in the lab. What are the advantages/disadvantages of each 469 | > approach? 470 | 471 | The `lm` model is the same as that fit above: 472 | 473 | ```{r} 474 | arfit <- lm(log_volume ~ ., data = arframe[istrain, ]) 475 | arpred <- predict(arfit, arframe[!istrain, ]) 476 | V0 <- var(arframe[!istrain, "log_volume"]) 477 | 1 - mean((arpred - arframe[!istrain, "log_volume"])^2) / V0 478 | ``` 479 | 480 | Now we reshape the data for the RNN 481 | 482 | ```{r} 483 | n <- nrow(arframe) 484 | xrnn <- data.matrix(arframe[, -1]) 485 | xrnn <- array(xrnn, c(n, 3, 5)) 486 | xrnn <- xrnn[, , 5:1] 487 | xrnn <- aperm(xrnn, c(1, 3, 2)) 488 | ``` 489 | 490 | We can add a "flatten" layer to turn the reshaped data into a long vector of 491 | predictors resulting in a linear AR model. 492 | 493 | ```{r} 494 | model <- keras_model_sequential() |> 495 | layer_flatten(input_shape = c(5, 3)) |> 496 | layer_dense(units = 1) 497 | ``` 498 | 499 | Now let's fit this model. 
500 | 501 | ```{r} 502 | model |> 503 | compile(optimizer = optimizer_rmsprop(), loss = "mse") 504 | 505 | history <- model |> 506 | fit( 507 | xrnn[istrain, , ], 508 | arframe[istrain, "log_volume"], 509 | batch_size = 64, 510 | epochs = 200, 511 | validation_data = list(xrnn[!istrain, , ], arframe[!istrain, "log_volume"]), 512 | verbose = 0 513 | ) 514 | 515 | plot(history, smooth = FALSE) 516 | kpred <- predict(model, xrnn[!istrain, , ]) 517 | 1 - mean((kpred - arframe[!istrain, "log_volume"])^2) / V0 518 | ``` 519 | 520 | Both models estimate the same number of coefficients/weights (16): 521 | 522 | ```{r} 523 | coef(arfit) 524 | model$get_weights() 525 | ``` 526 | 527 | The flattened RNN has a lower $R^2$ on the test data than our `lm` model 528 | above. The `lm` model is quicker to fit and conceptually simpler also 529 | giving us the ability to inspect the coefficients for different variables. 530 | 531 | The flattened RNN is regularized to some extent as data are processed in 532 | batches. 533 | 534 | ### Question 11 535 | 536 | > Repeat the previous exercise, but now fit a nonlinear AR model by "flattening" 537 | > the short sequences produced for the RNN model. 538 | 539 | From the book: 540 | 541 | > To fit a nonlinear AR model, we could add in a hidden layer. 542 | 543 | ```{r, c10q11} 544 | xfun::cache_rds({ 545 | model <- keras_model_sequential() |> 546 | layer_flatten(input_shape = c(5, 3)) |> 547 | layer_dense(units = 32, activation = "relu") |> 548 | layer_dropout(rate = 0.4) |> 549 | layer_dense(units = 1) 550 | 551 | model |> compile( 552 | loss = "mse", 553 | optimizer = optimizer_rmsprop(), 554 | metrics = "mse" 555 | ) 556 | 557 | history <- model |> 558 | fit( 559 | xrnn[istrain, , ], 560 | arframe[istrain, "log_volume"], 561 | batch_size = 64, 562 | epochs = 200, 563 | validation_data = list(xrnn[!istrain, , ], arframe[!istrain, "log_volume"]), 564 | verbose = 0 565 | ) 566 | 567 | plot(history, smooth = FALSE, metrics = "mse") 568 | kpred <- predict(model, xrnn[!istrain, , ]) 569 | 1 - mean((kpred - arframe[!istrain, "log_volume"])^2) / V0 570 | }) 571 | ``` 572 | 573 | This approach improves our $R^2$ over the linear model above. 574 | 575 | ### Question 12 576 | 577 | > Consider the RNN fit to the `NYSE` data in Section 10.9.6. Modify the code to 578 | > allow inclusion of the variable `day_of_week`, and fit the RNN. Compute the 579 | > test $R^2$. 580 | 581 | To accomplish this, I'll include day of the week as one of the lagged variables 582 | in the RNN. Thus, our input for each observation will be 4 x 5 (rather than 583 | 3 x 5). 
584 | 585 | ```{r, c10q12} 586 | xfun::cache_rds({ 587 | xdata <- data.matrix( 588 | NYSE[, c("day_of_week", "DJ_return", "log_volume", "log_volatility")] 589 | ) 590 | istrain <- NYSE[, "train"] 591 | xdata <- scale(xdata) 592 | 593 | arframe <- data.frame( 594 | log_volume = xdata[, "log_volume"], 595 | L1 = lagm(xdata, 1), 596 | L2 = lagm(xdata, 2), 597 | L3 = lagm(xdata, 3), 598 | L4 = lagm(xdata, 4), 599 | L5 = lagm(xdata, 5) 600 | ) 601 | arframe <- arframe[-(1:5), ] 602 | istrain <- istrain[-(1:5)] 603 | 604 | n <- nrow(arframe) 605 | xrnn <- data.matrix(arframe[, -1]) 606 | xrnn <- array(xrnn, c(n, 4, 5)) 607 | xrnn <- xrnn[, , 5:1] 608 | xrnn <- aperm(xrnn, c(1, 3, 2)) 609 | dim(xrnn) 610 | 611 | model <- keras_model_sequential() |> 612 | layer_simple_rnn( 613 | units = 12, 614 | input_shape = list(5, 4), 615 | dropout = 0.1, 616 | recurrent_dropout = 0.1 617 | ) |> 618 | layer_dense(units = 1) 619 | 620 | model |> compile(optimizer = optimizer_rmsprop(), loss = "mse") 621 | 622 | history <- model |> 623 | fit( 624 | xrnn[istrain, , ], 625 | arframe[istrain, "log_volume"], 626 | batch_size = 64, 627 | epochs = 200, 628 | validation_data = list(xrnn[!istrain, , ], arframe[!istrain, "log_volume"]), 629 | verbose = 0 630 | ) 631 | 632 | kpred <- predict(model, xrnn[!istrain, , ]) 633 | 1 - mean((kpred - arframe[!istrain, "log_volume"])^2) / V0 634 | }) 635 | ``` 636 | 637 | ### Question 13 638 | 639 | > Repeat the analysis of Lab 10.9.5 on the `IMDb` data using a similarly 640 | > structured neural network. There we used a dictionary of size 10,000. Consider 641 | > the effects of varying the dictionary size. Try the values 1000, 3000, 5000, 642 | > and 10,000, and compare the results. 643 | 644 | ```{r, c10q13} 645 | xfun::cache_rds({ 646 | library(knitr) 647 | accuracy <- c() 648 | for (max_features in c(1000, 3000, 5000, 10000)) { 649 | imdb <- dataset_imdb(num_words = max_features) 650 | c(c(x_train, y_train), c(x_test, y_test)) %<-% imdb 651 | 652 | maxlen <- 500 653 | x_train <- pad_sequences(x_train, maxlen = maxlen) 654 | x_test <- pad_sequences(x_test, maxlen = maxlen) 655 | 656 | model <- keras_model_sequential() |> 657 | layer_embedding(input_dim = max_features, output_dim = 32) |> 658 | layer_lstm(units = 32) |> 659 | layer_dense(units = 1, activation = "sigmoid") 660 | 661 | model |> compile( 662 | optimizer = "rmsprop", 663 | loss = "binary_crossentropy", 664 | metrics = "acc" 665 | ) 666 | 667 | history <- fit(model, x_train, y_train, 668 | epochs = 10, 669 | batch_size = 128, 670 | validation_data = list(x_test, y_test), 671 | verbose = 0 672 | ) 673 | 674 | predy <- predict(model, x_test) > 0.5 675 | accuracy <- c(accuracy, mean(abs(y_test == as.numeric(predy)))) 676 | } 677 | 678 | tibble( 679 | "Max Features" = c(1000, 3000, 5000, 10000), 680 | "Accuracy" = accuracy 681 | ) |> 682 | kable() 683 | }) 684 | ``` 685 | 686 | Varying the dictionary size does not make a substantial impact on our estimates 687 | of accuracy. However, the models do take a substantial amount of time to fit and 688 | it is not clear we are finding the best fitting models in each case. For 689 | example, the model using a dictionary size of 10,000 obtained an accuracy of 690 | 0.8721 in the text which is as different from the estimate obtained here as 691 | are the differences between the models with different dictionary sizes. 
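One way to reduce this run-to-run variability (not part of the original solution) is to fix the random number generators before each fit, so that differences between dictionary sizes are not swamped by differences in weight initialisation and dropout. A minimal sketch, assuming a recent version of the tensorflow R package (which provides `set_random_seed()`):

```r
library(tensorflow)

# Seeds R, NumPy and TensorFlow together. Calling this at the top of the
# max_features loop above (before building and fitting each model) makes
# repeated runs of the comparison reproducible.
set_random_seed(42)
```

Alternatively, the best validation accuracy reached during training (rather than the accuracy after the final epoch) could be reported for each dictionary size, which is less sensitive to exactly when training stops.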
692 | -------------------------------------------------------------------------------- /07-moving-beyond-linearity.Rmd: -------------------------------------------------------------------------------- 1 | # Moving Beyond Linearity 2 | 3 | ## Conceptual 4 | 5 | ### Question 1 6 | 7 | > It was mentioned in the chapter that a cubic regression spline with one knot 8 | > at $\xi$ can be obtained using a basis of the form $x, x^2, x^3, (x-\xi)^3_+$, 9 | > where $(x-\xi)^3_+ = (x-\xi)^3$ if $x>\xi$ and equals 0 otherwise. We will now 10 | > show that a function of the form 11 | > $$ 12 | > f(x)=\beta_0 +\beta_1x+\beta_2x^2 +\beta_3x^3 +\beta_4(x-\xi)^3_+ 13 | > $$ 14 | > is indeed a cubic regression spline, regardless of the values of 15 | > $\beta_0, \beta_1, \beta_2, \beta_3,\beta_4$. 16 | > 17 | > a. Find a cubic polynomial 18 | > $$ 19 | > f_1(x) = a_1 + b_1x + c_1x^2 + d_1x^3 20 | > $$ 21 | > such that $f(x) = f_1(x)$ for all $x \le \xi$. Express $a_1,b_1,c_1,d_1$ in 22 | > terms of $\beta_0, \beta_1, \beta_2, \beta_3, \beta_4$. 23 | 24 | In this case, for $x \le \xi$, the cubic polynomial simply has terms 25 | $a_1 = \beta_0$, $b_1 = \beta_1$, $c_1 = \beta_2$, $d_1 = \beta_3$. 26 | 27 | > b. Find a cubic polynomial 28 | > $$ 29 | > f_2(x) = a_2 + b_2x + c_2x^2 + d_2x^3 30 | > $$ 31 | > such that $f(x) = f_2(x)$ for all $x > \xi$. Express $a_2, b_2, c_2, d_2$ in 32 | > terms of $\beta_0, \beta_1, \beta_2, \beta_3, \beta_4$. We have now established 33 | > that $f(x)$ is a piecewise polynomial. 34 | 35 | For $x \gt \xi$, the cubic polynomial is (we now include the $\beta_4$ term): 36 | \begin{align} 37 | f(x) = & \beta_0 + \beta_1x + \beta_2x^2 + \beta_3x^3 + \beta_4(x-\xi)^3 \\ 38 | = & \beta_0 + \beta_1x + \beta_2x^2 + \beta_3x^3 + \beta_4(x^3 - 3x^2\xi + 3x\xi^2 -\xi^3) \\ 39 | = & \beta_0 - \beta_4\xi^3 + (\beta_1 + 3\beta_4\xi^2)x + 40 | (\beta_2 - 3\beta_4\xi)x^2 + (\beta_3 + \beta_4)x^3 41 | \end{align} 42 | 43 | Therefore, 44 | $a_2 = \beta_0 - \beta_4\xi^3$, $b_2 = \beta_1 + 3\beta_4\xi^2$, 45 | $c_2 = \beta_2 - 3\beta_4\xi$, $d_2 = \beta_3 + \beta_4$. 46 | 47 | > c. Show that $f_1(\xi) = f_2(\xi)$. That is, $f(x)$ is continuous at $\xi$. 48 | 49 | To do this, we replace $x$ with $\xi$ in the above equations and simplify. 50 | 51 | \begin{align} 52 | f_1(\xi) = \beta_0 + \beta_1\xi + \beta_2\xi^2 + \beta_3\xi^3 53 | \end{align} 54 | 55 | \begin{align} 56 | f_2(\xi) = & \beta_0 - \beta_4\xi^3 + (\beta_1 + 3\beta_4\xi^2)\xi + 57 | (\beta_2 - 3\beta_4\xi)\xi^2 + (\beta_3 + \beta_4)\xi^3 \\ 58 | = & \beta_0 - \beta_4\xi^3 + \beta_1\xi + 3\beta_4\xi^3 + 59 | \beta_2\xi^2 - 3\beta_4\xi^3 + \beta_3\xi^3 + \beta_4\xi^3 \\ 60 | = & \beta_0 + \beta_1\xi + \beta_2\xi^2 + \beta_3\xi^3 61 | \end{align} 62 | 63 | > d. Show that $f_1'(\xi) = f_2'(\xi)$. That is, $f'(x)$ is continuous at $\xi$. 64 | 65 | To do this we differentiate the above with respect to $x$. 66 | 67 | $$ 68 | f_1'(x) = \beta_1 + 2\beta_2x + 3\beta_3x^2 \\ 69 | f_1'(\xi) = \beta_1 + 2\beta_2\xi + 3\beta_3\xi^2 70 | $$ 71 | 72 | \begin{align} 73 | f_2'(x) & = \beta_1 + 3\beta_4\xi^2 + 2(\beta_2 - 3\beta_4\xi)x + 3(\beta_3 + \beta_4)x^2 \\ 74 | f_2'(\xi) & = \beta_1 + 3\beta_4\xi^2 + 2(\beta_2 - 3\beta_4\xi)\xi + 3(\beta_3 + \beta_4)\xi^2 \\ 75 | & = \beta_1 + 3\beta_4\xi^2 + 2\beta_2\xi - 6\beta_4\xi^2 + 3\beta_3\xi^2 + 3\beta_4\xi^2 \\ 76 | & = \beta_1 + 2\beta_2\xi + 3\beta_3\xi^2 77 | \end{align} 78 | 79 | > e. Show that $f_1''(\xi) = f_2''(\xi)$. That is, $f''(x)$ is continuous at $\xi$.
80 | > 81 | > Therefore, $f(x)$ is indeed a cubic spline. 82 | 83 | $$ 84 | f_1''(x) = 2\beta_2 + 6\beta_3x \\ 85 | f_1''(\xi) = 2\beta_2 + 6\beta_3\xi 86 | $$ 87 | 88 | $$ 89 | f_2''(x) = 2\beta_2 - 6\beta_4\xi + 6(\beta_3 + \beta_4)x 90 | $$ 91 | \begin{align} 92 | f_2''(\xi) & = 2\beta_2 - 6\beta_4\xi + 6\beta_3\xi + 6\beta_4\xi \\ 93 | & = 2\beta_2 + 6\beta_3\xi 94 | \end{align} 95 | 96 | > _Hint: Parts (d) and (e) of this problem require knowledge of single-variable 97 | > calculus. As a reminder, given a cubic polynomial_ 98 | > $$f_1(x) = a_1 + b_1x + c_1x^2 + d_1x^3,$$ 99 | > _the first derivative takes the form_ 100 | > $$f_1'(x) = b_1 + 2c_1x + 3d_1x^2$$ 101 | > _and the second derivative takes the form_ 102 | > $$f_1''(x) = 2c_1 + 6d_1x.$$ 103 | 104 | ### Question 2 105 | 106 | > Suppose that a curve $\hat{g}$ is computed to smoothly fit a set of $n$ points 107 | > using the following formula: 108 | > $$ 109 | > \DeclareMathOperator*{\argmin}{arg\,min} % Jan Hlavacek 110 | > \hat{g} = \argmin_g \left(\sum_{i=1}^n (y_i - g(x_i))^2 + \lambda \int \left[ g^{(m)}(x) \right]^2 dx \right), 111 | > $$ 112 | > where $g^{(m)}$ represents the $m$th derivative of $g$ (and $g^{(0)} = g$). 113 | > Provide example sketches of $\hat{g}$ in each of the following scenarios. 114 | > 115 | > a. $\lambda=\infty, m=0$. 116 | 117 | Here we penalize $g$ itself and an infinite $\lambda$ means that this penalty 118 | dominates. This means that $\hat{g}$ will be 0. 119 | 120 | > b. $\lambda=\infty, m=1$. 121 | 122 | Here we penalize the first derivative (the slope) of $g$ and an infinite 123 | $\lambda$ means that this penalty dominates. Thus the slope will be 0 124 | (a horizontal line that otherwise best fits the points, i.e. at the mean of $y$). 125 | 126 | > c. $\lambda=\infty, m=2$. 127 | 128 | Here we penalize the second derivative (the change of slope) of $g$ and an 129 | infinite $\lambda$ means that this penalty dominates. Thus the fit will be a 130 | straight line (the least squares line, since it otherwise best fits the points). 131 | 132 | > d. $\lambda=\infty, m=3$. 133 | 134 | Here we penalize the third derivative (the change of the change of slope) of $g$ 135 | and an infinite $\lambda$ means that this penalty dominates. In other words, 136 | the third derivative will be 0, so the fit will be the best fitting quadratic 137 | curve (one whose slope changes at a constant rate). 138 | 139 | > e. $\lambda=0, m=3$. 140 | 141 | Here we penalize the third derivative, but a value of $\lambda = 0$ means that 142 | there is no penalty. As a result, the curve is able to interpolate all points. 143 | 144 | ### Question 3 145 | 146 | > Suppose we fit a curve with basis functions 147 | > $b_1(X) = X$, 148 | > $b_2(X) = (X - 1)^2I(X \geq 1)$. 149 | > (Note that $I(X \geq 1)$ equals 1 for $X \geq 1$ and 0 otherwise.) We fit the 150 | > linear regression model 151 | > $$Y = \beta_0 +\beta_1b_1(X) + \beta_2b_2(X) + \epsilon,$$ 152 | > and obtain coefficient estimates 153 | > $\hat{\beta}_0 = 1, \hat{\beta}_1 = 1, \hat{\beta}_2 = -2$. 154 | > Sketch the estimated curve between $X = -2$ and $X = 2$. Note the intercepts, 155 | > slopes, and other relevant information. 156 | 157 | ```{r} 158 | x <- seq(-2, 2, length.out = 1000) 159 | f <- function(x) 1 + x - 2 * (x - 1)^2 * I(x >= 1) 160 | plot(x, f(x), type = "l") 161 | grid() 162 | ``` 163 | 164 | ### Question 4 165 | 166 | > Suppose we fit a curve with basis functions 167 | > $b_1(X) = I(0 \leq X \leq 2) - (X -1)I(1 \leq X \leq 2),$ 168 | > $b_2(X) = (X -3)I(3 \leq X \leq 4) + I(4 \lt X \leq 5)$.
169 | > We fit the linear regression model 170 | > $$Y = \beta_0 +\beta_1b_1(X) + \beta_2b_2(X) + \epsilon,$$ 171 | > and obtain coefficient estimates 172 | > $\hat{\beta}_0 = 1, \hat{\beta}_1 = 1, \hat{\beta}_2 = 3$. 173 | > Sketch the estimated curve between $X = -2$ and $X = 6$. Note the intercepts, 174 | > slopes, and other relevant information. 175 | 176 | ```{r} 177 | x <- seq(-2, 6, length.out = 1000) 178 | b1 <- function(x) I(0 <= x & x <= 2) - (x - 1) * I(1 <= x & x <= 2) 179 | b2 <- function(x) (x - 3) * I(3 <= x & x <= 4) + I(4 < x & x <= 5) 180 | f <- function(x) 1 + 1 * b1(x) + 3 * b2(x) 181 | plot(x, f(x), type = "l") 182 | grid() 183 | ``` 184 | 185 | ### Question 5 186 | 187 | > Consider two curves, $\hat{g}$ and $\hat{g}_2$, defined by 188 | > 189 | > $$ 190 | > \hat{g}_1 = \argmin_g \left(\sum_{i=1}^n (y_i - g(x_i))^2 + 191 | > \lambda \int \left[ g^{(3)}(x) \right]^2 dx \right), 192 | > $$ 193 | > $$ 194 | > \hat{g}_2 = \argmin_g \left(\sum_{i=1}^n (y_i - g(x_i))^2 + 195 | > \lambda \int \left[ g^{(4)}(x) \right]^2 dx \right), 196 | > $$ 197 | > 198 | > where $g^{(m)}$ represents the $m$th derivative of $g$. 199 | > 200 | > a. As $\lambda \to \infty$, will $\hat{g}_1$ or $\hat{g}_2$ have the smaller 201 | > training RSS? 202 | 203 | $\hat{g}_2$ is more flexible (by penalizing a higher derivative of $g$) and 204 | so will have a smaller training RSS. 205 | 206 | > b. As $\lambda \to \infty$, will $\hat{g}_1$ or $\hat{g}_2$ have the smaller 207 | > test RSS? 208 | 209 | We cannot tell which function will produce a smaller test RSS, but there is 210 | chance that $\hat{g}_1$ will if $\hat{g}_2$ overfits the data. 211 | 212 | > c. For $\lambda = 0$, will $\hat{g}_1$ or $\hat{g}_2$ have the smaller training 213 | > and test RSS? 214 | 215 | When $\lambda = 0$ there is no penalty, so both functions will give the same 216 | result: perfect interpolation of the training data. Thus training RSS will be 217 | 0 but test RSS could be high. 218 | 219 | ## Applied 220 | 221 | ### Question 6 222 | 223 | > In this exercise, you will further analyze the `Wage` data set considered 224 | > throughout this chapter. 225 | > 226 | > a. Perform polynomial regression to predict `wage` using `age`. Use 227 | > cross-validation to select the optimal degree $d$ for the polynomial. What 228 | > degree was chosen, and how does this compare to the results of hypothesis 229 | > testing using ANOVA? Make a plot of the resulting polynomial fit to the data. 230 | 231 | ```{r} 232 | library(ISLR2) 233 | library(boot) 234 | library(ggplot2) 235 | set.seed(42) 236 | res <- sapply(1:6, function(i) { 237 | fit <- glm(wage ~ poly(age, i), data = Wage) 238 | cv.glm(Wage, fit, K = 5)$delta[1] 239 | }) 240 | which.min(res) 241 | plot(1:6, res, xlab = "Degree", ylab = "Test MSE", type = "l") 242 | abline(v = which.min(res), col = "red", lty = 2) 243 | 244 | fit <- glm(wage ~ poly(age, which.min(res)), data = Wage) 245 | plot(Wage$age, Wage$wage, pch = 19, cex = 0.4, col = alpha("steelblue", 0.4)) 246 | points(1:100, predict(fit, data.frame(age = 1:100)), type = "l", col = "red") 247 | 248 | summary(glm(wage ~ poly(age, 6), data = Wage)) 249 | 250 | fit1 <- lm(wage ~ poly(age, 1), data = Wage) 251 | fit2 <- lm(wage ~ poly(age, 2), data = Wage) 252 | fit3 <- lm(wage ~ poly(age, 3), data = Wage) 253 | fit4 <- lm(wage ~ poly(age, 4), data = Wage) 254 | fit5 <- lm(wage ~ poly(age, 5), data = Wage) 255 | anova(fit1, fit2, fit3, fit4, fit5) 256 | ``` 257 | 258 | The selected degree is 4. 
When testing with ANOVA, degrees 1, 2 and 3 are highly 259 | significant and 4 is marginal. 260 | 261 | > b. Fit a step function to predict `wage` using `age`, and perform 262 | > cross-validation to choose the optimal number of cuts. Make a plot of the fit 263 | > obtained. 264 | 265 | ```{r} 266 | set.seed(42) 267 | res <- sapply(2:10, function(i) { 268 | Wage$cats <- cut(Wage$age, i) 269 | fit <- glm(wage ~ cats, data = Wage) 270 | cv.glm(Wage, fit, K = 5)$delta[1] 271 | }) 272 | names(res) <- 2:10 273 | plot(2:10, res, xlab = "Cuts", ylab = "Test MSE", type = "l") 274 | which.min(res) 275 | abline(v = names(which.min(res)), col = "red", lty = 2) 276 | 277 | fit <- glm(wage ~ cut(age, 8), data = Wage) 278 | plot(Wage$age, Wage$wage, pch = 19, cex = 0.4, col = alpha("steelblue", 0.4)) 279 | points(18:80, predict(fit, data.frame(age = 18:80)), type = "l", col = "red") 280 | ``` 281 | 282 | ### Question 7 283 | 284 | > The `Wage` data set contains a number of other features not explored in this 285 | > chapter, such as marital status (`maritl`), job class (`jobclass`), and others. 286 | > Explore the relationships between some of these other predictors and `wage`, and 287 | > use non-linear fitting techniques in order to fit flexible models to the data. 288 | > Create plots of the results obtained, and write a summary of your findings. 289 | 290 | ```{r} 291 | plot(Wage$year, Wage$wage, pch = 19, cex = 0.4, col = alpha("steelblue", 0.4)) 292 | plot(Wage$age, Wage$wage, pch = 19, cex = 0.4, col = alpha("steelblue", 0.4)) 293 | plot(Wage$maritl, Wage$wage, pch = 19, cex = 0.4, col = alpha("steelblue", 0.4)) 294 | plot(Wage$jobclass, Wage$wage, pch = 19, cex = 0.4, col = alpha("steelblue", 0.4)) 295 | plot(Wage$education, Wage$wage, pch = 19, cex = 0.4, col = alpha("steelblue", 0.4)) 296 | ``` 297 | 298 | We have a mix of categorical and continuous variables and also want to 299 | incorporate non-linear aspects of the continuous variables. A GAM is a good 300 | choice to model this situation. 301 | 302 | ```{r} 303 | library(gam) 304 | fit0 <- gam(wage ~ s(year, 4) + s(age, 5) + education, data = Wage) 305 | fit2 <- gam(wage ~ s(year, 4) + s(age, 5) + education + maritl, data = Wage) 306 | fit1 <- gam(wage ~ s(year, 4) + s(age, 5) + education + jobclass, data = Wage) 307 | fit3 <- gam(wage ~ s(year, 4) + s(age, 5) + education + jobclass + maritl, data = Wage) 308 | anova(fit0, fit1, fit2, fit3) 309 | par(mfrow = c(2, 3)) 310 | plot(fit3, se = TRUE, col = "blue") 311 | ``` 312 | 313 | ### Question 8 314 | 315 | > Fit some of the non-linear models investigated in this chapter to the `Auto` 316 | > data set. Is there evidence for non-linear relationships in this data set? 317 | > Create some informative plots to justify your answer. 318 | 319 | Here we want to explore a range of non-linear models. First let's look at the 320 | relationships between the variables in the data. 321 | 322 | ```{r} 323 | pairs(Auto, cex = 0.4, pch = 19) 324 | ``` 325 | 326 | It does appear that there are some non-linear relationships (e.g. 327 | horsepower / weight and mpg). We will pick one variable (horsepower) to predict 328 | mpg and try the range of models discussed in this chapter. We will measure 329 | test MSE through cross-validation to compare the models. 
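Note that `boot::cv.glm()` only works with `glm` fits, so the smoothing spline fitted via `gam()` below is cross-validated by hand. As a minimal sketch of the general pattern the next chunk follows (random fold assignment, fit on the remaining folds, predict the held-out fold), assuming `gam` and the `Auto` data are loaded as above; `kfold_mse()` is a hypothetical helper, not a function used elsewhere in these solutions:

```r
kfold_mse <- function(data, fit_fun, K = 10) {
  # assign each row to one of K roughly equal-sized random folds
  folds <- sample(rep(1:K, length.out = nrow(data)))
  pred <- numeric(nrow(data))
  for (k in 1:K) {
    fit <- fit_fun(data[folds != k, ])                    # fit on K - 1 folds
    pred[folds == k] <- predict(fit, data[folds == k, ])  # predict held-out fold
  }
  mean((data$mpg - pred)^2)                               # test MSE for mpg
}

# e.g. kfold_mse(Auto, function(d) gam(mpg ~ s(horsepower, df = 4), data = d))
```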
330 | 331 | ```{r} 332 | library(tidyverse) 333 | set.seed(42) 334 | fit <- glm(mpg ~ horsepower, data = Auto) 335 | err <- cv.glm(Auto, fit, K = 10)$delta[1] 336 | 337 | fit1 <- glm(mpg ~ poly(horsepower, 4), data = Auto) 338 | err1 <- cv.glm(Auto, fit1, K = 10)$delta[1] 339 | 340 | q <- quantile(Auto$horsepower) 341 | Auto$hp_cats <- cut(Auto$horsepower, breaks = q, include.lowest = TRUE) 342 | fit2 <- glm(mpg ~ hp_cats, data = Auto) 343 | err2 <- cv.glm(Auto, fit2, K = 10)$delta[1] 344 | 345 | fit3 <- glm(mpg ~ bs(horsepower, df = 4), data = Auto) 346 | err3 <- cv.glm(Auto, fit3, K = 10)$delta[1] 347 | 348 | fit4 <- glm(mpg ~ ns(horsepower, 4), data = Auto) 349 | err4 <- cv.glm(Auto, fit4, K = 10)$delta[1] 350 | 351 | fit5 <- gam(mpg ~ s(horsepower, df = 4), data = Auto) 352 | # rough 10-fold cross-validation for gam. 353 | err5 <- mean(replicate(10, { 354 | b <- cut(sample(seq_along(Auto$horsepower)), 10) 355 | pred <- numeric() 356 | for (i in 1:10) { 357 | train <- b %in% levels(b)[-i] 358 | f <- gam(mpg ~ s(horsepower, df = 4), data = Auto[train, ]) 359 | pred[!train] <- predict(f, Auto[!train, ]) 360 | } 361 | mean((Auto$mpg - pred)^2) # MSE 362 | })) 363 | 364 | c(err, err1, err2, err3, err4, err5) 365 | anova(fit, fit1, fit2, fit3, fit4, fit5) 366 | 367 | x <- seq(min(Auto$horsepower), max(Auto$horsepower), length.out = 1000) 368 | pred <- data.frame( 369 | x = x, 370 | "Linear" = predict(fit, data.frame(horsepower = x)), 371 | "Polynomial" = predict(fit1, data.frame(horsepower = x)), 372 | "Step" = predict(fit2, data.frame(hp_cats = cut(x, breaks = q, include.lowest = TRUE))), 373 | "Regression spline" = predict(fit3, data.frame(horsepower = x)), 374 | "Natural spline" = predict(fit4, data.frame(horsepower = x)), 375 | "Smoothing spline" = predict(fit5, data.frame(horsepower = x)), 376 | check.names = FALSE 377 | ) 378 | pred <- pivot_longer(pred, -x) 379 | ggplot(Auto, aes(horsepower, mpg)) + 380 | geom_point(color = alpha("steelblue", 0.4)) + 381 | geom_line(data = pred, aes(x, value, color = name)) + 382 | theme_bw() 383 | ``` 384 | 385 | ### Question 9 386 | 387 | > This question uses the variables `dis` (the weighted mean of distances to five 388 | > Boston employment centers) and `nox` (nitrogen oxides concentration in parts per 389 | > 10 million) from the `Boston` data. We will treat `dis` as the predictor and 390 | > `nox` as the response. 391 | > 392 | > a. Use the `poly()` function to fit a cubic polynomial regression to predict 393 | > `nox` using `dis`. Report the regression output, and plot the resulting data 394 | > and polynomial fits. 395 | 396 | ```{r} 397 | fit <- glm(nox ~ poly(dis, 3), data = Boston) 398 | summary(fit) 399 | plot(nox ~ dis, data = Boston, col = alpha("steelblue", 0.4), pch = 19) 400 | x <- seq(min(Boston$dis), max(Boston$dis), length.out = 1000) 401 | lines(x, predict(fit, data.frame(dis = x)), col = "red", lty = 2) 402 | ``` 403 | 404 | > b. Plot the polynomial fits for a range of different polynomial degrees (say, 405 | > from 1 to 10), and report the associated residual sum of squares. 
406 | 407 | ```{r} 408 | fits <- lapply(1:10, function(i) glm(nox ~ poly(dis, i), data = Boston)) 409 | 410 | x <- seq(min(Boston$dis), max(Boston$dis), length.out = 1000) 411 | pred <- data.frame(lapply(fits, function(fit) predict(fit, data.frame(dis = x)))) 412 | colnames(pred) <- 1:10 413 | pred$x <- x 414 | pred <- pivot_longer(pred, !x) 415 | ggplot(Boston, aes(dis, nox)) + 416 | geom_point(color = alpha("steelblue", 0.4)) + 417 | geom_line(data = pred, aes(x, value, color = name)) + 418 | theme_bw() 419 | 420 | # residual sum of squares 421 | do.call(anova, fits)[, 2] 422 | ``` 423 | 424 | > c. Perform cross-validation or another approach to select the optimal degree 425 | > for the polynomial, and explain your results. 426 | 427 | ```{r} 428 | res <- sapply(1:10, function(i) { 429 | fit <- glm(nox ~ poly(dis, i), data = Boston) 430 | cv.glm(Boston, fit, K = 10)$delta[1] 431 | }) 432 | which.min(res) 433 | ``` 434 | 435 | The optimal degree is 3 based on cross-validation. Higher values tend to 436 | lead to overfitting. 437 | 438 | > d. Use the `bs()` function to fit a regression spline to predict `nox` using 439 | > `dis`. Report the output for the fit using four degrees of freedom. How did 440 | > you choose the knots? Plot the resulting fit. 441 | 442 | ```{r} 443 | fit <- glm(nox ~ bs(dis, df = 4), data = Boston) 444 | summary(fit) 445 | 446 | plot(nox ~ dis, data = Boston, col = alpha("steelblue", 0.4), pch = 19) 447 | x <- seq(min(Boston$dis), max(Boston$dis), length.out = 1000) 448 | lines(x, predict(fit, data.frame(dis = x)), col = "red", lty = 2) 449 | ``` 450 | 451 | Knots are chosen based on quantiles of the data. 452 | 453 | > e. Now fit a regression spline for a range of degrees of freedom, and plot the 454 | > resulting fits and report the resulting RSS. Describe the results obtained. 455 | 456 | ```{r} 457 | fits <- lapply(3:10, function(i) { 458 | glm(nox ~ bs(dis, df = i), data = Boston) 459 | }) 460 | 461 | x <- seq(min(Boston$dis), max(Boston$dis), length.out = 1000) 462 | pred <- data.frame(lapply(fits, function(fit) predict(fit, data.frame(dis = x)))) 463 | colnames(pred) <- 3:10 464 | pred$x <- x 465 | pred <- pivot_longer(pred, !x) 466 | ggplot(Boston, aes(dis, nox)) + 467 | geom_point(color = alpha("steelblue", 0.4)) + 468 | geom_line(data = pred, aes(x, value, color = name)) + 469 | theme_bw() 470 | ``` 471 | 472 | At high numbers of degrees of freedom the splines overfit the data (particularly 473 | at extreme ends of the distribution of the predictor variable). 474 | 475 | > f. Perform cross-validation or another approach in order to select the best 476 | > degrees of freedom for a regression spline on this data. Describe your 477 | > results. 478 | 479 | ```{r} 480 | set.seed(42) 481 | err <- sapply(3:10, function(i) { 482 | fit <- glm(nox ~ bs(dis, df = i), data = Boston) 483 | suppressWarnings(cv.glm(Boston, fit, K = 10)$delta[1]) 484 | }) 485 | which.min(err) 486 | ``` 487 | 488 | This approach would select 4 degrees of freedom for the spline. 489 | 490 | ### Question 10 491 | 492 | > This question relates to the `College` data set. 493 | > 494 | > a. Split the data into a training set and a test set. Using out-of-state tuition 495 | > as the response and the other variables as the predictors, perform forward 496 | > stepwise selection on the training set in order to identify a satisfactory 497 | > model that uses just a subset of the predictors. 
498 | 499 | ```{r} 500 | library(leaps) 501 | 502 | # helper function to predict from a regsubsets model 503 | predict.regsubsets <- function(object, newdata, id, ...) { 504 | form <- as.formula(object$call[[2]]) 505 | mat <- model.matrix(form, newdata) 506 | coefi <- coef(object, id = id) 507 | xvars <- names(coefi) 508 | mat[, xvars] %*% coefi 509 | } 510 | 511 | set.seed(42) 512 | train <- rep(TRUE, nrow(College)) 513 | train[sample(1:nrow(College), nrow(College) * 1 / 3)] <- FALSE 514 | fit <- regsubsets(Outstate ~ ., data = College[train, ], nvmax = 17, method = "forward") 515 | 516 | plot(summary(fit)$bic, type = "b") 517 | which.min(summary(fit)$bic) 518 | 519 | # or via cross-validation 520 | err <- sapply(1:17, function(i) { 521 | x <- coef(fit, id = i) 522 | mean((College$Outstate[!train] - predict(fit, College[!train, ], i))^2) 523 | }) 524 | which.min(err) 525 | min(summary(fit)$bic) 526 | ``` 527 | 528 | For the sake of simplicity we'll choose 6 529 | 530 | ```{r} 531 | coef(fit, id = 6) 532 | ``` 533 | 534 | > b. Fit a GAM on the training data, using out-of-state tuition as the response 535 | > and the features selected in the previous step as the predictors. Plot the 536 | > results, and explain your findings. 537 | 538 | ```{r} 539 | fit <- gam(Outstate ~ Private + s(Room.Board, 2) + s(PhD, 2) + s(perc.alumni, 2) + 540 | s(Expend, 2) + s(Grad.Rate, 2), data = College[train, ]) 541 | ``` 542 | 543 | > c. Evaluate the model obtained on the test set, and explain the results 544 | > obtained. 545 | 546 | ```{r} 547 | pred <- predict(fit, College[!train, ]) 548 | err_gam <- mean((College$Outstate[!train] - pred)^2) 549 | plot(err, ylim = c(min(err_gam, err), max(err)), type = "b") 550 | abline(h = err_gam, col = "red", lty = 2) 551 | 552 | # r-squared 553 | 1 - err_gam / mean((College$Outstate[!train] - mean(College$Outstate[!train]))^2) 554 | ``` 555 | 556 | > d. For which variables, if any, is there evidence of a non-linear relationship 557 | > with the response? 558 | 559 | ```{r} 560 | summary(fit) 561 | ``` 562 | 563 | Non-linear relationships are significant for Expend and PhD. 564 | 565 | ### Question 11 566 | 567 | > In Section 7.7, it was mentioned that GAMs are generally fit using a 568 | > _backfitting_ approach. The idea behind backfitting is actually quite simple. We 569 | > will now explore backfitting in the context of multiple linear regression. 570 | > 571 | > Suppose that we would like to perform multiple linear regression, but we do not 572 | > have software to do so. Instead, we only have software to perform simple linear 573 | > regression. Therefore, we take the following iterative approach: we repeatedly 574 | > hold all but one coefficient estimate fixed at its current value, and update 575 | > only that coefficient estimate using a simple linear regression. The process is 576 | > continued until _convergence_---that is, until the coefficient estimates stop 577 | > changing. 578 | > 579 | > We now try this out on a toy example. 580 | > 581 | > a. Generate a response $Y$ and two predictors $X_1$ and $X_2$, with $n = 100$. 582 | 583 | ```{r} 584 | set.seed(42) 585 | x1 <- rnorm(100) 586 | x2 <- rnorm(100) 587 | y <- 2 + 0.2 * x1 + 4 * x2 + rnorm(100) 588 | ``` 589 | 590 | > b. Initialize $\hat{\beta}_1$ to take on a value of your choice. It does not 591 | > matter 1 what value you choose. 592 | 593 | ```{r} 594 | beta1 <- 20 595 | ``` 596 | 597 | > c. 
Keeping $\hat{\beta}_1$ fixed, fit the model 598 | > $$Y - \hat{\beta}_1X_1 = \beta_0 + \beta_2X_2 + \epsilon.$$ 599 | > You can do this as follows: 600 | > 601 | > ```r 602 | > > a <- y - beta1 * x1 603 | > > beta2 <- lm(a ~ x2)$coef[2] 604 | > ``` 605 | 606 | ```{r} 607 | a <- y - beta1 * x1 608 | beta2 <- lm(a ~ x2)$coef[2] 609 | ``` 610 | 611 | > d. Keeping $\hat{\beta}_2$ fixed, fit the model 612 | > $$Y - \hat{\beta}_2X_2 = \beta_0 + \beta_1 X_1 + \epsilon.$$ 613 | > You can do this as follows: 614 | > 615 | > ```r 616 | > > a <- y - beta2 * x2 617 | > > beta1 <- lm(a ~ x1)$coef[2] 618 | > ``` 619 | 620 | ```{r} 621 | a <- y - beta2 * x2 622 | beta1 <- lm(a ~ x1)$coef[2] 623 | ``` 624 | 625 | > e. Write a for loop to repeat (c) and (d) 1,000 times. Report the estimates of 626 | > $\hat{\beta}_0, \hat{\beta}_1,$ and $\hat{\beta}_2$ at each iteration of the 627 | > for loop. Create a plot in which each of these values is displayed, with 628 | > $\hat{\beta}_0, \hat{\beta}_1,$ and $\hat{\beta}_2$ each shown in a different 629 | > color. 630 | 631 | ```{r} 632 | res <- matrix(NA, nrow = 1000, ncol = 3) 633 | colnames(res) <- c("beta0", "beta1", "beta2") 634 | beta1 <- 20 635 | for (i in 1:1000) { 636 | beta2 <- lm(y - beta1 * x1 ~ x2)$coef[2] 637 | beta1 <- lm(y - beta2 * x2 ~ x1)$coef[2] 638 | beta0 <- lm(y - beta2 * x2 ~ x1)$coef[1] 639 | res[i, ] <- c(beta0, beta1, beta2) 640 | } 641 | res <- as.data.frame(res) 642 | res$Iteration <- 1:1000 643 | res <- pivot_longer(res, !Iteration) 644 | p <- ggplot(res, aes(x = Iteration, y = value, color = name)) + 645 | geom_line() + 646 | geom_point() + 647 | scale_x_continuous(trans = "log10") 648 | p 649 | ``` 650 | 651 | > f. Compare your answer in (e) to the results of simply performing multiple 652 | > linear regression to predict $Y$ using $X_1$ and $X_2$. Use the `abline()` 653 | > function to overlay those multiple linear regression coefficient estimates on 654 | > the plot obtained in (e). 655 | 656 | ```{r} 657 | fit <- lm(y ~ x1 + x2) 658 | coef(fit) 659 | p + geom_hline(yintercept = coef(fit), lty = 2) 660 | ``` 661 | 662 | > g. On this data set, how many backfitting iterations were required in order to 663 | > obtain a "good" approximation to the multiple regression coefficient 664 | > estimates? 665 | 666 | In this case, good estimates were obtained after 3 iterations. 667 | 668 | ### Question 12 669 | 670 | > This problem is a continuation of the previous exercise. In a toy example with 671 | > $p = 100$, show that one can approximate the multiple linear regression 672 | > coefficient estimates by repeatedly performing simple linear regression in a 673 | > backfitting procedure. How many backfitting iterations are required in order to 674 | > obtain a "good" approximation to the multiple regression coefficient estimates? 675 | > Create a plot to justify your answer. 
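Before the code, it helps to state the single update that is iterated below (the same simple linear regression step as in Question 11): holding the other coefficient estimates from the previous sweep fixed, the $k$th coefficient is re-estimated by regressing the partial residual on $X_k$ alone (with an intercept, as `lm()` includes). A worked statement of the update that the `backfit()` function below implements:

$$
r_i^{(k)} = y_i - \sum_{j \neq k} x_{ij}\hat{\beta}_j, \qquad
\hat{\beta}_k \leftarrow
\frac{\sum_{i=1}^n (x_{ik} - \bar{x}_k)(r_i^{(k)} - \bar{r}^{(k)})}
     {\sum_{i=1}^n (x_{ik} - \bar{x}_k)^2},
$$

applied to each $k = 1, \dots, p$ in turn within every backfitting iteration.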
676 | 677 | ```{r} 678 | set.seed(42) 679 | 680 | p <- 100 681 | n <- 1000 682 | 683 | betas <- rnorm(p) * 5 684 | x <- matrix(rnorm(n * p), ncol = p, nrow = n) 685 | y <- (x %*% betas) + rnorm(n) # ignore beta0 for simplicity 686 | 687 | # multiple regression 688 | fit <- lm(y ~ x - 1) 689 | coef(fit) 690 | 691 | # backfitting 692 | backfit <- function(x, y, iter = 20) { 693 | beta <- matrix(0, ncol = ncol(x), nrow = iter + 1) 694 | for (i in 1:iter) { 695 | for (k in 1:ncol(x)) { 696 | residual <- y - (x[, -k] %*% beta[i, -k]) 697 | beta[i + 1, k] <- lm(residual ~ x[, k])$coef[2] 698 | } 699 | } 700 | beta[-1, ] 701 | } 702 | res <- backfit(x, y) 703 | error <- rowMeans(sweep(res, 2, betas)^2) 704 | plot(error, log = "x", type = "b") 705 | 706 | # backfitting error 707 | error[length(error)] 708 | 709 | # lm error 710 | mean((coef(fit) - betas)^2) 711 | ``` 712 | 713 | We need around 5 to 6 iterations to obtain a good estimate of the coefficients. 714 | -------------------------------------------------------------------------------- /data/Auto.data: -------------------------------------------------------------------------------- 1 | mpg cylinders displacement horsepower weight acceleration year origin name 2 | 18.0 8 307.0 130.0 3504. 12.0 70 1 "chevrolet chevelle malibu" 3 | 15.0 8 350.0 165.0 3693. 11.5 70 1 "buick skylark 320" 4 | 18.0 8 318.0 150.0 3436. 11.0 70 1 "plymouth satellite" 5 | 16.0 8 304.0 150.0 3433. 12.0 70 1 "amc rebel sst" 6 | 17.0 8 302.0 140.0 3449. 10.5 70 1 "ford torino" 7 | 15.0 8 429.0 198.0 4341. 10.0 70 1 "ford galaxie 500" 8 | 14.0 8 454.0 220.0 4354. 9.0 70 1 "chevrolet impala" 9 | 14.0 8 440.0 215.0 4312. 8.5 70 1 "plymouth fury iii" 10 | 14.0 8 455.0 225.0 4425. 10.0 70 1 "pontiac catalina" 11 | 15.0 8 390.0 190.0 3850. 8.5 70 1 "amc ambassador dpl" 12 | 15.0 8 383.0 170.0 3563. 10.0 70 1 "dodge challenger se" 13 | 14.0 8 340.0 160.0 3609. 8.0 70 1 "plymouth 'cuda 340" 14 | 15.0 8 400.0 150.0 3761. 9.5 70 1 "chevrolet monte carlo" 15 | 14.0 8 455.0 225.0 3086. 10.0 70 1 "buick estate wagon (sw)" 16 | 24.0 4 113.0 95.00 2372. 15.0 70 3 "toyota corona mark ii" 17 | 22.0 6 198.0 95.00 2833. 15.5 70 1 "plymouth duster" 18 | 18.0 6 199.0 97.00 2774. 15.5 70 1 "amc hornet" 19 | 21.0 6 200.0 85.00 2587. 16.0 70 1 "ford maverick" 20 | 27.0 4 97.00 88.00 2130. 14.5 70 3 "datsun pl510" 21 | 26.0 4 97.00 46.00 1835. 20.5 70 2 "volkswagen 1131 deluxe sedan" 22 | 25.0 4 110.0 87.00 2672. 17.5 70 2 "peugeot 504" 23 | 24.0 4 107.0 90.00 2430. 14.5 70 2 "audi 100 ls" 24 | 25.0 4 104.0 95.00 2375. 17.5 70 2 "saab 99e" 25 | 26.0 4 121.0 113.0 2234. 12.5 70 2 "bmw 2002" 26 | 21.0 6 199.0 90.00 2648. 15.0 70 1 "amc gremlin" 27 | 10.0 8 360.0 215.0 4615. 14.0 70 1 "ford f250" 28 | 10.0 8 307.0 200.0 4376. 15.0 70 1 "chevy c20" 29 | 11.0 8 318.0 210.0 4382. 13.5 70 1 "dodge d200" 30 | 9.0 8 304.0 193.0 4732. 18.5 70 1 "hi 1200d" 31 | 27.0 4 97.00 88.00 2130. 14.5 71 3 "datsun pl510" 32 | 28.0 4 140.0 90.00 2264. 15.5 71 1 "chevrolet vega 2300" 33 | 25.0 4 113.0 95.00 2228. 14.0 71 3 "toyota corona" 34 | 25.0 4 98.00 ? 2046. 19.0 71 1 "ford pinto" 35 | 19.0 6 232.0 100.0 2634. 13.0 71 1 "amc gremlin" 36 | 16.0 6 225.0 105.0 3439. 15.5 71 1 "plymouth satellite custom" 37 | 17.0 6 250.0 100.0 3329. 15.5 71 1 "chevrolet chevelle malibu" 38 | 19.0 6 250.0 88.00 3302. 15.5 71 1 "ford torino 500" 39 | 18.0 6 232.0 100.0 3288. 15.5 71 1 "amc matador" 40 | 14.0 8 350.0 165.0 4209. 12.0 71 1 "chevrolet impala" 41 | 14.0 8 400.0 175.0 4464. 
11.5 71 1 "pontiac catalina brougham" 42 | 14.0 8 351.0 153.0 4154. 13.5 71 1 "ford galaxie 500" 43 | 14.0 8 318.0 150.0 4096. 13.0 71 1 "plymouth fury iii" 44 | 12.0 8 383.0 180.0 4955. 11.5 71 1 "dodge monaco (sw)" 45 | 13.0 8 400.0 170.0 4746. 12.0 71 1 "ford country squire (sw)" 46 | 13.0 8 400.0 175.0 5140. 12.0 71 1 "pontiac safari (sw)" 47 | 18.0 6 258.0 110.0 2962. 13.5 71 1 "amc hornet sportabout (sw)" 48 | 22.0 4 140.0 72.00 2408. 19.0 71 1 "chevrolet vega (sw)" 49 | 19.0 6 250.0 100.0 3282. 15.0 71 1 "pontiac firebird" 50 | 18.0 6 250.0 88.00 3139. 14.5 71 1 "ford mustang" 51 | 23.0 4 122.0 86.00 2220. 14.0 71 1 "mercury capri 2000" 52 | 28.0 4 116.0 90.00 2123. 14.0 71 2 "opel 1900" 53 | 30.0 4 79.00 70.00 2074. 19.5 71 2 "peugeot 304" 54 | 30.0 4 88.00 76.00 2065. 14.5 71 2 "fiat 124b" 55 | 31.0 4 71.00 65.00 1773. 19.0 71 3 "toyota corolla 1200" 56 | 35.0 4 72.00 69.00 1613. 18.0 71 3 "datsun 1200" 57 | 27.0 4 97.00 60.00 1834. 19.0 71 2 "volkswagen model 111" 58 | 26.0 4 91.00 70.00 1955. 20.5 71 1 "plymouth cricket" 59 | 24.0 4 113.0 95.00 2278. 15.5 72 3 "toyota corona hardtop" 60 | 25.0 4 97.50 80.00 2126. 17.0 72 1 "dodge colt hardtop" 61 | 23.0 4 97.00 54.00 2254. 23.5 72 2 "volkswagen type 3" 62 | 20.0 4 140.0 90.00 2408. 19.5 72 1 "chevrolet vega" 63 | 21.0 4 122.0 86.00 2226. 16.5 72 1 "ford pinto runabout" 64 | 13.0 8 350.0 165.0 4274. 12.0 72 1 "chevrolet impala" 65 | 14.0 8 400.0 175.0 4385. 12.0 72 1 "pontiac catalina" 66 | 15.0 8 318.0 150.0 4135. 13.5 72 1 "plymouth fury iii" 67 | 14.0 8 351.0 153.0 4129. 13.0 72 1 "ford galaxie 500" 68 | 17.0 8 304.0 150.0 3672. 11.5 72 1 "amc ambassador sst" 69 | 11.0 8 429.0 208.0 4633. 11.0 72 1 "mercury marquis" 70 | 13.0 8 350.0 155.0 4502. 13.5 72 1 "buick lesabre custom" 71 | 12.0 8 350.0 160.0 4456. 13.5 72 1 "oldsmobile delta 88 royale" 72 | 13.0 8 400.0 190.0 4422. 12.5 72 1 "chrysler newport royal" 73 | 19.0 3 70.00 97.00 2330. 13.5 72 3 "mazda rx2 coupe" 74 | 15.0 8 304.0 150.0 3892. 12.5 72 1 "amc matador (sw)" 75 | 13.0 8 307.0 130.0 4098. 14.0 72 1 "chevrolet chevelle concours (sw)" 76 | 13.0 8 302.0 140.0 4294. 16.0 72 1 "ford gran torino (sw)" 77 | 14.0 8 318.0 150.0 4077. 14.0 72 1 "plymouth satellite custom (sw)" 78 | 18.0 4 121.0 112.0 2933. 14.5 72 2 "volvo 145e (sw)" 79 | 22.0 4 121.0 76.00 2511. 18.0 72 2 "volkswagen 411 (sw)" 80 | 21.0 4 120.0 87.00 2979. 19.5 72 2 "peugeot 504 (sw)" 81 | 26.0 4 96.00 69.00 2189. 18.0 72 2 "renault 12 (sw)" 82 | 22.0 4 122.0 86.00 2395. 16.0 72 1 "ford pinto (sw)" 83 | 28.0 4 97.00 92.00 2288. 17.0 72 3 "datsun 510 (sw)" 84 | 23.0 4 120.0 97.00 2506. 14.5 72 3 "toyouta corona mark ii (sw)" 85 | 28.0 4 98.00 80.00 2164. 15.0 72 1 "dodge colt (sw)" 86 | 27.0 4 97.00 88.00 2100. 16.5 72 3 "toyota corolla 1600 (sw)" 87 | 13.0 8 350.0 175.0 4100. 13.0 73 1 "buick century 350" 88 | 14.0 8 304.0 150.0 3672. 11.5 73 1 "amc matador" 89 | 13.0 8 350.0 145.0 3988. 13.0 73 1 "chevrolet malibu" 90 | 14.0 8 302.0 137.0 4042. 14.5 73 1 "ford gran torino" 91 | 15.0 8 318.0 150.0 3777. 12.5 73 1 "dodge coronet custom" 92 | 12.0 8 429.0 198.0 4952. 11.5 73 1 "mercury marquis brougham" 93 | 13.0 8 400.0 150.0 4464. 12.0 73 1 "chevrolet caprice classic" 94 | 13.0 8 351.0 158.0 4363. 13.0 73 1 "ford ltd" 95 | 14.0 8 318.0 150.0 4237. 14.5 73 1 "plymouth fury gran sedan" 96 | 13.0 8 440.0 215.0 4735. 11.0 73 1 "chrysler new yorker brougham" 97 | 12.0 8 455.0 225.0 4951. 11.0 73 1 "buick electra 225 custom" 98 | 13.0 8 360.0 175.0 3821. 
11.0 73 1 "amc ambassador brougham" 99 | 18.0 6 225.0 105.0 3121. 16.5 73 1 "plymouth valiant" 100 | 16.0 6 250.0 100.0 3278. 18.0 73 1 "chevrolet nova custom" 101 | 18.0 6 232.0 100.0 2945. 16.0 73 1 "amc hornet" 102 | 18.0 6 250.0 88.00 3021. 16.5 73 1 "ford maverick" 103 | 23.0 6 198.0 95.00 2904. 16.0 73 1 "plymouth duster" 104 | 26.0 4 97.00 46.00 1950. 21.0 73 2 "volkswagen super beetle" 105 | 11.0 8 400.0 150.0 4997. 14.0 73 1 "chevrolet impala" 106 | 12.0 8 400.0 167.0 4906. 12.5 73 1 "ford country" 107 | 13.0 8 360.0 170.0 4654. 13.0 73 1 "plymouth custom suburb" 108 | 12.0 8 350.0 180.0 4499. 12.5 73 1 "oldsmobile vista cruiser" 109 | 18.0 6 232.0 100.0 2789. 15.0 73 1 "amc gremlin" 110 | 20.0 4 97.00 88.00 2279. 19.0 73 3 "toyota carina" 111 | 21.0 4 140.0 72.00 2401. 19.5 73 1 "chevrolet vega" 112 | 22.0 4 108.0 94.00 2379. 16.5 73 3 "datsun 610" 113 | 18.0 3 70.00 90.00 2124. 13.5 73 3 "maxda rx3" 114 | 19.0 4 122.0 85.00 2310. 18.5 73 1 "ford pinto" 115 | 21.0 6 155.0 107.0 2472. 14.0 73 1 "mercury capri v6" 116 | 26.0 4 98.00 90.00 2265. 15.5 73 2 "fiat 124 sport coupe" 117 | 15.0 8 350.0 145.0 4082. 13.0 73 1 "chevrolet monte carlo s" 118 | 16.0 8 400.0 230.0 4278. 9.50 73 1 "pontiac grand prix" 119 | 29.0 4 68.00 49.00 1867. 19.5 73 2 "fiat 128" 120 | 24.0 4 116.0 75.00 2158. 15.5 73 2 "opel manta" 121 | 20.0 4 114.0 91.00 2582. 14.0 73 2 "audi 100ls" 122 | 19.0 4 121.0 112.0 2868. 15.5 73 2 "volvo 144ea" 123 | 15.0 8 318.0 150.0 3399. 11.0 73 1 "dodge dart custom" 124 | 24.0 4 121.0 110.0 2660. 14.0 73 2 "saab 99le" 125 | 20.0 6 156.0 122.0 2807. 13.5 73 3 "toyota mark ii" 126 | 11.0 8 350.0 180.0 3664. 11.0 73 1 "oldsmobile omega" 127 | 20.0 6 198.0 95.00 3102. 16.5 74 1 "plymouth duster" 128 | 21.0 6 200.0 ? 2875. 17.0 74 1 "ford maverick" 129 | 19.0 6 232.0 100.0 2901. 16.0 74 1 "amc hornet" 130 | 15.0 6 250.0 100.0 3336. 17.0 74 1 "chevrolet nova" 131 | 31.0 4 79.00 67.00 1950. 19.0 74 3 "datsun b210" 132 | 26.0 4 122.0 80.00 2451. 16.5 74 1 "ford pinto" 133 | 32.0 4 71.00 65.00 1836. 21.0 74 3 "toyota corolla 1200" 134 | 25.0 4 140.0 75.00 2542. 17.0 74 1 "chevrolet vega" 135 | 16.0 6 250.0 100.0 3781. 17.0 74 1 "chevrolet chevelle malibu classic" 136 | 16.0 6 258.0 110.0 3632. 18.0 74 1 "amc matador" 137 | 18.0 6 225.0 105.0 3613. 16.5 74 1 "plymouth satellite sebring" 138 | 16.0 8 302.0 140.0 4141. 14.0 74 1 "ford gran torino" 139 | 13.0 8 350.0 150.0 4699. 14.5 74 1 "buick century luxus (sw)" 140 | 14.0 8 318.0 150.0 4457. 13.5 74 1 "dodge coronet custom (sw)" 141 | 14.0 8 302.0 140.0 4638. 16.0 74 1 "ford gran torino (sw)" 142 | 14.0 8 304.0 150.0 4257. 15.5 74 1 "amc matador (sw)" 143 | 29.0 4 98.00 83.00 2219. 16.5 74 2 "audi fox" 144 | 26.0 4 79.00 67.00 1963. 15.5 74 2 "volkswagen dasher" 145 | 26.0 4 97.00 78.00 2300. 14.5 74 2 "opel manta" 146 | 31.0 4 76.00 52.00 1649. 16.5 74 3 "toyota corona" 147 | 32.0 4 83.00 61.00 2003. 19.0 74 3 "datsun 710" 148 | 28.0 4 90.00 75.00 2125. 14.5 74 1 "dodge colt" 149 | 24.0 4 90.00 75.00 2108. 15.5 74 2 "fiat 128" 150 | 26.0 4 116.0 75.00 2246. 14.0 74 2 "fiat 124 tc" 151 | 24.0 4 120.0 97.00 2489. 15.0 74 3 "honda civic" 152 | 26.0 4 108.0 93.00 2391. 15.5 74 3 "subaru" 153 | 31.0 4 79.00 67.00 2000. 16.0 74 2 "fiat x1.9" 154 | 19.0 6 225.0 95.00 3264. 16.0 75 1 "plymouth valiant custom" 155 | 18.0 6 250.0 105.0 3459. 16.0 75 1 "chevrolet nova" 156 | 15.0 6 250.0 72.00 3432. 21.0 75 1 "mercury monarch" 157 | 15.0 6 250.0 72.00 3158. 19.5 75 1 "ford maverick" 158 | 16.0 8 400.0 170.0 4668. 
11.5 75 1 "pontiac catalina" 159 | 15.0 8 350.0 145.0 4440. 14.0 75 1 "chevrolet bel air" 160 | 16.0 8 318.0 150.0 4498. 14.5 75 1 "plymouth grand fury" 161 | 14.0 8 351.0 148.0 4657. 13.5 75 1 "ford ltd" 162 | 17.0 6 231.0 110.0 3907. 21.0 75 1 "buick century" 163 | 16.0 6 250.0 105.0 3897. 18.5 75 1 "chevroelt chevelle malibu" 164 | 15.0 6 258.0 110.0 3730. 19.0 75 1 "amc matador" 165 | 18.0 6 225.0 95.00 3785. 19.0 75 1 "plymouth fury" 166 | 21.0 6 231.0 110.0 3039. 15.0 75 1 "buick skyhawk" 167 | 20.0 8 262.0 110.0 3221. 13.5 75 1 "chevrolet monza 2+2" 168 | 13.0 8 302.0 129.0 3169. 12.0 75 1 "ford mustang ii" 169 | 29.0 4 97.00 75.00 2171. 16.0 75 3 "toyota corolla" 170 | 23.0 4 140.0 83.00 2639. 17.0 75 1 "ford pinto" 171 | 20.0 6 232.0 100.0 2914. 16.0 75 1 "amc gremlin" 172 | 23.0 4 140.0 78.00 2592. 18.5 75 1 "pontiac astro" 173 | 24.0 4 134.0 96.00 2702. 13.5 75 3 "toyota corona" 174 | 25.0 4 90.00 71.00 2223. 16.5 75 2 "volkswagen dasher" 175 | 24.0 4 119.0 97.00 2545. 17.0 75 3 "datsun 710" 176 | 18.0 6 171.0 97.00 2984. 14.5 75 1 "ford pinto" 177 | 29.0 4 90.00 70.00 1937. 14.0 75 2 "volkswagen rabbit" 178 | 19.0 6 232.0 90.00 3211. 17.0 75 1 "amc pacer" 179 | 23.0 4 115.0 95.00 2694. 15.0 75 2 "audi 100ls" 180 | 23.0 4 120.0 88.00 2957. 17.0 75 2 "peugeot 504" 181 | 22.0 4 121.0 98.00 2945. 14.5 75 2 "volvo 244dl" 182 | 25.0 4 121.0 115.0 2671. 13.5 75 2 "saab 99le" 183 | 33.0 4 91.00 53.00 1795. 17.5 75 3 "honda civic cvcc" 184 | 28.0 4 107.0 86.00 2464. 15.5 76 2 "fiat 131" 185 | 25.0 4 116.0 81.00 2220. 16.9 76 2 "opel 1900" 186 | 25.0 4 140.0 92.00 2572. 14.9 76 1 "capri ii" 187 | 26.0 4 98.00 79.00 2255. 17.7 76 1 "dodge colt" 188 | 27.0 4 101.0 83.00 2202. 15.3 76 2 "renault 12tl" 189 | 17.5 8 305.0 140.0 4215. 13.0 76 1 "chevrolet chevelle malibu classic" 190 | 16.0 8 318.0 150.0 4190. 13.0 76 1 "dodge coronet brougham" 191 | 15.5 8 304.0 120.0 3962. 13.9 76 1 "amc matador" 192 | 14.5 8 351.0 152.0 4215. 12.8 76 1 "ford gran torino" 193 | 22.0 6 225.0 100.0 3233. 15.4 76 1 "plymouth valiant" 194 | 22.0 6 250.0 105.0 3353. 14.5 76 1 "chevrolet nova" 195 | 24.0 6 200.0 81.00 3012. 17.6 76 1 "ford maverick" 196 | 22.5 6 232.0 90.00 3085. 17.6 76 1 "amc hornet" 197 | 29.0 4 85.00 52.00 2035. 22.2 76 1 "chevrolet chevette" 198 | 24.5 4 98.00 60.00 2164. 22.1 76 1 "chevrolet woody" 199 | 29.0 4 90.00 70.00 1937. 14.2 76 2 "vw rabbit" 200 | 33.0 4 91.00 53.00 1795. 17.4 76 3 "honda civic" 201 | 20.0 6 225.0 100.0 3651. 17.7 76 1 "dodge aspen se" 202 | 18.0 6 250.0 78.00 3574. 21.0 76 1 "ford granada ghia" 203 | 18.5 6 250.0 110.0 3645. 16.2 76 1 "pontiac ventura sj" 204 | 17.5 6 258.0 95.00 3193. 17.8 76 1 "amc pacer d/l" 205 | 29.5 4 97.00 71.00 1825. 12.2 76 2 "volkswagen rabbit" 206 | 32.0 4 85.00 70.00 1990. 17.0 76 3 "datsun b-210" 207 | 28.0 4 97.00 75.00 2155. 16.4 76 3 "toyota corolla" 208 | 26.5 4 140.0 72.00 2565. 13.6 76 1 "ford pinto" 209 | 20.0 4 130.0 102.0 3150. 15.7 76 2 "volvo 245" 210 | 13.0 8 318.0 150.0 3940. 13.2 76 1 "plymouth volare premier v8" 211 | 19.0 4 120.0 88.00 3270. 21.9 76 2 "peugeot 504" 212 | 19.0 6 156.0 108.0 2930. 15.5 76 3 "toyota mark ii" 213 | 16.5 6 168.0 120.0 3820. 16.7 76 2 "mercedes-benz 280s" 214 | 16.5 8 350.0 180.0 4380. 12.1 76 1 "cadillac seville" 215 | 13.0 8 350.0 145.0 4055. 12.0 76 1 "chevy c10" 216 | 13.0 8 302.0 130.0 3870. 15.0 76 1 "ford f108" 217 | 13.0 8 318.0 150.0 3755. 14.0 76 1 "dodge d100" 218 | 31.5 4 98.00 68.00 2045. 18.5 77 3 "honda accord cvcc" 219 | 30.0 4 111.0 80.00 2155. 
14.8 77 1 "buick opel isuzu deluxe" 220 | 36.0 4 79.00 58.00 1825. 18.6 77 2 "renault 5 gtl" 221 | 25.5 4 122.0 96.00 2300. 15.5 77 1 "plymouth arrow gs" 222 | 33.5 4 85.00 70.00 1945. 16.8 77 3 "datsun f-10 hatchback" 223 | 17.5 8 305.0 145.0 3880. 12.5 77 1 "chevrolet caprice classic" 224 | 17.0 8 260.0 110.0 4060. 19.0 77 1 "oldsmobile cutlass supreme" 225 | 15.5 8 318.0 145.0 4140. 13.7 77 1 "dodge monaco brougham" 226 | 15.0 8 302.0 130.0 4295. 14.9 77 1 "mercury cougar brougham" 227 | 17.5 6 250.0 110.0 3520. 16.4 77 1 "chevrolet concours" 228 | 20.5 6 231.0 105.0 3425. 16.9 77 1 "buick skylark" 229 | 19.0 6 225.0 100.0 3630. 17.7 77 1 "plymouth volare custom" 230 | 18.5 6 250.0 98.00 3525. 19.0 77 1 "ford granada" 231 | 16.0 8 400.0 180.0 4220. 11.1 77 1 "pontiac grand prix lj" 232 | 15.5 8 350.0 170.0 4165. 11.4 77 1 "chevrolet monte carlo landau" 233 | 15.5 8 400.0 190.0 4325. 12.2 77 1 "chrysler cordoba" 234 | 16.0 8 351.0 149.0 4335. 14.5 77 1 "ford thunderbird" 235 | 29.0 4 97.00 78.00 1940. 14.5 77 2 "volkswagen rabbit custom" 236 | 24.5 4 151.0 88.00 2740. 16.0 77 1 "pontiac sunbird coupe" 237 | 26.0 4 97.00 75.00 2265. 18.2 77 3 "toyota corolla liftback" 238 | 25.5 4 140.0 89.00 2755. 15.8 77 1 "ford mustang ii 2+2" 239 | 30.5 4 98.00 63.00 2051. 17.0 77 1 "chevrolet chevette" 240 | 33.5 4 98.00 83.00 2075. 15.9 77 1 "dodge colt m/m" 241 | 30.0 4 97.00 67.00 1985. 16.4 77 3 "subaru dl" 242 | 30.5 4 97.00 78.00 2190. 14.1 77 2 "volkswagen dasher" 243 | 22.0 6 146.0 97.00 2815. 14.5 77 3 "datsun 810" 244 | 21.5 4 121.0 110.0 2600. 12.8 77 2 "bmw 320i" 245 | 21.5 3 80.00 110.0 2720. 13.5 77 3 "mazda rx-4" 246 | 43.1 4 90.00 48.00 1985. 21.5 78 2 "volkswagen rabbit custom diesel" 247 | 36.1 4 98.00 66.00 1800. 14.4 78 1 "ford fiesta" 248 | 32.8 4 78.00 52.00 1985. 19.4 78 3 "mazda glc deluxe" 249 | 39.4 4 85.00 70.00 2070. 18.6 78 3 "datsun b210 gx" 250 | 36.1 4 91.00 60.00 1800. 16.4 78 3 "honda civic cvcc" 251 | 19.9 8 260.0 110.0 3365. 15.5 78 1 "oldsmobile cutlass salon brougham" 252 | 19.4 8 318.0 140.0 3735. 13.2 78 1 "dodge diplomat" 253 | 20.2 8 302.0 139.0 3570. 12.8 78 1 "mercury monarch ghia" 254 | 19.2 6 231.0 105.0 3535. 19.2 78 1 "pontiac phoenix lj" 255 | 20.5 6 200.0 95.00 3155. 18.2 78 1 "chevrolet malibu" 256 | 20.2 6 200.0 85.00 2965. 15.8 78 1 "ford fairmont (auto)" 257 | 25.1 4 140.0 88.00 2720. 15.4 78 1 "ford fairmont (man)" 258 | 20.5 6 225.0 100.0 3430. 17.2 78 1 "plymouth volare" 259 | 19.4 6 232.0 90.00 3210. 17.2 78 1 "amc concord" 260 | 20.6 6 231.0 105.0 3380. 15.8 78 1 "buick century special" 261 | 20.8 6 200.0 85.00 3070. 16.7 78 1 "mercury zephyr" 262 | 18.6 6 225.0 110.0 3620. 18.7 78 1 "dodge aspen" 263 | 18.1 6 258.0 120.0 3410. 15.1 78 1 "amc concord d/l" 264 | 19.2 8 305.0 145.0 3425. 13.2 78 1 "chevrolet monte carlo landau" 265 | 17.7 6 231.0 165.0 3445. 13.4 78 1 "buick regal sport coupe (turbo)" 266 | 18.1 8 302.0 139.0 3205. 11.2 78 1 "ford futura" 267 | 17.5 8 318.0 140.0 4080. 13.7 78 1 "dodge magnum xe" 268 | 30.0 4 98.00 68.00 2155. 16.5 78 1 "chevrolet chevette" 269 | 27.5 4 134.0 95.00 2560. 14.2 78 3 "toyota corona" 270 | 27.2 4 119.0 97.00 2300. 14.7 78 3 "datsun 510" 271 | 30.9 4 105.0 75.00 2230. 14.5 78 1 "dodge omni" 272 | 21.1 4 134.0 95.00 2515. 14.8 78 3 "toyota celica gt liftback" 273 | 23.2 4 156.0 105.0 2745. 16.7 78 1 "plymouth sapporo" 274 | 23.8 4 151.0 85.00 2855. 17.6 78 1 "oldsmobile starfire sx" 275 | 23.9 4 119.0 97.00 2405. 14.9 78 3 "datsun 200-sx" 276 | 20.3 5 131.0 103.0 2830. 
15.9 78 2 "audi 5000" 277 | 17.0 6 163.0 125.0 3140. 13.6 78 2 "volvo 264gl" 278 | 21.6 4 121.0 115.0 2795. 15.7 78 2 "saab 99gle" 279 | 16.2 6 163.0 133.0 3410. 15.8 78 2 "peugeot 604sl" 280 | 31.5 4 89.00 71.00 1990. 14.9 78 2 "volkswagen scirocco" 281 | 29.5 4 98.00 68.00 2135. 16.6 78 3 "honda accord lx" 282 | 21.5 6 231.0 115.0 3245. 15.4 79 1 "pontiac lemans v6" 283 | 19.8 6 200.0 85.00 2990. 18.2 79 1 "mercury zephyr 6" 284 | 22.3 4 140.0 88.00 2890. 17.3 79 1 "ford fairmont 4" 285 | 20.2 6 232.0 90.00 3265. 18.2 79 1 "amc concord dl 6" 286 | 20.6 6 225.0 110.0 3360. 16.6 79 1 "dodge aspen 6" 287 | 17.0 8 305.0 130.0 3840. 15.4 79 1 "chevrolet caprice classic" 288 | 17.6 8 302.0 129.0 3725. 13.4 79 1 "ford ltd landau" 289 | 16.5 8 351.0 138.0 3955. 13.2 79 1 "mercury grand marquis" 290 | 18.2 8 318.0 135.0 3830. 15.2 79 1 "dodge st. regis" 291 | 16.9 8 350.0 155.0 4360. 14.9 79 1 "buick estate wagon (sw)" 292 | 15.5 8 351.0 142.0 4054. 14.3 79 1 "ford country squire (sw)" 293 | 19.2 8 267.0 125.0 3605. 15.0 79 1 "chevrolet malibu classic (sw)" 294 | 18.5 8 360.0 150.0 3940. 13.0 79 1 "chrysler lebaron town @ country (sw)" 295 | 31.9 4 89.00 71.00 1925. 14.0 79 2 "vw rabbit custom" 296 | 34.1 4 86.00 65.00 1975. 15.2 79 3 "maxda glc deluxe" 297 | 35.7 4 98.00 80.00 1915. 14.4 79 1 "dodge colt hatchback custom" 298 | 27.4 4 121.0 80.00 2670. 15.0 79 1 "amc spirit dl" 299 | 25.4 5 183.0 77.00 3530. 20.1 79 2 "mercedes benz 300d" 300 | 23.0 8 350.0 125.0 3900. 17.4 79 1 "cadillac eldorado" 301 | 27.2 4 141.0 71.00 3190. 24.8 79 2 "peugeot 504" 302 | 23.9 8 260.0 90.00 3420. 22.2 79 1 "oldsmobile cutlass salon brougham" 303 | 34.2 4 105.0 70.00 2200. 13.2 79 1 "plymouth horizon" 304 | 34.5 4 105.0 70.00 2150. 14.9 79 1 "plymouth horizon tc3" 305 | 31.8 4 85.00 65.00 2020. 19.2 79 3 "datsun 210" 306 | 37.3 4 91.00 69.00 2130. 14.7 79 2 "fiat strada custom" 307 | 28.4 4 151.0 90.00 2670. 16.0 79 1 "buick skylark limited" 308 | 28.8 6 173.0 115.0 2595. 11.3 79 1 "chevrolet citation" 309 | 26.8 6 173.0 115.0 2700. 12.9 79 1 "oldsmobile omega brougham" 310 | 33.5 4 151.0 90.00 2556. 13.2 79 1 "pontiac phoenix" 311 | 41.5 4 98.00 76.00 2144. 14.7 80 2 "vw rabbit" 312 | 38.1 4 89.00 60.00 1968. 18.8 80 3 "toyota corolla tercel" 313 | 32.1 4 98.00 70.00 2120. 15.5 80 1 "chevrolet chevette" 314 | 37.2 4 86.00 65.00 2019. 16.4 80 3 "datsun 310" 315 | 28.0 4 151.0 90.00 2678. 16.5 80 1 "chevrolet citation" 316 | 26.4 4 140.0 88.00 2870. 18.1 80 1 "ford fairmont" 317 | 24.3 4 151.0 90.00 3003. 20.1 80 1 "amc concord" 318 | 19.1 6 225.0 90.00 3381. 18.7 80 1 "dodge aspen" 319 | 34.3 4 97.00 78.00 2188. 15.8 80 2 "audi 4000" 320 | 29.8 4 134.0 90.00 2711. 15.5 80 3 "toyota corona liftback" 321 | 31.3 4 120.0 75.00 2542. 17.5 80 3 "mazda 626" 322 | 37.0 4 119.0 92.00 2434. 15.0 80 3 "datsun 510 hatchback" 323 | 32.2 4 108.0 75.00 2265. 15.2 80 3 "toyota corolla" 324 | 46.6 4 86.00 65.00 2110. 17.9 80 3 "mazda glc" 325 | 27.9 4 156.0 105.0 2800. 14.4 80 1 "dodge colt" 326 | 40.8 4 85.00 65.00 2110. 19.2 80 3 "datsun 210" 327 | 44.3 4 90.00 48.00 2085. 21.7 80 2 "vw rabbit c (diesel)" 328 | 43.4 4 90.00 48.00 2335. 23.7 80 2 "vw dasher (diesel)" 329 | 36.4 5 121.0 67.00 2950. 19.9 80 2 "audi 5000s (diesel)" 330 | 30.0 4 146.0 67.00 3250. 21.8 80 2 "mercedes-benz 240d" 331 | 44.6 4 91.00 67.00 1850. 13.8 80 3 "honda civic 1500 gl" 332 | 40.9 4 85.00 ? 1835. 17.3 80 2 "renault lecar deluxe" 333 | 33.8 4 97.00 67.00 2145. 18.0 80 3 "subaru dl" 334 | 29.8 4 89.00 62.00 1845. 
15.3 80 2 "vokswagen rabbit" 335 | 32.7 6 168.0 132.0 2910. 11.4 80 3 "datsun 280-zx" 336 | 23.7 3 70.00 100.0 2420. 12.5 80 3 "mazda rx-7 gs" 337 | 35.0 4 122.0 88.00 2500. 15.1 80 2 "triumph tr7 coupe" 338 | 23.6 4 140.0 ? 2905. 14.3 80 1 "ford mustang cobra" 339 | 32.4 4 107.0 72.00 2290. 17.0 80 3 "honda accord" 340 | 27.2 4 135.0 84.00 2490. 15.7 81 1 "plymouth reliant" 341 | 26.6 4 151.0 84.00 2635. 16.4 81 1 "buick skylark" 342 | 25.8 4 156.0 92.00 2620. 14.4 81 1 "dodge aries wagon (sw)" 343 | 23.5 6 173.0 110.0 2725. 12.6 81 1 "chevrolet citation" 344 | 30.0 4 135.0 84.00 2385. 12.9 81 1 "plymouth reliant" 345 | 39.1 4 79.00 58.00 1755. 16.9 81 3 "toyota starlet" 346 | 39.0 4 86.00 64.00 1875. 16.4 81 1 "plymouth champ" 347 | 35.1 4 81.00 60.00 1760. 16.1 81 3 "honda civic 1300" 348 | 32.3 4 97.00 67.00 2065. 17.8 81 3 "subaru" 349 | 37.0 4 85.00 65.00 1975. 19.4 81 3 "datsun 210 mpg" 350 | 37.7 4 89.00 62.00 2050. 17.3 81 3 "toyota tercel" 351 | 34.1 4 91.00 68.00 1985. 16.0 81 3 "mazda glc 4" 352 | 34.7 4 105.0 63.00 2215. 14.9 81 1 "plymouth horizon 4" 353 | 34.4 4 98.00 65.00 2045. 16.2 81 1 "ford escort 4w" 354 | 29.9 4 98.00 65.00 2380. 20.7 81 1 "ford escort 2h" 355 | 33.0 4 105.0 74.00 2190. 14.2 81 2 "volkswagen jetta" 356 | 34.5 4 100.0 ? 2320. 15.8 81 2 "renault 18i" 357 | 33.7 4 107.0 75.00 2210. 14.4 81 3 "honda prelude" 358 | 32.4 4 108.0 75.00 2350. 16.8 81 3 "toyota corolla" 359 | 32.9 4 119.0 100.0 2615. 14.8 81 3 "datsun 200sx" 360 | 31.6 4 120.0 74.00 2635. 18.3 81 3 "mazda 626" 361 | 28.1 4 141.0 80.00 3230. 20.4 81 2 "peugeot 505s turbo diesel" 362 | 30.7 6 145.0 76.00 3160. 19.6 81 2 "volvo diesel" 363 | 25.4 6 168.0 116.0 2900. 12.6 81 3 "toyota cressida" 364 | 24.2 6 146.0 120.0 2930. 13.8 81 3 "datsun 810 maxima" 365 | 22.4 6 231.0 110.0 3415. 15.8 81 1 "buick century" 366 | 26.6 8 350.0 105.0 3725. 19.0 81 1 "oldsmobile cutlass ls" 367 | 20.2 6 200.0 88.00 3060. 17.1 81 1 "ford granada gl" 368 | 17.6 6 225.0 85.00 3465. 16.6 81 1 "chrysler lebaron salon" 369 | 28.0 4 112.0 88.00 2605. 19.6 82 1 "chevrolet cavalier" 370 | 27.0 4 112.0 88.00 2640. 18.6 82 1 "chevrolet cavalier wagon" 371 | 34.0 4 112.0 88.00 2395. 18.0 82 1 "chevrolet cavalier 2-door" 372 | 31.0 4 112.0 85.00 2575. 16.2 82 1 "pontiac j2000 se hatchback" 373 | 29.0 4 135.0 84.00 2525. 16.0 82 1 "dodge aries se" 374 | 27.0 4 151.0 90.00 2735. 18.0 82 1 "pontiac phoenix" 375 | 24.0 4 140.0 92.00 2865. 16.4 82 1 "ford fairmont futura" 376 | 36.0 4 105.0 74.00 1980. 15.3 82 2 "volkswagen rabbit l" 377 | 37.0 4 91.00 68.00 2025. 18.2 82 3 "mazda glc custom l" 378 | 31.0 4 91.00 68.00 1970. 17.6 82 3 "mazda glc custom" 379 | 38.0 4 105.0 63.00 2125. 14.7 82 1 "plymouth horizon miser" 380 | 36.0 4 98.00 70.00 2125. 17.3 82 1 "mercury lynx l" 381 | 36.0 4 120.0 88.00 2160. 14.5 82 3 "nissan stanza xe" 382 | 36.0 4 107.0 75.00 2205. 14.5 82 3 "honda accord" 383 | 34.0 4 108.0 70.00 2245 16.9 82 3 "toyota corolla" 384 | 38.0 4 91.00 67.00 1965. 15.0 82 3 "honda civic" 385 | 32.0 4 91.00 67.00 1965. 15.7 82 3 "honda civic (auto)" 386 | 38.0 4 91.00 67.00 1995. 16.2 82 3 "datsun 310 gx" 387 | 25.0 6 181.0 110.0 2945. 16.4 82 1 "buick century limited" 388 | 38.0 6 262.0 85.00 3015. 17.0 82 1 "oldsmobile cutlass ciera (diesel)" 389 | 26.0 4 156.0 92.00 2585. 14.5 82 1 "chrysler lebaron medallion" 390 | 22.0 6 232.0 112.0 2835 14.7 82 1 "ford granada l" 391 | 32.0 4 144.0 96.00 2665. 13.9 82 3 "toyota celica gt" 392 | 36.0 4 135.0 84.00 2370. 13.0 82 1 "dodge charger 2.2" 393 | 27.0 4 151.0 90.00 2950. 
17.3 82 1 "chevrolet camaro" 394 | 27.0 4 140.0 86.00 2790. 15.6 82 1 "ford mustang gl" 395 | 44.0 4 97.00 52.00 2130. 24.6 82 2 "vw pickup" 396 | 32.0 4 135.0 84.00 2295. 11.6 82 1 "dodge rampage" 397 | 28.0 4 120.0 79.00 2625. 18.6 82 1 "ford ranger" 398 | 31.0 4 119.0 82.00 2720. 19.4 82 1 "chevy s-10" 399 | --------------------------------------------------------------------------------