├── .gitignore ├── ESL-Chap12Solutions.tex ├── ESL-Chap2Solutions.tex ├── ESL-Chap3Solutions.tex ├── ESL-Chap4Solutions.tex ├── ESL-Chap5Solutions.tex ├── ESL-Solutions.pdf ├── ESL-Solutions.tex ├── ElemStatLearnCode ├── README ├── TODO ├── cache │ └── .gitignore ├── config │ ├── .gitignore │ └── global.dcf ├── data │ ├── .gitignore │ ├── spam.wsv │ ├── zip.test.wsv │ └── zip.train.wsv ├── diagnostics │ ├── .gitignore │ └── 1.R ├── doc │ └── .gitignore ├── graphs │ ├── .gitignore │ ├── exercise_2_8.pdf │ ├── exercise_2_8.png │ ├── exercise_3_17.pdf │ ├── exercise_3_17.png │ ├── exercise_3_2.pdf │ └── exercise_3_2.png ├── lib │ └── helpers.R ├── logs │ └── .gitignore ├── munge │ ├── .gitignore │ ├── spam_munge.R │ └── zip_munge.R ├── profiling │ └── 1.R ├── reports │ └── .gitignore ├── src │ ├── .gitignore │ ├── exercise_2_8.R │ ├── exercise_3_17.R │ └── exercise_3_2.R └── tests │ └── 1.R └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | *.gz 2 | *.aux 3 | *.fdb_latexmk 4 | *.log 5 | *.Rdata 6 | *.RData 7 | *.Rhistory 8 | *.latexmain 9 | *.out 10 | *.toc 11 | *.swp 12 | -------------------------------------------------------------------------------- /ESL-Chap12Solutions.tex: -------------------------------------------------------------------------------- 1 | \chapter{Support Vector Machines and Flexible Discriminants} 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /ESL-Chap2Solutions.tex: -------------------------------------------------------------------------------- 1 | \chapter{Overview of Supervised Learning} 2 | \begin{exer} 3 | Suppose that each of $K$-classes has an associated target $t_k$, which is a vector of all zeroes, except a one in the $k$-th position. Show that classifying the largest element of $\hat y$ amounts to choosing the closest target, $\min_k \| t_k - \hat y \|$ if the elements of $\hat y$ sum to one. 4 | \end{exer} 5 | 6 | \begin{proof} 7 | The assertion is equivalent to showing that \[ 8 | \argmax_i \hat y_i = \argmin_k \| t_k - \hat y \| = \argmin_k \|\hat y - t_k \|^2 9 | \] by monotonicity of $x \mapsto x^2$ and symmetry of the norm. 10 | 11 | WLOG, let $\| \cdot \|$ be the Euclidean norm $\| \cdot \|_2$. Let $k = \argmax_i \hat y_i$, with $\hat y_k = \max y_i$. Note that then $\hat y_k \geq \frac{1}{K}$, since $\sum \hat y_i = 1$. 12 | 13 | Then for any $k' \neq k$ (note that $y_{k'} \leq y_k$), we have \begin{align*} 14 | \| y - t_{k'} \|_2^2 - \| y - t_k \|_2^2 &= y_k^2 + \left(y_{k'} - 1 \right)^2 - \left( y_{k'}^2 + \left(y_k - 1 \right)^2 \right) \\ 15 | &= 2 \left(y_k - y_{k'}\right) \\ 16 | &\geq 0 17 | \end{align*} since $y_{k'} \leq y_k$ by assumption. 18 | 19 | Thus we must have \[ 20 | \argmin_k \| t_k - \hat y \| = \argmax_i \hat y_i 21 | \] as required. 22 | \end{proof} 23 | 24 | \begin{exer} 25 | Show how to compute the Bayes decision boundary for the simulation example in Figure 2.5. 26 | \end{exer} 27 | 28 | \begin{proof} 29 | The Bayes classifier is \[ 30 | \hat G(X) = \argmax_{g \in \mathcal G} P(g | X = x ). 31 | \] In our two-class example $\textsc{orange}$ and $\textsc{blue}$, the decision boundary is the set where \[ 32 | P(g=\textsc{blue} | X = x) = P(g =\textsc{orange} | X = x) = \frac{1}{2}. 
33 | \]
34 |
35 | By Bayes' rule, this is equivalent to the set of points where \[
36 | P(X = x | g = \textsc{blue}) P(g = \textsc{blue}) = P(X = x | g = \textsc{orange}) P(g = \textsc{orange})
37 | \] As we know $P(g)$ and $P(X=x|g)$, the decision boundary can be calculated.
38 | \end{proof}
39 |
40 | \begin{exer}
41 | Derive equation (2.24).
42 | \end{exer}
43 |
44 | \begin{proof}
45 | TODO
46 | \end{proof}
47 |
48 | \begin{exer}
49 | Consider $N$ data points uniformly distributed in a $p$-dimensional unit ball centered at the origin. Show that the median distance from the origin to the closest data point is given by \[
50 | d(p, N) = \left(1-\left(\frac{1}{2}\right)^{1/N}\right)^{1/p}
51 | \]
52 | \end{exer}
53 | \begin{proof}
54 | Let $r$ be the median distance from the origin to the closest data point. Then \[
55 | P(\text{All $N$ points are further than $r$ from the origin}) = \frac{1}{2}
56 | \] by definition of the median.
57 |
58 | Since the points $x_i$ are independently distributed, this implies that \[
59 | \frac{1}{2} = \prod_{i=1}^N P(\|x_i\| > r)
60 | \] and as the points $x_i$ are uniformly distributed in the unit ball, we have that \begin{align*}
61 | P(\| x_i \| > r) &= 1 - P(\| x_i \| \leq r) \\
62 | &= 1 - \frac{Kr^p}{K} \\
63 | &= 1 - r^p
64 | \end{align*} where $K$ is the volume of the unit ball in $\mathbb{R}^p$, so that $Kr^p$ is the volume of the ball of radius $r$. Putting these together, we obtain that \[
65 | \frac{1}{2} = \left(1-r^p \right)^{N}
66 | \] and solving for $r$, we have \[
67 | r = \left(1-\left(\frac{1}{2}\right)^{1/N}\right)^{1/p}
68 | \]
69 | \end{proof}
70 |
71 | \begin{exer}
72 | Consider inputs drawn from a spherical multivariate-normal distribution $X \sim N(0,\mathbf{1}_p)$. The squared distance from any sample point to the origin has a $\chi^2_p$ distribution with mean $p$. Consider a prediction point $x_0$ drawn from this distribution, and let $a = \frac{x_0}{\| x_0\|}$ be an associated unit vector. Let $z_i = a^T x_i$ be the projection of each of the training points on this direction.
73 | Show that the $z_i$ are distributed $N(0,1)$ with expected squared distance from the origin 1, while the target point has expected squared distance $p$ from the origin.
74 | Hence for $p = 10$, a randomly drawn test point is about 3.1 standard deviations from the origin, while all the training points are on average one standard deviation along direction $a$. So most prediction points see themselves as lying on the edge of the training set.
75 | \end{exer}
76 |
77 | \begin{proof}
78 | Let $z_i = a^T x_i = \frac{x_0^T}{\| x_0 \|} x_i$. Then $z_i$ is a linear combination of $N(0,1)$ random variables, and hence normal, with expectation zero and variance \[
79 | \text{Var}(z_i) = \| a \|^2 \text{Var}(x_i) = \text{Var}(x_i) = 1
80 | \] as the vector $a$ has unit length and $x_i \sim N(0, 1)$.
81 |
82 | The target point $x_0$, on the other hand, has squared distance from the origin distributed as $\chi^2_p$, with mean $p$, as required.
83 | \end{proof}
84 |
85 | \begin{exer}
86 | \begin{enumerate}[(a)]
87 | \item Derive equation (2.27) in the notes.
88 | \item Derive equation (2.28) in the notes.
89 | \end{enumerate}
90 | \end{exer}
91 |
92 | \begin{proof}
93 | \begin{enumerate}[(a)]
94 | \item We have \begin{align*}
95 | EPE(x_0) &= E_{y_0 | x_0} E_{\mathcal{T}}(y_0 - \hat y_0)^2 \\
96 | &= \text{Var}(y_0|x_0) + E_{\mathcal T}[\hat y_0 - E_{\mathcal T} \hat y_0]^2 + [E_{\mathcal T} \hat y_0 - x_0^T \beta]^2 \\
97 | &= \text{Var}(y_0 | x_0) + \text{Var}_\mathcal{T}(\hat y_0) + \text{Bias}^2(\hat y_0).
98 | \end{align*} We now treat each term individually.
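To justify the decomposition in the second line, write $y_0 - \hat y_0 = (y_0 - x_0^T \beta) + (x_0^T \beta - E_{\mathcal T} \hat y_0) + (E_{\mathcal T} \hat y_0 - \hat y_0)$ and note that the cross terms vanish in expectation: $y_0 - x_0^T \beta = \epsilon_0$ has mean zero and is independent of the training set $\mathcal T$, $x_0^T \beta - E_{\mathcal T} \hat y_0$ is a constant, and $E_{\mathcal T}[\hat y_0 - E_{\mathcal T} \hat y_0] = 0$.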
Since the estimator is unbiased, we have that the third term is zero. Since $y_0 = x_0^T \beta + \epsilon$ with $\epsilon$ an $N(0,\sigma^2)$ random variable, we must have $\text{Var}(y_0|x_0) = \sigma^2$. 99 | 100 | The middle term is more difficult. First, note that we have \begin{align*} 101 | \text{Var}_{\mathcal T}(\hat y_0) &= \text{Var}_{\mathcal T}(x_0^T \hat \beta) \\ 102 | &= x_0^T \text{Var}_{\mathcal T}(\hat \beta) x_0 \\ 103 | &= E_{\mathcal T} x_0^T \sigma^2 (\mathbf{X}^T \mathbf{X})^{-1} x_0 104 | \end{align*} by conditioning (3.8) on $\mathcal T$. 105 | \item TODO 106 | \end{enumerate} 107 | \end{proof} 108 | 109 | \begin{exer} 110 | Consider a regression problem with inputs $x_i$ and outputs $y_i$, and a parameterized model $f_\theta(x)$ to be fit with least squares. Show that if there are observations with \emph{tied} or \emph{identical} values of $x$, then the fit can be obtained from a reduced weighted least squares problem. 111 | \end{exer} 112 | 113 | \begin{proof} 114 | This is relatively simple. WLOG, assume that $x_1 = x_2$, and all other observations are unique. Then our RSS function in the general least-squares estimation is \[ 115 | RSS(\theta) = \sum_{i=1}^N \left(y_i - f_\theta(x_i) \right)^2 = \sum_{i=2}^N w_i \left(y_i - f_\theta(x_i) \right)^2 116 | \] where \[ 117 | w_i = \begin{cases} 118 | 2 & i = 2 \\ 119 | 1 & \text{otherwise} 120 | \end{cases} 121 | \] 122 | Thus we have converted our least squares estimation into a reduced weighted least squares estimation. This minimal example can be easily generalised. 123 | \end{proof} 124 | 125 | \begin{exer} 126 | Suppose that we have a sample of $N$ pairs $x_i, y_i$, drawn IID from the distribution such that \begin{align*} 127 | x_i \sim h(x), \\ 128 | y_i = f(x_i) + \epsilon_i, \\ 129 | E(\epsilon_i) = 0, \\ 130 | \text{Var}(\epsilon_i) = \sigma^2. 131 | \end{align*} 132 | 133 | We construct an estimator for $f$ linear in the $y_i$, \[ 134 | \hat f(x_0) = \sum_{i=1}^N \ell_i(x_0; \mathcal X) y_i 135 | \] where the weights $\ell_i(x_0; X)$ do not depend on the $y_i$, but do depend on the training sequence $x_i$ denoted by $\mathcal X$. 136 | \begin{enumerate}[(a)] 137 | \item Show that the linear regression and $k$-nearest-neighbour regression are members of this class of estimators. Describe explicitly the weights $\ell_i(x_0; \mathcal X)$ in each of these cases. 138 | \item Decompose the conditional mean-squared error \[ 139 | E_{\mathcal Y | \mathcal X} \left( f(x_0) - \hat f(x_0) \right)^2 140 | \] into a conditional squared bias and a conditional variance component. $\mathcal Y$ represents the entire training sequence of $y_i$. 141 | \item Decompose the (unconditional) MSE \[ 142 | E_{\mathcal Y, \mathcal X}\left(f(x_0) - \hat f(x_0) \right)^2 143 | \] into a squared bias and a variance component. 144 | \item Establish a relationship between the square biases and variances in the above two cases. 145 | \end{enumerate} 146 | \end{exer} 147 | 148 | \begin{proof} 149 | \begin{enumerate}[(a)] 150 | \item Recall that the estimator for $f$ in the linear regression case is given by \[ 151 | \hat f(x_0) = x_0^T \beta 152 | \] where $\beta = (X^T X)^{-1} X^T y$. Then we can simply write \[ 153 | \hat f(x_0) = \sum_{i=1}^N \left( x_0^T (X^T X)^{-1} X^T \right)_i y_i. 154 | \] Hence \[ 155 | \ell_i(x_0; \mathcal X) = \left( x_0^T (X^T X)^{-1} X^T \right)_i. 
156 | \]
157 |
158 | In the $k$-nearest-neighbour representation, we have \[
159 | \hat f(x_0) = \sum_{i=1}^N \frac{y_i}{k} \mathbf{1}_{x_i \in N_k(x_0)}
160 | \] where $N_k(x_0)$ represents the set of $k$-nearest-neighbours of $x_0$. Clearly, \[
161 | \ell_i(x_0; \mathcal X) = \frac{1}{k} \mathbf{1}_{x_i \in N_k(x_0)}
162 | \]
163 | \item TODO
164 | \item TODO
165 | \item TODO
166 | \end{enumerate}
167 | \end{proof}
168 |
169 | \begin{exer}
170 | Compare the classification performance of linear regression and $k$-nearest neighbour classification on the \texttt{zipcode} data. In particular, consider only the \texttt{2}'s and \texttt{3}'s, and $k = 1, 3, 5, 7, 15$. Show both the training and test error for each choice.
171 | \end{exer}
172 |
173 | \begin{proof}
174 | Our implementation in R and graphs are attached.
175 |
176 | \clearpage
177 | \lstinputlisting{./ElemStatLearnCode/src/exercise_2_8.R}
178 |
179 | \clearpage
180 | \begin{figure}
181 | \centering\includegraphics[width=\textwidth]{./ElemStatLearnCode/graphs/exercise_2_8.pdf}
182 | \end{figure}
183 |
184 | \end{proof}
185 |
186 | \begin{exer}
187 | Consider a linear regression model with $p$ parameters, fitted by OLS to a set of training data $(x_i, y_i)_{1 \leq i \leq N}$ drawn at random from a population. Let $\hat \beta$ be the least squares estimate. Suppose we have some test data $(\tilde x_i, \tilde y_i)_{1 \leq i \leq M}$ drawn at random from the same population as the training data.
188 |
189 | If $R_{tr}(\beta) = \frac{1}{N} \sum_{i=1}^N \left(y_i - \beta^T x_i \right)^2$ and $R_{te}(\beta) = \frac{1}{M} \sum_{i=1}^M \left( \tilde y_i - \beta^T \tilde x_i \right)^2$, prove that \[
190 | E(R_{tr}(\hat \beta)) \leq E(R_{te}(\hat \beta))
191 | \] where the expectation is over all that is random in each expression.
192 | \end{exer}
193 | --------------------------------------------------------------------------------
/ESL-Chap3Solutions.tex:
--------------------------------------------------------------------------------
1 | \chapter{Linear Methods for Regression}
2 |
3 | \begin{exer}
4 | Show that the $F$ statistic for dropping a single coefficient from a model is equal to the square of the corresponding $z$-score.
5 | \end{exer}
6 |
7 | \begin{proof}
8 | Recall that the $F$ statistic is defined by the following expression \[
9 | \frac{(RSS_0 - RSS_1) / (p_1 - p_0)}{RSS_1 / (N - p_1 - 1)}
10 | \] where $RSS_0, RSS_1$ and $p_0 + 1, p_1 + 1$ refer to the residual sum of squares and the number of free parameters in the smaller and bigger models, respectively. Recall also that the $F$ statistic has an $F_{p_1 - p_0, N-p_1 - 1}$ distribution under the null hypothesis that the smaller model is correct.
11 |
12 | Next, recall that the $z$-score of a coefficient is \[
13 | z_j = \frac{\hat \beta_j}{\hat \sigma \sqrt{v_j}}
14 | \] and under the null hypothesis that $\beta_j$ is zero, $z_j$ is distributed according to a $t$-distribution with $N-p-1$ degrees of freedom.
15 |
16 | Hence, when we drop a single coefficient from a model, the $F$ statistic has an $F_{1, N-p-1}$ distribution, where $p + 1$ is the number of parameters in the original model. Similarly, the corresponding $z$-score is distributed according to a $t_{N-p-1}$ distribution, and thus the square of the $z$-score is distributed according to an $F_{1, N-p-1}$ distribution, as required.
17 |
18 | Thus the two tests are equivalent, and in fact the $F$ statistic for dropping a single coefficient is precisely the square of the corresponding $z$-score.
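To verify the identity directly, note that by standard partitioned regression results, dropping the $j$-th variable increases the residual sum of squares by $RSS_0 - RSS_1 = \hat \beta_j^2 / v_j$, where $v_j$ is the $j$-th diagonal element of $(X^T X)^{-1}$. Hence \[
F = \frac{RSS_0 - RSS_1}{RSS_1 / (N - p - 1)} = \frac{\hat \beta_j^2}{\hat \sigma^2 v_j} = z_j^2,
\] using $\hat \sigma^2 = RSS_1 / (N - p - 1)$.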
19 | \end{proof}
20 |
21 | \begin{exer}
22 | Given data on two variables $X$ and $Y$, consider fitting a cubic polynomial regression model $f(X) = \sum_{j=0}^{3} \beta_j X^j$. In addition to plotting the fitted curve, you would like a 95\% confidence band about the curve. Consider the following two approaches:
23 |
24 | \begin{enumerate}
25 | \item At each point $x_0$, form a 95\% confidence interval for the linear function $a^T \beta = \sum_{j=0}^{3}\beta_j x_0^j$.
26 | \item Form a 95\% confidence set for $\beta$ as in (3.15), which in turn generates confidence intervals for $f(x_0)$.
27 | \end{enumerate}
28 |
29 | How do these approaches differ? Which band is likely to be wider? Conduct a small simulation experiment to compare the two methods.
30 | \end{exer}
31 |
32 | \begin{proof}
33 | The key distinction is that the first approach is \emph{pointwise}: at each $x_0$ separately, it produces an interval that covers the true value $f(x_0)$ with probability 0.95. The second approach is \emph{global}: the confidence set for $\beta$ generates a band that covers the entire curve simultaneously with probability 0.95. Since the global band must guard against all values of $x_0$ at once, it is likely to be wider.
34 |
35 | In the pointwise approach, we seek to estimate the variance of an individual prediction - that is, to calculate $\text{Var}(\hat f(x_0) | x_0)$. Here, we have \begin{align*}
36 | \sigma_0^2 = \text{Var}(\hat f(x_0) | x_0) &= \text{Var}(x_0^T \hat \beta | x_0) \\
37 | &= x_0^T \text{Var}(\hat \beta) x_0 \\
38 | &= \hat \sigma^2 x_0^T (X^T X)^{-1} x_0
39 | \end{align*} where $\hat \sigma^2$ is the estimated variance of the innovations $\epsilon_i$.
40 |
41 | R code and graphs of the simulation are attached.
42 | \clearpage
43 | \lstinputlisting{./ElemStatLearnCode/src/exercise_3_2.R}
44 | \clearpage
45 | \begin{figure}
46 | \centering\includegraphics[width=\textwidth]{ElemStatLearnCode/graphs/exercise_3_2.pdf}
47 | \end{figure}
48 |
49 |
50 | TODO: Part 2.
51 | \end{proof}
52 |
53 | \begin{exer}[The Gauss-Markov Theorem]
54 | \begin{enumerate}
55 | \item Prove the Gauss-Markov theorem: the least squares estimate of a parameter $a^T\beta$ has a variance no bigger than that of any other linear unbiased estimate of $a^T\beta$.
56 |
57 | \item Secondly, show that if $\hat V$ is the variance-covariance matrix of the least squares estimate of $\beta$ and $\tilde V$ is the variance covariance matrix of any other linear unbiased estimate, then $\hat V \leq \tilde V$, where $B \leq A$ if $A - B$ is positive semidefinite.
58 | \end{enumerate}
59 | \end{exer}
60 |
61 | \begin{proof}
62 | Let $\hat \theta = a^T \hat \beta = a^T(X^TX)^{-1}X^T y$ be the least squares estimate of $a^T \beta$. Let $\tilde \theta = c^T y$ be any other unbiased linear estimator of $a^T \beta$. Now, let $d^T = c^T - a^T(X^{T}X)^{-1}X^T$. Then as $c^T y$ is unbiased, we must have \begin{align*}
63 | E(c^T y) &= E\left( \left( a^T(X^{T}X)^{-1}X^T + d^T \right) y \right) \\
64 | &= a^T\beta + d^T X\beta \\
65 | &= a^T\beta
66 | \end{align*} for all $\beta$, which implies that $d^T X = 0$.
67 |
68 | Now we calculate the variance of our estimator.
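Recall that under the model assumptions the components of $y$ are uncorrelated with common variance $\sigma^2$, so that $\text{Var}(y) = \sigma^2 \mathbf{I}$.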
We have \begin{align*}
69 | \text{Var}(c^T y) &= c^T \text{Var}(y) c \\
70 | &= \sigma^2 c^T c \\
71 | &= \sigma^2 \left( a^T(X^{T}X)^{-1}X^T + d^T \right) \left( a^T (X^T X)^{-1} X^T + d^T \right)^T \\
72 | &= \sigma^2 \left( a^T (X^T X)^{-1}X^T + d^T\right) \left(X (X^{T}X)^{-1}a + d\right) \\
73 | &= \sigma^2 \left( a^T (X^TX)^{-1}X^T X(X^T X)^{-1} a + a^T (X^T X)^{-1} \underbrace{X^T d}_{=0} + \underbrace{d^T X}_{=0}(X^T X)^{-1} a + d^T d \right) \\
74 | &= \sigma^2 \left(\underbrace{a^T (X^T X)^{-1} a}_{\text{Var}(\hat \theta)} + \underbrace{d^T d}_{\geq 0} \right)
75 | \end{align*}
76 |
77 | Thus $\text{Var}(\hat \theta) \leq \text{Var}(\tilde \theta)$ for all other unbiased linear estimators $\tilde \theta$.
78 |
79 | The proof of the matrix version is almost identical, except we replace our vector $d$ with a matrix $D$. It is then possible to show that $\tilde V = \hat V + D^T D$, and as $D^T D$ is a positive semidefinite matrix for any $D$, we have $\hat V \leq \tilde V$.
80 | \end{proof}
81 |
82 | \begin{exer}
83 | Show how the vector of least squares coefficients can be obtained from a single pass of the Gram-Schmidt procedure. Represent your solution in terms of the QR decomposition of $X$.
84 | \end{exer}
85 |
86 | \begin{proof}
87 | Recall that by a single pass of the Gram-Schmidt procedure, we can write our matrix $X$ as \[
88 | X = Z \Gamma,
89 | \] where $Z$ contains the orthogonal columns $z_j$, and $\Gamma$ is an upper triangular matrix with ones on the diagonal, and $\gamma_{ij} = \frac{\langle z_i, x_j \rangle}{\| z_i \|^2}$. This is a reflection of the fact that by definition, \[
90 | x_j = z_j + \sum_{k=0}^{j-1} \gamma_{kj} z_k.
91 | \]
92 |
93 | Now, by the $QR$ decomposition, we can write $X = QR$, where $Q$ is an orthogonal matrix and $R$ is an upper triangular matrix. We have $Q = Z D^{-1}$ and $R = D\Gamma$, where $D$ is a diagonal matrix with $D_{jj} = \| z_j \|$.
94 |
95 | Now, by definition of $\hat \beta$, we have \[
96 | (X^T X) \hat \beta = X^T y.
97 | \] Now, using the $QR$ decomposition, we have \begin{align*}
98 | (R^T Q^T) (QR) \hat \beta &= R^T Q^T y \\
99 | R \hat \beta &= Q^T y
100 | \end{align*}
101 | As $R$ is upper triangular, we can write \begin{align*}
102 | R_{pp} \hat \beta_p &= \langle q_p, y \rangle \\
103 | \| z_p \| \hat \beta_p &= \| z_p \|^{-1} \langle z_p, y \rangle \\
104 | \hat \beta_p &= \frac{\langle z_p, y \rangle}{\| z_p \|^2}
105 | \end{align*} in accordance with our previous results. Now, by back substitution, we can obtain the sequence of regression coefficients $\hat \beta_j$. As an example, to calculate $\hat \beta_{p-1}$, we have \begin{align*}
106 | R_{p-1, p-1} \hat \beta_{p-1} + R_{p-1,p} \hat \beta_p &= \langle q_{p-1}, y \rangle \\
107 | \| z_{p-1} \| \hat \beta_{p-1} + \| z_{p-1} \| \gamma_{p-1,p} \hat \beta_p &= \| z_{p-1} \|^{-1} \langle z_{p-1}, y \rangle
108 | \end{align*} and then solving for $\hat \beta_{p-1}$. This process can be repeated for all $\beta_j$, thus obtaining the regression coefficients in one pass of the Gram-Schmidt procedure.
109 | \end{proof}
110 |
111 | \begin{exer}
112 | Consider the ridge regression problem (3.41). Show that this problem is equivalent to the problem \[
113 | \hat \beta^c = \argmin_{\beta^c} \left( \sum_{i=1}^{N} \left( y_i - \beta^c_0 - \sum_{j=1}^{p}(x_{ij} - \bar x_j) \beta^c_j \right)^2 + \lambda \sum_{j=1}^{p}{\beta_j^c}^2 \right).
114 | \]
115 | \end{exer}
116 |
117 | \begin{proof}
118 | Consider rewriting our objective function above as \[
119 | L(\beta^c) = \sum_{i=1}^{N}\left(y_i - \left(\beta_0^c - \sum_{j=1}^{p} \bar x_j \beta_j^c \right) - \sum_{j=1}^p x_{ij} \beta_j^c \right)^2 + \lambda \sum_{j=1}^p {\beta_j^c}^2
120 | \]
121 | Note that under the substitutions \begin{align*}
122 | \beta_0 &\mapsto \beta_0^c - \sum_{j=1}^p \bar x_j \beta_j^c \\
123 | \beta_j &\mapsto \beta^c_j, j = 1, 2, \dots, p
124 | \end{align*} the two objective functions agree, and so $\hat \beta$ is a minimiser of the original ridge regression problem if and only if $\hat \beta^c$ is a minimiser of our modified ridge regression problem.
125 |
126 | The modified solution merely has a shifted intercept term, and all other coefficients remain the same.
127 | \end{proof}
128 |
129 | \begin{exer}
130 | Show that the ridge regression estimate is the mean (and mode) of the posterior distribution, under a Gaussian prior $\beta \sim N(0, \tau \mathbf{I})$, and Gaussian sampling model $y \sim N(X \beta, \sigma^2 \mathbf{I})$. Find the relationship between the regularization parameter $\lambda$ in the ridge formula, and the variances $\tau$ and $\sigma^2$.
131 | \end{exer}
132 |
133 | \begin{exer}
134 | Assume
135 | \[ y_i \sim N(\beta_0 + x_i^T \beta, \sigma^2), i = 1, 2, \dots, N \] and the parameters $\beta_j$ are each distributed as $N(0, \tau^2)$, independently of one another. Assuming $\sigma^2$ and $\tau^2$ are known, show that the minus log-posterior density of $\beta$ is proportional to
136 | \[ \sum_{i=1}^N \left( y_i - \beta_0 - \sum_{j=1}^p x_{ij} \beta_j \right)^2 + \lambda \sum_{j=1}^p \beta_j^2 \]
137 | where $\lambda = \frac{\sigma^2}{\tau^2}$.
138 | \end{exer}
139 |
140 | \begin{exer}
141 | Consider the $QR$ decomposition of the uncentred $N \times (p+1)$ matrix $X$, whose first column is all ones, and the SVD of the $N \times p$ centred matrix $\tilde X$. Show that $Q_2$ and $U$ share the same subspace, where $Q_2$ is the submatrix of $Q$ with the first column removed. Under what circumstances will they be the same, up to sign flips?
142 | \end{exer}
143 |
144 | \begin{proof}
145 | Denote the columns of $X$ by $x_0, \dots, x_{p}$, the columns of $Q$ by $z_0, \dots, z_p$, the columns of $\tilde X$ by $\tilde x_1, \dots, \tilde x_p$, and the columns of $U$ by $u_1, \dots, u_p$. Without loss of generality, we can assume that for all $i$, $\| x_i \| = 1$ and that $X$ is non-singular (this cleans up the proof somewhat).
146 |
147 |
148 | First, note that by the QR decomposition, we have that $\text{span}(x_0, \dots, x_j) = \text{span}(z_0, \dots, z_j)$ for any $0 \leq j \leq p$.
149 |
150 | By our assumption, we have that $\tilde x_i = x_i - \bar x_i \mathbf{1}$ for $i = 1, \dots, p$. Thus we can write $\tilde x_i = \sum_{j \leq i} \alpha_j z_j$; moreover, since $\tilde x_i$ is centred it is orthogonal to $\mathbf{1} \propto z_0$, so the $z_0$ coefficient vanishes and we can write $\tilde x_i$ in terms of $z_j$ for $j = 1, 2, \dots, i$. Thus $\text{span}(\tilde x_1, \dots, \tilde x_i) = \text{span}(z_1, \dots, z_i)$.
151 |
152 | Finally, we calculate $\text{span}(u_1, \dots, u_p)$. We have that $U$ has orthonormal columns which span the column space of $\tilde X$, and thus the span of $Q_2$ is equal to the span of $U$.
153 |
154 | TODO: When is $Q_2$ equal to $U$ up to sign flips?
155 | \end{proof}
156 | \begin{exer}[Forward stepwise regression]
157 | Suppose that we have the $QR$ decomposition for the $N \times q$ matrix $X_1$ in a multiple regression problem with response $y$, and we have an additional $p - q$ predictors in matrix $X_2$. Denote the current residual by $r$. We wish to establish which one of these additional variables will reduce the residual-sum-of-squares the most when included with those in $X_1$. Describe an efficient procedure for doing this.
158 | \end{exer}
159 |
160 | \begin{proof}
161 | Write the $QR$ decomposition of $X_1$ as $X_1 = QR$. For each candidate predictor $x_j$, $j = q+1, \dots, p$, orthogonalize it against the columns of $X_1$ by forming $z_j = x_j - Q Q^T x_j$, which is cheap given the stored $Q$. Adding $x_j$ to the model reduces the residual sum of squares by $\langle z_j / \| z_j \|, r \rangle^2$, so we select the vector $x_{j'}$ where
162 | \begin{align*}
163 | j' = \argmax_{j = q+1, \dots, p} \left| \left\langle \frac{z_j}{\| z_j \|}, r \right\rangle \right|
164 | \end{align*}
165 |
166 | This selects the vector that explains the maximal amount of the variance remaining in $r$ given $X_1$, and thus reduces the residual sum of squares the most. It is then possible to repeat this procedure by updating the $QR$ decomposition with the chosen column and removing it from $X_2$, as in Algorithm 3.1.
167 | \end{proof}
168 | \begin{exer}[Backward stepwise regression]
169 | Suppose that we have the multiple regression fit of $y$ on $X$, along with standard errors and $z$-scores. We wish to establish which variable, when dropped, will increase the RSS the least. How would you do this?
170 | \end{exer}
171 |
172 | \begin{proof}
173 | By Exercise 3.1, the F-statistic for dropping a single coefficient from a model is equal to the square of the corresponding $z$-score. Thus, we drop the variable that has the lowest squared $z$-score from the model.
174 | \end{proof}
175 |
176 | \begin{exer}
177 | Show that the solution to the multivariate linear regression problem (3.40) is given by (3.39). What happens if the covariance matrices $\Sigma_i$ are different for each observation?
178 | \end{exer}
179 |
180 | \begin{exer}
181 | Show that the ridge regression estimates can be obtained by OLS on an augmented data set. We augment the centred matrix $X$ with $p$ additional rows $\sqrt{\lambda} \mathbf{I}$, and augment $y$ with $p$ zeroes.
182 |
183 | \end{exer}
184 | \begin{proof}
185 | For our augmented matrix $X_1$, obtained by appending $\sqrt{\lambda} \mathbf{I}$ to the original observation matrix $X$ (with $y$ augmented by $p$ zeroes), we have that the $RSS$ expression for OLS regression becomes \begin{align*}
186 | RSS &= \sum_{i=1}^{N+p} \left(y_i - \sum_{j=1}^p x_{ij} \beta_j \right)^2 \\
187 | &= \sum_{i=1}^{N} \left( y_i - \sum_{j=1}^p x_{ij} \beta_j \right)^2 + \sum_{i = N + 1}^{N+p} \left(\sum_{j=1}^p x_{ij} \beta_j \right)^2 \\
188 | &= \sum_{i=1}^{N} \left( y_i - \sum_{j=1}^p x_{ij} \beta_j \right)^2 + \sum_{j=1}^p \lambda \beta_j^2
189 | \end{align*} which is the objective function for the ridge regression estimate.
190 | \end{proof}
191 |
192 | \begin{exer}
193 | Derive expression (3.62), and show that $\hat \beta^{\text{pcr}}(p) = \hat \beta^{\text{ls}}$.
194 | \end{exer}
195 |
196 | \begin{exer}
197 | Show that in the orthogonal case, PLS stops after $m=1$ steps, because subsequent $\hat \phi_{mj}$ in step 2 in Algorithm 3.3 are zero.
198 | \end{exer}
199 |
200 | \begin{exer}
201 | Verify expression (3.64), and hence show that the PLS directions are a compromise between the OLS coefficients and the principal component directions.
202 | \end{exer}
203 |
204 | \begin{exer}
205 | Derive the entries in Table 3.4, the explicit forms for estimators in the orthogonal case.
206 | \end{exer}
207 |
208 | \begin{exer}
209 | Repeat the analysis of Table 3.3 on the spam data discussed in Chapter 1.
210 | \end{exer} 211 | 212 | \begin{proof} 213 | R code implementing this method is attached. We require the \texttt{MASS}, \texttt{lars}, and \texttt{pls} packages. 214 | 215 | \clearpage 216 | \lstinputlisting{ElemStatLearnCode/src/exercise_3_17.R} 217 | 218 | \begin{figure} 219 | \begin{center} 220 | \includegraphics[width=\textwidth]{ElemStatLearnCode/graphs/exercise_3_17.pdf} 221 | \end{center} 222 | \end{figure} 223 | \end{proof} 224 | -------------------------------------------------------------------------------- /ESL-Chap4Solutions.tex: -------------------------------------------------------------------------------- 1 | \chapter{Linear Methods for Classification} 2 | 3 | \begin{exer} 4 | Show how to solve the generalised eigenvalue problem $\max a^T B a$ subject to $a^T W a = 1$ by transforming it to a standard eigenvalue problem. 5 | \end{exer} 6 | 7 | \begin{proof} 8 | By Lagrange multipliers, we have that the function $\mathcal{L}(a) = a^T B a - \lambda(a^T W a - 1)$ has a critical point where \[ 9 | \frac{d \mathcal{L}}{da} = 2 a^T B^T - 2 \lambda a^T W^T = 0, 10 | \] that is, where $Ba = \lambda Wa$. If we let $W = D^T D$ (Cholesky decomposition), $C = D^{-1} B D^{-1}$, and $y = Da$, we obtain that our solution becomes \[ 11 | Cy = \lambda y, 12 | \] and so we can convert our problem into an eigenvalue problem. It is clear that if $y_m$ and $\lambda_m$ are the maximal eigenvector and eigenvalue of the reduced problem, then $D^{-1} y_m$ and $\lambda_m$ are the maximal eigenvector and eigenvalue of the generalized problem, as required. 13 | \end{proof} 14 | 15 | 16 | \begin{exer} 17 | Suppose that we have features $x \in \mathbb{R}^p$, a two-class response, with class sizes $N_1, N_2$, and the target coded as $-N/N_1, N/N_2$. 18 | 19 | \begin{enumerate} 20 | \item Show that the LDA rule classifies to class 2 if 21 | \[ 22 | x^T \hat \Sigma^{-1} (\hat \mu_2 - \hat \mu_1) > \frac{1}{2} \hat \mu_2^T \hat \Sigma^{-1} \hat \mu_2 - \frac{1}{2} \hat \mu_1^T \hat \Sigma^{-1} \hat \mu_1 + \log \frac{N_1}{N} - \log \frac{N_2}{N} 23 | \] 24 | \item Consider minimization of the least squares criterion 25 | \[ 26 | \sum_{i=1}^N \left(y_i - \beta_0 - \beta^T x_i \right)^2 27 | \] 28 | Show that the solution $\hat \beta$ satisfies 29 | \[ 30 | \left( (N-2) \hat \Sigma + \frac{N_1 N_2}{N} \hat \Sigma_B \right) \beta = N (\hat \mu_2 - \hat \mu_1 ) 31 | \] where $\hat \Sigma_B = (\hat \mu_2 - \hat \mu_1) (\hat \mu_2 - \hat \mu_1)^T$. 32 | \item Hence show that $\hat \Sigma_B \beta$ is in the direction $(\hat \mu_2 - \hat \mu_1)$, and thus \[ 33 | \hat \beta \propto \hat \Sigma^{-1}(\hat \mu_2 - \hat \mu_1) 34 | \] and therefore the least squares regression coefficient is identical to the LDA coefficient, up to a scalar multiple. 35 | \item Show that this holds for any (distinct) coding of the two classes. 36 | \item Find the solution $\hat \beta_0$, and hence the predicted values $\hat \beta_0 + \hat \beta^T x$. Consider the following rule: classify to class 2 if $\hat y_i > 0$ and class 1 otherwise. Show that this is not the same as the LDA rule unless the classes have equal numbers of observations. 37 | \end{enumerate} 38 | \end{exer} 39 | 40 | \begin{proof} 41 | We use the notation of Chapter 4. 42 | \begin{enumerate} 43 | \item Since in the two class case, we classify to class 2 if $\delta_1(x) < \delta_2(x)$. 
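Recall from (4.10) that the linear discriminant functions are \[
\delta_k(x) = x^T \hat \Sigma^{-1} \hat \mu_k - \frac{1}{2} \hat \mu_k^T \hat \Sigma^{-1} \hat \mu_k + \log \hat \pi_k,
\] where here $\hat \pi_1 = N_1/N$ and $\hat \pi_2 = N_2/N$.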
Substituting this into our equation for the Linear discriminant functions, we have \begin{align*} 44 | \delta_1(x) &< \delta_2(x) \\ 45 | x^T \hat \Sigma^{-1} (\hat \mu_2 - \hat \mu_1) &> \frac{1}{2} \hat \mu_2^T \hat \Sigma^{-1} \hat \mu_2 - \frac{1}{2} \hat \mu_1^T \hat \Sigma^{-1} \hat \mu_1 + \log \frac{N_1}{N} - \log \frac{N_2}{N} 46 | \end{align*} 47 | as required. 48 | \item Let $U_i$ be the $n$ element vector with $j$-th element $1$ if the $j$-th observation is class $i$, and zero otherwise. Then we can write our target vector $Y$ as $t_1 U_1 + t_2 U_2$, where $t_i$ are our target labels, and we have $\mathbf{1} = U_1 + U_2$. Note that we can write our estimates $\hat \mu_1, \hat \mu_2$ as $X^T U_i = N_i \hat \mu_i$, and that $X^T Y = t_1 N_1 \hat \mu_1 + t_2 N_2 \hat \mu_2$. 49 | 50 | By the least squares criterion, we can write \[ 51 | RSS = \sum_{i=1}^{N} (y_i - \beta_0 - \beta^T X)^2 = (Y - \beta_0 \mathbf{1} - X \beta)^T (Y - \beta_0 \mathbf{1} - X\beta) 52 | \] Minimizing this with respect to $\beta$ and $\beta_0$, we obtain \begin{align*} 2 X^T X \beta - 2X^T Y + 2 \beta_0 X^T \mathbf{1} &= 0 \\ 2N \beta_0 - 2 \mathbf{1}^T (Y - X \beta) &= 0. \end{align*} 53 | 54 | These equations can be solved for $\beta_0$ and $\beta$ by substitution as \begin{align*} \hat \beta_0 &= \frac{1}{N} \mathbf{1}^T (Y - X\beta) \\ 55 | \left(X^T X - \frac{1}{N}X^T \mathbf{1} \mathbf{1}^T X\right) \hat \beta &= X^T Y - \frac{1}{N} X^T \mathbf{1} \mathbf{1}^T Y 56 | \end{align*} 57 | 58 | The RHS can be written as \begin{align*} 59 | X^T Y - \frac{1}{N} X^T \mathbf{1} \mathbf{1}^T Y &= t_1 N_1 \hat \mu_1 + t_2 N_2 \hat \mu_2 - \frac{1}{N} (N_1 \hat \mu_1 + N_2 \hat \mu_2)(t_1 N_1 + t_2 N_2) \\ 60 | &= \frac{N_1 N_2}{N} (t_1 - t_2) (\hat \mu_1 - \hat \mu_2) 61 | \end{align*} where we use our relations for $X^T U_i$ and the fact that $\mathbf{1} = U_1 + U_2$. 62 | 63 | Similarly, the bracketed term on the LHS of our expression for $\beta$ can be rewritten as \begin{align*} 64 | X^T X = (N-2) \hat \Sigma + N_1 \hat \mu_1 \hat \mu_1^T + N_2 \hat \mu_2 \hat \mu_2^T, 65 | \end{align*} and by substituting in the above and the definition of $\hat \Sigma_B$, we can write \begin{align*} 66 | X^T X - \frac{1}{N}X^T \mathbf{1} \mathbf{1}^T X &= (N-2) \hat \Sigma + \frac{N_1 N_2}{N} \hat \Sigma_B 67 | \end{align*} as required. 68 | 69 | Putting this together, we obtain our required result, \[ 70 | \left( (N-2) \hat \Sigma + \frac{N_1 N_2}{N} \hat \Sigma_B \right) \hat \beta = \frac{N_1 N_2}{N} (t_1 - t_2)(\hat \mu_1 - \hat \mu_2), 71 | \] 72 | and then substituting $t_1 = -N/N_1, t_2 = N/N_2$, we obtain our required result, \[ 73 | \left( (N-2) \hat \Sigma + \frac{N_1 N_2}{N} \hat \Sigma_B \right) \hat \beta = N(\hat \mu_2 - \hat \mu_1) 74 | \] 75 | \item All that is required is to show that $\hat \Sigma_B \beta$ is in the direction of $(\hat \mu_2 - \hat \mu_1)$. This is clear from the fact that \[ 76 | \hat \Sigma_B \hat \beta = (\hat \mu_2 - \hat \mu_1)(\hat \mu_2 - \hat \mu_1)^T \hat \beta = \lambda (\hat \mu_2 - \hat \mu_1) 77 | \] for some $\lambda \in \mathbb{R}$. Since $\hat \Sigma \hat \beta$ is a linear combination of terms in the direction of $(\hat \mu_2 - \hat \mu_1)$, we can write \[ 78 | \hat \beta \propto \hat \Sigma^{-1} (\hat \mu_2 - \hat \mu_1) 79 | \] as required. 80 | \item Since our $t_1, t_2$ were arbitrary and distinct, the result follows. 
81 | \item From above, we can write \begin{align*} 82 | \hat \beta_0 &= \frac{1}{N} \mathbf{1}^T (Y - X \hat \beta) \\ 83 | &= \frac{1}{N}(t_1 N_1 + t_2 N_2) - \frac{1}{N} \mathbf{1}{^T} X \hat \beta \\ 84 | &= -\frac{1}{N}(N_1 \hat \mu_1^T + N_2 \hat \mu_2^T) \hat \beta. 85 | \end{align*} 86 | 87 | We can then write our predicted value $\hat f(x) = \hat \beta_0 + \hat \beta^T x$ as \begin{align*} 88 | \hat f(x) &= \frac{1}{N}\left( N x^T - N_1 \hat \mu_1^T - N_2 \hat \mu_2^T \right) \hat \beta \\ 89 | &= \frac{1}{N}\left( N x^T - N_1 \hat \mu_1^T - N_2 \hat \mu_2^T \right) \lambda \hat \Sigma^{-1} (\hat \mu_2 - \hat \mu_1) 90 | \end{align*} for some $\lambda \in \mathbb{R}$, and so our classification rule is $\hat f(x) > 0$, or equivalently, \begin{align*} 91 | N x^T \lambda \hat \Sigma^{-1} (\hat \mu_2 - \hat \mu_1) > (N_1 \hat \mu_1^T + N_2 \hat \mu_2^T) \lambda \hat \Sigma^{-1}(\hat \mu_2 - \hat \mu_1) \\ 92 | x^T \hat \Sigma^{-1} (\hat \mu_2 - \hat \mu_1) > \frac{1}{N} \left( N_1 \hat \mu^T_1 + N_2 \hat \mu_2^T \right) \hat \Sigma^{-1} (\hat \mu_2 - \hat \mu_1) 93 | \end{align*} which is different to the LDA decision rule unless $N_1 = N_2$. 94 | \end{enumerate} 95 | \end{proof} 96 | 97 | \begin{exer} 98 | Suppose that we transform the original predictors $X$ to $\hat Y$ by taking the predicted values under linear regression. Show that LDA using $\hat Y$ is identical to using LDA in the original space. 99 | \end{exer} 100 | 101 | \begin{exer} 102 | Consier the multilogit model with $K$ classes. Let $\beta$ be the $(p+1)(K-1)$-vector consisting of all the coefficients. Define a suitable enlarged version of the input vector $x$ to accommodate this vectorized coefficient matrix. Derive the Newton-Raphson algorithm for maximizing the multinomial log-likelihood, and describe how you would implement the algorithm. 103 | \end{exer} 104 | 105 | \begin{exer} 106 | Consider a two-class regression problem with $x \in \mathbb{R}$. Characterise the MLE of the slope and intercept parameter if the sample $x_i$ for the two classes are separated by a point $x_0 \in \mathbb{R}$. Generalise this result to $x \in \mathbb{R}^p$ and more than two classes. 107 | \end{exer} 108 | 109 | \begin{exer} 110 | Suppose that we have $N$ points $x_i \in \mathbb{R}^p$ in general position, with class labels $y_i \in \{-1, 1 \}$. Prove that the perceptron learning algorithm converges to a separating hyperplane in a finite number of steps. 111 | \begin{enumerate} 112 | \item Denote a hyperplane by $f(x) = \beta^T x^\star = 0$. Let $z_i = \frac{x_i^\star}{\| x_i^\star \|}$. Show that separability implies the existence of a $\beta_{\text{sep}}$ such that $y_i \beta_{\text{sep}}^T z_i \geq 1$ for all $i$. 113 | \item Given a current $\beta_{\text{old}}$, the perceptron algorithm identifies a pint $z_i$ that is misclassified, and produces the update $\beta_{\text{new}} \leftarrow \beta_{\text{old}} + y_i z_i$. Show that 114 | \[ 115 | \| \beta_{\text{new}} - \beta_{\text{sep}} \|^2 \leq \| \beta_{\text{old}} - \beta_{\text{sep}} \|^2 - 1 116 | \] and hence that the algorithm converges to a separating hyperplane in no more than $\| \beta_{\text{start}} - \beta_{\text{sep}} \|^2$ steps. 117 | \end{enumerate} 118 | \end{exer} 119 | 120 | \begin{proof} 121 | Recall that the definition of separability implies the existence of a separating hyperplane - that is, a vector $\beta_\text{sep}$ such that $\text{sgn}\left( \beta^T_\text{sep} x^\star_i \right) = y_i$. 
122 | \begin{enumerate}
123 | \item By assumption, there exists $\epsilon > 0$ and $\beta_\text{sep}$ such that \[
124 | y_i \beta^T_\text{sep} z_i \geq \epsilon
125 | \] for all $i$ (for instance, take $\epsilon = \min_i y_i \beta_\text{sep}^T z_i$, which is positive since there are finitely many points, all correctly classified by $\beta_\text{sep}$). Then the rescaled vector $\frac{1}{\epsilon} \beta_\text{sep}$ defines the same separating hyperplane and, by linearity, satisfies the constraint \[
126 | y_i \left( \frac{1}{\epsilon} \beta_\text{sep} \right)^T z_i \geq 1.
127 | \]
128 | \item We have \begin{align*}
129 | \| \beta_\text{new} - \beta_\text{sep} \|^2 &= \| \beta_\text{new} \|^2 + \| \beta_\text{sep} \|^2 - 2 \beta_\text{sep}^T \beta_\text{new} \\
130 | &= \| \beta_\text{old} + y_i z_i \|^2 + \| \beta_\text{sep} \|^2 - 2 \beta_\text{sep}^T \left( \beta_\text{old} + y_i z_i \right) \\
131 | &= \| \beta_\text{old} \|^2 + \| y_i z_i \|^2 + 2 y_i \beta_\text{old}^T z_i + \| \beta_\text{sep} \|^2 - 2 \beta_\text{sep}^T \beta_\text{old} - 2 y_i \beta^T_\text{sep} z_i \\
132 | &\leq \| \beta_\text{old} \|^2 + \| \beta_\text{sep} \|^2 - 2 \beta_\text{sep}^T \beta_\text{old} + 1 - 2 \\
133 | &= \| \beta_\text{old} - \beta_\text{sep} \|^2 - 1,
134 | \end{align*} where we have used $\| y_i z_i \| = 1$, $y_i \beta_\text{old}^T z_i \leq 0$ (as $z_i$ is misclassified by $\beta_\text{old}$), and $y_i \beta_\text{sep}^T z_i \geq 1$. Let $\beta_k, k = 0, 1, 2, \dots$ be the sequence of iterates formed by this procedure, with $\beta_0 = \beta_\text{start}$. Let $k^\star = \left\lceil \| \beta_\text{start} - \beta_\text{sep} \|^2 \right\rceil$.
135 | If the algorithm had not terminated before making $k^\star$ updates, the above result would give $0 \leq \| \beta_{k^\star} - \beta_\text{sep} \|^2 \leq \| \beta_\text{start} - \beta_\text{sep} \|^2 - k^\star \leq 0$, so that $\beta_{k^\star} = \beta_\text{sep}$, which separates the data. In either case the algorithm reaches a separating hyperplane in no more than $k^\star$ steps.
136 | \end{enumerate}
137 | \end{proof}
138 | --------------------------------------------------------------------------------
/ESL-Chap5Solutions.tex:
--------------------------------------------------------------------------------
1 | \chapter{Basis Expansions and Regularization}
2 |
3 | \begin{exer}
4 | Show that the truncated power basis functions in (5.3) represent a basis for a cubic spline with the two knots as indicated.
5 | \end{exer}
6 |
7 | \begin{exer}
8 | Suppose that $B_{i, M}(x)$ is an order-$M$ $B$-spline.
9 | \begin{enumerate}
10 | \item Show by induction that $B_{i, M}(x) = 0$ for $x \notin [\tau_i, \tau_{i+M}]$. This shows, for example, that the support of cubic $B$-splines is at most $5$ knots.
11 | \item Show by induction that $B_{i, M}(x) > 0$ for $x \in (\tau_i, \tau_{i + M })$. The $B$-splines are positive in the interior of their support.
12 | \item Show by induction that $\sum_{i=1}^{K+M} B_{i, M}(x) = 1$ for all $x \in [\xi_0, \xi_{K+1}]$.
13 | \item Show that 14 | 15 | \end{enumerate} 16 | \end{exer} 17 | 18 | \begin{exer} 19 | 20 | \end{exer} 21 | -------------------------------------------------------------------------------- /ESL-Solutions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ajtulloch/Elements-of-Statistical-Learning/79b776ff743d9b4c32e3622f58999f137d076aec/ESL-Solutions.pdf -------------------------------------------------------------------------------- /ESL-Solutions.tex: -------------------------------------------------------------------------------- 1 | \documentclass[oneside]{amsbook} 2 | \usepackage{amsthm, amsmath, amssymb} 3 | \usepackage{geometry, setspace, graphicx, enumerate} 4 | \usepackage{listings} 5 | \onehalfspacing 6 | 7 | \usepackage[usenames, dvipsnames]{color} 8 | \definecolor{graphblue}{RGB}{52, 138, 189} 9 | \definecolor{graphpurple}{RGB}{122, 104, 166} 10 | 11 | \theoremstyle{plain}% default 12 | \newtheorem{thm}{Theorem}[chapter] 13 | \newtheorem{lem}[thm]{Lemma} 14 | \newtheorem{prop}[thm]{Proposition} 15 | \newtheorem{exer}[thm]{Exercise} 16 | 17 | \newtheorem*{cor}{Corollary} 18 | 19 | \theoremstyle{definition} 20 | \newtheorem{defn}[thm]{Definition} 21 | \newtheorem{conj}[thm]{Conjecture} 22 | \newtheorem{exmp}[thm]{Example} 23 | 24 | \theoremstyle{remark} 25 | \newtheorem*{rem}{Remark} 26 | \newtheorem*{note}{Note} 27 | \newtheorem{case}{Case} 28 | 29 | \DeclareMathOperator*{\argmax}{arg\,max} 30 | \DeclareMathOperator*{\argmin}{arg\,min} 31 | 32 | \lstset{ 33 | language=Python, 34 | numbers=left, 35 | numberstyle=\scriptsize, 36 | stepnumber=0, 37 | numbersep=5pt, 38 | showspaces=false, 39 | breaklines=true, 40 | basicstyle=\ttfamily\scriptsize, 41 | frame=single, 42 | commentstyle=\scriptsize, 43 | prebreak=\raisebox{0ex}[0ex][0ex]{\ensuremath{\hookleftarrow}}, 44 | showstringspaces=false, 45 | showtabs=false, 46 | identifierstyle=\ttfamily, 47 | stringstyle=\color{Gray}, 48 | commentstyle=\color{graphpurple}, 49 | keywordstyle=\color{graphblue}, 50 | commentstyle=\color{RoyalBlue}, 51 | keywordstyle=\color{RedViolet}, 52 | tabsize=2 53 | } 54 | 55 | \usepackage{hyperref} 56 | 57 | \title{Elements of Statistical Learning} 58 | \author{Andrew Tulloch} 59 | 60 | \begin{document} 61 | \maketitle 62 | 63 | \tableofcontents 64 | 65 | \setcounter{chapter}{1} 66 | 67 | \include{ESL-Chap2Solutions} 68 | \include{ESL-Chap3Solutions} 69 | \include{ESL-Chap4Solutions} 70 | \include{ESL-Chap5Solutions} 71 | 72 | \setcounter{chapter}{12} 73 | \include{ESL-Chap12Solutions} 74 | \end{document} 75 | -------------------------------------------------------------------------------- /ElemStatLearnCode/README: -------------------------------------------------------------------------------- 1 | Welcome to ProjectTemplate! 2 | 3 | This file introduces you to ProjectTemplate, but you should eventually replace 4 | the contents of this file with an introduction to your project. People who 5 | work with your data in the future will thank you for it, including your future 6 | self. 7 | 8 | ProjectTemplate is an R package that helps you organize your statistical 9 | analysis projects. Since you're reading this file, we'll assume that you've 10 | already called `create.project()` to set up this project and all of its 11 | contents. 12 | 13 | To load your new project, you'll first need to `setwd()` into the directory 14 | where this README file is located. 
Then you need to run the following two 15 | lines of R code: 16 | 17 | library('ProjectTemplate') 18 | load.project() 19 | 20 | After you enter the second line of code, you'll see a series of automated 21 | messages as ProjectTemplate goes about doing its work. This work involves: 22 | * Reading in the global configuration file contained in `config`. 23 | * Loading any R packages you listed in he configuration file. 24 | * Reading in any datasets stored in `data` or `cache`. 25 | * Preprocessing your data using the files in the `munge` directory. 26 | 27 | Once that's done, you can execute any code you'd like. For every analysis 28 | you create, we'd recommend putting a separate file in the `src` directory. 29 | If the files start with the two lines mentioned above: 30 | 31 | library('ProjectTemplate') 32 | load.project() 33 | 34 | You'll have access to all of your data, already fully preprocessed, and 35 | all of the libraries you want to use. 36 | 37 | For more details about ProjectTemplate, see http://projecttemplate.net 38 | -------------------------------------------------------------------------------- /ElemStatLearnCode/TODO: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ajtulloch/Elements-of-Statistical-Learning/79b776ff743d9b4c32e3622f58999f137d076aec/ElemStatLearnCode/TODO -------------------------------------------------------------------------------- /ElemStatLearnCode/cache/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ajtulloch/Elements-of-Statistical-Learning/79b776ff743d9b4c32e3622f58999f137d076aec/ElemStatLearnCode/cache/.gitignore -------------------------------------------------------------------------------- /ElemStatLearnCode/config/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ajtulloch/Elements-of-Statistical-Learning/79b776ff743d9b4c32e3622f58999f137d076aec/ElemStatLearnCode/config/.gitignore -------------------------------------------------------------------------------- /ElemStatLearnCode/config/global.dcf: -------------------------------------------------------------------------------- 1 | data_loading: on 2 | munging: on 3 | logging: off 4 | load_libraries: on 5 | libraries: reshape2, plyr, ggplot2, mclust, class 6 | as_factors: on 7 | data_tables: off 8 | -------------------------------------------------------------------------------- /ElemStatLearnCode/data/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ajtulloch/Elements-of-Statistical-Learning/79b776ff743d9b4c32e3622f58999f137d076aec/ElemStatLearnCode/data/.gitignore -------------------------------------------------------------------------------- /ElemStatLearnCode/diagnostics/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ajtulloch/Elements-of-Statistical-Learning/79b776ff743d9b4c32e3622f58999f137d076aec/ElemStatLearnCode/diagnostics/.gitignore -------------------------------------------------------------------------------- /ElemStatLearnCode/diagnostics/1.R: -------------------------------------------------------------------------------- 1 | # Example Data Diagnostics Script 2 | 3 | data <- rnorm(100000, 0, 1) 4 | expect_that(length(data) == 100000, is_true()) 5 | 
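6 | # Note: expect_that() and is_true() come from the testthat package, which is
7 | # not listed in config/global.dcf; we assume library(testthat) has been loaded
8 | # before this diagnostic script is run.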
-------------------------------------------------------------------------------- /ElemStatLearnCode/doc/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ajtulloch/Elements-of-Statistical-Learning/79b776ff743d9b4c32e3622f58999f137d076aec/ElemStatLearnCode/doc/.gitignore -------------------------------------------------------------------------------- /ElemStatLearnCode/graphs/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ajtulloch/Elements-of-Statistical-Learning/79b776ff743d9b4c32e3622f58999f137d076aec/ElemStatLearnCode/graphs/.gitignore -------------------------------------------------------------------------------- /ElemStatLearnCode/graphs/exercise_2_8.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ajtulloch/Elements-of-Statistical-Learning/79b776ff743d9b4c32e3622f58999f137d076aec/ElemStatLearnCode/graphs/exercise_2_8.pdf -------------------------------------------------------------------------------- /ElemStatLearnCode/graphs/exercise_2_8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ajtulloch/Elements-of-Statistical-Learning/79b776ff743d9b4c32e3622f58999f137d076aec/ElemStatLearnCode/graphs/exercise_2_8.png -------------------------------------------------------------------------------- /ElemStatLearnCode/graphs/exercise_3_17.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ajtulloch/Elements-of-Statistical-Learning/79b776ff743d9b4c32e3622f58999f137d076aec/ElemStatLearnCode/graphs/exercise_3_17.pdf -------------------------------------------------------------------------------- /ElemStatLearnCode/graphs/exercise_3_17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ajtulloch/Elements-of-Statistical-Learning/79b776ff743d9b4c32e3622f58999f137d076aec/ElemStatLearnCode/graphs/exercise_3_17.png -------------------------------------------------------------------------------- /ElemStatLearnCode/graphs/exercise_3_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ajtulloch/Elements-of-Statistical-Learning/79b776ff743d9b4c32e3622f58999f137d076aec/ElemStatLearnCode/graphs/exercise_3_2.pdf -------------------------------------------------------------------------------- /ElemStatLearnCode/graphs/exercise_3_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ajtulloch/Elements-of-Statistical-Learning/79b776ff743d9b4c32e3622f58999f137d076aec/ElemStatLearnCode/graphs/exercise_3_2.png -------------------------------------------------------------------------------- /ElemStatLearnCode/lib/helpers.R: -------------------------------------------------------------------------------- 1 | helper.function <- function() 2 | { 3 | return(1) 4 | } 5 | -------------------------------------------------------------------------------- /ElemStatLearnCode/logs/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ajtulloch/Elements-of-Statistical-Learning/79b776ff743d9b4c32e3622f58999f137d076aec/ElemStatLearnCode/logs/.gitignore 
-------------------------------------------------------------------------------- /ElemStatLearnCode/munge/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ajtulloch/Elements-of-Statistical-Learning/79b776ff743d9b4c32e3622f58999f137d076aec/ElemStatLearnCode/munge/.gitignore -------------------------------------------------------------------------------- /ElemStatLearnCode/munge/spam_munge.R: -------------------------------------------------------------------------------- 1 | # Spam preprocessing 2 | 3 | spam <- as.data.frame(read.table(file="data/spam.wsv", header=FALSE)) 4 | colnames(spam) <- c(paste("X.",1:57,sep=""), "Y") 5 | 6 | 7 | spam.normalized.y = scale(spam[58]) 8 | spam.normalized.x = scale(spam[1:57]) 9 | 10 | spam <- data.frame(spam.normalized.x, spam.normalized.y) 11 | # Convert to factors 12 | spam$Y = factor(spam$Y, labels=c("nospam", "spam")) 13 | spam.sub = c(1:nrow(spam))[spam$Y == "spam"] 14 | nospam.sub = c(1:nrow(spam))[spam$Y == "nospam"] 15 | 16 | # Convert back to numeric 17 | spam$Y = as.numeric(spam$Y) 18 | 19 | # Generate Training and test sets 20 | train.spam = sample(spam.sub,floor(length(spam.sub)*2/3)) 21 | train.nospam = sample(nospam.sub,floor(length(nospam.sub)*2/3)) 22 | spam.training_indices = c(train.spam,train.nospam) 23 | 24 | spam.train = spam[spam.training_indices,] 25 | spam.test = spam[-spam.training_indices,] 26 | -------------------------------------------------------------------------------- /ElemStatLearnCode/munge/zip_munge.R: -------------------------------------------------------------------------------- 1 | # Example preprocessing script. 2 | # Load training and test data 3 | zip.test <- as.data.frame(read.table(file="data/zip.test.wsv", header=FALSE)) 4 | zip.train <- as.data.frame(read.table(file="data/zip.train.wsv", header=FALSE)) 5 | 6 | colnames(zip.train) <- c("Y",paste("X.",1:256,sep="")) 7 | zip.train.filtered <- subset(zip.train, Y == 2 | Y == 3) 8 | 9 | colnames(zip.test) <- c("Y",paste("X.",1:256,sep="")) 10 | zip.test.filtered <- subset(zip.test, Y == 2 | Y == 3) 11 | 12 | cache("zip.test.filtered") 13 | cache("zip.train.filtered") -------------------------------------------------------------------------------- /ElemStatLearnCode/profiling/1.R: -------------------------------------------------------------------------------- 1 | # Example Profiling Script 2 | 3 | replicate(10, system.time(rnorm(100000, 0, 1))) 4 | -------------------------------------------------------------------------------- /ElemStatLearnCode/reports/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ajtulloch/Elements-of-Statistical-Learning/79b776ff743d9b4c32e3622f58999f137d076aec/ElemStatLearnCode/reports/.gitignore -------------------------------------------------------------------------------- /ElemStatLearnCode/src/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ajtulloch/Elements-of-Statistical-Learning/79b776ff743d9b4c32e3622f58999f137d076aec/ElemStatLearnCode/src/.gitignore -------------------------------------------------------------------------------- /ElemStatLearnCode/src/exercise_2_8.R: -------------------------------------------------------------------------------- 1 | library('ProjectTemplate') 2 | load.project() 3 | 4 | ## Linear Regression 5 | mod <- lm(Y ~ ., data = zip.train.filtered) 6 | 7 | # Round 
predictions 8 | category_f <- function(x) { if (x > 2.5) 3 else 2 } 9 | predictions.lm.test <- as.character(sapply(predict(mod, zip.test.filtered), 10 | category_f)) 11 | predictions.lm.train <- as.character(sapply(predict(mod, zip.train.filtered), 12 | category_f)) 13 | 14 | ## KNN 15 | knn.train <- zip.train.filtered[, 2:257] 16 | knn.test <- zip.test.filtered[, 2:257] 17 | 18 | knn.train.Y <- as.factor(zip.train.filtered$Y) 19 | knn.test.Y <- as.factor(zip.test.filtered$Y) 20 | 21 | # KNN Predictions 22 | predictions.knn.test <- sapply(1:15, function(k) { 23 | knn(train = knn.train, 24 | test = knn.test, 25 | cl = knn.train.Y, 26 | k = k) 27 | }) 28 | predictions.knn.train <- sapply(1:15, function(k) { 29 | knn(train = knn.train, 30 | test = knn.train, 31 | cl = knn.train.Y, 32 | k = k) 33 | }) 34 | 35 | # Compute error rates 36 | errors.xs <- 1:15 37 | 38 | errors.knn.test <- apply(predictions.knn.test, 2, function(prediction) { 39 | classError(prediction, as.factor(zip.test.filtered$Y))$errorRate 40 | }) 41 | errors.knn.train <- apply(predictions.knn.train, 2, function(prediction) { 42 | classError(prediction, as.factor(zip.train.filtered$Y))$errorRate 43 | }) 44 | errors.lm.test <- sapply(errors.xs, function(k) { 45 | classError(predictions.lm.test, as.factor(zip.test.filtered$Y))$errorRate 46 | }) 47 | errors.lm.train <- sapply(errors.xs, function(k) { 48 | classError(predictions.lm.train, as.factor(zip.train.filtered$Y))$errorRate 49 | }) 50 | 51 | errors <- data.frame("K"=errors.xs, 52 | "KNN.Train"=errors.knn.train, 53 | "KNN.Test"=errors.knn.test, 54 | "LR.Train"=errors.lm.train, 55 | "LR.Test"=errors.lm.test) 56 | 57 | # Create Plot 58 | plot.data <- melt(errors, id="K") 59 | ggplot(data=plot.data, 60 | aes(x=K, y=value, colour=variable)) + 61 | geom_line() + 62 | xlab("k") + 63 | ylab("Classification Error") + 64 | opts(title="Classification Errors for different methods on zipcode data") 65 | scale_colour_hue(name="Classification Method", 66 | labels=c("k-NN (Train)", 67 | "k-NN (Test)", 68 | "Linear Regression (Train)", 69 | "Linear Regression (Test)") 70 | ) 71 | ggsave(file.path('graphs', 'exercise_2_8.pdf')) 72 | ggsave(file.path('graphs', 'exercise_2_8.png')) -------------------------------------------------------------------------------- /ElemStatLearnCode/src/exercise_3_17.R: -------------------------------------------------------------------------------- 1 | library("ProjectTemplate") 2 | load.project() 3 | 4 | library("lars") # For least-angle and lasso 5 | library("MASS") # For ridge 6 | library("pls") # For PLS and PCR 7 | 8 | mod.ls <- lm(Y ~ . 
- 1, spam.train) 9 | mod.ridge <- lm.ridge(Y ~ ., spam.train) 10 | mod.pcr <- pcr(formula=Y ~ ., data=spam.train, validation="CV") 11 | mod.plsr <- plsr(formula=Y ~ ., data=spam.train, validation="CV") 12 | mod.lars <- lars(as.matrix(spam.train[,1:ncol(spam.train) - 1]), 13 | spam.train[,ncol(spam.train)], 14 | type="lar") 15 | mod.lasso <- lars(as.matrix(spam.train[,1:ncol(spam.train) - 1]), 16 | spam.train[,ncol(spam.train)], 17 | type="lasso") 18 | 19 | mods.coeffs <- data.frame(ls=mod.ls$coef, 20 | ridge=mod.ridge$coef, 21 | lasso=mod.lasso$beta[10,], 22 | pcr=mod.pcr$coef[,,10], 23 | plsr=mod.plsr$coef[,,10] 24 | ) 25 | 26 | mods.coeffs$xs = row.names(mods.coeffs) 27 | plot.data <- melt(mods.coeffs, id="xs") 28 | 29 | ggplot(data=plot.data, 30 | aes(x=factor(xs), 31 | y=value, 32 | group=variable, 33 | colour=variable)) + 34 | geom_line() + 35 | geom_point() + 36 | xlab("Factor") + 37 | ylab("Regression Coefficient") + 38 | opts(title = "Estimated coefficients for regression methods on spam data", 39 | axis.ticks = theme_blank(), 40 | axis.text.x = theme_blank()) + 41 | scale_colour_hue(name="Regression Method", 42 | labels=c("OLS", 43 | "Ridge", 44 | "Lasso", 45 | "PCR", 46 | "PLS") 47 | ) 48 | 49 | ggsave(file.path('graphs', 'exercise_3_17.pdf')) 50 | ggsave(file.path('graphs', 'exercise_3_17.png')) -------------------------------------------------------------------------------- /ElemStatLearnCode/src/exercise_3_2.R: -------------------------------------------------------------------------------- 1 | library('ProjectTemplate') 2 | load.project() 3 | 4 | # Raw data 5 | simulation.xs <- c(1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969) 6 | simulation.ys <- c(4835, 4970, 5085, 5160, 5310, 5260, 5235, 5255, 5235, 5210, 5175) 7 | simulation.df <- data.frame(pop = simulation.ys, year = simulation.xs) 8 | 9 | # Rescale years 10 | simulation.df$year = simulation.df$year - 1964 11 | 12 | # Generate regression, construct confidence intervals 13 | fit <- lm(pop ~ year + I(year^2) + I(year^3), data=simulation.df) 14 | xs = seq(-5, 5, 0.1) 15 | fit.confidence = predict(fit, data.frame(year=xs), interval="confidence", level=0.95) 16 | 17 | 18 | # Create data frame containing variables of interest 19 | df = as.data.frame(fit.confidence) 20 | df$year <- xs 21 | df = melt(df, id.vars="year") 22 | 23 | p <- ggplot() + geom_line(aes(x=year, y=value, colour=variable), df) + 24 | geom_point(aes(x=year, y=pop), simulation.df) 25 | p <- p + scale_x_continuous('Year') + scale_y_continuous('Population') 26 | p <- p + opts(title="Cubic regression with confidence intervals") 27 | p <- p + scale_color_brewer(name="Legend", 28 | labels=c("Fit", 29 | "95% Lower Bound", 30 | "95% Upper Bound"), 31 | palette="Set1") 32 | ggsave(file.path('graphs', 'exercise_3_2.pdf')) 33 | ggsave(file.path('graphs', 'exercise_3_2.png')) 34 | -------------------------------------------------------------------------------- /ElemStatLearnCode/tests/1.R: -------------------------------------------------------------------------------- 1 | # Example Unit Testing Script 2 | 3 | expect_that(1, equals(1)) 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Elements of Statistical Learning 2 | ================================ 3 | 4 | Contains LaTeX, SciPy and R code providing solutions to exercises in Elements of Statistical Learning (Hastie, Tibshirani & Friedman) 5 | 6 | 7 | [![Bitdeli 
Badge](https://d2weczhvl823v0.cloudfront.net/ajtulloch/elements-of-statistical-learning/trend.png)](https://bitdeli.com/free "Bitdeli Badge") 8 | 9 | --------------------------------------------------------------------------------