├── README.md ├── compare.R ├── glm.svd.r ├── glm.tex ├── implementations.R └── sparse_n_dense.R /README.md: -------------------------------------------------------------------------------- 1 | Generalized linear models, abridged. 2 | =============== 3 | 4 | See our old notes on this project at: http://bwlewis.github.io/GLM. 5 | 6 | Very revised and updated notes are in process as of October, 2018. 7 | 8 | 9 | Here are some slides from a talk at the Cleveland R User Group: https://bwlewis.github.io/GLM/October_2018_CLERUG.html 10 | 11 | Our current experimental reference SVD-based GLM implementation, based in turn 12 | on Dianne O'Leary's QR implementation from 1990, can be found here: 13 | https://github.com/bwlewis/GLM/blob/master/glm.svd.r That code is robust and 14 | fast and replicates R's column subset selection routine by falling back to R's 15 | default rank-revealing QR factorization in edge cases. But it's still a work in 16 | progress. 17 | -------------------------------------------------------------------------------- /compare.R: -------------------------------------------------------------------------------- 1 | source("implementations.R") 2 | data("Contraception",package="mlmRev") 3 | # Model estimated with R's glm function, returning model matrix and response 4 | # in $x and $y, respectively: 5 | R_GLM = glm(formula = use ~ age + I(age^2) + urban + livch, family = binomial, x=TRUE, data=Contraception) 6 | # Model estimated with our radically stripped-down minimalist implementation: 7 | mini = irls(R_GLM$x, R_GLM$y, family=binomial) 8 | print(data.frame(R_GLM=coef(R_GLM), minimalist=coef(mini))) 9 | 10 | iqrn = irls_qrnewton(R_GLM$x, R_GLM$y, family=binomial) 11 | print(data.frame(R_GLM=coef(R_GLM), qr_newton=coef(iqrn))) 12 | 13 | isvdn = irls_svdnewton(R_GLM$x, R_GLM$y, family=binomial) 14 | print(data.frame(R_GLM=coef(R_GLM), svd_newton=coef(isvdn))) 15 | 16 | # Let's test the sparse-aware IRLS example. But we need some data prep for it 17 | # first. The 1st three columns of our model matrix are dense: 18 | library("Matrix") 19 | A_dense = Matrix(R_GLM$x[,1:3], sparse=FALSE) 20 | # The next four columns are sparse: 21 | A_sparse = Matrix(R_GLM$x[,4:7], sparse=TRUE) 22 | isparse = irls_sparse(A_dense, A_sparse, R_GLM$y, family=binomial) 23 | print(data.frame(R_GLM=coef(R_GLM), irls_sparse=coef(isparse))) 24 | 25 | # Let's test the incremental implementation... 26 | # Write out the model matrix to a data file for the incremental example. 27 | write.table(R_GLM$x, file="data.csv", sep=",", col.names=FALSE, row.names=FALSE) 28 | inc = irls_incremental("data.csv", 500, R_GLM$y, family=binomial) 29 | print(data.frame(R_GLM=coef(R_GLM), incremental=coef(inc))) 30 | -------------------------------------------------------------------------------- /glm.svd.r: -------------------------------------------------------------------------------- 1 | #' Fitting Generalized Linear Models 2 | #' 3 | #' Similar to \code{glm.fit} but uses the SVD to detect ill-conditioned 4 | #' problems and conducts IRWLS in projected subspace for efficiency. 
5 | #'
6 | #' @param X an n by p real-valued dense model matrix
7 | #' @param y a response vector of length n
8 | #' @param family a family function or the result of a call to a family function
9 | #' @param maxit maximum number of IRWLS iterations (an integer)
10 | #' @param tol IRWLS positive convergence tolerance
11 | #' @param stol positive numerical condition tolerance
12 | #' @param singular.ok if FALSE a numerically-singular fit stops with an error
13 | #' @param weights vector of observation weights
14 | #' @param reg.method indicates the regularization approach: 'column projection' follows R's GLM approach; 'minimum norm' finds the LS solution of minimal norm.
15 | #' @param LAPACK if FALSE use R's column-ordered subset selection when \code{reg.method == 'column projection'}, otherwise use the default LAPACK pivots.
16 | #' @return A list with the model coefficients b, the number of IRWLS iterations, and column pivoting indices.
17 | #' Specifically, the returned list has entries:
18 | #' \describe{
19 | #' \item{b:}{ model coefficients}
20 | #' \item{iterations:}{ number of IRWLS iterations}
21 | #' \item{rank:}{ rank of model matrix}
22 | #' \item{pivot:}{ model matrix column pivot}
23 | #' }
24 | #' @seealso \code{\link{glm.fit}}
25 | glm.svd =
26 | function(X, y, family=binomial, maxit=25, tol=1e-10, stol=1e-10,
27 | singular.ok=TRUE, weights,
28 | reg.method=c("column projection", "minimum norm"),
29 | LAPACK=FALSE)
30 | {
31 | singular = ifelse(singular.ok, warning, stop)
32 | reg.method = match.arg(reg.method)
33 | if(is.list(X)) S = X
34 | else S = svd(X)
35 | V = S$v
36 | nvars = NCOL(S$u)
37 | idx = seq(nvars)
38 | i = (S$d / S$d[1]) > stol
39 | k = sum(i)
40 | pivot = seq(nvars)
41 | if (k < nvars)
42 | {
43 | singular("Singular system detected of rank: ", k, " using threshold: ", stol)
44 | if(reg.method == "column projection")
45 | {
46 | Q = qr(t(S$v[, 1:k]), LAPACK=LAPACK) # Golub SVD subset selection heuristic
47 | # when LAPACK=FALSE uses R's custom pivoting strategy
48 | pivot = Q$pivot
49 | idx = sort(head(pivot, k))
50 | omit = tail(Q$pivot, nvars - k)
51 | # XXX we can maybe instead use a slightly cheaper downdating svd scheme here:
52 | S_new = svd(X[, -omit])
53 | # double-check that this worked (it may not have), if not resort to
54 | # something else... XXX can this be improved?
55 | if((tail(S_new$d, 1) / S_new$d[1]) <= stol)
56 | {
57 | warning("Whoops! 
SVD subset selection failed, trying dqrdc2 on full matrix") 58 | if(is.list(X)) Q = qr(X$u %*% (X$d * t(X$v)), LAPACK=FALSE) 59 | else Q = qr(X, LAPACK=FALSE) 60 | pivot = Q$pivot 61 | idx = sort(head(pivot, k)) 62 | omit = tail(Q$pivot, nvars - k) 63 | S_new = svd(X[, -omit]) 64 | } 65 | S = S_new 66 | message("omittig column(s) ", paste(omit, collapse=",")) 67 | } 68 | } 69 | 70 | s = rep(0, ncol(S$u)) 71 | if(!is(family, "family")) family = family() 72 | nobs = NROW(y) # needed by the initialize expression below 73 | nvars = NCOL(S$u) # ditto 74 | if(missing(weights)) weights = rep(1, nobs) 75 | variance = family$variance 76 | linkinv = family$linkinv 77 | mu.eta = family$mu.eta 78 | etastart = NULL 79 | eval(family$initialize) 80 | eta = family$linkfun(mustart) 81 | dev.resids = family$dev.resids 82 | dev = sum(dev.resids(y, linkinv(eta), weights)) 83 | devold = 0 84 | for(j in 1:maxit) 85 | { 86 | g = linkinv(eta) 87 | varg = variance(g) 88 | if(any(is.na(varg))) stop("NAs in variance of the inverse link function") 89 | if(any(varg==0)) stop("Zero value in variance of the inverse link function") 90 | gprime = mu.eta(eta) 91 | if(any(is.na(gprime))) stop("NAs in the inverse link function derivative") 92 | z = eta + (y - g) / gprime 93 | W = weights * as.vector(gprime^2 / varg) 94 | # The following is as well-conditioned as W is 95 | C = chol(crossprod(S$u, W*S$u), pivot=TRUE) 96 | piv = attr(C, "pivot") 97 | s = forwardsolve(t(C), crossprod(S$u, W*z)[piv]) 98 | s = backsolve(C, s)[order(piv)] 99 | eta = drop(S$u %*% s) 100 | dev = sum(dev.resids(y, g, weights)) 101 | if(abs(dev - devold) / (0.1 + abs(dev)) < tol) break 102 | devold = dev 103 | # R essentially computes this (via dqrdc2.f) 104 | ## Q = qr(W * X) 105 | ## omit = tail(Q$pivot, ncol(X) - Q$rank) 106 | ## now omit columns and solve... 107 | ## fit = qr.solve(W * X, W * z) 108 | ## eta = drop(x %*% fit) 109 | ## g = linkinv(eta) 110 | } 111 | x = rep(NA, NCOL(X)) 112 | inv = 1/S$d 113 | if(reg.method == "minimum norm") inv[inv > 1/stol] = 1 114 | x[idx] = drop(S$v %*% (s*inv)) 115 | list(coefficients=x,iterations=j, rank=k, pivot=pivot) 116 | } 117 | -------------------------------------------------------------------------------- /glm.tex: -------------------------------------------------------------------------------- 1 | \documentclass[10pt]{article} %See documentation for other class options 2 | \usepackage{tabularx} 3 | \usepackage{color} 4 | %\usepackage{fixltx2e,fix-cm} 5 | %\usepackage{makeidx} 6 | %\usepackage{multicol} 7 | \usepackage{mathtools} 8 | \usepackage{listings} 9 | \usepackage{amsmath} 10 | \usepackage{amssymb} 11 | \usepackage{hyperref} 12 | \usepackage{cite} 13 | \usepackage{authblk} 14 | \usepackage{graphicx} 15 | \usepackage{float} 16 | \newtheorem{thm}{Theorem} 17 | \newtheorem{lemma}{Lemma} 18 | \newcommand{\R}{{\mathbb R}} 19 | 20 | 21 | \begin{document} 22 | 23 | \floatstyle{ruled} 24 | \newfloat{program}{thp}{lop} 25 | \floatname{program}{Program} 26 | \newfloat{algorithm}{thp}{lop} 27 | \floatname{algorithm}{Algorithm} 28 | 29 | \setlength{\parindent}{0pt} 30 | \setlength{\parskip}{0.2em} 31 | 32 | \definecolor{verbgray}{gray}{0.9} 33 | \definecolor{verbgray2}{gray}{0.975} 34 | \lstset{backgroundcolor=\color{verbgray}, 35 | frame=single, 36 | framerule=0pt, 37 | basicstyle=\ttfamily, 38 | keepspaces=true, 39 | columns=fullflexible} 40 | 41 | 42 | \section*{Generalized linear models, abridged} 43 | 44 | \begin{quote} 45 | {\it This is the 2nd major revision of this document. 
This version derives
46 | algorithmic details of the iteratively re-weighted least squares method
47 | (IRWLS), and emphasizes advantages of using the singular value decomposition
48 | (SVD) in its implementation. We added background reference material on the SVD.
49 | Following a suggestion by James Blevins, the notation was revised to bring it closer
50 | to other references, especially~\cite{MN}.
51 | --Bryan}
52 | \end{quote}
53 |
54 | \begin{quote}
55 | Generalized linear models (GLMs) are indispensable tools in the data science
56 | toolbox. They are applicable to many real-world problems involving continuous,
57 | yes/no, count and survival data (and more). The models themselves are intuitive
58 | and can be used for inference and prediction. A few very high quality free and
59 | open source software implementations are available (in particular within
60 | R~\cite{R}, and also ExaStat/Revolution Analytics), as are a few first-rate
61 | commercial ones like SAS and Stata.
62 |
63 | This note grew out of our own desire to better understand the numerics of
64 | generalized linear models. We highlight aspects of GLM implementations that we
65 | find particularly interesting. We present some reference implementations
66 | stripped down to illuminate core ideas, often with just a few lines of code.
67 | Our implementations are in R but are close to pseudocode and easily ported to
68 | other languages. --Bryan and Mike
69 | \end{quote}
70 |
71 |
72 | \section*{Linear algebra background material}
73 |
74 | Skip ahead to the {\bf Linear models} section if you already know all about the
75 | singular value decomposition... The following brief introduction closely
76 | follows the important reference book by Golub and Van Loan, Matrix Computations
77 | \cite{gvl}. You should read that book.
78 |
79 | \subsection*{Orthonormal vectors and rotations}
80 |
81 | Let $V$ be a real-valued $n\times p$ matrix, which we write as
82 | $V\in\R^{n\times p}$. It's sometimes useful to enumerate the
83 | column vectors of a matrix, which we write for instance as $V=[v_1, v_2,
84 | \ldots, v_p]$. The column vectors of $V$ are {\it orthonormal} if and only if
85 | $V^T V = I$, the $p\times p$ identity matrix. For instance, the identity
86 | matrix itself is composed of orthonormal column vectors. When the matrix $V$
87 | is square, that is when $n=p$, we simply say that $V$ is an orthonormal
88 | matrix.
89 | The columns of orthonormal matrices form coordinate bases of $\R^n$
90 | whose directions are orthogonal--in other words, a rotation of
91 | the usual unit basis coordinate system. For example, consider a $2\times 2$
92 | orthonormal matrix $V$:
93 | \[
94 | V = \left(\begin{array}{cc}
95 | 1/{\sqrt{2}} & -1/{\sqrt{2}} \\
96 | 1/{\sqrt{2}} & 1/{\sqrt{2}}
97 | \end{array}\right).
98 | \]
99 | Figure \ref{chxx_rotation} illustrates the rotation by plotting
100 | the column vectors of $V$ along with the usual unit basis vectors.
101 | \begin{figure}
102 | \begin{center}
103 | \includegraphics[width=0.5\textwidth]{rotation.pdf}
104 | \end{center}
105 | \caption{Coordinates from matrix $V$ with orthonormal columns (solid lines)
106 | compared to the standard unit basis vectors (dashed). $V$ is a rotation
107 | matrix.}
108 | \label{chxx_rotation}
109 | \end{figure}
110 | Multiplying a vector by $V$ rotates the entries of the vector to the
111 | new coordinate system, and does nothing else. In particular, the Euclidean
112 | norm of the vector is not changed.
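A quick numerical check of this in R, using the matrix $V$ above (an illustrative snippet):
\begin{lstlisting}
V = matrix(c(1, 1, -1, 1) / sqrt(2), 2, 2)  # the rotation matrix V above
x = c(3, 4)
sqrt(sum(x^2))            # the norm of x is 5
sqrt(sum((V %*% x)^2))    # the norm of V x is also 5
\end{lstlisting}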
This is a useful result so we will state 113 | it as a Lemma: 114 | \begin{lemma}\label{invariant} 115 | Let $V\in\R^{n\times n}$ be an orthonormal matrix. Then 116 | $\|Vx\|=\|x\|$ for all $x\in\R^n$. 117 | \end{lemma} 118 | 119 | 120 | \subsection*{The singular value decomposition} 121 | 122 | The singular value decomposition, or SVD, plays a central role in the 123 | analysis--and often implementation--of many computational methods involving 124 | matrices. If you only plan to know one matrix decomposition, this is the one 125 | to know. Let $X\in\R^{n \times p}$ and let $k=\min\{n, p\}$. Then 126 | there exist matrices $U\in\R^{n \times k}$ and $V\in\R^{p\times 127 | k}$ with orthonormal columns $U^T U = V^T V = I$ such that 128 | \begin{equation}\label{SVD} 129 | U^T X V = \Sigma, 130 | \end{equation} 131 | where $\Sigma$ is a $k\times k$ diagonal matrix with non-negative entries 132 | $\sigma_1 \ge \sigma_2 \ge \cdots \ge \sigma_k \ge 0$ along its 133 | main diagonal. This is sometimes called the ``thin'' SVD. R users will be 134 | familiar with this as the {\tt svd(X)} function, which returns a vector of 135 | $\sigma_i$ values instead of a full diagonal matrix $\Sigma$ but is otherwise 136 | the same. The singular value decomposition is not unique, but is almost so--it 137 | is unique up to the signs of the singular vector elements. 138 | 139 | In the case that $n > p$ it's possible to extend the matrix $U$, by adding $n - 140 | p$ additional orthonormal columns, into a square orthonormal $n\times n$ matrix 141 | $\bar{U}$. Similarly, when $n < p$ we can extend $V$ to a square orthonormal 142 | $p\times p$ matrix $\bar{V}$. The extended matrices are especially useful in 143 | analysis, and available to R users using the function invocation 144 | {\tt{svd(X,$\phantom{,}$nu=n,$\phantom{,}$nv=p)}}. The extended version is 145 | sometimes called the ``full'' SVD or just the SVD in many references and 146 | $\bar{U}^TX\bar{V}=\bar{\Sigma}$ results in an $n\times p$ rectangular diagonal 147 | matrix with the same main diagonal entries $\sigma_1 \ge \sigma_2 \ge \cdots \ge 148 | \sigma_k \ge 0$ as the thin version. 149 | 150 | The columns of $\bar{U}$ are called the {\it left singular vectors} of $X$ and the 151 | columns of $\bar{V}$ are called the {\it right singular vectors}. The $\sigma_i$ are 152 | called {\it singular values} of $X$. The SVD breaks matrix vector 153 | multiplication into three steps: rotation, scaling, then another rotation. 154 | Consider an $n\times p$ matrix 155 | $X$ and its product $y$ with a vector $b\in\R^p$ using the 156 | full SVD 157 | $y=Xb = \bar{U}\bar{\Sigma}\bar{V}^Tb$: 158 | \begin{enumerate} 159 | \item Let $\hat{b}=\bar{V}^T b\in\R^p$. 160 | Since $\bar{V}$ is orthonormal, $\hat{b}$ is simply a rotation of the 161 | vector $b$. 162 | \item Now let $s = \bar{\Sigma}\hat{b}\in\R^n$, 163 | which scales each entry of $\hat{b}$ by the corresponding $\sigma_i$. 164 | \item Finally let $y=\bar{U}s$. This is just another rotation by the 165 | orthonormal matrix $\bar{U}$. 166 | \end{enumerate} 167 | The SVD reveals a lot of information about the structure of the matrix $X$. 168 | Step 2 tells us how much a vector can be scaled by $X$. The rotations in steps 169 | 1 and 3 tell us about its range and null space. The number of nonzero singular 170 | values of $X$ is equal to the \emph{rank} of $X$--the dimension of the range of 171 | $X$ (range means the set of all linear combinations of the columns of $X$ 172 | a.k.a. the span of $X$). 
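For instance, here is a small R illustration (with a made-up rank-deficient matrix) of reading the numerical rank off the computed singular values:
\begin{lstlisting}
X = cbind(1, 1:4, 2 * (1:4))   # the third column is twice the second
d = svd(X)$d                   # singular values, largest first
sum(d > 1e-12 * d[1])          # numerical rank is 2, not 3
\end{lstlisting}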
The {\it condition number} 173 | of $X$, familiar to R users as the \verb+kappa+ function and also written 174 | $\kappa_2(X)$, is the ratio of largest and smallest singular values. It 175 | measures how ill-conditioned the matrix is. Computation involving highly 176 | ill-conditioned matrices can be very sensitive to perturbations like noise or 177 | even numerical precision. 178 | 179 | Let $U^TXV=\Sigma$ be the ``thin'' SVD of $X$ and let 180 | $\bar{U}\in\R^{n\times n}$ and $\bar{V}\in\R^{p\times p}$ be 181 | their extended versions when $n > p$ or $n < p$. Let $r$ be the index 182 | corresponding to the smallest non-zero singular value of $X$, for instance 183 | $\sigma_1 \ge \sigma_2 \ge \cdots \ge \sigma_r > \sigma_{r+1} = 0 = \cdots = \sigma_k$, 184 | where $k=\min\{n,p\}$. Then $\mbox{rank}(X) = r$ and the singular vectors define the 185 | following bases: 186 | \begin{itemize} 187 | \item The first $r$ columns of $U$ form a basis of the range of $X$. 188 | \item The first $r$ columns of $V$ form a basis of the range of $X^T$. 189 | \item If $r0$ and the 1-norm of the solution vector $b$. The lasso 316 | is the closest convex estimate of the parameterized 317 | \emph{best subset selection problem}: 318 | \[ 319 | \min_b\|Xb - y\|^2 + \mu\|b\|_0, 320 | \] 321 | where $\|b\|_0$ means simply the count of nonzero components of $b$. 322 | (Despite the notation, $\|b\|_0$ is not a vector norm 323 | since for any scalar $\lambda$ with $|\lambda |$ not equal to zero or one, 324 | $\|\lambda b\|_0 \ne |\lambda |\|b\|_0$.) Although the best subset selection 325 | problem might seem to be the most natural way to select subsets of columns of 326 | the matrix $X$, the problem is nonconvex and hard to solve--indeed it is 327 | known to be NP hard. We 328 | shall see later that there are other approaches to estimating optimal 329 | column subsets including a 330 | fast heuristic method by Golub called SVD subset 331 | selection~\cite[Section 12.2]{gvl}, and a 332 | newer approach by Lanza, Reichel and others based on Krylov subspace 333 | methods~\cite{lanza}. 334 | 335 | The ordinary least squares solution of linear models has important statistical 336 | properties shown by Gauss~\cite{gauss} and later rediscovered by 337 | Markoff~\cite{markoff}. The least squares solution defines a {\it minimum 338 | variance unbiased estimator}, the technical details of which we leave to the 339 | references, in particular see~\cite{hastie},\cite{MN}. 340 | 341 | 342 | \section*{Generalized linear models} 343 | 344 | Our notes on generalized linear models closely follow the book 345 | ``Generalized Linear Models'' by McCullagh and Nelder~\cite{MN}. That very 346 | readable and practical book remains, in our opinion, the best all-around applied 347 | reference on GLMs and strongly influenced algorithm implementations in the 348 | R language. 349 | McCullagh and Nelder describe generalizations of the basic linear model 350 | in three parts: 351 | \begin{enumerate} 352 | \item A \emph{random component} describing the distribution of the 353 | measured entries of of a response vector $y$ and their vector of 354 | expected values $\mu = E(y)\in\R^n$. 355 | \item A \emph{systematic component} $\eta = X\beta$ that is just a basic 356 | linear model involving a vector $\eta\in\R^n$, model matrix $X\in\R^{n\times p}$ and 357 | coefficient solution vector $\beta\in\R^p$. 358 | \item A \emph{link function} between the random and systematic components, 359 | $\eta = g(\mu)$, applied component-wise to the vector $\mu$. 
360 | \end{enumerate} 361 | The link function $g$ is assumed to be a real-valued monotonic, differentiable 362 | (and therefore invertible) function. If $p=n$ and the matrix $X$ is of full 363 | rank, then the model can exactly match the $n$ data observations in $y$ and all 364 | of the variation between observations is consigned to the systematic component 365 | of the model. Such models are usually \emph{over fit} and rarely generalize 366 | well to new data, although they have practical utility as seen in the next 367 | section. When $p=1$ then the model represents a single common $\mu$ 368 | for all $n$ data observations and all of the variation in $y$ is 369 | consigned to the random component. Most real-world GLMs lie somewhere 370 | in-between these two extremes. 371 | 372 | 373 | Adding the random component and link function around a basic linear model 374 | lets GLMs model a wider range of scenarios than their OLS cousins. In 375 | particular, the link function lets us model variables $\mu$ that are restricted 376 | to intervals, for instance the interval $[0,1]$ useful for modeling binary 377 | values. And we can use the random component to pair an appropriate 378 | distribution with such values (say, a binomial distribution in the case of 0/1 379 | data). The added modeling flexibility comes with a cost--the link function can 380 | turn finding the solution of GLMs into a nonlinear problem, despite the 381 | underlying linear model assumption in the systematic component. 382 | 383 | 384 | These notes assume that the random component distribution describing the 385 | response belongs to a one- or two-parameter \emph{exponential family} of 386 | probability distributions described below. The exponential family covers many 387 | widely used and important cases like logistic/binomial, Bernoulli, multinomial, 388 | exponential, Poisson, Gaussian, and others. Limiting our discussion to models 389 | that fit into the exponential family, despite a superficial mathematical 390 | complexity, greatly simplifies many details. 391 | 392 | 393 | \subsection*{The exponential family of distributions} 394 | 395 | The following sections include a lot of notation and many functions and 396 | parameters to keep track of. Although a bit complicated, nothing presented here 397 | is harder than elementary Calculus. For the most part, we very closely follow 398 | the exposition of McCullagh and Nelder~\cite{MN}, but we expand on it in some 399 | places to help illuminate key ideas. 400 | 401 | The exponential family of distributions have probability distributions 402 | that can be written as a function 403 | \begin{equation}\label{expfamily} 404 | f(y; \theta, \phi) = \exp\left(\frac{y\theta - b(\theta)}{a(\phi)} + c(y, \phi)\right), 405 | \end{equation} 406 | for parameters $\theta$ and $\phi$ and fixed functions $a,b,$ and $c$. 407 | The notation $f(y; \theta, \phi)$ means a function $f(y)$ that 408 | depends on the given parameters $\theta$ and $\phi$. 409 | Any probability distribution that can be re-written in this form belongs 410 | to the exponential family. 411 | 412 | For instance, let 413 | $\theta=\mu$, $\phi=\sigma^2$, $b(\theta)=\theta^2/2$, $a(\phi)=\phi$ 414 | and $c(y, \phi) = -\frac{1}{2}(\frac{y^2}{\sigma^2} + \log(2\pi\sigma^2))$. 
415 | Then substituting those values in to Equation~\ref{expfamily} yields 416 | \begin{align*} 417 | f(y; \theta, \phi) &= 418 | \exp\left(\frac{y\mu - \mu^2/2}{\sigma^2} - \frac{1}{2}\left(\frac{y^2}{\sigma^2} + \log(2\pi\sigma^2)\right)\right)\\ 419 | &= \frac{1}{\sqrt{2\pi\sigma^2}}\exp\left(\frac{-(y - \mu)^2}{2\sigma^2}\right), 420 | \end{align*} 421 | which is a standard expression of a normal distribution, 422 | showing that the normal distribution fits in to the exponential 423 | family. 424 | 425 | Similarly, consider the Poisson distribution with single parameter 426 | $\mu$, 427 | \[ 428 | \exp(-\mu)\mu^y/{y!}\,\,. 429 | \] 430 | Let $\theta=\log\mu$, $a(\phi)=1$, $b(\theta)=\exp\theta$, and 431 | $c(y, \phi)=-\log{(y!)}$. Then 432 | \begin{align*} 433 | f(y; \theta, \phi) &= \exp\left(\frac{y\theta - b(\theta)}{a(\phi)} + c(y, \phi)\right)\\ 434 | &= \exp\left(\frac{y\log\mu - \exp\theta}{1} - \log y!\right)\\ 435 | &= {\exp(y\log\mu - \exp\theta)}/{y!}\\ 436 | &= {\exp(y\log\mu - \exp\log\mu)}/{y!}\\ 437 | &= \exp(y\log\mu - \mu)/y!\\ 438 | &= \exp{(-\mu)}\exp{(y\log\mu)}/y!\\ 439 | &= \exp(-\mu)\mu^y/y!\,\,. 440 | \end{align*} 441 | Many other distributions are described by the exponential family. 442 | 443 | \subsubsection*{The log likelihood function} 444 | If we think of the function $f$ in Equation~\ref{expfamily} as 445 | a function of parameters $\theta$ and $\phi$ given observed data $y$ then 446 | the function describes the likelihood of the observations. Its logarithm, 447 | \begin{equation*} 448 | \ell(\theta, \phi; y) = \frac{y\theta - b(\theta)}{a(\phi)} + c(y, \phi), 449 | \end{equation*} 450 | is called the \emph{log likelihood} function. In this context the 451 | function $b(\theta)$ is called the \emph{cumulant function} and $\phi$ 452 | the \emph{dispersion parameter}. 453 | In the usual case where $y$ is a vector 454 | of $n$ independent observations, the log likelihood function 455 | sums the individual contributions: 456 | \begin{equation}\label{loglik} 457 | \ell(\theta, \phi; y) = \sum_{i=1}^n\frac{y_i\theta_i - b(\theta_i)}{a(\phi)} + c(y, \phi) 458 | \end{equation} 459 | Next we derive a few basic identities that 460 | will be useful later. 461 | 462 | Let $\partial\ell/\partial\theta$ be the derivative of the log likelihood 463 | function with respect to $\theta$ (how much the function changes as 464 | $\theta$ changes), and similarly $\partial^2\ell/\partial\theta^2$ 465 | its 2nd derivative (how much the derivative function changes as 466 | $\theta$ changes). Then 467 | \begin{equation}\label{dl} 468 | \frac{\partial\ell}{\partial\theta} = \frac{y - b'(\theta)}{a(\phi)}, 469 | \end{equation} 470 | and, 471 | \begin{equation}\label{d2l} 472 | \frac{\partial^2\ell}{\partial\theta^2} = \frac{-b''(\theta)}{a(\phi)}, 473 | \end{equation} 474 | where $b'(\theta)$ means the derivative of the function $b$ taken with 475 | respect to $\theta$. 476 | 477 | Assume that $a(\phi)\ne 0$ and that 478 | the expected value $E(\partial\ell/\partial\theta) = 0$ 479 | and also that 480 | $E(\partial^2\ell/\partial\theta^2) + E(\partial\ell/\partial\theta)^2 = 0$. 481 | Then 482 | \begin{equation}\label{bprime} 483 | 0 = E(\partial\ell/\partial\theta) = \frac{E(y) - b'(\theta)}{a(\phi)} 484 | \qquad\mbox{which means that}\,\,b'(\theta) = E(y). 485 | \end{equation} 486 | Recall above that we sometimes use the alternative notation $\mu=E(y)$ 487 | for the expected value of $y$; so $\mu=E(y)=b'(\theta)$. 
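As a quick check, in the Poisson example above we had $b(\theta)=\exp\theta$ with $\theta=\log\mu$, so $b'(\theta)=\exp\theta=\mu$, which is indeed the expected value of a Poisson distribution with mean $\mu$.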
488 | 489 | Similarly, 490 | \begin{align} 491 | 0 &= 492 | E(\partial^2\ell/\partial\theta^2) + E(\partial\ell/\partial\theta)^2 \nonumber\\ 493 | &= 494 | \frac{-b''(\theta)}{a(\phi)} + 495 | E\left(\frac{y - b'(\theta)}{a(\phi)}\right)^2 \nonumber\\ 496 | &= \frac{-b''(\theta)}{a(\phi)} + 497 | E\left(\frac{y^2 - 2yb'(\theta) + b'(\theta)^2}{a(\phi)^2}\right)\nonumber\\ 498 | &= \frac{-b''(\theta)}{a(\phi)} + 499 | \frac{E(y^2) - E(y)^2}{a(\phi)^2} \qquad(\mbox{substituting}\,\,b'(\theta)=E(y))\nonumber\\ 500 | &= \frac{-b''(\theta)}{a(\phi)} + \frac{V(y)}{a(\phi)^2}\nonumber\\ 501 | &\mbox{which means that}\,\,a(\phi)b''(\theta) = V(y)\label{b2}, 502 | \end{align} 503 | where $V(y)$ is the usual definition of the variance function for $y$. 504 | 505 | Finally for this section, one more useful identity showing that the 506 | rate of change of the expected value of $y$ with respect to the parameter $\theta$ 507 | is a multiple of the variance function $V(y)$: 508 | \begin{align} 509 | \frac{d}{d\theta}E(y) &= \frac{d}{d\theta}\mu &\mbox{(just notation)}\nonumber\\ 510 | &= \frac{d}{d\theta}b'(\theta) &\mbox{(by Equation \ref{bprime})}\nonumber\\ 511 | &= b''(\theta)\nonumber\\ 512 | &= V(y)/a(\phi)\label{mutheta} &\mbox{(by Equation \ref{b2})}. 513 | \end{align} 514 | 515 | 516 | 517 | 518 | 519 | \subsection*{GLMs and the exponential family} 520 | 521 | The last section introduced the random component of generalized linear models 522 | and corresponding log likelihood function for the exponential family of 523 | distributions. This section puts that together with the remaining 524 | generalizations, the systematic component's linear model $\eta = X\beta$ and 525 | the link function $\eta=g(\mu)$. 526 | 527 | One approach for solving generalized linear models is to find the value of the 528 | coefficient vector $\beta$ that maximizes the value of the log likelihood 529 | function in Equation~\ref{loglik}. Solving for such a \emph{maximum-likelihood 530 | solution} is the main goal of this section. We can phrase the solution as a 531 | standard nonlinear least squares problem by recasting the maximum likelihood 532 | problem in terms of a minimum residual problem using \emph{deviance residuals}. 533 | 534 | \subsubsection*{Deviance residuals} 535 | 536 | 537 | 538 | \subsubsection*{Jacobian} 539 | 540 | In order to find a maximum using Calculus, we will need (at least) an 541 | expression for the derivative of the log likelihood function with respect to 542 | each component of the solution $\beta_j$, $\partial{l}/\partial{\beta_j}$. 543 | 544 | Writing the $n \times p$ matrix $X$ showing each column as 545 | $X = [x_1, x_2, \cdots, x_p]$, then 546 | $\eta = X\beta = x_1\beta_1 + x_2\beta_2 + \cdots + x_p\beta_p$, 547 | and 548 | \begin{equation}\label{eta} 549 | \partial\eta/\partial\beta_j = x_j. 
550 | \end{equation}
551 |
552 | Then using the chain rule from Calculus, the derivative of the log likelihood
553 | function with respect to each component of the solution $\beta_j$ is
554 | \begin{align}
555 | \frac{\partial\ell}{\partial\beta_j}
556 | &= \frac{\partial\ell}{\partial\theta}\frac{d\theta}{d\mu}\frac{d\mu}{d\eta}\frac{\partial\eta}{\partial\beta_j} \nonumber\\
557 | &= \frac{\partial\ell}{\partial\theta}\frac{1}{V(\mu)}\frac{d\mu}{d\eta}\frac{\partial\eta}{\partial\beta_j}
558 | &\mbox{(by Equation \ref{mutheta})} \nonumber\\
559 | &= \frac{\partial\ell}{\partial\theta}\frac{1}{V(\mu)}\frac{d\mu}{d\eta}x_j
560 | &\mbox{(by Equation \ref{eta})} \nonumber\\
561 | &= \frac{y- b'(\theta)}{a(\phi)}\frac{1}{V(\mu)}\frac{d\mu}{d\eta}x_j
562 | &\mbox{(by Equation \ref{dl})} \nonumber\\
563 | &= w\frac{y-\mu}{a(\phi)}\frac{d\eta}{d\mu}x_j,
564 | &\mbox{(by Equations \ref{bprime} and \ref{w})} \label{dldb}
565 | \end{align}
566 | where $w$ is defined as a multiple of the inverse variance function:
567 | \begin{equation}\label{w}
568 | w = \left(\frac{d\mu}{d\eta}\right)^2 \bigg/ V(\mu).
569 | \end{equation}
570 | Since $\eta=g(\mu)$, the term $\frac{d}{d\mu}\eta$ in Equation~\ref{dldb} is
571 | simply $g'$, the derivative of the link function. We remark that, in the usual
572 | case that the response $y$ is a vector of $n$ iid observations and $X$ is an
573 | $n\times p$ matrix, Equation~\ref{dldb} holds entrywise and defines the
574 | $n\times p$ Jacobian matrix of the log likelihood function
575 | with $ij^{th}$ entry
576 | \begin{equation}\label{jacobian}
577 | J(\beta)_{ij} = \frac{\partial\ell_i}{\partial\beta_j}
578 | = w_i\frac{y_i-\mu_i}{a(\phi)}\frac{d\eta_i}{d\mu}x_{ij},
579 | \qquad i=1, 2, \ldots, n,\,\,j=1,2,\ldots,p.
580 | \end{equation}
581 | %Assume that the dispersion function $a(\phi)$ is constant with respect to the
582 | %solution $\beta$. Then the maximum of the log likelihood function $\ell$ with
583 | %respect to each solution component $\beta_j$ for $j=1, 2, \ldots, p$ occurs when
584 | %\begin{equation}\label{max_loglik}
585 | %\sum_{i=1}^n w_i(y-\mu)_i\frac{d}{d\mu}\eta_ix_{ij} = 0.
586 | %\end{equation}
587 |
588 |
589 | \subsubsection*{Canonical link functions}
590 | Recall that
591 | the link function relates $\eta$ and $\mu$ by $\eta=g(\mu)$,
592 | and therefore also their derivatives $d\eta/d\mu = g'(\mu)$. Choosing a special \emph{canonical link function}
593 | results in a number of simplifications. Chief among them for our
594 | purposes, a canonical link connects $d\eta/d\mu$ to the variance function by
595 | \begin{equation}
596 | \label{canonical}
597 | {d\eta}/{d\mu}={1}/{V(\mu)}.\qquad\mbox{(canonical link case)}
598 | \end{equation}
599 | When $g$ is a canonical link function we get a simplification
600 | for $w$ using~\ref{canonical}:
601 | \begin{align}
602 | w &= \left(\frac{d\mu}{d\eta}\right)^2 \bigg/ V(\mu) \nonumber \\
603 | &= V^2(\mu) / V(\mu) \nonumber \\
604 | &= V(\mu)\qquad\mbox{(canonical link case)}. \label{W_canonical}
605 | \end{align}
606 |
607 |
608 |
609 | \subsection*{Maximum likelihood solutions based on first-order approximations}
610 |
611 | We now use the Jacobian entries in Equation~\ref{jacobian} to write the gradient of $\ell$ compactly.
612 |
613 |
614 |
615 | Assemble entries of $w$ and $g'$ along the diagonal of an
616 | $n\times n$ diagonal matrix $W$:
617 | \begin{equation}\label{W}
618 | W_{ij} = \bigg\{\begin{array}{cr}
619 | w_i / g'(\mu)_i & \mbox{if $i=j$}, \\
620 | 0 & \mbox{otherwise}.
621 | \end{array}
622 | \end{equation}
623 | Then a compact formula for the gradient of $\ell$ with respect to $\beta$ is
624 | \begin{equation}\label{gradient}
625 | \nabla_\beta\ell = X^TW(y - \mu).
626 | \end{equation}
627 |
628 | At this point, we have enough information from Equations~\ref{loglik}
629 | and~\ref{gradient} to formulate a first-order solution method for finding the
630 | maximum likelihood GLM solution. Possible solution methods include gradient
631 | descent, Gauss-Newton, conjugate gradient, or a quasi-Newton approach.
632 | The following example cooks up a very basic GLM solver using gradient
633 | descent.
634 |
635 | XXX EXAMPLE XXX
636 |
637 |
638 |
639 | \subsection*{Maximum likelihood solution by Newton's method}
640 |
641 | We can do better than the first-order solutions in the last section. Because
642 | we restricted our problems to the exponential family, we can formulate an
643 | analytic representation of the second derivatives of the log likelihood
644 | function to form a Hessian matrix. That knowledge enables us to employ solution
645 | methods using second-order (quadratic) approximations like Newton's
646 | method--such methods have more favorable convergence properties (faster, more
647 | stable) than first-order solution methods.
648 |
649 | We need to differentiate the expression for the derivative of $\ell$ in
650 | Equation~\ref{dldb} with respect to the other elements
651 | $\beta_k$ to compute the entries of the second-derivative Hessian matrix,
652 | and again we assume that the dispersion term $a(\phi)$ is constant
653 | with respect to the solution $\beta$. Then the $jk$-entry of the
654 | Hessian matrix is:
655 | \begin{align}
656 | H_{jk} &= \frac{\delta^2\ell}{\delta\beta_k\delta\beta_j}
657 | = \frac{\delta}{\delta\beta_k}\frac{\delta\ell}{\delta\beta_j} \nonumber\\
658 | &=
659 | \frac{\delta}{\delta\beta_k}
660 | \sum_i\left(
661 | w_i(y-\mu)_i\frac{d\eta_i}{d\mu}x_{ij} \right)\nonumber \\
662 | &= \sum_i\left(
663 | (y-\mu)_i
664 | \frac{\delta}{\delta\beta_k}
665 | w_i\frac{d\eta_i}{d\mu}x_{ij}
666 | +
667 | w_i\frac{d\eta_i}{d\mu}x_{ij}\frac{\delta}{\delta\beta_k}(y-\mu)_i \label{hsum}
668 | \right)
669 | \end{align}
670 | where the last equality uses the product rule from Calculus.
671 | At this point, we have an (unwieldy) expression for the Hessian and
672 | we can plug that together with the gradient function from Equation~\ref{gradient}
673 | into Newton's method to get a maximum likelihood GLM solver.
674 | However, we will consider an important special case next that is much simpler.
675 |
676 | \subsubsection*{Canonical link case}
677 | When $g$ is a canonical link function
678 | the expression $w_i\frac{d\eta_i}{d\mu}$ in the
679 | first term of the sum in Equation~\ref{hsum}
680 | is constant because in such cases
681 | $d\eta/d\mu=1/V(\mu)$ and
682 | $w=V(\mu)$
683 | by Equations~\ref{canonical} and~\ref{W_canonical}. Thus, its derivative
684 | \[
685 | \frac{\delta}{\delta\beta_k} w_i\frac{d\eta_i}{d\mu} = 0,
686 | \]
687 | that is, the first term of the Hessian in Equation~\ref{hsum}
688 | drops out when $g$ is a canonical link function.
689 | Meanwhile, consider the second term
690 | \begin{align*}
691 | \sum_i\left(
692 | w_i\frac{d\eta_i}{d\mu}x_{ij}\frac{\delta}{\delta\beta_k}(y-\mu)_i
693 | \right) &=
694 | \sum_i\left(
695 | w_i\frac{d\eta_i}{d\mu}x_{ij}\frac{\delta\mu}{\delta\beta_k} \right) &\mbox{($y$ is constant wrt $\beta$)} \\
696 | &=
697 | \sum_i\left(
698 | w_ix_{ij}\frac{\delta\eta_i}{\delta\beta_k} \right)&\mbox{(chain rule)}\\
699 | &=
700 | \sum_i\left(
701 | w_ix_{ij}x_{ik} \right), &\mbox{(by Equation \ref{eta})}
702 | \end{align*}
703 | finally arriving (with a substantial subscript-induced headache) at a compact
704 | expression for the Hessian using the definition of $W$ from Equation~\ref{W}:
705 | \begin{equation}\label{Hessian}
706 | H = X^T W X.
707 | \end{equation}
708 |
709 | With an expression for the gradient from Equation~\ref{gradient} and Hessian
710 | from Equation~\ref{Hessian} of the log likelihood function, we have all we need
711 | to implement a second-order solution method. The next example uses R's
712 | \verb+nlm+ function to find the maximum likelihood GLM solution using Newton's
713 | method.
714 |
715 | XXX EXAMPLE XXX
716 |
717 | Remember that this derivation assumed that $g$ is a canonical link function. In
718 | the general case we need to resort to the definition of the Hessian in
719 | Equation~\ref{hsum} for a Newton's method-based solution.
720 |
721 |
722 |
723 | \subsection*{Iteratively re-weighted least squares}
724 |
725 |
726 |
727 |
728 | The numerical solution of model problems of this form was carefully analyzed by
729 | Paige~\cite{paige}.
730 | ...entries of $W$ are non-zero, the generalized linear model
731 | \ref{glm} results in a weighted nonlinear least squares problem
732 | typically solved by the iteratively reweighted least squares method
733 | shown in Algorithm \ref{irls} and defined carefully by Bj\"orck~\cite{bjork}...
734 |
735 |
736 | \subsection*{Numerical implementation issues}
737 |
738 | cover edge cases here including zero-variance observations (constant rows in $X$)
739 | and singular/ill-conditioned $X$.
740 |
741 | Introduce R's rank-revealing QR IRWLS approach.
742 |
743 | SVD-IRWLS based on the algorithm by O'Leary
744 |
745 | comparison/examples between R's RRQR-IRWLS and SVD-IRWLS
746 |
747 | large-scale problems and first-order solution methods
748 |
749 |
750 | \subsubsection*{Round-off error in QR- and SVD-based methods}
751 |
752 |
753 |
754 |
755 |
756 |
757 |
758 |
759 |
760 |
761 |
762 | \section*{Copyright}
763 | Copyright \copyright 2014 Michael Kane and Bryan W. Lewis
764 |
765 | \begin{quote}
766 | Permission is granted to copy, distribute and/or modify this document
767 | under the terms of the GNU Free Documentation License, Version 1.3
768 | or any later version published by the Free Software Foundation;
769 | with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.
770 | A copy of the license is included in the GitHub project files.
771 | \end{quote}
772 |
773 |
774 |
775 | \begin{thebibliography}{99}
776 | \bibitem{anda} Anda, A. and Park, H., Self-scaling fast rotations for stiff least squares problems, Lin. Alg. Appl., 234, 1996, pp. 137-162.
777 | \bibitem{bjork} Bj\"orck, \AA., Numerical Methods for Least Squares Problems, SIAM, Philadelphia, 1996.
778 | \bibitem{bates} Bates, D., \url{http://www.stat.wisc.edu/courses/st849-bates/lectures/GLMH.pdf}.
779 | \bibitem{dekker}
780 | Dekker, Theodorus Jozef.
``A floating-point technique for extending the available precision.'' Numerische Mathematik 18.3 (1971): 224-242.
781 | \bibitem{friedman} Friedman, Jerome, Trevor Hastie, and Rob Tibshirani. ``Regularization paths for generalized linear models via coordinate descent.'' Journal of Statistical Software 33.1 (2010): 1.
782 | \bibitem{glmnet} Friedman, Hastie, Tibshirani, Simon, Narasimhan, Qian, \url{https://cran.r-project.org/package=glmnet}.
783 | \bibitem{gauss} Gauss, C. F., Theoria combinationis observationum erroribus minimis obnoxiae, Pars prior, 1863 (first written 1821).
784 | \bibitem{hastie} Hastie, T. J. and Pregibon, D., Generalized linear models, Chapter 6 of Statistical Models in S, eds J. M. Chambers and T. J. Hastie, Wadsworth \& Brooks/Cole, 1992.
785 | \bibitem{fmm} Forsythe, George Elmer, Cleve B. Moler, and Michael A. Malcolm. ``Computer methods for mathematical computations.'' (1977).
786 | \bibitem{gvl} Golub, Gene H., and Charles F. Van Loan. Matrix computations. Vol. 3. JHU Press, 2012.
787 | \bibitem{higham96} Higham, Nicholas J. Accuracy and stability of numerical algorithms. Vol. 80. SIAM, 2002.
788 | \bibitem{horn-johnson} Horn, Roger A., and Charles R. Johnson. Matrix analysis. Cambridge University Press, 1990.
789 | \bibitem{jordan} Jordan, Michael, \url{https://people.eecs.berkeley.edu/~jordan/courses/260-spring10/other-readings/chapter8.pdf} (online notes).
790 | \bibitem{lamport} Lamport, Leslie. How to write a proof. The American Mathematical Monthly 102.7 (1995): 600-608.
791 | \bibitem{lanza} Lanza, Alessandro, et al. A Generalized Krylov Subspace Method for $\ell_p-\ell_q$ Minimization. SIAM Journal on Scientific Computing 37.5 (2015): S30-S50.
792 | \bibitem{lumley} Lumley, T., \url{http://cran.r-project.org/web/packages/biglm}.
793 | \bibitem{markoff} Markoff, A., Wahrscheinlichkeitsrechnung, Leipzig, 1912.
794 | \bibitem{MN} McCullagh, P. and Nelder, J. A., Generalized Linear Models, Chapman and Hall, London, 1989.
795 | \bibitem{oleary} O'Leary, D., Robust regression computation using iteratively reweighted least squares, SIAM J. Mat. Anal. Appl., Vol. 11 No. 3, 1990, pp. 466-480.
796 | \bibitem{paige} Paige, C. C., Fast numerically stable computations for generalized least squares problems, SIAM J. Num. Anal., 16, 1979, pp. 165-171.
797 | \bibitem{R} The R project, \url{http://www.r-project.org}.
798 | \bibitem{trefbau} Trefethen, Lloyd N., and David Bau III. Numerical linear algebra. Vol. 50. SIAM, 1997.
799 | \bibitem{zhou} Zou, H. and Hastie, T., Regularization and Variable Selection via the Elastic Net, J. Royal Statistical Society, B, 2005, pp. 301-320.
800 | \end{thebibliography}
801 |
802 |
803 | \end{document}
804 |
-------------------------------------------------------------------------------- /implementations.R: --------------------------------------------------------------------------------
1 | # Example iteratively re-weighted least squares (IRLS) implementations
2 | # Mike Kane & Bryan Lewis, 2013-2014.
3 | #
4 | # The implementations generally follow the same input/output pattern. They
5 | # take as inputs a model matrix A, a response vector b whose length is the
6 | # number of rows of A, an R 'family' function that defines the error
7 | # distribution family and link function, a maximum number of iterations, and an
8 | # iteration convergence tolerance. The methods produce a list with two
9 | # elements, the model coefficients and the number of iterations.
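# For example, a minimal illustrative sketch using simulated data (after
# sourcing this file) might look like:
#
#   set.seed(1)
#   A = cbind(1, matrix(rnorm(300), 100, 3))
#   b = rbinom(100, 1, plogis(-1 + 0.5 * A[, 2] - 0.25 * A[, 3]))
#   fit = irls(A, b, family = binomial)
#   fit$coefficients
#   fit$iterations
#
# See compare.R in this repository for a fuller comparison against R's glm().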
10 | 11 | # The most basic IRLS method, and the shortest implementation we could come 12 | # up with. This method solves the normal equations associated with a weighted 13 | # least squares problem in each iteration. 14 | irls = 15 | function(A, b, family=binomial, maxit=25, tol=1e-08) 16 | { 17 | x = rep(0,ncol(A)) 18 | for(j in 1:maxit) 19 | { 20 | eta = drop(A %*% x) 21 | g = family()$linkinv(eta) 22 | gprime = family()$mu.eta(eta) 23 | z = eta + (b - g) / gprime 24 | W = drop(gprime^2 / family()$variance(g)) 25 | xold = x 26 | x = solve(crossprod(A, W * A), crossprod(A, W * z), tol=2*.Machine$double.eps) 27 | if(sqrt(drop(crossprod(x - xold))) < tol) break 28 | } 29 | list(coefficients=x, iterations=j) 30 | } 31 | 32 | # A method discussed by O'Leary that uses a QR factorization of the model 33 | # matrix. This method should be much more numerically stable in the face of 34 | # ill-conditioned model matrices than the simple method defined above. If the 35 | # QR method used uses Givens rotations, this method is numerically stable for 36 | # stiff problems too. 37 | irls_qrnewton = 38 | function(A, b, family=binomial, maxit=25, tol=1e-08) 39 | { 40 | s = t = 0 41 | QR = qr(A) 42 | Q = qr.Q(QR) 43 | R = qr.R(QR) 44 | for(j in 1:maxit) 45 | { 46 | g = family()$linkinv(t) 47 | gprime = family()$mu.eta(t) 48 | z = t + (b - g) / gprime 49 | W = as.vector(gprime^2 / family()$variance(g)) 50 | wmin = min(W) 51 | if(wmin < sqrt(.Machine$double.eps)) 52 | warning("Tiny weights encountered") 53 | s_old = s 54 | C = chol(crossprod(Q, W*Q)) 55 | s = forwardsolve(t(C), crossprod(Q,W*z)) 56 | s = backsolve(C,s) 57 | t = Q %*% s 58 | if(sqrt(crossprod(s - s_old)) < tol) break 59 | } 60 | x = backsolve(R, crossprod(Q,t)) 61 | list(coefficients=x,iterations=j) 62 | } 63 | 64 | # The next method is a minor variation on the QR Newton method defined above 65 | # that uses the SVD instead. It exhibits similar numerical stability and can 66 | # definitively check model matrix rank deficiency, at the cost of computing 67 | # the SVD instead of the QR factorization up front. 68 | irls_svdnewton = 69 | function(A, b, family=binomial, maxit=25, tol=1e-08) 70 | { 71 | s = t = 0 72 | S = svd(A) 73 | if(min(S$d)/max(S$d)