├── README.md ├── compare.R ├── glm.svd.r ├── glm.tex ├── implementations.R └── sparse_n_dense.R /README.md: -------------------------------------------------------------------------------- 1 | Generalized linear models, abridged. 2 | =============== 3 | 4 | See our old notes on this project at: http://bwlewis.github.io/GLM. 5 | 6 | Heavily revised and updated notes are in progress as of October 2018. 7 | 8 | 9 | Here are some slides from a talk at the Cleveland R User Group: https://bwlewis.github.io/GLM/October_2018_CLERUG.html 10 | 11 | Our current experimental reference SVD-based GLM implementation, based in turn 12 | on Dianne O'Leary's QR implementation from 1990, can be found here: 13 | https://github.com/bwlewis/GLM/blob/master/glm.svd.r. That code is robust and 14 | fast and replicates R's column subset selection routine by falling back to R's 15 | default rank-revealing QR factorization in edge cases. But it's still a work in 16 | progress. -------------------------------------------------------------------------------- /compare.R: -------------------------------------------------------------------------------- 1 | source("implementations.R") 2 | data("Contraception",package="mlmRev") 3 | # Model estimated with R's glm function, returning model matrix and response 4 | # in $x and $y, respectively: 5 | R_GLM = glm(formula = use ~ age + I(age^2) + urban + livch, family = binomial, x=TRUE, data=Contraception) 6 | # Model estimated with our radically stripped-down minimalist implementation: 7 | mini = irls(R_GLM$x, R_GLM$y, family=binomial) 8 | print(data.frame(R_GLM=coef(R_GLM), minimalist=coef(mini))) 9 | 10 | iqrn = irls_qrnewton(R_GLM$x, R_GLM$y, family=binomial) 11 | print(data.frame(R_GLM=coef(R_GLM), qr_newton=coef(iqrn))) 12 | 13 | isvdn = irls_svdnewton(R_GLM$x, R_GLM$y, family=binomial) 14 | print(data.frame(R_GLM=coef(R_GLM), svd_newton=coef(isvdn))) 15 | 16 | # Let's test the sparse-aware IRLS example. But we need some data prep for it 17 | # first. The first three columns of our model matrix are dense: 18 | library("Matrix") 19 | A_dense = Matrix(R_GLM$x[,1:3], sparse=FALSE) 20 | # The next four columns are sparse: 21 | A_sparse = Matrix(R_GLM$x[,4:7], sparse=TRUE) 22 | isparse = irls_sparse(A_dense, A_sparse, R_GLM$y, family=binomial) 23 | print(data.frame(R_GLM=coef(R_GLM), irls_sparse=coef(isparse))) 24 | 25 | # Let's test the incremental implementation... 26 | # Write out the model matrix to a data file for the incremental example. 27 | write.table(R_GLM$x, file="data.csv", sep=",", col.names=FALSE, row.names=FALSE) 28 | inc = irls_incremental("data.csv", 500, R_GLM$y, family=binomial) 29 | print(data.frame(R_GLM=coef(R_GLM), incremental=coef(inc))) 30 | -------------------------------------------------------------------------------- /glm.svd.r: -------------------------------------------------------------------------------- 1 | #' Fitting Generalized Linear Models 2 | #' 3 | #' Similar to \code{glm.fit} but uses the SVD to detect ill-conditioned 4 | #' problems and conducts IRWLS in a projected subspace for efficiency.
5 | #' 6 | #' @param X an n by p real-valued dense model matrix 7 | #' @param y a response vector of length n 8 | #' @param family a family function or the result of a call to a family function 9 | #' @param maxit integer maximum number of IRWLS iterations 10 | #' @param tol IRWLS positive convergence tolerance 11 | #' @param stol positive numerical condition tolerance 12 | #' @param singular.ok if FALSE, a numerically singular fit stops with an error 13 | #' @param weights vector of observation weights 14 | #' @param reg.method indicates regularization approach: 'column projection' follows R's GLM approach; 'minimum norm' finds the LS solution of minimal norm. 15 | #' @param LAPACK if FALSE, use R's column-ordered subset selection when \code{reg.method == 'column projection'}; otherwise use the default LAPACK pivots. 16 | #' @return A list with the model coefficients, number of IRWLS iterations, rank, and column pivoting indices; 17 | #' that is, a list with entries: 18 | #' \describe{ 19 | #' \item{b:}{ model coefficients} 20 | #' \item{iterations:}{ number of IRWLS iterations} 21 | #' \item{rank:}{ rank of model matrix} 22 | #' \item{pivot:}{ model matrix column pivot} 23 | #' } 24 | #' @seealso \code{\link{glm.fit}} 25 | glm.svd = 26 | function(X, y, family=binomial, maxit=25, tol=1e-10, stol=1e-10, 27 | singular.ok=TRUE, weights, 28 | reg.method=c("column projection", "minimum norm"), 29 | LAPACK=FALSE) 30 | { 31 | singular = if(singular.ok) warning else stop 32 | reg.method = match.arg(reg.method) 33 | if(is.list(X)) S = X 34 | else S = svd(X) 35 | V = S$v 36 | nvars = NCOL(S$u) 37 | idx = seq(nvars) 38 | i = (S$d / S$d[1]) > stol 39 | k = sum(i) 40 | pivot = seq(nvars) 41 | if (k < nvars) 42 | { 43 | singular("Singular system detected of rank: ", k, " using threshold: ", stol) 44 | if(reg.method == "column projection") 45 | { 46 | Q = qr(t(S$v[, 1:k]), LAPACK=LAPACK) # Golub SVD subset selection heuristic 47 | # when LAPACK=FALSE uses R's custom pivoting strategy 48 | pivot = Q$pivot 49 | idx = sort(head(pivot, k)) 50 | omit = tail(Q$pivot, nvars - k) 51 | # XXX we can maybe instead use a slightly cheaper downdating svd scheme here: 52 | S_new = svd(X[, -omit]) 53 | # double-check that this worked (it may not have), if not resort to 54 | # something else... XXX can this be improved? 55 | if((tail(S_new$d, 1) / S_new$d[1]) <= stol) 56 | { 57 | warning("Whoops!
SVD subset selection failed, trying dqrdc2 on full matrix") 58 | if(is.list(X)) Q = qr(X$u %*% (X$d * t(X$v)), LAPACK=FALSE) 59 | else Q = qr(X, LAPACK=FALSE) 60 | pivot = Q$pivot 61 | idx = sort(head(pivot, k)) 62 | omit = tail(Q$pivot, nvars - k) 63 | S_new = svd(X[, -omit]) 64 | } 65 | S = S_new 66 | message("omitting column(s) ", paste(omit, collapse=",")) 67 | } 68 | } 69 | 70 | s = rep(0, ncol(S$u)) 71 | if(!is(family, "family")) family = family() 72 | nobs = NROW(y) # needed by the initialize expression below 73 | nvars = NCOL(S$u) # ditto 74 | if(missing(weights)) weights = rep(1, nobs) 75 | variance = family$variance 76 | linkinv = family$linkinv 77 | mu.eta = family$mu.eta 78 | etastart = NULL 79 | eval(family$initialize) 80 | eta = family$linkfun(mustart) 81 | dev.resids = family$dev.resids 82 | dev = sum(dev.resids(y, linkinv(eta), weights)) 83 | devold = 0 84 | for(j in 1:maxit) 85 | { 86 | g = linkinv(eta) 87 | varg = variance(g) 88 | if(any(is.na(varg))) stop("NAs in variance of the inverse link function") 89 | if(any(varg==0)) stop("Zero value in variance of the inverse link function") 90 | gprime = mu.eta(eta) 91 | if(any(is.na(gprime))) stop("NAs in the inverse link function derivative") 92 | z = eta + (y - g) / gprime 93 | W = weights * as.vector(gprime^2 / varg) 94 | # The following is as well-conditioned as W is 95 | C = chol(crossprod(S$u, W*S$u), pivot=TRUE) 96 | piv = attr(C, "pivot") 97 | s = forwardsolve(t(C), crossprod(S$u, W*z)[piv]) 98 | s = backsolve(C, s)[order(piv)] 99 | eta = drop(S$u %*% s) 100 | dev = sum(dev.resids(y, g, weights)) 101 | if(abs(dev - devold) / (0.1 + abs(dev)) < tol) break 102 | devold = dev 103 | # R essentially computes this (via dqrdc2.f) 104 | ## Q = qr(W * X) 105 | ## omit = tail(Q$pivot, ncol(X) - Q$rank) 106 | ## now omit columns and solve... 107 | ## fit = qr.solve(W * X, W * z) 108 | ## eta = drop(x %*% fit) 109 | ## g = linkinv(eta) 110 | } 111 | x = rep(NA, NCOL(X)) 112 | inv = 1/S$d 113 | if(reg.method == "minimum norm") inv[inv > 1/stol] = 1 114 | x[idx] = drop(S$v %*% (s*inv)) 115 | list(coefficients=x,iterations=j, rank=k, pivot=pivot) 116 | } 117 | -------------------------------------------------------------------------------- /glm.tex: -------------------------------------------------------------------------------- 1 | \documentclass[10pt]{article} %See documentation for other class options 2 | \usepackage{tabularx} 3 | \usepackage{color} 4 | %\usepackage{fixltx2e,fix-cm} 5 | %\usepackage{makeidx} 6 | %\usepackage{multicol} 7 | \usepackage{mathtools} 8 | \usepackage{listings} 9 | \usepackage{amsmath} 10 | \usepackage{amssymb} 11 | \usepackage{hyperref} 12 | \usepackage{cite} 13 | \usepackage{authblk} 14 | \usepackage{graphicx} 15 | \usepackage{float} 16 | \newtheorem{thm}{Theorem} 17 | \newtheorem{lemma}{Lemma} 18 | \newcommand{\R}{{\mathbb R}} 19 | 20 | 21 | \begin{document} 22 | 23 | \floatstyle{ruled} 24 | \newfloat{program}{thp}{lop} 25 | \floatname{program}{Program} 26 | \newfloat{algorithm}{thp}{lop} 27 | \floatname{algorithm}{Algorithm} 28 | 29 | \setlength{\parindent}{0pt} 30 | \setlength{\parskip}{0.2em} 31 | 32 | \definecolor{verbgray}{gray}{0.9} 33 | \definecolor{verbgray2}{gray}{0.975} 34 | \lstset{backgroundcolor=\color{verbgray}, 35 | frame=single, 36 | framerule=0pt, 37 | basicstyle=\ttfamily, 38 | keepspaces=true, 39 | columns=fullflexible} 40 | 41 | 42 | \section*{Generalized linear models, abridged} 43 | 44 | \begin{quote} 45 | {\it This is the 2nd major revision of this document.
This version derives 46 | algorithmic details of the iteratively re-weighted least squares method 47 | (IRWLS), and emphasizes advantages of using the singular value decomposition 48 | (SVD) in its implementation. We added background reference material on the SVD. 49 | Following a suggestion by James Blevins, the notation was revised to bring it closer 50 | to other references, especially~\cite{MN}. 51 | --Bryan} 52 | \end{quote} 53 | 54 | \begin{quote} 55 | Generalized linear models (GLMs) are indispensable tools in the data science 56 | toolbox. They are applicable to many real-world problems involving continuous, 57 | yes/no, count and survival data (and more). The models themselves are intuitive 58 | and can be used for inference and prediction. A few very high quality free and 59 | open source software implementations are available (in particular within 60 | R~\cite{R}, and also ExaStat/Revolution Analytics), as are a few first-rate 61 | commercial ones such as SAS and Stata. 62 | 63 | This note grew out of our own desire to better understand the numerics of 64 | generalized linear models. We highlight aspects of GLM implementations that we 65 | find particularly interesting. We present some reference implementations 66 | stripped down to illuminate core ideas, often with just a few lines of code. 67 | Our implementations are in R but are close to pseudocode and easily ported to 68 | other languages. --Bryan and Mike 69 | \end{quote} 70 | 71 | 72 | \section*{Linear algebra background material} 73 | 74 | Skip ahead to the {\bf Linear models} section if you already know all about the 75 | singular value decomposition... The following brief introduction closely 76 | follows the important reference book by Golub and Van Loan, Matrix Computations 77 | \cite{gvl}. You should read that book. 78 | 79 | \subsection*{Orthonormal vectors and rotations} 80 | 81 | Let $V$ be a real-valued $n\times p$ matrix, which we write as 82 | $V\in\R^{n\times p}$. It's sometimes useful to enumerate the 83 | column vectors of a matrix, which we write for instance as $V=[v_1, v_2, 84 | \ldots, v_p]$. The column vectors of $V$ are {\it orthonormal} if and only if 85 | $V^T V = I$, the $p\times p$ identity matrix. For instance, the identity 86 | matrix itself is composed of orthonormal column vectors. When the matrix $V$ 87 | is square, that is when $n=p$, we simply say that $V$ is an orthonormal 88 | matrix. 89 | The columns of orthonormal matrices form coordinate bases of $\R^n$ 90 | whose directions are orthogonal--in other words, a rotation of 91 | the usual unit basis coordinate system. For example, consider a $2\times 2$ 92 | orthonormal matrix $V$: 93 | \[ 94 | V = \left(\begin{array}{cc} 95 | 1/{\sqrt{2}} & -1/{\sqrt{2}} \\ 96 | 1/{\sqrt{2}} & 1/{\sqrt{2}} 97 | \end{array}\right). 98 | \] 99 | Figure \ref{chxx_rotation} illustrates the rotation by plotting 100 | the column vectors of $V$ along with the usual unit basis vectors. 101 | \begin{figure} 102 | \begin{center} 103 | \includegraphics[width=0.5\textwidth]{rotation.pdf} 104 | \end{center} 105 | \caption{Coordinates from matrix $V$ with orthonormal columns (solid lines) 106 | compared to the standard unit basis vectors (dashed). $V$ is a rotation 107 | matrix.} 108 | \label{chxx_rotation} 109 | \end{figure} 110 | Multiplying a vector by $V$ rotates the entries of the vector to the 111 | new coordinate system, and does nothing else. In particular, the Euclidean 112 | norm of the vector is not changed.
This is a useful result so we will state 113 | it as a Lemma: 114 | \begin{lemma}\label{invariant} 115 | Let $V\in\R^{n\times n}$ be an orthonormal matrix. Then 116 | $\|Vx\|=\|x\|$ for all $x\in\R^n$. 117 | \end{lemma} 118 | 119 | 120 | \subsection*{The singular value decomposition} 121 | 122 | The singular value decomposition, or SVD, plays a central role in the 123 | analysis--and often implementation--of many computational methods involving 124 | matrices. If you only plan to know one matrix decomposition, this is the one 125 | to know. Let $X\in\R^{n \times p}$ and let $k=\min\{n, p\}$. Then 126 | there exist matrices $U\in\R^{n \times k}$ and $V\in\R^{p\times 127 | k}$ with orthonormal columns $U^T U = V^T V = I$ such that 128 | \begin{equation}\label{SVD} 129 | U^T X V = \Sigma, 130 | \end{equation} 131 | where $\Sigma$ is a $k\times k$ diagonal matrix with non-negative entries 132 | $\sigma_1 \ge \sigma_2 \ge \cdots \ge \sigma_k \ge 0$ along its 133 | main diagonal. This is sometimes called the ``thin'' SVD. R users will be 134 | familiar with this as the {\tt svd(X)} function, which returns a vector of 135 | $\sigma_i$ values instead of a full diagonal matrix $\Sigma$ but is otherwise 136 | the same. The singular value decomposition is not unique, but is almost so--it 137 | is unique up to the signs of the singular vector elements. 138 | 139 | In the case that $n > p$ it's possible to extend the matrix $U$, by adding $n - 140 | p$ additional orthonormal columns, into a square orthonormal $n\times n$ matrix 141 | $\bar{U}$. Similarly, when $n < p$ we can extend $V$ to a square orthonormal 142 | $p\times p$ matrix $\bar{V}$. The extended matrices are especially useful in 143 | analysis, and available to R users using the function invocation 144 | {\tt{svd(X,$\phantom{,}$nu=n,$\phantom{,}$nv=p)}}. The extended version is 145 | sometimes called the ``full'' SVD or just the SVD in many references and 146 | $\bar{U}^TX\bar{V}=\bar{\Sigma}$ results in an $n\times p$ rectangular diagonal 147 | matrix with the same main diagonal entries $\sigma_1 \ge \sigma_2 \ge \cdots \ge 148 | \sigma_k \ge 0$ as the thin version. 149 | 150 | The columns of $\bar{U}$ are called the {\it left singular vectors} of $X$ and the 151 | columns of $\bar{V}$ are called the {\it right singular vectors}. The $\sigma_i$ are 152 | called {\it singular values} of $X$. The SVD breaks matrix vector 153 | multiplication into three steps: rotation, scaling, then another rotation. 154 | Consider an $n\times p$ matrix 155 | $X$ and its product $y$ with a vector $b\in\R^p$ using the 156 | full SVD 157 | $y=Xb = \bar{U}\bar{\Sigma}\bar{V}^Tb$: 158 | \begin{enumerate} 159 | \item Let $\hat{b}=\bar{V}^T b\in\R^p$. 160 | Since $\bar{V}$ is orthonormal, $\hat{b}$ is simply a rotation of the 161 | vector $b$. 162 | \item Now let $s = \bar{\Sigma}\hat{b}\in\R^n$, 163 | which scales each entry of $\hat{b}$ by the corresponding $\sigma_i$. 164 | \item Finally let $y=\bar{U}s$. This is just another rotation by the 165 | orthonormal matrix $\bar{U}$. 166 | \end{enumerate} 167 | The SVD reveals a lot of information about the structure of the matrix $X$. 168 | Step 2 tells us how much a vector can be scaled by $X$. The rotations in steps 169 | 1 and 3 tell us about its range and null space. The number of nonzero singular 170 | values of $X$ is equal to the \emph{rank} of $X$--the dimension of the range of 171 | $X$ (range means the set of all linear combinations of the columns of $X$ 172 | a.k.a. the span of $X$). 
The {\it condition number} 173 | of $X$, familiar to R users as the \verb+kappa+ function and also written 174 | $\kappa_2(X)$, is the ratio of largest and smallest singular values. It 175 | measures how ill-conditioned the matrix is. Computation involving highly 176 | ill-conditioned matrices can be very sensitive to perturbations like noise or 177 | even numerical precision. 178 | 179 | Let $U^TXV=\Sigma$ be the ``thin'' SVD of $X$ and let 180 | $\bar{U}\in\R^{n\times n}$ and $\bar{V}\in\R^{p\times p}$ be 181 | their extended versions when $n > p$ or $n < p$. Let $r$ be the index 182 | corresponding to the smallest non-zero singular value of $X$, for instance 183 | $\sigma_1 \ge \sigma_2 \ge \cdots \ge \sigma_r > \sigma_{r+1} = 0 = \cdots = \sigma_k$, 184 | where $k=\min\{n,p\}$. Then $\mbox{rank}(X) = r$ and the singular vectors define the 185 | following bases: 186 | \begin{itemize} 187 | \item The first $r$ columns of $U$ form a basis of the range of $X$. 188 | \item The first $r$ columns of $V$ form a basis of the range of $X^T$. 189 | \item If $r
0$ and the 1-norm of the solution vector $b$. The lasso
316 | is the closest convex estimate of the parameterized
317 | \emph{best subset selection problem}:
318 | \[
319 | \min_b\|Xb - y\|^2 + \mu\|b\|_0,
320 | \]
321 | where $\|b\|_0$ means simply the count of nonzero components of $b$.
322 | (Despite the notation, $\|b\|_0$ is not a vector norm
323 | since for any scalar $\lambda$ with $|\lambda |$ not equal to zero or one,
324 | $\|\lambda b\|_0 \ne |\lambda |\|b\|_0$.) Although the best subset selection
325 | problem might seem to be the most natural way to select subsets of columns of
326 | the matrix $X$, the problem is nonconvex and hard to solve--indeed it is
327 | known to be NP hard. We
328 | shall see later that there are other approaches to estimating optimal
329 | column subsets including a
330 | fast heuristic method by Golub called SVD subset
331 | selection~\cite[Section 12.2]{gvl}, and a
332 | newer approach by Lanza, Reichel and others based on Krylov subspace
333 | methods~\cite{lanza}.
334 |
335 | The ordinary least squares solution of linear models has important statistical
336 | properties shown by Gauss~\cite{gauss} and later rediscovered by
337 | Markoff~\cite{markoff}. The least squares solution defines a {\it minimum
338 | variance linear unbiased estimator} (the Gauss--Markov theorem), the technical details of which we leave to the
339 | references, in particular see~\cite{hastie},\cite{MN}.
340 |
341 |
342 | \section*{Generalized linear models}
343 |
344 | Our notes on generalized linear models closely follow the book
345 | ``Generalized Linear Models'' by McCullagh and Nelder~\cite{MN}. That very
346 | readable and practical book remains, in our opinion, the best all-around applied
347 | reference on GLMs and strongly influenced algorithm implementations in the
348 | R language.
349 | McCullagh and Nelder describe generalizations of the basic linear model
350 | in three parts:
351 | \begin{enumerate}
352 | \item A \emph{random component} describing the distribution of the
353 | measured entries of a response vector $y$ and their vector of
354 | expected values $\mu = E(y)\in\R^n$.
355 | \item A \emph{systematic component} $\eta = X\beta$ that is just a basic
356 | linear model involving a vector $\eta\in\R^n$, model matrix $X\in\R^{n\times p}$ and
357 | coefficient solution vector $\beta\in\R^p$.
358 | \item A \emph{link function} between the random and systematic components,
359 | $\eta = g(\mu)$, applied component-wise to the vector $\mu$.
360 | \end{enumerate}
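For example, in logistic regression the entries of $y$ take the values 0 or 1, the random component models each $y_i$ as a Bernoulli (binomial) random variable with expected value $\mu_i$, the systematic component is $\eta = X\beta$, and the link function is the logit function $\eta_i = g(\mu_i) = \log(\mu_i/(1-\mu_i))$.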
361 | The link function $g$ is assumed to be a real-valued, strictly monotonic, differentiable
362 | (and therefore invertible) function. If $p=n$ and the matrix $X$ is of full
363 | rank, then the model can exactly match the $n$ data observations in $y$ and all
364 | of the variation between observations is consigned to the systematic component
365 | of the model. Such models are usually \emph{overfit} and rarely generalize
366 | well to new data, although they have practical utility as seen in the next
367 | section. When $p=1$ then the model represents a single common $\mu$
368 | for all $n$ data observations and all of the variation in $y$ is
369 | consigned to the random component. Most real-world GLMs lie somewhere
370 | in-between these two extremes.
371 |
372 |
373 | Adding the random component and link function around a basic linear model
374 | lets GLMs model a wider range of scenarios than their OLS cousins. In
375 | particular, the link function lets us model variables $\mu$ that are restricted
376 | to intervals, for instance the interval $[0,1]$ useful for modeling binary
377 | values. And we can use the random component to pair an appropriate
378 | distribution with such values (say, a binomial distribution in the case of 0/1
379 | data). The added modeling flexibility comes with a cost--the link function can
380 | turn finding the solution of GLMs into a nonlinear problem, despite the
381 | underlying linear model assumption in the systematic component.
382 |
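In R, a random component and link function are bundled together in a \emph{family} object, and we use that interface throughout these notes. A quick illustrative aside (the printed values are easy to check by hand):
\begin{lstlisting}
fam = binomial()     # binomial random component with its default (logit) link
fam$linkfun(0.25)    # the link function g(mu) = log(mu / (1 - mu))
fam$linkinv(0)       # the inverse link, here 0.5
fam$variance(0.25)   # the variance function V(mu) = mu * (1 - mu)
fam$mu.eta(0)        # d mu / d eta, the derivative of the inverse link
\end{lstlisting}
These are exactly the ingredients used by the reference implementations that appear later in these notes.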
383 |
384 | These notes assume that the random component distribution describing the
385 | response belongs to a one- or two-parameter \emph{exponential family} of
386 | probability distributions described below. The exponential family covers many
387 | widely used and important cases like logistic/binomial, Bernoulli, multinomial,
388 | exponential, Poisson, Gaussian, and others. Limiting our discussion to models
389 | that fit into the exponential family, despite a superficial mathematical
390 | complexity, greatly simplifies many details.
391 |
392 |
393 | \subsection*{The exponential family of distributions}
394 |
395 | The following sections include a lot of notation and many functions and
396 | parameters to keep track of. Although a bit complicated, nothing presented here
397 | is harder than elementary Calculus. For the most part, we very closely follow
398 | the exposition of McCullagh and Nelder~\cite{MN}, but we expand on it in some
399 | places to help illuminate key ideas.
400 |
401 | The exponential family of distributions consists of probability distributions
402 | whose density (or mass) function can be written in the form
403 | \begin{equation}\label{expfamily}
404 | f(y; \theta, \phi) = \exp\left(\frac{y\theta - b(\theta)}{a(\phi)} + c(y, \phi)\right),
405 | \end{equation}
406 | for parameters $\theta$ and $\phi$ and fixed functions $a,b,$ and $c$.
407 | The notation $f(y; \theta, \phi)$ means a function $f(y)$ that
408 | depends on the given parameters $\theta$ and $\phi$.
409 | Any probability distribution that can be re-written in this form belongs
410 | to the exponential family.
411 |
412 | For instance, let
413 | $\theta=\mu$, $\phi=\sigma^2$, $b(\theta)=\theta^2/2$, $a(\phi)=\phi$
414 | and $c(y, \phi) = -\frac{1}{2}(\frac{y^2}{\sigma^2} + \log(2\pi\sigma^2))$.
415 | Then substituting those values into Equation~\ref{expfamily} yields
416 | \begin{align*}
417 | f(y; \theta, \phi) &=
418 | \exp\left(\frac{y\mu - \mu^2/2}{\sigma^2} - \frac{1}{2}\left(\frac{y^2}{\sigma^2} + \log(2\pi\sigma^2)\right)\right)\\
419 | &= \frac{1}{\sqrt{2\pi\sigma^2}}\exp\left(\frac{-(y - \mu)^2}{2\sigma^2}\right),
420 | \end{align*}
421 | which is a standard expression of a normal distribution,
422 | showing that the normal distribution belongs to the exponential
423 | family.
424 |
425 | Similarly, consider the Poisson distribution with single parameter
426 | $\mu$,
427 | \[
428 | \exp(-\mu)\mu^y/{y!}\,\,.
429 | \]
430 | Let $\theta=\log\mu$, $a(\phi)=1$, $b(\theta)=\exp\theta$, and
431 | $c(y, \phi)=-\log{(y!)}$. Then
432 | \begin{align*}
433 | f(y; \theta, \phi) &= \exp\left(\frac{y\theta - b(\theta)}{a(\phi)} + c(y, \phi)\right)\\
434 | &= \exp\left(\frac{y\log\mu - \exp\theta}{1} - \log y!\right)\\
435 | &= {\exp(y\log\mu - \exp\theta)}/{y!}\\
436 | &= {\exp(y\log\mu - \exp\log\mu)}/{y!}\\
437 | &= \exp(y\log\mu - \mu)/y!\\
438 | &= \exp{(-\mu)}\exp{(y\log\mu)}/y!\\
439 | &= \exp(-\mu)\mu^y/y!\,\,.
440 | \end{align*}
441 | Many other distributions are described by the exponential family.
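As a quick numerical sanity check of the two examples above (an illustrative aside; the helper functions \verb+f_norm+ and \verb+f_pois+ are ours), we can compare the exponential family form against R's built-in density functions:
\begin{lstlisting}
# Normal: theta = mu, phi = sigma^2, a(phi) = phi, b(theta) = theta^2 / 2
f_norm = function(y, mu, sigma2)
  exp((y * mu - mu^2 / 2) / sigma2 - (y^2 / sigma2 + log(2 * pi * sigma2)) / 2)
all.equal(f_norm(1.3, 0.2, 2), dnorm(1.3, mean = 0.2, sd = sqrt(2)))  # TRUE

# Poisson: theta = log(mu), a(phi) = 1, b(theta) = exp(theta), c(y, phi) = -log(y!)
f_pois = function(y, mu) exp(y * log(mu) - mu - lfactorial(y))
all.equal(f_pois(3, 1.7), dpois(3, lambda = 1.7))                     # TRUE
\end{lstlisting}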
442 |
443 | \subsubsection*{The log likelihood function}
444 | If we think of the function $f$ in Equation~\ref{expfamily} as
445 | a function of parameters $\theta$ and $\phi$ given observed data $y$ then
446 | the function describes the likelihood of the observations. Its logarithm,
447 | \begin{equation*}
448 | \ell(\theta, \phi; y) = \frac{y\theta - b(\theta)}{a(\phi)} + c(y, \phi),
449 | \end{equation*}
450 | is called the \emph{log likelihood} function. In this context the
451 | function $b(\theta)$ is called the \emph{cumulant function} and $\phi$
452 | the \emph{dispersion parameter}.
453 | In the usual case where $y$ is a vector
454 | of $n$ independent observations, the log likelihood function
455 | sums the individual contributions:
456 | \begin{equation}\label{loglik}
457 | \ell(\theta, \phi; y) = \sum_{i=1}^n\left(\frac{y_i\theta_i - b(\theta_i)}{a(\phi)} + c(y_i, \phi)\right)
458 | \end{equation}
459 | Next we derive a few basic identities that
460 | will be useful later.
461 |
462 | Let $\partial\ell/\partial\theta$ be the derivative of the log likelihood
463 | function with respect to $\theta$ (how much the function changes as
464 | $\theta$ changes), and similarly $\partial^2\ell/\partial\theta^2$
465 | its 2nd derivative (how much the derivative function changes as
466 | $\theta$ changes). Then
467 | \begin{equation}\label{dl}
468 | \frac{\partial\ell}{\partial\theta} = \frac{y - b'(\theta)}{a(\phi)},
469 | \end{equation}
470 | and,
471 | \begin{equation}\label{d2l}
472 | \frac{\partial^2\ell}{\partial\theta^2} = \frac{-b''(\theta)}{a(\phi)},
473 | \end{equation}
474 | where $b'(\theta)$ means the derivative of the function $b$ taken with
475 | respect to $\theta$.
476 |
477 | Assume that $a(\phi)\ne 0$ and that
478 | the expected value $E(\partial\ell/\partial\theta) = 0$
479 | and also that
480 | $E(\partial^2\ell/\partial\theta^2) + E\left((\partial\ell/\partial\theta)^2\right) = 0$.
481 | Then
482 | \begin{equation}\label{bprime}
483 | 0 = E(\partial\ell/\partial\theta) = \frac{E(y) - b'(\theta)}{a(\phi)}
484 | \qquad\mbox{which means that}\,\,b'(\theta) = E(y).
485 | \end{equation}
486 | Recall above that we sometimes use the alternative notation $\mu=E(y)$
487 | for the expected value of $y$; so $\mu=E(y)=b'(\theta)$.
488 |
489 | Similarly,
490 | \begin{align}
491 | 0 &=
492 | E(\partial^2\ell/\partial\theta^2) + E\left((\partial\ell/\partial\theta)^2\right) \nonumber\\
493 | &=
494 | \frac{-b''(\theta)}{a(\phi)} +
495 | E\left(\left(\frac{y - b'(\theta)}{a(\phi)}\right)^2\right) \nonumber\\
496 | &= \frac{-b''(\theta)}{a(\phi)} +
497 | E\left(\frac{y^2 - 2yb'(\theta) + b'(\theta)^2}{a(\phi)^2}\right)\nonumber\\
498 | &= \frac{-b''(\theta)}{a(\phi)} +
499 | \frac{E(y^2) - E(y)^2}{a(\phi)^2} \qquad(\mbox{substituting}\,\,b'(\theta)=E(y))\nonumber\\
500 | &= \frac{-b''(\theta)}{a(\phi)} + \frac{V(y)}{a(\phi)^2}\nonumber\\
501 | &\mbox{which means that}\,\,a(\phi)b''(\theta) = V(y)\label{b2},
502 | \end{align}
503 | where $V(y)$ is the usual definition of the variance function for $y$.
504 |
505 | Finally for this section, one more useful identity showing that the
506 | rate of change of the expected value of $y$ with respect to the parameter $\theta$
507 | is a multiple of the variance function $V(y)$:
508 | \begin{align}
509 | \frac{d}{d\theta}E(y) &= \frac{d}{d\theta}\mu &\mbox{(just notation)}\nonumber\\
510 | &= \frac{d}{d\theta}b'(\theta) &\mbox{(by Equation \ref{bprime})}\nonumber\\
511 | &= b''(\theta)\nonumber\\
512 | &= V(y)/a(\phi)\label{mutheta} &\mbox{(by Equation \ref{b2})}.
513 | \end{align}
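For example, with the Poisson parameterization given earlier we have $b(\theta)=\exp\theta$ and $a(\phi)=1$, so $b'(\theta) = \exp\theta = \mu = E(y)$ and $a(\phi)b''(\theta) = \exp\theta = \mu$, recovering the familiar fact that a Poisson-distributed variable has variance equal to its mean.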
514 |
515 |
516 |
517 |
518 |
519 | \subsection*{GLMs and the exponential family}
520 |
521 | The last section introduced the random component of generalized linear models
522 | and corresponding log likelihood function for the exponential family of
523 | distributions. This section puts that together with the remaining
524 | generalizations, the systematic component's linear model $\eta = X\beta$ and
525 | the link function $\eta=g(\mu)$.
526 |
527 | One approach for solving generalized linear models is to find the value of the
528 | coefficient vector $\beta$ that maximizes the value of the log likelihood
529 | function in Equation~\ref{loglik}. Solving for such a \emph{maximum-likelihood
530 | solution} is the main goal of this section. We can phrase the solution as a
531 | standard nonlinear least squares problem by recasting the maximum likelihood
532 | problem in terms of a minimum residual problem using \emph{deviance residuals}.
533 |
534 | \subsubsection*{Deviance residuals}
535 |
536 |
537 |
538 | \subsubsection*{Jacobian}
539 |
540 | In order to find a maximum using Calculus, we will need (at least) an
541 | expression for the derivative of the log likelihood function with respect to
542 | each component of the solution $\beta_j$, $\partial{l}/\partial{\beta_j}$.
543 |
544 | Writing the $n \times p$ matrix $X$ showing each column as
545 | $X = [x_1, x_2, \cdots, x_p]$, then
546 | $\eta = X\beta = x_1\beta_1 + x_2\beta_2 + \cdots + x_p\beta_p$,
547 | and
548 | \begin{equation}\label{eta}
549 | \partial\eta/\partial\beta_j = x_j.
550 | \end{equation}
551 |
552 | Then using the chain rule from Calculus, the derivative of the log likelihood
553 | function with respect to each component of the solution $\beta_j$ is
554 | \begin{align}
555 | \frac{\partial\ell}{\partial\beta_j}
556 | &= \frac{\partial\ell}{\partial\theta}\frac{d\theta}{d\mu}\frac{d\mu}{d\eta}\frac{\partial\eta}{\partial\beta_j} \nonumber\\
557 | &= \frac{\partial\ell}{\partial\theta}\frac{1}{V(\mu)}\frac{d\mu}{d\eta}\frac{\partial\eta}{\partial\beta_j}
558 | &\mbox{(by Equation \ref{mutheta})} \nonumber\\
559 | &= \frac{\partial\ell}{\partial\theta}\frac{1}{V(\mu)}\frac{d\mu}{d\eta}x_j
560 | &\mbox{(by Equation \ref{eta})} \nonumber\\
561 | &= \frac{y- b'(\theta)}{a(\phi)}\frac{1}{V(\mu)}\frac{d\mu}{d\eta}x_j
562 | &\mbox{(by Equation \ref{dl})} \nonumber\\
563 | &= w\frac{y-\mu}{a(\phi)}\frac{d\eta}{d\mu}x_j,
564 | &\mbox{(by Equations \ref{bprime} and \ref{w})} \label{dldb}
565 | \end{align}
566 | where $w$ is defined as a multiple of the inverse variance function:
567 | \begin{equation}\label{w}
568 | w = \left(\frac{d\mu}{d\eta}\right)^2 \bigg/ V(\mu).
569 | \end{equation}
570 | Since $\eta=g(\mu)$, the term $\frac{d}{d\mu}\eta$ in Equation~\ref{dldb} is
571 | simply $g'$, the derivative of the link function. We remark that, in the usual
572 | case that the response $y$ is a vector of $n$ independent observations and $X$ is an
573 | $n\times p$ matrix, Equation~\ref{dldb} holds entrywise and defines the
574 | $n\times p$ Jacobian matrix of the log likelihood function
575 | with $ij^{th}$ entry
576 | \begin{equation}\label{jacobian}
577 | J(\beta)_{ij} = \frac{\partial\ell_i}{\partial\beta_j}
578 | = w_i\frac{y_i-\mu_i}{a(\phi)}\frac{d\eta_i}{d\mu}x_{ij},
579 | \qquad i=1, 2, \ldots, n,\,\,j=1,2,\ldots,p.
580 | \end{equation}
581 | %Assume that the dispersion function $a(\phi)$ is constant with respect to the
582 | %solution $\beta$. Then the maximum of the log likelihood function $\ell$ with
583 | %respect to each solution component $\beta_j$ for $j=1, 2, \ldots, p$ occurs when
584 | %\begin{equation}\label{max_loglik}
585 | %\sum_{i=1}^n w_i(y-\mu)_i\frac{d}{d\mu}\eta_ix_{ij} = 0.
586 | %\end{equation}
587 |
588 |
589 | \subsubsection*{Canonical link functions}
590 | Recall that
591 | the link function relates $\eta$ and $\mu$ by $\eta=g(\mu)$,
592 | and therefore also their derivatives $d\eta/d\mu = g'(\mu)$. Choosing a special \emph{canonical link function}
593 | results in a number of simplifications. Chief among them for our
594 | purposes, a canonical link connects $d\eta/d\mu$ to the variance function by
595 | \begin{equation}
596 | \label{canonical}
597 | {d\eta}/{d\mu}={1}/{V(\mu)}.\qquad\mbox{(canonical link case)}
598 | \end{equation}
599 | When $g$ is a canonical link function we get a simplification
600 | for $w$ using~\ref{canonical}:
601 | \begin{align}
602 | w &= \left(\frac{d\mu}{d\eta}\right)^2 \bigg/ V(\mu) \nonumber \\
603 | &= V^2(\mu) / V(\mu) \nonumber \\
604 | &= V(\mu)\qquad\mbox{(canonical link case)}. \label{W_canonical}
605 | \end{align}
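For instance, R's binomial family uses the canonical logit link by default, so Equation~\ref{W_canonical} is easy to check numerically (a short illustrative aside):
\begin{lstlisting}
fam = binomial()                            # canonical (logit) link
eta = c(-2, -0.5, 0, 1, 3)
mu  = fam$linkinv(eta)
w   = fam$mu.eta(eta)^2 / fam$variance(mu)  # Equation (w)
all.equal(w, fam$variance(mu))              # TRUE: w = V(mu) for a canonical link
\end{lstlisting}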
606 |
607 |
608 |
609 | \subsection*{Maximum likelihood solutions based on first-order approximations}
610 |
611 | Recall the Jacobian of the log likelihood function from Equation~\ref{jacobian}; the gradient of $\ell$ with respect to $\beta$ is the vector of its column sums, and that gradient is all a first-order method needs.
612 |
613 |
614 |
615 | Assemble the entries of $w$ from Equation~\ref{w} along the diagonal of an
616 | $n\times n$ diagonal matrix $W$:
617 | \begin{equation}\label{W}
618 | W_{ij} = \bigg\{\begin{array}{cr}
619 | w_i & \mbox{if $i=j$}, \\
620 | 0 & \mbox{otherwise}.
621 | \end{array}
622 | \end{equation}
623 | Similarly let $G$ be the $n\times n$ diagonal matrix with entries $G_{ii}=g'(\mu_i)$. Then, dropping the constant factor $1/a(\phi)$ (which does not affect the maximizing $\beta$), a compact formula for the gradient of $\ell$ with respect to $\beta$ is
624 | \begin{equation}\label{gradient}
625 | \nabla_\beta\ell = X^TWG(y - \mu).
626 | \end{equation}
627 |
628 | In the canonical link case $WG=I$ by Equations~\ref{canonical} and~\ref{W_canonical}, and the gradient reduces to $X^T(y-\mu)$. At this point, we have enough information from Equations~\ref{loglik}
629 | and~\ref{gradient} to formulate a first-order solution method to finding the
630 | maximum likelihood GLM solution. Possible solution methods include gradient
631 | descent, Gauss-Newton, conjugate gradient, or a quasi-Newton approach.
632 | The following example cooks up a very basic GLM solver using gradient
633 | descent.
634 |
635 | The sketch below is one minimal possibility (our own illustration, not a robust solver; the function name \verb+glm_gradient+ and the fixed step size are ours). It repeatedly steps along the gradient of Equation~\ref{gradient}, computed with the help of an R family object.
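\begin{lstlisting}
# Gradient-descent sketch for maximum likelihood GLM fitting (illustration only).
# The gradient of the log likelihood is X^T W G (y - mu) (Equation (gradient));
# in family() terms the diagonal entries of W G are mu.eta(eta) / variance(mu).
glm_gradient = function(X, y, family = binomial, maxit = 50000, step = 1e-4, tol = 1e-8)
{
  if(is.function(family)) family = family()
  beta = rep(0, ncol(X))
  for(i in 1:maxit)
  {
    eta  = drop(X %*% beta)
    mu   = family$linkinv(eta)
    grad = drop(crossprod(X, (y - mu) * family$mu.eta(eta) / family$variance(mu)))
    beta = beta + step * grad               # a small step up the likelihood surface
    if(sqrt(sum((step * grad)^2)) < tol) break
  }
  list(coefficients = beta, iterations = i)
}
\end{lstlisting}
This converges slowly and the step size has to be tuned by hand, but it needs nothing more than the gradient, which makes first-order methods like this attractive for very large problems.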
636 |
637 |
638 |
639 | \subsection*{Maximum likelihood solution by Newton's method}
640 |
641 | We can do better than the first-order solution methods of the last section. Because
642 | we restricted our problems to the exponential family, we can formulate an
643 | analytic representation of the second derivatives of the log likelihood
644 | function to form a Hessian matrix. That knowledge enables us to employ solution
645 | methods using second-order (quadratic) approximations like Newton's
646 | method--such methods typically enjoy much faster (locally quadratic)
647 | convergence than first order solution methods.
648 |
649 | We need to differentiate the expression for the derivative of $\ell$ in
650 | Equation~\ref{dldb} with respect to each component
651 | $\beta_k$ to compute the entries of the second-derivative Hessian matrix,
652 | and again we assume that the dispersion term $a(\phi)$ is constant
653 | with respect to the solution $\beta$. Then the $jk$-entry of the
654 | Hessian matrix is:
655 | \begin{align}
656 | H_{jk} &= \frac{\delta^2\ell}{\delta\beta_k\delta\beta_j}
657 | = \frac{\delta}{\delta\beta_k}\frac{\delta\ell}{\delta\beta_j} \nonumber\\
658 | &=
659 | \frac{\delta}{\delta\beta_k}
660 | \sum_i\left(
661 | w_i(y-\mu)_i\frac{d\eta_i}{d\mu}x_{ij} \right)\nonumber \\
662 | &= \sum_i\left(
663 | (y-\mu)_i
664 | \frac{\delta}{\delta\beta_k}
665 | w_i\frac{d\eta_i}{d\mu}x_{ij}
666 | +
667 | w_i\frac{d\eta_i}{d\mu}x_{ij}\frac{\delta}{\delta\beta_k}(y-\mu)_i \label{hsum}
668 | \right)
669 | \end{align}
670 | where the last equality uses the product rule from Calculus.
671 | At this point, we have an (unwieldy) expression for the Hessian and
672 | we can plug that together with the gradient function from Equation~\ref{gradient}
673 | into Newton's method to get a maximum likelihood GLM solver.
674 | However, we will consider an important special case next that is much simpler.
675 |
676 | \subsubsection*{Canonical link case}
677 | When $g$ is a canonical link function
678 | the expression $w_i\frac{d\eta_i}{d\mu}$ in the
679 | first term of the sum in Equation~\ref{hsum}
680 | is constant because in such cases
681 | $d\eta/d\mu=1/V(\mu)$ and
682 | $w=V(\mu)$
683 | by Equations~\ref{canonical} and~\ref{W_canonical}. Thus, its derivative
684 | \[
685 | \frac{\delta}{\delta\beta_k} w_i\frac{d\eta_i}{d\mu} = 0,
686 | \]
687 | that is, the first term of the Hessian in Equation~\ref{hsum}
688 | drops out when $g$ is a canonical link function.
689 | Meanwhile, consider the second term
690 | \begin{align*}
691 | \sum_i\left(
692 | w_i\frac{d\eta_i}{d\mu}x_{ij}\frac{\delta}{\delta\beta_k}(y-\mu)_i
693 | \right) &=
694 | -\sum_i\left(
695 | w_i\frac{d\eta_i}{d\mu}x_{ij}\frac{\delta\mu_i}{\delta\beta_k} \right) &\mbox{($y$ is constant wrt $\beta$, so $\frac{\delta}{\delta\beta_k}(y-\mu)_i = -\frac{\delta\mu_i}{\delta\beta_k}$)} \\
696 | &=
697 | -\sum_i\left(
698 | w_ix_{ij}\frac{\delta\eta_i}{\delta\beta_k} \right)&\mbox{(chain rule)}\\
699 | &=
700 | -\sum_i\left(
701 | w_ix_{ij}x_{ik} \right), &\mbox{(by Equation \ref{eta})}
702 | \end{align*}
703 | finally arriving (with a substantial subscript-induced headache) at a compact
704 | expression for the Hessian using the definition of $W$ from Equation~\ref{W}:
705 | \begin{equation}\label{Hessian}
706 | H = -X^T W X.
707 | \end{equation}
708 |
709 | With an expression for the gradient from Equation~\ref{gradient} and Hessian
710 | from Equation~\ref{Hessian} of the log likelihood function, we have all we need
711 | to implement a second order solution method. The next example uses R's
712 | \verb+nlm+ function to find the maximum likelihood GLM solution using Newton's
713 | method.
714 |
715 | One possible sketch (our own illustration; the function name \verb+glm_newton_nlm+ is ours): minimize the deviance, which is equivalent to maximizing the log likelihood, and hand \verb+nlm+ the analytic gradient $-2X^TWG(y-\mu)$ and Hessian $2X^TWX$ as attributes of the objective value.
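\begin{lstlisting}
# Newton-type fit using nlm() on the deviance (illustration only).
# nlm() uses "gradient" and "hessian" attributes of the objective value when present.
glm_newton_nlm = function(X, y, family = binomial)
{
  if(is.function(family)) family = family()
  deviance = function(beta)
  {
    eta = drop(X %*% beta)
    mu  = family$linkinv(eta)
    w   = family$mu.eta(eta)^2 / family$variance(mu)    # Equation (w)
    f   = sum(family$dev.resids(y, mu, rep(1, length(y))))
    attr(f, "gradient") = -2 * drop(crossprod(X, (y - mu) * family$mu.eta(eta) / family$variance(mu)))
    attr(f, "hessian")  =  2 * crossprod(X, w * X)      # exact for canonical links
    f
  }
  fit = nlm(deviance, rep(0, ncol(X)))
  list(coefficients = fit$estimate, iterations = fit$iterations)
}
\end{lstlisting}
For non-canonical links the matrix $2X^TWX$ supplied above is the expected (Fisher) information rather than the exact Hessian, so the iteration amounts to Fisher scoring rather than a pure Newton method.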
716 |
717 | Remember that this derivation assumed that $g$ is a canonical link function. In
718 | the general case we need to resort to the definition of the Hessian in
719 | Equation~\ref{hsum} for a Newton's method-based solution.
720 |
721 |
722 |
723 | \subsection*{Iteratively re-weighted least squares}
724 |
725 |
726 |
727 |
728 | The numerical solution of model problems of this form was carefully analyzed by
729 | Paige\cite{paige}.
730 | When the entries of $W$ are non-zero, the generalized linear model
731 | \ref{glm} results in a weighted nonlinear least squares problem
732 | typically solved by the iteratively re-weighted least squares method
733 | shown in Algorithm \ref{irls} and defined carefully by Bj\"orck~\cite{bjork}.
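As a brief sketch of where the weighted least squares problem comes from (using the notation of the previous sections), a Newton/Fisher scoring step for maximizing $\ell$ is
\begin{align*}
\beta^{(t+1)} &= \beta^{(t)} - H^{-1}\nabla_\beta\ell
 = \beta^{(t)} + (X^TWX)^{-1}X^TWG(y - \mu)\\
 &= (X^TWX)^{-1}X^TWz,
\end{align*}
where $z = X\beta^{(t)} + G(y - \mu) = \eta + g'(\mu)(y-\mu)$ is the so-called working response. In other words, each iteration solves the weighted least squares problem $\min_\beta\|W^{1/2}(z - X\beta)\|^2$ whose weights $W$ and response $z$ are recomputed from the current iterate; hence the name. This is exactly the update carried out by the reference implementations accompanying these notes.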
734 |
735 |
736 | \subsection*{Numerical implementation issues}
737 |
738 | cover edge cases here including zero-variance observations (constant rows in $X$)
739 | and singular/ill-conditioned $X$.
740 |
741 | Introduce R's rank revealing QR irwls approach.
742 |
743 | SVD-irwls based on algorithm by O'Leary
744 |
745 | comparison/examples between R's RRQR-irwls and SVD-irwls
746 |
747 | large-scale problems and 1st order solution methods
748 |
749 |
750 | \subsubsection*{Round off error in QR- and SVD-based methods}
751 |
752 |
753 |
754 |
755 |
756 |
757 |
758 |
759 |
760 |
761 |
762 | \section*{Copyright}
763 | Copyright \copyright 2014 Michael Kane and Bryan W. Lewis
764 |
765 | \begin{quote}
766 | Permission is granted to copy, distribute and/or modify this document
767 | under the terms of the GNU Free Documentation License, Version 1.3
768 | or any later version published by the Free Software Foundation;
769 | with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.
770 | A copy of the license is included in the Github project files.
771 | \end{quote}
772 |
773 |
774 |
775 | \begin{thebibliography}{99}
776 | \bibitem{anda} Anda, A. and Park, H., Self-scaling fast rotations for stiff least squares problems, Lin. Alg. Appl., 234, 1996, pp. 137-162.
777 | \bibitem{bjork} Bj\"orck, \AA., Numerical Methods for Least Squares Problems, SIAM, Philadelphia, 1996.
778 | \bibitem{bates} Bates, D., \url{http://www.stat.wisc.edu/courses/st849-bates/lectures/GLMH.pdf}.
779 | \bibitem{dekker}
780 | Dekker, Theodorus Jozef. ``A floating-point technique for extending the available precision.'' Numerische Mathematik 18.3 (1971): 224-242.
781 | \bibitem{friedman} Friedman, Jerome, Trevor Hastie, and Rob Tibshirani. ``Regularization paths for generalized linear models via coordinate descent.'' Journal of Statistical Software 33.1 (2010): 1.
782 | \bibitem{glmnet} Friedman, Hastie, Tibshirani, Simon, Narasimhan, Qian, \url{https://cran.r-project.org/package=glmnet}.
783 | \bibitem{gauss} Gauss, C. F., Theoria combinationis observationum erroribus minimis obnoxiae, Pars prior, 1863 (1st written 1821).
784 | \bibitem{hastie} Hastie, T. J. and Pregibon, D., Generalized linear models, Chapter 6 of Statistical Models in S, eds J. M. Chambers and T. J. Hastie, Wadsworth \& Brooks/Cole, 1992.
785 | \bibitem{fmm} Forsythe, George Elmer, Cleve B. Moler, and Michael A. Malcolm. ``Computer methods for mathematical computations.'' (1977).
786 | \bibitem{gvl} Golub, Gene H., and Charles F. Van Loan. Matrix computations. Vol. 3. JHU Press, 2012.
787 | \bibitem{higham96} Higham, Nicholas J. Accuracy and stability of numerical algorithms. Vol. 80. SIAM, 2002.
788 | \bibitem{horn-johnson} Horn, Roger A., and Charles R. Johnson. Matrix analysis. Cambridge university press, 1990.
789 | \bibitem{jordan} Jordan, Michael, \url{https://people.eecs.berkeley.edu/~jordan/courses/260-spring10/other-readings/chapter8.pdf} (online notes).
790 | \bibitem{lamport} Lamport, Leslie. How to write a proof. The American mathematical monthly 102.7 (1995): 600-608.
791 | \bibitem{lanza} Lanza, Alessandro, et al. A Generalized Krylov Subspace Method for $\ell_p-\ell_q$ Minimization. SIAM Journal on Scientific Computing 37.5 (2015): S30-S50.
792 | \bibitem{lumley} Lumley, T, \url{http://cran.r-project.org/web/packages/biglm}.
793 | \bibitem{markoff} Markoff, A., Wahrscheinlichkeitsrechnung, Leipzig, 1912.
794 | \bibitem{MN} McCullagh P. and Nelder, J. A., Generalized Linear Models, Chapman and Hall, London 1989.
795 | \bibitem{oleary} O'Leary, D., Robust regression computation using iteratively reweighted least squares, Siam J. Mat. Anal. Appl., Vol. 11 No. 3, 1990, pp. 466-480.
796 | \bibitem{paige} Paige, C. C., Fast numerically stable computations for generalized least squares problems, Siam J. Num. Anal., 16, 1979, pp. 165-171.
797 | \bibitem{R} The R project \url{http://www.r-project.org}.
798 | \bibitem{trefbau} Trefethen, Lloyd N., and David Bau III. Numerical linear algebra. Vol. 50. SIAM, 1997.
799 | \bibitem{zhou} Zhou, H. and Hastie, T., Regularization and Variable Selection via the Elastic Net, J. Royal Statistical Society, B, 2005, pp. 301-320.
800 | \end{thebibliography}
801 |
802 |
803 | \end{document}
804 |
--------------------------------------------------------------------------------
/implementations.R:
--------------------------------------------------------------------------------
1 | # Example iteratively re-weighted least squares (IRLS) implementations
2 | # Mike Kane & Bryan Lewis, 2013-2014.
3 | #
4 | # The implementations generally follow the same input/output pattern. They
5 | # take as inputs a model matrix A, a response vector b whose length is the
6 | # number of rows of A, an R 'family' function that defines the error
7 | # distribution family and link function, a maximum number of iterations, and an
8 | # iteration convergence tolerance. The methods produce a list with two
9 | # elements, the model coefficients and the number of iterations.
10 |
11 | # The most basic IRLS method, and the shortest implementation we could come
12 | # up with. This method solves the normal equations associated with a weighted
13 | # least squares problem in each iteration.
14 | irls =
15 | function(A, b, family=binomial, maxit=25, tol=1e-08)
16 | {
17 | x = rep(0,ncol(A))
18 | for(j in 1:maxit)
19 | {
20 | eta = drop(A %*% x)
21 | g = family()$linkinv(eta)
22 | gprime = family()$mu.eta(eta)
23 | z = eta + (b - g) / gprime
24 | W = drop(gprime^2 / family()$variance(g))
25 | xold = x
26 | x = solve(crossprod(A, W * A), crossprod(A, W * z), tol=2*.Machine$double.eps)
27 | if(sqrt(drop(crossprod(x - xold))) < tol) break
28 | }
29 | list(coefficients=x, iterations=j)
30 | }
31 |
32 | # A method discussed by O'Leary that uses a QR factorization of the model
33 | # matrix. This method should be much more numerically stable in the face of
34 | # ill-conditioned model matrices than the simple method defined above. If the
35 | # QR factorization uses Givens rotations, this method is numerically stable for
36 | # stiff problems too.
37 | irls_qrnewton =
38 | function(A, b, family=binomial, maxit=25, tol=1e-08)
39 | {
40 | s = t = 0
41 | QR = qr(A)
42 | Q = qr.Q(QR)
43 | R = qr.R(QR)
44 | for(j in 1:maxit)
45 | {
46 | g = family()$linkinv(t)
47 | gprime = family()$mu.eta(t)
48 | z = t + (b - g) / gprime
49 | W = as.vector(gprime^2 / family()$variance(g))
50 | wmin = min(W)
51 | if(wmin < sqrt(.Machine$double.eps))
52 | warning("Tiny weights encountered")
53 | s_old = s
54 | C = chol(crossprod(Q, W*Q))
55 | s = forwardsolve(t(C), crossprod(Q,W*z))
56 | s = backsolve(C,s)
57 | t = Q %*% s
58 | if(sqrt(crossprod(s - s_old)) < tol) break
59 | }
60 | x = backsolve(R, crossprod(Q,t))
61 | list(coefficients=x,iterations=j)
62 | }
63 |
64 | # The next method is a minor variation on the QR Newton method defined above
65 | # that uses the SVD instead. It exhibits similar numerical stability and can
66 | # definitively check model matrix rank deficiency, at the cost of computing
67 | # the SVD instead of the QR factorization up front.
68 | irls_svdnewton =
69 | function(A, b, family=binomial, maxit=25, tol=1e-08)
70 | {
71 | s = t = 0
72 | S = svd(A)
73 | if(min(S$d)/max(S$d)