├── README.md ├── compare.R ├── glm.svd.r ├── glm.tex ├── implementations.R └── sparse_n_dense.R /README.md: -------------------------------------------------------------------------------- 1 | Generalized linear models, abridged. 2 | =============== 3 | 4 | See our old notes on this project at: http://bwlewis.github.io/GLM. 5 | 6 | Very revised and updated notes are in process as of October, 2018. 7 | 8 | 9 | Here are some slides from a talk at the Cleveland R User Group: https://bwlewis.github.io/GLM/October_2018_CLERUG.html 10 | 11 | Our current experimental reference SVD-based GLM implementation, based in turn 12 | on Dianne O'Leary's QR implementation from 1990, can be found here: 13 | https://github.com/bwlewis/GLM/blob/master/glm.svd.r That code is robust and 14 | fast and replicates R's column subset selection routine by falling back to R's 15 | default rank-revealing QR factorization in edge cases. But it's still a work in 16 | progress. 17 | -------------------------------------------------------------------------------- /compare.R: -------------------------------------------------------------------------------- 1 | source("implementations.R") 2 | data("Contraception",package="mlmRev") 3 | # Model estimated with R's glm function, returning model matrix and response 4 | # in $x and $y, respectively: 5 | R_GLM = glm(formula = use ~ age + I(age^2) + urban + livch, family = binomial, x=TRUE, data=Contraception) 6 | # Model estimated with our radically stripped-down minimalist implementation: 7 | mini = irls(R_GLM$x, R_GLM$y, family=binomial) 8 | print(data.frame(R_GLM=coef(R_GLM), minimalist=coef(mini))) 9 | 10 | iqrn = irls_qrnewton(R_GLM$x, R_GLM$y, family=binomial) 11 | print(data.frame(R_GLM=coef(R_GLM), qr_newton=coef(iqrn))) 12 | 13 | isvdn = irls_svdnewton(R_GLM$x, R_GLM$y, family=binomial) 14 | print(data.frame(R_GLM=coef(R_GLM), svd_newton=coef(isvdn))) 15 | 16 | # Let's test the sparse-aware IRLS example. But we need some data prep for it 17 | # first. The 1st three columns of our model matrix are dense: 18 | library("Matrix") 19 | A_dense = Matrix(R_GLM$x[,1:3], sparse=FALSE) 20 | # The next four columns are sparse: 21 | A_sparse = Matrix(R_GLM$x[,4:7], sparse=TRUE) 22 | isparse = irls_sparse(A_dense, A_sparse, R_GLM$y, family=binomial) 23 | print(data.frame(R_GLM=coef(R_GLM), irls_sparse=coef(isparse))) 24 | 25 | # Let's test the incremental implementation... 26 | # Write out the model matrix to a data file for the incremental example. 27 | write.table(R_GLM$x, file="data.csv", sep=",", col.names=FALSE, row.names=FALSE) 28 | inc = irls_incremental("data.csv", 500, R_GLM$y, family=binomial) 29 | print(data.frame(R_GLM=coef(R_GLM), incremental=coef(inc))) 30 | -------------------------------------------------------------------------------- /glm.svd.r: -------------------------------------------------------------------------------- 1 | #' Fitting Generalized Linear Models 2 | #' 3 | #' Similar to \code{glm.fit} but uses the SVD to detect ill-conditioned 4 | #' problems and conducts IRWLS in projected subspace for efficiency. 
5 | #'
6 | #' @param X an n by p real-valued dense model matrix
7 | #' @param y a response vector of length n
8 | #' @param family a family function or the result of a call to a family function
9 | #' @param maxit maximum number of IRWLS iterations (an integer)
10 | #' @param tol IRWLS positive convergence tolerance
11 | #' @param stol positive numerical condition tolerance
12 | #' @param singular.ok if FALSE a numerically-singular fit stops with an error
13 | #' @param weights vector of observation weights
14 | #' @param reg.method indicates the regularization approach: 'column projection' follows R's GLM approach; 'minimum norm' finds the LS solution of minimal norm.
15 | #' @param LAPACK if FALSE use R's column-ordered subset selection when \code{reg.method == 'column projection'}, otherwise use the default LAPACK pivots.
16 | #' @return A list with the model coefficients b, the number of IRWLS iterations, and column pivoting indices.
17 | #' Specifically, the returned list has entries:
18 | #' \describe{
19 | #' \item{b:}{ model coefficients}
20 | #' \item{iterations:}{ number of IRWLS iterations}
21 | #' \item{rank:}{ rank of model matrix}
22 | #' \item{pivot:}{ model matrix column pivot}
23 | #' }
24 | #' @seealso \code{\link{glm.fit}}
25 | glm.svd =
26 | function(X, y, family=binomial, maxit=25, tol=1e-10, stol=1e-10,
27 | singular.ok=TRUE, weights,
28 | reg.method=c("column projection", "minimum norm"),
29 | LAPACK=FALSE)
30 | {
31 | singular = ifelse(singular.ok, warning, stop)
32 | reg.method = match.arg(reg.method)
33 | if(is.list(X)) S = X
34 | else S = svd(X)
35 | V = S$v
36 | nvars = NCOL(S$u)
37 | idx = seq(nvars)
38 | i = (S$d / S$d[1]) > stol
39 | k = sum(i)
40 | pivot = seq(nvars)
41 | if (k < nvars)
42 | {
43 | singular("Singular system detected of rank: ", k, " using threshold: ", stol)
44 | if(reg.method == "column projection")
45 | {
46 | Q = qr(t(S$v[, 1:k]), LAPACK=LAPACK) # Golub SVD subset selection heuristic
47 | # when LAPACK=FALSE uses R's custom pivoting strategy
48 | pivot = Q$pivot
49 | idx = sort(head(pivot, k))
50 | omit = tail(Q$pivot, nvars - k)
51 | # XXX we can maybe instead use a slightly cheaper downdating svd scheme here:
52 | S_new = svd(X[, -omit])
53 | # double-check that this worked (it may not have), if not resort to
54 | # something else... XXX can this be improved?
55 | if((tail(S_new$d, 1) / S_new$d[1]) <= stol)
56 | {
57 | warning("Whoops! 
SVD subset selection failed, trying dqrdc2 on full matrix") 58 | if(is.list(X)) Q = qr(X$u %*% (X$d * t(X$v)), LAPACK=FALSE) 59 | else Q = qr(X, LAPACK=FALSE) 60 | pivot = Q$pivot 61 | idx = sort(head(pivot, k)) 62 | omit = tail(Q$pivot, nvars - k) 63 | S_new = svd(X[, -omit]) 64 | } 65 | S = S_new 66 | message("omittig column(s) ", paste(omit, collapse=",")) 67 | } 68 | } 69 | 70 | s = rep(0, ncol(S$u)) 71 | if(!is(family, "family")) family = family() 72 | nobs = NROW(y) # needed by the initialize expression below 73 | nvars = NCOL(S$u) # ditto 74 | if(missing(weights)) weights = rep(1, nobs) 75 | variance = family$variance 76 | linkinv = family$linkinv 77 | mu.eta = family$mu.eta 78 | etastart = NULL 79 | eval(family$initialize) 80 | eta = family$linkfun(mustart) 81 | dev.resids = family$dev.resids 82 | dev = sum(dev.resids(y, linkinv(eta), weights)) 83 | devold = 0 84 | for(j in 1:maxit) 85 | { 86 | g = linkinv(eta) 87 | varg = variance(g) 88 | if(any(is.na(varg))) stop("NAs in variance of the inverse link function") 89 | if(any(varg==0)) stop("Zero value in variance of the inverse link function") 90 | gprime = mu.eta(eta) 91 | if(any(is.na(gprime))) stop("NAs in the inverse link function derivative") 92 | z = eta + (y - g) / gprime 93 | W = weights * as.vector(gprime^2 / varg) 94 | # The following is as well-conditioned as W is 95 | C = chol(crossprod(S$u, W*S$u), pivot=TRUE) 96 | piv = attr(C, "pivot") 97 | s = forwardsolve(t(C), crossprod(S$u, W*z)[piv]) 98 | s = backsolve(C, s)[order(piv)] 99 | eta = drop(S$u %*% s) 100 | dev = sum(dev.resids(y, g, weights)) 101 | if(abs(dev - devold) / (0.1 + abs(dev)) < tol) break 102 | devold = dev 103 | # R essentially computes this (via dqrdc2.f) 104 | ## Q = qr(W * X) 105 | ## omit = tail(Q$pivot, ncol(X) - Q$rank) 106 | ## now omit columns and solve... 107 | ## fit = qr.solve(W * X, W * z) 108 | ## eta = drop(x %*% fit) 109 | ## g = linkinv(eta) 110 | } 111 | x = rep(NA, NCOL(X)) 112 | inv = 1/S$d 113 | if(reg.method == "minimum norm") inv[inv > 1/stol] = 1 114 | x[idx] = drop(S$v %*% (s*inv)) 115 | list(coefficients=x,iterations=j, rank=k, pivot=pivot) 116 | } 117 | -------------------------------------------------------------------------------- /glm.tex: -------------------------------------------------------------------------------- 1 | \documentclass[10pt]{article} %See documentation for other class options 2 | \usepackage{tabularx} 3 | \usepackage{color} 4 | %\usepackage{fixltx2e,fix-cm} 5 | %\usepackage{makeidx} 6 | %\usepackage{multicol} 7 | \usepackage{mathtools} 8 | \usepackage{listings} 9 | \usepackage{amsmath} 10 | \usepackage{amssymb} 11 | \usepackage{hyperref} 12 | \usepackage{cite} 13 | \usepackage{authblk} 14 | \usepackage{graphicx} 15 | \usepackage{float} 16 | \newtheorem{thm}{Theorem} 17 | \newtheorem{lemma}{Lemma} 18 | \newcommand{\R}{{\mathbb R}} 19 | 20 | 21 | \begin{document} 22 | 23 | \floatstyle{ruled} 24 | \newfloat{program}{thp}{lop} 25 | \floatname{program}{Program} 26 | \newfloat{algorithm}{thp}{lop} 27 | \floatname{algorithm}{Algorithm} 28 | 29 | \setlength{\parindent}{0pt} 30 | \setlength{\parskip}{0.2em} 31 | 32 | \definecolor{verbgray}{gray}{0.9} 33 | \definecolor{verbgray2}{gray}{0.975} 34 | \lstset{backgroundcolor=\color{verbgray}, 35 | frame=single, 36 | framerule=0pt, 37 | basicstyle=\ttfamily, 38 | keepspaces=true, 39 | columns=fullflexible} 40 | 41 | 42 | \section*{Generalized linear models, abridged} 43 | 44 | \begin{quote} 45 | {\it This is the 2nd major revision of this document. 
This version derives
46 | algorithmic details of the iteratively re-weighted least squares method
47 | (IRWLS), and emphasizes advantages of using the singular value decomposition
48 | (SVD) in its implementation. We added background reference material on the SVD.
49 | Following a suggestion by James Blevins, the notation was revised to bring it closer
50 | to other references, especially~\cite{MN}.
51 | --Bryan}
52 | \end{quote}
53 |
54 | \begin{quote}
55 | Generalized linear models (GLMs) are indispensable tools in the data science
56 | toolbox. They are applicable to many real-world problems involving continuous,
57 | yes/no, count and survival data (and more). The models themselves are intuitive
58 | and can be used for inference and prediction. A few very high quality free and
59 | open source software implementations are available (in particular within
60 | R~\cite{R}, and also ExaStat/Revolution Analytics), as are a few first-rate
61 | commercial ones like SAS and Stata.
62 |
63 | This note grew out of our own desire to better understand the numerics of
64 | generalized linear models. We highlight aspects of GLM implementations that we
65 | find particularly interesting. We present some reference implementations
66 | stripped down to illuminate core ideas, often with just a few lines of code.
67 | Our implementations are in R but are close to pseudocode and easily ported to
68 | other languages. --Bryan and Mike
69 | \end{quote}
70 |
71 |
72 | \section*{Linear algebra background material}
73 |
74 | Skip ahead to the {\bf Linear models} section if you already know all about the
75 | singular value decomposition... The following brief introduction closely
76 | follows the important reference book by Golub and Van Loan, Matrix Computations
77 | \cite{gvl}. You should read that book.
78 |
79 | \subsection*{Orthonormal vectors and rotations}
80 |
81 | Let $V$ be a real-valued $n\times p$ matrix, which we write as
82 | $V\in\R^{n\times p}$. It's sometimes useful to enumerate the
83 | column vectors of a matrix, which we write for instance as $V=[v_1, v_2,
84 | \ldots, v_p]$. The column vectors of $V$ are {\it orthonormal} if and only if
85 | $V^T V = I$, the $p\times p$ identity matrix. For instance, the identity
86 | matrix itself is composed of orthonormal column vectors. When the matrix $V$
87 | is square, that is when $n=p$, we simply say that $V$ is an orthonormal
88 | matrix.
89 | The columns of orthonormal matrices form coordinate bases of $\R^n$
90 | whose directions are orthogonal--in other words, a rotation of
91 | the usual unit basis coordinate system. For example, consider a $2\times 2$
92 | orthonormal matrix $V$:
93 | \[
94 | V = \left(\begin{array}{cc}
95 | 1/{\sqrt{2}} & -1/{\sqrt{2}} \\
96 | 1/{\sqrt{2}} & 1/{\sqrt{2}}
97 | \end{array}\right).
98 | \]
99 | Figure \ref{chxx_rotation} illustrates the rotation by plotting
100 | the column vectors of $V$ along with the usual unit basis vectors.
101 | \begin{figure}
102 | \begin{center}
103 | \includegraphics[width=0.5\textwidth]{rotation.pdf}
104 | \end{center}
105 | \caption{Coordinates from matrix $V$ with orthonormal columns (solid lines)
106 | compared to the standard unit basis vectors (dashed). $V$ is a rotation
107 | matrix.}
108 | \label{chxx_rotation}
109 | \end{figure}
110 | Multiplying a vector by $V$ rotates the entries of the vector to the
111 | new coordinate system, and does nothing else. In particular, the Euclidean
112 | norm of the vector is not changed.
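A quick numerical check of this in R, using the matrix $V$ above (an illustrative snippet):
\begin{lstlisting}
V = matrix(c(1, 1, -1, 1) / sqrt(2), 2, 2)  # the rotation matrix V above
x = c(3, 4)
sqrt(sum(x^2))            # the norm of x is 5
sqrt(sum((V %*% x)^2))    # the norm of V x is also 5
\end{lstlisting}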
This is a useful result so we will state 113 | it as a Lemma: 114 | \begin{lemma}\label{invariant} 115 | Let $V\in\R^{n\times n}$ be an orthonormal matrix. Then 116 | $\|Vx\|=\|x\|$ for all $x\in\R^n$. 117 | \end{lemma} 118 | 119 | 120 | \subsection*{The singular value decomposition} 121 | 122 | The singular value decomposition, or SVD, plays a central role in the 123 | analysis--and often implementation--of many computational methods involving 124 | matrices. If you only plan to know one matrix decomposition, this is the one 125 | to know. Let $X\in\R^{n \times p}$ and let $k=\min\{n, p\}$. Then 126 | there exist matrices $U\in\R^{n \times k}$ and $V\in\R^{p\times 127 | k}$ with orthonormal columns $U^T U = V^T V = I$ such that 128 | \begin{equation}\label{SVD} 129 | U^T X V = \Sigma, 130 | \end{equation} 131 | where $\Sigma$ is a $k\times k$ diagonal matrix with non-negative entries 132 | $\sigma_1 \ge \sigma_2 \ge \cdots \ge \sigma_k \ge 0$ along its 133 | main diagonal. This is sometimes called the ``thin'' SVD. R users will be 134 | familiar with this as the {\tt svd(X)} function, which returns a vector of 135 | $\sigma_i$ values instead of a full diagonal matrix $\Sigma$ but is otherwise 136 | the same. The singular value decomposition is not unique, but is almost so--it 137 | is unique up to the signs of the singular vector elements. 138 | 139 | In the case that $n > p$ it's possible to extend the matrix $U$, by adding $n - 140 | p$ additional orthonormal columns, into a square orthonormal $n\times n$ matrix 141 | $\bar{U}$. Similarly, when $n < p$ we can extend $V$ to a square orthonormal 142 | $p\times p$ matrix $\bar{V}$. The extended matrices are especially useful in 143 | analysis, and available to R users using the function invocation 144 | {\tt{svd(X,$\phantom{,}$nu=n,$\phantom{,}$nv=p)}}. The extended version is 145 | sometimes called the ``full'' SVD or just the SVD in many references and 146 | $\bar{U}^TX\bar{V}=\bar{\Sigma}$ results in an $n\times p$ rectangular diagonal 147 | matrix with the same main diagonal entries $\sigma_1 \ge \sigma_2 \ge \cdots \ge 148 | \sigma_k \ge 0$ as the thin version. 149 | 150 | The columns of $\bar{U}$ are called the {\it left singular vectors} of $X$ and the 151 | columns of $\bar{V}$ are called the {\it right singular vectors}. The $\sigma_i$ are 152 | called {\it singular values} of $X$. The SVD breaks matrix vector 153 | multiplication into three steps: rotation, scaling, then another rotation. 154 | Consider an $n\times p$ matrix 155 | $X$ and its product $y$ with a vector $b\in\R^p$ using the 156 | full SVD 157 | $y=Xb = \bar{U}\bar{\Sigma}\bar{V}^Tb$: 158 | \begin{enumerate} 159 | \item Let $\hat{b}=\bar{V}^T b\in\R^p$. 160 | Since $\bar{V}$ is orthonormal, $\hat{b}$ is simply a rotation of the 161 | vector $b$. 162 | \item Now let $s = \bar{\Sigma}\hat{b}\in\R^n$, 163 | which scales each entry of $\hat{b}$ by the corresponding $\sigma_i$. 164 | \item Finally let $y=\bar{U}s$. This is just another rotation by the 165 | orthonormal matrix $\bar{U}$. 166 | \end{enumerate} 167 | The SVD reveals a lot of information about the structure of the matrix $X$. 168 | Step 2 tells us how much a vector can be scaled by $X$. The rotations in steps 169 | 1 and 3 tell us about its range and null space. The number of nonzero singular 170 | values of $X$ is equal to the \emph{rank} of $X$--the dimension of the range of 171 | $X$ (range means the set of all linear combinations of the columns of $X$ 172 | a.k.a. the span of $X$). 
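For instance, here is a small R illustration (with a made-up rank-deficient matrix) of reading the numerical rank off the computed singular values:
\begin{lstlisting}
X = cbind(1, 1:4, 2 * (1:4))   # the third column is twice the second
d = svd(X)$d                   # singular values, largest first
sum(d > 1e-12 * d[1])          # numerical rank is 2, not 3
\end{lstlisting}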
The {\it condition number} 173 | of $X$, familiar to R users as the \verb+kappa+ function and also written 174 | $\kappa_2(X)$, is the ratio of largest and smallest singular values. It 175 | measures how ill-conditioned the matrix is. Computation involving highly 176 | ill-conditioned matrices can be very sensitive to perturbations like noise or 177 | even numerical precision. 178 | 179 | Let $U^TXV=\Sigma$ be the ``thin'' SVD of $X$ and let 180 | $\bar{U}\in\R^{n\times n}$ and $\bar{V}\in\R^{p\times p}$ be 181 | their extended versions when $n > p$ or $n < p$. Let $r$ be the index 182 | corresponding to the smallest non-zero singular value of $X$, for instance 183 | $\sigma_1 \ge \sigma_2 \ge \cdots \ge \sigma_r > \sigma_{r+1} = 0 = \cdots = \sigma_k$, 184 | where $k=\min\{n,p\}$. Then $\mbox{rank}(X) = r$ and the singular vectors define the 185 | following bases: 186 | \begin{itemize} 187 | \item The first $r$ columns of $U$ form a basis of the range of $X$. 188 | \item The first $r$ columns of $V$ form a basis of the range of $X^T$. 189 | \item If $r0$ and the 1-norm of the solution vector $b$. The lasso 316 | is the closest convex estimate of the parameterized 317 | \emph{best subset selection problem}: 318 | \[ 319 | \min_b\|Xb - y\|^2 + \mu\|b\|_0, 320 | \] 321 | where $\|b\|_0$ means simply the count of nonzero components of $b$. 322 | (Despite the notation, $\|b\|_0$ is not a vector norm 323 | since for any scalar $\lambda$ with $|\lambda |$ not equal to zero or one, 324 | $\|\lambda b\|_0 \ne |\lambda |\|b\|_0$.) Although the best subset selection 325 | problem might seem to be the most natural way to select subsets of columns of 326 | the matrix $X$, the problem is nonconvex and hard to solve--indeed it is 327 | known to be NP hard. We 328 | shall see later that there are other approaches to estimating optimal 329 | column subsets including a 330 | fast heuristic method by Golub called SVD subset 331 | selection~\cite[Section 12.2]{gvl}, and a 332 | newer approach by Lanza, Reichel and others based on Krylov subspace 333 | methods~\cite{lanza}. 334 | 335 | The ordinary least squares solution of linear models has important statistical 336 | properties shown by Gauss~\cite{gauss} and later rediscovered by 337 | Markoff~\cite{markoff}. The least squares solution defines a {\it minimum 338 | variance unbiased estimator}, the technical details of which we leave to the 339 | references, in particular see~\cite{hastie},\cite{MN}. 340 | 341 | 342 | \section*{Generalized linear models} 343 | 344 | Our notes on generalized linear models closely follow the book 345 | ``Generalized Linear Models'' by McCullagh and Nelder~\cite{MN}. That very 346 | readable and practical book remains, in our opinion, the best all-around applied 347 | reference on GLMs and strongly influenced algorithm implementations in the 348 | R language. 349 | McCullagh and Nelder describe generalizations of the basic linear model 350 | in three parts: 351 | \begin{enumerate} 352 | \item A \emph{random component} describing the distribution of the 353 | measured entries of of a response vector $y$ and their vector of 354 | expected values $\mu = E(y)\in\R^n$. 355 | \item A \emph{systematic component} $\eta = X\beta$ that is just a basic 356 | linear model involving a vector $\eta\in\R^n$, model matrix $X\in\R^{n\times p}$ and 357 | coefficient solution vector $\beta\in\R^p$. 358 | \item A \emph{link function} between the random and systematic components, 359 | $\eta = g(\mu)$, applied component-wise to the vector $\mu$. 
360 | \end{enumerate} 361 | The link function $g$ is assumed to be a real-valued monotonic, differentiable 362 | (and therefore invertible) function. If $p=n$ and the matrix $X$ is of full 363 | rank, then the model can exactly match the $n$ data observations in $y$ and all 364 | of the variation between observations is consigned to the systematic component 365 | of the model. Such models are usually \emph{over fit} and rarely generalize 366 | well to new data, although they have practical utility as seen in the next 367 | section. When $p=1$ then the model represents a single common $\mu$ 368 | for all $n$ data observations and all of the variation in $y$ is 369 | consigned to the random component. Most real-world GLMs lie somewhere 370 | in-between these two extremes. 371 | 372 | 373 | Adding the random component and link function around a basic linear model 374 | lets GLMs model a wider range of scenarios than their OLS cousins. In 375 | particular, the link function lets us model variables $\mu$ that are restricted 376 | to intervals, for instance the interval $[0,1]$ useful for modeling binary 377 | values. And we can use the random component to pair an appropriate 378 | distribution with such values (say, a binomial distribution in the case of 0/1 379 | data). The added modeling flexibility comes with a cost--the link function can 380 | turn finding the solution of GLMs into a nonlinear problem, despite the 381 | underlying linear model assumption in the systematic component. 382 | 383 | 384 | These notes assume that the random component distribution describing the 385 | response belongs to a one- or two-parameter \emph{exponential family} of 386 | probability distributions described below. The exponential family covers many 387 | widely used and important cases like logistic/binomial, Bernoulli, multinomial, 388 | exponential, Poisson, Gaussian, and others. Limiting our discussion to models 389 | that fit into the exponential family, despite a superficial mathematical 390 | complexity, greatly simplifies many details. 391 | 392 | 393 | \subsection*{The exponential family of distributions} 394 | 395 | The following sections include a lot of notation and many functions and 396 | parameters to keep track of. Although a bit complicated, nothing presented here 397 | is harder than elementary Calculus. For the most part, we very closely follow 398 | the exposition of McCullagh and Nelder~\cite{MN}, but we expand on it in some 399 | places to help illuminate key ideas. 400 | 401 | The exponential family of distributions have probability distributions 402 | that can be written as a function 403 | \begin{equation}\label{expfamily} 404 | f(y; \theta, \phi) = \exp\left(\frac{y\theta - b(\theta)}{a(\phi)} + c(y, \phi)\right), 405 | \end{equation} 406 | for parameters $\theta$ and $\phi$ and fixed functions $a,b,$ and $c$. 407 | The notation $f(y; \theta, \phi)$ means a function $f(y)$ that 408 | depends on the given parameters $\theta$ and $\phi$. 409 | Any probability distribution that can be re-written in this form belongs 410 | to the exponential family. 411 | 412 | For instance, let 413 | $\theta=\mu$, $\phi=\sigma^2$, $b(\theta)=\theta^2/2$, $a(\phi)=\phi$ 414 | and $c(y, \phi) = -\frac{1}{2}(\frac{y^2}{\sigma^2} + \log(2\pi\sigma^2))$. 
415 | Then substituting those values in to Equation~\ref{expfamily} yields 416 | \begin{align*} 417 | f(y; \theta, \phi) &= 418 | \exp\left(\frac{y\mu - \mu^2/2}{\sigma^2} - \frac{1}{2}\left(\frac{y^2}{\sigma^2} + \log(2\pi\sigma^2)\right)\right)\\ 419 | &= \frac{1}{\sqrt{2\pi\sigma^2}}\exp\left(\frac{-(y - \mu)^2}{2\sigma^2}\right), 420 | \end{align*} 421 | which is a standard expression of a normal distribution, 422 | showing that the normal distribution fits in to the exponential 423 | family. 424 | 425 | Similarly, consider the Poisson distribution with single parameter 426 | $\mu$, 427 | \[ 428 | \exp(-\mu)\mu^y/{y!}\,\,. 429 | \] 430 | Let $\theta=\log\mu$, $a(\phi)=1$, $b(\theta)=\exp\theta$, and 431 | $c(y, \phi)=-\log{(y!)}$. Then 432 | \begin{align*} 433 | f(y; \theta, \phi) &= \exp\left(\frac{y\theta - b(\theta)}{a(\phi)} + c(y, \phi)\right)\\ 434 | &= \exp\left(\frac{y\log\mu - \exp\theta}{1} - \log y!\right)\\ 435 | &= {\exp(y\log\mu - \exp\theta)}/{y!}\\ 436 | &= {\exp(y\log\mu - \exp\log\mu)}/{y!}\\ 437 | &= \exp(y\log\mu - \mu)/y!\\ 438 | &= \exp{(-\mu)}\exp{(y\log\mu)}/y!\\ 439 | &= \exp(-\mu)\mu^y/y!\,\,. 440 | \end{align*} 441 | Many other distributions are described by the exponential family. 442 | 443 | \subsubsection*{The log likelihood function} 444 | If we think of the function $f$ in Equation~\ref{expfamily} as 445 | a function of parameters $\theta$ and $\phi$ given observed data $y$ then 446 | the function describes the likelihood of the observations. Its logarithm, 447 | \begin{equation*} 448 | \ell(\theta, \phi; y) = \frac{y\theta - b(\theta)}{a(\phi)} + c(y, \phi), 449 | \end{equation*} 450 | is called the \emph{log likelihood} function. In this context the 451 | function $b(\theta)$ is called the \emph{cumulant function} and $\phi$ 452 | the \emph{dispersion parameter}. 453 | In the usual case where $y$ is a vector 454 | of $n$ independent observations, the log likelihood function 455 | sums the individual contributions: 456 | \begin{equation}\label{loglik} 457 | \ell(\theta, \phi; y) = \sum_{i=1}^n\frac{y_i\theta_i - b(\theta_i)}{a(\phi)} + c(y, \phi) 458 | \end{equation} 459 | Next we derive a few basic identities that 460 | will be useful later. 461 | 462 | Let $\partial\ell/\partial\theta$ be the derivative of the log likelihood 463 | function with respect to $\theta$ (how much the function changes as 464 | $\theta$ changes), and similarly $\partial^2\ell/\partial\theta^2$ 465 | its 2nd derivative (how much the derivative function changes as 466 | $\theta$ changes). Then 467 | \begin{equation}\label{dl} 468 | \frac{\partial\ell}{\partial\theta} = \frac{y - b'(\theta)}{a(\phi)}, 469 | \end{equation} 470 | and, 471 | \begin{equation}\label{d2l} 472 | \frac{\partial^2\ell}{\partial\theta^2} = \frac{-b''(\theta)}{a(\phi)}, 473 | \end{equation} 474 | where $b'(\theta)$ means the derivative of the function $b$ taken with 475 | respect to $\theta$. 476 | 477 | Assume that $a(\phi)\ne 0$ and that 478 | the expected value $E(\partial\ell/\partial\theta) = 0$ 479 | and also that 480 | $E(\partial^2\ell/\partial\theta^2) + E(\partial\ell/\partial\theta)^2 = 0$. 481 | Then 482 | \begin{equation}\label{bprime} 483 | 0 = E(\partial\ell/\partial\theta) = \frac{E(y) - b'(\theta)}{a(\phi)} 484 | \qquad\mbox{which means that}\,\,b'(\theta) = E(y). 485 | \end{equation} 486 | Recall above that we sometimes use the alternative notation $\mu=E(y)$ 487 | for the expected value of $y$; so $\mu=E(y)=b'(\theta)$. 
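As a quick check, in the Poisson example above we had $b(\theta)=\exp\theta$ with $\theta=\log\mu$, so $b'(\theta)=\exp\theta=\mu$, which is indeed the expected value of a Poisson distribution with mean $\mu$.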
488 | 489 | Similarly, 490 | \begin{align} 491 | 0 &= 492 | E(\partial^2\ell/\partial\theta^2) + E(\partial\ell/\partial\theta)^2 \nonumber\\ 493 | &= 494 | \frac{-b''(\theta)}{a(\phi)} + 495 | E\left(\frac{y - b'(\theta)}{a(\phi)}\right)^2 \nonumber\\ 496 | &= \frac{-b''(\theta)}{a(\phi)} + 497 | E\left(\frac{y^2 - 2yb'(\theta) + b'(\theta)^2}{a(\phi)^2}\right)\nonumber\\ 498 | &= \frac{-b''(\theta)}{a(\phi)} + 499 | \frac{E(y^2) - E(y)^2}{a(\phi)^2} \qquad(\mbox{substituting}\,\,b'(\theta)=E(y))\nonumber\\ 500 | &= \frac{-b''(\theta)}{a(\phi)} + \frac{V(y)}{a(\phi)^2}\nonumber\\ 501 | &\mbox{which means that}\,\,a(\phi)b''(\theta) = V(y)\label{b2}, 502 | \end{align} 503 | where $V(y)$ is the usual definition of the variance function for $y$. 504 | 505 | Finally for this section, one more useful identity showing that the 506 | rate of change of the expected value of $y$ with respect to the parameter $\theta$ 507 | is a multiple of the variance function $V(y)$: 508 | \begin{align} 509 | \frac{d}{d\theta}E(y) &= \frac{d}{d\theta}\mu &\mbox{(just notation)}\nonumber\\ 510 | &= \frac{d}{d\theta}b'(\theta) &\mbox{(by Equation \ref{bprime})}\nonumber\\ 511 | &= b''(\theta)\nonumber\\ 512 | &= V(y)/a(\phi)\label{mutheta} &\mbox{(by Equation \ref{b2})}. 513 | \end{align} 514 | 515 | 516 | 517 | 518 | 519 | \subsection*{GLMs and the exponential family} 520 | 521 | The last section introduced the random component of generalized linear models 522 | and corresponding log likelihood function for the exponential family of 523 | distributions. This section puts that together with the remaining 524 | generalizations, the systematic component's linear model $\eta = X\beta$ and 525 | the link function $\eta=g(\mu)$. 526 | 527 | One approach for solving generalized linear models is to find the value of the 528 | coefficient vector $\beta$ that maximizes the value of the log likelihood 529 | function in Equation~\ref{loglik}. Solving for such a \emph{maximum-likelihood 530 | solution} is the main goal of this section. We can phrase the solution as a 531 | standard nonlinear least squares problem by recasting the maximum likelihood 532 | problem in terms of a minimum residual problem using \emph{deviance residuals}. 533 | 534 | \subsubsection*{Deviance residuals} 535 | 536 | 537 | 538 | \subsubsection*{Jacobian} 539 | 540 | In order to find a maximum using Calculus, we will need (at least) an 541 | expression for the derivative of the log likelihood function with respect to 542 | each component of the solution $\beta_j$, $\partial{l}/\partial{\beta_j}$. 543 | 544 | Writing the $n \times p$ matrix $X$ showing each column as 545 | $X = [x_1, x_2, \cdots, x_p]$, then 546 | $\eta = X\beta = x_1\beta_1 + x_2\beta_2 + \cdots + x_p\beta_p$, 547 | and 548 | \begin{equation}\label{eta} 549 | \partial\eta/\partial\beta_j = x_j. 
550 | \end{equation}
551 |
552 | Then using the chain rule from Calculus, the derivative of the log likelihood
553 | function with respect to each component of the solution $\beta_j$ is
554 | \begin{align}
555 | \frac{\partial\ell}{\partial\beta_j}
556 | &= \frac{\partial\ell}{\partial\theta}\frac{d\theta}{d\mu}\frac{d\mu}{d\eta}\frac{\partial\eta}{\partial\beta_j} \nonumber\\
557 | &= \frac{\partial\ell}{\partial\theta}\frac{1}{V(\mu)}\frac{d\mu}{d\eta}\frac{\partial\eta}{\partial\beta_j}
558 | &\mbox{(by Equation \ref{mutheta})} \nonumber\\
559 | &= \frac{\partial\ell}{\partial\theta}\frac{1}{V(\mu)}\frac{d\mu}{d\eta}x_j
560 | &\mbox{(by Equation \ref{eta})} \nonumber\\
561 | &= \frac{y- b'(\theta)}{a(\phi)}\frac{1}{V(\mu)}\frac{d\mu}{d\eta}x_j
562 | &\mbox{(by Equation \ref{dl})} \nonumber\\
563 | &= w\frac{y-\mu}{a(\phi)}\frac{d\eta}{d\mu}x_j,
564 | &\mbox{(by Equations \ref{bprime} and \ref{w})} \label{dldb}
565 | \end{align}
566 | where $w$ is defined as a multiple of the inverse variance function:
567 | \begin{equation}\label{w}
568 | w = \left(\frac{d\mu}{d\eta}\right)^2 \bigg/ V(\mu).
569 | \end{equation}
570 | Since $\eta=g(\mu)$, the term $\frac{d}{d\mu}\eta$ in Equation~\ref{dldb} is
571 | simply $g'$, the derivative of the link function. We remark that, in the usual
572 | case that the response $y$ is a vector of $n$ iid observations and $X$ is an
573 | $n\times p$ matrix, Equation~\ref{dldb} holds entrywise and defines the
574 | $n\times p$ Jacobian matrix of the log likelihood function
575 | with $ij^{th}$ entry
576 | \begin{equation}\label{jacobian}
577 | J(\beta)_{ij} = \frac{\partial\ell_i}{\partial\beta_j}
578 | = w_i\frac{y_i-\mu_i}{a(\phi)}\frac{d\eta_i}{d\mu}x_{ij},
579 | \qquad i=1, 2, \ldots, n,\,\,j=1,2,\ldots,p.
580 | \end{equation}
581 | %Assume that the dispersion function $a(\phi)$ is constant with respect to the
582 | %solution $\beta$. Then the maximum of the log likelihood function $\ell$ with
583 | %respect to each solution component $\beta_j$ for $j=1, 2, \ldots, p$ occurs when
584 | %\begin{equation}\label{max_loglik}
585 | %\sum_{i=1}^n w_i(y-\mu)_i\frac{d}{d\mu}\eta_ix_{ij} = 0.
586 | %\end{equation}
587 |
588 |
589 | \subsubsection*{Canonical link functions}
590 | Recall that
591 | the link function relates $\eta$ and $\mu$ by $\eta=g(\mu)$,
592 | and therefore also their derivatives $d\eta/d\mu = g'(\mu)$. Choosing a special \emph{canonical link function}
593 | results in a number of simplifications. Chief among them for our
594 | purposes, a canonical link connects $d\eta/d\mu$ to the variance function by
595 | \begin{equation}
596 | \label{canonical}
597 | {d\eta}/{d\mu}={1}/{V(\mu)}.\qquad\mbox{(canonical link case)}
598 | \end{equation}
599 | When $g$ is a canonical link function we get a simplification
600 | for $w$ using~\ref{canonical}:
601 | \begin{align}
602 | w &= \left(\frac{d\mu}{d\eta}\right)^2 \bigg/ V(\mu) \nonumber \\
603 | &= V^2(\mu) / V(\mu) \nonumber \\
604 | &= V(\mu)\qquad\mbox{(canonical link case)}. \label{W_canonical}
605 | \end{align}
606 |
607 |
608 |
609 | \subsection*{Maximum likelihood solutions based on first-order approximations}
610 |
611 | We now use the Jacobian entries in Equation~\ref{jacobian} to write the gradient of $\ell$ compactly.
612 |
613 |
614 |
615 | Assemble entries of $w$ and $g'$ along the diagonal of an
616 | $n\times n$ diagonal matrix $W$:
617 | \begin{equation}\label{W}
618 | W_{ij} = \bigg\{\begin{array}{cr}
619 | w_i / g'(\mu)_i & \mbox{if $i=j$}, \\
620 | 0 & \mbox{otherwise}.
621 | \end{array}
622 | \end{equation}
623 | Then a compact formula for the gradient of $\ell$ with respect to $\beta$ is
624 | \begin{equation}\label{gradient}
625 | \nabla_\beta\ell = X^TW(y - \mu).
626 | \end{equation}
627 |
628 | At this point, we have enough information from Equations~\ref{loglik}
629 | and~\ref{gradient} to formulate a first-order solution method for finding the
630 | maximum likelihood GLM solution. Possible solution methods include gradient
631 | descent, Gauss-Newton, conjugate gradient, or a quasi-Newton approach.
632 | The following example cooks up a very basic GLM solver using gradient
633 | descent.
634 |
635 | XXX EXAMPLE XXX
636 |
637 |
638 |
639 | \subsection*{Maximum likelihood solution by Newton's method}
640 |
641 | We can do better than the first-order solutions in the last section. Because
642 | we restricted our problems to the exponential family, we can formulate an
643 | analytic representation of the second derivatives of the log likelihood
644 | function to form a Hessian matrix. That knowledge enables us to employ solution
645 | methods using second-order (quadratic) approximations like Newton's
646 | method--such methods have more favorable convergence properties (faster, more
647 | stable) than first-order solution methods.
648 |
649 | We need to differentiate the expression for the derivative of $\ell$ in
650 | Equation~\ref{dldb} with respect to the other elements
651 | $\beta_k$ to compute the entries of the second-derivative Hessian matrix,
652 | and again we assume that the dispersion term $a(\phi)$ is constant
653 | with respect to the solution $\beta$. Then the $jk$-entry of the
654 | Hessian matrix is:
655 | \begin{align}
656 | H_{jk} &= \frac{\delta^2\ell}{\delta\beta_k\delta\beta_j}
657 | = \frac{\delta}{\delta\beta_k}\frac{\delta\ell}{\delta\beta_j} \nonumber\\
658 | &=
659 | \frac{\delta}{\delta\beta_k}
660 | \sum_i\left(
661 | w_i(y-\mu)_i\frac{d\eta_i}{d\mu}x_{ij} \right)\nonumber \\
662 | &= \sum_i\left(
663 | (y-\mu)_i
664 | \frac{\delta}{\delta\beta_k}
665 | w_i\frac{d\eta_i}{d\mu}x_{ij}
666 | +
667 | w_i\frac{d\eta_i}{d\mu}x_{ij}\frac{\delta}{\delta\beta_k}(y-\mu)_i \label{hsum}
668 | \right)
669 | \end{align}
670 | where the last equality uses the product rule from Calculus.
671 | At this point, we have an (unwieldy) expression for the Hessian and
672 | we can plug that together with the gradient function from Equation~\ref{gradient}
673 | into Newton's method to get a maximum likelihood GLM solver.
674 | However, we will consider an important special case next that is much simpler.
675 |
676 | \subsubsection*{Canonical link case}
677 | When $g$ is a canonical link function
678 | the expression $w_i\frac{d\eta_i}{d\mu}$ in the
679 | first term of the sum in Equation~\ref{hsum}
680 | is constant because in such cases
681 | $d\eta/d\mu=1/V(\mu)$ and
682 | $w=V(\mu)$
683 | by Equations~\ref{canonical} and~\ref{W_canonical}. Thus, its derivative
684 | \[
685 | \frac{\delta}{\delta\beta_k} w_i\frac{d\eta_i}{d\mu} = 0,
686 | \]
687 | that is, the first term of the Hessian in Equation~\ref{hsum}
688 | drops out when $g$ is a canonical link function.
689 | Meanwhile, consider the second term
690 | \begin{align*}
691 | \sum_i\left(
692 | w_i\frac{d\eta_i}{d\mu}x_{ij}\frac{\delta}{\delta\beta_k}(y-\mu)_i
693 | \right) &=
694 | \sum_i\left(
695 | w_i\frac{d\eta_i}{d\mu}x_{ij}\frac{\delta\mu}{\delta\beta_k} \right) &\mbox{($y$ is constant wrt $\beta$)} \\
696 | &=
697 | \sum_i\left(
698 | w_ix_{ij}\frac{\delta\eta_i}{\delta\beta_k} \right)&\mbox{(chain rule)}\\
699 | &=
700 | \sum_i\left(
701 | w_ix_{ij}x_{ik} \right), &\mbox{(by Equation \ref{eta})}
702 | \end{align*}
703 | finally arriving (with a substantial subscript-induced headache) at a compact
704 | expression for the Hessian using the definition of $W$ from Equation~\ref{W}:
705 | \begin{equation}\label{Hessian}
706 | H = X^T W X.
707 | \end{equation}
708 |
709 | With an expression for the gradient from Equation~\ref{gradient} and Hessian
710 | from Equation~\ref{Hessian} of the log likelihood function, we have all we need
711 | to implement a second-order solution method. The next example uses R's
712 | \verb+nlm+ function to find the maximum likelihood GLM solution using Newton's
713 | method.
714 |
715 | XXX EXAMPLE XXX
716 |
717 | Remember that this derivation assumed that $g$ is a canonical link function. In
718 | the general case we need to resort to the definition of the Hessian in
719 | Equation~\ref{hsum} for a Newton's method-based solution.
720 |
721 |
722 |
723 | \subsection*{Iteratively re-weighted least squares}
724 |
725 |
726 |
727 |
728 | The numerical solution of model problems of this form was carefully analyzed by
729 | Paige~\cite{paige}.
730 | ...entries of $W$ are non-zero, the generalized linear model
731 | \ref{glm} results in a weighted nonlinear least squares problem
732 | typically solved by the iteratively reweighted least squares method
733 | shown in Algorithm \ref{irls} and defined carefully by Bj\"orck~\cite{bjork}...
734 |
735 |
736 | \subsection*{Numerical implementation issues}
737 |
738 | cover edge cases here including zero-variance observations (constant rows in $X$)
739 | and singular/ill-conditioned $X$.
740 |
741 | Introduce R's rank-revealing QR IRWLS approach.
742 |
743 | SVD-IRWLS based on the algorithm by O'Leary
744 |
745 | comparison/examples between R's RRQR-IRWLS and SVD-IRWLS
746 |
747 | large-scale problems and first-order solution methods
748 |
749 |
750 | \subsubsection*{Round-off error in QR- and SVD-based methods}
751 |
752 |
753 |
754 |
755 |
756 |
757 |
758 |
759 |
760 |
761 |
762 | \section*{Copyright}
763 | Copyright \copyright 2014 Michael Kane and Bryan W. Lewis
764 |
765 | \begin{quote}
766 | Permission is granted to copy, distribute and/or modify this document
767 | under the terms of the GNU Free Documentation License, Version 1.3
768 | or any later version published by the Free Software Foundation;
769 | with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.
770 | A copy of the license is included in the GitHub project files.
771 | \end{quote}
772 |
773 |
774 |
775 | \begin{thebibliography}{99}
776 | \bibitem{anda} Anda, A. and Park, H., Self-scaling fast rotations for stiff least squares problems, Lin. Alg. Appl., 234, 1996, pp. 137-162.
777 | \bibitem{bjork} Bj\"orck, \AA., Numerical Methods for Least Squares Problems, SIAM, Philadelphia, 1996.
778 | \bibitem{bates} Bates, D., \url{http://www.stat.wisc.edu/courses/st849-bates/lectures/GLMH.pdf}.
779 | \bibitem{dekker}
780 | Dekker, Theodorus Jozef.
``A floating-point technique for extending the available precision.'' Numerische Mathematik 18.3 (1971): 224-242.
781 | \bibitem{friedman} Friedman, Jerome, Trevor Hastie, and Rob Tibshirani. ``Regularization paths for generalized linear models via coordinate descent.'' Journal of Statistical Software 33.1 (2010): 1.
782 | \bibitem{glmnet} Friedman, Hastie, Tibshirani, Simon, Narasimhan, Qian, \url{https://cran.r-project.org/package=glmnet}.
783 | \bibitem{gauss} Gauss, C. F., Theoria combinationis observationum erroribus minimis obnoxiae, Pars prior, 1863 (first written 1821).
784 | \bibitem{hastie} Hastie, T. J. and Pregibon, D., Generalized linear models, Chapter 6 of Statistical Models in S, eds J. M. Chambers and T. J. Hastie, Wadsworth \& Brooks/Cole, 1992.
785 | \bibitem{fmm} Forsythe, George Elmer, Cleve B. Moler, and Michael A. Malcolm. ``Computer methods for mathematical computations.'' (1977).
786 | \bibitem{gvl} Golub, Gene H., and Charles F. Van Loan. Matrix computations. Vol. 3. JHU Press, 2012.
787 | \bibitem{higham96} Higham, Nicholas J. Accuracy and stability of numerical algorithms. Vol. 80. SIAM, 2002.
788 | \bibitem{horn-johnson} Horn, Roger A., and Charles R. Johnson. Matrix analysis. Cambridge University Press, 1990.
789 | \bibitem{jordan} Jordan, Michael, \url{https://people.eecs.berkeley.edu/~jordan/courses/260-spring10/other-readings/chapter8.pdf} (online notes).
790 | \bibitem{lamport} Lamport, Leslie. How to write a proof. The American Mathematical Monthly 102.7 (1995): 600-608.
791 | \bibitem{lanza} Lanza, Alessandro, et al. A Generalized Krylov Subspace Method for $\ell_p-\ell_q$ Minimization. SIAM Journal on Scientific Computing 37.5 (2015): S30-S50.
792 | \bibitem{lumley} Lumley, T., \url{http://cran.r-project.org/web/packages/biglm}.
793 | \bibitem{markoff} Markoff, A., Wahrscheinlichkeitsrechnung, Leipzig, 1912.
794 | \bibitem{MN} McCullagh, P. and Nelder, J. A., Generalized Linear Models, Chapman and Hall, London, 1989.
795 | \bibitem{oleary} O'Leary, D., Robust regression computation using iteratively reweighted least squares, SIAM J. Mat. Anal. Appl., Vol. 11 No. 3, 1990, pp. 466-480.
796 | \bibitem{paige} Paige, C. C., Fast numerically stable computations for generalized least squares problems, SIAM J. Num. Anal., 16, 1979, pp. 165-171.
797 | \bibitem{R} The R project, \url{http://www.r-project.org}.
798 | \bibitem{trefbau} Trefethen, Lloyd N., and David Bau III. Numerical linear algebra. Vol. 50. SIAM, 1997.
799 | \bibitem{zhou} Zou, H. and Hastie, T., Regularization and Variable Selection via the Elastic Net, J. Royal Statistical Society, B, 2005, pp. 301-320.
800 | \end{thebibliography}
801 |
802 |
803 | \end{document}
804 |
-------------------------------------------------------------------------------- /implementations.R: --------------------------------------------------------------------------------
1 | # Example iteratively re-weighted least squares (IRLS) implementations
2 | # Mike Kane & Bryan Lewis, 2013-2014.
3 | #
4 | # The implementations generally follow the same input/output pattern. They
5 | # take as inputs a model matrix A, a response vector b whose length is the
6 | # number of rows of A, an R 'family' function that defines the error
7 | # distribution family and link function, a maximum number of iterations, and an
8 | # iteration convergence tolerance. The methods produce a list with two
9 | # elements, the model coefficients and the number of iterations.
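# For example, a minimal illustrative sketch using simulated data (after
# sourcing this file) might look like:
#
#   set.seed(1)
#   A = cbind(1, matrix(rnorm(300), 100, 3))
#   b = rbinom(100, 1, plogis(-1 + 0.5 * A[, 2] - 0.25 * A[, 3]))
#   fit = irls(A, b, family = binomial)
#   fit$coefficients
#   fit$iterations
#
# See compare.R in this repository for a fuller comparison against R's glm().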
10 | 11 | # The most basic IRLS method, and the shortest implementation we could come 12 | # up with. This method solves the normal equations associated with a weighted 13 | # least squares problem in each iteration. 14 | irls = 15 | function(A, b, family=binomial, maxit=25, tol=1e-08) 16 | { 17 | x = rep(0,ncol(A)) 18 | for(j in 1:maxit) 19 | { 20 | eta = drop(A %*% x) 21 | g = family()$linkinv(eta) 22 | gprime = family()$mu.eta(eta) 23 | z = eta + (b - g) / gprime 24 | W = drop(gprime^2 / family()$variance(g)) 25 | xold = x 26 | x = solve(crossprod(A, W * A), crossprod(A, W * z), tol=2*.Machine$double.eps) 27 | if(sqrt(drop(crossprod(x - xold))) < tol) break 28 | } 29 | list(coefficients=x, iterations=j) 30 | } 31 | 32 | # A method discussed by O'Leary that uses a QR factorization of the model 33 | # matrix. This method should be much more numerically stable in the face of 34 | # ill-conditioned model matrices than the simple method defined above. If the 35 | # QR method used uses Givens rotations, this method is numerically stable for 36 | # stiff problems too. 37 | irls_qrnewton = 38 | function(A, b, family=binomial, maxit=25, tol=1e-08) 39 | { 40 | s = t = 0 41 | QR = qr(A) 42 | Q = qr.Q(QR) 43 | R = qr.R(QR) 44 | for(j in 1:maxit) 45 | { 46 | g = family()$linkinv(t) 47 | gprime = family()$mu.eta(t) 48 | z = t + (b - g) / gprime 49 | W = as.vector(gprime^2 / family()$variance(g)) 50 | wmin = min(W) 51 | if(wmin < sqrt(.Machine$double.eps)) 52 | warning("Tiny weights encountered") 53 | s_old = s 54 | C = chol(crossprod(Q, W*Q)) 55 | s = forwardsolve(t(C), crossprod(Q,W*z)) 56 | s = backsolve(C,s) 57 | t = Q %*% s 58 | if(sqrt(crossprod(s - s_old)) < tol) break 59 | } 60 | x = backsolve(R, crossprod(Q,t)) 61 | list(coefficients=x,iterations=j) 62 | } 63 | 64 | # The next method is a minor variation on the QR Newton method defined above 65 | # that uses the SVD instead. It exhibits similar numerical stability and can 66 | # definitively check model matrix rank deficiency, at the cost of computing 67 | # the SVD instead of the QR factorization up front. 68 | irls_svdnewton = 69 | function(A, b, family=binomial, maxit=25, tol=1e-08) 70 | { 71 | s = t = 0 72 | S = svd(A) 73 | if(min(S$d)/max(S$d)