├── .Rbuildignore
├── .gitignore
├── DESCRIPTION
├── NAMESPACE
├── NEWS.md
├── R
│   ├── RcppExports.R
│   ├── coef.sgd.R
│   ├── data-winequality.R
│   ├── fitted.sgd.R
│   ├── plot.sgd.R
│   ├── predict.sgd.R
│   ├── print.sgd.R
│   ├── residuals.sgd.R
│   └── sgd.R
├── README.md
├── cran-comments.md
├── data
│   └── winequality.rda
├── demo
│   ├── 00Index
│   ├── bench-corr-linear-regression.R
│   ├── bench-linear-regression.R
│   ├── bench-logistic-wine.R
│   ├── cox-regression.R
│   ├── glm-logistic-regression.R
│   ├── glm-poisson-regression.R
│   ├── linear-regression.R
│   ├── m-estimation.R
│   └── normal-method-of-moments.R
├── man
│   ├── coef.sgd.Rd
│   ├── fitted.sgd.Rd
│   ├── plot.sgd.Rd
│   ├── predict.sgd.Rd
│   ├── print.sgd.Rd
│   ├── residuals.sgd.Rd
│   ├── sgd.Rd
│   └── winequality.Rd
├── src
│   ├── Makevars
│   ├── Makevars.win
│   ├── RcppExports.cpp
│   ├── basedef.h
│   ├── data
│   │   ├── data_point.h
│   │   └── data_set.h
│   ├── learn-rate
│   │   ├── base_learn_rate.h
│   │   ├── ddim_learn_rate.h
│   │   ├── learn_rate_value.h
│   │   ├── onedim_eigen_learn_rate.h
│   │   └── onedim_learn_rate.h
│   ├── model
│   │   ├── base_model.h
│   │   ├── cox_model.h
│   │   ├── glm
│   │   │   ├── glm_family.h
│   │   │   └── glm_transfer.h
│   │   ├── glm_model.h
│   │   ├── gmm_model.h
│   │   ├── m-estimation
│   │   │   └── m_loss.h
│   │   └── m_model.h
│   ├── post-process
│   │   ├── cox_post_process.h
│   │   ├── glm_post_process.h
│   │   ├── gmm_post_process.h
│   │   └── m_post_process.h
│   ├── sgd.cpp
│   ├── sgd
│   │   ├── base_sgd.h
│   │   ├── explicit_sgd.h
│   │   ├── implicit_sgd.h
│   │   ├── momentum_sgd.h
│   │   └── nesterov_sgd.h
│   └── validity-check
│       ├── cox_validity_check_model.h
│       ├── glm_validity_check_model.h
│       ├── gmm_validity_check_model.h
│       ├── m_validity_check_model.h
│       └── validity_check.h
├── tests
│   ├── testthat.R
│   └── testthat
│       ├── test-fitted.R
│       ├── test-lasso.R
│       ├── test-linear.R
│       ├── test-predict.R
│       └── test-residuals.R
└── vignettes
    ├── sgd-jss.pdf
    └── sgd-jss.pdf.asis

/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^cran-comments\.md$
2 | ^.*\.Rproj$
3 | ^\.Rproj\.user$
4 | ^sgd\.Rproj$
5 | ^CRAN-SUBMISSION$

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.DS_Store
2 | *.Rhistory
3 | *.Rdata
4 | *.so
5 | *.o
6 | *.Rproj
7 | .Rproj.user

--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: sgd
 2 | Type: Package
 3 | Title: Stochastic Gradient Descent for Scalable Estimation
 4 | Version: 1.1.2
 5 | Authors@R: c(
 6 |   person("Junhyung Lyle", "Kim", email = "jlylekim@gmail.com", role = c("cre", "aut")),
 7 |   person("Dustin", "Tran", role = "aut"),
 8 |   person("Panos", "Toulis", role = "aut"),
 9 |   person("Tian", "Lian", role = "ctb"),
10 |   person("Ye", "Kuang", role = "ctb"),
11 |   person("Edoardo", "Airoldi", role = "ctb")
12 |   )
13 | Maintainer: Junhyung Lyle Kim <jlylekim@gmail.com>
14 | Description: A fast and flexible set of tools for large scale estimation. It
15 |     features many stochastic gradient methods, built-in models, visualization
16 |     tools, automated hyperparameter tuning, model checking, interval estimation,
17 |     and convergence diagnostics.
18 | URL: https://github.com/airoldilab/sgd
19 | BugReports: https://github.com/airoldilab/sgd/issues
20 | License: GPL-2
21 | Imports:
22 |     ggplot2,
23 |     MASS,
24 |     methods,
25 |     Rcpp (>= 0.11.3),
26 |     stats
27 | Suggests:
28 |     bigmemory,
29 |     glmnet,
30 |     gridExtra,
31 |     R.rsp,
32 |     testthat
33 | LinkingTo:
34 |     BH,
35 |     bigmemory,
36 |     Rcpp,
37 |     RcppArmadillo
38 | LazyData: yes
39 | VignetteBuilder: R.rsp
40 | Encoding: UTF-8
41 | RoxygenNote: 7.3.1

--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | S3method(coef,sgd)
 4 | S3method(fitted,sgd)
 5 | S3method(plot,list)
 6 | S3method(plot,sgd)
 7 | S3method(predict,sgd)
 8 | S3method(print,sgd)
 9 | S3method(residuals,sgd)
10 | S3method(sgd,big.matrix)
11 | S3method(sgd,default)
12 | S3method(sgd,formula)
13 | S3method(sgd,matrix)
14 | export(predict_all)
15 | export(sgd)
16 | import(MASS)
17 | importFrom(Rcpp,evalCpp)
18 | importFrom(methods,new)
19 | importFrom(stats,coef)
20 | importFrom(stats,fitted)
21 | importFrom(stats,gaussian)
22 | importFrom(stats,is.empty.model)
23 | importFrom(stats,model.matrix)
24 | importFrom(stats,model.response)
25 | importFrom(stats,predict)
26 | importFrom(stats,rnorm)
27 | useDynLib(sgd)

--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
1 | # sgd 1.1.2
2 | 
3 | * Added a `NEWS.md` file to track changes to the package.

--------------------------------------------------------------------------------
/R/RcppExports.R:
--------------------------------------------------------------------------------
1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand
2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
3 | 
4 | run <- function(dataset, model_control, sgd_control) {
5 |   .Call('_sgd_run', PACKAGE = 'sgd', dataset, model_control, sgd_control)
6 | }
7 | 

--------------------------------------------------------------------------------
/R/coef.sgd.R:
--------------------------------------------------------------------------------
 1 | #' Extract Model Coefficients
 2 | #'
 3 | #' Extract model coefficients from \code{sgd} objects. \code{coefficients}
 4 | #' is an \emph{alias} for it.
 5 | #'
 6 | #' @param object object of class \code{sgd}.
 7 | #' @param \dots some methods for this generic require additional
 8 | #'   arguments. None are used in this method.
 9 | #'
10 | #' @return
11 | #' Coefficients extracted from the model object \code{object}.
12 | #'
13 | #' @export
14 | coef.sgd <- function(object, ...) {
15 |   return(as.vector(object$coefficients))
16 | }

--------------------------------------------------------------------------------
/R/data-winequality.R:
--------------------------------------------------------------------------------
 1 | #' Wine quality data of white wine samples from Portugal
 2 | #'
 3 | #' This dataset is a collection of white "Vinho Verde" wine
 4 | #' samples from the north of Portugal. Due to privacy and logistic
 5 | #' issues, only physicochemical (inputs) and sensory (the output)
 6 | #' variables are available (e.g. there is no data about grape types,
 7 | #' wine brand, wine selling price, etc.).
 8 | #'
 9 | #' @format A data frame with 4898 rows and 12 variables
10 | #' \itemize{
11 | #'   \item fixed acidity.
12 | #'   \item volatile acidity.
13 | #'   \item citric acid.
14 | #'   \item residual sugar.
15 | #'   \item chlorides.
16 | #'   \item free sulfur dioxide.
17 | #'   \item total sulfur dioxide.
18 | #'   \item density.
19 | #'   \item pH.
20 | #'   \item sulphates.
21 | #'   \item alcohol.
22 | #'   \item quality (score between 0 and 10).
23 | #' }
24 | #' @source \url{https://archive.ics.uci.edu/ml/datasets/Wine+Quality}
25 | "winequality"

--------------------------------------------------------------------------------
/R/fitted.sgd.R:
--------------------------------------------------------------------------------
 1 | #' Extract Model Fitted Values
 2 | #'
 3 | #' Extract fitted values from \code{sgd} objects.
 4 | #' \code{fitted.values} is an \emph{alias} for it.
 5 | #'
 6 | #' @param object object of class \code{sgd}.
 7 | #' @param \dots some methods for this generic require additional
 8 | #'   arguments. None are used in this method.
 9 | #'
10 | #' @return
11 | #' Fitted values extracted from the object \code{object}.
12 | #'
13 | #' @export
14 | fitted.sgd <- function(object, ...) {
15 |   return(object$fitted.values)
16 | }

--------------------------------------------------------------------------------
/R/plot.sgd.R:
--------------------------------------------------------------------------------
 1 | #' Plot objects of class \code{sgd}.
 2 | #'
 3 | #' @param x object of class \code{sgd}.
 4 | #' @param \dots additional arguments used for each type of plot. See
 5 | #'   \sQuote{Details}.
 6 | #' @param type character specifying the type of plot: \code{"mse"},
 7 | #'   \code{"clf"}, \code{"mse-param"}. See \sQuote{Details}. Default is
 8 | #'   \code{"mse"}.
 9 | #' @param xaxis character specifying the x-axis of plot: \code{"iteration"}
10 | #'   plots the y values over the log-iteration of the algorithm;
11 | #'   \code{"runtime"} plots the y values over the time in seconds to reach them.
12 | #'   Default is \code{"iteration"}.
13 | #'
14 | #' @details
15 | #' Types of plots available:
16 | #' \describe{
17 | #'   \item{\code{mse}}{Mean squared error in predictions, which takes the
18 | #'     following arguments:
19 | #'     \describe{
20 | #'       \item{\code{x_test}}{test set}
21 | #'       \item{\code{y_test}}{test responses to compare predictions to}
22 | #'     }}
23 | #'   \item{\code{clf}}{Classification error in predictions, which takes the
24 | #'     following arguments:
25 | #'     \describe{
26 | #'       \item{\code{x_test}}{test set}
27 | #'       \item{\code{y_test}}{test responses to compare predictions to}
28 | #'     }}
29 | #'   \item{\code{mse-param}}{Mean squared error in parameters, which takes the
30 | #'     following arguments:
31 | #'     \describe{
32 | #'       \item{\code{true_param}}{true vector of parameters to compare to}
33 | #'     }}
34 | #' }
35 | #'
36 | #' @export
37 | plot.sgd <- function(x, ..., type="mse", xaxis="iteration") {
38 |   plot <- choose_plot(type, xaxis)
39 |   return(plot(x, ...))
40 | }
41 | 
42 | #' @export
43 | #' @rdname plot.sgd
44 | plot.list <- function(x, ..., type="mse", xaxis="iteration") {
45 |   plot <- choose_plot(type, xaxis)
46 |   return(plot(x, ...))
47 | }
48 | 
49 | ################################################################################
50 | # Helper functions
51 | ################################################################################
52 | 
53 | choose_plot <- function(type, xaxis) {
54 |   if (type == "mse") {
55 |     if (xaxis == "iteration") {
56 |       return(plot_mse)
57 |     } else if (xaxis == "runtime") {
58 |       return(function(x, ...) plot_mse(x, ..., xaxis="Runtime (s)"))
 59 |     }
 60 |   } else if (type == "mse-param") {
 61 |     if (xaxis == "iteration") {
 62 |       return(plot_mse_param)
 63 |     } else if (xaxis == "runtime") {
 64 |       return(function(x, ...) plot_mse_param(x, ..., xaxis="Runtime (s)"))
 65 |     }
 66 |   } else if (type == "clf") {
 67 |     if (xaxis == "iteration") {
 68 |       return(plot_clf)
 69 |     } else if (xaxis == "runtime") {
 70 |       return(function(x, ...) plot_clf(x, ..., xaxis="Runtime (s)"))
 71 |     }
 72 |   }
 73 |   stop("'type' not recognized")
 74 | }
 75 | 
 76 | get_mse <- function(x, x_test, y_test) {
 77 |   mu <- predict_all(x, x_test)
 78 |   nests <- ncol(mu)
 79 |   mse <- rep(NA, nests)
 80 |   for (j in 1:nests) {
 81 |     mse[j] <- mean((mu[, j] - y_test)^2)
 82 |   }
 83 |   return(mse)
 84 | }
 85 | 
 86 | get_mse_param <- function(x, true_param) {
 87 |   nests <- ncol(x$estimates)
 88 |   mse <- rep(NA, nests)
 89 |   for (j in 1:nests) {
 90 |     mse[j] <- mean((x$estimates[, j] - true_param)^2)
 91 |   }
 92 |   return(mse)
 93 | }
 94 | 
 95 | plot_mse <- function(x, x_test, y_test, label="sgd", xaxis="log-Iteration") {
 96 |   if (toString(class(x)) != "list") {
 97 |     x <- list(x); names(x) <- label
 98 |   }
 99 |   dat <- data.frame()
100 |   for (i in 1:length(x)) {
101 |     mse <- get_mse(x[[i]], x_test, y_test)
102 |     temp_dat <- data.frame(y=mse,
103 |                            label=names(x)[i])
104 |     if (xaxis == "log-Iteration") {
105 |       temp_dat$x <- x[[i]]$pos
106 |     } else if (xaxis == "Runtime (s)") {
107 |       temp_dat$x <- x[[i]]$time
108 |     }
109 |     temp_dat <- temp_dat[!duplicated(temp_dat$x), ]
110 |     dat <- rbind(dat, temp_dat)
111 |   }
112 |   dat$label <- as.factor(dat$label)
113 | 
114 |   p <- generic_plot(dat, xaxis) +
115 |     ggplot2::scale_y_log10() +
116 |     ggplot2::labs(
117 |       title="Mean Squared Error",
118 |       x=xaxis,
119 |       y="")
120 |   return(p)
121 | }
122 | 
123 | plot_mse_param <- function(x, true_param, label="sgd", xaxis="log-Iteration") {
124 |   if (toString(class(x)) != "list") {
125 |     x <- list(x)
126 |     names(x) <- label
127 |   }
128 |   dat <- data.frame()
129 |   for (i in 1:length(x)) {
130 |     mse <- get_mse_param(x[[i]], true_param)
131 |     temp_dat <- data.frame(y=mse,
132 |                            label=names(x)[i])
133 |     if (xaxis == "log-Iteration") {
134 |       temp_dat$x <- x[[i]]$pos
135 |     } else if (xaxis == "Runtime (s)") {
136 |       temp_dat$x <- x[[i]]$time
137 |     }
138 |     temp_dat <- temp_dat[!duplicated(temp_dat$x), ]
139 |     dat <- rbind(dat, temp_dat)
140 |   }
141 |   dat$label <- as.factor(dat$label)
142 | 
143 |   p <- generic_plot(dat, xaxis) +
144 |     ggplot2::scale_y_continuous(
145 |       breaks=5 * 1:min((max(dat$y)/5), 100)) +
146 |     ggplot2::labs(
147 |       title="Mean Squared Error",
148 |       x=xaxis,
149 |       y="")
150 |   return(p)
151 | }
152 | 
153 | plot_clf <- function(x, x_test, y_test, label="sgd", xaxis="log-Iteration") {
154 |   if (toString(class(x)) != "list") {
155 |     x <- list(x)
156 |     names(x) <- label
157 |   }
158 |   dat <- data.frame()
159 |   for (i in 1:length(x)) {
160 |     pred <- predict_all(x[[i]], x_test)
161 |     pred <- (pred > 0.5) * 1
162 |     error <- colSums(pred != y_test) / nrow(pred)  # misclassification rate per stored estimate
163 |     temp_dat <- data.frame(y=error,
164 |                            label=names(x)[i])
165 |     if (xaxis == "log-Iteration") {
166 |       temp_dat$x <- x[[i]]$pos
167 |     } else if (xaxis == "Runtime (s)") {
168 |       temp_dat$x <- x[[i]]$time
169 |     }
170 |     temp_dat <- temp_dat[!duplicated(temp_dat$x), ]
171 |     dat <- rbind(dat, temp_dat)
172 |   }
173 |   dat$label <- as.factor(dat$label)
174 | 
175 |   p <- generic_plot(dat, xaxis) +
176 |     ggplot2::scale_y_continuous(
177 |       #limits=c(max(0, mean(dat$y)-2.5*sd(dat$y)),
178 |       #         min(1, mean(dat$y)+2*sd(dat$y))),
179 |       breaks=seq(0.05, 1, 0.05)) +
180 |     ggplot2::labs(
181 |       title="Classification Error",
182 |       x=xaxis,
183 |       y="")
184 |   return(p)
185 | }
186 | 
187 | generic_plot <- function(dat, xaxis) {
188 |   x <- NULL
189 |   y <- NULL
190 |   label <- NULL
191 |   p <- ggplot2::ggplot(dat, ggplot2::aes(x=x, y=y, group=label)) +
192 |     ggplot2::geom_line(ggplot2::aes(linetype=label, color=label)) +
193 |     ggplot2::theme(
194 |       panel.background=ggplot2::element_blank(),
195 |       panel.border=ggplot2::element_blank(),
196 |       panel.grid.major=ggplot2::element_blank(),
197 |       panel.grid.minor=ggplot2::element_blank(),
198 |       axis.line=ggplot2::element_line(color="black"),
199 |       legend.position=c(1, 1),
200 |       legend.justification = c(1, 1),
201 |       legend.title=ggplot2::element_blank(),
202 |       legend.key=ggplot2::element_blank(),
203 |       legend.background=ggplot2::element_rect(linetype="solid", color="black")
204 |     ) +
205 |     ggplot2::scale_fill_hue(l=50)
206 |   if (xaxis == "log-Iteration") {
207 |     p <- p +
208 |       ggplot2::scale_x_log10(
209 |         breaks=10^(1:log(max(dat$x), base=10)))
210 |   }
211 |   return(p)
212 | }

--------------------------------------------------------------------------------
/R/predict.sgd.R:
--------------------------------------------------------------------------------
 1 | #' Model Predictions
 2 | #'
 3 | #' Form predictions using the estimated model parameters from stochastic
 4 | #' gradient descent.
 5 | #'
 6 | #' @param object object of class \code{sgd}.
 7 | #' @param newdata design matrix to form predictions on
 8 | #' @param type the type of prediction required. The default "link" is
 9 | #'   on the scale of the linear predictors; the alternative '"response"'
10 | #'   is on the scale of the response variable. Thus for a default
11 | #'   binomial model the default predictions are of log-odds
12 | #'   (probabilities on logit scale) and 'type = "response"' gives the
13 | #'   predicted probabilities. The '"term"' option returns a matrix
14 | #'   giving the fitted values of each term in the model formula on the
15 | #'   linear predictor scale.
16 | #' @param \dots further arguments passed to or from other methods.
17 | #'
18 | #' @details
19 | #' A column of 1's must be included in \code{newdata} if the
20 | #' parameters include a bias (intercept) term.
21 | #'
22 | #' @export
23 | predict.sgd <- function(object, newdata, type="link", ...) {
24 |   if (!(object$model %in% c("lm", "glm", "m"))) {
25 |     stop("'model' not supported")
26 |   }
27 |   if (!(type %in% c("link", "response", "term"))) {
28 |     stop("'type' not recognized")
29 |   }
30 | 
31 |   if (object$model %in% c("lm", "glm")) {
32 |     if (type %in% c("link", "response")) {
33 |       eta <- newdata %*% coef(object)
34 |       if (type == "response") {
35 |         y <- object$model.out$family$linkinv(eta)
36 |         return(y)
37 |       }
38 |       return(eta)
39 |     }
40 |     eta <- newdata %*% diag(coef(object))
41 |     return(eta)
42 |   } else if (object$model == "m") {
43 |     if (type %in% c("link", "response")) {
44 |       eta <- newdata %*% coef(object)
45 |       if (type == "response") {
46 |         y <- eta
47 |         return(y)
48 |       }
49 |       return(eta)
50 |     }
51 |     eta <- newdata %*% diag(coef(object))
52 |     return(eta)
53 |   }
54 | }
55 | 
56 | #' @export
57 | #' @rdname predict.sgd
58 | predict_all <- function(object, newdata, ...) {
59 |   if (object$model %in% c("lm", "glm")) {
60 |     eta <- newdata %*% object$estimates
61 |     y <- object$model.out$family$linkinv(eta)
62 |   } else if (object$model == "m") {
63 |     eta <- newdata %*% object$estimates
64 |     y <- eta
65 |     # TODO
66 |   } else {
67 |     stop("'model' not recognized")
68 |   }
69 |   return(y)
70 | }

--------------------------------------------------------------------------------
/R/print.sgd.R:
--------------------------------------------------------------------------------
1 | #' Print objects of class \code{sgd}.
2 | #'
3 | #' @param x object of class \code{sgd}.
4 | #' @param \dots further arguments passed to or from other methods.
5 | #'
6 | #' @export
7 | print.sgd <- function(x, ...) {
8 |   print(coef(x), ...)
9 | }

--------------------------------------------------------------------------------
/R/residuals.sgd.R:
--------------------------------------------------------------------------------
 1 | #' Extract Model Residuals
 2 | #'
 3 | #' Extract model residuals from \code{sgd} objects. \code{resid} is an
 4 | #' \emph{alias} for it.
 5 | #'
 6 | #' @param object object of class \code{sgd}.
 7 | #' @param \dots some methods for this generic require additional
 8 | #'   arguments. None are used in this method.
 9 | #'
10 | #' @return
11 | #' Residuals extracted from the object \code{object}.
12 | #'
13 | #' @export
14 | residuals.sgd <- function(object, ...) {
15 |   return(object$residuals)
16 | }

--------------------------------------------------------------------------------
/R/sgd.R:
--------------------------------------------------------------------------------
 1 | #' Stochastic gradient descent
 2 | #'
 3 | #' Run stochastic gradient descent in order to optimize the induced loss
 4 | #' function given a model and data.
 5 | #'
 6 | #' @param formula an object of class \code{"\link{formula}"} (or one that can be
 7 | #'   coerced to that class): a symbolic description of the model to be fitted.
 8 | #'   The details can be found in \code{"\link{glm}"}.
 9 | #' @param data an optional data frame, list or environment (or object coercible
10 | #'   by \code{\link[base]{as.data.frame}} to a data frame) containing the
11 | #'   variables in the model. If not found in data, the variables are taken from
12 | #'   environment(formula), typically the environment from which \code{sgd} is called.
13 | #' @param model character specifying the model to be used: \code{"lm"} (linear
14 | #'   model), \code{"glm"} (generalized linear model), \code{"cox"} (Cox
15 | #'   proportional hazards model), \code{"gmm"} (generalized method of moments),
16 | #'   \code{"m"} (M-estimation). See \sQuote{Details}.
17 | #' @param model.control a list of parameters for controlling the model.
18 | #'   \describe{
19 | #'     \item{\code{family} (\code{"glm"})}{a description of the error distribution and
20 | #'       link function to be used in the model. This can be a character string
21 | #'       naming a family function, a family function or the result of a call to
22 | #'       a family function. (See \code{\link[stats]{family}} for details of
23 | #'       family functions.)}
24 | #'     \item{\code{rank} (\code{"glm"})}{logical. Should the rank of the design matrix
25 | #'       be checked?}
26 | #'     \item{\code{fn} (\code{"gmm"})}{a function \eqn{g(\theta,x)} which returns a
27 | #'       \eqn{k}-vector corresponding to the \eqn{k} moment conditions. It is a
28 | #'       required argument if \code{gr} not specified.}
29 | #'     \item{\code{gr} (\code{"gmm"})}{a function to return the gradient. If
30 | #'       unspecified, a finite-difference approximation will be used.}
31 | #'     \item{\code{nparams} (\code{"gmm"})}{number of model parameters. This is
32 | #'       automatically determined for other models.}
33 | #'     \item{\code{type} (\code{"gmm"})}{character specifying the generalized method of
34 | #'       moments procedure: \code{"twostep"} (Hansen, 1982), \code{"iterative"}
35 | #'       (Hansen et al., 1996). Defaults to \code{"iterative"}.}
36 | #'     \item{\code{wmatrix} (\code{"gmm"})}{weighting matrix to be used in the loss
37 | #'       function. Defaults to the identity matrix.}
38 | #'     \item{\code{loss} (\code{"m"})}{character specifying the loss function to be
39 | #'       used in the estimating equation. Default is the Huber loss.}
40 | #'     \item{\code{lambda1}}{L1 regularization parameter. Default is 0.}
41 | #'     \item{\code{lambda2}}{L2 regularization parameter. Default is 0.}
42 | #'   }
43 | #' @param sgd.control an optional list of parameters for controlling the estimation.
44 | #'   \describe{
45 | #'     \item{\code{method}}{character specifying the method to be used: \code{"sgd"},
46 | #'       \code{"implicit"}, \code{"asgd"}, \code{"ai-sgd"}, \code{"momentum"},
47 | #'       \code{"nesterov"}. Default is \code{"ai-sgd"}. See \sQuote{Details}.}
48 | #'     \item{\code{lr}}{character specifying the learning rate to be used:
49 | #'       \code{"one-dim"}, \code{"one-dim-eigen"}, \code{"d-dim"},
50 | #'       \code{"adagrad"}, \code{"rmsprop"}. Default is \code{"one-dim"}.
51 | #'       See \sQuote{Details}.}
52 | #'     \item{\code{lr.control}}{vector of scalar hyperparameters one can
53 | #'       set dependent on the learning rate. For hyperparameters aimed
54 | #'       to be left as default, specify \code{NA} in the corresponding
55 | #'       entries. See \sQuote{Details}.}
56 | #'     \item{\code{start}}{starting values for the parameter estimates. Default is
57 | #'       random initialization around zero.}
58 | #'     \item{\code{size}}{number of SGD estimates to store for diagnostic purposes
59 | #'       (distributed log-uniformly over total number of iterations)}
60 | #'     \item{\code{reltol}}{relative convergence tolerance. The algorithm stops
61 | #'       if it is unable to change the relative mean squared difference in the
62 | #'       parameters by more than this amount. Default is \code{1e-05}.}
63 | #'     \item{\code{npasses}}{the maximum number of passes over the data. Default
64 | #'       is 3.}
65 | #'     \item{\code{pass}}{logical. Should \code{reltol} be ignored and the
66 | #'       algorithm run for all of \code{npasses}?}
67 | #'     \item{\code{shuffle}}{logical. Should the algorithm shuffle the data set,
68 | #'       including before each pass?}
69 | #'     \item{\code{verbose}}{logical. Should the algorithm print progress?}
 70 | #'   }
 71 | #' @param \dots arguments to be used to form the default \code{sgd.control}
 72 | #'   arguments if it is not supplied directly.
 73 | #' @param x,y a design matrix and the respective vector of outcomes.
 74 | #'
 75 | #' @details
 76 | #' Models:
 77 | #' The Cox model assumes that the survival data is ordered when passed
 78 | #' in, i.e., such that the risk set of an observation i is all data points after
 79 | #' it.
 80 | #'
 81 | #' Methods:
 82 | #' \describe{
 83 | #'   \item{\code{sgd}}{stochastic gradient descent (Robbins and Monro, 1951)}
 84 | #'   \item{\code{implicit}}{implicit stochastic gradient descent (Toulis et al.,
 85 | #'     2014)}
 86 | #'   \item{\code{asgd}}{stochastic gradient with averaging (Polyak and Juditsky,
 87 | #'     1992)}
 88 | #'   \item{\code{ai-sgd}}{implicit stochastic gradient with averaging (Toulis et
 89 | #'     al., 2015)}
 90 | #'   \item{\code{momentum}}{"classical" momentum (Polyak, 1964)}
 91 | #'   \item{\code{nesterov}}{Nesterov's accelerated gradient (Nesterov, 1983)}
 92 | #' }
 93 | #'
 94 | #' Learning rates and hyperparameters:
 95 | #' \describe{
 96 | #'   \item{\code{one-dim}}{scalar value prescribed in Xu (2011) as
 97 | #'     \deqn{a_n = scale * gamma * (1 + alpha*gamma*n)^(-c)}
 98 | #'     where the defaults are
 99 | #'     \code{lr.control = (scale=1, gamma=1, alpha=1, c)}
100 | #'     where \code{c} is \code{1} if implemented without averaging,
101 | #'     \code{2/3} if with averaging}
102 | #'   \item{\code{one-dim-eigen}}{diagonal matrix
103 | #'     \code{lr.control = NULL}}
104 | #'   \item{\code{d-dim}}{diagonal matrix
105 | #'     \code{lr.control = (epsilon=1e-6)}}
106 | #'   \item{\code{adagrad}}{diagonal matrix prescribed in Duchi et al. (2011) as
107 | #'     \code{lr.control = (eta=1, epsilon=1e-6)}}
108 | #'   \item{\code{rmsprop}}{diagonal matrix prescribed in Tieleman and Hinton
109 | #'     (2012) as
110 | #'     \code{lr.control = (eta=1, gamma=0.9, epsilon=1e-6)}}
111 | #' }
112 | #'
113 | #' @return
114 | #' An object of class \code{"sgd"}, which is a list containing the following
115 | #' components:
116 | #'   \item{model}{name of the model}
117 | #'   \item{coefficients}{a named vector of coefficients}
118 | #'   \item{converged}{logical. Was the algorithm judged to have converged?}
119 | #'   \item{estimates}{estimates from algorithm stored at each iteration
120 | #'     specified in \code{pos}}
121 | #'   \item{fitted.values}{the fitted mean values}
122 | #'   \item{pos}{vector of indices specifying the iteration number each estimate
123 | #'     was stored for}
124 | #'   \item{residuals}{the residuals, that is response minus fitted values}
125 | #'   \item{times}{vector of times in seconds it took to complete the number of
126 | #'     iterations specified in \code{pos}}
127 | #'   \item{model.out}{a list of model-specific output attributes}
128 | #'
129 | #' @author Dustin Tran, Tian Lan, Panos Toulis, Ye Kuang, Edoardo Airoldi
130 | #' @references
131 | #' John Duchi, Elad Hazan, and Yoram Singer. Adaptive subgradient methods for
132 | #' online learning and stochastic optimization. \emph{Journal of Machine
133 | #' Learning Research}, 12:2121-2159, 2011.
134 | #'
135 | #' Yurii Nesterov. A method for solving a convex programming problem with
136 | #' convergence rate \eqn{O(1/k^2)}. \emph{Soviet Mathematics Doklady},
137 | #' 27(2):372-376, 1983.
138 | #'
139 | #' Boris T. Polyak. Some methods of speeding up the convergence of iteration
140 | #' methods. \emph{USSR Computational Mathematics and Mathematical Physics},
141 | #' 4(5):1-17, 1964.
142 | #'
143 | #' Boris T. Polyak and Anatoli B. Juditsky. Acceleration of stochastic
144 | #' approximation by averaging. \emph{SIAM Journal on Control and Optimization},
145 | #' 30(4):838-855, 1992.
146 | #'
147 | #' Herbert Robbins and Sutton Monro. A stochastic approximation method.
148 | #' \emph{The Annals of Mathematical Statistics}, pp. 400-407, 1951.
149 | #'
150 | #' Panos Toulis, Jason Rennie, and Edoardo M. Airoldi, "Statistical analysis of
151 | #' stochastic gradient methods for generalized linear models", In
152 | #' \emph{Proceedings of the 31st International Conference on Machine Learning},
153 | #' 2014.
154 | #'
155 | #' Panos Toulis, Dustin Tran, and Edoardo M. Airoldi, "Stability and optimality
156 | #' in stochastic gradient descent", arXiv preprint arXiv:1505.02417, 2015.
157 | #'
158 | #' Wei Xu. Towards optimal one pass large scale learning with averaged
159 | #' stochastic gradient descent. arXiv preprint arXiv:1107.2490, 2011.
160 | #'
161 | #'
162 | #' @examples
163 | #' ## Linear regression
164 | #' set.seed(42)
165 | #' N <- 1e4
166 | #' d <- 5
167 | #' X <- matrix(rnorm(N*d), ncol=d)
168 | #' theta <- rep(5, d+1)
169 | #' eps <- rnorm(N)
170 | #' y <- cbind(1, X) %*% theta + eps
171 | #' dat <- data.frame(y=y, x=X)
172 | #' sgd.theta <- sgd(y ~ ., data=dat, model="lm")
173 | #' sprintf("Mean squared error: %0.3f", mean((theta - as.numeric(sgd.theta$coefficients))^2))
174 | #'
175 | #'
176 | #' @useDynLib sgd
177 | #' @import MASS
178 | #' @importFrom methods new
179 | #' @importFrom Rcpp evalCpp
180 | #' @importFrom stats gaussian is.empty.model model.matrix model.response rnorm coef fitted predict
181 | 
182 | ################################################################################
183 | # Classes
184 | ################################################################################
185 | #' @export
186 | sgd <- function(x, ...) UseMethod("sgd")
187 | 
188 | ################################################################################
189 | # Methods
190 | ################################################################################
191 | 
192 | #' @export
193 | sgd.default <- function(x, ...) {
194 |   stop("class of x is not a formula, function, or matrix")
195 | }
196 | 
197 | #' @export
198 | #' @rdname sgd
199 | sgd.formula <- function(formula, data, model,
200 |                         model.control=list(),
201 |                         sgd.control=list(...),
202 |                         ...) {
203 |   call <- match.call()  # set call function to match on arguments
204 |   # 1. Validity check.
205 |   if (missing(data)) {
206 |     data <- environment(formula)
207 |   }
208 | 
209 |   # 2. Build X and Y according to the formula.
210 |   mf <- match.call(expand.dots=FALSE)
211 |   m <- match(c("formula", "data"), names(mf), 0L)
212 |   mf <- mf[c(1L, m)]
213 |   mf$drop.unused.levels <- TRUE
214 |   mf[[1L]] <- quote(stats::model.frame)
215 |   mf <- eval(mf, parent.frame())
216 | 
217 |   Y <- model.response(mf, "any")
218 |   if (length(dim(Y)) == 1L) {
219 |     nm <- rownames(Y)
220 |     dim(Y) <- NULL
221 |     if (!is.null(nm)) {
222 |       names(Y) <- nm
223 |     }
224 |   }
225 | 
226 |   mt <- attr(mf, "terms")
227 |   if (!is.empty.model(mt)) {
228 |     X <- model.matrix(mt, mf)
229 |   } else {
230 |     X <- matrix(, NROW(Y), 0L)
231 |   }
232 | 
233 |   # 3. Pass into sgd.matrix().
234 |   return(sgd.matrix(X, Y, model, model.control, sgd.control))
235 | }
236 | 
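A usage note on the method defined next (an illustrative sketch by the editor, not part of the package source): the matrix method bypasses formula parsing, so an intercept column must be supplied explicitly in the design matrix.

```r
library(sgd)
set.seed(42)
X <- matrix(rnorm(1e4 * 5), ncol=5)
theta <- rep(5, 6)
y <- cbind(1, X) %*% theta + rnorm(1e4)
# Design-matrix interface: include the column of 1's yourself.
fit <- sgd(cbind(1, X), y, model="lm")
mean((coef(fit) - theta)^2)
```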
237 | #' @export
238 | #' @rdname sgd
239 | sgd.matrix <- function(x, y, model,
240 |                        model.control=list(),
241 |                        sgd.control=list(...),
242 |                        ...) {
243 |   call <- match.call()  # set call function to match on arguments
244 |   if (missing(x)) {
245 |     stop("'x' not specified")
246 |   }
247 |   if (missing(y)) {
248 |     stop("'y' not specified")
249 |   }
250 |   if (missing(model)) {
251 |     stop("'model' not specified")
252 |   }
253 |   if (!is.list(model.control)) {
254 |     stop("'model.control' is not a list")
255 |   }
256 |   model.control <- do.call("valid_model_control",
257 |                            c(model.control, model=model, d=ncol(x)))
258 |   if (!is.list(sgd.control)) {
259 |     stop("'sgd.control' is not a list")
260 |   }
261 |   sgd.control <- do.call("valid_sgd_control",
262 |                          c(sgd.control, N=NROW(y), nparams=model.control$nparams))
263 | 
264 |   return(fit(x, y, model, model.control, sgd.control))
265 | }
266 | 
267 | #' @export
268 | #' @rdname sgd
269 | # TODO y should be allowed to be a big matrix too; it should be any combination
270 | # (x is a big matrix, y is, etc.)
271 | sgd.big.matrix <- function(x, y, model,
272 |                            model.control=list(),
273 |                            sgd.control=list(...),
274 |                            ...) {
275 |   return(sgd.matrix(x, y, model, model.control, sgd.control))
276 | }
277 | 
278 | ################################################################################
279 | # Helper functions
280 | ################################################################################
281 | 
282 | fit <- function(x, y, model,
283 |                 model.control,
284 |                 sgd.control) {
285 |   #time_start <- proc.time()[3] # TODO timer only starts here
286 |   # TODO
287 |   if (model == "gmm") {
288 |     if (sgd.control$method %in% c("implicit", "ai-sgd")) {
289 |       stop("implicit methods not implemented yet")
290 |     }
291 |   }
292 | 
293 |   if (model %in% c("lm", "glm")) {
294 |     model.control$transfer <- transfer_name(model.control$family$link)
295 |     family <- model.control$family
296 |     model.control$family <- family$family
297 |     # Enable logistic regression if response is binary factor.
298 |     if (is.factor(y) && family$family == "binomial") {
299 |       y <- as.integer(as.character(y))
300 |     }
301 |   }
302 | 
303 |   dataset <- list(X=x, Y=as.matrix(y))
304 |   if ('big.matrix' %in% class(x)) {
305 |     dataset$big <- TRUE
306 |     dataset[["bigmat"]] <- x@address
307 |   } else {
308 |     dataset$big <- FALSE
309 |     dataset[["bigmat"]] <- new("externalptr")
310 |   }
311 | 
312 |   if (sgd.control$verbose) {
313 |     print("Completed pre-processing attributes...")
314 |     print("Running C++ algorithm...")
315 |   }
316 |   out <- run(dataset, model.control, sgd.control)
317 |   if (sgd.control$verbose) {
318 |     print("Completed C++ algorithm...")
319 |   }
320 |   if (length(out) == 0) {
321 |     stop("An error has occurred, program stopped")
322 |   }
323 |   class(out) <- "sgd"
324 |   if (model %in% c("lm", "glm")) {
325 |     out$model.out$transfer <- model.control$transfer
326 |     out$model.out$family <- family
327 |   }
328 |   out$pos <- as.vector(out$pos)
329 |   #out$times <- as.vector(out$times) + (proc.time()[3] - time_start) # C++ time + R time
330 |   out$times <- as.vector(out$times)
331 |   out$fitted.values <- predict(out, x, type="response")
332 |   out$residuals <- y - fitted(out)
333 |   return(out)
334 | }
335 | 
336 | valid_model_control <- function(model, model.control=list(...), ...) {
337 |   # Run validity check of arguments passed to model.control given model. It
338 |   # passes defaults to those unspecified and converts to the correct type if
339 |   # possible; otherwise it errors.
340 |   # Check validity of regularization parameters.
341 |   lambda1 <- model.control$lambda1
342 |   if (is.null(lambda1)) {
343 |     lambda1 <- 0
344 |   } else if (!is.numeric(lambda1)) {
345 |     stop("'lambda1' must be numeric")
346 |   } else if (length(lambda1) != 1) {
347 |     stop(gettextf("length of 'lambda1' should equal %d", 1), domain=NA)
348 |   }
349 |   lambda2 <- model.control$lambda2
350 |   if (is.null(lambda2)) {
351 |     lambda2 <- 0
352 |   } else if (!is.numeric(lambda2)) {
353 |     stop("'lambda2' must be numeric")
354 |   } else if (length(lambda2) != 1) {
355 |     stop(gettextf("length of 'lambda2' should equal %d", 1), domain=NA)
356 |   }
357 |   nparams <- model.control$d
358 |   # Set family to gaussian for linear model.
359 |   if (model == "lm") {
360 |     model.control$family <- gaussian()
361 |   }
362 |   if (model %in% c("lm", "glm")) {
363 |     control.family <- model.control$family
364 |     control.rank <- model.control$rank
365 |     control.trace <- model.control$trace
366 |     control.deviance <- model.control$deviance
367 |     # Check validity of family.
368 |     if (is.null(control.family)) {
369 |       control.family <- gaussian()
370 |     } else if (is.character(control.family)) {
371 |       control.family <- get(control.family, mode="function", envir=parent.frame())()
372 |     } else if (is.function(control.family)) {
373 |       control.family <- control.family()
374 |     } else if (is.null(control.family$family)) {
375 |       print(control.family)
376 |       stop("'family' not recognized")
377 |     }
378 |     # Check validity of rank.
379 |     if (is.null(control.rank)) {
380 |       control.rank <- FALSE
381 |     } else if (!is.logical(control.rank)) {
382 |       stop("'rank' not logical")
383 |     }
384 |     # Check validity of trace.
385 |     if (is.null(control.trace)) {
386 |       control.trace <- FALSE
387 |     } else if (!is.logical(control.trace)) {
388 |       stop("'trace' not logical")
389 |     }
390 |     # Check validity of deviance.
391 |     if (is.null(control.deviance)) {
392 |       control.deviance <- FALSE
393 |     } else if (!is.logical(control.deviance)) {
394 |       stop("'deviance' not logical")
395 |     }
396 |     return(list(
397 |       name=model,
398 |       family=control.family,
399 |       rank=control.rank,
400 |       trace=control.trace,
401 |       deviance=control.deviance,
402 |       nparams=nparams,
403 |       lambda1=lambda1,
404 |       lambda2=lambda2))
405 |   } else if (model == "cox") {
406 |     return(list(
407 |       name=model,
408 |       nparams=nparams,
409 |       lambda1=lambda1,
410 |       lambda2=lambda2))
411 |   } else if (model == "gmm") {
412 |     control.fn <- model.control$fn
413 |     control.gr <- model.control$gr
414 |     control.nparams <- model.control$nparams
415 |     control.type <- model.control$type
416 |     control.wmatrix <- model.control$wmatrix
417 |     # Check validity of moment function and its gradient.
418 |     if (is.null(control.fn) && is.null(control.gr)) {
419 |       stop("either 'fn' or 'gr' must be specified")
420 |     } else if (!is.null(control.fn) && !is.function(control.fn)) {
421 |       stop("'fn' not a function")
422 |     } else if (!is.null(control.gr) && !is.function(control.gr)) {
423 |       stop("'gr' not a function")
424 |     } else if (!is.null(control.fn) && is.null(control.gr)) {
425 |       # Default to numerical gradient via central differences.
426 |       #library(numDeriv)
427 |       # TODO probably does not work
428 |       control.gr <- function(x, fn=control.fn) {
429 |         d <- length(x)
430 |         h <- 1e-5
431 |         out <- rep(0, d)
432 |         for (i in 1:d) {
433 |           ei <- c(rep(0, i-1), h, rep(0, d-i))
434 |           out[i] <- (fn(x + ei) - fn(x - ei)) / (2*h)
435 |         }
436 |         return(out)
437 |       }
438 |     }
439 |     # Check validity of nparams.
440 |     if (is.null(control.nparams)) {
441 |       stop("'nparams' not specified")
442 |     } else if (!is.numeric(control.nparams) ||
443 |                control.nparams - as.integer(control.nparams) != 0 ||
444 |                control.nparams < 1) {
445 |       stop("'nparams' must be a positive integer")
446 |     }
447 |     # Check validity of GMM type.
448 |     if (is.null(control.type)) {
449 |       control.type <- "iterative"
450 |     } else if (!is.character(control.type)) {
451 |       stop("'type' must be a string")
452 |     # TODO implement cuee
453 |     } else if (!(control.type %in% c("twostep", "iterative", "cuee"))) {
454 |       stop("'type' not recognized")
455 |     }
456 |     # Check validity of weighting matrix.
457 |     if (is.null(control.wmatrix)) {
458 |       # do nothing, since will not store large matrix in R but in C++
459 |     } else if (!is.matrix(control.wmatrix)) {
460 |       stop("'wmatrix' not a matrix")
461 |     # TODO check if dimensions are same as moment conditions
462 |     #} else if (identical(dim(control.wmatrix), c(k,k))) {
463 |     }
464 |     return(list(
465 |       name=model,
466 |       gr=control.gr,
467 |       type=control.type,
468 |       nparams=control.nparams,
469 |       lambda1=lambda1,
470 |       lambda2=lambda2))
471 |   } else if (model == "m") {
472 |     control.loss <- model.control$loss
473 |     if (is.null(control.loss)) {
474 |       control.loss <- "huber"
475 |     } else if (!is.character(control.loss)) {
476 |       stop("'model.control$loss' must be a string")
477 |     } else if (control.loss != "huber") {
478 |       stop("'loss' not available")
479 |     }
480 |     return(list(
481 |       name=model,
482 |       loss=control.loss,
483 |       nparams=nparams,
484 |       lambda1=lambda1,
485 |       lambda2=lambda2))
486 |   } else {
487 |     stop("'model' not recognized")
488 |   }
489 | }
490 | 
491 | valid_sgd_control <- function(method="ai-sgd", lr="one-dim",
492 |                               lr.control=NULL,
493 |                               start=rnorm(nparams, mean=0, sd=1e-5),
494 |                               size=100,
495 |                               reltol=1e-5, npasses=3, pass=F,
496 |                               shuffle=F, verbose=F,
497 |                               truth=NULL, check=F,
498 |                               N, nparams, ...) {
499 |   # The following are internal parameters that can be used but aren't written in
500 |   # the documentation for succinctness:
501 |   #   check: logical, specifying whether to check against \code{truth} for
502 |   #     convergence instead of using reltol
503 |   #   truth: true set of parameters
504 |   # TODO size isn't the correct thing since reltol means you don't know when it
505 |   # ends. user should specify how often to store the iterates (how many per
506 |   # iteration)
507 |   # Run validity check of arguments passed to sgd.control. It passes defaults to
508 |   # those unspecified and converts to the correct type if possible; otherwise it
509 |   # errors.
510 |   # Check validity of method.
511 |   if (!is.character(method)) {
512 |     stop("'method' must be a string")
513 |   } else if (!(method %in% c("sgd", "implicit", "asgd", "ai-sgd", "momentum",
514 |                              "nesterov"))) {
515 |     stop("'method' not recognized")
516 |   }
517 | 
518 |   # Check validity of learning rate.
519 |   lrs <- c("one-dim", "one-dim-eigen", "d-dim", "adagrad", "rmsprop")
520 |   if (is.numeric(lr)) {
521 |     if (lr < 1 | lr > length(lrs)) {
522 |       stop("'lr' out of range")
523 |     }
524 |     lr <- lrs[lr]
525 |   } else if (is.character(lr)) {
526 |     lr <- tolower(lr)
527 |     if (!(lr %in% lrs)) {
528 |       stop("'lr' not recognized")
529 |     }
530 |   } else {
531 |     stop("invalid 'lr'")
532 |   }
533 | 
534 |   # Check validity of lr.control.
535 |   if (!is.null(lr.control) && !is.numeric(lr.control)) {
536 |     stop("'lr.control' must be numeric")
537 |   } else if (lr == "one-dim") {
538 |     if (method %in% c("asgd", "ai-sgd")) {
539 |       c <- 2/3
540 |     } else {
541 |       c <- 1
542 |     }
543 |     defaults <- c(1, 1, 1, c)
544 |     if (is.null(lr.control)) {
545 |       lr.control <- defaults
546 |     } else if (length(lr.control) != 4) {
547 |       stop(gettextf("length of 'lr.control' should equal %d", 4), domain=NA)
548 |     }
549 |     missing <- which(is.na(lr.control))
550 |     lr.control[missing] <- defaults[missing]
551 |   } else if (lr == "one-dim-eigen") {
552 |     if (is.null(lr.control)) {
553 |       lr.control <- 0  # garbage number to store double in C++
554 |     } else if (length(lr.control) != 0) {
555 |       stop(gettextf("length of 'lr.control' should equal %d", 0), domain=NA)
556 |     }
557 |   } else if (lr == "d-dim") {
558 |     defaults <- 1e-6
559 |     if (is.null(lr.control)) {
560 |       lr.control <- defaults
561 |     } else if (length(lr.control) != 1) {
562 |       stop(gettextf("length of 'lr.control' should equal %d", 1), domain=NA)
563 |     }
564 |     missing <- which(is.na(lr.control))
565 |     lr.control[missing] <- defaults[missing]
566 |   } else if (lr == "adagrad") {
567 |     defaults <- c(1, 1e-6)
568 |     if (is.null(lr.control)) {
569 |       lr.control <- defaults
570 |     } else if (length(lr.control) != 2) {
571 |       stop(gettextf("length of 'lr.control' should equal %d", 2), domain=NA)
572 |     }
573 |     missing <- which(is.na(lr.control))
574 |     lr.control[missing] <- defaults[missing]
575 |   } else if (lr == "rmsprop") {
576 |     defaults <- c(1, 0.9, 1e-6)
577 |     if (is.null(lr.control)) {
578 |       lr.control <- defaults
579 |     } else if (length(lr.control) != 3) {
580 |       stop(gettextf("length of 'lr.control' should equal %d", 3), domain=NA)
581 |     }
582 |     missing <- which(is.na(lr.control))
583 |     lr.control[missing] <- defaults[missing]
584 |   }
585 | 
586 |   # Check validity of start.
587 |   if (!is.numeric(start)) {
588 |     stop("'start' must be numeric")
589 |   } else if (length(start) != nparams) {
590 |     stop(gettextf("length of 'start' should equal %d", nparams), domain=NA)
591 |   }
592 | 
593 |   # Check validity of size.
594 |   if (!is.numeric(size) || size - as.integer(size) != 0 || size < 1) {
595 |     stop("'size' must be positive integer")
596 |   }
597 | 
598 |   # Check validity of reltol.
599 |   if (!is.numeric(reltol)) {
600 |     stop("'reltol' must be numeric")
601 |   } else if (length(reltol) != 1) {
602 |     stop("'reltol' must be scalar")
603 |   }
604 | 
605 |   # Check validity of npasses.
606 |   if (!is.numeric(npasses) || npasses - as.integer(npasses) != 0 || npasses < 1) {
607 |     stop("'npasses' must be positive integer")
608 |   }
609 | 
610 |   # Check validity of pass.
611 |   if (!is.logical(pass)) {
612 |     stop("'pass' must be logical")
613 |   }
614 | 
615 |   # Check validity of shuffle.
616 |   if (!is.logical(shuffle)) {
617 |     stop("'shuffle' must be logical")
618 |   }
619 | 
620 |   # Check validity of verbose.
621 |   if (!is.logical(verbose)) {
622 |     stop("'verbose' must be logical")
623 |   }
624 | 
625 |   # Check validity of additional arguments if the method is implicit.
626 |   if (method %in% c("implicit", "ai-sgd")) {
627 |     call <- match.call()
628 |     implicit.control <- do.call("valid_implicit_control", list(...))
629 |   } else {
630 |     implicit.control <- NULL
631 |   }
632 | 
633 |   # TODO they should be vectors in C++, not requiring conversion
634 |   start <- as.matrix(start)
635 |   if (check) {
636 |     truth <- as.matrix(truth)
637 |   }
638 | 
639 |   return(c(list(method=method,
640 |                 lr=lr,
641 |                 lr.control=lr.control,
642 |                 start=start,
643 |                 size=size,
644 |                 reltol=reltol,
645 |                 npasses=npasses,
646 |                 pass=pass,
647 |                 shuffle=shuffle,
648 |                 verbose=verbose,
649 |                 check=check,
650 |                 truth=truth,
651 |                 nparams=nparams),
652 |            implicit.control))
653 | }
654 | 
655 | valid_implicit_control <- function(delta=30L, ...) {
656 |   # Maintain control parameters for running implicit SGD. Pass defaults
657 |   # if unspecified.
658 |   #
659 |   # Args:
660 |   #   delta: convergence criterion for the one-dimensional optimization
661 |   args <- list(...)
662 |   if (!is.null(names(args))) {
663 |     stop("Invalid args passed into sgd.control through dots")
664 |   }
665 |   if (!is.numeric(delta) || delta - as.integer(delta) != 0 || delta <= 0) {
666 |     stop("value of 'delta' must be integer > 0")
667 |   }
668 |   return(list(delta=delta))
669 | }
670 | 
671 | transfer_name <- function(link.name) {
672 |   if (!is.character(link.name)) {
673 |     stop("link name must be a string")
674 |   }
675 |   link.names <- c("identity", "log", "logit", "inverse")
676 |   transfer.names <- c("identity", "exp", "logistic", "inverse")
677 |   transfer.idx <- which(link.names == link.name)
678 |   if (length(transfer.idx) == 0) {
679 |     stop("no matching link function found")
680 |   }
681 |   return(transfer.names[transfer.idx])
682 | }

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # sgd
 2 | 
 3 | sgd is an R package for large
 4 | scale estimation. It features many stochastic gradient methods, built-in models,
 5 | visualization tools, automated hyperparameter tuning, model checking, interval
 6 | estimation, and convergence diagnostics.
 7 | 
 8 | ## Features
 9 | At the core of the package is the function
10 | ```{R}
11 | sgd(formula, data, model, model.control, sgd.control)
12 | ```
13 | It estimates parameters for a given data set and model using stochastic gradient
14 | descent. The optional arguments `model.control` and `sgd.control` specify
15 | attributes about the model and stochastic gradient method. Taking advantage of
16 | the bigmemory package, sgd also operates on data sets which are too large to fit
17 | in RAM as well as streaming data.
18 | 
19 | Example of large-scale linear regression:
20 | ```{R}
21 | library(sgd)
22 | 
23 | # Dimensions
24 | N <- 1e5  # number of data points
25 | d <- 1e2  # number of features
26 | 
27 | # Generate data.
28 | X <- matrix(rnorm(N*d), ncol=d)
29 | theta <- rep(5, d+1)
30 | eps <- rnorm(N)
31 | y <- cbind(1, X) %*% theta + eps
32 | dat <- data.frame(y=y, x=X)
33 | 
34 | sgd.theta <- sgd(y ~ ., data=dat, model="lm")
35 | ```
36 | 
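A GLM works the same way via `model="glm"` and `model.control`; a minimal sketch along the lines of `demo/glm-logistic-regression.R`, reusing `X`, `theta`, `eps`, and `N` from the example above:

```{R}
# Logistic regression on simulated binary data (sketch).
p <- 1 / (1 + exp(-(cbind(1, X) %*% theta + eps)))
y <- rbinom(N, 1, p)
dat <- data.frame(y=y, x=X)
sgd.theta <- sgd(y ~ ., data=dat, model="glm",
                 model.control=list(family="binomial"))
```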
37 | Any loss function may be specified. For convenience the following are
38 | built-in:
39 | * Linear models
40 | * Generalized linear models
41 | * Method of moments
42 | * Generalized method of moments
43 | * Cox proportional hazards model
44 | * M-estimation
45 | 
46 | The following stochastic gradient methods exist:
47 | * (Standard) stochastic gradient descent
48 | * Implicit stochastic gradient descent
49 | * Averaged stochastic gradient descent
50 | * Averaged implicit stochastic gradient descent
51 | * Classical momentum
52 | * Nesterov's accelerated gradient
53 | 
54 | Check out the vignette in [`vignettes/`](vignettes/) or examples in [`demo/`](demo/).
55 | In R, the equivalent commands are `vignette(package="sgd")` and
56 | `demo(package="sgd")`.
57 | 
58 | ## Installation
59 | To install the latest version from CRAN:
60 | ```{R}
61 | install.packages("sgd")
62 | ```
63 | 
64 | To install the latest development version from Github:
65 | ```{R}
66 | # install.packages("devtools")
67 | devtools::install_github("airoldilab/sgd")
68 | ```
69 | 
70 | ## Authors
71 | sgd is written by [Dustin Tran](http://dustintran.com), [Junhyung Lyle Kim](https://jlylekim.github.io/) and
72 | [Panos Toulis](https://www.ptoulis.com/). Please feel free to contribute by
73 | submitting any issues or requests—or by solving any current issues!
74 | 
75 | We thank all other members of the Airoldi Lab (led by Prof. Edo Airoldi) for their feedback and contributions.
76 | 
77 | ## Citation
78 | 
79 | ```
80 | @article{tran2015stochastic,
81 |   author = {Tran, Dustin and Toulis, Panos and Airoldi, Edoardo M},
82 |   title = {Stochastic gradient descent methods for estimation with large data sets},
83 |   journal = {arXiv preprint arXiv:1509.06459},
84 |   year = {2015}
85 | }
86 | ```

--------------------------------------------------------------------------------
/cran-comments.md:
--------------------------------------------------------------------------------
 1 | ## Test environments
 2 | 
 3 | * local OS X install, R 4.3.2
 4 | * win-builder (devel)
 5 | 
 6 | ## R CMD check results
 7 | There were no ERRORs or WARNINGs.
 8 | 
 9 | There was 1 NOTE:
10 | 
11 |   checking CRAN incoming feasibility (8.6s)
12 |   Maintainer: ‘Junhyung Lyle Kim <jlylekim@gmail.com>’
13 | 
14 |   New submission
15 | 
16 |   Package was archived on CRAN
17 | 
18 |   CRAN repository db overrides:
19 |     X-CRAN-Comment: Archived on 2024-01-30 as check problems were not
20 |       corrected in time.
21 | 
22 | -> This is the new submission after fixing the check problems.
23 | 
24 | 

--------------------------------------------------------------------------------
/data/winequality.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/airoldilab/sgd/92246a3e1d81c9cd3077b80cacb6a3f7ee2425c5/data/winequality.rda

--------------------------------------------------------------------------------
/demo/00Index:
--------------------------------------------------------------------------------
1 | bench-corr-linear-regression    Benchmark linear regression on correlated normal data
2 | bench-linear-regression         Benchmark linear regression on normal data
3 | bench-logistic-wine             Benchmark logistic regression on wine quality data set
4 | cox-regression                  Demo Cox regression on simulated data
5 | glm-logistic-regression         Demo logistic regression on Bernoulli data
6 | glm-poisson-regression          Demo Poisson regression on Poisson data
7 | linear-regression               Demo linear regression on normal data
8 | m-estimation                    Demo M-estimation with Huber loss on contaminated normal data
9 | normal-method-of-moments        Demo method of moments for estimating normal parameters

--------------------------------------------------------------------------------
/demo/bench-corr-linear-regression.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Benchmark sgd package for linear regression on simulated data from a
 3 | # correlated normal distribution. This follows the experiment in Section 5.1 of
 4 | # Friedman et al. (2010).
 5 | #
 6 | # Data generating process:
 7 | #   Y = sum_{j=1}^p X_j*beta_j + k*eps, where
 8 | #     X ~ Multivariate normal where each covariate Xj, Xj' has equal correlation
 9 | #       rho; rho ranges over (0,0.1,0.2,0.5,0.9,0.95) for each pair (n, d)
10 | #     beta_j = (-1)^j exp(-2(j-1)/20)
11 | #     eps ~ Normal(0,1)
12 | #     k = 3
13 | #
14 | # Dimensions:
15 | #   n=1000, d=100
16 | #   n=5000, d=100
17 | #   n=100, d=1000
18 | #   n=100, d=5000
19 | #   n=100, d=20000
20 | #   n=100, d=50000
21 | 
22 | library(sgd)
23 | library(glmnet)
24 | library(microbenchmark)
25 | 
26 | # Function taken from Friedman et al.
27 | genx = function(n,p,rho){
28 |   # generate x's multivariate normal with equal corr rho
29 |   # Xi = b Z + Wi, and Z, Wi are independent normal.
30 |   # Then Var(Xi) = b^2 + 1
31 |   # Cov(Xi, Xj) = b^2 and so cor(Xi, Xj) = b^2 / (1+b^2) = rho
32 |   z=rnorm(n)
33 |   if(abs(rho)<1){
34 |     beta=sqrt(rho/(1-rho))
35 |     x=matrix(rnorm(n*p),ncol=p)
36 |     A = matrix(z, nrow=n, ncol=p, byrow=F)
37 |     x= beta * A + x
38 |   }
39 |   if(abs(rho)==1){ x=matrix(z,nrow=n,ncol=p,byrow=F)}
40 | 
41 |   return(x)
42 | }
43 | 
44 | # Dimensions
45 | Ns <- c(1000, 5000, 100, 100, 100, 100)
46 | ds <- c(100, 100, 1000, 5000, 20000, 50000)
47 | rhos <- c(0, 0.1, 0.2, 0.5, 0.9, 0.95)
48 | 
49 | # Explicit SGD errors out during the loop for higher correlations, so restrict rho.
50 | rhos <- c(0, 0.1, 0.2)
51 | 
52 | set.seed(42)
53 | benchmark <- list()
54 | idx <- 1
55 | for (i in 1:length(Ns)) {
56 |   for (j in 1:length(rhos)) {
57 |     N <- Ns[i]
58 |     d <- ds[i]
59 |     rho <- rhos[j]
60 |     # Generate data.
61 |     X <- genx(N, d, rho)
62 |     theta <- ((-1)^(1:d))*exp(-2*((1:d)-1)/20)
63 |     eps <- rnorm(N)
64 |     k <- 3
65 |     y <- X %*% theta + k * eps
66 | 
67 |     # Benchmark!
68 |     benchmark[[idx]] <- microbenchmark(
69 |       aisgd=sgd(X, y, model="lm",
70 |                 sgd.control=list(method="ai-sgd", npasses=1, pass=T)),
71 |       sgd=sgd(X, y, model="lm",
72 |               sgd.control=list(method="sgd", npasses=1, pass=T)),
73 |       glmnet=glmnet(X, y, alpha=1, standardize=FALSE, type.gaussian="covariance"),
74 |       times=10L, unit="s"
75 |     )
76 |     names(benchmark)[idx] <- sprintf("N: %i; d: %i; rho: %0.2f", N, d, rho)
77 |     idx <- idx + 1
78 |   }
79 | }
80 | 
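One way to tabulate the results collected above (an editor's sketch; `benchmark` is the list built in the loop, and `summary()` on a `microbenchmark` object returns a data frame with `expr` and `median` columns):

```r
# Median seconds per method, one row per (N, d, rho) setting (sketch).
t(sapply(benchmark, function(b) {
  s <- summary(b)
  setNames(s$median, as.character(s$expr))
}))
```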
--------------------------------------------------------------------------------
/demo/bench-linear-regression.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Benchmark sgd package for linear regression on simulated data from a normal
 3 | # distribution.
 4 | #
 5 | # Data generating process:
 6 | #   Y = X %*% theta + epsilon, where
 7 | #     X ~ Normal(0, 1)
 8 | #     theta = (5,...,5)
 9 | #     epsilon ~ Normal(0,1)
10 | #
11 | # Dimensions:
12 | #   N=1e5 observations
13 | #   d=1e2 parameters
14 | 
15 | library(sgd)
16 | library(glmnet)
17 | library(microbenchmark)
18 | 
19 | # Dimensions
20 | N <- 1e5
21 | d <- 1e2
22 | 
23 | # Generate data.
24 | set.seed(42)
25 | X <- matrix(rnorm(N*d), ncol=d)
26 | theta <- rep(5, d+1)
27 | eps <- rnorm(N)
28 | y <- cbind(1, X) %*% theta + eps
29 | dat <- data.frame(y=y, x=X)
30 | 
31 | # Benchmark!
32 | benchmark <- microbenchmark(
33 |   sgd=sgd(y ~ ., data=dat, model="lm",
34 |           sgd.control=list(method="implicit")),
35 |   lm=lm(y ~ ., data=dat),
36 |   glmnet=glmnet(X, y, alpha=1, standardize=FALSE, type.gaussian="naive"),
37 |   times=10L
38 | )
39 | benchmark
40 | ## Output (for 2.6 GHz, Intel Core i5)
41 | ## Unit: milliseconds
42 | ##    expr       min        lq      mean    median        uq       max neval
43 | ##     sgd  644.8761  676.8018  740.3485  733.2575  776.6705  918.4465    10
44 | ##      lm 1467.7145 1566.2102 1648.5728 1608.2134 1773.2074 1816.3088    10
45 | ##  glmnet 2392.7664 2437.7863 2636.2133 2600.4307 2789.2635 3091.6138    10
46 | 
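Timing aside, the estimates themselves can be checked directly against `lm()` (an editor's sketch reusing `dat` from the script above):

```r
# Fit once outside the benchmark and compare coefficient estimates (sketch).
fit.sgd <- sgd(y ~ ., data=dat, model="lm",
               sgd.control=list(method="implicit"))
fit.lm <- lm(y ~ ., data=dat)
mean((coef(fit.sgd) - coef(fit.lm))^2)
```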
--------------------------------------------------------------------------------
/demo/bench-logistic-wine.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Compare out-of-sample log-likelihoods, using sgd() and glm() for
 3 | # logistic regression on the wine quality data set.
 4 | #
 5 | # Dimensions:
 6 | #   n=4286, d=12
 7 | 
 8 | library(sgd)
 9 | 
10 | # Generate data.
11 | data("winequality")
12 | dat <- winequality
13 | dat$quality <- as.numeric(dat$quality > 5)  # transform to binary
14 | 
15 | test.set <- sample(1:nrow(dat), size=nrow(dat)/8, replace=F)
16 | dat.test <- dat[test.set, ]
17 | dat <- dat[-test.set, ]
18 | 
19 | # Fit glm() and sgd().
20 | fit.glm <- glm(quality~., family=binomial(link="logit"), data=dat)
21 | fit.sgd <- sgd(quality ~ ., data=dat,
22 |                model="glm", model.control=list(family=binomial(link="logit")),
23 |                sgd.control=list(reltol=1e-5, npasses=200, lr.control=c(scale=1, gamma=1, alpha=30, c=1)))
24 | 
25 | # Compare log likelihoods.
26 | log.lik <- function(theta.est) {
27 | 
28 |   y <- dat.test$quality
29 |   X <- as.matrix(dat.test[, seq(1, ncol(dat)-1)])
30 |   X <- cbind(1, X)
31 | 
32 |   eta <- plogis(X %*% theta.est)
33 |   print(cor(y, eta))
34 |   sum(y * log(eta) + (1-y) * log(1-eta))
35 | }
36 | 
37 | theta.glm <- matrix(as.numeric(fit.glm$coefficients), ncol=1)
38 | theta.sgd <- matrix(as.numeric(fit.sgd$coefficients), ncol=1)
39 | log.lik.glm <- log.lik(fit.glm$coefficients)
40 | log.lik.sgd <- log.lik(theta.sgd)
41 | 
42 | print(sprintf("Out-of-sample Log-likelihood for glm()=%.3f sgd=%.3f", log.lik.glm, log.lik.sgd))
43 | 

--------------------------------------------------------------------------------
/demo/cox-regression.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | # Demo usage of sgd for Cox regression on simulated survival data.
  3 | #
  4 | # Data generating process:
  5 | #   Y ~ Exponential(rate=exp(X %*% beta)) (see generate.data)
  6 | #   censoring ~ logistic in Y (see generate.data)
  7 | #   X ~ correlated Normal (see genx)
  8 | #
  9 | # Dimensions:
 10 | #   N=1e3 observations
 11 | #   d=5 parameters
 12 | 
 13 | # Function taken from Friedman et al.
 14 | genx = function(n,p,rho){
 15 |   # generate x's multivariate normal with equal corr rho
 16 |   # Xi = b Z + Wi, and Z, Wi are independent normal.
 17 |   # Then Var(Xi) = b^2 + 1
 18 |   # Cov(Xi, Xj) = b^2 and so cor(Xi, Xj) = b^2 / (1+b^2) = rho
 19 |   z=rnorm(n)
 20 |   if(abs(rho)<1){
 21 |     beta=sqrt(rho/(1-rho))
 22 |     x0=matrix(rnorm(n*p),ncol=p)
 23 |     A = matrix(z, nrow=n, ncol=p, byrow=F)
 24 |     x= beta * A + x0
 25 |   }
 26 |   if(abs(rho)==1){ x=matrix(z,nrow=n,ncol=p,byrow=F)}
 27 | 
 28 |   return(x)
 29 | }
 30 | 
 31 | generate.data <- function(n, p, rho=0.2) {
 32 |   ## Generate data
 33 |   # #
 34 |   # # Returns:
 35 |   # #   LIST(X, Y, censor, true.beta)
 36 |   # #     X = Nxp matrix of covariates.
 37 |   # #     Y = Nx1 vector of observed times.
 38 |   # #     censor = Nx1 vector {0, 1} of censor indicators.
 39 |   # #     true.beta = p-vector of true model parameters.
 40 |   # #     M = (Y, censor) as matrix
 41 | 
 42 |   X = genx(n, p, rho=rho)
 43 |   # rates.
 44 |   # rates = runif(n, min=1e-2, max=10)
 45 |   # Y = rexp(n,
 46 |   # beta = solve(t(X) %*% X) %*% t(X) %*% log(rates)
 47 |   beta = 10 *((-1)^(1:p))*exp(-2*((1:p)-1)/20)
 48 |   # beta = 10 * seq(1, p)**(-0.5)
 49 |   # warning("Large coefficients")
 50 |   pred = exp(X %*% beta)
 51 |   Y = rexp(n, rate =pred)
 52 | 
 53 |   q3 = quantile(Y, prob=c(0.8))  # Q3 of Y
 54 |   epsilon = 0.001  # probability of censoring smallest Y
 55 |   k = log(1/epsilon - 1) / (q3 - min(Y))
 56 |   censor.prob = (1 + exp(-k * (Y-q3)))**(-1)
 57 | 
 58 |   C = rbinom(n, size=1, prob= censor.prob)
 59 | 
 60 |   ## Order the data
 61 |   order.i = order(Y)
 62 |   X = X[order.i, ]
 63 |   Y = Y[order.i]
 64 |   C = C[order.i]
 65 | 
 66 |   M = matrix(0, nrow=n, ncol=2)
 67 |   colnames(M) <- c("time", "status")
 68 |   M[, 1] <- Y
 69 |   M[, 2] <- 1-C
 70 |   return(list(X=X, Y=Y, censor=C, M=M, true.beta=beta))
 71 | }
 72 | 
 73 | library(sgd)
 74 | 
 75 | # Dimensions
 76 | N <- 1e3
 77 | d <- 5
 78 | 
 79 | # Generate data.
 80 | set.seed(42)
 81 | data <- generate.data(N, d)
 82 | X <- data$X
 83 | y <- 1 - data$censor  # y=1 if fail, 0 if censor
 84 | dat <- data.frame(y=y, x=X)
 85 | 
 86 | t <- data$Y  # times of observations
 87 | theta <- data$true.beta  # true coefficients
 88 | 
 89 | # Explicit
 90 | sgd.theta <- sgd(y ~ . - 1, data=dat, model="cox",
 91 |                  sgd.control=list(method="sgd", lr="adagrad", npasses=5))
 92 | sgd.theta$coefficients
 93 | 
 94 | # Implicit
 95 | sgd.theta <- sgd(y ~ . - 1, data=dat, model="cox",
 96 |                  sgd.control=list(method="implicit",
 97 |                                   lr="adagrad",
 98 |                                   lr.control=c(0.1, NA),
 99 |                                   npasses=5))
100 | sgd.theta$coefficients
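A quick accuracy check on the implicit fit (an editor's sketch; `sgd.theta` and `theta` are defined in the script above):

```r
# Mean squared error of the fitted coefficients against the true beta (sketch).
mean((as.numeric(sgd.theta$coefficients) - theta)^2)
```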
89 | # Explicit
90 | sgd.theta <- sgd(y ~ . - 1, data=dat, model="cox",
91 | sgd.control=list(method="sgd", lr="adagrad", npasses=5))
92 | sgd.theta$coefficients
93 | 
94 | # Implicit
95 | sgd.theta <- sgd(y ~ . - 1, data=dat, model="cox",
96 | sgd.control=list(method="implicit",
97 | lr="adagrad",
98 | lr.control=c(0.1, NA),
99 | npasses=5))
100 | sgd.theta$coefficients
101 | 
--------------------------------------------------------------------------------
/demo/glm-logistic-regression.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | # Demo usage of sgd for logistic regression on simulated binomial data.
3 | #
4 | # Data generating process:
5 | #   Y ~ Binomial(1, p), where
6 | #   p = 1 / (1 + exp(-(X %*% theta + epsilon)))
7 | #   X ~ Normal(0, 1)
8 | #   theta = (5,...,5)
9 | #   epsilon ~ Normal(0,1)
10 | #
11 | # Dimensions:
12 | #   N=1e5 observations
13 | #   d=5 parameters
14 | 
15 | library(sgd)
16 | 
17 | # Dimensions
18 | N <- 1e5
19 | d <- 5
20 | 
21 | # Generate data.
22 | set.seed(42)
23 | X <- matrix(rnorm(N*d), ncol=d)
24 | theta <- rep(5, d+1)
25 | eps <- rnorm(N)
26 | p <- 1/(1+exp(-(cbind(1, X) %*% theta + eps)))
27 | y <- rbinom(N, 1, p)
28 | dat <- data.frame(y=y, x=X)
29 | 
30 | sgd.theta <- sgd(y ~ ., data=dat, model="glm",
31 | model.control=list(family="binomial"),
32 | sgd.control=list(lr.control=c(100, NA, NA, NA), npasses=1,
33 | pass=T))
34 | 
35 | plot(sgd.theta, cbind(1, X), y, label="ai-sgd", type="clf")
--------------------------------------------------------------------------------
/demo/glm-poisson-regression.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | # Demo usage of sgd for fitting a generalized linear model with Poisson
3 | # family (log link) on simulated data.
4 | #
5 | # Data generating process:
6 | #   X is either (0, 0), (1, 0), (0, 1) with 0.6, 0.2, 0.2 probability
7 | #   theta = (log(2), log(4))
8 | #   y = Poisson(exp(X %*% theta))
9 | #
10 | # Dimensions:
11 | #   N=1e5 observations
12 | #   d=2 parameters
13 | 
14 | library(sgd)
15 | 
16 | # Dimensions
17 | N <- 1e5
18 | 
19 | # Generate data.
20 | set.seed(42)
21 | Q <- 0.2
22 | code <- sample(0:2, size=N, replace=T, prob=c((1-2*Q), Q, Q))
23 | X <- matrix(0, nrow=N, ncol=2)
24 | X[,1] <- as.numeric(code==1)
25 | X[,2] <- as.numeric(code==2)
26 | theta <- matrix(c(log(2), log(4)), ncol=1)
27 | y <- matrix(rpois(N, exp(X %*% theta)), ncol=1)
28 | dat <- data.frame(y=y, x=X)
29 | 
30 | sgd.theta <- sgd(y ~ ., data=dat, model="glm",
31 | model.control=list(family=poisson()))
32 | # 3 parameters including intercept
33 | mean((sgd.theta$coefficients - c(0, theta))^2) # MSE
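# Since theta = (log(2), log(4)), exponentiating the slope estimates should
# roughly recover the true rate ratios of 2 and 4:
# exp(sgd.theta$coefficients[-1])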
--------------------------------------------------------------------------------
/demo/linear-regression.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | # Demo usage of sgd for linear regression on simulated normal data.
3 | #
4 | # Data generating process:
5 | #   Y = X %*% theta + epsilon, where
6 | #   X ~ Normal(0, 1)
7 | #   theta = (5,...,5)
8 | #   epsilon ~ Normal(0,1)
9 | #
10 | # Dimensions:
11 | #   N=1e5 observations
12 | #   d=1e2 parameters
13 | 
14 | library(sgd)
15 | 
16 | # Dimensions
17 | N <- 1e5
18 | d <- 1e2
19 | 
20 | # Generate data.
21 | set.seed(42)
22 | X <- matrix(rnorm(N*d), ncol=d)
23 | theta <- rep(5, d+1)
24 | eps <- rnorm(N)
25 | y <- cbind(1, X) %*% theta + eps
26 | dat <- data.frame(y=y, x=X)
27 | 
28 | sgd.theta <- sgd(y ~ ., data=dat, model="lm")
29 | 
30 | plot(sgd.theta, theta, type="mse-param")
--------------------------------------------------------------------------------
/demo/m-estimation.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | # Demo usage of sgd for M-estimation using the Huber loss.
3 | # Example taken from Donoho and Montanari (2013, Section 2.4).
4 | #
5 | # Data generating process:
6 | #   Y = X %*% theta + epsilon, where
7 | #   X ~ Normal(0, 1/N)
8 | #   theta ~ Unif([0,1]^d) with fixed 2-norm 6*sqrt(d)
9 | #   epsilon ~ ContaminatedNormal(0.05, 10) = 0.95z + 0.05h_{10},
10 | #     where z ~ Normal(0,1) and h_x = unit atom at x
11 | #
12 | # Dimensions:
13 | #   N=1000 observations
14 | #   d=200 parameters
15 | 
16 | library(sgd)
17 | library(ggplot2)
18 | 
19 | generate.data <- function(N, d) {
20 | l2 <- function(x) sqrt(sum(x**2))
21 | X <- matrix(rnorm(N*d, mean=0, sd=1/sqrt(N)), nrow=N, ncol=d)
22 | theta <- runif(d)
23 | theta <- theta * 6 * sqrt(d) / l2(theta)
24 | 
25 | # noise
26 | ind <- rbinom(N, size=1, prob=.95)
27 | epsilon <- ind * rnorm(N) + (1-ind) * rep(10, N)
28 | 
29 | Y <- X %*% theta + epsilon
30 | return(list(y=Y, X=X, theta=theta))
31 | }
32 | 
33 | # Dimensions
34 | N <- 1000
35 | d <- 200
36 | 
37 | # Generate data.
38 | set.seed(42)
39 | data <- generate.data(N, d)
40 | dat <- data.frame(y=data$y, x=data$X)
41 | 
42 | sgd.theta <- sgd(y ~ .-1, data=dat, model="m", sgd.control=list(method="sgd",
43 | lr.control=c(15, NA, NA, 1/2), npasses=10, pass=T))
44 | 
45 | plot(sgd.theta, data$theta, label="sgd", type="mse-param") +
46 | geom_hline(yintercept=1.5, color="green")
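# For reference, the Huber loss minimized by model="m" (default lambda = 3,
# see src/model/m-estimation/m_loss.h) is, written out in R:
# huber <- function(u, lambda = 3) {
#   ifelse(abs(u) <= lambda, u^2 / 2, lambda * abs(u) - lambda^2 / 2)
# }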
--------------------------------------------------------------------------------
/demo/normal-method-of-moments.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | # Demo usage of sgd for estimating parameters of a normal distribution using
3 | # the method of moments.
4 | #
5 | # Data generating process:
6 | #   X ~ Normal(4, 2)
7 | #
8 | # Dimensions:
9 | #   N=1e3 observations
10 | #   d=2 parameters
11 | 
12 | library(sgd)
13 | 
14 | # Dimensions
15 | N <- 1e3
16 | d <- 2
17 | theta <- c(4, 2)
18 | 
19 | # Generate data.
20 | set.seed(42)
21 | X <- matrix(rnorm(N, mean=theta[1], sd=theta[2]), ncol=1)
22 | 
23 | # Gradient of moment function (using 3 moments)
24 | gr <- function(theta, x) {
25 | return(as.matrix(c(
26 | mean(2*(theta[1] - x) +
27 | 2*(theta[2]^2 - (x - theta[1])^2) * 2*(-theta[1] +x) +
28 | 2*(x^3 - theta[1]*(theta[1]^2 + 3*theta[2]^2)) * (-3*theta[1]^2 -
29 | 3*theta[2]^2)),
30 | mean(0 +
31 | 2*(theta[2]^2 - (x - theta[1])^2) * 2*theta[2] +
32 | 2*(x^3 - theta[1]*(theta[1]^2 + 3*theta[2]^2)) * -6*theta[1]*theta[2])
33 | )))
34 | }
35 | sgd.theta <- sgd(X, y=matrix(NA, nrow=nrow(X)), model="gmm",
36 | model.control=list(gr=gr, nparams=2),
37 | sgd.control=list(method="sgd", npasses=100, lr="adagrad"))
38 | sgd.theta
--------------------------------------------------------------------------------
/man/coef.sgd.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/coef.sgd.R
3 | \name{coef.sgd}
4 | \alias{coef.sgd}
5 | \title{Extract Model Coefficients}
6 | \usage{
7 | \method{coef}{sgd}(object, ...)
8 | }
9 | \arguments{
10 | \item{object}{object of class \code{sgd}.}
11 | 
12 | \item{\dots}{some methods for this generic require additional
13 | arguments. None are used in this method.}
14 | }
15 | \value{
16 | Coefficients extracted from the model object \code{object}.
17 | }
18 | \description{
19 | Extract model coefficients from \code{sgd} objects. \code{coefficients}
20 | is an \emph{alias} for it.
21 | }
22 | 
--------------------------------------------------------------------------------
/man/fitted.sgd.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/fitted.sgd.R
3 | \name{fitted.sgd}
4 | \alias{fitted.sgd}
5 | \title{Extract Model Fitted Values}
6 | \usage{
7 | \method{fitted}{sgd}(object, ...)
8 | }
9 | \arguments{
10 | \item{object}{object of class \code{sgd}.}
11 | 
12 | \item{\dots}{some methods for this generic require additional
13 | arguments. None are used in this method.}
14 | }
15 | \value{
16 | Fitted values extracted from the object \code{object}.
17 | }
18 | \description{
19 | Extract fitted values from \code{sgd} objects.
20 | \code{fitted.values} is an \emph{alias} for it.
21 | }
22 | 
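These accessors compose in the usual S3 way; a minimal sketch, reusing the
dat object from the linear-regression demo above:

fit <- sgd(y ~ ., data=dat, model="lm")
coef(fit)             # coefficient vector
head(fitted(fit))     # fitted mean values
head(residuals(fit))  # response minus fitted values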
--------------------------------------------------------------------------------
/man/plot.sgd.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/plot.sgd.R
3 | \name{plot.sgd}
4 | \alias{plot.sgd}
5 | \alias{plot.list}
6 | \title{Plot objects of class \code{sgd}.}
7 | \usage{
8 | \method{plot}{sgd}(x, ..., type = "mse", xaxis = "iteration")
9 | 
10 | \method{plot}{list}(x, ..., type = "mse", xaxis = "iteration")
11 | }
12 | \arguments{
13 | \item{x}{object of class \code{sgd}.}
14 | 
15 | \item{\dots}{additional arguments used for each type of plot. See
16 | \sQuote{Details}.}
17 | 
18 | \item{type}{character specifying the type of plot: \code{"mse"},
19 | \code{"clf"}, \code{"mse-param"}. See \sQuote{Details}. Default is
20 | \code{"mse"}.}
21 | 
22 | \item{xaxis}{character specifying the x-axis of plot: \code{"iteration"}
23 | plots the y values over the log-iteration of the algorithm;
24 | \code{"runtime"} plots the y values over the time in seconds to reach them.
25 | Default is \code{"iteration"}.}
26 | }
27 | \description{
28 | Plot objects of class \code{sgd}.
29 | }
30 | \details{
31 | Types of plots available:
32 | \describe{
33 | \item{\code{mse}}{Mean squared error in predictions, which takes the
34 | following arguments:
35 | \describe{
36 | \item{\code{x_test}}{test set}
37 | \item{\code{y_test}}{test responses to compare predictions to}
38 | }}
39 | \item{\code{clf}}{Classification error in predictions, which takes the
40 | following arguments:
41 | \describe{
42 | \item{\code{x_test}}{test set}
43 | \item{\code{y_test}}{test responses to compare predictions to}
44 | }}
45 | \item{\code{mse-param}}{Mean squared error in parameters, which takes the
46 | following arguments:
47 | \describe{
48 | \item{\code{true_param}}{true vector of parameters to compare to}
49 | }}
50 | }
51 | }
52 | 
--------------------------------------------------------------------------------
/man/predict.sgd.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/predict.sgd.R
3 | \name{predict.sgd}
4 | \alias{predict.sgd}
5 | \alias{predict_all}
6 | \title{Model Predictions}
7 | \usage{
8 | \method{predict}{sgd}(object, newdata, type = "link", ...)
9 | 
10 | predict_all(object, newdata, ...)
11 | }
12 | \arguments{
13 | \item{object}{object of class \code{sgd}.}
14 | 
15 | \item{newdata}{design matrix to form predictions on}
16 | 
17 | \item{type}{the type of prediction required. The default "link" is
18 | on the scale of the linear predictors; the alternative '"response"'
19 | is on the scale of the response variable. Thus for a default
20 | binomial model the default predictions are of log-odds
21 | (probabilities on logit scale) and 'type = "response"' gives the
22 | predicted probabilities. The '"terms"' option returns a matrix
23 | giving the fitted values of each term in the model formula on the
24 | linear predictor scale.}
25 | 
26 | \item{\dots}{further arguments passed to or from other methods.}
27 | }
28 | \description{
29 | Form predictions using the estimated model parameters from stochastic
30 | gradient descent.
31 | }
32 | \details{
33 | A column of 1's must be included in \code{newdata} if the
34 | parameters include a bias (intercept) term.
35 | }
36 | 
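Because predict() operates on the raw design matrix, the intercept column is
supplied by hand; a minimal sketch continuing the fit above, with hypothetical
held-out covariates X_new:

X_new <- matrix(rnorm(10 * d), ncol=d)
predict(fit, cbind(1, X_new))                   # linear predictor scale
predict(fit, cbind(1, X_new), type="response")  # response scale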
--------------------------------------------------------------------------------
/man/print.sgd.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/print.sgd.R
3 | \name{print.sgd}
4 | \alias{print.sgd}
5 | \title{Print objects of class \code{sgd}.}
6 | \usage{
7 | \method{print}{sgd}(x, ...)
8 | }
9 | \arguments{
10 | \item{x}{object of class \code{sgd}.}
11 | 
12 | \item{\dots}{further arguments passed to or from other methods.}
13 | }
14 | \description{
15 | Print objects of class \code{sgd}.
16 | }
17 | 
--------------------------------------------------------------------------------
/man/residuals.sgd.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/residuals.sgd.R
3 | \name{residuals.sgd}
4 | \alias{residuals.sgd}
5 | \title{Extract Model Residuals}
6 | \usage{
7 | \method{residuals}{sgd}(object, ...)
8 | }
9 | \arguments{
10 | \item{object}{object of class \code{sgd}.}
11 | 
12 | \item{\dots}{some methods for this generic require additional
13 | arguments. None are used in this method.}
14 | }
15 | \value{
16 | Residuals extracted from the object \code{object}.
17 | }
18 | \description{
19 | Extract model residuals from \code{sgd} objects. \code{resid} is an
20 | \emph{alias} for it.
21 | }
22 | 
--------------------------------------------------------------------------------
/man/sgd.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/sgd.R
3 | \name{sgd}
4 | \alias{sgd}
5 | \alias{sgd.formula}
6 | \alias{sgd.matrix}
7 | \alias{sgd.big.matrix}
8 | \title{Stochastic gradient descent}
9 | \usage{
10 | sgd(x, ...)
11 | 
12 | \method{sgd}{formula}(formula, data, model, model.control = list(), sgd.control = list(...), ...)
13 | 
14 | \method{sgd}{matrix}(x, y, model, model.control = list(), sgd.control = list(...), ...)
15 | 
16 | \method{sgd}{big.matrix}(x, y, model, model.control = list(), sgd.control = list(...), ...)
17 | }
18 | \arguments{
19 | \item{x, y}{a design matrix and the respective vector of outcomes.}
20 | 
21 | \item{\dots}{arguments to be used to form the default \code{sgd.control}
22 | arguments if it is not supplied directly.}
23 | 
24 | \item{formula}{an object of class \code{"\link{formula}"} (or one that can be
25 | coerced to that class): a symbolic description of the model to be fitted.
26 | The details can be found in \code{"\link{glm}"}.}
27 | 
28 | \item{data}{an optional data frame, list or environment (or object coercible
29 | by \code{\link[base]{as.data.frame}} to a data frame) containing the
30 | variables in the model. If not found in data, the variables are taken from
31 | environment(formula), typically the environment from which glm is called.}
32 | 
33 | \item{model}{character specifying the model to be used: \code{"lm"} (linear
34 | model), \code{"glm"} (generalized linear model), \code{"cox"} (Cox
35 | proportional hazards model), \code{"gmm"} (generalized method of moments),
36 | \code{"m"} (M-estimation). See \sQuote{Details}.}
37 | 
38 | \item{model.control}{a list of parameters for controlling the model.
39 | \describe{
40 | \item{\code{family} (\code{"glm"})}{a description of the error distribution and
41 | link function to be used in the model. This can be a character string
42 | naming a family function, a family function or the result of a call to
43 | a family function. (See \code{\link[stats]{family}} for details of
44 | family functions.)}
45 | \item{\code{rank} (\code{"glm"})}{logical. Should the rank of the design matrix
46 | be checked?}
47 | \item{\code{fn} (\code{"gmm"})}{a function \eqn{g(\theta,x)} which returns a
48 | \eqn{k}-vector corresponding to the \eqn{k} moment conditions. It is a
49 | required argument if \code{gr} is not specified.}
50 | \item{\code{gr} (\code{"gmm"})}{a function to return the gradient. If
51 | unspecified, a finite-difference approximation will be used.}
52 | \item{\code{nparams} (\code{"gmm"})}{number of model parameters. This is
53 | automatically determined for other models.}
54 | \item{\code{type} (\code{"gmm"})}{character specifying the generalized method of
55 | moments procedure: \code{"twostep"} (Hansen, 1982), \code{"iterative"}
56 | (Hansen et al., 1996). Defaults to \code{"iterative"}.}
57 | \item{\code{wmatrix} (\code{"gmm"})}{weighting matrix to be used in the loss
58 | function. Defaults to the identity matrix.}
59 | \item{\code{loss} (\code{"m"})}{character specifying the loss function to be
60 | used in the estimating equation. Default is the Huber loss.}
61 | \item{\code{lambda1}}{L1 regularization parameter. Default is 0.}
62 | \item{\code{lambda2}}{L2 regularization parameter. Default is 0.}
63 | }}
64 | 
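As a concrete example of the two penalty parameters just described, an
elastic-net-style fit combines both; a sketch, reusing the dat defined in the
Examples below (the L1 path is exercised in tests/testthat/test-lasso.R):

fit <- sgd(y ~ ., data=dat, model="lm",
           model.control=list(lambda1=0.1, lambda2=0.1))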
65 | \item{sgd.control}{an optional list of parameters for controlling the estimation.
66 | \describe{
67 | \item{\code{method}}{character specifying the method to be used: \code{"sgd"},
68 | \code{"implicit"}, \code{"asgd"}, \code{"ai-sgd"}, \code{"momentum"},
69 | \code{"nesterov"}. Default is \code{"ai-sgd"}. See \sQuote{Details}.}
70 | \item{\code{lr}}{character specifying the learning rate to be used:
71 | \code{"one-dim"}, \code{"one-dim-eigen"}, \code{"d-dim"},
72 | \code{"adagrad"}, \code{"rmsprop"}. Default is \code{"one-dim"}.
73 | See \sQuote{Details}.}
74 | \item{\code{lr.control}}{vector of scalar hyperparameters one can
75 | set dependent on the learning rate. For hyperparameters aimed
76 | to be left as default, specify \code{NA} in the corresponding
77 | entries. See \sQuote{Details}.}
78 | \item{\code{start}}{starting values for the parameter estimates. Default is
79 | random initialization around zero.}
80 | \item{\code{size}}{number of SGD estimates to store for diagnostic purposes
81 | (distributed log-uniformly over total number of iterations)}
82 | \item{\code{reltol}}{relative convergence tolerance. The algorithm stops
83 | when the relative mean squared difference between successive parameter
84 | estimates falls below this amount. Default is \code{1e-05}.}
85 | \item{\code{npasses}}{the maximum number of passes over the data. Default
86 | is 3.}
87 | \item{\code{pass}}{logical. Should \code{reltol} be ignored so that the
88 | algorithm runs for all of \code{npasses}?}
89 | \item{\code{shuffle}}{logical. Should the data set be shuffled anew for
90 | each pass?}
91 | \item{\code{verbose}}{logical. Should the algorithm print progress?}
92 | }}
93 | }
94 | \value{
95 | An object of class \code{"sgd"}, which is a list containing the following
96 | components:
97 | \item{model}{name of the model}
98 | \item{coefficients}{a named vector of coefficients}
99 | \item{converged}{logical. Was the algorithm judged to have converged?}
100 | \item{estimates}{estimates from algorithm stored at each iteration
101 | specified in \code{pos}}
102 | \item{fitted.values}{the fitted mean values}
103 | \item{pos}{vector of indices specifying the iteration number each estimate
104 | was stored for}
105 | \item{residuals}{the residuals, that is response minus fitted values}
106 | \item{times}{vector of times in seconds it took to complete the number of
107 | iterations specified in \code{pos}}
108 | \item{model.out}{a list of model-specific output attributes}
109 | }
110 | \description{
111 | Run stochastic gradient descent in order to optimize the induced loss
112 | function given a model and data.
113 | }
114 | \details{
115 | Models:
116 | The Cox model assumes the survival data are ordered by observed time when
117 | passed in, so that the risk set of an observation i consists of all data
118 | points from i onward.
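In practice this means sorting by observed time before fitting; a minimal
sketch, assuming a time vector, a 0/1 status vector, and a covariate matrix X:

ord <- order(time)
dat <- data.frame(y=status[ord], x=X[ord, ])
fit <- sgd(y ~ . - 1, data=dat, model="cox")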
119 | 
120 | Methods:
121 | \describe{
122 | \item{\code{sgd}}{stochastic gradient descent (Robbins and Monro, 1951)}
123 | \item{\code{implicit}}{implicit stochastic gradient descent (Toulis et al.,
124 | 2014)}
125 | \item{\code{asgd}}{stochastic gradient with averaging (Polyak and Juditsky,
126 | 1992)}
127 | \item{\code{ai-sgd}}{implicit stochastic gradient with averaging (Toulis et
128 | al., 2015)}
129 | \item{\code{momentum}}{"classical" momentum (Polyak, 1964)}
130 | \item{\code{nesterov}}{Nesterov's accelerated gradient (Nesterov, 1983)}
131 | }
132 | 
133 | Learning rates and hyperparameters:
134 | \describe{
135 | \item{\code{one-dim}}{scalar value prescribed in Xu (2011) as
136 | \deqn{a_n = scale * gamma * (1 + alpha*gamma*n)^{-c}}
137 | where the defaults are
138 | \code{lr.control = (scale=1, gamma=1, alpha=1, c)}
139 | where \code{c} is \code{1} if implemented without averaging,
140 | \code{2/3} if with averaging}
141 | \item{\code{one-dim-eigen}}{diagonal matrix
142 | \code{lr.control = NULL}}
143 | \item{\code{d-dim}}{diagonal matrix
144 | \code{lr.control = (epsilon=1e-6)}}
145 | \item{\code{adagrad}}{diagonal matrix prescribed in Duchi et al. (2011) as
146 | \code{lr.control = (eta=1, epsilon=1e-6)}}
147 | \item{\code{rmsprop}}{diagonal matrix prescribed in Tieleman and Hinton
148 | (2012) as
149 | \code{lr.control = (eta=1, gamma=0.9, epsilon=1e-6)}}
150 | }
151 | }
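Written out in R, the one-dim schedule and its defaults are simply (a sketch
matching src/learn-rate/onedim_learn_rate.h):

a_n <- function(n, scale=1, gamma=1, alpha=1, c=1) {
  scale * gamma * (1 + alpha * gamma * n)^(-c)
}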
152 | \examples{
153 | ## Linear regression
154 | set.seed(42)
155 | N <- 1e4
156 | d <- 5
157 | X <- matrix(rnorm(N*d), ncol=d)
158 | theta <- rep(5, d+1)
159 | eps <- rnorm(N)
160 | y <- cbind(1, X) \%*\% theta + eps
161 | dat <- data.frame(y=y, x=X)
162 | sgd.theta <- sgd(y ~ ., data=dat, model="lm")
163 | sprintf("Mean squared error: \%0.3f", mean((theta - as.numeric(sgd.theta$coefficients))^2))
164 | }
165 | \references{
166 | John Duchi, Elad Hazan, and Yoram Singer. Adaptive subgradient methods for
167 | online learning and stochastic optimization. \emph{Journal of Machine
168 | Learning Research}, 12:2121-2159, 2011.
169 | 
170 | Yurii Nesterov. A method for solving a convex programming problem with
171 | convergence rate \eqn{O(1/k^2)}. \emph{Soviet Mathematics Doklady},
172 | 27(2):372-376, 1983.
173 | 
174 | Boris T. Polyak. Some methods of speeding up the convergence of iteration
175 | methods. \emph{USSR Computational Mathematics and Mathematical Physics},
176 | 4(5):1-17, 1964.
177 | 
178 | Boris T. Polyak and Anatoli B. Juditsky. Acceleration of stochastic
179 | approximation by averaging. \emph{SIAM Journal on Control and Optimization},
180 | 30(4):838-855, 1992.
181 | 
182 | Herbert Robbins and Sutton Monro. A stochastic approximation method.
183 | \emph{The Annals of Mathematical Statistics}, pp. 400-407, 1951.
184 | 
185 | Panos Toulis, Jason Rennie, and Edoardo M. Airoldi, "Statistical analysis of
186 | stochastic gradient methods for generalized linear models", In
187 | \emph{Proceedings of the 31st International Conference on Machine Learning},
188 | 2014.
189 | 
190 | Panos Toulis, Dustin Tran, and Edoardo M. Airoldi, "Stability and optimality
191 | in stochastic gradient descent", arXiv preprint arXiv:1505.02417, 2015.
192 | 
193 | Wei Xu. Towards optimal one pass large scale learning with averaged
194 | stochastic gradient descent. arXiv preprint arXiv:1107.2490, 2011.
195 | }
196 | \author{
197 | Dustin Tran, Tian Lan, Panos Toulis, Ye Kuang, Edoardo Airoldi
198 | }
199 | 
--------------------------------------------------------------------------------
/man/winequality.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data-winequality.R
3 | \docType{data}
4 | \name{winequality}
5 | \alias{winequality}
6 | \title{Wine quality data of white wine samples from Portugal}
7 | \format{
8 | A data frame with 4898 rows and 12 variables
9 | \itemize{
10 | \item fixed acidity.
11 | \item volatile acidity.
12 | \item citric acid.
13 | \item residual sugar.
14 | \item chlorides.
15 | \item free sulfur dioxide.
16 | \item total sulfur dioxide.
17 | \item density.
18 | \item pH.
19 | \item sulphates.
20 | \item alcohol.
21 | \item quality (score between 0 and 10).
22 | }
23 | }
24 | \source{
25 | \url{https://archive.ics.uci.edu/ml/datasets/Wine+Quality}
26 | }
27 | \usage{
28 | winequality
29 | }
30 | \description{
31 | This dataset is a collection of white "Vinho Verde" wine
32 | samples from the north of Portugal. Due to privacy and logistic
33 | issues, only physicochemical (inputs) and sensory (the output)
34 | variables are available (e.g. there is no data about grape types,
35 | wine brand, wine selling price, etc.).
36 | }
37 | \keyword{datasets}
38 | 
--------------------------------------------------------------------------------
/src/Makevars:
--------------------------------------------------------------------------------
1 | PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
--------------------------------------------------------------------------------
/src/Makevars.win:
--------------------------------------------------------------------------------
1 | PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
--------------------------------------------------------------------------------
/src/RcppExports.cpp:
--------------------------------------------------------------------------------
1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand
2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
3 | 
4 | #include <RcppArmadillo.h>
5 | #include <Rcpp.h>
6 | 
7 | using namespace Rcpp;
8 | 
9 | #ifdef RCPP_USE_GLOBAL_ROSTREAM
10 | Rcpp::Rostream<true>& Rcpp::Rcout = Rcpp::Rcpp_cout_get();
11 | Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
12 | #endif
13 | 
14 | // run
15 | Rcpp::List run(SEXP dataset, SEXP model_control, SEXP sgd_control);
16 | RcppExport SEXP _sgd_run(SEXP datasetSEXP, SEXP model_controlSEXP, SEXP sgd_controlSEXP) {
17 | BEGIN_RCPP
18 | Rcpp::RObject rcpp_result_gen;
19 | Rcpp::RNGScope rcpp_rngScope_gen;
20 | Rcpp::traits::input_parameter< SEXP >::type dataset(datasetSEXP);
21 | Rcpp::traits::input_parameter< SEXP >::type model_control(model_controlSEXP);
22 | Rcpp::traits::input_parameter< SEXP >::type sgd_control(sgd_controlSEXP);
23 | rcpp_result_gen = Rcpp::wrap(run(dataset, model_control, sgd_control));
24 | return rcpp_result_gen;
25 | END_RCPP
26 | }
27 | 
28 | static const R_CallMethodDef CallEntries[] = {
29 | {"_sgd_run", (DL_FUNC) &_sgd_run, 3},
30 | {NULL, NULL, 0}
31 | };
32 | 
33 | RcppExport void R_init_sgd(DllInfo *dll) {
34 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
35 | R_useDynamicSymbols(dll, FALSE);
36 | }
37 | 
--------------------------------------------------------------------------------
/src/basedef.h:
-------------------------------------------------------------------------------- 1 | #ifndef BASEDEF_H 2 | #define BASEDEF_H 3 | 4 | #define BOOST_DISABLE_ASSERTS true 5 | 6 | #include "RcppArmadillo.h" 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | using namespace arma; 19 | 20 | #endif 21 | -------------------------------------------------------------------------------- /src/data/data_point.h: -------------------------------------------------------------------------------- 1 | #ifndef DATA_DATA_POINT_H 2 | #define DATA_DATA_POINT_H 3 | 4 | #include "../basedef.h" 5 | 6 | struct data_point { 7 | /** 8 | * Collection of an individual observation's covariates and response. 9 | * 10 | * @param x covariates for a single sample 11 | * @param y response value for a single sample 12 | * @param idx index of that data point into the data set 13 | */ 14 | data_point(const mat& x, double y, unsigned idx) : x(x), y(y), idx(idx) {} 15 | 16 | mat x; 17 | double y; 18 | unsigned idx; 19 | }; 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /src/data/data_set.h: -------------------------------------------------------------------------------- 1 | #ifndef DATA_DATA_SET_H 2 | #define DATA_DATA_SET_H 3 | 4 | #include "../basedef.h" 5 | #include "data_point.h" 6 | 7 | // wrapper around R's RNG such that we get a uniform distribution over 8 | // [0,n) as required by the STL algorithm 9 | inline int randWrapper(const int n) { return floor(unif_rand()*n); } 10 | 11 | class data_set { 12 | /** 13 | * Collection of all data points. 14 | * 15 | * @param xpMat pointer to bigmat if using bigmatrix 16 | * @param Xx design matrix if not using bigmatrix 17 | * @param Yy response values 18 | * @param n_passes number of passes for data 19 | * @param big whether using bigmatrix or not 20 | * @param shuffle whether to shuffle data set or not 21 | */ 22 | public: 23 | data_set(const SEXP& xpMat, const mat& Xx, const mat& Yy, unsigned n_passes, 24 | bool big, bool shuffle) : Y(Yy), big(big), xpMat_(xpMat), shuffle_(shuffle) { 25 | if (!big) { 26 | X = Xx; 27 | n_samples = X.n_rows; 28 | n_features = X.n_cols; 29 | } else { 30 | n_samples = xpMat_->nrow(); 31 | n_features = xpMat_->ncol(); 32 | } 33 | if (shuffle_) { 34 | idxvec_ = std::vector(n_samples*n_passes); 35 | for (unsigned i = 0; i < n_passes; ++i) { 36 | for (unsigned j = 0; j < n_samples; ++j) { 37 | idxvec_[i * n_samples + j] = j; 38 | } 39 | // std::random_shuffle(idxvec_.begin() + i * n_samples, 40 | // idxvec_.begin() + (i + 1) * n_samples, 41 | // randWrapper); 42 | std::random_device rd; 43 | std::mt19937 gen(rd()); 44 | std::shuffle(idxvec_.begin() + i * n_samples, 45 | idxvec_.begin() + (i + 1) * n_samples, 46 | gen); 47 | } 48 | } 49 | } 50 | 51 | // Index to the @t th data point 52 | data_point get_data_point(unsigned t) const { 53 | t = idxmap_(t - 1); 54 | mat xt; 55 | if (!big) { 56 | xt = mat(X.row(t)); 57 | } else { 58 | MatrixAccessor matacess(*xpMat_); 59 | xt = mat(1, n_features); 60 | for (unsigned i=0; i < n_features; ++i) { 61 | xt(0, i) = matacess[i][t]; 62 | } 63 | } 64 | double yt = Y(t); 65 | return data_point(xt, yt, t); 66 | } 67 | 68 | mat X; 69 | mat Y; 70 | bool big; 71 | unsigned n_samples; 72 | unsigned n_features; 73 | 74 | private: 75 | // index to data point for each iteration 76 | unsigned idxmap_(unsigned t) const { 77 | if (shuffle_) { 78 | return(idxvec_[t]); 79 | } else { 80 
| return(t % n_samples); 81 | } 82 | } 83 | 84 | Rcpp::XPtr xpMat_; 85 | std::vector idxvec_; 86 | bool shuffle_; 87 | }; 88 | 89 | #endif 90 | -------------------------------------------------------------------------------- /src/learn-rate/base_learn_rate.h: -------------------------------------------------------------------------------- 1 | #ifndef LEARN_RATE_BASE_LEARN_RATE_H 2 | #define LEARN_RATE_BASE_LEARN_RATE_H 3 | 4 | #include "../basedef.h" 5 | #include "learn_rate_value.h" 6 | 7 | class base_learn_rate { 8 | /** 9 | * Base class for learning rates 10 | */ 11 | public: 12 | base_learn_rate() {} 13 | 14 | virtual const learn_rate_value& operator()(unsigned t, const mat& grad_t) = 0; 15 | }; 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /src/learn-rate/ddim_learn_rate.h: -------------------------------------------------------------------------------- 1 | #ifndef LEARN_RATE_DDIM_LEARN_RATE_H 2 | #define LEARN_RATE_DDIM_LEARN_RATE_H 3 | 4 | #include "../basedef.h" 5 | #include "base_learn_rate.h" 6 | #include "learn_rate_value.h" 7 | 8 | class ddim_learn_rate : public base_learn_rate { 9 | /** 10 | * d-dimensional learning rate, which includes as special cases popular 11 | * learning rates: 12 | * adagrad: a=1, b=1, c=1/2, eta=1, eps=1e-6 13 | * d-dim: a=0, b=1, c=1, eta=1, eps=1e-6 14 | * rmsprop: a=gamma, b=1-gamma, c=1/2, eta=1, eps=1e-6 15 | * 16 | * @param d dimension of learning rate 17 | * @param eta scale factor in numerator 18 | * @param a factor to weigh old gradient information 19 | * @param b factor to weigh new gradient information 20 | * @param c power to exponentiate by 21 | * @param eps value to prevent division by zero 22 | */ 23 | public: 24 | ddim_learn_rate(unsigned d, double eta, double a, double b, double c, 25 | double eps) : 26 | d_(d), Idiag_(ones(d)), eta_(eta), a_(a), b_(b), c_(c), eps_(eps), 27 | v_(1, d) {} 28 | 29 | virtual const learn_rate_value& operator()(unsigned t, const mat& grad_t) { 30 | for (unsigned i = 0; i < d_; ++i) { 31 | Idiag_.at(i) = a_ * Idiag_.at(i) + b_ * pow(grad_t.at(i, 0), 2); 32 | } 33 | 34 | for (unsigned i = 0; i < d_; ++i) { 35 | if (std::abs(Idiag_.at(i)) > 1e-8) { 36 | v_.at(i) = eta_ / pow(Idiag_.at(i) + eps_, c_); 37 | } 38 | else { 39 | v_.at(i) = Idiag_.at(i); 40 | } 41 | } 42 | return v_; 43 | } 44 | 45 | private: 46 | unsigned d_; 47 | vec Idiag_; 48 | double eta_; 49 | double a_; 50 | double b_; 51 | double c_; 52 | double eps_; 53 | learn_rate_value v_; 54 | }; 55 | 56 | #endif 57 | -------------------------------------------------------------------------------- /src/learn-rate/learn_rate_value.h: -------------------------------------------------------------------------------- 1 | #ifndef LEARN_RATE_LEARN_RATE_VALUE_H 2 | #define LEARN_RATE_LEARN_RATE_VALUE_H 3 | 4 | #include "../basedef.h" 5 | 6 | class learn_rate_value { 7 | /** 8 | * Object to return for all learning rate classes 9 | * 10 | * @param t type of the value; 0 is scalar, 1 is vector, 2 is matrix 11 | * @param d dimension of parameters 12 | */ 13 | public: 14 | learn_rate_value(unsigned type, unsigned d) : type_(type) { 15 | if (type_ == 0) { 16 | lr_scalar_ = 1; 17 | } else if (type_ == 1) { 18 | lr_vector_ = ones(d); 19 | } else { 20 | lr_matrix_ = eye(d, d); 21 | } 22 | } 23 | 24 | double& at(unsigned i) { 25 | if (type_ == 1) { 26 | return lr_vector_.at(i); 27 | } else if (type_ == 2) { 28 | return lr_matrix_.at(i); 29 | } else { 30 | Rcpp::Rcout << 31 | "Indexing vector/matrix entry when 
learning rate type is neither" << 32 | std::endl; 33 | return lr_scalar_; 34 | } 35 | } 36 | 37 | // const double& at(unsigned i) const { 38 | // return at(i); 39 | // } 40 | 41 | double& at(unsigned i, unsigned j) { 42 | if (type_ == 2) { 43 | return lr_matrix_.at(i, j); 44 | } else { 45 | Rcpp::Rcout << 46 | "Indexing matrix entry when learning rate type is not matrix" << 47 | std::endl; 48 | return lr_scalar_; 49 | } 50 | } 51 | 52 | // const double& at(unsigned i, unsigned j) const { 53 | // return at(i, j); 54 | // } 55 | 56 | // Take average for usage in implicit SGD 57 | double mean() const { 58 | // double average = 0.0; 59 | if (type_ == 0) { 60 | return lr_scalar_; 61 | } else if (type_ == 1) { 62 | return arma::mean(lr_vector_); 63 | } else { 64 | return arma::mean(arma::mean(lr_matrix_)); 65 | } 66 | } 67 | 68 | learn_rate_value operator=(double scalar) { 69 | if (type_ == 0) { 70 | lr_scalar_ = scalar; 71 | } else { 72 | Rcpp::Rcout << 73 | "Setting learning rate value to scalar when its type is not" << 74 | std::endl; 75 | } 76 | return *this; 77 | } 78 | 79 | learn_rate_value operator=(const vec& vector) { 80 | if (type_ == 1) { 81 | lr_vector_ = vector; 82 | } else { 83 | Rcpp::Rcout << 84 | "Setting learning rate value to vector when its type is not" << 85 | std::endl; 86 | } 87 | return *this; 88 | } 89 | 90 | learn_rate_value operator=(const mat& matrix) { 91 | if (type_ == 2) { 92 | lr_matrix_ = matrix; 93 | } else { 94 | Rcpp::Rcout << 95 | "Setting learning rate value to matrix when its type is not" << 96 | std::endl; 97 | } 98 | return *this; 99 | } 100 | 101 | mat operator*(const mat& matrix) { 102 | if (type_ == 0) { 103 | return lr_scalar_ * matrix; 104 | } else if (type_ == 1) { 105 | //int m = matrix.n_rows; 106 | ////int n = matrix.n_cols; 107 | //mat out = zeros(m, 1); 108 | //for (unsigned i = 0; i < m; ++i) { 109 | // //for (unsigned j = 0; j < n; ++j) { 110 | // //out.at(i) += lr_vector_.at(i) * matrix.at(i, 0); 111 | // //} 112 | // out.at(i, 0) = lr_vector_.at(i) * matrix.at(i, 0); 113 | //} 114 | //return out; 115 | //return diagmat(lr_vector_) * matrix; 116 | return mat(lr_vector_) % matrix; 117 | } else { 118 | return lr_matrix_ * matrix; 119 | } 120 | } 121 | 122 | bool operator<(const double thres) { 123 | if (type_ == 0) { 124 | return lr_scalar_ < thres; 125 | } else if (type_ == 1) { 126 | return all(lr_vector_ < thres); 127 | } else{ 128 | return all(diagvec(lr_matrix_) < thres); 129 | } 130 | } 131 | 132 | bool operator>(const double thres) { 133 | return !(*this < thres); 134 | } 135 | 136 | private: 137 | unsigned type_; 138 | double lr_scalar_; 139 | vec lr_vector_; 140 | mat lr_matrix_; 141 | }; 142 | 143 | #endif 144 | -------------------------------------------------------------------------------- /src/learn-rate/onedim_eigen_learn_rate.h: -------------------------------------------------------------------------------- 1 | #ifndef LEARN_RATE_ONEDIM_EIGEN_LEARN_RATE_H 2 | #define LEARN_RATE_ONEDIM_EIGEN_LEARN_RATE_H 3 | 4 | #include "../basedef.h" 5 | #include "base_learn_rate.h" 6 | #include "learn_rate_value.h" 7 | 8 | class onedim_eigen_learn_rate : public base_learn_rate { 9 | /** 10 | * One-dimensional learning rate to parameterize a diagonal matrix 11 | * 12 | * @param d dimension of learning rate 13 | */ 14 | public: 15 | onedim_eigen_learn_rate(unsigned d) : 16 | d_(d), v_(0, 1) {} 17 | 18 | virtual const learn_rate_value& operator()(unsigned t, const mat& grad_t) { 19 | double sum_eigen = 0; 20 | for (unsigned i = 0; i < d_; ++i) { 
21 | sum_eigen += pow(grad_t.at(i, 0), 2); 22 | } 23 | // based on the bound of min_eigen <= d / trace(Fisher_matrix) 24 | double min_eigen_upper = sum_eigen / d_; 25 | v_ = 1. / (min_eigen_upper * t); 26 | return v_; 27 | } 28 | 29 | private: 30 | unsigned d_; 31 | learn_rate_value v_; 32 | }; 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /src/learn-rate/onedim_learn_rate.h: -------------------------------------------------------------------------------- 1 | #ifndef LEARN_RATE_ONEDIM_LEARN_RATE_H 2 | #define LEARN_RATE_ONEDIM_LEARN_RATE_H 3 | 4 | #include "../basedef.h" 5 | #include "base_learn_rate.h" 6 | #include "learn_rate_value.h" 7 | 8 | class onedim_learn_rate : public base_learn_rate { 9 | /** 10 | * One-dimensional (scalar) learning rate, following Xu 11 | * 12 | * @param scale scale factor in numerator 13 | * @param gamma scale factor in both numerator and denominator 14 | * @param alpha scale factor in denominator 15 | * @param c power to exponentiate by 16 | */ 17 | public: 18 | onedim_learn_rate(double scale, double gamma, double alpha, double c) : 19 | scale_(scale), gamma_(gamma), alpha_(alpha), c_(c), v_(0, 1) {} 20 | 21 | virtual const learn_rate_value& operator()(unsigned t, const mat& grad_t) { 22 | v_ = scale_ * gamma_ * pow(1 + alpha_ * gamma_ * t, -c_); 23 | return v_; 24 | } 25 | 26 | private: 27 | double scale_; 28 | double gamma_; 29 | double alpha_; 30 | double c_; 31 | learn_rate_value v_; 32 | }; 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /src/model/base_model.h: -------------------------------------------------------------------------------- 1 | #ifndef MODEL_BASE_MODEL_H 2 | #define MODEL_BASE_MODEL_H 3 | 4 | #include "../basedef.h" 5 | #include "../data/data_point.h" 6 | 7 | class base_model { 8 | /** 9 | * Base class for models 10 | * 11 | * @param model attributes affiliated with model as R type 12 | */ 13 | public: 14 | base_model(Rcpp::List model) { 15 | name_ = Rcpp::as(model["name"]); 16 | lambda1_ = Rcpp::as(model["lambda1"]); 17 | lambda2_ = Rcpp::as(model["lambda2"]); 18 | } 19 | 20 | std::string name() const { 21 | return name_; 22 | } 23 | 24 | mat gradient(unsigned t, const mat& theta_old, const data_set& data) const; 25 | mat gradient_penalty(const mat& theta) const { 26 | return lambda1_*sign(theta) + lambda2_*theta; 27 | } 28 | 29 | // Functions for implicit update 30 | // Following the JSS paper, we assume C_n = identity, lambda = 1, and use ksi 31 | // rather than s_n, which is slightly less efficient. 
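// In consequence, each implicit update reduces to a one-dimensional search
// in ksi: scale_factor() below is the function whose fixed point is sought,
// and the two derivative routines support derivative-based refinement of
// that search.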
32 | // ell'(x^T theta + at x^T grad(penalty) + ksi ||x||^2)
33 | double scale_factor(double ksi, double at, const data_point& data_pt, const
34 | mat& theta_old, double normx) const;
35 | // d/d(ksi) ell'
36 | double scale_factor_first_deriv(double ksi, double at, const data_point&
37 | data_pt, const mat& theta_old, double normx) const;
38 | // d^2/d(ksi)^2 ell'
39 | double scale_factor_second_deriv(double ksi, double at, const data_point&
40 | data_pt, const mat& theta_old, double normx) const;
41 | 
42 | protected:
43 | std::string name_;
44 | double lambda1_;
45 | double lambda2_;
46 | };
47 | 
48 | #endif
49 | 
--------------------------------------------------------------------------------
/src/model/cox_model.h:
--------------------------------------------------------------------------------
1 | #ifndef MODEL_COX_MODEL_H
2 | #define MODEL_COX_MODEL_H
3 | 
4 | #include "../basedef.h"
5 | #include "../data/data_point.h"
6 | #include "base_model.h"
7 | 
8 | class cox_model : public base_model {
9 | /**
10 |  * Cox proportional hazards model
11 |  *
12 |  * @param model attributes affiliated with model as R type
13 |  */
14 | public:
15 | cox_model(Rcpp::List model) : base_model(model) {}
16 | 
17 | mat gradient(unsigned t, const mat& theta_old, const data_set& data)
18 | const {
19 | data_point data_pt = data.get_data_point(t);
20 | unsigned j = data_pt.idx;
21 | 
22 | // assuming data points fail in order, i.e., risk set R_i={i,i+1,...,n}
23 | vec xi = exp(data.X * theta_old);
24 | vec h = zeros(j);
25 | double sum_xi = 0;
26 | // walk from the smallest risk set (i = j-1) down to i = 0, growing the
27 | // cumulative sum of xi as the risk set expands
28 | for (int i = j-1; i >= 0; --i) {
29 | // h_i = d_i/sum(xi[i:n])
30 | if (i == j-1) {
31 | for (int k = i; k < data.n_samples; ++k) {
32 | sum_xi += xi(k);
33 | }
34 | } else {
35 | sum_xi += xi(i);
36 | }
37 | h(i) = data.Y(i)/sum_xi;
38 | }
39 | double r = data_pt.y - xi(j) * sum(h);
40 | return (r * data_pt.x).t();
41 | }
42 | 
43 | // TODO
44 | bool rank;
45 | };
46 | 
47 | #endif
48 | 
--------------------------------------------------------------------------------
/src/model/glm/glm_family.h:
--------------------------------------------------------------------------------
1 | #ifndef MODEL_GLM_FAMILY_H
2 | #define MODEL_GLM_FAMILY_H
3 | 
4 | #include "../../basedef.h"
5 | 
6 | class base_family;
7 | class gaussian_family;
8 | class poisson_family;
9 | class binomial_family;
10 | class gamma_family;
11 | 
12 | class base_family {
13 | /* Base class from which all exponential family classes inherit from */
14 | public:
15 | virtual double variance(double u) const = 0;
16 | virtual double deviance(const mat& y, const mat& mu, const mat& wt) const = 0;
17 | };
18 | 
19 | class gaussian_family : public base_family {
20 | public:
21 | virtual double variance(double u) const {
22 | return 1.;
23 | }
24 | 
25 | virtual double deviance(const mat& y, const mat& mu, const mat& wt) const {
26 | return sum(vec(wt % ((y-mu) % (y-mu))));
27 | }
28 | };
29 | 
30 | class poisson_family : public base_family {
31 | public:
32 | virtual double variance(double u) const {
33 | return u;
34 | }
35 | 
36 | virtual double deviance(const mat& y, const mat& mu, const mat& wt) const {
37 | vec r = vec(mu % wt);
38 | for (unsigned i = 0; i < r.n_elem; ++i) {
39 | if (y(i) > 0.) {
40 | r(i) = wt(i) * (y(i) * log(y(i)/mu(i)) - (y(i) - mu(i)));
41 | }
42 | }
43 | return sum(2. * r);
44 | }
45 | };
46 | 
47 | class binomial_family : public base_family {
48 | public:
49 | virtual double variance(double u) const {
50 | return u * (1.
- u); 51 | } 52 | 53 | // In R the dev.resids of Binomial family is not exposed. 54 | // Found one [here](http://pages.stat.wisc.edu/~st849-1/lectures/GLMDeviance.pdf) 55 | virtual double deviance(const mat& y, const mat& mu, const mat& wt) const { 56 | vec r(y.n_elem); 57 | for (unsigned i = 0; i < r.n_elem; ++i) { 58 | r(i) = 2. * wt(i) * (y_log_y(y(i), mu(i)) + y_log_y(1.-y(i), 1.-mu(i))); 59 | } 60 | return sum(r); 61 | } 62 | 63 | private: 64 | double y_log_y(double y, double mu) const { 65 | return (y) ? (y * log(y/mu)) : 0.; 66 | } 67 | }; 68 | 69 | class gamma_family : public base_family { 70 | public: 71 | virtual double variance(double u) const { 72 | return pow(u, 2); 73 | } 74 | 75 | virtual double deviance(const mat& y, const mat& mu, const mat& wt) const { 76 | vec r(y.n_elem); 77 | for (unsigned i = 0; i < r.n_elem; ++i) { 78 | r(i) = -2. * wt(i) * (log(y(i) ? y(i)/mu(i) : 1.) - (y(i)-mu(i)) / mu(i)); 79 | } 80 | return sum(r); 81 | } 82 | }; 83 | 84 | #endif 85 | -------------------------------------------------------------------------------- /src/model/glm/glm_transfer.h: -------------------------------------------------------------------------------- 1 | #ifndef MODEL_GLM_TRANSFER_H 2 | #define MODEL_GLM_TRANSFER_H 3 | 4 | #include "../../basedef.h" 5 | 6 | class base_transfer; 7 | class identity_transfer; 8 | class inverse_transfer; 9 | class exp_transfer; 10 | class logistic_transfer; 11 | 12 | class base_transfer { 13 | /* Base class from which all transfer function classes inherit from */ 14 | public: 15 | virtual double transfer(double u) const = 0; 16 | 17 | virtual mat transfer(const mat& u) const { 18 | mat result = mat(u); 19 | for (unsigned i = 0; i < result.n_rows; ++i) { 20 | result(i, 0) = transfer(u(i, 0)); 21 | } 22 | return result; 23 | } 24 | 25 | virtual double link(double u) const = 0; 26 | 27 | virtual double first_derivative(double u) const = 0; 28 | virtual double second_derivative(double u) const = 0; 29 | virtual bool valideta(double eta) const = 0; 30 | }; 31 | 32 | class identity_transfer : public base_transfer { 33 | public: 34 | virtual double transfer(double u) const { 35 | return u; 36 | } 37 | 38 | virtual double link(double u) const { 39 | return u; 40 | } 41 | 42 | virtual double first_derivative(double u) const { 43 | return 1.; 44 | } 45 | 46 | virtual double second_derivative(double u) const { 47 | return 0.; 48 | } 49 | 50 | virtual bool valideta(double eta) const { 51 | return true; 52 | } 53 | }; 54 | 55 | class inverse_transfer : public base_transfer { 56 | public: 57 | virtual double transfer(double u) const { 58 | if (valideta(u)) { 59 | return -1. / u; 60 | } 61 | return 0.; 62 | } 63 | 64 | virtual double link(double u) const { 65 | if (u) { 66 | return -1. / u; 67 | } 68 | return 0.; 69 | } 70 | 71 | virtual double first_derivative(double u) const { 72 | if (valideta(u)) { 73 | return 1. / pow(u, 2); 74 | } 75 | return 0.; 76 | } 77 | 78 | virtual double second_derivative(double u) const { 79 | if (valideta(u)) { 80 | return -2. / pow(u, 3); 81 | } 82 | return 0.; 83 | } 84 | 85 | virtual bool valideta(double eta) const { 86 | return eta != 0; 87 | } 88 | }; 89 | 90 | class exp_transfer : public base_transfer { 91 | public: 92 | virtual double transfer(double u) const { 93 | return exp(u); 94 | } 95 | 96 | virtual double link(double u) const { 97 | if (u > 0.) 
{
98 | return log(u);
99 | }
100 | return 0.;
101 | }
102 | 
103 | virtual double first_derivative(double u) const {
104 | return exp(u);
105 | }
106 | 
107 | virtual double second_derivative(double u) const {
108 | return exp(u);
109 | }
110 | 
111 | virtual bool valideta(double eta) const {
112 | return true;
113 | }
114 | };
115 | 
116 | class logistic_transfer : public base_transfer {
117 | public:
118 | virtual double transfer(double u) const {
119 | return sigmoid(u);
120 | }
121 | 
122 | virtual double link(double u) const {
123 | if (u > 0. && u < 1.) {
124 | return log(u / (1. - u));
125 | }
126 | return 0.;
127 | }
128 | 
129 | virtual double first_derivative(double u) const {
130 | double sig = sigmoid(u);
131 | return sig * (1. - sig);
132 | }
133 | 
134 | virtual double second_derivative(double u) const {
135 | double sig = sigmoid(u);
136 | // h''(u) = h(1-h)(1-2h) = 2h^3 - 3h^2 + h
137 | return 2*pow(sig, 3) - 3*pow(sig, 2) + sig;
138 | }
139 | 
140 | virtual bool valideta(double eta) const {
141 | return true;
142 | }
143 | 
144 | private:
145 | double sigmoid(double u) const {
146 | return 1. / (1. + exp(-u));
147 | }
148 | };
149 | 
150 | #endif
151 | 
--------------------------------------------------------------------------------
/src/model/glm_model.h:
--------------------------------------------------------------------------------
1 | #ifndef MODEL_GLM_MODEL_H
2 | #define MODEL_GLM_MODEL_H
3 | 
4 | #include "../basedef.h"
5 | #include "../data/data_point.h"
6 | #include "base_model.h"
7 | #include "glm/glm_family.h"
8 | #include "glm/glm_transfer.h"
9 | 
10 | class glm_model : public base_model {
11 | /**
12 |  * Generalized linear models
13 |  *
14 |  * @param model attributes affiliated with model as R type
15 |  */
16 | public:
17 | glm_model(Rcpp::List model) : base_model(model) {
18 | family_ = Rcpp::as<std::string>(model["family"]);
19 | if (family_ == "gaussian") {
20 | family_obj_ = new gaussian_family();
21 | } else if (family_ == "poisson") {
22 | family_obj_ = new poisson_family();
23 | } else if (family_ == "binomial") {
24 | family_obj_ = new binomial_family();
25 | } else if (family_ == "gamma") {
26 | family_obj_ = new gamma_family();
27 | } else {
28 | Rcpp::Rcout << "warning: model not implemented yet" << std::endl;
29 | }
30 | transfer_ = Rcpp::as<std::string>(model["transfer"]);
31 | if (transfer_ == "identity") {
32 | transfer_obj_ = new identity_transfer();
33 | } else if (transfer_ == "exp") {
34 | transfer_obj_ = new exp_transfer();
35 | } else if (transfer_ == "inverse") {
36 | transfer_obj_ = new inverse_transfer();
37 | } else if (transfer_ == "logistic") {
38 | transfer_obj_ = new logistic_transfer();
39 | }
40 | }
41 | 
42 | mat gradient(unsigned t, const mat& theta_old, const data_set& data)
43 | const {
44 | data_point data_pt = data.get_data_point(t);
45 | return ((data_pt.y - h_transfer(dot(data_pt.x, theta_old))) *
46 | data_pt.x).t() - gradient_penalty(theta_old);
47 | }
48 | 
49 | double g_link(double u) const {
50 | return transfer_obj_->link(u);
51 | }
52 | 
53 | double h_transfer(double u) const {
54 | return transfer_obj_->transfer(u);
55 | }
56 | 
57 | mat h_transfer(const mat& u) const {
58 | return transfer_obj_->transfer(u);
59 | }
60 | 
61 | double h_first_deriv(double u) const {
62 | return transfer_obj_->first_derivative(u);
63 | }
64 | 
65 | double h_second_deriv(double u) const {
66 | return transfer_obj_->second_derivative(u);
67 | }
68 | 
69 | bool valideta(double eta) const {
70 | return transfer_obj_->valideta(eta);
71 | }
72 | 
73 | double variance(double u) const {
74 | return family_obj_->variance(u);
75 | }
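// Model deviance, delegated to the family object; wt holds the observation
// weights.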
76 | 77 | double deviance(const mat& y, const mat& mu, const mat& wt) const { 78 | return family_obj_->deviance(y, mu, wt); 79 | } 80 | 81 | std::string family() const { 82 | return family_; 83 | } 84 | 85 | std::string transfer() const { 86 | return transfer_; 87 | } 88 | 89 | // Functions for implicit update 90 | double scale_factor(double ksi, double at, const data_point& data_pt, const 91 | mat& theta_old, double normx) const { 92 | return data_pt.y - h_transfer( 93 | dot(theta_old, data_pt.x) - 94 | at * dot(gradient_penalty(theta_old), data_pt.x) + 95 | ksi * normx); 96 | } 97 | 98 | double scale_factor_first_deriv(double ksi, double at, const data_point& 99 | data_pt, const mat& theta_old, double normx) const { 100 | return h_first_deriv( 101 | dot(theta_old, data_pt.x) - 102 | at * dot(gradient_penalty(theta_old), data_pt.x) + 103 | ksi * normx) * normx; 104 | } 105 | 106 | double scale_factor_second_deriv(double ksi, double at, const data_point& 107 | data_pt, const mat& theta_old, double normx) const { 108 | return h_second_deriv( 109 | dot(theta_old, data_pt.x) - 110 | at * dot(gradient_penalty(theta_old), data_pt.x) + 111 | ksi * normx) * normx * normx; 112 | } 113 | 114 | private: 115 | std::string family_; 116 | std::string transfer_; 117 | base_family* family_obj_; 118 | base_transfer* transfer_obj_; 119 | }; 120 | 121 | #endif 122 | -------------------------------------------------------------------------------- /src/model/gmm_model.h: -------------------------------------------------------------------------------- 1 | #ifndef MODEL_GMM_MODEL_H 2 | #define MODEL_GMM_MODEL_H 3 | 4 | #include "../basedef.h" 5 | #include "../data/data_point.h" 6 | #include "base_model.h" 7 | 8 | class gmm_model : public base_model { 9 | /** 10 | * Generalized method of moments 11 | * 12 | * @param model attributes affiliated with model as R type 13 | */ 14 | public: 15 | gmm_model(Rcpp::List model) : 16 | base_model(model), gr_(Rcpp::as(model["gr"])) { 17 | //if model["wmatrix"] == NULL { 18 | int k = 5; 19 | wmatrix_ = eye(k, k); 20 | //} else { 21 | // wmatrix_ = Rcpp::as(model["wmatrix"]); 22 | //} 23 | } 24 | 25 | mat gradient(unsigned t, const mat& theta_old, const data_set& data) 26 | const { 27 | data_point data_pt = data.get_data_point(t); 28 | // TODO y isn't necessary 29 | // TODO include weighting matrix 30 | Rcpp::NumericVector r_theta_old = 31 | Rcpp::as(Rcpp::wrap(theta_old)); 32 | Rcpp::NumericVector r_data_pt = 33 | Rcpp::as(Rcpp::wrap(data_pt.x)); 34 | Rcpp::NumericMatrix r_out = gr_(r_theta_old, r_data_pt); 35 | mat out = Rcpp::as(r_out); 36 | return -1. 
* out; // maximize the negative moment function 37 | } 38 | 39 | // TODO 40 | bool rank; 41 | 42 | private: 43 | mat wmatrix_; 44 | Rcpp::Function gr_; 45 | }; 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- /src/model/m-estimation/m_loss.h: -------------------------------------------------------------------------------- 1 | #ifndef MODEL_M_LOSS_H 2 | #define MODEL_M_LOSS_H 3 | 4 | #include "../../basedef.h" 5 | 6 | class base_loss; 7 | class huber_loss; 8 | 9 | class base_loss { 10 | /* Base class from which all loss function classes inherit from */ 11 | public: 12 | virtual double loss(double u, double lambda) const = 0; 13 | virtual double first_derivative(double u, double lambda) const = 0; 14 | virtual double second_derivative(double u, double lambda) const = 0; 15 | virtual double third_derivative(double u, double lambda) const = 0; 16 | virtual mat loss(const mat& u, double lambda) const { 17 | mat result = mat(u); 18 | for (unsigned i = 0; i < result.n_rows; ++i) { 19 | result(i, 0) = loss(u(i, 0), lambda); 20 | } 21 | return result; 22 | } 23 | virtual mat first_derivative(const mat& u, double lambda) const { 24 | mat result = mat(u); 25 | for (unsigned i = 0; i < result.n_rows; ++i) { 26 | result(i, 0) = first_derivative(u(i, 0), lambda); 27 | } 28 | return result; 29 | } 30 | }; 31 | 32 | class huber_loss : public base_loss { 33 | public: 34 | virtual double loss(double u, double lambda) const { 35 | if (std::abs(u) <= lambda) { 36 | return pow(u, 2)/2; 37 | } else { 38 | return lambda*std::abs(u) - pow(lambda, 2)/2; 39 | } 40 | } 41 | 42 | virtual double first_derivative(double u, double lambda) const { 43 | if (std::abs(u) <= lambda) { 44 | return u; 45 | } else { 46 | return lambda*sign(u); 47 | } 48 | } 49 | 50 | virtual double second_derivative(double u, double lambda) const { 51 | if (std::abs(u) <= lambda) { 52 | return 1.0; 53 | } else { 54 | return 0.0; 55 | } 56 | } 57 | 58 | virtual double third_derivative(double u, double lambda) const { 59 | return 0.0; 60 | } 61 | 62 | private: 63 | template 64 | double sign(const T& x) const { 65 | if (x > 0) { 66 | return 1.0; 67 | } else if (x < 0) { 68 | return -1.0; 69 | } else { 70 | return 0.0; 71 | } 72 | } 73 | }; 74 | 75 | #endif 76 | -------------------------------------------------------------------------------- /src/model/m_model.h: -------------------------------------------------------------------------------- 1 | #ifndef MODEL_M_MODEL_H 2 | #define MODEL_M_MODEL_H 3 | 4 | #include "../basedef.h" 5 | #include "../data/data_point.h" 6 | #include "base_model.h" 7 | #include "m-estimation/m_loss.h" 8 | 9 | class m_model : public base_model { 10 | /** 11 | * M-estimation 12 | * 13 | * @param model attributes affiliated with model as R type 14 | */ 15 | public: 16 | m_model(Rcpp::List model) : base_model(model) { 17 | loss_ = Rcpp::as(model["loss"]); 18 | if (loss_ == "huber") { 19 | loss_obj_ = new huber_loss(); 20 | } else { 21 | Rcpp::Rcout << "warning: loss not implemented yet" << std::endl; 22 | } 23 | lambda_ = 3.0; // default for huber loss 24 | } 25 | 26 | mat gradient(unsigned t, const mat& theta_old, const data_set& data) 27 | const { 28 | data_point data_pt = data.get_data_point(t); 29 | return (loss_obj_->first_derivative( 30 | data_pt.y - dot(data_pt.x, theta_old), lambda_) * data_pt.x).t() - 31 | gradient_penalty(theta_old); 32 | } 33 | 34 | std::string loss() const { 35 | return loss_; 36 | } 37 | 38 | // Functions for implicit update 39 | double 
scale_factor(double ksi, double at, const data_point& data_pt, const 40 | mat& theta_old, double normx) const { 41 | return loss_obj_->first_derivative( 42 | data_pt.y - dot(theta_old, data_pt.x) - 43 | at * dot(gradient_penalty(theta_old), data_pt.x) + 44 | ksi * normx, 45 | lambda_); 46 | } 47 | 48 | double scale_factor_first_deriv(double ksi, double at, const data_point& 49 | data_pt, const mat& theta_old, double normx) const { 50 | return loss_obj_->second_derivative( 51 | data_pt.y - dot(theta_old, data_pt.x) - 52 | at * dot(gradient_penalty(theta_old), data_pt.x) + 53 | ksi * normx, 54 | lambda_) * normx; 55 | } 56 | 57 | double scale_factor_second_deriv(double ksi, double at, const data_point& 58 | data_pt, const mat& theta_old, double normx) const { 59 | return loss_obj_->third_derivative( 60 | data_pt.y - dot(theta_old, data_pt.x) - 61 | at * dot(gradient_penalty(theta_old), data_pt.x) + 62 | ksi * normx, 63 | lambda_) * normx * normx; 64 | } 65 | 66 | private: 67 | std::string loss_; 68 | base_loss* loss_obj_; 69 | double lambda_; 70 | }; 71 | 72 | #endif 73 | -------------------------------------------------------------------------------- /src/post-process/cox_post_process.h: -------------------------------------------------------------------------------- 1 | #ifndef POST_PROCESS_COX_POST_PROCESS_H 2 | #define POST_PROCESS_COX_POST_PROCESS_H 3 | 4 | #include "../basedef.h" 5 | #include "../data/data_set.h" 6 | #include "../model/cox_model.h" 7 | 8 | template 9 | Rcpp::List post_process(const SGD& sgd, const data_set& data, 10 | const cox_model& model) { 11 | // TODO 12 | return Rcpp::List(); 13 | } 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /src/post-process/glm_post_process.h: -------------------------------------------------------------------------------- 1 | #ifndef POST_PROCESS_GLM_POST_PROCESS_H 2 | #define POST_PROCESS_GLM_POST_PROCESS_H 3 | 4 | #include "../basedef.h" 5 | #include "../data/data_set.h" 6 | #include "../model/glm_model.h" 7 | 8 | template 9 | Rcpp::List post_process(const SGD& sgd, const data_set& data, 10 | const glm_model& model) { 11 | // TODO 12 | return Rcpp::List(); 13 | } 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /src/post-process/gmm_post_process.h: -------------------------------------------------------------------------------- 1 | #ifndef POST_PROCESS_GMM_POST_PROCESS_H 2 | #define POST_PROCESS_GMM_POST_PROCESS_H 3 | 4 | #include "../basedef.h" 5 | #include "../data/data_set.h" 6 | #include "../model/gmm_model.h" 7 | 8 | // model.out: flag to include weighting matrix 9 | template 10 | Rcpp::List post_process(const SGD& sgd, const data_set& data, 11 | const gmm_model& model) { 12 | // TODO 13 | return Rcpp::List(); 14 | } 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /src/post-process/m_post_process.h: -------------------------------------------------------------------------------- 1 | #ifndef POST_PROCESS_M_POST_PROCESS_H 2 | #define POST_PROCESS_M_POST_PROCESS_H 3 | 4 | #include "../basedef.h" 5 | #include "../data/data_set.h" 6 | #include "../model/m_model.h" 7 | 8 | template 9 | Rcpp::List post_process(const SGD& sgd, const data_set& data, 10 | const m_model& model) { 11 | return Rcpp::List::create( 12 | Rcpp::Named("loss") = model.loss()); 13 | } 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /src/sgd.cpp: 
--------------------------------------------------------------------------------
/src/sgd.cpp:
--------------------------------------------------------------------------------
#include "basedef.h"
#include "data/data_set.h"
#include "model/cox_model.h"
#include "model/glm_model.h"
#include "model/gmm_model.h"
#include "model/m_model.h"
#include "post-process/cox_post_process.h"
#include "post-process/glm_post_process.h"
#include "post-process/gmm_post_process.h"
#include "post-process/m_post_process.h"
#include "sgd/explicit_sgd.h"
#include "sgd/implicit_sgd.h"
#include "sgd/momentum_sgd.h"
#include "sgd/nesterov_sgd.h"
#include "validity-check/validity_check.h"

// [[Rcpp::depends(BH)]]
// [[Rcpp::depends(RcppArmadillo)]]
// [[Rcpp::plugins(cpp1y)]]

template<typename MODEL, typename SGD>
Rcpp::List run(const data_set& data, MODEL& model, SGD& sgd);

/**
 * Runs the proposed model and stochastic gradient method on the data set
 *
 * @param dataset       data set
 * @param model_control attributes affiliated with model
 * @param sgd_control   attributes affiliated with sgd
 */
// [[Rcpp::export]]
Rcpp::List run(SEXP dataset, SEXP model_control, SEXP sgd_control) {
  Rcpp::List Dataset(dataset);
  Rcpp::List Model_control(model_control);
  Rcpp::List Sgd_control(sgd_control);
  if (Rcpp::as<bool>(Sgd_control["verbose"])) {
    Rcpp::Rcout << "Converting arguments from R to C++ types..." << std::endl;
  }

  // Construct data.
  data_set data(Dataset["bigmat"],
                Rcpp::as<mat>(Dataset["X"]),
                Rcpp::as<mat>(Dataset["Y"]),
                Rcpp::as<unsigned>(Sgd_control["npasses"]),
                Rcpp::as<bool>(Dataset["big"]),
                Rcpp::as<bool>(Sgd_control["shuffle"]));

  // Construct model.
  std::string model_name = Rcpp::as<std::string>(Model_control["name"]);
  if (model_name == "cox") {
    cox_model model(Model_control);
    // Construct stochastic gradient method.
    std::string sgd_name = Rcpp::as<std::string>(Sgd_control["method"]);
    if (sgd_name == "sgd" || sgd_name == "asgd") {
      explicit_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else if (sgd_name == "implicit" || sgd_name == "ai-sgd") {
      implicit_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else if (sgd_name == "momentum") {
      momentum_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else if (sgd_name == "nesterov") {
      nesterov_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else {
      Rcpp::Rcout << "error: stochastic gradient method not implemented" << std::endl;
      return Rcpp::List();
    }
  } else if (model_name == "lm" || model_name == "glm") {
    glm_model model(Model_control);
    // Construct stochastic gradient method.
    std::string sgd_name = Rcpp::as<std::string>(Sgd_control["method"]);
    if (sgd_name == "sgd" || sgd_name == "asgd") {
      explicit_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else if (sgd_name == "implicit" || sgd_name == "ai-sgd") {
      implicit_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else if (sgd_name == "momentum") {
      momentum_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else if (sgd_name == "nesterov") {
      nesterov_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else {
      Rcpp::Rcout << "error: stochastic gradient method not implemented" << std::endl;
      return Rcpp::List();
    }
  } else if (model_name == "gmm") {
    gmm_model model(Model_control);
    // Construct stochastic gradient method.
    std::string sgd_name = Rcpp::as<std::string>(Sgd_control["method"]);
    if (sgd_name == "sgd" || sgd_name == "asgd") {
      explicit_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else if (sgd_name == "implicit" || sgd_name == "ai-sgd") {
      implicit_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else if (sgd_name == "momentum") {
      momentum_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else if (sgd_name == "nesterov") {
      nesterov_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else {
      Rcpp::Rcout << "error: stochastic gradient method not implemented" << std::endl;
      return Rcpp::List();
    }
  } else if (model_name == "m") {
    m_model model(Model_control);
    // Construct stochastic gradient method.
    std::string sgd_name = Rcpp::as<std::string>(Sgd_control["method"]);
    if (sgd_name == "sgd" || sgd_name == "asgd") {
      explicit_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else if (sgd_name == "implicit" || sgd_name == "ai-sgd") {
      implicit_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else if (sgd_name == "momentum") {
      momentum_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else if (sgd_name == "nesterov") {
      nesterov_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else {
      Rcpp::Rcout << "error: stochastic gradient method not implemented" << std::endl;
      return Rcpp::List();
    }
  } else {
    Rcpp::Rcout << "error: model not implemented" << std::endl;
    return Rcpp::List();
  }

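  // The four branches above differ only in the model type; the #if 0 block
  // below records the author's note to that effect. One way to collapse them
  // (a hypothetical sketch, not part of the package) is a helper templated on
  // the model, so the method dispatch is written once:
  //
  //   template<typename MODEL>
  //   Rcpp::List dispatch_sgd(const data_set& data, MODEL& model,
  //                           Rcpp::List Sgd_control) {
  //     std::string sgd_name = Rcpp::as<std::string>(Sgd_control["method"]);
  //     if (sgd_name == "sgd" || sgd_name == "asgd") {
  //       explicit_sgd sgd(Sgd_control, data.n_samples);
  //       return run(data, model, sgd);
  //     }
  //     // ... remaining methods exactly as above ...
  //     return Rcpp::List();
  //   }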
#if 0
  // TODO The above duplicates code within the if-else statement.
  // Construct stochastic gradient method.
  std::string sgd_name = Rcpp::as<std::string>(Sgd_control["method"]);
  if (sgd_name == "sgd" || sgd_name == "asgd") {
    explicit_sgd sgd(Sgd_control, data.n_samples);
    return run(data, model, sgd);
  } else if (sgd_name == "implicit" || sgd_name == "ai-sgd") {
    implicit_sgd sgd(Sgd_control, data.n_samples);
    return run(data, model, sgd);
  } else if (sgd_name == "momentum") {
    momentum_sgd sgd(Sgd_control, data.n_samples);
    return run(data, model, sgd);
  } else if (sgd_name == "nesterov") {
    nesterov_sgd sgd(Sgd_control, data.n_samples);
    return run(data, model, sgd);
  } else {
    Rcpp::Rcout << "error: stochastic gradient method not implemented" << std::endl;
    return Rcpp::List();
  }
#endif
}

/**
 * Runs algorithm templated on the model and stochastic gradient method
 *
 * @param data data set
 * @tparam MODEL model class
 * @tparam SGD   stochastic gradient descent class
 */
template<typename MODEL, typename SGD>
Rcpp::List run(const data_set& data, MODEL& model, SGD& sgd) {
  unsigned n_samples = data.n_samples;
  // unsigned n_features = data.n_features;
  unsigned n_passes = sgd.get_n_passes();

  bool good_gradient = true;
  bool good_validity = true;
  bool averaging = false;
  if (sgd.name() == "asgd" || sgd.name() == "ai-sgd") {
    averaging = true;
  }

  // TODO these should really be vec's
  mat theta_new;
  mat theta_new_ave;
  mat theta_old = sgd.get_last_estimate();
  mat theta_old_ave = theta_old;

  unsigned max_iters = n_samples*n_passes;
  bool do_more_iterations = true;
  bool converged = false;
  if (sgd.verbose()) {
    Rcpp::Rcout << "Stochastic gradient method: " << sgd.name() << std::endl;
    Rcpp::Rcout << "SGD Start!" << std::endl;
  }
  for (unsigned t = 1; do_more_iterations; ++t) {
    theta_new = sgd.update(t, theta_old, data, model, good_gradient);

    if (averaging) {
      if (t != 1) {
        theta_new_ave = (1. - 1./(double)t) * theta_old_ave +
          1./((double)t) * theta_new;
      } else {
        theta_new_ave = theta_new;
      }
      sgd = theta_new_ave;
    } else {
      sgd = theta_new;
    }

    good_validity = validity_check(data, theta_new, good_gradient, t, model);
    if (!good_validity) {
      return Rcpp::List();
    }

    // Check whether the convergence threshold is satisfied.
    if (averaging) {
      converged = sgd.check_convergence(theta_new_ave, theta_old_ave);
    } else {
      converged = sgd.check_convergence(theta_new, theta_old);
    }
    if (converged) {
      sgd.end_early();
      do_more_iterations = false;
    }
    // Stop if hit maximum number of iterations.
    if (t == max_iters) {
      //if (!sgd.pass()) {
      //  Rcpp::Rcout
      //    << "Informational Message: The maximum number of iterations is "
      //    << "reached! The algorithm has not converged."
      //    << std::endl
      //    << "Estimates from this stochastic gradient descent are not "
      //    << "guaranteed to be meaningful."
      //    << std::endl;
      //}
      do_more_iterations = false;
    }

    // Set old to new updates and repeat.
    if (averaging) {
      theta_old_ave = theta_new_ave;
    }
    theta_old = theta_new;
  }

  Rcpp::List model_out = post_process(sgd, data, model);

  return Rcpp::List::create(
    Rcpp::Named("model") = model.name(),
    Rcpp::Named("coefficients") = sgd.get_last_estimate(),
    Rcpp::Named("converged") = converged,
    Rcpp::Named("estimates") = sgd.get_estimates(),
    Rcpp::Named("pos") = sgd.get_pos(),
    Rcpp::Named("model.out") = model_out);
}
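The averaging branch above implements the Polyak-Ruppert recursion
theta_ave_t = (1 - 1/t) * theta_ave_{t-1} + (1/t) * theta_t, which is
algebraically the running mean of all iterates; this is what distinguishes the
"asgd"/"ai-sgd" methods from their unaveraged counterparts. A standalone check
(plain C++; the iterate values are made up for illustration):

#include <cstdio>
#include <vector>

int main() {
  std::vector<double> iterates = {4.0, 2.0, 6.0, 8.0};
  double ave = 0.0, sum = 0.0;
  for (unsigned t = 1; t <= iterates.size(); ++t) {
    double theta_t = iterates[t-1];
    if (t == 1) ave = theta_t;                              // base case, as in run()
    else        ave = (1.0 - 1.0/t) * ave + (1.0/t) * theta_t;
    sum += theta_t;
    // The recursion and the arithmetic mean agree at every t.
    std::printf("t=%u recursion=%.4f mean=%.4f\n", t, ave, sum/t);
  }
  return 0;
}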
--------------------------------------------------------------------------------
/src/sgd/base_sgd.h:
--------------------------------------------------------------------------------
#ifndef SGD_BASE_SGD_H
#define SGD_BASE_SGD_H

#include "../basedef.h"
#include "../learn-rate/base_learn_rate.h"
#include "../learn-rate/onedim_learn_rate.h"
#include "../learn-rate/onedim_eigen_learn_rate.h"
#include "../learn-rate/ddim_learn_rate.h"

class base_sgd {
  /**
   * Base class for stochastic gradient descent
   *
   * @param sgd       attributes affiliated with sgd as R type
   * @param n_samples number of data samples
   * @param ti        timer for benchmarking how long to get each estimate
   */
public:
  base_sgd(Rcpp::List sgd, unsigned n_samples) {
    name_ = Rcpp::as<std::string>(sgd["method"]);
    n_params_ = Rcpp::as<unsigned>(sgd["nparams"]);
    reltol_ = Rcpp::as<double>(sgd["reltol"]);
    n_passes_ = Rcpp::as<unsigned>(sgd["npasses"]);
    size_ = Rcpp::as<unsigned>(sgd["size"]);
    estimates_ = zeros<mat>(n_params_, size_);
    last_estimate_ = Rcpp::as<mat>(sgd["start"]);
    t_ = 0;
    n_recorded_ = 0;
    pos_ = Mat<unsigned>(1, size_);
    pass_ = Rcpp::as<bool>(sgd["pass"]);
    verbose_ = Rcpp::as<bool>(sgd["verbose"]);

    check_ = Rcpp::as<bool>(sgd["check"]);
    if (check_) {
      truth_ = Rcpp::as<mat>(sgd["truth"]);
    }

    // Set which iterations to store estimates
    unsigned n_iters = n_samples*n_passes_;
    for (unsigned i = 0; i < size_; ++i) {
      pos_(0, i) = int(round(pow(10.,
        i * log10(static_cast<double>(n_iters)) / (size_-1))));
    }
    if (pos_(0, pos_.n_cols-1) != n_iters) {
      pos_(0, pos_.n_cols-1) = n_iters;
    }
    if (n_iters < size_) {
      Rcpp::Rcout << "Warning: Too few data points for plotting!"
        << std::endl;
    }
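    // Worked example of the recording grid above: with n_samples = 1000 and
    // n_passes_ = 10 (so n_iters = 10^4) and size_ = 5, the stored iterations
    // are round(10^(i * 4/4)) for i = 0..4, i.e.
    //   pos_ = {1, 10, 100, 1000, 10000}
    // -- log-uniform rather than every iteration, which keeps estimates_
    // small enough for plotting and convergence diagnostics.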

    // Set learning rate
    std::string lr = Rcpp::as<std::string>(sgd["lr"]);
    vec lr_control = Rcpp::as<vec>(sgd["lr.control"]);
    if (lr == "one-dim") {
      lr_obj_ = new onedim_learn_rate(lr_control(0), lr_control(1),
                                      lr_control(2), lr_control(3));
    } else if (lr == "one-dim-eigen") {
      lr_obj_ = new onedim_eigen_learn_rate(n_params_);
    } else if (lr == "d-dim") {
      lr_obj_ = new ddim_learn_rate(n_params_, 1., 0., 1., 1.,
                                    lr_control(0));
    } else if (lr == "adagrad") {
      lr_obj_ = new ddim_learn_rate(n_params_, lr_control(0), 1., 1., .5,
                                    lr_control(1));
    } else if (lr == "rmsprop") {
      lr_obj_ = new ddim_learn_rate(n_params_, lr_control(0), lr_control(1),
                                    1-lr_control(1), .5, lr_control(2));
    }
  }

  std::string name() const {
    return name_;
  }
  // TODO set naming convention properly
  unsigned get_n_passes() const {
    return n_passes_;
  }
  mat get_estimates() const {
    return estimates_;
  }
  mat get_last_estimate() const {
    return last_estimate_;
  }
  Mat<unsigned> get_pos() const {
    return pos_;
  }
  bool pass() const {
    return pass_;
  }
  bool verbose() const {
    return verbose_;
  }

  // Check whether the convergence threshold is satisfied.
  bool check_convergence(const mat& theta_new, const mat& theta_old) const {
    // if checking against truth
    double diff;
    if (check_) {
      diff = mean(mean(pow(theta_new - truth_, 2)));
      if (diff < 0.001) {
        return true;
      }
    // if not running fixed number of iterations
    } else if (!pass_) {
      diff = mean(mean(abs(theta_new - theta_old))) /
        mean(mean(abs(theta_old)));
      if (diff < reltol_) {
        return true;
      }
    }
    return false;
  }

  const learn_rate_value& learning_rate(unsigned t, const mat& grad_t) {
    return (*lr_obj_)(t, grad_t);
  }

  //TODO declare update method
  //template<typename MODEL>
  //mat update(unsigned t, const mat& theta_old, const data_set& data,
  //  MODEL& model, bool& good_gradient);

  // base_sgd& operator=(const mat& theta_new) {
  //   last_estimate_ = theta_new;
  //   t_ += 1;
  //   if (t_ == pos_[n_recorded_]) {
  //     estimates_.col(n_recorded_) = theta_new;
  //     times_.at(n_recorded_) = ti_.elapsed();
  //     n_recorded_ += 1;
  //     while (n_recorded_ < size_ && pos_[n_recorded_-1] == pos_[n_recorded_]) {
  //       estimates_.col(n_recorded_) = theta_new;
  //       times_.at(n_recorded_) = times_.at(n_recorded_-1);
  //       n_recorded_ += 1;
  //     }
  //   }
  //   return *this;
  // }

  base_sgd& operator=(const mat& theta_new) {
    last_estimate_ = theta_new;
    t_ += 1;
    if (t_ == pos_[n_recorded_]) {
      estimates_.col(n_recorded_) = theta_new;
      n_recorded_ += 1;
      while (n_recorded_ < size_ && pos_[n_recorded_-1] == pos_[n_recorded_]) {
        estimates_.col(n_recorded_) = theta_new;
        n_recorded_ += 1;
      }
    }
    return *this;
  }

  void end_early() {
    // Throw away the space for things that were not recorded.
    pos_.shed_cols(n_recorded_, size_-1);
    estimates_.shed_cols(n_recorded_, size_-1);
  }

protected:
  std::string name_;        // name of stochastic gradient method
  unsigned n_params_;       // number of parameters
  double reltol_;           // relative tolerance for convergence
  unsigned n_passes_;       // number of passes over data
  unsigned size_;           // number of estimates to be recorded (log-uniformly)
  mat estimates_;           // collection of stored estimates
  mat last_estimate_;       // last SGD estimate
  base_learn_rate* lr_obj_; // learning rate
  unsigned t_;              // current iteration
  unsigned n_recorded_;     // number of coefs that have been recorded
  Mat<unsigned> pos_;       // the iteration of recorded coefficients
  bool pass_;               // whether to force running for n_passes_ over data
  bool verbose_;
  bool check_;
  mat truth_;
};

#endif

--------------------------------------------------------------------------------
/src/sgd/explicit_sgd.h:
--------------------------------------------------------------------------------
#ifndef SGD_EXPLICIT_SGD_H
#define SGD_EXPLICIT_SGD_H

#include "../basedef.h"
#include "../data/data_set.h"
#include "../learn-rate/learn_rate_value.h"
#include "base_sgd.h"

class explicit_sgd : public base_sgd {
  /**
   * Stochastic gradient descent in standard formulation, i.e., using an
   * "explicit" update
   *
   * @param sgd       attributes affiliated with sgd as R type
   * @param n_samples number of data samples
   */
public:
  explicit_sgd(Rcpp::List sgd, unsigned n_samples) :
    base_sgd(sgd, n_samples) {}

  template<typename MODEL>
  mat update(unsigned t, const mat& theta_old, const data_set& data,
    MODEL& model, bool& good_gradient) {
    mat grad_t = model.gradient(t, theta_old, data);
    if (!is_finite(grad_t)) {
      good_gradient = false;
    }
    learn_rate_value at = learning_rate(t, grad_t);
    return theta_old + (at * grad_t);
  }

  explicit_sgd& operator=(const mat& theta_new) {
    base_sgd::operator=(theta_new);
    return *this;
  }
};

#endif
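The explicit update above is theta <- theta + a_t * grad, i.e., ascent on the
log-likelihood with the gradient evaluated at the current iterate. A standalone
sketch of one such step for a linear model, where the per-observation gradient
is (y - x^T theta) * x (plain C++ with a scalar learning rate; the numbers are
made up for illustration):

#include <cstdio>

int main() {
  const int d = 2;
  double theta[d] = {0.0, 0.0};
  double x[d] = {1.0, 2.0};
  double y = 3.0, at = 0.1;

  // residual y - x^T theta at the current iterate
  double resid = y;
  for (int i = 0; i < d; ++i) resid -= x[i] * theta[i];

  // explicit step: theta <- theta + a_t * (y - x^T theta) * x
  for (int i = 0; i < d; ++i) theta[i] += at * resid * x[i];

  std::printf("theta = (%.2f, %.2f)\n", theta[0], theta[1]); // (0.30, 0.60)
  return 0;
}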
--------------------------------------------------------------------------------
/src/sgd/implicit_sgd.h:
--------------------------------------------------------------------------------
#ifndef SGD_IMPLICIT_SGD_H
#define SGD_IMPLICIT_SGD_H

#include "../basedef.h"
#include "../data/data_point.h"
#include "../data/data_set.h"
#include "../model/cox_model.h"
#include "../model/glm_model.h"
#include "../model/gmm_model.h"
#include "../model/m_model.h"
#include "../learn-rate/learn_rate_value.h"
#include "base_sgd.h"

template<typename MODEL>
class Implicit_fn {
  // Root finding functor for implicit update
  // Evaluates the zeroth, first, and second derivatives of:
  //   ksi - ell(x^T theta + ||x||^2 * ksi)
public:
  typedef boost::math::tuple<double, double, double> tuple_type;

  Implicit_fn(const MODEL& m, double a, const data_point& d, const mat& t,
    double n) : model_(m), at_(a), data_pt_(d), theta_old_(t), normx_(n) {}

  tuple_type operator()(double ksi) const {
    double value = ksi - at_ *
      model_.scale_factor(ksi, at_, data_pt_, theta_old_, normx_);
    double first = 1 + at_ *
      model_.scale_factor_first_deriv(ksi, at_, data_pt_, theta_old_, normx_);
    double second = at_ *
      model_.scale_factor_second_deriv(ksi, at_, data_pt_, theta_old_, normx_);
    tuple_type out(value, first, second);
    return out;
  }

private:
  const MODEL& model_;
  double at_;
  const data_point& data_pt_;
  const mat& theta_old_;
  double normx_;
};

class implicit_sgd : public base_sgd {
  /**
   * Stochastic gradient descent using an "implicit" update
   *
   * @param sgd       attributes affiliated with sgd as R type
   * @param n_samples number of data samples
   */
public:
  implicit_sgd(Rcpp::List sgd, unsigned n_samples) :
    base_sgd(sgd, n_samples) {
    delta_ = Rcpp::as<double>(sgd["delta"]);
  }

  mat update(unsigned t, const mat& theta_old, const data_set& data,
    glm_model& model, bool& good_gradient) {
    mat theta_new;
    learn_rate_value at = learning_rate(t, model.gradient(t, theta_old, data));
    // TODO how to deal with non-scalar learning rates?
    double at_avg = at.mean();

    data_point data_pt = data.get_data_point(t);
    double normx = dot(data_pt.x, data_pt.x);

    double r = at_avg * model.scale_factor(0, at_avg, data_pt, theta_old, normx);
    double lower = 0;
    double upper = 0;
    if (r < 0) {
      lower = r;
    } else {
      upper = r;
    }
    double ksi;
    if (lower != upper) {
      Implicit_fn<glm_model> implicit_fn(model, at_avg, data_pt, theta_old, normx);
      ksi = boost::math::tools::schroeder_iterate(implicit_fn, (lower +
        upper)/2, lower, upper, delta_);
    } else {
      ksi = lower;
    }
    return theta_old +
      ksi * data_pt.x.t() -
      at_avg * model.gradient_penalty(theta_old);
  }

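  // Worked note: for the Gaussian linear model the implicit equation
  //   theta_new = theta_old + a_t * (y - x^T theta_new) * x
  // has a closed form. Writing theta_new = theta_old + ksi * x gives
  //   ksi = a_t * (y - x^T theta_old) / (1 + a_t * ||x||^2),
  // which also suggests why the bracket [lower, upper] above is valid:
  // ksi has the same sign as r = a_t * scale_factor(0, ...) and
  // |ksi| <= |r|, so the root always lies between 0 and r.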
  mat update(unsigned t, const mat& theta_old, const data_set& data,
    m_model& model, bool& good_gradient) {
    mat theta_new;
    learn_rate_value at = learning_rate(t, model.gradient(t, theta_old, data));
    // TODO how to deal with non-scalar learning rates?
    double at_avg = at.mean();

    data_point data_pt = data.get_data_point(t);
    double normx = dot(data_pt.x, data_pt.x);

    double r = at_avg * model.scale_factor(0, at_avg, data_pt, theta_old, normx);
    double lower = 0;
    double upper = 0;
    if (r < 0) {
      lower = r;
    } else {
      upper = r;
    }
    double ksi;
    if (lower != upper) {
      Implicit_fn<m_model> implicit_fn(model, at_avg, data_pt, theta_old, normx);
      ksi = boost::math::tools::schroeder_iterate(implicit_fn, (lower +
        upper)/2, lower, upper, delta_);
    } else {
      ksi = lower;
    }
    return theta_old +
      ksi * data_pt.x.t() -
      at_avg * model.gradient_penalty(theta_old);
  }

  mat update(unsigned t, const mat& theta_old, const data_set& data,
    cox_model& model, bool& good_gradient) {
    data_point data_pt = data.get_data_point(t);
    unsigned j = data_pt.idx;

    // assuming data points fail in order, i.e., risk set R_i={i,i+1,...,n}
    vec xi = exp(data.X * theta_old);
    vec h = zeros<vec>(j);
    double sum_xi = 0;
    // walk from i = j-1 down to 0, accumulating the risk-set sums
    for (int i = j-1; i >= 0; --i) {
      // h_i = d_i/sum(xi[i:n])
      if (i == j-1) {
        for (int k = i; k < data.n_samples; ++k) {
          sum_xi += xi(k);
        }
      } else {
        sum_xi += xi(i);
      }
      h(i) = data.Y(i)/sum_xi;
    }
    double eta_j = accu(data_pt.x.t() % theta_old); // x_j^T * theta
    double z = eta_j + data_pt.y - xi[j] * sum(h);
    double xjnorm = accu(data_pt.x % data_pt.x); // |x_j|^2_2

    //learn_rate_value at = learning_rate(t, model.gradient(t, theta_old, data));
    learn_rate_value at = learning_rate(t, zeros<vec>(data.n_features));
    // TODO how to deal with non-scalar learning rates?
    double at_avg = at.mean();

    mat grad_t = (z - (eta_j + at_avg*z*xjnorm)/(1 + at_avg*xjnorm)) *
      data_pt.x.t();
    if (!is_finite(grad_t)) {
      good_gradient = false;
    }
    return theta_old + (at * grad_t);
  }

  template<typename MODEL>
  mat update(unsigned t, const mat& theta_old, const data_set& data,
    MODEL& model, bool& good_gradient) {
    Rcpp::Rcout << "error: implicit not implemented for model yet" << std::endl;
    good_gradient = false;
    return theta_old;
  }

  implicit_sgd& operator=(const mat& theta_new) {
    base_sgd::operator=(theta_new);
    return *this;
  }
private:
  double delta_;
};

#endif
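momentum_sgd below keeps a velocity v <- mu*v + a_t*grad and moves theta by v,
so past gradients keep contributing with geometrically decaying weight mu = 0.9.
A standalone sketch on a one-dimensional quadratic (plain C++; the fixed
learning rate and target are chosen for illustration, using the package's
ascent convention):

#include <cstdio>

int main() {
  const double mu = 0.9, a = 0.1;
  double theta = 0.0, v = 0.0;
  for (int t = 1; t <= 100; ++t) {
    double grad = -(theta - 5.0);  // gradient of -(theta - 5)^2 / 2
    v = mu*v + a*grad;             // velocity accumulates past gradients
    theta += v;
    if (t <= 3 || t == 100) {
      std::printf("t=%3d v=%+.4f theta=%.4f\n", t, v, theta);
    }
  }
  return 0;  // theta approaches the optimum at 5
}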
--------------------------------------------------------------------------------
/src/sgd/momentum_sgd.h:
--------------------------------------------------------------------------------
#ifndef SGD_MOMENTUM_SGD_H
#define SGD_MOMENTUM_SGD_H

#include "../basedef.h"
#include "../data/data_set.h"
#include "../learn-rate/learn_rate_value.h"
#include "base_sgd.h"

class momentum_sgd : public base_sgd {
  /**
   * Stochastic gradient descent using classical momentum
   *
   * @param sgd       attributes affiliated with sgd as R type
   * @param n_samples number of data samples
   */
public:
  momentum_sgd(Rcpp::List sgd, unsigned n_samples) :
    base_sgd(sgd, n_samples) {
    mu_ = 0.9;
    v_ = last_estimate_;
  }

  template<typename MODEL>
  mat update(unsigned t, const mat& theta_old, const data_set& data,
    MODEL& model, bool& good_gradient) {
    mat grad_t = model.gradient(t, theta_old, data);
    if (!is_finite(grad_t)) {
      good_gradient = false;
    }
    learn_rate_value at = learning_rate(t, grad_t);
    v_ = mu_ * v_ + (at * grad_t);
    return theta_old + v_;
  }

  momentum_sgd& operator=(const mat& theta_new) {
    base_sgd::operator=(theta_new);
    return *this;
  }
private:
  double mu_; // factor to weigh previous "velocity"
  mat v_;     // "velocity"
};

#endif

--------------------------------------------------------------------------------
/src/sgd/nesterov_sgd.h:
--------------------------------------------------------------------------------
#ifndef SGD_NESTEROV_SGD_H
#define SGD_NESTEROV_SGD_H

#include "../basedef.h"
#include "../data/data_set.h"
#include "../learn-rate/learn_rate_value.h"
#include "base_sgd.h"

class nesterov_sgd : public base_sgd {
  /**
   * Stochastic gradient descent using Nesterov momentum
   *
   * @param sgd       attributes affiliated with sgd as R type
   * @param n_samples number of data samples
   */
public:
  nesterov_sgd(Rcpp::List sgd, unsigned n_samples) :
    base_sgd(sgd, n_samples) {
    mu_ = 0.9;
    v_ = last_estimate_;
  }

  template<typename MODEL>
  mat update(unsigned t, const mat& theta_old, const data_set& data,
    MODEL& model, bool& good_gradient) {
    mat grad_t = model.gradient(t, theta_old + mu_*v_, data);
    if (!is_finite(grad_t)) {
      good_gradient = false;
    }
    learn_rate_value at = learning_rate(t, model.gradient(t, theta_old, data));
    v_ = mu_ * v_ + (at * grad_t);
    return theta_old + v_;
  }

  nesterov_sgd& operator=(const mat& theta_new) {
    base_sgd::operator=(theta_new);
    return *this;
  }
private:
  double mu_; // factor to weigh previous "velocity"
  mat v_;     // "velocity"
};

#endif
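The only difference from classical momentum above is that nesterov_sgd
evaluates the model gradient at the lookahead point theta + mu*v rather than at
theta. The same one-dimensional sketch as before with that single change, run
side by side (plain C++; illustrative values only, not package code):

#include <cstdio>

int main() {
  const double mu = 0.9, a = 0.1;
  double th_m = 0.0, v_m = 0.0;  // classical momentum
  double th_n = 0.0, v_n = 0.0;  // Nesterov momentum
  for (int t = 1; t <= 50; ++t) {
    double g_m = -(th_m - 5.0);             // gradient at the current iterate
    double g_n = -((th_n + mu*v_n) - 5.0);  // gradient at the lookahead point
    v_m = mu*v_m + a*g_m;  th_m += v_m;
    v_n = mu*v_n + a*g_n;  th_n += v_n;
  }
  std::printf("momentum: %.4f  nesterov: %.4f  (target 5)\n", th_m, th_n);
  return 0;
}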
--------------------------------------------------------------------------------
/src/validity-check/cox_validity_check_model.h:
--------------------------------------------------------------------------------
#ifndef VALIDITY_CHECK_COX_VALIDITY_CHECK_MODEL_H
#define VALIDITY_CHECK_COX_VALIDITY_CHECK_MODEL_H

#include "../basedef.h"
#include "../data/data_set.h"
#include "../model/cox_model.h"

bool validity_check_model(const data_set& data, const mat& theta, unsigned t,
  const cox_model& model) {
  // TODO
  return true;
}

#endif

--------------------------------------------------------------------------------
/src/validity-check/glm_validity_check_model.h:
--------------------------------------------------------------------------------
#ifndef VALIDITY_CHECK_GLM_VALIDITY_CHECK_MODEL_H
#define VALIDITY_CHECK_GLM_VALIDITY_CHECK_MODEL_H

#include "../basedef.h"
#include "../data/data_set.h"
#include "../model/glm_model.h"

bool validity_check_model(const data_set& data, const mat& theta, unsigned t,
  const glm_model& model) {
  // TODO should this really be checked at each iteration?
  return true;
}

#endif

--------------------------------------------------------------------------------
/src/validity-check/gmm_validity_check_model.h:
--------------------------------------------------------------------------------
#ifndef VALIDITY_CHECK_GMM_VALIDITY_CHECK_MODEL_H
#define VALIDITY_CHECK_GMM_VALIDITY_CHECK_MODEL_H

#include "../basedef.h"
#include "../data/data_set.h"
#include "../model/gmm_model.h"

bool validity_check_model(const data_set& data, const mat& theta, unsigned t,
  const gmm_model& model) {
  // TODO
  return true;
}

#endif

--------------------------------------------------------------------------------
/src/validity-check/m_validity_check_model.h:
--------------------------------------------------------------------------------
#ifndef VALIDITY_CHECK_M_VALIDITY_CHECK_MODEL_H
#define VALIDITY_CHECK_M_VALIDITY_CHECK_MODEL_H

#include "../basedef.h"
#include "../data/data_set.h"
#include "../model/m_model.h"

bool validity_check_model(const data_set& data, const mat& theta, unsigned t,
  const m_model& model) {
  // TODO
  return true;
}

#endif

--------------------------------------------------------------------------------
/src/validity-check/validity_check.h:
--------------------------------------------------------------------------------
#ifndef VALIDITY_CHECK_VALIDITY_CHECK_H
#define VALIDITY_CHECK_VALIDITY_CHECK_H

#include "../basedef.h"
#include "../data/data_set.h"
#include "cox_validity_check_model.h"
#include "glm_validity_check_model.h"
#include "gmm_validity_check_model.h"
#include "m_validity_check_model.h"

template<typename MODEL>
bool validity_check(const data_set& data, const mat& theta, bool good_gradient,
  unsigned t, const MODEL& model) {
  // Check if gradient is finite.
  if (!good_gradient) {
    Rcpp::Rcout << "error: NA or infinite gradient" << std::endl;
    return false;
  }

  // Check if all estimates are finite.
  if (!is_finite(theta)) {
    Rcpp::Rcout << "warning: non-finite coefficients at iteration " << t << std::endl;
  }

  return validity_check_model(data, theta, t, model);
}

#endif

--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
library(testthat)
test_check("sgd")

--------------------------------------------------------------------------------
/tests/testthat/test-fitted.R:
--------------------------------------------------------------------------------
context("Fitted generic method")

test_that("Fitted generic method", {

  skip_on_cran()

  # Dimensions
  N <- 1e4
  d <- 5

  # Generate data.
  set.seed(42)
  X <- matrix(rnorm(N*d), ncol=d)
  theta <- rep(5, d+1)
  eps <- rnorm(N)
  y <- cbind(1, X) %*% theta + eps
  dat <- data.frame(y=y, x=X)

  sgd.theta <- sgd(y ~ ., data=dat, model="lm")
  fitted(sgd.theta)

  # Check that it executes without error.
  expect_true(TRUE)
})

--------------------------------------------------------------------------------
/tests/testthat/test-lasso.R:
--------------------------------------------------------------------------------
context("Linear regression with lasso penalty")

test_that("MSE converges for linear regression with lasso", {
  skip_on_cran()

  library(glmnet)

  # Dimensions
  N <- 1e5
  d <- 5

  # Generate data.
  set.seed(42)
  X <- matrix(rnorm(N*d), ncol=d)
  theta <- rep(5, d)
  eps <- rnorm(N)
  y <- X %*% theta + eps
  dat <- data.frame(y=y, x=X)

  glmnet.theta <- glmnet(X, y, alpha=1, lambda=0.5, standardize=FALSE,
                         type.gaussian="covariance")
  truth <- as.vector(glmnet.theta$beta)

  get.mse <- function(method) {
    sgd.theta <- sgd(y ~ .-1, data=dat, model="lm",
                     model.control=list(lambda1=0.5),
                     sgd.control=list(
                       method=method,
                       pass=T))
    mean((sgd.theta$coefficients - truth)^2)
  }

  expect_true(get.mse("sgd") < 1e-2)
  expect_true(get.mse("ai-sgd") < 1e-2)
})

--------------------------------------------------------------------------------
/tests/testthat/test-linear.R:
--------------------------------------------------------------------------------
context("Linear regression")

test_that("MSE converges for linear models", {

  skip_on_cran()

  # Dimensions
  N <- 1e4
  d <- 5

  # Generate data.
  set.seed(42)
  X <- matrix(rnorm(N*d), ncol=d)
  theta <- rep(5, d+1)
  eps <- rnorm(N)
  y <- cbind(1, X) %*% theta + eps
  dat <- data.frame(y=y, x=X)

  get.mse <- function(method, lr) {
    sgd.theta <- sgd(y ~ ., data=dat, model="lm",
                     sgd.control=list(
                       method=method,
                       lr=lr,
                       npasses=10,
                       pass=T))
    mean((sgd.theta$coefficients - theta)^2)
  }

  expect_true(get.mse("sgd", "one-dim") < 1e-2)
  expect_true(get.mse("sgd", "adagrad") < 1e-2)
  #expect_true(get.mse("sgd", "rmsprop") < 1e-2)
  expect_true(get.mse("implicit", "one-dim") < 1e-2)
  #expect_true(get.mse("implicit", "d-dim") < 1e-2)
  expect_true(get.mse("implicit", "adagrad") < 1e-2)
  #expect_true(get.mse("implicit", "rmsprop") < 1e-2)
  expect_true(get.mse("asgd", "one-dim") < 1e-2)
  expect_true(get.mse("asgd", "adagrad") < 1e-2)
  expect_true(get.mse("asgd", "rmsprop") < 1e-2)
  expect_true(get.mse("ai-sgd", "one-dim") < 1e-2)
  expect_true(get.mse("ai-sgd", "d-dim") < 1e-2)
  expect_true(get.mse("ai-sgd", "adagrad") < 1e-2)
  expect_true(get.mse("ai-sgd", "rmsprop") < 1e-2)
  expect_true(get.mse("momentum", "one-dim") < 1e-2)
  #expect_true(get.mse("momentum", "d-dim") < 1e-2)
  #expect_true(get.mse("momentum", "adagrad") < 1e-2)
  #expect_true(get.mse("momentum", "rmsprop") < 1e-2)
  expect_true(get.mse("nesterov", "one-dim") < 1e-2)
  #expect_true(get.mse("nesterov", "d-dim") < 1e-2)
  #expect_true(get.mse("nesterov", "adagrad") < 1e-2)
  #expect_true(get.mse("nesterov", "rmsprop") < 1e-2)
})

--------------------------------------------------------------------------------
/tests/testthat/test-predict.R:
--------------------------------------------------------------------------------
context("Predict generic method")

test_that("Predict generic method", {

  skip_on_cran()

  # Dimensions
  N <- 1e4
  d <- 5

  # Generate data.
  set.seed(42)
  X <- matrix(rnorm(N*d), ncol=d)
  theta <- rep(5, d+1)
  eps <- rnorm(N)
  y <- cbind(1, X) %*% theta + eps
  dat <- data.frame(y=y, x=X)

  sgd.theta <- sgd(y ~ ., data=dat, model="lm")
  predict(sgd.theta, cbind(1, X))
  predict(sgd.theta, cbind(1, X), type="response")
  predict(sgd.theta, cbind(1, X), type="term")

  # Check that it executes without error.
  expect_true(TRUE)
})

--------------------------------------------------------------------------------
/tests/testthat/test-residuals.R:
--------------------------------------------------------------------------------
context("Residuals generic method")

test_that("Residuals generic method", {

  skip_on_cran()

  # Dimensions
  N <- 1e4
  d <- 5

  # Generate data.
  set.seed(42)
  X <- matrix(rnorm(N*d), ncol=d)
  theta <- rep(5, d+1)
  eps <- rnorm(N)
  y <- cbind(1, X) %*% theta + eps
  dat <- data.frame(y=y, x=X)

  sgd.theta <- sgd(y ~ ., data=dat, model="lm")
  residuals(sgd.theta)

  # Check that it executes without error.
  expect_true(TRUE)
})

--------------------------------------------------------------------------------
/vignettes/sgd-jss.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/airoldilab/sgd/92246a3e1d81c9cd3077b80cacb6a3f7ee2425c5/vignettes/sgd-jss.pdf

--------------------------------------------------------------------------------
/vignettes/sgd-jss.pdf.asis:
--------------------------------------------------------------------------------
%\VignetteIndexEntry{Stochastic gradient descent methods for estimation with large data sets}
%\VignetteEngine{R.rsp::asis}
%\VignetteKeyword{PDF}
%\VignetteKeyword{HTML}
%\VignetteKeyword{vignette}
%\VignetteKeyword{package}
--------------------------------------------------------------------------------