├── .Rbuildignore
├── .gitignore
├── DESCRIPTION
├── NAMESPACE
├── NEWS.md
├── R
│   ├── RcppExports.R
│   ├── coef.sgd.R
│   ├── data-winequality.R
│   ├── fitted.sgd.R
│   ├── plot.sgd.R
│   ├── predict.sgd.R
│   ├── print.sgd.R
│   ├── residuals.sgd.R
│   └── sgd.R
├── README.md
├── cran-comments.md
├── data
│   └── winequality.rda
├── demo
│   ├── 00Index
│   ├── bench-corr-linear-regression.R
│   ├── bench-linear-regression.R
│   ├── bench-logistic-wine.R
│   ├── cox-regression.R
│   ├── glm-logistic-regression.R
│   ├── glm-poisson-regression.R
│   ├── linear-regression.R
│   ├── m-estimation.R
│   └── normal-method-of-moments.R
├── man
│   ├── coef.sgd.Rd
│   ├── fitted.sgd.Rd
│   ├── plot.sgd.Rd
│   ├── predict.sgd.Rd
│   ├── print.sgd.Rd
│   ├── residuals.sgd.Rd
│   ├── sgd.Rd
│   └── winequality.Rd
├── src
│   ├── Makevars
│   ├── Makevars.win
│   ├── RcppExports.cpp
│   ├── basedef.h
│   ├── data
│   │   ├── data_point.h
│   │   └── data_set.h
│   ├── learn-rate
│   │   ├── base_learn_rate.h
│   │   ├── ddim_learn_rate.h
│   │   ├── learn_rate_value.h
│   │   ├── onedim_eigen_learn_rate.h
│   │   └── onedim_learn_rate.h
│   ├── model
│   │   ├── base_model.h
│   │   ├── cox_model.h
│   │   ├── glm
│   │   │   ├── glm_family.h
│   │   │   └── glm_transfer.h
│   │   ├── glm_model.h
│   │   ├── gmm_model.h
│   │   ├── m-estimation
│   │   │   └── m_loss.h
│   │   └── m_model.h
│   ├── post-process
│   │   ├── cox_post_process.h
│   │   ├── glm_post_process.h
│   │   ├── gmm_post_process.h
│   │   └── m_post_process.h
│   ├── sgd.cpp
│   ├── sgd
│   │   ├── base_sgd.h
│   │   ├── explicit_sgd.h
│   │   ├── implicit_sgd.h
│   │   ├── momentum_sgd.h
│   │   └── nesterov_sgd.h
│   └── validity-check
│       ├── cox_validity_check_model.h
│       ├── glm_validity_check_model.h
│       ├── gmm_validity_check_model.h
│       ├── m_validity_check_model.h
│       └── validity_check.h
├── tests
│   ├── testthat.R
│   └── testthat
│       ├── test-fitted.R
│       ├── test-lasso.R
│       ├── test-linear.R
│       ├── test-predict.R
│       └── test-residuals.R
└── vignettes
    ├── sgd-jss.pdf
    └── sgd-jss.pdf.asis

/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^cran-comments\.md$
2 | ^.*\.Rproj$
3 | ^\.Rproj\.user$
4 | ^sgd\.Rproj$
5 | ^CRAN-SUBMISSION$

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.DS_Store
2 | *.Rhistory
3 | *.Rdata
4 | *.so
5 | *.o
6 | *.Rproj
7 | .Rproj.user

--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: sgd
 2 | Type: Package
 3 | Title: Stochastic Gradient Descent for Scalable Estimation
 4 | Version: 1.1.2
 5 | Authors@R: c(
 6 |   person("Junhyung Lyle", "Kim", email = "jlylekim@gmail.com", role = c("cre", "aut")),
 7 |   person("Dustin", "Tran", role = "aut"),
 8 |   person("Panos", "Toulis", role = "aut"),
 9 |   person("Tian", "Lian", role = "ctb"),
10 |   person("Ye", "Kuang", role = "ctb"),
11 |   person("Edoardo", "Airoldi", role = "ctb")
12 |   )
13 | Maintainer: Junhyung Lyle Kim <jlylekim@gmail.com>
14 | Description: A fast and flexible set of tools for large scale estimation. It
15 |     features many stochastic gradient methods, built-in models, visualization
16 |     tools, automated hyperparameter tuning, model checking, interval estimation,
17 |     and convergence diagnostics.
18 | URL: https://github.com/airoldilab/sgd
19 | BugReports: https://github.com/airoldilab/sgd/issues
20 | License: GPL-2
21 | Imports:
22 |     ggplot2,
23 |     MASS,
24 |     methods,
25 |     Rcpp (>= 0.11.3),
26 |     stats
27 | Suggests:
28 |     bigmemory,
29 |     glmnet,
30 |     gridExtra,
31 |     R.rsp,
32 |     testthat
33 | LinkingTo:
34 |     BH,
35 |     bigmemory,
36 |     Rcpp,
37 |     RcppArmadillo
38 | LazyData: yes
39 | VignetteBuilder: R.rsp
40 | Encoding: UTF-8
41 | RoxygenNote: 7.3.1

--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | S3method(coef,sgd)
 4 | S3method(fitted,sgd)
 5 | S3method(plot,list)
 6 | S3method(plot,sgd)
 7 | S3method(predict,sgd)
 8 | S3method(print,sgd)
 9 | S3method(residuals,sgd)
10 | S3method(sgd,big.matrix)
11 | S3method(sgd,default)
12 | S3method(sgd,formula)
13 | S3method(sgd,matrix)
14 | export(predict_all)
15 | export(sgd)
16 | import(MASS)
17 | importFrom(Rcpp,evalCpp)
18 | importFrom(methods,new)
19 | importFrom(stats,coef)
20 | importFrom(stats,fitted)
21 | importFrom(stats,gaussian)
22 | importFrom(stats,is.empty.model)
23 | importFrom(stats,model.matrix)
24 | importFrom(stats,model.response)
25 | importFrom(stats,predict)
26 | importFrom(stats,rnorm)
27 | useDynLib(sgd)

--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
1 | # sgd 1.1.2
2 | 
3 | * Added a `NEWS.md` file to track changes to the package.

--------------------------------------------------------------------------------
/R/RcppExports.R:
--------------------------------------------------------------------------------
1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand
2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
3 | 
4 | run <- function(dataset, model_control, sgd_control) {
5 |   .Call('_sgd_run', PACKAGE = 'sgd', dataset, model_control, sgd_control)
6 | }
7 | 

--------------------------------------------------------------------------------
/R/coef.sgd.R:
--------------------------------------------------------------------------------
 1 | #' Extract Model Coefficients
 2 | #'
 3 | #' Extract model coefficients from \code{sgd} objects. \code{coefficients}
 4 | #' is an \emph{alias} for it.
 5 | #'
 6 | #' @param object object of class \code{sgd}.
 7 | #' @param \dots some methods for this generic require additional
 8 | #'   arguments. None are used in this method.
 9 | #'
10 | #' @return
11 | #' Coefficients extracted from the model object \code{object}.
12 | #'
13 | #' @export
14 | coef.sgd <- function(object, ...) {
15 |   return(as.vector(object$coefficients))
16 | }

--------------------------------------------------------------------------------
/R/data-winequality.R:
--------------------------------------------------------------------------------
 1 | #' Wine quality data of white wine samples from Portugal
 2 | #'
 3 | #' This dataset is a collection of white "Vinho Verde" wine
 4 | #' samples from the north of Portugal. Due to privacy and logistic
 5 | #' issues, only physicochemical (inputs) and sensory (the output)
 6 | #' variables are available (e.g. there is no data about grape types,
 7 | #' wine brand, wine selling price, etc.).
 8 | #'
 9 | #' @format A data frame with 4898 rows and 12 variables
10 | #' \itemize{
11 | #'   \item fixed acidity.
12 | #'   \item volatile acidity.
13 | #'   \item citric acid.
14 | #'   \item residual sugar.
15 | #'   \item chlorides.
16 | #'   \item free sulfur dioxide.
17 | #'   \item total sulfur dioxide.
18 | #'   \item density.
19 | #'   \item pH.
20 | #'   \item sulphates.
21 | #'   \item alcohol.
22 | #'   \item quality (score between 0 and 10).
23 | #' }
24 | #' @source \url{https://archive.ics.uci.edu/ml/datasets/Wine+Quality}
25 | "winequality"

--------------------------------------------------------------------------------
/R/fitted.sgd.R:
--------------------------------------------------------------------------------
 1 | #' Extract Model Fitted Values
 2 | #'
 3 | #' Extract fitted values from \code{sgd} objects.
 4 | #' \code{fitted.values} is an \emph{alias} for it.
 5 | #'
 6 | #' @param object object of class \code{sgd}.
 7 | #' @param \dots some methods for this generic require additional
 8 | #'   arguments. None are used in this method.
 9 | #'
10 | #' @return
11 | #' Fitted values extracted from the object \code{object}.
12 | #'
13 | #' @export
14 | fitted.sgd <- function(object, ...) {
15 |   return(object$fitted.values)
16 | }

--------------------------------------------------------------------------------
/R/plot.sgd.R:
--------------------------------------------------------------------------------
 1 | #' Plot objects of class \code{sgd}.
 2 | #'
 3 | #' @param x object of class \code{sgd}.
 4 | #' @param \dots additional arguments used for each type of plot. See
 5 | #'   \sQuote{Details}.
 6 | #' @param type character specifying the type of plot: \code{"mse"},
 7 | #'   \code{"clf"}, \code{"mse-param"}. See \sQuote{Details}. Default is
 8 | #'   \code{"mse"}.
 9 | #' @param xaxis character specifying the x-axis of plot: \code{"iteration"}
10 | #'   plots the y values over the log-iteration of the algorithm;
11 | #'   \code{"runtime"} plots the y values over the time in seconds to reach them.
12 | #'   Default is \code{"iteration"}.
13 | #'
14 | #' @details
15 | #' Types of plots available:
16 | #' \describe{
17 | #'   \item{\code{mse}}{Mean squared error in predictions, which takes the
18 | #'     following arguments:
19 | #'     \describe{
20 | #'       \item{\code{x_test}}{test set}
21 | #'       \item{\code{y_test}}{test responses to compare predictions to}
22 | #'     }}
23 | #'   \item{\code{clf}}{Classification error in predictions, which takes the
24 | #'     following arguments:
25 | #'     \describe{
26 | #'       \item{\code{x_test}}{test set}
27 | #'       \item{\code{y_test}}{test responses to compare predictions to}
28 | #'     }}
29 | #'   \item{\code{mse-param}}{Mean squared error in parameters, which takes the
30 | #'     following arguments:
31 | #'     \describe{
32 | #'       \item{\code{true_param}}{true vector of parameters to compare to}
33 | #'     }}
34 | #' }
35 | #'
36 | #' @export
37 | plot.sgd <- function(x, ..., type="mse", xaxis="iteration") {
38 |   plot <- choose_plot(type, xaxis)
39 |   return(plot(x, ...))
40 | }
41 | 
42 | #' @export
43 | #' @rdname plot.sgd
44 | plot.list <- function(x, ..., type="mse", xaxis="iteration") {
45 |   plot <- choose_plot(type, xaxis)
46 |   return(plot(x, ...))
47 | }
48 | 
49 | ################################################################################
50 | # Helper functions
51 | ################################################################################
52 | 
53 | choose_plot <- function(type, xaxis) {
54 |   if (type == "mse") {
55 |     if (xaxis == "iteration") {
56 |       return(plot_mse)
57 |     } else if (xaxis == "runtime") {
58 |       return(function(x, ...) plot_mse(x, ..., xaxis="Runtime (s)"))
 59 |     }
 60 |   } else if (type == "mse-param") {
 61 |     if (xaxis == "iteration") {
 62 |       return(plot_mse_param)
 63 |     } else if (xaxis == "runtime") {
 64 |       return(function(x, ...) plot_mse_param(x, ..., xaxis="Runtime (s)"))
 65 |     }
 66 |   } else if (type == "clf") {
 67 |     if (xaxis == "iteration") {
 68 |       return(plot_clf)
 69 |     } else if (xaxis == "runtime") {
 70 |       return(function(x, ...) plot_clf(x, ..., xaxis="Runtime (s)"))
 71 |     }
 72 |   }
 73 |   stop("'type' not recognized")
 74 | }
 75 | 
 76 | get_mse <- function(x, x_test, y_test) {
 77 |   mu <- predict_all(x, x_test)
 78 |   nests <- ncol(mu)
 79 |   mse <- rep(NA, nests)
 80 |   for (j in 1:nests) {
 81 |     mse[j] <- mean((mu[, j] - y_test)^2)
 82 |   }
 83 |   return(mse)
 84 | }
 85 | 
 86 | get_mse_param <- function(x, true_param) {
 87 |   nests <- ncol(x$estimates)
 88 |   mse <- rep(NA, nests)
 89 |   for (j in 1:nests) {
 90 |     mse[j] <- mean((x$estimates[, j] - true_param)^2)
 91 |   }
 92 |   return(mse)
 93 | }
 94 | 
 95 | plot_mse <- function(x, x_test, y_test, label="sgd", xaxis="log-Iteration") {
 96 |   if (toString(class(x)) != "list") {
 97 |     x <- list(x); names(x) <- label
 98 |   }
 99 |   dat <- data.frame()
100 |   for (i in 1:length(x)) {
101 |     mse <- get_mse(x[[i]], x_test, y_test)
102 |     temp_dat <- data.frame(y=mse,
103 |                            label=names(x)[i])
104 |     if (xaxis == "log-Iteration") {
105 |       temp_dat$x <- x[[i]]$pos
106 |     } else if (xaxis == "Runtime (s)") {
107 |       temp_dat$x <- x[[i]]$time
108 |     }
109 |     temp_dat <- temp_dat[!duplicated(temp_dat$x), ]
110 |     dat <- rbind(dat, temp_dat)
111 |   }
112 |   dat$label <- as.factor(dat$label)
113 | 
114 |   p <- generic_plot(dat, xaxis) +
115 |     ggplot2::scale_y_log10() +
116 |     ggplot2::labs(
117 |       title="Mean Squared Error",
118 |       x=xaxis,
119 |       y="")
120 |   return(p)
121 | }
122 | 
123 | plot_mse_param <- function(x, true_param, label="sgd", xaxis="log-Iteration") {
124 |   if (toString(class(x)) != "list") {
125 |     x <- list(x)
126 |     names(x) <- label
127 |   }
128 |   dat <- data.frame()
129 |   for (i in 1:length(x)) {
130 |     mse <- get_mse_param(x[[i]], true_param)
131 |     temp_dat <- data.frame(y=mse,
132 |                            label=names(x)[i])
133 |     if (xaxis == "log-Iteration") {
134 |       temp_dat$x <- x[[i]]$pos
135 |     } else if (xaxis == "Runtime (s)") {
136 |       temp_dat$x <- x[[i]]$time
137 |     }
138 |     temp_dat <- temp_dat[!duplicated(temp_dat$x), ]
139 |     dat <- rbind(dat, temp_dat)
140 |   }
141 |   dat$label <- as.factor(dat$label)
142 | 
143 |   p <- generic_plot(dat, xaxis) +
144 |     ggplot2::scale_y_continuous(
145 |       breaks=5 * 1:min((max(dat$y)/5), 100)) +
146 |     ggplot2::labs(
147 |       title="Mean Squared Error",
148 |       x=xaxis,
149 |       y="")
150 |   return(p)
151 | }
152 | 
153 | plot_clf <- function(x, x_test, y_test, label="sgd", xaxis="log-Iteration") {
154 |   if (toString(class(x)) != "list") {
155 |     x <- list(x)
156 |     names(x) <- label
157 |   }
158 |   dat <- data.frame()
159 |   for (i in 1:length(x)) {
160 |     pred <- predict_all(x[[i]], x_test)
161 |     pred <- (pred > 0.5) * 1
162 |     error <- colSums(pred != y_test) / nrow(pred)  # misclassification rate per stored estimate
163 |     temp_dat <- data.frame(y=error,
164 |                            label=names(x)[i])
165 |     if (xaxis == "log-Iteration") {
166 |       temp_dat$x <- x[[i]]$pos
167 |     } else if (xaxis == "Runtime (s)") {
168 |       temp_dat$x <- x[[i]]$time
169 |     }
170 |     temp_dat <- temp_dat[!duplicated(temp_dat$x), ]
171 |     dat <- rbind(dat, temp_dat)
172 |   }
173 |   dat$label <- as.factor(dat$label)
174 | 
175 |   p <- generic_plot(dat, xaxis) +
176 |     ggplot2::scale_y_continuous(
177 |       #limits=c(max(0, mean(dat$y)-2.5*sd(dat$y)),
178 |       #         min(1, mean(dat$y)+2*sd(dat$y))),
179 |       breaks=seq(0.05, 1, 0.05)) +
180 |     ggplot2::labs(
181 |       title="Classification Error",
182 |       x=xaxis,
183 |       y="")
184 |   return(p)
185 | }
186 | 
187 | generic_plot <- function(dat, xaxis) {
188 |   x <- NULL
189 |   y <- NULL
190 |   label <- NULL
191 |   p <- ggplot2::ggplot(dat, ggplot2::aes(x=x, y=y, group=label)) +
192 |     ggplot2::geom_line(ggplot2::aes(linetype=label, color=label)) +
193 |     ggplot2::theme(
194 |       panel.background=ggplot2::element_blank(),
195 |       panel.border=ggplot2::element_blank(),
196 |       panel.grid.major=ggplot2::element_blank(),
197 |       panel.grid.minor=ggplot2::element_blank(),
198 |       axis.line=ggplot2::element_line(color="black"),
199 |       legend.position=c(1, 1),
200 |       legend.justification = c(1, 1),
201 |       legend.title=ggplot2::element_blank(),
202 |       legend.key=ggplot2::element_blank(),
203 |       legend.background=ggplot2::element_rect(linetype="solid", color="black")
204 |     ) +
205 |     ggplot2::scale_fill_hue(l=50)
206 |   if (xaxis == "log-Iteration") {
207 |     p <- p +
208 |       ggplot2::scale_x_log10(
209 |         breaks=10^(1:log(max(dat$x), base=10)))
210 |   }
211 |   return(p)
212 | }

--------------------------------------------------------------------------------
/R/predict.sgd.R:
--------------------------------------------------------------------------------
 1 | #' Model Predictions
 2 | #'
 3 | #' Form predictions using the estimated model parameters from stochastic
 4 | #' gradient descent.
 5 | #'
 6 | #' @param object object of class \code{sgd}.
 7 | #' @param newdata design matrix to form predictions on
 8 | #' @param type the type of prediction required. The default "link" is
 9 | #'   on the scale of the linear predictors; the alternative '"response"'
10 | #'   is on the scale of the response variable. Thus for a default
11 | #'   binomial model the default predictions are of log-odds
12 | #'   (probabilities on logit scale) and 'type = "response"' gives the
13 | #'   predicted probabilities. The '"term"' option returns a matrix
14 | #'   giving the fitted values of each term in the model formula on the
15 | #'   linear predictor scale.
16 | #' @param \dots further arguments passed to or from other methods.
17 | #'
18 | #' @details
19 | #' A column of 1's must be included in \code{newdata} if the
20 | #' parameters include a bias (intercept) term.
21 | #'
22 | #' @export
23 | predict.sgd <- function(object, newdata, type="link", ...) {
24 |   if (!(object$model %in% c("lm", "glm", "m"))) {
25 |     stop("'model' not supported")
26 |   }
27 |   if (!(type %in% c("link", "response", "term"))) {
28 |     stop("'type' not recognized")
29 |   }
30 | 
31 |   if (object$model %in% c("lm", "glm")) {
32 |     if (type %in% c("link", "response")) {
33 |       eta <- newdata %*% coef(object)
34 |       if (type == "response") {
35 |         y <- object$model.out$family$linkinv(eta)
36 |         return(y)
37 |       }
38 |       return(eta)
39 |     }
40 |     eta <- newdata %*% diag(coef(object))
41 |     return(eta)
42 |   } else if (object$model == "m") {
43 |     if (type %in% c("link", "response")) {
44 |       eta <- newdata %*% coef(object)
45 |       if (type == "response") {
46 |         y <- eta
47 |         return(y)
48 |       }
49 |       return(eta)
50 |     }
51 |     eta <- newdata %*% diag(coef(object))
52 |     return(eta)
53 |   }
54 | }
55 | 
56 | #' @export
57 | #' @rdname predict.sgd
58 | predict_all <- function(object, newdata, ...) {
59 |   if (object$model %in% c("lm", "glm")) {
60 |     eta <- newdata %*% object$estimates
61 |     y <- object$model.out$family$linkinv(eta)
62 |   } else if (object$model == "m") {
63 |     eta <- newdata %*% object$estimates
64 |     y <- eta
65 |     # TODO
66 |   } else {
67 |     stop("'model' not recognized")
68 |   }
69 |   return(y)
70 | }

--------------------------------------------------------------------------------
/R/print.sgd.R:
--------------------------------------------------------------------------------
1 | #' Print objects of class \code{sgd}.
2 | #'
3 | #' @param x object of class \code{sgd}.
4 | #' @param \dots further arguments passed to or from other methods.
5 | #'
6 | #' @export
7 | print.sgd <- function(x, ...) {
8 |   print(coef(x), ...)
9 | }

--------------------------------------------------------------------------------
/R/residuals.sgd.R:
--------------------------------------------------------------------------------
 1 | #' Extract Model Residuals
 2 | #'
 3 | #' Extract model residuals from \code{sgd} objects. \code{resid} is an
 4 | #' \emph{alias} for it.
 5 | #'
 6 | #' @param object object of class \code{sgd}.
 7 | #' @param \dots some methods for this generic require additional
 8 | #'   arguments. None are used in this method.
 9 | #'
10 | #' @return
11 | #' Residuals extracted from the object \code{object}.
12 | #'
13 | #' @export
14 | residuals.sgd <- function(object, ...) {
15 |   return(object$residuals)
16 | }

--------------------------------------------------------------------------------
/R/sgd.R:
--------------------------------------------------------------------------------
 1 | #' Stochastic gradient descent
 2 | #'
 3 | #' Run stochastic gradient descent in order to optimize the induced loss
 4 | #' function given a model and data.
 5 | #'
 6 | #' @param formula an object of class \code{"\link{formula}"} (or one that can be
 7 | #'   coerced to that class): a symbolic description of the model to be fitted.
 8 | #'   The details can be found in \code{"\link{glm}"}.
 9 | #' @param data an optional data frame, list or environment (or object coercible
10 | #'   by \code{\link[base]{as.data.frame}} to a data frame) containing the
11 | #'   variables in the model. If not found in data, the variables are taken from
12 | #'   environment(formula), typically the environment from which \code{sgd} is called.
13 | #' @param model character specifying the model to be used: \code{"lm"} (linear
14 | #'   model), \code{"glm"} (generalized linear model), \code{"cox"} (Cox
15 | #'   proportional hazards model), \code{"gmm"} (generalized method of moments),
16 | #'   \code{"m"} (M-estimation). See \sQuote{Details}.
17 | #' @param model.control a list of parameters for controlling the model.
18 | #'   \describe{
19 | #'     \item{\code{family} (\code{"glm"})}{a description of the error distribution and
20 | #'       link function to be used in the model. This can be a character string
21 | #'       naming a family function, a family function or the result of a call to
22 | #'       a family function. (See \code{\link[stats]{family}} for details of
23 | #'       family functions.)}
24 | #'     \item{\code{rank} (\code{"glm"})}{logical. Should the rank of the design matrix
25 | #'       be checked?}
26 | #'     \item{\code{fn} (\code{"gmm"})}{a function \eqn{g(\theta,x)} which returns a
27 | #'       \eqn{k}-vector corresponding to the \eqn{k} moment conditions. It is a
28 | #'       required argument if \code{gr} not specified.}
29 | #'     \item{\code{gr} (\code{"gmm"})}{a function to return the gradient. If
30 | #'       unspecified, a finite-difference approximation will be used.}
31 | #'     \item{\code{nparams} (\code{"gmm"})}{number of model parameters. This is
32 | #'       automatically determined for other models.}
33 | #'     \item{\code{type} (\code{"gmm"})}{character specifying the generalized method of
34 | #'       moments procedure: \code{"twostep"} (Hansen, 1982), \code{"iterative"}
35 | #'       (Hansen et al., 1996). Defaults to \code{"iterative"}.}
36 | #'     \item{\code{wmatrix} (\code{"gmm"})}{weighting matrix to be used in the loss
37 | #'       function. Defaults to the identity matrix.}
38 | #'     \item{\code{loss} (\code{"m"})}{character specifying the loss function to be
39 | #'       used in the estimating equation. Default is the Huber loss.}
40 | #'     \item{\code{lambda1}}{L1 regularization parameter. Default is 0.}
41 | #'     \item{\code{lambda2}}{L2 regularization parameter. Default is 0.}
42 | #'   }
43 | #' @param sgd.control an optional list of parameters for controlling the estimation.
44 | #'   \describe{
45 | #'     \item{\code{method}}{character specifying the method to be used: \code{"sgd"},
46 | #'       \code{"implicit"}, \code{"asgd"}, \code{"ai-sgd"}, \code{"momentum"},
47 | #'       \code{"nesterov"}. Default is \code{"ai-sgd"}. See \sQuote{Details}.}
48 | #'     \item{\code{lr}}{character specifying the learning rate to be used:
49 | #'       \code{"one-dim"}, \code{"one-dim-eigen"}, \code{"d-dim"},
50 | #'       \code{"adagrad"}, \code{"rmsprop"}. Default is \code{"one-dim"}.
51 | #'       See \sQuote{Details}.}
52 | #'     \item{\code{lr.control}}{vector of scalar hyperparameters one can
53 | #'       set dependent on the learning rate. For hyperparameters aimed
54 | #'       to be left as default, specify \code{NA} in the corresponding
55 | #'       entries. See \sQuote{Details}.}
56 | #'     \item{\code{start}}{starting values for the parameter estimates. Default is
57 | #'       random initialization around zero.}
58 | #'     \item{\code{size}}{number of SGD estimates to store for diagnostic purposes
59 | #'       (distributed log-uniformly over total number of iterations)}
60 | #'     \item{\code{reltol}}{relative convergence tolerance. The algorithm stops
61 | #'       if it is unable to change the relative mean squared difference in the
62 | #'       parameters by more than this amount. Default is \code{1e-05}.}
63 | #'     \item{\code{npasses}}{the maximum number of passes over the data. Default
64 | #'       is 3.}
65 | #'     \item{\code{pass}}{logical. Should \code{reltol} be ignored and the
66 | #'       algorithm run for all of \code{npasses}?}
67 | #'     \item{\code{shuffle}}{logical. Should the algorithm shuffle the data set,
68 | #'       including before each pass?}
69 | #'     \item{\code{verbose}}{logical. Should the algorithm print progress?}
 70 | #'   }
 71 | #' @param \dots arguments to be used to form the default \code{sgd.control}
 72 | #'   arguments if it is not supplied directly.
 73 | #' @param x,y a design matrix and the respective vector of outcomes.
 74 | #'
 75 | #' @details
 76 | #' Models:
 77 | #' The Cox model assumes that the survival data is ordered when passed
 78 | #' in, i.e., such that the risk set of an observation i is all data points after
 79 | #' it.
 80 | #'
 81 | #' Methods:
 82 | #' \describe{
 83 | #'   \item{\code{sgd}}{stochastic gradient descent (Robbins and Monro, 1951)}
 84 | #'   \item{\code{implicit}}{implicit stochastic gradient descent (Toulis et al.,
 85 | #'     2014)}
 86 | #'   \item{\code{asgd}}{stochastic gradient with averaging (Polyak and Juditsky,
 87 | #'     1992)}
 88 | #'   \item{\code{ai-sgd}}{implicit stochastic gradient with averaging (Toulis et
 89 | #'     al., 2015)}
 90 | #'   \item{\code{momentum}}{"classical" momentum (Polyak, 1964)}
 91 | #'   \item{\code{nesterov}}{Nesterov's accelerated gradient (Nesterov, 1983)}
 92 | #' }
 93 | #'
 94 | #' Learning rates and hyperparameters:
 95 | #' \describe{
 96 | #'   \item{\code{one-dim}}{scalar value prescribed in Xu (2011) as
 97 | #'     \deqn{a_n = scale * gamma * (1 + alpha*gamma*n)^(-c)}
 98 | #'     where the defaults are
 99 | #'     \code{lr.control = (scale=1, gamma=1, alpha=1, c)}
100 | #'     where \code{c} is \code{1} if implemented without averaging,
101 | #'     \code{2/3} if with averaging}
102 | #'   \item{\code{one-dim-eigen}}{diagonal matrix
103 | #'     \code{lr.control = NULL}}
104 | #'   \item{\code{d-dim}}{diagonal matrix
105 | #'     \code{lr.control = (epsilon=1e-6)}}
106 | #'   \item{\code{adagrad}}{diagonal matrix prescribed in Duchi et al. (2011) as
107 | #'     \code{lr.control = (eta=1, epsilon=1e-6)}}
108 | #'   \item{\code{rmsprop}}{diagonal matrix prescribed in Tieleman and Hinton
109 | #'     (2012) as
110 | #'     \code{lr.control = (eta=1, gamma=0.9, epsilon=1e-6)}}
111 | #' }
112 | #'
113 | #' @return
114 | #' An object of class \code{"sgd"}, which is a list containing the following
115 | #' components:
116 | #'   \item{model}{name of the model}
117 | #'   \item{coefficients}{a named vector of coefficients}
118 | #'   \item{converged}{logical. Was the algorithm judged to have converged?}
119 | #'   \item{estimates}{estimates from algorithm stored at each iteration
120 | #'     specified in \code{pos}}
121 | #'   \item{fitted.values}{the fitted mean values}
122 | #'   \item{pos}{vector of indices specifying the iteration number each estimate
123 | #'     was stored for}
124 | #'   \item{residuals}{the residuals, that is response minus fitted values}
125 | #'   \item{times}{vector of times in seconds it took to complete the number of
126 | #'     iterations specified in \code{pos}}
127 | #'   \item{model.out}{a list of model-specific output attributes}
128 | #'
129 | #' @author Dustin Tran, Tian Lan, Panos Toulis, Ye Kuang, Edoardo Airoldi
130 | #' @references
131 | #' John Duchi, Elad Hazan, and Yoram Singer. Adaptive subgradient methods for
132 | #' online learning and stochastic optimization. \emph{Journal of Machine
133 | #' Learning Research}, 12:2121-2159, 2011.
134 | #'
135 | #' Yurii Nesterov. A method for solving a convex programming problem with
136 | #' convergence rate \eqn{O(1/k^2)}. \emph{Soviet Mathematics Doklady},
137 | #' 27(2):372-376, 1983.
138 | #'
139 | #' Boris T. Polyak. Some methods of speeding up the convergence of iteration
140 | #' methods. \emph{USSR Computational Mathematics and Mathematical Physics},
141 | #' 4(5):1-17, 1964.
142 | #'
143 | #' Boris T. Polyak and Anatoli B. Juditsky. Acceleration of stochastic
144 | #' approximation by averaging. \emph{SIAM Journal on Control and Optimization},
145 | #' 30(4):838-855, 1992.
146 | #'
147 | #' Herbert Robbins and Sutton Monro. A stochastic approximation method.
148 | #' \emph{The Annals of Mathematical Statistics}, pp. 400-407, 1951.
149 | #'
150 | #' Panos Toulis, Jason Rennie, and Edoardo M. Airoldi, "Statistical analysis of
151 | #' stochastic gradient methods for generalized linear models", In
152 | #' \emph{Proceedings of the 31st International Conference on Machine Learning},
153 | #' 2014.
154 | #'
155 | #' Panos Toulis, Dustin Tran, and Edoardo M. Airoldi, "Stability and optimality
156 | #' in stochastic gradient descent", arXiv preprint arXiv:1505.02417, 2015.
157 | #'
158 | #' Wei Xu. Towards optimal one pass large scale learning with averaged
159 | #' stochastic gradient descent. arXiv preprint arXiv:1107.2490, 2011.
160 | #'
161 | #'
162 | #' @examples
163 | #' ## Linear regression
164 | #' set.seed(42)
165 | #' N <- 1e4
166 | #' d <- 5
167 | #' X <- matrix(rnorm(N*d), ncol=d)
168 | #' theta <- rep(5, d+1)
169 | #' eps <- rnorm(N)
170 | #' y <- cbind(1, X) %*% theta + eps
171 | #' dat <- data.frame(y=y, x=X)
172 | #' sgd.theta <- sgd(y ~ ., data=dat, model="lm")
173 | #' sprintf("Mean squared error: %0.3f", mean((theta - as.numeric(sgd.theta$coefficients))^2))
174 | #'
175 | #'
176 | #' @useDynLib sgd
177 | #' @import MASS
178 | #' @importFrom methods new
179 | #' @importFrom Rcpp evalCpp
180 | #' @importFrom stats gaussian is.empty.model model.matrix model.response rnorm coef fitted predict
181 | 
182 | ################################################################################
183 | # Classes
184 | ################################################################################
185 | #' @export
186 | sgd <- function(x, ...) UseMethod("sgd")
187 | 
188 | ################################################################################
189 | # Methods
190 | ################################################################################
191 | 
192 | #' @export
193 | sgd.default <- function(x, ...) {
194 |   stop("class of x is not a formula, function, or matrix")
195 | }
196 | 
197 | #' @export
198 | #' @rdname sgd
199 | sgd.formula <- function(formula, data, model,
200 |                         model.control=list(),
201 |                         sgd.control=list(...),
202 |                         ...) {
203 |   call <- match.call()  # set call function to match on arguments
204 |   # 1. Validity check.
205 |   if (missing(data)) {
206 |     data <- environment(formula)
207 |   }
208 | 
209 |   # 2. Build X and Y according to the formula.
210 |   mf <- match.call(expand.dots=FALSE)
211 |   m <- match(c("formula", "data"), names(mf), 0L)
212 |   mf <- mf[c(1L, m)]
213 |   mf$drop.unused.levels <- TRUE
214 |   mf[[1L]] <- quote(stats::model.frame)
215 |   mf <- eval(mf, parent.frame())
216 | 
217 |   Y <- model.response(mf, "any")
218 |   if (length(dim(Y)) == 1L) {
219 |     nm <- rownames(Y)
220 |     dim(Y) <- NULL
221 |     if (!is.null(nm)) {
222 |       names(Y) <- nm
223 |     }
224 |   }
225 | 
226 |   mt <- attr(mf, "terms")
227 |   if (!is.empty.model(mt)) {
228 |     X <- model.matrix(mt, mf)
229 |   } else {
230 |     X <- matrix(, NROW(Y), 0L)
231 |   }
232 | 
233 |   # 3. Pass into sgd.matrix().
234 |   return(sgd.matrix(X, Y, model, model.control, sgd.control))
235 | }
236 | 
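A usage note on the method defined next (an illustrative sketch by the editor, not part of the package source): the matrix method bypasses formula parsing, so an intercept column must be supplied explicitly in the design matrix.

```r
library(sgd)
set.seed(42)
X <- matrix(rnorm(1e4 * 5), ncol=5)
theta <- rep(5, 6)
y <- cbind(1, X) %*% theta + rnorm(1e4)
# Design-matrix interface: include the column of 1's yourself.
fit <- sgd(cbind(1, X), y, model="lm")
mean((coef(fit) - theta)^2)
```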
237 | #' @export
238 | #' @rdname sgd
239 | sgd.matrix <- function(x, y, model,
240 |                        model.control=list(),
241 |                        sgd.control=list(...),
242 |                        ...) {
243 |   call <- match.call()  # set call function to match on arguments
244 |   if (missing(x)) {
245 |     stop("'x' not specified")
246 |   }
247 |   if (missing(y)) {
248 |     stop("'y' not specified")
249 |   }
250 |   if (missing(model)) {
251 |     stop("'model' not specified")
252 |   }
253 |   if (!is.list(model.control)) {
254 |     stop("'model.control' is not a list")
255 |   }
256 |   model.control <- do.call("valid_model_control",
257 |                            c(model.control, model=model, d=ncol(x)))
258 |   if (!is.list(sgd.control)) {
259 |     stop("'sgd.control' is not a list")
260 |   }
261 |   sgd.control <- do.call("valid_sgd_control",
262 |                          c(sgd.control, N=NROW(y), nparams=model.control$nparams))
263 | 
264 |   return(fit(x, y, model, model.control, sgd.control))
265 | }
266 | 
267 | #' @export
268 | #' @rdname sgd
269 | # TODO y should be allowed to be a big matrix too; it should be any combination
270 | # (x is a big matrix, y is, etc.)
271 | sgd.big.matrix <- function(x, y, model,
272 |                            model.control=list(),
273 |                            sgd.control=list(...),
274 |                            ...) {
275 |   return(sgd.matrix(x, y, model, model.control, sgd.control))
276 | }
277 | 
278 | ################################################################################
279 | # Helper functions
280 | ################################################################################
281 | 
282 | fit <- function(x, y, model,
283 |                 model.control,
284 |                 sgd.control) {
285 |   #time_start <- proc.time()[3] # TODO timer only starts here
286 |   # TODO
287 |   if (model == "gmm") {
288 |     if (sgd.control$method %in% c("implicit", "ai-sgd")) {
289 |       stop("implicit methods not implemented yet")
290 |     }
291 |   }
292 | 
293 |   if (model %in% c("lm", "glm")) {
294 |     model.control$transfer <- transfer_name(model.control$family$link)
295 |     family <- model.control$family
296 |     model.control$family <- family$family
297 |     # Enable logistic regression if response is binary factor.
298 |     if (is.factor(y) && family$family == "binomial") {
299 |       y <- as.integer(as.character(y))
300 |     }
301 |   }
302 | 
303 |   dataset <- list(X=x, Y=as.matrix(y))
304 |   if ('big.matrix' %in% class(x)) {
305 |     dataset$big <- TRUE
306 |     dataset[["bigmat"]] <- x@address
307 |   } else {
308 |     dataset$big <- FALSE
309 |     dataset[["bigmat"]] <- new("externalptr")
310 |   }
311 | 
312 |   if (sgd.control$verbose) {
313 |     print("Completed pre-processing attributes...")
314 |     print("Running C++ algorithm...")
315 |   }
316 |   out <- run(dataset, model.control, sgd.control)
317 |   if (sgd.control$verbose) {
318 |     print("Completed C++ algorithm...")
319 |   }
320 |   if (length(out) == 0) {
321 |     stop("An error has occurred, program stopped")
322 |   }
323 |   class(out) <- "sgd"
324 |   if (model %in% c("lm", "glm")) {
325 |     out$model.out$transfer <- model.control$transfer
326 |     out$model.out$family <- family
327 |   }
328 |   out$pos <- as.vector(out$pos)
329 |   #out$times <- as.vector(out$times) + (proc.time()[3] - time_start) # C++ time + R time
330 |   out$times <- as.vector(out$times)
331 |   out$fitted.values <- predict(out, x, type="response")
332 |   out$residuals <- y - fitted(out)
333 |   return(out)
334 | }
335 | 
336 | valid_model_control <- function(model, model.control=list(...), ...) {
337 |   # Run validity check of arguments passed to model.control given model. It
338 |   # passes defaults to those unspecified and converts to the correct type if
339 |   # possible; otherwise it errors.
340 |   # Check validity of regularization parameters.
341 |   lambda1 <- model.control$lambda1
342 |   if (is.null(lambda1)) {
343 |     lambda1 <- 0
344 |   } else if (!is.numeric(lambda1)) {
345 |     stop("'lambda1' must be numeric")
346 |   } else if (length(lambda1) != 1) {
347 |     stop(gettextf("length of 'lambda1' should equal %d", 1), domain=NA)
348 |   }
349 |   lambda2 <- model.control$lambda2
350 |   if (is.null(lambda2)) {
351 |     lambda2 <- 0
352 |   } else if (!is.numeric(lambda2)) {
353 |     stop("'lambda2' must be numeric")
354 |   } else if (length(lambda2) != 1) {
355 |     stop(gettextf("length of 'lambda2' should equal %d", 1), domain=NA)
356 |   }
357 |   nparams <- model.control$d
358 |   # Set family to gaussian for linear model.
359 |   if (model == "lm") {
360 |     model.control$family <- gaussian()
361 |   }
362 |   if (model %in% c("lm", "glm")) {
363 |     control.family <- model.control$family
364 |     control.rank <- model.control$rank
365 |     control.trace <- model.control$trace
366 |     control.deviance <- model.control$deviance
367 |     # Check validity of family.
368 |     if (is.null(control.family)) {
369 |       control.family <- gaussian()
370 |     } else if (is.character(control.family)) {
371 |       control.family <- get(control.family, mode="function", envir=parent.frame())()
372 |     } else if (is.function(control.family)) {
373 |       control.family <- control.family()
374 |     } else if (is.null(control.family$family)) {
375 |       print(control.family)
376 |       stop("'family' not recognized")
377 |     }
378 |     # Check validity of rank.
379 |     if (is.null(control.rank)) {
380 |       control.rank <- FALSE
381 |     } else if (!is.logical(control.rank)) {
382 |       stop("'rank' not logical")
383 |     }
384 |     # Check validity of trace.
385 |     if (is.null(control.trace)) {
386 |       control.trace <- FALSE
387 |     } else if (!is.logical(control.trace)) {
388 |       stop("'trace' not logical")
389 |     }
390 |     # Check validity of deviance.
391 |     if (is.null(control.deviance)) {
392 |       control.deviance <- FALSE
393 |     } else if (!is.logical(control.deviance)) {
394 |       stop("'deviance' not logical")
395 |     }
396 |     return(list(
397 |       name=model,
398 |       family=control.family,
399 |       rank=control.rank,
400 |       trace=control.trace,
401 |       deviance=control.deviance,
402 |       nparams=nparams,
403 |       lambda1=lambda1,
404 |       lambda2=lambda2))
405 |   } else if (model == "cox") {
406 |     return(list(
407 |       name=model,
408 |       nparams=nparams,
409 |       lambda1=lambda1,
410 |       lambda2=lambda2))
411 |   } else if (model == "gmm") {
412 |     control.fn <- model.control$fn
413 |     control.gr <- model.control$gr
414 |     control.nparams <- model.control$nparams
415 |     control.type <- model.control$type
416 |     control.wmatrix <- model.control$wmatrix
417 |     # Check validity of moment function and its gradient.
418 |     if (is.null(control.fn) && is.null(control.gr)) {
419 |       stop("either 'fn' or 'gr' must be specified")
420 |     } else if (!is.null(control.fn) && !is.function(control.fn)) {
421 |       stop("'fn' not a function")
422 |     } else if (!is.null(control.gr) && !is.function(control.gr)) {
423 |       stop("'gr' not a function")
424 |     } else if (!is.null(control.fn) && is.null(control.gr)) {
425 |       # Default to numerical gradient via central differences.
426 |       #library(numDeriv)
427 |       # TODO probably does not work
428 |       control.gr <- function(x, fn=control.fn) {
429 |         d <- length(x)
430 |         h <- 1e-5
431 |         out <- rep(0, d)
432 |         for (i in 1:d) {
433 |           ei <- c(rep(0, i-1), h, rep(0, d-i))
434 |           out[i] <- (fn(x + ei) - fn(x - ei)) / (2*h)
435 |         }
436 |         return(out)
437 |       }
438 |     }
439 |     # Check validity of nparams.
440 |     if (is.null(control.nparams)) {
441 |       stop("'nparams' not specified")
442 |     } else if (!is.numeric(control.nparams) ||
443 |                control.nparams - as.integer(control.nparams) != 0 ||
444 |                control.nparams < 1) {
445 |       stop("'nparams' must be a positive integer")
446 |     }
447 |     # Check validity of GMM type.
448 |     if (is.null(control.type)) {
449 |       control.type <- "iterative"
450 |     } else if (!is.character(control.type)) {
451 |       stop("'type' must be a string")
452 |     # TODO implement cuee
453 |     } else if (!(control.type %in% c("twostep", "iterative", "cuee"))) {
454 |       stop("'type' not recognized")
455 |     }
456 |     # Check validity of weighting matrix.
457 |     if (is.null(control.wmatrix)) {
458 |       # do nothing, since will not store large matrix in R but in C++
459 |     } else if (!is.matrix(control.wmatrix)) {
460 |       stop("'wmatrix' not a matrix")
461 |     # TODO check if dimensions are same as moment conditions
462 |     #} else if (identical(dim(control.wmatrix), c(k,k))) {
463 |     }
464 |     return(list(
465 |       name=model,
466 |       gr=control.gr,
467 |       type=control.type,
468 |       nparams=control.nparams,
469 |       lambda1=lambda1,
470 |       lambda2=lambda2))
471 |   } else if (model == "m") {
472 |     control.loss <- model.control$loss
473 |     if (is.null(control.loss)) {
474 |       control.loss <- "huber"
475 |     } else if (!is.character(control.loss)) {
476 |       stop("'model.control$loss' must be a string")
477 |     } else if (control.loss != "huber") {
478 |       stop("'loss' not available")
479 |     }
480 |     return(list(
481 |       name=model,
482 |       loss=control.loss,
483 |       nparams=nparams,
484 |       lambda1=lambda1,
485 |       lambda2=lambda2))
486 |   } else {
487 |     stop("'model' not recognized")
488 |   }
489 | }
490 | 
491 | valid_sgd_control <- function(method="ai-sgd", lr="one-dim",
492 |                               lr.control=NULL,
493 |                               start=rnorm(nparams, mean=0, sd=1e-5),
494 |                               size=100,
495 |                               reltol=1e-5, npasses=3, pass=F,
496 |                               shuffle=F, verbose=F,
497 |                               truth=NULL, check=F,
498 |                               N, nparams, ...) {
499 |   # The following are internal parameters that can be used but aren't written in
500 |   # the documentation for succinctness:
501 |   #   check: logical, specifying whether to check against \code{truth} for
502 |   #     convergence instead of using reltol
503 |   #   truth: true set of parameters
504 |   # TODO size isn't the correct thing since reltol means you don't know when it
505 |   # ends. user should specify how often to store the iterates (how many per
506 |   # iteration)
507 |   # Run validity check of arguments passed to sgd.control. It passes defaults to
508 |   # those unspecified and converts to the correct type if possible; otherwise it
509 |   # errors.
510 |   # Check validity of method.
511 |   if (!is.character(method)) {
512 |     stop("'method' must be a string")
513 |   } else if (!(method %in% c("sgd", "implicit", "asgd", "ai-sgd", "momentum",
514 |                              "nesterov"))) {
515 |     stop("'method' not recognized")
516 |   }
517 | 
518 |   # Check validity of learning rate.
519 |   lrs <- c("one-dim", "one-dim-eigen", "d-dim", "adagrad", "rmsprop")
520 |   if (is.numeric(lr)) {
521 |     if (lr < 1 | lr > length(lrs)) {
522 |       stop("'lr' out of range")
523 |     }
524 |     lr <- lrs[lr]
525 |   } else if (is.character(lr)) {
526 |     lr <- tolower(lr)
527 |     if (!(lr %in% lrs)) {
528 |       stop("'lr' not recognized")
529 |     }
530 |   } else {
531 |     stop("invalid 'lr'")
532 |   }
533 | 
534 |   # Check validity of lr.control.
535 |   if (!is.null(lr.control) && !is.numeric(lr.control)) {
536 |     stop("'lr.control' must be numeric")
537 |   } else if (lr == "one-dim") {
538 |     if (method %in% c("asgd", "ai-sgd")) {
539 |       c <- 2/3
540 |     } else {
541 |       c <- 1
542 |     }
543 |     defaults <- c(1, 1, 1, c)
544 |     if (is.null(lr.control)) {
545 |       lr.control <- defaults
546 |     } else if (length(lr.control) != 4) {
547 |       stop(gettextf("length of 'lr.control' should equal %d", 4), domain=NA)
548 |     }
549 |     missing <- which(is.na(lr.control))
550 |     lr.control[missing] <- defaults[missing]
551 |   } else if (lr == "one-dim-eigen") {
552 |     if (is.null(lr.control)) {
553 |       lr.control <- 0  # garbage number to store double in C++
554 |     } else if (length(lr.control) != 0) {
555 |       stop(gettextf("length of 'lr.control' should equal %d", 0), domain=NA)
556 |     }
557 |   } else if (lr == "d-dim") {
558 |     defaults <- 1e-6
559 |     if (is.null(lr.control)) {
560 |       lr.control <- defaults
561 |     } else if (length(lr.control) != 1) {
562 |       stop(gettextf("length of 'lr.control' should equal %d", 1), domain=NA)
563 |     }
564 |     missing <- which(is.na(lr.control))
565 |     lr.control[missing] <- defaults[missing]
566 |   } else if (lr == "adagrad") {
567 |     defaults <- c(1, 1e-6)
568 |     if (is.null(lr.control)) {
569 |       lr.control <- defaults
570 |     } else if (length(lr.control) != 2) {
571 |       stop(gettextf("length of 'lr.control' should equal %d", 2), domain=NA)
572 |     }
573 |     missing <- which(is.na(lr.control))
574 |     lr.control[missing] <- defaults[missing]
575 |   } else if (lr == "rmsprop") {
576 |     defaults <- c(1, 0.9, 1e-6)
577 |     if (is.null(lr.control)) {
578 |       lr.control <- defaults
579 |     } else if (length(lr.control) != 3) {
580 |       stop(gettextf("length of 'lr.control' should equal %d", 3), domain=NA)
581 |     }
582 |     missing <- which(is.na(lr.control))
583 |     lr.control[missing] <- defaults[missing]
584 |   }
585 | 
586 |   # Check validity of start.
587 |   if (!is.numeric(start)) {
588 |     stop("'start' must be numeric")
589 |   } else if (length(start) != nparams) {
590 |     stop(gettextf("length of 'start' should equal %d", nparams), domain=NA)
591 |   }
592 | 
593 |   # Check validity of size.
594 |   if (!is.numeric(size) || size - as.integer(size) != 0 || size < 1) {
595 |     stop("'size' must be positive integer")
596 |   }
597 | 
598 |   # Check validity of reltol.
599 |   if (!is.numeric(reltol)) {
600 |     stop("'reltol' must be numeric")
601 |   } else if (length(reltol) != 1) {
602 |     stop("'reltol' must be scalar")
603 |   }
604 | 
605 |   # Check validity of npasses.
606 |   if (!is.numeric(npasses) || npasses - as.integer(npasses) != 0 || npasses < 1) {
607 |     stop("'npasses' must be positive integer")
608 |   }
609 | 
610 |   # Check validity of pass.
611 |   if (!is.logical(pass)) {
612 |     stop("'pass' must be logical")
613 |   }
614 | 
615 |   # Check validity of shuffle.
616 |   if (!is.logical(shuffle)) {
617 |     stop("'shuffle' must be logical")
618 |   }
619 | 
620 |   # Check validity of verbose.
621 |   if (!is.logical(verbose)) {
622 |     stop("'verbose' must be logical")
623 |   }
624 | 
625 |   # Check validity of additional arguments if the method is implicit.
626 |   if (method %in% c("implicit", "ai-sgd")) {
627 |     call <- match.call()
628 |     implicit.control <- do.call("valid_implicit_control", list(...))
629 |   } else {
630 |     implicit.control <- NULL
631 |   }
632 | 
633 |   # TODO they should be vectors in C++, not requiring conversion
634 |   start <- as.matrix(start)
635 |   if (check) {
636 |     truth <- as.matrix(truth)
637 |   }
638 | 
639 |   return(c(list(method=method,
640 |                 lr=lr,
641 |                 lr.control=lr.control,
642 |                 start=start,
643 |                 size=size,
644 |                 reltol=reltol,
645 |                 npasses=npasses,
646 |                 pass=pass,
647 |                 shuffle=shuffle,
648 |                 verbose=verbose,
649 |                 check=check,
650 |                 truth=truth,
651 |                 nparams=nparams),
652 |            implicit.control))
653 | }
654 | 
655 | valid_implicit_control <- function(delta=30L, ...) {
656 |   # Maintain control parameters for running implicit SGD. Pass defaults
657 |   # if unspecified.
658 |   #
659 |   # Args:
660 |   #   delta: convergence criterion for the one-dimensional optimization
661 |   args <- list(...)
662 |   if (!is.null(names(args))) {
663 |     stop("Invalid args passed into sgd.control through dots")
664 |   }
665 |   if (!is.numeric(delta) || delta - as.integer(delta) != 0 || delta <= 0) {
666 |     stop("value of 'delta' must be integer > 0")
667 |   }
668 |   return(list(delta=delta))
669 | }
670 | 
671 | transfer_name <- function(link.name) {
672 |   if (!is.character(link.name)) {
673 |     stop("link name must be a string")
674 |   }
675 |   link.names <- c("identity", "log", "logit", "inverse")
676 |   transfer.names <- c("identity", "exp", "logistic", "inverse")
677 |   transfer.idx <- which(link.names == link.name)
678 |   if (length(transfer.idx) == 0) {
679 |     stop("no matching link function found")
680 |   }
681 |   return(transfer.names[transfer.idx])
682 | }

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # sgd
 2 | 
 3 | sgd is an R package for large
 4 | scale estimation. It features many stochastic gradient methods, built-in models,
 5 | visualization tools, automated hyperparameter tuning, model checking, interval
 6 | estimation, and convergence diagnostics.
 7 | 
 8 | ## Features
 9 | At the core of the package is the function
10 | ```{R}
11 | sgd(formula, data, model, model.control, sgd.control)
12 | ```
13 | It estimates parameters for a given data set and model using stochastic gradient
14 | descent. The optional arguments `model.control` and `sgd.control` specify
15 | attributes about the model and stochastic gradient method. Taking advantage of
16 | the bigmemory package, sgd also operates on data sets which are too large to fit
17 | in RAM as well as streaming data.
18 | 
19 | Example of large-scale linear regression:
20 | ```{R}
21 | library(sgd)
22 | 
23 | # Dimensions
24 | N <- 1e5  # number of data points
25 | d <- 1e2  # number of features
26 | 
27 | # Generate data.
28 | X <- matrix(rnorm(N*d), ncol=d)
29 | theta <- rep(5, d+1)
30 | eps <- rnorm(N)
31 | y <- cbind(1, X) %*% theta + eps
32 | dat <- data.frame(y=y, x=X)
33 | 
34 | sgd.theta <- sgd(y ~ ., data=dat, model="lm")
35 | ```
36 | 
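A GLM works the same way via `model="glm"` and `model.control`; a minimal sketch along the lines of `demo/glm-logistic-regression.R`, reusing `X`, `theta`, `eps`, and `N` from the example above:

```{R}
# Logistic regression on simulated binary data (sketch).
p <- 1 / (1 + exp(-(cbind(1, X) %*% theta + eps)))
y <- rbinom(N, 1, p)
dat <- data.frame(y=y, x=X)
sgd.theta <- sgd(y ~ ., data=dat, model="glm",
                 model.control=list(family="binomial"))
```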
37 | Any loss function may be specified. For convenience the following are
38 | built-in:
39 | * Linear models
40 | * Generalized linear models
41 | * Method of moments
42 | * Generalized method of moments
43 | * Cox proportional hazards model
44 | * M-estimation
45 | 
46 | The following stochastic gradient methods exist:
47 | * (Standard) stochastic gradient descent
48 | * Implicit stochastic gradient descent
49 | * Averaged stochastic gradient descent
50 | * Averaged implicit stochastic gradient descent
51 | * Classical momentum
52 | * Nesterov's accelerated gradient
53 | 
54 | Check out the vignette in [`vignettes/`](vignettes/) or examples in [`demo/`](demo/).
55 | In R, the equivalent commands are `vignette(package="sgd")` and
56 | `demo(package="sgd")`.
57 | 
58 | ## Installation
59 | To install the latest version from CRAN:
60 | ```{R}
61 | install.packages("sgd")
62 | ```
63 | 
64 | To install the latest development version from Github:
65 | ```{R}
66 | # install.packages("devtools")
67 | devtools::install_github("airoldilab/sgd")
68 | ```
69 | 
70 | ## Authors
71 | sgd is written by [Dustin Tran](http://dustintran.com), [Junhyung Lyle Kim](https://jlylekim.github.io/) and
72 | [Panos Toulis](https://www.ptoulis.com/). Please feel free to contribute by
73 | submitting any issues or requests—or by solving any current issues!
74 | 
75 | We thank all other members of the Airoldi Lab (led by Prof. Edo Airoldi) for their feedback and contributions.
76 | 
77 | ## Citation
78 | 
79 | ```
80 | @article{tran2015stochastic,
81 |   author = {Tran, Dustin and Toulis, Panos and Airoldi, Edoardo M},
82 |   title = {Stochastic gradient descent methods for estimation with large data sets},
83 |   journal = {arXiv preprint arXiv:1509.06459},
84 |   year = {2015}
85 | }
86 | ```

--------------------------------------------------------------------------------
/cran-comments.md:
--------------------------------------------------------------------------------
 1 | ## Test environments
 2 | 
 3 | * local OS X install, R 4.3.2
 4 | * win-builder (devel)
 5 | 
 6 | ## R CMD check results
 7 | There were no ERRORs or WARNINGs.
 8 | 
 9 | There was 1 NOTE:
10 | 
11 |   checking CRAN incoming feasibility (8.6s)
12 |   Maintainer: ‘Junhyung Lyle Kim <jlylekim@gmail.com>’
13 | 
14 |   New submission
15 | 
16 |   Package was archived on CRAN
17 | 
18 |   CRAN repository db overrides:
19 |     X-CRAN-Comment: Archived on 2024-01-30 as check problems were not
20 |       corrected in time.
21 | 
22 | -> This is the new submission after fixing the check problems.
23 | 
24 | 

--------------------------------------------------------------------------------
/data/winequality.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/airoldilab/sgd/92246a3e1d81c9cd3077b80cacb6a3f7ee2425c5/data/winequality.rda

--------------------------------------------------------------------------------
/demo/00Index:
--------------------------------------------------------------------------------
1 | bench-corr-linear-regression    Benchmark linear regression on correlated normal data
2 | bench-linear-regression         Benchmark linear regression on normal data
3 | bench-logistic-wine             Benchmark logistic regression on wine quality data set
4 | cox-regression                  Demo Cox regression on simulated data
5 | glm-logistic-regression         Demo logistic regression on Bernoulli data
6 | glm-poisson-regression          Demo Poisson regression on Poisson data
7 | linear-regression               Demo linear regression on normal data
8 | m-estimation                    Demo M-estimation with Huber loss on contaminated normal data
9 | normal-method-of-moments        Demo method of moments for estimating normal parameters

--------------------------------------------------------------------------------
/demo/bench-corr-linear-regression.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Benchmark sgd package for linear regression on simulated data from a
 3 | # correlated normal distribution. This follows the experiment in Section 5.1 of
 4 | # Friedman et al. (2010).
 5 | #
 6 | # Data generating process:
 7 | #   Y = sum_{j=1}^p X_j*beta_j + k*eps, where
 8 | #     X ~ Multivariate normal where each covariate Xj, Xj' has equal correlation
 9 | #       rho; rho ranges over (0,0.1,0.2,0.5,0.9,0.95) for each pair (n, d)
10 | #     beta_j = (-1)^j exp(-2(j-1)/20)
11 | #     eps ~ Normal(0,1)
12 | #     k = 3
13 | #
14 | # Dimensions:
15 | #   n=1000, d=100
16 | #   n=5000, d=100
17 | #   n=100, d=1000
18 | #   n=100, d=5000
19 | #   n=100, d=20000
20 | #   n=100, d=50000
21 | 
22 | library(sgd)
23 | library(glmnet)
24 | library(microbenchmark)
25 | 
26 | # Function taken from Friedman et al.
27 | genx = function(n,p,rho){
28 |   # generate x's multivariate normal with equal corr rho
29 |   # Xi = b Z + Wi, and Z, Wi are independent normal.
30 |   # Then Var(Xi) = b^2 + 1
31 |   # Cov(Xi, Xj) = b^2 and so cor(Xi, Xj) = b^2 / (1+b^2) = rho
32 |   z=rnorm(n)
33 |   if(abs(rho)<1){
34 |     beta=sqrt(rho/(1-rho))
35 |     x=matrix(rnorm(n*p),ncol=p)
36 |     A = matrix(z, nrow=n, ncol=p, byrow=F)
37 |     x= beta * A + x
38 |   }
39 |   if(abs(rho)==1){ x=matrix(z,nrow=n,ncol=p,byrow=F)}
40 | 
41 |   return(x)
42 | }
43 | 
44 | # Dimensions
45 | Ns <- c(1000, 5000, 100, 100, 100, 100)
46 | ds <- c(100, 100, 1000, 5000, 20000, 50000)
47 | rhos <- c(0, 0.1, 0.2, 0.5, 0.9, 0.95)
48 | 
49 | # Explicit SGD errors out during the loop for higher correlations, so restrict rho.
50 | rhos <- c(0, 0.1, 0.2)
51 | 
52 | set.seed(42)
53 | benchmark <- list()
54 | idx <- 1
55 | for (i in 1:length(Ns)) {
56 |   for (j in 1:length(rhos)) {
57 |     N <- Ns[i]
58 |     d <- ds[i]
59 |     rho <- rhos[j]
60 |     # Generate data.
61 |     X <- genx(N, d, rho)
62 |     theta <- ((-1)^(1:d))*exp(-2*((1:d)-1)/20)
63 |     eps <- rnorm(N)
64 |     k <- 3
65 |     y <- X %*% theta + k * eps
66 | 
67 |     # Benchmark!
68 |     benchmark[[idx]] <- microbenchmark(
69 |       aisgd=sgd(X, y, model="lm",
70 |                 sgd.control=list(method="ai-sgd", npasses=1, pass=T)),
71 |       sgd=sgd(X, y, model="lm",
72 |               sgd.control=list(method="sgd", npasses=1, pass=T)),
73 |       glmnet=glmnet(X, y, alpha=1, standardize=FALSE, type.gaussian="covariance"),
74 |       times=10L, unit="s"
75 |     )
76 |     names(benchmark)[idx] <- sprintf("N: %i; d: %i; rho: %0.2f", N, d, rho)
77 |     idx <- idx + 1
78 |   }
79 | }
80 | 
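One way to tabulate the results collected above (an editor's sketch; `benchmark` is the list built in the loop, and `summary()` on a `microbenchmark` object returns a data frame with `expr` and `median` columns):

```r
# Median seconds per method, one row per (N, d, rho) setting (sketch).
t(sapply(benchmark, function(b) {
  s <- summary(b)
  setNames(s$median, as.character(s$expr))
}))
```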
--------------------------------------------------------------------------------
/demo/bench-linear-regression.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Benchmark sgd package for linear regression on simulated data from a normal
 3 | # distribution.
 4 | #
 5 | # Data generating process:
 6 | #   Y = X %*% theta + epsilon, where
 7 | #     X ~ Normal(0, 1)
 8 | #     theta = (5,...,5)
 9 | #     epsilon ~ Normal(0,1)
10 | #
11 | # Dimensions:
12 | #   N=1e5 observations
13 | #   d=1e2 parameters
14 | 
15 | library(sgd)
16 | library(glmnet)
17 | library(microbenchmark)
18 | 
19 | # Dimensions
20 | N <- 1e5
21 | d <- 1e2
22 | 
23 | # Generate data.
24 | set.seed(42)
25 | X <- matrix(rnorm(N*d), ncol=d)
26 | theta <- rep(5, d+1)
27 | eps <- rnorm(N)
28 | y <- cbind(1, X) %*% theta + eps
29 | dat <- data.frame(y=y, x=X)
30 | 
31 | # Benchmark!
32 | benchmark <- microbenchmark(
33 |   sgd=sgd(y ~ ., data=dat, model="lm",
34 |           sgd.control=list(method="implicit")),
35 |   lm=lm(y ~ ., data=dat),
36 |   glmnet=glmnet(X, y, alpha=1, standardize=FALSE, type.gaussian="naive"),
37 |   times=10L
38 | )
39 | benchmark
40 | ## Output (for 2.6 GHz, Intel Core i5)
41 | ## Unit: milliseconds
42 | ##    expr       min        lq      mean    median        uq       max neval
43 | ##     sgd  644.8761  676.8018  740.3485  733.2575  776.6705  918.4465    10
44 | ##      lm 1467.7145 1566.2102 1648.5728 1608.2134 1773.2074 1816.3088    10
45 | ##  glmnet 2392.7664 2437.7863 2636.2133 2600.4307 2789.2635 3091.6138    10
46 | 
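Timing aside, the estimates themselves can be checked directly against `lm()` (an editor's sketch reusing `dat` from the script above):

```r
# Fit once outside the benchmark and compare coefficient estimates (sketch).
fit.sgd <- sgd(y ~ ., data=dat, model="lm",
               sgd.control=list(method="implicit"))
fit.lm <- lm(y ~ ., data=dat)
mean((coef(fit.sgd) - coef(fit.lm))^2)
```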
--------------------------------------------------------------------------------
/demo/bench-logistic-wine.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Compare out-of-sample log-likelihoods, using sgd() and glm() for
 3 | # logistic regression on the wine quality data set.
 4 | #
 5 | # Dimensions:
 6 | #   n=4286, d=12
 7 | 
 8 | library(sgd)
 9 | 
10 | # Generate data.
11 | data("winequality")
12 | dat <- winequality
13 | dat$quality <- as.numeric(dat$quality > 5)  # transform to binary
14 | 
15 | test.set <- sample(1:nrow(dat), size=nrow(dat)/8, replace=F)
16 | dat.test <- dat[test.set, ]
17 | dat <- dat[-test.set, ]
18 | 
19 | # Fit glm() and sgd().
20 | fit.glm <- glm(quality~., family=binomial(link="logit"), data=dat)
21 | fit.sgd <- sgd(quality ~ ., data=dat,
22 |                model="glm", model.control=list(family=binomial(link="logit")),
23 |                sgd.control=list(reltol=1e-5, npasses=200, lr.control=c(scale=1, gamma=1, alpha=30, c=1)))
24 | 
25 | # Compare log likelihoods.
26 | log.lik <- function(theta.est) {
27 | 
28 |   y <- dat.test$quality
29 |   X <- as.matrix(dat.test[, seq(1, ncol(dat)-1)])
30 |   X <- cbind(1, X)
31 | 
32 |   eta <- plogis(X %*% theta.est)
33 |   print(cor(y, eta))
34 |   sum(y * log(eta) + (1-y) * log(1-eta))
35 | }
36 | 
37 | theta.glm <- matrix(as.numeric(fit.glm$coefficients), ncol=1)
38 | theta.sgd <- matrix(as.numeric(fit.sgd$coefficients), ncol=1)
39 | log.lik.glm <- log.lik(fit.glm$coefficients)
40 | log.lik.sgd <- log.lik(theta.sgd)
41 | 
42 | print(sprintf("Out-of-sample Log-likelihood for glm()=%.3f sgd=%.3f", log.lik.glm, log.lik.sgd))
43 | 

--------------------------------------------------------------------------------
/demo/cox-regression.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | # Demo usage of sgd for Cox regression on simulated survival data.
  3 | #
  4 | # Data generating process:
  5 | #   Y ~ Exponential(rate=exp(X %*% beta)) (see generate.data)
  6 | #   censoring ~ logistic in Y (see generate.data)
  7 | #   X ~ correlated Normal (see genx)
  8 | #
  9 | # Dimensions:
 10 | #   N=1e3 observations
 11 | #   d=5 parameters
 12 | 
 13 | # Function taken from Friedman et al.
 14 | genx = function(n,p,rho){
 15 |   # generate x's multivariate normal with equal corr rho
 16 |   # Xi = b Z + Wi, and Z, Wi are independent normal.
 17 |   # Then Var(Xi) = b^2 + 1
 18 |   # Cov(Xi, Xj) = b^2 and so cor(Xi, Xj) = b^2 / (1+b^2) = rho
 19 |   z=rnorm(n)
 20 |   if(abs(rho)<1){
 21 |     beta=sqrt(rho/(1-rho))
 22 |     x0=matrix(rnorm(n*p),ncol=p)
 23 |     A = matrix(z, nrow=n, ncol=p, byrow=F)
 24 |     x= beta * A + x0
 25 |   }
 26 |   if(abs(rho)==1){ x=matrix(z,nrow=n,ncol=p,byrow=F)}
 27 | 
 28 |   return(x)
 29 | }
 30 | 
 31 | generate.data <- function(n, p, rho=0.2) {
 32 |   ## Generate data
 33 |   # #
 34 |   # # Returns:
 35 |   # #   LIST(X, Y, censor, true.beta)
 36 |   # #     X = Nxp matrix of covariates.
 37 |   # #     Y = Nx1 vector of observed times.
 38 |   # #     censor = Nx1 vector {0, 1} of censor indicators.
 39 |   # #     true.beta = p-vector of true model parameters.
 40 |   # #     M = (Y, censor) as matrix
 41 | 
 42 |   X = genx(n, p, rho=rho)
 43 |   # rates.
 44 |   # rates = runif(n, min=1e-2, max=10)
 45 |   # Y = rexp(n,
 46 |   # beta = solve(t(X) %*% X) %*% t(X) %*% log(rates)
 47 |   beta = 10 *((-1)^(1:p))*exp(-2*((1:p)-1)/20)
 48 |   # beta = 10 * seq(1, p)**(-0.5)
 49 |   # warning("Large coefficients")
 50 |   pred = exp(X %*% beta)
 51 |   Y = rexp(n, rate =pred)
 52 | 
 53 |   q3 = quantile(Y, prob=c(0.8))  # Q3 of Y
 54 |   epsilon = 0.001  # probability of censoring smallest Y
 55 |   k = log(1/epsilon - 1) / (q3 - min(Y))
 56 |   censor.prob = (1 + exp(-k * (Y-q3)))**(-1)
 57 | 
 58 |   C = rbinom(n, size=1, prob= censor.prob)
 59 | 
 60 |   ## Order the data
 61 |   order.i = order(Y)
 62 |   X = X[order.i, ]
 63 |   Y = Y[order.i]
 64 |   C = C[order.i]
 65 | 
 66 |   M = matrix(0, nrow=n, ncol=2)
 67 |   colnames(M) <- c("time", "status")
 68 |   M[, 1] <- Y
 69 |   M[, 2] <- 1-C
 70 |   return(list(X=X, Y=Y, censor=C, M=M, true.beta=beta))
 71 | }
 72 | 
 73 | library(sgd)
 74 | 
 75 | # Dimensions
 76 | N <- 1e3
 77 | d <- 5
 78 | 
 79 | # Generate data.
 80 | set.seed(42)
 81 | data <- generate.data(N, d)
 82 | X <- data$X
 83 | y <- 1 - data$censor  # y=1 if fail, 0 if censor
 84 | dat <- data.frame(y=y, x=X)
 85 | 
 86 | t <- data$Y  # times of observations
 87 | theta <- data$true.beta  # true coefficients
 88 | 
 89 | # Explicit
 90 | sgd.theta <- sgd(y ~ . - 1, data=dat, model="cox",
 91 |                  sgd.control=list(method="sgd", lr="adagrad", npasses=5))
 92 | sgd.theta$coefficients
 93 | 
 94 | # Implicit
 95 | sgd.theta <- sgd(y ~ . - 1, data=dat, model="cox",
 96 |                  sgd.control=list(method="implicit",
 97 |                                   lr="adagrad",
 98 |                                   lr.control=c(0.1, NA),
 99 |                                   npasses=5))
100 | sgd.theta$coefficients
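A quick accuracy check on the implicit fit (an editor's sketch; `sgd.theta` and `theta` are defined in the script above):

```r
# Mean squared error of the fitted coefficients against the true beta (sketch).
mean((as.numeric(sgd.theta$coefficients) - theta)^2)
```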
89 | # Explicit
90 | sgd.theta <- sgd(y ~ . - 1, data=dat, model="cox",
91 | sgd.control=list(method="sgd", lr="adagrad", npasses=5))
92 | sgd.theta$coefficients
93 | 
94 | # Implicit
95 | sgd.theta <- sgd(y ~ . - 1, data=dat, model="cox",
96 | sgd.control=list(method="implicit",
97 | lr="adagrad",
98 | lr.control=c(0.1, NA),
99 | npasses=5))
100 | sgd.theta$coefficients
101 | 
--------------------------------------------------------------------------------
/demo/glm-logistic-regression.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | # Demo usage of sgd for logistic regression on simulated binomial data.
3 | #
4 | # Data generating process:
5 | #   Y ~ Binomial(1, p), where
6 | #   p = 1 / (1 + exp(-(X %*% theta + epsilon)))
7 | #   X ~ Normal(0, 1)
8 | #   theta = (5,...,5)
9 | #   epsilon ~ Normal(0,1)
10 | #
11 | # Dimensions:
12 | #   N=1e5 observations
13 | #   d=5 parameters
14 | 
15 | library(sgd)
16 | 
17 | # Dimensions
18 | N <- 1e5
19 | d <- 5
20 | 
21 | # Generate data.
22 | set.seed(42)
23 | X <- matrix(rnorm(N*d), ncol=d)
24 | theta <- rep(5, d+1)
25 | eps <- rnorm(N)
26 | p <- 1/(1+exp(-(cbind(1, X) %*% theta + eps)))
27 | y <- rbinom(N, 1, p)
28 | dat <- data.frame(y=y, x=X)
29 | 
30 | sgd.theta <- sgd(y ~ ., data=dat, model="glm",
31 | model.control=list(family="binomial"),
32 | sgd.control=list(lr.control=c(100, NA, NA, NA), npasses=1,
33 | pass=T))
34 | 
35 | plot(sgd.theta, cbind(1, X), y, label="ai-sgd", type="clf")
--------------------------------------------------------------------------------
/demo/glm-poisson-regression.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | # Demo usage of sgd for fitting a generalized linear model with Poisson
3 | # family (log link) on simulated data.
4 | #
5 | # Data generating process:
6 | #   X is either (0, 0), (1, 0), (0, 1) with 0.6, 0.2, 0.2 probability
7 | #   theta = (log(2), log(4))
8 | #   y = Poisson(exp(X %*% theta))
9 | #
10 | # Dimensions:
11 | #   N=1e5 observations
12 | #   d=2 parameters
13 | 
14 | library(sgd)
15 | 
16 | # Dimensions
17 | N <- 1e5
18 | 
19 | # Generate data.
20 | set.seed(42)
21 | Q <- 0.2
22 | code <- sample(0:2, size=N, replace=T, prob=c((1-2*Q), Q, Q))
23 | X <- matrix(0, nrow=N, ncol=2)
24 | X[,1] <- as.numeric(code==1)
25 | X[,2] <- as.numeric(code==2)
26 | theta <- matrix(c(log(2), log(4)), ncol=1)
27 | y <- matrix(rpois(N, exp(X %*% theta)), ncol=1)
28 | dat <- data.frame(y=y, x=X)
29 | 
30 | sgd.theta <- sgd(y ~ ., data=dat, model="glm",
31 | model.control=list(family=poisson()))
32 | # 3 parameters including intercept
33 | mean((sgd.theta$coefficients - c(0, theta))^2) # MSE
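# Since theta = (log(2), log(4)), exponentiating the slope estimates should
# roughly recover the true rate ratios of 2 and 4:
# exp(sgd.theta$coefficients[-1])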
--------------------------------------------------------------------------------
/demo/linear-regression.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | # Demo usage of sgd for linear regression on simulated normal data.
3 | #
4 | # Data generating process:
5 | #   Y = X %*% theta + epsilon, where
6 | #   X ~ Normal(0, 1)
7 | #   theta = (5,...,5)
8 | #   epsilon ~ Normal(0,1)
9 | #
10 | # Dimensions:
11 | #   N=1e5 observations
12 | #   d=1e2 parameters
13 | 
14 | library(sgd)
15 | 
16 | # Dimensions
17 | N <- 1e5
18 | d <- 1e2
19 | 
20 | # Generate data.
21 | set.seed(42)
22 | X <- matrix(rnorm(N*d), ncol=d)
23 | theta <- rep(5, d+1)
24 | eps <- rnorm(N)
25 | y <- cbind(1, X) %*% theta + eps
26 | dat <- data.frame(y=y, x=X)
27 | 
28 | sgd.theta <- sgd(y ~ ., data=dat, model="lm")
29 | 
30 | plot(sgd.theta, theta, type="mse-param")
--------------------------------------------------------------------------------
/demo/m-estimation.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | # Demo usage of sgd for M-estimation using the Huber loss.
3 | # Example taken from Donoho and Montanari (2013, Section 2.4).
4 | #
5 | # Data generating process:
6 | #   Y = X %*% theta + epsilon, where
7 | #   X ~ Normal(0, 1/N)
8 | #   theta ~ Unif([0,1]^d) with fixed 2-norm 6*sqrt(d)
9 | #   epsilon ~ ContaminatedNormal(0.05, 10) = 0.95z + 0.05h_{10},
10 | #     where z ~ Normal(0,1) and h_x = unit atom at x
11 | #
12 | # Dimensions:
13 | #   N=1000 observations
14 | #   d=200 parameters
15 | 
16 | library(sgd)
17 | library(ggplot2)
18 | 
19 | generate.data <- function(N, d) {
20 | l2 <- function(x) sqrt(sum(x**2))
21 | X <- matrix(rnorm(N*d, mean=0, sd=1/sqrt(N)), nrow=N, ncol=d)
22 | theta <- runif(d)
23 | theta <- theta * 6 * sqrt(d) / l2(theta)
24 | 
25 | # noise
26 | ind <- rbinom(N, size=1, prob=.95)
27 | epsilon <- ind * rnorm(N) + (1-ind) * rep(10, N)
28 | 
29 | Y <- X %*% theta + epsilon
30 | return(list(y=Y, X=X, theta=theta))
31 | }
32 | 
33 | # Dimensions
34 | N <- 1000
35 | d <- 200
36 | 
37 | # Generate data.
38 | set.seed(42)
39 | data <- generate.data(N, d)
40 | dat <- data.frame(y=data$y, x=data$X)
41 | 
42 | sgd.theta <- sgd(y ~ .-1, data=dat, model="m", sgd.control=list(method="sgd",
43 | lr.control=c(15, NA, NA, 1/2), npasses=10, pass=T))
44 | 
45 | plot(sgd.theta, data$theta, label="sgd", type="mse-param") +
46 | geom_hline(yintercept=1.5, color="green")
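# For reference, the Huber loss minimized by model="m" (default lambda = 3,
# see src/model/m-estimation/m_loss.h) is, written out in R:
# huber <- function(u, lambda = 3) {
#   ifelse(abs(u) <= lambda, u^2 / 2, lambda * abs(u) - lambda^2 / 2)
# }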
--------------------------------------------------------------------------------
/demo/normal-method-of-moments.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | # Demo usage of sgd for estimating parameters of a normal distribution using
3 | # the method of moments.
4 | #
5 | # Data generating process:
6 | #   X ~ Normal(4, 2)
7 | #
8 | # Dimensions:
9 | #   N=1e3 observations
10 | #   d=2 parameters
11 | 
12 | library(sgd)
13 | 
14 | # Dimensions
15 | N <- 1e3
16 | d <- 2
17 | theta <- c(4, 2)
18 | 
19 | # Generate data.
20 | set.seed(42)
21 | X <- matrix(rnorm(N, mean=theta[1], sd=theta[2]), ncol=1)
22 | 
23 | # Gradient of moment function (using 3 moments)
24 | gr <- function(theta, x) {
25 | return(as.matrix(c(
26 | mean(2*(theta[1] - x) +
27 | 2*(theta[2]^2 - (x - theta[1])^2) * 2*(-theta[1] +x) +
28 | 2*(x^3 - theta[1]*(theta[1]^2 + 3*theta[2]^2)) * (-3*theta[1]^2 -
29 | 3*theta[2]^2)),
30 | mean(0 +
31 | 2*(theta[2]^2 - (x - theta[1])^2) * 2*theta[2] +
32 | 2*(x^3 - theta[1]*(theta[1]^2 + 3*theta[2]^2)) * -6*theta[1]*theta[2])
33 | )))
34 | }
35 | sgd.theta <- sgd(X, y=matrix(NA, nrow=nrow(X)), model="gmm",
36 | model.control=list(gr=gr, nparams=2),
37 | sgd.control=list(method="sgd", npasses=100, lr="adagrad"))
38 | sgd.theta
--------------------------------------------------------------------------------
/man/coef.sgd.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/coef.sgd.R
3 | \name{coef.sgd}
4 | \alias{coef.sgd}
5 | \title{Extract Model Coefficients}
6 | \usage{
7 | \method{coef}{sgd}(object, ...)
8 | }
9 | \arguments{
10 | \item{object}{object of class \code{sgd}.}
11 | 
12 | \item{\dots}{some methods for this generic require additional
13 | arguments. None are used in this method.}
14 | }
15 | \value{
16 | Coefficients extracted from the model object \code{object}.
17 | }
18 | \description{
19 | Extract model coefficients from \code{sgd} objects. \code{coefficients}
20 | is an \emph{alias} for it.
21 | }
22 | 
--------------------------------------------------------------------------------
/man/fitted.sgd.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/fitted.sgd.R
3 | \name{fitted.sgd}
4 | \alias{fitted.sgd}
5 | \title{Extract Model Fitted Values}
6 | \usage{
7 | \method{fitted}{sgd}(object, ...)
8 | }
9 | \arguments{
10 | \item{object}{object of class \code{sgd}.}
11 | 
12 | \item{\dots}{some methods for this generic require additional
13 | arguments. None are used in this method.}
14 | }
15 | \value{
16 | Fitted values extracted from the object \code{object}.
17 | }
18 | \description{
19 | Extract fitted values from \code{sgd} objects.
20 | \code{fitted.values} is an \emph{alias} for it.
21 | }
22 | 
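These accessors compose in the usual S3 way; a minimal sketch, reusing the
dat object from the linear-regression demo above:

fit <- sgd(y ~ ., data=dat, model="lm")
coef(fit)             # coefficient vector
head(fitted(fit))     # fitted mean values
head(residuals(fit))  # response minus fitted values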
--------------------------------------------------------------------------------
/man/plot.sgd.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/plot.sgd.R
3 | \name{plot.sgd}
4 | \alias{plot.sgd}
5 | \alias{plot.list}
6 | \title{Plot objects of class \code{sgd}.}
7 | \usage{
8 | \method{plot}{sgd}(x, ..., type = "mse", xaxis = "iteration")
9 | 
10 | \method{plot}{list}(x, ..., type = "mse", xaxis = "iteration")
11 | }
12 | \arguments{
13 | \item{x}{object of class \code{sgd}.}
14 | 
15 | \item{\dots}{additional arguments used for each type of plot. See
16 | \sQuote{Details}.}
17 | 
18 | \item{type}{character specifying the type of plot: \code{"mse"},
19 | \code{"clf"}, \code{"mse-param"}. See \sQuote{Details}. Default is
20 | \code{"mse"}.}
21 | 
22 | \item{xaxis}{character specifying the x-axis of plot: \code{"iteration"}
23 | plots the y values over the log-iteration of the algorithm;
24 | \code{"runtime"} plots the y values over the time in seconds to reach them.
25 | Default is \code{"iteration"}.}
26 | }
27 | \description{
28 | Plot objects of class \code{sgd}.
29 | }
30 | \details{
31 | Types of plots available:
32 | \describe{
33 | \item{\code{mse}}{Mean squared error in predictions, which takes the
34 | following arguments:
35 | \describe{
36 | \item{\code{x_test}}{test set}
37 | \item{\code{y_test}}{test responses to compare predictions to}
38 | }}
39 | \item{\code{clf}}{Classification error in predictions, which takes the
40 | following arguments:
41 | \describe{
42 | \item{\code{x_test}}{test set}
43 | \item{\code{y_test}}{test responses to compare predictions to}
44 | }}
45 | \item{\code{mse-param}}{Mean squared error in parameters, which takes the
46 | following arguments:
47 | \describe{
48 | \item{\code{true_param}}{true vector of parameters to compare to}
49 | }}
50 | }
51 | }
52 | 
--------------------------------------------------------------------------------
/man/predict.sgd.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/predict.sgd.R
3 | \name{predict.sgd}
4 | \alias{predict.sgd}
5 | \alias{predict_all}
6 | \title{Model Predictions}
7 | \usage{
8 | \method{predict}{sgd}(object, newdata, type = "link", ...)
9 | 
10 | predict_all(object, newdata, ...)
11 | }
12 | \arguments{
13 | \item{object}{object of class \code{sgd}.}
14 | 
15 | \item{newdata}{design matrix to form predictions on}
16 | 
17 | \item{type}{the type of prediction required. The default "link" is
18 | on the scale of the linear predictors; the alternative '"response"'
19 | is on the scale of the response variable. Thus for a default
20 | binomial model the default predictions are of log-odds
21 | (probabilities on logit scale) and 'type = "response"' gives the
22 | predicted probabilities. The '"terms"' option returns a matrix
23 | giving the fitted values of each term in the model formula on the
24 | linear predictor scale.}
25 | 
26 | \item{\dots}{further arguments passed to or from other methods.}
27 | }
28 | \description{
29 | Form predictions using the estimated model parameters from stochastic
30 | gradient descent.
31 | }
32 | \details{
33 | A column of 1's must be included in \code{newdata} if the
34 | parameters include a bias (intercept) term.
35 | }
36 | 
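Because predict() operates on the raw design matrix, the intercept column is
supplied by hand; a minimal sketch continuing the fit above, with hypothetical
held-out covariates X_new:

X_new <- matrix(rnorm(10 * d), ncol=d)
predict(fit, cbind(1, X_new))                   # linear predictor scale
predict(fit, cbind(1, X_new), type="response")  # response scale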
--------------------------------------------------------------------------------
/man/print.sgd.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/print.sgd.R
3 | \name{print.sgd}
4 | \alias{print.sgd}
5 | \title{Print objects of class \code{sgd}.}
6 | \usage{
7 | \method{print}{sgd}(x, ...)
8 | }
9 | \arguments{
10 | \item{x}{object of class \code{sgd}.}
11 | 
12 | \item{\dots}{further arguments passed to or from other methods.}
13 | }
14 | \description{
15 | Print objects of class \code{sgd}.
16 | }
17 | 
--------------------------------------------------------------------------------
/man/residuals.sgd.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/residuals.sgd.R
3 | \name{residuals.sgd}
4 | \alias{residuals.sgd}
5 | \title{Extract Model Residuals}
6 | \usage{
7 | \method{residuals}{sgd}(object, ...)
8 | }
9 | \arguments{
10 | \item{object}{object of class \code{sgd}.}
11 | 
12 | \item{\dots}{some methods for this generic require additional
13 | arguments. None are used in this method.}
14 | }
15 | \value{
16 | Residuals extracted from the object \code{object}.
17 | }
18 | \description{
19 | Extract model residuals from \code{sgd} objects. \code{resid} is an
20 | \emph{alias} for it.
21 | }
22 | 
--------------------------------------------------------------------------------
/man/sgd.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/sgd.R
3 | \name{sgd}
4 | \alias{sgd}
5 | \alias{sgd.formula}
6 | \alias{sgd.matrix}
7 | \alias{sgd.big.matrix}
8 | \title{Stochastic gradient descent}
9 | \usage{
10 | sgd(x, ...)
11 | 
12 | \method{sgd}{formula}(formula, data, model, model.control = list(), sgd.control = list(...), ...)
13 | 
14 | \method{sgd}{matrix}(x, y, model, model.control = list(), sgd.control = list(...), ...)
15 | 
16 | \method{sgd}{big.matrix}(x, y, model, model.control = list(), sgd.control = list(...), ...)
17 | }
18 | \arguments{
19 | \item{x, y}{a design matrix and the respective vector of outcomes.}
20 | 
21 | \item{\dots}{arguments to be used to form the default \code{sgd.control}
22 | arguments if it is not supplied directly.}
23 | 
24 | \item{formula}{an object of class \code{"\link{formula}"} (or one that can be
25 | coerced to that class): a symbolic description of the model to be fitted.
26 | The details can be found in \code{"\link{glm}"}.}
27 | 
28 | \item{data}{an optional data frame, list or environment (or object coercible
29 | by \code{\link[base]{as.data.frame}} to a data frame) containing the
30 | variables in the model. If not found in data, the variables are taken from
31 | environment(formula), typically the environment from which glm is called.}
32 | 
33 | \item{model}{character specifying the model to be used: \code{"lm"} (linear
34 | model), \code{"glm"} (generalized linear model), \code{"cox"} (Cox
35 | proportional hazards model), \code{"gmm"} (generalized method of moments),
36 | \code{"m"} (M-estimation). See \sQuote{Details}.}
37 | 
38 | \item{model.control}{a list of parameters for controlling the model.
39 | \describe{
40 | \item{\code{family} (\code{"glm"})}{a description of the error distribution and
41 | link function to be used in the model. This can be a character string
42 | naming a family function, a family function or the result of a call to
43 | a family function. (See \code{\link[stats]{family}} for details of
44 | family functions.)}
45 | \item{\code{rank} (\code{"glm"})}{logical. Should the rank of the design matrix
46 | be checked?}
47 | \item{\code{fn} (\code{"gmm"})}{a function \eqn{g(\theta,x)} which returns a
48 | \eqn{k}-vector corresponding to the \eqn{k} moment conditions. It is a
49 | required argument if \code{gr} is not specified.}
50 | \item{\code{gr} (\code{"gmm"})}{a function to return the gradient. If
51 | unspecified, a finite-difference approximation will be used.}
52 | \item{\code{nparams} (\code{"gmm"})}{number of model parameters. This is
53 | automatically determined for other models.}
54 | \item{\code{type} (\code{"gmm"})}{character specifying the generalized method of
55 | moments procedure: \code{"twostep"} (Hansen, 1982), \code{"iterative"}
56 | (Hansen et al., 1996). Defaults to \code{"iterative"}.}
57 | \item{\code{wmatrix} (\code{"gmm"})}{weighting matrix to be used in the loss
58 | function. Defaults to the identity matrix.}
59 | \item{\code{loss} (\code{"m"})}{character specifying the loss function to be
60 | used in the estimating equation. Default is the Huber loss.}
61 | \item{\code{lambda1}}{L1 regularization parameter. Default is 0.}
62 | \item{\code{lambda2}}{L2 regularization parameter. Default is 0.}
63 | }}
64 | 
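As a concrete example of the two penalty parameters just described, an
elastic-net-style fit combines both; a sketch, reusing the dat defined in the
Examples below (the L1 path is exercised in tests/testthat/test-lasso.R):

fit <- sgd(y ~ ., data=dat, model="lm",
           model.control=list(lambda1=0.1, lambda2=0.1))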
65 | \item{sgd.control}{an optional list of parameters for controlling the estimation.
66 | \describe{
67 | \item{\code{method}}{character specifying the method to be used: \code{"sgd"},
68 | \code{"implicit"}, \code{"asgd"}, \code{"ai-sgd"}, \code{"momentum"},
69 | \code{"nesterov"}. Default is \code{"ai-sgd"}. See \sQuote{Details}.}
70 | \item{\code{lr}}{character specifying the learning rate to be used:
71 | \code{"one-dim"}, \code{"one-dim-eigen"}, \code{"d-dim"},
72 | \code{"adagrad"}, \code{"rmsprop"}. Default is \code{"one-dim"}.
73 | See \sQuote{Details}.}
74 | \item{\code{lr.control}}{vector of scalar hyperparameters one can
75 | set dependent on the learning rate. For hyperparameters aimed
76 | to be left as default, specify \code{NA} in the corresponding
77 | entries. See \sQuote{Details}.}
78 | \item{\code{start}}{starting values for the parameter estimates. Default is
79 | random initialization around zero.}
80 | \item{\code{size}}{number of SGD estimates to store for diagnostic purposes
81 | (distributed log-uniformly over total number of iterations)}
82 | \item{\code{reltol}}{relative convergence tolerance. The algorithm stops
83 | when the relative mean squared difference between successive parameter
84 | estimates falls below this amount. Default is \code{1e-05}.}
85 | \item{\code{npasses}}{the maximum number of passes over the data. Default
86 | is 3.}
87 | \item{\code{pass}}{logical. Should \code{reltol} be ignored so that the
88 | algorithm runs for all of \code{npasses}?}
89 | \item{\code{shuffle}}{logical. Should the data set be shuffled anew for
90 | each pass?}
91 | \item{\code{verbose}}{logical. Should the algorithm print progress?}
92 | }}
93 | }
94 | \value{
95 | An object of class \code{"sgd"}, which is a list containing the following
96 | components:
97 | \item{model}{name of the model}
98 | \item{coefficients}{a named vector of coefficients}
99 | \item{converged}{logical. Was the algorithm judged to have converged?}
100 | \item{estimates}{estimates from algorithm stored at each iteration
101 | specified in \code{pos}}
102 | \item{fitted.values}{the fitted mean values}
103 | \item{pos}{vector of indices specifying the iteration number each estimate
104 | was stored for}
105 | \item{residuals}{the residuals, that is response minus fitted values}
106 | \item{times}{vector of times in seconds it took to complete the number of
107 | iterations specified in \code{pos}}
108 | \item{model.out}{a list of model-specific output attributes}
109 | }
110 | \description{
111 | Run stochastic gradient descent in order to optimize the induced loss
112 | function given a model and data.
113 | }
114 | \details{
115 | Models:
116 | The Cox model assumes the survival data are ordered by observed time when
117 | passed in, so that the risk set of an observation i consists of all data
118 | points from i onward.
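In practice this means sorting by observed time before fitting; a minimal
sketch, assuming a time vector, a 0/1 status vector, and a covariate matrix X:

ord <- order(time)
dat <- data.frame(y=status[ord], x=X[ord, ])
fit <- sgd(y ~ . - 1, data=dat, model="cox")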
119 | 
120 | Methods:
121 | \describe{
122 | \item{\code{sgd}}{stochastic gradient descent (Robbins and Monro, 1951)}
123 | \item{\code{implicit}}{implicit stochastic gradient descent (Toulis et al.,
124 | 2014)}
125 | \item{\code{asgd}}{stochastic gradient with averaging (Polyak and Juditsky,
126 | 1992)}
127 | \item{\code{ai-sgd}}{implicit stochastic gradient with averaging (Toulis et
128 | al., 2015)}
129 | \item{\code{momentum}}{"classical" momentum (Polyak, 1964)}
130 | \item{\code{nesterov}}{Nesterov's accelerated gradient (Nesterov, 1983)}
131 | }
132 | 
133 | Learning rates and hyperparameters:
134 | \describe{
135 | \item{\code{one-dim}}{scalar value prescribed in Xu (2011) as
136 | \deqn{a_n = scale * gamma * (1 + alpha*gamma*n)^{-c}}
137 | where the defaults are
138 | \code{lr.control = (scale=1, gamma=1, alpha=1, c)}
139 | where \code{c} is \code{1} if implemented without averaging,
140 | \code{2/3} if with averaging}
141 | \item{\code{one-dim-eigen}}{diagonal matrix
142 | \code{lr.control = NULL}}
143 | \item{\code{d-dim}}{diagonal matrix
144 | \code{lr.control = (epsilon=1e-6)}}
145 | \item{\code{adagrad}}{diagonal matrix prescribed in Duchi et al. (2011) as
146 | \code{lr.control = (eta=1, epsilon=1e-6)}}
147 | \item{\code{rmsprop}}{diagonal matrix prescribed in Tieleman and Hinton
148 | (2012) as
149 | \code{lr.control = (eta=1, gamma=0.9, epsilon=1e-6)}}
150 | }
151 | }
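Written out in R, the one-dim schedule and its defaults are simply (a sketch
matching src/learn-rate/onedim_learn_rate.h):

a_n <- function(n, scale=1, gamma=1, alpha=1, c=1) {
  scale * gamma * (1 + alpha * gamma * n)^(-c)
}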
152 | \examples{
153 | ## Linear regression
154 | set.seed(42)
155 | N <- 1e4
156 | d <- 5
157 | X <- matrix(rnorm(N*d), ncol=d)
158 | theta <- rep(5, d+1)
159 | eps <- rnorm(N)
160 | y <- cbind(1, X) \%*\% theta + eps
161 | dat <- data.frame(y=y, x=X)
162 | sgd.theta <- sgd(y ~ ., data=dat, model="lm")
163 | sprintf("Mean squared error: \%0.3f", mean((theta - as.numeric(sgd.theta$coefficients))^2))
164 | }
165 | \references{
166 | John Duchi, Elad Hazan, and Yoram Singer. Adaptive subgradient methods for
167 | online learning and stochastic optimization. \emph{Journal of Machine
168 | Learning Research}, 12:2121-2159, 2011.
169 | 
170 | Yurii Nesterov. A method for solving a convex programming problem with
171 | convergence rate \eqn{O(1/k^2)}. \emph{Soviet Mathematics Doklady},
172 | 27(2):372-376, 1983.
173 | 
174 | Boris T. Polyak. Some methods of speeding up the convergence of iteration
175 | methods. \emph{USSR Computational Mathematics and Mathematical Physics},
176 | 4(5):1-17, 1964.
177 | 
178 | Boris T. Polyak and Anatoli B. Juditsky. Acceleration of stochastic
179 | approximation by averaging. \emph{SIAM Journal on Control and Optimization},
180 | 30(4):838-855, 1992.
181 | 
182 | Herbert Robbins and Sutton Monro. A stochastic approximation method.
183 | \emph{The Annals of Mathematical Statistics}, pp. 400-407, 1951.
184 | 
185 | Panos Toulis, Jason Rennie, and Edoardo M. Airoldi, "Statistical analysis of
186 | stochastic gradient methods for generalized linear models", In
187 | \emph{Proceedings of the 31st International Conference on Machine Learning},
188 | 2014.
189 | 
190 | Panos Toulis, Dustin Tran, and Edoardo M. Airoldi, "Stability and optimality
191 | in stochastic gradient descent", arXiv preprint arXiv:1505.02417, 2015.
192 | 
193 | Wei Xu. Towards optimal one pass large scale learning with averaged
194 | stochastic gradient descent. arXiv preprint arXiv:1107.2490, 2011.
195 | }
196 | \author{
197 | Dustin Tran, Tian Lan, Panos Toulis, Ye Kuang, Edoardo Airoldi
198 | }
199 | 
--------------------------------------------------------------------------------
/man/winequality.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data-winequality.R
3 | \docType{data}
4 | \name{winequality}
5 | \alias{winequality}
6 | \title{Wine quality data of white wine samples from Portugal}
7 | \format{
8 | A data frame with 4898 rows and 12 variables
9 | \itemize{
10 | \item fixed acidity.
11 | \item volatile acidity.
12 | \item citric acid.
13 | \item residual sugar.
14 | \item chlorides.
15 | \item free sulfur dioxide.
16 | \item total sulfur dioxide.
17 | \item density.
18 | \item pH.
19 | \item sulphates.
20 | \item alcohol.
21 | \item quality (score between 0 and 10).
22 | }
23 | }
24 | \source{
25 | \url{https://archive.ics.uci.edu/ml/datasets/Wine+Quality}
26 | }
27 | \usage{
28 | winequality
29 | }
30 | \description{
31 | This dataset is a collection of white "Vinho Verde" wine
32 | samples from the north of Portugal. Due to privacy and logistic
33 | issues, only physicochemical (inputs) and sensory (the output)
34 | variables are available (e.g. there is no data about grape types,
35 | wine brand, wine selling price, etc.).
36 | }
37 | \keyword{datasets}
38 | 
--------------------------------------------------------------------------------
/src/Makevars:
--------------------------------------------------------------------------------
1 | PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
--------------------------------------------------------------------------------
/src/Makevars.win:
--------------------------------------------------------------------------------
1 | PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
--------------------------------------------------------------------------------
/src/RcppExports.cpp:
--------------------------------------------------------------------------------
1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand
2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
3 | 
4 | #include <RcppArmadillo.h>
5 | #include <Rcpp.h>
6 | 
7 | using namespace Rcpp;
8 | 
9 | #ifdef RCPP_USE_GLOBAL_ROSTREAM
10 | Rcpp::Rostream<true>& Rcpp::Rcout = Rcpp::Rcpp_cout_get();
11 | Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
12 | #endif
13 | 
14 | // run
15 | Rcpp::List run(SEXP dataset, SEXP model_control, SEXP sgd_control);
16 | RcppExport SEXP _sgd_run(SEXP datasetSEXP, SEXP model_controlSEXP, SEXP sgd_controlSEXP) {
17 | BEGIN_RCPP
18 | Rcpp::RObject rcpp_result_gen;
19 | Rcpp::RNGScope rcpp_rngScope_gen;
20 | Rcpp::traits::input_parameter< SEXP >::type dataset(datasetSEXP);
21 | Rcpp::traits::input_parameter< SEXP >::type model_control(model_controlSEXP);
22 | Rcpp::traits::input_parameter< SEXP >::type sgd_control(sgd_controlSEXP);
23 | rcpp_result_gen = Rcpp::wrap(run(dataset, model_control, sgd_control));
24 | return rcpp_result_gen;
25 | END_RCPP
26 | }
27 | 
28 | static const R_CallMethodDef CallEntries[] = {
29 | {"_sgd_run", (DL_FUNC) &_sgd_run, 3},
30 | {NULL, NULL, 0}
31 | };
32 | 
33 | RcppExport void R_init_sgd(DllInfo *dll) {
34 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
35 | R_useDynamicSymbols(dll, FALSE);
36 | }
37 | 
--------------------------------------------------------------------------------
/src/basedef.h:
-------------------------------------------------------------------------------- 1 | #ifndef BASEDEF_H 2 | #define BASEDEF_H 3 | 4 | #define BOOST_DISABLE_ASSERTS true 5 | 6 | #include "RcppArmadillo.h" 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | using namespace arma; 19 | 20 | #endif 21 | -------------------------------------------------------------------------------- /src/data/data_point.h: -------------------------------------------------------------------------------- 1 | #ifndef DATA_DATA_POINT_H 2 | #define DATA_DATA_POINT_H 3 | 4 | #include "../basedef.h" 5 | 6 | struct data_point { 7 | /** 8 | * Collection of an individual observation's covariates and response. 9 | * 10 | * @param x covariates for a single sample 11 | * @param y response value for a single sample 12 | * @param idx index of that data point into the data set 13 | */ 14 | data_point(const mat& x, double y, unsigned idx) : x(x), y(y), idx(idx) {} 15 | 16 | mat x; 17 | double y; 18 | unsigned idx; 19 | }; 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /src/data/data_set.h: -------------------------------------------------------------------------------- 1 | #ifndef DATA_DATA_SET_H 2 | #define DATA_DATA_SET_H 3 | 4 | #include "../basedef.h" 5 | #include "data_point.h" 6 | 7 | // wrapper around R's RNG such that we get a uniform distribution over 8 | // [0,n) as required by the STL algorithm 9 | inline int randWrapper(const int n) { return floor(unif_rand()*n); } 10 | 11 | class data_set { 12 | /** 13 | * Collection of all data points. 14 | * 15 | * @param xpMat pointer to bigmat if using bigmatrix 16 | * @param Xx design matrix if not using bigmatrix 17 | * @param Yy response values 18 | * @param n_passes number of passes for data 19 | * @param big whether using bigmatrix or not 20 | * @param shuffle whether to shuffle data set or not 21 | */ 22 | public: 23 | data_set(const SEXP& xpMat, const mat& Xx, const mat& Yy, unsigned n_passes, 24 | bool big, bool shuffle) : Y(Yy), big(big), xpMat_(xpMat), shuffle_(shuffle) { 25 | if (!big) { 26 | X = Xx; 27 | n_samples = X.n_rows; 28 | n_features = X.n_cols; 29 | } else { 30 | n_samples = xpMat_->nrow(); 31 | n_features = xpMat_->ncol(); 32 | } 33 | if (shuffle_) { 34 | idxvec_ = std::vector(n_samples*n_passes); 35 | for (unsigned i = 0; i < n_passes; ++i) { 36 | for (unsigned j = 0; j < n_samples; ++j) { 37 | idxvec_[i * n_samples + j] = j; 38 | } 39 | // std::random_shuffle(idxvec_.begin() + i * n_samples, 40 | // idxvec_.begin() + (i + 1) * n_samples, 41 | // randWrapper); 42 | std::random_device rd; 43 | std::mt19937 gen(rd()); 44 | std::shuffle(idxvec_.begin() + i * n_samples, 45 | idxvec_.begin() + (i + 1) * n_samples, 46 | gen); 47 | } 48 | } 49 | } 50 | 51 | // Index to the @t th data point 52 | data_point get_data_point(unsigned t) const { 53 | t = idxmap_(t - 1); 54 | mat xt; 55 | if (!big) { 56 | xt = mat(X.row(t)); 57 | } else { 58 | MatrixAccessor matacess(*xpMat_); 59 | xt = mat(1, n_features); 60 | for (unsigned i=0; i < n_features; ++i) { 61 | xt(0, i) = matacess[i][t]; 62 | } 63 | } 64 | double yt = Y(t); 65 | return data_point(xt, yt, t); 66 | } 67 | 68 | mat X; 69 | mat Y; 70 | bool big; 71 | unsigned n_samples; 72 | unsigned n_features; 73 | 74 | private: 75 | // index to data point for each iteration 76 | unsigned idxmap_(unsigned t) const { 77 | if (shuffle_) { 78 | return(idxvec_[t]); 79 | } else { 80 
| return(t % n_samples); 81 | } 82 | } 83 | 84 | Rcpp::XPtr xpMat_; 85 | std::vector idxvec_; 86 | bool shuffle_; 87 | }; 88 | 89 | #endif 90 | -------------------------------------------------------------------------------- /src/learn-rate/base_learn_rate.h: -------------------------------------------------------------------------------- 1 | #ifndef LEARN_RATE_BASE_LEARN_RATE_H 2 | #define LEARN_RATE_BASE_LEARN_RATE_H 3 | 4 | #include "../basedef.h" 5 | #include "learn_rate_value.h" 6 | 7 | class base_learn_rate { 8 | /** 9 | * Base class for learning rates 10 | */ 11 | public: 12 | base_learn_rate() {} 13 | 14 | virtual const learn_rate_value& operator()(unsigned t, const mat& grad_t) = 0; 15 | }; 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /src/learn-rate/ddim_learn_rate.h: -------------------------------------------------------------------------------- 1 | #ifndef LEARN_RATE_DDIM_LEARN_RATE_H 2 | #define LEARN_RATE_DDIM_LEARN_RATE_H 3 | 4 | #include "../basedef.h" 5 | #include "base_learn_rate.h" 6 | #include "learn_rate_value.h" 7 | 8 | class ddim_learn_rate : public base_learn_rate { 9 | /** 10 | * d-dimensional learning rate, which includes as special cases popular 11 | * learning rates: 12 | * adagrad: a=1, b=1, c=1/2, eta=1, eps=1e-6 13 | * d-dim: a=0, b=1, c=1, eta=1, eps=1e-6 14 | * rmsprop: a=gamma, b=1-gamma, c=1/2, eta=1, eps=1e-6 15 | * 16 | * @param d dimension of learning rate 17 | * @param eta scale factor in numerator 18 | * @param a factor to weigh old gradient information 19 | * @param b factor to weigh new gradient information 20 | * @param c power to exponentiate by 21 | * @param eps value to prevent division by zero 22 | */ 23 | public: 24 | ddim_learn_rate(unsigned d, double eta, double a, double b, double c, 25 | double eps) : 26 | d_(d), Idiag_(ones(d)), eta_(eta), a_(a), b_(b), c_(c), eps_(eps), 27 | v_(1, d) {} 28 | 29 | virtual const learn_rate_value& operator()(unsigned t, const mat& grad_t) { 30 | for (unsigned i = 0; i < d_; ++i) { 31 | Idiag_.at(i) = a_ * Idiag_.at(i) + b_ * pow(grad_t.at(i, 0), 2); 32 | } 33 | 34 | for (unsigned i = 0; i < d_; ++i) { 35 | if (std::abs(Idiag_.at(i)) > 1e-8) { 36 | v_.at(i) = eta_ / pow(Idiag_.at(i) + eps_, c_); 37 | } 38 | else { 39 | v_.at(i) = Idiag_.at(i); 40 | } 41 | } 42 | return v_; 43 | } 44 | 45 | private: 46 | unsigned d_; 47 | vec Idiag_; 48 | double eta_; 49 | double a_; 50 | double b_; 51 | double c_; 52 | double eps_; 53 | learn_rate_value v_; 54 | }; 55 | 56 | #endif 57 | -------------------------------------------------------------------------------- /src/learn-rate/learn_rate_value.h: -------------------------------------------------------------------------------- 1 | #ifndef LEARN_RATE_LEARN_RATE_VALUE_H 2 | #define LEARN_RATE_LEARN_RATE_VALUE_H 3 | 4 | #include "../basedef.h" 5 | 6 | class learn_rate_value { 7 | /** 8 | * Object to return for all learning rate classes 9 | * 10 | * @param t type of the value; 0 is scalar, 1 is vector, 2 is matrix 11 | * @param d dimension of parameters 12 | */ 13 | public: 14 | learn_rate_value(unsigned type, unsigned d) : type_(type) { 15 | if (type_ == 0) { 16 | lr_scalar_ = 1; 17 | } else if (type_ == 1) { 18 | lr_vector_ = ones(d); 19 | } else { 20 | lr_matrix_ = eye(d, d); 21 | } 22 | } 23 | 24 | double& at(unsigned i) { 25 | if (type_ == 1) { 26 | return lr_vector_.at(i); 27 | } else if (type_ == 2) { 28 | return lr_matrix_.at(i); 29 | } else { 30 | Rcpp::Rcout << 31 | "Indexing vector/matrix entry when 
learning rate type is neither" << 32 | std::endl; 33 | return lr_scalar_; 34 | } 35 | } 36 | 37 | // const double& at(unsigned i) const { 38 | // return at(i); 39 | // } 40 | 41 | double& at(unsigned i, unsigned j) { 42 | if (type_ == 2) { 43 | return lr_matrix_.at(i, j); 44 | } else { 45 | Rcpp::Rcout << 46 | "Indexing matrix entry when learning rate type is not matrix" << 47 | std::endl; 48 | return lr_scalar_; 49 | } 50 | } 51 | 52 | // const double& at(unsigned i, unsigned j) const { 53 | // return at(i, j); 54 | // } 55 | 56 | // Take average for usage in implicit SGD 57 | double mean() const { 58 | // double average = 0.0; 59 | if (type_ == 0) { 60 | return lr_scalar_; 61 | } else if (type_ == 1) { 62 | return arma::mean(lr_vector_); 63 | } else { 64 | return arma::mean(arma::mean(lr_matrix_)); 65 | } 66 | } 67 | 68 | learn_rate_value operator=(double scalar) { 69 | if (type_ == 0) { 70 | lr_scalar_ = scalar; 71 | } else { 72 | Rcpp::Rcout << 73 | "Setting learning rate value to scalar when its type is not" << 74 | std::endl; 75 | } 76 | return *this; 77 | } 78 | 79 | learn_rate_value operator=(const vec& vector) { 80 | if (type_ == 1) { 81 | lr_vector_ = vector; 82 | } else { 83 | Rcpp::Rcout << 84 | "Setting learning rate value to vector when its type is not" << 85 | std::endl; 86 | } 87 | return *this; 88 | } 89 | 90 | learn_rate_value operator=(const mat& matrix) { 91 | if (type_ == 2) { 92 | lr_matrix_ = matrix; 93 | } else { 94 | Rcpp::Rcout << 95 | "Setting learning rate value to matrix when its type is not" << 96 | std::endl; 97 | } 98 | return *this; 99 | } 100 | 101 | mat operator*(const mat& matrix) { 102 | if (type_ == 0) { 103 | return lr_scalar_ * matrix; 104 | } else if (type_ == 1) { 105 | //int m = matrix.n_rows; 106 | ////int n = matrix.n_cols; 107 | //mat out = zeros(m, 1); 108 | //for (unsigned i = 0; i < m; ++i) { 109 | // //for (unsigned j = 0; j < n; ++j) { 110 | // //out.at(i) += lr_vector_.at(i) * matrix.at(i, 0); 111 | // //} 112 | // out.at(i, 0) = lr_vector_.at(i) * matrix.at(i, 0); 113 | //} 114 | //return out; 115 | //return diagmat(lr_vector_) * matrix; 116 | return mat(lr_vector_) % matrix; 117 | } else { 118 | return lr_matrix_ * matrix; 119 | } 120 | } 121 | 122 | bool operator<(const double thres) { 123 | if (type_ == 0) { 124 | return lr_scalar_ < thres; 125 | } else if (type_ == 1) { 126 | return all(lr_vector_ < thres); 127 | } else{ 128 | return all(diagvec(lr_matrix_) < thres); 129 | } 130 | } 131 | 132 | bool operator>(const double thres) { 133 | return !(*this < thres); 134 | } 135 | 136 | private: 137 | unsigned type_; 138 | double lr_scalar_; 139 | vec lr_vector_; 140 | mat lr_matrix_; 141 | }; 142 | 143 | #endif 144 | -------------------------------------------------------------------------------- /src/learn-rate/onedim_eigen_learn_rate.h: -------------------------------------------------------------------------------- 1 | #ifndef LEARN_RATE_ONEDIM_EIGEN_LEARN_RATE_H 2 | #define LEARN_RATE_ONEDIM_EIGEN_LEARN_RATE_H 3 | 4 | #include "../basedef.h" 5 | #include "base_learn_rate.h" 6 | #include "learn_rate_value.h" 7 | 8 | class onedim_eigen_learn_rate : public base_learn_rate { 9 | /** 10 | * One-dimensional learning rate to parameterize a diagonal matrix 11 | * 12 | * @param d dimension of learning rate 13 | */ 14 | public: 15 | onedim_eigen_learn_rate(unsigned d) : 16 | d_(d), v_(0, 1) {} 17 | 18 | virtual const learn_rate_value& operator()(unsigned t, const mat& grad_t) { 19 | double sum_eigen = 0; 20 | for (unsigned i = 0; i < d_; ++i) { 
21 | sum_eigen += pow(grad_t.at(i, 0), 2); 22 | } 23 | // based on the bound of min_eigen <= d / trace(Fisher_matrix) 24 | double min_eigen_upper = sum_eigen / d_; 25 | v_ = 1. / (min_eigen_upper * t); 26 | return v_; 27 | } 28 | 29 | private: 30 | unsigned d_; 31 | learn_rate_value v_; 32 | }; 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /src/learn-rate/onedim_learn_rate.h: -------------------------------------------------------------------------------- 1 | #ifndef LEARN_RATE_ONEDIM_LEARN_RATE_H 2 | #define LEARN_RATE_ONEDIM_LEARN_RATE_H 3 | 4 | #include "../basedef.h" 5 | #include "base_learn_rate.h" 6 | #include "learn_rate_value.h" 7 | 8 | class onedim_learn_rate : public base_learn_rate { 9 | /** 10 | * One-dimensional (scalar) learning rate, following Xu 11 | * 12 | * @param scale scale factor in numerator 13 | * @param gamma scale factor in both numerator and denominator 14 | * @param alpha scale factor in denominator 15 | * @param c power to exponentiate by 16 | */ 17 | public: 18 | onedim_learn_rate(double scale, double gamma, double alpha, double c) : 19 | scale_(scale), gamma_(gamma), alpha_(alpha), c_(c), v_(0, 1) {} 20 | 21 | virtual const learn_rate_value& operator()(unsigned t, const mat& grad_t) { 22 | v_ = scale_ * gamma_ * pow(1 + alpha_ * gamma_ * t, -c_); 23 | return v_; 24 | } 25 | 26 | private: 27 | double scale_; 28 | double gamma_; 29 | double alpha_; 30 | double c_; 31 | learn_rate_value v_; 32 | }; 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /src/model/base_model.h: -------------------------------------------------------------------------------- 1 | #ifndef MODEL_BASE_MODEL_H 2 | #define MODEL_BASE_MODEL_H 3 | 4 | #include "../basedef.h" 5 | #include "../data/data_point.h" 6 | 7 | class base_model { 8 | /** 9 | * Base class for models 10 | * 11 | * @param model attributes affiliated with model as R type 12 | */ 13 | public: 14 | base_model(Rcpp::List model) { 15 | name_ = Rcpp::as(model["name"]); 16 | lambda1_ = Rcpp::as(model["lambda1"]); 17 | lambda2_ = Rcpp::as(model["lambda2"]); 18 | } 19 | 20 | std::string name() const { 21 | return name_; 22 | } 23 | 24 | mat gradient(unsigned t, const mat& theta_old, const data_set& data) const; 25 | mat gradient_penalty(const mat& theta) const { 26 | return lambda1_*sign(theta) + lambda2_*theta; 27 | } 28 | 29 | // Functions for implicit update 30 | // Following the JSS paper, we assume C_n = identity, lambda = 1, and use ksi 31 | // rather than s_n, which is slightly less efficient. 
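// In consequence, each implicit update reduces to a one-dimensional search
// in ksi: scale_factor() below is the function whose fixed point is sought,
// and the two derivative routines support derivative-based refinement of
// that search.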
32 | // ell'(x^T theta + at x^T grad(penalty) + ksi ||x||^2)
33 | double scale_factor(double ksi, double at, const data_point& data_pt, const
34 | mat& theta_old, double normx) const;
35 | // d/d(ksi) ell'
36 | double scale_factor_first_deriv(double ksi, double at, const data_point&
37 | data_pt, const mat& theta_old, double normx) const;
38 | // d^2/d(ksi)^2 ell'
39 | double scale_factor_second_deriv(double ksi, double at, const data_point&
40 | data_pt, const mat& theta_old, double normx) const;
41 | 
42 | protected:
43 | std::string name_;
44 | double lambda1_;
45 | double lambda2_;
46 | };
47 | 
48 | #endif
49 | 
--------------------------------------------------------------------------------
/src/model/cox_model.h:
--------------------------------------------------------------------------------
1 | #ifndef MODEL_COX_MODEL_H
2 | #define MODEL_COX_MODEL_H
3 | 
4 | #include "../basedef.h"
5 | #include "../data/data_point.h"
6 | #include "base_model.h"
7 | 
8 | class cox_model : public base_model {
9 | /**
10 |  * Cox proportional hazards model
11 |  *
12 |  * @param model attributes affiliated with model as R type
13 |  */
14 | public:
15 | cox_model(Rcpp::List model) : base_model(model) {}
16 | 
17 | mat gradient(unsigned t, const mat& theta_old, const data_set& data)
18 | const {
19 | data_point data_pt = data.get_data_point(t);
20 | unsigned j = data_pt.idx;
21 | 
22 | // assuming data points fail in order, i.e., risk set R_i={i,i+1,...,n}
23 | vec xi = exp(data.X * theta_old);
24 | vec h = zeros(j);
25 | double sum_xi = 0;
26 | // walk from the smallest risk set (i = j-1) down to i = 0, growing the
27 | // cumulative sum of xi as the risk set expands
28 | for (int i = j-1; i >= 0; --i) {
29 | // h_i = d_i/sum(xi[i:n])
30 | if (i == j-1) {
31 | for (int k = i; k < data.n_samples; ++k) {
32 | sum_xi += xi(k);
33 | }
34 | } else {
35 | sum_xi += xi(i);
36 | }
37 | h(i) = data.Y(i)/sum_xi;
38 | }
39 | double r = data_pt.y - xi(j) * sum(h);
40 | return (r * data_pt.x).t();
41 | }
42 | 
43 | // TODO
44 | bool rank;
45 | };
46 | 
47 | #endif
48 | 
--------------------------------------------------------------------------------
/src/model/glm/glm_family.h:
--------------------------------------------------------------------------------
1 | #ifndef MODEL_GLM_FAMILY_H
2 | #define MODEL_GLM_FAMILY_H
3 | 
4 | #include "../../basedef.h"
5 | 
6 | class base_family;
7 | class gaussian_family;
8 | class poisson_family;
9 | class binomial_family;
10 | class gamma_family;
11 | 
12 | class base_family {
13 | /* Base class from which all exponential family classes inherit from */
14 | public:
15 | virtual double variance(double u) const = 0;
16 | virtual double deviance(const mat& y, const mat& mu, const mat& wt) const = 0;
17 | };
18 | 
19 | class gaussian_family : public base_family {
20 | public:
21 | virtual double variance(double u) const {
22 | return 1.;
23 | }
24 | 
25 | virtual double deviance(const mat& y, const mat& mu, const mat& wt) const {
26 | return sum(vec(wt % ((y-mu) % (y-mu))));
27 | }
28 | };
29 | 
30 | class poisson_family : public base_family {
31 | public:
32 | virtual double variance(double u) const {
33 | return u;
34 | }
35 | 
36 | virtual double deviance(const mat& y, const mat& mu, const mat& wt) const {
37 | vec r = vec(mu % wt);
38 | for (unsigned i = 0; i < r.n_elem; ++i) {
39 | if (y(i) > 0.) {
40 | r(i) = wt(i) * (y(i) * log(y(i)/mu(i)) - (y(i) - mu(i)));
41 | }
42 | }
43 | return sum(2. * r);
44 | }
45 | };
46 | 
47 | class binomial_family : public base_family {
48 | public:
49 | virtual double variance(double u) const {
50 | return u * (1.
- u); 51 | } 52 | 53 | // In R the dev.resids of Binomial family is not exposed. 54 | // Found one [here](http://pages.stat.wisc.edu/~st849-1/lectures/GLMDeviance.pdf) 55 | virtual double deviance(const mat& y, const mat& mu, const mat& wt) const { 56 | vec r(y.n_elem); 57 | for (unsigned i = 0; i < r.n_elem; ++i) { 58 | r(i) = 2. * wt(i) * (y_log_y(y(i), mu(i)) + y_log_y(1.-y(i), 1.-mu(i))); 59 | } 60 | return sum(r); 61 | } 62 | 63 | private: 64 | double y_log_y(double y, double mu) const { 65 | return (y) ? (y * log(y/mu)) : 0.; 66 | } 67 | }; 68 | 69 | class gamma_family : public base_family { 70 | public: 71 | virtual double variance(double u) const { 72 | return pow(u, 2); 73 | } 74 | 75 | virtual double deviance(const mat& y, const mat& mu, const mat& wt) const { 76 | vec r(y.n_elem); 77 | for (unsigned i = 0; i < r.n_elem; ++i) { 78 | r(i) = -2. * wt(i) * (log(y(i) ? y(i)/mu(i) : 1.) - (y(i)-mu(i)) / mu(i)); 79 | } 80 | return sum(r); 81 | } 82 | }; 83 | 84 | #endif 85 | -------------------------------------------------------------------------------- /src/model/glm/glm_transfer.h: -------------------------------------------------------------------------------- 1 | #ifndef MODEL_GLM_TRANSFER_H 2 | #define MODEL_GLM_TRANSFER_H 3 | 4 | #include "../../basedef.h" 5 | 6 | class base_transfer; 7 | class identity_transfer; 8 | class inverse_transfer; 9 | class exp_transfer; 10 | class logistic_transfer; 11 | 12 | class base_transfer { 13 | /* Base class from which all transfer function classes inherit from */ 14 | public: 15 | virtual double transfer(double u) const = 0; 16 | 17 | virtual mat transfer(const mat& u) const { 18 | mat result = mat(u); 19 | for (unsigned i = 0; i < result.n_rows; ++i) { 20 | result(i, 0) = transfer(u(i, 0)); 21 | } 22 | return result; 23 | } 24 | 25 | virtual double link(double u) const = 0; 26 | 27 | virtual double first_derivative(double u) const = 0; 28 | virtual double second_derivative(double u) const = 0; 29 | virtual bool valideta(double eta) const = 0; 30 | }; 31 | 32 | class identity_transfer : public base_transfer { 33 | public: 34 | virtual double transfer(double u) const { 35 | return u; 36 | } 37 | 38 | virtual double link(double u) const { 39 | return u; 40 | } 41 | 42 | virtual double first_derivative(double u) const { 43 | return 1.; 44 | } 45 | 46 | virtual double second_derivative(double u) const { 47 | return 0.; 48 | } 49 | 50 | virtual bool valideta(double eta) const { 51 | return true; 52 | } 53 | }; 54 | 55 | class inverse_transfer : public base_transfer { 56 | public: 57 | virtual double transfer(double u) const { 58 | if (valideta(u)) { 59 | return -1. / u; 60 | } 61 | return 0.; 62 | } 63 | 64 | virtual double link(double u) const { 65 | if (u) { 66 | return -1. / u; 67 | } 68 | return 0.; 69 | } 70 | 71 | virtual double first_derivative(double u) const { 72 | if (valideta(u)) { 73 | return 1. / pow(u, 2); 74 | } 75 | return 0.; 76 | } 77 | 78 | virtual double second_derivative(double u) const { 79 | if (valideta(u)) { 80 | return -2. / pow(u, 3); 81 | } 82 | return 0.; 83 | } 84 | 85 | virtual bool valideta(double eta) const { 86 | return eta != 0; 87 | } 88 | }; 89 | 90 | class exp_transfer : public base_transfer { 91 | public: 92 | virtual double transfer(double u) const { 93 | return exp(u); 94 | } 95 | 96 | virtual double link(double u) const { 97 | if (u > 0.) 
{
98 | return log(u);
99 | }
100 | return 0.;
101 | }
102 | 
103 | virtual double first_derivative(double u) const {
104 | return exp(u);
105 | }
106 | 
107 | virtual double second_derivative(double u) const {
108 | return exp(u);
109 | }
110 | 
111 | virtual bool valideta(double eta) const {
112 | return true;
113 | }
114 | };
115 | 
116 | class logistic_transfer : public base_transfer {
117 | public:
118 | virtual double transfer(double u) const {
119 | return sigmoid(u);
120 | }
121 | 
122 | virtual double link(double u) const {
123 | if (u > 0. && u < 1.) {
124 | return log(u / (1. - u));
125 | }
126 | return 0.;
127 | }
128 | 
129 | virtual double first_derivative(double u) const {
130 | double sig = sigmoid(u);
131 | return sig * (1. - sig);
132 | }
133 | 
134 | virtual double second_derivative(double u) const {
135 | double sig = sigmoid(u);
136 | // h''(u) = h(1-h)(1-2h) = 2h^3 - 3h^2 + h
137 | return 2*pow(sig, 3) - 3*pow(sig, 2) + sig;
138 | }
139 | 
140 | virtual bool valideta(double eta) const {
141 | return true;
142 | }
143 | 
144 | private:
145 | double sigmoid(double u) const {
146 | return 1. / (1. + exp(-u));
147 | }
148 | };
149 | 
150 | #endif
151 | 
--------------------------------------------------------------------------------
/src/model/glm_model.h:
--------------------------------------------------------------------------------
1 | #ifndef MODEL_GLM_MODEL_H
2 | #define MODEL_GLM_MODEL_H
3 | 
4 | #include "../basedef.h"
5 | #include "../data/data_point.h"
6 | #include "base_model.h"
7 | #include "glm/glm_family.h"
8 | #include "glm/glm_transfer.h"
9 | 
10 | class glm_model : public base_model {
11 | /**
12 |  * Generalized linear models
13 |  *
14 |  * @param model attributes affiliated with model as R type
15 |  */
16 | public:
17 | glm_model(Rcpp::List model) : base_model(model) {
18 | family_ = Rcpp::as<std::string>(model["family"]);
19 | if (family_ == "gaussian") {
20 | family_obj_ = new gaussian_family();
21 | } else if (family_ == "poisson") {
22 | family_obj_ = new poisson_family();
23 | } else if (family_ == "binomial") {
24 | family_obj_ = new binomial_family();
25 | } else if (family_ == "gamma") {
26 | family_obj_ = new gamma_family();
27 | } else {
28 | Rcpp::Rcout << "warning: model not implemented yet" << std::endl;
29 | }
30 | transfer_ = Rcpp::as<std::string>(model["transfer"]);
31 | if (transfer_ == "identity") {
32 | transfer_obj_ = new identity_transfer();
33 | } else if (transfer_ == "exp") {
34 | transfer_obj_ = new exp_transfer();
35 | } else if (transfer_ == "inverse") {
36 | transfer_obj_ = new inverse_transfer();
37 | } else if (transfer_ == "logistic") {
38 | transfer_obj_ = new logistic_transfer();
39 | }
40 | }
41 | 
42 | mat gradient(unsigned t, const mat& theta_old, const data_set& data)
43 | const {
44 | data_point data_pt = data.get_data_point(t);
45 | return ((data_pt.y - h_transfer(dot(data_pt.x, theta_old))) *
46 | data_pt.x).t() - gradient_penalty(theta_old);
47 | }
48 | 
49 | double g_link(double u) const {
50 | return transfer_obj_->link(u);
51 | }
52 | 
53 | double h_transfer(double u) const {
54 | return transfer_obj_->transfer(u);
55 | }
56 | 
57 | mat h_transfer(const mat& u) const {
58 | return transfer_obj_->transfer(u);
59 | }
60 | 
61 | double h_first_deriv(double u) const {
62 | return transfer_obj_->first_derivative(u);
63 | }
64 | 
65 | double h_second_deriv(double u) const {
66 | return transfer_obj_->second_derivative(u);
67 | }
68 | 
69 | bool valideta(double eta) const {
70 | return transfer_obj_->valideta(eta);
71 | }
72 | 
73 | double variance(double u) const {
74 | return family_obj_->variance(u);
75 | }
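// Model deviance, delegated to the family object; wt holds the observation
// weights.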
76 | 77 | double deviance(const mat& y, const mat& mu, const mat& wt) const { 78 | return family_obj_->deviance(y, mu, wt); 79 | } 80 | 81 | std::string family() const { 82 | return family_; 83 | } 84 | 85 | std::string transfer() const { 86 | return transfer_; 87 | } 88 | 89 | // Functions for implicit update 90 | double scale_factor(double ksi, double at, const data_point& data_pt, const 91 | mat& theta_old, double normx) const { 92 | return data_pt.y - h_transfer( 93 | dot(theta_old, data_pt.x) - 94 | at * dot(gradient_penalty(theta_old), data_pt.x) + 95 | ksi * normx); 96 | } 97 | 98 | double scale_factor_first_deriv(double ksi, double at, const data_point& 99 | data_pt, const mat& theta_old, double normx) const { 100 | return h_first_deriv( 101 | dot(theta_old, data_pt.x) - 102 | at * dot(gradient_penalty(theta_old), data_pt.x) + 103 | ksi * normx) * normx; 104 | } 105 | 106 | double scale_factor_second_deriv(double ksi, double at, const data_point& 107 | data_pt, const mat& theta_old, double normx) const { 108 | return h_second_deriv( 109 | dot(theta_old, data_pt.x) - 110 | at * dot(gradient_penalty(theta_old), data_pt.x) + 111 | ksi * normx) * normx * normx; 112 | } 113 | 114 | private: 115 | std::string family_; 116 | std::string transfer_; 117 | base_family* family_obj_; 118 | base_transfer* transfer_obj_; 119 | }; 120 | 121 | #endif 122 | -------------------------------------------------------------------------------- /src/model/gmm_model.h: -------------------------------------------------------------------------------- 1 | #ifndef MODEL_GMM_MODEL_H 2 | #define MODEL_GMM_MODEL_H 3 | 4 | #include "../basedef.h" 5 | #include "../data/data_point.h" 6 | #include "base_model.h" 7 | 8 | class gmm_model : public base_model { 9 | /** 10 | * Generalized method of moments 11 | * 12 | * @param model attributes affiliated with model as R type 13 | */ 14 | public: 15 | gmm_model(Rcpp::List model) : 16 | base_model(model), gr_(Rcpp::as(model["gr"])) { 17 | //if model["wmatrix"] == NULL { 18 | int k = 5; 19 | wmatrix_ = eye(k, k); 20 | //} else { 21 | // wmatrix_ = Rcpp::as(model["wmatrix"]); 22 | //} 23 | } 24 | 25 | mat gradient(unsigned t, const mat& theta_old, const data_set& data) 26 | const { 27 | data_point data_pt = data.get_data_point(t); 28 | // TODO y isn't necessary 29 | // TODO include weighting matrix 30 | Rcpp::NumericVector r_theta_old = 31 | Rcpp::as(Rcpp::wrap(theta_old)); 32 | Rcpp::NumericVector r_data_pt = 33 | Rcpp::as(Rcpp::wrap(data_pt.x)); 34 | Rcpp::NumericMatrix r_out = gr_(r_theta_old, r_data_pt); 35 | mat out = Rcpp::as(r_out); 36 | return -1. 
* out; // maximize the negative moment function 37 | } 38 | 39 | // TODO 40 | bool rank; 41 | 42 | private: 43 | mat wmatrix_; 44 | Rcpp::Function gr_; 45 | }; 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- /src/model/m-estimation/m_loss.h: -------------------------------------------------------------------------------- 1 | #ifndef MODEL_M_LOSS_H 2 | #define MODEL_M_LOSS_H 3 | 4 | #include "../../basedef.h" 5 | 6 | class base_loss; 7 | class huber_loss; 8 | 9 | class base_loss { 10 | /* Base class from which all loss function classes inherit from */ 11 | public: 12 | virtual double loss(double u, double lambda) const = 0; 13 | virtual double first_derivative(double u, double lambda) const = 0; 14 | virtual double second_derivative(double u, double lambda) const = 0; 15 | virtual double third_derivative(double u, double lambda) const = 0; 16 | virtual mat loss(const mat& u, double lambda) const { 17 | mat result = mat(u); 18 | for (unsigned i = 0; i < result.n_rows; ++i) { 19 | result(i, 0) = loss(u(i, 0), lambda); 20 | } 21 | return result; 22 | } 23 | virtual mat first_derivative(const mat& u, double lambda) const { 24 | mat result = mat(u); 25 | for (unsigned i = 0; i < result.n_rows; ++i) { 26 | result(i, 0) = first_derivative(u(i, 0), lambda); 27 | } 28 | return result; 29 | } 30 | }; 31 | 32 | class huber_loss : public base_loss { 33 | public: 34 | virtual double loss(double u, double lambda) const { 35 | if (std::abs(u) <= lambda) { 36 | return pow(u, 2)/2; 37 | } else { 38 | return lambda*std::abs(u) - pow(lambda, 2)/2; 39 | } 40 | } 41 | 42 | virtual double first_derivative(double u, double lambda) const { 43 | if (std::abs(u) <= lambda) { 44 | return u; 45 | } else { 46 | return lambda*sign(u); 47 | } 48 | } 49 | 50 | virtual double second_derivative(double u, double lambda) const { 51 | if (std::abs(u) <= lambda) { 52 | return 1.0; 53 | } else { 54 | return 0.0; 55 | } 56 | } 57 | 58 | virtual double third_derivative(double u, double lambda) const { 59 | return 0.0; 60 | } 61 | 62 | private: 63 | template 64 | double sign(const T& x) const { 65 | if (x > 0) { 66 | return 1.0; 67 | } else if (x < 0) { 68 | return -1.0; 69 | } else { 70 | return 0.0; 71 | } 72 | } 73 | }; 74 | 75 | #endif 76 | -------------------------------------------------------------------------------- /src/model/m_model.h: -------------------------------------------------------------------------------- 1 | #ifndef MODEL_M_MODEL_H 2 | #define MODEL_M_MODEL_H 3 | 4 | #include "../basedef.h" 5 | #include "../data/data_point.h" 6 | #include "base_model.h" 7 | #include "m-estimation/m_loss.h" 8 | 9 | class m_model : public base_model { 10 | /** 11 | * M-estimation 12 | * 13 | * @param model attributes affiliated with model as R type 14 | */ 15 | public: 16 | m_model(Rcpp::List model) : base_model(model) { 17 | loss_ = Rcpp::as(model["loss"]); 18 | if (loss_ == "huber") { 19 | loss_obj_ = new huber_loss(); 20 | } else { 21 | Rcpp::Rcout << "warning: loss not implemented yet" << std::endl; 22 | } 23 | lambda_ = 3.0; // default for huber loss 24 | } 25 | 26 | mat gradient(unsigned t, const mat& theta_old, const data_set& data) 27 | const { 28 | data_point data_pt = data.get_data_point(t); 29 | return (loss_obj_->first_derivative( 30 | data_pt.y - dot(data_pt.x, theta_old), lambda_) * data_pt.x).t() - 31 | gradient_penalty(theta_old); 32 | } 33 | 34 | std::string loss() const { 35 | return loss_; 36 | } 37 | 38 | // Functions for implicit update 39 | double 
scale_factor(double ksi, double at, const data_point& data_pt, const 40 | mat& theta_old, double normx) const { 41 | return loss_obj_->first_derivative( 42 | data_pt.y - dot(theta_old, data_pt.x) - 43 | at * dot(gradient_penalty(theta_old), data_pt.x) + 44 | ksi * normx, 45 | lambda_); 46 | } 47 | 48 | double scale_factor_first_deriv(double ksi, double at, const data_point& 49 | data_pt, const mat& theta_old, double normx) const { 50 | return loss_obj_->second_derivative( 51 | data_pt.y - dot(theta_old, data_pt.x) - 52 | at * dot(gradient_penalty(theta_old), data_pt.x) + 53 | ksi * normx, 54 | lambda_) * normx; 55 | } 56 | 57 | double scale_factor_second_deriv(double ksi, double at, const data_point& 58 | data_pt, const mat& theta_old, double normx) const { 59 | return loss_obj_->third_derivative( 60 | data_pt.y - dot(theta_old, data_pt.x) - 61 | at * dot(gradient_penalty(theta_old), data_pt.x) + 62 | ksi * normx, 63 | lambda_) * normx * normx; 64 | } 65 | 66 | private: 67 | std::string loss_; 68 | base_loss* loss_obj_; 69 | double lambda_; 70 | }; 71 | 72 | #endif 73 | -------------------------------------------------------------------------------- /src/post-process/cox_post_process.h: -------------------------------------------------------------------------------- 1 | #ifndef POST_PROCESS_COX_POST_PROCESS_H 2 | #define POST_PROCESS_COX_POST_PROCESS_H 3 | 4 | #include "../basedef.h" 5 | #include "../data/data_set.h" 6 | #include "../model/cox_model.h" 7 | 8 | template 9 | Rcpp::List post_process(const SGD& sgd, const data_set& data, 10 | const cox_model& model) { 11 | // TODO 12 | return Rcpp::List(); 13 | } 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /src/post-process/glm_post_process.h: -------------------------------------------------------------------------------- 1 | #ifndef POST_PROCESS_GLM_POST_PROCESS_H 2 | #define POST_PROCESS_GLM_POST_PROCESS_H 3 | 4 | #include "../basedef.h" 5 | #include "../data/data_set.h" 6 | #include "../model/glm_model.h" 7 | 8 | template 9 | Rcpp::List post_process(const SGD& sgd, const data_set& data, 10 | const glm_model& model) { 11 | // TODO 12 | return Rcpp::List(); 13 | } 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /src/post-process/gmm_post_process.h: -------------------------------------------------------------------------------- 1 | #ifndef POST_PROCESS_GMM_POST_PROCESS_H 2 | #define POST_PROCESS_GMM_POST_PROCESS_H 3 | 4 | #include "../basedef.h" 5 | #include "../data/data_set.h" 6 | #include "../model/gmm_model.h" 7 | 8 | // model.out: flag to include weighting matrix 9 | template 10 | Rcpp::List post_process(const SGD& sgd, const data_set& data, 11 | const gmm_model& model) { 12 | // TODO 13 | return Rcpp::List(); 14 | } 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /src/post-process/m_post_process.h: -------------------------------------------------------------------------------- 1 | #ifndef POST_PROCESS_M_POST_PROCESS_H 2 | #define POST_PROCESS_M_POST_PROCESS_H 3 | 4 | #include "../basedef.h" 5 | #include "../data/data_set.h" 6 | #include "../model/m_model.h" 7 | 8 | template 9 | Rcpp::List post_process(const SGD& sgd, const data_set& data, 10 | const m_model& model) { 11 | return Rcpp::List::create( 12 | Rcpp::Named("loss") = model.loss()); 13 | } 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /src/sgd.cpp: 
--------------------------------------------------------------------------------
/src/sgd.cpp:
--------------------------------------------------------------------------------
#include "basedef.h"
#include "data/data_set.h"
#include "model/cox_model.h"
#include "model/glm_model.h"
#include "model/gmm_model.h"
#include "model/m_model.h"
#include "post-process/cox_post_process.h"
#include "post-process/glm_post_process.h"
#include "post-process/gmm_post_process.h"
#include "post-process/m_post_process.h"
#include "sgd/explicit_sgd.h"
#include "sgd/implicit_sgd.h"
#include "sgd/momentum_sgd.h"
#include "sgd/nesterov_sgd.h"
#include "validity-check/validity_check.h"

// [[Rcpp::depends(BH)]]
// [[Rcpp::depends(RcppArmadillo)]]
// [[Rcpp::plugins(cpp1y)]]

template<typename MODEL, typename SGD>
Rcpp::List run(const data_set& data, MODEL& model, SGD& sgd);

/**
 * Runs the proposed model and stochastic gradient method on the data set
 *
 * @param dataset       data set
 * @param model_control attributes affiliated with model
 * @param sgd_control   attributes affiliated with sgd
 */
// [[Rcpp::export]]
Rcpp::List run(SEXP dataset, SEXP model_control, SEXP sgd_control) {
  Rcpp::List Dataset(dataset);
  Rcpp::List Model_control(model_control);
  Rcpp::List Sgd_control(sgd_control);
  if (Rcpp::as<bool>(Sgd_control["verbose"])) {
    Rcpp::Rcout << "Converting arguments from R to C++ types..." << std::endl;
  }

  // Construct data.
  data_set data(Dataset["bigmat"],
                Rcpp::as<mat>(Dataset["X"]),
                Rcpp::as<mat>(Dataset["Y"]),
                Rcpp::as<unsigned>(Sgd_control["npasses"]),
                Rcpp::as<bool>(Dataset["big"]),
                Rcpp::as<bool>(Sgd_control["shuffle"]));

  // Construct model.
  std::string model_name = Rcpp::as<std::string>(Model_control["name"]);
  if (model_name == "cox") {
    cox_model model(Model_control);
    // Construct stochastic gradient method.
    std::string sgd_name = Rcpp::as<std::string>(Sgd_control["method"]);
    if (sgd_name == "sgd" || sgd_name == "asgd") {
      explicit_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else if (sgd_name == "implicit" || sgd_name == "ai-sgd") {
      implicit_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else if (sgd_name == "momentum") {
      momentum_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else if (sgd_name == "nesterov") {
      nesterov_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else {
      Rcpp::Rcout << "error: stochastic gradient method not implemented" << std::endl;
      return Rcpp::List();
    }
  } else if (model_name == "lm" || model_name == "glm") {
    glm_model model(Model_control);
    // Construct stochastic gradient method.
    std::string sgd_name = Rcpp::as<std::string>(Sgd_control["method"]);
    if (sgd_name == "sgd" || sgd_name == "asgd") {
      explicit_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else if (sgd_name == "implicit" || sgd_name == "ai-sgd") {
      implicit_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else if (sgd_name == "momentum") {
      momentum_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else if (sgd_name == "nesterov") {
      nesterov_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else {
      Rcpp::Rcout << "error: stochastic gradient method not implemented" << std::endl;
      return Rcpp::List();
    }
  } else if (model_name == "gmm") {
    gmm_model model(Model_control);
    // Construct stochastic gradient method.
    std::string sgd_name = Rcpp::as<std::string>(Sgd_control["method"]);
    if (sgd_name == "sgd" || sgd_name == "asgd") {
      explicit_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else if (sgd_name == "implicit" || sgd_name == "ai-sgd") {
      implicit_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else if (sgd_name == "momentum") {
      momentum_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else if (sgd_name == "nesterov") {
      nesterov_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else {
      Rcpp::Rcout << "error: stochastic gradient method not implemented" << std::endl;
      return Rcpp::List();
    }
  } else if (model_name == "m") {
    m_model model(Model_control);
    // Construct stochastic gradient method.
    std::string sgd_name = Rcpp::as<std::string>(Sgd_control["method"]);
    if (sgd_name == "sgd" || sgd_name == "asgd") {
      explicit_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else if (sgd_name == "implicit" || sgd_name == "ai-sgd") {
      implicit_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else if (sgd_name == "momentum") {
      momentum_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else if (sgd_name == "nesterov") {
      nesterov_sgd sgd(Sgd_control, data.n_samples);
      return run(data, model, sgd);
    } else {
      Rcpp::Rcout << "error: stochastic gradient method not implemented" << std::endl;
      return Rcpp::List();
    }
  } else {
    Rcpp::Rcout << "error: model not implemented" << std::endl;
    return Rcpp::List();
  }

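  // The four branches above differ only in the model type; the #if 0 block
  // below records the author's note to that effect. One way to collapse them
  // (a hypothetical sketch, not part of the package) is a helper templated on
  // the model, so the method dispatch is written once:
  //
  //   template<typename MODEL>
  //   Rcpp::List dispatch_sgd(const data_set& data, MODEL& model,
  //                           Rcpp::List Sgd_control) {
  //     std::string sgd_name = Rcpp::as<std::string>(Sgd_control["method"]);
  //     if (sgd_name == "sgd" || sgd_name == "asgd") {
  //       explicit_sgd sgd(Sgd_control, data.n_samples);
  //       return run(data, model, sgd);
  //     }
  //     // ... remaining methods exactly as above ...
  //     return Rcpp::List();
  //   }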
#if 0
  // TODO The above duplicates code within the if-else statement.
  // Construct stochastic gradient method.
  std::string sgd_name = Rcpp::as<std::string>(Sgd_control["method"]);
  if (sgd_name == "sgd" || sgd_name == "asgd") {
    explicit_sgd sgd(Sgd_control, data.n_samples);
    return run(data, model, sgd);
  } else if (sgd_name == "implicit" || sgd_name == "ai-sgd") {
    implicit_sgd sgd(Sgd_control, data.n_samples);
    return run(data, model, sgd);
  } else if (sgd_name == "momentum") {
    momentum_sgd sgd(Sgd_control, data.n_samples);
    return run(data, model, sgd);
  } else if (sgd_name == "nesterov") {
    nesterov_sgd sgd(Sgd_control, data.n_samples);
    return run(data, model, sgd);
  } else {
    Rcpp::Rcout << "error: stochastic gradient method not implemented" << std::endl;
    return Rcpp::List();
  }
#endif
}

/**
 * Runs algorithm templated on the model and stochastic gradient method
 *
 * @param data data set
 * @tparam MODEL model class
 * @tparam SGD   stochastic gradient descent class
 */
template<typename MODEL, typename SGD>
Rcpp::List run(const data_set& data, MODEL& model, SGD& sgd) {
  unsigned n_samples = data.n_samples;
  // unsigned n_features = data.n_features;
  unsigned n_passes = sgd.get_n_passes();

  bool good_gradient = true;
  bool good_validity = true;
  bool averaging = false;
  if (sgd.name() == "asgd" || sgd.name() == "ai-sgd") {
    averaging = true;
  }

  // TODO these should really be vec's
  mat theta_new;
  mat theta_new_ave;
  mat theta_old = sgd.get_last_estimate();
  mat theta_old_ave = theta_old;

  unsigned max_iters = n_samples*n_passes;
  bool do_more_iterations = true;
  bool converged = false;
  if (sgd.verbose()) {
    Rcpp::Rcout << "Stochastic gradient method: " << sgd.name() << std::endl;
    Rcpp::Rcout << "SGD Start!" << std::endl;
  }
  for (unsigned t = 1; do_more_iterations; ++t) {
    theta_new = sgd.update(t, theta_old, data, model, good_gradient);

    if (averaging) {
      if (t != 1) {
        theta_new_ave = (1. - 1./(double)t) * theta_old_ave +
          1./((double)t) * theta_new;
      } else {
        theta_new_ave = theta_new;
      }
      sgd = theta_new_ave;
    } else {
      sgd = theta_new;
    }

    good_validity = validity_check(data, theta_new, good_gradient, t, model);
    if (!good_validity) {
      return Rcpp::List();
    }

    // Check whether the convergence threshold is satisfied.
    if (averaging) {
      converged = sgd.check_convergence(theta_new_ave, theta_old_ave);
    } else {
      converged = sgd.check_convergence(theta_new, theta_old);
    }
    if (converged) {
      sgd.end_early();
      do_more_iterations = false;
    }
    // Stop if hit maximum number of iterations.
    if (t == max_iters) {
      //if (!sgd.pass()) {
      //  Rcpp::Rcout
      //    << "Informational Message: The maximum number of iterations is "
      //    << "reached! The algorithm has not converged."
      //    << std::endl
      //    << "Estimates from this stochastic gradient descent are not "
      //    << "guaranteed to be meaningful."
      //    << std::endl;
      //}
      do_more_iterations = false;
    }

    // Set old to new updates and repeat.
    if (averaging) {
      theta_old_ave = theta_new_ave;
    }
    theta_old = theta_new;
  }

  Rcpp::List model_out = post_process(sgd, data, model);

  return Rcpp::List::create(
    Rcpp::Named("model") = model.name(),
    Rcpp::Named("coefficients") = sgd.get_last_estimate(),
    Rcpp::Named("converged") = converged,
    Rcpp::Named("estimates") = sgd.get_estimates(),
    Rcpp::Named("pos") = sgd.get_pos(),
    Rcpp::Named("model.out") = model_out);
}
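The averaging branch above implements the Polyak-Ruppert recursion
theta_ave_t = (1 - 1/t) * theta_ave_{t-1} + (1/t) * theta_t, which is
algebraically the running mean of all iterates; this is what distinguishes the
"asgd"/"ai-sgd" methods from their unaveraged counterparts. A standalone check
(plain C++; the iterate values are made up for illustration):

#include <cstdio>
#include <vector>

int main() {
  std::vector<double> iterates = {4.0, 2.0, 6.0, 8.0};
  double ave = 0.0, sum = 0.0;
  for (unsigned t = 1; t <= iterates.size(); ++t) {
    double theta_t = iterates[t-1];
    if (t == 1) ave = theta_t;                              // base case, as in run()
    else        ave = (1.0 - 1.0/t) * ave + (1.0/t) * theta_t;
    sum += theta_t;
    // The recursion and the arithmetic mean agree at every t.
    std::printf("t=%u recursion=%.4f mean=%.4f\n", t, ave, sum/t);
  }
  return 0;
}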
--------------------------------------------------------------------------------
/src/sgd/base_sgd.h:
--------------------------------------------------------------------------------
#ifndef SGD_BASE_SGD_H
#define SGD_BASE_SGD_H

#include "../basedef.h"
#include "../learn-rate/base_learn_rate.h"
#include "../learn-rate/onedim_learn_rate.h"
#include "../learn-rate/onedim_eigen_learn_rate.h"
#include "../learn-rate/ddim_learn_rate.h"

class base_sgd {
  /**
   * Base class for stochastic gradient descent
   *
   * @param sgd       attributes affiliated with sgd as R type
   * @param n_samples number of data samples
   * @param ti        timer for benchmarking how long to get each estimate
   */
public:
  base_sgd(Rcpp::List sgd, unsigned n_samples) {
    name_ = Rcpp::as<std::string>(sgd["method"]);
    n_params_ = Rcpp::as<unsigned>(sgd["nparams"]);
    reltol_ = Rcpp::as<double>(sgd["reltol"]);
    n_passes_ = Rcpp::as<unsigned>(sgd["npasses"]);
    size_ = Rcpp::as<unsigned>(sgd["size"]);
    estimates_ = zeros<mat>(n_params_, size_);
    last_estimate_ = Rcpp::as<mat>(sgd["start"]);
    t_ = 0;
    n_recorded_ = 0;
    pos_ = Mat<unsigned>(1, size_);
    pass_ = Rcpp::as<bool>(sgd["pass"]);
    verbose_ = Rcpp::as<bool>(sgd["verbose"]);

    check_ = Rcpp::as<bool>(sgd["check"]);
    if (check_) {
      truth_ = Rcpp::as<mat>(sgd["truth"]);
    }

    // Set which iterations to store estimates
    unsigned n_iters = n_samples*n_passes_;
    for (unsigned i = 0; i < size_; ++i) {
      pos_(0, i) = int(round(pow(10.,
        i * log10(static_cast<double>(n_iters)) / (size_-1))));
    }
    if (pos_(0, pos_.n_cols-1) != n_iters) {
      pos_(0, pos_.n_cols-1) = n_iters;
    }
    if (n_iters < size_) {
      Rcpp::Rcout << "Warning: Too few data points for plotting!"
        << std::endl;
    }
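    // Worked example of the recording grid above: with n_samples = 1000 and
    // n_passes_ = 10 (so n_iters = 10^4) and size_ = 5, the stored iterations
    // are round(10^(i * 4/4)) for i = 0..4, i.e.
    //   pos_ = {1, 10, 100, 1000, 10000}
    // -- log-uniform rather than every iteration, which keeps estimates_
    // small enough for plotting and convergence diagnostics.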

    // Set learning rate
    std::string lr = Rcpp::as<std::string>(sgd["lr"]);
    vec lr_control = Rcpp::as<vec>(sgd["lr.control"]);
    if (lr == "one-dim") {
      lr_obj_ = new onedim_learn_rate(lr_control(0), lr_control(1),
                                      lr_control(2), lr_control(3));
    } else if (lr == "one-dim-eigen") {
      lr_obj_ = new onedim_eigen_learn_rate(n_params_);
    } else if (lr == "d-dim") {
      lr_obj_ = new ddim_learn_rate(n_params_, 1., 0., 1., 1.,
                                    lr_control(0));
    } else if (lr == "adagrad") {
      lr_obj_ = new ddim_learn_rate(n_params_, lr_control(0), 1., 1., .5,
                                    lr_control(1));
    } else if (lr == "rmsprop") {
      lr_obj_ = new ddim_learn_rate(n_params_, lr_control(0), lr_control(1),
                                    1-lr_control(1), .5, lr_control(2));
    }
  }

  std::string name() const {
    return name_;
  }
  // TODO set naming convention properly
  unsigned get_n_passes() const {
    return n_passes_;
  }
  mat get_estimates() const {
    return estimates_;
  }
  mat get_last_estimate() const {
    return last_estimate_;
  }
  Mat<unsigned> get_pos() const {
    return pos_;
  }
  bool pass() const {
    return pass_;
  }
  bool verbose() const {
    return verbose_;
  }

  // Check whether the convergence threshold is satisfied.
  bool check_convergence(const mat& theta_new, const mat& theta_old) const {
    // if checking against truth
    double diff;
    if (check_) {
      diff = mean(mean(pow(theta_new - truth_, 2)));
      if (diff < 0.001) {
        return true;
      }
    // if not running fixed number of iterations
    } else if (!pass_) {
      diff = mean(mean(abs(theta_new - theta_old))) /
        mean(mean(abs(theta_old)));
      if (diff < reltol_) {
        return true;
      }
    }
    return false;
  }

  const learn_rate_value& learning_rate(unsigned t, const mat& grad_t) {
    return (*lr_obj_)(t, grad_t);
  }

  //TODO declare update method
  //template<typename MODEL>
  //mat update(unsigned t, const mat& theta_old, const data_set& data,
  //  MODEL& model, bool& good_gradient);

  // base_sgd& operator=(const mat& theta_new) {
  //   last_estimate_ = theta_new;
  //   t_ += 1;
  //   if (t_ == pos_[n_recorded_]) {
  //     estimates_.col(n_recorded_) = theta_new;
  //     times_.at(n_recorded_) = ti_.elapsed();
  //     n_recorded_ += 1;
  //     while (n_recorded_ < size_ && pos_[n_recorded_-1] == pos_[n_recorded_]) {
  //       estimates_.col(n_recorded_) = theta_new;
  //       times_.at(n_recorded_) = times_.at(n_recorded_-1);
  //       n_recorded_ += 1;
  //     }
  //   }
  //   return *this;
  // }

  base_sgd& operator=(const mat& theta_new) {
    last_estimate_ = theta_new;
    t_ += 1;
    if (t_ == pos_[n_recorded_]) {
      estimates_.col(n_recorded_) = theta_new;
      n_recorded_ += 1;
      while (n_recorded_ < size_ && pos_[n_recorded_-1] == pos_[n_recorded_]) {
        estimates_.col(n_recorded_) = theta_new;
        n_recorded_ += 1;
      }
    }
    return *this;
  }

  void end_early() {
    // Throw away the space for things that were not recorded.
    pos_.shed_cols(n_recorded_, size_-1);
    estimates_.shed_cols(n_recorded_, size_-1);
  }

protected:
  std::string name_;        // name of stochastic gradient method
  unsigned n_params_;       // number of parameters
  double reltol_;           // relative tolerance for convergence
  unsigned n_passes_;       // number of passes over data
  unsigned size_;           // number of estimates to be recorded (log-uniformly)
  mat estimates_;           // collection of stored estimates
  mat last_estimate_;       // last SGD estimate
  base_learn_rate* lr_obj_; // learning rate
  unsigned t_;              // current iteration
  unsigned n_recorded_;     // number of coefs that have been recorded
  Mat<unsigned> pos_;       // the iteration of recorded coefficients
  bool pass_;               // whether to force running for n_passes_ over data
  bool verbose_;
  bool check_;
  mat truth_;
};

#endif

--------------------------------------------------------------------------------
/src/sgd/explicit_sgd.h:
--------------------------------------------------------------------------------
#ifndef SGD_EXPLICIT_SGD_H
#define SGD_EXPLICIT_SGD_H

#include "../basedef.h"
#include "../data/data_set.h"
#include "../learn-rate/learn_rate_value.h"
#include "base_sgd.h"

class explicit_sgd : public base_sgd {
  /**
   * Stochastic gradient descent in standard formulation, i.e., using an
   * "explicit" update
   *
   * @param sgd       attributes affiliated with sgd as R type
   * @param n_samples number of data samples
   */
public:
  explicit_sgd(Rcpp::List sgd, unsigned n_samples) :
    base_sgd(sgd, n_samples) {}

  template<typename MODEL>
  mat update(unsigned t, const mat& theta_old, const data_set& data,
    MODEL& model, bool& good_gradient) {
    mat grad_t = model.gradient(t, theta_old, data);
    if (!is_finite(grad_t)) {
      good_gradient = false;
    }
    learn_rate_value at = learning_rate(t, grad_t);
    return theta_old + (at * grad_t);
  }

  explicit_sgd& operator=(const mat& theta_new) {
    base_sgd::operator=(theta_new);
    return *this;
  }
};

#endif
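The explicit update above is theta <- theta + a_t * grad, i.e., ascent on the
log-likelihood with the gradient evaluated at the current iterate. A standalone
sketch of one such step for a linear model, where the per-observation gradient
is (y - x^T theta) * x (plain C++ with a scalar learning rate; the numbers are
made up for illustration):

#include <cstdio>

int main() {
  const int d = 2;
  double theta[d] = {0.0, 0.0};
  double x[d] = {1.0, 2.0};
  double y = 3.0, at = 0.1;

  // residual y - x^T theta at the current iterate
  double resid = y;
  for (int i = 0; i < d; ++i) resid -= x[i] * theta[i];

  // explicit step: theta <- theta + a_t * (y - x^T theta) * x
  for (int i = 0; i < d; ++i) theta[i] += at * resid * x[i];

  std::printf("theta = (%.2f, %.2f)\n", theta[0], theta[1]); // (0.30, 0.60)
  return 0;
}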
--------------------------------------------------------------------------------
/src/sgd/implicit_sgd.h:
--------------------------------------------------------------------------------
#ifndef SGD_IMPLICIT_SGD_H
#define SGD_IMPLICIT_SGD_H

#include "../basedef.h"
#include "../data/data_point.h"
#include "../data/data_set.h"
#include "../model/cox_model.h"
#include "../model/glm_model.h"
#include "../model/gmm_model.h"
#include "../model/m_model.h"
#include "../learn-rate/learn_rate_value.h"
#include "base_sgd.h"

template<typename MODEL>
class Implicit_fn {
  // Root finding functor for implicit update
  // Evaluates the zeroth, first, and second derivatives of:
  //   ksi - ell(x^T theta + ||x||^2 * ksi)
public:
  typedef boost::math::tuple<double, double, double> tuple_type;

  Implicit_fn(const MODEL& m, double a, const data_point& d, const mat& t,
    double n) : model_(m), at_(a), data_pt_(d), theta_old_(t), normx_(n) {}

  tuple_type operator()(double ksi) const {
    double value = ksi - at_ *
      model_.scale_factor(ksi, at_, data_pt_, theta_old_, normx_);
    double first = 1 + at_ *
      model_.scale_factor_first_deriv(ksi, at_, data_pt_, theta_old_, normx_);
    double second = at_ *
      model_.scale_factor_second_deriv(ksi, at_, data_pt_, theta_old_, normx_);
    tuple_type out(value, first, second);
    return out;
  }

private:
  const MODEL& model_;
  double at_;
  const data_point& data_pt_;
  const mat& theta_old_;
  double normx_;
};

class implicit_sgd : public base_sgd {
  /**
   * Stochastic gradient descent using an "implicit" update
   *
   * @param sgd       attributes affiliated with sgd as R type
   * @param n_samples number of data samples
   */
public:
  implicit_sgd(Rcpp::List sgd, unsigned n_samples) :
    base_sgd(sgd, n_samples) {
    delta_ = Rcpp::as<double>(sgd["delta"]);
  }

  mat update(unsigned t, const mat& theta_old, const data_set& data,
    glm_model& model, bool& good_gradient) {
    mat theta_new;
    learn_rate_value at = learning_rate(t, model.gradient(t, theta_old, data));
    // TODO how to deal with non-scalar learning rates?
    double at_avg = at.mean();

    data_point data_pt = data.get_data_point(t);
    double normx = dot(data_pt.x, data_pt.x);

    double r = at_avg * model.scale_factor(0, at_avg, data_pt, theta_old, normx);
    double lower = 0;
    double upper = 0;
    if (r < 0) {
      lower = r;
    } else {
      upper = r;
    }
    double ksi;
    if (lower != upper) {
      Implicit_fn<glm_model> implicit_fn(model, at_avg, data_pt, theta_old, normx);
      ksi = boost::math::tools::schroeder_iterate(implicit_fn, (lower +
        upper)/2, lower, upper, delta_);
    } else {
      ksi = lower;
    }
    return theta_old +
      ksi * data_pt.x.t() -
      at_avg * model.gradient_penalty(theta_old);
  }

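  // Worked note: for the Gaussian linear model the implicit equation
  //   theta_new = theta_old + a_t * (y - x^T theta_new) * x
  // has a closed form. Writing theta_new = theta_old + ksi * x gives
  //   ksi = a_t * (y - x^T theta_old) / (1 + a_t * ||x||^2),
  // which also suggests why the bracket [lower, upper] above is valid:
  // ksi has the same sign as r = a_t * scale_factor(0, ...) and
  // |ksi| <= |r|, so the root always lies between 0 and r.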
  mat update(unsigned t, const mat& theta_old, const data_set& data,
    m_model& model, bool& good_gradient) {
    mat theta_new;
    learn_rate_value at = learning_rate(t, model.gradient(t, theta_old, data));
    // TODO how to deal with non-scalar learning rates?
    double at_avg = at.mean();

    data_point data_pt = data.get_data_point(t);
    double normx = dot(data_pt.x, data_pt.x);

    double r = at_avg * model.scale_factor(0, at_avg, data_pt, theta_old, normx);
    double lower = 0;
    double upper = 0;
    if (r < 0) {
      lower = r;
    } else {
      upper = r;
    }
    double ksi;
    if (lower != upper) {
      Implicit_fn<m_model> implicit_fn(model, at_avg, data_pt, theta_old, normx);
      ksi = boost::math::tools::schroeder_iterate(implicit_fn, (lower +
        upper)/2, lower, upper, delta_);
    } else {
      ksi = lower;
    }
    return theta_old +
      ksi * data_pt.x.t() -
      at_avg * model.gradient_penalty(theta_old);
  }

  mat update(unsigned t, const mat& theta_old, const data_set& data,
    cox_model& model, bool& good_gradient) {
    data_point data_pt = data.get_data_point(t);
    unsigned j = data_pt.idx;

    // assuming data points fail in order, i.e., risk set R_i={i,i+1,...,n}
    vec xi = exp(data.X * theta_old);
    vec h = zeros<vec>(j);
    double sum_xi = 0;
    // walk from i = j-1 down to 0, accumulating the risk-set sums
    for (int i = j-1; i >= 0; --i) {
      // h_i = d_i/sum(xi[i:n])
      if (i == j-1) {
        for (int k = i; k < data.n_samples; ++k) {
          sum_xi += xi(k);
        }
      } else {
        sum_xi += xi(i);
      }
      h(i) = data.Y(i)/sum_xi;
    }
    double eta_j = accu(data_pt.x.t() % theta_old); // x_j^T * theta
    double z = eta_j + data_pt.y - xi[j] * sum(h);
    double xjnorm = accu(data_pt.x % data_pt.x); // |x_j|^2_2

    //learn_rate_value at = learning_rate(t, model.gradient(t, theta_old, data));
    learn_rate_value at = learning_rate(t, zeros<vec>(data.n_features));
    // TODO how to deal with non-scalar learning rates?
    double at_avg = at.mean();

    mat grad_t = (z - (eta_j + at_avg*z*xjnorm)/(1 + at_avg*xjnorm)) *
      data_pt.x.t();
    if (!is_finite(grad_t)) {
      good_gradient = false;
    }
    return theta_old + (at * grad_t);
  }

  template<typename MODEL>
  mat update(unsigned t, const mat& theta_old, const data_set& data,
    MODEL& model, bool& good_gradient) {
    Rcpp::Rcout << "error: implicit not implemented for model yet" << std::endl;
    good_gradient = false;
    return theta_old;
  }

  implicit_sgd& operator=(const mat& theta_new) {
    base_sgd::operator=(theta_new);
    return *this;
  }
private:
  double delta_;
};

#endif
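momentum_sgd below keeps a velocity v <- mu*v + a_t*grad and moves theta by v,
so past gradients keep contributing with geometrically decaying weight mu = 0.9.
A standalone sketch on a one-dimensional quadratic (plain C++; the fixed
learning rate and target are chosen for illustration, using the package's
ascent convention):

#include <cstdio>

int main() {
  const double mu = 0.9, a = 0.1;
  double theta = 0.0, v = 0.0;
  for (int t = 1; t <= 100; ++t) {
    double grad = -(theta - 5.0);  // gradient of -(theta - 5)^2 / 2
    v = mu*v + a*grad;             // velocity accumulates past gradients
    theta += v;
    if (t <= 3 || t == 100) {
      std::printf("t=%3d v=%+.4f theta=%.4f\n", t, v, theta);
    }
  }
  return 0;  // theta approaches the optimum at 5
}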
--------------------------------------------------------------------------------
/src/sgd/momentum_sgd.h:
--------------------------------------------------------------------------------
#ifndef SGD_MOMENTUM_SGD_H
#define SGD_MOMENTUM_SGD_H

#include "../basedef.h"
#include "../data/data_set.h"
#include "../learn-rate/learn_rate_value.h"
#include "base_sgd.h"

class momentum_sgd : public base_sgd {
  /**
   * Stochastic gradient descent using classical momentum
   *
   * @param sgd       attributes affiliated with sgd as R type
   * @param n_samples number of data samples
   */
public:
  momentum_sgd(Rcpp::List sgd, unsigned n_samples) :
    base_sgd(sgd, n_samples) {
    mu_ = 0.9;
    v_ = last_estimate_;
  }

  template<typename MODEL>
  mat update(unsigned t, const mat& theta_old, const data_set& data,
    MODEL& model, bool& good_gradient) {
    mat grad_t = model.gradient(t, theta_old, data);
    if (!is_finite(grad_t)) {
      good_gradient = false;
    }
    learn_rate_value at = learning_rate(t, grad_t);
    v_ = mu_ * v_ + (at * grad_t);
    return theta_old + v_;
  }

  momentum_sgd& operator=(const mat& theta_new) {
    base_sgd::operator=(theta_new);
    return *this;
  }
private:
  double mu_; // factor to weigh previous "velocity"
  mat v_;     // "velocity"
};

#endif

--------------------------------------------------------------------------------
/src/sgd/nesterov_sgd.h:
--------------------------------------------------------------------------------
#ifndef SGD_NESTEROV_SGD_H
#define SGD_NESTEROV_SGD_H

#include "../basedef.h"
#include "../data/data_set.h"
#include "../learn-rate/learn_rate_value.h"
#include "base_sgd.h"

class nesterov_sgd : public base_sgd {
  /**
   * Stochastic gradient descent using Nesterov momentum
   *
   * @param sgd       attributes affiliated with sgd as R type
   * @param n_samples number of data samples
   */
public:
  nesterov_sgd(Rcpp::List sgd, unsigned n_samples) :
    base_sgd(sgd, n_samples) {
    mu_ = 0.9;
    v_ = last_estimate_;
  }

  template<typename MODEL>
  mat update(unsigned t, const mat& theta_old, const data_set& data,
    MODEL& model, bool& good_gradient) {
    mat grad_t = model.gradient(t, theta_old + mu_*v_, data);
    if (!is_finite(grad_t)) {
      good_gradient = false;
    }
    learn_rate_value at = learning_rate(t, model.gradient(t, theta_old, data));
    v_ = mu_ * v_ + (at * grad_t);
    return theta_old + v_;
  }

  nesterov_sgd& operator=(const mat& theta_new) {
    base_sgd::operator=(theta_new);
    return *this;
  }
private:
  double mu_; // factor to weigh previous "velocity"
  mat v_;     // "velocity"
};

#endif
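The only difference from classical momentum above is that nesterov_sgd
evaluates the model gradient at the lookahead point theta + mu*v rather than at
theta. The same one-dimensional sketch as before with that single change, run
side by side (plain C++; illustrative values only, not package code):

#include <cstdio>

int main() {
  const double mu = 0.9, a = 0.1;
  double th_m = 0.0, v_m = 0.0;  // classical momentum
  double th_n = 0.0, v_n = 0.0;  // Nesterov momentum
  for (int t = 1; t <= 50; ++t) {
    double g_m = -(th_m - 5.0);             // gradient at the current iterate
    double g_n = -((th_n + mu*v_n) - 5.0);  // gradient at the lookahead point
    v_m = mu*v_m + a*g_m;  th_m += v_m;
    v_n = mu*v_n + a*g_n;  th_n += v_n;
  }
  std::printf("momentum: %.4f  nesterov: %.4f  (target 5)\n", th_m, th_n);
  return 0;
}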
--------------------------------------------------------------------------------
/src/validity-check/cox_validity_check_model.h:
--------------------------------------------------------------------------------
#ifndef VALIDITY_CHECK_COX_VALIDITY_CHECK_MODEL_H
#define VALIDITY_CHECK_COX_VALIDITY_CHECK_MODEL_H

#include "../basedef.h"
#include "../data/data_set.h"
#include "../model/cox_model.h"

bool validity_check_model(const data_set& data, const mat& theta, unsigned t,
  const cox_model& model) {
  // TODO
  return true;
}

#endif

--------------------------------------------------------------------------------
/src/validity-check/glm_validity_check_model.h:
--------------------------------------------------------------------------------
#ifndef VALIDITY_CHECK_GLM_VALIDITY_CHECK_MODEL_H
#define VALIDITY_CHECK_GLM_VALIDITY_CHECK_MODEL_H

#include "../basedef.h"
#include "../data/data_set.h"
#include "../model/glm_model.h"

bool validity_check_model(const data_set& data, const mat& theta, unsigned t,
  const glm_model& model) {
  // TODO should this really be checked at each iteration?
  return true;
}

#endif

--------------------------------------------------------------------------------
/src/validity-check/gmm_validity_check_model.h:
--------------------------------------------------------------------------------
#ifndef VALIDITY_CHECK_GMM_VALIDITY_CHECK_MODEL_H
#define VALIDITY_CHECK_GMM_VALIDITY_CHECK_MODEL_H

#include "../basedef.h"
#include "../data/data_set.h"
#include "../model/gmm_model.h"

bool validity_check_model(const data_set& data, const mat& theta, unsigned t,
  const gmm_model& model) {
  // TODO
  return true;
}

#endif

--------------------------------------------------------------------------------
/src/validity-check/m_validity_check_model.h:
--------------------------------------------------------------------------------
#ifndef VALIDITY_CHECK_M_VALIDITY_CHECK_MODEL_H
#define VALIDITY_CHECK_M_VALIDITY_CHECK_MODEL_H

#include "../basedef.h"
#include "../data/data_set.h"
#include "../model/m_model.h"

bool validity_check_model(const data_set& data, const mat& theta, unsigned t,
  const m_model& model) {
  // TODO
  return true;
}

#endif

--------------------------------------------------------------------------------
/src/validity-check/validity_check.h:
--------------------------------------------------------------------------------
#ifndef VALIDITY_CHECK_VALIDITY_CHECK_H
#define VALIDITY_CHECK_VALIDITY_CHECK_H

#include "../basedef.h"
#include "../data/data_set.h"
#include "cox_validity_check_model.h"
#include "glm_validity_check_model.h"
#include "gmm_validity_check_model.h"
#include "m_validity_check_model.h"

template<typename MODEL>
bool validity_check(const data_set& data, const mat& theta, bool good_gradient,
  unsigned t, const MODEL& model) {
  // Check if gradient is finite.
  if (!good_gradient) {
    Rcpp::Rcout << "error: NA or infinite gradient" << std::endl;
    return false;
  }

  // Check if all estimates are finite.
  if (!is_finite(theta)) {
    Rcpp::Rcout << "warning: non-finite coefficients at iteration " << t << std::endl;
  }

  return validity_check_model(data, theta, t, model);
}

#endif

--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
library(testthat)
test_check("sgd")

--------------------------------------------------------------------------------
/tests/testthat/test-fitted.R:
--------------------------------------------------------------------------------
context("Fitted generic method")

test_that("Fitted generic method", {

  skip_on_cran()

  # Dimensions
  N <- 1e4
  d <- 5

  # Generate data.
  set.seed(42)
  X <- matrix(rnorm(N*d), ncol=d)
  theta <- rep(5, d+1)
  eps <- rnorm(N)
  y <- cbind(1, X) %*% theta + eps
  dat <- data.frame(y=y, x=X)

  sgd.theta <- sgd(y ~ ., data=dat, model="lm")
  fitted(sgd.theta)

  # Check that it executes without error.
  expect_true(TRUE)
})

--------------------------------------------------------------------------------
/tests/testthat/test-lasso.R:
--------------------------------------------------------------------------------
context("Linear regression with lasso penalty")

test_that("MSE converges for linear regression with lasso", {
  skip_on_cran()

  library(glmnet)

  # Dimensions
  N <- 1e5
  d <- 5

  # Generate data.
  set.seed(42)
  X <- matrix(rnorm(N*d), ncol=d)
  theta <- rep(5, d)
  eps <- rnorm(N)
  y <- X %*% theta + eps
  dat <- data.frame(y=y, x=X)

  glmnet.theta <- glmnet(X, y, alpha=1, lambda=0.5, standardize=FALSE,
                         type.gaussian="covariance")
  truth <- as.vector(glmnet.theta$beta)

  get.mse <- function(method) {
    sgd.theta <- sgd(y ~ .-1, data=dat, model="lm",
                     model.control=list(lambda1=0.5),
                     sgd.control=list(
                       method=method,
                       pass=T))
    mean((sgd.theta$coefficients - truth)^2)
  }

  expect_true(get.mse("sgd") < 1e-2)
  expect_true(get.mse("ai-sgd") < 1e-2)
})

--------------------------------------------------------------------------------
/tests/testthat/test-linear.R:
--------------------------------------------------------------------------------
context("Linear regression")

test_that("MSE converges for linear models", {

  skip_on_cran()

  # Dimensions
  N <- 1e4
  d <- 5

  # Generate data.
  set.seed(42)
  X <- matrix(rnorm(N*d), ncol=d)
  theta <- rep(5, d+1)
  eps <- rnorm(N)
  y <- cbind(1, X) %*% theta + eps
  dat <- data.frame(y=y, x=X)

  get.mse <- function(method, lr) {
    sgd.theta <- sgd(y ~ ., data=dat, model="lm",
                     sgd.control=list(
                       method=method,
                       lr=lr,
                       npasses=10,
                       pass=T))
    mean((sgd.theta$coefficients - theta)^2)
  }

  expect_true(get.mse("sgd", "one-dim") < 1e-2)
  expect_true(get.mse("sgd", "adagrad") < 1e-2)
  #expect_true(get.mse("sgd", "rmsprop") < 1e-2)
  expect_true(get.mse("implicit", "one-dim") < 1e-2)
  #expect_true(get.mse("implicit", "d-dim") < 1e-2)
  expect_true(get.mse("implicit", "adagrad") < 1e-2)
  #expect_true(get.mse("implicit", "rmsprop") < 1e-2)
  expect_true(get.mse("asgd", "one-dim") < 1e-2)
  expect_true(get.mse("asgd", "adagrad") < 1e-2)
  expect_true(get.mse("asgd", "rmsprop") < 1e-2)
  expect_true(get.mse("ai-sgd", "one-dim") < 1e-2)
  expect_true(get.mse("ai-sgd", "d-dim") < 1e-2)
  expect_true(get.mse("ai-sgd", "adagrad") < 1e-2)
  expect_true(get.mse("ai-sgd", "rmsprop") < 1e-2)
  expect_true(get.mse("momentum", "one-dim") < 1e-2)
  #expect_true(get.mse("momentum", "d-dim") < 1e-2)
  #expect_true(get.mse("momentum", "adagrad") < 1e-2)
  #expect_true(get.mse("momentum", "rmsprop") < 1e-2)
  expect_true(get.mse("nesterov", "one-dim") < 1e-2)
  #expect_true(get.mse("nesterov", "d-dim") < 1e-2)
  #expect_true(get.mse("nesterov", "adagrad") < 1e-2)
  #expect_true(get.mse("nesterov", "rmsprop") < 1e-2)
})

--------------------------------------------------------------------------------
/tests/testthat/test-predict.R:
--------------------------------------------------------------------------------
context("Predict generic method")

test_that("Predict generic method", {

  skip_on_cran()

  # Dimensions
  N <- 1e4
  d <- 5

  # Generate data.
  set.seed(42)
  X <- matrix(rnorm(N*d), ncol=d)
  theta <- rep(5, d+1)
  eps <- rnorm(N)
  y <- cbind(1, X) %*% theta + eps
  dat <- data.frame(y=y, x=X)

  sgd.theta <- sgd(y ~ ., data=dat, model="lm")
  predict(sgd.theta, cbind(1, X))
  predict(sgd.theta, cbind(1, X), type="response")
  predict(sgd.theta, cbind(1, X), type="term")

  # Check that it executes without error.
  expect_true(TRUE)
})

--------------------------------------------------------------------------------
/tests/testthat/test-residuals.R:
--------------------------------------------------------------------------------
context("Residuals generic method")

test_that("Residuals generic method", {

  skip_on_cran()

  # Dimensions
  N <- 1e4
  d <- 5

  # Generate data.
  set.seed(42)
  X <- matrix(rnorm(N*d), ncol=d)
  theta <- rep(5, d+1)
  eps <- rnorm(N)
  y <- cbind(1, X) %*% theta + eps
  dat <- data.frame(y=y, x=X)

  sgd.theta <- sgd(y ~ ., data=dat, model="lm")
  residuals(sgd.theta)

  # Check that it executes without error.
  expect_true(TRUE)
})

--------------------------------------------------------------------------------
/vignettes/sgd-jss.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/airoldilab/sgd/92246a3e1d81c9cd3077b80cacb6a3f7ee2425c5/vignettes/sgd-jss.pdf

--------------------------------------------------------------------------------
/vignettes/sgd-jss.pdf.asis:
--------------------------------------------------------------------------------
%\VignetteIndexEntry{Stochastic gradient descent methods for estimation with large data sets}
%\VignetteEngine{R.rsp::asis}
%\VignetteKeyword{PDF}
%\VignetteKeyword{HTML}
%\VignetteKeyword{vignette}
%\VignetteKeyword{package}
--------------------------------------------------------------------------------