├── .gitignore ├── LICENSE ├── README.md └── l2-regularization ├── bayes-reg.R └── plots ├── mtcars-coef-shrinkage-net-overlay.pdf ├── mtcars-coef-shrinkage-net-overlay.png ├── mtcars-coef-shrinkage.pdf ├── mtcars-coef-shrinkage.png ├── mtcars-loocv-mse.pdf └── mtcars-loocv-mse.png /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Tony Fischetti 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # bayesian-regularization 3 | 4 | ### A demonstration of Bayesian approaches to linear model regularization 5 | 6 | 7 | This is code for personal research on Bayesian approaches to linear model 8 | regularization through thoughtful selection of priors on the beta 9 | coefficients, and it's equivalence to penalized MLE methods. 10 | -------------------------------------------------------------------------------- /l2-regularization/bayes-reg.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript --vanilla 2 | 3 | ########################################################### 4 | ## ## 5 | ## bayes-reg.R ## 6 | ## ## 7 | ## Author: Tony Fischetti ## 8 | ## tony.fischetti@gmail.com ## 9 | ## ## 10 | ########################################################### 11 | 12 | # workspace cleanup 13 | rm(list=ls()) 14 | 15 | # options 16 | options(echo=TRUE) 17 | options(stringsAsFactors=FALSE) 18 | options(datatable.fread.datatable=FALSE) 19 | options(mc.cores = parallel::detectCores()) 20 | 21 | # cli args 22 | args <- commandArgs(trailingOnly=TRUE) 23 | 24 | # libraries 25 | library(glmnet) 26 | library(magrittr) 27 | library(dplyr) 28 | library(rethinking) 29 | library(pbapply) 30 | library(cvTools) 31 | library(ggplot2) 32 | library(tidyr) 33 | library(boot) 34 | 35 | 36 | 37 | 38 | 39 | #--------------------------------------------------# 40 | #--------------------------------------------------# 41 | # FUNCTIONS # 42 | #--------------------------------------------------# 43 | #--------------------------------------------------# 44 | make.map.formula <- function(startofalist, numofcoefs, stddev){ 45 | # doesn't count the intercept 46 | gencoeffs <- paste0("b", 1:numofcoefs) 47 | this <- list() 48 | for(i in 1:length(gencoeffs)){ 49 | this <- 
# Build a rethinking::map() formula: take `startofalist` (likelihood,
# linear model, sigma and intercept priors) and append one zero-centered
# gaussian prior  b_i ~ dnorm(0, stddev)  for each of the `numofcoefs`
# slope coefficients.  Returns the combined alist, ready for map().
make.map.formula <- function(startofalist, numofcoefs, stddev){
  # doesn't count the intercept
  gencoeffs <- paste0("b", seq_len(numofcoefs))
  priors <- lapply(gencoeffs, function(cname){
    bquote(.(as.name(cname)) ~ dnorm(0, .(stddev)))
  })
  c(startofalist, priors)
}

# Return a list of k TRAINING index vectors: element i holds the row
# indices of `df` that are NOT in fold i (fold i is the hold-out set).
get.folds <- function(df, k=5){
  tmp <- cvFolds(nrow(df), K=k)
  folds <- lapply(seq_along(unique(tmp$which)),
                  function(x) tmp$subsets[which(tmp$which!=x),])
  return(folds)
}

# k-fold cross-validation of a bayesian linear regression fit with
# rethinking::map(), using gaussian coefficient priors of width
# `prior.stddev` (built by make.map.formula).
#
#   X             predictor matrix (no intercept column)
#   y             one-column data.frame holding the response
#   design.matrix model.matrix including the intercept column
#   basealist     alist with likelihood / mu / sigma / intercept prior
#   prior.stddev  standard deviation of the slope priors
#   startlist     starting values for map()
#   k             number of CV folds
#   parallel      fit folds via parallel::mclapply (honors mc.cores option)
#   show.result   print a banner with the CV MSE estimate
#
# Returns list(coefs=, cv.mse=, hdi=): full-data coefficient matrix,
# the posterior-mean CV MSE, and its 95% HPDI.
pevaluate <- function(X, y, design.matrix, basealist, prior.stddev,
                      startlist, k=5, parallel=TRUE, show.result=TRUE){
  if(parallel){
    # namespace-qualified: `parallel` is never attached by the script
    # (it is only referenced via parallel::detectCores in options())
    mapcarfn <- function(...){ unlist(parallel::mclapply(...)) }
  } else{
    mapcarfn <- sapply
  }
  df <- cbind(X, y)
  form <- make.map.formula(basealist, ncol(X), prior.stddev)
  folds <- get.folds(df, k=k)
  # one MSE per fold: fit on the training rows, score on the held-out rows
  mserrors <- mapcarfn(seq_along(folds),
                       function(x){
                         rows <- folds[[x]]  # training rows for this fold
                         fit <- map(form, data=df[rows,], start=startlist)
                         # coef(fit) ends with sigma: drop it, keep betas
                         # (assumes coef order matches design.matrix
                         #  columns -- TODO confirm)
                         best.bayes.coefs <- coef(fit)[-length(coef(fit))] %>% matrix
                         pred.bayes <- design.matrix %*% best.bayes.coefs
                         the.errors <- pred.bayes - y[,1]
                         serrors <- the.errors[-rows]^2  # held-out rows only
                         return(mean(serrors))
                       })
  # model the per-fold MSEs themselves to get a posterior for the mean
  # CV error (and hence an interval around it)
  erfit <- map(alist(mserrors ~ dnorm(mu, stddev),
                     mu ~ dnorm(0,100),
                     stddev ~ dunif(0,100)),
               data=data.frame(mserrors=mserrors),
               start=list(mu=8, stddev=10))
  emean <- mean(extract.samples(erfit)$mu)
  hdi95 <- HPDI(extract.samples(erfit)$mu, prob=.95)
  # refit on the full data for the coefficients that get reported
  fullfit <- map(form, data=df, start=startlist)
  best.bayes.coefs <- coef(fullfit)[-length(coef(fullfit))] %>% matrix
  retlist <- list(coefs=best.bayes.coefs, cv.mse=emean, hdi=hdi95)
  if(show.result){
    cat(sprintf("\n%s\n LOOCV MSE ESTIMATE FOR PRIOR WIDTH of %.3f: %.3f\n%s\n",
                paste0(rep("*", 55), collapse=""),
                prior.stddev,
                emean,
                paste0(rep("*", 55), collapse="")))
  }
  return(retlist)
}
#==================================================#
#==================================================#
#=============        MTCARS         =============#
#==================================================#
#==================================================#

# First, use ridge regression (glmnet with alpha=0) to get a
# reference point: the CV error of the best penalized-MLE fit.

# standardize every column, then put `mpg` back on its raw scale
# so that only the predictors end up standardized
mtstd <- mtcars %>% lapply(scale) %>% data.frame
row.names(mtstd) <- row.names(mtcars)
mtstd$mpg <- mtcars$mpg

design.matrix <- model.matrix(mpg ~ ., data=mtstd)

X <- design.matrix[,-1]        # predictors (intercept column removed)
y <- mtstd[, 1, drop=FALSE]    # response kept as a one-column data.frame

cvfits <- cv.glmnet(X, y[,1], alpha=0, nfolds=10)
# plot(cvfits)

loc <- which(cvfits$lambda==cvfits$lambda.min)
best.coefs <- coef(cvfits, s="lambda.min")
mse <- cvfits$cvm[loc]         # 7.336113

####### messing around
basealist <- alist(mpg ~ dnorm(mu, sigma),
                   mu <- b0 + b1*cyl + b2*disp + b3*hp + b4*drat + b5*wt + b6*qsec + b7*vs + b8*am + b9*gear + b10*carb,
                   sigma ~ dunif(0,100),
                   b0 ~ dnorm(20, 100))
form <- make.map.formula(basealist, 10, 5)

startlist <- list(b0=20, b1=0, b2=0, b3=0, b4=0,
                  b5=0, b6=0, b7=0, b8=0, b9=0,
                  b10=0, sigma=30)

# full-data bayesian fit with prior width 5, as a sanity check
this <- map(form, data=mtstd, start=startlist)
best.bayes.coefs <- coef(this)[-length(coef(this))] %>% matrix
preds.bayes <- design.matrix %*% best.bayes.coefs
serrors <- (preds.bayes - mtstd$mpg)^2
mse <- mean((preds.bayes - mtstd$mpg)^2) # 4.622117
mse

# quick bayesian estimate of the mean squared error itself
tmp <- map(alist(serrors ~ dnorm(mu, stddev),
                 mu ~ dnorm(5, 10),
                 stddev ~ dunif(0, 20)),
           data=data.frame(serrors=serrors),
           start=list(mu=4, stddev=6))

# example
pevaluate(X, y, design.matrix, basealist, 0.575, startlist, k=10)
#--------------------------------------------------#
# 10-fold CV with mtcars predicting `mpg` from     #
# all other variables and gaussian priors of       #
# increasing precision                             #
#--------------------------------------------------#

prior.widths <- seq(0.05, 5, by=0.025)

results <- pblapply(prior.widths, function(x){
  tryCatch({
    pevaluate(X, y, design.matrix, basealist, x, startlist, k=10)
  }, error = function(e){
    # placeholder shaped like pevaluate's return value; the element
    # must be named `coefs` (not `coeffs`) and be a matrix so that
    # downstream reads of results[[i]]$coefs[-1,1] see NAs, not NULL
    return(list(coefs=matrix(rep(NA, 11)), cv.mse=NA, hdi=c(NA, NA)))
  })
})
MTCARSK10RESULTS <- results

tresults.cv <- lapply(results, function(x) x$cv.mse) %>% unlist
tresults.lower <- lapply(results, function(x) x$hdi[1]) %>% unlist
tresults.upper <- lapply(results, function(x) x$hdi[2]) %>% unlist
tresults.width <- prior.widths

tresults.df <- data.frame(mse=tresults.cv, width=tresults.width,
                          lower=tresults.lower,
                          upper=tresults.upper)

plot(mse ~ width, data=tresults.df, type="l", ylim=c(0, 22),
     main="10K CV MSE as a function of prior width of coefficients")
lines(upper ~ width, data=tresults.df, type="l", col="red")
lines(tresults.df$width, tresults.df$lower, type="l", col="red")
abline(v=0.775, lty=2)   # eyeballed location of the minimum


#--------------------------------------------------#
# LOOCV with mtcars predicting `mpg` from          #
# all other variables and gaussian priors of       #
# increasing precision                             #
#--------------------------------------------------#

prior.widths <- seq(0.05, 5, by=0.025)

# k=31 on mtcars' 32 rows: (almost) leave-one-out -- one fold gets
# two rows; TODO confirm k=32 wasn't intended for strict LOOCV
results <- pblapply(prior.widths, function(x){
  tryCatch({
    pevaluate(X, y, design.matrix, basealist, x, startlist, k=31)
  }, error = function(e){
    # same placeholder shape as the 10-fold sweep above
    return(list(coefs=matrix(rep(NA, 11)), cv.mse=NA, hdi=c(NA, NA)))
  })
})
MTCARSLOORESULTS <- results

# drop the 4 narrowest prior widths (those fits are unstable / failed)
tresults.cv <- lapply(results, function(x) x$cv.mse) %>% unlist %>% .[5:length(prior.widths)]
# drop the 4 narrowest prior widths, matching tresults.cv above
tresults.lower <- lapply(results, function(x) x$hdi[1]) %>% unlist %>% .[5:length(prior.widths)]
tresults.upper <- lapply(results, function(x) x$hdi[2]) %>% unlist %>% .[5:length(prior.widths)]
tresults.width <- prior.widths %>% .[5:length(prior.widths)]

tresults.df <- data.frame(mse=tresults.cv, width=tresults.width,
                          lower=tresults.lower,
                          upper=tresults.upper)

# prior width with the lowest estimated LOOCV MSE
minxintercept <- tresults.width[which.min(tresults.cv)]

plot(mse ~ width, data=tresults.df, type="l", ylim=c(0, 22),
     main="LOOCV MSE as a function of prior width of coefficients")
lines(upper ~ width, data=tresults.df, type="l", col="red")
lines(tresults.df$width, tresults.df$lower, type="l", col="red")
abline(v=minxintercept, lty=2)

loocv.plot <- ggplot(tresults.df, aes(width, mse)) +
  geom_ribbon(aes(ymin=lower, ymax=upper, alpha=0.5), fill = "grey70",
              show.legend=FALSE) +
  geom_line() +
  geom_vline(xintercept=minxintercept, linetype=2) +
  ylab("LOOCV MSE") +
  xlab("standard deviation of gaussian coefficient priors") +
  ggtitle('Bayesian "ridge regression" LOOCV MSE with different "penalties" (mtcars)')
print(loocv.plot)
# chaining ggsave() onto a plot with `+` is not supported by current
# ggplot2 (it errors when adding a non-layer object); save explicitly
ggsave("./plots/mtcars-loocv-mse.png", plot=loocv.plot)
ggsave("./plots/mtcars-loocv-mse.pdf", plot=loocv.plot)


# one wide data.frame per prior width: the bayesian coefficient value
# for each predictor (intercept dropped), plus the width itself
tmp <- lapply(seq_along(results),
              function(x){
                df <- data.frame(pred=names(mtcars)[-1],
                                 cvalue=results[[x]]$coefs[-1,1])
                df %>% spread(pred, cvalue) %>%
                  cbind(data.frame(width=prior.widths[x]), .)
              })
coefdf <- do.call(rbind, tmp)

# long format for plotting
# NOTE(review): gather() called with only key/value names gathers
# *every* column, `width` included -- this presumably relied on the
# tidyr version in use at the time; verify the resulting columns
# (pivot_longer(coefdf, -width) is the modern form of what the plot
# below expects: width / predictor / cvalue)
gcoefdf <- coefdf %>% gather(width, cvalue)
names(gcoefdf)[2] <- "predictor"

shrinkage.plot <- ggplot(gcoefdf, aes(x=width, y=cvalue, color=predictor)) +
  geom_line() +
  ylab("coefficient value") +
  xlab("standard deviation of gaussian coefficient priors") +
  geom_vline(xintercept=minxintercept, linetype=2, color="grey") +
  ggtitle('Coefficient shrinkage in bayesian "ridge regression" (mtcars)')
print(shrinkage.plot)
# ggsave() cannot be chained with `+`; save explicitly instead
ggsave("./plots/mtcars-coef-shrinkage.png", plot=shrinkage.plot)
ggsave("./plots/mtcars-coef-shrinkage.pdf", plot=shrinkage.plot)


# elastic-net (glmnet ridge) coefficients, reshaped for the overlay
netbest.coefs <- best.coefs %>% as.matrix %>% .[-1,] %>% data.frame

netbest.coefs$predictor <- row.names(best.coefs)[-1]
row.names(netbest.coefs) <- NULL
names(netbest.coefs)[1] <- "ncvalue"

agcoefdf <- gcoefdf %>% left_join(netbest.coefs)

# NOTE(review): same gather() caveat as above -- the positional
# renames below assume a 4-column layout; confirm before trusting
gagcoefdf <- agcoefdf %>% gather(width, predictor)
names(gagcoefdf)[3] <- "bayes_or_net"
gagcoefdf$bayes_or_net <- ifelse(gagcoefdf$bayes_or_net=="cvalue",
                                 "bayes", "elastic net")
names(gagcoefdf)[4] <- "value"

overlay.plot <- ggplot(gagcoefdf,
                       aes(x=width, y=value, color=predictor,
                           linetype=bayes_or_net)) +
  geom_line() +
  ylab("coefficient value") +
  xlab("standard deviation of gaussian coefficient priors") +
  geom_vline(xintercept=minxintercept, linetype=2, color="grey") +
  ggtitle('Coefficient shrinkage in bayesian "ridge regression" (mtcars)')
print(overlay.plot)
# explicit saves instead of the unsupported `+ ggsave()` chaining
ggsave("./plots/mtcars-coef-shrinkage-net-overlay.png", plot=overlay.plot)
ggsave("./plots/mtcars-coef-shrinkage-net-overlay.pdf", plot=overlay.plot)
https://raw.githubusercontent.com/tonyfischetti/bayesian-regularization/cf5f9bd937fdfaa5ad1b2dab4cbdfc9847df3d1c/l2-regularization/plots/mtcars-coef-shrinkage-net-overlay.pdf -------------------------------------------------------------------------------- /l2-regularization/plots/mtcars-coef-shrinkage-net-overlay.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tonyfischetti/bayesian-regularization/cf5f9bd937fdfaa5ad1b2dab4cbdfc9847df3d1c/l2-regularization/plots/mtcars-coef-shrinkage-net-overlay.png -------------------------------------------------------------------------------- /l2-regularization/plots/mtcars-coef-shrinkage.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tonyfischetti/bayesian-regularization/cf5f9bd937fdfaa5ad1b2dab4cbdfc9847df3d1c/l2-regularization/plots/mtcars-coef-shrinkage.pdf -------------------------------------------------------------------------------- /l2-regularization/plots/mtcars-coef-shrinkage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tonyfischetti/bayesian-regularization/cf5f9bd937fdfaa5ad1b2dab4cbdfc9847df3d1c/l2-regularization/plots/mtcars-coef-shrinkage.png -------------------------------------------------------------------------------- /l2-regularization/plots/mtcars-loocv-mse.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tonyfischetti/bayesian-regularization/cf5f9bd937fdfaa5ad1b2dab4cbdfc9847df3d1c/l2-regularization/plots/mtcars-loocv-mse.pdf -------------------------------------------------------------------------------- /l2-regularization/plots/mtcars-loocv-mse.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tonyfischetti/bayesian-regularization/cf5f9bd937fdfaa5ad1b2dab4cbdfc9847df3d1c/l2-regularization/plots/mtcars-loocv-mse.png --------------------------------------------------------------------------------