├── .gitignore ├── LICENSE ├── README.md └── l2-regularization ├── bayes-reg.R └── plots ├── mtcars-coef-shrinkage-net-overlay.pdf ├── mtcars-coef-shrinkage-net-overlay.png ├── mtcars-coef-shrinkage.pdf ├── mtcars-coef-shrinkage.png ├── mtcars-loocv-mse.pdf └── mtcars-loocv-mse.png /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Tony Fischetti 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # bayesian-regularization 3 | 4 | ### A demonstration of Bayesian approaches to linear model regularization 5 | 6 | 7 | This is code for personal research on Bayesian approaches to linear model 8 | regularization through thoughtful selection of priors on the beta 9 | coefficients, and it's equivalence to penalized MLE methods. 10 | -------------------------------------------------------------------------------- /l2-regularization/bayes-reg.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript --vanilla 2 | 3 | ########################################################### 4 | ## ## 5 | ## bayes-reg.R ## 6 | ## ## 7 | ## Author: Tony Fischetti ## 8 | ## tony.fischetti@gmail.com ## 9 | ## ## 10 | ########################################################### 11 | 12 | # workspace cleanup 13 | rm(list=ls()) 14 | 15 | # options 16 | options(echo=TRUE) 17 | options(stringsAsFactors=FALSE) 18 | options(datatable.fread.datatable=FALSE) 19 | options(mc.cores = parallel::detectCores()) 20 | 21 | # cli args 22 | args <- commandArgs(trailingOnly=TRUE) 23 | 24 | # libraries 25 | library(glmnet) 26 | library(magrittr) 27 | library(dplyr) 28 | library(rethinking) 29 | library(pbapply) 30 | library(cvTools) 31 | library(ggplot2) 32 | library(tidyr) 33 | library(boot) 34 | 35 | 36 | 37 | 38 | 39 | #--------------------------------------------------# 40 | #--------------------------------------------------# 41 | # FUNCTIONS # 42 | #--------------------------------------------------# 43 | #--------------------------------------------------# 44 | make.map.formula <- function(startofalist, numofcoefs, stddev){ 45 | # doesn't count the intercept 46 | gencoeffs <- paste0("b", 1:numofcoefs) 47 | this <- list() 48 | for(i in 1:length(gencoeffs)){ 49 | this <- 
# Build a rethinking::map() formula: take `startofalist` (likelihood,
# linear model, sigma and intercept priors) and append one zero-centered
# gaussian prior  b_i ~ dnorm(0, stddev)  for each of the `numofcoefs`
# slope coefficients.  Returns the combined alist, ready for map().
make.map.formula <- function(startofalist, numofcoefs, stddev){
  # doesn't count the intercept
  gencoeffs <- paste0("b", seq_len(numofcoefs))
  priors <- lapply(gencoeffs, function(cname){
    bquote(.(as.name(cname)) ~ dnorm(0, .(stddev)))
  })
  c(startofalist, priors)
}

# Return a list of k TRAINING index vectors: element i holds the row
# indices of `df` that are NOT in fold i (fold i is the hold-out set).
get.folds <- function(df, k=5){
  tmp <- cvFolds(nrow(df), K=k)
  folds <- lapply(seq_along(unique(tmp$which)),
                  function(x) tmp$subsets[which(tmp$which!=x),])
  return(folds)
}

# k-fold cross-validation of a bayesian linear regression fit with
# rethinking::map(), using gaussian coefficient priors of width
# `prior.stddev` (built by make.map.formula).
#
#   X             predictor matrix (no intercept column)
#   y             one-column data.frame holding the response
#   design.matrix model.matrix including the intercept column
#   basealist     alist with likelihood / mu / sigma / intercept prior
#   prior.stddev  standard deviation of the slope priors
#   startlist     starting values for map()
#   k             number of CV folds
#   parallel      fit folds via parallel::mclapply (honors mc.cores option)
#   show.result   print a banner with the CV MSE estimate
#
# Returns list(coefs=, cv.mse=, hdi=): full-data coefficient matrix,
# the posterior-mean CV MSE, and its 95% HPDI.
pevaluate <- function(X, y, design.matrix, basealist, prior.stddev,
                      startlist, k=5, parallel=TRUE, show.result=TRUE){
  if(parallel){
    # namespace-qualified: `parallel` is never attached by the script
    # (it is only referenced via parallel::detectCores in options())
    mapcarfn <- function(...){ unlist(parallel::mclapply(...)) }
  } else{
    mapcarfn <- sapply
  }
  df <- cbind(X, y)
  form <- make.map.formula(basealist, ncol(X), prior.stddev)
  folds <- get.folds(df, k=k)
  # one MSE per fold: fit on the training rows, score on the held-out rows
  mserrors <- mapcarfn(seq_along(folds),
                       function(x){
                         rows <- folds[[x]]  # training rows for this fold
                         fit <- map(form, data=df[rows,], start=startlist)
                         # coef(fit) ends with sigma: drop it, keep betas
                         # (assumes coef order matches design.matrix
                         #  columns -- TODO confirm)
                         best.bayes.coefs <- coef(fit)[-length(coef(fit))] %>% matrix
                         pred.bayes <- design.matrix %*% best.bayes.coefs
                         the.errors <- pred.bayes - y[,1]
                         serrors <- the.errors[-rows]^2  # held-out rows only
                         return(mean(serrors))
                       })
  # model the per-fold MSEs themselves to get a posterior for the mean
  # CV error (and hence an interval around it)
  erfit <- map(alist(mserrors ~ dnorm(mu, stddev),
                     mu ~ dnorm(0,100),
                     stddev ~ dunif(0,100)),
               data=data.frame(mserrors=mserrors),
               start=list(mu=8, stddev=10))
  emean <- mean(extract.samples(erfit)$mu)
  hdi95 <- HPDI(extract.samples(erfit)$mu, prob=.95)
  # refit on the full data for the coefficients that get reported
  fullfit <- map(form, data=df, start=startlist)
  best.bayes.coefs <- coef(fullfit)[-length(coef(fullfit))] %>% matrix
  retlist <- list(coefs=best.bayes.coefs, cv.mse=emean, hdi=hdi95)
  if(show.result){
    cat(sprintf("\n%s\n LOOCV MSE ESTIMATE FOR PRIOR WIDTH of %.3f: %.3f\n%s\n",
                paste0(rep("*", 55), collapse=""),
                prior.stddev,
                emean,
                paste0(rep("*", 55), collapse="")))
  }
  return(retlist)
}
#==================================================#
#==================================================#
#=============        MTCARS         =============#
#==================================================#
#==================================================#

# First, use ridge regression (glmnet with alpha=0) to get a
# reference point: the CV error of the best penalized-MLE fit.

# standardize every column, then put `mpg` back on its raw scale
# so that only the predictors end up standardized
mtstd <- mtcars %>% lapply(scale) %>% data.frame
row.names(mtstd) <- row.names(mtcars)
mtstd$mpg <- mtcars$mpg

design.matrix <- model.matrix(mpg ~ ., data=mtstd)

X <- design.matrix[,-1]        # predictors (intercept column removed)
y <- mtstd[, 1, drop=FALSE]    # response kept as a one-column data.frame

cvfits <- cv.glmnet(X, y[,1], alpha=0, nfolds=10)
# plot(cvfits)

loc <- which(cvfits$lambda==cvfits$lambda.min)
best.coefs <- coef(cvfits, s="lambda.min")
mse <- cvfits$cvm[loc]         # 7.336113

####### messing around
basealist <- alist(mpg ~ dnorm(mu, sigma),
                   mu <- b0 + b1*cyl + b2*disp + b3*hp + b4*drat + b5*wt + b6*qsec + b7*vs + b8*am + b9*gear + b10*carb,
                   sigma ~ dunif(0,100),
                   b0 ~ dnorm(20, 100))
form <- make.map.formula(basealist, 10, 5)

startlist <- list(b0=20, b1=0, b2=0, b3=0, b4=0,
                  b5=0, b6=0, b7=0, b8=0, b9=0,
                  b10=0, sigma=30)

# full-data bayesian fit with prior width 5, as a sanity check
this <- map(form, data=mtstd, start=startlist)
best.bayes.coefs <- coef(this)[-length(coef(this))] %>% matrix
preds.bayes <- design.matrix %*% best.bayes.coefs
serrors <- (preds.bayes - mtstd$mpg)^2
mse <- mean((preds.bayes - mtstd$mpg)^2) # 4.622117
mse

# quick bayesian estimate of the mean squared error itself
tmp <- map(alist(serrors ~ dnorm(mu, stddev),
                 mu ~ dnorm(5, 10),
                 stddev ~ dunif(0, 20)),
           data=data.frame(serrors=serrors),
           start=list(mu=4, stddev=6))

# example
pevaluate(X, y, design.matrix, basealist, 0.575, startlist, k=10)
#--------------------------------------------------#
# 10-fold CV with mtcars predicting `mpg` from     #
# all other variables and gaussian priors of       #
# increasing precision                             #
#--------------------------------------------------#

prior.widths <- seq(0.05, 5, by=0.025)

results <- pblapply(prior.widths, function(x){
  tryCatch({
    pevaluate(X, y, design.matrix, basealist, x, startlist, k=10)
  }, error = function(e){
    # placeholder shaped like pevaluate's return value; the element
    # must be named `coefs` (not `coeffs`) and be a matrix so that
    # downstream reads of results[[i]]$coefs[-1,1] see NAs, not NULL
    return(list(coefs=matrix(rep(NA, 11)), cv.mse=NA, hdi=c(NA, NA)))
  })
})
MTCARSK10RESULTS <- results

tresults.cv <- lapply(results, function(x) x$cv.mse) %>% unlist
tresults.lower <- lapply(results, function(x) x$hdi[1]) %>% unlist
tresults.upper <- lapply(results, function(x) x$hdi[2]) %>% unlist
tresults.width <- prior.widths

tresults.df <- data.frame(mse=tresults.cv, width=tresults.width,
                          lower=tresults.lower,
                          upper=tresults.upper)

plot(mse ~ width, data=tresults.df, type="l", ylim=c(0, 22),
     main="10K CV MSE as a function of prior width of coefficients")
lines(upper ~ width, data=tresults.df, type="l", col="red")
lines(tresults.df$width, tresults.df$lower, type="l", col="red")
abline(v=0.775, lty=2)   # eyeballed location of the minimum


#--------------------------------------------------#
# LOOCV with mtcars predicting `mpg` from          #
# all other variables and gaussian priors of       #
# increasing precision                             #
#--------------------------------------------------#

prior.widths <- seq(0.05, 5, by=0.025)

# k=31 on mtcars' 32 rows: (almost) leave-one-out -- one fold gets
# two rows; TODO confirm k=32 wasn't intended for strict LOOCV
results <- pblapply(prior.widths, function(x){
  tryCatch({
    pevaluate(X, y, design.matrix, basealist, x, startlist, k=31)
  }, error = function(e){
    # same placeholder shape as the 10-fold sweep above
    return(list(coefs=matrix(rep(NA, 11)), cv.mse=NA, hdi=c(NA, NA)))
  })
})
MTCARSLOORESULTS <- results

# drop the 4 narrowest prior widths (those fits are unstable / failed)
tresults.cv <- lapply(results, function(x) x$cv.mse) %>% unlist %>% .[5:length(prior.widths)]
# drop the 4 narrowest prior widths, matching tresults.cv above
tresults.lower <- lapply(results, function(x) x$hdi[1]) %>% unlist %>% .[5:length(prior.widths)]
tresults.upper <- lapply(results, function(x) x$hdi[2]) %>% unlist %>% .[5:length(prior.widths)]
tresults.width <- prior.widths %>% .[5:length(prior.widths)]

tresults.df <- data.frame(mse=tresults.cv, width=tresults.width,
                          lower=tresults.lower,
                          upper=tresults.upper)

# prior width with the lowest estimated LOOCV MSE
minxintercept <- tresults.width[which.min(tresults.cv)]

plot(mse ~ width, data=tresults.df, type="l", ylim=c(0, 22),
     main="LOOCV MSE as a function of prior width of coefficients")
lines(upper ~ width, data=tresults.df, type="l", col="red")
lines(tresults.df$width, tresults.df$lower, type="l", col="red")
abline(v=minxintercept, lty=2)

loocv.plot <- ggplot(tresults.df, aes(width, mse)) +
  geom_ribbon(aes(ymin=lower, ymax=upper, alpha=0.5), fill = "grey70",
              show.legend=FALSE) +
  geom_line() +
  geom_vline(xintercept=minxintercept, linetype=2) +
  ylab("LOOCV MSE") +
  xlab("standard deviation of gaussian coefficient priors") +
  ggtitle('Bayesian "ridge regression" LOOCV MSE with different "penalties" (mtcars)')
print(loocv.plot)
# chaining ggsave() onto a plot with `+` is not supported by current
# ggplot2 (it errors when adding a non-layer object); save explicitly
ggsave("./plots/mtcars-loocv-mse.png", plot=loocv.plot)
ggsave("./plots/mtcars-loocv-mse.pdf", plot=loocv.plot)


# one wide data.frame per prior width: the bayesian coefficient value
# for each predictor (intercept dropped), plus the width itself
tmp <- lapply(seq_along(results),
              function(x){
                df <- data.frame(pred=names(mtcars)[-1],
                                 cvalue=results[[x]]$coefs[-1,1])
                df %>% spread(pred, cvalue) %>%
                  cbind(data.frame(width=prior.widths[x]), .)
              })
coefdf <- do.call(rbind, tmp)

# long format for plotting
# NOTE(review): gather() called with only key/value names gathers
# *every* column, `width` included -- this presumably relied on the
# tidyr version in use at the time; verify the resulting columns
# (pivot_longer(coefdf, -width) is the modern form of what the plot
# below expects: width / predictor / cvalue)
gcoefdf <- coefdf %>% gather(width, cvalue)
names(gcoefdf)[2] <- "predictor"

shrinkage.plot <- ggplot(gcoefdf, aes(x=width, y=cvalue, color=predictor)) +
  geom_line() +
  ylab("coefficient value") +
  xlab("standard deviation of gaussian coefficient priors") +
  geom_vline(xintercept=minxintercept, linetype=2, color="grey") +
  ggtitle('Coefficient shrinkage in bayesian "ridge regression" (mtcars)')
print(shrinkage.plot)
# ggsave() cannot be chained with `+`; save explicitly instead
ggsave("./plots/mtcars-coef-shrinkage.png", plot=shrinkage.plot)
ggsave("./plots/mtcars-coef-shrinkage.pdf", plot=shrinkage.plot)


# elastic-net (glmnet ridge) coefficients, reshaped for the overlay
netbest.coefs <- best.coefs %>% as.matrix %>% .[-1,] %>% data.frame

netbest.coefs$predictor <- row.names(best.coefs)[-1]
row.names(netbest.coefs) <- NULL
names(netbest.coefs)[1] <- "ncvalue"

agcoefdf <- gcoefdf %>% left_join(netbest.coefs)

# NOTE(review): same gather() caveat as above -- the positional
# renames below assume a 4-column layout; confirm before trusting
gagcoefdf <- agcoefdf %>% gather(width, predictor)
names(gagcoefdf)[3] <- "bayes_or_net"
gagcoefdf$bayes_or_net <- ifelse(gagcoefdf$bayes_or_net=="cvalue",
                                 "bayes", "elastic net")
names(gagcoefdf)[4] <- "value"

overlay.plot <- ggplot(gagcoefdf,
                       aes(x=width, y=value, color=predictor,
                           linetype=bayes_or_net)) +
  geom_line() +
  ylab("coefficient value") +
  xlab("standard deviation of gaussian coefficient priors") +
  geom_vline(xintercept=minxintercept, linetype=2, color="grey") +
  ggtitle('Coefficient shrinkage in bayesian "ridge regression" (mtcars)')
print(overlay.plot)
# explicit saves instead of the unsupported `+ ggsave()` chaining
ggsave("./plots/mtcars-coef-shrinkage-net-overlay.png", plot=overlay.plot)
ggsave("./plots/mtcars-coef-shrinkage-net-overlay.pdf", plot=overlay.plot)
https://raw.githubusercontent.com/tonyfischetti/bayesian-regularization/cf5f9bd937fdfaa5ad1b2dab4cbdfc9847df3d1c/l2-regularization/plots/mtcars-coef-shrinkage-net-overlay.pdf -------------------------------------------------------------------------------- /l2-regularization/plots/mtcars-coef-shrinkage-net-overlay.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tonyfischetti/bayesian-regularization/cf5f9bd937fdfaa5ad1b2dab4cbdfc9847df3d1c/l2-regularization/plots/mtcars-coef-shrinkage-net-overlay.png -------------------------------------------------------------------------------- /l2-regularization/plots/mtcars-coef-shrinkage.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tonyfischetti/bayesian-regularization/cf5f9bd937fdfaa5ad1b2dab4cbdfc9847df3d1c/l2-regularization/plots/mtcars-coef-shrinkage.pdf -------------------------------------------------------------------------------- /l2-regularization/plots/mtcars-coef-shrinkage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tonyfischetti/bayesian-regularization/cf5f9bd937fdfaa5ad1b2dab4cbdfc9847df3d1c/l2-regularization/plots/mtcars-coef-shrinkage.png -------------------------------------------------------------------------------- /l2-regularization/plots/mtcars-loocv-mse.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tonyfischetti/bayesian-regularization/cf5f9bd937fdfaa5ad1b2dab4cbdfc9847df3d1c/l2-regularization/plots/mtcars-loocv-mse.pdf -------------------------------------------------------------------------------- /l2-regularization/plots/mtcars-loocv-mse.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tonyfischetti/bayesian-regularization/cf5f9bd937fdfaa5ad1b2dab4cbdfc9847df3d1c/l2-regularization/plots/mtcars-loocv-mse.png --------------------------------------------------------------------------------