├── .gitattributes
├── .gitignore
├── .nojekyll
├── BayesOpt.R
├── BayesOpt.Rproj
├── README.md
├── bibliography.bib
├── draft.Rmd
├── images
│   ├── bayesopt_cb.gif
│   ├── bayesopt_cb.png
│   ├── bayesopt_ei.gif
│   ├── bayesopt_ei.png
│   ├── bayesopt_poi.gif
│   ├── bayesopt_poi.png
│   └── torch_adam.gif
└── index.html

/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# History files
.Rhistory
.Rapp.history

# Session Data files
.RData

# Example code in package build process
*-Ex.R

# Output files from R CMD build
/*.tar.gz

# Output files from R CMD check
/*.Rcheck/

# RStudio files
.Rproj.user/

# produced vignettes
vignettes/*.html
vignettes/*.pdf

# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
.httr-oauth

# knitr and R markdown default cache directories
/*_cache/
/cache/

# Temporary files created by R markdown
*.utf8.md
*.knit.md

# Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html
rsconnect/
context-cards.js
--------------------------------------------------------------------------------
/.nojekyll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bearloga/bayesopt-tutorial-r/492999d1313c36efd3e3629e9bde3ea0afebfd00/.nojekyll
--------------------------------------------------------------------------------
/BayesOpt.R:
--------------------------------------------------------------------------------
# library(magrittr)
library(zeallot)
library(glue)
library(GPfit)
library(animation)

# Goal: find the x which minimizes f
f <- function(x) {
  return((6 * x - 2)^2 * sin(12 * x - 4))
}

c(max_evals, seed_evals) %<-% c(8, 4)

# par(mfrow = c(1, 1), mar = c(5.1, 4.1, 4.1, 2.1))
# curve(f(x), x_min, x_max)
# points(evaluations[1:seed_evals, ], pch = 16)

# evaluations of f:
evaluations <- matrix(
  as.numeric(NA),
  ncol = 2, nrow = seed_evals,
  dimnames = list(NULL, c("x", "y"))
)
# seed with a few evaluations:
evaluations[1:seed_evals, "x"] <- seq(0, 1, length.out = seed_evals)
evaluations[1:seed_evals, "y"] <- f(evaluations[1:seed_evals, "x"])

set.seed(0)
bayesian_optimize <- function(optimize_function, init_evals, max_iter, acquisition_function, minimize = TRUE, control = NULL) {
  # expand to hold additional evaluations:
  evaluations <- rbind(init_evals, matrix(
    as.numeric(NA),
    ncol = 2, nrow = max_iter,
    dimnames = list(NULL, c("x", "y"))
  ))

  if (is.null(control)) {
    control <- list(cov = list(type = "exponential", power = 1.95))
    # control <- list(cov = list(type = "matern", nu = 5/2))
  }
  if (acquisition_function == "cb") {
    if (is.null(control$kappa)) {
      control$kappa <- 2
    }
  }
  x_new <- seq(0, 1, length.out = 100) # potential x's to evaluate

  for (eval_iter in (nrow(init_evals) + 1):(nrow(init_evals) + max_iter)) {

    # step 1: fit a GP model to all points evaluated so far
    fit <- GP_fit(
      X = evaluations[1:(eval_iter - 1), "x"],
      Y = evaluations[1:(eval_iter - 1), "y"],
      corr = control$cov
    )

    predictions <- predict.GP(fit, xnew = data.frame(x = x_new))
    mu <- predictions$Y_hat
    sigma <- sqrt(predictions$MSE)

    if (minimize) {
      y_best <- min(evaluations[, "y"], na.rm = TRUE)
    } else {
      y_best <- max(evaluations[, "y"], na.rm = TRUE)
    }

    # step 2: compute the acquisition function and propose the next point
    if (acquisition_function == "poi") {
      # Probability of improvement:
      acquisition <- purrr::map2_dbl(mu, sigma, function(m, s) {
        if (s == 0) return(0)
        else return(pnorm((y_best - m) / s))
      })
      if (!minimize) {
        acquisition <- 1 - acquisition
      }
      x_next <- x_new[which.max(acquisition)]
      plot(x_new, acquisition, type = "l", col = "red", ylim = c(0, 1), xlab = "x", ylab = expression("a"["POI"]))
    } else if (acquisition_function == "ei") {
      # Expected improvement:
      acquisition <- purrr::map2_dbl(mu, sigma, function(m, s) {
        if (s == 0) return(0)
        gamma <- (y_best - m) / s
        if (!minimize) gamma <- -gamma # improvement is in the other direction when maximizing
        phi <- pnorm(gamma)
        return(s * (gamma * phi + dnorm(gamma)))
      })
      x_next <- x_new[which.max(acquisition)]
      plot(x_new, acquisition, type = "l", col = "red", xlab = "x", ylab = expression("a"["EI"]))
    } else if (acquisition_function == "cb") {
      # GP upper/lower confidence bound:
      if (minimize) {
        acquisition <- mu - control$kappa * sigma
        x_next <- x_new[which.min(acquisition)]
        plot(x_new, acquisition, type = "l", col = "red", xlab = "x", ylab = expression("a"["LCB"]))
      } else {
        acquisition <- mu + control$kappa * sigma
        x_next <- x_new[which.max(acquisition)]
        plot(x_new, acquisition, type = "l", col = "red", xlab = "x", ylab = expression("a"["UCB"]))
      }
    } else {
      stop("acquisition_function must be 'poi', 'ei', or 'cb'")
    }

    abline(v = x_next, lty = "dashed", col = "red", lwd = 2)
    acquisition_function_label <- switch(
      acquisition_function,
      "poi" = "probability of improvement",
      "ei" = "expected improvement",
      "cb" = paste("GP", ifelse(minimize, "lower", "upper"), "confidence bound")
    )
    legend("topleft", glue("proposal via {acquisition_function_label}"), bty = "n", col = "red", lty = "dashed", lwd = 2)

    # Visualize hidden function and GP fit:
    curve(optimize_function(x), 0, 1, lwd = 1.5)
    lines(x_new, mu, col = "blue", lwd = 2, lty = "dotted")
    polygon(c(x_new, rev(x_new)), c(mu + sigma, rev(mu - sigma)), col = rgb(0, 0, 1, 0.25), border = NA)
    points(evaluations, pch = 16)
    points(evaluations[(eval_iter - 1), "x"], evaluations[(eval_iter - 1), "y"], pch = 16, col = "red")
    abline(v = x_next, lty = "dashed", col = "red", lwd = 2)
    legend("topleft", "most recent evaluation", bty = "n", col = "red", pch = 16)

    y_next <- optimize_function(x_next)
    evaluations[eval_iter, ] <- c(x_next, y_next)
  }

  return(list(x = x_next, y = y_next))
}

# Visualize | mar = c(5.1, 4.1, 4.1, 2.1) (bottom, left, top, right)
purrr::walk(c("poi", "ei", "cb"), function(af) {
  # Static images:
  png(glue("bayesopt_{af}.png"), width = 12, height = 12, units = "in", res = 300)
  par(mfrow = c(4, 2), mar = c(4.1, 4.1, 0.5, 0.5), cex = 1.1)
  bayesian_optimize(f, evaluations, max_evals - seed_evals, af)
  dev.off()
  # Animated GIF:
  saveGIF(
    {
      par(mfrow = c(1, 2), mar = c(4.1, 4.1, 0.5, 0.5), cex = 1.1)
      bayesian_optimize(f, evaluations, max_evals - seed_evals, af)
    },
    glue("bayesopt_{af}.gif"), nmax = 4, loop = TRUE, interval = 1.5,
    ani.width = 900, ani.height = 300, ani.dev = "png",
    autobrowse = FALSE
  )
})

# Optimization with gradient descent in {torch}:
library(torch)
library(animation)

saveGIF(
  {
    x <- torch_zeros(1, requires_grad = TRUE)
    f <- function(x) (6 * x - 2)^2 * torch_sin(12 * x - 4)

    optimizer <- optim_adam(x, lr = 0.25)

    par(mfrow = c(1, 1), mar = c(4.1, 4.1, 0.5, 0.5), cex = 2)
    for (i in 1:50) {

      curve(
        (6 * x - 2)^2 * sin(12 * x - 4),
        from = 0, to = 1,
        xlab = "x", ylab = "f(x)", lwd = 2
      )

      loss <- f(x)

      y <- as.numeric(loss)
      points(as.numeric(x), y, cex = 2, col = "red", pch = 16)
      points(as.numeric(x), y, cex = 2, col = "black", lwd = 2)

      optimizer$zero_grad()
      loss$backward()
      optimizer$step()
    }
  },
  "torch_adam.gif",
  loop = TRUE, autobrowse = FALSE,
  ani.width = 900, ani.height = 600, ani.dev = "png",
  interval = 0.2
)
--------------------------------------------------------------------------------
/BayesOpt.Rproj:
--------------------------------------------------------------------------------
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: knitr
LaTeX: pdfLaTeX

AutoAppendNewline: Yes
StripTrailingWhitespace: Yes
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Tutorial on Bayesian optimization in R

![Results after letting BayesOpt choose 4 more points to evaluate via their expected improvement](images/bayesopt_ei.gif)

----

## GIFs

Software:

- [ImageMagick](https://imagemagick.org/script/download.php#windows)

R packages:

``` r
install.packages(c("remotes", "magick", "gganimate"))
# remotes::install_github("r-rust/gifski") # gif.ski/
```
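
The GIFs in this repository are generated by `animation::saveGIF()` in `BayesOpt.R`, which shells out to ImageMagick. If you would rather assemble frames yourself, here is a minimal sketch using {magick} -- the `frame_*.png` filenames are hypothetical placeholders, not files in this repo:

``` r
library(magick)

# read per-iteration frames (placeholder names), then animate at 1 frame/sec
frames <- image_read(c("frame_01.png", "frame_02.png", "frame_03.png"))
gif <- image_animate(frames, fps = 1, loop = 0) # loop = 0 means repeat forever
image_write(gif, "bayesopt.gif")
```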
--------------------------------------------------------------------------------
/bibliography.bib:
--------------------------------------------------------------------------------
%% This BibTeX bibliography file was created using BibDesk.
%% https://bibdesk.sourceforge.io/

%% Created for Mikhail Popov at 2019-09-30 09:59:47 -0400


%% Saved with string encoding Unicode (UTF-8)



@misc{Fb2019,
  Author = {Bakshy, Eytan and Balandat, Max and Kashin, Kostya},
  Month = {May},
  Note = {Accessed on 2019-09-30},
  Title = {Open-sourcing Ax and BoTorch: New AI tools for adaptive experimentation},
  Url = {https://ai.facebook.com/blog/open-sourcing-ax-and-botorch-new-ai-tools-for-adaptive-experimentation/},
  Year = {2019},
  Bdsk-Url-1 = {https://ai.facebook.com/blog/open-sourcing-ax-and-botorch-new-ai-tools-for-adaptive-experimentation/}}

@misc{Shi2019,
  Author = {Yuge Shi},
  Month = {September},
  Note = {Accessed on 2019-09-30},
  Title = {Gaussian Process, not quite for dummies},
  Url = {https://yugeten.github.io/posts/2019/09/GP/},
  Year = {2019},
  Bdsk-Url-1 = {https://yugeten.github.io/posts/2019/09/GP/}}

@misc{Zhu2019,
  Author = {Hao Zhu},
  Month = {September},
  Note = {Accessed on 2019-09-30},
  Title = {Implementing Gaussian Process in Python and R},
  Url = {https://zhuhao.org/post/gp_python_r/},
  Year = {2019},
  Bdsk-Url-1 = {https://zhuhao.org/post/gp_python_r/}}

@article{Snoek2012vl,
  Author = {Snoek, Jasper and Larochelle, Hugo and Adams, Ryan P},
  Eprintclass = {stat.ML},
  Month = jun,
  Title = {Practical Bayesian Optimization of Machine Learning Algorithms},
  Url = {https://arxiv.org/abs/1206.2944},
  Year = {2012},
  Bdsk-Url-1 = {https://arxiv.org/abs/1206.2944}}

@article{Frazier2018id,
  Author = {Frazier, Peter I},
  Eprint = {1807.02811v1},
  Eprinttype = {arxiv},
  Journal = {arXiv.org},
  Month = jul,
  Title = {A Tutorial on Bayesian Optimization},
  Url = {https://arxiv.org/abs/1807.02811},
  Year = {2018},
  Bdsk-Url-1 = {https://arxiv.org/abs/1807.02811}}

@article{Shahriari2016je,
  Author = {Shahriari, Bobak and Swersky, Kevin and Wang, Ziyu and Adams, Ryan P and de Freitas, Nando},
  Journal = {Proceedings of the IEEE},
  Number = {1},
  Pages = {148--175},
  Title = {Taking the Human Out of the Loop: A Review of Bayesian Optimization},
  Volume = {104},
  Year = {2016}}

@article{Letham2018ep,
  Author = {Letham, Benjamin and Karrer, Brian and Ottoni, Guilherme and Bakshy, Eytan},
  Eprint = {1706.07094v2},
  Eprinttype = {arxiv},
  Journal = {arXiv.org},
  Month = aug,
  Title = {Constrained Bayesian Optimization with Noisy Experiments},
  Url = {https://arxiv.org/abs/1706.07094},
  Year = {2018},
  Bdsk-Url-1 = {https://arxiv.org/abs/1706.07094}}

@article{Gortler2019a,
  Author = {G{\"o}rtler, Jochen and Kehlbeck, Rebecca and Deussen, Oliver},
  Doi = {10.23915/distill.00017},
  Journal = {Distill},
  Note = {https://distill.pub/2019/visual-exploration-gaussian-processes},
  Title = {A Visual Exploration of Gaussian Processes},
  Url = {https://doi.org/10.23915/distill.00017},
  Year = {2019},
  Bdsk-Url-1 = {https://doi.org/10.23915/distill.00017}}

@manual{r-base,
  Address = {Vienna, Austria},
  Author = {{R Core Team}},
  Organization = {R Foundation for Statistical Computing},
  Title = {R: A Language and Environment for Statistical Computing},
  Url = {https://www.R-project.org/},
  Year = {2018},
  Bdsk-Url-1 = {https://www.R-project.org/}}

@article{r-GPfit,
  Author = {Blake MacDonald and Pritam Ranjan and Hugh Chipman},
  Journal = {Journal of Statistical Software},
  Number = {12},
  Pages = {1--23},
  Title = {{GPfit}: An {R} Package for Fitting a Gaussian Process Model to Deterministic Simulator Outputs},
  Url = {https://www.jstatsoft.org/v64/i12/},
  Volume = {64},
  Year = {2015},
  Bdsk-Url-1 = {https://www.jstatsoft.org/v64/i12/}}

@article{r-animation,
  Author = {Yihui Xie},
  Journal = {Journal of Statistical Software},
  Number = {1},
  Pages = {1--27},
  Title = {{animation}: An {R} Package for Creating Animations and Demonstrating Statistical Methods},
  Url = {https://www.jstatsoft.org/v53/i01/},
  Volume = {53},
  Year = {2013},
  Bdsk-Url-1 = {https://www.jstatsoft.org/v53/i01/}}

@manual{r-torch,
  Author = {Daniel Falbel and Javier Luraschi},
  Note = {R package version 0.2.1},
  Title = {torch: Tensors and Neural Networks with 'GPU' Acceleration},
  Url = {https://CRAN.R-project.org/package=torch},
  Year = {2021}}

@article{Adam,
  Author = {Kingma, Diederik P and Ba, Jimmy},
  Eprint = {arXiv:1412.6980v9},
  Eprintclass = {cs.LG},
  Eprinttype = {arxiv},
  Journal = {arXiv.org},
  Month = dec,
  Title = {Adam: A Method for Stochastic Optimization},
  Url = {https://arxiv.org/abs/1412.6980},
  Year = {2014},
  Bdsk-Url-1 = {https://arxiv.org/abs/1412.6980}}

@article{Chen2018ta,
  Author = {Chen, Yutian and Huang, Aja and Wang, Ziyu and Antonoglou, Ioannis and Schrittwieser, Julian and Silver, David and de Freitas, Nando},
  Eprint = {1812.06855v1},
  Eprintclass = {cs.LG},
  Eprinttype = {arxiv},
  Journal = {arXiv.org},
  Month = dec,
  Title = {Bayesian Optimization in AlphaGo},
  Url = {https://arxiv.org/abs/1812.06855},
  Year = {2018},
  Bdsk-Url-1 = {https://arxiv.org/abs/1812.06855}}

@book{Forrester,
  Author = {Forrester, A. and Sobester, A. and Keane, A.},
  Publisher = {Wiley},
  Title = {Engineering design via surrogate modelling: a practical guide},
  Year = {2008}}
--------------------------------------------------------------------------------
/draft.Rmd:
--------------------------------------------------------------------------------
---
title: "A tutorial on Bayesian optimization in R"
description: |
  Step-by-step demonstration of BayesOpt for derivative-free minimization of a noiseless, black-box function
author:
  - name: Mikhail Popov
    url: https://mpopov.com
date: 2019-05-19
bibliography: bibliography.bib
repository_url: https://github.com/bearloga/bayesopt-tutorial-r
creative_commons: CC BY-ND
output:
  distill::distill_article:
    toc: true
    toc_depth: 2
---

```{r setup, include=FALSE}
library(knitr)
opts_chunk$set(
  echo = FALSE,
  dev = "svg"
)
```
```{r packages}
library(purrr)
library(gt)
```
```{css}
a.wiki-preview {
  color: #0645ad;
  text-decoration: none;
  border-bottom: 1px dotted #0645ad;
}
.wiki-preview::after {
  font-family: serif;
  content: " W";
  vertical-align: super;
  font-size: 6pt;
}
```



# Introduction

Optimizing a function $f$ means finding the input value $\mathbf{x}_*$ which minimizes (or maximizes) the output value:

$$
\mathbf{x}_* = \underset{\mathbf{x}}{\arg\min}~f(\mathbf{x})
$$

In this tutorial we will optimize $f(x) = (6x-2)^2~\sin(12x-4)$[@Forrester], which looks like this when $x \in [0, 1]$:

```{r curve}
par(mar = c(4.1, 4.1, 0.5, 0.5), cex = 1.1)
curve(
  (6 * x - 2)^2 * sin(12 * x - 4),
  from = 0, to = 1,
  xlab = "x", ylab = "f(x)", lwd = 2
)
```

The ideal scenario is that $f$ is known, has a closed, analytical form, and is differentiable -- which would enable us to use gradient descent-based algorithms. For example, here's how we might optimize it with Adam[@Adam] in {torch}[@r-torch]:

```{r torch, eval=FALSE, echo=TRUE}
library(torch)

x <- torch_zeros(1, requires_grad = TRUE)
f <- function(x) (6 * x - 2) ^ 2 * torch_sin(12 * x - 4)

optimizer <- optim_adam(x, lr = 0.25)

for (i in 1:50) {
  y <- f(x)
  optimizer$zero_grad()
  y$backward()
  optimizer$step()
}
```
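
After the loop, `x` has been nudged toward a minimum; as a small sketch, the result can be pulled back into plain R vectors afterwards (the same `as.numeric()` conversion used to draw the animation below):

``` r
x_best <- as.numeric(x)    # approximate minimizer after 50 Adam steps
f_best <- as.numeric(f(x)) # objective value at that point
```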

![Optimization using Adam in {torch}](images/torch_adam.gif)

But that's not always the case. Maybe we don't have a derivative to work with and the evaluation of the function is expensive -- hours to train a model or weeks to do an A/B test. Bayesian optimization (BayesOpt) is one algorithm that helps us perform derivative-free optimization of black-box functions.

# Algorithm

The BayesOpt algorithm for $N$ maximum evaluations can be described using the following pseudocode[@Frazier2018id]:

```
Place Gaussian process prior on 'f'
Observe 'f' at n0 initial points; set n = n0
while n ≤ N do:
    Update posterior on 'f' using all available data
    Compute acquisition function 'a' using posterior
    Let x* be the value which maximizes 'a'
    Observe f(x*)
    Increment n
end while
Return x for which f(x) was at its best
```

We seed the algorithm with a few initial evaluations and then proceed to sequentially find and evaluate new values, chosen based on some acquisition function, until we've exhausted the number of attempts we're allowed to make.
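
Translated into R-flavored pseudocode, the loop looks like this -- note that `fit_gp()` and `acquisition()` are hypothetical placeholders standing in for the real functions introduced below:

``` r
# sketch only: fit_gp() and acquisition() are placeholders, not real functions
n <- n0
while (n <= N) {
  posterior <- fit_gp(evaluations)          # update posterior on f
  a <- acquisition(posterior, x_candidates) # compute acquisition function
  x_star <- x_candidates[which.max(a)]      # maximize a
  evaluations <- rbind(evaluations, c(x_star, f(x_star)))
  n <- n + 1
}
```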

## Acquisition functions

Let $y_\text{best}$ be the best observed value of $f_n$ (the $n$ evaluations of $f$). How do we choose the next value at which to evaluate $f$? We use an *acquisition function* to guide our choice. There are three major acquisition functions out there, each with its own pros and cons:

1. **Probability of improvement** (least popular): $a_\text{POI}(x)$ measures the probability that a point $x$ will lead to an improvement over $y_\text{best}$
2. **Expected improvement** (most popular): $a_\text{EI}$ also incorporates the amount of improvement over $y_\text{best}$
3. **GP lower confidence bound** (newest of the three): $a_\text{LCB}$ (*upper* in case of maximization) balances *exploitation* (points with best expected value) against *exploration* (points with high uncertainty).

In the sections below, each acquisition function will be formally introduced and we'll see how to implement it in R[@r-base].

# Implementation

We will use the **GPfit**[@r-GPfit] package for working with Gaussian processes.

```{r deps, echo=TRUE}
library(GPfit) # install.packages("GPfit")
```

The algorithm is executed in a loop:

```R
for (iteration in 1:max_iterations) {
  # step 1: fit GP model to evaluated points
  # step 2: calculate utility to find next point
}
```

```{r function, echo=TRUE}
f <- function(x) {
  return((6 * x - 2)^2 * sin(12 * x - 4))
}
```

We start with $n_0$ equally-spaced points between 0 and 1 on which to evaluate $f$ (without noise) and store these in a matrix `evaluations`:

```{r evaluations}
# seed with a few evaluations:
n0 <- 4
evaluations <- matrix(
  as.numeric(NA),
  ncol = 2, nrow = n0,
  dimnames = list(NULL, c("x", "y"))
)
evaluations[, "x"] <- seq(0, 1, length.out = n0)
evaluations[, "y"] <- f(evaluations[, "x"])

evaluations %>%
  as.data.frame %>%
  gt() %>%
  cols_label(y = "f(x)") %>%
  fmt_number(vars(x, y), decimals = 3) %>%
  tab_header(
    "Initial evaluations"
  )
```

## GP model fitting

In this example we are going to employ the popular choice of the power exponential correlation function, but the Matérn correlation function `list(type = "matern", nu = 5/2)` may also be used.



```{r fit, echo=TRUE}
fit <- GP_fit(
  X = evaluations[, "x"],
  Y = evaluations[, "y"],
  corr = list(type = "exponential", power = 1.95)
)
```
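
If you prefer the Matérn alternative mentioned above, the fit would look like this (a sketch; the rest of the tutorial keeps the power exponential fit):

``` r
fit_matern <- GP_fit(
  X = evaluations[, "x"],
  Y = evaluations[, "y"],
  corr = list(type = "matern", nu = 5/2)
)
```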

Now that we have a fitted GP model, we can calculate the expected value $\mu(x)$ at each possible value of $x$ and the corresponding uncertainty $\sigma(x)$. These will be used when computing the acquisition functions over the possible values of $x$.

```{r pred, echo=TRUE}
x_new <- seq(0, 1, length.out = 100)
pred <- predict.GP(fit, xnew = data.frame(x = x_new))
mu <- pred$Y_hat
sigma <- sqrt(pred$MSE)
```

```{r, fig.width=8, fig.height=4}
plot_posterior <- function() {
  plot(
    x_new, mu,
    type = "l", col = "blue", lwd = 2, lty = "dotted",
    ylim = c(-10, 20),
    xlab = "x", ylab = "f(x)", main = "Posterior of f"
  )
  polygon(
    c(x_new, rev(x_new)),
    c(mu + sigma, rev(mu - sigma)),
    col = rgb(0, 0, 1, 0.25), border = NA
  )
  points(evaluations, pch = 16)
  legend(
    "topleft",
    c(expression(f[n[0]]), expression(mu(x)), expression(mu(x) %+-% sigma(x))),
    col = c("black", "blue", "blue"), pch = c(16, NA, NA),
    lty = c(NA, "dotted", NA), lwd = c(NA, 2, 1), bty = "n",
    fill = c(NA, NA, rgb(0, 0, 1, 0.25)),
    border = c(NA, NA, NA), ncol = 3, text.width = 0.1
  )
}
par(cex = 1.1, mfrow = c(1, 1), mar = c(5.1, 4.1, 4.1, 2.1))
plot_posterior()
```

## Calculating utility

As mentioned before, suppose $y_\text{best}$ is the best evaluation we have so far:

```{r y_best, echo=TRUE}
y_best <- min(evaluations[, "y"])
```

### Probability of improvement

This utility measures the probability of improving upon $y_\text{best}$, and -- since the posterior is Gaussian -- we can compute it analytically:

$$
a_\text{POI}(x) = \Phi\left(\frac{y_\text{best} - \mu(x)}{\sigma(x)}\right)
$$

where $\Phi$ is the standard normal cumulative distribution function. In R, it looks like this:

```{r probability_improvement, echo=TRUE}
probability_improvement <- map2_dbl(
  mu,
  sigma,
  function(m, s) {
    if (s == 0) return(0)
    else {
      poi <- pnorm((y_best - m) / s)
      # poi <- 1 - poi (if maximizing)
      return(poi)
    }
  }
)
```

```{r, layout="l-body-outset", fig.width=10, fig.height=5}
par(cex = 1.1, mfrow = c(1, 2))
plot(
  x_new, probability_improvement,
  type = "l", col = "red",
  ylim = c(0, 1), xlab = "x", ylab = expression("a"["POI"]),
  main = "Probability of improvement"
)
plot_posterior()
```

Using this acquisition function, the next point which should be evaluated is `x_new[which.max(probability_improvement)]`. After evaluating each new point, we repeat steps 1 and 2 until we have exhausted all tries:

![Results after letting BayesOpt choose 4 more points to evaluate via their probability of improvement](images/bayesopt_poi.gif)
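
To make "repeat steps 1 and 2" concrete, here is a minimal sketch of one such iteration, reusing only objects already defined above:

``` r
# evaluate the proposal, record it, then refit the GP (steps 1 and 2 again)
x_next <- x_new[which.max(probability_improvement)]
evaluations <- rbind(evaluations, c(x_next, f(x_next)))
fit <- GP_fit(
  X = evaluations[, "x"],
  Y = evaluations[, "y"],
  corr = list(type = "exponential", power = 1.95)
)
```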

### Expected improvement

Let $\gamma(x)$ be the quantity we used in $a_\text{POI}$:

$$
\gamma(x) = \frac{y_\text{best} - \mu(x)}{\sigma(x)}
$$

Building on probability of improvement, this utility also incorporates the amount of improvement:

$$
a_\text{EI} = \sigma(x)\left(\gamma(x) \Phi(\gamma(x)) + \mathcal{N}(\gamma(x); 0, 1)\right)
$$

In R, it looks like this:

```{r expected_improvement, echo=TRUE}
expected_improvement <- map2_dbl(
  mu, sigma,
  function(m, s) {
    if (s == 0) return(0)
    gamma <- (y_best - m) / s
    # gamma <- (m - y_best) / s (if maximizing)
    phi <- pnorm(gamma)
    return(s * (gamma * phi + dnorm(gamma)))
  }
)
```

```{r, layout="l-body-outset", fig.width=10, fig.height=5}
par(cex = 1.1, mfrow = c(1, 2))
plot(
  x_new, expected_improvement,
  type = "l", col = "red",
  xlab = "x", ylab = expression("a"["EI"]),
  main = "Expected improvement"
)
plot_posterior()
```

Using this acquisition function, the next point which should be evaluated is `x_new[which.max(expected_improvement)]`. After evaluating each new point, we repeat steps 1 and 2 until we have exhausted all tries:

![Results after letting BayesOpt choose 4 more points to evaluate via their expected improvement](images/bayesopt_ei.gif)

### GP lower confidence bound

As mentioned above, this utility enables us to control whether the algorithm prefers *exploitation* -- picking points which have the best expected values -- or *exploration* -- picking points with the highest uncertainty, which are more informative to evaluate. This balance is controlled by a tunable hyperparameter $\kappa$, and in R it looks like:

```{r lcb, echo=TRUE}
kappa <- 2 # tunable
lower_confidence_bound <- mu - kappa * sigma
# if maximizing: upper_confidence_bound <- mu + kappa * sigma
```
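
How to set $\kappa$ is a judgment call; as an illustrative sketch (0.5 and 4 below are arbitrary values, not recommendations), a larger $\kappa$ inflates the uncertainty term and pushes the proposal toward unexplored regions:

``` r
lcb_exploit <- mu - 0.5 * sigma # mostly trusts the posterior mean
lcb_explore <- mu - 4 * sigma   # mostly chases posterior uncertainty
c(x_new[which.min(lcb_exploit)], x_new[which.min(lcb_explore)])
```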

```{r, layout="l-body-outset", fig.width=10, fig.height=5}
par(cex = 1.1, mfrow = c(1, 2))
plot(
  x_new, lower_confidence_bound,
  type = "l", col = "red",
  xlab = "x", ylab = expression("a"["LCB"]),
  main = "GP lower confidence bound"
)
plot_posterior()
```

Using this acquisition function, the next point which should be evaluated is `x_new[which.min(lower_confidence_bound)]` (or `x_new[which.max(upper_confidence_bound)]` if maximizing). After evaluating each new point, we repeat steps 1 and 2 until we have exhausted all tries:

![Results after letting BayesOpt choose 4 more points to evaluate via their GP lower confidence bound](images/bayesopt_cb.gif)

# Closing thoughts

This was only a one-dimensional optimization example to show the key ideas and how one might implement them. If you are interested in using this algorithm to tune your models' parameters, I encourage you to check out [this documentation](http://pyro.ai/examples/bo.html) which describes how to perform Bayesian optimization with [Pyro](http://pyro.ai/) (the probabilistic programming language built on [PyTorch](https://pytorch.org/)); and [pyGPGO](https://pygpgo.readthedocs.io/en/latest/), which is a Bayesian optimization library for Python.

# Further reading

- [@Snoek2012vl] explains how BayesOpt may be used for automatic parameter tuning in machine learning
- [@Shahriari2016je] provides a comprehensive review of the algorithm and its applications
- [@Chen2018ta] shows how DeepMind used BayesOpt to tune AlphaGo during development
- **Gaussian processes**
  - [@Gortler2019a] is a visual exploration of Gaussian processes
  - [@Shi2019] covers GPs and covariance matrices
  - [@Zhu2019] is a walkthrough on implementing GPs in R and Python
- [@Letham2018ep] shows how Facebook uses BayesOpt to find the next set of parameter values to evaluate with online experiments (A/B tests)

**_Update 2019-09-30_**: not long after I published this tutorial, Facebook open-sourced PyTorch-based [BoTorch](https://www.botorch.org/) and "adaptive experimentation platform" [Ax](https://ax.dev/). See [@Fb2019] for more details.



# Acknowledgements {.appendix}

You might have noticed a few blue links with "W"s on this page. Those are links to the Wikipedia articles on those topics and if you hover over them, you will see a preview of the article. This is possible with the [ContextCards library](https://chimeces.com/context-cards/) developed by my coworker Joaquin over at Wikimedia, based on the [Popups extension for MediaWiki](https://www.mediawiki.org/wiki/Extension:Popups).

# TensorFlow {.appendix}

**_Update 2021-02-28_**: I have migrated the part about optimization from [{tensorflow}](https://tensorflow.rstudio.com/) to [{torch}](https://torch.mlverse.org/). For posterity, the (pre-TensorFlow 2.0.0) code was:

``` r
library(tensorflow)
sess = tf$Session()

x <- tf$Variable(0.0, trainable = TRUE)
f <- function(x) (6 * x - 2)^2 * tf$sin(12 * x - 4)

adam <- tf$train$AdamOptimizer(learning_rate = 0.3)
opt <- adam$minimize(f(x), var_list = x)

sess$run(tf$global_variables_initializer())

for (i in 1:20) sess$run(opt)
x_best <- sess$run(x)
```
--------------------------------------------------------------------------------
/images/bayesopt_cb.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bearloga/bayesopt-tutorial-r/492999d1313c36efd3e3629e9bde3ea0afebfd00/images/bayesopt_cb.gif
--------------------------------------------------------------------------------
/images/bayesopt_cb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bearloga/bayesopt-tutorial-r/492999d1313c36efd3e3629e9bde3ea0afebfd00/images/bayesopt_cb.png
--------------------------------------------------------------------------------
/images/bayesopt_ei.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bearloga/bayesopt-tutorial-r/492999d1313c36efd3e3629e9bde3ea0afebfd00/images/bayesopt_ei.gif
--------------------------------------------------------------------------------
/images/bayesopt_ei.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bearloga/bayesopt-tutorial-r/492999d1313c36efd3e3629e9bde3ea0afebfd00/images/bayesopt_ei.png
--------------------------------------------------------------------------------
/images/bayesopt_poi.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bearloga/bayesopt-tutorial-r/492999d1313c36efd3e3629e9bde3ea0afebfd00/images/bayesopt_poi.gif
--------------------------------------------------------------------------------
/images/bayesopt_poi.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bearloga/bayesopt-tutorial-r/492999d1313c36efd3e3629e9bde3ea0afebfd00/images/bayesopt_poi.png
--------------------------------------------------------------------------------
/images/torch_adam.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bearloga/bayesopt-tutorial-r/492999d1313c36efd3e3629e9bde3ea0afebfd00/images/torch_adam.gif
--------------------------------------------------------------------------------
/index.html:
--------------------------------------------------------------------------------
Redirecting to https://mpopov.com/tutorials/bayesopt-r/
--------------------------------------------------------------------------------