├── LICENSE
├── Makefile
├── README.md
├── biblio.bib
├── code
│   ├── .ipynb_checkpoints
│   │   └── Bayesian optimisation-checkpoint.ipynb
│   ├── Bayesian optimisation.ipynb
│   ├── fig1.pdf
│   ├── fig2.pdf
│   ├── fig3-ei.pdf
│   ├── fig3-pi.pdf
│   ├── fig3-ucb.pdf
│   ├── fig3.pdf
│   ├── fig4-0.pdf
│   ├── fig4-1.pdf
│   ├── fig4-2.pdf
│   ├── fig4-3.pdf
│   ├── fig4-4.pdf
│   └── fig4-5.pdf
├── nyu.jpg
├── slides.pdf
└── slides.tex
/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2016, Gilles Louppe
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of talk-bayesian-optimisation nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
slides:
	pdflatex -shell-escape slides
	bibtex slides
	pdflatex -shell-escape slides
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# talk-bayesian-optimisation

Talk on "Bayesian optimisation".
- Presented at the ATLAS ML workshop, CERN, March 2016
- Presented at the DIANA-HEP weekly meeting, CERN, April 2016

See `code/` for reproducing the figures.
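The figures are produced by the notebook in `code/`; as a quick orientation for readers without the notebook, here is a minimal sketch of the loop the slides describe, built on scikit-learn's Gaussian process regressor. The toy objective, the Matérn kernel, the grid-based acquisition optimisation, and the UCB `kappa` value are illustrative assumptions, not necessarily what the notebook actually uses.

```python
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern

def f(x):
    # Toy black-box objective standing in for an expensive function.
    return -np.sin(3 * x) - x ** 2 + 0.7 * x

rng = np.random.default_rng(0)
bounds = (-2.0, 2.0)
X_grid = np.linspace(*bounds, 500).reshape(-1, 1)

# Start from two random observations.
X = rng.uniform(*bounds, size=(2, 1))
y = f(X).ravel()

gp = GaussianProcessRegressor(kernel=Matern(nu=2.5), normalize_y=True)

for t in range(10):
    # 1. Fit the probabilistic model to the observations gathered so far.
    gp.fit(X, y)
    # 2. Optimise the cheap UCB acquisition on a grid; kappa trades off
    #    exploitation of the mean against exploration of the uncertainty.
    mu, sigma = gp.predict(X_grid, return_std=True)
    x_next = X_grid[np.argmax(mu + 1.96 * sigma)]
    # 3. Sample the expensive objective at the chosen point.
    X = np.vstack([X, x_next])
    y = np.append(y, f(x_next[0]))

x_best = X[np.argmax(y)][0]
```

In a real setting the grid search over `X_grid` would be replaced by a proper inner optimiser of the acquisition function, and `f` by the expensive simulation or training run.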
--------------------------------------------------------------------------------
/biblio.bib:
--------------------------------------------------------------------------------
@article{brochu2010tutorial,
  title={A tutorial on Bayesian optimization of expensive cost functions, with application to active user modeling and hierarchical reinforcement learning},
  author={Brochu, Eric and Cora, Vlad M and De Freitas, Nando},
  journal={arXiv preprint arXiv:1012.2599},
  year={2010}
}

@article{shahriari2016taking,
  title={Taking the human out of the loop: A review of {B}ayesian optimization},
  author={Shahriari, Bobak and Swersky, Kevin and Wang, Ziyu and Adams, Ryan P and de Freitas, Nando},
  journal={Proceedings of the IEEE},
  volume={104},
  number={1},
  pages={148--175},
  year={2016},
  publisher={IEEE}
}
--------------------------------------------------------------------------------
/code/fig1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/glouppe/talk-bayesian-optimisation/e467bdd44b14c19b292758195585be833a82b371/code/fig1.pdf
--------------------------------------------------------------------------------
/code/fig2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/glouppe/talk-bayesian-optimisation/e467bdd44b14c19b292758195585be833a82b371/code/fig2.pdf
--------------------------------------------------------------------------------
/code/fig3-ei.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/glouppe/talk-bayesian-optimisation/e467bdd44b14c19b292758195585be833a82b371/code/fig3-ei.pdf
--------------------------------------------------------------------------------
/code/fig3-pi.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/glouppe/talk-bayesian-optimisation/e467bdd44b14c19b292758195585be833a82b371/code/fig3-pi.pdf
--------------------------------------------------------------------------------
/code/fig3-ucb.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/glouppe/talk-bayesian-optimisation/e467bdd44b14c19b292758195585be833a82b371/code/fig3-ucb.pdf
--------------------------------------------------------------------------------
/code/fig3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/glouppe/talk-bayesian-optimisation/e467bdd44b14c19b292758195585be833a82b371/code/fig3.pdf
--------------------------------------------------------------------------------
/code/fig4-0.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/glouppe/talk-bayesian-optimisation/e467bdd44b14c19b292758195585be833a82b371/code/fig4-0.pdf
--------------------------------------------------------------------------------
/code/fig4-1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/glouppe/talk-bayesian-optimisation/e467bdd44b14c19b292758195585be833a82b371/code/fig4-1.pdf
--------------------------------------------------------------------------------
/code/fig4-2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/glouppe/talk-bayesian-optimisation/e467bdd44b14c19b292758195585be833a82b371/code/fig4-2.pdf
--------------------------------------------------------------------------------
/code/fig4-3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/glouppe/talk-bayesian-optimisation/e467bdd44b14c19b292758195585be833a82b371/code/fig4-3.pdf
--------------------------------------------------------------------------------
/code/fig4-4.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/glouppe/talk-bayesian-optimisation/e467bdd44b14c19b292758195585be833a82b371/code/fig4-4.pdf
--------------------------------------------------------------------------------
/code/fig4-5.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/glouppe/talk-bayesian-optimisation/e467bdd44b14c19b292758195585be833a82b371/code/fig4-5.pdf
--------------------------------------------------------------------------------
/nyu.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/glouppe/talk-bayesian-optimisation/e467bdd44b14c19b292758195585be833a82b371/nyu.jpg
--------------------------------------------------------------------------------
/slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/glouppe/talk-bayesian-optimisation/e467bdd44b14c19b292758195585be833a82b371/slides.pdf
--------------------------------------------------------------------------------
/slides.tex:
--------------------------------------------------------------------------------
\documentclass{beamer}

\usepackage[utf8]{inputenc}
\usepackage[english]{babel}
\usepackage{graphicx}
\usepackage{color}
\usepackage{natbib}
\usepackage{amssymb}
\usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage{caption}

\usepackage{amsmath}
\usepackage{tikz}
\usetikzlibrary{arrows,calc,tikzmark,shapes.misc}

\tikzset{every picture/.style=remember picture}
% Define a TikZ node for math content:
\newcommand{\mathnode}[2]{%
  \mathord{\tikz[baseline=(#1.base), inner sep = 0pt]{\node (#1) {$#2$};}}}

\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argmax}{arg\,max}

% Beamer layout
\hypersetup{colorlinks=True, citecolor=green, linkcolor=blue}

\let\oldbibitem=\bibitem
\renewcommand{\bibitem}[2][]{\label{#2}\oldbibitem[#1]{#2}}
\let\oldcite=\cite
\renewcommand\cite[1]{\hyperlink{#1}{\oldcite{#1}}}
\let\oldcitep=\citep
\renewcommand\citep[1]{\hyperlink{#1}{\oldcitep{#1}}}
\let\oldciteauthor=\citeauthor
\renewcommand\citeauthor[1]{\hyperlink{#1}{\oldciteauthor{#1}}}

\usetheme{boxes}
\beamertemplatenavigationsymbolsempty
\setbeamertemplate{sections/subsections in toc}[circle]
\setbeamertemplate{footline}[frame number]
\setbeamertemplate{itemize items}[circle]
\setbeamertemplate{itemize subitem}[square]

% Front slide
\title{{\bf Bayesian optimisation}}
\author{Gilles Louppe}
\date{April 11, 2016}

\begin{document}

\begin{frame}[plain]
  \titlepage
  \centering
  \includegraphics[height=1.5em]{nyu.jpg}
\end{frame}

\begin{frame}
  \frametitle{Problem statement}

  \begin{center}
    $$x^* = \argmax_x f(x)$$
  \end{center}

  \vspace{2em}

  Constraints:
  \begin{itemize}
  \item $f$ is a black box for which no closed form is known;
    \begin{itemize}
    \item gradients $\frac{df}{dx}$ are not available.
    \end{itemize}

  \item $f$ is expensive to evaluate;

  \item (optional) uncertainty on observations $y_i$ of $f$,
    \begin{itemize}
    \item e.g., $y_i = f(x_i) + \epsilon_i$ because of Poisson fluctuations.
    \end{itemize}
  \end{itemize}

  \vspace{2em}

  Goal: find $x^*$ while minimising the number of evaluations of $f$.
\end{frame}

\begin{frame}
  \frametitle{\color{red} Disclaimer}
  \begin{center}
    If you do not have these constraints, there is certainly a better optimisation algorithm than Bayesian optimisation.

    \vspace{3em}

    (e.g., L-BFGS-B, Powell's method (as in Minuit), etc.)
  \end{center}
\end{frame}

\begin{frame}
  \frametitle{Bayesian optimisation}

  For $t=1:T$:
  \begin{enumerate}
  \item Given observations $(x_i, y_i)$ for $i=1:t$, build a probabilistic model for the objective $f$.
    \begin{itemize}
    \item Integrate out all possible true functions, using Gaussian process regression.
    \end{itemize}

  \item Optimise a cheap utility function $u$, based on the posterior distribution, to select the next point to sample,
    $$x_{t+1} = \argmax_x u(x).$$
    Exploit the uncertainty to balance exploration against exploitation.

  \item Sample the next observation $y_{t+1}$ at $x_{t+1}$.
  \end{enumerate}

\end{frame}

\begin{frame}
  \frametitle{Where shall we sample next?}

  \begin{center}
    \includegraphics[width=\textwidth]{code/fig1.pdf}
  \end{center}
\end{frame}

\begin{frame}
  \frametitle{Build a probabilistic model for the objective function}
  \begin{center}
    \includegraphics[width=\textwidth]{code/fig2.pdf} \\
    This gives a posterior distribution over functions that could have generated the observed data.
  \end{center}
\end{frame}

\begin{frame}
  \frametitle{Acquisition functions}

  Acquisition functions $u(x)$ specify which sample $x$ should be tried next:

  \begin{itemize}
  \item Upper confidence bound:
    $\text{UCB}(x) = \mu_{GP}(x) + \kappa \sigma_{GP}(x)$;
  \item Probability of improvement:
    $\text{PI}(x) = P(f(x) \geq f(x_t^+) + \kappa)$;
  \item Expected improvement:
    $\text{EI}(x) = \mathbb{E}[\max(f(x) - f(x_t^+), 0)]$;
  \item ... and many others,
  \end{itemize}
  where $x_t^+$ is the best point observed so far.

  \vspace{1em}

  In most cases, acquisition functions provide knobs (e.g., $\kappa$) for
  controlling the exploration-exploitation trade-off:
  \begin{itemize}
  \item search in regions where $\mu_{GP}(x)$ is high (exploitation);
  \item probe regions where the uncertainty $\sigma_{GP}(x)$ is high (exploration).
  \end{itemize}

\end{frame}

\begin{frame}
  \frametitle{Plugging everything together ($t=0$)}
  \begin{center}
    \includegraphics[width=\textwidth]{code/fig4-0.pdf}\\
    $x_{t+1} = \argmax_{x} \text{UCB}(x)$
  \end{center}
\end{frame}

\begin{frame}
  \frametitle{... and repeat until convergence ($t=1$)}
  \begin{center}
    \includegraphics[width=\textwidth]{code/fig4-1.pdf}
  \end{center}
\end{frame}

\begin{frame}
  \frametitle{... and repeat until convergence ($t=2$)}
  \begin{center}
    \includegraphics[width=\textwidth]{code/fig4-2.pdf}
  \end{center}
\end{frame}

\begin{frame}
  \frametitle{... and repeat until convergence ($t=3$)}
  \begin{center}
    \includegraphics[width=\textwidth]{code/fig4-3.pdf}
  \end{center}
\end{frame}

\begin{frame}
  \frametitle{...
and repeat until convergence ($t=4$)}
  \begin{center}
    \includegraphics[width=\textwidth]{code/fig4-4.pdf}
  \end{center}
\end{frame}

\begin{frame}
  \frametitle{... and repeat until convergence ($t=5$)}
  \begin{center}
    \includegraphics[width=\textwidth]{code/fig4-5.pdf}
  \end{center}
\end{frame}

\begin{frame}
  \frametitle{What is Bayesian about Bayesian optimisation?}

  \begin{itemize}
  \item The Bayesian strategy treats the unknown objective function
    as a random function and places a {\it prior} over it.
    \begin{itemize}
    \item The prior captures our beliefs about the behaviour of
      the function. It is here defined by a
      Gaussian process whose covariance function captures assumptions
      about the smoothness of the objective.
    \end{itemize}
  \item Function evaluations are treated as
    data. They are used to update the prior to form the {\it posterior}
    distribution over the objective function.
  \item The posterior distribution, in turn, is used to construct
    an acquisition function for querying the next point.
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Limitations}

  \begin{itemize}
  \item Bayesian optimisation has parameters itself!
    \begin{itemize}
    \item Choice of the acquisition function
    \item Choice of the kernel (i.e., design of the prior)
    \item Parameter warping
    \item Initialisation scheme
    \end{itemize}

    \vspace{1em}

  \item Gaussian processes usually do not scale well to many observations or to high-dimensional data.
    \begin{itemize}
    \item Sequential model-based optimisation provides a direct and effective alternative (i.e., replace the GP with a tree-based model).
    \end{itemize}
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Applications}

  \begin{itemize}
  \item Bayesian optimisation has been used in many scientific fields,
    including robotics, machine learning, and the life sciences.

    \vspace{1em}

  \item Use cases for high-energy physics?
    \begin{itemize}
    \item Optimisation of simulation parameters in event generators;
    \item Optimisation of compiler flags to maximise execution speed;
    \item Optimisation of hyper-parameters in machine learning for HEP;
    \item ... let's discuss further ideas?
    \end{itemize}
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Software}

  \begin{itemize}
  \item Python
    \begin{itemize}\scriptsize
    \item Spearmint: \url{https://github.com/JasperSnoek/spearmint}
    \item GPyOpt: \url{https://github.com/SheffieldML/GPyOpt}
    \item RoBO: \url{https://github.com/automl/RoBO}
    \item scikit-optimize: \url{https://github.com/MechCoder/scikit-optimize} (work in progress)
    \end{itemize}

  \item C++
    \begin{itemize}\scriptsize
    \item MOE: \url{https://github.com/yelp/MOE}
    \end{itemize}
  \end{itemize}

  \vspace{1em}

  \begin{center}
    Check also this \href{https://github.com/glouppe/talk-bayesian-optimisation}{GitHub} repository for a vanilla implementation reproducing these slides.
  \end{center}

\end{frame}

\begin{frame}
  \frametitle{Summary}

  \begin{itemize}
  \item Bayesian optimisation provides a principled approach to optimising an expensive function $f$;

    \vspace{1em}

  \item Often very effective, provided it is itself properly configured;

    \vspace{1em}

  \item Hot topic in machine learning research. Expect quick improvements!

  \end{itemize}

\end{frame}

\begin{frame}[plain,noframenumbering]
  \frametitle{References}
  \nocite{brochu2010tutorial}
  \nocite{shahriari2016taking}
  {\footnotesize
    \bibliographystyle{apalike}
    \bibliography{biblio}}
\end{frame}

\end{document}
--------------------------------------------------------------------------------
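For reference, the three acquisition functions listed on the "Acquisition functions" slide (with EI in its usual closed form under a Gaussian posterior) take only a few lines. In this sketch, `mu`, `sigma`, and `y_best` stand for the GP posterior mean, posterior standard deviation, and best observed value $f(x_t^+)$; the names, default `kappa` values, and the small `eps` floor on `sigma` are illustrative choices, not part of the slides.

```python
import numpy as np
from scipy.stats import norm

def ucb(mu, sigma, kappa=1.96):
    # Upper confidence bound: high mean (exploitation) + high uncertainty (exploration).
    return mu + kappa * sigma

def probability_of_improvement(mu, sigma, y_best, kappa=0.0, eps=1e-12):
    # PI(x) = P(f(x) >= y_best + kappa) under the Gaussian posterior.
    return norm.cdf((mu - y_best - kappa) / np.maximum(sigma, eps))

def expected_improvement(mu, sigma, y_best, eps=1e-12):
    # EI(x) = E[max(f(x) - y_best, 0)], in closed form for a Gaussian posterior.
    z = (mu - y_best) / np.maximum(sigma, eps)
    return (mu - y_best) * norm.cdf(z) + sigma * norm.pdf(z)
```

At a point whose posterior mean equals the incumbent (`mu == y_best`), PI with `kappa=0` is exactly 0.5 and EI reduces to `sigma * norm.pdf(0)`, which is why EI still rewards points with high uncertainty.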