├── .gitignore
├── L10_Deep_Learning.tex
├── L11_Non_Parametric_Bayesian_Methods.tex
├── L12_PAC_Learning.tex
├── L2_Representations.tex
├── L3_Density_Estimation.tex
├── L4_Regression.tex
├── L5_Gaussian_Processes.tex
├── L6_Linear_Classification.tex
├── L7_SVM.tex
├── L8_Structured_SVM.tex
├── L9_Ensemble_Methods.tex
├── README.md
├── img
│   ├── adaboost.png
│   ├── adaboost_margin.png
│   ├── algo_gp.png
│   ├── anyboost.png
│   ├── cp.png
│   ├── discriminant.png
│   ├── erm.png
│   ├── fisher.png
│   ├── frequencies.png
│   ├── gaussian_process.png
│   ├── gem1.png
│   ├── gem10.png
│   ├── gem100.png
│   ├── gem2.png
│   ├── gen_reg.png
│   ├── geo_ls.png
│   ├── lagrangian.jpg
│   ├── least_squares.png
│   ├── lrelu.png
│   ├── margin.jpg
│   ├── multiclass.png
│   ├── nn.png
│   ├── nn_rect.png
│   ├── pac1.png
│   ├── pac2.png
│   ├── relu.png
│   ├── ridge_vs_lasso.png
│   ├── riemann.png
│   ├── riemann_nn.png
│   ├── sigmoid.png
│   ├── slack.png
│   ├── stick-breaking.png
│   ├── stick-breaking2.png
│   ├── struct_algo.png
│   ├── syn_out.png
│   ├── syn_tree.png
│   ├── tanh.png
│   ├── us.png
│   ├── vae_1.png
│   ├── vae_2.png
│   └── vae_3.png
└── pdf
    ├── L10_Deep_Learning.pdf
    ├── L11_Non_Parametric_Bayesian_Methods.pdf
    ├── L12_PAC_Learning.pdf
    ├── L2_Representations.pdf
    ├── L3_Density_Estimation.pdf
    ├── L4_Regression.pdf
    ├── L5_Gaussian_Processes.pdf
    ├── L6_Linear_Classification.pdf
    ├── L8_Structured_SVM.pdf
    └── L9_Ensemble_Methods.pdf

/.gitignore: -------------------------------------------------------------------------------- 1 | *.aux 2 | *.log 3 | 4 | -------------------------------------------------------------------------------- /L10_Deep_Learning.tex: -------------------------------------------------------------------------------- 1 | \documentclass[twoside]{article} 2 | \setlength{\oddsidemargin}{0.25 in} 3 | \setlength{\evensidemargin}{-0.25 in} 4 | \setlength{\topmargin}{-0.6 in} 5 | \setlength{\textwidth}{6.5 in} 6 | \setlength{\textheight}{8.5 in} 7 | \setlength{\headsep}{0.75 in} 8 | \setlength{\parindent}{0 in} 9 | \setlength{\parskip}{0.1 in} 10 | \newcommand{\eqdef}{:\mathrel{\mathop=}} 11 | \newcommand{\norm}[1]{\left\lVert #1 \right\rVert} 12 | 13 | % 14 | % ADD PACKAGES here: 15 | % 16 | 17 | \usepackage{amsmath,amsfonts,graphicx} 18 | 19 | % 20 | % The following commands set up the lecnum (lecture number) 21 | % counter and make various numbering schemes work relative 22 | % to the lecture number. 23 | % 24 | \newcounter{lecnum} 25 | \renewcommand{\thepage}{\thelecnum-\arabic{page}} 26 | \renewcommand{\thesection}{\thelecnum.\arabic{section}} 27 | \renewcommand{\theequation}{\thelecnum.\arabic{equation}} 28 | \renewcommand{\thefigure}{\thelecnum.\arabic{figure}} 29 | \renewcommand{\thetable}{\thelecnum.\arabic{table}} 30 | 31 | % 32 | % The following macro is used to generate the header.
% 34 | \newcommand{\lecture}[4]{ 35 | \pagestyle{myheadings} 36 | \thispagestyle{plain} 37 | \newpage 38 | \setcounter{lecnum}{#1} 39 | \setcounter{page}{1} 40 | \noindent 41 | \begin{center} 42 | \framebox{ 43 | \vbox{\vspace{2mm} 44 | \hbox to 6.28in { {\bf Advanced Machine Learning 45 | \hfill Fall 2020} } 46 | \vspace{4mm} 47 | \hbox to 6.28in { {\Large \hfill Lecture #1: #2 \hfill} } 48 | \vspace{2mm} 49 | \hbox to 6.28in { {\it #3 \hfill #4} } 50 | \vspace{2mm}} 51 | } 52 | \end{center} 53 | \markboth{Lecture #1: #2}{Lecture #1: #2} 54 | 55 | {\bf Note}: {\it LaTeX template courtesy of UC Berkeley EECS dept.} 56 | 57 | {\bf Disclaimer}: {\it These notes are adapted from ETH's Advanced Machine Learning Course, EPFL's CS433 Course, Stanford's CS231N Course and "Neural Networks and Deep Learning" book.} 58 | \vspace*{4mm} 59 | } 60 | % 61 | % Convention for citations is authors' initials followed by the year. 62 | % For example, to cite a paper by Leighton and Maggs you would type 63 | % \cite{LM89}, and to cite a paper by Strassen you would type \cite{S69}. 64 | % (To avoid bibliography problems, for now we redefine the \cite command.) 65 | % Also commands that create a suitable format for the reference list. 66 | \renewcommand{\cite}[1]{[#1]} 67 | \def\beginrefs{\begin{list}% 68 | {[\arabic{equation}]}{\usecounter{equation} 69 | \setlength{\leftmargin}{2.0truecm}\setlength{\labelsep}{0.4truecm}% 70 | \setlength{\labelwidth}{1.6truecm}}} 71 | \def\endrefs{\end{list}} 72 | \def\bibentry#1{\item[\hbox{[#1]}]} 73 | 74 | %Use this command for a figure; it puts a figure in wherever you want it. 75 | %usage: \fig{NUMBER}{SPACE-IN-INCHES}{CAPTION} 76 | \newcommand{\fig}[3]{ 77 | \vspace{#2} 78 | \begin{center} 79 | Figure \thelecnum.#1:~#3 80 | \end{center} 81 | } 82 | % Use these for theorems, lemmas, proofs, etc. 83 | \newtheorem{theorem}{Theorem}[lecnum] 84 | \newtheorem{lemma}[theorem]{Lemma} 85 | \newtheorem{proposition}[theorem]{Proposition} 86 | \newtheorem{claim}[theorem]{Claim} 87 | \newtheorem{corollary}[theorem]{Corollary} 88 | \newtheorem{definition}[theorem]{Definition} 89 | \newenvironment{proof}{{\bf Proof:}}{\hfill\rule{2mm}{2mm}} 90 | 91 | % **** IF YOU WANT TO DEFINE ADDITIONAL MACROS FOR YOURSELF, PUT THEM HERE: 92 | 93 | \newcommand\E{\mathbb{E}} 94 | 95 | \begin{document} 96 | %FILL IN THE RIGHT INFO. 97 | %\lecture{**LECTURE-NUMBER**}{**DATE**}{**LECTURER**}{**SCRIBE**} 98 | \lecture{10}{Deep Learning}{}{} 99 | %\footnotetext{These notes are partially based on those of Nigel Mansell.} 100 | 101 | % **** YOUR NOTES GO HERE: 102 | 103 | % Some general latex examples and examples making use of the 104 | % macros follow. 105 | %**** IN GENERAL, BE BRIEF. LONG SCRIBE NOTES, NO MATTER HOW WELL WRITTEN, 106 | %**** ARE NEVER READ BY ANYBODY. 107 | 108 | 109 | \section{Neural Networks} % Don't be this informal in your notes! 110 | Let us look at the structure of a neural network. It is shown 111 | in Figure 10.1. This is a neural net with one input layer of size 112 | $\mathbf{D}$, $\mathbf{L}$ hidden layers of size $\mathbf{K}$, and one output layer. It is 113 | a feedforward network: the computation performed by the 114 | network starts with the input from the left and flows to the 115 | right. There is no feedback loop. 116 | As always, we assume that our input is a $\mathbf{D}$-dimensional 117 | vector. We see that there is a node drawn in Figure 10.1 for 118 | each of the $D$ components of $x$.
We denote these nodes by 119 | $x^{(0)}_{i}$, 120 | where the superscript $(0)$ specifies that this is the input 121 | layer. 122 | \begin{figure}[h] 123 | \centering 124 | 125 | \includegraphics[width=0.5\textwidth]{img/nn.png} 126 | \caption{A neural network with one input layer, $L$ hidden 127 | layers, and one output layer.} 128 | \end{figure} 129 | 130 | Let us assume that there are $\mathbf{K}$ nodes in each hidden layer, where $K$ is a hyper-parameter. Each node in the hidden layer $l, \; l = 1, \ldots , L$, is connected 131 | to all the nodes in the previous layer via a weighted edge. 132 | We denote the edge from node $i$ in layer $l - 1$ to the node 133 | $j$ in layer $l$ by $w^{(l)}_{i,j}$. The superscript $(l)$ indicates that these 134 | are the weights of edges that lead to layer $l$. The output at the node $j$ in layer $l$ is denoted by $x^{(l)}_{j}$ and it is given by: 135 | $$ x^{(l)}_{j} = \phi \Big( \sum_{i} w_{i,j}^{(l)} x_{i}^{(l-1)} + b_{j}^{(l)} \Big) $$ 136 | 137 | In simple words, in order to compute the output we first compute 138 | the weighted sum of the inputs and then apply a function $\phi$ 139 | to this sum. 140 | \newpage 141 | A few remarks: 142 | 143 | \begin{itemize} 144 | \item The constant term $b^{(l)}_{j}$ is called the \textbf{bias term} and is a parameter like any of the weights $w^{(l)}_{i,j}$. The learning part will consist of choosing all these parameters appropriately for the task. 145 | \item The function $\phi$ 146 | is called the \textbf{activation function}. It is crucial that this function is \textbf{non-linear}.\\ Why is this? 147 | If it were not, then the whole neural net would just be a highly 148 | factorized linear function of the input data and there would 149 | be no gain compared to standard linear regression/classification. 150 | \end{itemize} 151 | \subsection{Representation Power} 152 | 153 | How “powerful” are neural nets? More precisely, what functions $f(x)$ can they represent, or better, what functions can they approximate? Before we get into our heuristic argument let us state the 154 | main theorem of the paper by Barron: “Universal approximation bounds for superpositions of a sigmoidal function”. This gives you a 155 | flavor of what kind of results can be proved. 156 | 157 | \begin{lemma} 158 | Let $f: \mathbb{R}^{D} \rightarrow \mathbb{R}$ be a function such that $$\int_{\mathbb{R}^{D}} |w||\Tilde{f}(w)|dw \leq C$$ 159 | where 160 | $$\Tilde{f}(w) = \int_{\mathbb{R}^{D}} f(x) e^{-jw^T x}dx$$ 161 | is the Fourier transform of $f(x)$.\\ 162 | \\Then for all $n \geq 1$, there exists a function $f_{n}$ of the form 163 | $$f_{n}(x) = \sum_{j=1}^{n}c_{j}\phi(x^T w_{j} + b_{j}) + c_{0}$$ 164 | i.e., a function that is representable by a NN with one 165 | hidden layer with $n$ nodes and “sigmoid-like” activation 166 | functions, such that 167 | $$\int_{|x| \leq r}(f(x) - f_{n}(x))^2dx \leq \dfrac{(2Cr)^2}{n}$$ 168 | \end{lemma} 169 | Let's discuss this result: 170 | \begin{itemize} 171 | \item First note that the condition on the Fourier 172 | transform is a “smoothness condition.” E.g., functions such 173 | that $\int_{\mathbb{R}^{D}} |w||\Tilde{f}(w)|dw < \infty$ can be shown to be continuously 174 | differentiable. 175 | \item Second note that the lemma only guarantees a good approximation in a \textbf{bounded domain}.
The larger the domain, the more nodes we need in order to approximate a function to the same level (see the term $r^2$, where $r$ is the radius of the ball where we want the approximation to be good, in the upper bound). 176 | \item Third, this is an approximation “on average”, more precisely in $\mathbb{L}_{2}$-norm. 177 | \item Fourth, the approximation $f_{n}$ with $n$ terms corresponds exactly to our model of a neural net with one hidden layer containing $n$ nodes and sigmoids as activation functions. 178 | \item Fifth, the theorem applies to all activation functions that are 179 | “sigmoid-like,” i.e., all activation functions whose left limit 180 | is $0$, whose right limit is $1$, and that are sufficiently smooth. 181 | \end{itemize} 182 | In simple words, the lemma says that a sufficiently “smooth” function can be approximated by a neural net with one hidden 183 | layer, and the approximation error goes down like one over 184 | the number of nodes in the hidden layer. Note that this is a 185 | very fast convergence. 186 | \subsection{Approximation on Average} 187 | We start with a scalar function $f(x): \mathbb{R} \rightarrow \mathbb{R}$ on a bounded domain. Recall that if this function is continuous then it is Riemann integrable, i.e., it can be approximated arbitrarily closely 188 | by “upper” and “lower” sums of rectangles, see Figure 10.2. Of course, we might need a lot of such rectangles to approximate 189 | the area with an error of at most $\epsilon$, but for every $\epsilon > 0$ we 190 | can find such an approximation. 191 | 192 | \begin{figure}[h] 193 | \centering 194 | 195 | \includegraphics[width=0.5\textwidth]{img/riemann.png} 196 | \caption{A lower and an upper Riemann sum.} 197 | \end{figure} 198 | 199 | We will now show that if we do not limit the weights, then 200 | with two hidden nodes (of a neural network with one hidden 201 | layer) we can construct a function which is arbitrarily close 202 | to a given rectangle. But since, as we have just seen, a finite number of rectangles suffices to approximate a bounded 203 | continuous function arbitrarily closely, it follows that with a 204 | finite number of hidden nodes of a neural network with one 205 | hidden layer we can approximate any such function arbitrarily closely.\\ \\ 206 | Let $\phi(x) = \dfrac{1}{1+e^{-x}}$ be the sigmoid function. Consider the function $f(x) = \phi(w(x - b))$, where $w$ is the 207 | weight of a particular edge and $-wb$ is the bias term. \\If we want to create a rectangle that jumps from 208 | $0$ to $1$ at $x = a$ and jumps back to $0$ at $x = b$, $a < b$, then we can accomplish this by taking 209 | $$\phi(w(x - a)) - \phi(w(x - b))$$ 210 | and taking $w$ very large (see Figure 10.3). 211 | 212 | 213 | \begin{figure}[h] 214 | \centering 215 | 216 | \includegraphics[width=0.5\textwidth]{img/riemann_nn.png} 217 | \caption{An approximate rectangle of the form $\phi(w(x - 218 | a))-\phi(w(x-b))$ with $w = 10$, $20$, and $50$, respectively.} 219 | \end{figure} 220 | 221 | Note that these “rectangles” have a very simple representation in 222 | the form of a neural network. This is shown in Figure 10.4. There is 223 | one input node which contains the value $x$. This value is multiplied by some large weight (in the figure it is 50) and 224 | it is then forwarded to the two hidden nodes. One of these 225 | hidden nodes has a bias of $150$, the other one has a bias 226 | of $-250$, so that the sums at these two hidden nodes are 227 | $50(x + 3)$ and $50(x - 5)$, respectively.
Each node applies the 228 | sigmoid function and forwards the result to the output layer. 229 | The edge from the top hidden node to the output has weight 230 | $1$ and the one from the bottom hidden node to the output 231 | has weight $-1$. The output node adds the two inputs. The 232 | result is $\phi(50(x+ 3))-\phi(50(x-5))$, which is approximately 233 | a unit-height rectangle from $-3$ to $5$. 234 | 235 | \begin{figure}[h] 236 | \centering 237 | \includegraphics[width=0.5\textwidth]{img/nn_rect.png} 238 | \caption{A simple NN implementation of a unit-height rectangle from $-3$ to $5$.} 239 | \end{figure} 240 | 241 | 242 | It is hopefully clear at this point why any continuous function 243 | on a bounded domain can be approximated via a neural 244 | network with one hidden layer. Let us summarize in telegram 245 | style: Take the function. Approximate it in the Riemann sense. Approximate each of the rectangles in the Riemann 246 | sum by means of two nodes in the hidden layer of a neural 247 | net. Compute the sum (with appropriate sign) of all the 248 | hidden nodes at the output node. If we are using a Riemann 249 | sum with $\mathbf{K}$ rectangles we therefore get a neural network 250 | approximation with one hidden layer containing $\mathbf{2K}$ nodes. 251 | \newpage 252 | 253 | \subsection{Activation Functions} 254 | There are many activation functions that are being used in 255 | practice. Let us list here some of them and briefly discuss 256 | their merits. 257 | 258 | \paragraph{Sigmoid} 259 | We start with the sigmoid $\phi(x)$, which we have encountered 260 | already several times. Just to summarize, it is defined by: 261 | $$\phi(x) = \dfrac{1}{1+e^{-x}}$$ 262 | and a plot is shown in Figure 10.5. Note that the sigmoid is always positive (not really an issue) and that it is bounded. \\ Further, for $|x|$ large, $ \phi'(x) \approx 0$. This can cause the gradient to become very small (which is known as the “\textbf{vanishing 263 | gradient problem}”), sometimes making learning slow. 264 | 265 | 266 | \begin{figure}[h] 267 | \centering 268 | \includegraphics[width=0.43\textwidth]{img/sigmoid.png} 269 | \caption{The sigmoid function $\phi(x)$.} 270 | \end{figure} 271 | 272 | \paragraph{Tanh} 273 | Very much related to the sigmoid is $\tanh(x)$. It is defined by 274 | $$\tanh(x) = \dfrac{e^x - e^{-x}}{e^x + e^{-x}} = 2\phi(2x) -1$$ 275 | and a plot is shown in Figure 10.6. 276 | 277 | \begin{figure}[h] 278 | \centering 279 | \includegraphics[width=0.43\textwidth]{img/tanh.png} 280 | \caption{The function $\tanh(x)$.} 281 | \end{figure} 282 | 283 | Note that $\tanh(x)$ is “balanced” (positive and negative) and that it is bounded. But 284 | it has the same problem as the sigmoid function, namely for 285 | $|x|$ large, $\tanh'(x) \approx 0$. As mentioned before, this can cause the 286 | gradient to become very small, sometimes making learning slow. 287 | 288 | \paragraph{Rectified Linear Unit -- ReLU} 289 | Very popular is the rectified linear unit (ReLU), which is defined by 290 | $$(x)_{+} = \max\{0,x\}$$ 291 | and a plot is shown in Figure 10.7. 292 | 293 | \begin{figure}[h] 294 | \centering 295 | \includegraphics[width=0.43\textwidth]{img/relu.png} 296 | \caption{The ReLU $(x)_{+}$.} 297 | \end{figure} 298 | 299 | Note that the ReLU is always positive and that it is unbounded. One nice property 300 | of the ReLU is that its derivative is $1$ (and does not vanish) 301 | for positive values of $x$ (it has $0$ derivative for negative values 302 | of $x$ though).
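To make the vanishing-gradient comparison concrete, here is a minimal NumPy sketch (our own illustration, not part of the original notes) that evaluates the derivatives of the three activation functions discussed so far at a large input:

\begin{verbatim}
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def d_sigmoid(x):
    s = sigmoid(x)
    return s * (1.0 - s)            # phi'(x) = phi(x)(1 - phi(x))

def d_tanh(x):
    return 1.0 - np.tanh(x) ** 2    # tanh'(x) = 1 - tanh(x)^2

def d_relu(x):
    return (np.asarray(x) > 0).astype(float)  # 1 for x > 0, else 0

x = 10.0
print(d_sigmoid(x))   # ~4.5e-05: the gradient has nearly vanished
print(d_tanh(x))      # ~8.2e-09: even smaller
print(d_relu(x))      # 1.0: the ReLU gradient does not vanish
\end{verbatim}

For large positive inputs only the ReLU keeps a usable gradient, which is one reason for its popularity.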
303 | 304 | \paragraph{Leaky ReLU} 305 | In order to solve the $0$-derivative problem of the ReLU (for 306 | negative values of $x$) one can add a very small slope $\alpha$ in 307 | the negative part. This gives rise to the leaky rectified linear 308 | unit (LReLU), which is defined by 309 | 310 | $$f(x) = \max\{ \alpha x, x\}$$ 311 | and a plot is shown in Figure 10.8. The constant $\alpha$ is of course 312 | a hyper-parameter that can be optimized. 313 | 314 | \begin{figure}[h] 315 | \centering 316 | \includegraphics[width=0.43\textwidth]{img/lrelu.png} 317 | \caption{LReLU with $\alpha = 0.05$.} 318 | \end{figure} 319 | 320 | \newpage 321 | 322 | \subsection{Compact Description of Output} 323 | Let us start by writing down the output as a function of the input explicitly in compact form. It is natural and convenient to describe the function that is implemented by each layer of the network separately at first. The overall function is then 324 | the composition of these functions. \\ \\ 325 | Let $\mathbf{W}^{(l)}$ denote the weight matrix that connects layer $l - 1$ 326 | to layer $l$. The matrix $W^{(1)}$ is of dimension $D \times K$, the 327 | matrices $W^{(l)}$, 328 | $2 \leq l \leq L$, are of dimension $K \times K$, and the 329 | matrix $W^{(L+1)}$ is of dimension $K \times 1$. The entries of each 330 | matrix are given by 331 | 332 | $$W^{(l)}_{i,j} = w^{(l)}_{i,j} $$ 333 | 334 | where we recall that $ w^{(l)}_{i,j}$ is the weight on the edge that connects node $i$ on layer $l - 1$ to node $j$ on layer $l$.\\ 335 | Further, let us introduce the bias vectors 336 | $\mathbf{b}^{(l)}$, $1 \leq l \leq L+1$, 337 | that collect all the bias terms. All these vectors are of length 338 | $K$, except the term $b^{(L+1)}$, which is a scalar. \\ \\ 339 | With this notation we can describe the function that is implemented by each layer in the form: 340 | $$x^{(l)} = f^{(l)}(x^{(l-1)}) = \phi((W^{(l)})^T x^{(l-1)} + b^{(l)})$$ 341 | 342 | where the (generic) activation function is applied point-wise to the vector. 343 | The overall function $y = f(x^{(0)})$ can then be written in terms 345 | of these functions as the composition: 346 | $$f(x^{(0)}) = f^{(L+1)} \circ \ldots \circ f^{(2)} \circ f^{(1)}(x^{(0)}).$$ 347 | 348 | \subsection{The Backpropagation Algorithm} 349 | The cost function can be written as: 350 | 351 | $$\mathcal{L} = \dfrac{1}{N} \sum_{i=1}^{N} (y_{i} -f^{(L+1)} \circ \ldots \circ f^{(2)} \circ f^{(1)}(x_{i}))^2$$ 352 | 353 | Note that this cost function is a function of all weight matrices and bias vectors and that it is a composition of all the functions describing the transformation at each layer.\\ 354 | Note also that the specific form of the loss does not really matter for the workings of the backpropagation algorithm that we now discuss. Just to be 355 | specific we stick to the square loss. Only the initialization of the back recursion changes if we pick a different loss function. 356 | 357 | In SGD we compute the gradient of this function with respect to a single sample.
Therefore, we start with the function: 358 | 359 | $$\mathcal{L}_{n} = (y_{n} -f^{(L+1)} \circ \ldots \circ f^{(2)} \circ f^{(1)}(x_{n}))^2$$ 360 | Recall that our aim is to compute: 361 | \begin{equation*} 362 | \begin{aligned} 363 | & \dfrac{\partial \mathcal{L}_{n} }{\partial w^{(l)}_{i,j} } , \; l=1,\ldots,L+1 \\ 364 | & \dfrac{\partial \mathcal{L}_{n} }{\partial b^{(l)}_{j} } , \; l=1,\ldots,L+1 365 | \end{aligned} 366 | \end{equation*} 367 | 368 | It will be convenient to first compute two preliminary quantities. The desired derivatives are then easily expressed in 369 | terms of those quantities. 370 | Let: 371 | 372 | $$z^{(l)} = (W^{(l)})^T x^{(l-1)} + b^{(l)}$$ 373 | 374 | where $x^{(0)} = x_{n}$ and $x^{(l)} = \phi(z^{(l)})$. In simple words, $z^{(l)}$ 375 | is the input at the $l$-th layer before applying the activation function. These quantities are easy to compute by a \textbf{forward pass} in the network. \\More precisely, start with $x^{(0)} = x_{n}$ and 376 | then apply this recursion for $l = 1, \ldots, L + 1$, first always computing $z^{(l)}$ and then computing $x^{(l)} = \phi(z^{(l)})$.\\ \\ 377 | Further, let 378 | $$\delta^{(l)}_{j} = \dfrac{\partial \mathcal{L}_{n} }{\partial z^{(l)}_{j}}$$ 379 | 380 | and let $\delta ^{(l)}$ be the corresponding vector at level $l$. Whereas the 381 | quantities $z^{(l)}$ were easily computed by a forward pass, the quantities $\delta^{(l)}$ 382 | are easily computed by a \textbf{backward pass}: 383 | \begin{equation*} 384 | \begin{aligned} 385 | & \delta^{(l)}_{j} = \dfrac{\partial \mathcal{L}_{n} }{\partial z^{(l)}_{j}} = 386 | \sum_{k=1}^{K} \dfrac{\partial \mathcal{L}_{n} }{\partial z^{(l+1)}_{k}} \dfrac{\partial z^{(l+1)}_{k}}{\partial z^{(l)}_{j}} 387 | \\ 388 | & = \sum_{k=1}^{K} \delta_{k}^{(l+1)} W^{(l+1)}_{j,k} \phi'(z_{j}^{(l)}) 389 | \end{aligned} 390 | \end{equation*} 391 | The sum comes from the chain rule: the loss $\mathcal{L}_{n}$ depends on $z^{(l)}_{j}$ only through the quantities $z^{(l+1)}_{k}$, $k = 1, \ldots, K$, of the next layer. In vector form, we can write this as: 392 | $$\delta^{(l)} = (W^{(l+1)}\delta^{(l+1)}) \odot \phi'(z^{(l)})$$ 393 | 394 | where $\odot$ denotes the Hadamard product. Now that we have both $z^{(l)}$ and $\delta^{(l)}$, let us get back to our initial goal: 395 | $$\dfrac{\partial \mathcal{L}_{n} }{\partial w^{(l)}_{i,j}} = \sum_{k=1}^{K} \dfrac{\partial \mathcal{L}_{n} }{\partial z^{(l)}_{k}} \dfrac{\partial z^{(l)}_{k}}{\partial w^{(l)}_{i,j}} = 396 | \overbrace{\dfrac{\partial \mathcal{L}_{n} }{\partial z^{(l)}_{j}}}^\text{$\delta^{(l)}_{j}$} \overbrace{\dfrac{\partial z^{(l)}_{j}}{\partial w^{(l)}_{i,j}}}^\text{$x_{i}^{(l-1)}$} =\delta^{(l)}_{j} x_{i}^{(l-1)}$$ 397 | 398 | Why could we drop the sum in the above expression? When we change the weight $w^{(l)}_{i,j}$ then it only changes the sum $z^{(l)}_{j}$. 399 | All other sums at level $l$ stay unchanged. 400 | \\In a similar manner, 401 | $$\dfrac{\partial \mathcal{L}_{n} }{\partial b^{(l)}_{j}} = \sum_{k=1}^{K} \dfrac{\partial \mathcal{L}_{n} }{\partial z^{(l)}_{k}} \dfrac{\partial z^{(l)}_{k}}{\partial b^{(l)}_{j}} = 402 | \overbrace{\dfrac{\partial \mathcal{L}_{n} }{\partial z^{(l)}_{j}}}^\text{$\delta^{(l)}_{j}$} \overbrace{\dfrac{\partial z^{(l)}_{j}}{\partial b^{(l)}_{j}}}^\text{$1$} =\delta^{(l)}_{j} $$ \newpage 403 | 404 | Now we can outline the steps of the training algorithm: 405 | \begin{itemize} 406 | \item \textbf{Forward Pass}: Set $x^{(0)} = x_{n}$.
Compute for $l = 1, \ldots, L+1$: $z^{(l)} = (W^{(l)})^T x^{(l-1)} + b^{(l)}$ and $x^{(l)} = \phi(z^{(l)})$ 407 | \item \textbf{Backward Pass}: Set $\delta^{(L+1)} = -2(y_{n} - x^{(L+1)})\phi'(z^{(L+1)})$. Compute for 408 | $l = L, \ldots, 1$: $\delta^{(l)} = (W^{(l+1)}\delta^{(l+1)}) \odot \phi'(z^{(l)})$ 409 | 410 | \item \textbf{Final Computation}: For all parameters compute $\dfrac{\partial \mathcal{L}_{n} }{\partial w^{(l)}_{i,j}} = \delta^{(l)}_{j} x_{i}^{(l-1)}$ and $\dfrac{\partial \mathcal{L}_{n} }{\partial b^{(l)}_{j}} = \delta^{(l)}_{j}$ 411 | \end{itemize} 412 | 413 | 414 | 415 | 416 | 417 | 418 | Now that we have the gradient with respect to all parameters, the SGD algorithm makes a small step in the direction 419 | opposite to the gradient, then picks a new sample $(x_{n}, y_{n})$, 420 | and repeats. 421 | 422 | 423 | \section{Variational Autoencoders} 424 | 425 | Assume that the training data $\{x^{(i)}\}_{i=1}^{N}$ is generated from an underlying unobserved (latent) 426 | representation $\mathbf{z}$. For example, imagine that 427 | $\mathbf{x}$ is an image and $\mathbf{z}$ are the latent factors used to 428 | generate $\mathbf{x}$: attributes, orientation, etc. \\ 429 | 430 | \begin{figure}[h] 431 | \centering 432 | \includegraphics[width=0.33\textwidth]{img/vae_1.png} 433 | \end{figure} 434 | 435 | We want to estimate the true parameters $\mathbf{\theta}^*$ 436 | of this generative model: 437 | 438 | \begin{itemize} 439 | \item We choose the prior ${p(z)}$ to be simple, e.g. a standard Gaussian: $z \sim \mathcal{N}(0,\mathcal{I})$. 440 | \item The conditional distribution $p(x|z)$ on the other hand is complex. We can model it with a neural network. In the case of images, an example would be a multivariate Bernoulli representing the state of each pixel: $x|z \sim Ber(\theta)$. 441 | \item We train the model to maximize the likelihood of the training data: $$p_{\theta}(x)= \int \overbrace{p_{\theta}(z)}^\text{Gaussian Prior}\overbrace{p_{\theta}(x|z)}^\text{Decoder NN}dz$$ 442 | \end{itemize} 443 | 444 | However, the problem with this approach is that the \textbf{integral is intractable}: it is not feasible to compute $p(x|z)$ for every $z$. 445 | 446 | \newpage 447 | To overcome this, we can define an additional encoder network $q_{\phi}(z|x)$ that approximates $p_{\theta}(z|x)$ (note that $p_{\theta}(z|x)$ is intractable as well because of the integral computation).
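To see why the marginal likelihood is hard to estimate directly, consider the naive Monte-Carlo approximation $p_{\theta}(x) \approx \frac{1}{S}\sum_{s=1}^{S} p_{\theta}(x|z_{s})$ with $z_{s} \sim \mathcal{N}(0,\mathcal{I})$. The sketch below (our own toy example with a made-up linear “decoder”, not the notes' model) implements this estimator; in practice almost all prior samples $z_{s}$ fall where $p_{\theta}(x|z_{s})$ is negligible, so the variance of the estimate blows up with the dimension. This is exactly the gap the encoder $q_{\phi}(z|x)$ is meant to close, by proposing the $z$'s that matter for a given $x$:

\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
W, b = rng.normal(size=(2, 5)), np.zeros(5)   # toy decoder parameters

def log_p_x_given_z(x, z):
    """Bernoulli decoder: log p(x|z) for binary 'pixels' x."""
    p = 1.0 / (1.0 + np.exp(-(z @ W + b)))    # pixel probabilities
    return np.sum(x * np.log(p) + (1 - x) * np.log(1 - p), axis=-1)

x = (rng.uniform(size=5) < 0.5).astype(float)   # one binary "image"
z = rng.normal(size=(100000, 2))                # z_s drawn from the prior
# Naive estimate of log p(x), where p(x) = E_{z ~ p(z)}[ p(x|z) ]
print(np.log(np.mean(np.exp(log_p_x_given_z(x, z)))))
\end{verbatim}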
449 | 450 | \begin{figure}[h] 451 | \centering 452 | \includegraphics[width=0.8\textwidth]{img/vae_2.png} 453 | \end{figure} 454 | 455 | 456 | Now, we can find a lower bound (the Evidence Lower Bound, ELBO) for the log-likelihood of the training data: 457 | 458 | \begin{equation*} 459 | \begin{aligned} 460 | & \log{p_{\theta}(x^{(i)})} = \mathbb{E}_{z \sim q_{\phi}(z|x^{(i)}) } \Big[\log{p_{\theta}(x^{(i)})} \Big] &&& (\text{$p_{\theta}(x^{(i)})$ does not depend on $z$}) 461 | \\ & = \mathbb{E}_{z} \Big[\log{\dfrac{p_{\theta}(x^{(i)} | z) p_{\theta}(z)}{p_{\theta}(z| x^{(i)})}} \Big] &&& \text{(Bayes Rule)} \\ 462 | & = \mathbb{E}_{z} \Big[\log{\dfrac{p_{\theta}(x^{(i)} | z) p_{\theta}(z)}{p_{\theta}(z| x^{(i)})} \dfrac{q_{\phi}(z|x^{(i)})}{q_{\phi}(z|x^{(i)})}} \Big] &&& \text{(Multiply and divide by $q$)} 463 | \\ & = \mathbb{E}_{z} \Big[\log{p_{\theta}(x^{(i)} | z)} \Big] - \mathbb{E}_{z} \Big[\log{\dfrac{q_{\phi}(z|x^{(i)})}{p_{\theta}(z)}} \Big] + \mathbb{E}_{z} \Big[\log{\dfrac{q_{\phi}(z|x^{(i)})}{p_{\theta}(z| x^{(i)})}} \Big] &&& \text{(Logarithm properties)} \\ 464 | & = \overbrace{\mathbb{E}_{z} \Big[\log{p_{\theta}(x^{(i)} | z)} \Big] - D_{KL}\Big(q_{\phi}(z|x^{(i)}) \; || \; p(z)\Big)}^\text{$=\mathcal{L}_{\theta,\phi}$ (ELBO)} + \overbrace{D_{KL}\Big(q_{\phi}(z|x^{(i)}) \; || \; p_{\theta}(z |x^{(i)})\Big)}^\text{$\geq 0$} 465 | \end{aligned} 466 | \end{equation*} 467 | In this context, the first term on the RHS can be interpreted as the expected negative reconstruction error. The second term is a regularization term that encourages the variational distribution to be close to the prior over latent variables. If the regularization term is omitted, the variational distribution would collapse to a delta function and the variational autoencoder would degenerate to a “usual” deterministic autoencoder. 468 | 469 | Finally, we can find the optimal parameters by maximizing the ELBO: 470 | 471 | $$\theta^*, \phi^* = \arg\max_{\theta,\phi} \sum_{i=1}^{N} \mathcal{L}(x^{(i)},\theta,\phi)$$ 472 | 473 | \begin{figure}[h] 474 | \centering 475 | \includegraphics[width=0.45\textwidth]{img/vae_3.png} 476 | \end{figure} 477 | 478 | \end{document} 479 | -------------------------------------------------------------------------------- /L11_Non_Parametric_Bayesian_Methods.tex: -------------------------------------------------------------------------------- 1 | \documentclass[twoside]{article} 2 | \setlength{\oddsidemargin}{0.25 in} 3 | \setlength{\evensidemargin}{-0.25 in} 4 | \setlength{\topmargin}{-0.6 in} 5 | \setlength{\textwidth}{6.5 in} 6 | \setlength{\textheight}{8.5 in} 7 | \setlength{\headsep}{0.75 in} 8 | \setlength{\parindent}{0 in} 9 | \setlength{\parskip}{0.1 in} 10 | \newcommand{\eqdef}{:\mathrel{\mathop=}} 11 | \newcommand{\norm}[1]{\left\lVert #1 \right\rVert} 12 | 13 | % 14 | % ADD PACKAGES here: 15 | % 16 | 17 | \usepackage{amsmath,amsfonts,graphicx} 18 | 19 | % 20 | % The following commands set up the lecnum (lecture number) 21 | % counter and make various numbering schemes work relative 22 | % to the lecture number. 23 | % 24 | \newcounter{lecnum} 25 | \renewcommand{\thepage}{\thelecnum-\arabic{page}} 26 | \renewcommand{\thesection}{\thelecnum.\arabic{section}} 27 | \renewcommand{\theequation}{\thelecnum.\arabic{equation}} 28 | \renewcommand{\thefigure}{\thelecnum.\arabic{figure}} 29 | \renewcommand{\thetable}{\thelecnum.\arabic{table}} 30 | \newcommand{\indep}{\raisebox{0.05em}{\rotatebox[origin=c]{90}{$\models$}}} 31 | 32 | % 33 | % The following macro is used to generate the header.
34 | % 35 | \newcommand{\lecture}[4]{ 36 | \pagestyle{myheadings} 37 | \thispagestyle{plain} 38 | \newpage 39 | \setcounter{lecnum}{#1} 40 | \setcounter{page}{1} 41 | \noindent 42 | \begin{center} 43 | \framebox{ 44 | \vbox{\vspace{2mm} 45 | \hbox to 6.28in { {\bf Advanced Machine Learning 46 | \hfill Fall 2020} } 47 | \vspace{4mm} 48 | \hbox to 6.28in { {\Large \hfill Lecture #1: #2 \hfill} } 49 | \vspace{2mm} 50 | \hbox to 6.28in { {\it #3 \hfill #4} } 51 | \vspace{2mm}} 52 | } 53 | \end{center} 54 | \markboth{Lecture #1: #2}{Lecture #1: #2} 55 | 56 | {\bf Note}: {\it LaTeX template courtesy of UC Berkeley EECS dept.} 57 | 58 | {\bf Disclaimer}: {\it These notes are adapted from ETH's Advanced Machine Learning Course and "Machine Learning: a Probabilistic Perspective" book.} 59 | \vspace*{4mm} 60 | } 61 | % 62 | % Convention for citations is authors' initials followed by the year. 63 | % For example, to cite a paper by Leighton and Maggs you would type 64 | % \cite{LM89}, and to cite a paper by Strassen you would type \cite{S69}. 65 | % (To avoid bibliography problems, for now we redefine the \cite command.) 66 | % Also commands that create a suitable format for the reference list. 67 | \renewcommand{\cite}[1]{[#1]} 68 | \def\beginrefs{\begin{list}% 69 | {[\arabic{equation}]}{\usecounter{equation} 70 | \setlength{\leftmargin}{2.0truecm}\setlength{\labelsep}{0.4truecm}% 71 | \setlength{\labelwidth}{1.6truecm}}} 72 | \def\endrefs{\end{list}} 73 | \def\bibentry#1{\item[\hbox{[#1]}]} 74 | 75 | %Use this command for a figure; it puts a figure in wherever you want it. 76 | %usage: \fig{NUMBER}{SPACE-IN-INCHES}{CAPTION} 77 | \newcommand{\fig}[3]{ 78 | \vspace{#2} 79 | \begin{center} 80 | Figure \thelecnum.#1:~#3 81 | \end{center} 82 | } 83 | % Use these for theorems, lemmas, proofs, etc. 84 | \newtheorem{theorem}{Theorem}[lecnum] 85 | \newtheorem{lemma}[theorem]{Lemma} 86 | \newtheorem{proposition}[theorem]{Proposition} 87 | \newtheorem{claim}[theorem]{Claim} 88 | \newtheorem{corollary}[theorem]{Corollary} 89 | \newtheorem{definition}[theorem]{Definition} 90 | \newenvironment{proof}{{\bf Proof:}}{\hfill\rule{2mm}{2mm}} 91 | 92 | % **** IF YOU WANT TO DEFINE ADDITIONAL MACROS FOR YOURSELF, PUT THEM HERE: 93 | 94 | \newcommand\E{\mathbb{E}} 95 | 96 | \begin{document} 97 | %FILL IN THE RIGHT INFO. 98 | %\lecture{**LECTURE-NUMBER**}{**DATE**}{**LECTURER**}{**SCRIBE**} 99 | \lecture{11}{Non-parametric Bayesian methods}{}{} 100 | %\footnotetext{These notes are partially based on those of Nigel Mansell.} 101 | 102 | % **** YOUR NOTES GO HERE: 103 | 104 | % Some general latex examples and examples making use of the 105 | % macros follow. 106 | %**** IN GENERAL, BE BRIEF. LONG SCRIBE NOTES, NO MATTER HOW WELL WRITTEN, 107 | %**** ARE NEVER READ BY ANYBODY. 108 | 109 | 110 | \section{Model Selection} % Don't be this informal in your notes! 
111 | 112 | We start our discussion by modeling the posterior distribution: 113 | 114 | \begin{equation*} 115 | \begin{aligned} 116 | & p(\theta | x) = e^{-\beta w(\theta,x)} 117 | \\ 118 | & w(\theta,x) = R(\theta,x) - F(x)\\ 119 | & F(x) = - \dfrac{1}{\beta} \log{\int{e^{-\beta R(\theta,x)}d\theta}} &&& \text{(Normalization term for $p(\theta|x)$)} 120 | \end{aligned} 121 | \end{equation*} 122 | 123 | Next, we can compute the validation error on training set $x'$ and validation set $x''$: 124 | 125 | \begin{equation*} 126 | \begin{aligned} 127 | & \mathbb{E}_{\theta|x'} \Big[ - \log{p(\theta|x'')}\Big] 128 | \\ 129 | & = \mathbb{E}_{\theta|x'} \Big[\beta w(\theta,x'')\Big]\\ 130 | & = \beta \Big( \underbrace{\mathbb{E}_{\theta|x'} \Big[ R(\theta,x'')\Big]}_{Loss} \underbrace{-F(x'')}_\text{Free Energy} \Big ) &&& \text{(Note that $F(x'')$ does not depend on $\theta$ since we integrate it out)} 131 | \end{aligned} 132 | \end{equation*} 133 | 134 | The problem, in practice, is that people just minimize the loss while completely ignoring the free energy. This is the drawback of using the validation error for model selection. 135 | 136 | To overcome this limitation, we can perform “posterior selection”: 137 | 138 | \begin{equation*} 139 | \begin{aligned} 140 | & \underset{p(.|.)}{\min} \; \mathbb{E}_{\theta|x'} \Big[ - \log{p(\theta|x'')}\Big] \geq \underset{p(.|.)}{\min} \; - \log{\mathbb{E}_{\theta|x'} \Big[ {p(\theta|x'')}}\Big] &&& \text{(Jensen's inequality)} 141 | \\ 142 | & = \underset{p(.|.)}{\min} \; - \log{ \Big( \int{p(\theta|x') p(\theta|x'') d\theta} \Big)} \\ 143 | & = - \underset{p(.|.)}{\max} \; \log{ \Big( \underbrace{\int{p(\theta|x') p(\theta|x'') d\theta}}_{\text{Probability Kernel $k(x',x'')$}} \Big)} 144 | \end{aligned} 145 | \end{equation*} 146 | \newpage 147 | 148 | A few remarks on this: 149 | \begin{itemize} 150 | \item The kernel $k(x',x'')$ measures the “agreement” of the two posteriors. 151 | \item This strategy chooses a posterior that is concentrated (peaked) and agrees between $x'$ and $x''$. 152 | \item When we apply Jensen's inequality we have no guarantee that the posteriors (the one of the original optimization problem and the one from Jensen's) are similar. 153 | \item Maximizing $\mathbb{E}_{x',x''} \Big[ \log{k(x',x'')} \Big ]$ is a metric concept, since the kernel captures the similarity between $x'$ and $x''$, whereas minimizing $\mathbb{E}_{\theta | x'} \Big [R(\theta,x'')\Big]$ is a search strategy based on a partial order. The argument we make here is that the latter is more sensitive to noise. (In the space of $\theta$, two solutions might be far away from each other and still have the same cost value.) 154 | \end{itemize} 155 | 156 | 157 | \section{Dirichlet Processes} 158 | The principal problem with finite mixture models is how to choose the number of components $\mathbf{K}$. However, in many cases, there is no well-defined number of clusters. It would be much better if we did not have to choose $K$ at all. 159 | In this section, we discuss infinite mixture models, in which we do not impose any a priori bound on $K$. To do this, we will use a non-parametric prior based on the Dirichlet process ($DP$). This allows the number of clusters to grow as the amount of data increases.
160 | 161 | \subsection{Stick-breaking Construction} 162 | To build the mixture model we need to sample the cluster probabilities: $\rho_{1 : K} \sim Dir(a_{1 :K})$.\\ However, there are two problems: 163 | \begin{itemize} 164 | \item As $K \to \infty$ we cannot sample infinitely many components from $Dir(\cdot)$. 165 | \item These probabilities should sum to $1$. 166 | \end{itemize} 167 | We observe that: 168 | $$ \rho_{1:K} \sim Dir\Big(a_{1:K}\Big) \iff 169 | \rho_{1} \sim Beta\Big(a_{1}, \sum_{k=2}^{K} a_{k} \Big) \; \indep \; \dfrac{\rho_{2:K}}{1-\rho_{1}} \sim Dir\Big(a_{2:K} \Big) 170 | $$ 171 | 172 | Thus, we can sample from the Dirichlet using this technique: 173 | \begin{figure}[h] 174 | \centering 175 | \includegraphics[width=0.7\textwidth]{img/stick-breaking.png} 176 | \end{figure} 177 | 178 | \newpage 179 | 180 | How can we generate $K \to \infty$ probabilities that strictly sum to one?\\ 181 | We fix the Betas in the stick-breaking process to $Beta(1,\alpha)$. 182 | 183 | \vspace{1mm} 184 | 185 | \begin{figure}[h] 186 | \centering 187 | \includegraphics[width=0.7\textwidth]{img/stick-breaking2.png} 188 | \end{figure} 189 | 190 | \vspace{1mm} 191 | We finally obtain a distribution $GEM(\alpha)$ from which we can sample our infinite vector of probabilities $\rho \sim GEM(\alpha)$. 192 | Observe that $\alpha$ is a hyper-parameter that regulates how much of the stick the first draws obtain. The larger $\alpha$, the smaller the first pieces of the stick will be. 193 | 194 | \begin{figure}[h] 195 | \centering 196 | \includegraphics[width=0.6\textwidth]{img/gem1.png} 197 | \includegraphics[width=0.6\textwidth]{img/gem10.png} 198 | \includegraphics[width=0.6\textwidth]{img/gem100.png} 199 | \end{figure} 200 | \newpage 201 | \subsection{Mixture Model} 202 | We now have all we need to build our Dirichlet Process Mixture Model. \\ \\ 203 | First, we draw an infinity of cluster probabilities $\rho \sim GEM(\alpha)$. 204 | 205 | \begin{figure}[h] 206 | \centering 207 | \includegraphics[width=0.3\textwidth]{img/frequencies.png} 208 | \end{figure} 209 | 210 | Second, we draw an infinity of $\mu_{k} \sim \mathcal{N}\Big(\mu_{0}, \Sigma_{0}\Big), k=1,2, \ldots$ and we assign each $\mu_{k}$ to the respective probability $\rho_{k}$. 211 | 212 | \begin{figure}[h] 213 | \centering 214 | \includegraphics[width=0.3\textwidth]{img/us.png} 215 | \end{figure} 216 | 217 | The resulting distribution is $G= \sum_{k=1}^{\infty} \rho_{k} \delta_{\mu_{k}} \sim DP\Big(\alpha, \mathcal{N}(\mu_{0}, \Sigma_{0})\Big)$. We have successfully constructed a Dirichlet Process. 218 | 219 | 220 | To generate data from this model, we can sample $z_{n} \sim Categorical(\rho)$ and set $\mu_{n} = \mu_{z_{n}}$, i.e. $\mu_{n} \sim G$. Next, we sample the data point $x_{n} \sim \mathcal{N}(\mu_{n}, \Sigma_{0})$. 221 | 222 | However, this is infeasible in practice because we need to draw an infinity of $\rho_{k}$'s. The key idea to solve this problem is to draw the $\rho_{k}$ on demand. We can draw from $GEM(2)$ using a Uniform$(0,1)$ sample: if the sample we draw from the uniform distribution (red cursor in the figure) lands in a piece of the stick that is already covered, we draw the data point from the corresponding Gaussian with mean $\mu_{k}$. If it is not covered (white space in the figure) we continue the stick-breaking process until it is covered.
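The following minimal Python sketch (our own illustration) implements this lazy sampler: $Beta(1,\alpha)$ sticks are broken off only when the uniform draw lands in the still-uncovered part of the stick:

\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
alpha = 2.0
rho = []          # stick pieces (cluster probabilities) drawn so far
remaining = 1.0   # length of stick not yet broken off

def sample_cluster():
    """Draw z ~ Categorical(rho), breaking new sticks on demand."""
    global remaining
    u = rng.uniform()              # the "red cursor"
    cum = 0.0
    for k, piece in enumerate(rho):
        cum += piece
        if u < cum:
            return k               # u falls in an already-covered piece
    while u >= cum:                # u in the uncovered "white space"
        beta = rng.beta(1.0, alpha)
        piece = beta * remaining   # break a new piece off the stick
        remaining -= piece
        rho.append(piece)
        cum += piece
    return len(rho) - 1

draws = [sample_cluster() for _ in range(1000)]
print(len(rho), "clusters instantiated")
\end{verbatim}

Each time a new cluster index $k$ appears, one would also instantiate its mean $\mu_{k} \sim \mathcal{N}(\mu_{0}, \Sigma_{0})$ and then draw $x_{n} \sim \mathcal{N}(\mu_{k}, \Sigma_{0})$, as described above.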
223 | 224 | 225 | \begin{figure}[h] 226 | \centering 227 | \includegraphics[width=0.6\textwidth]{img/gem2.png} 228 | \end{figure} 229 | \newpage 230 | \subsection{Chinese Restaurant Process} 231 | The Chinese restaurant process, or CRP, is based on the seemingly infinite supply of tables at certain Chinese restaurants. 232 | The analogy is as follows: The tables are like clusters, and the customers are like observations. When a person enters the restaurant, he may choose to join an existing table with probability proportional to the number of people already sitting at this table (the $|\tau|$ term); otherwise, with a probability that diminishes as more people enter the room (due to the $\dfrac{1}{\alpha + n} $ term), he may choose to sit at a new table. The result is a distribution over partitions of the integers, which corresponds to an assignment of customers to tables. 233 | The fact that currently occupied tables are more likely to get new customers is sometimes called the \textbf{rich get richer} phenomenon. 234 | 235 | \begin{figure}[h] 236 | \centering 237 | \includegraphics[width=0.6\textwidth]{img/cp.png} 238 | \end{figure} 239 | 240 | $$\pi_{[10]} = \{\{1,2,4,7\}, \{3,6,8\},\{5,9\},\{10\}\}$$ 241 | \[ 242 | P(\text{customer} \; n+1 \; \text{joins table} \; \tau \; | \pi)= 243 | \begin{cases} 244 | \dfrac{ |\tau|}{\alpha+n} & \text{if $\tau \in \pi$} \\ 245 | \dfrac{\alpha}{\alpha +n} & \text{otherwise} \\ 246 | \end{cases} 247 | \] 248 | 249 | The probability of a specific table configuration is given by: 250 | $$P(\pi_{[n]}) =\dfrac{ \alpha^{|\pi_{[n]}|}}{\alpha^{(n)}} \prod_{\tau \in \pi_{[n]}} (|\tau| -1)! $$ 251 | where $\alpha^{(n)}$ is the ascending factorial. 252 | 253 | \subsection{Exchangeability} 254 | 255 | Let $(X_{1},X_{2},\ldots)$ be a sequence of random variables. The sequence is exchangeable when, for every permutation $\pi$ of $\mathbb{N}$, the random vectors: 256 | 257 | $$(X_{1},X_{2},\ldots) \text{ and} \; (X_{\pi(1)}, X_{\pi(2)}, \ldots)$$ 258 | 259 | have the same distribution. 260 | 261 | \begin{theorem}(De Finetti)\\ 262 | Let $(X_{1},X_{2},\ldots)$ be an infinitely exchangeable sequence of random variables. Then, $\forall n$: 263 | 264 | $$p(X_{1},\ldots,X_{n}) = \int \Big( \prod_{i=1}^{n} p(x_{i}|G)\Big)dP(G)$$ 265 | 266 | for some random variable $G$. 267 | 268 | \end{theorem} 269 | 270 | In the case of i.i.d. random variables we would only have one $G$ and the theorem would reduce to $p(X_{1},\ldots,X_{n}) = \prod_{i=1}^{n} p(x_{i})$. 271 | \\ \\ 272 | Notice that the CRP is exchangeable $\implies $ we can apply De Finetti's Theorem. In the CRP's case, the Dirichlet Process is the random variable $G$ of De Finetti's theorem. The intuition behind this is that the probability of a particular table configuration is given by a “vote” weighted over all the underlying distributions $G$ from which the probabilities of the tables (remember the $\rho$'s) are picked. 273 | 274 | \subsection{Fitting} 275 | We can leverage exchangeability to fit our model: any point can be considered the last to arrive.\\ In the case of the CRP: for each observation, we remove the customer from the restaurant and resample their table as if they were the last to enter. 276 | 277 | \begin{itemize} 278 | \item Take a random guess initially. 279 | \item Unassign observation $i$. 280 | \item Compute $p(z_{i} | z_{-i}, x, \alpha, \mu)$, which represents the cluster-assignment distribution for element $i$. 281 | \item Update $z_{i}$ by sampling from this distribution. 282 | \item Keep going.
283 | 284 | \end{itemize} 285 | 286 | $p(z_{i} | z_{-i}, x, \alpha, \mu)$ is computed as follows: 287 | 288 | $$p(z_{i}=k | z_{-i}, \mathbf{x}, \alpha, \mathbf{\mu}) \propto \underbrace{p(z_{i}=k | z_{-i}, \alpha)}_{Prior} \; \underbrace{p(x_{i}| \mu, z_{i}=k, z_{-i}, \mathbf{x_{-i}})}_{Likelihood}$$ 289 | 290 | 291 | The prior computation is simple (CRP): 292 | 293 | 294 | \[ 295 | p(z_{i}=k | z_{-i}, \alpha)= 296 | \begin{cases} 297 | \dfrac{N_{k,-i}}{\alpha+N-1} & \text{for existing $k$} \\ 298 | \dfrac{\alpha}{\alpha +N-1} & \text{otherwise} \\ 299 | \end{cases} 300 | \] 301 | 302 | Finally, for the likelihood we don't need to consider points in $\mathbf{x}$ that are not in cluster $k$: 303 | 304 | \[ 305 | p(x_{i}| \mu, z_{i}=k, z_{-i}, \mathbf{x_{-i}})= 306 | \begin{cases} 307 | p(x_{i} | x_{-i,k}, \mu) & \text{for existing $k$} \\ 308 | p(x_{i} | \mu) & \text{otherwise} \\ 309 | \end{cases} 310 | \] 311 | 312 | 313 | 314 | 315 | \end{document} -------------------------------------------------------------------------------- /L12_PAC_Learning.tex: -------------------------------------------------------------------------------- 1 | \documentclass[twoside]{article} 2 | \setlength{\oddsidemargin}{0.25 in} 3 | \setlength{\evensidemargin}{-0.25 in} 4 | \setlength{\topmargin}{-0.6 in} 5 | \setlength{\textwidth}{6.5 in} 6 | \setlength{\textheight}{8.5 in} 7 | \setlength{\headsep}{0.75 in} 8 | \setlength{\parindent}{0 in} 9 | \setlength{\parskip}{0.1 in} 10 | \newcommand{\eqdef}{:\mathrel{\mathop=}} 11 | \newcommand{\norm}[1]{\left\lVert #1 \right\rVert} 12 | \def\Inf{\operatornamewithlimits{inf\vphantom{p}}} 13 | 14 | % 15 | % ADD PACKAGES here: 16 | % 17 | 18 | \usepackage{amsmath,amsfonts,graphicx} 19 | 20 | % 21 | % The following commands set up the lecnum (lecture number) 22 | % counter and make various numbering schemes work relative 23 | % to the lecture number. 24 | % 25 | \newcounter{lecnum} 26 | \renewcommand{\thepage}{\thelecnum-\arabic{page}} 27 | \renewcommand{\thesection}{\thelecnum.\arabic{section}} 28 | \renewcommand{\theequation}{\thelecnum.\arabic{equation}} 29 | \renewcommand{\thefigure}{\thelecnum.\arabic{figure}} 30 | \renewcommand{\thetable}{\thelecnum.\arabic{table}} 31 | \newcommand{\indep}{\raisebox{0.05em}{\rotatebox[origin=c]{90}{$\models$}}} 32 | 33 | % 34 | % The following macro is used to generate the header. 35 | % 36 | \newcommand{\lecture}[4]{ 37 | \pagestyle{myheadings} 38 | \thispagestyle{plain} 39 | \newpage 40 | \setcounter{lecnum}{#1} 41 | \setcounter{page}{1} 42 | \noindent 43 | \begin{center} 44 | \framebox{ 45 | \vbox{\vspace{2mm} 46 | \hbox to 6.28in { {\bf Advanced Machine Learning 47 | \hfill Fall 2020} } 48 | \vspace{4mm} 49 | \hbox to 6.28in { {\Large \hfill Lecture #1: #2 \hfill} } 50 | \vspace{2mm} 51 | \hbox to 6.28in { {\it #3 \hfill #4} } 52 | \vspace{2mm}} 53 | } 54 | \end{center} 55 | \markboth{Lecture #1: #2}{Lecture #1: #2} 56 | 57 | {\bf Note}: {\it LaTeX template courtesy of UC Berkeley EECS dept.} 58 | 59 | {\bf Disclaimer}: {\it These notes are adapted from ETH's Advanced Machine Learning Course, ETH's Statistical Learning Theory Script and "An Introduction to Computational Learning Theory" book.} 60 | \vspace*{4mm} 61 | } 62 | % 63 | % Convention for citations is authors' initials followed by the year. 64 | % For example, to cite a paper by Leighton and Maggs you would type 65 | % \cite{LM89}, and to cite a paper by Strassen you would type \cite{S69}. 66 | % (To avoid bibliography problems, for now we redefine the \cite command.)
67 | % Also commands that create a suitable format for the reference list. 68 | \renewcommand{\cite}[1]{[#1]} 69 | \def\beginrefs{\begin{list}% 70 | {[\arabic{equation}]}{\usecounter{equation} 71 | \setlength{\leftmargin}{2.0truecm}\setlength{\labelsep}{0.4truecm}% 72 | \setlength{\labelwidth}{1.6truecm}}} 73 | \def\endrefs{\end{list}} 74 | \def\bibentry#1{\item[\hbox{[#1]}]} 75 | 76 | %Use this command for a figure; it puts a figure in wherever you want it. 77 | %usage: \fig{NUMBER}{SPACE-IN-INCHES}{CAPTION} 78 | \newcommand{\fig}[3]{ 79 | \vspace{#2} 80 | \begin{center} 81 | Figure \thelecnum.#1:~#3 82 | \end{center} 83 | } 84 | % Use these for theorems, lemmas, proofs, etc. 85 | \newtheorem{theorem}{Theorem}[lecnum] 86 | \newtheorem{lemma}[theorem]{Lemma} 87 | \newtheorem{proposition}[theorem]{Proposition} 88 | \newtheorem{claim}[theorem]{Claim} 89 | \newtheorem{corollary}[theorem]{Corollary} 90 | \newtheorem{definition}[theorem]{Definition} 91 | \newenvironment{proof}{{\bf Proof:}}{\hfill\rule{2mm}{2mm}} 92 | 93 | % **** IF YOU WANT TO DEFINE ADDITIONAL MACROS FOR YOURSELF, PUT THEM HERE: 94 | 95 | \newcommand\E{\mathbb{E}} 96 | 97 | \begin{document} 98 | %FILL IN THE RIGHT INFO. 99 | %\lecture{**LECTURE-NUMBER**}{**DATE**}{**LECTURER**}{**SCRIBE**} 100 | \lecture{12}{PAC Learning}{}{} 101 | %\footnotetext{These notes are partially based on those of Nigel Mansell.} 102 | 103 | % **** YOUR NOTES GO HERE: 104 | 105 | % Some general latex examples and examples making use of the 106 | % macros follow. 107 | %**** IN GENERAL, BE BRIEF. LONG SCRIBE NOTES, NO MATTER HOW WELL WRITTEN, 108 | %**** ARE NEVER READ BY ANYBODY. 109 | 110 | 111 | \section{Introduction} % Don't be this informal in your notes! 112 | The aim of the probably approximately correct (PAC) learning 113 | model is to provide a framework for the classification problem that is distribution 114 | independent, thus avoiding the density estimation task, in the spirit of V.N. Vapnik: “Don't solve a harder problem than necessary”. In this section the PAC model is 115 | presented. \\ \\ 116 | The classification problem can be formalized using the following ideas: Let $\mathbf{\mathcal{X}}$ denote the \textbf{instance} space, that is the space that encodes the objects in the learner's 117 | (the algorithm's) world. A \textbf{concept} over $\mathbf{\mathcal{X}}$ is defined as a subset $\mathbf{c} \subset \mathbf{\mathcal{X}}$ of the instance 118 | space or equivalently as a function $c : \mathbf{\mathcal{X}} \to \{0, 1\}$.\\ A \textbf{concept class} is a collection of concepts. For instance $\mathbf{\mathcal{X}}$ 119 | can be the set of all possible configurations of a pixel array and the concept 120 | “$A$” can be the subset of configurations that represent the letter A.\\ \\ 121 | The \textbf{set of samples} or data is defined as a subset $\mathbf{\mathcal{Z}} = \{(x_{i} 122 | , y_{i}) : 1 \leq i \leq n \} \subset \mathbf{\mathcal{X}} \times \{0, 1\}$ where $n$ is 123 | the number of samples and $\{0, 1\}$ is the set of labels that denotes negative or positive 124 | examples.\\ 125 | A \textbf{classifier} is a function $c : \mathbf{\mathcal{X}} \to \{0, 1\}$ from the instance space 126 | to the label space. When the classifier has been trained under a specific set of samples it will be denoted by $\hat{c}_{n}$, where the $n$ specifies the number of sample points used. Finally, a \textbf{hypothesis class} is another set of concepts that we use to learn a target concept from the concept class.
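To fix ideas, a concept and a labeled sample can be written down in a few lines of Python (a toy illustration of our own; the rectangle concept anticipates the learning game of the next section):

\begin{verbatim}
import random

random.seed(0)

def concept(x):
    """A concept c: X -> {0,1}; here an axis-aligned rectangle in [0,1]^2."""
    return int(0.2 <= x[0] <= 0.6 and 0.3 <= x[1] <= 0.7)

# A labeled sample Z = {(x_i, y_i) : y_i = c(x_i)} with the x_i drawn
# i.i.d. from a distribution D (here: uniform on the unit square).
X = [(random.random(), random.random()) for _ in range(10)]
Z = [(x, concept(x)) for x in X]
print(Z)
\end{verbatim}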
127 | 128 | \begin{definition} 129 | Let $\mathbf{\mathcal{H}}$ and $c$ be a hypothesis class and a concept. A \textbf{learning algorithm} is an algorithm that receives as input a labeled sample $\mathbf{\mathcal{Z}} = \{(x_{i} 130 | , y_{i}) : 1 \leq i \leq n, \; \forall i \; y_{i} = c(x_{i}) \} $ and outputs a hypothesis $\hat{c} \in \mathbf{\mathcal{H}}$. 131 | \end{definition} 132 | 133 | \begin{definition} 134 | A learning algorithm $\mathbf{\mathcal{A}}$ can learn a concept $c \in \mathbf{\mathcal{C}}$ if there is a polynomial function $poly(\ldots)$ such that for any distribution $\mathbf{\mathcal{D}}$ on $\mathbf{\mathcal{X}}$ and for any $0 < \epsilon <\dfrac{1}{2}$ and $0 < \delta <\dfrac{1}{2}$: \\ 135 | if $\mathbf{\mathcal{A}}$ receives as input a sample $\mathbf{\mathcal{Z}}$ of size $n \geq poly\Big(\dfrac{1}{\epsilon},\dfrac{1}{\delta},size(\mathcal{X})\Big)$, then $\mathbf{\mathcal{A}}$ outputs $\hat{c}$ such that: 136 | $$\mathbf{\mathbb{P}}_{\mathcal{Z} \sim \mathcal{D}^n}\Big(\mathcal{R}(\hat{c}) \leq \epsilon \Big) \geq 1- \delta$$ Here $\mathcal{R}(\hat{c}) = \mathbb{P}_{x \sim \mathcal{D}}\big(\hat{c}(x) \neq c(x)\big)$ denotes the generalization error of $\hat{c}$. 137 | \end{definition} 138 | 139 | First notice that the distribution of the data is arbitrary but fixed, therefore if a concept class is 140 | known to be PAC learnable, no matter how pathological the distribution $\mathbf{\mathcal{D}}$ might 141 | be in applications, the $\epsilon$,$ \delta$ guarantees will hold. \\ 142 | Second, the concept class $\mathbf{\mathcal{C}}$ is fixed 143 | and known to the algorithm designer in advance, but the target concept $c \in $ $\mathbf{\mathcal{C}}$ that 144 | needs to be learned is not, and the algorithm must be designed such that it works for 145 | any concept in the class.\\ 146 | Third, if an algorithm requires resources (samples, time) 147 | that scale polynomially in $\dfrac{1}{\epsilon}$ 148 | and $\dfrac{1}{\delta}$, where $\epsilon$ and $\delta$ are known respectively as the \textbf{error} 149 | and \textbf{confidence} parameters, then it is said that the concept class of the problem is 150 | \textbf{PAC efficient}. 151 | 152 | \begin{definition} 153 | A concept class $\mathbf{\mathcal{C}}$ is PAC learnable from a hypothesis class $\mathbf{\mathcal{H}}$ if there is an algorithm that can learn every concept $ c \in \mathbf{\mathcal{C}}$. 154 | \end{definition} 155 | 156 | \section{Rectangle Learning} 157 | 158 | The idea of the game is to learn, or find, an 159 | unknown rectangle that is aligned with the Cartesian axes in $\mathbb{R}^2$. For this 161 | task the player can request an oracle to provide points $(x_{i}, y_{i})$ that are distributed 163 | according to a fixed but unknown distribution $\mathbf{\mathcal{D}}$, and are labeled as positive if they are inside the unknown rectangle and negative otherwise. In the language 164 | of the PAC definition, the instance space is the Euclidean plane $\mathbf{\mathcal{X}}= \mathbb{R}^2$, the concept 165 | class is the set of all possible axis-aligned rectangles and a concept is just one specific 166 | rectangle from the concept class (see Fig. 12.1). The classification rule or hypothesis 167 | $h$ formulated by the player is tested by comparing its predictions against those of 168 | the oracle.\\ \\ 169 | \begin{figure}[h] 170 | \centering 171 | 172 | \includegraphics[width=0.4\textwidth]{img/pac1.png} 173 | \caption{(Left) A depiction of the concept class of axis-aligned rectangles in the instance 174 | space $\mathcal{X} = \mathbb{R}^2$.
(Right) Illustration of a particular concept $c$ from the concept class and 175 | the associated training data; observe that every data point inside the rectangle (concept) is 176 | labeled as positive and that the player must find the closest approximation to $c$.} 177 | \end{figure} 178 | 179 | A simple strategy to approach this task is known as the “tightest fitting rectangle”. The idea is to request $n$ data points from the oracle and from those that are labeled as positive build the largest possible rectangle that encloses them, without enclosing any negative point (see Fig. 12.2 Left). 180 | \begin{figure}[h] 181 | \centering 182 | \includegraphics[width=0.4\textwidth]{img/pac2.png} 183 | \caption{(Left) Tightest fitting rectangle hypothesis $h$ (dark shaded rectangle). (Right) 184 | Construction for the error analysis of the tightest fitting rectangle strategy.} 185 | \end{figure} 186 | 187 | Formally, we measure the error of $\hat{R}$ as the probability that a randomly chosen point from $\mathcal{D}$ falls in the region $R \Delta \hat{R}$. In our learning game, we allow the distribution $\mathcal{D}$ to be arbitrary, but we assume that it is fixed, and that each example is drawn \textbf{independently} from this distribution. We will now show that the tightest-fit rectangle algorithm $\mathcal{A}$ can learn any concept $ R \in \mathcal{C}$.\\\\ 188 | 189 | First, observe that the tightest-fit rectangle $\hat{R}$ is always contained in the target rectangle $R$ and so $R \Delta \hat{R} = R - \hat{R}$. We can express the difference $R-\hat{R}$ as the union of four rectangular strips. For instance, the topmost of these strips, which is shaded and denoted $T'$ in Figure 12.2, is the region of $R$ above the upper boundary of $\hat{R}$. Note that there is some overlap between these four rectangular strips at the corners.\\ Now if we can guarantee that the weight under $\mathcal{D}$ of each strip (that is the probability with respect to $\mathcal{D}$ of falling in the strip) is at most $\epsilon/4$, then we can conclude that the error of $\hat{R}$ is at most $\epsilon$. (Here we have erred on the side of pessimism by counting each overlap region twice.)\\\\ 190 | Let us analyze the weight of the top strip $T'$. Define $T$ to be the rectangular strip along the inside top of $R$ which encloses exactly weight $\epsilon/4$ under $\mathcal{D}$ (thus we sweep the top edge of $R$ downward until we have swept out weight $\epsilon/4$, see Figure 12.2). Clearly, $T'$ 191 | has weight exceeding $\epsilon/4$ under $\mathcal{D}$ if and only if $T'$ includes $T$. Furthermore, $T'$ includes $T$ if and only if no point in $T$ appears in the sample $\mathcal{Z}$: if $\mathcal{Z}$ does contain a point $p \in T$, this point has a positive label since it is contained in $R$, and then by construction $\hat{R}$ must extend upwards into $T$ to cover $p$.\\\\ 192 | By definition, the probability that a single draw from the distribution $\mathcal{D}$ misses the region $T$ is exactly $1- \epsilon / 4$. Therefore, the probability that $n$ independent draws from $\mathcal{D}$ all miss the region $T$ is exactly $(1-\epsilon/4)^n$.
The same analysis holds for the other three rectangular regions, so by the union bound, the probability that any of the four strips of $R \Delta \hat{R}$ has weight greater than $\epsilon/4$ is at most $4(1-\epsilon/4)^n$.\\\\ 193 | Provided that we choose $n$ to satisfy $4(1-\epsilon/4)^n \leq \delta$, then with probability $1-\delta$ over the $n$ random examples, the weight of the error region $R \Delta \hat{R}$ will be bounded by $\epsilon$, as claimed.\\ Using the inequality $(1-x) \leq e^{-x}$: 194 | $$4 \; (1-\epsilon/4)^n \leq 4 \; e^{-n\epsilon/4} \leq \delta$$ 195 | Finally solving for $n$: 196 | 197 | $$n \geq \dfrac{4}{\epsilon} \log{\dfrac{4}{\delta}}$$ 198 | 199 | In summary, provided our tightest-fit algorithm takes a sample of at least $(4/\epsilon) \log(4/\delta)$ examples to form its hypothesis rectangle $\hat{R}$, we can assert that with probability at least $1-\delta$, $\hat{R}$ will misclassify a new point with probability at most $\epsilon$.\\\\ 200 | A few brief comments are appropriate. First, note that the analysis does hold for any fixed probability distribution. We only needed the independence of successive points to obtain the bound. Second, the sample size bound behaves as we might expect, in that as we increase our demands - that is, as we ask for greater \textbf{accuracy} by decreasing $\epsilon$ or greater \textbf{confidence} by decreasing $\delta$ - our algorithm requires more examples to meet those demands. Finally, the algorithm we analyzed is PAC efficient: the required sample size is a slowly growing function of $1/\epsilon$ and $1/\delta$. 201 | \newpage 202 | \section{Error probability for realizable finite hypothesis classes} 203 | 204 | \begin{theorem} 205 | Let $\mathcal{C}$ be a finite concept class and assume that $\mathcal{H} = \mathcal{C}$. Let $\mathcal{A}$ be an algorithm that returns a consistent hypothesis $\hat{c}$ (i.e., $\forall n < \infty : \hat{\mathcal{R}}_{n}(\hat{c})=0)$ for any target concept $c \in \mathcal{C}$ and any i.i.d. sample $\mathcal{Z}$. For any $\epsilon, \delta >0$, if: 206 | $$ n \geq \dfrac{1}{\epsilon}\Big(\log{|\mathcal{H}|} + \log{\dfrac{1}{\delta}}\Big)$$ 207 | then the error probability is bounded by: 208 | 209 | $$\mathbb{P}(\mathcal{R}(\hat{c})> \epsilon) \leq \delta $$ 210 | \end{theorem} 211 | 212 | \begin{proof}\\ \\ 213 | $$\mathbb{P}\Big(\mathcal{R}(\hat{c})> \epsilon\Big) \leq \mathbb{P}\Big( \exists \hat{c}: \; \mathcal{R}(\hat{c}) > \epsilon,\; \hat{\mathcal{R}}_{n}(\hat{c}) = 0\Big) \leq \sum_{\hat{c}:\mathcal{R}(\hat{c}) > \epsilon}\mathbb{P}\Big(\hat{\mathcal{R}}_{n}(\hat{c}) =0 \Big)$$ 214 | 215 | $$\leq |\mathcal{C}| \; \Big (1-\mathcal{R}(\hat{c})\Big)^n \leq |\mathcal{C}| \; (1-\epsilon)^n \leq |\mathcal{C}| \; e^{-n \epsilon} \leq \delta$$ 216 | 217 | In particular, if we want 218 | this probability to be smaller than some $\delta > 0$, it would suffice to let\\ $n > \dfrac{1}{\epsilon} \big(\log{|\mathcal{C}|}+ \log(1/\delta) \big)$. 219 | \end{proof} 220 | 221 | \section{The General PAC-learning model} 222 | In general, an instance label is not determined by the underlying concept. This is modeled with a distribution $\mathcal{D}$ on $\mathcal{X} \times \{0,1\}$ and reflects the fact that two instances with identical features might have different labels. The training set $\mathcal{Z}$ is therefore a sample from $\mathcal{D}$. Our goal is to find a hypothesis $\hat{c} \in \mathcal{H}$ with small generalization error $\mathcal{R}(\hat{c})$.
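As a quick numerical sanity check of the two sample-size bounds derived above, the following sketch (our own illustration) evaluates them for a typical accuracy/confidence pair:

\begin{verbatim}
import math

def n_rectangle(eps, delta):
    """Tightest-fit rectangle: n >= (4/eps) * log(4/delta)."""
    return math.ceil(4.0 / eps * math.log(4.0 / delta))

def n_finite_class(eps, delta, size_H):
    """Consistent learner, finite class: n >= (log|H| + log(1/delta))/eps."""
    return math.ceil((math.log(size_H) + math.log(1.0 / delta)) / eps)

print(n_rectangle(0.01, 0.05))             # 1753 examples
print(n_finite_class(0.01, 0.05, 10**6))   # 1682 examples
\end{verbatim}

In both realizable settings above, a hypothesis of error at most $\epsilon$ is guaranteed to exist.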
221 | \section{The General PAC-learning model} 222 | In general, an instance label is not determined by the underlying concept. This is modeled with a distribution $\mathcal{D}$ on $\mathcal{X} \times \{0,1\}$ and reflects the fact that two instances with identical features might have different labels. The training set $\mathcal{Z}$ is therefore a sample from $\mathcal{D}$. Our goal is to find a hypothesis $\hat{c} \in \mathcal{H}$ with small generalization error $\mathcal{R}(\hat{c})$.\\ \\ 223 | However, if the optimal classifier is not an element of the hypothesis class, then it is impossible to attain $ \forall \; 0 < \epsilon < 1/2: \mathcal{R}(\hat{c}) \leq \epsilon$. Instead, we aim to obtain the best possible solution, i.e., 224 | $$\mathcal{R}(\hat{c}) - \inf_{c \in \mathcal{C}}{\mathcal{R}(c)} \leq \epsilon$$ 225 | 226 | \begin{definition} 227 | A learning algorithm $\mathbf{\mathcal{A}}$ can learn a concept $c \in \mathbf{\mathcal{C}}$ if there is a polynomial function $poly(\ldots)$ such that for any distribution $\mathbf{\mathcal{D}}$ on $\mathbf{\mathcal{X}}$ and for any $0 < \epsilon <\dfrac{1}{2}$ and $0 < \delta <\dfrac{1}{2}$: \\ 228 | if $\mathbf{\mathcal{A}}$ receives as input a sample $\mathbf{\mathcal{Z}}$ of size $n \geq poly\Big(\dfrac{1}{\epsilon},\dfrac{1}{\delta},size(\mathcal{X})\Big)$, then $\mathbf{\mathcal{A}}$ outputs $\hat{c}$ such that: 229 | $$\mathbf{\mathbb{P}}_{\mathcal{Z} \sim \mathcal{D}^n}\Big(\mathcal{R}(\hat{c}) - \inf_{c \in \mathcal{C}}{\mathcal{R}(c)} \leq \epsilon \Big) \geq 1- \delta$$ 230 | \end{definition} 231 | \newpage 232 | 233 | \section{Vapnik-Chervonenkis Inequality} 234 | 235 | Theorem 12.4 made two restrictive assumptions: 236 | \begin{itemize} 237 | \item First, there exists a perfect hypothesis (realizability). What happens when the 238 | problem is not realizable (all hypotheses make some error)? 239 | \item Second, the hypothesis class is finite. What happens when the number of hypotheses is infinite? We can’t just apply a union bound any more. To answer 240 | this, we need more suitable ways of measuring the “size” of a set other 241 | than cardinality. 242 | \end{itemize} 243 | 244 | Breaking free of these restrictive assumptions, we will show how bounding the expected 245 | risk can be reduced to a question of uniform convergence. Recall that our goal is to bound 246 | the excess risk, the amount by which the expected risk of the ERM (empirical risk minimizer) $\hat{c}$ exceeds the lowest possible 247 | expected risk, i.e., to guarantee 248 | $$\mathbb{P}\big[\mathcal{R}(\hat{c})- \mathcal{R}(c^*) > \epsilon\big] \leq \delta$$ 249 | 250 | \begin{theorem} The following bound holds:\\ \\ 251 | $\mathbb{P}\Big[\mathcal{R}(\hat{c}) - \inf_{c \in \mathcal{C}}{\mathcal{R}(c)} > \epsilon \Big] \leq \mathbb{P}\Big[ \sup_{c \in \mathcal{C}}{|\mathcal{R}(c)- \mathcal{\hat{R}}_{n}(c) | > \epsilon/2} \Big]$ 252 | \end{theorem} 253 | 254 | \begin{proof} Writing $c^*$ for a minimizer of $\mathcal{R}$ over $\mathcal{C}$, 255 | $$\mathcal{R}(\hat{c})- \inf_{c \in \mathcal{C}}{\mathcal{R}(c)} = \mathcal{R}(\hat{c})- \mathcal{\hat{R}}_{n}(\hat{c})+ 256 | \mathcal{\hat{R}}_{n}(\hat{c}) - \mathcal{R}(c^*) \leq \mathcal{R}(\hat{c})- \mathcal{\hat{R}}_{n}(\hat{c})+ \mathcal{\hat{R}}_{n}(c^*) - \mathcal{R}(c^*)$$ where the inequality holds because $\hat{c}$ minimizes the empirical risk, so $\mathcal{\hat{R}}_{n}(\hat{c}) \leq \mathcal{\hat{R}}_{n}(c^*)$. Bounding each of the two differences by the supremum, 257 | $$\leq \sup_{c \in \mathcal{C}}{|\mathcal{R}(c)- \mathcal{\hat{R}}_{n}(c)|} + \sup_{c \in \mathcal{C}}{|\mathcal{R}(c)- \mathcal{\hat{R}}_{n}(c) |} 258 | = 2\sup_{c \in \mathcal{C}}{|\mathcal{R}(c)- \mathcal{\hat{R}}_{n}(c)|}.$$ Hence the event on the left-hand side of the theorem implies the event on the right-hand side. 259 | \end{proof} 260 | 261 | On the LHS is a statement about excess risk, and on the RHS is a statement about 262 | uniform convergence. The RHS is the probability of the event that the largest difference 263 | between the empirical and expected risk is at least $\epsilon/2$.
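The following toy simulation (ours; the ``classifiers'' are simply independent random guessers, so every one of them has true risk exactly $1/2$) makes the role of the supremum concrete: each individual empirical risk is an unbiased, concentrating estimate, yet the minimum over the class, which is what ERM reports, drifts away from $1/2$ as the class grows at fixed $n$.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(1)
n = 100                                  # fixed sample size
for m in [1, 10, 1000, 100000]:          # size of the hypothesis class
    # each classifier guesses labels at random, so R(c) = 1/2 for every c,
    # and each empirical risk is an independent Binomial(n, 1/2)/n draw
    emp = rng.binomial(n, 0.5, size=m) / n
    print(m, emp.min())                  # ERM's empirical risk shrinks with m
\end{verbatim}
In other words, $\sup_{c}|\mathcal{R}(c)-\mathcal{\hat{R}}_{n}(c)|$ grows with the size of the class unless $n$ grows accordingly.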
264 | \begin{figure}[h] 265 | \centering 266 | \includegraphics[width=0.6\textwidth]{img/erm.png} 267 | \caption{In the worst-case scenario pointwise convergence is not enough for the ERM algorithm to work. In the figure, $\mathcal{\hat{R}}_{n}(c)$ converges point-wise to $\mathcal{R}(c)$; however, since the convergence is not uniform, there are some $c$'s that have not yet converged for the current value of $n$, causing the deviation shown. In this case, the ERM algorithm will pick the $\hat{c}$ whose empirical risk deviates the most from $\mathcal{R}(c)$, thus getting further and further away from the best classifier $c^*$.} 268 | \end{figure} 269 | 270 | But why do we need \textbf{uniform convergence}?\\ 271 | Overfitting the data implies that $\mathcal{\hat{R}}_{n}(c)$ and $\mathcal{R}(c)$ are very different, even though $\mathcal{\hat{R}}_{n}(c)$ is an unbiased 272 | estimator of $\mathcal{R}(c)$! 273 | Why does this happen? $\mathcal{\hat{R}}_{n}(c)$ is just the sample-average version of $\mathcal{R}(c)$, right? Does this contradict the law 274 | of large numbers, by which $\mathcal{\hat{R}}_{n}(c)$ converges to $\mathcal{R}(c)$?\\ 275 | It is true that $\mathcal{\hat{R}}_{n}(c)$ is an unbiased estimator of $\mathcal{R}(c)$, and indeed the law of large numbers is applicable 276 | in this case. But a key requirement for using the law of large numbers is that $c$ is fixed. Namely, 277 | if the classifier $c$ is fixed, then the law of large numbers guarantees that the empirical risk $\mathcal{\hat{R}}_{n}(c)$ converges 278 | to the true risk function $\mathcal{R}(c)$. 279 | However, when we are searching for the best classifier, we are considering a great many possible classifiers $c$ simultaneously. 280 | Although the law of large numbers works for each given classifier $c$, it need not work uniformly when we consider many 281 | classifiers at once. Empirical risk minimization works if: 282 | $$\sup_{c \in \mathcal{C}}{|\mathcal{R}(c) - \mathcal{\hat{R}}_{n}(c)|} \overset{p}{\to} 0$$ 283 | 284 | Namely, if the convergence is uniform over all classifiers in the collection that we are considering. 285 | 286 | 287 | \end{document} -------------------------------------------------------------------------------- /L2_Representations.tex: -------------------------------------------------------------------------------- 1 | \documentclass[twoside]{article} 2 | \setlength{\oddsidemargin}{0.25 in} 3 | \setlength{\evensidemargin}{-0.25 in} 4 | \setlength{\topmargin}{-0.6 in} 5 | \setlength{\textwidth}{6.5 in} 6 | \setlength{\textheight}{8.5 in} 7 | \setlength{\headsep}{0.75 in} 8 | \setlength{\parindent}{0 in} 9 | \setlength{\parskip}{0.1 in} 10 | \newcommand{\eqdef}{:\mathrel{\mathop=}} 11 | \newcommand{\norm}[1]{\left\lVert #1 \right\rVert} 12 | 13 | % 14 | % ADD PACKAGES here: 15 | % 16 | 17 | \usepackage{amsmath,amsfonts,graphicx,dsfont,amssymb} 18 | % 19 | % The following commands set up the lecnum (lecture number) 20 | % counter and make various numbering schemes work relative 21 | % to the lecture number. 22 | % 23 | \newcounter{lecnum} 24 | \renewcommand{\thepage}{\thelecnum-\arabic{page}} 25 | \renewcommand{\thesection}{\thelecnum.\arabic{section}} 26 | \renewcommand{\theequation}{\thelecnum.\arabic{equation}} 27 | \renewcommand{\thefigure}{\thelecnum.\arabic{figure}} 28 | \renewcommand{\thetable}{\thelecnum.\arabic{table}} 29 | \newcommand{\indep}{\raisebox{0.05em}{\rotatebox[origin=c]{90}{$\models$}}} 30 | 31 | % 32 | % The following macro is used to generate the header.
33 | % 34 | \newcommand{\lecture}[4]{ 35 | \pagestyle{myheadings} 36 | \thispagestyle{plain} 37 | \newpage 38 | \setcounter{lecnum}{#1} 39 | \setcounter{page}{1} 40 | \noindent 41 | \begin{center} 42 | \framebox{ 43 | \vbox{\vspace{2mm} 44 | \hbox to 6.28in { {\bf Advanced Machine Learning 45 | \hfill Fall 2021} } 46 | \vspace{4mm} 47 | \hbox to 6.28in { {\Large \hfill Lecture #1: #2 \hfill} } 48 | \vspace{2mm} 49 | \hbox to 6.28in { {\it #3 \hfill #4} } 50 | \vspace{2mm}} 51 | } 52 | \end{center} 53 | \markboth{Lecture #1: #2}{Lecture #1: #2} 54 | 55 | {\bf Note}: {\it LaTeX template courtesy of UC Berkeley EECS dept.} 56 | 57 | {\bf Disclaimer}: {\it These notes are adapted from ETH's Advanced Machine Learning Course, All Of Statistics, Larry Wasserman, Springer and Statistical Machine Learning Notes 3, Justin Domke, UMASS.} 58 | \vspace*{4mm} 59 | } 60 | % 61 | % Convention for citations is authors' initials followed by the year. 62 | % For example, to cite a paper by Leighton and Maggs you would type 63 | % \cite{LM89}, and to cite a paper by Strassen you would type \cite{S69}. 64 | % (To avoid bibliography problems, for now we redefine the \cite command.) 65 | % Also commands that create a suitable format for the reference list. 66 | \renewcommand{\cite}[1]{[#1]} 67 | \def\beginrefs{\begin{list}% 68 | {[\arabic{equation}]}{\usecounter{equation} 69 | \setlength{\leftmargin}{2.0truecm}\setlength{\labelsep}{0.4truecm}% 70 | \setlength{\labelwidth}{1.6truecm}}} 71 | \def\endrefs{\end{list}} 72 | \def\bibentry#1{\item[\hbox{[#1]}]} 73 | 74 | %Use this command for a figure; it puts a figure in wherever you want it. 75 | %usage: \fig{NUMBER}{SPACE-IN-INCHES}{CAPTION} 76 | \newcommand{\fig}[3]{ 77 | \vspace{#2} 78 | \begin{center} 79 | Figure \thelecnum.#1:~#3 80 | \end{center} 81 | } 82 | % Use these for theorems, lemmas, proofs, etc. 83 | \newtheorem{theorem}{Theorem}[lecnum] 84 | \newtheorem{lemma}[theorem]{Lemma} 85 | \newtheorem{proposition}[theorem]{Proposition} 86 | \newtheorem{claim}[theorem]{Claim} 87 | \newtheorem{corollary}[theorem]{Corollary} 88 | \newtheorem{definition}[theorem]{Definition} 89 | \newenvironment{proof}{{\bf Proof:}}{\hfill\rule{2mm}{2mm}} 90 | 91 | % **** IF YOU WANT TO DEFINE ADDITIONAL MACROS FOR YOURSELF, PUT THEM HERE: 92 | 93 | \newcommand\E{\mathbb{E}} 94 | \DeclareMathOperator*{\argmin}{arg\,min} 95 | 96 | \begin{document} 97 | %FILL IN THE RIGHT INFO. 98 | %\lecture{**LECTURE-NUMBER**}{**DATE**}{**LECTURER**}{**SCRIBE**} 99 | \lecture{2}{Representations, measurements, data types}{}{} 100 | %\footnotetext{These notes are partially based on those of Nigel Mansell.} 101 | 102 | % **** YOUR NOTES GO HERE: 103 | 104 | % Some general latex examples and examples making use of the 105 | % macros follow. 106 | %**** IN GENERAL, BE BRIEF. LONG SCRIBE NOTES, NO MATTER HOW WELL WRITTEN, 107 | %**** ARE NEVER READ BY ANYBODY. 108 | 109 | \section{Data Representation} 110 | 111 | One of the fundamental problems in Machine Learning is how a machine should represent an object/concept.\\ 112 | Machines only understand numbers, thus we need to find a way to condense all the information about an object into a set of numbers (i.e.\ a vector). Such a representation can later be used to perform different tasks, whether we want to classify the object, generate new ones, or do anything else.\\ 113 | One must be extremely careful in choosing the data representation: a wrong data representation can induce inappropriate similarity measures.
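A minimal illustration of this point (our own toy numbers: three ``people'' described by height and salary): the very same objects, represented in two different unit systems, have opposite nearest neighbours under the Euclidean distance.
\begin{verbatim}
import numpy as np

# the same three people in (metres, USD) and in (centimetres, kUSD)
A1, B1, C1 = np.array([1.60, 65000.]), np.array([1.95, 65200.]), np.array([1.61, 80000.])
A2, B2, C2 = np.array([160., 65.0]),   np.array([195., 65.2]),   np.array([161., 80.0])

print(np.linalg.norm(A1 - B1) < np.linalg.norm(A1 - C1))  # True:  A's neighbour is B
print(np.linalg.norm(A2 - B2) < np.linalg.norm(A2 - C2))  # False: A's neighbour is C
\end{verbatim}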
114 | 115 | \subsection{Definition of Structures} 116 | 117 | A statistical definition of good and poor structures is mandatory for rational pattern recognition.\\ 118 | Multi-scale optimization yields efficient algorithms to detect good structures in data, but are the structures indeed in the data, or are they merely explained by random fluctuations?\\ 119 | Without \textbf{validation}, any pattern recognition strategy is doomed to fail. 120 | 121 | \subsection{Definition of Data} 122 | 123 | \textbf{Measurements}: associations of numbers with physical quantities and natural phenomena, obtained by comparing an unknown quantity with a known quantity of the same kind.\\ 124 | Our goal is to represent objects of interest and characterize them according to their typical patterns for detection, classification and abstraction. 125 | Measurements represent objects in a data space, e.g.\ digits as objects and pixel intensities as measurements. 126 | 127 | \section{Feature Space} 128 | 129 | \textbf{Measurement space $\mathcal{X}$:} the mathematical space in which the data are represented, e.g.\ numerical ($\mathcal{X} \subset \mathbb{R}^d$), Boolean ($\mathcal{X} = \mathbb{B}$) or categorical ($\mathcal{X} = \{1, ..., K\}$) features.\\ 130 | Features are derived quantities or indirect observations which often significantly compress the information content of measurements.\\ 131 | The selection of a specific feature space predetermines the metric used to compare data; this choice is the first significant design decision in a machine learning system. 132 | 133 | \newpage 134 | 135 | \section{Learning Problems} 136 | 137 | Learning requires inferring a functional or statistical relationship between variables when we only observe noisy samples. 138 | Approximation and interpolation in function estimation are such procedures. \\ 139 | Without additional assumptions the problem is mathematically ill-defined, since many different functions might be compatible with our observations. We therefore require that our inference ``work'' on future data: 140 | mathematically, the expected quality of inference should be high, and not merely the empirically observed quality.\medskip 141 | 142 | Applications in which the training data comprises examples of the input vectors along with their corresponding target vectors are known as \textbf{supervised learning} problems.\\ 143 | Cases such as the digit recognition example, in which the aim is to assign each input vector to one of a finite number of discrete categories, are called \textbf{classification} problems.\\ 144 | If the desired output consists of one or more continuous variables, then the task is called \textbf{regression}.\medskip 145 | 146 | In other pattern recognition problems, the training data consists of a set of input vectors $\mathcal{X}$ without any corresponding target values. The goal in such \textbf{unsupervised learning} problems may be to discover groups of similar examples within the data, in which case it is called \textbf{clustering}, or to determine the distribution of data within the input space, known as \textbf{density estimation}, or to project the data from a high-dimensional space down to two or three dimensions for the purpose of visualization (\textbf{dimension reduction}).\medskip 147 | 148 | Another task could be \textbf{data compression}, where a system predicts the posterior probabilities of a sequence given its entire history.
This can be used for optimal data compression.\medskip 149 | 150 | Finally, the technique called \textbf{reinforcement learning} is concerned with the problem of finding suitable actions to take in a given situation in order to maximise a reward. Here, the learning algorithm is not given examples of optimal outputs, in contrast to supervised learning, but must instead discover them by a process of trial and error. 151 | 152 | \subsection{Supervised Learning} 153 | 154 | A teacher (oracle) provides the correct answer during training.\\ 155 | Data are pairs of features and response variables $\{(x_1, y_1), ..., (x_n, y_n) : x_i \in \mathcal{X} \subset \mathbb{R}^d, y_i \in \mathbb{K}\}$ with:\\ 156 | $\mathbb{K} = \{1,...,K\}$ for classification, where $\mathbb{K}$ is an index set for the classes;\\ 157 | $\mathbb{K} = [0, 1]^K$, the space of assignments, for probabilistic classification;\\ 158 | $\mathbb{K} \subset \mathbb{R}$ for regression.\medskip 159 | 160 | \textbf{Problem}: The data are noise contaminated, e.g.\ the response variable $y = f(x,w) + \eta$ depends on the function with parameters $w$ and Gaussian white noise $\eta$.\\ 161 | \textbf{Question}: How can we infer a functional relationship $f(x, w)$ from data which are described by the statistical relationship $P(X = x, Y = y)$?\\ 162 | Statistical learning theory provides the \textbf{answer}: define a function class $\mathcal{C} = \{f(x, w): w \in W, x \in \mathbb{R}^d\}$ where $w$ indexes the functions (hypotheses) in class $\mathcal{C}$. It turns out that the ``complexity'' of the function class $\mathcal{C}$ is the essential concept for describing the difficulty of learning. If we have too few data and we work with too complex a function class, then learning algorithms have a strong tendency to overfit, i.e.\ to interpret noise as signal. 163 | 164 | \newpage 165 | 166 | \section{The Dilemma of Learning} 167 | 168 | What should we do about overfitting? 169 | \begin{itemize} 170 | \item Minimize \textbf{expected} classification error. 171 | \item Maximize generalisation. 172 | \end{itemize} 173 | 174 | What can we do about overfitting? 175 | \begin{itemize} 176 | \item Minimize \textbf{empirical} classification error. 177 | \item Maximize estimated empirical generalisation performance by cross-validation. 178 | \end{itemize} 179 | 180 | We search for a function $f(x) \in \mathcal{C}$ out of the hypothesis class/solution space $\mathcal{C}$ such that: 181 | \begin{center} 182 | $f : X \rightarrow Y$\\ 183 | $x \rightarrow y = f(x)$ 184 | \end{center} 185 | Often we index the function $f(x) = f_\theta(x)$ by a parameter $\theta$.\\ 186 | 187 | 188 | \subsection{Generalization} 189 | Given an input space $\mathcal{X}$ with data $x \in \mathcal{X}$, find an interpretation $c \in \mathcal{C}$. 190 | 191 | $$x \;\text{is a random variable and} \; c \; \text{ is calculated from} \; x \text{ by a procedure}\; \mathcal{A} \implies c \; \text{is a random variable}$$ 192 | 193 | Thus, we can search for a posterior distribution $P(c|x)$.\\ 194 | One special choice is the Gibbs Distribution: $P(c | x) = \dfrac{\exp{(-\beta \mathcal{R}(c, x))}}{\sum_{c' \in \mathcal{C}}\exp{(-\beta \mathcal{R}(c', x))}} $ 195 | 196 | If we know the cost function $\mathcal{R}$, this distribution tells us, for a given parameter $\beta$, how solutions should be drawn. The problem with this special choice is that we now need to model this $\mathcal{R}$ and, furthermore, it has to be \textbf{validated}.\\\\
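A minimal numerical sketch of this distribution (ours; the four cost values below are made up) shows how $\beta$ interpolates between indifference and hard minimization:
\begin{verbatim}
import numpy as np

costs = np.array([1.0, 1.5, 3.0, 8.0])     # R(c, x) for four candidate solutions
for beta in [0.1, 1.0, 10.0]:
    w = np.exp(-beta * costs)
    print(beta, np.round(w / w.sum(), 3))  # Gibbs posterior P(c | x)
# beta -> 0 approaches the uniform distribution over C;
# beta -> infinity concentrates on the minimum-cost solution
\end{verbatim}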
197 | Assume that two datasets $\mathcal{X}',\mathcal{X}'' \sim p(x',x'') = p(x')p(x'')$ are given. We can consider the expected risk to assess how well our choice of $\mathcal{R}$ generalizes: 198 | 199 | $$\mathbb{E}_{\mathcal{X}' \mathcal{X}''}\Big[ \sum_{c \in \mathcal{C}} p(c|x')\mathcal{R}(c,x'') \Big] = \sum_{x'}\sum_{x''} p(x'') \sum_{c \in \mathcal{C}} \underbrace{p(c|x')p(x')}_{p(c,x')}\mathcal{R}(c,x'')= \sum_{c \in \mathcal{C}} p(c) \, \mathbb{E}_{\mathcal{X}''}\big[ \mathcal{R}(c,x'') \big]$$ 200 | 201 | One might argue that minimizing the expected risk is not the optimal inference principle for validating $\mathcal{R}$. The problem is that $\mathcal{R}$ is particularly large far away from the minimum, and that is exactly where we expect $p(c)$ to be very small, because solutions with a high cost on the test data should presumably also be rare on the training data. Thus, the situation is unstable: we multiply a very large cost value with a very small probability ($0 \cdot \infty$). From an information-theory point of view, score-maximization is a better principle; however, risk-minimization leads to convex problems that are computationally more feasible. 202 | 203 | \subsection{Quality of the estimate} 204 | 205 | The \textbf{loss function} $L$ measures the deviation between dependent variables $y$ and prediction $f(x)$: 206 | \begin{equation*} 207 | L(f(x),y) = 208 | \begin{cases} 209 | (y - f(x))^2 & \text{quadratic loss (regression)}\\ 210 | \mathds{1}_{y \neq f(x)} & \text{0-1 loss (classification)}\\ 211 | \exp{(-\beta y f(x))} & \text{exponential loss (classification)} 212 | \end{cases} 213 | \end{equation*} 214 | 215 | \textbf{Conditional expected risk}:\\ 216 | Given the random variable $X$ the conditional expected risk is defined as: 217 | \begin{center} 218 | $R(f,x) = \int_y L(f(x), y)\,P(y \,|\, x)\, dy$ 219 | \end{center} 220 | \textbf{Expected true risk}: 221 | \begin{center} 222 | $R_{true}(f) = \underset{p}{\mathbb{E}}[L(f(x), y)] = \int_x\int_y L(f(x), y)P(x, y)\, dx\, dy$ 223 | \end{center} 224 | where $p$ is the true distribution over the inputs $x$ and $y$. The risk measures how much error we have, on average, using $f$ as our prediction algorithm. 225 | \newpage 226 | This can be clarified by considering an example. Suppose we want to fit a function for predicting if it will rain or not. The input $x$ will be the sky: CLEAR, CLOUDY or MIXED.\\ 227 | The output $y$ will be either RAIN or NOPE. The loss function is now a function $L : \{\text{RAIN, NOPE}\}^2 \rightarrow \mathbb{R}$.\\ 228 | Which loss function is appropriate? It depends on the priorities of the user.
For example, we may have the following $L$: 229 | \begin{center} 230 | \begin{tabular}{ |c|c|c| } 231 | \hline 232 | prediction\textbackslash truth & RAIN & NOPE \\ 233 | RAIN & 0 & 1 \\ 234 | NOPE & 25 & 0 \\ 235 | \hline 236 | \end{tabular} 237 | \end{center} 238 | That is, predicting NOPE when it rains is 25 times worse than predicting RAIN when it does not rain.\\ 239 | Now suppose the distribution $p$ is given as follows: 240 | \begin{center} 241 | \begin{tabular}{ |c|c|c| } 242 | \hline 243 | $X$\textbackslash $Y$ & RAIN & NOPE \\ 244 | CLEAR & 0 & 1/4 \\ 245 | CLOUDY & 1/4 & 0 \\ 246 | MIXED & 1/6 & 1/3 \\ 247 | \hline 248 | \end{tabular} 249 | \end{center} 250 | Let's consider two possible predictors: 251 | \begin{equation*} 252 | f_1(x) = \begin{cases} 253 | \text{CLEAR} \mapsto \text{NOPE} \\ 254 | \text{CLOUDY} \mapsto \text{RAIN} \\ 255 | \text{MIXED} \mapsto \text{NOPE} 256 | \end{cases} 257 | \end{equation*} 258 | \begin{equation*} 259 | f_2(x) = \begin{cases} 260 | \text{CLEAR} \mapsto \text{NOPE} \\ 261 | \text{CLOUDY} \mapsto \text{RAIN} \\ 262 | \text{MIXED} \mapsto \text{RAIN} 263 | \end{cases} 264 | \end{equation*} 265 | If we use this $L$, it is easy to calculate $R_{true}(f_1) = \frac{1}{6}\cdot25 \approx 4.17$ and $R_{true}(f_2) = \frac{1}{3}\cdot1 \approx 0.33$, so $f_2$ has the lower risk.\\ 266 | So it sounds like the thing to do is to pick $f$ to minimize the true risk. The trouble is that this is impossible: 267 | to calculate the risk, we would need to know the true distribution $p$. If we knew it, we would not be doing machine learning.\\ 268 | Since the data comes from $p$, we should be able to get a reasonable approximation: 269 | \begin{center} 270 | $\underset{p}{\mathbb{E}}[L(f(x), y)] \approx \hat{\mathbb{E}}[L(f(x), y)]$ 271 | \end{center} 272 | The right-hand side of the equation is called the \textbf{empirical risk}: 273 | \begin{equation*} 274 | \hat{R}(f) = \hat{\mathbb{E}}[L(f(x), y)] = \frac{1}{n}\sum\limits_{i = 1}^{n}L(f(x_i), y_i) 275 | \end{equation*} 276 | Picking the function $f^*$ that minimizes it is known as \textbf{Empirical Risk Minimization}: 277 | \begin{equation*} 278 | f^* = \argmin_{f \in C} \hat{R}(f) 279 | \end{equation*}
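To make the rain example concrete, here is a short sketch (ours) that evaluates both the true risk, using the table for $p$, and the empirical risk from samples drawn from $p$; with $n = 1000$ the two already agree closely for this tiny function class:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
p = np.array([[0, 1/4], [1/4, 0], [1/6, 1/3]])  # p(x,y); rows CLEAR/CLOUDY/MIXED
L = np.array([[0, 1], [25, 0]])                 # L[prediction][truth]; 0=RAIN, 1=NOPE
f1, f2 = [1, 0, 1], [1, 0, 0]                   # the two predictors, per sky state

def true_risk(f):
    return sum(p[x, y] * L[f[x], y] for x in range(3) for y in range(2))

def emp_risk(f, n=1000):
    idx = rng.choice(6, size=n, p=p.ravel())    # draw (x, y) pairs from p
    xs, ys = np.unravel_index(idx, p.shape)
    return np.mean([L[f[x], y] for x, y in zip(xs, ys)])

print(true_risk(f1), true_risk(f2))             # ~4.17 vs ~0.33
print(emp_risk(f1), emp_risk(f2))               # close to the values above
\end{verbatim}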
280 | Our hope is that empirical risk minimization performs similarly to true risk minimization, i.e.\ that: 281 | \begin{equation} 282 | \argmin_{f \in C} \hat{R}(f) \approx \argmin_{f \in C} R_{true}(f) 283 | \end{equation} 284 | \newpage 285 | How well Eq. 2.1 holds in practice depends on four factors: 286 | \begin{itemize} 287 | \item How much data we have. For any given function $f$, as we get more and more data we can expect that $\hat{R}(f) \to R_{true}(f)$. 288 | \item The true distribution $p$. Depending on how ``complex'' the true distribution is, more or less data may be necessary to get a good approximation of it. 289 | \item The loss function $L$. If the loss function is very ``weird'' - giving extremely high loss in certain unlikely situations - this can lead to trouble. 290 | \item The class of functions $C$. Roughly speaking, if the size of $C$ is ``large'', and the functions in $C$ are ``complex'', this worsens the approximation, all else being equal. 291 | \end{itemize} 292 | So why not use a small set of ``simple'' functions? It is true that this will lead to empirical risk minimization approximating true risk minimization. However, it also worsens the attainable minimum of the true risk: 293 | \begin{equation*} 294 | \min_{f \in C} R_{true}(f) 295 | \end{equation*} 296 | 297 | This phenomenon is called the \textbf{Bias-Variance Trade-off}.\\ 298 | When used in practice, it is usually necessary to perform some sort of model selection or regularization to make empirical risk minimization generalize well to new data.\medskip 299 | 300 | In practice, what usually happens is that the samples are split into training data and test data. Additional validation data is used to guide the estimator selection.\\ 301 | Test data cannot be used before the final estimator has been selected. 302 | \begin{equation*} 303 | \mathcal{Z}^{train} = \{(x_1, y_1),...,(x_n, y_n)\}\\ 304 | \end{equation*} 305 | \begin{equation*} 306 | \mathcal{Z}^{test} = \{(x_{n+1}, y_{n+1}),...,(x_{n+m}, y_{n+m})\} 307 | \end{equation*} 308 | We have the training error $\hat{R}(f, \mathcal{Z}^{train}) = \frac{1}{n}\sum\limits_{i = 1}^{n}L(f(x_i), y_i)$ and its minimizer $f^* = \argmin_{f \in C} \hat{R}(f, \mathcal{Z}^{train})$.\\ 309 | The test error amounts to $\hat{R}(f^*, \mathcal{Z}^{test}) = \frac{1}{m}\sum\limits_{i = n+1}^{m+n}L(f^*(x_i), y_i)$.\\ 310 | When we use test data for validation, estimator adaptation introduces statistical dependencies between the outcome of the learning process (the estimator) and the test data. This design flaw yields a too optimistic estimate of the test error.\\ 311 | Furthermore, it is important to \textbf{distinguish between test error and expected risk}: 312 | \begin{equation*} 313 | \hat{\mathcal{R}}(f^*, \mathcal{Z}^{test}) \neq \underset{x}{\mathbb{E}}[\mathcal{R}(f^*, X)] 314 | \end{equation*} 315 | 316 | Notice that $\underset{x}{\mathbb{E}}[\mathcal{R}(f^*, X)]$ is a random variable since $f^*$ is random. Moreover, this inequality holds because there is more randomness on the LHS than on the RHS ($f^*$ and $\mathcal{Z}^{test}$ vs $f^*$ alone). 317 | 318 | We can ask: what is the probability $P(|\hat{R}(f^*, \mathcal{Z}^{test})-\underset{x}{\mathbb{E}}[R(f^*, X)]|> \epsilon)$? If we succeed in bounding this probability close to 0, then we have an assurance against bad surprises. \\ 319 | The test error empirically estimates the expected risk. To assess the quality of the estimate we should report mean and variance or another measure of deviation.
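The optimism caused by reusing validation data for selection is easy to reproduce (our toy setting: every candidate ``estimator'' is pure noise with true accuracy $1/2$, so any apparent skill is purely a selection artifact):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
n_val, n_models = 50, 500
# validation accuracies of 500 noise models: Binomial(n_val, 1/2)/n_val draws
val_acc = rng.binomial(n_val, 0.5, size=n_models) / n_val
print(val_acc.max())                     # selected model's validation accuracy: ~0.64
print(rng.binomial(10000, 0.5) / 10000)  # the same model on fresh test data: ~0.50
\end{verbatim}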
320 | 321 | \newpage 322 | 323 | \section{Taxonomy of Data} 324 | 325 | Pattern analysis requires finding structures in sets of object representations.\\ 326 | We are given an object space $O$ and a measurement $X$ that maps an object set into a domain $\mathbb{K}$. Measurements provide information from reality to feed our modeling in more quantitative terms. 327 | \begin{equation*} 328 | X : O^{(1)} \times ... \times O^{(R)} \rightarrow \mathbb{K} 329 | \end{equation*} 330 | \begin{equation*} 331 | (o_1,...,o_R) \rightarrow X_{o_1,...,o_R} 332 | \end{equation*} 333 | \textbf{Examples:} 334 | \begin{itemize} 335 | \item Feature Vectors: $X : O \rightarrow \mathbb{R}^d$, $o \rightarrow X _o$ 336 | \item Classification Data: $X : O \rightarrow \mathbb{R}^d \times \{1,...,k\} $, $o \rightarrow (X _o, Y_o)$ 337 | \item Regression Data: $X : O \rightarrow \mathbb{R}^d \times \mathbb{R}$, $o \rightarrow (X _o, Y_o)$ 338 | \item Proximity Data: $X: O \times O \rightarrow \mathbb{R}$, $(o_1,o_2) \rightarrow X_{o_1,o_2}$ 339 | \end{itemize} 340 | \renewcommand{\theenumi}{\alph{enumi}} 341 | \begin{enumerate} 342 | \item \textbf{Monadic Data}: $X : O \rightarrow \mathbb{R}^d$, $o \rightarrow X _o$ 343 | \\Monadic data characterize configurations or objects without reference to other configurations, e.g.\ temperature and pressure are measured for each location in absolute terms. 344 | \item \textbf{Dyadic Data}: $X: O^{(1)} \times O^{(2)} \rightarrow \mathbb{R}$, $(o_1,o_2) \rightarrow X_{o_1,o_2}$ 345 | \item \textbf{Polyadic Data:} $X: O^{(1)} \times O^{(2)} \times O^{(3)} \rightarrow \mathbb{R}$, $(o_1,o_2,o_3) \rightarrow X_{o_1,o_2,o_3}$ 346 | \end{enumerate} 347 | Choosing the correct representation of objects is important because this choice is reflected in the algorithm itself; otherwise the model has to compensate for a wrong representation and adjust itself to the wrong constraints. 348 | \subsection{Scales} 349 | \begin{enumerate} 350 | \item \textbf{Nominal or Categorical Scale}: qualitative, without quantitative measurements, e.g.\ the \textit{binary scale} $X = \{0,1\}$. 351 | \item \textbf{Ordinal Scale}: measurement values are meaningful only with respect to other measurements, i.e.\ the rank order carries the information, not the numerical differences. 352 | \item \textbf{Quantitative Scale}: 353 | \renewcommand{\theenumii}{\roman{enumii}} 354 | \begin{enumerate} 355 | \item \textbf{Interval Scale}: the relation of numerical differences carries the information. Invariance w.r.t.\ translation and scaling (e.g.\ the Fahrenheit scale). 356 | \item \textbf{Ratio Scale}: the zero value of the scale carries the information, but not the measurement unit (e.g.\ the Kelvin scale). 357 | \item \textbf{Absolute Scale}: absolute values are meaningful (e.g.\ grades). 358 | \end{enumerate} 359 | \end{enumerate} 360 | \begin{center} 361 | \begin{tabular}{ |c|c| } 362 | \hline 363 | Scale Type & Transformation invariances \\ \hline 364 | Nominal & $T = \{f : \mathbb{R} \rightarrow \mathbb{R} \:| f \: \text{bijective}\}$ \\ 365 | Ordinal & $T = \{f : \mathbb{R} \rightarrow \mathbb{R} \:| f(x_1) < f(x_2), \forall x_1 < x_2\}$ \\ 366 | Interval & $T = \{f : \mathbb{R} \rightarrow \mathbb{R} \:| f(x) = ax + c, a \in \mathbb{R}^+, c \in \mathbb{R}\}$ \\ 367 | Ratio & $T = \{f : \mathbb{R} \rightarrow \mathbb{R} \:| f(x) = ax, a \in \mathbb{R}^+\}$ \\ 368 | Absolute & $T = \{f : \mathbb{R} \rightarrow \mathbb{R} \:| f \: \text{is the identity map}\}$ \\ 369 | \hline 370 | \end{tabular} 371 | \end{center} 372 | The cost function has to obey these invariances, and if they are known in advance we can search for a better learning procedure.
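A small sketch (ours) of the invariances in the table: on an ordinal scale, statistics built from ranks or order statistics survive any strictly increasing transformation $f$, while the mean does not.
\begin{verbatim}
import numpy as np

x = np.array([1.0, 2.0, 5.0, 9.0, 12.0])      # measurements
fx = np.exp(x)                                # strictly increasing transform f
print(np.argsort(x), np.argsort(fx))          # same rank order: ordinal info survives
print(np.median(fx) == np.exp(np.median(x)))  # True: order statistics commute with f
print(np.mean(fx) == np.exp(np.mean(x)))      # False: the mean does not
\end{verbatim}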
373 | \section{Mathematical Spaces} 374 | \begin{definition} \textbf{Topological Space} \\ 375 | Let $X$ be a non-empty set and $\Im$ a collection of subsets of $X$ such that: 376 | \renewcommand{\theenumi}{\Roman{enumi}} 377 | \begin{enumerate} 378 | \item $X \in \Im$; 379 | \item $\emptyset \in \Im$; 380 | \item If $O_1,...,O_n \in \Im$, then $O_1 \cap... \cap O_n \in \Im$; 381 | \item If for each $\alpha \in I$, $O_\alpha \in \Im$, then $\cup_{\alpha \in I} O_\alpha \in \Im$; 382 | \end{enumerate} 383 | The pair of objects $(X, \Im)$ is called a topological space. 384 | \end{definition} 385 | Topological spaces only describe the closeness/neighborhood of objects; they do not model any quantitative differences (distances) between the ``degrees of closeness''. The concept of a topological space is one of the most fruitful concepts of modern mathematics. It is the proper setting for discussions based on considerations of continuity.\\ 386 | Topological spaces allow us to introduce the concept of a neighborhood and to define \textit{neighborhood spaces} in a natural way. 387 | \begin{definition} \textbf{Metric Space} 388 | A pair of objects $(X, d)$ consisting of a non-empty set $X$ and a function $d: X \times X \rightarrow \mathbb{R}$ is called a metric space provided that: 389 | \renewcommand{\theenumi}{\Roman{enumi}} 390 | \begin{enumerate} 391 | \item Positivity: $d(x,y) \geq 0, \: \forall x, y \in X$; 392 | \item Uniqueness: $d(x,y) = 0 \Leftrightarrow x = y$, \: $\forall x,y \in X$; 393 | \item Symmetry: $d(x,y) = d(y,x), \: \forall x, y \in X$; 394 | \item $\Delta$ inequality: $d(x,z) \leq d(x,y) + d(y,z), \: \forall x, y, z \in X$; 395 | \end{enumerate} 396 | The function $d$ is called a distance function or metric on $X$ and the set $X$ is called the underlying set. 397 | \end{definition} 398 | \textbf{Example:} $(\mathbb{R}, d)$ is a metric space, where $d$ is the function defined by $d(a,b) = |a-b|, \forall a,b \in \mathbb{R}$. 399 | \begin{definition} \textbf{Scalar Product} 400 | Let $X$ be a non-empty set and $\mathcal{V} = (X,+,\cdot)$ a vector space. A function $\phi : X \times X \rightarrow \mathbb{R}$ which assigns a real number to each pair of vectors $\boldsymbol{x}, \boldsymbol{y} \in \mathcal{V}$ is called a \textbf{scalar product} on $\mathcal{V}$ if the following properties hold: 401 | \renewcommand{\theenumi}{\Roman{enumi}} 402 | \begin{enumerate} 403 | \item Distributivity: $\phi(\boldsymbol{x_1} + \boldsymbol{x_2}, \boldsymbol{y}) = \phi(\boldsymbol{x_1}, \boldsymbol{y}) + \phi(\boldsymbol{x_2}, \boldsymbol{y})$; 404 | \item Commutativity: $\phi(\boldsymbol{x}, \boldsymbol{y}) = \phi(\boldsymbol{y}, \boldsymbol{x})$; 405 | \item Homogeneity: $\phi(\alpha\boldsymbol{x}, \boldsymbol{y}) = \alpha\phi(\boldsymbol{x}, \boldsymbol{y}), \: \forall \alpha \in \mathbb{R}$; 406 | \item Positive definiteness: $\phi(\boldsymbol{x}, \boldsymbol{x}) > 0, \: \forall \boldsymbol{x} \neq 0$; 407 | \end{enumerate} 408 | \end{definition} 409 | A vector space with such a scalar product is called a \textbf{Euclidean vector space}. The scalar product defines the norm $\norm{\boldsymbol{x}} \triangleq \sqrt{\phi(\boldsymbol{x},\boldsymbol{x})}$.\medskip 410 | 411 | Every metric space is in particular a topological space: the metric induces a topology, and topological spaces arising this way are called metrizable.\\ 412 | From a machine learning point of view, we have to answer the question of how precisely we can actually gather metric, rather than merely topological, information in an application scenario. Such an analysis then suggests the appropriate space to model the structures in the data.
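The axioms are easy to probe numerically (our sketch): the absolute difference from the example above satisfies the $\Delta$ inequality, while the squared difference $d(x,y) = (x-y)^2$, despite satisfying positivity, uniqueness and symmetry, violates it and is therefore not a metric.
\begin{verbatim}
d  = lambda x, y: abs(x - y)       # the metric from the example above
d2 = lambda x, y: (x - y) ** 2     # squared difference: violates axiom IV
x, y, z = 0.0, 1.0, 2.0
print(d(x, z)  <= d(x, y)  + d(y, z))   # True:  2 <= 1 + 1
print(d2(x, z) <= d2(x, y) + d2(y, z))  # False: 4 >  1 + 1
\end{verbatim}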
413 | \subsection{Probability Spaces} 414 | \begin{definition} \textbf{Elementary event}\\ 415 | $\omega_1,...,\omega_n$ are sample points 416 | \end{definition} 417 | \begin{definition} \textbf{Sample Space}\\ 418 | $\Omega = \{\omega_1,...,\omega_n\}$\\ 419 | The sample space $\Omega$ is the set of possible outcomes of an experiment. Points $\omega$ in $\Omega$ are called sample outcomes, realizations, or elements. Subsets of $\Omega$ are called events. 420 | \end{definition} 421 | \textbf{Example}: If we toss a coin twice then $\Omega = \text{\{HH,HT,TH,TT\}}$. The event that the first toss is heads is $A = \text{\{HH,HT\}}$. 422 | \begin{definition} \textbf{Family of Sets}\\ 423 | An event $A$ of an experiment is a set of elementary events such that: 424 | \begin{itemize} 425 | \item $A \subset \Omega$ 426 | \item for every $\omega \in \Omega$, either $\omega \in A$ or $\omega \not\in A$ 427 | \end{itemize} 428 | \end{definition} 429 | \begin{definition} \textbf{Algebra of events}\\ 430 | Let $A, B$ be events. $\mathcal{A}$ is an algebra of events, i.e., a set of subsets $A \subset \Omega$, for which it holds that: 431 | \begin{itemize} 432 | \item $\Omega \in \mathcal{A}$ 433 | \item if $A \in \mathcal{A} \wedge B \in \mathcal{A}$, then $A \cup B \in \mathcal{A} \wedge A \setminus B \in \mathcal{A}$ 434 | \end{itemize} 435 | \end{definition} 436 | \begin{definition} \textbf{Probability of events}\\ 437 | A function $\mathbb{P}$ that assigns a real number $\mathbb{P}(A)$ to each event $A$ is a \textbf{probability distribution} or a \textbf{probability measure} if it satisfies the following three axioms: 438 | \begin{itemize} 439 | \item \textbf{Axiom 1}: $\mathbb{P}(A) \geq 0$ for every $A$ 440 | \item \textbf{Axiom 2}: $\mathbb{P}(\Omega) = 1$ 441 | \item \textbf{Axiom 3}: If $A_1, A_2,...$ are disjoint then: 442 | \begin{equation*} 443 | \mathbb{P}(\bigcup\limits_{i=1}^{\infty} A_i) = \sum\limits_{i=1}^{\infty} \mathbb{P}(A_i) 444 | \end{equation*} 445 | \end{itemize} 446 | \end{definition} 447 | \begin{definition} \textbf{Probability model}\\ 448 | A probability model or a probability space is a triple 449 | \begin{equation*} 450 | (\Omega, \mathcal{A}, \mathbb{P}) 451 | \end{equation*} 452 | with the sample set $\Omega = \{\omega_1,...,\omega_n\}$, the event algebra $\mathcal{A}$, and the probabilities $\mathcal{P} = \{\mathbb{P}(A) \,|\, A \in \mathcal{A}\}$. 453 | \end{definition} 454 | \end{document} 455 | -------------------------------------------------------------------------------- /L3_Density_Estimation.tex: -------------------------------------------------------------------------------- 1 | \documentclass[twoside]{article} 2 | \setlength{\oddsidemargin}{0.25 in} 3 | \setlength{\evensidemargin}{-0.25 in} 4 | \setlength{\topmargin}{-0.6 in} 5 | \setlength{\textwidth}{6.5 in} 6 | \setlength{\textheight}{8.5 in} 7 | \setlength{\headsep}{0.75 in} 8 | \setlength{\parindent}{0 in} 9 | \setlength{\parskip}{0.1 in} 10 | \newcommand{\eqdef}{:\mathrel{\mathop=}} 11 | \newcommand{\norm}[1]{\left\lVert #1 \right\rVert} 12 | 13 | % 14 | % ADD PACKAGES here: 15 | % 16 | 17 | \usepackage{amsmath,amsfonts,graphicx,dsfont,amssymb, cool, cancel} 18 | % 19 | % The following commands set up the lecnum (lecture number) 20 | % counter and make various numbering schemes work relative 21 | % to the lecture number.
22 | % 23 | \newcounter{lecnum} 24 | \renewcommand{\thepage}{\thelecnum-\arabic{page}} 25 | \renewcommand{\thesection}{\thelecnum.\arabic{section}} 26 | \renewcommand{\theequation}{\thelecnum.\arabic{equation}} 27 | \renewcommand{\thefigure}{\thelecnum.\arabic{figure}} 28 | \renewcommand{\thetable}{\thelecnum.\arabic{table}} 29 | \newcommand{\indep}{\raisebox{0.05em}{\rotatebox[origin=c]{90}{$\models$}}} 30 | 31 | % 32 | % The following macro is used to generate the header. 33 | % 34 | \newcommand{\lecture}[4]{ 35 | \pagestyle{myheadings} 36 | \thispagestyle{plain} 37 | \newpage 38 | \setcounter{lecnum}{#1} 39 | \setcounter{page}{1} 40 | \noindent 41 | \begin{center} 42 | \framebox{ 43 | \vbox{\vspace{2mm} 44 | \hbox to 6.28in { {\bf Advanced Machine Learning 45 | \hfill Fall 2020} } 46 | \vspace{4mm} 47 | \hbox to 6.28in { {\Large \hfill Lecture #1: #2 \hfill} } 48 | \vspace{2mm} 49 | \hbox to 6.28in { {\it #3 \hfill #4} } 50 | \vspace{2mm}} 51 | } 52 | \end{center} 53 | \markboth{Lecture #1: #2}{Lecture #1: #2} 54 | 55 | {\bf Note}: {\it LaTeX template courtesy of UC Berkeley EECS dept.} 56 | 57 | {\bf Disclaimer}: {\it These notes are adapted from ETH's Advanced Machine Learning Course and the book All Of Statistics, Larry Wasserman, Springer.} 58 | \vspace*{4mm} 59 | } 60 | % 61 | % Convention for citations is authors' initials followed by the year. 62 | % For example, to cite a paper by Leighton and Maggs you would type 63 | % \cite{LM89}, and to cite a paper by Strassen you would type \cite{S69}. 64 | % (To avoid bibliography problems, for now we redefine the \cite command.) 65 | % Also commands that create a suitable format for the reference list. 66 | \renewcommand{\cite}[1]{[#1]} 67 | \def\beginrefs{\begin{list}% 68 | {[\arabic{equation}]}{\usecounter{equation} 69 | \setlength{\leftmargin}{2.0truecm}\setlength{\labelsep}{0.4truecm}% 70 | \setlength{\labelwidth}{1.6truecm}}} 71 | \def\endrefs{\end{list}} 72 | \def\bibentry#1{\item[\hbox{[#1]}]} 73 | 74 | %Use this command for a figure; it puts a figure in wherever you want it. 75 | %usage: \fig{NUMBER}{SPACE-IN-INCHES}{CAPTION} 76 | \newcommand{\fig}[3]{ 77 | \vspace{#2} 78 | \begin{center} 79 | Figure \thelecnum.#1:~#3 80 | \end{center} 81 | } 82 | % Use these for theorems, lemmas, proofs, etc. 83 | \newtheorem{theorem}{Theorem}[lecnum] 84 | \newtheorem{lemma}[theorem]{Lemma} 85 | \newtheorem{proposition}[theorem]{Proposition} 86 | \newtheorem{claim}[theorem]{Claim} 87 | \newtheorem{corollary}[theorem]{Corollary} 88 | \newtheorem{definition}[theorem]{Definition} 89 | \newtheorem{example}{Example}[lecnum] 90 | \newenvironment{proof}{{\bf Proof:}}{\hfill\rule{2mm}{2mm}} 91 | 92 | % **** IF YOU WANT TO DEFINE ADDITIONAL MACROS FOR YOURSELF, PUT THEM HERE: 93 | 94 | \DeclareMathOperator*{\argmax}{arg\,max} 95 | \DeclareMathOperator*{\argmin}{arg\,min} 96 | 97 | \begin{document} 98 | %FILL IN THE RIGHT INFO. 99 | %\lecture{**LECTURE-NUMBER**}{**DATE**}{**LECTURER**}{**SCRIBE**} 100 | \lecture{3}{Density Estimation}{}{} 101 | %\footnotetext{These notes are partially based on those of Nigel Mansell.} 102 | 103 | % **** YOUR NOTES GO HERE: 104 | 105 | % Some general latex examples and examples making use of the 106 | % macros follow. 107 | %**** IN GENERAL, BE BRIEF. LONG SCRIBE NOTES, NO MATTER HOW WELL WRITTEN, 108 | %**** ARE NEVER READ BY ANYBODY. 
109 | 110 | \section{Parametric Inference} 111 | 112 | We now turn our attention to parametric models, that is, models of the form: 113 | \begin{equation*} 114 | \mathfrak{F} = \{f(x; \theta) : \theta \in \Theta\} 115 | \end{equation*} 116 | where $\Theta \subset \mathbb{R}^k$ is the parameter space and $\theta = (\theta_1,...,\theta_k)$ is the parameter. The problem of inference then reduces to the problem of estimating the parameter $\theta$. \\ 117 | Often, we are only interested in some function $T(\theta)$. For example, if $X \sim \mathcal{N}(\mu, \sigma^2)$ then the parameter is $\theta = (\mu, \sigma)$. If our goal is to estimate $\mu$ then $\mu = T(\theta)$ is called the parameter of interest and $\sigma$ is called a nuisance parameter. 118 | 119 | \subsection{Maximum Likelihood} 120 | 121 | The most common method for estimating parameters in a parametric model is the maximum likelihood method. Let $X_1,...,X_n$ be IID with pdf $f(x; \theta)$. 122 | \begin{definition} 123 | The \textbf{likelihood function} is defined by: 124 | \begin{equation*} 125 | \mathcal{L}_n(\theta) = \prod\limits_{i = 1}^{n}f(x_i; \theta) 126 | \end{equation*} 127 | The \textbf{log-likelihood function} is defined by $\ell_n(\theta) = \log\mathcal{L}_n(\theta)$. 128 | \end{definition} 129 | The likelihood function is just the joint density of the data, except that we treat it as a function of the parameter $\theta$. Thus, $\mathcal{L}_n(\theta) : \Theta \to [0, \infty)$. The likelihood function is not a density function: in general, it is not true that $\mathcal{L}_n(\theta)$ integrates to 1 (with respect to $\theta$). 130 | \begin{definition} 131 | The \textbf{maximum likelihood estimator} (MLE), denoted by $\hat{\theta}_n$, is the value of $\theta$ that maximizes $\mathcal{L}_n(\theta)$. 132 | \end{definition} 133 | The maximum of $\ell_n(\theta)$ occurs at the same place as the maximum of $\mathcal{L}_n(\theta)$, so maximizing the log-likelihood leads to the same result as maximizing the likelihood. Often, it is easier to work with the log-likelihood. 134 | \begin{claim} 135 | If we multiply $\mathcal{L}_n(\theta)$ by any positive constant $c$ (not depending on $\theta$) then this will not change the MLE. Hence, we shall often drop constants in the likelihood function. 136 | \end{claim} 137 | \begin{example} 138 | Let $X_1,...,X_n \sim \mathcal{N}(\mu, \sigma^2)$. The parameter is $\theta = (\mu, \sigma)$ and the likelihood function (ignoring some constants) is: 139 | \begin{equation*} 140 | \begin{aligned} 141 | \mathcal{L}_n(\mu, \sigma) &= \prod\limits_{i}\frac{1}{\sigma}\exp{\{-\frac{1}{2\sigma^2}(X_i - \mu)^2\}}\\ 142 | &= \sigma^{-n}\exp{\{-\frac{1}{2\sigma^2}\sum\limits_i(X_i - \mu)^2\}}\\ 143 | &= \sigma^{-n}\exp{\{-\frac{nS^2}{2\sigma^2}\}}\exp{\{-\frac{n(\bar{X} - \mu)^2}{2\sigma^2}\}} 144 | \end{aligned} 145 | \end{equation*} 146 | where $\bar{X} = n^{-1}\sum\limits_i X_i$ is the sample mean and $S^2 = n^{-1}\sum\limits_i(X_i - \bar{X})^2$. The 147 | last equality above follows from the fact that $\sum\limits_i(X_i - \mu)^2 = nS^2 + n(\bar{X} - \mu)^2$, 148 | which can be verified by writing $\sum\limits_i(X_i - \mu)^2 = \sum\limits_i(X_i - \bar{X} + \bar{X} -\mu)^2$ and then expanding the square. The log-likelihood is: 149 | \begin{equation*} 150 | l(\mu, \sigma) = -n\log\sigma - \frac{nS^2}{2\sigma^2} - \frac{n(\bar{X} - \mu)^2}{2\sigma^2} 151 | \end{equation*} 152 | Solving the equations: 153 | \begin{equation*} 154 | \frac{\partial l(\mu, \sigma)}{\partial\mu} = 0 \hspace{10pt} \text{and} \hspace{10pt} \frac{\partial l(\mu, \sigma)}{\partial\sigma} = 0, 155 | \end{equation*} 156 | we conclude that $\hat{\mu} = \bar{X}$ and $\hat{\sigma} = S$. It can be verified that these are indeed global maxima of the likelihood. 157 | \end{example}
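A quick numerical check of the example (ours; the true parameters below are arbitrary):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(3.0, 2.0, size=100_000)          # data with mu = 3, sigma = 2
mu_hat = x.mean()                               # MLE: the sample mean
sigma_hat = np.sqrt(((x - mu_hat)**2).mean())   # MLE: S, with 1/n (not 1/(n-1))
print(mu_hat, sigma_hat)                        # ~3.0 and ~2.0
\end{verbatim}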
158 | \subsection{Properties of Maximum Likelihood Estimators} 159 | Under certain conditions on the model, the maximum likelihood estimator $\hat{\theta}_n$ possesses many properties that make it an appealing choice of estimator. The main properties of the MLE are: 160 | \begin{enumerate} 161 | \item The MLE is \textbf{consistent}: $\hat{\theta}_n \overset{P}{\to} \theta^*$ where $\theta^*$ denotes the true value of the parameter $\theta$; 162 | \item The MLE is \textbf{equivariant}: if $\hat{\theta}_n$ is the MLE of $\theta$ then $g(\hat{\theta}_n)$ is the MLE of $g(\theta)$; 163 | \item The MLE is \textbf{asymptotically Normal}: $(\hat{\theta}_n - \theta^*) / \hat{\text{se}} \rightsquigarrow \mathcal{N}(0, 1)$; also, the 164 | estimated standard error $\hat{\text{se}}$ can often be computed analytically; 165 | \item The MLE is \textbf{asymptotically optimal} or \textbf{efficient}: roughly, this means that among all well-behaved estimators, the MLE has the smallest variance, at least for large samples. That is, $\hat{\theta}_n$ minimizes $\mathbb{E}\big[ (\hat{\theta}_n -\theta^*)^2\big]$ as $n \to \infty$; 166 | \item The MLE is approximately the Bayes estimator. 167 | \end{enumerate} 168 | The properties we discuss only hold if the model satisfies certain regularity conditions. These are essentially smoothness conditions on $f(x; \theta)$; unless otherwise stated, we shall tacitly assume that these conditions hold.
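Property 3 is easy to visualize by simulation (our sketch, for the Gaussian mean with $\hat{\text{se}} = \hat{\sigma}/\sqrt{n}$): the standardized estimates behave like draws from $\mathcal{N}(0,1)$.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
mu, sigma, n, reps = 3.0, 2.0, 200, 5000
X = rng.normal(mu, sigma, size=(reps, n))
mu_hat = X.mean(axis=1)                 # MLE of mu on each replicated sample
se_hat = X.std(axis=1) / np.sqrt(n)     # estimated standard error
z = (mu_hat - mu) / se_hat
print(z.mean(), z.std())                # ~0 and ~1, consistent with N(0, 1)
\end{verbatim}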
169 | 170 | \subsection{Understanding Asymptotic Efficiency} 171 | The expected squared error is a measure of how good an estimator $\hat{\theta}$ is: 172 | $$\mathbb{E}\big[ (\hat{\theta}- \theta_0)^2\big]$$ 173 | 174 | The Rao-Cramer bound shows that there does not exist an estimator that achieves $\mathbb{E}\big[ (\hat{\theta}- \theta_0)^2\big] = 0$. 175 | 176 | \begin{theorem}For any estimator $\hat{\theta}$ of $\theta$ it holds that:\medskip 177 | 178 | $\mathbb{E}_{x|\theta}\big[ (\hat{\theta} - \theta)^2\big] \geq \dfrac{\big(\dfrac{\partial}{\partial \theta} b_{\hat{\theta}} + 1 \big)^2}{\mathbb{E}_{x|\theta}\big[ \Lambda^2\big]} + b_{\hat{\theta}}^2$\medskip 179 | 180 | Where: 181 | $$\Lambda = \dfrac{\partial}{\partial{\theta}} \log{p(x|\theta)}= \dfrac{1}{p(x|\theta)} \dfrac{\partial}{\partial{\theta}} p(x|\theta) \hspace{10pt} \text{and} \hspace{10pt} b_{\hat{\theta}} = \mathbb{E}_{x|\theta}[\hat{\theta}]-\theta$$ 182 | \end{theorem} 183 | 184 | \begin{proof} 185 | 186 | $$\mathbb{E}_{x|\theta}[\Lambda] = \int_{x} p(x|\theta) \Lambda \; dx= \int_{x} \dfrac{\partial}{\partial{\theta}} p(x|\theta) dx = \dfrac{\partial}{\partial{\theta}}\overbrace{\int_{x} p(x|\theta)dx}^{=1} = 0$$ (the exchange of differentiation and integration is justified by the regularity conditions) 187 | 188 | $$\mathbb{E}_{x|\theta}[\Lambda \hat{\theta}]= \int_{x} p(x|\theta) \Lambda \hat{\theta} \; dx = \int_{x} \dfrac{\partial}{\partial{\theta}} p(x|\theta) \hat{\theta} \; dx 189 | = \dfrac{\partial}{\partial{\theta}} \int_{x} p(x|\theta) \hat{\theta} \; dx = \dfrac{\partial}{\partial{\theta}} \mathbb{E}_{x|\theta}[\hat{\theta}] = \dfrac{\partial}{\partial{\theta}} (\mathbb{E}_{x|\theta}[\hat{\theta}]-\theta) + 1 =\dfrac{\partial}{\partial{\theta}} b_{\hat{\theta}}+1 $$ 190 | 191 | Consider the covariance between $\Lambda$ and $\hat{\theta}$: 192 | $$\Big(\mathbb{E}_{x|\theta}\big[ (\Lambda - \overbrace{\mathbb{E}_{x|\theta}[\Lambda]}^{=0}) (\hat{\theta} - \mathbb{E}_{x|\theta}[\hat{\theta}])\big] \Big)^2 = \Big(\mathbb{E}_{x|\theta}[\Lambda \hat{\theta}] - \mathbb{E}_{x|\theta}\big[ \Lambda \mathbb{E}_{x|\theta}[\hat{\theta}] 193 | \big] \Big)^2 = \Big(\mathbb{E}_{x|\theta}[\Lambda \hat{\theta}] - \overbrace{\mathbb{E}_{x|\theta}[\Lambda]}^{=0} \mathbb{E}_{x|\theta}[\hat{\theta} 194 | ] \Big)^2 = \Big(\mathbb{E}_{x|\theta}[\Lambda \hat{\theta}] \Big)^2$$ 195 | 196 | Now, let's consider the Cauchy-Schwarz inequality, i.e.
$\big(\mathbb{E}[xy]\big)^2 \leq \mathbb{E}[x^2]\mathbb{E}[y^2]$, applied to the cross-correlation: 197 | $$\Big(\mathbb{E}_{x|\theta}\big[ (\Lambda - \overbrace{\mathbb{E}_{x|\theta}[\Lambda]}^{=0}) (\hat{\theta} - \mathbb{E}_{x|\theta}[\hat{\theta}])\big] \Big)^2 \leq \mathbb{E}_{x|\theta}[\Lambda^2]\; \mathbb{E}_{x|\theta}\big[(\hat{\theta} - \mathbb{E}_{x|\theta}[\hat{\theta}])^2\big] = \mathbb{E}_{x|\theta}[\Lambda^2]\; \mathbb{E}_{x|\theta}\big[\big((\hat{\theta} - \theta) -(\mathbb{E}_{x|\theta}[\hat{\theta}]-\theta)\big)^2\big] $$ 198 | $$ = \mathbb{E}_{x|\theta}[\Lambda^2]\; \mathbb{E}_{x|\theta}\big[(\hat{\theta} - \theta)^2 + (\mathbb{E}_{x|\theta}[\hat{\theta}]-\theta)^2 -2 (\hat{\theta}-\theta)(\mathbb{E}_{x|\theta}[\hat{\theta}]-\theta)\big]$$ $$ = \mathbb{E}_{x|\theta}[\Lambda^2]\; \big \{\mathbb{E}_{x|\theta}\big[(\hat{\theta} - \theta)^2 \big] + \overbrace{\mathbb{E}_{x|\theta} \big[(\mathbb{E}_{x|\theta}[\hat{\theta}]-\theta)^2 -2 (\hat{\theta}-\theta)(\mathbb{E}_{x|\theta}[\hat{\theta}]-\theta)\big]}^{-b_{\hat{\theta}}^2} \big \} = \mathbb{E}_{x|\theta}[\Lambda^2]\; \big\{\mathbb{E}_{x|\theta}\big[(\hat{\theta} - \theta)^2\big]- b_{\hat{\theta}}^2 \big\}$$ 199 | 200 | It is easy to verify that $\mathbb{E}_{x|\theta} \big[(\mathbb{E}_{x|\theta}[\hat{\theta}]-\theta)^2 -2 (\hat{\theta}-\theta)(\mathbb{E}_{x|\theta}[\hat{\theta}]-\theta)\big] = -b_{\hat{\theta}}^2$: 201 | 202 | $$\mathbb{E}_{x|\theta}\Big[ \mathbb{E}_{x|\theta}^2[\hat{\theta}] + \theta^2 \cancel{-2 \theta \mathbb{E}_{x|\theta}[\hat{\theta}]} -2 \hat{\theta} \mathbb{E}_{x|\theta}[\hat{\theta}] +2\hat{\theta}\theta + \cancel{2 \theta \mathbb{E}_{x|\theta}[\hat{\theta}]} - 2 \theta^2 203 | \Big] $$ $$=\mathbb{E}^2_{x|\theta}[\hat{\theta}] + \theta^2 -2 \mathbb{E}^2_{x|\theta}[\hat{\theta}] +2 \theta \mathbb{E}_{x|\theta}[\hat{\theta}] -2 \theta^2$$$$ = - \mathbb{E}^2_{x|\theta}[\hat{\theta}] - \theta^2 +2 \theta \mathbb{E}_{x|\theta}[\hat{\theta}] = - \big( \mathbb{E}_{x|\theta}[\hat{\theta}] - \theta \big )^2 = - b_{\hat{\theta}}^2 $$ 204 | (here $\theta$ is a constant under $\mathbb{E}_{x|\theta}$, so $\mathbb{E}_{x|\theta}[\theta^2] = \theta^2$). 205 | Finally, from the inequality proved earlier we know that: 206 | 207 | $$\big(\mathbb{E}_{x|\theta}[\Lambda \hat{\theta}]\big)^2 = \big( \dfrac{\partial}{\partial{\theta}} b_{\hat{\theta}}+1\big)^2 \leq \mathbb{E}_{x|\theta}[\Lambda^2]\; \big\{\mathbb{E}_{x|\theta}\big[(\hat{\theta} - \theta)^2\big] - b_{\hat{\theta}}^2\big\}$$ 208 | It follows that: 209 | $$\mathbb{E}_{x|\theta}\big[ (\hat{\theta} - \theta)^2\big] \geq \dfrac{\big(\dfrac{\partial}{\partial \theta} b_{\hat{\theta}} + 1 \big)^2}{\mathbb{E}_{x|\theta}\big[ \Lambda^2\big]} + b_{\hat{\theta}}^2$$ 210 | \end{proof} 211 | 212 | \subsection{Stein Estimator} 213 | For finite samples, the maximum-likelihood estimator is not necessarily efficient.\\ Consider a multivariate random variable with distribution $\mathcal{N}(\theta_0,\sigma^2I)$ with range $\mathbb{R}^d$ and $d \geq 3$. 214 | If we sample a single point $y$ from this distribution then the Stein Estimator is: 215 | 216 | $$\hat{\theta}_{JS} := \Big(1 - \dfrac{(d-2)\sigma^2}{||y||^2} \Big) y$$ 217 | 218 | It is possible to prove that the Stein Estimator is better than the maximum-likelihood estimator for any $\theta_0$. That is: 219 | $$ 220 | \mathbb{E}\Big[ (\hat{\theta}_{JS}- \theta_0)^2\Big] \leq \mathbb{E}\Big[ (\hat{\theta}_{ML}- \theta_0)^2\Big] \; \; \text{for any} \; \theta_0 221 | $$ 222 | Moreover, for $d \geq 3$ the inequality is in fact strict for every $\theta_0$.
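The dominance is easy to observe by simulation (our sketch; the particular $\theta_0$ is arbitrary, and $\hat{\theta}_{ML} = y$ for a single observation):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
d, sigma, trials = 10, 1.0, 20000
theta0 = rng.normal(0.0, 1.0, size=d)            # an arbitrary true mean, d >= 3
Y = rng.normal(theta0, sigma, size=(trials, d))  # one observation y per trial
shrink = 1 - (d - 2) * sigma**2 / (Y**2).sum(axis=1, keepdims=True)
print(((Y - theta0)**2).sum(axis=1).mean())           # MLE risk: ~ d = 10
print(((shrink * Y - theta0)**2).sum(axis=1).mean())  # Stein risk: visibly smaller
\end{verbatim}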
223 | 224 | \section{Bayesian Learning} 225 | Bayesian inference is usually carried out in the following way: 226 | 227 | \begin{itemize} 228 | \item $\theta$ is considered to be a \textbf{random variable} with distribution $p(\theta|\mathcal{X})$. 229 | \item $X \sim p(x)$ and $p(x)$ is unknown. 230 | \item $p(x|\theta)$ is a statistical model that reflects our beliefs about $x$ given $\theta$. 231 | \end{itemize} 232 | 233 | We are looking for $p(X=x|\mathcal{X})$, i.e., the probability of $x$ given the sample set $\mathcal{X}$ (class conditional density): 234 | 235 | $$p(X=x|\mathcal{X}) = \int \underbrace{p(x,\theta|\mathcal{X})}_{p(x|\theta,\mathcal{X})p(\theta|\mathcal{X})}d\theta = 236 | \int p(x|\theta,\mathcal{X})p(\theta|\mathcal{X})d\theta = \int p(x|\theta)p(\theta|\mathcal{X})d\theta $$ 237 | 238 | where $p(x|\theta,\mathcal{X})= p(x|\theta)$: given $\theta$, the new observation $x$ is independent of the sample $\mathcal{X}$, since the $x_i \in \mathcal{X}$ and $x$ are i.i.d. draws from $p(x|\theta)$.\\\\ 239 | Moreover, asymptotically it holds that $p(\theta| \mathcal{X}) \to \delta(\theta- \hat{\theta})$; intuitively, this follows from the fact that $\hat{\theta} \overset{p}{\to} \theta_{true}$. Thus, in the asymptotic case, we can approximate the integral with: 240 | 241 | $$p(X=x|\mathcal{X}) = \int p(x|\theta)p(\theta|\mathcal{X})d\theta \approx 242 | \int p(x|\theta) \delta(\theta- \hat{\theta})d\theta = p(x|\hat{\theta}) $$ 243 | 244 | This approximation was used in the early days of Bayesian inference, when it was not possible to evaluate the integral. 245 | \newpage 246 | \subsection{Bayesian Learning of a Normal Distribution} 247 | Let us begin with a simple example in which we consider a single Gaussian random variable $x$. We shall suppose that the variance $\sigma^2$ is known, and we consider the task of inferring the mean $\mu$ given a set of $N$ observations: 248 | \begin{itemize} 249 | \item The likelihood is $p(x|\mu)= \mathcal{N}(\mu,\sigma^2)$ 250 | \item The prior is $p(\mu) = \mathcal{N}(\mu_0,\sigma_0^2)$ 251 | \item The data is $\mathcal{X} = \{x_1,\ldots,x_n\}$ 252 | \end{itemize} 253 | 254 | We want to compute the posterior distribution $p(\mu|\mathcal{X})$: 255 | $$p(\mu|\mathcal{X}) \propto p(\mathcal{X}|\mu)p(\mu) \implies p(\mu|\mathcal{X}) = \alpha \; p(\mathcal{X}|\mu)p(\mu) 256 | = \alpha \cdot \prod_{i \leq n} \Big\{ \dfrac{1}{\sqrt{2\pi} \sigma} \exp{\Big(-\dfrac{1}{2} \big(\dfrac{x_i-\mu}{\sigma}\big)^2 \Big)}\Big\} \cdot \dfrac{1}{\sqrt{2\pi} \sigma_0} \exp{\Big(-\dfrac{1}{2} \big(\dfrac{\mu-\mu_0}{\sigma_0}\big)^2 \Big)}$$ 257 | $$=\alpha' \cdot \prod_{i \leq n} \Big\{ \exp{\Big(-\dfrac{1}{2} \big(\dfrac{x_i-\mu}{\sigma}\big)^2 \Big)}\Big\} \cdot \exp{\Big(-\dfrac{1}{2} \big(\dfrac{\mu-\mu_0}{\sigma_0}\big)^2 \Big)} = \alpha' \cdot \exp{\Big\{-\dfrac{1}{2} \Big( \sum_{i \leq n} \Big(\dfrac{x_i-\mu}{\sigma}\Big)^2 + \Big(\dfrac{\mu-\mu_0}{\sigma_0}\Big)^2 \Big)\Big\}}$$ 258 | Expanding the squares we get: 259 | $$p(\mu|\mathcal{X})= \alpha'\cdot \exp{\Big\{-\dfrac{1}{2}\Big( \mu^2 \overbrace{\big( \dfrac{n}{\sigma^2} + \dfrac{1}{\sigma_0^2}\big)}^{a} -2\mu \overbrace{\big(\dfrac{\mu_0}{\sigma_0^2} + \dfrac{1}{\sigma^2}\sum_{i \leq n}x_i \big)}^{b} + c \Big)\Big\}} $$ 260 | which we know is a Gaussian distribution, i.e. $p(\mu|\mathcal{X}) \sim \mathcal{N}(\mu_n,\sigma_n^2)$, because the exponent is a quadratic form in $\mu$.
Furthermore, by completing the square we know that: 261 | $$\mu_n = \dfrac{b}{a} = \dfrac{n \sigma_0^2}{n \sigma_0^2 + \sigma^2} \hat{\mu}_n + \dfrac{ \sigma^2}{n \sigma_0^2 + \sigma^2} \mu_0$$ 262 | $$\sigma_n^2 = \dfrac{1}{a} = \dfrac{\sigma^2 \sigma_0^2}{n\sigma_0^2+\sigma^2}$$ 263 | 264 | It is worth spending a moment studying the form of the posterior mean and variance. First of all, note that the mean of the posterior is a compromise between $\mu_0$ and the maximum likelihood solution $\hat{\mu}$. If the number of observed data points $n=0$ then $\mu_n$ reduces to the prior mean as expected. For $n \to \infty$, $\mu_n$ is given by the maximum likelihood solution. \\ Similarly, consider the result for the variance of the posterior distribution $\sigma_n^2$. With no observed data points, we have the prior variance, whereas if the number of data points $n \to \infty$, the variance goes to zero and the posterior distribution becomes infinitely peaked around the maximum likelihood solution.\\ 265 | We therefore see that the maximum likelihood result of a point estimate for $\mu$ is recovered precisely from the Bayesian formalism in the limit of an infinite number of observations. 266 | 267 | \end{document} -------------------------------------------------------------------------------- /L5_Gaussian_Processes.tex: -------------------------------------------------------------------------------- 1 | \documentclass[twoside]{article} 2 | \setlength{\oddsidemargin}{0.25 in} 3 | \setlength{\evensidemargin}{-0.25 in} 4 | \setlength{\topmargin}{-0.6 in} 5 | \setlength{\textwidth}{6.5 in} 6 | \setlength{\textheight}{8.5 in} 7 | \setlength{\headsep}{0.75 in} 8 | \setlength{\parindent}{0 in} 9 | \setlength{\parskip}{0.1 in} 10 | \newcommand{\eqdef}{:\mathrel{\mathop=}} 11 | \newcommand{\norm}[1]{\left\lVert #1 \right\rVert} 12 | 13 | % 14 | % ADD PACKAGES here: 15 | % 16 | 17 | \usepackage{amsmath,amsfonts,graphicx,dsfont,amssymb, cool, cancel, mathtools} 18 | % 19 | % The following commands set up the lecnum (lecture number) 20 | % counter and make various numbering schemes work relative 21 | % to the lecture number. 22 | % 23 | \newcounter{lecnum} 24 | \renewcommand{\thepage}{\thelecnum-\arabic{page}} 25 | \renewcommand{\thesection}{\thelecnum.\arabic{section}} 26 | \renewcommand{\theequation}{\thelecnum.\arabic{equation}} 27 | \renewcommand{\thefigure}{\thelecnum.\arabic{figure}} 28 | \renewcommand{\thetable}{\thelecnum.\arabic{table}} 29 | \newcommand{\indep}{\raisebox{0.05em}{\rotatebox[origin=c]{90}{$\models$}}} 30 | 31 | % 32 | % The following macro is used to generate the header. 
33 | % 34 | \newcommand{\lecture}[4]{ 35 | \pagestyle{myheadings} 36 | \thispagestyle{plain} 37 | \newpage 38 | \setcounter{lecnum}{#1} 39 | \setcounter{page}{1} 40 | \noindent 41 | \begin{center} 42 | \framebox{ 43 | \vbox{\vspace{2mm} 44 | \hbox to 6.28in { {\bf Advanced Machine Learning 45 | \hfill Fall 2020} } 46 | \vspace{4mm} 47 | \hbox to 6.28in { {\Large \hfill Lecture #1: #2 \hfill} } 48 | \vspace{2mm} 49 | \hbox to 6.28in { {\it #3 \hfill #4} } 50 | \vspace{2mm}} 51 | } 52 | \end{center} 53 | \markboth{Lecture #1: #2}{Lecture #1: #2} 54 | 55 | {\bf Note}: {\it LaTeX template courtesy of UC Berkeley EECS dept.} 56 | 57 | {\bf Disclaimer}: {\it These notes are adapted from ETH's Advanced Machine Learning Course, "Gaussian Processes for Machine Learning, the MIT Press", "Probabilistic Machine Learning: An introduction, MIT Press" and MA4270 Lecture Notes 5.} 58 | \vspace*{4mm} 59 | } 60 | % 61 | % Convention for citations is authors' initials followed by the year. 62 | % For example, to cite a paper by Leighton and Maggs you would type 63 | % \cite{LM89}, and to cite a paper by Strassen you would type \cite{S69}. 64 | % (To avoid bibliography problems, for now we redefine the \cite command.) 65 | % Also commands that create a suitable format for the reference list. 66 | \renewcommand{\cite}[1]{[#1]} 67 | \def\beginrefs{\begin{list}% 68 | {[\arabic{equation}]}{\usecounter{equation} 69 | \setlength{\leftmargin}{2.0truecm}\setlength{\labelsep}{0.4truecm}% 70 | \setlength{\labelwidth}{1.6truecm}}} 71 | \def\endrefs{\end{list}} 72 | \def\bibentry#1{\item[\hbox{[#1]}]} 73 | 74 | %Use this command for a figure; it puts a figure in wherever you want it. 75 | %usage: \fig{NUMBER}{SPACE-IN-INCHES}{CAPTION} 76 | \newcommand{\fig}[3]{ 77 | \vspace{#2} 78 | \begin{center} 79 | Figure \thelecnum.#1:~#3 80 | \end{center} 81 | } 82 | % Use these for theorems, lemmas, proofs, etc. 83 | \newtheorem{theorem}{Theorem}[lecnum] 84 | \newtheorem{lemma}[theorem]{Lemma} 85 | \newtheorem{proposition}[theorem]{Proposition} 86 | \newtheorem{claim}[theorem]{Claim} 87 | \newtheorem{corollary}[theorem]{Corollary} 88 | \newtheorem{definition}[theorem]{Definition} 89 | \newenvironment{proof}{{\bf Proof:}}{\hfill\rule{2mm}{2mm}} 90 | 91 | % **** IF YOU WANT TO DEFINE ADDITIONAL MACROS FOR YOURSELF, PUT THEM HERE: 92 | 93 | \DeclareMathOperator*{\argmax}{arg\,max} 94 | \DeclareMathOperator*{\argmin}{arg\,min} 95 | 96 | \begin{document} 97 | %FILL IN THE RIGHT INFO. 98 | %\lecture{**LECTURE-NUMBER**}{**DATE**}{**LECTURER**}{**SCRIBE**} 99 | \lecture{5}{Gaussian Processes}{}{} 100 | %\footnotetext{These notes are partially based on those of Nigel Mansell.} 101 | 102 | % **** YOUR NOTES GO HERE: 103 | 104 | % Some general latex examples and examples making use of the 105 | % macros follow. 106 | %**** IN GENERAL, BE BRIEF. LONG SCRIBE NOTES, NO MATTER HOW WELL WRITTEN, 107 | %**** ARE NEVER READ BY ANYBODY. 
108 | 109 | \section{Kernel Methods} 110 | 111 | In the previous lecture we have looked at \textit{linear predictors} for real-valued variables of the form: 112 | \begin{equation*} 113 | \hat{y} = \hat{f}(\boldsymbol{x}, \boldsymbol{\theta}) = \theta_0 + \boldsymbol{\theta}^\intercal\boldsymbol{x} 114 | \end{equation*} 115 | \textit{Motivation example (Fitting a quadratic):} If we know that $y$ is (well-approximated by) a quadratic function of a single input $\boldsymbol{x}$, then we can use $\boldsymbol{x}$ to construct $\Tilde{\boldsymbol{x}} = [x, x^2]^\intercal$, and then perform linear regression with input $\Tilde{\boldsymbol{x}}$ and $\boldsymbol{\theta} \in \mathbb{R}^2$. Then $\hat{y} = \theta_2x^2 + \theta_1x + \theta_0$, an arbitrary quadratic function.\medskip 116 | 117 | \textbf{Overview} 118 | \begin{itemize} 119 | \item Many machine learning algorithms depend on the data $x_1,...,x_n$ only through products $\langle x_i, x_j \rangle$.\\ 120 | Inner products capture the geometry of the data set, so one generally expects geometrically inspired algorithms (e.g. SVM) to depend only on inner products. For algorithms that use distances, note that $\norm{\boldsymbol{x} - \boldsymbol{x'}}^2 = \langle \boldsymbol{x}, \boldsymbol{x} \rangle + \langle \boldsymbol{x'}, \boldsymbol{x'} \rangle - 2\langle \boldsymbol{x}, \boldsymbol{x'} \rangle$, so distances can be expressed in terms of inner products. 121 | \item We know that moving to feature spaces can help, so we could map each $x_i \rightarrow \phi(x_i)$ and apply the algorithm using $\langle \phi(x_i), \phi(x_j) \rangle$. 122 | \item A \textit{kernel function} $k(x_i, x_j)$ can be thought of as an inner product in a \textit{possibly implicit} feature space: 123 | \begin{itemize} 124 | \item \textbf{Key idea}: There are clever choices of the mapping $\phi(\cdot)$ ensuring that we can efficiently compute $\langle \phi(x_i), \phi(x_j) \rangle$ without ever explicitly mapping to the feature space. 125 | \item In some cases, the feature space is infinite-dimensional, so we could not explicitly map to it even if we wanted to. 126 | \end{itemize} 127 | \item \textbf{Kernel trick}: The terminology “kernel trick” simply refers to taking any algorithm that depends on the data only through inner products, and replacing each inner product $\langle \boldsymbol{x}, \boldsymbol{x'} \rangle$ by a kernel value $k(\boldsymbol{x}, \boldsymbol{x'})$. 128 | \end{itemize}\medskip 129 | 130 | \begin{definition} 131 | A function $k : \mathbb{R}^d \times \mathbb{R}^d \rightarrow \mathbb{R}$ is said to be a kernel function if and only if it is an inner product $\langle \boldsymbol{\phi(x)}, \boldsymbol{\phi(x')} \rangle$ for some (possibly infinite dimensional) mapping $\boldsymbol{\phi(x)}$. 
132 | \end{definition} 133 | \newpage 134 | \begin{definition} 135 | A function $k : \mathbb{R}^d \times \mathbb{R}^d \rightarrow \mathbb{R}$ is said to be symmetric positive semidefinite (PSD) if it is symmetric, i.e.\ $k(\boldsymbol{x}, \boldsymbol{x'}) = k(\boldsymbol{x'}, \boldsymbol{x})$, and if for any integer $m > 0$ and any set of inputs $\boldsymbol{x_1,...,x_m }\in \mathbb{R}^d$, the following matrix is positive semi-definite: 136 | \begin{center} 137 | $\boldsymbol{K} = 138 | \begin{bmatrix} 139 | k(\boldsymbol{x_1}, \boldsymbol{x_1}) & \hdots & k(\boldsymbol{x_1}, \boldsymbol{x_m})\\ 140 | \vdots & \ddots & \vdots\\ 141 | k(\boldsymbol{x_m}, \boldsymbol{x_1}) & \hdots & k(\boldsymbol{x_m}, \boldsymbol{x_m}) 142 | \end{bmatrix} 143 | \succeq \boldsymbol{0} 144 | $ 145 | \end{center} 146 | This matrix, with $(i, j)$-th entry equal to $k(\boldsymbol{x_i}, \boldsymbol{x_j})$, is called the \textbf{Gram matrix}. 147 | \end{definition} 148 | 149 | \begin{theorem} 150 | The above two definitions are equivalent. That is, $k$ is a kernel function if and only if it is symmetric PSD. 151 | \end{theorem} 152 | \begin{proof} 153 | The “only if” part is easy to show (at least when $\phi$ is finite-dimensional): The inner product is certainly symmetric, and the Gram matrix can be written as $\boldsymbol{K = \Phi^\intercal\Phi}$, where $\boldsymbol{\Phi} \in \mathbb{R}^{\text{dim}(\boldsymbol{\Phi}) \times m}$ contains the $m$ feature vectors $\{\boldsymbol{\phi(x_t)}\}_{t = 1}^{m}$ as columns. The matrix $\boldsymbol{K = \Phi^\intercal\Phi}$ is trivially positive semidefinite, since for any $\boldsymbol{z}$ we have $\boldsymbol{z^\intercal\Phi^\intercal\Phi z} = \boldsymbol{\norm{\Phi z}^2} \geq 0$.\medskip 154 | 155 | The “if” part is more challenging and comes from \textit{Mercer's Theorem}. 156 | \end{proof}\medskip 157 | 158 | \begin{theorem}{\textbf{Mercer's Theorem}\\} 159 | Recall: any positive definite matrix $\boldsymbol{K}$ can be represented using an eigendecomposition of the form $\boldsymbol{K = U^\intercal\Lambda U}$, where $\boldsymbol{\Lambda}$ is a diagonal matrix of eigenvalues $\lambda_i > 0$, and $\boldsymbol{U}$ is a matrix containing the eigenvectors.\\ 160 | Now consider element $(i, j)$ of $\boldsymbol{K}$: 161 | \begin{equation*} 162 | k_{ij} = (\boldsymbol{\Lambda}^\frac{1}{2}\boldsymbol{U}_{:i})^\intercal(\boldsymbol{\Lambda}^\frac{1}{2}\boldsymbol{U}_{:j}) 163 | \end{equation*} 164 | where $\boldsymbol{U}_{:i}$ is the \textit{i}'th column of $\boldsymbol{U}$. If we define $\boldsymbol{\phi(x_i)} = \boldsymbol{\Lambda}^\frac{1}{2}\boldsymbol{U}_{:i}$, then we can write: 165 | \begin{equation*} 166 | k_{ij} = \boldsymbol{\phi(x_i)}^\intercal \boldsymbol{\phi(x_j)} = \sum\limits_m \phi_m (\boldsymbol{x_i})\phi_m(\boldsymbol{x_j}) 167 | \end{equation*} 168 | \end{theorem} 169 | Thus we see that the entries in the kernel matrix can be computed by performing an inner product of some feature vectors that are implicitly defined by the eigenvectors of the kernel matrix. This idea can be generalized to apply to kernel functions, not just kernel matrices.\\ 170 | For example consider the \textbf{quadratic kernel} $\mathcal{K}(\boldsymbol{x}, \boldsymbol{x'}) = \langle\boldsymbol{x}, \boldsymbol{x'}\rangle^2$.
In 2d, we have: 171 | \begin{equation*} 172 | \mathcal{K}(\boldsymbol{x}, \boldsymbol{x'}) = (x_1x_1' + x_2x_2')^2 = x_1^2(x_1')^2 + 2x_1x_2x_1'x_2' + x_2^2(x_2')^2 173 | \end{equation*} 174 | We can write this as $\mathcal{K}(\boldsymbol{x}, \boldsymbol{x'}) = \boldsymbol{\phi(x)}^\intercal\boldsymbol{\phi(x')}$ if we define $\boldsymbol{\phi(x_1, x_2)} = [x_1^2, \sqrt{2}x_1x_2, x_2^2] \in \mathbb{R}^3$. So we embed the 2d inputs $\boldsymbol{x}$ into a 3d feature space $\boldsymbol{\phi(x)}$.\\ 175 | Now consider the RBF kernel. In this case, the corresponding feature representation is infinite dimensional. However, by working with kernel functions, we can avoid having to deal with infinite dimensional vectors. 176 | \newpage 177 | \subsection{Making new kernels from old} 178 | Given two valid kernels $\mathcal{K}_1(\boldsymbol{x}, \boldsymbol{x'})$ and $\mathcal{K}_2(\boldsymbol{x}, \boldsymbol{x'})$, we can create a new kernel using any of the following methods: 179 | \begin{itemize} 180 | \item $\mathcal{K}(\boldsymbol{x}, \boldsymbol{x'}) = c\mathcal{K}_1(\boldsymbol{x}, \boldsymbol{x'})$, for any constant $c > 0$ 181 | \item $\mathcal{K}(\boldsymbol{x}, \boldsymbol{x'}) = f(\boldsymbol{x})\mathcal{K}_1(\boldsymbol{x}, \boldsymbol{x'})f(\boldsymbol{x'})$, for any function $f$ 182 | \item $\mathcal{K}(\boldsymbol{x}, \boldsymbol{x'}) = q(\mathcal{K}_1(\boldsymbol{x}, \boldsymbol{x'}))$, for any polynomial $q$ with non-negative coefficients 183 | \item $\mathcal{K}(\boldsymbol{x}, \boldsymbol{x'}) = \exp{(\mathcal{K}_1(\boldsymbol{x}, \boldsymbol{x'}))}$ 184 | \item $\mathcal{K}(\boldsymbol{x}, \boldsymbol{x'}) = \boldsymbol{x}^\intercal\boldsymbol{Ax'}$, for any PSD matrix $\boldsymbol{A}$ 185 | \end{itemize} 186 | For example, suppose we start with the linear kernel $\mathcal{K}(\boldsymbol{x}, \boldsymbol{x'}) = \boldsymbol{x^\intercal x'}$. We know this is a valid Mercer kernel, since the corresponding Gram matrix is just the (scaled) covariance matrix of the data. From the above rules, we can see that the polynomial kernel $\mathcal{K}(\boldsymbol{x}, \boldsymbol{x'}) = (\boldsymbol{x^\intercal x'})^M$ is a valid Mercer kernel. This contains all monomials of order $M$. For example, if $M = 2$ and the inputs are 2d, we have: 187 | $$(\boldsymbol{x^\intercal x'})^2 = (x_1x_1' + x_2x_2')^2 = (x_1x_1')^2 + (x_2x_2')^2 + 2(x_1x_1')(x_2x_2')$$ 188 | We can generalize this to contain all terms up to degree $M$ by using the kernel $\mathcal{K}(\boldsymbol{x}, \boldsymbol{x'}) = (\boldsymbol{x^\intercal x'} + c)^M$. For example, if $M = 2$ and the inputs are 2d, we have: 189 | $$(\boldsymbol{x^\intercal x'} + 1)^2 = (x_1x_1')^2 + (x_2x_2')^2 + 2(x_1x_1')(x_2x_2') + 2(x_1x_1') + 2(x_2x_2') + 1$$ 190 | We can also use the above rules to establish that the Gaussian kernel is a valid kernel. To see this, 191 | note that: 192 | $$\norm{\boldsymbol{x} - \boldsymbol{x'}}^2 = \boldsymbol{x}^\intercal\boldsymbol{x} + (\boldsymbol{x'})^\intercal\boldsymbol{x'} - 2\boldsymbol{x}^\intercal\boldsymbol{x'}$$ 193 | and hence 194 | $$\mathcal{K}(\boldsymbol{x}, \boldsymbol{x'}) = \exp{(-\norm{\boldsymbol{x} - \boldsymbol{x'}}^2/2\sigma^2)} = \exp{(-\boldsymbol{x}^\intercal\boldsymbol{x}/2\sigma^2)}\exp{(\boldsymbol{x}^\intercal\boldsymbol{x'}/\sigma^2)}\exp{(-(\boldsymbol{x'})^\intercal\boldsymbol{x'}/2\sigma^2)}$$ 195 | is a valid kernel.
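To make these construction rules concrete, here is a small numerical sanity check (our addition, not part of the original notes): we build Gram matrices on random inputs for a few kernels obtained from the rules above and confirm that the smallest eigenvalue of each is non-negative up to round-off, i.e.\ that each Gram matrix is PSD. The inputs and kernel parameters are arbitrary choices for this sketch.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((20, 2))  # 20 random inputs in R^2

def gram(k, X):
    # Gram matrix with (i, j)-th entry k(x_i, x_j)
    return np.array([[k(a, b) for b in X] for a in X])

lin  = lambda a, b: a @ b                    # linear kernel
poly = lambda a, b: (a @ b + 1.0) ** 2       # polynomial kernel, M = 2, c = 1
rbf  = lambda a, b: np.exp(-np.sum((a - b) ** 2) / 2.0)  # Gaussian, sigma = 1

for name, k in [("linear", lin), ("poly", poly), ("rbf", rbf)]:
    K = gram(k, X)
    # eigvalsh handles symmetric matrices; the minimum eigenvalue should be
    # non-negative up to floating point noise (>= -1e-10 in practice)
    print(name, np.linalg.eigvalsh(K).min())
\end{verbatim}
Note that a clearly negative eigenvalue on some finite input set would certify that a candidate function is \textit{not} a valid kernel; the converse check on finitely many points is only a heuristic, since validity requires a PSD Gram matrix for \textit{every} finite set of inputs.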
196 | \subsection{Combining kernels by addition and multiplication} 197 | We can also combine kernels using addition or multiplication: 198 | \begin{itemize} 199 | \item $\mathcal{K}(\boldsymbol{x}, \boldsymbol{x'}) = \mathcal{K}_1(\boldsymbol{x}, \boldsymbol{x'}) + \mathcal{K}_2(\boldsymbol{x}, \boldsymbol{x'})$ 200 | \item $\mathcal{K}(\boldsymbol{x}, \boldsymbol{x'}) = \mathcal{K}_1(\boldsymbol{x}, \boldsymbol{x'}) \times \mathcal{K}_2(\boldsymbol{x}, \boldsymbol{x'})$ 201 | \end{itemize} 202 | Multiplying two positive-definite kernels together always results in another positive definite kernel. This is a way to get a conjunction of the individual properties of each kernel. In addition, adding two positive-definite kernels together always results in another positive definite 203 | kernel. This is a way to get a disjunction of the individual properties of each kernel. 204 | \subsection{Kernels for structured inputs} 205 | Kernels are particularly useful when the inputs are structured objects, such as strings and graphs, since it is often hard to “featurize” variable-sized inputs. For example, we can define a string kernel which compares strings in terms of the number of n-grams they have in common. 206 | We can also define kernels on graphs. For example, the random walk kernel conceptually performs random walks on two graphs simultaneously, and then counts the number of paths that were produced by both walks. 207 | \section{Gaussian Processes} 208 | We use a Gaussian Process (GP) to describe a distribution over functions. Formally: 209 | \begin{definition} 210 | A Gaussian Process is a collection of random variables, any finite number of which have a joint Gaussian distribution. 211 | \end{definition} 212 | A GP is completely specified by its mean function and covariance function. We define the mean function $m(\boldsymbol{x})$ and the covariance function $k(\boldsymbol{x}, \boldsymbol{x'})$ of a real process $f(\boldsymbol{x})$ as 213 | \begin{equation*} 214 | m(\boldsymbol{x}) = \mathbb{E}[f(\boldsymbol{x})] 215 | \end{equation*} 216 | \begin{equation*} 217 | k(\boldsymbol{x}, \boldsymbol{x'}) = \mathbb{E}[(f(\boldsymbol{x}) - m(\boldsymbol{x}))(f(\boldsymbol{x'}) - m(\boldsymbol{x'}))] 218 | \end{equation*} 219 | and will write the Gaussian process as 220 | \begin{equation*} 221 | f(\boldsymbol{x}) \sim \mathcal{GP}(m(\boldsymbol{x}), k(\boldsymbol{x}, \boldsymbol{x'})) 222 | \end{equation*} 223 | Usually, for notational simplicity, we will take the mean function to be zero, although this need not be the case.\medskip 224 | 225 | In our case the random variables represent the value of the function $f(\boldsymbol{x})$ at location $\boldsymbol{x}$. For notational convenience we use the (arbitrary) enumeration of the cases in the training set to identify the random variables, such that $f_i \triangleq f(\boldsymbol{x}_i)$ is the random variable corresponding to the case $(\boldsymbol{x_i}, y_i)$, as would be expected.\medskip 226 | 227 | A GP is defined as a collection of random variables. Thus, the definition automatically implies a \textit{consistency} requirement, which is also sometimes known as the marginalization property. This property simply means that if the GP e.g.\ specifies $(y_1, y_2) \sim \mathcal{N}(\boldsymbol{\mu}, \Sigma)$, then it must also specify $y_1 \sim \mathcal{N}(\mu_1, \Sigma_{11})$ where $ \Sigma_{11}$ is the relevant submatrix of $\Sigma$.\\ 228 | In other words, examination of a larger set of variables does not change the distribution of the smaller set.
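The marginalization property can be seen directly in code. In the following tiny check (our addition; the squared exponential covariance used here is only introduced formally below, as Eq.~5.1), the Gram matrix of a subset of the inputs is exactly the corresponding sub-block of the full Gram matrix, so the marginal distribution implied for the smaller set is automatically consistent with the joint:
\begin{verbatim}
import numpy as np

def se_cov(A, B):
    # squared exponential covariance, unit length-scale (cf. Eq. 5.1 below)
    d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
    return np.exp(-0.5 * d2)

X = np.array([[0.0], [1.0], [2.5]])   # three input locations
K_full = se_cov(X, X)                 # covariance of (f(x1), f(x2), f(x3))
K_sub  = se_cov(X[:2], X[:2])         # covariance of (f(x1), f(x2)) alone
assert np.allclose(K_full[:2, :2], K_sub)
\end{verbatim}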
Notice that the consistency requirement is automatically fulfilled if the covariance function specifies entries of the covariance matrix. The definition does not exclude Gaussian processes with finite index sets (which would be simply Gaussian \textit{distributions}), but these are not particularly interesting for our purposes.\medskip 229 | 230 | A simple example of a GP can be obtained from a Bayesian regression model $f(\boldsymbol{x}) = \boldsymbol{\phi(x)}^\intercal\boldsymbol{w}$ with prior $\boldsymbol{w} \sim \mathcal{N}(\boldsymbol{0}, \Sigma_p)$. We have for the mean and covariance 231 | \begin{equation*} 232 | \mathbb{E}[f(\boldsymbol{x})] = \boldsymbol{\phi(x)}^\intercal\mathbb{E}[\boldsymbol{w}] = 0 233 | \end{equation*} 234 | \begin{equation*} 235 | \mathbb{E}[f(\boldsymbol{x})f(\boldsymbol{x'})] = \boldsymbol{\phi(x)}^\intercal\mathbb{E}[\boldsymbol{ww^\intercal}]\boldsymbol{\phi(x')} = \boldsymbol{\phi(x)}^\intercal\Sigma_p\boldsymbol{\phi(x')} 236 | \end{equation*} 237 | Thus $f(\boldsymbol{x})$ and $f(\boldsymbol{x'})$ are jointly Gaussian with zero mean and covariance given by $\boldsymbol{\phi(x)}^\intercal\Sigma_p\boldsymbol{\phi(x')}$. Indeed, the function values $f(\boldsymbol{x}_1),...,f(\boldsymbol{x}_n)$ corresponding to any number of input points $n$ are jointly Gaussian, although if the number $N$ of basis functions satisfies $N < n$, then this Gaussian is singular (as the joint covariance matrix will be of rank at most $N$).\medskip 238 | 239 | Our running example of a covariance function will be the \textit{squared exponential} (SE) covariance function that specifies the covariance between pairs of random variables 240 | \begin{equation} 241 | \text{cov}(f(\boldsymbol{x}_p), f(\boldsymbol{x}_q)) = k(\boldsymbol{x}_p, \boldsymbol{x}_q) = \exp{(-\frac{1}{2}|\boldsymbol{x}_p - \boldsymbol{x}_q|^2)} 242 | \end{equation} 243 | Note that the covariance between the \textit{outputs} is written as a function of the \textit{inputs}. For this particular covariance function, we see that the covariance is almost unity between variables whose corresponding inputs are very close, and decreases as their distance in the input space increases.\medskip 244 | 245 | It can be shown that the squared exponential covariance function corresponds to a Bayesian linear regression model with an infinite number of basis functions. Indeed for every positive definite covariance function $k(\cdot)$, there exists a (possibly infinite) expansion in terms of basis functions (see Mercer's Theorem). We can also obtain the SE covariance function from the linear combination of an infinite number of Gaussian-shaped basis functions.\\ 246 | The specification of the covariance function implies a distribution over functions. To see this, we can draw samples from the distribution of functions evaluated at any number of points; in detail, we choose a number of input points $X_*$ and write out the corresponding covariance matrix using Eq.5.1 elementwise.\\ 247 | Then we generate a random Gaussian vector with this covariance matrix 248 | \begin{equation*} 249 | \textbf{f}_* \sim \mathcal{N}(\boldsymbol{0}, K(X_*, X_*)) 250 | \end{equation*} 251 | and plot the generated values as a function of the inputs. 252 | \begin{figure}[ht] 253 | \caption{Panel (a) shows three functions drawn at random from a GP prior; the dots indicate values of $y$ actually generated; the two other functions have (less correctly) been drawn as lines by joining a large number of evaluated points. Panel (b) shows three random functions drawn from the posterior, i.e.
the prior conditioned on the five noise free observations indicated. In both plots the shaded area represents the pointwise mean plus and minus two times the standard deviation for each input value (corresponding to the 95\% confidence region), for the prior and posterior respectively.} 254 | \centering 255 | \includegraphics[width=0.90\textwidth]{img/gaussian_process.png} 256 | \end{figure} 257 | \newpage 258 | In the example in Figure 5.1 the input values were equidistant, but this need not be the case. Notice that “informally” the functions look smooth. In fact the squared exponential covariance function is infinitely differentiable, leading to the process being infinitely mean-square differentiable. We also see that the functions seem to have a characteristic length-scale, 259 | which informally can be thought of as roughly the distance you have to move in input space before the function value can change significantly.\\ 260 | For Eq.5.1 the characteristic length-scale is around one unit. By replacing $|\boldsymbol{x}_p - \boldsymbol{x}_q|$ by $|\boldsymbol{x}_p - \boldsymbol{x}_q|/l$ for some positive constant $l$ we could change the characteristic length-scale of the process. Also, the overall variance of the random function can be controlled by a positive pre-factor before the $\exp$ in Eq.5.1. 261 | \subsection{Prediction with Noise-free Observations} 262 | We are usually not primarily interested in drawing random functions from the prior, but want to incorporate the knowledge that the training data provides about the function. Initially, we will consider the simple special case where the observations are noise free, that is we know $\{(\boldsymbol{x}_i, f_i)|i = 1,...,n\}$. The joint distribution of the training outputs, $\textbf{f}$, and the test outputs $\textbf{f}_*$ according to the prior is 263 | \begin{equation*} 264 | \begin{bmatrix*}[l] 265 | \textbf{f}\\ 266 | \textbf{f}_* 267 | \end{bmatrix*} 268 | \sim \mathcal{N}\Bigg(\boldsymbol{0}, \begin{bmatrix*} 269 | K(X, X) & K(X, X_*)\\ 270 | K(X_*, X) & K(X_*, X_*) 271 | \end{bmatrix*}\Bigg) 272 | \end{equation*} 273 | If there are $n$ training points and $n_*$ test points then $K(X, X_*)$ denotes the $n\times n_*$ matrix of the covariance evaluated at all pairs of training and test points, and similarly for the other entries.\\ 274 | To get the posterior distribution over functions we need to restrict this joint prior distribution to contain only those functions which agree with the observed data points. Graphically, in Figure 5.1 you may think of generating functions from the prior, and rejecting the ones that disagree with the observations, although this strategy would not be computationally very efficient. Fortunately, in probabilistic terms this operation is extremely simple, corresponding to \textit{conditioning} the joint Gaussian prior distribution on the observations to give 275 | \begin{equation} 276 | \textbf{f}_*|X_*,X, \textbf{f} \hspace{3pt}\sim \mathcal{N}(K(X_*, X)K(X, X)^{-1}\textbf{f},\hspace{3pt} K(X_*, X_*) - K(X_*, X)K(X, X)^{-1}K(X, X_*)) 277 | \end{equation} 278 | Function values $\textbf{f}_*$ (corresponding to test inputs $X_*$) can be sampled from the joint posterior distribution by evaluating the mean and covariance matrix from Eq.5.2.\\ 279 | Figure 5.1(b) shows the results of these computations given the five data points marked with $+$ symbols.
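The following minimal sketch (our addition, not from the original text) carries out exactly this conditioning: it uses the SE covariance of Eq.5.1, five arbitrary noise-free observations of $\sin(x)$ as the training data, and Eq.5.2 for the posterior mean and covariance; the tiny jitter added to $K(X, X)$ is purely for numerical stability when solving with a near-singular matrix.
\begin{verbatim}
import numpy as np

def se_cov(A, B):
    # squared exponential covariance of Eq. 5.1, evaluated elementwise
    d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
    return np.exp(-0.5 * d2)

X  = np.array([[-4.0], [-2.0], [0.0], [1.0], [3.0]])  # 5 training inputs
f  = np.sin(X).ravel()               # noise-free observations (assumed data)
Xs = np.linspace(-5.0, 5.0, 101)[:, None]             # test inputs X_*

K   = se_cov(X, X) + 1e-10 * np.eye(len(X))  # jitter for numerical stability
Ks  = se_cov(X, Xs)                          # K(X, X_*)
Kss = se_cov(Xs, Xs)                         # K(X_*, X_*)

mean = Ks.T @ np.linalg.solve(K, f)          # posterior mean (Eq. 5.2)
cov  = Kss - Ks.T @ np.linalg.solve(K, Ks)   # posterior covariance (Eq. 5.2)
samples = np.random.default_rng(1).multivariate_normal(mean, cov, size=3)
\end{verbatim}
Plotting \texttt{mean} together with a band of two pointwise standard deviations (the square roots of the diagonal of \texttt{cov}) and the three rows of \texttt{samples} reproduces the qualitative picture of Figure 5.1(b).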
Notice that it is trivial to extend these computations to multidimensional inputs – one simply needs to change the evaluation of the covariance function in accordance with Eq.5.1, although the resulting functions may be harder to display graphically. 280 | \subsection{Prediction with Noisy Observations} 281 | It is typical for more realistic modelling situations that we do not have access to function values themselves, but only noisy versions, i.e.\ $y = f(\boldsymbol{x}) + \epsilon$. Assuming additive independent identically distributed Gaussian noise $\epsilon$ with variance $\sigma^2_n$, the prior on the noisy observations becomes 282 | \begin{equation*} 283 | \text{cov}(y_p, y_q) = k(x_p, x_q) + \sigma^2_n\delta_{pq} \hspace{8pt}\text{or}\hspace{8pt} \text{cov}(\boldsymbol{y}) = K(X, X) + \sigma^2_n\mathds{1} 284 | \end{equation*} 285 | where $\delta_{pq}$ is a Kronecker delta which is one if $p = q$ and zero otherwise. It follows from the independence assumption about the noise that a diagonal matrix is added, in comparison to the noise free case, Eq.5.1. Introducing the noise term we can write the joint distribution of the observed target values and the function values at the test locations under the prior as 286 | \begin{equation*} 287 | \begin{bmatrix*}[l] 288 | \boldsymbol{y}\\ 289 | \textbf{f}_* 290 | \end{bmatrix*} 291 | \sim \mathcal{N}\Bigg(\boldsymbol{0}, \begin{bmatrix*} 292 | K(X, X) + \sigma^2_n\mathds{1} & K(X, X_*)\\ 293 | K(X_*, X) & K(X_*, X_*) 294 | \end{bmatrix*}\Bigg) 295 | \end{equation*} 296 | Deriving the conditional distribution corresponding to Eq.5.2 we arrive at the key predictive equations for Gaussian process regression 297 | \begin{equation*} 298 | \textbf{f}_*|X,\textbf{y},X_* \sim \mathcal{N}(\overline{\textbf{f}}_*, \text{cov}(\textbf{f}_*)), \text{where} 299 | \end{equation*} 300 | \begin{equation} 301 | \overline{\textbf{f}}_* \triangleq \mathbb{E}[\textbf{f}_*|X, \boldsymbol{y}, X_*] = K(X_*, X)[K(X, X) + \sigma^2_n\mathds{1}]^{-1}\boldsymbol{y} 302 | \end{equation} 303 | \begin{equation} 304 | \text{cov}(\textbf{f}_*) = K(X_*, X_*) - K(X_*, X)[K(X, X) + \sigma^2_n\mathds{1}]^{-1}K(X, X_*) 305 | \end{equation}\medskip 306 | 307 | \textbf{Reminder}: Conditional Gaussian distribution 308 | \begin{equation*} 309 | \begin{bmatrix*}[l] 310 | \textbf{a}_1\\ 311 | \textbf{a}_2 312 | \end{bmatrix*} 313 | \sim \mathcal{N}\Bigg(\begin{bmatrix*}[l] 314 | \boldsymbol{\mu}_1\\ 315 | \boldsymbol{\mu}_2 316 | \end{bmatrix*}, \begin{bmatrix*} 317 | \Sigma_{11} & \Sigma_{12}\\ 318 | \Sigma_{21} & \Sigma_{22} 319 | \end{bmatrix*}\Bigg) 320 | \end{equation*} 321 | Then, 322 | \begin{equation*} 323 | \textbf{a}_2 | \textbf{a}_1 = \boldsymbol{z} \sim \mathcal{N}(\boldsymbol{\mu}_2 + \Sigma_{21}\Sigma_{11}^{-1}(\boldsymbol{z} - \boldsymbol{\mu}_1), \Sigma_{22} - \Sigma_{21}\Sigma_{11}^{-1}\Sigma_{12}) 324 | \end{equation*}\medskip 325 | 326 | The expressions involving $K(X, X), K(X, X_*)$ and $K(X_*, X_*)$ etc. can look rather unwieldy, so we now introduce a compact form of the notation, setting $K = K(X, X)$ and $K_* = K(X, X_*)$. In the case there is only one test point $\boldsymbol{x}_*$ we write $\boldsymbol{k}(\boldsymbol{x_*}) = \boldsymbol{k_*}$ to denote the vector of covariances between the test point and the $n$ training points.
Using this compact notation and for a single test point $\boldsymbol{x}_*$, equations 5.3 and 5.4 reduce to 327 | \begin{equation} 328 | \bar{f}_* = \boldsymbol{k}_*^\intercal(K + \sigma^2_n\mathds{1})^{-1}\boldsymbol{y} 329 | \end{equation} 330 | \begin{equation} 331 | \text{Var}[f_*] = k(\boldsymbol{x}_*, \boldsymbol{x}_*) - \boldsymbol{k}_*^\intercal(K + \sigma^2_n\mathds{1})^{-1}\boldsymbol{k}_* 332 | \end{equation} 333 | Let us examine the predictive distribution given by equations 5.5 and 5.6. Note first that the mean prediction Eq.5.5 is a linear combination of observations $\boldsymbol{y}$; this is sometimes referred to as a \textit{linear predictor}. Another way to look at this equation is to see it as a linear combination of $n$ kernel functions, each one centered on a training point, by writing 334 | \begin{equation} 335 | \bar{f}(\boldsymbol{x}_*) = \sum\limits_{i = 1}^{n} \alpha_i k(\boldsymbol{x}_i, \boldsymbol{x}_*) 336 | \end{equation} 337 | where $\boldsymbol{\alpha} = (K + \sigma^2_n\mathds{1})^{-1}\boldsymbol{y}$.\\ 338 | The fact that the mean prediction for $f(\boldsymbol{x}_*)$ can be written as Eq.5.7 despite the fact that the GP can be represented in terms of a (possibly infinite) number of basis functions is one manifestation of the \textit{representer theorem}. We can understand this result intuitively because although the GP defines a joint Gaussian distribution over all of the $y$ variables, for making predictions at $\boldsymbol{x}_*$ we only care about the $(n + 1)$-dimensional distribution defined by the $n$ training points and the test point. As a Gaussian distribution is marginalized by just taking the relevant block of the joint covariance matrix, it is clear that conditioning this $(n + 1)$-dimensional distribution on the observations gives us the desired result.\\ 339 | Note also that the variance in Eq.5.4 does not depend on the observed targets, but only on the inputs; this is a property of the Gaussian distribution. The variance is the difference between two terms; the first term $K(X_*, X_*)$ is simply the prior covariance; from that is subtracted a (positive) term, representing the information the observations give us about the function. We can very simply compute the predictive distribution of test targets $\boldsymbol{y}_*$ by adding $\sigma^2_n\mathds{1}$ to the variance in the expression for cov($\textbf{f}_*$).\medskip 340 | 341 | The predictive distribution for the GP model gives more than just the pointwise error bars of the simplified Eq.5.6. Although not stated explicitly, Eq.5.4 holds unchanged when $X_*$ denotes multiple test inputs; in this case the covariance of the test targets is computed (whose diagonal elements are the pointwise variances). In fact, Eq.5.3 is the mean function and Eq.5.4 the covariance function of the (Gaussian) posterior process. 342 | \newpage 343 | \begin{figure}[ht] 344 | \caption{Algorithm 2.1: Predictions and log marginal likelihood for Gaussian process regression. The implementation addresses the matrix inversion required by equations 5.5 and 5.6 using Cholesky factorization. For multiple test cases lines 4-6 are repeated. The log determinant required in Eq.5.8 is computed from the Cholesky factor (for large $n$ it may not be possible to represent the determinant itself). The computational complexity is $n^3/6$ for the Cholesky decomposition in line 2, and $n^2/2$ for solving triangular systems in line 3 and (for each test case) in line 5.} 345 | \centering 346 | \includegraphics[width=0.90\textwidth]{img/algo_gp.png} 347 | \end{figure} 348 | The \textit{marginal likelihood} (or evidence) $p(\boldsymbol{y}|X)$ is the integral of the likelihood times the prior 349 | \begin{equation*} 350 | p(\boldsymbol{y}|X) = \int p(\boldsymbol{y}|\textbf{f},X)p(\textbf{f}|X)d\textbf{f} 351 | \end{equation*} 352 | The term marginal likelihood refers to the marginalization over the function values \textbf{f}. Under the Gaussian process model the prior is Gaussian, $\textbf{f}|X \sim \mathcal{N}(\boldsymbol{0}, K)$, and the likelihood is a factorized Gaussian, $\boldsymbol{y}|\textbf{f} \sim \mathcal{N}(\textbf{f}, \sigma^2_n\mathds{1})$, so that 353 | \begin{equation} 354 | \log p(\boldsymbol{y}|X) = -\frac{1}{2}\boldsymbol{y}^\intercal(K + \sigma^2_n\mathds{1})^{-1}\boldsymbol{y} -\frac{1}{2}\log|K + \sigma^2_n\mathds{1}| -\frac{n}{2}\log2\pi 355 | \end{equation} 356 | This result can also be obtained directly by observing that $\boldsymbol{y} \sim \mathcal{N}(\boldsymbol{0}, K + \sigma^2_n\mathds{1})$.\medskip 357 | 358 | A practical implementation of Gaussian process regression (GPR) is shown in Algorithm 2.1. The algorithm uses Cholesky decomposition, instead of directly inverting the matrix, since it is faster and numerically more stable. The algorithm returns the predictive mean and variance for noise free test data; to compute the predictive distribution for noisy test data $\boldsymbol{y}_*$, simply add the noise variance $ \sigma^2_n$ to the predictive variance of $f_*$. 359 | \end{document} -------------------------------------------------------------------------------- /L7_SVM.tex: -------------------------------------------------------------------------------- 1 | \documentclass[twoside]{article} 2 | \setlength{\oddsidemargin}{0.25 in} 3 | \setlength{\evensidemargin}{-0.25 in} 4 | \setlength{\topmargin}{-0.6 in} 5 | \setlength{\textwidth}{6.5 in} 6 | \setlength{\textheight}{8.5 in} 7 | \setlength{\headsep}{0.75 in} 8 | \setlength{\parindent}{0 in} 9 | \setlength{\parskip}{0.1 in} 10 | \newcommand{\eqdef}{:\mathrel{\mathop=}} 11 | \newcommand{\norm}[1]{\left\lVert #1 \right\rVert} 12 | 13 | % 14 | % ADD PACKAGES here: 15 | % 16 | 17 | \usepackage{amsmath,amsfonts,graphicx} 18 | 19 | % 20 | % The following commands set up the lecnum (lecture number) 21 | % counter and make various numbering schemes work relative 22 | % to the lecture number. 23 | % 24 | \newcounter{lecnum} 25 | \renewcommand{\thepage}{\thelecnum-\arabic{page}} 26 | \renewcommand{\thesection}{\thelecnum.\arabic{section}} 27 | \renewcommand{\theequation}{\thelecnum.\arabic{equation}} 28 | \renewcommand{\thefigure}{\thelecnum.\arabic{figure}} 29 | \renewcommand{\thetable}{\thelecnum.\arabic{table}} 30 | 31 | % 32 | % The following macro is used to generate the header.
33 | % 34 | \newcommand{\lecture}[4]{ 35 | \pagestyle{myheadings} 36 | \thispagestyle{plain} 37 | \newpage 38 | \setcounter{lecnum}{#1} 39 | \setcounter{page}{1} 40 | \noindent 41 | \begin{center} 42 | \framebox{ 43 | \vbox{\vspace{2mm} 44 | \hbox to 6.28in { {\bf Advanced Machine Learning 45 | \hfill Fall 2021} } 46 | \vspace{4mm} 47 | \hbox to 6.28in { {\Large \hfill Lecture #1: #2 \hfill} } 48 | \vspace{2mm} 49 | \hbox to 6.28in { {\it #3 \hfill #4} } 50 | \vspace{2mm}} 51 | } 52 | \end{center} 53 | \markboth{Lecture #1: #2}{Lecture #1: #2} 54 | 55 | {\bf Note}: {\it LaTeX template courtesy of UC Berkeley EECS dept.} 56 | 57 | {\bf Disclaimer}: {\it These notes are adapted from CMU's 10-725 Course, Stanford's CS229 Course, ETH's Advanced Machine Learning Course and Bishop's "Pattern Recognition and Machine Learning" book.} 58 | \vspace*{4mm} 59 | } 60 | % 61 | % Convention for citations is authors' initials followed by the year. 62 | % For example, to cite a paper by Leighton and Maggs you would type 63 | % \cite{LM89}, and to cite a paper by Strassen you would type \cite{S69}. 64 | % (To avoid bibliography problems, for now we redefine the \cite command.) 65 | % Also commands that create a suitable format for the reference list. 66 | \renewcommand{\cite}[1]{[#1]} 67 | \def\beginrefs{\begin{list}% 68 | {[\arabic{equation}]}{\usecounter{equation} 69 | \setlength{\leftmargin}{2.0truecm}\setlength{\labelsep}{0.4truecm}% 70 | \setlength{\labelwidth}{1.6truecm}}} 71 | \def\endrefs{\end{list}} 72 | \def\bibentry#1{\item[\hbox{[#1]}]} 73 | 74 | %Use this command for a figure; it puts a figure in wherever you want it. 75 | %usage: \fig{NUMBER}{SPACE-IN-INCHES}{CAPTION} 76 | \newcommand{\fig}[3]{ 77 | \vspace{#2} 78 | \begin{center} 79 | Figure \thelecnum.#1:~#3 80 | \end{center} 81 | } 82 | % Use these for theorems, lemmas, proofs, etc. 83 | \newtheorem{theorem}{Theorem}[lecnum] 84 | \newtheorem{lemma}[theorem]{Lemma} 85 | \newtheorem{proposition}[theorem]{Proposition} 86 | \newtheorem{claim}[theorem]{Claim} 87 | \newtheorem{corollary}[theorem]{Corollary} 88 | \newtheorem{definition}[theorem]{Definition} 89 | \newenvironment{proof}{{\bf Proof:}}{\hfill\rule{2mm}{2mm}} 90 | 91 | % **** IF YOU WANT TO DEFINE ADDITIONAL MACROS FOR YOURSELF, PUT THEM HERE: 92 | 93 | \newcommand\E{\mathbb{E}} 94 | 95 | \begin{document} 96 | %FILL IN THE RIGHT INFO. 97 | %\lecture{**LECTURE-NUMBER**}{**DATE**}{**LECTURER**}{**SCRIBE**} 98 | \lecture{7}{Support Vector Machines}{}{} 99 | %\footnotetext{These notes are partially based on those of Nigel Mansell.} 100 | 101 | % **** YOUR NOTES GO HERE: 102 | 103 | % Some general latex examples and examples making use of the 104 | % macros follow. 105 | %**** IN GENERAL, BE BRIEF. LONG SCRIBE NOTES, NO MATTER HOW WELL WRITTEN, 106 | %**** ARE NEVER READ BY ANYBODY. 107 | 108 | 109 | \section{Lagrangian} % Don't be this informal in your notes! 110 | 111 | Consider a general minimization problem. There is no need for it to be convex, let's keep it as general as possible. 
112 | 113 | \begin{equation*} 114 | \begin{aligned} 115 | & \underset{x}{\text{minimize}} 116 | & & f(x) \\ 117 | & \text{subject to} 118 | & & h_i(x) \leq 0, \; i = 1, \ldots, m \\ 119 | &&& l_j(x) = 0, \; j = 1, \ldots, r 120 | \end{aligned} 121 | \end{equation*} 122 | \\ 123 | 124 | The Lagrangian is defined as: 125 | \begin{equation} 126 | \mathcal{L}(x,u,v) = f(x) + \sum_{i=1}^{m} u_i \cdot h_i(x) + \sum_{j=1}^{r} v_j \cdot l_j(x) 127 | \end{equation} 128 | 129 | 130 | where $u \in \mathbb{R}^m_{\geq 0}$ and $v \in \mathbb{R}^r$. 131 | \\ 132 | \\ 133 | (Small side-note) We define the Lagrangian $\mathcal{L}(x,u,v) = - \infty$ for values of $u < 0$.\\ 134 | \\ We are going to exploit the following property of the Lagrangian: \\ 135 | $\forall u \geq 0$ and $\forall v$ : 136 | \textbf{$f(x) \geq \mathcal{L}(x,u,v) $ at each feasible $x$} 137 | 138 | 139 | \begin{figure}[h] 140 | \caption{Each dotted line shows $\mathcal{L}(x, u)$ for a different choice of $u \geq 0$.} 141 | \centering 142 | \includegraphics[width=0.39\textwidth]{img/lagrangian.jpg} 143 | \end{figure} 144 | 145 | 146 | 147 | 148 | 149 | \newpage 150 | 151 | 152 | 153 | 154 | \subsection{Lagrangian Dual Function} 155 | Let $C$ denote the primal feasible set and $f^{*}$ the primal optimal value. Minimizing $\mathcal{L}(x, u, v)$ over all $x$ gives a 156 | lower bound: 157 | 158 | \begin{equation*} 159 | \begin{aligned} 160 | f^{*} \overset{(i)}{\geq} \underset{x \in {C}}{\text{min}} \ \mathcal{L}(x, u, v) \overset{(ii)}{\geq} \underset{x}{\text{min}} \ \mathcal{L}(x, u, v) \eqdef g(u,v) 161 | \end{aligned} 162 | \end{equation*} 163 | \\ 164 | 165 | Where $(i)$ derives from the property we stated earlier, taking the minimum of both sides, and $(ii)$ holds because the unconstrained minimum is always less than or equal to the constrained one. \\ 166 | 167 | We call $g(u,v)$ the Lagrangian dual function and it gives a lower bound on $f^{*}$. The main takeaway here is that the constrained minimum in $(i)$ is often not computable because of the constraints. Hence, we prefer to work with the unconstrained minimum $(ii)$. 168 | 169 | \subsection{Lagrange Dual Problem} 170 | Given a primal problem 171 | 172 | \begin{equation*} 173 | \begin{aligned} 174 | & \underset{x}{\text{minimize}} 175 | & & f(x) \\ 176 | & \text{subject to} 177 | & & h_i(x) \leq 0, \; i = 1, \ldots, m \\ 178 | &&& l_j(x) = 0, \; j = 1, \ldots, r 179 | \end{aligned} 180 | \end{equation*} 181 | 182 | We have shown that our dual function satisfies $f^{*} \geq g(u,v)$ for all $u \geq 0$ and $v$. \\ 183 | Thus, we can get the best lower-bound estimate of $f^{*}$ by maximizing $g(u,v)$ over the feasible $u,v$, yielding the Lagrange Dual Problem: 184 | 185 | \begin{equation*} 186 | \begin{aligned} 187 | & \underset{u,v}{\text{max}} 188 | & & g(u,v) \\ 189 | & \text{subject to} 190 | & & u \geq 0 \; 191 | \end{aligned} 192 | \end{equation*}\\ 193 | 194 | A key property is called \textbf{weak duality}: 195 | \begin{equation*} 196 | f^{*} \geq g^{*} 197 | \end{equation*} 198 | Where $f^{*}, g^{*}$ are the optimal values for the primal and dual problems.\\Note that this property always holds, even if the primal problem is not convex. Furthermore, it is easy to prove that \textbf{the dual problem is always a convex optimization problem}, even if the primal problem is non convex. 199 | 200 | \newpage 201 | \subsection{Strong Duality} 202 | In some problems we will have $f^{*}=g^{*}$; this property is called Strong Duality.
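As a toy illustration of these definitions (our example, not from the original notes), consider the one-dimensional convex problem of minimizing $f(x) = x^2$ subject to $1 - x \leq 0$. The Lagrangian and the dual function are
\begin{equation*}
\mathcal{L}(x,u) = x^2 + u(1-x), \qquad g(u) = \underset{x}{\text{min}} \ \mathcal{L}(x,u) = u - \dfrac{u^2}{4}
\end{equation*}
since the unconstrained minimizer is $x = u/2$. Any $u \geq 0$ gives a lower bound on $f^{*}$ (weak duality), e.g.\ $g(1) = 3/4$. Maximizing over $u \geq 0$ yields $u^{*} = 2$ and $g^{*} = 1$, while the primal optimum is $x^{*} = 1$ with $f^{*} = 1$. Hence $f^{*} = g^{*}$ here; this is no accident, as the theorem below shows, because the problem is convex and e.g.\ $x = 2$ is strictly feasible.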
203 | 204 | \begin{theorem}{(Slater's Condition)} 205 | \\If the primal problem is a convex problem 206 | and there exists at least one strictly feasible $x$, then Strong Duality holds. 207 | \end{theorem} 208 | 209 | In other words, the condition is that there exists $x$ such that: 210 | 211 | \begin{align*} 212 | h_i(x) < 0, \; i = 1, \ldots, m \\ 213 | l_j (x) = 0,\; j = 1, \ldots, r \\ 214 | \end{align*} 215 | 216 | 217 | 218 | \subsection{Karush-Kuhn-Tucker (KKT) Conditions} 219 | 220 | Given a general problem 221 | 222 | \begin{equation*} 223 | \begin{aligned} 224 | & \underset{x}{\text{minimize}} 225 | & & f(x) \\ 226 | & \text{subject to} 227 | & & h_i(x) \leq 0, \; i = 1, \ldots, m \\ 228 | &&& l_j(x) = 0, \; j = 1, \ldots, r 229 | \end{aligned} 230 | \end{equation*} 231 | 232 | The KKT conditions are: 233 | \begin{enumerate} 234 | \item $0 \in \partial_x (\mathcal{L}(x, u, v) )$ (stationarity) 235 | \item $u_i \cdot h_i(x) = 0$ for all $i$ (complementary slackness) 236 | \item $h_i(x) \leq 0, l_j(x) = 0$ for all $i,j$ (primal feasibility) 237 | \item $u_i \geq 0$ for all $i$ (dual feasibility) 238 | \end{enumerate} 239 | 240 | \begin{theorem} 241 | For $x^{*}$ and $u^{*}$, $v^{*}$ to be primal and dual solutions, the KKT conditions are sufficient. 242 | \end{theorem} 243 | 244 | \begin{proof} 245 | $g^* = g(u^{*},v^{*}) = f(x^{*}) + \sum_{i=1}^{m} u_i^{*} \cdot h_i(x^{*}) + \sum_{j=1}^{r} v_j^{*} \cdot l_j(x^{*}) = f(x^{*}) = f^* $ \\ 246 | \\ Where the first equality holds from stationarity and the second equality holds from complementary slackness and primal feasibility. 247 | \end{proof} 248 | 249 | \newpage 250 | 251 | \begin{theorem} 252 | For a problem with strong duality (e.g. assume Slater's condition holds)\\ 253 | $x^{*}$ and $u^{*}$, $v^{*}$ are primal and dual solutions $\iff$ $x^{*}$ and $u^{*}$, $v^{*}$ satisfy the KKT conditions. 254 | \end{theorem} 255 | 256 | \begin{proof}\\ 257 | \\ 258 | \textbf{Sufficiency:} Follows from Theorem 7.2.\\ \\ \textbf{Necessity:} Suppose $x^{*}$ and $u^{*}$, $v^{*}$ are primal and dual solutions, and suppose strong duality holds. Then: 259 | \begin{align*} 260 | f^* = g^* &= g(u^{*},v^{*}) && \text{(holds by assumption)}\\ 261 | &= \underset{x}{\text{min}} \Big( f(x) + \sum_{i=1}^{m} u_i^{*} h_i(x) + \sum_{j=1}^{r} v_j^{*} \cdot l_j(x) \Big) && \text{(holds by definition)}\\ 262 | &\leq f(x^{*}) + \sum_{i=1}^{m} u_i^{*} \cdot h_i(x^{*}) + \sum_{j=1}^{r} v_j^{*} \cdot l_j(x^{*}) && \text{(the min of $\mathcal{L}$ is at most its value at $x^{*}$)}\\ 263 | &\leq f(x^{*}) && \text{(by feasibility, both sums are $\leq 0$)} 264 | \end{align*} 265 | The LHS equals the RHS, therefore all the inequalities must be equalities. Looking at the KKT conditions: 266 | \begin{itemize} 267 | \item \textbf{Primal and dual feasibility} hold by virtue of optimality: $x^*,u^*,v^*$ are optima $ \implies$ $x^*,u^*,v^*$ must be feasible 268 | \item \textbf{Stationarity} comes from the fact that $x^{*}$ minimizes $\mathcal{L}(x, u^{*}, v^{*})$. Since $x^{*}$ is the minimizer, it must be a stationary point of this function. 269 | \item \textbf{Complementary Slackness} comes from the last inequality, since $\sum_{i=1}^{m} u_i^{*} \cdot h_i(x^{*})$ must be equal to 0. 270 | \end{itemize} 271 | 272 | 273 | \end{proof} 274 | \newpage 275 | 276 | \section{Maximum Margin Classifiers} 277 | 278 | We begin our discussion of Support Vector Machines by returning to the two-class classification problem using a linear model of the form: 279 | 280 | \begin{equation*} 281 | y(x) = w^T \phi(x) + b 282 | \end{equation*} 283 | 284 | We shall assume for the moment that the training data is linearly separable. SVMs approach this problem through the concept of the margin, which is defined to be the smallest distance between the decision boundary and any of the samples. 285 | 286 | \subsection{Finding the margin} 287 | 288 | Consider an arbitrary point $x$ and let $x_{\bot}$ be its orthogonal projection onto the decision surface, so that: 289 | 290 | \begin{equation*} 291 | x = x_{\bot} + r \dfrac{w}{\norm{w}} 292 | \end{equation*} 293 | 294 | Multiplying both sides of this result by $w^T$ and adding $b$ we get: 295 | \begin{equation*} 296 | w^T x + b = w^T x_{\bot} + r w^T \dfrac{w}{\norm{w}} + b 297 | \end{equation*} 298 | 299 | Applying the definition $y(x) = w^T x + b$: 300 | \begin{equation*} 301 | y(x) = y(x_{\bot}) + r \norm{w} 302 | \end{equation*} 303 | 304 | From Figure 7.2 it is clear that $x_{\bot}$ lies on the decision surface, hence $y(x_{\bot})=0$. Solving for $r$: 305 | 306 | \begin{equation*} 307 | r = \dfrac{y(x)}{\norm{w}} 308 | \end{equation*} 309 | 310 | Therefore, the perpendicular distance of a point $x$ from a hyperplane defined by $y(x)=0$ is given by: 311 | \begin{equation*} 312 | \dfrac{\lvert y(x) \rvert}{\norm{w}} 313 | \end{equation*} 314 | \begin{figure}[h] 315 | \caption{The decision surface shown in red is perpendicular to $w$.} 316 | \centering 317 | \includegraphics[width=0.385\textwidth]{img/margin.jpg} 318 | \end{figure} 319 | 320 | \newpage 321 | 322 | \subsection{SVMs Primal Problem} 323 | 324 | We wish to optimize the parameters $w$ and $b$ in order to maximize the minimum margin among all data points: 325 | 326 | \begin{equation*} 327 | \max_{w,b} \ \min_{n} \dfrac{\lvert y(x_{n}) \rvert}{\norm{w}} 328 | \end{equation*} 329 | 330 | We can take the factor $\dfrac{1}{\norm{w}}$ outside of the optimization over $n$ because it does not depend on $n$. Moreover, for correctly classified points we can write $\lvert y(x_{n}) \rvert = t_{n}y(x_{n})$, where $t_{n} \in \{-1, +1\}$ is the target label: 331 | 332 | \begin{equation*} 333 | \max_{w,b} \dfrac{1}{\norm{w}} \ \min_{n} [t_{n} (w^T \phi(x_{n}) +b) ] 334 | \end{equation*} 335 | 336 | Direct solution of this optimization problem would be very complex (non-convex), and so we shall convert it into an equivalent problem that is much easier to solve.
337 | 338 | To do this, we note that if we make the rescaling $w' = kw$ and $b' = kb$, the margin remains unchanged: 339 | \begin{equation*} 340 | r' = \dfrac{t_{n}((w')^T \phi(x_{n}) +b')}{\norm{w'}} = \dfrac{k t_{n} (w^T \phi(x_{n}) + b)}{k \norm{w}} = r 341 | \end{equation*} 342 | 343 | We can use this freedom to set, for the point that is closest to the decision surface: 344 | \begin{equation*} 345 | t_{n} (w^T \phi(x_{n}) +b) = 1 346 | \end{equation*} 347 | 348 | In this case all data points will have to satisfy the constraints: 349 | 350 | \begin{equation*} 351 | t_{i} (w^T \phi(x_{i}) +b) \geq 1 \ \ i = 1,\dotsc,N 352 | \end{equation*} 353 | 354 | Thus we can reduce the problem to: 355 | \begin{equation*} 356 | \begin{aligned} 357 | & \underset{w,b}{\text{max}} 358 | & & \dfrac{1}{\norm{w}}\\ 359 | & \text{subject to} 360 | & & 1 - t_{i} (w^T \phi(x_{i}) +b) \leq 0, \; i = 1, \ldots, N \\ 361 | \end{aligned} 362 | \end{equation*} 363 | 364 | Furthermore, maximizing ${\norm{w}^{-1}}$ is equivalent to minimizing $ \norm{w}^2$. We include a factor $1/2$ for later convenience: 365 | \begin{equation*} 366 | \begin{aligned} 367 | & \underset{w,b}{\text{min}} 368 | & & \dfrac{1}{2} \norm{w}^2\\ 369 | & \text{subject to} 370 | & & 1 - t_{i} (w^T \phi(x_{i}) +b) \leq 0, \; i = 1, \ldots, N \\ 371 | \end{aligned} 372 | \end{equation*} 373 | 374 | Two important observations: 375 | \begin{itemize} 376 | \item It appears that the bias parameter $b$ has disappeared from the optimization. However, it is determined implicitly via the constraints. 377 | \item The solution to a QP problem in $M$ variables has computational complexity that is $O(M^3)$. Thus, the primal problem is only feasible if we constrain ourselves to a fixed set of basis functions (small $M$). 378 | \end{itemize} 379 | 380 | \newpage 381 | 382 | \subsection{SVMs Dual Problem} 383 | 384 | First things first, we should check whether Slater's condition is satisfied (remember that Slater's condition is sufficient for \textbf{strong duality}).\\ 385 | Let $w^*,b^*$ be an optimal solution and $\lambda > 1$; then $\lambda w^*, \lambda b^*$ is strictly feasible: 386 | 387 | \begin{equation*} 388 | t_{i}(\lambda w^{*T} \phi(x_{i}) + \lambda b^*) = t_{i} \lambda ( w^{*T} \phi(x_{i}) + b^*) > t_{i} ( w^{*T} \phi(x_{i}) + b^*) \geq 1, \; i = 1, \ldots, N \\ 389 | \end{equation*} 390 | 391 | Since strong duality holds, we can solve the dual problem: 392 | 393 | \begin{equation*} 394 | \begin{aligned} 395 | & \underset{a}{\text{max}} 396 | & \underset{w,b}{\text{min}} 397 | & \ \mathcal{L}(w,b,a) \\ 398 | & \text{subject to} 399 | & & a_{i} \geq 0, \; i = 1, \ldots, N \ 400 | \end{aligned} 401 | \end{equation*} 402 | 403 | where we define the Lagrangian $\mathcal{L}(w,b,a) = \dfrac{1}{2} \norm{w}^2 + \sum_{i=1}^{N} a_i\{ 1- t_{i}(w^T \phi(x_{i}) +b) \} $ 404 | 405 | Setting the derivatives of $\mathcal{L}(w,b,a)$ with respect to $w$ and $b$ equal to zero, we obtain the following two conditions: 406 | 407 | \begin{align} 408 | w = \sum_{i=1}^{N} a_{i}t_{i}\phi(x_{i}) \\ 409 | \sum_{i=1}^{N} a_{i}t_{i} = 0 410 | \end{align} 411 | 412 | 413 | Now we substitute the conditions back into $\mathcal{L}(w,b,a)$: 414 | 415 | \begin{equation*} 416 | \mathcal{L}(w,b,a) = \dfrac{1}{2} \norm{w}^2 + \sum_{i=1}^{N} a_{i} 417 | - \overbrace{\sum_{i=1}^{N} a_{i}t_{i}b}^\text{=0} - \sum_{i=1}^{N} a_{i}t_{i} w^T \phi(x_{i}) 418 | \end{equation*} 419 | 420 | Substituting (7.2) we obtain: 421 | 422 | \begin{equation*} 423 | \mathcal{L}(a) = \dfrac{1}{2} \ \Big[\sum_{i=1}^{N} a_{i}t_{i}\phi(x_{i})\Big]^T \Big[\sum_{j=1}^{N} a_{j}t_{j}\phi(x_{j})\Big] + \sum_{i=1}^{N} a_{i} - \sum_{i=1}^{N} a_{i} t_{i} \Big[\sum_{j=1}^{N} a_{j}t_{j}\phi(x_{j})\Big]^T \phi(x_{i}) 424 | \end{equation*} 425 | 426 | Expanding the products: 427 | \begin{equation*} 428 | \mathcal{L}(a) = \dfrac{1}{2} \ \sum_{i=1}^{N} { \sum_{j=1}^{N} a_{i} a_{j} t_{i} t_{j} \phi(x_{i})^T \phi(x_{j})} 429 | + \sum_{i=1}^{N} a_{i} - 430 | \sum_{i=1}^{N} { \sum_{j=1}^{N} a_{i} a_{j} t_{i} t_{j} \phi(x_{i})^T \phi(x_{j})} 431 | \end{equation*} 432 | \newpage 433 | This gives the dual representation of the SVM problem: 434 | 435 | \begin{equation*} 436 | \begin{aligned} 437 | & \underset{a}{\text{max}} 438 | & & \sum_{i=1}^{N} a_{i} - \dfrac{1}{2} 439 | \sum_{i=1}^{N} { \sum_{j=1}^{N} a_{i} a_{j} t_{i} t_{j} \phi(x_{i})^T \phi(x_{j})}\\ 440 | & \text{subject to} 441 | & & a_{i} \geq 0, \; i = 1, \ldots, N \\ 442 | &&& \sum_{i=1}^{N} a_{i}t_{i} = 0, 443 | \end{aligned} 444 | \end{equation*} 445 | 446 | Some important observations: 447 | \begin{itemize} 448 | \item Note that now the time complexity for the QP solver is $O(N^3)$, so it does not depend on the choice of basis functions. For a fixed set of basis functions whose number $M$ is smaller than the number $N$ of data points, the dual problem appears disadvantageous. However, the dual problem makes it feasible to apply SVMs to feature spaces whose dimensionality exceeds the number of data points, including infinite feature spaces. 449 | \item In order to classify new data points using the trained model, we evaluate the sign of $y(x)$: 450 | \begin{equation*} 451 | y(x) = \sum_{i=1}^{N} a_{i}t_{i} \phi(x_i)^T \phi(x) +b 452 | \end{equation*} 453 | \item Remember that strong duality $\implies$ the KKT conditions are satisfied. Hence, the following condition must hold: 454 | 455 | \begin{equation*} 456 | a_{i}(t_{i}y(x_{i})-1) = 0, \; i = 1, \ldots, N 457 | \end{equation*} 458 | 459 | Any data point for which $a_{i} = 0$ plays no role in making predictions for new data points. The remaining data points correspond to points that lie on the maximum margin hyperplanes in feature space and they are called support vectors. This property is central to the practical applicability of SVMs: once the model is trained, a significant proportion of the data points can be discarded. 460 | 461 | \end{itemize} 462 | 463 | 464 | \section{Soft Margin SVM} 465 | So far, we have assumed that the training data points are linearly separable in the feature space. In practice, however, the class conditional distributions may overlap, in which case exact separation of the training data can lead to poor generalization. We therefore need a way to modify support vector machines so as to allow some of the training points to be misclassified. \\ 466 | 467 | To do this, we introduce slack variables $\xi_{i} \geq 0$ where $i=1,\ldots,N$. These are defined by $\xi_{i} = 0$ for data points that are on or inside the correct margin boundary and $\xi_{i}= \lvert t_{i} - y(x_{i}) \rvert$ for other points. Thus a data point $x_{i}$ that is on the decision boundary $y(x_{i})=0$ will have $\xi_{i} = 1$, and points with $\xi_{i} > 1$ will be misclassified.
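To make the definition of the slack variables concrete, the following toy computation (ours; the decision values $y(x_{i})$ are made up) assigns $\xi_{i}$ to a few points according to the rule above:
\begin{verbatim}
import numpy as np

t = np.array([+1, +1, +1, +1, -1])         # target labels
y = np.array([1.5, 1.0, 0.4, -0.2, 0.3])   # decision values y(x_i) (assumed)

# xi = 0 for points on or inside the correct margin boundary (t*y >= 1),
# xi = |t - y| for all other points
xi = np.where(t * y >= 1.0, 0.0, np.abs(t - y))
print(xi)  # [0.  0.  0.6 1.2 1.3]
\end{verbatim}
The third point lies inside the margin but is still correctly classified ($0 < \xi_{i} \leq 1$), while the last two points have $\xi_{i} > 1$ and are misclassified.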
\\ 468 | 469 | \begin{figure}[h] 470 | \caption{Illustration of the slack variables.} 471 | \centering 472 | \includegraphics[width=0.39\textwidth]{img/slack.png} 473 | \end{figure} 474 | 475 | The exact classification constraints are then replaced with: 476 | 477 | \begin{align*} 478 | t_{i} y(x_{i}) \geq 1 - \xi_{i} \ \ i=1,\ldots,N \\ 479 | \xi_{i} \geq 0 \ \ i=1,\ldots,N\\ 480 | \end{align*} 481 | 482 | Our goal is now to maximize the margin while softly penalizing points that lie on the wrong side of the margin boundary. We therefore minimize: 483 | 484 | \begin{equation*} 485 | \begin{aligned} 486 | & \underset{w,b,\xi}{\text{min}} 487 | & & \dfrac{1}{2} \norm{w}^2 + C \sum_{i=1}^{N} \xi_{i}\\ 488 | & \text{subject to} 489 | & & 1 - \xi_{i} - t_{i} (w^T \phi(x_{i}) +b) \leq 0, \; i = 1, \ldots, N \\ 490 | &&& - \xi_{i} \leq 0, \; i = 1, \ldots, N 491 | \end{aligned} 492 | \end{equation*} 493 | 494 | The parameter $C > 0$ controls the trade-off between the slack variable penalty and the margin (training error vs model complexity). In the limit $C \xrightarrow{} \infty $ we will recover the "hard" margin SVM. 495 | 496 | \subsection{Dual Problem} 497 | It is trivial to show that Slater's condition holds since the constraints are affine functions. Therefore, strong duality holds and we can move to the Lagrangian: 498 | 499 | \begin{equation*} 500 | \mathcal{L}(w,b,\xi,a,u) = 501 | \dfrac{1}{2} \norm{w}^2 + C \sum_{i=1}^{N} \xi_{i} + \sum_{i=1}^{N} a_{i} \{ 1 - \xi_{i} - t_{i} (w^T \phi(x_{i}) +b) \} 502 | - \sum_{i=1}^{N} u_{i}\xi_{i} 503 | \end{equation*} 504 | 505 | Furthermore, the corresponding KKT conditions are: 506 | \begin{align} 507 | a_{n} \geq 0 \\ 508 | t_{n}y(x_{n}) -1 +\xi_{n} \geq 0 \\ 509 | a_{n}(t_{n}y(x_{n}) -1 +\xi_{n} ) = 0 510 | \\ 511 | u_{n} \geq 0 \\ 512 | \xi_{n} \geq 0 \\ 513 | u_{n}\xi_{n} = 0 514 | \end{align} 515 | 516 | 517 | We now optimize out $w,b,\xi_{i}$: 518 | 519 | \begin{align*} 520 | \frac{\partial{\mathcal{L}}}{\partial{w}} = 0 \implies w = \sum_{i=1}^{N} a_{i}t_{i}\phi(x_{i}) \\ \frac{\partial{\mathcal{L}}}{\partial{b}} = 0 \implies \sum_{i=1}^{N} a_{i}t_{i} = 0 \\ 521 | \frac{\partial{\mathcal{L}}}{\partial{\xi_{i}}} = 0 \implies 522 | a_{i} = C - u_{i} 523 | \end{align*} 524 | 525 | Using the previous results from "hard" margin SVM together with the above results, we can eliminate $w,b,\xi_{i}$ from the Lagrangian. 
We obtain the dual Lagrangian function, which is identical to the "hard" margin one, except that the constraints are somewhat different: 526 | 527 | 528 | \begin{align*} 529 | \mathcal{L}(\xi,a,u) = \sum_{i=1}^{N} a_{i} - \dfrac{1}{2} 530 | \sum_{i=1}^{N} { \sum_{j=1}^{N} a_{i} a_{j} t_{i} t_{j} \phi(x_{i})^T \phi(x_{j})} + C \sum_{i=1}^{N} \xi_{i} - \sum_{i=1}^{N} a_{i} \xi_{i} - 531 | \sum_{i=1}^{N} u_{i} \xi_{i} \\ 532 | \mathcal{L}(\xi,a,u) = \sum_{i=1}^{N} a_{i} - \dfrac{1}{2} 533 | \sum_{i=1}^{N} { \sum_{j=1}^{N} a_{i} a_{j} t_{i} t_{j} \phi(x_{i})^T \phi(x_{j})} + \overbrace{\sum_{i=1}^{N} \xi_{i} (a_{i} + u_{i}) - \sum_{i=1}^{N} a_{i} \xi_{i} - 534 | \sum_{i=1}^{N} u_{i} \xi_{i}}^\text{=0} 535 | \end{align*} 536 | 537 | This gives the dual representation of the soft margin SVM problem: 538 | \begin{equation*} 539 | \begin{aligned} 540 | & \underset{a}{\text{max}} 541 | & & \sum_{i=1}^{N} a_{i} - \dfrac{1}{2} 542 | \sum_{i=1}^{N} { \sum_{j=1}^{N} a_{i} a_{j} t_{i} t_{j} \phi(x_{i})^T \phi(x_{j})}\\ 543 | & \text{subject to} 544 | & & 0 \leq a_{i} \leq C , \; i = 1, \ldots, N \\ 545 | &&& \sum_{i=1}^{N} a_{i}t_{i} = 0 546 | \end{aligned} 547 | \end{equation*} 548 | 549 | Where the first constraint comes from the fact that $a_{i} = C - u_{i}$ and $a_{i} \geq 0, u_{i} \geq 0$ must be satisfied (KKT). 550 | \\ 551 | 552 | We can now interpret the resulting solution: 553 | \begin{itemize} 554 | \item data points for which $a_{i} = 0$: they do not contribute to the predictive model. 555 | \item data points for which $0 < a_{i} < C$: lie on the margin, since $a_{i} < C \implies u_{i} > 0 \implies \xi_{i} = 0$ \\ where the last implication comes from complementary slackness (7.9) 556 | \item data points for which $a_{i} = C$: lie inside the margin and can either be correctly classified if $\xi_{i} \leq 1$ or misclassified if $\xi_{i} > 1$ 557 | \end{itemize} 558 | \newpage 559 | 560 | \end{document} 561 | -------------------------------------------------------------------------------- /L8_Structured_SVM.tex: -------------------------------------------------------------------------------- 1 | \documentclass[twoside]{article} 2 | \setlength{\oddsidemargin}{0.25 in} 3 | \setlength{\evensidemargin}{-0.25 in} 4 | \setlength{\topmargin}{-0.6 in} 5 | \setlength{\textwidth}{6.5 in} 6 | \setlength{\textheight}{8.5 in} 7 | \setlength{\headsep}{0.75 in} 8 | \setlength{\parindent}{0 in} 9 | \setlength{\parskip}{0.1 in} 10 | \newcommand{\eqdef}{:\mathrel{\mathop=}} 11 | \newcommand{\norm}[1]{\left\lVert #1 \right\rVert} 12 | 13 | % 14 | % ADD PACKAGES here: 15 | % 16 | \usepackage{algorithm} 17 | \usepackage[noend]{algpseudocode} 18 | \usepackage{amsmath,amsfonts,graphicx} 19 | \usepackage{amssymb} 20 | 21 | % 22 | % The following commands set up the lecnum (lecture number) 23 | % counter and make various numbering schemes work relative 24 | % to the lecture number. 25 | % 26 | \newcounter{lecnum} 27 | \renewcommand{\thepage}{\thelecnum-\arabic{page}} 28 | \renewcommand{\thesection}{\thelecnum.\arabic{section}} 29 | \renewcommand{\theequation}{\thelecnum.\arabic{equation}} 30 | \renewcommand{\thefigure}{\thelecnum.\arabic{figure}} 31 | \renewcommand{\thetable}{\thelecnum.\arabic{table}} 32 | 33 | % 34 | % The following macro is used to generate the header.
35 | % 36 | \newcommand{\lecture}[4]{ 37 | \pagestyle{myheadings} 38 | \thispagestyle{plain} 39 | \newpage 40 | \setcounter{lecnum}{#1} 41 | \setcounter{page}{1} 42 | \noindent 43 | \begin{center} 44 | \framebox{ 45 | \vbox{\vspace{2mm} 46 | \hbox to 6.28in { {\bf Advanced Machine Learning 47 | \hfill Fall 2021} } 48 | \vspace{4mm} 49 | \hbox to 6.28in { {\Large \hfill Lecture #1: #2 \hfill} } 50 | \vspace{2mm} 51 | \hbox to 6.28in { {\it #3 \hfill #4} } 52 | \vspace{2mm}} 53 | } 54 | \end{center} 55 | \markboth{Lecture #1: #2}{Lecture #1: #2} 56 | 57 | {\bf Note}: {\it LaTeX template courtesy of UC Berkeley EECS dept.} 58 | 59 | {\bf Disclaimer}: {\it These notes are adapted from ETH's Advanced Machine Learning Course, Bishop's "Pattern Recognition and Machine Learning" book and Joachims et al.'s paper: "Predicting Structured Objects with Support Vector Machines"} 60 | \vspace*{4mm} 61 | } 62 | % 63 | % Convention for citations is authors' initials followed by the year. 64 | % For example, to cite a paper by Leighton and Maggs you would type 65 | % \cite{LM89}, and to cite a paper by Strassen you would type \cite{S69}. 66 | % (To avoid bibliography problems, for now we redefine the \cite command.) 67 | % Also commands that create a suitable format for the reference list. 68 | \renewcommand{\cite}[1]{[#1]} 69 | \def\beginrefs{\begin{list}% 70 | {[\arabic{equation}]}{\usecounter{equation} 71 | \setlength{\leftmargin}{2.0truecm}\setlength{\labelsep}{0.4truecm}% 72 | \setlength{\labelwidth}{1.6truecm}}} 73 | \def\endrefs{\end{list}} 74 | \def\bibentry#1{\item[\hbox{[#1]}]} 75 | 76 | %Use this command for a figure; it puts a figure in wherever you want it. 77 | %usage: \fig{NUMBER}{SPACE-IN-INCHES}{CAPTION} 78 | \newcommand{\fig}[3]{ 79 | \vspace{#2} 80 | \begin{center} 81 | Figure \thelecnum.#1:~#3 82 | \end{center} 83 | } 84 | % Use these for theorems, lemmas, proofs, etc. 85 | \newtheorem{theorem}{Theorem}[lecnum] 86 | \newtheorem{lemma}[theorem]{Lemma} 87 | \newtheorem{proposition}[theorem]{Proposition} 88 | \newtheorem{claim}[theorem]{Claim} 89 | \newtheorem{corollary}[theorem]{Corollary} 90 | \newtheorem{definition}[theorem]{Definition} 91 | \newenvironment{proof}{{\bf Proof:}}{\hfill\rule{2mm}{2mm}} 92 | 93 | % **** IF YOU WANT TO DEFINE ADDITIONAL MACROS FOR YOURSELF, PUT THEM HERE: 94 | 95 | \newcommand\E{\mathbb{E}} 96 | 97 | \begin{document} 98 | %FILL IN THE RIGHT INFO. 99 | %\lecture{**LECTURE-NUMBER**}{**DATE**}{**LECTURER**}{**SCRIBE**} 100 | \lecture{8}{Structured SVMs }{}{} 101 | %\footnotetext{These notes are partially based on those of Nigel Mansell.} 102 | 103 | % **** YOUR NOTES GO HERE: 104 | 105 | % Some general latex examples and examples making use of the 106 | % macros follow. 107 | %**** IN GENERAL, BE BRIEF. LONG SCRIBE NOTES, NO MATTER HOW WELL WRITTEN, 108 | %**** ARE NEVER READ BY ANYBODY. 
109 |
110 |
111 | \section{Non-Linear SVMs}
112 | The SVM soft-margin dual formulation is the following:
113 | \begin{equation*}
114 | \begin{aligned}
115 | & \underset{a}{\text{max}}
116 | & & \sum_{i=1}^{N} a_{i} - \dfrac{1}{2}
117 | \sum_{i=1}^{N} { \sum_{j=1}^{N} a_{i} a_{j} t_{i} t_{j} \phi(x_{i})^T \phi(x_{j})}\\
118 | & \text{subject to}
119 | & & 0 \leq a_{i} \leq C , \; i = 1, \ldots, N \\
120 | &&& \sum_{i=1}^{N} a_{i}t_{i} = 0
121 | \end{aligned}
122 | \end{equation*}
123 | For very high-dimensional feature spaces, the nonlinear transformation $\phi(x)$ might be
124 | too expensive to compute explicitly, even though all we actually require is the inner product $\langle \phi(x_{i}),\, \phi(x_{j}) \rangle$. A way to overcome this problem is to make use of \textbf{Kernel} functions.
125 |
126 |
127 | \begin{definition}
128 | A function $k : \mathbb{R}^d \times \mathbb{R}^d \rightarrow \mathbb{R}$ is said to be a kernel function if and only if it is an inner product $\langle \boldsymbol{\phi(x)}, \boldsymbol{\phi(x')} \rangle$ for some (possibly infinite dimensional) mapping $\boldsymbol{\phi(x)}$.
129 | \end{definition}
130 | \newpage
131 | \begin{definition}
132 | A function $k : \mathbb{R}^d \times \mathbb{R}^d \rightarrow \mathbb{R}$ is said to be symmetric positive semidefinite (PSD) if it is symmetric, i.e. $k(\boldsymbol{x}, \boldsymbol{x'}) = k(\boldsymbol{x'}, \boldsymbol{x})$, and if, for any integer $m > 0$ and any set of inputs $\boldsymbol{x_1,...,x_m }\in \mathbb{R}^d$, the following matrix is positive semi-definite:
133 | \begin{center}
134 | $\boldsymbol{K} =
135 | \begin{bmatrix}
136 | k(\boldsymbol{x_1}, \boldsymbol{x_1}) & \hdots & k(\boldsymbol{x_1}, \boldsymbol{x_m})\\
137 | \vdots & \ddots & \vdots\\
138 | k(\boldsymbol{x_m}, \boldsymbol{x_1}) & \hdots & k(\boldsymbol{x_m}, \boldsymbol{x_m})
139 | \end{bmatrix}
140 | \succeq \boldsymbol{0}
141 | $
142 | \end{center}
143 | This matrix, with $(i, j)$-th entry equal to $k(\boldsymbol{x_i}, \boldsymbol{x_j})$, is called the \textbf{Gram matrix}.
144 | \end{definition}
145 |
146 | \begin{theorem}
147 | The above two definitions are equivalent. That is, $k$ is a kernel function if and only if it is symmetric PSD.
148 | \end{theorem}
149 | \begin{proof}
150 | The “only if” part is easy to show (at least when $\phi$ is finite-dimensional): The inner product is certainly symmetric, and the Gram matrix can be written as $\boldsymbol{K = \Phi^\intercal\Phi}$, where $\boldsymbol{\Phi} \in \mathbb{R}^{\text{dim}(\boldsymbol{\Phi}) \times m}$ contains the $m$ feature vectors $\{\boldsymbol{\phi(x_t)}\}_{t = 1}^{m}$ as columns. The matrix $\boldsymbol{K = \Phi^\intercal\Phi}$ is trivially positive semidefinite, since for any $\boldsymbol{z}$ we have $\boldsymbol{z^\intercal\Phi^\intercal\Phi z} = \boldsymbol{\norm{\Phi z}^2} \geq 0$.\medskip
151 |
152 | The “if” part is more challenging and comes from \textit{Mercer's Theorem}.
\end{proof}\medskip
154 |
155 | \begin{theorem}{\textbf{Mercer's Theorem}\\}
156 | Recall: any positive definite matrix $\boldsymbol{K}$ can be represented using an eigendecomposition of the form $\boldsymbol{K = U^\intercal\Lambda U}$, where $\boldsymbol{\Lambda}$ is a diagonal matrix of eigenvalues $\lambda_i > 0$, and $\boldsymbol{U}$ is a matrix containing the eigenvectors.\\
157 | Now consider element $(i, j)$ of $\boldsymbol{K}$:
158 | \begin{equation*}
159 | k_{ij} = (\boldsymbol{\Lambda}^\frac{1}{2}\boldsymbol{U}_{:i})^\intercal(\boldsymbol{\Lambda}^\frac{1}{2}\boldsymbol{U}_{:j})
160 | \end{equation*}
161 | where $\boldsymbol{U}_{:i}$ is the \textit{i}'th column of $\boldsymbol{U}$. If we define $\boldsymbol{\phi(x_i)} = \boldsymbol{\Lambda}^\frac{1}{2}\boldsymbol{U}_{:i}$, then we can write:
162 | \begin{equation*}
163 | k_{ij} = \boldsymbol{\phi(x_i)}^\intercal \boldsymbol{\phi(x_j)} = \sum\limits_m \phi_m (\boldsymbol{x_i})\phi_m(\boldsymbol{x_j})
164 | \end{equation*}
165 | \end{theorem}
166 | Thus we see that the entries in the kernel matrix can be computed by performing an inner product of some feature vectors that are implicitly defined by the eigenvectors of the kernel matrix. This idea can be generalized to apply to kernel functions, not just kernel matrices.\\
167 | For example, consider the \textbf{quadratic kernel} $\mathcal{K}(\boldsymbol{x}, \boldsymbol{x'}) = \langle\boldsymbol{x}, \boldsymbol{x'}\rangle^2$. In 2d, we have:
168 | \begin{equation*}
169 | \mathcal{K}(\boldsymbol{x}, \boldsymbol{x'}) = (x_1x_1' + x_2x_2')^2 = x_1^2(x_1')^2 + 2x_1x_2x_1'x_2' + x_2^2(x_2')^2
170 | \end{equation*}
171 | We can write this as $\mathcal{K}(\boldsymbol{x}, \boldsymbol{x'}) = \boldsymbol{\phi(x)}^\intercal\boldsymbol{\phi(x')}$ if we define $\boldsymbol{\phi(x_1, x_2)} = [x_1^2, \sqrt{2}x_1x_2, x_2^2] \in \mathbb{R}^3$. So we embed the 2d inputs $\boldsymbol{x}$ into a 3d feature space $\boldsymbol{\phi(x)}$.\\
172 | Now consider the RBF kernel. In this case, the corresponding feature representation is infinite dimensional. However, by working with kernel functions, we can avoid having to deal with infinite dimensional vectors.
173 | \newpage
174 | \subsection{Kernel Engineering}
175 | Given two valid kernels $\mathcal{K}_1(\boldsymbol{x}, \boldsymbol{x'})$ and $\mathcal{K}_2(\boldsymbol{x}, \boldsymbol{x'})$, we can create a new kernel using any of the following methods:
176 | \begin{itemize}
177 | \item $\mathcal{K}(\boldsymbol{x}, \boldsymbol{x'}) = c\mathcal{K}_1(\boldsymbol{x}, \boldsymbol{x'})$, for any constant $c > 0$
178 | \item $\mathcal{K}(\boldsymbol{x}, \boldsymbol{x'}) = f(\boldsymbol{x})\mathcal{K}_1(\boldsymbol{x}, \boldsymbol{x'})f(\boldsymbol{x'})$, for any function $f$
179 | \item $\mathcal{K}(\boldsymbol{x}, \boldsymbol{x'}) = q(\mathcal{K}_1(\boldsymbol{x}, \boldsymbol{x'}))$, for any polynomial $q$ with non-negative coefficients
180 | \item $\mathcal{K}(\boldsymbol{x}, \boldsymbol{x'}) = \exp{(\mathcal{K}_1(\boldsymbol{x}, \boldsymbol{x'}))}$
181 | \item $\mathcal{K}(\boldsymbol{x}, \boldsymbol{x'}) = \boldsymbol{x}^\intercal\boldsymbol{Ax'}$, for any PSD matrix $\boldsymbol{A}$
182 | \end{itemize}
183 | For example, suppose we start with the linear kernel $\mathcal{K}(\boldsymbol{x}, \boldsymbol{x'}) = \boldsymbol{x^\intercal x'}$. We know this is a valid Mercer kernel, since the corresponding Gram matrix is just the (scaled) covariance matrix of the data.
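As a quick numerical sanity check of these composition rules (an illustration added here, not part of the original lecture material), one can sample a handful of points and verify that each engineered kernel yields a PSD Gram matrix. A minimal Python sketch, assuming NumPy is available; the sample points and the base kernels \texttt{k\_lin}, \texttt{k\_poly} are arbitrary toy choices:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((20, 2))           # 20 toy sample points in 2d

k_lin  = lambda x, y: x @ y                # linear kernel
k_poly = lambda x, y: (x @ y + 1.0) ** 2   # polynomial kernel, M = 2, c = 1

def gram(k, X):
    # Gram matrix with entry (i, j) equal to k(x_i, x_j)
    return np.array([[k(xi, xj) for xj in X] for xi in X])

candidates = [
    lambda x, y: 3.0 * k_lin(x, y),            # scaling by c > 0
    lambda x, y: k_lin(x, y) + k_poly(x, y),   # sum of kernels
    lambda x, y: k_lin(x, y) * k_poly(x, y),   # product of kernels
    lambda x, y: np.exp(k_lin(x, y)),          # exponentiation
]
for k in candidates:
    eigvals = np.linalg.eigvalsh(gram(k, X))   # Gram matrix is symmetric
    assert eigvals.min() > -1e-8               # PSD up to round-off error
\end{verbatim}
Of course, passing this check on a finite sample does not prove validity; it merely catches mistakes, since a single negative eigenvalue disproves it.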
From the above rules, we can see that the polynomial kernel $\mathcal{K}(\boldsymbol{x}, \boldsymbol{x'}) = (\boldsymbol{x^\intercal x'})^M$ is a valid Mercer kernel. This contains all monomials of order $M$. For example, if $M = 2$ and the inputs are 2d, we have:
184 | $$(\boldsymbol{x^\intercal x'})^2 = (x_1x_1' + x_2x_2')^2 = (x_1x_1')^2 + (x_2x_2')^2 + 2(x_1x_1')(x_2x_2')$$
185 | We can generalize this to contain all terms up to degree $M$ by using the kernel $\mathcal{K}(\boldsymbol{x}, \boldsymbol{x'}) = (\boldsymbol{x^\intercal x'} + c)^M$. For example, if $M = 2$ and the inputs are 2d, we have:
186 | $$(\boldsymbol{x^\intercal x'} + 1)^2 = (x_1x_1')^2 + (x_2x_2')^2 + 2(x_1x_1')(x_2x_2') + 2(x_1x_1') + 2(x_2x_2') + 1$$
187 | We can also use the above rules to establish that the Gaussian kernel is a valid kernel. To see this,
188 | note that:
189 | $$\norm{\boldsymbol{x} - \boldsymbol{x'}}^2 = \boldsymbol{x}^\intercal\boldsymbol{x} + (\boldsymbol{x'})^\intercal\boldsymbol{x'} - 2\boldsymbol{x}^\intercal\boldsymbol{x'}$$
190 | and hence
191 | $$\mathcal{K}(\boldsymbol{x}, \boldsymbol{x'}) = \exp{(-\norm{\boldsymbol{x} - \boldsymbol{x'}}^2/2\sigma^2)} = \exp{(-\boldsymbol{x}^\intercal\boldsymbol{x}/2\sigma^2)}\exp{(\boldsymbol{x}^\intercal\boldsymbol{x'}/\sigma^2)}\exp{(-(\boldsymbol{x'})^\intercal\boldsymbol{x'}/2\sigma^2)}$$
192 | is a valid kernel.
193 | \subsection{Combining kernels by addition and multiplication}
194 | We can also combine kernels using addition or multiplication:
195 | \begin{itemize}
196 | \item $\mathcal{K}(\boldsymbol{x}, \boldsymbol{x'}) = \mathcal{K}_1(\boldsymbol{x}, \boldsymbol{x'}) + \mathcal{K}_2(\boldsymbol{x}, \boldsymbol{x'})$
197 | \item $\mathcal{K}(\boldsymbol{x}, \boldsymbol{x'}) = \mathcal{K}_1(\boldsymbol{x}, \boldsymbol{x'}) \times \mathcal{K}_2(\boldsymbol{x}, \boldsymbol{x'})$
198 | \end{itemize}
199 | Multiplying two positive-definite kernels together always results in another positive-definite kernel. This is a way to get a conjunction of the individual properties of each kernel. In addition, adding two positive-definite kernels together always results in another positive-definite
200 | kernel. This is a way to get a disjunction of the individual properties of each kernel.
201 | \subsection{Kernels for structured inputs}
202 | Kernels are particularly useful when the inputs are structured objects, such as strings and graphs, since it is often hard to “featurize” variable-sized inputs. For example, we can define a string kernel which compares strings in terms of the number of n-grams they have in common.
203 | We can also define kernels on graphs. For example, the random walk kernel conceptually performs random walks on two graphs simultaneously, and then counts the number of paths that were produced by both walks.
204 |
205 |
206 |
207 | \section{Structured SVMs}
208 | Consider the problem of natural language parsing illustrated in Figure 8.1. A parser takes as input a natural
209 | language sentence, and the desired output is the parse tree
210 | decomposing the sentence into its constituents. How can we take, say,
211 | an SVM and learn a rule for predicting trees?
212 |
213 | \begin{figure}[h]
214 | \caption{Predicting trees in natural language parsing.}
215 | \centering
216 | \includegraphics[width=0.39\textwidth]{img/syn_tree.png}
217 | \end{figure}
218 |
219 |
220 | Obviously, this question arises not only for learning to
221 | predict trees, but similarly for a variety of other structured
222 | and complex outputs. Structured output prediction is the
223 | name for such learning tasks, where one aims at learning
224 | a function $h : X \rightarrow Y$ mapping inputs $x \in X$ to complex
225 | and structured outputs $y \in Y$.
226 | \\ \\ On an abstract level, a structured prediction task is much like
227 | a multi-class learning task. Each possible structure $y \in Y$
228 | (e.g. parse tree) corresponds to one class (see Figure 8.2), and
229 | classifying a new example $x$ amounts to predicting its correct
230 | “class”.
231 |
232 | \begin{figure}[h]
233 | \caption{Structured output prediction as a multiclass problem.}
234 | \centering
235 | \includegraphics[width=0.39\textwidth]{img/syn_out.png}
236 | \end{figure}
237 |
238 | While the following derivation of structural SVMs
239 | starts from multi-class SVMs, there are three key problems
240 | that need to be overcome. \textbf{All of these problems arise from
241 | the huge number $\mathbf{|Y|}$ of classes}.
242 | \newpage
243 |
244 | \subsection{Problem 1: Structural SVM Formulation}
245 | We start the derivation of the structural SVM from the multi-class SVM. These multi-class SVMs use one weight vector $\mathbf{w_{y}}$ for each class $\mathbf{y}$. Each input
246 | $\mathbf{x}$ now has a score for each class $\mathbf{y}$ via $\mathbf{f(x,y) = w_{y} \phi(x)}$.
247 | Here $\mathbf{\phi(x)}$ is a vector of binary or numeric features extracted
248 | from $\mathbf{x}$. Thus, every feature will have an additively weighted
249 | influence in the modeled compatibility between inputs $\mathbf{x}$ and
250 | classes $\mathbf{y}$. To classify $\mathbf{x}$, the prediction rule $\mathbf{h(x)}$ then simply
251 | chooses the highest-scoring class:
252 |
253 | \begin{equation}
254 | h(x) = \arg\max_{y \in Y} f(x,y)
255 | \end{equation}
256 |
257 | as the predicted output. This will result in the correct prediction $\mathbf{y_i}$ for input $\mathbf{x_i}$ provided the weights $\mathbf{w = (w_{1}, . . . , w_{k})}$
258 | have been chosen such that the inequalities $\mathbf{f(x_i, y) < f(x_i, y_i)}$
259 | hold for all incorrect outputs $\mathbf{y \not = y_{i}}$. This leads to the following hard-margin training problem: \\
260 |
261 | \begin{equation}
262 | \begin{aligned}
263 | & \underset{\textbf{w}}{\text{min}}
264 | & & \dfrac{1}{2} \norm{w}^2 \\
265 | & \text{subject to}
266 | & & f(x_{i},y_{i}) - f(x_{i},y) = (w_{y_i} - w_y)\cdot \phi(x_i) \geq 1 , \;\forall i, y \not = y_{i}
267 | \end{aligned}
268 | \end{equation}
269 |
270 | The first challenge in using (8.2) for structured outputs is that, while there is generalization across inputs $\mathbf{x}$, there is no generalization across outputs. This is due to having a
271 | separate weight vector $\mathbf{w_y}$ for each class $\mathbf{y}$. Furthermore,
272 | since the number of possible outputs can become very large (or infinite), naively reducing structured output prediction
273 | to multi-class classification leads to an undesirable blow-up
274 | in the overall number of parameters and in the overall number of inequalities, which is $\mathbf{n(k-1)}$.
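To make the multi-class setup concrete, here is a minimal sketch of the prediction rule (8.1) with one weight vector per class (Python with NumPy; the dimensions and the random weights are toy values chosen purely for illustration):
\begin{verbatim}
import numpy as np

D, k = 4, 3                                 # input dimension, number of classes
rng = np.random.default_rng(0)
W = rng.standard_normal((k, D))             # one weight vector w_y per class y

def phi(x):
    return x                                # identity feature map, for illustration

def h(x):
    scores = W @ phi(x)                     # f(x, y) = w_y . phi(x) for every y
    return int(np.argmax(scores))           # prediction rule (8.1)

x = rng.standard_normal(D)
print(h(x))                                 # index of the highest-scoring class
\end{verbatim}
Note that \texttt{W} holds $k \cdot D$ parameters, which is exactly the blow-up just discussed once $k = |Y|$ becomes huge.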
275 |
276 | The key idea in overcoming these problems is to extract
277 | features from input-output pairs using a so-called joint feature map $\mathbf{\Psi(x, y)}$ instead of $\mathbf{\Phi(x)}$. These joint features will allow us to generalize across outputs and to define meaningful scores even
278 | for outputs that were never actually observed in the training data. At the same time, since we will define compatibility
279 | functions via $\mathbf{f(x, y) = w\cdot \Psi(x, y)}$, the number of parameters will simply equal the number of features extracted via
280 | $\mathbf{\Psi}$, which need not depend on $\mathbf{|Y|}$. One can then use the formulation in (8.2) with the more flexible definition of $\mathbf{f}$ via $\mathbf{\Psi}$ to
281 | arrive at the following (hard-margin) optimization problem:
282 |
283 | \begin{equation*}
284 | \begin{aligned}
285 | & \underset{\textbf{w}}{\text{min}}
286 | & & \dfrac{1}{2} \norm{w}^2 \\
287 | & \text{subject to}
288 | & & w \cdot \Psi(x_{i},y_{i}) - w \cdot \Psi(x_{i},y) \geq 1 , \;\forall i, y \not = y_{i}
289 | \end{aligned}
290 | \end{equation*}
291 |
292 | In other words, find a weight vector $\mathbf{w}$ of an input-output compatibility function $\mathbf{f}$ that is linear in some joint feature map $\mathbf{\Psi}$
293 | so that on each training example it scores the correct output
294 | higher by a fixed margin than every alternative output, while
295 | having low complexity (i.e. small norm $\norm{w}$). Note that the
296 | number of linear constraints is still $n(|Y| -1)$. The design of the features $\mathbf{\Psi}$ is problem-specific, and it is a
297 | strength of the developed methods to allow for a great deal of flexibility in how to choose it.
298 |
299 | \subsection{Problem 2: Inconsistent Training Data}
300 | So far we have tacitly assumed that the optimization problem has a solution, i.e. there exists a weight vector
301 | that simultaneously fulfills all margin constraints. In practice this may not be the case, either because the training
302 | data is inconsistent or because our model class is not powerful enough. If we allow for mistakes, though, we must
303 | be able to quantify the degree of mismatch between a prediction and the correct output, since usually different incorrect predictions vary in quality. This is exactly the role
304 | played by a loss function, formally $ \Delta : Y \times Y \rightarrow \mathbb{R}$, where
305 | $\mathbf{\Delta(y_{i},y)}$ is the loss (or cost) for predicting $\mathbf{y}$, when the correct output is $\mathbf{y_{i}}$. Like the choice of $\mathbf{\Psi}$, defining $\mathbf{\Delta}$
306 | is problem-specific.
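For instance, in sequence labeling a common choice is the Hamming loss, which counts the positions at which the predicted and the correct label sequences disagree. A minimal Python sketch (the tag strings below are made-up toy values):
\begin{verbatim}
def hamming_loss(y_true, y_pred):
    # Delta(y_i, y): number of positions where the label sequences disagree
    assert len(y_true) == len(y_pred)
    return sum(a != b for a, b in zip(y_true, y_pred))

print(hamming_loss("NNVDN", "NNVVN"))       # -> 1 mismatched tag
\end{verbatim}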
Incorporating the loss $\Delta$ into the margin constraints and allowing slack variables $\xi_{i}$, one obtains the following quadratic program: 307 |
308 |
309 | \begin{equation*}
310 | \begin{aligned}
311 | & \underset{\textbf{w}, \xi_{i} \geq 0}{\text{min}}
312 | & & \dfrac{1}{2} \norm{w}^2 + \dfrac{C}{n} \sum_{i=1}^{n} \xi_{i} \\
313 | & \text{subject to}
314 | & & w \cdot \Psi(x_{i},y_{i}) - w \cdot \Psi(x_{i},y) \geq \Delta(y_{i},y) - \xi_{i} , \;\forall i, y \not = y_{i}
315 | \end{aligned}
316 | \end{equation*}
317 |
318 | Note that we added the $\mathbf{\dfrac{1}{n}}$ factor; this guarantees that the average slack upper-bounds the empirical risk:
319 |
320 | \begin{theorem}
321 | If $w^*,\xi^*$ are optimal, then the empirical risk of $w^*$ with respect to $\mathbf{\Delta}$ is bounded by the average slack:
322 | \begin{equation*}
323 | \dfrac{1}{n} \sum_{i=1}^{n}\Delta(y_{i},h_{w^*}(x_{i})) \leq \dfrac{1}{n} \sum_{i=1}^{n} \xi_{i}^*
324 | \end{equation*}
325 | \end{theorem}
326 | \begin{proof}
327 | It suffices to prove that $\Delta(y_{i},h_{w^*}(x_{i})) \leq \xi_{i}^*, \; \forall i$ \\
328 |
329 | If $h_{w^*}(x_{i}) = y_{i}$, then $\Delta(y_{i},h_{w^*}(x_{i})) = 0 \leq \xi_{i}^*$ \\
330 |
331 | If $h_{w^*}(x_{i}) = y \not = y_{i}$, then:
332 | \begin{align*}
333 | w^{*T}\cdot \Psi(x_{i},y_{i}) \leq w^{*T} \cdot \Psi(x_{i},y) \implies w^{*T}\cdot \Psi(x_{i},y_{i}) - w^{*T} \cdot \Psi(x_{i},y) = \delta \leq 0\\ \delta + \xi_{i}^* \geq \Delta(y_{i},y) \implies \xi_{i}^* \geq \Delta(y_{i},y) , \;\forall i \; \text{(from the optimization constraint, since $\delta \leq 0$)}
334 | \end{align*}
335 | \end{proof}
336 |
337 |
338 | \subsection{Problem 3: Efficient Training}
339 |
340 | Last, but not least, we need a training algorithm that finds
341 | the optimal $\mathbf{w}$ solving the quadratic program. Since
342 | there is a constraint for every incorrect label $\mathbf{y}$, we cannot
343 | enumerate all constraints and simply give the optimization problem to a standard QP solver. Instead, we propose to use the cutting-plane method of Algorithm 1. The key idea is to iteratively construct
344 | a working set of constraints $\mathbf{W}$ that is equivalent to the full
345 | set of constraints up to a specified precision $\mathbf{\epsilon}$.
346 |
347 | \begin{figure}[h]
348 | \centering
349 | \includegraphics[width=0.52\textwidth]{img/struct_algo.png}
350 | \end{figure}
351 |
352 |
353 | Starting
354 | with an empty $\mathbf{W}$ and $\mathbf{w=0}$, Algorithm 1 iterates through
355 | the training examples. For each example, the $\mathbf{argmax}$ in Line 5 finds the most violated constraint of the quadratic
356 | program. If this constraint is violated by more than $\mathbf{\epsilon}$ (Line 6), it is added to the working set $\mathbf{W}$ in Line 7 and a
357 | new $\mathbf{w}$ is computed by solving the quadratic program over
358 | the new $\mathbf{W}$ (Line 8). The algorithm stops and returns the
359 | current $\mathbf{w}$ if $\mathbf{W}$ did not change between iterations.\\
360 |
361 | \textbf{But how long does
362 | it take to terminate?} It can be shown that Algorithm 1
363 | always terminates in a polynomial number of iterations that is independent of the cardinality of the output space $|Y|$. In
364 | fact, a refined version of Algorithm 1 always terminates after adding at most $O(C\epsilon^{-1})$ constraints to $W$. Note that the number of constraints
365 | is not only independent of $|Y|$, but also independent of the
366 | number of training examples $n$, which makes it an attractive
367 | training algorithm even for conventional SVMs.
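The working-set loop of Algorithm 1 can be sketched as follows (a Python sketch, not a fixed API: \texttt{psi}, \texttt{delta}, \texttt{loss\_aug\_argmax}, and \texttt{solve\_qp} are placeholders for the problem-specific joint feature map, loss, inference routine, and QP solver):
\begin{verbatim}
import numpy as np

def cutting_plane_train(examples, psi, delta, loss_aug_argmax, solve_qp, C, eps):
    # examples: list of (x_i, y_i) pairs
    # psi(x, y): joint feature map, returns a NumPy vector
    # delta(y, y_hat): structured loss
    # loss_aug_argmax(w, x, y): most violated output for example (x, y)
    # solve_qp(working_set, C): re-optimizes (w, xi) over the working set
    n = len(examples)
    working_set = []                            # W in Algorithm 1
    w = np.zeros_like(psi(*examples[0]))        # start from w = 0
    xi = np.zeros(n)                            # slack variables
    while True:
        changed = False
        for i, (x, y) in enumerate(examples):
            y_hat = loss_aug_argmax(w, x, y)               # Line 5
            margin = w @ (psi(x, y) - psi(x, y_hat))
            if delta(y, y_hat) - margin > xi[i] + eps:     # Line 6
                working_set.append((i, y_hat))             # Line 7
                w, xi = solve_qp(working_set, C)           # Line 8
                changed = True
        if not changed:                         # W unchanged: done
            return w
\end{verbatim}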
368 | While the number of iterations is small, the $\mathbf{argmax}$ in
369 | Line 5 might be expensive to compute. In general, this is
370 | true, but note that this $\mathbf{argmax}$ is closely related to the
371 | $\mathbf{argmax}$ for computing a prediction $\mathbf{h(x)}$. It is therefore
372 | called the “loss-augmented” inference problem, and often the
373 | prediction algorithm can be adapted to efficiently solve the
374 | loss-augmented inference problem as well.
375 |
376 | \end{document}
377 |
-------------------------------------------------------------------------------- /L9_Ensemble_Methods.tex: -------------------------------------------------------------------------------- 1 | \documentclass[twoside]{article}
2 | \setlength{\oddsidemargin}{0.25 in}
3 | \setlength{\evensidemargin}{-0.25 in}
4 | \setlength{\topmargin}{-0.6 in}
5 | \setlength{\textwidth}{6.5 in}
6 | \setlength{\textheight}{8.5 in}
7 | \setlength{\headsep}{0.75 in}
8 | \setlength{\parindent}{0 in}
9 | \setlength{\parskip}{0.1 in}
10 | \newcommand{\eqdef}{:\mathrel{\mathop=}}
11 | \newcommand{\norm}[1]{\left\lVert #1 \right\rVert}
12 |
13 | %
14 | % ADD PACKAGES here:
15 | %
16 |
17 | \usepackage{amsmath,amsfonts,graphicx}
18 |
19 | %
20 | % The following commands set up the lecnum (lecture number)
21 | % counter and make various numbering schemes work relative
22 | % to the lecture number.
23 | %
24 | \newcounter{lecnum}
25 | \renewcommand{\thepage}{\thelecnum-\arabic{page}}
26 | \renewcommand{\thesection}{\thelecnum.\arabic{section}}
27 | \renewcommand{\theequation}{\thelecnum.\arabic{equation}}
28 | \renewcommand{\thefigure}{\thelecnum.\arabic{figure}}
29 | \renewcommand{\thetable}{\thelecnum.\arabic{table}}
30 |
31 | %
32 | % The following macro is used to generate the header.
33 | %
34 | \newcommand{\lecture}[4]{
35 | \pagestyle{myheadings}
36 | \thispagestyle{plain}
37 | \newpage
38 | \setcounter{lecnum}{#1}
39 | \setcounter{page}{1}
40 | \noindent
41 | \begin{center}
42 | \framebox{
43 | \vbox{\vspace{2mm}
44 | \hbox to 6.28in { {\bf Advanced Machine Learning
45 | \hfill Fall 2021} }
46 | \vspace{4mm}
47 | \hbox to 6.28in { {\Large \hfill Lecture #1: #2 \hfill} }
48 | \vspace{2mm}
49 | \hbox to 6.28in { {\it #3 \hfill #4} }
50 | \vspace{2mm}}
51 | }
52 | \end{center}
53 | \markboth{Lecture #1: #2}{Lecture #1: #2}
54 |
55 | {\bf Note}: {\it LaTeX template courtesy of UC Berkeley EECS dept.}
56 |
57 | {\bf Disclaimer}: {\it These notes are adapted from ETH's Advanced Machine Learning Course, Cornell's CS4780 Course, "The Element of Statistical Learning", "Boosting: Foundations and Algorithms" books and the Wikipedia page on the bias-variance tradeoff.}
58 | \vspace*{4mm}
59 | }
60 | %
61 | % Convention for citations is authors' initials followed by the year.
62 | % For example, to cite a paper by Leighton and Maggs you would type
63 | % \cite{LM89}, and to cite a paper by Strassen you would type \cite{S69}.
64 | % (To avoid bibliography problems, for now we redefine the \cite command.)
65 | % Also commands that create a suitable format for the reference list.
66 | \renewcommand{\cite}[1]{[#1]}
67 | \def\beginrefs{\begin{list}%
68 | {[\arabic{equation}]}{\usecounter{equation}
69 | \setlength{\leftmargin}{2.0truecm}\setlength{\labelsep}{0.4truecm}%
70 | \setlength{\labelwidth}{1.6truecm}}}
71 | \def\endrefs{\end{list}}
72 | \def\bibentry#1{\item[\hbox{[#1]}]}
73 |
74 | %Use this command for a figure; it puts a figure in wherever you want it.
75 | %usage: \fig{NUMBER}{SPACE-IN-INCHES}{CAPTION}
76 | \newcommand{\fig}[3]{
77 | \vspace{#2}
78 | \begin{center}
79 | Figure \thelecnum.#1:~#3
80 | \end{center}
81 | }
82 | % Use these for theorems, lemmas, proofs, etc.
83 | \newtheorem{theorem}{Theorem}[lecnum]
84 | \newtheorem{lemma}[theorem]{Lemma}
85 | \newtheorem{proposition}[theorem]{Proposition}
86 | \newtheorem{claim}[theorem]{Claim}
87 | \newtheorem{corollary}[theorem]{Corollary}
88 | \newtheorem{definition}[theorem]{Definition}
89 | \newenvironment{proof}{{\bf Proof:}}{\hfill\rule{2mm}{2mm}}
90 |
91 | % **** IF YOU WANT TO DEFINE ADDITIONAL MACROS FOR YOURSELF, PUT THEM HERE:
92 |
93 | \newcommand\E{\mathbb{E}}
94 |
95 | \begin{document}
96 | %FILL IN THE RIGHT INFO.
97 | %\lecture{**LECTURE-NUMBER**}{**DATE**}{**LECTURER**}{**SCRIBE**}
98 | \lecture{9}{Ensemble Methods}{}{}
99 | %\footnotetext{These notes are partially based on those of Nigel Mansell.}
100 |
101 | % **** YOUR NOTES GO HERE:
102 |
103 | % Some general latex examples and examples making use of the
104 | % macros follow.
105 | %**** IN GENERAL, BE BRIEF. LONG SCRIBE NOTES, NO MATTER HOW WELL WRITTEN,
106 | %**** ARE NEVER READ BY ANYBODY.
107 |
108 |
109 | \section{Bias-Variance Tradeoff}
110 | The bias–variance tradeoff is the property of a model whereby the variance of the parameter estimates across samples can be reduced by increasing the bias in the estimated parameters.\\ The bias–variance dilemma or bias–variance problem is the conflict in trying to simultaneously minimize these two sources of error, which prevent supervised learning algorithms from generalizing beyond their training set:
111 | \begin{itemize}
112 | \item The \textit{bias} error is an error from erroneous assumptions in the learning algorithm. High bias can cause an algorithm to miss the relevant relations between features and target outputs (underfitting).
113 | \item The \textit{variance} is an error from sensitivity to small fluctuations in the training set. High variance may result from an algorithm modeling the random noise in the training data (overfitting).
114 | \end{itemize}
115 | The bias–variance decomposition is a way of analyzing a learning algorithm's expected generalization error with respect to a particular problem as a sum of three terms: the bias, the variance, and a quantity called the irreducible error, resulting from noise in the problem itself.\\
116 | Below you can find a derivation of the bias-variance decomposition.
117 | \subsection{Derivation}
118 |
119 | Recall that, for any random variable $X$:
120 | \begin{align*}
121 | \operatorname{Var}[X] = \operatorname{E}[X^2] - \operatorname{E}[X]^2.
122 | \end{align*}
123 |
124 | Rearranging, we get:
125 |
126 | \begin{align*}
127 | \operatorname{E}[X^2] = \operatorname{Var}[X] + \operatorname{E}[X]^2
128 | \end{align*}
129 |
130 | Since $f$ is deterministic, i.e. independent of $D$, $\operatorname{E}[f] = f$.
131 |
132 | Thus, given $y = f + \varepsilon$ and $\operatorname{E}[\varepsilon] = 0$ (because $\varepsilon$ is noise), we get $\operatorname{E}[y] = \operatorname{E}[f + \varepsilon] = \operatorname{E}[f] = f$.
133 |
134 | Also, since $\operatorname{Var}[\varepsilon] = \sigma^2$,
135 |
136 | \begin{align*}
137 | \operatorname{Var}[y] = \operatorname{E}[(y - \operatorname{E}[y])^2] = \operatorname{E}[(y - f)^2] = \operatorname{E}[(f + \varepsilon - f)^2] = \operatorname{E}[\varepsilon^2] = \operatorname{Var}[\varepsilon] + \operatorname{E}[\varepsilon]^2 = \sigma^2 + 0^2 = \sigma^2.
138 | \end{align*}
139 |
140 |
141 | Thus, since $\varepsilon$ and $\hat{f}$ are independent, we can write
142 |
143 |
144 | \begin{align*}
145 | \operatorname{E}\big[(y - \hat{f})^2\big]
146 | & = \operatorname{E}\big[(f+\varepsilon - \hat{f} )^2\big] \\[5pt]
147 | & = \operatorname{E}\big[(f+\varepsilon - \hat{f} +\operatorname{E}[\hat{f}]-\operatorname{E}[\hat{f}])^2\big] \\[5pt]
148 | & = \operatorname{E}\big[(f-\operatorname{E}[\hat{f}])^2\big]+\operatorname{E}[\varepsilon^2]+\operatorname{E}\big[(\operatorname{E}[\hat{f}]- \hat{f})^2\big]
149 | +2\operatorname{E}\big[(f-\operatorname{E}[\hat{f}])\varepsilon\big]
150 | +2\operatorname{E}\big[\varepsilon(\operatorname{E}[\hat{f}]- \hat{f})\big]
151 | +2\operatorname{E}\big[(\operatorname{E}[\hat{f}]- \hat{f})(f-\operatorname{E}[\hat{f}])\big] \\[5pt]
152 | & = (f-\operatorname{E}[\hat{f}])^2+\operatorname{E}[\varepsilon^2]+\operatorname{E}\big[(\operatorname{E}[\hat{f}]- \hat{f})^2\big]
153 | +2(f-\operatorname{E}[\hat{f}])\operatorname{E}[\varepsilon]
154 | +2\operatorname{E}[\varepsilon]\operatorname{E}\big[\operatorname{E}[\hat{f}]- \hat{f}\big]
155 | +2\operatorname{E}\big[\operatorname{E}[\hat{f}]- \hat{f}\big](f-\operatorname{E}[\hat{f}]) \\[5pt]
156 | & = (f-\operatorname{E}[\hat{f}])^2+\operatorname{E}[\varepsilon^2]+\operatorname{E}\big[(\operatorname{E}[\hat{f}]- \hat{f})^2\big]\\[5pt]
157 | & = (f-\operatorname{E}[\hat{f}])^2+\operatorname{Var}[\varepsilon]+\operatorname{Var}\big[\hat{f}\big]\\[5pt]
158 | & = \operatorname{Bias}[\hat{f}]^2+\operatorname{Var}[\varepsilon]+\operatorname{Var}\big[\hat{f}\big]\\[5pt]
159 | & = \operatorname{Bias}[\hat{f}]^2+\sigma^2+\operatorname{Var}\big[\hat{f}\big].
160 | \end{align*}
161 |
162 |
163 | Finally, the MSE loss function (or negative log-likelihood) is obtained by taking the expectation over $x$:
164 | $$\text{MSE} = \operatorname{E}_x\bigg\{\operatorname{Bias}_D[\hat{f}(x;D)]^2+\operatorname{Var}_D\big[\hat{f}(x;D)\big]\bigg\} + \sigma^2.$$
165 |
166 |
167 | \section{Bagging} % Don't be this informal in your notes!
168 | Suppose we fit a model to our training data $\mathcal{Z} = \{(x_{1},y_{1}),(x_{2},y_{2}),... ,(x_{n},y_{n})\}$, obtaining the prediction $\hat{f}(x)$ at input $x$. Bootstrap aggregation or bagging averages this prediction over a collection of bootstrap samples, thereby reducing its variance.
169 | For each bootstrap sample $\boldsymbol{\mathcal{Z}}^{*b}, \; b = 1, 2,... ,B$, we fit our model, giving
170 | prediction $\hat{f}^{*b}(x)$. The bagging estimate is defined by:
171 | \begin{equation*}
172 | \hat{f}_{bag}(x) = \dfrac{1}{B} \sum_{b=1}^{B}\hat{f}^{*b}(x)
173 | \end{equation*}
174 |
175 |
176 | Bagging can dramatically reduce the variance of unstable procedures like trees, leading to improved prediction. A simple argument shows why bagging helps under squared-error loss, in short because averaging reduces variance and leaves bias unchanged.
177 |
178 | \begin{theorem}
179 |
180 | Assume our training observations $(x_{i},y_{i}), i = 1,... ,N $ are independently drawn from a distribution $\mathcal{P}$, and consider the ideal aggregate estimator $\hat{f}_{ag}(x) = \mathbb{E}_{\mathcal{P}}[\hat{f}^*(x)]
181 | $. Here $x$ is fixed and the bootstrap dataset $\boldsymbol{\mathcal{Z}^*}$
182 | consists of observations $x_{i}^*,y_{i}^*, \; i = 1, 2,... ,N$
183 | sampled from $\mathcal{P}$.
Then: 184 |
185 | \begin{equation*}
186 | \mathbb{E}_{\mathcal{P}}[Y - \hat{f}^*(x)]^2 = \mathbb{E}_{\mathcal{P}}[Y - \hat{f}_{ag}(x)]^2 + \overbrace{\mathbb{E}_{\mathcal{P}}[\hat{f}^*(x) -\hat{f}_{ag}(x) ]^2}^\text{variance}
187 | \geq \mathbb{E}_{\mathcal{P}}[Y - \hat{f}_{ag}(x)]^2
188 | \end{equation*}
189 |
190 | \end{theorem}
191 |
192 | \begin{proof}
193 | \begin{align*}
194 | \mathbb{E}_{\mathcal{P}}[Y - \hat{f}^*(x)]^2 = \mathbb{E}_{\mathcal{P}}[Y + \hat{f}_{ag}(x)- \hat{f}_{ag}(x) - \hat{f}^*(x)]^2 \\
195 | = \mathbb{E}_{\mathcal{P}}[Y - \hat{f}_{ag}(x)]^2 + \mathbb{E}_{\mathcal{P}}[\hat{f}^*(x) - \hat{f}_{ag}(x) ]^2 + \overbrace{2\mathbb{E}_{\mathcal{P}}[(Y-\hat{f}_{ag}(x))( \hat{f}^*(x) - \hat{f}_{ag}(x))]}^\text{=0} \\
196 | \geq \mathbb{E}_{\mathcal{P}}[Y - \hat{f}_{ag}(x)]^2
197 | \end{align*}
198 | \end{proof}
199 |
200 |
201 | Note that
202 | $\hat{f}_{ag}(x)$ is a bagging estimate, drawing bootstrap samples from the actual population $\mathcal{P}$ rather than the data. It is not an estimate that we can use in practice, but it is convenient for analysis. Furthermore, note the strong assumption of independence, which unfortunately does not hold when sampling from $\boldsymbol{\mathcal{Z}}$. The extra error on the right-hand side comes from the variance of $\hat{f}^*(x)$ around its mean $\hat{f}_{ag}(x)$. Therefore true population aggregation never increases mean squared error. This suggests that bagging (drawing samples
203 | from the training data) will often decrease mean squared error.
204 | However, the above argument does not hold for classification under 0-1 loss, because of the nonadditivity of bias and variance. In that setting, bagging a good classifier can make it better, but bagging a bad classifier can make it worse. Note that when we bag a model, \textbf{any simple structure in the model is lost}. For interpretation of the model this is clearly a drawback.
205 |
206 | \subsection{Advantages of Bagging}
207 | \begin{itemize}
208 | \item Reduces variance, so has a strong beneficial effect on high-variance classifiers.
209 | \item As the prediction is an average of many classifiers, you obtain a mean score and variance. The latter can be interpreted as the uncertainty of the prediction. Especially in regression tasks, such uncertainties are otherwise hard to obtain.
210 | \item Bagging provides an unbiased estimate of the test error, which we refer to as the \textbf{out-of-bag error}. The idea is to average the classifiers $\hat{f}^{*b}$ that have not seen a certain sample. Thus, we obtain a classifier that was not trained on $(x_{i},y_{i})$ ever. If we compute the error of all these classifiers, we obtain an estimate of the true test error. \textbf{The beauty is that we can do this without reducing the training set}. We just run bagging as it is intended and obtain this so-called out-of-bag error for free. \\ \\
211 | More formally, for each training point $(x_{i},y_{i}) \in \mathcal{Z}$ let $S_{i}=\{ k| (x_{i},y_{i}) \not \in \mathcal{Z}^{*k}\}$ - in other words $S_{i}$ contains the indices of all the training sets which do not contain $(x_{i},y_{i})$.
Let the averaged classifier over all these data sets be: 212 |
213 | \begin{equation*}
214 | h_{i}(x) = \dfrac{1}{|S_{i}|} \sum_{k \in S_{i}} \hat{f}^{*k}(x)
215 | \end{equation*}
216 |
217 | The out-of-bag error becomes simply the average loss that all these classifiers yield:
218 |
219 | \begin{equation*}
220 | \epsilon_{OOB} = \dfrac{1}{n} \sum_{i=1}^{n} l(h_{i}(x_{i}),y_{i})
221 | \end{equation*}
222 |
223 | This is an estimate of the test error, because for each sample we used the subset of classifiers that never saw that sample during training. If $\mathbf{B}$ is sufficiently large, the fact that we take out some classifiers has no significant effect and the estimate is pretty reliable.
224 | \end{itemize}
225 |
226 | \subsection{Random Forest}
227 |
228 | One of the most famous and useful bagged algorithms is the Random Forest. A Random Forest is essentially nothing else but bagged decision trees, with a slightly modified splitting criterion.
229 |
230 | The algorithm works as follows:
231 |
232 | \begin{enumerate}
233 | \item Sample $B$ datasets $\mathcal{Z}^{*1},\ldots,\mathcal{Z}^{*B}$ from $\mathcal{Z}$ with replacement.
234 | \item For each $\mathcal{Z}^{*b}$ train a full decision tree $\hat{f}^{*b}(x)$ with one small modification: before each split randomly subsample $k \leq d$ features (without replacement) and only consider these for your split.
235 | It can be shown that this step further increases the variance of the individual trees; crucially, however, it also decorrelates them, so that averaging reduces the variance of the ensemble more effectively.
236 | \item The final classifier will be $\hat{f}(x) = \dfrac{1}{B} \sum_{b=1}^{B}\hat{f}^{*b}(x)$
237 | \end{enumerate}
238 |
239 | The Random Forest algorithm has two main advantages:
240 |
241 | \begin{itemize}
242 | \item It only has two hyper-parameters, $\mathbf{B}$ and $\mathbf{k}$. It is extremely insensitive to both of these. A good choice for $k$ is $k=\sqrt{d}$ (where $d$ denotes the number of features). You can set $B$ as large as you can afford.
243 | \item Decision trees do not require a lot of preprocessing. For example, the features can be of different scale, magnitude, or slope. This can be highly advantageous in scenarios with heterogeneous data, for example the medical settings where features could be things like blood pressure, age, gender, ..., each of which is recorded in completely different units.
244 | \end{itemize}
245 |
246 | \section{Boosting}
247 | In his Machine Learning class project in 1988, Michael Kearns famously asked the question: \textbf{Can weak learners be combined to generate a strong learner with low bias?} \\ The answer is yes: create an ensemble classifier $H_{T}(x) =\sum_{t=1}^{T} \alpha_{t} h_{t}(x)$. This ensemble classifier is built in an iterative fashion: during iteration $t$ we add the classifier $\alpha_{t} h_{t}(x)$ to the ensemble. At test time we evaluate all classifiers and return the weighted sum.
248 |
249 |
250 | The process of constructing such an ensemble in a stage-wise fashion is very similar to gradient descent (we can think of it as gradient descent in functional space). However, instead of updating the model parameters in each iteration, we add functions to our ensemble.
251 | Let $\ell$ denote a (convex and differentiable) loss function:
252 | $$ \ell(H) = \dfrac{1}{n} \sum_{i=1}^{n} \ell(H(x_{i}),y_{i}) $$
253 |
254 |
255 | Assume we have already finished $\mathbf{t}$ iterations and already have an ensemble classifier $H_{t}(\mathbf{x})$. Now in iteration $t+1$ we want to add one more weak learner $h_{t+1}$ to the ensemble.
To this end we search for the weak learner that minimizes the loss: 256 | $$h_{t+1} = argmin_{h \in \mathbb{H}} \; \ell(H_{t} + \alpha h)$$
257 |
258 | Once $h_{t+1}$ has been found, we add it to our ensemble, i.e. $H_{t+1} := H_{t} + \alpha h_{t+1}$.
259 |
260 |
261 | \subsection{Gradient descent in functional space}
262 |
263 | Given $H$, we want to find the step-size $\alpha$ and (weak learner) $h$ to minimize the loss $\ell(H + \alpha h)$. For this purpose, we can use a Taylor approximation, as we did for gradient descent:
264 |
265 | $$ \ell(H + \alpha h) \approx \ell(H)
266 | + \alpha \nabla \ell(H) \cdot h$$
267 |
268 | This approximation (of $\ell$ as a linear function) only holds within a small region around $H$, i.e. as long as $\alpha$ is small. We therefore fix it to a small constant (e.g. $\alpha \approx 0.1$). With the step-size $\alpha$ fixed, we can use the approximation above to find an almost optimal $h$:
269 |
270 | $$argmin_{h \in \mathbb{H}} \; \ell(H + \alpha h) \approx argmin_{h \in \mathbb{H}} \; \nabla \ell(H) \cdot h = argmin_{h \in \mathbb{H}} \; \sum_{i=1}^{n} \dfrac{\partial \ell}{\partial H} (x_{i}) \cdot h(x_{i})$$
271 | \newpage
272 |
273 | Hence, we can do boosting if we have an algorithm $\mathbb{A}$ to solve:
274 |
275 | $$h_{t+1} = argmin_{h \in \mathbb{H}} \; \sum_{i=1}^{n} \overbrace{\dfrac{\partial \ell}{\partial H} (x_{i})}^\text{$r_{i}$} \cdot h(x_{i})$$
276 |
277 | Note that we make progress as long as
278 | $\sum_{i=1}^{n} r_{i} h(x_{i}) < 0$.
279 |
280 | \begin{figure}[h]
281 | \centering
282 | \includegraphics[width=0.5\textwidth]{img/anyboost.png}
283 | \end{figure}
284 |
285 | \subsection{AdaBoost}
286 |
287 | We begin by describing the most popular boosting algorithm due to
288 | Freund and Schapire (1997) called “AdaBoost”. Consider a two-class
289 | problem where:
290 | \begin{itemize}
291 | \item The output variable is coded as $y_{i} \in \{ -1, 1\} \; , \forall i $
292 | \item Weak learners $h \in \mathbb{H}$ are binary, $h(x_{i}) \in \{-1,1\} \; , \forall i$
293 | \item The loss is the exponential loss: $\ell(H) = \sum_{i=1}^{n} e^{-y_{i}H(x_{i})}$
294 | \end{itemize}
295 |
296 | First we compute the gradient $\mathbf{r_{i}} = \dfrac{\partial \ell}{\partial H} (x_{i}) = -y_{i}e^{-y_{i}H(x_{i})} $ \\
297 | For notational convenience, let us define $ \mathbf{w_{i}} = \dfrac{e^{-y_{i}H(x_{i})}}{\sum_{j=1}^{n} e^{-y_{j}H(x_{j})}} $ so that $\sum_{i=1}^{n}w_{i} = 1$. \\
298 | Each weight $\mathbf{w_{i}}$ therefore has a very nice interpretation: it is the relative contribution of the training point $(x_{i},y_{i})$ towards the overall loss.
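To make this interpretation concrete, here is a tiny sketch computing the weights from the current ensemble scores (Python with NumPy; the labels and scores are made-up toy values):
\begin{verbatim}
import numpy as np

y = np.array([ 1,  -1,    1,   1])          # labels y_i
H = np.array([ 0.5, -0.2, -1.0, 2.0])       # current ensemble scores H(x_i)

u = np.exp(-y * H)                          # unnormalized weights e^{-y_i H(x_i)}
w = u / u.sum()                             # normalized weights w_i, summing to 1
print(w.round(2))                           # the misclassified third point dominates
\end{verbatim}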
Now, in order to find the best next weak learner, we need to solve the optimization problem posed earlier: 299 |
300 |
301 |
302 | \begin{equation*}
303 | \begin{aligned}
304 | & h_{t+1} = argmin_{h \in \mathbb{H}} \; \sum_{i=1}^{n} r_{i} h(x_{i})
305 | \\
306 | &= argmin_{h \in \mathbb{H}} \; -\sum_{i=1}^{n} y_{i}e^{-y_{i}H(x_{i})} h(x_{i}) && \text{(substitute in $r_{i}$)}\\
307 | & = argmin_{h \in \mathbb{H}} \; -\sum_{i=1}^{n} y_{i}w_{i} h(x_{i}) && \text{(we can divide by $\sum_{i=1}^{n} e^{-y_{i}H(x_{i})}$ since it is a constant)} \\
308 | & = argmin_{h \in \mathbb{H}} \; \sum_{i: h(x_{i}) \not = y_{i}} w_{i} - \sum_{i: h(x_{i}) = y_{i}} w_{i} && \text{(since $h(x_{i})y_{i} = 1 \iff h(x_{i}) = y_{i} $)} \\
309 | & = argmin_{h \in \mathbb{H}} \; \sum_{i: h(x_{i}) \not = y_{i}} w_{i} && \text{(since $\sum_{i: h(x_{i}) \not = y_{i}} w_{i} = 1 - \sum_{i: h(x_{i}) = y_{i}} w_{i}$)}
310 | \end{aligned}
311 | \end{equation*}
312 |
313 | Let us denote this weighted classification error as $\epsilon =\sum_{i: h(x_{i}) \not = y_{i}} w_{i} $. Then, in order for the inner product $\sum_{i=1}^{n} r_{i}h(x_{i})$ to be negative, the weak learner just needs a weighted training error $\epsilon < 0.5$.\\
314 |
315 | The next step is finding the optimal stepsize $\mathbf{\alpha}$ (i.e. the one that minimizes $\ell$ the most).
316 |
317 | We would like to solve the following optimization problem:
318 |
319 | $$\alpha = argmin_{\alpha}\; \ell (H+ \alpha h) = argmin_{\alpha}\; \sum_{i=1}^{n} e^{-y_{i}[H(x_{i}) + \alpha h(x_{i})]} $$
320 | We differentiate w.r.t. $\alpha$ and equate with zero:
321 | \begin{equation*}
322 | \begin{aligned}
323 | & \dfrac{\partial \ell(H+\alpha h)}{\partial \alpha} = 0 \implies \sum_{i=1}^{n} y_{i}h(x_{i})e^{-(y_{i}H(x_{i}) + \alpha y_{i}h(x_{i}))} = 0
324 | \\
325 | & - \sum_{i:h(x_{i})y_{i}=1} e^{-(y_{i}H(x_{i}) + \alpha \overbrace{y_{i}h(x_{i})}^{1})} + \sum_{i:h(x_{i})y_{i} = -1} e^{-(y_{i}H(x_{i}) + \alpha \overbrace{y_{i}h(x_{i})}^{-1})} = 0 && \text{($y_{i}h(x_{i}) \in \{1,-1\}$)} \\
326 | & - \sum_{i:h(x_{i})y_{i}=1} w_{i} e^{-\alpha} + \sum_{i:h(x_{i})y_{i} = -1} w_{i} e^{\alpha} = 0 && \text{(divide everything by $\sum_{i=1}^{n} e^{-y_{i}H(x_{i})}$)}\\
327 | & - (1 - \epsilon) e^{-\alpha} + \epsilon e^{\alpha} = 0 && \text{($\epsilon = \sum_{i: h(x_{i}) y_{i} = -1} w_{i} $)} \\
328 | & \alpha = \dfrac{1}{2} \ln {\dfrac{1-\epsilon}{\epsilon}}
329 | \end{aligned}
330 | \end{equation*}
331 |
332 | It is unusual that we can find the optimal step-size in such a simple closed form. One consequence is that AdaBoost converges extremely fast. \\
333 |
334 | Finally, after you take a step, i.e. $H_{t+1}=H_{t}+ \alpha h$, you need to re-compute all the weights and then re-normalize.
It is however straightforward to show that the unnormalized weight $\hat{w}_{i}$ is updated as: 335 | $$\hat{w}_{i} \leftarrow \hat{w}_{i} e^{-\alpha h(x_{i}) y_{i}} $$
336 |
337 | and that the normalizer $Z = \sum_{i=1}^{n} e^{-y_{i}H(x_{i})} $ becomes:
338 | \begin{equation*}
339 | \begin{aligned}
340 | &
341 | Z^{(t+1)} = \sum_{i=1}^{n} e^{-y_{i}(H(x_{i})+ \alpha h(x_{i}))} = Z^{(t)} \cdot \sum_{i=1}^{n} \dfrac{1}{Z^{(t)}} e^{-y_{i}(H(x_{i})+ \alpha h(x_{i}))} = Z^{(t)} \cdot \sum_{i=1}^{n} w_{i}^{(t)} e^{-y_{i} \alpha h(x_{i})}
342 | \\
343 | & = Z^{(t)} \cdot ( \sum_{i:h(x_{i})y_{i}=1} w_{i}^{(t)} e^{-\alpha} + \sum_{i:h(x_{i})y_{i}=-1} w_{i}^{(t)} e^{\alpha}) = Z^{(t)} \cdot [(1-\epsilon)e^{-\alpha} + \epsilon e^{\alpha} ] \\
344 | & = Z^{(t)} \cdot [(1-\epsilon) \dfrac{\sqrt{\epsilon}}{\sqrt{1-\epsilon}} + \epsilon \dfrac{\sqrt{1-\epsilon}}{\sqrt{\epsilon}} ] = Z^{(t)} \cdot 2\sqrt{\epsilon (1-\epsilon)}
345 | \end{aligned}
346 | \end{equation*}
347 |
348 | $$Z \leftarrow Z \cdot 2 \sqrt{\epsilon (1-\epsilon)}$$
349 |
350 | Putting these two together we obtain the following multiplicative update rule:
351 | $$ w_{i} \leftarrow w_{i} \cdot \dfrac{e^{-\alpha h(x_{i}) y_{i}}}{2 \sqrt{\epsilon (1-\epsilon)}}$$
352 |
353 | The pseudo-code for AdaBoost will then be the following:
354 | \\ \\
355 | \begin{figure}[h]
356 | \centering
357 | \includegraphics[width=0.9\textwidth]{img/adaboost.png}
358 | \end{figure}
359 |
360 | \newpage
361 | Furthermore, we can use the normalizer $Z$ to bound the loss function after $\mathbf{T}$ iterations:
362 |
363 | \begin{equation*}
364 | \begin{aligned}
365 | &
366 | \ell(H) = Z = Z_{0} \prod_{t=1}^{T}2\sqrt{\epsilon_{t}(1-\epsilon_{t})} = n \prod_{t=1}^{T}2\sqrt{\epsilon_{t}(1-\epsilon_{t})} && \text{($Z_{0}=n$ since $H_{0}=0$, i.e. all weights equal $\dfrac{1}{n}$)}
367 | \\
368 | & \leq n \cdot (4 c(1-c))^{\dfrac{T}{2}} && \text{(we define $c = max_{t} \; \epsilon_{t}$)}
369 | \\
370 | & \leq n \cdot (4 (\dfrac{1}{4} - \gamma^2))^{\dfrac{T}{2}} \leq n \cdot (1 - 4\gamma^2)^{\dfrac{T}{2}} && \text{(since $c < \dfrac{1}{2}$, we can write $c = \dfrac{1}{2} - \gamma$ with $\gamma > 0$, so that $c(1-c) = \dfrac{1}{4} - \gamma^2$)} \\
371 | \end{aligned}
372 | \end{equation*}
373 | In other words, $ \ell(H) \leq n \cdot (1 - 4\gamma^2)^{\dfrac{T}{2}}$ tells us that the training loss is \textbf{decreasing exponentially!}
374 | In fact, we can go even further and compute after how many iterations we must have zero training error (note that the training loss is an upper bound on the training error).\\ We can compute the number of steps required until the loss is less than 1, which would imply that not a single training sample is misclassified:
375 | $$n \cdot (1 - 4\gamma^2)^{\dfrac{T}{2}} < 1 \implies T > \dfrac{2\log{(n)}}{\log{(\dfrac{1}{1-4\gamma^2})}}$$
376 | This is an amazing result! It shows that after $\mathbf{O(\log{n})}$ iterations your training error must be zero.
377 |
378 | \subsection{The Margins Explanation for Boosting’s Effectiveness}
379 |
380 | We can visualize the effect AdaBoost has on the margins of the training examples by
381 | plotting their distribution. In particular, we can create a plot showing, for each $\theta \in [-1, +1]$,
382 | the fraction of training examples with margin at most $\theta$.
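Such a plot is easy to produce: with ensemble outputs normalized so that $f(x) \in [-1, +1]$, the margins are $y_{i}f(x_{i})$ and the curve is simply their empirical CDF. A minimal sketch (Python with NumPy/Matplotlib; the six labels and outputs are made-up toy values):
\begin{verbatim}
import numpy as np
import matplotlib.pyplot as plt

y = np.array([ 1,  -1,    1,   1,  -1,   1])
f = np.array([ 0.9, -0.3, 0.1, 0.7, 0.4, 0.8])  # normalized outputs f(x_i)
margins = np.sort(y * f)                        # margins y_i f(x_i) in [-1, 1]

# empirical CDF: fraction of training examples with margin <= theta
frac = np.arange(1, len(margins) + 1) / len(margins)
plt.step(margins, frac, where="post")
plt.xlabel("margin threshold")
plt.ylabel("fraction of examples")
plt.xlim(-1, 1)
plt.show()
\end{verbatim}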
383 |
384 | \begin{figure}[h]
385 | \centering
386 | \includegraphics[width=0.5\textwidth]{img/adaboost_margin.png}
387 | \caption{The margin distribution graph for boosting showing the cumulative distribution of margins of the training instances after 5, 100, and 1000 iterations, indicated by short-dashed, long-dashed (mostly hidden),
388 | and solid curves, respectively. }
389 | \end{figure}
390 |
391 | Whereas nothing at all is happening to the training error, these curves expose dramatic changes happening on
392 | the margin distribution. For instance, after five rounds, although the training error is zero
393 | (so that no examples have negative margin), a rather substantial fraction of the training
394 | examples (7.7\%) have margin below 0.5. By round 100, all of these examples have been
395 | swept to the right so that not a single example has margin below 0.5, and nearly all have
396 | margin above 0.6. (On the other hand, many with margin 1.0 have slipped back to the 0.6–0.8
397 | range.) In line with this trend, the minimum margin of any training example has increased
398 | from 0.14 at round 5 to 0.52 at round 100, and 0.55 at round 1000.
399 | \textbf{Thus, this example is indicative of the powerful effect AdaBoost has on the margins,
400 | aggressively pushing up those examples with small or negative margin}. \\ \\
401 | Indeed, as will be seen, AdaBoost can be analyzed theoretically along exactly these lines.
402 | We will first prove a bound on the generalization error of AdaBoost that depends only on the margins of the training examples, and not on the number of rounds of boosting. Thus, this bound predicts that AdaBoost will not overfit regardless
403 | of how long it is run, provided that large margins can be achieved (and provided, of course,
404 | that the base classifiers are not too complex relative to the size of the training set):
405 |
406 | \begin{theorem}
407 | Let $\mathcal{D}$ be a distribution over $\mathcal{X} \times \{ -1, +1\}$, and let $\mathcal{S}$ be a sample of $m$ examples chosen independently at random according to $\mathcal{D}$. Assume that the base classifier space
408 | $\mathbb{H}$ is finite, and let $\delta > 0$. Then with probability at least $1 - \delta$ over the random choice of the
409 | training set $\mathcal{S}$, every weighted average function $f$ satisfies the following bound:
410 | $$\mathbb{P}_{\mathcal{D}}[\overbrace{yf(x)}^\text{margin} \leq 0] \leq \mathbb{P}_{\mathcal{S}}[yf(x) \leq \theta] + O\Bigg(\sqrt{\dfrac{\log{|\mathbb{H}|}}{m \theta^2} \cdot \log\Big(\dfrac{m \theta^2}{\log{|\mathbb{H}|}}\Big) + \dfrac{\log{(1/\delta)}}{m}}\Bigg)
411 | $$
412 | $$\text{for all } \theta > \sqrt{\dfrac{\log{|\mathbb{H}|}}{4m}}$$
413 | \end{theorem}
414 |
415 | The term on the left is the generalization error. The first term on the right
416 | is the fraction of training examples with margin below some threshold $\theta$. This term will be
417 | small if most training examples have large margin (i.e., larger than $\theta$). The second term on
418 | the right is an additional term that becomes small as the size of the training set $m$ gets larger,
419 | provided the complexity of the base classifiers is controlled for $\theta$ bounded away from zero. \\ \\ The second part of the analysis is to prove that, as observed empirically in Figure 9.1, AdaBoost generally tends to increase the margins of all training examples.
420 |
421 | \begin{theorem}
422 | Given the notation of the previous section, let $\gamma_{t} = \dfrac{1}{2} - \epsilon_{t}$.
Then the fraction of training examples with margin at most $\theta$ is at most: 423 | $$\prod_{t=1}^{T} \sqrt{(1+2 \gamma_{t} )^{1+\theta} (1-2 \gamma_{t})^{1-\theta}} $$
424 | \end{theorem}
425 |
426 | To get a feeling for this bound, consider what happens when, for all $t$, $ \epsilon_{t} \leq \dfrac{1}{2} - \gamma $ for
427 | some $\gamma > 0$. Given this assumption, we can simplify the upper bound in Theorem 9.3 to:
428 | $$\Big(\sqrt{(1+2 \gamma )^{1+\theta} (1-2 \gamma)^{1-\theta}} \Big)^T$$
429 |
430 | When the expression inside the parentheses is strictly smaller than 1, that is, when:
431 | \begin{equation}
432 | \sqrt{(1+2 \gamma )^{1+\theta} (1-2 \gamma)^{1-\theta}} < 1
433 | \end{equation}
434 |
435 | this bound implies that the fraction of training examples with margin $\leq \theta$ decreases to zero exponentially fast with $T$. Moreover, by solving for $\theta$, we see that equation (9.1)
436 | holds if and only if:
437 |
438 | $$\theta < - \dfrac{\log{(1-4 \gamma^2)}}{\log{(\dfrac{1+2 \gamma}{1- 2 \gamma})}}$$
439 |
440 | Thus, the margins
441 | of the training examples are guaranteed to be large after a sufficient number of boosting
442 | iterations.\end{document}
443 |
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Advanced Machine Learning Lecture Notes Fall 2020
2 |
3 | [@pierobartolo](https://github.com/pierobartolo) and [@MirkoDeVita98](https://github.com/MirkoDeVita98) co-authored these lecture notes for the Advanced Machine Learning course at ETH Zürich. [@lucidBrot](https://github.com/lucidBrot/) collected changes from their two repositories, as well as from [@advilema](https://github.com/advilema) and combined them in one repository.
4 |
5 | **You are free to fork this repository and build upon it as long as you give credit to the authors.**
6 |
7 | ## Access
8 | To pull the notes to your local computer you must first `clone` the repository.
9 | In the terminal, navigate to the root directory where you'd like the notes to
10 | be located and enter:
11 |
12 |     git clone https://github.com/pierobartolo/AML-Lecture-Notes
13 |     cd AML-Lecture-Notes
14 |
15 | You will see a series of `.tex` files in this folder, which are the LaTeX
16 | source files broken down by lecture. To read the files as a pdf, you must
17 | compile them with the following command:
18 |
19 |     pdflatex lecture_x
20 |
21 | Once compiled, you should see `lecture_x.pdf`.
22 |
23 | To build all the files at once, you can do (Windows Cygwin.
I assume you'd know how to do this if you are on linux): 24 | 25 | ``` 26 | for i in *.tex; do /cygdrive/f/Programme/MiKTeX_20.11/miktex/bin/x64/pdflatex -halt-on-error -output-directory pdf "$i" ; done 27 | ``` 28 | 29 | -------------------------------------------------------------------------------- /img/adaboost.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/adaboost.png -------------------------------------------------------------------------------- /img/adaboost_margin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/adaboost_margin.png -------------------------------------------------------------------------------- /img/algo_gp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/algo_gp.png -------------------------------------------------------------------------------- /img/anyboost.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/anyboost.png -------------------------------------------------------------------------------- /img/cp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/cp.png -------------------------------------------------------------------------------- /img/discriminant.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/discriminant.png -------------------------------------------------------------------------------- /img/erm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/erm.png -------------------------------------------------------------------------------- /img/fisher.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/fisher.png -------------------------------------------------------------------------------- /img/frequencies.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/frequencies.png -------------------------------------------------------------------------------- /img/gaussian_process.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/gaussian_process.png -------------------------------------------------------------------------------- /img/gem1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/gem1.png 
--------------------------------------------------------------------------------
/img/adaboost.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/adaboost.png
--------------------------------------------------------------------------------
/img/adaboost_margin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/adaboost_margin.png
--------------------------------------------------------------------------------
/img/algo_gp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/algo_gp.png
--------------------------------------------------------------------------------
/img/anyboost.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/anyboost.png
--------------------------------------------------------------------------------
/img/cp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/cp.png
--------------------------------------------------------------------------------
/img/discriminant.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/discriminant.png
--------------------------------------------------------------------------------
/img/erm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/erm.png
--------------------------------------------------------------------------------
/img/fisher.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/fisher.png
--------------------------------------------------------------------------------
/img/frequencies.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/frequencies.png
--------------------------------------------------------------------------------
/img/gaussian_process.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/gaussian_process.png
--------------------------------------------------------------------------------
/img/gem1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/gem1.png
--------------------------------------------------------------------------------
/img/gem10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/gem10.png
--------------------------------------------------------------------------------
/img/gem100.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/gem100.png
--------------------------------------------------------------------------------
/img/gem2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/gem2.png
--------------------------------------------------------------------------------
/img/gen_reg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/gen_reg.png
--------------------------------------------------------------------------------
/img/geo_ls.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/geo_ls.png
--------------------------------------------------------------------------------
/img/lagrangian.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/lagrangian.jpg
--------------------------------------------------------------------------------
/img/least_squares.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/least_squares.png
--------------------------------------------------------------------------------
/img/lrelu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/lrelu.png
--------------------------------------------------------------------------------
/img/margin.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/margin.jpg
--------------------------------------------------------------------------------
/img/multiclass.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/multiclass.png
--------------------------------------------------------------------------------
/img/nn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/nn.png
--------------------------------------------------------------------------------
/img/nn_rect.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/nn_rect.png
--------------------------------------------------------------------------------
/img/pac1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/pac1.png
--------------------------------------------------------------------------------
/img/pac2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/pac2.png
--------------------------------------------------------------------------------
/img/relu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/relu.png
--------------------------------------------------------------------------------
/img/ridge_vs_lasso.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/ridge_vs_lasso.png
--------------------------------------------------------------------------------
/img/riemann.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/riemann.png
--------------------------------------------------------------------------------
/img/riemann_nn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/riemann_nn.png
--------------------------------------------------------------------------------
/img/sigmoid.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/sigmoid.png
--------------------------------------------------------------------------------
/img/slack.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/slack.png
--------------------------------------------------------------------------------
/img/stick-breaking.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/stick-breaking.png
--------------------------------------------------------------------------------
/img/stick-breaking2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/stick-breaking2.png
--------------------------------------------------------------------------------
/img/struct_algo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/struct_algo.png
--------------------------------------------------------------------------------
/img/syn_out.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/syn_out.png
--------------------------------------------------------------------------------
/img/syn_tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/syn_tree.png
--------------------------------------------------------------------------------
/img/tanh.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/tanh.png
--------------------------------------------------------------------------------
/img/us.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/us.png
--------------------------------------------------------------------------------
/img/vae_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/vae_1.png
--------------------------------------------------------------------------------
/img/vae_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/vae_2.png
--------------------------------------------------------------------------------
/img/vae_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/img/vae_3.png
--------------------------------------------------------------------------------
/pdf/L10_Deep_Learning.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/pdf/L10_Deep_Learning.pdf
--------------------------------------------------------------------------------
/pdf/L11_Non_Parametric_Bayesian_Methods.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/pdf/L11_Non_Parametric_Bayesian_Methods.pdf
--------------------------------------------------------------------------------
/pdf/L12_PAC_Learning.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/pdf/L12_PAC_Learning.pdf
--------------------------------------------------------------------------------
/pdf/L2_Representations.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/pdf/L2_Representations.pdf
--------------------------------------------------------------------------------
/pdf/L3_Density_Estimation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/pdf/L3_Density_Estimation.pdf
--------------------------------------------------------------------------------
/pdf/L4_Regression.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/pdf/L4_Regression.pdf
--------------------------------------------------------------------------------
/pdf/L5_Gaussian_Processes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/pdf/L5_Gaussian_Processes.pdf
--------------------------------------------------------------------------------
/pdf/L6_Linear_Classification.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/pdf/L6_Linear_Classification.pdf
--------------------------------------------------------------------------------
/pdf/L8_Structured_SVM.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/pdf/L8_Structured_SVM.pdf
--------------------------------------------------------------------------------
/pdf/L9_Ensemble_Methods.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pdebartol/aml-lecture-notes/a1d2f90f98eca91f23f4342a83466ee55b53e050/pdf/L9_Ensemble_Methods.pdf
--------------------------------------------------------------------------------