├── README.md
├── cheat.pdf
└── cheat.tex

/README.md:
--------------------------------------------------------------------------------
A summary of common machine learning algorithms, provided in a handy format.

**Classifiers included are:**

* k-nearest neighbour
* Naive Bayes
* Log-linear
* Perceptron
* Support Vector Machines

**Clustering (EM) algorithms included are:**

* k-means
* Mixture of Gaussians

--------------------------------------------------------------------------------
/cheat.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eferm/mlcheatsheet/8e34fb35aed981683f3398df99bb7bf978837afe/cheat.pdf
--------------------------------------------------------------------------------
/cheat.tex:
--------------------------------------------------------------------------------
%
% untitled
%
% Created by Emanuel Ferm on 2011-04-25.
% Copyright (c) 2011 __MyCompanyName__. All rights reserved.
%
\documentclass[landscape,a2paper,8pt]{article}

% Use utf-8 encoding for foreign characters
\usepackage[utf8]{inputenc}

% Setup for fullpage use
\usepackage{fullpage}
\usepackage{float}
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage[hmargin=1cm,vmargin=1cm]{geometry}
\usepackage{mdwlist}
\usepackage{array}
\usepackage{hyperref}
\usepackage{nopageno}

% Uncomment some of the following if you use the features
%
% Running Headers and footers
%\usepackage{fancyhdr}

% Multipart figures
%\usepackage{subfigure}

% More symbols
%\usepackage{amsmath}
%\usepackage{amssymb}
%\usepackage{latexsym}

% Surround parts of graphics with box
\usepackage{boxedminipage}

% Package for including code in the document
\usepackage{listings}

% If you want to generate a toc for each chapter (use with book)
%\usepackage{minitoc}

% This is now the recommended way for checking for PDFLaTeX:
\usepackage{ifpdf}

%\newif\ifpdf
%\ifx\pdfoutput\undefined
%\pdffalse % we are not running PDFLaTeX
%\else
%\pdfoutput=1 % we are running PDFLaTeX
%\pdftrue
%\fi

\ifpdf
\usepackage[pdftex]{graphicx}
\else
\usepackage{graphicx}
\fi

\DeclareMathOperator*{\argmax}{arg\,max\ }
\DeclareMathOperator*{\argmin}{arg\,min\ }
\DeclareMathOperator*{\sign}{sign}
\newcommand{\E}{\mathop{\mathbb E}}

\renewcommand{\c}[1]{
}

\renewcommand{\labelitemi}{{\tiny$\bullet$}}

\newcommand{\ColWidth}{
5cm
}

\newcommand{\RowHeight}{
4cm
}

\newcommand{\KNNDescr}{
A new point $\hat{x}$ is assigned the most frequent label $\hat{t}$ among its $k$ nearest training instances.
}

\newcommand{\KNNModel}{
\begin{align*}
\hat{t} = \argmax_{\mathcal{C}} \sum_{i:x_{i} \in N_k(\boldsymbol{x},\hat{x})} \delta(t_i, \mathcal{C})
\end{align*}
\begin{itemize}
\item $N_k(\boldsymbol{x},\hat{x}) \leftarrow$ $k$ points in $\boldsymbol{x}$ closest to $\hat{x}$
\item Euclidean distance formula: $\sqrt{\sum_{i=1}^{D} (x_i - \hat{x}_i)^2}$
\item $\delta(a,b) \leftarrow$ 1 if $a = b$; 0 o/w
\end{itemize}
}

\newcommand{\KNNObj}{
No optimisation needed.
}
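
% Worked example, not referenced in the table below: a minimal numeric
% illustration of the k-NN decision rule in \KNNModel, assuming k = 3, two
% classes, and made-up neighbour labels.
\newcommand{\KNNExample}{
With $k = 3$, suppose the three nearest neighbours of $\hat{x}$ (i.e. $N_3(\boldsymbol{x},\hat{x})$) carry labels $\mathcal{C}_1, \mathcal{C}_2, \mathcal{C}_1$. Then
\begin{align*}
\sum_i \delta(t_i, \mathcal{C}_1) = 2, \qquad \sum_i \delta(t_i, \mathcal{C}_2) = 1,
\end{align*}
so $\hat{t} = \mathcal{C}_1$.
}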

\newcommand{\KNNTrain}{
Use cross-validation to choose an appropriate $k$; otherwise there is no training step, since classification is based directly on the stored training points.
}

\newcommand{\KNNReg}{
$k$ acts to regularise the classifier: as $k \rightarrow N$ the decision boundary becomes smoother.
}

\newcommand{\KNNCompl}{
$\mathcal{O}(NM)$ space complexity, since all training instances and all their features need to be kept in memory.
}

\newcommand{\KNNNonl}{
Natively finds non-linear boundaries.
}

\newcommand{\KNNOnl}{
To be added.
}

\newcommand{\NBDescr}{
Learn $p(\mathcal{C}_k | x)$ by modelling $p(x | \mathcal{C}_k)$ and $p(\mathcal{C}_k)$, using Bayes' rule to infer the class posterior probability. Assumes each feature is conditionally independent of all others given the class, ergo `Naive.'
}

\newcommand{\NBModel}{
{
\begin{align*}
y(\boldsymbol{x}) &= \argmax_k p(\mathcal{C}_k | x) \\
&= \argmax_k p(x | \mathcal{C}_k) \times p(\mathcal{C}_k) \\
&= \argmax_k \prod_{i=1}^D p(x_i | \mathcal{C}_k) \times p(\mathcal{C}_k) \\
&= \argmax_k \sum_{i=1}^D \log p(x_i | \mathcal{C}_k) + \log p(\mathcal{C}_k)
\end{align*}
}}

\newcommand{\NBObj}{
No optimisation needed.
}

\newcommand{\NBTrain}{{
\textbf{Multivariate likelihood}
$
\log p(x | \mathcal{C}_k) = \sum_{i=1}^D \log p(x_i | \mathcal{C}_k)
$
\begin{multline*}
p_{\text{MLE}}(x_i = v | \mathcal{C}_k) = \frac{\sum_{j=1}^N \delta(t_j = \mathcal{C}_k \wedge x_{ji} = v)}{\sum_{j=1}^N \delta(t_j = \mathcal{C}_k)}
\end{multline*}

\textbf{Multinomial likelihood}
$
p(x | \mathcal{C}_k) = \prod_{i=1}^D p(\text{word}_i | \mathcal{C}_k)^{x_i}
$
\begin{multline*}
p_{\text{MLE}}(\text{word}_i = v | \mathcal{C}_k) = \frac{\sum_{j=1}^N \delta(t_j = \mathcal{C}_k) \times x_{ji}}{\sum_{j=1}^N \sum_{d=1}^D \delta(t_j = \mathcal{C}_k) \times x_{jd}}
\end{multline*}

\noindent \ldots where:
\begin{itemize*}
\item $x_{ji}$ is the count of word $i$ in training example $j$;
\item $x_{jd}$ is the count of word $d$ in training example $j$.
\end{itemize*}

\noindent \textbf{Gaussian likelihood}
$
p(x | \mathcal{C}_k) = \prod_{i=1}^D \mathcal{N}(x_i; \mu_{ik}, \sigma_{ik})
$
}}

\newcommand{\NBReg}{{
Use a Dirichlet prior on the parameters to obtain a MAP estimate.
\newline

\textbf{Multivariate likelihood}
\begin{multline*}
p_{\text{MAP}}(x_i = v | \mathcal{C}_k) = \\
\frac{(\beta_i - 1) + \sum_{j=1}^N \delta(t_j = \mathcal{C}_k \wedge x_{ji} = v)}{|x_i|(\beta_i - 1) + \sum_{j=1}^N \delta(t_j = \mathcal{C}_k)}
\end{multline*}

\noindent \textbf{Multinomial likelihood}
\begin{multline*}
p_{\text{MAP}}(\text{word}_i = v | \mathcal{C}_k) = \\
\frac{(\alpha_i - 1) + \sum_{j=1}^N \delta(t_j = \mathcal{C}_k) \times x_{ji}}{\sum_{j=1}^N \sum_{d=1}^D \left( \delta(t_j = \mathcal{C}_k) \times x_{jd} \right) - D + \sum_{d=1}^D \alpha_d}
\end{multline*}
}}

\newcommand{\NBCompl}{{
$\mathcal{O}(NM)$, since each training instance must be visited and each of its features counted.
}}
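
% Worked example, not referenced in the table below: a small numeric instance
% of the multivariate MLE and MAP estimates in \NBTrain and \NBReg, with
% made-up counts (10 examples of class 1, of which 7 have x_i = 1, a binary
% feature so |x_i| = 2, and a Dirichlet prior with beta_i = 2).
\newcommand{\NBExample}{
Suppose class $\mathcal{C}_1$ has $\sum_{j=1}^N \delta(t_j = \mathcal{C}_1) = 10$ training examples, of which 7 have $x_i = 1$, with $|x_i| = 2$ possible values and prior $\beta_i = 2$. Then
\begin{align*}
p_{\text{MLE}}(x_i = 1 | \mathcal{C}_1) = \frac{7}{10}, \qquad
p_{\text{MAP}}(x_i = 1 | \mathcal{C}_1) = \frac{(2-1) + 7}{2(2-1) + 10} = \frac{8}{12}.
\end{align*}
}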

\newcommand{\NBNonl}{{
Can only learn linear boundaries for multivariate/multinomial attributes.
\newline

With Gaussian attributes, quadratic boundaries can be learned with uni-modal distributions.
}}

\newcommand{\NBOnl}{{
To be added.
}}

\newcommand{\LLDescr}{{
Estimate $p(\mathcal{C}_k | x)$ directly, by assuming a maximum-entropy (log-linear) form and optimising the conditional likelihood of the training data.
}}

\newcommand{\LLModel}{{
\begin{align*}
y(x) &= \argmax_k p(\mathcal{C}_k | x) \\
&= \argmax_k \sum_m \lambda_m \phi_m(x, \mathcal{C}_k)
% &= \argmax_k \frac{1}{Z_{\lambda}(x)} e^{\sum_m \lambda_m \phi_m(x, \mathcal{C}_k)}
\end{align*}

\noindent \ldots where:
\begin{align*}
&p(\mathcal{C}_k | x) = \frac{1}{Z_{\lambda}(x)} e^{\sum_m \lambda_m \phi_m(x, \mathcal{C}_k)} \\
&Z_{\lambda}(x) = \sum_k e^{\sum_m \lambda_m \phi_m(x, \mathcal{C}_k)}
\end{align*}
}}

\newcommand{\LLObj}{{
Minimise the negative log-likelihood:
\begin{flalign*}
&\mathcal{L}_{\text{MLE}}(\lambda, \mathcal{D}) = -\log \prod_{(x,t) \in \mathcal{D}} p(t | x) = - \sum_{(x,t) \in \mathcal{D}} \log p(t | x) \\
& \qquad = \sum_{(x,t) \in \mathcal{D}} \left( \log Z_{\lambda}(x) - \sum_m \lambda_m \phi_m(x, t) \right) \\
& \qquad = \sum_{(x,t) \in \mathcal{D}} \left( \log \sum_k e^{\sum_m \lambda_m \phi_m(x, \mathcal{C}_k)} - \sum_m \lambda_m \phi_m(x, t) \right)
\end{flalign*}
}}

\newcommand{\LLTrain}{{
Gradient descent (or gradient ascent if maximising the objective):
\begin{align*}
\lambda^{n+1} = \lambda^n - \eta \nabla \mathcal{L}
\end{align*}

\noindent \ldots where $\eta$ is the step parameter.

\begin{align*}
&\nabla \mathcal{L}_{\text{MLE}}(\lambda, \mathcal{D}) = \sum_{(x,t) \in \mathcal{D}} \left( \E[\phi(x,\cdot)] - \phi(x,t) \right) \\
&\nabla \mathcal{L}_{\text{MAP}}(\lambda, \mathcal{D}, \sigma) = \frac{\lambda}{\sigma^2} + \sum_{(x,t) \in \mathcal{D}} \E[\phi(x,\cdot)] - \sum_{(x,t) \in \mathcal{D}} \phi(x,t)
\end{align*}

\noindent \ldots where $\sum_{(x,t) \in \mathcal{D}} \phi(x,t)$ are the empirical feature counts, and the expected feature counts are
\begin{align*}
\E[\phi(x,\cdot)] = \sum_{k} p(\mathcal{C}_k | x)\, \phi(x, \mathcal{C}_k)
\end{align*}
}}

\newcommand{\LLReg}{{
Penalise large values for the $\lambda$ parameters, by introducing a prior distribution over them (typically a Gaussian).
\newline

\textbf{Objective function}
\begin{align*}
\mathcal{L}_{\text{MAP}}(\lambda, \mathcal{D}, \sigma) &= - \log p(\lambda) - \sum_{(x,t) \in \mathcal{D}} \log p(t | x) \\
&= - \log e^{-\frac{(0-\lambda)^2}{2\sigma^2}} - \sum_{(x,t) \in \mathcal{D}} \log p(t | x) \\
&= \frac{\sum_m \lambda_m^2}{2\sigma^2} - \sum_{(x,t) \in \mathcal{D}} \log p(t | x)
\end{align*}
}}

\newcommand{\LLCompl}{{
$\mathcal{O}(INMK)$, since each training instance must be visited and each combination of class and features must be calculated for the appropriate feature mapping.
}}
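
% Worked example, not referenced in the table below: a numeric evaluation of
% the log-linear posterior in \LLModel, assuming two classes and made-up
% unnormalised scores of 2 and 0.
\newcommand{\LLExample}{
Suppose $\sum_m \lambda_m \phi_m(x, \mathcal{C}_1) = 2$ and $\sum_m \lambda_m \phi_m(x, \mathcal{C}_2) = 0$. Then
\begin{align*}
Z_{\lambda}(x) = e^{2} + e^{0} \approx 8.39, \qquad
p(\mathcal{C}_1 | x) = \frac{e^{2}}{e^{2} + e^{0}} \approx 0.88, \qquad
p(\mathcal{C}_2 | x) \approx 0.12,
\end{align*}
so $y(x) = \mathcal{C}_1$; the $\argmax$ can equally be taken over the unnormalised scores, since $Z_{\lambda}(x)$ is shared by all classes.
}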

\newcommand{\LLNonl}{{
Reformulate the class conditional distribution in terms of a kernel $K(x,x')$, and use a non-linear kernel (for example $K(x,x') = (1 + x^T x')^2$). By the Representer Theorem:

\begin{align*}
p(\mathcal{C}_k | x) &= \frac{1}{Z_{\lambda}(x)} e^{\lambda^T \phi(x, \mathcal{C}_k)} \\
&= \frac{1}{Z_{\lambda}(x)} e^{\sum_{n=1}^N \sum_{i=1}^K \alpha_{ni} \phi(x_n, \mathcal{C}_i)^T \phi(x, \mathcal{C}_k)} \\
&= \frac{1}{Z_{\lambda}(x)} e^{\sum_{n=1}^N \sum_{i=1}^K \alpha_{ni} K((x_n, \mathcal{C}_i),(x,\mathcal{C}_k))} \\
&= \frac{1}{Z_{\lambda}(x)} e^{\sum_{n=1}^N \alpha_{nk} K(x_n, x)}
\end{align*}
}}

\newcommand{\LLOnl}{{
\raggedright
Online Gradient Descent: update the parameters using GD after seeing each training instance.
}}

\newcommand{\PDescr}{{
Directly estimate the linear function $y(x)$ by iteratively updating the weight vector whenever a training instance is misclassified.
}}

\newcommand{\PModel}{{
Binary, linear classifier:
\begin{align*}
y(x) = \sign(\boldsymbol{w}^T x)
\end{align*}

\noindent \ldots where:
\begin{align*}
\sign(x) = \left\{
\begin{array}{l l}
+1 & \quad \text{if } x \geq 0 \\
-1 & \quad \text{if } x < 0 \\
\end{array} \right.
\end{align*}

\noindent Multiclass perceptron:
\begin{align*}
y(x) = \argmax_{\mathcal{C}_k} \boldsymbol{w}^T \phi(x, \mathcal{C}_k)
\end{align*}
}}

\newcommand{\PObj}{{
Minimise the perceptron criterion, an error function that penalises each misclassified input vector in proportion to how far it lies on the wrong side of the boundary:
\begin{align*}
\argmin_{\boldsymbol{w}} E_P(\boldsymbol{w}) = \argmin_{\boldsymbol{w}} - \sum_{n \in \mathcal{M}} \boldsymbol{w}^T x_n t_n
\end{align*}

\noindent \ldots where $\mathcal{M}$ is the set of misclassified training vectors.

%A boundary with 100\% accuracy is found when the perceptron criterion is satisfied: $\boldsymbol{w}^T x t > 0$.
}}

\newcommand{\PTrain}{{
Iterate over each training example $x_n$, and update the weight vector on misclassification:
\begin{align*}
\boldsymbol{w}^{i+1} &= \boldsymbol{w}^i - \eta \nabla E_P(\boldsymbol{w}) \\
&= \boldsymbol{w}^i + \eta x_n t_n
\end{align*}

\noindent \ldots where typically $\eta = 1$.
\newline

\noindent For the multiclass perceptron:
\begin{align*}
\boldsymbol{w}^{i+1} = \boldsymbol{w}^i + \phi(x, t) - \phi(x, y(x))
\end{align*}
}}

\newcommand{\PReg}{{
The Voted Perceptron: run the perceptron $i$ times and store each iteration's weight vector. Then:
\begin{align*}
y(x) = \sign \left( \sum_i c_i \times \sign(\boldsymbol{w}_i^T x) \right)
\end{align*}
\ldots where $c_i$ is the number of correctly classified training instances for $\boldsymbol{w}_i$.
}}

\newcommand{\PCompl}{{
$\mathcal{O}(INMK)$, since each combination of instance, class and features must be calculated (see log-linear).
}}

\newcommand{\PNonl}{{
Use a kernel $K(x,x')$, and 1 weight per training instance:
\begin{align*}
y(x) = \sign \left( \sum_{n=1}^N w_n t_n K(x, x_n) \right)
\end{align*}

\noindent \ldots and the update (whenever $x_n$ is misclassified):
\begin{align*}
w_n^{i+1} = w_n^i + 1
\end{align*}
}}

\newcommand{\POnl}{{
\raggedright
The perceptron is an online algorithm by default.
}}
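
% Worked example, not referenced in the table below: one perceptron update
% step from \PTrain, with a made-up 2-D instance and eta = 1.
\newcommand{\PExample}{
Let $\boldsymbol{w}^i = (1, -1)^T$, $x_n = (1, 2)^T$, $t_n = +1$ and $\eta = 1$. Since $(\boldsymbol{w}^i)^T x_n = -1 < 0$ but $t_n = +1$, the instance is misclassified and
\begin{align*}
\boldsymbol{w}^{i+1} = \boldsymbol{w}^i + \eta\, x_n t_n = (1, -1)^T + (1, 2)^T = (2, 1)^T,
\end{align*}
after which $(\boldsymbol{w}^{i+1})^T x_n = 4 > 0$, i.e. $x_n$ is now classified correctly.
}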

\newcommand{\SVMDescr}{{
A maximum margin classifier: finds the separating hyperplane with the maximum margin to its closest data points.
}}

\newcommand{\SVMModel}{{
\begin{align*}
y(x) = \sum_{n=1}^N \lambda_n t_n x^T x_n + w_0
\end{align*}
}}

\newcommand{\SVMObj}{{
\textbf{Primal}
\begin{align*}
\argmin_{\boldsymbol{w}, w_0} \frac{1}{2} ||\boldsymbol{w}||^2
\end{align*}
\begin{align*}
\text{s.t.} \quad t_n (\boldsymbol{w}^T x_n + w_0) \geq 1 \quad \forall n
\end{align*}

\noindent \textbf{Dual} (maximise)
\begin{align*}
\tilde{\mathcal{L}}(\boldsymbol{\lambda}) = \sum_{n=1}^N \lambda_n - \frac{1}{2} \sum_{n=1}^N \sum_{m=1}^N \lambda_n \lambda_m t_n t_m x_n^T x_m
\end{align*}
\begin{align*}
\text{s.t.} \quad & \lambda_n \geq 0, \quad \sum_{n=1}^N \lambda_n t_n = 0, \quad \forall n
\end{align*}
}}

\newcommand{\SVMTrain}{{
\begin{itemize}
\item Quadratic Programming (QP)
\item SMO, Sequential Minimal Optimisation (chunking).
\end{itemize}
}}

\newcommand{\SVMReg}{{
The soft-margin SVM: allow margin violations via slack variables $\xi_n$, and penalise them in proportion to their size.
\newline

\noindent \textbf{Primal}
\begin{align*}
\argmin_{\boldsymbol{w}, w_0} \frac{1}{2} ||\boldsymbol{w}||^2 + C \sum_{n=1}^N \xi_n
\end{align*}
\begin{align*}
\text{s.t.} \quad t_n (\boldsymbol{w}^T x_n + w_0) \geq 1 - \xi_n, \quad \xi_n \geq 0 \quad \forall n
\end{align*}

\noindent \textbf{Dual} (maximise)
\begin{align*}
\tilde{\mathcal{L}}(\boldsymbol{\lambda}) = \sum_{n=1}^N \lambda_n - \frac{1}{2} \sum_{n=1}^N \sum_{m=1}^N \lambda_n \lambda_m t_n t_m x_n^T x_m
\end{align*}
\begin{align*}
\text{s.t.} \quad 0 \leq \lambda_n \leq C, \quad \sum_{n=1}^N \lambda_n t_n = 0, \quad \forall n
\end{align*}
}}

\newcommand{\SVMCompl}{{
\begin{itemize}
\item QP: $\mathcal{O}(N^3)$;
\item SMO: much more efficient than QP, since computation is based only on the support vectors.
\end{itemize}
}}

\newcommand{\SVMNonl}{{
Use a non-linear kernel $K(x,x')$:

\begin{align*}
y(x) &= \sum_{n=1}^N \lambda_n t_n x^T x_n + w_0 \\
&= \sum_{n=1}^N \lambda_n t_n K(x, x_n) + w_0
\end{align*}

\begin{align*}
\tilde{\mathcal{L}}(\boldsymbol{\lambda}) &= \sum_{n=1}^N \lambda_n - \frac{1}{2} \sum_{n=1}^N \sum_{m=1}^N \lambda_n \lambda_m t_n t_m x_n^T x_m \\
&= \sum_{n=1}^N \lambda_n - \frac{1}{2} \sum_{n=1}^N \sum_{m=1}^N \lambda_n \lambda_m t_n t_m K(x_n,x_m)
\end{align*}
}}

\newcommand{\SVMOnl}{{
\raggedright
Online SVM. See, for example:
\begin{itemize}
\item \emph{The Huller: A Simple and Efficient Online SVM}, Bordes \& Bottou (2005)
\item \emph{Pegasos: Primal Estimated sub-Gradient Solver for SVM}, Shalev-Shwartz et al. (2007)
\end{itemize}
}}
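
% Worked example, not referenced in the table below: the implicit feature map
% of the polynomial kernel mentioned in \LLNonl/\PNonl/\SVMNonl, written out
% for 2-D inputs to show why a kernelised linear boundary is non-linear in x.
\newcommand{\SVMKernelExample}{
For $x, x' \in \mathbb{R}^2$,
\begin{align*}
K(x, x') = (1 + x^T x')^2
&= 1 + 2x_1 x_1' + 2x_2 x_2' + x_1^2 x_1'^2 + x_2^2 x_2'^2 + 2 x_1 x_2 x_1' x_2' \\
&= \phi(x)^T \phi(x')
\end{align*}
with $\phi(x) = (1, \sqrt{2}x_1, \sqrt{2}x_2, x_1^2, x_2^2, \sqrt{2}x_1 x_2)^T$, i.e.\ a linear boundary in $\phi$-space is a quadratic boundary in $x$-space.
}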

\newcommand{\KMDescr}{{
A hard-assignment, geometric clustering algorithm, where each data point is assigned to its closest centroid.
}}

\newcommand{\KMModel}{{
Hard assignments $r_{nk} \in \{0,1\}$ s.t. $\forall n: \sum_k r_{nk} = 1$, i.e. each data point is assigned to exactly one cluster $k$.
\newline

Geometric distance: the Euclidean distance, or $\ell^2$ norm:
\begin{align*}
|| x_n - \mu_k ||_2 = \sqrt{\sum_{i=1}^D (x_{ni} - \mu_{ki})^2}
\end{align*}
}}

\newcommand{\KMObj}{{
\begin{align*}
\argmin_{\boldsymbol{r},\mu} \sum_{n=1}^N \sum_{k=1}^K r_{nk} || x_n - \mu_k ||_2^2
\end{align*}

\noindent \ldots i.e. minimise the total squared distance from each point to its assigned cluster centre.
}}

\newcommand{\KMTrain}{{
\textbf{E}xpectation:
\begin{align*}
r_{nk} = \left\{
\begin{array}{l l}
1 & \quad \text{if } || x_n - \mu_k ||^2 \text{ minimal for } k \\
0 & \quad \text{o/w}
\end{array} \right.
\end{align*}

\textbf{M}aximisation:
\begin{align*}
\mu_{\text{MLE}}^{(k)} = \frac{\sum_n r_{nk} x_n}{\sum_n r_{nk}}
\end{align*}

\noindent \ldots where $\mu^{(k)}$ is the centroid of cluster $k$.
}}

\newcommand{\KMReg}{{
Only hard assignments to clusters are possible.
}}

\newcommand{\KMCompl}{{
To be added.
}}

\newcommand{\KMNonl}{{
For non-linearly separable data, use kernel $k$-means as suggested in:
\newline

\emph{Kernel k-means, Spectral Clustering and Normalized Cuts}, Dhillon et al. (2004).

}}

\newcommand{\KMOnl}{{
\raggedright
Sequential $k$-means: update the centroids after processing one point at a time.
}}

\newcommand{\MGDescr}{{
A probabilistic clustering algorithm, where clusters are modelled as latent Gaussians and each data point is assigned a probability of having been drawn from each Gaussian.
}}

\newcommand{\MGModel}{{
Assignments to clusters by specifying probabilities
\begin{align*}
p(x^{(i)}, z^{(i)}) = p(x^{(i)} | z^{(i)})p(z^{(i)})
\end{align*}

\noindent \ldots with $z^{(i)} \sim \text{Multinomial}(\pi)$, and responsibilities $\gamma_{nk} \equiv p(k | x_n)$ s.t. $\sum_{j=1}^K \gamma_{nj} = 1$. I.e. want to maximise the probability of the observed data $\boldsymbol{x}$.
}}

\newcommand{\MGObj}{{
\begin{align*}
\mathcal{L}(\boldsymbol{x}, \pi, \mu, \Sigma) &= \log p(\boldsymbol{x} | \pi, \mu, \Sigma) \\
&= \sum_{n=1}^N \log \left( \sum_{k=1}^K \pi_k \mathcal{N}(x_n | \mu_k, \Sigma_k) \right)
\end{align*}
}}

\newcommand{\MGTrain}{{
\textbf{E}xpectation: For each $n,k$ set:
\begin{align*}
\gamma_{nk} &= p(z^{(i)} = k | x^{(i)}; \pi, \mu, \Sigma) \quad (= p(k | x_n)) \\
&= \frac{p(x^{(i)} | z^{(i)} = k; \mu, \Sigma)\, p(z^{(i)} = k; \pi)}{\sum_{j=1}^K p(x^{(i)} | z^{(i)} = j; \mu, \Sigma)\, p(z^{(i)} = j; \pi)} \\
&= \frac{\pi_k \mathcal{N}(x_n | \mu_k, \Sigma_k)}{\sum_{j=1}^K \pi_j \mathcal{N}(x_n | \mu_j, \Sigma_j)}
\end{align*}

\textbf{M}aximisation:
\begin{align*}
\pi_{k} &= \frac{1}{N} \sum_{n=1}^N \gamma_{nk} \\
\Sigma_{k} &= \frac{\sum_{n=1}^N \gamma_{nk} (x_n - \mu_k)(x_n - \mu_k)^T}{\sum_{n=1}^N \gamma_{nk}} \\
\mu_k &= \frac{\sum_{n=1}^N \gamma_{nk} x_n}{\sum_{n=1}^N \gamma_{nk}}
\end{align*}
}}

\newcommand{\MGReg}{{
The mixture of Gaussians assigns probabilities for each cluster to each data point, and as such is capable of capturing ambiguities in the data set.
}}

\newcommand{\MGCompl}{{
To be added.
}}
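
% Worked example, not referenced in the table below: a numeric E-step from
% \MGTrain, assuming two 1-D Gaussian components with made-up parameters
% (equal mixing weights and unit variances, so the normalisation constants
% cancel).
\newcommand{\MGExample}{
With $K = 2$, $\pi = (0.5, 0.5)$, $\mu = (0, 4)$, $\sigma = (1, 1)$ and $x_n = 1$:
\begin{align*}
\gamma_{n1} = \frac{0.5\, \mathcal{N}(1 | 0, 1)}{0.5\, \mathcal{N}(1 | 0, 1) + 0.5\, \mathcal{N}(1 | 4, 1)}
= \frac{e^{-1/2}}{e^{-1/2} + e^{-9/2}} \approx 0.98,
\end{align*}
and $\gamma_{n2} = 1 - \gamma_{n1} \approx 0.02$; the M-step then re-estimates $\pi, \mu, \Sigma$ from these responsibilities.
}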

\newcommand{\MGNonl}{{
Not applicable.
}}

\newcommand{\MGOnl}{{
\raggedright
Online Gaussian Mixture Models. A good start is:
\newline

\emph{A View of the EM Algorithm that Justifies Incremental, Sparse, and Other Variants}, Neal \& Hinton (1998).
}}

\begin{document}

\ifpdf
\DeclareGraphicsExtensions{.pdf, .jpg, .tif}
\else
\DeclareGraphicsExtensions{.eps, .jpg}
\fi
%
% \maketitle
%
%
% \begin{abstract}
% \end{abstract}
%
\begin{center}
\section*{\sc \LARGE Cheat Sheet: Algorithms for Supervised and Unsupervised Learning \footnote{Created by \href{http://eferm.com}{Emanuel Ferm}, HT2011, for semi-procrastinational reasons while studying for a \href{http://www.comlab.ox.ac.uk/teaching/courses/2010-2011/machinelearning/}{Machine Learning} exam. Last updated \today.}}
\end{center}

\begin{table}[H]
\begin{center}
% \noalign{\smallskip}
\begin{footnotesize}
\begin{tabular}{@{\extracolsep{\fill}}
>{\raggedright}
m{2cm} >{\raggedright}
m{5cm} >{\raggedright}
m{\ColWidth{}} >{\raggedright}
m{7cm} >{\raggedright}
m{8cm} >{\raggedright}
m{7cm} >{\raggedright}
m{\ColWidth{}} >{\raggedright}
m{6cm} m{\ColWidth{}}}
\sc{Algorithm} & \sc{Description} & \sc{Model} & \sc{Objective} & \sc{Training} & \sc{Regularisation} & \sc{Complexity} & \sc{Non-linear} & \sc{Online learning} \\
\hline
\hline \noalign{\smallskip}
\textbf{$k$-nearest
neighbour} & \KNNDescr{} & \KNNModel{} & \KNNObj{} & \KNNTrain{} & \KNNReg{} & \KNNCompl{} & \KNNNonl{} & \KNNOnl{} \\
\noalign{\smallskip} \hline \noalign{\smallskip}
\textbf{Naive Bayes} & \NBDescr{} & \NBModel{} & \NBObj{} & \NBTrain{} & \NBReg{} & \NBCompl{} & \NBNonl{} & \NBOnl{} \\
\noalign{\smallskip} \hline \noalign{\smallskip}
\textbf{Log-linear} & \LLDescr{} & \LLModel{} & \LLObj{} & \LLTrain{} & \LLReg{} & \LLCompl{} & \LLNonl{} & \LLOnl{} \\
\noalign{\smallskip} \hline \noalign{\smallskip}
\textbf{Perceptron} & \PDescr{} & \PModel{} & \PObj{} & \PTrain{} & \PReg{} & \PCompl{} & \PNonl{} & \POnl{} \\
\noalign{\smallskip} \hline \noalign{\smallskip}
\textbf{Support vector
machines} & \SVMDescr{} & \SVMModel{} & \SVMObj{} & \SVMTrain{} & \SVMReg{} & \SVMCompl{} & \SVMNonl{} & \SVMOnl{} \\
\noalign{\smallskip} \hline \noalign{\smallskip}
\textbf{$k$-means} & \KMDescr{} & \KMModel{} & \KMObj{} & \KMTrain{} & \KMReg{} & \KMCompl{} & \KMNonl{} & \KMOnl{} \\
\noalign{\smallskip} \hline \noalign{\smallskip}
\textbf{Mixture of
Gaussians} & \MGDescr{} & \MGModel{} & \MGObj{} & \MGTrain{} & \MGReg{} & \MGCompl{} & \MGNonl{} & \MGOnl{} \\
\end{tabular}
\end{footnotesize}
\end{center}
\end{table}
% \bibliographystyle{plain}
% \bibliography{}
\end{document}
--------------------------------------------------------------------------------