├── README.md
├── cheat.pdf
└── cheat.tex

/README.md:
--------------------------------------------------------------------------------
A summary of common machine learning algorithms, provided in a handy format.

**Classifiers included are:**

* k-nearest neighbour
* Naive Bayes
* Log-linear
* Perceptron
* Support Vector Machines

**Clustering (EM) algorithms included are:**

* k-means
* Mixture of Gaussians

--------------------------------------------------------------------------------
/cheat.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eferm/mlcheatsheet/8e34fb35aed981683f3398df99bb7bf978837afe/cheat.pdf
--------------------------------------------------------------------------------
/cheat.tex:
--------------------------------------------------------------------------------
%
% untitled
%
% Created by Emanuel Ferm on 2011-04-25.
% Copyright (c) 2011 __MyCompanyName__. All rights reserved.
%
\documentclass[landscape,a2paper,8pt]{article}

% Use utf-8 encoding for foreign characters
\usepackage[utf8]{inputenc}

% Setup for fullpage use
\usepackage{fullpage}
\usepackage{float}
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage[hmargin=1cm,vmargin=1cm]{geometry}
\usepackage{mdwlist}
\usepackage{array}
\usepackage{hyperref}
\usepackage{nopageno}

% Uncomment some of the following if you use the features
%
% Running Headers and footers
%\usepackage{fancyhdr}

% Multipart figures
%\usepackage{subfigure}

% More symbols
%\usepackage{amsmath}
%\usepackage{amssymb}
%\usepackage{latexsym}

% Surround parts of graphics with box
\usepackage{boxedminipage}

% Package for including code in the document
\usepackage{listings}

% If you want to generate a toc for each chapter (use with book)
%\usepackage{minitoc}

% This is now the recommended way for checking for PDFLaTeX:
\usepackage{ifpdf}

%\newif\ifpdf
%\ifx\pdfoutput\undefined
%\pdffalse % we are not running PDFLaTeX
%\else
%\pdfoutput=1 % we are running PDFLaTeX
%\pdftrue
%\fi

\ifpdf
\usepackage[pdftex]{graphicx}
\else
\usepackage{graphicx}
\fi

\DeclareMathOperator*{\argmax}{arg\,max\ }
\DeclareMathOperator*{\argmin}{arg\,min\ }
\DeclareMathOperator*{\sign}{sign}
\newcommand{\E}{\mathop{\mathbb E}}

\renewcommand{\c}[1]{
}

\renewcommand{\labelitemi}{{\tiny$\bullet$}}

\newcommand{\ColWidth}{
5cm
}

\newcommand{\RowHeight}{
4cm
}

\newcommand{\KNNDescr}{
A new point $\hat{x}$ is assigned the most frequent label $\hat{t}$ among its $k$ nearest training instances.
}

\newcommand{\KNNModel}{
\begin{align*}
\hat{t} = \argmax_{\mathcal{C}} \sum_{i:x_{i} \in N_k(\boldsymbol{x},\hat{x})} \delta(t_i, \mathcal{C})
\end{align*}
\begin{itemize}
\item $N_k(\boldsymbol{x},\hat{x}) \leftarrow$ $k$ points in $\boldsymbol{x}$ closest to $\hat{x}$
\item Euclidean distance formula: $\sqrt{\sum_{i=1}^{D} (x_i - \hat{x}_i)^2}$
\item $\delta(a,b) \leftarrow$ 1 if $a = b$; 0 o/w
\end{itemize}
}

\newcommand{\KNNObj}{
No optimisation needed.
}
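
% Worked example, not referenced in the table below: a minimal numeric
% illustration of the k-NN decision rule in \KNNModel, assuming k = 3, two
% classes, and made-up neighbour labels.
\newcommand{\KNNExample}{
With $k = 3$, suppose the three nearest neighbours of $\hat{x}$ (i.e. $N_3(\boldsymbol{x},\hat{x})$) carry labels $\mathcal{C}_1, \mathcal{C}_2, \mathcal{C}_1$. Then
\begin{align*}
\sum_i \delta(t_i, \mathcal{C}_1) = 2, \qquad \sum_i \delta(t_i, \mathcal{C}_2) = 1,
\end{align*}
so $\hat{t} = \mathcal{C}_1$.
}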

\newcommand{\KNNTrain}{
Use cross-validation to choose an appropriate $k$; otherwise there is no training step, since classification is based directly on the stored training points.
}

\newcommand{\KNNReg}{
$k$ acts to regularise the classifier: as $k \rightarrow N$ the decision boundary becomes smoother.
}

\newcommand{\KNNCompl}{
$\mathcal{O}(NM)$ space complexity, since all training instances and all their features need to be kept in memory.
}

\newcommand{\KNNNonl}{
Natively finds non-linear boundaries.
}

\newcommand{\KNNOnl}{
To be added.
}

\newcommand{\NBDescr}{
Learn $p(\mathcal{C}_k | x)$ by modelling $p(x | \mathcal{C}_k)$ and $p(\mathcal{C}_k)$, using Bayes' rule to infer the class posterior probability. Assumes each feature is conditionally independent of all others given the class, ergo `Naive.'
}

\newcommand{\NBModel}{
{
\begin{align*}
y(\boldsymbol{x}) &= \argmax_k p(\mathcal{C}_k | x) \\
&= \argmax_k p(x | \mathcal{C}_k) \times p(\mathcal{C}_k) \\
&= \argmax_k \prod_{i=1}^D p(x_i | \mathcal{C}_k) \times p(\mathcal{C}_k) \\
&= \argmax_k \sum_{i=1}^D \log p(x_i | \mathcal{C}_k) + \log p(\mathcal{C}_k)
\end{align*}
}}

\newcommand{\NBObj}{
No optimisation needed.
}

\newcommand{\NBTrain}{{
\textbf{Multivariate likelihood}
$
\log p(x | \mathcal{C}_k) = \sum_{i=1}^D \log p(x_i | \mathcal{C}_k)
$
\begin{multline*}
p_{\text{MLE}}(x_i = v | \mathcal{C}_k) = \frac{\sum_{j=1}^N \delta(t_j = \mathcal{C}_k \wedge x_{ji} = v)}{\sum_{j=1}^N \delta(t_j = \mathcal{C}_k)}
\end{multline*}

\textbf{Multinomial likelihood}
$
p(x | \mathcal{C}_k) = \prod_{i=1}^D p(\text{word}_i | \mathcal{C}_k)^{x_i}
$
\begin{multline*}
p_{\text{MLE}}(\text{word}_i = v | \mathcal{C}_k) = \frac{\sum_{j=1}^N \delta(t_j = \mathcal{C}_k) \times x_{ji}}{\sum_{j=1}^N \sum_{d=1}^D \delta(t_j = \mathcal{C}_k) \times x_{jd}}
\end{multline*}

\noindent \ldots where:
\begin{itemize*}
\item $x_{ji}$ is the count of word $i$ in training example $j$;
\item $x_{jd}$ is the count of word $d$ in training example $j$.
\end{itemize*}

\noindent \textbf{Gaussian likelihood}
$
p(x | \mathcal{C}_k) = \prod_{i=1}^D \mathcal{N}(x_i; \mu_{ik}, \sigma_{ik})
$
}}

\newcommand{\NBReg}{{
Use a Dirichlet prior on the parameters to obtain a MAP estimate.
\newline

\textbf{Multivariate likelihood}
\begin{multline*}
p_{\text{MAP}}(x_i = v | \mathcal{C}_k) = \\
\frac{(\beta_i - 1) + \sum_{j=1}^N \delta(t_j = \mathcal{C}_k \wedge x_{ji} = v)}{|x_i|(\beta_i - 1) + \sum_{j=1}^N \delta(t_j = \mathcal{C}_k)}
\end{multline*}

\noindent \textbf{Multinomial likelihood}
\begin{multline*}
p_{\text{MAP}}(\text{word}_i = v | \mathcal{C}_k) = \\
\frac{(\alpha_i - 1) + \sum_{j=1}^N \delta(t_j = \mathcal{C}_k) \times x_{ji}}{\sum_{j=1}^N \sum_{d=1}^D \left( \delta(t_j = \mathcal{C}_k) \times x_{jd} \right) - D + \sum_{d=1}^D \alpha_d}
\end{multline*}
}}

\newcommand{\NBCompl}{{
$\mathcal{O}(NM)$, since each training instance must be visited and each of its features counted.
}}
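
% Worked example, not referenced in the table below: a small numeric instance
% of the multivariate MLE and MAP estimates in \NBTrain and \NBReg, with
% made-up counts (10 examples of class 1, of which 7 have x_i = 1, a binary
% feature so |x_i| = 2, and a Dirichlet prior with beta_i = 2).
\newcommand{\NBExample}{
Suppose class $\mathcal{C}_1$ has $\sum_{j=1}^N \delta(t_j = \mathcal{C}_1) = 10$ training examples, of which 7 have $x_i = 1$, with $|x_i| = 2$ possible values and prior $\beta_i = 2$. Then
\begin{align*}
p_{\text{MLE}}(x_i = 1 | \mathcal{C}_1) = \frac{7}{10}, \qquad
p_{\text{MAP}}(x_i = 1 | \mathcal{C}_1) = \frac{(2-1) + 7}{2(2-1) + 10} = \frac{8}{12}.
\end{align*}
}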

\newcommand{\NBNonl}{{
Can only learn linear boundaries for multivariate/multinomial attributes.
\newline

With Gaussian attributes, quadratic boundaries can be learned with uni-modal distributions.
}}

\newcommand{\NBOnl}{{
To be added.
}}

\newcommand{\LLDescr}{{
Estimate $p(\mathcal{C}_k | x)$ directly, by assuming a maximum-entropy (log-linear) form and optimising the conditional likelihood of the training data.
}}

\newcommand{\LLModel}{{
\begin{align*}
y(x) &= \argmax_k p(\mathcal{C}_k | x) \\
&= \argmax_k \sum_m \lambda_m \phi_m(x, \mathcal{C}_k)
% &= \argmax_k \frac{1}{Z_{\lambda}(x)} e^{\sum_m \lambda_m \phi_m(x, \mathcal{C}_k)}
\end{align*}

\noindent \ldots where:
\begin{align*}
&p(\mathcal{C}_k | x) = \frac{1}{Z_{\lambda}(x)} e^{\sum_m \lambda_m \phi_m(x, \mathcal{C}_k)} \\
&Z_{\lambda}(x) = \sum_k e^{\sum_m \lambda_m \phi_m(x, \mathcal{C}_k)}
\end{align*}
}}

\newcommand{\LLObj}{{
Minimise the negative log-likelihood:
\begin{flalign*}
&\mathcal{L}_{\text{MLE}}(\lambda, \mathcal{D}) = -\log \prod_{(x,t) \in \mathcal{D}} p(t | x) = - \sum_{(x,t) \in \mathcal{D}} \log p(t | x) \\
& \qquad = \sum_{(x,t) \in \mathcal{D}} \left( \log Z_{\lambda}(x) - \sum_m \lambda_m \phi_m(x, t) \right) \\
& \qquad = \sum_{(x,t) \in \mathcal{D}} \left( \log \sum_k e^{\sum_m \lambda_m \phi_m(x, \mathcal{C}_k)} - \sum_m \lambda_m \phi_m(x, t) \right)
\end{flalign*}
}}

\newcommand{\LLTrain}{{
Gradient descent (or gradient ascent if maximising the objective):
\begin{align*}
\lambda^{n+1} = \lambda^n - \eta \nabla \mathcal{L}
\end{align*}

\noindent \ldots where $\eta$ is the step parameter.

\begin{align*}
&\nabla \mathcal{L}_{\text{MLE}}(\lambda, \mathcal{D}) = \sum_{(x,t) \in \mathcal{D}} \left( \E[\phi(x,\cdot)] - \phi(x,t) \right) \\
&\nabla \mathcal{L}_{\text{MAP}}(\lambda, \mathcal{D}, \sigma) = \frac{\lambda}{\sigma^2} + \sum_{(x,t) \in \mathcal{D}} \E[\phi(x,\cdot)] - \sum_{(x,t) \in \mathcal{D}} \phi(x,t)
\end{align*}

\noindent \ldots where $\sum_{(x,t) \in \mathcal{D}} \phi(x,t)$ are the empirical feature counts, and the expected feature counts are
\begin{align*}
\E[\phi(x,\cdot)] = \sum_{k} p(\mathcal{C}_k | x)\, \phi(x, \mathcal{C}_k)
\end{align*}
}}

\newcommand{\LLReg}{{
Penalise large values for the $\lambda$ parameters, by introducing a prior distribution over them (typically a Gaussian).
\newline

\textbf{Objective function}
\begin{align*}
\mathcal{L}_{\text{MAP}}(\lambda, \mathcal{D}, \sigma) &= - \log p(\lambda) - \sum_{(x,t) \in \mathcal{D}} \log p(t | x) \\
&= - \log e^{-\frac{(0-\lambda)^2}{2\sigma^2}} - \sum_{(x,t) \in \mathcal{D}} \log p(t | x) \\
&= \frac{\sum_m \lambda_m^2}{2\sigma^2} - \sum_{(x,t) \in \mathcal{D}} \log p(t | x)
\end{align*}
}}

\newcommand{\LLCompl}{{
$\mathcal{O}(INMK)$, since each training instance must be visited and each combination of class and features must be calculated for the appropriate feature mapping.
}}
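
% Worked example, not referenced in the table below: a numeric evaluation of
% the log-linear posterior in \LLModel, assuming two classes and made-up
% unnormalised scores of 2 and 0.
\newcommand{\LLExample}{
Suppose $\sum_m \lambda_m \phi_m(x, \mathcal{C}_1) = 2$ and $\sum_m \lambda_m \phi_m(x, \mathcal{C}_2) = 0$. Then
\begin{align*}
Z_{\lambda}(x) = e^{2} + e^{0} \approx 8.39, \qquad
p(\mathcal{C}_1 | x) = \frac{e^{2}}{e^{2} + e^{0}} \approx 0.88, \qquad
p(\mathcal{C}_2 | x) \approx 0.12,
\end{align*}
so $y(x) = \mathcal{C}_1$; the $\argmax$ can equally be taken over the unnormalised scores, since $Z_{\lambda}(x)$ is shared by all classes.
}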

\newcommand{\LLNonl}{{
Reformulate the class conditional distribution in terms of a kernel $K(x,x')$, and use a non-linear kernel (for example $K(x,x') = (1 + x^T x')^2$). By the Representer Theorem:

\begin{align*}
p(\mathcal{C}_k | x) &= \frac{1}{Z_{\lambda}(x)} e^{\lambda^T \phi(x, \mathcal{C}_k)} \\
&= \frac{1}{Z_{\lambda}(x)} e^{\sum_{n=1}^N \sum_{i=1}^K \alpha_{ni} \phi(x_n, \mathcal{C}_i)^T \phi(x, \mathcal{C}_k)} \\
&= \frac{1}{Z_{\lambda}(x)} e^{\sum_{n=1}^N \sum_{i=1}^K \alpha_{ni} K((x_n, \mathcal{C}_i),(x,\mathcal{C}_k))} \\
&= \frac{1}{Z_{\lambda}(x)} e^{\sum_{n=1}^N \alpha_{nk} K(x_n, x)}
\end{align*}
}}

\newcommand{\LLOnl}{{
\raggedright
Online Gradient Descent: update the parameters using GD after seeing each training instance.
}}

\newcommand{\PDescr}{{
Directly estimate the linear function $y(x)$ by iteratively updating the weight vector whenever a training instance is misclassified.
}}

\newcommand{\PModel}{{
Binary, linear classifier:
\begin{align*}
y(x) = \sign(\boldsymbol{w}^T x)
\end{align*}

\noindent \ldots where:
\begin{align*}
\sign(x) = \left\{
\begin{array}{l l}
+1 & \quad \text{if } x \geq 0 \\
-1 & \quad \text{if } x < 0 \\
\end{array} \right.
\end{align*}

\noindent Multiclass perceptron:
\begin{align*}
y(x) = \argmax_{\mathcal{C}_k} \boldsymbol{w}^T \phi(x, \mathcal{C}_k)
\end{align*}
}}

\newcommand{\PObj}{{
Minimise the perceptron criterion, an error function that penalises each misclassified input vector in proportion to how far it lies on the wrong side of the boundary:
\begin{align*}
\argmin_{\boldsymbol{w}} E_P(\boldsymbol{w}) = \argmin_{\boldsymbol{w}} - \sum_{n \in \mathcal{M}} \boldsymbol{w}^T x_n t_n
\end{align*}

\noindent \ldots where $\mathcal{M}$ is the set of misclassified training vectors.

%A boundary with 100\% accuracy is found when the perceptron criterion is satisfied: $\boldsymbol{w}^T x t > 0$.
}}

\newcommand{\PTrain}{{
Iterate over each training example $x_n$, and update the weight vector on misclassification:
\begin{align*}
\boldsymbol{w}^{i+1} &= \boldsymbol{w}^i - \eta \nabla E_P(\boldsymbol{w}) \\
&= \boldsymbol{w}^i + \eta x_n t_n
\end{align*}

\noindent \ldots where typically $\eta = 1$.
\newline

\noindent For the multiclass perceptron:
\begin{align*}
\boldsymbol{w}^{i+1} = \boldsymbol{w}^i + \phi(x, t) - \phi(x, y(x))
\end{align*}
}}

\newcommand{\PReg}{{
The Voted Perceptron: run the perceptron $i$ times and store each iteration's weight vector. Then:
\begin{align*}
y(x) = \sign \left( \sum_i c_i \times \sign(\boldsymbol{w}_i^T x) \right)
\end{align*}
\ldots where $c_i$ is the number of correctly classified training instances for $\boldsymbol{w}_i$.
}}

\newcommand{\PCompl}{{
$\mathcal{O}(INMK)$, since each combination of instance, class and features must be calculated (see log-linear).
}}

\newcommand{\PNonl}{{
Use a kernel $K(x,x')$, and 1 weight per training instance:
\begin{align*}
y(x) = \sign \left( \sum_{n=1}^N w_n t_n K(x, x_n) \right)
\end{align*}

\noindent \ldots and the update (whenever $x_n$ is misclassified):
\begin{align*}
w_n^{i+1} = w_n^i + 1
\end{align*}
}}

\newcommand{\POnl}{{
\raggedright
The perceptron is an online algorithm by default.
}}
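
% Worked example, not referenced in the table below: one perceptron update
% step from \PTrain, with a made-up 2-D instance and eta = 1.
\newcommand{\PExample}{
Let $\boldsymbol{w}^i = (1, -1)^T$, $x_n = (1, 2)^T$, $t_n = +1$ and $\eta = 1$. Since $(\boldsymbol{w}^i)^T x_n = -1 < 0$ but $t_n = +1$, the instance is misclassified and
\begin{align*}
\boldsymbol{w}^{i+1} = \boldsymbol{w}^i + \eta\, x_n t_n = (1, -1)^T + (1, 2)^T = (2, 1)^T,
\end{align*}
after which $(\boldsymbol{w}^{i+1})^T x_n = 4 > 0$, i.e. $x_n$ is now classified correctly.
}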

\newcommand{\SVMDescr}{{
A maximum margin classifier: finds the separating hyperplane with the maximum margin to its closest data points.
}}

\newcommand{\SVMModel}{{
\begin{align*}
y(x) = \sum_{n=1}^N \lambda_n t_n x^T x_n + w_0
\end{align*}
}}

\newcommand{\SVMObj}{{
\textbf{Primal}
\begin{align*}
\argmin_{\boldsymbol{w}, w_0} \frac{1}{2} ||\boldsymbol{w}||^2
\end{align*}
\begin{align*}
\text{s.t.} \quad t_n (\boldsymbol{w}^T x_n + w_0) \geq 1 \quad \forall n
\end{align*}

\noindent \textbf{Dual} (maximise)
\begin{align*}
\tilde{\mathcal{L}}(\boldsymbol{\lambda}) = \sum_{n=1}^N \lambda_n - \frac{1}{2} \sum_{n=1}^N \sum_{m=1}^N \lambda_n \lambda_m t_n t_m x_n^T x_m
\end{align*}
\begin{align*}
\text{s.t.} \quad & \lambda_n \geq 0, \quad \sum_{n=1}^N \lambda_n t_n = 0, \quad \forall n
\end{align*}
}}

\newcommand{\SVMTrain}{{
\begin{itemize}
\item Quadratic Programming (QP)
\item SMO, Sequential Minimal Optimisation (chunking).
\end{itemize}
}}

\newcommand{\SVMReg}{{
The soft-margin SVM: allow margin violations via slack variables $\xi_n$, and penalise them in proportion to their size.
\newline

\noindent \textbf{Primal}
\begin{align*}
\argmin_{\boldsymbol{w}, w_0} \frac{1}{2} ||\boldsymbol{w}||^2 + C \sum_{n=1}^N \xi_n
\end{align*}
\begin{align*}
\text{s.t.} \quad t_n (\boldsymbol{w}^T x_n + w_0) \geq 1 - \xi_n, \quad \xi_n \geq 0 \quad \forall n
\end{align*}

\noindent \textbf{Dual} (maximise)
\begin{align*}
\tilde{\mathcal{L}}(\boldsymbol{\lambda}) = \sum_{n=1}^N \lambda_n - \frac{1}{2} \sum_{n=1}^N \sum_{m=1}^N \lambda_n \lambda_m t_n t_m x_n^T x_m
\end{align*}
\begin{align*}
\text{s.t.} \quad 0 \leq \lambda_n \leq C, \quad \sum_{n=1}^N \lambda_n t_n = 0, \quad \forall n
\end{align*}
}}

\newcommand{\SVMCompl}{{
\begin{itemize}
\item QP: $\mathcal{O}(N^3)$;
\item SMO: much more efficient than QP, since computation is based only on the support vectors.
\end{itemize}
}}

\newcommand{\SVMNonl}{{
Use a non-linear kernel $K(x,x')$:

\begin{align*}
y(x) &= \sum_{n=1}^N \lambda_n t_n x^T x_n + w_0 \\
&= \sum_{n=1}^N \lambda_n t_n K(x, x_n) + w_0
\end{align*}

\begin{align*}
\tilde{\mathcal{L}}(\boldsymbol{\lambda}) &= \sum_{n=1}^N \lambda_n - \frac{1}{2} \sum_{n=1}^N \sum_{m=1}^N \lambda_n \lambda_m t_n t_m x_n^T x_m \\
&= \sum_{n=1}^N \lambda_n - \frac{1}{2} \sum_{n=1}^N \sum_{m=1}^N \lambda_n \lambda_m t_n t_m K(x_n,x_m)
\end{align*}
}}

\newcommand{\SVMOnl}{{
\raggedright
Online SVM. See, for example:
\begin{itemize}
\item \emph{The Huller: A Simple and Efficient Online SVM}, Bordes \& Bottou (2005)
\item \emph{Pegasos: Primal Estimated sub-Gradient Solver for SVM}, Shalev-Shwartz et al. (2007)
\end{itemize}
}}
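
% Worked example, not referenced in the table below: the implicit feature map
% of the polynomial kernel mentioned in \LLNonl/\PNonl/\SVMNonl, written out
% for 2-D inputs to show why a kernelised linear boundary is non-linear in x.
\newcommand{\SVMKernelExample}{
For $x, x' \in \mathbb{R}^2$,
\begin{align*}
K(x, x') = (1 + x^T x')^2
&= 1 + 2x_1 x_1' + 2x_2 x_2' + x_1^2 x_1'^2 + x_2^2 x_2'^2 + 2 x_1 x_2 x_1' x_2' \\
&= \phi(x)^T \phi(x')
\end{align*}
with $\phi(x) = (1, \sqrt{2}x_1, \sqrt{2}x_2, x_1^2, x_2^2, \sqrt{2}x_1 x_2)^T$, i.e.\ a linear boundary in $\phi$-space is a quadratic boundary in $x$-space.
}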

\newcommand{\KMDescr}{{
A hard-assignment, geometric clustering algorithm, where each data point is assigned to its closest centroid.
}}

\newcommand{\KMModel}{{
Hard assignments $r_{nk} \in \{0,1\}$ s.t. $\forall n: \sum_k r_{nk} = 1$, i.e. each data point is assigned to exactly one cluster $k$.
\newline

Geometric distance: the Euclidean distance, or $\ell^2$ norm:
\begin{align*}
|| x_n - \mu_k ||_2 = \sqrt{\sum_{i=1}^D (x_{ni} - \mu_{ki})^2}
\end{align*}
}}

\newcommand{\KMObj}{{
\begin{align*}
\argmin_{\boldsymbol{r},\mu} \sum_{n=1}^N \sum_{k=1}^K r_{nk} || x_n - \mu_k ||_2^2
\end{align*}

\noindent \ldots i.e. minimise the total squared distance from each point to its assigned cluster centre.
}}

\newcommand{\KMTrain}{{
\textbf{E}xpectation:
\begin{align*}
r_{nk} = \left\{
\begin{array}{l l}
1 & \quad \text{if } || x_n - \mu_k ||^2 \text{ minimal for } k \\
0 & \quad \text{o/w}
\end{array} \right.
\end{align*}

\textbf{M}aximisation:
\begin{align*}
\mu_{\text{MLE}}^{(k)} = \frac{\sum_n r_{nk} x_n}{\sum_n r_{nk}}
\end{align*}

\noindent \ldots where $\mu^{(k)}$ is the centroid of cluster $k$.
}}

\newcommand{\KMReg}{{
Only hard assignments to clusters are possible.
}}

\newcommand{\KMCompl}{{
To be added.
}}

\newcommand{\KMNonl}{{
For non-linearly separable data, use kernel $k$-means as suggested in:
\newline

\emph{Kernel k-means, Spectral Clustering and Normalized Cuts}, Dhillon et al. (2004).

}}

\newcommand{\KMOnl}{{
\raggedright
Sequential $k$-means: update the centroids after processing one point at a time.
}}

\newcommand{\MGDescr}{{
A probabilistic clustering algorithm, where clusters are modelled as latent Gaussians and each data point is assigned a probability of having been drawn from each Gaussian.
}}

\newcommand{\MGModel}{{
Assignments to clusters by specifying probabilities
\begin{align*}
p(x^{(i)}, z^{(i)}) = p(x^{(i)} | z^{(i)})p(z^{(i)})
\end{align*}

\noindent \ldots with $z^{(i)} \sim \text{Multinomial}(\pi)$, and responsibilities $\gamma_{nk} \equiv p(k | x_n)$ s.t. $\sum_{j=1}^K \gamma_{nj} = 1$. I.e. want to maximise the probability of the observed data $\boldsymbol{x}$.
}}

\newcommand{\MGObj}{{
\begin{align*}
\mathcal{L}(\boldsymbol{x}, \pi, \mu, \Sigma) &= \log p(\boldsymbol{x} | \pi, \mu, \Sigma) \\
&= \sum_{n=1}^N \log \left( \sum_{k=1}^K \pi_k \mathcal{N}(x_n | \mu_k, \Sigma_k) \right)
\end{align*}
}}

\newcommand{\MGTrain}{{
\textbf{E}xpectation: For each $n,k$ set:
\begin{align*}
\gamma_{nk} &= p(z^{(i)} = k | x^{(i)}; \pi, \mu, \Sigma) \quad (= p(k | x_n)) \\
&= \frac{p(x^{(i)} | z^{(i)} = k; \mu, \Sigma)\, p(z^{(i)} = k; \pi)}{\sum_{j=1}^K p(x^{(i)} | z^{(i)} = j; \mu, \Sigma)\, p(z^{(i)} = j; \pi)} \\
&= \frac{\pi_k \mathcal{N}(x_n | \mu_k, \Sigma_k)}{\sum_{j=1}^K \pi_j \mathcal{N}(x_n | \mu_j, \Sigma_j)}
\end{align*}

\textbf{M}aximisation:
\begin{align*}
\pi_{k} &= \frac{1}{N} \sum_{n=1}^N \gamma_{nk} \\
\Sigma_{k} &= \frac{\sum_{n=1}^N \gamma_{nk} (x_n - \mu_k)(x_n - \mu_k)^T}{\sum_{n=1}^N \gamma_{nk}} \\
\mu_k &= \frac{\sum_{n=1}^N \gamma_{nk} x_n}{\sum_{n=1}^N \gamma_{nk}}
\end{align*}
}}

\newcommand{\MGReg}{{
The mixture of Gaussians assigns probabilities for each cluster to each data point, and as such is capable of capturing ambiguities in the data set.
}}

\newcommand{\MGCompl}{{
To be added.
}}
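
% Worked example, not referenced in the table below: a numeric E-step from
% \MGTrain, assuming two 1-D Gaussian components with made-up parameters
% (equal mixing weights and unit variances, so the normalisation constants
% cancel).
\newcommand{\MGExample}{
With $K = 2$, $\pi = (0.5, 0.5)$, $\mu = (0, 4)$, $\sigma = (1, 1)$ and $x_n = 1$:
\begin{align*}
\gamma_{n1} = \frac{0.5\, \mathcal{N}(1 | 0, 1)}{0.5\, \mathcal{N}(1 | 0, 1) + 0.5\, \mathcal{N}(1 | 4, 1)}
= \frac{e^{-1/2}}{e^{-1/2} + e^{-9/2}} \approx 0.98,
\end{align*}
and $\gamma_{n2} = 1 - \gamma_{n1} \approx 0.02$; the M-step then re-estimates $\pi, \mu, \Sigma$ from these responsibilities.
}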

\newcommand{\MGNonl}{{
Not applicable.
}}

\newcommand{\MGOnl}{{
\raggedright
Online Gaussian Mixture Models. A good start is:
\newline

\emph{A View of the EM Algorithm that Justifies Incremental, Sparse, and Other Variants}, Neal \& Hinton (1998).
}}

\begin{document}

\ifpdf
\DeclareGraphicsExtensions{.pdf, .jpg, .tif}
\else
\DeclareGraphicsExtensions{.eps, .jpg}
\fi
%
% \maketitle
%
%
% \begin{abstract}
% \end{abstract}
%
\begin{center}
\section*{\sc \LARGE Cheat Sheet: Algorithms for Supervised and Unsupervised Learning \footnote{Created by \href{http://eferm.com}{Emanuel Ferm}, HT2011, for semi-procrastinational reasons while studying for a \href{http://www.comlab.ox.ac.uk/teaching/courses/2010-2011/machinelearning/}{Machine Learning} exam. Last updated \today.}}
\end{center}

\begin{table}[H]
\begin{center}
% \noalign{\smallskip}
\begin{footnotesize}
\begin{tabular}{@{\extracolsep{\fill}}
>{\raggedright}
m{2cm} >{\raggedright}
m{5cm} >{\raggedright}
m{\ColWidth{}} >{\raggedright}
m{7cm} >{\raggedright}
m{8cm} >{\raggedright}
m{7cm} >{\raggedright}
m{\ColWidth{}} >{\raggedright}
m{6cm} m{\ColWidth{}}}
\sc{Algorithm} & \sc{Description} & \sc{Model} & \sc{Objective} & \sc{Training} & \sc{Regularisation} & \sc{Complexity} & \sc{Non-linear} & \sc{Online learning} \\
\hline
\hline \noalign{\smallskip}
\textbf{$k$-nearest
neighbour} & \KNNDescr{} & \KNNModel{} & \KNNObj{} & \KNNTrain{} & \KNNReg{} & \KNNCompl{} & \KNNNonl{} & \KNNOnl{} \\
\noalign{\smallskip} \hline \noalign{\smallskip}
\textbf{Naive Bayes} & \NBDescr{} & \NBModel{} & \NBObj{} & \NBTrain{} & \NBReg{} & \NBCompl{} & \NBNonl{} & \NBOnl{} \\
\noalign{\smallskip} \hline \noalign{\smallskip}
\textbf{Log-linear} & \LLDescr{} & \LLModel{} & \LLObj{} & \LLTrain{} & \LLReg{} & \LLCompl{} & \LLNonl{} & \LLOnl{} \\
\noalign{\smallskip} \hline \noalign{\smallskip}
\textbf{Perceptron} & \PDescr{} & \PModel{} & \PObj{} & \PTrain{} & \PReg{} & \PCompl{} & \PNonl{} & \POnl{} \\
\noalign{\smallskip} \hline \noalign{\smallskip}
\textbf{Support vector
machines} & \SVMDescr{} & \SVMModel{} & \SVMObj{} & \SVMTrain{} & \SVMReg{} & \SVMCompl{} & \SVMNonl{} & \SVMOnl{} \\
\noalign{\smallskip} \hline \noalign{\smallskip}
\textbf{$k$-means} & \KMDescr{} & \KMModel{} & \KMObj{} & \KMTrain{} & \KMReg{} & \KMCompl{} & \KMNonl{} & \KMOnl{} \\
\noalign{\smallskip} \hline \noalign{\smallskip}
\textbf{Mixture of
Gaussians} & \MGDescr{} & \MGModel{} & \MGObj{} & \MGTrain{} & \MGReg{} & \MGCompl{} & \MGNonl{} & \MGOnl{} \\
\end{tabular}
\end{footnotesize}
\end{center}
\end{table}
% \bibliographystyle{plain}
% \bibliography{}
\end{document}
--------------------------------------------------------------------------------