├── cnn.png ├── data.pdf ├── filter.png ├── fmap.png ├── gnews.png ├── imq1.jpg ├── imq2.png ├── linear.pdf ├── machinelearning.pdf ├── machinelearning.tex ├── maxpool.png ├── mushroom.jpeg ├── mushroom2.jpeg ├── orig.png ├── rbf.pdf ├── relu.pdf └── tanh.pdf /cnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynameisjanus/686xMachineLearning/6cd4a9f9e1c01df41a7f51e8a95560037a76ead2/cnn.png -------------------------------------------------------------------------------- /data.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynameisjanus/686xMachineLearning/6cd4a9f9e1c01df41a7f51e8a95560037a76ead2/data.pdf -------------------------------------------------------------------------------- /filter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynameisjanus/686xMachineLearning/6cd4a9f9e1c01df41a7f51e8a95560037a76ead2/filter.png -------------------------------------------------------------------------------- /fmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynameisjanus/686xMachineLearning/6cd4a9f9e1c01df41a7f51e8a95560037a76ead2/fmap.png -------------------------------------------------------------------------------- /gnews.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynameisjanus/686xMachineLearning/6cd4a9f9e1c01df41a7f51e8a95560037a76ead2/gnews.png -------------------------------------------------------------------------------- /imq1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynameisjanus/686xMachineLearning/6cd4a9f9e1c01df41a7f51e8a95560037a76ead2/imq1.jpg 
-------------------------------------------------------------------------------- /imq2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynameisjanus/686xMachineLearning/6cd4a9f9e1c01df41a7f51e8a95560037a76ead2/imq2.png -------------------------------------------------------------------------------- /linear.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynameisjanus/686xMachineLearning/6cd4a9f9e1c01df41a7f51e8a95560037a76ead2/linear.pdf -------------------------------------------------------------------------------- /machinelearning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynameisjanus/686xMachineLearning/6cd4a9f9e1c01df41a7f51e8a95560037a76ead2/machinelearning.pdf -------------------------------------------------------------------------------- /machinelearning.tex: -------------------------------------------------------------------------------- 1 | \documentclass[10pt,landscape,a4paper]{article} 2 | \usepackage[english]{babel} 3 | \usepackage[utf8]{inputenc} 4 | \usepackage[plain]{algorithm} 5 | \usepackage[noend]{algpseudocode} 6 | \usepackage{tikz} 7 | \usepackage{pgfplots} 8 | \usepackage{palatino} 9 | \usepackage{multicol} 10 | \usepackage{blkarray} 11 | 12 | \usepackage{calc} 13 | \usepackage{ifthen} 14 | \usepackage[landscape]{geometry} 15 | \usepackage{graphicx} 16 | \usepackage{amsmath, amssymb, amsthm} 17 | \DeclareMathOperator*{\argmin}{argmin} 18 | \DeclareMathOperator*{\argmax}{argmax} 19 | 20 | \usepackage{physics} 21 | \usepackage{latexsym, marvosym} 22 | \usepackage{pifont} 23 | \usepackage{lscape} 24 | \usepackage{dsfont} 25 | \usepackage{graphicx} 26 | \usepackage{array} 27 | \usepackage{booktabs} 28 | \usepackage[bottom]{footmisc} 29 | \usepackage{tikz} 30 | \usetikzlibrary{shapes} 31 | \usepackage{pdfpages} 32 | 
\usepackage{wrapfig} 33 | \usepackage{enumitem} 34 | \setlist[description]{leftmargin=0pt} 35 | \usepackage{xfrac} 36 | \usepackage[pdftex, 37 | pdfauthor={Janus Advincula}, 38 | pdftitle={Machine Learning}, 39 | pdfsubject={A cheatsheet pdf and reference guide made for MIT's 6.86x course.}, 40 | pdfkeywords={machine learning} {statistics} {cheatsheet} {pdf} {cheat} {sheet} {formulas} {equations} 41 | ]{hyperref} 42 | \usepackage[ 43 | open, 44 | openlevel=2 45 | ]{bookmark} 46 | \usepackage{relsize} 47 | \usepackage{rotating} 48 | 49 | 50 | \newcommand\independent{\protect\mathpalette{\protect\independenT}{\perp}} 51 | \def\independenT#1#2{\mathrel{\setbox0\hbox{$#1#2$}% 52 | \copy0\kern-\wd0\mkern4mu\box0}} 53 | 54 | \newcommand{\noin}{\noindent} 55 | \newcommand{\logit}{\textrm{logit}} 56 | %\newcommand{\var}{\textrm{Var}} 57 | \newcommand{\cov}{\textrm{Cov}} 58 | \newcommand{\corr}{\textrm{Corr}} 59 | \newcommand{\N}{\mathcal{N}} 60 | \newcommand{\Bern}{\textrm{Bern}} 61 | \newcommand{\Bin}{\textrm{Bin}} 62 | \newcommand{\Beta}{\textrm{Beta}} 63 | \newcommand{\Gam}{\textrm{Gamma}} 64 | \newcommand{\Expo}{\textrm{Expo}} 65 | \newcommand{\Pois}{\textrm{Pois}} 66 | \newcommand{\Unif}{\textrm{Unif}} 67 | \newcommand{\Geom}{\textrm{Geom}} 68 | \newcommand{\NBin}{\textrm{NBin}} 69 | \newcommand{\Hypergeometric}{\textrm{HGeom}} 70 | \newcommand{\HGeom}{\textrm{HGeom}} 71 | \newcommand{\Mult}{\textrm{Mult}} 72 | 73 | \geometry{top=.4in,left=.2in,right=.2in,bottom=.4in} 74 | 75 | \pagestyle{empty} 76 | \makeatletter 77 | \renewcommand{\section}{\@startsection{section}{1}{0mm}% 78 | {-1ex plus -.5ex minus -.2ex}% 79 | {0.5ex plus .2ex}%x 80 | {\normalfont\large\bfseries}} 81 | \renewcommand{\subsection}{\@startsection{subsection}{2}{0mm}% 82 | {-1explus -.5ex minus -.2ex}% 83 | {0.5ex plus .2ex}% 84 | {\normalfont\normalsize\bfseries}} 85 | \renewcommand{\subsubsection}{\@startsection{subsubsection}{3}{0mm}% 86 | {-1ex plus -.5ex minus -.2ex}% 87 | {1ex plus .2ex}% 88 | 
{\normalfont\small\bfseries}} 89 | \makeatother 90 | 91 | \setcounter{secnumdepth}{0} 92 | 93 | \setlength{\parindent}{0pt} 94 | \setlength{\parskip}{0pt plus 0.5ex} 95 | 96 | % ----------------------------------------------------------------------- 97 | 98 | \usepackage{titlesec} 99 | 100 | \titleformat{\section} 101 | {\color{blue}\normalfont\large\bfseries} 102 | {\color{blue}\thesection}{1em}{} 103 | \titleformat{\subsection} 104 | {\color{violet}\normalfont\normalsize\bfseries} 105 | {\color{violet}\thesection}{1em}{} 106 | % Comment out the above 5 lines for black and white 107 | 108 | \begin{document} 109 | 110 | \raggedright 111 | \footnotesize 112 | \begin{multicols*}{3} 113 | 114 | % multicol parameters 115 | % These lengths are set only within the two main columns 116 | %\setlength{\columnseprule}{0.25pt} 117 | \setlength{\premulticols}{1pt} 118 | \setlength{\postmulticols}{1pt} 119 | \setlength{\multicolsep}{1pt} 120 | \setlength{\columnsep}{2pt} 121 | 122 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 123 | %%% TITLE 124 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 125 | 126 | \begin{center} 127 | {\color{blue} \Large{\textbf{6.86x Machine Learning with Python}}} \\ 128 | % {\Large{\textbf{Probability Cheatsheet}}} \\ 129 | % comment out line with \color{blue} and uncomment above line for b&w 130 | \end{center} 131 | 132 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 133 | %%% ATTRIBUTIONS 134 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 135 | 136 | \scriptsize 137 | 138 | This is a cheat sheet for machine learning based on the online course given by Prof. Tommi Jaakkola and Prof. Regina Barzilay. Compiled by Janus B. Advincula. 
139 | 140 | \begin{center} 141 | Last Updated \today 142 | \end{center} 143 | 144 | % Cheatsheet format from 145 | % http://www.stdout.org/$\sim$winston/latex/ 146 | 147 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 148 | %%% BEGIN CHEATSHEET 149 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 150 | 151 | 152 | \section{Linear Classifiers}\smallskip \hrule height 1pt \smallskip 153 | 154 | \subsection{Introduction to Machine Learning} 155 | 156 | \begin{description} 157 | \item[What is machine learning?] Machine learning as a discipline aims to design, understand and apply computer programs that learn from experience (i.e., data) for the purpose of modeling, prediction or control. 158 | \item[Types of Machine Learning] ~ 159 | \begin{itemize} 160 | \item {\bf Supervised learning:} prediction based on examples of correct behavior 161 | \item {\bf Unsupervised learning:} no explicit target, only data, goal is to model/discover 162 | \item {\bf Semi-supervised learning:} supplement limited annotations with unsupervised learning 163 | \item {\bf Active learning:} learn to query the examples actually needed for learning 164 | \item {\bf Transfer learning:} how to apply what you have learned from $A$ to $B$ 165 | \item {\bf Reinforcement learning:} learning to act, not just predict; goal is to optimize the consequences of actions 166 | \end{itemize} 167 | \end{description} 168 | 169 | \subsection{Linear Classifier and Perceptron} 170 | \begin{center} 171 | \begin{tikzpicture}[scale=0.5] 172 | \draw[thick,-] (1,5.25) -- (2,0.5); 173 | \draw [fill,red] (3,4) circle [radius=0.075]; 174 | \draw [fill,red] (3.5,2.5) circle [radius=0.075]; 175 | \node[below right] at (0,0.5) {classifier: $h(x)=0$}; 176 | \draw [fill, blue] (1.15,1.5) circle [radius=0.075]; 177 | \draw [fill, blue] (-0.5,4) circle [radius=0.075]; 178 | \draw [fill, blue] (-0.75,3) circle [radius=0.075]; 179 | \draw [fill, blue] (0.75,3.7842) circle [radius=0.075]; 180 | \draw [fill, blue] (0.3212,0.963) circle [radius=0.075]; 
181 | \draw [fill, red] (2.1954,3.7059) circle [radius=0.075]; 182 | \draw [fill, red] (1.6413,4.1978) circle [radius=0.075]; 183 | \draw [fill, red] (2.2835,2.6196) circle [radius=0.075]; 184 | \draw [fill, blue] (0.2869,2) circle [radius=0.075]; 185 | \draw [fill, red] (2.9040,1.5912) circle [radius=0.075]; 186 | \node[below right] at (-2.4,5.25) {$h(x)=+1$}; 187 | \node[below right] at (1.5,5.25) {$h(x)=-1$}; 188 | \end{tikzpicture} 189 | \end{center} 190 | \begin{description}[itemsep=0pt] 191 | \item[Key Concepts] ~ 192 | \begin{itemize} 193 | \item {\bf feature vectors, labels:} $$x\in\mathbb{R}^d,\quad y\in\{-1,+1\}$$ 194 | \item {\bf training set:} $$S_n=\left\{\left(x^{(i)},y^{(i)}\right),i=1,\dots,n\right\}$$ 195 | \item {\bf classifier:} $$h:\mathbb{R}^d\rightarrow\{-1,+1\}$$ 196 | $$\chi^{+}=\left\{x\in\mathbb{R}^d:h(x)=+1\right\}$$ 197 | $$\chi^{-}=\left\{x\in\mathbb{R}^d:h(x)=-1\right\}$$ 198 | \item {\bf training error:} $$\mathcal{E}_n(h)=\frac{1}{n}\sum_{i=1}^{n}\left[\left[h\left(x^{(i)}\right)\neq y^{(i)}\right]\right]$$ 199 | $$\left[\left[h\left(x^{(i)}\right)\neq y^{(i)}\right]\right]= 200 | \begin{cases} 201 | 1\quad\text{if error}\\ 202 | 0\quad\text{otherwise} 203 | \end{cases}$$ 204 | \item {\bf test error:} $\mathcal{E}(h)$ 205 | \item {\bf set of classifiers:} $h\in\mathcal{H}$ 206 | \end{itemize} 207 | \item[Linear Classifiers through the Origin] We consider functions of the form 208 | $$h(x;\theta)=\text{sign}\left(\theta_1x_1+\cdots+\theta_dx_d\right)=\text{sign}\left(\theta\cdot x\right).$$ 209 | \begin{center} 210 | \begin{tikzpicture}[scale=0.5] 211 | \draw[ultra thick,-] (1,5.25) -- (2,0.5); 212 | \draw [fill,red] (3,4) circle [radius=0.075]; 213 | \draw [fill,red] (3.5,2.5) circle [radius=0.075]; 214 | \node[below right] at (1,0.5) {$\theta\cdot x=0$}; 215 | \draw[thick, red, ->] (1.5,2.75) -- (0,2.45); 216 | \draw[thick, ->] (1.5,2.75) -- (0.35,2.03); 217 | \node[above left] at (1,2.55) {$\theta$}; 218 | \node[below left] at 
(0.35,2.03) {$x$}; 219 | \draw [fill, blue] (1.15,1.5) circle [radius=0.075]; 220 | \draw [fill, blue] (-0.5,4) circle [radius=0.075]; 221 | \draw [fill, blue] (-0.75,3) circle [radius=0.075]; 222 | \draw [fill, blue] (0.75,3.7842) circle [radius=0.075]; 223 | \draw [fill, blue] (0.3212,0.963) circle [radius=0.075]; 224 | \draw [fill, red] (2.1954,3.7059) circle [radius=0.075]; 225 | \draw [fill, red] (1.6413,4.1978) circle [radius=0.075]; 226 | \draw [fill, red] (2.2835,2.6196) circle [radius=0.075]; 227 | \draw [fill, blue] (0.2869,2) circle [radius=0.075]; 228 | \draw [fill, red] (2.9040,1.5912) circle [radius=0.075]; 229 | \node[below right] at (-1.75,5) {$\theta\cdot x>0$}; 230 | \node[below right] at (1.75,5) {$\theta\cdot x<0$}; 231 | \end{tikzpicture} 232 | \end{center} 233 | \item[Linear Classifiers with Offset] We can consider functions of the form $$h(x;\theta)=\text{sign}\left(\theta\cdot x+\theta_0\right)$$ 234 | where $\theta_0$ is the offset parameter. 235 | \item[Linear Separation] Training examples $S_n$ are {\it linearly separable} if there exists a parameter vector $\widehat{\theta}$ and offset parameter $\widehat{\theta}_0$ such that $y^{(i)}\left(\widehat{\theta}\cdot x^{(i)}+\widehat{\theta}_0\right)>0$ for all $i=1,\dots,n$. 
236 | \item[Training Error] The training error for a linear classifier is 237 | $$\mathcal{E}_n\left(\theta,\theta_0\right)=\frac{1}{n}\sum_{i=1}^{n}\left[\left[y^{(i)}\left(\theta\cdot x^{(i)}+\theta_0\right)\leq0\right]\right].$$ 238 | \item[Perceptron Algorithm] ~ 239 | \vspace*{-1em} 240 | \begin{algorithm}[H] 241 | {\fontsize{7}{0} 242 | \begin{algorithmic} 243 | \Procedure{Perceptron}{$\{(x^{(i)},y^{(i)}),i=1,\dots,n\},T$} 244 | \State $\theta=0\;\text{(vector)}$ 245 | \For{$t=1,\dots,T$} 246 | \For{$i=1,\dots,n$} 247 | \If{$y^{(i)}\left(\theta\cdot x^{(i)}\right)\leq0$} 248 | \State $\theta=\theta+y^{(i)}x^{(i)}$ 249 | \EndIf 250 | \EndFor 251 | \EndFor 252 | \State \textbf{return} $\theta$ 253 | \EndProcedure 254 | \end{algorithmic} 255 | } 256 | \end{algorithm} 257 | \vspace*{-3em} 258 | 259 | \item[Perceptron Algorithm (with offset)] ~ 260 | \vspace*{-1em} 261 | \begin{algorithm}[H] 262 | {\fontsize{7}{0} 263 | \begin{algorithmic} 264 | \Procedure{Perceptron}{$\{(x^{(i)},y^{(i)}),i=1,\dots,n\},T$} 265 | \State $\theta=0\;\text{(vector)},\;\theta_0=0\;\text{(scalar)}$ 266 | \For{$t=1,\dots,T$} 267 | \For{$i=1,\dots,n$} 268 | \If{$y^{(i)}\left(\theta\cdot x^{(i)}+\theta_0\right)\leq0$} 269 | \State $\theta=\theta+y^{(i)}x^{(i)}$ 270 | \State $\theta_0=\theta_0+y^{(i)}$ 271 | \EndIf 272 | \EndFor 273 | \EndFor 274 | \State \textbf{return} $\theta,\theta_0$ 275 | \EndProcedure 276 | \end{algorithmic} 277 | } 278 | \end{algorithm} 279 | \vspace*{-3.5em} 280 | \item[Convergence] Assumptions: 281 | \begin{itemize} 282 | \item There exists $\theta^*$ such that $\frac{y^{(i)}\left(\theta^*\cdot x^{(i)}\right)}{\norm{\theta^*}}\geq\gamma$ for all $i=1,\dots,n$ for some $\gamma>0.$ 283 | \item All examples are bounded: $\norm{x^{(i)}}\leq R$, $i=1,\dots,n.$ 284 | \end{itemize} 285 | Then the number $k$ of updates made by the perceptron algorithm is bounded by $\dfrac{R^2}{\gamma^2}.$ 286 | \end{description} 287 | 288 | 289 | \subsection{Hinge Loss, Margin Boundaries and Regularization} 290 | 291 | 
\begin{description} 292 | \item[Distance from a Line to a Point] The perpendicular distance from a line with equation $\theta\cdot x+\theta_0=0$ to a point with coordinates $x_0$ is 293 | $$d=\dfrac{\left|\theta\cdot x_0+\theta_0\right|}{\lVert\theta\rVert}$$ 294 | \begin{center} 295 | \begin{tikzpicture}[scale=1] 296 | \draw [fill, gray] (2,2.5) circle [radius=0.05]; 297 | \draw[ultra thick, red, ->] (1.5,1.5) -- (3, 2); 298 | \draw[thick, blue,->] (1.5, 1.5) -- (2,2.5); 299 | \node[right] at (3,2.1) {$\theta$}; 300 | \node[above right] at (2,2.5) {$x_0$}; 301 | \draw[ultra thick,-] (1,3) -- (1.75,0.75); 302 | \draw[dotted, thick] (2.25, 1.75) -- (2,2.5); 303 | \draw [decorate, decoration={brace,amplitude=3pt,raise=3pt,mirror}] (1.5,1.5) -- (2.25,1.75); 304 | \node[below] at (2,1.4) {$d$}; 305 | \end{tikzpicture} 306 | \end{center} 307 | \item[Decision Boundary] The decision boundary is the set of points $x$ which satisfy $$\theta\cdot x+\theta_0=0.$$ 308 | \item[Margin Boundary] The margin boundary is the set of points $x$ which satisfy $$\theta\cdot x+\theta_0=\pm1.$$ 309 | \begin{center} 310 | \begin{tikzpicture}[scale=0.5] 311 | \draw[thick,dashed] (0,5) -- (1,0); 312 | \draw[ultra thick,-] (1,5.25) -- (2,0.25); 313 | \draw[thick,dashed] (2,5.5) -- (3,0.5); 314 | \node[right] at (3,1) {$\theta\cdot x+\theta_0=-1$}; 315 | \node[above right] at (2.9,1.25) {negative margin boundary}; 316 | \node[above left] at (0.75,0.5) {positive margin boundary}; 317 | \node[left] at (0.8,0.25) {$\theta\cdot x +\theta_0=1$}; 318 | \draw [fill,red] (2.3,4) circle [radius=0.075]; 319 | \draw [fill,red] (2.8,2.5) circle [radius=0.075]; 320 | \draw[ultra thick, ->] (1.5,2.75) -- (0.75,2.6); 321 | \node[below right] at (2,0.3) {decision boundary}; 322 | \node[below right] at (2,-0.2) {$\theta\cdot x+\theta_0=0$}; 323 | \draw [fill, blue] (-0.5,4) circle [radius=0.075]; 324 | \draw [fill, blue] (-0.75,3) circle [radius=0.075]; 325 | \draw[thick, -] (1.25,4) -- (2.25,4.2); 326 | 
\draw[thick] (1.75,4.2) to [out=90,in=180] (3,4.95); 327 | \node[right] at (3,5) {$\gamma_i\left(\theta,\theta_0\right)=\dfrac{y^{(i)}\left(\theta\cdot x^{(i)}+\theta_0\right)}{\norm{\theta}}$}; 328 | \end{tikzpicture} 329 | \end{center} 330 | \item[Hinge Loss] ~ 331 | $$\text{Loss}_h\left(z\right)=\begin{cases} 332 | 0&\text{if }z\geq1\\ 333 | 1-z&\text{if }z<1 334 | \end{cases}$$ 335 | with $z=y^{(i)}\left(\theta\cdot x^{(i)}+\theta_0\right)$. 336 | \item[Regularization] Maximize margin 337 | $$\max\dfrac{1}{\lVert\theta\rVert}\quad\Rightarrow\quad\min\frac{1}{2}\lVert\theta\rVert^2$$ 338 | 339 | \end{description} 340 | 341 | \subsection{Linear Classification and Generalization} 342 | \begin{center} 343 | \begin{tikzpicture}[scale=0.6] 344 | \draw[thick,dashed] (0,5) -- (1,0); 345 | \draw[ultra thick,-] (1,5.25) -- (2,0.25); 346 | \draw[thick,dashed] (2,5.5) -- (3,0.5); 347 | \node[right] at (3,1) {$\theta\cdot x+\theta_0=-1$}; 348 | \node[above right] at (2.9,1.25) {negative margin boundary}; 349 | \node[above left] at (0.75,0.5) {positive margin boundary}; 350 | \node[left] at (0.8,0.25) {$\theta\cdot x +\theta_0=1$}; 351 | \draw [fill,red] (3,4) circle [radius=0.075]; 352 | \draw [fill,red] (3.5,2.5) circle [radius=0.075]; 353 | \draw[ultra thick, ->] (1.5,2.75) -- (0,2.45); 354 | \node[above left] at (1.15,2.65) {$\theta$}; 355 | \node[below right] at (2,0.3) {decision boundary}; 356 | \node[below right] at (2,-0.2) {$\theta\cdot x+\theta_0=0$}; 357 | \draw [fill, blue] (1.15,1.5) circle [radius=0.075]; 358 | \draw [fill, blue] (-0.5,4) circle [radius=0.075]; 359 | \draw [fill, blue] (-0.75,3) circle [radius=0.075]; 360 | \draw [thick] (1.15,1.5) circle [radius=0.15]; 361 | \draw[thick, -] (1.25,4) -- (2.25,4.2); 362 | \draw[thick] (1.75,4.2) to [out=90,in=180] (3,5); 363 | \node[right] at (3,5) {$\dfrac{1}{\norm{\theta}}$}; 364 | \node[right, align=left] at (4,5) {distance from the decision \\ boundary to the margin boundary}; 365 | \end{tikzpicture} 366 | 
\end{center} 367 | 368 | 369 | \begin{description} 370 | \item[Objective function] ~ 371 | $$J\left(\theta,\theta_0\right)=\frac{1}{n}\sum_{i=1}^{n}\left[\text{Loss}_h\left(y^{(i)}\left(\theta\cdot x^{(i)}+\theta_0\right)\right)+\frac{\lambda}{2}\lVert\theta\rVert^2\right]$$ 372 | $\lambda$ is the regularization factor. 373 | \item[Stochastic Gradient Descent] Select $i\in\{1,\dots,n\}$ at random 374 | $$\theta\quad\leftarrow\quad\theta-\eta_t\nabla_\theta\left[\text{Loss}_h\left(y^{(i)}\left(\theta\cdot x^{(i)}+\theta_0\right)\right)+\frac{\lambda}{2}\lVert\theta\rVert^2\right]$$ 375 | $\eta_t$ is the learning rate which can vary at every iteration. 376 | \item[Support Vector Machine] ~ 377 | \begin{itemize} 378 | \item Support Vector Machine finds the maximum margin linear separator by solving the quadratic program that corresponds to $J(\theta,\theta_0)$ 379 | \item In the realizable case, if we disallow any margin violations, the quadratic program we have to solve is: 380 | 381 | \hspace*{1cm}Find $\theta,\theta_0$ that minimize $\frac{1}{2}\lVert\theta\rVert^2$ subject to 382 | $$y^{(i)}\left(\theta\cdot x^{(i)}+\theta_0\right)\geq1,\quad i=1,\dots,n$$ 383 | \end{itemize} 384 | \end{description} 385 | \section{Nonlinear Classification, Linear Regression, \newline Collaborative Filtering} \smallskip \hrule height 1pt \smallskip 386 | 387 | \subsection{Linear Regression} 388 | 389 | \begin{description} 390 | \item[Empirical Risk] ~ 391 | $$R_n\left(\theta\right)=\frac{1}{n}\sum_{i=1}^{n}\frac{1}{2}\left(y^{(i)}-\theta\cdot x^{(i)}\right)^2\qquad\text{squared error}$$ 392 | \item[Gradient-based Approach] We can use stochastic gradient descent to find the minima of the empirical risk.\\ 393 | \vspace{0.25cm} 394 | \hspace*{0.5cm}{\bf Algorithm} \hspace*{0.5cm}Initialize $\theta=0$.\\ 395 | \hspace*{2.25cm}Randomly pick $i\in\{1,\dots,n\}$.\\ 396 | \hspace*{2.25cm}$\theta=\theta+\eta\left(y^{(i)}-\theta\cdot x^{(i)}\right)x^{(i)}$. 397 | 398 | $\eta$ is the learning rate. 
399 | \item[Closed Form Solution] ~ 400 | Let $$A=\frac{1}{n}\sum_{i=1}^{n}x^{(i)}\left(x^{(i)}\right)^\intercal\quad\text{and}\quad B=\frac{1}{n}\sum_{i=1}^{n}y^{(i)}x^{(i)}.$$ Then, 401 | $$\widehat{\theta}=A^{-1}B.$$ In matrix notation, this is 402 | $$\widehat{\theta}=\left(\mathbb{X}^\intercal\mathbb{X}\right)^{-1}\mathbb{X}^\intercal\mathbb{Y}.$$ 403 | \item[Generalization and Regularization] ~ 404 | 405 | {\bf Ridge Regression:} The loss function is $$J_{\lambda,n}=\frac{\lambda}{2}\norm{\theta}^2+R_n\left(\theta\right)$$ 406 | where $\lambda$ is the regularization factor. We can find its minimum using a gradient-based approach.\\ 407 | \vspace{0.25cm} 408 | \hspace*{0.5cm}{\bf Algorithm} \hspace*{0.5cm}Initialize $\theta=0$.\\ 409 | \hspace*{2.25cm}Randomly pick $i\in\{1,\dots,n\}$.\\ 410 | \hspace*{2.25cm}$\theta=\left(1-\eta\lambda\right)\theta+\eta\left(y^{(i)}-\theta\cdot x^{(i)}\right)x^{(i)}$. 411 | \end{description} 412 | 413 | \subsection{Nonlinear Classification} 414 | \begin{description} 415 | \item[Feature Transformation] ~ 416 | \begin{align*} 417 | x&\mapsto\phi(x)\\ 418 | \theta\cdot x&\rightarrow\theta'\cdot\phi(x) 419 | \end{align*} 420 | \item[Non-linear Classification] ~ 421 | $$h\left(x;\theta,\theta_0\right)=\text{sign}\left(\theta\cdot\phi(x)+\theta_0\right)$$ 422 | \item[Kernel Function] A kernel function is simply an inner product between two feature vectors. Using kernels is advantageous when the inner products are faster to evaluate than using explicit vectors (e.g., when the vectors would be infinite dimensional). 
423 | $$K\left(x,x'\right)=\phi(x)\cdot\phi(x')$$ 424 | \item[Perceptron] ~ 425 | \vspace*{-1em} 426 | \begin{algorithm}[H] 427 | {\fontsize{7}{0} 428 | \begin{algorithmic} 429 | \State $\theta=0$ 430 | \For{$i=1,\dots,n$} 431 | \If{$y^{(i)}\theta\cdot\phi\left(x^{(i)}\right)\leq0$} 432 | \State $\theta\leftarrow\theta+y^{(i)}\phi\left(x^{(i)}\right)$ 433 | \EndIf 434 | \EndFor 435 | \end{algorithmic} 436 | } 437 | \end{algorithm} 438 | \vspace*{-3em} 439 | This algorithm gives 440 | $$\theta=\sum_{j=1}^{n}\alpha_jy^{(j)}\phi\left(x^{(j)}\right)$$ 441 | where $\alpha_j$ is the number of mistakes made on the $j$-th example. For the offset parameter, we get 442 | $$\theta_0=\sum_{j=1}^{n}\alpha_jy^{(j)}.$$ 443 | \item[Kernel Perceptron Algorithm] We can reformulate the perceptron algorithm so that we initialize and update $\alpha_j$'s, instead of $\theta$. 444 | $$\theta\cdot\phi\left(x^{(i)}\right)=\sum_{j=1}^{n}\alpha_jy^{(j)}\underbrace{\phi\left(x^{(j)}\right)\cdot\phi\left(x^{(i)}\right)}_{K\left(x^{(j)},x^{(i)}\right)}$$ 445 | \vspace*{-2em} 446 | \begin{algorithm}[H] 447 | {\fontsize{7}{0} 448 | \begin{algorithmic} 449 | \Procedure{Kernel Perceptron}{$\{(x^{(i)},y^{(i)}),i=1,\dots,n\},T$} 450 | \State \textbf{Initialize} $\alpha_1,\dots,\alpha_n \text{ to some values}$ 451 | \For{$t=1,\dots,T$} 452 | \For{$i=1,\dots,n$} 453 | \If{$y^{(i)}\sum\limits_{j=1}^{n}\alpha_jy^{(j)}K\left(x^{(j)},x^{(i)}\right)\leq0$} 454 | \State $\alpha_i=\alpha_i+1$ 455 | \EndIf 456 | \EndFor 457 | \EndFor 458 | \EndProcedure 459 | \end{algorithmic} 460 | } 461 | \end{algorithm} 462 | \vspace*{-2em} 463 | The initialization $\theta=0$ is equivalent to $\alpha_1=\dots=\alpha_n=0$. 464 | \item[Composition rules:] ~ 465 | \begin{enumerate} 466 | \item $K(x,x')=1$ is a kernel function. 467 | \item Let $f:\mathbb{R}^d\rightarrow\mathbb{R}$ and let $K(x,x')$ be a kernel. 
Then so is $\widetilde{K}(x,x')=f(x)K(x,x')f(x')$ 468 | \item If $K_1(x,x')$ and $K_2(x,x')$ are kernels, then $K(x,x')=K_1(x,x')+K_2(x,x')$ is a kernel. 469 | \item If $K_1(x,x')$ and $K_2(x,x')$ are kernels, then $K(x,x')=K_1(x,x')K_2(x,x')$ is a kernel. 470 | \end{enumerate} 471 | \item[Decision Boundary] The decision boundary satisfies 472 | $$\sum_{j=1}^{n}\alpha_jy^{(j)}K\left(x^{(j)},x\right)=0.$$ 473 | \item[Radial Basis Kernel] ~ 474 | $$K(x,x')=\exp(-\frac{1}{2}\norm{x-x'}^2)$$ 475 | \begin{center} 476 | \includegraphics[width=0.6\columnwidth]{rbf.pdf} 477 | \end{center} 478 | \item[Other non-linear classifiers] ~ 479 | \begin{itemize} 480 | \item We can get non-linear classifiers or regression methods by simply mapping examples into feature vectors non-linearly, and applying a linear method on the resulting vectors. 481 | \item These feature vectors can be high dimensional. 482 | \item We can turn the linear methods into kernel methods by casting the computations in terms of inner products. 483 | \end{itemize} 484 | \end{description} 485 | 486 | \subsection{Recommender Systems} 487 | 488 | \begin{description} 489 | \item[Problem Description] We are given a matrix where each row corresponds to a user's rating of movies, for example, and each column corresponds to the user ratings for a particular movie. It can also be product ratings, etc. This matrix will be very sparse. The goal is to predict user ratings for those movies that are yet to be rated. 
490 | \[ 491 | \begin{blockarray}{cccc} 492 | & & m\text{ movies} \\ 493 | \begin{block}{c(ccc)} 494 | & & & \\ 495 | & & & \\ 496 | n \text{ users} & & Y_{ai} & \\ 497 | & & & \\ 498 | & & & \\ 499 | \end{block} 500 | \end{blockarray} 501 | \] 502 | \vspace*{-2em} 503 | \item[$K$-Nearest Neighbor Method] The $K$-Nearest Neighbor method makes use of ratings by $K$ other {\it similar} users when predicting $Y_{ai}.$ Let $\text{KNN}(a)$ be the set of $K$ users {\it similar to} user $a$, and let $\text{sim}(a,b)$ be a {\bf similarity measure} between users $a$ and $b\in\text{KNN}(a).$ The KNN method predicts a rating $Y_{ai}$ to be 504 | $$\widehat{Y}_{ai}=\dfrac{\sum\limits_{b\in\text{KNN}(a)}\text{sim}(a,b)Y_{bi}}{\sum\limits_{b\in\text{KNN}(a)}\text{sim}(a,b)}$$ 505 | The similarity measure $\text{sim}(a,b)$ could be any distance function between the feature vectors $x_a$ and $x_b$. 506 | \begin{itemize} 507 | \item Euclidean distance: $\norm{x_a-x_b}$ 508 | \item Cosine similarity: $\cos\theta=\dfrac{x_a\cdot x_b}{\norm{x_a}\norm{x_b}}$ 509 | \end{itemize} 510 | \item[Collaborative Filtering] Our goal is to come up with a matrix $X$ that has no blank entries and whose $(a,i)$\textsuperscript{th} entry $X_{ai}$ is the prediction of user $a$'s rating to movie $i$. 511 | 512 | Let $D$ be the set of all $(a,i)$'s for which a user rating $Y_{ai}$ exists. A naive approach is to minimize the objective function 513 | $$J(X)=\sum_{(a,i)\in D}\frac{1}{2}\left(Y_{ai}-X_{ai}\right)^2+\frac{\lambda}{2}\sum_{(a,i)}X_{ai}^2.$$ 514 | The results are 515 | \begin{align*} 516 | \widehat{X}_{ai}&=\dfrac{Y_{ai}}{1+\lambda}\quad\text{for }(a,i)\in D\\ 517 | \widehat{X}_{ai}&=0\qquad\quad\;\,\text{for }(a,i)\notin D. 518 | \end{align*} 519 | The problem with this approach is that there is no connection between the entries of $X$. 
We can impose an additional constraint on $X$: $$X=UV^\intercal$$ 520 | for some $n\times k$ matrix $U$ and $k\times m$ matrix $V^\intercal$, where $k$ is the {\it rank} of the matrix $X$. 521 | \item[Alternating Minimization] Assume that $U$ and $V$ are rank $k$ matrices. Then, we can write the objective function as 522 | $$J(X)=\sum_{(a,i)\in D}\frac{1}{2}\left(Y_{ai}-\left[UV^\intercal\right]_{ai}\right)^2+\frac{\lambda}{2}\left(\sum_{a,j}U_{aj}^2+\sum_{i,j}V_{ij}^2\right).$$ 523 | To find the solution, we fix (initialize) $U$ (or $V$) and minimize the objective with respect to $V$ (or $U$). We plug the result back into the objective and minimize it with respect to $U$ (or $V$). We repeat this alternating process until there is no change in the objective function. 524 | \item[Example] Consider the case $k=1$. Then, $U_{a1}=u_a$ and $V_{i1}=v_i$. If we initialize $u_a$ to some values, then we have to optimize the function 525 | $$\sum_{(a,i)\in D}\frac{1}{2}\left(Y_{ai}-u_av_i\right)^2+\frac{\lambda}{2}\sum_{i}v_{i}^2.$$ 526 | \end{description} 527 | 528 | \section{Neural Networks}\smallskip \hrule height 1pt \smallskip 529 | 530 | \subsection{Introduction to Feedforward Neural Networks} 531 | 532 | \begin{description} 533 | \item[A Unit in a Neural Network] A {\bf neural network unit} is a primitive neural network that consists of only the {\it input layer}, and an output layer with only one output. 
534 | \begin{center} 535 | \begin{tikzpicture}[scale=0.75] 536 | \draw[thick,blue] (0,0) circle [radius=0.25]; 537 | \draw[thick,blue] (0,2) circle [radius=0.25]; 538 | \draw[thick,blue] (0,3) circle [radius=0.25]; 539 | \node[left] at (0.2,1.1) {$\vdots$}; 540 | \node[left] at (-0.5,3) {$x_1$}; 541 | \node[left] at (-0.5,2) {$x_2$}; 542 | \node[left] at (-0.5,0) {$x_d$}; 543 | \draw[thick,blue] (3,1.5) circle [radius=0.25];% 544 | \draw[thick,->] (0.25,2.9) -- (2.7,1.6); 545 | \draw[thick,->] (0.25,2) --(2.7,1.5); 546 | \draw[thick,->] (0.25,0) -- (2.7,1.4); 547 | \draw[thick,->] (3.25,1.5) -- (5,1.5); 548 | \node[right] at (1,2.6) {$w_1$}; 549 | \node[right] at (1,1.4) {$w_2$}; 550 | \node[right] at (1,0.25) {$w_d$}; 551 | \node[right] at (3.75,1.8) {$f(z)$}; 552 | \end{tikzpicture} 553 | \end{center} 554 | A neural network unit computes a non-linear weighted combination of its input: $$\widehat{y}=f(z)\qquad\text{where}\quad z=w_0+\sum_{i=1}^{d}x_iw_i$$ 555 | where $w_i$ are the {\bf weights}, $z$ is a number and is the weighted sum of the inputs $x_i$, and $f$ is generally a non-linear function called the {\bf activation function}. 556 | \item[Linear Function] $f(z)=z$ 557 | \item[Rectified Linear Unit (ReLU)] $f(z)=\max\{0,z\}$ 558 | \item[Hyperbolic Tangent Function] $\tanh(z)=\dfrac{e^z-e^{-z}}{e^z+e^{-z}}=1-\dfrac{2}{e^{2z}+1}$ 559 | \item[Deep Neural Networks] A {\bf deep (feedforward) neural network} refers to a neural network that contains not only the input and output layers, but also hidden layers in between. 
Below is a deep feedforward neural network of 2 hidden layers, with each hidden layer consisting of 5 units: 560 | \begin{center} 561 | \begin{tikzpicture}[scale=0.5] 562 | \draw[thick,blue] (0,0) circle [radius=0.25]; 563 | \draw[thick,blue] (0,2) circle [radius=0.25]; 564 | \draw[thick,blue] (0,4) circle [radius=0.25]; 565 | \node[left] at (-0.5,4) {$x_1$}; 566 | \node[left] at (-0.5,2) {$x_2$}; 567 | \node[left] at (-0.5,0) {$x_3$}; 568 | \draw[thick] (3,6) circle [radius=0.25]; 569 | \draw[thick] (3,4) circle [radius=0.25]; 570 | \draw[thick] (3,2) circle [radius=0.25]; 571 | \draw[thick] (3,0) circle [radius=0.25]; 572 | \draw[thick] (3,-2) circle [radius=0.25]; 573 | \draw[thick] (6,6) circle [radius=0.25]; 574 | \draw[thick] (6,4) circle [radius=0.25]; 575 | \draw[thick] (6,2) circle [radius=0.25]; 576 | \draw[thick] (6,0) circle [radius=0.25]; 577 | \draw[thick] (6,-2) circle [radius=0.25]; 578 | \draw[thick,orange] (9,2) circle [radius=0.25]; 579 | \node[above] at (10,2) {$f$}; 580 | \node[left, rotate=90, blue] at (0,-3) {Input Layer}; 581 | \node[left, rotate=90] at (3,-3) {Hidden Layer}; 582 | \node[left, rotate=90] at (6,-3) {Hidden Layer}; 583 | \node[left, rotate=90, orange] at (9,-3) {Output Layer}; 584 | \draw[thick,->] (9.25,2) -- (11,2); 585 | \draw[thick,->] (0.25,4) -- (2.7,6); 586 | \draw[thick,->] (0.25,4) -- (2.7,4); 587 | \draw[thick,->] (0.25,4) -- (2.7,2); 588 | \draw[thick,->] (0.25,4) -- (2.7,0); 589 | \draw[thick,->] (0.25,4) -- (2.7,-2); 590 | \draw[thick,->] (0.25,2) --(2.7,6); 591 | \draw[thick,->] (0.25,2) --(2.7,4); 592 | \draw[thick,->] (0.25,2) --(2.7,2); 593 | \draw[thick,->] (0.25,2) --(2.7,0); 594 | \draw[thick,->] (0.25,2) --(2.7,-2); 595 | \draw[thick,->] (0.25,0) -- (2.7,6); 596 | \draw[thick,->] (0.25,0) -- (2.7,4); 597 | \draw[thick,->] (0.25,0) -- (2.7,2); 598 | \draw[thick,->] (0.25,0) -- (2.7,0); 599 | \draw[thick,->] (0.25,0) -- (2.7,-2); 600 | \draw[thick,->] (3.25,-2) -- (5.7,6); 601 | \draw[thick,->] (3.25,-2) -- 
(5.7,4); 602 | \draw[thick,->] (3.25,-2) -- (5.7,2); 603 | \draw[thick,->] (3.25,-2) -- (5.7,0); 604 | \draw[thick,->] (3.25,-2) -- (5.7,-2); 605 | \draw[thick,->] (3.25,0) -- (5.7,6); 606 | \draw[thick,->] (3.25,0) -- (5.7,4); 607 | \draw[thick,->] (3.25,0) -- (5.7,2); 608 | \draw[thick,->] (3.25,0) -- (5.7,0); 609 | \draw[thick,->] (3.25,0) -- (5.7,-2); 610 | \draw[thick,->] (3.25,2) -- (5.7,6); 611 | \draw[thick,->] (3.25,2) -- (5.7,4); 612 | \draw[thick,->] (3.25,2) -- (5.7,2); 613 | \draw[thick,->] (3.25,2) -- (5.7,0); 614 | \draw[thick,->] (3.25,2) -- (5.7,-2); 615 | \draw[thick,->] (3.25,4) -- (5.7,6); 616 | \draw[thick,->] (3.25,4) -- (5.7,4); 617 | \draw[thick,->] (3.25,4) -- (5.7,2); 618 | \draw[thick,->] (3.25,4) -- (5.7,0); 619 | \draw[thick,->] (3.25,4) -- (5.7,-2); 620 | \draw[thick,->] (3.25,6) -- (5.7,6); 621 | \draw[thick,->] (3.25,6) -- (5.7,4); 622 | \draw[thick,->] (3.25,6) -- (5.7,2); 623 | \draw[thick,->] (3.25,6) -- (5.7,0); 624 | \draw[thick,->] (3.25,6) -- (5.7,-2); 625 | \draw[thick,->] (6.25,6) -- (8.7,2); 626 | \draw[thick,->] (6.25,4) -- (8.7,2); 627 | \draw[thick,->] (6.25,2) -- (8.7,2); 628 | \draw[thick,->] (6.25,0) -- (8.7,2); 629 | \draw[thick,->] (6.25,-2) -- (8.7,2); 630 | \end{tikzpicture} 631 | \end{center} 632 | \item[One Hidden Layer Model] ~ 633 | \begin{center} 634 | \begin{tikzpicture}[scale=0.75] 635 | \draw[thick,blue] (0,0) circle [radius=0.25]; 636 | \draw[thick,blue] (0,2) circle [radius=0.25]; 637 | \node[left] at (-0.2,2) {$x_1$}; 638 | \node[left] at (-0.2,0) {$x_2$}; 639 | \draw[thick,blue] (3,2) circle [radius=0.25]; 640 | \draw[thick,blue] (3,0) circle [radius=0.25]; 641 | \draw[thick,blue] (6,1) circle [radius=0.25]; 642 | \draw[thick,->] (6.25,1) -- (7.5,1); 643 | \draw[thick,->] (0.25,2) --(2.7,2); 644 | \draw[thick,->] (0.22,1.9) --(2.7,0); 645 | \draw[thick,->] (0.25,0) -- (2.7,0); 646 | \draw[thick,->] (0.23,0.1) -- (2.7,2); 647 | \node[right] at (0.7,2.2) {$W_{11}$}; 648 | \node[right] at (0.7,1.6) 
{$W_{12}$}; 649 | \node[right] at (0.7,0.35) {$W_{21}$}; 650 | \node[right] at (0.7,-0.22) {$W_{22}$}; 651 | \draw[thick,->] (3.25,2) -- (5.7,1.1); 652 | \draw[thick,->] (3.25,0) -- (5.7,0.9); 653 | \node[above] at (6.75,1) {$f$}; 654 | \node[above] at (5.7,1.25) {$z$}; 655 | \node[above] at (2.5,2) {$z_1$}; 656 | \node[above] at (3.5,2) {$f_1$}; 657 | \node[below] at (2.5,0) {$z_2$}; 658 | \node[below] at (3.5,0) {$f_2$}; 659 | \node[above,red] at (0,3) {Layer 0}; 660 | \node[above,red] at (3,3) {Layer 1}; 661 | \node[below,red] at (3,3.2) {($\tanh$)}; 662 | \node[above,red] at (6,3) {Layer 2}; 663 | \node[below,red] at (6,3.2) {(linear)}; 664 | \end{tikzpicture} 665 | \end{center} 666 | \begin{alignat*}{3} 667 | z_1&=\sum_{j=1}^{2}x_jW_{j1}+W_{01}\qquad &&z_2&&=\sum_{j=1}^{2}x_jW_{j2}+W_{02}\\ 668 | f_1&=f(z_1)=\tanh(z_1)\qquad &&f_2&&=f(z_2)=\tanh(z_2)\\ 669 | z&=f_1w'_1+f_2w'_2 &&f&&=f(z)=z 670 | \end{alignat*} 671 | \item[Neural Signal Transformation] We can visualize what the hidden layer is doing similarly to a linear classifier. 672 | \begin{center} 673 | \begin{tikzpicture}[scale=0.75] 674 | \node[right] at (2,0) {$x_1$}; 675 | \node[above] at (0,2) {$x_2$}; 676 | \draw[thick,->] (-2,0) -- (2,0); 677 | \draw[thick,->] (0,-2) --(0,2); 678 | \draw[thick,red,->] (0,-1) --(-0.5,-0.5); 679 | \draw[thick,red] (-1,-2) --(2,1); 680 | \node[left] at (-0.5,-0.5) {$\vec{W}_1$}; 681 | \draw[thick,blue] (2,-1) --(-2,1); 682 | \draw[thick,blue,->] (0,0) --(0.4,0.8); 683 | \node[right] at (0.4,0.8) {$\vec{W}_2$}; 684 | \end{tikzpicture} 685 | \end{center} 686 | In the figure, $$\vec{W}_1=\begin{pmatrix} 687 | W_{11}\\ 688 | W_{21} 689 | \end{pmatrix}\qquad\text{and}\quad\vec{W}_2=\begin{pmatrix} 690 | W_{12}\\ 691 | W_{22} 692 | \end{pmatrix}.$$ 693 | They map the input onto the $f_1$-$f_2$ axes. 
694 | \end{description} 695 | 696 | \begin{description} 697 | \item[Hidden Layer Representation] ~ 698 | \begin{itemize}[noitemsep, topsep=0pt] 699 | \item Hidden Layer Units 700 | \begin{center} 701 | \includegraphics[width=0.6\columnwidth]{data.pdf} 702 | \end{center} 703 | \item Linear Activation 704 | \begin{center} 705 | \includegraphics[width=0.6\columnwidth]{linear.pdf} 706 | \end{center} 707 | \item $\tanh$ Activation 708 | \begin{center} 709 | \includegraphics[width=0.6\columnwidth]{tanh.pdf} 710 | \end{center} 711 | \item ReLU Activation 712 | \begin{center} 713 | \includegraphics[width=0.6\columnwidth]{relu.pdf} 714 | \end{center} 715 | \end{itemize} 716 | \item[Summary] ~ 717 | \begin{itemize} 718 | \item Units in neural networks are linear classifiers, just with different output non-linearity. 719 | \item The units in feedforward neural networks are arranged in layers. 720 | \item By learning the parameters associated with the hidden layer units, we learn how to represent examples (as hidden layer activations). 721 | \item The representations in neural networks are learned directly to facilitate the end-to-end task. 722 | \item A simple classifier (output unit) suffices to solve complex classification tasks if it operates on the hidden layer representations. 723 | \end{itemize} 724 | \end{description} 725 | \subsection{Feedforward Neural Networks, Back Propagation, and Stochastic Gradient Descent (SGD)} 726 | 727 | \begin{description} 728 | \item[Simple Example] This simple neural network is made up of $L$ hidden layers, but each layer consists of only one unit, and each unit has activation function $f$. 
729 | \begin{center} 730 | \begin{tikzpicture}[scale=0.8] 731 | \node[left] at (-0.5,0) {$x$}; 732 | \draw[thick] (-0.25,0) circle [radius=0.25]; 733 | \draw[thick,->] (0,0) -- (1.25,0); 734 | \node[above] at (0.45,0) {$w_1$}; 735 | \draw[thick] (1.5,0) circle [radius=0.25]; 736 | \node[above left] at (1.25,0) {$z_1$}; 737 | \node[above right] at (1.75,0) {$f_1$}; 738 | \draw[thick,->] (1.75,0) --(3,0); 739 | \draw[thick,->] (4,0) -- (5.25,0); 740 | \node[above] at (4.5,0) {$w_L$}; 741 | \node[right] at (3.25,0) {$\dots$}; 742 | \draw[thick] (5.5,0) circle [radius=0.25]; 743 | \node[above left] at (5.25,0) {$z_L$}; 744 | \node[above right] at (5.75,0) {$f_L$}; 745 | \draw[thick,->] (5.75,0) -- (6.5,0); 746 | \draw[red,thick] (7.25,0) circle [radius=0.25]; 747 | \node at (7.25,0) {$y$}; 748 | \node[right] at (2.5, -0.75) {$z_1=xw_1$}; 749 | \node[right] at (1.8,-2.45) {$\mathcal{L}(y,f_L)=\text{Loss}\left(y,f_L\right)=\frac{1}{2}\left(y-f_L\right)^2$}; 750 | \node[right] at (2.5,-1.1) {$f_1=\tanh(xw_1)$}; 751 | \node[right] at (2.95,-1.45) {$\vdots$}; 752 | \node[right] at (2.5,-2) {$f_L=\tanh(f_{L-1}w_L)$}; 753 | \end{tikzpicture} 754 | \end{center} 755 | For $i=2,\dots,L$: $z_i=f_{i-1}w_i$ where $f_{i-1}=f(z_{i-1})$. Also, $y$ is the true value and $f_L$ is the output of the neural network. 756 | \item[Gradient Descent] The gradient descent update rule for the parameter $w_i$ is 757 | $$w_i\leftarrow w_i-\eta\cdot\grad_{w_i}{\mathcal{L}(y,f_L)}$$ 758 | where $\eta$ is the learning rate. For instance, we have 759 | \begin{align*} 760 | \dfrac{\partial\mathcal{L}}{\partial w_1}&=\dfrac{\partial f_1}{\partial w_1}\dfrac{\partial\mathcal{L}}{\partial f_1}\\ 761 | \dfrac{\partial f_1}{\partial w_1}&=\left[1-\tanh^2(xw_1)\right]x=\left(1-f_1^2\right)x\\ 762 | \dfrac{\partial\mathcal{L}}{\partial f_1}&=\dfrac{\partial\mathcal{L}}{\partial f_2}\dfrac{\partial f_2}{\partial f_1}=\dfrac{\partial\mathcal{L}}{\partial f_2}\left(1-f_2^2\right)w_2. 
763 | \end{align*} 764 | Thus, when we back-propagate, we get 765 | $$\dfrac{\partial\mathcal{L}}{\partial w_1}=x\left(1-f_1^2\right)\cdots\left(1-f_L^2\right)w_2\cdots w_L\left(f_L-y\right).$$ 766 | Note that the above derivation applies to $\tanh$ activation. 767 | \item[Backpropagation] Consider the $L$-layer neural network below. 768 | \begin{center} 769 | \begin{tikzpicture}[scale=0.5] 770 | \draw[thick,blue] (0,0) circle [radius=0.25]; 771 | \draw[thick,blue] (0,2) circle [radius=0.25]; 772 | \draw[thick,blue] (0,4) circle [radius=0.25]; 773 | \node[left] at (-0.5,4) {$x_1$}; 774 | \node[left] at (-0.5,2) {$x_2$}; 775 | \node[left] at (-0.5,0) {$x_3$}; 776 | \draw[thick] (3,6) circle [radius=0.25]; 777 | \draw[thick] (3,4) circle [radius=0.25]; 778 | \draw[thick] (3,2) circle [radius=0.25]; 779 | \draw[thick] (3,0) circle [radius=0.25]; 780 | \draw[thick] (3,-2) circle [radius=0.25]; 781 | \draw[thick] (6,6) circle [radius=0.25]; 782 | \draw[thick] (6,4) circle [radius=0.25]; 783 | \draw[thick] (6,2) circle [radius=0.25]; 784 | \draw[thick] (6,0) circle [radius=0.25]; 785 | \draw[thick] (6,-2) circle [radius=0.25]; 786 | \draw[thick,orange] (9,4) circle [radius=0.25]; 787 | \draw[thick,orange] (9,0) circle [radius=0.25]; 788 | \draw[thick,->] (0.25,4) -- (2.7,6); 789 | \draw[thick,->] (0.25,4) -- (2.7,4); 790 | \draw[thick,->] (0.25,4) -- (2.7,2); 791 | \draw[thick,->] (0.25,4) -- (2.7,0); 792 | \draw[thick,->] (0.25,4) -- (2.7,-2); 793 | \draw[thick,->] (0.25,2) --(2.7,6); 794 | \draw[thick,->] (0.25,2) --(2.7,4); 795 | \draw[thick,->] (0.25,2) --(2.7,2); 796 | \draw[thick,->] (0.25,2) --(2.7,0); 797 | \draw[thick,->] (0.25,2) --(2.7,-2); 798 | \draw[thick,->] (0.25,0) -- (2.7,6); 799 | \draw[thick,->] (0.25,0) -- (2.7,4); 800 | \draw[thick,->] (0.25,0) -- (2.7,2); 801 | \draw[thick,->] (0.25,0) -- (2.7,0); 802 | \draw[thick,->] (0.25,0) -- (2.7,-2); 803 | \draw[thick,->] (3.25,-2) -- (5.7,6); 804 | \draw[thick,->] (3.25,-2) -- (5.7,4); 805 | 
\draw[thick,->] (3.25,-2) -- (5.7,2); 806 | \draw[thick,->] (3.25,-2) -- (5.7,0); 807 | \draw[thick,->] (3.25,-2) -- (5.7,-2); 808 | \draw[thick,->] (3.25,0) -- (5.7,6); 809 | \draw[thick,->] (3.25,0) -- (5.7,4); 810 | \draw[thick,->] (3.25,0) -- (5.7,2); 811 | \draw[thick,->] (3.25,0) -- (5.7,0); 812 | \draw[thick,->] (3.25,0) -- (5.7,-2); 813 | \draw[thick,->] (3.25,2) -- (5.7,6); 814 | \draw[thick,->] (3.25,2) -- (5.7,4); 815 | \draw[thick,->] (3.25,2) -- (5.7,2); 816 | \draw[thick,->] (3.25,2) -- (5.7,0); 817 | \draw[thick,->] (3.25,2) -- (5.7,-2); 818 | \draw[thick,->] (3.25,4) -- (5.7,6); 819 | \draw[thick,->] (3.25,4) -- (5.7,4); 820 | \draw[thick,->] (3.25,4) -- (5.7,2); 821 | \draw[thick,->] (3.25,4) -- (5.7,0); 822 | \draw[thick,->] (3.25,4) -- (5.7,-2); 823 | \draw[thick,->] (3.25,6) -- (5.7,6); 824 | \draw[thick,->] (3.25,6) -- (5.7,4); 825 | \draw[thick,->] (3.25,6) -- (5.7,2); 826 | \draw[thick,->] (3.25,6) -- (5.7,0); 827 | \draw[thick,->] (3.25,6) -- (5.7,-2); 828 | \draw[thick,->] (6.25,6) -- (8.7,4); 829 | \draw[thick,->] (6.25,4) -- (8.7,4); 830 | \draw[thick,->] (6.25,2) -- (8.7,4); 831 | \draw[thick,->] (6.25,0) -- (8.7,4); 832 | \draw[thick,->] (6.25,-2) -- (8.7,4); 833 | \draw[thick,->] (6.25,6) -- (8.7,0); 834 | \draw[thick,->] (6.25,4) -- (8.7,0); 835 | \draw[thick,->] (6.25,2) -- (8.7,0); 836 | \draw[thick,->] (6.25,0) -- (8.7,0); 837 | \draw[thick,->] (6.25,-2) -- (8.7,0); 838 | \node[above] at (0,6.25) {Layer 0}; 839 | \node[above] at (3,6.25) {Layer 1}; 840 | \node[above] at (6,6.25) {Layer 2}; 841 | \node[above] at (9,6.25) {Layer 3}; 842 | \node[below,red] at (8,-2) {$w_{25}^3$}; 843 | \draw[thick,red,->] (8,-2) -- (7.5,-1); 844 | \node[below,red] at (5,-2.5) {$b_{5}^2$}; 845 | \draw[thick,red,->] (5.3,-3) -- (5.9,-2.25); 846 | \node[below,red] at (9.2,3) {$a_{1}^3$}; 847 | \draw[thick,red,->] (9.2,3) -- (9,3.75); 848 | \end{tikzpicture} 849 | \end{center} 850 | We have the following notations: 851 | \begin{itemize} 852 | \item 
$b_j^{\ell}$ is the bias of the $j$\textsuperscript{th} neuron in the $\ell$\textsuperscript{th} layer. 853 | \item $a_j^{\ell}$ is the activation of the $j$\textsuperscript{th} neuron in the $\ell$\textsuperscript{th} layer. 854 | \item $w_{jk}^\ell$ is the weight for the connection from the $k$\textsuperscript{th} neuron in the $(\ell-1)$\textsuperscript{th} layer to the $j$\textsuperscript{th} neuron in the $\ell$\textsuperscript{th} layer. 855 | \end{itemize} 856 | If the activation function is $f$ and the loss function we are minimizing is $C$, then the equations describing the network are: 857 | \begin{align*} 858 | a_j^\ell&=f\left(\sum_{k}w_{jk}^\ell a_k^{\ell-1}+b_j^\ell\right)\\ 859 | \text{Loss}&=C\left(a^L\right) 860 | \end{align*} 861 | Let the weighted inputs to the $d$ neurons in layer $\ell$ be defined as 862 | $$z^\ell\equiv w^\ell a^{\ell-1}+b^\ell,\quad\text{where }z^\ell\in\mathbb{R}^d.$$ 863 | Then, the activation of layer $\ell$ is also written as $a^\ell\equiv f(z^\ell)$. Also, let $\delta_j^\ell\equiv\frac{\partial C}{\partial z_j^\ell}$ denote the {\it error} of neuron $j$ in layer $\ell$. Then, $\delta^\ell\in\mathbb{R}^d$ denotes the full vector of errors associated with layer $\ell$. 864 | \item[Equations of Backpropagation] ~ 865 | \begin{align*} 866 | \delta^L&=\grad_a{C}\odot f'\left(z^L\right)\\ 867 | \delta^\ell&=\left[\left(w^{\ell+1}\right)^\intercal\delta^{\ell+1}\right]\odot f'\left(z^\ell\right)\\ 868 | \dfrac{\partial C}{\partial b_j^\ell}&=\delta_j^\ell\\ 869 | \dfrac{\partial C}{\partial w_{jk}^\ell}&=a_k^{\ell-1}\delta_j^\ell 870 | \end{align*} 871 | The symbol $\odot$ represents the Hadamard product. 872 | \begin{equation*} 873 | \begin{pmatrix} 874 | a & b\\ 875 | c & d 876 | \end{pmatrix} \odot 877 | \begin{pmatrix} 878 | e & f\\ 879 | g & h 880 | \end{pmatrix}= 881 | \begin{pmatrix} 882 | ae & bf \\ 883 | cg & dh 884 | \end{pmatrix}. 
885 | \end{equation*} 886 | \end{description} 887 | \subsection{Recurrent Neural Networks} 888 | 889 | \begin{description} 890 | \item[Temporal/Sequence Problems] ~ 891 | \begin{itemize} 892 | \item Sequence prediction problems can be recast in a form amenable to feedforward neural networks. 893 | \item We have to engineer how {\it history} is mapped to a vector (representation). This vector is then fed into, e.g., a neural network. 894 | \item We would like to learn how to encode the {\it history} into a vector. 895 | \end{itemize} 896 | \item[Key Concepts] ~ 897 | \begin{itemize} 898 | \item {\bf Encoding} -- e.g., mapping a sequence to a vector 899 | \item {\bf Decoding} -- e.g., mapping a vector to, e.g., a sequence 900 | \end{itemize} 901 | \item[Example: Encoding Sentences] ~ 902 | \begin{itemize}[topsep=0pt] 903 | \item Introduce adjustable {\it lego pieces} and optimize them for end-to-end performance. 904 | \begin{center} 905 | \begin{tikzpicture}[scale=0.75] 906 | \draw[fill=black] (0,0) rectangle (0.25,0.25); 907 | \draw[] (0,0.25) rectangle (0.25,0.5); 908 | \draw[fill=black] (0,0.5) rectangle (0.25,0.75); 909 | \draw[] (0,0.75) rectangle (0.25,1); 910 | \draw[] (0,1) rectangle (0.25,1.25); 911 | \draw[] (0,1.25) rectangle (0.25,1.5); 912 | \draw[fill=gray] (0,1.5) rectangle (0.25,1.75); 913 | \draw[] (0,1.75) rectangle (0.25,2); 914 | \draw[thick,->] (0.5,1) -- (1.5,1); 915 | \draw[rounded corners, thick] (1.75,0.75) rectangle (2.25,1.25); 916 | \node[right] at (1.8,1) {$\theta$}; 917 | \node[left, align=center] at (-0.2,1) {\textbf{context} \\ \textbf{or state}}; 918 | \draw[thick,->] (2.5,1) -- (3.5,1); 919 | \draw[fill=gray] (3.75,0) rectangle (4,0.25); 920 | \draw[] (3.75,0.25) rectangle (4,0.5); 921 | \draw[fill=black] (3.75,0.5) rectangle (4,0.75); 922 | \draw[] (3.75,0.75) rectangle (4,1); 923 | \draw[fill=black] (3.75,1) rectangle (4,1.25); 924 | \draw[fill=lightgray] (3.75,1.25) rectangle (4,1.5); 925 | \draw[fill=gray] (3.75,1.5) rectangle 
(4,1.75); 926 | \draw[] (3.75,1.75) rectangle (4,2); 927 | \draw[thick,->] (2,-0.5) -- (2,0.5); 928 | \draw[fill=lightgray] (1,-1) rectangle (1.25,-0.75); 929 | \draw[] (1.25,-1) rectangle (1.5,-0.75); 930 | \draw[fill=gray] (1.5,-1) rectangle (1.75,-0.75); 931 | \draw[] (1.75,-1) rectangle (2,-0.75); 932 | \node[below] at (2,-1.1) {\textbf{new information}}; 933 | \node[below,blue] at (2,-1.5) {$x_t$}; 934 | \draw[] (2,-1) rectangle (2.25,-0.75); 935 | \draw[fill=gray] (2.25,-1) rectangle (2.5,-0.75); 936 | \draw[] (2.5,-1) rectangle (2.75,-0.75); 937 | \draw[fill=black] (2.75,-1) rectangle (3,-0.75); 938 | \node[right, align=center] at (4.25,1) {\textbf{new context} \\ \textbf{or state}}; 939 | \node[below,blue] at (5.25,0.6) {$s_t$}; 940 | \node[below,blue] at (-0.75,0.6) {$s_{t-1}$}; 941 | \end{tikzpicture} 942 | $$s_t=\tanh(W^{s,s}s_{t-1} + W^{s,x}x_t)$$ 943 | \end{center} 944 | \item Let's say we want to encode the incomplete sentence {\bf Efforts and courage are not}. First, we have to represent the first word as a vector (say, a one-hot vector). This will be $x_1$. Then, with the initial state $s_0=0$, 945 | $$s_1=\tanh\left(W^{s,x}x_1\right).$$ 946 | The second word will be $x_2$, and we compute $s_2$: 947 | $$s_2=\tanh(W^{s,s}s_1+W^{s,x}x_2).$$ 948 | We continue this process until we've encoded all the words in the sentence. 
We can visualize this as follows: 949 | \begin{center} 950 | \begin{tikzpicture}[scale=0.6] 951 | \draw[orange, thick, dashed] (4.5,1) ellipse (3.75 and 0.75); 952 | \node[above, orange] at (6,1.75) {lego piece (encoder)}; 953 | \draw[] (0,0) rectangle (0.25,0.25); 954 | \draw[] (0,0.25) rectangle (0.25,0.5); 955 | \draw[] (0,0.5) rectangle (0.25,0.75); 956 | \draw[] (0,0.75) rectangle (0.25,1); 957 | \draw[] (0,1) rectangle (0.25,1.25); 958 | \draw[] (0,1.25) rectangle (0.25,1.5); 959 | \draw[] (0,1.5) rectangle (0.25,1.75); 960 | \draw[] (0,1.75) rectangle (0.25,2); 961 | \draw[thick,->] (0.5,1) -- (1,1); 962 | \draw[thick,->] (2,1) -- (2.5,1); 963 | \draw[thick,->] (3.5,1) -- (4,1); 964 | \draw[thick,->] (5,1) -- (5.5,1); 965 | \draw[thick,->] (6.5,1) -- (7,1); 966 | \draw[thick,->] (8,1) -- (8.5,1); 967 | \draw[rounded corners, thick] (1.25,0.75) rectangle (1.75,1.25); 968 | \draw[rounded corners, thick] (2.75,0.75) rectangle (3.25,1.25); 969 | \draw[rounded corners, thick] (4.25,0.75) rectangle (4.75,1.25); 970 | \draw[rounded corners, thick] (5.75,0.75) rectangle (6.25,1.25); 971 | \draw[rounded corners, thick] (7.25,0.75) rectangle (7.75,1.25); 972 | \node[left, align=center,blue] at (-0.2,1) {$s_0$}; 973 | \node[above, blue] at (2.25,1.2) {$s_1$}; 974 | \node[above, blue] at (3.75,1.2) {$s_2$}; 975 | \node[above, blue] at (5.25,1.2) {$s_3$}; 976 | \node[above, blue] at (6.75,1.2) {$s_4$}; 977 | \node[above, blue] at (8.25,1.2) {$s_5$}; 978 | \node[right, red] at (1.5,0) {$x_1$}; 979 | \node[right, red] at (3,0) {$x_2$}; 980 | \node[right, red] at (4.5,0) {$x_3$}; 981 | \node[right, red] at (6,0) {$x_4$}; 982 | \node[right, red] at (7.5,0) {$x_5$}; 983 | \draw[fill=gray] (8.75,0) rectangle (9,0.25); 984 | \draw[] (8.75,0.25) rectangle (9,0.5); 985 | \draw[fill=black] (8.75,0.5) rectangle (9,0.75); 986 | \draw[] (8.75,0.75) rectangle (9,1); 987 | \draw[fill=black] (8.75,1) rectangle (9,1.25); 988 | \draw[fill=lightgray] (8.75,1.25) rectangle (9,1.5); 989 | 
\draw[fill=gray] (8.75,1.5) rectangle (9,1.75); 990 | \draw[] (8.75,1.75) rectangle (9,2); 991 | \draw[thick,->] (1.5,-0.5) -- (1.5,0.5); 992 | \draw[thick,->] (3,-0.5) -- (3,0.5); 993 | \draw[thick,->] (4.5,-0.5) -- (4.5,0.5); 994 | \draw[thick,->] (6,-0.5) -- (6,0.5); 995 | \draw[thick,->] (7.5,-0.5) -- (7.5,0.5); 996 | \draw[] (1,-1) rectangle (1.25,-0.75); 997 | \draw[] (1.25,-1) rectangle (1.5,-0.75); 998 | \draw[fill=blue] (1.5,-1) rectangle (1.75,-0.75); 999 | \draw[] (1.75,-1) rectangle (2,-0.75); 1000 | \node[below] at (1.5,-1.1) {\textbf{Efforts}}; 1001 | \node[below] at (3,-1.1) {\textbf{and}}; 1002 | \node[below] at (4.5,-1.2) {\textbf{courage}}; 1003 | \node[below] at (6,-1.2) {\textbf{are}}; 1004 | \node[below] at (7.5,-1.15) {\textbf{not}}; 1005 | \node[below] at (8.5,-1.3) {\textbf{\dots}}; 1006 | \draw[] (2.5,-1) rectangle (2.75,-0.75); 1007 | \draw[] (2.75,-1) rectangle (3,-0.75); 1008 | \draw[] (3,-1) rectangle (3.25,-0.75); 1009 | \draw[fill=blue] (3.25,-1) rectangle (3.5,-0.75); 1010 | \draw[fill=blue] (4,-1) rectangle (4.25,-0.75); 1011 | \draw[] (4.25,-1) rectangle (4.5,-0.75); 1012 | \draw[] (4.75,-1) rectangle (5,-0.75); 1013 | \draw[] (4.5,-1) rectangle (4.75,-0.75); 1014 | \draw[] (5.5,-1) rectangle (5.75,-0.75); 1015 | \draw[fill=blue] (5.75,-1) rectangle (6,-0.75); 1016 | \draw[] (6,-1) rectangle (6.25,-0.75); 1017 | \draw[] (6.25,-1) rectangle (6.5,-0.75); 1018 | \draw[] (7,-1) rectangle (7.25,-0.75); 1019 | \draw[fill=blue] (7.25,-1) rectangle (7.5,-0.75); 1020 | \draw[] (7.5,-1) rectangle (7.75,-0.75); 1021 | \draw[fill=blue] (7.75,-1) rectangle (8,-0.75); 1022 | \node[right, align=center] at (9.25,1) {\textbf{sentence} \\ \textbf{as a vector}}; 1023 | \end{tikzpicture} 1024 | \end{center} 1025 | \end{itemize} 1026 | \item[Differences from standard feedforward architecture] ~ 1027 | \begin{itemize} 1028 | \item Input is received at each layer (per word), not just at the beginning as in a typical feedforward network. 
1029 | \item The number of layers varies and depends on the length of the sentence. 1030 | \item Parameters of each layer (representing an application of an RNN) are shared (same RNN at each step). 1031 | \end{itemize} 1032 | \item[Basic RNN] ~ 1033 | $$s_t=\tanh(W^{s,s}s_{t-1} + W^{s,x}x_t)$$ 1034 | \item[Simple Gated RNN] ~ 1035 | \begin{align*} 1036 | g_t&=\text{sigmoid}\left(W^{g,s}s_{t-1}+W^{g,x}x_t\right)\\ 1037 | s_t&=\left(1-g_t\right)\odot s_{t-1}+g_t\odot\tanh(W^{s,s}s_{t-1}+W^{s,x}x_t) 1038 | \end{align*} 1039 | \item[Long Short-Term Memory (LSTM)] ~ 1040 | \begin{alignat*}{2} 1041 | f_t&=\text{sigmoid}\left(W^{f,h}h_{t-1}+W^{f,x}x_t\right)\quad&&\textcolor{red}{\text{forget gate}}\\ 1042 | i_t&=\text{sigmoid}\left(W^{i,h}h_{t-1}+W^{i,x}x_t\right) &&\textcolor{red}{\text{input gate}}\\ 1043 | o_t&=\text{sigmoid}\left(W^{o,h}h_{t-1}+W^{o,x}x_t\right) &&\textcolor{red}{\text{output gate}}\\ 1044 | c_t&=f_t\odot c_{t-1}+i_t\odot\tanh(W^{c,h}h_{t-1}+W^{c,x}x_t) \quad&&\textcolor{red}{\text{memory cell}}\\ 1045 | h_t&=o_t\odot\tanh(c_t)&&\textcolor{red}{\text{visible state}} 1046 | \end{alignat*} 1047 | \item[Markov Language Models] Let $V$ denote the set of possible words/symbols $w$, which includes 1048 | \begin{itemize} 1049 | \item an $\mathbf{UNK}$ symbol for any unknown word (out of vocabulary) 1050 | \item $\mathbf{\langle beg\rangle}$ symbol for specifying the start of a sentence 1051 | \item $\mathbf{\langle end\rangle}$ symbol for specifying the end of the sentence 1052 | \end{itemize} 1053 | \item[First-order Markov Model] In a first-order Markov model ({\bf bigram model}), the next symbol only depends on the previous one. Each symbol (except $\langle$beg$\rangle$) in the sequence is predicted using the same conditional probability table until an $\langle$end$\rangle$ symbol is seen. 
The probability associated with a sentence $w_1,\dots,w_n$ (with $w_0=\langle$beg$\rangle$) is 1054 | $$\prod_{i=1}^{n}\mathbb{P}\left(w_i|w_{i-1}\right).$$ 1055 | \item[Maximum Likelihood Estimation] The goal is to maximize the probability that the model can generate all the observed sentences (corpus $S$) $$s\in S, s=\left\{w_1^s,w_2^s,\dots,w_{|s|}^s\right\}$$ 1056 | $$\ell=\log\left\{\prod_{s\in S}\left[\prod_{i=1}^{|s|}\mathbb{P}\left(w_i^s|w_{i-1}^s\right)\right]\right\}$$ 1057 | The maximum likelihood estimate is obtained as normalized counts of successive word occurrences (matching statistics) 1058 | $$\widehat{\mathbb{P}}\left(w'|w\right)=\dfrac{\text{count}\left(w,w'\right)}{\sum\limits_{\widetilde{w}}\text{count}\left(w,\widetilde{w}\right)}$$ 1059 | \item[Feature-based Markov Model] We can also represent the Markov model as a feedforward neural network (very extendable). We define a one-hot vector, $\phi\left(w_{i-1}\right)$, corresponding to the previous word. This will be an input to the feedforward neural network. 1060 | \begin{center} 1061 | \begin{tikzpicture}[scale=0.5] 1062 | \draw[thick] (0,4) circle [radius=0.25]; 1063 | \draw[thick,fill=black] (0,3) circle [radius=0.25]; 1064 | \draw[thick] (0,2) circle [radius=0.25]; 1065 | \draw[thick] (0,0) circle [radius=0.25]; 1066 | \node[left] at (-1,2) {$\phi\left(w_{i-1}\right)$}; 1067 | \node[left] at (-1.8,1.25) {$x$}; 1068 | \node[] at (0,1.25) {$\vdots$}; 1069 | \node[] at (4,1.25) {$\vdots$}; 1070 | \node[] at (2,2) {$W$}; 1071 | \draw[rounded corners, thick] (1.5,1.5) rectangle (2.5,2.5); 1072 | \draw[thick] (4,4) circle [radius=0.25]; 1073 | \draw[thick] (4,3) circle [radius=0.25]; 1074 | \draw[thick] (4,2) circle [radius=0.25]; 1075 | \draw[thick] (4,0) circle [radius=0.25]; 1076 | \draw[thick] (4.5,0) -- (4.75,0); 1077 | \draw[thick] (4.5,2) -- (4.75,2); 1078 | \draw[thick] (4.5,3) -- (4.75,3); 1079 | \draw[thick] (4.5,4) -- (4.75,4); 1080 | \node[right] at (5,4) {$p_1$}; 1081 | \node[right] at (5,3) {$p_2$}; 1082 | \node[right] at (5,2) 
{$p_3$}; 1083 | \node[right] at (5,0) {$p_k$}; 1084 | \end{tikzpicture} 1085 | \end{center} 1086 | In the figure, 1087 | $$p_k=\mathbb{P}\left(w_i=k|w_{i-1}\right)$$ 1088 | is the probability of the next word, given the previous word. The aggregate input to the $k$\textsuperscript{th} output unit is 1089 | $$z_k=\sum_{j}x_jW_{jk}+W_{0k}.$$ 1090 | These input values are not probabilities. A typical transformation is the {\bf softmax transformation}: 1091 | $$p_k=\dfrac{e^{z_k}}{\sum\limits_{j}e^{z_j}}.$$ 1092 | \item[RNNs for Sequences] Our RNN now also produces an output (e.g., a word) as well as updating its state 1093 | \begin{center} 1094 | \begin{tikzpicture}[scale=0.75] 1095 | \draw[fill=black] (0,0) rectangle (0.25,0.25); 1096 | \draw[] (0,0.25) rectangle (0.25,0.5); 1097 | \draw[fill=black] (0,0.5) rectangle (0.25,0.75); 1098 | \draw[] (0,0.75) rectangle (0.25,1); 1099 | \draw[] (0,1) rectangle (0.25,1.25); 1100 | \draw[] (0,1.25) rectangle (0.25,1.5); 1101 | \draw[fill=gray] (0,1.5) rectangle (0.25,1.75); 1102 | \draw[] (0,1.75) rectangle (0.25,2); 1103 | \draw[thick,->] (0.5,1) -- (1.5,1); 1104 | \draw[rounded corners, thick] (1.75,0.75) rectangle (2.25,1.25); 1105 | \node[right] at (1.8,1) {$\theta$}; 1106 | \node[left, align=center] at (-0.2,1) {\textbf{previous} \\ \textbf{state}}; 1107 | \draw[thick,->] (2.5,1) -- (3.5,1); 1108 | \draw[fill=gray] (3.75,0) rectangle (4,0.25); 1109 | \draw[] (3.75,0.25) rectangle (4,0.5); 1110 | \draw[fill=black] (3.75,0.5) rectangle (4,0.75); 1111 | \draw[] (3.75,0.75) rectangle (4,1); 1112 | \draw[fill=black] (3.75,1) rectangle (4,1.25); 1113 | \draw[fill=lightgray] (3.75,1.25) rectangle (4,1.5); 1114 | \draw[fill=gray] (3.75,1.5) rectangle (4,1.75); 1115 | \draw[] (3.75,1.75) rectangle (4,2); 1116 | \draw[thick,->] (2,-0.5) -- (2,0.5); 1117 | \draw[fill=lightgray] (1,-1) rectangle (1.25,-0.75); 1118 | \draw[] (1.25,-1) rectangle (1.5,-0.75); 1119 | \draw[fill=gray] (1.5,-1) rectangle (1.75,-0.75); 1120 | \draw[] 
(1.75,-1) rectangle (2,-0.75); 1121 | \node[right, align=center] at (0.7,-1.5) {\textbf{previous output} \\ \textbf{as an input} $x$}; 1122 | \draw[] (2,-1) rectangle (2.25,-0.75); 1123 | \draw[fill=gray] (2.25,-1) rectangle (2.5,-0.75); 1124 | \draw[] (2.5,-1) rectangle (2.75,-0.75); 1125 | \draw[fill=black] (2.75,-1) rectangle (3,-0.75); 1126 | \node[right, align=center] at (4.25,1) {\textbf{new} \\ \textbf{state}}; 1127 | \draw[thick,->] (2,1.5) -- (2,2.5); 1128 | \node[above] at (3.25,2.5) {$[0.1,0.3,\dots,0.2]$ \textbf{output distribution}}; 1129 | \end{tikzpicture} 1130 | \end{center} 1131 | \begin{alignat*}{2} 1132 | s_t&=\tanh(W^{s,s}s_{t-1}+W^{s,x}x_t)\quad&&\textcolor{red}{\text{state}}\\ 1133 | p_t&=\text{softmax}\left(W^os_t\right)&&\textcolor{red}{\text{output distribution}} 1134 | \end{alignat*} 1135 | \item[Decoding] ~ 1136 | \begin{center} 1137 | \begin{tikzpicture}[scale=0.75] 1138 | \draw[fill=gray] (0,0) rectangle (0.25,0.25); 1139 | \draw[fill=lightgray] (0,0.25) rectangle (0.25,0.5); 1140 | \draw[fill=black] (0,0.5) rectangle (0.25,0.75); 1141 | \draw[] (0,0.75) rectangle (0.25,1); 1142 | \draw[fill=lightgray] (0,1) rectangle (0.25,1.25); 1143 | \draw[] (0,1.25) rectangle (0.25,1.5); 1144 | \draw[fill=gray] (0,1.5) rectangle (0.25,1.75); 1145 | \draw[fill=black] (0,1.75) rectangle (0.25,2); 1146 | \draw[thick,->] (0.5,1) -- (1,1); 1147 | \draw[thick,->] (2,1) -- (2.5,1); 1148 | \draw[thick,->] (3.5,1) -- (4,1); 1149 | \draw[thick,->] (5,1) -- (5.5,1); 1150 | \draw[thick,->] (6.5,1) -- (7,1); 1151 | \draw[rounded corners, thick] (1.25,0.75) rectangle (1.75,1.25); 1152 | \draw[rounded corners, thick] (2.75,0.75) rectangle (3.25,1.25); 1153 | \draw[rounded corners, thick] (4.25,0.75) rectangle (4.75,1.25); 1154 | \draw[rounded corners, thick] (5.75,0.75) rectangle (6.25,1.25); 1155 | \draw[rounded corners, thick] (7.25,0.75) rectangle (7.75,1.25); 1156 | \node[below] at (1.5,0) {\textbf{$\langle$null$\rangle$}}; 1157 | \node[above] at (1.5,2.55) 
{\textbf{Olen}}; 1158 | \node[above] at (3,2.5) {\textbf{n\"ahnyt}}; 1159 | \node[above] at (4.5,2.53) {\textbf{parempia}}; 1160 | \node[above] at (6,2.5) {\textbf{luentoja}}; 1161 | \draw[->] (2,2.6) arc(70:45:0.5) -- (2.25,0.75) arc(180:360:0.3); 1162 | \draw[->] (3.6,2.6) arc(70:45:0.25) -- (3.75,0.75) arc(180:360:0.3); 1163 | \draw[->] (5.1,2.6) arc(70:45:0.25) -- (5.25,0.75) arc(180:360:0.3); 1164 | \draw[->] (6.6,2.6) arc(70:45:0.25) -- (6.75,0.75) arc(180:360:0.3); 1165 | \node[above, orange] at (-0.25,2.48) {\textbf{sampled word}}; 1166 | \node[above, orange] at (-1,2) {\textbf{\tiny distribution over the possible words}}; 1167 | \node[above] at (7.5,2.5) {$\langle$\textbf{end}$\rangle$}; 1168 | \node[above] at (1.5,2) {$p_1$}; 1169 | \node[above] at (3,2) {$p_2$}; 1170 | \node[above] at (4.5,2) {$p_3$}; 1171 | \node[above] at (6,2) {$p_4$}; 1172 | \node[above] at (7.5,2) {$p_5$}; 1173 | \draw[thick,->] (1.5,0) -- (1.5,0.5); 1174 | \draw[thick,->] (1.5,1.5) -- (1.5,2); 1175 | \draw[thick,->] (3,1.5) -- (3,2); 1176 | \draw[thick,->] (4.5,1.5) -- (4.5,2); 1177 | \draw[thick,->] (6,1.5) -- (6,2); 1178 | \draw[thick,->] (7.5,1.5) -- (7.5,2); 1179 | \node[right, align=center] at (-3,1.3) {\textbf{vector encoding} \\ \textbf{of a sentence}}; 1180 | \node[right, align=center] at (-2.75,0.4) {\textit{I have seen} \\ \textit{better lectures.}}; 1181 | \end{tikzpicture} 1182 | \end{center} 1183 | \end{description} 1184 | 1185 | \subsection{Convolutional Neural Networks} 1186 | 1187 | \begin{description} 1188 | \item[Problem] Image classification 1189 | \begin{itemize} 1190 | \item The presence of objects may vary in location across different images. 
1191 | \end{itemize} 1192 | \item[Patch classifier/filter] ~ 1193 | \begin{center} 1194 | \begin{tikzpicture} 1195 | \node[inner sep=0pt] at (0,1) 1196 | {\includegraphics[width=0.25\columnwidth]{mushroom.jpeg}}; 1197 | \node[inner sep=0pt] at (4,1) 1198 | {\includegraphics[width=0.35in]{filter.png}}; 1199 | \draw[red] (-0.1,1.3) -- (2.97,1.47); 1200 | \draw[red] (-0.1,1.1) -- (2.97,0.53); 1201 | \node[inner sep=0pt] at (2.5,1) {\includegraphics[width=0.1\columnwidth]{mushroom2.jpeg}}; 1202 | \draw[red] (-0.3,1.1) rectangle (-0.1,1.3); 1203 | \draw[red] (2.03,0.53) rectangle (2.97,1.47); 1204 | \draw[red] (-0.3,1.1) -- (2.03,0.53); 1205 | \draw[red] (-0.3,1.3) -- (2.03,1.47); 1206 | \node[] at (2.5,0.25) {input}; 1207 | \node[] at (4,0.25) {weights}; 1208 | \draw[step=0.094,black,thin] (3.54,0.53) grid (4.48,1.47); 1209 | \end{tikzpicture} 1210 | \end{center} 1211 | The patch classifier goes through the entire image. We can think of the weights as the image that the unit prefers to see. 1212 | \item[Convolution] The convolution is an operation between two functions $f$ and $g$: 1213 | $$\left(f*g\right)(t)\equiv\int_{-\infty}^{+\infty}f(\tau)g(t-\tau)d\tau.$$ 1214 | Intuitively, convolution {\it blends} the two functions $f$ and $g$ by expressing the amount of overlap of one function as it is shifted over another function. 1215 | \item[Discrete Convolution] For discrete functions, we can define the convolution as 1216 | $$\left(f*g\right)[n]\equiv\sum_{m=-\infty}^{m=+\infty}f[m]g[n-m].$$ 1217 | \item[Pooling] We wish to know whether a feature was there but not exactly where it was. 1218 | \item[Pooling (Max)] Pooling region and {\it stride} may vary. 1219 | \begin{itemize} 1220 | \item Pooling induces translation invariance at the cost of spatial resolution. 1221 | \item Stride reduces the size of the resulting feature map. 
1222 | \end{itemize} 1223 | \begin{center} 1224 | \begin{tikzpicture} 1225 | \node[inner sep=0pt] at (0,1) 1226 | {\includegraphics[width=1.25in]{orig.png}}; 1227 | \node[inner sep=0pt] at (3.25,1) 1228 | {\includegraphics[width=1.22in]{fmap.png}}; 1229 | \node[inner sep=0pt] at (6,1) {\includegraphics[width=0.75in]{maxpool.png}}; 1230 | \node[below] at (0,-0.25) {original}; 1231 | \node[below] at (3.25,-0.25) {feature map}; 1232 | \node[below, align=center] at (6,0) {feature map \\ after max pooling}; 1233 | \end{tikzpicture} 1234 | \end{center} 1235 | \item[Example of CNN] From LeCun (2013) 1236 | \begin{center} 1237 | \includegraphics[width=\columnwidth]{cnn.png} 1238 | \end{center} 1239 | \end{description} 1240 | 1241 | 1242 | \section{Unsupervised Learning}\smallskip \hrule height 1pt \smallskip 1243 | 1244 | \subsection{Clustering} 1245 | 1246 | \begin{description} 1247 | \item[Training set] We are provided a training set but with no labels 1248 | $$S_n=\left\{\left.x^{(i)}\right| i=1,\dots,n\right\}$$ 1249 | and the goal is to find structure in the data. 1250 | \item[Example: Google News] ~ 1251 | \begin{center} 1252 | \includegraphics*[width=0.75\columnwidth]{gnews.png} 1253 | \end{center} 1254 | \item[Example: Image Quantization] ~ 1255 | \begin{center} 1256 | \begin{tikzpicture} 1257 | \node[inner sep=0pt] at (0,1.6) 1258 | {\includegraphics[width=1.5in]{imq1.jpg}}; 1259 | \node[inner sep=0pt] at (4,1.6) 1260 | {\includegraphics[width=1.5in]{imq2.png}}; 1261 | \node[] at (0,0) {original}; 1262 | \node[] at (4,0) {compressed}; 1263 | \end{tikzpicture} 1264 | \end{center} 1265 | \item[Partition] A partition of a set is a grouping of the set's elements into non-empty subsets, in such a way that {\bf every} element is included in one and only one of the subsets. 
In other words, $C_1,\dots,C_K$ is a partition of $\{1,\dots,n\}$ if and only if $$C_1\cup\dots\cup C_K=\{1,\dots,n\}\qquad\text{and}$$ 1266 | $$C_i\cap C_j=\emptyset\quad\text{for any}\quad i\neq j\quad\text{in}\quad\{1,\dots,K\}.$$ 1267 | \item[Clustering: Input] ~ 1268 | \begin{itemize}[] 1269 | \item Set of feature vectors $S_n=\left\{\left.x^{(i)}\right|i=1,\dots,n\right\}$ 1270 | \item The number of clusters $K$ 1271 | \end{itemize} 1272 | \item[Clustering: Output] ~ 1273 | \begin{itemize}[] 1274 | \item A partition of indices $\left\{1,\dots,n\right\}$ into $K$ sets, $C_1,\dots,C_K$ 1275 | \item {\it Representatives} in each of the $K$ partition sets, given as $z_1,\dots,z_K$ 1276 | \end{itemize} 1277 | \item[Cost] We can calculate the total cost by summing the cost of each cluster: 1278 | $$\text{Cost}\left(C_1,\dots,C_K\right)=\sum_{j=1}^{K}\text{Cost}\left(C_j\right)$$ 1279 | \item[Similarity Measure] We use the Euclidean distance between the elements of a cluster and its representative to calculate the cost for each cluster. Then, the total cost is 1280 | $$\text{Cost}\left(C_1,\dots,C_K,z_1,\dots,z_K\right)=\sum_{j=1}^{K}\sum_{i\in C_j}\norm{x^{(i)}-z_j}^2.$$ 1281 | \item[$K$-Means Algorithm] ~ 1282 | \begin{enumerate} 1283 | \item Randomly select $z_1,\dots,z_K$. 
1284 | \item Iterate: 1285 | \begin{enumerate} 1286 | \item Given $z_1,\dots,z_K$, assign each data point $x^{(i)}$ to the closest $z_j$ so that 1287 | $$\text{Cost}\left(z_1,\dots,z_K\right)=\sum_{i=1}^{n}\min\limits_{j=1,\dots,K}\norm{x^{(i)}-z_j}^2.$$ 1288 | \item Given $C_1,\dots,C_K$, find the best representatives $z_1,\dots,z_K$, i.e., find $z_1,\dots,z_K$ such that 1289 | $$z_j=\argmin\limits_{z}\sum_{i\in C_j}\norm{x^{(i)}-z}^2=\dfrac{1}{\left|C_j\right|}\sum\limits_{i\in C_j}x^{(i)}.$$ 1290 | \end{enumerate} 1291 | \end{enumerate} 1292 | \item[$K$-Medoids Algorithm] The $K$-Medoids algorithm is a variation of the $K$-Means algorithm that addresses some of the $K$-Means algorithm's limitations. 1293 | \begin{enumerate} 1294 | \item Randomly select $\left\{z_1,\dots,z_K\right\}\subseteq\left\{x^{(1)},\dots,x^{(n)}\right\}$. 1295 | \item Iterate: 1296 | \begin{enumerate} 1297 | \item Given $z_1,\dots,z_K$, assign each $x^{(i)}$ to the closest $z_j$ so that 1298 | $$\text{Cost}\left(z_1,\dots,z_K\right)=\sum_{i=1}^{n}\min\limits_{j=1,\dots,K}\text{dist}\left(x^{(i)},z_j\right).$$ 1299 | \item Given $C_j\in\left\{C_1,\dots,C_K\right\}$, find the best representative $z_j\in\left\{x^{(1)},\dots,x^{(n)}\right\}$ such that 1300 | $$\sum\limits_{i\in C_j}\text{dist}\left(x^{(i)},z_j\right)$$ 1301 | is minimal. 1302 | \end{enumerate} 1303 | \end{enumerate} 1304 | \end{description} 1305 | 1306 | \subsection{Generative Models} 1307 | \begin{description} 1308 | \item[Generative vs. Discriminative Models] {\it Generative models} work by explicitly modeling the probability distribution of each of the individual classes in the training data. {\it Discriminative models} learn an explicit decision boundary between classes. 1309 | \item[Simple Multinomial Generative Model] Consider a multinomial model $M$ to generate text documents. Assume that $M$ has a fixed vocabulary $W$ and we generate a document by sampling one word at a time from this vocabulary.
Furthermore, all the words that are generated by $M$ are independent of each other. We denote the probability that $M$ generates a certain word $w\in W$ by 1310 | $$\mathbb{P}\left(w|\theta\right)=\theta_w,\quad\theta_w\geq0,\quad\sum_{w\in W}\theta_w=1.$$ 1311 | Then, the probability of generating the document $D$ is 1312 | $$\mathbb{P}\left(D|\theta\right)=\prod_{i=1}^{n}\theta_{w_i}=\prod_{w\in W}\theta_w^{\,\text{count}(w)}.$$ 1313 | \item[Maximum Likelihood Estimate] The log-likelihood for the model is 1314 | $$\ell=\log\mathbb{P}(D|\theta)=\sum_{w\in W}\text{count}(w)\log\theta_w$$ 1315 | and the maximum likelihood estimate is 1316 | $$\widehat{\theta}_w=\dfrac{\text{count}(w)}{\sum\limits_{w'\in W}\text{count}(w')}.$$ 1317 | \item[Prediction] Consider using a multinomial generative model $M$ for the task of binary classification consisting of two classes: $+$ (positive class) and $-$ (negative class). 1318 | \begin{itemize} 1319 | \item $\theta^+$: parameter for the positive class 1320 | \item $\theta^-$: parameter for the negative class 1321 | \end{itemize} 1322 | Suppose that we classify a new document $D$ to belong to the positive class if and only if 1323 | $$\log\dfrac{\mathbb{P}(D|\theta^+)}{\mathbb{P}(D|\theta^-)}\geq0.$$ 1324 | The generative classifier is equivalent to a linear classifier: 1325 | $$\log\dfrac{\mathbb{P}(D|\theta^+)}{\mathbb{P}(D|\theta^-)}=\sum_{w\in W}\text{count}(w)\log\dfrac{\theta_w^+}{\theta_w^-}=\sum_{w\in W}\text{count}(w)\,\theta'_w.$$ 1326 | \item[Prior, Posterior and Likelihood] In the above discussion, we implicitly assumed that both classes are equally likely a priori. However, we may have some prior knowledge and we want to incorporate it into our model.
The posterior distribution for the positive class is then 1327 | $$\mathbb{P}\left(y=+|D\right)=\dfrac{\mathbb{P}(D|\theta^+)\,\mathbb{P}(y=+)}{\mathbb{P}(D)}.$$ 1328 | The generative classifier becomes 1329 | $$\log\dfrac{\mathbb{P}\left(y=+|D\right)}{\mathbb{P}\left(y=-|D\right)}=\sum_{w\in W}\text{count}(w)\,\theta'_w+\theta'_0$$ 1330 | where $\theta'_w=\log\dfrac{\theta^+_w}{\theta^-_w}$ and $\theta'_0=\log\dfrac{\mathbb{P}(y=+)}{\mathbb{P}(y=-)}$. 1331 | \item[Gaussian Generative Models] The likelihood of $\mathbf{x}\in\mathbb{R}^d$ being generated by a spherical Gaussian with mean $\mu$ and variance $\sigma^2$ is 1332 | $$f_X\left(\mathbf{x}|\mu,\sigma^2\right)=\dfrac{1}{\left(2\pi\sigma^2\right)^{d/2}}\exp(-\dfrac{1}{2\sigma^2}\norm{\mathbf{x}-\mu}^2).$$ 1333 | \item[MLE for the Mean] ~ 1334 | $$\widehat{\mu}=\frac{1}{n}\sum_{i=1}^{n}\mathbf{x}^{(i)}$$ 1335 | \item[MLE for the Variance] ~ 1336 | $$\widehat{\sigma}^2=\frac{1}{nd}\sum_{i=1}^{n}\norm{\mathbf{x}^{(i)}-\widehat{\mu}}^2$$ 1337 | \end{description} 1338 | 1339 | \subsection{Mixture Models; EM Algorithm} 1340 | \begin{description} 1341 | \item[Gaussian Mixture Models] Instead of just a single Gaussian, we have a mixture of Gaussian components. Assume that there are $K$ Gaussians with known means and variances. Assume also that the mixture weights $p_1,\dots,p_K$ are known. The likelihood for an observation $\mathbf{x}$ obtained from the model is 1342 | $$p(\mathbf{x}|\theta)=\sum_{j=1}^{K}p_j\mathcal{N}\left(\mathbf{x};\mu^{(j)},\sigma_j^2\mathbf{I}\right).$$ 1343 | For the training set 1344 | $$S_n=\left\{\mathbf{x}^{(i)}, i=1,\dots,n\right\},$$ 1345 | the likelihood is 1346 | $$\mathbb{P}\left(S_n|\theta\right)=\prod_{i=1}^{n}\sum_{j=1}^{K}p_j\mathcal{N}\left(\mathbf{x}^{(i)};\mu^{(j)},\sigma_j^2\mathbf{I}\right).$$ 1347 | \item[Observed Case] Consider the case of hard clustering, i.e., a point either belongs to a cluster or not.
Let 1348 | $$\delta(j|i)= 1349 | \begin{cases} 1350 | 1,\quad&\mathbf{x}^{(i)}\text{ is assigned to }j\\ 1351 | 0,&\text{otherwise}. 1352 | \end{cases}$$ 1353 | Also, let $\widehat{n}_j=\sum\limits_{i=1}^{n}\delta(j|i)$ denote the number of points belonging to cluster $j$. Maximizing the likelihood gives 1354 | \begin{align*} 1355 | \widehat{p}_j&=\dfrac{\widehat{n}_j}{n}\\ 1356 | \widehat{\mu}^{(j)}&=\dfrac{1}{\widehat{n}_j}\sum_{i=1}^{n}\delta(j|i)\,\mathbf{x}^{(i)}\\ 1357 | \widehat{\sigma}^2_j&=\dfrac{1}{\widehat{n}_jd}\sum_{i=1}^{n}\delta(j|i)\,\norm{\mathbf{x}^{(i)}-\widehat{\mu}^{(j)}}^2. 1358 | \end{align*} 1359 | \item[The EM Algorithm] Instead of hard clustering, the data can actually be generated from different clusters with different probabilities. We have soft clustering. We can maximize the likelihood through the EM algorithm. 1360 | \begin{enumerate} 1361 | \item[] Randomly initialize $\theta$: $\mu^{(1)},\dots,\mu^{(K)},\sigma^2_1,\dots,\sigma^2_K,p_1,\dots,p_K.$ 1362 | \item {\bf E-step:} 1363 | $$p(j|i)=\dfrac{p_j\mathcal{N}\left(\mathbf{x}^{(i)};\mu^{(j)},\sigma^2_j\mathbf{I}\right)}{p\left(\mathbf{x}^{(i)}|\theta\right)}$$ 1364 | where $p(\mathbf{x}^{(i)}|\theta)=\sum\limits_{j=1}^{K}p_j\mathcal{N}\left(\mathbf{x}^{(i)};\mu^{(j)},\sigma^2_j\mathbf{I}\right)$. 1365 | \item {\bf M-step:} 1366 | \begin{align*} 1367 | \widehat{n}_j&=\sum_{i=1}^{n}p(j|i)\\ 1368 | \widehat{p}_j&=\dfrac{\widehat{n}_j}{n}\\ 1369 | \widehat{\mu}^{(j)}&=\dfrac{1}{\widehat{n}_j}\sum_{i=1}^{n}p(j|i)\,\mathbf{x}^{(i)}\\ 1370 | \widehat{\sigma}^2_j&=\dfrac{1}{\widehat{n}_jd}\sum_{i=1}^{n}p(j|i)\norm{\mathbf{x}^{(i)}-\widehat{\mu}^{(j)}}^2. 1371 | \end{align*} 1372 | \end{enumerate} 1373 | \end{description} 1374 | 1375 | \section{Reinforcement Learning}\smallskip \hrule height 1pt \smallskip 1376 | 1377 | \begin{description} 1378 | \item[Objectives of RL] The goal of RL is to learn a good policy with no or limited supervision.
1379 | \end{description} 1380 | 1381 | \subsection{Markov Decision Processes} 1382 | 1383 | \begin{description} 1384 | \item[Definition] A {\bf Markov decision process (MDP)} is defined by 1385 | \begin{itemize} 1386 | \item a set of {\bf states} $s\in S$ (may be observed or unobserved); 1387 | \item a set of {\bf actions} $a\in A$; 1388 | \item action-dependent {\bf transition probabilities} $T(s,a,s')=\mathbb{P}(s'|s,a)$ so that, for each state $s$ and action $a$, 1389 | $$\sum_{s'\in S}T(s,a,s')=1;$$ 1390 | \item {\bf reward functions} $R(s,a,s')$, representing the reward for starting in state $s$, taking action $a$ and ending up in state $s'$ after one step. (The reward function may also depend only on $s$, or only on $s$ and $a$.) 1391 | \end{itemize} 1392 | \item[Property] MDPs satisfy the {\bf Markov property} in that the transition probabilities and rewards depend only on the current state and action, and remain unchanged regardless of the history (i.e., past states and actions) that leads to the current state. 1393 | \item[Utility Function] The main problem for MDPs is to optimize the agent's behavior. We first need to specify the criterion that we are trying to maximize in terms of accumulated rewards. We define a {\bf utility function} and maximize its expectation. 1394 | \begin{itemize} 1395 | \item {\bf Finite horizon based utility:} The utility function is the sum of rewards after acting for a fixed number $n$ of steps. When the rewards depend only on the states, the utility function is 1396 | $$U[s_0,s_1,\dots,s_n]=\sum_{i=0}^{n}R(s_i).$$ 1397 | \item {\bf (Infinite horizon) discounted reward based utility:} In this setting, the reward one step into the future is discounted by a factor $\gamma$, the reward two steps ahead by $\gamma^2$, and so on. The goal is to continue acting (without an end) while maximizing the expected discounted reward. The discounting allows us to focus on near term rewards, and control this focus by changing $\gamma$.
If the rewards depend only on the states, the utility function is 1398 | $$U[s_0,s_1,\dots]=\sum_{k=0}^{\infty}\gamma^kR(s_k).$$ 1399 | \end{itemize} 1400 | \item[Optimal Policy] A {\bf policy} is a function $\pi:S\rightarrow A$ that assigns an action $\pi(s)$ to any state $s$. Given an MDP and a utility function $U[s_0,s_1,\dots,s_n]$, our goal is to find an {\bf optimal policy} function that maximizes the expectation of the utility. We denote the optimal policy by $\pi^*$. 1401 | \end{description} 1402 | 1403 | \subsection{Bellman Equations} 1404 | 1405 | \begin{description} 1406 | \item[Value Function] Denote by $Q^*(s,a)$ the expected reward starting at $s$, taking action $a$ and acting optimally. The {\it value function} $V^*(s)$ is the expected reward starting at state $s$ and acting optimally. 1407 | \item[The Bellman Equations] These equations connect the notion of the value of a state and the value of a policy. 1408 | \begin{align*} 1409 | V^*(s)&=\max\limits_{a}Q^*(s,a)=Q^*(s,\pi^*(s))\\ 1410 | Q^*(s,a)&=\sum_{s'}T(s,a,s')\left[R(s,a,s')+\gamma V^*(s')\right] 1411 | \end{align*} 1412 | We can define $V^*(s)$ recursively to get 1413 | $$V^*(s)=\max\limits_{a}\left\{\sum_{s'}T(s,a,s')\left[R(s,a,s')+\gamma V^*(s')\right]\right\}.$$ 1414 | \end{description} 1415 | 1416 | \subsection{Value Iteration Algorithm} 1417 | 1418 | \begin{description} 1419 | \item[Definition] {\it Value iteration} is an iterative algorithm that computes the values of states indexed by $k$. Let $V^*_k(s)$ be the expected reward from state $s$ after $k$ steps: 1420 | $$V^*_k(s)\rightarrow V^*(s)\quad\text{as}\quad k\rightarrow\infty.$$ 1421 | \begin{enumerate} 1422 | \item Initialization: $V^*_0(s)=0$ 1423 | \item Iterate until $V^*_k(s)\simeq V^*_{k+1}(s)\;\forall s$ 1424 | $$V^*_{k+1}(s)\leftarrow\max\limits_{a}\left\{\sum_{s'}T(s,a,s')\left[R(s,a,s')+\gamma V^*_k(s')\right]\right\}$$ 1425 | \item Compute $Q^*(s,a)$ and $\pi^*(s)=\argmax\limits_{a}Q^*(s,a)$.
1426 | \end{enumerate} 1427 | \item[Convergence] This algorithm will converge as long as $\gamma<1$. 1428 | \end{description} 1429 | 1430 | \subsection{Q-Value Iteration} 1431 | \begin{description} 1432 | \item[Definition] We can directly operate at the level of Q-values. Q-value iteration is a reformulation of the value iteration algorithm. 1433 | \item[Update Rule] ~ 1434 | $$Q^*_{k+1}(s,a)\leftarrow\sum_{s'}T(s,a,s')\left[R(s,a,s')+\gamma\max\limits_{a'}Q^*_k(s',a')\right]$$ 1435 | \end{description} 1436 | 1437 | \subsection{Reinforcement Learning} 1438 | \begin{description} 1439 | \item[MDP vs. RL] In MDPs, we are given 4 quantities $\langle S,A,T,R\rangle$. In reinforcement learning, we are given only the states and actions $\langle S,A\rangle$. In the real world, transitions and rewards might not be directly available and they need to be estimated. 1440 | \item[Estimation] Consider a random variable $X$. The goal is to estimate 1441 | $$\mathbb{E}\left[f(X)\right]=\sum_{x}p(x)f(x).$$ 1442 | We have access to $K$ samples: $x_i$, $i=1,\dots,K$.
1443 | \item[Model-based Learning] ~ 1444 | \begin{align*} 1445 | \widehat{p}(x)&=\frac{1}{K}\,\text{count}(x)\\ 1446 | \mathbb{E}\left[f(X)\right]&\approx\sum_{x}\widehat{p}(x)f(x) 1447 | \end{align*} 1448 | \item[Model-free Learning] ~ 1449 | $$\mathbb{E}\left[f(X)\right]\approx\frac{1}{K}\sum_{i=1}^{K}f(x_i)$$ 1450 | \item[Q-Value Iteration for RL] ~ 1451 | \begin{enumerate} 1452 | \item Initialization: $Q(s,a)=0\;\forall s,a$ 1453 | \item Iterate until convergence: 1454 | \begin{enumerate} 1455 | \item Collect sample: $s,a,s',R(s,a,s')$ 1456 | \item Update: 1457 | \begin{align*} 1458 | Q_{i+1}(s,a)&\leftarrow\,\alpha\left[R(s,a,s')+\gamma\max\limits_{a'}Q_i(s',a')\right]+(1-\alpha)\,Q_i(s,a)\\ 1459 | &=Q_i(s,a)+\alpha\left[R(s,a,s')+\gamma\max\limits_{a'}Q_i(s',a')-Q_i(s,a)\right] 1460 | \end{align*} 1461 | \end{enumerate} 1462 | \end{enumerate} 1463 | 1464 | \end{description} 1465 | 1466 | 1467 | 1468 | \section{Recommended Resources} \smallskip \hrule height 1pt \smallskip 1469 | 1470 | \bigskip 1471 | 1472 | \begin{itemize} 1473 | \item Introduction to Machine Learning with Python (M\"uller and Guido) 1474 | \item Machine Learning with Python -- From Linear Models to Deep Learning [Lecture Slides] (\url{http://www.edx.org}) 1475 | \item LaTeX File (\texttt{\href{https://github.com/mynameisjanus/686xMachineLearning}{github.com/mynameisjanus/686xMachineLearning}}) 1476 | \end{itemize} 1477 | 1478 | \begin{center}\emph{Please share this cheatsheet with friends!}\end{center} 1479 | 1480 | \end{multicols*} 1481 | 1482 | 1483 | 1484 | \end{document} 1485 | -------------------------------------------------------------------------------- /maxpool.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynameisjanus/686xMachineLearning/6cd4a9f9e1c01df41a7f51e8a95560037a76ead2/maxpool.png -------------------------------------------------------------------------------- /mushroom.jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynameisjanus/686xMachineLearning/6cd4a9f9e1c01df41a7f51e8a95560037a76ead2/mushroom.jpeg -------------------------------------------------------------------------------- /mushroom2.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynameisjanus/686xMachineLearning/6cd4a9f9e1c01df41a7f51e8a95560037a76ead2/mushroom2.jpeg -------------------------------------------------------------------------------- /orig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynameisjanus/686xMachineLearning/6cd4a9f9e1c01df41a7f51e8a95560037a76ead2/orig.png -------------------------------------------------------------------------------- /rbf.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynameisjanus/686xMachineLearning/6cd4a9f9e1c01df41a7f51e8a95560037a76ead2/rbf.pdf -------------------------------------------------------------------------------- /relu.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynameisjanus/686xMachineLearning/6cd4a9f9e1c01df41a7f51e8a95560037a76ead2/relu.pdf -------------------------------------------------------------------------------- /tanh.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynameisjanus/686xMachineLearning/6cd4a9f9e1c01df41a7f51e8a95560037a76ead2/tanh.pdf --------------------------------------------------------------------------------