├── .gitignore
├── 189-cheat-sheet-minicards.pdf
├── 189-cheat-sheet-nominicards.pdf
├── 189-cheat-sheet.lyx
├── README.md
└── graphics
├── NN.pdf
├── NN1.pdf
├── NN2.pdf
├── disc09-entropy-1.pdf
├── disc10-skipnn-1.pdf
├── disc10-skipnn-2.pdf
├── disc12-pca-1.pdf
├── disc12-pca-2.pdf
├── disc12-pca-3.pdf
└── disc12-pca-4.pdf
/.gitignore:
--------------------------------------------------------------------------------
1 | *.aux
2 | *.fdb_latexmk
3 | *.gz
4 | *.log
5 | *.out
6 | .DS_Store
7 | .pdf
8 |
--------------------------------------------------------------------------------
/189-cheat-sheet-minicards.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/189-cheat-sheet-minicards.pdf
--------------------------------------------------------------------------------
/189-cheat-sheet-nominicards.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/189-cheat-sheet-nominicards.pdf
--------------------------------------------------------------------------------
/189-cheat-sheet.lyx:
--------------------------------------------------------------------------------
1 | #LyX 2.1 created this file. For more info see http://www.lyx.org/
2 | \lyxformat 474
3 | \begin_document
4 | \begin_header
5 | \textclass extarticle
6 | \begin_preamble
7 | \usepackage{amsmath,amsthm,amsfonts,amssymb}
8 | \usepackage{calc}
9 | \usepackage{color,graphicx,overpic}
10 | \usepackage[shortlabels]{enumitem}
11 | \usepackage{hyperref}
12 | \usepackage{ifthen}
13 | \usepackage{multicol}
14 | \usepackage{titlesec}
15 | \usepackage{wrapfig}
16 |
17 | \titlespacing*{\section}{0pt}{0.5em}{0em}
18 | \titlespacing*{\subsection}{0pt}{0.5em}{0em}
19 | \titlespacing*{\subsubsection}{0pt}{0.5em}{0em}
20 | \titleformat{\section}{\vspace{1em}\titlerule\normalfont\fontsize{7}{7}\bfseries}{\thesection}{1em}{}
21 | \titleformat{\subsection}{\normalfont\fontsize{6}{6}\bfseries}{\thesection}{1em}{}
22 | \titleformat{\subsubsection}{\titlerule\normalfont\fontsize{6}{6}}{\thesection}{1em}{}
23 | \titlespacing*{\labeling}{0pt}{0em}{0em}
24 |
25 | \let\stdboxed\boxed
26 | \renewcommand{\boxed}[1]{
27 | \setlength{\fboxsep}{0.05em}
28 | \stdboxed{#1}
29 | }
30 |
31 | \setlist{nolistsep,leftmargin=*}
32 |
33 | \setlength{\premulticols}{1pt}
34 | \setlength{\postmulticols}{1pt}
35 | \setlength{\columnsep}{10pt}
36 |
37 | \newtheorem{example}[section]{Example}
38 |
39 | \let\textquotedbl="
40 | \def\ci{\perp\!\!\!\perp}
41 |
42 | \raggedright
43 |
44 | \newcommand{\mytitle}[2]{
45 | \begin{center}\small{#1} -- \scriptsize{#2}\end{center}
46 | }
47 |
48 |
49 | \hyphenpenalty=100
50 | \end_preamble
51 | \options 3pt
52 | \use_default_options false
53 | \maintain_unincluded_children false
54 | \language english
55 | \language_package none
56 | \inputencoding auto
57 | \fontencoding default
58 | \font_roman times
59 | \font_sans default
60 | \font_typewriter default
61 | \font_math auto
62 | \font_default_family default
63 | \use_non_tex_fonts false
64 | \font_sc false
65 | \font_osf false
66 | \font_sf_scale 100
67 | \font_tt_scale 100
68 | \graphics default
69 | \default_output_format default
70 | \output_sync 0
71 | \bibtex_command default
72 | \index_command default
73 | \paperfontsize default
74 | \spacing single
75 | \use_hyperref false
76 | \papersize default
77 | \use_geometry true
78 | \use_package amsmath 1
79 | \use_package amssymb 0
80 | \use_package cancel 0
81 | \use_package esint 1
82 | \use_package mathdots 0
83 | \use_package mathtools 0
84 | \use_package mhchem 0
85 | \use_package stackrel 0
86 | \use_package stmaryrd 0
87 | \use_package undertilde 0
88 | \cite_engine basic
89 | \cite_engine_type default
90 | \biblio_style plain
91 | \use_bibtopic false
92 | \use_indices false
93 | \paperorientation portrait
94 | \suppress_date false
95 | \justification false
96 | \use_refstyle 0
97 | \index Index
98 | \shortcut idx
99 | \color #008000
100 | \end_index
101 | \leftmargin 0.25in
102 | \topmargin 0.25in
103 | \rightmargin 0.25in
104 | \bottommargin 0.25in
105 | \secnumdepth -2
106 | \tocdepth 3
107 | \paragraph_separation skip
108 | \defskip smallskip
109 | \quotes_language english
110 | \papercolumns 1
111 | \papersides 1
112 | \paperpagestyle empty
113 | \tracking_changes false
114 | \output_changes false
115 | \html_math_output 0
116 | \html_css_as_file 0
117 | \html_be_strict false
118 | \end_header
119 |
120 | \begin_body
121 |
122 | \begin_layout Standard
123 | \begin_inset ERT
124 | status open
125 |
126 | \begin_layout Plain Layout
127 |
128 |
129 | \backslash
130 | fontsize{5}{4}
131 | \backslash
132 | selectfont
133 | \end_layout
134 |
135 | \end_inset
136 |
137 |
138 | \end_layout
139 |
140 | \begin_layout Standard
141 | \begin_inset ERT
142 | status open
143 |
144 | \begin_layout Plain Layout
145 |
146 |
147 | \backslash
148 | mytitle{CS 189 Final Note Sheet}{Rishi Sharma, Peter Gao, et al.}
150 | \end_layout
151 |
152 | \begin_layout Plain Layout
153 |
154 |
155 | \backslash
156 | begin{multicols}{4}
157 | \end_layout
158 |
159 | \end_inset
160 |
161 |
162 | \end_layout
163 |
164 | \begin_layout Section
165 | Probability & Matrix Review
166 | \end_layout
167 |
168 | \begin_layout Subsection
169 | Bayesian Decision Theory
170 | \end_layout
171 |
172 | \begin_layout Standard
173 | Bayes Rule:
174 | \begin_inset Formula $P(\omega|x)=\frac{P(x|\omega)P(\omega)}{P(x)},P(x)=\sum_{i}P(x|\omega_{i})P(\omega_{i})$
175 | \end_inset
176 |
177 |
178 | \end_layout
179 |
180 | \begin_layout Standard
181 | \begin_inset Formula $P(x,w)=P(x|w)P(w)=P(w|x)P(x)$
182 | \end_inset
183 |
184 |
185 | \end_layout
186 |
187 | \begin_layout Standard
188 | \begin_inset Formula $P(error)=\int_{-\infty}^{\infty}P(error|x)P(x)dx$
189 | \end_inset
190 |
191 |
192 | \end_layout
193 |
194 | \begin_layout Standard
195 | \begin_inset Formula $P(error|x)=\left\{ \begin{array}{lr}
196 | P(\omega_{1}|x) & \text{ if we decide }\omega_{2}\\
197 | P(\omega_{2}|x) & \text{ if we decide }\omega_{1}
198 | \end{array}\right.$
199 | \end_inset
200 |
201 |
202 | \end_layout
203 |
204 | \begin_layout Standard
205 | 0-1 Loss:
206 | \begin_inset Formula $\lambda(\alpha_{i}|\omega_{j})=\left\{ \begin{array}{lr}
207 | 0 & i=j\text{\ (correct)}\\
208 | 1 & i\not=j\text{\ (mismatch)}
209 | \end{array}\right.$
210 | \end_inset
211 |
212 |
213 | \end_layout
214 |
215 | \begin_layout Standard
216 |
217 | \family roman
218 | \series medium
219 | \shape up
220 | \size normal
221 | \emph off
222 | \bar no
223 | \strikeout off
224 | \uuline off
225 | \uwave off
226 | \noun off
227 | \color none
228 | Expected Loss (Risk)
229 | \family default
230 | \series default
231 | \shape default
232 | \size default
233 | \bar default
234 | \strikeout default
235 | \uuline default
236 | \uwave default
237 | \noun default
238 | \color inherit
239 | :
240 | \begin_inset Formula $R(\alpha_{i}|x)=\sum_{j=1}^{c}\lambda(\alpha_{i}|\omega_{j})P(\omega_{j}|x)$
241 | \end_inset
242 |
243 |
244 | \end_layout
245 |
246 | \begin_layout Standard
247 |
248 | \family roman
249 | \series medium
250 | \shape up
251 | \size normal
252 | \emph off
253 | \bar no
254 | \strikeout off
255 | \uuline off
256 | \uwave off
257 | \noun off
258 | \color none
259 | 0-1 Risk:
260 | \family default
261 | \series default
262 | \shape default
263 | \size default
264 | \bar default
265 | \strikeout default
266 | \uuline default
267 | \uwave default
268 | \noun default
269 | \color inherit
270 |
271 | \begin_inset Formula $R(\alpha_{i}|x)=\sum_{j\not=i}^{c}P(\omega_{j}|x)=1-P(\omega_{i}|x)$
272 | \end_inset
273 |
274 |
275 | \end_layout
276 |
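\begin_layout Standard
Toy check of the rules above (illustrative numbers, not from lecture): if 
\begin_inset Formula $P(\omega_{1})=0.6,P(x|\omega_{1})=0.2,P(\omega_{2})=0.4,P(x|\omega_{2})=0.5$
\end_inset

, then 
\begin_inset Formula $P(x)=0.12+0.20=0.32$
\end_inset

 and 
\begin_inset Formula $P(\omega_{1}|x)=\frac{0.12}{0.32}=0.375$
\end_inset

, so the 0-1 risk is minimized by deciding 
\begin_inset Formula $\omega_{2}$
\end_inset

 with 
\begin_inset Formula $R(\alpha_{2}|x)=1-P(\omega_{2}|x)=0.375$
\end_inset

.
\end_layout
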
277 | \begin_layout Subsection
278 | Generative vs.
279 | Discriminative Model
280 | \end_layout
281 |
282 | \begin_layout Standard
283 |
284 | \series bold
285 | Generative
286 | \series default
287 | : Model class conditional density
288 | \begin_inset Formula $p(x|y)$
289 | \end_inset
290 |
291 | and find
292 | \begin_inset Formula $p(y|x)\propto p(x|y)p(y)$
293 | \end_inset
294 |
295 | or model joint density
296 | \begin_inset Formula $p(x,y)$
297 | \end_inset
298 |
299 | and marginalize to find
300 | \begin_inset Formula $p(y=k|x)=\int_{x}p(x,y=k)dx$
301 | \end_inset
302 |
303 | (posterior)
304 | \end_layout
305 |
306 | \begin_layout Standard
307 |
308 | \series bold
309 | Discriminative
310 | \series default
311 | : Model conditional
312 | \begin_inset Formula $p(y|x)$
313 | \end_inset
314 |
315 | .
316 | \end_layout
317 |
318 | \begin_layout Standard
319 | \begin_inset Tabular
320 |
321 |
322 |
323 |
324 |
325 |
326 | \begin_inset Text
327 |
328 | \begin_layout Plain Layout
329 |
330 | \series bold
331 | class conditional
332 | \series default
333 |
334 | \begin_inset Formula $P(X|Y)$
335 | \end_inset
336 |
337 |
338 | \end_layout
339 |
340 | \end_inset
341 | |
342 |
343 | \begin_inset Text
344 |
345 | \begin_layout Plain Layout
346 |
347 | \series bold
348 | posterior
349 | \series default
350 |
351 | \begin_inset Formula $P(Y|X)$
352 | \end_inset
353 |
354 |
355 | \end_layout
356 |
357 | \end_inset
358 | |
359 |
360 |
361 |
362 | \begin_inset Text
363 |
364 | \begin_layout Plain Layout
365 |
366 | \series bold
367 | prior
368 | \series default
369 |
370 | \begin_inset Formula $P(Y)$
371 | \end_inset
372 |
373 |
374 | \end_layout
375 |
376 | \end_inset
377 | |
378 |
379 | \begin_inset Text
380 |
381 | \begin_layout Plain Layout
382 |
383 | \series bold
384 | evidence
385 | \series default
386 |
387 | \begin_inset Formula $P(X)$
388 | \end_inset
389 |
390 |
391 | \end_layout
392 |
393 | \end_inset
394 | |
395 |
396 |
397 |
398 | \end_inset
399 |
400 |
401 | \end_layout
402 |
403 | \begin_layout Subsection
404 | Probabilistic Motivation for Least Squares
405 | \end_layout
406 |
407 | \begin_layout Standard
408 | \begin_inset Formula $y^{(i)}=\theta^{\intercal}x^{(i)}+\epsilon^{(i)}\ \text{with noise}\ \epsilon^{(i)}\sim\mathcal{N}(0,\sigma^{2})$
409 | \end_inset
410 |
411 |
412 | \end_layout
413 |
414 | \begin_layout Standard
415 | Note: The intercept term
416 | \begin_inset Formula $x_{0}=1$
417 | \end_inset
418 |
419 | is accounted for in
420 | \begin_inset Formula $\theta$
421 | \end_inset
422 |
423 |
424 | \begin_inset Newline newline
425 | \end_inset
426 |
427 |
428 | \begin_inset Formula $\implies p(y^{(i)}|x^{(i)};\theta)=\frac{1}{\sqrt{2\pi\sigma^{2}}}\exp\left(-\frac{(y^{(i)}-\theta^{\intercal}x^{(i)})^{2}}{2\sigma^{2}}\right)$
429 | \end_inset
430 |
431 |
432 | \begin_inset Newline newline
433 | \end_inset
434 |
435 |
436 | \begin_inset Formula $\implies L(\theta)=\prod_{i=1}^{m}\frac{1}{\sqrt{2\pi\sigma^{2}}}\exp\left(-\frac{(y^{(i)}-\theta^{\intercal}x^{(i)})^{2}}{2\sigma^{2}}\right)$
437 | \end_inset
438 |
439 |
440 | \begin_inset Newline newline
441 | \end_inset
442 |
443 |
444 | \begin_inset Formula $\implies l(\theta)=m\log\frac{1}{\sqrt{2\pi\sigma^{2}}}-\frac{1}{2\sigma^{2}}\sum_{i=1}^{m}(y^{(i)}-\theta^{\intercal}x^{(i)})^{2}$
445 | \end_inset
446 |
447 |
448 | \begin_inset Newline newline
449 | \end_inset
450 |
451 |
452 | \begin_inset Formula $\implies\max_{\theta}l(\theta)\equiv\min_{\theta}\sum_{i=1}^{m}(y^{(i)}-h_{\theta}(x^{(i)}))^{2}$
453 | \end_inset
454 |
455 |
456 | \end_layout
457 |
458 | \begin_layout Standard
459 | Gaussian noise in our data set
460 | \begin_inset Formula $\{x^{(i)},y^{(i)}\}_{i=1}^{m}$
461 | \end_inset
462 |
463 | gives us least squares
464 | \end_layout
465 |
466 | \begin_layout Standard
467 | \begin_inset Formula $\min_{\theta}||X\theta-y||_{2}^{2}\equiv\min_{\theta}\theta^{\intercal}X^{\intercal}X\theta-2\theta^{\intercal}X^{\intercal}y+y^{\intercal}y$
468 | \end_inset
469 |
470 |
471 | \end_layout
472 |
473 | \begin_layout Standard
474 | \begin_inset Formula $\nabla_{\theta}l(\theta)=X^{\intercal}X\theta-X^{\intercal}y=0\implies\boxed{\theta^{*}=(X^{\intercal}X)^{-1}X^{\intercal}y}$
475 | \end_inset
476 |
477 |
478 | \end_layout
479 |
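\begin_layout Standard
Tiny worked example (our own numbers): for 
\begin_inset Formula $X=\begin{bmatrix}1 & 0\\
1 & 1\\
1 & 2
\end{bmatrix},y=\begin{bmatrix}1\\
2\\
2
\end{bmatrix}$
\end_inset

 we get 
\begin_inset Formula $X^{\intercal}X=\begin{bmatrix}3 & 3\\
3 & 5
\end{bmatrix},X^{\intercal}y=\begin{bmatrix}5\\
6
\end{bmatrix}\implies\theta^{*}=\frac{1}{6}\begin{bmatrix}5 & -3\\
-3 & 3
\end{bmatrix}\begin{bmatrix}5\\
6
\end{bmatrix}=\begin{bmatrix}7/6\\
1/2
\end{bmatrix}$
\end_inset

; the residual 
\begin_inset Formula $y-X\theta^{*}$
\end_inset

 is orthogonal to the columns of 
\begin_inset Formula $X$
\end_inset

.
\end_layout
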
480 | \begin_layout Standard
481 | Gradient Descent:
482 | \begin_inset Formula $\theta_{t+1}=\theta_{t}+\alpha(y_{t}^{(i)}-h(x_{t}^{(i)}))x_{t}^{(i)},\ \ h_{\theta}(x)=\theta^{\intercal}x$
483 | \end_inset
484 |
485 |
486 | \end_layout
487 |
488 | \begin_layout Subsection
489 | Multivariate Gaussian
490 | \begin_inset Formula $X\sim\mathcal{N}(\mu,\Sigma)$
491 | \end_inset
492 |
493 |
494 | \end_layout
495 |
496 | \begin_layout Standard
497 |
498 | \bar under
499 | Gaussian class conditionals lead to a logistic posterior.
500 | \end_layout
501 |
502 | \begin_layout Standard
503 | \begin_inset Formula $f(x;\mu,\Sigma)=\frac{1}{(2\pi)^{n/2}|\Sigma|^{1/2}}\exp\left(-\frac{1}{2}(x-\mu)^{T}\Sigma^{-1}(x-\mu)\right)$
504 | \end_inset
505 |
506 |
507 | \end_layout
508 |
509 | \begin_layout Standard
510 | \begin_inset Formula $\Sigma=E[(X-\mu)(X-\mu)^{T}]=E[XX^{T}]-\mu\mu^{T}$
511 | \end_inset
512 |
513 |
514 | \end_layout
515 |
516 | \begin_layout Standard
517 | \begin_inset Formula $\Sigma\text{ is PSD}\implies x^{T}\Sigma x\ge0\text{, if inverse exists }\Sigma\text{ must be PD}$
518 | \end_inset
519 |
520 |
521 | \end_layout
522 |
523 | \begin_layout Standard
524 | \begin_inset Formula $\text{If }X\sim N(\mu,\Sigma),\ \text{then}\ AX+b\sim N(A\mu+b,A\Sigma A^{T})$
525 | \end_inset
526 |
527 |
528 | \begin_inset Newline newline
529 | \end_inset
530 |
531 |
532 | \begin_inset Formula $\implies\Sigma^{-\frac{1}{2}}(X-\mu)\sim N(0,I),\text{ where }\Sigma^{-\frac{1}{2}}=U\Lambda^{-\frac{1}{2}}U^{\intercal}$
533 | \end_inset
534 |
535 |
536 | \end_layout
537 |
538 | \begin_layout Standard
539 | The distribution is the result of a linear transformation of a vector of
540 | univariate Gaussians
541 | \begin_inset Formula $Z\sim\mathcal{N}(0,I)$
542 | \end_inset
543 |
544 | such that
545 | \begin_inset Formula $X=AZ+\mu$
546 | \end_inset
547 |
548 | where we have
549 | \begin_inset Formula $\Sigma=AA^{\intercal}$
550 | \end_inset
551 |
552 | .
553 | From the pdf, we see that the level curves of the distribution decrease
554 | proportionally with
555 | \begin_inset Formula $x^{\intercal}\Sigma^{-1}x$
556 | \end_inset
557 |
558 | (assume
559 | \begin_inset Formula $\mu=0$
560 | \end_inset
561 |
562 | )
563 | \begin_inset Formula $\implies$
564 | \end_inset
565 |
566 |
567 | \begin_inset Formula
568 | \[
569 | \text{\ensuremath{c}-level set of \ensuremath{f}}\propto\{x:x^{\intercal}\Sigma^{-1}x=c\}
570 | \]
571 |
572 | \end_inset
573 |
574 |
575 | \begin_inset Formula
576 | \[
577 | x^{\intercal}\Sigma^{-1}x=c\equiv x^{\intercal}U\Lambda^{-1}U^{\intercal}x=c\implies
578 | \]
579 |
580 | \end_inset
581 |
582 |
583 | \begin_inset Formula
584 | \[
585 | \underbrace{\lambda_{1}^{-1}(u_{1}^{\intercal}x)^{2}}_{\text{axis length: \ensuremath{\sqrt{\lambda_{1}}}}}+\cdots+\underbrace{\lambda_{n}^{-1}(u_{n}^{\intercal}x)^{2}}_{\text{axis length: \ensuremath{\sqrt{\lambda_{n}}}}}=c
586 | \]
587 |
588 | \end_inset
589 |
590 |
591 | \end_layout
592 |
593 | \begin_layout Standard
594 | Thus the level curves form an ellipsoid with axis lengths equal to the square
595 | root of the eigenvalues of the covariance matrix.
596 | \end_layout
597 |
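\begin_layout Standard
Concrete example (our own numbers): for 
\begin_inset Formula $\mu=0,\Sigma=\begin{bmatrix}4 & 0\\
0 & 1
\end{bmatrix}$
\end_inset

 the eigenvalues are 4 and 1, so the 
\begin_inset Formula $c=1$
\end_inset

 level set 
\begin_inset Formula $\frac{x_{1}^{2}}{4}+x_{2}^{2}=1$
\end_inset

 is an ellipse with semi-axis 
\begin_inset Formula $\sqrt{4}=2$
\end_inset

 along 
\begin_inset Formula $u_{1}=e_{1}$
\end_inset

 and semi-axis 
\begin_inset Formula $\sqrt{1}=1$
\end_inset

 along 
\begin_inset Formula $u_{2}=e_{2}$
\end_inset

.
\end_layout
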
598 | \begin_layout Subsection
599 | Loss Functions
600 | \end_layout
601 |
602 | \begin_layout Standard
603 |
604 | \end_layout
605 |
606 | \begin_layout Itemize
607 |
608 | \series bold
609 | Binomial deviance
610 | \series default
611 |
612 | \begin_inset Formula $=\log\left[1+e^{-yf\left(x\right)}\right]$
613 | \end_inset
614 |
615 |
616 | \begin_inset Newline newline
617 | \end_inset
618 |
619 | minimizing function
620 | \begin_inset Formula $f\left(x\right)=\log\frac{\mathrm{P}\left[Y=+1\mid x\right]}{\mathrm{P}\left[Y=-1\mid x\right]}$
621 | \end_inset
622 |
623 |
624 | \end_layout
625 |
626 | \begin_layout Itemize
627 |
628 | \series bold
629 | SVM hinge loss
630 | \series default
631 |
632 | \begin_inset Formula $=\left[1-yf\left(x\right)\right]_{+}$
633 | \end_inset
634 |
635 |
636 | \begin_inset Newline newline
637 | \end_inset
638 |
639 | minimizing function
640 | \begin_inset Formula $f\left(x\right)=\mathrm{sign}\left(\mathrm{P}\left[Y=+1\mid x\right]-\frac{1}{2}\right)$
641 | \end_inset
642 |
643 |
644 | \end_layout
645 |
646 | \begin_layout Itemize
647 |
648 | \series bold
649 | Squared error
650 | \series default
651 |
652 | \begin_inset Formula $=\left[y-f\left(x\right)\right]^{2}=\left[1-yf\left(x\right)\right]^{2}$
653 | \end_inset
654 |
655 |
656 | \begin_inset Newline newline
657 | \end_inset
658 |
659 | minimizing function
660 | \begin_inset Formula $f\left(x\right)=2\mathrm{P}\left[Y=+1\mid x\right]-1$
661 | \end_inset
662 |
663 |
664 | \end_layout
665 |
666 | \begin_layout Itemize
667 |
668 | \series bold
669 | \begin_inset Quotes eld
670 | \end_inset
671 |
672 | Huberized
673 | \begin_inset Quotes erd
674 | \end_inset
675 |
676 | square hinge loss
677 | \series default
678 |
679 | \begin_inset Formula $=\left\{ \begin{array}{ll}
680 | -4yf\left(x\right) & \text{if}\ yf\left(x\right)<-1\\
681 | \left[1-yf\left(x\right)\right]_{+}^{2} & \text{otherwise}
682 | \end{array}\right.$
683 | \end_inset
684 |
685 |
686 | \begin_inset Newline newline
687 | \end_inset
688 |
689 | minimizing function
690 | \begin_inset Formula $f\left(x\right)=2\mathrm{P}\left[Y=+1\mid x\right]-1$
691 | \end_inset
692 |
693 |
694 | \end_layout
695 |
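\begin_layout Standard
Quick comparison of the losses above at a margin of 
\begin_inset Formula $yf(x)=-1$
\end_inset

 (our own arithmetic): deviance 
\begin_inset Formula $\log(1+e)\approx1.31$
\end_inset

, hinge 
\begin_inset Formula $2$
\end_inset

, squared error 
\begin_inset Formula $4$
\end_inset

, huberized square hinge 
\begin_inset Formula $4$
\end_inset

; at 
\begin_inset Formula $yf(x)=2$
\end_inset

 all are near zero except squared error, which is 
\begin_inset Formula $1$
\end_inset

 and thus penalizes confident correct predictions.
\end_layout
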
696 | \begin_layout Subsection
697 | Optimization
698 | \end_layout
699 |
700 | \begin_layout Standard
701 | Newton's Method:
702 | \begin_inset Formula $\theta_{t+1}=\theta_{t}-[\nabla_{\theta}^{2}f(\theta_{t})]^{-1}\nabla_{\theta}f(\theta_{t})$
703 | \end_inset
704 |
705 |
706 | \end_layout
707 |
708 | \begin_layout Standard
709 | Gradient Descent: 
710 | \begin_inset Formula $\theta_{t+1}=\theta_{t}-\alpha\nabla_{\theta}f(\theta_{t})$
711 | \end_inset
712 |
713 | , for minimizing
714 | \end_layout
715 |
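\begin_layout Standard
Sanity check (our own example): on the quadratic 
\begin_inset Formula $f(\theta)=(\theta-3)^{2}$
\end_inset

, Newton's method converges in one step, 
\begin_inset Formula $\theta_{1}=\theta_{0}-\frac{2(\theta_{0}-3)}{2}=3$
\end_inset

, while gradient descent takes 
\begin_inset Formula $\theta_{1}=\theta_{0}-2\alpha(\theta_{0}-3)$
\end_inset

 and needs 
\begin_inset Formula $\alpha$
\end_inset

 chosen well to converge.
\end_layout
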
716 | \begin_layout Subsection
717 | Gradients
718 | \end_layout
719 |
720 | \begin_layout Standard
721 | \begin_inset Formula $\frac{\partial{\bf {y}}}{\partial{\bf {x}}}\triangleq\begin{bmatrix}\frac{\partial y_{1}}{\partial x_{1}} & \dots & \frac{\partial y_{m}}{\partial x_{1}}\\
722 | \vdots & \ddots & \vdots\\
723 | \frac{\partial y_{1}}{\partial x_{n}} & \dots & \frac{\partial y_{m}}{\partial x_{n}}
724 | \end{bmatrix},$
725 | \end_inset
726 |
727 |
728 | \begin_inset Formula $\frac{\partial(A{\bf x})}{\partial{\bf x}}=A^{T},\frac{\partial({\bf x}^{T}A)}{\partial{\bf x}}=A,$
729 | \end_inset
730 |
731 |
732 | \begin_inset Newline newline
733 | \end_inset
734 |
735 |
736 | \begin_inset Formula $\frac{\partial({\bf x}^{T}{\bf x})}{\partial{\bf x}}=2{\bf x},\frac{\partial({\bf x}^{T}A{\bf x})}{\partial{\bf x}}=(A+A^{T}){\bf x},\frac{\partial(trBA)}{\partial A}=B^{T}$
737 | \end_inset
738 |
739 |
740 | \end_layout
741 |
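\begin_layout Standard
Example use of these identities (ties back to least squares): 
\begin_inset Formula $\nabla_{\theta}(\theta^{\intercal}X^{\intercal}X\theta-2\theta^{\intercal}X^{\intercal}y)=(X^{\intercal}X+(X^{\intercal}X)^{\intercal})\theta-2X^{\intercal}y=2X^{\intercal}X\theta-2X^{\intercal}y$
\end_inset

, which gives the normal equations when set to zero.
\end_layout
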
742 | \begin_layout Standard
743 | \begin_inset VSpace vfill
744 | \end_inset
745 |
746 |
747 | \end_layout
748 |
749 | \begin_layout Standard
750 | \begin_inset ERT
751 | status open
752 |
753 | \begin_layout Plain Layout
754 |
755 |
756 | \backslash
757 | columnbreak
758 | \end_layout
759 |
760 | \end_inset
761 |
762 |
763 | \end_layout
764 |
765 | \begin_layout Section
766 | Support Vector Machines
767 | \end_layout
768 |
769 | \begin_layout Standard
770 | In the strictly separable case, the goal is to find a separating hyperplane
771 | (like logistic regression) except now we don't just want any hyperplane,
772 | but one with the largest margin.
773 |
774 | \end_layout
775 |
776 | \begin_layout Standard
777 | \begin_inset Formula $H=\{\omega^{T}x+b=0\}$
778 | \end_inset
779 |
780 | , since scaling
781 | \begin_inset Formula $\omega$
782 | \end_inset
783 |
784 |  and b by the same factor doesn't change the hyperplane, our optimization
785 |  problem should have this scaling invariance built into it.
786 | Thus, we do it now and define the closest points to the hyperplane
787 | \begin_inset Formula $x_{sv}$
788 | \end_inset
789 |
790 | (support vectors) to satisfy:
791 | \begin_inset Formula $|\omega^{T}x_{sv}+b|=1$
792 | \end_inset
793 |
794 | .
795 |  The distance from any support vector to the hyperplane is now: 
796 | \begin_inset Formula $\frac{1}{||\omega||_{2}}$
797 | \end_inset
798 |
799 | .
800 | Maximizing the distance to the hyperplane is the same as minimizing
801 | \begin_inset Formula $||\omega||_{2}$
802 | \end_inset
803 |
804 | .
805 | \end_layout
806 |
807 | \begin_layout Standard
808 | The final optimization problem is:
809 | \end_layout
810 |
811 | \begin_layout Standard
812 | \begin_inset Formula $\boxed{\min_{\omega,b}\frac{1}{2}||\omega||_{2}^{2}\ s.t.\ y^{(i)}(w^{T}x^{(i)}+b)\ge1,i=1,\dots,m}$
813 | \end_inset
814 |
815 |
816 | \end_layout
817 |
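\begin_layout Standard
Toy margin check (our own numbers): if the learned hyperplane has 
\begin_inset Formula $\omega=(3,4)^{T}$
\end_inset

, then 
\begin_inset Formula $||\omega||_{2}=5$
\end_inset

 and every support vector (a point with 
\begin_inset Formula $|\omega^{T}x_{sv}+b|=1$
\end_inset

) lies at distance 
\begin_inset Formula $\frac{1}{||\omega||_{2}}=\frac{1}{5}$
\end_inset

 from 
\begin_inset Formula $H$
\end_inset

, so the total margin width is 
\begin_inset Formula $\frac{2}{5}$
\end_inset

.
\end_layout
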
818 | \begin_layout Standard
819 |
820 | \bar under
821 | Primal
822 | \bar default
823 | :
824 | \begin_inset Formula $L_{p}(\omega,b,\alpha)=\frac{1}{2}||\omega||_{2}^{2}-\sum_{i=1}^{m}\alpha_{i}(y^{(i)}(w^{T}x^{(i)}+b)-1)$
825 | \end_inset
826 |
827 |
828 | \end_layout
829 |
830 | \begin_layout Standard
831 | \begin_inset Formula $\frac{\partial L_{p}}{\partial\omega}=\omega-\sum\alpha_{i}y^{(i)}x^{(i)}=0\implies\omega=\sum\alpha_{i}y^{(i)}x^{(i)}$
832 | \end_inset
833 |
834 |
835 | \end_layout
836 |
837 | \begin_layout Standard
838 | \begin_inset Formula $\frac{\partial L_{p}}{\partial b}=-\sum\alpha_{i}y^{(i)}=0,\text{\ \ \ Note: }\alpha_{i}\ne0$
839 | \end_inset
840 |
841 | only for support vectors.
842 | \end_layout
843 |
844 | \begin_layout Standard
845 | Substitute the derivatives into the primal to get the dual.
846 | \end_layout
847 |
848 | \begin_layout Standard
849 |
850 | \bar under
851 | Dual
852 | \bar default
853 | :
854 | \begin_inset Formula $L_{d}(\alpha)=\sum_{i=1}^{m}\alpha_{i}-\frac{1}{2}\sum_{i=1}^{m}\sum_{j=1}^{m}y^{(i)}y^{(j)}\alpha_{i}\alpha_{j}(x^{(i)})^{T}x^{(j)}$
855 | \end_inset
856 |
857 |
858 | \end_layout
859 |
860 | \begin_layout Standard
861 | KKT says
862 | \begin_inset Formula $\alpha_{n}(y_{n}(w^{T}x_{n}+b)-1)=0$
863 | \end_inset
864 |
865 | where
866 | \begin_inset Formula $\alpha_{n}>0$
867 | \end_inset
868 |
869 | .
870 | \end_layout
871 |
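\begin_layout Standard
Consequence of KKT: for any support vector 
\begin_inset Formula $x_{n}$
\end_inset

 (where 
\begin_inset Formula $\alpha_{n}>0$
\end_inset

) the constraint is tight, so 
\begin_inset Formula $b=y_{n}-w^{T}x_{n}$
\end_inset

 (using 
\begin_inset Formula $y_{n}^{2}=1$
\end_inset

), which is how the intercept is recovered after solving the dual.
\end_layout
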
872 | \begin_layout Standard
873 | In the non-separable case we allow points to cross the marginal boundary
874 | by some amount
875 | \begin_inset Formula $\xi$
876 | \end_inset
877 |
878 | and penalize it.
879 | \end_layout
880 |
881 | \begin_layout Standard
882 | \begin_inset Formula $\boxed{\min_{\omega,b}\frac{1}{2}||\omega||_{2}^{2}+C\sum_{i=1}^{m}\xi_{i}\ \ s.t.\ \ y^{(i)}(w^{T}x^{(i)}+b)\ge1-\xi_{i}}$
883 | \end_inset
884 |
885 |
886 | \end_layout
887 |
888 | \begin_layout Standard
889 | The dual for non-separable doesn't change much except that each
890 | \begin_inset Formula $\alpha_{i}$
891 | \end_inset
892 |
893 | now has an upper bound of C
894 | \begin_inset Formula $\implies0\le\alpha_{i}\le C$
895 | \end_inset
896 |
897 |
898 | \end_layout
899 |
900 | \begin_layout Subsection
901 | Lagrangian
902 | \end_layout
903 |
904 | \begin_layout Standard
905 | \begin_inset Formula $\boxed{L\left(x,\lambda\right)=f_{0}\left(x\right)+\sum_{i=1}^{m}\lambda_{i}f_{i}\left(x\right)}$
906 | \end_inset
907 |
908 |
909 | \end_layout
910 |
911 | \begin_layout Itemize
912 | Think of the
913 | \begin_inset Formula $\lambda_{i}$
914 | \end_inset
915 |
916 | as the cost of violating the constraint
917 | \begin_inset Formula $f_{i}\left(x\right)\leq0$
918 | \end_inset
919 |
920 | .
921 | \end_layout
922 |
923 | \begin_layout Itemize
924 | \begin_inset Formula $L$
925 | \end_inset
926 |
927 | defines a saddle point game: one player (
928 | \noun on
929 | Min
930 | \noun default
931 | ) chooses x to minimize L; the other player (
932 | \noun on
933 | Max
934 | \noun default
935 | ) chooses
936 | \begin_inset Formula $\lambda$
937 | \end_inset
938 |
939 | to maximize
940 | \begin_inset Formula $L$
941 | \end_inset
942 |
943 | .
944 | If
945 | \noun on
946 | Min
947 | \noun default
948 | violates a constraint,
949 | \begin_inset Formula $f_{i}\left(x\right)>0$
950 | \end_inset
951 |
952 | , then
953 | \noun on
954 | Max
955 | \noun default
956 | can drive
957 | \begin_inset Formula $L$
958 | \end_inset
959 |
960 | to infinity.
961 | \end_layout
962 |
963 | \begin_layout Itemize
964 | We call the original optimization problem the
965 | \bar under
966 | primal
967 | \bar default
968 | problem.
969 | \begin_inset Newline newline
970 | \end_inset
971 |
972 | It has value
973 | \begin_inset Formula $p^{*}=\min_{x}\max_{\lambda\geq0}L\left(x,\lambda\right)$
974 | \end_inset
975 |
976 |
977 | \begin_inset Newline newline
978 | \end_inset
979 |
980 | (For an infeasible 
981 | \begin_inset Formula $x$
982 | \end_inset
983 |
984 | ,
985 | \begin_inset Formula $L\left(x,\lambda\right)$
986 | \end_inset
987 |
988 | can be made infinite, and for a feasible
989 | \begin_inset Formula $x$
990 | \end_inset
991 |
992 | , the
993 | \begin_inset Formula $\lambda_{i}f_{i}\left(x\right)$
994 | \end_inset
995 |
996 | terms will become zero.)
997 | \end_layout
998 |
999 | \begin_layout Itemize
1000 | Define
1001 | \begin_inset Formula $g\left(\lambda\right):=\min_{x}L\left(x,\lambda\right)$
1002 | \end_inset
1003 |
1004 | , and define the
1005 | \bar under
1006 | dual
1007 | \bar default
1008 | problem as
1009 | \begin_inset Newline newline
1010 | \end_inset
1011 |
1012 |
1013 | \begin_inset Formula $d^{*}=\max_{\lambda\geq0}g\left(\lambda\right)=\max_{\lambda\geq0}\min_{x}L\left(x,\lambda\right)$
1014 | \end_inset
1015 |
1016 |
1017 | \end_layout
1018 |
1019 | \begin_layout Itemize
1020 | In a zero sum game, it's always better to play second:
1021 | \begin_inset Formula $p^{*}=\min_{x}\max_{\lambda\geq0}L\left(x,\lambda\right)\geq\max_{\lambda\geq0}\min_{x}L\left(x,\lambda\right)=d^{*}$
1022 | \end_inset
1023 |
1024 | This is called
1025 | \bar under
1026 | weak duality
1027 | \bar default
1028 | .
1029 | \end_layout
1030 |
1031 | \begin_layout Itemize
1032 | If there is a
1033 | \bar under
1034 | saddle point
1035 | \bar default
1036 |
1037 | \begin_inset Formula $\left(x^{*},\lambda^{*}\right)$
1038 | \end_inset
1039 |
1040 | , so that for all
1041 | \begin_inset Formula $x$
1042 | \end_inset
1043 |
1044 | and
1045 | \begin_inset Formula $\lambda\geq0$
1046 | \end_inset
1047 |
1048 | ,
1049 | \begin_inset Formula $L\left(x^{*},\lambda\right)\leq L\left(x^{*},\lambda^{*}\right)\leq L\left(x,\lambda^{*}\right),$
1050 | \end_inset
1051 |
1052 | then we have
1053 | \bar under
1054 | strong duality
1055 | \bar default
1056 | : the primal and dual have the same value,
1057 | \begin_inset Formula $p^{*}=\min_{x}\max_{\lambda\geq0}L\left(x,\lambda\right)=\max_{\lambda\geq0}\min_{x}L\left(x,\lambda\right)=d^{*}$
1058 | \end_inset
1059 |
1060 |
1061 | \end_layout
1062 |
1063 | \begin_layout Standard
1064 | Using notation from Peter's notes:
1065 | \end_layout
1066 |
1067 | \begin_layout Standard
1068 | Given
1069 | \begin_inset Formula $\min_{x}f(x)\ s.t.\ g_{i}(x)=0,\ h_{i}(x)\le0$
1070 | \end_inset
1071 |
1072 | , the corresponding Lagrangian is:
1073 | \begin_inset Formula $L(x,\alpha,\beta)=f(x)+\sum_{i=1}^{k}\alpha_{i}g_{i}(x)+\sum_{i=1}^{l}\beta_{i}h_{i}(x)$
1074 | \end_inset
1075 |
1076 |
1077 | \end_layout
1078 |
1079 | \begin_layout Standard
1080 | We minimize over x and maximize over the Lagrange multipliers 
1081 | \begin_inset Formula $\alpha$
1082 | \end_inset
1083 |
1084 | and
1085 | \begin_inset Formula $\beta\geq0$
1086 | \end_inset
1087 |
1088 |
1089 | \end_layout
1090 |
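\begin_layout Standard
Small worked example (ours, not from the notes): minimize 
\begin_inset Formula $x^{2}$
\end_inset

 subject to 
\begin_inset Formula $1-x\le0$
\end_inset

.
 Then 
\begin_inset Formula $L(x,\lambda)=x^{2}+\lambda(1-x)$
\end_inset

, 
\begin_inset Formula $g(\lambda)=\min_{x}L=\lambda-\frac{\lambda^{2}}{4}$
\end_inset

 (at 
\begin_inset Formula $x=\frac{\lambda}{2}$
\end_inset

), and 
\begin_inset Formula $d^{*}=\max_{\lambda\geq0}g(\lambda)=1$
\end_inset

 at 
\begin_inset Formula $\lambda^{*}=2$
\end_inset

, matching the primal optimum 
\begin_inset Formula $p^{*}=1$
\end_inset

 at 
\begin_inset Formula $x^{*}=1$
\end_inset

: strong duality holds.
\end_layout
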
1091 | \begin_layout Section
1092 | Regression
1093 | \end_layout
1094 |
1095 | \begin_layout Standard
1096 | In general the loss function consists of two parts, the loss term and the
1097 | regularization term.
1098 |
1099 | \begin_inset Formula $J(\omega)=\sum_{i}Loss_{i}+\lambda R(\omega)$
1100 | \end_inset
1101 |
1102 |
1103 | \end_layout
1104 |
1105 | \begin_layout Standard
1106 | L2 regularization results in
1107 | \series bold
1108 | ridge regression
1109 | \series default
1110 | .
1111 | \begin_inset Newline newline
1112 | \end_inset
1113 |
1114 | Used when A has a nontrivial null space.
1115 |  L2 reg falls out of MAP estimation when we put a Gaussian prior on x with 
1116 | \begin_inset Formula $\Sigma=cI$
1117 | \end_inset
1118 |
1119 | .
1120 | \begin_inset Newline newline
1121 | \end_inset
1122 |
1123 |
1124 | \begin_inset Formula $\min_{x}||Ax-y||_{2}^{2}+\lambda||x||_{2}^{2}\implies x^{*}=(A^{T}A+\lambda I)^{-1}A^{T}y$
1125 | \end_inset
1126 |
1127 |
1128 | \end_layout
1129 |
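\begin_layout Standard
Scalar sanity check (our own example): with 
\begin_inset Formula $A=1$
\end_inset

 the ridge solution is 
\begin_inset Formula $x^{*}=\frac{y}{1+\lambda}$
\end_inset

, so 
\begin_inset Formula $\lambda=0$
\end_inset

 recovers least squares and larger 
\begin_inset Formula $\lambda$
\end_inset

 shrinks the estimate toward 0; in general 
\begin_inset Formula $A^{T}A+\lambda I$
\end_inset

 is invertible for any 
\begin_inset Formula $\lambda>0$
\end_inset

.
\end_layout
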
1130 | \begin_layout Standard
1131 | L1 regularization results in
1132 | \series bold
1133 | lasso regression
1134 | \series default
1135 | .
1136 | \begin_inset Newline newline
1137 | \end_inset
1138 |
1139 | Used when
1140 | \begin_inset Formula $x$
1141 | \end_inset
1142 |
1143 | has a Laplace prior.
1144 | Gives sparse results.
1145 | \end_layout
1146 |
1147 | \begin_layout Subsection
1148 | Logistic Regression
1149 | \end_layout
1150 |
1151 | \begin_layout Standard
1152 | Classify
1153 | \begin_inset Formula $y\in\{0,1\}\implies$
1154 | \end_inset
1155 |
1156 | Model
1157 | \begin_inset Formula $p(y=1|x)=\frac{1}{1+e^{-\theta^{T}x}}=h_{\theta}(x)$
1158 | \end_inset
1159 |
1160 |
1161 | \end_layout
1162 |
1163 | \begin_layout Standard
1164 | \begin_inset Formula $\frac{dh_{\theta}}{d(\theta^{T}x)}=(\frac{1}{1+e^{-\theta^{T}x}})^{2}e^{-\theta^{T}x}=\frac{1}{1+e^{-\theta^{T}x}}\left(1-\frac{1}{1+e^{-\theta^{T}x}}\right)=h_{\theta}(1-h_{\theta})$
1165 | \end_inset
1166 |
1167 |
1168 | \end_layout
1169 |
1170 | \begin_layout Standard
1171 | \begin_inset Formula $p(y|x;\theta)=(h_{\theta}(x))^{y}(1-h_{\theta}(x))^{1-y}\implies$
1172 | \end_inset
1173 |
1174 |
1175 | \end_layout
1176 |
1177 | \begin_layout Standard
1178 | \begin_inset Formula $L(\theta)=\prod_{i=1}^{m}(h_{\theta}(x^{(i)}))^{y^{(i)}}(1-h_{\theta}(x^{(i)}))^{1-y^{(i)}}\implies$
1179 | \end_inset
1180 |
1181 |
1182 | \end_layout
1183 |
1184 | \begin_layout Standard
1185 | \begin_inset Formula $l(\theta)=\sum_{i=1}^{m}y^{(i)}\log(h_{\theta}(x^{(i)}))+(1-y^{(i)})\log(1-h_{\theta}(x^{(i)}))\implies$
1186 | \end_inset
1187 |
1188 |
1189 | \end_layout
1190 |
1191 | \begin_layout Standard
1192 | \begin_inset Formula $\nabla_{\theta}l=\sum_{i}(y^{(i)}-h_{\theta}(x^{(i)}))x^{(i)}=X^{\intercal}(y-h_{\theta}(X))$
1193 | \end_inset
1194 |
1195 | , (want
1196 | \begin_inset Formula $\max\ l(\theta)$
1197 | \end_inset
1198 |
1199 | )
1200 | \end_layout
1201 |
1202 | \begin_layout Standard
1203 | Stochastic:
1204 | \begin_inset Formula $\boxed{\theta_{t+1}=\theta_{t}+\alpha(y_{t}^{(j)}-h_{\theta}(x_{t}^{(j)}))x_{t}^{(j)}}$
1205 | \end_inset
1206 |
1207 |
1208 | \end_layout
1209 |
1210 | \begin_layout Standard
1211 | Batch:
1212 | \begin_inset Formula $\boxed{\theta_{t+1}=\theta_{t}+\alpha X^{\intercal}(y-h_{\theta}(X))}$
1213 | \end_inset
1214 |
1215 |
1216 | \end_layout
1217 |
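\begin_layout Standard
Toy stochastic step (illustrative numbers): starting from 
\begin_inset Formula $\theta=0$
\end_inset

 we have 
\begin_inset Formula $h_{\theta}(x)=0.5$
\end_inset

 for every 
\begin_inset Formula $x$
\end_inset

, so with 
\begin_inset Formula $(x,y)=(1,1)$
\end_inset

 and 
\begin_inset Formula $\alpha=0.1$
\end_inset

 the update gives 
\begin_inset Formula $\theta\leftarrow0+0.1(1-0.5)(1)=0.05$
\end_inset

, nudging 
\begin_inset Formula $p(y=1|x)$
\end_inset

 above 0.5.
\end_layout
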
1218 | \begin_layout Standard
1219 | \begin_inset VSpace vfill
1220 | \end_inset
1221 |
1222 |
1223 | \end_layout
1224 |
1225 | \begin_layout Standard
1226 | \begin_inset ERT
1227 | status open
1228 |
1229 | \begin_layout Plain Layout
1230 |
1231 |
1232 | \backslash
1233 | columnbreak
1234 | \end_layout
1235 |
1236 | \end_inset
1237 |
1238 |
1239 | \end_layout
1240 |
1241 | \begin_layout Subsection
1242 | LDA and QDA
1243 | \end_layout
1244 |
1245 | \begin_layout Standard
1246 | Classify
1247 | \begin_inset Formula $y\in\{0,1\},$
1248 | \end_inset
1249 |
1250 | Model
1251 | \begin_inset Formula $p(y)=\phi^{y}(1-\phi)^{1-y}$
1252 | \end_inset
1253 |
1254 | and
1255 | \end_layout
1256 |
1257 | \begin_layout Standard
1258 | \begin_inset Formula $l(\phi,\mu_{0},\mu_{1},\Sigma)=\log\prod_{i=1}^{m}p(x^{(i)}|y^{(i)};\mu_{0},\mu_{1},\Sigma)p(y^{(i)};\phi)$
1259 | \end_inset
1260 |
1261 | gives us
1262 | \end_layout
1263 |
1264 | \begin_layout Standard
1265 | \begin_inset Formula $\phi_{MLE}=\frac{1}{m}\sum_{i=1}^{m}1\{y^{(i)}=1\}$
1266 | \end_inset
1267 |
1268 | ,
1269 | \begin_inset Formula $\mu_{k_{MLE}}=\text{avg of }x^{(i)}\text{ classified as }k$
1270 | \end_inset
1271 |
1272 | ,
1273 | \end_layout
1274 |
1275 | \begin_layout Standard
1276 | \begin_inset Formula $\Sigma_{MLE}=\frac{1}{m}\sum_{i=1}^{m}(x^{(i)}-\mu_{y_{(i)}})(x^{(i)}-\mu_{y_{(i)}})^{T}$
1277 | \end_inset
1278 |
1279 | .
1280 | \end_layout
1281 |
1282 | \begin_layout Standard
1283 | Notice the covariance matrix is the same for all classes in LDA.
1284 | \end_layout
1285 |
1286 | \begin_layout Standard
1287 | If
1288 | \begin_inset Formula $p(x|y)$
1289 | \end_inset
1290 |
1291 | multivariate gaussian (w/ shared
1292 | \begin_inset Formula $\Sigma)$
1293 | \end_inset
1294 |
1295 | , then
1296 | \begin_inset Formula $p(y|x)$
1297 | \end_inset
1298 |
1299 | is logistic function.
1300 | The converse is NOT true.
1301 | LDA makes stronger assumptions about data than does logistic regression.
1302 |
1303 | \begin_inset Formula $h(x)=\arg\max_{k}-\frac{1}{2}(x-\mu_{k})^{T}\Sigma^{-1}(x-\mu_{k})+\log(\pi_{k})$
1304 | \end_inset
1305 |
1306 |
1307 | \end_layout
1308 |
1309 | \begin_layout Standard
1310 | where
1311 | \begin_inset Formula $\pi_{k}=p(y=k)$
1312 | \end_inset
1313 |
1314 |
1315 | \end_layout
1316 |
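\begin_layout Standard
Why the shared 
\begin_inset Formula $\Sigma$
\end_inset

 matters (derivation sketch): the difference of two LDA scores is 
\begin_inset Formula $(\mu_{1}-\mu_{0})^{T}\Sigma^{-1}x-\frac{1}{2}(\mu_{1}^{T}\Sigma^{-1}\mu_{1}-\mu_{0}^{T}\Sigma^{-1}\mu_{0})+\log\frac{\pi_{1}}{\pi_{0}}$
\end_inset

, since the quadratic term 
\begin_inset Formula $x^{T}\Sigma^{-1}x$
\end_inset

 cancels; the boundary is linear in 
\begin_inset Formula $x$
\end_inset

 and the posterior is logistic.
 With per-class 
\begin_inset Formula $\Sigma_{k}$
\end_inset

 (QDA below) it does not cancel, giving quadratic boundaries.
\end_layout
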
1317 | \begin_layout Standard
1318 | For QDA, the model is the same as LDA except that each class has a unique
1319 | covariance matrix.
1320 |
1321 | \begin_inset Formula $h(x)=\arg\max_{k}-\frac{1}{2}\log|\Sigma_{k}|-\frac{1}{2}(x-\mu_{k})^{T}\Sigma_{k}^{-1}(x-\mu_{k})+\log(\pi_{k})$
1322 | \end_inset
1323 |
1324 |
1325 | \end_layout
1326 |
1327 | \begin_layout Section
1328 | Other Classifiers
1329 | \end_layout
1330 |
1331 | \begin_layout Subsection
1332 | Nearest Neighbor
1333 | \end_layout
1334 |
1335 | \begin_layout Standard
1336 | Key Idea: Store all training examples
1337 | \begin_inset Formula $\left\langle x_{i},f(x_{i})\right\rangle $
1338 | \end_inset
1339 |
1340 |
1341 | \end_layout
1342 |
1343 | \begin_layout Standard
1344 |
1345 | \series bold
1346 | NN
1347 | \series default
1348 | : Find closest training point using some distance metric and take its label.
1349 | \end_layout
1350 |
1351 | \begin_layout Standard
1352 |
1353 | \series bold
1354 | k-NN
1355 | \series default
1356 | : Find the closest k training points and take the most likely label based
1357 |  on some voting scheme (majority vote, mean, median,...)
1358 | \end_layout
1359 |
1360 | \begin_layout Standard
1361 |
1362 | \series bold
1363 | Behavior at the limit
1364 | \series default
1365 | : 1NN
1366 | \begin_inset Formula $\lim_{N\to\infty}\ \epsilon^{*}\le\epsilon_{NN}\le2\epsilon^{*}$
1367 | \end_inset
1368 |
1369 |
1370 | \begin_inset Formula $\epsilon^{*}=\text{error of optimal prediction},\ \epsilon_{NN}=\text{error of 1NN classifier}$
1371 | \end_inset
1372 |
1373 |
1374 | \end_layout
1375 |
1376 | \begin_layout Standard
1377 | KNN
1378 | \begin_inset space \space{}
1379 | \end_inset
1380 |
1381 |
1382 | \begin_inset Formula $\lim_{N\to\infty,K\to\infty,\frac{K}{N}\to0}\epsilon_{kNN}=\epsilon^{*}$
1383 | \end_inset
1384 |
1385 |
1386 | \end_layout
1387 |
1388 | \begin_layout Standard
1389 |
1390 | \series bold
1391 | Curse of dimensionality
1392 | \series default
1393 | : As the number of dimensions increases, everything becomes farther apart,
1394 |  and our low-dimensional intuition breaks down.
1395 |  Consider the hypersphere-to-hypercube volume ratio: it is already close to zero at 
1396 | \begin_inset Formula $d=10$
1397 | \end_inset
1398 | 
1399 | .
1400 |  How to deal with this curse:
1401 | \end_layout
1402 |
1403 | \begin_layout Enumerate
1404 | Get more data to fill all of that empty space
1405 | \end_layout
1406 |
1407 | \begin_layout Enumerate
1408 | Get better features, reducing the dimensionality and packing the data closer
1409 | together.
1410 | Ex: Bag-of-words, Histograms,...
1411 | \end_layout
1412 |
1413 | \begin_layout Enumerate
1414 | Use a better distance metric.
1415 | \end_layout
1416 |
1417 | \begin_layout Standard
1418 | Minkowski:
1419 | \begin_inset Formula $Dis_{p}(x,y)=(\sum_{i=1}^{d}|x_{i}-y_{i}|^{p})^{\frac{1}{p}}=||x-y||_{p}$
1420 | \end_inset
1421 |
1422 |
1423 | \end_layout
1424 |
1425 | \begin_layout Standard
1426 | 0-norm:
1427 | \begin_inset Formula $Dis_{0}(x,y)=\sum_{i=1}^{d}I[x_{i}\ne y_{i}]$
1428 | \end_inset
1429 |
1430 |
1431 | \end_layout
1432 |
1433 | \begin_layout Standard
1434 | Mahalanobis:
1435 | \begin_inset Formula $Dis_{M}(x,y|\Sigma)=\sqrt{(x-y)^{T}\Sigma^{-1}(x-y)}$
1436 | \end_inset
1437 |
1438 |
1439 | \end_layout
1440 |
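\begin_layout Standard
Distance check (our own numbers): for 
\begin_inset Formula $x=(0,0),y=(3,4)$
\end_inset

, 
\begin_inset Formula $Dis_{1}=7$
\end_inset

, 
\begin_inset Formula $Dis_{2}=5$
\end_inset

, 
\begin_inset Formula $Dis_{\infty}=4$
\end_inset

; with 
\begin_inset Formula $\Sigma=\mathrm{diag}(9,16)$
\end_inset

 the Mahalanobis distance is 
\begin_inset Formula $\sqrt{\frac{9}{9}+\frac{16}{16}}=\sqrt{2}$
\end_inset

, i.e.
 each coordinate is measured in units of its standard deviation.
\end_layout
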
1441 | \begin_layout Standard
1442 | In high-d we get
1443 | \begin_inset Quotes eld
1444 | \end_inset
1445 |
1446 | Hubs
1447 | \begin_inset Quotes erd
1448 | \end_inset
1449 |
1450 |  s.t. most points identify the hubs as their NN.
1451 | These hubs are usually near the means (Ex: dull gray images, sky and clouds).
1452 | To avoid having everything classified as these hubs, we can use cosine
1453 | similarity.
1454 | \end_layout
1455 |
1456 | \begin_layout Standard
1457 |
1458 | \series bold
1459 | K-d trees
1460 | \series default
1461 | increase the efficiency of nearest neighbor lookup.
1462 | \end_layout
1463 |
1464 | \begin_layout Subsection
1465 | Decision Trees
1466 | \end_layout
1467 |
1468 | \begin_layout Standard
1469 | Given a set of points and classes
1470 | \begin_inset Formula $\{x_{i},y_{i}\}_{i=1}^{n}$
1471 | \end_inset
1472 |
1473 | , test features
1474 | \begin_inset Formula $x_{j}$
1475 | \end_inset
1476 |
1477 | and branch on the feature which
1478 | \begin_inset Quotes eld
1479 | \end_inset
1480 |
1481 | best
1482 | \begin_inset Quotes erd
1483 | \end_inset
1484 |
1485 | separates the data.
1486 | Recursively split on the new subset of data.
1487 | Growing the tree to max depth tends to overfit (training data gets cut
1488 | quickly
1489 | \begin_inset Formula $\implies$
1490 | \end_inset
1491 |
1492 | subtrees train on small sets).
1493 | Mistakes high up in the tree propagate to corresponding subtrees.
1494 | To reduce overfitting, we can prune using a validation set, and we can
1495 | limit the depth.
1496 | \end_layout
1497 |
1498 | \begin_layout Standard
1499 | DT's are prone to label noise.
1500 | Building the correct tree is hard.
1501 | \end_layout
1502 |
1503 | \begin_layout Standard
1504 |
1505 | \series bold
1506 | Heuristic
1507 | \series default
1508 | : For
1509 | \bar under
1510 | classification
1511 | \bar default
1512 | , maximize information gain
1513 | \begin_inset Formula
1514 | \[
1515 | \max_{j}\quad\mathrm{H}(D)\ -\sum_{x_{j}\in X_{j}}P(X_{j}=x_{j})\cdot\mathrm{H}(D|X_{j}=x_{j})
1516 | \]
1517 |
1518 | \end_inset
1519 |
1520 | where
1521 | \begin_inset Formula $\mathrm{H}(D)=-\sum_{c\in C}P(y=c)\log[P(y=c)]$
1522 | \end_inset
1523 |
1524 | is the entropy of the data set,
1525 | \begin_inset Formula $C$
1526 | \end_inset
1527 |
1528 | is the set of classes each data point can take, and
1529 | \begin_inset Formula $P(y=c)$
1530 | \end_inset
1531 |
1532 | is the fraction of data points with class
1533 | \begin_inset Formula $c$
1534 | \end_inset
1535 |
1536 | .
1537 | \begin_inset Newline newline
1538 | \end_inset
1539 |
1540 | For
1541 | \noun on
1542 | regression
1543 | \noun default
1544 | , minimize the variance.
1545 | Same optimization problem as above, except H is replaced with var.
1546 | Pure leaves correspond to low variance, and the result is the mean of the
1547 | current leaf.
1548 | \end_layout
1549 |
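\begin_layout Standard
Worked split (illustrative numbers): a node with 4 positive and 4 negative points has 
\begin_inset Formula $\mathrm{H}(D)=1$
\end_inset

 bit.
 A split into two pure children gives gain 
\begin_inset Formula $1-0=1$
\end_inset

; a split into two children with class ratios 3:1 and 1:3 gives weighted entropy 
\begin_inset Formula $-\frac{3}{4}\log_{2}\frac{3}{4}-\frac{1}{4}\log_{2}\frac{1}{4}\approx0.811$
\end_inset

 and gain 
\begin_inset Formula $\approx0.19$
\end_inset

, so the pure split is preferred.
\end_layout
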
1550 | \begin_layout Subsection
1551 | Random Forests
1552 | \end_layout
1553 |
1554 | \begin_layout Standard
1555 |
1556 | \series bold
1557 | Problem
1558 | \series default
1559 | : DT's are
1560 | \bar under
1561 | unstable
1562 | \bar default
1563 | : small changes in the input data have large effect on tree structure
1564 | \begin_inset Formula $\implies$
1565 | \end_inset
1566 |
1567 | DT's are high-variance estimators.
1568 | \begin_inset Newline newline
1569 | \end_inset
1570 |
1571 |
1572 | \series bold
1573 | Solution
1574 | \series default
1575 | : Random Forests train
1576 | \begin_inset Formula $M$
1577 | \end_inset
1578 |
1579 | different trees with randomly sampled subsets of the data (called bagging),
1580 | and sometimes with randomly sampled subsets of the features to de-correlate
1581 | the trees.
1582 | A new point is tested on all
1583 | \begin_inset Formula $M$
1584 | \end_inset
1585 |
1586 | trees and we take the majority as our output class (for regression we take
1587 | the average of the output).
1588 | \end_layout
1589 |
1590 | \begin_layout Subsection
1591 | Boosting
1592 | \end_layout
1593 |
1594 | \begin_layout Standard
1595 | Weak Learner: Can classify with better than 50% accuracy.
1596 | \end_layout
1597 |
1598 | \begin_layout Standard
1599 | Train weak learner to get a weak classifier.
1600 |  Test it on the training data, up-weight misclassified data, down-weight correctly
1601 | classified data.
1602 | Train a new weak learner on the weighted data.
1603 | Repeat.
1604 | A new point is classified by every weak learner and the output class is
1605 | the sign of a weighted avg.
1606 | of weak learner outputs.
1607 | Boosting generally overfits.
1608 |  If there is label noise, boosting keeps upweighting the mislabeled data.
1609 | \end_layout
1610 |
1611 | \begin_layout Standard
1612 |
1613 | \series bold
1614 | AdaBoost
1615 | \series default
1616 | is a boosting algorithm.
1617 | The weak learner weights are given by
1618 | \begin_inset Formula $\alpha_{t}=\frac{1}{2}\ln(\frac{1-\epsilon_{t}}{\epsilon_{t}})$
1619 | \end_inset
1620 |
1621 | where
1622 | \begin_inset Formula $\epsilon_{t}=Pr_{D_{t}}(h_{t}(x_{i})\ne y_{i})$
1623 | \end_inset
1624 |
1625 | (probability of misclassification).
1626 | The weights are updated
1627 | \begin_inset Formula $D_{t+1}(i)=\frac{D_{t}(i)exp(-\alpha_{t}y_{i}h_{t}(x_{i}))}{Z_{t}}$
1628 | \end_inset
1629 |
1630 | where
1631 | \begin_inset Formula $Z_{t}$
1632 | \end_inset
1633 |
1634 | is a normalization factor.
1635 | \end_layout
1636 |
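\begin_layout Standard
Numeric example (our own numbers): if a round has weighted error 
\begin_inset Formula $\epsilon_{t}=0.25$
\end_inset

, then 
\begin_inset Formula $\alpha_{t}=\frac{1}{2}\ln3\approx0.55$
\end_inset

, and before normalization each misclassified point's weight is multiplied by 
\begin_inset Formula $e^{\alpha_{t}}=\sqrt{3}\approx1.73$
\end_inset

 while each correctly classified point's weight is multiplied by 
\begin_inset Formula $e^{-\alpha_{t}}=\frac{1}{\sqrt{3}}\approx0.58$
\end_inset

.
\end_layout
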
1637 | \begin_layout Subsection
1638 | Neural Networks
1639 | \end_layout
1640 |
1641 | \begin_layout Standard
1642 | Neural Nets explore what you can do by combining perceptrons, each of which
1643 | is a simple linear classifier.
1644 | We use a soft threshold for each activation function
1645 | \begin_inset Formula $\theta$
1646 | \end_inset
1647 |
1648 | because it is twice differentiable.
1649 | \end_layout
1650 |
1651 | \begin_layout Standard
1652 | \begin_inset Graphics
1653 | filename graphics/NN.pdf
1654 | lyxscale 50
1655 | width 72col%
1656 |
1657 | \end_inset
1658 |
1659 |
1660 | \begin_inset space \space{}
1661 | \end_inset
1662 |
1663 |
1664 | \begin_inset Graphics
1665 | filename graphics/NN2.pdf
1666 | lyxscale 35
1667 | width 21col%
1668 |
1669 | \end_inset
1670 |
1671 |
1672 | \end_layout
1673 |
1674 | \begin_layout Standard
1675 |
1676 | \series bold
1677 | Activation Functions:
1678 | \end_layout
1679 |
1680 | \begin_layout Standard
1681 | \begin_inset Formula $\theta(s)=\tanh(s)=\frac{e^{s}-e^{-s}}{e^{s}+e^{-s}}\implies\theta'(s)=1-\theta^{2}(s)$
1682 | \end_inset
1683 |
1684 |
1685 | \end_layout
1686 |
1687 | \begin_layout Standard
1688 | \begin_inset Formula $\theta(s)=\sigma(s)=\frac{1}{1+e^{-s}}\implies\theta'(s)=\sigma(s)(1-\sigma(s))$
1689 | \end_inset
1690 |
1691 |
1692 | \end_layout
1693 |
1694 | \begin_layout Standard
1695 |
1696 | \series bold
1697 | Error Functions
1698 | \series default
1699 | :
1700 | \end_layout
1701 |
1702 | \begin_layout Standard
1703 |
1704 | \family roman
1705 | \series medium
1706 | \shape up
1707 | \size normal
1708 | \emph off
1709 | \bar no
1710 | \strikeout off
1711 | \uuline off
1712 | \uwave off
1713 | \noun off
1714 | \color none
1715 | Cross Entropy Loss
1716 | \begin_inset Formula $-\sum_{i=1}^{n_{out}}[y\log(h_{\theta}(x))+(1-y)\log(1-h_{\theta}(x))]$
1717 | \end_inset
1718 |
1719 |
1720 | \end_layout
1721 |
1722 | \begin_layout Standard
1723 |
1724 | \family roman
1725 | \series medium
1726 | \shape up
1727 | \size normal
1728 | \emph off
1729 | \bar no
1730 | \strikeout off
1731 | \uuline off
1732 | \uwave off
1733 | \noun off
1734 | \color none
1735 | Mean Squared Error
1736 | \begin_inset Formula $\sum_{i=1}^{n_{out}}(y-h_{\theta}(x))^{2}$
1737 | \end_inset
1738 |
1739 |
1740 | \end_layout
1741 |
1742 | \begin_layout Standard
1743 |
1744 | \series bold
1745 | Notation:
1746 | \series default
1747 |
1748 | \end_layout
1749 |
1750 | \begin_layout Enumerate
1751 | \begin_inset Formula $w_{ij}^{(l)}$
1752 | \end_inset
1753 |
1754 | is the weight from neuron
1755 | \begin_inset Formula $i$
1756 | \end_inset
1757 |
1758 | in layer
1759 | \begin_inset Formula $l-1$
1760 | \end_inset
1761 |
1762 | to neuron
1763 | \begin_inset Formula $j$
1764 | \end_inset
1765 |
1766 | in layer
1767 | \begin_inset Formula $l$
1768 | \end_inset
1769 |
1770 | .
1771 | There are
1772 | \begin_inset Formula $d^{(l)}$
1773 | \end_inset
1774 |
1775 | nodes in the
1776 | \begin_inset Formula $l^{\text{th}}$
1777 | \end_inset
1778 |
1779 | layer.
1780 |
1781 | \end_layout
1782 |
1783 | \begin_layout Enumerate
1784 | \begin_inset Formula $L$
1785 | \end_inset
1786 |
1787 | layers, where L is output layer and data is 0th layer.
1788 |
1789 | \end_layout
1790 |
1791 | \begin_layout Enumerate
1792 | \begin_inset Formula $x_{j}^{(l)}=\theta(s_{j}^{(l)})$
1793 | \end_inset
1794 |
1795 | is the output of a neuron.
1796 | It's the activation function applied to the input signal.
1797 |
1798 | \begin_inset Formula $s_{j}^{(l)}=\sum_{i}w_{ij}^{(l)}x_{i}^{(l-1)}$
1799 | \end_inset
1800 |
1801 |
1802 | \end_layout
1803 |
1804 | \begin_layout Enumerate
1805 | \begin_inset Formula $e(w)$
1806 | \end_inset
1807 |
1808 | is the error as a function of the weights
1809 | \end_layout
1810 |
1811 | \begin_layout Standard
1812 |
1813 | \bar under
1814 | The goal is to learn the weights
1815 | \begin_inset Formula $w_{ij}^{(l)}$
1816 | \end_inset
1817 |
1818 | .
1819 |
1820 | \bar default
1821 | We use gradient descent, but the error function is non-convex so we tend to
1822 |  get stuck in local minima.
1823 | The naive version takes
1824 | \begin_inset Formula $O(w^{2})$
1825 | \end_inset
1826 |
1827 | .
1828 |
1829 | \bar under
1830 | Back propagation
1831 | \bar default
1832 | , an algorithm for efficient computation of the gradient, takes
1833 | \begin_inset Formula $O(w)$
1834 | \end_inset
1835 |
1836 | .
1837 | \end_layout
1838 |
1839 | \begin_layout Standard
1840 | \begin_inset Formula $\nabla e(w)\rightarrow\frac{\partial e(w)}{\partial w_{ij}^{(l)}}=\frac{\partial e(w)}{\partial s_{j}^{(l)}}\frac{\partial s_{j}^{(l)}}{\partial w_{ij}^{(l)}}=\delta_{j}^{(l)}x_{i}^{(l-1)}$
1841 | \end_inset
1842 |
1843 |
1844 | \end_layout
1845 |
1846 | \begin_layout Standard
1847 | Final Layer:
1848 | \begin_inset Formula $\delta_{j}^{(L)}=\frac{\partial e(w)}{\partial s_{j}^{(L)}}=\frac{\partial e(w)}{\partial x_{j}^{(L)}}\frac{\partial x_{j}^{(L)}}{\partial s_{j}^{(L)}}=e'(x_{j}^{(L)})\theta_{out}'(s_{j}^{(L)})$
1849 | \end_inset
1850 |
1851 |
1852 | \end_layout
1853 |
1854 | \begin_layout Standard
1855 |
1856 | \family roman
1857 | \series medium
1858 | \shape up
1859 | \size normal
1860 | \emph off
1861 | \bar no
1862 | \strikeout off
1863 | \uuline off
1864 | \uwave off
1865 | \noun off
1866 | \color none
1867 | General:
1868 | \family default
1869 | \series default
1870 | \shape default
1871 | \size default
1872 | \bar default
1873 | \strikeout default
1874 | \uuline default
1875 | \uwave default
1876 | \noun default
1877 | \color inherit
1878 |
1879 | \begin_inset Formula $\delta_{i}^{(l-1)}=\frac{\partial e(w)}{\partial s_{i}^{(l-1)}}=\sum_{j=1}^{d^{(l)}}\frac{\partial e(w)}{\partial s_{j}^{(l)}}\times\frac{\partial s_{j}^{(l)}}{\partial x_{i}^{(l-1)}}\times\frac{\partial x_{i}^{(l-1)}}{\partial s_{i}^{(l-1)}}$
1880 | \end_inset
1881 |
1882 |
1883 | \end_layout
1884 |
1885 | \begin_layout Standard
1886 | \begin_inset Formula $=\sum_{j=1}^{d^{(l)}}\delta_{j}^{(l)}\times w_{ij}^{(l)}\times\theta'(s_{i}^{(l-1)})$
1887 | \end_inset
1888 |
1889 |
1890 | \end_layout
1891 |
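\begin_layout Standard
Example instantiation (assuming squared error and a tanh output unit): 
\begin_inset Formula $e(w)=(x_{j}^{(L)}-y)^{2}\implies\delta_{j}^{(L)}=2(x_{j}^{(L)}-y)\left(1-(x_{j}^{(L)})^{2}\right)$
\end_inset

, which seeds the backward recursion above.
\end_layout
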
1892 | \begin_layout Standard
1893 | \begin_inset Graphics
1894 | filename graphics/NN1.pdf
1895 | lyxscale 50
1896 | width 100col%
1897 |
1898 | \end_inset
1899 |
1900 |
1901 | \end_layout
1902 |
1903 | \begin_layout Section
1904 | Unsupervised Learning
1905 | \end_layout
1906 |
1907 | \begin_layout Subsection
1908 | Clustering
1909 | \end_layout
1910 |
1911 | \begin_layout Standard
1912 | Unsupervised learning (no labels).
1913 | \end_layout
1914 |
1915 | \begin_layout Standard
1916 |
1917 | \series bold
1918 | Distance function
1919 | \series default
1920 | s.
1921 | Suppose we have two sets of points.
1922 | \end_layout
1923 |
1924 | \begin_layout Itemize
1925 |
1926 | \series bold
1927 | Single linkage
1928 | \series default
1929 | is minimum distance between members.
1930 | \end_layout
1931 |
1932 | \begin_layout Itemize
1933 |
1934 | \series bold
1935 | Complete linkage
1936 | \series default
1937 | is maximum distance between members.
1938 | \end_layout
1939 |
1940 | \begin_layout Itemize
1941 |
1942 | \series bold
1943 | Centroid linkage
1944 | \series default
1945 | is distance between centroids.
1946 | \end_layout
1947 |
1948 | \begin_layout Itemize
1949 |
1950 | \series bold
1951 | Average linkage
1952 | \series default
1953 | is average distance between all pairs.
1954 | \end_layout
1955 |
1956 | \begin_layout Standard
1957 |
1958 | \series bold
1959 | Hierarchical
1960 | \series default
1961 | :
1962 | \end_layout
1963 |
1964 | \begin_layout Itemize
1965 |
1966 | \bar under
1967 | Agglomerative
1968 | \bar default
1969 | : Start with n points, merge 2 closest clusters using some measure, such
1970 | as: Single-link (closest pair), Complete-link (furthest pair), Average-link
1971 | (average of all pairs), Centroid (centroid distance).
1972 | \begin_inset Newline newline
1973 | \end_inset
1974 |
1975 | Note: SL and CL are sensitive to outliers.
1976 | \end_layout
1977 |
1978 | \begin_layout Itemize
1979 |
1980 | \bar under
1981 | Divisive
1982 | \bar default
1983 | : Start with single cluster, recursively divide clusters into 2 subclusters.
1984 |
1985 | \end_layout
1986 |
1987 | \begin_layout Standard
1988 |
1989 | \series bold
1990 | Partitioning
1991 | \series default
1992 | : Partition the data into K mutually exclusive, exhaustive groups (i.e.
1993 |  encode k=C(i)).
1994 |  Iteratively reallocate points to minimize some loss function.
1995 |  Finding the optimal partition is hard.
1996 |  Use a greedy algorithm called K-means (coordinate descent).
1997 |  The loss function is non-convex, thus we find local minima.
1998 | \end_layout
1999 |
2000 | \begin_layout Itemize
2001 |
2002 | \series bold
2003 | K-means
2004 | \series default
2005 | : Choose clusters at random, calculate centroid of each cluster, reallocate
2006 | objects to nearest centroid, repeat.
2007 |
2008 | \bar under
2009 | Works with: spherical, well-separated clusters of similar volumes and count.
2010 | \end_layout
2011 |
2012 | \begin_layout Itemize
2013 |
2014 | \series bold
2015 | K-means
2016 | \series default
2017 | ++: Initialize clusters one by one.
2018 | D(x) = distance of point x to nearest cluster.
2019 | Pr(x is new cluster center)
2020 | \begin_inset Formula $\propto D(x)^{2}$
2021 | \end_inset
2022 |
2023 |
2024 | \end_layout
2025 |
2026 | \begin_layout Itemize
2027 |
2028 | \series bold
2029 | K-medians
2030 | \series default
2031 | : Works with arbitrary distance/dissimilarity metric, the centers
2032 | \begin_inset Formula $\mu_{k}$
2033 | \end_inset
2034 |
2035 | are represented by data points.
2036 |  It is more restrictive and thus has higher loss.
2037 | \end_layout
2038 |
2039 | \begin_layout Standard
2040 |
2041 | \series bold
2042 | General Loss
2043 | \series default
2044 | :
2045 | \begin_inset Formula $\sum_{n=1}^{N}\sum_{k=1}^{K}d(x_{n},\mu_{k})r_{nk}$
2046 | \end_inset
2047 |
2048 | where
2049 | \begin_inset Formula $r_{nk}=1$
2050 | \end_inset
2051 |
2052 | if
2053 | \begin_inset Formula $x_{n}$
2054 | \end_inset
2055 |
2056 | is in cluster k, and 0 o.w.
2057 | \end_layout
2058 |
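\begin_layout Standard
1-D toy run (our own numbers): for points 
\begin_inset Formula $\{0,1,10,11\}$
\end_inset

 with 
\begin_inset Formula $K=2$
\end_inset

 and squared Euclidean 
\begin_inset Formula $d$
\end_inset

, K-means converges to centroids 
\begin_inset Formula $\mu_{1}=0.5,\mu_{2}=10.5$
\end_inset

 with loss 
\begin_inset Formula $4\times0.25=1$
\end_inset

; any assignment that mixes the two groups has strictly higher loss.
\end_layout
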
2059 | \begin_layout Subsection
2060 | Vector Quantization
2061 | \end_layout
2062 |
2063 | \begin_layout Standard
2064 | Use clustering to find representative prototype vectors, which are used
2065 | to simplify representations of signals.
2066 | \end_layout
2067 |
2068 | \begin_layout Subsection
2069 | Parametric Density Estimation
2070 | \end_layout
2071 |
2072 | \begin_layout Standard
2073 |
2074 | \series bold
2075 | Mixture Models.
2076 |
2077 | \series default
2078 | Assume the PDF is a mixture of Gaussians with different centers.
2079 |
2080 | \begin_inset Formula $P(x)=\sum_{i=1}^{n_{c}}P(c_{i})P(x|c_{i})$
2081 | \end_inset
2082 |
2083 | with objective function as log likelihood of data.
2084 | Use
2085 | \series bold
2086 | EM
2087 | \series default
2088 | to estimate this model.
2089 |
2090 | \begin_inset Newline newline
2091 | \end_inset
2092 |
2093 | E Step:
2094 | \begin_inset Formula $P(\mu_{i}|x_{k})=\frac{P(\mu_{i})P(x_{k}|\mu_{i})}{\sum_{j}P(\mu_{j})P(x_{k}|\mu_{j})}$
2095 | \end_inset
2096 |
2097 |
2098 | \begin_inset Newline newline
2099 | \end_inset
2100 |
2101 | M Step:
2102 | \begin_inset Formula $P(c_{i})=\frac{1}{n_{e}}\sum_{k=1}^{n_{e}}P(\mu_{i}|x_{k})$
2103 | \end_inset
2104 |
2105 |
2106 | \begin_inset Newline newline
2107 | \end_inset
2108 |
2109 |
2110 | \begin_inset Formula $\mu_{i}=\frac{\sum_{k}x_{k}P(\mu_{i}|x_{k})}{\sum_{k}P(\mu_{i}|x_{k})}$
2111 | \end_inset
2112 |
2113 |
2114 | \begin_inset Newline newline
2115 | \end_inset
2116 |
2117 |
2118 | \begin_inset Formula $\sigma_{i}^{2}=\frac{\sum_{k}(x_{k}-\mu_{i})^{2}P(\mu_{i}|x_{k})}{\sum_{k}P(\mu_{i}|x_{k})}$
2119 | \end_inset
2120 |
2121 | .
2122 |
2123 | \end_layout
2124 |
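\begin_layout Standard
E-step toy computation (illustrative numbers): with two equally weighted components, 
\begin_inset Formula $P(\mu_{1})=P(\mu_{2})=0.5$
\end_inset

, and likelihoods 
\begin_inset Formula $P(x_{k}|\mu_{1})=0.2,P(x_{k}|\mu_{2})=0.05$
\end_inset

, the responsibility is 
\begin_inset Formula $P(\mu_{1}|x_{k})=\frac{0.5\cdot0.2}{0.5\cdot0.2+0.5\cdot0.05}=0.8$
\end_inset

; the M-step then re-estimates each 
\begin_inset Formula $\mu_{i}$
\end_inset

 as a responsibility-weighted mean.
\end_layout
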
2125 | \begin_layout Subsection
2126 | Non-parametric Density Estimation
2127 | \end_layout
2128 |
2129 | \begin_layout Standard
2130 | Can use
2131 | \series bold
2132 | Histogram
2133 | \series default
2134 | or Kernel Density Estimation (KDE).
2135 | \end_layout
2136 |
2137 | \begin_layout Standard
2138 |
2139 | \series bold
2140 | KDE
2141 | \series default
2142 | :
2143 | \begin_inset Formula $P(x)=\frac{1}{n}\sum K({\bf x}-{\bf x_{i}})$
2144 | \end_inset
2145 |
2146 | is a function of the data.
2147 | \end_layout
2148 |
2149 | \begin_layout Standard
2150 | The kernel K has the following properties:
2151 | \begin_inset Newline newline
2152 | \end_inset
2153 |
2154 | Symmetric, Normalized
2155 | \begin_inset Formula $\int_{\mathbb{R}^{d}}K(x)dx=1$
2156 | \end_inset
2157 |
2158 | , and
2159 | \begin_inset Formula $\lim_{||x||\rightarrow\infty}||x||^{d}K(x)=0$
2160 | \end_inset
2161 |
2162 | .
2163 | \end_layout
2164 |
2165 | \begin_layout Standard
2166 | The
2167 | \bar under
2168 | bandwidth
2169 | \bar default
2170 | is the width of the kernel function.
2171 | Too small = jagged results, too large = smoothed out results.
2172 | \end_layout
2173 |
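\begin_layout Standard
Standard example satisfying these properties: the Gaussian kernel with bandwidth 
\begin_inset Formula $h$
\end_inset

, 
\begin_inset Formula $K_{h}(x)=\frac{1}{(2\pi h^{2})^{d/2}}\exp\left(-\frac{||x||^{2}}{2h^{2}}\right)$
\end_inset

; it is symmetric, integrates to 1, and its tails decay fast enough that 
\begin_inset Formula $||x||^{d}K_{h}(x)\rightarrow0$
\end_inset

.
\end_layout
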
2174 | \begin_layout Subsection
2175 |
2176 | \series bold
2177 | Principal Component Analysis
2178 | \end_layout
2179 |
2180 | \begin_layout Standard
2181 | First run
2182 | \series bold
2183 | singular value decomposition
2184 | \series default
2185 | on
2186 | \series bold
2187 |
2188 | \series default
2189 | pattern matrix
2190 | \begin_inset Formula $X$
2191 | \end_inset
2192 |
2193 | :
2194 | \end_layout
2195 |
2196 | \begin_layout Enumerate
2197 | Subtract mean from each point
2198 | \end_layout
2199 |
2200 | \begin_layout Enumerate
2201 | (Sometimes) scale each dimension by its standard deviation
2202 | \end_layout
2203 |
2204 | \begin_layout Enumerate
2205 | Compute covariance
2206 | \begin_inset Formula $\Sigma=X^{T}X$
2207 | \end_inset
2208 |
2209 | (must be symmetric)
2210 | \end_layout
2211 |
2212 | \begin_layout Enumerate
2213 | Compute eigenvectors/values
2214 | \begin_inset Formula $\Sigma=VSV^{\intercal}$
2215 | \end_inset
2216 |
2217 | (spectral thm)
2218 | \end_layout
2219 |
2220 | \begin_layout Enumerate
2221 | Get back
2222 | \begin_inset Formula $X=XVV^{\intercal}=(XV)V^{\intercal}=USV^{\intercal}\text{ (with }US=XV\text{)}$
2223 | \end_inset
2224 |
2225 |
2226 | \end_layout
2227 |
2228 | \begin_layout Standard
2229 | \begin_inset Formula $S$
2230 | \end_inset
2231 |
2232 | contains the eigenvalues of the transformed features.
2233 | The larger the
2234 | \begin_inset Formula $S_{ii}$
2235 | \end_inset
2236 |
2237 | , the larger the variance of that feature.
2238 | We want the
2239 | \begin_inset Formula $k$
2240 | \end_inset
2241 |
2242 | largest features, so we find the indices of the
2243 | \begin_inset Formula $k$
2244 | \end_inset
2245 |
2246 | largest items in
2247 | \begin_inset Formula $S$
2248 | \end_inset
2249 |
2250 | and we keep only these entries in
2251 | \begin_inset Formula $U$
2252 | \end_inset
2253 |
2254 | and
2255 | \begin_inset Formula $V$
2256 | \end_inset
2257 |
2258 | .
2259 | \end_layout
2260 |
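\begin_layout Standard
Small example (our own numbers): if the eigenvalues are 
\begin_inset Formula $S_{11}=4,S_{22}=1$
\end_inset

, keeping 
\begin_inset Formula $k=1$
\end_inset

 component retains 
\begin_inset Formula $\frac{4}{4+1}=80\%$
\end_inset

 of the variance, and the reduced representation of the data is the projection 
\begin_inset Formula $XV_{k}$
\end_inset

 (the first 
\begin_inset Formula $k$
\end_inset

 columns of 
\begin_inset Formula $V$
\end_inset

).
\end_layout
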
2261 | \begin_layout Standard
2262 | \begin_inset VSpace vfill
2263 | \end_inset
2264 |
2265 |
2266 | \end_layout
2267 |
2268 | \begin_layout Standard
2269 | \begin_inset ERT
2270 | status open
2271 |
2272 | \begin_layout Plain Layout
2273 |
2274 |
2275 | \backslash
2276 | end{multicols}
2277 | \end_layout
2278 |
2279 | \end_inset
2280 |
2281 |
2282 | \end_layout
2283 |
2284 | \begin_layout Standard
2285 | \begin_inset Newpage newpage
2286 | \end_inset
2287 |
2288 |
2289 | \end_layout
2290 |
2291 | \begin_layout Standard
2292 | \begin_inset ERT
2293 | status open
2294 |
2295 | \begin_layout Plain Layout
2296 |
2297 |
2298 | \backslash
2299 | mytitle{CS 189 ALL OF IT}{Che Yeon, Chloe, Dhruv, Li, Sean}
2300 | \end_layout
2301 |
2302 | \begin_layout Plain Layout
2303 |
2304 |
2305 | \backslash
2306 | begin{multicols}{4}
2307 | \end_layout
2308 |
2309 | \end_inset
2310 |
2311 |
2312 | \end_layout
2313 |
2314 | \begin_layout Section
2315 | Past Exam Questions
2316 | \end_layout
2317 |
2318 | \begin_layout Standard
2319 | \begin_inset ERT
2320 | status collapsed
2321 |
2322 | \begin_layout Plain Layout
2323 |
2324 |
2325 | \backslash
2326 | bgroup
2327 | \end_layout
2328 |
2329 | \begin_layout Plain Layout
2330 |
2331 |
2332 | \backslash
2333 | renewcommand
2334 | \backslash
2335 | theenumi{(
2336 | \backslash
2337 | alph{enumi})}
2338 | \end_layout
2339 |
2340 | \begin_layout Plain Layout
2341 |
2342 |
2343 | \backslash
2344 | renewcommand
2345 | \backslash
2346 | labelenumi{
2347 | \backslash
2348 | theenumi}
2349 | \end_layout
2350 |
2351 | \end_inset
2352 |
2353 |
2354 | \end_layout
2355 |
2356 | \begin_layout Subsection
2357 | Spring 2013 Midterm
2358 | \end_layout
2359 |
2360 | \begin_layout Enumerate
2361 |
2362 | \bar under
2363 | False:
2364 | \bar default
2365 | In SVMs, we maximize
2366 | \begin_inset Formula $\frac{\left\Vert w\right\Vert ^{2}}{2}$
2367 | \end_inset
2368 |
2369 | subject to the margin constraints.
2370 | \end_layout
2371 |
2372 | \begin_layout Enumerate
2373 |
2374 | \bar under
2375 | False:
2376 | \bar default
2377 | In kernelized SVMs, the kernel matrix
2378 | \begin_inset Formula $K$
2379 | \end_inset
2380 |
2381 | has to be positive definite.
2382 | \end_layout
2383 |
2384 | \begin_layout Enumerate
2385 |
2386 | \bar under
2387 | True:
2388 | \bar default
2389 | If two random variables are independent, then they have to be uncorrelated.
2390 | \end_layout
2391 |
2392 | \begin_layout Enumerate
2393 |
2394 | \bar under
2395 | False:
2396 | \bar default
2397 | Isocontours of Gaussian distributions have axes whose lengths are proportional
2398 | to the eigenvalues of the covariance matrix.
2399 | \end_layout
2400 |
2401 | \begin_layout Enumerate
2402 |
2403 | \bar under
2404 | True:
2405 | \bar default
2406 | The RBF kernel
2407 | \begin_inset Formula $K\left(x_{i},x_{j}\right)=\exp\left(-\gamma\left\Vert x_{i}-x_{j}\right\Vert ^{2}\right)$
2408 | \end_inset
2409 |
2410 | corresponds to an infinite dimensional mapping of the feature vectors.
2411 | \end_layout
2412 |
2413 | \begin_layout Enumerate
2414 |
2415 | \bar under
2416 | True:
2417 | \bar default
2418 | If
2419 | \begin_inset Formula $(X,Y)$
2420 | \end_inset
2421 |
2422 | are jointly Gaussian, then
2423 | \begin_inset Formula $X$
2424 | \end_inset
2425 |
2426 | and
2427 | \begin_inset Formula $Y$
2428 | \end_inset
2429 |
2430 | are also Gaussian distributed.
2431 | \end_layout
2432 |
2433 | \begin_layout Enumerate
2434 |
2435 | \bar under
2436 | True:
2437 | \bar default
2438 | A function f(x,y,z) is convex if the Hessian of f is positive semi-definite.
2439 | \end_layout
2440 |
2441 | \begin_layout Enumerate
2442 |
2443 | \bar under
2444 | True:
2445 | \bar default
2446 | In a least-squares linear regression problem, adding an L2 regularization
2447 | penalty cannot decrease the L2 error of the solution w on the training
2448 | data.
2449 | \end_layout
2450 |
2451 | \begin_layout Enumerate
2452 |
2453 | \bar under
2454 | True:
2455 | \bar default
2456 | In linear SVMs, the optimal weight vector w is a linear combination of
2457 | training data points.
2458 | \end_layout
2459 |
2460 | \begin_layout Enumerate
2461 |
2462 | \bar under
2463 | False:
2464 | \bar default
2465 | In stochastic gradient descent, we take steps in the exact direction of
2466 | the gradient vector.
2467 | \end_layout
2468 |
2469 | \begin_layout Enumerate
2470 |
2471 | \bar under
2472 | False:
2473 | \bar default
2474 | In a two class problem when the class conditionals
2475 | \begin_inset Formula $P\left[x\mid y=0\right]\text{ and }P\left[x\mid y=1\right]$
2476 | \end_inset
2477 |
2478 | are modeled as Gaussians with different covariance matrices, the posterior
2479 | probabilities turn out to be logistic functions.
2480 | \end_layout
2481 |
2482 | \begin_layout Enumerate
2483 |
2484 | \bar under
2485 | True:
2486 | \bar default
2487 | The perceptron training procedure is guaranteed to converge if the two
2488 | classes are linearly separable.
2489 | \end_layout
2490 |
2491 | \begin_layout Enumerate
2492 |
2493 | \bar under
2494 | False:
2495 | \bar default
2496 | The maximum likelihood estimate for the variance of a univariate Gaussian
2497 | is unbiased.
2498 | \end_layout
2499 |
2500 | \begin_layout Enumerate
2501 |
2502 | \bar under
2503 | True:
2504 | \bar default
2505 | In linear regression, using an L1 regularization penalty term results in
2506 | sparser solutions than using an L2 regularization penalty term.
2507 |
2508 | \end_layout
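     |
     | \begin_layout Standard
     | Why the isocontour statement above is false (our addition): the axes of a Gaussian's isocontours point along the eigenvectors of
     | \begin_inset Formula $\Sigma$
     | \end_inset
     |
     | and their lengths scale with
     | \begin_inset Formula $\sqrt{\lambda_{i}}$
     | \end_inset
     |
     | , not with
     | \begin_inset Formula $\lambda_{i}$
     | \end_inset
     |
     | .
     | \end_layout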
2509 |
2510 | \begin_layout Subsection
2511 | Spring 2013 Final
2512 | \end_layout
2513 |
2514 | \begin_layout Enumerate
2515 |
2516 | \bar under
2517 | True:
2518 | \bar default
2519 | Solving a non linear separation problem with a hard margin Kernelized SVM
2520 | (Gaussian RBF Kernel) might lead to overfitting.
2521 | \end_layout
2522 |
2523 | \begin_layout Enumerate
2524 |
2525 | \bar under
2526 | True:
2527 | \bar default
2528 | In SVMs, the sum of the Lagrange multipliers corresponding to the positive
2529 | examples is equal to the sum of the Lagrange multipliers corresponding
2530 | to the negative examples.
2531 | \end_layout
2532 |
2533 | \begin_layout Enumerate
2534 |
2535 | \bar under
2536 | False:
2537 | \bar default
2538 | SVMs directly give us the posterior probabilities
2539 | \begin_inset Formula $\mathrm{P}\left(y=1\mid x\right)$
2540 | \end_inset
2541 |
2542 | and
2543 | \begin_inset Formula $\mathrm{P}\left(y=-1\mid x\right)$
2544 | \end_inset
2545 |
2546 | .
2547 | \end_layout
2548 |
2549 | \begin_layout Enumerate
2550 |
2551 | \bar under
2552 | False:
2553 | \bar default
2554 |
2555 | \begin_inset Formula $V(X)=\mathrm{E}[X]^{2}-\mathrm{E}[X^{2}]$
2556 | \end_inset
2557 |
2558 |
2559 | \end_layout
2560 |
2561 | \begin_layout Enumerate
2562 |
2563 | \bar under
2564 | True:
2565 | \bar default
2566 | In the discriminative approach to solving classification problems, we model
2567 | the conditional probability of the labels given the observations.
2568 | \end_layout
2569 |
2570 | \begin_layout Enumerate
2571 |
2572 | \bar under
2573 | False:
2574 | \bar default
2575 | In a two class classification problem, a point on the Bayes optimal decision
2576 | boundary x* always satisfies
2577 | \begin_inset Formula $\mathrm{P}\left[y=1\mid x*\right]=\mathrm{P}\left[y=0\mid x*\right]$
2578 | \end_inset
2579 |
2580 | .
2581 | \end_layout
2582 |
2583 | \begin_layout Enumerate
2584 |
2585 | \bar under
2586 | True:
2587 | \bar default
2588 | Any linear combination of the components of a multivariate Gaussian is
2589 | a univariate Gaussian.
2590 | \end_layout
2591 |
2592 | \begin_layout Enumerate
2593 |
2594 | \bar under
2595 | False:
2596 | \bar default
2597 | For any two random variables
2598 | \begin_inset Formula $X\sim N\left(\mu_{1},\sigma_{1}^{2}\right)$
2599 | \end_inset
2600 |
2601 | and
2602 | \begin_inset Formula $Y\sim\mathcal{N}\left(\mu_{2},\sigma_{2}^{2}\right)$
2603 | \end_inset
2604 |
2605 | ,
2606 | \begin_inset Formula $X+Y\sim\mathcal{N}\left(\mu_{1}+\mu_{2},\sigma_{1}^{2}+\sigma_{2}^{2}\right)$
2607 | \end_inset
2608 |
2609 | .
2610 | \end_layout
2611 |
2612 | \begin_layout Enumerate
2613 |
2614 | \bar under
2615 | False:
2616 | \bar default
2617 | For a logistic regression problem, differing initialization points can lead
2618 | to a much better optimum.
2619 | \end_layout
2620 |
2621 | \begin_layout Enumerate
2622 |
2623 | \bar under
2624 | False:
2625 | \bar default
2626 | In logistic regression, we model the odds ratio
2627 | \begin_inset Formula $\frac{p}{1-p}$
2628 | \end_inset
2629 |
2630 | as a linear function.
2631 | \end_layout
2632 |
2633 | \begin_layout Enumerate
2634 |
2635 | \bar under
2636 | True:
2637 | \bar default
2638 | Random forests can be used to classify infinite dimensional data.
2639 | \end_layout
2640 |
2641 | \begin_layout Enumerate
2642 |
2643 | \bar under
2644 | False:
2645 | \bar default
2646 | In boosting we start with a Gaussian weight distribution over the training
2647 | samples.
2648 | \end_layout
2649 |
2650 | \begin_layout Enumerate
2651 |
2652 | \bar under
2653 | False:
2654 | \bar default
2655 | In Adaboost, the error of each hypothesis is calculated by the ratio of
2656 | misclassified examples to the total number of examples.
2657 | \end_layout
2658 |
2659 | \begin_layout Enumerate
2660 |
2661 | \bar under
2662 | True:
2663 | \bar default
2664 | When
2665 | \begin_inset Formula $k=1$
2666 | \end_inset
2667 |
2668 | and
2669 | \begin_inset Formula $N\rightarrow\infty$
2670 | \end_inset
2671 |
2672 | , the kNN classification error rate is bounded above by twice the Bayes error
2673 | rate.
2674 | \end_layout
2675 |
2676 | \begin_layout Enumerate
2677 |
2678 | \bar under
2679 | True:
2680 | \bar default
2681 | A single layer neural network with a sigmoid activation for binary classificati
2682 | on with the cross entropy loss is exactly equivalent to logistic regression.
2683 | \end_layout
2684 |
2685 | \begin_layout Enumerate
2686 |
2687 | \bar under
2688 | True:
2689 | \bar default
2690 | Convolution is a linear operation i.e.
2691 |
2692 | \begin_inset Formula $\left(\alpha f_{1}+\beta f_{2}\right)\ast g=\alpha f_{1}\ast g+\beta f_{2}\ast g$
2693 | \end_inset
2694 |
2695 | .
2696 | \end_layout
2697 |
2698 | \begin_layout Enumerate
2699 |
2700 | \bar under
2701 | True:
2702 | \bar default
2703 | The k-means algorithm does coordinate descent on a non-convex objective
2704 | function.
2705 | \end_layout
2706 |
2707 | \begin_layout Enumerate
2708 |
2709 | \bar under
2710 | True:
2711 | \bar default
2712 | A 1-NN classifier has higher variance than a 3-NN classifier.
2713 | \end_layout
2714 |
2715 | \begin_layout Enumerate
2716 |
2717 | \bar under
2718 | False:
2719 | \bar default
2720 | The single link agglomerative clustering algorithm groups two clusters
2721 | on the basis of the maximum distance between points in the two clusters.
2722 | \end_layout
2723 |
2724 | \begin_layout Enumerate
2725 |
2726 | \bar under
2727 | False:
2728 | \bar default
2729 | The largest eigenvector of the covariance matrix is the direction of minimum
2730 | variance in the data.
2731 | \end_layout
2732 |
2733 | \begin_layout Enumerate
2734 |
2735 | \bar under
2736 | False:
2737 | \bar default
2738 | The eigenvectors of
2739 | \begin_inset Formula $AA^{T}$
2740 | \end_inset
2741 |
2742 | and
2743 | \begin_inset Formula $A^{T}A$
2744 | \end_inset
2745 |
2746 | are the same.
2747 | \end_layout
2748 |
2749 | \begin_layout Enumerate
2750 |
2751 | \bar under
2752 | True:
2753 | \bar default
2754 | The non-zero eigenvalues of
2755 | \begin_inset Formula $AA^{T}$
2756 | \end_inset
2757 |
2758 | and
2759 | \begin_inset Formula $A^{T}A$
2760 | \end_inset
2761 |
2762 | are the same.
2763 | \end_layout
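     |
     | \begin_layout Standard
     | Two quick checks for the statements above (our addition): the correct variance identity is
     | \begin_inset Formula $V(X)=\mathrm{E}[X^{2}]-\mathrm{E}[X]^{2}$
     | \end_inset
     |
     | ; and if
     | \begin_inset Formula $A^{\intercal}Av=\lambda v$
     | \end_inset
     |
     | with
     | \begin_inset Formula $\lambda\neq0$
     | \end_inset
     |
     | , then
     | \begin_inset Formula $AA^{\intercal}(Av)=\lambda(Av)$
     | \end_inset
     |
     | , so the non-zero eigenvalues coincide but the eigenvectors generally differ.
     | \end_layout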
2764 |
2765 | \begin_layout Standard
2766 | \begin_inset Phantom Phantom
2767 | status open
2768 |
2769 | \begin_layout Plain Layout
2770 |
2771 | \end_layout
2772 |
2773 | \end_inset
2774 |
2775 |
2776 | \end_layout
2777 |
2778 | \begin_layout Enumerate
2779 | In linear regression, the irreducible error is
2780 | \bar under
2781 |
2782 | \begin_inset Formula $\sigma^{2}$
2783 | \end_inset
2784 |
2785 |
2786 | \bar default
2787 | and
2788 | \begin_inset Formula $\boxed{E\left[\left(y-\mathrm{E}(y\mid x)\right)^{^{2}}\right]}$
2789 | \end_inset
2790 |
2791 | .
2792 | \end_layout
2793 |
2794 | \begin_layout Enumerate
2795 | Let
2796 | \begin_inset Formula $S_{1}$
2797 | \end_inset
2798 |
2799 | and
2800 | \begin_inset Formula $S_{2}$
2801 | \end_inset
2802 |
2803 | be the support vectors for
2804 | \begin_inset Formula $w_{1}$
2805 | \end_inset
2806 |
2807 | (hard margin) and
2808 | \begin_inset Formula $w_{2}$
2809 | \end_inset
2810 |
2811 | (soft margin).
2812 | Then
2813 | \bar under
2814 |
2815 | \begin_inset Formula $S_{1}$
2816 | \end_inset
2817 |
2818 | may not be a subset of
2819 | \begin_inset Formula $S_{2}$
2820 | \end_inset
2821 |
2822 |
2823 | \bar default
2824 | and
2825 | \bar under
2826 |
2827 | \begin_inset Formula $w_{1}$
2828 | \end_inset
2829 |
2830 | may not be equal to
2831 | \begin_inset Formula $w_{2}$
2832 | \end_inset
2833 |
2834 |
2835 | \bar default
2836 | .
2837 | \end_layout
2838 |
2839 | \begin_layout Enumerate
2840 | Ordinary least squares regression assumes each data point is generated according
2841 | to a linear function of the input plus
2842 | \begin_inset Formula $\mathcal{N}(0,\sigma)$
2843 | \end_inset
2844 |
2845 | noise.
2846 | In many systems, the noise variance is a positive linear function of the
2847 | input.
2848 | In this case, the probability model that describes this situation is
2849 | \begin_inset Formula $\boxed{\ensuremath{P(y|x)=\frac{1}{\sigma\sqrt{2\pi x}}\exp\left(-\frac{(y-(w_{0}+w_{1}x))^{2}}{2x\sigma^{2}}\right)}}$
2850 | \end_inset
2851 |
2852 | .
2853 | \end_layout
2854 |
2855 | \begin_layout Enumerate
2856 | Averaging the outputs of multiple decision trees helps
2857 | \bar under
2858 | reduce variance
2859 | \bar default
2860 | .
2861 | \end_layout
2862 |
2863 | \begin_layout Enumerate
2864 | The following loss functions are convex:
2865 | \bar under
2866 | logistic
2867 | \bar default
2868 | ,
2869 | \bar under
2870 | hinge
2871 | \bar default
2872 | ,
2873 | \bar under
2874 | exponential
2875 | \bar default
2876 | .
2877 |
2878 | \bar under
2879 | Misclassification loss is not.
2880 | \end_layout
2881 |
2882 | \begin_layout Enumerate
2883 |
2884 | \bar under
2885 | Bias will be smaller
2886 | \bar default
2887 | and
2888 | \bar under
2889 | variance will be larger
2890 | \bar default
2891 | for trees of
2892 | \bar under
2893 | smaller depth
2894 | \bar default
2895 | .
2896 | \end_layout
2897 |
2898 | \begin_layout Enumerate
2899 | If making a tree with
2900 | \begin_inset Formula $k$
2901 | \end_inset
2902 |
2903 | -ary splits,
2904 | \bar under
2905 | the algorithm will prefer high values of
2906 | \begin_inset Formula $k$
2907 | \end_inset
2908 |
2909 |
2910 | \bar default
2911 | and
2912 | \bar under
2913 | there will be
2914 | \begin_inset Formula $k-1$
2915 | \end_inset
2916 |
2917 | thresholds for a
2918 | \begin_inset Formula $k$
2919 | \end_inset
2920 |
2921 | -ary split
2922 | \bar default
2923 | .
2924 | \end_layout
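     |
     | \begin_layout Standard
     | For reference (our addition), with margin
     | \begin_inset Formula $z=yf(x)$
     | \end_inset
     |
     | the convex losses above are
     | \begin_inset Formula $\log\left(1+e^{-z}\right)$
     | \end_inset
     |
     | (logistic),
     | \begin_inset Formula $(1-z)_{+}$
     | \end_inset
     |
     | (hinge), and
     | \begin_inset Formula $e^{-z}$
     | \end_inset
     |
     | (exponential); misclassification loss
     | \begin_inset Formula $\mathbf{1}\left[z\leq0\right]$
     | \end_inset
     |
     | is not convex.
     | \end_layout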
2925 |
2926 | \begin_layout Standard
2927 | \begin_inset VSpace vfill
2928 | \end_inset
2929 |
2930 |
2931 | \end_layout
2932 |
2933 | \begin_layout Standard
2934 | \begin_inset ERT
2935 | status open
2936 |
2937 | \begin_layout Plain Layout
2938 |
2939 |
2940 | \backslash
2941 | columnbreak
2942 | \end_layout
2943 |
2944 | \end_inset
2945 |
2946 |
2947 | \end_layout
2948 |
2949 | \begin_layout Subsection
2950 | Spring 2014 Final
2951 | \end_layout
2952 |
2953 | \begin_layout Enumerate
2954 |
2955 | \bar under
2956 | False:
2957 | \bar default
2958 | The singular value decomposition of a real matrix is unique.
2959 | \end_layout
2960 |
2961 | \begin_layout Enumerate
2962 |
2963 | \bar under
2964 | True:
2965 | \bar default
2966 | A multiple-layer neural network with linear activation functions is equivalent
2967 | to one single-layer perceptron that uses the same error function on the
2968 | output layer and has the same number of inputs.
2969 | \end_layout
2970 |
2971 | \begin_layout Enumerate
2972 |
2973 | \bar under
2974 | False:
2975 | \bar default
2976 | The maximum likelihood estimator for the parameter
2977 | \begin_inset Formula $\theta$
2978 | \end_inset
2979 |
2980 | of a uniform distribution over
2981 | \begin_inset Formula $[0,\theta]$
2982 | \end_inset
2983 |
2984 | is unbiased.
2985 | \end_layout
2986 |
2987 | \begin_layout Enumerate
2988 |
2989 | \bar under
2990 | True:
2991 | \bar default
2992 | The k-means algorithm for clustering is guaranteed to converge to a local
2993 | optimum.
2994 | \end_layout
2995 |
2996 | \begin_layout Enumerate
2997 |
2998 | \bar under
2999 | True:
3000 | \bar default
3001 | Increasing the depth of a decision tree cannot increase its training error.
3002 | \end_layout
3003 |
3004 | \begin_layout Enumerate
3005 |
3006 | \bar under
3007 | False:
3008 | \bar default
3009 | There exists a one-to-one feature mapping
3010 | \begin_inset Formula $\phi$
3011 | \end_inset
3012 |
3013 | for every valid kernel k.
3014 | \end_layout
3015 |
3016 | \begin_layout Enumerate
3017 |
3018 | \bar under
3019 | True:
3020 | \bar default
3021 | For high-dimensional data, k-d trees can be slower than brute force
3022 | nearest neighbor search.
3023 | \end_layout
3024 |
3025 | \begin_layout Enumerate
3026 |
3027 | \bar under
3028 | True:
3029 | \bar default
3030 | If we had infinite data and infinitely fast computers, kNN would be the
3031 | only algorithm we would study in CS 189.
3032 | \end_layout
3033 |
3034 | \begin_layout Enumerate
3035 |
3036 | \bar under
3037 | True:
3038 | \bar default
3039 | For datasets with high label noise (many data points with incorrect labels),
3040 | random forests would generally perform better than boosted decision trees.
3041 | \end_layout
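     |
     | \begin_layout Standard
     | Why the uniform-MLE statement above is false (our addition): the MLE is
     | \begin_inset Formula $\hat{\theta}=\max_{i}x_{i}$
     | \end_inset
     |
     | , and
     | \begin_inset Formula $\mathrm{E}[\hat{\theta}]=\frac{n}{n+1}\theta<\theta$
     | \end_inset
     |
     | , so it is biased (low).
     | \end_layout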
3042 |
3043 | \begin_layout Standard
3044 | \begin_inset Phantom Phantom
3045 | status open
3046 |
3047 | \begin_layout Plain Layout
3048 |
3049 | \end_layout
3050 |
3051 | \end_inset
3052 |
3053 |
3054 | \end_layout
3055 |
3056 | \begin_layout Enumerate
3057 | In Homework 4, you fit a logistic regression model on spam and ham data
3058 | for a Kaggle Comp.
3059 | Assume you had a very good score on the public test set, but when the GSIs
3060 | ran your model on a private test set, your score dropped a lot.
3061 | This is likely because you overfitted by submitting multiple times and
3062 | changing the following between submiss
3063 | \bar under
3064 | ions:
3065 | \begin_inset Formula $\lambda$
3066 | \end_inset
3067 |
3068 | , your penalty term
3069 | \bar default
3070 | ;
3071 | \bar under
3072 |
3073 | \begin_inset Formula $\varepsilon$
3074 | \end_inset
3075 |
3076 | , your convergence criterion
3077 | \bar default
3078 | ;
3079 | \bar under
3080 | your step size
3081 | \bar default
3082 | ;
3083 | \bar under
3084 | fixing a random bug
3085 | \bar default
3086 | .
3087 | \end_layout
3088 |
3089 | \begin_layout Enumerate
3090 | Given
3091 | \begin_inset Formula $d$
3092 | \end_inset
3093 |
3094 | -dimensional data
3095 | \begin_inset Formula $\{x_{i}\}_{i=1}^{N}$
3096 | \end_inset
3097 |
3098 | , you run principal component analysis and pick
3099 | \begin_inset Formula $P$
3100 | \end_inset
3101 |
3102 | principal components.
3103 | Can you always reconstruct any data point
3104 | \emph on
3105 |
3106 | \begin_inset Formula $x_{i}$
3107 | \end_inset
3108 |
3109 |
3110 | \emph default
3111 | for
3112 | \begin_inset Formula $i$
3113 | \end_inset
3114 |
3115 | from
3116 | \begin_inset Formula $1$
3117 | \end_inset
3118 |
3119 | to
3120 | \begin_inset Formula $N$
3121 | \end_inset
3122 |
3123 | from the
3124 | \begin_inset Formula $P$
3125 | \end_inset
3126 |
3127 | principal components with zero reconstruction error?
3128 | \bar under
3129 | Yes, if
3130 | \begin_inset Formula $P=d$
3131 | \end_inset
3132 |
3133 | .
3134 | \end_layout
3135 |
3136 | \begin_layout Enumerate
3137 | Putting a standard Gaussian prior on the weights for linear regression
3138 | \begin_inset Formula $(w\sim N(0,I))$
3139 | \end_inset
3140 |
3141 | will result in what type of posterior distribution on the weights?
3142 | \bar under
3143 | Gaussian.
3144 | \end_layout
3145 |
3146 | \begin_layout Enumerate
3147 | Suppose we have
3148 | \begin_inset Formula $N$
3149 | \end_inset
3150 |
3151 | instances of d-dimensional data.
3152 | Let
3153 | \begin_inset Formula $h$
3154 | \end_inset
3155 |
3156 | be the amount of data storage necessary for a histogram with a fixed number
3157 | of ticks per axis, and let
3158 | \begin_inset Formula $k$
3159 | \end_inset
3160 |
3161 | be the amount of data storage necessary for kernel density estimation.
3162 | Which of the following is true about
3163 | \begin_inset Formula $h$
3164 | \end_inset
3165 |
3166 | and
3167 | \begin_inset Formula $k$
3168 | \end_inset
3169 |
3170 | ?
3171 | \bar under
3172 |
3173 | \begin_inset Formula $h$
3174 | \end_inset
3175 |
3176 | grows exponentially with
3177 | \bar default
3178 |
3179 | \begin_inset Formula $d$
3180 | \end_inset
3181 |
3182 | , and
3183 | \bar under
3184 |
3185 | \begin_inset Formula $k$
3186 | \end_inset
3187 |
3188 | grows linearly with
3189 | \begin_inset Formula $N$
3190 | \end_inset
3191 |
3192 |
3193 | \bar default
3194 | .
3195 | \end_layout
3196 |
3197 | \begin_layout Enumerate
3198 | John just trained a decision tree for a digit recognition.
3199 | He notices an extremely low training error, but an abnormally large test
3200 | error.
3201 | He also notices that an SVM with a linear kernel performs much better than
3202 | his tree.
3203 | What could be the cause of his problem?
3204 | \bar under
3205 | Decision tree is too deep
3206 | \bar default
3207 | ;
3208 | \bar under
3209 | decision tree is overfitting
3210 | \bar default
3211 | .
3212 | \end_layout
3213 |
3214 | \begin_layout Enumerate
3215 | John has now switched to multilayer neural networks and notices that the
3216 | training error is going down and converges to a local minimum.
3217 | Then when he tests on the new data, the test error is abnormally high.
3218 | What is probably going wrong and what do you recommend him to do?
3219 | \bar under
3220 | The training data size is not large enough, so collect more training
3221 | data and retrain on it
3222 | \bar default
3223 | ;
3224 | \bar under
3225 | play with learning rate and add regularization term to objective function
3226 | \bar default
3227 | ;
3228 | \bar under
3229 | use a different initialization and train the network several times and use
3230 | the average of predictions from all nets to predict test data
3231 | \bar default
3232 | ;
3233 | \bar under
3234 | use the same training data but use fewer hidden layers
3235 | \bar default
3236 | .
3237 | \end_layout
3238 |
3239 | \begin_layout Subsection
3240 | Spring 2015 Midterm
3241 | \end_layout
3242 |
3243 | \begin_layout Enumerate
3244 |
3245 | \bar under
3246 | True:
3247 | \bar default
3248 | If the data is not linearly separable, there is no solution to hard margin
3249 | SVM.
3250 | \end_layout
3251 |
3252 | \begin_layout Enumerate
3253 |
3254 | \bar under
3255 | True:
3256 | \bar default
3257 | logistic regression can be used for classification.
3258 | \end_layout
3259 |
3260 | \begin_layout Enumerate
3261 |
3262 | \bar under
3263 | False:
3264 | \bar default
3265 | Two ways to prevent beta vectors from getting too large are to use a small
3266 | step size and use a small regularization value
3267 | \end_layout
3268 |
3269 | \begin_layout Enumerate
3270 |
3271 | \bar under
3272 | False:
3273 | \bar default
3274 | The L2 norm is often used because it produces sparse results, as opposed
3275 | to the L1 norm which does not
3276 | \end_layout
3277 |
3278 | \begin_layout Enumerate
3279 |
3280 | \bar under
3281 | False:
3282 | \bar default
3283 | For a multivariate Gaussian, the eigenvalues of the covariance matrix are
3284 | inversely proportional to the lengths of the ellipsoid axes that determine
3285 | the isocontours of the density.
3286 | \end_layout
3287 |
3288 | \begin_layout Enumerate
3289 |
3290 | \bar under
3291 | True:
3292 | \bar default
3293 | In a generative binary classification model where we assume the class condition
3294 | als are distributed as Poisson and the class priors are Bernoulli, the posterior
3295 | assumes a logistic form.
3296 | \end_layout
3297 |
3298 | \begin_layout Enumerate
3299 |
3300 | \bar under
3301 | False:
3302 | \bar default
3303 | MLE gives us not only a point estimate, but a distribution over the parameters
3304 | we are estimating.
3305 | \end_layout
3306 |
3307 | \begin_layout Enumerate
3308 |
3309 | \bar under
3310 | False:
3311 | \bar default
3312 | Penalized MLE and bayesian estimators for parameters are better used in
3313 | the setting of low-dimensional data with many training examples
3314 | \end_layout
3315 |
3316 | \begin_layout Enumerate
3317 |
3318 | \bar under
3319 | True:
3320 | \bar default
3321 | It is not good machine learning practice to use the test set to help adjust
3322 | the hyperparameters
3323 | \end_layout
3324 |
3325 | \begin_layout Enumerate
3326 |
3327 | \bar under
3328 | False:
3329 | \bar default
3330 | a symmetric positive semidefinite matrix always has nonnegative elements.
3331 |
3332 | \end_layout
3333 |
3334 | \begin_layout Enumerate
3335 |
3336 | \bar under
3337 | True:
3338 | \bar default
3339 | for a valid kernel function k, the corresponding feature mapping can map
3340 | a finite dimensional vector to an infinite dimensional vector
3341 | \end_layout
3342 |
3343 | \begin_layout Enumerate
3344 |
3345 | \bar under
3346 | False:
3347 | \bar default
3348 | the more features we use, the better our learning algorithm will generalize
3349 | to new data points.
3350 | \end_layout
3351 |
3352 | \begin_layout Enumerate
3353 |
3354 | \bar under
3355 | True:
3356 | \bar default
3357 | a discriminative classifier explicitly models
3358 | \begin_inset Formula $\mathrm{P}\left(Y\mid X\right)$
3359 | \end_inset
3360 |
3361 | .
3362 | \end_layout
3363 |
3364 | \begin_layout Standard
3365 | \begin_inset Phantom Phantom
3366 | status open
3367 |
3368 | \begin_layout Plain Layout
3369 |
3370 | \end_layout
3371 |
3372 | \end_inset
3373 |
3374 |
3375 | \end_layout
3376 |
3377 | \begin_layout Enumerate
3378 | You can use kernels with
3379 | \bar under
3380 | SVM
3381 | \bar default
3382 | and
3383 | \bar under
3384 | perceptron
3385 | \bar default
3386 | .
3387 | \end_layout
3388 |
3389 | \begin_layout Enumerate
3390 | Cross validation is used to select hyperparameters.
3391 | It prevents overfitting, but is not guaranteed to prevent it.
3392 | \end_layout
3393 |
3394 | \begin_layout Enumerate
3395 | L2 regularization is equivalent to imposing a Gaussian prior in linear regressio
3396 | n.
3397 | \end_layout
3398 |
3399 | \begin_layout Enumerate
3400 | If we have 2 two-dimensional Gaussians, the same covariance matrix for both
3401 | will result in a linear decision boundary.
3402 | \end_layout
3403 |
3404 | \begin_layout Enumerate
3405 | The normal equations can be derived from minimizing empirical risk, assuming
3406 | normally distributed noise, and assuming
3407 | \begin_inset Formula $\mathrm{P}(Y\mid X)$
3408 | \end_inset
3409 |
3410 | is distributed as
3411 | \begin_inset Formula $\mathcal{N}(\beta^{\intercal}x,\sigma^{2})$
3412 | \end_inset
3413 |
3414 | .
3415 | \end_layout
3416 |
3417 | \begin_layout Enumerate
3418 | Logistic regression can be motivated from
3419 | \bar under
3420 | log odds equated to an affine function of x
3421 | \bar default
3422 | and
3423 | \bar under
3424 | generative models with gaussian class conditionals
3425 | \bar default
3426 | .
3427 | \end_layout
3428 |
3429 | \begin_layout Enumerate
3430 | The perceptron algorithm will converge
3431 | \bar under
3432 | only if the data is linearly separable
3433 | \bar default
3434 | .
3435 | \end_layout
3436 |
3437 | \begin_layout Enumerate
3438 |
3439 | \bar under
3440 | True:
3441 | \bar default
3442 | Newton's method is typically more expensive to calculate than gradient
3443 | descent per iteration.
3444 | \bar under
3445 |
3446 | \begin_inset Newline newline
3447 | \end_inset
3448 |
3449 | True:
3450 | \bar default
3451 | for quadratic equations, Newton's method typically requires fewer iterations
3452 | than gradient descent.
3453 | \bar under
3454 |
3455 | \begin_inset Newline newline
3456 | \end_inset
3457 |
3458 | False:
3459 | \bar default
3460 | Gradient descent can be viewed as iteratively reweighted least squares.
3461 | \end_layout
3462 |
3463 | \begin_layout Enumerate
3464 |
3465 | \bar under
3466 | True:
3467 | \bar default
3468 | Complementary slackness implies that every training point that is misclassified
3469 | by a soft margin SVM is a support vector.
3470 | \bar under
3471 |
3472 | \begin_inset Newline newline
3473 | \end_inset
3474 |
3475 | True:
3476 | \bar default
3477 | When we solve the SVM with the dual problem, we need only the dot product
3478 | of
3479 | \begin_inset Formula $x_{i}$
3480 | \end_inset
3481 |
3482 | and
3483 | \begin_inset Formula $x_{j}$
3484 | \end_inset
3485 |
3486 | for all
3487 | \begin_inset Formula $i$
3488 | \end_inset
3489 |
3490 | ,
3491 | \begin_inset Formula $j$
3492 | \end_inset
3493 |
3494 | .
3495 | \bar under
3496 |
3497 | \begin_inset Newline newline
3498 | \end_inset
3499 |
3500 | True:
3501 | \bar default
3502 | we use Lagrange multipliers in an optimization problem with inequality
3503 | constraints.
3504 | \end_layout
3505 |
3506 | \begin_layout Enumerate
3507 | \begin_inset Formula $\left\Vert \Phi(x)-\Phi(y)\right\Vert _{2}^{2}$
3508 | \end_inset
3509 |
3510 | can be computed exclusively with inner products.
3511 | \begin_inset Newline newline
3512 | \end_inset
3513 |
3514 | But not
3515 | \begin_inset Formula $\left\Vert \Phi(x)-\Phi(y)\right\Vert _{1}$
3516 | \end_inset
3517 |
3518 | norm or
3519 | \begin_inset Formula $\Phi(x)-\Phi(y)$
3520 | \end_inset
3521 |
3522 | .
3523 | \end_layout
3524 |
3525 | \begin_layout Enumerate
3526 | Strong duality holds for
3527 | \bar under
3528 | hard and soft margin SVM
3529 | \bar default
3530 | , but
3531 | \bar under
3532 | not constrained optimization problems
3533 | \bar default
3534 | in general.
3535 | \end_layout
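     |
     | \begin_layout Standard
     | For reference (our addition): the normal equations give
     | \begin_inset Formula $\hat{\beta}=(X^{\intercal}X)^{-1}X^{\intercal}y$
     | \end_inset
     |
     | , and the Gaussian-prior (L2-regularized) version is
     | \begin_inset Formula $\hat{\beta}=(X^{\intercal}X+\lambda I)^{-1}X^{\intercal}y$
     | \end_inset
     |
     | .
     | \end_layout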
3536 |
3537 | \begin_layout Standard
3538 | \begin_inset VSpace vfill
3539 | \end_inset
3540 |
3541 |
3542 | \end_layout
3543 |
3544 | \begin_layout Standard
3545 | \begin_inset ERT
3546 | status open
3547 |
3548 | \begin_layout Plain Layout
3549 |
3550 |
3551 | \backslash
3552 | columnbreak
3553 | \end_layout
3554 |
3555 | \end_inset
3556 |
3557 |
3558 | \end_layout
3559 |
3560 | \begin_layout Section
3561 | Discussion Problems
3562 | \end_layout
3563 |
3564 | \begin_layout Subsection
3565 | Discussion 9 -- Entropy
3566 | \end_layout
3567 |
3568 | \begin_layout Standard
3569 | \begin_inset Graphics
3570 | filename graphics/disc09-entropy-1.pdf
3571 | width 97col%
3572 |
3573 | \end_inset
3574 |
3575 |
3576 | \end_layout
3577 |
3578 | \begin_layout Subsection
3579 | Discussion 11 -- Skip-Layer NN
3580 | \end_layout
3581 |
3582 | \begin_layout Standard
3583 | \begin_inset Graphics
3584 | filename graphics/disc10-skipnn-1.pdf
3585 | width 97col%
3586 |
3587 | \end_inset
3588 |
3589 |
3590 | \end_layout
3591 |
3592 | \begin_layout Standard
3593 | \begin_inset Graphics
3594 | filename graphics/disc10-skipnn-2.pdf
3595 | width 97col%
3596 |
3597 | \end_inset
3598 |
3599 |
3600 | \end_layout
3601 |
3602 | \begin_layout Subsection
3603 | Discussion 12 -- PCA
3604 | \end_layout
3605 |
3606 | \begin_layout Standard
3607 | \begin_inset Graphics
3608 | filename graphics/disc12-pca-1.pdf
3609 | width 97col%
3610 |
3611 | \end_inset
3612 |
3613 |
3614 | \end_layout
3615 |
3616 | \begin_layout Standard
3617 | \begin_inset Graphics
3618 | filename graphics/disc12-pca-2.pdf
3619 | width 97col%
3620 |
3621 | \end_inset
3622 |
3623 |
3624 | \end_layout
3625 |
3626 | \begin_layout Standard
3627 | \begin_inset Graphics
3628 | filename graphics/disc12-pca-3.pdf
3629 | width 97col%
3630 |
3631 | \end_inset
3632 |
3633 |
3634 | \end_layout
3635 |
3636 | \begin_layout Standard
3637 | \begin_inset Graphics
3638 | filename graphics/disc12-pca-4.pdf
3639 | width 97col%
3640 |
3641 | \end_inset
3642 |
3643 |
3644 | \end_layout
3645 |
3646 | \begin_layout Standard
3647 | \begin_inset ERT
3648 | status collapsed
3649 |
3650 | \begin_layout Plain Layout
3651 |
3652 |
3653 | \backslash
3654 | egroup
3655 | \end_layout
3656 |
3657 | \end_inset
3658 |
3659 |
3660 | \end_layout
3661 |
3662 | \begin_layout Standard
3663 | \begin_inset ERT
3664 | status open
3665 |
3666 | \begin_layout Plain Layout
3667 |
3668 |
3669 | \backslash
3670 | columnbreak
3671 | \end_layout
3672 |
3673 | \end_inset
3674 |
3675 |
3676 | \end_layout
3677 |
3678 | \begin_layout Section
3679 | Minicards
3680 | \end_layout
3681 |
3682 | \begin_layout Standard
3683 |
3684 | \series bold
3685 | \begin_inset Box Boxed
3686 | position "t"
3687 | hor_pos "c"
3688 | has_inner_box 1
3689 | inner_pos "t"
3690 | use_parbox 1
3691 | use_makebox 0
3692 | width "97col%"
3693 | special "none"
3694 | height "1in"
3695 | height_special "totalheight"
3696 | status open
3697 |
3698 | \begin_layout Plain Layout
3699 |
3700 | \series bold
3701 | Gaussian distribution
3702 | \series default
3703 | [7, 8]
3704 | \end_layout
3705 |
3706 | \begin_layout Plain Layout
3707 | \begin_inset Formula $1$
3708 | \end_inset
3709 |
3710 | -var (normal):
3711 | \begin_inset Formula $p(x)=\ensuremath{\frac{1}{\sigma\sqrt{2\pi}}\exp\left(-\frac{\left(x-\mu\right)^{2}}{2\sigma^{2}}\right)}$
3712 | \end_inset
3713 |
3714 |
3715 | \end_layout
3716 |
3717 | \begin_layout Plain Layout
3718 | Multivar:
3719 | \begin_inset Formula $p(x)=\frac{1}{\sqrt{\left|\Sigma\right|}\sqrt{2\pi}^{d}}\exp\left(-\frac{1}{2}\left(x-\mu\right)^{\intercal}\Sigma^{-1}\left(x-\mu\right)\right)$
3720 | \end_inset
3721 |
3722 |
3723 | \end_layout
3724 |
3725 | \end_inset
3726 |
3727 |
3728 | \end_layout
3729 |
3730 | \begin_layout Standard
3731 | \begin_inset CommandInset line
3732 | LatexCommand rule
3733 | offset "0.5ex"
3734 | width "100col%"
3735 | height "1pt"
3736 |
3737 | \end_inset
3738 |
3739 |
3740 | \end_layout
3741 |
3742 | \begin_layout Standard
3743 | The
3744 | \series bold
3745 | covariance
3746 | \series default
3747 |
3748 | \begin_inset Formula $\Sigma$
3749 | \end_inset
3750 |
3751 | of variables
3752 | \begin_inset Formula $X$
3753 | \end_inset
3754 |
3755 | is a matrix such that each entry
3756 | \begin_inset Formula $\Sigma_{ij}=\mathrm{Cov}(X_{i},X_{j})$
3757 | \end_inset
3758 |
3759 | .
3760 | This means that the diagonal entries
3761 | \begin_inset Formula $\Sigma_{ii}=\mathrm{Var}(X_{i})$
3762 | \end_inset
3763 |
3764 | .
3765 | If the matrix is diagonal, then the off-diagonal entries are zero, which
3766 | means the variables
3767 | \begin_inset Formula $X_{i}$
3768 | \end_inset
3769 |
3770 | are uncorrelated (and, for jointly Gaussian variables, independent).
3771 | \end_layout
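     |
     | \begin_layout Standard
     | For intuition (our addition), in two dimensions
     | \begin_inset Formula $\Sigma=\begin{bmatrix}\mathrm{Var}(X_{1}) & \mathrm{Cov}(X_{1},X_{2})\\ \mathrm{Cov}(X_{1},X_{2}) & \mathrm{Var}(X_{2})\end{bmatrix}$
     | \end_inset
     |
     | .
     | \end_layout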
3772 |
3773 | \begin_layout Standard
3774 | It's nice to have independent variables, so we try to diagonalize non-diagonal
3775 | covariances.
3776 | \end_layout
3777 |
3778 | \begin_layout Standard
3779 |
3780 | \series bold
3781 | \begin_inset Box Boxed
3782 | position "t"
3783 | hor_pos "c"
3784 | has_inner_box 1
3785 | inner_pos "t"
3786 | use_parbox 1
3787 | use_makebox 0
3788 | width "97col%"
3789 | special "none"
3790 | height "1in"
3791 | height_special "totalheight"
3792 | status open
3793 |
3794 | \begin_layout Plain Layout
3795 |
3796 | \series bold
3797 | Spectral Theorem
3798 | \series default
3799 | [7:23]
3800 | \end_layout
3801 |
3802 | \begin_layout Enumerate
3803 | Take definition of eigenvalue/vector:
3804 | \begin_inset Formula $Ax=\lambda x$
3805 | \end_inset
3806 |
3807 |
3808 | \end_layout
3809 |
3810 | \begin_layout Enumerate
3811 | Pack multiple eigenvalues into
3812 | \begin_inset Formula $\Lambda=\mathrm{diag}\left(\lambda_{1},\lambda_{2},\ldots,\lambda_{n}\right)$
3813 | \end_inset
3814 |
3815 |
3816 | \begin_inset Newline newline
3817 | \end_inset
3818 |
3819 |
3820 | \begin_inset Formula $n$
3821 | \end_inset
3822 |
3823 | real eigenvalues and orthonormal eigenvectors are guaranteed when
3824 | \begin_inset Formula $A$
3825 | \end_inset
3826 |
3827 | is symmetric.
3828 | \end_layout
3829 |
3830 | \begin_layout Enumerate
3831 | Pack multiple eigenvectors into
3832 | \begin_inset Formula $U=\left[x_{1}\ x_{2}\ \ldots\ x_{n}\right]$
3833 | \end_inset
3834 |
3835 |
3836 | \end_layout
3837 |
3838 | \begin_layout Enumerate
3839 | Rewrite equation using these:
3840 | \begin_inset Formula $\boxed{AU=U\Lambda\longrightarrow A=U\Lambda U^{\intercal}}$
3841 | \end_inset
3842 |
3843 | .
3844 | \begin_inset Newline newline
3845 | \end_inset
3846 |
3847 | We can use this to diagonalize a symmetric
3848 | \begin_inset Formula $A$
3849 | \end_inset
3850 |
3851 | .
3852 | \end_layout
3853 |
3854 | \end_inset
3855 |
3856 |
3857 | \end_layout
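     |
     | \begin_layout Standard
     | A tiny worked instance (our addition):
     | \begin_inset Formula $\begin{bmatrix}2 & 1\\ 1 & 2\end{bmatrix}=U\Lambda U^{\intercal}$
     | \end_inset
     |
     | with
     | \begin_inset Formula $\Lambda=\mathrm{diag}(3,1)$
     | \end_inset
     |
     | and
     | \begin_inset Formula $U=\frac{1}{\sqrt{2}}\begin{bmatrix}1 & 1\\ 1 & -1\end{bmatrix}$
     | \end_inset
     |
     | .
     | \end_layout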
3858 |
3859 | \begin_layout Standard
3860 | \begin_inset CommandInset line
3861 | LatexCommand rule
3862 | offset "0.5ex"
3863 | width "100col%"
3864 | height "1pt"
3865 |
3866 | \end_inset
3867 |
3868 |
3869 | \end_layout
3870 |
3871 | \begin_layout Standard
3872 |
3873 | \series bold
3874 | SVM-like classifiers
3875 | \series default
3876 | work with a
3877 | \bar under
3878 | boundary
3879 | \bar default
3880 | , a hyperplane (a line for 2D data) that separates two classes.
3881 |
3882 | \bar under
3883 | Support vectors
3884 | \bar default
3885 | are the point(s) closest to the boundary.
3886 |
3887 | \begin_inset Formula $\gamma$
3888 | \end_inset
3889 |
3890 | is the
3891 | \bar under
3892 | margin
3893 | \bar default
3894 | , the distance between the boundary and the support vector(s).
3895 | The
3896 | \bar under
3897 | parameter
3898 | \begin_inset Formula $\theta$
3899 | \end_inset
3900 |
3901 |
3902 | \bar default
3903 | is a vector.
3904 |
3905 | \begin_inset Formula $\boxed{\theta\cdot x}$
3906 | \end_inset
3907 |
3908 | gives predictions.
3909 | About
3910 | \begin_inset Formula $\theta$
3911 | \end_inset
3912 |
3913 | :
3914 | \end_layout
3915 |
3916 | \begin_layout Itemize
3917 | The direction of
3918 | \begin_inset Formula $\theta$
3919 | \end_inset
3920 |
3921 | defines the boundary.
3922 | We can choose this.
3923 | \end_layout
3924 |
3925 | \begin_layout Itemize
3926 | \begin_inset Formula $\left\Vert \theta\right\Vert $
3927 | \end_inset
3928 |
3929 | must be
3930 | \begin_inset Formula $1/\gamma$
3931 | \end_inset
3932 |
3933 | , as restricted by
3934 | \begin_inset Formula $\forall i:y^{i}\theta\cdot x^{i}\geq1$
3935 | \end_inset
3936 |
3937 |
3938 | \begin_inset Newline newline
3939 | \end_inset
3940 |
3941 | We cannot explicitly choose this; it depends on the boundary.
3942 | \begin_inset Newline newline
3943 | \end_inset
3944 |
3945 | This restriction is turned into a cost in soft-margin SVM.
3946 | \end_layout
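     |
     | \begin_layout Standard
     | Why the norm restriction above holds (our addition): the distance from a point
     | \begin_inset Formula $x^{i}$
     | \end_inset
     |
     | to the boundary is
     | \begin_inset Formula $\frac{|\theta\cdot x^{i}|}{\left\Vert \theta\right\Vert }$
     | \end_inset
     |
     | , and support vectors satisfy
     | \begin_inset Formula $y^{i}\theta\cdot x^{i}=1$
     | \end_inset
     |
     | , so the margin is
     | \begin_inset Formula $\gamma=\frac{1}{\left\Vert \theta\right\Vert }$
     | \end_inset
     |
     | .
     | \end_layout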
3947 |
3948 | \begin_layout Standard
3949 |
3950 | \series bold
3951 | \begin_inset Box Boxed
3952 | position "t"
3953 | hor_pos "c"
3954 | has_inner_box 1
3955 | inner_pos "t"
3956 | use_parbox 1
3957 | use_makebox 0
3958 | width "97col%"
3959 | special "none"
3960 | height "1in"
3961 | height_special "totalheight"
3962 | status open
3963 |
3964 | \begin_layout Plain Layout
3965 |
3966 | \series bold
3967 | Perceptron
3968 | \series default
3969 | [2:11, 3:6] picks a misclassified point and updates
3970 | \begin_inset Formula $\theta$
3971 | \end_inset
3972 |
3973 | toward classifying it correctly:
3974 | \begin_inset Newline newline
3975 | \end_inset
3976 |
3977 |
3978 | \begin_inset Formula $\boxed{\theta\leftarrow\theta+y^{i}x^{i}}$
3979 | \end_inset
3980 |
3981 | or
3982 | \begin_inset Formula $\boxed{\theta\leftarrow\theta-\nabla J\left(\theta\right)}$
3983 | \end_inset
3984 |
3985 |
3986 | \end_layout
3987 |
3988 | \begin_layout Plain Layout
3989 |
3990 | \bar under
3991 | Overfits
3992 | \bar default
3993 | when outliers skew the boundary.
3994 |
3995 | \bar under
3996 | Converges
3997 | \bar default
3998 | iff separable.
3999 | \end_layout
4000 |
4001 | \begin_layout Plain Layout
4002 |
4003 | \bar under
4004 | Batch eqn
4005 | \bar default
4006 |
4007 | \begin_inset Formula $\theta\cdot x=\sum_{i}\alpha^{i}y^{i}x^{i}\cdot x$
4008 | \end_inset
4009 |
4010 | :
4011 | \begin_inset Newline newline
4012 | \end_inset
4013 |
4014 |
4015 | \begin_inset Formula $\alpha_{i}=\text{\# times point \emph{i} was misclassified}$
4016 | \end_inset
4017 |
4018 |
4019 | \end_layout
4020 |
4021 | \end_inset
4022 |
4023 |
4024 | \end_layout
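     |
     | \begin_layout Standard
     | A tiny numeric example of the update (our addition): with
     | \begin_inset Formula $\theta=(0,0)$
     | \end_inset
     |
     | and a misclassified point
     | \begin_inset Formula $x^{i}=(1,2),\,y^{i}=+1$
     | \end_inset
     |
     | , the update gives
     | \begin_inset Formula $\theta=(1,2)$
     | \end_inset
     |
     | , and now
     | \begin_inset Formula $\theta\cdot x^{i}=5>0$
     | \end_inset
     |
     | .
     | \end_layout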
4025 |
4026 | \begin_layout Standard
4027 |
4028 | \series bold
4029 | \begin_inset Box Boxed
4030 | position "t"
4031 | hor_pos "c"
4032 | has_inner_box 1
4033 | inner_pos "t"
4034 | use_parbox 1
4035 | use_makebox 0
4036 | width "97col%"
4037 | special "none"
4038 | height "1in"
4039 | height_special "totalheight"
4040 | status open
4041 |
4042 | \begin_layout Plain Layout
4043 |
4044 | \series bold
4045 | Hard-margin SVM
4046 | \series default
4047 | [3:36] maximizes the margin around the boundary.
4048 | Technically, it minimizes the weight norm, which maximizes the distance between the boundary and the vectors
4049 | closest to it (the support vectors):
4050 | \begin_inset Newline newline
4051 | \end_inset
4052 |
4053 |
4054 | \begin_inset Formula $\boxed{\min_{\theta}\left\Vert \theta\right\Vert ^{2}\quad\text{such that}\ \forall i:y^{i}\theta\cdot x^{i}\geq1}$
4055 | \end_inset
4056 |
4057 |
4058 | \end_layout
4059 |
4060 | \begin_layout Plain Layout
4061 | Sometimes removing a few outliers lets us find a much larger margin, or a
4062 | margin at all.
4063 | Hard-margin
4064 | \bar under
4065 | overfits
4066 | \bar default
4067 | by not seeing this.
4068 | \end_layout
4069 |
4070 | \begin_layout Plain Layout
4071 |
4072 | \bar under
4073 | Converges
4074 | \bar default
4075 | iff separable.
4076 | \end_layout
4077 |
4078 | \begin_layout Plain Layout
4079 |
4080 | \bar under
4081 | Batch eqn
4082 | \bar default
4083 |
4084 | \begin_inset Formula $\theta=\sum_{i}\alpha^{i}y^{i}x^{i}$
4085 | \end_inset
4086 |
4087 | , where
4088 | \begin_inset Formula $\alpha^{i}=\mathbf{1}_{i\ \text{is support vector}}$
4089 | \end_inset
4090 |
4091 |
4092 | \end_layout
4093 |
4094 | \end_inset
4095 |
4096 |
4097 | \end_layout
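     |
     | \begin_layout Standard
     | For reference (our addition): the dual of the usual hard-margin SVM with a bias term is
     | \begin_inset Formula $\max_{\alpha\geq0}\ \sum_{i}\alpha^{i}-\frac{1}{2}\sum_{i,j}\alpha^{i}\alpha^{j}y^{i}y^{j}x^{i}\cdot x^{j}\quad\text{s.t. }\sum_{i}\alpha^{i}y^{i}=0$
     | \end_inset
     |
     | .
     | The constraint
     | \begin_inset Formula $\sum_{i}\alpha^{i}y^{i}=0$
     | \end_inset
     |
     | is why the multipliers of positive and negative examples sum to the same value.
     | \end_layout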
4098 |
4099 | \begin_layout Standard
4100 |
4101 | \series bold
4102 | \begin_inset Box Boxed
4103 | position "t"
4104 | hor_pos "c"
4105 | has_inner_box 1
4106 | inner_pos "t"
4107 | use_parbox 1
4108 | use_makebox 0
4109 | width "97col%"
4110 | special "none"
4111 | height "1in"
4112 | height_special "totalheight"
4113 | status open
4114 |
4115 | \begin_layout Plain Layout
4116 |
4117 | \series bold
4118 | Soft-margin SVM
4119 | \series default
4120 | [3:37] is like hard-margin SVM but replaces the hard constraints with a hinge penalty on margin violations:
4121 | \begin_inset Newline newline
4122 | \end_inset
4123 |
4124 |
4125 | \begin_inset Formula $\boxed{\min_{\theta}\left\Vert \theta\right\Vert ^{2}+C\sum_{i=1}^{n}\left(1-y^{i}\theta\cdot x^{i}\right)_{+}}$
4126 | \end_inset
4127 |
4128 |
4129 | \end_layout
4130 |
4131 | \begin_layout Plain Layout
4132 |
4133 | \bar under
4134 | Hyperparameter
4135 | \bar default
4136 |
4137 | \begin_inset Formula $C$
4138 | \end_inset
4139 |
4140 | is the hardness of the margin.
4141 | Lower
4142 | \begin_inset Formula $C$
4143 | \end_inset
4144 |
4145 | means more misclassifications but larger soft margin.
4146 | \end_layout
4147 |
4148 | \begin_layout Plain Layout
4149 |
4150 | \bar under
4151 | Overfits
4152 | \bar default
4153 | on less data, more features, higher
4154 | \begin_inset Formula $C$
4155 | \end_inset
4156 |
4157 |
4158 | \end_layout
4159 |
4160 | \end_inset
4161 |
4162 |
4163 | \end_layout
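     |
     | \begin_layout Standard
     | Equivalent slack form (our addition):
     | \begin_inset Formula $\min_{\theta,\xi}\left\Vert \theta\right\Vert ^{2}+C\sum_{i}\xi_{i}\ \text{s.t. }y^{i}\theta\cdot x^{i}\geq1-\xi_{i},\ \xi_{i}\geq0$
     | \end_inset
     |
     | ; at the optimum
     | \begin_inset Formula $\xi_{i}=\left(1-y^{i}\theta\cdot x^{i}\right)_{+}$
     | \end_inset
     |
     | , recovering the hinge form above.
     | \end_layout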
4164 |
4165 | \begin_layout Standard
4166 | \begin_inset CommandInset line
4167 | LatexCommand rule
4168 | offset "0.5ex"
4169 | width "100col%"
4170 | height "1pt"
4171 |
4172 | \end_inset
4173 |
4174 |
4175 | \end_layout
4176 |
4177 | \begin_layout Standard
4178 |
4179 | \series bold
4180 | More classifiers
4181 | \end_layout
4182 |
4183 | \begin_layout Standard
4184 |
4185 | \series bold
4186 | \begin_inset Box Boxed
4187 | position "t"
4188 | hor_pos "c"
4189 | has_inner_box 1
4190 | inner_pos "t"
4191 | use_parbox 1
4192 | use_makebox 0
4193 | width "97col%"
4194 | special "none"
4195 | height "1in"
4196 | height_special "totalheight"
4197 | status open
4198 |
4199 | \begin_layout Plain Layout
4200 |
4201 | \series bold
4202 | KNN
4203 | \series default
4204 | [14:4] Given an item
4205 | \begin_inset Formula $x$
4206 | \end_inset
4207 |
4208 | , find the
4209 | \begin_inset Formula $k$
4210 | \end_inset
4211 |
4212 | training items
4213 | \begin_inset Quotes eld
4214 | \end_inset
4215 |
4216 | closest
4217 | \begin_inset Quotes erd
4218 | \end_inset
4219 |
4220 | to
4221 | \begin_inset Formula $x$
4222 | \end_inset
4223 |
4224 | and return the result of a vote.
4225 | \end_layout
4226 |
4227 | \begin_layout Plain Layout
4228 |
4229 | \bar under
4230 | Hyperparameter
4231 | \bar default
4232 |
4233 | \begin_inset Formula $k$
4234 | \end_inset
4235 |
4236 | , the number of neighbors.
4237 | \begin_inset Newline newline
4238 | \end_inset
4239 |
4240 |
4241 | \begin_inset Quotes eld
4242 | \end_inset
4243 |
4244 | Closest
4245 | \begin_inset Quotes erd
4246 | \end_inset
4247 |
4248 | can be defined by some norm (
4249 | \begin_inset Formula $l_{2}$
4250 | \end_inset
4251 |
4252 | by default).
4253 | \end_layout
4254 |
4255 | \begin_layout Plain Layout
4256 |
4257 | \bar under
4258 | Overfits
4259 | \bar default
4260 | when
4261 | \begin_inset Formula $k$
4262 | \end_inset
4263 |
4264 | is really small
4265 | \end_layout
4266 |
4267 | \end_inset
4268 |
4269 |
4270 | \end_layout
4271 |
4272 | \begin_layout Standard
4273 |
4274 | \series bold
4275 | \begin_inset Box Boxed
4276 | position "t"
4277 | hor_pos "c"
4278 | has_inner_box 1
4279 | inner_pos "t"
4280 | use_parbox 1
4281 | use_makebox 0
4282 | width "97col%"
4283 | special "none"
4284 | height "1in"
4285 | height_special "totalheight"
4286 | status open
4287 |
4288 | \begin_layout Plain Layout
4289 |
4290 | \series bold
4291 | Decision trees
4292 | \series default
4293 | : Recursively split on features that yield the best split.
4294 | Each tree has many nodes, which either split on a feature at a threshold,
4295 | or (at a leaf) label all data that reaches them the same way.
4296 | \begin_inset Newline newline
4297 | \end_inset
4298 |
4299 |
4300 | \bar under
4301 | Hyperparameters
4302 | \bar default
4303 | typically restrict complexity (max tree depth, min points at node) or penalize
4304 | it.
4305 | One of particular interest is
4306 | \begin_inset Formula $d$
4307 | \end_inset
4308 |
4309 | , the max number of nodes.
4310 | \end_layout
4311 |
4312 | \begin_layout Plain Layout
4313 |
4314 | \bar under
4315 | Overfits
4316 | \bar default
4317 | when the tree is deep or when we are allowed to split on a very small number
4318 | of items.
4319 | \end_layout
4320 |
4321 | \begin_layout Plain Layout
4322 |
4323 | \series bold
4324 | Bagging
4325 | \series default
4326 | : Make multiple trees, each with a random subset of training items.
4327 | To predict, take vote from trees.
4328 | \end_layout
4329 |
4330 | \begin_layout Plain Layout
4331 |
4332 | \bar under
4333 | Hyperparameters
4334 | \bar default
4335 | # trees, proportion of items to subset.
4336 | \end_layout
4337 |
4338 | \begin_layout Plain Layout
4339 |
4340 | \series bold
4341 | Random forests
4342 | \series default
4343 | is bagging, except that, at each node, we consider only a random subset of features
4344 | to split on.
4345 | \end_layout
4346 |
4347 | \begin_layout Plain Layout
4348 |
4349 | \bar under
4350 | Hyperparameters
4351 | \bar default
4352 | proportion of features to consider.
4353 | \end_layout
4354 |
4355 | \end_inset
4356 |
4357 |
4358 | \end_layout
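     |
     | \begin_layout Standard
     | One common best-split criterion (our addition): information gain
     | \begin_inset Formula $H(S)-\sum_{j}\frac{|S_{j}|}{|S|}H(S_{j})$
     | \end_inset
     |
     | , where entropy
     | \begin_inset Formula $H(S)=-\sum_{c}p_{c}\log_{2}p_{c}$
     | \end_inset
     |
     | .
     | \end_layout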
4359 |
4360 | \begin_layout Standard
4361 |
4362 | \series bold
4363 | \begin_inset Box Boxed
4364 | position "t"
4365 | hor_pos "c"
4366 | has_inner_box 1
4367 | inner_pos "t"
4368 | use_parbox 1
4369 | use_makebox 0
4370 | width "97col%"
4371 | special "none"
4372 | height "1in"
4373 | height_special "totalheight"
4374 | status open
4375 |
4376 | \begin_layout Plain Layout
4377 |
4378 | \series bold
4379 | AdaBoost
4380 | \series default
4381 | [dtrees3:34] Use any algorithm (e.g., decision trees) to train a weak learner,
4382 | take all the errors, and train a new learner with the errors emphasized*.
4383 | To predict, predict with the first algorithm, then add on the prediction
4384 | of the second algorithm, and so on.
4385 | \end_layout
4386 |
4387 | \begin_layout Plain Layout
4388 | \noindent
4389 | * For regression, train the new learner on the errors.
4390 | For classification, give misclassified items more weight.
4391 | \end_layout
4392 |
4393 | \begin_layout Plain Layout
4394 |
4395 | \bar under
4396 | Hyperparameters
4397 | \bar default
4398 |
4399 | \begin_inset Formula $B$
4400 | \end_inset
4401 |
4402 | , the number of weak learners;
4403 | \begin_inset Formula $\lambda$
4404 | \end_inset
4405 |
4406 | , the learning rate.
4407 | \end_layout
4408 |
4409 | \end_inset
4410 |
4411 |
4412 | \end_layout
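     |
     | \begin_layout Standard
     | Standard AdaBoost updates for reference (our addition; the course's exact variant may differ): weighted error
     | \begin_inset Formula $\epsilon_{t}=\sum_{i}w_{i}\mathbf{1}\left[h_{t}(x_{i})\neq y_{i}\right]/\sum_{i}w_{i}$
     | \end_inset
     |
     | , learner weight
     | \begin_inset Formula $\alpha_{t}=\frac{1}{2}\ln\frac{1-\epsilon_{t}}{\epsilon_{t}}$
     | \end_inset
     |
     | , reweighting
     | \begin_inset Formula $w_{i}\leftarrow w_{i}e^{-\alpha_{t}y_{i}h_{t}(x_{i})}$
     | \end_inset
     |
     | , prediction
     | \begin_inset Formula $\mathrm{sign}\left(\sum_{t}\alpha_{t}h_{t}(x)\right)$
     | \end_inset
     |
     | .
     | \end_layout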
4413 |
4414 | \begin_layout Standard
4415 | \begin_inset VSpace vfill
4416 | \end_inset
4417 |
4418 |
4419 | \end_layout
4420 |
4421 | \begin_layout Standard
4422 | \begin_inset ERT
4423 | status open
4424 |
4425 | \begin_layout Plain Layout
4426 |
4427 |
4428 | \backslash
4429 | end{multicols}
4430 | \end_layout
4431 |
4432 | \end_inset
4433 |
4434 |
4435 | \end_layout
4436 |
4437 | \end_body
4438 | \end_document
4439 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | CS 189 Machine Learning Cheat Sheet
2 | ===================================
3 |
4 | Cheat sheets:
5 |
6 | - [189-cheat-sheet-minicards.pdf](<189-cheat-sheet-minicards.pdf>)
7 | - [189-cheat-sheet-nominicards.pdf](<189-cheat-sheet-nominicards.pdf>)
8 |
9 | These cheat sheets include:
10 |
11 | - [The original notes]() by Rishi
12 | Sharma and Peter Gao (from which this repo is forked), with some modifications:
13 | - Rearranged sections to form better grouping, add section titles
14 | - Reworded/condensed some sections in light of better grouping
15 | - Added some new content
16 | - **All** past T/F and multiple choice questions from the following semesters:
17 | - Spring 2013 midterm & final
18 | - Spring 2014 final
19 | - Spring 2015 midterm
20 | - Important algorithmic problems from discussions
21 | - Additional notes ("minicards")
22 | - The `no-minicards` version omits these, so you can have space to add your own notes.
23 |
--------------------------------------------------------------------------------
/graphics/NN.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/graphics/NN.pdf
--------------------------------------------------------------------------------
/graphics/NN1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/graphics/NN1.pdf
--------------------------------------------------------------------------------
/graphics/NN2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/graphics/NN2.pdf
--------------------------------------------------------------------------------
/graphics/disc09-entropy-1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/graphics/disc09-entropy-1.pdf
--------------------------------------------------------------------------------
/graphics/disc10-skipnn-1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/graphics/disc10-skipnn-1.pdf
--------------------------------------------------------------------------------
/graphics/disc10-skipnn-2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/graphics/disc10-skipnn-2.pdf
--------------------------------------------------------------------------------
/graphics/disc12-pca-1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/graphics/disc12-pca-1.pdf
--------------------------------------------------------------------------------
/graphics/disc12-pca-2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/graphics/disc12-pca-2.pdf
--------------------------------------------------------------------------------
/graphics/disc12-pca-3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/graphics/disc12-pca-3.pdf
--------------------------------------------------------------------------------
/graphics/disc12-pca-4.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/graphics/disc12-pca-4.pdf
--------------------------------------------------------------------------------