├── .gitignore ├── 189-cheat-sheet-minicards.pdf ├── 189-cheat-sheet-nominicards.pdf ├── 189-cheat-sheet.lyx ├── README.md └── graphics ├── NN.pdf ├── NN1.pdf ├── NN2.pdf ├── disc09-entropy-1.pdf ├── disc10-skipnn-1.pdf ├── disc10-skipnn-2.pdf ├── disc12-pca-1.pdf ├── disc12-pca-2.pdf ├── disc12-pca-3.pdf └── disc12-pca-4.pdf /.gitignore: -------------------------------------------------------------------------------- 1 | *.aux 2 | *.fdb_latexmk 3 | *.gz 4 | *.log 5 | *.out 6 | .DS_Store 7 | .pdf 8 | -------------------------------------------------------------------------------- /189-cheat-sheet-minicards.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/189-cheat-sheet-minicards.pdf -------------------------------------------------------------------------------- /189-cheat-sheet-nominicards.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/189-cheat-sheet-nominicards.pdf -------------------------------------------------------------------------------- /189-cheat-sheet.lyx: -------------------------------------------------------------------------------- 1 | #LyX 2.1 created this file. For more info see http://www.lyx.org/ 2 | \lyxformat 474 3 | \begin_document 4 | \begin_header 5 | \textclass extarticle 6 | \begin_preamble 7 | \usepackage{amsmath,amsthm,amsfonts,amssymb} 8 | \usepackage{calc} 9 | \usepackage{color,graphicx,overpic} 10 | \usepackage[shortlabels]{enumitem} 11 | \usepackage{hyperref} 12 | \usepackage{ifthen} 13 | \usepackage{multicol} 14 | \usepackage{titlesec} 15 | \usepackage{wrapfig} 16 | 17 | \titlespacing*{\section}{0pt}{0.5em}{0em} 18 | \titlespacing*{\subsection}{0pt}{0.5em}{0em} 19 | \titlespacing*{\subsubsection}{0pt}{0.5em}{0em} 20 | \titleformat{\section}{\vspace{1em}\titlerule\normalfont\fontsize{7}{7}\bfseries}{\thesection}{1em}{} 21 | \titleformat{\subsection}{\normalfont\fontsize{6}{6}\bfseries}{\thesection}{1em}{} 22 | \titleformat{\subsubsection}{\titlerule\normalfont\fontsize{6}{6}}{\thesection}{1em}{} 23 | \titlespacing*{\labeling}{0pt}{0em}{0em} 24 | 25 | \let\stdboxed\boxed 26 | \renewcommand{\boxed}[1]{ 27 | \setlength{\fboxsep}{0.05em} 28 | \stdboxed{#1} 29 | } 30 | 31 | \setlist{nolistsep,leftmargin=*} 32 | 33 | \setlength{\premulticols}{1pt} 34 | \setlength{\postmulticols}{1pt} 35 | \setlength{\columnsep}{10pt} 36 | 37 | \newtheorem{example}[section]{Example} 38 | 39 | \let\textquotedbl=" 40 | \def\ci{\perp\!\!\!\perp} 41 | 42 | \raggedright 43 | 44 | \newcommand{\mytitle}[2]{ 45 | \begin{center}\small{#1} -- \scriptsize{#2}\end{center} 46 | } 47 | 48 | 49 | \hyphenpenalty=100 50 | \end_preamble 51 | \options 3pt 52 | \use_default_options false 53 | \maintain_unincluded_children false 54 | \language english 55 | \language_package none 56 | \inputencoding auto 57 | \fontencoding default 58 | \font_roman times 59 | \font_sans default 60 | \font_typewriter default 61 | \font_math auto 62 | \font_default_family default 63 | \use_non_tex_fonts false 64 | \font_sc false 65 | \font_osf false 66 | \font_sf_scale 100 67 | \font_tt_scale 100 68 | \graphics default 69 | \default_output_format default 70 | \output_sync 0 71 | \bibtex_command default 72 | \index_command default 73 | \paperfontsize default 74 | \spacing single 75 | \use_hyperref false 76 | \papersize default 77 | \use_geometry true 78 | 
\use_package amsmath 1 79 | \use_package amssymb 0 80 | \use_package cancel 0 81 | \use_package esint 1 82 | \use_package mathdots 0 83 | \use_package mathtools 0 84 | \use_package mhchem 0 85 | \use_package stackrel 0 86 | \use_package stmaryrd 0 87 | \use_package undertilde 0 88 | \cite_engine basic 89 | \cite_engine_type default 90 | \biblio_style plain 91 | \use_bibtopic false 92 | \use_indices false 93 | \paperorientation portrait 94 | \suppress_date false 95 | \justification false 96 | \use_refstyle 0 97 | \index Index 98 | \shortcut idx 99 | \color #008000 100 | \end_index 101 | \leftmargin 0.25in 102 | \topmargin 0.25in 103 | \rightmargin 0.25in 104 | \bottommargin 0.25in 105 | \secnumdepth -2 106 | \tocdepth 3 107 | \paragraph_separation skip 108 | \defskip smallskip 109 | \quotes_language english 110 | \papercolumns 1 111 | \papersides 1 112 | \paperpagestyle empty 113 | \tracking_changes false 114 | \output_changes false 115 | \html_math_output 0 116 | \html_css_as_file 0 117 | \html_be_strict false 118 | \end_header 119 | 120 | \begin_body 121 | 122 | \begin_layout Standard 123 | \begin_inset ERT 124 | status open 125 | 126 | \begin_layout Plain Layout 127 | 128 | 129 | \backslash 130 | fontsize{5}{4} 131 | \backslash 132 | selectfont 133 | \end_layout 134 | 135 | \end_inset 136 | 137 | 138 | \end_layout 139 | 140 | \begin_layout Standard 141 | \begin_inset ERT 142 | status open 143 | 144 | \begin_layout Plain Layout 145 | 146 | 147 | \backslash 148 | mytitle{CS 189 Final Note Sheet}{Rishi Sharma, Peter Gao, et. 149 | al.} 150 | \end_layout 151 | 152 | \begin_layout Plain Layout 153 | 154 | 155 | \backslash 156 | begin{multicols}{4} 157 | \end_layout 158 | 159 | \end_inset 160 | 161 | 162 | \end_layout 163 | 164 | \begin_layout Section 165 | Probability & Matrix Review 166 | \end_layout 167 | 168 | \begin_layout Subsection 169 | Bayesian Decision Theory 170 | \end_layout 171 | 172 | \begin_layout Standard 173 | Bayes Rule: 174 | \begin_inset Formula $P(\omega|x)=\frac{P(x|\omega)P(\omega)}{P(x)},P(x)=\sum_{i}P(x|\omega_{i})P(\omega_{i})$ 175 | \end_inset 176 | 177 | 178 | \end_layout 179 | 180 | \begin_layout Standard 181 | \begin_inset Formula $P(x,w)=P(x|w)P(w)=P(w|x)P(x)$ 182 | \end_inset 183 | 184 | 185 | \end_layout 186 | 187 | \begin_layout Standard 188 | \begin_inset Formula $P(error)=\int_{-\infty}^{\infty}P(error|x)P(x)dx$ 189 | \end_inset 190 | 191 | 192 | \end_layout 193 | 194 | \begin_layout Standard 195 | \begin_inset Formula $P(error|x)=\left\{ \begin{array}{lr} 196 | P(\omega_{1}|x) & \text{ if we decide }\omega_{2}\\ 197 | P(\omega_{2}|x) & \text{ if we decide }\omega_{1} 198 | \end{array}\right.$ 199 | \end_inset 200 | 201 | 202 | \end_layout 203 | 204 | \begin_layout Standard 205 | 0-1 Loss: 206 | \begin_inset Formula $\lambda(\alpha_{i}|\omega_{j})=\left\{ \begin{array}{lr} 207 | 0 & i=j\text{\ (correct)}\\ 208 | 1 & i\not=j\text{\ (mismatch)} 209 | \end{array}\right.$ 210 | \end_inset 211 | 212 | 213 | \end_layout 214 | 215 | \begin_layout Standard 216 | 217 | \family roman 218 | \series medium 219 | \shape up 220 | \size normal 221 | \emph off 222 | \bar no 223 | \strikeout off 224 | \uuline off 225 | \uwave off 226 | \noun off 227 | \color none 228 | Expected Loss (Risk) 229 | \family default 230 | \series default 231 | \shape default 232 | \size default 233 | \bar default 234 | \strikeout default 235 | \uuline default 236 | \uwave default 237 | \noun default 238 | \color inherit 239 | : 240 | \begin_inset Formula 
$R(\alpha_{i}|x)=\sum_{j=1}^{c}\lambda(\alpha_{i}|\omega_{j})P(\omega_{j}|x)$ 241 | \end_inset 242 | 243 | 244 | \end_layout 245 | 246 | \begin_layout Standard 247 | 248 | \family roman 249 | \series medium 250 | \shape up 251 | \size normal 252 | \emph off 253 | \bar no 254 | \strikeout off 255 | \uuline off 256 | \uwave off 257 | \noun off 258 | \color none 259 | 0-1 Risk: 260 | \family default 261 | \series default 262 | \shape default 263 | \size default 264 | \bar default 265 | \strikeout default 266 | \uuline default 267 | \uwave default 268 | \noun default 269 | \color inherit 270 | 271 | \begin_inset Formula $R(\alpha_{i}|x)=\sum_{j\not=i}^{c}P(\omega_{j}|x)=1-P(\omega_{i}|x)$ 272 | \end_inset 273 | 274 | 275 | \end_layout 276 | 277 | \begin_layout Subsection 278 | Generative vs. 279 | Discriminative Model 280 | \end_layout 281 | 282 | \begin_layout Standard 283 | 284 | \series bold 285 | Generative 286 | \series default 287 | : Model class conditional density 288 | \begin_inset Formula $p(x|y)$ 289 | \end_inset 290 | 291 | and find 292 | \begin_inset Formula $p(y|x)\propto p(x|y)p(y)$ 293 | \end_inset 294 | 295 | or model joint density 296 | \begin_inset Formula $p(x,y)$ 297 | \end_inset 298 | 299 | and marginalize to find 300 | \begin_inset Formula $p(y=k|x)=\int_{x}p(x,y=k)dx$ 301 | \end_inset 302 | 303 | (posterior) 304 | \end_layout 305 | 306 | \begin_layout Standard 307 | 308 | \series bold 309 | Discriminative 310 | \series default 311 | : Model conditional 312 | \begin_inset Formula $p(y|x)$ 313 | \end_inset 314 | 315 | . 316 | \end_layout 317 | 318 | \begin_layout Standard 319 | \begin_inset Tabular 320 | 321 | 322 | 323 | 324 | 325 | 326 | \begin_inset Text 327 | 328 | \begin_layout Plain Layout 329 | 330 | \series bold 331 | class conditional 332 | \series default 333 | 334 | \begin_inset Formula $P(X|Y)$ 335 | \end_inset 336 | 337 | 338 | \end_layout 339 | 340 | \end_inset 341 | 342 | 343 | \begin_inset Text 344 | 345 | \begin_layout Plain Layout 346 | 347 | \series bold 348 | posterior 349 | \series default 350 | 351 | \begin_inset Formula $P(Y|X)$ 352 | \end_inset 353 | 354 | 355 | \end_layout 356 | 357 | \end_inset 358 | 359 | 360 | 361 | 362 | \begin_inset Text 363 | 364 | \begin_layout Plain Layout 365 | 366 | \series bold 367 | prior 368 | \series default 369 | 370 | \begin_inset Formula $P(Y)$ 371 | \end_inset 372 | 373 | 374 | \end_layout 375 | 376 | \end_inset 377 | 378 | 379 | \begin_inset Text 380 | 381 | \begin_layout Plain Layout 382 | 383 | \series bold 384 | evidence 385 | \series default 386 | 387 | \begin_inset Formula $P(X)$ 388 | \end_inset 389 | 390 | 391 | \end_layout 392 | 393 | \end_inset 394 | 395 | 396 | 397 | 398 | \end_inset 399 | 400 | 401 | \end_layout 402 | 403 | \begin_layout Subsection 404 | Probabilistic Motivation for Least Squares 405 | \end_layout 406 | 407 | \begin_layout Standard 408 | \begin_inset Formula $y^{(i)}=\theta^{\intercal}x^{(i)}+\epsilon^{(i)}\ \text{with noise}\ \epsilon{(i)}\sim\mathcal{N}(0,\sigma^{2})$ 409 | \end_inset 410 | 411 | 412 | \end_layout 413 | 414 | \begin_layout Standard 415 | Note: The intercept term 416 | \begin_inset Formula $x_{0}=1$ 417 | \end_inset 418 | 419 | is accounted for in 420 | \begin_inset Formula $\theta$ 421 | \end_inset 422 | 423 | 424 | \begin_inset Newline newline 425 | \end_inset 426 | 427 | 428 | \begin_inset Formula $\implies p(y^{(i)}|x^{(i)};\theta)=\frac{1}{\sqrt{2\pi\sigma^{2}}}\exp\left(-\frac{(y^{(i)}-\theta^{\intercal}x^{(i)})^{2}}{2\sigma^{2}}\right)$ 429 | \end_inset 430 | 
431 | 432 | \begin_inset Newline newline 433 | \end_inset 434 | 435 | 436 | \begin_inset Formula $\implies L(\theta)=\prod_{i=1}^{m}\frac{1}{\sqrt{2\pi\sigma^{2}}}\exp\left(-\frac{(y^{(i)}-\theta^{\intercal}x^{(i)})^{2}}{2\sigma^{2}}\right)$ 437 | \end_inset 438 | 439 | 440 | \begin_inset Newline newline 441 | \end_inset 442 | 443 | 444 | \begin_inset Formula $\implies l(\theta)=m\log\frac{1}{\sqrt{2\pi\sigma^{2}}}-\frac{1}{2\sigma^{2}}\sum_{i=1}^{m}(y^{(i)}-\theta^{\intercal}x^{(i)})^{2}$ 445 | \end_inset 446 | 447 | 448 | \begin_inset Newline newline 449 | \end_inset 450 | 451 | 452 | \begin_inset Formula $\implies\max_{\theta}l(\theta)\equiv\min_{\theta}\sum_{i=1}^{m}(y^{(i)}-h_{\theta}(x))^{2}$ 453 | \end_inset 454 | 455 | 456 | \end_layout 457 | 458 | \begin_layout Standard 459 | Gaussian noise in our data set 460 | \begin_inset Formula $\{x^{(i)},y^{(i)}\}_{i=1}^{m}$ 461 | \end_inset 462 | 463 | gives us least squares 464 | \end_layout 465 | 466 | \begin_layout Standard 467 | \begin_inset Formula $min_{\theta}||X\theta-y||_{2}^{2}\equiv\min_{\theta}\theta^{\intercal}X^{\intercal}X\theta-2\theta^{\intercal}X^{\intercal}y+y^{\intercal}Y$ 468 | \end_inset 469 | 470 | 471 | \end_layout 472 | 473 | \begin_layout Standard 474 | \begin_inset Formula $\nabla_{\theta}l(\theta)=X^{\intercal}X\theta-X^{\intercal}y=0\implies\boxed{\theta^{*}=(X^{\intercal}X)^{-1}X^{\intercal}y}$ 475 | \end_inset 476 | 477 | 478 | \end_layout 479 | 480 | \begin_layout Standard 481 | Gradient Descent: 482 | \begin_inset Formula $\theta_{t+1}=\theta_{t}+\alpha(y_{t}^{(i)}-h(x_{t}^{(i)}))x_{t}^{(i)},\ \ h_{\theta}(x)=\theta^{\intercal}x$ 483 | \end_inset 484 | 485 | 486 | \end_layout 487 | 488 | \begin_layout Subsection 489 | Multivariate Gaussian 490 | \begin_inset Formula $X\sim\mathcal{N}(\mu,\Sigma)$ 491 | \end_inset 492 | 493 | 494 | \end_layout 495 | 496 | \begin_layout Standard 497 | 498 | \bar under 499 | Gaussian class conditionals lead to a logistic posterior. 500 | \end_layout 501 | 502 | \begin_layout Standard 503 | \begin_inset Formula $f(x;\mu,\Sigma)=\frac{1}{(2\pi)^{n/2}|\Sigma|^{1/2}}\exp\left(-\frac{1}{2}(x-\mu)^{T}\Sigma^{-1}(x-\mu)\right)$ 504 | \end_inset 505 | 506 | 507 | \end_layout 508 | 509 | \begin_layout Standard 510 | \begin_inset Formula $\Sigma=E[(X-\mu)(X-\mu)^{T}]=E[XX^{T}]-\mu\mu^{T}$ 511 | \end_inset 512 | 513 | 514 | \end_layout 515 | 516 | \begin_layout Standard 517 | \begin_inset Formula $\Sigma\text{ is PSD}\implies x^{T}\Sigma x\ge0\text{, if inverse exists }\Sigma\text{ must be PD}$ 518 | \end_inset 519 | 520 | 521 | \end_layout 522 | 523 | \begin_layout Standard 524 | \begin_inset Formula $\text{If }X\sim N(\mu,\Sigma),\ \text{then}\ AX+b\sim N(A\mu+b,A\Sigma A^{T})$ 525 | \end_inset 526 | 527 | 528 | \begin_inset Newline newline 529 | \end_inset 530 | 531 | 532 | \begin_inset Formula $\implies\Sigma^{-\frac{1}{2}}(X-\mu)\sim N(0,I),\text{ where }\Sigma^{-\frac{1}{2}}=U\Lambda^{-\frac{1}{2}}$ 533 | \end_inset 534 | 535 | 536 | \end_layout 537 | 538 | \begin_layout Standard 539 | The distribution is the result of a linear transformation of a vector of 540 | univariate Gaussians 541 | \begin_inset Formula $Z\sim\mathcal{N}(0,I)$ 542 | \end_inset 543 | 544 | such that 545 | \begin_inset Formula $X=AZ+\mu$ 546 | \end_inset 547 | 548 | where we have 549 | \begin_inset Formula $\Sigma=AA^{\intercal}$ 550 | \end_inset 551 | 552 | . 
553 | From the pdf, we see that the level curves of the distribution decrease 554 | proportionally with 555 | \begin_inset Formula $x^{\intercal}\Sigma^{-1}x$ 556 | \end_inset 557 | 558 | (assume 559 | \begin_inset Formula $\mu=0$ 560 | \end_inset 561 | 562 | ) 563 | \begin_inset Formula $\implies$ 564 | \end_inset 565 | 566 | 567 | \begin_inset Formula 568 | \[ 569 | \text{\ensuremath{c}-level set of \ensuremath{f}}\propto\{x:x^{\intercal}\Sigma^{-1}x=c\} 570 | \] 571 | 572 | \end_inset 573 | 574 | 575 | \begin_inset Formula 576 | \[ 577 | x^{\intercal}\Sigma^{-1}=c\equiv x^{\intercal}U\Lambda^{-1}U^{\intercal}x=c\implies 578 | \] 579 | 580 | \end_inset 581 | 582 | 583 | \begin_inset Formula 584 | \[ 585 | \underbrace{\lambda_{1}^{-1}(u_{1}^{\intercal}x)^{2}}_{\text{axis length: \ensuremath{\sqrt{\lambda_{1}}}}}+\cdots+\underbrace{\lambda_{n}^{-1}(u_{n}^{\intercal}x)^{2}}_{\text{axis length: \ensuremath{\sqrt{\lambda_{n}}}}}=c 586 | \] 587 | 588 | \end_inset 589 | 590 | 591 | \end_layout 592 | 593 | \begin_layout Standard 594 | Thus the level curves form an ellipsoid with axis lengths equal to the square 595 | root of the eigenvalues of the covariance matrix. 596 | \end_layout 597 | 598 | \begin_layout Subsection 599 | Loss Functions 600 | \end_layout 601 | 602 | \begin_layout Standard 603 | 604 | \end_layout 605 | 606 | \begin_layout Itemize 607 | 608 | \series bold 609 | Binomial deviance 610 | \series default 611 | 612 | \begin_inset Formula $=\log\left[1+e^{-yf\left(x\right)}\right]$ 613 | \end_inset 614 | 615 | 616 | \begin_inset Newline newline 617 | \end_inset 618 | 619 | minimizing function 620 | \begin_inset Formula $f\left(x\right)=\log\frac{\mathrm{P}\left[Y=+1\mid x\right]}{\mathrm{P}\left[Y=-1\mid x\right]}$ 621 | \end_inset 622 | 623 | 624 | \end_layout 625 | 626 | \begin_layout Itemize 627 | 628 | \series bold 629 | SVM hinge loss 630 | \series default 631 | 632 | \begin_inset Formula $=\left[1-yf\left(x\right)\right]_{+}$ 633 | \end_inset 634 | 635 | 636 | \begin_inset Newline newline 637 | \end_inset 638 | 639 | minimizing function 640 | \begin_inset Formula $f\left(x\right)=\mathrm{sign}\left(\mathrm{P}\left[Y=+1\mid x\right]-\frac{1}{2}\right)$ 641 | \end_inset 642 | 643 | 644 | \end_layout 645 | 646 | \begin_layout Itemize 647 | 648 | \series bold 649 | Squared error 650 | \series default 651 | 652 | \begin_inset Formula $=\left[y-f\left(x\right)\right]^{2}=\left[1-yf\left(x\right)\right]^{2}$ 653 | \end_inset 654 | 655 | 656 | \begin_inset Newline newline 657 | \end_inset 658 | 659 | minimizing function 660 | \begin_inset Formula $f\left(x\right)=2\mathrm{P}\left[Y=+1\mid x\right]-1$ 661 | \end_inset 662 | 663 | 664 | \end_layout 665 | 666 | \begin_layout Itemize 667 | 668 | \series bold 669 | \begin_inset Quotes eld 670 | \end_inset 671 | 672 | Huberized 673 | \begin_inset Quotes erd 674 | \end_inset 675 | 676 | square hinge loss 677 | \series default 678 | 679 | \begin_inset Formula $=\left\{ \begin{array}{ll} 680 | -4yf\left(x\right) & \text{if}\ yf\left(x\right)<-1\\ 681 | \left[1-yf\left(x\right)\right]_{+}^{2} & \text{otherwise} 682 | \end{array}\right.$ 683 | \end_inset 684 | 685 | 686 | \begin_inset Newline newline 687 | \end_inset 688 | 689 | minimizing function 690 | \begin_inset Formula $f\left(x\right)=2\mathrm{P}\left[Y=+1\mid x\right]-1$ 691 | \end_inset 692 | 693 | 694 | \end_layout 695 | 696 | \begin_layout Subsection 697 | Optimization 698 | \end_layout 699 | 700 | \begin_layout Standard 701 | Newton's Method: 702 | \begin_inset Formula 
$\theta_{t+1}=\theta_{t}-[\nabla_{\theta}^{2}f(\theta_{t})]^{-1}\nabla_{\theta}f(\theta_{t})$ 703 | \end_inset 704 | 705 | 706 | \end_layout 707 | 708 | \begin_layout Standard 709 | Gradient Decent: 710 | \begin_inset Formula $\theta_{t+1}=\theta_{t}-\alpha\nabla_{\theta}f(\theta_{t})$ 711 | \end_inset 712 | 713 | , for minimizing 714 | \end_layout 715 | 716 | \begin_layout Subsection 717 | Gradients 718 | \end_layout 719 | 720 | \begin_layout Standard 721 | \begin_inset Formula $\frac{\partial{\bf {y}}}{\partial{\bf {x}}}\triangleq\begin{bmatrix}\frac{\partial y_{1}}{\partial x_{1}} & \dots & \frac{\partial y_{m}}{\partial x_{1}}\\ 722 | \vdots & \ddots & \vdots\\ 723 | \frac{\partial y_{1}}{\partial x_{n}} & \dots & \frac{\partial y_{m}}{\partial x_{n}} 724 | \end{bmatrix},$ 725 | \end_inset 726 | 727 | 728 | \begin_inset Formula $\frac{\partial(A{\bf x})}{\partial{\bf x}}=A^{T},\frac{\partial({\bf x}^{T}A)}{\partial{\bf x}}=A,$ 729 | \end_inset 730 | 731 | 732 | \begin_inset Newline newline 733 | \end_inset 734 | 735 | 736 | \begin_inset Formula $\frac{\partial({\bf x}^{T}{\bf x})}{\partial{\bf x}}=2{\bf x},\frac{\partial({\bf x}^{T}A{\bf x})}{\partial{\bf x}}=(A+A^{T}){\bf x},\frac{\partial(trBA)}{\partial A}=B^{T}$ 737 | \end_inset 738 | 739 | 740 | \end_layout 741 | 742 | \begin_layout Standard 743 | \begin_inset VSpace vfill 744 | \end_inset 745 | 746 | 747 | \end_layout 748 | 749 | \begin_layout Standard 750 | \begin_inset ERT 751 | status open 752 | 753 | \begin_layout Plain Layout 754 | 755 | 756 | \backslash 757 | columnbreak 758 | \end_layout 759 | 760 | \end_inset 761 | 762 | 763 | \end_layout 764 | 765 | \begin_layout Section 766 | Support Vector Machines 767 | \end_layout 768 | 769 | \begin_layout Standard 770 | In the strictly separable case, the goal is to find a separating hyperplane 771 | (like logistic regression) except now we don't just want any hyperplane, 772 | but one with the largest margin. 773 | 774 | \end_layout 775 | 776 | \begin_layout Standard 777 | \begin_inset Formula $H=\{\omega^{T}x+b=0\}$ 778 | \end_inset 779 | 780 | , since scaling 781 | \begin_inset Formula $\omega$ 782 | \end_inset 783 | 784 | and b in opposite directions doesn't change the hyperplane our optimization 785 | function should have scaling invariance built into it. 786 | Thus, we do it now and define the closest points to the hyperplane 787 | \begin_inset Formula $x_{sv}$ 788 | \end_inset 789 | 790 | (support vectors) to satisfy: 791 | \begin_inset Formula $|\omega^{T}x_{sv}+b|=1$ 792 | \end_inset 793 | 794 | . 795 | The distance from any support vector to the hyper plane is now: 796 | \begin_inset Formula $\frac{1}{||\omega||_{2}}$ 797 | \end_inset 798 | 799 | . 800 | Maximizing the distance to the hyperplane is the same as minimizing 801 | \begin_inset Formula $||\omega||_{2}$ 802 | \end_inset 803 | 804 | . 
805 | \end_layout 806 | 807 | \begin_layout Standard 808 | The final optimization problem is: 809 | \end_layout 810 | 811 | \begin_layout Standard 812 | \begin_inset Formula $\boxed{\min_{\omega,b}\frac{1}{2}||\omega||_{2}\ s.t.\ y^{(i)}(w^{T}x^{(i)}+b)\ge1,i=1,\dots,m}$ 813 | \end_inset 814 | 815 | 816 | \end_layout 817 | 818 | \begin_layout Standard 819 | 820 | \bar under 821 | Primal 822 | \bar default 823 | : 824 | \begin_inset Formula $L_{p}(\omega,b,\alpha)=\frac{1}{2}||\omega||_{2}-\sum_{i=1}^{m}\alpha_{i}(y^{(i)}(w^{T}x^{(i)}+b)-1)$ 825 | \end_inset 826 | 827 | 828 | \end_layout 829 | 830 | \begin_layout Standard 831 | \begin_inset Formula $\frac{\partial L_{p}}{\partial\omega}=\omega-\sum\alpha_{i}y^{(i)}x^{(i)}=0\implies\omega=\sum\alpha_{i}y^{(i)}x^{(i)}$ 832 | \end_inset 833 | 834 | 835 | \end_layout 836 | 837 | \begin_layout Standard 838 | \begin_inset Formula $\frac{\partial L_{p}}{\partial b}=-\sum\alpha_{i}y^{(i)}=0,\text{\ \ \ Note: }\alpha_{i}\ne0$ 839 | \end_inset 840 | 841 | only for support vectors. 842 | \end_layout 843 | 844 | \begin_layout Standard 845 | Substitute the derivatives into the primal to get the dual. 846 | \end_layout 847 | 848 | \begin_layout Standard 849 | 850 | \bar under 851 | Dual 852 | \bar default 853 | : 854 | \begin_inset Formula $L_{d}(\alpha)=\sum_{i=1}^{m}\alpha_{i}-\frac{1}{2}\sum_{i=1}^{m}\sum_{j=1}^{m}y^{(i)}y^{(j)}\alpha_{i}\alpha_{j}(x^{(i)})^{T}x^{(j)}$ 855 | \end_inset 856 | 857 | 858 | \end_layout 859 | 860 | \begin_layout Standard 861 | KKT says 862 | \begin_inset Formula $\alpha_{n}(y_{n}(w^{T}x_{n}+b)-1)=0$ 863 | \end_inset 864 | 865 | where 866 | \begin_inset Formula $\alpha_{n}>0$ 867 | \end_inset 868 | 869 | . 870 | \end_layout 871 | 872 | \begin_layout Standard 873 | In the non-separable case we allow points to cross the marginal boundary 874 | by some amount 875 | \begin_inset Formula $\xi$ 876 | \end_inset 877 | 878 | and penalize it. 879 | \end_layout 880 | 881 | \begin_layout Standard 882 | \begin_inset Formula $\boxed{\min_{\omega,b}\frac{1}{2}||\omega||_{2}+C\sum_{i=1}^{m}\xi_{i}\ \ s.t.\ \ y^{(i)}(w^{T}x^{(i)}+b)\ge1-\xi_{i}}$ 883 | \end_inset 884 | 885 | 886 | \end_layout 887 | 888 | \begin_layout Standard 889 | The dual for non-separable doesn't change much except that each 890 | \begin_inset Formula $\alpha_{i}$ 891 | \end_inset 892 | 893 | now has an upper bound of C 894 | \begin_inset Formula $\implies0\le\alpha_{i}\le C$ 895 | \end_inset 896 | 897 | 898 | \end_layout 899 | 900 | \begin_layout Subsection 901 | Lagrangian 902 | \end_layout 903 | 904 | \begin_layout Standard 905 | \begin_inset Formula $\boxed{L\left(x,\lambda\right)=f_{0}\left(x\right)+\sum_{i=1}^{m}\lambda_{i}f_{i}\left(x\right)}$ 906 | \end_inset 907 | 908 | 909 | \end_layout 910 | 911 | \begin_layout Itemize 912 | Think of the 913 | \begin_inset Formula $\lambda_{i}$ 914 | \end_inset 915 | 916 | as the cost of violating the constraint 917 | \begin_inset Formula $f_{i}\left(x\right)\leq0$ 918 | \end_inset 919 | 920 | . 921 | \end_layout 922 | 923 | \begin_layout Itemize 924 | \begin_inset Formula $L$ 925 | \end_inset 926 | 927 | defines a saddle point game: one player ( 928 | \noun on 929 | Min 930 | \noun default 931 | ); the other player ( 932 | \noun on 933 | Max 934 | \noun default 935 | ) chooses 936 | \begin_inset Formula $\lambda$ 937 | \end_inset 938 | 939 | to maximize 940 | \begin_inset Formula $L$ 941 | \end_inset 942 | 943 | . 
944 | If 945 | \noun on 946 | Min 947 | \noun default 948 | violates a constraint, 949 | \begin_inset Formula $f_{i}\left(x\right)>0$ 950 | \end_inset 951 | 952 | , then 953 | \noun on 954 | Max 955 | \noun default 956 | can drive 957 | \begin_inset Formula $L$ 958 | \end_inset 959 | 960 | to infinity. 961 | \end_layout 962 | 963 | \begin_layout Itemize 964 | We call the original optimization problem the 965 | \bar under 966 | primal 967 | \bar default 968 | problem. 969 | \begin_inset Newline newline 970 | \end_inset 971 | 972 | It has value 973 | \begin_inset Formula $p*=\min_{x}\max_{\lambda\geq0}L\left(x,\lambda\right)$ 974 | \end_inset 975 | 976 | 977 | \begin_inset Newline newline 978 | \end_inset 979 | 980 | (Because of an infeasible 981 | \begin_inset Formula $x$ 982 | \end_inset 983 | 984 | , 985 | \begin_inset Formula $L\left(x,\lambda\right)$ 986 | \end_inset 987 | 988 | can be made infinite, and for a feasible 989 | \begin_inset Formula $x$ 990 | \end_inset 991 | 992 | , the 993 | \begin_inset Formula $\lambda_{i}f_{i}\left(x\right)$ 994 | \end_inset 995 | 996 | terms will become zero.) 997 | \end_layout 998 | 999 | \begin_layout Itemize 1000 | Define 1001 | \begin_inset Formula $g\left(\lambda\right):=\min_{x}L\left(x,\lambda\right)$ 1002 | \end_inset 1003 | 1004 | , and define the 1005 | \bar under 1006 | dual 1007 | \bar default 1008 | problem as 1009 | \begin_inset Newline newline 1010 | \end_inset 1011 | 1012 | 1013 | \begin_inset Formula $d*=\max_{\lambda\geq0}g\left(\lambda\right)=\max_{\lambda\geq0}\min_{x}L\left(x,\lambda\right)$ 1014 | \end_inset 1015 | 1016 | 1017 | \end_layout 1018 | 1019 | \begin_layout Itemize 1020 | In a zero sum game, it's always better to play second: 1021 | \begin_inset Formula $p*=\min_{x}\max_{\lambda\geq0}L\left(x,\lambda\right)\geq\max_{\lambda\geq0}\min_{x}L\left(x,\lambda\right)=d*$ 1022 | \end_inset 1023 | 1024 | This is called 1025 | \bar under 1026 | weak duality 1027 | \bar default 1028 | . 
1029 | \end_layout 1030 | 1031 | \begin_layout Itemize 1032 | If there is a 1033 | \bar under 1034 | saddle point 1035 | \bar default 1036 | 1037 | \begin_inset Formula $\left(x*,\lambda*\right)$ 1038 | \end_inset 1039 | 1040 | , so that for all 1041 | \begin_inset Formula $x$ 1042 | \end_inset 1043 | 1044 | and 1045 | \begin_inset Formula $\lambda\geq0$ 1046 | \end_inset 1047 | 1048 | , 1049 | \begin_inset Formula $L\left(x*,\lambda\right)\leq L\left(x*,\lambda*\right)\leq L\left(x,\lambda*\right),$ 1050 | \end_inset 1051 | 1052 | then we have 1053 | \bar under 1054 | strong duality 1055 | \bar default 1056 | : the primal and dual have the same value, 1057 | \begin_inset Formula $p*=\min_{x}\max_{\lambda\geq0}L\left(x,\lambda\right)=\max_{\lambda\geq0}\min_{x}L\left(x,\lambda\right)=d*$ 1058 | \end_inset 1059 | 1060 | 1061 | \end_layout 1062 | 1063 | \begin_layout Standard 1064 | Using notation from Peter's notes: 1065 | \end_layout 1066 | 1067 | \begin_layout Standard 1068 | Given 1069 | \begin_inset Formula $\min_{x}f(x)\ s.t.\ g_{i}(x)=0,\ h_{i}(x)\le0$ 1070 | \end_inset 1071 | 1072 | , the corresponding Lagrangian is: 1073 | \begin_inset Formula $L(x,\alpha,\beta)=f(x)+\sum_{i=1}^{k}\alpha_{i}g_{i}(x)+\sum_{i=1}^{l}\beta_{i}h_{i}(x)$ 1074 | \end_inset 1075 | 1076 | 1077 | \end_layout 1078 | 1079 | \begin_layout Standard 1080 | We min over x and max over the Lagrange multipliers 1081 | \begin_inset Formula $\alpha$ 1082 | \end_inset 1083 | 1084 | and 1085 | \begin_inset Formula $\beta$ 1086 | \end_inset 1087 | 1088 | 1089 | \end_layout 1090 | 1091 | \begin_layout Section 1092 | Regression 1093 | \end_layout 1094 | 1095 | \begin_layout Standard 1096 | In general, the loss function consists of two parts, the loss term and the 1097 | regularization term. 1098 | 1099 | \begin_inset Formula $J(\omega)=\sum_{i}Loss_{i}+\lambda R(\omega)$ 1100 | \end_inset 1101 | 1102 | 1103 | \end_layout 1104 | 1105 | \begin_layout Standard 1106 | L2 regularization results in 1107 | \series bold 1108 | ridge regression 1109 | \series default 1110 | . 1111 | \begin_inset Newline newline 1112 | \end_inset 1113 | 1114 | Used when A has a nontrivial null space. 1115 | L2 reg falls out of the MAP estimate when we add a Gaussian prior on x with 1116 | \begin_inset Formula $\Sigma=cI$ 1117 | \end_inset 1118 | 1119 | . 1120 | \begin_inset Newline newline 1121 | \end_inset 1122 | 1123 | 1124 | \begin_inset Formula $\min_{x}||Ax-y||_{2}^{2}+\lambda||x||_{2}^{2}\implies x^{*}=(A^{T}A+\lambda I)^{-1}A^{T}y$ 1125 | \end_inset 1126 | 1127 | 1128 | \end_layout 1129 | 1130 | \begin_layout Standard 1131 | L1 regularization results in 1132 | \series bold 1133 | lasso regression 1134 | \series default 1135 | . 1136 | \begin_inset Newline newline 1137 | \end_inset 1138 | 1139 | Used when 1140 | \begin_inset Formula $x$ 1141 | \end_inset 1142 | 1143 | has a Laplace prior. 1144 | Gives sparse results.
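A minimal NumPy sketch of the ridge closed form above (the names A, y, and lam are illustrative, not from the sheet):

```python
import numpy as np

def ridge_closed_form(A, y, lam):
    """Solve min_x ||Ax - y||_2^2 + lam*||x||_2^2 via x* = (A^T A + lam*I)^{-1} A^T y."""
    d = A.shape[1]
    # solve() is preferred over forming the explicit inverse
    return np.linalg.solve(A.T @ A + lam * np.eye(d), A.T @ y)

# tiny usage example on random data
A = np.random.randn(20, 5)
y = np.random.randn(20)
x_star = ridge_closed_form(A, y, lam=0.1)
```

The lam*I term is what makes the system solvable even when A has a nontrivial null space, as noted above.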
1145 | \end_layout 1146 | 1147 | \begin_layout Subsection 1148 | Logistic Regression 1149 | \end_layout 1150 | 1151 | \begin_layout Standard 1152 | Classify 1153 | \begin_inset Formula $y\in\{0,1\}\implies$ 1154 | \end_inset 1155 | 1156 | Model 1157 | \begin_inset Formula $p(y=1|x)=\frac{1}{1+e^{-\theta^{T}x}}=h_{\theta}(x)$ 1158 | \end_inset 1159 | 1160 | 1161 | \end_layout 1162 | 1163 | \begin_layout Standard 1164 | \begin_inset Formula $\frac{dh_{\theta}}{d(\theta^{T}x)}=(\frac{1}{1+e^{-\theta^{T}x}})^{2}e^{-\theta^{T}x}=\frac{1}{1+e^{-\theta^{T}x}}\left(1-\frac{1}{1+e^{-\theta^{T}x}}\right)=h_{\theta}(1-h_{\theta})$ 1165 | \end_inset 1166 | 1167 | 1168 | \end_layout 1169 | 1170 | \begin_layout Standard 1171 | \begin_inset Formula $p(y|x;\theta)=(h_{\theta}(x))^{y}(1-h_{\theta}(x))^{1-y}\implies$ 1172 | \end_inset 1173 | 1174 | 1175 | \end_layout 1176 | 1177 | \begin_layout Standard 1178 | \begin_inset Formula $L(\theta)=\prod_{i=1}^{m}(h_{\theta}(x^{(i)}))^{y^{(i)}}(1-h_{\theta}(x^{(i)}))^{1-y^{(i)}}\implies$ 1179 | \end_inset 1180 | 1181 | 1182 | \end_layout 1183 | 1184 | \begin_layout Standard 1185 | \begin_inset Formula $l(\theta)=\sum_{i=1}^{m}y^{(i)}\log(h_{\theta}(x^{(i)}))+(1-y^{(i)})\log(1-h_{\theta}(x^{(i)}))\implies$ 1186 | \end_inset 1187 | 1188 | 1189 | \end_layout 1190 | 1191 | \begin_layout Standard 1192 | \begin_inset Formula $\nabla_{\theta}l=\sum_{i}(y^{(i)}-h_{\theta}(x^{(i)}))x^{(i)}=X^{\intercal}(y-h_{\theta}(X))$ 1193 | \end_inset 1194 | 1195 | , (want 1196 | \begin_inset Formula $\max\ l(\theta)$ 1197 | \end_inset 1198 | 1199 | ) 1200 | \end_layout 1201 | 1202 | \begin_layout Standard 1203 | Stochastic: 1204 | \begin_inset Formula $\boxed{\theta_{t+1}=\theta_{t}+\alpha(y_{t}^{(j)}-h_{\theta}(x_{t}^{(j)}))x_{t}^{(j)}}$ 1205 | \end_inset 1206 | 1207 | 1208 | \end_layout 1209 | 1210 | \begin_layout Standard 1211 | Batch: 1212 | \begin_inset Formula $\boxed{\theta_{t+1}=\theta_{t}+\alpha X^{\intercal}(y-h_{\theta}(X))}$ 1213 | \end_inset 1214 | 1215 | 1216 | \end_layout 1217 | 1218 | \begin_layout Standard 1219 | \begin_inset VSpace vfill 1220 | \end_inset 1221 | 1222 | 1223 | \end_layout 1224 | 1225 | \begin_layout Standard 1226 | \begin_inset ERT 1227 | status open 1228 | 1229 | \begin_layout Plain Layout 1230 | 1231 | 1232 | \backslash 1233 | columnbreak 1234 | \end_layout 1235 | 1236 | \end_inset 1237 | 1238 | 1239 | \end_layout 1240 | 1241 | \begin_layout Subsection 1242 | LDA and QDA 1243 | \end_layout 1244 | 1245 | \begin_layout Standard 1246 | Classify 1247 | \begin_inset Formula $y\in\{0,1\},$ 1248 | \end_inset 1249 | 1250 | Model 1251 | \begin_inset Formula $p(y)=\phi^{y}(1-\phi)^{1-y}$ 1252 | \end_inset 1253 | 1254 | and 1255 | \end_layout 1256 | 1257 | \begin_layout Standard 1258 | \begin_inset Formula $l(\phi,\mu_{0},\mu_{1},\Sigma)=\log\prod_{i=1}^{m}p(x^{(i)}|y^{(i)};\mu_{0},\mu_{1},\Sigma)p(y^{(i)};\phi)$ 1259 | \end_inset 1260 | 1261 | gives us 1262 | \end_layout 1263 | 1264 | \begin_layout Standard 1265 | \begin_inset Formula $\phi_{MLE}=\frac{1}{m}\sum_{i=1}^{m}1\{y^{(i)}=1\}$ 1266 | \end_inset 1267 | 1268 | , 1269 | \begin_inset Formula $\mu_{k,MLE}=\text{avg of }x^{(i)}\text{ in class }k$ 1270 | \end_inset 1271 | 1272 | , 1273 | \end_layout 1274 | 1275 | \begin_layout Standard 1276 | \begin_inset Formula $\Sigma_{MLE}=\frac{1}{m}\sum_{i=1}^{m}(x^{(i)}-\mu_{y^{(i)}})(x^{(i)}-\mu_{y^{(i)}})^{T}$ 1277 | \end_inset 1278 | 1279 | . 1280 | \end_layout 1281 | 1282 | \begin_layout Standard 1283 | Notice the covariance matrix is the same for all classes in LDA.
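A short NumPy sketch of the LDA MLE estimates above for two classes (the array names X and y are assumptions, not from the sheet):

```python
import numpy as np

def lda_mle(X, y):
    """MLE for LDA with y in {0, 1}: class prior phi, class means, shared covariance."""
    m = X.shape[0]
    phi = np.mean(y == 1)                              # fraction of class-1 points
    mu = [X[y == k].mean(axis=0) for k in (0, 1)]      # per-class means
    centered = X - np.where((y == 1)[:, None], mu[1], mu[0])
    Sigma = centered.T @ centered / m                  # one covariance shared by both classes
    return phi, mu, Sigma
```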
1284 | \end_layout 1285 | 1286 | \begin_layout Standard 1287 | If 1288 | \begin_inset Formula $p(x|y)$ 1289 | \end_inset 1290 | 1291 | multivariate gaussian (w/ shared 1292 | \begin_inset Formula $\Sigma)$ 1293 | \end_inset 1294 | 1295 | , then 1296 | \begin_inset Formula $p(y|x)$ 1297 | \end_inset 1298 | 1299 | is logistic function. 1300 | The converse is NOT true. 1301 | LDA makes stronger assumptions about data than does logistic regression. 1302 | 1303 | \begin_inset Formula $h(x)=arg\max_{k}-\frac{1}{2}(x-\mu_{k})^{T}\Sigma^{-1}(x-\mu_{k})+log(\pi_{k})$ 1304 | \end_inset 1305 | 1306 | 1307 | \end_layout 1308 | 1309 | \begin_layout Standard 1310 | where 1311 | \begin_inset Formula $\pi_{k}=p(y=k)$ 1312 | \end_inset 1313 | 1314 | 1315 | \end_layout 1316 | 1317 | \begin_layout Standard 1318 | For QDA, the model is the same as LDA except that each class has a unique 1319 | covariance matrix. 1320 | 1321 | \begin_inset Formula $h(x)=arg\max_{k}-\frac{1}{2}log|\Sigma_{k}|-\frac{1}{2}(x-\mu_{k})^{T}\Sigma_{k}^{-1}(x-\mu_{k})+log(\pi_{k})$ 1322 | \end_inset 1323 | 1324 | 1325 | \end_layout 1326 | 1327 | \begin_layout Section 1328 | Other Classifiers 1329 | \end_layout 1330 | 1331 | \begin_layout Subsection 1332 | Nearest Neighbor 1333 | \end_layout 1334 | 1335 | \begin_layout Standard 1336 | Key Idea: Store all training examples 1337 | \begin_inset Formula $\left\langle x_{i},f(x_{i})\right\rangle $ 1338 | \end_inset 1339 | 1340 | 1341 | \end_layout 1342 | 1343 | \begin_layout Standard 1344 | 1345 | \series bold 1346 | NN 1347 | \series default 1348 | : Find closest training point using some distance metric and take its label. 1349 | \end_layout 1350 | 1351 | \begin_layout Standard 1352 | 1353 | \series bold 1354 | k-NN 1355 | \series default 1356 | : Find closest k training points and take on the most likely label based 1357 | on some voting scheme (mean, median,...) 1358 | \end_layout 1359 | 1360 | \begin_layout Standard 1361 | 1362 | \series bold 1363 | Behavior at the limit 1364 | \series default 1365 | : 1NN 1366 | \begin_inset Formula $lim_{N\to\infty}\ \epsilon^{*}\le\epsilon_{NN}\le2\epsilon^{*}$ 1367 | \end_inset 1368 | 1369 | 1370 | \begin_inset Formula $\epsilon^{*}=\text{error of optimal prediction},\ \epsilon_{nn}=\text{error of 1NN classifier}$ 1371 | \end_inset 1372 | 1373 | 1374 | \end_layout 1375 | 1376 | \begin_layout Standard 1377 | KNN 1378 | \begin_inset space \space{} 1379 | \end_inset 1380 | 1381 | 1382 | \begin_inset Formula $lim_{N\to\infty,K\to\infty},\frac{K}{N}\to0,\epsilon_{knn}=\epsilon^{*}$ 1383 | \end_inset 1384 | 1385 | 1386 | \end_layout 1387 | 1388 | \begin_layout Standard 1389 | 1390 | \series bold 1391 | Curse of dimensionality 1392 | \series default 1393 | : As the number of dimensions increases, everything becomes farther apart. 1394 | Our low dimension intuition falls apart. 1395 | Consider the Hypersphere/Hypercube ratio, it's close to zero at 1396 | \begin_inset Formula $d=10$ 1397 | \end_inset 1398 | 1399 | . 1400 | How do deal with this curse: 1401 | \end_layout 1402 | 1403 | \begin_layout Enumerate 1404 | Get more data to fill all of that empty space 1405 | \end_layout 1406 | 1407 | \begin_layout Enumerate 1408 | Get better features, reducing the dimensionality and packing the data closer 1409 | together. 1410 | Ex: Bag-of-words, Histograms,... 1411 | \end_layout 1412 | 1413 | \begin_layout Enumerate 1414 | Use a better distance metric. 
1415 | \end_layout 1416 | 1417 | \begin_layout Standard 1418 | Minkowski: 1419 | \begin_inset Formula $Dis_{p}(x,y)=(\sum_{i=1}^{d}|x_{i}-y_{u}|^{p})^{\frac{1}{p}}=||x-y||_{p}$ 1420 | \end_inset 1421 | 1422 | 1423 | \end_layout 1424 | 1425 | \begin_layout Standard 1426 | 0-norm: 1427 | \begin_inset Formula $Dis_{0}(x,y)=\sum_{i=1}^{d}I|x_{i}=y_{i}|$ 1428 | \end_inset 1429 | 1430 | 1431 | \end_layout 1432 | 1433 | \begin_layout Standard 1434 | Mahalanobis: 1435 | \begin_inset Formula $Dis_{M}(x,y|\Sigma)=\sqrt{(x-y)^{T}\Sigma^{-1}(x-y)}$ 1436 | \end_inset 1437 | 1438 | 1439 | \end_layout 1440 | 1441 | \begin_layout Standard 1442 | In high-d we get 1443 | \begin_inset Quotes eld 1444 | \end_inset 1445 | 1446 | Hubs 1447 | \begin_inset Quotes erd 1448 | \end_inset 1449 | 1450 | s.t most points identify the hubs as their NN. 1451 | These hubs are usually near the means (Ex: dull gray images, sky and clouds). 1452 | To avoid having everything classified as these hubs, we can use cosine 1453 | similarity. 1454 | \end_layout 1455 | 1456 | \begin_layout Standard 1457 | 1458 | \series bold 1459 | K-d trees 1460 | \series default 1461 | increase the efficiency of nearest neighbor lookup. 1462 | \end_layout 1463 | 1464 | \begin_layout Subsection 1465 | Decision Trees 1466 | \end_layout 1467 | 1468 | \begin_layout Standard 1469 | Given a set of points and classes 1470 | \begin_inset Formula $\{x_{i},y_{i}\}_{i=1}^{n}$ 1471 | \end_inset 1472 | 1473 | , test features 1474 | \begin_inset Formula $x_{j}$ 1475 | \end_inset 1476 | 1477 | and branch on the feature which 1478 | \begin_inset Quotes eld 1479 | \end_inset 1480 | 1481 | best 1482 | \begin_inset Quotes erd 1483 | \end_inset 1484 | 1485 | separates the data. 1486 | Recursively split on the new subset of data. 1487 | Growing the tree to max depth tends to overfit (training data gets cut 1488 | quickly 1489 | \begin_inset Formula $\implies$ 1490 | \end_inset 1491 | 1492 | subtrees train on small sets). 1493 | Mistakes high up in the tree propagate to corresponding subtrees. 1494 | To reduce overfitting, we can prune using a validation set, and we can 1495 | limit the depth. 1496 | \end_layout 1497 | 1498 | \begin_layout Standard 1499 | DT's are prone to label noise. 1500 | Building the correct tree is hard. 1501 | \end_layout 1502 | 1503 | \begin_layout Standard 1504 | 1505 | \series bold 1506 | Heurisitic 1507 | \series default 1508 | : For 1509 | \bar under 1510 | classification 1511 | \bar default 1512 | , maximize information gain 1513 | \begin_inset Formula 1514 | \[ 1515 | \max_{j}\quad\mathrm{H}(D)\ -\sum_{x_{j}\in X_{j}}P(X_{j}=x_{j})\cdot\mathrm{H}(D|X_{j}=x_{j}) 1516 | \] 1517 | 1518 | \end_inset 1519 | 1520 | where 1521 | \begin_inset Formula $\mathrm{H}(D)=-\sum_{c\in C}P(y=c)\log[p(y=c)]$ 1522 | \end_inset 1523 | 1524 | is the entropy of the data set, 1525 | \begin_inset Formula $C$ 1526 | \end_inset 1527 | 1528 | is the set of classes each data point can take, and 1529 | \begin_inset Formula $P(y=c)$ 1530 | \end_inset 1531 | 1532 | is the fraction of data points with class 1533 | \begin_inset Formula $c$ 1534 | \end_inset 1535 | 1536 | . 1537 | \begin_inset Newline newline 1538 | \end_inset 1539 | 1540 | For 1541 | \noun on 1542 | regression 1543 | \noun default 1544 | , minimize the variance. 1545 | Same optimization problem as above, except H is replaced with var. 1546 | Pure leaves correspond to low variance, and the result is the mean of the 1547 | current leaf. 
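A small NumPy sketch of the information-gain split heuristic above for one categorical feature (function and variable names are illustrative):

```python
import numpy as np

def entropy(labels):
    """H(D) = -sum_c P(y=c) log P(y=c), over the classes present in labels."""
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))

def information_gain(feature, labels):
    """H(D) - sum_v P(X_j = v) * H(D | X_j = v) for a single categorical feature."""
    values, counts = np.unique(feature, return_counts=True)
    cond = sum(c / len(labels) * entropy(labels[feature == v])
               for v, c in zip(values, counts))
    return entropy(labels) - cond

# e.g. branch on the column of X with the largest gain:
# best_j = max(range(X.shape[1]), key=lambda j: information_gain(X[:, j], y))
```

For regression, the same loop would swap entropy for the variance of the labels, as noted above.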
1548 | \end_layout 1549 | 1550 | \begin_layout Subsection 1551 | Random Forests 1552 | \end_layout 1553 | 1554 | \begin_layout Standard 1555 | 1556 | \series bold 1557 | Problem 1558 | \series default 1559 | : DT's are 1560 | \bar under 1561 | unstable 1562 | \bar default 1563 | : small changes in the input data have large effect on tree structure 1564 | \begin_inset Formula $\implies$ 1565 | \end_inset 1566 | 1567 | DT's are high-variance estimators. 1568 | \begin_inset Newline newline 1569 | \end_inset 1570 | 1571 | 1572 | \series bold 1573 | Solution 1574 | \series default 1575 | : Random Forests train 1576 | \begin_inset Formula $M$ 1577 | \end_inset 1578 | 1579 | different trees with randomly sampled subsets of the data (called bagging), 1580 | and sometimes with randomly sampled subsets of the features to de-correlate 1581 | the trees. 1582 | A new point is tested on all 1583 | \begin_inset Formula $M$ 1584 | \end_inset 1585 | 1586 | trees and we take the majority as our output class (for regression we take 1587 | the average of the output). 1588 | \end_layout 1589 | 1590 | \begin_layout Subsection 1591 | Boosting 1592 | \end_layout 1593 | 1594 | \begin_layout Standard 1595 | Weak Learner: Can classify with at least 50% accuracy. 1596 | \end_layout 1597 | 1598 | \begin_layout Standard 1599 | Train weak learner to get a weak classifier. 1600 | Test it on the training data, up-weigh misclassified data, down-weigh correctly 1601 | classified data. 1602 | Train a new weak learner on the weighted data. 1603 | Repeat. 1604 | A new point is classified by every weak learner and the output class is 1605 | the sign of a weighted avg. 1606 | of weak learner outputs. 1607 | Boosting generally overfits. 1608 | If there is label noise, boosting keeps upweighing the mislabeled data. 1609 | \end_layout 1610 | 1611 | \begin_layout Standard 1612 | 1613 | \series bold 1614 | AdaBoost 1615 | \series default 1616 | is a boosting algorithm. 1617 | The weak learner weights are given by 1618 | \begin_inset Formula $\alpha_{t}=\frac{1}{2}\ln(\frac{1-\epsilon_{t}}{\epsilon_{t}})$ 1619 | \end_inset 1620 | 1621 | where 1622 | \begin_inset Formula $\epsilon_{t}=Pr_{D_{t}}(h_{t}(x_{i})\ne y_{i})$ 1623 | \end_inset 1624 | 1625 | (probability of misclassification). 1626 | The weights are updated 1627 | \begin_inset Formula $D_{t+1}(i)=\frac{D_{t}(i)exp(-\alpha_{t}y_{i}h_{t}(x_{i}))}{Z_{t}}$ 1628 | \end_inset 1629 | 1630 | where 1631 | \begin_inset Formula $Z_{t}$ 1632 | \end_inset 1633 | 1634 | is a normalization factor. 1635 | \end_layout 1636 | 1637 | \begin_layout Subsection 1638 | Neural Networks 1639 | \end_layout 1640 | 1641 | \begin_layout Standard 1642 | Neural Nets explore what you can do by combining perceptrons, each of which 1643 | is a simple linear classifier. 1644 | We use a soft threshold for each activation function 1645 | \begin_inset Formula $\theta$ 1646 | \end_inset 1647 | 1648 | because it is twice differentiable. 
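A sketch of one AdaBoost round from the Boosting subsection above, assuming labels y in {-1, +1} and a weak_learner(X, y, D) helper that returns a classifier h with h(X) in {-1, +1} (these names are assumptions):

```python
import numpy as np

def adaboost_round(X, y, D, weak_learner):
    """One round: fit on weights D, compute alpha_t, then reweigh and renormalize."""
    h = weak_learner(X, y, D)              # train weak learner on the weighted data
    pred = h(X)
    eps = np.sum(D[pred != y])             # weighted misclassification rate (D sums to 1)
    alpha = 0.5 * np.log((1 - eps) / eps)  # weak-learner weight alpha_t
    D_new = D * np.exp(-alpha * y * pred)  # up-weigh mistakes, down-weigh correct points
    return h, alpha, D_new / D_new.sum()   # division by Z_t, the normalization factor
```

The final classifier is the sign of the alpha-weighted sum of the weak learners' outputs.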
1649 | \end_layout 1650 | 1651 | \begin_layout Standard 1652 | \begin_inset Graphics 1653 | filename graphics/NN.pdf 1654 | lyxscale 50 1655 | width 72col% 1656 | 1657 | \end_inset 1658 | 1659 | 1660 | \begin_inset space \space{} 1661 | \end_inset 1662 | 1663 | 1664 | \begin_inset Graphics 1665 | filename graphics/NN2.pdf 1666 | lyxscale 35 1667 | width 21col% 1668 | 1669 | \end_inset 1670 | 1671 | 1672 | \end_layout 1673 | 1674 | \begin_layout Standard 1675 | 1676 | \series bold 1677 | Activation Functions: 1678 | \end_layout 1679 | 1680 | \begin_layout Standard 1681 | \begin_inset Formula $\theta(s)=\tanh(s)=\frac{e^{s}-e^{-s}}{e^{s}+e^{-s}}\implies\theta'(s)=1-\theta^{2}(s)$ 1682 | \end_inset 1683 | 1684 | 1685 | \end_layout 1686 | 1687 | \begin_layout Standard 1688 | \begin_inset Formula $\theta(s)=\sigma(s)=\frac{1}{1+e^{-s}}\implies\theta'(s)=\sigma(s)(1-\sigma(s))$ 1689 | \end_inset 1690 | 1691 | 1692 | \end_layout 1693 | 1694 | \begin_layout Standard 1695 | 1696 | \series bold 1697 | Error Functions 1698 | \series default 1699 | : 1700 | \end_layout 1701 | 1702 | \begin_layout Standard 1703 | 1704 | \family roman 1705 | \series medium 1706 | \shape up 1707 | \size normal 1708 | \emph off 1709 | \bar no 1710 | \strikeout off 1711 | \uuline off 1712 | \uwave off 1713 | \noun off 1714 | \color none 1715 | Cross Entropy Loss 1716 | \begin_inset Formula $\sum_{i=1}^{n_{out}}y\log(h_{\theta}(x))+(1-y)\log(1-h_{\theta}(x))$ 1717 | \end_inset 1718 | 1719 | 1720 | \end_layout 1721 | 1722 | \begin_layout Standard 1723 | 1724 | \family roman 1725 | \series medium 1726 | \shape up 1727 | \size normal 1728 | \emph off 1729 | \bar no 1730 | \strikeout off 1731 | \uuline off 1732 | \uwave off 1733 | \noun off 1734 | \color none 1735 | Mean Squared Error 1736 | \begin_inset Formula $\sum_{i=1}^{n_{out}}(y-h_{\theta}(x))^{2}$ 1737 | \end_inset 1738 | 1739 | 1740 | \end_layout 1741 | 1742 | \begin_layout Standard 1743 | 1744 | \series bold 1745 | Notation: 1746 | \series default 1747 | 1748 | \end_layout 1749 | 1750 | \begin_layout Enumerate 1751 | \begin_inset Formula $w_{ij}^{(l)}$ 1752 | \end_inset 1753 | 1754 | is the weight from neuron 1755 | \begin_inset Formula $i$ 1756 | \end_inset 1757 | 1758 | in layer 1759 | \begin_inset Formula $l-1$ 1760 | \end_inset 1761 | 1762 | to neuron 1763 | \begin_inset Formula $j$ 1764 | \end_inset 1765 | 1766 | in layer 1767 | \begin_inset Formula $l$ 1768 | \end_inset 1769 | 1770 | . 1771 | There are 1772 | \begin_inset Formula $d^{(l)}$ 1773 | \end_inset 1774 | 1775 | nodes in the 1776 | \begin_inset Formula $l^{\text{th}}$ 1777 | \end_inset 1778 | 1779 | layer. 1780 | 1781 | \end_layout 1782 | 1783 | \begin_layout Enumerate 1784 | \begin_inset Formula $L$ 1785 | \end_inset 1786 | 1787 | layers, where L is output layer and data is 0th layer. 1788 | 1789 | \end_layout 1790 | 1791 | \begin_layout Enumerate 1792 | \begin_inset Formula $x_{j}^{(l)}=\theta(s_{j}^{(l)})$ 1793 | \end_inset 1794 | 1795 | is the output of a neuron. 1796 | It's the activation function applied to the input signal. 1797 | 1798 | \begin_inset Formula $s_{j}^{(l)}=\sum_{i}w_{ij}^{(l)}x_{i}^{(l-1)}$ 1799 | \end_inset 1800 | 1801 | 1802 | \end_layout 1803 | 1804 | \begin_layout Enumerate 1805 | \begin_inset Formula $e(w)$ 1806 | \end_inset 1807 | 1808 | is the error as a function of the weights 1809 | \end_layout 1810 | 1811 | \begin_layout Standard 1812 | 1813 | \bar under 1814 | The goal is to learn the weights 1815 | \begin_inset Formula $w_{ij}^{(l)}$ 1816 | \end_inset 1817 | 1818 | . 
1819 | 1820 | \bar default 1821 | We use gradient descent, but error function is non-convex so we tend to 1822 | local minima. 1823 | The naive version takes 1824 | \begin_inset Formula $O(w^{2})$ 1825 | \end_inset 1826 | 1827 | . 1828 | 1829 | \bar under 1830 | Back propagation 1831 | \bar default 1832 | , an algorithm for efficient computation of the gradient, takes 1833 | \begin_inset Formula $O(w)$ 1834 | \end_inset 1835 | 1836 | . 1837 | \end_layout 1838 | 1839 | \begin_layout Standard 1840 | \begin_inset Formula $\nabla e(w)\rightarrow\frac{\partial e(w)}{\partial w_{ij}^{(l)}}=\frac{\partial e(w)}{\partial s_{j}^{(l)}}\frac{\partial s_{j}^{(l)}}{\partial w_{ij}^{(l)}}=\delta_{j}^{(l)}x_{i}^{(l-1)}$ 1841 | \end_inset 1842 | 1843 | 1844 | \end_layout 1845 | 1846 | \begin_layout Standard 1847 | Final Layer: 1848 | \begin_inset Formula $\delta_{j}^{(L)}=\frac{\partial e(w)}{\partial s_{j}^{(l)}}=\frac{\partial e(w)}{\partial x_{j}^{(L)}}\frac{\partial x_{j}^{(L)}}{\partial s_{j}^{(L)}}=e'(x_{j}^{(L)})\theta_{out}'(s_{j}^{L})$ 1849 | \end_inset 1850 | 1851 | 1852 | \end_layout 1853 | 1854 | \begin_layout Standard 1855 | 1856 | \family roman 1857 | \series medium 1858 | \shape up 1859 | \size normal 1860 | \emph off 1861 | \bar no 1862 | \strikeout off 1863 | \uuline off 1864 | \uwave off 1865 | \noun off 1866 | \color none 1867 | General: 1868 | \family default 1869 | \series default 1870 | \shape default 1871 | \size default 1872 | \bar default 1873 | \strikeout default 1874 | \uuline default 1875 | \uwave default 1876 | \noun default 1877 | \color inherit 1878 | 1879 | \begin_inset Formula $\delta_{i}^{(l-1)}=\frac{\partial e(w)}{\partial s_{i}^{(l-1)}}=\sum_{j=1}^{d^{(l)}}\frac{\partial e(w)}{\partial s_{j}^{(l)}}\times\frac{\partial s_{j}^{(l)}}{\partial x_{i}^{(l-1)}}\times\frac{\partial x_{i}^{(l-1)}}{\partial s_{i}^{(l-1)}}$ 1880 | \end_inset 1881 | 1882 | 1883 | \end_layout 1884 | 1885 | \begin_layout Standard 1886 | \begin_inset Formula $=\sum_{j=1}^{d^{(l)}}\delta_{j}^{(l)}\times w_{ij}^{(l)}\times\theta'(s_{i}^{(l-1)})$ 1887 | \end_inset 1888 | 1889 | 1890 | \end_layout 1891 | 1892 | \begin_layout Standard 1893 | \begin_inset Graphics 1894 | filename graphics/NN1.pdf 1895 | lyxscale 50 1896 | width 100col% 1897 | 1898 | \end_inset 1899 | 1900 | 1901 | \end_layout 1902 | 1903 | \begin_layout Section 1904 | Unsupervised Learning 1905 | \end_layout 1906 | 1907 | \begin_layout Subsection 1908 | Clustering 1909 | \end_layout 1910 | 1911 | \begin_layout Standard 1912 | Unsupervised learning (no labels). 1913 | \end_layout 1914 | 1915 | \begin_layout Standard 1916 | 1917 | \series bold 1918 | Distance function 1919 | \series default 1920 | s. 1921 | Suppose we have two sets of points. 1922 | \end_layout 1923 | 1924 | \begin_layout Itemize 1925 | 1926 | \series bold 1927 | Single linkage 1928 | \series default 1929 | is minimum distance between members. 1930 | \end_layout 1931 | 1932 | \begin_layout Itemize 1933 | 1934 | \series bold 1935 | Complete linkage 1936 | \series default 1937 | is maximum distance between members. 1938 | \end_layout 1939 | 1940 | \begin_layout Itemize 1941 | 1942 | \series bold 1943 | Centroid linkage 1944 | \series default 1945 | is distance between centroids. 1946 | \end_layout 1947 | 1948 | \begin_layout Itemize 1949 | 1950 | \series bold 1951 | Average linkage 1952 | \series default 1953 | is average distance between all pairs. 
1954 | \end_layout 1955 | 1956 | \begin_layout Standard 1957 | 1958 | \series bold 1959 | Hierarchical 1960 | \series default 1961 | : 1962 | \end_layout 1963 | 1964 | \begin_layout Itemize 1965 | 1966 | \bar under 1967 | Agglomerative 1968 | \bar default 1969 | : Start with n points, merge 2 closest clusters using some measure, such 1970 | as: Single-link (closest pair), Complete-link (furthest pair), Average-link 1971 | (average of all pairs), Centroid (centroid distance). 1972 | \begin_inset Newline newline 1973 | \end_inset 1974 | 1975 | Note: SL and CL are sensitive to outliers. 1976 | \end_layout 1977 | 1978 | \begin_layout Itemize 1979 | 1980 | \bar under 1981 | Divisive 1982 | \bar default 1983 | : Start with single cluster, recursively divide clusters into 2 subclusters. 1984 | 1985 | \end_layout 1986 | 1987 | \begin_layout Standard 1988 | 1989 | \series bold 1990 | Partitioning 1991 | \series default 1992 | : Partition the data into a K mutually exclusive exhaustive groups (i.e. 1993 | encode k=C(i)). 1994 | Iteratively reallocate to minimize some loss function. 1995 | Finding the correct partitions is hard. 1996 | Use a greedy algorithm called K-means (coordinate decent). 1997 | Loss function is non-convex thus we find local minima. 1998 | \end_layout 1999 | 2000 | \begin_layout Itemize 2001 | 2002 | \series bold 2003 | K-means 2004 | \series default 2005 | : Choose clusters at random, calculate centroid of each cluster, reallocate 2006 | objects to nearest centroid, repeat. 2007 | 2008 | \bar under 2009 | Works with: spherical, well-separated clusters of similar volumes and count. 2010 | \end_layout 2011 | 2012 | \begin_layout Itemize 2013 | 2014 | \series bold 2015 | K-means 2016 | \series default 2017 | ++: Initialize clusters one by one. 2018 | D(x) = distance of point x to nearest cluster. 2019 | Pr(x is new cluster center) 2020 | \begin_inset Formula $\propto D(x)^{2}$ 2021 | \end_inset 2022 | 2023 | 2024 | \end_layout 2025 | 2026 | \begin_layout Itemize 2027 | 2028 | \series bold 2029 | K-medians 2030 | \series default 2031 | : Works with arbitrary distance/dissimilarity metric, the centers 2032 | \begin_inset Formula $\mu_{k}$ 2033 | \end_inset 2034 | 2035 | are represented by data points. 2036 | Is more restrictive thus has higher loss. 2037 | \end_layout 2038 | 2039 | \begin_layout Standard 2040 | 2041 | \series bold 2042 | General Loss 2043 | \series default 2044 | : 2045 | \begin_inset Formula $\sum_{n=1}^{N}\sum_{k=1}^{K}d(x_{n},\mu_{k})r_{nk}$ 2046 | \end_inset 2047 | 2048 | where 2049 | \begin_inset Formula $r_{nk}=1$ 2050 | \end_inset 2051 | 2052 | if 2053 | \begin_inset Formula $x_{n}$ 2054 | \end_inset 2055 | 2056 | is in cluster k, and 0 o.w. 2057 | \end_layout 2058 | 2059 | \begin_layout Subsection 2060 | Vector Quantization 2061 | \end_layout 2062 | 2063 | \begin_layout Standard 2064 | Use clustering to find representative prototype vectors, which are used 2065 | to simplify representations of signals. 2066 | \end_layout 2067 | 2068 | \begin_layout Subsection 2069 | Parametric Density Estimation 2070 | \end_layout 2071 | 2072 | \begin_layout Standard 2073 | 2074 | \series bold 2075 | Mixture Models. 2076 | 2077 | \series default 2078 | Assume PDF is made up of multiple gaussians with different centers. 2079 | 2080 | \begin_inset Formula $P(x)=\sum_{i=1}^{n_{c}}P(c_{i})P(x|c_{i})$ 2081 | \end_inset 2082 | 2083 | with objective function as log likelihood of data. 2084 | Use 2085 | \series bold 2086 | EM 2087 | \series default 2088 | to estimate this model. 
2089 | 2090 | \begin_inset Newline newline 2091 | \end_inset 2092 | 2093 | E Step: 2094 | \begin_inset Formula $P(\mu_{i}|x_{k})=\frac{P(\mu_{i})P(x_{k}|\mu_{i})}{\sum_{j}P(\mu_{j})P(x_{k}|\mu_{j})}$ 2095 | \end_inset 2096 | 2097 | 2098 | \begin_inset Newline newline 2099 | \end_inset 2100 | 2101 | M Step: 2102 | \begin_inset Formula $P(c_{i})=\frac{1}{n_{e}}\sum_{k=1}^{n_{e}}P(\mu_{i}|x_{k})$ 2103 | \end_inset 2104 | 2105 | 2106 | \begin_inset Newline newline 2107 | \end_inset 2108 | 2109 | 2110 | \begin_inset Formula $\mu_{i}=\frac{\sum_{k}x_{k}P(\mu_{i}|x_{k})}{\sum_{k}P(\mu_{i}|x_{k})}$ 2111 | \end_inset 2112 | 2113 | 2114 | \begin_inset Newline newline 2115 | \end_inset 2116 | 2117 | 2118 | \begin_inset Formula $\sigma_{i}^{2}=\frac{\sum_{k}(x_{k}-\mu_{i})^{2}P(\mu_{i}|x_{k})}{\sum_{k}P(\mu_{i}|x_{k})}$ 2119 | \end_inset 2120 | 2121 | . 2122 | 2123 | \end_layout 2124 | 2125 | \begin_layout Subsection 2126 | Non-parametric Density Estimation 2127 | \end_layout 2128 | 2129 | \begin_layout Standard 2130 | Can use 2131 | \series bold 2132 | Histogram 2133 | \series default 2134 | or Kernel Density Estimation (KDE). 2135 | \end_layout 2136 | 2137 | \begin_layout Standard 2138 | 2139 | \series bold 2140 | KDE 2141 | \series default 2142 | : 2143 | \begin_inset Formula $P(x)=\frac{1}{n}\sum K({\bf x}-{\bf x_{i}})$ 2144 | \end_inset 2145 | 2146 | is a function of the data. 2147 | \end_layout 2148 | 2149 | \begin_layout Standard 2150 | The kernel K has the following properties: 2151 | \begin_inset Newline newline 2152 | \end_inset 2153 | 2154 | Symmetric, Normalized 2155 | \begin_inset Formula $\int_{\mathbb{R}^{d}}K(x)dx=1$ 2156 | \end_inset 2157 | 2158 | , and 2159 | \begin_inset Formula $\lim_{||x||\rightarrow\infty}||x||^{d}K(x)=0$ 2160 | \end_inset 2161 | 2162 | . 2163 | \end_layout 2164 | 2165 | \begin_layout Standard 2166 | The 2167 | \bar under 2168 | bandwidth 2169 | \bar default 2170 | is the width of the kernel function. 2171 | Too small = jagged results, too large = smoothed out results. 2172 | \end_layout 2173 | 2174 | \begin_layout Subsection 2175 | 2176 | \series bold 2177 | Principal Component Analysis 2178 | \end_layout 2179 | 2180 | \begin_layout Standard 2181 | First run 2182 | \series bold 2183 | singular value decomposition 2184 | \series default 2185 | on 2186 | \series bold 2187 | 2188 | \series default 2189 | pattern matrix 2190 | \begin_inset Formula $X$ 2191 | \end_inset 2192 | 2193 | : 2194 | \end_layout 2195 | 2196 | \begin_layout Enumerate 2197 | Subtract mean from each point 2198 | \end_layout 2199 | 2200 | \begin_layout Enumerate 2201 | (Sometimes) scale each dimension by its variance 2202 | \end_layout 2203 | 2204 | \begin_layout Enumerate 2205 | Compute covariance 2206 | \begin_inset Formula $\Sigma=X^{T}X$ 2207 | \end_inset 2208 | 2209 | (must be symmetric) 2210 | \end_layout 2211 | 2212 | \begin_layout Enumerate 2213 | Compute eigenvectors/values 2214 | \begin_inset Formula $\Sigma=VSV^{\intercal}$ 2215 | \end_inset 2216 | 2217 | (spectral thm) 2218 | \end_layout 2219 | 2220 | \begin_layout Enumerate 2221 | Get back 2222 | \begin_inset Formula $X=XVV^{\intercal}=(XV)V^{\intercal}=USV^{\intercal}$ 2223 | \end_inset 2224 | 2225 | 2226 | \end_layout 2227 | 2228 | \begin_layout Standard 2229 | \begin_inset Formula $S$ 2230 | \end_inset 2231 | 2232 | contains the variances of the transformed features. 2233 | The larger the 2234 | \begin_inset Formula $S_{ii}$ 2235 | \end_inset 2236 | 2237 | , the larger the variance of that feature.
2238 | We want the 2239 | \begin_inset Formula $k$ 2240 | \end_inset 2241 | 2242 | largest features, so we find the indices of the 2243 | \begin_inset Formula $k$ 2244 | \end_inset 2245 | 2246 | largest items in 2247 | \begin_inset Formula $S$ 2248 | \end_inset 2249 | 2250 | and we keep only these entries in 2251 | \begin_inset Formula $U$ 2252 | \end_inset 2253 | 2254 | and 2255 | \begin_inset Formula $V$ 2256 | \end_inset 2257 | 2258 | . 2259 | \end_layout 2260 | 2261 | \begin_layout Standard 2262 | \begin_inset VSpace vfill 2263 | \end_inset 2264 | 2265 | 2266 | \end_layout 2267 | 2268 | \begin_layout Standard 2269 | \begin_inset ERT 2270 | status open 2271 | 2272 | \begin_layout Plain Layout 2273 | 2274 | 2275 | \backslash 2276 | end{multicols} 2277 | \end_layout 2278 | 2279 | \end_inset 2280 | 2281 | 2282 | \end_layout 2283 | 2284 | \begin_layout Standard 2285 | \begin_inset Newpage newpage 2286 | \end_inset 2287 | 2288 | 2289 | \end_layout 2290 | 2291 | \begin_layout Standard 2292 | \begin_inset ERT 2293 | status open 2294 | 2295 | \begin_layout Plain Layout 2296 | 2297 | 2298 | \backslash 2299 | mytitle{CS 189 ALL OF IT}{Che Yeon, Chloe, Dhruv, Li, Sean} 2300 | \end_layout 2301 | 2302 | \begin_layout Plain Layout 2303 | 2304 | 2305 | \backslash 2306 | begin{multicols}{4} 2307 | \end_layout 2308 | 2309 | \end_inset 2310 | 2311 | 2312 | \end_layout 2313 | 2314 | \begin_layout Section 2315 | Past Exam Questions 2316 | \end_layout 2317 | 2318 | \begin_layout Standard 2319 | \begin_inset ERT 2320 | status collapsed 2321 | 2322 | \begin_layout Plain Layout 2323 | 2324 | 2325 | \backslash 2326 | bgroup 2327 | \end_layout 2328 | 2329 | \begin_layout Plain Layout 2330 | 2331 | 2332 | \backslash 2333 | renewcommand 2334 | \backslash 2335 | theenumi{( 2336 | \backslash 2337 | alph{enumi})} 2338 | \end_layout 2339 | 2340 | \begin_layout Plain Layout 2341 | 2342 | 2343 | \backslash 2344 | renewcommand 2345 | \backslash 2346 | labelenumi{ 2347 | \backslash 2348 | theenumi} 2349 | \end_layout 2350 | 2351 | \end_inset 2352 | 2353 | 2354 | \end_layout 2355 | 2356 | \begin_layout Subsection 2357 | Spring 2013 Midterm 2358 | \end_layout 2359 | 2360 | \begin_layout Enumerate 2361 | 2362 | \bar under 2363 | False: 2364 | \bar default 2365 | In SVMs, we maximize 2366 | \begin_inset Formula $\frac{\left\Vert w\right\Vert ^{2}}{2}$ 2367 | \end_inset 2368 | 2369 | subject to the margin constraints. 2370 | \end_layout 2371 | 2372 | \begin_layout Enumerate 2373 | 2374 | \bar under 2375 | False: 2376 | \bar default 2377 | In kernelized SVMS, the kernel matrix 2378 | \begin_inset Formula $K$ 2379 | \end_inset 2380 | 2381 | has to be positive definite. 2382 | \end_layout 2383 | 2384 | \begin_layout Enumerate 2385 | 2386 | \bar under 2387 | True: 2388 | \bar default 2389 | If two random variables are independent, then they have to be uncorrelated. 2390 | \end_layout 2391 | 2392 | \begin_layout Enumerate 2393 | 2394 | \bar under 2395 | False: 2396 | \bar default 2397 | Isocontours of Gaussian distributions have axes whose lengths are proportional 2398 | to the eigenvalues of the covariance matrix. 2399 | \end_layout 2400 | 2401 | \begin_layout Enumerate 2402 | 2403 | \bar under 2404 | True: 2405 | \bar default 2406 | The RBF kernel 2407 | \begin_inset Formula $K\left(x_{i},x_{j}\right)=\exp\left(-\gamma\left\Vert x_{i}-x_{j}\right\Vert ^{2}\right)$ 2408 | \end_inset 2409 | 2410 | corresponds to an infinite dimensional mapping of the feature vectors. 
2411 | \end_layout 2412 | 2413 | \begin_layout Enumerate 2414 | 2415 | \bar under 2416 | True: 2417 | \bar default 2418 | If 2419 | \begin_inset Formula $(X,Y)$ 2420 | \end_inset 2421 | 2422 | are jointly Gaussian, then 2423 | \begin_inset Formula $X$ 2424 | \end_inset 2425 | 2426 | and 2427 | \begin_inset Formula $Y$ 2428 | \end_inset 2429 | 2430 | are also Gaussian distributed. 2431 | \end_layout 2432 | 2433 | \begin_layout Enumerate 2434 | 2435 | \bar under 2436 | True: 2437 | \bar default 2438 | A function f(x,y,z) is convex if the Hessian of f is positive semi-definite. 2439 | \end_layout 2440 | 2441 | \begin_layout Enumerate 2442 | 2443 | \bar under 2444 | True: 2445 | \bar default 2446 | In a least-squares linear regression problem, adding an L2 regularization 2447 | penalty cannot decrease the L2 error of the solution w on the training 2448 | data. 2449 | \end_layout 2450 | 2451 | \begin_layout Enumerate 2452 | 2453 | \bar under 2454 | True: 2455 | \bar default 2456 | In linear SVMs, the optimal weight vector w is a linear combination of 2457 | training data points. 2458 | \end_layout 2459 | 2460 | \begin_layout Enumerate 2461 | 2462 | \bar under 2463 | False: 2464 | \bar default 2465 | In stochastic gradient descent, we take steps in the exact direction of 2466 | the gradient vector. 2467 | \end_layout 2468 | 2469 | \begin_layout Enumerate 2470 | 2471 | \bar under 2472 | False: 2473 | \bar default 2474 | In a two class problem when the class conditionals 2475 | \begin_inset Formula $P\left[x\mid y=0\right]andP\left[x\mid y=1\right]$ 2476 | \end_inset 2477 | 2478 | are modeled as Gaussians with different covariance matrices, the posterior 2479 | probabilities turn out to be logistic functions. 2480 | \end_layout 2481 | 2482 | \begin_layout Enumerate 2483 | 2484 | \bar under 2485 | True: 2486 | \bar default 2487 | The perceptron training procedure is guaranteed to converge if the two 2488 | classes are linearly separable. 2489 | \end_layout 2490 | 2491 | \begin_layout Enumerate 2492 | 2493 | \bar under 2494 | False: 2495 | \bar default 2496 | The maximum likelihood estimate for the variance of a univariate Gaussian 2497 | is unbiased. 2498 | \end_layout 2499 | 2500 | \begin_layout Enumerate 2501 | 2502 | \bar under 2503 | True: 2504 | \bar default 2505 | In linear regression, using an L1 regularization penalty term results in 2506 | sparser solutions than using an L2 regularization penalty term. 2507 | 2508 | \end_layout 2509 | 2510 | \begin_layout Subsection 2511 | Spring 2013 Final 2512 | \end_layout 2513 | 2514 | \begin_layout Enumerate 2515 | 2516 | \bar under 2517 | True: 2518 | \bar default 2519 | Solving a non linear separation problem with a hard margin Kernelized SVM 2520 | (Gaussian RBF Kernel) might lead to overfitting. 2521 | \end_layout 2522 | 2523 | \begin_layout Enumerate 2524 | 2525 | \bar under 2526 | True: 2527 | \bar default 2528 | In SVMs, the sum of the Lagrange multipliers corresponding to the positive 2529 | examples is equal to the sum of the Lagrange multipliers corresponding 2530 | to the negative examples. 2531 | \end_layout 2532 | 2533 | \begin_layout Enumerate 2534 | 2535 | \bar under 2536 | False: 2537 | \bar default 2538 | SVMs directly give us the posterior probabilities 2539 | \begin_inset Formula $\mathrm{P}\left(y=1\mid x\right)$ 2540 | \end_inset 2541 | 2542 | and 2543 | \begin_inset Formula $\mathrm{P}\left(y=−1\mid x\right)$ 2544 | \end_inset 2545 | 2546 | . 
2547 | \end_layout 2548 | 2549 | \begin_layout Enumerate 2550 | 2551 | \bar under 2552 | False: 2553 | \bar default 2554 | 2555 | \begin_inset Formula $V(X)=\mathrm{E}[X]^{2}−\mathrm{E}[X^{2}]$ 2556 | \end_inset 2557 | 2558 | 2559 | \end_layout 2560 | 2561 | \begin_layout Enumerate 2562 | 2563 | \bar under 2564 | True: 2565 | \bar default 2566 | In the discriminative approach to solving classification problems, we model 2567 | the conditional probability of the labels given the observations. 2568 | \end_layout 2569 | 2570 | \begin_layout Enumerate 2571 | 2572 | \bar under 2573 | False: 2574 | \bar default 2575 | In a two class classification problem, a point on the Bayes optimal decision 2576 | boundary x* always satisfies 2577 | \begin_inset Formula $\mathrm{P}\left[y=1\mid x*\right]=\mathrm{P}\left[y=0\mid x*\right]$ 2578 | \end_inset 2579 | 2580 | . 2581 | \end_layout 2582 | 2583 | \begin_layout Enumerate 2584 | 2585 | \bar under 2586 | True: 2587 | \bar default 2588 | Any linear combination of the components of a multivariate Gaussian is 2589 | a univariate Gaussian. 2590 | \end_layout 2591 | 2592 | \begin_layout Enumerate 2593 | 2594 | \bar under 2595 | False: 2596 | \bar default 2597 | For any two random variables 2598 | \begin_inset Formula $X\sim N\left(\mu_{1},\sigma_{1}^{2}\right)$ 2599 | \end_inset 2600 | 2601 | and 2602 | \begin_inset Formula $Y\sim\mathcal{N}\left(\mu_{2},\sigma_{2}^{2}\right)$ 2603 | \end_inset 2604 | 2605 | , 2606 | \begin_inset Formula $X+Y\sim\mathcal{N}\left(\mu_{1}+\mu_{2},\sigma_{1}^{2}+\sigma_{2}^{2}\right)$ 2607 | \end_inset 2608 | 2609 | . 2610 | \end_layout 2611 | 2612 | \begin_layout Enumerate 2613 | 2614 | \bar under 2615 | False: 2616 | \bar default 2617 | For a logistic regression problem differing initialization points can lead 2618 | to a much better optimum. 2619 | \end_layout 2620 | 2621 | \begin_layout Enumerate 2622 | 2623 | \bar under 2624 | False: 2625 | \bar default 2626 | In logistic regression, we model the odds ratio 2627 | \begin_inset Formula $\frac{p}{1-p}$ 2628 | \end_inset 2629 | 2630 | as a linear function. 2631 | \end_layout 2632 | 2633 | \begin_layout Enumerate 2634 | 2635 | \bar under 2636 | True: 2637 | \bar default 2638 | Random forests can be used to classify infinite dimensional data. 2639 | \end_layout 2640 | 2641 | \begin_layout Enumerate 2642 | 2643 | \bar under 2644 | False: 2645 | \bar default 2646 | In boosting we start with a Gaussian weight distribution over the training 2647 | samples. 2648 | \end_layout 2649 | 2650 | \begin_layout Enumerate 2651 | 2652 | \bar under 2653 | False: 2654 | \bar default 2655 | In Adaboost, the error of each hypothesis is calculated by the ratio of 2656 | misclassified examples to the total number of examples. 2657 | \end_layout 2658 | 2659 | \begin_layout Enumerate 2660 | 2661 | \bar under 2662 | True: 2663 | \bar default 2664 | When 2665 | \begin_inset Formula $k=1$ 2666 | \end_inset 2667 | 2668 | and 2669 | \begin_inset Formula $N\rightarrow\infty$ 2670 | \end_inset 2671 | 2672 | , the kNN classification rate is bounded above by twice the Bayes error 2673 | rate. 2674 | \end_layout 2675 | 2676 | \begin_layout Enumerate 2677 | 2678 | \bar under 2679 | True: 2680 | \bar default 2681 | A single layer neural network with a sigmoid activation for binary classificati 2682 | on with the cross entropy loss is exactly equivalent to logistic regression. 
2683 | \end_layout 2684 | 2685 | \begin_layout Enumerate 2686 | 2687 | \bar under 2688 | True: 2689 | \bar default 2690 | Convolution is a linear operation i.e. 2691 | 2692 | \begin_inset Formula $\left(\alpha f_{1}+\beta f_{2}\right)\ast g=\alpha f_{1}\ast g+\beta f_{2}\ast g$ 2693 | \end_inset 2694 | 2695 | . 2696 | \end_layout 2697 | 2698 | \begin_layout Enumerate 2699 | 2700 | \bar under 2701 | True: 2702 | \bar default 2703 | The k-means algorithm does coordinate descent on a non-convex objective 2704 | function. 2705 | \end_layout 2706 | 2707 | \begin_layout Enumerate 2708 | 2709 | \bar under 2710 | True: 2711 | \bar default 2712 | A 1-NN classifier has higher variance than a 3-NN classifier. 2713 | \end_layout 2714 | 2715 | \begin_layout Enumerate 2716 | 2717 | \bar under 2718 | False: 2719 | \bar default 2720 | The single link agglomerative clustering algorithm groups two clusters 2721 | on the basis of the maximum distance between points in the two clusters. 2722 | \end_layout 2723 | 2724 | \begin_layout Enumerate 2725 | 2726 | \bar under 2727 | False: 2728 | \bar default 2729 | The largest eigenvector of the covariance matrix is the direction of minimum 2730 | variance in the data. 2731 | \end_layout 2732 | 2733 | \begin_layout Enumerate 2734 | 2735 | \bar under 2736 | False: 2737 | \bar default 2738 | The eigenvectors of 2739 | \begin_inset Formula $AA^{T}$ 2740 | \end_inset 2741 | 2742 | and 2743 | \begin_inset Formula $A^{T}A$ 2744 | \end_inset 2745 | 2746 | are the same. 2747 | \end_layout 2748 | 2749 | \begin_layout Enumerate 2750 | 2751 | \bar under 2752 | True: 2753 | \bar default 2754 | The non-zero eigenvalues of 2755 | \begin_inset Formula $AA^{T}$ 2756 | \end_inset 2757 | 2758 | and 2759 | \begin_inset Formula $A^{T}A$ 2760 | \end_inset 2761 | 2762 | are the same. 2763 | \end_layout 2764 | 2765 | \begin_layout Standard 2766 | \begin_inset Phantom Phantom 2767 | status open 2768 | 2769 | \begin_layout Plain Layout 2770 | 2771 | \end_layout 2772 | 2773 | \end_inset 2774 | 2775 | 2776 | \end_layout 2777 | 2778 | \begin_layout Enumerate 2779 | In linear regression, the irreducible error is 2780 | \bar under 2781 | 2782 | \begin_inset Formula $\sigma^{2}$ 2783 | \end_inset 2784 | 2785 | 2786 | \bar default 2787 | and 2788 | \begin_inset Formula $\boxed{E\left[\left(y-\mathrm{E}(y\mid x)\right)^{^{2}}\right]}$ 2789 | \end_inset 2790 | 2791 | . 2792 | \end_layout 2793 | 2794 | \begin_layout Enumerate 2795 | Let 2796 | \begin_inset Formula $S_{1}$ 2797 | \end_inset 2798 | 2799 | and 2800 | \begin_inset Formula $S_{2}$ 2801 | \end_inset 2802 | 2803 | be the support vectors for 2804 | \begin_inset Formula $w_{1}$ 2805 | \end_inset 2806 | 2807 | (hard margin) and 2808 | \begin_inset Formula $w_{2}$ 2809 | \end_inset 2810 | 2811 | (soft margin). 2812 | Then 2813 | \bar under 2814 | 2815 | \begin_inset Formula $S_{1}$ 2816 | \end_inset 2817 | 2818 | may not be a subset of 2819 | \begin_inset Formula $S_{2}$ 2820 | \end_inset 2821 | 2822 | 2823 | \bar default 2824 | and 2825 | \bar under 2826 | 2827 | \begin_inset Formula $w_{1}$ 2828 | \end_inset 2829 | 2830 | may not be equal to 2831 | \begin_inset Formula $w_{2}$ 2832 | \end_inset 2833 | 2834 | 2835 | \bar default 2836 | . 2837 | \end_layout 2838 | 2839 | \begin_layout Enumerate 2840 | Ordinary least square regression assumes each data point is generated according 2841 | to a linear function of the input plus 2842 | \begin_inset Formula $\mathcal{N}(0,\sigma)$ 2843 | \end_inset 2844 | 2845 | noise. 
2846 | In many systems, the noise variance is a positive linear function of the 2847 | input. 2848 | In this case, the probability model that describes this situation is 2849 | \begin_inset Formula $\boxed{\ensuremath{P(y|x)=\frac{1}{\sigma\sqrt{2\pi x}}\exp(-\frac{(y-(w_{0}+w_{1}x))^{2}}{2x\sigma^{2}}}}$ 2850 | \end_inset 2851 | 2852 | . 2853 | \end_layout 2854 | 2855 | \begin_layout Enumerate 2856 | Averaging the outputs of multiple decision trees helps 2857 | \bar under 2858 | reduce variance 2859 | \bar default 2860 | . 2861 | \end_layout 2862 | 2863 | \begin_layout Enumerate 2864 | The following loss functions are convex: 2865 | \bar under 2866 | logistic 2867 | \bar default 2868 | , 2869 | \bar under 2870 | hinge 2871 | \bar default 2872 | , 2873 | \bar under 2874 | exponential 2875 | \bar default 2876 | . 2877 | 2878 | \bar under 2879 | Misclassification loss is not. 2880 | \end_layout 2881 | 2882 | \begin_layout Enumerate 2883 | 2884 | \bar under 2885 | Bias will be smaller 2886 | \bar default 2887 | and 2888 | \bar under 2889 | variance will be larger 2890 | \bar default 2891 | for trees of 2892 | \bar under 2893 | smaller depth 2894 | \bar default 2895 | . 2896 | \end_layout 2897 | 2898 | \begin_layout Enumerate 2899 | If making a tree with 2900 | \begin_inset Formula $k$ 2901 | \end_inset 2902 | 2903 | -ary splits, 2904 | \bar under 2905 | the algorithm will prefer high values of 2906 | \begin_inset Formula $k$ 2907 | \end_inset 2908 | 2909 | 2910 | \bar default 2911 | and 2912 | \bar under 2913 | there will be 2914 | \begin_inset Formula $k-1$ 2915 | \end_inset 2916 | 2917 | thresholds for a 2918 | \begin_inset Formula $k$ 2919 | \end_inset 2920 | 2921 | -ary split 2922 | \bar default 2923 | . 2924 | \end_layout 2925 | 2926 | \begin_layout Standard 2927 | \begin_inset VSpace vfill 2928 | \end_inset 2929 | 2930 | 2931 | \end_layout 2932 | 2933 | \begin_layout Standard 2934 | \begin_inset ERT 2935 | status open 2936 | 2937 | \begin_layout Plain Layout 2938 | 2939 | 2940 | \backslash 2941 | columnbreak 2942 | \end_layout 2943 | 2944 | \end_inset 2945 | 2946 | 2947 | \end_layout 2948 | 2949 | \begin_layout Subsection 2950 | Spring 2014 Final 2951 | \end_layout 2952 | 2953 | \begin_layout Enumerate 2954 | 2955 | \bar under 2956 | False: 2957 | \bar default 2958 | The singular value decomposition of a real matrix is unique. 2959 | \end_layout 2960 | 2961 | \begin_layout Enumerate 2962 | 2963 | \bar under 2964 | True: 2965 | \bar default 2966 | A multiple-layer neural network with linear activation functions is equivalent 2967 | to one single-layer perceptron that uses the same error function on the 2968 | output layer and has the same number of inputs. 2969 | \end_layout 2970 | 2971 | \begin_layout Enumerate 2972 | 2973 | \bar under 2974 | False: 2975 | \bar default 2976 | The maximum likelihood estimator for the parameter 2977 | \begin_inset Formula $\theta$ 2978 | \end_inset 2979 | 2980 | of a uniform distribution over 2981 | \begin_inset Formula $[0,\theta]$ 2982 | \end_inset 2983 | 2984 | is unbiased. 2985 | \end_layout 2986 | 2987 | \begin_layout Enumerate 2988 | 2989 | \bar under 2990 | True: 2991 | \bar default 2992 | The k-means algorithm for clustering is guaranteed to converge to a local 2993 | optimum. 2994 | \end_layout 2995 | 2996 | \begin_layout Enumerate 2997 | 2998 | \bar under 2999 | True: 3000 | \bar default 3001 | Increasing the depth of a decision tree cannot increase its training error. 
3002 | \end_layout 3003 | 3004 | \begin_layout Enumerate 3005 | 3006 | \bar under 3007 | False: 3008 | \bar default 3009 | There exists a one-to-one feature mapping 3010 | \begin_inset Formula $\phi$ 3011 | \end_inset 3012 | 3013 | for every valid kernel k. 3014 | \end_layout 3015 | 3016 | \begin_layout Enumerate 3017 | 3018 | \bar under 3019 | True: 3020 | \bar default 3021 | For high-dimensional data data, k-d trees can be slower than brute force 3022 | nearest neighbor search. 3023 | \end_layout 3024 | 3025 | \begin_layout Enumerate 3026 | 3027 | \bar under 3028 | True: 3029 | \bar default 3030 | If we had infinite data and infinitely fast computers, kNN would be the 3031 | only algorithm we would study in CS 189. 3032 | \end_layout 3033 | 3034 | \begin_layout Enumerate 3035 | 3036 | \bar under 3037 | True: 3038 | \bar default 3039 | For datasets with high label noise (many data points with incorrect labels, 3040 | random forests would generally perform better than boosted decision trees. 3041 | \end_layout 3042 | 3043 | \begin_layout Standard 3044 | \begin_inset Phantom Phantom 3045 | status open 3046 | 3047 | \begin_layout Plain Layout 3048 | 3049 | \end_layout 3050 | 3051 | \end_inset 3052 | 3053 | 3054 | \end_layout 3055 | 3056 | \begin_layout Enumerate 3057 | In Homework 4, you fit a logistic regression model on spam and ham data 3058 | for a Kaggle Comp. 3059 | Assume you had a very good score on the public test set, but when the GSIs 3060 | ran your model on a private test set, your score dropped a lot. 3061 | This is likely because you overfitted by submitting multiple times and 3062 | changing the following between submiss 3063 | \bar under 3064 | ions: 3065 | \begin_inset Formula $\lambda$ 3066 | \end_inset 3067 | 3068 | , your penalty term 3069 | \bar default 3070 | ; 3071 | \bar under 3072 | 3073 | \begin_inset Formula $\varepsilon$ 3074 | \end_inset 3075 | 3076 | , your convergence criterion 3077 | \bar default 3078 | ; 3079 | \bar under 3080 | your step size 3081 | \bar default 3082 | ; 3083 | \bar under 3084 | fixing a random bug 3085 | \bar default 3086 | . 3087 | \end_layout 3088 | 3089 | \begin_layout Enumerate 3090 | Given 3091 | \begin_inset Formula $d$ 3092 | \end_inset 3093 | 3094 | -dimensional data 3095 | \begin_inset Formula $\{x_{i}\}_{i=1}^{N}$ 3096 | \end_inset 3097 | 3098 | , you run principal component analysis and pick 3099 | \begin_inset Formula $P$ 3100 | \end_inset 3101 | 3102 | principal components. 3103 | Can you always reconstruct any data point 3104 | \emph on 3105 | 3106 | \begin_inset Formula $x_{i}$ 3107 | \end_inset 3108 | 3109 | 3110 | \emph default 3111 | for 3112 | \begin_inset Formula $i$ 3113 | \end_inset 3114 | 3115 | from 3116 | \begin_inset Formula $1$ 3117 | \end_inset 3118 | 3119 | to 3120 | \begin_inset Formula $N$ 3121 | \end_inset 3122 | 3123 | from the 3124 | \begin_inset Formula $P$ 3125 | \end_inset 3126 | 3127 | principal components with zero reconstruction error? 3128 | \bar under 3129 | Yes, if 3130 | \begin_inset Formula $P=d$ 3131 | \end_inset 3132 | 3133 | . 3134 | \end_layout 3135 | 3136 | \begin_layout Enumerate 3137 | Putting a standard Gaussian prior on the weights for linear regression 3138 | \begin_inset Formula $(w\sim N(0,I))$ 3139 | \end_inset 3140 | 3141 | will result in what type of posterior distribution on the weights? 3142 | \bar under 3143 | Gaussian. 
3144 | \end_layout 3145 | 3146 | \begin_layout Enumerate 3147 | Suppose we have 3148 | \begin_inset Formula $N$ 3149 | \end_inset 3150 | 3151 | instances of d-dimensional data. 3152 | Let 3153 | \begin_inset Formula $h$ 3154 | \end_inset 3155 | 3156 | be the amount of data storage necessary for a histogram with a fixed number 3157 | of ticks per axis, and let 3158 | \begin_inset Formula $k$ 3159 | \end_inset 3160 | 3161 | be the amount of data storage necessary for kernel density estimation. 3162 | Which of the following is true about 3163 | \begin_inset Formula $h$ 3164 | \end_inset 3165 | 3166 | and 3167 | \begin_inset Formula $k$ 3168 | \end_inset 3169 | 3170 | ? 3171 | \bar under 3172 | 3173 | \begin_inset Formula $h$ 3174 | \end_inset 3175 | 3176 | grows exponentially with 3177 | \bar default 3178 | 3179 | \begin_inset Formula $d$ 3180 | \end_inset 3181 | 3182 | , and 3183 | \bar under 3184 | 3185 | \begin_inset Formula $k$ 3186 | \end_inset 3187 | 3188 | grows linearly with 3189 | \begin_inset Formula $N$ 3190 | \end_inset 3191 | 3192 | 3193 | \bar default 3194 | . 3195 | \end_layout 3196 | 3197 | \begin_layout Enumerate 3198 | John just trained a decision tree for a digit recognition. 3199 | He notices an extremely low training error, but an abnormally large test 3200 | error. 3201 | He also notices that an SVM with a linear kernel performs much better than 3202 | his tree. 3203 | What could be the cause of his problem? 3204 | \bar under 3205 | Decision tree is too deep 3206 | \bar default 3207 | ; 3208 | \bar under 3209 | decision tree is overfitting 3210 | \bar default 3211 | . 3212 | \end_layout 3213 | 3214 | \begin_layout Enumerate 3215 | John has now switched to multilayer neural networks and notices that the 3216 | training error is going down and converges to a local minimum. 3217 | Then when he test on the new data, the test error is abnormally high. 3218 | What is probably going wrong and what do you recommend him to do? 3219 | \bar under 3220 | The training data size is not large enough so collect a larger training 3221 | data and retain it 3222 | \bar default 3223 | ; 3224 | \bar under 3225 | play with learning rate and add regularization term to objective function 3226 | \bar default 3227 | ; 3228 | \bar under 3229 | use a different initialization and train the network several times and use 3230 | the average of predictions from all nets to predict test data 3231 | \bar default 3232 | ; 3233 | \bar under 3234 | use the same training data but use less hidden layers 3235 | \bar default 3236 | . 3237 | \end_layout 3238 | 3239 | \begin_layout Subsection 3240 | Spring 2015 Midterm 3241 | \end_layout 3242 | 3243 | \begin_layout Enumerate 3244 | 3245 | \bar under 3246 | True: 3247 | \bar default 3248 | If the data is not linearly separable, there is no solution to hard margin 3249 | SVM. 3250 | \end_layout 3251 | 3252 | \begin_layout Enumerate 3253 | 3254 | \bar under 3255 | True: 3256 | \bar default 3257 | logistic regression can be used for classification. 
3258 | \end_layout 3259 | 3260 | \begin_layout Enumerate 3261 | 3262 | \bar under 3263 | False: 3264 | \bar default 3265 | Two ways to prevent beta vectors from getting too large are to use a small 3266 | step size and use a small regularization value 3267 | \end_layout 3268 | 3269 | \begin_layout Enumerate 3270 | 3271 | \bar under 3272 | False: 3273 | \bar default 3274 | The L2 norm is often used because it produces sparse results, as opposed 3275 | to the L1 norm which does not 3276 | \end_layout 3277 | 3278 | \begin_layout Enumerate 3279 | 3280 | \bar under 3281 | False: 3282 | \bar default 3283 | For multivariate gaussian, the eigenvalues of the covariance matrix are 3284 | inversely proportional to the lengths of the ellipsoid axes that determine 3285 | the isocontours of the density. 3286 | \end_layout 3287 | 3288 | \begin_layout Enumerate 3289 | 3290 | \bar under 3291 | True: 3292 | \bar default 3293 | In a generative binary classification model where we assume the class condition 3294 | als are distributed as poisson and the class priors are bernoulli, the posterior 3295 | assumes a logistic form. 3296 | \end_layout 3297 | 3298 | \begin_layout Enumerate 3299 | 3300 | \bar under 3301 | False: 3302 | \bar default 3303 | MLE gives us not only a point estimate, but a distribution over the parameters 3304 | we are estimating. 3305 | \end_layout 3306 | 3307 | \begin_layout Enumerate 3308 | 3309 | \bar under 3310 | False: 3311 | \bar default 3312 | Penalized MLE and bayesian estimators for parameters are better used in 3313 | the setting of low-dimensional data with many training examples 3314 | \end_layout 3315 | 3316 | \begin_layout Enumerate 3317 | 3318 | \bar under 3319 | True: 3320 | \bar default 3321 | It is not good machine learning practice to use the test set to help adjust 3322 | the hyperparameters 3323 | \end_layout 3324 | 3325 | \begin_layout Enumerate 3326 | 3327 | \bar under 3328 | False: 3329 | \bar default 3330 | a symmetric positive semidefinite matrix always has nonnegative elements. 3331 | 3332 | \end_layout 3333 | 3334 | \begin_layout Enumerate 3335 | 3336 | \bar under 3337 | True: 3338 | \bar default 3339 | for a valid kernel function k, the corresponding feature mapping can map 3340 | a finite dimensional vector to an infinite dimensional vector 3341 | \end_layout 3342 | 3343 | \begin_layout Enumerate 3344 | 3345 | \bar under 3346 | False: 3347 | \bar default 3348 | the more features we use, the better our learning algorithm will generalize 3349 | to new data points. 3350 | \end_layout 3351 | 3352 | \begin_layout Enumerate 3353 | 3354 | \bar under 3355 | True: 3356 | \bar default 3357 | a discriminative classifier explicitly models 3358 | \begin_inset Formula $\mathrm{P}\left(Y\mid X\right)$ 3359 | \end_inset 3360 | 3361 | . 3362 | \end_layout 3363 | 3364 | \begin_layout Standard 3365 | \begin_inset Phantom Phantom 3366 | status open 3367 | 3368 | \begin_layout Plain Layout 3369 | 3370 | \end_layout 3371 | 3372 | \end_inset 3373 | 3374 | 3375 | \end_layout 3376 | 3377 | \begin_layout Enumerate 3378 | You can use kernels with 3379 | \bar under 3380 | SVM 3381 | \bar default 3382 | and 3383 | \bar under 3384 | perceptron 3385 | \bar default 3386 | . 3387 | \end_layout 3388 | 3389 | \begin_layout Enumerate 3390 | Cross validation is used to select hyperparameters. 3391 | It prevents overfitting, but is not guaranteed to prevent it. 
3392 | \end_layout 3393 | 3394 | \begin_layout Enumerate 3395 | L2 regularization is equivalent to imposing a Gaussian prior in linear regressio 3396 | n. 3397 | \end_layout 3398 | 3399 | \begin_layout Enumerate 3400 | If we have 2 two-dimensional Gaussians, the same covariance matrix for both 3401 | will result in a linear decision boundary. 3402 | \end_layout 3403 | 3404 | \begin_layout Enumerate 3405 | The normal equations can be derived from minimizing empirical risk, assuming 3406 | normally distributed noise, and assuming 3407 | \begin_inset Formula $\mathrm{P}(Y\mid X)$ 3408 | \end_inset 3409 | 3410 | is distributed normally with mean $B^Tx$ and variance 3411 | \begin_inset Formula $\sigma^{2}$ 3412 | \end_inset 3413 | 3414 | . 3415 | \end_layout 3416 | 3417 | \begin_layout Enumerate 3418 | Logistic regression can be motivated from 3419 | \bar under 3420 | log odds equated to an affine function of x 3421 | \bar default 3422 | and 3423 | \bar under 3424 | generative models with gaussian class conditionals 3425 | \bar default 3426 | . 3427 | \end_layout 3428 | 3429 | \begin_layout Enumerate 3430 | The perceptron algorithm will converge 3431 | \bar under 3432 | only if the data is linearly separable 3433 | \bar default 3434 | . 3435 | \end_layout 3436 | 3437 | \begin_layout Enumerate 3438 | 3439 | \bar under 3440 | True: 3441 | \bar default 3442 | Newton's method is typically more expensive to calculate than gradient 3443 | descent per iteration. 3444 | \bar under 3445 | 3446 | \begin_inset Newline newline 3447 | \end_inset 3448 | 3449 | True: 3450 | \bar default 3451 | for quadratic equations, Newton's method typically requires fewer iterations 3452 | than gradient descent. 3453 | \bar under 3454 | 3455 | \begin_inset Newline newline 3456 | \end_inset 3457 | 3458 | False: 3459 | \bar default 3460 | Gradient descent can be viewed as iteratively reweighted least squares. 3461 | \end_layout 3462 | 3463 | \begin_layout Enumerate 3464 | 3465 | \bar under 3466 | True: 3467 | \bar default 3468 | Complementary slackness implies that every training point that is misclassified 3469 | by a soft margin SVM is a support vector. 3470 | \bar under 3471 | 3472 | \begin_inset Newline newline 3473 | \end_inset 3474 | 3475 | True: 3476 | \bar default 3477 | When we solve the SVM with the dual problem, we need only the dot product 3478 | of 3479 | \begin_inset Formula $x_{i}$ 3480 | \end_inset 3481 | 3482 | and 3483 | \begin_inset Formula $x_{j}$ 3484 | \end_inset 3485 | 3486 | for all 3487 | \begin_inset Formula $i$ 3488 | \end_inset 3489 | 3490 | , 3491 | \begin_inset Formula $j$ 3492 | \end_inset 3493 | 3494 | . 3495 | \bar under 3496 | 3497 | \begin_inset Newline newline 3498 | \end_inset 3499 | 3500 | True: 3501 | \bar default 3502 | we use Lagrange multipliers in an optimization problem with inequality 3503 | constraints. 3504 | \end_layout 3505 | 3506 | \begin_layout Enumerate 3507 | \begin_inset Formula $\left\Vert \Phi(x)-\Phi(y)\right\Vert _{2}^{2}$ 3508 | \end_inset 3509 | 3510 | can be computed exclusively with inner products. 3511 | \begin_inset Newline newline 3512 | \end_inset 3513 | 3514 | But not 3515 | \begin_inset Formula $\left\Vert \Phi(x)-\Phi(y)\right\Vert _{1}$ 3516 | \end_inset 3517 | 3518 | norm or 3519 | \begin_inset Formula $\Phi(x)-\Phi(y)$ 3520 | \end_inset 3521 | 3522 | . 
3523 | \end_layout 3524 | 3525 | \begin_layout Enumerate 3526 | Strong duality holds for 3527 | \bar under 3528 | hard and soft margin SVM 3529 | \bar default 3530 | , but 3531 | \bar under 3532 | not constrained optimization problems 3533 | \bar default 3534 | in general. 3535 | \end_layout 3536 | 3537 | \begin_layout Standard 3538 | \begin_inset VSpace vfill 3539 | \end_inset 3540 | 3541 | 3542 | \end_layout 3543 | 3544 | \begin_layout Standard 3545 | \begin_inset ERT 3546 | status open 3547 | 3548 | \begin_layout Plain Layout 3549 | 3550 | 3551 | \backslash 3552 | columnbreak 3553 | \end_layout 3554 | 3555 | \end_inset 3556 | 3557 | 3558 | \end_layout 3559 | 3560 | \begin_layout Section 3561 | Discussion Problems 3562 | \end_layout 3563 | 3564 | \begin_layout Subsection 3565 | Discussion 9 -- Entropy 3566 | \end_layout 3567 | 3568 | \begin_layout Standard 3569 | \begin_inset Graphics 3570 | filename graphics/disc09-entropy-1.pdf 3571 | width 97col% 3572 | 3573 | \end_inset 3574 | 3575 | 3576 | \end_layout 3577 | 3578 | \begin_layout Subsection 3579 | Discussion 11 -- Skip-Layer NN 3580 | \end_layout 3581 | 3582 | \begin_layout Standard 3583 | \begin_inset Graphics 3584 | filename graphics/disc10-skipnn-1.pdf 3585 | width 97col% 3586 | 3587 | \end_inset 3588 | 3589 | 3590 | \end_layout 3591 | 3592 | \begin_layout Standard 3593 | \begin_inset Graphics 3594 | filename graphics/disc10-skipnn-2.pdf 3595 | width 97col% 3596 | 3597 | \end_inset 3598 | 3599 | 3600 | \end_layout 3601 | 3602 | \begin_layout Subsection 3603 | Discussion 12 -- PCA 3604 | \end_layout 3605 | 3606 | \begin_layout Standard 3607 | \begin_inset Graphics 3608 | filename graphics/disc12-pca-1.pdf 3609 | width 97col% 3610 | 3611 | \end_inset 3612 | 3613 | 3614 | \end_layout 3615 | 3616 | \begin_layout Standard 3617 | \begin_inset Graphics 3618 | filename graphics/disc12-pca-2.pdf 3619 | width 97col% 3620 | 3621 | \end_inset 3622 | 3623 | 3624 | \end_layout 3625 | 3626 | \begin_layout Standard 3627 | \begin_inset Graphics 3628 | filename graphics/disc12-pca-3.pdf 3629 | width 97col% 3630 | 3631 | \end_inset 3632 | 3633 | 3634 | \end_layout 3635 | 3636 | \begin_layout Standard 3637 | \begin_inset Graphics 3638 | filename graphics/disc12-pca-4.pdf 3639 | width 97col% 3640 | 3641 | \end_inset 3642 | 3643 | 3644 | \end_layout 3645 | 3646 | \begin_layout Standard 3647 | \begin_inset ERT 3648 | status collapsed 3649 | 3650 | \begin_layout Plain Layout 3651 | 3652 | 3653 | \backslash 3654 | egroup 3655 | \end_layout 3656 | 3657 | \end_inset 3658 | 3659 | 3660 | \end_layout 3661 | 3662 | \begin_layout Standard 3663 | \begin_inset ERT 3664 | status open 3665 | 3666 | \begin_layout Plain Layout 3667 | 3668 | 3669 | \backslash 3670 | columnbreak 3671 | \end_layout 3672 | 3673 | \end_inset 3674 | 3675 | 3676 | \end_layout 3677 | 3678 | \begin_layout Section 3679 | Minicards 3680 | \end_layout 3681 | 3682 | \begin_layout Standard 3683 | 3684 | \series bold 3685 | \begin_inset Box Boxed 3686 | position "t" 3687 | hor_pos "c" 3688 | has_inner_box 1 3689 | inner_pos "t" 3690 | use_parbox 1 3691 | use_makebox 0 3692 | width "97col%" 3693 | special "none" 3694 | height "1in" 3695 | height_special "totalheight" 3696 | status open 3697 | 3698 | \begin_layout Plain Layout 3699 | 3700 | \series bold 3701 | Gaussian distribution 3702 | \series default 3703 | [7, 8] 3704 | \end_layout 3705 | 3706 | \begin_layout Plain Layout 3707 | \begin_inset Formula $1$ 3708 | \end_inset 3709 | 3710 | -var (normal): 3711 | \begin_inset Formula 
$p(x)=\ensuremath{\frac{1}{\sigma\sqrt{2\pi}}\exp\left(-\frac{\left(x-\mu\right)^{2}}{2\sigma^{2}}\right)}$ 3712 | \end_inset 3713 | 3714 | 3715 | \end_layout 3716 | 3717 | \begin_layout Plain Layout 3718 | Multivar: 3719 | \begin_inset Formula $p(x)=\frac{1}{\sqrt{\left|\Sigma\right|}\sqrt{2\pi}^{d}}\exp\left(-\frac{1}{2}\left(x-\mu\right)^{\intercal}\Sigma^{-1}\left(x-\mu\right)\right)$ 3720 | \end_inset 3721 | 3722 | 3723 | \end_layout 3724 | 3725 | \end_inset 3726 | 3727 | 3728 | \end_layout 3729 | 3730 | \begin_layout Standard 3731 | \begin_inset CommandInset line 3732 | LatexCommand rule 3733 | offset "0.5ex" 3734 | width "100col%" 3735 | height "1pt" 3736 | 3737 | \end_inset 3738 | 3739 | 3740 | \end_layout 3741 | 3742 | \begin_layout Standard 3743 | The 3744 | \series bold 3745 | covariance 3746 | \series default 3747 | 3748 | \begin_inset Formula $\Sigma$ 3749 | \end_inset 3750 | 3751 | of variables 3752 | \begin_inset Formula $X$ 3753 | \end_inset 3754 | 3755 | is a matrix such that each entry 3756 | \begin_inset Formula $\Sigma_{ij}=\mathrm{Cov}(X_{i},X_{j})$ 3757 | \end_inset 3758 | 3759 | . 3760 | This means that the diagonal entries 3761 | \begin_inset Formula $\Sigma_{ii}=\mathrm{Var}(X_{i})$ 3762 | \end_inset 3763 | 3764 | . 3765 | If the matrix is diagonal, then the non-diagonal entries are zero, which 3766 | means all the variables 3767 | \begin_inset Formula $X_{i}$ 3768 | \end_inset 3769 | 3770 | are independent. 3771 | \end_layout 3772 | 3773 | \begin_layout Standard 3774 | It's nice to have independent variables, so we try to diagonalize non-diagonal 3775 | covariances. 3776 | \end_layout 3777 | 3778 | \begin_layout Standard 3779 | 3780 | \series bold 3781 | \begin_inset Box Boxed 3782 | position "t" 3783 | hor_pos "c" 3784 | has_inner_box 1 3785 | inner_pos "t" 3786 | use_parbox 1 3787 | use_makebox 0 3788 | width "97col%" 3789 | special "none" 3790 | height "1in" 3791 | height_special "totalheight" 3792 | status open 3793 | 3794 | \begin_layout Plain Layout 3795 | 3796 | \series bold 3797 | Spectral Theorem 3798 | \series default 3799 | [7:23] 3800 | \end_layout 3801 | 3802 | \begin_layout Enumerate 3803 | Take definition of eigenvalue/vector: 3804 | \begin_inset Formula $Ax=\lambda x$ 3805 | \end_inset 3806 | 3807 | 3808 | \end_layout 3809 | 3810 | \begin_layout Enumerate 3811 | Pack multiple eigenvalues into 3812 | \begin_inset Formula $\Lambda=\mathrm{diag}\left(\lambda_{1},\lambda_{2},\ldots,\lambda_{n}\right)$ 3813 | \end_inset 3814 | 3815 | 3816 | \begin_inset Newline newline 3817 | \end_inset 3818 | 3819 | 3820 | \begin_inset Formula $n$ 3821 | \end_inset 3822 | 3823 | eigenvalues exist iff 3824 | \begin_inset Formula $A$ 3825 | \end_inset 3826 | 3827 | is symmetric. 3828 | \end_layout 3829 | 3830 | \begin_layout Enumerate 3831 | Pack multiple eigenvectors into 3832 | \begin_inset Formula $U=\left[x_{1}\ x_{2}\ \ldots\ x_{n}\right]$ 3833 | \end_inset 3834 | 3835 | 3836 | \end_layout 3837 | 3838 | \begin_layout Enumerate 3839 | Rewrite equation using these: 3840 | \begin_inset Formula $\boxed{AU=U\Lambda\longrightarrow A=U\Lambda U'}$ 3841 | \end_inset 3842 | 3843 | . 3844 | \begin_inset Newline newline 3845 | \end_inset 3846 | 3847 | We can use this to diagonalize a symmetric 3848 | \begin_inset Formula $A$ 3849 | \end_inset 3850 | 3851 | . 
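As a quick check of the boxed identity, a worked 2-by-2 example (not from the original notes):

```latex
A=\begin{pmatrix}2 & 1\\ 1 & 2\end{pmatrix},\qquad
U=\tfrac{1}{\sqrt{2}}\begin{pmatrix}1 & 1\\ 1 & -1\end{pmatrix},\quad
\Lambda=\begin{pmatrix}3 & 0\\ 0 & 1\end{pmatrix},\qquad
U\Lambda U^{\intercal}=\begin{pmatrix}2 & 1\\ 1 & 2\end{pmatrix}=A.
```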
3852 | \end_layout 3853 | 3854 | \end_inset 3855 | 3856 | 3857 | \end_layout 3858 | 3859 | \begin_layout Standard 3860 | \begin_inset CommandInset line 3861 | LatexCommand rule 3862 | offset "0.5ex" 3863 | width "100col%" 3864 | height "1pt" 3865 | 3866 | \end_inset 3867 | 3868 | 3869 | \end_layout 3870 | 3871 | \begin_layout Standard 3872 | 3873 | \series bold 3874 | SVM-like classifiers 3875 | \series default 3876 | work with a 3877 | \bar under 3878 | boundary 3879 | \bar default 3880 | , a hyperplane (a line for 2D data) that separates two classes. 3881 | 3882 | \bar under 3883 | Support vectors 3884 | \bar default 3885 | are the point(s) closest to the boundary. 3886 | 3887 | \begin_inset Formula $\gamma$ 3888 | \end_inset 3889 | 3890 | is the 3891 | \bar under 3892 | margin 3893 | \bar default 3894 | , the distance between the boundary and the support vector(s). 3895 | The 3896 | \bar under 3897 | parameter 3898 | \begin_inset Formula $\theta$ 3899 | \end_inset 3900 | 3901 | 3902 | \bar default 3903 | is a vector. 3904 | 3905 | \begin_inset Formula $\boxed{\theta\cdot x}$ 3906 | \end_inset 3907 | 3908 | gives predictions. 3909 | About 3910 | \begin_inset Formula $\theta$ 3911 | \end_inset 3912 | 3913 | : 3914 | \end_layout 3915 | 3916 | \begin_layout Itemize 3917 | The direction of 3918 | \begin_inset Formula $\theta$ 3919 | \end_inset 3920 | 3921 | defines the boundary. 3922 | We can choose this. 3923 | \end_layout 3924 | 3925 | \begin_layout Itemize 3926 | \begin_inset Formula $\left\Vert \theta\right\Vert $ 3927 | \end_inset 3928 | 3929 | must be 3930 | \begin_inset Formula $1/\gamma$ 3931 | \end_inset 3932 | 3933 | , as restricted by 3934 | \begin_inset Formula $\forall i:y^{i}\theta\cdot x^{i}\geq1$ 3935 | \end_inset 3936 | 3937 | 3938 | \begin_inset Newline newline 3939 | \end_inset 3940 | 3941 | We cannot explicitly choose this; it depends on the boundary. 3942 | \begin_inset Newline newline 3943 | \end_inset 3944 | 3945 | This restriction is turned into a cost in soft-margin SVM. 3946 | \end_layout 3947 | 3948 | \begin_layout Standard 3949 | 3950 | \series bold 3951 | \begin_inset Box Boxed 3952 | position "t" 3953 | hor_pos "c" 3954 | has_inner_box 1 3955 | inner_pos "t" 3956 | use_parbox 1 3957 | use_makebox 0 3958 | width "97col%" 3959 | special "none" 3960 | height "1in" 3961 | height_special "totalheight" 3962 | status open 3963 | 3964 | \begin_layout Plain Layout 3965 | 3966 | \series bold 3967 | Perceptron 3968 | \series default 3969 | [2:11, 3:6] picks misclassified point and updates 3970 | \begin_inset Formula $\theta$ 3971 | \end_inset 3972 | 3973 | just enough to classify it correctly: 3974 | \begin_inset Newline newline 3975 | \end_inset 3976 | 3977 | 3978 | \begin_inset Formula $\boxed{\theta\leftarrow\theta+x^{i}}$ 3979 | \end_inset 3980 | 3981 | or 3982 | \begin_inset Formula $\boxed{\theta\leftarrow\theta-\nabla J\left(\theta\right)}$ 3983 | \end_inset 3984 | 3985 | 3986 | \end_layout 3987 | 3988 | \begin_layout Plain Layout 3989 | 3990 | \bar under 3991 | Overfits 3992 | \bar default 3993 | when outliers skew the boundary. 3994 | 3995 | \bar under 3996 | Converges 3997 | \bar default 3998 | iff separable. 
3999 | \end_layout 4000 | 4001 | \begin_layout Plain Layout 4002 | 4003 | \bar under 4004 | Batch eqn 4005 | \bar default 4006 | 4007 | \begin_inset Formula $\theta\cdot x=\sum_{i}\alpha^{i}y^{i}x^{i}\cdot x$ 4008 | \end_inset 4009 | 4010 | : 4011 | \begin_inset Newline newline 4012 | \end_inset 4013 | 4014 | 4015 | \begin_inset Formula $\alpha_{i}=\text{\# times point \emph{i} was misclassified}$ 4016 | \end_inset 4017 | 4018 | 4019 | \end_layout 4020 | 4021 | \end_inset 4022 | 4023 | 4024 | \end_layout 4025 | 4026 | \begin_layout Standard 4027 | 4028 | \series bold 4029 | \begin_inset Box Boxed 4030 | position "t" 4031 | hor_pos "c" 4032 | has_inner_box 1 4033 | inner_pos "t" 4034 | use_parbox 1 4035 | use_makebox 0 4036 | width "97col%" 4037 | special "none" 4038 | height "1in" 4039 | height_special "totalheight" 4040 | status open 4041 | 4042 | \begin_layout Plain Layout 4043 | 4044 | \series bold 4045 | Hard-margin SVM 4046 | \series default 4047 | [3:36] maximizes the margin around the boundary. 4048 | Technically, it minimizes the distance between boundary and the vectors 4049 | closest to it (the support vectors): 4050 | \begin_inset Newline newline 4051 | \end_inset 4052 | 4053 | 4054 | \begin_inset Formula $\boxed{\min_{\theta}\left\Vert \theta\right\Vert ^{2}\quad\text{such that}\ \forall i:y^{i}\theta\cdot x^{i}\geq1}$ 4055 | \end_inset 4056 | 4057 | 4058 | \end_layout 4059 | 4060 | \begin_layout Plain Layout 4061 | Sometimes removing a few outliers lets us find a much higher margin or a 4062 | margin at all. 4063 | Hard-margin 4064 | \bar under 4065 | overfits 4066 | \bar default 4067 | by not seeing this. 4068 | \end_layout 4069 | 4070 | \begin_layout Plain Layout 4071 | 4072 | \bar under 4073 | Converges 4074 | \bar default 4075 | iff separable. 4076 | \end_layout 4077 | 4078 | \begin_layout Plain Layout 4079 | 4080 | \bar under 4081 | Batch eqn 4082 | \bar default 4083 | 4084 | \begin_inset Formula $\theta=\sum_{i}\alpha^{i}y^{i}x^{i}$ 4085 | \end_inset 4086 | 4087 | , where 4088 | \begin_inset Formula $\alpha^{i}=\mathbf{1}_{i\ \text{is support vector}}$ 4089 | \end_inset 4090 | 4091 | 4092 | \end_layout 4093 | 4094 | \end_inset 4095 | 4096 | 4097 | \end_layout 4098 | 4099 | \begin_layout Standard 4100 | 4101 | \series bold 4102 | \begin_inset Box Boxed 4103 | position "t" 4104 | hor_pos "c" 4105 | has_inner_box 1 4106 | inner_pos "t" 4107 | use_parbox 1 4108 | use_makebox 0 4109 | width "97col%" 4110 | special "none" 4111 | height "1in" 4112 | height_special "totalheight" 4113 | status open 4114 | 4115 | \begin_layout Plain Layout 4116 | 4117 | \series bold 4118 | Soft-margin SVM 4119 | \series default 4120 | [3:37] is like hard-margin SVM but penalizes misclassifications: 4121 | \begin_inset Newline newline 4122 | \end_inset 4123 | 4124 | 4125 | \begin_inset Formula $\boxed{\min_{\theta}\left\Vert \theta\right\Vert ^{2}+C\sum_{i=1}^{n}\left(1-y^{i}\theta\cdot x^{i}\right)_{+}}$ 4126 | \end_inset 4127 | 4128 | 4129 | \end_layout 4130 | 4131 | \begin_layout Plain Layout 4132 | 4133 | \bar under 4134 | Hyperparameter 4135 | \bar default 4136 | 4137 | \begin_inset Formula $C$ 4138 | \end_inset 4139 | 4140 | is the hardness of the margin. 4141 | Lower 4142 | \begin_inset Formula $C$ 4143 | \end_inset 4144 | 4145 | means more misclassifications but larger soft margin. 
4146 | \end_layout 4147 | 4148 | \begin_layout Plain Layout 4149 | 4150 | \bar under 4151 | Overfits 4152 | \bar default 4153 | on less data, more features, higher 4154 | \begin_inset Formula $C$ 4155 | \end_inset 4156 | 4157 | 4158 | \end_layout 4159 | 4160 | \end_inset 4161 | 4162 | 4163 | \end_layout 4164 | 4165 | \begin_layout Standard 4166 | \begin_inset CommandInset line 4167 | LatexCommand rule 4168 | offset "0.5ex" 4169 | width "100col%" 4170 | height "1pt" 4171 | 4172 | \end_inset 4173 | 4174 | 4175 | \end_layout 4176 | 4177 | \begin_layout Standard 4178 | 4179 | \series bold 4180 | More classifiers 4181 | \end_layout 4182 | 4183 | \begin_layout Standard 4184 | 4185 | \series bold 4186 | \begin_inset Box Boxed 4187 | position "t" 4188 | hor_pos "c" 4189 | has_inner_box 1 4190 | inner_pos "t" 4191 | use_parbox 1 4192 | use_makebox 0 4193 | width "97col%" 4194 | special "none" 4195 | height "1in" 4196 | height_special "totalheight" 4197 | status open 4198 | 4199 | \begin_layout Plain Layout 4200 | 4201 | \series bold 4202 | KNN 4203 | \series default 4204 | [14:4] Given an item 4205 | \begin_inset Formula $x$ 4206 | \end_inset 4207 | 4208 | , find the 4209 | \begin_inset Formula $k$ 4210 | \end_inset 4211 | 4212 | training items 4213 | \begin_inset Quotes eld 4214 | \end_inset 4215 | 4216 | closest 4217 | \begin_inset Quotes erd 4218 | \end_inset 4219 | 4220 | to 4221 | \begin_inset Formula $x$ 4222 | \end_inset 4223 | 4224 | and return the result of a vote. 4225 | \end_layout 4226 | 4227 | \begin_layout Plain Layout 4228 | 4229 | \bar under 4230 | Hyperparameter 4231 | \bar default 4232 | 4233 | \begin_inset Formula $k$ 4234 | \end_inset 4235 | 4236 | , the number of neighbors. 4237 | \begin_inset Newline newline 4238 | \end_inset 4239 | 4240 | 4241 | \begin_inset Quotes eld 4242 | \end_inset 4243 | 4244 | Closest 4245 | \begin_inset Quotes erd 4246 | \end_inset 4247 | 4248 | can be defined by some norm ( 4249 | \begin_inset Formula $l_{2}$ 4250 | \end_inset 4251 | 4252 | by default). 4253 | \end_layout 4254 | 4255 | \begin_layout Plain Layout 4256 | 4257 | \bar under 4258 | Overfits 4259 | \bar default 4260 | when 4261 | \begin_inset Formula $k$ 4262 | \end_inset 4263 | 4264 | is really small 4265 | \end_layout 4266 | 4267 | \end_inset 4268 | 4269 | 4270 | \end_layout 4271 | 4272 | \begin_layout Standard 4273 | 4274 | \series bold 4275 | \begin_inset Box Boxed 4276 | position "t" 4277 | hor_pos "c" 4278 | has_inner_box 1 4279 | inner_pos "t" 4280 | use_parbox 1 4281 | use_makebox 0 4282 | width "97col%" 4283 | special "none" 4284 | height "1in" 4285 | height_special "totalheight" 4286 | status open 4287 | 4288 | \begin_layout Plain Layout 4289 | 4290 | \series bold 4291 | Decision trees 4292 | \series default 4293 | : Recursively split on features that yield the best split. 4294 | Each tree has many nodes, which either split on a feature at a threshold, 4295 | or all data the same way. 4296 | \begin_inset Newline newline 4297 | \end_inset 4298 | 4299 | 4300 | \bar under 4301 | Hyperparameters 4302 | \bar default 4303 | typically restrict complexity (max tree depth, min points at node) or penalize 4304 | it. 4305 | One particular one of interest is 4306 | \begin_inset Formula $d$ 4307 | \end_inset 4308 | 4309 | , the max number of nodes. 4310 | \end_layout 4311 | 4312 | \begin_layout Plain Layout 4313 | 4314 | \bar under 4315 | Overfits 4316 | \bar default 4317 | when tree is deep or when we are allowed to split on a very small number 4318 | of items. 
4319 | \end_layout 4320 | 4321 | \begin_layout Plain Layout 4322 | 4323 | \series bold 4324 | Bagging 4325 | \series default 4326 | : Make multiple trees, each with a random subset of training items. 4327 | To predict, take vote from trees. 4328 | \end_layout 4329 | 4330 | \begin_layout Plain Layout 4331 | 4332 | \bar under 4333 | Hyperparameters 4334 | \bar default 4335 | # trees, proportion of items to subset. 4336 | \end_layout 4337 | 4338 | \begin_layout Plain Layout 4339 | 4340 | \series bold 4341 | Random forests 4342 | \series default 4343 | is bagging, except, for each node, consider only a random subset of features 4344 | to split on. 4345 | \end_layout 4346 | 4347 | \begin_layout Plain Layout 4348 | 4349 | \bar under 4350 | Hyperparameters 4351 | \bar default 4352 | proportion of features to consider. 4353 | \end_layout 4354 | 4355 | \end_inset 4356 | 4357 | 4358 | \end_layout 4359 | 4360 | \begin_layout Standard 4361 | 4362 | \series bold 4363 | \begin_inset Box Boxed 4364 | position "t" 4365 | hor_pos "c" 4366 | has_inner_box 1 4367 | inner_pos "t" 4368 | use_parbox 1 4369 | use_makebox 0 4370 | width "97col%" 4371 | special "none" 4372 | height "1in" 4373 | height_special "totalheight" 4374 | status open 4375 | 4376 | \begin_layout Plain Layout 4377 | 4378 | \series bold 4379 | AdaBoost 4380 | \series default 4381 | [dtrees3:34] Use any algorithm (i.e., decision trees) to train a weak learner, 4382 | take all the errors, and train a new learner on with the errors emphasized*. 4383 | To predict, predict with the first algorithm, then add on the prediction 4384 | of the second algorithm, and so on. 4385 | \end_layout 4386 | 4387 | \begin_layout Plain Layout 4388 | \noindent 4389 | * For regression, train the new learner on the errors. 4390 | For classification, give misclassified items more weight. 4391 | \end_layout 4392 | 4393 | \begin_layout Plain Layout 4394 | 4395 | \bar under 4396 | Hyperparameters 4397 | \bar default 4398 | 4399 | \begin_inset Formula $B$ 4400 | \end_inset 4401 | 4402 | , the number of weak learners; 4403 | \begin_inset Formula $\lambda$ 4404 | \end_inset 4405 | 4406 | , the learning rate. 
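A minimal sketch of the classification variant with reweighting; the `WeakLearner` interface (fit with sample weights, predict labels in {-1, +1}) is an assumption, not part of the original notes:

```python
import numpy as np

def adaboost(X, y, WeakLearner, B=50):
    """Sketch only: assumes y in {-1, +1} and a hypothetical sklearn-style WeakLearner."""
    n = len(y)
    w = np.full(n, 1.0 / n)                      # start with uniform (not Gaussian) weights
    learners, alphas = [], []
    for _ in range(B):
        h = WeakLearner().fit(X, y, sample_weight=w)
        pred = h.predict(X)
        err = np.clip(np.sum(w * (pred != y)), 1e-10, 1 - 1e-10)  # weighted error
        alpha = 0.5 * np.log((1 - err) / err)    # vote weight for this weak learner
        w *= np.exp(-alpha * y * pred)           # emphasize misclassified points
        w /= w.sum()
        learners.append(h)
        alphas.append(alpha)
    # Prediction sums the weak learners' votes, weighted by alpha.
    return lambda Xq: np.sign(sum(a * h.predict(Xq) for a, h in zip(alphas, learners)))
```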
4407 | \end_layout 4408 | 4409 | \end_inset 4410 | 4411 | 4412 | \end_layout 4413 | 4414 | \begin_layout Standard 4415 | \begin_inset VSpace vfill 4416 | \end_inset 4417 | 4418 | 4419 | \end_layout 4420 | 4421 | \begin_layout Standard 4422 | \begin_inset ERT 4423 | status open 4424 | 4425 | \begin_layout Plain Layout 4426 | 4427 | 4428 | \backslash 4429 | end{multicols} 4430 | \end_layout 4431 | 4432 | \end_inset 4433 | 4434 | 4435 | \end_layout 4436 | 4437 | \end_body 4438 | \end_document 4439 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | CS 189 Machine Learning Cheat Sheet 2 | =================================== 3 | 4 | Cheat sheets: 5 | 6 | - [189-cheat-sheet-minicards.pdf](<189-cheat-sheet-minicards.pdf>) 7 | - [189-cheat-sheet-nominicards.pdf](<189-cheat-sheet-nominicards.pdf>) 8 | 9 | These cheat sheets include: 10 | 11 | - [The original notes]() by Rishi 12 | Sharma and Peter Gao (from which this repo is forked), with some modifications: 13 | - Rearranged sections to form better grouping, add section titles 14 | - Reworded/condensed some sections in light of better grouping 15 | - Added some new content 16 | - **All** past T/F and multiple choice questions from the following semesters: 17 | - Spring 2013 midterm & final 18 | - Spring 2014 final 19 | - Spring 2015 midterm 20 | - Important algorithmic problems from discussions 21 | - Additional notes ("minicards") 22 | - The `no-minicards` version omits these, so you can have space to add your own notes. 23 | -------------------------------------------------------------------------------- /graphics/NN.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/graphics/NN.pdf -------------------------------------------------------------------------------- /graphics/NN1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/graphics/NN1.pdf -------------------------------------------------------------------------------- /graphics/NN2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/graphics/NN2.pdf -------------------------------------------------------------------------------- /graphics/disc09-entropy-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/graphics/disc09-entropy-1.pdf -------------------------------------------------------------------------------- /graphics/disc10-skipnn-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/graphics/disc10-skipnn-1.pdf -------------------------------------------------------------------------------- /graphics/disc10-skipnn-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/graphics/disc10-skipnn-2.pdf -------------------------------------------------------------------------------- 
/graphics/disc12-pca-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/graphics/disc12-pca-1.pdf -------------------------------------------------------------------------------- /graphics/disc12-pca-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/graphics/disc12-pca-2.pdf -------------------------------------------------------------------------------- /graphics/disc12-pca-3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/graphics/disc12-pca-3.pdf -------------------------------------------------------------------------------- /graphics/disc12-pca-4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/graphics/disc12-pca-4.pdf --------------------------------------------------------------------------------