├── .gitignore
├── 189-cheat-sheet-minicards.pdf
├── 189-cheat-sheet-nominicards.pdf
├── 189-cheat-sheet.lyx
├── README.md
└── graphics
├── NN.pdf
├── NN1.pdf
├── NN2.pdf
├── disc09-entropy-1.pdf
├── disc10-skipnn-1.pdf
├── disc10-skipnn-2.pdf
├── disc12-pca-1.pdf
├── disc12-pca-2.pdf
├── disc12-pca-3.pdf
└── disc12-pca-4.pdf
/.gitignore:
--------------------------------------------------------------------------------
1 | *.aux
2 | *.fdb_latexmk
3 | *.gz
4 | *.log
5 | *.out
6 | .DS_Store
7 | .pdf
8 |
--------------------------------------------------------------------------------
/189-cheat-sheet-minicards.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/189-cheat-sheet-minicards.pdf
--------------------------------------------------------------------------------
/189-cheat-sheet-nominicards.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/189-cheat-sheet-nominicards.pdf
--------------------------------------------------------------------------------
/189-cheat-sheet.lyx:
--------------------------------------------------------------------------------
1 | #LyX 2.1 created this file. For more info see http://www.lyx.org/
2 | \lyxformat 474
3 | \begin_document
4 | \begin_header
5 | \textclass extarticle
6 | \begin_preamble
7 | \usepackage{amsmath,amsthm,amsfonts,amssymb}
8 | \usepackage{calc}
9 | \usepackage{color,graphicx,overpic}
10 | \usepackage[shortlabels]{enumitem}
11 | \usepackage{hyperref}
12 | \usepackage{ifthen}
13 | \usepackage{multicol}
14 | \usepackage{titlesec}
15 | \usepackage{wrapfig}
16 |
17 | \titlespacing*{\section}{0pt}{0.5em}{0em}
18 | \titlespacing*{\subsection}{0pt}{0.5em}{0em}
19 | \titlespacing*{\subsubsection}{0pt}{0.5em}{0em}
20 | \titleformat{\section}{\vspace{1em}\titlerule\normalfont\fontsize{7}{7}\bfseries}{\thesection}{1em}{}
21 | \titleformat{\subsection}{\normalfont\fontsize{6}{6}\bfseries}{\thesection}{1em}{}
22 | \titleformat{\subsubsection}{\titlerule\normalfont\fontsize{6}{6}}{\thesection}{1em}{}
23 | \titlespacing*{\labeling}{0pt}{0em}{0em}
24 |
25 | \let\stdboxed\boxed
26 | \renewcommand{\boxed}[1]{
27 | \setlength{\fboxsep}{0.05em}
28 | \stdboxed{#1}
29 | }
30 |
31 | \setlist{nolistsep,leftmargin=*}
32 |
33 | \setlength{\premulticols}{1pt}
34 | \setlength{\postmulticols}{1pt}
35 | \setlength{\columnsep}{10pt}
36 |
37 | \newtheorem{example}[section]{Example}
38 |
39 | \let\textquotedbl="
40 | \def\ci{\perp\!\!\!\perp}
41 |
42 | \raggedright
43 |
44 | \newcommand{\mytitle}[2]{
45 | \begin{center}\small{#1} -- \scriptsize{#2}\end{center}
46 | }
47 |
48 |
49 | \hyphenpenalty=100
50 | \end_preamble
51 | \options 3pt
52 | \use_default_options false
53 | \maintain_unincluded_children false
54 | \language english
55 | \language_package none
56 | \inputencoding auto
57 | \fontencoding default
58 | \font_roman times
59 | \font_sans default
60 | \font_typewriter default
61 | \font_math auto
62 | \font_default_family default
63 | \use_non_tex_fonts false
64 | \font_sc false
65 | \font_osf false
66 | \font_sf_scale 100
67 | \font_tt_scale 100
68 | \graphics default
69 | \default_output_format default
70 | \output_sync 0
71 | \bibtex_command default
72 | \index_command default
73 | \paperfontsize default
74 | \spacing single
75 | \use_hyperref false
76 | \papersize default
77 | \use_geometry true
78 | \use_package amsmath 1
79 | \use_package amssymb 0
80 | \use_package cancel 0
81 | \use_package esint 1
82 | \use_package mathdots 0
83 | \use_package mathtools 0
84 | \use_package mhchem 0
85 | \use_package stackrel 0
86 | \use_package stmaryrd 0
87 | \use_package undertilde 0
88 | \cite_engine basic
89 | \cite_engine_type default
90 | \biblio_style plain
91 | \use_bibtopic false
92 | \use_indices false
93 | \paperorientation portrait
94 | \suppress_date false
95 | \justification false
96 | \use_refstyle 0
97 | \index Index
98 | \shortcut idx
99 | \color #008000
100 | \end_index
101 | \leftmargin 0.25in
102 | \topmargin 0.25in
103 | \rightmargin 0.25in
104 | \bottommargin 0.25in
105 | \secnumdepth -2
106 | \tocdepth 3
107 | \paragraph_separation skip
108 | \defskip smallskip
109 | \quotes_language english
110 | \papercolumns 1
111 | \papersides 1
112 | \paperpagestyle empty
113 | \tracking_changes false
114 | \output_changes false
115 | \html_math_output 0
116 | \html_css_as_file 0
117 | \html_be_strict false
118 | \end_header
119 |
120 | \begin_body
121 |
122 | \begin_layout Standard
123 | \begin_inset ERT
124 | status open
125 |
126 | \begin_layout Plain Layout
127 |
128 |
129 | \backslash
130 | fontsize{5}{4}
131 | \backslash
132 | selectfont
133 | \end_layout
134 |
135 | \end_inset
136 |
137 |
138 | \end_layout
139 |
140 | \begin_layout Standard
141 | \begin_inset ERT
142 | status open
143 |
144 | \begin_layout Plain Layout
145 |
146 |
147 | \backslash
148 | mytitle{CS 189 Final Note Sheet}{Rishi Sharma, Peter Gao, et al.}
150 | \end_layout
151 |
152 | \begin_layout Plain Layout
153 |
154 |
155 | \backslash
156 | begin{multicols}{4}
157 | \end_layout
158 |
159 | \end_inset
160 |
161 |
162 | \end_layout
163 |
164 | \begin_layout Section
165 | Probability & Matrix Review
166 | \end_layout
167 |
168 | \begin_layout Subsection
169 | Bayesian Decision Theory
170 | \end_layout
171 |
172 | \begin_layout Standard
173 | Bayes Rule:
174 | \begin_inset Formula $P(\omega|x)=\frac{P(x|\omega)P(\omega)}{P(x)},P(x)=\sum_{i}P(x|\omega_{i})P(\omega_{i})$
175 | \end_inset
176 |
177 |
178 | \end_layout
179 |
180 | \begin_layout Standard
181 | \begin_inset Formula $P(x,w)=P(x|w)P(w)=P(w|x)P(x)$
182 | \end_inset
183 |
184 |
185 | \end_layout
186 |
187 | \begin_layout Standard
188 | \begin_inset Formula $P(error)=\int_{-\infty}^{\infty}P(error|x)P(x)dx$
189 | \end_inset
190 |
191 |
192 | \end_layout
193 |
194 | \begin_layout Standard
195 | \begin_inset Formula $P(error|x)=\left\{ \begin{array}{lr}
196 | P(\omega_{1}|x) & \text{ if we decide }\omega_{2}\\
197 | P(\omega_{2}|x) & \text{ if we decide }\omega_{1}
198 | \end{array}\right.$
199 | \end_inset
200 |
201 |
202 | \end_layout
203 |
204 | \begin_layout Standard
205 | 0-1 Loss:
206 | \begin_inset Formula $\lambda(\alpha_{i}|\omega_{j})=\left\{ \begin{array}{lr}
207 | 0 & i=j\text{\ (correct)}\\
208 | 1 & i\not=j\text{\ (mismatch)}
209 | \end{array}\right.$
210 | \end_inset
211 |
212 |
213 | \end_layout
214 |
215 | \begin_layout Standard
216 |
217 | \family roman
218 | \series medium
219 | \shape up
220 | \size normal
221 | \emph off
222 | \bar no
223 | \strikeout off
224 | \uuline off
225 | \uwave off
226 | \noun off
227 | \color none
228 | Expected Loss (Risk)
229 | \family default
230 | \series default
231 | \shape default
232 | \size default
233 | \bar default
234 | \strikeout default
235 | \uuline default
236 | \uwave default
237 | \noun default
238 | \color inherit
239 | :
240 | \begin_inset Formula $R(\alpha_{i}|x)=\sum_{j=1}^{c}\lambda(\alpha_{i}|\omega_{j})P(\omega_{j}|x)$
241 | \end_inset
242 |
243 |
244 | \end_layout
245 |
246 | \begin_layout Standard
247 |
248 | \family roman
249 | \series medium
250 | \shape up
251 | \size normal
252 | \emph off
253 | \bar no
254 | \strikeout off
255 | \uuline off
256 | \uwave off
257 | \noun off
258 | \color none
259 | 0-1 Risk:
260 | \family default
261 | \series default
262 | \shape default
263 | \size default
264 | \bar default
265 | \strikeout default
266 | \uuline default
267 | \uwave default
268 | \noun default
269 | \color inherit
270 |
271 | \begin_inset Formula $R(\alpha_{i}|x)=\sum_{j\not=i}^{c}P(\omega_{j}|x)=1-P(\omega_{i}|x)$
272 | \end_inset
273 |
274 |
275 | \end_layout
276 |
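\begin_layout Standard
Toy check of the rules above (illustrative numbers, not from lecture): if 
\begin_inset Formula $P(\omega_{1})=0.6,P(x|\omega_{1})=0.2,P(\omega_{2})=0.4,P(x|\omega_{2})=0.5$
\end_inset

, then 
\begin_inset Formula $P(x)=0.12+0.20=0.32$
\end_inset

 and 
\begin_inset Formula $P(\omega_{1}|x)=\frac{0.12}{0.32}=0.375$
\end_inset

, so the 0-1 risk is minimized by deciding 
\begin_inset Formula $\omega_{2}$
\end_inset

 with 
\begin_inset Formula $R(\alpha_{2}|x)=1-P(\omega_{2}|x)=0.375$
\end_inset

.
\end_layout
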
277 | \begin_layout Subsection
278 | Generative vs.
279 | Discriminative Model
280 | \end_layout
281 |
282 | \begin_layout Standard
283 |
284 | \series bold
285 | Generative
286 | \series default
287 | : Model class conditional density
288 | \begin_inset Formula $p(x|y)$
289 | \end_inset
290 |
291 | and find
292 | \begin_inset Formula $p(y|x)\propto p(x|y)p(y)$
293 | \end_inset
294 |
295 | or model joint density
296 | \begin_inset Formula $p(x,y)$
297 | \end_inset
298 |
299 | and marginalize to find
300 | \begin_inset Formula $p(y=k|x)=\int_{x}p(x,y=k)dx$
301 | \end_inset
302 |
303 | (posterior)
304 | \end_layout
305 |
306 | \begin_layout Standard
307 |
308 | \series bold
309 | Discriminative
310 | \series default
311 | : Model conditional
312 | \begin_inset Formula $p(y|x)$
313 | \end_inset
314 |
315 | .
316 | \end_layout
317 |
318 | \begin_layout Standard
319 | \begin_inset Tabular
320 |
321 |
322 |
323 |
324 |
325 |
326 | \begin_inset Text
327 |
328 | \begin_layout Plain Layout
329 |
330 | \series bold
331 | class conditional
332 | \series default
333 |
334 | \begin_inset Formula $P(X|Y)$
335 | \end_inset
336 |
337 |
338 | \end_layout
339 |
340 | \end_inset
341 | |
342 |
343 | \begin_inset Text
344 |
345 | \begin_layout Plain Layout
346 |
347 | \series bold
348 | posterior
349 | \series default
350 |
351 | \begin_inset Formula $P(Y|X)$
352 | \end_inset
353 |
354 |
355 | \end_layout
356 |
357 | \end_inset
358 | |
359 |
360 |
361 |
362 | \begin_inset Text
363 |
364 | \begin_layout Plain Layout
365 |
366 | \series bold
367 | prior
368 | \series default
369 |
370 | \begin_inset Formula $P(Y)$
371 | \end_inset
372 |
373 |
374 | \end_layout
375 |
376 | \end_inset
377 | |
378 |
379 | \begin_inset Text
380 |
381 | \begin_layout Plain Layout
382 |
383 | \series bold
384 | evidence
385 | \series default
386 |
387 | \begin_inset Formula $P(X)$
388 | \end_inset
389 |
390 |
391 | \end_layout
392 |
393 | \end_inset
394 | |
395 |
396 |
397 |
398 | \end_inset
399 |
400 |
401 | \end_layout
402 |
403 | \begin_layout Subsection
404 | Probabilistic Motivation for Least Squares
405 | \end_layout
406 |
407 | \begin_layout Standard
408 | \begin_inset Formula $y^{(i)}=\theta^{\intercal}x^{(i)}+\epsilon^{(i)}\ \text{with noise}\ \epsilon^{(i)}\sim\mathcal{N}(0,\sigma^{2})$
409 | \end_inset
410 |
411 |
412 | \end_layout
413 |
414 | \begin_layout Standard
415 | Note: The intercept term
416 | \begin_inset Formula $x_{0}=1$
417 | \end_inset
418 |
419 | is accounted for in
420 | \begin_inset Formula $\theta$
421 | \end_inset
422 |
423 |
424 | \begin_inset Newline newline
425 | \end_inset
426 |
427 |
428 | \begin_inset Formula $\implies p(y^{(i)}|x^{(i)};\theta)=\frac{1}{\sqrt{2\pi\sigma^{2}}}\exp\left(-\frac{(y^{(i)}-\theta^{\intercal}x^{(i)})^{2}}{2\sigma^{2}}\right)$
429 | \end_inset
430 |
431 |
432 | \begin_inset Newline newline
433 | \end_inset
434 |
435 |
436 | \begin_inset Formula $\implies L(\theta)=\prod_{i=1}^{m}\frac{1}{\sqrt{2\pi\sigma^{2}}}\exp\left(-\frac{(y^{(i)}-\theta^{\intercal}x^{(i)})^{2}}{2\sigma^{2}}\right)$
437 | \end_inset
438 |
439 |
440 | \begin_inset Newline newline
441 | \end_inset
442 |
443 |
444 | \begin_inset Formula $\implies l(\theta)=m\log\frac{1}{\sqrt{2\pi\sigma^{2}}}-\frac{1}{2\sigma^{2}}\sum_{i=1}^{m}(y^{(i)}-\theta^{\intercal}x^{(i)})^{2}$
445 | \end_inset
446 |
447 |
448 | \begin_inset Newline newline
449 | \end_inset
450 |
451 |
452 | \begin_inset Formula $\implies\max_{\theta}l(\theta)\equiv\min_{\theta}\sum_{i=1}^{m}(y^{(i)}-h_{\theta}(x^{(i)}))^{2}$
453 | \end_inset
454 |
455 |
456 | \end_layout
457 |
458 | \begin_layout Standard
459 | Gaussian noise in our data set
460 | \begin_inset Formula $\{x^{(i)},y^{(i)}\}_{i=1}^{m}$
461 | \end_inset
462 |
463 | gives us least squares
464 | \end_layout
465 |
466 | \begin_layout Standard
467 | \begin_inset Formula $\min_{\theta}||X\theta-y||_{2}^{2}\equiv\min_{\theta}\theta^{\intercal}X^{\intercal}X\theta-2\theta^{\intercal}X^{\intercal}y+y^{\intercal}y$
468 | \end_inset
469 |
470 |
471 | \end_layout
472 |
473 | \begin_layout Standard
474 | \begin_inset Formula $\nabla_{\theta}l(\theta)=X^{\intercal}X\theta-X^{\intercal}y=0\implies\boxed{\theta^{*}=(X^{\intercal}X)^{-1}X^{\intercal}y}$
475 | \end_inset
476 |
477 |
478 | \end_layout
479 |
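\begin_layout Standard
Tiny worked example (our own numbers): for 
\begin_inset Formula $X=\begin{bmatrix}1 & 0\\
1 & 1\\
1 & 2
\end{bmatrix},y=\begin{bmatrix}1\\
2\\
2
\end{bmatrix}$
\end_inset

 we get 
\begin_inset Formula $X^{\intercal}X=\begin{bmatrix}3 & 3\\
3 & 5
\end{bmatrix},X^{\intercal}y=\begin{bmatrix}5\\
6
\end{bmatrix}\implies\theta^{*}=\frac{1}{6}\begin{bmatrix}5 & -3\\
-3 & 3
\end{bmatrix}\begin{bmatrix}5\\
6
\end{bmatrix}=\begin{bmatrix}7/6\\
1/2
\end{bmatrix}$
\end_inset

; the residual 
\begin_inset Formula $y-X\theta^{*}$
\end_inset

 is orthogonal to the columns of 
\begin_inset Formula $X$
\end_inset

.
\end_layout
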
480 | \begin_layout Standard
481 | Gradient Descent:
482 | \begin_inset Formula $\theta_{t+1}=\theta_{t}+\alpha(y_{t}^{(i)}-h(x_{t}^{(i)}))x_{t}^{(i)},\ \ h_{\theta}(x)=\theta^{\intercal}x$
483 | \end_inset
484 |
485 |
486 | \end_layout
487 |
488 | \begin_layout Subsection
489 | Multivariate Gaussian
490 | \begin_inset Formula $X\sim\mathcal{N}(\mu,\Sigma)$
491 | \end_inset
492 |
493 |
494 | \end_layout
495 |
496 | \begin_layout Standard
497 |
498 | \bar under
499 | Gaussian class conditionals lead to a logistic posterior.
500 | \end_layout
501 |
502 | \begin_layout Standard
503 | \begin_inset Formula $f(x;\mu,\Sigma)=\frac{1}{(2\pi)^{n/2}|\Sigma|^{1/2}}\exp\left(-\frac{1}{2}(x-\mu)^{T}\Sigma^{-1}(x-\mu)\right)$
504 | \end_inset
505 |
506 |
507 | \end_layout
508 |
509 | \begin_layout Standard
510 | \begin_inset Formula $\Sigma=E[(X-\mu)(X-\mu)^{T}]=E[XX^{T}]-\mu\mu^{T}$
511 | \end_inset
512 |
513 |
514 | \end_layout
515 |
516 | \begin_layout Standard
517 | \begin_inset Formula $\Sigma\text{ is PSD}\implies x^{T}\Sigma x\ge0\text{, if inverse exists }\Sigma\text{ must be PD}$
518 | \end_inset
519 |
520 |
521 | \end_layout
522 |
523 | \begin_layout Standard
524 | \begin_inset Formula $\text{If }X\sim N(\mu,\Sigma),\ \text{then}\ AX+b\sim N(A\mu+b,A\Sigma A^{T})$
525 | \end_inset
526 |
527 |
528 | \begin_inset Newline newline
529 | \end_inset
530 |
531 |
532 | \begin_inset Formula $\implies\Sigma^{-\frac{1}{2}}(X-\mu)\sim N(0,I),\text{ where }\Sigma^{-\frac{1}{2}}=U\Lambda^{-\frac{1}{2}}U^{\intercal}$
533 | \end_inset
534 |
535 |
536 | \end_layout
537 |
538 | \begin_layout Standard
539 | The distribution is the result of a linear transformation of a vector of
540 | univariate Gaussians
541 | \begin_inset Formula $Z\sim\mathcal{N}(0,I)$
542 | \end_inset
543 |
544 | such that
545 | \begin_inset Formula $X=AZ+\mu$
546 | \end_inset
547 |
548 | where we have
549 | \begin_inset Formula $\Sigma=AA^{\intercal}$
550 | \end_inset
551 |
552 | .
553 | From the pdf, we see that the level curves of the distribution decrease
554 | proportionally with
555 | \begin_inset Formula $x^{\intercal}\Sigma^{-1}x$
556 | \end_inset
557 |
558 | (assume
559 | \begin_inset Formula $\mu=0$
560 | \end_inset
561 |
562 | )
563 | \begin_inset Formula $\implies$
564 | \end_inset
565 |
566 |
567 | \begin_inset Formula
568 | \[
569 | \text{\ensuremath{c}-level set of \ensuremath{f}}\propto\{x:x^{\intercal}\Sigma^{-1}x=c\}
570 | \]
571 |
572 | \end_inset
573 |
574 |
575 | \begin_inset Formula
576 | \[
577 | x^{\intercal}\Sigma^{-1}x=c\equiv x^{\intercal}U\Lambda^{-1}U^{\intercal}x=c\implies
578 | \]
579 |
580 | \end_inset
581 |
582 |
583 | \begin_inset Formula
584 | \[
585 | \underbrace{\lambda_{1}^{-1}(u_{1}^{\intercal}x)^{2}}_{\text{axis length: \ensuremath{\sqrt{\lambda_{1}}}}}+\cdots+\underbrace{\lambda_{n}^{-1}(u_{n}^{\intercal}x)^{2}}_{\text{axis length: \ensuremath{\sqrt{\lambda_{n}}}}}=c
586 | \]
587 |
588 | \end_inset
589 |
590 |
591 | \end_layout
592 |
593 | \begin_layout Standard
594 | Thus the level curves form an ellipsoid with axis lengths equal to the square
595 | root of the eigenvalues of the covariance matrix.
596 | \end_layout
597 |
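\begin_layout Standard
Concrete example (our own numbers): for 
\begin_inset Formula $\mu=0,\Sigma=\begin{bmatrix}4 & 0\\
0 & 1
\end{bmatrix}$
\end_inset

 the eigenvalues are 4 and 1, so the 
\begin_inset Formula $c=1$
\end_inset

 level set 
\begin_inset Formula $\frac{x_{1}^{2}}{4}+x_{2}^{2}=1$
\end_inset

 is an ellipse with semi-axis 
\begin_inset Formula $\sqrt{4}=2$
\end_inset

 along 
\begin_inset Formula $u_{1}=e_{1}$
\end_inset

 and semi-axis 
\begin_inset Formula $\sqrt{1}=1$
\end_inset

 along 
\begin_inset Formula $u_{2}=e_{2}$
\end_inset

.
\end_layout
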
598 | \begin_layout Subsection
599 | Loss Functions
600 | \end_layout
601 |
602 | \begin_layout Standard
603 |
604 | \end_layout
605 |
606 | \begin_layout Itemize
607 |
608 | \series bold
609 | Binomial deviance
610 | \series default
611 |
612 | \begin_inset Formula $=\log\left[1+e^{-yf\left(x\right)}\right]$
613 | \end_inset
614 |
615 |
616 | \begin_inset Newline newline
617 | \end_inset
618 |
619 | minimizing function
620 | \begin_inset Formula $f\left(x\right)=\log\frac{\mathrm{P}\left[Y=+1\mid x\right]}{\mathrm{P}\left[Y=-1\mid x\right]}$
621 | \end_inset
622 |
623 |
624 | \end_layout
625 |
626 | \begin_layout Itemize
627 |
628 | \series bold
629 | SVM hinge loss
630 | \series default
631 |
632 | \begin_inset Formula $=\left[1-yf\left(x\right)\right]_{+}$
633 | \end_inset
634 |
635 |
636 | \begin_inset Newline newline
637 | \end_inset
638 |
639 | minimizing function
640 | \begin_inset Formula $f\left(x\right)=\mathrm{sign}\left(\mathrm{P}\left[Y=+1\mid x\right]-\frac{1}{2}\right)$
641 | \end_inset
642 |
643 |
644 | \end_layout
645 |
646 | \begin_layout Itemize
647 |
648 | \series bold
649 | Squared error
650 | \series default
651 |
652 | \begin_inset Formula $=\left[y-f\left(x\right)\right]^{2}=\left[1-yf\left(x\right)\right]^{2}$
653 | \end_inset
654 |
655 |
656 | \begin_inset Newline newline
657 | \end_inset
658 |
659 | minimizing function
660 | \begin_inset Formula $f\left(x\right)=2\mathrm{P}\left[Y=+1\mid x\right]-1$
661 | \end_inset
662 |
663 |
664 | \end_layout
665 |
666 | \begin_layout Itemize
667 |
668 | \series bold
669 | \begin_inset Quotes eld
670 | \end_inset
671 |
672 | Huberized
673 | \begin_inset Quotes erd
674 | \end_inset
675 |
676 | square hinge loss
677 | \series default
678 |
679 | \begin_inset Formula $=\left\{ \begin{array}{ll}
680 | -4yf\left(x\right) & \text{if}\ yf\left(x\right)<-1\\
681 | \left[1-yf\left(x\right)\right]_{+}^{2} & \text{otherwise}
682 | \end{array}\right.$
683 | \end_inset
684 |
685 |
686 | \begin_inset Newline newline
687 | \end_inset
688 |
689 | minimizing function
690 | \begin_inset Formula $f\left(x\right)=2\mathrm{P}\left[Y=+1\mid x\right]-1$
691 | \end_inset
692 |
693 |
694 | \end_layout
695 |
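\begin_layout Standard
Quick comparison of the losses above at a margin of 
\begin_inset Formula $yf(x)=-1$
\end_inset

 (our own arithmetic): deviance 
\begin_inset Formula $\log(1+e)\approx1.31$
\end_inset

, hinge 
\begin_inset Formula $2$
\end_inset

, squared error 
\begin_inset Formula $4$
\end_inset

, huberized square hinge 
\begin_inset Formula $4$
\end_inset

; at 
\begin_inset Formula $yf(x)=2$
\end_inset

 all are near zero except squared error, which is 
\begin_inset Formula $1$
\end_inset

 and thus penalizes confident correct predictions.
\end_layout
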
696 | \begin_layout Subsection
697 | Optimization
698 | \end_layout
699 |
700 | \begin_layout Standard
701 | Newton's Method:
702 | \begin_inset Formula $\theta_{t+1}=\theta_{t}-[\nabla_{\theta}^{2}f(\theta_{t})]^{-1}\nabla_{\theta}f(\theta_{t})$
703 | \end_inset
704 |
705 |
706 | \end_layout
707 |
708 | \begin_layout Standard
709 | Gradient Descent: 
710 | \begin_inset Formula $\theta_{t+1}=\theta_{t}-\alpha\nabla_{\theta}f(\theta_{t})$
711 | \end_inset
712 |
713 | , for minimizing
714 | \end_layout
715 |
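\begin_layout Standard
Sanity check (our own example): on the quadratic 
\begin_inset Formula $f(\theta)=(\theta-3)^{2}$
\end_inset

, Newton's method converges in one step, 
\begin_inset Formula $\theta_{1}=\theta_{0}-\frac{2(\theta_{0}-3)}{2}=3$
\end_inset

, while gradient descent takes 
\begin_inset Formula $\theta_{1}=\theta_{0}-2\alpha(\theta_{0}-3)$
\end_inset

 and needs 
\begin_inset Formula $\alpha$
\end_inset

 chosen well to converge.
\end_layout
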
716 | \begin_layout Subsection
717 | Gradients
718 | \end_layout
719 |
720 | \begin_layout Standard
721 | \begin_inset Formula $\frac{\partial{\bf {y}}}{\partial{\bf {x}}}\triangleq\begin{bmatrix}\frac{\partial y_{1}}{\partial x_{1}} & \dots & \frac{\partial y_{m}}{\partial x_{1}}\\
722 | \vdots & \ddots & \vdots\\
723 | \frac{\partial y_{1}}{\partial x_{n}} & \dots & \frac{\partial y_{m}}{\partial x_{n}}
724 | \end{bmatrix},$
725 | \end_inset
726 |
727 |
728 | \begin_inset Formula $\frac{\partial(A{\bf x})}{\partial{\bf x}}=A^{T},\frac{\partial({\bf x}^{T}A)}{\partial{\bf x}}=A,$
729 | \end_inset
730 |
731 |
732 | \begin_inset Newline newline
733 | \end_inset
734 |
735 |
736 | \begin_inset Formula $\frac{\partial({\bf x}^{T}{\bf x})}{\partial{\bf x}}=2{\bf x},\frac{\partial({\bf x}^{T}A{\bf x})}{\partial{\bf x}}=(A+A^{T}){\bf x},\frac{\partial(trBA)}{\partial A}=B^{T}$
737 | \end_inset
738 |
739 |
740 | \end_layout
741 |
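\begin_layout Standard
Example use of these identities (ties back to least squares): 
\begin_inset Formula $\nabla_{\theta}(\theta^{\intercal}X^{\intercal}X\theta-2\theta^{\intercal}X^{\intercal}y)=(X^{\intercal}X+(X^{\intercal}X)^{\intercal})\theta-2X^{\intercal}y=2X^{\intercal}X\theta-2X^{\intercal}y$
\end_inset

, which gives the normal equations when set to zero.
\end_layout
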
742 | \begin_layout Standard
743 | \begin_inset VSpace vfill
744 | \end_inset
745 |
746 |
747 | \end_layout
748 |
749 | \begin_layout Standard
750 | \begin_inset ERT
751 | status open
752 |
753 | \begin_layout Plain Layout
754 |
755 |
756 | \backslash
757 | columnbreak
758 | \end_layout
759 |
760 | \end_inset
761 |
762 |
763 | \end_layout
764 |
765 | \begin_layout Section
766 | Support Vector Machines
767 | \end_layout
768 |
769 | \begin_layout Standard
770 | In the strictly separable case, the goal is to find a separating hyperplane
771 | (like logistic regression) except now we don't just want any hyperplane,
772 | but one with the largest margin.
773 |
774 | \end_layout
775 |
776 | \begin_layout Standard
777 | \begin_inset Formula $H=\{\omega^{T}x+b=0\}$
778 | \end_inset
779 |
780 | , since scaling
781 | \begin_inset Formula $\omega$
782 | \end_inset
783 |
784 |  and b by the same factor doesn't change the hyperplane, our optimization
785 |  problem should have this scaling invariance built into it.
786 | Thus, we do it now and define the closest points to the hyperplane
787 | \begin_inset Formula $x_{sv}$
788 | \end_inset
789 |
790 | (support vectors) to satisfy:
791 | \begin_inset Formula $|\omega^{T}x_{sv}+b|=1$
792 | \end_inset
793 |
794 | .
795 |  The distance from any support vector to the hyperplane is now: 
796 | \begin_inset Formula $\frac{1}{||\omega||_{2}}$
797 | \end_inset
798 |
799 | .
800 | Maximizing the distance to the hyperplane is the same as minimizing
801 | \begin_inset Formula $||\omega||_{2}$
802 | \end_inset
803 |
804 | .
805 | \end_layout
806 |
807 | \begin_layout Standard
808 | The final optimization problem is:
809 | \end_layout
810 |
811 | \begin_layout Standard
812 | \begin_inset Formula $\boxed{\min_{\omega,b}\frac{1}{2}||\omega||_{2}^{2}\ s.t.\ y^{(i)}(w^{T}x^{(i)}+b)\ge1,i=1,\dots,m}$
813 | \end_inset
814 |
815 |
816 | \end_layout
817 |
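\begin_layout Standard
Toy margin check (our own numbers): if the learned hyperplane has 
\begin_inset Formula $\omega=(3,4)^{T}$
\end_inset

, then 
\begin_inset Formula $||\omega||_{2}=5$
\end_inset

 and every support vector (a point with 
\begin_inset Formula $|\omega^{T}x_{sv}+b|=1$
\end_inset

) lies at distance 
\begin_inset Formula $\frac{1}{||\omega||_{2}}=\frac{1}{5}$
\end_inset

 from 
\begin_inset Formula $H$
\end_inset

, so the total margin width is 
\begin_inset Formula $\frac{2}{5}$
\end_inset

.
\end_layout
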
818 | \begin_layout Standard
819 |
820 | \bar under
821 | Primal
822 | \bar default
823 | :
824 | \begin_inset Formula $L_{p}(\omega,b,\alpha)=\frac{1}{2}||\omega||_{2}^{2}-\sum_{i=1}^{m}\alpha_{i}(y^{(i)}(w^{T}x^{(i)}+b)-1)$
825 | \end_inset
826 |
827 |
828 | \end_layout
829 |
830 | \begin_layout Standard
831 | \begin_inset Formula $\frac{\partial L_{p}}{\partial\omega}=\omega-\sum\alpha_{i}y^{(i)}x^{(i)}=0\implies\omega=\sum\alpha_{i}y^{(i)}x^{(i)}$
832 | \end_inset
833 |
834 |
835 | \end_layout
836 |
837 | \begin_layout Standard
838 | \begin_inset Formula $\frac{\partial L_{p}}{\partial b}=-\sum\alpha_{i}y^{(i)}=0,\text{\ \ \ Note: }\alpha_{i}\ne0$
839 | \end_inset
840 |
841 | only for support vectors.
842 | \end_layout
843 |
844 | \begin_layout Standard
845 | Substitute the derivatives into the primal to get the dual.
846 | \end_layout
847 |
848 | \begin_layout Standard
849 |
850 | \bar under
851 | Dual
852 | \bar default
853 | :
854 | \begin_inset Formula $L_{d}(\alpha)=\sum_{i=1}^{m}\alpha_{i}-\frac{1}{2}\sum_{i=1}^{m}\sum_{j=1}^{m}y^{(i)}y^{(j)}\alpha_{i}\alpha_{j}(x^{(i)})^{T}x^{(j)}$
855 | \end_inset
856 |
857 |
858 | \end_layout
859 |
860 | \begin_layout Standard
861 | KKT says
862 | \begin_inset Formula $\alpha_{n}(y_{n}(w^{T}x_{n}+b)-1)=0$
863 | \end_inset
864 |
865 | where
866 | \begin_inset Formula $\alpha_{n}>0$
867 | \end_inset
868 |
869 | .
870 | \end_layout
871 |
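\begin_layout Standard
Consequence of KKT: for any support vector 
\begin_inset Formula $x_{n}$
\end_inset

 (where 
\begin_inset Formula $\alpha_{n}>0$
\end_inset

) the constraint is tight, so 
\begin_inset Formula $b=y_{n}-w^{T}x_{n}$
\end_inset

 (using 
\begin_inset Formula $y_{n}^{2}=1$
\end_inset

), which is how the intercept is recovered after solving the dual.
\end_layout
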
872 | \begin_layout Standard
873 | In the non-separable case we allow points to cross the marginal boundary
874 | by some amount
875 | \begin_inset Formula $\xi$
876 | \end_inset
877 |
878 | and penalize it.
879 | \end_layout
880 |
881 | \begin_layout Standard
882 | \begin_inset Formula $\boxed{\min_{\omega,b}\frac{1}{2}||\omega||_{2}^{2}+C\sum_{i=1}^{m}\xi_{i}\ \ s.t.\ \ y^{(i)}(w^{T}x^{(i)}+b)\ge1-\xi_{i}}$
883 | \end_inset
884 |
885 |
886 | \end_layout
887 |
888 | \begin_layout Standard
889 | The dual for non-separable doesn't change much except that each
890 | \begin_inset Formula $\alpha_{i}$
891 | \end_inset
892 |
893 | now has an upper bound of C
894 | \begin_inset Formula $\implies0\le\alpha_{i}\le C$
895 | \end_inset
896 |
897 |
898 | \end_layout
899 |
900 | \begin_layout Subsection
901 | Lagrangian
902 | \end_layout
903 |
904 | \begin_layout Standard
905 | \begin_inset Formula $\boxed{L\left(x,\lambda\right)=f_{0}\left(x\right)+\sum_{i=1}^{m}\lambda_{i}f_{i}\left(x\right)}$
906 | \end_inset
907 |
908 |
909 | \end_layout
910 |
911 | \begin_layout Itemize
912 | Think of the
913 | \begin_inset Formula $\lambda_{i}$
914 | \end_inset
915 |
916 | as the cost of violating the constraint
917 | \begin_inset Formula $f_{i}\left(x\right)\leq0$
918 | \end_inset
919 |
920 | .
921 | \end_layout
922 |
923 | \begin_layout Itemize
924 | \begin_inset Formula $L$
925 | \end_inset
926 |
927 | defines a saddle point game: one player (
928 | \noun on
929 | Min
930 | \noun default
931 | ) chooses x to minimize L; the other player (
932 | \noun on
933 | Max
934 | \noun default
935 | ) chooses
936 | \begin_inset Formula $\lambda$
937 | \end_inset
938 |
939 | to maximize
940 | \begin_inset Formula $L$
941 | \end_inset
942 |
943 | .
944 | If
945 | \noun on
946 | Min
947 | \noun default
948 | violates a constraint,
949 | \begin_inset Formula $f_{i}\left(x\right)>0$
950 | \end_inset
951 |
952 | , then
953 | \noun on
954 | Max
955 | \noun default
956 | can drive
957 | \begin_inset Formula $L$
958 | \end_inset
959 |
960 | to infinity.
961 | \end_layout
962 |
963 | \begin_layout Itemize
964 | We call the original optimization problem the
965 | \bar under
966 | primal
967 | \bar default
968 | problem.
969 | \begin_inset Newline newline
970 | \end_inset
971 |
972 | It has value
973 | \begin_inset Formula $p^{*}=\min_{x}\max_{\lambda\geq0}L\left(x,\lambda\right)$
974 | \end_inset
975 |
976 |
977 | \begin_inset Newline newline
978 | \end_inset
979 |
980 | (For an infeasible 
981 | \begin_inset Formula $x$
982 | \end_inset
983 |
984 | ,
985 | \begin_inset Formula $L\left(x,\lambda\right)$
986 | \end_inset
987 |
988 | can be made infinite, and for a feasible
989 | \begin_inset Formula $x$
990 | \end_inset
991 |
992 | , the
993 | \begin_inset Formula $\lambda_{i}f_{i}\left(x\right)$
994 | \end_inset
995 |
996 | terms will become zero.)
997 | \end_layout
998 |
999 | \begin_layout Itemize
1000 | Define
1001 | \begin_inset Formula $g\left(\lambda\right):=\min_{x}L\left(x,\lambda\right)$
1002 | \end_inset
1003 |
1004 | , and define the
1005 | \bar under
1006 | dual
1007 | \bar default
1008 | problem as
1009 | \begin_inset Newline newline
1010 | \end_inset
1011 |
1012 |
1013 | \begin_inset Formula $d^{*}=\max_{\lambda\geq0}g\left(\lambda\right)=\max_{\lambda\geq0}\min_{x}L\left(x,\lambda\right)$
1014 | \end_inset
1015 |
1016 |
1017 | \end_layout
1018 |
1019 | \begin_layout Itemize
1020 | In a zero sum game, it's always better to play second:
1021 | \begin_inset Formula $p^{*}=\min_{x}\max_{\lambda\geq0}L\left(x,\lambda\right)\geq\max_{\lambda\geq0}\min_{x}L\left(x,\lambda\right)=d^{*}$
1022 | \end_inset
1023 |
1024 | This is called
1025 | \bar under
1026 | weak duality
1027 | \bar default
1028 | .
1029 | \end_layout
1030 |
1031 | \begin_layout Itemize
1032 | If there is a
1033 | \bar under
1034 | saddle point
1035 | \bar default
1036 |
1037 | \begin_inset Formula $\left(x^{*},\lambda^{*}\right)$
1038 | \end_inset
1039 |
1040 | , so that for all
1041 | \begin_inset Formula $x$
1042 | \end_inset
1043 |
1044 | and
1045 | \begin_inset Formula $\lambda\geq0$
1046 | \end_inset
1047 |
1048 | ,
1049 | \begin_inset Formula $L\left(x^{*},\lambda\right)\leq L\left(x^{*},\lambda^{*}\right)\leq L\left(x,\lambda^{*}\right),$
1050 | \end_inset
1051 |
1052 | then we have
1053 | \bar under
1054 | strong duality
1055 | \bar default
1056 | : the primal and dual have the same value,
1057 | \begin_inset Formula $p^{*}=\min_{x}\max_{\lambda\geq0}L\left(x,\lambda\right)=\max_{\lambda\geq0}\min_{x}L\left(x,\lambda\right)=d^{*}$
1058 | \end_inset
1059 |
1060 |
1061 | \end_layout
1062 |
1063 | \begin_layout Standard
1064 | Using notation from Peter's notes:
1065 | \end_layout
1066 |
1067 | \begin_layout Standard
1068 | Given
1069 | \begin_inset Formula $\min_{x}f(x)\ s.t.\ g_{i}(x)=0,\ h_{i}(x)\le0$
1070 | \end_inset
1071 |
1072 | , the corresponding Lagrangian is:
1073 | \begin_inset Formula $L(x,\alpha,\beta)=f(x)+\sum_{i=1}^{k}\alpha_{i}g_{i}(x)+\sum_{i=1}^{l}\beta_{i}h_{i}(x)$
1074 | \end_inset
1075 |
1076 |
1077 | \end_layout
1078 |
1079 | \begin_layout Standard
1080 | We minimize over x and maximize over the Lagrange multipliers 
1081 | \begin_inset Formula $\alpha$
1082 | \end_inset
1083 |
1084 | and
1085 | \begin_inset Formula $\beta\geq0$
1086 | \end_inset
1087 |
1088 |
1089 | \end_layout
1090 |
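\begin_layout Standard
Small worked example (ours, not from the notes): minimize 
\begin_inset Formula $x^{2}$
\end_inset

 subject to 
\begin_inset Formula $1-x\le0$
\end_inset

.
 Then 
\begin_inset Formula $L(x,\lambda)=x^{2}+\lambda(1-x)$
\end_inset

, 
\begin_inset Formula $g(\lambda)=\min_{x}L=\lambda-\frac{\lambda^{2}}{4}$
\end_inset

 (at 
\begin_inset Formula $x=\frac{\lambda}{2}$
\end_inset

), and 
\begin_inset Formula $d^{*}=\max_{\lambda\geq0}g(\lambda)=1$
\end_inset

 at 
\begin_inset Formula $\lambda^{*}=2$
\end_inset

, matching the primal optimum 
\begin_inset Formula $p^{*}=1$
\end_inset

 at 
\begin_inset Formula $x^{*}=1$
\end_inset

: strong duality holds.
\end_layout
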
1091 | \begin_layout Section
1092 | Regression
1093 | \end_layout
1094 |
1095 | \begin_layout Standard
1096 | In general the loss function consists of two parts, the loss term and the
1097 | regularization term.
1098 |
1099 | \begin_inset Formula $J(\omega)=\sum_{i}Loss_{i}+\lambda R(\omega)$
1100 | \end_inset
1101 |
1102 |
1103 | \end_layout
1104 |
1105 | \begin_layout Standard
1106 | L2 regularization results in
1107 | \series bold
1108 | ridge regression
1109 | \series default
1110 | .
1111 | \begin_inset Newline newline
1112 | \end_inset
1113 |
1114 | Used when A has a nontrivial null space.
1115 |  L2 reg falls out of MAP estimation when we put a Gaussian prior on x with 
1116 | \begin_inset Formula $\Sigma=cI$
1117 | \end_inset
1118 |
1119 | .
1120 | \begin_inset Newline newline
1121 | \end_inset
1122 |
1123 |
1124 | \begin_inset Formula $\min_{x}||Ax-y||_{2}^{2}+\lambda||x||_{2}^{2}\implies x^{*}=(A^{T}A+\lambda I)^{-1}A^{T}y$
1125 | \end_inset
1126 |
1127 |
1128 | \end_layout
1129 |
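\begin_layout Standard
Scalar sanity check (our own example): with 
\begin_inset Formula $A=1$
\end_inset

 the ridge solution is 
\begin_inset Formula $x^{*}=\frac{y}{1+\lambda}$
\end_inset

, so 
\begin_inset Formula $\lambda=0$
\end_inset

 recovers least squares and larger 
\begin_inset Formula $\lambda$
\end_inset

 shrinks the estimate toward 0; in general 
\begin_inset Formula $A^{T}A+\lambda I$
\end_inset

 is invertible for any 
\begin_inset Formula $\lambda>0$
\end_inset

.
\end_layout
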
1130 | \begin_layout Standard
1131 | L1 regularization results in
1132 | \series bold
1133 | lasso regression
1134 | \series default
1135 | .
1136 | \begin_inset Newline newline
1137 | \end_inset
1138 |
1139 | Used when
1140 | \begin_inset Formula $x$
1141 | \end_inset
1142 |
1143 | has a Laplace prior.
1144 | Gives sparse results.
1145 | \end_layout
1146 |
1147 | \begin_layout Subsection
1148 | Logistic Regression
1149 | \end_layout
1150 |
1151 | \begin_layout Standard
1152 | Classify
1153 | \begin_inset Formula $y\in\{0,1\}\implies$
1154 | \end_inset
1155 |
1156 | Model
1157 | \begin_inset Formula $p(y=1|x)=\frac{1}{1+e^{-\theta^{T}x}}=h_{\theta}(x)$
1158 | \end_inset
1159 |
1160 |
1161 | \end_layout
1162 |
1163 | \begin_layout Standard
1164 | \begin_inset Formula $\frac{dh_{\theta}}{d(\theta^{T}x)}=(\frac{1}{1+e^{-\theta^{T}x}})^{2}e^{-\theta^{T}x}=\frac{1}{1+e^{-\theta^{T}x}}\left(1-\frac{1}{1+e^{-\theta^{T}x}}\right)=h_{\theta}(1-h_{\theta})$
1165 | \end_inset
1166 |
1167 |
1168 | \end_layout
1169 |
1170 | \begin_layout Standard
1171 | \begin_inset Formula $p(y|x;\theta)=(h_{\theta}(x))^{y}(1-h_{\theta}(x))^{1-y}\implies$
1172 | \end_inset
1173 |
1174 |
1175 | \end_layout
1176 |
1177 | \begin_layout Standard
1178 | \begin_inset Formula $L(\theta)=\prod_{i=1}^{m}(h_{\theta}(x^{(i)}))^{y^{(i)}}(1-h_{\theta}(x^{(i)}))^{1-y^{(i)}}\implies$
1179 | \end_inset
1180 |
1181 |
1182 | \end_layout
1183 |
1184 | \begin_layout Standard
1185 | \begin_inset Formula $l(\theta)=\sum_{i=1}^{m}y^{(i)}\log(h_{\theta}(x^{(i)}))+(1-y^{(i)})\log(1-h_{\theta}(x^{(i)}))\implies$
1186 | \end_inset
1187 |
1188 |
1189 | \end_layout
1190 |
1191 | \begin_layout Standard
1192 | \begin_inset Formula $\nabla_{\theta}l=\sum_{i}(y^{(i)}-h_{\theta}(x^{(i)}))x^{(i)}=X^{\intercal}(y-h_{\theta}(X))$
1193 | \end_inset
1194 |
1195 | , (want
1196 | \begin_inset Formula $\max\ l(\theta)$
1197 | \end_inset
1198 |
1199 | )
1200 | \end_layout
1201 |
1202 | \begin_layout Standard
1203 | Stochastic:
1204 | \begin_inset Formula $\boxed{\theta_{t+1}=\theta_{t}+\alpha(y_{t}^{(j)}-h_{\theta}(x_{t}^{(j)}))x_{t}^{(j)}}$
1205 | \end_inset
1206 |
1207 |
1208 | \end_layout
1209 |
1210 | \begin_layout Standard
1211 | Batch:
1212 | \begin_inset Formula $\boxed{\theta_{t+1}=\theta_{t}+\alpha X^{\intercal}(y-h_{\theta}(X))}$
1213 | \end_inset
1214 |
1215 |
1216 | \end_layout
1217 |
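\begin_layout Standard
Toy stochastic step (illustrative numbers): starting from 
\begin_inset Formula $\theta=0$
\end_inset

 we have 
\begin_inset Formula $h_{\theta}(x)=0.5$
\end_inset

 for every 
\begin_inset Formula $x$
\end_inset

, so with 
\begin_inset Formula $(x,y)=(1,1)$
\end_inset

 and 
\begin_inset Formula $\alpha=0.1$
\end_inset

 the update gives 
\begin_inset Formula $\theta\leftarrow0+0.1(1-0.5)(1)=0.05$
\end_inset

, nudging 
\begin_inset Formula $p(y=1|x)$
\end_inset

 above 0.5.
\end_layout
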
1218 | \begin_layout Standard
1219 | \begin_inset VSpace vfill
1220 | \end_inset
1221 |
1222 |
1223 | \end_layout
1224 |
1225 | \begin_layout Standard
1226 | \begin_inset ERT
1227 | status open
1228 |
1229 | \begin_layout Plain Layout
1230 |
1231 |
1232 | \backslash
1233 | columnbreak
1234 | \end_layout
1235 |
1236 | \end_inset
1237 |
1238 |
1239 | \end_layout
1240 |
1241 | \begin_layout Subsection
1242 | LDA and QDA
1243 | \end_layout
1244 |
1245 | \begin_layout Standard
1246 | Classify
1247 | \begin_inset Formula $y\in\{0,1\},$
1248 | \end_inset
1249 |
1250 | Model
1251 | \begin_inset Formula $p(y)=\phi^{y}(1-\phi)^{1-y}$
1252 | \end_inset
1253 |
1254 | and
1255 | \end_layout
1256 |
1257 | \begin_layout Standard
1258 | \begin_inset Formula $l(\phi,\mu_{0},\mu_{1},\Sigma)=\log\prod_{i=1}^{m}p(x^{(i)}|y^{(i)};\mu_{0},\mu_{1},\Sigma)p(y^{(i)};\phi)$
1259 | \end_inset
1260 |
1261 | gives us
1262 | \end_layout
1263 |
1264 | \begin_layout Standard
1265 | \begin_inset Formula $\phi_{MLE}=\frac{1}{m}\sum_{i=1}^{m}1\{y^{(i)}=1\}$
1266 | \end_inset
1267 |
1268 | ,
1269 | \begin_inset Formula $\mu_{k_{MLE}}=\text{avg of }x^{(i)}\text{ classified as }k$
1270 | \end_inset
1271 |
1272 | ,
1273 | \end_layout
1274 |
1275 | \begin_layout Standard
1276 | \begin_inset Formula $\Sigma_{MLE}=\frac{1}{m}\sum_{i=1}^{m}(x^{(i)}-\mu_{y_{(i)}})(x^{(i)}-\mu_{y_{(i)}})^{T}$
1277 | \end_inset
1278 |
1279 | .
1280 | \end_layout
1281 |
1282 | \begin_layout Standard
1283 | Notice the covariance matrix is the same for all classes in LDA.
1284 | \end_layout
1285 |
1286 | \begin_layout Standard
1287 | If
1288 | \begin_inset Formula $p(x|y)$
1289 | \end_inset
1290 |
1291 | multivariate gaussian (w/ shared
1292 | \begin_inset Formula $\Sigma)$
1293 | \end_inset
1294 |
1295 | , then
1296 | \begin_inset Formula $p(y|x)$
1297 | \end_inset
1298 |
1299 | is logistic function.
1300 | The converse is NOT true.
1301 | LDA makes stronger assumptions about data than does logistic regression.
1302 |
1303 | \begin_inset Formula $h(x)=\arg\max_{k}-\frac{1}{2}(x-\mu_{k})^{T}\Sigma^{-1}(x-\mu_{k})+\log(\pi_{k})$
1304 | \end_inset
1305 |
1306 |
1307 | \end_layout
1308 |
1309 | \begin_layout Standard
1310 | where
1311 | \begin_inset Formula $\pi_{k}=p(y=k)$
1312 | \end_inset
1313 |
1314 |
1315 | \end_layout
1316 |
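\begin_layout Standard
Why the shared 
\begin_inset Formula $\Sigma$
\end_inset

 matters (derivation sketch): the difference of two LDA scores is 
\begin_inset Formula $(\mu_{1}-\mu_{0})^{T}\Sigma^{-1}x-\frac{1}{2}(\mu_{1}^{T}\Sigma^{-1}\mu_{1}-\mu_{0}^{T}\Sigma^{-1}\mu_{0})+\log\frac{\pi_{1}}{\pi_{0}}$
\end_inset

, since the quadratic term 
\begin_inset Formula $x^{T}\Sigma^{-1}x$
\end_inset

 cancels; the boundary is linear in 
\begin_inset Formula $x$
\end_inset

 and the posterior is logistic.
 With per-class 
\begin_inset Formula $\Sigma_{k}$
\end_inset

 (QDA below) it does not cancel, giving quadratic boundaries.
\end_layout
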
1317 | \begin_layout Standard
1318 | For QDA, the model is the same as LDA except that each class has a unique
1319 | covariance matrix.
1320 |
1321 | \begin_inset Formula $h(x)=\arg\max_{k}-\frac{1}{2}\log|\Sigma_{k}|-\frac{1}{2}(x-\mu_{k})^{T}\Sigma_{k}^{-1}(x-\mu_{k})+\log(\pi_{k})$
1322 | \end_inset
1323 |
1324 |
1325 | \end_layout
1326 |
1327 | \begin_layout Section
1328 | Other Classifiers
1329 | \end_layout
1330 |
1331 | \begin_layout Subsection
1332 | Nearest Neighbor
1333 | \end_layout
1334 |
1335 | \begin_layout Standard
1336 | Key Idea: Store all training examples
1337 | \begin_inset Formula $\left\langle x_{i},f(x_{i})\right\rangle $
1338 | \end_inset
1339 |
1340 |
1341 | \end_layout
1342 |
1343 | \begin_layout Standard
1344 |
1345 | \series bold
1346 | NN
1347 | \series default
1348 | : Find closest training point using some distance metric and take its label.
1349 | \end_layout
1350 |
1351 | \begin_layout Standard
1352 |
1353 | \series bold
1354 | k-NN
1355 | \series default
1356 | : Find the closest k training points and take the most likely label based
1357 |  on some voting scheme (majority vote, mean, median,...)
1358 | \end_layout
1359 |
1360 | \begin_layout Standard
1361 |
1362 | \series bold
1363 | Behavior at the limit
1364 | \series default
1365 | : 1NN
1366 | \begin_inset Formula $\lim_{N\to\infty}\ \epsilon^{*}\le\epsilon_{NN}\le2\epsilon^{*}$
1367 | \end_inset
1368 |
1369 |
1370 | \begin_inset Formula $\epsilon^{*}=\text{error of optimal prediction},\ \epsilon_{NN}=\text{error of 1NN classifier}$
1371 | \end_inset
1372 |
1373 |
1374 | \end_layout
1375 |
1376 | \begin_layout Standard
1377 | KNN
1378 | \begin_inset space \space{}
1379 | \end_inset
1380 |
1381 |
1382 | \begin_inset Formula $\lim_{N\to\infty,K\to\infty,\frac{K}{N}\to0}\epsilon_{kNN}=\epsilon^{*}$
1383 | \end_inset
1384 |
1385 |
1386 | \end_layout
1387 |
1388 | \begin_layout Standard
1389 |
1390 | \series bold
1391 | Curse of dimensionality
1392 | \series default
1393 | : As the number of dimensions increases, everything becomes farther apart,
1394 |  and our low-dimensional intuition breaks down.
1395 |  Consider the hypersphere-to-hypercube volume ratio: it is already close to zero at 
1396 | \begin_inset Formula $d=10$
1397 | \end_inset
1398 | 
1399 | .
1400 |  How to deal with this curse:
1401 | \end_layout
1402 |
1403 | \begin_layout Enumerate
1404 | Get more data to fill all of that empty space
1405 | \end_layout
1406 |
1407 | \begin_layout Enumerate
1408 | Get better features, reducing the dimensionality and packing the data closer
1409 | together.
1410 | Ex: Bag-of-words, Histograms,...
1411 | \end_layout
1412 |
1413 | \begin_layout Enumerate
1414 | Use a better distance metric.
1415 | \end_layout
1416 |
1417 | \begin_layout Standard
1418 | Minkowski:
1419 | \begin_inset Formula $Dis_{p}(x,y)=(\sum_{i=1}^{d}|x_{i}-y_{i}|^{p})^{\frac{1}{p}}=||x-y||_{p}$
1420 | \end_inset
1421 |
1422 |
1423 | \end_layout
1424 |
1425 | \begin_layout Standard
1426 | 0-norm:
1427 | \begin_inset Formula $Dis_{0}(x,y)=\sum_{i=1}^{d}I[x_{i}\ne y_{i}]$
1428 | \end_inset
1429 |
1430 |
1431 | \end_layout
1432 |
1433 | \begin_layout Standard
1434 | Mahalanobis:
1435 | \begin_inset Formula $Dis_{M}(x,y|\Sigma)=\sqrt{(x-y)^{T}\Sigma^{-1}(x-y)}$
1436 | \end_inset
1437 |
1438 |
1439 | \end_layout
1440 |
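\begin_layout Standard
Distance check (our own numbers): for 
\begin_inset Formula $x=(0,0),y=(3,4)$
\end_inset

, 
\begin_inset Formula $Dis_{1}=7$
\end_inset

, 
\begin_inset Formula $Dis_{2}=5$
\end_inset

, 
\begin_inset Formula $Dis_{\infty}=4$
\end_inset

; with 
\begin_inset Formula $\Sigma=\mathrm{diag}(9,16)$
\end_inset

 the Mahalanobis distance is 
\begin_inset Formula $\sqrt{\frac{9}{9}+\frac{16}{16}}=\sqrt{2}$
\end_inset

, i.e.
 each coordinate is measured in units of its standard deviation.
\end_layout
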
1441 | \begin_layout Standard
1442 | In high-d we get
1443 | \begin_inset Quotes eld
1444 | \end_inset
1445 |
1446 | Hubs
1447 | \begin_inset Quotes erd
1448 | \end_inset
1449 |
1450 |  s.t. most points identify the hubs as their NN.
1451 | These hubs are usually near the means (Ex: dull gray images, sky and clouds).
1452 | To avoid having everything classified as these hubs, we can use cosine
1453 | similarity.
1454 | \end_layout
1455 |
1456 | \begin_layout Standard
1457 |
1458 | \series bold
1459 | K-d trees
1460 | \series default
1461 | increase the efficiency of nearest neighbor lookup.
1462 | \end_layout
1463 |
1464 | \begin_layout Subsection
1465 | Decision Trees
1466 | \end_layout
1467 |
1468 | \begin_layout Standard
1469 | Given a set of points and classes
1470 | \begin_inset Formula $\{x_{i},y_{i}\}_{i=1}^{n}$
1471 | \end_inset
1472 |
1473 | , test features
1474 | \begin_inset Formula $x_{j}$
1475 | \end_inset
1476 |
1477 | and branch on the feature which
1478 | \begin_inset Quotes eld
1479 | \end_inset
1480 |
1481 | best
1482 | \begin_inset Quotes erd
1483 | \end_inset
1484 |
1485 | separates the data.
1486 | Recursively split on the new subset of data.
1487 | Growing the tree to max depth tends to overfit (training data gets cut
1488 | quickly
1489 | \begin_inset Formula $\implies$
1490 | \end_inset
1491 |
1492 | subtrees train on small sets).
1493 | Mistakes high up in the tree propagate to corresponding subtrees.
1494 | To reduce overfitting, we can prune using a validation set, and we can
1495 | limit the depth.
1496 | \end_layout
1497 |
1498 | \begin_layout Standard
1499 | DT's are prone to label noise.
1500 | Building the correct tree is hard.
1501 | \end_layout
1502 |
1503 | \begin_layout Standard
1504 |
1505 | \series bold
1506 | Heuristic
1507 | \series default
1508 | : For
1509 | \bar under
1510 | classification
1511 | \bar default
1512 | , maximize information gain
1513 | \begin_inset Formula
1514 | \[
1515 | \max_{j}\quad\mathrm{H}(D)\ -\sum_{x_{j}\in X_{j}}P(X_{j}=x_{j})\cdot\mathrm{H}(D|X_{j}=x_{j})
1516 | \]
1517 |
1518 | \end_inset
1519 |
1520 | where
1521 | \begin_inset Formula $\mathrm{H}(D)=-\sum_{c\in C}P(y=c)\log[P(y=c)]$
1522 | \end_inset
1523 |
1524 | is the entropy of the data set,
1525 | \begin_inset Formula $C$
1526 | \end_inset
1527 |
1528 | is the set of classes each data point can take, and
1529 | \begin_inset Formula $P(y=c)$
1530 | \end_inset
1531 |
1532 | is the fraction of data points with class
1533 | \begin_inset Formula $c$
1534 | \end_inset
1535 |
1536 | .
1537 | \begin_inset Newline newline
1538 | \end_inset
1539 |
1540 | For
1541 | \noun on
1542 | regression
1543 | \noun default
1544 | , minimize the variance.
1545 | Same optimization problem as above, except H is replaced with var.
1546 | Pure leaves correspond to low variance, and the result is the mean of the
1547 | current leaf.
1548 | \end_layout
1549 |
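\begin_layout Standard
Worked split (illustrative numbers): a node with 4 positive and 4 negative points has 
\begin_inset Formula $\mathrm{H}(D)=1$
\end_inset

 bit.
 A split into two pure children gives gain 
\begin_inset Formula $1-0=1$
\end_inset

; a split into two children with class ratios 3:1 and 1:3 gives weighted entropy 
\begin_inset Formula $-\frac{3}{4}\log_{2}\frac{3}{4}-\frac{1}{4}\log_{2}\frac{1}{4}\approx0.811$
\end_inset

 and gain 
\begin_inset Formula $\approx0.19$
\end_inset

, so the pure split is preferred.
\end_layout
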
1550 | \begin_layout Subsection
1551 | Random Forests
1552 | \end_layout
1553 |
1554 | \begin_layout Standard
1555 |
1556 | \series bold
1557 | Problem
1558 | \series default
1559 | : DT's are
1560 | \bar under
1561 | unstable
1562 | \bar default
1563 | : small changes in the input data have large effect on tree structure
1564 | \begin_inset Formula $\implies$
1565 | \end_inset
1566 |
1567 | DT's are high-variance estimators.
1568 | \begin_inset Newline newline
1569 | \end_inset
1570 |
1571 |
1572 | \series bold
1573 | Solution
1574 | \series default
1575 | : Random Forests train
1576 | \begin_inset Formula $M$
1577 | \end_inset
1578 |
1579 | different trees with randomly sampled subsets of the data (called bagging),
1580 | and sometimes with randomly sampled subsets of the features to de-correlate
1581 | the trees.
1582 | A new point is tested on all
1583 | \begin_inset Formula $M$
1584 | \end_inset
1585 |
1586 | trees and we take the majority as our output class (for regression we take
1587 | the average of the output).
1588 | \end_layout
1589 |
1590 | \begin_layout Subsection
1591 | Boosting
1592 | \end_layout
1593 |
1594 | \begin_layout Standard
1595 | Weak Learner: Can classify with better than 50% accuracy.
1596 | \end_layout
1597 |
1598 | \begin_layout Standard
1599 | Train weak learner to get a weak classifier.
1600 |  Test it on the training data, up-weight misclassified data, down-weight correctly
1601 | classified data.
1602 | Train a new weak learner on the weighted data.
1603 | Repeat.
1604 | A new point is classified by every weak learner and the output class is
1605 | the sign of a weighted avg.
1606 | of weak learner outputs.
1607 | Boosting generally overfits.
1608 |  If there is label noise, boosting keeps upweighting the mislabeled data.
1609 | \end_layout
1610 |
1611 | \begin_layout Standard
1612 |
1613 | \series bold
1614 | AdaBoost
1615 | \series default
1616 | is a boosting algorithm.
1617 | The weak learner weights are given by
1618 | \begin_inset Formula $\alpha_{t}=\frac{1}{2}\ln(\frac{1-\epsilon_{t}}{\epsilon_{t}})$
1619 | \end_inset
1620 |
1621 | where
1622 | \begin_inset Formula $\epsilon_{t}=Pr_{D_{t}}(h_{t}(x_{i})\ne y_{i})$
1623 | \end_inset
1624 |
1625 | (probability of misclassification).
1626 | The weights are updated
1627 | \begin_inset Formula $D_{t+1}(i)=\frac{D_{t}(i)exp(-\alpha_{t}y_{i}h_{t}(x_{i}))}{Z_{t}}$
1628 | \end_inset
1629 |
1630 | where
1631 | \begin_inset Formula $Z_{t}$
1632 | \end_inset
1633 |
1634 | is a normalization factor.
1635 | \end_layout
1636 |
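\begin_layout Standard
Numeric example (our own numbers): if a round has weighted error 
\begin_inset Formula $\epsilon_{t}=0.25$
\end_inset

, then 
\begin_inset Formula $\alpha_{t}=\frac{1}{2}\ln3\approx0.55$
\end_inset

, and before normalization each misclassified point's weight is multiplied by 
\begin_inset Formula $e^{\alpha_{t}}=\sqrt{3}\approx1.73$
\end_inset

 while each correctly classified point's weight is multiplied by 
\begin_inset Formula $e^{-\alpha_{t}}=\frac{1}{\sqrt{3}}\approx0.58$
\end_inset

.
\end_layout
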
1637 | \begin_layout Subsection
1638 | Neural Networks
1639 | \end_layout
1640 |
1641 | \begin_layout Standard
1642 | Neural Nets explore what you can do by combining perceptrons, each of which
1643 | is a simple linear classifier.
1644 | We use a soft threshold for each activation function
1645 | \begin_inset Formula $\theta$
1646 | \end_inset
1647 |
1648 | because it is twice differentiable.
1649 | \end_layout
1650 |
1651 | \begin_layout Standard
1652 | \begin_inset Graphics
1653 | filename graphics/NN.pdf
1654 | lyxscale 50
1655 | width 72col%
1656 |
1657 | \end_inset
1658 |
1659 |
1660 | \begin_inset space \space{}
1661 | \end_inset
1662 |
1663 |
1664 | \begin_inset Graphics
1665 | filename graphics/NN2.pdf
1666 | lyxscale 35
1667 | width 21col%
1668 |
1669 | \end_inset
1670 |
1671 |
1672 | \end_layout
1673 |
1674 | \begin_layout Standard
1675 |
1676 | \series bold
1677 | Activation Functions:
1678 | \end_layout
1679 |
1680 | \begin_layout Standard
1681 | \begin_inset Formula $\theta(s)=\tanh(s)=\frac{e^{s}-e^{-s}}{e^{s}+e^{-s}}\implies\theta'(s)=1-\theta^{2}(s)$
1682 | \end_inset
1683 |
1684 |
1685 | \end_layout
1686 |
1687 | \begin_layout Standard
1688 | \begin_inset Formula $\theta(s)=\sigma(s)=\frac{1}{1+e^{-s}}\implies\theta'(s)=\sigma(s)(1-\sigma(s))$
1689 | \end_inset
1690 |
1691 |
1692 | \end_layout
1693 |
1694 | \begin_layout Standard
1695 |
1696 | \series bold
1697 | Error Functions
1698 | \series default
1699 | :
1700 | \end_layout
1701 |
1702 | \begin_layout Standard
1703 |
1704 | \family roman
1705 | \series medium
1706 | \shape up
1707 | \size normal
1708 | \emph off
1709 | \bar no
1710 | \strikeout off
1711 | \uuline off
1712 | \uwave off
1713 | \noun off
1714 | \color none
1715 | Cross Entropy Loss
1716 | \begin_inset Formula $-\sum_{i=1}^{n_{out}}[y\log(h_{\theta}(x))+(1-y)\log(1-h_{\theta}(x))]$
1717 | \end_inset
1718 |
1719 |
1720 | \end_layout
1721 |
1722 | \begin_layout Standard
1723 |
1724 | \family roman
1725 | \series medium
1726 | \shape up
1727 | \size normal
1728 | \emph off
1729 | \bar no
1730 | \strikeout off
1731 | \uuline off
1732 | \uwave off
1733 | \noun off
1734 | \color none
1735 | Mean Squared Error
1736 | \begin_inset Formula $\sum_{i=1}^{n_{out}}(y-h_{\theta}(x))^{2}$
1737 | \end_inset
1738 |
1739 |
1740 | \end_layout
1741 |
1742 | \begin_layout Standard
1743 |
1744 | \series bold
1745 | Notation:
1746 | \series default
1747 |
1748 | \end_layout
1749 |
1750 | \begin_layout Enumerate
1751 | \begin_inset Formula $w_{ij}^{(l)}$
1752 | \end_inset
1753 |
1754 | is the weight from neuron
1755 | \begin_inset Formula $i$
1756 | \end_inset
1757 |
1758 | in layer
1759 | \begin_inset Formula $l-1$
1760 | \end_inset
1761 |
1762 | to neuron
1763 | \begin_inset Formula $j$
1764 | \end_inset
1765 |
1766 | in layer
1767 | \begin_inset Formula $l$
1768 | \end_inset
1769 |
1770 | .
1771 | There are
1772 | \begin_inset Formula $d^{(l)}$
1773 | \end_inset
1774 |
1775 | nodes in the
1776 | \begin_inset Formula $l^{\text{th}}$
1777 | \end_inset
1778 |
1779 | layer.
1780 |
1781 | \end_layout
1782 |
1783 | \begin_layout Enumerate
1784 | \begin_inset Formula $L$
1785 | \end_inset
1786 |
1787 | layers, where L is output layer and data is 0th layer.
1788 |
1789 | \end_layout
1790 |
1791 | \begin_layout Enumerate
1792 | \begin_inset Formula $x_{j}^{(l)}=\theta(s_{j}^{(l)})$
1793 | \end_inset
1794 |
1795 | is the output of a neuron.
1796 | It's the activation function applied to the input signal.
1797 |
1798 | \begin_inset Formula $s_{j}^{(l)}=\sum_{i}w_{ij}^{(l)}x_{i}^{(l-1)}$
1799 | \end_inset
1800 |
1801 |
1802 | \end_layout
1803 |
1804 | \begin_layout Enumerate
1805 | \begin_inset Formula $e(w)$
1806 | \end_inset
1807 |
1808 | is the error as a function of the weights
1809 | \end_layout
1810 |
1811 | \begin_layout Standard
1812 |
1813 | \bar under
1814 | The goal is to learn the weights
1815 | \begin_inset Formula $w_{ij}^{(l)}$
1816 | \end_inset
1817 |
1818 | .
1819 |
1820 | \bar default
1821 | We use gradient descent, but the error function is non-convex so we tend to
1822 |  get stuck in local minima.
1823 | The naive version takes
1824 | \begin_inset Formula $O(w^{2})$
1825 | \end_inset
1826 |
1827 | .
1828 |
1829 | \bar under
1830 | Back propagation
1831 | \bar default
1832 | , an algorithm for efficient computation of the gradient, takes
1833 | \begin_inset Formula $O(w)$
1834 | \end_inset
1835 |
1836 | .
1837 | \end_layout
1838 |
1839 | \begin_layout Standard
1840 | \begin_inset Formula $\nabla e(w)\rightarrow\frac{\partial e(w)}{\partial w_{ij}^{(l)}}=\frac{\partial e(w)}{\partial s_{j}^{(l)}}\frac{\partial s_{j}^{(l)}}{\partial w_{ij}^{(l)}}=\delta_{j}^{(l)}x_{i}^{(l-1)}$
1841 | \end_inset
1842 |
1843 |
1844 | \end_layout
1845 |
1846 | \begin_layout Standard
1847 | Final Layer:
1848 | \begin_inset Formula $\delta_{j}^{(L)}=\frac{\partial e(w)}{\partial s_{j}^{(L)}}=\frac{\partial e(w)}{\partial x_{j}^{(L)}}\frac{\partial x_{j}^{(L)}}{\partial s_{j}^{(L)}}=e'(x_{j}^{(L)})\theta_{out}'(s_{j}^{(L)})$
1849 | \end_inset
1850 |
1851 |
1852 | \end_layout
1853 |
1854 | \begin_layout Standard
1855 |
1856 | \family roman
1857 | \series medium
1858 | \shape up
1859 | \size normal
1860 | \emph off
1861 | \bar no
1862 | \strikeout off
1863 | \uuline off
1864 | \uwave off
1865 | \noun off
1866 | \color none
1867 | General:
1868 | \family default
1869 | \series default
1870 | \shape default
1871 | \size default
1872 | \bar default
1873 | \strikeout default
1874 | \uuline default
1875 | \uwave default
1876 | \noun default
1877 | \color inherit
1878 |
1879 | \begin_inset Formula $\delta_{i}^{(l-1)}=\frac{\partial e(w)}{\partial s_{i}^{(l-1)}}=\sum_{j=1}^{d^{(l)}}\frac{\partial e(w)}{\partial s_{j}^{(l)}}\times\frac{\partial s_{j}^{(l)}}{\partial x_{i}^{(l-1)}}\times\frac{\partial x_{i}^{(l-1)}}{\partial s_{i}^{(l-1)}}$
1880 | \end_inset
1881 |
1882 |
1883 | \end_layout
1884 |
1885 | \begin_layout Standard
1886 | \begin_inset Formula $=\sum_{j=1}^{d^{(l)}}\delta_{j}^{(l)}\times w_{ij}^{(l)}\times\theta'(s_{i}^{(l-1)})$
1887 | \end_inset
1888 |
1889 |
1890 | \end_layout
1891 |
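\begin_layout Standard
Example instantiation (assuming squared error and a tanh output unit): 
\begin_inset Formula $e(w)=(x_{j}^{(L)}-y)^{2}\implies\delta_{j}^{(L)}=2(x_{j}^{(L)}-y)\left(1-(x_{j}^{(L)})^{2}\right)$
\end_inset

, which seeds the backward recursion above.
\end_layout
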
1892 | \begin_layout Standard
1893 | \begin_inset Graphics
1894 | filename graphics/NN1.pdf
1895 | lyxscale 50
1896 | width 100col%
1897 |
1898 | \end_inset
1899 |
1900 |
1901 | \end_layout
1902 |
1903 | \begin_layout Section
1904 | Unsupervised Learning
1905 | \end_layout
1906 |
1907 | \begin_layout Subsection
1908 | Clustering
1909 | \end_layout
1910 |
1911 | \begin_layout Standard
1912 | Unsupervised learning (no labels).
1913 | \end_layout
1914 |
1915 | \begin_layout Standard
1916 |
1917 | \series bold
1918 | Distance function
1919 | \series default
1920 | s.
1921 | Suppose we have two sets of points.
1922 | \end_layout
1923 |
1924 | \begin_layout Itemize
1925 |
1926 | \series bold
1927 | Single linkage
1928 | \series default
1929 | is minimum distance between members.
1930 | \end_layout
1931 |
1932 | \begin_layout Itemize
1933 |
1934 | \series bold
1935 | Complete linkage
1936 | \series default
1937 | is maximum distance between members.
1938 | \end_layout
1939 |
1940 | \begin_layout Itemize
1941 |
1942 | \series bold
1943 | Centroid linkage
1944 | \series default
1945 | is distance between centroids.
1946 | \end_layout
1947 |
1948 | \begin_layout Itemize
1949 |
1950 | \series bold
1951 | Average linkage
1952 | \series default
1953 | is average distance between all pairs.
1954 | \end_layout
1955 |
1956 | \begin_layout Standard
1957 |
1958 | \series bold
1959 | Hierarchical
1960 | \series default
1961 | :
1962 | \end_layout
1963 |
1964 | \begin_layout Itemize
1965 |
1966 | \bar under
1967 | Agglomerative
1968 | \bar default
1969 | : Start with n points, merge 2 closest clusters using some measure, such
1970 | as: Single-link (closest pair), Complete-link (furthest pair), Average-link
1971 | (average of all pairs), Centroid (centroid distance).
1972 | \begin_inset Newline newline
1973 | \end_inset
1974 |
1975 | Note: SL and CL are sensitive to outliers.
1976 | \end_layout
1977 |
1978 | \begin_layout Itemize
1979 |
1980 | \bar under
1981 | Divisive
1982 | \bar default
1983 | : Start with single cluster, recursively divide clusters into 2 subclusters.
1984 |
1985 | \end_layout
1986 |
1987 | \begin_layout Standard
1988 |
1989 | \series bold
1990 | Partitioning
1991 | \series default
1992 | : Partition the data into K mutually exclusive, exhaustive groups (i.e.
1993 |  encode k=C(i)).
1994 |  Iteratively reallocate points to minimize some loss function.
1995 |  Finding the optimal partition is hard.
1996 |  Use a greedy algorithm called K-means (coordinate descent).
1997 |  The loss function is non-convex, thus we find local minima.
1998 | \end_layout
1999 |
2000 | \begin_layout Itemize
2001 |
2002 | \series bold
2003 | K-means
2004 | \series default
2005 | : Choose clusters at random, calculate centroid of each cluster, reallocate
2006 | objects to nearest centroid, repeat.
2007 |
2008 | \bar under
2009 | Works with: spherical, well-separated clusters of similar volumes and count.
2010 | \end_layout
2011 |
2012 | \begin_layout Itemize
2013 |
2014 | \series bold
2015 | K-means
2016 | \series default
2017 | ++: Initialize clusters one by one.
2018 | D(x) = distance of point x to nearest cluster.
2019 | Pr(x is new cluster center)
2020 | \begin_inset Formula $\propto D(x)^{2}$
2021 | \end_inset
2022 |
2023 |
2024 | \end_layout
2025 |
2026 | \begin_layout Itemize
2027 |
2028 | \series bold
2029 | K-medians
2030 | \series default
2031 | : Works with arbitrary distance/dissimilarity metric, the centers
2032 | \begin_inset Formula $\mu_{k}$
2033 | \end_inset
2034 |
2035 | are represented by data points.
2036 |  It is more restrictive and thus has higher loss.
2037 | \end_layout
2038 |
2039 | \begin_layout Standard
2040 |
2041 | \series bold
2042 | General Loss
2043 | \series default
2044 | :
2045 | \begin_inset Formula $\sum_{n=1}^{N}\sum_{k=1}^{K}d(x_{n},\mu_{k})r_{nk}$
2046 | \end_inset
2047 |
2048 | where
2049 | \begin_inset Formula $r_{nk}=1$
2050 | \end_inset
2051 |
2052 | if
2053 | \begin_inset Formula $x_{n}$
2054 | \end_inset
2055 |
2056 | is in cluster k, and 0 o.w.
2057 | \end_layout
2058 |
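\begin_layout Standard
1-D toy run (our own numbers): for points 
\begin_inset Formula $\{0,1,10,11\}$
\end_inset

 with 
\begin_inset Formula $K=2$
\end_inset

 and squared Euclidean 
\begin_inset Formula $d$
\end_inset

, K-means converges to centroids 
\begin_inset Formula $\mu_{1}=0.5,\mu_{2}=10.5$
\end_inset

 with loss 
\begin_inset Formula $4\times0.25=1$
\end_inset

; any assignment that mixes the two groups has strictly higher loss.
\end_layout
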
2059 | \begin_layout Subsection
2060 | Vector Quantization
2061 | \end_layout
2062 |
2063 | \begin_layout Standard
2064 | Use clustering to find representative prototype vectors, which are used
2065 | to simplify representations of signals.
2066 | \end_layout
2067 |
2068 | \begin_layout Subsection
2069 | Parametric Density Estimation
2070 | \end_layout
2071 |
2072 | \begin_layout Standard
2073 |
2074 | \series bold
2075 | Mixture Models.
2076 |
2077 | \series default
2078 | Assume the PDF is a mixture of Gaussians with different centers.
2079 |
2080 | \begin_inset Formula $P(x)=\sum_{i=1}^{n_{c}}P(c_{i})P(x|c_{i})$
2081 | \end_inset
2082 |
2083 | with objective function as log likelihood of data.
2084 | Use
2085 | \series bold
2086 | EM
2087 | \series default
2088 | to estimate this model.
2089 |
2090 | \begin_inset Newline newline
2091 | \end_inset
2092 |
2093 | E Step:
2094 | \begin_inset Formula $P(\mu_{i}|x_{k})=\frac{P(\mu_{i})P(x_{k}|\mu_{i})}{\sum_{j}P(\mu_{j})P(x_{k}|\mu_{j})}$
2095 | \end_inset
2096 |
2097 |
2098 | \begin_inset Newline newline
2099 | \end_inset
2100 |
2101 | M Step:
2102 | \begin_inset Formula $P(c_{i})=\frac{1}{n_{e}}\sum_{k=1}^{n_{e}}P(\mu_{i}|x_{k})$
2103 | \end_inset
2104 |
2105 |
2106 | \begin_inset Newline newline
2107 | \end_inset
2108 |
2109 |
2110 | \begin_inset Formula $\mu_{i}=\frac{\sum_{k}x_{k}P(\mu_{i}|x_{k})}{\sum_{k}P(\mu_{i}|x_{k})}$
2111 | \end_inset
2112 |
2113 |
2114 | \begin_inset Newline newline
2115 | \end_inset
2116 |
2117 |
2118 | \begin_inset Formula $\sigma_{i}^{2}=\frac{\sum_{k}(x_{k}-\mu_{i})^{2}P(\mu_{i}|x_{k})}{\sum_{k}P(\mu_{i}|x_{k})}$
2119 | \end_inset
2120 |
2121 | .
2122 |
2123 | \end_layout
2124 |
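\begin_layout Standard
E-step toy computation (illustrative numbers): with two equally weighted components, 
\begin_inset Formula $P(\mu_{1})=P(\mu_{2})=0.5$
\end_inset

, and likelihoods 
\begin_inset Formula $P(x_{k}|\mu_{1})=0.2,P(x_{k}|\mu_{2})=0.05$
\end_inset

, the responsibility is 
\begin_inset Formula $P(\mu_{1}|x_{k})=\frac{0.5\cdot0.2}{0.5\cdot0.2+0.5\cdot0.05}=0.8$
\end_inset

; the M-step then re-estimates each 
\begin_inset Formula $\mu_{i}$
\end_inset

 as a responsibility-weighted mean.
\end_layout
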
2125 | \begin_layout Subsection
2126 | Non-parametric Density Estimation
2127 | \end_layout
2128 |
2129 | \begin_layout Standard
2130 | Can use
2131 | \series bold
2132 | Histogram
2133 | \series default
2134 | or Kernel Density Estimation (KDE).
2135 | \end_layout
2136 |
2137 | \begin_layout Standard
2138 |
2139 | \series bold
2140 | KDE
2141 | \series default
2142 | :
2143 | \begin_inset Formula $P(x)=\frac{1}{n}\sum K({\bf x}-{\bf x_{i}})$
2144 | \end_inset
2145 |
2146 | is a function of the data.
2147 | \end_layout
2148 |
2149 | \begin_layout Standard
2150 | The kernel K has the following properties:
2151 | \begin_inset Newline newline
2152 | \end_inset
2153 |
2154 | Symmetric, Normalized
2155 | \begin_inset Formula $\int_{\mathbb{R}^{d}}K(x)dx=1$
2156 | \end_inset
2157 |
2158 | , and
2159 | \begin_inset Formula $\lim_{||x||\rightarrow\infty}||x||^{d}K(x)=0$
2160 | \end_inset
2161 |
2162 | .
2163 | \end_layout
2164 |
2165 | \begin_layout Standard
2166 | The
2167 | \bar under
2168 | bandwidth
2169 | \bar default
2170 | is the width of the kernel function.
2171 | Too small = jagged results, too large = smoothed out results.
2172 | \end_layout
2173 |
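\begin_layout Standard
Standard example satisfying these properties: the Gaussian kernel with bandwidth 
\begin_inset Formula $h$
\end_inset

, 
\begin_inset Formula $K_{h}(x)=\frac{1}{(2\pi h^{2})^{d/2}}\exp\left(-\frac{||x||^{2}}{2h^{2}}\right)$
\end_inset

; it is symmetric, integrates to 1, and its tails decay fast enough that 
\begin_inset Formula $||x||^{d}K_{h}(x)\rightarrow0$
\end_inset

.
\end_layout
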
2174 | \begin_layout Subsection
2175 |
2176 | \series bold
2177 | Principal Component Analysis
2178 | \end_layout
2179 |
2180 | \begin_layout Standard
2181 | First run
2182 | \series bold
2183 | singular value decomposition
2184 | \series default
2185 | on
2186 | \series bold
2187 |
2188 | \series default
2189 | pattern matrix
2190 | \begin_inset Formula $X$
2191 | \end_inset
2192 |
2193 | :
2194 | \end_layout
2195 |
2196 | \begin_layout Enumerate
2197 | Subtract mean from each point
2198 | \end_layout
2199 |
2200 | \begin_layout Enumerate
2201 | (Sometimes) scale each dimension by its standard deviation
2202 | \end_layout
2203 |
2204 | \begin_layout Enumerate
2205 | Compute covariance
2206 | \begin_inset Formula $\Sigma=X^{T}X$
2207 | \end_inset
2208 |
2209 | (must be symmetric)
2210 | \end_layout
2211 |
2212 | \begin_layout Enumerate
2213 | Compute eigenvectors/values
2214 | \begin_inset Formula $\Sigma=VSV^{\intercal}$
2215 | \end_inset
2216 |
2217 | (spectral thm)
2218 | \end_layout
2219 |
2220 | \begin_layout Enumerate
2221 | Get back
2222 | \begin_inset Formula $X=XVV^{\intercal}=(XV)V^{\intercal}=USV^{\intercal}\text{ (with }US=XV\text{)}$
2223 | \end_inset
2224 |
2225 |
2226 | \end_layout
2227 |
2228 | \begin_layout Standard
2229 | \begin_inset Formula $S$
2230 | \end_inset
2231 |
2232 | contains the eigenvalues of the transformed features.
2233 | The larger the
2234 | \begin_inset Formula $S_{ii}$
2235 | \end_inset
2236 |
2237 | , the larger the variance of that feature.
2238 | We want the
2239 | \begin_inset Formula $k$
2240 | \end_inset
2241 |
2242 | largest features, so we find the indices of the
2243 | \begin_inset Formula $k$
2244 | \end_inset
2245 |
2246 | largest items in
2247 | \begin_inset Formula $S$
2248 | \end_inset
2249 |
2250 | and we keep only these entries in
2251 | \begin_inset Formula $U$
2252 | \end_inset
2253 |
2254 | and
2255 | \begin_inset Formula $V$
2256 | \end_inset
2257 |
2258 | .
2259 | \end_layout
2260 |
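\begin_layout Standard
Small example (our own numbers): if the eigenvalues are 
\begin_inset Formula $S_{11}=4,S_{22}=1$
\end_inset

, keeping 
\begin_inset Formula $k=1$
\end_inset

 component retains 
\begin_inset Formula $\frac{4}{4+1}=80\%$
\end_inset

 of the variance, and the reduced representation of the data is the projection 
\begin_inset Formula $XV_{k}$
\end_inset

 (the first 
\begin_inset Formula $k$
\end_inset

 columns of 
\begin_inset Formula $V$
\end_inset

).
\end_layout
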
2261 | \begin_layout Standard
2262 | \begin_inset VSpace vfill
2263 | \end_inset
2264 |
2265 |
2266 | \end_layout
2267 |
2268 | \begin_layout Standard
2269 | \begin_inset ERT
2270 | status open
2271 |
2272 | \begin_layout Plain Layout
2273 |
2274 |
2275 | \backslash
2276 | end{multicols}
2277 | \end_layout
2278 |
2279 | \end_inset
2280 |
2281 |
2282 | \end_layout
2283 |
2284 | \begin_layout Standard
2285 | \begin_inset Newpage newpage
2286 | \end_inset
2287 |
2288 |
2289 | \end_layout
2290 |
2291 | \begin_layout Standard
2292 | \begin_inset ERT
2293 | status open
2294 |
2295 | \begin_layout Plain Layout
2296 |
2297 |
2298 | \backslash
2299 | mytitle{CS 189 ALL OF IT}{Che Yeon, Chloe, Dhruv, Li, Sean}
2300 | \end_layout
2301 |
2302 | \begin_layout Plain Layout
2303 |
2304 |
2305 | \backslash
2306 | begin{multicols}{4}
2307 | \end_layout
2308 |
2309 | \end_inset
2310 |
2311 |
2312 | \end_layout
2313 |
2314 | \begin_layout Section
2315 | Past Exam Questions
2316 | \end_layout
2317 |
2318 | \begin_layout Standard
2319 | \begin_inset ERT
2320 | status collapsed
2321 |
2322 | \begin_layout Plain Layout
2323 |
2324 |
2325 | \backslash
2326 | bgroup
2327 | \end_layout
2328 |
2329 | \begin_layout Plain Layout
2330 |
2331 |
2332 | \backslash
2333 | renewcommand
2334 | \backslash
2335 | theenumi{(
2336 | \backslash
2337 | alph{enumi})}
2338 | \end_layout
2339 |
2340 | \begin_layout Plain Layout
2341 |
2342 |
2343 | \backslash
2344 | renewcommand
2345 | \backslash
2346 | labelenumi{
2347 | \backslash
2348 | theenumi}
2349 | \end_layout
2350 |
2351 | \end_inset
2352 |
2353 |
2354 | \end_layout
2355 |
2356 | \begin_layout Subsection
2357 | Spring 2013 Midterm
2358 | \end_layout
2359 |
2360 | \begin_layout Enumerate
2361 |
2362 | \bar under
2363 | False:
2364 | \bar default
2365 | In SVMs, we maximize
2366 | \begin_inset Formula $\frac{\left\Vert w\right\Vert ^{2}}{2}$
2367 | \end_inset
2368 |
2369 | subject to the margin constraints.
2370 | \end_layout
2371 |
2372 | \begin_layout Enumerate
2373 |
2374 | \bar under
2375 | False:
2376 | \bar default
2377 | In kernelized SVMs, the kernel matrix
2378 | \begin_inset Formula $K$
2379 | \end_inset
2380 |
2381 | has to be positive definite.
2382 | \end_layout
2383 |
2384 | \begin_layout Enumerate
2385 |
2386 | \bar under
2387 | True:
2388 | \bar default
2389 | If two random variables are independent, then they have to be uncorrelated.
2390 | \end_layout
2391 |
2392 | \begin_layout Enumerate
2393 |
2394 | \bar under
2395 | False:
2396 | \bar default
2397 | Isocontours of Gaussian distributions have axes whose lengths are proportional
2398 | to the eigenvalues of the covariance matrix.
2399 | \end_layout
2400 |
2401 | \begin_layout Enumerate
2402 |
2403 | \bar under
2404 | True:
2405 | \bar default
2406 | The RBF kernel
2407 | \begin_inset Formula $K\left(x_{i},x_{j}\right)=\exp\left(-\gamma\left\Vert x_{i}-x_{j}\right\Vert ^{2}\right)$
2408 | \end_inset
2409 |
2410 | corresponds to an infinite dimensional mapping of the feature vectors.
2411 | \end_layout
2412 |
2413 | \begin_layout Enumerate
2414 |
2415 | \bar under
2416 | True:
2417 | \bar default
2418 | If
2419 | \begin_inset Formula $(X,Y)$
2420 | \end_inset
2421 |
2422 | are jointly Gaussian, then
2423 | \begin_inset Formula $X$
2424 | \end_inset
2425 |
2426 | and
2427 | \begin_inset Formula $Y$
2428 | \end_inset
2429 |
2430 | are also Gaussian distributed.
2431 | \end_layout
2432 |
2433 | \begin_layout Enumerate
2434 |
2435 | \bar under
2436 | True:
2437 | \bar default
2438 | A function f(x,y,z) is convex if the Hessian of f is positive semi-definite.
2439 | \end_layout
2440 |
2441 | \begin_layout Enumerate
2442 |
2443 | \bar under
2444 | True:
2445 | \bar default
2446 | In a least-squares linear regression problem, adding an L2 regularization
2447 | penalty cannot decrease the L2 error of the solution w on the training
2448 | data.
2449 | \end_layout
2450 |
2451 | \begin_layout Enumerate
2452 |
2453 | \bar under
2454 | True:
2455 | \bar default
2456 | In linear SVMs, the optimal weight vector w is a linear combination of
2457 | training data points.
2458 | \end_layout
2459 |
2460 | \begin_layout Enumerate
2461 |
2462 | \bar under
2463 | False:
2464 | \bar default
2465 | In stochastic gradient descent, we take steps in the exact direction of
2466 | the gradient vector.
2467 | \end_layout
2468 |
2469 | \begin_layout Enumerate
2470 |
2471 | \bar under
2472 | False:
2473 | \bar default
2474 | In a two class problem when the class conditionals
2475 | \begin_inset Formula $P\left[x\mid y=0\right]\text{ and }P\left[x\mid y=1\right]$
2476 | \end_inset
2477 |
2478 | are modeled as Gaussians with different covariance matrices, the posterior
2479 | probabilities turn out to be logistic functions.
2480 | \end_layout
2481 |
2482 | \begin_layout Enumerate
2483 |
2484 | \bar under
2485 | True:
2486 | \bar default
2487 | The perceptron training procedure is guaranteed to converge if the two
2488 | classes are linearly separable.
2489 | \end_layout
2490 |
2491 | \begin_layout Enumerate
2492 |
2493 | \bar under
2494 | False:
2495 | \bar default
2496 | The maximum likelihood estimate for the variance of a univariate Gaussian
2497 | is unbiased.
2498 | \end_layout
2499 |
2500 | \begin_layout Enumerate
2501 |
2502 | \bar under
2503 | True:
2504 | \bar default
2505 | In linear regression, using an L1 regularization penalty term results in
2506 | sparser solutions than using an L2 regularization penalty term.
2507 |
2508 | \end_layout
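     |
     | \begin_layout Standard
     | Why the isocontour statement above is false (our addition): the axes of a Gaussian's isocontours point along the eigenvectors of
     | \begin_inset Formula $\Sigma$
     | \end_inset
     |
     | and their lengths scale with
     | \begin_inset Formula $\sqrt{\lambda_{i}}$
     | \end_inset
     |
     | , not with
     | \begin_inset Formula $\lambda_{i}$
     | \end_inset
     |
     | .
     | \end_layout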
2509 |
2510 | \begin_layout Subsection
2511 | Spring 2013 Final
2512 | \end_layout
2513 |
2514 | \begin_layout Enumerate
2515 |
2516 | \bar under
2517 | True:
2518 | \bar default
2519 | Solving a non linear separation problem with a hard margin Kernelized SVM
2520 | (Gaussian RBF Kernel) might lead to overfitting.
2521 | \end_layout
2522 |
2523 | \begin_layout Enumerate
2524 |
2525 | \bar under
2526 | True:
2527 | \bar default
2528 | In SVMs, the sum of the Lagrange multipliers corresponding to the positive
2529 | examples is equal to the sum of the Lagrange multipliers corresponding
2530 | to the negative examples.
2531 | \end_layout
2532 |
2533 | \begin_layout Enumerate
2534 |
2535 | \bar under
2536 | False:
2537 | \bar default
2538 | SVMs directly give us the posterior probabilities
2539 | \begin_inset Formula $\mathrm{P}\left(y=1\mid x\right)$
2540 | \end_inset
2541 |
2542 | and
2543 | \begin_inset Formula $\mathrm{P}\left(y=-1\mid x\right)$
2544 | \end_inset
2545 |
2546 | .
2547 | \end_layout
2548 |
2549 | \begin_layout Enumerate
2550 |
2551 | \bar under
2552 | False:
2553 | \bar default
2554 |
2555 | \begin_inset Formula $V(X)=\mathrm{E}[X]^{2}-\mathrm{E}[X^{2}]$
2556 | \end_inset
2557 |
2558 |
2559 | \end_layout
2560 |
2561 | \begin_layout Enumerate
2562 |
2563 | \bar under
2564 | True:
2565 | \bar default
2566 | In the discriminative approach to solving classification problems, we model
2567 | the conditional probability of the labels given the observations.
2568 | \end_layout
2569 |
2570 | \begin_layout Enumerate
2571 |
2572 | \bar under
2573 | False:
2574 | \bar default
2575 | In a two class classification problem, a point on the Bayes optimal decision
2576 | boundary x* always satisfies
2577 | \begin_inset Formula $\mathrm{P}\left[y=1\mid x*\right]=\mathrm{P}\left[y=0\mid x*\right]$
2578 | \end_inset
2579 |
2580 | .
2581 | \end_layout
2582 |
2583 | \begin_layout Enumerate
2584 |
2585 | \bar under
2586 | True:
2587 | \bar default
2588 | Any linear combination of the components of a multivariate Gaussian is
2589 | a univariate Gaussian.
2590 | \end_layout
2591 |
2592 | \begin_layout Enumerate
2593 |
2594 | \bar under
2595 | False:
2596 | \bar default
2597 | For any two random variables
2598 | \begin_inset Formula $X\sim N\left(\mu_{1},\sigma_{1}^{2}\right)$
2599 | \end_inset
2600 |
2601 | and
2602 | \begin_inset Formula $Y\sim\mathcal{N}\left(\mu_{2},\sigma_{2}^{2}\right)$
2603 | \end_inset
2604 |
2605 | ,
2606 | \begin_inset Formula $X+Y\sim\mathcal{N}\left(\mu_{1}+\mu_{2},\sigma_{1}^{2}+\sigma_{2}^{2}\right)$
2607 | \end_inset
2608 |
2609 | .
2610 | \end_layout
2611 |
2612 | \begin_layout Enumerate
2613 |
2614 | \bar under
2615 | False:
2616 | \bar default
2617 | For a logistic regression problem, differing initialization points can lead
2618 | to a much better optimum.
2619 | \end_layout
2620 |
2621 | \begin_layout Enumerate
2622 |
2623 | \bar under
2624 | False:
2625 | \bar default
2626 | In logistic regression, we model the odds ratio
2627 | \begin_inset Formula $\frac{p}{1-p}$
2628 | \end_inset
2629 |
2630 | as a linear function.
2631 | \end_layout
2632 |
2633 | \begin_layout Enumerate
2634 |
2635 | \bar under
2636 | True:
2637 | \bar default
2638 | Random forests can be used to classify infinite dimensional data.
2639 | \end_layout
2640 |
2641 | \begin_layout Enumerate
2642 |
2643 | \bar under
2644 | False:
2645 | \bar default
2646 | In boosting we start with a Gaussian weight distribution over the training
2647 | samples.
2648 | \end_layout
2649 |
2650 | \begin_layout Enumerate
2651 |
2652 | \bar under
2653 | False:
2654 | \bar default
2655 | In Adaboost, the error of each hypothesis is calculated by the ratio of
2656 | misclassified examples to the total number of examples.
2657 | \end_layout
2658 |
2659 | \begin_layout Enumerate
2660 |
2661 | \bar under
2662 | True:
2663 | \bar default
2664 | When
2665 | \begin_inset Formula $k=1$
2666 | \end_inset
2667 |
2668 | and
2669 | \begin_inset Formula $N\rightarrow\infty$
2670 | \end_inset
2671 |
2672 | , the kNN classification error rate is bounded above by twice the Bayes error
2673 | rate.
2674 | \end_layout
2675 |
2676 | \begin_layout Enumerate
2677 |
2678 | \bar under
2679 | True:
2680 | \bar default
2681 | A single layer neural network with a sigmoid activation for binary classificati
2682 | on with the cross entropy loss is exactly equivalent to logistic regression.
2683 | \end_layout
2684 |
2685 | \begin_layout Enumerate
2686 |
2687 | \bar under
2688 | True:
2689 | \bar default
2690 | Convolution is a linear operation i.e.
2691 |
2692 | \begin_inset Formula $\left(\alpha f_{1}+\beta f_{2}\right)\ast g=\alpha f_{1}\ast g+\beta f_{2}\ast g$
2693 | \end_inset
2694 |
2695 | .
2696 | \end_layout
2697 |
2698 | \begin_layout Enumerate
2699 |
2700 | \bar under
2701 | True:
2702 | \bar default
2703 | The k-means algorithm does coordinate descent on a non-convex objective
2704 | function.
2705 | \end_layout
2706 |
2707 | \begin_layout Enumerate
2708 |
2709 | \bar under
2710 | True:
2711 | \bar default
2712 | A 1-NN classifier has higher variance than a 3-NN classifier.
2713 | \end_layout
2714 |
2715 | \begin_layout Enumerate
2716 |
2717 | \bar under
2718 | False:
2719 | \bar default
2720 | The single link agglomerative clustering algorithm groups two clusters
2721 | on the basis of the maximum distance between points in the two clusters.
2722 | \end_layout
2723 |
2724 | \begin_layout Enumerate
2725 |
2726 | \bar under
2727 | False:
2728 | \bar default
2729 | The largest eigenvector of the covariance matrix is the direction of minimum
2730 | variance in the data.
2731 | \end_layout
2732 |
2733 | \begin_layout Enumerate
2734 |
2735 | \bar under
2736 | False:
2737 | \bar default
2738 | The eigenvectors of
2739 | \begin_inset Formula $AA^{T}$
2740 | \end_inset
2741 |
2742 | and
2743 | \begin_inset Formula $A^{T}A$
2744 | \end_inset
2745 |
2746 | are the same.
2747 | \end_layout
2748 |
2749 | \begin_layout Enumerate
2750 |
2751 | \bar under
2752 | True:
2753 | \bar default
2754 | The non-zero eigenvalues of
2755 | \begin_inset Formula $AA^{T}$
2756 | \end_inset
2757 |
2758 | and
2759 | \begin_inset Formula $A^{T}A$
2760 | \end_inset
2761 |
2762 | are the same.
2763 | \end_layout
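     |
     | \begin_layout Standard
     | Two quick checks for the statements above (our addition): the correct variance identity is
     | \begin_inset Formula $V(X)=\mathrm{E}[X^{2}]-\mathrm{E}[X]^{2}$
     | \end_inset
     |
     | ; and if
     | \begin_inset Formula $A^{\intercal}Av=\lambda v$
     | \end_inset
     |
     | with
     | \begin_inset Formula $\lambda\neq0$
     | \end_inset
     |
     | , then
     | \begin_inset Formula $AA^{\intercal}(Av)=\lambda(Av)$
     | \end_inset
     |
     | , so the non-zero eigenvalues coincide but the eigenvectors generally differ.
     | \end_layout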
2764 |
2765 | \begin_layout Standard
2766 | \begin_inset Phantom Phantom
2767 | status open
2768 |
2769 | \begin_layout Plain Layout
2770 |
2771 | \end_layout
2772 |
2773 | \end_inset
2774 |
2775 |
2776 | \end_layout
2777 |
2778 | \begin_layout Enumerate
2779 | In linear regression, the irreducible error is
2780 | \bar under
2781 |
2782 | \begin_inset Formula $\sigma^{2}$
2783 | \end_inset
2784 |
2785 |
2786 | \bar default
2787 | and
2788 | \begin_inset Formula $\boxed{E\left[\left(y-\mathrm{E}(y\mid x)\right)^{^{2}}\right]}$
2789 | \end_inset
2790 |
2791 | .
2792 | \end_layout
2793 |
2794 | \begin_layout Enumerate
2795 | Let
2796 | \begin_inset Formula $S_{1}$
2797 | \end_inset
2798 |
2799 | and
2800 | \begin_inset Formula $S_{2}$
2801 | \end_inset
2802 |
2803 | be the support vectors for
2804 | \begin_inset Formula $w_{1}$
2805 | \end_inset
2806 |
2807 | (hard margin) and
2808 | \begin_inset Formula $w_{2}$
2809 | \end_inset
2810 |
2811 | (soft margin).
2812 | Then
2813 | \bar under
2814 |
2815 | \begin_inset Formula $S_{1}$
2816 | \end_inset
2817 |
2818 | may not be a subset of
2819 | \begin_inset Formula $S_{2}$
2820 | \end_inset
2821 |
2822 |
2823 | \bar default
2824 | and
2825 | \bar under
2826 |
2827 | \begin_inset Formula $w_{1}$
2828 | \end_inset
2829 |
2830 | may not be equal to
2831 | \begin_inset Formula $w_{2}$
2832 | \end_inset
2833 |
2834 |
2835 | \bar default
2836 | .
2837 | \end_layout
2838 |
2839 | \begin_layout Enumerate
2840 | Ordinary least squares regression assumes each data point is generated according
2841 | to a linear function of the input plus
2842 | \begin_inset Formula $\mathcal{N}(0,\sigma)$
2843 | \end_inset
2844 |
2845 | noise.
2846 | In many systems, the noise variance is a positive linear function of the
2847 | input.
2848 | In this case, the probability model that describes this situation is
2849 | \begin_inset Formula $\boxed{\ensuremath{P(y|x)=\frac{1}{\sigma\sqrt{2\pi x}}\exp\left(-\frac{(y-(w_{0}+w_{1}x))^{2}}{2x\sigma^{2}}\right)}}$
2850 | \end_inset
2851 |
2852 | .
2853 | \end_layout
2854 |
2855 | \begin_layout Enumerate
2856 | Averaging the outputs of multiple decision trees helps
2857 | \bar under
2858 | reduce variance
2859 | \bar default
2860 | .
2861 | \end_layout
2862 |
2863 | \begin_layout Enumerate
2864 | The following loss functions are convex:
2865 | \bar under
2866 | logistic
2867 | \bar default
2868 | ,
2869 | \bar under
2870 | hinge
2871 | \bar default
2872 | ,
2873 | \bar under
2874 | exponential
2875 | \bar default
2876 | .
2877 |
2878 | \bar under
2879 | Misclassification loss is not.
2880 | \end_layout
2881 |
2882 | \begin_layout Enumerate
2883 |
2884 | \bar under
2885 | Bias will be smaller
2886 | \bar default
2887 | and
2888 | \bar under
2889 | variance will be larger
2890 | \bar default
2891 | for trees of
2892 | \bar under
2893 | smaller depth
2894 | \bar default
2895 | .
2896 | \end_layout
2897 |
2898 | \begin_layout Enumerate
2899 | If making a tree with
2900 | \begin_inset Formula $k$
2901 | \end_inset
2902 |
2903 | -ary splits,
2904 | \bar under
2905 | the algorithm will prefer high values of
2906 | \begin_inset Formula $k$
2907 | \end_inset
2908 |
2909 |
2910 | \bar default
2911 | and
2912 | \bar under
2913 | there will be
2914 | \begin_inset Formula $k-1$
2915 | \end_inset
2916 |
2917 | thresholds for a
2918 | \begin_inset Formula $k$
2919 | \end_inset
2920 |
2921 | -ary split
2922 | \bar default
2923 | .
2924 | \end_layout
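     |
     | \begin_layout Standard
     | For reference (our addition), with margin
     | \begin_inset Formula $z=yf(x)$
     | \end_inset
     |
     | the convex losses above are
     | \begin_inset Formula $\log\left(1+e^{-z}\right)$
     | \end_inset
     |
     | (logistic),
     | \begin_inset Formula $(1-z)_{+}$
     | \end_inset
     |
     | (hinge), and
     | \begin_inset Formula $e^{-z}$
     | \end_inset
     |
     | (exponential); misclassification loss
     | \begin_inset Formula $\mathbf{1}\left[z\leq0\right]$
     | \end_inset
     |
     | is not convex.
     | \end_layout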
2925 |
2926 | \begin_layout Standard
2927 | \begin_inset VSpace vfill
2928 | \end_inset
2929 |
2930 |
2931 | \end_layout
2932 |
2933 | \begin_layout Standard
2934 | \begin_inset ERT
2935 | status open
2936 |
2937 | \begin_layout Plain Layout
2938 |
2939 |
2940 | \backslash
2941 | columnbreak
2942 | \end_layout
2943 |
2944 | \end_inset
2945 |
2946 |
2947 | \end_layout
2948 |
2949 | \begin_layout Subsection
2950 | Spring 2014 Final
2951 | \end_layout
2952 |
2953 | \begin_layout Enumerate
2954 |
2955 | \bar under
2956 | False:
2957 | \bar default
2958 | The singular value decomposition of a real matrix is unique.
2959 | \end_layout
2960 |
2961 | \begin_layout Enumerate
2962 |
2963 | \bar under
2964 | True:
2965 | \bar default
2966 | A multiple-layer neural network with linear activation functions is equivalent
2967 | to one single-layer perceptron that uses the same error function on the
2968 | output layer and has the same number of inputs.
2969 | \end_layout
2970 |
2971 | \begin_layout Enumerate
2972 |
2973 | \bar under
2974 | False:
2975 | \bar default
2976 | The maximum likelihood estimator for the parameter
2977 | \begin_inset Formula $\theta$
2978 | \end_inset
2979 |
2980 | of a uniform distribution over
2981 | \begin_inset Formula $[0,\theta]$
2982 | \end_inset
2983 |
2984 | is unbiased.
2985 | \end_layout
2986 |
2987 | \begin_layout Enumerate
2988 |
2989 | \bar under
2990 | True:
2991 | \bar default
2992 | The k-means algorithm for clustering is guaranteed to converge to a local
2993 | optimum.
2994 | \end_layout
2995 |
2996 | \begin_layout Enumerate
2997 |
2998 | \bar under
2999 | True:
3000 | \bar default
3001 | Increasing the depth of a decision tree cannot increase its training error.
3002 | \end_layout
3003 |
3004 | \begin_layout Enumerate
3005 |
3006 | \bar under
3007 | False:
3008 | \bar default
3009 | There exists a one-to-one feature mapping
3010 | \begin_inset Formula $\phi$
3011 | \end_inset
3012 |
3013 | for every valid kernel k.
3014 | \end_layout
3015 |
3016 | \begin_layout Enumerate
3017 |
3018 | \bar under
3019 | True:
3020 | \bar default
3021 | For high-dimensional data, k-d trees can be slower than brute force
3022 | nearest neighbor search.
3023 | \end_layout
3024 |
3025 | \begin_layout Enumerate
3026 |
3027 | \bar under
3028 | True:
3029 | \bar default
3030 | If we had infinite data and infinitely fast computers, kNN would be the
3031 | only algorithm we would study in CS 189.
3032 | \end_layout
3033 |
3034 | \begin_layout Enumerate
3035 |
3036 | \bar under
3037 | True:
3038 | \bar default
3039 | For datasets with high label noise (many data points with incorrect labels),
3040 | random forests would generally perform better than boosted decision trees.
3041 | \end_layout
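     |
     | \begin_layout Standard
     | Why the uniform-MLE statement above is false (our addition): the MLE is
     | \begin_inset Formula $\hat{\theta}=\max_{i}x_{i}$
     | \end_inset
     |
     | , and
     | \begin_inset Formula $\mathrm{E}[\hat{\theta}]=\frac{n}{n+1}\theta<\theta$
     | \end_inset
     |
     | , so it is biased (low).
     | \end_layout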
3042 |
3043 | \begin_layout Standard
3044 | \begin_inset Phantom Phantom
3045 | status open
3046 |
3047 | \begin_layout Plain Layout
3048 |
3049 | \end_layout
3050 |
3051 | \end_inset
3052 |
3053 |
3054 | \end_layout
3055 |
3056 | \begin_layout Enumerate
3057 | In Homework 4, you fit a logistic regression model on spam and ham data
3058 | for a Kaggle Comp.
3059 | Assume you had a very good score on the public test set, but when the GSIs
3060 | ran your model on a private test set, your score dropped a lot.
3061 | This is likely because you overfitted by submitting multiple times and
3062 | changing the following between submiss
3063 | \bar under
3064 | ions:
3065 | \begin_inset Formula $\lambda$
3066 | \end_inset
3067 |
3068 | , your penalty term
3069 | \bar default
3070 | ;
3071 | \bar under
3072 |
3073 | \begin_inset Formula $\varepsilon$
3074 | \end_inset
3075 |
3076 | , your convergence criterion
3077 | \bar default
3078 | ;
3079 | \bar under
3080 | your step size
3081 | \bar default
3082 | ;
3083 | \bar under
3084 | fixing a random bug
3085 | \bar default
3086 | .
3087 | \end_layout
3088 |
3089 | \begin_layout Enumerate
3090 | Given
3091 | \begin_inset Formula $d$
3092 | \end_inset
3093 |
3094 | -dimensional data
3095 | \begin_inset Formula $\{x_{i}\}_{i=1}^{N}$
3096 | \end_inset
3097 |
3098 | , you run principal component analysis and pick
3099 | \begin_inset Formula $P$
3100 | \end_inset
3101 |
3102 | principal components.
3103 | Can you always reconstruct any data point
3104 | \emph on
3105 |
3106 | \begin_inset Formula $x_{i}$
3107 | \end_inset
3108 |
3109 |
3110 | \emph default
3111 | for
3112 | \begin_inset Formula $i$
3113 | \end_inset
3114 |
3115 | from
3116 | \begin_inset Formula $1$
3117 | \end_inset
3118 |
3119 | to
3120 | \begin_inset Formula $N$
3121 | \end_inset
3122 |
3123 | from the
3124 | \begin_inset Formula $P$
3125 | \end_inset
3126 |
3127 | principal components with zero reconstruction error?
3128 | \bar under
3129 | Yes, if
3130 | \begin_inset Formula $P=d$
3131 | \end_inset
3132 |
3133 | .
3134 | \end_layout
3135 |
3136 | \begin_layout Enumerate
3137 | Putting a standard Gaussian prior on the weights for linear regression
3138 | \begin_inset Formula $(w\sim N(0,I))$
3139 | \end_inset
3140 |
3141 | will result in what type of posterior distribution on the weights?
3142 | \bar under
3143 | Gaussian.
3144 | \end_layout
3145 |
3146 | \begin_layout Enumerate
3147 | Suppose we have
3148 | \begin_inset Formula $N$
3149 | \end_inset
3150 |
3151 | instances of d-dimensional data.
3152 | Let
3153 | \begin_inset Formula $h$
3154 | \end_inset
3155 |
3156 | be the amount of data storage necessary for a histogram with a fixed number
3157 | of ticks per axis, and let
3158 | \begin_inset Formula $k$
3159 | \end_inset
3160 |
3161 | be the amount of data storage necessary for kernel density estimation.
3162 | Which of the following is true about
3163 | \begin_inset Formula $h$
3164 | \end_inset
3165 |
3166 | and
3167 | \begin_inset Formula $k$
3168 | \end_inset
3169 |
3170 | ?
3171 | \bar under
3172 |
3173 | \begin_inset Formula $h$
3174 | \end_inset
3175 |
3176 | grows exponentially with
3177 | \bar default
3178 |
3179 | \begin_inset Formula $d$
3180 | \end_inset
3181 |
3182 | , and
3183 | \bar under
3184 |
3185 | \begin_inset Formula $k$
3186 | \end_inset
3187 |
3188 | grows linearly with
3189 | \begin_inset Formula $N$
3190 | \end_inset
3191 |
3192 |
3193 | \bar default
3194 | .
3195 | \end_layout
3196 |
3197 | \begin_layout Enumerate
3198 | John just trained a decision tree for a digit recognition.
3199 | He notices an extremely low training error, but an abnormally large test
3200 | error.
3201 | He also notices that an SVM with a linear kernel performs much better than
3202 | his tree.
3203 | What could be the cause of his problem?
3204 | \bar under
3205 | Decision tree is too deep
3206 | \bar default
3207 | ;
3208 | \bar under
3209 | decision tree is overfitting
3210 | \bar default
3211 | .
3212 | \end_layout
3213 |
3214 | \begin_layout Enumerate
3215 | John has now switched to multilayer neural networks and notices that the
3216 | training error is going down and converges to a local minimum.
3217 | Then when he tests on the new data, the test error is abnormally high.
3218 | What is probably going wrong and what do you recommend him to do?
3219 | \bar under
3220 | The training data size is not large enough, so collect more training
3221 | data and retrain on it
3222 | \bar default
3223 | ;
3224 | \bar under
3225 | play with learning rate and add regularization term to objective function
3226 | \bar default
3227 | ;
3228 | \bar under
3229 | use a different initialization and train the network several times and use
3230 | the average of predictions from all nets to predict test data
3231 | \bar default
3232 | ;
3233 | \bar under
3234 | use the same training data but use fewer hidden layers
3235 | \bar default
3236 | .
3237 | \end_layout
3238 |
3239 | \begin_layout Subsection
3240 | Spring 2015 Midterm
3241 | \end_layout
3242 |
3243 | \begin_layout Enumerate
3244 |
3245 | \bar under
3246 | True:
3247 | \bar default
3248 | If the data is not linearly separable, there is no solution to hard margin
3249 | SVM.
3250 | \end_layout
3251 |
3252 | \begin_layout Enumerate
3253 |
3254 | \bar under
3255 | True:
3256 | \bar default
3257 | logistic regression can be used for classification.
3258 | \end_layout
3259 |
3260 | \begin_layout Enumerate
3261 |
3262 | \bar under
3263 | False:
3264 | \bar default
3265 | Two ways to prevent beta vectors from getting too large are to use a small
3266 | step size and use a small regularization value
3267 | \end_layout
3268 |
3269 | \begin_layout Enumerate
3270 |
3271 | \bar under
3272 | False:
3273 | \bar default
3274 | The L2 norm is often used because it produces sparse results, as opposed
3275 | to the L1 norm which does not
3276 | \end_layout
3277 |
3278 | \begin_layout Enumerate
3279 |
3280 | \bar under
3281 | False:
3282 | \bar default
3283 | For a multivariate Gaussian, the eigenvalues of the covariance matrix are
3284 | inversely proportional to the lengths of the ellipsoid axes that determine
3285 | the isocontours of the density.
3286 | \end_layout
3287 |
3288 | \begin_layout Enumerate
3289 |
3290 | \bar under
3291 | True:
3292 | \bar default
3293 | In a generative binary classification model where we assume the class condition
3294 | als are distributed as Poisson and the class priors are Bernoulli, the posterior
3295 | assumes a logistic form.
3296 | \end_layout
3297 |
3298 | \begin_layout Enumerate
3299 |
3300 | \bar under
3301 | False:
3302 | \bar default
3303 | MLE gives us not only a point estimate, but a distribution over the parameters
3304 | we are estimating.
3305 | \end_layout
3306 |
3307 | \begin_layout Enumerate
3308 |
3309 | \bar under
3310 | False:
3311 | \bar default
3312 | Penalized MLE and bayesian estimators for parameters are better used in
3313 | the setting of low-dimensional data with many training examples
3314 | \end_layout
3315 |
3316 | \begin_layout Enumerate
3317 |
3318 | \bar under
3319 | True:
3320 | \bar default
3321 | It is not good machine learning practice to use the test set to help adjust
3322 | the hyperparameters
3323 | \end_layout
3324 |
3325 | \begin_layout Enumerate
3326 |
3327 | \bar under
3328 | False:
3329 | \bar default
3330 | a symmetric positive semidefinite matrix always has nonnegative elements.
3331 |
3332 | \end_layout
3333 |
3334 | \begin_layout Enumerate
3335 |
3336 | \bar under
3337 | True:
3338 | \bar default
3339 | for a valid kernel function k, the corresponding feature mapping can map
3340 | a finite dimensional vector to an infinite dimensional vector
3341 | \end_layout
3342 |
3343 | \begin_layout Enumerate
3344 |
3345 | \bar under
3346 | False:
3347 | \bar default
3348 | the more features we use, the better our learning algorithm will generalize
3349 | to new data points.
3350 | \end_layout
3351 |
3352 | \begin_layout Enumerate
3353 |
3354 | \bar under
3355 | True:
3356 | \bar default
3357 | a discriminative classifier explicitly models
3358 | \begin_inset Formula $\mathrm{P}\left(Y\mid X\right)$
3359 | \end_inset
3360 |
3361 | .
3362 | \end_layout
3363 |
3364 | \begin_layout Standard
3365 | \begin_inset Phantom Phantom
3366 | status open
3367 |
3368 | \begin_layout Plain Layout
3369 |
3370 | \end_layout
3371 |
3372 | \end_inset
3373 |
3374 |
3375 | \end_layout
3376 |
3377 | \begin_layout Enumerate
3378 | You can use kernels with
3379 | \bar under
3380 | SVM
3381 | \bar default
3382 | and
3383 | \bar under
3384 | perceptron
3385 | \bar default
3386 | .
3387 | \end_layout
3388 |
3389 | \begin_layout Enumerate
3390 | Cross validation is used to select hyperparameters.
3391 | It prevents overfitting, but is not guaranteed to prevent it.
3392 | \end_layout
3393 |
3394 | \begin_layout Enumerate
3395 | L2 regularization is equivalent to imposing a Gaussian prior in linear regressio
3396 | n.
3397 | \end_layout
3398 |
3399 | \begin_layout Enumerate
3400 | If we have 2 two-dimensional Gaussians, the same covariance matrix for both
3401 | will result in a linear decision boundary.
3402 | \end_layout
3403 |
3404 | \begin_layout Enumerate
3405 | The normal equations can be derived from minimizing empirical risk, assuming
3406 | normally distributed noise, and assuming
3407 | \begin_inset Formula $\mathrm{P}(Y\mid X)$
3408 | \end_inset
3409 |
3410 | is distributed as
3411 | \begin_inset Formula $\mathcal{N}(\beta^{\intercal}x,\sigma^{2})$
3412 | \end_inset
3413 |
3414 | .
3415 | \end_layout
3416 |
3417 | \begin_layout Enumerate
3418 | Logistic regression can be motivated from
3419 | \bar under
3420 | log odds equated to an affine function of x
3421 | \bar default
3422 | and
3423 | \bar under
3424 | generative models with gaussian class conditionals
3425 | \bar default
3426 | .
3427 | \end_layout
3428 |
3429 | \begin_layout Enumerate
3430 | The perceptron algorithm will converge
3431 | \bar under
3432 | only if the data is linearly separable
3433 | \bar default
3434 | .
3435 | \end_layout
3436 |
3437 | \begin_layout Enumerate
3438 |
3439 | \bar under
3440 | True:
3441 | \bar default
3442 | Newton's method is typically more expensive to calculate than gradient
3443 | descent per iteration.
3444 | \bar under
3445 |
3446 | \begin_inset Newline newline
3447 | \end_inset
3448 |
3449 | True:
3450 | \bar default
3451 | for quadratic equations, Newton's method typically requires fewer iterations
3452 | than gradient descent.
3453 | \bar under
3454 |
3455 | \begin_inset Newline newline
3456 | \end_inset
3457 |
3458 | False:
3459 | \bar default
3460 | Gradient descent can be viewed as iteratively reweighted least squares.
3461 | \end_layout
3462 |
3463 | \begin_layout Enumerate
3464 |
3465 | \bar under
3466 | True:
3467 | \bar default
3468 | Complementary slackness implies that every training point that is misclassified
3469 | by a soft margin SVM is a support vector.
3470 | \bar under
3471 |
3472 | \begin_inset Newline newline
3473 | \end_inset
3474 |
3475 | True:
3476 | \bar default
3477 | When we solve the SVM with the dual problem, we need only the dot product
3478 | of
3479 | \begin_inset Formula $x_{i}$
3480 | \end_inset
3481 |
3482 | and
3483 | \begin_inset Formula $x_{j}$
3484 | \end_inset
3485 |
3486 | for all
3487 | \begin_inset Formula $i$
3488 | \end_inset
3489 |
3490 | ,
3491 | \begin_inset Formula $j$
3492 | \end_inset
3493 |
3494 | .
3495 | \bar under
3496 |
3497 | \begin_inset Newline newline
3498 | \end_inset
3499 |
3500 | True:
3501 | \bar default
3502 | we use Lagrange multipliers in an optimization problem with inequality
3503 | constraints.
3504 | \end_layout
3505 |
3506 | \begin_layout Enumerate
3507 | \begin_inset Formula $\left\Vert \Phi(x)-\Phi(y)\right\Vert _{2}^{2}$
3508 | \end_inset
3509 |
3510 | can be computed exclusively with inner products.
3511 | \begin_inset Newline newline
3512 | \end_inset
3513 |
3514 | But not
3515 | \begin_inset Formula $\left\Vert \Phi(x)-\Phi(y)\right\Vert _{1}$
3516 | \end_inset
3517 |
3518 | norm or
3519 | \begin_inset Formula $\Phi(x)-\Phi(y)$
3520 | \end_inset
3521 |
3522 | .
3523 | \end_layout
3524 |
3525 | \begin_layout Enumerate
3526 | Strong duality holds for
3527 | \bar under
3528 | hard and soft margin SVM
3529 | \bar default
3530 | , but
3531 | \bar under
3532 | not constrained optimization problems
3533 | \bar default
3534 | in general.
3535 | \end_layout
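     |
     | \begin_layout Standard
     | For reference (our addition): the normal equations give
     | \begin_inset Formula $\hat{\beta}=(X^{\intercal}X)^{-1}X^{\intercal}y$
     | \end_inset
     |
     | , and the Gaussian-prior (L2-regularized) version is
     | \begin_inset Formula $\hat{\beta}=(X^{\intercal}X+\lambda I)^{-1}X^{\intercal}y$
     | \end_inset
     |
     | .
     | \end_layout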
3536 |
3537 | \begin_layout Standard
3538 | \begin_inset VSpace vfill
3539 | \end_inset
3540 |
3541 |
3542 | \end_layout
3543 |
3544 | \begin_layout Standard
3545 | \begin_inset ERT
3546 | status open
3547 |
3548 | \begin_layout Plain Layout
3549 |
3550 |
3551 | \backslash
3552 | columnbreak
3553 | \end_layout
3554 |
3555 | \end_inset
3556 |
3557 |
3558 | \end_layout
3559 |
3560 | \begin_layout Section
3561 | Discussion Problems
3562 | \end_layout
3563 |
3564 | \begin_layout Subsection
3565 | Discussion 9 -- Entropy
3566 | \end_layout
3567 |
3568 | \begin_layout Standard
3569 | \begin_inset Graphics
3570 | filename graphics/disc09-entropy-1.pdf
3571 | width 97col%
3572 |
3573 | \end_inset
3574 |
3575 |
3576 | \end_layout
3577 |
3578 | \begin_layout Subsection
3579 | Discussion 11 -- Skip-Layer NN
3580 | \end_layout
3581 |
3582 | \begin_layout Standard
3583 | \begin_inset Graphics
3584 | filename graphics/disc10-skipnn-1.pdf
3585 | width 97col%
3586 |
3587 | \end_inset
3588 |
3589 |
3590 | \end_layout
3591 |
3592 | \begin_layout Standard
3593 | \begin_inset Graphics
3594 | filename graphics/disc10-skipnn-2.pdf
3595 | width 97col%
3596 |
3597 | \end_inset
3598 |
3599 |
3600 | \end_layout
3601 |
3602 | \begin_layout Subsection
3603 | Discussion 12 -- PCA
3604 | \end_layout
3605 |
3606 | \begin_layout Standard
3607 | \begin_inset Graphics
3608 | filename graphics/disc12-pca-1.pdf
3609 | width 97col%
3610 |
3611 | \end_inset
3612 |
3613 |
3614 | \end_layout
3615 |
3616 | \begin_layout Standard
3617 | \begin_inset Graphics
3618 | filename graphics/disc12-pca-2.pdf
3619 | width 97col%
3620 |
3621 | \end_inset
3622 |
3623 |
3624 | \end_layout
3625 |
3626 | \begin_layout Standard
3627 | \begin_inset Graphics
3628 | filename graphics/disc12-pca-3.pdf
3629 | width 97col%
3630 |
3631 | \end_inset
3632 |
3633 |
3634 | \end_layout
3635 |
3636 | \begin_layout Standard
3637 | \begin_inset Graphics
3638 | filename graphics/disc12-pca-4.pdf
3639 | width 97col%
3640 |
3641 | \end_inset
3642 |
3643 |
3644 | \end_layout
3645 |
3646 | \begin_layout Standard
3647 | \begin_inset ERT
3648 | status collapsed
3649 |
3650 | \begin_layout Plain Layout
3651 |
3652 |
3653 | \backslash
3654 | egroup
3655 | \end_layout
3656 |
3657 | \end_inset
3658 |
3659 |
3660 | \end_layout
3661 |
3662 | \begin_layout Standard
3663 | \begin_inset ERT
3664 | status open
3665 |
3666 | \begin_layout Plain Layout
3667 |
3668 |
3669 | \backslash
3670 | columnbreak
3671 | \end_layout
3672 |
3673 | \end_inset
3674 |
3675 |
3676 | \end_layout
3677 |
3678 | \begin_layout Section
3679 | Minicards
3680 | \end_layout
3681 |
3682 | \begin_layout Standard
3683 |
3684 | \series bold
3685 | \begin_inset Box Boxed
3686 | position "t"
3687 | hor_pos "c"
3688 | has_inner_box 1
3689 | inner_pos "t"
3690 | use_parbox 1
3691 | use_makebox 0
3692 | width "97col%"
3693 | special "none"
3694 | height "1in"
3695 | height_special "totalheight"
3696 | status open
3697 |
3698 | \begin_layout Plain Layout
3699 |
3700 | \series bold
3701 | Gaussian distribution
3702 | \series default
3703 | [7, 8]
3704 | \end_layout
3705 |
3706 | \begin_layout Plain Layout
3707 | \begin_inset Formula $1$
3708 | \end_inset
3709 |
3710 | -var (normal):
3711 | \begin_inset Formula $p(x)=\ensuremath{\frac{1}{\sigma\sqrt{2\pi}}\exp\left(-\frac{\left(x-\mu\right)^{2}}{2\sigma^{2}}\right)}$
3712 | \end_inset
3713 |
3714 |
3715 | \end_layout
3716 |
3717 | \begin_layout Plain Layout
3718 | Multivar:
3719 | \begin_inset Formula $p(x)=\frac{1}{\sqrt{\left|\Sigma\right|}\sqrt{2\pi}^{d}}\exp\left(-\frac{1}{2}\left(x-\mu\right)^{\intercal}\Sigma^{-1}\left(x-\mu\right)\right)$
3720 | \end_inset
3721 |
3722 |
3723 | \end_layout
3724 |
3725 | \end_inset
3726 |
3727 |
3728 | \end_layout
3729 |
3730 | \begin_layout Standard
3731 | \begin_inset CommandInset line
3732 | LatexCommand rule
3733 | offset "0.5ex"
3734 | width "100col%"
3735 | height "1pt"
3736 |
3737 | \end_inset
3738 |
3739 |
3740 | \end_layout
3741 |
3742 | \begin_layout Standard
3743 | The
3744 | \series bold
3745 | covariance
3746 | \series default
3747 |
3748 | \begin_inset Formula $\Sigma$
3749 | \end_inset
3750 |
3751 | of variables
3752 | \begin_inset Formula $X$
3753 | \end_inset
3754 |
3755 | is a matrix such that each entry
3756 | \begin_inset Formula $\Sigma_{ij}=\mathrm{Cov}(X_{i},X_{j})$
3757 | \end_inset
3758 |
3759 | .
3760 | This means that the diagonal entries
3761 | \begin_inset Formula $\Sigma_{ii}=\mathrm{Var}(X_{i})$
3762 | \end_inset
3763 |
3764 | .
3765 | If the matrix is diagonal, then the off-diagonal entries are zero, which
3766 | means the variables
3767 | \begin_inset Formula $X_{i}$
3768 | \end_inset
3769 |
3770 | are uncorrelated (and, for jointly Gaussian variables, independent).
3771 | \end_layout
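     |
     | \begin_layout Standard
     | For intuition (our addition), in two dimensions
     | \begin_inset Formula $\Sigma=\begin{bmatrix}\mathrm{Var}(X_{1}) & \mathrm{Cov}(X_{1},X_{2})\\ \mathrm{Cov}(X_{1},X_{2}) & \mathrm{Var}(X_{2})\end{bmatrix}$
     | \end_inset
     |
     | .
     | \end_layout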
3772 |
3773 | \begin_layout Standard
3774 | It's nice to have independent variables, so we try to diagonalize non-diagonal
3775 | covariances.
3776 | \end_layout
3777 |
3778 | \begin_layout Standard
3779 |
3780 | \series bold
3781 | \begin_inset Box Boxed
3782 | position "t"
3783 | hor_pos "c"
3784 | has_inner_box 1
3785 | inner_pos "t"
3786 | use_parbox 1
3787 | use_makebox 0
3788 | width "97col%"
3789 | special "none"
3790 | height "1in"
3791 | height_special "totalheight"
3792 | status open
3793 |
3794 | \begin_layout Plain Layout
3795 |
3796 | \series bold
3797 | Spectral Theorem
3798 | \series default
3799 | [7:23]
3800 | \end_layout
3801 |
3802 | \begin_layout Enumerate
3803 | Take definition of eigenvalue/vector:
3804 | \begin_inset Formula $Ax=\lambda x$
3805 | \end_inset
3806 |
3807 |
3808 | \end_layout
3809 |
3810 | \begin_layout Enumerate
3811 | Pack multiple eigenvalues into
3812 | \begin_inset Formula $\Lambda=\mathrm{diag}\left(\lambda_{1},\lambda_{2},\ldots,\lambda_{n}\right)$
3813 | \end_inset
3814 |
3815 |
3816 | \begin_inset Newline newline
3817 | \end_inset
3818 |
3819 |
3820 | \begin_inset Formula $n$
3821 | \end_inset
3822 |
3823 | real eigenvalues and orthonormal eigenvectors are guaranteed when
3824 | \begin_inset Formula $A$
3825 | \end_inset
3826 |
3827 | is symmetric.
3828 | \end_layout
3829 |
3830 | \begin_layout Enumerate
3831 | Pack multiple eigenvectors into
3832 | \begin_inset Formula $U=\left[x_{1}\ x_{2}\ \ldots\ x_{n}\right]$
3833 | \end_inset
3834 |
3835 |
3836 | \end_layout
3837 |
3838 | \begin_layout Enumerate
3839 | Rewrite equation using these:
3840 | \begin_inset Formula $\boxed{AU=U\Lambda\longrightarrow A=U\Lambda U^{\intercal}}$
3841 | \end_inset
3842 |
3843 | .
3844 | \begin_inset Newline newline
3845 | \end_inset
3846 |
3847 | We can use this to diagonalize a symmetric
3848 | \begin_inset Formula $A$
3849 | \end_inset
3850 |
3851 | .
3852 | \end_layout
3853 |
3854 | \end_inset
3855 |
3856 |
3857 | \end_layout
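     |
     | \begin_layout Standard
     | A tiny worked instance (our addition):
     | \begin_inset Formula $\begin{bmatrix}2 & 1\\ 1 & 2\end{bmatrix}=U\Lambda U^{\intercal}$
     | \end_inset
     |
     | with
     | \begin_inset Formula $\Lambda=\mathrm{diag}(3,1)$
     | \end_inset
     |
     | and
     | \begin_inset Formula $U=\frac{1}{\sqrt{2}}\begin{bmatrix}1 & 1\\ 1 & -1\end{bmatrix}$
     | \end_inset
     |
     | .
     | \end_layout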
3858 |
3859 | \begin_layout Standard
3860 | \begin_inset CommandInset line
3861 | LatexCommand rule
3862 | offset "0.5ex"
3863 | width "100col%"
3864 | height "1pt"
3865 |
3866 | \end_inset
3867 |
3868 |
3869 | \end_layout
3870 |
3871 | \begin_layout Standard
3872 |
3873 | \series bold
3874 | SVM-like classifiers
3875 | \series default
3876 | work with a
3877 | \bar under
3878 | boundary
3879 | \bar default
3880 | , a hyperplane (a line for 2D data) that separates two classes.
3881 |
3882 | \bar under
3883 | Support vectors
3884 | \bar default
3885 | are the point(s) closest to the boundary.
3886 |
3887 | \begin_inset Formula $\gamma$
3888 | \end_inset
3889 |
3890 | is the
3891 | \bar under
3892 | margin
3893 | \bar default
3894 | , the distance between the boundary and the support vector(s).
3895 | The
3896 | \bar under
3897 | parameter
3898 | \begin_inset Formula $\theta$
3899 | \end_inset
3900 |
3901 |
3902 | \bar default
3903 | is a vector.
3904 |
3905 | \begin_inset Formula $\boxed{\theta\cdot x}$
3906 | \end_inset
3907 |
3908 | gives predictions.
3909 | About
3910 | \begin_inset Formula $\theta$
3911 | \end_inset
3912 |
3913 | :
3914 | \end_layout
3915 |
3916 | \begin_layout Itemize
3917 | The direction of
3918 | \begin_inset Formula $\theta$
3919 | \end_inset
3920 |
3921 | defines the boundary.
3922 | We can choose this.
3923 | \end_layout
3924 |
3925 | \begin_layout Itemize
3926 | \begin_inset Formula $\left\Vert \theta\right\Vert $
3927 | \end_inset
3928 |
3929 | must be
3930 | \begin_inset Formula $1/\gamma$
3931 | \end_inset
3932 |
3933 | , as restricted by
3934 | \begin_inset Formula $\forall i:y^{i}\theta\cdot x^{i}\geq1$
3935 | \end_inset
3936 |
3937 |
3938 | \begin_inset Newline newline
3939 | \end_inset
3940 |
3941 | We cannot explicitly choose this; it depends on the boundary.
3942 | \begin_inset Newline newline
3943 | \end_inset
3944 |
3945 | This restriction is turned into a cost in soft-margin SVM.
3946 | \end_layout
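     |
     | \begin_layout Standard
     | Why the norm restriction above holds (our addition): the distance from a point
     | \begin_inset Formula $x^{i}$
     | \end_inset
     |
     | to the boundary is
     | \begin_inset Formula $\frac{|\theta\cdot x^{i}|}{\left\Vert \theta\right\Vert }$
     | \end_inset
     |
     | , and support vectors satisfy
     | \begin_inset Formula $y^{i}\theta\cdot x^{i}=1$
     | \end_inset
     |
     | , so the margin is
     | \begin_inset Formula $\gamma=\frac{1}{\left\Vert \theta\right\Vert }$
     | \end_inset
     |
     | .
     | \end_layout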
3947 |
3948 | \begin_layout Standard
3949 |
3950 | \series bold
3951 | \begin_inset Box Boxed
3952 | position "t"
3953 | hor_pos "c"
3954 | has_inner_box 1
3955 | inner_pos "t"
3956 | use_parbox 1
3957 | use_makebox 0
3958 | width "97col%"
3959 | special "none"
3960 | height "1in"
3961 | height_special "totalheight"
3962 | status open
3963 |
3964 | \begin_layout Plain Layout
3965 |
3966 | \series bold
3967 | Perceptron
3968 | \series default
3969 | [2:11, 3:6] picks a misclassified point and updates
3970 | \begin_inset Formula $\theta$
3971 | \end_inset
3972 |
3973 | toward classifying it correctly:
3974 | \begin_inset Newline newline
3975 | \end_inset
3976 |
3977 |
3978 | \begin_inset Formula $\boxed{\theta\leftarrow\theta+y^{i}x^{i}}$
3979 | \end_inset
3980 |
3981 | or
3982 | \begin_inset Formula $\boxed{\theta\leftarrow\theta-\nabla J\left(\theta\right)}$
3983 | \end_inset
3984 |
3985 |
3986 | \end_layout
3987 |
3988 | \begin_layout Plain Layout
3989 |
3990 | \bar under
3991 | Overfits
3992 | \bar default
3993 | when outliers skew the boundary.
3994 |
3995 | \bar under
3996 | Converges
3997 | \bar default
3998 | iff separable.
3999 | \end_layout
4000 |
4001 | \begin_layout Plain Layout
4002 |
4003 | \bar under
4004 | Batch eqn
4005 | \bar default
4006 |
4007 | \begin_inset Formula $\theta\cdot x=\sum_{i}\alpha^{i}y^{i}x^{i}\cdot x$
4008 | \end_inset
4009 |
4010 | :
4011 | \begin_inset Newline newline
4012 | \end_inset
4013 |
4014 |
4015 | \begin_inset Formula $\alpha_{i}=\text{\# times point \emph{i} was misclassified}$
4016 | \end_inset
4017 |
4018 |
4019 | \end_layout
4020 |
4021 | \end_inset
4022 |
4023 |
4024 | \end_layout
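     |
     | \begin_layout Standard
     | A tiny numeric example of the update (our addition): with
     | \begin_inset Formula $\theta=(0,0)$
     | \end_inset
     |
     | and a misclassified point
     | \begin_inset Formula $x^{i}=(1,2),\,y^{i}=+1$
     | \end_inset
     |
     | , the update gives
     | \begin_inset Formula $\theta=(1,2)$
     | \end_inset
     |
     | , and now
     | \begin_inset Formula $\theta\cdot x^{i}=5>0$
     | \end_inset
     |
     | .
     | \end_layout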
4025 |
4026 | \begin_layout Standard
4027 |
4028 | \series bold
4029 | \begin_inset Box Boxed
4030 | position "t"
4031 | hor_pos "c"
4032 | has_inner_box 1
4033 | inner_pos "t"
4034 | use_parbox 1
4035 | use_makebox 0
4036 | width "97col%"
4037 | special "none"
4038 | height "1in"
4039 | height_special "totalheight"
4040 | status open
4041 |
4042 | \begin_layout Plain Layout
4043 |
4044 | \series bold
4045 | Hard-margin SVM
4046 | \series default
4047 | [3:36] maximizes the margin around the boundary.
4048 | Technically, it minimizes the weight norm, which maximizes the distance between the boundary and the vectors
4049 | closest to it (the support vectors):
4050 | \begin_inset Newline newline
4051 | \end_inset
4052 |
4053 |
4054 | \begin_inset Formula $\boxed{\min_{\theta}\left\Vert \theta\right\Vert ^{2}\quad\text{such that}\ \forall i:y^{i}\theta\cdot x^{i}\geq1}$
4055 | \end_inset
4056 |
4057 |
4058 | \end_layout
4059 |
4060 | \begin_layout Plain Layout
4061 | Sometimes removing a few outliers lets us find a much larger margin, or a
4062 | margin at all.
4063 | Hard-margin
4064 | \bar under
4065 | overfits
4066 | \bar default
4067 | by not seeing this.
4068 | \end_layout
4069 |
4070 | \begin_layout Plain Layout
4071 |
4072 | \bar under
4073 | Converges
4074 | \bar default
4075 | iff separable.
4076 | \end_layout
4077 |
4078 | \begin_layout Plain Layout
4079 |
4080 | \bar under
4081 | Batch eqn
4082 | \bar default
4083 |
4084 | \begin_inset Formula $\theta=\sum_{i}\alpha^{i}y^{i}x^{i}$
4085 | \end_inset
4086 |
4087 | , where
4088 | \begin_inset Formula $\alpha^{i}=\mathbf{1}_{i\ \text{is support vector}}$
4089 | \end_inset
4090 |
4091 |
4092 | \end_layout
4093 |
4094 | \end_inset
4095 |
4096 |
4097 | \end_layout
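     |
     | \begin_layout Standard
     | For reference (our addition): the dual of the usual hard-margin SVM with a bias term is
     | \begin_inset Formula $\max_{\alpha\geq0}\ \sum_{i}\alpha^{i}-\frac{1}{2}\sum_{i,j}\alpha^{i}\alpha^{j}y^{i}y^{j}x^{i}\cdot x^{j}\quad\text{s.t. }\sum_{i}\alpha^{i}y^{i}=0$
     | \end_inset
     |
     | .
     | The constraint
     | \begin_inset Formula $\sum_{i}\alpha^{i}y^{i}=0$
     | \end_inset
     |
     | is why the multipliers of positive and negative examples sum to the same value.
     | \end_layout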
4098 |
4099 | \begin_layout Standard
4100 |
4101 | \series bold
4102 | \begin_inset Box Boxed
4103 | position "t"
4104 | hor_pos "c"
4105 | has_inner_box 1
4106 | inner_pos "t"
4107 | use_parbox 1
4108 | use_makebox 0
4109 | width "97col%"
4110 | special "none"
4111 | height "1in"
4112 | height_special "totalheight"
4113 | status open
4114 |
4115 | \begin_layout Plain Layout
4116 |
4117 | \series bold
4118 | Soft-margin SVM
4119 | \series default
4120 | [3:37] is like hard-margin SVM but replaces the hard constraints with a hinge penalty on margin violations:
4121 | \begin_inset Newline newline
4122 | \end_inset
4123 |
4124 |
4125 | \begin_inset Formula $\boxed{\min_{\theta}\left\Vert \theta\right\Vert ^{2}+C\sum_{i=1}^{n}\left(1-y^{i}\theta\cdot x^{i}\right)_{+}}$
4126 | \end_inset
4127 |
4128 |
4129 | \end_layout
4130 |
4131 | \begin_layout Plain Layout
4132 |
4133 | \bar under
4134 | Hyperparameter
4135 | \bar default
4136 |
4137 | \begin_inset Formula $C$
4138 | \end_inset
4139 |
4140 | is the hardness of the margin.
4141 | Lower
4142 | \begin_inset Formula $C$
4143 | \end_inset
4144 |
4145 | means more misclassifications but larger soft margin.
4146 | \end_layout
4147 |
4148 | \begin_layout Plain Layout
4149 |
4150 | \bar under
4151 | Overfits
4152 | \bar default
4153 | on less data, more features, higher
4154 | \begin_inset Formula $C$
4155 | \end_inset
4156 |
4157 |
4158 | \end_layout
4159 |
4160 | \end_inset
4161 |
4162 |
4163 | \end_layout
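     |
     | \begin_layout Standard
     | Equivalent slack form (our addition):
     | \begin_inset Formula $\min_{\theta,\xi}\left\Vert \theta\right\Vert ^{2}+C\sum_{i}\xi_{i}\ \text{s.t. }y^{i}\theta\cdot x^{i}\geq1-\xi_{i},\ \xi_{i}\geq0$
     | \end_inset
     |
     | ; at the optimum
     | \begin_inset Formula $\xi_{i}=\left(1-y^{i}\theta\cdot x^{i}\right)_{+}$
     | \end_inset
     |
     | , recovering the hinge form above.
     | \end_layout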
4164 |
4165 | \begin_layout Standard
4166 | \begin_inset CommandInset line
4167 | LatexCommand rule
4168 | offset "0.5ex"
4169 | width "100col%"
4170 | height "1pt"
4171 |
4172 | \end_inset
4173 |
4174 |
4175 | \end_layout
4176 |
4177 | \begin_layout Standard
4178 |
4179 | \series bold
4180 | More classifiers
4181 | \end_layout
4182 |
4183 | \begin_layout Standard
4184 |
4185 | \series bold
4186 | \begin_inset Box Boxed
4187 | position "t"
4188 | hor_pos "c"
4189 | has_inner_box 1
4190 | inner_pos "t"
4191 | use_parbox 1
4192 | use_makebox 0
4193 | width "97col%"
4194 | special "none"
4195 | height "1in"
4196 | height_special "totalheight"
4197 | status open
4198 |
4199 | \begin_layout Plain Layout
4200 |
4201 | \series bold
4202 | KNN
4203 | \series default
4204 | [14:4] Given an item
4205 | \begin_inset Formula $x$
4206 | \end_inset
4207 |
4208 | , find the
4209 | \begin_inset Formula $k$
4210 | \end_inset
4211 |
4212 | training items
4213 | \begin_inset Quotes eld
4214 | \end_inset
4215 |
4216 | closest
4217 | \begin_inset Quotes erd
4218 | \end_inset
4219 |
4220 | to
4221 | \begin_inset Formula $x$
4222 | \end_inset
4223 |
4224 | and return the result of a vote.
4225 | \end_layout
4226 |
4227 | \begin_layout Plain Layout
4228 |
4229 | \bar under
4230 | Hyperparameter
4231 | \bar default
4232 |
4233 | \begin_inset Formula $k$
4234 | \end_inset
4235 |
4236 | , the number of neighbors.
4237 | \begin_inset Newline newline
4238 | \end_inset
4239 |
4240 |
4241 | \begin_inset Quotes eld
4242 | \end_inset
4243 |
4244 | Closest
4245 | \begin_inset Quotes erd
4246 | \end_inset
4247 |
4248 | can be defined by some norm (
4249 | \begin_inset Formula $l_{2}$
4250 | \end_inset
4251 |
4252 | by default).
4253 | \end_layout
4254 |
4255 | \begin_layout Plain Layout
4256 |
4257 | \bar under
4258 | Overfits
4259 | \bar default
4260 | when
4261 | \begin_inset Formula $k$
4262 | \end_inset
4263 |
4264 | is really small
4265 | \end_layout
4266 |
4267 | \end_inset
4268 |
4269 |
4270 | \end_layout
4271 |
4272 | \begin_layout Standard
4273 |
4274 | \series bold
4275 | \begin_inset Box Boxed
4276 | position "t"
4277 | hor_pos "c"
4278 | has_inner_box 1
4279 | inner_pos "t"
4280 | use_parbox 1
4281 | use_makebox 0
4282 | width "97col%"
4283 | special "none"
4284 | height "1in"
4285 | height_special "totalheight"
4286 | status open
4287 |
4288 | \begin_layout Plain Layout
4289 |
4290 | \series bold
4291 | Decision trees
4292 | \series default
4293 | : Recursively split on features that yield the best split.
4294 | Each tree has many nodes, which either split on a feature at a threshold,
4295 | or (at a leaf) label all data that reaches them the same way.
4296 | \begin_inset Newline newline
4297 | \end_inset
4298 |
4299 |
4300 | \bar under
4301 | Hyperparameters
4302 | \bar default
4303 | typically restrict complexity (max tree depth, min points at node) or penalize
4304 | it.
4305 | One of particular interest is
4306 | \begin_inset Formula $d$
4307 | \end_inset
4308 |
4309 | , the max number of nodes.
4310 | \end_layout
4311 |
4312 | \begin_layout Plain Layout
4313 |
4314 | \bar under
4315 | Overfits
4316 | \bar default
4317 | when the tree is deep or when we are allowed to split on a very small number
4318 | of items.
4319 | \end_layout
4320 |
4321 | \begin_layout Plain Layout
4322 |
4323 | \series bold
4324 | Bagging
4325 | \series default
4326 | : Make multiple trees, each with a random subset of training items.
4327 | To predict, take vote from trees.
4328 | \end_layout
4329 |
4330 | \begin_layout Plain Layout
4331 |
4332 | \bar under
4333 | Hyperparameters
4334 | \bar default
4335 | # trees, proportion of items to subset.
4336 | \end_layout
4337 |
4338 | \begin_layout Plain Layout
4339 |
4340 | \series bold
4341 | Random forests
4342 | \series default
4343 | is bagging, except that, at each node, we consider only a random subset of features
4344 | to split on.
4345 | \end_layout
4346 |
4347 | \begin_layout Plain Layout
4348 |
4349 | \bar under
4350 | Hyperparameters
4351 | \bar default
4352 | proportion of features to consider.
4353 | \end_layout
4354 |
4355 | \end_inset
4356 |
4357 |
4358 | \end_layout
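     |
     | \begin_layout Standard
     | One common best-split criterion (our addition): information gain
     | \begin_inset Formula $H(S)-\sum_{j}\frac{|S_{j}|}{|S|}H(S_{j})$
     | \end_inset
     |
     | , where entropy
     | \begin_inset Formula $H(S)=-\sum_{c}p_{c}\log_{2}p_{c}$
     | \end_inset
     |
     | .
     | \end_layout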
4359 |
4360 | \begin_layout Standard
4361 |
4362 | \series bold
4363 | \begin_inset Box Boxed
4364 | position "t"
4365 | hor_pos "c"
4366 | has_inner_box 1
4367 | inner_pos "t"
4368 | use_parbox 1
4369 | use_makebox 0
4370 | width "97col%"
4371 | special "none"
4372 | height "1in"
4373 | height_special "totalheight"
4374 | status open
4375 |
4376 | \begin_layout Plain Layout
4377 |
4378 | \series bold
4379 | AdaBoost
4380 | \series default
4381 | [dtrees3:34] Use any algorithm (e.g., decision trees) to train a weak learner,
4382 | take all the errors, and train a new learner with the errors emphasized*.
4383 | To predict, predict with the first algorithm, then add on the prediction
4384 | of the second algorithm, and so on.
4385 | \end_layout
4386 |
4387 | \begin_layout Plain Layout
4388 | \noindent
4389 | * For regression, train the new learner on the errors.
4390 | For classification, give misclassified items more weight.
4391 | \end_layout
4392 |
4393 | \begin_layout Plain Layout
4394 |
4395 | \bar under
4396 | Hyperparameters
4397 | \bar default
4398 |
4399 | \begin_inset Formula $B$
4400 | \end_inset
4401 |
4402 | , the number of weak learners;
4403 | \begin_inset Formula $\lambda$
4404 | \end_inset
4405 |
4406 | , the learning rate.
4407 | \end_layout
4408 |
4409 | \end_inset
4410 |
4411 |
4412 | \end_layout
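     |
     | \begin_layout Standard
     | Standard AdaBoost updates for reference (our addition; the course's exact variant may differ): weighted error
     | \begin_inset Formula $\epsilon_{t}=\sum_{i}w_{i}\mathbf{1}\left[h_{t}(x_{i})\neq y_{i}\right]/\sum_{i}w_{i}$
     | \end_inset
     |
     | , learner weight
     | \begin_inset Formula $\alpha_{t}=\frac{1}{2}\ln\frac{1-\epsilon_{t}}{\epsilon_{t}}$
     | \end_inset
     |
     | , reweighting
     | \begin_inset Formula $w_{i}\leftarrow w_{i}e^{-\alpha_{t}y_{i}h_{t}(x_{i})}$
     | \end_inset
     |
     | , prediction
     | \begin_inset Formula $\mathrm{sign}\left(\sum_{t}\alpha_{t}h_{t}(x)\right)$
     | \end_inset
     |
     | .
     | \end_layout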
4413 |
4414 | \begin_layout Standard
4415 | \begin_inset VSpace vfill
4416 | \end_inset
4417 |
4418 |
4419 | \end_layout
4420 |
4421 | \begin_layout Standard
4422 | \begin_inset ERT
4423 | status open
4424 |
4425 | \begin_layout Plain Layout
4426 |
4427 |
4428 | \backslash
4429 | end{multicols}
4430 | \end_layout
4431 |
4432 | \end_inset
4433 |
4434 |
4435 | \end_layout
4436 |
4437 | \end_body
4438 | \end_document
4439 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | CS 189 Machine Learning Cheat Sheet
2 | ===================================
3 |
4 | Cheat sheets:
5 |
6 | - [189-cheat-sheet-minicards.pdf](<189-cheat-sheet-minicards.pdf>)
7 | - [189-cheat-sheet-nominicards.pdf](<189-cheat-sheet-nominicards.pdf>)
8 |
9 | These cheat sheets include:
10 |
11 | - [The original notes]() by Rishi
12 | Sharma and Peter Gao (from which this repo is forked), with some modifications:
13 | - Rearranged sections to form better grouping, add section titles
14 | - Reworded/condensed some sections in light of better grouping
15 | - Added some new content
16 | - **All** past T/F and multiple choice questions from the following semesters:
17 | - Spring 2013 midterm & final
18 | - Spring 2014 final
19 | - Spring 2015 midterm
20 | - Important algorithmic problems from discussions
21 | - Additional notes ("minicards")
22 | - The `no-minicards` version omits these, so you can have space to add your own notes.
23 |
--------------------------------------------------------------------------------
/graphics/NN.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/graphics/NN.pdf
--------------------------------------------------------------------------------
/graphics/NN1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/graphics/NN1.pdf
--------------------------------------------------------------------------------
/graphics/NN2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/graphics/NN2.pdf
--------------------------------------------------------------------------------
/graphics/disc09-entropy-1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/graphics/disc09-entropy-1.pdf
--------------------------------------------------------------------------------
/graphics/disc10-skipnn-1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/graphics/disc10-skipnn-1.pdf
--------------------------------------------------------------------------------
/graphics/disc10-skipnn-2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/graphics/disc10-skipnn-2.pdf
--------------------------------------------------------------------------------
/graphics/disc12-pca-1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/graphics/disc12-pca-1.pdf
--------------------------------------------------------------------------------
/graphics/disc12-pca-2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/graphics/disc12-pca-2.pdf
--------------------------------------------------------------------------------
/graphics/disc12-pca-3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/graphics/disc12-pca-3.pdf
--------------------------------------------------------------------------------
/graphics/disc12-pca-4.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/szhu/cs189-cheatsheet/c2d0c497727cd4bfb289a0ed50d2ffb04cd31d4c/graphics/disc12-pca-4.pdf
--------------------------------------------------------------------------------