├── .gitignore ├── README.md ├── adv-topic-in-ml ├── adv-topic.pdf ├── adv-topic.tex └── notes │ ├── Opitimization.tex │ ├── RKHS.tex │ ├── RKHSAppendix.tex │ ├── statLearning.tex │ └── statLearningAppendix.tex ├── approx-infer ├── approx-infer.pdf ├── approx-infer.tex ├── contents │ ├── part1.tex │ ├── part2.tex │ ├── part3.tex │ ├── part4.tex │ ├── part5.tex │ ├── part6.tex │ └── part7.tex └── img │ ├── img1.png │ ├── img10.png │ ├── img11.png │ ├── img12.png │ ├── img13.png │ ├── img14.png │ ├── img15.png │ ├── img16.png │ ├── img2.png │ ├── img3.png │ ├── img4.png │ ├── img5.png │ ├── img6.png │ ├── img7.png │ ├── img8.png │ └── img9.png ├── prob-unsup ├── chapter │ ├── part1.tex │ ├── part2.tex │ ├── part3.tex │ ├── part4.tex │ └── part5.tex ├── prob-unsup.pdf ├── prob-unsup.tex └── test │ └── test.py ├── rice-foundation-fortnight ├── contents │ ├── part1.tex │ ├── part2.tex │ ├── part3.tex │ ├── part4.tex │ ├── part5.tex │ ├── part6.tex │ ├── part7.tex │ └── part8.tex ├── img │ ├── img1.png │ ├── img2.png │ ├── img3.png │ └── img4.png ├── rice-foundation-fortnight.pdf └── rice-foundation-fortnight.tex ├── stat-analysis ├── contents │ └── part1.tex ├── stat-analysis.pdf └── stat-analysis.tex └── supervised-learning ├── contents ├── part1.tex ├── part2.tex ├── part3.tex ├── part4.tex ├── part5.tex ├── part6.tex └── part7.tex ├── supervised-learning.pdf └── supervised-learning.tex /.gitignore: -------------------------------------------------------------------------------- 1 | ## Core latex/pdflatex auxiliary files: 2 | *.aux 3 | *.lof 4 | *.log 5 | *.lot 6 | *.fls 7 | *.out 8 | *.toc 9 | *.fmt 10 | *.fot 11 | *.cb 12 | *.cb2 13 | .*.lb 14 | 15 | ## Intermediate documents: 16 | *.dvi 17 | *.xdv 18 | *-converted-to.* 19 | # these rules might exclude image files for figures etc. 20 | # *.ps 21 | # *.eps 22 | # *.pdf 23 | 24 | ## Generated if empty string is given at "Please type another file name for output:" 25 | .pdf 26 | 27 | ## Bibliography auxiliary files (bibtex/biblatex/biber): 28 | *.bbl 29 | *.bcf 30 | *.blg 31 | *-blx.aux 32 | *-blx.bib 33 | *.run.xml 34 | 35 | ## Build tool auxiliary files: 36 | *.fdb_latexmk 37 | *.synctex 38 | *.synctex(busy) 39 | *.synctex.gz 40 | *.synctex.gz(busy) 41 | *.pdfsync 42 | 43 | ## Build tool directories for auxiliary files 44 | # latexrun 45 | latex.out/ 46 | 47 | ## Auxiliary and intermediate files from other packages: 48 | # algorithms 49 | *.alg 50 | *.loa 51 | 52 | # achemso 53 | acs-*.bib 54 | 55 | # amsthm 56 | *.thm 57 | 58 | # beamer 59 | *.nav 60 | *.pre 61 | *.snm 62 | *.vrb 63 | 64 | # changes 65 | *.soc 66 | 67 | # comment 68 | *.cut 69 | 70 | # cprotect 71 | *.cpt 72 | 73 | # elsarticle (documentclass of Elsevier journals) 74 | *.spl 75 | 76 | # endnotes 77 | *.ent 78 | 79 | # fixme 80 | *.lox 81 | 82 | # feynmf/feynmp 83 | *.mf 84 | *.mp 85 | *.t[1-9] 86 | *.t[1-9][0-9] 87 | *.tfm 88 | 89 | #(r)(e)ledmac/(r)(e)ledpar 90 | *.end 91 | *.?end 92 | *.[1-9] 93 | *.[1-9][0-9] 94 | *.[1-9][0-9][0-9] 95 | *.[1-9]R 96 | *.[1-9][0-9]R 97 | *.[1-9][0-9][0-9]R 98 | *.eledsec[1-9] 99 | *.eledsec[1-9]R 100 | *.eledsec[1-9][0-9] 101 | *.eledsec[1-9][0-9]R 102 | *.eledsec[1-9][0-9][0-9] 103 | *.eledsec[1-9][0-9][0-9]R 104 | 105 | # glossaries 106 | *.acn 107 | *.acr 108 | *.glg 109 | *.glo 110 | *.gls 111 | *.glsdefs 112 | *.lzo 113 | *.lzs 114 | 115 | # uncomment this for glossaries-extra (will ignore makeindex's style files!) 
116 | # *.ist 117 | 118 | # gnuplottex 119 | *-gnuplottex-* 120 | 121 | # gregoriotex 122 | *.gaux 123 | *.gtex 124 | 125 | # htlatex 126 | *.4ct 127 | *.4tc 128 | *.idv 129 | *.lg 130 | *.trc 131 | *.xref 132 | 133 | # hyperref 134 | *.brf 135 | 136 | # knitr 137 | *-concordance.tex 138 | # TODO Comment the next line if you want to keep your tikz graphics files 139 | *.tikz 140 | *-tikzDictionary 141 | 142 | # listings 143 | *.lol 144 | 145 | # luatexja-ruby 146 | *.ltjruby 147 | 148 | # makeidx 149 | *.idx 150 | *.ilg 151 | *.ind 152 | 153 | # minitoc 154 | *.maf 155 | *.mlf 156 | *.mlt 157 | *.mtc[0-9]* 158 | *.slf[0-9]* 159 | *.slt[0-9]* 160 | *.stc[0-9]* 161 | 162 | # minted 163 | _minted* 164 | *.pyg 165 | 166 | # morewrites 167 | *.mw 168 | 169 | # nomencl 170 | *.nlg 171 | *.nlo 172 | *.nls 173 | 174 | # pax 175 | *.pax 176 | 177 | # pdfpcnotes 178 | *.pdfpc 179 | 180 | # sagetex 181 | *.sagetex.sage 182 | *.sagetex.py 183 | *.sagetex.scmd 184 | 185 | # scrwfile 186 | *.wrt 187 | 188 | # sympy 189 | *.sout 190 | *.sympy 191 | sympy-plots-for-*.tex/ 192 | 193 | # pdfcomment 194 | *.upa 195 | *.upb 196 | 197 | # pythontex 198 | *.pytxcode 199 | pythontex-files-*/ 200 | 201 | # tcolorbox 202 | *.listing 203 | 204 | # thmtools 205 | *.loe 206 | 207 | # TikZ & PGF 208 | *.dpth 209 | *.md5 210 | *.auxlock 211 | 212 | # todonotes 213 | *.tdo 214 | 215 | # vhistory 216 | *.hst 217 | *.ver 218 | 219 | # easy-todo 220 | *.lod 221 | 222 | # xcolor 223 | *.xcp 224 | 225 | # xmpincl 226 | *.xmpi 227 | 228 | # xindy 229 | *.xdy 230 | 231 | # xypic precompiled matrices and outlines 232 | *.xyc 233 | *.xyd 234 | 235 | # endfloat 236 | *.ttt 237 | *.fff 238 | 239 | # Latexian 240 | TSWLatexianTemp* 241 | 242 | ## Editors: 243 | # WinEdt 244 | *.bak 245 | *.sav 246 | 247 | # Texpad 248 | .texpadtmp 249 | 250 | # LyX 251 | *.lyx~ 252 | 253 | # Kile 254 | *.backup 255 | 256 | # gummi 257 | .*.swp 258 | 259 | # KBibTeX 260 | *~[0-9]* 261 | 262 | # TeXnicCenter 263 | *.tps 264 | 265 | # auto folder when using emacs and auctex 266 | ./auto/* 267 | *.el 268 | 269 | # expex forward references with \gathertags 270 | *-tags.tex 271 | 272 | # standalone packages 273 | *.sta 274 | 275 | # Makeindex log files 276 | *.lpz 277 | .DS_Store 278 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # UCL-CSML-Notes 2 | My lecture notes from my time at UCL. Each folder contains the notes for one subject. Most of them are transcriptions (and hopefully more concise versions) of the source material, with some extra details (such as additional proofs not presented in the lectures): 3 | 4 | - COMP0083: Advanced Topics in Machine Learning (`adv-topic-in-ml`) 5 | - COMP0085: Approximate Inference and Learning in Probabilistic Models (`approx-infer`) 6 | - COMP0086: Probabilistic and Unsupervised Learning (`prob-unsup`) 7 | - STAT0027: Foundation Fortnight (`rice-foundation-fortnight`) 8 | - STAT0028: Statistical Models and Data Analysis (`stat-analysis`) 9 | - COMP0078: Supervised Learning (`supervised-learning`) 10 | 11 | In COMP0083, I also added some notes from [EE364a: Convex Optimization I](https://web.stanford.edu/class/ee364a/) to refresh some of my memory of convex optimization. STAT0027, on the other hand, is wholly based on John A. Rice's [Mathematical Statistics and Data Analysis](https://www.amazon.co.uk/Mathematical-Statistics-Data-Analysis-John/dp/0495110892).
12 | 13 | Currently, these notes are a **draft**, i.e. poorly worded, directly copied, with bad grammar, etc. There are some more courses that I took, such as *COMP0089: Reinforcement Learning*, *COMP0120: Numerical Optimisation* and *COMP0168: Machine Learning Seminar*, for which I haven't had time to edit my notes into a more coherent form. Stay tuned! 14 | 15 | If there are any mistakes in my notes, feel free to open an issue and/or create a pull request. 16 | -------------------------------------------------------------------------------- /adv-topic-in-ml/adv-topic.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Phutoast/UCL-CSML-Notes/292a123e2dfd17d3ca1a0e268d199cae9632a5b9/adv-topic-in-ml/adv-topic.pdf -------------------------------------------------------------------------------- /adv-topic-in-ml/adv-topic.tex: -------------------------------------------------------------------------------- 1 | \documentclass{report} 2 | \usepackage[utf8]{inputenc} 3 | \usepackage{amsmath} 4 | \usepackage{amsthm} 5 | \usepackage{amsfonts} 6 | \usepackage[colorlinks]{hyperref} 7 | \usepackage{natbib} 8 | \usepackage{graphicx} 9 | \usepackage{algorithm} 10 | \usepackage{algpseudocode} 11 | \usepackage{booktabs} 12 | \usepackage{caption} 13 | \usepackage{cancel} 14 | \usepackage{hyperref} 15 | 16 | \usepackage{tikz} 17 | \usetikzlibrary{bayesnet} 18 | \usetikzlibrary{arrows} 19 | \usetikzlibrary{calc} 20 | \usetikzlibrary{shadows} 21 | \usetikzlibrary{positioning} 22 | 23 | \newtheorem{theorem}{Theorem}[section] 24 | \newtheorem{corollary}{Corollary}[section] 25 | \newtheorem{proposition}{Proposition}[section] 26 | \newtheorem{lemma}{Lemma}[section] 27 | \newtheorem{claim}{Claim}[section] 28 | \newtheorem{conjecture}{Conjecture}[section] 29 | \newtheorem{example}{Example}[section] 30 | 31 | \theoremstyle{definition} 32 | \newtheorem{definition}{Definition}[section] 33 | 34 | \theoremstyle{remark} 35 | \newtheorem{remark}{Remark} 36 | 37 | 38 | \newcommand{\Phu}[1]{{\bf \color{red} [[Phu: #1]]}} 39 | \setlength\parindent{0pt} 40 | \setlength\parskip{5pt} 41 | \usepackage[margin=1.0in]{geometry} 42 | 43 | \newcommand{\dby}{\ \mathrm{d}} 44 | \newcommand{\argmax}[1]{\underset{#1}{\arg\max \ }} 45 | \newcommand{\argmin}[1]{\underset{#1}{\arg\min \ }} 46 | \newcommand{\const}{\text{const.}} 47 | \newcommand{\bracka}[1]{\left( #1 \right)} 48 | \newcommand{\brackb}[1]{\left[ #1 \right]} 49 | \newcommand{\brackc}[1]{\left\{ #1 \right\}} 50 | \newcommand{\brackd}[1]{\left\langle #1 \right\rangle} 51 | \newcommand{\abs}[1]{\left| #1 \right|} 52 | \newcommand*\circled[1]{\tikz[baseline=(char.base)]{ 53 | \node[shape=circle,draw,inner sep=2pt] (char) {#1};}} 54 | \newcommand{\red}[1]{{\color{red} #1}} 55 | \newcommand{\loss}{\mathcal{L}} 56 | \newcommand{\correctquote}[1]{``#1''} 57 | \newcommand{\norm}[1]{\left\lVert#1\right\rVert} 58 | 59 | % From https://tex.stackexchange.com/questions/194426/split-itemize-into-multiple-columns 60 | \usepackage{etoolbox,refcount} 61 | \usepackage{multicol} 62 | 63 | \newcounter{countitems} 64 | \newcounter{nextitemizecount} 65 | \newcommand{\setupcountitems}{% 66 | \stepcounter{nextitemizecount}% 67 | \setcounter{countitems}{0}% 68 | \preto\item{\stepcounter{countitems}}% 69 | } 70 | \makeatletter 71 | \newcommand{\computecountitems}{% 72 | \edef\@currentlabel{\number\c@countitems}% 73 | \label{countitems@\number\numexpr\value{nextitemizecount}-1\relax}% 74 | } 75 | \newcommand{\nextitemizecount}{% 76 |
\getrefnumber{countitems@\number\c@nextitemizecount}% 77 | } 78 | \newcommand{\previtemizecount}{% 79 | \getrefnumber{countitems@\number\numexpr\value{nextitemizecount}-1\relax}% 80 | } 81 | \makeatother 82 | \newenvironment{AutoMultiColItemize}{% 83 | \ifnumcomp{\nextitemizecount}{>}{3}{\begin{multicols}{2}}{}% 84 | \setupcountitems\begin{itemize}}% 85 | {\end{itemize}% 86 | \unskip\computecountitems\ifnumcomp{\previtemizecount}{>}{3}{\end{multicols}}{}} 87 | 88 | 89 | \title{Advanced Topics in Machine Learning} 90 | \author{Phu Sakulwongtana} 91 | \date{} 92 | 93 | \begin{document} 94 | 95 | \maketitle 96 | 97 | \tableofcontents 98 | 99 | \chapter{Convex Optimization} 100 | \input{notes/Opitimization.tex} 101 | 102 | \chapter{RKHS in Machine Learning} 103 | \input{notes/RKHS.tex} 104 | 105 | \chapter{Statistical Learning} 106 | \input{notes/statLearning.tex} 107 | 108 | \appendix 109 | \chapter{Additional Proof} 110 | \input{notes/RKHSAppendix.tex} 111 | \input{notes/statLearningAppendix.tex} 112 | 113 | 114 | % \section{Introduction} 115 | 116 | % \begin{algorithm}[H] 117 | % \caption{$PSRO_{RN}$} 118 | % \begin{algorithmic}[1] 119 | % \State \textbf{Input}: Initial Population $\mathcal{B}_1$ 120 | % \For {$i=1,2,\cdots, T$} 121 | % \State $p \leftarrow \text{Nash}(A_{\mathcal{B}_i})$ 122 | % \For {agent $v_i$ with positive mass in $p_t$} 123 | % \State $v_{i+1} \leftarrow \text{oracle}(v_i, \sum_{w \in \mathcal{B}_i} p[i](\phi_{v_i}(\cdot))_+)$ 124 | % \EndFor 125 | % \State $\mathcal{B}_{i+1} = \mathcal{B} \cup \{v_{i+1} : \text{as updated above}\}$ 126 | % \EndFor 127 | % \end{algorithmic} 128 | % \end{algorithm} 129 | 130 | % \begin{table}[!h] 131 | % \centering 132 | % \begin{tabular}{lcccc} 133 | % \toprule 134 | % \textbf{Methods/Metrics} & \textbf{Accuracy} & \textbf{Precision} & \textbf{Recall} & \textbf{F1} \\ 135 | % \midrule 136 | % Logistic Regression & $48.26 \pm 0.00$ & $49.79 \pm 0.00$ & $47.94 \pm 0.00$ & $47.75 \pm 0.00$ \\ 137 | % Support Vector Machine & $\mathbf{48.91} \pm 0.00$ & $\mathbf{50.50} \pm 0.00$ & $\mathbf{48.95} \pm 0.00$ & $\mathbf{49.18 }\pm 0.00$ \\ 138 | % Random Forest Classifier & $44.38 \pm 1.57$ & $44.60 \pm 1.60$ & $44.30 \pm 1.58$ & $44.38 \pm 1.57$ \\ 139 | % \midrule 140 | % Multi-Dimensional ELO & $34.51 \pm 3.12$ & $23.01 \pm 2.06$ & $33.70 \pm 3.03$ & $27.33 \pm 2.47$ \\ 141 | % TrueSkill\texttrademark & $44.99 \pm 0.00$ & $45.26 \pm 0.00$ & $44.17 \pm 0.00$ & $38.32 \pm 0.00$ \\ 142 | % \bottomrule 143 | % \end{tabular} 144 | 145 | % \caption{Results from average of 10 random seeds with 2 standard deviations. The top rows are baseline and the bottom are proposed methods. Precision, Recall and F1 are all macro-averaged. Note that some of the standard deviations are near zero.
All deep learning models are trained for 3 epochs.} 146 | 147 | % \label{table:result-table} 148 | % \vspace{-16pt} 149 | % \end{table} 150 | 151 | % \begin{AutoMultiColItemize} 152 | % \item Item 1 153 | % \item Item 2 154 | % \item Item 3 155 | % \item Item 4 156 | % \item Item 5 157 | % \item Item 6 158 | % \end{AutoMultiColItemize} 159 | 160 | 161 | % \bibliographystyle{plain} 162 | % \bibliography{references} 163 | \end{document} 164 | -------------------------------------------------------------------------------- /adv-topic-in-ml/notes/RKHSAppendix.tex: -------------------------------------------------------------------------------- 1 | 2 | \section{RKHS in Machine Learning} 3 | \subsection{Expansion of Centered Matrix for PCA} 4 | \label{appendix:pca-center-matrix} 5 | 6 | % We have the following matrix $XHX^T$, and we have: 7 | % \begin{equation*} 8 | % \begin{aligned} 9 | % &\begin{bmatrix} 10 | % x_{11} & x_{12} & \cdots & x_{1n} \\ 11 | % x_{21} & x_{22} & \cdots & x_{2n} \\ 12 | % \vdots & \vdots & \ddots & \vdots \\ 13 | % x_{d1} & x_{d2} & \cdots & x_{dn} \\ 14 | % \end{bmatrix}\begin{bmatrix} 15 | % 1-1/n & -1/n & \cdots & -1/n \\ 16 | % -1/n & 1-1/n & \cdots & -1/n \\ 17 | % \vdots & \vdots & \ddots & \vdots \\ 18 | % -1/n & -1/n & \cdots & 1-1/n \\ 19 | % \end{bmatrix} 20 | % \begin{bmatrix} 21 | % x_{11} & x_{21} & \cdots & x_{d1} \\ 22 | % x_{12} & x_{22} & \cdots & x_{d2} \\ 23 | % \vdots & \vdots & \ddots & \vdots \\ 24 | % x_{1n} & x_{2n} & \cdots & x_{dn} \\ 25 | % \end{bmatrix} \\ 26 | % &=\begin{bmatrix} 27 | % x_{11}-\frac{1}{n}\sum^n_{i=1}x_{1i} & x_{12}-\frac{1}{n}\sum^n_{i=1}x_{1i} & \cdots & x_{1n}-\frac{1}{n}\sum^n_{i=1}x_{1i} \\ 28 | % x_{21}-\frac{1}{n}\sum^n_{i=1}x_{2i} & x_{22}-\frac{1}{n}\sum^n_{i=1}x_{2i} & \cdots & x_{2n}-\frac{1}{n}\sum^n_{i=1}x_{2i} \\ 29 | % \vdots & \vdots & \ddots & \vdots \\ 30 | % x_{d1}-\frac{1}{n}\sum^n_{i=1}x_{di} & x_{d2}-\frac{1}{n}\sum^n_{i=1}x_{di} & \cdots & x_{dn}-\frac{1}{n}\sum^n_{i=1}x_{di} \\ 31 | % \end{bmatrix}\begin{bmatrix} 32 | % x_{11} & x_{21} & \cdots & x_{d1} \\ 33 | % x_{12} & x_{22} & \cdots & x_{d2} \\ 34 | % \vdots & \vdots & \ddots & \vdots \\ 35 | % x_{1n} & x_{2n} & \cdots & x_{dn} \\ 36 | % \end{bmatrix} \\ 37 | % &=\begin{bmatrix} 38 | % \sum^n_{j=1} x_{1j} \bracka{x_{1j} -\frac{1}{n}\sum^n_{i=1}x_{1i}} & \sum^n_{j=1} x_{2j} \bracka{x_{1j} -\frac{1}{n}\sum^n_{i=1}x_{1i}} & \cdots & \sum^n_{j=1} x_{dj} \bracka{x_{1j} -\frac{1}{n}\sum^n_{i=1}x_{1i}} \\ 39 | % \sum^n_{j=1} x_{1j} \bracka{x_{2j} -\frac{1}{n}\sum^n_{i=1}x_{2i}} & \sum^n_{j=1} x_{2j} \bracka{x_{2j} -\frac{1}{n}\sum^n_{i=1}x_{2i}} & \cdots & \sum^n_{j=1} x_{dj} \bracka{x_{2j} -\frac{1}{n}\sum^n_{i=1}x_{2i}} \\ 40 | % \vdots & \vdots & \ddots & \vdots \\ 41 | % \sum^n_{j=1} x_{1j} \bracka{x_{dj} -\frac{1}{n}\sum^n_{i=1}x_{di}} & \sum^n_{j=1} x_{2j} \bracka{x_{dj} -\frac{1}{n}\sum^n_{i=1}x_{di}} & \cdots & \sum^n_{j=1} x_{dj} \bracka{x_{dj} -\frac{1}{n}\sum^n_{i=1}x_{di}} 42 | % \end{bmatrix} 43 | % \end{aligned} 44 | % \end{equation*} 45 | % Now, consider 46 | % \begin{equation*} 47 | % \sum^n_{i=1}\bracka{x_i-\frac{1}{n}\sum^n_{j=1}x_j}\bracka{x_i-\frac{1}{n}\sum^n_{j=1}x_j}^T 48 | % \end{equation*} 49 | % which, we have: 50 | % \begin{equation*} 51 | % \sum^n_{i=1}\begin{bmatrix} 52 | % x_{1i}-1/n\sum^n_{j=1}x_{1j} \\ x_{2i}-1/n\sum^n_{j=1}x_{2j} \\ \vdots \\ x_{di}-1/n\sum^n_{j=1}x_{dj} 53 | % \end{bmatrix}\begin{bmatrix} 54 | % x_{1i}- \frac{1}{n}\sum^n_{j=1}x_{1j} & x_{2i}-\frac{1}{n}\sum^n_{j=1}x_{2j} & \cdots & 
x_{di}-\frac{1}{n}\sum^n_{j=1}x_{dj} 55 | % \end{bmatrix} 56 | % \end{equation*} 57 | % Then, we have the following matrix at $(a, b)$: 58 | % \begin{equation*} 59 | % \begin{aligned} 60 | % \sum^n_{i=1} x_{ai}x_{bi} &- \frac{x_{ai}}{n}\sum^n_{j=1}x_{bj} - \frac{x_{bi}}{n}\sum^n_{j=1}x_{aj} + \frac{1}{n^2}\bracka{\sum^n_{j=1}x_{aj}}\bracka{\sum^n_{j=1}x_{bj}} \\ 61 | % &= \brackb{\frac{1}{n}\bracka{\sum^n_{j=1}x_{aj}}\bracka{\sum^n_{j=1}x_{bj}} - \frac{1}{n}\sum^n_{i=1}x_{ai}\sum^n_{j=1}x_{bj}} + \brackb{\sum^n_{i=1}x_{ai}x_{bi} - \frac{1}{n}x_{bi}\sum_{j=1}^n x_{aj}} \\ 62 | % &= \sum^n_{i=1} x_{bi} \bracka{x_{ai} - \frac{1}{n}\sum^n_{j=1}x_{aj}} 63 | % \end{aligned} 64 | % \end{equation*} 65 | A smarter way to do it is: 66 | \begin{equation*} 67 | X\bracka{I - \frac{1}{n}\boldsymbol 1_{n\times n}}X^T = XX^T - \frac{1}{n}X\boldsymbol{1}_{n\times n}X^T 68 | \end{equation*} 69 | Now, we consider the second expression: 70 | \begin{equation*} 71 | \begin{aligned} 72 | \sum^n_{i=1}\bracka{x_i-\frac{1}{n}\sum^n_{j=1}x_j}\bracka{x_i-\frac{1}{n}\sum^n_{j=1}x_j}^T &= \sum^n_{i=1}\brackb{x_ix_i^T - \frac{1}{n}X\boldsymbol{1}x_i^T - \frac{1}{n} x_i\boldsymbol{1}^TX^T + \frac{1}{n^2}X\boldsymbol{1}\boldsymbol{1}^TX^T} \\ 73 | &= XX^T - \frac{1}{n}X\boldsymbol{1}\bracka{\sum^n_{i=1}x_i}^T - \frac{1}{n}\bracka{\sum^n_{i=1}x_i}\boldsymbol{1}^TX^T + \frac{1}{n}X\boldsymbol{1}\boldsymbol{1}^TX^T \\ 74 | &= XX^T - \frac{1}{n}X\boldsymbol{1}\boldsymbol{1}^TX^T - \frac{1}{n}X\boldsymbol{1}\boldsymbol{1}^TX^T + \frac{1}{n}X\boldsymbol{1}\boldsymbol{1}^TX^T \\ 75 | &= XX^T - \frac{1}{n}X\boldsymbol{1}\boldsymbol{1}^TX^T 76 | \end{aligned} 77 | \end{equation*} 78 | Note that $\sum^n_{i=1}x_i = X\boldsymbol{1}$, so both cross terms are equal to $\frac{1}{n}(X\boldsymbol{1})(X\boldsymbol{1})^T$, which is symmetric. 80 |
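As a small extra check of my own (not from the lectures), the centering matrix $H = I - \frac{1}{n}\boldsymbol 1_{n\times n}$ is idempotent:
\begin{equation*}
H^2 = I - \frac{2}{n}\boldsymbol 1_{n\times n} + \frac{1}{n^2}\boldsymbol 1_{n\times n}\boldsymbol 1_{n\times n} = I - \frac{2}{n}\boldsymbol 1_{n\times n} + \frac{1}{n}\boldsymbol 1_{n\times n} = H
\end{equation*}
since $\boldsymbol 1_{n\times n}\boldsymbol 1_{n\times n} = n\boldsymbol 1_{n\times n}$. So centering already-centered data changes nothing, and the same identity is useful when centering the kernel matrix below.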
81 | 82 | \subsection{Centering Kernel Matrix} 83 | \label{appendix:kernel-pca-centering} 84 | Please note that 85 | \begin{equation*} 86 | \tilde{k}(x_i, x_j) = \brackd{\tilde{\phi}(x_i), \tilde{\phi}(x_j)} = \brackd{\phi(x_i) - \frac{1}{n}\sum^n_{k=1}\phi(x_k), \phi(x_j) - \frac{1}{n}\sum^n_{k=1}\phi(x_k)} 87 | \end{equation*} 88 | Let's see that: 89 | \begin{equation*} 90 | \begin{aligned} 91 | \tilde{k}(x_i, x_j) &= \brackd{\phi(x_i) - \frac{1}{n}\sum^n_{k=1}\phi(x_k), \phi(x_j) - \frac{1}{n}\sum^n_{k=1}\phi(x_k)} \\ 92 | &= \brackd{\phi(x_i), \phi(x_j)} - \brackd{\phi(x_i), \frac{1}{n}\sum^n_{k=1}\phi(x_k) } - \brackd{\phi(x_j), \frac{1}{n}\sum^n_{k=1}\phi(x_k)} + \brackd{\frac{1}{n}\sum^n_{k=1}\phi(x_k), \frac{1}{n}\sum^n_{k=1}\phi(x_k)}\\ 93 | &= \underbrace{\brackd{\phi(x_i), \phi(x_j)}}_{\circled{1}} - \underbrace{\frac{1}{n}\sum^n_{k=1}\brackd{\phi(x_i),\phi(x_k)} - \frac{1}{n}\sum^n_{k=1}\brackd{\phi(x_j), \phi(x_k)}}_{\circled{2}} + \underbrace{\frac{1}{n^2} \sum^n_{k=1}\sum^n_{l=1}\brackd{\phi(x_k), \phi(x_l)}}_{\circled{3}} 94 | \end{aligned} 95 | \end{equation*} 96 | Now, let's consider $\tilde{K} = HKH$, which gives: 97 | \begin{equation*} 98 | \begin{aligned} 99 | \tilde{K} &= \bracka{I - \frac{1}{n}\boldsymbol 1_{n\times n}}K\bracka{I - \frac{1}{n}\boldsymbol 1_{n\times n}} = \bracka{K- \frac{1}{n}\boldsymbol{1}_{n\times n}K }\bracka{I - \frac{1}{n}\boldsymbol{1}_{n\times n}} \\ 100 | &= K-\frac{1}{n}K\boldsymbol{1}_{n\times n} - \frac{1}{n}\boldsymbol{1}_{n\times n}K + \frac{1}{n^2}\boldsymbol{1}_{n\times n}K\boldsymbol{1}_{n\times n} 101 | \end{aligned} 102 | \end{equation*} 103 | It is clear that $K$ corresponds to $\circled{1}$, and we can see that: 104 | \begin{equation*} 105 | \begin{aligned} 106 | \frac{1}{n}K\boldsymbol{1}_{n\times n} = \frac{1}{n}\begin{bmatrix} 107 | \cdots & \sum^n_{i=1}\brackd{x_1, x_i} & \cdots \\ 108 | \cdots & \sum^n_{i=1}\brackd{x_2, x_i} & \cdots \\ 109 | & \vdots & \\ 110 | \cdots & \sum^n_{i=1}\brackd{x_n, x_i} & \cdots \\ 111 | \end{bmatrix} \qquad \frac{1}{n}\boldsymbol{1}_{n\times n}K = \frac{1}{n}\begin{bmatrix} 112 | \vdots & \vdots & & \vdots \\ 113 | \sum^n_{i=1}\brackd{x_1, x_i} & \sum^n_{i=1}\brackd{x_2, x_i} & \cdots & \sum^n_{i=1}\brackd{x_n, x_i} \\ 114 | \vdots & \vdots & & \vdots \\ 115 | \end{bmatrix} 116 | \end{aligned} 117 | \end{equation*} 118 | And so their sum gives $\circled{2}$. Finally, $\circled{3}$ follows easily, as we use the result above and multiply by $\boldsymbol 1_{n\times n}$ once more. 119 | 120 | \subsection{Ridge Regression Expansion} 121 | \label{appendix:ridge-regression} 122 | We will show that 123 | \begin{equation*} 124 | - 2y^TX^TCb + b^Tb = \norm{CXy-b}^2 - \norm{y^TX^TC}^2 125 | \end{equation*} 126 | where $C=(XX^T+\lambda I)^{-1/2}$; please note that $C = C^T$, so $\norm{y^TX^TC} = \norm{CXy}$. Let's consider the right-hand side: 127 | \begin{equation*} 128 | \begin{aligned} 129 | \norm{CXy-b}^2 - \norm{CXy}^2 &= (CXy-b)^T(CXy-b) - (CXy)^T(CXy) \\ 130 | &= (y^TX^TC^T - b^T)(CXy-b) - y^TX^TC^TCXy \\ 131 | &= y^TX^TC^TCXy - y^TX^TC^Tb - b^TCXy + b^Tb - y^TX^TC^TCXy \\ 132 | &= - 2y^TX^TC^Tb + b^Tb 133 | \end{aligned} 134 | \end{equation*} 135 | where we used that $b^TCXy$ is a scalar, so $b^TCXy = (b^TCXy)^T = y^TX^TC^Tb$. 136 | 137 | \subsection{Representer Theorem for Ridge Regression} 138 | \label{appendix:representor-ridge} 139 | We will assume that 140 | \begin{equation*} 141 | \begin{aligned} 142 | X(X^TX + \lambda I_n)^{-1}y &= X\begin{bmatrix} 143 | \beta_{11} & \beta_{12} & \cdots & \beta_{1n} \\ 144 | \beta_{21} & \beta_{22} & \cdots & \beta_{2n} \\ 145 | \vdots & \vdots & \ddots & \vdots \\ 146 | \beta_{n1} & \beta_{n2} & \cdots & \beta_{nn} \\ 147 | \end{bmatrix}y \\ 148 | &= \begin{bmatrix} 149 | \sum^n_{i=1}x_{1i}\beta_{i1} & \sum^n_{i=1}x_{1i}\beta_{i2} & \cdots & \sum^n_{i=1}x_{1i}\beta_{in} \\ 150 | \sum^n_{i=1}x_{2i}\beta_{i1} & \sum^n_{i=1}x_{2i}\beta_{i2} & \cdots & \sum^n_{i=1}x_{2i}\beta_{in} \\ 151 | \vdots & \vdots & \ddots & \vdots \\ 152 | \sum^n_{i=1}x_{di}\beta_{i1} & \sum^n_{i=1}x_{di}\beta_{i2} & \cdots & \sum^n_{i=1}x_{di}\beta_{in} \\ 153 | \end{bmatrix}y \\ 154 | &= \begin{bmatrix} 155 | \sum^n_{i=1}x_{1i}\beta_{i1} & \sum^n_{i=1}x_{1i}\beta_{i2} & \cdots & \sum^n_{i=1}x_{1i}\beta_{in} \\ 156 | \sum^n_{i=1}x_{2i}\beta_{i1} & \sum^n_{i=1}x_{2i}\beta_{i2} & \cdots & \sum^n_{i=1}x_{2i}\beta_{in} \\ 157 | \vdots & \vdots & \ddots & \vdots \\ 158 | \sum^n_{i=1}x_{di}\beta_{i1} & \sum^n_{i=1}x_{di}\beta_{i2} & \cdots & \sum^n_{i=1}x_{di}\beta_{in} \\ 159 | \end{bmatrix} 160 | \begin{bmatrix} 161 | y_1 \\ y_2 \\ \vdots \\ y_n 162 | \end{bmatrix} \\ 163 | &= \begin{bmatrix} 164 | \sum^n_{j=1}y_j\sum^n_{i=1}x_{1i}\beta_{ij} \\ 165 | \sum^n_{j=1}y_j\sum^n_{i=1}x_{2i}\beta_{ij} \\ 166 | \vdots \\ 167 | \sum^n_{j=1}y_j\sum^n_{i=1}x_{di}\beta_{ij} \\ 168 | \end{bmatrix} = \begin{bmatrix} 169 | \sum^n_{j=1}\sum^n_{i=1}y_jx_{1i}\beta_{ij} \\ 170 | \sum^n_{j=1}\sum^n_{i=1}y_jx_{2i}\beta_{ij} \\ 171 | \vdots \\ 172 | \sum^n_{j=1}\sum^n_{i=1}y_jx_{di}\beta_{ij} \\ 173 |
\end{bmatrix} \\ 174 | &= \begin{bmatrix} 175 | \sum^n_{i=1}\sum^n_{j=1} y_jx_{1i}\beta_{ij} \\ 176 | \sum^n_{i=1}\sum^n_{j=1} y_jx_{2i}\beta_{ij} \\ 177 | \vdots \\ 178 | \sum^n_{i=1}\sum^n_{j=1} y_jx_{di}\beta_{ij} \\ 179 | \end{bmatrix} 180 | \end{aligned} 181 | \end{equation*} 182 | The rest will be in the main proof. 183 | 184 | \subsection{MMD Integration} 185 | \label{appendix:MMD-integration} 186 | We have 187 | \begin{equation*} 188 | \begin{aligned} 189 | \iint &\brackb{k(s-t)\dby(P-Q)(s)}\dby(P-Q)(t) \\ 190 | &= \int \Big[ \mathbb{E}_{s\sim P}\brackb{k(s-t)} - \mathbb{E}_{s\sim Q}\brackb{k(s-t)} \Big] \dby (P-Q)(t) \\ 191 | &= \int \mathbb{E}_{s\sim P}\brackb{k(s-t)} \dby (P-Q)(t) - \int\mathbb{E}_{s\sim Q}\brackb{k(s-t)} \dby (P-Q)(t) \\ 192 | &= \Big[\mathbb{E}_{t\sim P}\mathbb{E}_{s\sim P}[k(s-t)] - \mathbb{E}_{t\sim Q}\mathbb{E}_{s\sim P}[k(s-t)] \Big] - \Big[ \mathbb{E}_{t\sim P}\mathbb{E}_{s\sim Q} [k(s-t)] - \mathbb{E}_{t\sim Q}\mathbb{E}_{s\sim Q} [k(s-t)] \Big]\\ 193 | &= \mathbb{E}_P[k(s-t)] + \mathbb{E}_Q[k(s-t)] - 2\mathbb{E}_{P, Q}[k(s-t)] 194 | \end{aligned} 195 | \end{equation*} 196 | 197 | \subsection{Biased Estimate of HSIC Part 2} 198 | \label{appendix:HSIC-bias-2} 199 | We have 200 | \begin{equation*} 201 | \boldsymbol1^TK = \begin{bmatrix} 202 | \sum^n_{a=1}k_{a1} & \sum^n_{a=1}k_{a2} & \cdots & \sum^n_{a=1}k_{an} 203 | \end{bmatrix} \qquad L\boldsymbol1=\begin{bmatrix} 204 | \sum^n_{b=1}l_{1b} \\ 205 | \sum^n_{b=1}l_{2b} \\ 206 | \vdots \\ 207 | \sum^n_{b=1}l_{nb} \\ 208 | \end{bmatrix} 209 | \end{equation*} -------------------------------------------------------------------------------- /adv-topic-in-ml/notes/statLearningAppendix.tex: -------------------------------------------------------------------------------- 1 | \section{Experimental Proof} 2 | 3 | \subsection{Projected Gradient Descent} 4 | 5 | \begin{lemma} 6 | We would like to note that, for any $y\in \mathbb{R}^d$ and $x\in\Omega$, where $\Omega$ is convex: 7 | \begin{equation*} 8 | \norm{\Pi_\Omega(y) - x}^2 \le \norm{y - x}^2 - \norm{y - \Pi_\Omega(y)}^2 9 | \end{equation*} 10 | \end{lemma} 11 | 12 | \begin{remark} 13 | The projected gradient descent can be split into two parts: 14 | \begin{equation*} 15 | \begin{aligned} 16 | &y_{t+1}= x_t - \gamma \nabla f(x_t) \\ 17 | &x_{t+1}= \Pi_\Omega(y_{t+1}) 18 | \end{aligned} 19 | \end{equation*} 20 | \end{remark}
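\begin{example}
(A small illustration of my own, assuming the standard Euclidean projection.) If $\Omega = \brackc{x : \norm{x}\le R}$ is a Euclidean ball, the projection has a closed form:
\begin{equation*}
\Pi_\Omega(y) = \begin{cases} y & \norm{y}\le R \\ R\, y/\norm{y} & \text{otherwise} \end{cases}
\end{equation*}
so each iteration of projected gradient descent is a plain gradient step followed by rescaling back onto the ball whenever the step leaves it.
\end{example}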
 21 | -------------------------------------------------------------------------------- /approx-infer/approx-infer.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Phutoast/UCL-CSML-Notes/292a123e2dfd17d3ca1a0e268d199cae9632a5b9/approx-infer/approx-infer.pdf -------------------------------------------------------------------------------- /approx-infer/approx-infer.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage[utf8]{inputenc} 3 | \usepackage{amsmath} 4 | \usepackage{amsthm} 5 | \usepackage{amsfonts} 6 | \usepackage[colorlinks]{hyperref} 7 | \usepackage{natbib} 8 | \usepackage{graphicx} 9 | \usepackage{algorithm} 10 | \usepackage{algpseudocode} 11 | \usepackage{booktabs} 12 | \usepackage{caption} 13 | \usepackage{tikz} 14 | 15 | \newtheorem{theorem}{Theorem}[section] 16 | \newtheorem{corollary}{Corollary}[section] 17 | \newtheorem{proposition}{Proposition}[section] 18 | \newtheorem{lemma}{Lemma}[section] 19 | \newtheorem{claim}{Claim}[section] 20 | \newtheorem{conjecture}{Conjecture}[section] 21 | \newtheorem{example}{Example}[section] 22 | 23 | \theoremstyle{definition} 24 | \newtheorem{definition}{Definition}[section] 25 | 26 | \theoremstyle{remark} 27 | \newtheorem{remark}{Remark} 28 | 29 | 30 | \newcommand{\Phu}[1]{{\bf \color{red} [[Phu: #1]]}} 31 | \setlength\parindent{0pt} 32 | \setlength\parskip{5pt} 33 | \usepackage[margin=1.0in]{geometry} 34 | 35 | \newcommand{\dby}{\ \mathrm{d}} 36 | \newcommand{\argmax}[1]{\underset{#1}{\arg\max \ }} 37 | \newcommand{\argmin}[1]{\underset{#1}{\arg\min \ }} 38 | \newcommand{\const}{\text{const.}} 39 | \newcommand{\bracka}[1]{\left( #1 \right)} 40 | \newcommand{\brackb}[1]{\left[ #1 \right]} 41 | \newcommand{\brackc}[1]{\left\{ #1 \right\}} 42 | \newcommand{\brackd}[1]{\left\langle #1 \right\rangle} 43 | \newcommand{\abs}[1]{\left| #1 \right|} 44 | \newcommand{\contractop}{\mathcal{B}} 45 | \newcommand*\circled[1]{\tikz[baseline=(char.base)]{ 46 | \node[shape=circle,draw,inner sep=2pt] (char) {#1};}} 47 | \newcommand{\red}[1]{{\color{red} #1}} 48 | \newcommand{\loss}{\mathcal{L}} 49 | \newcommand{\correctquote}[1]{``#1''} 50 | \newcommand{\norm}[1]{\left\lVert#1\right\rVert} 51 | \newcommand{\ind}{\perp \!\!\! \perp } 52 | 53 | % From https://tex.stackexchange.com/questions/194426/split-itemize-into-multiple-columns 54 | \usepackage{etoolbox,refcount} 55 | \usepackage{multicol} 56 | 57 | \newcounter{countitems} 58 | \newcounter{nextitemizecount} 59 | \newcommand{\setupcountitems}{% 60 | \stepcounter{nextitemizecount}% 61 | \setcounter{countitems}{0}% 62 | \preto\item{\stepcounter{countitems}}% 63 | } 64 | \makeatletter 65 | \newcommand{\computecountitems}{% 66 | \edef\@currentlabel{\number\c@countitems}% 67 | \label{countitems@\number\numexpr\value{nextitemizecount}-1\relax}% 68 | } 69 | \newcommand{\nextitemizecount}{% 70 | \getrefnumber{countitems@\number\c@nextitemizecount}% 71 | } 72 | \newcommand{\previtemizecount}{% 73 | \getrefnumber{countitems@\number\numexpr\value{nextitemizecount}-1\relax}% 74 | } 75 | \makeatother 76 | \newenvironment{AutoMultiColItemize}{% 77 | \ifnumcomp{\nextitemizecount}{>}{3}{\begin{multicols}{2}}{}% 78 | \setupcountitems\begin{itemize}}% 79 | {\end{itemize}% 80 | \unskip\computecountitems\ifnumcomp{\previtemizecount}{>}{3}{\end{multicols}}{}} 81 | 82 | 83 | \title{Approximate Inference} 84 | \author{Phu Sakulwongtana} 85 | \date{} 86 | 87 | \begin{document} 88 | 89 | \maketitle 90 | 91 | \input{contents/part1.tex} 92 | \input{contents/part2.tex} 93 | \input{contents/part3.tex} 94 | \input{contents/part4.tex} 95 | \input{contents/part5.tex} 96 | \input{contents/part6.tex} 97 | \input{contents/part7.tex} 98 | 99 | % \begin{algorithm}[H] 100 | % \caption{$PSRO_{RN}$} 101 | % \begin{algorithmic}[1] 102 | % \State \textbf{Input}: Initial Population $\mathcal{B}_1$ 103 | % \For {$i=1,2,\cdots, T$} 104 | % \State $p \leftarrow \text{Nash}(A_{\mathcal{B}_i})$ 105 | % \For {agent $v_i$ with positive mass in $p_t$} 106 | % \State $v_{i+1} \leftarrow \text{oracle}(v_i, \sum_{w \in \mathcal{B}_i} p[i](\phi_{v_i}(\cdot))_+)$ 107 | % \EndFor 108 | % \State $\mathcal{B}_{i+1} = \mathcal{B} \cup \{v_{i+1} : \text{as updated above}\}$ 109 | % \EndFor 110 | % \end{algorithmic} 111 | % \end{algorithm} 112 | 113 | % \begin{table}[!h] 114 | % \centering 115 | % \begin{tabular}{lc} 116 | % \toprule 117 | % \textbf{Methods/Metrics} & \textbf{Accuracy} \\ 118 | % \midrule 119 | % Logistic Regression & $48.26 \pm 0.00$ \\ 120 | % Support Vector Machine & $48.91 \pm 0.00$ \\ 121 | % Random Forest Classifier & $44.38 \pm 1.57$ \\
122 | % \midrule 123 | % Multi-Dimensional ELO & $34.51 \pm 3.12$ \\ 124 | % TrueSkill\texttrademark & $44.99 \pm 0.00$ \\ 125 | % \bottomrule 126 | % \end{tabular} 127 | 128 | % \caption{} 129 | 130 | % \label{table} 131 | % \end{table} 132 | 133 | % \begin{AutoMultiColItemize} 134 | % \item Item 1 135 | % \item Item 2 136 | % \item Item 3 137 | % \item Item 4 138 | % \item Item 5 139 | % \item Item 6 140 | % \end{AutoMultiColItemize} 141 | 142 | 143 | % \bibliographystyle{plain} 144 | % \bibliography{references} 145 | \end{document} 146 | -------------------------------------------------------------------------------- /approx-infer/contents/part7.tex: -------------------------------------------------------------------------------- 1 | \section{Variational Method} 2 | 3 | \subsection{Introduction} 4 | 5 | \begin{remark}{\textbf{(Limitations)}} 6 | Our treatment of variational methods has emphasised natural choices of variational family, often factorised, with some functional (exponential) form for the joint. They are mostly restricted to joint exponential families, which facilitate hierarchical and distributional models, but not non-linear and non-conjugate ones. 7 | \end{remark} 8 | 9 | \begin{remark}{\textbf{(Using Unconstrained Optimization)}} 10 | Consider a parametric variational approximation via a constrained family $q(\mathcal{Z};\boldsymbol \rho)$. The constrained variational E-step becomes 11 | \begin{equation*} 12 | q(\mathcal{Z}) = \argmax{q\in\brackc{q(\mathcal{Z};\rho)}} \mathcal{F}(q(\mathcal{Z}), \boldsymbol \theta^{(k-1)}) \implies \boldsymbol \rho^{(k)} = \argmax{\boldsymbol \rho}\mathcal{F}(q(\mathcal{Z};\boldsymbol \rho), \boldsymbol \theta^{(k-1)}) 13 | \end{equation*} 14 | \end{remark} 15 | 16 | \begin{remark}{\textbf{(Reparameterized Free Energy)}} 17 | We can replace the constrained optimisation of $\mathcal{F}(q, \boldsymbol \theta)$ with unconstrained optimisation of the reparameterised $\mathcal{F}(\boldsymbol \rho, \boldsymbol \theta)$: 18 | \begin{equation*} 19 | \mathcal{F}(\boldsymbol \rho, \boldsymbol \theta) = \brackd{\log P(\mathcal{X}, \mathcal{Z} | \boldsymbol \theta^{(k-1)})}_{q(\mathcal{Z};\boldsymbol \rho)} + H[q(\mathcal{Z};\boldsymbol \rho)] 20 | \end{equation*} 21 | We may use coordinate ascent in $\boldsymbol \rho$ and $\boldsymbol \theta$, but this is no longer necessary: 22 | \begin{itemize} 23 | \item In special cases, the expectation of the log-joint under $q(\mathcal{Z};\boldsymbol \rho)$ can be expressed in closed form, and we can follow $\nabla_{\boldsymbol \rho}\mathcal{F}$ directly. 24 | \item In general, this requires evaluating a high-dimensional expectation with respect to $q(\mathcal{Z};\boldsymbol \rho)$ as a function of $\boldsymbol \rho$, which isn't simple. 25 | \item There are three solutions to this problem: 26 | \begin{itemize} 27 | \item \correctquote{Score-based} gradient estimates and Monte Carlo. 28 | \item Recognition network trained in a separate phase (not strictly variational). 29 | \item Recognition network trained simultaneously with the generative model using frozen samples.
30 | \end{itemize} 31 | \end{itemize} 32 | \end{remark} 33 | 34 | \begin{proposition} 35 | One can show that: 36 | \begin{equation*} 37 | \nabla_{\boldsymbol \rho} \mathcal{F}(\boldsymbol \rho, \boldsymbol \theta) = \brackd{ [\nabla_{\boldsymbol \rho}\log q(\mathcal{Z};\boldsymbol \rho)] \Big( \log P(\mathcal{X}, \mathcal{Z} | \boldsymbol \theta) - \log q(\mathcal{Z};\boldsymbol \rho) \Big) }_{q(\mathcal{Z}; \boldsymbol \rho)} 38 | \end{equation*} 39 | \end{proposition} 40 | \begin{proof} 41 | We consider the following gradient: 42 | \begin{equation*} 43 | \begin{aligned} 44 | \nabla_{\boldsymbol \rho}\mathcal{F}(\boldsymbol \rho, \boldsymbol \theta) &= \nabla_{\boldsymbol \rho}\int q(\mathcal{Z};\boldsymbol \rho) \Big[ \log P(\mathcal{X}, \mathcal{Z} | \boldsymbol \theta) - \log q(\mathcal{Z};\boldsymbol \rho) \Big]\dby\mathcal{Z} \\ 45 | &= \int [\nabla_{\boldsymbol \rho} q(\mathcal{Z};\boldsymbol \rho) ] \Big( \log P(\mathcal{X}, \mathcal{Z} | \boldsymbol \theta) - \log q(\mathcal{Z}; \boldsymbol \rho) \Big) + q(\mathcal{Z};\boldsymbol \rho) \nabla_{\boldsymbol \rho}\brackb{ \log P(\mathcal{X}, \mathcal{Z} | \boldsymbol \theta) - \log q(\mathcal{Z} ; \boldsymbol \rho) } \dby \mathcal{Z} 46 | \end{aligned} 47 | \end{equation*} 48 | We have the following facts: 49 | \begin{equation*} 50 | \begin{aligned} 51 | &\nabla_{\boldsymbol \rho} \log P(\mathcal{X}, \mathcal{Z} | \boldsymbol \theta) = 0 \\ 52 | &\int q(\mathcal{Z} ; \boldsymbol \rho)\nabla_{\boldsymbol \rho} \log q(\mathcal{Z}; \boldsymbol \rho) \dby\mathcal{Z} = \int \nabla_{\boldsymbol \rho} q(\mathcal{Z};\boldsymbol \rho) \dby\mathcal{Z} = \nabla_{\boldsymbol \rho}\int q(\mathcal{Z};\boldsymbol \rho)\dby\mathcal{Z} = 0 \\ 53 | &\nabla_{\boldsymbol \rho}q(\mathcal{Z};\boldsymbol \rho) = q(\mathcal{Z};\boldsymbol \rho) \nabla_{\boldsymbol \rho}\log q(\mathcal{Z};\boldsymbol \rho) 54 | \end{aligned} 55 | \end{equation*} 56 | Combining these gives the result as required. 57 | \end{proof}
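In practice, this expectation is estimated with Monte Carlo samples. A minimal sketch of my own (not from the lectures): with $S$ draws $\mathcal{Z}^{(s)} \sim q(\mathcal{Z};\boldsymbol \rho)$,
\begin{equation*}
\nabla_{\boldsymbol \rho}\mathcal{F}(\boldsymbol \rho, \boldsymbol \theta) \approx \frac{1}{S}\sum^S_{s=1} [\nabla_{\boldsymbol \rho}\log q(\mathcal{Z}^{(s)};\boldsymbol \rho)] \Big( \log P(\mathcal{X}, \mathcal{Z}^{(s)} | \boldsymbol \theta) - \log q(\mathcal{Z}^{(s)};\boldsymbol \rho) \Big)
\end{equation*}
This estimator is unbiased but can have high variance; subtracting a constant baseline $b$ inside the bracket leaves the expectation unchanged, because $\brackd{\nabla_{\boldsymbol \rho}\log q(\mathcal{Z};\boldsymbol \rho)}_q = 0$ as shown above.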
 58 | 59 | \begin{remark}{\textbf{(Reducing Variance)}} 60 | The gradient expectation is high-dimensional when evaluated by MC, so we would like to reduce its variance: 61 | \begin{itemize} 62 | \item We can reduce it by factorisation, where $q(\mathcal{Z}) = \prod_iq(\mathcal{Z}_i ; \boldsymbol \rho_i)$ factors over disjoint cliques. 63 | \item Let $\bar{\mathcal{Z}}_i$ be the minimal Markov blanket of $\mathcal{Z}_i$ in the joint, let $P_{\bar{\mathcal{Z}}_i}$ be the product of joint factors that include elements of $\mathcal{Z}_i$, and let $P_{\neg \bar{\mathcal{Z}}_i}$ be the product of the remaining factors. 64 | \end{itemize} 65 | We have the following gradient: 66 | \begin{equation*} 67 | \begin{aligned} 68 | \nabla_{\rho_i}\mathcal{F}(\brackc{\rho_j}, \boldsymbol \theta) &= \brackd{\brackb{\nabla_{\rho_i} \sum_j\log q(\mathcal{Z}_j ; \rho_j) } \bracka{\log P(\mathcal{X}, \mathcal{Z} | \boldsymbol \theta) - \sum_j \log q(\mathcal{Z}_j ; \rho_j)} }_{q(\mathcal{Z})} \\ 69 | &\begin{aligned} 70 | = &\brackd{ [\nabla_{\rho_i} \log q(\mathcal{Z}_i;\boldsymbol \rho_i)] (\log P_{\bar{\mathcal{Z}}_i}(\mathcal{X}, \bar{\mathcal{Z}}_i ) - \log q(\mathcal{Z}_i ; \boldsymbol \rho_i) ) }_{q(\bar{\mathcal{Z}}_i)} \\ 71 | &+\brackd{ [\nabla_{\rho_i} \log q(\mathcal{Z}_i;\boldsymbol \rho_i)]\bracka{ \log P_{\neg \bar{\mathcal{Z}}_i}(\mathcal{X}, \bar{\mathcal{Z}}_{\neg i}) - \sum_{j\ne i} \log q(\mathcal{Z}_j ; \boldsymbol \rho_j) } }_{q(\mathcal{Z})} 72 | \end{aligned} 73 | \end{aligned} 74 | \end{equation*} 75 | Please note that the second term is proportional to $\brackd{\nabla_{\rho_i}\log q(\mathcal{Z}_i; \boldsymbol \rho_i)}_{q(\mathcal{Z}_i)} = 0$, so we only need to consider the expectation with respect to $q(\bar{\mathcal{Z}}_i)$, which is variational message passing. 76 | \end{remark} 77 | 78 | \begin{remark}{\textbf{(Sampling Methods)}} 79 | We consider the following \correctquote{black-box} variational approach: 80 | \begin{itemize} 81 | \item Choose a parametric (factored) variational family $q(\mathcal{Z}) = \prod_i q(\mathcal{Z}_i;\boldsymbol \rho_i)$ 82 | \item Initialise the factors. 83 | \item Repeat until convergence: 84 | \begin{itemize} 85 | \item Stochastic VE-step: Sample from $q(\bar{\mathcal{Z}}_i)$, estimate the expected gradient $\nabla_{\rho_i}\mathcal{F}$, and update $\rho_i$ along the gradient. 86 | \item Stochastic M-step: Sample from each $q(\bar{\mathcal{Z}}_i)$ and update the corresponding parameters. 87 | \end{itemize} 88 | \item Stochastic updates may use Robbins-Monro step sizes to promote convergence. 89 | \item Variance of the gradient estimate can also be controlled by MC techniques. 90 | \end{itemize} 91 | \end{remark} 92 | 93 | \begin{remark}{\textbf{(Batches of Data)}} 94 | We have not distinguished between multivariate models and iid data instances, since we grouped them all together in $\mathcal{Z}$. For large models such as HMMs, however, we often work with multiple data draws, and each instance requires a separate variational optimisation. 95 | \end{remark} 96 | 97 | \begin{definition}{\textbf{(Recognition Model)}} 98 | Suppose we have fixed-length vectors $\brackc{(\boldsymbol x_i, \boldsymbol z_i)}$ where $\boldsymbol z_i$ is latent: 99 | \begin{itemize} 100 | \item The optimal variational distribution $q^*(\boldsymbol z_i)$ will depend on $\boldsymbol x_i$ 101 | \item We want to learn the mapping $q(\boldsymbol z_i;\boldsymbol \rho = f(\boldsymbol x_i;\boldsymbol \phi))$ 102 | \item Now $\boldsymbol \rho$ is the output of a general function approximator $f$ parameterised by $\boldsymbol \phi$, trained to map $\boldsymbol x_i$ to the variational parameters of $q(\boldsymbol z_i)$ 103 | \end{itemize} 104 | The mapping function $f$ is called the recognition model. 105 | \end{definition}
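To make the amortisation explicit (a summary of my own): learning $\boldsymbol \phi$ replaces a separate optimisation over each $\boldsymbol \rho_i$ with a single shared optimisation
\begin{equation*}
\max_{\boldsymbol \phi} \sum_i \mathcal{F}\bracka{q(\boldsymbol z_i ; f(\boldsymbol x_i;\boldsymbol \phi)), \boldsymbol \theta}
\end{equation*}
so that inference for a new $\boldsymbol x$ costs one forward pass through $f$ rather than a fresh optimisation.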
 106 | 107 | \begin{definition}{\textbf{(Helmholtz Model)}} 108 | It is a binary sigmoid belief network with a parallel recognition model. There are two phases of learning: 109 | \begin{itemize} 110 | \item \emph{Wake Phase}: Given the current $f$, estimate a mean-field representation from the data: 111 | \begin{equation*} 112 | q(z_i) = \operatorname{Bern}(\hat{z}_i)\qquad \hat{z}_i = f(x_i;\boldsymbol \phi) 113 | \end{equation*} 114 | We then update the generative parameters $\boldsymbol \theta$ according to $\nabla_{\boldsymbol \theta} \mathcal{F}(\brackc{\hat{z}_i};\boldsymbol \theta)$ 115 | \item \emph{Sleep Phase}: Sample $\brackc{\boldsymbol z_i, \boldsymbol x_i}^S_{i=1}$ from the current generative model. Update the recognition parameters so that $f(\boldsymbol x_i)$ moves toward $\boldsymbol z_i$: 116 | \begin{equation*} 117 | \Delta \boldsymbol \phi \propto \sum^S_{i=1} (\boldsymbol z_i - f(\boldsymbol x_i;\boldsymbol \phi))\nabla_{\boldsymbol \phi}f(\boldsymbol x_i;\boldsymbol \phi) 118 | \end{equation*} 119 | Please note that this step minimises: 120 | \begin{equation*} 121 | \operatorname{KL}\brackb{P_\theta(\boldsymbol z|\boldsymbol x) \Big\| q(\boldsymbol z; f(\boldsymbol x;\boldsymbol \phi)) } 122 | \end{equation*} 123 | This is the opposite of the variational objective, but it may not matter if the divergence is small enough. 124 | \end{itemize} 125 | \end{definition} 126 | 127 | \begin{remark}{\textbf{(Comments on Helmholtz Model Evaluation)}} 128 | We have to sample $\boldsymbol z$ from the recognition model rather than just evaluate the means: 129 | \begin{itemize} 130 | \item The expectation in the free energy can be computed directly rather than by mean substitution. 131 | \item In hierarchical models, the output of a higher recognition layer depends on samples at previous stages, which introduces correlations between samples at different layers. 132 | \end{itemize} 133 | The recognition model structure need not exactly echo the generative model. Please note that a more general approach is to train $f$ to yield the parameters of an exponential-family $q(\boldsymbol z)$. 134 | \end{remark}
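A compact way (in my own words) to compare the two phases: writing the free energy as $\mathcal{F}(q, \boldsymbol \theta) = \log P(\boldsymbol x|\boldsymbol \theta) - \operatorname{KL}\brackb{q(\boldsymbol z) \| P(\boldsymbol z|\boldsymbol x,\boldsymbol \theta)}$, the wake phase works with $\operatorname{KL}[q\|p]$, which is zero-forcing (it penalises $q$ for putting mass where the posterior has none), while the sleep phase minimises $\operatorname{KL}[p\|q]$, which is mass-covering. The two agree only when the divergence is small.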
 135 | 136 | \begin{definition}{\textbf{(Variational Autoencoder)}} 137 | It fuses the wake and sleep phases. We generate the recognition samples using a deterministic transformation of an external random variable. If $f$ gives marginals $\mu_i$ and $\sigma_i$ for latent $z_i$ and $\boldsymbol \varepsilon^s_i \sim\mathcal{N}(0, 1)$, then: 138 | \begin{equation*} 139 | \boldsymbol z^s_i = \boldsymbol \mu_i + \sigma_i\boldsymbol \varepsilon^s_i 140 | \end{equation*} 141 | Then the generative and recognition models can be trained together with gradient descent. Holding $\boldsymbol \varepsilon^s$ fixed: 142 | \begin{equation*} 143 | \mathcal{F}_i(\boldsymbol \theta, \boldsymbol \phi) = \sum_s \log P(\boldsymbol x_i, \boldsymbol z^s_i;\boldsymbol \theta) - \log q(\boldsymbol z^s_i ; f(\boldsymbol x_i, \boldsymbol \phi)) 144 | \end{equation*} 145 | We have the following derivatives: 146 | \begin{equation*} 147 | \begin{aligned} 148 | &\frac{\partial}{\partial \boldsymbol \theta}\mathcal{F}_i = \sum_s \nabla_{\boldsymbol \theta}\log P(\boldsymbol x_i, \boldsymbol z_i^s ; \boldsymbol \theta) \\ 149 | &\frac{\partial}{\partial \boldsymbol \phi}\mathcal{F}_i = \sum_s \frac{\partial}{\partial z^s_i} \Big( \log P(\boldsymbol x_i, \boldsymbol z_i^s ; \boldsymbol \theta) - \log q(\boldsymbol z^s_i ; f(\boldsymbol x_i)) \Big)\frac{d\boldsymbol z_i^s}{d\phi} - \frac{\partial}{\partial f(\boldsymbol x_i)} \log q(\boldsymbol z^s_i ; f(\boldsymbol x_i))\frac{df(\boldsymbol x_i)}{d\phi} 150 | \end{aligned} 151 | \end{equation*} 152 | \end{definition} 153 | 154 | \begin{remark}{\textbf{(Observations on Variational Auto-Encoder)}} 155 | We consider the following observations on the training of VAEs: 156 | \begin{itemize} 157 | \item The frozen samples $\boldsymbol \varepsilon^s$ can be redrawn, which avoids overfitting. 158 | \item It may be possible to evaluate the entropy and $\log P(\boldsymbol z)$ terms without sampling, reducing variance. 159 | \item Differentiable reparameterisations are available for a number of different distributions. 160 | \item The conditional $P(\boldsymbol x|\boldsymbol z, \boldsymbol \theta)$ is often implemented as a neural network with additive noise at the input. 161 | \item In practice, hierarchical models appear to be difficult to train. 162 | \end{itemize} 163 | \end{remark} 164 | 165 | \subsection{Additional Models to VAE} 166 | 167 | \begin{definition}{\textbf{(Importance-Weighted Free Energy)}} 168 | Consider another interpretation of the free energy: 169 | \begin{equation*} 170 | \mathcal{F}(q;\boldsymbol \theta) = \brackd{\log \frac{P(\boldsymbol x, \boldsymbol z)}{q(\boldsymbol z)}}_{q} 171 | \end{equation*} 172 | We apply Jensen's inequality to the importance-sampled estimate: 173 | \begin{equation*} 174 | l(\boldsymbol \theta) = \log \mathbb{E}_{z\sim q}\brackb{\frac{P(\boldsymbol x, \boldsymbol z)}{q(\boldsymbol z)}} \ge \mathbb{E}_{z\sim q}\brackb{\log \frac{P(\boldsymbol x, \boldsymbol z)}{q(\boldsymbol z)}} = \mathcal{F}(q;\boldsymbol \theta) 175 | \end{equation*} 176 | This suggests a more accurate importance-weighted bound: 177 | \begin{equation*} 178 | l(\boldsymbol \theta) = \log \mathbb{E}_{\boldsymbol z_1,\dots,\boldsymbol z_k \sim q} \brackb{\frac{1}{k}\sum_k \frac{P(\boldsymbol x, \boldsymbol z_k)}{q(\boldsymbol z_k)}} \ge \mathbb{E}_{\boldsymbol z_1,\dots,\boldsymbol z_k \sim q} \brackb{\log \frac{1}{k}\sum_k \frac{P(\boldsymbol x, \boldsymbol z_k)}{q(\boldsymbol z_k)}} 179 | \end{equation*} 180 | This gives a tighter bound and is reparameterisation-friendly, but as $K\rightarrow\infty$ the signal for learning the amortised $q$ grows weaker, making VAE learning slow. 181 | \end{definition}
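As a quick check of my own: for $K=1$ the importance-weighted bound reduces exactly to the standard free energy, and a standard property of these bounds (stated here without proof) is that they are non-decreasing in the number of samples:
\begin{equation*}
\mathcal{F}(q;\boldsymbol \theta) = \mathcal{L}_1 \le \mathcal{L}_2 \le \cdots \le \mathcal{L}_K \le \log P(\boldsymbol x | \boldsymbol \theta)
\end{equation*}
where $\mathcal{L}_K$ denotes the $K$-sample bound above, so extra samples can only tighten the bound.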
 182 | 183 | \begin{definition}{\textbf{(Normalizing Flow)}} 184 | We have the following free energy: 185 | \begin{equation*} 186 | \mathcal{F}(q, \boldsymbol \theta) = \brackd{\log P(\boldsymbol x, \boldsymbol z | \boldsymbol \theta)}_q - \brackd{\log q(\boldsymbol z)}_q 187 | \end{equation*} 188 | To evaluate $\mathcal{F}$, we need to be able to take expectations with respect to $q$ and evaluate its log-density, which usually restricts us to tractable inference families. We consider the following recognition model, which defines $q(\boldsymbol z)$ implicitly by: 189 | \begin{equation*} 190 | \boldsymbol z_0 \sim q_0(\cdot ; \boldsymbol x) \qquad \boldsymbol z = f_k(f_{k-1}(\cdots f_1(\boldsymbol z_0))) 191 | \end{equation*} 192 | where $q_0$ should be fixed and tractable. And so, we have the following evaluations: 193 | \begin{equation*} 194 | \brackd{F(\boldsymbol z)}_q = \brackd{F(f_k(f_{k-1}(\cdots f_1(\boldsymbol z_0))))}_{q_0} \qquad \log q(\boldsymbol z) = \log q_0(f^{-1}_1(\cdots f^{-1}_{k-1}(f^{-1}_k(\boldsymbol z)))) - \sum_k\log \abs{\nabla f_k} 195 | \end{equation*} 196 | where $\abs{\nabla f_k}$ is the determinant of the Jacobian, and we use the following transformation of variables: 197 | \begin{equation*} 198 | \boldsymbol z_k = f_k(\boldsymbol z_{k-1}) \quad \implies \quad q(\boldsymbol z_k) = q(f^{-1}_k(\boldsymbol z_k)) \abs{\frac{\partial \boldsymbol z_{k-1}}{\partial \boldsymbol z_k}} = q(f^{-1}_k(\boldsymbol z_k))\abs{\nabla f_k(\boldsymbol z_{k-1})}^{-1} 199 | \end{equation*} 200 | Given samples $\boldsymbol z_0^s \sim q_0(\cdot ; \boldsymbol x)$, we have: 201 | \begin{equation*} 202 | \mathcal{F}(q, \boldsymbol \theta) \approx \frac{1}{S}\sum_s \log p(\boldsymbol x, f_k(f_{k-1}(\cdots f_1(\boldsymbol z^s_0))) ) + H[q_0] + \frac{1}{S}\sum_s\sum_k \log\abs{\nabla f_k(f_{k-1}(\cdots f_1(\boldsymbol z^s_0)))} 203 | \end{equation*} 204 | \end{definition} 205 | 206 | \begin{remark}{\textbf{(Special $\boldsymbol f$ for Normalizing Flow)}} 207 | Two standard choices of $f$ (planar and radial flows) give tractable determinants: 208 | \begin{equation*} 209 | \begin{aligned} 210 | &f(\boldsymbol z) = \boldsymbol z + \boldsymbol uh(\boldsymbol w^T\boldsymbol z + b) \implies \abs{\nabla f} = \abs{1 + \boldsymbol u^T\boldsymbol \Psi(\boldsymbol z)} \quad \text{ where } \quad \Psi(\boldsymbol z)=h'(\boldsymbol w^T\boldsymbol z + b)\boldsymbol w \\ 211 | &f(\boldsymbol z) = \boldsymbol z + \beta h(\boldsymbol z - \boldsymbol z_0) \implies \abs{\nabla f} = [1 + \beta h]^{d-1}\brackb{1+\beta h + \beta h'r} \quad \text{ where }\quad r=\abs{\boldsymbol z-\boldsymbol z_0} \quad h = \frac{1}{\alpha + r} 212 | \end{aligned} 213 | \end{equation*} 214 | \end{remark}
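The planar-flow determinant follows from the standard identity $\det(I + \boldsymbol u\boldsymbol v^T) = 1 + \boldsymbol v^T\boldsymbol u$ (a step of my own, spelling out the computation): since $\nabla f = I + \boldsymbol u\,\Psi(\boldsymbol z)^T$, we get
\begin{equation*}
\abs{\det \nabla f} = \abs{1 + \Psi(\boldsymbol z)^T\boldsymbol u} = \abs{1 + \boldsymbol u^T \Psi(\boldsymbol z)}
\end{equation*}
so evaluating the determinant costs only an inner product, linear in the dimension, which is the point of these flows.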
 215 | 216 | \begin{definition}{\textbf{(DDC Helmholtz Machine)}} 217 | We define $q$ to be an unnormalised exponential family with a \emph{large} set of sufficient statistics: 218 | \begin{equation*} 219 | q(\boldsymbol z) \propto \exp\bracka{\sum_i \eta_i\psi_i(\boldsymbol z)} 220 | \end{equation*} 221 | and it is parameterised by the mean parameter $\boldsymbol \mu = \brackd{\boldsymbol \psi(\boldsymbol z)}$, which we call a distributed distributional code (DDC). We train the recognition model using sleep samples: 222 | \begin{equation*} 223 | \begin{aligned} 224 | &\boldsymbol \mu = \brackd{\boldsymbol \psi(\boldsymbol z)}_q = f(\boldsymbol x^* ; \boldsymbol \phi) \\ 225 | &\Delta \boldsymbol \phi \propto \sum_s (\boldsymbol \psi(\boldsymbol z_s) - f(\boldsymbol x_s ; \boldsymbol \phi)) \nabla_{\boldsymbol \phi}f(\boldsymbol x_s ; \boldsymbol \phi) 226 | \end{aligned} 227 | \end{equation*} 228 | Furthermore, we also learn a linear approximation $\nabla \log P(\boldsymbol x, \boldsymbol z | \boldsymbol \theta) \approx \boldsymbol A\psi(\boldsymbol z)$, where 229 | \begin{equation*} 230 | \boldsymbol A = \bracka{\sum_s \nabla \log P(\boldsymbol x_s, \boldsymbol z_s | \boldsymbol \theta)\psi(\boldsymbol z_s)}^T\bracka{\sum_s \psi(\boldsymbol z_s)\psi(\boldsymbol z_s)^T}^{-1} 231 | \end{equation*} 232 | Then we have $\brackd{\nabla \log P(\boldsymbol x, \boldsymbol z)}_q \approx \boldsymbol A\brackd{\boldsymbol \psi(\boldsymbol z)} = \boldsymbol A f(\boldsymbol x; \boldsymbol \phi)$, which can be generalised to infinite dimensions with kernels. 233 | \end{definition} 234 | 235 | \begin{definition}{\textbf{(Amortised Learning)}} 237 | Suppose we aren't interested in inference itself. We can then short-circuit general recognition and compute the expectations needed for learning directly, as we have: 238 | \begin{equation*} 239 | \nabla_{\boldsymbol \theta} l (\boldsymbol \theta) = \frac{\partial}{\partial \boldsymbol \theta} \mathcal{F}(q^*, \boldsymbol \theta) = \brackd{\nabla_{\boldsymbol \theta}\log P(\mathcal{X},\mathcal{Z}|\boldsymbol \theta)}_{q^*} 240 | \end{equation*} 241 | We can use the wake-sleep approach: 242 | \begin{itemize} 243 | \item Sample $\brackc{\boldsymbol x_s, \boldsymbol z_s} \sim P(\mathcal{X},\mathcal{Z}|\boldsymbol \theta^k)$ 244 | \item Train a regression $\hat{J}_{\boldsymbol \theta^k} : \boldsymbol x_s\mapsto \nabla_{\boldsymbol \theta} \log P(\boldsymbol x_s, \boldsymbol z_s)|_{\boldsymbol \theta^k}$ (learning the mapping) 245 | \item Set $\boldsymbol \theta^{k+1} = \boldsymbol \theta^k + \eta\sum_i \hat{J}_{\boldsymbol \theta^k}(\boldsymbol x_i)$ 246 | \end{itemize} 247 | This derived form works for (kernel and GP) regression, for which the regressor is linear in the targets. For a conditional exponential-family model: 248 | \begin{equation*} 249 | \brackd{\log P(\mathcal{X},\mathcal{Z} | \boldsymbol \theta)}_{q^*} = \brackd{\boldsymbol \eta(\boldsymbol z, \boldsymbol \theta)}_{q^*}^T\boldsymbol T(\boldsymbol x) - \brackd{\boldsymbol \Phi(\boldsymbol z, \boldsymbol \theta) - \log P(\boldsymbol z|\boldsymbol \theta)}_{q^*} 250 | \end{equation*} 251 | and the regressor can be trained as a function of $\boldsymbol z$ alone, with $\boldsymbol T(\boldsymbol x)$ evaluated on (wake-phase) data. 252 | \end{definition}
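To see why linearity in the targets matters, here is a sketch of my own using kernel ridge regression: the prediction at a new input is a fixed linear combination of the training targets,
\begin{equation*}
\hat{J}(\boldsymbol x) = \sum_s \alpha_s(\boldsymbol x)\, \nabla_{\boldsymbol \theta}\log P(\boldsymbol x_s, \boldsymbol z_s)|_{\boldsymbol \theta^k} \qquad \text{with} \qquad \boldsymbol \alpha(\boldsymbol x) = (\boldsymbol K + \lambda I)^{-1}\boldsymbol k(\boldsymbol x)
\end{equation*}
where $\boldsymbol K$ is the kernel matrix on $\brackc{\boldsymbol x_s}$ and $\boldsymbol k(\boldsymbol x)$ the vector of kernel evaluations. The weights $\boldsymbol \alpha$ depend only on the inputs, so the same weights serve every component of the gradient target.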
 253 | 254 | \begin{remark}{\textbf{(VAE Comments)}} 255 | Much of the VAE and related work assumes a common generative model: 256 | \begin{equation*} 257 | \boldsymbol z \sim \mathcal{N}(\boldsymbol 0, \boldsymbol I) \qquad \boldsymbol x \sim\mathcal{N}(\boldsymbol g(\boldsymbol z ; \boldsymbol \theta), \psi\boldsymbol I) 258 | \end{equation*} 259 | where $\boldsymbol g$ is a neural network. Let's consider the dimension of $\boldsymbol z$: 260 | \begin{itemize} 261 | \item Overcomplete: If $\operatorname{dim}(\boldsymbol z)$ is large enough, the optimal solution has $\psi\rightarrow0$ and $q(\boldsymbol z;\boldsymbol x) \rightarrow \delta(\boldsymbol z - \boldsymbol f(\boldsymbol x;\boldsymbol \phi))$. In effect, the generative model learns a flow to transform the model density to the target. 262 | \item Oversimplified: If $\operatorname{dim}(\boldsymbol z)$ is small, the model acts as non-linear PCA. 263 | \end{itemize} 264 | Interesting latent representations require a more structured generative model. 265 | \end{remark} 266 | 267 | \begin{definition}{\textbf{(Structured VAE)}} 268 | Consider a model where $P(\mathcal{Z}|\boldsymbol \theta)$ has tractable joint exponential-family potentials and the conditionally independent observations 269 | \begin{equation*} 270 | P(\mathcal{X}|\mathcal{Z},\Gamma) = \prod_iP(\boldsymbol x_i | \boldsymbol z_i, \boldsymbol \gamma_i) 271 | \end{equation*} 272 | are intractable; the $\boldsymbol \gamma_i$ might be the same for all $i$. Consider factored variational inference $q(\mathcal{Z}) = \prod_iq(\boldsymbol z_i)$ with no further constraints: 273 | \begin{equation*} 274 | \begin{aligned} 275 | \log q^*_i(\boldsymbol z_i) &= \brackd{\log P(\mathcal{Z},\mathcal{X})}_{q_{\neg i }} + \const \\ 276 | &= \brackd{\log P(\boldsymbol z_i | \mathcal{Z}_{\neg i}) + \log P(\boldsymbol x_i | \boldsymbol z_i)}_{q_{\neg i}} + \const \\ 277 | &= \brackd{\boldsymbol \eta_{\neg i}}^T_{q_{\neg i}}\boldsymbol \psi_i(\boldsymbol z_i) + \log P(\boldsymbol x_i | \boldsymbol z_i) 278 | \end{aligned} 279 | \end{equation*} 280 | Let's consider each variable (exploiting the exponential-family form of $P(\mathcal{Z})$): 281 | \begin{itemize} 282 | \item $\boldsymbol \psi_i$ are the effective sufficient statistics, including the log-normalisers of children in the DAG. 283 | \item $\boldsymbol \eta_{\neg i}$ is a function of $\mathcal{Z}_{\neg i}$ 284 | \end{itemize} 285 | We will choose the parametric form $q_i(\boldsymbol z_i) = \exp(\tilde{\boldsymbol \eta}_i^T\boldsymbol \psi_i(\boldsymbol z_i) - \boldsymbol \Phi_i(\tilde{\boldsymbol \eta}_i))$, and so the optimum will have: 286 | \begin{equation*} 287 | \log q^*_i(\boldsymbol z_i) = \brackd{\boldsymbol \eta_{\neg i}}^T_{q_{\neg i}}\boldsymbol \psi_i(\boldsymbol z_i) + \boldsymbol \rho(\boldsymbol x_i)^T\boldsymbol \psi_i(\boldsymbol z_i) 288 | \end{equation*} 289 | where $\boldsymbol \rho(\boldsymbol x_i) = \boldsymbol f_i(\boldsymbol x_i;\boldsymbol \phi)$ is a recognition function, which might be the same for all $i$. 290 | \end{definition} 291 | 292 | \begin{remark}{\textbf{(Training of Structured VAE)}} 293 | We consider the free energy: 294 | \begin{equation*} 295 | \begin{aligned} 296 | \mathcal{F}(\boldsymbol \theta, \boldsymbol \Gamma, \brackc{\boldsymbol \phi_i}) &= \brackd{\sum_i \log P(\boldsymbol x_i | \boldsymbol z_i, \boldsymbol \gamma_i) + \log P(\mathcal{Z}|\boldsymbol \theta)}_{q(\mathcal{Z} ; \boldsymbol \theta, \brackc{\boldsymbol \phi_i})} + \sum_i H[q_i] \\ 297 | &= \sum_i \underbrace{\brackd{\log P(\boldsymbol x_i | \boldsymbol z_i, \boldsymbol \gamma_i)}_{q_i(\boldsymbol z_i;\boldsymbol \theta, \boldsymbol \phi_i)} + H[q_i]}_{\mathcal{F}_i} + \brackd{\log P(\mathcal{Z}|\boldsymbol \theta)}_{q(\mathcal{Z}; \boldsymbol \theta, \brackc{\boldsymbol \phi_i})} 298 | \end{aligned} 299 | \end{equation*} 300 | Updates for $\boldsymbol \theta$ are just as in the tractable model.
To update each $\boldsymbol \phi_i$ and $\boldsymbol \gamma_i$: find $\brackd{\boldsymbol \eta_{\neg i}}_{q_{\neg i}}$ to give the prior-like term as in a VAE, generate reparameterised samples $\boldsymbol z^s_i \sim q_i$, and then: 301 | \begin{equation*} 302 | \begin{aligned} 303 | &\frac{\partial}{\partial\boldsymbol \gamma_i} \mathcal{F}_i = \sum_s \nabla_{\boldsymbol \gamma_i}\log P(\boldsymbol x_i, \boldsymbol z^s_i; \boldsymbol \gamma_i) \\ 304 | &\frac{\partial}{\partial \boldsymbol \phi_i}\mathcal{F}_i = \sum_{s}\frac{\partial}{\partial \boldsymbol z^s_i}\Big( \log P(\boldsymbol x_i, \boldsymbol z^s_i ; \boldsymbol \gamma_i) - \log q(\boldsymbol z^s_i ; f(\boldsymbol x_i)) \Big)\frac{d\boldsymbol z^s_i}{d\boldsymbol \phi} - \frac{\partial}{\partial f(\boldsymbol x_i)} \log q(\boldsymbol z^s_i ; f(\boldsymbol x_i))\frac{df(\boldsymbol x_i)}{d\boldsymbol \phi} 305 | \end{aligned} 306 | \end{equation*} 307 | This is like a standard VAE. 308 | \end{remark} 309 | 310 | -------------------------------------------------------------------------------- /approx-infer/img/img1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Phutoast/UCL-CSML-Notes/292a123e2dfd17d3ca1a0e268d199cae9632a5b9/approx-infer/img/img1.png -------------------------------------------------------------------------------- /approx-infer/img/img10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Phutoast/UCL-CSML-Notes/292a123e2dfd17d3ca1a0e268d199cae9632a5b9/approx-infer/img/img10.png -------------------------------------------------------------------------------- /approx-infer/img/img11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Phutoast/UCL-CSML-Notes/292a123e2dfd17d3ca1a0e268d199cae9632a5b9/approx-infer/img/img11.png -------------------------------------------------------------------------------- /approx-infer/img/img12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Phutoast/UCL-CSML-Notes/292a123e2dfd17d3ca1a0e268d199cae9632a5b9/approx-infer/img/img12.png -------------------------------------------------------------------------------- /approx-infer/img/img13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Phutoast/UCL-CSML-Notes/292a123e2dfd17d3ca1a0e268d199cae9632a5b9/approx-infer/img/img13.png -------------------------------------------------------------------------------- /approx-infer/img/img14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Phutoast/UCL-CSML-Notes/292a123e2dfd17d3ca1a0e268d199cae9632a5b9/approx-infer/img/img14.png -------------------------------------------------------------------------------- /approx-infer/img/img15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Phutoast/UCL-CSML-Notes/292a123e2dfd17d3ca1a0e268d199cae9632a5b9/approx-infer/img/img15.png -------------------------------------------------------------------------------- /approx-infer/img/img16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Phutoast/UCL-CSML-Notes/292a123e2dfd17d3ca1a0e268d199cae9632a5b9/approx-infer/img/img16.png
-------------------------------------------------------------------------------- /approx-infer/img/img2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Phutoast/UCL-CSML-Notes/292a123e2dfd17d3ca1a0e268d199cae9632a5b9/approx-infer/img/img2.png -------------------------------------------------------------------------------- /approx-infer/img/img3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Phutoast/UCL-CSML-Notes/292a123e2dfd17d3ca1a0e268d199cae9632a5b9/approx-infer/img/img3.png -------------------------------------------------------------------------------- /approx-infer/img/img4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Phutoast/UCL-CSML-Notes/292a123e2dfd17d3ca1a0e268d199cae9632a5b9/approx-infer/img/img4.png -------------------------------------------------------------------------------- /approx-infer/img/img5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Phutoast/UCL-CSML-Notes/292a123e2dfd17d3ca1a0e268d199cae9632a5b9/approx-infer/img/img5.png -------------------------------------------------------------------------------- /approx-infer/img/img6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Phutoast/UCL-CSML-Notes/292a123e2dfd17d3ca1a0e268d199cae9632a5b9/approx-infer/img/img6.png -------------------------------------------------------------------------------- /approx-infer/img/img7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Phutoast/UCL-CSML-Notes/292a123e2dfd17d3ca1a0e268d199cae9632a5b9/approx-infer/img/img7.png -------------------------------------------------------------------------------- /approx-infer/img/img8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Phutoast/UCL-CSML-Notes/292a123e2dfd17d3ca1a0e268d199cae9632a5b9/approx-infer/img/img8.png -------------------------------------------------------------------------------- /approx-infer/img/img9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Phutoast/UCL-CSML-Notes/292a123e2dfd17d3ca1a0e268d199cae9632a5b9/approx-infer/img/img9.png -------------------------------------------------------------------------------- /prob-unsup/chapter/part2.tex: -------------------------------------------------------------------------------- 1 | \section{Latent Variable Model} 2 | 3 | \begin{definition}{\textbf{(Latent Variable Model)}} 4 | The latent variable model can be seen as: 5 | \begin{equation*} 6 | \begin{aligned} 7 | \boldsymbol z &\sim p(\boldsymbol \theta_z) \\ 8 | \boldsymbol x | \boldsymbol z &\sim p(\boldsymbol \theta_x) \\ 9 | p(\boldsymbol x, \boldsymbol z;\boldsymbol \theta_x, \boldsymbol \theta_z) &= p(\boldsymbol x|\boldsymbol z ; \boldsymbol \theta_x)p(\boldsymbol z ; \boldsymbol \theta_z) \\ 10 | p(\boldsymbol x ; \boldsymbol \theta_x, \boldsymbol \theta_z) &= \int p(\boldsymbol x |\boldsymbol z; \boldsymbol \theta_x) p(\boldsymbol z ; \boldsymbol \theta_z) \dby \boldsymbol z 11 | \end{aligned} 12 | \end{equation*} 13 | Note that $p(\boldsymbol z), p(\boldsymbol x|\boldsymbol z)$ and $p(\boldsymbol x, \boldsymbol z)$ are exponential family but $p(\boldsymbol x)$ doesn't have to be an exponential family. 14 | \end{definition} 15 |
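A small numpy illustration (toy parameters, not from the lecture) of ancestral sampling from such a model, and of why the marginal can leave the exponential family: with a binary latent $z$ and a Gaussian $p(x|z)$, both factors are exponential family, yet the marginal $p(x)$ is a two-component mixture, which is not:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(1)

# Ancestral sampling: z ~ p(z; theta_z), then x | z ~ p(x | z; theta_x).
theta_z = 0.3                       # P(z = 1)
means, stds = [-2.0, 2.0], [1.0, 0.5]

z = rng.random(10_000) < theta_z
x = np.where(z,
             rng.normal(means[1], stds[1], z.shape),
             rng.normal(means[0], stds[0], z.shape))
print(x.mean(), x.std())            # samples from the bimodal marginal p(x)
\end{verbatim}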
16 | \subsection{PCA Formulation} 17 | 18 | \begin{remark} 19 | We will consider the family of PCA formulations. We will start with the PCA definition, which can be formulated in $2$ ways: Maximal Variance and Average Projection Cost (minimum error). Then, we will consider PPCA, a probabilistic version of PCA. 20 | \end{remark} 21 | 22 | \begin{definition}{\textbf{(Maximal Variance)}} 23 | Consider the dataset $\brackc{\boldsymbol x_i}^N_{i=1}$ where $\boldsymbol x_i \in \mathbb{R}^D$. We want to project the data onto a space with dimension $M < D$. To do this, we want to find an orthonormal subspace basis $\boldsymbol u_i$ for $i=1,\cdots,M$, such that $\boldsymbol u_i^T\boldsymbol u_j = \delta_{ij}$, so that the empirical projected variance, given as: 24 | \begin{equation*} 25 | \frac{1}{N}\sum^N_{j=1}(\boldsymbol u_i^T\boldsymbol x_j - \boldsymbol u_i^T\bar{\boldsymbol x})^2 26 | \end{equation*} 27 | is maximized for $i=1,\dots,M$, where $\boldsymbol u_1$ gives the highest variance and $\bar{\boldsymbol x} = 1/N\sum^N_{i=1}\boldsymbol x_i$. 28 | \end{definition} 29 | 30 | \begin{proposition} 31 | The maximum projected variance directions $\boldsymbol u_1,\dots,\boldsymbol u_M$ are the $M$ eigenvectors of the data-covariance matrix: 32 | \begin{equation*} 33 | \boldsymbol S = \frac{1}{N}\sum^N_{i=1}(\boldsymbol x_i - \bar{\boldsymbol x})(\boldsymbol x_i-\bar{\boldsymbol x})^T 34 | \end{equation*} 35 | associated with the largest eigenvalues $\lambda_1\ge\lambda_2\ge\cdots\ge\lambda_M\ge\cdots\ge\lambda_D$. 36 | \end{proposition} 37 | \begin{proof} 38 | Let's start with the first direction $\boldsymbol u_1$, as we can show that the empirical projected variance is: 39 | \begin{equation*} 40 | \frac{1}{N}\sum^N_{j=1}(\boldsymbol u_1^T\boldsymbol x_j - \boldsymbol u_1^T\bar{\boldsymbol x})^2 = \boldsymbol u_1^T\boldsymbol S\boldsymbol u_1 41 | \end{equation*} 42 | We will consider the corresponding constrained optimization problem, introducing a Lagrange multiplier and setting the derivative to $0$: 43 | \begin{equation*} 44 | \begin{aligned} 45 | \frac{\partial}{\partial \boldsymbol u_1} \Big[\boldsymbol u_1^T\boldsymbol S\boldsymbol u_1 + \lambda_1(1 - \boldsymbol u_1^T\boldsymbol u_1)\Big] &= 2\boldsymbol S\boldsymbol u_1 - 2\lambda_1\boldsymbol u_1 = 0 46 | \end{aligned} 47 | \end{equation*} 48 | Please note that the matrix $\boldsymbol S$ is symmetric. This leads us to the following equation: 49 | \begin{equation*} 50 | \boldsymbol S \boldsymbol u_1 = \lambda_1\boldsymbol u_1 51 | \end{equation*} 52 | Furthermore, if we multiply by $\boldsymbol u_1^T$ on the left, together with the constraint $\boldsymbol u_1^T\boldsymbol u_1=1$, then we have $\boldsymbol u_1^T\boldsymbol S\boldsymbol u_1 = \lambda_1$. This means that the first maximal variance projection direction is the eigenvector $\boldsymbol u_1$ with the highest associated eigenvalue. The remaining directions follow by the same argument, and orthogonality comes from the properties of eigenvectors of a symmetric matrix. 53 | \end{proof} 54 |
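A minimal numpy sketch of this proposition (synthetic data, not from the lecture): the top-$M$ eigenvectors of $\boldsymbol S$ give the projection directions, and the projected variances equal the corresponding eigenvalues:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(2)
X = rng.standard_normal((500, 4)) @ rng.standard_normal((4, 4))  # N x D data

# Empirical covariance S and its eigendecomposition.
Xc = X - X.mean(axis=0)
S = Xc.T @ Xc / len(X)
lam, U = np.linalg.eigh(S)               # ascending eigenvalues
lam, U = lam[::-1], U[:, ::-1]           # sort descending

M = 2
proj = Xc @ U[:, :M]                     # projections onto the top-M directions
print(proj.var(axis=0))                  # matches lam[:2]: projected variances
print(U[:, 0] @ S @ U[:, 0], lam[0])     # u_1^T S u_1 = lambda_1
\end{verbatim}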
55 | \begin{definition}{\textbf{(Minimum-Error Formulation)}} 56 | We get a formulation of PCA where we have an orthonormal set of $D$-dimensional basis vectors $\brackc{\boldsymbol u_i}$ where $i=1,\dots,D$ and $\boldsymbol u_i^T\boldsymbol u_j = \delta_{ij}$. This means that a data point can be represented in the basis: 57 | \begin{equation*} 58 | \boldsymbol x_i = \sum^D_{j=1}\alpha_{ij}\boldsymbol u_j = \sum^D_{j=1}(\boldsymbol x_i^T\boldsymbol u_j) \boldsymbol u_j 59 | \end{equation*} 60 | The value of $\alpha_{ij}$ can be found by the inner product $\alpha_{ij} = \boldsymbol x_i^T\boldsymbol u_j$ as above (by the orthogonality properties). Now, we will consider the approximation using the projection onto a linear subspace of dimension $M < D$: 61 | \begin{equation*} 62 | \tilde{\boldsymbol x}_i = \sum^M_{j=1}z_{ij}\boldsymbol u_j + \sum^D_{j=M+1}b_j\boldsymbol u_j 63 | \end{equation*} 64 | where the $\brackc{b_j}$ are components that are the same for all data points. Now, we are free to choose $\brackc{b_j}, \brackc{z_{ij}}$ and $\brackc{\boldsymbol u_j}$ to minimize the following objective: 65 | \begin{equation*} 66 | \frac{1}{N}\sum^N_{i=1}\norm{\boldsymbol x_i - \tilde{\boldsymbol x}_i}^2 67 | \end{equation*} 68 | \end{definition} 69 | 70 | \begin{proposition} 71 | The solution to the minimum-error formulation is the same as the maximal variance formulation. This gives us a different interpretation of PCA. 72 | \end{proposition} 73 | \begin{proof} 74 | Let's start with finding the values $\brackc{z_{ij}}$ first, as we want to set the derivative with respect to $z_{ij}$ to zero: 75 | \allowdisplaybreaks 76 | \begin{align*} 77 | \frac{1}{N}\sum^N_{i=1}&\norm{\boldsymbol x_i - \sum^M_{j=1}z_{ij}\boldsymbol u_j - \sum^D_{j=M+1}b_j\boldsymbol u_j}^2 \\ 78 | &= \frac{1}{N}\sum^N_{i=1}\brackb{\bracka{\boldsymbol x_i - \sum^M_{j=1}z_{ij}\boldsymbol u_j - \sum^D_{j=M+1}b_j\boldsymbol u_j}^T\bracka{\boldsymbol x_i - \sum^M_{j=1}z_{ij}\boldsymbol u_j - \sum^D_{j=M+1}b_j\boldsymbol u_j}} \\ 79 | &= \begin{aligned}[t] 80 | \frac{1}{N}\sum^N_{i=1}\Bigg[ 81 | &\boldsymbol x_i^T\boldsymbol x_i - \sum^M_{j=1}z_{ij}\boldsymbol u_j^T\boldsymbol x_i - \sum^D_{j=M+1}b_j\boldsymbol u_j^T\boldsymbol x_i -\sum^M_{j=1}z_{ij}\boldsymbol x_i^T\boldsymbol u_j \\ 82 | & + \bracka{\sum^M_{j=1}z_{ij}\boldsymbol u_j^T}\bracka{\sum^M_{j=1}z_{ij}\boldsymbol u_j} + \bracka{\sum^D_{j=M+1}b_j\boldsymbol u_j^T}\bracka{\sum^M_{j=1}z_{ij}\boldsymbol u_j} \\ 83 | &-\sum^D_{j=M+1}b_j\boldsymbol x_i^T\boldsymbol u_j + \bracka{\sum^M_{j=1}z_{ij}\boldsymbol u_j^T}\bracka{\sum^D_{j=M+1}b_j\boldsymbol u_j} + \bracka{\sum^D_{j=M+1}b_j\boldsymbol u_j^T}\bracka{\sum^D_{j=M+1}b_j\boldsymbol u_j}\Bigg] \\ 84 | \end{aligned} \\ 85 | &= \begin{aligned}[t] 86 | \frac{1}{N}\sum^N_{i=1}\Bigg[ 87 | &\boldsymbol x_i^T\boldsymbol x_i - \sum^M_{j=1}z_{ij}\boldsymbol u_j^T\boldsymbol x_i - \sum^D_{j=M+1}b_j\boldsymbol u_j^T\boldsymbol x_i -\sum^M_{j=1}z_{ij}\boldsymbol x_i^T\boldsymbol u_j-\sum^D_{j=M+1}b_j\boldsymbol x_i^T\boldsymbol u_j \\ 88 | & + \bracka{\sum^M_{a=1}z_{ia}\boldsymbol u_a^T}\bracka{\sum^M_{b=1}z_{ib}\boldsymbol u_b} + \bracka{\sum^D_{a=M+1}b_a\boldsymbol u_a^T}\bracka{\sum^M_{b=1}z_{ib}\boldsymbol u_b} \\ 89 | &+ \bracka{\sum^M_{a=1}z_{ia}\boldsymbol u_a^T}\bracka{\sum^D_{b=M+1}b_b\boldsymbol u_b} + \bracka{\sum^D_{a=M+1}b_a\boldsymbol u_a^T}\bracka{\sum^D_{b=M+1}b_b\boldsymbol u_b}\Bigg] \\ 90 | \end{aligned} \\ 91 | &= \begin{aligned}[t] 92 | \frac{1}{N}\sum^N_{i=1}\Bigg[ 93 | &\boldsymbol x_i^T\boldsymbol x_i - 2\sum^M_{j=1}z_{ij}\boldsymbol u_j^T\boldsymbol x_i - 2\sum^D_{j=M+1}b_j\boldsymbol u_j^T\boldsymbol x_i + \sum^M_{a=1}\sum^M_{b=1}z_{ia} z_{ib}\boldsymbol u_a^T\boldsymbol u_b \\ 94 | & + 2\sum^D_{a=M+1}\sum^M_{b=1}b_az_{ib}\boldsymbol u_a^T\boldsymbol u_b
+\sum^D_{a=M+1}\sum^D_{b=M+1}b_ab_b\boldsymbol u_a^T\boldsymbol u_b\Bigg] \\ 95 | \end{aligned} \\ 96 | &= \begin{aligned}[t] 97 | \frac{1}{N}\sum^N_{i=1}\Bigg[ 98 | &\boldsymbol x_i^T\boldsymbol x_i - 2\sum^M_{j=1}z_{ij}\boldsymbol u_j^T\boldsymbol x_i - 2\sum^D_{j=M+1}b_j\boldsymbol u_j^T\boldsymbol x_i + \sum^M_{j=1}z_{ij}^2 +\sum^D_{j=M+1}b_j^2 + \cancel{2\sum^D_{a=M+1}\sum^M_{b=1}b_az_{ib}\boldsymbol u_a^T\boldsymbol u_b}\Bigg] \\ 99 | \end{aligned} \\ 100 | &= \begin{aligned}[t] 101 | \frac{1}{N}\sum^N_{i=1}\Bigg[\boldsymbol x_i^T\boldsymbol x_i - 2\sum^M_{j=1}z_{ij}\boldsymbol u_j^T\boldsymbol x_i - 2\sum^D_{j=M+1}b_j\boldsymbol u_j^T\boldsymbol x_i + \sum^M_{j=1}z_{ij}^2 +\sum^D_{j=M+1}b_j^2\Bigg] 102 | \end{aligned} \\ 103 | \end{align*} 104 | Now, let's consider the derivative with respect to $z_{ab}$, as we now have: 105 | \begin{equation*} 106 | \begin{aligned} 107 | \frac{\partial}{\partial z_{ab}} \ &\frac{1}{N}\sum^N_{i=1} \Bigg[\boldsymbol x_i^T\boldsymbol x_i - 2\sum^M_{j=1}z_{ij}\boldsymbol u_j^T\boldsymbol x_i - 2\sum^D_{j=M+1}b_j\boldsymbol u_j^T\boldsymbol x_i + \sum^M_{j=1}z_{ij}^2 +\sum^D_{j=M+1}b_j^2\Bigg] \\ 108 | &= -\frac{\partial}{\partial z_{ab}} \ \frac{2}{N}\sum^N_{i=1}\sum^M_{j=1}z_{ij}\boldsymbol u_j^T\boldsymbol x_i + \frac{\partial}{\partial z_{ab}} \ \frac{1}{N}\sum^N_{i=1}\sum^M_{j=1}z_{ij}^2 \\ 109 | &= -\frac{2}{N}\boldsymbol u_b^T\boldsymbol x_a + \frac{2}{N}z_{ab} = 0 110 | \end{aligned} 111 | \end{equation*} 112 | And so, we have $z_{ij} = \boldsymbol x_i^T\boldsymbol u_j$ for $j=1,\dots,M$. Now, we consider the derivative with respect to $b_a$, as we have: 113 | \begin{equation*} 114 | \begin{aligned} 115 | \frac{\partial}{\partial b_a} \ &\frac{1}{N}\sum^N_{i=1} \Bigg[\boldsymbol x_i^T\boldsymbol x_i - 2\sum^M_{j=1}z_{ij}\boldsymbol u_j^T\boldsymbol x_i - 2\sum^D_{j=M+1}b_j\boldsymbol u_j^T\boldsymbol x_i + \sum^M_{j=1}z_{ij}^2 +\sum^D_{j=M+1}b_j^2\Bigg] \\ 116 | &= -\frac{\partial}{\partial b_a} \ \frac{2}{N}\sum^N_{i=1}\sum^D_{j=M+1}b_j\boldsymbol u_j^T\boldsymbol x_i + \frac{\partial}{\partial b_a} \ \frac{1}{N}\sum^N_{i=1}\sum^D_{j=M+1}b_{j}^2 \\ 117 | &= -\frac{\partial}{\partial b_a} \ 2\sum^D_{j=M+1}b_j\boldsymbol u_j^T\bracka{\frac{1}{N}\sum^N_{i=1}\boldsymbol x_i} + \frac{\partial}{\partial b_a} \ \sum^D_{j=M+1}b_{j}^2 \\ 119 | &= -2\boldsymbol u^T_a\bar{\boldsymbol x} + 2b_a = 0 120 | \end{aligned} 121 | \end{equation*} 122 | And so, we have $b_j = \bar{\boldsymbol x}^T\boldsymbol u_j$ for $j=M+1,\dots, D$.
To find the $\boldsymbol u_i$, we have the following: 123 | \begin{equation*} 124 | \begin{aligned} 125 | \boldsymbol x_i - \tilde{\boldsymbol x}_i &= \boldsymbol x_i - \sum^M_{j=1}z_{ij}\boldsymbol u_j - \sum^D_{j=M+1}b_j\boldsymbol u_j \\ 126 | &= \sum^M_{j=1}(\boldsymbol x_i^T\boldsymbol u_j)\boldsymbol u_j + \sum^D_{j=M+1}(\boldsymbol x_i^T\boldsymbol u_j)\boldsymbol u_j - \sum^M_{j=1}(\boldsymbol x_i^T\boldsymbol u_j)\boldsymbol u_j - \sum^D_{j=M+1}(\bar{\boldsymbol x}^T\boldsymbol u_j)\boldsymbol u_j \\ 127 | &= \sum^D_{j=M+1}(\boldsymbol x_i^T\boldsymbol u_j)\boldsymbol u_j - \sum^D_{j=M+1}(\bar{\boldsymbol x}^T\boldsymbol u_j)\boldsymbol u_j \\ 128 | &= \sum^D_{j=M+1}\brackc{(\boldsymbol x_i - \bar{\boldsymbol x})^T\boldsymbol u_j}\boldsymbol u_j \\ 129 | \end{aligned} 130 | \end{equation*} 131 | And so we now have the following objective: 132 | \allowdisplaybreaks 133 | \begin{align*} 134 | \frac{1}{N}\sum^N_{i=1}&\brackb{\bracka{\sum^D_{j=M+1}\brackc{(\boldsymbol x_i - \bar{\boldsymbol x})^T\boldsymbol u_j}\boldsymbol u_j}^T\bracka{\sum^D_{j=M+1}\brackc{(\boldsymbol x_i - \bar{\boldsymbol x})^T\boldsymbol u_j}\boldsymbol u_j}} \\ 135 | &= \frac{1}{N}\sum^N_{i=1}\brackb{\bracka{\sum^D_{a=M+1}\boldsymbol u_a^T\brackc{\boldsymbol u_a^T(\boldsymbol x_i - \bar{\boldsymbol x}) } }\bracka{\sum^D_{b=M+1}\brackc{(\boldsymbol x_i - \bar{\boldsymbol x})^T\boldsymbol u_b}\boldsymbol u_b}}\\ 136 | &= \frac{1}{N}\sum^N_{i=1}\brackb{\bracka{\sum^D_{a=M+1}\sum^D_{b=M+1}\brackc{\boldsymbol u_a^T(\boldsymbol x_i - \bar{\boldsymbol x}) (\boldsymbol x_i - \bar{\boldsymbol x})^T\boldsymbol u_b} \ \boldsymbol u_a^T \boldsymbol u_b}}\\ 137 | &= \frac{1}{N}\sum^N_{i=1}\sum^D_{a=M+1}\boldsymbol u_a^T(\boldsymbol x_i - \bar{\boldsymbol x}) (\boldsymbol x_i - \bar{\boldsymbol x})^T\boldsymbol u_a = \sum^D_{a=M+1}\boldsymbol u_a^T \boldsymbol S \boldsymbol u_a 138 | \end{align*} 139 | Minimizing this is equivalent to the maximal variance formulation: the total variance $\sum^D_{a=1}\boldsymbol u_a^T\boldsymbol S\boldsymbol u_a$ is fixed, so minimizing the discarded part maximizes the retained part. And the proposition is proven. 140 | \end{proof} 141 | 142 | \subsection{Probabilistic PCA} 143 | 144 | \begin{definition}{\textbf{(PPCA)}} 145 | We consider the following system of probabilities: 146 | \begin{equation*} 147 | p(\boldsymbol z) = \mathcal{N}(\boldsymbol z | \boldsymbol 0,\boldsymbol I) \qquad p(\boldsymbol x|\boldsymbol z) = \mathcal{N}(\boldsymbol x | \boldsymbol W\boldsymbol z + \boldsymbol \mu, \boldsymbol \Psi) 148 | \end{equation*} 149 | where we can think of $\boldsymbol z$ as the PCA projection, while $p(\boldsymbol x|\boldsymbol z)$ is the reconstruction; this gives a probabilistic counterpart of PCA. 150 | \end{definition} 151 | 152 | \begin{proposition} 153 | Marginalizing out the latent Gaussian gives: 154 | \begin{equation*} 155 | p(\boldsymbol x) = \mathcal{N}(\boldsymbol x | \boldsymbol \mu, \boldsymbol C) \qquad \text{ where } \qquad \boldsymbol C = \boldsymbol \Psi + \boldsymbol W\boldsymbol W^T 156 | \end{equation*} 157 | \end{proposition} 158 | \begin{proof} 159 | This is a linear Gaussian model, so we use the Gaussian marginalization result (see above) to get the value of $p(\boldsymbol x)$. 160 | \end{proof} 161 | 162 | \begin{proposition} 163 | For inference of the latent, we have: 164 | \begin{equation*} 165 | p(\boldsymbol z | \boldsymbol x) = \mathcal{N}\Big(\boldsymbol \Sigma^{-1}\boldsymbol W^T\boldsymbol \Psi^{-1}(\boldsymbol x - \boldsymbol \mu), \boldsymbol \Sigma^{-1} \Big) 166 | \end{equation*} 167 | where we have $\boldsymbol \Sigma = \boldsymbol I + \boldsymbol W^T\boldsymbol \Psi^{-1}\boldsymbol W$. 168 | \end{proposition} 169 |
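A quick numerical sanity check of this posterior (toy dimensions, not from the lecture), using the push-through identity $\boldsymbol \Sigma^{-1}\boldsymbol W^T\boldsymbol \Psi^{-1} = \boldsymbol W^T\boldsymbol C^{-1}$:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(3)
D, M = 5, 2
W = rng.standard_normal((D, M))
Psi = 0.25 * np.eye(D)
mu = np.zeros(D)
x = rng.standard_normal(D)

# Posterior p(z | x) = N(Sigma^{-1} W^T Psi^{-1} (x - mu), Sigma^{-1}).
Sigma = np.eye(M) + W.T @ np.linalg.inv(Psi) @ W
post_mean = np.linalg.solve(Sigma, W.T @ np.linalg.inv(Psi) @ (x - mu))
post_cov = np.linalg.inv(Sigma)

# Same mean via the marginal covariance C = Psi + W W^T.
C = Psi + W @ W.T
print(np.allclose(post_mean, W.T @ np.linalg.solve(C, x - mu)))  # True
\end{verbatim}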
170 | \begin{remark} 171 | We have the projection to be: 172 | \begin{equation*} 173 | \hat{\boldsymbol x}_i = \boldsymbol W\boldsymbol \Sigma^{-1}\boldsymbol W^T\boldsymbol \Psi^{-1}(\boldsymbol x_i - \boldsymbol \mu) 174 | \end{equation*} 175 | This is the PCA projection that also takes noise into consideration. Furthermore, if $\boldsymbol \Psi = \psi^2 \boldsymbol I$ and $\psi \rightarrow0$, then it reduces to the PCA estimate (given the correct $\boldsymbol W$, which we will explore later). 176 | \end{remark} 177 | 178 | \begin{remark}{\textbf{(Likelihood of PPCA)}} 179 | Now, we are left to find the actual value of $\boldsymbol W$, where we assume that $\boldsymbol \mu$ is known (and usually $\boldsymbol 0$), while the noise covariance is assumed to be $\boldsymbol \Psi = \psi^2 \boldsymbol I$. The log-likelihood of this PPCA is (using the marginal): 180 | \begin{equation*} 181 | \begin{aligned} 182 | l = \log p(\brackc{\boldsymbol x_i}^N_{i=1} | \boldsymbol \mu, \boldsymbol C) &= \log \prod^N_{i=1} \frac{1}{\sqrt{|2\pi\boldsymbol C|}}\exp\brackc{-\frac{1}{2}(\boldsymbol x_i - \boldsymbol \mu)^T\boldsymbol C^{-1}(\boldsymbol x_i - \boldsymbol \mu)} \\ 183 | &= -\frac{N}{2}\log\abs{\boldsymbol C} -\frac{1}{2}\sum^N_{i=1}(\boldsymbol x_i - \boldsymbol \mu)^T\boldsymbol C^{-1}(\boldsymbol x_i - \boldsymbol \mu) + \const \\ 184 | &= -\frac{N}{2}\log\abs{\boldsymbol C} -\frac{N}{2}\operatorname{Tr}\bracka{\frac{1}{N}\sum^N_{i=1}(\boldsymbol x_i - \boldsymbol \mu)^T\boldsymbol C^{-1}(\boldsymbol x_i - \boldsymbol \mu)} + \const \\ 185 | &= -\frac{N}{2}\log\abs{\boldsymbol C} -\frac{N}{2}\operatorname{Tr}\bracka{\boldsymbol C^{-1}\frac{1}{N}\sum^N_{i=1}(\boldsymbol x_i - \boldsymbol \mu)(\boldsymbol x_i - \boldsymbol \mu)^T} + \const \\ 186 | &= -\frac{N}{2}\log\abs{\boldsymbol C} -\frac{N}{2}\operatorname{Tr}\bracka{\boldsymbol C^{-1}\boldsymbol S} + \const \\ 187 | \end{aligned} 188 | \end{equation*} 189 | where $\boldsymbol C = \boldsymbol W\boldsymbol W^T + \psi^2\boldsymbol I$ as given above. 190 | \end{remark} 191 | 192 | \begin{proposition} 193 | The non-trivial maximum likelihood estimate of $\boldsymbol W$ is equal to: 194 | \begin{equation*} 195 | \boldsymbol W_\text{ML} = \boldsymbol U (\boldsymbol \Lambda - \psi^2\boldsymbol I)^{1/2}\boldsymbol V^T 196 | \end{equation*} 197 | where $\boldsymbol U \in \mathbb{R}^{D\times M}$ contains the first $M$ eigenvectors of $\boldsymbol S$, the empirical covariance matrix, and $\boldsymbol \Lambda \in \mathbb{R}^{M\times M}$ is the diagonal matrix of the corresponding eigenvalues. Finally, $\boldsymbol V \in \mathbb{R}^{M\times M}$ is an arbitrary orthogonal matrix. 198 | \end{proposition} 199 | \begin{proof} 200 | Let's consider the derivative of the log-likelihood with respect to $\boldsymbol W$, as we now have: 201 | \begin{equation*} 202 | \begin{aligned} 203 | \frac{\partial l}{\partial \boldsymbol W} &= \frac{\partial}{\partial \boldsymbol W}\brackb{-\frac{N}{2}\log\abs{\boldsymbol C} -\frac{N}{2}\operatorname{Tr}\bracka{\boldsymbol C^{-1}\boldsymbol S} } \\ 204 | &= N\brackb{-\boldsymbol C^{-1}\boldsymbol W + \boldsymbol C^{-1}\boldsymbol S \boldsymbol C^{-1}\boldsymbol W } 205 | \end{aligned} 206 | \end{equation*} 207 | Setting this to $\boldsymbol 0$, we can see that we have the following equation: $\boldsymbol S\boldsymbol C^{-1}\boldsymbol W=\boldsymbol W$. There are $2$ kinds of solutions: $\boldsymbol W = \boldsymbol 0$, which can be shown to be a minimum.
Or, consider $\boldsymbol W$ in its SVD form, i.e. $\boldsymbol W = \boldsymbol U\boldsymbol L\boldsymbol V^T$ for matrices $\boldsymbol U$ and $\boldsymbol V$ with orthonormal columns, while $\boldsymbol L$ is a diagonal matrix. This would entail: 208 | \begin{equation*} 209 | \begin{aligned} 210 | &\boldsymbol S\Big( \boldsymbol U \boldsymbol L\boldsymbol V^T \boldsymbol V \boldsymbol L^T\boldsymbol U^T + \psi^2\boldsymbol I \Big)^{-1}\boldsymbol U \boldsymbol L\boldsymbol V^T = \boldsymbol U \boldsymbol L\boldsymbol V^T \\ 211 | \implies & \boldsymbol S\Big( \boldsymbol U \boldsymbol L^2\boldsymbol U^T + \psi^2\boldsymbol I \Big)^{-1}\boldsymbol U = \boldsymbol U \\ 212 | \implies & \boldsymbol S\boldsymbol U \Big( \boldsymbol L^2 + \psi^2\boldsymbol I \Big)^{-1}=\boldsymbol U \\ 213 | \implies & \boldsymbol S\boldsymbol U =\boldsymbol U \Big( \boldsymbol L^2 + \psi^2\boldsymbol I \Big) 214 | \end{aligned} 215 | \end{equation*} 216 | For the second implication, we have: 217 | \begin{equation*} 218 | \boldsymbol U\Big(\boldsymbol L^2 + \psi^2\boldsymbol I\Big) = \Big(\boldsymbol U\boldsymbol L^2\boldsymbol U^T+ \psi^2\boldsymbol I\Big)\boldsymbol U \implies \Big( \boldsymbol U\boldsymbol L^2\boldsymbol U^T + \psi^2\boldsymbol I \Big)^{-1}\boldsymbol U = \boldsymbol U\Big(\boldsymbol L^2 + \psi^2\boldsymbol I\Big)^{-1} 219 | \end{equation*} 220 | We can see that the columns of $\boldsymbol U$ are eigenvectors of $\boldsymbol S$, with corresponding eigenvalues $\lambda_i = l_i^2 + \psi^2$, and so we can rewrite the weights as: 221 | \begin{equation*} 222 | \boldsymbol W = \boldsymbol U (\boldsymbol \Lambda - \psi^2\boldsymbol I)^{1/2}\boldsymbol V^T 223 | \end{equation*} 224 | where $\boldsymbol \Lambda$ is the diagonal matrix of the eigenvalues of $\boldsymbol S$. 225 | \end{proof} 226 |
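A quick numerical check of the stationarity condition $\boldsymbol S\boldsymbol C^{-1}\boldsymbol W = \boldsymbol W$ at $\boldsymbol W_\text{ML}$ (synthetic $\boldsymbol S$, taking $\boldsymbol V = \boldsymbol I$; not from the lecture):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(4)
D, M, psi2 = 6, 2, 0.1

# Synthetic covariance S with eigenvalues well above psi^2.
A = rng.standard_normal((D, D))
S = A @ A.T / D + np.eye(D)
lam, U = np.linalg.eigh(S)
lam, U = lam[::-1], U[:, ::-1]

# W_ML = U_M (Lambda_M - psi^2 I)^{1/2}, taking V = I.
W = U[:, :M] @ np.diag(np.sqrt(lam[:M] - psi2))
C = W @ W.T + psi2 * np.eye(D)

# Stationarity condition from the proof: S C^{-1} W = W.
print(np.allclose(S @ np.linalg.solve(C, W), W))  # True
\end{verbatim}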
227 | \subsection{Other Related Models} 228 | 229 | \begin{definition}{\textbf{(Factor Analysis)}} 230 | Factor analysis is like PPCA: 231 | \begin{equation*} 232 | p(\boldsymbol z) = \mathcal{N}(\boldsymbol z | \boldsymbol 0,\boldsymbol I) \qquad p(\boldsymbol x|\boldsymbol z) = \mathcal{N}(\boldsymbol x | \boldsymbol W\boldsymbol z + \boldsymbol \mu, \boldsymbol \Psi) 233 | \end{equation*} 234 | but we consider the matrix $\boldsymbol \Psi$ to be a general $D\times D$ diagonal matrix. This means that the inference results still hold. However, the training is much harder now, as we may not find $\boldsymbol W$ from the data in closed form. 235 | \end{definition} 236 | 237 | \begin{definition}{\textbf{(Canonical Correlation Analysis)}} 238 | The data vectors are $\mathcal{D}=\brackc{(\boldsymbol u_1, \boldsymbol v_1), (\boldsymbol u_2,\boldsymbol v_2),\dots}$ where $\boldsymbol u_i \in \mathcal{U}$ and $\boldsymbol v_i \in \mathcal{V}$. We want to find the correlation: 239 | \begin{itemize} 240 | \item We find the unit vectors $\boldsymbol a\in\mathcal{U}$ and $\boldsymbol b\in\mathcal{V}$ such that the correlation between the projections $\boldsymbol u_i^T\boldsymbol a$ and $\boldsymbol v_i^T\boldsymbol b$ is maximized. 241 | \item Subsequent direction pairs are found in the same way within the orthogonal subspace. 242 | \end{itemize} 243 | Now, the probabilistic CCA is the generative model with latent $\boldsymbol z_i \in \mathbb{R}^K$ such that: 244 | \begin{equation*} 245 | \begin{aligned} 246 | \boldsymbol z \sim \mathcal{N}(\boldsymbol 0, \boldsymbol I) \qquad \boldsymbol u \sim \mathcal{N}(\boldsymbol \Upsilon\boldsymbol z, \boldsymbol \Psi_u) \qquad \boldsymbol v \sim \mathcal{N}(\boldsymbol \Phi\boldsymbol z, \boldsymbol \Psi_v) 247 | \end{aligned} 248 | \end{equation*} 249 | where we have $\boldsymbol\Psi_{\boldsymbol u}\succeq0$ and $\boldsymbol\Psi_{\boldsymbol v}\succeq0$; the joint noise is block diagonal. There is a common restriction to Gaussian FA, PCA and CCA: the class of distributions they can model is quite limited. 250 | \end{definition} 251 | 252 | \begin{definition}{\textbf{(Mixture Distribution)}} 253 | The mixture distribution has a simple discrete latent variable: 254 | \begin{equation*} 255 | s_i \sim \text{Discrete}[\boldsymbol \pi] \qquad \boldsymbol x_i | s_i \sim P_{s_i}[\theta_{s_i}] 256 | \end{equation*} 257 | The mixture can be seen as combining multiple sources of data. The probability density of a single data point is given as: 258 | \begin{equation*} 259 | \begin{aligned} 260 | p(\boldsymbol x_i) &= \sum^k_{m=1} p(\boldsymbol x_i | s_i=m)p(s_i=m) = \sum^k_{m=1} \pi_m p_m(\boldsymbol x_i) \\ 261 | \end{aligned} 262 | \end{equation*} 263 | The most notable mixture distribution is the mixture of Gaussians. 264 | \end{definition} 265 | 266 | \begin{remark} 267 | Please note that one can perform Bayesian inference to infer the probability that a particular point $\boldsymbol x$ belongs to cluster $m$ of the mixture distribution: 268 | \begin{equation*} 269 | p(s_i = m | \boldsymbol x) = \frac{p_m(\boldsymbol x)\pi_m}{\sum^k_{i=1}p_i(\boldsymbol x)\pi_i} 270 | \end{equation*} 271 | \end{remark} 272 | 273 | \begin{remark}{\textbf{(Mixture of Gaussian)}} 274 | Let's consider the mixture of Gaussians, where we have the following mixture distribution: 275 | \begin{equation*} 276 | p(\brackc{\boldsymbol x_i}^N_{i=1} | \brackc{\boldsymbol \mu_m}^k_{m=1}, \brackc{\boldsymbol \Sigma_m}^k_{m=1}, \boldsymbol \pi) = \prod^n_{i=1}\sum^k_{m=1} \pi_m \frac{1}{\sqrt{\abs{2\pi\boldsymbol \Sigma_m}}}\exp\brackc{-\frac{1}{2}(\boldsymbol x_i-\boldsymbol \mu_m)^T\boldsymbol \Sigma_m^{-1}(\boldsymbol x_i-\boldsymbol \mu_m)} 277 | \end{equation*} 278 | Again, it is hard to find a closed-form maximum likelihood solution, and so we will consider a method for solving such problems, which is called Expectation-Maximization (EM). 279 | \end{remark} 280 |
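The Bayesian cluster-membership computation above is a one-liner in numpy; a minimal sketch for a 1-D mixture of Gaussians (toy parameters, not from the lecture):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(5)

# Responsibilities p(s = m | x) via Bayes rule, for a 1-D mixture.
pi = np.array([0.5, 0.5])
mus = np.array([-2.0, 2.0])
sigmas = np.array([1.0, 1.0])

x = rng.normal(2.0, 1.0, size=4)                    # some test points
dens = np.exp(-0.5 * ((x[:, None] - mus) / sigmas) ** 2) \
       / (sigmas * np.sqrt(2 * np.pi))              # p_m(x), shape (N, k)
resp = pi * dens
resp /= resp.sum(axis=1, keepdims=True)             # p(s = m | x)
print(resp)  # rows sum to 1; points near +2 load on the second component
\end{verbatim}
This is exactly the E-step quantity used by EM for such mixtures.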
281 | \begin{remark}{\textbf{(Mixture of Factor Analyzers)}} 282 | Now, we consider clustering and dimensionality reduction together: 283 | \begin{equation*} 284 | p(\boldsymbol x | \boldsymbol \theta) = \sum^k_{m=1}\pi_m\mathcal{N}(\boldsymbol x | \boldsymbol \mu_m, \boldsymbol W_m\boldsymbol W_m^T + \boldsymbol \Psi) 285 | \end{equation*} 286 | where $\pi_m$ are the mixing proportions, while the parameters are $\boldsymbol \theta = \brackc{\brackc{\pi_m, \boldsymbol \mu_m, \boldsymbol W_m}^k_{m=1}, \boldsymbol \Psi}$. Please note that this model has 2 kinds of latent variables, which are: 287 | \begin{itemize} 288 | \item Cluster indicator variable $s_i \in \brackc{1,\cdots,k}$ 289 | \item Continuous factor $\boldsymbol z_{im} \in \mathbb{R}^M$ 290 | \end{itemize} 291 | Together giving us the following data generating distribution: 292 | \begin{equation*} 293 | p(\boldsymbol x | \boldsymbol \theta) = \sum^k_{m=1} p(s=m)\int p(\boldsymbol z)p(\boldsymbol x | \boldsymbol z, s=m, \boldsymbol \theta) \dby \boldsymbol z 294 | \end{equation*} 295 | We can use EM to perform the optimization. 296 | \end{remark} 297 | 298 | -------------------------------------------------------------------------------- /prob-unsup/prob-unsup.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Phutoast/UCL-CSML-Notes/292a123e2dfd17d3ca1a0e268d199cae9632a5b9/prob-unsup/prob-unsup.pdf -------------------------------------------------------------------------------- /prob-unsup/prob-unsup.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage[utf8]{inputenc} 3 | \usepackage{amsmath} 4 | \usepackage{amsthm} 5 | \usepackage{amsfonts} 6 | \usepackage[colorlinks]{hyperref} 7 | \usepackage{natbib} 8 | \usepackage{graphicx} 9 | \usepackage{algorithm} 10 | \usepackage{algpseudocode} 11 | \usepackage{booktabs} 12 | \usepackage{caption} 13 | \usepackage{tikz} 14 | \usepackage{chngpage} 15 | \usepackage{xcolor} 16 | \usepackage{cancel} 17 | 18 | \newtheorem{theorem}{Theorem}[section] 19 | \newtheorem{corollary}{Corollary}[section] 20 | \newtheorem{proposition}{Proposition}[section] 21 | \newtheorem{lemma}{Lemma}[section] 22 | \newtheorem{claim}{Claim}[section] 23 | \newtheorem{conjecture}{Conjecture}[section] 24 | \newtheorem{example}{Example}[section] 25 | 26 | \theoremstyle{definition} 27 | \newtheorem{definition}{Definition}[section] 28 | 29 | \theoremstyle{remark} 30 | \newtheorem{remark}{Remark} 31 | 32 | 33 | \newcommand{\Phu}[1]{{\bf \color{red} [[Phu: #1]]}} 34 | \setlength\parindent{0pt} 35 | \setlength\parskip{5pt} 36 | \usepackage[margin=1.0in]{geometry} 37 | 38 | \newcommand{\dby}{\ \mathrm{d}} 39 | \newcommand{\argmax}[1]{\underset{#1}{\arg\max \ }} 40 | \newcommand{\argmin}[1]{\underset{#1}{\arg\min \ }} 41 | \newcommand{\const}{\text{const.}} 42 | \newcommand{\bracka}[1]{\left( #1 \right)} 43 | \newcommand{\brackb}[1]{\left[ #1 \right]} 44 | \newcommand{\brackc}[1]{\left\{ #1 \right\}} 45 | \newcommand{\brackd}[1]{\left\langle #1 \right\rangle} 46 | \newcommand{\abs}[1]{\left| #1 \right|} 47 | \newcommand{\contractop}{\mathcal{B}} 48 | \newcommand*\circled[1]{\tikz[baseline=(char.base)]{ 49 | \node[shape=circle,draw,inner sep=2pt] (char) {#1};}} 50 | \newcommand{\red}[1]{{\color{red} #1}} 51 | \newcommand{\loss}{\mathcal{L}} 52 | \newcommand{\correctquote}[1]{``#1''} 53 | \newcommand{\norm}[1]{\left\lVert#1\right\rVert} 54 | 55 | % From https://tex.stackexchange.com/questions/194426/split-itemize-into-multiple-columns 56 | \usepackage{etoolbox,refcount} 57 | \usepackage{multicol} 58 | 59 | \newcounter{countitems} 60 | \newcounter{nextitemizecount} 61 | \newcommand{\setupcountitems}{% 62 | \stepcounter{nextitemizecount}% 63 | \setcounter{countitems}{0}% 64 | \preto\item{\stepcounter{countitems}}% 65 | } 66 | \makeatletter 67 | \newcommand{\computecountitems}{% 68 | \edef\@currentlabel{\number\c@countitems}% 69 |
\label{countitems@\number\numexpr\value{nextitemizecount}-1\relax}% 70 | } 71 | \newcommand{\nextitemizecount}{% 72 | \getrefnumber{countitems@\number\c@nextitemizecount}% 73 | } 74 | \newcommand{\previtemizecount}{% 75 | \getrefnumber{countitems@\number\numexpr\value{nextitemizecount}-1\relax}% 76 | } 77 | \makeatother 78 | \newenvironment{AutoMultiColItemize}{% 79 | \ifnumcomp{\nextitemizecount}{>}{3}{\begin{multicols}{2}}{}% 80 | \setupcountitems\begin{itemize}}% 81 | {\end{itemize}% 82 | \unskip\computecountitems\ifnumcomp{\previtemizecount}{>}{3}{\end{multicols}}{}} 83 | 84 | 85 | \title{Probabilistic and Unsupervised Learning} 86 | \author{Phu Sakulwongtana} 87 | \date{} 88 | 89 | \begin{document} 90 | 91 | \maketitle 92 | 93 | \input{chapter/part1.tex} 94 | \input{chapter/part2.tex} 95 | \input{chapter/part3.tex} 96 | \input{chapter/part4.tex} 97 | \input{chapter/part5.tex} 98 | 99 | % \begin{algorithm}[H] 100 | % \caption{$PSRO_{RN}$} 101 | % \begin{algorithmic}[1] 102 | % \State \textbf{Input}: Initial Population $\mathcal{B}_1$ 103 | % \For {$i=1,2,\cdots, T$} 104 | % \State $p \leftarrow \text{Nash}(A_{\mathcal{B}_i})$ 105 | % \For {agent $v_i$ with positive mass in $p_t$} 106 | % \State $v_{i+1} \leftarrow \text{oracle}(v_i, \sum_{w \in \mathcal{B}_i} p[i](\phi_{v_i}(\cdot))_+)$ 107 | % \EndFor 108 | % \State $\mathcal{B}_{i+1} = \mathcal{B} \cup \{v_{i+1} : \text{as updated above}\}$ 109 | % \EndFor 110 | % \end{algorithmic} 111 | % \end{algorithm} 112 | 113 | % \begin{table}[!h] 114 | % \centering 115 | % \begin{tabular}{lc} 116 | % \toprule 117 | % \textbf{Methods/Metrics} & \textbf{Accuracy} \\ 118 | % \midrule 119 | % Logistic Regression & $48.26 \pm 0.0f0$ \\ 120 | % Support Vector Machine & $48.91 \pm 0.00$ \\ 121 | % Random Forest Classifier & $44.38 \pm 1.57$ \\ 122 | % \midrule 123 | % Multi-Dimensional ELO & $34.51 \pm 3.12$ \\ 124 | % TrueSkill\texttrademark & $44.99 \pm 0.00$ \\ 125 | % \bottomrule 126 | % \end{tabular} 127 | 128 | % \caption{} 129 | 130 | % \label{table} 131 | % \end{table} 132 | 133 | % \begin{AutoMultiColItemize} 134 | % \item Item 1 135 | % \item Item 2 136 | % \item Item 3 137 | % \item Item 4 138 | % \item Item 5 139 | % \item Item 6 140 | % \end{AutoMultiColItemize} 141 | 142 | 143 | % \bibliographystyle{plain} 144 | % \bibliography{references} 145 | \end{document} 146 | -------------------------------------------------------------------------------- /prob-unsup/test/test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | C = np.random.randn(5, 5) 4 | V0 = np.random.randn(5, 5) 5 | R = np.random.randn(5, 5) 6 | A = np.dot(C, np.dot(V0, C.T))  # A = C V0 C^T 7 | 8 | 9 | # first1 = np.linalg.inv(C) 10 | # first2 = np.linalg.inv(np.linalg.inv(R) + np.linalg.inv(A)) 11 | # first3 = np.linalg.inv(R) + np.linalg.inv(A) 12 | 13 | # first = np.dot(first1, np.dot(first2, first3)) 14 | 15 | second3 = np.linalg.inv(R + A)  # (R + A)^{-1} 16 | K = np.dot(V0, np.dot(C.T, second3))  # K = V0 C^T (R + A)^{-1}, a Kalman-gain-like quantity 17 | 18 | first = np.dot(V0, np.dot(C.T, np.linalg.inv(R))) - np.dot(K, np.dot(A, np.linalg.inv(R)) )  # V0 C^T R^{-1} - K A R^{-1} 19 | 20 | 21 | print(first - K)  # ~0: numerically checks (R + A)^{-1} = R^{-1} - (R + A)^{-1} A R^{-1} 22 | 23 | -------------------------------------------------------------------------------- /rice-foundation-fortnight/contents/part1.tex: -------------------------------------------------------------------------------- 1 | \section{Too Many Distributions (And Their Related Quantities)} 2 | 3 | \subsection{Normal Distribution and Friends} 4 | 5 | \begin{definition}{\textbf{(Normal Distribution)}} 6 | We define the
normal distribution to be: 7 | \begin{equation*} 8 | \mathcal{N}(x | \mu, \sigma^2) = \frac{1}{\sqrt{2\pi\sigma^2}}\exp\bracka{-\frac{(x-\mu)^2}{2\sigma^2}} 9 | \end{equation*} 10 | \end{definition} 11 | 12 | \begin{definition}{\textbf{(Cumulative Normal Distribution)}} 13 | We define the CDF of the normal distribution as: 14 | \begin{equation*} 15 | \mathcal{N}(X \le x | \mu, \sigma^2) = \Phi\bracka{\frac{x-\mu}{\sigma}} = \frac{1}{2}\brackb{1 + \operatorname{erf}\bracka{\frac{x-\mu}{\sigma \sqrt{2}}}} \quad \text{ where } \quad \operatorname{erf}(x) = \frac{2}{\sqrt{\pi}}\int^x_0\exp(-t^2)\dby t 16 | \end{equation*} 17 | \end{definition} 18 | 19 | \begin{definition}{\textbf{(Multinomial Cell Probabilities)}} 20 | We consider $X_1,\dots,X_m$, the counts in cells $1,\dots,m$, which follow a multinomial distribution with total count $n$ and cell probabilities $p_1,\dots,p_m$, as we have: 21 | \begin{equation*} 22 | p(X_1,\dots,X_m |p_1,\dots,p_m) = \frac{n!}{\prod^m_{i=1} X_i!}\prod^m_{i=1}p_i^{X_i} 23 | \end{equation*} 24 | The marginal distribution of each $X_i$ is binomial $(n,p_i)$, but the joint frequency function isn't the product of the marginal frequency functions. 25 | \end{definition} 26 | 27 | \subsection{Statistical Properties} 28 | 29 | \begin{definition}{\textbf{(Mean/Variance)}} 30 | Mean and Variance of a random variable $x$ are defined as: 31 | \begin{equation*} 32 | \mathbb{E}[f(x)] = \int f(x)p(x)\dby x \qquad \operatorname{var}(x) = \mathbb{E}[(x - \mathbb{E}[x])^2] 33 | \end{equation*} 34 | \end{definition} 35 | 36 | \begin{definition}{\textbf{(Covariance/Correlation Coefficient)}} 37 | Covariance and Correlation coefficient between $2$ variables are defined as: 38 | \begin{equation*} 39 | \operatorname{cov}(x, y) = \mathbb{E}[(x - \mathbb{E}[x])(y - \mathbb{E}[y])] \qquad \rho = \frac{\operatorname{cov}(x, y)}{\sqrt{\operatorname{var}(x)\operatorname{var}(y)}} 40 | \end{equation*} 41 | \end{definition} 42 | 43 | \begin{theorem}{\textbf{(Markov's Inequality)}} 44 | If $X$ is a random variable with $P(X\ge0) = 1$ and for which $\mathbb{E}[X]$ exists, then for any $t > 0$: 45 | \begin{equation*} 46 | \mathbb{P}(X\ge t) \le \frac{\mathbb{E}[X]}{t} 47 | \end{equation*} 48 | \end{theorem} 49 | \begin{proof} 50 | Consider the expectation: 51 | \begin{equation*} 52 | \begin{aligned} 53 | \mathbb{E}[X] &= \int x p(x)\dby x \\ 54 | &= \int_{x < t} xp(x)\dby x + \int_{x \ge t} xp(x)\dby x 55 | \end{aligned} 56 | \end{equation*} 57 | All the terms in the integrals are non-negative because $X$ takes only non-negative values, and so: 58 | \begin{equation*} 59 | \begin{aligned} 60 | \mathbb{E}[X] &\ge \int_{x \ge t} xp(x)\dby x \\ 61 | &\ge \int_{x\ge t} tp(x)\dby x = t \mathbb{P}(X\ge t) 62 | \end{aligned} 63 | \end{equation*} 64 | \end{proof} 65 | 66 | \begin{theorem}{\textbf{(Chebyshev's Inequality)}} 67 | Let $X$ be a random variable with mean $\mu$ and variance $\sigma^2$. Then for any $t>0$: 68 | \begin{equation*} 69 | \mathbb{P}(\abs{X - \mu} > t) \le \frac{\sigma^2}{t^2} 70 | \end{equation*} 71 | \end{theorem} 72 | \begin{proof} 73 | We let $Y = (X - \mu)^2$. Then $\mathbb{E}[Y] = \sigma^2$, and the result follows by applying Markov's inequality to $Y$ with the value $t^2$, since $\abs{X-\mu} > t$ exactly when $Y > t^2$. 74 | \end{proof} 75 |
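Both inequalities are easy to check by simulation; a small Monte Carlo sketch for Chebyshev's bound (Exponential$(1)$, which has $\mu = \sigma^2 = 1$; illustrative only):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(6)

# Empirical tail probabilities vs. Chebyshev's bound sigma^2 / t^2.
X = rng.exponential(1.0, size=1_000_000)
mu, sigma2 = 1.0, 1.0
for t in [1.0, 2.0, 3.0]:
    lhs = np.mean(np.abs(X - mu) > t)
    print(t, lhs, "<=", sigma2 / t**2)
\end{verbatim}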
76 | \begin{theorem}{\textbf{(Law of Large Numbers)}} 77 | Let $X_1,X_2,\dots,X_i,\dots$ be a sequence of independent random variables with $\mathbb{E}[X_i] = \mu$ and $\operatorname{var}(X_i) = \sigma^2$. Let $\bar{X}_n = 1/n\sum^n_{i=1}X_i$. Then for any $\varepsilon>0$: 78 | \begin{equation*} 79 | \mathbb{P}(\abs{\bar{X}_n - \mu} > \varepsilon) \rightarrow 0 \qquad \text{ as } \qquad n\rightarrow \infty 80 | \end{equation*} 81 | \end{theorem} 82 | \begin{proof} 83 | Let's find $\mathbb{E}[\bar{X}_n]$ and $\operatorname{var}(\bar{X}_n)$; since the $X_i$ are independent: 84 | \begin{equation*} 85 | \mathbb{E}[\bar{X}_n] = \frac{1}{n}\sum^n_{i=1}\mathbb{E}[X_i] = \mu 86 | \qquad \operatorname{var}(\bar{X}_n) = \frac{1}{n^2}\sum^n_{i=1}\operatorname{var}(X_i) = \frac{\sigma^2}{n} 87 | \end{equation*} 88 | The result follows from Chebyshev's inequality, which gives: 89 | \begin{equation*} 90 | \mathbb{P}(\abs{\bar{X}_n - \mu}>\varepsilon)\le\frac{\operatorname{var}(\bar{X}_n)}{\varepsilon^2} = \frac{\sigma^2}{n\varepsilon^2} \rightarrow 0 91 | \end{equation*} 92 | as $n\rightarrow \infty$. Thus the theorem is proven. 93 | \end{proof} 94 | 95 | \begin{definition}{\textbf{(Convergence of Distribution Function)}} 96 | Let $X_1,X_2,\dots$ be a sequence of random variables with CDFs $F_1, F_2,\dots$ and let $X$ be a random variable with distribution $F$. We say that $X_n$ converges in distribution to $X$ if: 97 | \begin{equation*} 98 | \lim_{n\rightarrow\infty} F_n(x) = F(x) 99 | \end{equation*} 100 | at every point at which $F$ is continuous. 101 | \end{definition} 102 | 103 | \begin{theorem}{\textbf{(Continuity Theorem)}} 104 | Let $F_n$ be a sequence of CDFs with corresponding moment-generating functions $M_n$. Let $F$ be a CDF with moment-generating function $M$. If $M_n(t) \rightarrow M(t)$ for all $t$ in an open interval containing zero, then $F_n(x)\rightarrow F(x)$ at all continuity points of $F$. 105 | \end{theorem} 106 | 107 | \begin{theorem}{\textbf{(Central Limit Theorem)}} 108 | Let $X_1,X_2,\dots$ be a sequence of independent, identically distributed random variables having mean $0$, variance $\sigma^2$, common distribution function $F$ and moment-generating function $M$ defined in a neighborhood of zero. Let: 109 | \begin{equation*} 110 | S_n = \sum^n_{i=1} X_i 111 | \end{equation*} 112 | Then, we have: 113 | \begin{equation*} 114 | \lim_{n\rightarrow\infty} \mathbb{P}\bracka{\frac{S_n}{\sigma\sqrt{n}} \le x} = \Phi(x) \qquad -\infty < x < \infty \end{equation*} \end{theorem} For $n > 2$, $\mathbb{E}[W]$ exists and equals $n/(n-2)$. Finally, from the definition, the square of a $t_n$ random variable follows an $F_{1,n}$ distribution. 221 | \end{definition} 222 | 223 | \begin{theorem} 224 | The distribution of $(n-1)S^2/\sigma^2$ is the $\chi^2_{n-1}$-distribution. 225 | \end{theorem} 226 | \begin{proof} 227 | Please note that: 228 | \begin{equation*} 229 | \frac{1}{\sigma^2}\sum^n_{i=1}(X_i - \mu)^2 = \sum^n_{i=1}\bracka{\frac{X_i-\mu}{\sigma}}^2 \sim\chi_n^2 230 | \end{equation*} 231 | And, note that: 232 | \begin{equation*} 233 | \begin{aligned} 234 | \frac{1}{\sigma^2}\sum^n_{i=1}(X_i - \mu)^2 &= \frac{1}{\sigma^2}\sum^n_{i=1}[(X_i - \bar{X}) + (\bar{X} - \mu)]^2 \\ 235 | &= \frac{1}{\sigma^2}\sum^n_{i=1}(X_i - \bar{X})^2 + \bracka{\frac{\bar{X} - \mu}{\sigma/\sqrt{n}}}^2 236 | \end{aligned} 237 | \end{equation*} 238 | where the cross term vanishes because $\sum^n_{i=1}(X_i-\bar{X}) = 0$. Now this relation is of the form $W = U + V$ with $U = (n-1)S^2/\sigma^2$; as $U$ and $V$ are independent, we have $M_W(t)=M_U(t)M_V(t)$, and as both $W$ and $V$ are $\chi^2$-distributed (with $n$ and $1$ degrees of freedom respectively), we have: 239 | \begin{equation*} 240 | M_U(t) = \frac{M_W(t)}{M_V(t)} = \frac{(1-2t)^{-n/2}}{(1-2t)^{-1/2}} = (1-2t)^{-(n-1)/2} 241 | \end{equation*} 242 | The last expression is the mgf of a random variable with a $\chi^2_{n-1}$ distribution. 243 | \end{proof} 244 |
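A quick simulation check of this theorem (toy choices of $n$, $\sigma$): the statistic $(n-1)S^2/\sigma^2$ should match the $\chi^2_{n-1}$ moments, namely mean $n-1$ and variance $2(n-1)$:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(7)
n, sigma, reps = 10, 2.0, 200_000

X = rng.normal(0.0, sigma, size=(reps, n))
S2 = X.var(axis=1, ddof=1)            # sample variance with the n-1 divisor
stat = (n - 1) * S2 / sigma**2
print(stat.mean(), n - 1)             # ~9
print(stat.var(), 2 * (n - 1))        # ~18
\end{verbatim}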
245 | \begin{corollary} 246 | We can show that: 247 | \begin{equation*} 248 | \frac{\bar{X}-\mu}{S/\sqrt{n}} \sim t_{n-1} 249 | \end{equation*} 250 | \end{corollary} 251 | \begin{proof} 252 | We can show that it is equivalent to the following ratio: 253 | \begin{equation*} 254 | \frac{\bar{X}-\mu}{S/\sqrt{n}} = \frac{\cfrac{\bar{X}-\mu}{\sigma/\sqrt{n}}}{\sqrt{S^2/\sigma^2}} 255 | \end{equation*} 256 | The numerator is $\mathcal{N}(0, 1)$ and the denominator is the square root of a $\chi^2_{n-1}$ random variable divided by its degrees of freedom $n-1$; so, by definition, the ratio is $t_{n-1}$. 257 | \end{proof} 258 | -------------------------------------------------------------------------------- /rice-foundation-fortnight/contents/part4.tex: -------------------------------------------------------------------------------- 1 | \section{Summarizing Data} 2 | 3 | \subsection{Methods Based on CDF} 4 | 5 | \begin{definition}{\textbf{(Empirical CDF)}} 6 | Suppose we have $x_1,\dots,x_n$, a batch of numbers. The empirical cumulative distribution function is defined as: 7 | \begin{equation*} 8 | F_n(x) = \frac{1}{n}(\# x_i \le x) 9 | \end{equation*} 10 | Or, if we have the ordered numbers $x_{(1)}\le x_{(2)} \le \cdots \le x_{(n)}$, we have: if $x_{(k)} \le x < x_{(k+1)}$, then $F_n(x) = k/n$. 11 | \end{definition} 12 | 13 | \begin{remark}{\textbf{(Comments on Empirical CDF)}} 14 | In the analysis, it is better to express $F_n$ in the following way, given random variables $X_1,\dots,X_n$: 15 | \begin{equation*} 16 | F_n(x) = \frac{1}{n} \sum^n_{i=1} I_{(-\infty, x]}(X_i) \qquad \text{ where } \qquad I_{(-\infty, x]}(X_i) = \begin{cases} 17 | 1 & \text{ if } X_i \le x \\ 18 | 0 & \text{ otherwise } 19 | \end{cases} 20 | \end{equation*} 21 | The random variables $I_{(-\infty, x]}(X_i)$ are independent Bernoulli random variables, where we have: 22 | \begin{equation*} 23 | I_{(-\infty, x]}(X_i) = \begin{cases} 24 | 1 & \text{ with probability } F(x) \\ 25 | 0 & \text{ with probability } 1-F(x) \\ 26 | \end{cases} 27 | \end{equation*} 28 | Thus, $nF_n(x)$ is a binomial random variable ($n$ trials with probability $F(x)$ of success), as we have: 29 | \begin{equation*} 30 | \mathbb{E}[F_n(x)] = F(x) \qquad \operatorname{var}(F_n(x)) = \frac{1}{n}F(x)[1-F(x)] 31 | \end{equation*} 32 | The estimate $F_n(x)$ is unbiased and has maximum variance at the value of $x$ such that $F(x) = 0.5$, i.e. at the median. 33 | \end{remark} 34 | 35 | \begin{remark}{\textbf{(Behavior of $\boldsymbol F_n$)}} 36 | If we consider the stochastic behavior of $F_n(x)$, then we can show that: 37 | \begin{equation*} 38 | \max_{-\infty<x<\infty}\abs{F_n(x) - F(x)} \rightarrow 0 \end{equation*} as $n\rightarrow\infty$. \end{remark} \begin{definition}{\textbf{(Survival Function)}} The survival function is defined as: \begin{equation*} 46 | S(t) = P(T > t) = 1-F(t) 47 | \end{equation*} 48 | where $T$ is a random variable with CDF $F$. We use it when the data consist of times until failure or death, which are non-negative. $S(t)$ is the probability that the lifetime will be longer than $t$, and its empirical version is $S_n(t) = 1-F_n(t)$. 49 | \end{definition} 50 |
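Both $F_n$ and $S_n$ are direct counting operations; a small numpy sketch (simulated exponential failure times, illustrative only):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(8)

# Empirical CDF F_n and survival function S_n = 1 - F_n from a sample.
T = np.sort(rng.exponential(1.0, size=200))     # e.g. failure times

def F_n(x, sample=T):
    # Fraction of sample points <= x.
    return np.searchsorted(sample, x, side="right") / len(sample)

for x in [0.5, 1.0, 2.0]:
    print(x, F_n(x), 1 - np.exp(-x))            # empirical vs. true CDF
print(1 - F_n(1.0))                             # empirical survival S_n(1.0)
\end{verbatim}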
51 | \begin{definition}{\textbf{(Hazard Function)}} 52 | It is interpreted as the instantaneous death rate for individuals who have survived up to a given time. If an individual is alive at time $t$, the probability that the individual will die in the time interval $(t, t + \delta)$ is (assuming the density function $f$ is continuous at $t$): 53 | \begin{equation*} 54 | \begin{aligned} 55 | P(t \le T \le t + \delta | T\ge t) &= \frac{P(t\le T \le t + \delta)}{P(T \ge t)} \\ 56 | &= \frac{F(t + \delta) - F(t)}{1 - F(t)} \approx \frac{\delta f(t)}{1-F(t)} 57 | \end{aligned} 58 | \end{equation*} 59 | The hazard function is defined as: 60 | \begin{equation*} 61 | h(t) = \frac{f(t)}{1-F(t)} 62 | \end{equation*} 63 | If $T$ is the lifetime of a manufactured component, it may be natural to think of $h(t)$ as the instantaneous or age-specific failure rate. 64 | \end{definition} 65 | 66 | \begin{remark}{\textbf{(Interpretation of Hazard Function)}} 67 | It can be expressed as: 68 | \begin{equation*} 69 | h(t) = -\frac{d}{dt}\log[1-F(t)] = -\frac{d}{dt} \log S(t) 70 | \end{equation*} 71 | which is the negative derivative of the log survival function. With the method of propagation of error: 72 | \begin{equation*} 73 | \operatorname{var}\Big( \log[1 - F_n(t)] \Big) \approx \frac{\operatorname{var}[1-F_n(t)]}{(1-F(t))^2} = \frac{1}{n}\bracka{\frac{F(t)}{1-F(t)}} 74 | \end{equation*} 75 | For large values of $t$, the empirical log survival function is unreliable, because $1-F(t)$ is very small, and so in practice the last few data points are disregarded. 76 | \end{remark} 77 | 78 | \begin{remark}{\textbf{(Empirical Survival Function)}} 79 | Suppose that there are no ties and the ordered failure times are: $T_{(1)} < T_{(2)} < \cdots < T_{(n)}$. If $t = T_{(i)}$, $F_n(t) = i/n$ and $S_{n}(t) = 1-i/n$. But since $\log S_n(t)$ is undefined for $t\ge T_{(n)}$, it is often defined as: 80 | \begin{equation*} 81 | S_n(t) = 1 - \frac{i}{n+1} 82 | \end{equation*} 83 | for $T_{(i)} \le t < T_{(i+1)}$ 84 | \end{remark} 85 | 86 | \begin{definition}{\textbf{(Quantile-Quantile Plot)}} 87 | If $X$ is a continuous random variable with a strictly increasing distribution function $F$, the $p$-th quantile is defined to be the value $x_p$ such that $F(x_p) = p$, i.e. $x_p = F^{-1}(p)$. In a Q-Q plot, the quantiles of one distribution are plotted against those of another. 88 | \end{definition} 89 | 90 | \begin{remark}{\textbf{(Usage of Q-Q)}} 91 | Suppose we have $2$ distributions: 92 | \begin{itemize} 93 | \item $F$ is a model for observations of a control group. 94 | \item $G$ is a model for observations of a group that has received some treatment. 95 | \end{itemize} 96 | Let's consider how different treatment effects change the plot: 97 | \begin{itemize} 98 | \item Suppose that there is an effect of a uniform shift by $h$, i.e. $y_p = x_p + h$, where $y_p$ is the quantile of the group that received the treatment and $x_p$ that of the control group. This gives us the relationship $G(y) = F(y - h)$. 99 | \item Similarly, we have the effect with multiplicative differences, i.e. given $c \in \mathbb{R}$, we have $y_p = cx_p$ with the relationship $G(y) = F(y/c)$. 100 | \end{itemize} 101 | Given only samples, we have to use the empirical CDFs to create the Q-Q plot. Now, the results of these changes are shown in the following figure: 102 | \begin{figure}[H] 103 | \centering 104 | \begin{subfigure}{.5\textwidth} 105 | \centering 106 | \includegraphics[width=0.7\linewidth]{img/img3.png} 107 | \caption{Additive Treatment Effect} 108 | \label{fig:1-sub1} 109 | \end{subfigure}% 110 | \begin{subfigure}{.5\textwidth} 111 | \centering 112 | \includegraphics[width=0.7\linewidth]{img/img4.png} 113 | \caption{Multiplicative Treatment Effect} 114 | \label{fig:1-sub2} 115 | \end{subfigure} 116 | \end{figure} 117 | \end{remark} 118 |
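Computing the Q-Q points themselves is a two-line numpy operation; a sketch with an additive treatment effect $h = 1.5$ (synthetic data, illustrative only):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(9)

# Empirical Q-Q data: quantiles of a treated sample against a control
# sample. An additive effect h shows up as y_p ~ x_p + h.
control = rng.normal(0.0, 1.0, size=500)
treated = rng.normal(0.0, 1.0, size=500) + 1.5      # h = 1.5
p = np.linspace(0.05, 0.95, 19)
x_q = np.quantile(control, p)
y_q = np.quantile(treated, p)
print(np.mean(y_q - x_q))   # ~1.5: the points lie near the line y = x + h
\end{verbatim}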
119 | \begin{definition}{\textbf{(Kernel Probability Density Estimate)}} 120 | Let $w(x)$ be a non-negative, symmetric weight function, centered at zero and integrating to $1$; it can be, for example, the standard normal density. Then: 121 | \begin{equation*} 122 | w_h(x) = \frac{1}{h}w\bracka{\frac{x}{h}} 123 | \end{equation*} 124 | is a rescaled version of $w$: as $h$ approaches zero, $w_h$ becomes more concentrated and peaked around zero; on the other hand, as $h$ approaches infinity, $w_h$ becomes flat. If $X_1,\dots,X_n$ is a sample from a probability density function $p$, its estimate is: 125 | \begin{equation*} 126 | f_h(x) = \frac{1}{n}\sum^n_{i=1}w_h(x - X_i) 127 | \end{equation*} 128 | The parameter $h$, the bandwidth of the estimating function, controls its smoothness. 129 | \end{definition} 130 |
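A direct numpy translation of this definition with a Gaussian kernel (toy bimodal sample; the bandwidth values are illustrative):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(10)

# Kernel density estimate f_h(x) = (1/n) sum_i w_h(x - X_i).
X = np.concatenate([rng.normal(-2, 0.5, 150), rng.normal(1, 1.0, 150)])

def f_h(x, h, sample=X):
    u = (x[:, None] - sample[None, :]) / h
    w = np.exp(-0.5 * u**2) / np.sqrt(2 * np.pi)    # standard normal kernel
    return w.mean(axis=1) / h

grid = np.linspace(-4, 4, 9)
print(f_h(grid, h=0.3))   # small h: peaked; try h=2.0 for an oversmoothed fit
\end{verbatim}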
131 | \subsection{Measures of Location} 132 | 133 | \begin{definition}{\textbf{(Arithmetic Mean)}} 134 | The most commonly used measure of location is the arithmetic mean, which is: 135 | \begin{equation*} 136 | \bar{x} = \frac{1}{n}\sum^n_{i=1}x_i 137 | \end{equation*} 138 | \end{definition} 139 | 140 | \begin{remark}{\textbf{(Problem with Arithmetic Mean)}} 141 | By changing a single number, the arithmetic mean of a batch of numbers can be made arbitrarily large or small. Thus, when used blindly, without careful attention, the mean can produce misleading results; we need measures of location that are robust, i.e. insensitive to outliers. 142 | \end{remark} 143 | 144 | \begin{remark}{\textbf{(Why Sample Mean is Bad)}} 145 | The sample mean is the minimizer of: 146 | \begin{equation*} 147 | \sum^n_{i=1}\bracka{\frac{X_i - \mu}{\sigma}}^2 148 | \end{equation*} 149 | This is the simplest case of a least squares estimate. Outliers have a great effect on this estimate, as the deviation of $\mu$ from $X_i$ is measured by the square of their difference. 150 | \end{remark} 151 | 152 | \begin{definition}{\textbf{(Median)}} 153 | It is the middle value of the ordered observations; if the sample size is even, the median is the average of the $2$ middle values. 154 | \end{definition} 155 | 156 | \begin{proposition}{\textbf{(Confidence Interval)}} 157 | We can show that, given the population median $\eta$ and the interval between the order statistics $(X_{(k)}, X_{(n-k+1)})$: 158 | \begin{equation*} 159 | P(X_{(k)} \le \eta \le X_{(n-k+1)}) = 1 - \frac{1}{2^{n-1}}\sum^{k-1}_{j=0}\begin{pmatrix} n \\ j \end{pmatrix} 160 | \end{equation*} 161 | \end{proposition} 162 | \begin{proof} 163 | The coverage probability of this interval is: 164 | \begin{equation*} 165 | \begin{aligned} 166 | P(X_{(k)} \le \eta \le X_{(n-k+1)}) &= 1 - P(\eta < X_{(k)} \text{ or } \eta > X_{(n-k+1)}) \\ 167 | &= 1 - P(\eta < X_{(k)}) - P(\eta > X_{(n-k+1)}) 168 | \end{aligned} 169 | \end{equation*} 170 | since the events are mutually exclusive. To evaluate both terms, we note that: 171 | \begin{equation*} 172 | \begin{aligned} 173 | &P(\eta > X_{(n-k+1)}) = \sum^{k-1}_{j=0} \mathbb{P}(j \text{ observations} > \eta) \\ 174 | &P(\eta < X_{(k)}) = \sum^{k-1}_{j=0} \mathbb{P}(j \text{ observations } < \eta) 175 | \end{aligned} 176 | \end{equation*} 177 | The median satisfies $P(X_i > \eta ) = P(X_i < \eta) = 1/2$; since the $n$ observations $X_1,\dots,X_n$ are independent and identically distributed, the distribution of the number of observations greater than the median is binomial with $n$ trials and probability $1/2$: 178 | \begin{equation*} 179 | P(j \text{ observations } > \eta) = \frac{1}{2^n}\begin{pmatrix} 180 | n \\ j 181 | \end{pmatrix} 182 | \end{equation*} 183 | and, so we have: 184 | \begin{equation*} 185 | P(\eta > X_{(n-k+1)}) = \frac{1}{2^n}\sum^{k-1}_{j=0}\begin{pmatrix} 186 | n \\ j 187 | \end{pmatrix} 188 | \end{equation*} 189 | This is the same for $P(\eta < X_{(k)})$ due to symmetry. Plugging these back in finishes the proof. 190 | \end{proof} 191 | 192 | \begin{remark} 193 | Median can be seen as the minimizer of the following loss: 194 | \begin{equation*} 195 | \sum^n_{i=1}\abs{\frac{X_i - \mu}{\sigma}} 196 | \end{equation*} 197 | Here, large deviations are not weighted as heavily, making the median robust. The claim follows from the fact that the derivative of the absolute value is $\operatorname{sgn}(\cdot)$, so the derivative of the loss is zero when the number of points with $X_i - \mu$ positive equals the number with $X_i - \mu$ negative, which is exactly where the median sits. 198 | \end{remark} 199 | 200 | \begin{definition}{\textbf{(Trimmed Mean)}} 201 | The $100\alpha\%$ trimmed mean considers the values between the lower $100\alpha\%$ and the upper $100\alpha\%$, as we can write it as: 202 | \begin{equation*} 203 | \bar{x}_\alpha = \frac{x_{([n\alpha] + 1)} + \cdots + x_{(n - [n\alpha])}}{n - 2[n\alpha]} 204 | \end{equation*} 205 | where $[n\alpha]$ denotes the greatest integer less than or equal to $n\alpha$. 206 | \end{definition} 207 | 208 | \begin{definition}{\textbf{(M-Estimates)}} 209 | Consider the class of estimates called $M$-estimates, which minimize: 210 | \begin{equation*} 211 | \sum^n_{i=1}\Psi\bracka{\frac{X_i - \nu}{\sigma}} 212 | \end{equation*} 213 | where $\Psi$ is a weight function that is a compromise between the weight functions for the mean and the median. 214 | \end{definition} 215 | 216 | \begin{remark}{\textbf{(Measure of Dispersion)}} 217 | The most commonly used measure is the sample standard deviation $S$, where: 218 | \begin{equation*} 219 | S^2 = \frac{1}{n-1}\sum^n_{i=1}(X_i - \bar{X})^2 220 | \end{equation*} 221 | Using $n-1$ as the divisor gives an unbiased estimate of the variance. But, like the sample mean, the standard deviation is sensitive to outlying observations. Two simple robust alternatives (both computed in the sketch after this remark) are: 222 | \begin{itemize} 223 | \item Interquartile range (IQR): the difference between the upper and lower sample quartiles. 224 | \item Median absolute deviation from the median (MAD): If the data are $x_1,\dots,x_n$ with median $\tilde{x}$, then MAD is the median of the numbers $\abs{x_1 - \tilde{x}},\dots,\abs{x_n - \tilde{x}}$. 225 | \end{itemize} 226 | \end{remark}
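A small numerical comparison of these estimators on data with one wild outlier (made-up numbers, illustrative only):
\begin{verbatim}
import numpy as np

# Mean vs. median vs. 10% trimmed mean with a single outlier.
x = np.array([9.8, 10.1, 9.9, 10.2, 10.0, 9.7, 10.3, 9.9, 10.1, 500.0])
xs = np.sort(x)
alpha = 0.1
k = int(len(x) * alpha)                 # [n * alpha]
trimmed = xs[k:len(x) - k].mean()
print(x.mean())        # ~59: ruined by the outlier
print(np.median(x))    # ~10
print(trimmed)         # ~10: drops the top and bottom 10%

# Robust measures of spread: IQR and MAD.
q1, q3 = np.quantile(x, [0.25, 0.75])
mad = np.median(np.abs(x - np.median(x)))
print(q3 - q1, mad)
\end{verbatim}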
227 | 228 | 229 | -------------------------------------------------------------------------------- /rice-foundation-fortnight/contents/part7.tex: -------------------------------------------------------------------------------- 1 | \section{The Analysis of Categorical Data} 2 | 3 | \subsection{Fisher's Exact Test} 4 | 5 | \begin{remark}{\textbf{(Setting for the Tests)}} 6 | Let's consider the data that we are given as: 7 | \begin{table}[!h] 8 | \centering 9 | \begin{tabular}{lcccc} 10 | \toprule 11 | \textbf{} & \textbf{Variation 1} & \textbf{Variation 2} & Total \\ 12 | \midrule 13 | \textbf{Category 1} & $N_{11}$ & $N_{12}$ & $n_{1.}$ \\ 14 | \textbf{Category 2} & $N_{21}$ & $N_{22}$ & $n_{2.}$ \\ 15 | Total & $n_{.1}$ & $n_{.2}$ & $n_{..}$ \\ 16 | \bottomrule 17 | \end{tabular} 18 | \end{table} 19 | We want to see whether the counts in each category are affected by the variation or not (the null hypothesis is that they are all randomly assigned). The totals, written with dots, are treated as auxiliary (fixed) quantities. 20 | \end{remark} 21 | 22 | \begin{remark}{\textbf{(Probability Under Null Hypothesis)}} 23 | Under the null hypothesis (random assignment), the probability that $N_{11} = n_{11}$ is given as: 24 | \begin{equation*} 25 | p(n_{11}) = \cfrac{\begin{pmatrix} 26 | n_{1.} \\ n_{11} 27 | \end{pmatrix}\begin{pmatrix} 28 | n_{2.} \\ n_{21} 29 | \end{pmatrix}}{\begin{pmatrix} 30 | n_{..} \\ n_{.1} 31 | \end{pmatrix}} 32 | \end{equation*} 33 | We can use $N_{11}$ as the test statistic for testing the null hypothesis. We can tabulate this distribution to create a two-sided rejection region for extreme values of $N_{11}$. 34 | \end{remark} 35 |
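The hypergeometric probability above is easy to tabulate; a small Python sketch for a toy $2\times2$ table (the two-sided $p$-value here follows the common convention of summing all outcomes at most as probable as the observed one):
\begin{verbatim}
import numpy as np
from math import comb

# Fisher's exact test: p(n11) = C(n1., n11) C(n2., n21) / C(n.., n.1),
# with n21 = n.1 - n11, for a toy table.
n11, n12, n21, n22 = 3, 1, 2, 4
r1, r2 = n11 + n12, n21 + n22            # row totals n1., n2.
c1 = n11 + n21                           # column total n.1
n = r1 + r2                              # grand total n..

def p_hyp(k):
    return comb(r1, k) * comb(r2, c1 - k) / comb(n, c1)

support = range(max(0, c1 - r2), min(r1, c1) + 1)
probs = {k: p_hyp(k) for k in support}   # the whole null distribution
p_val = sum(p for p in probs.values() if p <= p_hyp(n11) + 1e-12)
print(probs, p_val)
\end{verbatim}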
36 | \subsection{$\chi^2$-Test for Homogeneity} 37 | 38 | \begin{remark}{\textbf{(Settings for $\boldsymbol \chi^2$-Test)}} 39 | We consider a larger setting compared to Fisher's exact test, where we compare $J$ multinomial distributions, each having $I$ categories. If the probability of the $i$-th category of the $j$-th multinomial is denoted $\pi_{ij}$, the null hypothesis is: 40 | \begin{equation*} 41 | H_0 : \pi_{i1} = \pi_{i2} = \cdots = \pi_{iJ} \qquad i = 1,\dots,I 42 | \end{equation*} 43 | Under $H_0$, each of the $J$ multinomials has the same probability for the $i$-th category, denoted $\pi_i$. 44 | \end{remark} 45 | 46 | \begin{proposition} 47 | Under $H_0$, the MLEs of the parameters $\pi_1,\pi_2,\dots,\pi_I$ are given as: 48 | \begin{equation*} 49 | \hat{\pi}_i = \frac{n_{i.}}{n_{..}} \qquad i = 1,\dots,I 50 | \end{equation*} 51 | where $n_{i.}$ is the total number of responses in the $i$-th category and $n_{..}$ is the grand total number of responses. 52 | \end{proposition} 53 | \begin{proof} 54 | Since the multinomial distributions are independent: 55 | \begin{equation*} 56 | \begin{aligned} 57 | \operatorname{lik}(\pi_1,\pi_2,\dots,\pi_I) &= 58 | \prod^J_{j=1}\begin{pmatrix} 59 | n_{.j} \\ n_{1j}n_{2j}\cdots n_{Ij} 60 | \end{pmatrix} 61 | \pi^{n_{1j}}_1\pi^{n_{2j}}_2\cdots\pi^{n_{Ij}}_I \\ 62 | &= \pi^{n_{1.}}_1\pi^{n_{2.}}_2\cdots\pi^{n_{I.}}_I 63 | \prod^J_{j=1} 64 | \begin{pmatrix} 65 | n_{.j} \\ n_{1j}n_{2j}\cdots n_{Ij} 66 | \end{pmatrix} 67 | \end{aligned} 68 | \end{equation*} 69 | Consider maximizing the log-likelihood subject to the constraint $\sum^I_{i=1}\pi_i = 1$. Introducing a Lagrange multiplier, we maximize: 70 | \begin{equation*} 71 | \mathcal{L}(\pi, \lambda) = \sum^J_{j=1}\log 72 | \begin{pmatrix} 73 | n_{.j} \\ n_{1j}n_{2j}\cdots n_{Ij} 74 | \end{pmatrix} + \sum^I_{i=1}n_{i.}\log\pi_i + \lambda\bracka{\sum^I_{i=1}\pi_i-1} 75 | \end{equation*} 76 | Now, setting the derivative to zero, we have: 77 | \begin{equation*} 78 | \begin{aligned} 79 | &\frac{\partial \mathcal{L}}{\partial \pi_i} = \frac{n_{i.}}{\pi_i} + \lambda = 0 \qquad i =1,\dots,I \\ 80 | \iff&\hat{\pi}_i = -\frac{n_{i.}}{\lambda} 81 | \end{aligned} 82 | \end{equation*} 83 | Summing both sides over $i$ and applying the constraint, we find that $\lambda = -n_{..}$, and the proposition is proven. 84 | \end{proof} 85 | 86 | \begin{definition}{\textbf{(Pearson's $\boldsymbol \chi^2$-Test)}} 87 | For the $j$-th multinomial, the expected count in the $i$-th category is the estimated probability of the cell times the total number of observations for the $j$-th multinomial: 88 | \begin{equation*} 89 | E_{ij} = \frac{n_{i.}}{n_{..}} n_{.j} 90 | \end{equation*} 91 | This gives us Pearson's $\chi^2$-statistic: 92 | \begin{equation*} 93 | X^2 = \sum^I_{i=1}\sum^J_{j=1} \frac{(O_{ij} - E_{ij})^2}{E_{ij}} = \sum^I_{i=1}\sum^J_{j=1} \frac{(n_{ij} - n_{i.}n_{.j}/n_{..})^2}{n_{i.}n_{.j}/n_{..}} 94 | \end{equation*} 95 | For large sample sizes, the approximate null distribution of this statistic is $\chi^2$. The degrees of freedom are the number of independent counts minus the number of independent parameters: 96 | \begin{itemize} 97 | \item Each multinomial has $I-1$ independent counts, since the totals are fixed. 98 | \item $I-1$ independent parameters have been estimated. 99 | \end{itemize} 100 | And so the degrees of freedom are given as $J(I-1)-(I-1) = (I-1)(J-1)$. 101 | \end{definition} 102 |
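The statistic is a few lines of numpy; a sketch for a toy $3\times2$ table of counts (the numbers are made up for illustration):
\begin{verbatim}
import numpy as np

# Pearson chi^2 statistic for an I x J table, E_ij = n_i. n_.j / n_..
O = np.array([[20, 30],
              [25, 15],
              [15, 35]], dtype=float)   # I = 3 categories, J = 2 groups
row, col, n = O.sum(axis=1), O.sum(axis=0), O.sum()
E = np.outer(row, col) / n              # expected counts under H_0
X2 = ((O - E)**2 / E).sum()
df = (O.shape[0] - 1) * (O.shape[1] - 1)
print(X2, df)   # compare X2 against the chi^2_df upper-tail critical value
\end{verbatim}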
102 | 
103 | \subsection{$\chi^2$-Test of Independence}
104 | 
105 | \begin{definition}{\textbf{(Contingency Table)}}
106 | We will discuss the statistical analysis of a sample of size $n$ cross-classified in a table with $I$ rows and $J$ columns. This configuration is called a contingency table.
107 | \end{definition}
108 | 
109 | \begin{remark}{\textbf{(Settings for the Test)}}
110 | We are interested in the relationship between the factors of the table. The joint distribution of the counts $n_{ij}$, where $i=1,\dots,I$ and $j=1,\dots,J$, is multinomial with cell probabilities $\pi_{ij}$, whose marginals are denoted as:
111 | \begin{equation*}
112 | \pi_{i.} = \sum^J_{j=1}\pi_{ij} \qquad
113 | \pi_{.j} = \sum^I_{i=1}\pi_{ij}
114 | \end{equation*}
115 | These are the marginal probabilities that an observation falls in the $i$-th row or the $j$-th column. If the rows and columns are independent of each other, then $\pi_{ij} = \pi_{i.}\pi_{.j}$. This leads to the following null hypothesis:
116 | \begin{equation*}
117 | H_0 : \pi_{ij} = \pi_{i.}\pi_{.j} \qquad i = 1,\dots,I \quad j = 1,\dots,J
118 | \end{equation*}
119 | \end{remark}
120 | 
121 | \begin{remark}{\textbf{(Defining the $\chi^2$-Test)}}
122 | Let's consider the MLE under each hypothesis:
123 | \begin{itemize}
124 | \item Under $H_0$, the MLE of $\pi_{ij}$ is given as:
125 | \begin{equation*}
126 | \hat{\pi}_{ij} = \hat{\pi}_{i.}\hat{\pi}_{.j} = \frac{n_{i.}}{n}\frac{n_{.j}}{n}
127 | \end{equation*}
128 | \item Under the alternative, the MLE of $\pi_{ij}$ is given as:
129 | \begin{equation*}
130 | \tilde{\pi}_{ij} = \frac{n_{ij}}{n}
131 | \end{equation*}
132 | \end{itemize}
133 | Now we consider the $\chi^2$-test statistic:
134 | \begin{equation*}
135 | X^2 = \sum^I_{i=1}\sum^J_{j=1} \frac{(O_{ij} - E_{ij})^2}{E_{ij}} = \sum^I_{i=1}\sum^J_{j=1} \frac{(n_{ij} - (n_{i.}n_{.j})/n)^2}{ (n_{i.}n_{.j})/n}
136 | \end{equation*}
137 | where $O_{ij} = n_{ij}$ are the observed counts and the expected counts are $E_{ij} = n\hat{\pi}_{ij} = (n_{i.}n_{.j})/n$.
138 | \begin{itemize}
139 | \item Consider the degrees of freedom: under $\Omega$, the cell probabilities sum to $1$, so the dimension is $IJ-1$.
140 | \item Under the null hypothesis, the marginal probabilities are estimated from the data, contributing $(I-1)+(J-1)$ independent parameters.
141 | \end{itemize}
142 | We have the following degrees of freedom:
143 | \begin{equation*}
144 | \operatorname{df} = IJ - 1 - (I-1) - (J-1) = (I-1)(J-1)
145 | \end{equation*}
146 | \end{remark}
147 | 
148 | \subsection{Matched-Pairs Designs}
149 | 
150 | \begin{remark}{\textbf{(Setting for the Test)}}
151 | We consider the following table:
152 | \begin{table}[H]
153 | \centering
154 | \begin{tabular}{lcccc}
155 | \toprule
156 | \textbf{} & \textbf{No Cure (Sibling)} & \textbf{Cure (Sibling)} & Total \\
157 | \midrule
158 | \textbf{No Cure (Patient)} & $\pi_{11}$ & $\pi_{12}$ & $\pi_{1.}$ \\
159 | \textbf{Cure (Patient)} & $\pi_{21}$ & $\pi_{22}$ & $\pi_{2.}$ \\
160 | Total & $\pi_{.1}$ & $\pi_{.2}$ & $1$ \\
161 | \bottomrule
162 | \end{tabular}
163 | \end{table}
164 | The appropriate null hypothesis is $\pi_{i.} = \pi_{.i}$ for $i = 1,2$ (the probabilities of cure and no cure should be the same for patients and siblings), and so we have:
165 | \begin{equation*}
166 | \pi_{11} + \pi_{12} = \pi_{11} + \pi_{21} \qquad
167 | \pi_{12} + \pi_{22} = \pi_{21} + \pi_{22}
168 | \end{equation*}
169 | These equations simplify to $\pi_{12} = \pi_{21}$, so the null hypothesis is:
170 | \begin{equation*}
171 | H_0 : \pi_{12} = \pi_{21}
172 | \end{equation*}
173 | \end{remark}
174 | 
175 | \begin{proposition}{\textbf{(MLE of Cell Probabilities)}}
176 | Under $H_0$, the MLEs of the cell probabilities are:
177 | \begin{equation*}
178 | \hat{\pi}_{11} = \frac{n_{11}}{n} \qquad \hat{\pi}_{22} = \frac{n_{22}}{n} \qquad \hat{\pi}_{12} = \hat{\pi}_{21} = \frac{n_{12} + n_{21}}{2n}
179 | \end{equation*}
180 | \end{proposition}
181 | 
182 | \begin{definition}{\textbf{(McNemar's Test)}}
183 | The contributions to the $\chi^2$ statistic from the $n_{11}$ and $n_{22}$ cells are equal to zero. The remainder of the statistic is:
184 | \begin{equation*}
185 | X^2 =\frac{[n_{12} - (n_{12} + n_{21})/2]^2}{(n_{12} + n_{21})/2} + \frac{[n_{21} - (n_{12} + n_{21})/2]^2}{(n_{12} + n_{21})/2} = \frac{(n_{12} - n_{21})^2}{n_{12} + n_{21}}
186 | \end{equation*}
187 | Consider the degrees of freedom: under $\Omega$ there are $3$ free parameters (since the $4$ probabilities are constrained to sum to one). Under the null hypothesis, there is the additional constraint $\pi_{12} = \pi_{21}$, so there are $2$ free parameters. Thus we have $1$ degree of freedom.
188 | \end{definition}
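A minimal numerical sketch of the test (an aside; the discordant counts below are made up):
\begin{verbatim}
# McNemar's test computed directly from the discordant counts n12, n21.
from scipy.stats import chi2

n12, n21 = 25, 10
x2 = (n12 - n21) ** 2 / (n12 + n21)  # McNemar statistic
p_value = chi2.sf(x2, df=1)          # one degree of freedom
print(x2, p_value)
\end{verbatim}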
189 | 
190 | \subsection{Odds Ratios}
191 | 
192 | \begin{definition}{\textbf{(Odds)}}
193 | If an event $A$ has probability $P(A)$ of occurring, the odds of $A$ occurring are defined as (note that this also works with conditional probabilities):
194 | \begin{equation*}
195 | \operatorname{odds}(A) = \frac{P(A)}{1-P(A)} \implies P(A) = \frac{\operatorname{odds}(A)}{1+\operatorname{odds}(A)}
196 | \end{equation*}
197 | \end{definition}
198 | 
199 | \begin{definition}{\textbf{(Odds Ratio)}}
200 | We have the following:
201 | \begin{equation*}
202 | \Delta = \frac{\operatorname{odds}(D | X)}{\operatorname{odds}(D|\bar{X})}
203 | \end{equation*}
204 | where $\bar{X}$ is the complementary event. This measures the influence of some event $X$ on the event $D$.
205 | \end{definition}
206 | 
207 | \begin{remark}{\textbf{(Setting for Test)}}
208 | We consider how the odds and odds ratio could be estimated by sampling from a population with joint and marginal probabilities defined as:
209 | \begin{table}[H]
210 | \centering
211 | \begin{tabular}{lcccc}
212 | \toprule
213 | \textbf{} & $\bar{D}$ & $D$ & Total \\
214 | \midrule
215 | $\bar{X}$ & $\pi_{00}$ & $\pi_{01}$ & $\pi_{0.}$ \\
216 | $X$ & $\pi_{10}$ & $\pi_{11}$ & $\pi_{1.}$ \\
217 | Total & $\pi_{.0}$ & $\pi_{.1}$ & $1$ \\
218 | \bottomrule
219 | \end{tabular}
220 | \end{table}
221 | With this notation, we have:
222 | \begin{equation*}
223 | P(D | X) =\frac{\pi_{11}}{\pi_{10} + \pi_{11}} \qquad P(D|\bar{X}) = \frac{\pi_{01}}{\pi_{00} + \pi_{01}}
224 | \end{equation*}
225 | And so we have:
226 | \begin{equation*}
227 | \operatorname{odds}(D | X) = \frac{\pi_{11}}{\pi_{10}}
228 | \qquad \operatorname{odds}(D | \bar{X}) = \frac{\pi_{01}}{\pi_{00}} \qquad \Delta = \frac{\pi_{11}\pi_{00}}{\pi_{01}\pi_{10}}
229 | \end{equation*}
230 | That is, $\Delta$ is the product of the diagonal probabilities in the preceding table divided by the product of the off-diagonal probabilities.
231 | \end{remark}
232 | 
233 | \begin{remark}{\textbf{(Ways to Sample the Data)}}
234 | \begin{itemize}
235 | \item \emph{Naive Sampling}: We can consider drawing a random sample from the entire population. But if the event $D$ is rare, the total sample size would have to be quite large to guarantee that a substantial number of $D$'s are included.
236 | \item \emph{Prospective Study}: Fixed numbers of events $X$ and $\bar{X}$ are sampled, then the incidences of $D$ are compared. This allows us to compare $P(D|X)$ and $P(D|\bar{X})$, and hence the odds ratio. However, the $\pi_{ij}$ cannot be estimated from the data.
237 | \item \emph{Retrospective Study}: We fix the numbers of $D$ and $\bar{D}$, and compare the numbers of $X$ and $\bar{X}$. We can estimate $P(X|D)$ and $P(X|\bar{D})$ by the corresponding proportions. But we cannot estimate $P(D|X)$, $P(D|\bar{X})$, or the joint probabilities.
238 | \end{itemize}
239 | \end{remark}
240 | 
241 | \begin{proposition}
242 | The odds ratio on the contingency table $\Delta$ can be expressed as:
243 | \begin{equation*}
244 | \Delta = \frac{\operatorname{odds}(X | D)}{\operatorname{odds}(X|\bar{D})}
245 | \end{equation*}
246 | \end{proposition}
247 | \begin{proof}
248 | This follows from the calculation of $P(X|D)$ and $1-P(X|D)$:
249 | \begin{equation*}
250 | P(X | D) = \frac{\pi_{11}}{\pi_{01} + \pi_{11}} \qquad 1 - P(X|D) = \frac{\pi_{01}}{\pi_{01} + \pi_{11}} \qquad \operatorname{odds}(X |D) = \frac{\pi_{11}}{\pi_{01}} \qquad \operatorname{odds}(X | \bar{D}) = \frac{\pi_{10}}{\pi_{00}}
251 | \end{equation*}
252 | We can see that the odds ratio $\Delta$ can be expressed as above, thus completing the proof.
253 | \end{proof}
254 | 
255 | \begin{remark}{\textbf{(Retrospective Study - Odds Ratio)}}
256 | We cannot compute the odds ratio exactly from a retrospective study, but we can estimate it: using the above result, we replace each $\pi_{ij}$ with the corresponding count $n_{ij}$ (the fixed sample sizes cancel in the ratio).
257 | \end{remark}
258 | 
259 | \begin{remark}{\textbf{(Statistical Testing)}}
260 | Since $\hat{\Delta}$ is a non-linear function of the counts, we have to use the bootstrap to construct an approximation of the distribution of $\hat{\Delta}$.
261 | \end{remark}
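The sketch below (an illustrative aside; the counts are made up, and the binomial resampling scheme reflects the retrospective design, where the $D$ and $\bar{D}$ totals are fixed) bootstraps $\hat{\Delta}$:
\begin{verbatim}
# Parametric bootstrap for the odds ratio from a retrospective study.
import numpy as np

rng = np.random.default_rng(0)
n11, n01 = 40, 20   # X and X-bar among the fixed cases (D)
n10, n00 = 25, 45   # X and X-bar among the fixed controls (D-bar)
delta_hat = (n11 * n00) / (n01 * n10)

B = 10_000
c1 = rng.binomial(n11 + n01, n11 / (n11 + n01), size=B)  # resampled n11
c0 = rng.binomial(n10 + n00, n10 / (n10 + n00), size=B)  # resampled n10
# Odds-ratio replicates (degenerate tables give inf/nan; rare here).
deltas = (c1 * (n10 + n00 - c0)) / ((n11 + n01 - c1) * c0)
print(delta_hat, np.nanpercentile(deltas, [2.5, 97.5]))
\end{verbatim}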
262 | 
263 | 
-------------------------------------------------------------------------------- /rice-foundation-fortnight/img/img1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Phutoast/UCL-CSML-Notes/292a123e2dfd17d3ca1a0e268d199cae9632a5b9/rice-foundation-fortnight/img/img1.png -------------------------------------------------------------------------------- /rice-foundation-fortnight/img/img2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Phutoast/UCL-CSML-Notes/292a123e2dfd17d3ca1a0e268d199cae9632a5b9/rice-foundation-fortnight/img/img2.png -------------------------------------------------------------------------------- /rice-foundation-fortnight/img/img3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Phutoast/UCL-CSML-Notes/292a123e2dfd17d3ca1a0e268d199cae9632a5b9/rice-foundation-fortnight/img/img3.png -------------------------------------------------------------------------------- /rice-foundation-fortnight/img/img4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Phutoast/UCL-CSML-Notes/292a123e2dfd17d3ca1a0e268d199cae9632a5b9/rice-foundation-fortnight/img/img4.png -------------------------------------------------------------------------------- /rice-foundation-fortnight/rice-foundation-fortnight.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Phutoast/UCL-CSML-Notes/292a123e2dfd17d3ca1a0e268d199cae9632a5b9/rice-foundation-fortnight/rice-foundation-fortnight.pdf -------------------------------------------------------------------------------- /rice-foundation-fortnight/rice-foundation-fortnight.tex: --------------------------------------------------------------------------------
1 | \documentclass{article}
2 | \usepackage[utf8]{inputenc}
3 | \usepackage{amsmath}
4 | \usepackage{amsthm}
5 | \usepackage{amsfonts}
6 | \usepackage{amssymb}
7 | \usepackage[colorlinks]{hyperref}
8 | \usepackage{natbib}
9 | 
\usepackage{graphicx} 10 | \usepackage{algorithm} 11 | \usepackage{algpseudocode} 12 | \usepackage{booktabs} 13 | \usepackage{caption} 14 | \usepackage{cancel} 15 | \usepackage{hyperref} 16 | \usepackage{subcaption} 17 | 18 | \newtheorem{theorem}{Theorem}[section] 19 | \newtheorem{corollary}{Corollary}[section] 20 | \newtheorem{proposition}{Proposition}[section] 21 | \newtheorem{lemma}{Lemma}[section] 22 | \newtheorem{claim}{Claim}[section] 23 | \newtheorem{conjecture}{Conjecture}[section] 24 | \newtheorem{example}{Example}[section] 25 | 26 | \theoremstyle{definition} 27 | \newtheorem{definition}{Definition}[section] 28 | 29 | \theoremstyle{remark} 30 | \newtheorem{remark}{Remark} 31 | 32 | 33 | \newcommand{\Phu}[1]{{\bf \color{red} [[Phu: #1]]}} 34 | \setlength\parindent{0pt} 35 | \setlength\parskip{5pt} 36 | \usepackage[margin=1.0in]{geometry} 37 | 38 | \newcommand{\dby}{\ \mathrm{d}} 39 | \newcommand{\argmax}[1]{\underset{#1}{\arg\max \ }} 40 | \newcommand{\argmin}[1]{\underset{#1}{\arg\min \ }} 41 | \newcommand{\const}{\text{const.}} 42 | \newcommand{\bracka}[1]{\left( #1 \right)} 43 | \newcommand{\brackb}[1]{\left[ #1 \right]} 44 | \newcommand{\brackc}[1]{\left\{ #1 \right\}} 45 | \newcommand{\brackd}[1]{\left\langle #1 \right\rangle} 46 | \newcommand{\abs}[1]{\left| #1 \right|} 47 | \newcommand{\contractop}{\mathcal{B}} 48 | \newcommand*\circled[1]{\tikz[baseline=(char.base)]{ 49 | \node[shape=circle,draw,inner sep=2pt] (char) {#1};}} 50 | \newcommand{\red}[1]{{\color{red} #1}} 51 | \newcommand{\loss}{\mathcal{L}} 52 | \newcommand{\correctquote}[1]{``#1''} 53 | \newcommand{\norm}[1]{\left\lVert#1\right\rVert} 54 | \newcommand{\ind}{\perp \!\!\! \perp } 55 | 56 | % From https://tex.stackexchange.com/questions/194426/split-itemize-into-multiple-columns 57 | \usepackage{etoolbox,refcount} 58 | \usepackage{multicol} 59 | 60 | \newcounter{countitems} 61 | \newcounter{nextitemizecount} 62 | \newcommand{\setupcountitems}{% 63 | \stepcounter{nextitemizecount}% 64 | \setcounter{countitems}{0}% 65 | \preto\item{\stepcounter{countitems}}% 66 | } 67 | \makeatletter 68 | \newcommand{\computecountitems}{% 69 | \edef\@currentlabel{\number\c@countitems}% 70 | \label{countitems@\number\numexpr\value{nextitemizecount}-1\relax}% 71 | } 72 | \newcommand{\nextitemizecount}{% 73 | \getrefnumber{countitems@\number\c@nextitemizecount}% 74 | } 75 | \newcommand{\previtemizecount}{% 76 | \getrefnumber{countitems@\number\numexpr\value{nextitemizecount}-1\relax}% 77 | } 78 | \makeatother 79 | \newenvironment{AutoMultiColItemize}{% 80 | \ifnumcomp{\nextitemizecount}{>}{3}{\begin{multicols}{2}}{}% 81 | \setupcountitems\begin{itemize}}% 82 | {\end{itemize}% 83 | \unskip\computecountitems\ifnumcomp{\previtemizecount}{>}{3}{\end{multicols}}{}} 84 | 85 | 86 | \title{Statistical Models and Data Analysis} 87 | \author{Phu Sakulwongtana} 88 | \date{} 89 | 90 | \begin{document} 91 | 92 | \maketitle 93 | 94 | \input{contents/part1.tex} 95 | \input{contents/part2.tex} 96 | \input{contents/part3.tex} 97 | \input{contents/part4.tex} 98 | \input{contents/part5.tex} 99 | \input{contents/part6.tex} 100 | \input{contents/part7.tex} 101 | \input{contents/part8.tex} 102 | 103 | % \begin{algorithm}[H] 104 | % \caption{$PSRO_{RN}$} 105 | % \begin{algorithmic}[1] 106 | % \State \textbf{Input}: Initial Population $\mathcal{B}_1$ 107 | % \For {$i=1,2,\cdots, T$} 108 | % \State $p \leftarrow \text{Nash}(A_{\mathcal{B}_i})$ 109 | % \For {agent $v_i$ with positive mass in $p_t$} 110 | % \State $v_{i+1} \leftarrow \text{oracle}(v_i, \sum_{w 
\in \mathcal{B}_i} p[i](\phi_{v_i}(\cdot))_+)$ 111 | % \EndFor 112 | % \State $\mathcal{B}_{i+1} = \mathcal{B} \cup \{v_{i+1} : \text{as updated above}\}$ 113 | % \EndFor 114 | % \end{algorithmic} 115 | % \end{algorithm} 116 | 117 | % \begin{table}[!h] 118 | % \centering 119 | % \begin{tabular}{lc} 120 | % \toprule 121 | % \textbf{Methods/Metrics} & \textbf{Accuracy} \\ 122 | % \midrule 123 | % Logistic Regression & $48.26 \pm 0.0f0$ \\ 124 | % Support Vector Machine & $48.91 \pm 0.00$ \\ 125 | % Random Forest Classifier & $44.38 \pm 1.57$ \\ 126 | % \midrule 127 | % Multi-Dimensional ELO & $34.51 \pm 3.12$ \\ 128 | % TrueSkill\texttrademark & $44.99 \pm 0.00$ \\ 129 | % \bottomrule 130 | % \end{tabular} 131 | 132 | % \caption{} 133 | 134 | % \label{table} 135 | % \end{table} 136 | 137 | % \begin{AutoMultiColItemize} 138 | % \item Item 1 139 | % \item Item 2 140 | % \item Item 3 141 | % \item Item 4 142 | % \item Item 5 143 | % \item Item 6 144 | % \end{AutoMultiColItemize} 145 | 146 | 147 | % \bibliographystyle{plain} 148 | % \bibliography{references} 149 | \end{document} 150 | -------------------------------------------------------------------------------- /stat-analysis/stat-analysis.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Phutoast/UCL-CSML-Notes/292a123e2dfd17d3ca1a0e268d199cae9632a5b9/stat-analysis/stat-analysis.pdf -------------------------------------------------------------------------------- /stat-analysis/stat-analysis.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage[utf8]{inputenc} 3 | \usepackage{amsmath} 4 | \usepackage{amsthm} 5 | \usepackage{amsfonts} 6 | \usepackage[colorlinks]{hyperref} 7 | \usepackage{natbib} 8 | \usepackage{graphicx} 9 | \usepackage{algorithm} 10 | \usepackage{algpseudocode} 11 | \usepackage{booktabs} 12 | \usepackage{caption} 13 | \usepackage{cancel} 14 | \usepackage{hyperref} 15 | \usepackage{subcaption} 16 | \usepackage{minted} 17 | \usepackage[title]{appendix} 18 | 19 | \usepackage{tikz} 20 | \usetikzlibrary{bayesnet} 21 | \usetikzlibrary{arrows} 22 | \usetikzlibrary{calc} 23 | \usetikzlibrary{shadows} 24 | \usetikzlibrary{positioning} 25 | 26 | \newtheorem{theorem}{Theorem}[section] 27 | \newtheorem{corollary}{Corollary}[section] 28 | \newtheorem{proposition}{Proposition}[section] 29 | \newtheorem{lemma}{Lemma}[section] 30 | \newtheorem{claim}{Claim}[section] 31 | \newtheorem{conjecture}{Conjecture}[section] 32 | \newtheorem{example}{Example}[section] 33 | 34 | \theoremstyle{definition} 35 | \newtheorem{definition}{Definition}[section] 36 | 37 | \theoremstyle{remark} 38 | \newtheorem{remark}{Remark} 39 | 40 | 41 | \newcommand{\Phu}[1]{{\bf \color{red} [[Phu: #1]]}} 42 | \setlength\parindent{0pt} 43 | \setlength\parskip{5pt} 44 | \usepackage[margin=1.0in]{geometry} 45 | 46 | \newcommand{\dby}{\ \mathrm{d}} 47 | \newcommand{\argmax}[1]{\underset{#1}{\arg\max \ }} 48 | \newcommand{\argmin}[1]{\underset{#1}{\arg\min \ }} 49 | \newcommand{\const}{\text{const.}} 50 | \newcommand{\bracka}[1]{\left( #1 \right)} 51 | \newcommand{\brackb}[1]{\left[ #1 \right]} 52 | \newcommand{\brackc}[1]{\left\{ #1 \right\}} 53 | \newcommand{\brackd}[1]{\left\langle #1 \right\rangle} 54 | \newcommand{\abs}[1]{\left| #1 \right|} 55 | \newcommand*\circled[1]{\tikz[baseline=(char.base)]{ 56 | \node[shape=circle,draw,inner sep=2pt] (char) {#1};}} 57 | \newcommand{\red}[1]{{\color{red} #1}} 58 | 
\newcommand{\loss}{\mathcal{L}}
59 | \newcommand{\correctquote}[1]{``#1''}
60 | \newcommand{\norm}[1]{\left\lVert#1\right\rVert}
61 | 
62 | % From https://tex.stackexchange.com/questions/194426/split-itemize-into-multiple-columns
63 | \usepackage{etoolbox,refcount}
64 | \usepackage{multicol}
65 | 
66 | \newcounter{countitems}
67 | \newcounter{nextitemizecount}
68 | \newcommand{\setupcountitems}{%
69 | \stepcounter{nextitemizecount}%
70 | \setcounter{countitems}{0}%
71 | \preto\item{\stepcounter{countitems}}%
72 | }
73 | \makeatletter
74 | \newcommand{\computecountitems}{%
75 | \edef\@currentlabel{\number\c@countitems}%
76 | \label{countitems@\number\numexpr\value{nextitemizecount}-1\relax}%
77 | }
78 | \newcommand{\nextitemizecount}{%
79 | \getrefnumber{countitems@\number\c@nextitemizecount}%
80 | }
81 | \newcommand{\previtemizecount}{%
82 | \getrefnumber{countitems@\number\numexpr\value{nextitemizecount}-1\relax}%
83 | }
84 | \makeatother
85 | \newenvironment{AutoMultiColItemize}{%
86 | \ifnumcomp{\nextitemizecount}{>}{3}{\begin{multicols}{2}}{}%
87 | \setupcountitems\begin{itemize}}%
88 | {\end{itemize}%
89 | \unskip\computecountitems\ifnumcomp{\previtemizecount}{>}{3}{\end{multicols}}{}}
90 | 
91 | 
92 | \title{Statistics and Data Analysis}
93 | \author{Phu Sakulwongtana}
94 | \date{}
95 | 
96 | \begin{document}
97 | 
98 | \maketitle
99 | 
100 | \input{contents/part1.tex}
101 | 
102 | \end{document}
103 | 
-------------------------------------------------------------------------------- /supervised-learning/contents/part2.tex: --------------------------------------------------------------------------------
1 | \section{Kernel and Regression}
2 | 
3 | \subsection{Introduction}
4 | 
5 | \begin{definition}{\textbf{(Convex Set)}}
6 | A set $\mathcal{X}$ is convex if for all $\boldsymbol p, \boldsymbol q \in \mathcal{X}$ and $\alpha \in [0, 1]$ we have $\alpha \boldsymbol p + (1-\alpha)\boldsymbol q \in \mathcal{X}$.
7 | \end{definition}
8 | 
9 | \begin{definition}{\textbf{(Convex Function)}}
10 | A function $f : \mathcal{X}\rightarrow \mathbb{R}$ on a convex set $\mathcal{X}$ is convex iff for all $\boldsymbol p, \boldsymbol q \in \mathcal{X}$ and $\alpha\in(0, 1)$ we have:
11 | \begin{equation*}
12 | f(\alpha \boldsymbol p + (1-\alpha)\boldsymbol q) \le \alpha f(\boldsymbol p) + (1-\alpha)f(\boldsymbol q)
13 | \end{equation*}
14 | A function $f$ is concave if $-f$ is convex. A function is \emph{strictly convex} if we replace $\le$ with $<$.
15 | \end{definition}
16 | 
17 | \begin{remark}{\textbf{(Various Comments on Convex Functions)}}
18 | We have the following results on convex functions:
19 | \begin{itemize}
20 | \item If $f$ and $g$ are convex, then $f + g$ is convex.
21 | \item If $f$ is convex and $g$ is affine (linear + constant), then $f(g(\cdot))$ is convex.
22 | \item Suppose $\boldsymbol M$ is a symmetric matrix; then $\boldsymbol M$ is a positive semi-definite matrix iff $f(\boldsymbol x) = \boldsymbol x^T\boldsymbol M\boldsymbol x$ is convex (see the numerical sanity check after this remark).
23 | \item Every sublevel set $\brackc{\boldsymbol x : f(\boldsymbol x) \le c}$, where $c \in \mathbb{R}$, of a convex function $f$ is convex. (The level set $\brackc{\boldsymbol x : f(\boldsymbol x) = c}$ need not be: for $f(x)=x^2$ and $c=1$ it is $\brackc{-1,1}$.)
24 | \item For $f: (a, b) \rightarrow \mathbb{R}$, if $f''\ge0$ then $f$ is convex.
25 | \item For $f: \mathcal{X} \subseteq \mathbb{R}^n \rightarrow \mathbb{R}$, if $\nabla^2 f(\boldsymbol x)\succeq \boldsymbol 0$ for all $\boldsymbol x \in \mathcal{X}$, then $f$ is convex.
26 | \end{itemize}
27 | \end{remark}
28 | 
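These properties are easy to sanity-check numerically; the sketch below (an aside, with randomly generated data) verifies the quadratic-form criterion and Jensen's inequality:
\begin{verbatim}
# Check: f(x) = x^T M x is convex iff the symmetric M is PSD.
import numpy as np

rng = np.random.default_rng(0)
A = rng.standard_normal((4, 4))
M = A @ A.T                                   # symmetric PSD by construction
print(np.linalg.eigvalsh(M).min() >= -1e-12)  # True

f = lambda x: x @ M @ x
p, q = rng.standard_normal(4), rng.standard_normal(4)
alpha = 0.3
print(f(alpha*p + (1-alpha)*q) <= alpha*f(p) + (1-alpha)*f(q))  # True
\end{verbatim}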
29 | \subsection{Ridge Regression}
30 | 
31 | \begin{definition}{\textbf{(Ridge Regression Problem)}}
32 | Given a function $f(\boldsymbol x) = \boldsymbol w^T\boldsymbol x$ with a dataset:
33 | \begin{equation*}
34 | \mathcal{S} = \brackc{(\boldsymbol x_1, y_1),\dots,(\boldsymbol x_m, y_m)} \subset \mathbb{R}^n \times \mathbb{R}
35 | \end{equation*}
36 | Assume the dataset is generated by an unknown function $g$, i.e., the pairs are $(\boldsymbol x, g(\boldsymbol x))$. Suppose that the vectors $\boldsymbol x_i$ are linearly independent with $m=n$; then there is a unique solution, whose parameter $\boldsymbol w$ solves:
37 | \begin{equation*}
38 | \boldsymbol X \boldsymbol w = \boldsymbol y
39 | \end{equation*}
40 | where $\boldsymbol y = (y_1,\dots, y_m)^T$ and $\boldsymbol X = [\boldsymbol x_1,\dots, \boldsymbol x_m]^T \in \mathbb{R}^{m\times n}$.
41 | \end{definition}
42 | 
43 | \begin{definition}{\textbf{(Well-Posed)}}
44 | A solution/problem is called well-posed if the solution exists, is unique, and depends continuously on the data. Regularization theory gives a general framework for solving ill-posed problems (we can choose a term that penalizes complex functions).
45 | \end{definition}
46 | 
47 | \begin{definition}{\textbf{(Regularized Empirical Error)}}
48 | We minimize the following regularized empirical error:
49 | \begin{equation*}
50 | \begin{aligned}
51 | \mathcal{E}_{\text{emp},\lambda}(\boldsymbol w) &= \sum^m_{i=1}(y_i - \boldsymbol w^T\boldsymbol x_i)^2 + \lambda \sum^n_{i=1} w_i^2 \\
52 | &= (\boldsymbol y - \boldsymbol X\boldsymbol w)^T(\boldsymbol y-\boldsymbol X\boldsymbol w) + \lambda \norm{\boldsymbol w}^2_2
53 | \end{aligned}
54 | \end{equation*}
55 | The parameter $\lambda > 0$ defines the trade-off between the error and the norm of the vector $\boldsymbol w$ (which restricts the complexity of the model).
56 | \end{definition}
57 | 
58 | \begin{proposition}
59 | Solving the regularized empirical error by setting its gradient to $\boldsymbol 0$ gives us:
60 | \begin{equation*}
61 | \boldsymbol w = (\boldsymbol X^T\boldsymbol X + \lambda \boldsymbol I_n)^{-1}\boldsymbol X^T\boldsymbol y
62 | \end{equation*}
63 | Furthermore, we can show that the weight is $\boldsymbol w = \sum^m_{i=1}\alpha_i\boldsymbol x_i$ and the solution can be written as:
64 | \begin{equation*}
65 | f(\boldsymbol x) = \sum^m_{i=1} \alpha_i \boldsymbol x_i^T\boldsymbol x
66 | \end{equation*}
67 | where $\boldsymbol \alpha = (\boldsymbol X\boldsymbol X^T + \lambda \boldsymbol I_m)^{-1}\boldsymbol y$. This is called the dual form, while $f(\boldsymbol x) = \boldsymbol w^T\boldsymbol x$ is called the primal form.
68 | \end{proposition}
69 | \begin{proof}
70 | Starting with the derivative, we have:
71 | \begin{equation*}
72 | \nabla \mathcal{E}_{\text{emp}, \lambda} (\boldsymbol w) = -2\boldsymbol X^T(\boldsymbol y - \boldsymbol X\boldsymbol w) + 2\lambda\boldsymbol w = \boldsymbol 0
73 | \end{equation*}
74 | which implies the weight of the first form, i.e., $\boldsymbol w = (\boldsymbol X^T\boldsymbol X + \lambda \boldsymbol I_n)^{-1}\boldsymbol X^T\boldsymbol y$. Now, we can also see that:
75 | \begin{equation*}
76 | \boldsymbol w = \frac{\boldsymbol X^T(\boldsymbol y - \boldsymbol X\boldsymbol w)}{\lambda}
77 | \end{equation*}
78 | Assume the dual form of the weight $\boldsymbol w = \sum^m_{i=1}\alpha_i\boldsymbol x_i$, so that:
79 | \begin{equation*}
80 | \alpha_i = \frac{y_i - \boldsymbol w^T\boldsymbol x_i}{\lambda} = \frac{y_i - (\sum^m_{j=1}\alpha_j\boldsymbol x_j)^T\boldsymbol x_i}{\lambda}
81 | \end{equation*}
82 | Now solving for the value of $y_i$:
83 | \begin{equation*}
84 | \begin{aligned}
85 | y_i &= \bracka{\sum^m_{j=1}\alpha_j\boldsymbol x_j}^T \boldsymbol x_i + \lambda\alpha_i \\
86 | &= \sum^m_{j=1}(\boldsymbol x_i^T\boldsymbol x_j + \lambda\delta_{ij}) \alpha_j
87 | \end{aligned}
88 | \end{equation*}
89 | and so we have $(\boldsymbol X\boldsymbol X^T + \lambda \boldsymbol I_m)\boldsymbol \alpha = \boldsymbol y$.
90 | \end{proof}
91 | 
92 | \begin{remark}{\textbf{(Advantage of Dual Form)}}
93 | The dual form allows us to gain a computational advantage at both training and testing time:
94 | \begin{itemize}
95 | \item \emph{Training Time}: Solving for $\boldsymbol w$ in the primal form requires $\mathcal{O}(mn^2 + n^3)$ operations, while solving the dual form requires $\mathcal{O}(nm^2 + m^3)$; if $m\ll n$ the dual is more efficient than the primal.
96 | \item \emph{Testing Time}: Computing $f(\boldsymbol x)$ at a test vector $\boldsymbol x$ in the primal form requires $\mathcal{O}(n)$ operations, but the dual form requires $\mathcal{O}(nm)$ operations.
97 | \end{itemize}
98 | \end{remark}
99 | 
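As a sanity check (an aside with synthetic data; \texttt{lam} stands for $\lambda$), the primal and dual solutions give identical predictions:
\begin{verbatim}
# Primal vs dual ridge regression in numpy.
import numpy as np

rng = np.random.default_rng(0)
m, n, lam = 50, 5, 0.1
X = rng.standard_normal((m, n))
y = rng.standard_normal(m)

w = np.linalg.solve(X.T @ X + lam * np.eye(n), X.T @ y)  # primal
alpha = np.linalg.solve(X @ X.T + lam * np.eye(m), y)    # dual
x_test = rng.standard_normal(n)
print(w @ x_test, alpha @ (X @ x_test))  # agree up to rounding
\end{verbatim}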
100 | \subsection{Basis/Kernel Functions}
101 | 
102 | \begin{definition}{\textbf{(Basis/Feature Function)}}
103 | We have the function $\boldsymbol \phi : \mathbb{R}^n\rightarrow \mathbb{R}^N$ given by:
104 | \begin{equation*}
105 | \boldsymbol \phi(\boldsymbol x) = \Big( \phi_1(\boldsymbol x),\dots, \phi_N(\boldsymbol x) \Big)^T
106 | \end{equation*}
107 | for $\boldsymbol x\in \mathbb{R}^n$, where $\phi_1,\dots,\phi_N$ are called basis functions and $\boldsymbol \phi(\boldsymbol x)$ is called the feature vector; the feature space is defined by $\brackc{\boldsymbol \phi(\boldsymbol x) : \boldsymbol x \in \mathbb{R}^n}$.
108 | \end{definition}
109 | 
110 | \begin{remark}
111 | We can use the feature map $\boldsymbol \phi(\boldsymbol x)$ of the data instead of the raw data. This gives us many advantages, for example:
112 | \begin{itemize}
113 | \item The map $\boldsymbol \phi(\boldsymbol x) = (\boldsymbol x, 1)^T$ allows us to have a bias term.
114 | \item The map $\boldsymbol \phi(\boldsymbol x) = (x_1x_2)^T$ allows us to consider interactions between inputs (individual elements).
115 | \end{itemize}
116 | We can also consider the second-order correlations for $\boldsymbol x\in \mathbb{R}^n$ as:
117 | \begin{equation*}
118 | \boldsymbol \phi(\boldsymbol x) = (x_1x_1,x_1x_2,\dots,x_1x_n,x_2x_2,x_2x_3,\dots,x_2x_n,\dots,x_nx_n)^T
119 | \end{equation*}
120 | now the feature vector has size $(n^2 + n)/2$. However, if we consider the inner product, we will have:
121 | \begin{equation*}
122 | \begin{aligned}
123 | \brackd{\boldsymbol \phi(\boldsymbol x), \boldsymbol \phi(\boldsymbol t)} &= (x_1x_1,x_1x_2,\dots,x_nx_n)^T(t_1t_1,t_1t_2,\dots,t_nt_n) \\
124 | &= (x_1t_1+\cdots + x_nt_n) (x_1t_1+\cdots + x_nt_n) \\
125 | &= (\boldsymbol x^T\boldsymbol t)^2
126 | \end{aligned}
127 | \end{equation*}
128 | (strictly, for this equality to hold the cross terms $x_ix_j$ with $i<j$ should be weighted by $\sqrt{2}$, or equivalently listed twice). Note that computing $(\boldsymbol x^T\boldsymbol t)^2$ takes $\mathcal{O}(n)$ operations, while the naive computation in feature space takes $\mathcal{O}(n^2)$. This decreases the computational complexity (please see the dual form too).
129 | \end{remark}
130 | 
131 | \begin{definition}{\textbf{(Kernel Function)}}
132 | Given a feature map $\boldsymbol \phi$, we define the associated kernel function $k : \mathbb{R}^n\times \mathbb{R}^n \rightarrow \mathbb{R}$ as:
133 | \begin{equation*}
134 | k(\boldsymbol x, \boldsymbol t) = \brackd{\boldsymbol \phi(\boldsymbol x), \boldsymbol \phi(\boldsymbol t)}
135 | \end{equation*}
136 | Note that computing $k(\boldsymbol x, \boldsymbol t)$ need not require computing $\boldsymbol \phi(\boldsymbol x)$ explicitly.
137 | \end{definition}
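A small numerical check of the kernel trick (an aside; the feature map below weights the $i<j$ monomials by $\sqrt{2}$ so that the inner product matches $(\boldsymbol x^T\boldsymbol t)^2$ exactly):
\begin{verbatim}
# Quadratic kernel vs explicit second-order feature map.
import numpy as np
from itertools import combinations_with_replacement

def phi(x):
    feats = [(1.0 if i == j else np.sqrt(2.0)) * x[i] * x[j]
             for i, j in combinations_with_replacement(range(len(x)), 2)]
    return np.array(feats)

rng = np.random.default_rng(0)
x, t = rng.standard_normal(4), rng.standard_normal(4)
print((x @ t) ** 2, phi(x) @ phi(t))  # equal up to rounding
\end{verbatim}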
138 | 
139 | \begin{remark}{\textbf{(Feature Map not Unique)}}
140 | The feature map isn't unique. Consider the $\boldsymbol \phi$ that is associated with kernel $k$, and let $\hat{\boldsymbol \phi} = \boldsymbol U\boldsymbol \phi$, where $\boldsymbol U \in \mathbb{R}^{N\times N}$ is orthogonal. The features can differ in values (and, more generally, in dimension) but give rise to the same kernel:
141 | \begin{equation*}
142 | (\boldsymbol U\boldsymbol \phi)^T(\boldsymbol U\boldsymbol \phi) = \boldsymbol \phi^T\boldsymbol \phi
143 | \end{equation*}
144 | \end{remark}
145 | 
146 | \begin{theorem}{\textbf{(Representer)}}
147 | Consider the loss:
148 | \begin{equation*}
149 | \mathcal{E}_{\text{emp}, \lambda}(\boldsymbol w) = \sum^m_{i=1} V(y_i, \brackd{\boldsymbol w, \boldsymbol \phi(\boldsymbol x_i)}) + \lambda\brackd{\boldsymbol w, \boldsymbol w}
150 | \end{equation*}
151 | where $V: \mathbb{R}\times \mathbb{R}\rightarrow \mathbb{R}$ is a loss function. If $V$ is differentiable with respect to its second argument and $\boldsymbol w$ is a minimizer of $\mathcal{E}_{\text{emp},\lambda}$, then $\boldsymbol w$ has the form:
152 | \begin{equation*}
153 | \boldsymbol w = \sum^m_{i=1}\alpha_i\boldsymbol \phi(\boldsymbol x_i) \implies f(\boldsymbol x) = \brackd{\boldsymbol w, \boldsymbol \phi(\boldsymbol x)} = \sum^m_{i=1}\alpha_ik(\boldsymbol x_i, \boldsymbol x)
154 | \end{equation*}
155 | \end{theorem}
156 | \begin{proof}
157 | The proof is similar to the dual form. Setting the derivative of $\mathcal{E}_{\text{emp},\lambda}$ with respect to $\boldsymbol w$ to zero, we have:
158 | \begin{equation*}
159 | \sum^m_{i=1}V'(y_i, \brackd{\boldsymbol w, \boldsymbol \phi(\boldsymbol x_i)})\boldsymbol \phi(\boldsymbol x_i) + 2\lambda\boldsymbol w = 0
160 | \end{equation*}
161 | Comparing with $\boldsymbol w = \sum^m_{i=1}\alpha_i\boldsymbol \phi(\boldsymbol x_i)$, we can see that:
162 | \begin{equation*}
163 | \alpha_i = -\frac{1}{2\lambda}V'(y_i, \brackd{\boldsymbol w, \boldsymbol \phi(\boldsymbol x_i)})
164 | \end{equation*}
165 | From the definition of $\boldsymbol w$, we can see that:
166 | \begin{equation*}
167 | \alpha_i = -\frac{1}{2\lambda}V'\bracka{y_i, \sum^m_{j=1}k(\boldsymbol x_i, \boldsymbol x_j)\alpha_j}
168 | \end{equation*}
169 | for $i=1,\dots,m$. Finding $\boldsymbol \alpha$ can then be done by solving the following optimization problem:
170 | \begin{equation*}
171 | \argmin{\boldsymbol \alpha} \sum^m_{i=1} V(y_i, (\boldsymbol K\boldsymbol \alpha)_i) + \lambda\boldsymbol \alpha^T\boldsymbol K\boldsymbol \alpha
172 | \end{equation*}
173 | \end{proof}
174 | 
175 | \begin{definition}{\textbf{(Positive Semi-Definite Kernel)}}
176 | The kernel $k : \mathbb{R}^n\times \mathbb{R}^n \rightarrow \mathbb{R}$ is positive semi-definite if it is symmetric and, given any set of points $\brackc{\boldsymbol x_1,\dots,\boldsymbol x_m}$, the matrix:
177 | \begin{equation*}
178 | \begin{bmatrix}
179 | k(\boldsymbol x_1,\boldsymbol x_1) & \cdots & k(\boldsymbol x_1, \boldsymbol x_m) \\
180 | \vdots & \ddots & \vdots \\
181 | k(\boldsymbol x_m,\boldsymbol x_1) & \cdots & k(\boldsymbol x_m, \boldsymbol x_m) \\
182 | \end{bmatrix}
183 | \end{equation*}
184 | is positive semi-definite.
185 | \end{definition}
186 | 
187 | \begin{theorem}
188 | A kernel $k$ is positive semi-definite iff:
189 | \begin{equation*}
190 | k(\boldsymbol x, \boldsymbol t) = \brackd{\boldsymbol \phi(\boldsymbol x), \boldsymbol \phi(\boldsymbol t)}
191 | \end{equation*}
192 | for all $\boldsymbol x, \boldsymbol t \in \mathbb{R}^n$, for some feature map $\boldsymbol \phi : \mathbb{R}^n \rightarrow \mathcal{W}$ into a Hilbert space $\mathcal{W}$.
193 | \end{theorem}
194 | \begin{proof}
195 | We will consider only one direction. If $k(\boldsymbol x, \boldsymbol t) = \brackd{\boldsymbol \phi(\boldsymbol x), \boldsymbol \phi(\boldsymbol t)}$, then we have:
196 | \begin{equation*}
197 | \sum^m_{i=1}\sum^m_{j=1}c_ic_jk(\boldsymbol x_i, \boldsymbol x_j) = \brackd{\sum^m_{i=1}c_i\boldsymbol \phi(\boldsymbol x_i), \sum^m_{j=1}c_j\boldsymbol \phi(\boldsymbol x_j)} =\norm{\sum^m_{i=1}c_i\boldsymbol \phi(\boldsymbol x_i)}^2 \ge 0
198 | \end{equation*}
199 | \end{proof}
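As a numerical illustration of the definition (an aside with random points), the Gaussian-kernel Gram matrix is symmetric with non-negative eigenvalues:
\begin{verbatim}
# PSD check for a Gaussian-kernel Gram matrix.
import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((20, 3))
beta = 0.5
sq = ((X[:, None, :] - X[None, :, :]) ** 2).sum(-1)  # pairwise ||x - t||^2
K = np.exp(-beta * sq)
print(np.allclose(K, K.T), np.linalg.eigvalsh(K).min() >= -1e-10)
\end{verbatim}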
200 | 
201 | \begin{definition}{\textbf{(Polynomial Kernel)}}
202 | If $p:\mathbb{R}\rightarrow \mathbb{R}$ is a polynomial with non-negative coefficients, then $k(\boldsymbol x, \boldsymbol t) = p (\boldsymbol x^T\boldsymbol t)$, where $\boldsymbol x, \boldsymbol t \in \mathbb{R}^n$, is a positive semi-definite kernel.
203 | \end{definition}
204 | 
205 | \begin{proposition}
206 | If $\boldsymbol A$ is an $n\times n$ positive semi-definite matrix, the function $k : \mathbb{R}^n\times \mathbb{R}^n \rightarrow \mathbb{R}$ defined by:
207 | \begin{equation*}
208 | k(\boldsymbol x, \boldsymbol t) = \boldsymbol x^T\boldsymbol A\boldsymbol t
209 | \end{equation*}
210 | is called a generalized linear kernel, and it is a positive semi-definite kernel.
211 | \end{proposition}
212 | \begin{proof}
213 | Since $\boldsymbol A$ is positive semi-definite, we can write $\boldsymbol A$ in the form $\boldsymbol A=\boldsymbol R\boldsymbol R^T$ for some $\boldsymbol R \in \mathbb{R}^{n\times n}$. Thus, $k$ is represented by the feature map $\boldsymbol \phi(\boldsymbol x) = \boldsymbol R^T\boldsymbol x$. As we can see:
214 | \begin{equation*}
215 | \begin{aligned}
216 | \sum_{ij} c_ic_j\boldsymbol x_i^T\boldsymbol A\boldsymbol x_j &= \sum_{ij}c_ic_j(\boldsymbol R^T\boldsymbol x_i)^T(\boldsymbol R^T\boldsymbol x_j) \\
217 | &= \sum_i c_i[\boldsymbol R^T\boldsymbol x_i]^T\brackb{\sum_j c_j(\boldsymbol R^T\boldsymbol x_j)} = \norm{\sum_i c_i\boldsymbol R^T\boldsymbol x_i}^2 \ge 0
218 | \end{aligned}
219 | \end{equation*}
220 | \end{proof}
221 | 
222 | \begin{proposition}
223 | If $k:\mathbb{R}^N\times \mathbb{R}^N\rightarrow \mathbb{R}$ is a positive semi-definite kernel and $\boldsymbol \phi: \mathbb{R}^n \rightarrow \mathbb{R}^N$, define:
224 | \begin{equation*}
225 | \tilde{k}(\boldsymbol x, \boldsymbol t) = k(\boldsymbol \phi(\boldsymbol x), \boldsymbol \phi(\boldsymbol t))
226 | \end{equation*}
227 | Then the kernel $\tilde{k}:\mathbb{R}^n\times \mathbb{R}^n \rightarrow \mathbb{R}$ is a positive semi-definite kernel.
228 | \end{proposition}
229 | 
230 | \begin{proposition}
231 | Given positive semi-definite kernels $k_1$ and $k_2$, $ak_1$ is a positive semi-definite kernel if $a>0$, and $k_1 + k_2$ is also a positive semi-definite kernel.
232 | \end{proposition}
233 | 
234 | \begin{proposition}
235 | The following product of the kernels $k_1$ and $k_2$:
236 | \begin{equation*}
237 | k(\boldsymbol x, \boldsymbol t) = k_1(\boldsymbol x, \boldsymbol t)k_2(\boldsymbol x, \boldsymbol t)
238 | \end{equation*}
239 | where $\boldsymbol x, \boldsymbol t \in \mathbb{R}^d$, is a kernel.
240 | \end{proposition}
241 | \begin{proof}
242 | For the product of kernels, we have:
243 | \begin{itemize}
244 | \item We want to show that for positive semi-definite $\boldsymbol A$ and $\boldsymbol B$, the Hadamard product $\boldsymbol C = \boldsymbol A\odot \boldsymbol B$ is positive semi-definite.
245 | \item Since $\boldsymbol A$ and $\boldsymbol B$ are positive semi-definite, they can be factorized as $\boldsymbol A = \boldsymbol U\boldsymbol U^T$ and $\boldsymbol B = \boldsymbol V\boldsymbol V^T$ for $\boldsymbol U, \boldsymbol V \in \mathbb{R}^{n\times n}$, so we have:
246 | \begin{equation*}
247 | \begin{aligned}
248 | \sum^n_{i=1}\sum^n_{j=1} z_iz_jC_{ij} &= \sum^n_{i=1}\sum_{j=1}^n z_iz_j\bracka{\sum^n_{r=1} U_{ir}U_{jr}}\bracka{\sum^n_{s=1} V_{is}V_{js}} \\
249 | &= \sum^n_{i=1}\sum_{j=1}^n\sum^n_{r=1}\sum^n_{s=1}z_iz_jU_{ir}U_{jr}V_{is}V_{js} \\
250 | &= \sum^n_{r=1}\sum^n_{s=1}\sum^n_{i=1}\sum_{j=1}^nz_iz_jU_{ir}U_{jr}V_{is}V_{js} \\
251 | &= \sum^n_{r=1}\sum^n_{s=1}\bracka{\sum_{i=1}^nz_iU_{ir}V_{is}}\bracka{\sum^n_{j=1}z_jU_{jr}V_{js}} = \sum^n_{r=1}\sum^n_{s=1}\bracka{\sum_{i=1}^n z_iU_{ir}V_{is}}^2 \ge 0
252 | \end{aligned}
253 | \end{equation*}
254 | This completes the proof, and it also shows that the polynomial kernel is a positive semi-definite kernel.
255 | \end{itemize}
256 | \end{proof}
257 | 
258 | \begin{remark}{\textbf{(Several Kernels)}}
259 | We have the following positive semi-definite kernels, where $a\ge 0$:
260 | \begin{itemize}
261 | \item $k(\boldsymbol x, \boldsymbol t) = (\boldsymbol x^T\boldsymbol t)^r$
262 | \item $k(\boldsymbol x, \boldsymbol t) = (a + \boldsymbol x^T\boldsymbol t)^r$
263 | \item $k(\boldsymbol x, \boldsymbol t) = \sum^d_{i=1}(a^i/i!)(\boldsymbol x^T\boldsymbol t)^i$
264 | \item Gaussian kernel: $k(\boldsymbol x, \boldsymbol t) = \exp(-\beta\norm{\boldsymbol x - \boldsymbol t}^2)$ for $\beta>0$ and data $\boldsymbol x, \boldsymbol t \in \mathbb{R}^n$ (it has an infinite-dimensional feature map)
265 | \item ANOVA kernel: $k(\boldsymbol x, \boldsymbol t) = \prod^n_{i=1}(1 + x_it_i)$
266 | \end{itemize}
267 | \end{remark}
268 | 
269 | \begin{remark}
270 | Consider the following polynomial kernel:
271 | \begin{equation*}
272 | \sum^d_{i=1} \frac{a^i}{i!}(\boldsymbol x^T\boldsymbol t)^i
273 | \end{equation*}
274 | As $d\to\infty$, this converges uniformly on bounded sets to $\exp(a\boldsymbol x^T\boldsymbol t)$, showing that the latter is a kernel; if $n=1$, the feature map is:
275 | \begin{equation*}
276 | \phi(x) = \bracka{1 , \sqrt{a} x, \sqrt{\frac{a^2}{2}}x^2, \sqrt{\frac{a^3}{6}}x^3, \cdots} = \bracka{\sqrt{\frac{a^i}{i!}}x^i : i \in \mathbb{N}}
277 | \end{equation*}
278 | \end{remark}
279 | 
280 | \begin{definition}{\textbf{(Translation Invariant/Radial Kernel)}}
281 | We say that a kernel $k : \mathbb{R}^d \times \mathbb{R}^d \rightarrow \mathbb{R}$ is:
282 | \begin{itemize}
283 | \item \emph{Translation invariant}, if the kernel has the form:
284 | \begin{equation*}
285 | k(\boldsymbol x, \boldsymbol t) = H(\boldsymbol x - \boldsymbol t)
286 | \end{equation*}
287 | for all $\boldsymbol x, \boldsymbol t \in \mathbb{R}^d$, where $H : \mathbb{R}^d \rightarrow \mathbb{R}$ is a differentiable function.
288 | \item \emph{Radial}, if the kernel has the form:
289 | \begin{equation*}
290 | k(\boldsymbol x, \boldsymbol t) = h(\norm{\boldsymbol x- \boldsymbol t})
291 | \end{equation*}
292 | for all $\boldsymbol x, \boldsymbol t \in \mathbb{R}^d$, where $h : [0, \infty)\rightarrow \mathbb{R}$ is a differentiable function.
293 | \end{itemize}
294 | \end{definition}
295 | 
296 | \begin{remark}
297 | An important example of a radial kernel is the Gaussian kernel:
298 | \begin{equation*}
299 | k(\boldsymbol x, \boldsymbol t) = \exp(-\beta\norm{\boldsymbol x- \boldsymbol t}^2)
300 | \end{equation*}
301 | which is a product of $2$ kernels, as $k(\boldsymbol x, \boldsymbol t) = \exp(-\beta(\boldsymbol x^T\boldsymbol x + \boldsymbol t^T\boldsymbol t))\exp(2\beta\boldsymbol x^T\boldsymbol t)$.
302 | \end{remark}
303 | 
304 | \begin{remark}{\textbf{(Ridge Regression with Feature Map)}}
305 | Given the dataset $\boldsymbol X \in \mathbb{R}^{m\times n}$ and $\boldsymbol y \in \mathbb{R}^{m}$, we start with basis functions $\phi_1,\dots,\phi_N$, where $\phi_i : \mathbb{R}^n \rightarrow \mathbb{R}$, and the map:
306 | \begin{equation*}
307 | \boldsymbol \Phi = \begin{bmatrix}
308 | \phi_1(\boldsymbol x_1) & \cdots & \phi_N(\boldsymbol x_1) \\
309 | \vdots & \ddots & \vdots \\
310 | \phi_1(\boldsymbol x_m) & \cdots & \phi_N(\boldsymbol x_m) \\
311 | \end{bmatrix} \in \mathbb{R}^{m\times N}
312 | \end{equation*}
313 | The regression coefficients are then $\boldsymbol w = (\boldsymbol \Phi^T\boldsymbol \Phi + \lambda \boldsymbol I_N)^{-1}\boldsymbol \Phi^T\boldsymbol y$.
314 | \end{remark}
315 | 
316 | \begin{remark}{\textbf{(Kernel Ridge Regression)}}
317 | Given the same setting and a kernel function $k : \mathbb{R}^n\times \mathbb{R}^n \rightarrow \mathbb{R}$, the kernel matrix is given by:
318 | \begin{equation*}
319 | \boldsymbol K = \begin{bmatrix}
320 | k(\boldsymbol x_1,\boldsymbol x_1) & \cdots & k(\boldsymbol x_1, \boldsymbol x_m) \\
321 | \vdots & \ddots & \vdots \\
322 | k(\boldsymbol x_m,\boldsymbol x_1) & \cdots & k(\boldsymbol x_m, \boldsymbol x_m) \\
323 | \end{bmatrix} \in \mathbb{R}^{m\times m}
324 | \end{equation*}
325 | The dual coefficients are then given by $\boldsymbol \alpha = (\boldsymbol K + \lambda \boldsymbol I_m)^{-1}\boldsymbol y$, and the fitted function is:
326 | \begin{equation*}
327 | \hat{y}(\boldsymbol x) = \sum^m_{i=1}\alpha_i k(\boldsymbol x_i, \boldsymbol x)
328 | \end{equation*}
329 | \end{remark}
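Putting the pieces together (an aside; the one-dimensional data, $\beta$, and $\lambda$ below are illustrative), kernel ridge regression is a few lines of numpy:
\begin{verbatim}
# Kernel ridge regression with the Gaussian kernel on toy 1-d data.
import numpy as np

rng = np.random.default_rng(0)
m, beta, lam = 30, 10.0, 1e-3
x = rng.uniform(0, 1, m)
y = np.sin(2 * np.pi * x) + 0.1 * rng.standard_normal(m)

def k(a, b):  # Gaussian kernel matrix between two sets of points
    return np.exp(-beta * (a[:, None] - b[None, :]) ** 2)

alpha = np.linalg.solve(k(x, x) + lam * np.eye(m), y)  # dual coefficients
x_test = np.linspace(0, 1, 5)
print(k(x_test, x) @ alpha)  # y_hat(x) = sum_i alpha_i k(x_i, x)
\end{verbatim}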
330 | 
-------------------------------------------------------------------------------- /supervised-learning/contents/part3.tex: --------------------------------------------------------------------------------
1 | \section{Support Vector Machine}
2 | 
3 | \subsection{Forming Problems}
4 | 
5 | \begin{definition}{\textbf{(Separating Hyperplane)}}
6 | Let the dataset be $S = \brackc{(\boldsymbol x_i, y_i)}^m_{i=1} \subset \mathbb{R}^n \times \brackc{-1, 1}$. The hyperplane is the set:
7 | \begin{equation*}
8 | \mathcal{H}_{\boldsymbol w, b} = \brackc{\boldsymbol x\in \mathbb{R}^n : \boldsymbol w^T\boldsymbol x + b = 0}
9 | \end{equation*}
10 | \end{definition}
11 | 
12 | \begin{definition}{\textbf{(Linearly Separable)}}
13 | The data are linearly separable if there exist $\boldsymbol w \in \mathbb{R}^n$ and $b\in \mathbb{R}$ such that:
14 | \begin{equation*}
15 | y_i(\boldsymbol w^T\boldsymbol x_i + b) > 0
16 | \end{equation*}
17 | for $i=1,\dots,m$, in which case we call $\mathcal{H}_{\boldsymbol w, b}$ a separating hyperplane. Note that it is a strict inequality.
18 | \end{definition}
19 | 
20 | \begin{proposition}{\textbf{(Finding the Distance from the Plane)}}
21 | If $\mathcal{H}_{\boldsymbol w, b}$ is a hyperplane, the distance from a point $\boldsymbol x$ to it is:
22 | \begin{equation*}
23 | \frac{\abs{\boldsymbol w^T\boldsymbol x + b}}{\norm{\boldsymbol w}}
24 | \end{equation*}
25 | \end{proposition}
26 | \begin{proof}
27 | We consider the projection of the point $\boldsymbol x$ onto $\mathcal{H}_{\boldsymbol w, b}$:
28 | \begin{equation*}
29 | \boldsymbol p = \boldsymbol x - \frac{\boldsymbol w(b + \boldsymbol w^T\boldsymbol x)}{\norm{\boldsymbol w}^2}
30 | \end{equation*}
31 | To show that $\boldsymbol p$ is indeed the projection:
32 | \begin{itemize}
33 | \item We have to show that $\boldsymbol p$ is on the hyperplane:
34 | \begin{equation*}
35 | \boldsymbol w^T\boldsymbol p + b = \boldsymbol w^T\boldsymbol x - \frac{\boldsymbol w^T\boldsymbol w(b + \boldsymbol w^T\boldsymbol x)}{\norm{\boldsymbol w}^2} + b = 0
36 | \end{equation*}
37 | \item $\boldsymbol p-\boldsymbol x$ is orthogonal to $\boldsymbol p - \boldsymbol x'$, where $\boldsymbol x'$ is any point on the hyperplane. Since $\boldsymbol p - \boldsymbol x = -\boldsymbol w(b + \boldsymbol w^T\boldsymbol x)/\norm{\boldsymbol w}^2$ is parallel to $\boldsymbol w$:
38 | \begin{equation*}
39 | \begin{aligned}
40 | (\boldsymbol p-\boldsymbol x)^T(\boldsymbol p-\boldsymbol x')
41 | &= -\frac{b + \boldsymbol w^T\boldsymbol x}{\norm{\boldsymbol w}^2}\,\boldsymbol w^T(\boldsymbol p - \boldsymbol x') \\
42 | &= -\frac{b + \boldsymbol w^T\boldsymbol x}{\norm{\boldsymbol w}^2}\Big((\boldsymbol w^T\boldsymbol p + b) - (\boldsymbol w^T\boldsymbol x' + b)\Big) = 0
43 | \end{aligned}
44 | \end{equation*}
45 | since $\boldsymbol w^T\boldsymbol p + b = 0$ and $\boldsymbol w^T\boldsymbol x' + b = 0$.
52 | \end{itemize}
53 | Now, we are left to find the distance between $\boldsymbol p$ and $\boldsymbol x$, which is:
54 | \begin{equation*}
55 | \sqrt{(\boldsymbol p - \boldsymbol x)^T(\boldsymbol p - \boldsymbol x)} = \sqrt{\brackd{\frac{\boldsymbol w(b + \boldsymbol w^T\boldsymbol x)}{\norm{\boldsymbol w}^2}, \frac{\boldsymbol w(b + \boldsymbol w^T\boldsymbol x)}{\norm{\boldsymbol w}^2}}} = \frac{\abs{b + \boldsymbol x^T\boldsymbol w}}{\norm{\boldsymbol w}}
56 | \end{equation*}
57 | This completes the proof.
58 | \end{proof}
59 | 
60 | \begin{definition}{\textbf{(Margin)}}
61 | Let the distance from a point $\boldsymbol x$ to the plane $\mathcal{H}_{\boldsymbol w, b}$ be $\rho_{\boldsymbol x}(\boldsymbol w, b)$. If $\mathcal{H}_{\boldsymbol w, b}$ separates the training set $S$, we define the margin as:
62 | \begin{equation*}
63 | \rho_S(\boldsymbol w, b ) = \min_{i \in [m]}\rho_{\boldsymbol x_i}(\boldsymbol w, b)
64 | \end{equation*}
65 | \end{definition}
66 | 
67 | \begin{definition}{\textbf{(Optimal Separating Hyperplanes)}}
68 | We want to find the weight and bias of a separating hyperplane such that the margin is maximized:
69 | \begin{equation*}
70 | \rho(S) = \max_{\boldsymbol w, b}\min_{i\in[m]}\brackc{\frac{y_i(\boldsymbol w^T\boldsymbol x_i + b)}{\norm{\boldsymbol w}} : y_j (\boldsymbol w^T\boldsymbol x_j + b) > 0 \text{ for } j \in [m] }
71 | \end{equation*}
72 | Furthermore, to get a unique $\boldsymbol w, b$, we may consider $2$ choices:
73 | \begin{itemize}
74 | \item Set $\norm{\boldsymbol w} = 1$, so $\rho_{\boldsymbol x}(\boldsymbol w, b) = \abs{\boldsymbol w^T\boldsymbol x + b}$ and so:
75 | \begin{equation*}
76 | \rho_S = \min_{i\in[m]}y_i(\boldsymbol w^T\boldsymbol x_i + b)
77 | \end{equation*}
78 | \item Choose $\norm{\boldsymbol w}$ such that $\rho_S(\boldsymbol w, b) = 1/\norm{\boldsymbol w}$, or:
79 | \begin{equation*}
80 | \min_{i\in[m]} y_i(\boldsymbol w^T\boldsymbol x_i + b) = 1
81 | \end{equation*}
82 | \end{itemize}
83 | We will consider the second case.
84 | \end{definition}
85 | 
86 | \begin{proposition}
87 | The optimal separating hyperplane is equivalent to the following optimization problem:
88 | \begin{equation*}
89 | \begin{aligned}
90 | \min_{w,b} \quad & \frac{1}{2}\boldsymbol w^{T}\boldsymbol w \\
91 | \text{\emph{s.t}} \quad & y_{i}(\boldsymbol w^T\boldsymbol x_i + b)\ge1\\
92 | \end{aligned}
93 | \end{equation*}
94 | for $\boldsymbol w \in \mathbb{R}^n$. The quantity $1/\norm{\boldsymbol w}$ is the margin of the optimal separating hyperplane.
95 | \end{proposition}
96 | \begin{proof}
97 | Following the second case:
98 | \begin{equation*}
99 | \begin{aligned}
100 | \rho(S) &=
101 | \max_{\boldsymbol w, b}\brackc{\frac{1}{\norm{\boldsymbol w}} : \min_{j\in[m]}\brackc{y_j (\boldsymbol w^T\boldsymbol x_j + b)} = 1} \\
102 | &= \max_{\boldsymbol w, b}\brackc{\frac{1}{\norm{\boldsymbol w}} : y_j (\boldsymbol w^T\boldsymbol x_j + b) \ge 1 \text{ for } j \in [m]} = \frac{1}{\min_{\boldsymbol w, b}\brackc{\norm{\boldsymbol w} : y_j (\boldsymbol w^T\boldsymbol x_j + b) \ge 1}} \\
103 | \end{aligned}
104 | \end{equation*}
105 | \end{proof}
106 | 
107 | \begin{proposition}
108 | To minimize a differentiable convex function $f(\boldsymbol x) : \mathbb{R}^n \rightarrow \mathbb{R}$ subject to linear inequalities $\boldsymbol A\boldsymbol x \le \boldsymbol c$, we may solve the problem with the Lagrangian:
109 | \begin{equation*}
110 | L(\boldsymbol x, \boldsymbol \alpha) = f(\boldsymbol x) - \boldsymbol \alpha^T(\boldsymbol A\boldsymbol x - \boldsymbol c)
111 | \end{equation*}
112 | If the optimization problem is feasible, that is $\brackc{\boldsymbol x : \boldsymbol A\boldsymbol x \le \boldsymbol c} \ne \emptyset$, we can show that:
113 | \begin{equation*}
114 | \max_{\boldsymbol \alpha\ge\boldsymbol 0}\min_{\boldsymbol x}L(\boldsymbol x, \boldsymbol \alpha) = \min_{\boldsymbol x}f(\boldsymbol x) \text{ s.t } \boldsymbol A\boldsymbol x\le \boldsymbol c
115 | \end{equation*}
116 | And there are necessary and sufficient conditions, called the KKT conditions, for a solution $(\boldsymbol x^*, \boldsymbol \alpha^*)$:
117 | \begin{itemize}
118 | \item $\boldsymbol A\boldsymbol x^* \le \boldsymbol c$
119 | \item $\boldsymbol \alpha^* \ge \boldsymbol 0$
120 | \item $\nabla_{\boldsymbol x}L(\boldsymbol x, \boldsymbol \alpha^*) | _{\boldsymbol x^*} = \boldsymbol 0$
121 | \item $(\boldsymbol A\boldsymbol x^* - \boldsymbol c)_i\alpha^*_i = 0$ for $i\in[m]$
122 | \end{itemize}
123 | \end{proposition}
124 | 
125 | \begin{proposition}
126 | The dual form of the SVM is:
127 | \begin{equation*}
128 | \begin{aligned}
129 | \max_{\boldsymbol \alpha} \quad &-\frac{1}{2}\boldsymbol \alpha^T\boldsymbol A\boldsymbol \alpha + \sum^m_{i=1}\alpha_i \\
130 | \text{\emph{s.t}} \quad &\begin{aligned}[t]
131 | &\sum^m_{i=1}y_i\alpha_i = 0 \\
132 | &\alpha_i \ge 0 \text{ \emph{for} } i \in [m]
133 | \end{aligned}
134 | \end{aligned}
135 | \end{equation*}
136 | where $\boldsymbol A = (y_iy_j\boldsymbol x_i^T\boldsymbol x_j : i,j\in[m])$. The solution to the primal problem is:
137 | \begin{equation*}
138 | \boldsymbol w^* = \sum^m_{i=1}\alpha^*_iy_i\boldsymbol x_i
139 | \end{equation*}
140 | so the weight is a linear combination of the data. Finally, the bias $b^*$ can be determined by finding a point $\boldsymbol x_j$ that satisfies the condition:
141 | \begin{equation*}
142 | y_j((\boldsymbol w^*)^T\boldsymbol x_j + b^*) - 1 = 0
143 | \end{equation*}
144 | Then the bias can be found by rearranging: $b^* = y_j- (\boldsymbol w^*)^T\boldsymbol x_j$. A point that satisfies this condition (with $\alpha_j^* > 0$) is called a \emph{support vector}.
145 | \end{proposition}
146 | \begin{proof}
147 | We consider the Lagrangian:
148 | \begin{equation*}
149 | L(\boldsymbol w, b; \boldsymbol \alpha) = \frac{1}{2}\boldsymbol w^T\boldsymbol w - \sum^m_{i=1}\alpha_i[y_i(\boldsymbol w^T\boldsymbol x_i + b) - 1]
150 | \end{equation*}
151 | where the $\alpha_i\ge0$ are Lagrange multipliers. We minimize $L$ over $\boldsymbol w$ and $b$ and maximize over $\boldsymbol \alpha$ with $\boldsymbol \alpha \ge \boldsymbol 0$. The partial derivatives are:
152 | \begin{equation*}
153 | \begin{aligned}
154 | &\frac{\partial L}{\partial b} = -\sum^m_{i=1}y_i\alpha_i = 0 \\
155 | &\frac{\partial L}{\partial \boldsymbol w} = \boldsymbol w - \sum^m_{i=1}\alpha_iy_i\boldsymbol x_i = 0 \implies \boldsymbol w = \sum^m_{i=1}\alpha_iy_i\boldsymbol x_i
156 | \end{aligned}
157 | \end{equation*}
158 | Now, we can see that the optimal weight is a linear combination of the data points.
Plugging this back into the Lagrangian, we have:
159 | \begin{equation*}
160 | \frac{1}{2}\underbrace{\boldsymbol w^T\boldsymbol w}_{\boldsymbol \alpha^T\boldsymbol A\boldsymbol \alpha} - \underbrace{\sum^m_{i=1}\alpha_iy_i\boldsymbol w^T\boldsymbol x_i}_{\boldsymbol \alpha^T\boldsymbol A\boldsymbol \alpha} - \underbrace{b\sum^m_{i=1}\alpha_iy_i}_{0} + \sum^m_{i=1}\alpha_i = -\frac{1}{2}\boldsymbol \alpha^T\boldsymbol A\boldsymbol \alpha + \sum^m_{i=1}\alpha_i
161 | \end{equation*}
162 | \end{proof}
163 | \begin{remark}
164 | A new point $\boldsymbol x$ can be classified as:
165 | \begin{equation*}
166 | \operatorname{sign}\bracka{\sum^m_{i=1}y_i\alpha^*_i\boldsymbol x_i^T\boldsymbol x + b^*}
167 | \end{equation*}
168 | One can show that the expected generalization error of an SVM trained on $m-1$ samples is bounded by $n_\text{sv}/m$, where $n_\text{sv}$ is the number of support vectors.
169 | \end{remark}
170 | 
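A short sketch with \texttt{scikit-learn} (an aside on toy Gaussian blobs; \texttt{SVC} solves the soft-margin dual discussed next, and with well-separated data and a large $C$ it approximates the hard-margin solution above):
\begin{verbatim}
# Linear SVM on roughly separable toy data; inspect the support vectors.
import numpy as np
from sklearn.svm import SVC

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(-2, 1, (20, 2)), rng.normal(2, 1, (20, 2))])
y = np.array([-1] * 20 + [1] * 20)

clf = SVC(kernel="linear", C=1.0).fit(X, y)
print(clf.support_.size, "support vectors")  # n_sv/m bounds the LOO error
print(clf.predict([[0.5, -0.3]]))            # sign(w^T x + b)
\end{verbatim}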
171 | 
172 | \begin{remark}{\textbf{(Linearly Non-Separable Case)}}
173 | We would like to minimize the following objective function:
174 | \begin{equation*}
175 | \frac{1}{2}\boldsymbol w^T\boldsymbol w + C\sum^m_{i=1}V_\text{mc}(y_i, \boldsymbol w^T\boldsymbol x_i + b)
176 | \end{equation*}
177 | where $V_\text{mc}(y, \hat{y}) = \mathbb{I}[y \ne \operatorname{sign}(\hat{y})]$ is the misclassification loss; but this is NP-hard, so we convexify the problem by considering the hinge loss instead:
178 | \begin{equation*}
179 | V_\text{hinge}(y, \hat{y}) = \max(0, 1-y\hat{y})
180 | \end{equation*}
181 | This gives us a convex optimization problem.
182 | \end{remark}
183 | 
184 | \begin{proposition}
185 | The hinge loss can be reformulated using slack variables, giving the following optimization problem:
186 | \begin{equation*}
187 | \begin{aligned}
188 | \min_{w,b} \quad & \frac{1}{2}\boldsymbol w^{T}\boldsymbol w + C\sum^m_{i=1}\xi_i \\
189 | \text{\emph{s.t}} \quad & \begin{aligned}[t]
190 | &y_{i}(\boldsymbol w^T\boldsymbol x_i + b)\ge1-\xi_i\\
191 | &\xi_i \ge 0 \text{ for } i =1,\dots,m
192 | \end{aligned}
193 | \end{aligned}
194 | \end{equation*}
195 | This in turn gives us the following dual problem:
196 | \begin{equation*}
197 | \begin{aligned}
198 | \max_{\boldsymbol \alpha} \quad &-\frac{1}{2}\boldsymbol \alpha^T\boldsymbol A\boldsymbol \alpha + \sum^m_{i=1}\alpha_i \\
199 | \text{\emph{s.t}} \quad &\begin{aligned}[t]
200 | &\sum^m_{i=1}y_i\alpha_i = 0 \\
201 | &0\le\alpha_i \le C \text{ \emph{for} } i \in [m]
202 | \end{aligned}
203 | \end{aligned}
204 | \end{equation*}
205 | We will consider the implications of the KKT conditions afterwards.
206 | \end{proposition}
207 | \begin{proof}
208 | We now have the following Lagrangian:
209 | \begin{equation*}
210 | L(\boldsymbol w, b, \boldsymbol \xi; \boldsymbol \alpha, \boldsymbol \beta) = \frac{1}{2}\boldsymbol w^T\boldsymbol w + C\sum^m_{i=1}\xi_i - \sum^m_{i=1}\alpha_i[y_i(\boldsymbol w^T\boldsymbol x_i + b) - 1 + \xi_i] - \sum^m_{i=1}\beta_i\xi_i
211 | \end{equation*}
212 | where $\alpha_i, \beta_i \ge 0$ are Lagrange multipliers. We minimize $L$ over $(\boldsymbol w, \boldsymbol \xi, b)$ and maximize $L$ over $(\boldsymbol \alpha, \boldsymbol \beta)$:
213 | \begin{equation*}
214 | \begin{aligned}
215 | &\frac{\partial L}{\partial b} = -\sum^m_{i=1}y_i\alpha_i = 0 \\
216 | &\frac{\partial L}{\partial \boldsymbol w} = \boldsymbol w - \sum^m_{i=1}\alpha_iy_i\boldsymbol x_i = 0 \implies \boldsymbol w = \sum^m_{i=1}\alpha_iy_i\boldsymbol x_i \\
217 | &\frac{\partial L}{\partial \xi_i} = C - \alpha_i - \beta_i = 0 \implies 0 \le \alpha_i \le C \\
218 | \end{aligned}
219 | \end{equation*}
220 | Plugging this back gives us the dual form. Note that both $\alpha_i, \beta_i \ge 0$.
221 | \end{proof}
222 | 
223 | \begin{remark}{\textbf{(Interpretation of the Results)}}
224 | The dual problem is similar to the earlier linearly separable case, with an additional box constraint. The weight is given as:
225 | \begin{equation*}
226 | \boldsymbol w^* = \sum^m_{i=1}\alpha_i^*y_i\boldsymbol x_i
227 | \end{equation*}
228 | and $b^*$ is obtained as before. For the new KKT conditions, we have:
229 | \begin{equation*}
230 | \begin{aligned}
231 | &\alpha_i^*(y_i((\boldsymbol w^*)^T\boldsymbol x_i + b^*) - 1 + \xi^*_i) = 0 \\
232 | &(C-\alpha^*_i)\xi^*_i = 0
233 | \end{aligned}
234 | \end{equation*}
235 | where the second equation follows from $\beta_i^* = C - \alpha^*_i$. There are different cases to consider:
236 | \begin{itemize}
237 | \item $y_i((\boldsymbol w^*)^T\boldsymbol x_i + b^*) > 1$ implies $\alpha_i^* = 0$, and the point isn't a support vector.
238 | \item $y_i((\boldsymbol w^*)^T\boldsymbol x_i + b^*) < 1$ implies $\alpha_i^* = C$, and the point is a support vector with positive slack $\xi^*_i > 0$ (an outlier or margin violation).
239 | \item $y_i((\boldsymbol w^*)^T\boldsymbol x_i + b^*) = 1$ implies $\alpha_i^* \in [0, C]$, and if $\alpha_i^*>0$ it is a support vector on the margin.
240 | \end{itemize}
241 | On the other hand, we have:
242 | \begin{itemize}
243 | \item $\alpha_i^* = 0$ implies $y_i((\boldsymbol w^*)^T\boldsymbol x_i + b^*) \ge 1$ and $\xi_i^*=0$
244 | \item $\alpha_i^* \in (0, C)$ implies $y_i((\boldsymbol w^*)^T\boldsymbol x_i + b^*) = 1$ and $\xi_i^*=0$
245 | \item $\alpha_i^* = C$ implies $y_i((\boldsymbol w^*)^T\boldsymbol x_i + b^*) \le 1$ and $\xi_i^*\ge0$
246 | \end{itemize}
247 | \end{remark}
248 | 
249 | \begin{remark}
250 | The role of the parameter $C$:
251 | \begin{itemize}
252 | \item The parameter $C$ controls the trade-off between $\norm{\boldsymbol w}^2$ and the training error $\sum^m_{i=1}\xi_i$.
253 | \item The value of $\alpha_i^*$ is a piecewise quadratic function of $C$.
254 | \item $C$ is selected by minimizing the leave-one-out (LOO) cross-validation error.
255 | \end{itemize}
256 | To compute the LOO error, we need to retrain the SVM no more times than the number of support vectors, making it fast to compute. One can also use $n_\text{sv}/m$ as an upper bound on the LOO error.
257 | \end{remark}
258 | 
259 | \begin{definition}{\textbf{(Kernelized SVM)}}
260 | Given a feature map $\boldsymbol \phi : \mathcal{X} \rightarrow \mathcal{W}$, we can replace $\boldsymbol x$ with $\boldsymbol \phi(\boldsymbol x)$ and $\boldsymbol x^T\boldsymbol t$ with $\brackd{\boldsymbol \phi(\boldsymbol x), \boldsymbol \phi(\boldsymbol t)}$. The resulting function is:
261 | \begin{equation*}
262 | f(\boldsymbol x) = \sum^m_{i=1}y_i\alpha_i k(\boldsymbol x_i, \boldsymbol x) + b
263 | \end{equation*}
264 | The parameters can be found using the matrix $\boldsymbol A = (y_iy_jk(\boldsymbol x_i, \boldsymbol x_j) : i, j \in[m])$, and a new point is classified in the same way.
265 | \end{definition}
266 | 
267 | \begin{remark}{\textbf{(Connection to Regularization)}}
268 | The SVM formulation is equivalent to the following regularization problem:
269 | \begin{equation*}
270 | \mathcal{E}_\lambda(\boldsymbol w, b) = \sum^m_{i=1} \max\Big( 1 - y_i\bracka{\brackd{\boldsymbol w, \boldsymbol \phi(\boldsymbol x_i)} + b}, 0 \Big) + \lambda\norm{\boldsymbol w}^2
271 | \end{equation*}
272 | where we set $\lambda = 1/(2C)$. Indeed, we have:
273 | \begin{equation*}
274 | \begin{aligned}
275 | &\min_{\boldsymbol w, b, \boldsymbol \xi} \brackc{C\sum^m_{i=1} \xi_i + \frac{1}{2}\norm{\boldsymbol w}^2 : y_i\bracka{\brackd{\boldsymbol w, \boldsymbol \phi(\boldsymbol x_i)} + b } \ge 1 - \xi_i, \xi_i\ge0} \\
276 | =&\min_{\boldsymbol w, b} \brackc{\min_{\boldsymbol \xi}\brackc{C\sum^m_{i=1} \xi_i + \frac{1}{2}\norm{\boldsymbol w}^2 : y_i\bracka{\brackd{\boldsymbol w, \boldsymbol \phi(\boldsymbol x_i)} + b } \ge 1 - \xi_i, \xi_i\ge0}} \\
277 | =&\min_{\boldsymbol w, b} \brackc{C\sum^m_{i=1} \max\Big( 1 - y_i\bracka{\brackd{\boldsymbol w, \boldsymbol \phi(\boldsymbol x_i)} + b}, 0 \Big) + \frac{1}{2}\norm{\boldsymbol w}^2} = \min_{\boldsymbol w, b} C\,\mathcal{E}_{1/(2C)}(\boldsymbol w, b)
278 | \end{aligned}
279 | \end{equation*}
280 | \end{remark}
281 | 
282 | \begin{remark}{\textbf{(SVM for Regression)}}
283 | For regression with the SVM, we use the $\varepsilon$-insensitive loss:
284 | \begin{equation*}
285 | \abs{y - f(\boldsymbol x)}_\varepsilon = \max(\abs{y - f(\boldsymbol x)} - \varepsilon, 0)
286 | \end{equation*}
287 | This gives the following optimization problem:
288 | \begin{equation*}
289 | \begin{aligned}
290 | \min & \quad \frac{1}{2}\boldsymbol w^T\boldsymbol w + C\sum^m_{i=1}(\xi_i + \xi_i^*) \\
291 | \text{s.t}& \quad \begin{aligned}[t]
292 | &\boldsymbol w^T\boldsymbol x_i + b - y_i \le \varepsilon + \xi_i \\
293 | &y_i-\boldsymbol w^T\boldsymbol x_i - b \le \varepsilon + \xi_i^* \\
294 | &\xi_i,\xi_i^* \ge 0 \text{ for } i \in [m]
295 | \end{aligned}
296 | \end{aligned}
297 | \end{equation*}
298 | Note that the loss ignores errors below the threshold $\varepsilon$, which yields a sparse solution. Decomposition methods can be used to solve the resulting KKT system.
299 | \end{remark}
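A one-line sketch of the $\varepsilon$-insensitive loss in plain \texttt{numpy} (names are hypothetical):
\begin{verbatim}
import numpy as np

def eps_insensitive(y, f_x, eps=0.1):
    # |y - f(x)|_eps = max(|y - f(x)| - eps, 0): zero inside the eps-tube.
    return np.maximum(np.abs(y - f_x) - eps, 0.0)

print(eps_insensitive(np.array([1.0, 1.05, 2.0]), np.array([1.0, 1.0, 1.0])))
# -> [0.  0.  0.9]: only the point outside the tube contributes.
\end{verbatim}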
300 | 
301 | 
302 | 
303 | 
-------------------------------------------------------------------------------- /supervised-learning/contents/part4.tex: --------------------------------------------------------------------------------
1 | \section{Tree-Based and Ensemble Models}
2 | 
3 | \subsection{Tree-Based Methods}
4 | 
5 | \begin{definition}{\textbf{(Tree Method)}}
6 | We are interested in partitioning the input space into rectangles and fitting a simple model in each one; for example, we have the function:
7 | \begin{equation*}
8 | f(\boldsymbol x) = \sum^P_{p=1}c_p \mathbb{I}[\boldsymbol x \in R_p]
9 | \end{equation*}
10 | where we have the following:
11 | \begin{itemize}
12 | \item We partition the input space with hyper-rectangles $R_1,R_2,\dots,R_P$ where $\bigcup^P_{p=1} R_p = \mathcal{X}$ and $R_a\cap R_b = \emptyset$ if $a\ne b$
13 | \item $\brackc{c_p}^P_{p=1}$ are real parameters, with a natural choice being:
14 | \begin{equation*}
15 | c_p = \operatorname{avg}(y_i | \boldsymbol x_i \in R_p) = \frac{\sum^m_{i=1} y_i \mathbb{I}[\boldsymbol x_i \in R_p]}{\sum^m_{i=1} \mathbb{I}[\boldsymbol x_i \in R_p]}
16 | \end{equation*}
17 | \end{itemize}
18 | We are interested in solving the following optimization problem:
19 | \begin{equation*}
20 | \min_{R_1,\dots,R_P}\brackc{\sum^m_{i=1}\bracka{y_i - \sum^P_{p=1} \operatorname{avg}(y_i | \boldsymbol x_i \in R_p) \mathbb{I}[\boldsymbol x_i \in R_p]}^2}
21 | \end{equation*}
22 | \end{definition}
23 | 
24 | \begin{definition}{\textbf{(Heuristic Search)}}
25 | The problem above seems intractable, so we need a heuristic approach: we greedily look for the best single split. Define a pair of axis-parallel half-spaces:
26 | \begin{equation*}
27 | R_1 (j, s) = \brackc{\boldsymbol x | x_j \le s} \qquad R_2(j, s) = \brackc{\boldsymbol x | x_j > s}
28 | \end{equation*}
29 | Then we search for the optimal values $j^*$ and $s^*$, which solve the problem:
30 | \begin{equation*}
31 | \min_{j, s} \brackc{\min_{c_1} \sum_{\boldsymbol x_i \in R_1(j, s)} (y_i - c_1)^2 + \min_{c_2}\sum_{\boldsymbol x_i \in R_2(j, s)} (y_i - c_2)^2 }
32 | \end{equation*}
33 | The inner minimization is solved by:
34 | \begin{equation*}
35 | c^*_1 = \operatorname{avg}(y_i | \boldsymbol x_i \in R_1(j, s)) \qquad c_2^* = \operatorname{avg}(y_i | \boldsymbol x_i \in R_2(j, s))
36 | \end{equation*}
37 | For each splitting variable $j$, the search for the best split point $s$ can be done in $\mathcal{O}(m)$ computations. Thus, the problem is solved in $\mathcal{O}(nm)$ computations. The decision tree is then grown by repeatedly splitting the resulting branches.
38 | \end{definition}
39 | 
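To make the $\mathcal{O}(m)$-per-feature claim concrete, here is a minimal sketch (plain \texttt{numpy}; the helper name is hypothetical) of the exhaustive search for the best split $(j^*, s^*)$, using prefix sums so that every threshold of a sorted feature is evaluated in constant time:
\begin{verbatim}
import numpy as np

def best_split(X, y):
    # Greedy search over features j and thresholds s (squared error).
    m, n = X.shape
    best = (None, None, np.inf)
    for j in range(n):
        order = np.argsort(X[:, j])
        xs, ys = X[order, j], y[order]
        # Prefix sums: SSE of a group is sum(y^2) - (sum(y))^2 / size.
        left_sum, left_sq = np.cumsum(ys), np.cumsum(ys ** 2)
        tot_sum, tot_sq = left_sum[-1], left_sq[-1]
        for k in range(1, m):           # split between positions k-1 and k
            if xs[k] == xs[k - 1]:
                continue
            nl, nr = k, m - k
            sse_l = left_sq[k-1] - left_sum[k-1] ** 2 / nl
            sse_r = (tot_sq - left_sq[k-1]) - (tot_sum - left_sum[k-1]) ** 2 / nr
            if sse_l + sse_r < best[2]:
                best = (j, (xs[k-1] + xs[k]) / 2, sse_l + sse_r)
    return best  # (j*, s*, total squared error)
\end{verbatim}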
40 | \begin{remark}{\textbf{(Overfitting)}}
41 | If we keep repeating the heuristic search process, we will overfit the data. There are several ways to fix this:
42 | \begin{itemize}
43 | \item Follow a split only if it decreases the empirical error by more than a threshold. However, this might not be the best strategy, as a good split may lie below a seemingly bad one.
44 | \item Stop when a maximal tree depth is reached. This could lead to underfitting or overfitting; we need to look at the data to determine the size of the tree.
45 | \end{itemize}
46 | \end{remark}
47 | 
48 | \begin{remark}{\textbf{(Solving Overfitting)}}
49 | We choose the tree size adaptively from the data. We grow a large tree $\hat{T}$ (stopping when the number of data points assigned to each node becomes small enough). We then prune the tree with cost-complexity pruning, i.e.\ we look for the subtree $T_\lambda\subseteq \hat{T}$ that minimizes:
50 | \begin{equation*}
51 | C_\lambda(T) = \sum^{|T|}_{p=1} m_pQ_p(T) + \lambda\abs{T}
52 | \end{equation*}
53 | where $T$ is a subtree of $\hat{T}$ and:
54 | \begin{itemize}
55 | \item $p$ runs over the leaf nodes of $T$ (a subset of the nodes of $\hat{T}$)
56 | \item $m_p$ is the number of data points assigned to node $p$
57 | \item $Q_p$ is the training error, given as:
58 | \begin{equation*}
59 | Q_p = \frac{1}{m_p}\sum_{\boldsymbol x_i \in R_p}(y_i - c_p)^2
60 | \end{equation*}
61 | so the first term in $C_\lambda$ is the (weighted) training error.
62 | \end{itemize}
63 | One can show that there is a unique $T_\lambda \subseteq \hat{T}$ that minimizes $C_\lambda$, while a good value of $\lambda$ can be found by cross-validation.
64 | \end{remark}
65 | 
66 | \begin{definition}{\textbf{(Weakest Link Pruning)}}
67 | We successively collapse the internal node that produces the smallest per-node increase in:
68 | \begin{equation*}
69 | \sum^{|T|}_{p=1} m_pQ_p(T)
70 | \end{equation*}
71 | We continue until the root of the tree is reached, so that we now have a list of pruned trees. We can search along this list for the one that minimizes the objective $C_\lambda$, and one can show that $T_\lambda$ is in the produced list of subtrees; hence the algorithm gives the optimal solution.
72 | \end{definition}
73 | 
74 | \begin{definition}{\textbf{(Classification Tree)}}
75 | When the output is a categorical variable, we use the same algorithm as above with $2$ important modifications:
76 | \begin{itemize}
77 | \item For each region $R_n$, we define the empirical class probabilities as:
78 | \begin{equation*}
79 | p_{nk} = \frac{1}{m_n}\sum_{(\boldsymbol x_i, y_i) \in R_n} \mathbb{I}[y_i=k]
80 | \end{equation*}
81 | \item We classify an input which falls in region $n$ into the most probable class:
82 | \begin{equation*}
83 | f(\boldsymbol x) = \argmax{k \in \brackc{1,\dots,K}}\sum^N_{n=1}p_{nk} \mathbb{I}[\boldsymbol x \in R_n]
84 | \end{equation*}
85 | \end{itemize}
86 | \end{definition}
87 | 
88 | \begin{definition}{\textbf{(Impurity)}}
89 | In this setting, the training error $Q_n(T)$ is called the impurity, and can be one of these values:
90 | \begin{itemize}
91 | \item \emph{Misclassification Error}: $1 - p_{nk(n)}$ where $k(n) = \arg\max_{k\in\brackc{1,\dots,K}} p_{nk}$
92 | \item \emph{Gini-Index}: $\sum_kp_{nk}(1-p_{nk})$
93 | \item \emph{Cross-Entropy}: $\sum_kp_{nk}\log(1/p_{nk})$
94 | \end{itemize}
95 | The cross-entropy or the Gini index is used for growing the tree, while the misclassification error is often used for pruning.
96 | \end{definition}
97 | 
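A quick sketch of the three impurity measures in plain \texttt{numpy} (here \texttt{p} is the vector of empirical class probabilities $p_{nk}$ of one node):
\begin{verbatim}
import numpy as np

def misclassification(p):
    return 1.0 - np.max(p)

def gini(p):
    return np.sum(p * (1.0 - p))

def cross_entropy(p):
    p = p[p > 0]                    # convention: 0 * log(1/0) = 0
    return np.sum(p * np.log(1.0 / p))

p = np.array([0.7, 0.2, 0.1])
print(misclassification(p), gini(p), cross_entropy(p))
\end{verbatim}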
98 | \subsection{Ensemble Methods + Bagging}
99 | 
100 | \begin{theorem}{\textbf{(Chernoff-Bound)}}
101 | Let $X_1,X_2,\dots,X_n$ be independent random variables with $0\le X_i \le 1$. We denote $X = \sum^n_{i=1}X_i$ and $\mu = \mathbb{E}[X] = \sum^n_{i=1}\mathbb{E}[X_i]$; then for all $0\le k \le \mu$:
102 | \begin{equation*}
103 | \mathbb{P}(X \le k) \le \exp\bracka{-\frac{(\mu - k)^2}{2\mu}}
104 | \end{equation*}
105 | \end{theorem}
106 | 
107 | \begin{remark}{\textbf{(Motivation - Wisdom of the Crowd)}}
108 | A single individual might often be wrong, but the crowd majority may often be correct. Suppose each individual in a crowd $h_1,h_2,\dots,h_{2T+1}$ of size $2T+1$ predicts the outcome correctly with probability $1/2+\gamma$, independently of each other. We consider the vote of the crowd to be:
109 | \begin{equation*}
110 | H_T = \operatorname{sgn}\bracka{\sum^{2T+1}_{t=1}h_t}
111 | \end{equation*}
112 | The probability of $H_T$ being wrong is given as:
113 | \begin{equation*}
114 | \mathbb{P}(H_T \text{ is wrong }) = \sum^T_{i=0}\binom{2T+1}{i}\bracka{\frac{1}{2}+\gamma}^i\bracka{\frac{1}{2}-\gamma}^{2T+1-i}
117 | \end{equation*}
118 | We simplify the above using a Chernoff bound. We let $X_1,\dots,X_n$ be Bernoulli random variables where $X_i = 1$ if voter $i$ is correct and $0$ otherwise. Taking $k = T$ and $n=2T+1$, we thus have:
119 | \begin{equation*}
120 | \mu = (2T+1)\bracka{\frac{1}{2} + \gamma} = T + \frac{1}{2}+2T\gamma + \gamma
121 | \end{equation*}
122 | Now, we substitute into the bound:
123 | \begin{equation*}
124 | \begin{aligned}
125 | \mathbb{P}(H_T \text{ is wrong }) &\le \exp\bracka{-\frac{(\mu - T)^2}{2\mu}} \\
126 | &= \exp\bracka{-\frac{(1/2+2T\gamma + \gamma)^2}{2(T+1/2+2T\gamma + \gamma)}} \\
127 | &\le \exp\bracka{-\frac{4T^2\gamma^2}{5T}} = \exp\bracka{-\frac{4\gamma^2}{5}T}
128 | \end{aligned}
129 | \end{equation*}
130 | The bound may be crude, but it shows that the probability of the majority being wrong decays exponentially to zero.
131 | \end{remark}
132 | 
133 | \begin{definition}{\textbf{(Bagging Algorithm)}}
134 | The idea of the bagging algorithm is to reduce the variance of a classifier by training many variants of the classifier and then voting. We have the following inputs:
135 | \begin{itemize}
136 | \item Training data: $S = \brackc{(\boldsymbol x_1,y_1),\dots,(\boldsymbol x_m, y_m)} \subset \mathbb{R}^d \times \brackc{-1, 1}$
137 | \item Ensemble of size $T$
138 | \item Resampled dataset of size $M$
139 | \item Classifier function $h_\mathcal{S}(\boldsymbol x)$ trained on a dataset $\mathcal{S}$
140 | \end{itemize}
141 | This leads to the following pseudocode:
142 | \begin{algorithm}[H]
143 | \caption{Bagging Algorithm}
144 | \begin{algorithmic}[1]
145 | \For {$t=1,2,\cdots, T$}
146 | \State $S[t] = M$ examples sampled with replacement from $S$
147 | \EndFor
148 | \State \textbf{Return:} We perform the following prediction:
149 | \begin{equation*}
150 | H(\boldsymbol x) = \operatorname{sgn}\bracka{\sum^T_{t=1}h_{S[t]}(\boldsymbol x)}
151 | \end{equation*}
152 | \end{algorithmic}
153 | \end{algorithm}
154 | We may set $M$ to be $m$.
155 | \end{definition}
156 | 
157 | \begin{remark}
158 | If we set $M=m$, we can ask how many unique examples from $S$ appear in the bag $S[t]$. The probability that a particular example doesn't appear in the bag is $(1-1/m)^m$, and please note that:
159 | \begin{equation*}
160 | \lim_{m\rightarrow\infty}\bracka{1-\frac{1}{m}}^m = \frac{1}{e}\approx 0.368
161 | \end{equation*}
162 | so on average around $63\%$ of the examples of $S$ appear in each dataset $S[t]$.
163 | \end{remark}
164 | 
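A minimal sketch of bagging in plain \texttt{numpy} (\texttt{fit} stands for any base learning algorithm returning a $\pm1$-valued classifier; it is a hypothetical placeholder):
\begin{verbatim}
import numpy as np

def bagging(X, y, fit, T=25, M=None, seed=0):
    # fit(X, y) -> classifier h with h(x) in {-1, +1} (hypothetical).
    rng = np.random.default_rng(seed)
    m = len(y)
    M = m if M is None else M
    learners = []
    for _ in range(T):
        idx = rng.integers(0, m, size=M)   # sample with replacement
        learners.append(fit(X[idx], y[idx]))
    # Majority vote over the ensemble.
    return lambda x: np.sign(sum(h(x) for h in learners))
\end{verbatim}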
165 | \begin{definition}{\textbf{(Random Forest)}}
166 | Motivated by the wisdom-of-the-crowd argument, we build each tree using a random subset of $k$ of the features, where $k$ is usually $\sqrt{d}$ or $\log d$.
167 | \end{definition}
168 | 
169 | \subsection{Boosting}
170 | 
171 | \begin{remark}{\textbf{(Concept of Boosting)}}
172 | For some problems it is easy to find a \correctquote{rule of thumb} that is usually correct, while it is hard to find a single accurate prediction rule. The boosting approach is:
173 | \begin{itemize}
174 | \item Create a computer program for deriving rough rules of thumb.
175 | \item Show it a subset of the examples to obtain a rule of thumb.
176 | \item Repeat $T$ times.
177 | \item Combine the classifiers by a weighted majority vote.
178 | \end{itemize}
179 | There are two concerns. How do we choose the subset of examples at each round? We want to concentrate on the hardest examples. How do we combine the weak learners? This is done by a weighted majority vote.
180 | \end{remark}
181 | 
182 | \begin{definition}{\textbf{(Notation Used in Boosting)}}
183 | We have the following variables:
184 | \begin{itemize}
185 | \item $D_t(i)$: Weight on example $i$ at time $t$, where $\sum^m_{i=1}D_t(i) = 1$
186 | \item $\alpha_t$: Weight on weak learner $t$, where $\alpha_t \in \mathbb{R}$
187 | \item $h_t(\cdot) : \mathbb{R}^d \rightarrow \brackc{-1, +1}$: Weak learner that is generated at time $t$.
188 | \item $f(\cdot) = \sum^T_{t=1}\alpha_th_t(\cdot)$: Weighted combination of the weak learners.
189 | \item $H(\boldsymbol x) = \operatorname{sgn}(f(\boldsymbol x))$: Final classifier.
190 | \item $\varepsilon_t$: Weighted error of weak learner $h_t(\cdot)$ at time $t$:
191 | \begin{equation*}
192 | \varepsilon_t = \sum^m_{i=1}D_t(i)\mathbb{I}[h_t(\boldsymbol x_i) \ne y_i]
193 | \end{equation*}
194 | \item The weak learning algorithm receives as input:
195 | \begin{equation*}
196 | D_t(1),\dots,D_t(m),(\boldsymbol x_1,y_1),\dots,(\boldsymbol x_m, y_m)
197 | \end{equation*}
198 | and will output a weak learner $h_t(\cdot)$ such that $\varepsilon_t < 1/2$
199 | \end{itemize}
200 | \end{definition}
201 | 
202 | \begin{definition}{\textbf{(Adaboost Algorithm)}}
203 | The pseudocode for Adaboost is shown in Algorithm \ref{algo:adaboost}.
204 | \begin{algorithm}[H]
205 | \caption{Adaboost}
206 | \label{algo:adaboost}
207 | \begin{algorithmic}[1]
208 | \State \textbf{Input}: Training set $S = \brackc{(\boldsymbol x_1,y_1),\dots,(\boldsymbol x_m, y_m)}$
209 | \State \textbf{Initialize}: $D_1(1)=\cdots=D_1(m)=1/m$
210 | \For {$t=1,2,\cdots, T$}
211 | \State Fit the classifier $h_t : \mathbb{R}^d\rightarrow\brackc{-1, 1}$ using the distribution $D_t$
212 | \State Choose $\alpha_t \in \mathbb{R}$:
213 | \begin{equation*}
214 | \alpha_t = \frac{1}{2}\log\frac{1-\varepsilon_t}{\varepsilon_t}
215 | \end{equation*}
216 | \State Update for each $i \in [m]$, where $Z_t$ is a normalization factor:
217 | \begin{equation*}
218 | D_{t+1}(i) = \frac{D_t(i)\exp(-\alpha_ty_ih_t(\boldsymbol x_i))}{Z_t}
219 | \end{equation*}
220 | \EndFor
221 | \State \textbf{Return}: Classifier given as:
222 | \begin{equation*}
223 | H(\boldsymbol x) = \operatorname{sgn}\bracka{\sum^T_{t=1}\alpha_th_t(\boldsymbol x)}
224 | \end{equation*}
225 | \end{algorithmic}
226 | \end{algorithm}
227 | Typically $\varepsilon_t \le 1/2$, hence $\alpha_t \ge 0$; thus $f$ is a linear combination of the $h_t$ with weights controlled by their training errors. The basic intuition of Adaboost is that larger weights are assigned to hard examples, so the weak learner will focus on those examples.
228 | \end{definition}
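A minimal sketch of Adaboost in plain \texttt{numpy} (\texttt{weak\_fit} is a hypothetical placeholder for the weak learning algorithm described above):
\begin{verbatim}
import numpy as np

def adaboost(X, y, weak_fit, T=50):
    # X: (m, d); y in {-1, +1}; weak_fit(X, y, D) -> h with h(X) in {-1, +1}.
    m = len(y)
    D = np.full(m, 1.0 / m)                  # D_1(i) = 1/m
    hs, alphas = [], []
    for _ in range(T):
        h = weak_fit(X, y, D)
        eps = np.clip(np.sum(D * (h(X) != y)), 1e-12, 1 - 1e-12)
        alpha = 0.5 * np.log((1 - eps) / eps)
        D = D * np.exp(-alpha * y * h(X))    # up-weight the hard examples
        D = D / D.sum()                      # normalisation factor Z_t
        hs.append(h); alphas.append(alpha)
    return lambda Xn: np.sign(sum(a * h(Xn) for a, h in zip(alphas, hs)))
\end{verbatim}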
229 | 
230 | \begin{theorem}
231 | Given a training set $\brackc{(\boldsymbol x_1,y_1),\dots,(\boldsymbol x_m, y_m)}$, assume that at each iteration of Adaboost the weak learner returns a hypothesis with weighted error $\varepsilon_t \le 1/2-\gamma$; then the training error of the output hypothesis is at most:
232 | \begin{equation*}
233 | \frac{1}{m}\sum^m_{i=1}\mathbb{I}[H(\boldsymbol x_i)\ne y_i]\le\exp(-2\gamma^2T)
234 | \end{equation*}
235 | \end{theorem}
236 | \begin{proof}
237 | Please note that the training error is bounded as:
238 | \begin{equation*}
239 | \frac{1}{m}\sum^m_{i=1}\mathbb{I}[H(\boldsymbol x_i)\ne y_i] \le \frac{1}{m}\sum^m_{i=1}\exp(-y_if(\boldsymbol x_i))
240 | \end{equation*}
241 | where $f = \sum_t \alpha_t h_t$ so that $H(\boldsymbol x) = \operatorname{sgn}(f(\boldsymbol x))$. The inequality follows since $H(\boldsymbol x_i) \ne y_i$ implies $\exp(-y_if(\boldsymbol x_i))\ge1$. Now, unrolling the recursive definition of $D_t$:
242 | \begin{equation*}
243 | D_{T+1}(i) = \frac{1}{m}\frac{\prod^T_{t=1}\exp(-\alpha_ty_ih_t(\boldsymbol x_i))}{\prod^T_{t=1}Z_t}
244 | \end{equation*}
245 | We can use this to expand:
246 | \begin{equation*}
247 | \begin{aligned}
248 | \frac{1}{m}\sum^m_{i=1}\exp(-y_if(\boldsymbol x_i)) &= \frac{1}{m}\sum^m_{i=1}\exp\bracka{-y_i\sum^T_{t=1}\alpha_th_t(\boldsymbol x_i)} \\
249 | &= \frac{1}{m}\sum^m_{i=1}\prod^T_{t=1}\exp(-y_i\alpha_th_t(\boldsymbol x_i)) \\
250 | &= \sum^m_{i=1}D_{T+1}(i)\prod^T_{t=1}Z_t = \prod^T_{t=1}Z_t
251 | \end{aligned}
252 | \end{equation*}
253 | If at each iteration we choose $\alpha_t$ and $h_t$ by minimizing $Z_t$, the final training error of $H$ will be reduced most rapidly. Recall that:
254 | \begin{equation*}
255 | Z_t = \sum^m_{i=1}D_t(i)\exp(-\alpha_ty_ih_t(\boldsymbol x_i))
256 | \end{equation*}
257 | Using the fact that $h_t$ is binary, so that $y_ih_t(\boldsymbol x_i) \in \brackc{-1, +1}$, we have:
258 | \begin{equation*}
259 | \begin{aligned}
260 | Z_t &= \exp(\alpha_t)\sum_{i : y_i \ne h_t(\boldsymbol x_i)}D_t(i) + \exp(-\alpha_t)\sum_{i: y_i = h_t(\boldsymbol x_i)}D_t(i) \\
261 | &= \varepsilon_t\exp(\alpha_t) + (1-\varepsilon_t)\exp(-\alpha_t)
262 | \end{aligned}
263 | \end{equation*}
264 | Setting the derivative of $Z_t$ with respect to $\alpha_t$ to zero gives us the weight:
265 | \begin{equation*}
266 | \alpha_t = \frac{1}{2}\log\frac{1-\varepsilon_t}{\varepsilon_t}
267 | \end{equation*}
268 | Substituting $\alpha_t$ back into $Z_t$, we have:
269 | \begin{equation*}
270 | \begin{aligned}
271 | Z_t &= \varepsilon_t\exp(\alpha_t) + (1-\varepsilon_t)\exp(-\alpha_t) \\
272 | &= 2\sqrt{\varepsilon_t(1-\varepsilon_t)} = \sqrt{1-4\gamma^2_t}
273 | \end{aligned}
274 | \end{equation*}
275 | where $\gamma_t = 1/2-\varepsilon_t$. Hence we have:
276 | \begin{equation*}
277 | \frac{1}{m}\sum^m_{i=1}\mathbb{I}[H(\boldsymbol x_i) \ne y_i] \le \prod^T_{t=1}Z_t = \prod^T_{t=1}\sqrt{1-4\gamma^2_t} \le \exp\bracka{-2\sum^T_{t=1}\gamma_t^2}
278 | \end{equation*}
279 | The final inequality uses the fact that $1-x\le\exp(-x)$. If each weak classifier is slightly better than random guessing ($\gamma_t \ge \gamma$), the training error drops exponentially fast.
280 | \end{proof}
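A quick numerical sanity check in plain \texttt{numpy} that the chosen $\alpha_t$ indeed minimizes $Z_t$ and attains the value $2\sqrt{\varepsilon_t(1-\varepsilon_t)}$:
\begin{verbatim}
import numpy as np

eps = 0.3
Z = lambda a: eps * np.exp(a) + (1 - eps) * np.exp(-a)
a_star = 0.5 * np.log((1 - eps) / eps)

grid = np.linspace(-2.0, 2.0, 100001)
print(abs(grid[np.argmin(Z(grid))] - a_star) < 1e-3)        # True
print(np.isclose(Z(a_star), 2 * np.sqrt(eps * (1 - eps))))  # True
\end{verbatim}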
281 | 
282 | \begin{remark}{\textbf{(Derivation of Adaboost)}}
283 | Boosting can be seen as a greedy way to solve the problem:
284 | \begin{equation*}
285 | \min\brackc{\sum^m_{i=1}V\bracka{y_i, \sum^T_{t=1}\alpha_th_t(\boldsymbol x_i)} : \alpha_1,\dots,\alpha_T\in \mathbb{R}^T, h_1,\dots,h_T \in \mathcal{H}^T}
286 | \end{equation*}
287 | where $\mathcal{H}$ is the hypothesis class containing the weak learners, and the loss function is, for instance, the exponential loss $V(y, \hat{y}) = \exp(-y\hat{y})$. At each iteration, a new basis function is added to the current basis expansion $f^{(t-1)} = \sum^{t-1}_{s=1}\alpha_sh_s$, giving:
288 | \begin{equation*}
289 | (\alpha_t, h_t) = \argmin{\alpha_t, h_t} \sum^m_{i=1}V\Big( y_i, f^{(t-1)}(\boldsymbol x_i) + \alpha_th_t(\boldsymbol x_i) \Big)
290 | \end{equation*}
291 | Note that the previously added basis functions are not re-adjusted at later iterations. In the statistics literature, this kind of model is called a stagewise additive model. To derive Adaboost, substitute $V(y, \hat{y}) = \exp(-y\hat{y})$ and consider the following optimization problem:
292 | \begin{equation*}
293 | \min_{\alpha_t,h_t}\sum^m_{i=1}\exp\bracka{-y_i\bracka{f^{(t-1)}(\boldsymbol x_i) + \alpha_th_t(\boldsymbol x_i)}}
294 | \end{equation*}
295 | Defining $\mathcal{D}_t(i) = \exp(-y_if^{(t-1)}(\boldsymbol x_i))$, we have:
296 | \begin{equation*}
297 | \min_{\alpha_t, h_t} \sum^m_{i=1}\mathcal{D}_t(i)\exp(-\alpha_th_t(\boldsymbol x_i)y_i)
298 | \end{equation*}
299 | This equation can be rewritten as:
300 | \begin{equation*}
301 | \begin{aligned}
302 | \min_{\alpha_t, h_t}&\bracka{\exp(\alpha_t)\sum_{i : y_i \ne h_t(\boldsymbol x_i)} \mathcal{D}_t(i) + \exp(-\alpha_t)\sum_{i:y_i = h_t(\boldsymbol x_i)} \mathcal{D}_t(i) } \\
303 | =&\min_{\alpha_t, h_t}\bracka{ (e^{\alpha_t} - e^{-\alpha_t})\sum^m_{i=1}\mathcal{D}_t(i)\mathbb{I}[y_i \ne h_t(\boldsymbol x_i)] + e^{-\alpha_t}\sum^m_{i=1}\mathcal{D}_t(i) }
304 | \end{aligned}
305 | \end{equation*}
306 | This is exactly Adaboost: $h_t$ minimizes the misclassification error weighted by $\mathcal{D}_t$, which is proportional to Adaboost's $D_t$, and the minimization over $\alpha_t$ is the same as in Adaboost.
307 | \end{remark}
308 | 
309 | \begin{remark}{\textbf{(Classification and Regression)}}
310 | The typical setup of classification is:
311 | \begin{equation*}
312 | \min_{f\in\mathcal{F}} \sum^m_{i=1}V(y_i, f(\boldsymbol x_i)) + \lambda \ \text{complexity}(f)
313 | \end{equation*}
314 | A natural question is why the exponential loss is used here. To make the class of functions $\mathcal{F}$ both rich and smooth, we let $f$ map to $\mathbb{R}$ rather than $\brackc{-1, 1}$ and then predict with its sign. The typical loss functions for $y \in \brackc{-1, +1}$ are:
315 | \begin{itemize}
316 | \item Misclassification Loss: $V_\text{mc}(y, \hat{y}) = \mathbb{I}[y \ne \operatorname{sgn}(\hat{y})]$. It isn't continuous.
317 | \item Hinge Loss: $V_\text{hinge}(y, \hat{y}) = \max(0, 1-y\hat{y})$. It penalizes negative margins but not positive ones; however, it isn't differentiable everywhere.
318 | \item Square Loss: $V_\text{sq}(y, \hat{y}) = (y-\hat{y})^2$. It unnecessarily penalizes predictions with increasingly positive margin.
319 | \item Exponential Loss: $V_\text{exp}(y, \hat{y}) = \exp(-y\hat{y})$. It penalizes negative margins and promotes large positive margins.
320 | \end{itemize}
321 | Thus the exponential loss is chosen.
322 | \end{remark}
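A compact sketch of the four losses as functions of the margin $y\hat{y}$ in plain \texttt{numpy}:
\begin{verbatim}
import numpy as np

def mc_loss(margin):    # I[y != sgn(yhat)], written via the margin y * yhat
    return (margin <= 0).astype(float)

def hinge(margin):      # max(0, 1 - y * yhat)
    return np.maximum(0.0, 1.0 - margin)

def square(margin):     # (y - yhat)^2 = (1 - y * yhat)^2 when y in {-1, +1}
    return (1.0 - margin) ** 2

def exp_loss(margin):   # exp(-y * yhat)
    return np.exp(-margin)

m = np.linspace(-2, 2, 9)
print(np.c_[m, mc_loss(m), hinge(m), square(m), exp_loss(m)])
\end{verbatim}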
323 | 
324 | 
-------------------------------------------------------------------------------- /supervised-learning/contents/part6.tex: --------------------------------------------------------------------------------
1 | \section{Online Learning 2: Bandits}
2 | 
3 | \begin{definition}{\textbf{(Partial Feedback Protocol)}}
4 | We consider the following setting:
5 | \begin{algorithm}[H]
6 | \caption{Partial Feedback Protocol}
7 | \begin{algorithmic}[1]
8 | \For {$t=1,2,\cdots, m$}
9 | \State Predict $\hat{y}_t \in [n]$
10 | \State Observe loss of prediction $l_{t, \hat{y}_t}\in[0, 1]$
11 | \EndFor
12 | \end{algorithmic}
13 | \end{algorithm}
14 | We have the following goal:
15 | \begin{equation*}
16 | \sum^m_{t=1}l_{t, \hat{y}_t} - \min_{i\in[n]}\sum^m_{t=1}l_{t, i} \le o(m)
17 | \end{equation*}
18 | This is the same notion of regret as before. Note, however, that we do not get to see the full loss vector; we only observe the loss of our own prediction.
19 | \end{definition}
20 | 
21 | \begin{definition}{\textbf{(Unbiased Estimation)}}
22 | An estimator $\hat{\theta}$ of a parameter $\theta$ of a distribution, computed from a sample, is unbiased if $\mathbb{E}[\hat{\theta}] = \theta$.
23 | \end{definition}
24 | 
25 | \begin{example}
26 | Suppose $X_1,\dots,X_n$ are iid random variables from a distribution with mean $\mu$; then:
27 | \begin{equation*}
28 | \hat{\theta} = \frac{1}{n}(X_1+\dots+X_n)
29 | \end{equation*}
30 | is an unbiased estimator of $\mu$
31 | \end{example}
32 | 
33 | \begin{example}
34 | Suppose $X$ is a random variable with the discrete uniform distribution over $\brackc{1,\dots,n}$. Suppose $n$ is unknown and we wish to estimate it.
35 | \begin{itemize}
36 | \item The estimator $\hat{\theta}_1 = X$ is the maximum likelihood estimator, since $\mathcal{L}(\theta; X = x) = 1/\theta$ is maximized (subject to $\theta \ge x$) when $\theta = x$. However, it is biased:
37 | \begin{equation*}
38 | \mathbb{E}[\hat{\theta}_1 ; \theta = n] = \sum^n_{x=1}\frac{x}{n} = \frac{n+1}{2}
39 | \end{equation*}
40 | \item On the other hand, $\hat{\theta}_2 = 2X - 1$ is an unbiased estimator, since:
41 | \begin{equation*}
42 | \mathbb{E}[\hat{\theta}_2 ; \theta = n] = \sum^n_{x=1}\frac{1}{n}(2x - 1) = \frac{2}{n}\sum^n_{x=1}x - \sum^n_{x=1}\frac{1}{n} = (n+1) - 1 = n
43 | \end{equation*}
44 | \end{itemize}
45 | \end{example}
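A quick Monte Carlo sanity check of the two estimators in plain \texttt{numpy}:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
n = 10                                      # true (unknown) parameter
X = rng.integers(1, n + 1, size=1_000_000)  # uniform over {1, ..., n}

print(X.mean())             # ~ (n + 1) / 2 = 5.5: theta_1 = X is biased
print((2 * X - 1).mean())   # ~ n = 10: theta_2 = 2X - 1 is unbiased
\end{verbatim}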
46 | 
47 | \begin{remark}{\textbf{(Assumption and Estimation)}}
48 | Suppose we have a distribution $D_i$ over $[0, 1]$ for each of the $i\in[n]$ arms, and for each arm $i$ the losses $l_{t, i}$ are iid samples from $D_i$. Suppose we have played arm $i$ on the trials $S_{t, i}\subseteq[t]$; then:
49 | \begin{equation*}
50 | \hat{\mu}_{t, i} = \frac{1}{|S_{t, i}|}\sum_{j\in S_{t, i}} l_{j, i}
51 | \end{equation*}
52 | is an unbiased estimator of $\mu_i$. Now, we can consider how this is used:
53 | \begin{itemize}
54 | \item We can use a concentration inequality to quantify how far the estimate is likely to deviate from the true parameter.
55 | \item Using this observation, the algorithm UCB balances exploration and exploitation to obtain good regret bounds in this setting.
56 | \item Suppose that the underlying $D_i$ is changing over time (becoming $D_{t, i}$), so the quantity to estimate is:
57 | \begin{equation*}
58 | \mu_{t, i} = \frac{\sum^t_{j=1} \mathbb{E}[l_{j,i}]}{t}
59 | \end{equation*}
60 | which would require $S_{t, i} = [t]$. However, if $S_{t, i} = [t]$ for one arm, then we have no information about the other arms.
61 | \item We therefore need simultaneous unbiased estimates for all of the arms.
62 | \end{itemize}
63 | \end{remark}
64 | 
65 | \begin{definition}{\textbf{(Importance Weighting)}}
66 | We have the following series of observations:
67 | \begin{itemize}
68 | \item Suppose $X$ is a random variable over $\mathbb{R}$ with mean $\mu$. By definition, $\mathbb{E}[X] = \mu$, and $\hat{\theta}_1 = X$ is an unbiased estimator of the mean.
69 | \item Consider a biased coin $Z_p$ (independent of $X$) with outcome $1$ with probability $p$, and define the estimator $\hat{\theta}_0$ to be equal to $X/p$ if $Z_p = 1$ and $0$ otherwise.
70 | \item Its expectation, conditionally on $X$, is equal to:
71 | \begin{equation*}
72 | \mathbb{E}[\hat{\theta}_0 \mid X] = \mathbb{P}(Z_p = 1)(X/p) + \mathbb{P}(Z_p = 0)\cdot 0 = (p)(X/p) + (1-p)(0) = X
73 | \end{equation*}
74 | so $\mathbb{E}[\hat{\theta}_0] = \mathbb{E}[X] = \mu$: the estimator is unbiased.
75 | \end{itemize}
76 | \end{definition}
77 | 
78 | \begin{definition}{\textbf{(Hallucinated Loss Vector)}}
79 | We generalize this to obtain an unbiased estimator of $\boldsymbol l_t$ in the bandit setting. Given $\boldsymbol v_t \in \Delta_n$ with $\hat{y}_t \sim \boldsymbol v_t$, the unbiased estimator $\boldsymbol l^h_t$ of $\boldsymbol l_t$ with respect to $\boldsymbol v_t$ is given as:
80 | \begin{equation*}
81 | \bracka{l^h_{t, i} = \frac{l_{t, i}}{v_{t, i}} \mathbb{I}[i = \hat{y}_t] }_{i \in [n]}
82 | \end{equation*}
83 | \end{definition}
84 | 
85 | \begin{remark}{\textbf{(Expectation of Hallucinated Loss Vector)}}
86 | Observe that $l^h_{t, i}$ is unbiased for all $i\in[n]$, since we have:
87 | \begin{equation*}
88 | \mathbb{E}_{\hat{y}_t \sim \boldsymbol v_t}[l^h_{t, i}] = \sum^n_{j=1}v_{t, j}\frac{l_{t, i}}{v_{t, i}} \mathbb{I}[i = j] = l_{t, i}
89 | \end{equation*}
90 | So we have an unbiased estimator for all arms while only observing a single arm. Applying Hedge to $\boldsymbol l^h_t$ naively requires a bounded loss vector, so we will need a more careful analysis of Hedge.
91 | % \begin{itemize}
92 | % \item We will be given an expected regret bound and there will be some subtleties in the source of randomness.
93 | % \item This will clarify the adversarial model that generates the losses $\boldsymbol l_1,\dots,\boldsymbol l_m$
94 | % \end{itemize}
95 | \end{remark}
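A small simulation in plain \texttt{numpy} confirming the unbiasedness of the hallucinated loss vector:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
n = 4
v = np.array([0.1, 0.2, 0.3, 0.4])   # sampling distribution v_t
l = rng.uniform(size=n)              # the true (hidden) loss vector l_t

R = 200_000
est = np.zeros(n)
for _ in range(R):
    y_hat = rng.choice(n, p=v)         # only one arm is played...
    est[y_hat] += l[y_hat] / v[y_hat]  # ...but l^h is unbiased for all arms
print(est / R)
print(l)                             # the two should roughly agree
\end{verbatim}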
96 | 
97 | \begin{definition}{\textbf{(EXP3)}}
98 | The Exponential-Weight algorithm for Exploration and Exploitation is given by:
99 | \begin{algorithm}[H]
100 | \caption{EXP3}
101 | \begin{algorithmic}[1]
102 | \State \textbf{Initialize}: $\eta \in (0, \infty)$
103 | \State Set $\boldsymbol v_1 = (1/n, \dots, 1/n)$
104 | \For {$t=1,2,\cdots, m$}
105 | \State Sample $\hat{y}_t \sim \boldsymbol v_t$
106 | \State Observe loss $l_{t, \hat{y}_t} \in [0, 1]$
107 | \State Construct the hallucinated loss vector:
108 | \begin{equation*}
109 | \boldsymbol l^h_t = \bracka{l^h_{t, i} = \frac{l_{t, i}}{v_{t, i}} \mathbb{I}[i = \hat{y}_t] }_{i \in [n]}
110 | \end{equation*}
111 | \State Perform the update, for $i\in[n]$ and $Z_t = \sum^n_{i=1}v_{t,i}\exp(-\eta l^h_{t, i})$:
112 | \begin{equation*}
113 | v_{t+1, i} = v_{t, i}\exp(-\eta l^h_{t, i})/Z_t
114 | \end{equation*}
115 | \EndFor
116 | \end{algorithmic}
117 | \end{algorithm}
118 | \end{definition}
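A minimal sketch of EXP3 in plain \texttt{numpy} (the loss matrix here is a made-up iid sequence rather than a true adversary):
\begin{verbatim}
import numpy as np

def exp3(losses, eta, seed=0):
    rng = np.random.default_rng(seed)
    m, n = losses.shape              # m rounds, n arms, losses in [0, 1]
    v = np.full(n, 1.0 / n)
    total = 0.0
    for t in range(m):
        y = rng.choice(n, p=v)       # sample an arm from v_t
        total += losses[t, y]
        lh = np.zeros(n)
        lh[y] = losses[t, y] / v[y]  # hallucinated loss vector
        v = v * np.exp(-eta * lh)
        v = v / v.sum()              # normalisation Z_t
    return total

m, n = 10_000, 5
losses = np.random.default_rng(1).uniform(size=(m, n))
eta = np.sqrt(2 * np.log(n) / (m * n))
print(exp3(losses, eta) - losses.sum(axis=0).min())  # regret vs best arm
\end{verbatim}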
119 | 
120 | \begin{lemma}
121 | For any sequence of loss vectors $\boldsymbol l_1,\dots,\boldsymbol l_m \in [0, 1]^n$, we have the following loss bound:
122 | \begin{equation*}
123 | \sum^m_{t=1}\boldsymbol v_t^T\boldsymbol l^h_t - \sum^m_{t=1}\boldsymbol u^T\boldsymbol l^h_t \le \frac{\ln n}{\eta} + \frac{\eta}{2}\sum^m_{t=1}\sum^n_{i=1}v_{t, i}(l^h_{t, i})^2
124 | \end{equation*}
125 | for all $\boldsymbol u \in \Delta_n$
126 | \end{lemma}
127 | \begin{proof}
128 | The lemma follows from the fact that EXP3 is just Hedge run with $\boldsymbol l_t$ replaced by $\boldsymbol l^h_t$, and this inequality for Hedge was proven before.
129 | \end{proof}
130 | 
131 | \begin{remark}
132 | To turn this into a regret bound for EXP3 with respect to the true losses $\boldsymbol l_t$ rather than the hallucinated losses $\boldsymbol l^h_t$, we consider the following:
133 | \begin{itemize}
134 | \item We have to model the randomness: we use an adversarial model for the losses $\boldsymbol l_1,\dots,\boldsymbol l_m$.
135 | \item We have to bound the term $\sum^m_{t=1}\sum^n_{i=1}v_{t,i}(l^h_{t,i})^2$ and tune $\eta$
136 | \end{itemize}
137 | \end{remark}
138 | 
139 | \begin{definition}{\textbf{(Deterministic Adversarial Model)}}
140 | The adversary sets $\boldsymbol l_1,\dots,\boldsymbol l_m$ before the algorithm is run, and is assumed to have complete prior knowledge of the algorithm. Moreover:
141 | \begin{itemize}
142 | \item The limitation of this near-omniscient adversary is that it is non-adaptive.
143 | \item It may simulate the stochastic model by sampling from $\mathcal{D}_1,\dots,\mathcal{D}_m$ in advance.
144 | \end{itemize}
145 | \end{definition}
146 | 
147 | \begin{theorem}
148 | For any sequence of loss vectors $S = \boldsymbol l_1,\dots,\boldsymbol l_m \in [0, 1]^n$, the regret for EXP3 with $\eta = \sqrt{2\ln n/(mn)}$ is:
149 | \begin{equation*}
150 | \mathbb{E}[L_A(S)] - \min_i L_i \le \sqrt{2mn\ln n}
151 | \end{equation*}
152 | where $L_A(S) = \sum^m_{t=1}l_{t, \hat{y}_t}$ and $L_i = \sum^m_{t=1}l_{t, i}$
153 | \end{theorem}
154 | \begin{proof}
155 | Observe that the only source of randomness is the sampling $\hat{y}_t \sim \boldsymbol v_t$. As previously argued, $\mathbb{E}[l^h_{t, i}] = l_{t, i}$, and so (conditioning on $\boldsymbol v_t$) we have:
156 | \begin{equation*}
157 | \mathbb{E}[\boldsymbol v_t^T\boldsymbol l^h_t] = \sum^n_{i=1}\mathbb{E}[v_{t, i}l^h_{t, i}] = \sum^n_{i=1}v_{t, i}\mathbb{E}[l^h_{t, i}] = \sum^n_{i=1}v_{t, i}l_{t, i} = \mathbb{E}[l_{t, \hat{y}_t}]
158 | \end{equation*}
159 | Similarly, we have:
160 | \begin{equation*}
161 | \mathbb{E}[(l^h_{t, i})^2] = \sum^n_{j=1}v_{t, j}\bracka{\frac{l_{t, i}}{v_{t, i}}}^2 \mathbb{I}[i = j] = v_{t,i}\bracka{\frac{l_{t,i}}{v_{t, i}}}^2 = \frac{l^2_{t, i}}{v_{t, i}}
162 | \end{equation*}
163 | This implies that:
164 | \begin{equation*}
165 | \mathbb{E}\brackb{\sum^n_{i=1} v_{t, i}(l^h_{t, i})^2 } = \sum^n_{i=1}v_{t, i}\frac{l^2_{t, i}}{v_{t, i}} = \sum^n_{i=1}l^2_{t, i} \le n
166 | \end{equation*}
167 | Taking the expectation of the Hedge bound, we have for $\boldsymbol u \in \Delta_n$:
168 | \begin{equation*}
169 | \mathbb{E}\brackb{\sum^m_{t=1}\boldsymbol v_t^T\boldsymbol l^h_t - \sum^m_{t=1}\boldsymbol u^T\boldsymbol l^h_t} \le \mathbb{E}\brackb{\frac{\ln n}{\eta} + \frac{\eta}{2}\sum^m_{t=1}\sum^n_{i=1}v_{t, i}(l^h_{t, i})^2}
170 | \end{equation*}
171 | Using the fact that $\mathbb{E}[l^h_{t, i}] = l_{t, i}$ and the previous results, with $\boldsymbol u$ taken to be a coordinate vector, we have:
172 | \begin{equation*}
173 | \mathbb{E}\brackb{\sum^m_{t=1}\boldsymbol v_t^T\boldsymbol l^h_t} - \min_i\mathbb{E}\brackb{\sum^m_{t=1} l^h_{t, i}} \le \frac{\ln n}{\eta} + \frac{\eta}{2}\mathbb{E}\brackb{\sum^m_{t=1}\sum^n_{i=1}v_{t, i}(l^h_{t, i})^2}
174 | \end{equation*}
175 | And so we have:
176 | \begin{equation*}
177 | \mathbb{E}[L_A(S)] - \min_i L_i(S) \le \frac{\ln n}{\eta} + \frac{\eta}{2}mn
178 | \end{equation*}
179 | Substituting $\eta = \sqrt{2\ln n/(mn)}$ proves the theorem.
180 | \end{proof} 181 | -------------------------------------------------------------------------------- /supervised-learning/supervised-learning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Phutoast/UCL-CSML-Notes/292a123e2dfd17d3ca1a0e268d199cae9632a5b9/supervised-learning/supervised-learning.pdf -------------------------------------------------------------------------------- /supervised-learning/supervised-learning.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage[utf8]{inputenc} 3 | \usepackage{amsmath} 4 | \usepackage{amsthm} 5 | \usepackage{amsfonts} 6 | \usepackage[colorlinks]{hyperref} 7 | \usepackage{natbib} 8 | \usepackage{graphicx} 9 | \usepackage{algorithm} 10 | \usepackage{algpseudocode} 11 | \usepackage{booktabs} 12 | \usepackage{caption} 13 | \usepackage{tikz} 14 | \usepackage{chngpage} 15 | \usepackage{xcolor} 16 | \usepackage{cancel} 17 | 18 | \newtheorem{theorem}{Theorem}[section] 19 | \newtheorem{corollary}{Corollary}[section] 20 | \newtheorem{proposition}{Proposition}[section] 21 | \newtheorem{lemma}{Lemma}[section] 22 | \newtheorem{claim}{Claim}[section] 23 | \newtheorem{conjecture}{Conjecture}[section] 24 | \newtheorem{example}{Example}[section] 25 | 26 | \theoremstyle{definition} 27 | \newtheorem{definition}{Definition}[section] 28 | 29 | \theoremstyle{remark} 30 | \newtheorem{remark}{Remark} 31 | 32 | 33 | \newcommand{\Phu}[1]{{\bf \color{red} [[Phu: #1]]}} 34 | \setlength\parindent{0pt} 35 | \setlength\parskip{5pt} 36 | \usepackage[margin=1.0in]{geometry} 37 | 38 | \newcommand{\dby}{\ \mathrm{d}} 39 | \newcommand{\argmax}[1]{\underset{#1}{\arg\max \ }} 40 | \newcommand{\argmin}[1]{\underset{#1}{\arg\min \ }} 41 | \newcommand{\const}{\text{const.}} 42 | \newcommand{\bracka}[1]{\left( #1 \right)} 43 | \newcommand{\brackb}[1]{\left[ #1 \right]} 44 | \newcommand{\brackc}[1]{\left\{ #1 \right\}} 45 | \newcommand{\brackd}[1]{\left\langle #1 \right\rangle} 46 | \newcommand{\abs}[1]{\left| #1 \right|} 47 | \newcommand{\contractop}{\mathcal{B}} 48 | \newcommand*\circled[1]{\tikz[baseline=(char.base)]{ 49 | \node[shape=circle,draw,inner sep=2pt] (char) {#1};}} 50 | \newcommand{\red}[1]{{\color{red} #1}} 51 | \newcommand{\loss}{\mathcal{L}} 52 | \newcommand{\correctquote}[1]{``#1''} 53 | \newcommand{\norm}[1]{\left\lVert#1\right\rVert} 54 | 55 | % From https://tex.stackexchange.com/questions/194426/split-itemize-into-multiple-columns 56 | \usepackage{etoolbox,refcount} 57 | \usepackage{multicol} 58 | 59 | \newcounter{countitems} 60 | \newcounter{nextitemizecount} 61 | \newcommand{\setupcountitems}{% 62 | \stepcounter{nextitemizecount}% 63 | \setcounter{countitems}{0}% 64 | \preto\item{\stepcounter{countitems}}% 65 | } 66 | \makeatletter 67 | \newcommand{\computecountitems}{% 68 | \edef\@currentlabel{\number\c@countitems}% 69 | \label{countitems@\number\numexpr\value{nextitemizecount}-1\relax}% 70 | } 71 | \newcommand{\nextitemizecount}{% 72 | \getrefnumber{countitems@\number\c@nextitemizecount}% 73 | } 74 | \newcommand{\previtemizecount}{% 75 | \getrefnumber{countitems@\number\numexpr\value{nextitemizecount}-1\relax}% 76 | } 77 | \makeatother 78 | \newenvironment{AutoMultiColItemize}{% 79 | \ifnumcomp{\nextitemizecount}{>}{3}{\begin{multicols}{2}}{}% 80 | \setupcountitems\begin{itemize}}% 81 | {\end{itemize}% 82 | \unskip\computecountitems\ifnumcomp{\previtemizecount}{>}{3}{\end{multicols}}{}} 83 | 84 | 85 | 
\title{Supervised Learning} 86 | \author{Phu Sakulwongtana} 87 | \date{} 88 | 89 | \begin{document} 90 | 91 | \maketitle 92 | 93 | \input{contents/part1.tex} 94 | \input{contents/part2.tex} 95 | \input{contents/part3.tex} 96 | \input{contents/part4.tex} 97 | \input{contents/part5.tex} 98 | \input{contents/part6.tex} 99 | \input{contents/part7.tex} 100 | 101 | % \begin{algorithm}[H] 102 | % \caption{$PSRO_{RN}$} 103 | % \begin{algorithmic}[1] 104 | % \State \textbf{Input}: Initial Population $\mathcal{B}_1$ 105 | % \For {$i=1,2,\cdots, T$} 106 | % \State $p \leftarrow \text{Nash}(A_{\mathcal{B}_i})$ 107 | % \For {agent $v_i$ with positive mass in $p_t$} 108 | % \State $v_{i+1} \leftarrow \text{oracle}(v_i, \sum_{w \in \mathcal{B}_i} p[i](\phi_{v_i}(\cdot))_+)$ 109 | % \EndFor 110 | % \State $\mathcal{B}_{i+1} = \mathcal{B} \cup \{v_{i+1} : \text{as updated above}\}$ 111 | % \EndFor 112 | % \end{algorithmic} 113 | % \end{algorithm} 114 | 115 | % \begin{table}[!h] 116 | % \centering 117 | % \begin{tabular}{lc} 118 | % \toprule 119 | % \textbf{Methods/Metrics} & \textbf{Accuracy} \\ 120 | % \midrule 121 | % Logistic Regression & $48.26 \pm 0.0f0$ \\ 122 | % Support Vector Machine & $48.91 \pm 0.00$ \\ 123 | % Random Forest Classifier & $44.38 \pm 1.57$ \\ 124 | % \midrule 125 | % Multi-Dimensional ELO & $34.51 \pm 3.12$ \\ 126 | % TrueSkill\texttrademark & $44.99 \pm 0.00$ \\ 127 | % \bottomrule 128 | % \end{tabular} 129 | 130 | % \caption{} 131 | 132 | % \label{table} 133 | % \end{table} 134 | 135 | % \begin{AutoMultiColItemize} 136 | % \item Item 1 137 | % \item Item 2 138 | % \item Item 3 139 | % \item Item 4 140 | % \item Item 5 141 | % \item Item 6 142 | % \end{AutoMultiColItemize} 143 | 144 | 145 | % \bibliographystyle{plain} 146 | % \bibliography{references} 147 | \end{document} 148 | --------------------------------------------------------------------------------