├── .gitignore ├── 01.Introduction ├── 1.intro-2.pdf ├── 1.intro-4.pdf ├── 1.intro.pdf ├── cvxbook.png ├── main.tex ├── mathmlbook.png └── probcover-2nd.png ├── 02.LinearAlgebra ├── 2.LA-2.pdf ├── 2.LA-4.pdf ├── 2.LA.pdf ├── L2_affine.png ├── L2_affine_linear.png ├── L2_basischange.png ├── L2_coordinate.png ├── L2_image_kernel.png ├── L2_rank_nullity.png ├── L2_vector_ex.png └── main.tex ├── 03.Geometry ├── 3.AG-2.pdf ├── 3.AG-4.pdf ├── 3.AG.pdf ├── L3_gramschmidt.png ├── L3_ocomp.png ├── L3_projection_1D.png ├── L3_projection_affine.png ├── L3_projection_ex.png └── main.tex ├── 04.MatrixDecomposition ├── 4.MD-2.pdf ├── 4.MD-4.pdf ├── 4.MD.pdf ├── L4_SVD_matrix.png ├── L4_UTM_LTM.png ├── L4_cofactor_ex.png ├── L4_eigendecomposition.png ├── L4_ev_ex1.png ├── L4_ev_ex2.png ├── L4_ev_ex3.png ├── L4_ev_ex4.png ├── L4_ev_ex5.png ├── L4_matrix_approx.png ├── L4_matrix_tree.png └── main.tex ├── 05.VectorCaculus ├── 5.VC-2.pdf ├── 5.VC-4.pdf ├── 5.VC.pdf ├── L5_computation_graph.png ├── L5_grad_matrix_1.png ├── L5_grad_matrix_2.png ├── L5_grad_matrix_3.png ├── L5_useful.png └── main.tex ├── 06.Probability ├── 6.PD-2.pdf ├── 6.PD-4.pdf ├── 6.PD.pdf ├── L6_CDF_ex1.png ├── L6_CDF_ex2.png ├── L6_RV_ex.png ├── L6_binomial_ex.png ├── L6_condind_ex.png ├── L6_cov_ex.png ├── L6_cov_notind.png ├── L6_exp_pdf.png ├── L6_gaussian_formula.png ├── L6_geo_ex.png ├── L6_joint_ex.png ├── L6_marginal_conditional.png ├── L6_needle.png ├── L6_pdf_delta.png ├── L6_pdf_ex.png ├── L6_pdf_uniform_ex.png ├── L6_pmf_ex.png ├── L6_total_ex.png ├── L6_tworolls.png ├── L6_uniform_ex.png └── main.tex ├── 07.Optimization ├── 7.OPT-2.pdf ├── 7.OPT-4.pdf ├── 7.OPT.pdf ├── L7_convex_conjugate.png ├── L7_convex_fn.png ├── L7_convex_set_ex1.png ├── L7_convex_set_ex2.png ├── L7_first_condition.png ├── L7_gradient_ex.png ├── L7_halfspace.png ├── L7_separating.png ├── L7_supporting.png └── main.tex ├── 08.Model_Data ├── 8.MMD-2.pdf ├── 8.MMD-4.pdf ├── 8.MMD.pdf ├── L10_latent.png ├── L8_all_gmodels.png ├── L8_coinflip.png ├── L8_cross_validation.png ├── L8_dsep.png ├── L8_fittings.png ├── L8_gmodel_ex1.png ├── L8_gmodel_ex2.png ├── L8_lung_cancer.png ├── L8_model_class.png ├── L8_model_function.png ├── L8_model_pmodel.png ├── L8_nested_cross_validation.png └── main.tex ├── 09.LinearRegression ├── 9.LR-2.pdf ├── 9.LR-4.pdf ├── 9.LR.pdf ├── L9_LR_gmodel.png ├── L9_bayesian_regression.png ├── L9_overfit_linear.png ├── L9_poly4fit.png ├── L9_posterior_predictive_ex.png ├── L9_regression_ex.png ├── L9_training_test.png └── main.tex ├── 10.PCA ├── 10.PCA-2.pdf ├── 10.PCA-4.pdf ├── 10.PCA.pdf ├── L10_PCA_onepicture.png ├── L10_dr_ex.png ├── L10_latent.png ├── L10_mnist.png ├── L10_pca_algorithm.png ├── L10_pca_picture.png ├── L10_variance_diff.png └── main.tex ├── 11.DensityEstimation ├── 11.GMM-2.pdf ├── 11.GMM-4.pdf ├── 11.GMM.pdf ├── L11_Gaussian_fail.png ├── L11_em_ex.png ├── L11_gm_ex.png ├── L11_gmm_gm.png └── main.tex ├── 12.SVM ├── 12.SVM-2.pdf ├── 12.SVM-4.pdf ├── 12.SVM.pdf ├── L12_disthyper.png ├── L12_halfspace.png ├── L12_hingeloss.png ├── L12_kernel_ex.png ├── L12_soft_hard_svm.png ├── L12_softsvm_geo.png ├── dist_hyperplane.pptx └── main.tex ├── compile.sh ├── kaist_ee.png ├── mydefault.tex ├── myhead.tex ├── mymacro.tex ├── mymath.tex └── print.sh /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | *.log 3 | *.toc 4 | *.snm 5 | *.out 6 | *.nav 7 | *.aux 8 | *.vrb 9 | 01.Introduction/.DS_Store 10 | -------------------------------------------------------------------------------- 
/01.Introduction/1.intro-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/01.Introduction/1.intro-2.pdf -------------------------------------------------------------------------------- /01.Introduction/1.intro-4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/01.Introduction/1.intro-4.pdf -------------------------------------------------------------------------------- /01.Introduction/1.intro.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/01.Introduction/1.intro.pdf -------------------------------------------------------------------------------- /01.Introduction/cvxbook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/01.Introduction/cvxbook.png -------------------------------------------------------------------------------- /01.Introduction/main.tex: -------------------------------------------------------------------------------- 1 | %\pdfminorversion=4 2 | \documentclass[handout,fleqn,aspectratio=169]{beamer} 3 | 4 | \input{../myhead} 5 | 6 | 7 | 8 | \title[]{Lecture 1: Introduction} 9 | \author{Yi, Yung (이융)} 10 | \institute{Mathematics for Machine Learning\\ 11 | \url{https://yung-web.github.io/home/courses/mathml.html} 12 | \\KAIST EE} 13 | \date{\today} 14 | 15 | \input{../mymath} 16 | \input{../mymacro} 17 | 18 | \begin{document} 19 | 20 | \input{../mydefault} 21 | 22 | % START START START START START START START START START START START START START 23 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 24 | \begin{frame}{Textbook} 25 | 26 | \begin{center} 27 | \begin{tabular}{ccc} 28 | \includegraphics[width=2.0cm]{mathmlbook.png} & 29 | \includegraphics[width=2.1cm]{cvxbook.png} & 30 | \includegraphics[width=2.4cm]{probcover-2nd.png} 31 | \end{tabular} 32 | \end{center} 33 | 34 | %\small 35 | \vspace{-0.4cm} 36 | \plitemsep 0.02in 37 | 38 | \bci 39 | \item Mathematics for Machine Learning\footnote{The entire textbook can be downloaded at \url{https://mml-book.github.io/}}, Cambridge University Press, Marc Peter Deisenroth, A. Aldo Faisal, and Cheng Soon Ong 40 | \item Other books 41 | \bci 42 | \item Convex Optimization, Cambridge University Press, by Stephen Boyd and Lieven Vandenberghe 43 | \item Introduction to Probability, 2nd edition, Athena Scientific, by Dimitri P. Bertsekas and John N. 
Tsitsiklis 44 | \eci 45 | \eci 46 | 47 | \end{frame} 48 | 49 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 50 | \begin{frame}{Organization} 51 | 52 | \plitemsep 0.03in 53 | 54 | \bci 55 | \item Part I: Math 56 | \bce 57 | \item Linear Algebra 58 | \item Analytic Geometry 59 | \item Matrix Decomposition 60 | \item Vector Calculus 61 | \item Probability and Distributions 62 | \item Optimization 63 | \ece 64 | 65 | \medskip 66 | \item Part II: 4 Basic Machine Learning Problems 67 | \bce 68 | \item When Models Meet Data 69 | 70 | \item Dimensionality Reduction with Principal Component Analysis 71 | 72 | \item Density Estimation with Gaussian Mixture Models 73 | 74 | \item Classification with Support Vector Machines 75 | \ece 76 | 77 | \eci 78 | \end{frame} 79 | 80 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 81 | \begin{frame}{Suggestions on Course Schedules} 82 | 83 | Total 16 weeks 84 | \vspace{-0.2cm} 85 | \plitemsep 0.01in 86 | 87 | \bci 88 | \item Part I: Math 89 | \bce 90 | \item Linear Algebra \hfill (2 weeks) 91 | \item Analytic Geometry \hfill (1 week) 92 | \item Matrix Decomposition \hfill(1 week) 93 | \item Vector Calculus \hfill(1 week) 94 | \item Probability and Distributions \hfill(2 weeks) 95 | \item Optimization \hfill(2 weeks) 96 | \ece 97 | 98 | \item Part II: 4 Basic Machine Learning Problems 99 | \bce 100 | \item When Models Meet Data \hfill(1 week) 101 | 102 | \item Dimensionality Reduction with Principal Component Analysis \hfill(1 week) 103 | 104 | \item Density Estimation with Gaussian Mixture Models \hfill(1 week) 105 | 106 | \item Classification with Support Vector Machines \hfill(1 week) 107 | \ece 108 | 109 | \item Total 13 weeks + Midterm (1 week) + Final (1 week) + Extra (1 week) 110 | \eci 111 | \end{frame} 112 | 113 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 114 | \begin{frame}{Target Audience} 115 | 116 | \plitemsep 0.1in 117 | 118 | \bci 119 | 120 | \item Undergraduate 121 | \bci 122 | \item They may have a partial background in the math (e.g., only vector calculus + linear algebra). Depending on the students' background, the amount of time for math can be adjusted. 123 | 124 | \item Some mathematical parts may need to be presented with some degree of rigor, e.g., with proofs. 125 | \eci 126 | 127 | \item Graduate 128 | \bci 129 | \item Graduate students have typically already taken the basic math courses on linear algebra, vector calculus, probability, and optimization, but they often have almost no background in machine learning. 130 | \item The math parts can simply be reviewed with minimal proofs, and additional ML problems can be added to the course, so that they can have more exposure to the ML part. 131 | \eci 132 | \eci 133 | \end{frame} 134 | 135 | % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 136 | % \begin{frame}{} 137 | % \vspace{2cm} 138 | % \LARGE How to use the downloaded latex source files 139 | 140 | % \end{frame} 141 | 142 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 143 | \begin{frame}{File Organization} 144 | 145 | \plitemsep 0.1in 146 | 147 | \bci 148 | 149 | \item In each chapter, there is a {\tt main.tex} which you can compile. 150 | 151 | \item Common files for all chapters 152 | \bci 153 | \item {\tt myhead.tex}: common headers, e.g., including necessary packages 154 | \item {\tt mydefault.tex}: default values of many latex environments 155 | \item {\tt mymacro.tex}: macros related to linear algebra, e.g., matrix, transpose, inverse, etc 156 | \item {\tt mymath.tex}: other misc.
math macros 157 | \item {\tt compile.sh}: shell script which compiles and generate the pdfs of all chapters 158 | \item {\tt print.sh}: shell script which generates the pdfs of 2/1, 4/1 printed formats 159 | \eci 160 | 161 | \item Just type "./compile.sh" if you want to get all the pdfs\footnote{Please make compile.sh and print.sh executable, if not, by typing {\tt chmod u+x compile.sh}}. 162 | \eci 163 | \end{frame} 164 | 165 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 166 | \begin{frame}[fragile]{Slide vs. Handout} 167 | 168 | \plitemsep 0.1in 169 | 170 | \bci 171 | 172 | \item Handout 173 | 174 | \begin{verbatim} 175 | \documentclass[handout,fleqn,aspectratio=169]{beamer} 176 | \end{verbatim} 177 | 178 | \item Slide 179 | \begin{verbatim} 180 | \documentclass[fleqn,aspectratio=169]{beamer} 181 | \end{verbatim} 182 | 183 | \item Difference between Handout and Slide? If you want to use the functionality of ``beamer overlay" to add animations to the slides, you need to compile without handout option. Please visit the following url if you are interested. 184 | \medskip 185 | \url{https://youtu.be/kkM_VPSM8kA} 186 | 187 | % \item Using shell scripts: run {\tt make_slide.sh} or {\tt make_handout.sh} 188 | 189 | % \item Using mode.tex file 190 | % \begin{verbatim} 191 | % \documentclass[fleqn,aspectratio=169]{beamer} 192 | % \end{verbatim} 193 | 194 | 195 | \eci 196 | \end{frame} 197 | 198 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 199 | \begin{frame}[fragile]{Letter vs. A4} 200 | 201 | \plitemsep 0.3in 202 | 203 | \bci 204 | 205 | \item A4 206 | In the {\tt myhead.tex} file: 207 | 208 | \medskip 209 | {\scriptsize 210 | \begin{verbatim} 211 | \usepackage{pgfpages} 212 | \pgfpagesuselayout{resize to}[a4paper,landscape,border shrink=5mm] 213 | \end{verbatim} 214 | } 215 | 216 | \item Letter 217 | In the {\tt myhead.tex} file: 218 | 219 | \medskip 220 | {\scriptsize 221 | \begin{verbatim} 222 | \usepackage{pgfpages} 223 | \pgfpagesuselayout{resize to}[letterpaper,landscape,border shrink=5mm] 224 | \end{verbatim} 225 | } 226 | 227 | \eci 228 | \end{frame} 229 | 230 | 231 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 232 | \begin{frame}{Basic Notations} 233 | 234 | \plitemsep 0.1in 235 | 236 | \bci 237 | \item Scalars: $a,b,c,\alpha,\beta,\gamma$ 238 | 239 | \item Vectors: $\vec{x},\vec{y},\vec{z}$ 240 | 241 | \item Matrices: $\mat{X},\mat{Y},\mat{Z}$ 242 | 243 | \item Sets: $\set{A}, \set{B}, \set{C}$ 244 | 245 | \item (Ordered) tuple: $B=(\bm{b}_1, \bm{b}_2, \bm{b}_3)$ 246 | 247 | \item Matrix of column vectors: $\mat{B} = [\vec{b}_1, \vec{b}_2, \vec{b}_3]$ or 248 | $\mB = \rowvec{\vb_1 & \vb_2 & \vb_3}$ 249 | 250 | \item Set of vectors: $\set{B} = \sets{\vec{b}_1, \vec{b}_2, \vec{b}_3}$ 251 | 252 | \item $\real,$ $\complex,$ $\integer,$ $\natu,$ $\real^n$, etc 253 | 254 | \item Probability: We use both $p(\cdot)$, $\prob{\cdot}$. 255 | \eci 256 | \end{frame} 257 | 258 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 259 | \begin{frame}{} 260 | \vspace{2cm} 261 | \LARGE Enjoy! 262 | 263 | \bigskip 264 | \large When you modify the latex files for your convenience, if you have any question on macros or pdf generation, feel free to send an email to \url{yiyung@gmail.com}. 
265 | \end{frame} 266 | 267 | % \begin{frame}{Review Questions} 268 | % % \tableofcontents 269 | % %\plitemsep 0.1in 270 | % \bce[1)] 271 | % \item 272 | 273 | % \ece 274 | % \end{frame} 275 | 276 | 277 | \end{document} 278 | -------------------------------------------------------------------------------- /01.Introduction/mathmlbook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/01.Introduction/mathmlbook.png -------------------------------------------------------------------------------- /01.Introduction/probcover-2nd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/01.Introduction/probcover-2nd.png -------------------------------------------------------------------------------- /02.LinearAlgebra/2.LA-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/02.LinearAlgebra/2.LA-2.pdf -------------------------------------------------------------------------------- /02.LinearAlgebra/2.LA-4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/02.LinearAlgebra/2.LA-4.pdf -------------------------------------------------------------------------------- /02.LinearAlgebra/2.LA.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/02.LinearAlgebra/2.LA.pdf -------------------------------------------------------------------------------- /02.LinearAlgebra/L2_affine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/02.LinearAlgebra/L2_affine.png -------------------------------------------------------------------------------- /02.LinearAlgebra/L2_affine_linear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/02.LinearAlgebra/L2_affine_linear.png -------------------------------------------------------------------------------- /02.LinearAlgebra/L2_basischange.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/02.LinearAlgebra/L2_basischange.png -------------------------------------------------------------------------------- /02.LinearAlgebra/L2_coordinate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/02.LinearAlgebra/L2_coordinate.png -------------------------------------------------------------------------------- /02.LinearAlgebra/L2_image_kernel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/02.LinearAlgebra/L2_image_kernel.png -------------------------------------------------------------------------------- /02.LinearAlgebra/L2_rank_nullity.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/02.LinearAlgebra/L2_rank_nullity.png -------------------------------------------------------------------------------- /02.LinearAlgebra/L2_vector_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/02.LinearAlgebra/L2_vector_ex.png -------------------------------------------------------------------------------- /03.Geometry/3.AG-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/03.Geometry/3.AG-2.pdf -------------------------------------------------------------------------------- /03.Geometry/3.AG-4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/03.Geometry/3.AG-4.pdf -------------------------------------------------------------------------------- /03.Geometry/3.AG.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/03.Geometry/3.AG.pdf -------------------------------------------------------------------------------- /03.Geometry/L3_gramschmidt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/03.Geometry/L3_gramschmidt.png -------------------------------------------------------------------------------- /03.Geometry/L3_ocomp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/03.Geometry/L3_ocomp.png -------------------------------------------------------------------------------- /03.Geometry/L3_projection_1D.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/03.Geometry/L3_projection_1D.png -------------------------------------------------------------------------------- /03.Geometry/L3_projection_affine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/03.Geometry/L3_projection_affine.png -------------------------------------------------------------------------------- /03.Geometry/L3_projection_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/03.Geometry/L3_projection_ex.png -------------------------------------------------------------------------------- /03.Geometry/main.tex: -------------------------------------------------------------------------------- 1 | %\pdfminorversion=4 2 | \documentclass[handout,fleqn,aspectratio=169]{beamer} 3 | 4 | \input{../myhead} 5 | 6 | 7 | \title[]{Lecture 3: Analytic Geometry} 8 | \author{Yi, Yung (이융)} 9 | \institute{Mathematics for Machine Learning\\ \url{https://yung-web.github.io/home/courses/mathml.html} 10 | \\KAIST EE} 11 | \date{\today} 12 | 13 | 
\input{../mymath} 14 | \input{../mymacro} 15 | 16 | \begin{document} 17 | 18 | \input{../mydefault} 19 | 20 | 21 | % START START START START START START START START START START START START START 22 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 23 | \begin{frame}{Roadmap} 24 | 25 | \plitemsep 0.1in 26 | 27 | \bce[(1)] 28 | \item Norms 29 | 30 | \item Inner Products 31 | 32 | \item Lengths and Distances 33 | 34 | \item Angles and Orthogonality 35 | 36 | \item Orthonormal Basis 37 | 38 | \item Orthogonal Complement 39 | 40 | \item Inner Product of Functions 41 | 42 | \item Orthogonal Projections 43 | 44 | \item Rotations 45 | 46 | \ece 47 | \end{frame} 48 | 49 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 50 | \section{L3(1)} 51 | \begin{frame}{Roadmap} 52 | 53 | \plitemsep 0.1in 54 | 55 | \bce[(1)] 56 | \item \redf{Norms} 57 | 58 | \item \grayf{Inner Products 59 | 60 | \item Lengths and Distances 61 | 62 | \item Angles and Orthogonality 63 | 64 | \item Orthonormal Basis 65 | 66 | \item Orthogonal Complement 67 | 68 | \item Inner Product of Functions 69 | 70 | \item Orthogonal Projections 71 | 72 | \item Rotations} 73 | 74 | \ece 75 | \end{frame} 76 | 77 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 78 | \begin{frame}{Norm} 79 | 80 | \plitemsep 0.1in 81 | 82 | \bci 83 | \item A notion of the length of vectors 84 | 85 | \item \defi A norm on a vector space $V$ is a function $\norm{\cdot}: V \mapsto \real,$ such that for all $\lambda \in \real$ and $\vec{x}, \vec{y} \in V$ the following hold: 86 | 87 | \bci 88 | \item \bluef{Absolutely homogeneous}: $\norm{\lambda \vec{x}} = |\lambda| \norm{\vec{x}}$ 89 | \item \bluef{Triangle inequality}: $\norm{\vec{x} + \vec{y}} \le \norm{\vec{x}} + \norm{\vec{y}} $ 90 | \item \bluef{Positive definite}: $\norm{\vec{x}} \ge 0$ and $\norm{\vec{x}} = 0 \Longleftrightarrow \vec{x} = \vec{0}$ 91 | \eci 92 | \eci 93 | \end{frame} 94 | 95 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 96 | \begin{frame}{Example for $V = \real^n$} 97 | 98 | \plitemsep 0.1in 99 | 100 | \bci 101 | \item \bluef{Manhattan Norm} (also called $\ell_1$ norm). For $\vec{x}= [x_1, \cdots, x_n] \in \real^n,$ 102 | $$ 103 | \norm{\vec{x}}_1 \eqdef \sum_{i=1}^n |x_i| 104 | $$ 105 | \item \bluef{Euclidean Norm} (also called $\ell_2$ norm). For $\vec{x} \in \real^n,$ 106 | $$ 107 | \norm{\vec{x}}_2 \eqdef \sqrt{\sum_{i=1}^n x_i^2} = \sqrt{\trans{\vec{x}} \vec{x}} 108 | $$ 109 | 110 | \eci 111 | \end{frame} 112 | 113 | 114 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 115 | \begin{frame}{Roadmap} 116 | 117 | \plitemsep 0.1in 118 | 119 | \bce[(1)] 120 | \item \grayf{Norms} 121 | 122 | \item \redf{Inner Products} 123 | 124 | \item \grayf{Lengths and Distances 125 | 126 | \item Angles and Orthogonality 127 | 128 | \item Orthonormal Basis 129 | 130 | \item Orthogonal Complement 131 | 132 | \item Inner Product of Functions 133 | 134 | \item Orthogonal Projections 135 | 136 | \item Rotations} 137 | 138 | \ece 139 | \end{frame} 140 | 141 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 142 | \section{L3(2)} 143 | \begin{frame}{Motivation} 144 | 145 | \plitemsep 0.1in 146 | 147 | \bci 148 | \item Need to talk about the length of a vector and the angle or distance between two vectors, where vectors are defined in abstract vector spaces 149 | 150 | \item To this end, we define the notion of \bluef{inner product} in an abstract manner. 151 | 152 | \item Dot product: A kind of inner product in vector space $\real^n$.
$\trans{\vec{x}} \vec{y} = \sum_{i=1}^n x_i y_i$ 153 | 154 | 155 | \bigskip 156 | \item \question How can we generalize this and do a similar thing in some other vector spaces? 157 | \eci 158 | \end{frame} 159 | 160 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 161 | \begin{frame}{Formal Definition} 162 | 163 | \plitemsep 0.1in 164 | 165 | \bci 166 | \item An inner product is a mapping $\inner{\cdot}{\cdot}: V \times V \mapsto \real$ that satisfies the following conditions for all vectors $\vec{u},\vec{v},\vec{w} \in V$ and all scalars $\lambda \in \real$: 167 | 168 | \medskip 169 | \bce 170 | \item $\inner{\vec{u}+ \vec{v}}{\vec{w}} = \inner{\vec{u}}{\vec{w}} + \inner{\vec{v}}{\vec{w}}$ 171 | \item $\inner{\lambda \vec{v}}{\vec{w}} = \lambda \inner{\vec{v}}{\vec{w}}$ 172 | \item $\inner{\vec{v}}{\vec{w}} = \inner{\vec{w}}{\vec{v}}$ 173 | \item $\inner{\vec{v}}{\vec{v}} \ge 0$ with equality iff $\vec{v}=\vec{0}$ 174 | \ece 175 | \medskip 176 | 177 | \item The pair $(V,\inner{\cdot}{\cdot})$ is called an \bluef{inner product space}. 178 | 179 | \eci 180 | \end{frame} 181 | 182 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 183 | \begin{frame}{Example} 184 | 185 | \plitemsep 0.3in 186 | 187 | \bci 188 | 189 | \item \exam $V=\real^n$ and the dot product $\inner{\vec{x}}{\vec{y}} \eqdef \trans{\vec{x}}\vec{y}$ 190 | 191 | \item \exam $V=\real^2$ and $\inner{\vec{x}}{\vec{y}} \eqdef x_1y_1 - (x_1y_2 + x_2y_1) + 2x_2y_2$ 192 | 193 | \item \exam $V=\{\text{continuous functions in $\real$ over $[a,b]$} \},$ $\inner{u}{v} \eqdef \int_a^b u(x)v(x) dx$ 194 | \eci 195 | \end{frame} 196 | 197 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 198 | \begin{frame}{Symmetric, Positive Definite Matrix} 199 | 200 | \plitemsep 0.1in 201 | 202 | \bci 203 | \item A square matrix $\mat{A} \in \real^{n \times n}$ that satisfies the following is called \bluef{symmetric, positive definite} (or just positive definite): 204 | $$ 205 | \forall \vec{x} \in V \setminus \{\vec{0} \}: \trans{\vec{x}} \mat{A} \vec{x} > 0. 206 | $$ 207 | If only $\ge$ in the above holds, then $\mat{A}$ is called \bluef{symmetric, positive semidefinite.} 208 | 209 | \bigskip 210 | \item $\mat{A}_1 = \begin{nmat} 211 | 9 & 6 \cr 212 | 6 & 5 213 | \end{nmat} 214 | $ is positive definite. 215 | 216 | \item $\mat{A}_2 = \begin{nmat} 217 | 9 & 6 \cr 218 | 6 & 3 219 | \end{nmat} 220 | $ is not positive definite. 221 | 222 | 223 | 224 | \eci 225 | \end{frame} 226 | 227 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 228 | \begin{frame}{Inner Product and Positive Definite Matrix (1)} 229 | 230 | \plitemsep 0.2in 231 | 232 | \bci 233 | \item Consider an $n$-dimensional vector space $V$ with an inner product $\inner{\cdot}{\cdot}$ and an ordered basis $B=(\vec{b}_1, \ldots, \vec{b}_n)$ of $V.$ 234 | 235 | \item Any $\vec{x},\vec{y} \in V$ can be represented as: $\vec{x}=\sum_{i=1}^n \psi_i \vec{b}_i$ and $\vec{y}=\sum_{j=1}^n \lambda_j \vec{b}_j$ for some $\psi_i$ and $\lambda_j,$ $i,j=1, \ldots, n.$ 236 | \aleq{ 237 | \inner{\vec{x}}{\vec{y}} = \inner{\sum_{i=1}^n \psi_i\vec{b}_i}{\sum_{j=1}^n \lambda_j \vec{b}_j} = 238 | \sum_{i=1}^n \sum_{j=1}^n \psi_i \inner{\vec{b}_i}{\vec{b}_j} \lambda_j = \trans{\hat{\vec{x}}} \mat{A} \hat{\vec{y}}, 239 | } 240 | where $\mat{A}_{ij} = \inner{\vec{b}_i}{\vec{b}_j}$ and $\hat{\vec{x}}$ and $\hat{\vec{y}}$ are the coordinates w.r.t.
$B.$ 241 | 242 | \eci 243 | \end{frame} 244 | 245 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 246 | \begin{frame}{Inner Product and Positive Definite Matrix (2)} 247 | 248 | \plitemsep 0.2in 249 | 250 | \bci 251 | 252 | \item Then, if $\forall \vec{x} \in V \setminus \{\vec{0} \}: \trans{\vec{x}} \mat{A} \vec{x} > 0$ (i.e., $\mat{A}$ is symmetric, positive definite), $\bluef{\trans{\hat{\vec{x}}} \mat{A} \hat{\vec{y}}}$ legitimately defines an inner product (w.r.t. $B$) 253 | 254 | \item Properties 255 | \bci 256 | \item The kernel of $\mat{A}$ is only $\{\vec{0} \}$, because $\trans{\vec{x}} \mat{A} \vec{x} > 0$ for all $\vec{x} \neq \vec{0} \implies$ $\mat{A} \vec{x} \neq \vec{0}$ if $\vec{x} \neq \vec{0}.$ 257 | \item The diagonal elements $a_{ii}$ of $\mat{A}$ are all positive, because $a_{ii} = \trans{\vec{e}_i} \mat{A} \vec{e}_i >0.$ 258 | \eci 259 | \eci 260 | \end{frame} 261 | 262 | 263 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 264 | \section{L3(3)} 265 | \begin{frame}{Roadmap} 266 | 267 | \plitemsep 0.1in 268 | 269 | \bce[(1)] 270 | \item \grayf{Norms 271 | 272 | \item Inner Products} 273 | 274 | \item \redf{Lengths and Distances 275 | 276 | \item Angles and Orthogonality} 277 | 278 | \item \grayf{Orthonormal Basis 279 | 280 | \item Orthogonal Complement 281 | 282 | \item Inner Product of Functions 283 | 284 | \item Orthogonal Projections 285 | 286 | \item Rotations} 287 | 288 | \ece 289 | \end{frame} 290 | 291 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 292 | 293 | \begin{frame}{Length} 294 | 295 | \plitemsep 0.2in 296 | 297 | \bci 298 | 299 | \item An inner product naturally induces a norm by defining: 300 | $$ 301 | \norm{\vec{x}} \eqdef \sqrt{\inner{\vec{x}}{\vec{x}}} 302 | $$ 303 | 304 | \item Not every norm is induced by an inner product 305 | 306 | \item \redf{Cauchy-Schwarz inequality.} For the norm induced by the inner product, 307 | $$ 308 | |\inner{\vec{x}}{\vec{y}}| \le \norm{\vec{x}} \ \norm{\vec{y}} 309 | $$ 310 | 311 | \eci 312 | \end{frame} 313 | 314 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 315 | \begin{frame}{Distance} 316 | 317 | \plitemsep 0.1in 318 | 319 | \bci 320 | 321 | \item Now, we can introduce a notion of distance using a norm as: 322 | 323 | \medskip 324 | \redf{Distance}. $d(\vec{x},\vec{y}) \eqdef \norm{\vec{x} - \vec{y}} = \sqrt{\inner{\vec{x}-\vec{y}}{\vec{x}-\vec{y}}} $ 325 | 326 | \item If the dot product is used as an inner product in $\real^n,$ it is the \bluef{Euclidean distance.} 327 | 328 | \item \redf{Note.} The distance between two vectors does \bluef{NOT} necessarily require the notion of a norm. A norm is just sufficient. 329 | 330 | \item Generally, if the following is satisfied, it is a suitable notion of distance, called a \bluef{metric}. 331 | \bci 332 | \item \bluef{\em Positive definite}. $d(\vec{x},\vec{y}) \ge 0$ for all $\vec{x},\vec{y}$ and $d(\vec{x},\vec{y}) = 0 \Longleftrightarrow \vec{x}=\vec{y}$ 333 | \item \bluef{\em Symmetric}. $d(\vec{x},\vec{y}) = d(\vec{y},\vec{x})$ 334 | \item \bluef{\em Triangle inequality}.
$d(\vec{x},\vec{z}) \le d(\vec{x},\vec{y}) + d(\vec{y},\vec{z})$ 335 | \eci 336 | \eci 337 | \end{frame} 338 | 339 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 340 | \section{L3(4)} 341 | \begin{frame}{Angle, Orthogonal, and Orthonormal} 342 | 343 | \plitemsep 0.1in 344 | 345 | \bci 346 | 347 | \item Using C-S inequality, $$-1 \le \frac{\inner{\vec{x}}{\vec{y}}}{\norm{\vec{x}} \ \norm{\vec{y}}} \le 1$$ 348 | 349 | \item Then, there exists a unique $\omega \in [0,\pi]$ with $$\cos \omega = \frac{\inner{\vec{x}}{\vec{y}}}{\norm{\vec{x}} \ \norm{\vec{y}}}$$ 350 | 351 | \item We define $\omega$ as the \bluef{angle} between $\vec{x}$ and $\vec{y}.$ 352 | 353 | \item \defi If $\inner{\vec{x}}{\vec{y}} = 0,$ in other words their angle is $\pi/2,$ we say that they are \bluef{orthogonal}, denoted by $\vec{x} \perp \vec{y}.$ Additionally, if $\norm{x} = \norm{y} =1,$ they are \bluef{orthonormal}. 354 | \eci 355 | \end{frame} 356 | 357 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 358 | \begin{frame}{Example} 359 | 360 | \plitemsep 0.15in 361 | 362 | \bci 363 | 364 | \item Orthogonality is defined by a given inner product. Thus, different inner products may lead to different results about orthogonality. 365 | 366 | \item \exam Consider two vectors $\vec{x}=\colvec{1 \\1 }$ and $\vec{y}=\colvec{-1 \\ 1 }$ 367 | 368 | \item Using the dot product as the inner product, they are orthogonal. 369 | 370 | \item However, using $\inner{\vec{x}}{\vec{y}} = \trans{\vec{x}} 371 | \begin{nmat} 372 | 2 & 0 \cr 373 | 0 & 1 374 | \end{nmat} \vec{y}$, they are not orthogonal. 375 | \aleq{ 376 | \cos \omega = \frac{\inner{\vec{x}}{\vec{y}}}{\norm{\vec{x}} \ \norm{\vec{y}}} = -\frac{1}{3} \implies \omega \approx 1.91 \text{ rad } \approx 109.5\text{\textdegree} 377 | } 378 | \eci 379 | 380 | 381 | \end{frame} 382 | 383 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 384 | \begin{frame}{Orthogonal Matrix} 385 | 386 | \plitemsep 0.05in 387 | 388 | \bci 389 | 390 | \item \defi A square matrix $\mat{A} \in \real^{n \times n}$ is an \bluef{orthogonal matrix}, iff its columns (or rows) are \bluef{orthonormal} so that 391 | $$ 392 | \mat{A} \trans{\mat{A}} = I = \trans{\mat{A}}\mat{A}, \text{ implying } \inv{\mat{A}} = \trans{\mat{A}}. 393 | $$ 394 | \vspace{-0.3cm} 395 | \bci 396 | \item We can use \bluef{$\inv{\mat{A}} = \trans{\mat{A}}$} for the definition of orthogonal matrices. 397 | \item Fact 1. $\mA,\mB$: orthogonal $\implies$ $\mA\mB$: orthogonal 398 | \item Fact 2. 
$\mA$: orthogonal $\implies$ $\det(\mA) = \pm 1$ 399 | \eci 400 | 401 | 402 | \item The linear mapping $\Phi$ by orthogonal matrices preserves \bluef{length} and \bluef{angle} (for the dot product) 403 | \aleq{ 404 | \norm{\Phi(\vec{x})}^2 = \norm{\mat{A}\vec{x}}^2 = \trans{(\mat{A}\vec{x})} (\mat{A} \vec{x}) = \trans{\vec{x}} \trans{\mat{A}} \mat{A} \vec{x} = \trans{\vec{x}} \vec{x} = \norm{\vec{x}}^2 405 | } 406 | \vspace{-0.7cm} 407 | \aleq{ 408 | \cos \omega = \frac{\trans{(\mat{A}\vec{x})} (\mat{A}\vec{y})}{\norm{\mat{A}\vec{x}} \ \norm{\mat{A}\vec{y}}} = 409 | \frac 410 | { 411 | \trans{\vec{x}} \trans{\mat{A}} \mat{A} \vec{y} 412 | } 413 | { 414 | \sqrt{\trans{\vec{x}} \trans{\mat{A}} \mat{A} \vec{x} \trans{\vec{y}} \trans{\mat{A}} \mat{A} \vec{y} 415 | } 416 | } 417 | = \frac{\trans{\vec{x}} \vec{y}}{\norm{\vec{x}} \ \norm{\vec{y}}} 418 | } 419 | 420 | \eci 421 | \end{frame} 422 | 423 | 424 | 425 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 426 | \section{L3(5)} 427 | \begin{frame}{Roadmap} 428 | 429 | \plitemsep 0.1in 430 | 431 | \bce[(1)] 432 | \item \grayf{Norms 433 | 434 | \item Inner Products 435 | 436 | \item Lengths and Distances 437 | 438 | \item Angles and Orthogonality} 439 | 440 | \item \redf{Orthonormal Basis 441 | 442 | \item Orthogonal Complement 443 | 444 | \item Inner Product of Functions} 445 | 446 | \item \grayf{Orthogonal Projections 447 | 448 | \item Rotations} 449 | 450 | \ece 451 | \end{frame} 452 | 453 | 454 | 455 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 456 | \begin{frame}{Orthonormal Basis} 457 | 458 | \plitemsep 0.1in 459 | 460 | \bci 461 | 462 | \item A basis that is orthonormal, i.e., the basis vectors are all orthogonal to each other and their lengths are 1. 463 | 464 | \item The standard basis in $\real^n,$ $\{\vec{e}_1, \ldots, \vec{e}_n \},$ is orthonormal. 465 | 466 | 467 | \item \question How to obtain an orthonormal basis? 468 | 469 | \bigskip 470 | \mycolorbox{ 471 | \item[1.] Use Gaussian elimination to find a basis for a vector space spanned by a set of vectors. 472 | \bci 473 | \item Given a set $\{\vec{b}_1, \ldots, \vec{b}_n \}$ of non-orthogonal and unnormalized basis vectors, apply Gaussian elimination to the augmented matrix $(\mat{B}\trans{\mat{B}}|\mat{B})$ 474 | \eci 475 | 476 | \item[2.] Constructive way: Gram-Schmidt process (we will cover this later) 477 | } 478 | \eci 479 | \end{frame} 480 | 481 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 482 | \section{L3(6)} 483 | \begin{frame}{Orthogonal Complement (1)} 484 | 485 | \plitemsep 0.1in 486 | 487 | \bci 488 | 489 | \item Consider a $D$-dimensional vector space $V$ and an $M$-dimensional subspace $U \subset V.$ The \bluef{orthogonal complement} $\ocomp{U}$ is a $(D-M)$-dimensional subspace of $V$ and contains all vectors in $V$ that are orthogonal to every vector in $U.$ 490 | 491 | \item $U \cap \ocomp{U} = \{\vec{0}\}$ 492 | 493 | \item Any vector $\vec{x} \in V$ can be uniquely decomposed into: 494 | \aleq{ 495 | \vec{x} = \sum_{m=1}^M \lambda_m \vec{b}_m + \sum_{j=1}^{D-M} \psi_j \ocomp{\vec{b}}_j, \quad \lambda_m, \psi_j \in \real, 496 | } 497 | where $(\vec{b}_1, \ldots, \vec{b}_M)$ and $(\ocomp{\vec{b}}_1, \ldots, \ocomp{\vec{b}}_{D-M} )$ are the \bluef{bases} of $U$ and $\ocomp{U},$ respectively.
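% Added illustrative example (a minimal sketch using the standard basis of $\real^3$; not in the original slides):
\item \exam In $V = \real^3$ with the dot product, take $U = \spn{\vec{e}_1, \vec{e}_2}$ (so $D=3$, $M=2$); then $\ocomp{U} = \spn{\vec{e}_3}$, and $\vec{x} = \colvec{2\\3\\5}$ is uniquely decomposed as $\vec{x} = (2\vec{e}_1 + 3\vec{e}_2) + 5\vec{e}_3,$ with $2\vec{e}_1 + 3\vec{e}_2 \in U$ and $5\vec{e}_3 \in \ocomp{U}.$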
498 | \eci 499 | \end{frame} 500 | 501 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 502 | \begin{frame}{Orthogonal Complement (2)} 503 | 504 | \plitemsep 0.1in 505 | 506 | \vspace{-0.3cm} 507 | \begin{center} 508 | \mypic{0.35}{L3_ocomp.png} 509 | \end{center} 510 | \vspace{-0.5cm} 511 | \bci 512 | \item The vector $\vw$ with $\norm{\vw}=1,$ which is orthogonal to $U$, is the basis of $\ocomp{U}.$ 513 | \item Such $\vw$ is called \bluef{normal vector} to $U.$ 514 | 515 | \item For a linear mapping represented by a matrix $\mat{A} \in \real^{m \times n},$ the solution space of $\mat{A} \vec{x} =0$ is $\ocomp{\text{row}(\mat{A})},$ where $\text{row}(\mat{A})$ is the row space of $\mat{A}$ (i.e., span of row vectors). 516 | 517 | In other words, $\ocomp{\text{row}(\mat{A})} = \ker(\mat{A})$ 518 | \eci 519 | 520 | 521 | \end{frame} 522 | 523 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 524 | \section{L3(7)} 525 | \begin{frame}{Inner Product of Functions} 526 | 527 | \plitemsep 0.15in 528 | 529 | \bci 530 | 531 | \item \redf{Remind:} $V=\{\text{continuous functions in $\real$ over $[a,b]$} \},$ the following is a proper inner product. 532 | \bluef{$$\inner{u}{v} \eqdef \int_a^b u(x)v(x) dx$$} 533 | 534 | \item \exam Choose $u(x) = \sin(x)$ and $v(x)= \cos(x),$ where we select $a=-\pi$ and $b=\pi.$ Then, since $f(x) = u(x)v(x)$ is odd (i.e., $f(-x) = -f(x)$), 535 | $$ 536 | \int_{-\pi}^\pi u(x) v(x) dx =0. 537 | $$ 538 | 539 | \item Thus, $u$ and $v$ are orthogonal. 540 | 541 | \item Similarly, $\{1, \cos(x), \cos(2x), \cos(3x), \ldots, \}$ is orthogonal over $[-\pi,\pi].$ 542 | \eci 543 | 544 | 545 | \end{frame} 546 | 547 | 548 | 549 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 550 | \section{L3(8)} 551 | \begin{frame}{Roadmap} 552 | 553 | \plitemsep 0.1in 554 | 555 | \bce[(1)] 556 | \item \grayf{Norms 557 | 558 | \item Inner Products 559 | 560 | \item Lengths and Distances 561 | 562 | \item Angles and Orthogonality 563 | 564 | \item Orthonormal Basis 565 | 566 | \item Orthogonal Complement 567 | 568 | \item Inner Product of Functions} 569 | 570 | \item \redf{Orthogonal Projections} 571 | 572 | \item \grayf{Rotations} 573 | 574 | \ece 575 | \end{frame} 576 | 577 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 578 | \begin{frame}{Projection: Motivation} 579 | 580 | \plitemsep 0.05in 581 | 582 | \bci 583 | 584 | \item Big data: high dimensional 585 | 586 | \item However, most information is contained in a few dimensions 587 | 588 | \item \bluef{Projection}: A process of reducing the dimensions (hopefully) without loss of much information\footnote{In \lecturemark{L10}, we will formally study this with the topic of PCA (Principal Component Analysis).} 589 | 590 | \item \exam Projection of 2D dataset onto 1D subspace 591 | 592 | \centering 593 | \mypic{0.4}{L3_projection_ex.png} 594 | \eci 595 | 596 | 597 | \end{frame} 598 | 599 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 600 | \begin{frame}{Projection onto Lines (1D Subspaces)} 601 | 602 | \plitemsep 0.1in 603 | 604 | \bci 605 | \item Consider a 1D subspace $U \subset \real^n$ spanned by the basis $\vec{b}.$ 606 | 607 | \item For $\vx \in \realn,$ what is its projection \bluef{$\pi_U(\vec{x})$} onto $U$ (assume the dot product)? 
608 | \myvartwocols{0.3}{0.7}{0.29} 609 | { 610 | \small 611 | \aleq{ 612 | &\inner{\vec{x} - \pi_U(\vec{x})}{\vec{b}} = 0 \xleftrightarrow{\pi_U(\vec{x}) = \lambda \vec{b}} \inner{\vec{x} - \lambda \vec{b}}{\vec{b}}=0\cr 613 | & \implies \lambda = \frac{\inner{\vec{b}}{\vec{x}}}{\norm{\vec{b}}^2} = \frac{\trans{\vec{b}}\vec{x}}{\norm{\vec{b}}^2}, \ \text{and} \ \pi_U(\vec{x}) = \lambda \vec{b} = \bluef{\frac{\trans{\vec{b}}\vec{x}}{\norm{\vec{b}}^2} \vec{b}} 614 | } 615 | } 616 | { 617 | \vspace{-0.2cm} 618 | \mypic{0.8}{L3_projection_1D.png} 619 | } 620 | \vspace{-0.5cm} 621 | \item Projection matrix \redf{$\mat{P}_\pi \in \realnn$} in $\pi_U(\vec{x}) = \mat{P}_\pi \vec{x}$ 622 | \aleq{ 623 | \pi_U(\vec{x}) = \lambda \vec{b} = \vec{b} \lambda = \frac{\vec{b}\trans{\vec{b}}}{\norm{\vec{b}}^2} \vec{x}, \quad \mat{P}_\pi = \bluef{\frac{\vec{b}\trans{\vec{b}}}{\norm{\vec{b}}^2}} 624 | } 625 | \eci 626 | 627 | \end{frame} 628 | 629 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 630 | \begin{frame}{Inner Product and Projection} 631 | 632 | \plitemsep 0.1in 633 | 634 | \bci 635 | \item We project $\vx$ onto $\vb$, and let $\pi_{\vb}(\vx)$ be the projected vector. 636 | 637 | 638 | \item \question Understanding the inner project $\inner{\vx}{\vb}$ from the projection perspective? 639 | \mycolorbox{ 640 | $$ 641 | \inner{\vx}{\vb} = \norm{\pi_{\vb}(\vx)} \times \norm{\vb} 642 | $$ 643 | } 644 | \mytwocols{0.4} 645 | { 646 | \item In other words, the inner product of $\vx$ and $\vb$ is the product of (\bluef{length of the projection of $\vx$ onto $\vb$}) $\times$ (\bluef{length of $\vb$}) 647 | } 648 | { 649 | \vspace{-0.2cm} 650 | \mypic{0.6}{L3_projection_1D.png} 651 | } 652 | 653 | \eci 654 | 655 | \end{frame} 656 | 657 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 658 | \begin{frame}{Example} 659 | 660 | \plitemsep 0.1in 661 | 662 | \bci 663 | \item $\vec{b} = \colvec{1 \\ 2 \\ 2}$ 664 | \aleq{ 665 | \mat{P}_\pi = \frac{\vec{b}\trans{\vec{b}}}{\norm{\vec{b}}^2} = \frac{1}{9}\colvec{1\\2\\2}\rowvec{1 & 2 & 2} = \frac{1}{9} 666 | \begin{nmat} 667 | 1&2&2 \cr 668 | 2&4&4 \cr 669 | 2&4&4 670 | \end{nmat} 671 | } 672 | For $\vec{x} = \colvec{1\\1\\1},$ 673 | \aleq{ 674 | \pi_U(\vec{x}) = \mat{P}_\pi \vec{x} = \frac{1}{9} 675 | \begin{nmat} 676 | 1&2&2 \cr 677 | 2&4&4 \cr 678 | 2&4&4 679 | \end{nmat} \colvec{1\\1\\1} = \frac{1}{9} \colvec{5\\10\\10} \in \spn{\colvec{1\\2\\2}} 680 | } 681 | \eci 682 | 683 | \end{frame} 684 | 685 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 686 | \begin{frame}{Projection onto General Subspaces} 687 | 688 | \plitemsep 0.1in 689 | 690 | 691 | %\item Compare the results: 692 | 693 | \mytwocols{0.4} 694 | { 695 | \bci 696 | \item $\realn \rightarrow$ 1-Dim 697 | \item A basis vector $\vec{b}$ in 1D subspace 698 | \eci 699 | \centering 700 | $$ 701 | \pi_U(\vec{x}) = \bluef{\frac{\vec{b}\trans{\vec{b}}\vec{x}}{\trans{\vec{b}}\vec{b}}}, \ \lambda = \frac{\trans{\vec{b}}\vec{x}}{\trans{\vec{b}}\vec{b}} 702 | $$ 703 | $$ 704 | \mat{P}_\pi = \redf{\frac{\vec{b}\trans{\vec{b}}}{\trans{\vec{b}}\vec{b} }} 705 | $$ 706 | } 707 | { 708 | \bci 709 | \item $\realn \rightarrow$ $m$-Dim, $(m < n)$ 710 | \item A basis matrix $B=\rowvec{\vec{b}_1, \cdots, \vec{b}_m} \in \real^{n \times m}$ 711 | \eci 712 | $$ 713 | \pi_U(\vec{x}) = \bluef{\mB\inv{(\trans{\mB}\mB)}\trans{\mB} \vec{x}}, \ 714 | \vlam = \inv{(\trans{\mB}\mB)}\trans{\mB} \vec{x} 715 | $$ 716 | $$ 717 | \mat{P}_\pi = \redf{\mB\inv{(\trans{\mB}\mB)}\trans{\mB} } 718 | $$ 719 | } 720 | 
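% Added derivation sketch (source comment only; it follows the same orthogonality argument as the 1-D case above):
% requiring $\vec{x} - \mB\vlam \perp U$, i.e., $\trans{\mB}(\vec{x} - \mB\vlam) = \vec{0}$, gives the normal equation
% $\trans{\mB}\mB\vlam = \trans{\mB}\vec{x}$, hence $\vlam = \inv{(\trans{\mB}\mB)}\trans{\mB}\vec{x}$ and
% $\pi_U(\vec{x}) = \mB\vlam$, assuming $\mB$ has full column rank so that $\trans{\mB}\mB$ is invertible.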
\vspace{-0.4cm} 721 | \bci 722 | \item $\lambda \in \real^{1}$ and $\vlam \in \realm$ are the coordinates in the projected spaces, respectively. 723 | \item $\inv{(\trans{\mB}\mB)}\trans{\mB}$ is called the \bluef{pseudo-inverse}. 724 | \item The derivation is analogous to the case of 1-D lines (see p. 71). 725 | \eci 726 | \end{frame} 727 | 728 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 729 | \begin{frame}{Example: Projection onto 2D Subspace} 730 | 731 | \plitemsep 0.1in 732 | \small 733 | \bci 734 | \item $U = \spn{\colvec{1\\1\\1}, \colvec{0\\1\\2}} \subset \real^3$ and $\vec{x} = \colvec{6\\0\\0}$. Check that $\{ \trans{\rowvec{1&1&1}}, \trans{\rowvec{0&1&2}}\}$ is a basis. 735 | \item Let $\mat{B} = \begin{nmat} 736 | 1&0\cr 737 | 1&1\cr 738 | 1&2 739 | \end{nmat}.$ Then, $\trans{\mat{B}}\mat{B} = 740 | \begin{nmat} 741 | 1&1&1\cr 742 | 0&1&2 743 | \end{nmat} 744 | \begin{nmat} 745 | 1&0\cr 746 | 1&1\cr 747 | 1&2 748 | \end{nmat} 749 | = 750 | \begin{nmat} 751 | 3&3\cr 752 | 3&5 753 | \end{nmat} 754 | $ 755 | \item One can see that $\mat{P}_\pi = \mB\inv{(\trans{\mB}\mB)}\trans{\mB} = \dfrac{1}{6} 756 | \begin{nmat} 757 | 5&2&-1\cr 758 | 2&2&2\cr 759 | -1&2&5 760 | \end{nmat} 761 | $, and $\pi_U(\vec{x}) = \dfrac{1}{6} 762 | \begin{nmat} 763 | 5&2&-1\cr 764 | 2&2&2\cr 765 | -1&2&5 766 | \end{nmat} \colvec{6\\0\\0} = \colvec{5\\2\\-1}$ 767 | 768 | \eci 769 | 770 | \end{frame} 771 | 772 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 773 | \begin{frame}{Gram-Schmidt Orthogonalization Method (G-S method)} 774 | 775 | \plitemsep 0.05in 776 | 777 | \bci 778 | \item Constructively transform any basis $(\vb_1, \ldots, \vb_n)$ of an $n$-dimensional vector space $V$ into an orthogonal/orthonormal basis $(\vu_1, \ldots, \vu_n)$ of $V$ 779 | 780 | \item Iteratively construct it as follows: 781 | \mycolorbox{ 782 | \vspace{-0.2cm} 783 | \aleq{ 784 | \vu_1 &\eqdef \vb_1 \cr 785 | \vu_k &\eqdef \vb_k - \pi_{\text{span}[\vu_1, \ldots, \vu_{k-1}]}(\vb_k), \ k=2, \ldots, n \qquad \qquad (*) 786 | } 787 | } 788 | %\item In $(*)$ 789 | % \mytwocols{0.3} 790 | % { 791 | % \bci 792 | % \item $\pi_{\text{span}[\vu_1, \ldots, \vu_{k-1}]}(\vb_k)$: projection of $\vb_k$ onto the subspace spanned by $[\vu_1, \ldots, \vu_{k-1}]$ 793 | % \item Then, $\vu_k$ becomes orthogonal to $\text{span}[\vu_1, \ldots, \vu_{k-1}]$ 794 | % \eci 795 | % } 796 | % { 797 | % } 798 | \eci 799 | \vspace{-0.3cm} 800 | \mypic{0.9}{L3_gramschmidt.png} 801 | 802 | \end{frame} 803 | 804 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 805 | \begin{frame}{Example: G-S method} 806 | 807 | \plitemsep 0.15in 808 | 809 | \bci 810 | \item A basis $(\vb_1, \vb_2)$ of $\real^2,$ $\vb_1 = \colvec{2 \\ 0}$ and $\vb_2 = \colvec{1 \\1}$ 811 | 812 | \item $\vu_1 = \vb_1 = \colvec{2 \\ 0}$ and 813 | \aleq{ 814 | \vu_2 = \vb_2 - \pi_{\text{span}[\vu_1]}(\vb_2) = \vb_2 - \frac{\vu_1\trans{\vu_1}}{\norm{\vu_1}^2} \vb_2 815 | = \colvec{1\\1} - \begin{nmat} 816 | 1 & 0 \cr 817 | 0 & 0 818 | \end{nmat} 819 | \colvec{1 \\1} = \colvec{0 \\1} 820 | } 821 | 822 | \item $\vu_1$ and $\vu_2$ are orthogonal. If we want them to be orthonormal, then just normalization would do the job.
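% Added worked step (a small sketch continuing the example above):
\item For instance, normalizing gives $\frac{\vu_1}{\norm{\vu_1}} = \colvec{1 \\ 0}$ and $\frac{\vu_2}{\norm{\vu_2}} = \colvec{0 \\ 1},$ an orthonormal basis of $\real^2.$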
823 | \eci 824 | \end{frame} 825 | 826 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 827 | \begin{frame}{Projection onto Affine Subspaces} 828 | 829 | \begin{center} 830 | \mypic{0.7}{L3_projection_affine.png} 831 | \end{center} 832 | 833 | \plitemsep 0.05in 834 | \vspace{-0.5cm} 835 | \bci 836 | \item Affine space: $L = \vec{x}_0 + U$ 837 | \item Affine subspaces are not vector spaces 838 | \item Idea: (i) move $\vec{x}$ to a point in $U$, (ii) do the projection, (iii) move back to $L$ 839 | \bluef{$$\pi_L(\vec{x}) = \vec{x}_0 + \pi_{U}(\vec{x} - \vec{x}_0)$$} 840 | \eci 841 | 842 | \end{frame} 843 | 844 | 845 | 846 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 847 | \section{L3(9)} 848 | \begin{frame}{Roadmap} 849 | 850 | \plitemsep 0.1in 851 | 852 | \bce[(1)] 853 | \item \grayf{Norms 854 | 855 | \item Inner Products 856 | 857 | \item Lengths and Distances 858 | 859 | \item Angles and Orthogonality 860 | 861 | \item Orthonormal Basis 862 | 863 | \item Orthogonal Complement 864 | 865 | \item Inner Product of Functions 866 | 867 | \item Orthogonal Projections} 868 | 869 | \item \redf{Rotations} 870 | 871 | \ece 872 | \end{frame} 873 | 874 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 875 | \begin{frame}{Rotation} 876 | 877 | \plitemsep 0.07in 878 | 879 | \bci 880 | \item Length and angle preservation: two properties of linear mappings with \bluef{orthogonal matrices}. Let's look at some of their special cases. 881 | 882 | \item A linear mapping that rotates the given coordinate system by an angle $\theta.$ 883 | 884 | \item Basis change 885 | \item $\vec{e}_1 = \colvec{1 \\ 0} \rightarrow \colvec{\cos\theta \\ \sin\theta}$ and $\vec{e}_2 = \colvec{0 \\ 1} \rightarrow \colvec{-\sin\theta \\ \cos\theta}$ 886 | 887 | \item Rotation matrix $\vec{R}(\theta) = \begin{nmat} 888 | \cos\theta & -\sin\theta \cr 889 | \sin\theta & \cos\theta 890 | \end{nmat}$ 891 | 892 | \item Properties 893 | \bci 894 | \item Preserves distance: $\norm{\vec{x} - \vec{y}} = \norm{\mat{R}_\theta(\vec{x}) - \mat{R}_\theta(\vec{y})}$ 895 | \item Preserves angle 896 | \eci 897 | \eci 898 | 899 | \end{frame} 900 | 901 | 902 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 903 | \begin{frame}{} 904 | \vspace{2cm} 905 | \LARGE Questions? 
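% Added example for the rotation slides above (source comment only; a small sketch):
% for $\theta = \pi/2$, $\mat{R}(\pi/2) = \begin{nmat} 0 & -1 \cr 1 & 0 \end{nmat}$ maps
% $\vec{e}_1 \mapsto \vec{e}_2$ and $\vec{e}_2 \mapsto -\vec{e}_1$, and, being orthogonal,
% it preserves lengths, angles, and hence distances.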
906 | 907 | 908 | \end{frame} 909 | 910 | \begin{frame}{Review Questions} 911 | % \tableofcontents 912 | %\plitemsep 0.1in 913 | \bce[1)] 914 | \item 915 | 916 | \ece 917 | \end{frame} 918 | 919 | 920 | \end{document} 921 | -------------------------------------------------------------------------------- /04.MatrixDecomposition/4.MD-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/04.MatrixDecomposition/4.MD-2.pdf -------------------------------------------------------------------------------- /04.MatrixDecomposition/4.MD-4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/04.MatrixDecomposition/4.MD-4.pdf -------------------------------------------------------------------------------- /04.MatrixDecomposition/4.MD.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/04.MatrixDecomposition/4.MD.pdf -------------------------------------------------------------------------------- /04.MatrixDecomposition/L4_SVD_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/04.MatrixDecomposition/L4_SVD_matrix.png -------------------------------------------------------------------------------- /04.MatrixDecomposition/L4_UTM_LTM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/04.MatrixDecomposition/L4_UTM_LTM.png -------------------------------------------------------------------------------- /04.MatrixDecomposition/L4_cofactor_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/04.MatrixDecomposition/L4_cofactor_ex.png -------------------------------------------------------------------------------- /04.MatrixDecomposition/L4_eigendecomposition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/04.MatrixDecomposition/L4_eigendecomposition.png -------------------------------------------------------------------------------- /04.MatrixDecomposition/L4_ev_ex1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/04.MatrixDecomposition/L4_ev_ex1.png -------------------------------------------------------------------------------- /04.MatrixDecomposition/L4_ev_ex2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/04.MatrixDecomposition/L4_ev_ex2.png -------------------------------------------------------------------------------- /04.MatrixDecomposition/L4_ev_ex3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/04.MatrixDecomposition/L4_ev_ex3.png 
-------------------------------------------------------------------------------- /04.MatrixDecomposition/L4_ev_ex4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/04.MatrixDecomposition/L4_ev_ex4.png -------------------------------------------------------------------------------- /04.MatrixDecomposition/L4_ev_ex5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/04.MatrixDecomposition/L4_ev_ex5.png -------------------------------------------------------------------------------- /04.MatrixDecomposition/L4_matrix_approx.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/04.MatrixDecomposition/L4_matrix_approx.png -------------------------------------------------------------------------------- /04.MatrixDecomposition/L4_matrix_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/04.MatrixDecomposition/L4_matrix_tree.png -------------------------------------------------------------------------------- /05.VectorCaculus/5.VC-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/05.VectorCaculus/5.VC-2.pdf -------------------------------------------------------------------------------- /05.VectorCaculus/5.VC-4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/05.VectorCaculus/5.VC-4.pdf -------------------------------------------------------------------------------- /05.VectorCaculus/5.VC.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/05.VectorCaculus/5.VC.pdf -------------------------------------------------------------------------------- /05.VectorCaculus/L5_computation_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/05.VectorCaculus/L5_computation_graph.png -------------------------------------------------------------------------------- /05.VectorCaculus/L5_grad_matrix_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/05.VectorCaculus/L5_grad_matrix_1.png -------------------------------------------------------------------------------- /05.VectorCaculus/L5_grad_matrix_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/05.VectorCaculus/L5_grad_matrix_2.png -------------------------------------------------------------------------------- /05.VectorCaculus/L5_grad_matrix_3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/05.VectorCaculus/L5_grad_matrix_3.png -------------------------------------------------------------------------------- /05.VectorCaculus/L5_useful.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/05.VectorCaculus/L5_useful.png -------------------------------------------------------------------------------- /05.VectorCaculus/main.tex: -------------------------------------------------------------------------------- 1 | %\pdfminorversion=4 2 | \documentclass[handout,fleqn,aspectratio=169]{beamer} 3 | 4 | \input{../myhead} 5 | 6 | 7 | 8 | \title[]{Lecture 5: Vector Calculus} 9 | \author{Yi, Yung (이융)} 10 | \institute{Mathematics for Machine Learning\\ \url{https://yung-web.github.io/home/courses/mathml.html} 11 | \\KAIST EE} 12 | \date{\today} 13 | 14 | \input{../mymath} 15 | \input{../mymacro} 16 | 17 | \begin{document} 18 | 19 | \input{../mydefault} 20 | 21 | % START START START START START START START START START START START START START 22 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 23 | \begin{frame}{Roadmap} 24 | 25 | \plitemsep 0.1in 26 | 27 | \bce[(1)] 28 | \item Differentiation of Univariate Functions 29 | 30 | \item Partial Differentiation and Gradients 31 | 32 | \item Gradients of Vector-Valued Functions 33 | 34 | \item Gradients of Matrices 35 | 36 | \item Useful Identities for Computing Gradients 37 | 38 | \item Backpropagation and Automatic Differentiation 39 | 40 | \item Higher-Order Derivatives 41 | 42 | \item Linearization and Multivariate Taylor Series 43 | 44 | \ece 45 | \end{frame} 46 | 47 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 48 | \begin{frame}{Summary} 49 | 50 | \plitemsep 0.1in 51 | 52 | \bci 53 | \item Machine learning is about solving an optimization problem whose variables are the parameters of a given model. 54 | 55 | \item Solving optimization problems require gradient information. 
56 | 57 | \item Central to this chapter is the concept of the function, which we often write 58 | 59 | \aleq{ 60 | f : \real^{D} \mapsto \real\cr 61 | \vec{x} \mapsto f(\vec{x}) 62 | } 63 | 64 | \eci 65 | \end{frame} 66 | 67 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 68 | \section{L5(1)} 69 | \begin{frame}{Roadmap} 70 | 71 | \plitemsep 0.1in 72 | 73 | \bce[(1)] 74 | \item \redf{Differentiation of Univariate Functions} 75 | 76 | \item \grayf{Partial Differentiation and Gradients 77 | 78 | \item Gradients of Vector-Valued Functions 79 | 80 | \item Gradients of Matrices 81 | 82 | \item Useful Identities for Computing Gradients 83 | 84 | \item Backpropagation and Automatic Differentiation 85 | 86 | \item Higher-Order Derivatives 87 | 88 | \item Linearization and Multivariate Taylor Series} 89 | 90 | \ece 91 | \end{frame} 92 | 93 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 94 | \begin{frame}{Difference Quotient and Derivative} 95 | 96 | \plitemsep 0.3in 97 | 98 | \bci 99 | \item \redf{Difference Quotient.} The average slope of $f$ between $x$ and $x+\partial x$ 100 | 101 | \aleq{ 102 | \pd{y}{x} \eqdef \frac{f(x+\partial x) - f(x)}{\partial x} 103 | } 104 | 105 | \item \redf{Derivative.} Pointing in the direction of steepest ascent of $f.$ 106 | 107 | \aleq{ 108 | \d{f}{x} \eqdef \lim_{h \rightarrow 0} \frac{f(x+h)-f(x)}{h} 109 | } 110 | 111 | \item Unless confusion arises, we often use $f' = \d{f}{x}.$ 112 | \eci 113 | \end{frame} 114 | 115 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 116 | \begin{frame}{Taylor Series} 117 | 118 | \plitemsep 0.1in 119 | 120 | \bci 121 | 122 | \item Representation of a function as an infinite sum of terms, using derivatives of evaluated at $x_0.$ 123 | 124 | \item \redf{Taylor polynomial.} The Taylor polynomial of degree $n$ of $f : \real \mapsto \real$ at $x_0$ is: 125 | \aleq{ 126 | T_n(x) \eqdef \sum_{k=0}^n \frac{f^{(k)}(x_0)}{k!} (x-x_0)^k, \ \text{where $f^{(k)}(x_0)$ is the $k$th derivative of $f$ at $x_0.$} 127 | } 128 | 129 | \item \redf{Taylor Series.} For a smooth function $f\in \set{C}^{\infty},$ the Taylor series of $f$ at $x_0$ is: 130 | \aleq{ 131 | T_\infty(x) \eqdef \sum_{k=0}^\infty \frac{f^{(k)}(x_0)}{k!} (x-x_0)^k. 132 | } 133 | 134 | \item If $f(x) = T_\infty(x),$ $f$ is called \bluef{analytic}. 
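% NOTE (editor's illustration, not part of the lecture source): a small numerical companion to the Taylor polynomial T_n defined above. The sketch sums the first n+1 terms for f(x) = exp(x) at x0 = 0, where every derivative f^(k)(0) equals 1, and shows the error shrinking as n grows; plain Python only, and the evaluation point x = 1.0 is an arbitrary choice.

import math

def taylor_poly(derivs_at_x0, x0, x):
    # T_n(x) = sum_{k=0}^{n} f^(k)(x0) / k! * (x - x0)^k
    return sum(d / math.factorial(k) * (x - x0) ** k for k, d in enumerate(derivs_at_x0))

x = 1.0
for n in (1, 2, 4, 8):
    approx = taylor_poly([1.0] * (n + 1), 0.0, x)   # exp: all derivatives at x0 = 0 are 1
    print(n, approx, math.exp(x) - approx)          # approximation error decreases with n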
135 | \eci 136 | \end{frame} 137 | 138 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 139 | \begin{frame}{Differentiation Rules} 140 | 141 | \plitemsep 0.25in 142 | 143 | \bci 144 | 145 | \item \bluef{Product rule.} $(f(x)g(x))' = f'(x)g(x) + f(x)g'(x)$ 146 | 147 | \item \bluef{Quotient rule.} $\left(\dfrac{f(x)}{g(x)}\right)' = \dfrac{f'(x)g(x) - f(x)g'(x)}{(g(x))^2} $ 148 | 149 | \item \bluef{Sum rule.} $(f(x)+g(x))' = f'(x) + g'(x)$ 150 | 151 | \item \bluef{Chain rule.} $(g(f(x)))' = g'(f(x))f'(x)$ 152 | 153 | \eci 154 | \end{frame} 155 | 156 | 157 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 158 | \section{L5(2)} 159 | \begin{frame}{Roadmap} 160 | 161 | \plitemsep 0.1in 162 | 163 | \bce[(1)] 164 | \item \grayf{Differentiation of Univariate Functions} 165 | 166 | \item \redf{Partial Differentiation and Gradients } 167 | 168 | \item \grayf{Gradients of Vector-Valued Functions 169 | 170 | \item Gradients of Matrices 171 | 172 | \item Useful Identities for Computing Gradients 173 | 174 | \item Backpropagation and Automatic Differentiation 175 | 176 | \item Higher-Order Derivatives 177 | 178 | \item Linearization and Multivariate Taylor Series} 179 | 180 | \ece 181 | \end{frame} 182 | 183 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 184 | \begin{frame}{Gradient} 185 | 186 | \plitemsep 0.1in 187 | 188 | \bci 189 | 190 | \item Now, \bluef{$f: \realn \mapsto \real.$} 191 | 192 | \item Gradient of $f$ w.r.t. $\vec{x}$ $\grad_{\vec{x}} f$: Varying one variable at a time and keeping the others constant. 193 | 194 | \bigskip 195 | 196 | \mytwocols{0.5} 197 | { 198 | \redf{Partial Derivative.} 199 | For $f : \realn \mapsto \real,$ 200 | 201 | \aleq{ 202 | \pd{f}{x_1} &= \lim_{h \rightarrow 0} \frac{f(x_1+h,x_2, \ldots, x_n) - f(\vec{x})}{h}\cr 203 | & \vdots \cr 204 | \pd{f}{x_n} &= \lim_{h \rightarrow 0} \frac{f(x_1,x_2, \ldots, x_n+h) - f(\vec{x})}{h} 205 | } 206 | } 207 | { 208 | \redf{Gradient.} Get the partial derivatives and collect them in the row vector. 
209 | 210 | \aleq{ 211 | \grad_{\vec{x}} f = \d{f}{\vec{x}} = 212 | \rowvec{\pd{f(\vec{x})}{x_1} & \cdots & \pd{f(\vec{x})}{x_n}} \in \real^{1 \times n} 213 | } 214 | } 215 | \eci 216 | \end{frame} 217 | 218 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 219 | \begin{frame}{Example} 220 | 221 | \plitemsep 0.2in 222 | 223 | \bci 224 | \item \exam $f(x,y) = (x+2y^3)^2$ 225 | \aleq{ 226 | \pd{f(x,y)}{x} &= 2(x+2y^3) \pd{x+2y^3}{x} = 2(x+2y^3)\cr 227 | \pd{f(x,y)}{y} &= 2(x+2y^3) \pd{x+2y^3}{y} = 12(x+2y^3)y^2 228 | } 229 | 230 | \item \exam $f(x_1, x_2) = x_1^2 x_2 + x_1 x_2^3$ 231 | \aleq{ 232 | \grad_{(x_1,x_2)}f = \d{f}{x} = \rowvec{\pd{f(x_1,x_2)}{x_1} &\pd{f(x_1,x_2)}{x_2}} = \rowvec{2x_1x_2 + x_2^3 & 233 | x_1^2 + 3x_1x_2^2} 234 | } 235 | \eci 236 | \end{frame} 237 | 238 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 239 | \begin{frame}{Rules for Partial Differentiation} 240 | 241 | \plitemsep 0.2in 242 | 243 | \bci 244 | \item \bluef{Product rule} $$\pd{}{\vec{x}}\big(f(\vec{x})g(\vec{x})\big) = \pd{f}{\vec{x}} g(\vec{x}) + f(\vec{x})\pd{g}{\vec{x}}$$ 245 | 246 | \item \bluef{Sum rule} $$\pd{}{\vec{x}} \big(f(\vec{x})+ g(\vec{x})\big) = \pd{f}{\vec{x}} + \pd{g}{\vec{x}}$$ 247 | 248 | \item \bluef{Chain rule} $$\pd{}{\vec{x}} g\big(f(\vec{x})\big) = \pd{g}{f}\pd{f}{\vec{x}}$$ 249 | 250 | \eci 251 | \end{frame} 252 | 253 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 254 | \begin{frame}{More about Chain Rule} 255 | 256 | \plitemsep 0.05in 257 | 258 | \bci 259 | \item $f: \real^2 \mapsto \real$ of two variables $x_1$ and $x_2.$ $x_1(t)$ and $x_2(t)$ are functions of $t.$ 260 | \aleq{ 261 | \d{f}{t} = \rowvec{\pd{f}{x_1} & \pd{f}{x_2}} \colvec{\pd{x_1(t)}{t} \\ \pd{x_2(t)}{t}} 262 | = \pd{f}{x_1}\pd{x_1}{t} + \pd{f}{x_2}\pd{x_2}{t} 263 | } 264 | \item \exam $f(x_1, x_2) = x_1^2 + 2 x_2,$ where $x_1(t) = \sin(t),\ x_2(t)=\cos(t)$ 265 | \aleq{ 266 | \d{f}{t} = \pd{f}{x_1}\pd{x_1}{t} + \pd{f}{x_2}\pd{x_2}{t} = 2\sin(t)\cos(t) - 2\sin{t} = 2\sin(t)(\cos(t)-1) 267 | } 268 | 269 | \item $f: \real^2 \mapsto \real$ of two variables $x_1$ and $x_2.$ $x_1(s,t)$ and $x_2(s,t)$ are functions of $s,t.$ 270 | 271 | \myvartwocols{0.2}{0.37}{0.6} 272 | { 273 | \small 274 | \vspace{-0.2cm} 275 | \aleq{ 276 | \pd{f}{s} &= \pd{f}{x_1}\pd{x_1}{s} + \pd{f}{x_2}\pd{x_2}{s}\cr 277 | \pd{f}{t} &= \pd{f}{x_1}\pd{x_1}{t} + \pd{f}{x_2}\pd{x_2}{t} 278 | } 279 | } 280 | { 281 | \aleq{ 282 | \d{f}{(s,t)} = \pd{f}{\vec{x}}\pd{\vec{x}}{(s,t)} = \rowvec{\pd{f}{x_1} & \pd{f}{x_2}} 283 | \begin{nmat} 284 | \pd{x_1}{s} & \pd{x_1}{t} \cr 285 | \pd{x_2}{s} & \pd{x_2}{t} 286 | \end{nmat} 287 | } 288 | } 289 | \eci 290 | \end{frame} 291 | 292 | 293 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 294 | \section{L5(3)} 295 | \begin{frame}{Roadmap} 296 | 297 | \plitemsep 0.1in 298 | 299 | \bce[(1)] 300 | \item \grayf{Differentiation of Univariate Functions} 301 | 302 | \item \grayf{Partial Differentiation and Gradients } 303 | 304 | \item \redf{Gradients of Vector-Valued Functions} 305 | 306 | \item \grayf{Gradients of Matrices 307 | 308 | \item Useful Identities for Computing Gradients 309 | 310 | \item Backpropagation and Automatic Differentiation 311 | 312 | \item Higher-Order Derivatives 313 | 314 | \item Linearization and Multivariate Taylor Series} 315 | 316 | \ece 317 | \end{frame} 318 | 319 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 320 | \begin{frame}{$\vec{f}: \realn \mapsto \realm$} 321 | 322 | \plitemsep 0.1in 323 | 324 | \bci 325 | \item For a function $\vec{f}: 
\realn \mapsto \realm$ and vector $\vec{x}= \trans{\rowvec{x_1 & \ldots & x_n}} \in \realn,$ the vector-valued function is: 326 | $$ 327 | \vec{f}(\vec{x}) = \colvec{f_1(\vec{x}) \\ \vdots \\ f_m(\vec{x})} 328 | $$ 329 | \item Partial derivative w.r.t. $x_i$ is a column vector: $\displaystyle \pd{\vec{f}}{x_i} = 330 | \colvec{\pd{f_1}{x_i} \\ \vdots \\ \pd{f_m}{x_i}}$ 331 | 332 | \item Gradient (or Jacobian): $\displaystyle \d{\vec{f}(\vec{x})}{\vec{x}} = \rowvec{\pd{\vec{f}(\vec{x})}{x_1} & \cdots & \pd{\vec{f}(\vec{x})}{x_n} }$ 333 | \eci 334 | \end{frame} 335 | 336 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 337 | \begin{frame}{Jacobian} 338 | 339 | \aleq{ 340 | \mJ &= \grad_{\vec{x}} \vec{f} = \d{\vec{f}(\vec{x})}{\vec{x}} = 341 | \rowvec{\pd{\vec{f}(\vec{x})}{x_1} & \cdots & \pd{\vec{f}(\vec{x})}{x_n} }\cr 342 | &= \begin{nmat} 343 | \pd{f_1(\vec{x})}{x_1} & \cdots & \pd{f_1(\vec{x})}{x_n} \cr 344 | \vdots & & \vdots \cr 345 | \pd{f_m(\vec{x})}{x_1} & \cdots & \pd{f_m(\vec{x})}{x_n} 346 | \end{nmat} 347 | } 348 | 349 | \bci 350 | \item For a \bluef{$\realn \mapsto \realm$} function, its Jacobian is a \bluef{$m \times n$} matrix. 351 | \eci 352 | % \plitemsep 0.1in 353 | % \bci 354 | % \item For a function $\vec{f}: \realn \mapsto \realm$ and vector $\vec{x}= \trans{\rowvec{x_1 & \ldots & x_n}} \in \realn,$ the vector-valued function is: 355 | % $$ 356 | % \vec{f}(\vec{x}) = \colvec{f_1(\vec{x}) \\ \vdots \\ f_m(\vec{x})} 357 | % $$ 358 | % \item Partial derivative w.r.t. $x_i$ is a column vector: $\displaystyle \pd{\vec{f}}{x_i} = 359 | % \colvec{\pd{f_1}{x_i} \\ \vdots \\ \pd{f_m}{x_i}}$ 360 | 361 | % \item Gradient (or Jacobian): $\displaystyle \d{\vec{f}(\vec{x})}{\vec{x}} = \rowvec{\pd{\vec{f}(\vec{x})}{x_1} & \cdots & \pd{\vec{f}(\vec{x})}{x_n} }$ 362 | % \eci 363 | \end{frame} 364 | 365 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 366 | \begin{frame}{Example: Gradient of Vector-Valued Function} 367 | 368 | \bci 369 | \item $\vf(\vx) = \mA \vx,$ $\vf: \realn \mapsto \realm,$ $\mA \in \realmn,$ $\vx \in \realn$ 370 | 371 | \item Partial derivatives: 372 | $ 373 | \displaystyle 374 | f_i(\vx) = \sum_{j=1}^n A_{ij} x_j \implies \pd{f_i}{x_j} = A_{ij} 375 | $ 376 | 377 | \item Graident 378 | \aleq{ 379 | \d{\vf}{\vx} = \begin{nmat} 380 | \pd{f_1}{x_1} & \cdots & \pd{f_1}{x_n} \cr 381 | \vdots & & \vdots \cr 382 | \pd{f_m}{x_1} & \cdots & \pd{f_m}{x_n} 383 | \end{nmat} = 384 | \begin{nmat} 385 | A_{11} & \cdots & A_{1n} \cr 386 | \vdots & & \vdots \cr 387 | A_{m1} & \cdots & A_{mn} 388 | \end{nmat} = \mA 389 | } 390 | 391 | \eci 392 | 393 | \end{frame} 394 | 395 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 396 | \begin{frame}{Example: Chain Rule} 397 | 398 | \bci 399 | \item $h: \real \mapsto \real,$ $h(t) = (f\circ g)(t)$ with 400 | \aleq{ 401 | f: \real^2 \mapsto \real, \ f(\vx) = \exp(x_1x_2^2), \quad g: \real \mapsto \real^2, \ 402 | \vx = \colvec{x_1 \\ x_2} = g(t) = \colvec{t\cos(t) \\ t\sin(t)} 403 | } 404 | 405 | \item \bluef{(Note)} $\pd{f}{\vx} \in \real^{1 \times 2}$ and $\pd{g}{t} \in \real^{2 \times 1}$ 406 | 407 | \item Using the chain rule, 408 | \aleq{ 409 | \d{h}{t} = \pd{f}{\vx} \pd{\vx}{t} &= \rowvec{\pd{f}{x_1} & \pd{f}{x_2}}\colvec{\pd{x_1}{t} \\ \pd{x_2}{t}}\cr 410 | &= \rowvec{\exp(x_1x_2^2)x_2^2 & 2\exp(x_1x_2^2)x_1x_2} \colvec{\cos(t)-t\sin(t) \\ \sin(t)+t\cos(t)} 411 | } 412 | \eci 413 | \end{frame} 414 | 415 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 416 | \begin{frame}{Example: Least-Square Loss (1)} 417 | 
418 | \plitemsep 0.1in 419 | 420 | \bci 421 | \item A linear model: $\vy = \mat{\Phi} \vth$ 422 | \item $\vth \in \real^D$: parameter vector 423 | \item $\mat{\Phi} \in \real^{N \times D}$: input features 424 | \item $\vy \in \real^N$: observations 425 | 426 | \item Goal: Find a good parameter vector that provides the best-fit, formulated by minimizing the following loss $L: \real^D \mapsto \real$ over the parameter vector $\vth$. 427 | \mycolorbox{ 428 | \vspace{-0.2cm} 429 | $$ 430 | L(\ve) \eqdef \norm{\ve}^2, \quad \text{where} \ \ve(\vth) = \vy - \mat{\Phi} \vth 431 | $$ 432 | } 433 | \eci 434 | \end{frame} 435 | 436 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 437 | \begin{frame}{Example: Least-Square Loss (2)} 438 | 439 | \plitemsep 0.2in 440 | 441 | \bci 442 | 443 | \item $\displaystyle \pd{L}{\vth} = \greenf{\pd{L}{\ve}} \orangef{\pd{\ve}{\vth}}$ 444 | \item \redf{Note.} $\displaystyle \pd{L}{\vth} \in \real^{1 \times D},$ $\displaystyle \greenf{\pd{L}{\ve}} \in \real^{1 \times N},$ $\displaystyle \orangef{\pd{\ve}{\vth}} \in \real^{N \times D}$ 445 | 446 | \item Using that $\norm{\ve}^2 = \trans{\ve}\ve$, $\displaystyle \greenf{\pd{L}{\ve}} = 2 \trans{\ve} \in \real ^{1 \times N}$ and $\displaystyle \orangef{\pd{\ve}{\vth}} = - \mat{\Phi} \in \real^{N \times D}$ 447 | \aleq{ 448 | \text{Finally, we get:} \quad \pd{L}{\vth} = \greenf{2\trans{\ve}}\orangef{(-\mat{\Phi})} = -\underbrace{2(\trans{\vy} - \trans{\vth}\trans{\mat{\Phi}})}_{1 \times N} \underbrace{\mat{\Phi}}_{N \times D} 449 | } 450 | \eci 451 | \end{frame} 452 | 453 | 454 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 455 | \section{L5(4)} 456 | \begin{frame}{Roadmap} 457 | 458 | \plitemsep 0.1in 459 | 460 | \bce[(1)] 461 | \item \grayf{Differentiation of Univariate Functions} 462 | 463 | \item \grayf{Partial Differentiation and Gradients } 464 | 465 | \item \grayf{Gradients of Vector-Valued Functions} 466 | 467 | \item \redf{Gradients of Matrices 468 | 469 | \item Useful Identities for Computing Gradients} 470 | 471 | \item \grayf{Backpropagation and Automatic Differentiation 472 | 473 | \item Higher-Order Derivatives 474 | 475 | \item Linearization and Multivariate Taylor Series} 476 | 477 | \ece 478 | \end{frame} 479 | 480 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 481 | \begin{frame}{Gradients of matrices} 482 | 483 | \plitemsep 0.1in 484 | 485 | \bci 486 | \item Gradient of matrix $\mA \in \real^{m \times n}$ w.r.t. 
matrix $\mB \in \real^{p \times q}$ 487 | 488 | \item Jacobian: A four-dimensional tensor\footnote{A multidimensional array} $\mJ = \d{\mA}{\mB} \in \real^{(m \times n) \times (p \times q)}$ 489 | 490 | \eci 491 | 492 | \myvartwocols{0.5}{0.15}{0.83} 493 | { 494 | \includegraphics[width=0.9\columnwidth]{L5_grad_matrix_1.png} 495 | } 496 | { 497 | \includegraphics[width=0.47\columnwidth]{L5_grad_matrix_2.png} 498 | \includegraphics[width=0.47\columnwidth]{L5_grad_matrix_3.png} 499 | } 500 | 501 | 502 | 503 | 504 | \end{frame} 505 | 506 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 507 | \begin{frame}{Example: Gradient of Vectors for Matrices (1)} 508 | 509 | \bci 510 | \item $\vf(\vx) = \mA \vx,$ $\vf \in \realm$, $\mA \in \realmn,$ $\vx \in \realn.$ What is \bluef{$\d{\vf}{\mA}$?} 511 | 512 | \item Dimension: If we consider $\vf: \realmn \mapsto \realm,$ $\d{\vf}{\mA} \in \real^{m\times (m \times n)}$ 513 | 514 | 515 | \item Partial derivatives: 516 | $ 517 | \pd{f_i}{\mA} \in \real^{1\times (m \times n)}, \quad \d{\vf}{\mA} = \colvec{ \pd{f_1}{\mA} \\ \vdots \\ \pd{f_m}{\mA}} 518 | $ 519 | \mytwocols{0.4} 520 | { 521 | \small 522 | \aleq{ 523 | f_i &= \sum_{j=1}^n A_{ij} x_j, \ i=1, \ldots, m \implies \pd{f_i}{A_{iq}} = x_q,\cr 524 | \pd{f_i}{A_{i\cdot}} &= \trans{\vx} \in \real^{1\times 1\times n} \ \text{(for $i$th row vector)}\cr 525 | \pd{f_{i}}{A_{{k\neq i}\cdot}} & = \trans{\vec{0}} \in \real^{1\times 1\times n} \ \text{(for $k$th row vector, $k\neq i$)} 526 | } 527 | } 528 | { 529 | \small 530 | \aleq{ 531 | \pd{f_i}{\mA} = \colvec{\trans{\vec{0}} \\ \vdots \\ \trans{\vec{0}} \\ \trans{\vx} \\ \trans{\vec{0}} \\ \vdots \\ \trans{\vec{0}}} \in \real^{1 \times (m \times n)} 532 | } 533 | } 534 | 535 | \eci 536 | 537 | \end{frame} 538 | 539 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 540 | \begin{frame}{Example: Gradient of Matrices for Matrices (2)} 541 | 542 | \bci 543 | \item $\mR \in \realmn$ and $\vf: \realmn \mapsto \realnn$ with $\vf(\mR) = \mK \eqdef \trans{\mR}\mR \in \realnn.$ What is \bluef{$\d{\mK}{\mR} \in \real^{(n\times n) \times (m\times n)}$?} 544 | 545 | \item $\d{K_{pq}}{\mR} \in \real^{1 \times m \times n}.$ Let $\vr_i$ be the $i$th column of $\mR.$ Then 546 | \( 547 | K_{pq} = \trans{\vr_p} \vr_q = \sum_{k=1}^m R_{kp} R_{kq}. 
548 | \) 549 | 550 | \item Partial derivative $\pd{K_{pq}}{R_{ij}}$ 551 | \aleq{ 552 | \pd{K_{pq}}{R_{ij}} = \sum_{k=1}^m \pd{}{R_{ij}} R_{kp} R_{kq} = \partial_{pqij}, \ 553 | \partial_{pqij} = 554 | \begin{cases} 555 | R_{iq} & \text{if} \ j=p, p\neq q \cr 556 | R_{ip} & \text{if} \ j=q, p\neq q \cr 557 | 2R_{iq} & \text{if} \ j=p, p=q \cr 558 | 0 & \text{otherwise} 559 | \end{cases} 560 | } 561 | \eci 562 | 563 | \end{frame} 564 | 565 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 566 | \section{L5(5)} 567 | \begin{frame}{Useful Identities} 568 | 569 | \vspace{-0.6cm} 570 | \raggedleft 571 | \includegraphics[width=0.7\columnwidth]{L5_useful.png} 572 | \end{frame} 573 | 574 | 575 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 576 | \section{L5(6)} 577 | \begin{frame}{Roadmap} 578 | 579 | \plitemsep 0.1in 580 | 581 | \bce[(1)] 582 | \item \grayf{Differentiation of Univariate Functions} 583 | 584 | \item \grayf{Partial Differentiation and Gradients } 585 | 586 | \item \grayf{Gradients of Vector-Valued Functions} 587 | 588 | \item \gray{Gradients of Matrices} 589 | 590 | \item \grayf{Useful Identities for Computing Gradients} 591 | 592 | \item \redf{Backpropagation and Automatic Differentiation} 593 | 594 | \item \grayf{Higher-Order Derivatives 595 | 596 | \item Linearization and Multivariate Taylor Series} 597 | 598 | \ece 599 | \end{frame} 600 | 601 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 602 | \begin{frame}{Motivation: Neural Networks with Many Layers (1)} 603 | 604 | \plitemsep 0.01in 605 | 606 | \bci 607 | \item In a neural network with many layers, the function $\vy$ is a many-level function compositions 608 | $$ 609 | \vy = (f_K \circ f_{K-1} \circ \cdots \circ f_1)(\vx), 610 | $$ 611 | where, for example, 612 | \bci 613 | \item $\vx$: images as inputs, $\vy$: class labels (e.g., cat or dog) as outputs 614 | \item each $f_i$ has its own parameters 615 | \eci 616 | 617 | \item In neural networks, with the model parameters $\vth = \{\mA_0, \vb_0, \ldots, \mA_{K-1}, \vb_{K-1} \}$ 618 | 619 | \smallskip 620 | \mysmalltwocols{0.4} 621 | { 622 | \small 623 | \vspace{-0.4cm} 624 | \aleq{ 625 | \begin{cases} 626 | \vf_0 &\eqdef \vx \cr 627 | \vf_1 &\eqdef \sigma_1(\mA_{0}\vf_{0} + \vb_{0})\cr 628 | \vdots& \cr 629 | \vf_K &\eqdef \sigma_K(\mA_{K-1}\vf_{K-1} + \vb_{K-1}) 630 | \end{cases} 631 | } 632 | $\circ$ $\sigma_i$ is called the \bluef{activation function} at $i$-th layer 633 | } 634 | { 635 | \hspace{-0.7cm} $\circ$ Minimizing the loss function over $\vth$: 636 | \aleq{ 637 | \min_{\vth} L(\vth), 638 | } 639 | where 640 | $ 641 | L(\vth) = \norm{\vy - \vf_K(\vth,\vx)}^2 642 | $ 643 | } 644 | 645 | \eci 646 | \end{frame} 647 | 648 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 649 | \begin{frame}{Motivation: Neural Networks with Many Layers (2)} 650 | 651 | \plitemsep 0.01in 652 | 653 | \bci 654 | 655 | \item In neural networks, with the model parameters $\vth = \{\mA_0, \vb_0, \ldots, \mA_{K-1}, \vb_{K-1} \}$ 656 | 657 | \smallskip 658 | \mysmalltwocols{0.4} 659 | { 660 | \small 661 | \vspace{-0.4cm} 662 | \aleq{ 663 | \begin{cases} 664 | \vf_0 &\eqdef \vx \cr 665 | \vf_1 &\eqdef \sigma_1(\mA_{0}\vf_{0} + \vb_{0})\cr 666 | \vdots& \cr 667 | \vf_K &\eqdef \sigma_K(\mA_{K-1}\vf_{K-1} + \vb_{K-1}) 668 | \end{cases} 669 | } 670 | $\circ$ $\sigma_i$ is called the activation function at $i$-th layer 671 | } 672 | { 673 | \hspace{-0.7cm} $\circ$ Minimizing the loss function over $\vth$: 674 | \aleq{ 675 | \min_{\vth} L(\vth), 676 | } 677 | where 
678 | $ 679 | L(\vth) = \norm{\vy - \vf_K(\vth,\vx)}^2 680 | $ 681 | } 682 | 683 | \medskip 684 | \item \question \bluef{\large How can we efficiently compute $\displaystyle \d{L}{\vth}$ in computers?} 685 | 686 | \eci 687 | \end{frame} 688 | 689 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 690 | \begin{frame}{Backpropagatin: Example (1)} 691 | 692 | \plitemsep 0.1in 693 | 694 | \bci 695 | 696 | \item $f(x) = \sqrt{x^2 + \exp(x^2)} + \cos\left (x^2 + \exp(x^2)\right)$ 697 | 698 | 699 | 700 | \item Computation graph: Connect via ``elementary'' operations 701 | 702 | \smallskip 703 | \mypic{0.7}{L5_computation_graph.png} 704 | \aleq{ 705 | \bluef{a} = x^2, \ \bluef{b}=\exp(a), \ \bluef{c}=a+b, \ \bluef{d}=\sqrt{c}, \ \bluef{e}=\cos(c), \ \bluef{f} = d+e 706 | } 707 | 708 | \item Automatic Differentiation 709 | \bci 710 | \item A set of techniques to \bluef{numerically} (not symbolically) evaluate the gradient of a function by working with \bluef{intermediate variables} and applying the \bluef{chain rule}. 711 | \eci 712 | \eci 713 | \end{frame} 714 | 715 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 716 | \begin{frame}{Backpropagation: Example (2)} 717 | 718 | \plitemsep 0.1in 719 | 720 | \bci 721 | 722 | \item 723 | % $f(x) = \sqrt{x^2 + \exp(x^2)} + \cos\left (x^2 + \exp(x^2)\right)$ 724 | $ 725 | \bluef{a} = x^2, \ \bluef{b}=\exp(a), \ \bluef{c}=a+b, \ \bluef{d}=\sqrt{c}, \ \bluef{e}=\cos(c), \ \bluef{f} = d+e 726 | $ 727 | \item Derivatives of the intermediate variables with their inputs 728 | \aleq{ 729 | \bluef{\pd{a}{x}} = 2x, \ \bluef{\pd{b}{a}}=\exp(a), \ \bluef{\pd{c}{a}}=1 = \bluef{\pd{c}{b}}, \ \bluef{\pd{d}{c}}=\frac{1}{2\sqrt{c}}, \ \bluef{\pd{e}{c}}=-\sin(c), \ \bluef{\pd{f}{d}} = 1 = \bluef{\pd{f}{e}} 730 | } 731 | \item Compute $\displaystyle \pd{f}{x}$ by working backward from the output 732 | \mytwocols{0.3} 733 | { 734 | \small 735 | \vspace{-0.3cm} 736 | \aleq{ 737 | \orangef{\pd{f}{c}} &= \bluef{\pd{f}{d}\pd{d}{c}} + \bluef{\pd{f}{e}\pd{e}{c}}, \ \redf{\pd{f}{b}} =\bluef{\pd{f}{c}\pd{c}{b}} \cr 738 | \greenf{\pd{f}{a}} &= \redf{\pd{f}{b}}\bluef{\pd{b}{a}} + \orangef{\pd{f}{c}}\bluef{\pd{c}{a}}, \ \mybox{$\displaystyle \pd{f}{x}$} =\greenf{\pd{f}{a}}\bluef{\pd{a}{x}} 739 | } 740 | } 741 | { 742 | \small 743 | \vspace{-0.3cm} 744 | \aleq{ 745 | \orangef{\pd{f}{c}} &= 1\cdot \frac{1}{2\sqrt{c}} + 1\cdot (-\sin(c))\cr 746 | \redf{\pd{f}{b}} &= \orangef{\pd{f}{c}} \cdot 1, \quad \greenf{\pd{f}{a}} = \redf{\pd{f}{b}} \exp(a) + \orangef{\pd{f}{c}}\cdot 1 \cr 747 | \mybox{$\displaystyle \pd{f}{x}$} &=\greenf{\pd{f}{a}} \cdot 2x 748 | } 749 | } 750 | 751 | \eci 752 | \end{frame} 753 | 754 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 755 | \begin{frame}{Backpropagation} 756 | 757 | \plitemsep 0.1in 758 | 759 | \bci 760 | 761 | \item Implementation of gradients can be very expensive, unless we are careful. 762 | 763 | \item Using the idea of automatic differentiation, the whole gradient computation is decomposed into a set of gradients of elementary functions and application of the chain rule. 764 | 765 | \item Why \bluef{backward}? 766 | 767 | \bci 768 | \item In neural networks, the input dimensionality is often much higher than the dimensionality of labels. 769 | \item In this case, the backward computation (than the forward computation) is much cheaper. 770 | \eci 771 | 772 | \item Works if the target is expressed as a computation graph whose elementary functions are differentiable. If not, some care needs to be taken. 
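% NOTE (editor's illustration, not part of the lecture source): the backward pass on the computation graph above can be checked numerically. This sketch mirrors the slide's intermediate variables a, b, c, d, e and the listed local derivatives, then compares the backpropagated df/dx with a central-difference estimate; plain Python only, and the evaluation point x0 is an arbitrary choice.

import math

def f_value(x):
    return math.sqrt(x * x + math.exp(x * x)) + math.cos(x * x + math.exp(x * x))

def f_grad_backprop(x):
    # forward pass: intermediate variables of the computation graph
    a = x * x
    b = math.exp(a)
    c = a + b
    d = math.sqrt(c)
    e = math.cos(c)
    # backward pass: chain rule applied from the output f = d + e back to x
    df_dd, df_de = 1.0, 1.0
    df_dc = df_dd * (1.0 / (2.0 * math.sqrt(c))) + df_de * (-math.sin(c))
    df_db = df_dc * 1.0
    df_da = df_db * math.exp(a) + df_dc * 1.0
    return df_da * 2.0 * x                      # df/dx

x0, h = 0.7, 1e-6
print(f_grad_backprop(x0))
print((f_value(x0 + h) - f_value(x0 - h)) / (2 * h))   # finite-difference check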
773 | \eci 774 | \end{frame} 775 | 776 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 777 | \section{L5(7)} 778 | \begin{frame}{Roadmap} 779 | 780 | \plitemsep 0.1in 781 | 782 | \bce[(1)] 783 | \item \grayf{Differentiation of Univariate Functions} 784 | 785 | \item \grayf{Partial Differentiation and Gradients } 786 | 787 | \item \grayf{Gradients of Vector-Valued Functions} 788 | 789 | \item \gray{Gradients of Matrices} 790 | 791 | \item \grayf{Useful Identities for Computing Gradients} 792 | 793 | \item \grayf{Backpropagation and Automatic Differentiation} 794 | 795 | \item \redf{Higher-Order Derivatives 796 | 797 | \item Linearization and Multivariate Taylor Series} 798 | 799 | \ece 800 | \end{frame} 801 | 802 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 803 | \begin{frame}{Higher-Order Derivatives} 804 | 805 | \plitemsep 0.05in 806 | 807 | \bci 808 | \item Some optimization algorithms (e.g., Newton's method) require second-order derivatives, if they exist. 809 | \item (Truncated) Taylor series is often used as an approximation of a function. 810 | 811 | \item For $f: \realn \mapsto \real$ of variable $\vx \in \realn$, $ 812 | \grad_{\vec{x}} f = \d{f}{\vec{x}} = 813 | \rowvec{\pd{f(\vec{x})}{x_1} & \cdots & \pd{f(\vec{x})}{x_n}} \in \real^{1 \times n} 814 | $ 815 | \bci 816 | \item If $f$ is twice-differentiable, the order doesn't matter. 817 | \aleq{ 818 | \hess_{\vec{x}} f = \begin{nmat} 819 | \pdd{f}{x_1}& \pdda{f}{x_1}{x_2}& \cdots & \pdda{f}{x_1}{x_n}\cr 820 | \vdots & & & \vdots\cr 821 | \pdda{f}{x_1}{x_n} & \pdda{f}{x_2}{x_n} & \cdots & \pdda{f}{x_{n}}{x_n} 822 | \end{nmat} 823 | } 824 | % Gradient $\grad f: \realn \mapsto $ 825 | \eci 826 | 827 | \item For $f: \realn \mapsto \realm$, $\grad_{\vec{x}} f \in \realmn$ 828 | \bci 829 | \item Thus, $\hess_{\vec{x}} f \in \real^{m \times n \times n}$ (a tensor) 830 | \eci 831 | 832 | \eci 833 | \end{frame} 834 | 835 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 836 | \section{L5(7)} 837 | \begin{frame}{Function Approximation: Linearization and More} 838 | 839 | \plitemsep 0.1in 840 | 841 | \bci 842 | \item First-order approximation of $f(\vx)$ (i.e., linearization by taking the first two terms of Taylor Series) 843 | $$ 844 | f(\vx) \approx f(\vx_0) + (\grad_{\vx} f)(\vx_0)(\vx-\vx_0) 845 | $$ 846 | 847 | \item Multivariate Talyer Series for $f: \real^D \mapsto \real$ at $\vx_0$ 848 | $$ 849 | f(\vx) = \sum_{k=0}^\infty \frac{D^k_{\vx} f(\vx_0)}{k!} \vec{\delta}^k, 850 | $$ 851 | where $D^k_{\vx} f(\vx_0)$ is the $k$th derivative of $f$ w.r.t. $\vx$, evaluated at $\vx_0,$ and $\vec{\delta} \eqdef \vx - \vx_0.$ 852 | \bci 853 | \item Partial sum up to, say $n$, can be an approximation of $f(\vx).$ 854 | \item $D^k_{\vx} f(\vx_0)$ and $\vec{\delta}^k$ are $k$th order tensors, i.e., $k$-dimensional array. 855 | 856 | \item $\vec{\delta}^k$ is a $k$-fold outer product $\otimes$. For example, $\vec{\delta}^2 = \vec{\delta} \otimes \vec{\delta} = \vec{\delta}\trans{\vec{\delta}}.$ $\vec{\delta}^3 = \vec{\delta} \otimes \vec{\delta} \otimes \vec{\delta}.$ 857 | \eci 858 | 859 | \eci 860 | 861 | 862 | 863 | \end{frame} 864 | 865 | 866 | 867 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 868 | \begin{frame}{} 869 | \vspace{2cm} 870 | \LARGE Questions? 
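% NOTE (editor's illustration, not part of the lecture source): to make the higher-order derivatives above concrete, this sketch builds a numerical Hessian by central differences and compares it with the closed form for f(x1, x2) = x1^2 x2 + x1 x2^3, the function used in the earlier gradient example. It assumes NumPy; the evaluation point and step size are arbitrary. Since f is twice continuously differentiable, the matrix comes out symmetric.

import numpy as np

def hessian(f, x, h=1e-4):
    # Numerical Hessian of f: R^n -> R (an n x n matrix) by central differences.
    x = np.asarray(x, dtype=float)
    n = x.size
    H = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            ei = np.zeros(n); ei[i] = h
            ej = np.zeros(n); ej[j] = h
            H[i, j] = (f(x + ei + ej) - f(x + ei - ej)
                       - f(x - ei + ej) + f(x - ei - ej)) / (4 * h * h)
    return H

f = lambda v: v[0] ** 2 * v[1] + v[0] * v[1] ** 3
x0 = np.array([1.0, 2.0])
print(hessian(f, x0))
# closed form: [[2*x2, 2*x1 + 3*x2^2], [2*x1 + 3*x2^2, 6*x1*x2]]
print(np.array([[2 * x0[1], 2 * x0[0] + 3 * x0[1] ** 2],
                [2 * x0[0] + 3 * x0[1] ** 2, 6 * x0[0] * x0[1]]]))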
871 | 872 | 873 | \end{frame} 874 | 875 | \begin{frame}{Review Questions} 876 | % \tableofcontents 877 | %\plitemsep 0.1in 878 | \bce[1)] 879 | \item 880 | 881 | \ece 882 | \end{frame} 883 | 884 | 885 | \end{document} 886 | -------------------------------------------------------------------------------- /06.Probability/6.PD-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/6.PD-2.pdf -------------------------------------------------------------------------------- /06.Probability/6.PD-4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/6.PD-4.pdf -------------------------------------------------------------------------------- /06.Probability/6.PD.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/6.PD.pdf -------------------------------------------------------------------------------- /06.Probability/L6_CDF_ex1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_CDF_ex1.png -------------------------------------------------------------------------------- /06.Probability/L6_CDF_ex2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_CDF_ex2.png -------------------------------------------------------------------------------- /06.Probability/L6_RV_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_RV_ex.png -------------------------------------------------------------------------------- /06.Probability/L6_binomial_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_binomial_ex.png -------------------------------------------------------------------------------- /06.Probability/L6_condind_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_condind_ex.png -------------------------------------------------------------------------------- /06.Probability/L6_cov_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_cov_ex.png -------------------------------------------------------------------------------- /06.Probability/L6_cov_notind.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_cov_notind.png -------------------------------------------------------------------------------- /06.Probability/L6_exp_pdf.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_exp_pdf.png -------------------------------------------------------------------------------- /06.Probability/L6_gaussian_formula.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_gaussian_formula.png -------------------------------------------------------------------------------- /06.Probability/L6_geo_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_geo_ex.png -------------------------------------------------------------------------------- /06.Probability/L6_joint_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_joint_ex.png -------------------------------------------------------------------------------- /06.Probability/L6_marginal_conditional.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_marginal_conditional.png -------------------------------------------------------------------------------- /06.Probability/L6_needle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_needle.png -------------------------------------------------------------------------------- /06.Probability/L6_pdf_delta.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_pdf_delta.png -------------------------------------------------------------------------------- /06.Probability/L6_pdf_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_pdf_ex.png -------------------------------------------------------------------------------- /06.Probability/L6_pdf_uniform_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_pdf_uniform_ex.png -------------------------------------------------------------------------------- /06.Probability/L6_pmf_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_pmf_ex.png -------------------------------------------------------------------------------- /06.Probability/L6_total_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_total_ex.png -------------------------------------------------------------------------------- /06.Probability/L6_tworolls.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_tworolls.png -------------------------------------------------------------------------------- /06.Probability/L6_uniform_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/06.Probability/L6_uniform_ex.png -------------------------------------------------------------------------------- /07.Optimization/7.OPT-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/07.Optimization/7.OPT-2.pdf -------------------------------------------------------------------------------- /07.Optimization/7.OPT-4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/07.Optimization/7.OPT-4.pdf -------------------------------------------------------------------------------- /07.Optimization/7.OPT.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/07.Optimization/7.OPT.pdf -------------------------------------------------------------------------------- /07.Optimization/L7_convex_conjugate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/07.Optimization/L7_convex_conjugate.png -------------------------------------------------------------------------------- /07.Optimization/L7_convex_fn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/07.Optimization/L7_convex_fn.png -------------------------------------------------------------------------------- /07.Optimization/L7_convex_set_ex1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/07.Optimization/L7_convex_set_ex1.png -------------------------------------------------------------------------------- /07.Optimization/L7_convex_set_ex2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/07.Optimization/L7_convex_set_ex2.png -------------------------------------------------------------------------------- /07.Optimization/L7_first_condition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/07.Optimization/L7_first_condition.png -------------------------------------------------------------------------------- /07.Optimization/L7_gradient_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/07.Optimization/L7_gradient_ex.png -------------------------------------------------------------------------------- /07.Optimization/L7_halfspace.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/07.Optimization/L7_halfspace.png -------------------------------------------------------------------------------- /07.Optimization/L7_separating.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/07.Optimization/L7_separating.png -------------------------------------------------------------------------------- /07.Optimization/L7_supporting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/07.Optimization/L7_supporting.png -------------------------------------------------------------------------------- /08.Model_Data/8.MMD-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/8.MMD-2.pdf -------------------------------------------------------------------------------- /08.Model_Data/8.MMD-4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/8.MMD-4.pdf -------------------------------------------------------------------------------- /08.Model_Data/8.MMD.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/8.MMD.pdf -------------------------------------------------------------------------------- /08.Model_Data/L10_latent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/L10_latent.png -------------------------------------------------------------------------------- /08.Model_Data/L8_all_gmodels.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/L8_all_gmodels.png -------------------------------------------------------------------------------- /08.Model_Data/L8_coinflip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/L8_coinflip.png -------------------------------------------------------------------------------- /08.Model_Data/L8_cross_validation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/L8_cross_validation.png -------------------------------------------------------------------------------- /08.Model_Data/L8_dsep.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/L8_dsep.png -------------------------------------------------------------------------------- /08.Model_Data/L8_fittings.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/L8_fittings.png -------------------------------------------------------------------------------- /08.Model_Data/L8_gmodel_ex1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/L8_gmodel_ex1.png -------------------------------------------------------------------------------- /08.Model_Data/L8_gmodel_ex2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/L8_gmodel_ex2.png -------------------------------------------------------------------------------- /08.Model_Data/L8_lung_cancer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/L8_lung_cancer.png -------------------------------------------------------------------------------- /08.Model_Data/L8_model_class.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/L8_model_class.png -------------------------------------------------------------------------------- /08.Model_Data/L8_model_function.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/L8_model_function.png -------------------------------------------------------------------------------- /08.Model_Data/L8_model_pmodel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/L8_model_pmodel.png -------------------------------------------------------------------------------- /08.Model_Data/L8_nested_cross_validation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/08.Model_Data/L8_nested_cross_validation.png -------------------------------------------------------------------------------- /08.Model_Data/main.tex: -------------------------------------------------------------------------------- 1 | %\pdfminorversion=4 2 | \documentclass[handout,fleqn,aspectratio=169]{beamer} 3 | 4 | \input{../myhead} 5 | 6 | \title[]{Lecture 8: When Models Meet Data} 7 | \author{Yi, Yung (이융)} 8 | \institute{Mathematics for Machine Learning\\ \url{https://yung-web.github.io/home/courses/mathml.html} 9 | \\KAIST EE} 10 | \date{\today} 11 | 12 | 13 | \input{../mymath} 14 | \input{../mymacro} 15 | 16 | 17 | %\addtobeamertemplate{footline}{\rule{0.94\paperwidth}{1pt}}{} 18 | 19 | \begin{document} 20 | 21 | \input{../mydefault} 22 | 23 | 24 | 25 | % START START START START START START START START START START START START START 26 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 27 | \begin{frame}{Roadmap} 28 | 29 | \plitemsep 0.1in 30 | 31 | \bce[(1)] 32 | 33 | \item Data, Models, and Learning 34 | \item Models as Functions: Empirical Risk Minimization 35 | \item Models as Probabilistic Models: Parameter Estimation (ML and MAP) 36 | \item Probabilistic Modeling 
and Inference 37 | \item Directed Graphical Models 38 | \item Model Selection 39 | 40 | \ece 41 | \end{frame} 42 | 43 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 44 | \section{L8(1)} 45 | \begin{frame}{Roadmap} 46 | 47 | \plitemsep 0.1in 48 | 49 | \bce[(1)] 50 | 51 | \item \redf{Data, Models, and Learning} 52 | \item \grayf{Models as Functions: Empirical Risk Minimization 53 | \item Models as Probabilistic Models: Parameter Estimation (ML and MAP) 54 | \item Probabilistic Modeling and Inference 55 | \item Directed Graphical Models 56 | \item Model Selection 57 | } 58 | 59 | \ece 60 | \end{frame} 61 | 62 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 63 | \begin{frame}{Data, Models, and Learning} 64 | 65 | \plitemsep 0.15in 66 | 67 | \bci 68 | 69 | \item Three major components of a machine learning system 70 | 71 | \bce 72 | \item Data: $\{(\vx_1, y_1), \ldots, (\vx_n,y_n), \ldots, (\vx_N,y_N) \}$ 73 | \item Models: deterministic functions or probabilistic models 74 | \item Learning: Training, and prediction/inference 75 | \ece 76 | \item Good machine learning models: Perform well for unseen (untrained) data 77 | 78 | \item Machine learning algorithm: training and prediction 79 | \eci 80 | \end{frame} 81 | 82 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 83 | \begin{frame}{Data as Vectors} 84 | 85 | \plitemsep 0.1in 86 | 87 | \bci 88 | 89 | \item Tabular format or not, numerical or not, good feature extraction etc. 90 | 91 | \item Assume that data is given as $D$-dimensional vector $\vx_n$ of real numbers, each called \bluef{features}, \bluef{attributes}, or \bluef{covariates}. 92 | 93 | \item Dataset: consisting of data points or examples $\{ \vx_1,$ $\vx_2,$ \ldots, $\vx_N \}$ 94 | \item In supervised learning, $\{(\vx_1, y_1), \ldots, (\vx_n,y_n), \ldots, (\vx_N,y_N) \},$ where $y_n$ is the \bluef{label} (or target, response variable, or annotation). 95 | \item Better representation of data as vectors 96 | \bci 97 | \item finding lower-dimensional approximations of the original feature vector (e.g., PCA via SVD or EVD) 98 | \item using nonlinear higher-dimensional combinations of the original feature vector (e.g., feature map and kernel) 99 | \eci 100 | \eci 101 | \end{frame} 102 | 103 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 104 | \begin{frame}{Models: Functions vs. 
Probabilistic Models} 105 | 106 | \myvartwocols{0.7}{0.65}{0.31} 107 | { 108 | \plitemsep 0.07in 109 | 110 | \bci 111 | 112 | \item Now, the business of constructing a predictor 113 | 114 | \item Models as \bluef{functions} 115 | \bci 116 | \item $f: \realD \mapsto \real.$ 117 | \item \exam $f(\vx) = \trans{\vth}\vx + \theta_0,$ Unknown parameter: $\vth,\theta_0$ 118 | 119 | \eci 120 | 121 | \item Models as \bluef{probabilistic models} 122 | 123 | \bci 124 | \item model our uncertainty due to the \bluef{observation process} and our uncertainty in the \bluef{parameters of our model} 125 | 126 | \item predictors should be able to express some sort of uncertainty via probabilistic models 127 | \item Parameters: parameters of a chosen probabilistic model (e.g., mean and variance of Gaussian) 128 | \eci 129 | 130 | \eci 131 | 132 | } 133 | { 134 | \vspace{-0.3cm} 135 | \mypic{0.99}{L8_model_function.png} 136 | \mypic{0.99}{L8_model_pmodel.png} 137 | 138 | } 139 | 140 | \end{frame} 141 | 142 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 143 | \begin{frame}{Learning Algorithms} 144 | 145 | \plitemsep 0.07in 146 | 147 | \bci 148 | 149 | \item[] Three algorithmic phases 150 | 151 | \item[(1)] Prediction or inference: via function or probabilitic models 152 | 153 | \item[(2)] Training or parameters estimation 154 | 155 | \bci 156 | \item fixed parameter assumption (non-probabilistic) or Bayesisan approach (probabilistic) 157 | \item non-probabilistic: e.g., empirical risk minimization 158 | \item probabilistic: e.g., ML (Maximum Likelihood), MAP (Maximum A Posteriori) 159 | \item cross-validation: simulation of performing for unseen data 160 | \item regularization/prior: balancing models between training and unseen data 161 | 162 | \eci 163 | \item[(3)] Hyperparameter tuning or model selection 164 | \eci 165 | \end{frame} 166 | 167 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 168 | \section{L8(2)} 169 | \begin{frame}{Roadmap} 170 | 171 | \plitemsep 0.1in 172 | 173 | \bce[(1)] 174 | 175 | \item \grayf{Data, Models, and Learning} 176 | \item \redf{Models as Functions: Empirical Risk Minimization} 177 | \item \grayf{Models as Probabilistic Models: Parameter Estimation (ML and MAP) 178 | \item Probabilistic Modeling and Inference 179 | \item Directed Graphical Models 180 | \item Model Selection 181 | } 182 | 183 | \ece 184 | \end{frame} 185 | 186 | 187 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 188 | \begin{frame}{Empirical Risk Minimization} 189 | 190 | \plitemsep 0.07in 191 | 192 | \bci 193 | 194 | \item Predictor as a function 195 | 196 | \item Given $\{(\vx_1, y_1), \ldots, (\vx_n,y_n), \ldots, (\vx_N,y_N) \},$ estimate a predictor $f(\cdot, \vth): \realD \mapsto \real$ 197 | 198 | \item Find a good parameter $\vth^*,$ such that $f(\vx_n,\vth^*) = \hat{y}_n \approx y_n,$ for all $n=1,\ldots, N$ 199 | 200 | 201 | \bigskip 202 | \item \exam Affine function: By adding the unit feature $x^{(0)}=1$ and $\theta_0$, i.e., $\vx_n = \trans{[1, x_n^{(1)}, \ldots, x_n^{(D)}]},$ $\vth = \trans{[\theta_0, \theta_1, \ldots, \theta_D]}$ 203 | \aleq 204 | { 205 | f(\vx_n,\vth) = \trans{\vth} \vx_n = \theta_0 + \sum_{d=1}^D \theta_d x_n^{(d)} 206 | } 207 | 208 | \item \exam Neural network: Complex non-linear function 209 | \eci 210 | \end{frame} 211 | 212 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 213 | \begin{frame}{Loss Function} 214 | 215 | \plitemsep 0.07in 216 | 217 | \bci 218 | 219 | \item Training set: $\{(\vx_1, y_1), \ldots, (\vx_n,y_n), \ldots, (\vx_N,y_N) 
\},$ an example matrix\footnote{In other chapters, we often use $D \times N$ example matrix by defining it as $\mat{X} \eqdef [\vx_1, \ldots, \vx_N].$ \lecturemark{L10(4)}} 220 | $\mat{X} \eqdef \trans{[\vx_1, \ldots, \vx_N]} \in \real^{N \times D},$ a label vector $\vy \eqdef 221 | \trans{[y_1, \ldots, y_N]},$ 222 | 223 | \item Average loss, empirical risk 224 | $$ 225 | \bm{R}_{\text{emp}}(f,\mat{X},\vy) = \frac{1}{N} \sum_{n=1}^N \ell(y_n,\hat{y}_n) 226 | $$ 227 | 228 | %\bigskip 229 | \item Goal: Minimizing empirical risk 230 | 231 | \item \exam The squared loss function $\ell(y_n,\hat{y}_n) = (y_n - \hat{y}_n)^2$ leads to: 232 | $$ 233 | \min_{\vth \in \realD} \frac{1}{N} \norm{\vy - \mat{X}\vth}^2 234 | $$ 235 | 236 | \item \question Ultimgate goal: Minimizing expected risk (for unseen data) $\bm{R}_{\text{true}} = \expecti{\vx,y}{\ell(y,f(\vx))}$? 237 | 238 | \eci 239 | \end{frame} 240 | 241 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 242 | \begin{frame}{Overfitting and Regularization} 243 | 244 | \plitemsep 0.07in 245 | 246 | \bci 247 | 248 | \item The predictor fits too closely to the training data and does not generalize well to new data 249 | 250 | \item Need to somehow bias the search for the minimizer of empirical risk by introducing a \bluef{penalty term} 251 | 252 | \item \bluef{Regularization}: compromise between accurate solution of empirical risk minimization and the size or complexity of the solution. 253 | 254 | \item \exam Regularized Least Squares 255 | $$ 256 | \min_{\vth \in \realD} \frac{1}{N} \norm{\vy - \mat{X}\vth}^2 + \lambda \norm{\vth}^2 257 | $$ 258 | \bci 259 | \item $\norm{\vth}^2$: regularizer, $\lambda$: regularization parameter 260 | \eci 261 | 262 | \eci 263 | \end{frame} 264 | 265 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 266 | \begin{frame}{Cross-Validation for Generalization Performance} 267 | 268 | \bigskip 269 | 270 | 271 | \mypic{0.8}{L8_cross_validation.png} 272 | 273 | \end{frame} 274 | 275 | 276 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 277 | \section{L8(3)} 278 | \begin{frame}{Roadmap} 279 | 280 | \plitemsep 0.1in 281 | 282 | \bce[(1)] 283 | 284 | \item \grayf{Data, Models, and Learning} 285 | \item \grayf{Models as Functions: Empirical Risk Minimization} 286 | \item \redf{Models as Probabilistic Models: Parameter Estimation (ML and MAP)} 287 | \item \grayf{Probabilistic Modeling and Inference 288 | \item Directed Graphical Models 289 | \item Model Selection 290 | } 291 | 292 | \ece 293 | \end{frame} 294 | 295 | 296 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 297 | \begin{frame}{MLE (Maximum Likelihood Estimation): Concept} 298 | 299 | \plitemsep 0.07in 300 | 301 | \bci 302 | 303 | \item Idea: define a function of the parameters called \bluef{likelihood function}. 304 | 305 | \item Negative log-likelihood for data $\vx$ and a family of probability densities $\cprob{\vx \mid \vth}$ parameterized by $\vth$: 306 | $$ 307 | \cL_{\vx}(\vth) = \cL(\vth) \eqdef - \log \cprob{\vx \mid \vth} 308 | $$ 309 | \bci 310 | \item $\cL(\vth)$: how likely a particular setting of $\vth$ is for the observations $\vx$. 
311 | \eci 312 | 313 | \bigskip 314 | \item \redf{MLE}: Find $\vth$ such that $\cL(\vth)$ is \bluef{minimized} (i.e., likelihood is \bluef{maximized}) 315 | \eci 316 | \end{frame} 317 | 318 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 319 | \begin{frame}{MLE: Supervised Learning} 320 | 321 | \plitemsep 0.05in 322 | 323 | \bci 324 | 325 | \item The set of iid examples $(\vx_1, y_1), \ldots, (\vx_N,y_N)$ 326 | 327 | \item $\set{X} = \{\vx_1, \ldots, \vx_N \}$ and $\set{Y} = \{y_1, \ldots, y_N \}$ 328 | 329 | \item Negative log-likelihood 330 | $$ 331 | \cL(\vth) = - \log \cprob{\set{Y} \mid \set{X}, \vth} = -\sum_{n=1}^N \log \cprob{y_n \mid \vx_n, \vth} 332 | $$ 333 | 334 | \item \exam Assume independent Gaussian noise $\set{N}(0,\sigma^2)$ and linear model $y_n = \trans{\vx}_n \vth$ for prediction. Then, $Y_n| (\vx_n,\vth) \sim \set{N}(\trans{\vx}_n\vth, \sigma^2).$ 335 | {\small 336 | \aleq{ 337 | \cL(\vth) &= - \sum_{n=1}^N \log \frac{1}{\sqrt{2\pi\sigma^2}} \exp\left(-\frac{(y_n-\trans{\vx}_n\vth )^2}{2\sigma^2} \right)= \frac{1}{2\sigma^2} \sum_{n=1}^N (y_n-\trans{\vx}_n\vth )^2 - \sum_{n=1}^N \log \frac{1}{\sqrt{2\pi\sigma^2}} 338 | }} 339 | 340 | \eci 341 | \end{frame} 342 | 343 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 344 | \begin{frame}{MAP (Maximum A Posteriori)} 345 | 346 | \plitemsep 0.1in 347 | 348 | \bci 349 | 350 | \item What if we have some \bluef{prior knowledge} about $\vth$? Then, how should we change our knowledge about $\vth$ after observing data $\vx$? 351 | 352 | \item Compute the posterior distribution (using Bayes' Theorem) and find $\vth$ that maximizes the distribution: 353 | $$ 354 | \max_{\vth} \cprob{\vth \mid \vx} = \max_{\vth} \frac{\cprob{\vx \mid \vth}\cprob{\vth}}{\cprob{\vx}} 355 | \Longleftrightarrow \min_{\vth}\Big ( -\log \cprob{\vth \mid \vx} \Big ) 356 | $$ 357 | \bci 358 | \item In finding the optimal $\vth,$ $\cprob{\vx}$ can be ignored 359 | \eci 360 | \item ML and MAP: Bridging the non-probabilistic and probabilistic worlds as it explicitly acknowledges the need for a prior distribution, yet producing a \bluef{point estimate} (a single parameter value). 361 | 362 | %\item We later see the full parameter distributions 363 | \eci 364 | \end{frame} 365 | 366 | 367 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 368 | \begin{frame}{Model Fitting} 369 | 370 | \plitemsep 0.1in 371 | 372 | \bci 373 | 374 | \item Model class $M_{\vth}$ vs. Right model $M^*$ 375 | \mypic{0.3}{L8_model_class.png} 376 | 377 | \item Overfitting vs. Underfitting vs.
Good fitting 378 | \mypic{0.7}{L8_fittings.png} 379 | \eci 380 | \end{frame} 381 | 382 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 383 | \section{L8(4)} 384 | \begin{frame}{Roadmap} 385 | 386 | \plitemsep 0.1in 387 | 388 | \bce[(1)] 389 | 390 | \item \grayf{Data, Models, and Learning} 391 | \item \grayf{Models as Functions: Empirical Risk Minimization} 392 | \item \grayf{Models as Probabilistic Models: Parameter Estimation (ML and MAP)} 393 | \item \redf{Probabilistic Modeling and Inference} 394 | \item \grayf{Directed Graphical Models 395 | \item Model Selection 396 | } 397 | 398 | \ece 399 | \end{frame} 400 | 401 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 402 | \begin{frame}{Modeling Generative Process and Probabilistic Models} 403 | 404 | \plitemsep 0.1in 405 | 406 | \bci 407 | 408 | \item Many machine learning tasks: prediction of future events and decision making 409 | 410 | \item Often build (probabilistic) models that describe the \bluef{generative process} that generates the observed data 411 | 412 | \item In probabilistic modeling, the joint distribution $\cprob{\vx,\vth}$ of the observed variables 413 | $\vx$ and the hidden parameters $\vth$ encapsulate the key information 414 | \bci 415 | \item Given: \orangef{prior} $\cprob{\vth}$ and \orangef{likelihood} $\cprob{\vx | \vth}$ 416 | \item \greenf{Joint dist.} from prior and likelihood: $\cprob{\vx,\vth} = \cprob{\vx | \vth} \cprob{\vth}$ 417 | \item We get: \redf{marginal likelihood} $\cprob{\vx} = \int \cprob{\vx,\vth} \text{d}\vth$ and \redf{posterior} $\cprob{\vth|\vx} = \frac{\cprob{\vx,\vth}}{\cprob{\vx}}$ 418 | \eci 419 | 420 | % \item Essentially, if we know the \bluef{joint distribution}, we know all about its probabilistic model 421 | \eci 422 | \end{frame} 423 | 424 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 425 | \begin{frame}{Fully Bayesian vs. ML/MAP} 426 | Given the data set $\set{X},$ we want to predict $A,$ i.e., \redblk{$\cprob{A \mid \set{X}}$} 427 | %\vspace{-1.2cm} 428 | \plitemsep 0.05in 429 | \bci 430 | \item \redf{ML}: Easy (high), Exact (low) 431 | $$ 432 | \cprob{A \mid \set{X}} \approx \cprob{A \mid \vth}, \quad \vth = \arg\max \cprob{\set{X} \mid \vth} 433 | $$ 434 | 435 | \item \redf{MAP}: Easy (mid), Exact (mid) 436 | $$ 437 | \cprob{A \mid \set{X}} \approx \cprob{A \mid \vth}, \quad \vth = \arg\max \cprob{\vth \mid \set{X}} 438 | $$ 439 | 440 | \item \redf{Fully Bayesian}: Easy (low), Exact (high) 441 | 442 | \medskip 443 | - predictive inference, use of posterior predictive distribution, bayesian prediction 444 | 445 | - remove dependence on the model parameters $\vth$ 446 | 447 | $$ 448 | \cprob{A \mid \set{X}} = \int \cprob{A \mid \vth} \cprob{\vth \mid \set{X}}\text{d}\vth 449 | $$ 450 | 451 | - Only possible by getting the full posterior distribution $\cprob{\vth \mid \set{X}}$ 452 | \eci 453 | \end{frame} 454 | 455 | 456 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 457 | \begin{frame}{(Fully) Bayesian Inference: Hardness} 458 | 459 | \plitemsep 0.2in 460 | 461 | \bci 462 | 463 | % \item \bluef{Sinle} Earlier, two ways of estimating the parameter $\vth$: ML and MAP. Essentially, it is solving an optimization problem to get a single best value $\vth^*.$ $\implies$ Prediction through $\cprob{\vx \mid \vth^*}.$ 464 | 465 | % \item Rather than just a likelihood, having the \bluef{full posterior distribution} can be useful. 
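% NOTE (editor's illustration, not part of the lecture source): a tiny numerical contrast of the three levels above (ML, MAP, fully Bayesian) on a Bernoulli coin-flip model with a grid over the parameter. It assumes NumPy; the Beta(2,2) prior and the 7-heads-out-of-10 data are arbitrary toy choices.

import numpy as np

heads, flips = 7, 10
theta = np.linspace(1e-3, 1 - 1e-3, 999)            # grid over theta = P(heads)
dtheta = theta[1] - theta[0]

prior = 6 * theta * (1 - theta)                      # Beta(2,2) prior density
lik = theta ** heads * (1 - theta) ** (flips - heads)

post = prior * lik
post /= (post * dtheta).sum()                        # normalized posterior p(theta | X) on the grid

theta_ml = theta[np.argmax(lik)]                     # ML point estimate (~0.700)
theta_map = theta[np.argmax(post)]                   # MAP point estimate (~0.667)
p_heads_bayes = (theta * post * dtheta).sum()        # fully Bayesian: integrate theta out (~0.643)
print(theta_ml, theta_map, p_heads_bayes)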
466 | 467 | \item For a data set $\set{X},$ a parameter prior $\cprob{\vth},$ and a likelihood function, the posterior is: 468 | $$ 469 | \cprob{\vth \mid \set{X}} = \frac{\cprob{\set{X} \mid \vth} \cprob{\vth}}{\cprob{\set{X}}}, \quad 470 | \cprob{\set{X}} = \int \cprob{\set{X} \mid \vth} \cprob{\vth} \; \text{d}\vth 471 | $$ 472 | 473 | %\item \question \bluef{Examples of prediction using the posterior distribution?} 474 | 475 | \item Implementation hardness 476 | \bci 477 | \item Bayesian inference requires solving integrals, which is often challenging. In particular, if a conjugate prior is not chosen, the integration is not analytically tractable. 478 | 479 | \item Approximation techniques: MCMC (Markov Chain Monte Carlo), Laplace approximation, variational inference, expectation propagation 480 | \eci 481 | \eci 482 | \end{frame} 483 | 484 | 485 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 486 | \begin{frame}{Latent-Variable Models (1)} 487 | 488 | \plitemsep 0.1in 489 | 490 | \bci 491 | 492 | \item Including latent variables in the model $\rightarrow$ contributes to the interpretability of the model 493 | 494 | \item The general discussion here will be applied to the following examples later 495 | \bci 496 | \item PCA for dimensionality reduction \hfill \lecturemark{L10(7)} 497 | \item Gaussian mixture models for density estimation \hfill \lecturemark{L11(3)} 498 | \eci 499 | 500 | 501 | \item In latent-variable models (LVMs)\footnote{In our note, we express the dependence on the model parameters $\vth$ using subscript notations, e.g., $\cprobi{\vth}{\vx | \vz}$ rather than $\cprob{\vx| \vz, \vth}$ to highlight the role of $\vz.$ }, 502 | \bci 503 | \item Given: \orangef{prior} $\cprob{\vz}$ and \orangef{likelihood} $\cprobi{\vth}{\vx | \vz}$ 504 | \item \greenf{Joint dist.} from prior and likelihood: $\cprobi{\vth}{\vx,\vz} = \cprobi{\vth}{\vx | \vz} \cprob{\vz}$ 505 | \item Our interest: \redf{marginal likelihood} $\cprobi{\vth}{\vx}$ 506 | and \redf{posterior} $\cprobi{\vth}{\vz|\vx}$ 507 | \eci 508 | 509 | 510 | % \item Offers data generation process through parameters: $\cprob{\vx \mid \vth, \vz}$, $\cprob{\vz}$ 511 | 512 | % \item Marginalization over the latent variables, which allows parameter estimation by ML and MAP (using the prior $\cprob{\vth}$) 513 | % $$ 514 | % \cprob{\vx \mid \vth} = \int \cprob{\vx \mid \vth, \vz} \cprob{\vz}\; \text{d}\vz 515 | % $$ 516 | 517 | \eci 518 | \end{frame} 519 | 520 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 521 | \begin{frame}{LVM (2)} 522 | 523 | \plitemsep 0.1in 524 | 525 | \bci 526 | 527 | \item Assuming we know $\vth$, to generate a data sample from the model (i) sample $\vz$ from $\cprob{\vz}$ and (ii) sample $\vx$ from $\cprobi{\vth}{\vx|\vz}$ 528 | 529 | \item \redf{Inference.} Computing the \bluef{posterior distribution} $\cprobi{\vth}{\vz | \vx}$: 530 | $$ 531 | \cprobi{\vth}{\vz | \vx} = \frac{\cprobi{\vth}{\vx,\vz}}{\cprobi{\vth}{\vx}} = 532 | \frac{\cprobi{\vth}{\vx,\vz}}{\int \cprobi{\vth}{\vx,\vz} \text{d}\vz} 533 | $$ 534 | 535 | \item This requires solving the sub-problem of computing the \bluef{marginal likelihood} of the observation: 536 | $$\displaystyle \cprobi{\vth}{\vx} = \int \cprobi{\vth}{\vx,\vz} \text{d}\vz$$ 537 | 538 | \eci 539 | \end{frame} 540 | 541 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 542 | \begin{frame}{LVM (3): Why the posterior distribution $\cprobi{\vth}{\vz | \vx}$?} 543 | 544 | \plitemsep 0.2in 545 | 546 | \bci 547 | 548 | \item \bluef{Explanation of the
observation.} Allows us to figure out which latent configurations could have plausibly generated the observation data samples. 549 | 550 | \item \bluef{Learning of model parameters $\vth$.} Training LVMs to estimate $\vth$ (e.g., ML) requires $\cprobi{\vth}{\vz | \vx}$ in its inner loops 551 | 552 | \item[] 553 | \mycolorbox{ 554 | \vspace{-0.3cm} 555 | $$ 556 | \text{marginal likelihood $\cprobi{\vth}{\vx}$} \implies \text{posterior distribution $\cprobi{\vth}{\vz | \vx}$} \implies \text{$\vth_{\ml}$} 557 | $$ 558 | } 559 | \eci 560 | \end{frame} 561 | 562 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 563 | \begin{frame}{LVM (4): How is $\cprobi{\vth}{\vz | \vx}$? used for $\vth_{\ml}$?} 564 | 565 | \plitemsep 0.2in 566 | 567 | \bci 568 | 569 | \item In ML, we need the gradient of the marginal log-likelihood. For a data sample $\vx,$ 570 | \aleq{ 571 | \grad_{\vth} \log p_{\vth}(\vx) &= \frac{\grad_{\vth} p_{\vth}(\vx)}{p_{\vth}(\vx)} 572 | = \frac{\int \grad_{\vth} p_{\vth}(\vx,\vz)\text{d}\vz}{p_{\vth}(\vx)} = 573 | \frac{\int p_{\vth}(\vx,\vz) \grad_{\vth} \log p_{\vth}(\vx,\vz)\text{d}\vz}{p_{\vth}(\vx)} \cr 574 | &= \int \orangef{p_{\vth}(\vz|\vx)} \grad_{\vth} \log p_{\vth}(\vx,\vz)\text{d}\vz 575 | } 576 | 577 | \item $\cprobi{\vth}{\vz | \vx}$ performs \bluef{credit assignment} over latent configurations 578 | \eci 579 | \end{frame} 580 | 581 | % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 582 | % \begin{frame}{Generative Modeling with Latent Variables} 583 | 584 | % \plitemsep 0.1in 585 | 586 | % \myvartwocols{0.7}{0.29}{0.65} 587 | % { 588 | % \bci 589 | % \item Generative process 590 | % \bci 591 | % \item $\vz \sim p(\vz)$ 592 | % \item $\vx \sim p(\vx | \vz)$ 593 | % \eci 594 | % \eci 595 | 596 | % \bigskip 597 | % \aleq 598 | % { 599 | % p(\vx) &= \int p(\vx,\vz)\text{d}\vz \cr 600 | % &=\int p(\vx|\vz)p(\vz)\text{d}\vz 601 | % } 602 | 603 | % } 604 | % { 605 | % \vspace{-0.5cm} 606 | % \mypic{0.4}{L10_latent.png} 607 | % \vspace{-0.6cm} 608 | % \raggedleft 609 | % {\tiny Source: \url{https://dlvu.github.io/slides/dlvu.lecture06.pdf}} 610 | % } 611 | 612 | % \end{frame} 613 | 614 | % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 615 | % \begin{frame}{Latent-Variable Models (2)} 616 | 617 | % \plitemsep 0.1in 618 | 619 | % \bci 620 | 621 | % \item We can compute a posterior on the latent variables, but marginalizing over both $\vz$ and $\vth$ is hard: 622 | % $$ 623 | % \cprob{\vz \mid \set{X}} = \frac{\cprob{\set{X}\mid \vz} \cprob{\vz}}{\cprob{X}}, \quad 624 | % \cprob{\set{X} \mid \vz} = \int \cprob{\set{X} \mid \vz, \vth} \cprob{\vth} \text{d}\vth 625 | % $$ 626 | 627 | 628 | % \item Instead, it is easier to compute the latent-variable posterior, but conditioned on the model parameters, i.e., 629 | % $$ 630 | % \cprob{\vz \mid \set{X},\vth} = \frac{\cprob{\set{X}\mid \vz,\vth} \cprob{\vz}}{\cprob{\set{X} \mid \vth}} 631 | % $$ 632 | 633 | % \item \question How do we use the posteriors $\cprob{\vz \mid \set{X}}$ or $\cprob{\vz \mid \set{X},\vth}$ in practice? Any examples? 
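The credit-assignment identity above can be checked numerically. Below is a small sketch (my own, not from the slides) for a two-component 1-D Gaussian mixture with latent component z: the posterior-weighted gradient of the log joint is compared against a finite-difference gradient of the log marginal likelihood. The mixture weights, means, and noise level are arbitrary example values.

```python
# Numerical sanity check of
#   d/d(mu_k) log p(x) = E_{p(z|x)} [ d/d(mu_k) log p(x, z) ]
# for a tiny 1-D mixture of two Gaussians with latent component z in {0, 1}.
import math

pi = [0.3, 0.7]          # mixture weights (assumed known here)
sigma = 1.0

def normal_pdf(x, mu, sigma):
    return math.exp(-(x - mu) ** 2 / (2 * sigma ** 2)) / math.sqrt(2 * math.pi * sigma ** 2)

def log_marginal(x, mu):
    return math.log(sum(pi[k] * normal_pdf(x, mu[k], sigma) for k in range(2)))

def posterior_weighted_grad(x, mu):
    # responsibilities p(z = k | x)
    joint = [pi[k] * normal_pdf(x, mu[k], sigma) for k in range(2)]
    total = sum(joint)
    r = [j / total for j in joint]
    # d/d(mu_k) log p(x, z = k) = (x - mu_k) / sigma^2, weighted by r_k
    return [r[k] * (x - mu[k]) / sigma ** 2 for k in range(2)]

x, mu, eps = 0.5, [-1.0, 2.0], 1e-6
fd_grad = [(log_marginal(x, [mu[0] + eps, mu[1]]) - log_marginal(x, [mu[0] - eps, mu[1]])) / (2 * eps),
           (log_marginal(x, [mu[0], mu[1] + eps]) - log_marginal(x, [mu[0], mu[1] - eps])) / (2 * eps)]
print("posterior-weighted:", posterior_weighted_grad(x, mu))
print("finite difference :", fd_grad)   # the two should agree to roughly 1e-6
```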
634 | % \eci 635 | % \end{frame} 636 | 637 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 638 | \section{L8(5)} 639 | \begin{frame}{Roadmap} 640 | 641 | \plitemsep 0.1in 642 | 643 | \bce[(1)] 644 | 645 | \item \grayf{Data, Models, and Learning} 646 | \item \grayf{Models as Functions: Empirical Risk Minimization} 647 | \item \grayf{Models as Probabilistic Models: Parameter Estimation (ML and MAP)} 648 | \item \grayf{Probabilistic Modeling and Inference} 649 | \item \redf{Directed Graphical Models } 650 | \item \grayf{Model Selection} 651 | \ece 652 | \end{frame} 653 | 654 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 655 | \begin{frame}{Graphical Models} 656 | 657 | \plitemsep 0.1in 658 | 659 | \bci 660 | 661 | \item Joint distribution of a probabilistic model: key quantity of interest, but quite complicated without structural properties 662 | 663 | \item However, there exist relations of \bluef{independence}, \bluef{conditional independence} among random variables. 664 | 665 | \item (Probabilistic) graphical models: Roughly speaking, a graph of random variables. 666 | 667 | \bci 668 | \item Simple ways to visualize the structure of the model 669 | \item Insights into the structural properties, e.g., conditional independence 670 | \item Computations for inference and learning can be expressed in terms of graphical manipulations 671 | \eci 672 | \eci 673 | \end{frame} 674 | 675 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 676 | \begin{frame}{Graph Semantics} 677 | 678 | \mytwocols{0.4} 679 | { 680 | \aleq{ 681 | \cprob{a,b,c} = \cprob{c| a,b} \cprob{b | a}\cprob{a} 682 | } 683 | \mypic{0.4}{L8_gmodel_ex1.png} 684 | } 685 | { 686 | %\vspace{-0.3cm} 687 | \aleq{ 688 | &\cprob{x_1, x_2, x_3, x_4, x_5} = \cr 689 | &\cprob{x_1}\cprob{x_5}\cprob{x_2 | x_5}\cprob{x_3 | x_1, x_2} \cprob{x_4 | x_2}} 690 | \mypic{0.6}{L8_gmodel_ex2.png} 691 | } 692 | \vspace{-0.3cm} 693 | \plitemsep 0.03in 694 | \bci 695 | \item Nodes: random variables 696 | \item Directed edge for direct dependence: $b$ directly depends on $a$: $a \rightarrow b$ 697 | \item Graph layout: factorization of the joint distribution 698 | $$ 699 | \cprob{x_1, \ldots, x_K} = \prod_{k=1}^K \cprob{x_k \mid \mathbf{Pa}_k}, \quad\text{$\mathbf{Pa}_k$ are the parent nodes of $x_k.$} 700 | $$ 701 | \eci 702 | \end{frame} 703 | 704 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 705 | \begin{frame}{Example: $N$ coin-flip experiments} 706 | 707 | \mypic{0.75}{L8_coinflip.png} 708 | \plitemsep 0.07in 709 | \bci 710 | \item Shaded nodes: observables, $\mu$: probability of head, a (latent) random variable 711 | \item Joint distribution 712 | $$ 713 | \cprob{x_1, \ldots, x_N \mid \mu} = \prod_{n=1}^N \cprob{x_n \mid \mu} 714 | $$ 715 | \eci 716 | \end{frame} 717 | 718 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 719 | \begin{frame}{Conditional Independence and $d$-Separation} 720 | 721 | \plitemsep 0.07in 722 | \bci 723 | \item \question How can we see conditional independence in the directed graphical models? For example, $\set{A} \indep \set{B} \mid \set{C}$? 
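As a complement to the factorization semantics above, and as a preview of the d-separation rules that follow, here is a sketch (my own example, not from the slides) that assembles the joint distribution of the five-node example from its factors p(x1)p(x5)p(x2|x5)p(x3|x1,x2)p(x4|x2) with arbitrary binary CPTs, and then checks numerically that x1 and x5 are independent marginally but become dependent once the collider x3 is observed.

```python
# Build a joint distribution exactly as the factorization over parents prescribes,
# then probe one (conditional) independence implied by the graph structure.
from itertools import product

# Arbitrary (hypothetical) CPTs; each value is P(var = 1 | parents).
p_x1 = 0.3
p_x5 = 0.6
p_x2_given_x5 = {0: 0.2, 1: 0.7}
p_x3_given_x1x2 = {(0, 0): 0.1, (0, 1): 0.5, (1, 0): 0.4, (1, 1): 0.9}
p_x4_given_x2 = {0: 0.3, 1: 0.8}

def bern(p, v):                 # P(var = v) for a binary variable with P(var = 1) = p
    return p if v == 1 else 1 - p

joint = {}
for x1, x2, x3, x4, x5 in product([0, 1], repeat=5):
    joint[(x1, x2, x3, x4, x5)] = (bern(p_x1, x1) * bern(p_x5, x5) *
                                   bern(p_x2_given_x5[x5], x2) *
                                   bern(p_x3_given_x1x2[(x1, x2)], x3) *
                                   bern(p_x4_given_x2[x2], x4))

def prob(**fixed):              # marginal probability of a partial assignment
    names = ("x1", "x2", "x3", "x4", "x5")
    return sum(p for assign, p in joint.items()
               if all(assign[names.index(k)] == v for k, v in fixed.items()))

for v5 in (0, 1):
    marg = prob(x1=1, x5=v5) / prob(x5=v5)
    cond = prob(x1=1, x5=v5, x3=1) / prob(x5=v5, x3=1)
    print(f"P(x1=1 | x5={v5}) = {marg:.3f}   P(x1=1 | x5={v5}, x3=1) = {cond:.3f}")
# The first column is constant in x5 (the trail x1 -> x3 <- x2 <- x5 is blocked at the
# head-to-head node x3), while the second column varies with x5: conditioning on the
# collider x3 unblocks the trail, exactly as the d-separation rules below state.
```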
724 | \item \bluef{$d$-separation} 725 | \bci 726 | \item All possible trails\footnote{paths that ignore the direction of the arrows} from any node $\set{A}$ to any node in $\set{B}$ 727 | \item Any such path is blocked if it includes any node such that either of the following is true: 728 | \bci 729 | \item The arrows on the path meet either head to tail or tail to tail at the node, and the node is in \set{C} 730 | \item The arrows meet head to head at the node, and neither the node nor any of its descendants is in \set{C} 731 | \eci 732 | \item If all the paths are blocked, then $\set{A}$ is $d$-separated from $\set{B}$ by $\set{C}.$ 733 | \item If $d$-separated, $\set{A} \indep \set{B} \mid \set{C}$ 734 | \eci 735 | \eci 736 | 737 | % \myvartwocols{0.4}{0.7}{0.26} 738 | % { 739 | % \small 740 | 741 | % } 742 | % { 743 | % \mypic{0.7}{L8_dsep.png} 744 | % } 745 | 746 | \end{frame} 747 | 748 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 749 | \begin{frame}{Example} 750 | 751 | \mypic{0.25}{L8_dsep.png} 752 | % \vspace{-0.8cm} 753 | % \raggedleft{\scriptsize Source: \url{http://www.causality.inf.ethz.ch/data/LUCAS.html}} 754 | \plitemsep 0.1in 755 | \bci 756 | \item $b \indep d \mid a,c$ 757 | \item $a \indep c \mid b$ 758 | \item $b \not\indep d \mid c$ 759 | \item $a \not\indep c \mid b,e$ 760 | \eci 761 | \end{frame} 762 | 763 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 764 | \begin{frame}{Example in Healthcare} 765 | 766 | \mypic{0.6}{L8_lung_cancer.png} 767 | \vspace{-0.8cm} 768 | \raggedleft{\scriptsize Source: \url{http://www.causality.inf.ethz.ch/data/LUCAS.html}} 769 | 770 | \end{frame} 771 | 772 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 773 | \begin{frame}{Three Types of Graphical Models} 774 | 775 | \mypic{0.7}{L8_all_gmodels.png} 776 | 777 | \plitemsep 0.15in 778 | \bci 779 | \item \bluef{Directed graphical models (or Bayesian Networks)} 780 | 781 | \item Undirected graphical models (Markov Random Fields) 782 | 783 | \item Factor graphs 784 | \eci 785 | 786 | \end{frame} 787 | 788 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 789 | \begin{frame}{Roadmap} 790 | 791 | \plitemsep 0.1in 792 | 793 | \bce[(6)] 794 | 795 | \item \grayf{Data, Models, and Learning} 796 | \item \grayf{Models as Functions: Empirical Risk Minimization} 797 | \item \grayf{Models as Probabilistic Models: Parameter Estimation (ML and MAP)} 798 | \item \grayf{Probabilistic Modeling and Inference} 799 | \item \grayf{Directed Graphical Models } 800 | \item \redf{Model Selection} 801 | \ece 802 | \end{frame} 803 | 804 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 805 | \begin{frame}{Nested Cross-Validation} 806 | 807 | 808 | \mypic{0.65}{L8_nested_cross_validation.png} 809 | 810 | \plitemsep 0.1in 811 | 812 | \bci 813 | 814 | \item Model selection 815 | \bci 816 | \item Tradeoff between model complexity and data fit 817 | 818 | \item \bluef{Occam's razor.} Find the simplest model that explains the data resonably well. 819 | \eci 820 | 821 | 822 | \item Test set: estimate the generalization performance 823 | 824 | \item Validation set: choose the best model 825 | \eci 826 | \end{frame} 827 | 828 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 829 | \begin{frame}{Bayesian Model Selection} 830 | 831 | \plitemsep 0.1in 832 | 833 | \bci 834 | 835 | \item A set of models $\bm{M} = \{M_1, \ldots, M_k \},$ where each $M_k$ has $\vth_k$ parameters. 
A prior $\cprob{M}$ on each model $M \in \bm{M}.$ 836 | $$ 837 | M_k \sim \cprob{M}, \quad \vth_k \sim \cprob{\vth \mid M_k}, \quad \set{D} \sim \cprob{\set{D} \mid \vth_k} 838 | $$ 839 | \item Posterior distribution $\cprob{M_k \mid \set{D}} \propto \cprob{M_k} \cprob{\set{D} \mid M_k},$ where we have the following \bluef{model evidence} or \bluef{marginal likelihood}: 840 | $$ 841 | \cprob{\set{D} \mid M_k} = \int \cprob{\set{D} \mid \vth_k} \cprob{\vth_k \mid M_k} \text{d}\vth_k \quad \text{(***)} 842 | $$ 843 | \item MAP for the model: $M^* = \arg \max_{M_k} \cprob{M_k \mid \set{D}}$ 844 | \item With the uniform model prior (i.e., $\cprob{M_k} = 1/k$), the MAP estimate equals to maximization of model evidence. 845 | \eci 846 | \end{frame} 847 | 848 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 849 | \begin{frame}{Bayes Factors for Model Comparison} 850 | 851 | \plitemsep 0.1in 852 | 853 | \bci 854 | 855 | \item Compare two probabilistic models $M_1$ and $M_2$: 856 | $$ 857 | \text{(Posterior odds)} = \frac{\cprob{M_1 \mid \set{D}}}{\cprob{M_2 \mid \set{D}}} = \frac 858 | { 859 | \frac{\cprob{\set{D} \mid M_1}\cprob{M_1}}{\cprob{\set{D}}} 860 | } 861 | { 862 | \frac{\cprob{\set{D} \mid M_2}\cprob{M_2}}{\cprob{\set{D}}} 863 | } 864 | = \underbrace{\frac{\cprob{M_1}}{\cprob{M_2}}}_{\text{Prior odds}} 865 | \underbrace{\frac{\cprob{\set{D} \mid M_1}}{\cprob{\set{D} \mid M_2}}}_{\text{Bayes factor}} 866 | $$ 867 | \item $\cprob{\set{D} \mid M_k}$: How well the data is predicted by the model $M_k$ 868 | \item With the uniform model prior, the prior odds $= 1$ 869 | 870 | \item Computation of Bayes factor requires the complex integration (***) in the previous slide. In this case, we rely on some approximations such as MCMC (Markov Chain Monte Carlo). 871 | \eci 872 | 873 | \end{frame} 874 | 875 | 876 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 877 | \begin{frame}{Summary} 878 | 879 | \plitemsep 0.1in 880 | 881 | \bci 882 | 883 | \item 884 | \eci 885 | \end{frame} 886 | 887 | 888 | 889 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 890 | \begin{frame}{} 891 | \vspace{2cm} 892 | \LARGE Questions? 
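A hedged sketch of Bayesian model selection (my own example, not from the slides): two "models" of a coin differ only in their prior on the head probability, and because a Beta prior makes the model evidence (***) analytic, the Bayes factor can be computed directly. The prior parameters and the data counts are arbitrary illustrative values.

```python
# Model evidence and Bayes factor for two coin models that differ only in their prior.
from math import lgamma, exp

def log_beta(a, b):
    return lgamma(a) + lgamma(b) - lgamma(a + b)

def log_evidence(heads, flips, a, b):
    # log p(D | M) for one observed sequence with `heads` heads out of `flips`,
    # integrating the Bernoulli likelihood against a Beta(a, b) prior on theta.
    return log_beta(heads + a, flips - heads + b) - log_beta(a, b)

heads, flips = 9, 10
m1 = log_evidence(heads, flips, a=50.0, b=50.0)   # M1: prior concentrated near a fair coin
m2 = log_evidence(heads, flips, a=1.0, b=1.0)     # M2: uniform prior over theta
print("log p(D|M1) =", round(m1, 3), "  log p(D|M2) =", round(m2, 3))
print("Bayes factor p(D|M2)/p(D|M1) =", round(exp(m2 - m1), 2))
# With a uniform model prior the posterior odds equal this Bayes factor,
# so 9 heads out of 10 favor the model that allows a biased coin (M2) here.
```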
893 | 894 | 895 | \end{frame} 896 | 897 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 898 | \begin{frame}{Review Questions} 899 | % \tableofcontents 900 | %\plitemsep 0.1in 901 | \bce[1)] 902 | \item 903 | 904 | \ece 905 | \end{frame} 906 | 907 | 908 | \end{document} 909 | -------------------------------------------------------------------------------- /09.LinearRegression/9.LR-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/09.LinearRegression/9.LR-2.pdf -------------------------------------------------------------------------------- /09.LinearRegression/9.LR-4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/09.LinearRegression/9.LR-4.pdf -------------------------------------------------------------------------------- /09.LinearRegression/9.LR.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/09.LinearRegression/9.LR.pdf -------------------------------------------------------------------------------- /09.LinearRegression/L9_LR_gmodel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/09.LinearRegression/L9_LR_gmodel.png -------------------------------------------------------------------------------- /09.LinearRegression/L9_bayesian_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/09.LinearRegression/L9_bayesian_regression.png -------------------------------------------------------------------------------- /09.LinearRegression/L9_overfit_linear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/09.LinearRegression/L9_overfit_linear.png -------------------------------------------------------------------------------- /09.LinearRegression/L9_poly4fit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/09.LinearRegression/L9_poly4fit.png -------------------------------------------------------------------------------- /09.LinearRegression/L9_posterior_predictive_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/09.LinearRegression/L9_posterior_predictive_ex.png -------------------------------------------------------------------------------- /09.LinearRegression/L9_regression_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/09.LinearRegression/L9_regression_ex.png -------------------------------------------------------------------------------- /09.LinearRegression/L9_training_test.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/09.LinearRegression/L9_training_test.png -------------------------------------------------------------------------------- /09.LinearRegression/main.tex: -------------------------------------------------------------------------------- 1 | %\pdfminorversion=4 2 | \documentclass[handout,fleqn,aspectratio=169]{beamer} 3 | 4 | \input{../myhead} 5 | 6 | \title[]{Lecture 9: Linear Regression} 7 | \author{Yi, Yung (이융)} 8 | \institute{Mathematics for Machine Learning\\ \url{https://yung-web.github.io/home/courses/mathml.html} 9 | \\KAIST EE} 10 | \date{\today} 11 | 12 | 13 | \input{../mymath} 14 | \input{../mymacro} 15 | 16 | 17 | %\addtobeamertemplate{footline}{\rule{0.94\paperwidth}{1pt}}{} 18 | 19 | \begin{document} 20 | 21 | \input{../mydefault} 22 | 23 | 24 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 25 | \begin{frame}{Warm-Up} 26 | 27 | {\Large Please watch this tutorial video by Luis Serrano on PCA.} 28 | 29 | \bigskip 30 | 31 | \bigskip 32 | 33 | \url{https://www.youtube.com/watch?v=wYPUhge9w5c} 34 | 35 | \end{frame} 36 | 37 | % START START START START START START START START START START START START START 38 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 39 | \begin{frame}{Roadmap} 40 | 41 | \plitemsep 0.1in 42 | 43 | \bce[(1)] 44 | 45 | \item Problem Formulation 46 | \item Parameter Estimation: ML 47 | \item Parameter Estimation: MAP 48 | \item Bayesian Linear Regression 49 | \item Maximum Likelihood as Orthogonal Projection 50 | 51 | \ece 52 | \end{frame} 53 | 54 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 55 | \section{L9(1)} 56 | \begin{frame}{Roadmap} 57 | 58 | \plitemsep 0.1in 59 | 60 | \bce[(1)] 61 | 62 | \item \redf{Problem Formulation} 63 | \item \grayf{Parameter Estimation: ML 64 | \item Parameter Estimation: MAP 65 | \item Bayesian Linear Regression 66 | \item Maximum Likelihood as Orthogonal Projection } 67 | 68 | \ece 69 | \end{frame} 70 | 71 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 72 | \begin{frame}{Regression Problem} 73 | 74 | \mypic{0.75}{L9_regression_ex.png} 75 | 76 | \plitemsep 0.1in 77 | 78 | \bci 79 | 80 | \item For some input values $x_n,$ we observe noisy function values $y_n = f(x_n) + \epsilon$ 81 | 82 | \item Goal: infer the function $f$ that generalizes well to function values at new inputs 83 | 84 | \item Applications: time-series analysis, control and robotics, image recognition, etc. 
85 | \eci 86 | \end{frame} 87 | 88 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 89 | \begin{frame}{Formulation} 90 | 91 | \plitemsep 0.07in 92 | 93 | \bci 94 | 95 | 96 | \item[] 97 | {\small 98 | Notation for simplification (this is the notation the textbook uses) 99 | \aleq{ 100 | \redf{p(y|\vx)} = p_{Y|\vX}(y | \vx), \quad Y \sim \set{N}(\mu,\sigma^2) \xrightarrow{\text{simplifies}} \set{N}(y \mid f(\vx), \sigma^2) 101 | } 102 | } 103 | \item Assume: \bluef{linear} regression, \bluef{Gaussian} noise 104 | 105 | \item $y = f(\vx) + \epsilon,$ where $\epsilon \sim \set{N}(0,\sigma^2)$ 106 | 107 | 108 | \item Likelihood: for $\vx \in \real^D$ and $y \in \real,$ $p(y \mid \vx) = \set{N}(y \mid f(\vx), \sigma^2)$ 109 | 110 | 111 | \item Linear regression with the parameter $\vth \in \realD,$ i.e., $f(\vx) = \trans{\vx}\vth$ 112 | $$ 113 | p(y \mid \vx) = \set{N}(y \mid \trans{\vx}\vth, \sigma^2) \Longleftrightarrow y = \trans{\vx}\vth + \epsilon, \quad \epsilon \sim \set{N}(0,\sigma^2) 114 | $$ 115 | 116 | \mycolorbox 117 | { 118 | \centering 119 | Likelihood with Gaussian noise: $p(y \mid \vx) = \set{N}(y \mid \trans{\vx}\vth, \sigma^2)$ 120 | } 121 | 122 | \eci 123 | \end{frame} 124 | 125 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 126 | \begin{frame}{Parameter Estimation} 127 | 128 | \plitemsep 0.2in 129 | 130 | \bci 131 | 132 | \item Training set $\set{D} = \{(\vx_1, y_1), \ldots, (\vx_N,y_N) \}$\hspace{3cm} 133 | \myinlinepic{2.5cm}{L9_LR_gmodel.png} 134 | 135 | \item Assuming iid $N$ data samples, the likelihood is factorized into: 136 | $$ 137 | p(\set{Y} \mid \set{X},\vth) = \prod_{n=1}^N p(y_n \mid \vx_n, \vth) = \prod_{n=1}^N 138 | \set{N}(y_n \mid \trans{\vx}_n \vth, \sigma^2), 139 | $$ 140 | where $\set{X} = \{\vx_1,\ldots,\vx_N \}$ and $\set{Y} = \{y_1,\ldots,y_N \}$ 141 | \item Estimation methods: ML and MAP 142 | \eci 143 | \end{frame} 144 | 145 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 146 | \section{L9(2)} 147 | \begin{frame}{Roadmap} 148 | 149 | \plitemsep 0.1in 150 | 151 | \bce[(1)] 152 | 153 | \item \grayf{Problem Formulation} 154 | \item \redf{Parameter Estimation: ML} 155 | \item \grayf{Parameter Estimation: MAP 156 | \item Bayesian Linear Regression 157 | \item Maximum Likelihood as Orthogonal Projection } 158 | 159 | \ece 160 | \end{frame} 161 | 162 | 163 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 164 | \begin{frame}{MLE (Maximum Likelihood Estimation) (1)} 165 | 166 | \plitemsep 0.1in 167 | 168 | \bci 169 | 170 | \item $\vth_\ml = \arg \max_{\vth} p(\cY \mid \cX, \vth) = \arg \min_{\vth} \Big( -\log p(\cY \mid \cX, \vth) \Big)$ 171 | \item For Gaussian noise with $\mX = \trans{[\vx_1, \ldots, \vx_n]}$ and $\vy = \trans{[y_1, \ldots, y_n]},$ 172 | \aleq{ 173 | -\log p(\cY \mid \cX, \vth) &= -\log \prod_{n=1}^N p(y_n \mid \vx_n, \vth) = -\sum_{n=1}^N \log p(y_n \mid \vx_n, \vth) \cr 174 | & = \frac{1}{2\sigma^2} \sum_{n=1}^N (y_n - \trans{\vx}_n \vth)^2 + \ \text{const} = \frac{1}{2\sigma^2} \norm{\vy - \mX \vth}^2 + \ \text{const} 175 | } 176 | 177 | \mycolorbox 178 | { 179 | Negative-log likelihood for $f(\vx) = \trans{\vx}\vth + \set{N}(0,\sigma^2)$: 180 | \vspace{-0.1cm} 181 | $$ 182 | -\log p(\cY \mid \cX, \vth) = \frac{1}{2\sigma^2} \norm{\vy - \mX \vth}^2 + \ \text{const} 183 | $$ 184 | } 185 | \eci 186 | \end{frame} 187 | 188 | 189 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 190 | \begin{frame}{MLE (Maximum Likelihood Estimation) (2)} 191 | 192 | \plitemsep 0.2in 193 | 194 | \bci 195 | 196 | \item For Gaussian noise
with $\mX = \trans{[\vx_1, \ldots, \vx_n]}$ and $\vy = \trans{[y_1, \ldots, y_n]},$ 197 | \aleq{ 198 | \vth_\ml = \arg \min_{\vth} \frac{1}{2\sigma^2} \norm{\vy - \mX \vth}^2, \quad L(\vth) = \frac{1}{2\sigma^2} \norm{\vy - \mX \vth}^2 199 | } 200 | 201 | \item In case of Gaussian noise, $\vth_{\ml}= \vth$ that minimizes the empirical risk with the squared loss function 202 | \bci 203 | \item Models as functions $=$ Model as probabilistic models 204 | \eci 205 | 206 | \eci 207 | \end{frame} 208 | 209 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 210 | \begin{frame}{MLE (Maximum Likelihood Estimation) (3)} 211 | 212 | \plitemsep 0.2in 213 | 214 | \bci 215 | 216 | \item We find $\vth$ such that $\d{L}{\vth}=0$ 217 | \aleq{ 218 | &\d{L}{\vth} = \frac{1}{2\sigma^2} \left (-2 \trans{(\vy - \mX \vth)} \mX\right) = \frac{1}{\sigma^2} \left (-\trans{\vy}\vX +\trans{\vth}\trans{\mX}\mX \right ) = 0\cr 219 | &\Longleftrightarrow \trans{\vth}_\ml\trans{\mX}\mX = \trans{\vy}\vX \cr 220 | & \Longleftrightarrow 221 | \trans{\vth}_\ml = \trans{\vy}\vX \inv{(\trans{\mX}\mX)} \quad \text{($\trans{\mX}\mX$ is positive definite if $\rk{\mX}=D$)} \cr 222 | & \Longleftrightarrow \vth_\ml = \inv{(\trans{\mX}\mX)} \trans{\vX} \vy 223 | } 224 | 225 | \eci 226 | \end{frame} 227 | 228 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 229 | \begin{frame}{MLE with Features} 230 | 231 | \plitemsep 0.07in 232 | 233 | \bci 234 | 235 | \item Linear regression: Linear in terms of \bluef{the parameters} 236 | \bci 237 | \item $\trans{\phi(\vx)} \vth$ is also fine, where $\phi(\vx)$ can be non-linear (we will cover this later) 238 | \item $\phi(\vx)$ are the features 239 | \eci 240 | 241 | \item Linear regression with the parameter $\vth \in \real^K,$ $\phi(\vx): \realD \mapsto \real^K$: 242 | $$ 243 | p(y \mid \vx) = \set{N}(y \mid \trans{\phi(\vx)} \vth, \sigma^2) \Longleftrightarrow y = \trans{\phi(\vx)} \vth + \epsilon = \sum_{k=0}^{K-1} \theta_k \phi_k(\vx) + \epsilon 244 | $$ 245 | 246 | \item \exam \bluef{Polynomial regression.} For $x\in \real$ and $\vth \in \real^K$, we lift the original 1-D input into $K$-D feature space with monomials $x^k$: 247 | \aleq{ 248 | \phi(x) = \colvec{\phi_0(x) \\ \vdots \\ \phi_{K-1}(x)} = \colvec{1 \\ \vdots \\ x^{K-1}} \in \real^K 249 | \quad \implies \quad f(x) = \sum_{k=0}^{K-1} \theta_k x^k 250 | } 251 | 252 | \eci 253 | \end{frame} 254 | 255 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 256 | \begin{frame}{Feature Matrix and MLE} 257 | 258 | \plitemsep 0.15in 259 | 260 | \bci 261 | 262 | \item Now, for the entire training set $\{\vx_1, \ldots, \vx_N \}$, 263 | \aleq{ 264 | \bm{\Phi} \eqdef \colvec{\trans{\phi}(\vx_1)\\ \vdots \\\trans{\phi}(\vx_N)} 265 | = \begin{nmat} 266 | \phi_0(\vx_1) & \cdots & \phi_{K-1}(\vx_1) \cr 267 | \vdots & \cdots & \vdots \cr 268 | \phi_0(\vx_N) & \cdots & \phi_{K-1}(\vx_N) 269 | \end{nmat} 270 | \in \real^{N \times K}, \ \mPhi_{ij} = \phi_j(\vx_i), \ \phi_j: \realD \mapsto \real 271 | } 272 | \item Negative log-likelihood: Similarly to the case of $\vy = \mX \vth,$ 273 | \mycolorbox 274 | { 275 | \bci 276 | \item $p(\set{Y}| \set{X},\vth) = \set{N}(\vy \mid \mPhi\vth, \sigma^2\mI)$ 277 | \item Negative-log likelihood for $f(\vx) = \trans{\phi}(\vx)\vth + \set{N}(0,\sigma^2)$: 278 | \vspace{-0.1cm} 279 | $$ 280 | -\log p(\cY \mid \cX, \vth) = \dfrac{1}{2\sigma^2} \norm{\vy - \bm{\Phi}\vth}^2 + \text{const} 281 | $$ 282 | \eci 283 | } 284 | 285 | 286 | % $$ 287 | % -\log p(\set{Y} \mid \set{X},\vth) = \dfrac{1}{2\sigma^2} 
\norm{\vy - \bm{\Phi}\vth}^2 + \text{const} 288 | % $$ 289 | 290 | \item MLE: 291 | $ 292 | \vth_\ml = \inv{(\trans{\bm{\Phi}}\bm{\Phi})} \trans{\bm{\Phi}} \vy 293 | $ 294 | \eci 295 | \end{frame} 296 | 297 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 298 | \begin{frame}{Polynomial Fit} 299 | 300 | \plitemsep 0.07in 301 | 302 | \bci 303 | 304 | \item $N=10$ data, where $x_n \sim \set{U}[-5,5]$ and $y_n = -\sin(x_n/5) + \cos(x_n) + \epsilon,$ $\epsilon \sim \set{N}(0,0.2^2)$ 305 | 306 | \item Fit with poloynomial with degree 4 using ML 307 | \eci 308 | 309 | \mypic{0.8}{L9_poly4fit.png} 310 | 311 | \end{frame} 312 | 313 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 314 | \begin{frame}{Overfitting in Linear Regression} 315 | 316 | \myvartwocols{0.55}{0.7}{0.27} 317 | { 318 | \vspace{-0.4cm} 319 | \mypic{0.95}{L9_overfit_linear.png} 320 | } 321 | { 322 | %\vspace{-0.4cm} 323 | \mypic{0.99}{L9_training_test.png} 324 | } 325 | 326 | 327 | \plitemsep 0.04in 328 | 329 | \bci 330 | 331 | \item Higher polynomial degree is better (training error always decreases) 332 | 333 | \item Test error increases after some polynomial degree 334 | \eci 335 | \end{frame} 336 | 337 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 338 | \section{L9(3)} 339 | \begin{frame}{Roadmap} 340 | 341 | \plitemsep 0.1in 342 | 343 | \bce[(1)] 344 | 345 | \item \grayf{Problem Formulation} 346 | \item \grayf{Parameter Estimation: ML} 347 | \item \redf{Parameter Estimation: MAP} 348 | \item \grayf{Bayesian Linear Regression 349 | \item Maximum Likelihood as Orthogonal Projection } 350 | 351 | \ece 352 | \end{frame} 353 | 354 | 355 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 356 | \begin{frame}{MAPE (Maximum A Posteriori Estimation)} 357 | 358 | \plitemsep 0.15in 359 | 360 | \bci 361 | 362 | \item MLE: prone to overfitting, where the magnitude of the parameters becomes large. 
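The overfitting behaviour just described can be reproduced in a few lines. The sketch below (assuming numpy is available; the data-generating function follows the polynomial-fit example above) computes theta_ml for several polynomial degrees via least squares, equal to (Phi^T Phi)^{-1} Phi^T y when Phi has full column rank, and reports training versus test RMSE.

```python
# Polynomial regression by maximum likelihood: training error keeps dropping with
# the degree, while test error eventually grows (overfitting).
import numpy as np

rng = np.random.default_rng(0)
f = lambda x: -np.sin(x / 5.0) + np.cos(x)

def make_data(n):
    x = rng.uniform(-5, 5, size=n)
    return x, f(x) + rng.normal(0.0, 0.2, size=n)

def features(x, degree):                 # Phi_{nk} = x_n^k, k = 0..degree
    return np.vander(x, degree + 1, increasing=True)

x_tr, y_tr = make_data(10)
x_te, y_te = make_data(200)

for degree in (1, 4, 9):
    Phi = features(x_tr, degree)
    # least-squares solution; equals (Phi^T Phi)^{-1} Phi^T y for full column rank
    theta_ml = np.linalg.lstsq(Phi, y_tr, rcond=None)[0]
    rmse = lambda x, y: np.sqrt(np.mean((features(x, degree) @ theta_ml - y) ** 2))
    print(f"degree {degree}: train RMSE {rmse(x_tr, y_tr):.3f}, test RMSE {rmse(x_te, y_te):.3f}")
# Typical outcome: the degree-9 fit nearly interpolates the 10 training points
# (train RMSE close to zero) but its test RMSE is far worse than the degree-4 fit.
```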
363 | \item a prior distribution $p(\vth)$ helps: what $\vth$ is plausible 364 | \item MAPE and Bayes' theorem 365 | \aleq{ 366 | p(\vth \mid \set{X},\set{Y}) = \frac{p(\set{Y} \mid \set{X}, \vth) p(\vth)}{p(\set{Y} \mid \set{X})} 367 | \implies 368 | \vth_\map \in \arg\min_{\vth} \Big(-\log p(\set{Y} \mid \set{X},\vth) - \log p(\vth)\Big) 369 | } 370 | \item Gradient 371 | \aleq{ 372 | - \d{\log p(\vth | \set{X},\set{Y})}{\vth} = - \d{\log p(\set{Y}|\set{X},\vth)}{\vth} - \d{\log p(\vth)}{\vth} 373 | } 374 | 375 | \eci 376 | 377 | \end{frame} 378 | 379 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 380 | \begin{frame}{MAPE for Gausssian Prior (1)} 381 | 382 | \plitemsep 0.07in 383 | 384 | \bci 385 | 386 | \item \exam A (conjugate) Gaussian prior $p(\vth) \sim \set{N}(\vec{0}, b^2 \mI)$ 387 | \bci 388 | \item For Gaussian likelihood, Gaussian prior $\implies$ Gaussian posterior \hfill \lecturemark{L6(6)} 389 | \eci 390 | 391 | \item Negative log-posterior 392 | 393 | \medskip 394 | \mycolorbox 395 | { 396 | Negative-log posterior for $f(\vx) = \trans{\phi}(\vx)\vth + \set{N}(0,\sigma^2)$ and $p(\vth) \sim \set{N}(\vec{0}, b^2 \mI)$: 397 | \vspace{-0.1cm} 398 | $$ 399 | -\log p(\vth | \set{X},\set{Y}) = \frac{1}{2\sigma^2} \trans{(\vy - \mPhi\vth)} (\vy - \mPhi\vth) + \frac{1}{2b^2}\trans{\vth}\vth + \text{const} 400 | $$ 401 | } 402 | \item Gradient 403 | \aleq{ 404 | -\d{\log p(\vth | \set{X},\set{Y})}{\vth} &= \frac{1}{\sigma^2} 405 | (\trans{\vth}\trans{\mPhi}\mPhi - \trans{\vy}\mPhi) + \frac{1}{b^2}\trans{\vth} 406 | } 407 | 408 | \eci 409 | 410 | \end{frame} 411 | 412 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 413 | \begin{frame}{MAPE for Gausssian Prior (2)} 414 | 415 | \plitemsep 0.1in 416 | 417 | \bci 418 | 419 | \item MAP vs. ML 420 | $$ 421 | \vth_\map = \inv{ 422 | \underbrace{\Big(\trans{\mPhi} \mPhi + \bluef{\frac{\sigma^2}{b^2}\mI} \Big)}_{(*)} 423 | } \trans{\mPhi} \vy, \quad \vth_\ml = \inv{(\trans{\bm{\Phi}}\bm{\Phi})} \trans{\bm{\Phi}} \vy 424 | $$ 425 | 426 | \item The term $\bluef{\dfrac{\sigma^2}{b^2}\mI}$ 427 | \bci 428 | \item Ensures that $(*)$ is symmetric, strictly positive definite 429 | \item Role of regularizer 430 | \eci 431 | \eci 432 | 433 | \end{frame} 434 | 435 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 436 | \begin{frame}{Aside: MAPE for General Gausssian Prior (3)} 437 | 438 | \plitemsep 0.07in 439 | 440 | \bci 441 | 442 | \item \exam A (conjugate) Gaussian prior $p(\vth) \sim \bluef{\set{N}(\vm_0, \mS_0)}$ 443 | 444 | \item Negative log-posterior 445 | 446 | \medskip 447 | 448 | \begin{tcolorbox}[width=14cm,colback=red!5!white,colframe=red!75!black] 449 | Negative-log posterior for $f(\vx) = \trans{\phi}(\vx)\vth + \set{N}(0,\sigma^2)$ and $p(\vth) \sim \set{N}(\vm_0, \mS_0)$: 450 | \vspace{-0.1cm} 451 | \aleq{ 452 | -\log p(\vth | \set{X},\set{Y}) &= \frac{1}{2\sigma^2} \trans{(\vy - \mPhi\vth)} (\vy - \mPhi\vth) + \bluef{\frac{1}{2}\trans{(\vth - \vm_0)}\inv{\mS}_0(\vth-\vm_0)} + \text{const} 453 | } 454 | \end{tcolorbox} 455 | 456 | \item We will use this later for computing the parameter posterior distribution in Bayesian linear regression. 457 | \eci 458 | 459 | \end{frame} 460 | 461 | 462 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 463 | \begin{frame}{Regularization: MAPE vs. 
Explicit Regularizer} 464 | 465 | 466 | \plitemsep 0.1in 467 | 468 | 469 | \bci 470 | 471 | \item \bluef{Explicit regularizer} in regularized least squares (RLS) 472 | $$ 473 | \norm{\vy - \mPhi\vth}^2 + \lambda \norm{\vth}^2 474 | $$ 475 | 476 | \item \bluef{MAPE wth Gaussian prior} $p(\vth) \sim \set{N}(\vec{0},b^2 \mI)$ 477 | \bci 478 | \item Negative log-Gaussian prior 479 | $$ 480 | -\log p(\vth) = \frac{1}{2b^2}\trans{\vth}\vth + \text{const} 481 | $$ 482 | \item $\lambda = 1/2b^2$ is the regularization term 483 | \eci 484 | 485 | \item Not surprising that we have 486 | $$ 487 | \vth_{\text{RLS}} = \inv{ 488 | \Big(\trans{\mPhi} \mPhi + \bluef{\lambda \mI} \Big) 489 | } \trans{\mPhi} \vy 490 | $$ 491 | \eci 492 | 493 | \end{frame} 494 | 495 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 496 | \section{L9(4)} 497 | \begin{frame}{Roadmap} 498 | 499 | \plitemsep 0.1in 500 | 501 | \bce[(1)] 502 | 503 | \item \grayf{Problem Formulation} 504 | \item \grayf{Parameter Estimation: ML} 505 | \item \grayf{Parameter Estimation: MAP} 506 | \item \redf{Bayesian Linear Regression} 507 | \item \grayf{Maximum Likelihood as Orthogonal Projection } 508 | 509 | \ece 510 | \end{frame} 511 | 512 | 513 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 514 | \begin{frame}{Bayesian Linear Regression} 515 | 516 | 517 | \plitemsep 0.05in 518 | 519 | 520 | \bci 521 | 522 | \item Earlier, ML and MAP. Now, \bluef{fully Bayesian} \hfill \lecturemark{L8(4)} 523 | 524 | \item Model 525 | \mysmalltwocols{0.25} 526 | { 527 | \vspace{-0.2cm} 528 | \aleq{ 529 | \text{prior} \quad & p(\vth) \sim \set{N}(\vm_0, \mS_0) \cr 530 | \text{likelihood} \quad& p(y | \vx, \vth) \sim \set{N}\big(y \mid \trans{\phi}(\vx)\vth,\sigma^2 \big)\cr 531 | \text{joint} \quad & p(y,\vth | \vx) = p(y \mid \vx,\vth) p(\vth) 532 | } 533 | } 534 | { 535 | \vspace{-0.2cm} 536 | \mypic{0.4}{L9_bayesian_regression.png} 537 | } 538 | \item Goal: For an input $\vx_*,$ we want to compute the following \bluef{posterior predictive distribution}\footnote{\lecturemark{Chapter 9.3.4} For ease of understanding, I've slightly changed the organization of these lecture slides from that of the textbook.} of $y_*$: 539 | \vspace{-0.3cm} 540 | $$ 541 | \displaystyle 542 | p(y_* | x_*, \set{X},\set{Y}) = \int \overbrace{p(y_* | \vx_*, \vth)}^{\text{likelihood}} 543 | \overbrace{p(\vth | \set{X},\set{Y})}^{(*)}\text{d}\vth 544 | $$ 545 | \vspace{-0.3cm} 546 | \bci 547 | \item $(*)$: parameter posterior distribution that needs to be computed 548 | \eci 549 | \eci 550 | 551 | \end{frame} 552 | 553 | % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 554 | % \begin{frame}{Prior Predictions} 555 | 556 | 557 | % \plitemsep 0.07in 558 | 559 | % \bci 560 | 561 | % \item Fully Bayesian: Predictions by taking the parameter distribution and average over all plausible parameter setting. For an input $\vx_*$, 562 | % $$ 563 | % p(y_* | \vx_*) = \int p(y,\vth | \vx) \text{d}\vth 564 | % = \int p(y_* | \vx_*, \vth) p(\vth) \text{d}\vth = \bexpecti{\vth}{p(y_*|\vx_*, \vth)} 565 | % $$ 566 | 567 | % \item This prediction based on \bluef{prior distribution} requires only input, not depending on the training data. 568 | 569 | % \item Later, we will discuss \bluef{posterior prediction} which uses the modified predictive distribution based on the training data. 
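Below is a sketch (numpy assumed; sigma, b, and the random feature matrix are hypothetical choices) of the closed-form MAP estimate above, theta_map = (Phi^T Phi + (sigma^2/b^2) I)^{-1} Phi^T y, showing how the prior width b controls shrinkage toward zero relative to the ML estimate.

```python
# MAP vs ML for a Gaussian prior N(0, b^2 I): the extra diagonal term acts as a regularizer.
import numpy as np

rng = np.random.default_rng(0)
Phi = rng.normal(size=(20, 5))                  # some feature matrix (20 samples, 5 features)
y = rng.normal(size=20)
sigma = 0.2

theta_ml = np.linalg.lstsq(Phi, y, rcond=None)[0]
print("||theta_ml ||           =", round(float(np.linalg.norm(theta_ml)), 3))
for b in (10.0, 1.0, 0.05):                     # broad prior ~ ML; tight prior -> strong shrinkage
    A = Phi.T @ Phi + (sigma ** 2 / b ** 2) * np.eye(Phi.shape[1])
    theta_map = np.linalg.solve(A, Phi.T @ y)
    print(f"||theta_map|| (b={b:5.2f}) =", round(float(np.linalg.norm(theta_map)), 3))
# The norm of theta_map shrinks as b decreases (stronger prior) and approaches
# ||theta_ml|| as b grows, which is exactly the regularizing role described above.
```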
570 | 571 | % \item \bluef{Prior predictive distribution} 572 | % $$ 573 | % \redf{p(y_* | x_*) = \set{N}\Big(\trans{\phi}(\vx_*)\vm_0, \trans{\phi}(\vx_*)\mS_0 \phi(\vx_*)+ 574 | % \sigma^2 \Big)} 575 | % $$ 576 | 577 | % \eci 578 | 579 | % \end{frame} 580 | 581 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 582 | \begin{frame}{Parameter Posterior Distribution (1)} 583 | 584 | \plitemsep 0.07in 585 | 586 | \bci 587 | 588 | \item \bluef{Parameter posterior distribution}\hfill \lecturemark{Chapter 9.3.3} 589 | \mycolorbox{ 590 | $$ 591 | \redf{p(\vth \mid \set{X},\set{Y}) = \set{N}(\vth \mid \vm_N,\mS_N)}, \quad \text{where} 592 | $$ 593 | $$ 594 | \redf{ 595 | \mS_N = \inv{\big(\inv{\mS}_0 + \sigma^2 \trans{\mPhi}\mPhi \big)}, \quad \vm_N = \mS_N 596 | \big(\inv{\mS}_0\vm_0 + \sigma^{-2}\trans{\mPhi}\vy \big)} 597 | $$ 598 | } 599 | \eci 600 | (Proof Sketch) 601 | \small 602 | \bci 603 | 604 | \item From the negative-log posterior for general Gaussian prior, 605 | \aleq{ 606 | -\log p(\vth | \set{X},\set{Y}) &= \frac{1}{2\sigma^2} \trans{(\vy - \mPhi\vth)} (\vy - \mPhi\vth) + \bluef{\frac{1}{2}\trans{(\vth - \vm_0)}\inv{\mS}_0(\vth-\vm_0)} + \text{const} 607 | } 608 | 609 | \eci 610 | \end{frame} 611 | 612 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 613 | \begin{frame}{Parameter Posterior Distribution (2)} 614 | 615 | \plitemsep 0.07in 616 | \bci 617 | \small 618 | \item[] 619 | \aleq{ 620 | &= \frac{1}{2} \Big( 621 | \sigma^{-2}\trans{\vy}\vy - \orangef{2\sigma^{-2}\trans{\vy}\mPhi\vth} + \cyanf{\trans{\vth}\sigma^{-2} 622 | \trans{\mPhi}\mPhi\vth} + \cyanf{\trans{\vth}\inv{\mS}_0\vth} -\orangef{2\trans{\vm}_0\inv{\mS}_0\vth} 623 | +\trans{\vm}_0\inv{\mS}_0\vm_0 624 | \Big ) \cr 625 | &=\frac{1}{2} \Big( 626 | \cyanf{\trans{\vth}(\sigma^{-2}\trans{\mPhi}\mPhi + \inv{\mS}_0)\vth} 627 | -\orangef{2\trans{(\sigma^{-2}\trans{\mPhi}\vy + \inv{\mS}_0\vm_0)}\vth} 628 | \Big) + \text{const} 629 | } 630 | \item \cyanf{cyan color}: quadratic term, \orangef{orange color}: linear term 631 | 632 | \item $p(\vth|\cX,\cY) \propto \exp(\text{ quadratic in $\vth$ })$ $\implies$ Gaussian distribution 633 | \item Assume that $p(\vth|\cX,\cY) = \set{N}(\vth|\vm_N, \mS_N),$ and find $\vm_N$ and $\mS_N.$ 634 | \aleq{ 635 | - \log\set{N}(\vth|\vm_N,\mS_N) &= \frac{1}{2}\trans{(\vth-\vm_N)}\inv{\mS}_N(\vth-\vm_N) + \text{const} \cr 636 | &= \frac{1}{2}\Big( 637 | \cyanf{\trans{\vth}\inv{\mS}_N\vth} - \orangef{2\trans{\vm_N}\inv{\mS}_N\vth} + \trans{\vm}_N\inv{\mS}_N\vm_N 638 | \Big) + \text{const} 639 | } 640 | \item Thus, 641 | $ 642 | \cyanf{\inv{\mS}_N = \sigma^{-2}\trans{\mPhi}\mPhi + \inv{\mS}_0} \quad \text{and} \quad 643 | \orangef{ 644 | \trans{\vm}_N\inv{\mS}_N = \trans{(\sigma^{-2}\trans{\mPhi}\vy + \inv{\mS}_0\vm_0)} 645 | } 646 | $ 647 | \eci 648 | 649 | \end{frame} 650 | 651 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 652 | \begin{frame}{Posterior Predictions (1)} 653 | 654 | \plitemsep 0.07in 655 | 656 | \bci 657 | 658 | \item \bluef{Posterior predictive distribution} \hfill \lecturemark{L6(5)} 659 | \red{ 660 | \aleq{ 661 | p(y_* | x_*, \set{X},\set{Y}) &= \int p(y_* | \vx_*, \vth)p(\vth | \set{X},\set{Y})\text{d}\vth \cr 662 | &= \int \set{N}\Big(y_* | \trans{\phi}(\vx_*)\vth, \sigma^2\Big) \set{N}\Big(\vth | \vm_N, \mS_N\Big)\text{d}\vth \cr 663 | &= \set{N}\Big(y_* | \trans{\phi}(\vx_*)\vm_N, \trans{\phi}(\vx_*)\mS_N \phi(\vx_*)+ 664 | \sigma^2 \Big) 665 | }} 666 | 667 | \item The mean $\trans{\phi}(\vx_*)\vm_N$ coincides with the MAP estimate 668 | \eci 669 
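A sketch of Bayesian linear regression following the formulas above (numpy assumed; the prior parameters, noise level, and test input are my own choices). It uses the precision form S_N^{-1} = sigma^{-2} Phi^T Phi + S_0^{-1} from the proof sketch, and evaluates the posterior predictive N(phi(x*)^T m_N, phi(x*)^T S_N phi(x*) + sigma^2).

```python
# Parameter posterior N(m_N, S_N) and posterior predictive for Bayesian linear regression.
import numpy as np

def blr_posterior(Phi, y, m0, S0, sigma):
    S0_inv = np.linalg.inv(S0)
    SN = np.linalg.inv(S0_inv + (Phi.T @ Phi) / sigma ** 2)
    mN = SN @ (S0_inv @ m0 + (Phi.T @ y) / sigma ** 2)
    return mN, SN

def blr_predict(phi_star, mN, SN, sigma):
    return phi_star @ mN, phi_star @ SN @ phi_star + sigma ** 2

# Tiny example: degree-2 polynomial features, zero-mean isotropic Gaussian prior.
rng = np.random.default_rng(1)
x = rng.uniform(-3, 3, size=15)
y = 0.5 * x ** 2 - x + rng.normal(0, 0.3, size=15)
Phi = np.vander(x, 3, increasing=True)          # columns [1, x, x^2]
mN, SN = blr_posterior(Phi, y, m0=np.zeros(3), S0=4.0 * np.eye(3), sigma=0.3)

phi_star = np.array([1.0, 2.0, 4.0])            # features of x* = 2
mean, var = blr_predict(phi_star, mN, SN, 0.3)
print("posterior mean weights:", np.round(mN, 3))     # m_N is also the MAP estimate here
print(f"predictive at x*=2: mean {mean:.3f}, std {np.sqrt(var):.3f}")
```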
| \end{frame} 670 | 671 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 672 | \begin{frame}{Posterior Predictions (2)} 673 | 674 | \mypic{0.95}{L9_posterior_predictive_ex.png} 675 | 676 | \bci 677 | \item BLR: Bayesian Linear Regression 678 | \eci 679 | \end{frame} 680 | 681 | 682 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 683 | \begin{frame}{Computing Marginal Likelihood} 684 | 685 | \plitemsep 0.07in 686 | 687 | \bci 688 | 689 | \item Likelihood: $p(\cY | \cX, \vth),$ \bluef{Marginal likelihood}: $p(\cY | \cX) = \int p(\cY | \cX,\vth)p(\vth)\text{d}\vth$ 690 | 691 | \item Recall that the marginal likelihood is important for model selection via Bayes factor: 692 | $$ 693 | \text{(Posterior odds)} = \frac{\cprob{M_1 \mid \set{D}}}{\cprob{M_2 \mid \set{D}}} = \frac 694 | { 695 | \frac{\cprob{\set{D} \mid M_1}\cprob{M_1}}{\cprob{\set{D}}} 696 | } 697 | { 698 | \frac{\cprob{\set{D} \mid M_2}\cprob{M_2}}{\cprob{\set{D}}} 699 | } 700 | = \underbrace{\frac{\cprob{M_1}}{\cprob{M_2}}}_{\text{Prior odds}} 701 | \underbrace{\frac{\cprob{\set{D} \mid M_1}}{\cprob{\set{D} \mid M_2}}}_{\bluef{\text{Bayes factor}}} 702 | $$ 703 | 704 | \item[] 705 | \aleq{ 706 | p(\cY | \cX) &= \int p(\cY | \cX,\vth)p(\vth)\text{d}\vth = \int \set{N}(\vy | \mPhi\vth, \sigma^2\mI) 707 | \set{N}(\vth | \vm_0,\mS_0)\, \text{d}\vth \cr 708 | & = \set{N}(\vy \mid \mPhi\vm_0, \mPhi\mS_0\trans{\mPhi} + \sigma^2\mI) 709 | } 710 | \eci 711 | \end{frame} 712 | 713 | 714 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 715 | \section{L9(5)} 716 | \begin{frame}{Roadmap} 717 | 718 | \plitemsep 0.1in 719 | 720 | \bce[(1)] 721 | 722 | \item \grayf{Problem Formulation} 723 | \item \grayf{Parameter Estimation: ML} 724 | \item \grayf{Parameter Estimation: MAP} 725 | \item \grayf{Bayesian Linear Regression} 726 | \item \redf{Maximum Likelihood as Orthogonal Projection} 727 | 728 | \ece 729 | \end{frame} 730 | 731 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 732 | \begin{frame}{ML as Orthogonal Projection} 733 | 734 | \plitemsep 0.07in 735 | 736 | \bci 737 | 738 | \item For $f(\vx) = \trans{\vx}\vth + \set{N}(0,\sigma^2),$ $\vth_\ml = \inv{(\trans{\mX}\mX)} \trans{\vX} \vy = \dfrac{\trans{\vX}\vy}{\trans{\mX}\mX} \in \real$ 739 | $$ 740 | \mX\vth_\ml = \dfrac{\vX\trans{\vX}}{\trans{\mX}\mX}\vy 741 | $$ 742 | \vspace{-0.3cm} 743 | \bci 744 | \item Orthogonal projection of $\vy$ onto the one-dimensional subspace spanned by $\mX$ 745 | \eci 746 | 747 | 748 | \item For $f(\vx) = \trans{\phi}(\vx)\vth + \set{N}(0,\sigma^2),$ $\vth_\ml = \inv{(\trans{\mPhi}\mPhi)} \trans{\mPhi} \vy = \dfrac{\trans{\mPhi}\vy}{\trans{\mPhi}\mPhi} \in \real$ 749 | $$ 750 | \mPhi\vth_\ml = \dfrac{\mPhi\trans{\mPhi}}{\trans{\mPhi}\mPhi}\vy 751 | $$ 752 | \vspace{-0.3cm} 753 | \bci 754 | \item Orthogonal projection of $\vy$ onto the $K$-dimensional subspace spanned by columns of $\mPhi$ 755 | \eci 756 | \eci 757 | \end{frame} 758 | 759 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 760 | \begin{frame}{Summary and Other Issues (1)} 761 | 762 | \plitemsep 0.07in 763 | 764 | \bci 765 | 766 | \item Linear regression for Gaussian likelihood and conjugate Gaussian priors. 
Nice analytical results and closed forms 767 | 768 | \item Other forms of likelihoods for other applications (e.g., classification) 769 | 770 | \item GLM (generalized linear model): $y = \sigma \circ f$ ($\sigma$: activation function) 771 | \bci 772 | \item No longer linear in $\vth$ 773 | \item Logistic regression: $\sigma(f) = \dfrac{1}{1+\exp(-f)} \in [0,1]$ (interpreted as the probability of becoming 1) 774 | \item Building blocks of (deep) feedforward neural nets 775 | 776 | \item $\vy = \sigma(\mA \vx + \vb)$. $\mA$: weight matrix, $\vb$: bias vector 777 | \item $K$-layer deep neural nets: $\vx_{k+1} = f_k(\vx_k),$ $f_k(\vx_k) = \sigma_k(\mA_k\vx_k + \vb_k)$ 778 | \eci 779 | 780 | \eci 781 | \end{frame} 782 | 783 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 784 | \begin{frame}{Summary and Other Issues (2)} 785 | 786 | \plitemsep 0.1in 787 | 788 | \bci 789 | 790 | \item Gaussian process 791 | \bci 792 | \item A distribution over parameters $\rightarrow$ a distribution over functions 793 | \item Gaussian process: distribution over functions without detouring via parameters 794 | \item Closely related to BLR and support vector regression, also interpreted as Bayesian neural network with a single hidden layer and the infinite number of units 795 | \eci 796 | 797 | \item Gaussian likelihood, but non-Gaussian prior 798 | \bci 799 | \item When $N \ll D$ (small training data) 800 | \item Prior that enforces sparsity, e.g., Laplace prior 801 | \item A linear regression with the Laplace prior $=$ linear regression with LASSO (L1 regularization) 802 | \eci 803 | \eci 804 | \end{frame} 805 | 806 | 807 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 808 | \begin{frame}{} 809 | \vspace{2cm} 810 | \LARGE Questions? 811 | 812 | 813 | \end{frame} 814 | 815 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 816 | \begin{frame}{Review Questions} 817 | % \tableofcontents 818 | %\plitemsep 0.1in 819 | \bce[1)] 820 | \item 821 | 822 | \ece 823 | \end{frame} 824 | 825 | 826 | \end{document} 827 | -------------------------------------------------------------------------------- /10.PCA/10.PCA-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/10.PCA/10.PCA-2.pdf -------------------------------------------------------------------------------- /10.PCA/10.PCA-4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/10.PCA/10.PCA-4.pdf -------------------------------------------------------------------------------- /10.PCA/10.PCA.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/10.PCA/10.PCA.pdf -------------------------------------------------------------------------------- /10.PCA/L10_PCA_onepicture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/10.PCA/L10_PCA_onepicture.png -------------------------------------------------------------------------------- /10.PCA/L10_dr_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/10.PCA/L10_dr_ex.png 
-------------------------------------------------------------------------------- /10.PCA/L10_latent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/10.PCA/L10_latent.png -------------------------------------------------------------------------------- /10.PCA/L10_mnist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/10.PCA/L10_mnist.png -------------------------------------------------------------------------------- /10.PCA/L10_pca_algorithm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/10.PCA/L10_pca_algorithm.png -------------------------------------------------------------------------------- /10.PCA/L10_pca_picture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/10.PCA/L10_pca_picture.png -------------------------------------------------------------------------------- /10.PCA/L10_variance_diff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/10.PCA/L10_variance_diff.png -------------------------------------------------------------------------------- /11.DensityEstimation/11.GMM-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/11.DensityEstimation/11.GMM-2.pdf -------------------------------------------------------------------------------- /11.DensityEstimation/11.GMM-4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/11.DensityEstimation/11.GMM-4.pdf -------------------------------------------------------------------------------- /11.DensityEstimation/11.GMM.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/11.DensityEstimation/11.GMM.pdf -------------------------------------------------------------------------------- /11.DensityEstimation/L11_Gaussian_fail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/11.DensityEstimation/L11_Gaussian_fail.png -------------------------------------------------------------------------------- /11.DensityEstimation/L11_em_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/11.DensityEstimation/L11_em_ex.png -------------------------------------------------------------------------------- /11.DensityEstimation/L11_gm_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/11.DensityEstimation/L11_gm_ex.png 
-------------------------------------------------------------------------------- /11.DensityEstimation/L11_gmm_gm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/11.DensityEstimation/L11_gmm_gm.png -------------------------------------------------------------------------------- /11.DensityEstimation/main.tex: -------------------------------------------------------------------------------- 1 | %\pdfminorversion=4 2 | \documentclass[handout,fleqn,aspectratio=169]{beamer} 3 | 4 | \input{../myhead} 5 | 6 | \title[]{Lecture 11: Density Estimation \\with Gaussian Mixture Models} 7 | \author{Yi, Yung (이융)} 8 | \institute{Mathematics for Machine Learning\\ \url{https://yung-web.github.io/home/courses/mathml.html} 9 | \\KAIST EE} 10 | \date{\today} 11 | 12 | 13 | \input{../mymath} 14 | \input{../mymacro} 15 | 16 | 17 | %\addtobeamertemplate{footline}{\rule{0.94\paperwidth}{1pt}}{} 18 | 19 | \begin{document} 20 | 21 | \input{../mydefault} 22 | 23 | 24 | 25 | % START START START START START START START START START START START START START 26 | 27 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 28 | \begin{frame}{Warm-Up} 29 | 30 | {\Large Please watch this tutorial video by Luis Serrano on Gaussian Mixture Model.} 31 | 32 | \bigskip 33 | 34 | \bigskip 35 | 36 | \url{https://www.youtube.com/watch?v=q71Niz856KE} 37 | 38 | \end{frame} 39 | 40 | 41 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 42 | \begin{frame}{Roadmap} 43 | 44 | \plitemsep 0.1in 45 | 46 | \bce[(1)] 47 | 48 | \item Gaussian Mixture Model 49 | \item Parameter Learning: MLE 50 | \item Latent-Variable Perspective for Probabilistic Modeling 51 | \item EM Algorithm 52 | \ece 53 | \end{frame} 54 | 55 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 56 | \section{L11(1)} 57 | \begin{frame}{Roadmap} 58 | 59 | \plitemsep 0.1in 60 | 61 | \bce[(1)] 62 | 63 | \item \redf{Gaussian Mixture Model} 64 | \item \grayf{Parameter Learning: MLE 65 | \item Latent-Variable Perspective for Probabilistic Modeling 66 | \item EM Algorithm} 67 | \ece 68 | \end{frame} 69 | 70 | 71 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 72 | \begin{frame}{Density Estimation} 73 | 74 | \plitemsep 0.1in 75 | 76 | \bci 77 | 78 | \item Represent data compactly using a density from a parametric family, e.g., Gaussian or Beta distribution 79 | 80 | \item Parameters of those families can be found by MLE and MAPE 81 | 82 | \item However, there are many cases when simple distributions (e.g., just Gaussian) fail to approximate data. 83 | 84 | \mypic{0.4}{L11_Gaussian_fail.png} 85 | 86 | \eci 87 | \end{frame} 88 | 89 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 90 | \begin{frame}{Mixture Models} 91 | 92 | \plitemsep 0.1in 93 | 94 | \bci 95 | 96 | \item More expressive family of distribution 97 | 98 | \item Idea: Let's mix! 
A \bluef{convex combination} of $K$ ``base'' distributions 99 | \mycolorbox{ 100 | \vspace{-0.2cm} 101 | \aleq{ 102 | p(\vx) = \sum_{k=1}^K \pi_k p_k(\vx), \quad 0 \le \pi_k \le 1, \quad \sum_{k=1}^K \pi_k = 1} 103 | } 104 | \item Multi-modal distributions: Can be used to describe datasets with multiple clusters 105 | 106 | \item Our focus: Gaussian mixture models 107 | 108 | \item Want to finding the parameters using MLE, but \bluef{cannot have the closed form} solution (even with the mixture of Gaussians) $\rightarrow$ some iterative methods needed 109 | \eci 110 | \end{frame} 111 | 112 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 113 | \begin{frame}{Gaussian Mixture Model} 114 | 115 | \mycolorbox{ 116 | \vspace{-0.2cm} 117 | \aleq{ 118 | p(\vx | \vth) = \sum_{k=1}^K \set{N}(\vx | \vmu_k, \msig_k), \quad 0 \le \pi_k \le 1, \quad \sum_{k=1}^K \pi_k = 1, 119 | } 120 | where the parameters $\vth \eqdef \{\vmu_k, \msig_k, \pi_k: k= 1, \ldots, K \}$ 121 | } 122 | 123 | 124 | \vspace{-0.3cm} 125 | \plitemsep 0.01in 126 | 127 | \bci 128 | \item \exam $p(x|\vth) = \bluef{0.5\set{N}(x|-2,1/2)} + \orangef{0.2\set{N}(x|1,2)} + \greenf{0.3\set{N}(x|4,1)}$ 129 | 130 | \mypic{0.4}{L11_gm_ex.png} 131 | \eci 132 | \end{frame} 133 | 134 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 135 | \section{L11(2)} 136 | \begin{frame}{Roadmap} 137 | 138 | \plitemsep 0.1in 139 | 140 | \bce[(1)] 141 | 142 | \item \grayf{Gaussian Mixture Model} 143 | \item \redf{Parameter Learning: MLE} 144 | \item \grayf{Latent-Variable Perspective for Probabilistic Modeling 145 | \item EM Algorithm} 146 | \ece 147 | \end{frame} 148 | 149 | 150 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 151 | \begin{frame}{Parameter Learning: Maximum Likelihood} 152 | 153 | \plitemsep 0.1in 154 | 155 | \bci 156 | \item Given a iid dataset $\set{X}= \{\vx_1, \ldots, \vx_n \},$ the log-likelihood is: 157 | \aleq{ 158 | \cL(\vth) = \log p(\set{X} | \vth) = \sum_{n=1}^N \log p(\vx_n|\vth) = \sum_{n=1}^N \log \sum_{k=1}^K \pi_k 159 | \cN(\vx_n | \vmu_k,\msig_k) 160 | } 161 | 162 | \item $\vth_{\ml} = \arg \min_{\vth} (-\cL(\vth))$ 163 | \item Necessary condition for $\vth_\ml$: $\dfrac{d\cL}{d\vth}\Big|_{\vth_\ml} = 0$ 164 | 165 | \item However, the closed-form solution of $\vth_\ml$ does not exist, so we rely on an iterative algorithm (also called EM algorithm). 166 | 167 | \item We show the algorithm first, and then discuss how we get the algorithm. 168 | \eci 169 | \end{frame} 170 | 171 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 172 | \begin{frame}{Responsibilities} 173 | 174 | \plitemsep 0.1in 175 | 176 | \bci 177 | \item \defi \bluef{Responsibilities.} Given $n$-th data point $\vx_n$ and the parameters $(\vmu_k, \msig_k, \pi_k: k=1, \ldots, K)$, 178 | $$ 179 | r_{nk}= \frac{\pi_k \cN(\vx_n | \vmu_k, \msig_k)}{ \sum_{j}\pi_j \cN(\vx_n | \vmu_j, \msig_j) } 180 | $$ 181 | 182 | \item How much is each component $k$ responsible, if the data $\vx_n$ is sampled from the current mixture model? 183 | 184 | \item $\vec{r}_n = (r_{nk}: k=1, \ldots, K)$ is a probability distribution, so $\sum_{k=1}^K r_{nk} =1$ 185 | 186 | \item Soft assignment of $\vx_n$ to the $K$ mixture components 187 | \eci 188 | \end{frame} 189 | 190 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 191 | \begin{frame}{EM Algorithm: MLE in Gaussian Mixture Models} 192 | 193 | \small 194 | \myblock{EM for MLE in Gaussian Mixture Models} 195 | { 196 | \bce[\red \bf S1.] 
197 | \item Initialize $\vmu_k, \msig_k, \pi_k$ 198 | 199 | \item \bluef{\bf E-step:} Evaluate responsibilities $r_{nk}$ for every data point $\vx_n$ using the current $\vmu_k, \msig_k, \pi_k$: 200 | $$ 201 | \greenf{r_{nk}}= \frac{\pi_k \cN(\vx_n | \vmu_k, \msig_k)}{ \sum_{j}\pi_j \cN(\vx_n | \vmu_j, \msig_j) }, \quad \greenf{N_k} = \sum_{n=1}^N \greenf{r_{nk}} 202 | $$ 203 | 204 | \item \bluef{\bf M-step:} Re-estimate parameters $\vmu_k, \msig_k, \pi_k$ using the current 205 | responsibilities $r_{nk}$: 206 | \aleq{ 207 | \orangef{\vmu_k} = \frac{1}{\greenf{N_k}} \sum_{n=1}^N \greenf{r_{nk}} \vx_n, \ \orangef{\msig_k} = \frac{1}{\greenf{N_k}} 208 | \sum_{n=1}^N \greenf{r_{nk}} (\vx_n - \vmu_k)\trans{(\vx_n - \vmu_k)}, \ \orangef{\pi_k} = \frac{\greenf{N_k}}{N}, 209 | } 210 | and go to \redf{\bf S2.} 211 | \ece 212 | } 213 | \vspace{-0.3cm} 214 | - The update equations in the \bluef{\bf M-step} may look mysterious at this point; they will be derived later. 215 | 216 | % \vspace{-0.5cm} 217 | % \plitemsep 0.1in 218 | % \bci 219 | % \item 220 | % \eci 221 | \end{frame} 222 | 223 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 224 | \begin{frame}{Example: EM Algorithm} 225 | 226 | \mypic{0.7}{L11_em_ex.png} 227 | \end{frame} 228 | 229 | 230 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 231 | \begin{frame}{M-Step: Towards the Zero Gradient} 232 | 233 | \plitemsep 0.07in 234 | 235 | \bci 236 | \item Given $\cX$ and $r_{nk}$ from the E-step, the new updates of $\vmu_k$, $\msig_k$, $\pi_k$ should be made such that the following conditions are satisfied: 237 | \aleq{ 238 | \pd{\cL}{\vmu_k} &= \trans{\vec{0}} \Longleftrightarrow \sum_{n=1}^N \pd{\log p(\vx_n | \vth)}{\vmu_k} = \trans{\vec{0}} \cr 239 | \pd{\cL}{\msig_k} &= \vec{0} \Longleftrightarrow \sum_{n=1}^N \pd{\log p(\vx_n | \vth)}{\msig_k} = \vec{0} \cr\pd{\cL}{\pi_k} &= 0 \Longleftrightarrow \sum_{n=1}^N \pd{\log p(\vx_n | \vth)}{\pi_k} = 0 240 | } 241 | 242 | \item Nice thing: the new updates of $\vmu_k$, $\msig_k$, $\pi_k$ are all expressed in terms of the responsibilities $[r_{nk}]$ 243 | 244 | \item Let's take a look at them one by one!
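% Sketch of the first case (kept as a comment), to make the zero-gradient step above concrete.
% It only uses the chain rule and the definition of $r_{nk}$; the $\msig_k$ and $\pi_k$ cases
% follow the same pattern ($\pi_k$ additionally needs a Lagrange multiplier for $\sum_k \pi_k = 1$).
% \aleq{
% \pd{\log p(\vx_n|\vth)}{\vmu_k}
% = \frac{\pi_k \cN(\vx_n|\vmu_k,\msig_k)}{\sum_j \pi_j \cN(\vx_n|\vmu_j,\msig_j)}
%   \trans{(\vx_n - \vmu_k)}\inv{\msig_k}
% = r_{nk} \trans{(\vx_n - \vmu_k)}\inv{\msig_k},
% }
% so $\sum_n \pd{\log p(\vx_n|\vth)}{\vmu_k} = \trans{\vec{0}}$ gives
% $\sum_n r_{nk}(\vx_n - \vmu_k) = \vec{0}$, i.e., $\vmu_k = \frac{1}{N_k}\sum_n r_{nk}\vx_n$.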
245 | \eci 246 | \end{frame} 247 | 248 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 249 | \begin{frame}{M-Step: Update of $\vmu_k$} 250 | \mycolorbox{ 251 | $$ 252 | \vmu_k^{\text{new}} = \frac{\sum_{n=1}^N r_{nk} \vx_n}{\sum_{n=1}^N r_{nk}}, k=1,\ldots, K 253 | $$ 254 | } 255 | 256 | \plitemsep 0.07in 257 | \bci 258 | \item 259 | \eci 260 | \end{frame} 261 | 262 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 263 | \begin{frame}{M-Step: Update of $\msig_k$} 264 | \mycolorbox{ 265 | $$ 266 | \msig_k^{\text{new}} = \frac{1}{N_k} 267 | \sum_{n=1}^N r_{nk} (\vx_n - \vmu_k)\trans{(\vx_n - \vmu_k)}, k=1,\ldots, K 268 | $$ 269 | } 270 | 271 | \plitemsep 0.07in 272 | \bci 273 | \item 274 | \eci 275 | \end{frame} 276 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 277 | \begin{frame}{M-Step: Update of $\pi_k$} 278 | \mycolorbox{ 279 | $$ 280 | \pi_k^{\text{new}} = \frac{\sum_{n=1}^N r_{nk}}{N}, k=1,\ldots, K 281 | $$ 282 | } 283 | 284 | \plitemsep 0.07in 285 | \bci 286 | \item 287 | \eci 288 | \end{frame} 289 | 290 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 291 | \section{L11(3)} 292 | \begin{frame}{Roadmap} 293 | 294 | \plitemsep 0.1in 295 | 296 | \bce[(1)] 297 | 298 | \item \grayf{Gaussian Mixture Model} 299 | \item \grayf{Parameter Learning: MLE} 300 | \item \redf{Latent-Variable Perspective for Probabilistic Modeling} 301 | \item \grayf{EM Algorithm} 302 | \ece 303 | \end{frame} 304 | 305 | 306 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 307 | \begin{frame}{Latent-Variable Perspective} 308 | 309 | \plitemsep 0.07in 310 | \bci 311 | \item Justify some ad hoc decisions made earlier 312 | \item Allow for a concrete interpretation of the responsibilities as \bluef{posterior distributions} 313 | \item Iterative algorithm for updating the model parameters can be derived in a principled manner 314 | \eci 315 | \vspace{-0.9cm} 316 | \mypic{0.3}{L11_gmm_gm.png} 317 | 318 | \end{frame} 319 | 320 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 321 | \begin{frame}{Generative Process} 322 | 323 | \plitemsep 0.07in 324 | \bci 325 | \item \redf{Latent variable $\vz$}: \bluef{One-hot encoding} random vector $\vz = \trans{[z_1, \ldots, z_K]}$ consisting of $K-1$ many 0s and exactly one 1. 326 | 327 | \item An indicator rv $z_k=1$ represents whether \bluef{$k$-th component is used to generate the data sample} $\vx$ or not. 
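% Because $\vz$ is one-hot, the prior and the likelihood can also be written compactly as
% products (a standard rewriting, not shown on the slide; kept as a comment):
% \aleq{
% p(\vz) = \prod_{k=1}^K \pi_k^{z_k}, \qquad p(\vx|\vz) = \prod_{k=1}^K \cN(\vx|\vmu_k,\msig_k)^{z_k}
% }
% For example, with $K=3$ and $\vz = \trans{[0,1,0]}$, these reduce to $p(\vz)=\pi_2$ and
% $p(\vx|\vz) = \cN(\vx|\vmu_2,\msig_2)$, which matches the sampling procedure below.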
328 | 329 | \item $p(\vx | z_k=1) = \cN(\vx| \vmu_k,\msig_k)$ 330 | \item Prior for $\vz$ with $\pi_k = p(z_k =1)$ 331 | $$ 332 | p(\vz) = \vpi = \trans{[\pi_1, \ldots, \pi_K]}, \quad \sum_{k=1}^K \pi_k = 1 333 | $$ 334 | 335 | \item Sampling procedure 336 | \bce 337 | \item Sample which component to use $z^{(i)} \sim p(\vz)$ 338 | \item Sample data according to $i$-th Gaussian $\vx^{(i)} \sim p(\vx | z^{(i)})$ 339 | \ece 340 | \eci 341 | \end{frame} 342 | 343 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 344 | \begin{frame}{Joint Distribution, Likelihood, and Posterior (1)} 345 | 346 | \plitemsep 0.1in 347 | \bci 348 | \item Joint distribution 349 | $$ 350 | p(\vx,\vz) = \colvec{p(\vx, z_1=1) \\ \vdots \\ p(\vx, z_K=1)} = 351 | \colvec{p(\vx| z_1=1)p(z_1=1) \\ \vdots \\ p(\vx|z_K=1)p(z_K=1)}= 352 | \colvec{\pi_1\cN(\vx|\vmu_1,\msig_1) \\ \vdots \\ \pi_K\cN(\vx|\vmu_K,\msig_K)} 353 | $$ 354 | 355 | \item Likelihood for an arbitrary single data $\vx$: By summing out all latent variables\footnote{In probabilistic PCA, $\vz$ was continuous, so we integrated them out.}, 356 | \aleq{ 357 | p(\vx | \vth) &= \sum_{\vz} p(\vx|\vth,\vz)p(\vz|\vth) = \sum_{k=1}^K p(\vx|\vth,z_k=1)p(z_k=1|\vth) 358 | = \sum_{k=1}^K \pi_k \cN(\vx|\vmu_k, \msig_k) 359 | } 360 | \item For all the data samples $\cX,$ the log-likelihood is: 361 | \aleq{ 362 | \log p(\set{X} | \vth) &= \sum_{n=1}^N \log p(\vx_n|\vth) = \sum_{n=1}^N \log \sum_{k=1}^K \pi_k 363 | \cN(\vx_n | \vmu_k,\msig_k) \hspace{2cm} \lecturemark{\text{Compare: Page 7}} 364 | } 365 | \eci 366 | \end{frame} 367 | 368 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 369 | \begin{frame}{Joint Distribution, Likelihood, and Posterior (2)} 370 | 371 | \plitemsep 0.1in 372 | \bci 373 | 374 | \item Posterior for the $k$-th $z_k$, given an arbitrary single data $\vx$: 375 | \aleq{ 376 | p(z_k=1 | \vx) = \frac{p(z_k=1)p(\vx|z_k=1)}{\sum_{j=1}^K p(z_j=1)p(\vx|z_j=1)} 377 | = \frac{\pi_k 378 | \cN(\vx | \vmu_k,\msig_k)}{\sum_{j=1}^K\pi_j 379 | \cN(\vx | \vmu_j,\msig_j)} 380 | } 381 | \item Now, for all data samples $\set{X},$ each data $\vx_n$ has $\vz_n= \trans{[z_{n1}, \ldots, z_{nK}]},$ but with the same prior $\vpi.$ 382 | \aleq{ 383 | p(z_{nk}=1 | \vx_n) = \frac{p(z_{nk}=1)p(\vx_n|z_{nk}=1)}{\sum_{j=1}^K p(z_{nj}=1)p(\vx_n|z_{nj}=1)} 384 | = \frac{\pi_k 385 | \cN(\vx_n | \vmu_k,\msig_k)}{\sum_{j=1}^K\pi_j 386 | \cN(\vx_n | \vmu_j,\msig_j)} = r_{nk} 387 | } 388 | 389 | \item Responsibilities are mathematically interpreted as \bluef{posterior distributions.} 390 | \eci 391 | \end{frame} 392 | 393 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 394 | \section{L11(4)} 395 | \begin{frame}{Roadmap} 396 | 397 | \plitemsep 0.1in 398 | 399 | \bce[(1)] 400 | 401 | \item \grayf{Gaussian Mixture Model} 402 | \item \grayf{Parameter Learning: MLE} 403 | \item \grayf{Latent-Variable Perspective for Probabilistic Modeling} 404 | \item \redf{EM Algorithm} 405 | \ece 406 | \end{frame} 407 | 408 | 409 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 410 | \begin{frame}{Revisiting EM Algorithm for MLE} 411 | 412 | \mytwocols{0.5} 413 | { 414 | \small 415 | \bce[\red \bf S1.] 
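% For the GMM specifically, $\vz$ is discrete, so the expectation in the E-step is a sum over
% the one-hot configurations of each $\vz_n$; with the responsibilities computed from the
% current $\vth^{(t)}$ it works out to (a standard result, kept as a comment):
% \aleq{
% Q(\vth|\vth^{(t)}) = \sum_{n=1}^N \sum_{k=1}^K r_{nk}\Big(\log \pi_k + \log \cN(\vx_n|\vmu_k,\msig_k)\Big),
% }
% and maximizing $Q$ over $\vmu_k, \msig_k, \pi_k$ (with $\sum_k \pi_k = 1$) recovers exactly
% the M-step updates in the left column.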
416 | \item Initialize $\vmu_k, \msig_k, \pi_k$ 417 | 418 | \item \bluef{\bf E-step:} 419 | $$ 420 | r_{nk}= \frac{\pi_k \cN(\vx_n | \vmu_k, \msig_k)}{ \sum_{j}\pi_j \cN(\vx_n | \vmu_j, \msig_j) } 421 | $$ 422 | 423 | \item \bluef{\bf M-step:} Update $\vmu_k, \msig_k, \pi_k$ using $r_{nk}$ 424 | and go to \redf{\bf S2.} 425 | \ece 426 | } 427 | { 428 | \small 429 | \bci 430 | \item \bluef{\bf E-step.} \orangef{Expectation} over $\vz | \vx, \vth^{(t)}$: 431 | Given the current $\vth^{(t)} = (\vmu_k, \msig_k, \pi_k),$ calculate the expected (complete-data) log-likelihood 432 | \aleq{ 433 | Q(\vth|\vth^{(t)}) &= \expecti{\vz|\vx,\vth^{(t)}}{\log p(\vx,\vz | \vth)} \cr 434 | & = \int \log p(\vx,\vz | \vth) p(\vz|\vx,\vth^{(t)})\text{d}\vz 435 | } 436 | 437 | \item \bluef{\bf M-step.} \orangef{Maximization} of $Q(\vth|\vth^{(t)})$ computed in the E-step, which yields the new model parameters. 438 | 439 | \eci 440 | } 441 | 442 | \bci 443 | \item Only a local optimum is guaranteed, because the original optimization problem is not convex in general. \hfill \lecturemark{L7(4)} 444 | \eci 445 | 446 | \end{frame} 447 | 448 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 449 | \begin{frame}{Other Issues} 450 | 451 | \plitemsep 0.1in 452 | \bci 453 | \item Model selection for finding a good $K$, e.g., using nested cross-validation 454 | 455 | \item Application: Clustering 456 | \bci 457 | \item K-means: Treat the means in GMM as cluster centers and ignore the covariances. 458 | \item K-means: hard assignment, GMM: soft assignment 459 | \eci 460 | 461 | \item EM algorithm: Highly generic in the sense that it can be used for parameter learning in general latent-variable models 462 | 463 | \item Standard criticisms of MLE, such as overfitting, still apply. A fully Bayesian approach that puts priors on the parameters is also possible, but it is not covered in these notes. 464 | 465 | \item Other density estimation methods 466 | \bci 467 | \item Histogram-based method: non-parametric method 468 | \item Kernel-density estimation: non-parametric method 469 | \eci 470 | \eci 471 | \end{frame} 472 | 473 | 474 | 475 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 476 | \begin{frame}{} 477 | \vspace{2cm} 478 | \LARGE Questions?
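% Side note on the K-means bullet in "Other Issues" above (a standard limiting argument, kept
% as a comment): if all covariances are fixed to $\msig_k = \epsilon \mI$ and $\epsilon \to 0$, then
% \aleq{
% r_{nk} = \frac{\pi_k \exp\big(-\norm{\vx_n-\vmu_k}^2/(2\epsilon)\big)}
% {\sum_j \pi_j \exp\big(-\norm{\vx_n-\vmu_j}^2/(2\epsilon)\big)} \to
% \begin{cases} 1 & \text{if } k = \arg\min_j \norm{\vx_n-\vmu_j}^2 \\ 0 & \text{otherwise,} \end{cases}
% }
% i.e., the soft assignments become the hard assignments of K-means.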
479 | 480 | 481 | \end{frame} 482 | 483 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 484 | \begin{frame}{Review Questions} 485 | % \tableofcontents 486 | %\plitemsep 0.1in 487 | \bce[1)] 488 | \item 489 | 490 | \ece 491 | \end{frame} 492 | 493 | 494 | \end{document} 495 | -------------------------------------------------------------------------------- /12.SVM/12.SVM-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/12.SVM/12.SVM-2.pdf -------------------------------------------------------------------------------- /12.SVM/12.SVM-4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/12.SVM/12.SVM-4.pdf -------------------------------------------------------------------------------- /12.SVM/12.SVM.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/12.SVM/12.SVM.pdf -------------------------------------------------------------------------------- /12.SVM/L12_disthyper.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/12.SVM/L12_disthyper.png -------------------------------------------------------------------------------- /12.SVM/L12_halfspace.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/12.SVM/L12_halfspace.png -------------------------------------------------------------------------------- /12.SVM/L12_hingeloss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/12.SVM/L12_hingeloss.png -------------------------------------------------------------------------------- /12.SVM/L12_kernel_ex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/12.SVM/L12_kernel_ex.png -------------------------------------------------------------------------------- /12.SVM/L12_soft_hard_svm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/12.SVM/L12_soft_hard_svm.png -------------------------------------------------------------------------------- /12.SVM/L12_softsvm_geo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/12.SVM/L12_softsvm_geo.png -------------------------------------------------------------------------------- /12.SVM/dist_hyperplane.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/12.SVM/dist_hyperplane.pptx -------------------------------------------------------------------------------- /12.SVM/main.tex: -------------------------------------------------------------------------------- 1 | %\pdfminorversion=4 2 | 
\documentclass[handout,fleqn,aspectratio=169]{beamer} 3 | 4 | \input{../myhead} 5 | 6 | \title[]{Lecture 12: Classification with Support Vector Machines} 7 | \author{Yi, Yung (이융)} 8 | \institute{Mathematics for Machine Learning\\ \url{https://yung-web.github.io/home/courses/mathml.html} 9 | \\KAIST EE} 10 | \date{\today} 11 | 12 | 13 | \input{../mymath} 14 | \input{../mymacro} 15 | 16 | 17 | %\addtobeamertemplate{footline}{\rule{0.94\paperwidth}{1pt}}{} 18 | 19 | \begin{document} 20 | 21 | \input{../mydefault} 22 | 23 | 24 | 25 | % START START START START START START START START START START START START START 26 | 27 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 28 | \begin{frame}{Warm-Up} 29 | 30 | {\Large Please watch this tutorial video by Luis Serrano on Support Vector Machine.} 31 | 32 | \bigskip 33 | 34 | \bigskip 35 | 36 | \url{https://youtu.be/Lpr__X8zuE8} 37 | 38 | \end{frame} 39 | 40 | 41 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 42 | \begin{frame}{Roadmap} 43 | 44 | \plitemsep 0.1in 45 | 46 | \bce[(1)] 47 | 48 | \item Story and Separating Hyperplanes 49 | \item Primal SVM: Hard SVM 50 | \item Primal SVM: Soft SVM 51 | \item Dual SVM 52 | \item Kernels 53 | \item Numerical Solution 54 | 55 | \ece 56 | \end{frame} 57 | 58 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 59 | \section{L12(1)} 60 | \begin{frame}{Roadmap} 61 | 62 | \plitemsep 0.1in 63 | 64 | \bce[(1)] 65 | 66 | \item \redf{Story and Separating Hyperplanes} 67 | \item \grayf{Primal SVM: Hard SVM 68 | \item Primal SVM: Soft SVM 69 | \item Dual SVM 70 | \item Kernels 71 | \item Numerical Solution} 72 | 73 | \ece 74 | \end{frame} 75 | 76 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 77 | \begin{frame}{Storyline} 78 | 79 | \plitemsep 0.1in 80 | 81 | \bci 82 | 83 | \item (Binary) classification vs. regression 84 | 85 | \item A classification predictor $f:\realD \mapsto \{+1, -1 \},$ where $D$ is the dimension of features. 86 | \item Supervised learning, as in regression, with a given dataset $\{(\vx_1,y_1), \ldots, (\vx_N,y_N) \},$ where our task is to learn the model parameters that produce the smallest classification error. 87 | 88 | \item SVM 89 | \bci 90 | \item Geometric way of thinking about supervised learning 91 | \item Relying on empirical risk minimization 92 | \item Binary classification = Drawing a separating hyperplane 93 | \item Various interpretations: geometric view, loss function view, and the view from convex hulls of data points 94 | \eci 95 | \eci 96 | \end{frame} 97 | 98 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 99 | \begin{frame}{Hard SVM vs. 
Soft SVM} 100 | 101 | \mypic{0.55}{L12_soft_hard_svm.png} 102 | 103 | \plitemsep 0.1in 104 | 105 | \bci 106 | 107 | \item Hard SVM: Data are linearly separable, and thus no classification errors are allowed 108 | 109 | \item Soft SVM: Data are not linearly separable, and thus some classification errors are allowed 110 | \eci 111 | \end{frame} 112 | 113 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 114 | \begin{frame}{Separating Hyperplane} 115 | 116 | \plitemsep 0.07in 117 | 118 | \bci 119 | 120 | \item \bluef{Hyperplane} in $\realD$ is a set: 121 | $\{x \mid \trans{a}x=b\}$ where $a\in\realD, a\neq 0, b\in\real$ \hfill \lecturemark{L7(3)} 122 | 123 | In other words, $\{ x \mid \trans{a}(x-x_0) =0\},$ where $x_0$ is any point in 124 | the hyperplane, i.e., $\trans{a} x_0 = b.$ 125 | 126 | \mysmalltwocols{0.2} 127 | { 128 | \item Divides $\realD$ into two {\blue halfspaces}: 129 | $\{x|\trans{a}x\leq b\}$ and $\{x|\trans{a}x>b\}$ 130 | } 131 | { 132 | \vspace{-0.3cm} 133 | \mypic{0.7}{L12_halfspace.png} 134 | } 135 | \vspace{-0.2cm} 136 | \item In our problem, we consider the hyperplane $\trans{\vw}\vx + b=0,$ where $\vw$ and $b$ are the parameters of the model. 137 | 138 | \item Classification logic 139 | \aleq{ 140 | \begin{cases} 141 | \trans{\vw}\vx_n + b \geq 0 & \ \text{when} \ y_n = +1\cr 142 | \trans{\vw}\vx_n + b < 0 & \ \text{when} \ y_n = -1 143 | \end{cases} 144 | \implies \redf{y_n \big(\trans{\vw}\vx_n +b \big) \geq 0} 145 | } 146 | 147 | % \bci 148 | % \item $\trans{\vw}\vx_n + b \geq 0$ when $y_n = +1$ 149 | % \item $\trans{\vw}\vx_n + b < 0$ when $y_n = -1$ 150 | % \eci 151 | \eci 152 | \end{frame} 153 | 154 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 155 | \begin{frame}{Distance between Two Hyperplanes} 156 | 157 | \plitemsep 0.07in 158 | 159 | \bci 160 | 161 | \item Consider two hyperplanes $\trans{\vw}\vx - b =0$ and $\trans{\vw}\vx - b= r$, where we assume $r >0.$ 162 | 163 | \item \question What is the distance\footnote{The shortest distance between the two hyperplanes.} between the two hyperplanes? Answer: \bluef{$\dfrac{r}{\norm{w}}$} 164 | \eci 165 | 166 | \vspace{-0.7cm} 167 | \mypic{0.5}{L12_disthyper.png} 168 | 169 | % \mysmalltwocols{0.4} 170 | % { 171 | % } 172 | % { 173 | 174 | % } 175 | 176 | 177 | \end{frame} 178 | 179 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 180 | \section{L12(2)} 181 | \begin{frame}{Roadmap} 182 | 183 | \plitemsep 0.1in 184 | 185 | \bce[(1)] 186 | 187 | \item \grayf{Story and Separating Hyperplanes} 188 | \item \redf{Primal SVM: Hard SVM} 189 | \item \grayf{Primal SVM: Soft SVM 190 | \item Dual SVM 191 | \item Kernels 192 | \item Numerical Solution} 193 | 194 | \ece 195 | \end{frame} 196 | 197 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 198 | \begin{frame}{Hard Support Vector Machine} 199 | 200 | \plitemsep 0.07in 201 | 202 | \bci 203 | 204 | \item Assume that the data points are linearly separable. 205 | 206 | \item Goal: Find the hyperplane that maximizes the margin between the positive and the negative samples 207 | 208 | \item Given the training dataset $\{(\vx_1,y_1), \ldots, (\vx_N,y_N) \}$ 209 | and a hyperplane $\trans{\vw}\vx + b =0,$ what is the constraint that all data points are $\frac{r}{\norm{w}}$-away from the hyperplane? 
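% One way to verify the $\frac{r}{\norm{\vw}}$ answer on the "Distance between Two Hyperplanes"
% frame above (kept as a comment): take $\vx_a$ with $\trans{\vw}\vx_a - b = 0$ and move along
% the normal direction, $\vx = \vx_a + \lambda \vw$. Requiring $\trans{\vw}\vx - b = r$ gives
% \aleq{
% \trans{\vw}(\vx_a + \lambda\vw) - b = \lambda \norm{\vw}^2 = r
% \ \Longrightarrow \ \lambda = \frac{r}{\norm{\vw}^2},
% }
% so the (shortest) distance is $\norm{\lambda \vw} = \frac{r}{\norm{\vw}}$.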
210 | $$ 211 | y_n \big(\trans{\vw}\vx_n +b \big) \geq \frac{r}{\norm{\vw}} 212 | $$ 213 | 214 | \item Note that $r$ and $\norm{w}$ are scaled together, so if we fix $\norm{w}=1$, then 215 | $$ 216 | y_n \big(\trans{\vw}\vx_n +b \big) \geq r 217 | $$ 218 | 219 | \eci 220 | \end{frame} 221 | 222 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 223 | \begin{frame}{Hard SVM: Formulation 1} 224 | 225 | \plitemsep 0.07in 226 | 227 | \bci 228 | 229 | \item Maximize the margin, such that all the training data points are well-classified into their classes ($+$ or $-$) 230 | \mycolorbox 231 | { 232 | \vspace{-0.3cm} 233 | \aleq{ 234 | \max_{\vw, b, r} \quad &r \cr 235 | \text{subject to} \quad & y_n \big(\trans{\vw}\vx_n +b \big) \geq r, \ \text{for all} \ n=1,\ldots, N, \quad \norm{\vw}=1, \quad r>0 236 | } 237 | } 238 | 239 | \eci 240 | \end{frame} 241 | 242 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 243 | \begin{frame}{Formulation 2 (1)} 244 | 245 | \mycolorbox 246 | { 247 | \aleq{ 248 | \max_{\vw, b, r} \quad &r \cr 249 | \text{subject to} \quad & y_n \big(\trans{\vw}\vx_n +b \big) \geq r, \ \text{for all} \ n=1,\ldots, N, \quad \norm{\vw}=1, \quad r>0 250 | } 251 | } 252 | \plitemsep 0.07in 253 | \bci 254 | 255 | \item Since $\norm{\vw}=1,$ reformulate $\vw$ by $\vw'$ as: 256 | $y_n \Big(\dfrac{\trans{\vw'}}{\norm{\vw'}}\vx_n +b \Big) \geq r$ 257 | \item Change the objective from $r$ to $r^2.$ 258 | \item Define $\vw''$ and $b''$ by rescaling the constraint: 259 | \aleq{ 260 | y_n \Big(\frac{\trans{\vw'}}{\norm{\vw'}}\vx_n +b \Big) \geq r \Longleftrightarrow 261 | y_n \Big(\trans{\vw''}\vx_n +b'' \Big) \geq 1, \quad 262 | \vw'' = \frac{\vw'}{\norm{\vw'}r} \ \text{and} \ b'' = \frac{b}{r} 263 | } 264 | \eci 265 | \end{frame} 266 | 267 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 268 | \begin{frame}{Formulation 2 (2)} 269 | 270 | \plitemsep 0.07in 271 | \bci 272 | 273 | \item Note that $\norm{\vw''} = \frac{1}{r}$ 274 | \item Thus, we have the following reformulated problem: 275 | \mycolorbox 276 | { 277 | \vspace{-0.3cm} 278 | \aleq{ 279 | \max_{\vw'', b''} \quad &\frac{1}{\norm{\vw''}^2} \cr 280 | \text{subject to} \quad & y_n \big(\trans{\vw''}\vx_n +b'' \big) \geq 1, \ \text{for all} \ n=1,\ldots, N, 281 | } 282 | } 283 | = 284 | 285 | \mycolorbox 286 | { 287 | \vspace{-0.3cm} 288 | \aleq{ 289 | \min_{\vw, b} \quad &\frac{1}{2} \norm{\vw}^2 \cr 290 | \text{subject to} \quad & y_n \big(\trans{\vw}\vx_n +b \big) \geq 1, \ \text{for all} \ n=1,\ldots, N, 291 | } 292 | } 293 | 294 | 295 | \eci 296 | \end{frame} 297 | 298 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 299 | \begin{frame}{Understanding Formulation 2 Intuitively} 300 | 301 | \plitemsep 0.07in 302 | \bci 303 | 304 | \item Given the training dataset $\{(\vx_1,y_1), \ldots, (\vx_N,y_N) \}$ 305 | and a hyperplane $\trans{\vw}\vx + b =0,$ what is the constraint that all data points are $\frac{r}{\norm{w}}$-away from the hyperplane? 306 | $$ 307 | y_n \big(\trans{\vw}\vx_n +b \big) \geq \frac{r}{\norm{\vw}} 308 | $$ 309 | 310 | \item \redf{Formulation 1.} Note that $r$ and $\norm{w}$ are scaled together, so if we fix $\norm{w}=1$, then 311 | $$ 312 | y_n \big(\trans{\vw}\vx_n +b \big) \geq r. 313 | $$ 314 | And, \bluef{maximize $r.$} 315 | 316 | \item \redf{Formulation 2.} If we fix $r=1,$ then 317 | $$ 318 | y_n \big(\trans{\vw}\vx_n +b \big) \geq 1. 
319 | $$ 320 | And, minimize $\norm{\vw}$ 321 | \eci 322 | \end{frame} 323 | 324 | 325 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 326 | \section{L12(3)} 327 | \begin{frame}{Roadmap} 328 | 329 | \plitemsep 0.1in 330 | 331 | \bce[(1)] 332 | 333 | \item \grayf{Story and Separating Hyperplanes} 334 | \item \grayf{Primal SVM: Hard SVM} 335 | \item \redf{Primal SVM: Soft SVM} 336 | \item \grayf{Dual SVM 337 | \item Kernels 338 | \item Numerical Solution} 339 | 340 | \ece 341 | \end{frame} 342 | 343 | 344 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 345 | \begin{frame}{Soft SVM: Geometric View} 346 | 347 | \plitemsep 0.07in 348 | \bci 349 | 350 | \item Now we allow some classification errors, because it's not linearly separable. 351 | 352 | \item Introduce a slack variable that quantifies how much errors will be allowed in my optimization problem 353 | \mytwocols{0.6} 354 | { 355 | \small 356 | \item $\vxi = (\xi_n: n=1, \ldots, N)$ 357 | \item $\xi_n$: slack for the $n$-th sample $(\vx_n,y_n)$ 358 | \begin{tcolorbox}[colback=red!5!white,colframe=red!75!black] 359 | \vspace{-0.3cm} 360 | \aleq{ 361 | \min_{\vw, b} \quad &\frac{1}{2} \norm{\vw}^2 +C\sum_{n=1}^N \xi_n \cr 362 | \text{subject to} \quad & y_n \big(\trans{\vw}\vx_n +b \big) \geq 1 - \xi_n,\cr 363 | & \xi_n \geq 0, \qquad \text{for all} \ n 364 | } 365 | \end{tcolorbox} 366 | 367 | \item $C$: Trade-off between width and slack 368 | } 369 | { 370 | %\vspace{-0.4cm} 371 | \mypic{0.75}{L12_softsvm_geo.png} 372 | } 373 | 374 | \eci 375 | \end{frame} 376 | 377 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 378 | \begin{frame}{Soft SVM: Loss Function View (1)} 379 | 380 | \plitemsep 0.07in 381 | \bci 382 | 383 | \item From the perspective of empirical risk minimizaiton 384 | 385 | \item Loss function design 386 | \bci 387 | \item \bluef{zero-one loss} $\mathbf{1}(f(x_n) \neq y_n)$: \# of mismatches between the prediction and the label $\implies$ combinatorial optimization (typically NP-hard) 388 | 389 | \item \bluef{hinge loss} 390 | $$ 391 | \ell(t) = \max(0,1-t), \ \text{where} \ t = y f(\vx) = y(\trans{\vw}\vx + b) 392 | $$ 393 | 394 | \mysmalltwocols{0.4} 395 | { 396 | \bci 397 | \item If $\vx$ is really at the correct side, $t \geq 1$ $\rightarrow$ $\ell(t) =0$ 398 | \item If $\vx$ is at the correct side, but too close to the boundary, $0 < t < 1$ \\$\rightarrow$ $0< \ell(t) =1-t <1$ 399 | \item If $\vx$ is at the wrong side, $ t < 0$ \\$\rightarrow$ $1 < \ell(t) =1-t$ 400 | \eci 401 | } 402 | { 403 | \mypic{0.8}{L12_hingeloss.png} 404 | } 405 | 406 | \eci 407 | 408 | \eci 409 | \end{frame} 410 | 411 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 412 | \begin{frame}{Soft SVM: Loss Function View (2)} 413 | 414 | \mycolorbox{ 415 | \vspace{-0.3cm} 416 | \aleq{ 417 | \min_{\vw, b} \ \text{(regularizer + loss)} = \min_{\vw, b} \quad \frac{1}{2} \norm{\vw}^2 +C\sum_{n=1}^N \max \{0,1- y(\trans{\vw}\vx + b) \} 418 | } 419 | } 420 | \plitemsep 0.1in 421 | \bci 422 | 423 | \item $\frac{1}{2}\norm{\vw}^2$: L2-regularizer (margin maximization = regularization) 424 | 425 | \item $C$: regularization parameter, which moves from the regularization term to the loss term 426 | \item Why this loss function view = geometric view? 
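% Tiny numerical illustration of the three hinge-loss cases on the previous frame (kept as a
% comment): with $t = y(\trans{\vw}\vx + b)$,
% \aleq{
% t = 1.5 \Rightarrow \ell(t) = 0, \qquad
% t = 0.4 \Rightarrow \ell(t) = 0.6, \qquad
% t = -0.2 \Rightarrow \ell(t) = 1.2,
% }
% i.e., no penalty beyond the margin, a small penalty inside the margin, and a penalty larger
% than 1 on the wrong side.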
427 | \aleq{ 428 | \min_t \max(0,1-t) \Longleftrightarrow \min_{\xi,t} \xi, \ \text{subject to} \ \xi \geq 0, \ \xi \geq 1-t 429 | } 430 | 431 | \eci 432 | \end{frame} 433 | 434 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 435 | \section{L12(4)} 436 | \begin{frame}{Roadmap} 437 | 438 | \plitemsep 0.1in 439 | 440 | \bce[(1)] 441 | 442 | \item \grayf{Story and Separating Hyperplanes} 443 | \item \grayf{Primal SVM: Hard SVM} 444 | \item \grayf{Primal SVM: Soft SVM} 445 | \item \red{Dual SVM} 446 | \item \grayf{Kernels 447 | \item Numerical Solution} 448 | 449 | \ece 450 | \end{frame} 451 | 452 | 453 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 454 | \begin{frame}{Dual SVM: Idea} 455 | 456 | \begin{tcolorbox}[colback=red!5!white,colframe=red!75!black] 457 | \vspace{-0.3cm} 458 | \aleq{ 459 | \min_{\vw, b} \quad &\frac{1}{2} \norm{\vw}^2 +C\sum_{n=1}^N \xi_n \cr 460 | \text{subject to} \quad & y_n \big(\trans{\vw}\vx_n +b \big) \geq 1 - \xi_n, \ \xi_n \geq 0, \quad \text{for all} \ n 461 | } 462 | \end{tcolorbox} 463 | 464 | \vspace{-0.3cm} 465 | \plitemsep 0.05in 466 | \bci 467 | 468 | \item The above primal problem is a convex optimization problem. 469 | 470 | \item Let's apply Lagrange multipliers, find another formulation, and see what other nice properties are shown \hfill \lecturemark{L7(2), L7(4)} 471 | 472 | \item Convert the problem into "$\leq$" constraints, so as to apply \redf{min-min-max} rule 473 | \mycolorbox{ 474 | \vspace{-0.3cm} 475 | \aleq{ 476 | \min_{\vw, b} \ \frac{1}{2} \norm{\vw}^2 +C\sum_{n=1}^N \xi_n, \ 477 | \text{s.t.} \ -y_n \big(\trans{\vw}\vx_n +b \big) \leq -1 + \xi_n, \ -\xi_n \leq 0, \quad \text{for all} \ n 478 | } 479 | } 480 | 481 | % \item Lagrangian 482 | % \aleq{ 483 | % \cL(\vw, b, \vxi, \valpha, \vgamma) = \frac{1}{2} \norm{\vw}^2 +C\sum_{n=1}^N \xi_n 484 | % - \sum_{n=1}^N \alpha_n\Big[y_n \big(\trans{\vw}\vx_n +b \big) -1 + \xi_n \Big] - \sum_{n=1}^N \gamma_n \xi_n 485 | % } 486 | 487 | \eci 488 | 489 | 490 | \end{frame} 491 | 492 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 493 | \begin{frame}{Applying Lagrange Multipliers (1)} 494 | 495 | \mycolorbox{ 496 | \vspace{-0.3cm} 497 | \aleq{ 498 | \min_{\vw, b} \ \frac{1}{2} \norm{\vw}^2 +C\sum_{n=1}^N \xi_n, \ 499 | \text{s.t.} \ -y_n \big(\trans{\vw}\vx_n +b \big) \leq -1 + \xi_n, \ -\xi_n \leq 0, \quad \text{for all} \ n 500 | } 501 | } 502 | \vspace{-0.5cm} 503 | \plitemsep 0.05in 504 | \bci 505 | 506 | \item Lagrangian with multipliers $\alpha_n \geq 0$ and $\gamma_n \geq 0$ 507 | \aleq{ 508 | \cL(\vw, b, \vxi, \valpha, \vgamma) = \frac{1}{2} \norm{\vw}^2 +C\sum_{n=1}^N \xi_n 509 | - \sum_{n=1}^N \alpha_n\Big[y_n \big(\trans{\vw}\vx_n +b \big) -1 + \xi_n \Big] - \sum_{n=1}^N \gamma_n \xi_n 510 | } 511 | 512 | \item Dual function: $\cD(\valpha,\vgamma) = \inf_{\vw, b, \vxi} \cL(\vw, b, \vxi, \valpha, \vgamma)$ for which the followings should be met: 513 | \small 514 | \aleq{ 515 | \text{\blue (D1)} \ \pd{\cL}{\vw} = \trans{\vw} - \sum_{n=1}^N \alpha_n y_n \trans{\vx}_n = 0, \ \text{\blue (D2)} \ \pd{\cL}{b} = \sum_{n=1}^N \alpha_n y_n =0 , \ \text{(\blue D3)} \ \pd{\cL}{\xi_n} = C - \alpha_n - \gamma_n = 0 516 | } 517 | \eci 518 | 519 | 520 | \end{frame} 521 | 522 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 523 | \begin{frame}{Applying Lagrange Multipliers (2)} 524 | 525 | \plitemsep 0.07in 526 | \bci 527 | 528 | \item Dual function $\cD(\valpha,\vgamma) = \inf_{\vw, b, \vxi} \cL(\vw, b, \vxi, \valpha, \vgamma)$ with \bluef{(D1)} is given by: 529 | 
\aleq{ 530 | \cD(\valpha,\vgamma) &= \frac{1}{2} \sum_{i=1}^N \sum_{j=1}^N y_i y_j \alpha_i \alpha_j 531 | \inner{\vx_i}{\vx_j} - \sum_{i=1}^N y_i \alpha_i \inner{\sum_{j=1}^N y_j \alpha_j \vx_j}{\vx_i} -b \redf{\sum_{i=1}^N y_i \alpha_i} \cr 532 | & + \sum_{i=1}^N \alpha_i + \sum_{i=1}^N \magenf{(C-\alpha_i -\gamma_i)}\xi_i 533 | } 534 | 535 | \item Combining the first two terms, and using \redf{(D2)} and \magenf{(D3)}, the above is simplified into: 536 | \aleq{ 537 | \cD(\valpha,\vgamma) = -\frac{1}{2} \sum_{i=1}^N \sum_{j=1}^N y_i y_j \alpha_i \alpha_j 538 | \inner{\vx_i}{\vx_j} + \sum_{i=1}^N \alpha_i 539 | } 540 | 541 | \item $\alpha_i, \gamma_i \geq 0$ and $C-\alpha_i-\gamma_i =0$ $\implies$ $ 0 \le \alpha_i \le C$ 542 | \eci 543 | 544 | 545 | \end{frame} 546 | 547 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 548 | \begin{frame}{Dual SVM} 549 | 550 | \plitemsep 0.07in 551 | \bci 552 | 553 | \item (Lagrangian) Dual Problem: \redf{maximize $\cD(\valpha,\vgamma)$}, i.e., minimize $-\cD(\valpha,\vgamma)$: 554 | \mycolorbox 555 | { 556 | \vspace{-0.3cm} 557 | \aleq{ 558 | \min_{\valpha} \quad & \frac{1}{2} \sum_{i=1}^N \sum_{j=1}^N y_i y_j \alpha_i \alpha_j 559 | \inner{\vx_i}{\vx_j} - \sum_{i=1}^N \alpha_i \cr 560 | \text{subject to} \quad& \sum_{i=1}^N y_i \alpha_i =0, \quad 0 \le \alpha_i \le C, \ \forall i=1, \ldots, N 561 | } 562 | \vspace{-0.2cm} 563 | } 564 | \item Primal SVM: the number of parameters scales as \bluef{the number of features ($D$)} 565 | 566 | \item Dual SVM 567 | \bci 568 | \item the number of parameters scales as \bluef{the number of training data ($N$)} 569 | \item only depends on the inner products of individual training data points $\inner{\vx_i}{\vx_j}$ $\rightarrow$ allows the application of \redf{kernels} 570 | \eci 571 | 572 | \eci 573 | \end{frame} 574 | 575 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 576 | \section{L12(5)} 577 | \begin{frame}{Roadmap} 578 | 579 | \plitemsep 0.1in 580 | 581 | \bce[(1)] 582 | 583 | \item \grayf{Story and Separating Hyperplanes} 584 | \item \grayf{Primal SVM: Hard SVM} 585 | \item \grayf{Primal SVM: Soft SVM} 586 | \item \grayf{Dual SVM} 587 | \item \redf{Kernels 588 | \item Numerical Solution} 589 | 590 | \ece 591 | \end{frame} 592 | 593 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 594 | \begin{frame}{Kernel} 595 | 596 | \mytwocols{0.7} 597 | { 598 | \bigskip 599 | 600 | \plitemsep 0.1in 601 | \bci 602 | 603 | \item Modularity: Using the feature transformation $\vphi(\vx),$ dual SVMs can be modularized 604 | $$ 605 | \inner{\vx_i}{\vx_j} \implies \inner{\vphi(\vx_i)}{\vphi(\vx_j)} 606 | $$ 607 | 608 | \item Similarity function $k: \cX \times \cX \mapsto \real$, $k(\vx_i,\vx_j) = \inner{\vphi(\vx_i)}{\vphi(\vx_j)}$ 609 | 610 | \item Kernel matrix, Gram matrix: must be symmetric and positive semidefinite 611 | 612 | \item Examples: polynomial kernel, Gaussian radial basis function, rational quadratic kernel 613 | \eci 614 | } 615 | { 616 | \mypic{0.9}{L12_kernel_ex.png} 617 | } 618 | 619 | \end{frame} 620 | 621 | 622 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 623 | \begin{frame}{Numerical Solution} 624 | 625 | \plitemsep 0.07in 626 | \bci 627 | 628 | \item 629 | 630 | \eci 631 | \end{frame} 632 | 633 | 634 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 635 | \begin{frame}{} 636 | \vspace{2cm} 637 | \LARGE Questions?
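% Small worked example for the "Kernel" frame above (standard, kept as a comment): in 2-D, the
% degree-2 polynomial kernel $k(\vx,\vy) = \inner{\vx}{\vy}^2$ corresponds to the explicit
% feature map $\vphi(\vx) = \trans{[x_1^2, \ \sqrt{2}\,x_1 x_2, \ x_2^2]}$, since
% \aleq{
% \inner{\vphi(\vx)}{\vphi(\vy)} = x_1^2 y_1^2 + 2 x_1 x_2 y_1 y_2 + x_2^2 y_2^2
% = (x_1 y_1 + x_2 y_2)^2 = \inner{\vx}{\vy}^2,
% }
% and the Gaussian RBF kernel is $k(\vx,\vy) = \exp\big(-\norm{\vx-\vy}^2/(2\sigma^2)\big)$.
% Regarding the "Numerical Solution" frame above: both the primal and the dual soft SVM are
% convex quadratic programs, so in practice they are handed to a standard QP (or a specialized
% SVM) solver.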
638 | 639 | 640 | \end{frame} 641 | 642 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 643 | \begin{frame}{Review Questions} 644 | % \tableofcontents 645 | %\plitemsep 0.1in 646 | \bce[1)] 647 | \item 648 | 649 | \ece 650 | \end{frame} 651 | 652 | 653 | \end{document} 654 | -------------------------------------------------------------------------------- /compile.sh: -------------------------------------------------------------------------------- 1 | (cd 01.Introduction; pdflatex -jobname=1.intro main.tex) 2 | (cd 02.LinearAlgebra; pdflatex -jobname=2.LA main.tex) 3 | (cd 03.Geometry; pdflatex -jobname=3.AG main.tex) 4 | (cd 04.MatrixDecomposition; pdflatex -jobname=4.MD main.tex) 5 | (cd 05.VectorCaculus; pdflatex -jobname=5.VC main.tex) 6 | (cd 06.Probability; pdflatex -jobname=6.PD main.tex) 7 | (cd 07.Optimization; pdflatex -jobname=7.OPT main.tex) 8 | (cd 08.Model_Data; pdflatex -jobname=8.MMD main.tex) 9 | (cd 09.LinearRegression; pdflatex -jobname=9.LR main.tex) 10 | (cd 10.PCA; pdflatex -jobname=10.PCA main.tex) 11 | (cd 11.DensityEstimation; pdflatex -jobname=11.GMM main.tex) 12 | (cd 12.SVM; pdflatex -jobname=12.SVM main.tex) 13 | -------------------------------------------------------------------------------- /kaist_ee.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yung-web/MathML/6a4082ac366a8416eb945c4a124e2f1281a056c1/kaist_ee.png -------------------------------------------------------------------------------- /mydefault.tex: -------------------------------------------------------------------------------- 1 | %itemshape 2 | \setbeamertemplate{itemize item}{\scriptsize\raise1.25pt\hbox{\donotcoloroutermaths$\bullet$}} 3 | \setbeamertemplate{itemize subitem}{\tiny\raise1.5pt\hbox{\donotcoloroutermaths$\circ$}} 4 | \setbeamertemplate{itemize subsubitem}{\tiny\raise1.5pt\hbox{\donotcoloroutermaths$\blacktriangleright$}} 5 | %default value for spacing 6 | \plitemsep 0.1in 7 | \pltopsep 0.03in 8 | \setlength{\parskip}{0.15in} 9 | %\setlength{\parindent}{-0.5in} 10 | \setlength{\abovedisplayskip}{0.07in} 11 | \setlength{\belowdisplayskip}{0.07in} 12 | \setlength{\mathindent}{0cm} 13 | \setbeamertemplate{frametitle continuation}{[\insertcontinuationcount]} 14 | 15 | \setlength{\leftmargini}{0.5cm} 16 | \setlength{\leftmarginii}{0.5cm} 17 | 18 | \setlength{\fboxrule}{0.05pt} 19 | \setlength{\fboxsep}{5pt} 20 | 21 | 22 | %%%%%%% This should be placed at the end of this file 23 | \logo{\pgfputat{\pgfxy(0.11, 7.4)}{\pgfbox[right,base]{\tikz{\filldraw[fill=dianablue, draw=none] (0 cm, 0 cm) rectangle (50 cm, 1 cm);}\mbox{\hspace{-8 cm}\includegraphics[height=0.7 cm]{../kaist_ee.png} 24 | }}}} 25 | 26 | \begin{frame} 27 | \titlepage 28 | \end{frame} 29 | 30 | \logo{\pgfputat{\pgfxy(0.11, 7.4)}{\pgfbox[right,base]{\tikz{\filldraw[fill=dianablue, draw=none] (0 cm, 0 cm) rectangle (50 cm, 1 cm);}\mbox{\hspace{-8 cm}\includegraphics[height=0.7 cm]{../kaist_ee.png} 31 | }}}} 32 | 33 | % rule color - gray 34 | \makeatletter 35 | \let\old@rule\@rule 36 | \def\@rule[#1]#2#3{\textcolor{gray}{\old@rule[#1]{#2}{#3}}} 37 | \makeatother 38 | 39 | 40 | -------------------------------------------------------------------------------- /myhead.tex: -------------------------------------------------------------------------------- 1 | % when making printed slides 2 | \usepackage{pgfpages} 3 | \pgfpagesuselayout{resize to}[a4paper,landscape,border shrink=5mm] 4 | 5 | \usepackage[english]{babel} 6 | \usepackage{tikz} 7 | 
\usepackage{courier} 8 | \usepackage{array} 9 | \usepackage{bold-extra} 10 | %\usepackage{minted} 11 | \usepackage[thicklines]{cancel} 12 | \usepackage{fancyvrb} 13 | \usepackage{kotex} 14 | \usepackage{paralist} 15 | \usepackage{collectbox} 16 | \usepackage{bm} 17 | 18 | \usepackage{mathrsfs} 19 | \usepackage[reqno,disallowspaces]{mathtools} % imports amsmath 20 | \usepackage{amsfonts} %for Y&Y BSR AMS fonts 21 | \usepackage{amssymb} 22 | \usepackage{amscd} 23 | %\usepackage{tikz,lipsum,lmodern} 24 | \usepackage[most]{tcolorbox} 25 | \usepackage{verbatim} 26 | \mode 27 | { 28 | \usetheme{default} 29 | \usecolortheme{default} 30 | \usefonttheme{default} 31 | \setbeamertemplate{navigation symbols}{} 32 | \setbeamertemplate{caption}[numbered] 33 | \setbeamertemplate{footline}[frame number] % or "page number" 34 | \setbeamercolor{frametitle}{fg=yellow} 35 | \setbeamercolor{footline}{fg=black} 36 | } 37 | 38 | \setbeamercolor{block body alerted}{bg=alerted text.fg!10} 39 | \setbeamercolor{block title alerted}{bg=alerted text.fg!20} 40 | \setbeamercolor{block body}{bg=structure!10} 41 | \setbeamercolor{block title}{bg=structure!20} 42 | \setbeamercolor{block body example}{bg=green!10} 43 | \setbeamercolor{block title example}{bg=green!20} 44 | \setbeamertemplate{blocks}[rounded][shadow] 45 | 46 | \xdefinecolor{dianablue}{rgb}{0.18,0.24,0.31} 47 | \xdefinecolor{darkblue}{rgb}{0.1,0.1,0.7} 48 | \xdefinecolor{darkgreen}{rgb}{0,0.5,0} 49 | \xdefinecolor{darkgrey}{rgb}{0.35,0.35,0.35} 50 | \xdefinecolor{darkorange}{rgb}{0.8,0.5,0} 51 | \xdefinecolor{darkred}{rgb}{0.7,0,0} 52 | \definecolor{darkgreen}{rgb}{0,0.6,0} 53 | \definecolor{mauve}{rgb}{0.58,0,0.82} 54 | 55 | \usetikzlibrary{shapes.callouts} 56 | 57 | \makeatletter 58 | \setbeamertemplate{footline} 59 | { 60 | \leavevmode% 61 | \hbox{% 62 | \begin{beamercolorbox}[wd=.333333\paperwidth,ht=2.25ex,dp=1ex,center]{author in head/foot}% 63 | \usebeamerfont{author in head/foot}\insertsection 64 | \end{beamercolorbox}% 65 | \begin{beamercolorbox}[wd=.333333\paperwidth,ht=2.25ex,dp=1ex,center]{title in head/foot}% 66 | \usebeamerfont{title in head/foot}\insertsubsection 67 | \end{beamercolorbox}% 68 | \begin{beamercolorbox}[wd=.333333\paperwidth,ht=2.25ex,dp=1ex,right]{date in head/foot}% 69 | \usebeamerfont{date in head/foot} 70 | \insertshortdate{}\hspace*{2em} 71 | \insertframenumber{} / \inserttotalframenumber\hspace*{2ex} 72 | \end{beamercolorbox}}% 73 | 74 | \vskip0pt% 75 | } 76 | \makeatother -------------------------------------------------------------------------------- /mymacro.tex: -------------------------------------------------------------------------------- 1 | %%%%%%%%%% linear algebra macros %%%%%%%%%%%%%%%%%%%%%%% 2 | 3 | %--------linsys 4 | % Use as \begin{linsys}{3} 5 | % x &+ &3y &+ &a &= &7 \\ 6 | % x &- &3y &+ &a &= &7 7 | % \end{linsys} 8 | % Remark: TeXbook pp. 167-170 says to put a medmuskip around a +; and that's 9 | % 4/18-ths of an em. Why does 2/18-ths of an em work? I don't know, but 10 | % comparing to a regular displayed equation suggests it is right. 11 | % (darseneau says LaTeX puts in half an \arraycolsep.) 12 | \newenvironment{linsys}[2][m]{% 13 | \setlength{\arraycolsep}{.1111em} % p. 170 TeXbook; a medmuskip 14 | \begin{array}[#1]{@{}*{#2}{rc}r@{}} 15 | }{% 16 | \end{array}} 17 | 18 | \newsavebox\boxofmathplus 19 | \sbox{\boxofmathplus}{$+$} 20 | \newcommand{\spaceforemptycolumn}{\makebox[\wd\boxofmathplus]{\ }} 21 | 22 | %--------grstep 23 | % For denoting a Gauss' reduction step. 
24 | % Use as: \grstep{\rho_1+\rho_3} or \grstep[2\rho_5 \\ 3\rho_6]{\rho_1+\rho_3} 25 | % \newcommand{\grstep}[2][\relax]{% 26 | % \ensuremath{\mathrel{ 27 | % \mathop{\longrightarrow}\limits^{#2\mathstrut}_{ 28 | % \begin{subarray}{l} #1 \end{subarray}}}}} 29 | 30 | % Advantage of length formulation is that between adjacent 31 | % \grstep's you can add \hspace{-\grsteplength} to make it look not too wide 32 | \newlength{\grsteplength} 33 | \setlength{\grsteplength}{1.5ex plus .1ex minus .1ex} 34 | 35 | \newcommand{\grstep}[2][\relax]{% 36 | \ensuremath{\mathrel{ 37 | \hspace{\grsteplength}\mathop{\longrightarrow}\limits^{#2\mathstrut}_{ 38 | \begin{subarray}{l} #1 \end{subarray}}\hspace{\grsteplength}}}} 39 | % If two or more \grsteps are in a row then they need to be tightened 40 | \newcommand{\repeatedgrstep}[2][\relax]{\hspace{-\grsteplength}\grstep[#1]{#2}} 41 | 42 | % row swap operation: \rho_1\swap\rho_2 43 | \newcommand{\swap}{\leftrightarrow} 44 | 45 | %-------------amatrix 46 | % Augmented matrix. Usage (note the argument does not count the aug col): 47 | % \begin{amatrix}{2} 48 | % 1 2 3 \\ 4 5 6 49 | % \end{amatrix} 50 | \newenvironment{amatrix}[1]{% 51 | \left(\begin{array}{@{}*{#1}{c}|c@{}} 52 | }{% 53 | \end{array}\right) 54 | } 55 | 56 | 57 | 58 | %-------------pmat 59 | % For matrices with arguments. 60 | % Usage: \begin{pmat}{c|c|c} 1 &2 &3 \end{pmat} 61 | \newenvironment{pmat}[1]{ 62 | \left(\begin{array}{@{}#1@{}} 63 | }{\end{array}\right) 64 | } 65 | 66 | 67 | 68 | %-------------misc matrices 69 | % \newenvironment{mat}{\left(\begin{array}}{\end{array}\right)} 70 | \newenvironment{detmat}{\left|\begin{array}}{\end{array}\right|} 71 | \newcommand{\deter}[1]{ \mathchoice{\left|#1\right|}{|#1|}{|#1|}{|#1|} } 72 | \newcommand{\generalmatrix}[3]{ %arg1: low-case letter, arg2: rows, arg3: cols 73 | \left( 74 | \begin{array}{cccc} 75 | #1_{1,1} _{1,2} &\ldots _{1,#2} \\ 76 | #1_{2,1} _{2,2} &\ldots _{2,#2} \\ 77 | &\vdots \\ 78 | #1_{#3,1} _{#3,2} &\ldots _{#3,#2} 79 | \end{array} 80 | \right) } 81 | 82 | \newcommand{\generaldet}[3]{ %arg1: low-case letter, arg2: rows, arg3: cols 83 | \left| 84 | \begin{array}{cccc} 85 | #1_{11} _{12} &\ldots _{1 #2} \\ 86 | #1_{21} _{22} &\ldots _{2 #2} \\ 87 | &\vdots \\ 88 | #1_{#3 1} _{#3 2} &\ldots _{#3 #2} 89 | \end{array} 90 | \right| } 91 | 92 | % With mathtools we can have column entries right flushed 93 | % There is an optional argument \begin{mat}[r]{3} .. \end{mat} for 94 | % right-flushed columns. Perhaps the rule is that numbers are better 95 | % right-flushed but if there are any letters it is better centered? 96 | \newenvironment{nmat}[1][c]{\begin{pmatrix*} % disable optional arg [#1] 97 | }{\end{pmatrix*}} 98 | % If mat starts with &\vdots get an error; why? No apparent macro fix, according to texexchange 99 | \newenvironment{vmat}[1][c]{\begin{vmatrix*} % disable optional arg [#1] 100 | }{\end{vmatrix*}} 101 | \newenvironment{amat}[2][c]{% 102 | % disable optional arg \left(\begin{array}{@{}*{#2}{#1}|#1@{}} 103 | \left(\begin{array}{@{}*{#2}{c}|#1@{}} 104 | }{% 105 | \end{array}\right) 106 | } 107 | % \newcommand\vdotswithin[1]{% Taken from mathtools.dtx because my TL is not 2011 108 | % {\mathmakebox[\widthof{\ensuremath{{}#1{}}}][c]{{\vdots}}}} 109 | 110 | 111 | %------------colvec and rowvec 112 | % Column vector and row vector. Usage: 113 | % \colvec{1 \\ 2 \\ 3 \\ 4} and \rowvec{1 &2 &3} 114 | % Colvec takes an optional argument \colvec[r]{x_1 \\ 0}. 
Perhaps 115 | % digits look better right aligned, but if there are any letters it 116 | % needs to be centered? 117 | \newcommand{\colvec}[2][c]{\begin{nmat}[#1] #2 \end{nmat}} 118 | \newcommand{\smallcolvec}[1]{\left(\begin{smallmatrix} #1 \end{smallmatrix}\right)} 119 | % For row vectors, cannot do \newcommand{\rowvec}[1]{\begin{mat} #1 \end{mat}} 120 | % since the delimiters come out too large. 121 | \newcommand{\rowvec}[1]{\setlength{\arraycolsep}{3pt}\left(\begin{matrix} #1 \end{matrix}\right)} 122 | 123 | 124 | 125 | %-------------making aligned columns 126 | % Usage: \begin{aligncolondecimal}{2} 1.2 \\ .33 \end{aligncolondecimal} 127 | % (negative argument centers decimal pt in column). Also Usage: 128 | % \begin{aligncolondecimal}[0em]{2} 1.2 \\ .33 \end{aligncolondecimal} 129 | % to make the left and right LaTeX-array padding disappear. 130 | \RequirePackage{array}\RequirePackage{dcolumn} 131 | \newenvironment{aligncolondecimal}[2][.1111em]{% 132 | \setlength{\arraycolsep}{#1} 133 | \newcolumntype{.}{D{.}{.}{#2}}\begin{array}{.}}{% 134 | \end{array}} 135 | 136 | % Matrix and vector, with numbers centered on decimal point 137 | % Usage: \begin{dmat}{D{.}{.}{1}D{.}{.}{3}} 0 &.123 \\ .2 &.456 \end{dmat} 138 | % (in the D{.}{.}{number} that is the number of decimal places) 139 | \newlength{\dmatcolsep}\setlength{\dmatcolsep}{5pt} 140 | \newenvironment{dmat}[2][\dmatcolsep]{% 141 | \setlength{\arraycolsep}{#1} 142 | \left(\begin{array}{@{}#2@{}} 143 | }{% 144 | \end{array}\right)} 145 | % Usage: \dcolvec[2]{1.23 \\ 4.56} where the optional argument is the number 146 | % of decimal places. 147 | \newcommand{\dcolvec}[2][-1]{\left(\begin{array}{@{}D{.}{.}{#1}@{}} #2 \end{array}\right)} 148 | 149 | %\newcommand{\trans}[1]{ {{#1}^{\mathsf{T}}} } 150 | \newcommand{\trans}[1]{ {#1}^{\mathsf{T}} } 151 | \newcommand{\inv}[1]{ {#1}^{-1} } 152 | \newcommand{\spn}[1]{\ensuremath{\text{span}[#1]} } 153 | \newcommand{\rk}[1]{\ensuremath{\text{rk}(#1)} } 154 | \newcommand{\dimm}[1]{\ensuremath{\text{dim}(#1)} } 155 | \newcommand{\img}[1]{\ensuremath{\text{Im}(#1)} } 156 | %\newcommand{\norm}[1]{\ensuremath{\left || #1 \right ||} } 157 | \newcommand{\norm}[1]{\ensuremath{\left \lVert #1 \right \rVert} } 158 | % orthogonal complement 159 | \newcommand{\ocomp}[1]{\ensuremath{#1^{\bot}} } 160 | \newcommand{\inner}[2]{\ensuremath{\left\langle #1, #2 \right\rangle} } 161 | \DeclareMathOperator{\tr}{tr} 162 | 163 | 164 | % \NewDocumentCommand{\grad}{e{_^}}{% 165 | % \mathop{}\!% \mathop for good spacing before \nabla 166 | % \nabla 167 | % \IfValueT{#1}{_{\!#1}}% tuck in the subscript 168 | % \IfValueT{#2}{^{#2}}% possible superscript 169 | % } 170 | % \begin{equation*} 171 | % \begin{nmat}[r] 172 | % 1 &2 &13 \\ 173 | % 4 &5 &6 174 | % \end{nmat} 175 | % \end{equation*} 176 | 177 | % \begin{equation*} 178 | % \begin{amat}{2} 179 | % 1 &2 &3 \\ 180 | % 4 &5 &6 181 | % \end{amat} 182 | % \end{equation*} 183 | 184 | % \begin{equation*} 185 | % \begin{pmat}{c|c|c} 186 | % 1 &2 &3 \\ 187 | % 4 &5 &6 188 | % \end{pmat} 189 | % \end{equation*} 190 | 191 | % \begin{equation*} 192 | % \begin{vmat} 193 | % a &c \\ 194 | % b &d 195 | % \end{vmat} 196 | % =ad-bc 197 | % \end{equation*} 198 | 199 | % \begin{equation*} 200 | % \vec{v}=\colvec{-1 \\ -0.5 \\ 0} 201 | % \end{equation*} 202 | 203 | % \begin{equation*} 204 | % \vec{v}=\rowvec{-1 & -0.5 & 0} 205 | % \end{equation*} 206 | 207 | -------------------------------------------------------------------------------- /mymath.tex: 
-------------------------------------------------------------------------------- 1 | %%%%%%%%%%%% real, integer notation 2 | \newcommand{\real}{{\mathbb R}} 3 | \newcommand{\realn}{{\mathbb R}^{n}} 4 | \newcommand{\realm}{{\mathbb R}^{m}} 5 | \newcommand{\realD}{{\mathbb R}^{D}} 6 | \newcommand{\realM}{{\mathbb R}^{M}} 7 | \newcommand{\realN}{{\mathbb R}^{N}} 8 | \newcommand{\realnn}{{\mathbb R}^{n \times n}} 9 | \newcommand{\realmm}{{\mathbb R}^{m \times m}} 10 | \newcommand{\realmn}{{\mathbb R}^{m \times n}} 11 | \newcommand{\realnm}{{\mathbb R}^{n \times m}} 12 | \newcommand{\realDM}{{\mathbb R}^{D \times M}} 13 | \newcommand{\realMD}{{\mathbb R}^{M \times D}} 14 | \newcommand{\complex}{{\mathbb C}} 15 | \newcommand{\integer}{{\mathbb Z}} 16 | \newcommand{\natu}{{\mathbb N}} 17 | 18 | 19 | %%% set, vector, matrix 20 | \newcommand{\set}[1]{\ensuremath{\mathcal #1}} 21 | \newcommand{\sets}[1]{\ensuremath{\{#1 \}}} 22 | \renewcommand{\vec}[1]{\bm{#1}} 23 | \newcommand{\mat}[1]{\bm{#1}} 24 | 25 | %%%% vector 26 | \def\vx{\vec{x}} 27 | \def\vy{\vec{y}} 28 | \def\vz{\vec{z}} 29 | \def\vf{\vec{f}} 30 | \def\ve{\vec{e}} 31 | \def\vr{\vec{r}} 32 | \def\vb{\vec{b}} 33 | \def\vc{\vec{c}} 34 | \def\vd{\vec{d}} 35 | \def\vm{\vec{m}} 36 | \def\vu{\vec{u}} 37 | \def\vv{\vec{v}} 38 | \def\vw{\vec{w}} 39 | \def\vX{\vec{X}} 40 | \def\vY{\vec{Y}} 41 | \def\vZ{\vec{Z}} 42 | \def\vth{\vec{\theta}} 43 | \def\vmu{\vec{\mu}} 44 | \def\vnu{\vec{\nu}} 45 | \def\vlam{\vec{\lambda}} 46 | \def\vep{\vec{\epsilon}} 47 | \def\vpi{\vec{\pi}} 48 | \def\vphi{\vec{\phi}} 49 | \def\vxi{\vec{\xi}} 50 | \def\valpha{\vec{\alpha}} 51 | \def\vgamma{\vec{\gamma}} 52 | 53 | %%%% Well-used matrices 54 | \def\mA{\mat{A}} 55 | \def\mB{\mat{B}} 56 | \def\mC{\mat{C}} 57 | \def\mD{\mat{D}} 58 | \def\mI{\mat{I}} 59 | \def\mJ{\mat{J}} 60 | \def\mK{\mat{K}} 61 | \def\mE{\mat{E}} 62 | \def\mP{\mat{P}} 63 | \def\mQ{\mat{Q}} 64 | \def\mU{\mat{U}} 65 | \def\mV{\mat{V}} 66 | \def\mR{\mat{R}} 67 | \def\mS{\mat{S}} 68 | \def\mX{\mat{X}} 69 | \def\msig{\mat{\Sigma}} 70 | \def\mPhi{\mat{\Phi}} 71 | 72 | 73 | \usepackage{amsmath} 74 | %%%%% vector caculus useful macro 75 | % ...\d, which typesets a derivative. ex: \d{y}{x}, instead of \frac{dx}{dy}. 76 | \renewcommand{\d}[2]{\frac{\text{d} #1}{\text{d} #2}} 77 | 78 | 79 | % ...similar for double-derivatives. ex: \dd{y}{x}. 80 | \newcommand{\dd}[2]{\frac{\text{d}^2 #1}{\text{d} #2^2}} 81 | 82 | % ...similar for partial derivatives. ex: \pd{y}{x}. 83 | \newcommand{\pd}[2]{\frac{\partial #1}{\partial #2}} 84 | 85 | 86 | % ...similar for partial double derivatives. ex: \pdd{y}{x}. 
87 | \newcommand{\pdd}[2]{\frac{\partial^2 #1}{\partial #2^2}} 88 | % pdd with argument 89 | \newcommand{\pdda}[3]{\frac{\partial^2 #1}{\partial #2 \partial #3}} 90 | 91 | \usepackage{xparse} 92 | 93 | %%%% caligraphic fonts 94 | \def\cL{\ensuremath{{\cal L}}} 95 | \def\cN{\ensuremath{{\cal N}}} 96 | \def\cD{\ensuremath{{\cal D}}} 97 | \def\cC{\ensuremath{{\cal C}}} 98 | \def\cX{\ensuremath{{\cal X}}} 99 | \def\cY{\ensuremath{{\cal Y}}} 100 | 101 | %%% big parenthesis 102 | \def\Bl{\Bigl} 103 | \def\Br{\Bigr} 104 | \def\lf{\left} 105 | \def\ri{\right} 106 | 107 | 108 | %%% floor notations 109 | \newcommand{\lfl}{{\lfloor}} 110 | \newcommand{\rfl}{{\rfloor}} 111 | \newcommand{\floor}[1]{{\lfloor #1 \rfloor}} 112 | 113 | %%% gradient 114 | \newcommand{\grad}[1]{\nabla #1} 115 | \newcommand{\hess}[1]{\text{H} #1} 116 | 117 | %%% definition 118 | %\newcommand{\eqdef}{\ensuremath{\triangleq}} 119 | \newcommand{\eqdef}{\ensuremath{:=}} 120 | %%% imply 121 | \newcommand{\imp}{\Longrightarrow} 122 | 123 | 124 | 125 | \newcommand{\separator}{ 126 | % \begin{center} 127 | \par\noindent\rule{\columnwidth}{0.3mm} 128 | % \end{center} 129 | } 130 | 131 | \newcommand{\mynote}[1]{{\it \color{red} [#1]}} 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | %%% equation alignment 140 | \newcommand{\aleq}[1]{\begin{align*}#1\end{align*}} 141 | 142 | %%%%%%%%%%%%%%%% colored emphasized font, blanked words 143 | 144 | \newcommand{\empr}[1]{{\color{red}\emph{#1}}} 145 | \newcommand{\empb}[1]{{\color{blue}\emph{#1}}} 146 | \newcommand{\redf}[1]{{\color{red} #1}} 147 | \newcommand{\bluef}[1]{{\color{blue} #1}} 148 | \newcommand{\grayf}[1]{{\color{gray} #1}} 149 | \newcommand{\magenf}[1]{{\color{magenta} #1}} 150 | \newcommand{\greenf}[1]{{\color{green} #1}} 151 | \newcommand{\cyanf}[1]{{\color{cyan} #1}} 152 | \newcommand{\orangef}[1]{{\color{orange} #1}} 153 | 154 | \newcommand{\blk}[1]{\underline{\mbox{\hspace{#1}}}} 155 | 156 | 157 | \newcommand{\redblk}[1]{\framebox{\color{red} #1}} 158 | \newcommand{\redblank}[2]{\framebox{\onslide<#1->{\color{red} #2}}} 159 | \newcommand{\blueblk}[1]{\framebox{\color{blue} #1}} 160 | \newcommand{\blueblank}[2]{\framebox{\onslide<#1->{\color{blue} #2}}} 161 | 162 | 163 | 164 | \makeatletter 165 | \newcommand{\mybox}{% 166 | \collectbox{% 167 | \setlength{\fboxsep}{1pt}% 168 | \fbox{\BOXCONTENT}% 169 | }% 170 | } 171 | \makeatother 172 | 173 | \makeatletter 174 | \newcommand{\lecturemark}{% 175 | \collectbox{% 176 | \setlength{\fboxsep}{1pt}% 177 | \fcolorbox{red}{yellow}{\BOXCONTENT}% 178 | }% 179 | } 180 | \makeatother 181 | 182 | \newcommand{\mycolorbox}[1]{ 183 | \begin{tcolorbox}[colback=red!5!white,colframe=red!75!black] 184 | #1 185 | \end{tcolorbox} 186 | } 187 | %%%% figure inclusion 188 | \newcommand{\mypic}[2]{ 189 | \begin{center} 190 | \includegraphics[width=#1\textwidth]{#2} 191 | \end{center} 192 | } 193 | 194 | \newcommand{\myinlinepic}[2]{ 195 | \makebox[0cm][r]{\raisebox{-4ex}{\includegraphics[height=#1]{#2}}} 196 | } 197 | 198 | 199 | 200 | 201 | %%%% itemized and enumerated list 202 | \newcommand{\bci}{\begin{compactitem}} 203 | \newcommand{\eci}{\end{compactitem}} 204 | \newcommand{\bce}{\begin{compactenum}} 205 | \newcommand{\ece}{\end{compactenum}} 206 | 207 | 208 | %%%% making 0.5/0.5 two columns 209 | %%%% how to use: first number: length of separation bar 210 | % \mytwocols{0.6} 211 | % { 212 | % contents in the left column 213 | % } 214 | % { 215 | % contents in the right column 216 | % } 217 | %%%% 218 | 219 | \newcommand{\mytwocols}[3]{ 220 | 
\begin{columns}[T] \column{.499\textwidth} #2 \column{.001\textwidth} \rule{.3mm}{{#1}\textheight} \column{.499\textwidth} #3 \end{columns}} 221 | 222 | \newcommand{\mythreecols}[4]{ 223 | \begin{columns}[T] \column{.31\textwidth} #2 \column{.001\textwidth} \rule{.3mm}{{#1}\textheight} \column{.31\textwidth} #3 \column{.001\textwidth} \rule{.3mm}{{#1}\textheight} \column{.31\textwidth} #4 \end{columns}} 224 | 225 | \newcommand{\mysmalltwocols}[3]{ 226 | \begin{columns}[T] \column{.4\textwidth} #2 \column{.001\textwidth} \rule{.3mm}{{#1}\textheight} \column{.4\textwidth} #3 \end{columns}} 227 | 228 | %%%% making two columns with customized ratios 229 | %%%% how to use: 230 | %first parameter: length of separation bar 231 | %second parameter: ratio of left column 232 | %third parameter: ratio of right column 233 | % \mytwocols{0.6}{0.7}{0.29} 234 | % { 235 | % contents in the left column 236 | % } 237 | % { 238 | % contents in the right column 239 | % } 240 | %%%% 241 | \newcommand{\myvartwocols}[5]{ 242 | \begin{columns}[T] \column{#2\textwidth} {#4} \column{.01\textwidth} \rule{.3mm}{{#1}\textheight} \column{#3\textwidth} {#5} \end{columns}} 243 | 244 | %%% making my block in beamer 245 | %%% first parameter: title of block 246 | %%% second parameter: contents of block 247 | \newcommand{\myblock}[2]{ 248 | \begin{block}{#1} {#2} \end{block}} 249 | 250 | %%% independence notation 251 | \newcommand{\indep}{\perp \!\!\! \perp} 252 | 253 | %%%% probability with different shapes (parenthesis or bracket) and different sizes 254 | %%% `i' enables us to insert the subscript to the probability 255 | \newcommand{\bprob}[1]{\mathbb{P}\Bl[ #1 \Br]} 256 | \newcommand{\prob}[1]{\mathbb{P}[ #1 ]} 257 | \newcommand{\cbprob}[1]{\mathbb{P}\Bl( #1 \Br)} 258 | \newcommand{\cprob}[1]{\mathbb{P}( #1 )} 259 | \newcommand{\probi}[2]{\mathbb{P}_{#1}[ #2 ]} 260 | \newcommand{\bprobi}[2]{\mathbb{P}_{#1}\Bl[ #2 \Br]} 261 | \newcommand{\cprobi}[2]{\mathbb{P}_{#1}( #2 )} 262 | \newcommand{\cbprobi}[2]{\mathbb{P}_{#1}\Bl( #2 \Br)} 263 | 264 | %%%% expectation with different shapes (parenthesis or bracket) and different sizes 265 | %%% `i' enables us to insert the subscript to the expectation 266 | \newcommand{\expect}[1]{\mathbb{E}[ #1 ]} 267 | \newcommand{\cexpect}[1]{\mathbb{E}( #1 )} 268 | \newcommand{\bexpect}[1]{\mathbb{E}\Bl[ #1 \Br]} 269 | \newcommand{\cbexpect}[1]{\mathbb{E}\Bl( #1 \Br)} 270 | \newcommand{\bbexpect}[1]{\mathbb{E}\lf[ #1 \ri]} 271 | \newcommand{\expecti}[2]{\mathbb{E}_{#1}[ #2 ]} 272 | \newcommand{\bexpecti}[2]{\mathbb{E}_{#1}\Bl[ #2 \Br]} 273 | \newcommand{\bbexpecti}[2]{\mathbb{E}_{#1}\lf[ #2 \ri]} 274 | 275 | %%%% variance 276 | \newcommand{\var}[1]{\text{var}[ #1 ]} 277 | \newcommand{\bvar}[1]{\text{var}\Bl[ #1 \Br]} 278 | \newcommand{\cvar}[1]{\text{var}( #1 )} 279 | \newcommand{\cbvar}[1]{\text{var}\Bl( #1 \Br)} 280 | 281 | %%%% covariance 282 | \newcommand{\cov}[1]{\text{cov}( #1 )} 283 | \newcommand{\bcov}[1]{\text{cov}\Bl( #1 \Br)} 284 | 285 | %%% Popular pmf, pdf notation to avoid long typing 286 | \newcommand{\px}{\ensuremath{p_X(x)}} 287 | \newcommand{\py}{\ensuremath{p_Y(y)}} 288 | \newcommand{\pz}{\ensuremath{p_Z(z)}} 289 | \newcommand{\pxA}{\ensuremath{p_{X|A}(x)}} 290 | \newcommand{\pyA}{\ensuremath{p_{Y|A}(y)}} 291 | \newcommand{\pzA}{\ensuremath{p_{Z|A}(z)}} 292 | \newcommand{\pxy}{\ensuremath{p_{X,Y}(x,y)}} 293 | \newcommand{\pxcy}{\ensuremath{p_{X|Y}(x|y)}} 294 | \newcommand{\pycx}{\ensuremath{p_{Y|X}(y|x)}} 295 | 296 | \newcommand{\fx}{\ensuremath{f_X(x)}} 297 | 
\newcommand{\Fx}{\ensuremath{F_X(x)}} 298 | \newcommand{\fy}{\ensuremath{f_Y(y)}} 299 | \newcommand{\Fy}{\ensuremath{F_Y(y)}} 300 | \newcommand{\fz}{\ensuremath{f_Z(z)}} 301 | \newcommand{\Fz}{\ensuremath{F_Z(z)}} 302 | \newcommand{\fxA}{\ensuremath{f_{X|A}(x)}} 303 | \newcommand{\fyA}{\ensuremath{f_{Y|A}(y)}} 304 | \newcommand{\fzA}{\ensuremath{f_{Z|A}(z)}} 305 | \newcommand{\fxy}{\ensuremath{f_{X,Y}(x,y)}} 306 | \newcommand{\Fxy}{\ensuremath{F_{X,Y}(x,y)}} 307 | \newcommand{\fxcy}{\ensuremath{f_{X|Y}(x|y)}} 308 | \newcommand{\fycx}{\ensuremath{f_{Y|X}(y|x)}} 309 | 310 | \newcommand{\fth}{\ensuremath{f_\Theta(\theta)}} 311 | \newcommand{\fxcth}{\ensuremath{f_{X|\Theta}(x|\theta)}} 312 | \newcommand{\fthcx}{\ensuremath{f_{\Theta|X}(\theta|x)}} 313 | 314 | \newcommand{\pkcth}{\ensuremath{p_{X|\Theta}(k|\theta)}} 315 | \newcommand{\fthck}{\ensuremath{f_{\Theta|X}(\theta|k)}} 316 | 317 | 318 | %%%% indicator 319 | \newcommand{\indi}[1]{\mathbf{1}_{ #1 }} 320 | 321 | %%%% exponential rv. 322 | \newcommand{\elambdax}{\ensuremath{e^{-\lambda x}}} 323 | 324 | %%%% normal rv. 325 | \newcommand{\stdnormal}{\ensuremath{\frac{1}{\sqrt{2\pi}} e^{-x^2/2}}} 326 | \newcommand{\gennormal}{\ensuremath{\frac{1}{\sigma\sqrt{2\pi}} e^{-(x-\mu)^2/2}}} 327 | 328 | %%%%%% estimator, estimate 329 | \newcommand{\hth}{\ensuremath{\hat{\theta}}} 330 | \newcommand{\hTH}{\ensuremath{\hat{\Theta}}} 331 | \newcommand{\MAP}{\ensuremath{\text{MAP}}} 332 | \newcommand{\LMS}{\ensuremath{\text{LMS}}} 333 | \newcommand{\LLMS}{\ensuremath{\text{L}}} 334 | \newcommand{\ML}{\ensuremath{\text{ML}}} 335 | 336 | %%%% colored text 337 | \newcommand{\red}[1]{\color{red}#1} 338 | \newcommand{\cyan}[1]{\color{cyan}#1} 339 | \newcommand{\magenta}[1]{\color{magenta}#1} 340 | \newcommand{\blue}[1]{\color{blue}#1} 341 | \newcommand{\green}[1]{\color{green}#1} 342 | \newcommand{\white}[1]{\color{white}#1} 343 | \newcommand{\gray}[1]{\color{gray}#1} 344 | 345 | %%% definition 346 | \newcommand{\defi}{{\color{red} Definition.} } 347 | \newcommand{\exam}{{\color{red} Example.} } 348 | \newcommand{\question}{{\color{red} Question.} } 349 | \newcommand{\thm}{{\color{red} Theorem.} } 350 | \newcommand{\background}{{\color{red} Background.} } 351 | \newcommand{\msg}{{\color{red} Message.} } 352 | 353 | 354 | \def\ml{\text{ML}} 355 | \def\map{\text{MAP}} 356 | 357 | %%%%%%%%%%%%%%%%%%%%%%% old macros that you can ignore %%%%%%%%%%%%%%%%%%%%%%%% 358 | 359 | % \def\un{\underline} 360 | % \def\ov{\overline} 361 | 362 | 363 | % \newcommand{\beq}{\begin{eqnarray*}} 364 | % \newcommand{\eeq}{\end{eqnarray*}} 365 | % \newcommand{\beqn}{\begin{eqnarray}} 366 | % \newcommand{\eeqn}{\end{eqnarray}} 367 | % \newcommand{\bemn}{\begin{multiline}} 368 | % \newcommand{\eemn}{\end{multiline}} 369 | % \newcommand{\beal}{\begin{align}} 370 | % \newcommand{\eeal}{\end{align}} 371 | % \newcommand{\beas}{\begin{align*}} 372 | % \newcommand{\eeas}{\end{align*}} 373 | 374 | 375 | 376 | % \newcommand{\bd}{\begin{displaymath}} 377 | % \newcommand{\ed}{\end{displaymath}} 378 | % \newcommand{\bee}{\begin{equation}} 379 | % \newcommand{\eee}{\end{equation}} 380 | 381 | 382 | % \newcommand{\vs}{\vspace{0.2in}} 383 | % \newcommand{\hs}{\hspace{0.5in}} 384 | % \newcommand{\el}{\end{flushleft}} 385 | % \newcommand{\bl}{\begin{flushleft}} 386 | % \newcommand{\bc}{\begin{center}} 387 | % \newcommand{\ec}{\end{center}} 388 | % \newcommand{\remove}[1]{} 389 | 390 | % \newtheorem{theorem}{Theorem} 391 | % \newtheorem{corollary}{Corollary} 392 | % \newtheorem{prop}{Proposition} 393 | % 
394 | % \newtheorem{defi}{Definition}
395 | % \newtheorem{assum}{Assumption}
396 | % \newtheorem{example}{Example}
397 | % \newtheorem{property}{Property}
398 | % \newtheorem{remark}{Remark}
399 | 
400 | % \newcommand{\separator}{
401 | % \begin{center}
402 | % \rule{\columnwidth}{0.3mm}
403 | % \end{center}
404 | % }
405 | 
406 | % \newenvironment{separation}
407 | % { \vspace{-0.3cm}
408 | % \separator
409 | % \vspace{-0.25cm}
410 | % }
411 | % {
412 | % \vspace{-0.5cm}
413 | % \separator
414 | % \vspace{-0.15cm}
415 | % }
416 | 
417 | % \def\A{\mathcal A}
418 | % \def\oA{\overline{\mathcal A}}
419 | % \def\S{\mathcal S}
420 | % \def\D{\mathcal D}
421 | % \def\eff{{\rm Eff}}
422 | % \def\bD{\bm{D}}
423 | % \def\cU{{\cal U}}
424 | % \def\bbs{{\mathbb{s}}}
425 | % \def\bbS{{\mathbb{S} }}
426 | % \def\cM{{\cal M}}
427 | % \def\bV{{\bm{V}}}
428 | % \def\cH{{\cal H}}
429 | % \def\ch{{\cal h}}
430 | % \def\cR{{\cal R}}
431 | % \def\cV{{\cal V}}
432 | % \def\cA{{\cal A}}
433 | % \def\cX{{\cal X}}
434 | % \def\cN{{\cal N}}
435 | % \def\cJ{{\cal J}}
436 | % \def\cK{{\cal K}}
437 | % \def\cL{{\cal L}}
438 | % \def\cI{{\cal I}}
439 | % \def\cY{{\cal Y}}
440 | % \def\cZ{{\cal Z}}
441 | % \def\cC{{\cal C}}
442 | % \def\cR{{\cal R}}
443 | % \def\id{{\rm Id}}
444 | % \def\st{{\rm st}}
445 | % \def\cF{{\cal F}}
446 | % \def\bz{{\bm z}}
447 | % \def\cG{{\cal G}}
448 | % \def\N{\mathbb{N}}
449 | % \def\bbh{\mathbb{h}}
450 | % \def\bbH{\mathbb{H}}
451 | % \def\bbi{\mathbb{i}}
452 | % \def\bbI{\mathbb{I}}
453 | % \def\R{\mathbb{R}}
454 | % \def\bbR{\mathbb{R}}
455 | % \def\bbr{\mathbb{r}}
456 | % \def\cB{{\cal B}}
457 | % \def\cP{{\cal P}}
458 | % \def\cS{{\cal S}}
459 | % \def\bW{{\bm W}}
460 | % \def\bc{{\bm c}}
461 | 
462 | % %\def\and{\quad\mbox{and}\quad}
463 | % \def\ind{{\bf 1}}
464 | 
465 | 
466 | % \def\bmg{{\bm{\gamma}}}
467 | % \def\bmr{{\bm{\rho}}}
468 | % \def\bmq{{\bm{q}}}
469 | % \def\bmt{{\bm{\tau}}}
470 | % \def\bmn{{\bm{n}}}
471 | % \def\bmcapn{{\bm{N}}}
472 | % \def\bmrho{{\bm{\rho}}}
473 | 
474 | % \def\igam{\underline{\gamma}(\lambda)}
475 | % \def\sgam{\overline{\gamma}(\lambda)}
476 | % \def\ovt{\overline{\theta}}
477 | % \def\ovT{\overline{\Theta}}
478 | % \def\PP{{\mathrm P}}
479 | % \def\EE{{\mathrm E}}
480 | % \def\iskip{{\vskip -0.4cm}}
481 | % \def\siskip{{\vskip -0.2cm}}
482 | 
483 | % \def\bp{\noindent{\it Proof.}\ }
484 | % \def\ep{\hfill $\Box$}
485 | 
486 | 
487 | 
--------------------------------------------------------------------------------
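For quick reference, here is a brief usage sketch (not part of mymath.tex) showing how the density, prior/posterior, and estimator shorthands defined above are meant to be combined on a slide. The beamer frame below is hypothetical and assumes mymath.tex is loaded by a lecture's main.tex; the displayed equation is just the standard Bayes rule for a continuous parameter, and \gennormal denotes the N(mu, sigma^2) density.

% Hypothetical slide fragment (sketch only, assuming mymath.tex is loaded):
\begin{frame}{Bayes rule for a continuous parameter}
  \thm Given a prior \fth{} and a likelihood \fxcth{}, the posterior is
  \[
    \fthcx \;=\; \frac{\fxcth\,\fth}{\int f_{X|\Theta}(x|\theta')\, f_\Theta(\theta')\, d\theta'},
  \]
  and the \ML{} and \MAP{} estimates maximize \fxcth{} and \fthcx{}, respectively.
\end{frame}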
/print.sh:
--------------------------------------------------------------------------------
1 | (cd 01.Introduction;
2 | pdfjam --nup 1x2 1.intro.pdf --outfile 1.intro-2.pdf;
3 | pdfjam --nup 2x2 1.intro.pdf --outfile 1.intro-4.pdf --landscape)
4 | 
5 | (cd 02.LinearAlgebra;
6 | pdfjam --nup 1x2 2.LA.pdf --outfile 2.LA-2.pdf;
7 | pdfjam --nup 2x2 2.LA.pdf --outfile 2.LA-4.pdf --landscape)
8 | 
9 | (cd 03.Geometry;
10 | pdfjam --nup 1x2 3.AG.pdf --outfile 3.AG-2.pdf;
11 | pdfjam --nup 2x2 3.AG.pdf --outfile 3.AG-4.pdf --landscape)
12 | 
13 | (cd 04.MatrixDecomposition;
14 | pdfjam --nup 1x2 4.MD.pdf --outfile 4.MD-2.pdf;
15 | pdfjam --nup 2x2 4.MD.pdf --outfile 4.MD-4.pdf --landscape)
16 | 
17 | (cd 05.VectorCaculus;
18 | pdfjam --nup 1x2 5.VC.pdf --outfile 5.VC-2.pdf;
19 | pdfjam --nup 2x2 5.VC.pdf --outfile 5.VC-4.pdf --landscape)
20 | 
21 | (cd 06.Probability;
22 | pdfjam --nup 1x2 6.PD.pdf --outfile 6.PD-2.pdf;
23 | pdfjam --nup 2x2 6.PD.pdf --outfile 6.PD-4.pdf --landscape)
24 | 
25 | (cd 07.Optimization;
26 | pdfjam --nup 1x2 7.OPT.pdf --outfile 7.OPT-2.pdf;
27 | pdfjam --nup 2x2 7.OPT.pdf --outfile 7.OPT-4.pdf --landscape)
28 | 
29 | (cd 08.Model_Data;
30 | pdfjam --nup 1x2 8.MMD.pdf --outfile 8.MMD-2.pdf;
31 | pdfjam --nup 2x2 8.MMD.pdf --outfile 8.MMD-4.pdf --landscape)
32 | 
33 | (cd 09.LinearRegression;
34 | pdfjam --nup 1x2 9.LR.pdf --outfile 9.LR-2.pdf;
35 | pdfjam --nup 2x2 9.LR.pdf --outfile 9.LR-4.pdf --landscape)
36 | 
37 | (cd 10.PCA;
38 | pdfjam --nup 1x2 10.PCA.pdf --outfile 10.PCA-2.pdf;
39 | pdfjam --nup 2x2 10.PCA.pdf --outfile 10.PCA-4.pdf --landscape)
40 | 
41 | (cd 11.DensityEstimation;
42 | pdfjam --nup 1x2 11.GMM.pdf --outfile 11.GMM-2.pdf;
43 | pdfjam --nup 2x2 11.GMM.pdf --outfile 11.GMM-4.pdf --landscape)
44 | 
45 | (cd 12.SVM;
46 | pdfjam --nup 1x2 12.SVM.pdf --outfile 12.SVM-2.pdf;
47 | pdfjam --nup 2x2 12.SVM.pdf --outfile 12.SVM-4.pdf --landscape)
48 | 
49 | 
--------------------------------------------------------------------------------
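print.sh above simply lays out each lecture deck as printable handouts with pdfjam: a 2-up portrait version (--nup 1x2) and a 4-up landscape version (--nup 2x2) are written next to the original PDF in each lecture directory. It is normally run from the repository root (e.g. "sh print.sh") after the slide PDFs have been compiled. The loop below is only a condensed sketch of the same procedure, assuming the directory/file pairs that appear in the script; it is not part of the repository.

#!/bin/sh
# Hypothetical loop-based equivalent of print.sh (sketch only): for each
# lecture deck <dir>/<name>.pdf, produce <name>-2.pdf (2-up) and
# <name>-4.pdf (4-up, landscape) inside the same directory.
for pair in 01.Introduction/1.intro 02.LinearAlgebra/2.LA 03.Geometry/3.AG \
            04.MatrixDecomposition/4.MD 05.VectorCaculus/5.VC 06.Probability/6.PD \
            07.Optimization/7.OPT 08.Model_Data/8.MMD 09.LinearRegression/9.LR \
            10.PCA/10.PCA 11.DensityEstimation/11.GMM 12.SVM/12.SVM; do
  dir=${pair%/*}; name=${pair#*/}
  (cd "$dir" &&
   pdfjam --nup 1x2 "$name.pdf" --outfile "$name-2.pdf" &&
   pdfjam --nup 2x2 "$name.pdf" --outfile "$name-4.pdf" --landscape)
done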